{ "best_global_step": 32694, "best_metric": 0.27814048528671265, "best_model_checkpoint": "saves_multiple/prefix-tuning/llama-3-8b-instruct/train_multirc_456_1767257752/checkpoint-32694", "epoch": 20.0, "eval_steps": 10898, "global_step": 108980, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009175995595522114, "grad_norm": 195.80181884765625, "learning_rate": 3.670398238208846e-09, "loss": 13.6584, "num_input_tokens_seen": 9888, "step": 5 }, { "epoch": 0.0018351991191044228, "grad_norm": 186.83648681640625, "learning_rate": 8.258396035969903e-09, "loss": 14.2069, "num_input_tokens_seen": 20640, "step": 10 }, { "epoch": 0.0027527986786566342, "grad_norm": 177.4124755859375, "learning_rate": 1.284639383373096e-08, "loss": 13.8127, "num_input_tokens_seen": 32704, "step": 15 }, { "epoch": 0.0036703982382088455, "grad_norm": 175.5933074951172, "learning_rate": 1.7434391631492018e-08, "loss": 14.1446, "num_input_tokens_seen": 43520, "step": 20 }, { "epoch": 0.004587997797761057, "grad_norm": 236.6551971435547, "learning_rate": 2.2022389429253076e-08, "loss": 13.5472, "num_input_tokens_seen": 53344, "step": 25 }, { "epoch": 0.0055055973573132685, "grad_norm": 185.80487060546875, "learning_rate": 2.6610387227014133e-08, "loss": 14.2697, "num_input_tokens_seen": 64640, "step": 30 }, { "epoch": 0.00642319691686548, "grad_norm": 220.73582458496094, "learning_rate": 3.119838502477519e-08, "loss": 13.5571, "num_input_tokens_seen": 75328, "step": 35 }, { "epoch": 0.007340796476417691, "grad_norm": 211.26937866210938, "learning_rate": 3.578638282253625e-08, "loss": 13.7824, "num_input_tokens_seen": 86560, "step": 40 }, { "epoch": 0.008258396035969904, "grad_norm": 215.51040649414062, "learning_rate": 4.0374380620297305e-08, "loss": 13.3531, "num_input_tokens_seen": 96864, "step": 45 }, { "epoch": 0.009175995595522114, "grad_norm": 223.17626953125, "learning_rate": 4.496237841805836e-08, "loss": 13.6964, "num_input_tokens_seen": 107808, "step": 50 }, { "epoch": 0.010093595155074325, "grad_norm": 211.7814178466797, "learning_rate": 4.9550376215819427e-08, "loss": 13.7469, "num_input_tokens_seen": 118112, "step": 55 }, { "epoch": 0.011011194714626537, "grad_norm": 188.33291625976562, "learning_rate": 5.413837401358047e-08, "loss": 13.8422, "num_input_tokens_seen": 130048, "step": 60 }, { "epoch": 0.011928794274178749, "grad_norm": 231.34088134765625, "learning_rate": 5.8726371811341535e-08, "loss": 13.6389, "num_input_tokens_seen": 139904, "step": 65 }, { "epoch": 0.01284639383373096, "grad_norm": 184.79541015625, "learning_rate": 6.331436960910259e-08, "loss": 13.1998, "num_input_tokens_seen": 150368, "step": 70 }, { "epoch": 0.013763993393283172, "grad_norm": 204.53443908691406, "learning_rate": 6.790236740686365e-08, "loss": 13.3697, "num_input_tokens_seen": 161248, "step": 75 }, { "epoch": 0.014681592952835382, "grad_norm": 198.4752197265625, "learning_rate": 7.249036520462471e-08, "loss": 13.4932, "num_input_tokens_seen": 171648, "step": 80 }, { "epoch": 0.015599192512387594, "grad_norm": 239.75894165039062, "learning_rate": 7.707836300238576e-08, "loss": 13.1249, "num_input_tokens_seen": 182560, "step": 85 }, { "epoch": 0.016516792071939807, "grad_norm": 231.36502075195312, "learning_rate": 8.166636080014682e-08, "loss": 13.2945, "num_input_tokens_seen": 192384, "step": 90 }, { "epoch": 0.017434391631492015, "grad_norm": 246.2892303466797, "learning_rate": 8.625435859790789e-08, "loss": 12.7628, "num_input_tokens_seen": 202464, "step": 95 }, { "epoch": 0.018351991191044227, "grad_norm": 251.09202575683594, "learning_rate": 9.084235639566895e-08, "loss": 12.7484, "num_input_tokens_seen": 214560, "step": 100 }, { "epoch": 0.01926959075059644, "grad_norm": 271.3606262207031, "learning_rate": 9.543035419342998e-08, "loss": 12.7842, "num_input_tokens_seen": 224608, "step": 105 }, { "epoch": 0.02018719031014865, "grad_norm": 248.28463745117188, "learning_rate": 1.0001835199119105e-07, "loss": 12.4328, "num_input_tokens_seen": 235296, "step": 110 }, { "epoch": 0.021104789869700862, "grad_norm": 220.30870056152344, "learning_rate": 1.0460634978895211e-07, "loss": 12.3583, "num_input_tokens_seen": 245632, "step": 115 }, { "epoch": 0.022022389429253074, "grad_norm": 285.77178955078125, "learning_rate": 1.0919434758671317e-07, "loss": 12.6525, "num_input_tokens_seen": 257760, "step": 120 }, { "epoch": 0.022939988988805286, "grad_norm": 304.834228515625, "learning_rate": 1.1378234538447422e-07, "loss": 12.3031, "num_input_tokens_seen": 268512, "step": 125 }, { "epoch": 0.023857588548357497, "grad_norm": 277.16485595703125, "learning_rate": 1.1837034318223528e-07, "loss": 11.5697, "num_input_tokens_seen": 279424, "step": 130 }, { "epoch": 0.02477518810790971, "grad_norm": 237.00404357910156, "learning_rate": 1.2295834097999634e-07, "loss": 11.7769, "num_input_tokens_seen": 290624, "step": 135 }, { "epoch": 0.02569278766746192, "grad_norm": 207.2158203125, "learning_rate": 1.275463387777574e-07, "loss": 11.2646, "num_input_tokens_seen": 300960, "step": 140 }, { "epoch": 0.026610387227014132, "grad_norm": 189.80996704101562, "learning_rate": 1.3213433657551845e-07, "loss": 11.3254, "num_input_tokens_seen": 312160, "step": 145 }, { "epoch": 0.027527986786566344, "grad_norm": 201.11465454101562, "learning_rate": 1.3672233437327952e-07, "loss": 11.0269, "num_input_tokens_seen": 323360, "step": 150 }, { "epoch": 0.028445586346118552, "grad_norm": 189.19351196289062, "learning_rate": 1.4131033217104057e-07, "loss": 10.747, "num_input_tokens_seen": 333536, "step": 155 }, { "epoch": 0.029363185905670764, "grad_norm": 233.34719848632812, "learning_rate": 1.4589832996880164e-07, "loss": 10.5583, "num_input_tokens_seen": 345088, "step": 160 }, { "epoch": 0.030280785465222976, "grad_norm": 170.25611877441406, "learning_rate": 1.5048632776656268e-07, "loss": 10.5295, "num_input_tokens_seen": 354784, "step": 165 }, { "epoch": 0.031198385024775187, "grad_norm": 162.67916870117188, "learning_rate": 1.5507432556432373e-07, "loss": 10.3695, "num_input_tokens_seen": 364768, "step": 170 }, { "epoch": 0.0321159845843274, "grad_norm": 201.7498321533203, "learning_rate": 1.596623233620848e-07, "loss": 10.2817, "num_input_tokens_seen": 374816, "step": 175 }, { "epoch": 0.033033584143879614, "grad_norm": 182.86593627929688, "learning_rate": 1.6425032115984587e-07, "loss": 10.2786, "num_input_tokens_seen": 385472, "step": 180 }, { "epoch": 0.03395118370343182, "grad_norm": 179.91465759277344, "learning_rate": 1.688383189576069e-07, "loss": 9.8296, "num_input_tokens_seen": 395968, "step": 185 }, { "epoch": 0.03486878326298403, "grad_norm": 196.19342041015625, "learning_rate": 1.7342631675536798e-07, "loss": 9.3924, "num_input_tokens_seen": 405984, "step": 190 }, { "epoch": 0.035786382822536246, "grad_norm": 164.0355682373047, "learning_rate": 1.7801431455312903e-07, "loss": 9.5214, "num_input_tokens_seen": 416064, "step": 195 }, { "epoch": 0.036703982382088454, "grad_norm": 184.93223571777344, "learning_rate": 1.8260231235089007e-07, "loss": 9.5057, "num_input_tokens_seen": 427040, "step": 200 }, { "epoch": 0.03762158194164067, "grad_norm": 156.22518920898438, "learning_rate": 1.8719031014865114e-07, "loss": 8.751, "num_input_tokens_seen": 438240, "step": 205 }, { "epoch": 0.03853918150119288, "grad_norm": 168.95034790039062, "learning_rate": 1.9177830794641219e-07, "loss": 8.4578, "num_input_tokens_seen": 449536, "step": 210 }, { "epoch": 0.03945678106074509, "grad_norm": 179.74029541015625, "learning_rate": 1.9636630574417326e-07, "loss": 8.6476, "num_input_tokens_seen": 460320, "step": 215 }, { "epoch": 0.0403743806202973, "grad_norm": 171.0745391845703, "learning_rate": 2.009543035419343e-07, "loss": 8.3524, "num_input_tokens_seen": 471232, "step": 220 }, { "epoch": 0.041291980179849516, "grad_norm": 174.24514770507812, "learning_rate": 2.0554230133969537e-07, "loss": 7.9538, "num_input_tokens_seen": 481600, "step": 225 }, { "epoch": 0.042209579739401724, "grad_norm": 158.8634033203125, "learning_rate": 2.1013029913745642e-07, "loss": 7.9892, "num_input_tokens_seen": 492128, "step": 230 }, { "epoch": 0.04312717929895394, "grad_norm": 199.88499450683594, "learning_rate": 2.1471829693521751e-07, "loss": 7.7696, "num_input_tokens_seen": 503296, "step": 235 }, { "epoch": 0.04404477885850615, "grad_norm": 179.1539306640625, "learning_rate": 2.1930629473297853e-07, "loss": 7.0059, "num_input_tokens_seen": 513600, "step": 240 }, { "epoch": 0.044962378418058356, "grad_norm": 165.3380584716797, "learning_rate": 2.2389429253073963e-07, "loss": 7.1367, "num_input_tokens_seen": 524416, "step": 245 }, { "epoch": 0.04587997797761057, "grad_norm": 195.83982849121094, "learning_rate": 2.2848229032850067e-07, "loss": 7.0496, "num_input_tokens_seen": 535776, "step": 250 }, { "epoch": 0.04679757753716278, "grad_norm": 183.7938995361328, "learning_rate": 2.330702881262617e-07, "loss": 6.5136, "num_input_tokens_seen": 548160, "step": 255 }, { "epoch": 0.047715177096714995, "grad_norm": 209.2859344482422, "learning_rate": 2.376582859240228e-07, "loss": 6.0191, "num_input_tokens_seen": 558144, "step": 260 }, { "epoch": 0.0486327766562672, "grad_norm": 166.6923828125, "learning_rate": 2.4224628372178383e-07, "loss": 6.07, "num_input_tokens_seen": 568896, "step": 265 }, { "epoch": 0.04955037621581942, "grad_norm": 176.57740783691406, "learning_rate": 2.468342815195449e-07, "loss": 5.4625, "num_input_tokens_seen": 579488, "step": 270 }, { "epoch": 0.050467975775371626, "grad_norm": 184.2537384033203, "learning_rate": 2.514222793173059e-07, "loss": 5.4213, "num_input_tokens_seen": 590912, "step": 275 }, { "epoch": 0.05138557533492384, "grad_norm": 180.38540649414062, "learning_rate": 2.56010277115067e-07, "loss": 4.9569, "num_input_tokens_seen": 600832, "step": 280 }, { "epoch": 0.05230317489447605, "grad_norm": 186.5254669189453, "learning_rate": 2.6059827491282806e-07, "loss": 4.7656, "num_input_tokens_seen": 611040, "step": 285 }, { "epoch": 0.053220774454028265, "grad_norm": 190.7725372314453, "learning_rate": 2.651862727105891e-07, "loss": 4.0199, "num_input_tokens_seen": 621184, "step": 290 }, { "epoch": 0.05413837401358047, "grad_norm": 184.3965606689453, "learning_rate": 2.6977427050835015e-07, "loss": 3.9317, "num_input_tokens_seen": 632032, "step": 295 }, { "epoch": 0.05505597357313269, "grad_norm": 180.16038513183594, "learning_rate": 2.7436226830611125e-07, "loss": 3.5786, "num_input_tokens_seen": 642432, "step": 300 }, { "epoch": 0.055973573132684896, "grad_norm": 181.35836791992188, "learning_rate": 2.789502661038723e-07, "loss": 3.2327, "num_input_tokens_seen": 653632, "step": 305 }, { "epoch": 0.056891172692237105, "grad_norm": 165.52392578125, "learning_rate": 2.835382639016334e-07, "loss": 2.8793, "num_input_tokens_seen": 662720, "step": 310 }, { "epoch": 0.05780877225178932, "grad_norm": 189.20057678222656, "learning_rate": 2.8812626169939443e-07, "loss": 2.7761, "num_input_tokens_seen": 674112, "step": 315 }, { "epoch": 0.05872637181134153, "grad_norm": 134.37010192871094, "learning_rate": 2.927142594971554e-07, "loss": 2.3197, "num_input_tokens_seen": 685216, "step": 320 }, { "epoch": 0.05964397137089374, "grad_norm": 135.7668914794922, "learning_rate": 2.973022572949165e-07, "loss": 2.1023, "num_input_tokens_seen": 695904, "step": 325 }, { "epoch": 0.06056157093044595, "grad_norm": 123.38438415527344, "learning_rate": 3.0189025509267756e-07, "loss": 1.6989, "num_input_tokens_seen": 706816, "step": 330 }, { "epoch": 0.06147917048999817, "grad_norm": 149.3142852783203, "learning_rate": 3.0647825289043866e-07, "loss": 1.702, "num_input_tokens_seen": 718368, "step": 335 }, { "epoch": 0.062396770049550375, "grad_norm": 106.7747802734375, "learning_rate": 3.110662506881997e-07, "loss": 1.4481, "num_input_tokens_seen": 729792, "step": 340 }, { "epoch": 0.06331436960910258, "grad_norm": 93.24066162109375, "learning_rate": 3.1565424848596075e-07, "loss": 1.2409, "num_input_tokens_seen": 740256, "step": 345 }, { "epoch": 0.0642319691686548, "grad_norm": 106.1113510131836, "learning_rate": 3.202422462837218e-07, "loss": 1.0983, "num_input_tokens_seen": 751040, "step": 350 }, { "epoch": 0.06514956872820701, "grad_norm": 82.32642364501953, "learning_rate": 3.248302440814829e-07, "loss": 1.0887, "num_input_tokens_seen": 760768, "step": 355 }, { "epoch": 0.06606716828775923, "grad_norm": 79.33126068115234, "learning_rate": 3.2941824187924394e-07, "loss": 0.8937, "num_input_tokens_seen": 772160, "step": 360 }, { "epoch": 0.06698476784731143, "grad_norm": 79.86894226074219, "learning_rate": 3.34006239677005e-07, "loss": 0.8529, "num_input_tokens_seen": 782816, "step": 365 }, { "epoch": 0.06790236740686365, "grad_norm": 77.21674346923828, "learning_rate": 3.38594237474766e-07, "loss": 0.7332, "num_input_tokens_seen": 793920, "step": 370 }, { "epoch": 0.06881996696641586, "grad_norm": 68.04637908935547, "learning_rate": 3.431822352725271e-07, "loss": 0.6076, "num_input_tokens_seen": 803136, "step": 375 }, { "epoch": 0.06973756652596806, "grad_norm": 87.47193145751953, "learning_rate": 3.477702330702881e-07, "loss": 0.6272, "num_input_tokens_seen": 813184, "step": 380 }, { "epoch": 0.07065516608552028, "grad_norm": 54.01599884033203, "learning_rate": 3.523582308680492e-07, "loss": 0.5717, "num_input_tokens_seen": 824544, "step": 385 }, { "epoch": 0.07157276564507249, "grad_norm": 77.26618194580078, "learning_rate": 3.5694622866581025e-07, "loss": 0.6705, "num_input_tokens_seen": 834752, "step": 390 }, { "epoch": 0.07249036520462471, "grad_norm": 45.91414260864258, "learning_rate": 3.6153422646357135e-07, "loss": 0.5051, "num_input_tokens_seen": 844896, "step": 395 }, { "epoch": 0.07340796476417691, "grad_norm": 43.409759521484375, "learning_rate": 3.6612222426133234e-07, "loss": 0.4392, "num_input_tokens_seen": 855584, "step": 400 }, { "epoch": 0.07432556432372912, "grad_norm": 83.42304992675781, "learning_rate": 3.7071022205909344e-07, "loss": 0.5003, "num_input_tokens_seen": 866560, "step": 405 }, { "epoch": 0.07524316388328134, "grad_norm": 72.11143493652344, "learning_rate": 3.752982198568545e-07, "loss": 0.5171, "num_input_tokens_seen": 877664, "step": 410 }, { "epoch": 0.07616076344283355, "grad_norm": 58.74807357788086, "learning_rate": 3.798862176546156e-07, "loss": 0.5361, "num_input_tokens_seen": 888480, "step": 415 }, { "epoch": 0.07707836300238576, "grad_norm": 100.45182800292969, "learning_rate": 3.8447421545237657e-07, "loss": 0.5381, "num_input_tokens_seen": 899840, "step": 420 }, { "epoch": 0.07799596256193797, "grad_norm": 54.55415344238281, "learning_rate": 3.8906221325013767e-07, "loss": 0.478, "num_input_tokens_seen": 909984, "step": 425 }, { "epoch": 0.07891356212149019, "grad_norm": 60.278507232666016, "learning_rate": 3.9365021104789877e-07, "loss": 0.4847, "num_input_tokens_seen": 920544, "step": 430 }, { "epoch": 0.07983116168104239, "grad_norm": 70.94810485839844, "learning_rate": 3.982382088456598e-07, "loss": 0.4899, "num_input_tokens_seen": 929952, "step": 435 }, { "epoch": 0.0807487612405946, "grad_norm": 122.45109558105469, "learning_rate": 4.0282620664342085e-07, "loss": 0.451, "num_input_tokens_seen": 939392, "step": 440 }, { "epoch": 0.08166636080014682, "grad_norm": 70.87744903564453, "learning_rate": 4.074142044411819e-07, "loss": 0.5238, "num_input_tokens_seen": 950080, "step": 445 }, { "epoch": 0.08258396035969903, "grad_norm": 64.996337890625, "learning_rate": 4.12002202238943e-07, "loss": 0.4661, "num_input_tokens_seen": 959904, "step": 450 }, { "epoch": 0.08350155991925123, "grad_norm": 85.25394439697266, "learning_rate": 4.16590200036704e-07, "loss": 0.4782, "num_input_tokens_seen": 970112, "step": 455 }, { "epoch": 0.08441915947880345, "grad_norm": 117.95867156982422, "learning_rate": 4.211781978344651e-07, "loss": 0.437, "num_input_tokens_seen": 982144, "step": 460 }, { "epoch": 0.08533675903835566, "grad_norm": 49.47980499267578, "learning_rate": 4.2576619563222613e-07, "loss": 0.4709, "num_input_tokens_seen": 992800, "step": 465 }, { "epoch": 0.08625435859790788, "grad_norm": 70.54230499267578, "learning_rate": 4.303541934299872e-07, "loss": 0.4653, "num_input_tokens_seen": 1003232, "step": 470 }, { "epoch": 0.08717195815746008, "grad_norm": 54.4733772277832, "learning_rate": 4.349421912277482e-07, "loss": 0.4566, "num_input_tokens_seen": 1013888, "step": 475 }, { "epoch": 0.0880895577170123, "grad_norm": 65.53541564941406, "learning_rate": 4.395301890255093e-07, "loss": 0.3974, "num_input_tokens_seen": 1025376, "step": 480 }, { "epoch": 0.08900715727656451, "grad_norm": 49.07026290893555, "learning_rate": 4.4411818682327036e-07, "loss": 0.4023, "num_input_tokens_seen": 1037056, "step": 485 }, { "epoch": 0.08992475683611671, "grad_norm": 55.78066635131836, "learning_rate": 4.4870618462103145e-07, "loss": 0.3726, "num_input_tokens_seen": 1047488, "step": 490 }, { "epoch": 0.09084235639566893, "grad_norm": 64.5465316772461, "learning_rate": 4.5329418241879245e-07, "loss": 0.3819, "num_input_tokens_seen": 1058400, "step": 495 }, { "epoch": 0.09175995595522114, "grad_norm": 81.43326568603516, "learning_rate": 4.5788218021655354e-07, "loss": 0.5412, "num_input_tokens_seen": 1069184, "step": 500 }, { "epoch": 0.09267755551477336, "grad_norm": 50.9625244140625, "learning_rate": 4.624701780143146e-07, "loss": 0.4687, "num_input_tokens_seen": 1079616, "step": 505 }, { "epoch": 0.09359515507432556, "grad_norm": 71.29508209228516, "learning_rate": 4.6705817581207563e-07, "loss": 0.4646, "num_input_tokens_seen": 1090944, "step": 510 }, { "epoch": 0.09451275463387777, "grad_norm": 64.39861297607422, "learning_rate": 4.716461736098367e-07, "loss": 0.3828, "num_input_tokens_seen": 1102976, "step": 515 }, { "epoch": 0.09543035419342999, "grad_norm": 55.598976135253906, "learning_rate": 4.7623417140759777e-07, "loss": 0.4948, "num_input_tokens_seen": 1113760, "step": 520 }, { "epoch": 0.0963479537529822, "grad_norm": 71.0578384399414, "learning_rate": 4.808221692053589e-07, "loss": 0.3562, "num_input_tokens_seen": 1124672, "step": 525 }, { "epoch": 0.0972655533125344, "grad_norm": 101.07612609863281, "learning_rate": 4.854101670031198e-07, "loss": 0.3847, "num_input_tokens_seen": 1135264, "step": 530 }, { "epoch": 0.09818315287208662, "grad_norm": 53.22134780883789, "learning_rate": 4.89998164800881e-07, "loss": 0.4338, "num_input_tokens_seen": 1146656, "step": 535 }, { "epoch": 0.09910075243163884, "grad_norm": 55.99235916137695, "learning_rate": 4.94586162598642e-07, "loss": 0.4597, "num_input_tokens_seen": 1158368, "step": 540 }, { "epoch": 0.10001835199119104, "grad_norm": 42.14553451538086, "learning_rate": 4.99174160396403e-07, "loss": 0.3956, "num_input_tokens_seen": 1168384, "step": 545 }, { "epoch": 0.10093595155074325, "grad_norm": 56.37135696411133, "learning_rate": 5.037621581941641e-07, "loss": 0.3709, "num_input_tokens_seen": 1180032, "step": 550 }, { "epoch": 0.10185355111029547, "grad_norm": 52.64231872558594, "learning_rate": 5.083501559919251e-07, "loss": 0.3931, "num_input_tokens_seen": 1190784, "step": 555 }, { "epoch": 0.10277115066984768, "grad_norm": 149.2633056640625, "learning_rate": 5.129381537896863e-07, "loss": 0.3913, "num_input_tokens_seen": 1202784, "step": 560 }, { "epoch": 0.10368875022939988, "grad_norm": 77.35460662841797, "learning_rate": 5.175261515874472e-07, "loss": 0.3682, "num_input_tokens_seen": 1212640, "step": 565 }, { "epoch": 0.1046063497889521, "grad_norm": 46.90050506591797, "learning_rate": 5.221141493852084e-07, "loss": 0.3135, "num_input_tokens_seen": 1223840, "step": 570 }, { "epoch": 0.10552394934850431, "grad_norm": 87.11160278320312, "learning_rate": 5.267021471829694e-07, "loss": 0.4347, "num_input_tokens_seen": 1233856, "step": 575 }, { "epoch": 0.10644154890805653, "grad_norm": 82.868408203125, "learning_rate": 5.312901449807305e-07, "loss": 0.45, "num_input_tokens_seen": 1245408, "step": 580 }, { "epoch": 0.10735914846760873, "grad_norm": 78.02718353271484, "learning_rate": 5.358781427784915e-07, "loss": 0.4032, "num_input_tokens_seen": 1255776, "step": 585 }, { "epoch": 0.10827674802716095, "grad_norm": 50.62297058105469, "learning_rate": 5.404661405762526e-07, "loss": 0.3736, "num_input_tokens_seen": 1266336, "step": 590 }, { "epoch": 0.10919434758671316, "grad_norm": 83.71673583984375, "learning_rate": 5.450541383740136e-07, "loss": 0.4478, "num_input_tokens_seen": 1276288, "step": 595 }, { "epoch": 0.11011194714626538, "grad_norm": 76.95209503173828, "learning_rate": 5.496421361717747e-07, "loss": 0.3991, "num_input_tokens_seen": 1287264, "step": 600 }, { "epoch": 0.11102954670581758, "grad_norm": 43.91206741333008, "learning_rate": 5.542301339695357e-07, "loss": 0.4165, "num_input_tokens_seen": 1297792, "step": 605 }, { "epoch": 0.11194714626536979, "grad_norm": 77.59616088867188, "learning_rate": 5.588181317672968e-07, "loss": 0.4219, "num_input_tokens_seen": 1309536, "step": 610 }, { "epoch": 0.11286474582492201, "grad_norm": 51.43492126464844, "learning_rate": 5.634061295650579e-07, "loss": 0.4766, "num_input_tokens_seen": 1321536, "step": 615 }, { "epoch": 0.11378234538447421, "grad_norm": 53.2288818359375, "learning_rate": 5.679941273628189e-07, "loss": 0.3233, "num_input_tokens_seen": 1332032, "step": 620 }, { "epoch": 0.11469994494402642, "grad_norm": 42.365074157714844, "learning_rate": 5.7258212516058e-07, "loss": 0.3489, "num_input_tokens_seen": 1342400, "step": 625 }, { "epoch": 0.11561754450357864, "grad_norm": 39.719051361083984, "learning_rate": 5.77170122958341e-07, "loss": 0.2888, "num_input_tokens_seen": 1352864, "step": 630 }, { "epoch": 0.11653514406313085, "grad_norm": 45.886863708496094, "learning_rate": 5.817581207561022e-07, "loss": 0.4036, "num_input_tokens_seen": 1362368, "step": 635 }, { "epoch": 0.11745274362268306, "grad_norm": 62.6154899597168, "learning_rate": 5.863461185538631e-07, "loss": 0.4354, "num_input_tokens_seen": 1372512, "step": 640 }, { "epoch": 0.11837034318223527, "grad_norm": 68.16102600097656, "learning_rate": 5.909341163516241e-07, "loss": 0.3237, "num_input_tokens_seen": 1382464, "step": 645 }, { "epoch": 0.11928794274178749, "grad_norm": 62.37977600097656, "learning_rate": 5.955221141493853e-07, "loss": 0.3323, "num_input_tokens_seen": 1393344, "step": 650 }, { "epoch": 0.1202055423013397, "grad_norm": 60.43949890136719, "learning_rate": 6.001101119471463e-07, "loss": 0.3473, "num_input_tokens_seen": 1404352, "step": 655 }, { "epoch": 0.1211231418608919, "grad_norm": 66.6657485961914, "learning_rate": 6.046981097449074e-07, "loss": 0.3578, "num_input_tokens_seen": 1415168, "step": 660 }, { "epoch": 0.12204074142044412, "grad_norm": 75.26553344726562, "learning_rate": 6.092861075426684e-07, "loss": 0.4011, "num_input_tokens_seen": 1425376, "step": 665 }, { "epoch": 0.12295834097999633, "grad_norm": 50.0697135925293, "learning_rate": 6.138741053404295e-07, "loss": 0.3357, "num_input_tokens_seen": 1436032, "step": 670 }, { "epoch": 0.12387594053954853, "grad_norm": 37.479583740234375, "learning_rate": 6.184621031381906e-07, "loss": 0.3451, "num_input_tokens_seen": 1446432, "step": 675 }, { "epoch": 0.12479354009910075, "grad_norm": 37.32172393798828, "learning_rate": 6.230501009359516e-07, "loss": 0.3258, "num_input_tokens_seen": 1457824, "step": 680 }, { "epoch": 0.12571113965865297, "grad_norm": 171.1046905517578, "learning_rate": 6.276380987337126e-07, "loss": 0.4117, "num_input_tokens_seen": 1467104, "step": 685 }, { "epoch": 0.12662873921820517, "grad_norm": 96.09493255615234, "learning_rate": 6.322260965314738e-07, "loss": 0.4057, "num_input_tokens_seen": 1478144, "step": 690 }, { "epoch": 0.1275463387777574, "grad_norm": 67.82720184326172, "learning_rate": 6.368140943292348e-07, "loss": 0.3902, "num_input_tokens_seen": 1489344, "step": 695 }, { "epoch": 0.1284639383373096, "grad_norm": 65.99671936035156, "learning_rate": 6.414020921269958e-07, "loss": 0.4009, "num_input_tokens_seen": 1500064, "step": 700 }, { "epoch": 0.1293815378968618, "grad_norm": 56.758155822753906, "learning_rate": 6.459900899247569e-07, "loss": 0.344, "num_input_tokens_seen": 1511328, "step": 705 }, { "epoch": 0.13029913745641403, "grad_norm": 64.57466888427734, "learning_rate": 6.505780877225179e-07, "loss": 0.4044, "num_input_tokens_seen": 1521760, "step": 710 }, { "epoch": 0.13121673701596623, "grad_norm": 75.22454071044922, "learning_rate": 6.551660855202791e-07, "loss": 0.4375, "num_input_tokens_seen": 1532992, "step": 715 }, { "epoch": 0.13213433657551846, "grad_norm": 68.04094696044922, "learning_rate": 6.5975408331804e-07, "loss": 0.4044, "num_input_tokens_seen": 1543232, "step": 720 }, { "epoch": 0.13305193613507066, "grad_norm": 51.80154800415039, "learning_rate": 6.643420811158011e-07, "loss": 0.4157, "num_input_tokens_seen": 1554400, "step": 725 }, { "epoch": 0.13396953569462286, "grad_norm": 100.56809997558594, "learning_rate": 6.689300789135622e-07, "loss": 0.4368, "num_input_tokens_seen": 1564768, "step": 730 }, { "epoch": 0.1348871352541751, "grad_norm": 53.6478157043457, "learning_rate": 6.735180767113233e-07, "loss": 0.425, "num_input_tokens_seen": 1576096, "step": 735 }, { "epoch": 0.1358047348137273, "grad_norm": 66.56684112548828, "learning_rate": 6.781060745090842e-07, "loss": 0.3812, "num_input_tokens_seen": 1586624, "step": 740 }, { "epoch": 0.1367223343732795, "grad_norm": 51.597171783447266, "learning_rate": 6.826940723068453e-07, "loss": 0.4009, "num_input_tokens_seen": 1599200, "step": 745 }, { "epoch": 0.13763993393283172, "grad_norm": 49.5509033203125, "learning_rate": 6.872820701046064e-07, "loss": 0.3126, "num_input_tokens_seen": 1609632, "step": 750 }, { "epoch": 0.13855753349238392, "grad_norm": 41.140140533447266, "learning_rate": 6.918700679023675e-07, "loss": 0.3674, "num_input_tokens_seen": 1618560, "step": 755 }, { "epoch": 0.13947513305193612, "grad_norm": 88.05484771728516, "learning_rate": 6.964580657001285e-07, "loss": 0.4857, "num_input_tokens_seen": 1628064, "step": 760 }, { "epoch": 0.14039273261148835, "grad_norm": 49.591121673583984, "learning_rate": 7.010460634978895e-07, "loss": 0.3921, "num_input_tokens_seen": 1638848, "step": 765 }, { "epoch": 0.14131033217104055, "grad_norm": 54.75838851928711, "learning_rate": 7.056340612956507e-07, "loss": 0.3902, "num_input_tokens_seen": 1649408, "step": 770 }, { "epoch": 0.14222793173059278, "grad_norm": 68.41617584228516, "learning_rate": 7.102220590934117e-07, "loss": 0.3198, "num_input_tokens_seen": 1661056, "step": 775 }, { "epoch": 0.14314553129014498, "grad_norm": 70.3355484008789, "learning_rate": 7.148100568911727e-07, "loss": 0.4396, "num_input_tokens_seen": 1672288, "step": 780 }, { "epoch": 0.14406313084969719, "grad_norm": 87.1603012084961, "learning_rate": 7.193980546889338e-07, "loss": 0.4122, "num_input_tokens_seen": 1682912, "step": 785 }, { "epoch": 0.14498073040924941, "grad_norm": 43.56504440307617, "learning_rate": 7.239860524866948e-07, "loss": 0.3538, "num_input_tokens_seen": 1693248, "step": 790 }, { "epoch": 0.14589832996880162, "grad_norm": 29.472312927246094, "learning_rate": 7.28574050284456e-07, "loss": 0.3375, "num_input_tokens_seen": 1703040, "step": 795 }, { "epoch": 0.14681592952835382, "grad_norm": 57.62322998046875, "learning_rate": 7.33162048082217e-07, "loss": 0.3275, "num_input_tokens_seen": 1714912, "step": 800 }, { "epoch": 0.14773352908790605, "grad_norm": 35.66807174682617, "learning_rate": 7.37750045879978e-07, "loss": 0.2992, "num_input_tokens_seen": 1724800, "step": 805 }, { "epoch": 0.14865112864745825, "grad_norm": 52.37841033935547, "learning_rate": 7.423380436777391e-07, "loss": 0.2995, "num_input_tokens_seen": 1735712, "step": 810 }, { "epoch": 0.14956872820701045, "grad_norm": 110.7730484008789, "learning_rate": 7.469260414755002e-07, "loss": 0.3836, "num_input_tokens_seen": 1747776, "step": 815 }, { "epoch": 0.15048632776656268, "grad_norm": 75.693603515625, "learning_rate": 7.515140392732611e-07, "loss": 0.382, "num_input_tokens_seen": 1758592, "step": 820 }, { "epoch": 0.15140392732611488, "grad_norm": 32.31290054321289, "learning_rate": 7.561020370710223e-07, "loss": 0.3932, "num_input_tokens_seen": 1768416, "step": 825 }, { "epoch": 0.1523215268856671, "grad_norm": 46.06276321411133, "learning_rate": 7.606900348687833e-07, "loss": 0.3449, "num_input_tokens_seen": 1779104, "step": 830 }, { "epoch": 0.1532391264452193, "grad_norm": 56.4880485534668, "learning_rate": 7.652780326665444e-07, "loss": 0.3537, "num_input_tokens_seen": 1789504, "step": 835 }, { "epoch": 0.1541567260047715, "grad_norm": 51.39072799682617, "learning_rate": 7.698660304643055e-07, "loss": 0.2792, "num_input_tokens_seen": 1800288, "step": 840 }, { "epoch": 0.15507432556432374, "grad_norm": 40.318851470947266, "learning_rate": 7.744540282620664e-07, "loss": 0.3514, "num_input_tokens_seen": 1810336, "step": 845 }, { "epoch": 0.15599192512387594, "grad_norm": 49.802452087402344, "learning_rate": 7.790420260598276e-07, "loss": 0.4072, "num_input_tokens_seen": 1820096, "step": 850 }, { "epoch": 0.15690952468342814, "grad_norm": 57.85746383666992, "learning_rate": 7.836300238575886e-07, "loss": 0.5003, "num_input_tokens_seen": 1831616, "step": 855 }, { "epoch": 0.15782712424298037, "grad_norm": 33.668331146240234, "learning_rate": 7.882180216553497e-07, "loss": 0.3542, "num_input_tokens_seen": 1840928, "step": 860 }, { "epoch": 0.15874472380253257, "grad_norm": 43.881126403808594, "learning_rate": 7.928060194531108e-07, "loss": 0.3701, "num_input_tokens_seen": 1852512, "step": 865 }, { "epoch": 0.15966232336208477, "grad_norm": 39.424720764160156, "learning_rate": 7.973940172508718e-07, "loss": 0.4211, "num_input_tokens_seen": 1862976, "step": 870 }, { "epoch": 0.160579922921637, "grad_norm": 64.42204284667969, "learning_rate": 8.019820150486328e-07, "loss": 0.393, "num_input_tokens_seen": 1873728, "step": 875 }, { "epoch": 0.1614975224811892, "grad_norm": 60.61343765258789, "learning_rate": 8.06570012846394e-07, "loss": 0.4595, "num_input_tokens_seen": 1885440, "step": 880 }, { "epoch": 0.16241512204074143, "grad_norm": 42.49543762207031, "learning_rate": 8.111580106441549e-07, "loss": 0.3471, "num_input_tokens_seen": 1896480, "step": 885 }, { "epoch": 0.16333272160029363, "grad_norm": 67.28429412841797, "learning_rate": 8.157460084419159e-07, "loss": 0.3635, "num_input_tokens_seen": 1907040, "step": 890 }, { "epoch": 0.16425032115984584, "grad_norm": 95.6258544921875, "learning_rate": 8.203340062396771e-07, "loss": 0.3965, "num_input_tokens_seen": 1917536, "step": 895 }, { "epoch": 0.16516792071939806, "grad_norm": 55.943580627441406, "learning_rate": 8.249220040374381e-07, "loss": 0.4253, "num_input_tokens_seen": 1927840, "step": 900 }, { "epoch": 0.16608552027895027, "grad_norm": 67.03715515136719, "learning_rate": 8.295100018351993e-07, "loss": 0.3263, "num_input_tokens_seen": 1938432, "step": 905 }, { "epoch": 0.16700311983850247, "grad_norm": 61.201820373535156, "learning_rate": 8.340979996329602e-07, "loss": 0.4175, "num_input_tokens_seen": 1948960, "step": 910 }, { "epoch": 0.1679207193980547, "grad_norm": 39.432098388671875, "learning_rate": 8.386859974307213e-07, "loss": 0.3513, "num_input_tokens_seen": 1959136, "step": 915 }, { "epoch": 0.1688383189576069, "grad_norm": 50.33808135986328, "learning_rate": 8.432739952284824e-07, "loss": 0.3562, "num_input_tokens_seen": 1970208, "step": 920 }, { "epoch": 0.1697559185171591, "grad_norm": 56.19043731689453, "learning_rate": 8.478619930262435e-07, "loss": 0.3388, "num_input_tokens_seen": 1982016, "step": 925 }, { "epoch": 0.17067351807671133, "grad_norm": 82.8695297241211, "learning_rate": 8.524499908240044e-07, "loss": 0.4509, "num_input_tokens_seen": 1993152, "step": 930 }, { "epoch": 0.17159111763626353, "grad_norm": 53.15449523925781, "learning_rate": 8.570379886217656e-07, "loss": 0.3063, "num_input_tokens_seen": 2004448, "step": 935 }, { "epoch": 0.17250871719581576, "grad_norm": 55.73544692993164, "learning_rate": 8.616259864195266e-07, "loss": 0.3462, "num_input_tokens_seen": 2014976, "step": 940 }, { "epoch": 0.17342631675536796, "grad_norm": 64.6926040649414, "learning_rate": 8.662139842172875e-07, "loss": 0.3214, "num_input_tokens_seen": 2025440, "step": 945 }, { "epoch": 0.17434391631492016, "grad_norm": 67.28742980957031, "learning_rate": 8.708019820150487e-07, "loss": 0.31, "num_input_tokens_seen": 2036992, "step": 950 }, { "epoch": 0.1752615158744724, "grad_norm": 44.025062561035156, "learning_rate": 8.753899798128097e-07, "loss": 0.5076, "num_input_tokens_seen": 2046752, "step": 955 }, { "epoch": 0.1761791154340246, "grad_norm": 46.424922943115234, "learning_rate": 8.799779776105709e-07, "loss": 0.4448, "num_input_tokens_seen": 2058080, "step": 960 }, { "epoch": 0.1770967149935768, "grad_norm": 42.898643493652344, "learning_rate": 8.845659754083319e-07, "loss": 0.3906, "num_input_tokens_seen": 2069632, "step": 965 }, { "epoch": 0.17801431455312902, "grad_norm": 70.98038482666016, "learning_rate": 8.891539732060929e-07, "loss": 0.4281, "num_input_tokens_seen": 2081952, "step": 970 }, { "epoch": 0.17893191411268122, "grad_norm": 60.493038177490234, "learning_rate": 8.93741971003854e-07, "loss": 0.3626, "num_input_tokens_seen": 2091680, "step": 975 }, { "epoch": 0.17984951367223342, "grad_norm": 33.80522918701172, "learning_rate": 8.983299688016151e-07, "loss": 0.3324, "num_input_tokens_seen": 2100288, "step": 980 }, { "epoch": 0.18076711323178565, "grad_norm": 60.04612350463867, "learning_rate": 9.029179665993761e-07, "loss": 0.4398, "num_input_tokens_seen": 2111072, "step": 985 }, { "epoch": 0.18168471279133785, "grad_norm": 51.097007751464844, "learning_rate": 9.075059643971372e-07, "loss": 0.348, "num_input_tokens_seen": 2122176, "step": 990 }, { "epoch": 0.18260231235089008, "grad_norm": 49.67466354370117, "learning_rate": 9.120939621948982e-07, "loss": 0.3925, "num_input_tokens_seen": 2133152, "step": 995 }, { "epoch": 0.18351991191044228, "grad_norm": 58.41849899291992, "learning_rate": 9.166819599926592e-07, "loss": 0.3102, "num_input_tokens_seen": 2144768, "step": 1000 }, { "epoch": 0.1844375114699945, "grad_norm": 69.4049072265625, "learning_rate": 9.212699577904204e-07, "loss": 0.4381, "num_input_tokens_seen": 2154080, "step": 1005 }, { "epoch": 0.18535511102954672, "grad_norm": 42.96855545043945, "learning_rate": 9.258579555881813e-07, "loss": 0.3068, "num_input_tokens_seen": 2164480, "step": 1010 }, { "epoch": 0.18627271058909892, "grad_norm": 49.36765670776367, "learning_rate": 9.304459533859425e-07, "loss": 0.3314, "num_input_tokens_seen": 2175168, "step": 1015 }, { "epoch": 0.18719031014865112, "grad_norm": 42.140750885009766, "learning_rate": 9.350339511837035e-07, "loss": 0.3653, "num_input_tokens_seen": 2186912, "step": 1020 }, { "epoch": 0.18810790970820335, "grad_norm": 65.18651580810547, "learning_rate": 9.396219489814646e-07, "loss": 0.3403, "num_input_tokens_seen": 2196160, "step": 1025 }, { "epoch": 0.18902550926775555, "grad_norm": 47.198631286621094, "learning_rate": 9.442099467792257e-07, "loss": 0.3293, "num_input_tokens_seen": 2205824, "step": 1030 }, { "epoch": 0.18994310882730775, "grad_norm": 92.93871307373047, "learning_rate": 9.487979445769866e-07, "loss": 0.3467, "num_input_tokens_seen": 2217056, "step": 1035 }, { "epoch": 0.19086070838685998, "grad_norm": 56.624507904052734, "learning_rate": 9.533859423747477e-07, "loss": 0.3181, "num_input_tokens_seen": 2227488, "step": 1040 }, { "epoch": 0.19177830794641218, "grad_norm": 49.28251647949219, "learning_rate": 9.579739401725087e-07, "loss": 0.3403, "num_input_tokens_seen": 2239456, "step": 1045 }, { "epoch": 0.1926959075059644, "grad_norm": 38.55929946899414, "learning_rate": 9.625619379702699e-07, "loss": 0.3356, "num_input_tokens_seen": 2250528, "step": 1050 }, { "epoch": 0.1936135070655166, "grad_norm": 38.48645782470703, "learning_rate": 9.671499357680308e-07, "loss": 0.3672, "num_input_tokens_seen": 2259968, "step": 1055 }, { "epoch": 0.1945311066250688, "grad_norm": 46.43902587890625, "learning_rate": 9.71737933565792e-07, "loss": 0.31, "num_input_tokens_seen": 2271008, "step": 1060 }, { "epoch": 0.19544870618462104, "grad_norm": 61.566715240478516, "learning_rate": 9.76325931363553e-07, "loss": 0.3401, "num_input_tokens_seen": 2282112, "step": 1065 }, { "epoch": 0.19636630574417324, "grad_norm": 41.30095291137695, "learning_rate": 9.80913929161314e-07, "loss": 0.4062, "num_input_tokens_seen": 2292512, "step": 1070 }, { "epoch": 0.19728390530372544, "grad_norm": 105.42024993896484, "learning_rate": 9.855019269590752e-07, "loss": 0.407, "num_input_tokens_seen": 2303872, "step": 1075 }, { "epoch": 0.19820150486327767, "grad_norm": 52.47296905517578, "learning_rate": 9.900899247568362e-07, "loss": 0.3681, "num_input_tokens_seen": 2314144, "step": 1080 }, { "epoch": 0.19911910442282987, "grad_norm": 37.433223724365234, "learning_rate": 9.946779225545973e-07, "loss": 0.3438, "num_input_tokens_seen": 2324704, "step": 1085 }, { "epoch": 0.20003670398238207, "grad_norm": 33.044189453125, "learning_rate": 9.992659203523582e-07, "loss": 0.2895, "num_input_tokens_seen": 2334432, "step": 1090 }, { "epoch": 0.2009543035419343, "grad_norm": 25.523481369018555, "learning_rate": 1.0038539181501194e-06, "loss": 0.2947, "num_input_tokens_seen": 2346112, "step": 1095 }, { "epoch": 0.2018719031014865, "grad_norm": 64.42964935302734, "learning_rate": 1.0084419159478805e-06, "loss": 0.5282, "num_input_tokens_seen": 2357280, "step": 1100 }, { "epoch": 0.20278950266103873, "grad_norm": 30.541776657104492, "learning_rate": 1.0130299137456415e-06, "loss": 0.3707, "num_input_tokens_seen": 2367584, "step": 1105 }, { "epoch": 0.20370710222059094, "grad_norm": 66.6951904296875, "learning_rate": 1.0176179115434024e-06, "loss": 0.355, "num_input_tokens_seen": 2377184, "step": 1110 }, { "epoch": 0.20462470178014314, "grad_norm": 36.04513931274414, "learning_rate": 1.0222059093411636e-06, "loss": 0.3509, "num_input_tokens_seen": 2388960, "step": 1115 }, { "epoch": 0.20554230133969537, "grad_norm": 44.41426467895508, "learning_rate": 1.0267939071389247e-06, "loss": 0.348, "num_input_tokens_seen": 2398176, "step": 1120 }, { "epoch": 0.20645990089924757, "grad_norm": 34.35039138793945, "learning_rate": 1.0313819049366859e-06, "loss": 0.3887, "num_input_tokens_seen": 2408672, "step": 1125 }, { "epoch": 0.20737750045879977, "grad_norm": 63.59209060668945, "learning_rate": 1.0359699027344468e-06, "loss": 0.4037, "num_input_tokens_seen": 2418208, "step": 1130 }, { "epoch": 0.208295100018352, "grad_norm": 37.068756103515625, "learning_rate": 1.0405579005322077e-06, "loss": 0.3002, "num_input_tokens_seen": 2430080, "step": 1135 }, { "epoch": 0.2092126995779042, "grad_norm": 40.887027740478516, "learning_rate": 1.0451458983299689e-06, "loss": 0.3448, "num_input_tokens_seen": 2440832, "step": 1140 }, { "epoch": 0.21013029913745643, "grad_norm": 60.464210510253906, "learning_rate": 1.04973389612773e-06, "loss": 0.3707, "num_input_tokens_seen": 2451712, "step": 1145 }, { "epoch": 0.21104789869700863, "grad_norm": 43.50973129272461, "learning_rate": 1.054321893925491e-06, "loss": 0.3765, "num_input_tokens_seen": 2461728, "step": 1150 }, { "epoch": 0.21196549825656083, "grad_norm": 48.5921516418457, "learning_rate": 1.0589098917232521e-06, "loss": 0.3459, "num_input_tokens_seen": 2471552, "step": 1155 }, { "epoch": 0.21288309781611306, "grad_norm": 42.61063003540039, "learning_rate": 1.063497889521013e-06, "loss": 0.3293, "num_input_tokens_seen": 2482816, "step": 1160 }, { "epoch": 0.21380069737566526, "grad_norm": 32.003963470458984, "learning_rate": 1.068085887318774e-06, "loss": 0.3933, "num_input_tokens_seen": 2493632, "step": 1165 }, { "epoch": 0.21471829693521746, "grad_norm": 72.92633819580078, "learning_rate": 1.0726738851165352e-06, "loss": 0.3309, "num_input_tokens_seen": 2505952, "step": 1170 }, { "epoch": 0.2156358964947697, "grad_norm": 38.65166091918945, "learning_rate": 1.0772618829142963e-06, "loss": 0.3876, "num_input_tokens_seen": 2516352, "step": 1175 }, { "epoch": 0.2165534960543219, "grad_norm": 48.1838493347168, "learning_rate": 1.0818498807120575e-06, "loss": 0.3293, "num_input_tokens_seen": 2527328, "step": 1180 }, { "epoch": 0.2174710956138741, "grad_norm": 45.378204345703125, "learning_rate": 1.0864378785098184e-06, "loss": 0.3546, "num_input_tokens_seen": 2538304, "step": 1185 }, { "epoch": 0.21838869517342632, "grad_norm": 41.53936767578125, "learning_rate": 1.0910258763075793e-06, "loss": 0.3325, "num_input_tokens_seen": 2549536, "step": 1190 }, { "epoch": 0.21930629473297852, "grad_norm": 61.963130950927734, "learning_rate": 1.0956138741053405e-06, "loss": 0.3638, "num_input_tokens_seen": 2561184, "step": 1195 }, { "epoch": 0.22022389429253075, "grad_norm": 48.45806121826172, "learning_rate": 1.1002018719031016e-06, "loss": 0.308, "num_input_tokens_seen": 2572544, "step": 1200 }, { "epoch": 0.22114149385208295, "grad_norm": 23.293027877807617, "learning_rate": 1.1047898697008626e-06, "loss": 0.3501, "num_input_tokens_seen": 2582976, "step": 1205 }, { "epoch": 0.22205909341163516, "grad_norm": 45.94143295288086, "learning_rate": 1.1093778674986237e-06, "loss": 0.3571, "num_input_tokens_seen": 2594656, "step": 1210 }, { "epoch": 0.22297669297118738, "grad_norm": 40.27142333984375, "learning_rate": 1.1139658652963847e-06, "loss": 0.3851, "num_input_tokens_seen": 2605568, "step": 1215 }, { "epoch": 0.22389429253073959, "grad_norm": 52.1435546875, "learning_rate": 1.1185538630941458e-06, "loss": 0.3063, "num_input_tokens_seen": 2617856, "step": 1220 }, { "epoch": 0.2248118920902918, "grad_norm": 30.82248306274414, "learning_rate": 1.123141860891907e-06, "loss": 0.3673, "num_input_tokens_seen": 2628128, "step": 1225 }, { "epoch": 0.22572949164984402, "grad_norm": 57.37691879272461, "learning_rate": 1.127729858689668e-06, "loss": 0.3621, "num_input_tokens_seen": 2639872, "step": 1230 }, { "epoch": 0.22664709120939622, "grad_norm": 28.86202049255371, "learning_rate": 1.132317856487429e-06, "loss": 0.3153, "num_input_tokens_seen": 2651040, "step": 1235 }, { "epoch": 0.22756469076894842, "grad_norm": 21.67624855041504, "learning_rate": 1.13690585428519e-06, "loss": 0.4338, "num_input_tokens_seen": 2661248, "step": 1240 }, { "epoch": 0.22848229032850065, "grad_norm": 22.40297508239746, "learning_rate": 1.1414938520829511e-06, "loss": 0.3601, "num_input_tokens_seen": 2671680, "step": 1245 }, { "epoch": 0.22939988988805285, "grad_norm": 43.35359573364258, "learning_rate": 1.1460818498807123e-06, "loss": 0.3561, "num_input_tokens_seen": 2683488, "step": 1250 }, { "epoch": 0.23031748944760508, "grad_norm": 34.91040802001953, "learning_rate": 1.1506698476784732e-06, "loss": 0.3681, "num_input_tokens_seen": 2694848, "step": 1255 }, { "epoch": 0.23123508900715728, "grad_norm": 33.98200988769531, "learning_rate": 1.1552578454762342e-06, "loss": 0.3371, "num_input_tokens_seen": 2706144, "step": 1260 }, { "epoch": 0.23215268856670948, "grad_norm": 30.41978645324707, "learning_rate": 1.1598458432739953e-06, "loss": 0.3276, "num_input_tokens_seen": 2715296, "step": 1265 }, { "epoch": 0.2330702881262617, "grad_norm": 47.10363006591797, "learning_rate": 1.1644338410717565e-06, "loss": 0.3789, "num_input_tokens_seen": 2727424, "step": 1270 }, { "epoch": 0.2339878876858139, "grad_norm": 22.949325561523438, "learning_rate": 1.1690218388695174e-06, "loss": 0.3327, "num_input_tokens_seen": 2739264, "step": 1275 }, { "epoch": 0.2349054872453661, "grad_norm": 79.26911163330078, "learning_rate": 1.1736098366672786e-06, "loss": 0.418, "num_input_tokens_seen": 2748928, "step": 1280 }, { "epoch": 0.23582308680491834, "grad_norm": 36.942909240722656, "learning_rate": 1.1781978344650395e-06, "loss": 0.3845, "num_input_tokens_seen": 2759392, "step": 1285 }, { "epoch": 0.23674068636447054, "grad_norm": 48.3026008605957, "learning_rate": 1.1827858322628006e-06, "loss": 0.3516, "num_input_tokens_seen": 2770368, "step": 1290 }, { "epoch": 0.23765828592402274, "grad_norm": 27.20196533203125, "learning_rate": 1.1873738300605616e-06, "loss": 0.3264, "num_input_tokens_seen": 2781824, "step": 1295 }, { "epoch": 0.23857588548357497, "grad_norm": 21.79205894470215, "learning_rate": 1.1919618278583227e-06, "loss": 0.2844, "num_input_tokens_seen": 2790976, "step": 1300 }, { "epoch": 0.23949348504312717, "grad_norm": 27.84050941467285, "learning_rate": 1.1965498256560839e-06, "loss": 0.3236, "num_input_tokens_seen": 2803072, "step": 1305 }, { "epoch": 0.2404110846026794, "grad_norm": 67.84495544433594, "learning_rate": 1.2011378234538448e-06, "loss": 0.4151, "num_input_tokens_seen": 2814112, "step": 1310 }, { "epoch": 0.2413286841622316, "grad_norm": 55.03118133544922, "learning_rate": 1.2057258212516058e-06, "loss": 0.2641, "num_input_tokens_seen": 2824704, "step": 1315 }, { "epoch": 0.2422462837217838, "grad_norm": 42.796112060546875, "learning_rate": 1.210313819049367e-06, "loss": 0.3713, "num_input_tokens_seen": 2835488, "step": 1320 }, { "epoch": 0.24316388328133604, "grad_norm": 37.00703811645508, "learning_rate": 1.214901816847128e-06, "loss": 0.3192, "num_input_tokens_seen": 2847456, "step": 1325 }, { "epoch": 0.24408148284088824, "grad_norm": 38.34793472290039, "learning_rate": 1.2194898146448892e-06, "loss": 0.3255, "num_input_tokens_seen": 2859168, "step": 1330 }, { "epoch": 0.24499908240044044, "grad_norm": 53.269561767578125, "learning_rate": 1.2240778124426501e-06, "loss": 0.2983, "num_input_tokens_seen": 2869632, "step": 1335 }, { "epoch": 0.24591668195999267, "grad_norm": 73.42984008789062, "learning_rate": 1.228665810240411e-06, "loss": 0.3324, "num_input_tokens_seen": 2879904, "step": 1340 }, { "epoch": 0.24683428151954487, "grad_norm": 35.653804779052734, "learning_rate": 1.2332538080381722e-06, "loss": 0.3403, "num_input_tokens_seen": 2890656, "step": 1345 }, { "epoch": 0.24775188107909707, "grad_norm": 55.360233306884766, "learning_rate": 1.2378418058359334e-06, "loss": 0.4641, "num_input_tokens_seen": 2900576, "step": 1350 }, { "epoch": 0.2486694806386493, "grad_norm": 58.75682830810547, "learning_rate": 1.2424298036336943e-06, "loss": 0.4603, "num_input_tokens_seen": 2911392, "step": 1355 }, { "epoch": 0.2495870801982015, "grad_norm": 45.682395935058594, "learning_rate": 1.2470178014314555e-06, "loss": 0.3165, "num_input_tokens_seen": 2923264, "step": 1360 }, { "epoch": 0.2505046797577537, "grad_norm": 32.128662109375, "learning_rate": 1.2516057992292166e-06, "loss": 0.3241, "num_input_tokens_seen": 2933984, "step": 1365 }, { "epoch": 0.25142227931730593, "grad_norm": 27.068531036376953, "learning_rate": 1.2561937970269776e-06, "loss": 0.3146, "num_input_tokens_seen": 2944256, "step": 1370 }, { "epoch": 0.25233987887685816, "grad_norm": 35.56260681152344, "learning_rate": 1.2607817948247387e-06, "loss": 0.3891, "num_input_tokens_seen": 2955648, "step": 1375 }, { "epoch": 0.25325747843641033, "grad_norm": 36.00885772705078, "learning_rate": 1.2653697926224999e-06, "loss": 0.301, "num_input_tokens_seen": 2967264, "step": 1380 }, { "epoch": 0.25417507799596256, "grad_norm": 22.7398624420166, "learning_rate": 1.2699577904202606e-06, "loss": 0.3001, "num_input_tokens_seen": 2979072, "step": 1385 }, { "epoch": 0.2550926775555148, "grad_norm": 72.74430084228516, "learning_rate": 1.2745457882180217e-06, "loss": 0.513, "num_input_tokens_seen": 2989856, "step": 1390 }, { "epoch": 0.25601027711506696, "grad_norm": 31.572784423828125, "learning_rate": 1.2791337860157829e-06, "loss": 0.3877, "num_input_tokens_seen": 2999424, "step": 1395 }, { "epoch": 0.2569278766746192, "grad_norm": 33.786903381347656, "learning_rate": 1.2837217838135438e-06, "loss": 0.3267, "num_input_tokens_seen": 3010432, "step": 1400 }, { "epoch": 0.2578454762341714, "grad_norm": 51.16359329223633, "learning_rate": 1.288309781611305e-06, "loss": 0.3833, "num_input_tokens_seen": 3019904, "step": 1405 }, { "epoch": 0.2587630757937236, "grad_norm": 30.39256477355957, "learning_rate": 1.2928977794090661e-06, "loss": 0.3555, "num_input_tokens_seen": 3029568, "step": 1410 }, { "epoch": 0.2596806753532758, "grad_norm": 27.073772430419922, "learning_rate": 1.2974857772068269e-06, "loss": 0.3469, "num_input_tokens_seen": 3040928, "step": 1415 }, { "epoch": 0.26059827491282805, "grad_norm": 25.154436111450195, "learning_rate": 1.302073775004588e-06, "loss": 0.4025, "num_input_tokens_seen": 3050656, "step": 1420 }, { "epoch": 0.2615158744723802, "grad_norm": 28.170669555664062, "learning_rate": 1.3066617728023492e-06, "loss": 0.3584, "num_input_tokens_seen": 3062208, "step": 1425 }, { "epoch": 0.26243347403193246, "grad_norm": 25.92691993713379, "learning_rate": 1.31124977060011e-06, "loss": 0.4004, "num_input_tokens_seen": 3072384, "step": 1430 }, { "epoch": 0.2633510735914847, "grad_norm": 23.22834014892578, "learning_rate": 1.3158377683978712e-06, "loss": 0.3218, "num_input_tokens_seen": 3084416, "step": 1435 }, { "epoch": 0.2642686731510369, "grad_norm": 25.330089569091797, "learning_rate": 1.3204257661956324e-06, "loss": 0.328, "num_input_tokens_seen": 3095168, "step": 1440 }, { "epoch": 0.2651862727105891, "grad_norm": 31.356401443481445, "learning_rate": 1.3250137639933933e-06, "loss": 0.4004, "num_input_tokens_seen": 3105920, "step": 1445 }, { "epoch": 0.2661038722701413, "grad_norm": 23.392555236816406, "learning_rate": 1.3296017617911545e-06, "loss": 0.3228, "num_input_tokens_seen": 3115872, "step": 1450 }, { "epoch": 0.26702147182969355, "grad_norm": 25.48957633972168, "learning_rate": 1.3341897595889156e-06, "loss": 0.3499, "num_input_tokens_seen": 3125920, "step": 1455 }, { "epoch": 0.2679390713892457, "grad_norm": 25.27141571044922, "learning_rate": 1.3387777573866768e-06, "loss": 0.2949, "num_input_tokens_seen": 3136544, "step": 1460 }, { "epoch": 0.26885667094879795, "grad_norm": 32.911190032958984, "learning_rate": 1.3433657551844375e-06, "loss": 0.3428, "num_input_tokens_seen": 3147904, "step": 1465 }, { "epoch": 0.2697742705083502, "grad_norm": 27.006107330322266, "learning_rate": 1.3479537529821987e-06, "loss": 0.3465, "num_input_tokens_seen": 3160160, "step": 1470 }, { "epoch": 0.27069187006790235, "grad_norm": 23.118982315063477, "learning_rate": 1.3525417507799598e-06, "loss": 0.3877, "num_input_tokens_seen": 3171168, "step": 1475 }, { "epoch": 0.2716094696274546, "grad_norm": 29.69192886352539, "learning_rate": 1.3571297485777207e-06, "loss": 0.3296, "num_input_tokens_seen": 3180928, "step": 1480 }, { "epoch": 0.2725270691870068, "grad_norm": 26.782875061035156, "learning_rate": 1.3617177463754819e-06, "loss": 0.3126, "num_input_tokens_seen": 3193184, "step": 1485 }, { "epoch": 0.273444668746559, "grad_norm": 34.40494918823242, "learning_rate": 1.366305744173243e-06, "loss": 0.3648, "num_input_tokens_seen": 3203712, "step": 1490 }, { "epoch": 0.2743622683061112, "grad_norm": 36.51277160644531, "learning_rate": 1.370893741971004e-06, "loss": 0.2984, "num_input_tokens_seen": 3213920, "step": 1495 }, { "epoch": 0.27527986786566344, "grad_norm": 27.61792755126953, "learning_rate": 1.3754817397687651e-06, "loss": 0.3348, "num_input_tokens_seen": 3224544, "step": 1500 }, { "epoch": 0.2761974674252156, "grad_norm": 48.40155792236328, "learning_rate": 1.3800697375665263e-06, "loss": 0.3941, "num_input_tokens_seen": 3235424, "step": 1505 }, { "epoch": 0.27711506698476784, "grad_norm": 26.281108856201172, "learning_rate": 1.384657735364287e-06, "loss": 0.4342, "num_input_tokens_seen": 3246752, "step": 1510 }, { "epoch": 0.2780326665443201, "grad_norm": 21.581636428833008, "learning_rate": 1.3892457331620482e-06, "loss": 0.3761, "num_input_tokens_seen": 3257632, "step": 1515 }, { "epoch": 0.27895026610387225, "grad_norm": 35.357460021972656, "learning_rate": 1.3938337309598093e-06, "loss": 0.3438, "num_input_tokens_seen": 3268896, "step": 1520 }, { "epoch": 0.2798678656634245, "grad_norm": 22.20013999938965, "learning_rate": 1.3984217287575702e-06, "loss": 0.3142, "num_input_tokens_seen": 3280160, "step": 1525 }, { "epoch": 0.2807854652229767, "grad_norm": 33.526004791259766, "learning_rate": 1.4030097265553314e-06, "loss": 0.3755, "num_input_tokens_seen": 3291168, "step": 1530 }, { "epoch": 0.2817030647825289, "grad_norm": 22.402103424072266, "learning_rate": 1.4075977243530925e-06, "loss": 0.3372, "num_input_tokens_seen": 3301952, "step": 1535 }, { "epoch": 0.2826206643420811, "grad_norm": 33.37721252441406, "learning_rate": 1.4121857221508533e-06, "loss": 0.3119, "num_input_tokens_seen": 3312992, "step": 1540 }, { "epoch": 0.28353826390163334, "grad_norm": 23.229354858398438, "learning_rate": 1.4167737199486144e-06, "loss": 0.3319, "num_input_tokens_seen": 3323072, "step": 1545 }, { "epoch": 0.28445586346118557, "grad_norm": 31.525604248046875, "learning_rate": 1.4213617177463756e-06, "loss": 0.3284, "num_input_tokens_seen": 3332672, "step": 1550 }, { "epoch": 0.28537346302073774, "grad_norm": 40.07917785644531, "learning_rate": 1.4259497155441365e-06, "loss": 0.3306, "num_input_tokens_seen": 3344032, "step": 1555 }, { "epoch": 0.28629106258028997, "grad_norm": 50.89376449584961, "learning_rate": 1.4305377133418977e-06, "loss": 0.3573, "num_input_tokens_seen": 3354784, "step": 1560 }, { "epoch": 0.2872086621398422, "grad_norm": 36.92658996582031, "learning_rate": 1.4351257111396588e-06, "loss": 0.3504, "num_input_tokens_seen": 3364960, "step": 1565 }, { "epoch": 0.28812626169939437, "grad_norm": 43.50271987915039, "learning_rate": 1.43971370893742e-06, "loss": 0.286, "num_input_tokens_seen": 3374912, "step": 1570 }, { "epoch": 0.2890438612589466, "grad_norm": 24.795217514038086, "learning_rate": 1.444301706735181e-06, "loss": 0.3311, "num_input_tokens_seen": 3385184, "step": 1575 }, { "epoch": 0.28996146081849883, "grad_norm": 14.225390434265137, "learning_rate": 1.448889704532942e-06, "loss": 0.3354, "num_input_tokens_seen": 3395872, "step": 1580 }, { "epoch": 0.290879060378051, "grad_norm": 38.587703704833984, "learning_rate": 1.4534777023307032e-06, "loss": 0.2681, "num_input_tokens_seen": 3405920, "step": 1585 }, { "epoch": 0.29179665993760323, "grad_norm": 35.63397979736328, "learning_rate": 1.458065700128464e-06, "loss": 0.5525, "num_input_tokens_seen": 3416704, "step": 1590 }, { "epoch": 0.29271425949715546, "grad_norm": 47.584720611572266, "learning_rate": 1.462653697926225e-06, "loss": 0.3552, "num_input_tokens_seen": 3426624, "step": 1595 }, { "epoch": 0.29363185905670763, "grad_norm": 27.941390991210938, "learning_rate": 1.4672416957239862e-06, "loss": 0.3034, "num_input_tokens_seen": 3438144, "step": 1600 }, { "epoch": 0.29454945861625986, "grad_norm": 61.69209289550781, "learning_rate": 1.4718296935217472e-06, "loss": 0.4236, "num_input_tokens_seen": 3448736, "step": 1605 }, { "epoch": 0.2954670581758121, "grad_norm": 82.66265869140625, "learning_rate": 1.4764176913195083e-06, "loss": 0.456, "num_input_tokens_seen": 3460384, "step": 1610 }, { "epoch": 0.29638465773536427, "grad_norm": 64.51631927490234, "learning_rate": 1.4810056891172695e-06, "loss": 0.2952, "num_input_tokens_seen": 3472000, "step": 1615 }, { "epoch": 0.2973022572949165, "grad_norm": 85.19046020507812, "learning_rate": 1.4855936869150304e-06, "loss": 0.4192, "num_input_tokens_seen": 3481888, "step": 1620 }, { "epoch": 0.2982198568544687, "grad_norm": 39.20394515991211, "learning_rate": 1.4901816847127916e-06, "loss": 0.5724, "num_input_tokens_seen": 3493280, "step": 1625 }, { "epoch": 0.2991374564140209, "grad_norm": 45.12376022338867, "learning_rate": 1.4947696825105527e-06, "loss": 0.3643, "num_input_tokens_seen": 3503840, "step": 1630 }, { "epoch": 0.3000550559735731, "grad_norm": 24.82008934020996, "learning_rate": 1.4993576803083134e-06, "loss": 0.3016, "num_input_tokens_seen": 3514784, "step": 1635 }, { "epoch": 0.30097265553312536, "grad_norm": 27.31792449951172, "learning_rate": 1.5039456781060746e-06, "loss": 0.3312, "num_input_tokens_seen": 3523744, "step": 1640 }, { "epoch": 0.30189025509267753, "grad_norm": 17.299869537353516, "learning_rate": 1.5085336759038357e-06, "loss": 0.3796, "num_input_tokens_seen": 3533760, "step": 1645 }, { "epoch": 0.30280785465222976, "grad_norm": 38.111839294433594, "learning_rate": 1.5131216737015967e-06, "loss": 0.342, "num_input_tokens_seen": 3544416, "step": 1650 }, { "epoch": 0.303725454211782, "grad_norm": 22.549762725830078, "learning_rate": 1.5177096714993578e-06, "loss": 0.3088, "num_input_tokens_seen": 3555488, "step": 1655 }, { "epoch": 0.3046430537713342, "grad_norm": 21.668617248535156, "learning_rate": 1.522297669297119e-06, "loss": 0.2919, "num_input_tokens_seen": 3565984, "step": 1660 }, { "epoch": 0.3055606533308864, "grad_norm": 45.84638977050781, "learning_rate": 1.5268856670948797e-06, "loss": 0.4473, "num_input_tokens_seen": 3576608, "step": 1665 }, { "epoch": 0.3064782528904386, "grad_norm": 28.271493911743164, "learning_rate": 1.5314736648926408e-06, "loss": 0.3573, "num_input_tokens_seen": 3587712, "step": 1670 }, { "epoch": 0.30739585244999085, "grad_norm": 21.2509765625, "learning_rate": 1.536061662690402e-06, "loss": 0.3811, "num_input_tokens_seen": 3598144, "step": 1675 }, { "epoch": 0.308313452009543, "grad_norm": 33.151283264160156, "learning_rate": 1.5406496604881631e-06, "loss": 0.3323, "num_input_tokens_seen": 3607328, "step": 1680 }, { "epoch": 0.30923105156909525, "grad_norm": 23.77123260498047, "learning_rate": 1.545237658285924e-06, "loss": 0.3865, "num_input_tokens_seen": 3616640, "step": 1685 }, { "epoch": 0.3101486511286475, "grad_norm": 17.505643844604492, "learning_rate": 1.5498256560836852e-06, "loss": 0.3681, "num_input_tokens_seen": 3626880, "step": 1690 }, { "epoch": 0.31106625068819965, "grad_norm": 19.262348175048828, "learning_rate": 1.5544136538814464e-06, "loss": 0.4017, "num_input_tokens_seen": 3638176, "step": 1695 }, { "epoch": 0.3119838502477519, "grad_norm": 24.677757263183594, "learning_rate": 1.5590016516792073e-06, "loss": 0.3587, "num_input_tokens_seen": 3648224, "step": 1700 }, { "epoch": 0.3129014498073041, "grad_norm": 14.801255226135254, "learning_rate": 1.5635896494769685e-06, "loss": 0.3238, "num_input_tokens_seen": 3657920, "step": 1705 }, { "epoch": 0.3138190493668563, "grad_norm": 24.539127349853516, "learning_rate": 1.5681776472747296e-06, "loss": 0.3251, "num_input_tokens_seen": 3668960, "step": 1710 }, { "epoch": 0.3147366489264085, "grad_norm": 36.38299560546875, "learning_rate": 1.5727656450724903e-06, "loss": 0.3149, "num_input_tokens_seen": 3680768, "step": 1715 }, { "epoch": 0.31565424848596074, "grad_norm": 27.464420318603516, "learning_rate": 1.5773536428702515e-06, "loss": 0.2944, "num_input_tokens_seen": 3691200, "step": 1720 }, { "epoch": 0.3165718480455129, "grad_norm": 16.526203155517578, "learning_rate": 1.5819416406680126e-06, "loss": 0.3124, "num_input_tokens_seen": 3701856, "step": 1725 }, { "epoch": 0.31748944760506514, "grad_norm": 31.93816566467285, "learning_rate": 1.5865296384657736e-06, "loss": 0.3336, "num_input_tokens_seen": 3712096, "step": 1730 }, { "epoch": 0.3184070471646174, "grad_norm": 32.76673126220703, "learning_rate": 1.5911176362635347e-06, "loss": 0.4021, "num_input_tokens_seen": 3723072, "step": 1735 }, { "epoch": 0.31932464672416955, "grad_norm": 59.29141616821289, "learning_rate": 1.5957056340612959e-06, "loss": 0.4234, "num_input_tokens_seen": 3732832, "step": 1740 }, { "epoch": 0.3202422462837218, "grad_norm": 25.137483596801758, "learning_rate": 1.6002936318590568e-06, "loss": 0.3033, "num_input_tokens_seen": 3743552, "step": 1745 }, { "epoch": 0.321159845843274, "grad_norm": 20.444713592529297, "learning_rate": 1.604881629656818e-06, "loss": 0.2842, "num_input_tokens_seen": 3754208, "step": 1750 }, { "epoch": 0.3220774454028262, "grad_norm": 17.568239212036133, "learning_rate": 1.609469627454579e-06, "loss": 0.3631, "num_input_tokens_seen": 3765536, "step": 1755 }, { "epoch": 0.3229950449623784, "grad_norm": 23.47675895690918, "learning_rate": 1.6140576252523399e-06, "loss": 0.3544, "num_input_tokens_seen": 3777536, "step": 1760 }, { "epoch": 0.32391264452193064, "grad_norm": 38.485877990722656, "learning_rate": 1.618645623050101e-06, "loss": 0.2967, "num_input_tokens_seen": 3786976, "step": 1765 }, { "epoch": 0.32483024408148287, "grad_norm": 50.91438674926758, "learning_rate": 1.6232336208478622e-06, "loss": 0.3684, "num_input_tokens_seen": 3796832, "step": 1770 }, { "epoch": 0.32574784364103504, "grad_norm": 35.022064208984375, "learning_rate": 1.627821618645623e-06, "loss": 0.3995, "num_input_tokens_seen": 3806048, "step": 1775 }, { "epoch": 0.32666544320058727, "grad_norm": 30.899391174316406, "learning_rate": 1.6324096164433842e-06, "loss": 0.3345, "num_input_tokens_seen": 3816704, "step": 1780 }, { "epoch": 0.3275830427601395, "grad_norm": 40.29933166503906, "learning_rate": 1.6369976142411454e-06, "loss": 0.3392, "num_input_tokens_seen": 3826944, "step": 1785 }, { "epoch": 0.32850064231969167, "grad_norm": 27.368297576904297, "learning_rate": 1.6415856120389065e-06, "loss": 0.3879, "num_input_tokens_seen": 3836704, "step": 1790 }, { "epoch": 0.3294182418792439, "grad_norm": 27.857791900634766, "learning_rate": 1.6461736098366673e-06, "loss": 0.4075, "num_input_tokens_seen": 3847552, "step": 1795 }, { "epoch": 0.33033584143879613, "grad_norm": 18.105783462524414, "learning_rate": 1.6507616076344284e-06, "loss": 0.2933, "num_input_tokens_seen": 3858944, "step": 1800 }, { "epoch": 0.3312534409983483, "grad_norm": 37.204566955566406, "learning_rate": 1.6553496054321896e-06, "loss": 0.3317, "num_input_tokens_seen": 3869408, "step": 1805 }, { "epoch": 0.33217104055790053, "grad_norm": 33.38567352294922, "learning_rate": 1.6599376032299505e-06, "loss": 0.3269, "num_input_tokens_seen": 3880064, "step": 1810 }, { "epoch": 0.33308864011745276, "grad_norm": 16.097736358642578, "learning_rate": 1.6645256010277117e-06, "loss": 0.3765, "num_input_tokens_seen": 3891296, "step": 1815 }, { "epoch": 0.33400623967700493, "grad_norm": 34.77144241333008, "learning_rate": 1.6691135988254728e-06, "loss": 0.3675, "num_input_tokens_seen": 3903168, "step": 1820 }, { "epoch": 0.33492383923655716, "grad_norm": 52.30303955078125, "learning_rate": 1.6737015966232337e-06, "loss": 0.3749, "num_input_tokens_seen": 3913888, "step": 1825 }, { "epoch": 0.3358414387961094, "grad_norm": 21.073678970336914, "learning_rate": 1.678289594420995e-06, "loss": 0.3556, "num_input_tokens_seen": 3925120, "step": 1830 }, { "epoch": 0.33675903835566157, "grad_norm": 35.96950912475586, "learning_rate": 1.682877592218756e-06, "loss": 0.328, "num_input_tokens_seen": 3935456, "step": 1835 }, { "epoch": 0.3376766379152138, "grad_norm": 44.043724060058594, "learning_rate": 1.6874655900165168e-06, "loss": 0.441, "num_input_tokens_seen": 3946560, "step": 1840 }, { "epoch": 0.338594237474766, "grad_norm": 25.271488189697266, "learning_rate": 1.692053587814278e-06, "loss": 0.403, "num_input_tokens_seen": 3957504, "step": 1845 }, { "epoch": 0.3395118370343182, "grad_norm": 30.710695266723633, "learning_rate": 1.696641585612039e-06, "loss": 0.4062, "num_input_tokens_seen": 3969088, "step": 1850 }, { "epoch": 0.3404294365938704, "grad_norm": 33.44942092895508, "learning_rate": 1.7012295834098e-06, "loss": 0.3477, "num_input_tokens_seen": 3979648, "step": 1855 }, { "epoch": 0.34134703615342266, "grad_norm": 22.45128059387207, "learning_rate": 1.7058175812075612e-06, "loss": 0.3044, "num_input_tokens_seen": 3991168, "step": 1860 }, { "epoch": 0.3422646357129749, "grad_norm": 30.156753540039062, "learning_rate": 1.7104055790053223e-06, "loss": 0.3726, "num_input_tokens_seen": 4002464, "step": 1865 }, { "epoch": 0.34318223527252706, "grad_norm": 47.567596435546875, "learning_rate": 1.7149935768030832e-06, "loss": 0.3398, "num_input_tokens_seen": 4012544, "step": 1870 }, { "epoch": 0.3440998348320793, "grad_norm": 33.57914352416992, "learning_rate": 1.7195815746008444e-06, "loss": 0.3367, "num_input_tokens_seen": 4024160, "step": 1875 }, { "epoch": 0.3450174343916315, "grad_norm": 41.427635192871094, "learning_rate": 1.7241695723986053e-06, "loss": 0.3611, "num_input_tokens_seen": 4034976, "step": 1880 }, { "epoch": 0.3459350339511837, "grad_norm": 16.437400817871094, "learning_rate": 1.7287575701963665e-06, "loss": 0.3335, "num_input_tokens_seen": 4045792, "step": 1885 }, { "epoch": 0.3468526335107359, "grad_norm": 24.70295524597168, "learning_rate": 1.7333455679941274e-06, "loss": 0.2851, "num_input_tokens_seen": 4055744, "step": 1890 }, { "epoch": 0.34777023307028815, "grad_norm": 44.8120002746582, "learning_rate": 1.7379335657918886e-06, "loss": 0.3513, "num_input_tokens_seen": 4066048, "step": 1895 }, { "epoch": 0.3486878326298403, "grad_norm": 31.419612884521484, "learning_rate": 1.7425215635896497e-06, "loss": 0.3506, "num_input_tokens_seen": 4076000, "step": 1900 }, { "epoch": 0.34960543218939255, "grad_norm": 16.82847023010254, "learning_rate": 1.7471095613874107e-06, "loss": 0.3198, "num_input_tokens_seen": 4086240, "step": 1905 }, { "epoch": 0.3505230317489448, "grad_norm": 28.161405563354492, "learning_rate": 1.7516975591851718e-06, "loss": 0.392, "num_input_tokens_seen": 4098624, "step": 1910 }, { "epoch": 0.35144063130849695, "grad_norm": 33.383880615234375, "learning_rate": 1.756285556982933e-06, "loss": 0.378, "num_input_tokens_seen": 4109280, "step": 1915 }, { "epoch": 0.3523582308680492, "grad_norm": 31.694971084594727, "learning_rate": 1.7608735547806937e-06, "loss": 0.5054, "num_input_tokens_seen": 4121056, "step": 1920 }, { "epoch": 0.3532758304276014, "grad_norm": 14.1510591506958, "learning_rate": 1.7654615525784548e-06, "loss": 0.4022, "num_input_tokens_seen": 4132320, "step": 1925 }, { "epoch": 0.3541934299871536, "grad_norm": 25.136587142944336, "learning_rate": 1.770049550376216e-06, "loss": 0.3819, "num_input_tokens_seen": 4142912, "step": 1930 }, { "epoch": 0.3551110295467058, "grad_norm": 17.054824829101562, "learning_rate": 1.774637548173977e-06, "loss": 0.3463, "num_input_tokens_seen": 4153408, "step": 1935 }, { "epoch": 0.35602862910625804, "grad_norm": 30.61651039123535, "learning_rate": 1.779225545971738e-06, "loss": 0.3459, "num_input_tokens_seen": 4163840, "step": 1940 }, { "epoch": 0.3569462286658102, "grad_norm": 17.1136417388916, "learning_rate": 1.7838135437694992e-06, "loss": 0.3227, "num_input_tokens_seen": 4174432, "step": 1945 }, { "epoch": 0.35786382822536245, "grad_norm": 15.819372177124023, "learning_rate": 1.7884015415672602e-06, "loss": 0.3562, "num_input_tokens_seen": 4186048, "step": 1950 }, { "epoch": 0.3587814277849147, "grad_norm": 15.03344440460205, "learning_rate": 1.7929895393650213e-06, "loss": 0.3315, "num_input_tokens_seen": 4196800, "step": 1955 }, { "epoch": 0.35969902734446685, "grad_norm": 14.697476387023926, "learning_rate": 1.7975775371627825e-06, "loss": 0.3255, "num_input_tokens_seen": 4207488, "step": 1960 }, { "epoch": 0.3606166269040191, "grad_norm": 30.110902786254883, "learning_rate": 1.8021655349605432e-06, "loss": 0.3202, "num_input_tokens_seen": 4219744, "step": 1965 }, { "epoch": 0.3615342264635713, "grad_norm": 14.255681037902832, "learning_rate": 1.8067535327583043e-06, "loss": 0.2864, "num_input_tokens_seen": 4231488, "step": 1970 }, { "epoch": 0.36245182602312354, "grad_norm": 40.70220184326172, "learning_rate": 1.8113415305560655e-06, "loss": 0.4266, "num_input_tokens_seen": 4242144, "step": 1975 }, { "epoch": 0.3633694255826757, "grad_norm": 38.1359977722168, "learning_rate": 1.8159295283538264e-06, "loss": 0.4535, "num_input_tokens_seen": 4253824, "step": 1980 }, { "epoch": 0.36428702514222794, "grad_norm": 31.17839241027832, "learning_rate": 1.8205175261515876e-06, "loss": 0.3337, "num_input_tokens_seen": 4263424, "step": 1985 }, { "epoch": 0.36520462470178017, "grad_norm": 44.15243911743164, "learning_rate": 1.8251055239493487e-06, "loss": 0.3422, "num_input_tokens_seen": 4274016, "step": 1990 }, { "epoch": 0.36612222426133234, "grad_norm": 41.11705780029297, "learning_rate": 1.8296935217471099e-06, "loss": 0.3766, "num_input_tokens_seen": 4284352, "step": 1995 }, { "epoch": 0.36703982382088457, "grad_norm": 30.37810516357422, "learning_rate": 1.8342815195448708e-06, "loss": 0.3733, "num_input_tokens_seen": 4293824, "step": 2000 }, { "epoch": 0.3679574233804368, "grad_norm": 34.29375457763672, "learning_rate": 1.8388695173426318e-06, "loss": 0.3005, "num_input_tokens_seen": 4303680, "step": 2005 }, { "epoch": 0.368875022939989, "grad_norm": 36.486473083496094, "learning_rate": 1.843457515140393e-06, "loss": 0.3031, "num_input_tokens_seen": 4315264, "step": 2010 }, { "epoch": 0.3697926224995412, "grad_norm": 59.92923355102539, "learning_rate": 1.8480455129381538e-06, "loss": 0.4327, "num_input_tokens_seen": 4326848, "step": 2015 }, { "epoch": 0.37071022205909343, "grad_norm": 46.990379333496094, "learning_rate": 1.852633510735915e-06, "loss": 0.4361, "num_input_tokens_seen": 4338912, "step": 2020 }, { "epoch": 0.3716278216186456, "grad_norm": 21.3079776763916, "learning_rate": 1.8572215085336761e-06, "loss": 0.3726, "num_input_tokens_seen": 4349696, "step": 2025 }, { "epoch": 0.37254542117819783, "grad_norm": 56.27289962768555, "learning_rate": 1.861809506331437e-06, "loss": 0.4333, "num_input_tokens_seen": 4358880, "step": 2030 }, { "epoch": 0.37346302073775006, "grad_norm": 26.389333724975586, "learning_rate": 1.8663975041291982e-06, "loss": 0.3676, "num_input_tokens_seen": 4368160, "step": 2035 }, { "epoch": 0.37438062029730224, "grad_norm": 59.38691329956055, "learning_rate": 1.8709855019269594e-06, "loss": 0.291, "num_input_tokens_seen": 4379936, "step": 2040 }, { "epoch": 0.37529821985685446, "grad_norm": 36.21766662597656, "learning_rate": 1.8755734997247201e-06, "loss": 0.3697, "num_input_tokens_seen": 4390912, "step": 2045 }, { "epoch": 0.3762158194164067, "grad_norm": 28.78647804260254, "learning_rate": 1.8801614975224813e-06, "loss": 0.3082, "num_input_tokens_seen": 4401696, "step": 2050 }, { "epoch": 0.37713341897595887, "grad_norm": 42.220611572265625, "learning_rate": 1.8847494953202424e-06, "loss": 0.4132, "num_input_tokens_seen": 4413056, "step": 2055 }, { "epoch": 0.3780510185355111, "grad_norm": 18.32174301147461, "learning_rate": 1.8893374931180034e-06, "loss": 0.3533, "num_input_tokens_seen": 4424544, "step": 2060 }, { "epoch": 0.3789686180950633, "grad_norm": 23.58198356628418, "learning_rate": 1.8939254909157645e-06, "loss": 0.3421, "num_input_tokens_seen": 4434976, "step": 2065 }, { "epoch": 0.3798862176546155, "grad_norm": 30.355798721313477, "learning_rate": 1.8985134887135257e-06, "loss": 0.3179, "num_input_tokens_seen": 4446176, "step": 2070 }, { "epoch": 0.38080381721416773, "grad_norm": 23.763824462890625, "learning_rate": 1.9031014865112866e-06, "loss": 0.4002, "num_input_tokens_seen": 4456608, "step": 2075 }, { "epoch": 0.38172141677371996, "grad_norm": 26.11985206604004, "learning_rate": 1.9076894843090475e-06, "loss": 0.3373, "num_input_tokens_seen": 4466912, "step": 2080 }, { "epoch": 0.3826390163332722, "grad_norm": 20.34716033935547, "learning_rate": 1.912277482106809e-06, "loss": 0.3174, "num_input_tokens_seen": 4477632, "step": 2085 }, { "epoch": 0.38355661589282436, "grad_norm": 33.755401611328125, "learning_rate": 1.91686547990457e-06, "loss": 0.3451, "num_input_tokens_seen": 4489056, "step": 2090 }, { "epoch": 0.3844742154523766, "grad_norm": 44.10348129272461, "learning_rate": 1.9214534777023308e-06, "loss": 0.3387, "num_input_tokens_seen": 4499488, "step": 2095 }, { "epoch": 0.3853918150119288, "grad_norm": 19.073314666748047, "learning_rate": 1.926041475500092e-06, "loss": 0.3617, "num_input_tokens_seen": 4509888, "step": 2100 }, { "epoch": 0.386309414571481, "grad_norm": 14.323580741882324, "learning_rate": 1.930629473297853e-06, "loss": 0.308, "num_input_tokens_seen": 4520000, "step": 2105 }, { "epoch": 0.3872270141310332, "grad_norm": 15.0672607421875, "learning_rate": 1.935217471095614e-06, "loss": 0.3077, "num_input_tokens_seen": 4530880, "step": 2110 }, { "epoch": 0.38814461369058545, "grad_norm": 17.25522804260254, "learning_rate": 1.939805468893375e-06, "loss": 0.2663, "num_input_tokens_seen": 4541536, "step": 2115 }, { "epoch": 0.3890622132501376, "grad_norm": 26.19532585144043, "learning_rate": 1.9443934666911363e-06, "loss": 0.3869, "num_input_tokens_seen": 4553440, "step": 2120 }, { "epoch": 0.38997981280968985, "grad_norm": 22.379716873168945, "learning_rate": 1.9489814644888972e-06, "loss": 0.4625, "num_input_tokens_seen": 4564480, "step": 2125 }, { "epoch": 0.3908974123692421, "grad_norm": 75.81175231933594, "learning_rate": 1.953569462286658e-06, "loss": 0.3723, "num_input_tokens_seen": 4574272, "step": 2130 }, { "epoch": 0.39181501192879425, "grad_norm": 21.50809097290039, "learning_rate": 1.9581574600844195e-06, "loss": 0.4349, "num_input_tokens_seen": 4584928, "step": 2135 }, { "epoch": 0.3927326114883465, "grad_norm": 77.0367202758789, "learning_rate": 1.9627454578821805e-06, "loss": 0.37, "num_input_tokens_seen": 4595008, "step": 2140 }, { "epoch": 0.3936502110478987, "grad_norm": 26.812971115112305, "learning_rate": 1.9673334556799414e-06, "loss": 0.2834, "num_input_tokens_seen": 4605696, "step": 2145 }, { "epoch": 0.3945678106074509, "grad_norm": 44.71448516845703, "learning_rate": 1.9719214534777028e-06, "loss": 0.2723, "num_input_tokens_seen": 4616480, "step": 2150 }, { "epoch": 0.3954854101670031, "grad_norm": 21.275962829589844, "learning_rate": 1.9765094512754633e-06, "loss": 0.5747, "num_input_tokens_seen": 4627520, "step": 2155 }, { "epoch": 0.39640300972655534, "grad_norm": 28.825992584228516, "learning_rate": 1.9810974490732247e-06, "loss": 0.4299, "num_input_tokens_seen": 4639072, "step": 2160 }, { "epoch": 0.3973206092861075, "grad_norm": 20.19837188720703, "learning_rate": 1.9856854468709856e-06, "loss": 0.372, "num_input_tokens_seen": 4649440, "step": 2165 }, { "epoch": 0.39823820884565975, "grad_norm": 18.509647369384766, "learning_rate": 1.9902734446687465e-06, "loss": 0.3516, "num_input_tokens_seen": 4660128, "step": 2170 }, { "epoch": 0.399155808405212, "grad_norm": 17.340805053710938, "learning_rate": 1.994861442466508e-06, "loss": 0.343, "num_input_tokens_seen": 4671616, "step": 2175 }, { "epoch": 0.40007340796476415, "grad_norm": 32.92369079589844, "learning_rate": 1.999449440264269e-06, "loss": 0.3402, "num_input_tokens_seen": 4682816, "step": 2180 }, { "epoch": 0.4009910075243164, "grad_norm": 46.55028533935547, "learning_rate": 2.0040374380620298e-06, "loss": 0.3865, "num_input_tokens_seen": 4694976, "step": 2185 }, { "epoch": 0.4019086070838686, "grad_norm": 25.60576820373535, "learning_rate": 2.0086254358597907e-06, "loss": 0.3448, "num_input_tokens_seen": 4704544, "step": 2190 }, { "epoch": 0.40282620664342084, "grad_norm": 18.469703674316406, "learning_rate": 2.013213433657552e-06, "loss": 0.3085, "num_input_tokens_seen": 4716480, "step": 2195 }, { "epoch": 0.403743806202973, "grad_norm": 16.505130767822266, "learning_rate": 2.017801431455313e-06, "loss": 0.3856, "num_input_tokens_seen": 4727552, "step": 2200 }, { "epoch": 0.40466140576252524, "grad_norm": 22.76926040649414, "learning_rate": 2.022389429253074e-06, "loss": 0.3844, "num_input_tokens_seen": 4738240, "step": 2205 }, { "epoch": 0.40557900532207747, "grad_norm": 19.339984893798828, "learning_rate": 2.0269774270508353e-06, "loss": 0.3125, "num_input_tokens_seen": 4747680, "step": 2210 }, { "epoch": 0.40649660488162964, "grad_norm": 31.060274124145508, "learning_rate": 2.0315654248485962e-06, "loss": 0.3666, "num_input_tokens_seen": 4757536, "step": 2215 }, { "epoch": 0.40741420444118187, "grad_norm": 15.241979598999023, "learning_rate": 2.036153422646357e-06, "loss": 0.3633, "num_input_tokens_seen": 4767744, "step": 2220 }, { "epoch": 0.4083318040007341, "grad_norm": 19.36517906188965, "learning_rate": 2.0407414204441185e-06, "loss": 0.3713, "num_input_tokens_seen": 4778784, "step": 2225 }, { "epoch": 0.4092494035602863, "grad_norm": 31.874258041381836, "learning_rate": 2.0453294182418795e-06, "loss": 0.3816, "num_input_tokens_seen": 4789888, "step": 2230 }, { "epoch": 0.4101670031198385, "grad_norm": 11.006619453430176, "learning_rate": 2.0499174160396404e-06, "loss": 0.3783, "num_input_tokens_seen": 4799968, "step": 2235 }, { "epoch": 0.41108460267939073, "grad_norm": 10.651599884033203, "learning_rate": 2.0545054138374014e-06, "loss": 0.3353, "num_input_tokens_seen": 4810912, "step": 2240 }, { "epoch": 0.4120022022389429, "grad_norm": 45.60321044921875, "learning_rate": 2.0590934116351627e-06, "loss": 0.3918, "num_input_tokens_seen": 4822144, "step": 2245 }, { "epoch": 0.41291980179849513, "grad_norm": 15.442235946655273, "learning_rate": 2.0636814094329237e-06, "loss": 0.3508, "num_input_tokens_seen": 4831552, "step": 2250 }, { "epoch": 0.41383740135804736, "grad_norm": 16.28531837463379, "learning_rate": 2.0682694072306846e-06, "loss": 0.3572, "num_input_tokens_seen": 4842720, "step": 2255 }, { "epoch": 0.41475500091759954, "grad_norm": 18.362293243408203, "learning_rate": 2.072857405028446e-06, "loss": 0.3567, "num_input_tokens_seen": 4853600, "step": 2260 }, { "epoch": 0.41567260047715177, "grad_norm": 21.820310592651367, "learning_rate": 2.077445402826207e-06, "loss": 0.3842, "num_input_tokens_seen": 4863712, "step": 2265 }, { "epoch": 0.416590200036704, "grad_norm": 25.701387405395508, "learning_rate": 2.082033400623968e-06, "loss": 0.2908, "num_input_tokens_seen": 4873920, "step": 2270 }, { "epoch": 0.41750779959625617, "grad_norm": 48.246826171875, "learning_rate": 2.086621398421729e-06, "loss": 0.404, "num_input_tokens_seen": 4885376, "step": 2275 }, { "epoch": 0.4184253991558084, "grad_norm": 15.704812049865723, "learning_rate": 2.0912093962194897e-06, "loss": 0.2923, "num_input_tokens_seen": 4895488, "step": 2280 }, { "epoch": 0.4193429987153606, "grad_norm": 20.68638038635254, "learning_rate": 2.095797394017251e-06, "loss": 0.3945, "num_input_tokens_seen": 4906560, "step": 2285 }, { "epoch": 0.42026059827491286, "grad_norm": 27.61456298828125, "learning_rate": 2.100385391815012e-06, "loss": 0.342, "num_input_tokens_seen": 4917728, "step": 2290 }, { "epoch": 0.42117819783446503, "grad_norm": 25.8890380859375, "learning_rate": 2.104973389612773e-06, "loss": 0.3345, "num_input_tokens_seen": 4927776, "step": 2295 }, { "epoch": 0.42209579739401726, "grad_norm": 39.110774993896484, "learning_rate": 2.1095613874105343e-06, "loss": 0.363, "num_input_tokens_seen": 4939232, "step": 2300 }, { "epoch": 0.4230133969535695, "grad_norm": 20.50452423095703, "learning_rate": 2.1141493852082953e-06, "loss": 0.3488, "num_input_tokens_seen": 4950464, "step": 2305 }, { "epoch": 0.42393099651312166, "grad_norm": 16.37712287902832, "learning_rate": 2.118737383006056e-06, "loss": 0.4226, "num_input_tokens_seen": 4960064, "step": 2310 }, { "epoch": 0.4248485960726739, "grad_norm": 19.589035034179688, "learning_rate": 2.123325380803817e-06, "loss": 0.2963, "num_input_tokens_seen": 4970336, "step": 2315 }, { "epoch": 0.4257661956322261, "grad_norm": 40.516021728515625, "learning_rate": 2.1279133786015785e-06, "loss": 0.3772, "num_input_tokens_seen": 4981568, "step": 2320 }, { "epoch": 0.4266837951917783, "grad_norm": 25.034048080444336, "learning_rate": 2.1325013763993394e-06, "loss": 0.4643, "num_input_tokens_seen": 4992320, "step": 2325 }, { "epoch": 0.4276013947513305, "grad_norm": 28.36398696899414, "learning_rate": 2.1370893741971004e-06, "loss": 0.3109, "num_input_tokens_seen": 5003840, "step": 2330 }, { "epoch": 0.42851899431088275, "grad_norm": 16.099210739135742, "learning_rate": 2.1416773719948617e-06, "loss": 0.3032, "num_input_tokens_seen": 5014208, "step": 2335 }, { "epoch": 0.4294365938704349, "grad_norm": 11.395024299621582, "learning_rate": 2.1462653697926227e-06, "loss": 0.3983, "num_input_tokens_seen": 5025536, "step": 2340 }, { "epoch": 0.43035419342998715, "grad_norm": 26.353595733642578, "learning_rate": 2.1508533675903836e-06, "loss": 0.3629, "num_input_tokens_seen": 5036000, "step": 2345 }, { "epoch": 0.4312717929895394, "grad_norm": 25.51546859741211, "learning_rate": 2.155441365388145e-06, "loss": 0.3739, "num_input_tokens_seen": 5046272, "step": 2350 }, { "epoch": 0.43218939254909156, "grad_norm": 33.83680725097656, "learning_rate": 2.160029363185906e-06, "loss": 0.3177, "num_input_tokens_seen": 5057664, "step": 2355 }, { "epoch": 0.4331069921086438, "grad_norm": 25.6871395111084, "learning_rate": 2.164617360983667e-06, "loss": 0.2955, "num_input_tokens_seen": 5068576, "step": 2360 }, { "epoch": 0.434024591668196, "grad_norm": 19.015857696533203, "learning_rate": 2.1692053587814278e-06, "loss": 0.327, "num_input_tokens_seen": 5079392, "step": 2365 }, { "epoch": 0.4349421912277482, "grad_norm": 18.539518356323242, "learning_rate": 2.173793356579189e-06, "loss": 0.3305, "num_input_tokens_seen": 5090560, "step": 2370 }, { "epoch": 0.4358597907873004, "grad_norm": 54.83269119262695, "learning_rate": 2.17838135437695e-06, "loss": 0.3984, "num_input_tokens_seen": 5101088, "step": 2375 }, { "epoch": 0.43677739034685265, "grad_norm": 22.61105728149414, "learning_rate": 2.182969352174711e-06, "loss": 0.4946, "num_input_tokens_seen": 5111104, "step": 2380 }, { "epoch": 0.4376949899064048, "grad_norm": 33.210182189941406, "learning_rate": 2.1875573499724724e-06, "loss": 0.338, "num_input_tokens_seen": 5121344, "step": 2385 }, { "epoch": 0.43861258946595705, "grad_norm": 13.592948913574219, "learning_rate": 2.1921453477702333e-06, "loss": 0.2788, "num_input_tokens_seen": 5133088, "step": 2390 }, { "epoch": 0.4395301890255093, "grad_norm": 42.8021354675293, "learning_rate": 2.1967333455679943e-06, "loss": 0.4053, "num_input_tokens_seen": 5143520, "step": 2395 }, { "epoch": 0.4404477885850615, "grad_norm": 32.31578063964844, "learning_rate": 2.2013213433657556e-06, "loss": 0.2993, "num_input_tokens_seen": 5154080, "step": 2400 }, { "epoch": 0.4413653881446137, "grad_norm": 28.788536071777344, "learning_rate": 2.205909341163516e-06, "loss": 0.3326, "num_input_tokens_seen": 5164864, "step": 2405 }, { "epoch": 0.4422829877041659, "grad_norm": 17.309932708740234, "learning_rate": 2.2104973389612775e-06, "loss": 0.3435, "num_input_tokens_seen": 5176864, "step": 2410 }, { "epoch": 0.44320058726371814, "grad_norm": 25.20652961730957, "learning_rate": 2.2150853367590384e-06, "loss": 0.3522, "num_input_tokens_seen": 5188000, "step": 2415 }, { "epoch": 0.4441181868232703, "grad_norm": 34.27097702026367, "learning_rate": 2.2196733345568e-06, "loss": 0.3449, "num_input_tokens_seen": 5198944, "step": 2420 }, { "epoch": 0.44503578638282254, "grad_norm": 25.383337020874023, "learning_rate": 2.2242613323545607e-06, "loss": 0.2918, "num_input_tokens_seen": 5210176, "step": 2425 }, { "epoch": 0.44595338594237477, "grad_norm": 17.90848159790039, "learning_rate": 2.2288493301523217e-06, "loss": 0.3565, "num_input_tokens_seen": 5220864, "step": 2430 }, { "epoch": 0.44687098550192694, "grad_norm": 59.19816970825195, "learning_rate": 2.233437327950083e-06, "loss": 0.3859, "num_input_tokens_seen": 5231488, "step": 2435 }, { "epoch": 0.44778858506147917, "grad_norm": 18.06940460205078, "learning_rate": 2.2380253257478436e-06, "loss": 0.4101, "num_input_tokens_seen": 5241440, "step": 2440 }, { "epoch": 0.4487061846210314, "grad_norm": 17.334548950195312, "learning_rate": 2.242613323545605e-06, "loss": 0.3776, "num_input_tokens_seen": 5252608, "step": 2445 }, { "epoch": 0.4496237841805836, "grad_norm": 20.363964080810547, "learning_rate": 2.247201321343366e-06, "loss": 0.3514, "num_input_tokens_seen": 5264064, "step": 2450 }, { "epoch": 0.4505413837401358, "grad_norm": 22.020322799682617, "learning_rate": 2.251789319141127e-06, "loss": 0.3913, "num_input_tokens_seen": 5276864, "step": 2455 }, { "epoch": 0.45145898329968803, "grad_norm": 21.334707260131836, "learning_rate": 2.256377316938888e-06, "loss": 0.3221, "num_input_tokens_seen": 5287264, "step": 2460 }, { "epoch": 0.4523765828592402, "grad_norm": 21.314367294311523, "learning_rate": 2.260965314736649e-06, "loss": 0.3621, "num_input_tokens_seen": 5296768, "step": 2465 }, { "epoch": 0.45329418241879244, "grad_norm": 12.705357551574707, "learning_rate": 2.26555331253441e-06, "loss": 0.2909, "num_input_tokens_seen": 5307680, "step": 2470 }, { "epoch": 0.45421178197834466, "grad_norm": 16.0123348236084, "learning_rate": 2.2701413103321714e-06, "loss": 0.3616, "num_input_tokens_seen": 5318176, "step": 2475 }, { "epoch": 0.45512938153789684, "grad_norm": 24.170103073120117, "learning_rate": 2.2747293081299323e-06, "loss": 0.4324, "num_input_tokens_seen": 5328704, "step": 2480 }, { "epoch": 0.45604698109744907, "grad_norm": 25.499282836914062, "learning_rate": 2.2793173059276933e-06, "loss": 0.403, "num_input_tokens_seen": 5339392, "step": 2485 }, { "epoch": 0.4569645806570013, "grad_norm": 12.859582901000977, "learning_rate": 2.283905303725454e-06, "loss": 0.3473, "num_input_tokens_seen": 5350368, "step": 2490 }, { "epoch": 0.45788218021655347, "grad_norm": 18.373668670654297, "learning_rate": 2.2884933015232156e-06, "loss": 0.3432, "num_input_tokens_seen": 5360704, "step": 2495 }, { "epoch": 0.4587997797761057, "grad_norm": 13.0654878616333, "learning_rate": 2.2930812993209765e-06, "loss": 0.3213, "num_input_tokens_seen": 5371264, "step": 2500 }, { "epoch": 0.4597173793356579, "grad_norm": 17.583450317382812, "learning_rate": 2.2976692971187374e-06, "loss": 0.3411, "num_input_tokens_seen": 5381760, "step": 2505 }, { "epoch": 0.46063497889521016, "grad_norm": 12.199898719787598, "learning_rate": 2.302257294916499e-06, "loss": 0.3184, "num_input_tokens_seen": 5393120, "step": 2510 }, { "epoch": 0.46155257845476233, "grad_norm": 12.230085372924805, "learning_rate": 2.3068452927142597e-06, "loss": 0.3741, "num_input_tokens_seen": 5404320, "step": 2515 }, { "epoch": 0.46247017801431456, "grad_norm": 14.666898727416992, "learning_rate": 2.3114332905120207e-06, "loss": 0.355, "num_input_tokens_seen": 5415712, "step": 2520 }, { "epoch": 0.4633877775738668, "grad_norm": 8.519980430603027, "learning_rate": 2.316021288309782e-06, "loss": 0.3572, "num_input_tokens_seen": 5426912, "step": 2525 }, { "epoch": 0.46430537713341896, "grad_norm": 30.687911987304688, "learning_rate": 2.320609286107543e-06, "loss": 0.3772, "num_input_tokens_seen": 5438144, "step": 2530 }, { "epoch": 0.4652229766929712, "grad_norm": 29.13631820678711, "learning_rate": 2.325197283905304e-06, "loss": 0.3795, "num_input_tokens_seen": 5448416, "step": 2535 }, { "epoch": 0.4661405762525234, "grad_norm": 13.803494453430176, "learning_rate": 2.329785281703065e-06, "loss": 0.2891, "num_input_tokens_seen": 5459456, "step": 2540 }, { "epoch": 0.4670581758120756, "grad_norm": 47.261837005615234, "learning_rate": 2.3343732795008262e-06, "loss": 0.3226, "num_input_tokens_seen": 5469344, "step": 2545 }, { "epoch": 0.4679757753716278, "grad_norm": 44.08089828491211, "learning_rate": 2.338961277298587e-06, "loss": 0.4947, "num_input_tokens_seen": 5478944, "step": 2550 }, { "epoch": 0.46889337493118005, "grad_norm": 14.116663932800293, "learning_rate": 2.343549275096348e-06, "loss": 0.3825, "num_input_tokens_seen": 5487392, "step": 2555 }, { "epoch": 0.4698109744907322, "grad_norm": 22.04372787475586, "learning_rate": 2.3481372728941095e-06, "loss": 0.3386, "num_input_tokens_seen": 5498240, "step": 2560 }, { "epoch": 0.47072857405028445, "grad_norm": 11.668913841247559, "learning_rate": 2.35272527069187e-06, "loss": 0.3195, "num_input_tokens_seen": 5508768, "step": 2565 }, { "epoch": 0.4716461736098367, "grad_norm": 13.394247055053711, "learning_rate": 2.3573132684896313e-06, "loss": 0.3598, "num_input_tokens_seen": 5519936, "step": 2570 }, { "epoch": 0.47256377316938886, "grad_norm": 12.982272148132324, "learning_rate": 2.3619012662873923e-06, "loss": 0.3196, "num_input_tokens_seen": 5531136, "step": 2575 }, { "epoch": 0.4734813727289411, "grad_norm": 24.61566162109375, "learning_rate": 2.3664892640851532e-06, "loss": 0.332, "num_input_tokens_seen": 5542816, "step": 2580 }, { "epoch": 0.4743989722884933, "grad_norm": 12.123361587524414, "learning_rate": 2.3710772618829146e-06, "loss": 0.311, "num_input_tokens_seen": 5552320, "step": 2585 }, { "epoch": 0.4753165718480455, "grad_norm": 15.248634338378906, "learning_rate": 2.3756652596806755e-06, "loss": 0.3672, "num_input_tokens_seen": 5563040, "step": 2590 }, { "epoch": 0.4762341714075977, "grad_norm": 47.3591423034668, "learning_rate": 2.3802532574784365e-06, "loss": 0.3361, "num_input_tokens_seen": 5574400, "step": 2595 }, { "epoch": 0.47715177096714995, "grad_norm": 27.39943504333496, "learning_rate": 2.384841255276198e-06, "loss": 0.4158, "num_input_tokens_seen": 5585984, "step": 2600 }, { "epoch": 0.4780693705267021, "grad_norm": 13.378046035766602, "learning_rate": 2.3894292530739588e-06, "loss": 0.3507, "num_input_tokens_seen": 5597024, "step": 2605 }, { "epoch": 0.47898697008625435, "grad_norm": 17.795629501342773, "learning_rate": 2.3940172508717197e-06, "loss": 0.346, "num_input_tokens_seen": 5607328, "step": 2610 }, { "epoch": 0.4799045696458066, "grad_norm": 12.244285583496094, "learning_rate": 2.3986052486694806e-06, "loss": 0.3201, "num_input_tokens_seen": 5618656, "step": 2615 }, { "epoch": 0.4808221692053588, "grad_norm": 10.72674560546875, "learning_rate": 2.403193246467242e-06, "loss": 0.2915, "num_input_tokens_seen": 5629088, "step": 2620 }, { "epoch": 0.481739768764911, "grad_norm": 31.058544158935547, "learning_rate": 2.407781244265003e-06, "loss": 0.3152, "num_input_tokens_seen": 5640224, "step": 2625 }, { "epoch": 0.4826573683244632, "grad_norm": 16.496997833251953, "learning_rate": 2.412369242062764e-06, "loss": 0.3262, "num_input_tokens_seen": 5650880, "step": 2630 }, { "epoch": 0.48357496788401544, "grad_norm": 20.80038833618164, "learning_rate": 2.4169572398605252e-06, "loss": 0.2831, "num_input_tokens_seen": 5662624, "step": 2635 }, { "epoch": 0.4844925674435676, "grad_norm": 15.555756568908691, "learning_rate": 2.421545237658286e-06, "loss": 0.3001, "num_input_tokens_seen": 5673376, "step": 2640 }, { "epoch": 0.48541016700311984, "grad_norm": 10.715004920959473, "learning_rate": 2.426133235456047e-06, "loss": 0.2058, "num_input_tokens_seen": 5684768, "step": 2645 }, { "epoch": 0.48632776656267207, "grad_norm": 18.81361198425293, "learning_rate": 2.4307212332538085e-06, "loss": 0.3268, "num_input_tokens_seen": 5695008, "step": 2650 }, { "epoch": 0.48724536612222424, "grad_norm": 79.9828872680664, "learning_rate": 2.4353092310515694e-06, "loss": 0.5992, "num_input_tokens_seen": 5706208, "step": 2655 }, { "epoch": 0.4881629656817765, "grad_norm": 37.850032806396484, "learning_rate": 2.4398972288493303e-06, "loss": 0.3229, "num_input_tokens_seen": 5716544, "step": 2660 }, { "epoch": 0.4890805652413287, "grad_norm": 29.43597984313965, "learning_rate": 2.4444852266470913e-06, "loss": 0.3182, "num_input_tokens_seen": 5726624, "step": 2665 }, { "epoch": 0.4899981648008809, "grad_norm": 13.496642112731934, "learning_rate": 2.4490732244448526e-06, "loss": 0.3093, "num_input_tokens_seen": 5737952, "step": 2670 }, { "epoch": 0.4909157643604331, "grad_norm": 27.31941032409668, "learning_rate": 2.4536612222426136e-06, "loss": 0.3452, "num_input_tokens_seen": 5747904, "step": 2675 }, { "epoch": 0.49183336391998533, "grad_norm": 17.81527328491211, "learning_rate": 2.4582492200403745e-06, "loss": 0.318, "num_input_tokens_seen": 5758784, "step": 2680 }, { "epoch": 0.4927509634795375, "grad_norm": 57.98853302001953, "learning_rate": 2.462837217838136e-06, "loss": 0.3686, "num_input_tokens_seen": 5768736, "step": 2685 }, { "epoch": 0.49366856303908974, "grad_norm": 18.125003814697266, "learning_rate": 2.4674252156358964e-06, "loss": 0.3835, "num_input_tokens_seen": 5780352, "step": 2690 }, { "epoch": 0.49458616259864197, "grad_norm": 25.08787727355957, "learning_rate": 2.4720132134336578e-06, "loss": 0.4792, "num_input_tokens_seen": 5791936, "step": 2695 }, { "epoch": 0.49550376215819414, "grad_norm": 35.281494140625, "learning_rate": 2.4766012112314187e-06, "loss": 0.3202, "num_input_tokens_seen": 5801632, "step": 2700 }, { "epoch": 0.49642136171774637, "grad_norm": 17.233734130859375, "learning_rate": 2.4811892090291796e-06, "loss": 0.3261, "num_input_tokens_seen": 5813184, "step": 2705 }, { "epoch": 0.4973389612772986, "grad_norm": 21.576065063476562, "learning_rate": 2.485777206826941e-06, "loss": 0.4396, "num_input_tokens_seen": 5823552, "step": 2710 }, { "epoch": 0.4982565608368508, "grad_norm": 11.880083084106445, "learning_rate": 2.490365204624702e-06, "loss": 0.3517, "num_input_tokens_seen": 5834880, "step": 2715 }, { "epoch": 0.499174160396403, "grad_norm": 14.485062599182129, "learning_rate": 2.494953202422463e-06, "loss": 0.3487, "num_input_tokens_seen": 5845088, "step": 2720 }, { "epoch": 0.5000917599559552, "grad_norm": 13.081591606140137, "learning_rate": 2.4995412002202242e-06, "loss": 0.3518, "num_input_tokens_seen": 5856928, "step": 2725 }, { "epoch": 0.5010093595155074, "grad_norm": 21.721940994262695, "learning_rate": 2.5041291980179848e-06, "loss": 0.3305, "num_input_tokens_seen": 5868064, "step": 2730 }, { "epoch": 0.5019269590750597, "grad_norm": 9.453421592712402, "learning_rate": 2.5087171958157465e-06, "loss": 0.3868, "num_input_tokens_seen": 5878656, "step": 2735 }, { "epoch": 0.5028445586346119, "grad_norm": 26.704601287841797, "learning_rate": 2.513305193613507e-06, "loss": 0.383, "num_input_tokens_seen": 5889024, "step": 2740 }, { "epoch": 0.503762158194164, "grad_norm": 13.854019165039062, "learning_rate": 2.517893191411268e-06, "loss": 0.3198, "num_input_tokens_seen": 5900160, "step": 2745 }, { "epoch": 0.5046797577537163, "grad_norm": 10.239603042602539, "learning_rate": 2.5224811892090294e-06, "loss": 0.3313, "num_input_tokens_seen": 5910432, "step": 2750 }, { "epoch": 0.5055973573132685, "grad_norm": 19.85295295715332, "learning_rate": 2.5270691870067903e-06, "loss": 0.3762, "num_input_tokens_seen": 5921152, "step": 2755 }, { "epoch": 0.5065149568728207, "grad_norm": 13.182622909545898, "learning_rate": 2.5316571848045512e-06, "loss": 0.3863, "num_input_tokens_seen": 5932384, "step": 2760 }, { "epoch": 0.507432556432373, "grad_norm": 23.641849517822266, "learning_rate": 2.5362451826023126e-06, "loss": 0.3401, "num_input_tokens_seen": 5942720, "step": 2765 }, { "epoch": 0.5083501559919251, "grad_norm": 17.75780487060547, "learning_rate": 2.5408331804000735e-06, "loss": 0.372, "num_input_tokens_seen": 5952864, "step": 2770 }, { "epoch": 0.5092677555514773, "grad_norm": 15.087646484375, "learning_rate": 2.5454211781978345e-06, "loss": 0.3414, "num_input_tokens_seen": 5963776, "step": 2775 }, { "epoch": 0.5101853551110296, "grad_norm": 11.09009838104248, "learning_rate": 2.550009175995596e-06, "loss": 0.3113, "num_input_tokens_seen": 5973184, "step": 2780 }, { "epoch": 0.5111029546705818, "grad_norm": 12.527292251586914, "learning_rate": 2.5545971737933568e-06, "loss": 0.328, "num_input_tokens_seen": 5984000, "step": 2785 }, { "epoch": 0.5120205542301339, "grad_norm": 11.742023468017578, "learning_rate": 2.559185171591118e-06, "loss": 0.333, "num_input_tokens_seen": 5995424, "step": 2790 }, { "epoch": 0.5129381537896862, "grad_norm": 10.126466751098633, "learning_rate": 2.563773169388879e-06, "loss": 0.3697, "num_input_tokens_seen": 6006368, "step": 2795 }, { "epoch": 0.5138557533492384, "grad_norm": 19.350296020507812, "learning_rate": 2.56836116718664e-06, "loss": 0.3571, "num_input_tokens_seen": 6017184, "step": 2800 }, { "epoch": 0.5147733529087906, "grad_norm": 20.077335357666016, "learning_rate": 2.5729491649844014e-06, "loss": 0.3601, "num_input_tokens_seen": 6027168, "step": 2805 }, { "epoch": 0.5156909524683428, "grad_norm": 23.905410766601562, "learning_rate": 2.5775371627821623e-06, "loss": 0.323, "num_input_tokens_seen": 6037024, "step": 2810 }, { "epoch": 0.516608552027895, "grad_norm": 17.83196449279785, "learning_rate": 2.582125160579923e-06, "loss": 0.3518, "num_input_tokens_seen": 6048448, "step": 2815 }, { "epoch": 0.5175261515874472, "grad_norm": 11.076176643371582, "learning_rate": 2.5867131583776846e-06, "loss": 0.3305, "num_input_tokens_seen": 6057984, "step": 2820 }, { "epoch": 0.5184437511469995, "grad_norm": 18.991846084594727, "learning_rate": 2.591301156175445e-06, "loss": 0.4256, "num_input_tokens_seen": 6068160, "step": 2825 }, { "epoch": 0.5193613507065516, "grad_norm": 21.14695930480957, "learning_rate": 2.595889153973206e-06, "loss": 0.3457, "num_input_tokens_seen": 6078432, "step": 2830 }, { "epoch": 0.5202789502661038, "grad_norm": 18.32658576965332, "learning_rate": 2.6004771517709674e-06, "loss": 0.3163, "num_input_tokens_seen": 6088736, "step": 2835 }, { "epoch": 0.5211965498256561, "grad_norm": 12.353631973266602, "learning_rate": 2.6050651495687284e-06, "loss": 0.3, "num_input_tokens_seen": 6098944, "step": 2840 }, { "epoch": 0.5221141493852083, "grad_norm": 17.921480178833008, "learning_rate": 2.6096531473664893e-06, "loss": 0.3341, "num_input_tokens_seen": 6111072, "step": 2845 }, { "epoch": 0.5230317489447605, "grad_norm": 36.65262985229492, "learning_rate": 2.6142411451642507e-06, "loss": 0.3599, "num_input_tokens_seen": 6121120, "step": 2850 }, { "epoch": 0.5239493485043127, "grad_norm": 28.089086532592773, "learning_rate": 2.6188291429620116e-06, "loss": 0.3443, "num_input_tokens_seen": 6133248, "step": 2855 }, { "epoch": 0.5248669480638649, "grad_norm": 9.244318008422852, "learning_rate": 2.6234171407597725e-06, "loss": 0.3357, "num_input_tokens_seen": 6142848, "step": 2860 }, { "epoch": 0.5257845476234171, "grad_norm": 11.518836975097656, "learning_rate": 2.628005138557534e-06, "loss": 0.3704, "num_input_tokens_seen": 6154240, "step": 2865 }, { "epoch": 0.5267021471829694, "grad_norm": 15.720504760742188, "learning_rate": 2.632593136355295e-06, "loss": 0.3114, "num_input_tokens_seen": 6165824, "step": 2870 }, { "epoch": 0.5276197467425215, "grad_norm": 14.561087608337402, "learning_rate": 2.6371811341530558e-06, "loss": 0.4239, "num_input_tokens_seen": 6176416, "step": 2875 }, { "epoch": 0.5285373463020738, "grad_norm": 9.691047668457031, "learning_rate": 2.641769131950817e-06, "loss": 0.327, "num_input_tokens_seen": 6186848, "step": 2880 }, { "epoch": 0.529454945861626, "grad_norm": 11.036839485168457, "learning_rate": 2.646357129748578e-06, "loss": 0.3316, "num_input_tokens_seen": 6197856, "step": 2885 }, { "epoch": 0.5303725454211782, "grad_norm": 12.84501838684082, "learning_rate": 2.650945127546339e-06, "loss": 0.369, "num_input_tokens_seen": 6207232, "step": 2890 }, { "epoch": 0.5312901449807305, "grad_norm": 13.972488403320312, "learning_rate": 2.6555331253441004e-06, "loss": 0.3706, "num_input_tokens_seen": 6218496, "step": 2895 }, { "epoch": 0.5322077445402826, "grad_norm": 10.300612449645996, "learning_rate": 2.6601211231418613e-06, "loss": 0.3237, "num_input_tokens_seen": 6228832, "step": 2900 }, { "epoch": 0.5331253440998348, "grad_norm": 12.840726852416992, "learning_rate": 2.664709120939622e-06, "loss": 0.2643, "num_input_tokens_seen": 6239232, "step": 2905 }, { "epoch": 0.5340429436593871, "grad_norm": 13.852191925048828, "learning_rate": 2.669297118737383e-06, "loss": 0.3004, "num_input_tokens_seen": 6249920, "step": 2910 }, { "epoch": 0.5349605432189393, "grad_norm": 10.922554016113281, "learning_rate": 2.673885116535144e-06, "loss": 0.3042, "num_input_tokens_seen": 6260352, "step": 2915 }, { "epoch": 0.5358781427784914, "grad_norm": 13.42144775390625, "learning_rate": 2.678473114332905e-06, "loss": 0.4082, "num_input_tokens_seen": 6272128, "step": 2920 }, { "epoch": 0.5367957423380437, "grad_norm": 12.00441837310791, "learning_rate": 2.6830611121306664e-06, "loss": 0.3517, "num_input_tokens_seen": 6283168, "step": 2925 }, { "epoch": 0.5377133418975959, "grad_norm": 14.825648307800293, "learning_rate": 2.6876491099284274e-06, "loss": 0.3099, "num_input_tokens_seen": 6294016, "step": 2930 }, { "epoch": 0.5386309414571481, "grad_norm": 20.205570220947266, "learning_rate": 2.6922371077261883e-06, "loss": 0.2925, "num_input_tokens_seen": 6305504, "step": 2935 }, { "epoch": 0.5395485410167004, "grad_norm": 10.640286445617676, "learning_rate": 2.6968251055239497e-06, "loss": 0.3654, "num_input_tokens_seen": 6317280, "step": 2940 }, { "epoch": 0.5404661405762525, "grad_norm": 17.9730281829834, "learning_rate": 2.7014131033217106e-06, "loss": 0.2969, "num_input_tokens_seen": 6328960, "step": 2945 }, { "epoch": 0.5413837401358047, "grad_norm": 10.922622680664062, "learning_rate": 2.7060011011194715e-06, "loss": 0.3476, "num_input_tokens_seen": 6338720, "step": 2950 }, { "epoch": 0.542301339695357, "grad_norm": 13.41237735748291, "learning_rate": 2.710589098917233e-06, "loss": 0.3778, "num_input_tokens_seen": 6349984, "step": 2955 }, { "epoch": 0.5432189392549092, "grad_norm": 17.2983341217041, "learning_rate": 2.715177096714994e-06, "loss": 0.2819, "num_input_tokens_seen": 6360160, "step": 2960 }, { "epoch": 0.5441365388144613, "grad_norm": 18.197925567626953, "learning_rate": 2.7197650945127548e-06, "loss": 0.3249, "num_input_tokens_seen": 6370816, "step": 2965 }, { "epoch": 0.5450541383740136, "grad_norm": 20.254911422729492, "learning_rate": 2.724353092310516e-06, "loss": 0.3311, "num_input_tokens_seen": 6381632, "step": 2970 }, { "epoch": 0.5459717379335658, "grad_norm": 25.384536743164062, "learning_rate": 2.728941090108277e-06, "loss": 0.3326, "num_input_tokens_seen": 6393952, "step": 2975 }, { "epoch": 0.546889337493118, "grad_norm": 16.62976837158203, "learning_rate": 2.7335290879060376e-06, "loss": 0.3755, "num_input_tokens_seen": 6404288, "step": 2980 }, { "epoch": 0.5478069370526703, "grad_norm": 52.88230514526367, "learning_rate": 2.7381170857037994e-06, "loss": 0.4159, "num_input_tokens_seen": 6413536, "step": 2985 }, { "epoch": 0.5487245366122224, "grad_norm": 33.052486419677734, "learning_rate": 2.74270508350156e-06, "loss": 0.329, "num_input_tokens_seen": 6424992, "step": 2990 }, { "epoch": 0.5496421361717746, "grad_norm": 18.747480392456055, "learning_rate": 2.747293081299321e-06, "loss": 0.3646, "num_input_tokens_seen": 6435264, "step": 2995 }, { "epoch": 0.5505597357313269, "grad_norm": 12.113896369934082, "learning_rate": 2.751881079097082e-06, "loss": 0.3313, "num_input_tokens_seen": 6445408, "step": 3000 }, { "epoch": 0.5514773352908791, "grad_norm": 15.691314697265625, "learning_rate": 2.756469076894843e-06, "loss": 0.296, "num_input_tokens_seen": 6455072, "step": 3005 }, { "epoch": 0.5523949348504312, "grad_norm": 15.935409545898438, "learning_rate": 2.7610570746926045e-06, "loss": 0.3379, "num_input_tokens_seen": 6465632, "step": 3010 }, { "epoch": 0.5533125344099835, "grad_norm": 14.214043617248535, "learning_rate": 2.7656450724903654e-06, "loss": 0.3322, "num_input_tokens_seen": 6476960, "step": 3015 }, { "epoch": 0.5542301339695357, "grad_norm": 19.039770126342773, "learning_rate": 2.7702330702881264e-06, "loss": 0.343, "num_input_tokens_seen": 6488128, "step": 3020 }, { "epoch": 0.5551477335290879, "grad_norm": 18.3267822265625, "learning_rate": 2.7748210680858877e-06, "loss": 0.3778, "num_input_tokens_seen": 6499264, "step": 3025 }, { "epoch": 0.5560653330886401, "grad_norm": 12.214080810546875, "learning_rate": 2.7794090658836487e-06, "loss": 0.3099, "num_input_tokens_seen": 6509632, "step": 3030 }, { "epoch": 0.5569829326481923, "grad_norm": 16.855501174926758, "learning_rate": 2.7839970636814096e-06, "loss": 0.3491, "num_input_tokens_seen": 6520512, "step": 3035 }, { "epoch": 0.5579005322077445, "grad_norm": 14.794134140014648, "learning_rate": 2.788585061479171e-06, "loss": 0.2616, "num_input_tokens_seen": 6531040, "step": 3040 }, { "epoch": 0.5588181317672968, "grad_norm": 9.713869094848633, "learning_rate": 2.793173059276932e-06, "loss": 0.3607, "num_input_tokens_seen": 6542016, "step": 3045 }, { "epoch": 0.559735731326849, "grad_norm": 22.468523025512695, "learning_rate": 2.797761057074693e-06, "loss": 0.3484, "num_input_tokens_seen": 6552672, "step": 3050 }, { "epoch": 0.5606533308864011, "grad_norm": 23.36037826538086, "learning_rate": 2.8023490548724542e-06, "loss": 0.3563, "num_input_tokens_seen": 6562560, "step": 3055 }, { "epoch": 0.5615709304459534, "grad_norm": 10.984821319580078, "learning_rate": 2.806937052670215e-06, "loss": 0.3195, "num_input_tokens_seen": 6574528, "step": 3060 }, { "epoch": 0.5624885300055056, "grad_norm": 10.903881072998047, "learning_rate": 2.8115250504679757e-06, "loss": 0.2928, "num_input_tokens_seen": 6586240, "step": 3065 }, { "epoch": 0.5634061295650578, "grad_norm": 15.243195533752441, "learning_rate": 2.8161130482657375e-06, "loss": 0.3703, "num_input_tokens_seen": 6596448, "step": 3070 }, { "epoch": 0.56432372912461, "grad_norm": 23.24837303161621, "learning_rate": 2.820701046063498e-06, "loss": 0.2965, "num_input_tokens_seen": 6607872, "step": 3075 }, { "epoch": 0.5652413286841622, "grad_norm": 15.38476276397705, "learning_rate": 2.825289043861259e-06, "loss": 0.3487, "num_input_tokens_seen": 6618752, "step": 3080 }, { "epoch": 0.5661589282437144, "grad_norm": 14.14157772064209, "learning_rate": 2.8298770416590203e-06, "loss": 0.2588, "num_input_tokens_seen": 6630016, "step": 3085 }, { "epoch": 0.5670765278032667, "grad_norm": 18.543880462646484, "learning_rate": 2.834465039456781e-06, "loss": 0.3268, "num_input_tokens_seen": 6641664, "step": 3090 }, { "epoch": 0.5679941273628188, "grad_norm": 16.801694869995117, "learning_rate": 2.839053037254542e-06, "loss": 0.2546, "num_input_tokens_seen": 6653056, "step": 3095 }, { "epoch": 0.5689117269223711, "grad_norm": 30.127044677734375, "learning_rate": 2.8436410350523035e-06, "loss": 0.387, "num_input_tokens_seen": 6663872, "step": 3100 }, { "epoch": 0.5698293264819233, "grad_norm": 35.86362838745117, "learning_rate": 2.8482290328500644e-06, "loss": 0.3821, "num_input_tokens_seen": 6675200, "step": 3105 }, { "epoch": 0.5707469260414755, "grad_norm": 21.87876319885254, "learning_rate": 2.8528170306478254e-06, "loss": 0.3818, "num_input_tokens_seen": 6684544, "step": 3110 }, { "epoch": 0.5716645256010278, "grad_norm": 9.565210342407227, "learning_rate": 2.8574050284455867e-06, "loss": 0.3526, "num_input_tokens_seen": 6694208, "step": 3115 }, { "epoch": 0.5725821251605799, "grad_norm": 13.083508491516113, "learning_rate": 2.8619930262433477e-06, "loss": 0.3744, "num_input_tokens_seen": 6705536, "step": 3120 }, { "epoch": 0.5734997247201321, "grad_norm": 11.302468299865723, "learning_rate": 2.8665810240411086e-06, "loss": 0.3554, "num_input_tokens_seen": 6715552, "step": 3125 }, { "epoch": 0.5744173242796844, "grad_norm": 10.907852172851562, "learning_rate": 2.87116902183887e-06, "loss": 0.3402, "num_input_tokens_seen": 6725440, "step": 3130 }, { "epoch": 0.5753349238392366, "grad_norm": 40.762107849121094, "learning_rate": 2.875757019636631e-06, "loss": 0.4074, "num_input_tokens_seen": 6736768, "step": 3135 }, { "epoch": 0.5762525233987887, "grad_norm": 11.893350601196289, "learning_rate": 2.880345017434392e-06, "loss": 0.3553, "num_input_tokens_seen": 6747904, "step": 3140 }, { "epoch": 0.577170122958341, "grad_norm": 25.315723419189453, "learning_rate": 2.8849330152321532e-06, "loss": 0.3428, "num_input_tokens_seen": 6758624, "step": 3145 }, { "epoch": 0.5780877225178932, "grad_norm": 10.964978218078613, "learning_rate": 2.889521013029914e-06, "loss": 0.2922, "num_input_tokens_seen": 6768256, "step": 3150 }, { "epoch": 0.5790053220774454, "grad_norm": 22.466012954711914, "learning_rate": 2.8941090108276747e-06, "loss": 0.3232, "num_input_tokens_seen": 6777056, "step": 3155 }, { "epoch": 0.5799229216369977, "grad_norm": 12.966497421264648, "learning_rate": 2.898697008625436e-06, "loss": 0.3273, "num_input_tokens_seen": 6787552, "step": 3160 }, { "epoch": 0.5808405211965498, "grad_norm": 12.074576377868652, "learning_rate": 2.903285006423197e-06, "loss": 0.401, "num_input_tokens_seen": 6797728, "step": 3165 }, { "epoch": 0.581758120756102, "grad_norm": 10.890507698059082, "learning_rate": 2.907873004220958e-06, "loss": 0.3025, "num_input_tokens_seen": 6807584, "step": 3170 }, { "epoch": 0.5826757203156543, "grad_norm": 13.875943183898926, "learning_rate": 2.9124610020187193e-06, "loss": 0.357, "num_input_tokens_seen": 6818528, "step": 3175 }, { "epoch": 0.5835933198752065, "grad_norm": 16.446290969848633, "learning_rate": 2.91704899981648e-06, "loss": 0.3474, "num_input_tokens_seen": 6829504, "step": 3180 }, { "epoch": 0.5845109194347586, "grad_norm": 18.476327896118164, "learning_rate": 2.921636997614241e-06, "loss": 0.3336, "num_input_tokens_seen": 6840736, "step": 3185 }, { "epoch": 0.5854285189943109, "grad_norm": 11.009688377380371, "learning_rate": 2.9262249954120025e-06, "loss": 0.3185, "num_input_tokens_seen": 6851904, "step": 3190 }, { "epoch": 0.5863461185538631, "grad_norm": 10.868685722351074, "learning_rate": 2.9308129932097634e-06, "loss": 0.3972, "num_input_tokens_seen": 6863232, "step": 3195 }, { "epoch": 0.5872637181134153, "grad_norm": 6.034368991851807, "learning_rate": 2.9354009910075244e-06, "loss": 0.3495, "num_input_tokens_seen": 6874112, "step": 3200 }, { "epoch": 0.5881813176729676, "grad_norm": 13.97108268737793, "learning_rate": 2.9399889888052857e-06, "loss": 0.3607, "num_input_tokens_seen": 6883424, "step": 3205 }, { "epoch": 0.5890989172325197, "grad_norm": 9.793375015258789, "learning_rate": 2.9445769866030467e-06, "loss": 0.3147, "num_input_tokens_seen": 6893664, "step": 3210 }, { "epoch": 0.5900165167920719, "grad_norm": 9.261698722839355, "learning_rate": 2.9491649844008076e-06, "loss": 0.2958, "num_input_tokens_seen": 6905664, "step": 3215 }, { "epoch": 0.5909341163516242, "grad_norm": 24.777862548828125, "learning_rate": 2.953752982198569e-06, "loss": 0.3623, "num_input_tokens_seen": 6915840, "step": 3220 }, { "epoch": 0.5918517159111764, "grad_norm": 30.426870346069336, "learning_rate": 2.95834097999633e-06, "loss": 0.3853, "num_input_tokens_seen": 6927488, "step": 3225 }, { "epoch": 0.5927693154707285, "grad_norm": 10.506986618041992, "learning_rate": 2.9629289777940913e-06, "loss": 0.312, "num_input_tokens_seen": 6939136, "step": 3230 }, { "epoch": 0.5936869150302808, "grad_norm": 12.35336685180664, "learning_rate": 2.9675169755918522e-06, "loss": 0.3421, "num_input_tokens_seen": 6948672, "step": 3235 }, { "epoch": 0.594604514589833, "grad_norm": 13.616107940673828, "learning_rate": 2.9721049733896127e-06, "loss": 0.3298, "num_input_tokens_seen": 6960576, "step": 3240 }, { "epoch": 0.5955221141493852, "grad_norm": 11.691659927368164, "learning_rate": 2.9766929711873745e-06, "loss": 0.2693, "num_input_tokens_seen": 6971392, "step": 3245 }, { "epoch": 0.5964397137089374, "grad_norm": 12.295083999633789, "learning_rate": 2.981280968985135e-06, "loss": 0.3212, "num_input_tokens_seen": 6981120, "step": 3250 }, { "epoch": 0.5973573132684896, "grad_norm": 19.493854522705078, "learning_rate": 2.985868966782896e-06, "loss": 0.4025, "num_input_tokens_seen": 6991744, "step": 3255 }, { "epoch": 0.5982749128280418, "grad_norm": 31.741756439208984, "learning_rate": 2.9904569645806573e-06, "loss": 0.3215, "num_input_tokens_seen": 7002080, "step": 3260 }, { "epoch": 0.5991925123875941, "grad_norm": 15.696276664733887, "learning_rate": 2.9950449623784183e-06, "loss": 0.3788, "num_input_tokens_seen": 7012992, "step": 3265 }, { "epoch": 0.6001101119471463, "grad_norm": 28.411405563354492, "learning_rate": 2.9996329601761792e-06, "loss": 0.4073, "num_input_tokens_seen": 7023424, "step": 3270 }, { "epoch": 0.6010277115066984, "grad_norm": 10.756189346313477, "learning_rate": 3.0042209579739406e-06, "loss": 0.3027, "num_input_tokens_seen": 7033568, "step": 3275 }, { "epoch": 0.6019453110662507, "grad_norm": 12.056127548217773, "learning_rate": 3.0088089557717015e-06, "loss": 0.3641, "num_input_tokens_seen": 7045088, "step": 3280 }, { "epoch": 0.6028629106258029, "grad_norm": 22.567001342773438, "learning_rate": 3.0133969535694625e-06, "loss": 0.3084, "num_input_tokens_seen": 7055328, "step": 3285 }, { "epoch": 0.6037805101853551, "grad_norm": 7.634453773498535, "learning_rate": 3.017984951367224e-06, "loss": 0.296, "num_input_tokens_seen": 7066112, "step": 3290 }, { "epoch": 0.6046981097449073, "grad_norm": 14.40410041809082, "learning_rate": 3.0225729491649848e-06, "loss": 0.2947, "num_input_tokens_seen": 7077536, "step": 3295 }, { "epoch": 0.6056157093044595, "grad_norm": 22.29962730407715, "learning_rate": 3.0271609469627457e-06, "loss": 0.3092, "num_input_tokens_seen": 7088256, "step": 3300 }, { "epoch": 0.6065333088640118, "grad_norm": 13.350929260253906, "learning_rate": 3.031748944760507e-06, "loss": 0.3068, "num_input_tokens_seen": 7098368, "step": 3305 }, { "epoch": 0.607450908423564, "grad_norm": 10.746889114379883, "learning_rate": 3.036336942558268e-06, "loss": 0.4091, "num_input_tokens_seen": 7109632, "step": 3310 }, { "epoch": 0.6083685079831161, "grad_norm": 8.753594398498535, "learning_rate": 3.0409249403560285e-06, "loss": 0.3202, "num_input_tokens_seen": 7120608, "step": 3315 }, { "epoch": 0.6092861075426684, "grad_norm": 13.915252685546875, "learning_rate": 3.0455129381537903e-06, "loss": 0.3718, "num_input_tokens_seen": 7130496, "step": 3320 }, { "epoch": 0.6102037071022206, "grad_norm": 9.080474853515625, "learning_rate": 3.050100935951551e-06, "loss": 0.3609, "num_input_tokens_seen": 7138720, "step": 3325 }, { "epoch": 0.6111213066617728, "grad_norm": 11.497451782226562, "learning_rate": 3.0546889337493117e-06, "loss": 0.346, "num_input_tokens_seen": 7150688, "step": 3330 }, { "epoch": 0.6120389062213251, "grad_norm": 10.531758308410645, "learning_rate": 3.059276931547073e-06, "loss": 0.3179, "num_input_tokens_seen": 7160032, "step": 3335 }, { "epoch": 0.6129565057808772, "grad_norm": 11.256844520568848, "learning_rate": 3.063864929344834e-06, "loss": 0.3275, "num_input_tokens_seen": 7171776, "step": 3340 }, { "epoch": 0.6138741053404294, "grad_norm": 17.201412200927734, "learning_rate": 3.068452927142595e-06, "loss": 0.3585, "num_input_tokens_seen": 7184128, "step": 3345 }, { "epoch": 0.6147917048999817, "grad_norm": 10.937265396118164, "learning_rate": 3.0730409249403563e-06, "loss": 0.3431, "num_input_tokens_seen": 7194720, "step": 3350 }, { "epoch": 0.6157093044595339, "grad_norm": 16.20707893371582, "learning_rate": 3.0776289227381173e-06, "loss": 0.3825, "num_input_tokens_seen": 7204736, "step": 3355 }, { "epoch": 0.616626904019086, "grad_norm": 12.70119857788086, "learning_rate": 3.0822169205358782e-06, "loss": 0.3427, "num_input_tokens_seen": 7215584, "step": 3360 }, { "epoch": 0.6175445035786383, "grad_norm": 10.180986404418945, "learning_rate": 3.0868049183336396e-06, "loss": 0.2693, "num_input_tokens_seen": 7225536, "step": 3365 }, { "epoch": 0.6184621031381905, "grad_norm": 21.093936920166016, "learning_rate": 3.0913929161314005e-06, "loss": 0.3926, "num_input_tokens_seen": 7236960, "step": 3370 }, { "epoch": 0.6193797026977427, "grad_norm": 6.811307907104492, "learning_rate": 3.0959809139291615e-06, "loss": 0.3758, "num_input_tokens_seen": 7248288, "step": 3375 }, { "epoch": 0.620297302257295, "grad_norm": 5.9213714599609375, "learning_rate": 3.100568911726923e-06, "loss": 0.343, "num_input_tokens_seen": 7260192, "step": 3380 }, { "epoch": 0.6212149018168471, "grad_norm": 6.066280364990234, "learning_rate": 3.1051569095246838e-06, "loss": 0.319, "num_input_tokens_seen": 7272064, "step": 3385 }, { "epoch": 0.6221325013763993, "grad_norm": 8.126656532287598, "learning_rate": 3.1097449073224447e-06, "loss": 0.342, "num_input_tokens_seen": 7281824, "step": 3390 }, { "epoch": 0.6230501009359516, "grad_norm": 6.423999309539795, "learning_rate": 3.114332905120206e-06, "loss": 0.3964, "num_input_tokens_seen": 7291840, "step": 3395 }, { "epoch": 0.6239677004955038, "grad_norm": 10.695549964904785, "learning_rate": 3.1189209029179666e-06, "loss": 0.3131, "num_input_tokens_seen": 7301824, "step": 3400 }, { "epoch": 0.6248853000550559, "grad_norm": 15.257095336914062, "learning_rate": 3.1235089007157275e-06, "loss": 0.3305, "num_input_tokens_seen": 7311968, "step": 3405 }, { "epoch": 0.6258028996146082, "grad_norm": 9.360732078552246, "learning_rate": 3.128096898513489e-06, "loss": 0.2997, "num_input_tokens_seen": 7322848, "step": 3410 }, { "epoch": 0.6267204991741604, "grad_norm": 11.409748077392578, "learning_rate": 3.13268489631125e-06, "loss": 0.3085, "num_input_tokens_seen": 7333696, "step": 3415 }, { "epoch": 0.6276380987337126, "grad_norm": 27.427762985229492, "learning_rate": 3.1372728941090108e-06, "loss": 0.3947, "num_input_tokens_seen": 7345184, "step": 3420 }, { "epoch": 0.6285556982932649, "grad_norm": 15.523066520690918, "learning_rate": 3.141860891906772e-06, "loss": 0.3042, "num_input_tokens_seen": 7357280, "step": 3425 }, { "epoch": 0.629473297852817, "grad_norm": 16.528776168823242, "learning_rate": 3.146448889704533e-06, "loss": 0.2899, "num_input_tokens_seen": 7367808, "step": 3430 }, { "epoch": 0.6303908974123692, "grad_norm": 25.490217208862305, "learning_rate": 3.1510368875022944e-06, "loss": 0.2764, "num_input_tokens_seen": 7378880, "step": 3435 }, { "epoch": 0.6313084969719215, "grad_norm": 17.544099807739258, "learning_rate": 3.1556248853000554e-06, "loss": 0.3451, "num_input_tokens_seen": 7390304, "step": 3440 }, { "epoch": 0.6322260965314737, "grad_norm": 54.14759063720703, "learning_rate": 3.1602128830978163e-06, "loss": 0.4422, "num_input_tokens_seen": 7400512, "step": 3445 }, { "epoch": 0.6331436960910258, "grad_norm": 21.825040817260742, "learning_rate": 3.1648008808955777e-06, "loss": 0.4125, "num_input_tokens_seen": 7410816, "step": 3450 }, { "epoch": 0.6340612956505781, "grad_norm": 30.295141220092773, "learning_rate": 3.1693888786933386e-06, "loss": 0.3175, "num_input_tokens_seen": 7420640, "step": 3455 }, { "epoch": 0.6349788952101303, "grad_norm": 14.01533031463623, "learning_rate": 3.1739768764910995e-06, "loss": 0.4513, "num_input_tokens_seen": 7431840, "step": 3460 }, { "epoch": 0.6358964947696825, "grad_norm": 15.212451934814453, "learning_rate": 3.178564874288861e-06, "loss": 0.4271, "num_input_tokens_seen": 7441152, "step": 3465 }, { "epoch": 0.6368140943292347, "grad_norm": 9.572691917419434, "learning_rate": 3.183152872086622e-06, "loss": 0.3187, "num_input_tokens_seen": 7452064, "step": 3470 }, { "epoch": 0.6377316938887869, "grad_norm": 4.968668460845947, "learning_rate": 3.1877408698843828e-06, "loss": 0.2956, "num_input_tokens_seen": 7463136, "step": 3475 }, { "epoch": 0.6386492934483391, "grad_norm": 7.752847194671631, "learning_rate": 3.192328867682144e-06, "loss": 0.3743, "num_input_tokens_seen": 7474656, "step": 3480 }, { "epoch": 0.6395668930078914, "grad_norm": 29.809284210205078, "learning_rate": 3.196916865479905e-06, "loss": 0.4985, "num_input_tokens_seen": 7486496, "step": 3485 }, { "epoch": 0.6404844925674436, "grad_norm": 11.199283599853516, "learning_rate": 3.2015048632776656e-06, "loss": 0.2905, "num_input_tokens_seen": 7496512, "step": 3490 }, { "epoch": 0.6414020921269957, "grad_norm": 6.672518253326416, "learning_rate": 3.2060928610754274e-06, "loss": 0.3513, "num_input_tokens_seen": 7504704, "step": 3495 }, { "epoch": 0.642319691686548, "grad_norm": 8.912283897399902, "learning_rate": 3.210680858873188e-06, "loss": 0.3024, "num_input_tokens_seen": 7515488, "step": 3500 }, { "epoch": 0.6432372912461002, "grad_norm": 10.164663314819336, "learning_rate": 3.215268856670949e-06, "loss": 0.3031, "num_input_tokens_seen": 7526944, "step": 3505 }, { "epoch": 0.6441548908056524, "grad_norm": 13.537016868591309, "learning_rate": 3.21985685446871e-06, "loss": 0.4236, "num_input_tokens_seen": 7537120, "step": 3510 }, { "epoch": 0.6450724903652046, "grad_norm": 19.413137435913086, "learning_rate": 3.224444852266471e-06, "loss": 0.3504, "num_input_tokens_seen": 7548160, "step": 3515 }, { "epoch": 0.6459900899247568, "grad_norm": 9.287425994873047, "learning_rate": 3.229032850064232e-06, "loss": 0.3428, "num_input_tokens_seen": 7559424, "step": 3520 }, { "epoch": 0.6469076894843091, "grad_norm": 6.945512771606445, "learning_rate": 3.2336208478619934e-06, "loss": 0.3851, "num_input_tokens_seen": 7570496, "step": 3525 }, { "epoch": 0.6478252890438613, "grad_norm": 12.814559936523438, "learning_rate": 3.2382088456597544e-06, "loss": 0.2248, "num_input_tokens_seen": 7581888, "step": 3530 }, { "epoch": 0.6487428886034134, "grad_norm": 13.394281387329102, "learning_rate": 3.2427968434575153e-06, "loss": 0.3201, "num_input_tokens_seen": 7592992, "step": 3535 }, { "epoch": 0.6496604881629657, "grad_norm": 21.832496643066406, "learning_rate": 3.2473848412552767e-06, "loss": 0.3077, "num_input_tokens_seen": 7604320, "step": 3540 }, { "epoch": 0.6505780877225179, "grad_norm": 13.282804489135742, "learning_rate": 3.2519728390530376e-06, "loss": 0.3004, "num_input_tokens_seen": 7616224, "step": 3545 }, { "epoch": 0.6514956872820701, "grad_norm": 67.87030029296875, "learning_rate": 3.2565608368507985e-06, "loss": 0.4033, "num_input_tokens_seen": 7626208, "step": 3550 }, { "epoch": 0.6524132868416224, "grad_norm": 53.77069091796875, "learning_rate": 3.26114883464856e-06, "loss": 0.3631, "num_input_tokens_seen": 7637152, "step": 3555 }, { "epoch": 0.6533308864011745, "grad_norm": 21.40153694152832, "learning_rate": 3.265736832446321e-06, "loss": 0.3563, "num_input_tokens_seen": 7647520, "step": 3560 }, { "epoch": 0.6542484859607267, "grad_norm": 11.463028907775879, "learning_rate": 3.2703248302440814e-06, "loss": 0.3389, "num_input_tokens_seen": 7659136, "step": 3565 }, { "epoch": 0.655166085520279, "grad_norm": 13.553479194641113, "learning_rate": 3.274912828041843e-06, "loss": 0.3453, "num_input_tokens_seen": 7671360, "step": 3570 }, { "epoch": 0.6560836850798312, "grad_norm": 7.885971546173096, "learning_rate": 3.2795008258396037e-06, "loss": 0.2354, "num_input_tokens_seen": 7682272, "step": 3575 }, { "epoch": 0.6570012846393833, "grad_norm": 23.46184730529785, "learning_rate": 3.2840888236373646e-06, "loss": 0.2804, "num_input_tokens_seen": 7692896, "step": 3580 }, { "epoch": 0.6579188841989356, "grad_norm": 16.61871910095215, "learning_rate": 3.288676821435126e-06, "loss": 0.2808, "num_input_tokens_seen": 7703616, "step": 3585 }, { "epoch": 0.6588364837584878, "grad_norm": 18.867115020751953, "learning_rate": 3.293264819232887e-06, "loss": 0.3413, "num_input_tokens_seen": 7713760, "step": 3590 }, { "epoch": 0.65975408331804, "grad_norm": 24.459613800048828, "learning_rate": 3.297852817030648e-06, "loss": 0.4128, "num_input_tokens_seen": 7724512, "step": 3595 }, { "epoch": 0.6606716828775923, "grad_norm": 26.88646125793457, "learning_rate": 3.302440814828409e-06, "loss": 0.4257, "num_input_tokens_seen": 7734496, "step": 3600 }, { "epoch": 0.6615892824371444, "grad_norm": 11.06870174407959, "learning_rate": 3.30702881262617e-06, "loss": 0.364, "num_input_tokens_seen": 7745216, "step": 3605 }, { "epoch": 0.6625068819966966, "grad_norm": 14.95732593536377, "learning_rate": 3.311616810423931e-06, "loss": 0.2679, "num_input_tokens_seen": 7755200, "step": 3610 }, { "epoch": 0.6634244815562489, "grad_norm": 16.998117446899414, "learning_rate": 3.3162048082216924e-06, "loss": 0.2648, "num_input_tokens_seen": 7767296, "step": 3615 }, { "epoch": 0.6643420811158011, "grad_norm": 18.618532180786133, "learning_rate": 3.3207928060194534e-06, "loss": 0.2582, "num_input_tokens_seen": 7777984, "step": 3620 }, { "epoch": 0.6652596806753532, "grad_norm": 9.07779598236084, "learning_rate": 3.3253808038172143e-06, "loss": 0.4218, "num_input_tokens_seen": 7788800, "step": 3625 }, { "epoch": 0.6661772802349055, "grad_norm": 14.040912628173828, "learning_rate": 3.3299688016149757e-06, "loss": 0.3122, "num_input_tokens_seen": 7799104, "step": 3630 }, { "epoch": 0.6670948797944577, "grad_norm": 39.3931999206543, "learning_rate": 3.3345567994127366e-06, "loss": 0.4639, "num_input_tokens_seen": 7809216, "step": 3635 }, { "epoch": 0.6680124793540099, "grad_norm": 19.849178314208984, "learning_rate": 3.3391447972104975e-06, "loss": 0.2468, "num_input_tokens_seen": 7821184, "step": 3640 }, { "epoch": 0.6689300789135622, "grad_norm": 7.692641735076904, "learning_rate": 3.343732795008259e-06, "loss": 0.3926, "num_input_tokens_seen": 7830432, "step": 3645 }, { "epoch": 0.6698476784731143, "grad_norm": 18.97268295288086, "learning_rate": 3.3483207928060194e-06, "loss": 0.3566, "num_input_tokens_seen": 7841984, "step": 3650 }, { "epoch": 0.6707652780326665, "grad_norm": 13.590991020202637, "learning_rate": 3.352908790603781e-06, "loss": 0.2801, "num_input_tokens_seen": 7853440, "step": 3655 }, { "epoch": 0.6716828775922188, "grad_norm": 17.544160842895508, "learning_rate": 3.3574967884015417e-06, "loss": 0.358, "num_input_tokens_seen": 7864032, "step": 3660 }, { "epoch": 0.672600477151771, "grad_norm": 21.741376876831055, "learning_rate": 3.3620847861993027e-06, "loss": 0.3048, "num_input_tokens_seen": 7876320, "step": 3665 }, { "epoch": 0.6735180767113231, "grad_norm": 8.635332107543945, "learning_rate": 3.366672783997064e-06, "loss": 0.3413, "num_input_tokens_seen": 7887264, "step": 3670 }, { "epoch": 0.6744356762708754, "grad_norm": 14.97873592376709, "learning_rate": 3.371260781794825e-06, "loss": 0.2793, "num_input_tokens_seen": 7898240, "step": 3675 }, { "epoch": 0.6753532758304276, "grad_norm": 17.519439697265625, "learning_rate": 3.375848779592586e-06, "loss": 0.327, "num_input_tokens_seen": 7909920, "step": 3680 }, { "epoch": 0.6762708753899798, "grad_norm": 35.894466400146484, "learning_rate": 3.3804367773903473e-06, "loss": 0.3598, "num_input_tokens_seen": 7920288, "step": 3685 }, { "epoch": 0.677188474949532, "grad_norm": 12.784793853759766, "learning_rate": 3.385024775188108e-06, "loss": 0.336, "num_input_tokens_seen": 7931936, "step": 3690 }, { "epoch": 0.6781060745090842, "grad_norm": 16.498929977416992, "learning_rate": 3.389612772985869e-06, "loss": 0.3503, "num_input_tokens_seen": 7942880, "step": 3695 }, { "epoch": 0.6790236740686364, "grad_norm": 24.152957916259766, "learning_rate": 3.3942007707836305e-06, "loss": 0.3753, "num_input_tokens_seen": 7952608, "step": 3700 }, { "epoch": 0.6799412736281887, "grad_norm": 7.797973155975342, "learning_rate": 3.3987887685813914e-06, "loss": 0.2927, "num_input_tokens_seen": 7962016, "step": 3705 }, { "epoch": 0.6808588731877409, "grad_norm": 8.021470069885254, "learning_rate": 3.4033767663791524e-06, "loss": 0.397, "num_input_tokens_seen": 7974176, "step": 3710 }, { "epoch": 0.681776472747293, "grad_norm": 16.89824867248535, "learning_rate": 3.4079647641769137e-06, "loss": 0.3712, "num_input_tokens_seen": 7984768, "step": 3715 }, { "epoch": 0.6826940723068453, "grad_norm": 18.629140853881836, "learning_rate": 3.4125527619746747e-06, "loss": 0.3759, "num_input_tokens_seen": 7995392, "step": 3720 }, { "epoch": 0.6836116718663975, "grad_norm": 12.196891784667969, "learning_rate": 3.4171407597724356e-06, "loss": 0.2842, "num_input_tokens_seen": 8007168, "step": 3725 }, { "epoch": 0.6845292714259498, "grad_norm": 11.629692077636719, "learning_rate": 3.421728757570197e-06, "loss": 0.3335, "num_input_tokens_seen": 8016832, "step": 3730 }, { "epoch": 0.6854468709855019, "grad_norm": 8.611486434936523, "learning_rate": 3.426316755367958e-06, "loss": 0.3166, "num_input_tokens_seen": 8026144, "step": 3735 }, { "epoch": 0.6863644705450541, "grad_norm": 17.130172729492188, "learning_rate": 3.4309047531657184e-06, "loss": 0.3125, "num_input_tokens_seen": 8035776, "step": 3740 }, { "epoch": 0.6872820701046064, "grad_norm": 11.666013717651367, "learning_rate": 3.4354927509634802e-06, "loss": 0.3261, "num_input_tokens_seen": 8044512, "step": 3745 }, { "epoch": 0.6881996696641586, "grad_norm": 10.526180267333984, "learning_rate": 3.4400807487612407e-06, "loss": 0.2537, "num_input_tokens_seen": 8055584, "step": 3750 }, { "epoch": 0.6891172692237107, "grad_norm": 14.48332691192627, "learning_rate": 3.4446687465590017e-06, "loss": 0.3304, "num_input_tokens_seen": 8067264, "step": 3755 }, { "epoch": 0.690034868783263, "grad_norm": 15.18975830078125, "learning_rate": 3.449256744356763e-06, "loss": 0.3834, "num_input_tokens_seen": 8077568, "step": 3760 }, { "epoch": 0.6909524683428152, "grad_norm": 9.47579288482666, "learning_rate": 3.453844742154524e-06, "loss": 0.3399, "num_input_tokens_seen": 8089920, "step": 3765 }, { "epoch": 0.6918700679023674, "grad_norm": 14.351473808288574, "learning_rate": 3.458432739952285e-06, "loss": 0.3809, "num_input_tokens_seen": 8101216, "step": 3770 }, { "epoch": 0.6927876674619197, "grad_norm": 10.573907852172852, "learning_rate": 3.4630207377500463e-06, "loss": 0.3661, "num_input_tokens_seen": 8111232, "step": 3775 }, { "epoch": 0.6937052670214718, "grad_norm": 13.326565742492676, "learning_rate": 3.467608735547807e-06, "loss": 0.3443, "num_input_tokens_seen": 8122880, "step": 3780 }, { "epoch": 0.694622866581024, "grad_norm": 15.770484924316406, "learning_rate": 3.472196733345568e-06, "loss": 0.3105, "num_input_tokens_seen": 8132128, "step": 3785 }, { "epoch": 0.6955404661405763, "grad_norm": 9.595698356628418, "learning_rate": 3.4767847311433295e-06, "loss": 0.3134, "num_input_tokens_seen": 8142784, "step": 3790 }, { "epoch": 0.6964580657001285, "grad_norm": 8.640095710754395, "learning_rate": 3.4813727289410904e-06, "loss": 0.3077, "num_input_tokens_seen": 8153152, "step": 3795 }, { "epoch": 0.6973756652596806, "grad_norm": 5.484158515930176, "learning_rate": 3.4859607267388514e-06, "loss": 0.2859, "num_input_tokens_seen": 8163488, "step": 3800 }, { "epoch": 0.6982932648192329, "grad_norm": 12.95715045928955, "learning_rate": 3.4905487245366127e-06, "loss": 0.3358, "num_input_tokens_seen": 8175040, "step": 3805 }, { "epoch": 0.6992108643787851, "grad_norm": 14.936224937438965, "learning_rate": 3.4951367223343737e-06, "loss": 0.2924, "num_input_tokens_seen": 8184736, "step": 3810 }, { "epoch": 0.7001284639383373, "grad_norm": 8.561795234680176, "learning_rate": 3.499724720132134e-06, "loss": 0.4875, "num_input_tokens_seen": 8194560, "step": 3815 }, { "epoch": 0.7010460634978896, "grad_norm": 9.887624740600586, "learning_rate": 3.504312717929896e-06, "loss": 0.3262, "num_input_tokens_seen": 8205344, "step": 3820 }, { "epoch": 0.7019636630574417, "grad_norm": 20.988248825073242, "learning_rate": 3.5089007157276565e-06, "loss": 0.4079, "num_input_tokens_seen": 8216992, "step": 3825 }, { "epoch": 0.7028812626169939, "grad_norm": 8.72948169708252, "learning_rate": 3.5134887135254174e-06, "loss": 0.3504, "num_input_tokens_seen": 8227424, "step": 3830 }, { "epoch": 0.7037988621765462, "grad_norm": 5.756874084472656, "learning_rate": 3.518076711323179e-06, "loss": 0.325, "num_input_tokens_seen": 8237664, "step": 3835 }, { "epoch": 0.7047164617360984, "grad_norm": 6.633523464202881, "learning_rate": 3.5226647091209397e-06, "loss": 0.2825, "num_input_tokens_seen": 8248384, "step": 3840 }, { "epoch": 0.7056340612956505, "grad_norm": 9.538445472717285, "learning_rate": 3.5272527069187007e-06, "loss": 0.3169, "num_input_tokens_seen": 8258624, "step": 3845 }, { "epoch": 0.7065516608552028, "grad_norm": 8.209423065185547, "learning_rate": 3.531840704716462e-06, "loss": 0.334, "num_input_tokens_seen": 8269760, "step": 3850 }, { "epoch": 0.707469260414755, "grad_norm": 11.845721244812012, "learning_rate": 3.536428702514223e-06, "loss": 0.3497, "num_input_tokens_seen": 8280512, "step": 3855 }, { "epoch": 0.7083868599743072, "grad_norm": 38.00782012939453, "learning_rate": 3.5410167003119843e-06, "loss": 0.3588, "num_input_tokens_seen": 8291040, "step": 3860 }, { "epoch": 0.7093044595338595, "grad_norm": 9.527670860290527, "learning_rate": 3.5456046981097453e-06, "loss": 0.4506, "num_input_tokens_seen": 8301248, "step": 3865 }, { "epoch": 0.7102220590934116, "grad_norm": 7.12551736831665, "learning_rate": 3.5501926959075062e-06, "loss": 0.3885, "num_input_tokens_seen": 8312576, "step": 3870 }, { "epoch": 0.7111396586529638, "grad_norm": 9.65181827545166, "learning_rate": 3.5547806937052676e-06, "loss": 0.3348, "num_input_tokens_seen": 8323584, "step": 3875 }, { "epoch": 0.7120572582125161, "grad_norm": 8.162137031555176, "learning_rate": 3.5593686915030285e-06, "loss": 0.3724, "num_input_tokens_seen": 8335296, "step": 3880 }, { "epoch": 0.7129748577720683, "grad_norm": 13.99979305267334, "learning_rate": 3.5639566893007895e-06, "loss": 0.3591, "num_input_tokens_seen": 8346944, "step": 3885 }, { "epoch": 0.7138924573316204, "grad_norm": 18.503915786743164, "learning_rate": 3.568544687098551e-06, "loss": 0.3193, "num_input_tokens_seen": 8356928, "step": 3890 }, { "epoch": 0.7148100568911727, "grad_norm": 12.05429744720459, "learning_rate": 3.5731326848963118e-06, "loss": 0.238, "num_input_tokens_seen": 8367008, "step": 3895 }, { "epoch": 0.7157276564507249, "grad_norm": 25.539701461791992, "learning_rate": 3.5777206826940723e-06, "loss": 0.4501, "num_input_tokens_seen": 8379200, "step": 3900 }, { "epoch": 0.7166452560102771, "grad_norm": 31.78485107421875, "learning_rate": 3.582308680491834e-06, "loss": 0.4257, "num_input_tokens_seen": 8390464, "step": 3905 }, { "epoch": 0.7175628555698293, "grad_norm": 6.7009406089782715, "learning_rate": 3.5868966782895946e-06, "loss": 0.2549, "num_input_tokens_seen": 8401952, "step": 3910 }, { "epoch": 0.7184804551293815, "grad_norm": 11.069293975830078, "learning_rate": 3.5914846760873555e-06, "loss": 0.3234, "num_input_tokens_seen": 8412672, "step": 3915 }, { "epoch": 0.7193980546889337, "grad_norm": 6.560601711273193, "learning_rate": 3.596072673885117e-06, "loss": 0.348, "num_input_tokens_seen": 8424160, "step": 3920 }, { "epoch": 0.720315654248486, "grad_norm": 11.549817085266113, "learning_rate": 3.600660671682878e-06, "loss": 0.3525, "num_input_tokens_seen": 8433792, "step": 3925 }, { "epoch": 0.7212332538080382, "grad_norm": 8.740411758422852, "learning_rate": 3.6052486694806387e-06, "loss": 0.3981, "num_input_tokens_seen": 8444320, "step": 3930 }, { "epoch": 0.7221508533675903, "grad_norm": 7.640132427215576, "learning_rate": 3.6098366672784e-06, "loss": 0.2795, "num_input_tokens_seen": 8456448, "step": 3935 }, { "epoch": 0.7230684529271426, "grad_norm": 29.824844360351562, "learning_rate": 3.614424665076161e-06, "loss": 0.4481, "num_input_tokens_seen": 8467840, "step": 3940 }, { "epoch": 0.7239860524866948, "grad_norm": 6.042175769805908, "learning_rate": 3.619012662873922e-06, "loss": 0.3647, "num_input_tokens_seen": 8477984, "step": 3945 }, { "epoch": 0.7249036520462471, "grad_norm": 19.180742263793945, "learning_rate": 3.6236006606716833e-06, "loss": 0.3953, "num_input_tokens_seen": 8489568, "step": 3950 }, { "epoch": 0.7258212516057992, "grad_norm": 8.277606964111328, "learning_rate": 3.6281886584694443e-06, "loss": 0.304, "num_input_tokens_seen": 8500000, "step": 3955 }, { "epoch": 0.7267388511653514, "grad_norm": 9.521469116210938, "learning_rate": 3.6327766562672052e-06, "loss": 0.2637, "num_input_tokens_seen": 8511072, "step": 3960 }, { "epoch": 0.7276564507249037, "grad_norm": 10.251713752746582, "learning_rate": 3.6373646540649666e-06, "loss": 0.3372, "num_input_tokens_seen": 8521792, "step": 3965 }, { "epoch": 0.7285740502844559, "grad_norm": 21.882675170898438, "learning_rate": 3.6419526518627275e-06, "loss": 0.2492, "num_input_tokens_seen": 8533760, "step": 3970 }, { "epoch": 0.729491649844008, "grad_norm": 16.558637619018555, "learning_rate": 3.6465406496604885e-06, "loss": 0.4289, "num_input_tokens_seen": 8543520, "step": 3975 }, { "epoch": 0.7304092494035603, "grad_norm": 9.318933486938477, "learning_rate": 3.65112864745825e-06, "loss": 0.2577, "num_input_tokens_seen": 8554272, "step": 3980 }, { "epoch": 0.7313268489631125, "grad_norm": 31.706932067871094, "learning_rate": 3.6557166452560108e-06, "loss": 0.3805, "num_input_tokens_seen": 8565984, "step": 3985 }, { "epoch": 0.7322444485226647, "grad_norm": 10.925561904907227, "learning_rate": 3.6603046430537713e-06, "loss": 0.2701, "num_input_tokens_seen": 8576224, "step": 3990 }, { "epoch": 0.733162048082217, "grad_norm": 23.270200729370117, "learning_rate": 3.6648926408515326e-06, "loss": 0.3743, "num_input_tokens_seen": 8588032, "step": 3995 }, { "epoch": 0.7340796476417691, "grad_norm": 12.72998046875, "learning_rate": 3.6694806386492936e-06, "loss": 0.3672, "num_input_tokens_seen": 8597760, "step": 4000 }, { "epoch": 0.7349972472013213, "grad_norm": 8.550434112548828, "learning_rate": 3.6740686364470545e-06, "loss": 0.3074, "num_input_tokens_seen": 8609344, "step": 4005 }, { "epoch": 0.7359148467608736, "grad_norm": 30.339189529418945, "learning_rate": 3.678656634244816e-06, "loss": 0.3865, "num_input_tokens_seen": 8620672, "step": 4010 }, { "epoch": 0.7368324463204258, "grad_norm": 8.360519409179688, "learning_rate": 3.683244632042577e-06, "loss": 0.3126, "num_input_tokens_seen": 8631936, "step": 4015 }, { "epoch": 0.737750045879978, "grad_norm": 20.429977416992188, "learning_rate": 3.6878326298403378e-06, "loss": 0.3593, "num_input_tokens_seen": 8643168, "step": 4020 }, { "epoch": 0.7386676454395302, "grad_norm": 15.549233436584473, "learning_rate": 3.692420627638099e-06, "loss": 0.3594, "num_input_tokens_seen": 8654272, "step": 4025 }, { "epoch": 0.7395852449990824, "grad_norm": 12.084697723388672, "learning_rate": 3.69700862543586e-06, "loss": 0.3347, "num_input_tokens_seen": 8665152, "step": 4030 }, { "epoch": 0.7405028445586346, "grad_norm": 5.5375590324401855, "learning_rate": 3.701596623233621e-06, "loss": 0.3237, "num_input_tokens_seen": 8675904, "step": 4035 }, { "epoch": 0.7414204441181869, "grad_norm": 17.07838249206543, "learning_rate": 3.7061846210313824e-06, "loss": 0.3583, "num_input_tokens_seen": 8687296, "step": 4040 }, { "epoch": 0.742338043677739, "grad_norm": 5.476425647735596, "learning_rate": 3.7107726188291433e-06, "loss": 0.3343, "num_input_tokens_seen": 8699040, "step": 4045 }, { "epoch": 0.7432556432372912, "grad_norm": 4.964951038360596, "learning_rate": 3.7153606166269042e-06, "loss": 0.3307, "num_input_tokens_seen": 8710272, "step": 4050 }, { "epoch": 0.7441732427968435, "grad_norm": 7.226194381713867, "learning_rate": 3.7199486144246656e-06, "loss": 0.3476, "num_input_tokens_seen": 8721184, "step": 4055 }, { "epoch": 0.7450908423563957, "grad_norm": 6.174010753631592, "learning_rate": 3.7245366122224265e-06, "loss": 0.3594, "num_input_tokens_seen": 8731456, "step": 4060 }, { "epoch": 0.7460084419159478, "grad_norm": 4.6198344230651855, "learning_rate": 3.729124610020187e-06, "loss": 0.2871, "num_input_tokens_seen": 8741632, "step": 4065 }, { "epoch": 0.7469260414755001, "grad_norm": 6.090936183929443, "learning_rate": 3.733712607817949e-06, "loss": 0.3005, "num_input_tokens_seen": 8752640, "step": 4070 }, { "epoch": 0.7478436410350523, "grad_norm": 5.331394195556641, "learning_rate": 3.7383006056157093e-06, "loss": 0.344, "num_input_tokens_seen": 8761888, "step": 4075 }, { "epoch": 0.7487612405946045, "grad_norm": 6.012934684753418, "learning_rate": 3.742888603413471e-06, "loss": 0.3326, "num_input_tokens_seen": 8772960, "step": 4080 }, { "epoch": 0.7496788401541568, "grad_norm": 8.267065048217773, "learning_rate": 3.7474766012112316e-06, "loss": 0.3175, "num_input_tokens_seen": 8783296, "step": 4085 }, { "epoch": 0.7505964397137089, "grad_norm": 18.681941986083984, "learning_rate": 3.7520645990089926e-06, "loss": 0.3161, "num_input_tokens_seen": 8794272, "step": 4090 }, { "epoch": 0.7515140392732611, "grad_norm": 15.349081993103027, "learning_rate": 3.756652596806754e-06, "loss": 0.366, "num_input_tokens_seen": 8805824, "step": 4095 }, { "epoch": 0.7524316388328134, "grad_norm": 8.713051795959473, "learning_rate": 3.761240594604515e-06, "loss": 0.3175, "num_input_tokens_seen": 8816000, "step": 4100 }, { "epoch": 0.7533492383923656, "grad_norm": 7.281641006469727, "learning_rate": 3.765828592402276e-06, "loss": 0.3724, "num_input_tokens_seen": 8827776, "step": 4105 }, { "epoch": 0.7542668379519177, "grad_norm": 19.74154281616211, "learning_rate": 3.770416590200037e-06, "loss": 0.3615, "num_input_tokens_seen": 8838656, "step": 4110 }, { "epoch": 0.75518443751147, "grad_norm": 14.584485054016113, "learning_rate": 3.775004587997798e-06, "loss": 0.3269, "num_input_tokens_seen": 8849056, "step": 4115 }, { "epoch": 0.7561020370710222, "grad_norm": 15.40415096282959, "learning_rate": 3.779592585795559e-06, "loss": 0.3209, "num_input_tokens_seen": 8860768, "step": 4120 }, { "epoch": 0.7570196366305744, "grad_norm": 10.430819511413574, "learning_rate": 3.7841805835933204e-06, "loss": 0.3181, "num_input_tokens_seen": 8871776, "step": 4125 }, { "epoch": 0.7579372361901267, "grad_norm": 12.612916946411133, "learning_rate": 3.7887685813910814e-06, "loss": 0.3582, "num_input_tokens_seen": 8883200, "step": 4130 }, { "epoch": 0.7588548357496788, "grad_norm": 11.373978614807129, "learning_rate": 3.7933565791888423e-06, "loss": 0.3439, "num_input_tokens_seen": 8892672, "step": 4135 }, { "epoch": 0.759772435309231, "grad_norm": 10.6359281539917, "learning_rate": 3.7979445769866037e-06, "loss": 0.3381, "num_input_tokens_seen": 8902432, "step": 4140 }, { "epoch": 0.7606900348687833, "grad_norm": 8.929352760314941, "learning_rate": 3.8025325747843646e-06, "loss": 0.2694, "num_input_tokens_seen": 8914400, "step": 4145 }, { "epoch": 0.7616076344283355, "grad_norm": 6.55936336517334, "learning_rate": 3.807120572582125e-06, "loss": 0.4075, "num_input_tokens_seen": 8924704, "step": 4150 }, { "epoch": 0.7625252339878877, "grad_norm": 7.751646518707275, "learning_rate": 3.811708570379887e-06, "loss": 0.3094, "num_input_tokens_seen": 8934848, "step": 4155 }, { "epoch": 0.7634428335474399, "grad_norm": 13.904364585876465, "learning_rate": 3.816296568177648e-06, "loss": 0.4317, "num_input_tokens_seen": 8946144, "step": 4160 }, { "epoch": 0.7643604331069921, "grad_norm": 6.95390510559082, "learning_rate": 3.820884565975408e-06, "loss": 0.3065, "num_input_tokens_seen": 8956768, "step": 4165 }, { "epoch": 0.7652780326665444, "grad_norm": 7.381718158721924, "learning_rate": 3.82547256377317e-06, "loss": 0.3684, "num_input_tokens_seen": 8967616, "step": 4170 }, { "epoch": 0.7661956322260965, "grad_norm": 7.28955602645874, "learning_rate": 3.830060561570931e-06, "loss": 0.2944, "num_input_tokens_seen": 8978816, "step": 4175 }, { "epoch": 0.7671132317856487, "grad_norm": 7.7892937660217285, "learning_rate": 3.834648559368692e-06, "loss": 0.3518, "num_input_tokens_seen": 8990784, "step": 4180 }, { "epoch": 0.768030831345201, "grad_norm": 6.7761945724487305, "learning_rate": 3.839236557166453e-06, "loss": 0.3628, "num_input_tokens_seen": 9001408, "step": 4185 }, { "epoch": 0.7689484309047532, "grad_norm": 13.329741477966309, "learning_rate": 3.843824554964214e-06, "loss": 0.3101, "num_input_tokens_seen": 9011840, "step": 4190 }, { "epoch": 0.7698660304643054, "grad_norm": 19.483470916748047, "learning_rate": 3.848412552761975e-06, "loss": 0.2852, "num_input_tokens_seen": 9020896, "step": 4195 }, { "epoch": 0.7707836300238576, "grad_norm": 16.958330154418945, "learning_rate": 3.853000550559736e-06, "loss": 0.3693, "num_input_tokens_seen": 9032352, "step": 4200 }, { "epoch": 0.7717012295834098, "grad_norm": 27.00396728515625, "learning_rate": 3.857588548357497e-06, "loss": 0.4344, "num_input_tokens_seen": 9043488, "step": 4205 }, { "epoch": 0.772618829142962, "grad_norm": 15.300982475280762, "learning_rate": 3.862176546155258e-06, "loss": 0.3538, "num_input_tokens_seen": 9054592, "step": 4210 }, { "epoch": 0.7735364287025143, "grad_norm": 18.864620208740234, "learning_rate": 3.8667645439530194e-06, "loss": 0.2907, "num_input_tokens_seen": 9066112, "step": 4215 }, { "epoch": 0.7744540282620664, "grad_norm": 30.568273544311523, "learning_rate": 3.87135254175078e-06, "loss": 0.3816, "num_input_tokens_seen": 9078432, "step": 4220 }, { "epoch": 0.7753716278216186, "grad_norm": 14.351422309875488, "learning_rate": 3.875940539548541e-06, "loss": 0.3993, "num_input_tokens_seen": 9089504, "step": 4225 }, { "epoch": 0.7762892273811709, "grad_norm": 7.643891334533691, "learning_rate": 3.880528537346303e-06, "loss": 0.3011, "num_input_tokens_seen": 9101152, "step": 4230 }, { "epoch": 0.7772068269407231, "grad_norm": 6.007418155670166, "learning_rate": 3.885116535144063e-06, "loss": 0.3394, "num_input_tokens_seen": 9112096, "step": 4235 }, { "epoch": 0.7781244265002752, "grad_norm": 14.00527286529541, "learning_rate": 3.8897045329418245e-06, "loss": 0.2798, "num_input_tokens_seen": 9122784, "step": 4240 }, { "epoch": 0.7790420260598275, "grad_norm": 6.884554862976074, "learning_rate": 3.894292530739586e-06, "loss": 0.3569, "num_input_tokens_seen": 9134208, "step": 4245 }, { "epoch": 0.7799596256193797, "grad_norm": 8.894865036010742, "learning_rate": 3.898880528537346e-06, "loss": 0.3857, "num_input_tokens_seen": 9144608, "step": 4250 }, { "epoch": 0.7808772251789319, "grad_norm": 8.391230583190918, "learning_rate": 3.903468526335108e-06, "loss": 0.3089, "num_input_tokens_seen": 9155424, "step": 4255 }, { "epoch": 0.7817948247384842, "grad_norm": 13.96725845336914, "learning_rate": 3.908056524132869e-06, "loss": 0.3766, "num_input_tokens_seen": 9167040, "step": 4260 }, { "epoch": 0.7827124242980363, "grad_norm": 4.965471267700195, "learning_rate": 3.91264452193063e-06, "loss": 0.3249, "num_input_tokens_seen": 9178528, "step": 4265 }, { "epoch": 0.7836300238575885, "grad_norm": 4.525793552398682, "learning_rate": 3.91723251972839e-06, "loss": 0.3294, "num_input_tokens_seen": 9189344, "step": 4270 }, { "epoch": 0.7845476234171408, "grad_norm": 5.1087517738342285, "learning_rate": 3.921820517526152e-06, "loss": 0.2964, "num_input_tokens_seen": 9200992, "step": 4275 }, { "epoch": 0.785465222976693, "grad_norm": 4.877570629119873, "learning_rate": 3.926408515323913e-06, "loss": 0.3381, "num_input_tokens_seen": 9211616, "step": 4280 }, { "epoch": 0.7863828225362451, "grad_norm": 10.126522064208984, "learning_rate": 3.930996513121674e-06, "loss": 0.2692, "num_input_tokens_seen": 9221888, "step": 4285 }, { "epoch": 0.7873004220957974, "grad_norm": 8.810115814208984, "learning_rate": 3.935584510919435e-06, "loss": 0.2576, "num_input_tokens_seen": 9232416, "step": 4290 }, { "epoch": 0.7882180216553496, "grad_norm": 11.586681365966797, "learning_rate": 3.940172508717196e-06, "loss": 0.3363, "num_input_tokens_seen": 9243232, "step": 4295 }, { "epoch": 0.7891356212149018, "grad_norm": 17.352798461914062, "learning_rate": 3.9447605065149575e-06, "loss": 0.3257, "num_input_tokens_seen": 9254368, "step": 4300 }, { "epoch": 0.7900532207744541, "grad_norm": 10.051648139953613, "learning_rate": 3.949348504312718e-06, "loss": 0.3073, "num_input_tokens_seen": 9265344, "step": 4305 }, { "epoch": 0.7909708203340062, "grad_norm": 15.0191011428833, "learning_rate": 3.953936502110479e-06, "loss": 0.2789, "num_input_tokens_seen": 9276608, "step": 4310 }, { "epoch": 0.7918884198935584, "grad_norm": 21.657609939575195, "learning_rate": 3.958524499908241e-06, "loss": 0.3996, "num_input_tokens_seen": 9287392, "step": 4315 }, { "epoch": 0.7928060194531107, "grad_norm": 15.034111022949219, "learning_rate": 3.963112497706001e-06, "loss": 0.3349, "num_input_tokens_seen": 9297888, "step": 4320 }, { "epoch": 0.7937236190126629, "grad_norm": 7.328571319580078, "learning_rate": 3.967700495503763e-06, "loss": 0.3249, "num_input_tokens_seen": 9307968, "step": 4325 }, { "epoch": 0.794641218572215, "grad_norm": 8.172501564025879, "learning_rate": 3.972288493301524e-06, "loss": 0.4226, "num_input_tokens_seen": 9317184, "step": 4330 }, { "epoch": 0.7955588181317673, "grad_norm": 9.396809577941895, "learning_rate": 3.9768764910992845e-06, "loss": 0.3155, "num_input_tokens_seen": 9327040, "step": 4335 }, { "epoch": 0.7964764176913195, "grad_norm": 9.559391975402832, "learning_rate": 3.981464488897046e-06, "loss": 0.423, "num_input_tokens_seen": 9337696, "step": 4340 }, { "epoch": 0.7973940172508717, "grad_norm": 6.4961113929748535, "learning_rate": 3.986052486694807e-06, "loss": 0.3555, "num_input_tokens_seen": 9348608, "step": 4345 }, { "epoch": 0.798311616810424, "grad_norm": 6.134031295776367, "learning_rate": 3.990640484492568e-06, "loss": 0.352, "num_input_tokens_seen": 9359744, "step": 4350 }, { "epoch": 0.7992292163699761, "grad_norm": 9.32012939453125, "learning_rate": 3.995228482290329e-06, "loss": 0.3065, "num_input_tokens_seen": 9370848, "step": 4355 }, { "epoch": 0.8001468159295283, "grad_norm": 4.679318428039551, "learning_rate": 3.9998164800880904e-06, "loss": 0.3125, "num_input_tokens_seen": 9381440, "step": 4360 }, { "epoch": 0.8010644154890806, "grad_norm": 7.9622297286987305, "learning_rate": 4.004404477885851e-06, "loss": 0.335, "num_input_tokens_seen": 9393152, "step": 4365 }, { "epoch": 0.8019820150486328, "grad_norm": 5.5195770263671875, "learning_rate": 4.0089924756836115e-06, "loss": 0.2647, "num_input_tokens_seen": 9403168, "step": 4370 }, { "epoch": 0.802899614608185, "grad_norm": 10.975074768066406, "learning_rate": 4.013580473481373e-06, "loss": 0.2924, "num_input_tokens_seen": 9413472, "step": 4375 }, { "epoch": 0.8038172141677372, "grad_norm": 6.781746864318848, "learning_rate": 4.018168471279134e-06, "loss": 0.3105, "num_input_tokens_seen": 9424608, "step": 4380 }, { "epoch": 0.8047348137272894, "grad_norm": 5.614599227905273, "learning_rate": 4.022756469076895e-06, "loss": 0.4487, "num_input_tokens_seen": 9436128, "step": 4385 }, { "epoch": 0.8056524132868417, "grad_norm": 14.38370132446289, "learning_rate": 4.027344466874656e-06, "loss": 0.2612, "num_input_tokens_seen": 9446624, "step": 4390 }, { "epoch": 0.8065700128463938, "grad_norm": 5.609727382659912, "learning_rate": 4.0319324646724174e-06, "loss": 0.2853, "num_input_tokens_seen": 9458080, "step": 4395 }, { "epoch": 0.807487612405946, "grad_norm": 14.668912887573242, "learning_rate": 4.036520462470178e-06, "loss": 0.3564, "num_input_tokens_seen": 9468544, "step": 4400 }, { "epoch": 0.8084052119654983, "grad_norm": 6.312440395355225, "learning_rate": 4.041108460267939e-06, "loss": 0.3521, "num_input_tokens_seen": 9479520, "step": 4405 }, { "epoch": 0.8093228115250505, "grad_norm": 44.65073776245117, "learning_rate": 4.045696458065701e-06, "loss": 0.2808, "num_input_tokens_seen": 9489952, "step": 4410 }, { "epoch": 0.8102404110846027, "grad_norm": 11.273595809936523, "learning_rate": 4.050284455863461e-06, "loss": 0.2577, "num_input_tokens_seen": 9500544, "step": 4415 }, { "epoch": 0.8111580106441549, "grad_norm": 14.096966743469238, "learning_rate": 4.0548724536612226e-06, "loss": 0.3038, "num_input_tokens_seen": 9511296, "step": 4420 }, { "epoch": 0.8120756102037071, "grad_norm": 7.689033031463623, "learning_rate": 4.059460451458984e-06, "loss": 0.3469, "num_input_tokens_seen": 9522752, "step": 4425 }, { "epoch": 0.8129932097632593, "grad_norm": 11.920412063598633, "learning_rate": 4.0640484492567444e-06, "loss": 0.3059, "num_input_tokens_seen": 9533376, "step": 4430 }, { "epoch": 0.8139108093228116, "grad_norm": 8.092939376831055, "learning_rate": 4.068636447054506e-06, "loss": 0.3445, "num_input_tokens_seen": 9545056, "step": 4435 }, { "epoch": 0.8148284088823637, "grad_norm": 20.237106323242188, "learning_rate": 4.073224444852267e-06, "loss": 0.4468, "num_input_tokens_seen": 9556032, "step": 4440 }, { "epoch": 0.8157460084419159, "grad_norm": 8.068367004394531, "learning_rate": 4.077812442650028e-06, "loss": 0.3362, "num_input_tokens_seen": 9565120, "step": 4445 }, { "epoch": 0.8166636080014682, "grad_norm": 10.329432487487793, "learning_rate": 4.082400440447789e-06, "loss": 0.4053, "num_input_tokens_seen": 9576832, "step": 4450 }, { "epoch": 0.8175812075610204, "grad_norm": 13.49870777130127, "learning_rate": 4.0869884382455495e-06, "loss": 0.3664, "num_input_tokens_seen": 9586624, "step": 4455 }, { "epoch": 0.8184988071205725, "grad_norm": 14.212019920349121, "learning_rate": 4.091576436043311e-06, "loss": 0.3352, "num_input_tokens_seen": 9597408, "step": 4460 }, { "epoch": 0.8194164066801248, "grad_norm": 4.624407768249512, "learning_rate": 4.096164433841072e-06, "loss": 0.3621, "num_input_tokens_seen": 9608800, "step": 4465 }, { "epoch": 0.820334006239677, "grad_norm": 9.436686515808105, "learning_rate": 4.100752431638833e-06, "loss": 0.3558, "num_input_tokens_seen": 9619808, "step": 4470 }, { "epoch": 0.8212516057992292, "grad_norm": 5.476808071136475, "learning_rate": 4.105340429436594e-06, "loss": 0.3333, "num_input_tokens_seen": 9629984, "step": 4475 }, { "epoch": 0.8221692053587815, "grad_norm": 5.923276424407959, "learning_rate": 4.1099284272343555e-06, "loss": 0.3284, "num_input_tokens_seen": 9640512, "step": 4480 }, { "epoch": 0.8230868049183336, "grad_norm": 5.0629472732543945, "learning_rate": 4.114516425032116e-06, "loss": 0.3177, "num_input_tokens_seen": 9650912, "step": 4485 }, { "epoch": 0.8240044044778858, "grad_norm": 5.158577919006348, "learning_rate": 4.119104422829877e-06, "loss": 0.3098, "num_input_tokens_seen": 9663008, "step": 4490 }, { "epoch": 0.8249220040374381, "grad_norm": 4.254199504852295, "learning_rate": 4.123692420627639e-06, "loss": 0.3266, "num_input_tokens_seen": 9673184, "step": 4495 }, { "epoch": 0.8258396035969903, "grad_norm": 8.66910457611084, "learning_rate": 4.128280418425399e-06, "loss": 0.3449, "num_input_tokens_seen": 9684608, "step": 4500 }, { "epoch": 0.8267572031565424, "grad_norm": 6.362743854522705, "learning_rate": 4.132868416223161e-06, "loss": 0.345, "num_input_tokens_seen": 9696384, "step": 4505 }, { "epoch": 0.8276748027160947, "grad_norm": 9.515473365783691, "learning_rate": 4.137456414020922e-06, "loss": 0.3256, "num_input_tokens_seen": 9707072, "step": 4510 }, { "epoch": 0.8285924022756469, "grad_norm": 7.368187427520752, "learning_rate": 4.1420444118186825e-06, "loss": 0.3346, "num_input_tokens_seen": 9718304, "step": 4515 }, { "epoch": 0.8295100018351991, "grad_norm": 6.728349685668945, "learning_rate": 4.146632409616444e-06, "loss": 0.3291, "num_input_tokens_seen": 9729088, "step": 4520 }, { "epoch": 0.8304276013947514, "grad_norm": 5.525705814361572, "learning_rate": 4.151220407414205e-06, "loss": 0.2946, "num_input_tokens_seen": 9739616, "step": 4525 }, { "epoch": 0.8313452009543035, "grad_norm": 11.114752769470215, "learning_rate": 4.155808405211966e-06, "loss": 0.2674, "num_input_tokens_seen": 9749344, "step": 4530 }, { "epoch": 0.8322628005138557, "grad_norm": 4.516454219818115, "learning_rate": 4.160396403009727e-06, "loss": 0.3681, "num_input_tokens_seen": 9759424, "step": 4535 }, { "epoch": 0.833180400073408, "grad_norm": 10.120243072509766, "learning_rate": 4.164984400807488e-06, "loss": 0.3564, "num_input_tokens_seen": 9769696, "step": 4540 }, { "epoch": 0.8340979996329602, "grad_norm": 6.9119696617126465, "learning_rate": 4.169572398605249e-06, "loss": 0.3644, "num_input_tokens_seen": 9781280, "step": 4545 }, { "epoch": 0.8350155991925123, "grad_norm": 5.976223468780518, "learning_rate": 4.17416039640301e-06, "loss": 0.258, "num_input_tokens_seen": 9791424, "step": 4550 }, { "epoch": 0.8359331987520646, "grad_norm": 6.06378698348999, "learning_rate": 4.178748394200771e-06, "loss": 0.315, "num_input_tokens_seen": 9802336, "step": 4555 }, { "epoch": 0.8368507983116168, "grad_norm": 37.03574752807617, "learning_rate": 4.183336391998532e-06, "loss": 0.2805, "num_input_tokens_seen": 9814496, "step": 4560 }, { "epoch": 0.837768397871169, "grad_norm": 14.232616424560547, "learning_rate": 4.187924389796294e-06, "loss": 0.4368, "num_input_tokens_seen": 9824544, "step": 4565 }, { "epoch": 0.8386859974307213, "grad_norm": 37.92394256591797, "learning_rate": 4.192512387594054e-06, "loss": 0.4321, "num_input_tokens_seen": 9837344, "step": 4570 }, { "epoch": 0.8396035969902734, "grad_norm": 8.800310134887695, "learning_rate": 4.1971003853918155e-06, "loss": 0.3282, "num_input_tokens_seen": 9846432, "step": 4575 }, { "epoch": 0.8405211965498257, "grad_norm": 6.450525760650635, "learning_rate": 4.201688383189577e-06, "loss": 0.3142, "num_input_tokens_seen": 9857536, "step": 4580 }, { "epoch": 0.8414387961093779, "grad_norm": 11.847301483154297, "learning_rate": 4.206276380987337e-06, "loss": 0.3665, "num_input_tokens_seen": 9867200, "step": 4585 }, { "epoch": 0.8423563956689301, "grad_norm": 12.118802070617676, "learning_rate": 4.210864378785099e-06, "loss": 0.3832, "num_input_tokens_seen": 9878144, "step": 4590 }, { "epoch": 0.8432739952284823, "grad_norm": 6.7261199951171875, "learning_rate": 4.21545237658286e-06, "loss": 0.3521, "num_input_tokens_seen": 9888896, "step": 4595 }, { "epoch": 0.8441915947880345, "grad_norm": 7.92734956741333, "learning_rate": 4.2200403743806206e-06, "loss": 0.309, "num_input_tokens_seen": 9899136, "step": 4600 }, { "epoch": 0.8451091943475867, "grad_norm": 6.04618501663208, "learning_rate": 4.224628372178382e-06, "loss": 0.3126, "num_input_tokens_seen": 9907360, "step": 4605 }, { "epoch": 0.846026793907139, "grad_norm": 8.260908126831055, "learning_rate": 4.229216369976143e-06, "loss": 0.3431, "num_input_tokens_seen": 9918368, "step": 4610 }, { "epoch": 0.8469443934666911, "grad_norm": 15.409194946289062, "learning_rate": 4.233804367773904e-06, "loss": 0.3439, "num_input_tokens_seen": 9928288, "step": 4615 }, { "epoch": 0.8478619930262433, "grad_norm": 8.01392936706543, "learning_rate": 4.238392365571664e-06, "loss": 0.4561, "num_input_tokens_seen": 9937984, "step": 4620 }, { "epoch": 0.8487795925857956, "grad_norm": 13.997448921203613, "learning_rate": 4.242980363369426e-06, "loss": 0.3755, "num_input_tokens_seen": 9949152, "step": 4625 }, { "epoch": 0.8496971921453478, "grad_norm": 6.046789646148682, "learning_rate": 4.247568361167187e-06, "loss": 0.3055, "num_input_tokens_seen": 9959648, "step": 4630 }, { "epoch": 0.8506147917049, "grad_norm": 5.442129611968994, "learning_rate": 4.2521563589649476e-06, "loss": 0.4029, "num_input_tokens_seen": 9970592, "step": 4635 }, { "epoch": 0.8515323912644522, "grad_norm": 5.314269542694092, "learning_rate": 4.256744356762709e-06, "loss": 0.3171, "num_input_tokens_seen": 9981888, "step": 4640 }, { "epoch": 0.8524499908240044, "grad_norm": 9.11137580871582, "learning_rate": 4.26133235456047e-06, "loss": 0.3369, "num_input_tokens_seen": 9992320, "step": 4645 }, { "epoch": 0.8533675903835566, "grad_norm": 7.593227863311768, "learning_rate": 4.265920352358231e-06, "loss": 0.2778, "num_input_tokens_seen": 10003360, "step": 4650 }, { "epoch": 0.8542851899431089, "grad_norm": 4.229405403137207, "learning_rate": 4.270508350155992e-06, "loss": 0.2813, "num_input_tokens_seen": 10014176, "step": 4655 }, { "epoch": 0.855202789502661, "grad_norm": 9.980815887451172, "learning_rate": 4.2750963479537535e-06, "loss": 0.3158, "num_input_tokens_seen": 10025568, "step": 4660 }, { "epoch": 0.8561203890622132, "grad_norm": 14.67064380645752, "learning_rate": 4.279684345751514e-06, "loss": 0.3238, "num_input_tokens_seen": 10035680, "step": 4665 }, { "epoch": 0.8570379886217655, "grad_norm": 9.086355209350586, "learning_rate": 4.284272343549275e-06, "loss": 0.3062, "num_input_tokens_seen": 10046272, "step": 4670 }, { "epoch": 0.8579555881813177, "grad_norm": 8.535317420959473, "learning_rate": 4.288860341347037e-06, "loss": 0.3411, "num_input_tokens_seen": 10056960, "step": 4675 }, { "epoch": 0.8588731877408698, "grad_norm": 13.976900100708008, "learning_rate": 4.293448339144797e-06, "loss": 0.2971, "num_input_tokens_seen": 10068256, "step": 4680 }, { "epoch": 0.8597907873004221, "grad_norm": 26.64862060546875, "learning_rate": 4.298036336942559e-06, "loss": 0.3555, "num_input_tokens_seen": 10079200, "step": 4685 }, { "epoch": 0.8607083868599743, "grad_norm": 9.116353988647461, "learning_rate": 4.30262433474032e-06, "loss": 0.246, "num_input_tokens_seen": 10090176, "step": 4690 }, { "epoch": 0.8616259864195265, "grad_norm": 15.864362716674805, "learning_rate": 4.3072123325380805e-06, "loss": 0.3907, "num_input_tokens_seen": 10101120, "step": 4695 }, { "epoch": 0.8625435859790788, "grad_norm": 25.202444076538086, "learning_rate": 4.311800330335842e-06, "loss": 0.3971, "num_input_tokens_seen": 10111648, "step": 4700 }, { "epoch": 0.8634611855386309, "grad_norm": 31.366615295410156, "learning_rate": 4.316388328133602e-06, "loss": 0.3851, "num_input_tokens_seen": 10123136, "step": 4705 }, { "epoch": 0.8643787850981831, "grad_norm": 19.485031127929688, "learning_rate": 4.320976325931365e-06, "loss": 0.3588, "num_input_tokens_seen": 10134176, "step": 4710 }, { "epoch": 0.8652963846577354, "grad_norm": 54.54817199707031, "learning_rate": 4.325564323729125e-06, "loss": 0.2871, "num_input_tokens_seen": 10146304, "step": 4715 }, { "epoch": 0.8662139842172876, "grad_norm": 12.761239051818848, "learning_rate": 4.330152321526886e-06, "loss": 0.3152, "num_input_tokens_seen": 10157472, "step": 4720 }, { "epoch": 0.8671315837768397, "grad_norm": 12.86854076385498, "learning_rate": 4.334740319324647e-06, "loss": 0.265, "num_input_tokens_seen": 10168480, "step": 4725 }, { "epoch": 0.868049183336392, "grad_norm": 35.25227737426758, "learning_rate": 4.339328317122408e-06, "loss": 0.4762, "num_input_tokens_seen": 10178496, "step": 4730 }, { "epoch": 0.8689667828959442, "grad_norm": 5.218728542327881, "learning_rate": 4.343916314920169e-06, "loss": 0.3856, "num_input_tokens_seen": 10188992, "step": 4735 }, { "epoch": 0.8698843824554964, "grad_norm": 5.5560126304626465, "learning_rate": 4.34850431271793e-06, "loss": 0.2874, "num_input_tokens_seen": 10199680, "step": 4740 }, { "epoch": 0.8708019820150487, "grad_norm": 42.369991302490234, "learning_rate": 4.353092310515692e-06, "loss": 0.3965, "num_input_tokens_seen": 10210688, "step": 4745 }, { "epoch": 0.8717195815746008, "grad_norm": 5.338914394378662, "learning_rate": 4.357680308313452e-06, "loss": 0.3119, "num_input_tokens_seen": 10220864, "step": 4750 }, { "epoch": 0.872637181134153, "grad_norm": 4.717718124389648, "learning_rate": 4.3622683061112135e-06, "loss": 0.3944, "num_input_tokens_seen": 10232544, "step": 4755 }, { "epoch": 0.8735547806937053, "grad_norm": 18.466480255126953, "learning_rate": 4.366856303908975e-06, "loss": 0.3356, "num_input_tokens_seen": 10242784, "step": 4760 }, { "epoch": 0.8744723802532575, "grad_norm": 16.302705764770508, "learning_rate": 4.371444301706735e-06, "loss": 0.2247, "num_input_tokens_seen": 10253504, "step": 4765 }, { "epoch": 0.8753899798128096, "grad_norm": 13.764081954956055, "learning_rate": 4.376032299504497e-06, "loss": 0.4088, "num_input_tokens_seen": 10265024, "step": 4770 }, { "epoch": 0.8763075793723619, "grad_norm": 4.719555854797363, "learning_rate": 4.380620297302258e-06, "loss": 0.3182, "num_input_tokens_seen": 10274816, "step": 4775 }, { "epoch": 0.8772251789319141, "grad_norm": 20.695940017700195, "learning_rate": 4.385208295100019e-06, "loss": 0.3592, "num_input_tokens_seen": 10286400, "step": 4780 }, { "epoch": 0.8781427784914663, "grad_norm": 5.582788944244385, "learning_rate": 4.38979629289778e-06, "loss": 0.3805, "num_input_tokens_seen": 10297024, "step": 4785 }, { "epoch": 0.8790603780510186, "grad_norm": 5.588581562042236, "learning_rate": 4.3943842906955405e-06, "loss": 0.3701, "num_input_tokens_seen": 10307392, "step": 4790 }, { "epoch": 0.8799779776105707, "grad_norm": 4.8477463722229, "learning_rate": 4.398972288493302e-06, "loss": 0.5379, "num_input_tokens_seen": 10317792, "step": 4795 }, { "epoch": 0.880895577170123, "grad_norm": 5.066054344177246, "learning_rate": 4.403560286291063e-06, "loss": 0.3207, "num_input_tokens_seen": 10329248, "step": 4800 }, { "epoch": 0.8818131767296752, "grad_norm": 5.621156215667725, "learning_rate": 4.408148284088824e-06, "loss": 0.2706, "num_input_tokens_seen": 10340768, "step": 4805 }, { "epoch": 0.8827307762892274, "grad_norm": 3.9171600341796875, "learning_rate": 4.412736281886585e-06, "loss": 0.3315, "num_input_tokens_seen": 10352096, "step": 4810 }, { "epoch": 0.8836483758487796, "grad_norm": 10.205620765686035, "learning_rate": 4.417324279684346e-06, "loss": 0.4127, "num_input_tokens_seen": 10363136, "step": 4815 }, { "epoch": 0.8845659754083318, "grad_norm": 5.31640625, "learning_rate": 4.421912277482107e-06, "loss": 0.3194, "num_input_tokens_seen": 10374304, "step": 4820 }, { "epoch": 0.885483574967884, "grad_norm": 8.875844955444336, "learning_rate": 4.426500275279868e-06, "loss": 0.3498, "num_input_tokens_seen": 10385600, "step": 4825 }, { "epoch": 0.8864011745274363, "grad_norm": 4.6464691162109375, "learning_rate": 4.43108827307763e-06, "loss": 0.2382, "num_input_tokens_seen": 10397952, "step": 4830 }, { "epoch": 0.8873187740869884, "grad_norm": 5.472815036773682, "learning_rate": 4.43567627087539e-06, "loss": 0.3696, "num_input_tokens_seen": 10408544, "step": 4835 }, { "epoch": 0.8882363736465406, "grad_norm": 7.875409126281738, "learning_rate": 4.4402642686731515e-06, "loss": 0.3369, "num_input_tokens_seen": 10417504, "step": 4840 }, { "epoch": 0.8891539732060929, "grad_norm": 10.073831558227539, "learning_rate": 4.444852266470913e-06, "loss": 0.3377, "num_input_tokens_seen": 10427712, "step": 4845 }, { "epoch": 0.8900715727656451, "grad_norm": 7.273239612579346, "learning_rate": 4.449440264268673e-06, "loss": 0.372, "num_input_tokens_seen": 10438368, "step": 4850 }, { "epoch": 0.8909891723251973, "grad_norm": 5.007936477661133, "learning_rate": 4.454028262066434e-06, "loss": 0.3985, "num_input_tokens_seen": 10449344, "step": 4855 }, { "epoch": 0.8919067718847495, "grad_norm": 11.971400260925293, "learning_rate": 4.458616259864196e-06, "loss": 0.2947, "num_input_tokens_seen": 10459840, "step": 4860 }, { "epoch": 0.8928243714443017, "grad_norm": 6.75638484954834, "learning_rate": 4.463204257661957e-06, "loss": 0.4462, "num_input_tokens_seen": 10470688, "step": 4865 }, { "epoch": 0.8937419710038539, "grad_norm": 4.198892116546631, "learning_rate": 4.467792255459717e-06, "loss": 0.3838, "num_input_tokens_seen": 10481376, "step": 4870 }, { "epoch": 0.8946595705634062, "grad_norm": 3.9606072902679443, "learning_rate": 4.4723802532574785e-06, "loss": 0.3216, "num_input_tokens_seen": 10493248, "step": 4875 }, { "epoch": 0.8955771701229583, "grad_norm": 5.558568954467773, "learning_rate": 4.47696825105524e-06, "loss": 0.3555, "num_input_tokens_seen": 10503808, "step": 4880 }, { "epoch": 0.8964947696825105, "grad_norm": 4.6418328285217285, "learning_rate": 4.481556248853e-06, "loss": 0.3952, "num_input_tokens_seen": 10515552, "step": 4885 }, { "epoch": 0.8974123692420628, "grad_norm": 6.10359001159668, "learning_rate": 4.486144246650762e-06, "loss": 0.3495, "num_input_tokens_seen": 10527296, "step": 4890 }, { "epoch": 0.898329968801615, "grad_norm": 5.229445457458496, "learning_rate": 4.490732244448523e-06, "loss": 0.3335, "num_input_tokens_seen": 10539008, "step": 4895 }, { "epoch": 0.8992475683611671, "grad_norm": 4.636869430541992, "learning_rate": 4.495320242246284e-06, "loss": 0.327, "num_input_tokens_seen": 10549920, "step": 4900 }, { "epoch": 0.9001651679207194, "grad_norm": 3.824532985687256, "learning_rate": 4.499908240044045e-06, "loss": 0.3492, "num_input_tokens_seen": 10560000, "step": 4905 }, { "epoch": 0.9010827674802716, "grad_norm": 6.065446376800537, "learning_rate": 4.504496237841806e-06, "loss": 0.3438, "num_input_tokens_seen": 10569984, "step": 4910 }, { "epoch": 0.9020003670398238, "grad_norm": 7.488662242889404, "learning_rate": 4.509084235639567e-06, "loss": 0.3462, "num_input_tokens_seen": 10581568, "step": 4915 }, { "epoch": 0.9029179665993761, "grad_norm": 10.485336303710938, "learning_rate": 4.513672233437328e-06, "loss": 0.3742, "num_input_tokens_seen": 10591680, "step": 4920 }, { "epoch": 0.9038355661589282, "grad_norm": 25.808340072631836, "learning_rate": 4.51826023123509e-06, "loss": 0.3815, "num_input_tokens_seen": 10602976, "step": 4925 }, { "epoch": 0.9047531657184804, "grad_norm": 7.557337284088135, "learning_rate": 4.522848229032851e-06, "loss": 0.2841, "num_input_tokens_seen": 10614752, "step": 4930 }, { "epoch": 0.9056707652780327, "grad_norm": 14.51357650756836, "learning_rate": 4.5274362268306115e-06, "loss": 0.4334, "num_input_tokens_seen": 10626368, "step": 4935 }, { "epoch": 0.9065883648375849, "grad_norm": 4.387885093688965, "learning_rate": 4.532024224628373e-06, "loss": 0.3258, "num_input_tokens_seen": 10637152, "step": 4940 }, { "epoch": 0.907505964397137, "grad_norm": 11.354185104370117, "learning_rate": 4.536612222426134e-06, "loss": 0.2844, "num_input_tokens_seen": 10646912, "step": 4945 }, { "epoch": 0.9084235639566893, "grad_norm": 7.3453497886657715, "learning_rate": 4.541200220223895e-06, "loss": 0.2921, "num_input_tokens_seen": 10657536, "step": 4950 }, { "epoch": 0.9093411635162415, "grad_norm": 6.052501678466797, "learning_rate": 4.545788218021655e-06, "loss": 0.3241, "num_input_tokens_seen": 10667328, "step": 4955 }, { "epoch": 0.9102587630757937, "grad_norm": 13.102521896362305, "learning_rate": 4.550376215819417e-06, "loss": 0.3659, "num_input_tokens_seen": 10678912, "step": 4960 }, { "epoch": 0.911176362635346, "grad_norm": 9.315043449401855, "learning_rate": 4.554964213617178e-06, "loss": 0.4002, "num_input_tokens_seen": 10689088, "step": 4965 }, { "epoch": 0.9120939621948981, "grad_norm": 6.65394401550293, "learning_rate": 4.5595522114149385e-06, "loss": 0.3577, "num_input_tokens_seen": 10698880, "step": 4970 }, { "epoch": 0.9130115617544503, "grad_norm": 3.8470075130462646, "learning_rate": 4.5641402092127e-06, "loss": 0.2573, "num_input_tokens_seen": 10709408, "step": 4975 }, { "epoch": 0.9139291613140026, "grad_norm": 6.145107746124268, "learning_rate": 4.568728207010461e-06, "loss": 0.2865, "num_input_tokens_seen": 10721024, "step": 4980 }, { "epoch": 0.9148467608735548, "grad_norm": 12.850170135498047, "learning_rate": 4.573316204808222e-06, "loss": 0.2667, "num_input_tokens_seen": 10730912, "step": 4985 }, { "epoch": 0.9157643604331069, "grad_norm": 13.524791717529297, "learning_rate": 4.577904202605983e-06, "loss": 0.2785, "num_input_tokens_seen": 10740256, "step": 4990 }, { "epoch": 0.9166819599926592, "grad_norm": 4.8134989738464355, "learning_rate": 4.5824922004037444e-06, "loss": 0.2672, "num_input_tokens_seen": 10751136, "step": 4995 }, { "epoch": 0.9175995595522114, "grad_norm": 8.810150146484375, "learning_rate": 4.587080198201505e-06, "loss": 0.2973, "num_input_tokens_seen": 10761280, "step": 5000 }, { "epoch": 0.9185171591117637, "grad_norm": 9.952218055725098, "learning_rate": 4.591668195999266e-06, "loss": 0.3272, "num_input_tokens_seen": 10771648, "step": 5005 }, { "epoch": 0.9194347586713159, "grad_norm": 13.593283653259277, "learning_rate": 4.596256193797028e-06, "loss": 0.2994, "num_input_tokens_seen": 10782304, "step": 5010 }, { "epoch": 0.920352358230868, "grad_norm": 10.681312561035156, "learning_rate": 4.600844191594788e-06, "loss": 0.4553, "num_input_tokens_seen": 10793472, "step": 5015 }, { "epoch": 0.9212699577904203, "grad_norm": 5.669282913208008, "learning_rate": 4.6054321893925496e-06, "loss": 0.3351, "num_input_tokens_seen": 10804384, "step": 5020 }, { "epoch": 0.9221875573499725, "grad_norm": 10.053821563720703, "learning_rate": 4.610020187190311e-06, "loss": 0.3548, "num_input_tokens_seen": 10812736, "step": 5025 }, { "epoch": 0.9231051569095247, "grad_norm": 3.912978410720825, "learning_rate": 4.6146081849880714e-06, "loss": 0.3774, "num_input_tokens_seen": 10822208, "step": 5030 }, { "epoch": 0.924022756469077, "grad_norm": 10.200760841369629, "learning_rate": 4.619196182785833e-06, "loss": 0.3586, "num_input_tokens_seen": 10833664, "step": 5035 }, { "epoch": 0.9249403560286291, "grad_norm": 6.030453205108643, "learning_rate": 4.623784180583593e-06, "loss": 0.2758, "num_input_tokens_seen": 10844832, "step": 5040 }, { "epoch": 0.9258579555881813, "grad_norm": 3.0611460208892822, "learning_rate": 4.628372178381355e-06, "loss": 0.2751, "num_input_tokens_seen": 10855680, "step": 5045 }, { "epoch": 0.9267755551477336, "grad_norm": 9.617507934570312, "learning_rate": 4.632960176179116e-06, "loss": 0.3252, "num_input_tokens_seen": 10866592, "step": 5050 }, { "epoch": 0.9276931547072857, "grad_norm": 24.784332275390625, "learning_rate": 4.6375481739768765e-06, "loss": 0.3362, "num_input_tokens_seen": 10877856, "step": 5055 }, { "epoch": 0.9286107542668379, "grad_norm": 30.55122184753418, "learning_rate": 4.642136171774638e-06, "loss": 0.3645, "num_input_tokens_seen": 10888704, "step": 5060 }, { "epoch": 0.9295283538263902, "grad_norm": 4.872676849365234, "learning_rate": 4.646724169572399e-06, "loss": 0.3055, "num_input_tokens_seen": 10900384, "step": 5065 }, { "epoch": 0.9304459533859424, "grad_norm": 20.514148712158203, "learning_rate": 4.65131216737016e-06, "loss": 0.2869, "num_input_tokens_seen": 10910112, "step": 5070 }, { "epoch": 0.9313635529454946, "grad_norm": 5.199975490570068, "learning_rate": 4.655900165167921e-06, "loss": 0.3501, "num_input_tokens_seen": 10920928, "step": 5075 }, { "epoch": 0.9322811525050468, "grad_norm": 10.983232498168945, "learning_rate": 4.6604881629656825e-06, "loss": 0.3029, "num_input_tokens_seen": 10931680, "step": 5080 }, { "epoch": 0.933198752064599, "grad_norm": 68.52812957763672, "learning_rate": 4.665076160763443e-06, "loss": 0.5285, "num_input_tokens_seen": 10941472, "step": 5085 }, { "epoch": 0.9341163516241512, "grad_norm": 8.425037384033203, "learning_rate": 4.669664158561204e-06, "loss": 0.3544, "num_input_tokens_seen": 10952096, "step": 5090 }, { "epoch": 0.9350339511837035, "grad_norm": 6.653318405151367, "learning_rate": 4.674252156358966e-06, "loss": 0.4075, "num_input_tokens_seen": 10961280, "step": 5095 }, { "epoch": 0.9359515507432556, "grad_norm": 6.973940372467041, "learning_rate": 4.678840154156726e-06, "loss": 0.3324, "num_input_tokens_seen": 10972480, "step": 5100 }, { "epoch": 0.9368691503028078, "grad_norm": 13.195307731628418, "learning_rate": 4.683428151954487e-06, "loss": 0.3618, "num_input_tokens_seen": 10983424, "step": 5105 }, { "epoch": 0.9377867498623601, "grad_norm": 4.867939472198486, "learning_rate": 4.688016149752249e-06, "loss": 0.3555, "num_input_tokens_seen": 10995168, "step": 5110 }, { "epoch": 0.9387043494219123, "grad_norm": 6.0892791748046875, "learning_rate": 4.6926041475500095e-06, "loss": 0.3547, "num_input_tokens_seen": 11006304, "step": 5115 }, { "epoch": 0.9396219489814644, "grad_norm": 5.729202747344971, "learning_rate": 4.69719214534777e-06, "loss": 0.3075, "num_input_tokens_seen": 11018016, "step": 5120 }, { "epoch": 0.9405395485410167, "grad_norm": 18.766706466674805, "learning_rate": 4.701780143145531e-06, "loss": 0.3708, "num_input_tokens_seen": 11029408, "step": 5125 }, { "epoch": 0.9414571481005689, "grad_norm": 15.258736610412598, "learning_rate": 4.706368140943293e-06, "loss": 0.303, "num_input_tokens_seen": 11040000, "step": 5130 }, { "epoch": 0.9423747476601211, "grad_norm": 10.677972793579102, "learning_rate": 4.710956138741054e-06, "loss": 0.3865, "num_input_tokens_seen": 11050560, "step": 5135 }, { "epoch": 0.9432923472196734, "grad_norm": 7.070220470428467, "learning_rate": 4.715544136538815e-06, "loss": 0.3783, "num_input_tokens_seen": 11062016, "step": 5140 }, { "epoch": 0.9442099467792255, "grad_norm": 4.945295333862305, "learning_rate": 4.720132134336576e-06, "loss": 0.3859, "num_input_tokens_seen": 11071488, "step": 5145 }, { "epoch": 0.9451275463387777, "grad_norm": 9.550470352172852, "learning_rate": 4.724720132134337e-06, "loss": 0.2531, "num_input_tokens_seen": 11082304, "step": 5150 }, { "epoch": 0.94604514589833, "grad_norm": 15.413687705993652, "learning_rate": 4.729308129932098e-06, "loss": 0.3032, "num_input_tokens_seen": 11093952, "step": 5155 }, { "epoch": 0.9469627454578822, "grad_norm": 30.067270278930664, "learning_rate": 4.733896127729859e-06, "loss": 0.4174, "num_input_tokens_seen": 11106112, "step": 5160 }, { "epoch": 0.9478803450174343, "grad_norm": 8.203078269958496, "learning_rate": 4.7384841255276206e-06, "loss": 0.3618, "num_input_tokens_seen": 11117408, "step": 5165 }, { "epoch": 0.9487979445769866, "grad_norm": 11.268416404724121, "learning_rate": 4.743072123325381e-06, "loss": 0.3981, "num_input_tokens_seen": 11128608, "step": 5170 }, { "epoch": 0.9497155441365388, "grad_norm": 5.614422798156738, "learning_rate": 4.7476601211231424e-06, "loss": 0.3431, "num_input_tokens_seen": 11139296, "step": 5175 }, { "epoch": 0.950633143696091, "grad_norm": 6.386839389801025, "learning_rate": 4.752248118920904e-06, "loss": 0.3123, "num_input_tokens_seen": 11151520, "step": 5180 }, { "epoch": 0.9515507432556433, "grad_norm": 5.2811970710754395, "learning_rate": 4.756836116718664e-06, "loss": 0.3391, "num_input_tokens_seen": 11162656, "step": 5185 }, { "epoch": 0.9524683428151954, "grad_norm": 12.995345115661621, "learning_rate": 4.761424114516426e-06, "loss": 0.3252, "num_input_tokens_seen": 11172512, "step": 5190 }, { "epoch": 0.9533859423747476, "grad_norm": 4.626156806945801, "learning_rate": 4.766012112314187e-06, "loss": 0.2991, "num_input_tokens_seen": 11183040, "step": 5195 }, { "epoch": 0.9543035419342999, "grad_norm": 23.29116439819336, "learning_rate": 4.7706001101119476e-06, "loss": 0.4678, "num_input_tokens_seen": 11192896, "step": 5200 }, { "epoch": 0.9552211414938521, "grad_norm": 16.585350036621094, "learning_rate": 4.775188107909708e-06, "loss": 0.376, "num_input_tokens_seen": 11203968, "step": 5205 }, { "epoch": 0.9561387410534042, "grad_norm": 3.8546135425567627, "learning_rate": 4.7797761057074694e-06, "loss": 0.3949, "num_input_tokens_seen": 11214080, "step": 5210 }, { "epoch": 0.9570563406129565, "grad_norm": 5.1782636642456055, "learning_rate": 4.784364103505231e-06, "loss": 0.3937, "num_input_tokens_seen": 11224800, "step": 5215 }, { "epoch": 0.9579739401725087, "grad_norm": 7.290918350219727, "learning_rate": 4.788952101302991e-06, "loss": 0.3473, "num_input_tokens_seen": 11235296, "step": 5220 }, { "epoch": 0.958891539732061, "grad_norm": 6.078563690185547, "learning_rate": 4.793540099100753e-06, "loss": 0.3063, "num_input_tokens_seen": 11245376, "step": 5225 }, { "epoch": 0.9598091392916132, "grad_norm": 3.1084351539611816, "learning_rate": 4.798128096898514e-06, "loss": 0.3105, "num_input_tokens_seen": 11257120, "step": 5230 }, { "epoch": 0.9607267388511653, "grad_norm": 2.8470473289489746, "learning_rate": 4.8027160946962746e-06, "loss": 0.3539, "num_input_tokens_seen": 11267456, "step": 5235 }, { "epoch": 0.9616443384107176, "grad_norm": 8.223278045654297, "learning_rate": 4.807304092494036e-06, "loss": 0.3727, "num_input_tokens_seen": 11278336, "step": 5240 }, { "epoch": 0.9625619379702698, "grad_norm": 4.822170257568359, "learning_rate": 4.811892090291797e-06, "loss": 0.297, "num_input_tokens_seen": 11289504, "step": 5245 }, { "epoch": 0.963479537529822, "grad_norm": 4.311956405639648, "learning_rate": 4.816480088089558e-06, "loss": 0.2757, "num_input_tokens_seen": 11300416, "step": 5250 }, { "epoch": 0.9643971370893742, "grad_norm": 5.975355625152588, "learning_rate": 4.821068085887319e-06, "loss": 0.2938, "num_input_tokens_seen": 11312128, "step": 5255 }, { "epoch": 0.9653147366489264, "grad_norm": 8.176257133483887, "learning_rate": 4.8256560836850805e-06, "loss": 0.3131, "num_input_tokens_seen": 11322784, "step": 5260 }, { "epoch": 0.9662323362084786, "grad_norm": 5.628109931945801, "learning_rate": 4.830244081482841e-06, "loss": 0.4322, "num_input_tokens_seen": 11334464, "step": 5265 }, { "epoch": 0.9671499357680309, "grad_norm": 28.989761352539062, "learning_rate": 4.834832079280602e-06, "loss": 0.3721, "num_input_tokens_seen": 11345248, "step": 5270 }, { "epoch": 0.968067535327583, "grad_norm": 4.755873203277588, "learning_rate": 4.839420077078364e-06, "loss": 0.3074, "num_input_tokens_seen": 11356960, "step": 5275 }, { "epoch": 0.9689851348871352, "grad_norm": 4.876829624176025, "learning_rate": 4.844008074876124e-06, "loss": 0.4398, "num_input_tokens_seen": 11367840, "step": 5280 }, { "epoch": 0.9699027344466875, "grad_norm": 9.46894645690918, "learning_rate": 4.848596072673886e-06, "loss": 0.3477, "num_input_tokens_seen": 11378752, "step": 5285 }, { "epoch": 0.9708203340062397, "grad_norm": 10.845797538757324, "learning_rate": 4.853184070471646e-06, "loss": 0.376, "num_input_tokens_seen": 11390144, "step": 5290 }, { "epoch": 0.9717379335657919, "grad_norm": 4.447262763977051, "learning_rate": 4.8577720682694075e-06, "loss": 0.3674, "num_input_tokens_seen": 11401376, "step": 5295 }, { "epoch": 0.9726555331253441, "grad_norm": 9.07640552520752, "learning_rate": 4.862360066067169e-06, "loss": 0.3644, "num_input_tokens_seen": 11412928, "step": 5300 }, { "epoch": 0.9735731326848963, "grad_norm": 2.786336660385132, "learning_rate": 4.866948063864929e-06, "loss": 0.3005, "num_input_tokens_seen": 11423872, "step": 5305 }, { "epoch": 0.9744907322444485, "grad_norm": 3.043856620788574, "learning_rate": 4.871536061662691e-06, "loss": 0.3428, "num_input_tokens_seen": 11434944, "step": 5310 }, { "epoch": 0.9754083318040008, "grad_norm": 7.0948286056518555, "learning_rate": 4.876124059460452e-06, "loss": 0.3302, "num_input_tokens_seen": 11445216, "step": 5315 }, { "epoch": 0.976325931363553, "grad_norm": 5.399091720581055, "learning_rate": 4.880712057258213e-06, "loss": 0.3286, "num_input_tokens_seen": 11455040, "step": 5320 }, { "epoch": 0.9772435309231051, "grad_norm": 3.0693376064300537, "learning_rate": 4.885300055055974e-06, "loss": 0.2886, "num_input_tokens_seen": 11467200, "step": 5325 }, { "epoch": 0.9781611304826574, "grad_norm": 17.6737060546875, "learning_rate": 4.889888052853735e-06, "loss": 0.3787, "num_input_tokens_seen": 11478144, "step": 5330 }, { "epoch": 0.9790787300422096, "grad_norm": 10.343507766723633, "learning_rate": 4.894476050651496e-06, "loss": 0.2835, "num_input_tokens_seen": 11488800, "step": 5335 }, { "epoch": 0.9799963296017618, "grad_norm": 8.26794147491455, "learning_rate": 4.899064048449257e-06, "loss": 0.2962, "num_input_tokens_seen": 11500192, "step": 5340 }, { "epoch": 0.980913929161314, "grad_norm": 2.701359272003174, "learning_rate": 4.903652046247019e-06, "loss": 0.3378, "num_input_tokens_seen": 11511648, "step": 5345 }, { "epoch": 0.9818315287208662, "grad_norm": 10.520733833312988, "learning_rate": 4.908240044044779e-06, "loss": 0.3616, "num_input_tokens_seen": 11522176, "step": 5350 }, { "epoch": 0.9827491282804184, "grad_norm": 5.224367141723633, "learning_rate": 4.9128280418425405e-06, "loss": 0.2921, "num_input_tokens_seen": 11532544, "step": 5355 }, { "epoch": 0.9836667278399707, "grad_norm": 7.705044269561768, "learning_rate": 4.917416039640302e-06, "loss": 0.3847, "num_input_tokens_seen": 11542816, "step": 5360 }, { "epoch": 0.9845843273995228, "grad_norm": 3.4392452239990234, "learning_rate": 4.922004037438062e-06, "loss": 0.3789, "num_input_tokens_seen": 11553088, "step": 5365 }, { "epoch": 0.985501926959075, "grad_norm": 4.13350248336792, "learning_rate": 4.926592035235824e-06, "loss": 0.3296, "num_input_tokens_seen": 11563104, "step": 5370 }, { "epoch": 0.9864195265186273, "grad_norm": 3.1915178298950195, "learning_rate": 4.931180033033584e-06, "loss": 0.3202, "num_input_tokens_seen": 11573760, "step": 5375 }, { "epoch": 0.9873371260781795, "grad_norm": 5.434667587280273, "learning_rate": 4.935768030831346e-06, "loss": 0.3242, "num_input_tokens_seen": 11584768, "step": 5380 }, { "epoch": 0.9882547256377316, "grad_norm": 3.5990242958068848, "learning_rate": 4.940356028629107e-06, "loss": 0.3323, "num_input_tokens_seen": 11595584, "step": 5385 }, { "epoch": 0.9891723251972839, "grad_norm": 4.74513578414917, "learning_rate": 4.9449440264268675e-06, "loss": 0.3207, "num_input_tokens_seen": 11607296, "step": 5390 }, { "epoch": 0.9900899247568361, "grad_norm": 3.5978102684020996, "learning_rate": 4.949532024224629e-06, "loss": 0.2992, "num_input_tokens_seen": 11618016, "step": 5395 }, { "epoch": 0.9910075243163883, "grad_norm": 16.20389747619629, "learning_rate": 4.95412002202239e-06, "loss": 0.3031, "num_input_tokens_seen": 11630112, "step": 5400 }, { "epoch": 0.9919251238759406, "grad_norm": 4.3013386726379395, "learning_rate": 4.958708019820151e-06, "loss": 0.3622, "num_input_tokens_seen": 11641408, "step": 5405 }, { "epoch": 0.9928427234354927, "grad_norm": 5.486088752746582, "learning_rate": 4.963296017617912e-06, "loss": 0.2913, "num_input_tokens_seen": 11651872, "step": 5410 }, { "epoch": 0.9937603229950449, "grad_norm": 2.828300714492798, "learning_rate": 4.967884015415673e-06, "loss": 0.2505, "num_input_tokens_seen": 11662880, "step": 5415 }, { "epoch": 0.9946779225545972, "grad_norm": 14.02361011505127, "learning_rate": 4.972472013213434e-06, "loss": 0.3117, "num_input_tokens_seen": 11673600, "step": 5420 }, { "epoch": 0.9955955221141494, "grad_norm": 17.130794525146484, "learning_rate": 4.977060011011195e-06, "loss": 0.3085, "num_input_tokens_seen": 11683904, "step": 5425 }, { "epoch": 0.9965131216737017, "grad_norm": 49.900184631347656, "learning_rate": 4.981648008808957e-06, "loss": 0.4871, "num_input_tokens_seen": 11696384, "step": 5430 }, { "epoch": 0.9974307212332538, "grad_norm": 8.922218322753906, "learning_rate": 4.986236006606717e-06, "loss": 0.283, "num_input_tokens_seen": 11706240, "step": 5435 }, { "epoch": 0.998348320792806, "grad_norm": 19.24534034729004, "learning_rate": 4.9908240044044785e-06, "loss": 0.3032, "num_input_tokens_seen": 11715488, "step": 5440 }, { "epoch": 0.9992659203523583, "grad_norm": 3.2451908588409424, "learning_rate": 4.99541200220224e-06, "loss": 0.3257, "num_input_tokens_seen": 11727072, "step": 5445 }, { "epoch": 1.0001835199119105, "grad_norm": 29.187971115112305, "learning_rate": 5e-06, "loss": 0.4394, "num_input_tokens_seen": 11737024, "step": 5450 }, { "epoch": 1.0011011194714627, "grad_norm": 5.592909812927246, "learning_rate": 5.004587997797762e-06, "loss": 0.2451, "num_input_tokens_seen": 11746976, "step": 5455 }, { "epoch": 1.0020187190310148, "grad_norm": 7.111072540283203, "learning_rate": 5.009175995595522e-06, "loss": 0.4103, "num_input_tokens_seen": 11757856, "step": 5460 }, { "epoch": 1.002936318590567, "grad_norm": 8.611125946044922, "learning_rate": 5.013763993393284e-06, "loss": 0.3784, "num_input_tokens_seen": 11769632, "step": 5465 }, { "epoch": 1.0038539181501194, "grad_norm": 2.853651523590088, "learning_rate": 5.018351991191045e-06, "loss": 0.3661, "num_input_tokens_seen": 11779552, "step": 5470 }, { "epoch": 1.0047715177096714, "grad_norm": 5.052237033843994, "learning_rate": 5.0229399889888055e-06, "loss": 0.2907, "num_input_tokens_seen": 11790816, "step": 5475 }, { "epoch": 1.0056891172692237, "grad_norm": 3.426151990890503, "learning_rate": 5.027527986786567e-06, "loss": 0.3415, "num_input_tokens_seen": 11800896, "step": 5480 }, { "epoch": 1.006606716828776, "grad_norm": 26.429319381713867, "learning_rate": 5.032115984584328e-06, "loss": 0.3567, "num_input_tokens_seen": 11811776, "step": 5485 }, { "epoch": 1.007524316388328, "grad_norm": 2.5261754989624023, "learning_rate": 5.036703982382089e-06, "loss": 0.292, "num_input_tokens_seen": 11823040, "step": 5490 }, { "epoch": 1.0084419159478804, "grad_norm": 2.406127691268921, "learning_rate": 5.04129198017985e-06, "loss": 0.2455, "num_input_tokens_seen": 11834048, "step": 5495 }, { "epoch": 1.0093595155074326, "grad_norm": 8.601799964904785, "learning_rate": 5.0458799779776115e-06, "loss": 0.3455, "num_input_tokens_seen": 11845312, "step": 5500 }, { "epoch": 1.0102771150669847, "grad_norm": 3.123659372329712, "learning_rate": 5.050467975775372e-06, "loss": 0.3593, "num_input_tokens_seen": 11856352, "step": 5505 }, { "epoch": 1.011194714626537, "grad_norm": 30.673131942749023, "learning_rate": 5.055055973573133e-06, "loss": 0.3034, "num_input_tokens_seen": 11867328, "step": 5510 }, { "epoch": 1.0121123141860893, "grad_norm": 6.5138773918151855, "learning_rate": 5.059643971370895e-06, "loss": 0.2862, "num_input_tokens_seen": 11879136, "step": 5515 }, { "epoch": 1.0130299137456413, "grad_norm": 6.301662921905518, "learning_rate": 5.064231969168654e-06, "loss": 0.3998, "num_input_tokens_seen": 11890336, "step": 5520 }, { "epoch": 1.0139475133051936, "grad_norm": 2.117151975631714, "learning_rate": 5.068819966966417e-06, "loss": 0.2702, "num_input_tokens_seen": 11900928, "step": 5525 }, { "epoch": 1.014865112864746, "grad_norm": 29.281383514404297, "learning_rate": 5.073407964764178e-06, "loss": 0.4326, "num_input_tokens_seen": 11911744, "step": 5530 }, { "epoch": 1.015782712424298, "grad_norm": 3.142728567123413, "learning_rate": 5.077995962561938e-06, "loss": 0.3445, "num_input_tokens_seen": 11921728, "step": 5535 }, { "epoch": 1.0167003119838502, "grad_norm": 5.177005290985107, "learning_rate": 5.082583960359699e-06, "loss": 0.3057, "num_input_tokens_seen": 11932704, "step": 5540 }, { "epoch": 1.0176179115434025, "grad_norm": 16.2574520111084, "learning_rate": 5.087171958157461e-06, "loss": 0.4087, "num_input_tokens_seen": 11943456, "step": 5545 }, { "epoch": 1.0185355111029546, "grad_norm": 7.007924556732178, "learning_rate": 5.091759955955221e-06, "loss": 0.2842, "num_input_tokens_seen": 11954912, "step": 5550 }, { "epoch": 1.0194531106625069, "grad_norm": 3.1341147422790527, "learning_rate": 5.096347953752982e-06, "loss": 0.2754, "num_input_tokens_seen": 11966080, "step": 5555 }, { "epoch": 1.0203707102220592, "grad_norm": 7.054694175720215, "learning_rate": 5.100935951550744e-06, "loss": 0.3684, "num_input_tokens_seen": 11977600, "step": 5560 }, { "epoch": 1.0212883097816112, "grad_norm": 4.973776817321777, "learning_rate": 5.105523949348504e-06, "loss": 0.372, "num_input_tokens_seen": 11988512, "step": 5565 }, { "epoch": 1.0222059093411635, "grad_norm": 11.688465118408203, "learning_rate": 5.1101119471462655e-06, "loss": 0.3026, "num_input_tokens_seen": 11999456, "step": 5570 }, { "epoch": 1.0231235089007158, "grad_norm": 13.192741394042969, "learning_rate": 5.114699944944027e-06, "loss": 0.365, "num_input_tokens_seen": 12009952, "step": 5575 }, { "epoch": 1.0240411084602679, "grad_norm": 5.361562252044678, "learning_rate": 5.119287942741787e-06, "loss": 0.2932, "num_input_tokens_seen": 12019872, "step": 5580 }, { "epoch": 1.0249587080198201, "grad_norm": 3.8133604526519775, "learning_rate": 5.123875940539549e-06, "loss": 0.2438, "num_input_tokens_seen": 12030624, "step": 5585 }, { "epoch": 1.0258763075793724, "grad_norm": 6.984651565551758, "learning_rate": 5.12846393833731e-06, "loss": 0.2253, "num_input_tokens_seen": 12042080, "step": 5590 }, { "epoch": 1.0267939071389245, "grad_norm": 2.7717678546905518, "learning_rate": 5.133051936135071e-06, "loss": 0.313, "num_input_tokens_seen": 12053696, "step": 5595 }, { "epoch": 1.0277115066984768, "grad_norm": 17.193817138671875, "learning_rate": 5.137639933932832e-06, "loss": 0.3317, "num_input_tokens_seen": 12063840, "step": 5600 }, { "epoch": 1.028629106258029, "grad_norm": 5.114714622497559, "learning_rate": 5.142227931730593e-06, "loss": 0.2746, "num_input_tokens_seen": 12075008, "step": 5605 }, { "epoch": 1.0295467058175811, "grad_norm": 4.6169819831848145, "learning_rate": 5.146815929528354e-06, "loss": 0.5228, "num_input_tokens_seen": 12086464, "step": 5610 }, { "epoch": 1.0304643053771334, "grad_norm": 2.4613516330718994, "learning_rate": 5.151403927326115e-06, "loss": 0.2408, "num_input_tokens_seen": 12098336, "step": 5615 }, { "epoch": 1.0313819049366857, "grad_norm": 2.7979955673217773, "learning_rate": 5.1559919251238765e-06, "loss": 0.3599, "num_input_tokens_seen": 12108992, "step": 5620 }, { "epoch": 1.0322995044962378, "grad_norm": 10.734068870544434, "learning_rate": 5.160579922921637e-06, "loss": 0.2727, "num_input_tokens_seen": 12119808, "step": 5625 }, { "epoch": 1.03321710405579, "grad_norm": 6.122023105621338, "learning_rate": 5.165167920719398e-06, "loss": 0.3854, "num_input_tokens_seen": 12129792, "step": 5630 }, { "epoch": 1.0341347036153423, "grad_norm": 17.44521713256836, "learning_rate": 5.16975591851716e-06, "loss": 0.4909, "num_input_tokens_seen": 12141344, "step": 5635 }, { "epoch": 1.0350523031748944, "grad_norm": 4.599830150604248, "learning_rate": 5.17434391631492e-06, "loss": 0.2609, "num_input_tokens_seen": 12152384, "step": 5640 }, { "epoch": 1.0359699027344467, "grad_norm": 12.372787475585938, "learning_rate": 5.178931914112682e-06, "loss": 0.3167, "num_input_tokens_seen": 12162240, "step": 5645 }, { "epoch": 1.036887502293999, "grad_norm": 17.727914810180664, "learning_rate": 5.183519911910443e-06, "loss": 0.3558, "num_input_tokens_seen": 12172288, "step": 5650 }, { "epoch": 1.037805101853551, "grad_norm": 2.775463819503784, "learning_rate": 5.1881079097082035e-06, "loss": 0.3539, "num_input_tokens_seen": 12183520, "step": 5655 }, { "epoch": 1.0387227014131033, "grad_norm": 3.9430923461914062, "learning_rate": 5.192695907505965e-06, "loss": 0.3452, "num_input_tokens_seen": 12194560, "step": 5660 }, { "epoch": 1.0396403009726556, "grad_norm": 6.301229953765869, "learning_rate": 5.197283905303726e-06, "loss": 0.3243, "num_input_tokens_seen": 12206400, "step": 5665 }, { "epoch": 1.0405579005322076, "grad_norm": 3.3710803985595703, "learning_rate": 5.201871903101488e-06, "loss": 0.4166, "num_input_tokens_seen": 12217152, "step": 5670 }, { "epoch": 1.04147550009176, "grad_norm": 2.9834787845611572, "learning_rate": 5.206459900899248e-06, "loss": 0.35, "num_input_tokens_seen": 12227744, "step": 5675 }, { "epoch": 1.0423930996513122, "grad_norm": 2.985922336578369, "learning_rate": 5.2110478986970095e-06, "loss": 0.3342, "num_input_tokens_seen": 12237760, "step": 5680 }, { "epoch": 1.0433106992108643, "grad_norm": 4.852530002593994, "learning_rate": 5.215635896494771e-06, "loss": 0.3064, "num_input_tokens_seen": 12248640, "step": 5685 }, { "epoch": 1.0442282987704166, "grad_norm": 4.197665214538574, "learning_rate": 5.220223894292531e-06, "loss": 0.3089, "num_input_tokens_seen": 12259616, "step": 5690 }, { "epoch": 1.0451458983299688, "grad_norm": 2.5671699047088623, "learning_rate": 5.224811892090293e-06, "loss": 0.3289, "num_input_tokens_seen": 12271424, "step": 5695 }, { "epoch": 1.046063497889521, "grad_norm": 5.083977699279785, "learning_rate": 5.229399889888054e-06, "loss": 0.3693, "num_input_tokens_seen": 12282048, "step": 5700 }, { "epoch": 1.0469810974490732, "grad_norm": 5.454805850982666, "learning_rate": 5.233987887685814e-06, "loss": 0.2495, "num_input_tokens_seen": 12292640, "step": 5705 }, { "epoch": 1.0478986970086255, "grad_norm": 27.347036361694336, "learning_rate": 5.238575885483575e-06, "loss": 0.4372, "num_input_tokens_seen": 12302784, "step": 5710 }, { "epoch": 1.0488162965681775, "grad_norm": 21.786956787109375, "learning_rate": 5.243163883281337e-06, "loss": 0.4351, "num_input_tokens_seen": 12312960, "step": 5715 }, { "epoch": 1.0497338961277298, "grad_norm": 2.4698781967163086, "learning_rate": 5.247751881079097e-06, "loss": 0.2748, "num_input_tokens_seen": 12323648, "step": 5720 }, { "epoch": 1.050651495687282, "grad_norm": 10.238044738769531, "learning_rate": 5.252339878876858e-06, "loss": 0.3247, "num_input_tokens_seen": 12335616, "step": 5725 }, { "epoch": 1.0515690952468342, "grad_norm": 4.656627655029297, "learning_rate": 5.25692787667462e-06, "loss": 0.3662, "num_input_tokens_seen": 12346560, "step": 5730 }, { "epoch": 1.0524866948063865, "grad_norm": 4.462958335876465, "learning_rate": 5.26151587447238e-06, "loss": 0.3879, "num_input_tokens_seen": 12357856, "step": 5735 }, { "epoch": 1.0534042943659387, "grad_norm": 7.114033222198486, "learning_rate": 5.266103872270142e-06, "loss": 0.4096, "num_input_tokens_seen": 12369120, "step": 5740 }, { "epoch": 1.0543218939254908, "grad_norm": 8.648015022277832, "learning_rate": 5.270691870067903e-06, "loss": 0.3216, "num_input_tokens_seen": 12379232, "step": 5745 }, { "epoch": 1.055239493485043, "grad_norm": 6.568331241607666, "learning_rate": 5.2752798678656635e-06, "loss": 0.3352, "num_input_tokens_seen": 12390944, "step": 5750 }, { "epoch": 1.0561570930445954, "grad_norm": 2.7866361141204834, "learning_rate": 5.279867865663425e-06, "loss": 0.2791, "num_input_tokens_seen": 12401280, "step": 5755 }, { "epoch": 1.0570746926041474, "grad_norm": 3.357417345046997, "learning_rate": 5.284455863461186e-06, "loss": 0.3434, "num_input_tokens_seen": 12411296, "step": 5760 }, { "epoch": 1.0579922921636997, "grad_norm": 4.831390380859375, "learning_rate": 5.289043861258947e-06, "loss": 0.3375, "num_input_tokens_seen": 12422560, "step": 5765 }, { "epoch": 1.058909891723252, "grad_norm": 3.4121463298797607, "learning_rate": 5.293631859056708e-06, "loss": 0.3812, "num_input_tokens_seen": 12432928, "step": 5770 }, { "epoch": 1.0598274912828043, "grad_norm": 5.54572868347168, "learning_rate": 5.2982198568544694e-06, "loss": 0.2835, "num_input_tokens_seen": 12444384, "step": 5775 }, { "epoch": 1.0607450908423564, "grad_norm": 6.302184581756592, "learning_rate": 5.30280785465223e-06, "loss": 0.3744, "num_input_tokens_seen": 12455392, "step": 5780 }, { "epoch": 1.0616626904019086, "grad_norm": 4.317813873291016, "learning_rate": 5.307395852449991e-06, "loss": 0.3542, "num_input_tokens_seen": 12466272, "step": 5785 }, { "epoch": 1.062580289961461, "grad_norm": 6.536075115203857, "learning_rate": 5.311983850247753e-06, "loss": 0.322, "num_input_tokens_seen": 12477056, "step": 5790 }, { "epoch": 1.063497889521013, "grad_norm": 5.2962260246276855, "learning_rate": 5.316571848045513e-06, "loss": 0.3213, "num_input_tokens_seen": 12488288, "step": 5795 }, { "epoch": 1.0644154890805653, "grad_norm": 7.307922840118408, "learning_rate": 5.3211598458432746e-06, "loss": 0.2724, "num_input_tokens_seen": 12499808, "step": 5800 }, { "epoch": 1.0653330886401176, "grad_norm": 2.7740583419799805, "learning_rate": 5.325747843641036e-06, "loss": 0.2923, "num_input_tokens_seen": 12511392, "step": 5805 }, { "epoch": 1.0662506881996696, "grad_norm": 4.847443103790283, "learning_rate": 5.3303358414387964e-06, "loss": 0.2265, "num_input_tokens_seen": 12522208, "step": 5810 }, { "epoch": 1.067168287759222, "grad_norm": 8.464733123779297, "learning_rate": 5.334923839236558e-06, "loss": 0.324, "num_input_tokens_seen": 12532896, "step": 5815 }, { "epoch": 1.0680858873187742, "grad_norm": 8.884017944335938, "learning_rate": 5.339511837034319e-06, "loss": 0.4371, "num_input_tokens_seen": 12544064, "step": 5820 }, { "epoch": 1.0690034868783262, "grad_norm": 5.258182525634766, "learning_rate": 5.34409983483208e-06, "loss": 0.2882, "num_input_tokens_seen": 12552864, "step": 5825 }, { "epoch": 1.0699210864378785, "grad_norm": 8.320103645324707, "learning_rate": 5.348687832629841e-06, "loss": 0.3826, "num_input_tokens_seen": 12563168, "step": 5830 }, { "epoch": 1.0708386859974308, "grad_norm": 4.57405948638916, "learning_rate": 5.353275830427602e-06, "loss": 0.2655, "num_input_tokens_seen": 12574304, "step": 5835 }, { "epoch": 1.0717562855569829, "grad_norm": 4.87999153137207, "learning_rate": 5.357863828225363e-06, "loss": 0.3094, "num_input_tokens_seen": 12586016, "step": 5840 }, { "epoch": 1.0726738851165352, "grad_norm": 3.363180637359619, "learning_rate": 5.362451826023124e-06, "loss": 0.4218, "num_input_tokens_seen": 12595488, "step": 5845 }, { "epoch": 1.0735914846760874, "grad_norm": 5.7538161277771, "learning_rate": 5.367039823820886e-06, "loss": 0.3387, "num_input_tokens_seen": 12607296, "step": 5850 }, { "epoch": 1.0745090842356395, "grad_norm": 4.484171390533447, "learning_rate": 5.371627821618645e-06, "loss": 0.3099, "num_input_tokens_seen": 12618336, "step": 5855 }, { "epoch": 1.0754266837951918, "grad_norm": 2.9161148071289062, "learning_rate": 5.3762158194164075e-06, "loss": 0.3518, "num_input_tokens_seen": 12628928, "step": 5860 }, { "epoch": 1.076344283354744, "grad_norm": 2.673123359680176, "learning_rate": 5.380803817214169e-06, "loss": 0.2845, "num_input_tokens_seen": 12639680, "step": 5865 }, { "epoch": 1.0772618829142961, "grad_norm": 3.5084400177001953, "learning_rate": 5.3853918150119285e-06, "loss": 0.4473, "num_input_tokens_seen": 12650752, "step": 5870 }, { "epoch": 1.0781794824738484, "grad_norm": 3.1748769283294678, "learning_rate": 5.38997981280969e-06, "loss": 0.285, "num_input_tokens_seen": 12660480, "step": 5875 }, { "epoch": 1.0790970820334007, "grad_norm": 7.568345069885254, "learning_rate": 5.394567810607452e-06, "loss": 0.389, "num_input_tokens_seen": 12671424, "step": 5880 }, { "epoch": 1.0800146815929528, "grad_norm": 3.3993632793426514, "learning_rate": 5.399155808405212e-06, "loss": 0.36, "num_input_tokens_seen": 12682592, "step": 5885 }, { "epoch": 1.080932281152505, "grad_norm": 3.273611068725586, "learning_rate": 5.403743806202973e-06, "loss": 0.37, "num_input_tokens_seen": 12693120, "step": 5890 }, { "epoch": 1.0818498807120573, "grad_norm": 4.38447380065918, "learning_rate": 5.4083318040007345e-06, "loss": 0.2822, "num_input_tokens_seen": 12702880, "step": 5895 }, { "epoch": 1.0827674802716094, "grad_norm": 2.3666045665740967, "learning_rate": 5.412919801798495e-06, "loss": 0.301, "num_input_tokens_seen": 12714336, "step": 5900 }, { "epoch": 1.0836850798311617, "grad_norm": 9.014426231384277, "learning_rate": 5.417507799596256e-06, "loss": 0.423, "num_input_tokens_seen": 12723392, "step": 5905 }, { "epoch": 1.084602679390714, "grad_norm": 2.459977626800537, "learning_rate": 5.422095797394018e-06, "loss": 0.3511, "num_input_tokens_seen": 12734880, "step": 5910 }, { "epoch": 1.085520278950266, "grad_norm": 2.9977283477783203, "learning_rate": 5.426683795191778e-06, "loss": 0.2437, "num_input_tokens_seen": 12746432, "step": 5915 }, { "epoch": 1.0864378785098183, "grad_norm": 3.8377833366394043, "learning_rate": 5.43127179298954e-06, "loss": 0.396, "num_input_tokens_seen": 12758144, "step": 5920 }, { "epoch": 1.0873554780693706, "grad_norm": 19.198686599731445, "learning_rate": 5.435859790787301e-06, "loss": 0.4088, "num_input_tokens_seen": 12769248, "step": 5925 }, { "epoch": 1.0882730776289227, "grad_norm": 2.3650705814361572, "learning_rate": 5.4404477885850615e-06, "loss": 0.303, "num_input_tokens_seen": 12780128, "step": 5930 }, { "epoch": 1.089190677188475, "grad_norm": 14.670632362365723, "learning_rate": 5.445035786382823e-06, "loss": 0.3612, "num_input_tokens_seen": 12791104, "step": 5935 }, { "epoch": 1.0901082767480272, "grad_norm": 3.547974109649658, "learning_rate": 5.449623784180584e-06, "loss": 0.3075, "num_input_tokens_seen": 12801824, "step": 5940 }, { "epoch": 1.0910258763075793, "grad_norm": 4.759642601013184, "learning_rate": 5.454211781978345e-06, "loss": 0.3945, "num_input_tokens_seen": 12813120, "step": 5945 }, { "epoch": 1.0919434758671316, "grad_norm": 2.0287413597106934, "learning_rate": 5.458799779776106e-06, "loss": 0.3567, "num_input_tokens_seen": 12825088, "step": 5950 }, { "epoch": 1.0928610754266839, "grad_norm": 7.103523254394531, "learning_rate": 5.4633877775738675e-06, "loss": 0.3703, "num_input_tokens_seen": 12836768, "step": 5955 }, { "epoch": 1.093778674986236, "grad_norm": 3.961149215698242, "learning_rate": 5.467975775371628e-06, "loss": 0.2842, "num_input_tokens_seen": 12846880, "step": 5960 }, { "epoch": 1.0946962745457882, "grad_norm": 2.518381118774414, "learning_rate": 5.472563773169389e-06, "loss": 0.2843, "num_input_tokens_seen": 12857184, "step": 5965 }, { "epoch": 1.0956138741053405, "grad_norm": 4.991613864898682, "learning_rate": 5.477151770967151e-06, "loss": 0.3335, "num_input_tokens_seen": 12867904, "step": 5970 }, { "epoch": 1.0965314736648926, "grad_norm": 4.942618370056152, "learning_rate": 5.481739768764911e-06, "loss": 0.3304, "num_input_tokens_seen": 12878720, "step": 5975 }, { "epoch": 1.0974490732244448, "grad_norm": 19.639530181884766, "learning_rate": 5.4863277665626726e-06, "loss": 0.4095, "num_input_tokens_seen": 12888832, "step": 5980 }, { "epoch": 1.0983666727839971, "grad_norm": 17.329513549804688, "learning_rate": 5.490915764360434e-06, "loss": 0.4068, "num_input_tokens_seen": 12901152, "step": 5985 }, { "epoch": 1.0992842723435492, "grad_norm": 3.9402225017547607, "learning_rate": 5.4955037621581945e-06, "loss": 0.329, "num_input_tokens_seen": 12911296, "step": 5990 }, { "epoch": 1.1002018719031015, "grad_norm": 7.507729530334473, "learning_rate": 5.500091759955956e-06, "loss": 0.433, "num_input_tokens_seen": 12922336, "step": 5995 }, { "epoch": 1.1011194714626538, "grad_norm": 3.405222177505493, "learning_rate": 5.504679757753717e-06, "loss": 0.3125, "num_input_tokens_seen": 12932736, "step": 6000 }, { "epoch": 1.1020370710222058, "grad_norm": 8.396830558776855, "learning_rate": 5.509267755551478e-06, "loss": 0.3037, "num_input_tokens_seen": 12942528, "step": 6005 }, { "epoch": 1.1029546705817581, "grad_norm": 14.61989688873291, "learning_rate": 5.513855753349239e-06, "loss": 0.3389, "num_input_tokens_seen": 12953600, "step": 6010 }, { "epoch": 1.1038722701413104, "grad_norm": 4.6982221603393555, "learning_rate": 5.518443751147e-06, "loss": 0.3515, "num_input_tokens_seen": 12965664, "step": 6015 }, { "epoch": 1.1047898697008625, "grad_norm": 9.11826229095459, "learning_rate": 5.52303174894476e-06, "loss": 0.2804, "num_input_tokens_seen": 12977024, "step": 6020 }, { "epoch": 1.1057074692604147, "grad_norm": 6.702033519744873, "learning_rate": 5.527619746742522e-06, "loss": 0.3714, "num_input_tokens_seen": 12986496, "step": 6025 }, { "epoch": 1.106625068819967, "grad_norm": 4.391521453857422, "learning_rate": 5.532207744540284e-06, "loss": 0.2977, "num_input_tokens_seen": 12997664, "step": 6030 }, { "epoch": 1.107542668379519, "grad_norm": 8.89072322845459, "learning_rate": 5.536795742338043e-06, "loss": 0.2948, "num_input_tokens_seen": 13009056, "step": 6035 }, { "epoch": 1.1084602679390714, "grad_norm": 19.819122314453125, "learning_rate": 5.541383740135805e-06, "loss": 0.4074, "num_input_tokens_seen": 13019104, "step": 6040 }, { "epoch": 1.1093778674986237, "grad_norm": 22.459911346435547, "learning_rate": 5.545971737933567e-06, "loss": 0.3374, "num_input_tokens_seen": 13030176, "step": 6045 }, { "epoch": 1.1102954670581757, "grad_norm": 1.8010332584381104, "learning_rate": 5.5505597357313266e-06, "loss": 0.3001, "num_input_tokens_seen": 13040800, "step": 6050 }, { "epoch": 1.111213066617728, "grad_norm": 2.422367811203003, "learning_rate": 5.555147733529088e-06, "loss": 0.3102, "num_input_tokens_seen": 13051968, "step": 6055 }, { "epoch": 1.1121306661772803, "grad_norm": 22.356958389282227, "learning_rate": 5.559735731326849e-06, "loss": 0.336, "num_input_tokens_seen": 13062464, "step": 6060 }, { "epoch": 1.1130482657368324, "grad_norm": 2.323014736175537, "learning_rate": 5.56432372912461e-06, "loss": 0.3203, "num_input_tokens_seen": 13073472, "step": 6065 }, { "epoch": 1.1139658652963846, "grad_norm": 4.186243534088135, "learning_rate": 5.568911726922371e-06, "loss": 0.3102, "num_input_tokens_seen": 13083968, "step": 6070 }, { "epoch": 1.114883464855937, "grad_norm": 3.8119254112243652, "learning_rate": 5.5734997247201325e-06, "loss": 0.3153, "num_input_tokens_seen": 13095680, "step": 6075 }, { "epoch": 1.115801064415489, "grad_norm": 8.366226196289062, "learning_rate": 5.578087722517893e-06, "loss": 0.4197, "num_input_tokens_seen": 13106688, "step": 6080 }, { "epoch": 1.1167186639750413, "grad_norm": 2.052983045578003, "learning_rate": 5.582675720315654e-06, "loss": 0.2726, "num_input_tokens_seen": 13117248, "step": 6085 }, { "epoch": 1.1176362635345936, "grad_norm": 16.64936065673828, "learning_rate": 5.587263718113416e-06, "loss": 0.3199, "num_input_tokens_seen": 13128928, "step": 6090 }, { "epoch": 1.1185538630941456, "grad_norm": 1.666856288909912, "learning_rate": 5.591851715911177e-06, "loss": 0.2477, "num_input_tokens_seen": 13140384, "step": 6095 }, { "epoch": 1.119471462653698, "grad_norm": 25.926523208618164, "learning_rate": 5.596439713708938e-06, "loss": 0.3881, "num_input_tokens_seen": 13151808, "step": 6100 }, { "epoch": 1.1203890622132502, "grad_norm": 25.221073150634766, "learning_rate": 5.601027711506699e-06, "loss": 0.4001, "num_input_tokens_seen": 13162432, "step": 6105 }, { "epoch": 1.1213066617728022, "grad_norm": 2.054042100906372, "learning_rate": 5.60561570930446e-06, "loss": 0.3442, "num_input_tokens_seen": 13173568, "step": 6110 }, { "epoch": 1.1222242613323545, "grad_norm": 22.215469360351562, "learning_rate": 5.610203707102221e-06, "loss": 0.3066, "num_input_tokens_seen": 13185536, "step": 6115 }, { "epoch": 1.1231418608919068, "grad_norm": 3.6190152168273926, "learning_rate": 5.614791704899982e-06, "loss": 0.408, "num_input_tokens_seen": 13197120, "step": 6120 }, { "epoch": 1.1240594604514589, "grad_norm": 2.698239803314209, "learning_rate": 5.619379702697744e-06, "loss": 0.3647, "num_input_tokens_seen": 13207168, "step": 6125 }, { "epoch": 1.1249770600110112, "grad_norm": 2.661078453063965, "learning_rate": 5.623967700495504e-06, "loss": 0.3404, "num_input_tokens_seen": 13216960, "step": 6130 }, { "epoch": 1.1258946595705634, "grad_norm": 1.752899408340454, "learning_rate": 5.6285556982932655e-06, "loss": 0.3055, "num_input_tokens_seen": 13227968, "step": 6135 }, { "epoch": 1.1268122591301157, "grad_norm": 1.8465532064437866, "learning_rate": 5.633143696091027e-06, "loss": 0.3359, "num_input_tokens_seen": 13238368, "step": 6140 }, { "epoch": 1.1277298586896678, "grad_norm": 2.5217416286468506, "learning_rate": 5.637731693888787e-06, "loss": 0.343, "num_input_tokens_seen": 13248992, "step": 6145 }, { "epoch": 1.12864745824922, "grad_norm": 4.018441200256348, "learning_rate": 5.642319691686549e-06, "loss": 0.3982, "num_input_tokens_seen": 13258560, "step": 6150 }, { "epoch": 1.1295650578087724, "grad_norm": 4.83346700668335, "learning_rate": 5.64690768948431e-06, "loss": 0.3164, "num_input_tokens_seen": 13268416, "step": 6155 }, { "epoch": 1.1304826573683244, "grad_norm": 3.7810614109039307, "learning_rate": 5.651495687282071e-06, "loss": 0.3365, "num_input_tokens_seen": 13279008, "step": 6160 }, { "epoch": 1.1314002569278767, "grad_norm": 3.9695842266082764, "learning_rate": 5.656083685079832e-06, "loss": 0.2632, "num_input_tokens_seen": 13290624, "step": 6165 }, { "epoch": 1.132317856487429, "grad_norm": 4.048430919647217, "learning_rate": 5.660671682877593e-06, "loss": 0.3536, "num_input_tokens_seen": 13301664, "step": 6170 }, { "epoch": 1.133235456046981, "grad_norm": 5.7887349128723145, "learning_rate": 5.665259680675354e-06, "loss": 0.3083, "num_input_tokens_seen": 13312096, "step": 6175 }, { "epoch": 1.1341530556065333, "grad_norm": 1.9298937320709229, "learning_rate": 5.669847678473115e-06, "loss": 0.3465, "num_input_tokens_seen": 13322912, "step": 6180 }, { "epoch": 1.1350706551660856, "grad_norm": 16.1289119720459, "learning_rate": 5.6744356762708766e-06, "loss": 0.3601, "num_input_tokens_seen": 13333632, "step": 6185 }, { "epoch": 1.1359882547256377, "grad_norm": 6.434851169586182, "learning_rate": 5.679023674068636e-06, "loss": 0.3514, "num_input_tokens_seen": 13345248, "step": 6190 }, { "epoch": 1.13690585428519, "grad_norm": 1.7961586713790894, "learning_rate": 5.6836116718663984e-06, "loss": 0.3215, "num_input_tokens_seen": 13356288, "step": 6195 }, { "epoch": 1.1378234538447423, "grad_norm": 3.952150821685791, "learning_rate": 5.68819966966416e-06, "loss": 0.2745, "num_input_tokens_seen": 13367776, "step": 6200 }, { "epoch": 1.1387410534042943, "grad_norm": 4.300410747528076, "learning_rate": 5.6927876674619195e-06, "loss": 0.2934, "num_input_tokens_seen": 13378720, "step": 6205 }, { "epoch": 1.1396586529638466, "grad_norm": 1.6620572805404663, "learning_rate": 5.697375665259681e-06, "loss": 0.3016, "num_input_tokens_seen": 13390112, "step": 6210 }, { "epoch": 1.140576252523399, "grad_norm": 6.750332355499268, "learning_rate": 5.701963663057443e-06, "loss": 0.2356, "num_input_tokens_seen": 13401184, "step": 6215 }, { "epoch": 1.141493852082951, "grad_norm": 5.3270416259765625, "learning_rate": 5.706551660855203e-06, "loss": 0.4246, "num_input_tokens_seen": 13412544, "step": 6220 }, { "epoch": 1.1424114516425032, "grad_norm": 7.243684768676758, "learning_rate": 5.711139658652964e-06, "loss": 0.3092, "num_input_tokens_seen": 13421824, "step": 6225 }, { "epoch": 1.1433290512020555, "grad_norm": 3.586557388305664, "learning_rate": 5.715727656450725e-06, "loss": 0.3941, "num_input_tokens_seen": 13431808, "step": 6230 }, { "epoch": 1.1442466507616076, "grad_norm": 5.132724285125732, "learning_rate": 5.720315654248486e-06, "loss": 0.4449, "num_input_tokens_seen": 13443008, "step": 6235 }, { "epoch": 1.1451642503211599, "grad_norm": 19.677343368530273, "learning_rate": 5.724903652046247e-06, "loss": 0.4648, "num_input_tokens_seen": 13452736, "step": 6240 }, { "epoch": 1.1460818498807122, "grad_norm": 5.495584487915039, "learning_rate": 5.729491649844009e-06, "loss": 0.3703, "num_input_tokens_seen": 13463744, "step": 6245 }, { "epoch": 1.1469994494402642, "grad_norm": 4.67022705078125, "learning_rate": 5.734079647641769e-06, "loss": 0.3381, "num_input_tokens_seen": 13474656, "step": 6250 }, { "epoch": 1.1479170489998165, "grad_norm": 10.171172142028809, "learning_rate": 5.7386676454395305e-06, "loss": 0.3213, "num_input_tokens_seen": 13484736, "step": 6255 }, { "epoch": 1.1488346485593688, "grad_norm": 14.29875373840332, "learning_rate": 5.743255643237292e-06, "loss": 0.3524, "num_input_tokens_seen": 13496000, "step": 6260 }, { "epoch": 1.1497522481189208, "grad_norm": 14.582406997680664, "learning_rate": 5.747843641035052e-06, "loss": 0.3007, "num_input_tokens_seen": 13505792, "step": 6265 }, { "epoch": 1.1506698476784731, "grad_norm": 6.7383928298950195, "learning_rate": 5.752431638832814e-06, "loss": 0.4188, "num_input_tokens_seen": 13514976, "step": 6270 }, { "epoch": 1.1515874472380254, "grad_norm": 5.67682409286499, "learning_rate": 5.757019636630575e-06, "loss": 0.3199, "num_input_tokens_seen": 13526272, "step": 6275 }, { "epoch": 1.1525050467975775, "grad_norm": 5.76076602935791, "learning_rate": 5.761607634428336e-06, "loss": 0.3039, "num_input_tokens_seen": 13537696, "step": 6280 }, { "epoch": 1.1534226463571298, "grad_norm": 13.594922065734863, "learning_rate": 5.766195632226097e-06, "loss": 0.3745, "num_input_tokens_seen": 13547680, "step": 6285 }, { "epoch": 1.154340245916682, "grad_norm": 4.711610317230225, "learning_rate": 5.770783630023858e-06, "loss": 0.3643, "num_input_tokens_seen": 13558304, "step": 6290 }, { "epoch": 1.1552578454762341, "grad_norm": 2.554295539855957, "learning_rate": 5.775371627821619e-06, "loss": 0.2934, "num_input_tokens_seen": 13571104, "step": 6295 }, { "epoch": 1.1561754450357864, "grad_norm": 1.9490880966186523, "learning_rate": 5.77995962561938e-06, "loss": 0.3297, "num_input_tokens_seen": 13581216, "step": 6300 }, { "epoch": 1.1570930445953387, "grad_norm": 3.2562034130096436, "learning_rate": 5.784547623417142e-06, "loss": 0.2719, "num_input_tokens_seen": 13591776, "step": 6305 }, { "epoch": 1.1580106441548907, "grad_norm": 3.7847554683685303, "learning_rate": 5.789135621214902e-06, "loss": 0.3462, "num_input_tokens_seen": 13603744, "step": 6310 }, { "epoch": 1.158928243714443, "grad_norm": 5.102985382080078, "learning_rate": 5.7937236190126635e-06, "loss": 0.3607, "num_input_tokens_seen": 13614304, "step": 6315 }, { "epoch": 1.1598458432739953, "grad_norm": 11.821611404418945, "learning_rate": 5.798311616810425e-06, "loss": 0.4144, "num_input_tokens_seen": 13623936, "step": 6320 }, { "epoch": 1.1607634428335474, "grad_norm": 4.535499572753906, "learning_rate": 5.802899614608185e-06, "loss": 0.2615, "num_input_tokens_seen": 13635136, "step": 6325 }, { "epoch": 1.1616810423930997, "grad_norm": 4.787611484527588, "learning_rate": 5.807487612405947e-06, "loss": 0.4525, "num_input_tokens_seen": 13646528, "step": 6330 }, { "epoch": 1.162598641952652, "grad_norm": 2.838538885116577, "learning_rate": 5.812075610203708e-06, "loss": 0.3604, "num_input_tokens_seen": 13656992, "step": 6335 }, { "epoch": 1.163516241512204, "grad_norm": 4.783871173858643, "learning_rate": 5.816663608001469e-06, "loss": 0.2922, "num_input_tokens_seen": 13667840, "step": 6340 }, { "epoch": 1.1644338410717563, "grad_norm": 1.7015165090560913, "learning_rate": 5.82125160579923e-06, "loss": 0.3305, "num_input_tokens_seen": 13678496, "step": 6345 }, { "epoch": 1.1653514406313086, "grad_norm": 11.905956268310547, "learning_rate": 5.825839603596991e-06, "loss": 0.3775, "num_input_tokens_seen": 13689888, "step": 6350 }, { "epoch": 1.1662690401908606, "grad_norm": 11.10606575012207, "learning_rate": 5.830427601394751e-06, "loss": 0.4108, "num_input_tokens_seen": 13700928, "step": 6355 }, { "epoch": 1.167186639750413, "grad_norm": 9.246686935424805, "learning_rate": 5.835015599192513e-06, "loss": 0.3173, "num_input_tokens_seen": 13712192, "step": 6360 }, { "epoch": 1.1681042393099652, "grad_norm": 4.96632719039917, "learning_rate": 5.8396035969902746e-06, "loss": 0.4376, "num_input_tokens_seen": 13723232, "step": 6365 }, { "epoch": 1.1690218388695173, "grad_norm": 2.3954505920410156, "learning_rate": 5.844191594788034e-06, "loss": 0.3498, "num_input_tokens_seen": 13734144, "step": 6370 }, { "epoch": 1.1699394384290696, "grad_norm": 6.854695796966553, "learning_rate": 5.848779592585796e-06, "loss": 0.3085, "num_input_tokens_seen": 13744800, "step": 6375 }, { "epoch": 1.1708570379886218, "grad_norm": 2.0103085041046143, "learning_rate": 5.853367590383558e-06, "loss": 0.3194, "num_input_tokens_seen": 13755840, "step": 6380 }, { "epoch": 1.171774637548174, "grad_norm": 2.7754604816436768, "learning_rate": 5.8579555881813175e-06, "loss": 0.3277, "num_input_tokens_seen": 13766688, "step": 6385 }, { "epoch": 1.1726922371077262, "grad_norm": 9.702460289001465, "learning_rate": 5.862543585979079e-06, "loss": 0.2878, "num_input_tokens_seen": 13776480, "step": 6390 }, { "epoch": 1.1736098366672785, "grad_norm": 2.306180238723755, "learning_rate": 5.86713158377684e-06, "loss": 0.2972, "num_input_tokens_seen": 13787360, "step": 6395 }, { "epoch": 1.1745274362268305, "grad_norm": 9.322551727294922, "learning_rate": 5.871719581574601e-06, "loss": 0.2514, "num_input_tokens_seen": 13798464, "step": 6400 }, { "epoch": 1.1754450357863828, "grad_norm": 8.094287872314453, "learning_rate": 5.876307579372362e-06, "loss": 0.2909, "num_input_tokens_seen": 13808672, "step": 6405 }, { "epoch": 1.176362635345935, "grad_norm": 21.138427734375, "learning_rate": 5.8808955771701234e-06, "loss": 0.4451, "num_input_tokens_seen": 13819552, "step": 6410 }, { "epoch": 1.1772802349054872, "grad_norm": 78.48795318603516, "learning_rate": 5.885483574967884e-06, "loss": 0.4563, "num_input_tokens_seen": 13831040, "step": 6415 }, { "epoch": 1.1781978344650395, "grad_norm": 6.842347621917725, "learning_rate": 5.890071572765645e-06, "loss": 0.3583, "num_input_tokens_seen": 13842432, "step": 6420 }, { "epoch": 1.1791154340245917, "grad_norm": 7.757870674133301, "learning_rate": 5.894659570563407e-06, "loss": 0.3418, "num_input_tokens_seen": 13852896, "step": 6425 }, { "epoch": 1.1800330335841438, "grad_norm": 8.1692476272583, "learning_rate": 5.899247568361167e-06, "loss": 0.2848, "num_input_tokens_seen": 13863648, "step": 6430 }, { "epoch": 1.180950633143696, "grad_norm": 14.765419006347656, "learning_rate": 5.9038355661589285e-06, "loss": 0.3471, "num_input_tokens_seen": 13875072, "step": 6435 }, { "epoch": 1.1818682327032484, "grad_norm": 14.137579917907715, "learning_rate": 5.90842356395669e-06, "loss": 0.4103, "num_input_tokens_seen": 13886400, "step": 6440 }, { "epoch": 1.1827858322628004, "grad_norm": 2.028891086578369, "learning_rate": 5.9130115617544504e-06, "loss": 0.3252, "num_input_tokens_seen": 13897408, "step": 6445 }, { "epoch": 1.1837034318223527, "grad_norm": 5.8520708084106445, "learning_rate": 5.917599559552212e-06, "loss": 0.2778, "num_input_tokens_seen": 13907936, "step": 6450 }, { "epoch": 1.184621031381905, "grad_norm": 2.743873119354248, "learning_rate": 5.922187557349973e-06, "loss": 0.3847, "num_input_tokens_seen": 13918464, "step": 6455 }, { "epoch": 1.185538630941457, "grad_norm": 1.708422303199768, "learning_rate": 5.926775555147734e-06, "loss": 0.2711, "num_input_tokens_seen": 13930048, "step": 6460 }, { "epoch": 1.1864562305010093, "grad_norm": 11.414994239807129, "learning_rate": 5.931363552945495e-06, "loss": 0.3146, "num_input_tokens_seen": 13940320, "step": 6465 }, { "epoch": 1.1873738300605616, "grad_norm": 2.992233991622925, "learning_rate": 5.935951550743256e-06, "loss": 0.2781, "num_input_tokens_seen": 13950656, "step": 6470 }, { "epoch": 1.1882914296201137, "grad_norm": 2.534202814102173, "learning_rate": 5.940539548541017e-06, "loss": 0.3875, "num_input_tokens_seen": 13960480, "step": 6475 }, { "epoch": 1.189209029179666, "grad_norm": 2.630742311477661, "learning_rate": 5.945127546338778e-06, "loss": 0.3689, "num_input_tokens_seen": 13971008, "step": 6480 }, { "epoch": 1.1901266287392183, "grad_norm": 3.0327951908111572, "learning_rate": 5.94971554413654e-06, "loss": 0.2462, "num_input_tokens_seen": 13980160, "step": 6485 }, { "epoch": 1.1910442282987703, "grad_norm": 2.701315402984619, "learning_rate": 5.9543035419343e-06, "loss": 0.3793, "num_input_tokens_seen": 13990944, "step": 6490 }, { "epoch": 1.1919618278583226, "grad_norm": 13.674487113952637, "learning_rate": 5.9588915397320615e-06, "loss": 0.4699, "num_input_tokens_seen": 14001600, "step": 6495 }, { "epoch": 1.192879427417875, "grad_norm": 9.516464233398438, "learning_rate": 5.963479537529823e-06, "loss": 0.3881, "num_input_tokens_seen": 14013632, "step": 6500 }, { "epoch": 1.193797026977427, "grad_norm": 10.082305908203125, "learning_rate": 5.968067535327583e-06, "loss": 0.2915, "num_input_tokens_seen": 14024128, "step": 6505 }, { "epoch": 1.1947146265369792, "grad_norm": 2.6980881690979004, "learning_rate": 5.972655533125345e-06, "loss": 0.4064, "num_input_tokens_seen": 14035104, "step": 6510 }, { "epoch": 1.1956322260965315, "grad_norm": 3.0142860412597656, "learning_rate": 5.977243530923106e-06, "loss": 0.3485, "num_input_tokens_seen": 14045728, "step": 6515 }, { "epoch": 1.1965498256560836, "grad_norm": 3.965773582458496, "learning_rate": 5.9818315287208675e-06, "loss": 0.3765, "num_input_tokens_seen": 14056192, "step": 6520 }, { "epoch": 1.1974674252156359, "grad_norm": 5.198568820953369, "learning_rate": 5.986419526518628e-06, "loss": 0.3886, "num_input_tokens_seen": 14067328, "step": 6525 }, { "epoch": 1.1983850247751882, "grad_norm": 2.165437936782837, "learning_rate": 5.991007524316389e-06, "loss": 0.2834, "num_input_tokens_seen": 14078720, "step": 6530 }, { "epoch": 1.1993026243347402, "grad_norm": 1.3272950649261475, "learning_rate": 5.995595522114151e-06, "loss": 0.2938, "num_input_tokens_seen": 14090144, "step": 6535 }, { "epoch": 1.2002202238942925, "grad_norm": 3.0393526554107666, "learning_rate": 6.00018351991191e-06, "loss": 0.3189, "num_input_tokens_seen": 14100608, "step": 6540 }, { "epoch": 1.2011378234538448, "grad_norm": 2.8439505100250244, "learning_rate": 6.004771517709672e-06, "loss": 0.2926, "num_input_tokens_seen": 14111488, "step": 6545 }, { "epoch": 1.2020554230133969, "grad_norm": 4.275595188140869, "learning_rate": 6.009359515507434e-06, "loss": 0.3119, "num_input_tokens_seen": 14121664, "step": 6550 }, { "epoch": 1.2029730225729491, "grad_norm": 2.2887189388275146, "learning_rate": 6.013947513305194e-06, "loss": 0.378, "num_input_tokens_seen": 14133568, "step": 6555 }, { "epoch": 1.2038906221325014, "grad_norm": 2.759716749191284, "learning_rate": 6.018535511102955e-06, "loss": 0.3488, "num_input_tokens_seen": 14144288, "step": 6560 }, { "epoch": 1.2048082216920535, "grad_norm": 13.434942245483398, "learning_rate": 6.023123508900716e-06, "loss": 0.394, "num_input_tokens_seen": 14155808, "step": 6565 }, { "epoch": 1.2057258212516058, "grad_norm": 2.2341415882110596, "learning_rate": 6.027711506698477e-06, "loss": 0.2949, "num_input_tokens_seen": 14166688, "step": 6570 }, { "epoch": 1.206643420811158, "grad_norm": 5.951671600341797, "learning_rate": 6.032299504496238e-06, "loss": 0.318, "num_input_tokens_seen": 14177440, "step": 6575 }, { "epoch": 1.2075610203707101, "grad_norm": 1.5618441104888916, "learning_rate": 6.0368875022939996e-06, "loss": 0.2978, "num_input_tokens_seen": 14190112, "step": 6580 }, { "epoch": 1.2084786199302624, "grad_norm": 1.8822541236877441, "learning_rate": 6.04147550009176e-06, "loss": 0.343, "num_input_tokens_seen": 14201280, "step": 6585 }, { "epoch": 1.2093962194898147, "grad_norm": 5.4564080238342285, "learning_rate": 6.0460634978895214e-06, "loss": 0.4493, "num_input_tokens_seen": 14212672, "step": 6590 }, { "epoch": 1.2103138190493667, "grad_norm": 16.364395141601562, "learning_rate": 6.050651495687283e-06, "loss": 0.3558, "num_input_tokens_seen": 14222272, "step": 6595 }, { "epoch": 1.211231418608919, "grad_norm": 1.608719825744629, "learning_rate": 6.055239493485043e-06, "loss": 0.2976, "num_input_tokens_seen": 14233184, "step": 6600 }, { "epoch": 1.2121490181684713, "grad_norm": 2.602687120437622, "learning_rate": 6.059827491282805e-06, "loss": 0.2898, "num_input_tokens_seen": 14243776, "step": 6605 }, { "epoch": 1.2130666177280234, "grad_norm": 5.827252388000488, "learning_rate": 6.064415489080566e-06, "loss": 0.3565, "num_input_tokens_seen": 14255392, "step": 6610 }, { "epoch": 1.2139842172875757, "grad_norm": 3.57228946685791, "learning_rate": 6.0690034868783266e-06, "loss": 0.369, "num_input_tokens_seen": 14266240, "step": 6615 }, { "epoch": 1.214901816847128, "grad_norm": 5.619304180145264, "learning_rate": 6.073591484676088e-06, "loss": 0.2591, "num_input_tokens_seen": 14277792, "step": 6620 }, { "epoch": 1.21581941640668, "grad_norm": 4.427534580230713, "learning_rate": 6.078179482473849e-06, "loss": 0.3676, "num_input_tokens_seen": 14288032, "step": 6625 }, { "epoch": 1.2167370159662323, "grad_norm": 2.2642722129821777, "learning_rate": 6.08276748027161e-06, "loss": 0.2916, "num_input_tokens_seen": 14299552, "step": 6630 }, { "epoch": 1.2176546155257846, "grad_norm": 2.9791979789733887, "learning_rate": 6.087355478069371e-06, "loss": 0.3804, "num_input_tokens_seen": 14309088, "step": 6635 }, { "epoch": 1.2185722150853366, "grad_norm": 3.7701592445373535, "learning_rate": 6.0919434758671325e-06, "loss": 0.3917, "num_input_tokens_seen": 14320480, "step": 6640 }, { "epoch": 1.219489814644889, "grad_norm": 2.365963935852051, "learning_rate": 6.096531473664893e-06, "loss": 0.3603, "num_input_tokens_seen": 14330304, "step": 6645 }, { "epoch": 1.2204074142044412, "grad_norm": 4.520293712615967, "learning_rate": 6.101119471462654e-06, "loss": 0.2724, "num_input_tokens_seen": 14341632, "step": 6650 }, { "epoch": 1.2213250137639933, "grad_norm": 1.4099771976470947, "learning_rate": 6.105707469260416e-06, "loss": 0.3027, "num_input_tokens_seen": 14353152, "step": 6655 }, { "epoch": 1.2222426133235456, "grad_norm": 4.034055233001709, "learning_rate": 6.110295467058176e-06, "loss": 0.3069, "num_input_tokens_seen": 14364064, "step": 6660 }, { "epoch": 1.2231602128830978, "grad_norm": 16.919477462768555, "learning_rate": 6.114883464855938e-06, "loss": 0.359, "num_input_tokens_seen": 14374432, "step": 6665 }, { "epoch": 1.22407781244265, "grad_norm": 13.615675926208496, "learning_rate": 6.119471462653699e-06, "loss": 0.359, "num_input_tokens_seen": 14385312, "step": 6670 }, { "epoch": 1.2249954120022022, "grad_norm": 9.370862007141113, "learning_rate": 6.1240594604514595e-06, "loss": 0.3497, "num_input_tokens_seen": 14395456, "step": 6675 }, { "epoch": 1.2259130115617545, "grad_norm": 6.77980375289917, "learning_rate": 6.128647458249221e-06, "loss": 0.3844, "num_input_tokens_seen": 14405888, "step": 6680 }, { "epoch": 1.2268306111213068, "grad_norm": 10.313652038574219, "learning_rate": 6.133235456046982e-06, "loss": 0.3295, "num_input_tokens_seen": 14416416, "step": 6685 }, { "epoch": 1.2277482106808588, "grad_norm": 5.510473251342773, "learning_rate": 6.137823453844742e-06, "loss": 0.336, "num_input_tokens_seen": 14427456, "step": 6690 }, { "epoch": 1.228665810240411, "grad_norm": 5.330352306365967, "learning_rate": 6.142411451642504e-06, "loss": 0.3469, "num_input_tokens_seen": 14438816, "step": 6695 }, { "epoch": 1.2295834097999634, "grad_norm": 3.930426597595215, "learning_rate": 6.1469994494402655e-06, "loss": 0.3086, "num_input_tokens_seen": 14450016, "step": 6700 }, { "epoch": 1.2305010093595155, "grad_norm": 9.66871166229248, "learning_rate": 6.151587447238025e-06, "loss": 0.3482, "num_input_tokens_seen": 14460704, "step": 6705 }, { "epoch": 1.2314186089190677, "grad_norm": 2.760037422180176, "learning_rate": 6.1561754450357865e-06, "loss": 0.3607, "num_input_tokens_seen": 14470912, "step": 6710 }, { "epoch": 1.23233620847862, "grad_norm": 3.9283499717712402, "learning_rate": 6.160763442833549e-06, "loss": 0.3086, "num_input_tokens_seen": 14481088, "step": 6715 }, { "epoch": 1.233253808038172, "grad_norm": 3.183601140975952, "learning_rate": 6.165351440631308e-06, "loss": 0.3519, "num_input_tokens_seen": 14493088, "step": 6720 }, { "epoch": 1.2341714075977244, "grad_norm": 7.6206135749816895, "learning_rate": 6.16993943842907e-06, "loss": 0.2729, "num_input_tokens_seen": 14504544, "step": 6725 }, { "epoch": 1.2350890071572767, "grad_norm": 5.767255783081055, "learning_rate": 6.174527436226831e-06, "loss": 0.3572, "num_input_tokens_seen": 14515648, "step": 6730 }, { "epoch": 1.2360066067168287, "grad_norm": 5.0369391441345215, "learning_rate": 6.179115434024592e-06, "loss": 0.3457, "num_input_tokens_seen": 14526784, "step": 6735 }, { "epoch": 1.236924206276381, "grad_norm": 3.7870185375213623, "learning_rate": 6.183703431822353e-06, "loss": 0.3776, "num_input_tokens_seen": 14539136, "step": 6740 }, { "epoch": 1.2378418058359333, "grad_norm": 2.507354259490967, "learning_rate": 6.188291429620114e-06, "loss": 0.3525, "num_input_tokens_seen": 14549504, "step": 6745 }, { "epoch": 1.2387594053954853, "grad_norm": 6.540931701660156, "learning_rate": 6.192879427417875e-06, "loss": 0.422, "num_input_tokens_seen": 14559968, "step": 6750 }, { "epoch": 1.2396770049550376, "grad_norm": 2.693699836730957, "learning_rate": 6.197467425215636e-06, "loss": 0.3703, "num_input_tokens_seen": 14570656, "step": 6755 }, { "epoch": 1.24059460451459, "grad_norm": 11.195013046264648, "learning_rate": 6.202055423013398e-06, "loss": 0.268, "num_input_tokens_seen": 14581664, "step": 6760 }, { "epoch": 1.241512204074142, "grad_norm": 2.7024946212768555, "learning_rate": 6.206643420811158e-06, "loss": 0.3699, "num_input_tokens_seen": 14591968, "step": 6765 }, { "epoch": 1.2424298036336943, "grad_norm": 2.1428544521331787, "learning_rate": 6.2112314186089195e-06, "loss": 0.2673, "num_input_tokens_seen": 14603328, "step": 6770 }, { "epoch": 1.2433474031932465, "grad_norm": 3.9319369792938232, "learning_rate": 6.215819416406681e-06, "loss": 0.3856, "num_input_tokens_seen": 14614880, "step": 6775 }, { "epoch": 1.2442650027527986, "grad_norm": 14.70113754272461, "learning_rate": 6.220407414204441e-06, "loss": 0.315, "num_input_tokens_seen": 14625248, "step": 6780 }, { "epoch": 1.245182602312351, "grad_norm": 1.6220197677612305, "learning_rate": 6.224995412002203e-06, "loss": 0.267, "num_input_tokens_seen": 14636960, "step": 6785 }, { "epoch": 1.2461002018719032, "grad_norm": 4.752938270568848, "learning_rate": 6.229583409799964e-06, "loss": 0.3449, "num_input_tokens_seen": 14648192, "step": 6790 }, { "epoch": 1.2470178014314552, "grad_norm": 6.068205833435059, "learning_rate": 6.2341714075977246e-06, "loss": 0.3394, "num_input_tokens_seen": 14659136, "step": 6795 }, { "epoch": 1.2479354009910075, "grad_norm": 20.01545524597168, "learning_rate": 6.238759405395486e-06, "loss": 0.3963, "num_input_tokens_seen": 14670048, "step": 6800 }, { "epoch": 1.2488530005505598, "grad_norm": 13.386645317077637, "learning_rate": 6.243347403193247e-06, "loss": 0.3618, "num_input_tokens_seen": 14679296, "step": 6805 }, { "epoch": 1.2497706001101119, "grad_norm": 5.289206027984619, "learning_rate": 6.247935400991008e-06, "loss": 0.2755, "num_input_tokens_seen": 14689664, "step": 6810 }, { "epoch": 1.2506881996696642, "grad_norm": 1.2640470266342163, "learning_rate": 6.252523398788769e-06, "loss": 0.4002, "num_input_tokens_seen": 14698944, "step": 6815 }, { "epoch": 1.2516057992292164, "grad_norm": 2.558947801589966, "learning_rate": 6.2571113965865305e-06, "loss": 0.2475, "num_input_tokens_seen": 14708800, "step": 6820 }, { "epoch": 1.2525233987887685, "grad_norm": 4.110755443572998, "learning_rate": 6.261699394384291e-06, "loss": 0.4151, "num_input_tokens_seen": 14719648, "step": 6825 }, { "epoch": 1.2534409983483208, "grad_norm": 3.610144853591919, "learning_rate": 6.266287392182052e-06, "loss": 0.3809, "num_input_tokens_seen": 14730656, "step": 6830 }, { "epoch": 1.254358597907873, "grad_norm": 1.9193994998931885, "learning_rate": 6.270875389979814e-06, "loss": 0.3492, "num_input_tokens_seen": 14741312, "step": 6835 }, { "epoch": 1.2552761974674251, "grad_norm": 1.7506341934204102, "learning_rate": 6.275463387777574e-06, "loss": 0.3396, "num_input_tokens_seen": 14752544, "step": 6840 }, { "epoch": 1.2561937970269774, "grad_norm": 3.700096368789673, "learning_rate": 6.280051385575336e-06, "loss": 0.3308, "num_input_tokens_seen": 14763136, "step": 6845 }, { "epoch": 1.2571113965865297, "grad_norm": 2.47286057472229, "learning_rate": 6.284639383373097e-06, "loss": 0.3319, "num_input_tokens_seen": 14774880, "step": 6850 }, { "epoch": 1.2580289961460818, "grad_norm": 6.8317131996154785, "learning_rate": 6.289227381170857e-06, "loss": 0.3458, "num_input_tokens_seen": 14786080, "step": 6855 }, { "epoch": 1.258946595705634, "grad_norm": 1.6497106552124023, "learning_rate": 6.293815378968619e-06, "loss": 0.3078, "num_input_tokens_seen": 14797984, "step": 6860 }, { "epoch": 1.2598641952651863, "grad_norm": 1.5703169107437134, "learning_rate": 6.29840337676638e-06, "loss": 0.3225, "num_input_tokens_seen": 14808544, "step": 6865 }, { "epoch": 1.2607817948247386, "grad_norm": 1.2104618549346924, "learning_rate": 6.30299137456414e-06, "loss": 0.399, "num_input_tokens_seen": 14819616, "step": 6870 }, { "epoch": 1.2616993943842907, "grad_norm": 2.205125093460083, "learning_rate": 6.307579372361901e-06, "loss": 0.4112, "num_input_tokens_seen": 14831072, "step": 6875 }, { "epoch": 1.262616993943843, "grad_norm": 2.289327621459961, "learning_rate": 6.3121673701596635e-06, "loss": 0.2725, "num_input_tokens_seen": 14840992, "step": 6880 }, { "epoch": 1.2635345935033953, "grad_norm": 1.726245403289795, "learning_rate": 6.316755367957423e-06, "loss": 0.3445, "num_input_tokens_seen": 14851680, "step": 6885 }, { "epoch": 1.2644521930629473, "grad_norm": 1.8403512239456177, "learning_rate": 6.3213433657551845e-06, "loss": 0.3208, "num_input_tokens_seen": 14862784, "step": 6890 }, { "epoch": 1.2653697926224996, "grad_norm": 3.076395273208618, "learning_rate": 6.325931363552946e-06, "loss": 0.3361, "num_input_tokens_seen": 14873568, "step": 6895 }, { "epoch": 1.2662873921820519, "grad_norm": 4.0772504806518555, "learning_rate": 6.330519361350706e-06, "loss": 0.3194, "num_input_tokens_seen": 14884864, "step": 6900 }, { "epoch": 1.267204991741604, "grad_norm": 11.46209716796875, "learning_rate": 6.335107359148468e-06, "loss": 0.2984, "num_input_tokens_seen": 14895680, "step": 6905 }, { "epoch": 1.2681225913011562, "grad_norm": 21.73181915283203, "learning_rate": 6.339695356946229e-06, "loss": 0.381, "num_input_tokens_seen": 14904768, "step": 6910 }, { "epoch": 1.2690401908607085, "grad_norm": 9.708525657653809, "learning_rate": 6.34428335474399e-06, "loss": 0.4266, "num_input_tokens_seen": 14915104, "step": 6915 }, { "epoch": 1.2699577904202606, "grad_norm": 1.7227246761322021, "learning_rate": 6.348871352541751e-06, "loss": 0.3605, "num_input_tokens_seen": 14926112, "step": 6920 }, { "epoch": 1.2708753899798129, "grad_norm": 4.3222737312316895, "learning_rate": 6.353459350339512e-06, "loss": 0.3329, "num_input_tokens_seen": 14936960, "step": 6925 }, { "epoch": 1.2717929895393651, "grad_norm": 4.581358432769775, "learning_rate": 6.358047348137273e-06, "loss": 0.3638, "num_input_tokens_seen": 14947392, "step": 6930 }, { "epoch": 1.2727105890989172, "grad_norm": 1.1764806509017944, "learning_rate": 6.362635345935034e-06, "loss": 0.4106, "num_input_tokens_seen": 14958176, "step": 6935 }, { "epoch": 1.2736281886584695, "grad_norm": 2.6163241863250732, "learning_rate": 6.367223343732796e-06, "loss": 0.2788, "num_input_tokens_seen": 14969056, "step": 6940 }, { "epoch": 1.2745457882180218, "grad_norm": 1.011793851852417, "learning_rate": 6.371811341530557e-06, "loss": 0.3261, "num_input_tokens_seen": 14981056, "step": 6945 }, { "epoch": 1.2754633877775738, "grad_norm": 1.7642412185668945, "learning_rate": 6.3763993393283175e-06, "loss": 0.3144, "num_input_tokens_seen": 14991648, "step": 6950 }, { "epoch": 1.2763809873371261, "grad_norm": 1.946519374847412, "learning_rate": 6.380987337126079e-06, "loss": 0.3844, "num_input_tokens_seen": 15003008, "step": 6955 }, { "epoch": 1.2772985868966784, "grad_norm": 1.3724473714828491, "learning_rate": 6.38557533492384e-06, "loss": 0.3512, "num_input_tokens_seen": 15013216, "step": 6960 }, { "epoch": 1.2782161864562305, "grad_norm": 8.399800300598145, "learning_rate": 6.390163332721601e-06, "loss": 0.3505, "num_input_tokens_seen": 15022848, "step": 6965 }, { "epoch": 1.2791337860157828, "grad_norm": 2.9983677864074707, "learning_rate": 6.394751330519362e-06, "loss": 0.3848, "num_input_tokens_seen": 15032992, "step": 6970 }, { "epoch": 1.280051385575335, "grad_norm": 1.9241942167282104, "learning_rate": 6.3993393283171234e-06, "loss": 0.341, "num_input_tokens_seen": 15044032, "step": 6975 }, { "epoch": 1.280968985134887, "grad_norm": 11.632718086242676, "learning_rate": 6.403927326114884e-06, "loss": 0.3528, "num_input_tokens_seen": 15054848, "step": 6980 }, { "epoch": 1.2818865846944394, "grad_norm": 1.7414309978485107, "learning_rate": 6.408515323912645e-06, "loss": 0.3462, "num_input_tokens_seen": 15065120, "step": 6985 }, { "epoch": 1.2828041842539917, "grad_norm": 3.5471785068511963, "learning_rate": 6.413103321710407e-06, "loss": 0.3435, "num_input_tokens_seen": 15074880, "step": 6990 }, { "epoch": 1.2837217838135437, "grad_norm": 3.5779354572296143, "learning_rate": 6.417691319508167e-06, "loss": 0.4559, "num_input_tokens_seen": 15085760, "step": 6995 }, { "epoch": 1.284639383373096, "grad_norm": 1.6784543991088867, "learning_rate": 6.4222793173059286e-06, "loss": 0.3347, "num_input_tokens_seen": 15096640, "step": 7000 }, { "epoch": 1.2855569829326483, "grad_norm": 4.926183700561523, "learning_rate": 6.42686731510369e-06, "loss": 0.3462, "num_input_tokens_seen": 15107840, "step": 7005 }, { "epoch": 1.2864745824922004, "grad_norm": 2.1159937381744385, "learning_rate": 6.4314553129014504e-06, "loss": 0.3283, "num_input_tokens_seen": 15119456, "step": 7010 }, { "epoch": 1.2873921820517527, "grad_norm": 1.2220255136489868, "learning_rate": 6.436043310699212e-06, "loss": 0.3633, "num_input_tokens_seen": 15130528, "step": 7015 }, { "epoch": 1.288309781611305, "grad_norm": 2.5686848163604736, "learning_rate": 6.440631308496973e-06, "loss": 0.3541, "num_input_tokens_seen": 15141184, "step": 7020 }, { "epoch": 1.289227381170857, "grad_norm": 1.6091773509979248, "learning_rate": 6.445219306294734e-06, "loss": 0.2936, "num_input_tokens_seen": 15152128, "step": 7025 }, { "epoch": 1.2901449807304093, "grad_norm": 1.7309026718139648, "learning_rate": 6.449807304092495e-06, "loss": 0.302, "num_input_tokens_seen": 15163136, "step": 7030 }, { "epoch": 1.2910625802899616, "grad_norm": 4.699894905090332, "learning_rate": 6.454395301890256e-06, "loss": 0.2996, "num_input_tokens_seen": 15174016, "step": 7035 }, { "epoch": 1.2919801798495136, "grad_norm": 3.7266643047332764, "learning_rate": 6.458983299688016e-06, "loss": 0.2314, "num_input_tokens_seen": 15184992, "step": 7040 }, { "epoch": 1.292897779409066, "grad_norm": 6.105358123779297, "learning_rate": 6.463571297485777e-06, "loss": 0.3628, "num_input_tokens_seen": 15195712, "step": 7045 }, { "epoch": 1.2938153789686182, "grad_norm": 4.070807456970215, "learning_rate": 6.46815929528354e-06, "loss": 0.4156, "num_input_tokens_seen": 15206720, "step": 7050 }, { "epoch": 1.2947329785281703, "grad_norm": 1.131791591644287, "learning_rate": 6.472747293081299e-06, "loss": 0.3194, "num_input_tokens_seen": 15217728, "step": 7055 }, { "epoch": 1.2956505780877225, "grad_norm": 2.0265004634857178, "learning_rate": 6.477335290879061e-06, "loss": 0.3903, "num_input_tokens_seen": 15229344, "step": 7060 }, { "epoch": 1.2965681776472748, "grad_norm": 1.3337031602859497, "learning_rate": 6.481923288676822e-06, "loss": 0.3366, "num_input_tokens_seen": 15240032, "step": 7065 }, { "epoch": 1.297485777206827, "grad_norm": 3.6906962394714355, "learning_rate": 6.4865112864745825e-06, "loss": 0.298, "num_input_tokens_seen": 15249952, "step": 7070 }, { "epoch": 1.2984033767663792, "grad_norm": 1.7518138885498047, "learning_rate": 6.491099284272344e-06, "loss": 0.398, "num_input_tokens_seen": 15259744, "step": 7075 }, { "epoch": 1.2993209763259315, "grad_norm": 4.486844062805176, "learning_rate": 6.495687282070105e-06, "loss": 0.3176, "num_input_tokens_seen": 15269024, "step": 7080 }, { "epoch": 1.3002385758854835, "grad_norm": 1.8790442943572998, "learning_rate": 6.500275279867866e-06, "loss": 0.2879, "num_input_tokens_seen": 15279136, "step": 7085 }, { "epoch": 1.3011561754450358, "grad_norm": 9.098950386047363, "learning_rate": 6.504863277665627e-06, "loss": 0.3407, "num_input_tokens_seen": 15290816, "step": 7090 }, { "epoch": 1.302073775004588, "grad_norm": 14.169060707092285, "learning_rate": 6.5094512754633885e-06, "loss": 0.6029, "num_input_tokens_seen": 15301472, "step": 7095 }, { "epoch": 1.3029913745641402, "grad_norm": 5.228341579437256, "learning_rate": 6.514039273261149e-06, "loss": 0.3136, "num_input_tokens_seen": 15312448, "step": 7100 }, { "epoch": 1.3039089741236924, "grad_norm": 6.971251010894775, "learning_rate": 6.51862727105891e-06, "loss": 0.2963, "num_input_tokens_seen": 15323584, "step": 7105 }, { "epoch": 1.3048265736832447, "grad_norm": 11.015853881835938, "learning_rate": 6.523215268856672e-06, "loss": 0.344, "num_input_tokens_seen": 15335072, "step": 7110 }, { "epoch": 1.3057441732427968, "grad_norm": 3.7012078762054443, "learning_rate": 6.527803266654432e-06, "loss": 0.314, "num_input_tokens_seen": 15345984, "step": 7115 }, { "epoch": 1.306661772802349, "grad_norm": 5.4000563621521, "learning_rate": 6.532391264452194e-06, "loss": 0.283, "num_input_tokens_seen": 15357248, "step": 7120 }, { "epoch": 1.3075793723619014, "grad_norm": 3.3561851978302, "learning_rate": 6.536979262249955e-06, "loss": 0.2975, "num_input_tokens_seen": 15368416, "step": 7125 }, { "epoch": 1.3084969719214534, "grad_norm": 2.7540299892425537, "learning_rate": 6.5415672600477155e-06, "loss": 0.3872, "num_input_tokens_seen": 15379040, "step": 7130 }, { "epoch": 1.3094145714810057, "grad_norm": 1.3955161571502686, "learning_rate": 6.546155257845477e-06, "loss": 0.3565, "num_input_tokens_seen": 15391392, "step": 7135 }, { "epoch": 1.310332171040558, "grad_norm": 6.153592586517334, "learning_rate": 6.550743255643238e-06, "loss": 0.4676, "num_input_tokens_seen": 15402144, "step": 7140 }, { "epoch": 1.31124977060011, "grad_norm": 2.465609550476074, "learning_rate": 6.555331253440999e-06, "loss": 0.2209, "num_input_tokens_seen": 15414624, "step": 7145 }, { "epoch": 1.3121673701596623, "grad_norm": 3.9581480026245117, "learning_rate": 6.55991925123876e-06, "loss": 0.3492, "num_input_tokens_seen": 15425568, "step": 7150 }, { "epoch": 1.3130849697192146, "grad_norm": 19.983341217041016, "learning_rate": 6.5645072490365215e-06, "loss": 0.4283, "num_input_tokens_seen": 15437664, "step": 7155 }, { "epoch": 1.3140025692787667, "grad_norm": 10.60355281829834, "learning_rate": 6.569095246834282e-06, "loss": 0.3846, "num_input_tokens_seen": 15449952, "step": 7160 }, { "epoch": 1.314920168838319, "grad_norm": 5.607883453369141, "learning_rate": 6.573683244632043e-06, "loss": 0.3024, "num_input_tokens_seen": 15460032, "step": 7165 }, { "epoch": 1.3158377683978713, "grad_norm": 2.1256585121154785, "learning_rate": 6.578271242429805e-06, "loss": 0.3481, "num_input_tokens_seen": 15470048, "step": 7170 }, { "epoch": 1.3167553679574233, "grad_norm": 7.2645134925842285, "learning_rate": 6.582859240227565e-06, "loss": 0.3681, "num_input_tokens_seen": 15480544, "step": 7175 }, { "epoch": 1.3176729675169756, "grad_norm": 1.2183218002319336, "learning_rate": 6.5874472380253266e-06, "loss": 0.2885, "num_input_tokens_seen": 15491200, "step": 7180 }, { "epoch": 1.3185905670765279, "grad_norm": 5.8879475593566895, "learning_rate": 6.592035235823088e-06, "loss": 0.3998, "num_input_tokens_seen": 15503360, "step": 7185 }, { "epoch": 1.31950816663608, "grad_norm": 1.7677485942840576, "learning_rate": 6.596623233620848e-06, "loss": 0.4335, "num_input_tokens_seen": 15514496, "step": 7190 }, { "epoch": 1.3204257661956322, "grad_norm": 2.0663840770721436, "learning_rate": 6.60121123141861e-06, "loss": 0.322, "num_input_tokens_seen": 15526432, "step": 7195 }, { "epoch": 1.3213433657551845, "grad_norm": 3.2687296867370605, "learning_rate": 6.605799229216371e-06, "loss": 0.3512, "num_input_tokens_seen": 15536640, "step": 7200 }, { "epoch": 1.3222609653147366, "grad_norm": 2.68448805809021, "learning_rate": 6.610387227014131e-06, "loss": 0.3005, "num_input_tokens_seen": 15548320, "step": 7205 }, { "epoch": 1.3231785648742889, "grad_norm": 3.3029046058654785, "learning_rate": 6.614975224811892e-06, "loss": 0.3379, "num_input_tokens_seen": 15560128, "step": 7210 }, { "epoch": 1.3240961644338411, "grad_norm": 4.333962440490723, "learning_rate": 6.619563222609654e-06, "loss": 0.3461, "num_input_tokens_seen": 15572320, "step": 7215 }, { "epoch": 1.3250137639933932, "grad_norm": 4.2839531898498535, "learning_rate": 6.624151220407414e-06, "loss": 0.3162, "num_input_tokens_seen": 15583424, "step": 7220 }, { "epoch": 1.3259313635529455, "grad_norm": 5.377345085144043, "learning_rate": 6.6287392182051754e-06, "loss": 0.3273, "num_input_tokens_seen": 15594752, "step": 7225 }, { "epoch": 1.3268489631124978, "grad_norm": 2.0077109336853027, "learning_rate": 6.633327216002937e-06, "loss": 0.2237, "num_input_tokens_seen": 15606656, "step": 7230 }, { "epoch": 1.3277665626720498, "grad_norm": 4.7131733894348145, "learning_rate": 6.637915213800697e-06, "loss": 0.3259, "num_input_tokens_seen": 15617184, "step": 7235 }, { "epoch": 1.3286841622316021, "grad_norm": 5.415958881378174, "learning_rate": 6.642503211598459e-06, "loss": 0.3238, "num_input_tokens_seen": 15628416, "step": 7240 }, { "epoch": 1.3296017617911544, "grad_norm": 4.826539516448975, "learning_rate": 6.64709120939622e-06, "loss": 0.3299, "num_input_tokens_seen": 15638688, "step": 7245 }, { "epoch": 1.3305193613507065, "grad_norm": 11.458160400390625, "learning_rate": 6.6516792071939805e-06, "loss": 0.2584, "num_input_tokens_seen": 15648640, "step": 7250 }, { "epoch": 1.3314369609102588, "grad_norm": 8.880363464355469, "learning_rate": 6.656267204991742e-06, "loss": 0.285, "num_input_tokens_seen": 15658848, "step": 7255 }, { "epoch": 1.332354560469811, "grad_norm": 3.4619908332824707, "learning_rate": 6.660855202789503e-06, "loss": 0.448, "num_input_tokens_seen": 15669216, "step": 7260 }, { "epoch": 1.333272160029363, "grad_norm": 8.31905746459961, "learning_rate": 6.665443200587264e-06, "loss": 0.4063, "num_input_tokens_seen": 15680832, "step": 7265 }, { "epoch": 1.3341897595889154, "grad_norm": 4.394126892089844, "learning_rate": 6.670031198385025e-06, "loss": 0.3451, "num_input_tokens_seen": 15692192, "step": 7270 }, { "epoch": 1.3351073591484677, "grad_norm": 4.605011940002441, "learning_rate": 6.6746191961827865e-06, "loss": 0.3446, "num_input_tokens_seen": 15702080, "step": 7275 }, { "epoch": 1.3360249587080197, "grad_norm": 7.325903415679932, "learning_rate": 6.679207193980547e-06, "loss": 0.3694, "num_input_tokens_seen": 15713952, "step": 7280 }, { "epoch": 1.336942558267572, "grad_norm": 11.082159042358398, "learning_rate": 6.683795191778308e-06, "loss": 0.3875, "num_input_tokens_seen": 15725312, "step": 7285 }, { "epoch": 1.3378601578271243, "grad_norm": 13.739103317260742, "learning_rate": 6.68838318957607e-06, "loss": 0.3561, "num_input_tokens_seen": 15735104, "step": 7290 }, { "epoch": 1.3387777573866764, "grad_norm": 8.305682182312012, "learning_rate": 6.69297118737383e-06, "loss": 0.3718, "num_input_tokens_seen": 15745568, "step": 7295 }, { "epoch": 1.3396953569462287, "grad_norm": 3.668065309524536, "learning_rate": 6.697559185171592e-06, "loss": 0.3578, "num_input_tokens_seen": 15755232, "step": 7300 }, { "epoch": 1.340612956505781, "grad_norm": 5.182516574859619, "learning_rate": 6.702147182969353e-06, "loss": 0.3565, "num_input_tokens_seen": 15767616, "step": 7305 }, { "epoch": 1.341530556065333, "grad_norm": 5.068854808807373, "learning_rate": 6.7067351807671135e-06, "loss": 0.3361, "num_input_tokens_seen": 15779392, "step": 7310 }, { "epoch": 1.3424481556248853, "grad_norm": 8.169697761535645, "learning_rate": 6.711323178564875e-06, "loss": 0.387, "num_input_tokens_seen": 15789824, "step": 7315 }, { "epoch": 1.3433657551844376, "grad_norm": 6.562323093414307, "learning_rate": 6.715911176362636e-06, "loss": 0.2912, "num_input_tokens_seen": 15800864, "step": 7320 }, { "epoch": 1.3442833547439896, "grad_norm": 5.528965473175049, "learning_rate": 6.720499174160397e-06, "loss": 0.319, "num_input_tokens_seen": 15811520, "step": 7325 }, { "epoch": 1.345200954303542, "grad_norm": 10.57800579071045, "learning_rate": 6.725087171958158e-06, "loss": 0.3601, "num_input_tokens_seen": 15822880, "step": 7330 }, { "epoch": 1.3461185538630942, "grad_norm": 3.105300188064575, "learning_rate": 6.7296751697559195e-06, "loss": 0.328, "num_input_tokens_seen": 15833856, "step": 7335 }, { "epoch": 1.3470361534226463, "grad_norm": 9.018294334411621, "learning_rate": 6.73426316755368e-06, "loss": 0.3878, "num_input_tokens_seen": 15845312, "step": 7340 }, { "epoch": 1.3479537529821985, "grad_norm": 4.525030136108398, "learning_rate": 6.738851165351441e-06, "loss": 0.3348, "num_input_tokens_seen": 15857280, "step": 7345 }, { "epoch": 1.3488713525417508, "grad_norm": 3.88462233543396, "learning_rate": 6.743439163149203e-06, "loss": 0.3205, "num_input_tokens_seen": 15868032, "step": 7350 }, { "epoch": 1.349788952101303, "grad_norm": 9.898040771484375, "learning_rate": 6.748027160946962e-06, "loss": 0.3576, "num_input_tokens_seen": 15877792, "step": 7355 }, { "epoch": 1.3507065516608552, "grad_norm": 3.1518311500549316, "learning_rate": 6.752615158744725e-06, "loss": 0.3059, "num_input_tokens_seen": 15888192, "step": 7360 }, { "epoch": 1.3516241512204075, "grad_norm": 3.3002448081970215, "learning_rate": 6.757203156542486e-06, "loss": 0.2731, "num_input_tokens_seen": 15897696, "step": 7365 }, { "epoch": 1.3525417507799595, "grad_norm": 4.070114612579346, "learning_rate": 6.761791154340247e-06, "loss": 0.3004, "num_input_tokens_seen": 15908288, "step": 7370 }, { "epoch": 1.3534593503395118, "grad_norm": 6.592120170593262, "learning_rate": 6.766379152138007e-06, "loss": 0.3532, "num_input_tokens_seen": 15919072, "step": 7375 }, { "epoch": 1.354376949899064, "grad_norm": 3.2893526554107666, "learning_rate": 6.770967149935769e-06, "loss": 0.3391, "num_input_tokens_seen": 15930880, "step": 7380 }, { "epoch": 1.3552945494586162, "grad_norm": 3.207671642303467, "learning_rate": 6.7755551477335305e-06, "loss": 0.2885, "num_input_tokens_seen": 15942528, "step": 7385 }, { "epoch": 1.3562121490181684, "grad_norm": 2.5787642002105713, "learning_rate": 6.78014314553129e-06, "loss": 0.3305, "num_input_tokens_seen": 15952320, "step": 7390 }, { "epoch": 1.3571297485777207, "grad_norm": 8.21411418914795, "learning_rate": 6.7847311433290516e-06, "loss": 0.3345, "num_input_tokens_seen": 15962528, "step": 7395 }, { "epoch": 1.3580473481372728, "grad_norm": 8.411494255065918, "learning_rate": 6.789319141126813e-06, "loss": 0.3675, "num_input_tokens_seen": 15973248, "step": 7400 }, { "epoch": 1.358964947696825, "grad_norm": 1.952466607093811, "learning_rate": 6.7939071389245734e-06, "loss": 0.3352, "num_input_tokens_seen": 15985120, "step": 7405 }, { "epoch": 1.3598825472563774, "grad_norm": 2.603550910949707, "learning_rate": 6.798495136722335e-06, "loss": 0.3592, "num_input_tokens_seen": 15995872, "step": 7410 }, { "epoch": 1.3608001468159294, "grad_norm": 3.2421483993530273, "learning_rate": 6.803083134520096e-06, "loss": 0.2342, "num_input_tokens_seen": 16006208, "step": 7415 }, { "epoch": 1.3617177463754817, "grad_norm": 2.9208760261535645, "learning_rate": 6.807671132317857e-06, "loss": 0.4497, "num_input_tokens_seen": 16016896, "step": 7420 }, { "epoch": 1.362635345935034, "grad_norm": 1.8803343772888184, "learning_rate": 6.812259130115618e-06, "loss": 0.2742, "num_input_tokens_seen": 16028352, "step": 7425 }, { "epoch": 1.363552945494586, "grad_norm": 2.123943328857422, "learning_rate": 6.816847127913379e-06, "loss": 0.287, "num_input_tokens_seen": 16039680, "step": 7430 }, { "epoch": 1.3644705450541383, "grad_norm": 3.4193532466888428, "learning_rate": 6.82143512571114e-06, "loss": 0.311, "num_input_tokens_seen": 16051008, "step": 7435 }, { "epoch": 1.3653881446136906, "grad_norm": 3.7861146926879883, "learning_rate": 6.826023123508901e-06, "loss": 0.3181, "num_input_tokens_seen": 16060960, "step": 7440 }, { "epoch": 1.3663057441732427, "grad_norm": 1.7627055644989014, "learning_rate": 6.830611121306663e-06, "loss": 0.3281, "num_input_tokens_seen": 16071808, "step": 7445 }, { "epoch": 1.367223343732795, "grad_norm": 3.0038671493530273, "learning_rate": 6.835199119104423e-06, "loss": 0.2079, "num_input_tokens_seen": 16082976, "step": 7450 }, { "epoch": 1.3681409432923473, "grad_norm": 3.411869764328003, "learning_rate": 6.8397871169021845e-06, "loss": 0.326, "num_input_tokens_seen": 16092352, "step": 7455 }, { "epoch": 1.3690585428518993, "grad_norm": 6.726849555969238, "learning_rate": 6.844375114699946e-06, "loss": 0.3325, "num_input_tokens_seen": 16103584, "step": 7460 }, { "epoch": 1.3699761424114516, "grad_norm": 9.806964874267578, "learning_rate": 6.848963112497706e-06, "loss": 0.4342, "num_input_tokens_seen": 16113760, "step": 7465 }, { "epoch": 1.3708937419710039, "grad_norm": 3.2937350273132324, "learning_rate": 6.853551110295468e-06, "loss": 0.3698, "num_input_tokens_seen": 16124960, "step": 7470 }, { "epoch": 1.371811341530556, "grad_norm": 3.1423916816711426, "learning_rate": 6.858139108093229e-06, "loss": 0.3668, "num_input_tokens_seen": 16137152, "step": 7475 }, { "epoch": 1.3727289410901082, "grad_norm": 11.137935638427734, "learning_rate": 6.86272710589099e-06, "loss": 0.402, "num_input_tokens_seen": 16148544, "step": 7480 }, { "epoch": 1.3736465406496605, "grad_norm": 2.6245317459106445, "learning_rate": 6.867315103688751e-06, "loss": 0.3121, "num_input_tokens_seen": 16160480, "step": 7485 }, { "epoch": 1.3745641402092126, "grad_norm": 5.259744644165039, "learning_rate": 6.871903101486512e-06, "loss": 0.3243, "num_input_tokens_seen": 16170976, "step": 7490 }, { "epoch": 1.3754817397687649, "grad_norm": 3.071526050567627, "learning_rate": 6.876491099284273e-06, "loss": 0.2929, "num_input_tokens_seen": 16182656, "step": 7495 }, { "epoch": 1.3763993393283172, "grad_norm": 6.753301620483398, "learning_rate": 6.881079097082034e-06, "loss": 0.3561, "num_input_tokens_seen": 16193536, "step": 7500 }, { "epoch": 1.3773169388878692, "grad_norm": 1.6002216339111328, "learning_rate": 6.885667094879796e-06, "loss": 0.3697, "num_input_tokens_seen": 16205280, "step": 7505 }, { "epoch": 1.3782345384474215, "grad_norm": 1.2752195596694946, "learning_rate": 6.890255092677556e-06, "loss": 0.3143, "num_input_tokens_seen": 16216672, "step": 7510 }, { "epoch": 1.3791521380069738, "grad_norm": 3.412217617034912, "learning_rate": 6.8948430904753175e-06, "loss": 0.2966, "num_input_tokens_seen": 16226976, "step": 7515 }, { "epoch": 1.3800697375665258, "grad_norm": 13.103461265563965, "learning_rate": 6.899431088273079e-06, "loss": 0.3354, "num_input_tokens_seen": 16238624, "step": 7520 }, { "epoch": 1.3809873371260781, "grad_norm": 1.903518557548523, "learning_rate": 6.9040190860708385e-06, "loss": 0.265, "num_input_tokens_seen": 16250848, "step": 7525 }, { "epoch": 1.3819049366856304, "grad_norm": 1.986028790473938, "learning_rate": 6.908607083868601e-06, "loss": 0.4612, "num_input_tokens_seen": 16261920, "step": 7530 }, { "epoch": 1.3828225362451825, "grad_norm": 3.4519715309143066, "learning_rate": 6.913195081666362e-06, "loss": 0.3011, "num_input_tokens_seen": 16272928, "step": 7535 }, { "epoch": 1.3837401358047348, "grad_norm": 1.6553490161895752, "learning_rate": 6.917783079464122e-06, "loss": 0.3726, "num_input_tokens_seen": 16284000, "step": 7540 }, { "epoch": 1.384657735364287, "grad_norm": 2.9562339782714844, "learning_rate": 6.922371077261883e-06, "loss": 0.3992, "num_input_tokens_seen": 16294720, "step": 7545 }, { "epoch": 1.385575334923839, "grad_norm": 4.129822731018066, "learning_rate": 6.926959075059645e-06, "loss": 0.2928, "num_input_tokens_seen": 16305888, "step": 7550 }, { "epoch": 1.3864929344833914, "grad_norm": 4.0475754737854, "learning_rate": 6.931547072857405e-06, "loss": 0.2827, "num_input_tokens_seen": 16316608, "step": 7555 }, { "epoch": 1.3874105340429437, "grad_norm": 1.975672721862793, "learning_rate": 6.936135070655166e-06, "loss": 0.3137, "num_input_tokens_seen": 16327232, "step": 7560 }, { "epoch": 1.3883281336024957, "grad_norm": 1.8520276546478271, "learning_rate": 6.940723068452928e-06, "loss": 0.3486, "num_input_tokens_seen": 16336256, "step": 7565 }, { "epoch": 1.389245733162048, "grad_norm": 3.0272445678710938, "learning_rate": 6.945311066250688e-06, "loss": 0.397, "num_input_tokens_seen": 16347072, "step": 7570 }, { "epoch": 1.3901633327216003, "grad_norm": 3.7829957008361816, "learning_rate": 6.94989906404845e-06, "loss": 0.3747, "num_input_tokens_seen": 16357408, "step": 7575 }, { "epoch": 1.3910809322811524, "grad_norm": 1.9140146970748901, "learning_rate": 6.954487061846211e-06, "loss": 0.2881, "num_input_tokens_seen": 16367360, "step": 7580 }, { "epoch": 1.3919985318407047, "grad_norm": 3.403455972671509, "learning_rate": 6.9590750596439715e-06, "loss": 0.3197, "num_input_tokens_seen": 16379264, "step": 7585 }, { "epoch": 1.392916131400257, "grad_norm": 2.9776737689971924, "learning_rate": 6.963663057441733e-06, "loss": 0.3475, "num_input_tokens_seen": 16387264, "step": 7590 }, { "epoch": 1.393833730959809, "grad_norm": 4.468038082122803, "learning_rate": 6.968251055239494e-06, "loss": 0.3726, "num_input_tokens_seen": 16397440, "step": 7595 }, { "epoch": 1.3947513305193613, "grad_norm": 1.401636004447937, "learning_rate": 6.972839053037255e-06, "loss": 0.3024, "num_input_tokens_seen": 16408064, "step": 7600 }, { "epoch": 1.3956689300789136, "grad_norm": 4.062580585479736, "learning_rate": 6.977427050835016e-06, "loss": 0.2574, "num_input_tokens_seen": 16419328, "step": 7605 }, { "epoch": 1.3965865296384659, "grad_norm": 3.867309331893921, "learning_rate": 6.982015048632777e-06, "loss": 0.2964, "num_input_tokens_seen": 16429184, "step": 7610 }, { "epoch": 1.397504129198018, "grad_norm": 2.1337056159973145, "learning_rate": 6.986603046430538e-06, "loss": 0.2735, "num_input_tokens_seen": 16441088, "step": 7615 }, { "epoch": 1.3984217287575702, "grad_norm": 2.655860185623169, "learning_rate": 6.991191044228299e-06, "loss": 0.3017, "num_input_tokens_seen": 16451936, "step": 7620 }, { "epoch": 1.3993393283171225, "grad_norm": 1.8608002662658691, "learning_rate": 6.995779042026061e-06, "loss": 0.2678, "num_input_tokens_seen": 16462624, "step": 7625 }, { "epoch": 1.4002569278766746, "grad_norm": 1.3965375423431396, "learning_rate": 7.000367039823821e-06, "loss": 0.3153, "num_input_tokens_seen": 16474496, "step": 7630 }, { "epoch": 1.4011745274362268, "grad_norm": 6.576589584350586, "learning_rate": 7.0049550376215825e-06, "loss": 0.31, "num_input_tokens_seen": 16485248, "step": 7635 }, { "epoch": 1.4020921269957791, "grad_norm": 1.1371707916259766, "learning_rate": 7.009543035419344e-06, "loss": 0.4393, "num_input_tokens_seen": 16495200, "step": 7640 }, { "epoch": 1.4030097265553312, "grad_norm": 15.814042091369629, "learning_rate": 7.014131033217104e-06, "loss": 0.3771, "num_input_tokens_seen": 16505600, "step": 7645 }, { "epoch": 1.4039273261148835, "grad_norm": 2.8849034309387207, "learning_rate": 7.018719031014866e-06, "loss": 0.3277, "num_input_tokens_seen": 16515296, "step": 7650 }, { "epoch": 1.4048449256744358, "grad_norm": 3.0220091342926025, "learning_rate": 7.023307028812627e-06, "loss": 0.2725, "num_input_tokens_seen": 16526272, "step": 7655 }, { "epoch": 1.4057625252339878, "grad_norm": 3.2355258464813232, "learning_rate": 7.027895026610388e-06, "loss": 0.3497, "num_input_tokens_seen": 16537664, "step": 7660 }, { "epoch": 1.40668012479354, "grad_norm": 1.624013066291809, "learning_rate": 7.032483024408149e-06, "loss": 0.3331, "num_input_tokens_seen": 16548032, "step": 7665 }, { "epoch": 1.4075977243530924, "grad_norm": 4.9919328689575195, "learning_rate": 7.03707102220591e-06, "loss": 0.2905, "num_input_tokens_seen": 16558208, "step": 7670 }, { "epoch": 1.4085153239126444, "grad_norm": 5.252910137176514, "learning_rate": 7.041659020003671e-06, "loss": 0.4734, "num_input_tokens_seen": 16569824, "step": 7675 }, { "epoch": 1.4094329234721967, "grad_norm": 8.831968307495117, "learning_rate": 7.046247017801432e-06, "loss": 0.3715, "num_input_tokens_seen": 16581408, "step": 7680 }, { "epoch": 1.410350523031749, "grad_norm": 4.533358573913574, "learning_rate": 7.050835015599194e-06, "loss": 0.3026, "num_input_tokens_seen": 16591136, "step": 7685 }, { "epoch": 1.411268122591301, "grad_norm": 2.9166998863220215, "learning_rate": 7.055423013396953e-06, "loss": 0.3324, "num_input_tokens_seen": 16600640, "step": 7690 }, { "epoch": 1.4121857221508534, "grad_norm": 1.4686157703399658, "learning_rate": 7.0600110111947155e-06, "loss": 0.326, "num_input_tokens_seen": 16610240, "step": 7695 }, { "epoch": 1.4131033217104056, "grad_norm": 1.7510031461715698, "learning_rate": 7.064599008992477e-06, "loss": 0.3098, "num_input_tokens_seen": 16620192, "step": 7700 }, { "epoch": 1.4140209212699577, "grad_norm": 6.065941333770752, "learning_rate": 7.0691870067902365e-06, "loss": 0.4352, "num_input_tokens_seen": 16630048, "step": 7705 }, { "epoch": 1.41493852082951, "grad_norm": 3.4608378410339355, "learning_rate": 7.073775004587998e-06, "loss": 0.358, "num_input_tokens_seen": 16640672, "step": 7710 }, { "epoch": 1.4158561203890623, "grad_norm": 1.598578691482544, "learning_rate": 7.07836300238576e-06, "loss": 0.3102, "num_input_tokens_seen": 16650944, "step": 7715 }, { "epoch": 1.4167737199486146, "grad_norm": 1.0618685483932495, "learning_rate": 7.08295100018352e-06, "loss": 0.2984, "num_input_tokens_seen": 16661984, "step": 7720 }, { "epoch": 1.4176913195081666, "grad_norm": 8.746587753295898, "learning_rate": 7.087538997981281e-06, "loss": 0.4217, "num_input_tokens_seen": 16672032, "step": 7725 }, { "epoch": 1.418608919067719, "grad_norm": 7.32619047164917, "learning_rate": 7.0921269957790425e-06, "loss": 0.4488, "num_input_tokens_seen": 16682208, "step": 7730 }, { "epoch": 1.4195265186272712, "grad_norm": 11.169187545776367, "learning_rate": 7.096714993576803e-06, "loss": 0.3883, "num_input_tokens_seen": 16693504, "step": 7735 }, { "epoch": 1.4204441181868233, "grad_norm": 11.741860389709473, "learning_rate": 7.101302991374564e-06, "loss": 0.3911, "num_input_tokens_seen": 16704896, "step": 7740 }, { "epoch": 1.4213617177463755, "grad_norm": 2.6401913166046143, "learning_rate": 7.105890989172326e-06, "loss": 0.3679, "num_input_tokens_seen": 16715168, "step": 7745 }, { "epoch": 1.4222793173059278, "grad_norm": 3.0341384410858154, "learning_rate": 7.110478986970086e-06, "loss": 0.3252, "num_input_tokens_seen": 16724960, "step": 7750 }, { "epoch": 1.4231969168654799, "grad_norm": 5.023054599761963, "learning_rate": 7.115066984767848e-06, "loss": 0.2637, "num_input_tokens_seen": 16735552, "step": 7755 }, { "epoch": 1.4241145164250322, "grad_norm": 6.679210662841797, "learning_rate": 7.119654982565609e-06, "loss": 0.2893, "num_input_tokens_seen": 16747264, "step": 7760 }, { "epoch": 1.4250321159845845, "grad_norm": 9.233141899108887, "learning_rate": 7.1242429803633695e-06, "loss": 0.4003, "num_input_tokens_seen": 16758272, "step": 7765 }, { "epoch": 1.4259497155441365, "grad_norm": 1.6395756006240845, "learning_rate": 7.128830978161131e-06, "loss": 0.2602, "num_input_tokens_seen": 16769440, "step": 7770 }, { "epoch": 1.4268673151036888, "grad_norm": 1.239335060119629, "learning_rate": 7.133418975958892e-06, "loss": 0.3151, "num_input_tokens_seen": 16780960, "step": 7775 }, { "epoch": 1.427784914663241, "grad_norm": 2.122218132019043, "learning_rate": 7.138006973756653e-06, "loss": 0.259, "num_input_tokens_seen": 16791648, "step": 7780 }, { "epoch": 1.4287025142227932, "grad_norm": 3.5620434284210205, "learning_rate": 7.142594971554414e-06, "loss": 0.3176, "num_input_tokens_seen": 16800640, "step": 7785 }, { "epoch": 1.4296201137823454, "grad_norm": 4.516326904296875, "learning_rate": 7.1471829693521754e-06, "loss": 0.3703, "num_input_tokens_seen": 16811520, "step": 7790 }, { "epoch": 1.4305377133418977, "grad_norm": 2.482617139816284, "learning_rate": 7.151770967149937e-06, "loss": 0.3323, "num_input_tokens_seen": 16821760, "step": 7795 }, { "epoch": 1.4314553129014498, "grad_norm": 5.756852626800537, "learning_rate": 7.156358964947697e-06, "loss": 0.347, "num_input_tokens_seen": 16832960, "step": 7800 }, { "epoch": 1.432372912461002, "grad_norm": 14.536664009094238, "learning_rate": 7.160946962745459e-06, "loss": 0.3571, "num_input_tokens_seen": 16842496, "step": 7805 }, { "epoch": 1.4332905120205544, "grad_norm": 7.3390374183654785, "learning_rate": 7.16553496054322e-06, "loss": 0.36, "num_input_tokens_seen": 16854016, "step": 7810 }, { "epoch": 1.4342081115801064, "grad_norm": 3.7133312225341797, "learning_rate": 7.1701229583409806e-06, "loss": 0.3878, "num_input_tokens_seen": 16863552, "step": 7815 }, { "epoch": 1.4351257111396587, "grad_norm": 3.9044747352600098, "learning_rate": 7.174710956138742e-06, "loss": 0.291, "num_input_tokens_seen": 16874336, "step": 7820 }, { "epoch": 1.436043310699211, "grad_norm": 2.9662508964538574, "learning_rate": 7.179298953936503e-06, "loss": 0.352, "num_input_tokens_seen": 16884384, "step": 7825 }, { "epoch": 1.436960910258763, "grad_norm": 2.2783827781677246, "learning_rate": 7.183886951734264e-06, "loss": 0.3585, "num_input_tokens_seen": 16895456, "step": 7830 }, { "epoch": 1.4378785098183153, "grad_norm": 4.237573146820068, "learning_rate": 7.188474949532025e-06, "loss": 0.3435, "num_input_tokens_seen": 16907072, "step": 7835 }, { "epoch": 1.4387961093778676, "grad_norm": 3.0250070095062256, "learning_rate": 7.1930629473297865e-06, "loss": 0.3248, "num_input_tokens_seen": 16916928, "step": 7840 }, { "epoch": 1.4397137089374197, "grad_norm": 2.585361957550049, "learning_rate": 7.197650945127547e-06, "loss": 0.2991, "num_input_tokens_seen": 16927232, "step": 7845 }, { "epoch": 1.440631308496972, "grad_norm": 5.288979530334473, "learning_rate": 7.202238942925308e-06, "loss": 0.3848, "num_input_tokens_seen": 16937088, "step": 7850 }, { "epoch": 1.4415489080565242, "grad_norm": 6.590927600860596, "learning_rate": 7.20682694072307e-06, "loss": 0.3418, "num_input_tokens_seen": 16948896, "step": 7855 }, { "epoch": 1.4424665076160763, "grad_norm": 5.1812357902526855, "learning_rate": 7.21141493852083e-06, "loss": 0.2947, "num_input_tokens_seen": 16959584, "step": 7860 }, { "epoch": 1.4433841071756286, "grad_norm": 4.880795001983643, "learning_rate": 7.216002936318592e-06, "loss": 0.3903, "num_input_tokens_seen": 16970048, "step": 7865 }, { "epoch": 1.4443017067351809, "grad_norm": 1.3027911186218262, "learning_rate": 7.220590934116353e-06, "loss": 0.3491, "num_input_tokens_seen": 16980992, "step": 7870 }, { "epoch": 1.445219306294733, "grad_norm": 3.29948353767395, "learning_rate": 7.225178931914113e-06, "loss": 0.3514, "num_input_tokens_seen": 16993056, "step": 7875 }, { "epoch": 1.4461369058542852, "grad_norm": 2.1754112243652344, "learning_rate": 7.229766929711874e-06, "loss": 0.2779, "num_input_tokens_seen": 17003776, "step": 7880 }, { "epoch": 1.4470545054138375, "grad_norm": 2.1414685249328613, "learning_rate": 7.234354927509636e-06, "loss": 0.3577, "num_input_tokens_seen": 17014624, "step": 7885 }, { "epoch": 1.4479721049733896, "grad_norm": 5.924065113067627, "learning_rate": 7.238942925307396e-06, "loss": 0.367, "num_input_tokens_seen": 17026016, "step": 7890 }, { "epoch": 1.4488897045329419, "grad_norm": 3.0072853565216064, "learning_rate": 7.243530923105157e-06, "loss": 0.2613, "num_input_tokens_seen": 17036480, "step": 7895 }, { "epoch": 1.4498073040924941, "grad_norm": 3.7449681758880615, "learning_rate": 7.248118920902919e-06, "loss": 0.2906, "num_input_tokens_seen": 17046784, "step": 7900 }, { "epoch": 1.4507249036520462, "grad_norm": 1.3532999753952026, "learning_rate": 7.252706918700679e-06, "loss": 0.2749, "num_input_tokens_seen": 17057120, "step": 7905 }, { "epoch": 1.4516425032115985, "grad_norm": 4.570882797241211, "learning_rate": 7.2572949164984405e-06, "loss": 0.2236, "num_input_tokens_seen": 17068384, "step": 7910 }, { "epoch": 1.4525601027711508, "grad_norm": 3.5611584186553955, "learning_rate": 7.261882914296202e-06, "loss": 0.4228, "num_input_tokens_seen": 17079424, "step": 7915 }, { "epoch": 1.4534777023307028, "grad_norm": 9.682060241699219, "learning_rate": 7.266470912093962e-06, "loss": 0.4672, "num_input_tokens_seen": 17091136, "step": 7920 }, { "epoch": 1.4543953018902551, "grad_norm": 6.170392990112305, "learning_rate": 7.271058909891724e-06, "loss": 0.425, "num_input_tokens_seen": 17102048, "step": 7925 }, { "epoch": 1.4553129014498074, "grad_norm": 1.93644118309021, "learning_rate": 7.275646907689485e-06, "loss": 0.4334, "num_input_tokens_seen": 17113152, "step": 7930 }, { "epoch": 1.4562305010093595, "grad_norm": 6.902830600738525, "learning_rate": 7.280234905487246e-06, "loss": 0.3576, "num_input_tokens_seen": 17124416, "step": 7935 }, { "epoch": 1.4571481005689118, "grad_norm": 2.9543709754943848, "learning_rate": 7.284822903285007e-06, "loss": 0.3305, "num_input_tokens_seen": 17135136, "step": 7940 }, { "epoch": 1.458065700128464, "grad_norm": 1.151971697807312, "learning_rate": 7.289410901082768e-06, "loss": 0.284, "num_input_tokens_seen": 17146144, "step": 7945 }, { "epoch": 1.458983299688016, "grad_norm": 1.4877676963806152, "learning_rate": 7.293998898880529e-06, "loss": 0.2888, "num_input_tokens_seen": 17156288, "step": 7950 }, { "epoch": 1.4599008992475684, "grad_norm": 7.788760185241699, "learning_rate": 7.29858689667829e-06, "loss": 0.3055, "num_input_tokens_seen": 17167968, "step": 7955 }, { "epoch": 1.4608184988071207, "grad_norm": 3.403038263320923, "learning_rate": 7.3031748944760516e-06, "loss": 0.4323, "num_input_tokens_seen": 17179616, "step": 7960 }, { "epoch": 1.4617360983666727, "grad_norm": 2.1132760047912598, "learning_rate": 7.307762892273812e-06, "loss": 0.3552, "num_input_tokens_seen": 17189440, "step": 7965 }, { "epoch": 1.462653697926225, "grad_norm": 3.221550464630127, "learning_rate": 7.3123508900715735e-06, "loss": 0.2587, "num_input_tokens_seen": 17200864, "step": 7970 }, { "epoch": 1.4635712974857773, "grad_norm": 4.737313270568848, "learning_rate": 7.316938887869335e-06, "loss": 0.3683, "num_input_tokens_seen": 17211776, "step": 7975 }, { "epoch": 1.4644888970453294, "grad_norm": 6.430923938751221, "learning_rate": 7.321526885667095e-06, "loss": 0.3345, "num_input_tokens_seen": 17222656, "step": 7980 }, { "epoch": 1.4654064966048816, "grad_norm": 5.696585178375244, "learning_rate": 7.326114883464857e-06, "loss": 0.3426, "num_input_tokens_seen": 17231616, "step": 7985 }, { "epoch": 1.466324096164434, "grad_norm": 2.006079912185669, "learning_rate": 7.330702881262618e-06, "loss": 0.3139, "num_input_tokens_seen": 17241472, "step": 7990 }, { "epoch": 1.467241695723986, "grad_norm": 5.278911113739014, "learning_rate": 7.3352908790603786e-06, "loss": 0.3074, "num_input_tokens_seen": 17253216, "step": 7995 }, { "epoch": 1.4681592952835383, "grad_norm": 1.124485969543457, "learning_rate": 7.33987887685814e-06, "loss": 0.333, "num_input_tokens_seen": 17264832, "step": 8000 }, { "epoch": 1.4690768948430906, "grad_norm": 1.420691967010498, "learning_rate": 7.344466874655901e-06, "loss": 0.2879, "num_input_tokens_seen": 17275424, "step": 8005 }, { "epoch": 1.4699944944026426, "grad_norm": 6.021546840667725, "learning_rate": 7.349054872453662e-06, "loss": 0.2948, "num_input_tokens_seen": 17285344, "step": 8010 }, { "epoch": 1.470912093962195, "grad_norm": 2.843137741088867, "learning_rate": 7.353642870251423e-06, "loss": 0.4279, "num_input_tokens_seen": 17295296, "step": 8015 }, { "epoch": 1.4718296935217472, "grad_norm": 10.68410587310791, "learning_rate": 7.3582308680491845e-06, "loss": 0.3453, "num_input_tokens_seen": 17304768, "step": 8020 }, { "epoch": 1.4727472930812993, "grad_norm": 11.153578758239746, "learning_rate": 7.362818865846944e-06, "loss": 0.4547, "num_input_tokens_seen": 17315552, "step": 8025 }, { "epoch": 1.4736648926408515, "grad_norm": 3.3099348545074463, "learning_rate": 7.367406863644706e-06, "loss": 0.4356, "num_input_tokens_seen": 17325696, "step": 8030 }, { "epoch": 1.4745824922004038, "grad_norm": 4.4500532150268555, "learning_rate": 7.371994861442468e-06, "loss": 0.3437, "num_input_tokens_seen": 17335808, "step": 8035 }, { "epoch": 1.475500091759956, "grad_norm": 3.0132174491882324, "learning_rate": 7.3765828592402274e-06, "loss": 0.2925, "num_input_tokens_seen": 17347392, "step": 8040 }, { "epoch": 1.4764176913195082, "grad_norm": 1.8424644470214844, "learning_rate": 7.381170857037989e-06, "loss": 0.319, "num_input_tokens_seen": 17358336, "step": 8045 }, { "epoch": 1.4773352908790605, "grad_norm": 5.05138635635376, "learning_rate": 7.385758854835751e-06, "loss": 0.2703, "num_input_tokens_seen": 17368352, "step": 8050 }, { "epoch": 1.4782528904386125, "grad_norm": 2.1879286766052246, "learning_rate": 7.390346852633511e-06, "loss": 0.3351, "num_input_tokens_seen": 17379360, "step": 8055 }, { "epoch": 1.4791704899981648, "grad_norm": 2.1022188663482666, "learning_rate": 7.394934850431272e-06, "loss": 0.2616, "num_input_tokens_seen": 17390976, "step": 8060 }, { "epoch": 1.480088089557717, "grad_norm": 2.2679824829101562, "learning_rate": 7.399522848229033e-06, "loss": 0.2746, "num_input_tokens_seen": 17400992, "step": 8065 }, { "epoch": 1.4810056891172692, "grad_norm": 6.085402011871338, "learning_rate": 7.404110846026794e-06, "loss": 0.3883, "num_input_tokens_seen": 17411520, "step": 8070 }, { "epoch": 1.4819232886768214, "grad_norm": 2.704296350479126, "learning_rate": 7.408698843824555e-06, "loss": 0.3941, "num_input_tokens_seen": 17422272, "step": 8075 }, { "epoch": 1.4828408882363737, "grad_norm": 1.9068694114685059, "learning_rate": 7.413286841622317e-06, "loss": 0.2197, "num_input_tokens_seen": 17433856, "step": 8080 }, { "epoch": 1.4837584877959258, "grad_norm": 6.503290176391602, "learning_rate": 7.417874839420077e-06, "loss": 0.3604, "num_input_tokens_seen": 17444288, "step": 8085 }, { "epoch": 1.484676087355478, "grad_norm": 4.084494113922119, "learning_rate": 7.4224628372178385e-06, "loss": 0.3941, "num_input_tokens_seen": 17454016, "step": 8090 }, { "epoch": 1.4855936869150304, "grad_norm": 3.8290295600891113, "learning_rate": 7.4270508350156e-06, "loss": 0.3399, "num_input_tokens_seen": 17463040, "step": 8095 }, { "epoch": 1.4865112864745824, "grad_norm": 3.4445087909698486, "learning_rate": 7.43163883281336e-06, "loss": 0.3932, "num_input_tokens_seen": 17473344, "step": 8100 }, { "epoch": 1.4874288860341347, "grad_norm": 5.46002197265625, "learning_rate": 7.436226830611122e-06, "loss": 0.3484, "num_input_tokens_seen": 17484704, "step": 8105 }, { "epoch": 1.488346485593687, "grad_norm": 3.6438100337982178, "learning_rate": 7.440814828408883e-06, "loss": 0.3105, "num_input_tokens_seen": 17495520, "step": 8110 }, { "epoch": 1.489264085153239, "grad_norm": 3.7111451625823975, "learning_rate": 7.445402826206644e-06, "loss": 0.373, "num_input_tokens_seen": 17508096, "step": 8115 }, { "epoch": 1.4901816847127913, "grad_norm": 1.4713786840438843, "learning_rate": 7.449990824004405e-06, "loss": 0.3153, "num_input_tokens_seen": 17519552, "step": 8120 }, { "epoch": 1.4910992842723436, "grad_norm": 1.6796411275863647, "learning_rate": 7.454578821802166e-06, "loss": 0.3319, "num_input_tokens_seen": 17529600, "step": 8125 }, { "epoch": 1.4920168838318957, "grad_norm": 1.5508838891983032, "learning_rate": 7.459166819599927e-06, "loss": 0.3493, "num_input_tokens_seen": 17540064, "step": 8130 }, { "epoch": 1.492934483391448, "grad_norm": 2.3884222507476807, "learning_rate": 7.463754817397688e-06, "loss": 0.2961, "num_input_tokens_seen": 17549888, "step": 8135 }, { "epoch": 1.4938520829510002, "grad_norm": 10.392287254333496, "learning_rate": 7.46834281519545e-06, "loss": 0.2958, "num_input_tokens_seen": 17560096, "step": 8140 }, { "epoch": 1.4947696825105523, "grad_norm": 10.407519340515137, "learning_rate": 7.47293081299321e-06, "loss": 0.387, "num_input_tokens_seen": 17570976, "step": 8145 }, { "epoch": 1.4956872820701046, "grad_norm": 5.612729072570801, "learning_rate": 7.4775188107909715e-06, "loss": 0.3309, "num_input_tokens_seen": 17581408, "step": 8150 }, { "epoch": 1.4966048816296569, "grad_norm": 2.3024537563323975, "learning_rate": 7.482106808588733e-06, "loss": 0.2659, "num_input_tokens_seen": 17592864, "step": 8155 }, { "epoch": 1.497522481189209, "grad_norm": 12.95432186126709, "learning_rate": 7.486694806386493e-06, "loss": 0.3113, "num_input_tokens_seen": 17603424, "step": 8160 }, { "epoch": 1.4984400807487612, "grad_norm": 3.8270926475524902, "learning_rate": 7.491282804184255e-06, "loss": 0.3218, "num_input_tokens_seen": 17614784, "step": 8165 }, { "epoch": 1.4993576803083135, "grad_norm": 4.027288913726807, "learning_rate": 7.495870801982016e-06, "loss": 0.2756, "num_input_tokens_seen": 17624512, "step": 8170 }, { "epoch": 1.5002752798678656, "grad_norm": 4.9556708335876465, "learning_rate": 7.500458799779777e-06, "loss": 0.3455, "num_input_tokens_seen": 17634016, "step": 8175 }, { "epoch": 1.5011928794274179, "grad_norm": 2.44948410987854, "learning_rate": 7.505046797577538e-06, "loss": 0.319, "num_input_tokens_seen": 17644608, "step": 8180 }, { "epoch": 1.5021104789869701, "grad_norm": 1.58460533618927, "learning_rate": 7.509634795375299e-06, "loss": 0.3843, "num_input_tokens_seen": 17656128, "step": 8185 }, { "epoch": 1.5030280785465222, "grad_norm": 2.0267322063446045, "learning_rate": 7.514222793173059e-06, "loss": 0.2806, "num_input_tokens_seen": 17667296, "step": 8190 }, { "epoch": 1.5039456781060745, "grad_norm": 3.042285680770874, "learning_rate": 7.518810790970821e-06, "loss": 0.3864, "num_input_tokens_seen": 17679168, "step": 8195 }, { "epoch": 1.5048632776656268, "grad_norm": 5.2724456787109375, "learning_rate": 7.5233987887685825e-06, "loss": 0.272, "num_input_tokens_seen": 17689280, "step": 8200 }, { "epoch": 1.5057808772251788, "grad_norm": 2.6074602603912354, "learning_rate": 7.527986786566342e-06, "loss": 0.4174, "num_input_tokens_seen": 17700704, "step": 8205 }, { "epoch": 1.5066984767847311, "grad_norm": 5.482270240783691, "learning_rate": 7.5325747843641036e-06, "loss": 0.3215, "num_input_tokens_seen": 17712064, "step": 8210 }, { "epoch": 1.5076160763442834, "grad_norm": 7.005738258361816, "learning_rate": 7.537162782161866e-06, "loss": 0.376, "num_input_tokens_seen": 17723584, "step": 8215 }, { "epoch": 1.5085336759038355, "grad_norm": 1.2882661819458008, "learning_rate": 7.541750779959627e-06, "loss": 0.3927, "num_input_tokens_seen": 17735328, "step": 8220 }, { "epoch": 1.5094512754633878, "grad_norm": 1.9694465398788452, "learning_rate": 7.546338777757387e-06, "loss": 0.3772, "num_input_tokens_seen": 17745216, "step": 8225 }, { "epoch": 1.51036887502294, "grad_norm": 3.923214912414551, "learning_rate": 7.550926775555148e-06, "loss": 0.3128, "num_input_tokens_seen": 17756608, "step": 8230 }, { "epoch": 1.511286474582492, "grad_norm": 3.0409488677978516, "learning_rate": 7.5555147733529095e-06, "loss": 0.3609, "num_input_tokens_seen": 17767104, "step": 8235 }, { "epoch": 1.5122040741420444, "grad_norm": 2.0048792362213135, "learning_rate": 7.56010277115067e-06, "loss": 0.3362, "num_input_tokens_seen": 17777792, "step": 8240 }, { "epoch": 1.5131216737015967, "grad_norm": 6.232755661010742, "learning_rate": 7.564690768948431e-06, "loss": 0.3692, "num_input_tokens_seen": 17789184, "step": 8245 }, { "epoch": 1.5140392732611487, "grad_norm": 2.8217992782592773, "learning_rate": 7.569278766746193e-06, "loss": 0.3308, "num_input_tokens_seen": 17800384, "step": 8250 }, { "epoch": 1.514956872820701, "grad_norm": 2.3239762783050537, "learning_rate": 7.573866764543953e-06, "loss": 0.3146, "num_input_tokens_seen": 17810656, "step": 8255 }, { "epoch": 1.5158744723802533, "grad_norm": 1.981149435043335, "learning_rate": 7.578454762341715e-06, "loss": 0.3849, "num_input_tokens_seen": 17820864, "step": 8260 }, { "epoch": 1.5167920719398054, "grad_norm": 1.2892353534698486, "learning_rate": 7.583042760139476e-06, "loss": 0.2508, "num_input_tokens_seen": 17830368, "step": 8265 }, { "epoch": 1.5177096714993576, "grad_norm": 2.119483470916748, "learning_rate": 7.5876307579372365e-06, "loss": 0.3223, "num_input_tokens_seen": 17841184, "step": 8270 }, { "epoch": 1.51862727105891, "grad_norm": 1.1383414268493652, "learning_rate": 7.592218755734998e-06, "loss": 0.3204, "num_input_tokens_seen": 17851264, "step": 8275 }, { "epoch": 1.519544870618462, "grad_norm": 2.0764851570129395, "learning_rate": 7.596806753532759e-06, "loss": 0.3538, "num_input_tokens_seen": 17861696, "step": 8280 }, { "epoch": 1.5204624701780143, "grad_norm": 1.801366925239563, "learning_rate": 7.60139475133052e-06, "loss": 0.377, "num_input_tokens_seen": 17872160, "step": 8285 }, { "epoch": 1.5213800697375666, "grad_norm": 2.007826805114746, "learning_rate": 7.605982749128281e-06, "loss": 0.3348, "num_input_tokens_seen": 17882912, "step": 8290 }, { "epoch": 1.5222976692971186, "grad_norm": 3.2606966495513916, "learning_rate": 7.6105707469260425e-06, "loss": 0.3048, "num_input_tokens_seen": 17893120, "step": 8295 }, { "epoch": 1.523215268856671, "grad_norm": 2.507507562637329, "learning_rate": 7.615158744723803e-06, "loss": 0.3239, "num_input_tokens_seen": 17904576, "step": 8300 }, { "epoch": 1.5241328684162232, "grad_norm": 1.5200961828231812, "learning_rate": 7.619746742521564e-06, "loss": 0.3621, "num_input_tokens_seen": 17916192, "step": 8305 }, { "epoch": 1.5250504679757753, "grad_norm": 4.882346153259277, "learning_rate": 7.624334740319326e-06, "loss": 0.3086, "num_input_tokens_seen": 17927040, "step": 8310 }, { "epoch": 1.5259680675353275, "grad_norm": 1.5825912952423096, "learning_rate": 7.628922738117086e-06, "loss": 0.3194, "num_input_tokens_seen": 17936864, "step": 8315 }, { "epoch": 1.5268856670948798, "grad_norm": 2.6444814205169678, "learning_rate": 7.633510735914847e-06, "loss": 0.322, "num_input_tokens_seen": 17948160, "step": 8320 }, { "epoch": 1.527803266654432, "grad_norm": 2.709249973297119, "learning_rate": 7.638098733712609e-06, "loss": 0.281, "num_input_tokens_seen": 17958976, "step": 8325 }, { "epoch": 1.5287208662139842, "grad_norm": 4.274721145629883, "learning_rate": 7.64268673151037e-06, "loss": 0.2878, "num_input_tokens_seen": 17968896, "step": 8330 }, { "epoch": 1.5296384657735365, "grad_norm": 1.2876085042953491, "learning_rate": 7.64727472930813e-06, "loss": 0.2972, "num_input_tokens_seen": 17979776, "step": 8335 }, { "epoch": 1.5305560653330885, "grad_norm": 3.2133781909942627, "learning_rate": 7.651862727105892e-06, "loss": 0.506, "num_input_tokens_seen": 17990080, "step": 8340 }, { "epoch": 1.5314736648926408, "grad_norm": 1.319056510925293, "learning_rate": 7.656450724903653e-06, "loss": 0.3413, "num_input_tokens_seen": 18001248, "step": 8345 }, { "epoch": 1.532391264452193, "grad_norm": 0.9521996974945068, "learning_rate": 7.661038722701413e-06, "loss": 0.3405, "num_input_tokens_seen": 18011264, "step": 8350 }, { "epoch": 1.5333088640117452, "grad_norm": 3.025357246398926, "learning_rate": 7.665626720499175e-06, "loss": 0.4085, "num_input_tokens_seen": 18021952, "step": 8355 }, { "epoch": 1.5342264635712974, "grad_norm": 3.717257261276245, "learning_rate": 7.670214718296936e-06, "loss": 0.2766, "num_input_tokens_seen": 18032544, "step": 8360 }, { "epoch": 1.5351440631308497, "grad_norm": 1.142805814743042, "learning_rate": 7.674802716094696e-06, "loss": 0.3014, "num_input_tokens_seen": 18043552, "step": 8365 }, { "epoch": 1.5360616626904018, "grad_norm": 11.29027271270752, "learning_rate": 7.679390713892459e-06, "loss": 0.3797, "num_input_tokens_seen": 18054496, "step": 8370 }, { "epoch": 1.536979262249954, "grad_norm": 1.5454320907592773, "learning_rate": 7.68397871169022e-06, "loss": 0.3484, "num_input_tokens_seen": 18063904, "step": 8375 }, { "epoch": 1.5378968618095064, "grad_norm": 2.0621438026428223, "learning_rate": 7.68856670948798e-06, "loss": 0.321, "num_input_tokens_seen": 18074816, "step": 8380 }, { "epoch": 1.5388144613690584, "grad_norm": 5.185544490814209, "learning_rate": 7.693154707285742e-06, "loss": 0.2789, "num_input_tokens_seen": 18085152, "step": 8385 }, { "epoch": 1.5397320609286107, "grad_norm": 4.791853427886963, "learning_rate": 7.697742705083502e-06, "loss": 0.3033, "num_input_tokens_seen": 18096608, "step": 8390 }, { "epoch": 1.540649660488163, "grad_norm": 2.016822576522827, "learning_rate": 7.702330702881263e-06, "loss": 0.2932, "num_input_tokens_seen": 18107904, "step": 8395 }, { "epoch": 1.541567260047715, "grad_norm": 2.724693775177002, "learning_rate": 7.706918700679025e-06, "loss": 0.4956, "num_input_tokens_seen": 18118816, "step": 8400 }, { "epoch": 1.5424848596072673, "grad_norm": 1.4131104946136475, "learning_rate": 7.711506698476786e-06, "loss": 0.2951, "num_input_tokens_seen": 18129888, "step": 8405 }, { "epoch": 1.5434024591668196, "grad_norm": 13.98045539855957, "learning_rate": 7.716094696274546e-06, "loss": 0.4198, "num_input_tokens_seen": 18139616, "step": 8410 }, { "epoch": 1.5443200587263717, "grad_norm": 3.9189956188201904, "learning_rate": 7.720682694072308e-06, "loss": 0.3627, "num_input_tokens_seen": 18150368, "step": 8415 }, { "epoch": 1.5452376582859242, "grad_norm": 1.9570566415786743, "learning_rate": 7.725270691870069e-06, "loss": 0.3709, "num_input_tokens_seen": 18161760, "step": 8420 }, { "epoch": 1.5461552578454762, "grad_norm": 11.239961624145508, "learning_rate": 7.72985868966783e-06, "loss": 0.3878, "num_input_tokens_seen": 18172256, "step": 8425 }, { "epoch": 1.5470728574050283, "grad_norm": 2.0869290828704834, "learning_rate": 7.734446687465592e-06, "loss": 0.3151, "num_input_tokens_seen": 18182816, "step": 8430 }, { "epoch": 1.5479904569645808, "grad_norm": 0.8435831665992737, "learning_rate": 7.739034685263352e-06, "loss": 0.3271, "num_input_tokens_seen": 18194528, "step": 8435 }, { "epoch": 1.5489080565241329, "grad_norm": 0.8674530982971191, "learning_rate": 7.743622683061113e-06, "loss": 0.3215, "num_input_tokens_seen": 18205664, "step": 8440 }, { "epoch": 1.549825656083685, "grad_norm": 3.280893564224243, "learning_rate": 7.748210680858875e-06, "loss": 0.333, "num_input_tokens_seen": 18215488, "step": 8445 }, { "epoch": 1.5507432556432375, "grad_norm": 1.6826907396316528, "learning_rate": 7.752798678656634e-06, "loss": 0.3694, "num_input_tokens_seen": 18226016, "step": 8450 }, { "epoch": 1.5516608552027895, "grad_norm": 2.71687650680542, "learning_rate": 7.757386676454396e-06, "loss": 0.3112, "num_input_tokens_seen": 18236800, "step": 8455 }, { "epoch": 1.5525784547623416, "grad_norm": 3.514909029006958, "learning_rate": 7.761974674252158e-06, "loss": 0.313, "num_input_tokens_seen": 18247808, "step": 8460 }, { "epoch": 1.553496054321894, "grad_norm": 7.533764839172363, "learning_rate": 7.766562672049917e-06, "loss": 0.3325, "num_input_tokens_seen": 18258176, "step": 8465 }, { "epoch": 1.5544136538814461, "grad_norm": 3.3617560863494873, "learning_rate": 7.771150669847679e-06, "loss": 0.3284, "num_input_tokens_seen": 18268192, "step": 8470 }, { "epoch": 1.5553312534409982, "grad_norm": 7.392329216003418, "learning_rate": 7.77573866764544e-06, "loss": 0.3259, "num_input_tokens_seen": 18278528, "step": 8475 }, { "epoch": 1.5562488530005507, "grad_norm": 10.861474990844727, "learning_rate": 7.7803266654432e-06, "loss": 0.3966, "num_input_tokens_seen": 18289600, "step": 8480 }, { "epoch": 1.5571664525601028, "grad_norm": 15.731416702270508, "learning_rate": 7.784914663240962e-06, "loss": 0.4006, "num_input_tokens_seen": 18299520, "step": 8485 }, { "epoch": 1.5580840521196548, "grad_norm": 7.732333660125732, "learning_rate": 7.789502661038723e-06, "loss": 0.3069, "num_input_tokens_seen": 18311392, "step": 8490 }, { "epoch": 1.5590016516792073, "grad_norm": 1.429889440536499, "learning_rate": 7.794090658836483e-06, "loss": 0.3426, "num_input_tokens_seen": 18322080, "step": 8495 }, { "epoch": 1.5599192512387594, "grad_norm": 2.7737104892730713, "learning_rate": 7.798678656634246e-06, "loss": 0.2894, "num_input_tokens_seen": 18332512, "step": 8500 }, { "epoch": 1.5608368507983115, "grad_norm": 0.9810894727706909, "learning_rate": 7.803266654432006e-06, "loss": 0.2938, "num_input_tokens_seen": 18343424, "step": 8505 }, { "epoch": 1.561754450357864, "grad_norm": 1.4552369117736816, "learning_rate": 7.807854652229767e-06, "loss": 0.4102, "num_input_tokens_seen": 18353056, "step": 8510 }, { "epoch": 1.562672049917416, "grad_norm": 3.0501019954681396, "learning_rate": 7.812442650027529e-06, "loss": 0.3217, "num_input_tokens_seen": 18364448, "step": 8515 }, { "epoch": 1.563589649476968, "grad_norm": 2.3228840827941895, "learning_rate": 7.81703064782529e-06, "loss": 0.354, "num_input_tokens_seen": 18374496, "step": 8520 }, { "epoch": 1.5645072490365206, "grad_norm": 1.2313839197158813, "learning_rate": 7.82161864562305e-06, "loss": 0.3195, "num_input_tokens_seen": 18384096, "step": 8525 }, { "epoch": 1.5654248485960727, "grad_norm": 2.1874585151672363, "learning_rate": 7.826206643420812e-06, "loss": 0.2993, "num_input_tokens_seen": 18395168, "step": 8530 }, { "epoch": 1.5663424481556247, "grad_norm": 1.3571879863739014, "learning_rate": 7.830794641218573e-06, "loss": 0.3995, "num_input_tokens_seen": 18405952, "step": 8535 }, { "epoch": 1.5672600477151772, "grad_norm": 1.6849838495254517, "learning_rate": 7.835382639016333e-06, "loss": 0.2962, "num_input_tokens_seen": 18417504, "step": 8540 }, { "epoch": 1.5681776472747293, "grad_norm": 3.7830986976623535, "learning_rate": 7.839970636814095e-06, "loss": 0.3243, "num_input_tokens_seen": 18428352, "step": 8545 }, { "epoch": 1.5690952468342814, "grad_norm": 4.4129533767700195, "learning_rate": 7.844558634611856e-06, "loss": 0.3456, "num_input_tokens_seen": 18438816, "step": 8550 }, { "epoch": 1.5700128463938339, "grad_norm": 2.2293379306793213, "learning_rate": 7.849146632409616e-06, "loss": 0.3103, "num_input_tokens_seen": 18450336, "step": 8555 }, { "epoch": 1.570930445953386, "grad_norm": 3.7739906311035156, "learning_rate": 7.853734630207379e-06, "loss": 0.327, "num_input_tokens_seen": 18460832, "step": 8560 }, { "epoch": 1.571848045512938, "grad_norm": 1.8145062923431396, "learning_rate": 7.858322628005139e-06, "loss": 0.3197, "num_input_tokens_seen": 18470720, "step": 8565 }, { "epoch": 1.5727656450724905, "grad_norm": 1.541182279586792, "learning_rate": 7.8629106258029e-06, "loss": 0.5611, "num_input_tokens_seen": 18480992, "step": 8570 }, { "epoch": 1.5736832446320426, "grad_norm": 5.26710319519043, "learning_rate": 7.867498623600662e-06, "loss": 0.2659, "num_input_tokens_seen": 18492320, "step": 8575 }, { "epoch": 1.5746008441915946, "grad_norm": 3.007094144821167, "learning_rate": 7.872086621398422e-06, "loss": 0.3146, "num_input_tokens_seen": 18503168, "step": 8580 }, { "epoch": 1.5755184437511471, "grad_norm": 2.0664143562316895, "learning_rate": 7.876674619196183e-06, "loss": 0.2634, "num_input_tokens_seen": 18512800, "step": 8585 }, { "epoch": 1.5764360433106992, "grad_norm": 5.267702102661133, "learning_rate": 7.881262616993945e-06, "loss": 0.3148, "num_input_tokens_seen": 18524096, "step": 8590 }, { "epoch": 1.5773536428702513, "grad_norm": 11.319408416748047, "learning_rate": 7.885850614791706e-06, "loss": 0.4781, "num_input_tokens_seen": 18535904, "step": 8595 }, { "epoch": 1.5782712424298038, "grad_norm": 13.709735870361328, "learning_rate": 7.890438612589466e-06, "loss": 0.376, "num_input_tokens_seen": 18546144, "step": 8600 }, { "epoch": 1.5791888419893558, "grad_norm": 3.4019229412078857, "learning_rate": 7.895026610387228e-06, "loss": 0.5303, "num_input_tokens_seen": 18557568, "step": 8605 }, { "epoch": 1.580106441548908, "grad_norm": 6.522906303405762, "learning_rate": 7.899614608184989e-06, "loss": 0.3631, "num_input_tokens_seen": 18568192, "step": 8610 }, { "epoch": 1.5810240411084604, "grad_norm": 2.0131750106811523, "learning_rate": 7.90420260598275e-06, "loss": 0.3464, "num_input_tokens_seen": 18578176, "step": 8615 }, { "epoch": 1.5819416406680125, "grad_norm": 1.6943165063858032, "learning_rate": 7.90879060378051e-06, "loss": 0.3252, "num_input_tokens_seen": 18589792, "step": 8620 }, { "epoch": 1.5828592402275647, "grad_norm": 3.471348285675049, "learning_rate": 7.913378601578272e-06, "loss": 0.3387, "num_input_tokens_seen": 18599936, "step": 8625 }, { "epoch": 1.583776839787117, "grad_norm": 1.7979843616485596, "learning_rate": 7.917966599376033e-06, "loss": 0.3574, "num_input_tokens_seen": 18611616, "step": 8630 }, { "epoch": 1.584694439346669, "grad_norm": 4.289613723754883, "learning_rate": 7.922554597173793e-06, "loss": 0.3657, "num_input_tokens_seen": 18622496, "step": 8635 }, { "epoch": 1.5856120389062214, "grad_norm": 3.2766029834747314, "learning_rate": 7.927142594971555e-06, "loss": 0.3062, "num_input_tokens_seen": 18632960, "step": 8640 }, { "epoch": 1.5865296384657737, "grad_norm": 2.010075330734253, "learning_rate": 7.931730592769317e-06, "loss": 0.3425, "num_input_tokens_seen": 18645632, "step": 8645 }, { "epoch": 1.5874472380253257, "grad_norm": 7.4092698097229, "learning_rate": 7.936318590567076e-06, "loss": 0.3528, "num_input_tokens_seen": 18657280, "step": 8650 }, { "epoch": 1.588364837584878, "grad_norm": 3.091728925704956, "learning_rate": 7.940906588364839e-06, "loss": 0.3629, "num_input_tokens_seen": 18668640, "step": 8655 }, { "epoch": 1.5892824371444303, "grad_norm": 3.4529309272766113, "learning_rate": 7.945494586162599e-06, "loss": 0.3026, "num_input_tokens_seen": 18678944, "step": 8660 }, { "epoch": 1.5902000367039824, "grad_norm": 4.276089191436768, "learning_rate": 7.95008258396036e-06, "loss": 0.3663, "num_input_tokens_seen": 18690592, "step": 8665 }, { "epoch": 1.5911176362635346, "grad_norm": 1.6028598546981812, "learning_rate": 7.954670581758122e-06, "loss": 0.3434, "num_input_tokens_seen": 18700288, "step": 8670 }, { "epoch": 1.592035235823087, "grad_norm": 2.674983024597168, "learning_rate": 7.959258579555882e-06, "loss": 0.3356, "num_input_tokens_seen": 18710816, "step": 8675 }, { "epoch": 1.592952835382639, "grad_norm": 2.9686291217803955, "learning_rate": 7.963846577353643e-06, "loss": 0.3208, "num_input_tokens_seen": 18721344, "step": 8680 }, { "epoch": 1.5938704349421913, "grad_norm": 1.693164348602295, "learning_rate": 7.968434575151405e-06, "loss": 0.349, "num_input_tokens_seen": 18732608, "step": 8685 }, { "epoch": 1.5947880345017436, "grad_norm": 2.8135266304016113, "learning_rate": 7.973022572949166e-06, "loss": 0.2697, "num_input_tokens_seen": 18743808, "step": 8690 }, { "epoch": 1.5957056340612956, "grad_norm": 4.963754653930664, "learning_rate": 7.977610570746926e-06, "loss": 0.3185, "num_input_tokens_seen": 18755008, "step": 8695 }, { "epoch": 1.596623233620848, "grad_norm": 2.1486597061157227, "learning_rate": 7.982198568544688e-06, "loss": 0.384, "num_input_tokens_seen": 18766336, "step": 8700 }, { "epoch": 1.5975408331804002, "grad_norm": 1.831447958946228, "learning_rate": 7.986786566342449e-06, "loss": 0.2925, "num_input_tokens_seen": 18776576, "step": 8705 }, { "epoch": 1.5984584327399523, "grad_norm": 1.0496139526367188, "learning_rate": 7.99137456414021e-06, "loss": 0.3168, "num_input_tokens_seen": 18787296, "step": 8710 }, { "epoch": 1.5993760322995045, "grad_norm": 1.4183610677719116, "learning_rate": 7.995962561937971e-06, "loss": 0.2936, "num_input_tokens_seen": 18796928, "step": 8715 }, { "epoch": 1.6002936318590568, "grad_norm": 1.953601598739624, "learning_rate": 8.000550559735732e-06, "loss": 0.2992, "num_input_tokens_seen": 18807744, "step": 8720 }, { "epoch": 1.6012112314186089, "grad_norm": 3.194197654724121, "learning_rate": 8.005138557533492e-06, "loss": 0.3748, "num_input_tokens_seen": 18818400, "step": 8725 }, { "epoch": 1.6021288309781612, "grad_norm": 7.558193683624268, "learning_rate": 8.009726555331255e-06, "loss": 0.3861, "num_input_tokens_seen": 18831104, "step": 8730 }, { "epoch": 1.6030464305377135, "grad_norm": 7.163750648498535, "learning_rate": 8.014314553129015e-06, "loss": 0.3536, "num_input_tokens_seen": 18842048, "step": 8735 }, { "epoch": 1.6039640300972655, "grad_norm": 3.6226580142974854, "learning_rate": 8.018902550926776e-06, "loss": 0.348, "num_input_tokens_seen": 18853152, "step": 8740 }, { "epoch": 1.6048816296568178, "grad_norm": 3.4675724506378174, "learning_rate": 8.023490548724538e-06, "loss": 0.4124, "num_input_tokens_seen": 18863424, "step": 8745 }, { "epoch": 1.60579922921637, "grad_norm": 4.955069065093994, "learning_rate": 8.028078546522298e-06, "loss": 0.3172, "num_input_tokens_seen": 18873728, "step": 8750 }, { "epoch": 1.6067168287759221, "grad_norm": 1.4484988451004028, "learning_rate": 8.032666544320059e-06, "loss": 0.3349, "num_input_tokens_seen": 18884800, "step": 8755 }, { "epoch": 1.6076344283354744, "grad_norm": 2.1835780143737793, "learning_rate": 8.037254542117821e-06, "loss": 0.3275, "num_input_tokens_seen": 18895904, "step": 8760 }, { "epoch": 1.6085520278950267, "grad_norm": 4.6153459548950195, "learning_rate": 8.041842539915582e-06, "loss": 0.3429, "num_input_tokens_seen": 18905888, "step": 8765 }, { "epoch": 1.6094696274545788, "grad_norm": 5.039000034332275, "learning_rate": 8.046430537713342e-06, "loss": 0.3053, "num_input_tokens_seen": 18915040, "step": 8770 }, { "epoch": 1.610387227014131, "grad_norm": 3.4393696784973145, "learning_rate": 8.051018535511104e-06, "loss": 0.4008, "num_input_tokens_seen": 18923872, "step": 8775 }, { "epoch": 1.6113048265736833, "grad_norm": 1.7417562007904053, "learning_rate": 8.055606533308865e-06, "loss": 0.3264, "num_input_tokens_seen": 18933952, "step": 8780 }, { "epoch": 1.6122224261332354, "grad_norm": 3.070065975189209, "learning_rate": 8.060194531106625e-06, "loss": 0.2782, "num_input_tokens_seen": 18945248, "step": 8785 }, { "epoch": 1.6131400256927877, "grad_norm": 3.928929090499878, "learning_rate": 8.064782528904388e-06, "loss": 0.3139, "num_input_tokens_seen": 18954912, "step": 8790 }, { "epoch": 1.61405762525234, "grad_norm": 3.152193546295166, "learning_rate": 8.069370526702148e-06, "loss": 0.2958, "num_input_tokens_seen": 18966080, "step": 8795 }, { "epoch": 1.614975224811892, "grad_norm": 5.096840858459473, "learning_rate": 8.073958524499909e-06, "loss": 0.3541, "num_input_tokens_seen": 18976128, "step": 8800 }, { "epoch": 1.6158928243714443, "grad_norm": 2.3051860332489014, "learning_rate": 8.07854652229767e-06, "loss": 0.2979, "num_input_tokens_seen": 18987424, "step": 8805 }, { "epoch": 1.6168104239309966, "grad_norm": 12.48357105255127, "learning_rate": 8.083134520095431e-06, "loss": 0.3102, "num_input_tokens_seen": 18997792, "step": 8810 }, { "epoch": 1.6177280234905487, "grad_norm": 3.7961623668670654, "learning_rate": 8.087722517893192e-06, "loss": 0.2732, "num_input_tokens_seen": 19007392, "step": 8815 }, { "epoch": 1.618645623050101, "grad_norm": 2.8981857299804688, "learning_rate": 8.092310515690952e-06, "loss": 0.3225, "num_input_tokens_seen": 19018112, "step": 8820 }, { "epoch": 1.6195632226096532, "grad_norm": 14.310503959655762, "learning_rate": 8.096898513488715e-06, "loss": 0.3699, "num_input_tokens_seen": 19028832, "step": 8825 }, { "epoch": 1.6204808221692053, "grad_norm": 6.288309097290039, "learning_rate": 8.101486511286475e-06, "loss": 0.4262, "num_input_tokens_seen": 19039136, "step": 8830 }, { "epoch": 1.6213984217287576, "grad_norm": 1.6993869543075562, "learning_rate": 8.106074509084236e-06, "loss": 0.3468, "num_input_tokens_seen": 19050464, "step": 8835 }, { "epoch": 1.6223160212883099, "grad_norm": 3.374448299407959, "learning_rate": 8.110662506881998e-06, "loss": 0.3347, "num_input_tokens_seen": 19061344, "step": 8840 }, { "epoch": 1.623233620847862, "grad_norm": 1.5214282274246216, "learning_rate": 8.115250504679758e-06, "loss": 0.3845, "num_input_tokens_seen": 19071360, "step": 8845 }, { "epoch": 1.6241512204074142, "grad_norm": 4.217715263366699, "learning_rate": 8.119838502477519e-06, "loss": 0.4052, "num_input_tokens_seen": 19082912, "step": 8850 }, { "epoch": 1.6250688199669665, "grad_norm": 1.7491374015808105, "learning_rate": 8.124426500275281e-06, "loss": 0.3821, "num_input_tokens_seen": 19093536, "step": 8855 }, { "epoch": 1.6259864195265186, "grad_norm": 1.4613161087036133, "learning_rate": 8.129014498073042e-06, "loss": 0.3374, "num_input_tokens_seen": 19102944, "step": 8860 }, { "epoch": 1.6269040190860709, "grad_norm": 1.0894936323165894, "learning_rate": 8.133602495870802e-06, "loss": 0.3032, "num_input_tokens_seen": 19114912, "step": 8865 }, { "epoch": 1.6278216186456231, "grad_norm": 2.229111909866333, "learning_rate": 8.138190493668564e-06, "loss": 0.3239, "num_input_tokens_seen": 19127104, "step": 8870 }, { "epoch": 1.6287392182051752, "grad_norm": 3.5216922760009766, "learning_rate": 8.142778491466325e-06, "loss": 0.3146, "num_input_tokens_seen": 19137120, "step": 8875 }, { "epoch": 1.6296568177647275, "grad_norm": 2.3791322708129883, "learning_rate": 8.147366489264085e-06, "loss": 0.3226, "num_input_tokens_seen": 19146528, "step": 8880 }, { "epoch": 1.6305744173242798, "grad_norm": 4.090445041656494, "learning_rate": 8.151954487061848e-06, "loss": 0.426, "num_input_tokens_seen": 19156416, "step": 8885 }, { "epoch": 1.6314920168838318, "grad_norm": 7.705338478088379, "learning_rate": 8.156542484859608e-06, "loss": 0.4912, "num_input_tokens_seen": 19165728, "step": 8890 }, { "epoch": 1.6324096164433841, "grad_norm": 7.18982458114624, "learning_rate": 8.161130482657369e-06, "loss": 0.3052, "num_input_tokens_seen": 19176480, "step": 8895 }, { "epoch": 1.6333272160029364, "grad_norm": 4.313545227050781, "learning_rate": 8.16571848045513e-06, "loss": 0.3433, "num_input_tokens_seen": 19187936, "step": 8900 }, { "epoch": 1.6342448155624885, "grad_norm": 1.4618688821792603, "learning_rate": 8.170306478252891e-06, "loss": 0.3583, "num_input_tokens_seen": 19198336, "step": 8905 }, { "epoch": 1.6351624151220407, "grad_norm": 1.3402533531188965, "learning_rate": 8.174894476050652e-06, "loss": 0.2912, "num_input_tokens_seen": 19208672, "step": 8910 }, { "epoch": 1.636080014681593, "grad_norm": 1.4861977100372314, "learning_rate": 8.179482473848414e-06, "loss": 0.3588, "num_input_tokens_seen": 19219520, "step": 8915 }, { "epoch": 1.636997614241145, "grad_norm": 2.690077781677246, "learning_rate": 8.184070471646175e-06, "loss": 0.3143, "num_input_tokens_seen": 19231008, "step": 8920 }, { "epoch": 1.6379152138006974, "grad_norm": 1.357125163078308, "learning_rate": 8.188658469443935e-06, "loss": 0.2685, "num_input_tokens_seen": 19240128, "step": 8925 }, { "epoch": 1.6388328133602497, "grad_norm": 10.208110809326172, "learning_rate": 8.193246467241697e-06, "loss": 0.4016, "num_input_tokens_seen": 19250400, "step": 8930 }, { "epoch": 1.6397504129198017, "grad_norm": 8.43456745147705, "learning_rate": 8.197834465039458e-06, "loss": 0.3474, "num_input_tokens_seen": 19261856, "step": 8935 }, { "epoch": 1.640668012479354, "grad_norm": 1.4494608640670776, "learning_rate": 8.202422462837218e-06, "loss": 0.3333, "num_input_tokens_seen": 19273024, "step": 8940 }, { "epoch": 1.6415856120389063, "grad_norm": 3.3656771183013916, "learning_rate": 8.20701046063498e-06, "loss": 0.3162, "num_input_tokens_seen": 19282336, "step": 8945 }, { "epoch": 1.6425032115984584, "grad_norm": 1.893804907798767, "learning_rate": 8.21159845843274e-06, "loss": 0.304, "num_input_tokens_seen": 19293376, "step": 8950 }, { "epoch": 1.6434208111580106, "grad_norm": 1.0690484046936035, "learning_rate": 8.216186456230502e-06, "loss": 0.2296, "num_input_tokens_seen": 19304320, "step": 8955 }, { "epoch": 1.644338410717563, "grad_norm": 8.9883394241333, "learning_rate": 8.220774454028264e-06, "loss": 0.3682, "num_input_tokens_seen": 19316064, "step": 8960 }, { "epoch": 1.645256010277115, "grad_norm": 5.919795036315918, "learning_rate": 8.225362451826023e-06, "loss": 0.2576, "num_input_tokens_seen": 19326592, "step": 8965 }, { "epoch": 1.6461736098366673, "grad_norm": 1.4414139986038208, "learning_rate": 8.229950449623785e-06, "loss": 0.3554, "num_input_tokens_seen": 19336000, "step": 8970 }, { "epoch": 1.6470912093962196, "grad_norm": 0.9654784202575684, "learning_rate": 8.234538447421545e-06, "loss": 0.2975, "num_input_tokens_seen": 19348160, "step": 8975 }, { "epoch": 1.6480088089557716, "grad_norm": 3.5633962154388428, "learning_rate": 8.239126445219306e-06, "loss": 0.3616, "num_input_tokens_seen": 19359232, "step": 8980 }, { "epoch": 1.648926408515324, "grad_norm": 3.903825521469116, "learning_rate": 8.243714443017068e-06, "loss": 0.248, "num_input_tokens_seen": 19369504, "step": 8985 }, { "epoch": 1.6498440080748762, "grad_norm": 3.4878902435302734, "learning_rate": 8.248302440814829e-06, "loss": 0.3654, "num_input_tokens_seen": 19380544, "step": 8990 }, { "epoch": 1.6507616076344283, "grad_norm": 1.5116541385650635, "learning_rate": 8.252890438612589e-06, "loss": 0.3142, "num_input_tokens_seen": 19392224, "step": 8995 }, { "epoch": 1.6516792071939805, "grad_norm": 1.5815151929855347, "learning_rate": 8.257478436410351e-06, "loss": 0.2474, "num_input_tokens_seen": 19402400, "step": 9000 }, { "epoch": 1.6525968067535328, "grad_norm": 1.1778069734573364, "learning_rate": 8.262066434208112e-06, "loss": 0.3796, "num_input_tokens_seen": 19413440, "step": 9005 }, { "epoch": 1.6535144063130849, "grad_norm": 3.221135377883911, "learning_rate": 8.266654432005872e-06, "loss": 0.2678, "num_input_tokens_seen": 19423328, "step": 9010 }, { "epoch": 1.6544320058726372, "grad_norm": 2.845586061477661, "learning_rate": 8.271242429803635e-06, "loss": 0.3341, "num_input_tokens_seen": 19435264, "step": 9015 }, { "epoch": 1.6553496054321895, "grad_norm": 6.887685775756836, "learning_rate": 8.275830427601395e-06, "loss": 0.3342, "num_input_tokens_seen": 19447552, "step": 9020 }, { "epoch": 1.6562672049917415, "grad_norm": 1.5378084182739258, "learning_rate": 8.280418425399156e-06, "loss": 0.267, "num_input_tokens_seen": 19457856, "step": 9025 }, { "epoch": 1.6571848045512938, "grad_norm": 1.5684987306594849, "learning_rate": 8.285006423196918e-06, "loss": 0.3288, "num_input_tokens_seen": 19468224, "step": 9030 }, { "epoch": 1.658102404110846, "grad_norm": 11.415267944335938, "learning_rate": 8.289594420994678e-06, "loss": 0.262, "num_input_tokens_seen": 19479904, "step": 9035 }, { "epoch": 1.6590200036703981, "grad_norm": 1.0542572736740112, "learning_rate": 8.294182418792439e-06, "loss": 0.4032, "num_input_tokens_seen": 19489760, "step": 9040 }, { "epoch": 1.6599376032299504, "grad_norm": 1.681405782699585, "learning_rate": 8.298770416590201e-06, "loss": 0.4422, "num_input_tokens_seen": 19500992, "step": 9045 }, { "epoch": 1.6608552027895027, "grad_norm": 9.938291549682617, "learning_rate": 8.303358414387962e-06, "loss": 0.4369, "num_input_tokens_seen": 19512608, "step": 9050 }, { "epoch": 1.6617728023490548, "grad_norm": 4.044089317321777, "learning_rate": 8.307946412185722e-06, "loss": 0.2556, "num_input_tokens_seen": 19522624, "step": 9055 }, { "epoch": 1.662690401908607, "grad_norm": 8.998908042907715, "learning_rate": 8.312534409983484e-06, "loss": 0.3606, "num_input_tokens_seen": 19533312, "step": 9060 }, { "epoch": 1.6636080014681593, "grad_norm": 2.655623197555542, "learning_rate": 8.317122407781245e-06, "loss": 0.3503, "num_input_tokens_seen": 19545184, "step": 9065 }, { "epoch": 1.6645256010277114, "grad_norm": 1.1810977458953857, "learning_rate": 8.321710405579007e-06, "loss": 0.3432, "num_input_tokens_seen": 19555808, "step": 9070 }, { "epoch": 1.6654432005872637, "grad_norm": 2.1386687755584717, "learning_rate": 8.326298403376767e-06, "loss": 0.2837, "num_input_tokens_seen": 19565440, "step": 9075 }, { "epoch": 1.666360800146816, "grad_norm": 4.605058670043945, "learning_rate": 8.330886401174528e-06, "loss": 0.4111, "num_input_tokens_seen": 19575744, "step": 9080 }, { "epoch": 1.667278399706368, "grad_norm": 9.968988418579102, "learning_rate": 8.33547439897229e-06, "loss": 0.3162, "num_input_tokens_seen": 19586464, "step": 9085 }, { "epoch": 1.6681959992659203, "grad_norm": 9.529478073120117, "learning_rate": 8.34006239677005e-06, "loss": 0.3319, "num_input_tokens_seen": 19597472, "step": 9090 }, { "epoch": 1.6691135988254726, "grad_norm": 2.5227556228637695, "learning_rate": 8.344650394567811e-06, "loss": 0.3734, "num_input_tokens_seen": 19607232, "step": 9095 }, { "epoch": 1.6700311983850247, "grad_norm": 1.4169389009475708, "learning_rate": 8.349238392365573e-06, "loss": 0.2792, "num_input_tokens_seen": 19617536, "step": 9100 }, { "epoch": 1.670948797944577, "grad_norm": 1.3173680305480957, "learning_rate": 8.353826390163334e-06, "loss": 0.2941, "num_input_tokens_seen": 19628384, "step": 9105 }, { "epoch": 1.6718663975041292, "grad_norm": 4.150110244750977, "learning_rate": 8.358414387961094e-06, "loss": 0.3323, "num_input_tokens_seen": 19639296, "step": 9110 }, { "epoch": 1.6727839970636813, "grad_norm": 1.9390817880630493, "learning_rate": 8.363002385758857e-06, "loss": 0.3369, "num_input_tokens_seen": 19649344, "step": 9115 }, { "epoch": 1.6737015966232336, "grad_norm": 1.02650785446167, "learning_rate": 8.367590383556616e-06, "loss": 0.291, "num_input_tokens_seen": 19660672, "step": 9120 }, { "epoch": 1.6746191961827859, "grad_norm": 13.251317024230957, "learning_rate": 8.372178381354378e-06, "loss": 0.4455, "num_input_tokens_seen": 19672640, "step": 9125 }, { "epoch": 1.675536795742338, "grad_norm": 1.0070161819458008, "learning_rate": 8.37676637915214e-06, "loss": 0.2975, "num_input_tokens_seen": 19683424, "step": 9130 }, { "epoch": 1.6764543953018902, "grad_norm": 12.562267303466797, "learning_rate": 8.381354376949899e-06, "loss": 0.4155, "num_input_tokens_seen": 19694848, "step": 9135 }, { "epoch": 1.6773719948614425, "grad_norm": 2.505156993865967, "learning_rate": 8.385942374747661e-06, "loss": 0.2947, "num_input_tokens_seen": 19706528, "step": 9140 }, { "epoch": 1.6782895944209946, "grad_norm": 15.220481872558594, "learning_rate": 8.390530372545423e-06, "loss": 0.3521, "num_input_tokens_seen": 19716704, "step": 9145 }, { "epoch": 1.6792071939805469, "grad_norm": 26.57557487487793, "learning_rate": 8.395118370343182e-06, "loss": 0.3207, "num_input_tokens_seen": 19727616, "step": 9150 }, { "epoch": 1.6801247935400991, "grad_norm": 1.1731131076812744, "learning_rate": 8.399706368140944e-06, "loss": 0.3245, "num_input_tokens_seen": 19738368, "step": 9155 }, { "epoch": 1.6810423930996512, "grad_norm": 2.1194045543670654, "learning_rate": 8.404294365938705e-06, "loss": 0.4213, "num_input_tokens_seen": 19749376, "step": 9160 }, { "epoch": 1.6819599926592035, "grad_norm": 0.8808299899101257, "learning_rate": 8.408882363736465e-06, "loss": 0.4437, "num_input_tokens_seen": 19760640, "step": 9165 }, { "epoch": 1.6828775922187558, "grad_norm": 0.9237498641014099, "learning_rate": 8.413470361534227e-06, "loss": 0.41, "num_input_tokens_seen": 19772256, "step": 9170 }, { "epoch": 1.6837951917783078, "grad_norm": 1.0455071926116943, "learning_rate": 8.418058359331988e-06, "loss": 0.3938, "num_input_tokens_seen": 19783200, "step": 9175 }, { "epoch": 1.6847127913378601, "grad_norm": 0.9983702301979065, "learning_rate": 8.422646357129748e-06, "loss": 0.3029, "num_input_tokens_seen": 19794496, "step": 9180 }, { "epoch": 1.6856303908974124, "grad_norm": 0.9947406053543091, "learning_rate": 8.42723435492751e-06, "loss": 0.2913, "num_input_tokens_seen": 19806208, "step": 9185 }, { "epoch": 1.6865479904569645, "grad_norm": 2.1865487098693848, "learning_rate": 8.431822352725271e-06, "loss": 0.3689, "num_input_tokens_seen": 19816960, "step": 9190 }, { "epoch": 1.6874655900165167, "grad_norm": 2.286395311355591, "learning_rate": 8.436410350523032e-06, "loss": 0.4389, "num_input_tokens_seen": 19826112, "step": 9195 }, { "epoch": 1.688383189576069, "grad_norm": 2.1454503536224365, "learning_rate": 8.440998348320794e-06, "loss": 0.2819, "num_input_tokens_seen": 19836928, "step": 9200 }, { "epoch": 1.689300789135621, "grad_norm": 1.7015726566314697, "learning_rate": 8.445586346118554e-06, "loss": 0.2885, "num_input_tokens_seen": 19847104, "step": 9205 }, { "epoch": 1.6902183886951734, "grad_norm": 12.328689575195312, "learning_rate": 8.450174343916315e-06, "loss": 0.4653, "num_input_tokens_seen": 19859072, "step": 9210 }, { "epoch": 1.6911359882547257, "grad_norm": 7.421411514282227, "learning_rate": 8.454762341714077e-06, "loss": 0.3725, "num_input_tokens_seen": 19870720, "step": 9215 }, { "epoch": 1.6920535878142777, "grad_norm": 2.335338592529297, "learning_rate": 8.459350339511838e-06, "loss": 0.3025, "num_input_tokens_seen": 19881376, "step": 9220 }, { "epoch": 1.69297118737383, "grad_norm": 6.353479385375977, "learning_rate": 8.463938337309598e-06, "loss": 0.3531, "num_input_tokens_seen": 19891584, "step": 9225 }, { "epoch": 1.6938887869333823, "grad_norm": 1.0776197910308838, "learning_rate": 8.46852633510736e-06, "loss": 0.4329, "num_input_tokens_seen": 19901504, "step": 9230 }, { "epoch": 1.6948063864929344, "grad_norm": 3.441455364227295, "learning_rate": 8.473114332905121e-06, "loss": 0.3036, "num_input_tokens_seen": 19912256, "step": 9235 }, { "epoch": 1.6957239860524866, "grad_norm": 2.4477765560150146, "learning_rate": 8.477702330702881e-06, "loss": 0.3338, "num_input_tokens_seen": 19924288, "step": 9240 }, { "epoch": 1.696641585612039, "grad_norm": 2.277864694595337, "learning_rate": 8.482290328500644e-06, "loss": 0.3141, "num_input_tokens_seen": 19935936, "step": 9245 }, { "epoch": 1.697559185171591, "grad_norm": 3.6358540058135986, "learning_rate": 8.486878326298404e-06, "loss": 0.2867, "num_input_tokens_seen": 19947648, "step": 9250 }, { "epoch": 1.6984767847311433, "grad_norm": 1.8258130550384521, "learning_rate": 8.491466324096165e-06, "loss": 0.2876, "num_input_tokens_seen": 19957856, "step": 9255 }, { "epoch": 1.6993943842906956, "grad_norm": 3.8544929027557373, "learning_rate": 8.496054321893927e-06, "loss": 0.2629, "num_input_tokens_seen": 19968224, "step": 9260 }, { "epoch": 1.7003119838502476, "grad_norm": 1.5514986515045166, "learning_rate": 8.500642319691687e-06, "loss": 0.3732, "num_input_tokens_seen": 19978624, "step": 9265 }, { "epoch": 1.7012295834098001, "grad_norm": 2.69378662109375, "learning_rate": 8.505230317489448e-06, "loss": 0.2625, "num_input_tokens_seen": 19989216, "step": 9270 }, { "epoch": 1.7021471829693522, "grad_norm": 3.5386393070220947, "learning_rate": 8.50981831528721e-06, "loss": 0.3779, "num_input_tokens_seen": 19999424, "step": 9275 }, { "epoch": 1.7030647825289043, "grad_norm": 2.7491610050201416, "learning_rate": 8.51440631308497e-06, "loss": 0.3087, "num_input_tokens_seen": 20009984, "step": 9280 }, { "epoch": 1.7039823820884568, "grad_norm": 1.8875356912612915, "learning_rate": 8.518994310882731e-06, "loss": 0.348, "num_input_tokens_seen": 20021376, "step": 9285 }, { "epoch": 1.7048999816480088, "grad_norm": 1.5555696487426758, "learning_rate": 8.523582308680493e-06, "loss": 0.3204, "num_input_tokens_seen": 20031104, "step": 9290 }, { "epoch": 1.7058175812075609, "grad_norm": 2.8281307220458984, "learning_rate": 8.528170306478254e-06, "loss": 0.3453, "num_input_tokens_seen": 20041440, "step": 9295 }, { "epoch": 1.7067351807671134, "grad_norm": 2.301123857498169, "learning_rate": 8.532758304276014e-06, "loss": 0.2886, "num_input_tokens_seen": 20051296, "step": 9300 }, { "epoch": 1.7076527803266655, "grad_norm": 1.7164592742919922, "learning_rate": 8.537346302073775e-06, "loss": 0.3715, "num_input_tokens_seen": 20062112, "step": 9305 }, { "epoch": 1.7085703798862175, "grad_norm": 8.340019226074219, "learning_rate": 8.541934299871537e-06, "loss": 0.3323, "num_input_tokens_seen": 20074240, "step": 9310 }, { "epoch": 1.70948797944577, "grad_norm": 4.860424995422363, "learning_rate": 8.546522297669298e-06, "loss": 0.3342, "num_input_tokens_seen": 20085248, "step": 9315 }, { "epoch": 1.710405579005322, "grad_norm": 7.119139194488525, "learning_rate": 8.551110295467058e-06, "loss": 0.3169, "num_input_tokens_seen": 20094976, "step": 9320 }, { "epoch": 1.7113231785648741, "grad_norm": 3.670837163925171, "learning_rate": 8.55569829326482e-06, "loss": 0.3073, "num_input_tokens_seen": 20104960, "step": 9325 }, { "epoch": 1.7122407781244267, "grad_norm": 1.8851343393325806, "learning_rate": 8.560286291062581e-06, "loss": 0.2829, "num_input_tokens_seen": 20115488, "step": 9330 }, { "epoch": 1.7131583776839787, "grad_norm": 11.465951919555664, "learning_rate": 8.564874288860341e-06, "loss": 0.3777, "num_input_tokens_seen": 20126528, "step": 9335 }, { "epoch": 1.7140759772435308, "grad_norm": 1.7235777378082275, "learning_rate": 8.569462286658104e-06, "loss": 0.3008, "num_input_tokens_seen": 20137696, "step": 9340 }, { "epoch": 1.7149935768030833, "grad_norm": 1.7296044826507568, "learning_rate": 8.574050284455864e-06, "loss": 0.3281, "num_input_tokens_seen": 20149024, "step": 9345 }, { "epoch": 1.7159111763626353, "grad_norm": 8.832470893859863, "learning_rate": 8.578638282253625e-06, "loss": 0.2563, "num_input_tokens_seen": 20158848, "step": 9350 }, { "epoch": 1.7168287759221874, "grad_norm": 0.9262158870697021, "learning_rate": 8.583226280051387e-06, "loss": 0.364, "num_input_tokens_seen": 20169696, "step": 9355 }, { "epoch": 1.71774637548174, "grad_norm": 2.2041587829589844, "learning_rate": 8.587814277849147e-06, "loss": 0.3251, "num_input_tokens_seen": 20181408, "step": 9360 }, { "epoch": 1.718663975041292, "grad_norm": 8.060083389282227, "learning_rate": 8.592402275646908e-06, "loss": 0.3894, "num_input_tokens_seen": 20192384, "step": 9365 }, { "epoch": 1.719581574600844, "grad_norm": 1.1666715145111084, "learning_rate": 8.59699027344467e-06, "loss": 0.2887, "num_input_tokens_seen": 20203968, "step": 9370 }, { "epoch": 1.7204991741603965, "grad_norm": 3.4017791748046875, "learning_rate": 8.60157827124243e-06, "loss": 0.4615, "num_input_tokens_seen": 20214336, "step": 9375 }, { "epoch": 1.7214167737199486, "grad_norm": 1.7954766750335693, "learning_rate": 8.606166269040191e-06, "loss": 0.4552, "num_input_tokens_seen": 20224768, "step": 9380 }, { "epoch": 1.7223343732795007, "grad_norm": 1.6648228168487549, "learning_rate": 8.610754266837953e-06, "loss": 0.3032, "num_input_tokens_seen": 20234656, "step": 9385 }, { "epoch": 1.7232519728390532, "grad_norm": 2.6958208084106445, "learning_rate": 8.615342264635714e-06, "loss": 0.3622, "num_input_tokens_seen": 20245312, "step": 9390 }, { "epoch": 1.7241695723986052, "grad_norm": 1.635355830192566, "learning_rate": 8.619930262433474e-06, "loss": 0.3147, "num_input_tokens_seen": 20256416, "step": 9395 }, { "epoch": 1.7250871719581573, "grad_norm": 1.168377161026001, "learning_rate": 8.624518260231237e-06, "loss": 0.3122, "num_input_tokens_seen": 20266752, "step": 9400 }, { "epoch": 1.7260047715177098, "grad_norm": 2.600409746170044, "learning_rate": 8.629106258028997e-06, "loss": 0.3241, "num_input_tokens_seen": 20277888, "step": 9405 }, { "epoch": 1.7269223710772619, "grad_norm": 6.693609714508057, "learning_rate": 8.633694255826758e-06, "loss": 0.3116, "num_input_tokens_seen": 20287904, "step": 9410 }, { "epoch": 1.727839970636814, "grad_norm": 1.2885538339614868, "learning_rate": 8.63828225362452e-06, "loss": 0.3224, "num_input_tokens_seen": 20299744, "step": 9415 }, { "epoch": 1.7287575701963664, "grad_norm": 2.952519416809082, "learning_rate": 8.64287025142228e-06, "loss": 0.3516, "num_input_tokens_seen": 20310592, "step": 9420 }, { "epoch": 1.7296751697559185, "grad_norm": 1.9963332414627075, "learning_rate": 8.64745824922004e-06, "loss": 0.3678, "num_input_tokens_seen": 20323072, "step": 9425 }, { "epoch": 1.7305927693154706, "grad_norm": 2.3195910453796387, "learning_rate": 8.652046247017803e-06, "loss": 0.4577, "num_input_tokens_seen": 20333728, "step": 9430 }, { "epoch": 1.731510368875023, "grad_norm": 1.3899649381637573, "learning_rate": 8.656634244815564e-06, "loss": 0.4178, "num_input_tokens_seen": 20344928, "step": 9435 }, { "epoch": 1.7324279684345751, "grad_norm": 2.011446714401245, "learning_rate": 8.661222242613324e-06, "loss": 0.3004, "num_input_tokens_seen": 20355904, "step": 9440 }, { "epoch": 1.7333455679941272, "grad_norm": 3.3209962844848633, "learning_rate": 8.665810240411086e-06, "loss": 0.4249, "num_input_tokens_seen": 20366624, "step": 9445 }, { "epoch": 1.7342631675536797, "grad_norm": 2.156484842300415, "learning_rate": 8.670398238208845e-06, "loss": 0.3929, "num_input_tokens_seen": 20377408, "step": 9450 }, { "epoch": 1.7351807671132318, "grad_norm": 3.945518970489502, "learning_rate": 8.674986236006607e-06, "loss": 0.3162, "num_input_tokens_seen": 20388384, "step": 9455 }, { "epoch": 1.7360983666727838, "grad_norm": 1.9114364385604858, "learning_rate": 8.67957423380437e-06, "loss": 0.2652, "num_input_tokens_seen": 20398464, "step": 9460 }, { "epoch": 1.7370159662323363, "grad_norm": 1.7328693866729736, "learning_rate": 8.684162231602128e-06, "loss": 0.3465, "num_input_tokens_seen": 20409312, "step": 9465 }, { "epoch": 1.7379335657918884, "grad_norm": 1.0709564685821533, "learning_rate": 8.68875022939989e-06, "loss": 0.389, "num_input_tokens_seen": 20419584, "step": 9470 }, { "epoch": 1.7388511653514407, "grad_norm": 3.358848810195923, "learning_rate": 8.693338227197651e-06, "loss": 0.2799, "num_input_tokens_seen": 20430048, "step": 9475 }, { "epoch": 1.739768764910993, "grad_norm": 4.879283905029297, "learning_rate": 8.697926224995412e-06, "loss": 0.396, "num_input_tokens_seen": 20441568, "step": 9480 }, { "epoch": 1.740686364470545, "grad_norm": 6.509923934936523, "learning_rate": 8.702514222793174e-06, "loss": 0.3578, "num_input_tokens_seen": 20451968, "step": 9485 }, { "epoch": 1.7416039640300973, "grad_norm": 4.762311935424805, "learning_rate": 8.707102220590934e-06, "loss": 0.4393, "num_input_tokens_seen": 20462400, "step": 9490 }, { "epoch": 1.7425215635896496, "grad_norm": 1.7283719778060913, "learning_rate": 8.711690218388696e-06, "loss": 0.2651, "num_input_tokens_seen": 20474624, "step": 9495 }, { "epoch": 1.7434391631492017, "grad_norm": 5.53123664855957, "learning_rate": 8.716278216186457e-06, "loss": 0.2812, "num_input_tokens_seen": 20484576, "step": 9500 }, { "epoch": 1.744356762708754, "grad_norm": 1.4838021993637085, "learning_rate": 8.720866213984218e-06, "loss": 0.3764, "num_input_tokens_seen": 20495232, "step": 9505 }, { "epoch": 1.7452743622683062, "grad_norm": 4.426962375640869, "learning_rate": 8.72545421178198e-06, "loss": 0.3622, "num_input_tokens_seen": 20506144, "step": 9510 }, { "epoch": 1.7461919618278583, "grad_norm": 1.8080943822860718, "learning_rate": 8.73004220957974e-06, "loss": 0.2882, "num_input_tokens_seen": 20517760, "step": 9515 }, { "epoch": 1.7471095613874106, "grad_norm": 10.292023658752441, "learning_rate": 8.7346302073775e-06, "loss": 0.3362, "num_input_tokens_seen": 20528896, "step": 9520 }, { "epoch": 1.7480271609469629, "grad_norm": 3.208510637283325, "learning_rate": 8.739218205175263e-06, "loss": 0.3202, "num_input_tokens_seen": 20539616, "step": 9525 }, { "epoch": 1.748944760506515, "grad_norm": 3.3398499488830566, "learning_rate": 8.743806202973023e-06, "loss": 0.3581, "num_input_tokens_seen": 20551648, "step": 9530 }, { "epoch": 1.7498623600660672, "grad_norm": 1.1537553071975708, "learning_rate": 8.748394200770784e-06, "loss": 0.2879, "num_input_tokens_seen": 20563264, "step": 9535 }, { "epoch": 1.7507799596256195, "grad_norm": 1.8177913427352905, "learning_rate": 8.752982198568546e-06, "loss": 0.3924, "num_input_tokens_seen": 20575520, "step": 9540 }, { "epoch": 1.7516975591851716, "grad_norm": 11.587604522705078, "learning_rate": 8.757570196366307e-06, "loss": 0.4748, "num_input_tokens_seen": 20586592, "step": 9545 }, { "epoch": 1.7526151587447238, "grad_norm": 0.9979106187820435, "learning_rate": 8.762158194164067e-06, "loss": 0.2846, "num_input_tokens_seen": 20597056, "step": 9550 }, { "epoch": 1.7535327583042761, "grad_norm": 6.999756336212158, "learning_rate": 8.76674619196183e-06, "loss": 0.3676, "num_input_tokens_seen": 20607840, "step": 9555 }, { "epoch": 1.7544503578638282, "grad_norm": 5.395453453063965, "learning_rate": 8.77133418975959e-06, "loss": 0.3861, "num_input_tokens_seen": 20618656, "step": 9560 }, { "epoch": 1.7553679574233805, "grad_norm": 2.1799914836883545, "learning_rate": 8.77592218755735e-06, "loss": 0.3847, "num_input_tokens_seen": 20629728, "step": 9565 }, { "epoch": 1.7562855569829328, "grad_norm": 2.8070740699768066, "learning_rate": 8.780510185355113e-06, "loss": 0.3175, "num_input_tokens_seen": 20639392, "step": 9570 }, { "epoch": 1.7572031565424848, "grad_norm": 1.789604663848877, "learning_rate": 8.785098183152873e-06, "loss": 0.3151, "num_input_tokens_seen": 20651232, "step": 9575 }, { "epoch": 1.758120756102037, "grad_norm": 0.8528683185577393, "learning_rate": 8.789686180950634e-06, "loss": 0.3042, "num_input_tokens_seen": 20661344, "step": 9580 }, { "epoch": 1.7590383556615894, "grad_norm": 1.841916561126709, "learning_rate": 8.794274178748396e-06, "loss": 0.2608, "num_input_tokens_seen": 20672672, "step": 9585 }, { "epoch": 1.7599559552211415, "grad_norm": 1.4890650510787964, "learning_rate": 8.798862176546156e-06, "loss": 0.3435, "num_input_tokens_seen": 20683328, "step": 9590 }, { "epoch": 1.7608735547806937, "grad_norm": 2.996521234512329, "learning_rate": 8.803450174343917e-06, "loss": 0.4234, "num_input_tokens_seen": 20693824, "step": 9595 }, { "epoch": 1.761791154340246, "grad_norm": 8.436470031738281, "learning_rate": 8.808038172141679e-06, "loss": 0.3758, "num_input_tokens_seen": 20704640, "step": 9600 }, { "epoch": 1.762708753899798, "grad_norm": 7.188626766204834, "learning_rate": 8.81262616993944e-06, "loss": 0.3179, "num_input_tokens_seen": 20713664, "step": 9605 }, { "epoch": 1.7636263534593504, "grad_norm": 1.8943895101547241, "learning_rate": 8.8172141677372e-06, "loss": 0.307, "num_input_tokens_seen": 20725408, "step": 9610 }, { "epoch": 1.7645439530189027, "grad_norm": 2.250753402709961, "learning_rate": 8.821802165534962e-06, "loss": 0.2672, "num_input_tokens_seen": 20736864, "step": 9615 }, { "epoch": 1.7654615525784547, "grad_norm": 3.8131051063537598, "learning_rate": 8.826390163332721e-06, "loss": 0.3353, "num_input_tokens_seen": 20747264, "step": 9620 }, { "epoch": 1.766379152138007, "grad_norm": 4.290916919708252, "learning_rate": 8.830978161130483e-06, "loss": 0.2545, "num_input_tokens_seen": 20756416, "step": 9625 }, { "epoch": 1.7672967516975593, "grad_norm": 1.5454485416412354, "learning_rate": 8.835566158928246e-06, "loss": 0.357, "num_input_tokens_seen": 20767648, "step": 9630 }, { "epoch": 1.7682143512571113, "grad_norm": 8.82250690460205, "learning_rate": 8.840154156726004e-06, "loss": 0.3746, "num_input_tokens_seen": 20778464, "step": 9635 }, { "epoch": 1.7691319508166636, "grad_norm": 7.311185359954834, "learning_rate": 8.844742154523767e-06, "loss": 0.3679, "num_input_tokens_seen": 20789504, "step": 9640 }, { "epoch": 1.770049550376216, "grad_norm": 1.2449074983596802, "learning_rate": 8.849330152321529e-06, "loss": 0.2977, "num_input_tokens_seen": 20800064, "step": 9645 }, { "epoch": 1.770967149935768, "grad_norm": 1.1433651447296143, "learning_rate": 8.853918150119288e-06, "loss": 0.2791, "num_input_tokens_seen": 20811264, "step": 9650 }, { "epoch": 1.7718847494953203, "grad_norm": 3.6090340614318848, "learning_rate": 8.85850614791705e-06, "loss": 0.3879, "num_input_tokens_seen": 20822208, "step": 9655 }, { "epoch": 1.7728023490548726, "grad_norm": 2.36299729347229, "learning_rate": 8.86309414571481e-06, "loss": 0.3345, "num_input_tokens_seen": 20832960, "step": 9660 }, { "epoch": 1.7737199486144246, "grad_norm": 6.861721038818359, "learning_rate": 8.867682143512571e-06, "loss": 0.3445, "num_input_tokens_seen": 20844032, "step": 9665 }, { "epoch": 1.774637548173977, "grad_norm": 1.6615842580795288, "learning_rate": 8.872270141310333e-06, "loss": 0.431, "num_input_tokens_seen": 20853280, "step": 9670 }, { "epoch": 1.7755551477335292, "grad_norm": 2.8262860774993896, "learning_rate": 8.876858139108094e-06, "loss": 0.4068, "num_input_tokens_seen": 20863360, "step": 9675 }, { "epoch": 1.7764727472930812, "grad_norm": 2.556103229522705, "learning_rate": 8.881446136905854e-06, "loss": 0.3807, "num_input_tokens_seen": 20873984, "step": 9680 }, { "epoch": 1.7773903468526335, "grad_norm": 4.88082218170166, "learning_rate": 8.886034134703616e-06, "loss": 0.3543, "num_input_tokens_seen": 20884960, "step": 9685 }, { "epoch": 1.7783079464121858, "grad_norm": 2.5193865299224854, "learning_rate": 8.890622132501377e-06, "loss": 0.3229, "num_input_tokens_seen": 20896512, "step": 9690 }, { "epoch": 1.7792255459717379, "grad_norm": 0.9843069314956665, "learning_rate": 8.895210130299137e-06, "loss": 0.3313, "num_input_tokens_seen": 20907232, "step": 9695 }, { "epoch": 1.7801431455312902, "grad_norm": 8.724559783935547, "learning_rate": 8.8997981280969e-06, "loss": 0.3849, "num_input_tokens_seen": 20917088, "step": 9700 }, { "epoch": 1.7810607450908424, "grad_norm": 1.8300307989120483, "learning_rate": 8.90438612589466e-06, "loss": 0.3296, "num_input_tokens_seen": 20927200, "step": 9705 }, { "epoch": 1.7819783446503945, "grad_norm": 1.108540654182434, "learning_rate": 8.90897412369242e-06, "loss": 0.3476, "num_input_tokens_seen": 20936800, "step": 9710 }, { "epoch": 1.7828959442099468, "grad_norm": 4.791634559631348, "learning_rate": 8.913562121490183e-06, "loss": 0.2808, "num_input_tokens_seen": 20947392, "step": 9715 }, { "epoch": 1.783813543769499, "grad_norm": 0.6707165837287903, "learning_rate": 8.918150119287943e-06, "loss": 0.2977, "num_input_tokens_seen": 20958848, "step": 9720 }, { "epoch": 1.7847311433290511, "grad_norm": 2.519115686416626, "learning_rate": 8.922738117085704e-06, "loss": 0.3369, "num_input_tokens_seen": 20969696, "step": 9725 }, { "epoch": 1.7856487428886034, "grad_norm": 5.993084907531738, "learning_rate": 8.927326114883466e-06, "loss": 0.4048, "num_input_tokens_seen": 20979904, "step": 9730 }, { "epoch": 1.7865663424481557, "grad_norm": 3.192577362060547, "learning_rate": 8.931914112681227e-06, "loss": 0.3428, "num_input_tokens_seen": 20990784, "step": 9735 }, { "epoch": 1.7874839420077078, "grad_norm": 1.5705751180648804, "learning_rate": 8.936502110478987e-06, "loss": 0.3565, "num_input_tokens_seen": 21000544, "step": 9740 }, { "epoch": 1.78840154156726, "grad_norm": 2.258673667907715, "learning_rate": 8.94109010827675e-06, "loss": 0.3194, "num_input_tokens_seen": 21010976, "step": 9745 }, { "epoch": 1.7893191411268123, "grad_norm": 3.9831020832061768, "learning_rate": 8.94567810607451e-06, "loss": 0.2823, "num_input_tokens_seen": 21021312, "step": 9750 }, { "epoch": 1.7902367406863644, "grad_norm": 7.993356227874756, "learning_rate": 8.95026610387227e-06, "loss": 0.3573, "num_input_tokens_seen": 21032288, "step": 9755 }, { "epoch": 1.7911543402459167, "grad_norm": 5.239504814147949, "learning_rate": 8.954854101670033e-06, "loss": 0.4127, "num_input_tokens_seen": 21043296, "step": 9760 }, { "epoch": 1.792071939805469, "grad_norm": 2.936466693878174, "learning_rate": 8.959442099467793e-06, "loss": 0.305, "num_input_tokens_seen": 21053408, "step": 9765 }, { "epoch": 1.792989539365021, "grad_norm": 1.8145800828933716, "learning_rate": 8.964030097265554e-06, "loss": 0.3327, "num_input_tokens_seen": 21064288, "step": 9770 }, { "epoch": 1.7939071389245733, "grad_norm": 4.540433406829834, "learning_rate": 8.968618095063316e-06, "loss": 0.3687, "num_input_tokens_seen": 21074048, "step": 9775 }, { "epoch": 1.7948247384841256, "grad_norm": 2.6524131298065186, "learning_rate": 8.973206092861076e-06, "loss": 0.3748, "num_input_tokens_seen": 21083840, "step": 9780 }, { "epoch": 1.7957423380436777, "grad_norm": 1.0187149047851562, "learning_rate": 8.977794090658837e-06, "loss": 0.3317, "num_input_tokens_seen": 21094720, "step": 9785 }, { "epoch": 1.79665993760323, "grad_norm": 3.2586240768432617, "learning_rate": 8.982382088456599e-06, "loss": 0.3102, "num_input_tokens_seen": 21106464, "step": 9790 }, { "epoch": 1.7975775371627822, "grad_norm": 2.553189277648926, "learning_rate": 8.98697008625436e-06, "loss": 0.4338, "num_input_tokens_seen": 21118016, "step": 9795 }, { "epoch": 1.7984951367223343, "grad_norm": 1.2914692163467407, "learning_rate": 8.99155808405212e-06, "loss": 0.3315, "num_input_tokens_seen": 21129184, "step": 9800 }, { "epoch": 1.7994127362818866, "grad_norm": 2.8665900230407715, "learning_rate": 8.99614608184988e-06, "loss": 0.3498, "num_input_tokens_seen": 21139584, "step": 9805 }, { "epoch": 1.8003303358414389, "grad_norm": 2.738420248031616, "learning_rate": 9.000734079647643e-06, "loss": 0.3146, "num_input_tokens_seen": 21150752, "step": 9810 }, { "epoch": 1.801247935400991, "grad_norm": 1.878616452217102, "learning_rate": 9.005322077445403e-06, "loss": 0.2765, "num_input_tokens_seen": 21161632, "step": 9815 }, { "epoch": 1.8021655349605432, "grad_norm": 3.596827507019043, "learning_rate": 9.009910075243164e-06, "loss": 0.3387, "num_input_tokens_seen": 21172864, "step": 9820 }, { "epoch": 1.8030831345200955, "grad_norm": 0.9068413376808167, "learning_rate": 9.014498073040926e-06, "loss": 0.3398, "num_input_tokens_seen": 21184512, "step": 9825 }, { "epoch": 1.8040007340796476, "grad_norm": 1.6959354877471924, "learning_rate": 9.019086070838687e-06, "loss": 0.2999, "num_input_tokens_seen": 21195584, "step": 9830 }, { "epoch": 1.8049183336391998, "grad_norm": 1.9502794742584229, "learning_rate": 9.023674068636447e-06, "loss": 0.3278, "num_input_tokens_seen": 21206720, "step": 9835 }, { "epoch": 1.8058359331987521, "grad_norm": 1.2529202699661255, "learning_rate": 9.02826206643421e-06, "loss": 0.3716, "num_input_tokens_seen": 21216448, "step": 9840 }, { "epoch": 1.8067535327583042, "grad_norm": 1.1171764135360718, "learning_rate": 9.03285006423197e-06, "loss": 0.3622, "num_input_tokens_seen": 21227232, "step": 9845 }, { "epoch": 1.8076711323178565, "grad_norm": 2.2536656856536865, "learning_rate": 9.03743806202973e-06, "loss": 0.322, "num_input_tokens_seen": 21237984, "step": 9850 }, { "epoch": 1.8085887318774088, "grad_norm": 1.094133734703064, "learning_rate": 9.042026059827493e-06, "loss": 0.3638, "num_input_tokens_seen": 21248672, "step": 9855 }, { "epoch": 1.8095063314369608, "grad_norm": 1.7097954750061035, "learning_rate": 9.046614057625253e-06, "loss": 0.3387, "num_input_tokens_seen": 21259392, "step": 9860 }, { "epoch": 1.810423930996513, "grad_norm": 1.817872405052185, "learning_rate": 9.051202055423014e-06, "loss": 0.3512, "num_input_tokens_seen": 21268896, "step": 9865 }, { "epoch": 1.8113415305560654, "grad_norm": 4.7047905921936035, "learning_rate": 9.055790053220776e-06, "loss": 0.3718, "num_input_tokens_seen": 21280896, "step": 9870 }, { "epoch": 1.8122591301156175, "grad_norm": 1.575992465019226, "learning_rate": 9.060378051018536e-06, "loss": 0.3004, "num_input_tokens_seen": 21292512, "step": 9875 }, { "epoch": 1.8131767296751697, "grad_norm": 3.1673636436462402, "learning_rate": 9.064966048816297e-06, "loss": 0.2543, "num_input_tokens_seen": 21304032, "step": 9880 }, { "epoch": 1.814094329234722, "grad_norm": 2.6126365661621094, "learning_rate": 9.069554046614059e-06, "loss": 0.3364, "num_input_tokens_seen": 21316032, "step": 9885 }, { "epoch": 1.815011928794274, "grad_norm": 1.4810900688171387, "learning_rate": 9.07414204441182e-06, "loss": 0.4052, "num_input_tokens_seen": 21327968, "step": 9890 }, { "epoch": 1.8159295283538264, "grad_norm": 3.5570428371429443, "learning_rate": 9.07873004220958e-06, "loss": 0.342, "num_input_tokens_seen": 21339040, "step": 9895 }, { "epoch": 1.8168471279133787, "grad_norm": 1.2331089973449707, "learning_rate": 9.083318040007342e-06, "loss": 0.3764, "num_input_tokens_seen": 21350528, "step": 9900 }, { "epoch": 1.8177647274729307, "grad_norm": 6.2577691078186035, "learning_rate": 9.087906037805103e-06, "loss": 0.3777, "num_input_tokens_seen": 21360192, "step": 9905 }, { "epoch": 1.818682327032483, "grad_norm": 1.6248067617416382, "learning_rate": 9.092494035602863e-06, "loss": 0.3018, "num_input_tokens_seen": 21370016, "step": 9910 }, { "epoch": 1.8195999265920353, "grad_norm": 4.115573883056641, "learning_rate": 9.097082033400625e-06, "loss": 0.3278, "num_input_tokens_seen": 21380608, "step": 9915 }, { "epoch": 1.8205175261515874, "grad_norm": 1.4035738706588745, "learning_rate": 9.101670031198386e-06, "loss": 0.3192, "num_input_tokens_seen": 21391744, "step": 9920 }, { "epoch": 1.8214351257111396, "grad_norm": 2.756838321685791, "learning_rate": 9.106258028996146e-06, "loss": 0.3064, "num_input_tokens_seen": 21403040, "step": 9925 }, { "epoch": 1.822352725270692, "grad_norm": 11.409869194030762, "learning_rate": 9.110846026793909e-06, "loss": 0.3906, "num_input_tokens_seen": 21414816, "step": 9930 }, { "epoch": 1.823270324830244, "grad_norm": 6.159553527832031, "learning_rate": 9.11543402459167e-06, "loss": 0.3738, "num_input_tokens_seen": 21423232, "step": 9935 }, { "epoch": 1.8241879243897963, "grad_norm": 4.533266544342041, "learning_rate": 9.12002202238943e-06, "loss": 0.3548, "num_input_tokens_seen": 21433376, "step": 9940 }, { "epoch": 1.8251055239493486, "grad_norm": 2.7601776123046875, "learning_rate": 9.124610020187192e-06, "loss": 0.3372, "num_input_tokens_seen": 21443968, "step": 9945 }, { "epoch": 1.8260231235089006, "grad_norm": 0.7765056490898132, "learning_rate": 9.129198017984952e-06, "loss": 0.3402, "num_input_tokens_seen": 21454368, "step": 9950 }, { "epoch": 1.826940723068453, "grad_norm": 1.0896389484405518, "learning_rate": 9.133786015782713e-06, "loss": 0.2846, "num_input_tokens_seen": 21465856, "step": 9955 }, { "epoch": 1.8278583226280052, "grad_norm": 1.9199365377426147, "learning_rate": 9.138374013580475e-06, "loss": 0.3269, "num_input_tokens_seen": 21476192, "step": 9960 }, { "epoch": 1.8287759221875572, "grad_norm": 2.5308876037597656, "learning_rate": 9.142962011378236e-06, "loss": 0.2769, "num_input_tokens_seen": 21486336, "step": 9965 }, { "epoch": 1.8296935217471095, "grad_norm": 3.1429362297058105, "learning_rate": 9.147550009175996e-06, "loss": 0.2955, "num_input_tokens_seen": 21496704, "step": 9970 }, { "epoch": 1.8306111213066618, "grad_norm": 3.954237937927246, "learning_rate": 9.152138006973757e-06, "loss": 0.318, "num_input_tokens_seen": 21507232, "step": 9975 }, { "epoch": 1.8315287208662139, "grad_norm": 2.8027591705322266, "learning_rate": 9.156726004771519e-06, "loss": 0.3002, "num_input_tokens_seen": 21518336, "step": 9980 }, { "epoch": 1.8324463204257662, "grad_norm": 3.900028705596924, "learning_rate": 9.16131400256928e-06, "loss": 0.4089, "num_input_tokens_seen": 21528320, "step": 9985 }, { "epoch": 1.8333639199853184, "grad_norm": 2.368940830230713, "learning_rate": 9.16590200036704e-06, "loss": 0.2849, "num_input_tokens_seen": 21539488, "step": 9990 }, { "epoch": 1.8342815195448705, "grad_norm": 0.8637698888778687, "learning_rate": 9.170489998164802e-06, "loss": 0.2615, "num_input_tokens_seen": 21550944, "step": 9995 }, { "epoch": 1.8351991191044228, "grad_norm": 2.58439564704895, "learning_rate": 9.175077995962563e-06, "loss": 0.3309, "num_input_tokens_seen": 21561088, "step": 10000 }, { "epoch": 1.836116718663975, "grad_norm": 4.596208095550537, "learning_rate": 9.179665993760323e-06, "loss": 0.3188, "num_input_tokens_seen": 21572960, "step": 10005 }, { "epoch": 1.8370343182235271, "grad_norm": 3.661944627761841, "learning_rate": 9.184253991558085e-06, "loss": 0.3364, "num_input_tokens_seen": 21582560, "step": 10010 }, { "epoch": 1.8379519177830794, "grad_norm": 2.4702534675598145, "learning_rate": 9.188841989355846e-06, "loss": 0.3627, "num_input_tokens_seen": 21593696, "step": 10015 }, { "epoch": 1.8388695173426317, "grad_norm": 1.007191777229309, "learning_rate": 9.193429987153606e-06, "loss": 0.4306, "num_input_tokens_seen": 21604928, "step": 10020 }, { "epoch": 1.8397871169021838, "grad_norm": 2.256667375564575, "learning_rate": 9.198017984951369e-06, "loss": 0.2789, "num_input_tokens_seen": 21615968, "step": 10025 }, { "epoch": 1.840704716461736, "grad_norm": 1.495627760887146, "learning_rate": 9.20260598274913e-06, "loss": 0.2943, "num_input_tokens_seen": 21626400, "step": 10030 }, { "epoch": 1.8416223160212883, "grad_norm": 0.9454565644264221, "learning_rate": 9.20719398054689e-06, "loss": 0.2968, "num_input_tokens_seen": 21637664, "step": 10035 }, { "epoch": 1.8425399155808404, "grad_norm": 2.6142418384552, "learning_rate": 9.211781978344652e-06, "loss": 0.3357, "num_input_tokens_seen": 21647424, "step": 10040 }, { "epoch": 1.8434575151403927, "grad_norm": 4.291890621185303, "learning_rate": 9.216369976142412e-06, "loss": 0.2951, "num_input_tokens_seen": 21658720, "step": 10045 }, { "epoch": 1.844375114699945, "grad_norm": 9.602057456970215, "learning_rate": 9.220957973940173e-06, "loss": 0.3612, "num_input_tokens_seen": 21670368, "step": 10050 }, { "epoch": 1.845292714259497, "grad_norm": 1.6616839170455933, "learning_rate": 9.225545971737935e-06, "loss": 0.3518, "num_input_tokens_seen": 21680864, "step": 10055 }, { "epoch": 1.8462103138190493, "grad_norm": 1.3926076889038086, "learning_rate": 9.230133969535696e-06, "loss": 0.268, "num_input_tokens_seen": 21691456, "step": 10060 }, { "epoch": 1.8471279133786016, "grad_norm": 8.660700798034668, "learning_rate": 9.234721967333456e-06, "loss": 0.7087, "num_input_tokens_seen": 21700992, "step": 10065 }, { "epoch": 1.8480455129381537, "grad_norm": 2.7962288856506348, "learning_rate": 9.239309965131218e-06, "loss": 0.2792, "num_input_tokens_seen": 21711840, "step": 10070 }, { "epoch": 1.848963112497706, "grad_norm": 2.7613134384155273, "learning_rate": 9.243897962928979e-06, "loss": 0.3076, "num_input_tokens_seen": 21723680, "step": 10075 }, { "epoch": 1.8498807120572582, "grad_norm": 6.257632255554199, "learning_rate": 9.24848596072674e-06, "loss": 0.3527, "num_input_tokens_seen": 21734240, "step": 10080 }, { "epoch": 1.8507983116168103, "grad_norm": 3.6291615962982178, "learning_rate": 9.253073958524502e-06, "loss": 0.3218, "num_input_tokens_seen": 21745664, "step": 10085 }, { "epoch": 1.8517159111763626, "grad_norm": 2.6234192848205566, "learning_rate": 9.257661956322262e-06, "loss": 0.382, "num_input_tokens_seen": 21756384, "step": 10090 }, { "epoch": 1.8526335107359149, "grad_norm": 2.11533522605896, "learning_rate": 9.262249954120023e-06, "loss": 0.3055, "num_input_tokens_seen": 21766880, "step": 10095 }, { "epoch": 1.853551110295467, "grad_norm": 0.6191956400871277, "learning_rate": 9.266837951917785e-06, "loss": 0.3146, "num_input_tokens_seen": 21777664, "step": 10100 }, { "epoch": 1.8544687098550192, "grad_norm": 0.9936919212341309, "learning_rate": 9.271425949715545e-06, "loss": 0.316, "num_input_tokens_seen": 21788480, "step": 10105 }, { "epoch": 1.8553863094145715, "grad_norm": 2.9317405223846436, "learning_rate": 9.276013947513306e-06, "loss": 0.3108, "num_input_tokens_seen": 21799072, "step": 10110 }, { "epoch": 1.8563039089741236, "grad_norm": 6.9009623527526855, "learning_rate": 9.280601945311068e-06, "loss": 0.3563, "num_input_tokens_seen": 21809696, "step": 10115 }, { "epoch": 1.857221508533676, "grad_norm": 1.5975669622421265, "learning_rate": 9.285189943108827e-06, "loss": 0.3187, "num_input_tokens_seen": 21821184, "step": 10120 }, { "epoch": 1.8581391080932281, "grad_norm": 1.6152496337890625, "learning_rate": 9.289777940906589e-06, "loss": 0.3593, "num_input_tokens_seen": 21830816, "step": 10125 }, { "epoch": 1.8590567076527802, "grad_norm": 0.6879180073738098, "learning_rate": 9.294365938704351e-06, "loss": 0.323, "num_input_tokens_seen": 21841632, "step": 10130 }, { "epoch": 1.8599743072123327, "grad_norm": 1.6434332132339478, "learning_rate": 9.29895393650211e-06, "loss": 0.3787, "num_input_tokens_seen": 21853248, "step": 10135 }, { "epoch": 1.8608919067718848, "grad_norm": 1.1311471462249756, "learning_rate": 9.303541934299872e-06, "loss": 0.3157, "num_input_tokens_seen": 21864352, "step": 10140 }, { "epoch": 1.8618095063314368, "grad_norm": 1.1336692571640015, "learning_rate": 9.308129932097635e-06, "loss": 0.3069, "num_input_tokens_seen": 21874528, "step": 10145 }, { "epoch": 1.8627271058909893, "grad_norm": 0.9134458899497986, "learning_rate": 9.312717929895393e-06, "loss": 0.3194, "num_input_tokens_seen": 21885376, "step": 10150 }, { "epoch": 1.8636447054505414, "grad_norm": 2.4321939945220947, "learning_rate": 9.317305927693156e-06, "loss": 0.3013, "num_input_tokens_seen": 21896800, "step": 10155 }, { "epoch": 1.8645623050100935, "grad_norm": 1.9303629398345947, "learning_rate": 9.321893925490916e-06, "loss": 0.3459, "num_input_tokens_seen": 21906912, "step": 10160 }, { "epoch": 1.865479904569646, "grad_norm": 1.2431434392929077, "learning_rate": 9.326481923288677e-06, "loss": 0.3081, "num_input_tokens_seen": 21918624, "step": 10165 }, { "epoch": 1.866397504129198, "grad_norm": 2.2922277450561523, "learning_rate": 9.331069921086439e-06, "loss": 0.3882, "num_input_tokens_seen": 21929888, "step": 10170 }, { "epoch": 1.86731510368875, "grad_norm": 1.2089749574661255, "learning_rate": 9.3356579188842e-06, "loss": 0.2627, "num_input_tokens_seen": 21942240, "step": 10175 }, { "epoch": 1.8682327032483026, "grad_norm": 4.549960613250732, "learning_rate": 9.34024591668196e-06, "loss": 0.3089, "num_input_tokens_seen": 21953664, "step": 10180 }, { "epoch": 1.8691503028078547, "grad_norm": 0.7716321349143982, "learning_rate": 9.344833914479722e-06, "loss": 0.3572, "num_input_tokens_seen": 21964736, "step": 10185 }, { "epoch": 1.8700679023674067, "grad_norm": 8.180394172668457, "learning_rate": 9.349421912277483e-06, "loss": 0.3967, "num_input_tokens_seen": 21976256, "step": 10190 }, { "epoch": 1.8709855019269592, "grad_norm": 4.556398391723633, "learning_rate": 9.354009910075243e-06, "loss": 0.362, "num_input_tokens_seen": 21987168, "step": 10195 }, { "epoch": 1.8719031014865113, "grad_norm": 0.9611129760742188, "learning_rate": 9.358597907873005e-06, "loss": 0.2865, "num_input_tokens_seen": 21997632, "step": 10200 }, { "epoch": 1.8728207010460634, "grad_norm": 1.4691392183303833, "learning_rate": 9.363185905670766e-06, "loss": 0.3577, "num_input_tokens_seen": 22008480, "step": 10205 }, { "epoch": 1.8737383006056159, "grad_norm": 2.2135202884674072, "learning_rate": 9.367773903468526e-06, "loss": 0.337, "num_input_tokens_seen": 22018080, "step": 10210 }, { "epoch": 1.874655900165168, "grad_norm": 3.668299436569214, "learning_rate": 9.372361901266289e-06, "loss": 0.3137, "num_input_tokens_seen": 22029440, "step": 10215 }, { "epoch": 1.87557349972472, "grad_norm": 3.8641040325164795, "learning_rate": 9.376949899064049e-06, "loss": 0.3632, "num_input_tokens_seen": 22039456, "step": 10220 }, { "epoch": 1.8764910992842725, "grad_norm": 2.2303199768066406, "learning_rate": 9.38153789686181e-06, "loss": 0.3107, "num_input_tokens_seen": 22048576, "step": 10225 }, { "epoch": 1.8774086988438246, "grad_norm": 2.45701003074646, "learning_rate": 9.386125894659572e-06, "loss": 0.2951, "num_input_tokens_seen": 22058784, "step": 10230 }, { "epoch": 1.8783262984033766, "grad_norm": 2.5551400184631348, "learning_rate": 9.390713892457332e-06, "loss": 0.2225, "num_input_tokens_seen": 22069824, "step": 10235 }, { "epoch": 1.8792438979629291, "grad_norm": 4.184276103973389, "learning_rate": 9.395301890255093e-06, "loss": 0.3276, "num_input_tokens_seen": 22080704, "step": 10240 }, { "epoch": 1.8801614975224812, "grad_norm": 1.0418741703033447, "learning_rate": 9.399889888052855e-06, "loss": 0.3356, "num_input_tokens_seen": 22091136, "step": 10245 }, { "epoch": 1.8810790970820332, "grad_norm": 8.298624992370605, "learning_rate": 9.404477885850616e-06, "loss": 0.4182, "num_input_tokens_seen": 22102688, "step": 10250 }, { "epoch": 1.8819966966415858, "grad_norm": 0.991019606590271, "learning_rate": 9.409065883648376e-06, "loss": 0.3652, "num_input_tokens_seen": 22113632, "step": 10255 }, { "epoch": 1.8829142962011378, "grad_norm": 7.322532653808594, "learning_rate": 9.413653881446138e-06, "loss": 0.379, "num_input_tokens_seen": 22123648, "step": 10260 }, { "epoch": 1.8838318957606899, "grad_norm": 4.385741233825684, "learning_rate": 9.418241879243899e-06, "loss": 0.2951, "num_input_tokens_seen": 22134144, "step": 10265 }, { "epoch": 1.8847494953202424, "grad_norm": 2.4128918647766113, "learning_rate": 9.42282987704166e-06, "loss": 0.2017, "num_input_tokens_seen": 22145184, "step": 10270 }, { "epoch": 1.8856670948797944, "grad_norm": 0.4612635374069214, "learning_rate": 9.427417874839421e-06, "loss": 0.3304, "num_input_tokens_seen": 22154912, "step": 10275 }, { "epoch": 1.8865846944393465, "grad_norm": 0.9699788093566895, "learning_rate": 9.432005872637182e-06, "loss": 0.2605, "num_input_tokens_seen": 22164896, "step": 10280 }, { "epoch": 1.887502293998899, "grad_norm": 1.8383690118789673, "learning_rate": 9.436593870434943e-06, "loss": 0.308, "num_input_tokens_seen": 22176128, "step": 10285 }, { "epoch": 1.888419893558451, "grad_norm": 1.9136704206466675, "learning_rate": 9.441181868232705e-06, "loss": 0.2372, "num_input_tokens_seen": 22186080, "step": 10290 }, { "epoch": 1.8893374931180031, "grad_norm": 1.1957842111587524, "learning_rate": 9.445769866030465e-06, "loss": 0.2963, "num_input_tokens_seen": 22197536, "step": 10295 }, { "epoch": 1.8902550926775556, "grad_norm": 0.9299176335334778, "learning_rate": 9.450357863828226e-06, "loss": 0.2393, "num_input_tokens_seen": 22207424, "step": 10300 }, { "epoch": 1.8911726922371077, "grad_norm": 0.5789614915847778, "learning_rate": 9.454945861625986e-06, "loss": 0.3832, "num_input_tokens_seen": 22217376, "step": 10305 }, { "epoch": 1.8920902917966598, "grad_norm": 1.74385666847229, "learning_rate": 9.459533859423748e-06, "loss": 0.3969, "num_input_tokens_seen": 22229600, "step": 10310 }, { "epoch": 1.8930078913562123, "grad_norm": 2.9632670879364014, "learning_rate": 9.464121857221509e-06, "loss": 0.386, "num_input_tokens_seen": 22241248, "step": 10315 }, { "epoch": 1.8939254909157643, "grad_norm": 1.087584137916565, "learning_rate": 9.46870985501927e-06, "loss": 0.3263, "num_input_tokens_seen": 22251232, "step": 10320 }, { "epoch": 1.8948430904753166, "grad_norm": 1.6336625814437866, "learning_rate": 9.473297852817032e-06, "loss": 0.2522, "num_input_tokens_seen": 22261504, "step": 10325 }, { "epoch": 1.895760690034869, "grad_norm": 0.8873095512390137, "learning_rate": 9.477885850614792e-06, "loss": 0.3807, "num_input_tokens_seen": 22271424, "step": 10330 }, { "epoch": 1.896678289594421, "grad_norm": 8.4276762008667, "learning_rate": 9.482473848412553e-06, "loss": 0.3638, "num_input_tokens_seen": 22282016, "step": 10335 }, { "epoch": 1.8975958891539733, "grad_norm": 1.5133029222488403, "learning_rate": 9.487061846210315e-06, "loss": 0.2429, "num_input_tokens_seen": 22292960, "step": 10340 }, { "epoch": 1.8985134887135255, "grad_norm": 0.9139063358306885, "learning_rate": 9.491649844008075e-06, "loss": 0.3864, "num_input_tokens_seen": 22303136, "step": 10345 }, { "epoch": 1.8994310882730776, "grad_norm": 1.932857871055603, "learning_rate": 9.496237841805836e-06, "loss": 0.2182, "num_input_tokens_seen": 22315040, "step": 10350 }, { "epoch": 1.90034868783263, "grad_norm": 9.354880332946777, "learning_rate": 9.500825839603598e-06, "loss": 0.3806, "num_input_tokens_seen": 22326208, "step": 10355 }, { "epoch": 1.9012662873921822, "grad_norm": 8.036694526672363, "learning_rate": 9.505413837401359e-06, "loss": 0.3444, "num_input_tokens_seen": 22336832, "step": 10360 }, { "epoch": 1.9021838869517342, "grad_norm": 0.7624452114105225, "learning_rate": 9.51000183519912e-06, "loss": 0.2892, "num_input_tokens_seen": 22346464, "step": 10365 }, { "epoch": 1.9031014865112865, "grad_norm": 1.867435336112976, "learning_rate": 9.514589832996881e-06, "loss": 0.3255, "num_input_tokens_seen": 22357568, "step": 10370 }, { "epoch": 1.9040190860708388, "grad_norm": 1.0513579845428467, "learning_rate": 9.519177830794642e-06, "loss": 0.3856, "num_input_tokens_seen": 22368160, "step": 10375 }, { "epoch": 1.9049366856303909, "grad_norm": 1.0188751220703125, "learning_rate": 9.523765828592402e-06, "loss": 0.3245, "num_input_tokens_seen": 22380160, "step": 10380 }, { "epoch": 1.9058542851899432, "grad_norm": 1.4458202123641968, "learning_rate": 9.528353826390165e-06, "loss": 0.3051, "num_input_tokens_seen": 22390272, "step": 10385 }, { "epoch": 1.9067718847494954, "grad_norm": 0.8348124623298645, "learning_rate": 9.532941824187925e-06, "loss": 0.2892, "num_input_tokens_seen": 22401184, "step": 10390 }, { "epoch": 1.9076894843090475, "grad_norm": 1.1208702325820923, "learning_rate": 9.537529821985686e-06, "loss": 0.2945, "num_input_tokens_seen": 22411040, "step": 10395 }, { "epoch": 1.9086070838685998, "grad_norm": 2.4632468223571777, "learning_rate": 9.542117819783448e-06, "loss": 0.3022, "num_input_tokens_seen": 22421344, "step": 10400 }, { "epoch": 1.909524683428152, "grad_norm": 2.3098456859588623, "learning_rate": 9.546705817581208e-06, "loss": 0.4219, "num_input_tokens_seen": 22431488, "step": 10405 }, { "epoch": 1.9104422829877041, "grad_norm": 1.1794742345809937, "learning_rate": 9.551293815378969e-06, "loss": 0.4322, "num_input_tokens_seen": 22442880, "step": 10410 }, { "epoch": 1.9113598825472564, "grad_norm": 1.4602118730545044, "learning_rate": 9.555881813176731e-06, "loss": 0.3269, "num_input_tokens_seen": 22453440, "step": 10415 }, { "epoch": 1.9122774821068087, "grad_norm": 2.400806427001953, "learning_rate": 9.560469810974492e-06, "loss": 0.2672, "num_input_tokens_seen": 22463104, "step": 10420 }, { "epoch": 1.9131950816663608, "grad_norm": 1.3230847120285034, "learning_rate": 9.565057808772252e-06, "loss": 0.3448, "num_input_tokens_seen": 22474720, "step": 10425 }, { "epoch": 1.914112681225913, "grad_norm": 2.6625823974609375, "learning_rate": 9.569645806570014e-06, "loss": 0.364, "num_input_tokens_seen": 22486208, "step": 10430 }, { "epoch": 1.9150302807854653, "grad_norm": 1.4088860750198364, "learning_rate": 9.574233804367775e-06, "loss": 0.3023, "num_input_tokens_seen": 22496800, "step": 10435 }, { "epoch": 1.9159478803450174, "grad_norm": 9.400765419006348, "learning_rate": 9.578821802165535e-06, "loss": 0.3618, "num_input_tokens_seen": 22507264, "step": 10440 }, { "epoch": 1.9168654799045697, "grad_norm": 1.6322929859161377, "learning_rate": 9.583409799963298e-06, "loss": 0.3019, "num_input_tokens_seen": 22517216, "step": 10445 }, { "epoch": 1.917783079464122, "grad_norm": 6.4942426681518555, "learning_rate": 9.587997797761058e-06, "loss": 0.3173, "num_input_tokens_seen": 22527360, "step": 10450 }, { "epoch": 1.918700679023674, "grad_norm": 0.9207500219345093, "learning_rate": 9.592585795558819e-06, "loss": 0.3678, "num_input_tokens_seen": 22538464, "step": 10455 }, { "epoch": 1.9196182785832263, "grad_norm": 1.549501895904541, "learning_rate": 9.597173793356581e-06, "loss": 0.2955, "num_input_tokens_seen": 22548672, "step": 10460 }, { "epoch": 1.9205358781427786, "grad_norm": 1.9524619579315186, "learning_rate": 9.601761791154341e-06, "loss": 0.3453, "num_input_tokens_seen": 22560768, "step": 10465 }, { "epoch": 1.9214534777023307, "grad_norm": 1.6192138195037842, "learning_rate": 9.606349788952102e-06, "loss": 0.4221, "num_input_tokens_seen": 22571104, "step": 10470 }, { "epoch": 1.922371077261883, "grad_norm": 2.8128230571746826, "learning_rate": 9.610937786749862e-06, "loss": 0.2797, "num_input_tokens_seen": 22581568, "step": 10475 }, { "epoch": 1.9232886768214352, "grad_norm": 2.570138692855835, "learning_rate": 9.615525784547625e-06, "loss": 0.3209, "num_input_tokens_seen": 22593024, "step": 10480 }, { "epoch": 1.9242062763809873, "grad_norm": 10.675296783447266, "learning_rate": 9.620113782345385e-06, "loss": 0.4304, "num_input_tokens_seen": 22603520, "step": 10485 }, { "epoch": 1.9251238759405396, "grad_norm": 3.473890781402588, "learning_rate": 9.624701780143146e-06, "loss": 0.3465, "num_input_tokens_seen": 22614144, "step": 10490 }, { "epoch": 1.9260414755000919, "grad_norm": 3.511413335800171, "learning_rate": 9.629289777940908e-06, "loss": 0.4045, "num_input_tokens_seen": 22625152, "step": 10495 }, { "epoch": 1.926959075059644, "grad_norm": 4.773426532745361, "learning_rate": 9.633877775738668e-06, "loss": 0.3735, "num_input_tokens_seen": 22635680, "step": 10500 }, { "epoch": 1.9278766746191962, "grad_norm": 2.153268575668335, "learning_rate": 9.638465773536429e-06, "loss": 0.3268, "num_input_tokens_seen": 22644704, "step": 10505 }, { "epoch": 1.9287942741787485, "grad_norm": 4.726211071014404, "learning_rate": 9.643053771334191e-06, "loss": 0.317, "num_input_tokens_seen": 22654912, "step": 10510 }, { "epoch": 1.9297118737383006, "grad_norm": 3.4360852241516113, "learning_rate": 9.647641769131952e-06, "loss": 0.347, "num_input_tokens_seen": 22666752, "step": 10515 }, { "epoch": 1.9306294732978528, "grad_norm": 4.57522439956665, "learning_rate": 9.652229766929712e-06, "loss": 0.3245, "num_input_tokens_seen": 22675520, "step": 10520 }, { "epoch": 1.9315470728574051, "grad_norm": 3.485556125640869, "learning_rate": 9.656817764727474e-06, "loss": 0.3844, "num_input_tokens_seen": 22685920, "step": 10525 }, { "epoch": 1.9324646724169572, "grad_norm": 1.9213727712631226, "learning_rate": 9.661405762525235e-06, "loss": 0.279, "num_input_tokens_seen": 22696576, "step": 10530 }, { "epoch": 1.9333822719765095, "grad_norm": 0.9822983145713806, "learning_rate": 9.665993760322995e-06, "loss": 0.25, "num_input_tokens_seen": 22708192, "step": 10535 }, { "epoch": 1.9342998715360618, "grad_norm": 1.2526074647903442, "learning_rate": 9.670581758120758e-06, "loss": 0.3399, "num_input_tokens_seen": 22719552, "step": 10540 }, { "epoch": 1.9352174710956138, "grad_norm": 9.29358959197998, "learning_rate": 9.675169755918518e-06, "loss": 0.4337, "num_input_tokens_seen": 22728992, "step": 10545 }, { "epoch": 1.936135070655166, "grad_norm": 5.862563610076904, "learning_rate": 9.679757753716279e-06, "loss": 0.2785, "num_input_tokens_seen": 22739040, "step": 10550 }, { "epoch": 1.9370526702147184, "grad_norm": 2.0716328620910645, "learning_rate": 9.68434575151404e-06, "loss": 0.1835, "num_input_tokens_seen": 22750112, "step": 10555 }, { "epoch": 1.9379702697742704, "grad_norm": 5.8044352531433105, "learning_rate": 9.688933749311801e-06, "loss": 0.3913, "num_input_tokens_seen": 22761120, "step": 10560 }, { "epoch": 1.9388878693338227, "grad_norm": 4.501874923706055, "learning_rate": 9.693521747109562e-06, "loss": 0.3164, "num_input_tokens_seen": 22770272, "step": 10565 }, { "epoch": 1.939805468893375, "grad_norm": 2.5798985958099365, "learning_rate": 9.698109744907324e-06, "loss": 0.2584, "num_input_tokens_seen": 22780512, "step": 10570 }, { "epoch": 1.940723068452927, "grad_norm": 6.757374286651611, "learning_rate": 9.702697742705085e-06, "loss": 0.3084, "num_input_tokens_seen": 22791584, "step": 10575 }, { "epoch": 1.9416406680124794, "grad_norm": 3.8409671783447266, "learning_rate": 9.707285740502845e-06, "loss": 0.3778, "num_input_tokens_seen": 22802944, "step": 10580 }, { "epoch": 1.9425582675720316, "grad_norm": 1.3965508937835693, "learning_rate": 9.711873738300607e-06, "loss": 0.3756, "num_input_tokens_seen": 22813472, "step": 10585 }, { "epoch": 1.9434758671315837, "grad_norm": 1.0070239305496216, "learning_rate": 9.716461736098368e-06, "loss": 0.2464, "num_input_tokens_seen": 22824192, "step": 10590 }, { "epoch": 1.944393466691136, "grad_norm": 1.1250613927841187, "learning_rate": 9.721049733896128e-06, "loss": 0.2893, "num_input_tokens_seen": 22836800, "step": 10595 }, { "epoch": 1.9453110662506883, "grad_norm": 1.1230617761611938, "learning_rate": 9.72563773169389e-06, "loss": 0.3276, "num_input_tokens_seen": 22847136, "step": 10600 }, { "epoch": 1.9462286658102403, "grad_norm": 1.8634980916976929, "learning_rate": 9.730225729491651e-06, "loss": 0.4253, "num_input_tokens_seen": 22858688, "step": 10605 }, { "epoch": 1.9471462653697926, "grad_norm": 3.3812952041625977, "learning_rate": 9.734813727289412e-06, "loss": 0.3494, "num_input_tokens_seen": 22870944, "step": 10610 }, { "epoch": 1.948063864929345, "grad_norm": 0.8490012884140015, "learning_rate": 9.739401725087174e-06, "loss": 0.2761, "num_input_tokens_seen": 22880864, "step": 10615 }, { "epoch": 1.948981464488897, "grad_norm": 1.2943123579025269, "learning_rate": 9.743989722884933e-06, "loss": 0.3296, "num_input_tokens_seen": 22891232, "step": 10620 }, { "epoch": 1.9498990640484493, "grad_norm": 4.732146739959717, "learning_rate": 9.748577720682695e-06, "loss": 0.358, "num_input_tokens_seen": 22902240, "step": 10625 }, { "epoch": 1.9508166636080015, "grad_norm": 1.2595455646514893, "learning_rate": 9.753165718480457e-06, "loss": 0.2767, "num_input_tokens_seen": 22913440, "step": 10630 }, { "epoch": 1.9517342631675536, "grad_norm": 2.1280415058135986, "learning_rate": 9.757753716278216e-06, "loss": 0.4883, "num_input_tokens_seen": 22923968, "step": 10635 }, { "epoch": 1.952651862727106, "grad_norm": 2.808117628097534, "learning_rate": 9.762341714075978e-06, "loss": 0.3673, "num_input_tokens_seen": 22932704, "step": 10640 }, { "epoch": 1.9535694622866582, "grad_norm": 2.3157410621643066, "learning_rate": 9.76692971187374e-06, "loss": 0.2677, "num_input_tokens_seen": 22944384, "step": 10645 }, { "epoch": 1.9544870618462102, "grad_norm": 9.07361125946045, "learning_rate": 9.771517709671499e-06, "loss": 0.3537, "num_input_tokens_seen": 22955328, "step": 10650 }, { "epoch": 1.9554046614057625, "grad_norm": 1.7219958305358887, "learning_rate": 9.776105707469261e-06, "loss": 0.3823, "num_input_tokens_seen": 22966048, "step": 10655 }, { "epoch": 1.9563222609653148, "grad_norm": 7.441725730895996, "learning_rate": 9.780693705267022e-06, "loss": 0.4095, "num_input_tokens_seen": 22977312, "step": 10660 }, { "epoch": 1.9572398605248669, "grad_norm": 1.6956560611724854, "learning_rate": 9.785281703064782e-06, "loss": 0.3434, "num_input_tokens_seen": 22986464, "step": 10665 }, { "epoch": 1.9581574600844192, "grad_norm": 5.025102138519287, "learning_rate": 9.789869700862545e-06, "loss": 0.367, "num_input_tokens_seen": 22995936, "step": 10670 }, { "epoch": 1.9590750596439714, "grad_norm": 1.949173092842102, "learning_rate": 9.794457698660305e-06, "loss": 0.3905, "num_input_tokens_seen": 23005504, "step": 10675 }, { "epoch": 1.9599926592035235, "grad_norm": 6.169823169708252, "learning_rate": 9.799045696458066e-06, "loss": 0.4624, "num_input_tokens_seen": 23017760, "step": 10680 }, { "epoch": 1.9609102587630758, "grad_norm": 1.9425327777862549, "learning_rate": 9.803633694255828e-06, "loss": 0.3863, "num_input_tokens_seen": 23028768, "step": 10685 }, { "epoch": 1.961827858322628, "grad_norm": 1.6514354944229126, "learning_rate": 9.808221692053588e-06, "loss": 0.2931, "num_input_tokens_seen": 23039104, "step": 10690 }, { "epoch": 1.9627454578821801, "grad_norm": 1.7296432256698608, "learning_rate": 9.812809689851349e-06, "loss": 0.3257, "num_input_tokens_seen": 23050592, "step": 10695 }, { "epoch": 1.9636630574417324, "grad_norm": 1.066807508468628, "learning_rate": 9.817397687649111e-06, "loss": 0.328, "num_input_tokens_seen": 23061024, "step": 10700 }, { "epoch": 1.9645806570012847, "grad_norm": 0.9876201748847961, "learning_rate": 9.821985685446871e-06, "loss": 0.3153, "num_input_tokens_seen": 23070752, "step": 10705 }, { "epoch": 1.9654982565608368, "grad_norm": 4.005167007446289, "learning_rate": 9.826573683244632e-06, "loss": 0.3986, "num_input_tokens_seen": 23081952, "step": 10710 }, { "epoch": 1.966415856120389, "grad_norm": 1.4988758563995361, "learning_rate": 9.831161681042394e-06, "loss": 0.2883, "num_input_tokens_seen": 23092992, "step": 10715 }, { "epoch": 1.9673334556799413, "grad_norm": 0.9596266746520996, "learning_rate": 9.835749678840155e-06, "loss": 0.3132, "num_input_tokens_seen": 23103520, "step": 10720 }, { "epoch": 1.9682510552394934, "grad_norm": 1.2389028072357178, "learning_rate": 9.840337676637915e-06, "loss": 0.2444, "num_input_tokens_seen": 23115104, "step": 10725 }, { "epoch": 1.9691686547990457, "grad_norm": 1.9725533723831177, "learning_rate": 9.844925674435677e-06, "loss": 0.3052, "num_input_tokens_seen": 23126944, "step": 10730 }, { "epoch": 1.970086254358598, "grad_norm": 3.2057507038116455, "learning_rate": 9.849513672233438e-06, "loss": 0.4181, "num_input_tokens_seen": 23139264, "step": 10735 }, { "epoch": 1.97100385391815, "grad_norm": 5.248388290405273, "learning_rate": 9.854101670031198e-06, "loss": 0.2876, "num_input_tokens_seen": 23149696, "step": 10740 }, { "epoch": 1.9719214534777023, "grad_norm": 5.8271870613098145, "learning_rate": 9.85868966782896e-06, "loss": 0.3033, "num_input_tokens_seen": 23160768, "step": 10745 }, { "epoch": 1.9728390530372546, "grad_norm": 0.853024423122406, "learning_rate": 9.863277665626721e-06, "loss": 0.3729, "num_input_tokens_seen": 23169824, "step": 10750 }, { "epoch": 1.9737566525968067, "grad_norm": 1.0931603908538818, "learning_rate": 9.867865663424482e-06, "loss": 0.3173, "num_input_tokens_seen": 23181824, "step": 10755 }, { "epoch": 1.974674252156359, "grad_norm": 2.3445682525634766, "learning_rate": 9.872453661222244e-06, "loss": 0.2783, "num_input_tokens_seen": 23193024, "step": 10760 }, { "epoch": 1.9755918517159112, "grad_norm": 3.731905221939087, "learning_rate": 9.877041659020004e-06, "loss": 0.2925, "num_input_tokens_seen": 23203456, "step": 10765 }, { "epoch": 1.9765094512754633, "grad_norm": 1.9806127548217773, "learning_rate": 9.881629656817765e-06, "loss": 0.4112, "num_input_tokens_seen": 23213792, "step": 10770 }, { "epoch": 1.9774270508350156, "grad_norm": 2.3782081604003906, "learning_rate": 9.886217654615527e-06, "loss": 0.4019, "num_input_tokens_seen": 23225184, "step": 10775 }, { "epoch": 1.9783446503945679, "grad_norm": 1.0268526077270508, "learning_rate": 9.890805652413288e-06, "loss": 0.3989, "num_input_tokens_seen": 23236320, "step": 10780 }, { "epoch": 1.97926224995412, "grad_norm": 0.8816027641296387, "learning_rate": 9.89539365021105e-06, "loss": 0.3616, "num_input_tokens_seen": 23248160, "step": 10785 }, { "epoch": 1.9801798495136722, "grad_norm": 3.804442882537842, "learning_rate": 9.899981648008809e-06, "loss": 0.2854, "num_input_tokens_seen": 23260512, "step": 10790 }, { "epoch": 1.9810974490732245, "grad_norm": 1.152321696281433, "learning_rate": 9.904569645806571e-06, "loss": 0.2621, "num_input_tokens_seen": 23271360, "step": 10795 }, { "epoch": 1.9820150486327766, "grad_norm": 5.056352615356445, "learning_rate": 9.909157643604333e-06, "loss": 0.3489, "num_input_tokens_seen": 23281920, "step": 10800 }, { "epoch": 1.9829326481923288, "grad_norm": 2.0648176670074463, "learning_rate": 9.913745641402092e-06, "loss": 0.3277, "num_input_tokens_seen": 23292992, "step": 10805 }, { "epoch": 1.9838502477518811, "grad_norm": 1.932327389717102, "learning_rate": 9.918333639199854e-06, "loss": 0.2986, "num_input_tokens_seen": 23303936, "step": 10810 }, { "epoch": 1.9847678473114332, "grad_norm": 1.6671842336654663, "learning_rate": 9.922921636997616e-06, "loss": 0.3061, "num_input_tokens_seen": 23315328, "step": 10815 }, { "epoch": 1.9856854468709855, "grad_norm": 1.367178201675415, "learning_rate": 9.927509634795375e-06, "loss": 0.2636, "num_input_tokens_seen": 23326048, "step": 10820 }, { "epoch": 1.9866030464305378, "grad_norm": 6.016845226287842, "learning_rate": 9.932097632593137e-06, "loss": 0.3685, "num_input_tokens_seen": 23337440, "step": 10825 }, { "epoch": 1.9875206459900898, "grad_norm": 0.6338575482368469, "learning_rate": 9.936685630390898e-06, "loss": 0.2977, "num_input_tokens_seen": 23347488, "step": 10830 }, { "epoch": 1.988438245549642, "grad_norm": 4.590443134307861, "learning_rate": 9.941273628188658e-06, "loss": 0.3129, "num_input_tokens_seen": 23357728, "step": 10835 }, { "epoch": 1.9893558451091944, "grad_norm": 3.4536263942718506, "learning_rate": 9.94586162598642e-06, "loss": 0.3186, "num_input_tokens_seen": 23367328, "step": 10840 }, { "epoch": 1.9902734446687464, "grad_norm": 2.874544143676758, "learning_rate": 9.950449623784181e-06, "loss": 0.3109, "num_input_tokens_seen": 23377664, "step": 10845 }, { "epoch": 1.9911910442282987, "grad_norm": 3.048105239868164, "learning_rate": 9.955037621581942e-06, "loss": 0.3993, "num_input_tokens_seen": 23390400, "step": 10850 }, { "epoch": 1.992108643787851, "grad_norm": 0.8302862644195557, "learning_rate": 9.959625619379704e-06, "loss": 0.4028, "num_input_tokens_seen": 23400608, "step": 10855 }, { "epoch": 1.993026243347403, "grad_norm": 6.725632667541504, "learning_rate": 9.964213617177464e-06, "loss": 0.2882, "num_input_tokens_seen": 23411712, "step": 10860 }, { "epoch": 1.9939438429069554, "grad_norm": 0.8256027698516846, "learning_rate": 9.968801614975225e-06, "loss": 0.32, "num_input_tokens_seen": 23421568, "step": 10865 }, { "epoch": 1.9948614424665077, "grad_norm": 1.0969101190567017, "learning_rate": 9.973389612772987e-06, "loss": 0.324, "num_input_tokens_seen": 23431424, "step": 10870 }, { "epoch": 1.9957790420260597, "grad_norm": 1.5949487686157227, "learning_rate": 9.977977610570748e-06, "loss": 0.3288, "num_input_tokens_seen": 23442048, "step": 10875 }, { "epoch": 1.996696641585612, "grad_norm": 2.391688108444214, "learning_rate": 9.982565608368508e-06, "loss": 0.3986, "num_input_tokens_seen": 23452384, "step": 10880 }, { "epoch": 1.9976142411451643, "grad_norm": 1.350221872329712, "learning_rate": 9.98715360616627e-06, "loss": 0.3366, "num_input_tokens_seen": 23463360, "step": 10885 }, { "epoch": 1.9985318407047163, "grad_norm": 8.899313926696777, "learning_rate": 9.991741603964031e-06, "loss": 0.3541, "num_input_tokens_seen": 23475488, "step": 10890 }, { "epoch": 1.9994494402642686, "grad_norm": 1.3494046926498413, "learning_rate": 9.996329601761791e-06, "loss": 0.3159, "num_input_tokens_seen": 23486624, "step": 10895 }, { "epoch": 2.0, "eval_loss": 0.3301483988761902, "eval_runtime": 179.4306, "eval_samples_per_second": 30.368, "eval_steps_per_second": 7.596, "num_input_tokens_seen": 23491904, "step": 10898 }, { "epoch": 2.000367039823821, "grad_norm": 7.5278544425964355, "learning_rate": 9.999999997435156e-06, "loss": 0.394, "num_input_tokens_seen": 23495040, "step": 10900 }, { "epoch": 2.001284639383373, "grad_norm": 0.6199629306793213, "learning_rate": 9.999999907665581e-06, "loss": 0.3485, "num_input_tokens_seen": 23507232, "step": 10905 }, { "epoch": 2.0022022389429255, "grad_norm": 2.0813310146331787, "learning_rate": 9.999999689653756e-06, "loss": 0.4224, "num_input_tokens_seen": 23517952, "step": 10910 }, { "epoch": 2.0031198385024775, "grad_norm": 3.15898060798645, "learning_rate": 9.99999934339969e-06, "loss": 0.288, "num_input_tokens_seen": 23529952, "step": 10915 }, { "epoch": 2.0040374380620296, "grad_norm": 4.3949737548828125, "learning_rate": 9.99999886890339e-06, "loss": 0.2385, "num_input_tokens_seen": 23540768, "step": 10920 }, { "epoch": 2.004955037621582, "grad_norm": 2.2123982906341553, "learning_rate": 9.999998266164868e-06, "loss": 0.3849, "num_input_tokens_seen": 23552800, "step": 10925 }, { "epoch": 2.005872637181134, "grad_norm": 1.7239412069320679, "learning_rate": 9.99999753518414e-06, "loss": 0.3125, "num_input_tokens_seen": 23565024, "step": 10930 }, { "epoch": 2.0067902367406862, "grad_norm": 5.406741619110107, "learning_rate": 9.999996675961223e-06, "loss": 0.4054, "num_input_tokens_seen": 23576384, "step": 10935 }, { "epoch": 2.0077078363002387, "grad_norm": 1.3175380229949951, "learning_rate": 9.999995688496142e-06, "loss": 0.349, "num_input_tokens_seen": 23586944, "step": 10940 }, { "epoch": 2.008625435859791, "grad_norm": 1.0134775638580322, "learning_rate": 9.999994572788922e-06, "loss": 0.3103, "num_input_tokens_seen": 23597888, "step": 10945 }, { "epoch": 2.009543035419343, "grad_norm": 1.439016580581665, "learning_rate": 9.999993328839588e-06, "loss": 0.3835, "num_input_tokens_seen": 23609504, "step": 10950 }, { "epoch": 2.0104606349788954, "grad_norm": 7.776140213012695, "learning_rate": 9.999991956648177e-06, "loss": 0.3648, "num_input_tokens_seen": 23618528, "step": 10955 }, { "epoch": 2.0113782345384474, "grad_norm": 1.512137770652771, "learning_rate": 9.999990456214719e-06, "loss": 0.2824, "num_input_tokens_seen": 23628736, "step": 10960 }, { "epoch": 2.0122958340979995, "grad_norm": 1.8532624244689941, "learning_rate": 9.999988827539256e-06, "loss": 0.2401, "num_input_tokens_seen": 23638848, "step": 10965 }, { "epoch": 2.013213433657552, "grad_norm": 6.272525310516357, "learning_rate": 9.999987070621831e-06, "loss": 0.3125, "num_input_tokens_seen": 23649664, "step": 10970 }, { "epoch": 2.014131033217104, "grad_norm": 2.338796854019165, "learning_rate": 9.999985185462483e-06, "loss": 0.2905, "num_input_tokens_seen": 23661472, "step": 10975 }, { "epoch": 2.015048632776656, "grad_norm": 8.004322052001953, "learning_rate": 9.999983172061268e-06, "loss": 0.4135, "num_input_tokens_seen": 23673632, "step": 10980 }, { "epoch": 2.0159662323362086, "grad_norm": 1.5982537269592285, "learning_rate": 9.999981030418231e-06, "loss": 0.2604, "num_input_tokens_seen": 23683552, "step": 10985 }, { "epoch": 2.0168838318957607, "grad_norm": 0.6489207744598389, "learning_rate": 9.999978760533432e-06, "loss": 0.4826, "num_input_tokens_seen": 23694304, "step": 10990 }, { "epoch": 2.0178014314553128, "grad_norm": 0.9486220479011536, "learning_rate": 9.999976362406924e-06, "loss": 0.2954, "num_input_tokens_seen": 23705120, "step": 10995 }, { "epoch": 2.0187190310148653, "grad_norm": 2.3075435161590576, "learning_rate": 9.999973836038775e-06, "loss": 0.3548, "num_input_tokens_seen": 23715232, "step": 11000 }, { "epoch": 2.0196366305744173, "grad_norm": 2.6073741912841797, "learning_rate": 9.999971181429045e-06, "loss": 0.2801, "num_input_tokens_seen": 23726400, "step": 11005 }, { "epoch": 2.0205542301339694, "grad_norm": 1.1359405517578125, "learning_rate": 9.999968398577804e-06, "loss": 0.2581, "num_input_tokens_seen": 23737248, "step": 11010 }, { "epoch": 2.021471829693522, "grad_norm": 0.7732378840446472, "learning_rate": 9.999965487485122e-06, "loss": 0.3094, "num_input_tokens_seen": 23747712, "step": 11015 }, { "epoch": 2.022389429253074, "grad_norm": 5.574945449829102, "learning_rate": 9.999962448151075e-06, "loss": 0.3721, "num_input_tokens_seen": 23757760, "step": 11020 }, { "epoch": 2.023307028812626, "grad_norm": 0.6935253739356995, "learning_rate": 9.999959280575739e-06, "loss": 0.3771, "num_input_tokens_seen": 23769024, "step": 11025 }, { "epoch": 2.0242246283721785, "grad_norm": 1.0466097593307495, "learning_rate": 9.9999559847592e-06, "loss": 0.3779, "num_input_tokens_seen": 23779680, "step": 11030 }, { "epoch": 2.0251422279317306, "grad_norm": 1.2552459239959717, "learning_rate": 9.999952560701536e-06, "loss": 0.3096, "num_input_tokens_seen": 23790400, "step": 11035 }, { "epoch": 2.0260598274912827, "grad_norm": 2.2210161685943604, "learning_rate": 9.99994900840284e-06, "loss": 0.2908, "num_input_tokens_seen": 23802144, "step": 11040 }, { "epoch": 2.026977427050835, "grad_norm": 1.3046022653579712, "learning_rate": 9.9999453278632e-06, "loss": 0.3471, "num_input_tokens_seen": 23812512, "step": 11045 }, { "epoch": 2.0278950266103872, "grad_norm": 6.057827949523926, "learning_rate": 9.999941519082713e-06, "loss": 0.3167, "num_input_tokens_seen": 23822752, "step": 11050 }, { "epoch": 2.0288126261699393, "grad_norm": 9.638723373413086, "learning_rate": 9.999937582061472e-06, "loss": 0.3628, "num_input_tokens_seen": 23834240, "step": 11055 }, { "epoch": 2.029730225729492, "grad_norm": 10.713582992553711, "learning_rate": 9.999933516799584e-06, "loss": 0.3896, "num_input_tokens_seen": 23844640, "step": 11060 }, { "epoch": 2.030647825289044, "grad_norm": 0.993120014667511, "learning_rate": 9.999929323297151e-06, "loss": 0.3068, "num_input_tokens_seen": 23855616, "step": 11065 }, { "epoch": 2.031565424848596, "grad_norm": 0.7972759008407593, "learning_rate": 9.999925001554277e-06, "loss": 0.4266, "num_input_tokens_seen": 23866432, "step": 11070 }, { "epoch": 2.0324830244081484, "grad_norm": 1.2319600582122803, "learning_rate": 9.99992055157108e-06, "loss": 0.2667, "num_input_tokens_seen": 23877216, "step": 11075 }, { "epoch": 2.0334006239677005, "grad_norm": 0.8460760116577148, "learning_rate": 9.999915973347667e-06, "loss": 0.2511, "num_input_tokens_seen": 23887680, "step": 11080 }, { "epoch": 2.0343182235272526, "grad_norm": 1.476208209991455, "learning_rate": 9.99991126688416e-06, "loss": 0.2893, "num_input_tokens_seen": 23898752, "step": 11085 }, { "epoch": 2.035235823086805, "grad_norm": 1.0908063650131226, "learning_rate": 9.999906432180676e-06, "loss": 0.274, "num_input_tokens_seen": 23909440, "step": 11090 }, { "epoch": 2.036153422646357, "grad_norm": 1.1631793975830078, "learning_rate": 9.999901469237344e-06, "loss": 0.4355, "num_input_tokens_seen": 23920544, "step": 11095 }, { "epoch": 2.037071022205909, "grad_norm": 9.474855422973633, "learning_rate": 9.999896378054285e-06, "loss": 0.4663, "num_input_tokens_seen": 23932224, "step": 11100 }, { "epoch": 2.0379886217654617, "grad_norm": 9.696959495544434, "learning_rate": 9.999891158631637e-06, "loss": 0.5808, "num_input_tokens_seen": 23942752, "step": 11105 }, { "epoch": 2.0389062213250138, "grad_norm": 1.0386817455291748, "learning_rate": 9.999885810969528e-06, "loss": 0.3678, "num_input_tokens_seen": 23954880, "step": 11110 }, { "epoch": 2.039823820884566, "grad_norm": 1.276006817817688, "learning_rate": 9.999880335068096e-06, "loss": 0.5222, "num_input_tokens_seen": 23965024, "step": 11115 }, { "epoch": 2.0407414204441183, "grad_norm": 0.5272539854049683, "learning_rate": 9.999874730927484e-06, "loss": 0.3119, "num_input_tokens_seen": 23976320, "step": 11120 }, { "epoch": 2.0416590200036704, "grad_norm": 2.2477784156799316, "learning_rate": 9.999868998547834e-06, "loss": 0.3147, "num_input_tokens_seen": 23988256, "step": 11125 }, { "epoch": 2.0425766195632225, "grad_norm": 0.7202411890029907, "learning_rate": 9.999863137929293e-06, "loss": 0.2773, "num_input_tokens_seen": 23997472, "step": 11130 }, { "epoch": 2.043494219122775, "grad_norm": 2.8767995834350586, "learning_rate": 9.999857149072011e-06, "loss": 0.3243, "num_input_tokens_seen": 24007712, "step": 11135 }, { "epoch": 2.044411818682327, "grad_norm": 3.6067049503326416, "learning_rate": 9.999851031976142e-06, "loss": 0.3028, "num_input_tokens_seen": 24018656, "step": 11140 }, { "epoch": 2.045329418241879, "grad_norm": 5.0785064697265625, "learning_rate": 9.999844786641845e-06, "loss": 0.3054, "num_input_tokens_seen": 24029632, "step": 11145 }, { "epoch": 2.0462470178014316, "grad_norm": 3.507521152496338, "learning_rate": 9.999838413069279e-06, "loss": 0.2936, "num_input_tokens_seen": 24041024, "step": 11150 }, { "epoch": 2.0471646173609837, "grad_norm": 0.8792659640312195, "learning_rate": 9.999831911258604e-06, "loss": 0.4145, "num_input_tokens_seen": 24052288, "step": 11155 }, { "epoch": 2.0480822169205357, "grad_norm": 3.4200336933135986, "learning_rate": 9.999825281209989e-06, "loss": 0.2909, "num_input_tokens_seen": 24063648, "step": 11160 }, { "epoch": 2.048999816480088, "grad_norm": 2.5749521255493164, "learning_rate": 9.999818522923608e-06, "loss": 0.2869, "num_input_tokens_seen": 24074624, "step": 11165 }, { "epoch": 2.0499174160396403, "grad_norm": 0.5052868127822876, "learning_rate": 9.999811636399628e-06, "loss": 0.4613, "num_input_tokens_seen": 24085120, "step": 11170 }, { "epoch": 2.0508350155991923, "grad_norm": 0.885348379611969, "learning_rate": 9.99980462163823e-06, "loss": 0.4659, "num_input_tokens_seen": 24095584, "step": 11175 }, { "epoch": 2.051752615158745, "grad_norm": 0.7760540246963501, "learning_rate": 9.999797478639593e-06, "loss": 0.3469, "num_input_tokens_seen": 24106144, "step": 11180 }, { "epoch": 2.052670214718297, "grad_norm": 2.751737356185913, "learning_rate": 9.999790207403898e-06, "loss": 0.3797, "num_input_tokens_seen": 24116704, "step": 11185 }, { "epoch": 2.053587814277849, "grad_norm": 2.4688949584960938, "learning_rate": 9.999782807931333e-06, "loss": 0.3019, "num_input_tokens_seen": 24127072, "step": 11190 }, { "epoch": 2.0545054138374015, "grad_norm": 4.625059604644775, "learning_rate": 9.999775280222089e-06, "loss": 0.3459, "num_input_tokens_seen": 24138272, "step": 11195 }, { "epoch": 2.0554230133969535, "grad_norm": 0.4812219440937042, "learning_rate": 9.999767624276357e-06, "loss": 0.4178, "num_input_tokens_seen": 24148608, "step": 11200 }, { "epoch": 2.0563406129565056, "grad_norm": 0.9280379414558411, "learning_rate": 9.999759840094336e-06, "loss": 0.3305, "num_input_tokens_seen": 24159200, "step": 11205 }, { "epoch": 2.057258212516058, "grad_norm": 3.2564852237701416, "learning_rate": 9.999751927676223e-06, "loss": 0.3032, "num_input_tokens_seen": 24170272, "step": 11210 }, { "epoch": 2.05817581207561, "grad_norm": 2.0151612758636475, "learning_rate": 9.999743887022223e-06, "loss": 0.3393, "num_input_tokens_seen": 24180576, "step": 11215 }, { "epoch": 2.0590934116351622, "grad_norm": 1.6439460515975952, "learning_rate": 9.99973571813254e-06, "loss": 0.3315, "num_input_tokens_seen": 24190496, "step": 11220 }, { "epoch": 2.0600110111947147, "grad_norm": 0.8817186951637268, "learning_rate": 9.999727421007387e-06, "loss": 0.2927, "num_input_tokens_seen": 24202016, "step": 11225 }, { "epoch": 2.060928610754267, "grad_norm": 0.6963930130004883, "learning_rate": 9.99971899564697e-06, "loss": 0.3193, "num_input_tokens_seen": 24213536, "step": 11230 }, { "epoch": 2.061846210313819, "grad_norm": 0.5447385907173157, "learning_rate": 9.999710442051514e-06, "loss": 0.3286, "num_input_tokens_seen": 24225248, "step": 11235 }, { "epoch": 2.0627638098733714, "grad_norm": 3.444864511489868, "learning_rate": 9.999701760221231e-06, "loss": 0.3035, "num_input_tokens_seen": 24233792, "step": 11240 }, { "epoch": 2.0636814094329234, "grad_norm": 0.8488918542861938, "learning_rate": 9.999692950156347e-06, "loss": 0.3069, "num_input_tokens_seen": 24245536, "step": 11245 }, { "epoch": 2.0645990089924755, "grad_norm": 3.8419103622436523, "learning_rate": 9.999684011857089e-06, "loss": 0.3089, "num_input_tokens_seen": 24256576, "step": 11250 }, { "epoch": 2.065516608552028, "grad_norm": 1.0864739418029785, "learning_rate": 9.999674945323685e-06, "loss": 0.3019, "num_input_tokens_seen": 24268480, "step": 11255 }, { "epoch": 2.06643420811158, "grad_norm": 1.5039914846420288, "learning_rate": 9.999665750556367e-06, "loss": 0.317, "num_input_tokens_seen": 24280000, "step": 11260 }, { "epoch": 2.067351807671132, "grad_norm": 7.8272624015808105, "learning_rate": 9.99965642755537e-06, "loss": 0.3462, "num_input_tokens_seen": 24291584, "step": 11265 }, { "epoch": 2.0682694072306846, "grad_norm": 1.1218775510787964, "learning_rate": 9.999646976320937e-06, "loss": 0.2642, "num_input_tokens_seen": 24302624, "step": 11270 }, { "epoch": 2.0691870067902367, "grad_norm": 4.2251200675964355, "learning_rate": 9.999637396853306e-06, "loss": 0.427, "num_input_tokens_seen": 24312608, "step": 11275 }, { "epoch": 2.0701046063497888, "grad_norm": 4.7581562995910645, "learning_rate": 9.999627689152725e-06, "loss": 0.2775, "num_input_tokens_seen": 24324096, "step": 11280 }, { "epoch": 2.0710222059093413, "grad_norm": 1.0949413776397705, "learning_rate": 9.999617853219444e-06, "loss": 0.4564, "num_input_tokens_seen": 24336096, "step": 11285 }, { "epoch": 2.0719398054688933, "grad_norm": 2.2981271743774414, "learning_rate": 9.99960788905371e-06, "loss": 0.3644, "num_input_tokens_seen": 24345856, "step": 11290 }, { "epoch": 2.0728574050284454, "grad_norm": 1.712394118309021, "learning_rate": 9.999597796655785e-06, "loss": 0.3286, "num_input_tokens_seen": 24356992, "step": 11295 }, { "epoch": 2.073775004587998, "grad_norm": 2.7769782543182373, "learning_rate": 9.999587576025924e-06, "loss": 0.3363, "num_input_tokens_seen": 24367424, "step": 11300 }, { "epoch": 2.07469260414755, "grad_norm": 2.2530195713043213, "learning_rate": 9.999577227164393e-06, "loss": 0.3015, "num_input_tokens_seen": 24377792, "step": 11305 }, { "epoch": 2.075610203707102, "grad_norm": 1.2298862934112549, "learning_rate": 9.999566750071453e-06, "loss": 0.2369, "num_input_tokens_seen": 24387168, "step": 11310 }, { "epoch": 2.0765278032666545, "grad_norm": 0.9703628420829773, "learning_rate": 9.999556144747373e-06, "loss": 0.2647, "num_input_tokens_seen": 24398368, "step": 11315 }, { "epoch": 2.0774454028262066, "grad_norm": 1.2233338356018066, "learning_rate": 9.999545411192428e-06, "loss": 0.2988, "num_input_tokens_seen": 24408096, "step": 11320 }, { "epoch": 2.0783630023857587, "grad_norm": 6.750339031219482, "learning_rate": 9.99953454940689e-06, "loss": 0.3042, "num_input_tokens_seen": 24417728, "step": 11325 }, { "epoch": 2.079280601945311, "grad_norm": 1.157158613204956, "learning_rate": 9.999523559391042e-06, "loss": 0.3467, "num_input_tokens_seen": 24427360, "step": 11330 }, { "epoch": 2.0801982015048632, "grad_norm": 8.706501960754395, "learning_rate": 9.999512441145163e-06, "loss": 0.3614, "num_input_tokens_seen": 24439104, "step": 11335 }, { "epoch": 2.0811158010644153, "grad_norm": 6.5793256759643555, "learning_rate": 9.999501194669536e-06, "loss": 0.4526, "num_input_tokens_seen": 24450176, "step": 11340 }, { "epoch": 2.082033400623968, "grad_norm": 2.374218225479126, "learning_rate": 9.999489819964454e-06, "loss": 0.3275, "num_input_tokens_seen": 24460352, "step": 11345 }, { "epoch": 2.08295100018352, "grad_norm": 1.0395846366882324, "learning_rate": 9.999478317030207e-06, "loss": 0.286, "num_input_tokens_seen": 24471392, "step": 11350 }, { "epoch": 2.083868599743072, "grad_norm": 5.121419429779053, "learning_rate": 9.999466685867089e-06, "loss": 0.3114, "num_input_tokens_seen": 24481984, "step": 11355 }, { "epoch": 2.0847861993026244, "grad_norm": 1.4170547723770142, "learning_rate": 9.999454926475399e-06, "loss": 0.3602, "num_input_tokens_seen": 24492704, "step": 11360 }, { "epoch": 2.0857037988621765, "grad_norm": 1.2309144735336304, "learning_rate": 9.999443038855438e-06, "loss": 0.3615, "num_input_tokens_seen": 24503616, "step": 11365 }, { "epoch": 2.0866213984217286, "grad_norm": 3.0831329822540283, "learning_rate": 9.999431023007511e-06, "loss": 0.4094, "num_input_tokens_seen": 24515008, "step": 11370 }, { "epoch": 2.087538997981281, "grad_norm": 1.153564691543579, "learning_rate": 9.999418878931927e-06, "loss": 0.3481, "num_input_tokens_seen": 24525824, "step": 11375 }, { "epoch": 2.088456597540833, "grad_norm": 1.945768117904663, "learning_rate": 9.999406606628999e-06, "loss": 0.2764, "num_input_tokens_seen": 24535584, "step": 11380 }, { "epoch": 2.089374197100385, "grad_norm": 1.9591799974441528, "learning_rate": 9.999394206099038e-06, "loss": 0.3117, "num_input_tokens_seen": 24547168, "step": 11385 }, { "epoch": 2.0902917966599377, "grad_norm": 2.352860689163208, "learning_rate": 9.999381677342365e-06, "loss": 0.3238, "num_input_tokens_seen": 24558048, "step": 11390 }, { "epoch": 2.0912093962194898, "grad_norm": 3.3925487995147705, "learning_rate": 9.999369020359299e-06, "loss": 0.344, "num_input_tokens_seen": 24568448, "step": 11395 }, { "epoch": 2.092126995779042, "grad_norm": 1.3055766820907593, "learning_rate": 9.999356235150169e-06, "loss": 0.2677, "num_input_tokens_seen": 24579776, "step": 11400 }, { "epoch": 2.0930445953385943, "grad_norm": 1.3144506216049194, "learning_rate": 9.999343321715296e-06, "loss": 0.2714, "num_input_tokens_seen": 24591328, "step": 11405 }, { "epoch": 2.0939621948981464, "grad_norm": 3.297381639480591, "learning_rate": 9.999330280055018e-06, "loss": 0.3706, "num_input_tokens_seen": 24600864, "step": 11410 }, { "epoch": 2.0948797944576985, "grad_norm": 1.2600656747817993, "learning_rate": 9.999317110169665e-06, "loss": 0.3014, "num_input_tokens_seen": 24611776, "step": 11415 }, { "epoch": 2.095797394017251, "grad_norm": 1.490432620048523, "learning_rate": 9.999303812059576e-06, "loss": 0.3576, "num_input_tokens_seen": 24623840, "step": 11420 }, { "epoch": 2.096714993576803, "grad_norm": 0.9630469679832458, "learning_rate": 9.999290385725093e-06, "loss": 0.2838, "num_input_tokens_seen": 24633664, "step": 11425 }, { "epoch": 2.097632593136355, "grad_norm": 2.516975164413452, "learning_rate": 9.99927683116656e-06, "loss": 0.3554, "num_input_tokens_seen": 24645024, "step": 11430 }, { "epoch": 2.0985501926959076, "grad_norm": 1.561043381690979, "learning_rate": 9.999263148384326e-06, "loss": 0.2894, "num_input_tokens_seen": 24655680, "step": 11435 }, { "epoch": 2.0994677922554597, "grad_norm": 2.24752140045166, "learning_rate": 9.999249337378739e-06, "loss": 0.3286, "num_input_tokens_seen": 24665824, "step": 11440 }, { "epoch": 2.1003853918150117, "grad_norm": 14.319795608520508, "learning_rate": 9.999235398150154e-06, "loss": 0.3303, "num_input_tokens_seen": 24675776, "step": 11445 }, { "epoch": 2.101302991374564, "grad_norm": 3.5734663009643555, "learning_rate": 9.99922133069893e-06, "loss": 0.4566, "num_input_tokens_seen": 24686624, "step": 11450 }, { "epoch": 2.1022205909341163, "grad_norm": 1.5359340906143188, "learning_rate": 9.999207135025425e-06, "loss": 0.3629, "num_input_tokens_seen": 24695776, "step": 11455 }, { "epoch": 2.1031381904936683, "grad_norm": 4.464333534240723, "learning_rate": 9.999192811130008e-06, "loss": 0.2671, "num_input_tokens_seen": 24707488, "step": 11460 }, { "epoch": 2.104055790053221, "grad_norm": 1.3088792562484741, "learning_rate": 9.999178359013042e-06, "loss": 0.3542, "num_input_tokens_seen": 24719168, "step": 11465 }, { "epoch": 2.104973389612773, "grad_norm": 5.564323425292969, "learning_rate": 9.9991637786749e-06, "loss": 0.284, "num_input_tokens_seen": 24730688, "step": 11470 }, { "epoch": 2.105890989172325, "grad_norm": 2.6671335697174072, "learning_rate": 9.999149070115952e-06, "loss": 0.2685, "num_input_tokens_seen": 24741984, "step": 11475 }, { "epoch": 2.1068085887318775, "grad_norm": 5.317220211029053, "learning_rate": 9.999134233336581e-06, "loss": 0.4043, "num_input_tokens_seen": 24752544, "step": 11480 }, { "epoch": 2.1077261882914295, "grad_norm": 1.48103666305542, "learning_rate": 9.999119268337165e-06, "loss": 0.4327, "num_input_tokens_seen": 24762112, "step": 11485 }, { "epoch": 2.1086437878509816, "grad_norm": 0.6246538758277893, "learning_rate": 9.999104175118087e-06, "loss": 0.3209, "num_input_tokens_seen": 24772832, "step": 11490 }, { "epoch": 2.109561387410534, "grad_norm": 0.7854511141777039, "learning_rate": 9.999088953679734e-06, "loss": 0.3202, "num_input_tokens_seen": 24783040, "step": 11495 }, { "epoch": 2.110478986970086, "grad_norm": 1.1702253818511963, "learning_rate": 9.999073604022498e-06, "loss": 0.3124, "num_input_tokens_seen": 24793088, "step": 11500 }, { "epoch": 2.1113965865296382, "grad_norm": 0.8105711340904236, "learning_rate": 9.999058126146773e-06, "loss": 0.3124, "num_input_tokens_seen": 24801600, "step": 11505 }, { "epoch": 2.1123141860891907, "grad_norm": 1.5575507879257202, "learning_rate": 9.999042520052954e-06, "loss": 0.2958, "num_input_tokens_seen": 24811936, "step": 11510 }, { "epoch": 2.113231785648743, "grad_norm": 2.7904841899871826, "learning_rate": 9.999026785741443e-06, "loss": 0.3066, "num_input_tokens_seen": 24822208, "step": 11515 }, { "epoch": 2.114149385208295, "grad_norm": 2.5953829288482666, "learning_rate": 9.999010923212642e-06, "loss": 0.2841, "num_input_tokens_seen": 24832800, "step": 11520 }, { "epoch": 2.1150669847678474, "grad_norm": 6.322160720825195, "learning_rate": 9.998994932466958e-06, "loss": 0.4507, "num_input_tokens_seen": 24843168, "step": 11525 }, { "epoch": 2.1159845843273994, "grad_norm": 2.1592154502868652, "learning_rate": 9.998978813504803e-06, "loss": 0.2977, "num_input_tokens_seen": 24853248, "step": 11530 }, { "epoch": 2.116902183886952, "grad_norm": 0.6397737264633179, "learning_rate": 9.998962566326587e-06, "loss": 0.3045, "num_input_tokens_seen": 24865248, "step": 11535 }, { "epoch": 2.117819783446504, "grad_norm": 5.6011834144592285, "learning_rate": 9.99894619093273e-06, "loss": 0.3876, "num_input_tokens_seen": 24875872, "step": 11540 }, { "epoch": 2.118737383006056, "grad_norm": 4.281527996063232, "learning_rate": 9.998929687323651e-06, "loss": 0.3658, "num_input_tokens_seen": 24885472, "step": 11545 }, { "epoch": 2.1196549825656086, "grad_norm": 2.8942418098449707, "learning_rate": 9.998913055499775e-06, "loss": 0.3802, "num_input_tokens_seen": 24895968, "step": 11550 }, { "epoch": 2.1205725821251606, "grad_norm": 2.638768196105957, "learning_rate": 9.998896295461524e-06, "loss": 0.2897, "num_input_tokens_seen": 24906688, "step": 11555 }, { "epoch": 2.1214901816847127, "grad_norm": 1.6521687507629395, "learning_rate": 9.998879407209332e-06, "loss": 0.3234, "num_input_tokens_seen": 24916096, "step": 11560 }, { "epoch": 2.122407781244265, "grad_norm": 2.7563138008117676, "learning_rate": 9.998862390743632e-06, "loss": 0.3536, "num_input_tokens_seen": 24925824, "step": 11565 }, { "epoch": 2.1233253808038173, "grad_norm": 3.7020628452301025, "learning_rate": 9.998845246064856e-06, "loss": 0.3138, "num_input_tokens_seen": 24935168, "step": 11570 }, { "epoch": 2.1242429803633693, "grad_norm": 1.1906105279922485, "learning_rate": 9.998827973173448e-06, "loss": 0.285, "num_input_tokens_seen": 24947776, "step": 11575 }, { "epoch": 2.125160579922922, "grad_norm": 6.430753707885742, "learning_rate": 9.998810572069851e-06, "loss": 0.3059, "num_input_tokens_seen": 24959456, "step": 11580 }, { "epoch": 2.126078179482474, "grad_norm": 1.983910083770752, "learning_rate": 9.99879304275451e-06, "loss": 0.2553, "num_input_tokens_seen": 24970816, "step": 11585 }, { "epoch": 2.126995779042026, "grad_norm": 1.4765743017196655, "learning_rate": 9.998775385227875e-06, "loss": 0.3155, "num_input_tokens_seen": 24982016, "step": 11590 }, { "epoch": 2.1279133786015785, "grad_norm": 1.8049297332763672, "learning_rate": 9.998757599490398e-06, "loss": 0.4098, "num_input_tokens_seen": 24991872, "step": 11595 }, { "epoch": 2.1288309781611305, "grad_norm": 0.6923867464065552, "learning_rate": 9.998739685542536e-06, "loss": 0.432, "num_input_tokens_seen": 25002400, "step": 11600 }, { "epoch": 2.1297485777206826, "grad_norm": 0.8677658438682556, "learning_rate": 9.998721643384748e-06, "loss": 0.3785, "num_input_tokens_seen": 25013120, "step": 11605 }, { "epoch": 2.130666177280235, "grad_norm": 1.7265762090682983, "learning_rate": 9.998703473017499e-06, "loss": 0.3312, "num_input_tokens_seen": 25023648, "step": 11610 }, { "epoch": 2.131583776839787, "grad_norm": 0.8343827724456787, "learning_rate": 9.998685174441252e-06, "loss": 0.4036, "num_input_tokens_seen": 25033760, "step": 11615 }, { "epoch": 2.1325013763993392, "grad_norm": 2.431931734085083, "learning_rate": 9.998666747656479e-06, "loss": 0.3338, "num_input_tokens_seen": 25044928, "step": 11620 }, { "epoch": 2.1334189759588917, "grad_norm": 3.0548017024993896, "learning_rate": 9.998648192663648e-06, "loss": 0.3059, "num_input_tokens_seen": 25055808, "step": 11625 }, { "epoch": 2.134336575518444, "grad_norm": 3.837247610092163, "learning_rate": 9.99862950946324e-06, "loss": 0.3589, "num_input_tokens_seen": 25067040, "step": 11630 }, { "epoch": 2.135254175077996, "grad_norm": 1.5139323472976685, "learning_rate": 9.998610698055732e-06, "loss": 0.3026, "num_input_tokens_seen": 25077600, "step": 11635 }, { "epoch": 2.1361717746375484, "grad_norm": 1.3041845560073853, "learning_rate": 9.998591758441608e-06, "loss": 0.2962, "num_input_tokens_seen": 25088640, "step": 11640 }, { "epoch": 2.1370893741971004, "grad_norm": 1.1529090404510498, "learning_rate": 9.99857269062135e-06, "loss": 0.3162, "num_input_tokens_seen": 25099648, "step": 11645 }, { "epoch": 2.1380069737566525, "grad_norm": 7.608303546905518, "learning_rate": 9.998553494595453e-06, "loss": 0.3481, "num_input_tokens_seen": 25111808, "step": 11650 }, { "epoch": 2.138924573316205, "grad_norm": 5.081164360046387, "learning_rate": 9.998534170364403e-06, "loss": 0.4351, "num_input_tokens_seen": 25123808, "step": 11655 }, { "epoch": 2.139842172875757, "grad_norm": 1.0334553718566895, "learning_rate": 9.9985147179287e-06, "loss": 0.3755, "num_input_tokens_seen": 25134368, "step": 11660 }, { "epoch": 2.140759772435309, "grad_norm": 2.0841495990753174, "learning_rate": 9.99849513728884e-06, "loss": 0.342, "num_input_tokens_seen": 25144736, "step": 11665 }, { "epoch": 2.1416773719948616, "grad_norm": 3.271641731262207, "learning_rate": 9.998475428445329e-06, "loss": 0.3792, "num_input_tokens_seen": 25155456, "step": 11670 }, { "epoch": 2.1425949715544137, "grad_norm": 1.2343361377716064, "learning_rate": 9.998455591398668e-06, "loss": 0.3322, "num_input_tokens_seen": 25165920, "step": 11675 }, { "epoch": 2.1435125711139658, "grad_norm": 0.6917622089385986, "learning_rate": 9.99843562614937e-06, "loss": 0.3209, "num_input_tokens_seen": 25177344, "step": 11680 }, { "epoch": 2.1444301706735183, "grad_norm": 2.3564388751983643, "learning_rate": 9.998415532697943e-06, "loss": 0.309, "num_input_tokens_seen": 25189024, "step": 11685 }, { "epoch": 2.1453477702330703, "grad_norm": 2.7638843059539795, "learning_rate": 9.998395311044907e-06, "loss": 0.3231, "num_input_tokens_seen": 25198688, "step": 11690 }, { "epoch": 2.1462653697926224, "grad_norm": 2.5170044898986816, "learning_rate": 9.998374961190776e-06, "loss": 0.3284, "num_input_tokens_seen": 25209024, "step": 11695 }, { "epoch": 2.147182969352175, "grad_norm": 3.572395086288452, "learning_rate": 9.998354483136073e-06, "loss": 0.2943, "num_input_tokens_seen": 25219520, "step": 11700 }, { "epoch": 2.148100568911727, "grad_norm": 1.4069199562072754, "learning_rate": 9.998333876881325e-06, "loss": 0.3804, "num_input_tokens_seen": 25229856, "step": 11705 }, { "epoch": 2.149018168471279, "grad_norm": 3.143646717071533, "learning_rate": 9.998313142427061e-06, "loss": 0.4345, "num_input_tokens_seen": 25241184, "step": 11710 }, { "epoch": 2.1499357680308315, "grad_norm": 3.3050918579101562, "learning_rate": 9.998292279773812e-06, "loss": 0.2864, "num_input_tokens_seen": 25251872, "step": 11715 }, { "epoch": 2.1508533675903836, "grad_norm": 2.1657047271728516, "learning_rate": 9.998271288922111e-06, "loss": 0.3289, "num_input_tokens_seen": 25261088, "step": 11720 }, { "epoch": 2.1517709671499357, "grad_norm": 2.377822160720825, "learning_rate": 9.998250169872499e-06, "loss": 0.379, "num_input_tokens_seen": 25272224, "step": 11725 }, { "epoch": 2.152688566709488, "grad_norm": 2.547837972640991, "learning_rate": 9.998228922625517e-06, "loss": 0.4455, "num_input_tokens_seen": 25281984, "step": 11730 }, { "epoch": 2.1536061662690402, "grad_norm": 1.8015413284301758, "learning_rate": 9.998207547181708e-06, "loss": 0.3571, "num_input_tokens_seen": 25293920, "step": 11735 }, { "epoch": 2.1545237658285923, "grad_norm": 1.206863284111023, "learning_rate": 9.998186043541624e-06, "loss": 0.3465, "num_input_tokens_seen": 25305312, "step": 11740 }, { "epoch": 2.155441365388145, "grad_norm": 1.5201219320297241, "learning_rate": 9.998164411705812e-06, "loss": 0.3547, "num_input_tokens_seen": 25316352, "step": 11745 }, { "epoch": 2.156358964947697, "grad_norm": 1.2937666177749634, "learning_rate": 9.998142651674832e-06, "loss": 0.3035, "num_input_tokens_seen": 25327584, "step": 11750 }, { "epoch": 2.157276564507249, "grad_norm": 0.9267260432243347, "learning_rate": 9.998120763449238e-06, "loss": 0.3258, "num_input_tokens_seen": 25338720, "step": 11755 }, { "epoch": 2.1581941640668014, "grad_norm": 1.230002760887146, "learning_rate": 9.998098747029594e-06, "loss": 0.3246, "num_input_tokens_seen": 25348224, "step": 11760 }, { "epoch": 2.1591117636263535, "grad_norm": 3.051154851913452, "learning_rate": 9.998076602416462e-06, "loss": 0.3281, "num_input_tokens_seen": 25360544, "step": 11765 }, { "epoch": 2.1600293631859055, "grad_norm": 0.9217913150787354, "learning_rate": 9.99805432961041e-06, "loss": 0.2926, "num_input_tokens_seen": 25370816, "step": 11770 }, { "epoch": 2.160946962745458, "grad_norm": 1.9842290878295898, "learning_rate": 9.998031928612015e-06, "loss": 0.3884, "num_input_tokens_seen": 25381152, "step": 11775 }, { "epoch": 2.16186456230501, "grad_norm": 0.9835959672927856, "learning_rate": 9.998009399421845e-06, "loss": 0.2801, "num_input_tokens_seen": 25392384, "step": 11780 }, { "epoch": 2.162782161864562, "grad_norm": 1.3343554735183716, "learning_rate": 9.997986742040479e-06, "loss": 0.3539, "num_input_tokens_seen": 25402496, "step": 11785 }, { "epoch": 2.1636997614241147, "grad_norm": 0.9324467182159424, "learning_rate": 9.997963956468501e-06, "loss": 0.3002, "num_input_tokens_seen": 25413184, "step": 11790 }, { "epoch": 2.1646173609836667, "grad_norm": 0.6280568242073059, "learning_rate": 9.997941042706493e-06, "loss": 0.2373, "num_input_tokens_seen": 25423296, "step": 11795 }, { "epoch": 2.165534960543219, "grad_norm": 1.476279377937317, "learning_rate": 9.997918000755044e-06, "loss": 0.2426, "num_input_tokens_seen": 25434304, "step": 11800 }, { "epoch": 2.1664525601027713, "grad_norm": 0.897819995880127, "learning_rate": 9.997894830614743e-06, "loss": 0.3768, "num_input_tokens_seen": 25444992, "step": 11805 }, { "epoch": 2.1673701596623234, "grad_norm": 5.466442108154297, "learning_rate": 9.997871532286187e-06, "loss": 0.3327, "num_input_tokens_seen": 25455808, "step": 11810 }, { "epoch": 2.1682877592218754, "grad_norm": 11.832416534423828, "learning_rate": 9.997848105769972e-06, "loss": 0.358, "num_input_tokens_seen": 25467168, "step": 11815 }, { "epoch": 2.169205358781428, "grad_norm": 4.083078384399414, "learning_rate": 9.9978245510667e-06, "loss": 0.3622, "num_input_tokens_seen": 25478208, "step": 11820 }, { "epoch": 2.17012295834098, "grad_norm": 1.5506954193115234, "learning_rate": 9.997800868176973e-06, "loss": 0.3801, "num_input_tokens_seen": 25488032, "step": 11825 }, { "epoch": 2.171040557900532, "grad_norm": 1.011228084564209, "learning_rate": 9.9977770571014e-06, "loss": 0.304, "num_input_tokens_seen": 25498464, "step": 11830 }, { "epoch": 2.1719581574600846, "grad_norm": 0.7500734925270081, "learning_rate": 9.99775311784059e-06, "loss": 0.3407, "num_input_tokens_seen": 25509760, "step": 11835 }, { "epoch": 2.1728757570196366, "grad_norm": 0.6608017683029175, "learning_rate": 9.997729050395157e-06, "loss": 0.3184, "num_input_tokens_seen": 25519904, "step": 11840 }, { "epoch": 2.1737933565791887, "grad_norm": 2.2105913162231445, "learning_rate": 9.997704854765723e-06, "loss": 0.401, "num_input_tokens_seen": 25530464, "step": 11845 }, { "epoch": 2.174710956138741, "grad_norm": 0.8501237630844116, "learning_rate": 9.997680530952904e-06, "loss": 0.4007, "num_input_tokens_seen": 25542080, "step": 11850 }, { "epoch": 2.1756285556982933, "grad_norm": 2.6931207180023193, "learning_rate": 9.997656078957325e-06, "loss": 0.2538, "num_input_tokens_seen": 25552320, "step": 11855 }, { "epoch": 2.1765461552578453, "grad_norm": 5.139497756958008, "learning_rate": 9.997631498779614e-06, "loss": 0.548, "num_input_tokens_seen": 25562784, "step": 11860 }, { "epoch": 2.177463754817398, "grad_norm": 6.218994617462158, "learning_rate": 9.9976067904204e-06, "loss": 0.3157, "num_input_tokens_seen": 25573632, "step": 11865 }, { "epoch": 2.17838135437695, "grad_norm": 0.6847988367080688, "learning_rate": 9.997581953880316e-06, "loss": 0.2904, "num_input_tokens_seen": 25585184, "step": 11870 }, { "epoch": 2.179298953936502, "grad_norm": 6.8758368492126465, "learning_rate": 9.997556989160002e-06, "loss": 0.3834, "num_input_tokens_seen": 25595904, "step": 11875 }, { "epoch": 2.1802165534960545, "grad_norm": 3.2617499828338623, "learning_rate": 9.997531896260097e-06, "loss": 0.3668, "num_input_tokens_seen": 25605856, "step": 11880 }, { "epoch": 2.1811341530556065, "grad_norm": 3.9841792583465576, "learning_rate": 9.997506675181243e-06, "loss": 0.3448, "num_input_tokens_seen": 25618240, "step": 11885 }, { "epoch": 2.1820517526151586, "grad_norm": 2.6325454711914062, "learning_rate": 9.99748132592409e-06, "loss": 0.2826, "num_input_tokens_seen": 25629280, "step": 11890 }, { "epoch": 2.182969352174711, "grad_norm": 1.4738327264785767, "learning_rate": 9.997455848489286e-06, "loss": 0.267, "num_input_tokens_seen": 25640000, "step": 11895 }, { "epoch": 2.183886951734263, "grad_norm": 2.337184190750122, "learning_rate": 9.997430242877484e-06, "loss": 0.2668, "num_input_tokens_seen": 25651424, "step": 11900 }, { "epoch": 2.1848045512938152, "grad_norm": 3.0497732162475586, "learning_rate": 9.997404509089342e-06, "loss": 0.3658, "num_input_tokens_seen": 25663136, "step": 11905 }, { "epoch": 2.1857221508533677, "grad_norm": 2.0034706592559814, "learning_rate": 9.99737864712552e-06, "loss": 0.2309, "num_input_tokens_seen": 25673632, "step": 11910 }, { "epoch": 2.18663975041292, "grad_norm": 5.182445049285889, "learning_rate": 9.997352656986681e-06, "loss": 0.3556, "num_input_tokens_seen": 25684832, "step": 11915 }, { "epoch": 2.187557349972472, "grad_norm": 5.206379413604736, "learning_rate": 9.99732653867349e-06, "loss": 0.368, "num_input_tokens_seen": 25695264, "step": 11920 }, { "epoch": 2.1884749495320244, "grad_norm": 3.9892632961273193, "learning_rate": 9.99730029218662e-06, "loss": 0.3642, "num_input_tokens_seen": 25705344, "step": 11925 }, { "epoch": 2.1893925490915764, "grad_norm": 1.84531569480896, "learning_rate": 9.997273917526742e-06, "loss": 0.2812, "num_input_tokens_seen": 25716608, "step": 11930 }, { "epoch": 2.1903101486511285, "grad_norm": 1.3937495946884155, "learning_rate": 9.997247414694532e-06, "loss": 0.2644, "num_input_tokens_seen": 25727392, "step": 11935 }, { "epoch": 2.191227748210681, "grad_norm": 1.032992959022522, "learning_rate": 9.997220783690673e-06, "loss": 0.2912, "num_input_tokens_seen": 25737440, "step": 11940 }, { "epoch": 2.192145347770233, "grad_norm": 1.3361889123916626, "learning_rate": 9.997194024515846e-06, "loss": 0.2519, "num_input_tokens_seen": 25748384, "step": 11945 }, { "epoch": 2.193062947329785, "grad_norm": 2.0045371055603027, "learning_rate": 9.997167137170736e-06, "loss": 0.3991, "num_input_tokens_seen": 25758816, "step": 11950 }, { "epoch": 2.1939805468893376, "grad_norm": 5.521737098693848, "learning_rate": 9.997140121656033e-06, "loss": 0.2628, "num_input_tokens_seen": 25771104, "step": 11955 }, { "epoch": 2.1948981464488897, "grad_norm": 4.223482131958008, "learning_rate": 9.997112977972432e-06, "loss": 0.3097, "num_input_tokens_seen": 25781568, "step": 11960 }, { "epoch": 2.1958157460084418, "grad_norm": 6.224055290222168, "learning_rate": 9.997085706120628e-06, "loss": 0.4593, "num_input_tokens_seen": 25793024, "step": 11965 }, { "epoch": 2.1967333455679943, "grad_norm": 6.353507995605469, "learning_rate": 9.99705830610132e-06, "loss": 0.3852, "num_input_tokens_seen": 25804480, "step": 11970 }, { "epoch": 2.1976509451275463, "grad_norm": 2.9819424152374268, "learning_rate": 9.997030777915211e-06, "loss": 0.2708, "num_input_tokens_seen": 25815936, "step": 11975 }, { "epoch": 2.1985685446870984, "grad_norm": 2.810541868209839, "learning_rate": 9.997003121563007e-06, "loss": 0.3101, "num_input_tokens_seen": 25826912, "step": 11980 }, { "epoch": 2.199486144246651, "grad_norm": 5.276630401611328, "learning_rate": 9.996975337045419e-06, "loss": 0.4699, "num_input_tokens_seen": 25838208, "step": 11985 }, { "epoch": 2.200403743806203, "grad_norm": 2.5498409271240234, "learning_rate": 9.996947424363157e-06, "loss": 0.3134, "num_input_tokens_seen": 25849632, "step": 11990 }, { "epoch": 2.201321343365755, "grad_norm": 3.5057458877563477, "learning_rate": 9.996919383516938e-06, "loss": 0.2807, "num_input_tokens_seen": 25860480, "step": 11995 }, { "epoch": 2.2022389429253075, "grad_norm": 2.8138225078582764, "learning_rate": 9.996891214507483e-06, "loss": 0.3643, "num_input_tokens_seen": 25869920, "step": 12000 }, { "epoch": 2.2031565424848596, "grad_norm": 2.7903389930725098, "learning_rate": 9.99686291733551e-06, "loss": 0.3147, "num_input_tokens_seen": 25880192, "step": 12005 }, { "epoch": 2.2040741420444117, "grad_norm": 2.4265055656433105, "learning_rate": 9.99683449200175e-06, "loss": 0.3501, "num_input_tokens_seen": 25890592, "step": 12010 }, { "epoch": 2.204991741603964, "grad_norm": 2.202554941177368, "learning_rate": 9.996805938506928e-06, "loss": 0.3672, "num_input_tokens_seen": 25899232, "step": 12015 }, { "epoch": 2.2059093411635162, "grad_norm": 1.4544367790222168, "learning_rate": 9.99677725685178e-06, "loss": 0.3383, "num_input_tokens_seen": 25909760, "step": 12020 }, { "epoch": 2.2068269407230683, "grad_norm": 0.646861732006073, "learning_rate": 9.996748447037039e-06, "loss": 0.3006, "num_input_tokens_seen": 25921088, "step": 12025 }, { "epoch": 2.207744540282621, "grad_norm": 2.938877582550049, "learning_rate": 9.996719509063444e-06, "loss": 0.3886, "num_input_tokens_seen": 25931488, "step": 12030 }, { "epoch": 2.208662139842173, "grad_norm": 0.8520025014877319, "learning_rate": 9.996690442931737e-06, "loss": 0.3446, "num_input_tokens_seen": 25943552, "step": 12035 }, { "epoch": 2.209579739401725, "grad_norm": 2.4888453483581543, "learning_rate": 9.996661248642665e-06, "loss": 0.333, "num_input_tokens_seen": 25954368, "step": 12040 }, { "epoch": 2.2104973389612774, "grad_norm": 4.31345796585083, "learning_rate": 9.996631926196977e-06, "loss": 0.3883, "num_input_tokens_seen": 25965312, "step": 12045 }, { "epoch": 2.2114149385208295, "grad_norm": 1.8434906005859375, "learning_rate": 9.996602475595424e-06, "loss": 0.3483, "num_input_tokens_seen": 25976224, "step": 12050 }, { "epoch": 2.2123325380803815, "grad_norm": 1.5051661729812622, "learning_rate": 9.996572896838761e-06, "loss": 0.2648, "num_input_tokens_seen": 25987808, "step": 12055 }, { "epoch": 2.213250137639934, "grad_norm": 1.441200852394104, "learning_rate": 9.996543189927747e-06, "loss": 0.3252, "num_input_tokens_seen": 25997376, "step": 12060 }, { "epoch": 2.214167737199486, "grad_norm": 2.463624954223633, "learning_rate": 9.996513354863144e-06, "loss": 0.362, "num_input_tokens_seen": 26008288, "step": 12065 }, { "epoch": 2.215085336759038, "grad_norm": 1.969184398651123, "learning_rate": 9.996483391645719e-06, "loss": 0.3038, "num_input_tokens_seen": 26018656, "step": 12070 }, { "epoch": 2.2160029363185907, "grad_norm": 1.1839159727096558, "learning_rate": 9.996453300276237e-06, "loss": 0.3046, "num_input_tokens_seen": 26030752, "step": 12075 }, { "epoch": 2.2169205358781428, "grad_norm": 2.0442404747009277, "learning_rate": 9.996423080755472e-06, "loss": 0.2337, "num_input_tokens_seen": 26041056, "step": 12080 }, { "epoch": 2.217838135437695, "grad_norm": 1.7684940099716187, "learning_rate": 9.9963927330842e-06, "loss": 0.3315, "num_input_tokens_seen": 26051968, "step": 12085 }, { "epoch": 2.2187557349972473, "grad_norm": 0.903729259967804, "learning_rate": 9.996362257263195e-06, "loss": 0.3201, "num_input_tokens_seen": 26063104, "step": 12090 }, { "epoch": 2.2196733345567994, "grad_norm": 0.9079043865203857, "learning_rate": 9.996331653293245e-06, "loss": 0.322, "num_input_tokens_seen": 26072768, "step": 12095 }, { "epoch": 2.2205909341163514, "grad_norm": 3.7270567417144775, "learning_rate": 9.99630092117513e-06, "loss": 0.2967, "num_input_tokens_seen": 26084640, "step": 12100 }, { "epoch": 2.221508533675904, "grad_norm": 1.3009058237075806, "learning_rate": 9.99627006090964e-06, "loss": 0.3376, "num_input_tokens_seen": 26095008, "step": 12105 }, { "epoch": 2.222426133235456, "grad_norm": 2.119901418685913, "learning_rate": 9.996239072497568e-06, "loss": 0.378, "num_input_tokens_seen": 26106400, "step": 12110 }, { "epoch": 2.223343732795008, "grad_norm": 2.27203631401062, "learning_rate": 9.996207955939705e-06, "loss": 0.3764, "num_input_tokens_seen": 26117568, "step": 12115 }, { "epoch": 2.2242613323545606, "grad_norm": 0.8122716546058655, "learning_rate": 9.996176711236854e-06, "loss": 0.2975, "num_input_tokens_seen": 26128544, "step": 12120 }, { "epoch": 2.2251789319141126, "grad_norm": 0.6760318279266357, "learning_rate": 9.99614533838981e-06, "loss": 0.3233, "num_input_tokens_seen": 26139328, "step": 12125 }, { "epoch": 2.2260965314736647, "grad_norm": 0.7050527930259705, "learning_rate": 9.996113837399385e-06, "loss": 0.3471, "num_input_tokens_seen": 26149728, "step": 12130 }, { "epoch": 2.227014131033217, "grad_norm": 0.9867526888847351, "learning_rate": 9.996082208266382e-06, "loss": 0.2877, "num_input_tokens_seen": 26159872, "step": 12135 }, { "epoch": 2.2279317305927693, "grad_norm": 0.7462685108184814, "learning_rate": 9.996050450991614e-06, "loss": 0.3618, "num_input_tokens_seen": 26170048, "step": 12140 }, { "epoch": 2.2288493301523213, "grad_norm": 1.62642502784729, "learning_rate": 9.996018565575894e-06, "loss": 0.2663, "num_input_tokens_seen": 26181856, "step": 12145 }, { "epoch": 2.229766929711874, "grad_norm": 3.6166722774505615, "learning_rate": 9.995986552020043e-06, "loss": 0.3677, "num_input_tokens_seen": 26191488, "step": 12150 }, { "epoch": 2.230684529271426, "grad_norm": 3.3608953952789307, "learning_rate": 9.995954410324877e-06, "loss": 0.2704, "num_input_tokens_seen": 26202240, "step": 12155 }, { "epoch": 2.231602128830978, "grad_norm": 0.9290370941162109, "learning_rate": 9.995922140491225e-06, "loss": 0.3782, "num_input_tokens_seen": 26213760, "step": 12160 }, { "epoch": 2.2325197283905305, "grad_norm": 2.87794828414917, "learning_rate": 9.995889742519914e-06, "loss": 0.3001, "num_input_tokens_seen": 26223488, "step": 12165 }, { "epoch": 2.2334373279500825, "grad_norm": 0.9665380120277405, "learning_rate": 9.995857216411772e-06, "loss": 0.2765, "num_input_tokens_seen": 26234208, "step": 12170 }, { "epoch": 2.2343549275096346, "grad_norm": 1.4260045289993286, "learning_rate": 9.995824562167638e-06, "loss": 0.322, "num_input_tokens_seen": 26245248, "step": 12175 }, { "epoch": 2.235272527069187, "grad_norm": 2.0377137660980225, "learning_rate": 9.995791779788344e-06, "loss": 0.3436, "num_input_tokens_seen": 26256672, "step": 12180 }, { "epoch": 2.236190126628739, "grad_norm": 1.4043874740600586, "learning_rate": 9.995758869274735e-06, "loss": 0.2776, "num_input_tokens_seen": 26266912, "step": 12185 }, { "epoch": 2.2371077261882912, "grad_norm": 2.1815342903137207, "learning_rate": 9.995725830627654e-06, "loss": 0.2597, "num_input_tokens_seen": 26278176, "step": 12190 }, { "epoch": 2.2380253257478437, "grad_norm": 2.319538116455078, "learning_rate": 9.995692663847949e-06, "loss": 0.3035, "num_input_tokens_seen": 26289824, "step": 12195 }, { "epoch": 2.238942925307396, "grad_norm": 2.2465572357177734, "learning_rate": 9.995659368936468e-06, "loss": 0.264, "num_input_tokens_seen": 26301120, "step": 12200 }, { "epoch": 2.239860524866948, "grad_norm": 1.9488191604614258, "learning_rate": 9.995625945894067e-06, "loss": 0.3467, "num_input_tokens_seen": 26310976, "step": 12205 }, { "epoch": 2.2407781244265004, "grad_norm": 3.241971969604492, "learning_rate": 9.995592394721603e-06, "loss": 0.4697, "num_input_tokens_seen": 26322016, "step": 12210 }, { "epoch": 2.2416957239860524, "grad_norm": 2.672654151916504, "learning_rate": 9.995558715419938e-06, "loss": 0.4417, "num_input_tokens_seen": 26332800, "step": 12215 }, { "epoch": 2.2426133235456045, "grad_norm": 5.227190971374512, "learning_rate": 9.995524907989933e-06, "loss": 0.4022, "num_input_tokens_seen": 26342624, "step": 12220 }, { "epoch": 2.243530923105157, "grad_norm": 3.512516736984253, "learning_rate": 9.995490972432455e-06, "loss": 0.3457, "num_input_tokens_seen": 26353760, "step": 12225 }, { "epoch": 2.244448522664709, "grad_norm": 2.1666078567504883, "learning_rate": 9.995456908748378e-06, "loss": 0.3855, "num_input_tokens_seen": 26364448, "step": 12230 }, { "epoch": 2.245366122224261, "grad_norm": 2.3037941455841064, "learning_rate": 9.995422716938573e-06, "loss": 0.3295, "num_input_tokens_seen": 26375936, "step": 12235 }, { "epoch": 2.2462837217838136, "grad_norm": 1.977475643157959, "learning_rate": 9.995388397003919e-06, "loss": 0.3463, "num_input_tokens_seen": 26387936, "step": 12240 }, { "epoch": 2.2472013213433657, "grad_norm": 2.990093231201172, "learning_rate": 9.995353948945292e-06, "loss": 0.3053, "num_input_tokens_seen": 26398912, "step": 12245 }, { "epoch": 2.2481189209029178, "grad_norm": 3.4951088428497314, "learning_rate": 9.995319372763578e-06, "loss": 0.3347, "num_input_tokens_seen": 26409664, "step": 12250 }, { "epoch": 2.2490365204624703, "grad_norm": 2.5070276260375977, "learning_rate": 9.995284668459668e-06, "loss": 0.3418, "num_input_tokens_seen": 26418784, "step": 12255 }, { "epoch": 2.2499541200220223, "grad_norm": 1.744564175605774, "learning_rate": 9.995249836034446e-06, "loss": 0.3502, "num_input_tokens_seen": 26429664, "step": 12260 }, { "epoch": 2.250871719581575, "grad_norm": 2.8623547554016113, "learning_rate": 9.995214875488806e-06, "loss": 0.3426, "num_input_tokens_seen": 26441056, "step": 12265 }, { "epoch": 2.251789319141127, "grad_norm": 1.5970714092254639, "learning_rate": 9.99517978682365e-06, "loss": 0.3171, "num_input_tokens_seen": 26450720, "step": 12270 }, { "epoch": 2.252706918700679, "grad_norm": 1.8564953804016113, "learning_rate": 9.99514457003987e-06, "loss": 0.2922, "num_input_tokens_seen": 26462272, "step": 12275 }, { "epoch": 2.2536245182602315, "grad_norm": 1.4372390508651733, "learning_rate": 9.995109225138377e-06, "loss": 0.3839, "num_input_tokens_seen": 26473600, "step": 12280 }, { "epoch": 2.2545421178197835, "grad_norm": 1.6918951272964478, "learning_rate": 9.995073752120073e-06, "loss": 0.2947, "num_input_tokens_seen": 26482624, "step": 12285 }, { "epoch": 2.2554597173793356, "grad_norm": 1.749263048171997, "learning_rate": 9.995038150985868e-06, "loss": 0.2842, "num_input_tokens_seen": 26492608, "step": 12290 }, { "epoch": 2.256377316938888, "grad_norm": 1.4553240537643433, "learning_rate": 9.995002421736677e-06, "loss": 0.3129, "num_input_tokens_seen": 26502944, "step": 12295 }, { "epoch": 2.25729491649844, "grad_norm": 3.6421761512756348, "learning_rate": 9.994966564373416e-06, "loss": 0.3207, "num_input_tokens_seen": 26512224, "step": 12300 }, { "epoch": 2.2582125160579922, "grad_norm": 1.7213306427001953, "learning_rate": 9.994930578897002e-06, "loss": 0.2478, "num_input_tokens_seen": 26523456, "step": 12305 }, { "epoch": 2.2591301156175447, "grad_norm": 1.9580886363983154, "learning_rate": 9.994894465308363e-06, "loss": 0.4012, "num_input_tokens_seen": 26534144, "step": 12310 }, { "epoch": 2.260047715177097, "grad_norm": 1.50132155418396, "learning_rate": 9.99485822360842e-06, "loss": 0.3428, "num_input_tokens_seen": 26544928, "step": 12315 }, { "epoch": 2.260965314736649, "grad_norm": 1.794653058052063, "learning_rate": 9.994821853798107e-06, "loss": 0.349, "num_input_tokens_seen": 26555840, "step": 12320 }, { "epoch": 2.2618829142962014, "grad_norm": 1.977759599685669, "learning_rate": 9.994785355878352e-06, "loss": 0.2784, "num_input_tokens_seen": 26566880, "step": 12325 }, { "epoch": 2.2628005138557534, "grad_norm": 1.8164786100387573, "learning_rate": 9.994748729850097e-06, "loss": 0.2785, "num_input_tokens_seen": 26578336, "step": 12330 }, { "epoch": 2.2637181134153055, "grad_norm": 2.304814100265503, "learning_rate": 9.994711975714275e-06, "loss": 0.3164, "num_input_tokens_seen": 26587520, "step": 12335 }, { "epoch": 2.264635712974858, "grad_norm": 0.6634453535079956, "learning_rate": 9.994675093471833e-06, "loss": 0.3163, "num_input_tokens_seen": 26597248, "step": 12340 }, { "epoch": 2.26555331253441, "grad_norm": 1.3685691356658936, "learning_rate": 9.994638083123717e-06, "loss": 0.3213, "num_input_tokens_seen": 26607616, "step": 12345 }, { "epoch": 2.266470912093962, "grad_norm": 1.947274923324585, "learning_rate": 9.994600944670876e-06, "loss": 0.2724, "num_input_tokens_seen": 26618304, "step": 12350 }, { "epoch": 2.2673885116535146, "grad_norm": 0.720862090587616, "learning_rate": 9.99456367811426e-06, "loss": 0.2132, "num_input_tokens_seen": 26629664, "step": 12355 }, { "epoch": 2.2683061112130667, "grad_norm": 2.571432590484619, "learning_rate": 9.994526283454826e-06, "loss": 0.2742, "num_input_tokens_seen": 26640864, "step": 12360 }, { "epoch": 2.2692237107726188, "grad_norm": 0.8174633979797363, "learning_rate": 9.994488760693535e-06, "loss": 0.3373, "num_input_tokens_seen": 26650688, "step": 12365 }, { "epoch": 2.2701413103321713, "grad_norm": 1.1817001104354858, "learning_rate": 9.994451109831347e-06, "loss": 0.3033, "num_input_tokens_seen": 26661152, "step": 12370 }, { "epoch": 2.2710589098917233, "grad_norm": 7.802074909210205, "learning_rate": 9.994413330869229e-06, "loss": 0.3861, "num_input_tokens_seen": 26670592, "step": 12375 }, { "epoch": 2.2719765094512754, "grad_norm": 7.505598545074463, "learning_rate": 9.99437542380815e-06, "loss": 0.3119, "num_input_tokens_seen": 26682816, "step": 12380 }, { "epoch": 2.272894109010828, "grad_norm": 3.3216848373413086, "learning_rate": 9.994337388649082e-06, "loss": 0.2203, "num_input_tokens_seen": 26694208, "step": 12385 }, { "epoch": 2.27381170857038, "grad_norm": 0.7590365409851074, "learning_rate": 9.994299225393e-06, "loss": 0.3403, "num_input_tokens_seen": 26705568, "step": 12390 }, { "epoch": 2.274729308129932, "grad_norm": 0.5027832984924316, "learning_rate": 9.994260934040884e-06, "loss": 0.3037, "num_input_tokens_seen": 26715904, "step": 12395 }, { "epoch": 2.2756469076894845, "grad_norm": 1.245766282081604, "learning_rate": 9.994222514593715e-06, "loss": 0.2851, "num_input_tokens_seen": 26726784, "step": 12400 }, { "epoch": 2.2765645072490366, "grad_norm": 1.6061426401138306, "learning_rate": 9.99418396705248e-06, "loss": 0.3084, "num_input_tokens_seen": 26738176, "step": 12405 }, { "epoch": 2.2774821068085886, "grad_norm": 0.784661591053009, "learning_rate": 9.994145291418165e-06, "loss": 0.2708, "num_input_tokens_seen": 26748224, "step": 12410 }, { "epoch": 2.278399706368141, "grad_norm": 1.2222384214401245, "learning_rate": 9.994106487691763e-06, "loss": 0.3878, "num_input_tokens_seen": 26759904, "step": 12415 }, { "epoch": 2.279317305927693, "grad_norm": 0.9593188166618347, "learning_rate": 9.994067555874272e-06, "loss": 0.3519, "num_input_tokens_seen": 26771520, "step": 12420 }, { "epoch": 2.2802349054872453, "grad_norm": 3.347043514251709, "learning_rate": 9.994028495966686e-06, "loss": 0.245, "num_input_tokens_seen": 26782880, "step": 12425 }, { "epoch": 2.281152505046798, "grad_norm": 2.1568562984466553, "learning_rate": 9.99398930797001e-06, "loss": 0.2957, "num_input_tokens_seen": 26793472, "step": 12430 }, { "epoch": 2.28207010460635, "grad_norm": 5.598630428314209, "learning_rate": 9.993949991885248e-06, "loss": 0.3371, "num_input_tokens_seen": 26804192, "step": 12435 }, { "epoch": 2.282987704165902, "grad_norm": 12.733397483825684, "learning_rate": 9.99391054771341e-06, "loss": 0.3948, "num_input_tokens_seen": 26815200, "step": 12440 }, { "epoch": 2.2839053037254544, "grad_norm": 6.373460292816162, "learning_rate": 9.993870975455506e-06, "loss": 0.5455, "num_input_tokens_seen": 26825728, "step": 12445 }, { "epoch": 2.2848229032850065, "grad_norm": 1.5328227281570435, "learning_rate": 9.99383127511255e-06, "loss": 0.5409, "num_input_tokens_seen": 26837056, "step": 12450 }, { "epoch": 2.2857405028445585, "grad_norm": 6.862070560455322, "learning_rate": 9.993791446685562e-06, "loss": 0.3005, "num_input_tokens_seen": 26848384, "step": 12455 }, { "epoch": 2.286658102404111, "grad_norm": 1.5365235805511475, "learning_rate": 9.993751490175563e-06, "loss": 0.3576, "num_input_tokens_seen": 26858528, "step": 12460 }, { "epoch": 2.287575701963663, "grad_norm": 3.1812727451324463, "learning_rate": 9.993711405583579e-06, "loss": 0.4087, "num_input_tokens_seen": 26868096, "step": 12465 }, { "epoch": 2.288493301523215, "grad_norm": 1.139825463294983, "learning_rate": 9.993671192910635e-06, "loss": 0.4515, "num_input_tokens_seen": 26879264, "step": 12470 }, { "epoch": 2.2894109010827677, "grad_norm": 1.5325828790664673, "learning_rate": 9.993630852157765e-06, "loss": 0.4193, "num_input_tokens_seen": 26890176, "step": 12475 }, { "epoch": 2.2903285006423197, "grad_norm": 1.6278049945831299, "learning_rate": 9.993590383326003e-06, "loss": 0.3303, "num_input_tokens_seen": 26901952, "step": 12480 }, { "epoch": 2.291246100201872, "grad_norm": 0.931205153465271, "learning_rate": 9.993549786416389e-06, "loss": 0.2689, "num_input_tokens_seen": 26912736, "step": 12485 }, { "epoch": 2.2921636997614243, "grad_norm": 2.3595564365386963, "learning_rate": 9.99350906142996e-06, "loss": 0.2681, "num_input_tokens_seen": 26923168, "step": 12490 }, { "epoch": 2.2930812993209764, "grad_norm": 1.2940713167190552, "learning_rate": 9.993468208367765e-06, "loss": 0.2511, "num_input_tokens_seen": 26934048, "step": 12495 }, { "epoch": 2.2939988988805284, "grad_norm": 1.2405633926391602, "learning_rate": 9.993427227230847e-06, "loss": 0.4385, "num_input_tokens_seen": 26944928, "step": 12500 }, { "epoch": 2.294916498440081, "grad_norm": 1.8623872995376587, "learning_rate": 9.993386118020262e-06, "loss": 0.4659, "num_input_tokens_seen": 26955264, "step": 12505 }, { "epoch": 2.295834097999633, "grad_norm": 4.542228698730469, "learning_rate": 9.99334488073706e-06, "loss": 0.2749, "num_input_tokens_seen": 26965728, "step": 12510 }, { "epoch": 2.296751697559185, "grad_norm": 2.0849807262420654, "learning_rate": 9.993303515382302e-06, "loss": 0.2798, "num_input_tokens_seen": 26976576, "step": 12515 }, { "epoch": 2.2976692971187376, "grad_norm": 4.125819683074951, "learning_rate": 9.993262021957048e-06, "loss": 0.3056, "num_input_tokens_seen": 26987776, "step": 12520 }, { "epoch": 2.2985868966782896, "grad_norm": 1.249102234840393, "learning_rate": 9.993220400462362e-06, "loss": 0.2992, "num_input_tokens_seen": 26998656, "step": 12525 }, { "epoch": 2.2995044962378417, "grad_norm": 0.961449921131134, "learning_rate": 9.993178650899312e-06, "loss": 0.3785, "num_input_tokens_seen": 27009184, "step": 12530 }, { "epoch": 2.300422095797394, "grad_norm": 1.4997605085372925, "learning_rate": 9.993136773268967e-06, "loss": 0.3631, "num_input_tokens_seen": 27017760, "step": 12535 }, { "epoch": 2.3013396953569463, "grad_norm": 2.8563339710235596, "learning_rate": 9.993094767572401e-06, "loss": 0.2921, "num_input_tokens_seen": 27028416, "step": 12540 }, { "epoch": 2.3022572949164983, "grad_norm": 0.6491833329200745, "learning_rate": 9.993052633810697e-06, "loss": 0.3961, "num_input_tokens_seen": 27039680, "step": 12545 }, { "epoch": 2.303174894476051, "grad_norm": 2.537346124649048, "learning_rate": 9.993010371984929e-06, "loss": 0.3082, "num_input_tokens_seen": 27050656, "step": 12550 }, { "epoch": 2.304092494035603, "grad_norm": 0.8646541237831116, "learning_rate": 9.992967982096183e-06, "loss": 0.3036, "num_input_tokens_seen": 27061248, "step": 12555 }, { "epoch": 2.305010093595155, "grad_norm": 0.5718975067138672, "learning_rate": 9.992925464145548e-06, "loss": 0.3363, "num_input_tokens_seen": 27072416, "step": 12560 }, { "epoch": 2.3059276931547075, "grad_norm": 0.7109525203704834, "learning_rate": 9.992882818134114e-06, "loss": 0.336, "num_input_tokens_seen": 27083392, "step": 12565 }, { "epoch": 2.3068452927142595, "grad_norm": 0.6133586764335632, "learning_rate": 9.99284004406297e-06, "loss": 0.3244, "num_input_tokens_seen": 27094112, "step": 12570 }, { "epoch": 2.3077628922738116, "grad_norm": 0.6091567873954773, "learning_rate": 9.99279714193322e-06, "loss": 0.2901, "num_input_tokens_seen": 27103520, "step": 12575 }, { "epoch": 2.308680491833364, "grad_norm": 1.9674370288848877, "learning_rate": 9.992754111745961e-06, "loss": 0.3449, "num_input_tokens_seen": 27115104, "step": 12580 }, { "epoch": 2.309598091392916, "grad_norm": 0.9835459589958191, "learning_rate": 9.992710953502298e-06, "loss": 0.2809, "num_input_tokens_seen": 27127200, "step": 12585 }, { "epoch": 2.3105156909524682, "grad_norm": 4.547301292419434, "learning_rate": 9.992667667203336e-06, "loss": 0.4058, "num_input_tokens_seen": 27138624, "step": 12590 }, { "epoch": 2.3114332905120207, "grad_norm": 0.8600929379463196, "learning_rate": 9.992624252850186e-06, "loss": 0.3353, "num_input_tokens_seen": 27149824, "step": 12595 }, { "epoch": 2.312350890071573, "grad_norm": 0.9902522563934326, "learning_rate": 9.992580710443962e-06, "loss": 0.1933, "num_input_tokens_seen": 27160288, "step": 12600 }, { "epoch": 2.313268489631125, "grad_norm": 0.9364607334136963, "learning_rate": 9.992537039985782e-06, "loss": 0.3553, "num_input_tokens_seen": 27172384, "step": 12605 }, { "epoch": 2.3141860891906774, "grad_norm": 7.587008953094482, "learning_rate": 9.992493241476761e-06, "loss": 0.3981, "num_input_tokens_seen": 27184096, "step": 12610 }, { "epoch": 2.3151036887502294, "grad_norm": 0.9977240562438965, "learning_rate": 9.99244931491803e-06, "loss": 0.3842, "num_input_tokens_seen": 27195808, "step": 12615 }, { "epoch": 2.3160212883097815, "grad_norm": 6.028744220733643, "learning_rate": 9.99240526031071e-06, "loss": 0.3174, "num_input_tokens_seen": 27206816, "step": 12620 }, { "epoch": 2.316938887869334, "grad_norm": 1.0374542474746704, "learning_rate": 9.992361077655933e-06, "loss": 0.4341, "num_input_tokens_seen": 27218304, "step": 12625 }, { "epoch": 2.317856487428886, "grad_norm": 0.6850430369377136, "learning_rate": 9.99231676695483e-06, "loss": 0.4197, "num_input_tokens_seen": 27228832, "step": 12630 }, { "epoch": 2.318774086988438, "grad_norm": 1.3016117811203003, "learning_rate": 9.992272328208542e-06, "loss": 0.3052, "num_input_tokens_seen": 27238624, "step": 12635 }, { "epoch": 2.3196916865479906, "grad_norm": 1.7609020471572876, "learning_rate": 9.992227761418206e-06, "loss": 0.3441, "num_input_tokens_seen": 27249856, "step": 12640 }, { "epoch": 2.3206092861075427, "grad_norm": 0.48571133613586426, "learning_rate": 9.992183066584964e-06, "loss": 0.3444, "num_input_tokens_seen": 27260640, "step": 12645 }, { "epoch": 2.3215268856670948, "grad_norm": 1.2151648998260498, "learning_rate": 9.992138243709964e-06, "loss": 0.3081, "num_input_tokens_seen": 27270944, "step": 12650 }, { "epoch": 2.3224444852266473, "grad_norm": 1.8722914457321167, "learning_rate": 9.992093292794355e-06, "loss": 0.3233, "num_input_tokens_seen": 27281536, "step": 12655 }, { "epoch": 2.3233620847861993, "grad_norm": 0.9902397990226746, "learning_rate": 9.99204821383929e-06, "loss": 0.415, "num_input_tokens_seen": 27293280, "step": 12660 }, { "epoch": 2.3242796843457514, "grad_norm": 6.006730556488037, "learning_rate": 9.992003006845924e-06, "loss": 0.3399, "num_input_tokens_seen": 27304224, "step": 12665 }, { "epoch": 2.325197283905304, "grad_norm": 2.181220054626465, "learning_rate": 9.991957671815418e-06, "loss": 0.3662, "num_input_tokens_seen": 27314368, "step": 12670 }, { "epoch": 2.326114883464856, "grad_norm": 0.989963173866272, "learning_rate": 9.991912208748937e-06, "loss": 0.3201, "num_input_tokens_seen": 27324896, "step": 12675 }, { "epoch": 2.327032483024408, "grad_norm": 2.636638641357422, "learning_rate": 9.991866617647643e-06, "loss": 0.3409, "num_input_tokens_seen": 27335200, "step": 12680 }, { "epoch": 2.3279500825839605, "grad_norm": 0.4985384941101074, "learning_rate": 9.991820898512706e-06, "loss": 0.2937, "num_input_tokens_seen": 27345888, "step": 12685 }, { "epoch": 2.3288676821435126, "grad_norm": 0.5314604043960571, "learning_rate": 9.9917750513453e-06, "loss": 0.3348, "num_input_tokens_seen": 27356480, "step": 12690 }, { "epoch": 2.3297852817030646, "grad_norm": 0.7179103493690491, "learning_rate": 9.991729076146602e-06, "loss": 0.2859, "num_input_tokens_seen": 27368128, "step": 12695 }, { "epoch": 2.330702881262617, "grad_norm": 4.351797580718994, "learning_rate": 9.991682972917788e-06, "loss": 0.3977, "num_input_tokens_seen": 27379904, "step": 12700 }, { "epoch": 2.331620480822169, "grad_norm": 1.1624847650527954, "learning_rate": 9.991636741660044e-06, "loss": 0.3492, "num_input_tokens_seen": 27390784, "step": 12705 }, { "epoch": 2.3325380803817213, "grad_norm": 1.9006068706512451, "learning_rate": 9.991590382374552e-06, "loss": 0.3407, "num_input_tokens_seen": 27401344, "step": 12710 }, { "epoch": 2.333455679941274, "grad_norm": 0.6084515452384949, "learning_rate": 9.991543895062504e-06, "loss": 0.3176, "num_input_tokens_seen": 27412416, "step": 12715 }, { "epoch": 2.334373279500826, "grad_norm": 5.605698585510254, "learning_rate": 9.991497279725091e-06, "loss": 0.3665, "num_input_tokens_seen": 27423744, "step": 12720 }, { "epoch": 2.335290879060378, "grad_norm": 1.2187572717666626, "learning_rate": 9.99145053636351e-06, "loss": 0.27, "num_input_tokens_seen": 27434464, "step": 12725 }, { "epoch": 2.3362084786199304, "grad_norm": 0.6584973931312561, "learning_rate": 9.991403664978959e-06, "loss": 0.3107, "num_input_tokens_seen": 27445504, "step": 12730 }, { "epoch": 2.3371260781794825, "grad_norm": 4.989978790283203, "learning_rate": 9.991356665572639e-06, "loss": 0.3144, "num_input_tokens_seen": 27456384, "step": 12735 }, { "epoch": 2.3380436777390345, "grad_norm": 4.634035587310791, "learning_rate": 9.991309538145754e-06, "loss": 0.3902, "num_input_tokens_seen": 27467328, "step": 12740 }, { "epoch": 2.338961277298587, "grad_norm": 1.0003902912139893, "learning_rate": 9.991262282699518e-06, "loss": 0.3629, "num_input_tokens_seen": 27479200, "step": 12745 }, { "epoch": 2.339878876858139, "grad_norm": 1.384547472000122, "learning_rate": 9.99121489923514e-06, "loss": 0.2138, "num_input_tokens_seen": 27489120, "step": 12750 }, { "epoch": 2.340796476417691, "grad_norm": 0.7193577885627747, "learning_rate": 9.991167387753834e-06, "loss": 0.3762, "num_input_tokens_seen": 27500736, "step": 12755 }, { "epoch": 2.3417140759772437, "grad_norm": 1.2891845703125, "learning_rate": 9.99111974825682e-06, "loss": 0.2414, "num_input_tokens_seen": 27511392, "step": 12760 }, { "epoch": 2.3426316755367957, "grad_norm": 2.6633718013763428, "learning_rate": 9.991071980745323e-06, "loss": 0.3528, "num_input_tokens_seen": 27522368, "step": 12765 }, { "epoch": 2.343549275096348, "grad_norm": 1.0006955862045288, "learning_rate": 9.991024085220563e-06, "loss": 0.3147, "num_input_tokens_seen": 27532832, "step": 12770 }, { "epoch": 2.3444668746559003, "grad_norm": 1.25566565990448, "learning_rate": 9.99097606168377e-06, "loss": 0.3215, "num_input_tokens_seen": 27543392, "step": 12775 }, { "epoch": 2.3453844742154524, "grad_norm": 1.1360628604888916, "learning_rate": 9.990927910136178e-06, "loss": 0.2634, "num_input_tokens_seen": 27553696, "step": 12780 }, { "epoch": 2.3463020737750044, "grad_norm": 3.8633556365966797, "learning_rate": 9.99087963057902e-06, "loss": 0.3889, "num_input_tokens_seen": 27565120, "step": 12785 }, { "epoch": 2.347219673334557, "grad_norm": 1.5582219362258911, "learning_rate": 9.990831223013533e-06, "loss": 0.2394, "num_input_tokens_seen": 27575200, "step": 12790 }, { "epoch": 2.348137272894109, "grad_norm": 1.1571791172027588, "learning_rate": 9.990782687440962e-06, "loss": 0.3279, "num_input_tokens_seen": 27587072, "step": 12795 }, { "epoch": 2.349054872453661, "grad_norm": 7.19442081451416, "learning_rate": 9.99073402386255e-06, "loss": 0.488, "num_input_tokens_seen": 27597024, "step": 12800 }, { "epoch": 2.3499724720132136, "grad_norm": 5.941720962524414, "learning_rate": 9.990685232279544e-06, "loss": 0.3829, "num_input_tokens_seen": 27606272, "step": 12805 }, { "epoch": 2.3508900715727656, "grad_norm": 0.8875526785850525, "learning_rate": 9.990636312693197e-06, "loss": 0.4256, "num_input_tokens_seen": 27616704, "step": 12810 }, { "epoch": 2.3518076711323177, "grad_norm": 2.5759589672088623, "learning_rate": 9.990587265104765e-06, "loss": 0.4297, "num_input_tokens_seen": 27627328, "step": 12815 }, { "epoch": 2.35272527069187, "grad_norm": 1.7915289402008057, "learning_rate": 9.990538089515503e-06, "loss": 0.284, "num_input_tokens_seen": 27638816, "step": 12820 }, { "epoch": 2.3536428702514223, "grad_norm": 2.376924753189087, "learning_rate": 9.990488785926675e-06, "loss": 0.3688, "num_input_tokens_seen": 27649504, "step": 12825 }, { "epoch": 2.3545604698109743, "grad_norm": 4.483773708343506, "learning_rate": 9.990439354339544e-06, "loss": 0.3759, "num_input_tokens_seen": 27661184, "step": 12830 }, { "epoch": 2.355478069370527, "grad_norm": 1.0425326824188232, "learning_rate": 9.990389794755375e-06, "loss": 0.3278, "num_input_tokens_seen": 27671776, "step": 12835 }, { "epoch": 2.356395668930079, "grad_norm": 1.637963891029358, "learning_rate": 9.990340107175444e-06, "loss": 0.3262, "num_input_tokens_seen": 27683136, "step": 12840 }, { "epoch": 2.357313268489631, "grad_norm": 2.11934232711792, "learning_rate": 9.990290291601024e-06, "loss": 0.3112, "num_input_tokens_seen": 27694144, "step": 12845 }, { "epoch": 2.3582308680491835, "grad_norm": 5.422089576721191, "learning_rate": 9.990240348033392e-06, "loss": 0.2821, "num_input_tokens_seen": 27704032, "step": 12850 }, { "epoch": 2.3591484676087355, "grad_norm": 1.3943110704421997, "learning_rate": 9.99019027647383e-06, "loss": 0.3808, "num_input_tokens_seen": 27715520, "step": 12855 }, { "epoch": 2.3600660671682876, "grad_norm": 1.2934608459472656, "learning_rate": 9.99014007692362e-06, "loss": 0.3082, "num_input_tokens_seen": 27726752, "step": 12860 }, { "epoch": 2.36098366672784, "grad_norm": 0.7160236835479736, "learning_rate": 9.990089749384053e-06, "loss": 0.335, "num_input_tokens_seen": 27738304, "step": 12865 }, { "epoch": 2.361901266287392, "grad_norm": 1.129476547241211, "learning_rate": 9.990039293856415e-06, "loss": 0.2742, "num_input_tokens_seen": 27749824, "step": 12870 }, { "epoch": 2.3628188658469442, "grad_norm": 5.476420879364014, "learning_rate": 9.989988710342006e-06, "loss": 0.4017, "num_input_tokens_seen": 27761472, "step": 12875 }, { "epoch": 2.3637364654064967, "grad_norm": 0.7524293661117554, "learning_rate": 9.989937998842119e-06, "loss": 0.2767, "num_input_tokens_seen": 27773824, "step": 12880 }, { "epoch": 2.364654064966049, "grad_norm": 1.2195981740951538, "learning_rate": 9.989887159358056e-06, "loss": 0.3216, "num_input_tokens_seen": 27784832, "step": 12885 }, { "epoch": 2.365571664525601, "grad_norm": 1.0242998600006104, "learning_rate": 9.98983619189112e-06, "loss": 0.3474, "num_input_tokens_seen": 27795328, "step": 12890 }, { "epoch": 2.3664892640851534, "grad_norm": 0.7428274750709534, "learning_rate": 9.989785096442622e-06, "loss": 0.2789, "num_input_tokens_seen": 27806304, "step": 12895 }, { "epoch": 2.3674068636447054, "grad_norm": 0.6066387891769409, "learning_rate": 9.989733873013867e-06, "loss": 0.359, "num_input_tokens_seen": 27815424, "step": 12900 }, { "epoch": 2.3683244632042575, "grad_norm": 0.4544408619403839, "learning_rate": 9.989682521606171e-06, "loss": 0.334, "num_input_tokens_seen": 27826848, "step": 12905 }, { "epoch": 2.36924206276381, "grad_norm": 1.5110423564910889, "learning_rate": 9.989631042220855e-06, "loss": 0.3016, "num_input_tokens_seen": 27838016, "step": 12910 }, { "epoch": 2.370159662323362, "grad_norm": 0.8333905339241028, "learning_rate": 9.989579434859233e-06, "loss": 0.3378, "num_input_tokens_seen": 27849152, "step": 12915 }, { "epoch": 2.371077261882914, "grad_norm": 0.42801633477211, "learning_rate": 9.989527699522634e-06, "loss": 0.2843, "num_input_tokens_seen": 27860032, "step": 12920 }, { "epoch": 2.3719948614424666, "grad_norm": 0.8404970765113831, "learning_rate": 9.98947583621238e-06, "loss": 0.3162, "num_input_tokens_seen": 27870944, "step": 12925 }, { "epoch": 2.3729124610020187, "grad_norm": 0.7475168704986572, "learning_rate": 9.989423844929806e-06, "loss": 0.3124, "num_input_tokens_seen": 27881120, "step": 12930 }, { "epoch": 2.3738300605615708, "grad_norm": 1.9358594417572021, "learning_rate": 9.989371725676242e-06, "loss": 0.3351, "num_input_tokens_seen": 27892096, "step": 12935 }, { "epoch": 2.3747476601211233, "grad_norm": 0.8753983974456787, "learning_rate": 9.989319478453028e-06, "loss": 0.3977, "num_input_tokens_seen": 27903936, "step": 12940 }, { "epoch": 2.3756652596806753, "grad_norm": 1.0801340341567993, "learning_rate": 9.9892671032615e-06, "loss": 0.3593, "num_input_tokens_seen": 27915328, "step": 12945 }, { "epoch": 2.3765828592402274, "grad_norm": 1.3269226551055908, "learning_rate": 9.989214600103003e-06, "loss": 0.3269, "num_input_tokens_seen": 27926016, "step": 12950 }, { "epoch": 2.37750045879978, "grad_norm": 0.7764836549758911, "learning_rate": 9.989161968978887e-06, "loss": 0.2981, "num_input_tokens_seen": 27937024, "step": 12955 }, { "epoch": 2.378418058359332, "grad_norm": 0.7054703831672668, "learning_rate": 9.989109209890496e-06, "loss": 0.4234, "num_input_tokens_seen": 27946496, "step": 12960 }, { "epoch": 2.379335657918884, "grad_norm": 1.0451115369796753, "learning_rate": 9.989056322839188e-06, "loss": 0.3708, "num_input_tokens_seen": 27957280, "step": 12965 }, { "epoch": 2.3802532574784365, "grad_norm": 0.9668044447898865, "learning_rate": 9.989003307826317e-06, "loss": 0.2924, "num_input_tokens_seen": 27968416, "step": 12970 }, { "epoch": 2.3811708570379886, "grad_norm": 1.1954143047332764, "learning_rate": 9.988950164853244e-06, "loss": 0.3402, "num_input_tokens_seen": 27979488, "step": 12975 }, { "epoch": 2.3820884565975406, "grad_norm": 4.638052940368652, "learning_rate": 9.988896893921331e-06, "loss": 0.4021, "num_input_tokens_seen": 27989760, "step": 12980 }, { "epoch": 2.383006056157093, "grad_norm": 1.1505131721496582, "learning_rate": 9.988843495031944e-06, "loss": 0.4091, "num_input_tokens_seen": 28000960, "step": 12985 }, { "epoch": 2.383923655716645, "grad_norm": 0.5401841998100281, "learning_rate": 9.988789968186455e-06, "loss": 0.2725, "num_input_tokens_seen": 28011264, "step": 12990 }, { "epoch": 2.3848412552761973, "grad_norm": 0.7197602391242981, "learning_rate": 9.988736313386234e-06, "loss": 0.3223, "num_input_tokens_seen": 28021792, "step": 12995 }, { "epoch": 2.38575885483575, "grad_norm": 0.6836327910423279, "learning_rate": 9.988682530632659e-06, "loss": 0.3338, "num_input_tokens_seen": 28032320, "step": 13000 }, { "epoch": 2.386676454395302, "grad_norm": 0.7093567252159119, "learning_rate": 9.988628619927108e-06, "loss": 0.3215, "num_input_tokens_seen": 28042560, "step": 13005 }, { "epoch": 2.387594053954854, "grad_norm": 1.0148372650146484, "learning_rate": 9.988574581270965e-06, "loss": 0.2916, "num_input_tokens_seen": 28053184, "step": 13010 }, { "epoch": 2.3885116535144064, "grad_norm": 1.9574393033981323, "learning_rate": 9.988520414665615e-06, "loss": 0.387, "num_input_tokens_seen": 28063808, "step": 13015 }, { "epoch": 2.3894292530739585, "grad_norm": 2.6041018962860107, "learning_rate": 9.988466120112448e-06, "loss": 0.2843, "num_input_tokens_seen": 28075200, "step": 13020 }, { "epoch": 2.3903468526335105, "grad_norm": 0.5513238906860352, "learning_rate": 9.988411697612858e-06, "loss": 0.2672, "num_input_tokens_seen": 28086528, "step": 13025 }, { "epoch": 2.391264452193063, "grad_norm": 1.4047077894210815, "learning_rate": 9.988357147168237e-06, "loss": 0.2965, "num_input_tokens_seen": 28096224, "step": 13030 }, { "epoch": 2.392182051752615, "grad_norm": 0.6081379652023315, "learning_rate": 9.98830246877999e-06, "loss": 0.3966, "num_input_tokens_seen": 28106208, "step": 13035 }, { "epoch": 2.393099651312167, "grad_norm": 0.999946117401123, "learning_rate": 9.988247662449513e-06, "loss": 0.322, "num_input_tokens_seen": 28117760, "step": 13040 }, { "epoch": 2.3940172508717197, "grad_norm": 1.2472606897354126, "learning_rate": 9.988192728178214e-06, "loss": 0.3354, "num_input_tokens_seen": 28129376, "step": 13045 }, { "epoch": 2.3949348504312717, "grad_norm": 6.4090471267700195, "learning_rate": 9.988137665967503e-06, "loss": 0.4192, "num_input_tokens_seen": 28140992, "step": 13050 }, { "epoch": 2.395852449990824, "grad_norm": 1.3963038921356201, "learning_rate": 9.988082475818794e-06, "loss": 0.3631, "num_input_tokens_seen": 28152448, "step": 13055 }, { "epoch": 2.3967700495503763, "grad_norm": 1.244479775428772, "learning_rate": 9.988027157733497e-06, "loss": 0.257, "num_input_tokens_seen": 28162016, "step": 13060 }, { "epoch": 2.3976876491099284, "grad_norm": 2.157813787460327, "learning_rate": 9.987971711713036e-06, "loss": 0.3465, "num_input_tokens_seen": 28172864, "step": 13065 }, { "epoch": 2.3986052486694804, "grad_norm": 1.5762406587600708, "learning_rate": 9.987916137758832e-06, "loss": 0.2883, "num_input_tokens_seen": 28182144, "step": 13070 }, { "epoch": 2.399522848229033, "grad_norm": 1.487293004989624, "learning_rate": 9.987860435872308e-06, "loss": 0.254, "num_input_tokens_seen": 28192608, "step": 13075 }, { "epoch": 2.400440447788585, "grad_norm": 8.179920196533203, "learning_rate": 9.987804606054897e-06, "loss": 0.274, "num_input_tokens_seen": 28203488, "step": 13080 }, { "epoch": 2.401358047348137, "grad_norm": 2.669578790664673, "learning_rate": 9.987748648308024e-06, "loss": 0.3041, "num_input_tokens_seen": 28214080, "step": 13085 }, { "epoch": 2.4022756469076896, "grad_norm": 0.9786076545715332, "learning_rate": 9.987692562633132e-06, "loss": 0.5126, "num_input_tokens_seen": 28224640, "step": 13090 }, { "epoch": 2.4031932464672416, "grad_norm": 0.8949984312057495, "learning_rate": 9.987636349031655e-06, "loss": 0.3208, "num_input_tokens_seen": 28234880, "step": 13095 }, { "epoch": 2.4041108460267937, "grad_norm": 0.7444722652435303, "learning_rate": 9.987580007505035e-06, "loss": 0.3065, "num_input_tokens_seen": 28245824, "step": 13100 }, { "epoch": 2.405028445586346, "grad_norm": 2.7757797241210938, "learning_rate": 9.987523538054717e-06, "loss": 0.2987, "num_input_tokens_seen": 28256832, "step": 13105 }, { "epoch": 2.4059460451458983, "grad_norm": 1.6290380954742432, "learning_rate": 9.987466940682154e-06, "loss": 0.3192, "num_input_tokens_seen": 28267872, "step": 13110 }, { "epoch": 2.4068636447054503, "grad_norm": 3.969377040863037, "learning_rate": 9.98741021538879e-06, "loss": 0.3396, "num_input_tokens_seen": 28277664, "step": 13115 }, { "epoch": 2.407781244265003, "grad_norm": 4.751615047454834, "learning_rate": 9.987353362176086e-06, "loss": 0.3926, "num_input_tokens_seen": 28286784, "step": 13120 }, { "epoch": 2.408698843824555, "grad_norm": 2.7569832801818848, "learning_rate": 9.987296381045497e-06, "loss": 0.4828, "num_input_tokens_seen": 28297344, "step": 13125 }, { "epoch": 2.409616443384107, "grad_norm": 1.1646804809570312, "learning_rate": 9.987239271998486e-06, "loss": 0.4479, "num_input_tokens_seen": 28308960, "step": 13130 }, { "epoch": 2.4105340429436595, "grad_norm": 1.2833523750305176, "learning_rate": 9.987182035036516e-06, "loss": 0.393, "num_input_tokens_seen": 28320032, "step": 13135 }, { "epoch": 2.4114516425032115, "grad_norm": 1.6707741022109985, "learning_rate": 9.987124670161057e-06, "loss": 0.3574, "num_input_tokens_seen": 28330368, "step": 13140 }, { "epoch": 2.4123692420627636, "grad_norm": 2.6157383918762207, "learning_rate": 9.987067177373579e-06, "loss": 0.3707, "num_input_tokens_seen": 28340192, "step": 13145 }, { "epoch": 2.413286841622316, "grad_norm": 1.556992530822754, "learning_rate": 9.987009556675558e-06, "loss": 0.3302, "num_input_tokens_seen": 28351584, "step": 13150 }, { "epoch": 2.414204441181868, "grad_norm": 3.074876070022583, "learning_rate": 9.986951808068472e-06, "loss": 0.301, "num_input_tokens_seen": 28362592, "step": 13155 }, { "epoch": 2.4151220407414202, "grad_norm": 0.7749272584915161, "learning_rate": 9.986893931553798e-06, "loss": 0.3095, "num_input_tokens_seen": 28373536, "step": 13160 }, { "epoch": 2.4160396403009727, "grad_norm": 0.5810155272483826, "learning_rate": 9.986835927133028e-06, "loss": 0.322, "num_input_tokens_seen": 28384544, "step": 13165 }, { "epoch": 2.416957239860525, "grad_norm": 1.6660937070846558, "learning_rate": 9.986777794807641e-06, "loss": 0.334, "num_input_tokens_seen": 28394208, "step": 13170 }, { "epoch": 2.417874839420077, "grad_norm": 1.997214436531067, "learning_rate": 9.986719534579135e-06, "loss": 0.2234, "num_input_tokens_seen": 28405504, "step": 13175 }, { "epoch": 2.4187924389796294, "grad_norm": 3.624441385269165, "learning_rate": 9.986661146449002e-06, "loss": 0.4189, "num_input_tokens_seen": 28416320, "step": 13180 }, { "epoch": 2.4197100385391814, "grad_norm": 1.2532398700714111, "learning_rate": 9.986602630418737e-06, "loss": 0.3253, "num_input_tokens_seen": 28428416, "step": 13185 }, { "epoch": 2.4206276380987335, "grad_norm": 10.992816925048828, "learning_rate": 9.986543986489845e-06, "loss": 0.3028, "num_input_tokens_seen": 28437824, "step": 13190 }, { "epoch": 2.421545237658286, "grad_norm": 2.911295175552368, "learning_rate": 9.986485214663826e-06, "loss": 0.321, "num_input_tokens_seen": 28448416, "step": 13195 }, { "epoch": 2.422462837217838, "grad_norm": 0.8462924957275391, "learning_rate": 9.986426314942192e-06, "loss": 0.2729, "num_input_tokens_seen": 28459520, "step": 13200 }, { "epoch": 2.42338043677739, "grad_norm": 3.5023674964904785, "learning_rate": 9.98636728732645e-06, "loss": 0.3041, "num_input_tokens_seen": 28470144, "step": 13205 }, { "epoch": 2.4242980363369426, "grad_norm": 5.266607284545898, "learning_rate": 9.986308131818116e-06, "loss": 0.297, "num_input_tokens_seen": 28480704, "step": 13210 }, { "epoch": 2.4252156358964947, "grad_norm": 7.218873023986816, "learning_rate": 9.986248848418706e-06, "loss": 0.3099, "num_input_tokens_seen": 28492544, "step": 13215 }, { "epoch": 2.4261332354560468, "grad_norm": 4.828380584716797, "learning_rate": 9.98618943712974e-06, "loss": 0.4048, "num_input_tokens_seen": 28503328, "step": 13220 }, { "epoch": 2.4270508350155993, "grad_norm": 0.5557964444160461, "learning_rate": 9.986129897952745e-06, "loss": 0.3111, "num_input_tokens_seen": 28514720, "step": 13225 }, { "epoch": 2.4279684345751513, "grad_norm": 7.823485851287842, "learning_rate": 9.986070230889244e-06, "loss": 0.3211, "num_input_tokens_seen": 28525152, "step": 13230 }, { "epoch": 2.4288860341347034, "grad_norm": 0.9267225861549377, "learning_rate": 9.986010435940771e-06, "loss": 0.4228, "num_input_tokens_seen": 28535552, "step": 13235 }, { "epoch": 2.429803633694256, "grad_norm": 1.218102216720581, "learning_rate": 9.985950513108858e-06, "loss": 0.3104, "num_input_tokens_seen": 28546112, "step": 13240 }, { "epoch": 2.430721233253808, "grad_norm": 0.9437767863273621, "learning_rate": 9.98589046239504e-06, "loss": 0.2907, "num_input_tokens_seen": 28556736, "step": 13245 }, { "epoch": 2.43163883281336, "grad_norm": 0.9558477997779846, "learning_rate": 9.98583028380086e-06, "loss": 0.281, "num_input_tokens_seen": 28568128, "step": 13250 }, { "epoch": 2.4325564323729125, "grad_norm": 1.138840675354004, "learning_rate": 9.98576997732786e-06, "loss": 0.4281, "num_input_tokens_seen": 28579104, "step": 13255 }, { "epoch": 2.4334740319324646, "grad_norm": 0.9429531097412109, "learning_rate": 9.985709542977589e-06, "loss": 0.3134, "num_input_tokens_seen": 28589984, "step": 13260 }, { "epoch": 2.4343916314920166, "grad_norm": 2.704833745956421, "learning_rate": 9.985648980751595e-06, "loss": 0.3914, "num_input_tokens_seen": 28601792, "step": 13265 }, { "epoch": 2.435309231051569, "grad_norm": 0.583808422088623, "learning_rate": 9.98558829065143e-06, "loss": 0.3052, "num_input_tokens_seen": 28612800, "step": 13270 }, { "epoch": 2.436226830611121, "grad_norm": 0.9034389853477478, "learning_rate": 9.985527472678654e-06, "loss": 0.3135, "num_input_tokens_seen": 28623712, "step": 13275 }, { "epoch": 2.4371444301706733, "grad_norm": 4.608211994171143, "learning_rate": 9.985466526834823e-06, "loss": 0.3742, "num_input_tokens_seen": 28635424, "step": 13280 }, { "epoch": 2.438062029730226, "grad_norm": 0.6527873277664185, "learning_rate": 9.985405453121505e-06, "loss": 0.3642, "num_input_tokens_seen": 28646912, "step": 13285 }, { "epoch": 2.438979629289778, "grad_norm": 0.7133873701095581, "learning_rate": 9.985344251540262e-06, "loss": 0.3147, "num_input_tokens_seen": 28656896, "step": 13290 }, { "epoch": 2.43989722884933, "grad_norm": 1.7939846515655518, "learning_rate": 9.985282922092667e-06, "loss": 0.3297, "num_input_tokens_seen": 28667680, "step": 13295 }, { "epoch": 2.4408148284088824, "grad_norm": 2.6231727600097656, "learning_rate": 9.98522146478029e-06, "loss": 0.3108, "num_input_tokens_seen": 28679072, "step": 13300 }, { "epoch": 2.4417324279684345, "grad_norm": 1.3766906261444092, "learning_rate": 9.985159879604708e-06, "loss": 0.3211, "num_input_tokens_seen": 28689408, "step": 13305 }, { "epoch": 2.4426500275279865, "grad_norm": 1.58781099319458, "learning_rate": 9.985098166567504e-06, "loss": 0.366, "num_input_tokens_seen": 28699712, "step": 13310 }, { "epoch": 2.443567627087539, "grad_norm": 3.7107021808624268, "learning_rate": 9.985036325670257e-06, "loss": 0.3227, "num_input_tokens_seen": 28710848, "step": 13315 }, { "epoch": 2.444485226647091, "grad_norm": 2.234231472015381, "learning_rate": 9.984974356914555e-06, "loss": 0.3448, "num_input_tokens_seen": 28721184, "step": 13320 }, { "epoch": 2.445402826206643, "grad_norm": 1.228846549987793, "learning_rate": 9.984912260301986e-06, "loss": 0.3222, "num_input_tokens_seen": 28730144, "step": 13325 }, { "epoch": 2.4463204257661957, "grad_norm": 3.78753924369812, "learning_rate": 9.984850035834144e-06, "loss": 0.3474, "num_input_tokens_seen": 28741984, "step": 13330 }, { "epoch": 2.4472380253257477, "grad_norm": 1.2786706686019897, "learning_rate": 9.984787683512624e-06, "loss": 0.3455, "num_input_tokens_seen": 28752992, "step": 13335 }, { "epoch": 2.4481556248853, "grad_norm": 3.4947543144226074, "learning_rate": 9.984725203339025e-06, "loss": 0.3369, "num_input_tokens_seen": 28764320, "step": 13340 }, { "epoch": 2.4490732244448523, "grad_norm": 2.9798424243927, "learning_rate": 9.98466259531495e-06, "loss": 0.4195, "num_input_tokens_seen": 28774560, "step": 13345 }, { "epoch": 2.4499908240044044, "grad_norm": 1.731554388999939, "learning_rate": 9.984599859442009e-06, "loss": 0.3056, "num_input_tokens_seen": 28784800, "step": 13350 }, { "epoch": 2.4509084235639564, "grad_norm": 2.0326955318450928, "learning_rate": 9.984536995721803e-06, "loss": 0.3439, "num_input_tokens_seen": 28795840, "step": 13355 }, { "epoch": 2.451826023123509, "grad_norm": 1.5037890672683716, "learning_rate": 9.984474004155948e-06, "loss": 0.3181, "num_input_tokens_seen": 28805696, "step": 13360 }, { "epoch": 2.452743622683061, "grad_norm": 1.311643123626709, "learning_rate": 9.984410884746062e-06, "loss": 0.2986, "num_input_tokens_seen": 28814528, "step": 13365 }, { "epoch": 2.4536612222426135, "grad_norm": 1.1259030103683472, "learning_rate": 9.984347637493761e-06, "loss": 0.3814, "num_input_tokens_seen": 28825184, "step": 13370 }, { "epoch": 2.4545788218021656, "grad_norm": 2.3411383628845215, "learning_rate": 9.984284262400668e-06, "loss": 0.3639, "num_input_tokens_seen": 28836032, "step": 13375 }, { "epoch": 2.4554964213617176, "grad_norm": 1.4436063766479492, "learning_rate": 9.984220759468409e-06, "loss": 0.3306, "num_input_tokens_seen": 28846272, "step": 13380 }, { "epoch": 2.45641402092127, "grad_norm": 1.410785436630249, "learning_rate": 9.984157128698612e-06, "loss": 0.3314, "num_input_tokens_seen": 28856896, "step": 13385 }, { "epoch": 2.457331620480822, "grad_norm": 1.220957636833191, "learning_rate": 9.98409337009291e-06, "loss": 0.3329, "num_input_tokens_seen": 28867520, "step": 13390 }, { "epoch": 2.4582492200403743, "grad_norm": 1.2128056287765503, "learning_rate": 9.984029483652937e-06, "loss": 0.3096, "num_input_tokens_seen": 28877504, "step": 13395 }, { "epoch": 2.459166819599927, "grad_norm": 0.8822658658027649, "learning_rate": 9.983965469380333e-06, "loss": 0.3199, "num_input_tokens_seen": 28888352, "step": 13400 }, { "epoch": 2.460084419159479, "grad_norm": 2.224652051925659, "learning_rate": 9.98390132727674e-06, "loss": 0.3294, "num_input_tokens_seen": 28897984, "step": 13405 }, { "epoch": 2.461002018719031, "grad_norm": 0.9364616870880127, "learning_rate": 9.9838370573438e-06, "loss": 0.3345, "num_input_tokens_seen": 28909568, "step": 13410 }, { "epoch": 2.4619196182785834, "grad_norm": 1.2640631198883057, "learning_rate": 9.983772659583166e-06, "loss": 0.3279, "num_input_tokens_seen": 28919840, "step": 13415 }, { "epoch": 2.4628372178381355, "grad_norm": 1.0646928548812866, "learning_rate": 9.983708133996486e-06, "loss": 0.3757, "num_input_tokens_seen": 28930048, "step": 13420 }, { "epoch": 2.4637548173976875, "grad_norm": 1.2801743745803833, "learning_rate": 9.983643480585416e-06, "loss": 0.3207, "num_input_tokens_seen": 28940832, "step": 13425 }, { "epoch": 2.46467241695724, "grad_norm": 1.0453059673309326, "learning_rate": 9.983578699351616e-06, "loss": 0.3104, "num_input_tokens_seen": 28951008, "step": 13430 }, { "epoch": 2.465590016516792, "grad_norm": 0.7661073803901672, "learning_rate": 9.983513790296747e-06, "loss": 0.354, "num_input_tokens_seen": 28961792, "step": 13435 }, { "epoch": 2.466507616076344, "grad_norm": 4.101332664489746, "learning_rate": 9.98344875342247e-06, "loss": 0.4011, "num_input_tokens_seen": 28972960, "step": 13440 }, { "epoch": 2.4674252156358967, "grad_norm": 1.4814894199371338, "learning_rate": 9.983383588730457e-06, "loss": 0.3789, "num_input_tokens_seen": 28984000, "step": 13445 }, { "epoch": 2.4683428151954487, "grad_norm": 1.9050945043563843, "learning_rate": 9.98331829622238e-06, "loss": 0.2821, "num_input_tokens_seen": 28994240, "step": 13450 }, { "epoch": 2.469260414755001, "grad_norm": 2.234020471572876, "learning_rate": 9.983252875899912e-06, "loss": 0.3489, "num_input_tokens_seen": 29004832, "step": 13455 }, { "epoch": 2.4701780143145533, "grad_norm": 0.8253656029701233, "learning_rate": 9.983187327764729e-06, "loss": 0.3083, "num_input_tokens_seen": 29014016, "step": 13460 }, { "epoch": 2.4710956138741054, "grad_norm": 0.9203344583511353, "learning_rate": 9.983121651818518e-06, "loss": 0.328, "num_input_tokens_seen": 29024416, "step": 13465 }, { "epoch": 2.4720132134336574, "grad_norm": 6.404714107513428, "learning_rate": 9.983055848062958e-06, "loss": 0.4551, "num_input_tokens_seen": 29036288, "step": 13470 }, { "epoch": 2.47293081299321, "grad_norm": 0.9209461808204651, "learning_rate": 9.982989916499736e-06, "loss": 0.3494, "num_input_tokens_seen": 29046784, "step": 13475 }, { "epoch": 2.473848412552762, "grad_norm": 1.5302410125732422, "learning_rate": 9.98292385713055e-06, "loss": 0.3448, "num_input_tokens_seen": 29057216, "step": 13480 }, { "epoch": 2.474766012112314, "grad_norm": 1.4651718139648438, "learning_rate": 9.982857669957086e-06, "loss": 0.3116, "num_input_tokens_seen": 29068128, "step": 13485 }, { "epoch": 2.4756836116718666, "grad_norm": 1.236925482749939, "learning_rate": 9.982791354981048e-06, "loss": 0.2865, "num_input_tokens_seen": 29079136, "step": 13490 }, { "epoch": 2.4766012112314186, "grad_norm": 4.691192626953125, "learning_rate": 9.982724912204132e-06, "loss": 0.367, "num_input_tokens_seen": 29089856, "step": 13495 }, { "epoch": 2.4775188107909707, "grad_norm": 0.8339153528213501, "learning_rate": 9.982658341628046e-06, "loss": 0.3651, "num_input_tokens_seen": 29100928, "step": 13500 }, { "epoch": 2.478436410350523, "grad_norm": 5.606436729431152, "learning_rate": 9.982591643254496e-06, "loss": 0.3671, "num_input_tokens_seen": 29112960, "step": 13505 }, { "epoch": 2.4793540099100753, "grad_norm": 6.279265880584717, "learning_rate": 9.982524817085193e-06, "loss": 0.4172, "num_input_tokens_seen": 29124576, "step": 13510 }, { "epoch": 2.4802716094696273, "grad_norm": 3.669311761856079, "learning_rate": 9.98245786312185e-06, "loss": 0.3341, "num_input_tokens_seen": 29135680, "step": 13515 }, { "epoch": 2.48118920902918, "grad_norm": 1.815921425819397, "learning_rate": 9.982390781366185e-06, "loss": 0.3421, "num_input_tokens_seen": 29147552, "step": 13520 }, { "epoch": 2.482106808588732, "grad_norm": 1.2825182676315308, "learning_rate": 9.982323571819919e-06, "loss": 0.3253, "num_input_tokens_seen": 29156064, "step": 13525 }, { "epoch": 2.483024408148284, "grad_norm": 1.0699779987335205, "learning_rate": 9.982256234484775e-06, "loss": 0.3032, "num_input_tokens_seen": 29167392, "step": 13530 }, { "epoch": 2.4839420077078365, "grad_norm": 0.9343776702880859, "learning_rate": 9.98218876936248e-06, "loss": 0.3015, "num_input_tokens_seen": 29177792, "step": 13535 }, { "epoch": 2.4848596072673885, "grad_norm": 1.5061516761779785, "learning_rate": 9.982121176454764e-06, "loss": 0.3113, "num_input_tokens_seen": 29189216, "step": 13540 }, { "epoch": 2.4857772068269406, "grad_norm": 1.3083621263504028, "learning_rate": 9.982053455763364e-06, "loss": 0.2523, "num_input_tokens_seen": 29200832, "step": 13545 }, { "epoch": 2.486694806386493, "grad_norm": 1.220425009727478, "learning_rate": 9.981985607290012e-06, "loss": 0.3005, "num_input_tokens_seen": 29211136, "step": 13550 }, { "epoch": 2.487612405946045, "grad_norm": 3.4528651237487793, "learning_rate": 9.98191763103645e-06, "loss": 0.3032, "num_input_tokens_seen": 29220960, "step": 13555 }, { "epoch": 2.488530005505597, "grad_norm": 1.2801169157028198, "learning_rate": 9.981849527004425e-06, "loss": 0.3439, "num_input_tokens_seen": 29232416, "step": 13560 }, { "epoch": 2.4894476050651497, "grad_norm": 3.683668613433838, "learning_rate": 9.981781295195678e-06, "loss": 0.2814, "num_input_tokens_seen": 29242912, "step": 13565 }, { "epoch": 2.490365204624702, "grad_norm": 2.1571431159973145, "learning_rate": 9.981712935611964e-06, "loss": 0.4119, "num_input_tokens_seen": 29255040, "step": 13570 }, { "epoch": 2.491282804184254, "grad_norm": 1.3067395687103271, "learning_rate": 9.981644448255033e-06, "loss": 0.2333, "num_input_tokens_seen": 29265248, "step": 13575 }, { "epoch": 2.4922004037438064, "grad_norm": 1.1081959009170532, "learning_rate": 9.981575833126643e-06, "loss": 0.2158, "num_input_tokens_seen": 29275904, "step": 13580 }, { "epoch": 2.4931180033033584, "grad_norm": 3.9009742736816406, "learning_rate": 9.981507090228553e-06, "loss": 0.3172, "num_input_tokens_seen": 29287232, "step": 13585 }, { "epoch": 2.4940356028629105, "grad_norm": 2.9097368717193604, "learning_rate": 9.981438219562529e-06, "loss": 0.3381, "num_input_tokens_seen": 29298240, "step": 13590 }, { "epoch": 2.494953202422463, "grad_norm": 2.2093265056610107, "learning_rate": 9.981369221130332e-06, "loss": 0.3226, "num_input_tokens_seen": 29309216, "step": 13595 }, { "epoch": 2.495870801982015, "grad_norm": 6.9858622550964355, "learning_rate": 9.981300094933737e-06, "loss": 0.4425, "num_input_tokens_seen": 29321824, "step": 13600 }, { "epoch": 2.496788401541567, "grad_norm": 2.965378522872925, "learning_rate": 9.981230840974514e-06, "loss": 0.1235, "num_input_tokens_seen": 29332992, "step": 13605 }, { "epoch": 2.4977060011011196, "grad_norm": 1.8331968784332275, "learning_rate": 9.98116145925444e-06, "loss": 0.3759, "num_input_tokens_seen": 29343008, "step": 13610 }, { "epoch": 2.4986236006606717, "grad_norm": 2.968139171600342, "learning_rate": 9.981091949775296e-06, "loss": 0.3534, "num_input_tokens_seen": 29354240, "step": 13615 }, { "epoch": 2.4995412002202237, "grad_norm": 4.919466495513916, "learning_rate": 9.981022312538862e-06, "loss": 0.3006, "num_input_tokens_seen": 29365824, "step": 13620 }, { "epoch": 2.5004587997797763, "grad_norm": 5.078500270843506, "learning_rate": 9.980952547546927e-06, "loss": 0.216, "num_input_tokens_seen": 29377568, "step": 13625 }, { "epoch": 2.5013763993393283, "grad_norm": 2.8634626865386963, "learning_rate": 9.980882654801278e-06, "loss": 0.2929, "num_input_tokens_seen": 29387520, "step": 13630 }, { "epoch": 2.5022939988988804, "grad_norm": 2.9182851314544678, "learning_rate": 9.980812634303708e-06, "loss": 0.4217, "num_input_tokens_seen": 29398496, "step": 13635 }, { "epoch": 2.503211598458433, "grad_norm": 3.4188175201416016, "learning_rate": 9.980742486056014e-06, "loss": 0.3366, "num_input_tokens_seen": 29408320, "step": 13640 }, { "epoch": 2.504129198017985, "grad_norm": 1.2405192852020264, "learning_rate": 9.980672210059994e-06, "loss": 0.2586, "num_input_tokens_seen": 29417760, "step": 13645 }, { "epoch": 2.505046797577537, "grad_norm": 2.873634099960327, "learning_rate": 9.980601806317454e-06, "loss": 0.2961, "num_input_tokens_seen": 29429696, "step": 13650 }, { "epoch": 2.5059643971370895, "grad_norm": 2.3668341636657715, "learning_rate": 9.980531274830194e-06, "loss": 0.3955, "num_input_tokens_seen": 29440320, "step": 13655 }, { "epoch": 2.5068819966966416, "grad_norm": 2.2721195220947266, "learning_rate": 9.980460615600027e-06, "loss": 0.3092, "num_input_tokens_seen": 29449888, "step": 13660 }, { "epoch": 2.5077995962561936, "grad_norm": 2.0199084281921387, "learning_rate": 9.980389828628765e-06, "loss": 0.3215, "num_input_tokens_seen": 29461600, "step": 13665 }, { "epoch": 2.508717195815746, "grad_norm": 1.0365700721740723, "learning_rate": 9.98031891391822e-06, "loss": 0.3589, "num_input_tokens_seen": 29470560, "step": 13670 }, { "epoch": 2.509634795375298, "grad_norm": 1.5030388832092285, "learning_rate": 9.980247871470217e-06, "loss": 0.4503, "num_input_tokens_seen": 29482976, "step": 13675 }, { "epoch": 2.5105523949348503, "grad_norm": 1.2439029216766357, "learning_rate": 9.980176701286572e-06, "loss": 0.2501, "num_input_tokens_seen": 29494848, "step": 13680 }, { "epoch": 2.511469994494403, "grad_norm": 5.004456043243408, "learning_rate": 9.980105403369116e-06, "loss": 0.3971, "num_input_tokens_seen": 29504000, "step": 13685 }, { "epoch": 2.512387594053955, "grad_norm": 3.635449171066284, "learning_rate": 9.980033977719671e-06, "loss": 0.345, "num_input_tokens_seen": 29516096, "step": 13690 }, { "epoch": 2.513305193613507, "grad_norm": 2.522446632385254, "learning_rate": 9.979962424340076e-06, "loss": 0.356, "num_input_tokens_seen": 29527360, "step": 13695 }, { "epoch": 2.5142227931730594, "grad_norm": 0.7474042773246765, "learning_rate": 9.979890743232161e-06, "loss": 0.2751, "num_input_tokens_seen": 29537760, "step": 13700 }, { "epoch": 2.5151403927326115, "grad_norm": 1.6800867319107056, "learning_rate": 9.979818934397768e-06, "loss": 0.3458, "num_input_tokens_seen": 29548320, "step": 13705 }, { "epoch": 2.5160579922921635, "grad_norm": 1.5997167825698853, "learning_rate": 9.979746997838738e-06, "loss": 0.3626, "num_input_tokens_seen": 29559744, "step": 13710 }, { "epoch": 2.516975591851716, "grad_norm": 2.1870315074920654, "learning_rate": 9.979674933556915e-06, "loss": 0.2661, "num_input_tokens_seen": 29570304, "step": 13715 }, { "epoch": 2.517893191411268, "grad_norm": 1.1032888889312744, "learning_rate": 9.979602741554147e-06, "loss": 0.2805, "num_input_tokens_seen": 29581216, "step": 13720 }, { "epoch": 2.51881079097082, "grad_norm": 1.298197865486145, "learning_rate": 9.979530421832286e-06, "loss": 0.3579, "num_input_tokens_seen": 29591712, "step": 13725 }, { "epoch": 2.5197283905303727, "grad_norm": 1.5977656841278076, "learning_rate": 9.979457974393188e-06, "loss": 0.3994, "num_input_tokens_seen": 29603072, "step": 13730 }, { "epoch": 2.5206459900899247, "grad_norm": 2.6912033557891846, "learning_rate": 9.97938539923871e-06, "loss": 0.3519, "num_input_tokens_seen": 29612704, "step": 13735 }, { "epoch": 2.5215635896494772, "grad_norm": 1.1698925495147705, "learning_rate": 9.979312696370715e-06, "loss": 0.2894, "num_input_tokens_seen": 29622496, "step": 13740 }, { "epoch": 2.5224811892090293, "grad_norm": 3.0624053478240967, "learning_rate": 9.979239865791065e-06, "loss": 0.3475, "num_input_tokens_seen": 29631776, "step": 13745 }, { "epoch": 2.5233987887685814, "grad_norm": 1.6814786195755005, "learning_rate": 9.979166907501631e-06, "loss": 0.3738, "num_input_tokens_seen": 29642592, "step": 13750 }, { "epoch": 2.524316388328134, "grad_norm": 1.5756361484527588, "learning_rate": 9.979093821504282e-06, "loss": 0.3478, "num_input_tokens_seen": 29653376, "step": 13755 }, { "epoch": 2.525233987887686, "grad_norm": 2.0008773803710938, "learning_rate": 9.979020607800894e-06, "loss": 0.3632, "num_input_tokens_seen": 29664576, "step": 13760 }, { "epoch": 2.526151587447238, "grad_norm": 1.7521644830703735, "learning_rate": 9.978947266393345e-06, "loss": 0.2996, "num_input_tokens_seen": 29675584, "step": 13765 }, { "epoch": 2.5270691870067905, "grad_norm": 1.9472801685333252, "learning_rate": 9.978873797283512e-06, "loss": 0.2917, "num_input_tokens_seen": 29684864, "step": 13770 }, { "epoch": 2.5279867865663426, "grad_norm": 1.679215431213379, "learning_rate": 9.978800200473285e-06, "loss": 0.3508, "num_input_tokens_seen": 29693600, "step": 13775 }, { "epoch": 2.5289043861258946, "grad_norm": 1.8327549695968628, "learning_rate": 9.978726475964548e-06, "loss": 0.2947, "num_input_tokens_seen": 29704736, "step": 13780 }, { "epoch": 2.529821985685447, "grad_norm": 1.5203545093536377, "learning_rate": 9.978652623759194e-06, "loss": 0.3263, "num_input_tokens_seen": 29715968, "step": 13785 }, { "epoch": 2.530739585244999, "grad_norm": 1.0180097818374634, "learning_rate": 9.978578643859118e-06, "loss": 0.2527, "num_input_tokens_seen": 29727424, "step": 13790 }, { "epoch": 2.5316571848045513, "grad_norm": 1.3740794658660889, "learning_rate": 9.978504536266215e-06, "loss": 0.274, "num_input_tokens_seen": 29739104, "step": 13795 }, { "epoch": 2.5325747843641038, "grad_norm": 2.195791721343994, "learning_rate": 9.978430300982387e-06, "loss": 0.2434, "num_input_tokens_seen": 29748928, "step": 13800 }, { "epoch": 2.533492383923656, "grad_norm": 2.4052867889404297, "learning_rate": 9.978355938009535e-06, "loss": 0.2321, "num_input_tokens_seen": 29760192, "step": 13805 }, { "epoch": 2.534409983483208, "grad_norm": 7.98270845413208, "learning_rate": 9.978281447349572e-06, "loss": 0.4128, "num_input_tokens_seen": 29771040, "step": 13810 }, { "epoch": 2.5353275830427604, "grad_norm": 1.254338026046753, "learning_rate": 9.978206829004403e-06, "loss": 0.3951, "num_input_tokens_seen": 29782304, "step": 13815 }, { "epoch": 2.5362451826023125, "grad_norm": 3.7668166160583496, "learning_rate": 9.978132082975947e-06, "loss": 0.4247, "num_input_tokens_seen": 29793888, "step": 13820 }, { "epoch": 2.5371627821618645, "grad_norm": 1.4279356002807617, "learning_rate": 9.978057209266116e-06, "loss": 0.3601, "num_input_tokens_seen": 29805792, "step": 13825 }, { "epoch": 2.538080381721417, "grad_norm": 1.621747374534607, "learning_rate": 9.977982207876834e-06, "loss": 0.3258, "num_input_tokens_seen": 29817120, "step": 13830 }, { "epoch": 2.538997981280969, "grad_norm": 2.1104907989501953, "learning_rate": 9.977907078810023e-06, "loss": 0.4499, "num_input_tokens_seen": 29828000, "step": 13835 }, { "epoch": 2.539915580840521, "grad_norm": 1.2050089836120605, "learning_rate": 9.977831822067611e-06, "loss": 0.3195, "num_input_tokens_seen": 29839648, "step": 13840 }, { "epoch": 2.5408331804000737, "grad_norm": 1.3055498600006104, "learning_rate": 9.977756437651528e-06, "loss": 0.3032, "num_input_tokens_seen": 29850880, "step": 13845 }, { "epoch": 2.5417507799596257, "grad_norm": 2.646406650543213, "learning_rate": 9.977680925563706e-06, "loss": 0.3423, "num_input_tokens_seen": 29861792, "step": 13850 }, { "epoch": 2.542668379519178, "grad_norm": 1.2145476341247559, "learning_rate": 9.977605285806085e-06, "loss": 0.3548, "num_input_tokens_seen": 29871232, "step": 13855 }, { "epoch": 2.5435859790787303, "grad_norm": 0.9245721697807312, "learning_rate": 9.977529518380603e-06, "loss": 0.2917, "num_input_tokens_seen": 29882464, "step": 13860 }, { "epoch": 2.5445035786382824, "grad_norm": 1.4691776037216187, "learning_rate": 9.9774536232892e-06, "loss": 0.3171, "num_input_tokens_seen": 29893440, "step": 13865 }, { "epoch": 2.5454211781978344, "grad_norm": 3.992262840270996, "learning_rate": 9.977377600533828e-06, "loss": 0.373, "num_input_tokens_seen": 29904352, "step": 13870 }, { "epoch": 2.546338777757387, "grad_norm": 3.2771413326263428, "learning_rate": 9.977301450116435e-06, "loss": 0.3291, "num_input_tokens_seen": 29915776, "step": 13875 }, { "epoch": 2.547256377316939, "grad_norm": 1.0518994331359863, "learning_rate": 9.977225172038973e-06, "loss": 0.2319, "num_input_tokens_seen": 29926912, "step": 13880 }, { "epoch": 2.548173976876491, "grad_norm": 1.12034010887146, "learning_rate": 9.977148766303402e-06, "loss": 0.3207, "num_input_tokens_seen": 29937536, "step": 13885 }, { "epoch": 2.5490915764360436, "grad_norm": 1.243994951248169, "learning_rate": 9.977072232911677e-06, "loss": 0.3506, "num_input_tokens_seen": 29948448, "step": 13890 }, { "epoch": 2.5500091759955956, "grad_norm": 1.3333183526992798, "learning_rate": 9.976995571865762e-06, "loss": 0.2547, "num_input_tokens_seen": 29959296, "step": 13895 }, { "epoch": 2.5509267755551477, "grad_norm": 2.3236799240112305, "learning_rate": 9.976918783167625e-06, "loss": 0.3236, "num_input_tokens_seen": 29970240, "step": 13900 }, { "epoch": 2.5518443751147, "grad_norm": 1.0720125436782837, "learning_rate": 9.976841866819235e-06, "loss": 0.3349, "num_input_tokens_seen": 29981824, "step": 13905 }, { "epoch": 2.5527619746742523, "grad_norm": 0.8308094143867493, "learning_rate": 9.976764822822566e-06, "loss": 0.3768, "num_input_tokens_seen": 29991744, "step": 13910 }, { "epoch": 2.5536795742338043, "grad_norm": 1.4879891872406006, "learning_rate": 9.97668765117959e-06, "loss": 0.2899, "num_input_tokens_seen": 30002176, "step": 13915 }, { "epoch": 2.554597173793357, "grad_norm": 1.9862220287322998, "learning_rate": 9.97661035189229e-06, "loss": 0.2488, "num_input_tokens_seen": 30013824, "step": 13920 }, { "epoch": 2.555514773352909, "grad_norm": 1.5075215101242065, "learning_rate": 9.976532924962648e-06, "loss": 0.36, "num_input_tokens_seen": 30023776, "step": 13925 }, { "epoch": 2.556432372912461, "grad_norm": 6.366465091705322, "learning_rate": 9.97645537039265e-06, "loss": 0.3675, "num_input_tokens_seen": 30035616, "step": 13930 }, { "epoch": 2.5573499724720135, "grad_norm": 1.7399753332138062, "learning_rate": 9.976377688184282e-06, "loss": 0.2911, "num_input_tokens_seen": 30045952, "step": 13935 }, { "epoch": 2.5582675720315655, "grad_norm": 5.435624599456787, "learning_rate": 9.976299878339541e-06, "loss": 0.3975, "num_input_tokens_seen": 30055584, "step": 13940 }, { "epoch": 2.5591851715911176, "grad_norm": 0.6395095586776733, "learning_rate": 9.97622194086042e-06, "loss": 0.276, "num_input_tokens_seen": 30067072, "step": 13945 }, { "epoch": 2.56010277115067, "grad_norm": 1.1501842737197876, "learning_rate": 9.97614387574892e-06, "loss": 0.3097, "num_input_tokens_seen": 30077536, "step": 13950 }, { "epoch": 2.561020370710222, "grad_norm": 1.3997780084609985, "learning_rate": 9.97606568300704e-06, "loss": 0.3238, "num_input_tokens_seen": 30089248, "step": 13955 }, { "epoch": 2.561937970269774, "grad_norm": 1.788145661354065, "learning_rate": 9.975987362636789e-06, "loss": 0.3182, "num_input_tokens_seen": 30099424, "step": 13960 }, { "epoch": 2.5628555698293267, "grad_norm": 0.9740017056465149, "learning_rate": 9.975908914640174e-06, "loss": 0.386, "num_input_tokens_seen": 30108064, "step": 13965 }, { "epoch": 2.563773169388879, "grad_norm": 1.0299327373504639, "learning_rate": 9.975830339019205e-06, "loss": 0.3213, "num_input_tokens_seen": 30118592, "step": 13970 }, { "epoch": 2.564690768948431, "grad_norm": 1.0250234603881836, "learning_rate": 9.975751635775903e-06, "loss": 0.3249, "num_input_tokens_seen": 30128544, "step": 13975 }, { "epoch": 2.5656083685079834, "grad_norm": 1.461645483970642, "learning_rate": 9.97567280491228e-06, "loss": 0.2951, "num_input_tokens_seen": 30138848, "step": 13980 }, { "epoch": 2.5665259680675354, "grad_norm": 8.852825164794922, "learning_rate": 9.975593846430364e-06, "loss": 0.5784, "num_input_tokens_seen": 30149568, "step": 13985 }, { "epoch": 2.5674435676270875, "grad_norm": 2.7067768573760986, "learning_rate": 9.975514760332176e-06, "loss": 0.3833, "num_input_tokens_seen": 30160992, "step": 13990 }, { "epoch": 2.56836116718664, "grad_norm": 6.747600555419922, "learning_rate": 9.975435546619745e-06, "loss": 0.4193, "num_input_tokens_seen": 30171296, "step": 13995 }, { "epoch": 2.569278766746192, "grad_norm": 0.4879162311553955, "learning_rate": 9.975356205295105e-06, "loss": 0.2636, "num_input_tokens_seen": 30182528, "step": 14000 }, { "epoch": 2.570196366305744, "grad_norm": 7.469110488891602, "learning_rate": 9.975276736360288e-06, "loss": 0.3419, "num_input_tokens_seen": 30193984, "step": 14005 }, { "epoch": 2.5711139658652966, "grad_norm": 0.7387610077857971, "learning_rate": 9.975197139817336e-06, "loss": 0.4367, "num_input_tokens_seen": 30205152, "step": 14010 }, { "epoch": 2.5720315654248487, "grad_norm": 1.156805396080017, "learning_rate": 9.975117415668285e-06, "loss": 0.3363, "num_input_tokens_seen": 30215136, "step": 14015 }, { "epoch": 2.5729491649844007, "grad_norm": 0.43609580397605896, "learning_rate": 9.975037563915186e-06, "loss": 0.3651, "num_input_tokens_seen": 30226176, "step": 14020 }, { "epoch": 2.5738667645439532, "grad_norm": 0.6136215925216675, "learning_rate": 9.974957584560082e-06, "loss": 0.3361, "num_input_tokens_seen": 30236960, "step": 14025 }, { "epoch": 2.5747843641035053, "grad_norm": 1.3094422817230225, "learning_rate": 9.974877477605027e-06, "loss": 0.3674, "num_input_tokens_seen": 30248448, "step": 14030 }, { "epoch": 2.5757019636630574, "grad_norm": 1.02117919921875, "learning_rate": 9.974797243052077e-06, "loss": 0.3608, "num_input_tokens_seen": 30259808, "step": 14035 }, { "epoch": 2.57661956322261, "grad_norm": 3.045663356781006, "learning_rate": 9.974716880903286e-06, "loss": 0.3234, "num_input_tokens_seen": 30271264, "step": 14040 }, { "epoch": 2.577537162782162, "grad_norm": 1.1499985456466675, "learning_rate": 9.974636391160717e-06, "loss": 0.4102, "num_input_tokens_seen": 30282112, "step": 14045 }, { "epoch": 2.578454762341714, "grad_norm": 2.5824294090270996, "learning_rate": 9.974555773826434e-06, "loss": 0.3694, "num_input_tokens_seen": 30293024, "step": 14050 }, { "epoch": 2.5793723619012665, "grad_norm": 2.4086272716522217, "learning_rate": 9.974475028902506e-06, "loss": 0.3432, "num_input_tokens_seen": 30303040, "step": 14055 }, { "epoch": 2.5802899614608186, "grad_norm": 2.2085819244384766, "learning_rate": 9.974394156391004e-06, "loss": 0.2797, "num_input_tokens_seen": 30312864, "step": 14060 }, { "epoch": 2.5812075610203706, "grad_norm": 1.0152192115783691, "learning_rate": 9.974313156294e-06, "loss": 0.319, "num_input_tokens_seen": 30323200, "step": 14065 }, { "epoch": 2.582125160579923, "grad_norm": 0.7551283836364746, "learning_rate": 9.974232028613575e-06, "loss": 0.3729, "num_input_tokens_seen": 30334400, "step": 14070 }, { "epoch": 2.583042760139475, "grad_norm": 3.385342836380005, "learning_rate": 9.974150773351808e-06, "loss": 0.3861, "num_input_tokens_seen": 30347392, "step": 14075 }, { "epoch": 2.5839603596990273, "grad_norm": 1.7893062829971313, "learning_rate": 9.974069390510781e-06, "loss": 0.3723, "num_input_tokens_seen": 30358816, "step": 14080 }, { "epoch": 2.5848779592585798, "grad_norm": 0.8259076476097107, "learning_rate": 9.973987880092583e-06, "loss": 0.2495, "num_input_tokens_seen": 30368416, "step": 14085 }, { "epoch": 2.585795558818132, "grad_norm": 0.582406222820282, "learning_rate": 9.973906242099305e-06, "loss": 0.3445, "num_input_tokens_seen": 30378912, "step": 14090 }, { "epoch": 2.586713158377684, "grad_norm": 1.032840371131897, "learning_rate": 9.973824476533043e-06, "loss": 0.2921, "num_input_tokens_seen": 30388928, "step": 14095 }, { "epoch": 2.5876307579372364, "grad_norm": 1.7081036567687988, "learning_rate": 9.97374258339589e-06, "loss": 0.3375, "num_input_tokens_seen": 30399008, "step": 14100 }, { "epoch": 2.5885483574967885, "grad_norm": 2.8105154037475586, "learning_rate": 9.973660562689948e-06, "loss": 0.366, "num_input_tokens_seen": 30410144, "step": 14105 }, { "epoch": 2.5894659570563405, "grad_norm": 3.8302102088928223, "learning_rate": 9.973578414417322e-06, "loss": 0.3726, "num_input_tokens_seen": 30420096, "step": 14110 }, { "epoch": 2.590383556615893, "grad_norm": 0.5226858258247375, "learning_rate": 9.973496138580119e-06, "loss": 0.2649, "num_input_tokens_seen": 30431968, "step": 14115 }, { "epoch": 2.591301156175445, "grad_norm": 1.0439622402191162, "learning_rate": 9.973413735180446e-06, "loss": 0.248, "num_input_tokens_seen": 30443360, "step": 14120 }, { "epoch": 2.592218755734997, "grad_norm": 0.7519725561141968, "learning_rate": 9.97333120422042e-06, "loss": 0.3159, "num_input_tokens_seen": 30453920, "step": 14125 }, { "epoch": 2.5931363552945497, "grad_norm": 0.8860782980918884, "learning_rate": 9.973248545702156e-06, "loss": 0.3017, "num_input_tokens_seen": 30464768, "step": 14130 }, { "epoch": 2.5940539548541017, "grad_norm": 1.3257505893707275, "learning_rate": 9.973165759627777e-06, "loss": 0.3827, "num_input_tokens_seen": 30476416, "step": 14135 }, { "epoch": 2.594971554413654, "grad_norm": 1.5480027198791504, "learning_rate": 9.973082845999401e-06, "loss": 0.3364, "num_input_tokens_seen": 30487328, "step": 14140 }, { "epoch": 2.5958891539732063, "grad_norm": 1.0006239414215088, "learning_rate": 9.97299980481916e-06, "loss": 0.2809, "num_input_tokens_seen": 30497248, "step": 14145 }, { "epoch": 2.5968067535327584, "grad_norm": 3.497741222381592, "learning_rate": 9.972916636089178e-06, "loss": 0.4891, "num_input_tokens_seen": 30508672, "step": 14150 }, { "epoch": 2.5977243530923104, "grad_norm": 1.3265570402145386, "learning_rate": 9.972833339811594e-06, "loss": 0.3571, "num_input_tokens_seen": 30519008, "step": 14155 }, { "epoch": 2.598641952651863, "grad_norm": 0.988057553768158, "learning_rate": 9.972749915988542e-06, "loss": 0.3265, "num_input_tokens_seen": 30530048, "step": 14160 }, { "epoch": 2.599559552211415, "grad_norm": 2.8117802143096924, "learning_rate": 9.972666364622162e-06, "loss": 0.3039, "num_input_tokens_seen": 30540704, "step": 14165 }, { "epoch": 2.600477151770967, "grad_norm": 0.7777758836746216, "learning_rate": 9.972582685714597e-06, "loss": 0.3435, "num_input_tokens_seen": 30551424, "step": 14170 }, { "epoch": 2.6013947513305196, "grad_norm": 0.7810243368148804, "learning_rate": 9.972498879267992e-06, "loss": 0.2973, "num_input_tokens_seen": 30562720, "step": 14175 }, { "epoch": 2.6023123508900716, "grad_norm": 1.7705552577972412, "learning_rate": 9.972414945284496e-06, "loss": 0.3184, "num_input_tokens_seen": 30574208, "step": 14180 }, { "epoch": 2.6032299504496237, "grad_norm": 1.3169715404510498, "learning_rate": 9.972330883766266e-06, "loss": 0.3942, "num_input_tokens_seen": 30584544, "step": 14185 }, { "epoch": 2.604147550009176, "grad_norm": 0.9564467072486877, "learning_rate": 9.972246694715452e-06, "loss": 0.26, "num_input_tokens_seen": 30595872, "step": 14190 }, { "epoch": 2.6050651495687283, "grad_norm": 1.5452708005905151, "learning_rate": 9.97216237813422e-06, "loss": 0.3564, "num_input_tokens_seen": 30606880, "step": 14195 }, { "epoch": 2.6059827491282803, "grad_norm": 0.8190767765045166, "learning_rate": 9.972077934024728e-06, "loss": 0.254, "num_input_tokens_seen": 30616704, "step": 14200 }, { "epoch": 2.606900348687833, "grad_norm": 1.0621429681777954, "learning_rate": 9.971993362389143e-06, "loss": 0.3148, "num_input_tokens_seen": 30626560, "step": 14205 }, { "epoch": 2.607817948247385, "grad_norm": 0.8952221870422363, "learning_rate": 9.971908663229632e-06, "loss": 0.2864, "num_input_tokens_seen": 30636864, "step": 14210 }, { "epoch": 2.608735547806937, "grad_norm": 0.9416490793228149, "learning_rate": 9.971823836548373e-06, "loss": 0.3159, "num_input_tokens_seen": 30647168, "step": 14215 }, { "epoch": 2.6096531473664895, "grad_norm": 1.7544721364974976, "learning_rate": 9.971738882347535e-06, "loss": 0.2753, "num_input_tokens_seen": 30656896, "step": 14220 }, { "epoch": 2.6105707469260415, "grad_norm": 1.110072135925293, "learning_rate": 9.971653800629302e-06, "loss": 0.2635, "num_input_tokens_seen": 30668960, "step": 14225 }, { "epoch": 2.6114883464855936, "grad_norm": 5.64354944229126, "learning_rate": 9.971568591395855e-06, "loss": 0.2979, "num_input_tokens_seen": 30679264, "step": 14230 }, { "epoch": 2.612405946045146, "grad_norm": 4.781474590301514, "learning_rate": 9.971483254649378e-06, "loss": 0.4967, "num_input_tokens_seen": 30689664, "step": 14235 }, { "epoch": 2.613323545604698, "grad_norm": 2.0861692428588867, "learning_rate": 9.971397790392062e-06, "loss": 0.3258, "num_input_tokens_seen": 30699008, "step": 14240 }, { "epoch": 2.61424114516425, "grad_norm": 0.7728284597396851, "learning_rate": 9.971312198626096e-06, "loss": 0.2695, "num_input_tokens_seen": 30709664, "step": 14245 }, { "epoch": 2.6151587447238027, "grad_norm": 1.1037321090698242, "learning_rate": 9.971226479353675e-06, "loss": 0.375, "num_input_tokens_seen": 30721760, "step": 14250 }, { "epoch": 2.616076344283355, "grad_norm": 0.9330450296401978, "learning_rate": 9.971140632577003e-06, "loss": 0.2787, "num_input_tokens_seen": 30733376, "step": 14255 }, { "epoch": 2.616993943842907, "grad_norm": 1.245542287826538, "learning_rate": 9.971054658298276e-06, "loss": 0.3179, "num_input_tokens_seen": 30743808, "step": 14260 }, { "epoch": 2.6179115434024594, "grad_norm": 1.401934027671814, "learning_rate": 9.970968556519702e-06, "loss": 0.3405, "num_input_tokens_seen": 30753280, "step": 14265 }, { "epoch": 2.6188291429620114, "grad_norm": 1.4793049097061157, "learning_rate": 9.97088232724349e-06, "loss": 0.2901, "num_input_tokens_seen": 30764672, "step": 14270 }, { "epoch": 2.6197467425215635, "grad_norm": 0.9693373441696167, "learning_rate": 9.97079597047185e-06, "loss": 0.2254, "num_input_tokens_seen": 30775776, "step": 14275 }, { "epoch": 2.620664342081116, "grad_norm": 2.451509952545166, "learning_rate": 9.970709486206997e-06, "loss": 0.4049, "num_input_tokens_seen": 30786144, "step": 14280 }, { "epoch": 2.621581941640668, "grad_norm": 3.771409511566162, "learning_rate": 9.97062287445115e-06, "loss": 0.3772, "num_input_tokens_seen": 30797280, "step": 14285 }, { "epoch": 2.62249954120022, "grad_norm": 2.0579001903533936, "learning_rate": 9.97053613520653e-06, "loss": 0.3319, "num_input_tokens_seen": 30807904, "step": 14290 }, { "epoch": 2.6234171407597726, "grad_norm": 2.0563809871673584, "learning_rate": 9.970449268475362e-06, "loss": 0.3225, "num_input_tokens_seen": 30819424, "step": 14295 }, { "epoch": 2.6243347403193247, "grad_norm": 0.8224369883537292, "learning_rate": 9.970362274259873e-06, "loss": 0.3744, "num_input_tokens_seen": 30830592, "step": 14300 }, { "epoch": 2.6252523398788767, "grad_norm": 1.7698006629943848, "learning_rate": 9.970275152562296e-06, "loss": 0.3751, "num_input_tokens_seen": 30840960, "step": 14305 }, { "epoch": 2.6261699394384292, "grad_norm": 2.6306231021881104, "learning_rate": 9.970187903384863e-06, "loss": 0.4103, "num_input_tokens_seen": 30852704, "step": 14310 }, { "epoch": 2.6270875389979813, "grad_norm": 1.1404366493225098, "learning_rate": 9.970100526729815e-06, "loss": 0.2802, "num_input_tokens_seen": 30862944, "step": 14315 }, { "epoch": 2.6280051385575334, "grad_norm": 1.29569673538208, "learning_rate": 9.97001302259939e-06, "loss": 0.4403, "num_input_tokens_seen": 30873888, "step": 14320 }, { "epoch": 2.628922738117086, "grad_norm": 0.9526706337928772, "learning_rate": 9.969925390995835e-06, "loss": 0.3244, "num_input_tokens_seen": 30884896, "step": 14325 }, { "epoch": 2.629840337676638, "grad_norm": 3.918955087661743, "learning_rate": 9.969837631921395e-06, "loss": 0.3253, "num_input_tokens_seen": 30895936, "step": 14330 }, { "epoch": 2.63075793723619, "grad_norm": 3.008070945739746, "learning_rate": 9.969749745378324e-06, "loss": 0.3645, "num_input_tokens_seen": 30908352, "step": 14335 }, { "epoch": 2.6316755367957425, "grad_norm": 1.6050379276275635, "learning_rate": 9.969661731368874e-06, "loss": 0.3115, "num_input_tokens_seen": 30918368, "step": 14340 }, { "epoch": 2.6325931363552946, "grad_norm": 1.9429696798324585, "learning_rate": 9.969573589895303e-06, "loss": 0.241, "num_input_tokens_seen": 30929184, "step": 14345 }, { "epoch": 2.6335107359148466, "grad_norm": 1.2565639019012451, "learning_rate": 9.969485320959871e-06, "loss": 0.1847, "num_input_tokens_seen": 30939104, "step": 14350 }, { "epoch": 2.634428335474399, "grad_norm": 2.82987117767334, "learning_rate": 9.969396924564843e-06, "loss": 0.3387, "num_input_tokens_seen": 30951232, "step": 14355 }, { "epoch": 2.635345935033951, "grad_norm": 2.668379306793213, "learning_rate": 9.969308400712485e-06, "loss": 0.3035, "num_input_tokens_seen": 30962080, "step": 14360 }, { "epoch": 2.6362635345935033, "grad_norm": 2.817422389984131, "learning_rate": 9.969219749405068e-06, "loss": 0.3459, "num_input_tokens_seen": 30973088, "step": 14365 }, { "epoch": 2.6371811341530558, "grad_norm": 8.009231567382812, "learning_rate": 9.969130970644868e-06, "loss": 0.5004, "num_input_tokens_seen": 30984896, "step": 14370 }, { "epoch": 2.638098733712608, "grad_norm": 7.920609951019287, "learning_rate": 9.969042064434158e-06, "loss": 0.5743, "num_input_tokens_seen": 30996608, "step": 14375 }, { "epoch": 2.63901633327216, "grad_norm": 3.451605796813965, "learning_rate": 9.968953030775221e-06, "loss": 0.3515, "num_input_tokens_seen": 31007968, "step": 14380 }, { "epoch": 2.6399339328317124, "grad_norm": 2.222414255142212, "learning_rate": 9.96886386967034e-06, "loss": 0.2595, "num_input_tokens_seen": 31018688, "step": 14385 }, { "epoch": 2.6408515323912645, "grad_norm": 3.3740010261535645, "learning_rate": 9.968774581121801e-06, "loss": 0.325, "num_input_tokens_seen": 31030400, "step": 14390 }, { "epoch": 2.6417691319508165, "grad_norm": 1.370141625404358, "learning_rate": 9.968685165131896e-06, "loss": 0.2719, "num_input_tokens_seen": 31041472, "step": 14395 }, { "epoch": 2.642686731510369, "grad_norm": 3.1585676670074463, "learning_rate": 9.968595621702916e-06, "loss": 0.2459, "num_input_tokens_seen": 31053088, "step": 14400 }, { "epoch": 2.643604331069921, "grad_norm": 1.6257156133651733, "learning_rate": 9.968505950837162e-06, "loss": 0.3563, "num_input_tokens_seen": 31065440, "step": 14405 }, { "epoch": 2.644521930629473, "grad_norm": 2.606358766555786, "learning_rate": 9.968416152536929e-06, "loss": 0.4788, "num_input_tokens_seen": 31076256, "step": 14410 }, { "epoch": 2.6454395301890257, "grad_norm": 1.4971954822540283, "learning_rate": 9.96832622680452e-06, "loss": 0.3421, "num_input_tokens_seen": 31086912, "step": 14415 }, { "epoch": 2.6463571297485777, "grad_norm": 3.4510445594787598, "learning_rate": 9.968236173642244e-06, "loss": 0.4015, "num_input_tokens_seen": 31098368, "step": 14420 }, { "epoch": 2.64727472930813, "grad_norm": 1.8542437553405762, "learning_rate": 9.968145993052413e-06, "loss": 0.3374, "num_input_tokens_seen": 31109344, "step": 14425 }, { "epoch": 2.6481923288676823, "grad_norm": 1.8491343259811401, "learning_rate": 9.968055685037336e-06, "loss": 0.3119, "num_input_tokens_seen": 31120320, "step": 14430 }, { "epoch": 2.6491099284272344, "grad_norm": 1.8221904039382935, "learning_rate": 9.96796524959933e-06, "loss": 0.3227, "num_input_tokens_seen": 31131392, "step": 14435 }, { "epoch": 2.6500275279867864, "grad_norm": 1.5518996715545654, "learning_rate": 9.967874686740716e-06, "loss": 0.3256, "num_input_tokens_seen": 31142464, "step": 14440 }, { "epoch": 2.650945127546339, "grad_norm": 1.1263923645019531, "learning_rate": 9.967783996463815e-06, "loss": 0.3066, "num_input_tokens_seen": 31152672, "step": 14445 }, { "epoch": 2.651862727105891, "grad_norm": 1.3961708545684814, "learning_rate": 9.967693178770952e-06, "loss": 0.2646, "num_input_tokens_seen": 31163136, "step": 14450 }, { "epoch": 2.652780326665443, "grad_norm": 2.843052625656128, "learning_rate": 9.967602233664462e-06, "loss": 0.2592, "num_input_tokens_seen": 31174432, "step": 14455 }, { "epoch": 2.6536979262249956, "grad_norm": 1.88677179813385, "learning_rate": 9.96751116114667e-06, "loss": 0.3761, "num_input_tokens_seen": 31185088, "step": 14460 }, { "epoch": 2.6546155257845476, "grad_norm": 1.036758303642273, "learning_rate": 9.967419961219918e-06, "loss": 0.2411, "num_input_tokens_seen": 31195424, "step": 14465 }, { "epoch": 2.6555331253440997, "grad_norm": 1.7774945497512817, "learning_rate": 9.967328633886542e-06, "loss": 0.231, "num_input_tokens_seen": 31205920, "step": 14470 }, { "epoch": 2.656450724903652, "grad_norm": 3.0543510913848877, "learning_rate": 9.967237179148886e-06, "loss": 0.3784, "num_input_tokens_seen": 31216160, "step": 14475 }, { "epoch": 2.6573683244632043, "grad_norm": 1.680625319480896, "learning_rate": 9.967145597009295e-06, "loss": 0.4673, "num_input_tokens_seen": 31226976, "step": 14480 }, { "epoch": 2.6582859240227563, "grad_norm": 1.685711145401001, "learning_rate": 9.967053887470117e-06, "loss": 0.4276, "num_input_tokens_seen": 31238272, "step": 14485 }, { "epoch": 2.659203523582309, "grad_norm": 1.7948790788650513, "learning_rate": 9.966962050533705e-06, "loss": 0.224, "num_input_tokens_seen": 31250720, "step": 14490 }, { "epoch": 2.660121123141861, "grad_norm": 1.891700267791748, "learning_rate": 9.966870086202413e-06, "loss": 0.3413, "num_input_tokens_seen": 31260992, "step": 14495 }, { "epoch": 2.661038722701413, "grad_norm": 0.8767420053482056, "learning_rate": 9.966777994478605e-06, "loss": 0.244, "num_input_tokens_seen": 31272992, "step": 14500 }, { "epoch": 2.6619563222609655, "grad_norm": 0.8161317110061646, "learning_rate": 9.966685775364637e-06, "loss": 0.2728, "num_input_tokens_seen": 31283552, "step": 14505 }, { "epoch": 2.6628739218205175, "grad_norm": 1.112235426902771, "learning_rate": 9.966593428862876e-06, "loss": 0.3341, "num_input_tokens_seen": 31293152, "step": 14510 }, { "epoch": 2.6637915213800696, "grad_norm": 1.4113813638687134, "learning_rate": 9.966500954975692e-06, "loss": 0.3132, "num_input_tokens_seen": 31304000, "step": 14515 }, { "epoch": 2.664709120939622, "grad_norm": 2.1349031925201416, "learning_rate": 9.966408353705455e-06, "loss": 0.319, "num_input_tokens_seen": 31315104, "step": 14520 }, { "epoch": 2.665626720499174, "grad_norm": 7.387120723724365, "learning_rate": 9.966315625054542e-06, "loss": 0.3884, "num_input_tokens_seen": 31327360, "step": 14525 }, { "epoch": 2.666544320058726, "grad_norm": 2.802239418029785, "learning_rate": 9.966222769025329e-06, "loss": 0.4508, "num_input_tokens_seen": 31338240, "step": 14530 }, { "epoch": 2.6674619196182787, "grad_norm": 6.95620059967041, "learning_rate": 9.966129785620201e-06, "loss": 0.3841, "num_input_tokens_seen": 31348416, "step": 14535 }, { "epoch": 2.668379519177831, "grad_norm": 1.2775399684906006, "learning_rate": 9.96603667484154e-06, "loss": 0.3498, "num_input_tokens_seen": 31358656, "step": 14540 }, { "epoch": 2.669297118737383, "grad_norm": 1.067320466041565, "learning_rate": 9.965943436691734e-06, "loss": 0.2499, "num_input_tokens_seen": 31370592, "step": 14545 }, { "epoch": 2.6702147182969354, "grad_norm": 1.6090303659439087, "learning_rate": 9.965850071173177e-06, "loss": 0.3722, "num_input_tokens_seen": 31380896, "step": 14550 }, { "epoch": 2.6711323178564874, "grad_norm": 1.7259695529937744, "learning_rate": 9.96575657828826e-06, "loss": 0.3021, "num_input_tokens_seen": 31389824, "step": 14555 }, { "epoch": 2.6720499174160395, "grad_norm": 1.733698844909668, "learning_rate": 9.965662958039384e-06, "loss": 0.3086, "num_input_tokens_seen": 31401152, "step": 14560 }, { "epoch": 2.672967516975592, "grad_norm": 1.9830100536346436, "learning_rate": 9.96556921042895e-06, "loss": 0.3705, "num_input_tokens_seen": 31410368, "step": 14565 }, { "epoch": 2.673885116535144, "grad_norm": 2.2582449913024902, "learning_rate": 9.96547533545936e-06, "loss": 0.3324, "num_input_tokens_seen": 31420864, "step": 14570 }, { "epoch": 2.674802716094696, "grad_norm": 3.140789270401001, "learning_rate": 9.965381333133024e-06, "loss": 0.4043, "num_input_tokens_seen": 31431552, "step": 14575 }, { "epoch": 2.6757203156542486, "grad_norm": 1.175919771194458, "learning_rate": 9.965287203452354e-06, "loss": 0.3234, "num_input_tokens_seen": 31443456, "step": 14580 }, { "epoch": 2.6766379152138007, "grad_norm": 1.0778851509094238, "learning_rate": 9.965192946419762e-06, "loss": 0.269, "num_input_tokens_seen": 31452832, "step": 14585 }, { "epoch": 2.6775555147733527, "grad_norm": 1.4716318845748901, "learning_rate": 9.965098562037665e-06, "loss": 0.2919, "num_input_tokens_seen": 31464096, "step": 14590 }, { "epoch": 2.6784731143329052, "grad_norm": 3.910677433013916, "learning_rate": 9.965004050308485e-06, "loss": 0.3507, "num_input_tokens_seen": 31475296, "step": 14595 }, { "epoch": 2.6793907138924573, "grad_norm": 1.1876798868179321, "learning_rate": 9.964909411234646e-06, "loss": 0.2863, "num_input_tokens_seen": 31487264, "step": 14600 }, { "epoch": 2.6803083134520094, "grad_norm": 3.186438798904419, "learning_rate": 9.964814644818578e-06, "loss": 0.3577, "num_input_tokens_seen": 31498752, "step": 14605 }, { "epoch": 2.681225913011562, "grad_norm": 1.0295957326889038, "learning_rate": 9.964719751062708e-06, "loss": 0.3266, "num_input_tokens_seen": 31509856, "step": 14610 }, { "epoch": 2.682143512571114, "grad_norm": 1.1034184694290161, "learning_rate": 9.96462472996947e-06, "loss": 0.3302, "num_input_tokens_seen": 31520256, "step": 14615 }, { "epoch": 2.683061112130666, "grad_norm": 1.855978012084961, "learning_rate": 9.964529581541304e-06, "loss": 0.3769, "num_input_tokens_seen": 31531744, "step": 14620 }, { "epoch": 2.6839787116902185, "grad_norm": 1.706161618232727, "learning_rate": 9.964434305780646e-06, "loss": 0.3821, "num_input_tokens_seen": 31543712, "step": 14625 }, { "epoch": 2.6848963112497706, "grad_norm": 0.6411318778991699, "learning_rate": 9.964338902689945e-06, "loss": 0.2615, "num_input_tokens_seen": 31554304, "step": 14630 }, { "epoch": 2.6858139108093226, "grad_norm": 4.189149379730225, "learning_rate": 9.964243372271642e-06, "loss": 0.3194, "num_input_tokens_seen": 31564608, "step": 14635 }, { "epoch": 2.686731510368875, "grad_norm": 0.9200499653816223, "learning_rate": 9.964147714528194e-06, "loss": 0.2869, "num_input_tokens_seen": 31574208, "step": 14640 }, { "epoch": 2.687649109928427, "grad_norm": 3.60683536529541, "learning_rate": 9.96405192946205e-06, "loss": 0.3671, "num_input_tokens_seen": 31585376, "step": 14645 }, { "epoch": 2.6885667094879793, "grad_norm": 1.0338290929794312, "learning_rate": 9.963956017075664e-06, "loss": 0.3403, "num_input_tokens_seen": 31596416, "step": 14650 }, { "epoch": 2.6894843090475318, "grad_norm": 3.2567384243011475, "learning_rate": 9.963859977371503e-06, "loss": 0.2962, "num_input_tokens_seen": 31606112, "step": 14655 }, { "epoch": 2.690401908607084, "grad_norm": 1.0690529346466064, "learning_rate": 9.963763810352026e-06, "loss": 0.3, "num_input_tokens_seen": 31616096, "step": 14660 }, { "epoch": 2.691319508166636, "grad_norm": 1.5568193197250366, "learning_rate": 9.9636675160197e-06, "loss": 0.3187, "num_input_tokens_seen": 31625824, "step": 14665 }, { "epoch": 2.6922371077261884, "grad_norm": 5.849722385406494, "learning_rate": 9.963571094376995e-06, "loss": 0.4385, "num_input_tokens_seen": 31635456, "step": 14670 }, { "epoch": 2.6931547072857405, "grad_norm": 3.516512632369995, "learning_rate": 9.963474545426386e-06, "loss": 0.3391, "num_input_tokens_seen": 31646240, "step": 14675 }, { "epoch": 2.6940723068452925, "grad_norm": 0.6802106499671936, "learning_rate": 9.963377869170347e-06, "loss": 0.2905, "num_input_tokens_seen": 31656000, "step": 14680 }, { "epoch": 2.694989906404845, "grad_norm": 0.9467127919197083, "learning_rate": 9.963281065611358e-06, "loss": 0.3881, "num_input_tokens_seen": 31668000, "step": 14685 }, { "epoch": 2.695907505964397, "grad_norm": 0.9674330353736877, "learning_rate": 9.963184134751903e-06, "loss": 0.3045, "num_input_tokens_seen": 31678112, "step": 14690 }, { "epoch": 2.696825105523949, "grad_norm": 0.6430094838142395, "learning_rate": 9.963087076594464e-06, "loss": 0.3263, "num_input_tokens_seen": 31688128, "step": 14695 }, { "epoch": 2.6977427050835017, "grad_norm": 1.1604640483856201, "learning_rate": 9.962989891141535e-06, "loss": 0.3203, "num_input_tokens_seen": 31699136, "step": 14700 }, { "epoch": 2.6986603046430537, "grad_norm": 1.268641710281372, "learning_rate": 9.962892578395608e-06, "loss": 0.3067, "num_input_tokens_seen": 31709792, "step": 14705 }, { "epoch": 2.699577904202606, "grad_norm": 1.6960362195968628, "learning_rate": 9.962795138359178e-06, "loss": 0.3107, "num_input_tokens_seen": 31720032, "step": 14710 }, { "epoch": 2.7004955037621583, "grad_norm": 2.73905086517334, "learning_rate": 9.962697571034745e-06, "loss": 0.2893, "num_input_tokens_seen": 31729536, "step": 14715 }, { "epoch": 2.7014131033217104, "grad_norm": 1.7938135862350464, "learning_rate": 9.96259987642481e-06, "loss": 0.3357, "num_input_tokens_seen": 31741408, "step": 14720 }, { "epoch": 2.7023307028812624, "grad_norm": 1.4595611095428467, "learning_rate": 9.96250205453188e-06, "loss": 0.3498, "num_input_tokens_seen": 31752384, "step": 14725 }, { "epoch": 2.703248302440815, "grad_norm": 4.983068466186523, "learning_rate": 9.962404105358463e-06, "loss": 0.3262, "num_input_tokens_seen": 31763488, "step": 14730 }, { "epoch": 2.704165902000367, "grad_norm": 1.1813230514526367, "learning_rate": 9.962306028907072e-06, "loss": 0.3678, "num_input_tokens_seen": 31774048, "step": 14735 }, { "epoch": 2.705083501559919, "grad_norm": 1.1388158798217773, "learning_rate": 9.96220782518022e-06, "loss": 0.2381, "num_input_tokens_seen": 31784928, "step": 14740 }, { "epoch": 2.7060011011194716, "grad_norm": 0.9477680921554565, "learning_rate": 9.962109494180431e-06, "loss": 0.315, "num_input_tokens_seen": 31796640, "step": 14745 }, { "epoch": 2.7069187006790236, "grad_norm": 1.568809986114502, "learning_rate": 9.962011035910223e-06, "loss": 0.3345, "num_input_tokens_seen": 31808256, "step": 14750 }, { "epoch": 2.7078363002385757, "grad_norm": 0.9745279550552368, "learning_rate": 9.961912450372122e-06, "loss": 0.3643, "num_input_tokens_seen": 31819136, "step": 14755 }, { "epoch": 2.708753899798128, "grad_norm": 1.347019076347351, "learning_rate": 9.961813737568658e-06, "loss": 0.3115, "num_input_tokens_seen": 31830112, "step": 14760 }, { "epoch": 2.7096714993576803, "grad_norm": 1.3711289167404175, "learning_rate": 9.961714897502362e-06, "loss": 0.3504, "num_input_tokens_seen": 31841056, "step": 14765 }, { "epoch": 2.7105890989172323, "grad_norm": 1.103410243988037, "learning_rate": 9.961615930175767e-06, "loss": 0.2994, "num_input_tokens_seen": 31852800, "step": 14770 }, { "epoch": 2.711506698476785, "grad_norm": 1.2516493797302246, "learning_rate": 9.961516835591414e-06, "loss": 0.2778, "num_input_tokens_seen": 31863520, "step": 14775 }, { "epoch": 2.712424298036337, "grad_norm": 1.4523588418960571, "learning_rate": 9.961417613751845e-06, "loss": 0.3821, "num_input_tokens_seen": 31874848, "step": 14780 }, { "epoch": 2.713341897595889, "grad_norm": 6.295846939086914, "learning_rate": 9.961318264659601e-06, "loss": 0.5712, "num_input_tokens_seen": 31885056, "step": 14785 }, { "epoch": 2.7142594971554415, "grad_norm": 5.380529403686523, "learning_rate": 9.961218788317235e-06, "loss": 0.4654, "num_input_tokens_seen": 31896128, "step": 14790 }, { "epoch": 2.7151770967149935, "grad_norm": 1.1761506795883179, "learning_rate": 9.961119184727297e-06, "loss": 0.2845, "num_input_tokens_seen": 31906816, "step": 14795 }, { "epoch": 2.7160946962745456, "grad_norm": 1.1124740839004517, "learning_rate": 9.96101945389234e-06, "loss": 0.3091, "num_input_tokens_seen": 31918688, "step": 14800 }, { "epoch": 2.717012295834098, "grad_norm": 1.1319912672042847, "learning_rate": 9.960919595814922e-06, "loss": 0.2985, "num_input_tokens_seen": 31929056, "step": 14805 }, { "epoch": 2.71792989539365, "grad_norm": 0.9773393869400024, "learning_rate": 9.960819610497606e-06, "loss": 0.3541, "num_input_tokens_seen": 31939808, "step": 14810 }, { "epoch": 2.718847494953202, "grad_norm": 0.9516828060150146, "learning_rate": 9.960719497942954e-06, "loss": 0.34, "num_input_tokens_seen": 31949664, "step": 14815 }, { "epoch": 2.7197650945127547, "grad_norm": 2.994804859161377, "learning_rate": 9.960619258153536e-06, "loss": 0.3755, "num_input_tokens_seen": 31960640, "step": 14820 }, { "epoch": 2.720682694072307, "grad_norm": 1.9246643781661987, "learning_rate": 9.960518891131923e-06, "loss": 0.267, "num_input_tokens_seen": 31971904, "step": 14825 }, { "epoch": 2.721600293631859, "grad_norm": 2.8967154026031494, "learning_rate": 9.960418396880689e-06, "loss": 0.2129, "num_input_tokens_seen": 31982688, "step": 14830 }, { "epoch": 2.7225178931914114, "grad_norm": 8.43899154663086, "learning_rate": 9.96031777540241e-06, "loss": 0.3227, "num_input_tokens_seen": 31992608, "step": 14835 }, { "epoch": 2.7234354927509634, "grad_norm": 1.463445782661438, "learning_rate": 9.96021702669967e-06, "loss": 0.3652, "num_input_tokens_seen": 32003584, "step": 14840 }, { "epoch": 2.7243530923105155, "grad_norm": 0.9360761046409607, "learning_rate": 9.960116150775048e-06, "loss": 0.3094, "num_input_tokens_seen": 32015616, "step": 14845 }, { "epoch": 2.725270691870068, "grad_norm": 1.2505066394805908, "learning_rate": 9.960015147631136e-06, "loss": 0.3616, "num_input_tokens_seen": 32027488, "step": 14850 }, { "epoch": 2.72618829142962, "grad_norm": 6.845249176025391, "learning_rate": 9.959914017270522e-06, "loss": 0.4851, "num_input_tokens_seen": 32038208, "step": 14855 }, { "epoch": 2.727105890989172, "grad_norm": 2.0878021717071533, "learning_rate": 9.9598127596958e-06, "loss": 0.2573, "num_input_tokens_seen": 32049952, "step": 14860 }, { "epoch": 2.7280234905487246, "grad_norm": 4.031430721282959, "learning_rate": 9.959711374909568e-06, "loss": 0.3254, "num_input_tokens_seen": 32060480, "step": 14865 }, { "epoch": 2.7289410901082767, "grad_norm": 3.2703123092651367, "learning_rate": 9.959609862914427e-06, "loss": 0.2948, "num_input_tokens_seen": 32070656, "step": 14870 }, { "epoch": 2.7298586896678287, "grad_norm": 2.3580307960510254, "learning_rate": 9.95950822371298e-06, "loss": 0.2294, "num_input_tokens_seen": 32081984, "step": 14875 }, { "epoch": 2.7307762892273812, "grad_norm": 2.1665518283843994, "learning_rate": 9.959406457307833e-06, "loss": 0.2538, "num_input_tokens_seen": 32092032, "step": 14880 }, { "epoch": 2.7316938887869333, "grad_norm": 1.6534864902496338, "learning_rate": 9.959304563701598e-06, "loss": 0.1989, "num_input_tokens_seen": 32101664, "step": 14885 }, { "epoch": 2.7326114883464854, "grad_norm": 2.9164156913757324, "learning_rate": 9.959202542896885e-06, "loss": 0.4547, "num_input_tokens_seen": 32112576, "step": 14890 }, { "epoch": 2.733529087906038, "grad_norm": 3.0197291374206543, "learning_rate": 9.959100394896314e-06, "loss": 0.28, "num_input_tokens_seen": 32123872, "step": 14895 }, { "epoch": 2.73444668746559, "grad_norm": 3.047245502471924, "learning_rate": 9.958998119702503e-06, "loss": 0.3297, "num_input_tokens_seen": 32135264, "step": 14900 }, { "epoch": 2.735364287025142, "grad_norm": 1.7907501459121704, "learning_rate": 9.958895717318076e-06, "loss": 0.272, "num_input_tokens_seen": 32146656, "step": 14905 }, { "epoch": 2.7362818865846945, "grad_norm": 2.1713814735412598, "learning_rate": 9.958793187745662e-06, "loss": 0.3156, "num_input_tokens_seen": 32157024, "step": 14910 }, { "epoch": 2.7371994861442466, "grad_norm": 3.3235836029052734, "learning_rate": 9.958690530987885e-06, "loss": 0.3079, "num_input_tokens_seen": 32167808, "step": 14915 }, { "epoch": 2.7381170857037986, "grad_norm": 2.816267490386963, "learning_rate": 9.958587747047382e-06, "loss": 0.3906, "num_input_tokens_seen": 32178304, "step": 14920 }, { "epoch": 2.739034685263351, "grad_norm": 2.8825137615203857, "learning_rate": 9.95848483592679e-06, "loss": 0.3308, "num_input_tokens_seen": 32189600, "step": 14925 }, { "epoch": 2.739952284822903, "grad_norm": 1.5193358659744263, "learning_rate": 9.958381797628745e-06, "loss": 0.3695, "num_input_tokens_seen": 32199968, "step": 14930 }, { "epoch": 2.7408698843824553, "grad_norm": 1.131682276725769, "learning_rate": 9.958278632155892e-06, "loss": 0.3852, "num_input_tokens_seen": 32210848, "step": 14935 }, { "epoch": 2.7417874839420078, "grad_norm": 2.2248549461364746, "learning_rate": 9.958175339510875e-06, "loss": 0.3836, "num_input_tokens_seen": 32221120, "step": 14940 }, { "epoch": 2.74270508350156, "grad_norm": 1.9774929285049438, "learning_rate": 9.958071919696349e-06, "loss": 0.2351, "num_input_tokens_seen": 32232000, "step": 14945 }, { "epoch": 2.743622683061112, "grad_norm": 1.575102686882019, "learning_rate": 9.95796837271496e-06, "loss": 0.3895, "num_input_tokens_seen": 32242944, "step": 14950 }, { "epoch": 2.7445402826206644, "grad_norm": 2.460181951522827, "learning_rate": 9.957864698569368e-06, "loss": 0.3538, "num_input_tokens_seen": 32254464, "step": 14955 }, { "epoch": 2.7454578821802165, "grad_norm": 1.2726929187774658, "learning_rate": 9.95776089726223e-06, "loss": 0.3299, "num_input_tokens_seen": 32265408, "step": 14960 }, { "epoch": 2.7463754817397685, "grad_norm": 4.4075703620910645, "learning_rate": 9.957656968796208e-06, "loss": 0.3401, "num_input_tokens_seen": 32277184, "step": 14965 }, { "epoch": 2.747293081299321, "grad_norm": 1.197556495666504, "learning_rate": 9.957552913173969e-06, "loss": 0.309, "num_input_tokens_seen": 32287232, "step": 14970 }, { "epoch": 2.748210680858873, "grad_norm": 1.3561683893203735, "learning_rate": 9.957448730398181e-06, "loss": 0.3151, "num_input_tokens_seen": 32297216, "step": 14975 }, { "epoch": 2.749128280418425, "grad_norm": 1.8190836906433105, "learning_rate": 9.957344420471515e-06, "loss": 0.3464, "num_input_tokens_seen": 32308800, "step": 14980 }, { "epoch": 2.7500458799779777, "grad_norm": 1.050626277923584, "learning_rate": 9.95723998339665e-06, "loss": 0.3192, "num_input_tokens_seen": 32318816, "step": 14985 }, { "epoch": 2.7509634795375297, "grad_norm": 1.2781680822372437, "learning_rate": 9.957135419176262e-06, "loss": 0.2961, "num_input_tokens_seen": 32330432, "step": 14990 }, { "epoch": 2.751881079097082, "grad_norm": 1.2520604133605957, "learning_rate": 9.957030727813033e-06, "loss": 0.2238, "num_input_tokens_seen": 32340352, "step": 14995 }, { "epoch": 2.7527986786566343, "grad_norm": 2.3022513389587402, "learning_rate": 9.956925909309647e-06, "loss": 0.2731, "num_input_tokens_seen": 32351040, "step": 15000 }, { "epoch": 2.7537162782161864, "grad_norm": 1.5977777242660522, "learning_rate": 9.956820963668797e-06, "loss": 0.3846, "num_input_tokens_seen": 32362592, "step": 15005 }, { "epoch": 2.7546338777757384, "grad_norm": 2.1611218452453613, "learning_rate": 9.956715890893169e-06, "loss": 0.3888, "num_input_tokens_seen": 32374208, "step": 15010 }, { "epoch": 2.755551477335291, "grad_norm": 0.9633023142814636, "learning_rate": 9.956610690985463e-06, "loss": 0.3724, "num_input_tokens_seen": 32384960, "step": 15015 }, { "epoch": 2.756469076894843, "grad_norm": 1.7878755331039429, "learning_rate": 9.956505363948372e-06, "loss": 0.2563, "num_input_tokens_seen": 32395328, "step": 15020 }, { "epoch": 2.757386676454395, "grad_norm": 1.039223313331604, "learning_rate": 9.956399909784603e-06, "loss": 0.2208, "num_input_tokens_seen": 32405760, "step": 15025 }, { "epoch": 2.7583042760139476, "grad_norm": 3.1202616691589355, "learning_rate": 9.956294328496856e-06, "loss": 0.3255, "num_input_tokens_seen": 32416512, "step": 15030 }, { "epoch": 2.7592218755734996, "grad_norm": 4.286348342895508, "learning_rate": 9.956188620087844e-06, "loss": 0.4285, "num_input_tokens_seen": 32427744, "step": 15035 }, { "epoch": 2.7601394751330517, "grad_norm": 1.2261580228805542, "learning_rate": 9.956082784560273e-06, "loss": 0.3138, "num_input_tokens_seen": 32439360, "step": 15040 }, { "epoch": 2.761057074692604, "grad_norm": 3.9899089336395264, "learning_rate": 9.95597682191686e-06, "loss": 0.2636, "num_input_tokens_seen": 32450688, "step": 15045 }, { "epoch": 2.7619746742521563, "grad_norm": 1.6336828470230103, "learning_rate": 9.955870732160321e-06, "loss": 0.3621, "num_input_tokens_seen": 32462688, "step": 15050 }, { "epoch": 2.7628922738117083, "grad_norm": 1.271242380142212, "learning_rate": 9.955764515293381e-06, "loss": 0.313, "num_input_tokens_seen": 32473152, "step": 15055 }, { "epoch": 2.763809873371261, "grad_norm": 3.659722089767456, "learning_rate": 9.955658171318762e-06, "loss": 0.3532, "num_input_tokens_seen": 32483008, "step": 15060 }, { "epoch": 2.764727472930813, "grad_norm": 1.717255711555481, "learning_rate": 9.955551700239189e-06, "loss": 0.2886, "num_input_tokens_seen": 32493984, "step": 15065 }, { "epoch": 2.765645072490365, "grad_norm": 2.2157413959503174, "learning_rate": 9.955445102057398e-06, "loss": 0.3322, "num_input_tokens_seen": 32504800, "step": 15070 }, { "epoch": 2.7665626720499175, "grad_norm": 1.6083112955093384, "learning_rate": 9.95533837677612e-06, "loss": 0.3989, "num_input_tokens_seen": 32515552, "step": 15075 }, { "epoch": 2.7674802716094695, "grad_norm": 2.481818675994873, "learning_rate": 9.955231524398093e-06, "loss": 0.3908, "num_input_tokens_seen": 32527040, "step": 15080 }, { "epoch": 2.7683978711690216, "grad_norm": 2.8742218017578125, "learning_rate": 9.955124544926056e-06, "loss": 0.39, "num_input_tokens_seen": 32538240, "step": 15085 }, { "epoch": 2.769315470728574, "grad_norm": 1.557064414024353, "learning_rate": 9.955017438362752e-06, "loss": 0.2987, "num_input_tokens_seen": 32548288, "step": 15090 }, { "epoch": 2.770233070288126, "grad_norm": 1.7051557302474976, "learning_rate": 9.954910204710935e-06, "loss": 0.408, "num_input_tokens_seen": 32560032, "step": 15095 }, { "epoch": 2.771150669847678, "grad_norm": 1.3571066856384277, "learning_rate": 9.954802843973348e-06, "loss": 0.3161, "num_input_tokens_seen": 32570272, "step": 15100 }, { "epoch": 2.7720682694072307, "grad_norm": 0.9375829696655273, "learning_rate": 9.954695356152747e-06, "loss": 0.3673, "num_input_tokens_seen": 32581632, "step": 15105 }, { "epoch": 2.772985868966783, "grad_norm": 1.281847596168518, "learning_rate": 9.95458774125189e-06, "loss": 0.3047, "num_input_tokens_seen": 32592704, "step": 15110 }, { "epoch": 2.773903468526335, "grad_norm": 1.004926085472107, "learning_rate": 9.954479999273537e-06, "loss": 0.309, "num_input_tokens_seen": 32603584, "step": 15115 }, { "epoch": 2.7748210680858874, "grad_norm": 1.1435503959655762, "learning_rate": 9.95437213022045e-06, "loss": 0.299, "num_input_tokens_seen": 32615328, "step": 15120 }, { "epoch": 2.7757386676454394, "grad_norm": 1.5192946195602417, "learning_rate": 9.954264134095397e-06, "loss": 0.359, "num_input_tokens_seen": 32626624, "step": 15125 }, { "epoch": 2.7766562672049915, "grad_norm": 1.0378624200820923, "learning_rate": 9.954156010901146e-06, "loss": 0.3082, "num_input_tokens_seen": 32636832, "step": 15130 }, { "epoch": 2.777573866764544, "grad_norm": 1.478980541229248, "learning_rate": 9.954047760640472e-06, "loss": 0.3619, "num_input_tokens_seen": 32647968, "step": 15135 }, { "epoch": 2.778491466324096, "grad_norm": 1.0372501611709595, "learning_rate": 9.953939383316154e-06, "loss": 0.3019, "num_input_tokens_seen": 32658432, "step": 15140 }, { "epoch": 2.779409065883648, "grad_norm": 1.2318799495697021, "learning_rate": 9.953830878930966e-06, "loss": 0.2906, "num_input_tokens_seen": 32669248, "step": 15145 }, { "epoch": 2.7803266654432006, "grad_norm": 0.9904127717018127, "learning_rate": 9.953722247487694e-06, "loss": 0.3212, "num_input_tokens_seen": 32680192, "step": 15150 }, { "epoch": 2.7812442650027527, "grad_norm": 1.2328691482543945, "learning_rate": 9.953613488989123e-06, "loss": 0.3019, "num_input_tokens_seen": 32690240, "step": 15155 }, { "epoch": 2.7821618645623047, "grad_norm": 0.9962790012359619, "learning_rate": 9.953504603438045e-06, "loss": 0.311, "num_input_tokens_seen": 32701920, "step": 15160 }, { "epoch": 2.7830794641218572, "grad_norm": 2.2868800163269043, "learning_rate": 9.95339559083725e-06, "loss": 0.2855, "num_input_tokens_seen": 32712672, "step": 15165 }, { "epoch": 2.7839970636814093, "grad_norm": 1.6661866903305054, "learning_rate": 9.953286451189535e-06, "loss": 0.2849, "num_input_tokens_seen": 32721664, "step": 15170 }, { "epoch": 2.7849146632409614, "grad_norm": 3.4388880729675293, "learning_rate": 9.9531771844977e-06, "loss": 0.2409, "num_input_tokens_seen": 32731616, "step": 15175 }, { "epoch": 2.785832262800514, "grad_norm": 1.3751753568649292, "learning_rate": 9.953067790764548e-06, "loss": 0.3645, "num_input_tokens_seen": 32740768, "step": 15180 }, { "epoch": 2.786749862360066, "grad_norm": 1.588455319404602, "learning_rate": 9.952958269992883e-06, "loss": 0.3927, "num_input_tokens_seen": 32751168, "step": 15185 }, { "epoch": 2.787667461919618, "grad_norm": 3.4664673805236816, "learning_rate": 9.952848622185514e-06, "loss": 0.3996, "num_input_tokens_seen": 32762112, "step": 15190 }, { "epoch": 2.7885850614791705, "grad_norm": 4.462398052215576, "learning_rate": 9.952738847345254e-06, "loss": 0.4509, "num_input_tokens_seen": 32772256, "step": 15195 }, { "epoch": 2.7895026610387226, "grad_norm": 1.9300438165664673, "learning_rate": 9.95262894547492e-06, "loss": 0.3266, "num_input_tokens_seen": 32782624, "step": 15200 }, { "epoch": 2.7904202605982746, "grad_norm": 1.9459408521652222, "learning_rate": 9.952518916577328e-06, "loss": 0.333, "num_input_tokens_seen": 32792416, "step": 15205 }, { "epoch": 2.791337860157827, "grad_norm": 1.2507119178771973, "learning_rate": 9.952408760655302e-06, "loss": 0.3182, "num_input_tokens_seen": 32803680, "step": 15210 }, { "epoch": 2.792255459717379, "grad_norm": 1.248759150505066, "learning_rate": 9.952298477711667e-06, "loss": 0.3388, "num_input_tokens_seen": 32813632, "step": 15215 }, { "epoch": 2.7931730592769317, "grad_norm": 1.7620627880096436, "learning_rate": 9.95218806774925e-06, "loss": 0.2497, "num_input_tokens_seen": 32824864, "step": 15220 }, { "epoch": 2.7940906588364838, "grad_norm": 1.2044709920883179, "learning_rate": 9.952077530770887e-06, "loss": 0.3162, "num_input_tokens_seen": 32835680, "step": 15225 }, { "epoch": 2.795008258396036, "grad_norm": 1.0395437479019165, "learning_rate": 9.951966866779409e-06, "loss": 0.2754, "num_input_tokens_seen": 32846528, "step": 15230 }, { "epoch": 2.7959258579555883, "grad_norm": 2.0498199462890625, "learning_rate": 9.951856075777655e-06, "loss": 0.3098, "num_input_tokens_seen": 32856832, "step": 15235 }, { "epoch": 2.7968434575151404, "grad_norm": 2.6823508739471436, "learning_rate": 9.951745157768468e-06, "loss": 0.2798, "num_input_tokens_seen": 32867072, "step": 15240 }, { "epoch": 2.7977610570746925, "grad_norm": 4.672540187835693, "learning_rate": 9.951634112754693e-06, "loss": 0.3547, "num_input_tokens_seen": 32877472, "step": 15245 }, { "epoch": 2.798678656634245, "grad_norm": 3.852008581161499, "learning_rate": 9.951522940739177e-06, "loss": 0.312, "num_input_tokens_seen": 32888480, "step": 15250 }, { "epoch": 2.799596256193797, "grad_norm": 2.620398998260498, "learning_rate": 9.95141164172477e-06, "loss": 0.3609, "num_input_tokens_seen": 32898784, "step": 15255 }, { "epoch": 2.800513855753349, "grad_norm": 2.252610921859741, "learning_rate": 9.95130021571433e-06, "loss": 0.3413, "num_input_tokens_seen": 32909248, "step": 15260 }, { "epoch": 2.8014314553129016, "grad_norm": 3.105875253677368, "learning_rate": 9.951188662710713e-06, "loss": 0.2953, "num_input_tokens_seen": 32919776, "step": 15265 }, { "epoch": 2.8023490548724537, "grad_norm": 3.7273967266082764, "learning_rate": 9.951076982716781e-06, "loss": 0.327, "num_input_tokens_seen": 32930688, "step": 15270 }, { "epoch": 2.8032666544320057, "grad_norm": 2.1506714820861816, "learning_rate": 9.950965175735397e-06, "loss": 0.2493, "num_input_tokens_seen": 32942912, "step": 15275 }, { "epoch": 2.8041842539915582, "grad_norm": 0.9504969120025635, "learning_rate": 9.95085324176943e-06, "loss": 0.2363, "num_input_tokens_seen": 32952896, "step": 15280 }, { "epoch": 2.8051018535511103, "grad_norm": 2.359729051589966, "learning_rate": 9.950741180821751e-06, "loss": 0.3117, "num_input_tokens_seen": 32964384, "step": 15285 }, { "epoch": 2.8060194531106624, "grad_norm": 4.544327259063721, "learning_rate": 9.950628992895232e-06, "loss": 0.3677, "num_input_tokens_seen": 32975168, "step": 15290 }, { "epoch": 2.806937052670215, "grad_norm": 4.519323825836182, "learning_rate": 9.950516677992755e-06, "loss": 0.2896, "num_input_tokens_seen": 32986656, "step": 15295 }, { "epoch": 2.807854652229767, "grad_norm": 1.9178988933563232, "learning_rate": 9.950404236117195e-06, "loss": 0.3836, "num_input_tokens_seen": 32998368, "step": 15300 }, { "epoch": 2.808772251789319, "grad_norm": 2.2843191623687744, "learning_rate": 9.950291667271438e-06, "loss": 0.268, "num_input_tokens_seen": 33007136, "step": 15305 }, { "epoch": 2.8096898513488715, "grad_norm": 4.387221336364746, "learning_rate": 9.950178971458375e-06, "loss": 0.3592, "num_input_tokens_seen": 33018432, "step": 15310 }, { "epoch": 2.8106074509084236, "grad_norm": 2.057666778564453, "learning_rate": 9.950066148680893e-06, "loss": 0.3089, "num_input_tokens_seen": 33029280, "step": 15315 }, { "epoch": 2.8115250504679756, "grad_norm": 3.5645995140075684, "learning_rate": 9.949953198941884e-06, "loss": 0.2434, "num_input_tokens_seen": 33038656, "step": 15320 }, { "epoch": 2.812442650027528, "grad_norm": 8.903146743774414, "learning_rate": 9.94984012224425e-06, "loss": 0.2953, "num_input_tokens_seen": 33048704, "step": 15325 }, { "epoch": 2.81336024958708, "grad_norm": 1.8311487436294556, "learning_rate": 9.949726918590885e-06, "loss": 0.3191, "num_input_tokens_seen": 33058432, "step": 15330 }, { "epoch": 2.8142778491466323, "grad_norm": 1.726782202720642, "learning_rate": 9.9496135879847e-06, "loss": 0.3441, "num_input_tokens_seen": 33068960, "step": 15335 }, { "epoch": 2.8151954487061848, "grad_norm": 11.18346118927002, "learning_rate": 9.949500130428593e-06, "loss": 0.2375, "num_input_tokens_seen": 33079552, "step": 15340 }, { "epoch": 2.816113048265737, "grad_norm": 2.993037462234497, "learning_rate": 9.949386545925482e-06, "loss": 0.2604, "num_input_tokens_seen": 33089696, "step": 15345 }, { "epoch": 2.817030647825289, "grad_norm": 1.6549158096313477, "learning_rate": 9.949272834478276e-06, "loss": 0.2206, "num_input_tokens_seen": 33101440, "step": 15350 }, { "epoch": 2.8179482473848414, "grad_norm": 7.956735610961914, "learning_rate": 9.949158996089893e-06, "loss": 0.3037, "num_input_tokens_seen": 33113280, "step": 15355 }, { "epoch": 2.8188658469443935, "grad_norm": 2.641575574874878, "learning_rate": 9.949045030763251e-06, "loss": 0.2584, "num_input_tokens_seen": 33124416, "step": 15360 }, { "epoch": 2.8197834465039455, "grad_norm": 11.433732986450195, "learning_rate": 9.948930938501275e-06, "loss": 0.3847, "num_input_tokens_seen": 33135936, "step": 15365 }, { "epoch": 2.820701046063498, "grad_norm": 2.810472011566162, "learning_rate": 9.948816719306892e-06, "loss": 0.2419, "num_input_tokens_seen": 33146976, "step": 15370 }, { "epoch": 2.82161864562305, "grad_norm": 4.227038383483887, "learning_rate": 9.948702373183027e-06, "loss": 0.3676, "num_input_tokens_seen": 33158080, "step": 15375 }, { "epoch": 2.822536245182602, "grad_norm": 2.845858097076416, "learning_rate": 9.948587900132619e-06, "loss": 0.3125, "num_input_tokens_seen": 33169184, "step": 15380 }, { "epoch": 2.8234538447421547, "grad_norm": 3.1629159450531006, "learning_rate": 9.9484733001586e-06, "loss": 0.3524, "num_input_tokens_seen": 33179616, "step": 15385 }, { "epoch": 2.8243714443017067, "grad_norm": 1.4252605438232422, "learning_rate": 9.948358573263909e-06, "loss": 0.2996, "num_input_tokens_seen": 33189440, "step": 15390 }, { "epoch": 2.825289043861259, "grad_norm": 6.111982345581055, "learning_rate": 9.948243719451491e-06, "loss": 0.2201, "num_input_tokens_seen": 33200672, "step": 15395 }, { "epoch": 2.8262066434208113, "grad_norm": 11.969623565673828, "learning_rate": 9.948128738724291e-06, "loss": 0.4189, "num_input_tokens_seen": 33212704, "step": 15400 }, { "epoch": 2.8271242429803634, "grad_norm": 8.812172889709473, "learning_rate": 9.948013631085258e-06, "loss": 0.3331, "num_input_tokens_seen": 33223296, "step": 15405 }, { "epoch": 2.8280418425399154, "grad_norm": 4.961365222930908, "learning_rate": 9.947898396537344e-06, "loss": 0.3292, "num_input_tokens_seen": 33234976, "step": 15410 }, { "epoch": 2.828959442099468, "grad_norm": 1.6336849927902222, "learning_rate": 9.947783035083503e-06, "loss": 0.3377, "num_input_tokens_seen": 33245792, "step": 15415 }, { "epoch": 2.82987704165902, "grad_norm": 1.9223581552505493, "learning_rate": 9.947667546726697e-06, "loss": 0.336, "num_input_tokens_seen": 33256800, "step": 15420 }, { "epoch": 2.830794641218572, "grad_norm": 4.527938365936279, "learning_rate": 9.947551931469886e-06, "loss": 0.3518, "num_input_tokens_seen": 33267936, "step": 15425 }, { "epoch": 2.8317122407781246, "grad_norm": 1.5087945461273193, "learning_rate": 9.947436189316037e-06, "loss": 0.3406, "num_input_tokens_seen": 33278624, "step": 15430 }, { "epoch": 2.8326298403376766, "grad_norm": 1.954032063484192, "learning_rate": 9.947320320268116e-06, "loss": 0.29, "num_input_tokens_seen": 33289504, "step": 15435 }, { "epoch": 2.833547439897229, "grad_norm": 4.473601341247559, "learning_rate": 9.947204324329098e-06, "loss": 0.3395, "num_input_tokens_seen": 33301184, "step": 15440 }, { "epoch": 2.834465039456781, "grad_norm": 4.423393249511719, "learning_rate": 9.947088201501956e-06, "loss": 0.3371, "num_input_tokens_seen": 33310656, "step": 15445 }, { "epoch": 2.8353826390163333, "grad_norm": 2.5835084915161133, "learning_rate": 9.946971951789668e-06, "loss": 0.2553, "num_input_tokens_seen": 33321792, "step": 15450 }, { "epoch": 2.8363002385758858, "grad_norm": 2.6682209968566895, "learning_rate": 9.946855575195217e-06, "loss": 0.3828, "num_input_tokens_seen": 33332736, "step": 15455 }, { "epoch": 2.837217838135438, "grad_norm": 3.5254452228546143, "learning_rate": 9.946739071721587e-06, "loss": 0.2983, "num_input_tokens_seen": 33344192, "step": 15460 }, { "epoch": 2.83813543769499, "grad_norm": 8.365373611450195, "learning_rate": 9.946622441371768e-06, "loss": 0.335, "num_input_tokens_seen": 33354912, "step": 15465 }, { "epoch": 2.8390530372545424, "grad_norm": 3.6668357849121094, "learning_rate": 9.94650568414875e-06, "loss": 0.3792, "num_input_tokens_seen": 33363744, "step": 15470 }, { "epoch": 2.8399706368140945, "grad_norm": 3.945399284362793, "learning_rate": 9.946388800055527e-06, "loss": 0.3205, "num_input_tokens_seen": 33374720, "step": 15475 }, { "epoch": 2.8408882363736465, "grad_norm": 2.5507445335388184, "learning_rate": 9.946271789095096e-06, "loss": 0.3623, "num_input_tokens_seen": 33386048, "step": 15480 }, { "epoch": 2.841805835933199, "grad_norm": 2.377093553543091, "learning_rate": 9.94615465127046e-06, "loss": 0.273, "num_input_tokens_seen": 33397056, "step": 15485 }, { "epoch": 2.842723435492751, "grad_norm": 2.1261978149414062, "learning_rate": 9.946037386584626e-06, "loss": 0.294, "num_input_tokens_seen": 33407776, "step": 15490 }, { "epoch": 2.843641035052303, "grad_norm": 10.64345932006836, "learning_rate": 9.945919995040595e-06, "loss": 0.3862, "num_input_tokens_seen": 33419520, "step": 15495 }, { "epoch": 2.8445586346118557, "grad_norm": 4.6037774085998535, "learning_rate": 9.945802476641383e-06, "loss": 0.2463, "num_input_tokens_seen": 33430176, "step": 15500 }, { "epoch": 2.8454762341714077, "grad_norm": 3.5327112674713135, "learning_rate": 9.945684831390004e-06, "loss": 0.2324, "num_input_tokens_seen": 33441184, "step": 15505 }, { "epoch": 2.8463938337309598, "grad_norm": 6.429937839508057, "learning_rate": 9.945567059289474e-06, "loss": 0.2935, "num_input_tokens_seen": 33451712, "step": 15510 }, { "epoch": 2.8473114332905123, "grad_norm": 1.876579761505127, "learning_rate": 9.945449160342812e-06, "loss": 0.3962, "num_input_tokens_seen": 33461376, "step": 15515 }, { "epoch": 2.8482290328500643, "grad_norm": 61.597434997558594, "learning_rate": 9.945331134553045e-06, "loss": 0.2813, "num_input_tokens_seen": 33471456, "step": 15520 }, { "epoch": 2.8491466324096164, "grad_norm": 2.443000316619873, "learning_rate": 9.945212981923199e-06, "loss": 0.1946, "num_input_tokens_seen": 33482048, "step": 15525 }, { "epoch": 2.850064231969169, "grad_norm": 5.693744659423828, "learning_rate": 9.945094702456305e-06, "loss": 0.2548, "num_input_tokens_seen": 33492288, "step": 15530 }, { "epoch": 2.850981831528721, "grad_norm": 2.9358136653900146, "learning_rate": 9.944976296155395e-06, "loss": 0.3475, "num_input_tokens_seen": 33502240, "step": 15535 }, { "epoch": 2.851899431088273, "grad_norm": 7.590970516204834, "learning_rate": 9.944857763023507e-06, "loss": 0.2076, "num_input_tokens_seen": 33513632, "step": 15540 }, { "epoch": 2.8528170306478255, "grad_norm": 7.44780158996582, "learning_rate": 9.94473910306368e-06, "loss": 0.4282, "num_input_tokens_seen": 33524224, "step": 15545 }, { "epoch": 2.8537346302073776, "grad_norm": 4.932660102844238, "learning_rate": 9.944620316278961e-06, "loss": 0.4231, "num_input_tokens_seen": 33534752, "step": 15550 }, { "epoch": 2.8546522297669297, "grad_norm": 7.7676591873168945, "learning_rate": 9.944501402672394e-06, "loss": 0.4724, "num_input_tokens_seen": 33546304, "step": 15555 }, { "epoch": 2.855569829326482, "grad_norm": 7.702747821807861, "learning_rate": 9.94438236224703e-06, "loss": 0.4302, "num_input_tokens_seen": 33557760, "step": 15560 }, { "epoch": 2.8564874288860342, "grad_norm": 12.960053443908691, "learning_rate": 9.944263195005918e-06, "loss": 0.3045, "num_input_tokens_seen": 33569248, "step": 15565 }, { "epoch": 2.8574050284455863, "grad_norm": 16.99052619934082, "learning_rate": 9.944143900952122e-06, "loss": 0.4976, "num_input_tokens_seen": 33580000, "step": 15570 }, { "epoch": 2.858322628005139, "grad_norm": 8.998351097106934, "learning_rate": 9.944024480088697e-06, "loss": 0.2683, "num_input_tokens_seen": 33589952, "step": 15575 }, { "epoch": 2.859240227564691, "grad_norm": 7.809005260467529, "learning_rate": 9.943904932418704e-06, "loss": 0.3167, "num_input_tokens_seen": 33601376, "step": 15580 }, { "epoch": 2.860157827124243, "grad_norm": 4.218306064605713, "learning_rate": 9.943785257945214e-06, "loss": 0.3967, "num_input_tokens_seen": 33613152, "step": 15585 }, { "epoch": 2.8610754266837954, "grad_norm": 7.310495853424072, "learning_rate": 9.943665456671295e-06, "loss": 0.3472, "num_input_tokens_seen": 33624128, "step": 15590 }, { "epoch": 2.8619930262433475, "grad_norm": 8.1301908493042, "learning_rate": 9.943545528600017e-06, "loss": 0.2748, "num_input_tokens_seen": 33635904, "step": 15595 }, { "epoch": 2.8629106258028996, "grad_norm": 3.2530128955841064, "learning_rate": 9.943425473734459e-06, "loss": 0.2914, "num_input_tokens_seen": 33646528, "step": 15600 }, { "epoch": 2.863828225362452, "grad_norm": 6.358757019042969, "learning_rate": 9.943305292077698e-06, "loss": 0.2622, "num_input_tokens_seen": 33658272, "step": 15605 }, { "epoch": 2.864745824922004, "grad_norm": 6.664064407348633, "learning_rate": 9.943184983632819e-06, "loss": 0.2499, "num_input_tokens_seen": 33669120, "step": 15610 }, { "epoch": 2.865663424481556, "grad_norm": 13.120838165283203, "learning_rate": 9.943064548402906e-06, "loss": 0.6229, "num_input_tokens_seen": 33679392, "step": 15615 }, { "epoch": 2.8665810240411087, "grad_norm": 43.382652282714844, "learning_rate": 9.94294398639105e-06, "loss": 0.4764, "num_input_tokens_seen": 33689888, "step": 15620 }, { "epoch": 2.8674986236006608, "grad_norm": 1.9255363941192627, "learning_rate": 9.942823297600339e-06, "loss": 0.3265, "num_input_tokens_seen": 33701312, "step": 15625 }, { "epoch": 2.868416223160213, "grad_norm": 3.8615996837615967, "learning_rate": 9.942702482033873e-06, "loss": 0.2959, "num_input_tokens_seen": 33711776, "step": 15630 }, { "epoch": 2.8693338227197653, "grad_norm": 2.6387927532196045, "learning_rate": 9.942581539694747e-06, "loss": 0.2235, "num_input_tokens_seen": 33722944, "step": 15635 }, { "epoch": 2.8702514222793174, "grad_norm": 6.099435329437256, "learning_rate": 9.942460470586066e-06, "loss": 0.299, "num_input_tokens_seen": 33733952, "step": 15640 }, { "epoch": 2.8711690218388695, "grad_norm": 11.301929473876953, "learning_rate": 9.942339274710933e-06, "loss": 0.3227, "num_input_tokens_seen": 33743904, "step": 15645 }, { "epoch": 2.872086621398422, "grad_norm": 29.151317596435547, "learning_rate": 9.942217952072459e-06, "loss": 0.4153, "num_input_tokens_seen": 33754752, "step": 15650 }, { "epoch": 2.873004220957974, "grad_norm": 2.6050217151641846, "learning_rate": 9.942096502673754e-06, "loss": 0.2397, "num_input_tokens_seen": 33765696, "step": 15655 }, { "epoch": 2.873921820517526, "grad_norm": 23.624082565307617, "learning_rate": 9.941974926517932e-06, "loss": 0.3564, "num_input_tokens_seen": 33776928, "step": 15660 }, { "epoch": 2.8748394200770786, "grad_norm": 3.055619239807129, "learning_rate": 9.941853223608114e-06, "loss": 0.3019, "num_input_tokens_seen": 33787680, "step": 15665 }, { "epoch": 2.8757570196366307, "grad_norm": 12.366018295288086, "learning_rate": 9.94173139394742e-06, "loss": 0.2948, "num_input_tokens_seen": 33796672, "step": 15670 }, { "epoch": 2.8766746191961827, "grad_norm": 4.308652877807617, "learning_rate": 9.941609437538973e-06, "loss": 0.3366, "num_input_tokens_seen": 33808128, "step": 15675 }, { "epoch": 2.8775922187557352, "grad_norm": 5.325841903686523, "learning_rate": 9.941487354385904e-06, "loss": 0.3781, "num_input_tokens_seen": 33820128, "step": 15680 }, { "epoch": 2.8785098183152873, "grad_norm": 3.3702940940856934, "learning_rate": 9.941365144491344e-06, "loss": 0.3139, "num_input_tokens_seen": 33831296, "step": 15685 }, { "epoch": 2.8794274178748394, "grad_norm": 2.8394250869750977, "learning_rate": 9.941242807858424e-06, "loss": 0.3395, "num_input_tokens_seen": 33841856, "step": 15690 }, { "epoch": 2.880345017434392, "grad_norm": 2.8237318992614746, "learning_rate": 9.941120344490287e-06, "loss": 0.4033, "num_input_tokens_seen": 33852224, "step": 15695 }, { "epoch": 2.881262616993944, "grad_norm": 2.393326997756958, "learning_rate": 9.940997754390069e-06, "loss": 0.3266, "num_input_tokens_seen": 33861952, "step": 15700 }, { "epoch": 2.882180216553496, "grad_norm": 4.517077922821045, "learning_rate": 9.940875037560917e-06, "loss": 0.2974, "num_input_tokens_seen": 33872864, "step": 15705 }, { "epoch": 2.8830978161130485, "grad_norm": 2.1454355716705322, "learning_rate": 9.940752194005978e-06, "loss": 0.2258, "num_input_tokens_seen": 33884096, "step": 15710 }, { "epoch": 2.8840154156726006, "grad_norm": 5.88010835647583, "learning_rate": 9.940629223728403e-06, "loss": 0.2943, "num_input_tokens_seen": 33894272, "step": 15715 }, { "epoch": 2.8849330152321526, "grad_norm": 1.9071135520935059, "learning_rate": 9.940506126731346e-06, "loss": 0.2683, "num_input_tokens_seen": 33905984, "step": 15720 }, { "epoch": 2.885850614791705, "grad_norm": 6.0439653396606445, "learning_rate": 9.940382903017964e-06, "loss": 0.3398, "num_input_tokens_seen": 33917600, "step": 15725 }, { "epoch": 2.886768214351257, "grad_norm": 4.212911128997803, "learning_rate": 9.940259552591416e-06, "loss": 0.3383, "num_input_tokens_seen": 33927712, "step": 15730 }, { "epoch": 2.8876858139108093, "grad_norm": 2.6441667079925537, "learning_rate": 9.940136075454869e-06, "loss": 0.3584, "num_input_tokens_seen": 33938144, "step": 15735 }, { "epoch": 2.8886034134703618, "grad_norm": 4.356240749359131, "learning_rate": 9.940012471611486e-06, "loss": 0.3452, "num_input_tokens_seen": 33948640, "step": 15740 }, { "epoch": 2.889521013029914, "grad_norm": 1.7440389394760132, "learning_rate": 9.939888741064441e-06, "loss": 0.4905, "num_input_tokens_seen": 33960000, "step": 15745 }, { "epoch": 2.890438612589466, "grad_norm": 1.8577913045883179, "learning_rate": 9.939764883816907e-06, "loss": 0.3268, "num_input_tokens_seen": 33971936, "step": 15750 }, { "epoch": 2.8913562121490184, "grad_norm": 1.7656534910202026, "learning_rate": 9.939640899872058e-06, "loss": 0.2737, "num_input_tokens_seen": 33983616, "step": 15755 }, { "epoch": 2.8922738117085705, "grad_norm": 9.615360260009766, "learning_rate": 9.939516789233076e-06, "loss": 0.3633, "num_input_tokens_seen": 33994912, "step": 15760 }, { "epoch": 2.8931914112681225, "grad_norm": 2.7941150665283203, "learning_rate": 9.939392551903144e-06, "loss": 0.2372, "num_input_tokens_seen": 34005920, "step": 15765 }, { "epoch": 2.894109010827675, "grad_norm": 1.7565579414367676, "learning_rate": 9.939268187885449e-06, "loss": 0.247, "num_input_tokens_seen": 34016256, "step": 15770 }, { "epoch": 2.895026610387227, "grad_norm": 5.582398891448975, "learning_rate": 9.939143697183178e-06, "loss": 0.1912, "num_input_tokens_seen": 34027776, "step": 15775 }, { "epoch": 2.895944209946779, "grad_norm": 4.500740051269531, "learning_rate": 9.939019079799527e-06, "loss": 0.3255, "num_input_tokens_seen": 34039296, "step": 15780 }, { "epoch": 2.8968618095063317, "grad_norm": 4.043691158294678, "learning_rate": 9.938894335737693e-06, "loss": 0.3386, "num_input_tokens_seen": 34047968, "step": 15785 }, { "epoch": 2.8977794090658837, "grad_norm": 3.051295042037964, "learning_rate": 9.938769465000873e-06, "loss": 0.3313, "num_input_tokens_seen": 34058304, "step": 15790 }, { "epoch": 2.898697008625436, "grad_norm": 2.2689404487609863, "learning_rate": 9.93864446759227e-06, "loss": 0.2316, "num_input_tokens_seen": 34068128, "step": 15795 }, { "epoch": 2.8996146081849883, "grad_norm": 8.944284439086914, "learning_rate": 9.938519343515091e-06, "loss": 0.3603, "num_input_tokens_seen": 34078816, "step": 15800 }, { "epoch": 2.9005322077445403, "grad_norm": 17.58098793029785, "learning_rate": 9.938394092772545e-06, "loss": 0.3939, "num_input_tokens_seen": 34090368, "step": 15805 }, { "epoch": 2.9014498073040924, "grad_norm": 16.04743766784668, "learning_rate": 9.938268715367846e-06, "loss": 0.4798, "num_input_tokens_seen": 34100096, "step": 15810 }, { "epoch": 2.902367406863645, "grad_norm": 2.9120285511016846, "learning_rate": 9.938143211304205e-06, "loss": 0.3122, "num_input_tokens_seen": 34110688, "step": 15815 }, { "epoch": 2.903285006423197, "grad_norm": 10.84020709991455, "learning_rate": 9.938017580584846e-06, "loss": 0.2605, "num_input_tokens_seen": 34121856, "step": 15820 }, { "epoch": 2.904202605982749, "grad_norm": 1.3332511186599731, "learning_rate": 9.937891823212989e-06, "loss": 0.2417, "num_input_tokens_seen": 34132448, "step": 15825 }, { "epoch": 2.9051202055423015, "grad_norm": 3.486140489578247, "learning_rate": 9.937765939191859e-06, "loss": 0.313, "num_input_tokens_seen": 34141728, "step": 15830 }, { "epoch": 2.9060378051018536, "grad_norm": 9.186018943786621, "learning_rate": 9.937639928524687e-06, "loss": 0.3428, "num_input_tokens_seen": 34152160, "step": 15835 }, { "epoch": 2.9069554046614057, "grad_norm": 50.51309585571289, "learning_rate": 9.9375137912147e-06, "loss": 0.2618, "num_input_tokens_seen": 34162016, "step": 15840 }, { "epoch": 2.907873004220958, "grad_norm": 23.469331741333008, "learning_rate": 9.937387527265142e-06, "loss": 0.4453, "num_input_tokens_seen": 34173248, "step": 15845 }, { "epoch": 2.9087906037805102, "grad_norm": 8.617754936218262, "learning_rate": 9.937261136679243e-06, "loss": 0.4239, "num_input_tokens_seen": 34184800, "step": 15850 }, { "epoch": 2.9097082033400623, "grad_norm": 3.9474616050720215, "learning_rate": 9.937134619460248e-06, "loss": 0.347, "num_input_tokens_seen": 34195648, "step": 15855 }, { "epoch": 2.910625802899615, "grad_norm": 7.010577201843262, "learning_rate": 9.9370079756114e-06, "loss": 0.2352, "num_input_tokens_seen": 34205984, "step": 15860 }, { "epoch": 2.911543402459167, "grad_norm": 8.865894317626953, "learning_rate": 9.936881205135953e-06, "loss": 0.2847, "num_input_tokens_seen": 34217600, "step": 15865 }, { "epoch": 2.912461002018719, "grad_norm": 4.193812847137451, "learning_rate": 9.936754308037154e-06, "loss": 0.4895, "num_input_tokens_seen": 34227328, "step": 15870 }, { "epoch": 2.9133786015782714, "grad_norm": 27.25208854675293, "learning_rate": 9.936627284318257e-06, "loss": 0.2234, "num_input_tokens_seen": 34238432, "step": 15875 }, { "epoch": 2.9142962011378235, "grad_norm": 12.799263000488281, "learning_rate": 9.93650013398252e-06, "loss": 0.5523, "num_input_tokens_seen": 34249536, "step": 15880 }, { "epoch": 2.9152138006973756, "grad_norm": 15.967068672180176, "learning_rate": 9.936372857033207e-06, "loss": 0.3244, "num_input_tokens_seen": 34259648, "step": 15885 }, { "epoch": 2.916131400256928, "grad_norm": 1.8953512907028198, "learning_rate": 9.93624545347358e-06, "loss": 0.4721, "num_input_tokens_seen": 34270592, "step": 15890 }, { "epoch": 2.91704899981648, "grad_norm": 3.820265769958496, "learning_rate": 9.93611792330691e-06, "loss": 0.2183, "num_input_tokens_seen": 34281984, "step": 15895 }, { "epoch": 2.917966599376032, "grad_norm": 5.686174392700195, "learning_rate": 9.935990266536464e-06, "loss": 0.3433, "num_input_tokens_seen": 34293184, "step": 15900 }, { "epoch": 2.9188841989355847, "grad_norm": 3.0954525470733643, "learning_rate": 9.935862483165517e-06, "loss": 0.2713, "num_input_tokens_seen": 34303904, "step": 15905 }, { "epoch": 2.9198017984951368, "grad_norm": 11.655904769897461, "learning_rate": 9.935734573197348e-06, "loss": 0.3233, "num_input_tokens_seen": 34315616, "step": 15910 }, { "epoch": 2.920719398054689, "grad_norm": 6.3692240715026855, "learning_rate": 9.935606536635237e-06, "loss": 0.3856, "num_input_tokens_seen": 34326336, "step": 15915 }, { "epoch": 2.9216369976142413, "grad_norm": 3.553879737854004, "learning_rate": 9.935478373482466e-06, "loss": 0.3925, "num_input_tokens_seen": 34337312, "step": 15920 }, { "epoch": 2.9225545971737934, "grad_norm": 3.5068397521972656, "learning_rate": 9.935350083742325e-06, "loss": 0.4175, "num_input_tokens_seen": 34347456, "step": 15925 }, { "epoch": 2.9234721967333455, "grad_norm": 3.2103893756866455, "learning_rate": 9.935221667418105e-06, "loss": 0.2708, "num_input_tokens_seen": 34358016, "step": 15930 }, { "epoch": 2.924389796292898, "grad_norm": 2.433079242706299, "learning_rate": 9.935093124513098e-06, "loss": 0.3019, "num_input_tokens_seen": 34370592, "step": 15935 }, { "epoch": 2.92530739585245, "grad_norm": 3.959728240966797, "learning_rate": 9.9349644550306e-06, "loss": 0.2994, "num_input_tokens_seen": 34379456, "step": 15940 }, { "epoch": 2.926224995412002, "grad_norm": 1.3092797994613647, "learning_rate": 9.934835658973912e-06, "loss": 0.3875, "num_input_tokens_seen": 34389120, "step": 15945 }, { "epoch": 2.9271425949715546, "grad_norm": 7.1256422996521, "learning_rate": 9.934706736346337e-06, "loss": 0.274, "num_input_tokens_seen": 34399488, "step": 15950 }, { "epoch": 2.9280601945311067, "grad_norm": 3.031135082244873, "learning_rate": 9.934577687151184e-06, "loss": 0.3391, "num_input_tokens_seen": 34409184, "step": 15955 }, { "epoch": 2.9289777940906587, "grad_norm": 4.762291431427002, "learning_rate": 9.934448511391762e-06, "loss": 0.3166, "num_input_tokens_seen": 34419648, "step": 15960 }, { "epoch": 2.9298953936502112, "grad_norm": 2.6926465034484863, "learning_rate": 9.934319209071382e-06, "loss": 0.2595, "num_input_tokens_seen": 34430720, "step": 15965 }, { "epoch": 2.9308129932097633, "grad_norm": 2.3948874473571777, "learning_rate": 9.934189780193361e-06, "loss": 0.3171, "num_input_tokens_seen": 34441696, "step": 15970 }, { "epoch": 2.9317305927693154, "grad_norm": 4.561857223510742, "learning_rate": 9.93406022476102e-06, "loss": 0.347, "num_input_tokens_seen": 34452512, "step": 15975 }, { "epoch": 2.932648192328868, "grad_norm": 7.506640911102295, "learning_rate": 9.933930542777681e-06, "loss": 0.2712, "num_input_tokens_seen": 34463392, "step": 15980 }, { "epoch": 2.93356579188842, "grad_norm": 1.6349575519561768, "learning_rate": 9.933800734246673e-06, "loss": 0.3036, "num_input_tokens_seen": 34474432, "step": 15985 }, { "epoch": 2.934483391447972, "grad_norm": 3.004706859588623, "learning_rate": 9.933670799171319e-06, "loss": 0.2437, "num_input_tokens_seen": 34484288, "step": 15990 }, { "epoch": 2.9354009910075245, "grad_norm": 2.701847553253174, "learning_rate": 9.933540737554959e-06, "loss": 0.26, "num_input_tokens_seen": 34495168, "step": 15995 }, { "epoch": 2.9363185905670766, "grad_norm": 6.947020053863525, "learning_rate": 9.933410549400924e-06, "loss": 0.3062, "num_input_tokens_seen": 34506112, "step": 16000 }, { "epoch": 2.9372361901266286, "grad_norm": 3.981842279434204, "learning_rate": 9.933280234712552e-06, "loss": 0.2056, "num_input_tokens_seen": 34515968, "step": 16005 }, { "epoch": 2.938153789686181, "grad_norm": 6.832682132720947, "learning_rate": 9.933149793493191e-06, "loss": 0.2732, "num_input_tokens_seen": 34526656, "step": 16010 }, { "epoch": 2.939071389245733, "grad_norm": 3.1909677982330322, "learning_rate": 9.933019225746183e-06, "loss": 0.3317, "num_input_tokens_seen": 34536640, "step": 16015 }, { "epoch": 2.9399889888052853, "grad_norm": 2.1103620529174805, "learning_rate": 9.932888531474877e-06, "loss": 0.2225, "num_input_tokens_seen": 34546624, "step": 16020 }, { "epoch": 2.9409065883648378, "grad_norm": 16.360984802246094, "learning_rate": 9.932757710682625e-06, "loss": 0.4443, "num_input_tokens_seen": 34557312, "step": 16025 }, { "epoch": 2.94182418792439, "grad_norm": 13.513683319091797, "learning_rate": 9.932626763372784e-06, "loss": 0.4409, "num_input_tokens_seen": 34567616, "step": 16030 }, { "epoch": 2.942741787483942, "grad_norm": 14.20640754699707, "learning_rate": 9.93249568954871e-06, "loss": 0.5153, "num_input_tokens_seen": 34580608, "step": 16035 }, { "epoch": 2.9436593870434944, "grad_norm": 4.32028865814209, "learning_rate": 9.932364489213767e-06, "loss": 0.2438, "num_input_tokens_seen": 34592192, "step": 16040 }, { "epoch": 2.9445769866030465, "grad_norm": 3.643583297729492, "learning_rate": 9.932233162371318e-06, "loss": 0.2451, "num_input_tokens_seen": 34603456, "step": 16045 }, { "epoch": 2.9454945861625985, "grad_norm": 6.709008693695068, "learning_rate": 9.932101709024735e-06, "loss": 0.2481, "num_input_tokens_seen": 34614048, "step": 16050 }, { "epoch": 2.946412185722151, "grad_norm": 3.261958599090576, "learning_rate": 9.931970129177387e-06, "loss": 0.2809, "num_input_tokens_seen": 34626080, "step": 16055 }, { "epoch": 2.947329785281703, "grad_norm": 2.9369773864746094, "learning_rate": 9.931838422832646e-06, "loss": 0.2961, "num_input_tokens_seen": 34636544, "step": 16060 }, { "epoch": 2.948247384841255, "grad_norm": 5.831468105316162, "learning_rate": 9.931706589993898e-06, "loss": 0.3023, "num_input_tokens_seen": 34645760, "step": 16065 }, { "epoch": 2.9491649844008077, "grad_norm": 2.5164778232574463, "learning_rate": 9.931574630664516e-06, "loss": 0.3844, "num_input_tokens_seen": 34656224, "step": 16070 }, { "epoch": 2.9500825839603597, "grad_norm": 5.728172779083252, "learning_rate": 9.931442544847888e-06, "loss": 0.2538, "num_input_tokens_seen": 34665696, "step": 16075 }, { "epoch": 2.951000183519912, "grad_norm": 5.9007487297058105, "learning_rate": 9.931310332547402e-06, "loss": 0.2753, "num_input_tokens_seen": 34676640, "step": 16080 }, { "epoch": 2.9519177830794643, "grad_norm": 7.9076008796691895, "learning_rate": 9.93117799376645e-06, "loss": 0.2617, "num_input_tokens_seen": 34687872, "step": 16085 }, { "epoch": 2.9528353826390163, "grad_norm": 2.1256136894226074, "learning_rate": 9.931045528508423e-06, "loss": 0.2382, "num_input_tokens_seen": 34698592, "step": 16090 }, { "epoch": 2.9537529821985684, "grad_norm": 2.7676753997802734, "learning_rate": 9.930912936776723e-06, "loss": 0.1767, "num_input_tokens_seen": 34709024, "step": 16095 }, { "epoch": 2.954670581758121, "grad_norm": 4.926875114440918, "learning_rate": 9.930780218574746e-06, "loss": 0.2511, "num_input_tokens_seen": 34719936, "step": 16100 }, { "epoch": 2.955588181317673, "grad_norm": 65.2608871459961, "learning_rate": 9.930647373905901e-06, "loss": 0.6043, "num_input_tokens_seen": 34730464, "step": 16105 }, { "epoch": 2.956505780877225, "grad_norm": 2.443830728530884, "learning_rate": 9.930514402773591e-06, "loss": 0.2956, "num_input_tokens_seen": 34740800, "step": 16110 }, { "epoch": 2.9574233804367775, "grad_norm": 5.276951313018799, "learning_rate": 9.93038130518123e-06, "loss": 0.3011, "num_input_tokens_seen": 34751392, "step": 16115 }, { "epoch": 2.9583409799963296, "grad_norm": 1.8254839181900024, "learning_rate": 9.930248081132227e-06, "loss": 0.3712, "num_input_tokens_seen": 34762016, "step": 16120 }, { "epoch": 2.9592585795558817, "grad_norm": 1.4615974426269531, "learning_rate": 9.930114730630003e-06, "loss": 0.6113, "num_input_tokens_seen": 34772352, "step": 16125 }, { "epoch": 2.960176179115434, "grad_norm": 2.563260555267334, "learning_rate": 9.929981253677978e-06, "loss": 0.431, "num_input_tokens_seen": 34782976, "step": 16130 }, { "epoch": 2.9610937786749862, "grad_norm": 3.773831367492676, "learning_rate": 9.929847650279573e-06, "loss": 0.2998, "num_input_tokens_seen": 34794528, "step": 16135 }, { "epoch": 2.9620113782345383, "grad_norm": 11.363553047180176, "learning_rate": 9.929713920438218e-06, "loss": 0.5481, "num_input_tokens_seen": 34804992, "step": 16140 }, { "epoch": 2.962928977794091, "grad_norm": 2.9786555767059326, "learning_rate": 9.929580064157341e-06, "loss": 0.3696, "num_input_tokens_seen": 34814336, "step": 16145 }, { "epoch": 2.963846577353643, "grad_norm": 1.7755084037780762, "learning_rate": 9.929446081440376e-06, "loss": 0.3046, "num_input_tokens_seen": 34825088, "step": 16150 }, { "epoch": 2.964764176913195, "grad_norm": 1.950728416442871, "learning_rate": 9.929311972290758e-06, "loss": 0.265, "num_input_tokens_seen": 34835680, "step": 16155 }, { "epoch": 2.9656817764727474, "grad_norm": 2.4702746868133545, "learning_rate": 9.929177736711927e-06, "loss": 0.2427, "num_input_tokens_seen": 34846464, "step": 16160 }, { "epoch": 2.9665993760322995, "grad_norm": 3.9840705394744873, "learning_rate": 9.929043374707329e-06, "loss": 0.4429, "num_input_tokens_seen": 34857248, "step": 16165 }, { "epoch": 2.9675169755918516, "grad_norm": 2.612353563308716, "learning_rate": 9.928908886280406e-06, "loss": 0.3833, "num_input_tokens_seen": 34868544, "step": 16170 }, { "epoch": 2.968434575151404, "grad_norm": 3.1535768508911133, "learning_rate": 9.92877427143461e-06, "loss": 0.4307, "num_input_tokens_seen": 34879072, "step": 16175 }, { "epoch": 2.969352174710956, "grad_norm": 5.876064777374268, "learning_rate": 9.928639530173392e-06, "loss": 0.3237, "num_input_tokens_seen": 34889824, "step": 16180 }, { "epoch": 2.970269774270508, "grad_norm": 4.631380081176758, "learning_rate": 9.928504662500209e-06, "loss": 0.3184, "num_input_tokens_seen": 34899936, "step": 16185 }, { "epoch": 2.9711873738300607, "grad_norm": 2.51759934425354, "learning_rate": 9.92836966841852e-06, "loss": 0.3145, "num_input_tokens_seen": 34910560, "step": 16190 }, { "epoch": 2.9721049733896128, "grad_norm": 2.6292901039123535, "learning_rate": 9.928234547931787e-06, "loss": 0.2491, "num_input_tokens_seen": 34920672, "step": 16195 }, { "epoch": 2.973022572949165, "grad_norm": 3.399930238723755, "learning_rate": 9.928099301043476e-06, "loss": 0.3269, "num_input_tokens_seen": 34930688, "step": 16200 }, { "epoch": 2.9739401725087173, "grad_norm": 0.8739780783653259, "learning_rate": 9.927963927757057e-06, "loss": 0.2693, "num_input_tokens_seen": 34940160, "step": 16205 }, { "epoch": 2.9748577720682694, "grad_norm": 5.851470470428467, "learning_rate": 9.927828428075998e-06, "loss": 0.364, "num_input_tokens_seen": 34950464, "step": 16210 }, { "epoch": 2.9757753716278215, "grad_norm": 6.776010036468506, "learning_rate": 9.92769280200378e-06, "loss": 0.2174, "num_input_tokens_seen": 34961408, "step": 16215 }, { "epoch": 2.976692971187374, "grad_norm": 0.7313140034675598, "learning_rate": 9.927557049543877e-06, "loss": 0.2544, "num_input_tokens_seen": 34971168, "step": 16220 }, { "epoch": 2.977610570746926, "grad_norm": 6.158401966094971, "learning_rate": 9.927421170699775e-06, "loss": 0.2907, "num_input_tokens_seen": 34981504, "step": 16225 }, { "epoch": 2.978528170306478, "grad_norm": 6.842074394226074, "learning_rate": 9.927285165474955e-06, "loss": 0.4189, "num_input_tokens_seen": 34993024, "step": 16230 }, { "epoch": 2.9794457698660306, "grad_norm": 4.8156418800354, "learning_rate": 9.927149033872908e-06, "loss": 0.3064, "num_input_tokens_seen": 35003456, "step": 16235 }, { "epoch": 2.9803633694255827, "grad_norm": 23.167787551879883, "learning_rate": 9.927012775897124e-06, "loss": 0.3831, "num_input_tokens_seen": 35014080, "step": 16240 }, { "epoch": 2.9812809689851347, "grad_norm": 6.30485200881958, "learning_rate": 9.9268763915511e-06, "loss": 0.432, "num_input_tokens_seen": 35023936, "step": 16245 }, { "epoch": 2.9821985685446872, "grad_norm": 4.507814884185791, "learning_rate": 9.92673988083833e-06, "loss": 0.3105, "num_input_tokens_seen": 35035360, "step": 16250 }, { "epoch": 2.9831161681042393, "grad_norm": 12.734962463378906, "learning_rate": 9.926603243762319e-06, "loss": 0.2583, "num_input_tokens_seen": 35046976, "step": 16255 }, { "epoch": 2.9840337676637914, "grad_norm": 2.852482795715332, "learning_rate": 9.926466480326571e-06, "loss": 0.2107, "num_input_tokens_seen": 35058208, "step": 16260 }, { "epoch": 2.984951367223344, "grad_norm": 5.079199314117432, "learning_rate": 9.92632959053459e-06, "loss": 0.3811, "num_input_tokens_seen": 35069408, "step": 16265 }, { "epoch": 2.985868966782896, "grad_norm": 8.068860054016113, "learning_rate": 9.926192574389894e-06, "loss": 0.3918, "num_input_tokens_seen": 35080672, "step": 16270 }, { "epoch": 2.986786566342448, "grad_norm": 3.588634490966797, "learning_rate": 9.926055431895993e-06, "loss": 0.3331, "num_input_tokens_seen": 35091424, "step": 16275 }, { "epoch": 2.9877041659020005, "grad_norm": 5.302219867706299, "learning_rate": 9.925918163056402e-06, "loss": 0.2659, "num_input_tokens_seen": 35101344, "step": 16280 }, { "epoch": 2.9886217654615526, "grad_norm": 2.5654983520507812, "learning_rate": 9.925780767874648e-06, "loss": 0.364, "num_input_tokens_seen": 35112128, "step": 16285 }, { "epoch": 2.9895393650211046, "grad_norm": 2.092550277709961, "learning_rate": 9.92564324635425e-06, "loss": 0.2987, "num_input_tokens_seen": 35123424, "step": 16290 }, { "epoch": 2.990456964580657, "grad_norm": 2.3262059688568115, "learning_rate": 9.925505598498738e-06, "loss": 0.4211, "num_input_tokens_seen": 35133664, "step": 16295 }, { "epoch": 2.991374564140209, "grad_norm": 1.6316165924072266, "learning_rate": 9.925367824311639e-06, "loss": 0.3491, "num_input_tokens_seen": 35145920, "step": 16300 }, { "epoch": 2.9922921636997613, "grad_norm": 6.6866841316223145, "learning_rate": 9.92522992379649e-06, "loss": 0.3012, "num_input_tokens_seen": 35157216, "step": 16305 }, { "epoch": 2.9932097632593138, "grad_norm": 6.9706854820251465, "learning_rate": 9.925091896956827e-06, "loss": 0.4381, "num_input_tokens_seen": 35167328, "step": 16310 }, { "epoch": 2.994127362818866, "grad_norm": 1.5367861986160278, "learning_rate": 9.92495374379619e-06, "loss": 0.2382, "num_input_tokens_seen": 35178592, "step": 16315 }, { "epoch": 2.995044962378418, "grad_norm": 9.814679145812988, "learning_rate": 9.924815464318121e-06, "loss": 0.418, "num_input_tokens_seen": 35189856, "step": 16320 }, { "epoch": 2.9959625619379704, "grad_norm": 1.311260461807251, "learning_rate": 9.92467705852617e-06, "loss": 0.2434, "num_input_tokens_seen": 35201792, "step": 16325 }, { "epoch": 2.9968801614975225, "grad_norm": 9.578582763671875, "learning_rate": 9.924538526423884e-06, "loss": 0.309, "num_input_tokens_seen": 35211040, "step": 16330 }, { "epoch": 2.9977977610570745, "grad_norm": 4.440088748931885, "learning_rate": 9.924399868014817e-06, "loss": 0.2535, "num_input_tokens_seen": 35221344, "step": 16335 }, { "epoch": 2.998715360616627, "grad_norm": 5.458823204040527, "learning_rate": 9.924261083302528e-06, "loss": 0.2822, "num_input_tokens_seen": 35232832, "step": 16340 }, { "epoch": 2.999632960176179, "grad_norm": 12.318227767944336, "learning_rate": 9.924122172290571e-06, "loss": 0.5388, "num_input_tokens_seen": 35243168, "step": 16345 }, { "epoch": 3.000550559735731, "grad_norm": 3.517336845397949, "learning_rate": 9.923983134982514e-06, "loss": 0.4935, "num_input_tokens_seen": 35253408, "step": 16350 }, { "epoch": 3.0014681592952837, "grad_norm": 51.76689529418945, "learning_rate": 9.92384397138192e-06, "loss": 0.314, "num_input_tokens_seen": 35263232, "step": 16355 }, { "epoch": 3.0023857588548357, "grad_norm": 10.750457763671875, "learning_rate": 9.92370468149236e-06, "loss": 0.4171, "num_input_tokens_seen": 35274400, "step": 16360 }, { "epoch": 3.003303358414388, "grad_norm": 10.138545036315918, "learning_rate": 9.923565265317406e-06, "loss": 0.2557, "num_input_tokens_seen": 35283584, "step": 16365 }, { "epoch": 3.0042209579739403, "grad_norm": 10.437137603759766, "learning_rate": 9.923425722860633e-06, "loss": 0.4292, "num_input_tokens_seen": 35295488, "step": 16370 }, { "epoch": 3.0051385575334923, "grad_norm": 4.745554447174072, "learning_rate": 9.923286054125621e-06, "loss": 0.2726, "num_input_tokens_seen": 35305344, "step": 16375 }, { "epoch": 3.0060561570930444, "grad_norm": 4.082912921905518, "learning_rate": 9.923146259115953e-06, "loss": 0.2233, "num_input_tokens_seen": 35314720, "step": 16380 }, { "epoch": 3.006973756652597, "grad_norm": 3.630828857421875, "learning_rate": 9.923006337835213e-06, "loss": 0.2959, "num_input_tokens_seen": 35325888, "step": 16385 }, { "epoch": 3.007891356212149, "grad_norm": 7.75374698638916, "learning_rate": 9.92286629028699e-06, "loss": 0.4368, "num_input_tokens_seen": 35336608, "step": 16390 }, { "epoch": 3.008808955771701, "grad_norm": 5.537052154541016, "learning_rate": 9.922726116474877e-06, "loss": 0.2798, "num_input_tokens_seen": 35347648, "step": 16395 }, { "epoch": 3.0097265553312536, "grad_norm": 3.599639892578125, "learning_rate": 9.922585816402468e-06, "loss": 0.3599, "num_input_tokens_seen": 35358208, "step": 16400 }, { "epoch": 3.0106441548908056, "grad_norm": 7.121408939361572, "learning_rate": 9.922445390073363e-06, "loss": 0.287, "num_input_tokens_seen": 35369504, "step": 16405 }, { "epoch": 3.0115617544503577, "grad_norm": 11.092029571533203, "learning_rate": 9.922304837491164e-06, "loss": 0.3647, "num_input_tokens_seen": 35381984, "step": 16410 }, { "epoch": 3.01247935400991, "grad_norm": 8.73799991607666, "learning_rate": 9.922164158659472e-06, "loss": 0.3225, "num_input_tokens_seen": 35391936, "step": 16415 }, { "epoch": 3.0133969535694622, "grad_norm": 6.106758117675781, "learning_rate": 9.9220233535819e-06, "loss": 0.229, "num_input_tokens_seen": 35403200, "step": 16420 }, { "epoch": 3.0143145531290143, "grad_norm": 0.9896991848945618, "learning_rate": 9.921882422262057e-06, "loss": 0.3826, "num_input_tokens_seen": 35414048, "step": 16425 }, { "epoch": 3.015232152688567, "grad_norm": 6.068443775177002, "learning_rate": 9.921741364703557e-06, "loss": 0.3126, "num_input_tokens_seen": 35424128, "step": 16430 }, { "epoch": 3.016149752248119, "grad_norm": 3.4177567958831787, "learning_rate": 9.921600180910019e-06, "loss": 0.3359, "num_input_tokens_seen": 35435136, "step": 16435 }, { "epoch": 3.017067351807671, "grad_norm": 4.522719383239746, "learning_rate": 9.921458870885066e-06, "loss": 0.3458, "num_input_tokens_seen": 35445472, "step": 16440 }, { "epoch": 3.0179849513672234, "grad_norm": 10.572189331054688, "learning_rate": 9.921317434632318e-06, "loss": 0.3152, "num_input_tokens_seen": 35456832, "step": 16445 }, { "epoch": 3.0189025509267755, "grad_norm": 1.7698373794555664, "learning_rate": 9.921175872155408e-06, "loss": 0.3005, "num_input_tokens_seen": 35467392, "step": 16450 }, { "epoch": 3.0198201504863276, "grad_norm": 2.852423667907715, "learning_rate": 9.921034183457963e-06, "loss": 0.3518, "num_input_tokens_seen": 35476960, "step": 16455 }, { "epoch": 3.02073775004588, "grad_norm": 4.861311435699463, "learning_rate": 9.920892368543617e-06, "loss": 0.3173, "num_input_tokens_seen": 35488064, "step": 16460 }, { "epoch": 3.021655349605432, "grad_norm": 10.566608428955078, "learning_rate": 9.920750427416008e-06, "loss": 0.4394, "num_input_tokens_seen": 35499040, "step": 16465 }, { "epoch": 3.022572949164984, "grad_norm": 2.692201614379883, "learning_rate": 9.920608360078778e-06, "loss": 0.3084, "num_input_tokens_seen": 35509664, "step": 16470 }, { "epoch": 3.0234905487245367, "grad_norm": 1.8843166828155518, "learning_rate": 9.920466166535571e-06, "loss": 0.2973, "num_input_tokens_seen": 35519936, "step": 16475 }, { "epoch": 3.0244081482840888, "grad_norm": 0.9770910143852234, "learning_rate": 9.920323846790032e-06, "loss": 0.3048, "num_input_tokens_seen": 35529504, "step": 16480 }, { "epoch": 3.025325747843641, "grad_norm": 2.0792253017425537, "learning_rate": 9.920181400845811e-06, "loss": 0.2301, "num_input_tokens_seen": 35540992, "step": 16485 }, { "epoch": 3.0262433474031933, "grad_norm": 6.7907328605651855, "learning_rate": 9.920038828706563e-06, "loss": 0.2681, "num_input_tokens_seen": 35552736, "step": 16490 }, { "epoch": 3.0271609469627454, "grad_norm": 3.9288828372955322, "learning_rate": 9.919896130375947e-06, "loss": 0.318, "num_input_tokens_seen": 35562880, "step": 16495 }, { "epoch": 3.0280785465222975, "grad_norm": 7.33154821395874, "learning_rate": 9.919753305857618e-06, "loss": 0.3518, "num_input_tokens_seen": 35573600, "step": 16500 }, { "epoch": 3.02899614608185, "grad_norm": 6.973477363586426, "learning_rate": 9.919610355155243e-06, "loss": 0.2395, "num_input_tokens_seen": 35584480, "step": 16505 }, { "epoch": 3.029913745641402, "grad_norm": 4.949598789215088, "learning_rate": 9.919467278272485e-06, "loss": 0.3205, "num_input_tokens_seen": 35595776, "step": 16510 }, { "epoch": 3.030831345200954, "grad_norm": 8.038788795471191, "learning_rate": 9.919324075213016e-06, "loss": 0.3619, "num_input_tokens_seen": 35607168, "step": 16515 }, { "epoch": 3.0317489447605066, "grad_norm": 13.518840789794922, "learning_rate": 9.91918074598051e-06, "loss": 0.3518, "num_input_tokens_seen": 35618496, "step": 16520 }, { "epoch": 3.0326665443200587, "grad_norm": 0.9638714790344238, "learning_rate": 9.919037290578644e-06, "loss": 0.3257, "num_input_tokens_seen": 35628672, "step": 16525 }, { "epoch": 3.0335841438796107, "grad_norm": 7.880343914031982, "learning_rate": 9.918893709011092e-06, "loss": 0.3019, "num_input_tokens_seen": 35638592, "step": 16530 }, { "epoch": 3.0345017434391632, "grad_norm": 5.088778495788574, "learning_rate": 9.918750001281541e-06, "loss": 0.4097, "num_input_tokens_seen": 35649984, "step": 16535 }, { "epoch": 3.0354193429987153, "grad_norm": 4.798499584197998, "learning_rate": 9.918606167393675e-06, "loss": 0.2342, "num_input_tokens_seen": 35661504, "step": 16540 }, { "epoch": 3.0363369425582674, "grad_norm": 1.5990036725997925, "learning_rate": 9.918462207351185e-06, "loss": 0.3343, "num_input_tokens_seen": 35672384, "step": 16545 }, { "epoch": 3.03725454211782, "grad_norm": 2.375276565551758, "learning_rate": 9.918318121157762e-06, "loss": 0.2506, "num_input_tokens_seen": 35684128, "step": 16550 }, { "epoch": 3.038172141677372, "grad_norm": 2.5923547744750977, "learning_rate": 9.918173908817101e-06, "loss": 0.3345, "num_input_tokens_seen": 35694688, "step": 16555 }, { "epoch": 3.039089741236924, "grad_norm": 1.9469285011291504, "learning_rate": 9.918029570332903e-06, "loss": 0.2047, "num_input_tokens_seen": 35705856, "step": 16560 }, { "epoch": 3.0400073407964765, "grad_norm": 12.160371780395508, "learning_rate": 9.91788510570887e-06, "loss": 0.3627, "num_input_tokens_seen": 35716640, "step": 16565 }, { "epoch": 3.0409249403560286, "grad_norm": 8.632402420043945, "learning_rate": 9.917740514948704e-06, "loss": 0.3163, "num_input_tokens_seen": 35727296, "step": 16570 }, { "epoch": 3.0418425399155806, "grad_norm": 3.9232406616210938, "learning_rate": 9.917595798056116e-06, "loss": 0.2968, "num_input_tokens_seen": 35738368, "step": 16575 }, { "epoch": 3.042760139475133, "grad_norm": 4.4845356941223145, "learning_rate": 9.917450955034818e-06, "loss": 0.3192, "num_input_tokens_seen": 35749888, "step": 16580 }, { "epoch": 3.043677739034685, "grad_norm": 3.10770845413208, "learning_rate": 9.917305985888523e-06, "loss": 0.2415, "num_input_tokens_seen": 35761504, "step": 16585 }, { "epoch": 3.0445953385942373, "grad_norm": 5.255565643310547, "learning_rate": 9.917160890620952e-06, "loss": 0.2975, "num_input_tokens_seen": 35772768, "step": 16590 }, { "epoch": 3.0455129381537898, "grad_norm": 1.4016673564910889, "learning_rate": 9.917015669235823e-06, "loss": 0.2843, "num_input_tokens_seen": 35782624, "step": 16595 }, { "epoch": 3.046430537713342, "grad_norm": 3.0110535621643066, "learning_rate": 9.916870321736864e-06, "loss": 0.3286, "num_input_tokens_seen": 35793664, "step": 16600 }, { "epoch": 3.047348137272894, "grad_norm": 1.7243894338607788, "learning_rate": 9.916724848127803e-06, "loss": 0.3909, "num_input_tokens_seen": 35805056, "step": 16605 }, { "epoch": 3.0482657368324464, "grad_norm": 3.183126211166382, "learning_rate": 9.916579248412368e-06, "loss": 0.262, "num_input_tokens_seen": 35816608, "step": 16610 }, { "epoch": 3.0491833363919985, "grad_norm": 3.8836288452148438, "learning_rate": 9.916433522594296e-06, "loss": 0.2927, "num_input_tokens_seen": 35825856, "step": 16615 }, { "epoch": 3.0501009359515505, "grad_norm": 4.188173294067383, "learning_rate": 9.916287670677325e-06, "loss": 0.2555, "num_input_tokens_seen": 35836960, "step": 16620 }, { "epoch": 3.051018535511103, "grad_norm": 2.713340997695923, "learning_rate": 9.916141692665193e-06, "loss": 0.3121, "num_input_tokens_seen": 35847328, "step": 16625 }, { "epoch": 3.051936135070655, "grad_norm": 2.2738747596740723, "learning_rate": 9.915995588561647e-06, "loss": 0.2937, "num_input_tokens_seen": 35858432, "step": 16630 }, { "epoch": 3.0528537346302076, "grad_norm": 4.803301811218262, "learning_rate": 9.915849358370433e-06, "loss": 0.2689, "num_input_tokens_seen": 35868640, "step": 16635 }, { "epoch": 3.0537713341897597, "grad_norm": 1.3562451601028442, "learning_rate": 9.9157030020953e-06, "loss": 0.2341, "num_input_tokens_seen": 35879520, "step": 16640 }, { "epoch": 3.0546889337493117, "grad_norm": 18.523021697998047, "learning_rate": 9.915556519740008e-06, "loss": 0.3433, "num_input_tokens_seen": 35890304, "step": 16645 }, { "epoch": 3.0556065333088642, "grad_norm": 2.4740188121795654, "learning_rate": 9.915409911308307e-06, "loss": 0.3996, "num_input_tokens_seen": 35901216, "step": 16650 }, { "epoch": 3.0565241328684163, "grad_norm": 2.431290864944458, "learning_rate": 9.91526317680396e-06, "loss": 0.272, "num_input_tokens_seen": 35911520, "step": 16655 }, { "epoch": 3.0574417324279684, "grad_norm": 2.7802846431732178, "learning_rate": 9.915116316230731e-06, "loss": 0.3826, "num_input_tokens_seen": 35922304, "step": 16660 }, { "epoch": 3.058359331987521, "grad_norm": 1.674242615699768, "learning_rate": 9.914969329592386e-06, "loss": 0.1876, "num_input_tokens_seen": 35932896, "step": 16665 }, { "epoch": 3.059276931547073, "grad_norm": 1.14314866065979, "learning_rate": 9.914822216892694e-06, "loss": 0.3688, "num_input_tokens_seen": 35943936, "step": 16670 }, { "epoch": 3.060194531106625, "grad_norm": 30.082406997680664, "learning_rate": 9.91467497813543e-06, "loss": 0.3836, "num_input_tokens_seen": 35954144, "step": 16675 }, { "epoch": 3.0611121306661775, "grad_norm": 21.329347610473633, "learning_rate": 9.91452761332437e-06, "loss": 0.3228, "num_input_tokens_seen": 35965024, "step": 16680 }, { "epoch": 3.0620297302257296, "grad_norm": 4.752723693847656, "learning_rate": 9.914380122463295e-06, "loss": 0.286, "num_input_tokens_seen": 35975744, "step": 16685 }, { "epoch": 3.0629473297852816, "grad_norm": 8.714681625366211, "learning_rate": 9.914232505555985e-06, "loss": 0.335, "num_input_tokens_seen": 35987616, "step": 16690 }, { "epoch": 3.063864929344834, "grad_norm": 5.648097515106201, "learning_rate": 9.91408476260623e-06, "loss": 0.3121, "num_input_tokens_seen": 35998912, "step": 16695 }, { "epoch": 3.064782528904386, "grad_norm": 2.4995217323303223, "learning_rate": 9.913936893617813e-06, "loss": 0.2833, "num_input_tokens_seen": 36009696, "step": 16700 }, { "epoch": 3.0657001284639382, "grad_norm": 9.331900596618652, "learning_rate": 9.913788898594532e-06, "loss": 0.3472, "num_input_tokens_seen": 36021408, "step": 16705 }, { "epoch": 3.0666177280234908, "grad_norm": 1.807905912399292, "learning_rate": 9.913640777540183e-06, "loss": 0.2839, "num_input_tokens_seen": 36031904, "step": 16710 }, { "epoch": 3.067535327583043, "grad_norm": 16.133529663085938, "learning_rate": 9.913492530458561e-06, "loss": 0.4597, "num_input_tokens_seen": 36042816, "step": 16715 }, { "epoch": 3.068452927142595, "grad_norm": 8.249162673950195, "learning_rate": 9.913344157353472e-06, "loss": 0.2828, "num_input_tokens_seen": 36053152, "step": 16720 }, { "epoch": 3.0693705267021474, "grad_norm": 2.538349151611328, "learning_rate": 9.913195658228722e-06, "loss": 0.3013, "num_input_tokens_seen": 36063776, "step": 16725 }, { "epoch": 3.0702881262616994, "grad_norm": 25.080902099609375, "learning_rate": 9.913047033088117e-06, "loss": 0.3728, "num_input_tokens_seen": 36074176, "step": 16730 }, { "epoch": 3.0712057258212515, "grad_norm": 10.706364631652832, "learning_rate": 9.91289828193547e-06, "loss": 0.3016, "num_input_tokens_seen": 36085376, "step": 16735 }, { "epoch": 3.072123325380804, "grad_norm": 3.8482508659362793, "learning_rate": 9.912749404774596e-06, "loss": 0.3366, "num_input_tokens_seen": 36097376, "step": 16740 }, { "epoch": 3.073040924940356, "grad_norm": 2.0439388751983643, "learning_rate": 9.912600401609314e-06, "loss": 0.2075, "num_input_tokens_seen": 36108032, "step": 16745 }, { "epoch": 3.073958524499908, "grad_norm": 39.054847717285156, "learning_rate": 9.912451272443444e-06, "loss": 0.2973, "num_input_tokens_seen": 36119104, "step": 16750 }, { "epoch": 3.0748761240594606, "grad_norm": 41.724998474121094, "learning_rate": 9.912302017280814e-06, "loss": 0.383, "num_input_tokens_seen": 36129664, "step": 16755 }, { "epoch": 3.0757937236190127, "grad_norm": 1.1367223262786865, "learning_rate": 9.912152636125252e-06, "loss": 0.4309, "num_input_tokens_seen": 36140672, "step": 16760 }, { "epoch": 3.0767113231785648, "grad_norm": 1.7937610149383545, "learning_rate": 9.912003128980588e-06, "loss": 0.2649, "num_input_tokens_seen": 36151488, "step": 16765 }, { "epoch": 3.0776289227381173, "grad_norm": 6.1144914627075195, "learning_rate": 9.911853495850653e-06, "loss": 0.2173, "num_input_tokens_seen": 36163008, "step": 16770 }, { "epoch": 3.0785465222976693, "grad_norm": 8.860791206359863, "learning_rate": 9.911703736739292e-06, "loss": 0.3175, "num_input_tokens_seen": 36174752, "step": 16775 }, { "epoch": 3.0794641218572214, "grad_norm": 17.204694747924805, "learning_rate": 9.911553851650342e-06, "loss": 0.412, "num_input_tokens_seen": 36185504, "step": 16780 }, { "epoch": 3.080381721416774, "grad_norm": 2.291189432144165, "learning_rate": 9.911403840587648e-06, "loss": 0.1849, "num_input_tokens_seen": 36196384, "step": 16785 }, { "epoch": 3.081299320976326, "grad_norm": 1.6019859313964844, "learning_rate": 9.911253703555055e-06, "loss": 0.2703, "num_input_tokens_seen": 36207776, "step": 16790 }, { "epoch": 3.082216920535878, "grad_norm": 3.478550910949707, "learning_rate": 9.91110344055642e-06, "loss": 0.3418, "num_input_tokens_seen": 36218016, "step": 16795 }, { "epoch": 3.0831345200954305, "grad_norm": 10.941197395324707, "learning_rate": 9.910953051595591e-06, "loss": 0.4613, "num_input_tokens_seen": 36226304, "step": 16800 }, { "epoch": 3.0840521196549826, "grad_norm": 8.049959182739258, "learning_rate": 9.910802536676427e-06, "loss": 0.2737, "num_input_tokens_seen": 36237504, "step": 16805 }, { "epoch": 3.0849697192145347, "grad_norm": 11.801324844360352, "learning_rate": 9.910651895802791e-06, "loss": 0.3185, "num_input_tokens_seen": 36249152, "step": 16810 }, { "epoch": 3.085887318774087, "grad_norm": 5.716513156890869, "learning_rate": 9.910501128978543e-06, "loss": 0.414, "num_input_tokens_seen": 36260512, "step": 16815 }, { "epoch": 3.0868049183336392, "grad_norm": 8.675763130187988, "learning_rate": 9.910350236207554e-06, "loss": 0.283, "num_input_tokens_seen": 36270656, "step": 16820 }, { "epoch": 3.0877225178931913, "grad_norm": 7.112901210784912, "learning_rate": 9.910199217493688e-06, "loss": 0.3105, "num_input_tokens_seen": 36279808, "step": 16825 }, { "epoch": 3.088640117452744, "grad_norm": 3.3705568313598633, "learning_rate": 9.910048072840825e-06, "loss": 0.1359, "num_input_tokens_seen": 36290656, "step": 16830 }, { "epoch": 3.089557717012296, "grad_norm": 1.2871544361114502, "learning_rate": 9.909896802252838e-06, "loss": 0.3248, "num_input_tokens_seen": 36300416, "step": 16835 }, { "epoch": 3.090475316571848, "grad_norm": 6.550382137298584, "learning_rate": 9.909745405733609e-06, "loss": 0.2776, "num_input_tokens_seen": 36310112, "step": 16840 }, { "epoch": 3.0913929161314004, "grad_norm": 9.825751304626465, "learning_rate": 9.909593883287016e-06, "loss": 0.4245, "num_input_tokens_seen": 36320928, "step": 16845 }, { "epoch": 3.0923105156909525, "grad_norm": 1.5525389909744263, "learning_rate": 9.909442234916953e-06, "loss": 0.1493, "num_input_tokens_seen": 36330816, "step": 16850 }, { "epoch": 3.0932281152505046, "grad_norm": 3.8045010566711426, "learning_rate": 9.909290460627304e-06, "loss": 0.249, "num_input_tokens_seen": 36341632, "step": 16855 }, { "epoch": 3.094145714810057, "grad_norm": 6.014512538909912, "learning_rate": 9.909138560421964e-06, "loss": 0.3344, "num_input_tokens_seen": 36352640, "step": 16860 }, { "epoch": 3.095063314369609, "grad_norm": 6.939537048339844, "learning_rate": 9.908986534304827e-06, "loss": 0.2478, "num_input_tokens_seen": 36363680, "step": 16865 }, { "epoch": 3.095980913929161, "grad_norm": 2.4901819229125977, "learning_rate": 9.908834382279795e-06, "loss": 0.1812, "num_input_tokens_seen": 36372448, "step": 16870 }, { "epoch": 3.0968985134887137, "grad_norm": 3.1159603595733643, "learning_rate": 9.908682104350769e-06, "loss": 0.4132, "num_input_tokens_seen": 36383488, "step": 16875 }, { "epoch": 3.0978161130482658, "grad_norm": 44.52141571044922, "learning_rate": 9.908529700521654e-06, "loss": 0.2867, "num_input_tokens_seen": 36392896, "step": 16880 }, { "epoch": 3.098733712607818, "grad_norm": 0.9885234236717224, "learning_rate": 9.908377170796362e-06, "loss": 0.2036, "num_input_tokens_seen": 36403232, "step": 16885 }, { "epoch": 3.0996513121673703, "grad_norm": 3.414935350418091, "learning_rate": 9.9082245151788e-06, "loss": 0.3648, "num_input_tokens_seen": 36414528, "step": 16890 }, { "epoch": 3.1005689117269224, "grad_norm": 16.56966209411621, "learning_rate": 9.908071733672886e-06, "loss": 0.3086, "num_input_tokens_seen": 36425760, "step": 16895 }, { "epoch": 3.1014865112864745, "grad_norm": 3.5622668266296387, "learning_rate": 9.90791882628254e-06, "loss": 0.4003, "num_input_tokens_seen": 36437728, "step": 16900 }, { "epoch": 3.102404110846027, "grad_norm": 4.863277912139893, "learning_rate": 9.907765793011684e-06, "loss": 0.3705, "num_input_tokens_seen": 36447520, "step": 16905 }, { "epoch": 3.103321710405579, "grad_norm": 2.1957786083221436, "learning_rate": 9.90761263386424e-06, "loss": 0.2406, "num_input_tokens_seen": 36459136, "step": 16910 }, { "epoch": 3.104239309965131, "grad_norm": 46.344459533691406, "learning_rate": 9.90745934884414e-06, "loss": 0.3023, "num_input_tokens_seen": 36470272, "step": 16915 }, { "epoch": 3.1051569095246836, "grad_norm": 2.252074956893921, "learning_rate": 9.907305937955312e-06, "loss": 0.3984, "num_input_tokens_seen": 36482144, "step": 16920 }, { "epoch": 3.1060745090842357, "grad_norm": 0.9126464128494263, "learning_rate": 9.907152401201693e-06, "loss": 0.2249, "num_input_tokens_seen": 36493120, "step": 16925 }, { "epoch": 3.1069921086437877, "grad_norm": 2.1949167251586914, "learning_rate": 9.906998738587219e-06, "loss": 0.3668, "num_input_tokens_seen": 36502976, "step": 16930 }, { "epoch": 3.1079097082033402, "grad_norm": 1.2054976224899292, "learning_rate": 9.906844950115836e-06, "loss": 0.3085, "num_input_tokens_seen": 36513056, "step": 16935 }, { "epoch": 3.1088273077628923, "grad_norm": 18.495521545410156, "learning_rate": 9.90669103579148e-06, "loss": 0.3983, "num_input_tokens_seen": 36524512, "step": 16940 }, { "epoch": 3.1097449073224444, "grad_norm": 4.061986923217773, "learning_rate": 9.906536995618106e-06, "loss": 0.2714, "num_input_tokens_seen": 36535616, "step": 16945 }, { "epoch": 3.110662506881997, "grad_norm": 1.584916353225708, "learning_rate": 9.906382829599664e-06, "loss": 0.2413, "num_input_tokens_seen": 36545184, "step": 16950 }, { "epoch": 3.111580106441549, "grad_norm": 7.017276287078857, "learning_rate": 9.906228537740103e-06, "loss": 0.3082, "num_input_tokens_seen": 36555808, "step": 16955 }, { "epoch": 3.112497706001101, "grad_norm": 1.4781473875045776, "learning_rate": 9.906074120043387e-06, "loss": 0.2646, "num_input_tokens_seen": 36566656, "step": 16960 }, { "epoch": 3.1134153055606535, "grad_norm": 2.4067306518554688, "learning_rate": 9.905919576513473e-06, "loss": 0.3114, "num_input_tokens_seen": 36578208, "step": 16965 }, { "epoch": 3.1143329051202056, "grad_norm": 1.202501893043518, "learning_rate": 9.905764907154325e-06, "loss": 0.3554, "num_input_tokens_seen": 36589024, "step": 16970 }, { "epoch": 3.1152505046797576, "grad_norm": 1.2461024522781372, "learning_rate": 9.90561011196991e-06, "loss": 0.32, "num_input_tokens_seen": 36599744, "step": 16975 }, { "epoch": 3.11616810423931, "grad_norm": 8.627678871154785, "learning_rate": 9.9054551909642e-06, "loss": 0.27, "num_input_tokens_seen": 36611584, "step": 16980 }, { "epoch": 3.117085703798862, "grad_norm": 2.429583787918091, "learning_rate": 9.905300144141165e-06, "loss": 0.3072, "num_input_tokens_seen": 36622272, "step": 16985 }, { "epoch": 3.1180033033584142, "grad_norm": 2.821331262588501, "learning_rate": 9.905144971504786e-06, "loss": 0.2307, "num_input_tokens_seen": 36632032, "step": 16990 }, { "epoch": 3.1189209029179668, "grad_norm": 4.550827980041504, "learning_rate": 9.904989673059038e-06, "loss": 0.2972, "num_input_tokens_seen": 36643168, "step": 16995 }, { "epoch": 3.119838502477519, "grad_norm": 3.0695574283599854, "learning_rate": 9.90483424880791e-06, "loss": 0.2998, "num_input_tokens_seen": 36653760, "step": 17000 }, { "epoch": 3.120756102037071, "grad_norm": 11.588824272155762, "learning_rate": 9.904678698755383e-06, "loss": 0.4013, "num_input_tokens_seen": 36664288, "step": 17005 }, { "epoch": 3.1216737015966234, "grad_norm": 15.010364532470703, "learning_rate": 9.90452302290545e-06, "loss": 0.379, "num_input_tokens_seen": 36676192, "step": 17010 }, { "epoch": 3.1225913011561754, "grad_norm": 0.9338548183441162, "learning_rate": 9.904367221262103e-06, "loss": 0.3683, "num_input_tokens_seen": 36686656, "step": 17015 }, { "epoch": 3.1235089007157275, "grad_norm": 3.5622663497924805, "learning_rate": 9.904211293829339e-06, "loss": 0.4139, "num_input_tokens_seen": 36697504, "step": 17020 }, { "epoch": 3.12442650027528, "grad_norm": 2.0595407485961914, "learning_rate": 9.904055240611153e-06, "loss": 0.2721, "num_input_tokens_seen": 36707776, "step": 17025 }, { "epoch": 3.125344099834832, "grad_norm": 3.3988211154937744, "learning_rate": 9.903899061611553e-06, "loss": 0.2941, "num_input_tokens_seen": 36718784, "step": 17030 }, { "epoch": 3.126261699394384, "grad_norm": 5.191098213195801, "learning_rate": 9.903742756834543e-06, "loss": 0.2924, "num_input_tokens_seen": 36729344, "step": 17035 }, { "epoch": 3.1271792989539366, "grad_norm": 3.28071928024292, "learning_rate": 9.90358632628413e-06, "loss": 0.2769, "num_input_tokens_seen": 36739712, "step": 17040 }, { "epoch": 3.1280968985134887, "grad_norm": 6.899115562438965, "learning_rate": 9.903429769964326e-06, "loss": 0.2807, "num_input_tokens_seen": 36749664, "step": 17045 }, { "epoch": 3.1290144980730408, "grad_norm": 6.659373760223389, "learning_rate": 9.90327308787915e-06, "loss": 0.3631, "num_input_tokens_seen": 36759392, "step": 17050 }, { "epoch": 3.1299320976325933, "grad_norm": 3.3576772212982178, "learning_rate": 9.903116280032618e-06, "loss": 0.4674, "num_input_tokens_seen": 36769760, "step": 17055 }, { "epoch": 3.1308496971921453, "grad_norm": 1.9925400018692017, "learning_rate": 9.902959346428753e-06, "loss": 0.1932, "num_input_tokens_seen": 36780704, "step": 17060 }, { "epoch": 3.1317672967516974, "grad_norm": 0.9805219769477844, "learning_rate": 9.90280228707158e-06, "loss": 0.3808, "num_input_tokens_seen": 36792832, "step": 17065 }, { "epoch": 3.13268489631125, "grad_norm": 3.26008939743042, "learning_rate": 9.902645101965127e-06, "loss": 0.2815, "num_input_tokens_seen": 36801984, "step": 17070 }, { "epoch": 3.133602495870802, "grad_norm": 6.732960224151611, "learning_rate": 9.902487791113426e-06, "loss": 0.4798, "num_input_tokens_seen": 36813408, "step": 17075 }, { "epoch": 3.134520095430354, "grad_norm": 3.2403132915496826, "learning_rate": 9.902330354520511e-06, "loss": 0.1862, "num_input_tokens_seen": 36824928, "step": 17080 }, { "epoch": 3.1354376949899065, "grad_norm": 9.677046775817871, "learning_rate": 9.902172792190418e-06, "loss": 0.3125, "num_input_tokens_seen": 36836896, "step": 17085 }, { "epoch": 3.1363552945494586, "grad_norm": 2.2238736152648926, "learning_rate": 9.902015104127194e-06, "loss": 0.2464, "num_input_tokens_seen": 36847296, "step": 17090 }, { "epoch": 3.1372728941090107, "grad_norm": 0.6417091488838196, "learning_rate": 9.901857290334878e-06, "loss": 0.2505, "num_input_tokens_seen": 36858464, "step": 17095 }, { "epoch": 3.138190493668563, "grad_norm": 2.7701005935668945, "learning_rate": 9.901699350817519e-06, "loss": 0.2077, "num_input_tokens_seen": 36869312, "step": 17100 }, { "epoch": 3.1391080932281152, "grad_norm": 5.8774919509887695, "learning_rate": 9.901541285579171e-06, "loss": 0.2817, "num_input_tokens_seen": 36880960, "step": 17105 }, { "epoch": 3.1400256927876673, "grad_norm": 16.535627365112305, "learning_rate": 9.901383094623883e-06, "loss": 0.3298, "num_input_tokens_seen": 36890624, "step": 17110 }, { "epoch": 3.14094329234722, "grad_norm": 2.4094045162200928, "learning_rate": 9.901224777955718e-06, "loss": 0.3165, "num_input_tokens_seen": 36898432, "step": 17115 }, { "epoch": 3.141860891906772, "grad_norm": 4.268815040588379, "learning_rate": 9.901066335578732e-06, "loss": 0.332, "num_input_tokens_seen": 36909184, "step": 17120 }, { "epoch": 3.142778491466324, "grad_norm": 8.590883255004883, "learning_rate": 9.900907767496992e-06, "loss": 0.2814, "num_input_tokens_seen": 36917888, "step": 17125 }, { "epoch": 3.1436960910258764, "grad_norm": 7.303964614868164, "learning_rate": 9.900749073714562e-06, "loss": 0.349, "num_input_tokens_seen": 36929568, "step": 17130 }, { "epoch": 3.1446136905854285, "grad_norm": 4.349113464355469, "learning_rate": 9.900590254235513e-06, "loss": 0.238, "num_input_tokens_seen": 36941312, "step": 17135 }, { "epoch": 3.1455312901449806, "grad_norm": 2.04887318611145, "learning_rate": 9.90043130906392e-06, "loss": 0.3676, "num_input_tokens_seen": 36953152, "step": 17140 }, { "epoch": 3.146448889704533, "grad_norm": 16.341611862182617, "learning_rate": 9.90027223820386e-06, "loss": 0.2708, "num_input_tokens_seen": 36964416, "step": 17145 }, { "epoch": 3.147366489264085, "grad_norm": 2.3034181594848633, "learning_rate": 9.90011304165941e-06, "loss": 0.2213, "num_input_tokens_seen": 36976256, "step": 17150 }, { "epoch": 3.148284088823637, "grad_norm": 10.7131929397583, "learning_rate": 9.899953719434655e-06, "loss": 0.6114, "num_input_tokens_seen": 36986432, "step": 17155 }, { "epoch": 3.1492016883831897, "grad_norm": 7.6733903884887695, "learning_rate": 9.899794271533684e-06, "loss": 0.2874, "num_input_tokens_seen": 36996128, "step": 17160 }, { "epoch": 3.1501192879427418, "grad_norm": 2.5791077613830566, "learning_rate": 9.899634697960582e-06, "loss": 0.2784, "num_input_tokens_seen": 37007744, "step": 17165 }, { "epoch": 3.151036887502294, "grad_norm": 4.612555980682373, "learning_rate": 9.899474998719443e-06, "loss": 0.2558, "num_input_tokens_seen": 37017824, "step": 17170 }, { "epoch": 3.1519544870618463, "grad_norm": 8.352150917053223, "learning_rate": 9.899315173814366e-06, "loss": 0.2664, "num_input_tokens_seen": 37029280, "step": 17175 }, { "epoch": 3.1528720866213984, "grad_norm": 4.383173942565918, "learning_rate": 9.899155223249445e-06, "loss": 0.2575, "num_input_tokens_seen": 37040000, "step": 17180 }, { "epoch": 3.1537896861809505, "grad_norm": 2.3900630474090576, "learning_rate": 9.898995147028786e-06, "loss": 0.4103, "num_input_tokens_seen": 37050848, "step": 17185 }, { "epoch": 3.154707285740503, "grad_norm": 4.591903209686279, "learning_rate": 9.898834945156497e-06, "loss": 0.3219, "num_input_tokens_seen": 37062080, "step": 17190 }, { "epoch": 3.155624885300055, "grad_norm": 3.595571517944336, "learning_rate": 9.898674617636684e-06, "loss": 0.3595, "num_input_tokens_seen": 37072576, "step": 17195 }, { "epoch": 3.156542484859607, "grad_norm": 4.883039951324463, "learning_rate": 9.898514164473456e-06, "loss": 0.4791, "num_input_tokens_seen": 37083616, "step": 17200 }, { "epoch": 3.1574600844191596, "grad_norm": 6.125211238861084, "learning_rate": 9.898353585670934e-06, "loss": 0.2586, "num_input_tokens_seen": 37094080, "step": 17205 }, { "epoch": 3.1583776839787117, "grad_norm": 1.0704032182693481, "learning_rate": 9.898192881233233e-06, "loss": 0.2886, "num_input_tokens_seen": 37103680, "step": 17210 }, { "epoch": 3.1592952835382637, "grad_norm": 1.7861889600753784, "learning_rate": 9.898032051164478e-06, "loss": 0.3734, "num_input_tokens_seen": 37115872, "step": 17215 }, { "epoch": 3.1602128830978162, "grad_norm": 2.200995683670044, "learning_rate": 9.897871095468792e-06, "loss": 0.2467, "num_input_tokens_seen": 37126592, "step": 17220 }, { "epoch": 3.1611304826573683, "grad_norm": 9.566085815429688, "learning_rate": 9.897710014150301e-06, "loss": 0.3691, "num_input_tokens_seen": 37137440, "step": 17225 }, { "epoch": 3.1620480822169204, "grad_norm": 3.7019894123077393, "learning_rate": 9.897548807213142e-06, "loss": 0.2846, "num_input_tokens_seen": 37148608, "step": 17230 }, { "epoch": 3.162965681776473, "grad_norm": 1.4592446088790894, "learning_rate": 9.897387474661443e-06, "loss": 0.2923, "num_input_tokens_seen": 37158784, "step": 17235 }, { "epoch": 3.163883281336025, "grad_norm": 3.3860292434692383, "learning_rate": 9.897226016499348e-06, "loss": 0.4033, "num_input_tokens_seen": 37169792, "step": 17240 }, { "epoch": 3.164800880895577, "grad_norm": 1.5559591054916382, "learning_rate": 9.897064432730996e-06, "loss": 0.2607, "num_input_tokens_seen": 37181056, "step": 17245 }, { "epoch": 3.1657184804551295, "grad_norm": 1.869962453842163, "learning_rate": 9.89690272336053e-06, "loss": 0.1634, "num_input_tokens_seen": 37191616, "step": 17250 }, { "epoch": 3.1666360800146816, "grad_norm": 4.862576007843018, "learning_rate": 9.8967408883921e-06, "loss": 0.3157, "num_input_tokens_seen": 37201728, "step": 17255 }, { "epoch": 3.1675536795742336, "grad_norm": 3.642897367477417, "learning_rate": 9.896578927829854e-06, "loss": 0.2944, "num_input_tokens_seen": 37213120, "step": 17260 }, { "epoch": 3.168471279133786, "grad_norm": 9.29061508178711, "learning_rate": 9.896416841677947e-06, "loss": 0.3487, "num_input_tokens_seen": 37224416, "step": 17265 }, { "epoch": 3.169388878693338, "grad_norm": 5.090692520141602, "learning_rate": 9.896254629940539e-06, "loss": 0.272, "num_input_tokens_seen": 37234848, "step": 17270 }, { "epoch": 3.1703064782528902, "grad_norm": 9.236541748046875, "learning_rate": 9.896092292621787e-06, "loss": 0.3499, "num_input_tokens_seen": 37245696, "step": 17275 }, { "epoch": 3.1712240778124428, "grad_norm": 12.926182746887207, "learning_rate": 9.895929829725856e-06, "loss": 0.1927, "num_input_tokens_seen": 37256480, "step": 17280 }, { "epoch": 3.172141677371995, "grad_norm": 3.317878484725952, "learning_rate": 9.895767241256912e-06, "loss": 0.2301, "num_input_tokens_seen": 37266880, "step": 17285 }, { "epoch": 3.173059276931547, "grad_norm": 3.5184974670410156, "learning_rate": 9.895604527219127e-06, "loss": 0.301, "num_input_tokens_seen": 37276576, "step": 17290 }, { "epoch": 3.1739768764910994, "grad_norm": 2.423396348953247, "learning_rate": 9.895441687616673e-06, "loss": 0.2575, "num_input_tokens_seen": 37287680, "step": 17295 }, { "epoch": 3.1748944760506514, "grad_norm": 10.075455665588379, "learning_rate": 9.895278722453728e-06, "loss": 0.3388, "num_input_tokens_seen": 37299232, "step": 17300 }, { "epoch": 3.1758120756102035, "grad_norm": 2.6884303092956543, "learning_rate": 9.895115631734469e-06, "loss": 0.3444, "num_input_tokens_seen": 37310656, "step": 17305 }, { "epoch": 3.176729675169756, "grad_norm": 1.0285178422927856, "learning_rate": 9.894952415463082e-06, "loss": 0.3981, "num_input_tokens_seen": 37321696, "step": 17310 }, { "epoch": 3.177647274729308, "grad_norm": 12.541731834411621, "learning_rate": 9.894789073643752e-06, "loss": 0.2902, "num_input_tokens_seen": 37331808, "step": 17315 }, { "epoch": 3.17856487428886, "grad_norm": 8.578080177307129, "learning_rate": 9.894625606280668e-06, "loss": 0.292, "num_input_tokens_seen": 37342528, "step": 17320 }, { "epoch": 3.1794824738484126, "grad_norm": 2.0547428131103516, "learning_rate": 9.894462013378024e-06, "loss": 0.3624, "num_input_tokens_seen": 37353088, "step": 17325 }, { "epoch": 3.1804000734079647, "grad_norm": 2.6374638080596924, "learning_rate": 9.894298294940015e-06, "loss": 0.2208, "num_input_tokens_seen": 37364160, "step": 17330 }, { "epoch": 3.1813176729675168, "grad_norm": 6.408619403839111, "learning_rate": 9.894134450970838e-06, "loss": 0.3173, "num_input_tokens_seen": 37374560, "step": 17335 }, { "epoch": 3.1822352725270693, "grad_norm": 2.133807897567749, "learning_rate": 9.8939704814747e-06, "loss": 0.3052, "num_input_tokens_seen": 37386016, "step": 17340 }, { "epoch": 3.1831528720866213, "grad_norm": 3.0037617683410645, "learning_rate": 9.893806386455804e-06, "loss": 0.2656, "num_input_tokens_seen": 37395648, "step": 17345 }, { "epoch": 3.1840704716461734, "grad_norm": 8.664044380187988, "learning_rate": 9.893642165918358e-06, "loss": 0.377, "num_input_tokens_seen": 37405536, "step": 17350 }, { "epoch": 3.184988071205726, "grad_norm": 3.5991716384887695, "learning_rate": 9.893477819866574e-06, "loss": 0.2893, "num_input_tokens_seen": 37415744, "step": 17355 }, { "epoch": 3.185905670765278, "grad_norm": 7.992667198181152, "learning_rate": 9.893313348304669e-06, "loss": 0.3915, "num_input_tokens_seen": 37427072, "step": 17360 }, { "epoch": 3.18682327032483, "grad_norm": 0.8606137633323669, "learning_rate": 9.893148751236861e-06, "loss": 0.1327, "num_input_tokens_seen": 37438176, "step": 17365 }, { "epoch": 3.1877408698843825, "grad_norm": 3.6801326274871826, "learning_rate": 9.892984028667372e-06, "loss": 0.3488, "num_input_tokens_seen": 37448928, "step": 17370 }, { "epoch": 3.1886584694439346, "grad_norm": 3.452545166015625, "learning_rate": 9.892819180600426e-06, "loss": 0.2438, "num_input_tokens_seen": 37459744, "step": 17375 }, { "epoch": 3.189576069003487, "grad_norm": 6.6734161376953125, "learning_rate": 9.89265420704025e-06, "loss": 0.2968, "num_input_tokens_seen": 37470688, "step": 17380 }, { "epoch": 3.190493668563039, "grad_norm": 2.815534830093384, "learning_rate": 9.892489107991077e-06, "loss": 0.3092, "num_input_tokens_seen": 37481056, "step": 17385 }, { "epoch": 3.1914112681225912, "grad_norm": 1.670493483543396, "learning_rate": 9.89232388345714e-06, "loss": 0.3361, "num_input_tokens_seen": 37491872, "step": 17390 }, { "epoch": 3.1923288676821437, "grad_norm": 6.832144737243652, "learning_rate": 9.89215853344268e-06, "loss": 0.2612, "num_input_tokens_seen": 37503168, "step": 17395 }, { "epoch": 3.193246467241696, "grad_norm": 2.8687875270843506, "learning_rate": 9.891993057951935e-06, "loss": 0.2813, "num_input_tokens_seen": 37512736, "step": 17400 }, { "epoch": 3.194164066801248, "grad_norm": 10.815032958984375, "learning_rate": 9.891827456989149e-06, "loss": 0.2075, "num_input_tokens_seen": 37522976, "step": 17405 }, { "epoch": 3.1950816663608004, "grad_norm": 15.106369972229004, "learning_rate": 9.89166173055857e-06, "loss": 0.2964, "num_input_tokens_seen": 37533536, "step": 17410 }, { "epoch": 3.1959992659203524, "grad_norm": 13.327372550964355, "learning_rate": 9.89149587866445e-06, "loss": 0.5348, "num_input_tokens_seen": 37544192, "step": 17415 }, { "epoch": 3.1969168654799045, "grad_norm": 28.06230926513672, "learning_rate": 9.891329901311043e-06, "loss": 0.5784, "num_input_tokens_seen": 37553536, "step": 17420 }, { "epoch": 3.197834465039457, "grad_norm": 7.277583122253418, "learning_rate": 9.891163798502603e-06, "loss": 0.2062, "num_input_tokens_seen": 37562816, "step": 17425 }, { "epoch": 3.198752064599009, "grad_norm": 12.449174880981445, "learning_rate": 9.890997570243392e-06, "loss": 0.5569, "num_input_tokens_seen": 37572608, "step": 17430 }, { "epoch": 3.199669664158561, "grad_norm": 8.773885726928711, "learning_rate": 9.890831216537674e-06, "loss": 0.253, "num_input_tokens_seen": 37583360, "step": 17435 }, { "epoch": 3.2005872637181136, "grad_norm": 1.4836342334747314, "learning_rate": 9.890664737389718e-06, "loss": 0.3046, "num_input_tokens_seen": 37594528, "step": 17440 }, { "epoch": 3.2015048632776657, "grad_norm": 4.067760944366455, "learning_rate": 9.890498132803788e-06, "loss": 0.3327, "num_input_tokens_seen": 37605568, "step": 17445 }, { "epoch": 3.2024224628372178, "grad_norm": 4.501507759094238, "learning_rate": 9.89033140278416e-06, "loss": 0.2692, "num_input_tokens_seen": 37615776, "step": 17450 }, { "epoch": 3.2033400623967703, "grad_norm": 6.409521102905273, "learning_rate": 9.890164547335115e-06, "loss": 0.4337, "num_input_tokens_seen": 37627104, "step": 17455 }, { "epoch": 3.2042576619563223, "grad_norm": 2.902031183242798, "learning_rate": 9.889997566460926e-06, "loss": 0.2965, "num_input_tokens_seen": 37638048, "step": 17460 }, { "epoch": 3.2051752615158744, "grad_norm": 4.108881950378418, "learning_rate": 9.889830460165877e-06, "loss": 0.2731, "num_input_tokens_seen": 37649120, "step": 17465 }, { "epoch": 3.206092861075427, "grad_norm": 2.7612569332122803, "learning_rate": 9.889663228454257e-06, "loss": 0.2687, "num_input_tokens_seen": 37660096, "step": 17470 }, { "epoch": 3.207010460634979, "grad_norm": 1.2318984270095825, "learning_rate": 9.889495871330352e-06, "loss": 0.3949, "num_input_tokens_seen": 37670208, "step": 17475 }, { "epoch": 3.207928060194531, "grad_norm": 3.3620057106018066, "learning_rate": 9.889328388798459e-06, "loss": 0.1485, "num_input_tokens_seen": 37680704, "step": 17480 }, { "epoch": 3.2088456597540835, "grad_norm": 4.72150993347168, "learning_rate": 9.889160780862868e-06, "loss": 0.346, "num_input_tokens_seen": 37691936, "step": 17485 }, { "epoch": 3.2097632593136356, "grad_norm": 4.996923923492432, "learning_rate": 9.88899304752788e-06, "loss": 0.2634, "num_input_tokens_seen": 37701728, "step": 17490 }, { "epoch": 3.2106808588731877, "grad_norm": 6.3317952156066895, "learning_rate": 9.888825188797799e-06, "loss": 0.5063, "num_input_tokens_seen": 37712960, "step": 17495 }, { "epoch": 3.21159845843274, "grad_norm": 6.567049980163574, "learning_rate": 9.888657204676928e-06, "loss": 0.3611, "num_input_tokens_seen": 37722368, "step": 17500 }, { "epoch": 3.2125160579922922, "grad_norm": 6.397927761077881, "learning_rate": 9.888489095169578e-06, "loss": 0.2692, "num_input_tokens_seen": 37733056, "step": 17505 }, { "epoch": 3.2134336575518443, "grad_norm": 3.4493319988250732, "learning_rate": 9.888320860280058e-06, "loss": 0.31, "num_input_tokens_seen": 37744320, "step": 17510 }, { "epoch": 3.214351257111397, "grad_norm": 11.090240478515625, "learning_rate": 9.888152500012683e-06, "loss": 0.334, "num_input_tokens_seen": 37754720, "step": 17515 }, { "epoch": 3.215268856670949, "grad_norm": 7.131779670715332, "learning_rate": 9.887984014371774e-06, "loss": 0.2867, "num_input_tokens_seen": 37765696, "step": 17520 }, { "epoch": 3.216186456230501, "grad_norm": 3.3578033447265625, "learning_rate": 9.88781540336165e-06, "loss": 0.4254, "num_input_tokens_seen": 37777696, "step": 17525 }, { "epoch": 3.2171040557900534, "grad_norm": 7.036109924316406, "learning_rate": 9.887646666986637e-06, "loss": 0.3083, "num_input_tokens_seen": 37788640, "step": 17530 }, { "epoch": 3.2180216553496055, "grad_norm": 4.89315128326416, "learning_rate": 9.887477805251062e-06, "loss": 0.4262, "num_input_tokens_seen": 37800160, "step": 17535 }, { "epoch": 3.2189392549091576, "grad_norm": 7.510410785675049, "learning_rate": 9.887308818159256e-06, "loss": 0.2252, "num_input_tokens_seen": 37811296, "step": 17540 }, { "epoch": 3.21985685446871, "grad_norm": 7.4804816246032715, "learning_rate": 9.887139705715553e-06, "loss": 0.4145, "num_input_tokens_seen": 37820768, "step": 17545 }, { "epoch": 3.220774454028262, "grad_norm": 8.115212440490723, "learning_rate": 9.88697046792429e-06, "loss": 0.2877, "num_input_tokens_seen": 37831968, "step": 17550 }, { "epoch": 3.221692053587814, "grad_norm": 3.5274949073791504, "learning_rate": 9.886801104789811e-06, "loss": 0.246, "num_input_tokens_seen": 37841888, "step": 17555 }, { "epoch": 3.2226096531473667, "grad_norm": 1.7519222497940063, "learning_rate": 9.886631616316457e-06, "loss": 0.2774, "num_input_tokens_seen": 37852736, "step": 17560 }, { "epoch": 3.2235272527069188, "grad_norm": 3.4015438556671143, "learning_rate": 9.886462002508575e-06, "loss": 0.3446, "num_input_tokens_seen": 37863936, "step": 17565 }, { "epoch": 3.224444852266471, "grad_norm": 3.5153045654296875, "learning_rate": 9.886292263370516e-06, "loss": 0.3652, "num_input_tokens_seen": 37873952, "step": 17570 }, { "epoch": 3.2253624518260233, "grad_norm": 5.441950798034668, "learning_rate": 9.886122398906633e-06, "loss": 0.2501, "num_input_tokens_seen": 37885408, "step": 17575 }, { "epoch": 3.2262800513855754, "grad_norm": 2.4776906967163086, "learning_rate": 9.885952409121284e-06, "loss": 0.2934, "num_input_tokens_seen": 37895424, "step": 17580 }, { "epoch": 3.2271976509451274, "grad_norm": 2.646423101425171, "learning_rate": 9.88578229401883e-06, "loss": 0.2225, "num_input_tokens_seen": 37905920, "step": 17585 }, { "epoch": 3.22811525050468, "grad_norm": 3.8705267906188965, "learning_rate": 9.885612053603628e-06, "loss": 0.3169, "num_input_tokens_seen": 37917440, "step": 17590 }, { "epoch": 3.229032850064232, "grad_norm": 5.62960958480835, "learning_rate": 9.885441687880052e-06, "loss": 0.2287, "num_input_tokens_seen": 37928672, "step": 17595 }, { "epoch": 3.229950449623784, "grad_norm": 7.214649200439453, "learning_rate": 9.885271196852469e-06, "loss": 0.1618, "num_input_tokens_seen": 37938752, "step": 17600 }, { "epoch": 3.2308680491833366, "grad_norm": 2.9640331268310547, "learning_rate": 9.885100580525248e-06, "loss": 0.2471, "num_input_tokens_seen": 37947744, "step": 17605 }, { "epoch": 3.2317856487428887, "grad_norm": 18.361799240112305, "learning_rate": 9.884929838902771e-06, "loss": 0.5175, "num_input_tokens_seen": 37959200, "step": 17610 }, { "epoch": 3.2327032483024407, "grad_norm": 2.609757900238037, "learning_rate": 9.884758971989415e-06, "loss": 0.3169, "num_input_tokens_seen": 37969504, "step": 17615 }, { "epoch": 3.233620847861993, "grad_norm": 3.617673397064209, "learning_rate": 9.88458797978956e-06, "loss": 0.2465, "num_input_tokens_seen": 37980448, "step": 17620 }, { "epoch": 3.2345384474215453, "grad_norm": 4.9300537109375, "learning_rate": 9.884416862307596e-06, "loss": 0.3269, "num_input_tokens_seen": 37991392, "step": 17625 }, { "epoch": 3.2354560469810973, "grad_norm": 2.3570713996887207, "learning_rate": 9.884245619547908e-06, "loss": 0.3689, "num_input_tokens_seen": 38001984, "step": 17630 }, { "epoch": 3.23637364654065, "grad_norm": 2.6846609115600586, "learning_rate": 9.88407425151489e-06, "loss": 0.3883, "num_input_tokens_seen": 38013088, "step": 17635 }, { "epoch": 3.237291246100202, "grad_norm": 3.0131757259368896, "learning_rate": 9.883902758212938e-06, "loss": 0.3098, "num_input_tokens_seen": 38024576, "step": 17640 }, { "epoch": 3.238208845659754, "grad_norm": 4.788300037384033, "learning_rate": 9.883731139646449e-06, "loss": 0.3547, "num_input_tokens_seen": 38035712, "step": 17645 }, { "epoch": 3.2391264452193065, "grad_norm": 2.5594701766967773, "learning_rate": 9.883559395819824e-06, "loss": 0.2904, "num_input_tokens_seen": 38047104, "step": 17650 }, { "epoch": 3.2400440447788585, "grad_norm": 3.383946418762207, "learning_rate": 9.883387526737471e-06, "loss": 0.2389, "num_input_tokens_seen": 38057280, "step": 17655 }, { "epoch": 3.2409616443384106, "grad_norm": 2.3578386306762695, "learning_rate": 9.883215532403796e-06, "loss": 0.3428, "num_input_tokens_seen": 38067808, "step": 17660 }, { "epoch": 3.241879243897963, "grad_norm": 3.9620442390441895, "learning_rate": 9.88304341282321e-06, "loss": 0.4851, "num_input_tokens_seen": 38077536, "step": 17665 }, { "epoch": 3.242796843457515, "grad_norm": 1.2390491962432861, "learning_rate": 9.88287116800013e-06, "loss": 0.3007, "num_input_tokens_seen": 38088448, "step": 17670 }, { "epoch": 3.2437144430170672, "grad_norm": 1.2705003023147583, "learning_rate": 9.882698797938972e-06, "loss": 0.2328, "num_input_tokens_seen": 38099360, "step": 17675 }, { "epoch": 3.2446320425766197, "grad_norm": 4.695685863494873, "learning_rate": 9.882526302644157e-06, "loss": 0.2729, "num_input_tokens_seen": 38110496, "step": 17680 }, { "epoch": 3.245549642136172, "grad_norm": 3.4652934074401855, "learning_rate": 9.88235368212011e-06, "loss": 0.2975, "num_input_tokens_seen": 38120576, "step": 17685 }, { "epoch": 3.246467241695724, "grad_norm": 2.389538288116455, "learning_rate": 9.882180936371257e-06, "loss": 0.3154, "num_input_tokens_seen": 38132256, "step": 17690 }, { "epoch": 3.2473848412552764, "grad_norm": 1.9567821025848389, "learning_rate": 9.882008065402031e-06, "loss": 0.2856, "num_input_tokens_seen": 38142624, "step": 17695 }, { "epoch": 3.2483024408148284, "grad_norm": 4.190267086029053, "learning_rate": 9.881835069216864e-06, "loss": 0.2853, "num_input_tokens_seen": 38154304, "step": 17700 }, { "epoch": 3.2492200403743805, "grad_norm": 3.700037956237793, "learning_rate": 9.881661947820194e-06, "loss": 0.3699, "num_input_tokens_seen": 38164800, "step": 17705 }, { "epoch": 3.250137639933933, "grad_norm": 16.950977325439453, "learning_rate": 9.88148870121646e-06, "loss": 0.3191, "num_input_tokens_seen": 38174752, "step": 17710 }, { "epoch": 3.251055239493485, "grad_norm": 2.4570364952087402, "learning_rate": 9.881315329410108e-06, "loss": 0.2724, "num_input_tokens_seen": 38184192, "step": 17715 }, { "epoch": 3.251972839053037, "grad_norm": 10.45431137084961, "learning_rate": 9.88114183240558e-06, "loss": 0.3605, "num_input_tokens_seen": 38194624, "step": 17720 }, { "epoch": 3.2528904386125896, "grad_norm": 2.222886323928833, "learning_rate": 9.880968210207333e-06, "loss": 0.3447, "num_input_tokens_seen": 38205344, "step": 17725 }, { "epoch": 3.2538080381721417, "grad_norm": 12.560565948486328, "learning_rate": 9.880794462819814e-06, "loss": 0.2826, "num_input_tokens_seen": 38215328, "step": 17730 }, { "epoch": 3.2547256377316938, "grad_norm": 6.265956878662109, "learning_rate": 9.880620590247482e-06, "loss": 0.249, "num_input_tokens_seen": 38227616, "step": 17735 }, { "epoch": 3.2556432372912463, "grad_norm": 6.417857646942139, "learning_rate": 9.880446592494795e-06, "loss": 0.3207, "num_input_tokens_seen": 38238816, "step": 17740 }, { "epoch": 3.2565608368507983, "grad_norm": 2.2127037048339844, "learning_rate": 9.880272469566219e-06, "loss": 0.3438, "num_input_tokens_seen": 38250016, "step": 17745 }, { "epoch": 3.2574784364103504, "grad_norm": 3.335662364959717, "learning_rate": 9.880098221466217e-06, "loss": 0.172, "num_input_tokens_seen": 38259584, "step": 17750 }, { "epoch": 3.258396035969903, "grad_norm": 1.5089058876037598, "learning_rate": 9.879923848199257e-06, "loss": 0.3941, "num_input_tokens_seen": 38269376, "step": 17755 }, { "epoch": 3.259313635529455, "grad_norm": 3.623340606689453, "learning_rate": 9.879749349769816e-06, "loss": 0.2696, "num_input_tokens_seen": 38279552, "step": 17760 }, { "epoch": 3.260231235089007, "grad_norm": 1.456007480621338, "learning_rate": 9.879574726182369e-06, "loss": 0.5042, "num_input_tokens_seen": 38290304, "step": 17765 }, { "epoch": 3.2611488346485595, "grad_norm": 2.304502010345459, "learning_rate": 9.879399977441389e-06, "loss": 0.1869, "num_input_tokens_seen": 38301312, "step": 17770 }, { "epoch": 3.2620664342081116, "grad_norm": 5.8124098777771, "learning_rate": 9.879225103551364e-06, "loss": 0.3728, "num_input_tokens_seen": 38311712, "step": 17775 }, { "epoch": 3.2629840337676637, "grad_norm": 3.64219069480896, "learning_rate": 9.879050104516776e-06, "loss": 0.269, "num_input_tokens_seen": 38323200, "step": 17780 }, { "epoch": 3.263901633327216, "grad_norm": 8.761560440063477, "learning_rate": 9.878874980342116e-06, "loss": 0.2193, "num_input_tokens_seen": 38334080, "step": 17785 }, { "epoch": 3.2648192328867682, "grad_norm": 3.3114612102508545, "learning_rate": 9.878699731031873e-06, "loss": 0.2372, "num_input_tokens_seen": 38344928, "step": 17790 }, { "epoch": 3.2657368324463203, "grad_norm": 4.02592658996582, "learning_rate": 9.878524356590545e-06, "loss": 0.2672, "num_input_tokens_seen": 38356576, "step": 17795 }, { "epoch": 3.266654432005873, "grad_norm": 3.0812463760375977, "learning_rate": 9.878348857022626e-06, "loss": 0.3049, "num_input_tokens_seen": 38367232, "step": 17800 }, { "epoch": 3.267572031565425, "grad_norm": 3.3565940856933594, "learning_rate": 9.87817323233262e-06, "loss": 0.3791, "num_input_tokens_seen": 38378208, "step": 17805 }, { "epoch": 3.268489631124977, "grad_norm": 2.2316606044769287, "learning_rate": 9.877997482525032e-06, "loss": 0.2466, "num_input_tokens_seen": 38388576, "step": 17810 }, { "epoch": 3.2694072306845294, "grad_norm": 5.272356986999512, "learning_rate": 9.87782160760437e-06, "loss": 0.2802, "num_input_tokens_seen": 38400032, "step": 17815 }, { "epoch": 3.2703248302440815, "grad_norm": 2.4043054580688477, "learning_rate": 9.877645607575142e-06, "loss": 0.2083, "num_input_tokens_seen": 38411456, "step": 17820 }, { "epoch": 3.2712424298036336, "grad_norm": 3.6583614349365234, "learning_rate": 9.877469482441864e-06, "loss": 0.3429, "num_input_tokens_seen": 38422560, "step": 17825 }, { "epoch": 3.272160029363186, "grad_norm": 6.433937072753906, "learning_rate": 9.877293232209053e-06, "loss": 0.3314, "num_input_tokens_seen": 38432896, "step": 17830 }, { "epoch": 3.273077628922738, "grad_norm": 2.534169912338257, "learning_rate": 9.877116856881231e-06, "loss": 0.3117, "num_input_tokens_seen": 38443840, "step": 17835 }, { "epoch": 3.27399522848229, "grad_norm": 20.479537963867188, "learning_rate": 9.87694035646292e-06, "loss": 0.3509, "num_input_tokens_seen": 38455168, "step": 17840 }, { "epoch": 3.2749128280418427, "grad_norm": 7.317805767059326, "learning_rate": 9.876763730958644e-06, "loss": 0.3126, "num_input_tokens_seen": 38466464, "step": 17845 }, { "epoch": 3.2758304276013948, "grad_norm": 17.812095642089844, "learning_rate": 9.876586980372941e-06, "loss": 0.3189, "num_input_tokens_seen": 38477856, "step": 17850 }, { "epoch": 3.276748027160947, "grad_norm": 6.465087890625, "learning_rate": 9.876410104710338e-06, "loss": 0.2897, "num_input_tokens_seen": 38488768, "step": 17855 }, { "epoch": 3.2776656267204993, "grad_norm": 11.262904167175293, "learning_rate": 9.876233103975375e-06, "loss": 0.2342, "num_input_tokens_seen": 38500128, "step": 17860 }, { "epoch": 3.2785832262800514, "grad_norm": 3.1100399494171143, "learning_rate": 9.87605597817259e-06, "loss": 0.2751, "num_input_tokens_seen": 38511488, "step": 17865 }, { "epoch": 3.2795008258396035, "grad_norm": 5.253654956817627, "learning_rate": 9.875878727306525e-06, "loss": 0.3648, "num_input_tokens_seen": 38523072, "step": 17870 }, { "epoch": 3.280418425399156, "grad_norm": 4.439220905303955, "learning_rate": 9.875701351381729e-06, "loss": 0.2483, "num_input_tokens_seen": 38532480, "step": 17875 }, { "epoch": 3.281336024958708, "grad_norm": 7.480441093444824, "learning_rate": 9.875523850402748e-06, "loss": 0.3894, "num_input_tokens_seen": 38543904, "step": 17880 }, { "epoch": 3.28225362451826, "grad_norm": 1.4001319408416748, "learning_rate": 9.875346224374138e-06, "loss": 0.2469, "num_input_tokens_seen": 38554624, "step": 17885 }, { "epoch": 3.2831712240778126, "grad_norm": 12.595582962036133, "learning_rate": 9.875168473300453e-06, "loss": 0.3581, "num_input_tokens_seen": 38565440, "step": 17890 }, { "epoch": 3.2840888236373647, "grad_norm": 2.2975571155548096, "learning_rate": 9.874990597186253e-06, "loss": 0.2554, "num_input_tokens_seen": 38576288, "step": 17895 }, { "epoch": 3.2850064231969167, "grad_norm": 1.946682095527649, "learning_rate": 9.874812596036099e-06, "loss": 0.4185, "num_input_tokens_seen": 38587360, "step": 17900 }, { "epoch": 3.285924022756469, "grad_norm": 2.944983720779419, "learning_rate": 9.874634469854558e-06, "loss": 0.3076, "num_input_tokens_seen": 38598240, "step": 17905 }, { "epoch": 3.2868416223160213, "grad_norm": 3.6271560192108154, "learning_rate": 9.874456218646198e-06, "loss": 0.368, "num_input_tokens_seen": 38608480, "step": 17910 }, { "epoch": 3.2877592218755733, "grad_norm": 3.6890037059783936, "learning_rate": 9.874277842415591e-06, "loss": 0.2235, "num_input_tokens_seen": 38619168, "step": 17915 }, { "epoch": 3.288676821435126, "grad_norm": 3.0285181999206543, "learning_rate": 9.87409934116731e-06, "loss": 0.3343, "num_input_tokens_seen": 38631136, "step": 17920 }, { "epoch": 3.289594420994678, "grad_norm": 9.13481616973877, "learning_rate": 9.873920714905936e-06, "loss": 0.4142, "num_input_tokens_seen": 38641600, "step": 17925 }, { "epoch": 3.29051202055423, "grad_norm": 6.631096839904785, "learning_rate": 9.87374196363605e-06, "loss": 0.2491, "num_input_tokens_seen": 38652128, "step": 17930 }, { "epoch": 3.2914296201137825, "grad_norm": 3.765636682510376, "learning_rate": 9.873563087362236e-06, "loss": 0.3143, "num_input_tokens_seen": 38661888, "step": 17935 }, { "epoch": 3.2923472196733345, "grad_norm": 3.6326494216918945, "learning_rate": 9.873384086089084e-06, "loss": 0.3038, "num_input_tokens_seen": 38672928, "step": 17940 }, { "epoch": 3.2932648192328866, "grad_norm": 5.021527290344238, "learning_rate": 9.87320495982118e-06, "loss": 0.3305, "num_input_tokens_seen": 38684096, "step": 17945 }, { "epoch": 3.294182418792439, "grad_norm": 2.028531312942505, "learning_rate": 9.873025708563123e-06, "loss": 0.2887, "num_input_tokens_seen": 38695104, "step": 17950 }, { "epoch": 3.295100018351991, "grad_norm": 2.2436368465423584, "learning_rate": 9.872846332319508e-06, "loss": 0.2385, "num_input_tokens_seen": 38705984, "step": 17955 }, { "epoch": 3.2960176179115432, "grad_norm": 8.113212585449219, "learning_rate": 9.87266683109494e-06, "loss": 0.2925, "num_input_tokens_seen": 38717824, "step": 17960 }, { "epoch": 3.2969352174710957, "grad_norm": 7.547194957733154, "learning_rate": 9.872487204894018e-06, "loss": 0.4551, "num_input_tokens_seen": 38729088, "step": 17965 }, { "epoch": 3.297852817030648, "grad_norm": 7.246298313140869, "learning_rate": 9.872307453721348e-06, "loss": 0.3508, "num_input_tokens_seen": 38739392, "step": 17970 }, { "epoch": 3.2987704165902, "grad_norm": 7.861620903015137, "learning_rate": 9.872127577581547e-06, "loss": 0.6234, "num_input_tokens_seen": 38750496, "step": 17975 }, { "epoch": 3.2996880161497524, "grad_norm": 2.5828189849853516, "learning_rate": 9.871947576479223e-06, "loss": 0.2324, "num_input_tokens_seen": 38759200, "step": 17980 }, { "epoch": 3.3006056157093044, "grad_norm": 3.4932351112365723, "learning_rate": 9.871767450418995e-06, "loss": 0.2635, "num_input_tokens_seen": 38771008, "step": 17985 }, { "epoch": 3.3015232152688565, "grad_norm": 6.2978997230529785, "learning_rate": 9.871587199405483e-06, "loss": 0.3038, "num_input_tokens_seen": 38780192, "step": 17990 }, { "epoch": 3.302440814828409, "grad_norm": 4.588089466094971, "learning_rate": 9.871406823443308e-06, "loss": 0.3186, "num_input_tokens_seen": 38789696, "step": 17995 }, { "epoch": 3.303358414387961, "grad_norm": 3.392688751220703, "learning_rate": 9.8712263225371e-06, "loss": 0.2134, "num_input_tokens_seen": 38800512, "step": 18000 }, { "epoch": 3.304276013947513, "grad_norm": 2.1398494243621826, "learning_rate": 9.871045696691484e-06, "loss": 0.2752, "num_input_tokens_seen": 38810912, "step": 18005 }, { "epoch": 3.3051936135070656, "grad_norm": 8.851873397827148, "learning_rate": 9.870864945911097e-06, "loss": 0.3988, "num_input_tokens_seen": 38820864, "step": 18010 }, { "epoch": 3.3061112130666177, "grad_norm": 6.387499809265137, "learning_rate": 9.870684070200574e-06, "loss": 0.387, "num_input_tokens_seen": 38831584, "step": 18015 }, { "epoch": 3.3070288126261698, "grad_norm": 2.8968005180358887, "learning_rate": 9.870503069564552e-06, "loss": 0.3374, "num_input_tokens_seen": 38842784, "step": 18020 }, { "epoch": 3.3079464121857223, "grad_norm": 9.877421379089355, "learning_rate": 9.870321944007674e-06, "loss": 0.3261, "num_input_tokens_seen": 38853504, "step": 18025 }, { "epoch": 3.3088640117452743, "grad_norm": 4.8619160652160645, "learning_rate": 9.870140693534589e-06, "loss": 0.2808, "num_input_tokens_seen": 38864192, "step": 18030 }, { "epoch": 3.3097816113048264, "grad_norm": 2.759209156036377, "learning_rate": 9.869959318149942e-06, "loss": 0.2825, "num_input_tokens_seen": 38873152, "step": 18035 }, { "epoch": 3.310699210864379, "grad_norm": 10.855318069458008, "learning_rate": 9.869777817858385e-06, "loss": 0.3471, "num_input_tokens_seen": 38883712, "step": 18040 }, { "epoch": 3.311616810423931, "grad_norm": 4.788239002227783, "learning_rate": 9.869596192664576e-06, "loss": 0.3859, "num_input_tokens_seen": 38893184, "step": 18045 }, { "epoch": 3.312534409983483, "grad_norm": 12.648698806762695, "learning_rate": 9.869414442573172e-06, "loss": 0.2422, "num_input_tokens_seen": 38902912, "step": 18050 }, { "epoch": 3.3134520095430355, "grad_norm": 1.3742250204086304, "learning_rate": 9.869232567588836e-06, "loss": 0.2556, "num_input_tokens_seen": 38915872, "step": 18055 }, { "epoch": 3.3143696091025876, "grad_norm": 2.7498111724853516, "learning_rate": 9.869050567716228e-06, "loss": 0.3608, "num_input_tokens_seen": 38928000, "step": 18060 }, { "epoch": 3.3152872086621397, "grad_norm": 2.9370715618133545, "learning_rate": 9.86886844296002e-06, "loss": 0.2887, "num_input_tokens_seen": 38937120, "step": 18065 }, { "epoch": 3.316204808221692, "grad_norm": 4.26263952255249, "learning_rate": 9.868686193324885e-06, "loss": 0.3666, "num_input_tokens_seen": 38947936, "step": 18070 }, { "epoch": 3.3171224077812442, "grad_norm": 1.6004384756088257, "learning_rate": 9.868503818815492e-06, "loss": 0.2534, "num_input_tokens_seen": 38960128, "step": 18075 }, { "epoch": 3.3180400073407963, "grad_norm": 11.750271797180176, "learning_rate": 9.868321319436522e-06, "loss": 0.3208, "num_input_tokens_seen": 38970240, "step": 18080 }, { "epoch": 3.318957606900349, "grad_norm": 2.981710195541382, "learning_rate": 9.868138695192658e-06, "loss": 0.3964, "num_input_tokens_seen": 38981344, "step": 18085 }, { "epoch": 3.319875206459901, "grad_norm": 2.2863168716430664, "learning_rate": 9.86795594608858e-06, "loss": 0.2312, "num_input_tokens_seen": 38992000, "step": 18090 }, { "epoch": 3.320792806019453, "grad_norm": 6.485623836517334, "learning_rate": 9.867773072128974e-06, "loss": 0.2461, "num_input_tokens_seen": 39003040, "step": 18095 }, { "epoch": 3.3217104055790054, "grad_norm": 13.073005676269531, "learning_rate": 9.867590073318536e-06, "loss": 0.2949, "num_input_tokens_seen": 39014112, "step": 18100 }, { "epoch": 3.3226280051385575, "grad_norm": 3.3104865550994873, "learning_rate": 9.867406949661956e-06, "loss": 0.3036, "num_input_tokens_seen": 39023584, "step": 18105 }, { "epoch": 3.3235456046981096, "grad_norm": 4.041023254394531, "learning_rate": 9.86722370116393e-06, "loss": 0.2334, "num_input_tokens_seen": 39033408, "step": 18110 }, { "epoch": 3.324463204257662, "grad_norm": 2.9697110652923584, "learning_rate": 9.867040327829162e-06, "loss": 0.3131, "num_input_tokens_seen": 39044704, "step": 18115 }, { "epoch": 3.325380803817214, "grad_norm": 15.904776573181152, "learning_rate": 9.866856829662351e-06, "loss": 0.2219, "num_input_tokens_seen": 39054528, "step": 18120 }, { "epoch": 3.326298403376766, "grad_norm": 1.7901731729507446, "learning_rate": 9.866673206668207e-06, "loss": 0.5006, "num_input_tokens_seen": 39064160, "step": 18125 }, { "epoch": 3.3272160029363187, "grad_norm": 2.4533474445343018, "learning_rate": 9.866489458851437e-06, "loss": 0.2828, "num_input_tokens_seen": 39073632, "step": 18130 }, { "epoch": 3.3281336024958708, "grad_norm": 3.1947181224823, "learning_rate": 9.866305586216754e-06, "loss": 0.3586, "num_input_tokens_seen": 39085312, "step": 18135 }, { "epoch": 3.329051202055423, "grad_norm": 7.907479763031006, "learning_rate": 9.866121588768876e-06, "loss": 0.2682, "num_input_tokens_seen": 39097216, "step": 18140 }, { "epoch": 3.3299688016149753, "grad_norm": 2.250340223312378, "learning_rate": 9.86593746651252e-06, "loss": 0.2159, "num_input_tokens_seen": 39107392, "step": 18145 }, { "epoch": 3.3308864011745274, "grad_norm": 6.87771463394165, "learning_rate": 9.865753219452409e-06, "loss": 0.2642, "num_input_tokens_seen": 39118048, "step": 18150 }, { "epoch": 3.3318040007340795, "grad_norm": 3.2565128803253174, "learning_rate": 9.865568847593271e-06, "loss": 0.3333, "num_input_tokens_seen": 39128640, "step": 18155 }, { "epoch": 3.332721600293632, "grad_norm": 2.4011881351470947, "learning_rate": 9.865384350939833e-06, "loss": 0.2528, "num_input_tokens_seen": 39140032, "step": 18160 }, { "epoch": 3.333639199853184, "grad_norm": 5.088667392730713, "learning_rate": 9.865199729496827e-06, "loss": 0.2357, "num_input_tokens_seen": 39150848, "step": 18165 }, { "epoch": 3.334556799412736, "grad_norm": 2.33345103263855, "learning_rate": 9.865014983268986e-06, "loss": 0.3269, "num_input_tokens_seen": 39162688, "step": 18170 }, { "epoch": 3.3354743989722886, "grad_norm": 3.7570290565490723, "learning_rate": 9.864830112261052e-06, "loss": 0.3238, "num_input_tokens_seen": 39172832, "step": 18175 }, { "epoch": 3.3363919985318407, "grad_norm": 10.786298751831055, "learning_rate": 9.864645116477767e-06, "loss": 0.3511, "num_input_tokens_seen": 39183168, "step": 18180 }, { "epoch": 3.3373095980913927, "grad_norm": 5.77261209487915, "learning_rate": 9.864459995923872e-06, "loss": 0.2806, "num_input_tokens_seen": 39194272, "step": 18185 }, { "epoch": 3.338227197650945, "grad_norm": 1.6170965433120728, "learning_rate": 9.86427475060412e-06, "loss": 0.2616, "num_input_tokens_seen": 39205024, "step": 18190 }, { "epoch": 3.3391447972104973, "grad_norm": 1.694577932357788, "learning_rate": 9.864089380523256e-06, "loss": 0.3857, "num_input_tokens_seen": 39214848, "step": 18195 }, { "epoch": 3.3400623967700493, "grad_norm": 3.8894121646881104, "learning_rate": 9.863903885686041e-06, "loss": 0.3073, "num_input_tokens_seen": 39226560, "step": 18200 }, { "epoch": 3.340979996329602, "grad_norm": 3.0350699424743652, "learning_rate": 9.86371826609723e-06, "loss": 0.4705, "num_input_tokens_seen": 39237184, "step": 18205 }, { "epoch": 3.341897595889154, "grad_norm": 4.209670543670654, "learning_rate": 9.863532521761581e-06, "loss": 0.2997, "num_input_tokens_seen": 39246720, "step": 18210 }, { "epoch": 3.342815195448706, "grad_norm": 2.1908395290374756, "learning_rate": 9.86334665268386e-06, "loss": 0.3084, "num_input_tokens_seen": 39258752, "step": 18215 }, { "epoch": 3.3437327950082585, "grad_norm": 5.121740818023682, "learning_rate": 9.863160658868838e-06, "loss": 0.2614, "num_input_tokens_seen": 39269504, "step": 18220 }, { "epoch": 3.3446503945678105, "grad_norm": 3.0680034160614014, "learning_rate": 9.862974540321281e-06, "loss": 0.2092, "num_input_tokens_seen": 39281088, "step": 18225 }, { "epoch": 3.3455679941273626, "grad_norm": 10.169529914855957, "learning_rate": 9.862788297045964e-06, "loss": 0.3298, "num_input_tokens_seen": 39292736, "step": 18230 }, { "epoch": 3.346485593686915, "grad_norm": 1.8074828386306763, "learning_rate": 9.862601929047663e-06, "loss": 0.3752, "num_input_tokens_seen": 39303360, "step": 18235 }, { "epoch": 3.347403193246467, "grad_norm": 12.767346382141113, "learning_rate": 9.86241543633116e-06, "loss": 0.3627, "num_input_tokens_seen": 39313632, "step": 18240 }, { "epoch": 3.3483207928060192, "grad_norm": 9.39219856262207, "learning_rate": 9.862228818901237e-06, "loss": 0.4597, "num_input_tokens_seen": 39324512, "step": 18245 }, { "epoch": 3.3492383923655717, "grad_norm": 5.421501159667969, "learning_rate": 9.86204207676268e-06, "loss": 0.282, "num_input_tokens_seen": 39335264, "step": 18250 }, { "epoch": 3.350155991925124, "grad_norm": 2.3615903854370117, "learning_rate": 9.86185520992028e-06, "loss": 0.2502, "num_input_tokens_seen": 39346240, "step": 18255 }, { "epoch": 3.351073591484676, "grad_norm": 3.362173557281494, "learning_rate": 9.861668218378828e-06, "loss": 0.3183, "num_input_tokens_seen": 39356800, "step": 18260 }, { "epoch": 3.3519911910442284, "grad_norm": 2.521764039993286, "learning_rate": 9.861481102143122e-06, "loss": 0.2893, "num_input_tokens_seen": 39367424, "step": 18265 }, { "epoch": 3.3529087906037804, "grad_norm": 8.953873634338379, "learning_rate": 9.86129386121796e-06, "loss": 0.3846, "num_input_tokens_seen": 39376256, "step": 18270 }, { "epoch": 3.3538263901633325, "grad_norm": 1.7824336290359497, "learning_rate": 9.861106495608147e-06, "loss": 0.3521, "num_input_tokens_seen": 39387552, "step": 18275 }, { "epoch": 3.354743989722885, "grad_norm": 4.231604099273682, "learning_rate": 9.860919005318484e-06, "loss": 0.2577, "num_input_tokens_seen": 39399104, "step": 18280 }, { "epoch": 3.355661589282437, "grad_norm": 1.4245926141738892, "learning_rate": 9.860731390353782e-06, "loss": 0.2723, "num_input_tokens_seen": 39409888, "step": 18285 }, { "epoch": 3.356579188841989, "grad_norm": 0.9127984046936035, "learning_rate": 9.860543650718853e-06, "loss": 0.2356, "num_input_tokens_seen": 39420768, "step": 18290 }, { "epoch": 3.3574967884015416, "grad_norm": 0.9794880151748657, "learning_rate": 9.860355786418514e-06, "loss": 0.1725, "num_input_tokens_seen": 39432192, "step": 18295 }, { "epoch": 3.3584143879610937, "grad_norm": 15.558619499206543, "learning_rate": 9.86016779745758e-06, "loss": 0.3376, "num_input_tokens_seen": 39443360, "step": 18300 }, { "epoch": 3.3593319875206458, "grad_norm": 4.115752220153809, "learning_rate": 9.859979683840877e-06, "loss": 0.3274, "num_input_tokens_seen": 39454080, "step": 18305 }, { "epoch": 3.3602495870801983, "grad_norm": 5.915306568145752, "learning_rate": 9.859791445573226e-06, "loss": 0.2824, "num_input_tokens_seen": 39464448, "step": 18310 }, { "epoch": 3.3611671866397503, "grad_norm": 2.749459743499756, "learning_rate": 9.859603082659456e-06, "loss": 0.2985, "num_input_tokens_seen": 39475712, "step": 18315 }, { "epoch": 3.3620847861993024, "grad_norm": 2.7523770332336426, "learning_rate": 9.859414595104399e-06, "loss": 0.1997, "num_input_tokens_seen": 39484096, "step": 18320 }, { "epoch": 3.363002385758855, "grad_norm": 12.573197364807129, "learning_rate": 9.85922598291289e-06, "loss": 0.3572, "num_input_tokens_seen": 39494112, "step": 18325 }, { "epoch": 3.363919985318407, "grad_norm": 4.439600467681885, "learning_rate": 9.859037246089766e-06, "loss": 0.2306, "num_input_tokens_seen": 39503776, "step": 18330 }, { "epoch": 3.364837584877959, "grad_norm": 18.589399337768555, "learning_rate": 9.858848384639864e-06, "loss": 0.2546, "num_input_tokens_seen": 39514560, "step": 18335 }, { "epoch": 3.3657551844375115, "grad_norm": 15.560226440429688, "learning_rate": 9.858659398568035e-06, "loss": 0.2441, "num_input_tokens_seen": 39525376, "step": 18340 }, { "epoch": 3.3666727839970636, "grad_norm": 3.6424803733825684, "learning_rate": 9.858470287879123e-06, "loss": 0.3201, "num_input_tokens_seen": 39534784, "step": 18345 }, { "epoch": 3.3675903835566157, "grad_norm": 9.26689624786377, "learning_rate": 9.858281052577976e-06, "loss": 0.3305, "num_input_tokens_seen": 39543872, "step": 18350 }, { "epoch": 3.368507983116168, "grad_norm": 12.046781539916992, "learning_rate": 9.85809169266945e-06, "loss": 0.3098, "num_input_tokens_seen": 39554432, "step": 18355 }, { "epoch": 3.3694255826757202, "grad_norm": 7.837129592895508, "learning_rate": 9.857902208158402e-06, "loss": 0.4463, "num_input_tokens_seen": 39564192, "step": 18360 }, { "epoch": 3.3703431822352723, "grad_norm": 4.927725315093994, "learning_rate": 9.857712599049691e-06, "loss": 0.3752, "num_input_tokens_seen": 39574752, "step": 18365 }, { "epoch": 3.371260781794825, "grad_norm": 11.78203010559082, "learning_rate": 9.857522865348182e-06, "loss": 0.3805, "num_input_tokens_seen": 39585792, "step": 18370 }, { "epoch": 3.372178381354377, "grad_norm": 6.2865166664123535, "learning_rate": 9.857333007058739e-06, "loss": 0.2292, "num_input_tokens_seen": 39596416, "step": 18375 }, { "epoch": 3.373095980913929, "grad_norm": 3.2768094539642334, "learning_rate": 9.857143024186231e-06, "loss": 0.3314, "num_input_tokens_seen": 39608096, "step": 18380 }, { "epoch": 3.3740135804734814, "grad_norm": 2.1520559787750244, "learning_rate": 9.856952916735533e-06, "loss": 0.2264, "num_input_tokens_seen": 39618368, "step": 18385 }, { "epoch": 3.3749311800330335, "grad_norm": 18.900634765625, "learning_rate": 9.856762684711522e-06, "loss": 0.3516, "num_input_tokens_seen": 39630080, "step": 18390 }, { "epoch": 3.3758487795925856, "grad_norm": 2.380467414855957, "learning_rate": 9.856572328119074e-06, "loss": 0.4268, "num_input_tokens_seen": 39641568, "step": 18395 }, { "epoch": 3.376766379152138, "grad_norm": 18.961315155029297, "learning_rate": 9.856381846963073e-06, "loss": 0.4251, "num_input_tokens_seen": 39651776, "step": 18400 }, { "epoch": 3.37768397871169, "grad_norm": 2.2397847175598145, "learning_rate": 9.856191241248405e-06, "loss": 0.2769, "num_input_tokens_seen": 39662304, "step": 18405 }, { "epoch": 3.378601578271242, "grad_norm": 3.0122320652008057, "learning_rate": 9.856000510979958e-06, "loss": 0.2603, "num_input_tokens_seen": 39674720, "step": 18410 }, { "epoch": 3.3795191778307947, "grad_norm": 4.121589183807373, "learning_rate": 9.855809656162622e-06, "loss": 0.2845, "num_input_tokens_seen": 39685152, "step": 18415 }, { "epoch": 3.3804367773903468, "grad_norm": 9.806066513061523, "learning_rate": 9.855618676801297e-06, "loss": 0.4172, "num_input_tokens_seen": 39695008, "step": 18420 }, { "epoch": 3.381354376949899, "grad_norm": 6.227540016174316, "learning_rate": 9.855427572900877e-06, "loss": 0.3564, "num_input_tokens_seen": 39706144, "step": 18425 }, { "epoch": 3.3822719765094513, "grad_norm": 4.854717254638672, "learning_rate": 9.855236344466265e-06, "loss": 0.3269, "num_input_tokens_seen": 39715936, "step": 18430 }, { "epoch": 3.3831895760690034, "grad_norm": 2.6573212146759033, "learning_rate": 9.855044991502367e-06, "loss": 0.36, "num_input_tokens_seen": 39727104, "step": 18435 }, { "epoch": 3.3841071756285555, "grad_norm": 3.231844186782837, "learning_rate": 9.854853514014088e-06, "loss": 0.2854, "num_input_tokens_seen": 39737632, "step": 18440 }, { "epoch": 3.385024775188108, "grad_norm": 1.5718353986740112, "learning_rate": 9.85466191200634e-06, "loss": 0.3593, "num_input_tokens_seen": 39749184, "step": 18445 }, { "epoch": 3.38594237474766, "grad_norm": 2.465407609939575, "learning_rate": 9.85447018548404e-06, "loss": 0.2776, "num_input_tokens_seen": 39758656, "step": 18450 }, { "epoch": 3.386859974307212, "grad_norm": 4.172145843505859, "learning_rate": 9.854278334452102e-06, "loss": 0.2158, "num_input_tokens_seen": 39769088, "step": 18455 }, { "epoch": 3.3877775738667646, "grad_norm": 4.069618225097656, "learning_rate": 9.854086358915449e-06, "loss": 0.2718, "num_input_tokens_seen": 39779968, "step": 18460 }, { "epoch": 3.3886951734263167, "grad_norm": 4.698436737060547, "learning_rate": 9.853894258879004e-06, "loss": 0.392, "num_input_tokens_seen": 39791040, "step": 18465 }, { "epoch": 3.389612772985869, "grad_norm": 2.8545401096343994, "learning_rate": 9.853702034347695e-06, "loss": 0.4033, "num_input_tokens_seen": 39800512, "step": 18470 }, { "epoch": 3.3905303725454212, "grad_norm": 2.666419267654419, "learning_rate": 9.85350968532645e-06, "loss": 0.2741, "num_input_tokens_seen": 39810912, "step": 18475 }, { "epoch": 3.3914479721049733, "grad_norm": 7.008378982543945, "learning_rate": 9.853317211820203e-06, "loss": 0.2388, "num_input_tokens_seen": 39821024, "step": 18480 }, { "epoch": 3.392365571664526, "grad_norm": 2.6620254516601562, "learning_rate": 9.853124613833894e-06, "loss": 0.3344, "num_input_tokens_seen": 39833184, "step": 18485 }, { "epoch": 3.393283171224078, "grad_norm": 2.3396947383880615, "learning_rate": 9.852931891372459e-06, "loss": 0.3729, "num_input_tokens_seen": 39843872, "step": 18490 }, { "epoch": 3.39420077078363, "grad_norm": 3.892709970474243, "learning_rate": 9.852739044440842e-06, "loss": 0.2621, "num_input_tokens_seen": 39853984, "step": 18495 }, { "epoch": 3.3951183703431824, "grad_norm": 2.485653877258301, "learning_rate": 9.85254607304399e-06, "loss": 0.1361, "num_input_tokens_seen": 39864352, "step": 18500 }, { "epoch": 3.3960359699027345, "grad_norm": 12.685907363891602, "learning_rate": 9.852352977186852e-06, "loss": 0.4012, "num_input_tokens_seen": 39875712, "step": 18505 }, { "epoch": 3.3969535694622865, "grad_norm": 2.645098924636841, "learning_rate": 9.85215975687438e-06, "loss": 0.2746, "num_input_tokens_seen": 39885920, "step": 18510 }, { "epoch": 3.397871169021839, "grad_norm": 7.552219390869141, "learning_rate": 9.851966412111531e-06, "loss": 0.3413, "num_input_tokens_seen": 39897120, "step": 18515 }, { "epoch": 3.398788768581391, "grad_norm": 9.808351516723633, "learning_rate": 9.851772942903263e-06, "loss": 0.2509, "num_input_tokens_seen": 39907936, "step": 18520 }, { "epoch": 3.399706368140943, "grad_norm": 13.743949890136719, "learning_rate": 9.85157934925454e-06, "loss": 0.2315, "num_input_tokens_seen": 39918688, "step": 18525 }, { "epoch": 3.4006239677004957, "grad_norm": 4.431407451629639, "learning_rate": 9.851385631170325e-06, "loss": 0.2401, "num_input_tokens_seen": 39930944, "step": 18530 }, { "epoch": 3.4015415672600477, "grad_norm": 2.6246824264526367, "learning_rate": 9.851191788655587e-06, "loss": 0.2782, "num_input_tokens_seen": 39941312, "step": 18535 }, { "epoch": 3.4024591668196, "grad_norm": 11.084551811218262, "learning_rate": 9.8509978217153e-06, "loss": 0.407, "num_input_tokens_seen": 39953056, "step": 18540 }, { "epoch": 3.4033767663791523, "grad_norm": 6.6804986000061035, "learning_rate": 9.850803730354435e-06, "loss": 0.2578, "num_input_tokens_seen": 39963424, "step": 18545 }, { "epoch": 3.4042943659387044, "grad_norm": 15.804757118225098, "learning_rate": 9.850609514577974e-06, "loss": 0.2991, "num_input_tokens_seen": 39974432, "step": 18550 }, { "epoch": 3.4052119654982564, "grad_norm": 7.253089427947998, "learning_rate": 9.850415174390895e-06, "loss": 0.4256, "num_input_tokens_seen": 39985248, "step": 18555 }, { "epoch": 3.406129565057809, "grad_norm": 22.66535186767578, "learning_rate": 9.850220709798186e-06, "loss": 0.428, "num_input_tokens_seen": 39996672, "step": 18560 }, { "epoch": 3.407047164617361, "grad_norm": 1.5229605436325073, "learning_rate": 9.850026120804832e-06, "loss": 0.305, "num_input_tokens_seen": 40007616, "step": 18565 }, { "epoch": 3.407964764176913, "grad_norm": 2.1628546714782715, "learning_rate": 9.849831407415824e-06, "loss": 0.256, "num_input_tokens_seen": 40019072, "step": 18570 }, { "epoch": 3.4088823637364656, "grad_norm": 9.299148559570312, "learning_rate": 9.849636569636159e-06, "loss": 0.4824, "num_input_tokens_seen": 40029984, "step": 18575 }, { "epoch": 3.4097999632960176, "grad_norm": 1.9919531345367432, "learning_rate": 9.849441607470832e-06, "loss": 0.2616, "num_input_tokens_seen": 40040864, "step": 18580 }, { "epoch": 3.4107175628555697, "grad_norm": 2.0062451362609863, "learning_rate": 9.849246520924842e-06, "loss": 0.2458, "num_input_tokens_seen": 40053056, "step": 18585 }, { "epoch": 3.411635162415122, "grad_norm": 2.985994577407837, "learning_rate": 9.849051310003194e-06, "loss": 0.2685, "num_input_tokens_seen": 40063072, "step": 18590 }, { "epoch": 3.4125527619746743, "grad_norm": 5.119337558746338, "learning_rate": 9.848855974710897e-06, "loss": 0.3844, "num_input_tokens_seen": 40074304, "step": 18595 }, { "epoch": 3.4134703615342263, "grad_norm": 12.378145217895508, "learning_rate": 9.84866051505296e-06, "loss": 0.3828, "num_input_tokens_seen": 40085632, "step": 18600 }, { "epoch": 3.414387961093779, "grad_norm": 4.682395935058594, "learning_rate": 9.848464931034394e-06, "loss": 0.3186, "num_input_tokens_seen": 40096128, "step": 18605 }, { "epoch": 3.415305560653331, "grad_norm": 3.2249085903167725, "learning_rate": 9.848269222660219e-06, "loss": 0.1867, "num_input_tokens_seen": 40107360, "step": 18610 }, { "epoch": 3.416223160212883, "grad_norm": 2.9628210067749023, "learning_rate": 9.84807338993545e-06, "loss": 0.3796, "num_input_tokens_seen": 40117856, "step": 18615 }, { "epoch": 3.4171407597724355, "grad_norm": 2.7978835105895996, "learning_rate": 9.847877432865113e-06, "loss": 0.2668, "num_input_tokens_seen": 40129536, "step": 18620 }, { "epoch": 3.4180583593319875, "grad_norm": 1.1778751611709595, "learning_rate": 9.847681351454235e-06, "loss": 0.1608, "num_input_tokens_seen": 40139040, "step": 18625 }, { "epoch": 3.4189759588915396, "grad_norm": 6.252956390380859, "learning_rate": 9.847485145707842e-06, "loss": 0.3851, "num_input_tokens_seen": 40149824, "step": 18630 }, { "epoch": 3.419893558451092, "grad_norm": 7.500002861022949, "learning_rate": 9.847288815630968e-06, "loss": 0.351, "num_input_tokens_seen": 40160384, "step": 18635 }, { "epoch": 3.420811158010644, "grad_norm": 2.7635302543640137, "learning_rate": 9.847092361228648e-06, "loss": 0.2395, "num_input_tokens_seen": 40170432, "step": 18640 }, { "epoch": 3.4217287575701962, "grad_norm": 3.473513603210449, "learning_rate": 9.846895782505922e-06, "loss": 0.1038, "num_input_tokens_seen": 40180576, "step": 18645 }, { "epoch": 3.4226463571297487, "grad_norm": 2.6471660137176514, "learning_rate": 9.846699079467832e-06, "loss": 0.3044, "num_input_tokens_seen": 40191296, "step": 18650 }, { "epoch": 3.423563956689301, "grad_norm": 1.533457636833191, "learning_rate": 9.846502252119421e-06, "loss": 0.3168, "num_input_tokens_seen": 40201824, "step": 18655 }, { "epoch": 3.424481556248853, "grad_norm": 8.695174217224121, "learning_rate": 9.846305300465739e-06, "loss": 0.2631, "num_input_tokens_seen": 40212288, "step": 18660 }, { "epoch": 3.4253991558084054, "grad_norm": 2.317742109298706, "learning_rate": 9.846108224511836e-06, "loss": 0.2784, "num_input_tokens_seen": 40223584, "step": 18665 }, { "epoch": 3.4263167553679574, "grad_norm": 14.256332397460938, "learning_rate": 9.845911024262771e-06, "loss": 0.2893, "num_input_tokens_seen": 40235136, "step": 18670 }, { "epoch": 3.4272343549275095, "grad_norm": 8.73677921295166, "learning_rate": 9.845713699723596e-06, "loss": 0.2563, "num_input_tokens_seen": 40245088, "step": 18675 }, { "epoch": 3.428151954487062, "grad_norm": 2.835731267929077, "learning_rate": 9.845516250899376e-06, "loss": 0.3995, "num_input_tokens_seen": 40255168, "step": 18680 }, { "epoch": 3.429069554046614, "grad_norm": 1.2904202938079834, "learning_rate": 9.845318677795173e-06, "loss": 0.2572, "num_input_tokens_seen": 40266752, "step": 18685 }, { "epoch": 3.429987153606166, "grad_norm": 15.13827133178711, "learning_rate": 9.845120980416057e-06, "loss": 0.3607, "num_input_tokens_seen": 40277088, "step": 18690 }, { "epoch": 3.4309047531657186, "grad_norm": 1.4096969366073608, "learning_rate": 9.844923158767096e-06, "loss": 0.3278, "num_input_tokens_seen": 40288160, "step": 18695 }, { "epoch": 3.4318223527252707, "grad_norm": 26.223873138427734, "learning_rate": 9.844725212853365e-06, "loss": 0.4178, "num_input_tokens_seen": 40298272, "step": 18700 }, { "epoch": 3.4327399522848228, "grad_norm": 21.034547805786133, "learning_rate": 9.844527142679941e-06, "loss": 0.3642, "num_input_tokens_seen": 40309408, "step": 18705 }, { "epoch": 3.4336575518443753, "grad_norm": 2.2620904445648193, "learning_rate": 9.844328948251904e-06, "loss": 0.2621, "num_input_tokens_seen": 40321088, "step": 18710 }, { "epoch": 3.4345751514039273, "grad_norm": 5.654733180999756, "learning_rate": 9.844130629574338e-06, "loss": 0.3963, "num_input_tokens_seen": 40332416, "step": 18715 }, { "epoch": 3.4354927509634794, "grad_norm": 2.3205792903900146, "learning_rate": 9.843932186652328e-06, "loss": 0.3087, "num_input_tokens_seen": 40341376, "step": 18720 }, { "epoch": 3.436410350523032, "grad_norm": 2.824256420135498, "learning_rate": 9.843733619490965e-06, "loss": 0.2208, "num_input_tokens_seen": 40351744, "step": 18725 }, { "epoch": 3.437327950082584, "grad_norm": 3.110006093978882, "learning_rate": 9.843534928095343e-06, "loss": 0.2121, "num_input_tokens_seen": 40362496, "step": 18730 }, { "epoch": 3.438245549642136, "grad_norm": 1.4722208976745605, "learning_rate": 9.843336112470556e-06, "loss": 0.3192, "num_input_tokens_seen": 40371712, "step": 18735 }, { "epoch": 3.4391631492016885, "grad_norm": 2.230595111846924, "learning_rate": 9.843137172621705e-06, "loss": 0.3061, "num_input_tokens_seen": 40382688, "step": 18740 }, { "epoch": 3.4400807487612406, "grad_norm": 3.038472890853882, "learning_rate": 9.842938108553892e-06, "loss": 0.3296, "num_input_tokens_seen": 40394464, "step": 18745 }, { "epoch": 3.4409983483207927, "grad_norm": 3.351480484008789, "learning_rate": 9.84273892027222e-06, "loss": 0.4523, "num_input_tokens_seen": 40405120, "step": 18750 }, { "epoch": 3.441915947880345, "grad_norm": 2.8639674186706543, "learning_rate": 9.842539607781803e-06, "loss": 0.2003, "num_input_tokens_seen": 40417056, "step": 18755 }, { "epoch": 3.4428335474398972, "grad_norm": 5.485602855682373, "learning_rate": 9.842340171087748e-06, "loss": 0.3005, "num_input_tokens_seen": 40427488, "step": 18760 }, { "epoch": 3.4437511469994493, "grad_norm": 2.021254539489746, "learning_rate": 9.842140610195174e-06, "loss": 0.2333, "num_input_tokens_seen": 40437760, "step": 18765 }, { "epoch": 3.444668746559002, "grad_norm": 9.428647994995117, "learning_rate": 9.841940925109198e-06, "loss": 0.3708, "num_input_tokens_seen": 40449408, "step": 18770 }, { "epoch": 3.445586346118554, "grad_norm": 7.3962860107421875, "learning_rate": 9.841741115834942e-06, "loss": 0.2728, "num_input_tokens_seen": 40459520, "step": 18775 }, { "epoch": 3.446503945678106, "grad_norm": 2.37084698677063, "learning_rate": 9.841541182377528e-06, "loss": 0.3603, "num_input_tokens_seen": 40469824, "step": 18780 }, { "epoch": 3.4474215452376584, "grad_norm": 2.989375591278076, "learning_rate": 9.841341124742089e-06, "loss": 0.289, "num_input_tokens_seen": 40480928, "step": 18785 }, { "epoch": 3.4483391447972105, "grad_norm": 1.9214755296707153, "learning_rate": 9.841140942933752e-06, "loss": 0.3612, "num_input_tokens_seen": 40490976, "step": 18790 }, { "epoch": 3.4492567443567625, "grad_norm": 3.608196973800659, "learning_rate": 9.840940636957655e-06, "loss": 0.2991, "num_input_tokens_seen": 40501632, "step": 18795 }, { "epoch": 3.450174343916315, "grad_norm": 7.9506683349609375, "learning_rate": 9.84074020681893e-06, "loss": 0.3105, "num_input_tokens_seen": 40512768, "step": 18800 }, { "epoch": 3.451091943475867, "grad_norm": 2.341043710708618, "learning_rate": 9.840539652522724e-06, "loss": 0.2588, "num_input_tokens_seen": 40524064, "step": 18805 }, { "epoch": 3.452009543035419, "grad_norm": 3.730670690536499, "learning_rate": 9.840338974074178e-06, "loss": 0.2483, "num_input_tokens_seen": 40534912, "step": 18810 }, { "epoch": 3.4529271425949717, "grad_norm": 11.850648880004883, "learning_rate": 9.840138171478437e-06, "loss": 0.2808, "num_input_tokens_seen": 40544960, "step": 18815 }, { "epoch": 3.4538447421545238, "grad_norm": 1.7414089441299438, "learning_rate": 9.839937244740655e-06, "loss": 0.3451, "num_input_tokens_seen": 40556224, "step": 18820 }, { "epoch": 3.454762341714076, "grad_norm": 13.477387428283691, "learning_rate": 9.839736193865982e-06, "loss": 0.5482, "num_input_tokens_seen": 40566752, "step": 18825 }, { "epoch": 3.4556799412736283, "grad_norm": 3.784245729446411, "learning_rate": 9.83953501885958e-06, "loss": 0.21, "num_input_tokens_seen": 40577984, "step": 18830 }, { "epoch": 3.4565975408331804, "grad_norm": 2.972898244857788, "learning_rate": 9.839333719726603e-06, "loss": 0.2387, "num_input_tokens_seen": 40588352, "step": 18835 }, { "epoch": 3.4575151403927324, "grad_norm": 12.443953514099121, "learning_rate": 9.839132296472217e-06, "loss": 0.3351, "num_input_tokens_seen": 40600512, "step": 18840 }, { "epoch": 3.458432739952285, "grad_norm": 10.610203742980957, "learning_rate": 9.838930749101587e-06, "loss": 0.4778, "num_input_tokens_seen": 40612096, "step": 18845 }, { "epoch": 3.459350339511837, "grad_norm": 2.1290194988250732, "learning_rate": 9.838729077619884e-06, "loss": 0.3654, "num_input_tokens_seen": 40623392, "step": 18850 }, { "epoch": 3.460267939071389, "grad_norm": 5.025760173797607, "learning_rate": 9.838527282032279e-06, "loss": 0.4215, "num_input_tokens_seen": 40634880, "step": 18855 }, { "epoch": 3.4611855386309416, "grad_norm": 1.5106797218322754, "learning_rate": 9.838325362343948e-06, "loss": 0.2855, "num_input_tokens_seen": 40646176, "step": 18860 }, { "epoch": 3.4621031381904936, "grad_norm": 2.280017137527466, "learning_rate": 9.838123318560072e-06, "loss": 0.3868, "num_input_tokens_seen": 40657824, "step": 18865 }, { "epoch": 3.4630207377500457, "grad_norm": 1.1348968744277954, "learning_rate": 9.837921150685828e-06, "loss": 0.1577, "num_input_tokens_seen": 40668960, "step": 18870 }, { "epoch": 3.463938337309598, "grad_norm": 4.6605377197265625, "learning_rate": 9.837718858726406e-06, "loss": 0.2967, "num_input_tokens_seen": 40679744, "step": 18875 }, { "epoch": 3.4648559368691503, "grad_norm": 4.169010162353516, "learning_rate": 9.837516442686993e-06, "loss": 0.2375, "num_input_tokens_seen": 40690592, "step": 18880 }, { "epoch": 3.4657735364287023, "grad_norm": 1.8129404783248901, "learning_rate": 9.837313902572783e-06, "loss": 0.4177, "num_input_tokens_seen": 40701088, "step": 18885 }, { "epoch": 3.466691135988255, "grad_norm": 5.626043319702148, "learning_rate": 9.837111238388966e-06, "loss": 0.3264, "num_input_tokens_seen": 40712576, "step": 18890 }, { "epoch": 3.467608735547807, "grad_norm": 2.246565818786621, "learning_rate": 9.836908450140743e-06, "loss": 0.3734, "num_input_tokens_seen": 40722624, "step": 18895 }, { "epoch": 3.468526335107359, "grad_norm": 1.545357584953308, "learning_rate": 9.836705537833315e-06, "loss": 0.2716, "num_input_tokens_seen": 40733728, "step": 18900 }, { "epoch": 3.4694439346669115, "grad_norm": 2.0791971683502197, "learning_rate": 9.836502501471886e-06, "loss": 0.272, "num_input_tokens_seen": 40745440, "step": 18905 }, { "epoch": 3.4703615342264635, "grad_norm": 3.018570899963379, "learning_rate": 9.836299341061663e-06, "loss": 0.3514, "num_input_tokens_seen": 40756000, "step": 18910 }, { "epoch": 3.4712791337860156, "grad_norm": 0.981903612613678, "learning_rate": 9.83609605660786e-06, "loss": 0.1827, "num_input_tokens_seen": 40765792, "step": 18915 }, { "epoch": 3.472196733345568, "grad_norm": 12.230868339538574, "learning_rate": 9.835892648115686e-06, "loss": 0.5186, "num_input_tokens_seen": 40776000, "step": 18920 }, { "epoch": 3.47311433290512, "grad_norm": 2.3322207927703857, "learning_rate": 9.835689115590361e-06, "loss": 0.2219, "num_input_tokens_seen": 40787232, "step": 18925 }, { "epoch": 3.4740319324646722, "grad_norm": 8.932516098022461, "learning_rate": 9.835485459037107e-06, "loss": 0.1497, "num_input_tokens_seen": 40799040, "step": 18930 }, { "epoch": 3.4749495320242247, "grad_norm": 0.766062319278717, "learning_rate": 9.835281678461141e-06, "loss": 0.316, "num_input_tokens_seen": 40810112, "step": 18935 }, { "epoch": 3.475867131583777, "grad_norm": 2.3315045833587646, "learning_rate": 9.835077773867699e-06, "loss": 0.2862, "num_input_tokens_seen": 40820768, "step": 18940 }, { "epoch": 3.476784731143329, "grad_norm": 5.591325283050537, "learning_rate": 9.834873745262002e-06, "loss": 0.3166, "num_input_tokens_seen": 40830752, "step": 18945 }, { "epoch": 3.4777023307028814, "grad_norm": 4.133894443511963, "learning_rate": 9.834669592649288e-06, "loss": 0.313, "num_input_tokens_seen": 40841856, "step": 18950 }, { "epoch": 3.4786199302624334, "grad_norm": 5.369180202484131, "learning_rate": 9.83446531603479e-06, "loss": 0.204, "num_input_tokens_seen": 40853280, "step": 18955 }, { "epoch": 3.4795375298219855, "grad_norm": 4.407801628112793, "learning_rate": 9.834260915423752e-06, "loss": 0.3732, "num_input_tokens_seen": 40862688, "step": 18960 }, { "epoch": 3.480455129381538, "grad_norm": 3.4652583599090576, "learning_rate": 9.834056390821414e-06, "loss": 0.3263, "num_input_tokens_seen": 40873760, "step": 18965 }, { "epoch": 3.48137272894109, "grad_norm": 2.3894248008728027, "learning_rate": 9.833851742233022e-06, "loss": 0.2547, "num_input_tokens_seen": 40883776, "step": 18970 }, { "epoch": 3.482290328500642, "grad_norm": 1.4563502073287964, "learning_rate": 9.833646969663824e-06, "loss": 0.2265, "num_input_tokens_seen": 40894528, "step": 18975 }, { "epoch": 3.4832079280601946, "grad_norm": 1.9405012130737305, "learning_rate": 9.83344207311907e-06, "loss": 0.3115, "num_input_tokens_seen": 40905984, "step": 18980 }, { "epoch": 3.4841255276197467, "grad_norm": 2.6808602809906006, "learning_rate": 9.833237052604021e-06, "loss": 0.5435, "num_input_tokens_seen": 40916288, "step": 18985 }, { "epoch": 3.4850431271792988, "grad_norm": 1.884353756904602, "learning_rate": 9.833031908123932e-06, "loss": 0.1856, "num_input_tokens_seen": 40927648, "step": 18990 }, { "epoch": 3.4859607267388513, "grad_norm": 4.041708469390869, "learning_rate": 9.832826639684065e-06, "loss": 0.2096, "num_input_tokens_seen": 40938048, "step": 18995 }, { "epoch": 3.4868783262984033, "grad_norm": 3.499906539916992, "learning_rate": 9.832621247289684e-06, "loss": 0.3454, "num_input_tokens_seen": 40949376, "step": 19000 }, { "epoch": 3.487795925857956, "grad_norm": 4.620008945465088, "learning_rate": 9.832415730946059e-06, "loss": 0.3544, "num_input_tokens_seen": 40960768, "step": 19005 }, { "epoch": 3.488713525417508, "grad_norm": 4.08467435836792, "learning_rate": 9.832210090658461e-06, "loss": 0.4417, "num_input_tokens_seen": 40970976, "step": 19010 }, { "epoch": 3.48963112497706, "grad_norm": 5.612421989440918, "learning_rate": 9.83200432643216e-06, "loss": 0.3058, "num_input_tokens_seen": 40981888, "step": 19015 }, { "epoch": 3.4905487245366125, "grad_norm": 1.2366938591003418, "learning_rate": 9.831798438272439e-06, "loss": 0.2414, "num_input_tokens_seen": 40991008, "step": 19020 }, { "epoch": 3.4914663240961645, "grad_norm": 4.503997802734375, "learning_rate": 9.831592426184577e-06, "loss": 0.3399, "num_input_tokens_seen": 41002624, "step": 19025 }, { "epoch": 3.4923839236557166, "grad_norm": 4.337555885314941, "learning_rate": 9.831386290173859e-06, "loss": 0.267, "num_input_tokens_seen": 41012096, "step": 19030 }, { "epoch": 3.493301523215269, "grad_norm": 2.164327383041382, "learning_rate": 9.831180030245568e-06, "loss": 0.3089, "num_input_tokens_seen": 41023136, "step": 19035 }, { "epoch": 3.494219122774821, "grad_norm": 1.5405588150024414, "learning_rate": 9.830973646404997e-06, "loss": 0.2842, "num_input_tokens_seen": 41034240, "step": 19040 }, { "epoch": 3.4951367223343732, "grad_norm": 2.048326015472412, "learning_rate": 9.83076713865744e-06, "loss": 0.3254, "num_input_tokens_seen": 41045120, "step": 19045 }, { "epoch": 3.4960543218939257, "grad_norm": 2.2591729164123535, "learning_rate": 9.830560507008194e-06, "loss": 0.1495, "num_input_tokens_seen": 41055264, "step": 19050 }, { "epoch": 3.496971921453478, "grad_norm": 1.930435299873352, "learning_rate": 9.830353751462555e-06, "loss": 0.3863, "num_input_tokens_seen": 41065600, "step": 19055 }, { "epoch": 3.49788952101303, "grad_norm": 3.5515363216400146, "learning_rate": 9.830146872025832e-06, "loss": 0.4471, "num_input_tokens_seen": 41076832, "step": 19060 }, { "epoch": 3.4988071205725824, "grad_norm": 3.345444917678833, "learning_rate": 9.829939868703327e-06, "loss": 0.2322, "num_input_tokens_seen": 41087456, "step": 19065 }, { "epoch": 3.4997247201321344, "grad_norm": 3.4894769191741943, "learning_rate": 9.82973274150035e-06, "loss": 0.2077, "num_input_tokens_seen": 41098080, "step": 19070 }, { "epoch": 3.5006423196916865, "grad_norm": 6.504776477813721, "learning_rate": 9.829525490422212e-06, "loss": 0.2879, "num_input_tokens_seen": 41108192, "step": 19075 }, { "epoch": 3.501559919251239, "grad_norm": 3.0695960521698, "learning_rate": 9.82931811547423e-06, "loss": 0.2022, "num_input_tokens_seen": 41119584, "step": 19080 }, { "epoch": 3.502477518810791, "grad_norm": 4.4880571365356445, "learning_rate": 9.829110616661723e-06, "loss": 0.2341, "num_input_tokens_seen": 41129600, "step": 19085 }, { "epoch": 3.503395118370343, "grad_norm": 4.8719587326049805, "learning_rate": 9.828902993990015e-06, "loss": 0.3137, "num_input_tokens_seen": 41140288, "step": 19090 }, { "epoch": 3.5043127179298956, "grad_norm": 4.323271751403809, "learning_rate": 9.828695247464429e-06, "loss": 0.2529, "num_input_tokens_seen": 41152640, "step": 19095 }, { "epoch": 3.5052303174894477, "grad_norm": 5.453233242034912, "learning_rate": 9.828487377090293e-06, "loss": 0.2813, "num_input_tokens_seen": 41162688, "step": 19100 }, { "epoch": 3.5061479170489998, "grad_norm": 3.6256470680236816, "learning_rate": 9.828279382872939e-06, "loss": 0.1645, "num_input_tokens_seen": 41172800, "step": 19105 }, { "epoch": 3.5070655166085523, "grad_norm": 2.7160756587982178, "learning_rate": 9.828071264817703e-06, "loss": 0.3433, "num_input_tokens_seen": 41184064, "step": 19110 }, { "epoch": 3.5079831161681043, "grad_norm": 5.179262638092041, "learning_rate": 9.827863022929922e-06, "loss": 0.3127, "num_input_tokens_seen": 41195616, "step": 19115 }, { "epoch": 3.5089007157276564, "grad_norm": 2.8675217628479004, "learning_rate": 9.827654657214936e-06, "loss": 0.3954, "num_input_tokens_seen": 41204928, "step": 19120 }, { "epoch": 3.509818315287209, "grad_norm": 9.683563232421875, "learning_rate": 9.827446167678091e-06, "loss": 0.4642, "num_input_tokens_seen": 41215360, "step": 19125 }, { "epoch": 3.510735914846761, "grad_norm": 12.809481620788574, "learning_rate": 9.827237554324733e-06, "loss": 0.3745, "num_input_tokens_seen": 41226592, "step": 19130 }, { "epoch": 3.511653514406313, "grad_norm": 1.4801944494247437, "learning_rate": 9.827028817160214e-06, "loss": 0.3737, "num_input_tokens_seen": 41236608, "step": 19135 }, { "epoch": 3.5125711139658655, "grad_norm": 0.5197771191596985, "learning_rate": 9.826819956189886e-06, "loss": 0.273, "num_input_tokens_seen": 41247840, "step": 19140 }, { "epoch": 3.5134887135254176, "grad_norm": 3.1053266525268555, "learning_rate": 9.826610971419108e-06, "loss": 0.3304, "num_input_tokens_seen": 41257760, "step": 19145 }, { "epoch": 3.5144063130849696, "grad_norm": 9.902447700500488, "learning_rate": 9.826401862853238e-06, "loss": 0.3273, "num_input_tokens_seen": 41267360, "step": 19150 }, { "epoch": 3.515323912644522, "grad_norm": 5.654006004333496, "learning_rate": 9.826192630497642e-06, "loss": 0.348, "num_input_tokens_seen": 41279072, "step": 19155 }, { "epoch": 3.516241512204074, "grad_norm": 1.3566144704818726, "learning_rate": 9.825983274357684e-06, "loss": 0.2308, "num_input_tokens_seen": 41290624, "step": 19160 }, { "epoch": 3.5171591117636263, "grad_norm": 3.170755386352539, "learning_rate": 9.825773794438735e-06, "loss": 0.3181, "num_input_tokens_seen": 41300544, "step": 19165 }, { "epoch": 3.518076711323179, "grad_norm": 2.3543589115142822, "learning_rate": 9.825564190746166e-06, "loss": 0.3288, "num_input_tokens_seen": 41311488, "step": 19170 }, { "epoch": 3.518994310882731, "grad_norm": 12.00158977508545, "learning_rate": 9.825354463285357e-06, "loss": 0.2776, "num_input_tokens_seen": 41321920, "step": 19175 }, { "epoch": 3.519911910442283, "grad_norm": 3.977560043334961, "learning_rate": 9.825144612061683e-06, "loss": 0.3455, "num_input_tokens_seen": 41331360, "step": 19180 }, { "epoch": 3.5208295100018354, "grad_norm": 4.239177703857422, "learning_rate": 9.824934637080528e-06, "loss": 0.2652, "num_input_tokens_seen": 41341408, "step": 19185 }, { "epoch": 3.5217471095613875, "grad_norm": 1.4541535377502441, "learning_rate": 9.824724538347278e-06, "loss": 0.2415, "num_input_tokens_seen": 41352352, "step": 19190 }, { "epoch": 3.5226647091209395, "grad_norm": 5.007016181945801, "learning_rate": 9.824514315867321e-06, "loss": 0.4202, "num_input_tokens_seen": 41363616, "step": 19195 }, { "epoch": 3.523582308680492, "grad_norm": 1.7459520101547241, "learning_rate": 9.82430396964605e-06, "loss": 0.2101, "num_input_tokens_seen": 41377024, "step": 19200 }, { "epoch": 3.524499908240044, "grad_norm": 3.521559715270996, "learning_rate": 9.824093499688858e-06, "loss": 0.3186, "num_input_tokens_seen": 41386624, "step": 19205 }, { "epoch": 3.525417507799596, "grad_norm": 3.191229820251465, "learning_rate": 9.823882906001145e-06, "loss": 0.3293, "num_input_tokens_seen": 41396480, "step": 19210 }, { "epoch": 3.5263351073591487, "grad_norm": 4.166468620300293, "learning_rate": 9.823672188588312e-06, "loss": 0.2845, "num_input_tokens_seen": 41407008, "step": 19215 }, { "epoch": 3.5272527069187007, "grad_norm": 2.898132085800171, "learning_rate": 9.823461347455761e-06, "loss": 0.3686, "num_input_tokens_seen": 41418240, "step": 19220 }, { "epoch": 3.528170306478253, "grad_norm": 4.35529088973999, "learning_rate": 9.823250382608905e-06, "loss": 0.2653, "num_input_tokens_seen": 41430368, "step": 19225 }, { "epoch": 3.5290879060378053, "grad_norm": 2.190293550491333, "learning_rate": 9.823039294053152e-06, "loss": 0.3978, "num_input_tokens_seen": 41441568, "step": 19230 }, { "epoch": 3.5300055055973574, "grad_norm": 2.7339625358581543, "learning_rate": 9.822828081793913e-06, "loss": 0.2482, "num_input_tokens_seen": 41452672, "step": 19235 }, { "epoch": 3.5309231051569094, "grad_norm": 1.6409912109375, "learning_rate": 9.822616745836613e-06, "loss": 0.348, "num_input_tokens_seen": 41463392, "step": 19240 }, { "epoch": 3.531840704716462, "grad_norm": 4.712111949920654, "learning_rate": 9.822405286186664e-06, "loss": 0.4478, "num_input_tokens_seen": 41475168, "step": 19245 }, { "epoch": 3.532758304276014, "grad_norm": 2.1532793045043945, "learning_rate": 9.822193702849496e-06, "loss": 0.1961, "num_input_tokens_seen": 41485568, "step": 19250 }, { "epoch": 3.533675903835566, "grad_norm": 2.965811252593994, "learning_rate": 9.821981995830532e-06, "loss": 0.2957, "num_input_tokens_seen": 41496160, "step": 19255 }, { "epoch": 3.5345935033951186, "grad_norm": 2.7280590534210205, "learning_rate": 9.821770165135203e-06, "loss": 0.3262, "num_input_tokens_seen": 41507552, "step": 19260 }, { "epoch": 3.5355111029546706, "grad_norm": 9.421355247497559, "learning_rate": 9.821558210768942e-06, "loss": 0.3017, "num_input_tokens_seen": 41519008, "step": 19265 }, { "epoch": 3.5364287025142227, "grad_norm": 5.524330139160156, "learning_rate": 9.821346132737188e-06, "loss": 0.2254, "num_input_tokens_seen": 41530880, "step": 19270 }, { "epoch": 3.537346302073775, "grad_norm": 6.06604528427124, "learning_rate": 9.821133931045375e-06, "loss": 0.3783, "num_input_tokens_seen": 41541760, "step": 19275 }, { "epoch": 3.5382639016333273, "grad_norm": 6.003017425537109, "learning_rate": 9.820921605698951e-06, "loss": 0.2823, "num_input_tokens_seen": 41552384, "step": 19280 }, { "epoch": 3.5391815011928793, "grad_norm": 4.634954452514648, "learning_rate": 9.820709156703359e-06, "loss": 0.2807, "num_input_tokens_seen": 41562624, "step": 19285 }, { "epoch": 3.540099100752432, "grad_norm": 12.49707317352295, "learning_rate": 9.820496584064048e-06, "loss": 0.269, "num_input_tokens_seen": 41574880, "step": 19290 }, { "epoch": 3.541016700311984, "grad_norm": 5.804086685180664, "learning_rate": 9.820283887786472e-06, "loss": 0.4421, "num_input_tokens_seen": 41585248, "step": 19295 }, { "epoch": 3.541934299871536, "grad_norm": 6.519284248352051, "learning_rate": 9.820071067876084e-06, "loss": 0.3166, "num_input_tokens_seen": 41596736, "step": 19300 }, { "epoch": 3.5428518994310885, "grad_norm": 16.932090759277344, "learning_rate": 9.819858124338344e-06, "loss": 0.35, "num_input_tokens_seen": 41606560, "step": 19305 }, { "epoch": 3.5437694989906405, "grad_norm": 1.8745628595352173, "learning_rate": 9.819645057178713e-06, "loss": 0.3281, "num_input_tokens_seen": 41619008, "step": 19310 }, { "epoch": 3.5446870985501926, "grad_norm": 4.624971866607666, "learning_rate": 9.819431866402655e-06, "loss": 0.2986, "num_input_tokens_seen": 41629600, "step": 19315 }, { "epoch": 3.545604698109745, "grad_norm": 4.7505693435668945, "learning_rate": 9.819218552015639e-06, "loss": 0.2773, "num_input_tokens_seen": 41641152, "step": 19320 }, { "epoch": 3.546522297669297, "grad_norm": 5.4962663650512695, "learning_rate": 9.819005114023138e-06, "loss": 0.357, "num_input_tokens_seen": 41652352, "step": 19325 }, { "epoch": 3.5474398972288492, "grad_norm": 3.7385504245758057, "learning_rate": 9.818791552430625e-06, "loss": 0.2967, "num_input_tokens_seen": 41661632, "step": 19330 }, { "epoch": 3.5483574967884017, "grad_norm": 4.288638114929199, "learning_rate": 9.818577867243575e-06, "loss": 0.3246, "num_input_tokens_seen": 41671904, "step": 19335 }, { "epoch": 3.549275096347954, "grad_norm": 6.720065593719482, "learning_rate": 9.818364058467471e-06, "loss": 0.3492, "num_input_tokens_seen": 41683456, "step": 19340 }, { "epoch": 3.550192695907506, "grad_norm": 4.184709548950195, "learning_rate": 9.818150126107798e-06, "loss": 0.3375, "num_input_tokens_seen": 41695200, "step": 19345 }, { "epoch": 3.5511102954670584, "grad_norm": 4.7512664794921875, "learning_rate": 9.817936070170042e-06, "loss": 0.3058, "num_input_tokens_seen": 41705088, "step": 19350 }, { "epoch": 3.5520278950266104, "grad_norm": 1.643864631652832, "learning_rate": 9.817721890659691e-06, "loss": 0.3169, "num_input_tokens_seen": 41716096, "step": 19355 }, { "epoch": 3.5529454945861625, "grad_norm": 3.2884647846221924, "learning_rate": 9.817507587582242e-06, "loss": 0.2531, "num_input_tokens_seen": 41728352, "step": 19360 }, { "epoch": 3.553863094145715, "grad_norm": 5.289462566375732, "learning_rate": 9.81729316094319e-06, "loss": 0.3006, "num_input_tokens_seen": 41740896, "step": 19365 }, { "epoch": 3.554780693705267, "grad_norm": 3.070279598236084, "learning_rate": 9.817078610748034e-06, "loss": 0.2625, "num_input_tokens_seen": 41752736, "step": 19370 }, { "epoch": 3.555698293264819, "grad_norm": 9.129317283630371, "learning_rate": 9.816863937002276e-06, "loss": 0.2193, "num_input_tokens_seen": 41763392, "step": 19375 }, { "epoch": 3.5566158928243716, "grad_norm": 1.9931696653366089, "learning_rate": 9.816649139711424e-06, "loss": 0.3301, "num_input_tokens_seen": 41775552, "step": 19380 }, { "epoch": 3.5575334923839237, "grad_norm": 9.529406547546387, "learning_rate": 9.816434218880989e-06, "loss": 0.3504, "num_input_tokens_seen": 41787200, "step": 19385 }, { "epoch": 3.5584510919434758, "grad_norm": 2.490800619125366, "learning_rate": 9.81621917451648e-06, "loss": 0.3111, "num_input_tokens_seen": 41797696, "step": 19390 }, { "epoch": 3.5593686915030283, "grad_norm": 2.653444528579712, "learning_rate": 9.816004006623411e-06, "loss": 0.3699, "num_input_tokens_seen": 41808672, "step": 19395 }, { "epoch": 3.5602862910625803, "grad_norm": 7.454522132873535, "learning_rate": 9.815788715207306e-06, "loss": 0.3137, "num_input_tokens_seen": 41819968, "step": 19400 }, { "epoch": 3.5612038906221324, "grad_norm": 1.537583827972412, "learning_rate": 9.815573300273684e-06, "loss": 0.2366, "num_input_tokens_seen": 41831840, "step": 19405 }, { "epoch": 3.562121490181685, "grad_norm": 1.3763477802276611, "learning_rate": 9.81535776182807e-06, "loss": 0.152, "num_input_tokens_seen": 41841344, "step": 19410 }, { "epoch": 3.563039089741237, "grad_norm": 2.0903420448303223, "learning_rate": 9.815142099875994e-06, "loss": 0.2223, "num_input_tokens_seen": 41852288, "step": 19415 }, { "epoch": 3.563956689300789, "grad_norm": 6.07159948348999, "learning_rate": 9.814926314422983e-06, "loss": 0.3673, "num_input_tokens_seen": 41864288, "step": 19420 }, { "epoch": 3.5648742888603415, "grad_norm": 2.240333080291748, "learning_rate": 9.814710405474577e-06, "loss": 0.3989, "num_input_tokens_seen": 41875680, "step": 19425 }, { "epoch": 3.5657918884198936, "grad_norm": 7.709094524383545, "learning_rate": 9.81449437303631e-06, "loss": 0.2906, "num_input_tokens_seen": 41886624, "step": 19430 }, { "epoch": 3.5667094879794456, "grad_norm": 2.8001949787139893, "learning_rate": 9.814278217113725e-06, "loss": 0.2717, "num_input_tokens_seen": 41896896, "step": 19435 }, { "epoch": 3.567627087538998, "grad_norm": 3.013634443283081, "learning_rate": 9.814061937712364e-06, "loss": 0.3723, "num_input_tokens_seen": 41908064, "step": 19440 }, { "epoch": 3.56854468709855, "grad_norm": 7.625001430511475, "learning_rate": 9.813845534837776e-06, "loss": 0.3319, "num_input_tokens_seen": 41916192, "step": 19445 }, { "epoch": 3.5694622866581023, "grad_norm": 4.644966125488281, "learning_rate": 9.813629008495511e-06, "loss": 0.2944, "num_input_tokens_seen": 41926528, "step": 19450 }, { "epoch": 3.570379886217655, "grad_norm": 22.585424423217773, "learning_rate": 9.813412358691122e-06, "loss": 0.2765, "num_input_tokens_seen": 41937408, "step": 19455 }, { "epoch": 3.571297485777207, "grad_norm": 7.0080156326293945, "learning_rate": 9.813195585430166e-06, "loss": 0.4063, "num_input_tokens_seen": 41948736, "step": 19460 }, { "epoch": 3.572215085336759, "grad_norm": 1.464306354522705, "learning_rate": 9.812978688718204e-06, "loss": 0.2124, "num_input_tokens_seen": 41959072, "step": 19465 }, { "epoch": 3.5731326848963114, "grad_norm": 8.691486358642578, "learning_rate": 9.812761668560797e-06, "loss": 0.3274, "num_input_tokens_seen": 41970400, "step": 19470 }, { "epoch": 3.5740502844558635, "grad_norm": 7.637323379516602, "learning_rate": 9.812544524963512e-06, "loss": 0.2025, "num_input_tokens_seen": 41981312, "step": 19475 }, { "epoch": 3.5749678840154155, "grad_norm": 3.808619499206543, "learning_rate": 9.81232725793192e-06, "loss": 0.1407, "num_input_tokens_seen": 41991776, "step": 19480 }, { "epoch": 3.575885483574968, "grad_norm": 4.671675205230713, "learning_rate": 9.812109867471591e-06, "loss": 0.2918, "num_input_tokens_seen": 42002304, "step": 19485 }, { "epoch": 3.57680308313452, "grad_norm": 5.759401798248291, "learning_rate": 9.811892353588103e-06, "loss": 0.3004, "num_input_tokens_seen": 42013792, "step": 19490 }, { "epoch": 3.577720682694072, "grad_norm": 11.223628044128418, "learning_rate": 9.811674716287034e-06, "loss": 0.36, "num_input_tokens_seen": 42025312, "step": 19495 }, { "epoch": 3.5786382822536247, "grad_norm": 22.564878463745117, "learning_rate": 9.811456955573965e-06, "loss": 0.4124, "num_input_tokens_seen": 42036416, "step": 19500 }, { "epoch": 3.5795558818131767, "grad_norm": 12.265878677368164, "learning_rate": 9.811239071454483e-06, "loss": 0.3576, "num_input_tokens_seen": 42047008, "step": 19505 }, { "epoch": 3.580473481372729, "grad_norm": 7.5902628898620605, "learning_rate": 9.811021063934174e-06, "loss": 0.3183, "num_input_tokens_seen": 42059200, "step": 19510 }, { "epoch": 3.5813910809322813, "grad_norm": 2.9687013626098633, "learning_rate": 9.810802933018634e-06, "loss": 0.2623, "num_input_tokens_seen": 42070976, "step": 19515 }, { "epoch": 3.5823086804918334, "grad_norm": 11.422389030456543, "learning_rate": 9.810584678713454e-06, "loss": 0.3545, "num_input_tokens_seen": 42082112, "step": 19520 }, { "epoch": 3.5832262800513854, "grad_norm": 5.397528171539307, "learning_rate": 9.81036630102423e-06, "loss": 0.1767, "num_input_tokens_seen": 42093088, "step": 19525 }, { "epoch": 3.584143879610938, "grad_norm": 6.573503494262695, "learning_rate": 9.810147799956568e-06, "loss": 0.3345, "num_input_tokens_seen": 42104480, "step": 19530 }, { "epoch": 3.58506147917049, "grad_norm": 7.281611919403076, "learning_rate": 9.80992917551607e-06, "loss": 0.249, "num_input_tokens_seen": 42115456, "step": 19535 }, { "epoch": 3.585979078730042, "grad_norm": 5.152072906494141, "learning_rate": 9.809710427708342e-06, "loss": 0.3806, "num_input_tokens_seen": 42126784, "step": 19540 }, { "epoch": 3.5868966782895946, "grad_norm": 0.985587477684021, "learning_rate": 9.809491556538999e-06, "loss": 0.212, "num_input_tokens_seen": 42136864, "step": 19545 }, { "epoch": 3.5878142778491466, "grad_norm": 9.409414291381836, "learning_rate": 9.809272562013648e-06, "loss": 0.3166, "num_input_tokens_seen": 42149280, "step": 19550 }, { "epoch": 3.5887318774086987, "grad_norm": 11.846354484558105, "learning_rate": 9.809053444137911e-06, "loss": 0.3316, "num_input_tokens_seen": 42160320, "step": 19555 }, { "epoch": 3.589649476968251, "grad_norm": 9.303396224975586, "learning_rate": 9.808834202917408e-06, "loss": 0.2784, "num_input_tokens_seen": 42171968, "step": 19560 }, { "epoch": 3.5905670765278033, "grad_norm": 2.5211195945739746, "learning_rate": 9.808614838357759e-06, "loss": 0.3966, "num_input_tokens_seen": 42182720, "step": 19565 }, { "epoch": 3.5914846760873553, "grad_norm": 2.6029438972473145, "learning_rate": 9.808395350464592e-06, "loss": 0.2155, "num_input_tokens_seen": 42194240, "step": 19570 }, { "epoch": 3.592402275646908, "grad_norm": 7.207155227661133, "learning_rate": 9.808175739243538e-06, "loss": 0.3238, "num_input_tokens_seen": 42205568, "step": 19575 }, { "epoch": 3.59331987520646, "grad_norm": 11.15566349029541, "learning_rate": 9.807956004700226e-06, "loss": 0.2905, "num_input_tokens_seen": 42216576, "step": 19580 }, { "epoch": 3.594237474766012, "grad_norm": 4.754467487335205, "learning_rate": 9.807736146840295e-06, "loss": 0.3034, "num_input_tokens_seen": 42227712, "step": 19585 }, { "epoch": 3.5951550743255645, "grad_norm": 2.198075294494629, "learning_rate": 9.807516165669385e-06, "loss": 0.3427, "num_input_tokens_seen": 42238560, "step": 19590 }, { "epoch": 3.5960726738851165, "grad_norm": 1.3211944103240967, "learning_rate": 9.807296061193134e-06, "loss": 0.2059, "num_input_tokens_seen": 42250336, "step": 19595 }, { "epoch": 3.5969902734446686, "grad_norm": 1.7334179878234863, "learning_rate": 9.80707583341719e-06, "loss": 0.2982, "num_input_tokens_seen": 42262496, "step": 19600 }, { "epoch": 3.597907873004221, "grad_norm": 3.2510814666748047, "learning_rate": 9.806855482347202e-06, "loss": 0.3473, "num_input_tokens_seen": 42273184, "step": 19605 }, { "epoch": 3.598825472563773, "grad_norm": 7.141234874725342, "learning_rate": 9.806635007988821e-06, "loss": 0.2621, "num_input_tokens_seen": 42284864, "step": 19610 }, { "epoch": 3.5997430721233252, "grad_norm": 1.9937763214111328, "learning_rate": 9.8064144103477e-06, "loss": 0.2646, "num_input_tokens_seen": 42296384, "step": 19615 }, { "epoch": 3.6006606716828777, "grad_norm": 3.534457206726074, "learning_rate": 9.8061936894295e-06, "loss": 0.3789, "num_input_tokens_seen": 42306336, "step": 19620 }, { "epoch": 3.60157827124243, "grad_norm": 2.0722734928131104, "learning_rate": 9.805972845239881e-06, "loss": 0.3546, "num_input_tokens_seen": 42317248, "step": 19625 }, { "epoch": 3.602495870801982, "grad_norm": 2.712736129760742, "learning_rate": 9.805751877784507e-06, "loss": 0.2973, "num_input_tokens_seen": 42327328, "step": 19630 }, { "epoch": 3.6034134703615344, "grad_norm": 5.4076738357543945, "learning_rate": 9.805530787069044e-06, "loss": 0.3122, "num_input_tokens_seen": 42336928, "step": 19635 }, { "epoch": 3.6043310699210864, "grad_norm": 1.143990397453308, "learning_rate": 9.805309573099165e-06, "loss": 0.3305, "num_input_tokens_seen": 42348096, "step": 19640 }, { "epoch": 3.6052486694806385, "grad_norm": 17.153873443603516, "learning_rate": 9.805088235880545e-06, "loss": 0.1835, "num_input_tokens_seen": 42358048, "step": 19645 }, { "epoch": 3.606166269040191, "grad_norm": 1.5197521448135376, "learning_rate": 9.804866775418856e-06, "loss": 0.2884, "num_input_tokens_seen": 42369184, "step": 19650 }, { "epoch": 3.607083868599743, "grad_norm": 1.4162869453430176, "learning_rate": 9.804645191719784e-06, "loss": 0.4198, "num_input_tokens_seen": 42380096, "step": 19655 }, { "epoch": 3.608001468159295, "grad_norm": 4.108301639556885, "learning_rate": 9.804423484789008e-06, "loss": 0.209, "num_input_tokens_seen": 42391296, "step": 19660 }, { "epoch": 3.6089190677188476, "grad_norm": 1.4433757066726685, "learning_rate": 9.804201654632215e-06, "loss": 0.2099, "num_input_tokens_seen": 42403328, "step": 19665 }, { "epoch": 3.6098366672783997, "grad_norm": 9.479300498962402, "learning_rate": 9.803979701255095e-06, "loss": 0.2553, "num_input_tokens_seen": 42414688, "step": 19670 }, { "epoch": 3.6107542668379518, "grad_norm": 0.4031085669994354, "learning_rate": 9.803757624663342e-06, "loss": 0.2291, "num_input_tokens_seen": 42425344, "step": 19675 }, { "epoch": 3.6116718663975043, "grad_norm": 5.581915378570557, "learning_rate": 9.80353542486265e-06, "loss": 0.5071, "num_input_tokens_seen": 42435232, "step": 19680 }, { "epoch": 3.6125894659570563, "grad_norm": 3.0870869159698486, "learning_rate": 9.803313101858723e-06, "loss": 0.3316, "num_input_tokens_seen": 42445184, "step": 19685 }, { "epoch": 3.6135070655166084, "grad_norm": 14.824840545654297, "learning_rate": 9.803090655657258e-06, "loss": 0.2116, "num_input_tokens_seen": 42455168, "step": 19690 }, { "epoch": 3.614424665076161, "grad_norm": 3.2923402786254883, "learning_rate": 9.80286808626396e-06, "loss": 0.2156, "num_input_tokens_seen": 42464960, "step": 19695 }, { "epoch": 3.615342264635713, "grad_norm": 4.267143726348877, "learning_rate": 9.802645393684539e-06, "loss": 0.2934, "num_input_tokens_seen": 42476416, "step": 19700 }, { "epoch": 3.616259864195265, "grad_norm": 3.3473567962646484, "learning_rate": 9.802422577924708e-06, "loss": 0.2297, "num_input_tokens_seen": 42486944, "step": 19705 }, { "epoch": 3.6171774637548175, "grad_norm": 12.366461753845215, "learning_rate": 9.802199638990181e-06, "loss": 0.3925, "num_input_tokens_seen": 42497824, "step": 19710 }, { "epoch": 3.6180950633143696, "grad_norm": 7.36331844329834, "learning_rate": 9.801976576886676e-06, "loss": 0.3587, "num_input_tokens_seen": 42507840, "step": 19715 }, { "epoch": 3.6190126628739216, "grad_norm": 3.8702139854431152, "learning_rate": 9.801753391619915e-06, "loss": 0.2205, "num_input_tokens_seen": 42518912, "step": 19720 }, { "epoch": 3.619930262433474, "grad_norm": 8.932354927062988, "learning_rate": 9.80153008319562e-06, "loss": 0.3826, "num_input_tokens_seen": 42528704, "step": 19725 }, { "epoch": 3.620847861993026, "grad_norm": 6.064081192016602, "learning_rate": 9.80130665161952e-06, "loss": 0.2865, "num_input_tokens_seen": 42540000, "step": 19730 }, { "epoch": 3.6217654615525783, "grad_norm": 2.7777535915374756, "learning_rate": 9.801083096897347e-06, "loss": 0.3788, "num_input_tokens_seen": 42552064, "step": 19735 }, { "epoch": 3.622683061112131, "grad_norm": 4.555933952331543, "learning_rate": 9.800859419034833e-06, "loss": 0.154, "num_input_tokens_seen": 42562912, "step": 19740 }, { "epoch": 3.623600660671683, "grad_norm": 1.255325198173523, "learning_rate": 9.800635618037717e-06, "loss": 0.1657, "num_input_tokens_seen": 42574112, "step": 19745 }, { "epoch": 3.624518260231235, "grad_norm": 0.7977313995361328, "learning_rate": 9.800411693911735e-06, "loss": 0.256, "num_input_tokens_seen": 42584960, "step": 19750 }, { "epoch": 3.6254358597907874, "grad_norm": 6.166070461273193, "learning_rate": 9.800187646662636e-06, "loss": 0.3653, "num_input_tokens_seen": 42595328, "step": 19755 }, { "epoch": 3.6263534593503395, "grad_norm": 3.9408040046691895, "learning_rate": 9.799963476296162e-06, "loss": 0.2521, "num_input_tokens_seen": 42606272, "step": 19760 }, { "epoch": 3.6272710589098915, "grad_norm": 3.7382869720458984, "learning_rate": 9.799739182818062e-06, "loss": 0.3509, "num_input_tokens_seen": 42616224, "step": 19765 }, { "epoch": 3.628188658469444, "grad_norm": 8.872807502746582, "learning_rate": 9.799514766234093e-06, "loss": 0.3041, "num_input_tokens_seen": 42626592, "step": 19770 }, { "epoch": 3.629106258028996, "grad_norm": 8.585748672485352, "learning_rate": 9.79929022655001e-06, "loss": 0.3872, "num_input_tokens_seen": 42637152, "step": 19775 }, { "epoch": 3.630023857588548, "grad_norm": 16.085521697998047, "learning_rate": 9.799065563771569e-06, "loss": 0.2416, "num_input_tokens_seen": 42648256, "step": 19780 }, { "epoch": 3.6309414571481007, "grad_norm": 4.954892635345459, "learning_rate": 9.798840777904535e-06, "loss": 0.352, "num_input_tokens_seen": 42657664, "step": 19785 }, { "epoch": 3.6318590567076527, "grad_norm": 4.400522708892822, "learning_rate": 9.798615868954672e-06, "loss": 0.2632, "num_input_tokens_seen": 42668832, "step": 19790 }, { "epoch": 3.632776656267205, "grad_norm": 7.776968002319336, "learning_rate": 9.79839083692775e-06, "loss": 0.5173, "num_input_tokens_seen": 42680640, "step": 19795 }, { "epoch": 3.6336942558267573, "grad_norm": 2.6845338344573975, "learning_rate": 9.798165681829538e-06, "loss": 0.2974, "num_input_tokens_seen": 42691360, "step": 19800 }, { "epoch": 3.6346118553863094, "grad_norm": 3.523733615875244, "learning_rate": 9.797940403665815e-06, "loss": 0.4157, "num_input_tokens_seen": 42702368, "step": 19805 }, { "epoch": 3.6355294549458614, "grad_norm": 3.421133279800415, "learning_rate": 9.797715002442356e-06, "loss": 0.2727, "num_input_tokens_seen": 42712544, "step": 19810 }, { "epoch": 3.636447054505414, "grad_norm": 2.558701753616333, "learning_rate": 9.797489478164943e-06, "loss": 0.243, "num_input_tokens_seen": 42721632, "step": 19815 }, { "epoch": 3.637364654064966, "grad_norm": 5.901526927947998, "learning_rate": 9.79726383083936e-06, "loss": 0.3575, "num_input_tokens_seen": 42732032, "step": 19820 }, { "epoch": 3.638282253624518, "grad_norm": 4.584321975708008, "learning_rate": 9.797038060471395e-06, "loss": 0.2648, "num_input_tokens_seen": 42743744, "step": 19825 }, { "epoch": 3.6391998531840706, "grad_norm": 5.166088104248047, "learning_rate": 9.796812167066837e-06, "loss": 0.2751, "num_input_tokens_seen": 42754624, "step": 19830 }, { "epoch": 3.6401174527436226, "grad_norm": 3.8685920238494873, "learning_rate": 9.796586150631485e-06, "loss": 0.4591, "num_input_tokens_seen": 42764192, "step": 19835 }, { "epoch": 3.6410350523031747, "grad_norm": 2.65244197845459, "learning_rate": 9.796360011171128e-06, "loss": 0.27, "num_input_tokens_seen": 42775264, "step": 19840 }, { "epoch": 3.641952651862727, "grad_norm": 4.013584613800049, "learning_rate": 9.796133748691575e-06, "loss": 0.2177, "num_input_tokens_seen": 42785440, "step": 19845 }, { "epoch": 3.6428702514222793, "grad_norm": 2.6578385829925537, "learning_rate": 9.79590736319862e-06, "loss": 0.3265, "num_input_tokens_seen": 42796128, "step": 19850 }, { "epoch": 3.6437878509818313, "grad_norm": 4.679131984710693, "learning_rate": 9.795680854698077e-06, "loss": 0.3411, "num_input_tokens_seen": 42807232, "step": 19855 }, { "epoch": 3.644705450541384, "grad_norm": 2.0584871768951416, "learning_rate": 9.795454223195752e-06, "loss": 0.348, "num_input_tokens_seen": 42818528, "step": 19860 }, { "epoch": 3.645623050100936, "grad_norm": 2.437382459640503, "learning_rate": 9.795227468697458e-06, "loss": 0.3549, "num_input_tokens_seen": 42828320, "step": 19865 }, { "epoch": 3.646540649660488, "grad_norm": 3.0101590156555176, "learning_rate": 9.795000591209013e-06, "loss": 0.2853, "num_input_tokens_seen": 42839200, "step": 19870 }, { "epoch": 3.6474582492200405, "grad_norm": 4.459094524383545, "learning_rate": 9.794773590736233e-06, "loss": 0.3729, "num_input_tokens_seen": 42848192, "step": 19875 }, { "epoch": 3.6483758487795925, "grad_norm": 10.669719696044922, "learning_rate": 9.794546467284941e-06, "loss": 0.3926, "num_input_tokens_seen": 42859040, "step": 19880 }, { "epoch": 3.6492934483391446, "grad_norm": 4.03959846496582, "learning_rate": 9.794319220860963e-06, "loss": 0.2669, "num_input_tokens_seen": 42870336, "step": 19885 }, { "epoch": 3.650211047898697, "grad_norm": 6.065758228302002, "learning_rate": 9.79409185147013e-06, "loss": 0.2782, "num_input_tokens_seen": 42881696, "step": 19890 }, { "epoch": 3.651128647458249, "grad_norm": 1.7431279420852661, "learning_rate": 9.793864359118267e-06, "loss": 0.2209, "num_input_tokens_seen": 42893440, "step": 19895 }, { "epoch": 3.6520462470178012, "grad_norm": 3.0877883434295654, "learning_rate": 9.793636743811218e-06, "loss": 0.3369, "num_input_tokens_seen": 42904768, "step": 19900 }, { "epoch": 3.6529638465773537, "grad_norm": 1.767635703086853, "learning_rate": 9.793409005554813e-06, "loss": 0.4133, "num_input_tokens_seen": 42914112, "step": 19905 }, { "epoch": 3.653881446136906, "grad_norm": 1.757766842842102, "learning_rate": 9.793181144354895e-06, "loss": 0.2738, "num_input_tokens_seen": 42925696, "step": 19910 }, { "epoch": 3.654799045696458, "grad_norm": 1.7002391815185547, "learning_rate": 9.792953160217311e-06, "loss": 0.4499, "num_input_tokens_seen": 42936928, "step": 19915 }, { "epoch": 3.6557166452560104, "grad_norm": 3.795381546020508, "learning_rate": 9.792725053147908e-06, "loss": 0.3177, "num_input_tokens_seen": 42948288, "step": 19920 }, { "epoch": 3.6566342448155624, "grad_norm": 1.5892606973648071, "learning_rate": 9.792496823152534e-06, "loss": 0.3503, "num_input_tokens_seen": 42959392, "step": 19925 }, { "epoch": 3.6575518443751145, "grad_norm": 1.3751128911972046, "learning_rate": 9.792268470237046e-06, "loss": 0.3078, "num_input_tokens_seen": 42970496, "step": 19930 }, { "epoch": 3.658469443934667, "grad_norm": 2.9301226139068604, "learning_rate": 9.792039994407297e-06, "loss": 0.2133, "num_input_tokens_seen": 42980960, "step": 19935 }, { "epoch": 3.659387043494219, "grad_norm": 1.5998941659927368, "learning_rate": 9.79181139566915e-06, "loss": 0.2878, "num_input_tokens_seen": 42990528, "step": 19940 }, { "epoch": 3.660304643053771, "grad_norm": 2.8752501010894775, "learning_rate": 9.791582674028465e-06, "loss": 0.2615, "num_input_tokens_seen": 43002592, "step": 19945 }, { "epoch": 3.6612222426133236, "grad_norm": 1.6469342708587646, "learning_rate": 9.791353829491112e-06, "loss": 0.2898, "num_input_tokens_seen": 43013472, "step": 19950 }, { "epoch": 3.6621398421728757, "grad_norm": 3.15549898147583, "learning_rate": 9.791124862062962e-06, "loss": 0.44, "num_input_tokens_seen": 43025376, "step": 19955 }, { "epoch": 3.6630574417324278, "grad_norm": 10.186911582946777, "learning_rate": 9.790895771749881e-06, "loss": 0.3674, "num_input_tokens_seen": 43036416, "step": 19960 }, { "epoch": 3.6639750412919803, "grad_norm": 3.1632726192474365, "learning_rate": 9.79066655855775e-06, "loss": 0.281, "num_input_tokens_seen": 43046624, "step": 19965 }, { "epoch": 3.6648926408515323, "grad_norm": 3.7234697341918945, "learning_rate": 9.790437222492448e-06, "loss": 0.2007, "num_input_tokens_seen": 43057376, "step": 19970 }, { "epoch": 3.6658102404110844, "grad_norm": 3.074831008911133, "learning_rate": 9.790207763559855e-06, "loss": 0.2229, "num_input_tokens_seen": 43068224, "step": 19975 }, { "epoch": 3.666727839970637, "grad_norm": 2.996049404144287, "learning_rate": 9.789978181765857e-06, "loss": 0.2601, "num_input_tokens_seen": 43079584, "step": 19980 }, { "epoch": 3.667645439530189, "grad_norm": 12.947614669799805, "learning_rate": 9.789748477116343e-06, "loss": 0.3269, "num_input_tokens_seen": 43091712, "step": 19985 }, { "epoch": 3.668563039089741, "grad_norm": 4.35408353805542, "learning_rate": 9.789518649617202e-06, "loss": 0.32, "num_input_tokens_seen": 43102624, "step": 19990 }, { "epoch": 3.6694806386492935, "grad_norm": 2.613781452178955, "learning_rate": 9.789288699274333e-06, "loss": 0.2159, "num_input_tokens_seen": 43112288, "step": 19995 }, { "epoch": 3.6703982382088456, "grad_norm": 4.111056804656982, "learning_rate": 9.78905862609363e-06, "loss": 0.5474, "num_input_tokens_seen": 43123776, "step": 20000 }, { "epoch": 3.6713158377683976, "grad_norm": 2.068416118621826, "learning_rate": 9.788828430080996e-06, "loss": 0.2143, "num_input_tokens_seen": 43133952, "step": 20005 }, { "epoch": 3.67223343732795, "grad_norm": 0.6558953523635864, "learning_rate": 9.788598111242335e-06, "loss": 0.2149, "num_input_tokens_seen": 43145120, "step": 20010 }, { "epoch": 3.673151036887502, "grad_norm": 19.460493087768555, "learning_rate": 9.788367669583554e-06, "loss": 0.3953, "num_input_tokens_seen": 43154912, "step": 20015 }, { "epoch": 3.6740686364470543, "grad_norm": 14.07957649230957, "learning_rate": 9.788137105110565e-06, "loss": 0.2985, "num_input_tokens_seen": 43165056, "step": 20020 }, { "epoch": 3.674986236006607, "grad_norm": 3.2279257774353027, "learning_rate": 9.787906417829279e-06, "loss": 0.3418, "num_input_tokens_seen": 43175680, "step": 20025 }, { "epoch": 3.675903835566159, "grad_norm": 7.155717372894287, "learning_rate": 9.787675607745612e-06, "loss": 0.2707, "num_input_tokens_seen": 43187136, "step": 20030 }, { "epoch": 3.676821435125711, "grad_norm": 6.55573034286499, "learning_rate": 9.787444674865487e-06, "loss": 0.4411, "num_input_tokens_seen": 43198528, "step": 20035 }, { "epoch": 3.6777390346852634, "grad_norm": 3.481987237930298, "learning_rate": 9.787213619194827e-06, "loss": 0.2188, "num_input_tokens_seen": 43209408, "step": 20040 }, { "epoch": 3.6786566342448155, "grad_norm": 0.6634702086448669, "learning_rate": 9.786982440739557e-06, "loss": 0.2382, "num_input_tokens_seen": 43220032, "step": 20045 }, { "epoch": 3.6795742338043675, "grad_norm": 2.2264063358306885, "learning_rate": 9.786751139505607e-06, "loss": 0.2925, "num_input_tokens_seen": 43230784, "step": 20050 }, { "epoch": 3.68049183336392, "grad_norm": 7.994165897369385, "learning_rate": 9.786519715498907e-06, "loss": 0.1294, "num_input_tokens_seen": 43242176, "step": 20055 }, { "epoch": 3.681409432923472, "grad_norm": 8.798029899597168, "learning_rate": 9.786288168725397e-06, "loss": 0.294, "num_input_tokens_seen": 43252416, "step": 20060 }, { "epoch": 3.682327032483024, "grad_norm": 18.506662368774414, "learning_rate": 9.786056499191013e-06, "loss": 0.1907, "num_input_tokens_seen": 43262656, "step": 20065 }, { "epoch": 3.6832446320425767, "grad_norm": 2.3603339195251465, "learning_rate": 9.785824706901696e-06, "loss": 0.5431, "num_input_tokens_seen": 43272800, "step": 20070 }, { "epoch": 3.6841622316021287, "grad_norm": 1.2343814373016357, "learning_rate": 9.785592791863394e-06, "loss": 0.2853, "num_input_tokens_seen": 43284256, "step": 20075 }, { "epoch": 3.685079831161681, "grad_norm": 4.995569705963135, "learning_rate": 9.785360754082054e-06, "loss": 0.3029, "num_input_tokens_seen": 43295040, "step": 20080 }, { "epoch": 3.6859974307212333, "grad_norm": 9.002760887145996, "learning_rate": 9.785128593563627e-06, "loss": 0.326, "num_input_tokens_seen": 43305696, "step": 20085 }, { "epoch": 3.6869150302807854, "grad_norm": 4.887593746185303, "learning_rate": 9.784896310314068e-06, "loss": 0.3391, "num_input_tokens_seen": 43316448, "step": 20090 }, { "epoch": 3.6878326298403374, "grad_norm": 18.40029525756836, "learning_rate": 9.784663904339336e-06, "loss": 0.293, "num_input_tokens_seen": 43327104, "step": 20095 }, { "epoch": 3.68875022939989, "grad_norm": 2.373283624649048, "learning_rate": 9.784431375645387e-06, "loss": 0.191, "num_input_tokens_seen": 43338016, "step": 20100 }, { "epoch": 3.689667828959442, "grad_norm": 1.9772608280181885, "learning_rate": 9.784198724238191e-06, "loss": 0.3966, "num_input_tokens_seen": 43348672, "step": 20105 }, { "epoch": 3.690585428518994, "grad_norm": 3.9254136085510254, "learning_rate": 9.783965950123712e-06, "loss": 0.2243, "num_input_tokens_seen": 43361376, "step": 20110 }, { "epoch": 3.6915030280785466, "grad_norm": 12.050542831420898, "learning_rate": 9.78373305330792e-06, "loss": 0.363, "num_input_tokens_seen": 43372928, "step": 20115 }, { "epoch": 3.6924206276380986, "grad_norm": 7.958895683288574, "learning_rate": 9.78350003379679e-06, "loss": 0.2041, "num_input_tokens_seen": 43383744, "step": 20120 }, { "epoch": 3.6933382271976507, "grad_norm": 8.570326805114746, "learning_rate": 9.783266891596298e-06, "loss": 0.2415, "num_input_tokens_seen": 43394688, "step": 20125 }, { "epoch": 3.694255826757203, "grad_norm": 6.517988204956055, "learning_rate": 9.783033626712423e-06, "loss": 0.2058, "num_input_tokens_seen": 43404928, "step": 20130 }, { "epoch": 3.6951734263167553, "grad_norm": 7.598762035369873, "learning_rate": 9.782800239151149e-06, "loss": 0.3014, "num_input_tokens_seen": 43415040, "step": 20135 }, { "epoch": 3.6960910258763073, "grad_norm": 7.937109470367432, "learning_rate": 9.78256672891846e-06, "loss": 0.2243, "num_input_tokens_seen": 43426400, "step": 20140 }, { "epoch": 3.69700862543586, "grad_norm": 1.285715937614441, "learning_rate": 9.78233309602035e-06, "loss": 0.2523, "num_input_tokens_seen": 43436288, "step": 20145 }, { "epoch": 3.697926224995412, "grad_norm": 0.9811822772026062, "learning_rate": 9.782099340462806e-06, "loss": 0.2294, "num_input_tokens_seen": 43447136, "step": 20150 }, { "epoch": 3.698843824554964, "grad_norm": 1.2185100317001343, "learning_rate": 9.781865462251827e-06, "loss": 0.3664, "num_input_tokens_seen": 43457472, "step": 20155 }, { "epoch": 3.6997614241145165, "grad_norm": 7.145819664001465, "learning_rate": 9.781631461393408e-06, "loss": 0.4276, "num_input_tokens_seen": 43467264, "step": 20160 }, { "epoch": 3.7006790236740685, "grad_norm": 6.651144027709961, "learning_rate": 9.781397337893553e-06, "loss": 0.2922, "num_input_tokens_seen": 43478176, "step": 20165 }, { "epoch": 3.7015966232336206, "grad_norm": 3.8184523582458496, "learning_rate": 9.781163091758269e-06, "loss": 0.2423, "num_input_tokens_seen": 43488672, "step": 20170 }, { "epoch": 3.702514222793173, "grad_norm": 5.6471428871154785, "learning_rate": 9.780928722993559e-06, "loss": 0.2751, "num_input_tokens_seen": 43500192, "step": 20175 }, { "epoch": 3.703431822352725, "grad_norm": 5.2746734619140625, "learning_rate": 9.780694231605438e-06, "loss": 0.3668, "num_input_tokens_seen": 43511072, "step": 20180 }, { "epoch": 3.7043494219122772, "grad_norm": 1.688828706741333, "learning_rate": 9.78045961759992e-06, "loss": 0.3435, "num_input_tokens_seen": 43521824, "step": 20185 }, { "epoch": 3.7052670214718297, "grad_norm": 3.027998447418213, "learning_rate": 9.780224880983023e-06, "loss": 0.3652, "num_input_tokens_seen": 43533312, "step": 20190 }, { "epoch": 3.706184621031382, "grad_norm": 4.02042293548584, "learning_rate": 9.779990021760763e-06, "loss": 0.3361, "num_input_tokens_seen": 43543584, "step": 20195 }, { "epoch": 3.707102220590934, "grad_norm": 3.161870002746582, "learning_rate": 9.77975503993917e-06, "loss": 0.2851, "num_input_tokens_seen": 43555104, "step": 20200 }, { "epoch": 3.7080198201504864, "grad_norm": 6.539317607879639, "learning_rate": 9.779519935524267e-06, "loss": 0.2209, "num_input_tokens_seen": 43565696, "step": 20205 }, { "epoch": 3.7089374197100384, "grad_norm": 3.050461769104004, "learning_rate": 9.779284708522085e-06, "loss": 0.2228, "num_input_tokens_seen": 43576256, "step": 20210 }, { "epoch": 3.7098550192695905, "grad_norm": 7.492711067199707, "learning_rate": 9.77904935893866e-06, "loss": 0.2525, "num_input_tokens_seen": 43587648, "step": 20215 }, { "epoch": 3.710772618829143, "grad_norm": 3.282263994216919, "learning_rate": 9.778813886780023e-06, "loss": 0.3912, "num_input_tokens_seen": 43597696, "step": 20220 }, { "epoch": 3.711690218388695, "grad_norm": 2.073294162750244, "learning_rate": 9.778578292052218e-06, "loss": 0.2969, "num_input_tokens_seen": 43609088, "step": 20225 }, { "epoch": 3.712607817948247, "grad_norm": 1.9373717308044434, "learning_rate": 9.778342574761285e-06, "loss": 0.2586, "num_input_tokens_seen": 43619968, "step": 20230 }, { "epoch": 3.7135254175077996, "grad_norm": 11.750436782836914, "learning_rate": 9.778106734913271e-06, "loss": 0.3009, "num_input_tokens_seen": 43632736, "step": 20235 }, { "epoch": 3.7144430170673517, "grad_norm": 2.9512741565704346, "learning_rate": 9.777870772514224e-06, "loss": 0.3266, "num_input_tokens_seen": 43643840, "step": 20240 }, { "epoch": 3.7153606166269038, "grad_norm": 1.2679246664047241, "learning_rate": 9.777634687570197e-06, "loss": 0.233, "num_input_tokens_seen": 43655008, "step": 20245 }, { "epoch": 3.7162782161864563, "grad_norm": 2.7008464336395264, "learning_rate": 9.777398480087246e-06, "loss": 0.3504, "num_input_tokens_seen": 43666304, "step": 20250 }, { "epoch": 3.7171958157460083, "grad_norm": 14.04004192352295, "learning_rate": 9.777162150071427e-06, "loss": 0.1515, "num_input_tokens_seen": 43676992, "step": 20255 }, { "epoch": 3.7181134153055604, "grad_norm": 2.591383695602417, "learning_rate": 9.776925697528803e-06, "loss": 0.2285, "num_input_tokens_seen": 43688544, "step": 20260 }, { "epoch": 3.719031014865113, "grad_norm": 9.610870361328125, "learning_rate": 9.776689122465439e-06, "loss": 0.228, "num_input_tokens_seen": 43698848, "step": 20265 }, { "epoch": 3.719948614424665, "grad_norm": 2.196202278137207, "learning_rate": 9.776452424887402e-06, "loss": 0.225, "num_input_tokens_seen": 43709024, "step": 20270 }, { "epoch": 3.720866213984217, "grad_norm": 3.7950119972229004, "learning_rate": 9.776215604800763e-06, "loss": 0.2417, "num_input_tokens_seen": 43720064, "step": 20275 }, { "epoch": 3.7217838135437695, "grad_norm": 2.0570690631866455, "learning_rate": 9.775978662211596e-06, "loss": 0.2454, "num_input_tokens_seen": 43731552, "step": 20280 }, { "epoch": 3.7227014131033216, "grad_norm": 5.774537086486816, "learning_rate": 9.775741597125979e-06, "loss": 0.328, "num_input_tokens_seen": 43743200, "step": 20285 }, { "epoch": 3.7236190126628737, "grad_norm": 8.365900993347168, "learning_rate": 9.77550440954999e-06, "loss": 0.3303, "num_input_tokens_seen": 43754432, "step": 20290 }, { "epoch": 3.724536612222426, "grad_norm": 3.4212779998779297, "learning_rate": 9.775267099489716e-06, "loss": 0.2476, "num_input_tokens_seen": 43763456, "step": 20295 }, { "epoch": 3.725454211781978, "grad_norm": 2.843214988708496, "learning_rate": 9.775029666951242e-06, "loss": 0.3422, "num_input_tokens_seen": 43775360, "step": 20300 }, { "epoch": 3.7263718113415303, "grad_norm": 4.310157775878906, "learning_rate": 9.774792111940657e-06, "loss": 0.3203, "num_input_tokens_seen": 43786816, "step": 20305 }, { "epoch": 3.727289410901083, "grad_norm": 1.3794313669204712, "learning_rate": 9.774554434464055e-06, "loss": 0.2138, "num_input_tokens_seen": 43798112, "step": 20310 }, { "epoch": 3.728207010460635, "grad_norm": 3.640296459197998, "learning_rate": 9.774316634527532e-06, "loss": 0.2328, "num_input_tokens_seen": 43807808, "step": 20315 }, { "epoch": 3.729124610020187, "grad_norm": 3.9176366329193115, "learning_rate": 9.774078712137185e-06, "loss": 0.5338, "num_input_tokens_seen": 43817184, "step": 20320 }, { "epoch": 3.7300422095797394, "grad_norm": 6.758996486663818, "learning_rate": 9.77384066729912e-06, "loss": 0.314, "num_input_tokens_seen": 43828032, "step": 20325 }, { "epoch": 3.7309598091392915, "grad_norm": 8.145447731018066, "learning_rate": 9.77360250001944e-06, "loss": 0.329, "num_input_tokens_seen": 43839232, "step": 20330 }, { "epoch": 3.731877408698844, "grad_norm": 1.6854419708251953, "learning_rate": 9.773364210304254e-06, "loss": 0.3318, "num_input_tokens_seen": 43849408, "step": 20335 }, { "epoch": 3.732795008258396, "grad_norm": 2.6778063774108887, "learning_rate": 9.773125798159674e-06, "loss": 0.4109, "num_input_tokens_seen": 43861216, "step": 20340 }, { "epoch": 3.733712607817948, "grad_norm": 3.130941390991211, "learning_rate": 9.772887263591817e-06, "loss": 0.3552, "num_input_tokens_seen": 43871840, "step": 20345 }, { "epoch": 3.7346302073775006, "grad_norm": 1.035222053527832, "learning_rate": 9.772648606606796e-06, "loss": 0.3056, "num_input_tokens_seen": 43881664, "step": 20350 }, { "epoch": 3.7355478069370527, "grad_norm": 3.4857914447784424, "learning_rate": 9.772409827210738e-06, "loss": 0.3297, "num_input_tokens_seen": 43893408, "step": 20355 }, { "epoch": 3.7364654064966047, "grad_norm": 1.3342630863189697, "learning_rate": 9.772170925409764e-06, "loss": 0.1718, "num_input_tokens_seen": 43903328, "step": 20360 }, { "epoch": 3.7373830060561573, "grad_norm": 1.7876131534576416, "learning_rate": 9.771931901209998e-06, "loss": 0.3557, "num_input_tokens_seen": 43914176, "step": 20365 }, { "epoch": 3.7383006056157093, "grad_norm": 5.995250225067139, "learning_rate": 9.771692754617578e-06, "loss": 0.3421, "num_input_tokens_seen": 43925088, "step": 20370 }, { "epoch": 3.7392182051752614, "grad_norm": 2.250669479370117, "learning_rate": 9.771453485638635e-06, "loss": 0.3776, "num_input_tokens_seen": 43936096, "step": 20375 }, { "epoch": 3.740135804734814, "grad_norm": 0.8642157316207886, "learning_rate": 9.771214094279304e-06, "loss": 0.1907, "num_input_tokens_seen": 43946464, "step": 20380 }, { "epoch": 3.741053404294366, "grad_norm": 1.1748555898666382, "learning_rate": 9.770974580545727e-06, "loss": 0.3026, "num_input_tokens_seen": 43957280, "step": 20385 }, { "epoch": 3.741971003853918, "grad_norm": 2.252316474914551, "learning_rate": 9.770734944444044e-06, "loss": 0.2297, "num_input_tokens_seen": 43968736, "step": 20390 }, { "epoch": 3.7428886034134705, "grad_norm": 4.752324104309082, "learning_rate": 9.770495185980407e-06, "loss": 0.2913, "num_input_tokens_seen": 43980288, "step": 20395 }, { "epoch": 3.7438062029730226, "grad_norm": 1.6769888401031494, "learning_rate": 9.77025530516096e-06, "loss": 0.3438, "num_input_tokens_seen": 43990560, "step": 20400 }, { "epoch": 3.7447238025325746, "grad_norm": 1.7104833126068115, "learning_rate": 9.770015301991858e-06, "loss": 0.2369, "num_input_tokens_seen": 44001536, "step": 20405 }, { "epoch": 3.745641402092127, "grad_norm": 2.529029369354248, "learning_rate": 9.769775176479256e-06, "loss": 0.2768, "num_input_tokens_seen": 44011808, "step": 20410 }, { "epoch": 3.746559001651679, "grad_norm": 12.021599769592285, "learning_rate": 9.769534928629313e-06, "loss": 0.4039, "num_input_tokens_seen": 44022560, "step": 20415 }, { "epoch": 3.7474766012112313, "grad_norm": 2.2933080196380615, "learning_rate": 9.769294558448192e-06, "loss": 0.3427, "num_input_tokens_seen": 44034592, "step": 20420 }, { "epoch": 3.748394200770784, "grad_norm": 1.9355493783950806, "learning_rate": 9.769054065942056e-06, "loss": 0.2507, "num_input_tokens_seen": 44044512, "step": 20425 }, { "epoch": 3.749311800330336, "grad_norm": 2.591789484024048, "learning_rate": 9.768813451117077e-06, "loss": 0.3438, "num_input_tokens_seen": 44054080, "step": 20430 }, { "epoch": 3.750229399889888, "grad_norm": 3.245164394378662, "learning_rate": 9.768572713979423e-06, "loss": 0.2851, "num_input_tokens_seen": 44064992, "step": 20435 }, { "epoch": 3.7511469994494404, "grad_norm": 3.998014450073242, "learning_rate": 9.768331854535268e-06, "loss": 0.316, "num_input_tokens_seen": 44076672, "step": 20440 }, { "epoch": 3.7520645990089925, "grad_norm": 5.807168483734131, "learning_rate": 9.768090872790792e-06, "loss": 0.2665, "num_input_tokens_seen": 44087392, "step": 20445 }, { "epoch": 3.7529821985685445, "grad_norm": 4.690238952636719, "learning_rate": 9.767849768752175e-06, "loss": 0.2679, "num_input_tokens_seen": 44097344, "step": 20450 }, { "epoch": 3.753899798128097, "grad_norm": 6.845367431640625, "learning_rate": 9.767608542425601e-06, "loss": 0.3226, "num_input_tokens_seen": 44107648, "step": 20455 }, { "epoch": 3.754817397687649, "grad_norm": 3.959658145904541, "learning_rate": 9.76736719381726e-06, "loss": 0.3165, "num_input_tokens_seen": 44118816, "step": 20460 }, { "epoch": 3.755734997247201, "grad_norm": 2.787527561187744, "learning_rate": 9.767125722933335e-06, "loss": 0.199, "num_input_tokens_seen": 44129408, "step": 20465 }, { "epoch": 3.7566525968067537, "grad_norm": 5.3694748878479, "learning_rate": 9.766884129780024e-06, "loss": 0.271, "num_input_tokens_seen": 44140416, "step": 20470 }, { "epoch": 3.7575701963663057, "grad_norm": 5.989445686340332, "learning_rate": 9.766642414363524e-06, "loss": 0.2572, "num_input_tokens_seen": 44151552, "step": 20475 }, { "epoch": 3.758487795925858, "grad_norm": 4.917352199554443, "learning_rate": 9.766400576690034e-06, "loss": 0.2589, "num_input_tokens_seen": 44162464, "step": 20480 }, { "epoch": 3.7594053954854103, "grad_norm": 3.0267059803009033, "learning_rate": 9.766158616765756e-06, "loss": 0.2635, "num_input_tokens_seen": 44174240, "step": 20485 }, { "epoch": 3.7603229950449624, "grad_norm": 12.853515625, "learning_rate": 9.765916534596897e-06, "loss": 0.3347, "num_input_tokens_seen": 44185568, "step": 20490 }, { "epoch": 3.7612405946045144, "grad_norm": 2.455307960510254, "learning_rate": 9.765674330189664e-06, "loss": 0.3202, "num_input_tokens_seen": 44196928, "step": 20495 }, { "epoch": 3.762158194164067, "grad_norm": 4.22739315032959, "learning_rate": 9.765432003550273e-06, "loss": 0.3372, "num_input_tokens_seen": 44206720, "step": 20500 }, { "epoch": 3.763075793723619, "grad_norm": 3.083552598953247, "learning_rate": 9.765189554684936e-06, "loss": 0.2647, "num_input_tokens_seen": 44217600, "step": 20505 }, { "epoch": 3.763993393283171, "grad_norm": 2.940194606781006, "learning_rate": 9.76494698359987e-06, "loss": 0.464, "num_input_tokens_seen": 44227488, "step": 20510 }, { "epoch": 3.7649109928427236, "grad_norm": 7.974202632904053, "learning_rate": 9.7647042903013e-06, "loss": 0.3043, "num_input_tokens_seen": 44238944, "step": 20515 }, { "epoch": 3.7658285924022756, "grad_norm": 13.350253105163574, "learning_rate": 9.76446147479545e-06, "loss": 0.3953, "num_input_tokens_seen": 44250656, "step": 20520 }, { "epoch": 3.7667461919618277, "grad_norm": 1.552134394645691, "learning_rate": 9.764218537088548e-06, "loss": 0.3713, "num_input_tokens_seen": 44261696, "step": 20525 }, { "epoch": 3.76766379152138, "grad_norm": 5.605724334716797, "learning_rate": 9.763975477186824e-06, "loss": 0.2769, "num_input_tokens_seen": 44272960, "step": 20530 }, { "epoch": 3.7685813910809323, "grad_norm": 4.087650299072266, "learning_rate": 9.763732295096513e-06, "loss": 0.2945, "num_input_tokens_seen": 44284544, "step": 20535 }, { "epoch": 3.7694989906404843, "grad_norm": 4.993375778198242, "learning_rate": 9.76348899082385e-06, "loss": 0.2642, "num_input_tokens_seen": 44296064, "step": 20540 }, { "epoch": 3.770416590200037, "grad_norm": 1.5873953104019165, "learning_rate": 9.76324556437508e-06, "loss": 0.2459, "num_input_tokens_seen": 44308000, "step": 20545 }, { "epoch": 3.771334189759589, "grad_norm": 5.022450923919678, "learning_rate": 9.763002015756443e-06, "loss": 0.206, "num_input_tokens_seen": 44319040, "step": 20550 }, { "epoch": 3.7722517893191414, "grad_norm": 4.276097297668457, "learning_rate": 9.762758344974184e-06, "loss": 0.2406, "num_input_tokens_seen": 44330432, "step": 20555 }, { "epoch": 3.7731693888786935, "grad_norm": 14.042343139648438, "learning_rate": 9.762514552034557e-06, "loss": 0.3796, "num_input_tokens_seen": 44340320, "step": 20560 }, { "epoch": 3.7740869884382455, "grad_norm": 2.287957191467285, "learning_rate": 9.762270636943812e-06, "loss": 0.5621, "num_input_tokens_seen": 44351680, "step": 20565 }, { "epoch": 3.775004587997798, "grad_norm": 12.126644134521484, "learning_rate": 9.762026599708205e-06, "loss": 0.2932, "num_input_tokens_seen": 44361536, "step": 20570 }, { "epoch": 3.77592218755735, "grad_norm": 5.2388386726379395, "learning_rate": 9.761782440333997e-06, "loss": 0.5915, "num_input_tokens_seen": 44371872, "step": 20575 }, { "epoch": 3.776839787116902, "grad_norm": 2.8840932846069336, "learning_rate": 9.76153815882745e-06, "loss": 0.25, "num_input_tokens_seen": 44382848, "step": 20580 }, { "epoch": 3.7777573866764547, "grad_norm": 0.9633359313011169, "learning_rate": 9.76129375519483e-06, "loss": 0.2157, "num_input_tokens_seen": 44394080, "step": 20585 }, { "epoch": 3.7786749862360067, "grad_norm": 9.527101516723633, "learning_rate": 9.761049229442404e-06, "loss": 0.2458, "num_input_tokens_seen": 44404864, "step": 20590 }, { "epoch": 3.779592585795559, "grad_norm": 5.603982448577881, "learning_rate": 9.760804581576443e-06, "loss": 0.3158, "num_input_tokens_seen": 44416384, "step": 20595 }, { "epoch": 3.7805101853551113, "grad_norm": 2.442640781402588, "learning_rate": 9.760559811603223e-06, "loss": 0.214, "num_input_tokens_seen": 44427968, "step": 20600 }, { "epoch": 3.7814277849146634, "grad_norm": 11.53278636932373, "learning_rate": 9.760314919529024e-06, "loss": 0.3497, "num_input_tokens_seen": 44438688, "step": 20605 }, { "epoch": 3.7823453844742154, "grad_norm": 1.9298256635665894, "learning_rate": 9.760069905360124e-06, "loss": 0.3233, "num_input_tokens_seen": 44448096, "step": 20610 }, { "epoch": 3.783262984033768, "grad_norm": 5.169457912445068, "learning_rate": 9.759824769102807e-06, "loss": 0.3311, "num_input_tokens_seen": 44459328, "step": 20615 }, { "epoch": 3.78418058359332, "grad_norm": 6.408871173858643, "learning_rate": 9.759579510763362e-06, "loss": 0.3008, "num_input_tokens_seen": 44470464, "step": 20620 }, { "epoch": 3.785098183152872, "grad_norm": 2.5456528663635254, "learning_rate": 9.759334130348082e-06, "loss": 0.3788, "num_input_tokens_seen": 44482112, "step": 20625 }, { "epoch": 3.7860157827124246, "grad_norm": 1.15781569480896, "learning_rate": 9.759088627863255e-06, "loss": 0.2602, "num_input_tokens_seen": 44493504, "step": 20630 }, { "epoch": 3.7869333822719766, "grad_norm": 10.840882301330566, "learning_rate": 9.758843003315182e-06, "loss": 0.2752, "num_input_tokens_seen": 44504000, "step": 20635 }, { "epoch": 3.7878509818315287, "grad_norm": 8.283297538757324, "learning_rate": 9.75859725671016e-06, "loss": 0.2641, "num_input_tokens_seen": 44513312, "step": 20640 }, { "epoch": 3.788768581391081, "grad_norm": 22.344844818115234, "learning_rate": 9.758351388054496e-06, "loss": 0.4169, "num_input_tokens_seen": 44524128, "step": 20645 }, { "epoch": 3.7896861809506333, "grad_norm": 4.857443809509277, "learning_rate": 9.758105397354492e-06, "loss": 0.4271, "num_input_tokens_seen": 44535008, "step": 20650 }, { "epoch": 3.7906037805101853, "grad_norm": 2.4236106872558594, "learning_rate": 9.75785928461646e-06, "loss": 0.2107, "num_input_tokens_seen": 44546144, "step": 20655 }, { "epoch": 3.791521380069738, "grad_norm": 1.6633410453796387, "learning_rate": 9.75761304984671e-06, "loss": 0.3665, "num_input_tokens_seen": 44557408, "step": 20660 }, { "epoch": 3.79243897962929, "grad_norm": 1.795159935951233, "learning_rate": 9.757366693051559e-06, "loss": 0.2989, "num_input_tokens_seen": 44567488, "step": 20665 }, { "epoch": 3.793356579188842, "grad_norm": 2.6691324710845947, "learning_rate": 9.757120214237326e-06, "loss": 0.2928, "num_input_tokens_seen": 44578880, "step": 20670 }, { "epoch": 3.7942741787483945, "grad_norm": 2.295163869857788, "learning_rate": 9.756873613410333e-06, "loss": 0.3088, "num_input_tokens_seen": 44588768, "step": 20675 }, { "epoch": 3.7951917783079465, "grad_norm": 2.208970546722412, "learning_rate": 9.756626890576904e-06, "loss": 0.3176, "num_input_tokens_seen": 44600736, "step": 20680 }, { "epoch": 3.7961093778674986, "grad_norm": 3.4656176567077637, "learning_rate": 9.756380045743368e-06, "loss": 0.2217, "num_input_tokens_seen": 44611328, "step": 20685 }, { "epoch": 3.797026977427051, "grad_norm": 4.6800713539123535, "learning_rate": 9.756133078916054e-06, "loss": 0.2812, "num_input_tokens_seen": 44622688, "step": 20690 }, { "epoch": 3.797944576986603, "grad_norm": 3.690093755722046, "learning_rate": 9.7558859901013e-06, "loss": 0.3896, "num_input_tokens_seen": 44632864, "step": 20695 }, { "epoch": 3.798862176546155, "grad_norm": 4.2832932472229, "learning_rate": 9.755638779305439e-06, "loss": 0.3333, "num_input_tokens_seen": 44643776, "step": 20700 }, { "epoch": 3.7997797761057077, "grad_norm": 1.9987809658050537, "learning_rate": 9.755391446534814e-06, "loss": 0.2434, "num_input_tokens_seen": 44655232, "step": 20705 }, { "epoch": 3.80069737566526, "grad_norm": 1.4556334018707275, "learning_rate": 9.75514399179577e-06, "loss": 0.2204, "num_input_tokens_seen": 44666080, "step": 20710 }, { "epoch": 3.801614975224812, "grad_norm": 6.3525824546813965, "learning_rate": 9.754896415094651e-06, "loss": 0.3577, "num_input_tokens_seen": 44675744, "step": 20715 }, { "epoch": 3.8025325747843643, "grad_norm": 4.859945297241211, "learning_rate": 9.754648716437808e-06, "loss": 0.3213, "num_input_tokens_seen": 44685152, "step": 20720 }, { "epoch": 3.8034501743439164, "grad_norm": 8.274099349975586, "learning_rate": 9.754400895831597e-06, "loss": 0.3606, "num_input_tokens_seen": 44695680, "step": 20725 }, { "epoch": 3.8043677739034685, "grad_norm": 3.718109130859375, "learning_rate": 9.754152953282369e-06, "loss": 0.2007, "num_input_tokens_seen": 44706240, "step": 20730 }, { "epoch": 3.805285373463021, "grad_norm": 3.364809513092041, "learning_rate": 9.753904888796489e-06, "loss": 0.2967, "num_input_tokens_seen": 44716544, "step": 20735 }, { "epoch": 3.806202973022573, "grad_norm": 5.86991024017334, "learning_rate": 9.753656702380314e-06, "loss": 0.3026, "num_input_tokens_seen": 44727776, "step": 20740 }, { "epoch": 3.807120572582125, "grad_norm": 20.14276123046875, "learning_rate": 9.753408394040214e-06, "loss": 0.2979, "num_input_tokens_seen": 44739552, "step": 20745 }, { "epoch": 3.8080381721416776, "grad_norm": 4.680997848510742, "learning_rate": 9.753159963782554e-06, "loss": 0.2188, "num_input_tokens_seen": 44749760, "step": 20750 }, { "epoch": 3.8089557717012297, "grad_norm": 1.9814739227294922, "learning_rate": 9.752911411613709e-06, "loss": 0.2422, "num_input_tokens_seen": 44760736, "step": 20755 }, { "epoch": 3.8098733712607817, "grad_norm": 15.647127151489258, "learning_rate": 9.752662737540051e-06, "loss": 0.3457, "num_input_tokens_seen": 44772832, "step": 20760 }, { "epoch": 3.8107909708203342, "grad_norm": 2.4886348247528076, "learning_rate": 9.75241394156796e-06, "loss": 0.2267, "num_input_tokens_seen": 44785024, "step": 20765 }, { "epoch": 3.8117085703798863, "grad_norm": 9.359512329101562, "learning_rate": 9.75216502370382e-06, "loss": 0.2746, "num_input_tokens_seen": 44795808, "step": 20770 }, { "epoch": 3.8126261699394384, "grad_norm": 9.436208724975586, "learning_rate": 9.751915983954009e-06, "loss": 0.3495, "num_input_tokens_seen": 44806400, "step": 20775 }, { "epoch": 3.813543769498991, "grad_norm": 0.8692083358764648, "learning_rate": 9.751666822324919e-06, "loss": 0.2713, "num_input_tokens_seen": 44816960, "step": 20780 }, { "epoch": 3.814461369058543, "grad_norm": 7.288999557495117, "learning_rate": 9.751417538822938e-06, "loss": 0.4478, "num_input_tokens_seen": 44828288, "step": 20785 }, { "epoch": 3.815378968618095, "grad_norm": 1.3785784244537354, "learning_rate": 9.751168133454462e-06, "loss": 0.2222, "num_input_tokens_seen": 44838976, "step": 20790 }, { "epoch": 3.8162965681776475, "grad_norm": 4.667150497436523, "learning_rate": 9.75091860622589e-06, "loss": 0.3592, "num_input_tokens_seen": 44850656, "step": 20795 }, { "epoch": 3.8172141677371996, "grad_norm": 3.9283535480499268, "learning_rate": 9.750668957143616e-06, "loss": 0.3037, "num_input_tokens_seen": 44860832, "step": 20800 }, { "epoch": 3.8181317672967516, "grad_norm": 2.191121816635132, "learning_rate": 9.750419186214047e-06, "loss": 0.3621, "num_input_tokens_seen": 44871744, "step": 20805 }, { "epoch": 3.819049366856304, "grad_norm": 11.061843872070312, "learning_rate": 9.750169293443586e-06, "loss": 0.3695, "num_input_tokens_seen": 44882400, "step": 20810 }, { "epoch": 3.819966966415856, "grad_norm": 18.67864418029785, "learning_rate": 9.749919278838648e-06, "loss": 0.3271, "num_input_tokens_seen": 44892064, "step": 20815 }, { "epoch": 3.8208845659754083, "grad_norm": 3.4163825511932373, "learning_rate": 9.74966914240564e-06, "loss": 0.2427, "num_input_tokens_seen": 44903136, "step": 20820 }, { "epoch": 3.8218021655349608, "grad_norm": 2.9477407932281494, "learning_rate": 9.74941888415098e-06, "loss": 0.3111, "num_input_tokens_seen": 44913888, "step": 20825 }, { "epoch": 3.822719765094513, "grad_norm": 6.901739120483398, "learning_rate": 9.749168504081088e-06, "loss": 0.3053, "num_input_tokens_seen": 44925120, "step": 20830 }, { "epoch": 3.823637364654065, "grad_norm": 1.909825086593628, "learning_rate": 9.748918002202384e-06, "loss": 0.3128, "num_input_tokens_seen": 44936128, "step": 20835 }, { "epoch": 3.8245549642136174, "grad_norm": 1.5705442428588867, "learning_rate": 9.748667378521292e-06, "loss": 0.3641, "num_input_tokens_seen": 44948000, "step": 20840 }, { "epoch": 3.8254725637731695, "grad_norm": 2.24708890914917, "learning_rate": 9.748416633044242e-06, "loss": 0.3635, "num_input_tokens_seen": 44957856, "step": 20845 }, { "epoch": 3.8263901633327215, "grad_norm": 6.6833271980285645, "learning_rate": 9.748165765777666e-06, "loss": 0.3893, "num_input_tokens_seen": 44968960, "step": 20850 }, { "epoch": 3.827307762892274, "grad_norm": 1.7730896472930908, "learning_rate": 9.747914776727997e-06, "loss": 0.208, "num_input_tokens_seen": 44979520, "step": 20855 }, { "epoch": 3.828225362451826, "grad_norm": 1.2160143852233887, "learning_rate": 9.747663665901672e-06, "loss": 0.3505, "num_input_tokens_seen": 44991104, "step": 20860 }, { "epoch": 3.829142962011378, "grad_norm": 2.194662094116211, "learning_rate": 9.747412433305132e-06, "loss": 0.2952, "num_input_tokens_seen": 45000608, "step": 20865 }, { "epoch": 3.8300605615709307, "grad_norm": 1.9407978057861328, "learning_rate": 9.747161078944821e-06, "loss": 0.2915, "num_input_tokens_seen": 45012000, "step": 20870 }, { "epoch": 3.8309781611304827, "grad_norm": 1.3130861520767212, "learning_rate": 9.746909602827187e-06, "loss": 0.2701, "num_input_tokens_seen": 45022624, "step": 20875 }, { "epoch": 3.831895760690035, "grad_norm": 1.6066460609436035, "learning_rate": 9.746658004958676e-06, "loss": 0.2463, "num_input_tokens_seen": 45032736, "step": 20880 }, { "epoch": 3.8328133602495873, "grad_norm": 8.654242515563965, "learning_rate": 9.746406285345747e-06, "loss": 0.2865, "num_input_tokens_seen": 45042432, "step": 20885 }, { "epoch": 3.8337309598091394, "grad_norm": 3.0009849071502686, "learning_rate": 9.746154443994851e-06, "loss": 0.3751, "num_input_tokens_seen": 45053952, "step": 20890 }, { "epoch": 3.8346485593686914, "grad_norm": 5.046767711639404, "learning_rate": 9.745902480912449e-06, "loss": 0.2808, "num_input_tokens_seen": 45065184, "step": 20895 }, { "epoch": 3.835566158928244, "grad_norm": 9.817477226257324, "learning_rate": 9.745650396105004e-06, "loss": 0.3788, "num_input_tokens_seen": 45076672, "step": 20900 }, { "epoch": 3.836483758487796, "grad_norm": 3.645862340927124, "learning_rate": 9.745398189578983e-06, "loss": 0.3076, "num_input_tokens_seen": 45087680, "step": 20905 }, { "epoch": 3.837401358047348, "grad_norm": 1.3010149002075195, "learning_rate": 9.745145861340852e-06, "loss": 0.3144, "num_input_tokens_seen": 45096992, "step": 20910 }, { "epoch": 3.8383189576069006, "grad_norm": 1.8465256690979004, "learning_rate": 9.744893411397085e-06, "loss": 0.2811, "num_input_tokens_seen": 45108000, "step": 20915 }, { "epoch": 3.8392365571664526, "grad_norm": 1.6610190868377686, "learning_rate": 9.744640839754154e-06, "loss": 0.2588, "num_input_tokens_seen": 45118048, "step": 20920 }, { "epoch": 3.8401541567260047, "grad_norm": 10.004034042358398, "learning_rate": 9.74438814641854e-06, "loss": 0.3923, "num_input_tokens_seen": 45127008, "step": 20925 }, { "epoch": 3.841071756285557, "grad_norm": 2.1192829608917236, "learning_rate": 9.744135331396724e-06, "loss": 0.2694, "num_input_tokens_seen": 45137088, "step": 20930 }, { "epoch": 3.8419893558451093, "grad_norm": 1.3280036449432373, "learning_rate": 9.743882394695187e-06, "loss": 0.2343, "num_input_tokens_seen": 45147424, "step": 20935 }, { "epoch": 3.8429069554046613, "grad_norm": 6.997003078460693, "learning_rate": 9.743629336320422e-06, "loss": 0.2629, "num_input_tokens_seen": 45158752, "step": 20940 }, { "epoch": 3.843824554964214, "grad_norm": 3.718214273452759, "learning_rate": 9.743376156278915e-06, "loss": 0.3293, "num_input_tokens_seen": 45169952, "step": 20945 }, { "epoch": 3.844742154523766, "grad_norm": 3.5393331050872803, "learning_rate": 9.743122854577162e-06, "loss": 0.2233, "num_input_tokens_seen": 45181184, "step": 20950 }, { "epoch": 3.845659754083318, "grad_norm": 2.9133987426757812, "learning_rate": 9.742869431221658e-06, "loss": 0.2804, "num_input_tokens_seen": 45191232, "step": 20955 }, { "epoch": 3.8465773536428705, "grad_norm": 4.733728885650635, "learning_rate": 9.742615886218905e-06, "loss": 0.2648, "num_input_tokens_seen": 45201952, "step": 20960 }, { "epoch": 3.8474949532024225, "grad_norm": 1.774170160293579, "learning_rate": 9.742362219575403e-06, "loss": 0.2049, "num_input_tokens_seen": 45211968, "step": 20965 }, { "epoch": 3.8484125527619746, "grad_norm": 2.0156192779541016, "learning_rate": 9.742108431297662e-06, "loss": 0.2656, "num_input_tokens_seen": 45222752, "step": 20970 }, { "epoch": 3.849330152321527, "grad_norm": 3.0866496562957764, "learning_rate": 9.741854521392186e-06, "loss": 0.3558, "num_input_tokens_seen": 45233632, "step": 20975 }, { "epoch": 3.850247751881079, "grad_norm": 1.3676940202713013, "learning_rate": 9.741600489865494e-06, "loss": 0.2614, "num_input_tokens_seen": 45243328, "step": 20980 }, { "epoch": 3.851165351440631, "grad_norm": 8.177327156066895, "learning_rate": 9.741346336724098e-06, "loss": 0.1768, "num_input_tokens_seen": 45254720, "step": 20985 }, { "epoch": 3.8520829510001837, "grad_norm": 8.956045150756836, "learning_rate": 9.741092061974516e-06, "loss": 0.2973, "num_input_tokens_seen": 45264448, "step": 20990 }, { "epoch": 3.853000550559736, "grad_norm": 3.707256317138672, "learning_rate": 9.74083766562327e-06, "loss": 0.4161, "num_input_tokens_seen": 45275840, "step": 20995 }, { "epoch": 3.853918150119288, "grad_norm": 3.7014336585998535, "learning_rate": 9.740583147676887e-06, "loss": 0.4641, "num_input_tokens_seen": 45285408, "step": 21000 }, { "epoch": 3.8548357496788404, "grad_norm": 4.721241474151611, "learning_rate": 9.740328508141894e-06, "loss": 0.2288, "num_input_tokens_seen": 45296384, "step": 21005 }, { "epoch": 3.8557533492383924, "grad_norm": 17.987890243530273, "learning_rate": 9.74007374702482e-06, "loss": 0.3717, "num_input_tokens_seen": 45305216, "step": 21010 }, { "epoch": 3.8566709487979445, "grad_norm": 16.68295669555664, "learning_rate": 9.739818864332203e-06, "loss": 0.3725, "num_input_tokens_seen": 45315584, "step": 21015 }, { "epoch": 3.857588548357497, "grad_norm": 1.27244234085083, "learning_rate": 9.739563860070576e-06, "loss": 0.2161, "num_input_tokens_seen": 45327200, "step": 21020 }, { "epoch": 3.858506147917049, "grad_norm": 5.192765235900879, "learning_rate": 9.739308734246482e-06, "loss": 0.2454, "num_input_tokens_seen": 45336416, "step": 21025 }, { "epoch": 3.859423747476601, "grad_norm": 3.7610886096954346, "learning_rate": 9.739053486866464e-06, "loss": 0.3266, "num_input_tokens_seen": 45346976, "step": 21030 }, { "epoch": 3.8603413470361536, "grad_norm": 4.4795823097229, "learning_rate": 9.73879811793707e-06, "loss": 0.1709, "num_input_tokens_seen": 45359040, "step": 21035 }, { "epoch": 3.8612589465957057, "grad_norm": 1.3832592964172363, "learning_rate": 9.738542627464848e-06, "loss": 0.1916, "num_input_tokens_seen": 45370048, "step": 21040 }, { "epoch": 3.8621765461552577, "grad_norm": 1.2046844959259033, "learning_rate": 9.738287015456351e-06, "loss": 0.2477, "num_input_tokens_seen": 45380992, "step": 21045 }, { "epoch": 3.8630941457148102, "grad_norm": 1.0545769929885864, "learning_rate": 9.738031281918137e-06, "loss": 0.3607, "num_input_tokens_seen": 45392704, "step": 21050 }, { "epoch": 3.8640117452743623, "grad_norm": 3.8841819763183594, "learning_rate": 9.737775426856763e-06, "loss": 0.1868, "num_input_tokens_seen": 45403808, "step": 21055 }, { "epoch": 3.8649293448339144, "grad_norm": 2.6018972396850586, "learning_rate": 9.737519450278795e-06, "loss": 0.2296, "num_input_tokens_seen": 45416352, "step": 21060 }, { "epoch": 3.865846944393467, "grad_norm": 8.768732070922852, "learning_rate": 9.737263352190792e-06, "loss": 0.5344, "num_input_tokens_seen": 45426656, "step": 21065 }, { "epoch": 3.866764543953019, "grad_norm": 3.3151473999023438, "learning_rate": 9.737007132599326e-06, "loss": 0.1838, "num_input_tokens_seen": 45437088, "step": 21070 }, { "epoch": 3.867682143512571, "grad_norm": 2.8584938049316406, "learning_rate": 9.736750791510972e-06, "loss": 0.3303, "num_input_tokens_seen": 45447968, "step": 21075 }, { "epoch": 3.8685997430721235, "grad_norm": 15.02318286895752, "learning_rate": 9.736494328932298e-06, "loss": 0.2675, "num_input_tokens_seen": 45458976, "step": 21080 }, { "epoch": 3.8695173426316756, "grad_norm": 11.535273551940918, "learning_rate": 9.736237744869887e-06, "loss": 0.4618, "num_input_tokens_seen": 45469696, "step": 21085 }, { "epoch": 3.8704349421912276, "grad_norm": 2.1752357482910156, "learning_rate": 9.735981039330319e-06, "loss": 0.483, "num_input_tokens_seen": 45480128, "step": 21090 }, { "epoch": 3.87135254175078, "grad_norm": 5.04503059387207, "learning_rate": 9.735724212320177e-06, "loss": 0.3212, "num_input_tokens_seen": 45490656, "step": 21095 }, { "epoch": 3.872270141310332, "grad_norm": 1.7769399881362915, "learning_rate": 9.735467263846048e-06, "loss": 0.3665, "num_input_tokens_seen": 45501760, "step": 21100 }, { "epoch": 3.8731877408698843, "grad_norm": 2.1020874977111816, "learning_rate": 9.735210193914524e-06, "loss": 0.2569, "num_input_tokens_seen": 45513248, "step": 21105 }, { "epoch": 3.8741053404294368, "grad_norm": 22.70367431640625, "learning_rate": 9.734953002532195e-06, "loss": 0.2483, "num_input_tokens_seen": 45523456, "step": 21110 }, { "epoch": 3.875022939988989, "grad_norm": 3.797393321990967, "learning_rate": 9.734695689705664e-06, "loss": 0.3017, "num_input_tokens_seen": 45534720, "step": 21115 }, { "epoch": 3.875940539548541, "grad_norm": 7.2552642822265625, "learning_rate": 9.734438255441523e-06, "loss": 0.4102, "num_input_tokens_seen": 45545760, "step": 21120 }, { "epoch": 3.8768581391080934, "grad_norm": 1.876314640045166, "learning_rate": 9.73418069974638e-06, "loss": 0.2723, "num_input_tokens_seen": 45556704, "step": 21125 }, { "epoch": 3.8777757386676455, "grad_norm": 3.4921767711639404, "learning_rate": 9.73392302262684e-06, "loss": 0.2755, "num_input_tokens_seen": 45568288, "step": 21130 }, { "epoch": 3.8786933382271975, "grad_norm": 2.942847967147827, "learning_rate": 9.73366522408951e-06, "loss": 0.2309, "num_input_tokens_seen": 45577952, "step": 21135 }, { "epoch": 3.87961093778675, "grad_norm": 7.947328567504883, "learning_rate": 9.733407304141005e-06, "loss": 0.4212, "num_input_tokens_seen": 45588640, "step": 21140 }, { "epoch": 3.880528537346302, "grad_norm": 6.772996425628662, "learning_rate": 9.733149262787937e-06, "loss": 0.2622, "num_input_tokens_seen": 45599680, "step": 21145 }, { "epoch": 3.881446136905854, "grad_norm": 3.1380841732025146, "learning_rate": 9.732891100036927e-06, "loss": 0.4359, "num_input_tokens_seen": 45609248, "step": 21150 }, { "epoch": 3.8823637364654067, "grad_norm": 5.083590030670166, "learning_rate": 9.732632815894596e-06, "loss": 0.1554, "num_input_tokens_seen": 45621312, "step": 21155 }, { "epoch": 3.8832813360249587, "grad_norm": 13.977910041809082, "learning_rate": 9.732374410367569e-06, "loss": 0.385, "num_input_tokens_seen": 45631552, "step": 21160 }, { "epoch": 3.884198935584511, "grad_norm": 2.556058645248413, "learning_rate": 9.732115883462471e-06, "loss": 0.2016, "num_input_tokens_seen": 45642048, "step": 21165 }, { "epoch": 3.8851165351440633, "grad_norm": 9.291132926940918, "learning_rate": 9.731857235185935e-06, "loss": 0.4157, "num_input_tokens_seen": 45652192, "step": 21170 }, { "epoch": 3.8860341347036154, "grad_norm": 7.226924419403076, "learning_rate": 9.731598465544596e-06, "loss": 0.354, "num_input_tokens_seen": 45661824, "step": 21175 }, { "epoch": 3.8869517342631674, "grad_norm": 1.4714224338531494, "learning_rate": 9.731339574545089e-06, "loss": 0.3874, "num_input_tokens_seen": 45672800, "step": 21180 }, { "epoch": 3.88786933382272, "grad_norm": 10.049051284790039, "learning_rate": 9.731080562194056e-06, "loss": 0.253, "num_input_tokens_seen": 45682912, "step": 21185 }, { "epoch": 3.888786933382272, "grad_norm": 3.333753824234009, "learning_rate": 9.730821428498136e-06, "loss": 0.2218, "num_input_tokens_seen": 45692704, "step": 21190 }, { "epoch": 3.889704532941824, "grad_norm": 4.054529666900635, "learning_rate": 9.73056217346398e-06, "loss": 0.2705, "num_input_tokens_seen": 45703584, "step": 21195 }, { "epoch": 3.8906221325013766, "grad_norm": 1.1074023246765137, "learning_rate": 9.730302797098237e-06, "loss": 0.3051, "num_input_tokens_seen": 45712768, "step": 21200 }, { "epoch": 3.8915397320609286, "grad_norm": 4.752849102020264, "learning_rate": 9.730043299407557e-06, "loss": 0.2993, "num_input_tokens_seen": 45722560, "step": 21205 }, { "epoch": 3.8924573316204807, "grad_norm": 4.815357685089111, "learning_rate": 9.7297836803986e-06, "loss": 0.2817, "num_input_tokens_seen": 45733536, "step": 21210 }, { "epoch": 3.893374931180033, "grad_norm": 2.5351033210754395, "learning_rate": 9.729523940078019e-06, "loss": 0.3469, "num_input_tokens_seen": 45744064, "step": 21215 }, { "epoch": 3.8942925307395853, "grad_norm": 8.479358673095703, "learning_rate": 9.72926407845248e-06, "loss": 0.3656, "num_input_tokens_seen": 45754848, "step": 21220 }, { "epoch": 3.8952101302991373, "grad_norm": 1.8619052171707153, "learning_rate": 9.729004095528647e-06, "loss": 0.2619, "num_input_tokens_seen": 45766720, "step": 21225 }, { "epoch": 3.89612772985869, "grad_norm": 2.6388590335845947, "learning_rate": 9.728743991313187e-06, "loss": 0.2678, "num_input_tokens_seen": 45777696, "step": 21230 }, { "epoch": 3.897045329418242, "grad_norm": 2.079549551010132, "learning_rate": 9.728483765812774e-06, "loss": 0.2691, "num_input_tokens_seen": 45787840, "step": 21235 }, { "epoch": 3.897962928977794, "grad_norm": 6.1832146644592285, "learning_rate": 9.728223419034081e-06, "loss": 0.298, "num_input_tokens_seen": 45799040, "step": 21240 }, { "epoch": 3.8988805285373465, "grad_norm": 9.64089584350586, "learning_rate": 9.727962950983787e-06, "loss": 0.393, "num_input_tokens_seen": 45810048, "step": 21245 }, { "epoch": 3.8997981280968985, "grad_norm": 4.20875883102417, "learning_rate": 9.727702361668568e-06, "loss": 0.1978, "num_input_tokens_seen": 45820704, "step": 21250 }, { "epoch": 3.9007157276564506, "grad_norm": 13.219008445739746, "learning_rate": 9.727441651095112e-06, "loss": 0.3359, "num_input_tokens_seen": 45832608, "step": 21255 }, { "epoch": 3.901633327216003, "grad_norm": 3.0084526538848877, "learning_rate": 9.727180819270105e-06, "loss": 0.1595, "num_input_tokens_seen": 45843904, "step": 21260 }, { "epoch": 3.902550926775555, "grad_norm": 4.587116718292236, "learning_rate": 9.726919866200236e-06, "loss": 0.4318, "num_input_tokens_seen": 45854304, "step": 21265 }, { "epoch": 3.903468526335107, "grad_norm": 3.86428165435791, "learning_rate": 9.726658791892198e-06, "loss": 0.2799, "num_input_tokens_seen": 45863776, "step": 21270 }, { "epoch": 3.9043861258946597, "grad_norm": 5.630114555358887, "learning_rate": 9.726397596352688e-06, "loss": 0.4179, "num_input_tokens_seen": 45875008, "step": 21275 }, { "epoch": 3.905303725454212, "grad_norm": 3.6992177963256836, "learning_rate": 9.726136279588405e-06, "loss": 0.2695, "num_input_tokens_seen": 45886368, "step": 21280 }, { "epoch": 3.906221325013764, "grad_norm": 2.280137300491333, "learning_rate": 9.725874841606051e-06, "loss": 0.336, "num_input_tokens_seen": 45898112, "step": 21285 }, { "epoch": 3.9071389245733164, "grad_norm": 6.1284074783325195, "learning_rate": 9.725613282412332e-06, "loss": 0.2909, "num_input_tokens_seen": 45909120, "step": 21290 }, { "epoch": 3.9080565241328684, "grad_norm": 3.6286683082580566, "learning_rate": 9.725351602013957e-06, "loss": 0.2763, "num_input_tokens_seen": 45920096, "step": 21295 }, { "epoch": 3.9089741236924205, "grad_norm": 4.647928714752197, "learning_rate": 9.725089800417635e-06, "loss": 0.3756, "num_input_tokens_seen": 45931872, "step": 21300 }, { "epoch": 3.909891723251973, "grad_norm": 1.3127777576446533, "learning_rate": 9.724827877630086e-06, "loss": 0.2203, "num_input_tokens_seen": 45942560, "step": 21305 }, { "epoch": 3.910809322811525, "grad_norm": 4.735749244689941, "learning_rate": 9.724565833658022e-06, "loss": 0.2525, "num_input_tokens_seen": 45952608, "step": 21310 }, { "epoch": 3.911726922371077, "grad_norm": 8.20600414276123, "learning_rate": 9.724303668508168e-06, "loss": 0.4035, "num_input_tokens_seen": 45962816, "step": 21315 }, { "epoch": 3.9126445219306296, "grad_norm": 23.298574447631836, "learning_rate": 9.724041382187247e-06, "loss": 0.1805, "num_input_tokens_seen": 45972704, "step": 21320 }, { "epoch": 3.9135621214901817, "grad_norm": 13.07918930053711, "learning_rate": 9.723778974701985e-06, "loss": 0.2664, "num_input_tokens_seen": 45982944, "step": 21325 }, { "epoch": 3.9144797210497337, "grad_norm": 3.9358034133911133, "learning_rate": 9.723516446059115e-06, "loss": 0.2416, "num_input_tokens_seen": 45995072, "step": 21330 }, { "epoch": 3.9153973206092862, "grad_norm": 18.112104415893555, "learning_rate": 9.723253796265369e-06, "loss": 0.3472, "num_input_tokens_seen": 46004320, "step": 21335 }, { "epoch": 3.9163149201688383, "grad_norm": 5.59602689743042, "learning_rate": 9.722991025327481e-06, "loss": 0.35, "num_input_tokens_seen": 46014112, "step": 21340 }, { "epoch": 3.9172325197283904, "grad_norm": 11.443163871765137, "learning_rate": 9.722728133252195e-06, "loss": 0.3004, "num_input_tokens_seen": 46025152, "step": 21345 }, { "epoch": 3.918150119287943, "grad_norm": 20.749828338623047, "learning_rate": 9.722465120046252e-06, "loss": 0.3409, "num_input_tokens_seen": 46036608, "step": 21350 }, { "epoch": 3.919067718847495, "grad_norm": 12.125694274902344, "learning_rate": 9.722201985716397e-06, "loss": 0.3924, "num_input_tokens_seen": 46047808, "step": 21355 }, { "epoch": 3.919985318407047, "grad_norm": 4.255568504333496, "learning_rate": 9.72193873026938e-06, "loss": 0.2724, "num_input_tokens_seen": 46058528, "step": 21360 }, { "epoch": 3.9209029179665995, "grad_norm": 6.6071648597717285, "learning_rate": 9.721675353711955e-06, "loss": 0.2984, "num_input_tokens_seen": 46068512, "step": 21365 }, { "epoch": 3.9218205175261516, "grad_norm": 4.996205806732178, "learning_rate": 9.721411856050873e-06, "loss": 0.3435, "num_input_tokens_seen": 46079552, "step": 21370 }, { "epoch": 3.9227381170857036, "grad_norm": 3.2087905406951904, "learning_rate": 9.721148237292896e-06, "loss": 0.4098, "num_input_tokens_seen": 46090400, "step": 21375 }, { "epoch": 3.923655716645256, "grad_norm": 4.875048637390137, "learning_rate": 9.720884497444782e-06, "loss": 0.2233, "num_input_tokens_seen": 46101600, "step": 21380 }, { "epoch": 3.924573316204808, "grad_norm": 2.355577230453491, "learning_rate": 9.720620636513299e-06, "loss": 0.2579, "num_input_tokens_seen": 46112608, "step": 21385 }, { "epoch": 3.9254909157643603, "grad_norm": 5.92716646194458, "learning_rate": 9.720356654505212e-06, "loss": 0.3723, "num_input_tokens_seen": 46122592, "step": 21390 }, { "epoch": 3.9264085153239128, "grad_norm": 4.826829433441162, "learning_rate": 9.720092551427292e-06, "loss": 0.2934, "num_input_tokens_seen": 46133120, "step": 21395 }, { "epoch": 3.927326114883465, "grad_norm": 7.975438117980957, "learning_rate": 9.719828327286314e-06, "loss": 0.3146, "num_input_tokens_seen": 46142848, "step": 21400 }, { "epoch": 3.928243714443017, "grad_norm": 6.195648193359375, "learning_rate": 9.719563982089055e-06, "loss": 0.2483, "num_input_tokens_seen": 46154304, "step": 21405 }, { "epoch": 3.9291613140025694, "grad_norm": 7.631836414337158, "learning_rate": 9.719299515842295e-06, "loss": 0.1783, "num_input_tokens_seen": 46166464, "step": 21410 }, { "epoch": 3.9300789135621215, "grad_norm": 1.6473417282104492, "learning_rate": 9.719034928552815e-06, "loss": 0.2615, "num_input_tokens_seen": 46177728, "step": 21415 }, { "epoch": 3.9309965131216735, "grad_norm": 1.482665777206421, "learning_rate": 9.718770220227405e-06, "loss": 0.2891, "num_input_tokens_seen": 46188544, "step": 21420 }, { "epoch": 3.931914112681226, "grad_norm": 6.698054790496826, "learning_rate": 9.71850539087285e-06, "loss": 0.2637, "num_input_tokens_seen": 46199104, "step": 21425 }, { "epoch": 3.932831712240778, "grad_norm": 14.936540603637695, "learning_rate": 9.718240440495946e-06, "loss": 0.2808, "num_input_tokens_seen": 46209632, "step": 21430 }, { "epoch": 3.93374931180033, "grad_norm": 2.713050603866577, "learning_rate": 9.717975369103488e-06, "loss": 0.3218, "num_input_tokens_seen": 46220800, "step": 21435 }, { "epoch": 3.9346669113598827, "grad_norm": 3.4928438663482666, "learning_rate": 9.717710176702273e-06, "loss": 0.2582, "num_input_tokens_seen": 46232128, "step": 21440 }, { "epoch": 3.9355845109194347, "grad_norm": 9.881083488464355, "learning_rate": 9.717444863299104e-06, "loss": 0.4812, "num_input_tokens_seen": 46243808, "step": 21445 }, { "epoch": 3.936502110478987, "grad_norm": 2.397958517074585, "learning_rate": 9.717179428900784e-06, "loss": 0.314, "num_input_tokens_seen": 46254208, "step": 21450 }, { "epoch": 3.9374197100385393, "grad_norm": 2.1581263542175293, "learning_rate": 9.716913873514127e-06, "loss": 0.2188, "num_input_tokens_seen": 46265696, "step": 21455 }, { "epoch": 3.9383373095980914, "grad_norm": 5.548471450805664, "learning_rate": 9.716648197145937e-06, "loss": 0.2451, "num_input_tokens_seen": 46276256, "step": 21460 }, { "epoch": 3.9392549091576434, "grad_norm": 1.1206070184707642, "learning_rate": 9.71638239980303e-06, "loss": 0.2705, "num_input_tokens_seen": 46286944, "step": 21465 }, { "epoch": 3.940172508717196, "grad_norm": 7.681771755218506, "learning_rate": 9.716116481492225e-06, "loss": 0.3548, "num_input_tokens_seen": 46297184, "step": 21470 }, { "epoch": 3.941090108276748, "grad_norm": 1.4201641082763672, "learning_rate": 9.71585044222034e-06, "loss": 0.214, "num_input_tokens_seen": 46308416, "step": 21475 }, { "epoch": 3.9420077078363, "grad_norm": 19.423145294189453, "learning_rate": 9.715584281994202e-06, "loss": 0.3857, "num_input_tokens_seen": 46318656, "step": 21480 }, { "epoch": 3.9429253073958526, "grad_norm": 8.6765718460083, "learning_rate": 9.715318000820635e-06, "loss": 0.4566, "num_input_tokens_seen": 46328608, "step": 21485 }, { "epoch": 3.9438429069554046, "grad_norm": 1.5174390077590942, "learning_rate": 9.71505159870647e-06, "loss": 0.1239, "num_input_tokens_seen": 46338144, "step": 21490 }, { "epoch": 3.9447605065149567, "grad_norm": 6.724429607391357, "learning_rate": 9.71478507565854e-06, "loss": 0.2568, "num_input_tokens_seen": 46348480, "step": 21495 }, { "epoch": 3.945678106074509, "grad_norm": 6.037076473236084, "learning_rate": 9.714518431683678e-06, "loss": 0.2023, "num_input_tokens_seen": 46360032, "step": 21500 }, { "epoch": 3.9465957056340613, "grad_norm": 5.932572841644287, "learning_rate": 9.714251666788726e-06, "loss": 0.2904, "num_input_tokens_seen": 46370400, "step": 21505 }, { "epoch": 3.9475133051936133, "grad_norm": 3.007720947265625, "learning_rate": 9.713984780980525e-06, "loss": 0.3647, "num_input_tokens_seen": 46380480, "step": 21510 }, { "epoch": 3.948430904753166, "grad_norm": 0.63322514295578, "learning_rate": 9.71371777426592e-06, "loss": 0.3415, "num_input_tokens_seen": 46391520, "step": 21515 }, { "epoch": 3.949348504312718, "grad_norm": 1.946963906288147, "learning_rate": 9.713450646651762e-06, "loss": 0.2743, "num_input_tokens_seen": 46403136, "step": 21520 }, { "epoch": 3.95026610387227, "grad_norm": 10.167868614196777, "learning_rate": 9.713183398144898e-06, "loss": 0.3553, "num_input_tokens_seen": 46413568, "step": 21525 }, { "epoch": 3.9511837034318225, "grad_norm": 9.785183906555176, "learning_rate": 9.712916028752185e-06, "loss": 0.3433, "num_input_tokens_seen": 46424352, "step": 21530 }, { "epoch": 3.9521013029913745, "grad_norm": 3.8222994804382324, "learning_rate": 9.71264853848048e-06, "loss": 0.3324, "num_input_tokens_seen": 46435840, "step": 21535 }, { "epoch": 3.9530189025509266, "grad_norm": 13.852409362792969, "learning_rate": 9.712380927336645e-06, "loss": 0.2708, "num_input_tokens_seen": 46447328, "step": 21540 }, { "epoch": 3.953936502110479, "grad_norm": 6.4355902671813965, "learning_rate": 9.712113195327541e-06, "loss": 0.2699, "num_input_tokens_seen": 46457536, "step": 21545 }, { "epoch": 3.954854101670031, "grad_norm": 2.5156431198120117, "learning_rate": 9.711845342460037e-06, "loss": 0.4028, "num_input_tokens_seen": 46468320, "step": 21550 }, { "epoch": 3.955771701229583, "grad_norm": 8.542729377746582, "learning_rate": 9.711577368741003e-06, "loss": 0.3192, "num_input_tokens_seen": 46478752, "step": 21555 }, { "epoch": 3.9566893007891357, "grad_norm": 1.0468099117279053, "learning_rate": 9.711309274177312e-06, "loss": 0.3081, "num_input_tokens_seen": 46489920, "step": 21560 }, { "epoch": 3.957606900348688, "grad_norm": 3.2990286350250244, "learning_rate": 9.71104105877584e-06, "loss": 0.2815, "num_input_tokens_seen": 46501344, "step": 21565 }, { "epoch": 3.95852449990824, "grad_norm": 7.7343268394470215, "learning_rate": 9.710772722543467e-06, "loss": 0.2527, "num_input_tokens_seen": 46510912, "step": 21570 }, { "epoch": 3.9594420994677924, "grad_norm": 1.2890292406082153, "learning_rate": 9.710504265487074e-06, "loss": 0.3004, "num_input_tokens_seen": 46521952, "step": 21575 }, { "epoch": 3.9603596990273444, "grad_norm": 3.7867982387542725, "learning_rate": 9.710235687613545e-06, "loss": 0.3269, "num_input_tokens_seen": 46532992, "step": 21580 }, { "epoch": 3.9612772985868965, "grad_norm": 8.898470878601074, "learning_rate": 9.709966988929774e-06, "loss": 0.4621, "num_input_tokens_seen": 46544128, "step": 21585 }, { "epoch": 3.962194898146449, "grad_norm": 1.2023978233337402, "learning_rate": 9.709698169442647e-06, "loss": 0.2409, "num_input_tokens_seen": 46555008, "step": 21590 }, { "epoch": 3.963112497706001, "grad_norm": 6.902583122253418, "learning_rate": 9.709429229159065e-06, "loss": 0.1973, "num_input_tokens_seen": 46564256, "step": 21595 }, { "epoch": 3.964030097265553, "grad_norm": 5.723618984222412, "learning_rate": 9.709160168085918e-06, "loss": 0.3247, "num_input_tokens_seen": 46573408, "step": 21600 }, { "epoch": 3.9649476968251056, "grad_norm": 3.314194679260254, "learning_rate": 9.708890986230114e-06, "loss": 0.2651, "num_input_tokens_seen": 46584128, "step": 21605 }, { "epoch": 3.9658652963846577, "grad_norm": 6.475298881530762, "learning_rate": 9.708621683598553e-06, "loss": 0.3639, "num_input_tokens_seen": 46596192, "step": 21610 }, { "epoch": 3.9667828959442097, "grad_norm": 16.055198669433594, "learning_rate": 9.708352260198144e-06, "loss": 0.3726, "num_input_tokens_seen": 46606976, "step": 21615 }, { "epoch": 3.9677004955037622, "grad_norm": 3.2841405868530273, "learning_rate": 9.708082716035799e-06, "loss": 0.1904, "num_input_tokens_seen": 46619168, "step": 21620 }, { "epoch": 3.9686180950633143, "grad_norm": 5.463937759399414, "learning_rate": 9.707813051118426e-06, "loss": 0.3304, "num_input_tokens_seen": 46630752, "step": 21625 }, { "epoch": 3.9695356946228664, "grad_norm": 25.701391220092773, "learning_rate": 9.707543265452945e-06, "loss": 0.339, "num_input_tokens_seen": 46641344, "step": 21630 }, { "epoch": 3.970453294182419, "grad_norm": 11.40113353729248, "learning_rate": 9.707273359046276e-06, "loss": 0.2721, "num_input_tokens_seen": 46652352, "step": 21635 }, { "epoch": 3.971370893741971, "grad_norm": 3.3397233486175537, "learning_rate": 9.707003331905341e-06, "loss": 0.3283, "num_input_tokens_seen": 46664160, "step": 21640 }, { "epoch": 3.972288493301523, "grad_norm": 2.909053087234497, "learning_rate": 9.706733184037066e-06, "loss": 0.1918, "num_input_tokens_seen": 46675168, "step": 21645 }, { "epoch": 3.9732060928610755, "grad_norm": 1.109403133392334, "learning_rate": 9.70646291544838e-06, "loss": 0.2202, "num_input_tokens_seen": 46686272, "step": 21650 }, { "epoch": 3.9741236924206276, "grad_norm": 0.5201388597488403, "learning_rate": 9.706192526146213e-06, "loss": 0.1627, "num_input_tokens_seen": 46697856, "step": 21655 }, { "epoch": 3.9750412919801796, "grad_norm": 4.6894941329956055, "learning_rate": 9.705922016137502e-06, "loss": 0.3846, "num_input_tokens_seen": 46707840, "step": 21660 }, { "epoch": 3.975958891539732, "grad_norm": 3.1478443145751953, "learning_rate": 9.705651385429185e-06, "loss": 0.2384, "num_input_tokens_seen": 46719456, "step": 21665 }, { "epoch": 3.976876491099284, "grad_norm": 1.4396005868911743, "learning_rate": 9.705380634028204e-06, "loss": 0.2555, "num_input_tokens_seen": 46730624, "step": 21670 }, { "epoch": 3.9777940906588363, "grad_norm": 21.574743270874023, "learning_rate": 9.705109761941502e-06, "loss": 0.5234, "num_input_tokens_seen": 46742176, "step": 21675 }, { "epoch": 3.9787116902183888, "grad_norm": 3.844391107559204, "learning_rate": 9.704838769176026e-06, "loss": 0.4318, "num_input_tokens_seen": 46753088, "step": 21680 }, { "epoch": 3.979629289777941, "grad_norm": 5.171829700469971, "learning_rate": 9.704567655738728e-06, "loss": 0.3837, "num_input_tokens_seen": 46763136, "step": 21685 }, { "epoch": 3.980546889337493, "grad_norm": 3.6808488368988037, "learning_rate": 9.704296421636562e-06, "loss": 0.347, "num_input_tokens_seen": 46773760, "step": 21690 }, { "epoch": 3.9814644888970454, "grad_norm": 14.1692533493042, "learning_rate": 9.704025066876484e-06, "loss": 0.5255, "num_input_tokens_seen": 46785152, "step": 21695 }, { "epoch": 3.9823820884565975, "grad_norm": 1.3174775838851929, "learning_rate": 9.703753591465451e-06, "loss": 0.3171, "num_input_tokens_seen": 46796128, "step": 21700 }, { "epoch": 3.9832996880161495, "grad_norm": 5.122918605804443, "learning_rate": 9.70348199541043e-06, "loss": 0.1911, "num_input_tokens_seen": 46807776, "step": 21705 }, { "epoch": 3.984217287575702, "grad_norm": 6.219152927398682, "learning_rate": 9.703210278718386e-06, "loss": 0.3705, "num_input_tokens_seen": 46818176, "step": 21710 }, { "epoch": 3.985134887135254, "grad_norm": 3.9228315353393555, "learning_rate": 9.702938441396288e-06, "loss": 0.3554, "num_input_tokens_seen": 46829888, "step": 21715 }, { "epoch": 3.986052486694806, "grad_norm": 7.061718940734863, "learning_rate": 9.702666483451107e-06, "loss": 0.3267, "num_input_tokens_seen": 46841792, "step": 21720 }, { "epoch": 3.9869700862543587, "grad_norm": 3.1403567790985107, "learning_rate": 9.702394404889818e-06, "loss": 0.2274, "num_input_tokens_seen": 46851552, "step": 21725 }, { "epoch": 3.9878876858139107, "grad_norm": 10.782620429992676, "learning_rate": 9.702122205719402e-06, "loss": 0.2665, "num_input_tokens_seen": 46863648, "step": 21730 }, { "epoch": 3.988805285373463, "grad_norm": 7.091450214385986, "learning_rate": 9.701849885946838e-06, "loss": 0.2755, "num_input_tokens_seen": 46873920, "step": 21735 }, { "epoch": 3.9897228849330153, "grad_norm": 6.523108959197998, "learning_rate": 9.701577445579113e-06, "loss": 0.4754, "num_input_tokens_seen": 46883488, "step": 21740 }, { "epoch": 3.9906404844925674, "grad_norm": 5.504472732543945, "learning_rate": 9.701304884623213e-06, "loss": 0.1983, "num_input_tokens_seen": 46893152, "step": 21745 }, { "epoch": 3.9915580840521194, "grad_norm": 4.114348411560059, "learning_rate": 9.701032203086129e-06, "loss": 0.2986, "num_input_tokens_seen": 46904928, "step": 21750 }, { "epoch": 3.992475683611672, "grad_norm": 1.5952223539352417, "learning_rate": 9.700759400974855e-06, "loss": 0.169, "num_input_tokens_seen": 46915616, "step": 21755 }, { "epoch": 3.993393283171224, "grad_norm": 6.2546539306640625, "learning_rate": 9.70048647829639e-06, "loss": 0.3095, "num_input_tokens_seen": 46926144, "step": 21760 }, { "epoch": 3.994310882730776, "grad_norm": 2.3385672569274902, "learning_rate": 9.700213435057727e-06, "loss": 0.4822, "num_input_tokens_seen": 46937056, "step": 21765 }, { "epoch": 3.9952284822903286, "grad_norm": 2.6266279220581055, "learning_rate": 9.699940271265877e-06, "loss": 0.2861, "num_input_tokens_seen": 46948576, "step": 21770 }, { "epoch": 3.9961460818498806, "grad_norm": 6.198358058929443, "learning_rate": 9.699666986927843e-06, "loss": 0.3955, "num_input_tokens_seen": 46959808, "step": 21775 }, { "epoch": 3.9970636814094327, "grad_norm": 3.700263738632202, "learning_rate": 9.699393582050636e-06, "loss": 0.2748, "num_input_tokens_seen": 46970304, "step": 21780 }, { "epoch": 3.997981280968985, "grad_norm": 6.703349590301514, "learning_rate": 9.699120056641264e-06, "loss": 0.3004, "num_input_tokens_seen": 46979680, "step": 21785 }, { "epoch": 3.9988988805285373, "grad_norm": 1.7605044841766357, "learning_rate": 9.698846410706749e-06, "loss": 0.2291, "num_input_tokens_seen": 46990848, "step": 21790 }, { "epoch": 3.9998164800880893, "grad_norm": 7.407350540161133, "learning_rate": 9.698572644254102e-06, "loss": 0.2466, "num_input_tokens_seen": 47002016, "step": 21795 }, { "epoch": 4.0, "eval_loss": 0.2969399690628052, "eval_runtime": 178.9932, "eval_samples_per_second": 30.442, "eval_steps_per_second": 7.615, "num_input_tokens_seen": 47003136, "step": 21796 }, { "epoch": 4.000734079647642, "grad_norm": 2.061523914337158, "learning_rate": 9.698298757290351e-06, "loss": 0.3168, "num_input_tokens_seen": 47012160, "step": 21800 }, { "epoch": 4.001651679207194, "grad_norm": 4.493215084075928, "learning_rate": 9.698024749822522e-06, "loss": 0.3963, "num_input_tokens_seen": 47021536, "step": 21805 }, { "epoch": 4.002569278766746, "grad_norm": 4.656918048858643, "learning_rate": 9.697750621857634e-06, "loss": 0.3155, "num_input_tokens_seen": 47032896, "step": 21810 }, { "epoch": 4.0034868783262985, "grad_norm": 2.955251455307007, "learning_rate": 9.697476373402726e-06, "loss": 0.2599, "num_input_tokens_seen": 47043360, "step": 21815 }, { "epoch": 4.004404477885851, "grad_norm": 3.3539838790893555, "learning_rate": 9.697202004464829e-06, "loss": 0.3464, "num_input_tokens_seen": 47055040, "step": 21820 }, { "epoch": 4.005322077445403, "grad_norm": 8.659408569335938, "learning_rate": 9.69692751505098e-06, "loss": 0.2776, "num_input_tokens_seen": 47065888, "step": 21825 }, { "epoch": 4.006239677004955, "grad_norm": 11.975139617919922, "learning_rate": 9.696652905168222e-06, "loss": 0.2487, "num_input_tokens_seen": 47076928, "step": 21830 }, { "epoch": 4.007157276564508, "grad_norm": 12.573503494262695, "learning_rate": 9.696378174823593e-06, "loss": 0.2303, "num_input_tokens_seen": 47087840, "step": 21835 }, { "epoch": 4.008074876124059, "grad_norm": 0.6348719000816345, "learning_rate": 9.696103324024145e-06, "loss": 0.1593, "num_input_tokens_seen": 47098816, "step": 21840 }, { "epoch": 4.008992475683612, "grad_norm": 11.086713790893555, "learning_rate": 9.695828352776923e-06, "loss": 0.3076, "num_input_tokens_seen": 47110080, "step": 21845 }, { "epoch": 4.009910075243164, "grad_norm": 1.5293726921081543, "learning_rate": 9.695553261088984e-06, "loss": 0.3367, "num_input_tokens_seen": 47121088, "step": 21850 }, { "epoch": 4.010827674802716, "grad_norm": 2.64363956451416, "learning_rate": 9.69527804896738e-06, "loss": 0.1996, "num_input_tokens_seen": 47132896, "step": 21855 }, { "epoch": 4.011745274362268, "grad_norm": 2.521141290664673, "learning_rate": 9.69500271641917e-06, "loss": 0.3204, "num_input_tokens_seen": 47144736, "step": 21860 }, { "epoch": 4.012662873921821, "grad_norm": 7.147257328033447, "learning_rate": 9.694727263451419e-06, "loss": 0.3503, "num_input_tokens_seen": 47156000, "step": 21865 }, { "epoch": 4.0135804734813725, "grad_norm": 6.322372913360596, "learning_rate": 9.694451690071189e-06, "loss": 0.3565, "num_input_tokens_seen": 47166848, "step": 21870 }, { "epoch": 4.014498073040925, "grad_norm": 3.845447301864624, "learning_rate": 9.69417599628555e-06, "loss": 0.2095, "num_input_tokens_seen": 47176864, "step": 21875 }, { "epoch": 4.0154156726004775, "grad_norm": 6.867711544036865, "learning_rate": 9.693900182101569e-06, "loss": 0.2662, "num_input_tokens_seen": 47187936, "step": 21880 }, { "epoch": 4.016333272160029, "grad_norm": 5.289664268493652, "learning_rate": 9.693624247526326e-06, "loss": 0.1687, "num_input_tokens_seen": 47198720, "step": 21885 }, { "epoch": 4.017250871719582, "grad_norm": 3.6507480144500732, "learning_rate": 9.693348192566893e-06, "loss": 0.2634, "num_input_tokens_seen": 47208096, "step": 21890 }, { "epoch": 4.018168471279134, "grad_norm": 9.661910057067871, "learning_rate": 9.693072017230355e-06, "loss": 0.2577, "num_input_tokens_seen": 47218528, "step": 21895 }, { "epoch": 4.019086070838686, "grad_norm": 10.04304027557373, "learning_rate": 9.692795721523794e-06, "loss": 0.2344, "num_input_tokens_seen": 47229024, "step": 21900 }, { "epoch": 4.020003670398238, "grad_norm": 7.005120277404785, "learning_rate": 9.692519305454293e-06, "loss": 0.1837, "num_input_tokens_seen": 47239872, "step": 21905 }, { "epoch": 4.020921269957791, "grad_norm": 5.4382429122924805, "learning_rate": 9.692242769028946e-06, "loss": 0.4256, "num_input_tokens_seen": 47250816, "step": 21910 }, { "epoch": 4.021838869517342, "grad_norm": 4.137386798858643, "learning_rate": 9.691966112254846e-06, "loss": 0.2182, "num_input_tokens_seen": 47260416, "step": 21915 }, { "epoch": 4.022756469076895, "grad_norm": 5.45871639251709, "learning_rate": 9.691689335139084e-06, "loss": 0.2105, "num_input_tokens_seen": 47271424, "step": 21920 }, { "epoch": 4.023674068636447, "grad_norm": 5.34626579284668, "learning_rate": 9.691412437688764e-06, "loss": 0.4027, "num_input_tokens_seen": 47282976, "step": 21925 }, { "epoch": 4.024591668195999, "grad_norm": 0.9431745409965515, "learning_rate": 9.691135419910987e-06, "loss": 0.3388, "num_input_tokens_seen": 47293824, "step": 21930 }, { "epoch": 4.0255092677555515, "grad_norm": 4.823709964752197, "learning_rate": 9.690858281812853e-06, "loss": 0.1548, "num_input_tokens_seen": 47305664, "step": 21935 }, { "epoch": 4.026426867315104, "grad_norm": 2.152273178100586, "learning_rate": 9.690581023401479e-06, "loss": 0.2093, "num_input_tokens_seen": 47316672, "step": 21940 }, { "epoch": 4.027344466874656, "grad_norm": 19.64348602294922, "learning_rate": 9.69030364468397e-06, "loss": 0.1918, "num_input_tokens_seen": 47328832, "step": 21945 }, { "epoch": 4.028262066434208, "grad_norm": 6.7997236251831055, "learning_rate": 9.69002614566744e-06, "loss": 0.3629, "num_input_tokens_seen": 47341088, "step": 21950 }, { "epoch": 4.029179665993761, "grad_norm": 22.905258178710938, "learning_rate": 9.68974852635901e-06, "loss": 0.2939, "num_input_tokens_seen": 47351456, "step": 21955 }, { "epoch": 4.030097265553312, "grad_norm": 5.142542362213135, "learning_rate": 9.689470786765798e-06, "loss": 0.2759, "num_input_tokens_seen": 47362688, "step": 21960 }, { "epoch": 4.031014865112865, "grad_norm": 4.659932613372803, "learning_rate": 9.689192926894929e-06, "loss": 0.2721, "num_input_tokens_seen": 47373344, "step": 21965 }, { "epoch": 4.031932464672417, "grad_norm": 10.130589485168457, "learning_rate": 9.688914946753528e-06, "loss": 0.3751, "num_input_tokens_seen": 47384992, "step": 21970 }, { "epoch": 4.032850064231969, "grad_norm": 8.041638374328613, "learning_rate": 9.688636846348727e-06, "loss": 0.4281, "num_input_tokens_seen": 47395456, "step": 21975 }, { "epoch": 4.033767663791521, "grad_norm": 5.441341876983643, "learning_rate": 9.688358625687657e-06, "loss": 0.2269, "num_input_tokens_seen": 47406528, "step": 21980 }, { "epoch": 4.034685263351074, "grad_norm": 18.16669273376465, "learning_rate": 9.688080284777454e-06, "loss": 0.4109, "num_input_tokens_seen": 47418464, "step": 21985 }, { "epoch": 4.0356028629106255, "grad_norm": 7.821381568908691, "learning_rate": 9.687801823625258e-06, "loss": 0.1896, "num_input_tokens_seen": 47428544, "step": 21990 }, { "epoch": 4.036520462470178, "grad_norm": 5.10111141204834, "learning_rate": 9.687523242238212e-06, "loss": 0.2942, "num_input_tokens_seen": 47439968, "step": 21995 }, { "epoch": 4.0374380620297305, "grad_norm": 4.735964775085449, "learning_rate": 9.687244540623459e-06, "loss": 0.4087, "num_input_tokens_seen": 47450656, "step": 22000 }, { "epoch": 4.038355661589282, "grad_norm": 3.9474635124206543, "learning_rate": 9.686965718788146e-06, "loss": 0.2597, "num_input_tokens_seen": 47459968, "step": 22005 }, { "epoch": 4.039273261148835, "grad_norm": 8.877132415771484, "learning_rate": 9.68668677673943e-06, "loss": 0.3002, "num_input_tokens_seen": 47471456, "step": 22010 }, { "epoch": 4.040190860708387, "grad_norm": 2.8000380992889404, "learning_rate": 9.68640771448446e-06, "loss": 0.3906, "num_input_tokens_seen": 47481696, "step": 22015 }, { "epoch": 4.041108460267939, "grad_norm": 1.8231650590896606, "learning_rate": 9.686128532030395e-06, "loss": 0.263, "num_input_tokens_seen": 47492416, "step": 22020 }, { "epoch": 4.042026059827491, "grad_norm": 2.28623628616333, "learning_rate": 9.685849229384397e-06, "loss": 0.2563, "num_input_tokens_seen": 47503520, "step": 22025 }, { "epoch": 4.042943659387044, "grad_norm": 5.566304683685303, "learning_rate": 9.685569806553627e-06, "loss": 0.3953, "num_input_tokens_seen": 47513984, "step": 22030 }, { "epoch": 4.043861258946595, "grad_norm": 2.445303440093994, "learning_rate": 9.685290263545255e-06, "loss": 0.2463, "num_input_tokens_seen": 47524160, "step": 22035 }, { "epoch": 4.044778858506148, "grad_norm": 10.094043731689453, "learning_rate": 9.685010600366448e-06, "loss": 0.2984, "num_input_tokens_seen": 47535232, "step": 22040 }, { "epoch": 4.0456964580657, "grad_norm": 1.208267092704773, "learning_rate": 9.684730817024382e-06, "loss": 0.352, "num_input_tokens_seen": 47545024, "step": 22045 }, { "epoch": 4.046614057625252, "grad_norm": 5.415385723114014, "learning_rate": 9.68445091352623e-06, "loss": 0.2797, "num_input_tokens_seen": 47556704, "step": 22050 }, { "epoch": 4.047531657184805, "grad_norm": 2.4505980014801025, "learning_rate": 9.684170889879171e-06, "loss": 0.3412, "num_input_tokens_seen": 47568736, "step": 22055 }, { "epoch": 4.048449256744357, "grad_norm": 1.3676656484603882, "learning_rate": 9.683890746090393e-06, "loss": 0.2531, "num_input_tokens_seen": 47579680, "step": 22060 }, { "epoch": 4.049366856303909, "grad_norm": 7.172229290008545, "learning_rate": 9.683610482167072e-06, "loss": 0.3153, "num_input_tokens_seen": 47589472, "step": 22065 }, { "epoch": 4.050284455863461, "grad_norm": 3.8246066570281982, "learning_rate": 9.683330098116403e-06, "loss": 0.3502, "num_input_tokens_seen": 47601504, "step": 22070 }, { "epoch": 4.051202055423014, "grad_norm": 2.8127548694610596, "learning_rate": 9.683049593945575e-06, "loss": 0.2573, "num_input_tokens_seen": 47611904, "step": 22075 }, { "epoch": 4.052119654982565, "grad_norm": 4.846385478973389, "learning_rate": 9.682768969661784e-06, "loss": 0.2801, "num_input_tokens_seen": 47623360, "step": 22080 }, { "epoch": 4.053037254542118, "grad_norm": 4.081661701202393, "learning_rate": 9.682488225272227e-06, "loss": 0.2736, "num_input_tokens_seen": 47634624, "step": 22085 }, { "epoch": 4.05395485410167, "grad_norm": 3.511920213699341, "learning_rate": 9.682207360784102e-06, "loss": 0.2934, "num_input_tokens_seen": 47646144, "step": 22090 }, { "epoch": 4.054872453661222, "grad_norm": 5.013962268829346, "learning_rate": 9.681926376204616e-06, "loss": 0.2226, "num_input_tokens_seen": 47656736, "step": 22095 }, { "epoch": 4.0557900532207745, "grad_norm": 10.297389030456543, "learning_rate": 9.681645271540976e-06, "loss": 0.2877, "num_input_tokens_seen": 47667264, "step": 22100 }, { "epoch": 4.056707652780327, "grad_norm": 6.763589382171631, "learning_rate": 9.681364046800388e-06, "loss": 0.3049, "num_input_tokens_seen": 47677152, "step": 22105 }, { "epoch": 4.057625252339879, "grad_norm": 2.632099151611328, "learning_rate": 9.68108270199007e-06, "loss": 0.2398, "num_input_tokens_seen": 47687904, "step": 22110 }, { "epoch": 4.058542851899431, "grad_norm": 23.100481033325195, "learning_rate": 9.680801237117234e-06, "loss": 0.3617, "num_input_tokens_seen": 47697632, "step": 22115 }, { "epoch": 4.059460451458984, "grad_norm": 2.7613234519958496, "learning_rate": 9.680519652189101e-06, "loss": 0.2783, "num_input_tokens_seen": 47707264, "step": 22120 }, { "epoch": 4.060378051018535, "grad_norm": 6.602947235107422, "learning_rate": 9.680237947212896e-06, "loss": 0.1962, "num_input_tokens_seen": 47717472, "step": 22125 }, { "epoch": 4.061295650578088, "grad_norm": 5.56353235244751, "learning_rate": 9.67995612219584e-06, "loss": 0.2688, "num_input_tokens_seen": 47727200, "step": 22130 }, { "epoch": 4.06221325013764, "grad_norm": 7.119767665863037, "learning_rate": 9.67967417714516e-06, "loss": 0.319, "num_input_tokens_seen": 47738176, "step": 22135 }, { "epoch": 4.063130849697192, "grad_norm": 8.714101791381836, "learning_rate": 9.679392112068094e-06, "loss": 0.3036, "num_input_tokens_seen": 47748320, "step": 22140 }, { "epoch": 4.064048449256744, "grad_norm": 8.53699016571045, "learning_rate": 9.67910992697187e-06, "loss": 0.3122, "num_input_tokens_seen": 47758880, "step": 22145 }, { "epoch": 4.064966048816297, "grad_norm": 4.393734931945801, "learning_rate": 9.67882762186373e-06, "loss": 0.2281, "num_input_tokens_seen": 47770048, "step": 22150 }, { "epoch": 4.0658836483758485, "grad_norm": 9.55358600616455, "learning_rate": 9.67854519675091e-06, "loss": 0.2813, "num_input_tokens_seen": 47779328, "step": 22155 }, { "epoch": 4.066801247935401, "grad_norm": 11.616108894348145, "learning_rate": 9.67826265164066e-06, "loss": 0.3364, "num_input_tokens_seen": 47789568, "step": 22160 }, { "epoch": 4.0677188474949535, "grad_norm": 12.990324020385742, "learning_rate": 9.677979986540223e-06, "loss": 0.2962, "num_input_tokens_seen": 47800672, "step": 22165 }, { "epoch": 4.068636447054505, "grad_norm": 4.915610313415527, "learning_rate": 9.677697201456848e-06, "loss": 0.3243, "num_input_tokens_seen": 47811680, "step": 22170 }, { "epoch": 4.069554046614058, "grad_norm": 5.42933464050293, "learning_rate": 9.67741429639779e-06, "loss": 0.3196, "num_input_tokens_seen": 47822176, "step": 22175 }, { "epoch": 4.07047164617361, "grad_norm": 10.405242919921875, "learning_rate": 9.677131271370307e-06, "loss": 0.4771, "num_input_tokens_seen": 47833952, "step": 22180 }, { "epoch": 4.071389245733162, "grad_norm": 3.241091251373291, "learning_rate": 9.676848126381654e-06, "loss": 0.2645, "num_input_tokens_seen": 47844896, "step": 22185 }, { "epoch": 4.072306845292714, "grad_norm": 3.1331217288970947, "learning_rate": 9.676564861439095e-06, "loss": 0.3041, "num_input_tokens_seen": 47854848, "step": 22190 }, { "epoch": 4.073224444852267, "grad_norm": 6.745425224304199, "learning_rate": 9.676281476549896e-06, "loss": 0.4281, "num_input_tokens_seen": 47866912, "step": 22195 }, { "epoch": 4.074142044411818, "grad_norm": 2.256165027618408, "learning_rate": 9.675997971721325e-06, "loss": 0.2756, "num_input_tokens_seen": 47877824, "step": 22200 }, { "epoch": 4.075059643971371, "grad_norm": 3.0866663455963135, "learning_rate": 9.675714346960651e-06, "loss": 0.1978, "num_input_tokens_seen": 47888608, "step": 22205 }, { "epoch": 4.075977243530923, "grad_norm": 10.868300437927246, "learning_rate": 9.675430602275153e-06, "loss": 0.3066, "num_input_tokens_seen": 47899552, "step": 22210 }, { "epoch": 4.076894843090475, "grad_norm": 4.028732776641846, "learning_rate": 9.675146737672106e-06, "loss": 0.2796, "num_input_tokens_seen": 47910560, "step": 22215 }, { "epoch": 4.0778124426500275, "grad_norm": 8.758068084716797, "learning_rate": 9.674862753158788e-06, "loss": 0.2456, "num_input_tokens_seen": 47921824, "step": 22220 }, { "epoch": 4.07873004220958, "grad_norm": 8.69735050201416, "learning_rate": 9.67457864874249e-06, "loss": 0.2754, "num_input_tokens_seen": 47932736, "step": 22225 }, { "epoch": 4.079647641769132, "grad_norm": 1.6880078315734863, "learning_rate": 9.674294424430493e-06, "loss": 0.1624, "num_input_tokens_seen": 47944064, "step": 22230 }, { "epoch": 4.080565241328684, "grad_norm": 18.63725471496582, "learning_rate": 9.674010080230087e-06, "loss": 0.2918, "num_input_tokens_seen": 47954208, "step": 22235 }, { "epoch": 4.081482840888237, "grad_norm": 3.143223524093628, "learning_rate": 9.673725616148568e-06, "loss": 0.3581, "num_input_tokens_seen": 47964544, "step": 22240 }, { "epoch": 4.082400440447788, "grad_norm": 1.6104685068130493, "learning_rate": 9.673441032193232e-06, "loss": 0.183, "num_input_tokens_seen": 47975360, "step": 22245 }, { "epoch": 4.083318040007341, "grad_norm": 0.7924087643623352, "learning_rate": 9.673156328371374e-06, "loss": 0.329, "num_input_tokens_seen": 47986944, "step": 22250 }, { "epoch": 4.084235639566893, "grad_norm": 3.075080394744873, "learning_rate": 9.6728715046903e-06, "loss": 0.346, "num_input_tokens_seen": 47996608, "step": 22255 }, { "epoch": 4.085153239126445, "grad_norm": 8.053619384765625, "learning_rate": 9.672586561157313e-06, "loss": 0.4774, "num_input_tokens_seen": 48008256, "step": 22260 }, { "epoch": 4.086070838685997, "grad_norm": 2.9985156059265137, "learning_rate": 9.672301497779725e-06, "loss": 0.2191, "num_input_tokens_seen": 48017280, "step": 22265 }, { "epoch": 4.08698843824555, "grad_norm": 1.2901992797851562, "learning_rate": 9.672016314564843e-06, "loss": 0.3127, "num_input_tokens_seen": 48029856, "step": 22270 }, { "epoch": 4.0879060378051015, "grad_norm": 9.40842056274414, "learning_rate": 9.671731011519984e-06, "loss": 0.3025, "num_input_tokens_seen": 48039680, "step": 22275 }, { "epoch": 4.088823637364654, "grad_norm": 5.634472370147705, "learning_rate": 9.671445588652465e-06, "loss": 0.3398, "num_input_tokens_seen": 48049504, "step": 22280 }, { "epoch": 4.0897412369242065, "grad_norm": 7.457856178283691, "learning_rate": 9.671160045969607e-06, "loss": 0.2695, "num_input_tokens_seen": 48059936, "step": 22285 }, { "epoch": 4.090658836483758, "grad_norm": 1.768998146057129, "learning_rate": 9.670874383478734e-06, "loss": 0.3533, "num_input_tokens_seen": 48070304, "step": 22290 }, { "epoch": 4.091576436043311, "grad_norm": 2.1315314769744873, "learning_rate": 9.670588601187171e-06, "loss": 0.39, "num_input_tokens_seen": 48081536, "step": 22295 }, { "epoch": 4.092494035602863, "grad_norm": 5.027862071990967, "learning_rate": 9.670302699102251e-06, "loss": 0.1888, "num_input_tokens_seen": 48091328, "step": 22300 }, { "epoch": 4.093411635162415, "grad_norm": 7.037667274475098, "learning_rate": 9.670016677231304e-06, "loss": 0.2528, "num_input_tokens_seen": 48102848, "step": 22305 }, { "epoch": 4.094329234721967, "grad_norm": 4.56348991394043, "learning_rate": 9.669730535581667e-06, "loss": 0.1505, "num_input_tokens_seen": 48112928, "step": 22310 }, { "epoch": 4.09524683428152, "grad_norm": 10.373854637145996, "learning_rate": 9.66944427416068e-06, "loss": 0.5708, "num_input_tokens_seen": 48122592, "step": 22315 }, { "epoch": 4.096164433841071, "grad_norm": 10.440035820007324, "learning_rate": 9.669157892975684e-06, "loss": 0.2067, "num_input_tokens_seen": 48133504, "step": 22320 }, { "epoch": 4.097082033400624, "grad_norm": 1.6370548009872437, "learning_rate": 9.668871392034023e-06, "loss": 0.2227, "num_input_tokens_seen": 48144544, "step": 22325 }, { "epoch": 4.097999632960176, "grad_norm": 4.811557769775391, "learning_rate": 9.668584771343047e-06, "loss": 0.2039, "num_input_tokens_seen": 48154496, "step": 22330 }, { "epoch": 4.098917232519728, "grad_norm": 2.3121731281280518, "learning_rate": 9.66829803091011e-06, "loss": 0.4337, "num_input_tokens_seen": 48164768, "step": 22335 }, { "epoch": 4.099834832079281, "grad_norm": 3.6382763385772705, "learning_rate": 9.668011170742562e-06, "loss": 0.3755, "num_input_tokens_seen": 48176480, "step": 22340 }, { "epoch": 4.100752431638833, "grad_norm": 1.0365763902664185, "learning_rate": 9.667724190847763e-06, "loss": 0.2409, "num_input_tokens_seen": 48186304, "step": 22345 }, { "epoch": 4.101670031198385, "grad_norm": 4.627627372741699, "learning_rate": 9.667437091233071e-06, "loss": 0.2253, "num_input_tokens_seen": 48198080, "step": 22350 }, { "epoch": 4.102587630757937, "grad_norm": 4.839760780334473, "learning_rate": 9.667149871905853e-06, "loss": 0.2875, "num_input_tokens_seen": 48208640, "step": 22355 }, { "epoch": 4.10350523031749, "grad_norm": 5.001447677612305, "learning_rate": 9.666862532873474e-06, "loss": 0.2568, "num_input_tokens_seen": 48219264, "step": 22360 }, { "epoch": 4.104422829877041, "grad_norm": 15.140642166137695, "learning_rate": 9.666575074143303e-06, "loss": 0.3187, "num_input_tokens_seen": 48230272, "step": 22365 }, { "epoch": 4.105340429436594, "grad_norm": 1.5424079895019531, "learning_rate": 9.666287495722714e-06, "loss": 0.274, "num_input_tokens_seen": 48240224, "step": 22370 }, { "epoch": 4.106258028996146, "grad_norm": 1.614320993423462, "learning_rate": 9.665999797619086e-06, "loss": 0.2196, "num_input_tokens_seen": 48251040, "step": 22375 }, { "epoch": 4.107175628555698, "grad_norm": 4.377028942108154, "learning_rate": 9.665711979839792e-06, "loss": 0.3836, "num_input_tokens_seen": 48261152, "step": 22380 }, { "epoch": 4.1080932281152505, "grad_norm": 5.6139092445373535, "learning_rate": 9.665424042392216e-06, "loss": 0.2695, "num_input_tokens_seen": 48273344, "step": 22385 }, { "epoch": 4.109010827674803, "grad_norm": 2.542184352874756, "learning_rate": 9.665135985283746e-06, "loss": 0.3295, "num_input_tokens_seen": 48284992, "step": 22390 }, { "epoch": 4.109928427234355, "grad_norm": 7.360270977020264, "learning_rate": 9.664847808521767e-06, "loss": 0.2586, "num_input_tokens_seen": 48295616, "step": 22395 }, { "epoch": 4.110846026793907, "grad_norm": 6.020533561706543, "learning_rate": 9.664559512113672e-06, "loss": 0.3328, "num_input_tokens_seen": 48306656, "step": 22400 }, { "epoch": 4.11176362635346, "grad_norm": 3.4175305366516113, "learning_rate": 9.664271096066856e-06, "loss": 0.2195, "num_input_tokens_seen": 48317760, "step": 22405 }, { "epoch": 4.112681225913011, "grad_norm": 10.859901428222656, "learning_rate": 9.663982560388714e-06, "loss": 0.2794, "num_input_tokens_seen": 48328128, "step": 22410 }, { "epoch": 4.113598825472564, "grad_norm": 1.5565139055252075, "learning_rate": 9.663693905086649e-06, "loss": 0.2539, "num_input_tokens_seen": 48339680, "step": 22415 }, { "epoch": 4.114516425032116, "grad_norm": 11.523969650268555, "learning_rate": 9.663405130168063e-06, "loss": 0.2835, "num_input_tokens_seen": 48351616, "step": 22420 }, { "epoch": 4.115434024591668, "grad_norm": 3.670614242553711, "learning_rate": 9.663116235640362e-06, "loss": 0.2918, "num_input_tokens_seen": 48363744, "step": 22425 }, { "epoch": 4.11635162415122, "grad_norm": 10.660451889038086, "learning_rate": 9.662827221510958e-06, "loss": 0.2528, "num_input_tokens_seen": 48374816, "step": 22430 }, { "epoch": 4.117269223710773, "grad_norm": 6.9450364112854, "learning_rate": 9.66253808778726e-06, "loss": 0.2658, "num_input_tokens_seen": 48384992, "step": 22435 }, { "epoch": 4.1181868232703245, "grad_norm": 12.22946548461914, "learning_rate": 9.66224883447669e-06, "loss": 0.2465, "num_input_tokens_seen": 48395872, "step": 22440 }, { "epoch": 4.119104422829877, "grad_norm": 8.079936981201172, "learning_rate": 9.661959461586662e-06, "loss": 0.3765, "num_input_tokens_seen": 48406944, "step": 22445 }, { "epoch": 4.1200220223894295, "grad_norm": 9.52340030670166, "learning_rate": 9.6616699691246e-06, "loss": 0.2276, "num_input_tokens_seen": 48417664, "step": 22450 }, { "epoch": 4.120939621948981, "grad_norm": 0.6328138709068298, "learning_rate": 9.661380357097924e-06, "loss": 0.4056, "num_input_tokens_seen": 48430080, "step": 22455 }, { "epoch": 4.121857221508534, "grad_norm": 2.1254196166992188, "learning_rate": 9.661090625514071e-06, "loss": 0.2186, "num_input_tokens_seen": 48440000, "step": 22460 }, { "epoch": 4.122774821068086, "grad_norm": 8.405420303344727, "learning_rate": 9.660800774380466e-06, "loss": 0.3782, "num_input_tokens_seen": 48451712, "step": 22465 }, { "epoch": 4.123692420627638, "grad_norm": 5.992462635040283, "learning_rate": 9.660510803704543e-06, "loss": 0.3706, "num_input_tokens_seen": 48462144, "step": 22470 }, { "epoch": 4.12461002018719, "grad_norm": 3.9175162315368652, "learning_rate": 9.660220713493743e-06, "loss": 0.3031, "num_input_tokens_seen": 48474048, "step": 22475 }, { "epoch": 4.125527619746743, "grad_norm": 2.061044692993164, "learning_rate": 9.659930503755504e-06, "loss": 0.3174, "num_input_tokens_seen": 48485344, "step": 22480 }, { "epoch": 4.126445219306294, "grad_norm": 3.313854217529297, "learning_rate": 9.65964017449727e-06, "loss": 0.1296, "num_input_tokens_seen": 48494592, "step": 22485 }, { "epoch": 4.127362818865847, "grad_norm": 2.7723944187164307, "learning_rate": 9.659349725726487e-06, "loss": 0.3656, "num_input_tokens_seen": 48504064, "step": 22490 }, { "epoch": 4.128280418425399, "grad_norm": 6.178964614868164, "learning_rate": 9.659059157450606e-06, "loss": 0.233, "num_input_tokens_seen": 48515040, "step": 22495 }, { "epoch": 4.129198017984951, "grad_norm": 7.683549880981445, "learning_rate": 9.658768469677076e-06, "loss": 0.278, "num_input_tokens_seen": 48525952, "step": 22500 }, { "epoch": 4.1301156175445035, "grad_norm": 2.451650857925415, "learning_rate": 9.658477662413358e-06, "loss": 0.2888, "num_input_tokens_seen": 48536928, "step": 22505 }, { "epoch": 4.131033217104056, "grad_norm": 1.84690260887146, "learning_rate": 9.658186735666905e-06, "loss": 0.2154, "num_input_tokens_seen": 48547040, "step": 22510 }, { "epoch": 4.131950816663608, "grad_norm": 2.0035862922668457, "learning_rate": 9.657895689445186e-06, "loss": 0.3497, "num_input_tokens_seen": 48557728, "step": 22515 }, { "epoch": 4.13286841622316, "grad_norm": 1.4806976318359375, "learning_rate": 9.657604523755657e-06, "loss": 0.2394, "num_input_tokens_seen": 48567840, "step": 22520 }, { "epoch": 4.133786015782713, "grad_norm": 3.1348044872283936, "learning_rate": 9.657313238605792e-06, "loss": 0.1978, "num_input_tokens_seen": 48578848, "step": 22525 }, { "epoch": 4.134703615342264, "grad_norm": 12.468323707580566, "learning_rate": 9.657021834003061e-06, "loss": 0.312, "num_input_tokens_seen": 48591648, "step": 22530 }, { "epoch": 4.135621214901817, "grad_norm": 5.048914909362793, "learning_rate": 9.656730309954938e-06, "loss": 0.4143, "num_input_tokens_seen": 48603040, "step": 22535 }, { "epoch": 4.136538814461369, "grad_norm": 7.200996398925781, "learning_rate": 9.6564386664689e-06, "loss": 0.3742, "num_input_tokens_seen": 48613312, "step": 22540 }, { "epoch": 4.137456414020921, "grad_norm": 2.7555220127105713, "learning_rate": 9.656146903552427e-06, "loss": 0.4916, "num_input_tokens_seen": 48623456, "step": 22545 }, { "epoch": 4.138374013580473, "grad_norm": 1.3599743843078613, "learning_rate": 9.655855021213002e-06, "loss": 0.2915, "num_input_tokens_seen": 48634080, "step": 22550 }, { "epoch": 4.139291613140026, "grad_norm": 7.53685188293457, "learning_rate": 9.655563019458112e-06, "loss": 0.3277, "num_input_tokens_seen": 48644960, "step": 22555 }, { "epoch": 4.1402092126995775, "grad_norm": 1.7557957172393799, "learning_rate": 9.655270898295246e-06, "loss": 0.2171, "num_input_tokens_seen": 48655232, "step": 22560 }, { "epoch": 4.14112681225913, "grad_norm": 5.061581611633301, "learning_rate": 9.654978657731895e-06, "loss": 0.3423, "num_input_tokens_seen": 48666208, "step": 22565 }, { "epoch": 4.1420444118186825, "grad_norm": 3.081204891204834, "learning_rate": 9.654686297775557e-06, "loss": 0.3111, "num_input_tokens_seen": 48677600, "step": 22570 }, { "epoch": 4.142962011378234, "grad_norm": 10.997915267944336, "learning_rate": 9.65439381843373e-06, "loss": 0.3954, "num_input_tokens_seen": 48688512, "step": 22575 }, { "epoch": 4.143879610937787, "grad_norm": 2.4288887977600098, "learning_rate": 9.654101219713915e-06, "loss": 0.2653, "num_input_tokens_seen": 48700032, "step": 22580 }, { "epoch": 4.144797210497339, "grad_norm": 11.196022987365723, "learning_rate": 9.653808501623617e-06, "loss": 0.202, "num_input_tokens_seen": 48709152, "step": 22585 }, { "epoch": 4.145714810056891, "grad_norm": 2.2932755947113037, "learning_rate": 9.653515664170343e-06, "loss": 0.2298, "num_input_tokens_seen": 48719520, "step": 22590 }, { "epoch": 4.146632409616443, "grad_norm": 5.961313247680664, "learning_rate": 9.653222707361605e-06, "loss": 0.1523, "num_input_tokens_seen": 48731392, "step": 22595 }, { "epoch": 4.147550009175996, "grad_norm": 7.2570013999938965, "learning_rate": 9.652929631204917e-06, "loss": 0.5282, "num_input_tokens_seen": 48742336, "step": 22600 }, { "epoch": 4.148467608735547, "grad_norm": 1.3831095695495605, "learning_rate": 9.652636435707793e-06, "loss": 0.3845, "num_input_tokens_seen": 48753536, "step": 22605 }, { "epoch": 4.1493852082951, "grad_norm": 2.471511125564575, "learning_rate": 9.652343120877758e-06, "loss": 0.3697, "num_input_tokens_seen": 48764672, "step": 22610 }, { "epoch": 4.150302807854652, "grad_norm": 4.01619291305542, "learning_rate": 9.652049686722332e-06, "loss": 0.2057, "num_input_tokens_seen": 48776576, "step": 22615 }, { "epoch": 4.151220407414204, "grad_norm": 6.774570465087891, "learning_rate": 9.651756133249041e-06, "loss": 0.2476, "num_input_tokens_seen": 48787040, "step": 22620 }, { "epoch": 4.152138006973757, "grad_norm": 2.57175612449646, "learning_rate": 9.651462460465415e-06, "loss": 0.2768, "num_input_tokens_seen": 48795936, "step": 22625 }, { "epoch": 4.153055606533309, "grad_norm": 2.180971622467041, "learning_rate": 9.651168668378987e-06, "loss": 0.3915, "num_input_tokens_seen": 48807552, "step": 22630 }, { "epoch": 4.153973206092861, "grad_norm": 5.552826404571533, "learning_rate": 9.650874756997289e-06, "loss": 0.3303, "num_input_tokens_seen": 48817440, "step": 22635 }, { "epoch": 4.154890805652413, "grad_norm": 1.920234203338623, "learning_rate": 9.650580726327863e-06, "loss": 0.2071, "num_input_tokens_seen": 48828032, "step": 22640 }, { "epoch": 4.155808405211966, "grad_norm": 1.833460807800293, "learning_rate": 9.65028657637825e-06, "loss": 0.2999, "num_input_tokens_seen": 48839104, "step": 22645 }, { "epoch": 4.156726004771517, "grad_norm": 1.663773536682129, "learning_rate": 9.649992307155992e-06, "loss": 0.3109, "num_input_tokens_seen": 48848832, "step": 22650 }, { "epoch": 4.15764360433107, "grad_norm": 8.1505708694458, "learning_rate": 9.64969791866864e-06, "loss": 0.2885, "num_input_tokens_seen": 48860416, "step": 22655 }, { "epoch": 4.158561203890622, "grad_norm": 15.452686309814453, "learning_rate": 9.64940341092374e-06, "loss": 0.2308, "num_input_tokens_seen": 48872032, "step": 22660 }, { "epoch": 4.159478803450174, "grad_norm": 2.1656644344329834, "learning_rate": 9.64910878392885e-06, "loss": 0.3645, "num_input_tokens_seen": 48881664, "step": 22665 }, { "epoch": 4.1603964030097265, "grad_norm": 1.2096132040023804, "learning_rate": 9.648814037691524e-06, "loss": 0.1686, "num_input_tokens_seen": 48894400, "step": 22670 }, { "epoch": 4.161314002569279, "grad_norm": 1.1385694742202759, "learning_rate": 9.648519172219326e-06, "loss": 0.2433, "num_input_tokens_seen": 48905824, "step": 22675 }, { "epoch": 4.162231602128831, "grad_norm": 6.341528415679932, "learning_rate": 9.648224187519812e-06, "loss": 0.3132, "num_input_tokens_seen": 48916800, "step": 22680 }, { "epoch": 4.163149201688383, "grad_norm": 5.112613201141357, "learning_rate": 9.647929083600555e-06, "loss": 0.187, "num_input_tokens_seen": 48927392, "step": 22685 }, { "epoch": 4.164066801247936, "grad_norm": 4.5234375, "learning_rate": 9.647633860469118e-06, "loss": 0.1307, "num_input_tokens_seen": 48938176, "step": 22690 }, { "epoch": 4.164984400807487, "grad_norm": 4.416260242462158, "learning_rate": 9.647338518133078e-06, "loss": 0.2215, "num_input_tokens_seen": 48948544, "step": 22695 }, { "epoch": 4.16590200036704, "grad_norm": 6.605684757232666, "learning_rate": 9.647043056600006e-06, "loss": 0.2859, "num_input_tokens_seen": 48959968, "step": 22700 }, { "epoch": 4.166819599926592, "grad_norm": 13.885313987731934, "learning_rate": 9.646747475877483e-06, "loss": 0.2377, "num_input_tokens_seen": 48971872, "step": 22705 }, { "epoch": 4.167737199486144, "grad_norm": 1.4101835489273071, "learning_rate": 9.646451775973088e-06, "loss": 0.2735, "num_input_tokens_seen": 48982080, "step": 22710 }, { "epoch": 4.168654799045696, "grad_norm": 1.2383540868759155, "learning_rate": 9.646155956894407e-06, "loss": 0.2932, "num_input_tokens_seen": 48993024, "step": 22715 }, { "epoch": 4.169572398605249, "grad_norm": 5.869962215423584, "learning_rate": 9.645860018649027e-06, "loss": 0.3041, "num_input_tokens_seen": 49004032, "step": 22720 }, { "epoch": 4.1704899981648005, "grad_norm": 2.3692142963409424, "learning_rate": 9.645563961244537e-06, "loss": 0.284, "num_input_tokens_seen": 49013984, "step": 22725 }, { "epoch": 4.171407597724353, "grad_norm": 2.503828525543213, "learning_rate": 9.645267784688531e-06, "loss": 0.275, "num_input_tokens_seen": 49025792, "step": 22730 }, { "epoch": 4.1723251972839055, "grad_norm": 1.584314227104187, "learning_rate": 9.644971488988606e-06, "loss": 0.3264, "num_input_tokens_seen": 49035424, "step": 22735 }, { "epoch": 4.173242796843457, "grad_norm": 4.265892028808594, "learning_rate": 9.644675074152364e-06, "loss": 0.2271, "num_input_tokens_seen": 49044736, "step": 22740 }, { "epoch": 4.17416039640301, "grad_norm": 1.5904366970062256, "learning_rate": 9.644378540187402e-06, "loss": 0.4023, "num_input_tokens_seen": 49056672, "step": 22745 }, { "epoch": 4.175077995962562, "grad_norm": 4.759538173675537, "learning_rate": 9.644081887101329e-06, "loss": 0.3871, "num_input_tokens_seen": 49067648, "step": 22750 }, { "epoch": 4.175995595522114, "grad_norm": 5.878173828125, "learning_rate": 9.643785114901754e-06, "loss": 0.3422, "num_input_tokens_seen": 49078016, "step": 22755 }, { "epoch": 4.176913195081666, "grad_norm": 0.9681944847106934, "learning_rate": 9.643488223596287e-06, "loss": 0.2844, "num_input_tokens_seen": 49088512, "step": 22760 }, { "epoch": 4.177830794641219, "grad_norm": 2.626819610595703, "learning_rate": 9.643191213192545e-06, "loss": 0.2143, "num_input_tokens_seen": 49098400, "step": 22765 }, { "epoch": 4.17874839420077, "grad_norm": 6.269490718841553, "learning_rate": 9.642894083698145e-06, "loss": 0.3669, "num_input_tokens_seen": 49108544, "step": 22770 }, { "epoch": 4.179665993760323, "grad_norm": 3.465482711791992, "learning_rate": 9.642596835120705e-06, "loss": 0.1781, "num_input_tokens_seen": 49119072, "step": 22775 }, { "epoch": 4.180583593319875, "grad_norm": 8.082725524902344, "learning_rate": 9.642299467467854e-06, "loss": 0.4223, "num_input_tokens_seen": 49129952, "step": 22780 }, { "epoch": 4.181501192879427, "grad_norm": 2.332933187484741, "learning_rate": 9.642001980747216e-06, "loss": 0.3949, "num_input_tokens_seen": 49138336, "step": 22785 }, { "epoch": 4.1824187924389795, "grad_norm": 5.430646896362305, "learning_rate": 9.641704374966421e-06, "loss": 0.3228, "num_input_tokens_seen": 49149728, "step": 22790 }, { "epoch": 4.183336391998532, "grad_norm": 1.764787197113037, "learning_rate": 9.641406650133104e-06, "loss": 0.2178, "num_input_tokens_seen": 49160224, "step": 22795 }, { "epoch": 4.184253991558084, "grad_norm": 4.704018592834473, "learning_rate": 9.6411088062549e-06, "loss": 0.4269, "num_input_tokens_seen": 49171840, "step": 22800 }, { "epoch": 4.185171591117636, "grad_norm": 14.338257789611816, "learning_rate": 9.640810843339445e-06, "loss": 0.255, "num_input_tokens_seen": 49182816, "step": 22805 }, { "epoch": 4.186089190677189, "grad_norm": 3.080071210861206, "learning_rate": 9.640512761394389e-06, "loss": 0.2898, "num_input_tokens_seen": 49193440, "step": 22810 }, { "epoch": 4.18700679023674, "grad_norm": 1.3877220153808594, "learning_rate": 9.64021456042737e-06, "loss": 0.3045, "num_input_tokens_seen": 49205760, "step": 22815 }, { "epoch": 4.187924389796293, "grad_norm": 3.427713394165039, "learning_rate": 9.63991624044604e-06, "loss": 0.2437, "num_input_tokens_seen": 49215712, "step": 22820 }, { "epoch": 4.188841989355845, "grad_norm": 1.0875250101089478, "learning_rate": 9.63961780145805e-06, "loss": 0.2763, "num_input_tokens_seen": 49227168, "step": 22825 }, { "epoch": 4.189759588915397, "grad_norm": 5.885824203491211, "learning_rate": 9.639319243471052e-06, "loss": 0.2948, "num_input_tokens_seen": 49237664, "step": 22830 }, { "epoch": 4.190677188474949, "grad_norm": 0.5786705613136292, "learning_rate": 9.639020566492708e-06, "loss": 0.226, "num_input_tokens_seen": 49249088, "step": 22835 }, { "epoch": 4.191594788034502, "grad_norm": 4.526058197021484, "learning_rate": 9.638721770530677e-06, "loss": 0.3944, "num_input_tokens_seen": 49260352, "step": 22840 }, { "epoch": 4.1925123875940535, "grad_norm": 1.6127315759658813, "learning_rate": 9.63842285559262e-06, "loss": 0.3299, "num_input_tokens_seen": 49270784, "step": 22845 }, { "epoch": 4.193429987153606, "grad_norm": 2.8191890716552734, "learning_rate": 9.638123821686206e-06, "loss": 0.3342, "num_input_tokens_seen": 49281088, "step": 22850 }, { "epoch": 4.1943475867131585, "grad_norm": 6.1007280349731445, "learning_rate": 9.637824668819104e-06, "loss": 0.2395, "num_input_tokens_seen": 49292160, "step": 22855 }, { "epoch": 4.19526518627271, "grad_norm": 12.736553192138672, "learning_rate": 9.63752539699899e-06, "loss": 0.1774, "num_input_tokens_seen": 49302944, "step": 22860 }, { "epoch": 4.196182785832263, "grad_norm": 6.0167717933654785, "learning_rate": 9.637226006233533e-06, "loss": 0.3559, "num_input_tokens_seen": 49313728, "step": 22865 }, { "epoch": 4.197100385391815, "grad_norm": 4.56124210357666, "learning_rate": 9.63692649653042e-06, "loss": 0.2401, "num_input_tokens_seen": 49324864, "step": 22870 }, { "epoch": 4.198017984951367, "grad_norm": 5.0760979652404785, "learning_rate": 9.636626867897325e-06, "loss": 0.2811, "num_input_tokens_seen": 49336832, "step": 22875 }, { "epoch": 4.198935584510919, "grad_norm": 2.21905517578125, "learning_rate": 9.63632712034194e-06, "loss": 0.1679, "num_input_tokens_seen": 49347136, "step": 22880 }, { "epoch": 4.199853184070472, "grad_norm": 1.5334522724151611, "learning_rate": 9.636027253871949e-06, "loss": 0.2394, "num_input_tokens_seen": 49359360, "step": 22885 }, { "epoch": 4.200770783630023, "grad_norm": 4.0941057205200195, "learning_rate": 9.635727268495043e-06, "loss": 0.2947, "num_input_tokens_seen": 49371808, "step": 22890 }, { "epoch": 4.201688383189576, "grad_norm": 4.048149585723877, "learning_rate": 9.63542716421892e-06, "loss": 0.3551, "num_input_tokens_seen": 49383136, "step": 22895 }, { "epoch": 4.202605982749128, "grad_norm": 8.800416946411133, "learning_rate": 9.635126941051271e-06, "loss": 0.2736, "num_input_tokens_seen": 49393632, "step": 22900 }, { "epoch": 4.20352358230868, "grad_norm": 4.393517017364502, "learning_rate": 9.634826598999802e-06, "loss": 0.3184, "num_input_tokens_seen": 49404384, "step": 22905 }, { "epoch": 4.204441181868233, "grad_norm": 3.9650118350982666, "learning_rate": 9.634526138072215e-06, "loss": 0.3614, "num_input_tokens_seen": 49414784, "step": 22910 }, { "epoch": 4.205358781427785, "grad_norm": 4.45690393447876, "learning_rate": 9.634225558276214e-06, "loss": 0.407, "num_input_tokens_seen": 49425568, "step": 22915 }, { "epoch": 4.206276380987337, "grad_norm": 2.3678762912750244, "learning_rate": 9.63392485961951e-06, "loss": 0.2199, "num_input_tokens_seen": 49436320, "step": 22920 }, { "epoch": 4.207193980546889, "grad_norm": 3.5995607376098633, "learning_rate": 9.633624042109815e-06, "loss": 0.2928, "num_input_tokens_seen": 49446240, "step": 22925 }, { "epoch": 4.208111580106442, "grad_norm": 3.8747997283935547, "learning_rate": 9.633323105754844e-06, "loss": 0.2684, "num_input_tokens_seen": 49457728, "step": 22930 }, { "epoch": 4.209029179665993, "grad_norm": 6.041750431060791, "learning_rate": 9.633022050562318e-06, "loss": 0.3031, "num_input_tokens_seen": 49468672, "step": 22935 }, { "epoch": 4.209946779225546, "grad_norm": 7.043107986450195, "learning_rate": 9.632720876539956e-06, "loss": 0.3369, "num_input_tokens_seen": 49480992, "step": 22940 }, { "epoch": 4.210864378785098, "grad_norm": 5.128864765167236, "learning_rate": 9.632419583695484e-06, "loss": 0.2638, "num_input_tokens_seen": 49491936, "step": 22945 }, { "epoch": 4.21178197834465, "grad_norm": 4.349945545196533, "learning_rate": 9.63211817203663e-06, "loss": 0.2168, "num_input_tokens_seen": 49502656, "step": 22950 }, { "epoch": 4.2126995779042025, "grad_norm": 2.955932140350342, "learning_rate": 9.631816641571123e-06, "loss": 0.3236, "num_input_tokens_seen": 49514208, "step": 22955 }, { "epoch": 4.213617177463755, "grad_norm": 2.749302864074707, "learning_rate": 9.631514992306698e-06, "loss": 0.3052, "num_input_tokens_seen": 49524384, "step": 22960 }, { "epoch": 4.214534777023307, "grad_norm": 4.6111063957214355, "learning_rate": 9.631213224251091e-06, "loss": 0.186, "num_input_tokens_seen": 49535200, "step": 22965 }, { "epoch": 4.215452376582859, "grad_norm": 2.394233465194702, "learning_rate": 9.630911337412044e-06, "loss": 0.3024, "num_input_tokens_seen": 49546656, "step": 22970 }, { "epoch": 4.216369976142412, "grad_norm": 5.436326503753662, "learning_rate": 9.630609331797297e-06, "loss": 0.2804, "num_input_tokens_seen": 49556608, "step": 22975 }, { "epoch": 4.217287575701963, "grad_norm": 3.9179327487945557, "learning_rate": 9.630307207414598e-06, "loss": 0.244, "num_input_tokens_seen": 49567648, "step": 22980 }, { "epoch": 4.218205175261516, "grad_norm": 2.3315694332122803, "learning_rate": 9.630004964271696e-06, "loss": 0.2438, "num_input_tokens_seen": 49578656, "step": 22985 }, { "epoch": 4.219122774821068, "grad_norm": 5.44738245010376, "learning_rate": 9.629702602376341e-06, "loss": 0.2061, "num_input_tokens_seen": 49589824, "step": 22990 }, { "epoch": 4.22004037438062, "grad_norm": 5.339926719665527, "learning_rate": 9.629400121736291e-06, "loss": 0.1874, "num_input_tokens_seen": 49601664, "step": 22995 }, { "epoch": 4.220957973940172, "grad_norm": 2.620173931121826, "learning_rate": 9.629097522359304e-06, "loss": 0.2496, "num_input_tokens_seen": 49614144, "step": 23000 }, { "epoch": 4.221875573499725, "grad_norm": 3.4374921321868896, "learning_rate": 9.628794804253137e-06, "loss": 0.2577, "num_input_tokens_seen": 49625824, "step": 23005 }, { "epoch": 4.2227931730592765, "grad_norm": 1.2049943208694458, "learning_rate": 9.62849196742556e-06, "loss": 0.3386, "num_input_tokens_seen": 49636192, "step": 23010 }, { "epoch": 4.223710772618829, "grad_norm": 2.563356399536133, "learning_rate": 9.628189011884335e-06, "loss": 0.1343, "num_input_tokens_seen": 49646912, "step": 23015 }, { "epoch": 4.2246283721783815, "grad_norm": 1.7400598526000977, "learning_rate": 9.627885937637236e-06, "loss": 0.2848, "num_input_tokens_seen": 49657952, "step": 23020 }, { "epoch": 4.225545971737933, "grad_norm": 6.500543594360352, "learning_rate": 9.627582744692036e-06, "loss": 0.2345, "num_input_tokens_seen": 49669568, "step": 23025 }, { "epoch": 4.226463571297486, "grad_norm": 3.884122133255005, "learning_rate": 9.62727943305651e-06, "loss": 0.3044, "num_input_tokens_seen": 49680480, "step": 23030 }, { "epoch": 4.227381170857038, "grad_norm": 17.900110244750977, "learning_rate": 9.626976002738438e-06, "loss": 0.2709, "num_input_tokens_seen": 49689888, "step": 23035 }, { "epoch": 4.22829877041659, "grad_norm": 3.879436492919922, "learning_rate": 9.626672453745603e-06, "loss": 0.3019, "num_input_tokens_seen": 49701344, "step": 23040 }, { "epoch": 4.229216369976142, "grad_norm": 3.170379161834717, "learning_rate": 9.626368786085792e-06, "loss": 0.2616, "num_input_tokens_seen": 49712640, "step": 23045 }, { "epoch": 4.230133969535695, "grad_norm": 8.982373237609863, "learning_rate": 9.626064999766788e-06, "loss": 0.5434, "num_input_tokens_seen": 49723904, "step": 23050 }, { "epoch": 4.231051569095246, "grad_norm": 5.272853374481201, "learning_rate": 9.62576109479639e-06, "loss": 0.2805, "num_input_tokens_seen": 49734272, "step": 23055 }, { "epoch": 4.231969168654799, "grad_norm": 6.1480536460876465, "learning_rate": 9.625457071182388e-06, "loss": 0.2276, "num_input_tokens_seen": 49744160, "step": 23060 }, { "epoch": 4.232886768214351, "grad_norm": 2.995164394378662, "learning_rate": 9.625152928932579e-06, "loss": 0.2525, "num_input_tokens_seen": 49754720, "step": 23065 }, { "epoch": 4.233804367773904, "grad_norm": 1.3990086317062378, "learning_rate": 9.62484866805477e-06, "loss": 0.1836, "num_input_tokens_seen": 49764128, "step": 23070 }, { "epoch": 4.2347219673334555, "grad_norm": 3.644235134124756, "learning_rate": 9.624544288556757e-06, "loss": 0.3038, "num_input_tokens_seen": 49775392, "step": 23075 }, { "epoch": 4.235639566893008, "grad_norm": 6.7572221755981445, "learning_rate": 9.62423979044635e-06, "loss": 0.2263, "num_input_tokens_seen": 49786080, "step": 23080 }, { "epoch": 4.2365571664525605, "grad_norm": 3.5638527870178223, "learning_rate": 9.623935173731362e-06, "loss": 0.3225, "num_input_tokens_seen": 49797600, "step": 23085 }, { "epoch": 4.237474766012112, "grad_norm": 18.98099136352539, "learning_rate": 9.623630438419602e-06, "loss": 0.211, "num_input_tokens_seen": 49806176, "step": 23090 }, { "epoch": 4.238392365571665, "grad_norm": 3.420006036758423, "learning_rate": 9.623325584518887e-06, "loss": 0.1843, "num_input_tokens_seen": 49816992, "step": 23095 }, { "epoch": 4.239309965131217, "grad_norm": 10.314916610717773, "learning_rate": 9.623020612037036e-06, "loss": 0.2495, "num_input_tokens_seen": 49828384, "step": 23100 }, { "epoch": 4.240227564690769, "grad_norm": 7.050529479980469, "learning_rate": 9.622715520981871e-06, "loss": 0.2354, "num_input_tokens_seen": 49839072, "step": 23105 }, { "epoch": 4.241145164250321, "grad_norm": 11.905165672302246, "learning_rate": 9.622410311361219e-06, "loss": 0.1805, "num_input_tokens_seen": 49849472, "step": 23110 }, { "epoch": 4.242062763809874, "grad_norm": 3.083662509918213, "learning_rate": 9.622104983182905e-06, "loss": 0.3124, "num_input_tokens_seen": 49860000, "step": 23115 }, { "epoch": 4.242980363369425, "grad_norm": 0.3249673545360565, "learning_rate": 9.621799536454763e-06, "loss": 0.3241, "num_input_tokens_seen": 49870624, "step": 23120 }, { "epoch": 4.243897962928978, "grad_norm": 4.0460638999938965, "learning_rate": 9.621493971184625e-06, "loss": 0.2648, "num_input_tokens_seen": 49881920, "step": 23125 }, { "epoch": 4.24481556248853, "grad_norm": 14.360228538513184, "learning_rate": 9.621188287380331e-06, "loss": 0.2845, "num_input_tokens_seen": 49893056, "step": 23130 }, { "epoch": 4.245733162048082, "grad_norm": 6.522379398345947, "learning_rate": 9.620882485049718e-06, "loss": 0.3242, "num_input_tokens_seen": 49902816, "step": 23135 }, { "epoch": 4.2466507616076345, "grad_norm": 5.216811180114746, "learning_rate": 9.620576564200632e-06, "loss": 0.3651, "num_input_tokens_seen": 49914912, "step": 23140 }, { "epoch": 4.247568361167187, "grad_norm": 20.072795867919922, "learning_rate": 9.620270524840918e-06, "loss": 0.336, "num_input_tokens_seen": 49925536, "step": 23145 }, { "epoch": 4.248485960726739, "grad_norm": 8.803990364074707, "learning_rate": 9.619964366978426e-06, "loss": 0.2969, "num_input_tokens_seen": 49934624, "step": 23150 }, { "epoch": 4.249403560286291, "grad_norm": 8.519938468933105, "learning_rate": 9.619658090621008e-06, "loss": 0.4957, "num_input_tokens_seen": 49945440, "step": 23155 }, { "epoch": 4.250321159845844, "grad_norm": 3.8588311672210693, "learning_rate": 9.61935169577652e-06, "loss": 0.3406, "num_input_tokens_seen": 49956288, "step": 23160 }, { "epoch": 4.251238759405395, "grad_norm": 30.916229248046875, "learning_rate": 9.61904518245282e-06, "loss": 0.2894, "num_input_tokens_seen": 49966400, "step": 23165 }, { "epoch": 4.252156358964948, "grad_norm": 3.658398151397705, "learning_rate": 9.61873855065777e-06, "loss": 0.3014, "num_input_tokens_seen": 49975904, "step": 23170 }, { "epoch": 4.2530739585245, "grad_norm": 5.560966491699219, "learning_rate": 9.618431800399236e-06, "loss": 0.2438, "num_input_tokens_seen": 49986944, "step": 23175 }, { "epoch": 4.253991558084052, "grad_norm": 5.934645175933838, "learning_rate": 9.61812493168508e-06, "loss": 0.349, "num_input_tokens_seen": 49998144, "step": 23180 }, { "epoch": 4.2549091576436044, "grad_norm": 6.229612827301025, "learning_rate": 9.617817944523181e-06, "loss": 0.3194, "num_input_tokens_seen": 50009088, "step": 23185 }, { "epoch": 4.255826757203157, "grad_norm": 8.358918190002441, "learning_rate": 9.617510838921407e-06, "loss": 0.2779, "num_input_tokens_seen": 50020608, "step": 23190 }, { "epoch": 4.256744356762709, "grad_norm": 16.887794494628906, "learning_rate": 9.617203614887639e-06, "loss": 0.2272, "num_input_tokens_seen": 50031712, "step": 23195 }, { "epoch": 4.257661956322261, "grad_norm": 5.662384510040283, "learning_rate": 9.616896272429752e-06, "loss": 0.2769, "num_input_tokens_seen": 50042272, "step": 23200 }, { "epoch": 4.258579555881814, "grad_norm": 2.205718994140625, "learning_rate": 9.61658881155563e-06, "loss": 0.4511, "num_input_tokens_seen": 50052832, "step": 23205 }, { "epoch": 4.259497155441365, "grad_norm": 2.4033100605010986, "learning_rate": 9.61628123227316e-06, "loss": 0.2474, "num_input_tokens_seen": 50063616, "step": 23210 }, { "epoch": 4.260414755000918, "grad_norm": 6.580888271331787, "learning_rate": 9.615973534590235e-06, "loss": 0.503, "num_input_tokens_seen": 50074176, "step": 23215 }, { "epoch": 4.26133235456047, "grad_norm": 5.887866973876953, "learning_rate": 9.61566571851474e-06, "loss": 0.4222, "num_input_tokens_seen": 50084896, "step": 23220 }, { "epoch": 4.262249954120022, "grad_norm": 5.8971686363220215, "learning_rate": 9.615357784054572e-06, "loss": 0.2762, "num_input_tokens_seen": 50097280, "step": 23225 }, { "epoch": 4.263167553679574, "grad_norm": 3.059969663619995, "learning_rate": 9.615049731217632e-06, "loss": 0.2244, "num_input_tokens_seen": 50107744, "step": 23230 }, { "epoch": 4.264085153239127, "grad_norm": 3.00797700881958, "learning_rate": 9.61474156001182e-06, "loss": 0.2695, "num_input_tokens_seen": 50118656, "step": 23235 }, { "epoch": 4.2650027527986785, "grad_norm": 3.377450466156006, "learning_rate": 9.614433270445036e-06, "loss": 0.2143, "num_input_tokens_seen": 50127808, "step": 23240 }, { "epoch": 4.265920352358231, "grad_norm": 1.1938611268997192, "learning_rate": 9.614124862525192e-06, "loss": 0.301, "num_input_tokens_seen": 50137952, "step": 23245 }, { "epoch": 4.2668379519177835, "grad_norm": 1.4986722469329834, "learning_rate": 9.613816336260198e-06, "loss": 0.2065, "num_input_tokens_seen": 50147904, "step": 23250 }, { "epoch": 4.267755551477335, "grad_norm": 2.3288674354553223, "learning_rate": 9.613507691657965e-06, "loss": 0.2263, "num_input_tokens_seen": 50159264, "step": 23255 }, { "epoch": 4.268673151036888, "grad_norm": 17.973054885864258, "learning_rate": 9.613198928726408e-06, "loss": 0.2288, "num_input_tokens_seen": 50170272, "step": 23260 }, { "epoch": 4.26959075059644, "grad_norm": 9.678686141967773, "learning_rate": 9.612890047473449e-06, "loss": 0.3642, "num_input_tokens_seen": 50180064, "step": 23265 }, { "epoch": 4.270508350155992, "grad_norm": 9.27664566040039, "learning_rate": 9.612581047907011e-06, "loss": 0.4119, "num_input_tokens_seen": 50191904, "step": 23270 }, { "epoch": 4.271425949715544, "grad_norm": 4.109041690826416, "learning_rate": 9.612271930035017e-06, "loss": 0.3295, "num_input_tokens_seen": 50203200, "step": 23275 }, { "epoch": 4.272343549275097, "grad_norm": 2.9821836948394775, "learning_rate": 9.611962693865395e-06, "loss": 0.3372, "num_input_tokens_seen": 50213920, "step": 23280 }, { "epoch": 4.273261148834648, "grad_norm": 3.0146217346191406, "learning_rate": 9.61165333940608e-06, "loss": 0.2033, "num_input_tokens_seen": 50224480, "step": 23285 }, { "epoch": 4.274178748394201, "grad_norm": 2.0757062435150146, "learning_rate": 9.611343866665004e-06, "loss": 0.22, "num_input_tokens_seen": 50235584, "step": 23290 }, { "epoch": 4.275096347953753, "grad_norm": 3.1358866691589355, "learning_rate": 9.611034275650104e-06, "loss": 0.2115, "num_input_tokens_seen": 50246592, "step": 23295 }, { "epoch": 4.276013947513305, "grad_norm": 2.5497663021087646, "learning_rate": 9.610724566369322e-06, "loss": 0.4401, "num_input_tokens_seen": 50257216, "step": 23300 }, { "epoch": 4.2769315470728575, "grad_norm": 1.5115587711334229, "learning_rate": 9.6104147388306e-06, "loss": 0.288, "num_input_tokens_seen": 50267808, "step": 23305 }, { "epoch": 4.27784914663241, "grad_norm": 1.345154881477356, "learning_rate": 9.610104793041885e-06, "loss": 0.2021, "num_input_tokens_seen": 50278720, "step": 23310 }, { "epoch": 4.278766746191962, "grad_norm": 8.537190437316895, "learning_rate": 9.609794729011128e-06, "loss": 0.2825, "num_input_tokens_seen": 50288224, "step": 23315 }, { "epoch": 4.279684345751514, "grad_norm": 7.8312458992004395, "learning_rate": 9.609484546746282e-06, "loss": 0.3558, "num_input_tokens_seen": 50299424, "step": 23320 }, { "epoch": 4.280601945311067, "grad_norm": 4.388204097747803, "learning_rate": 9.6091742462553e-06, "loss": 0.3687, "num_input_tokens_seen": 50309248, "step": 23325 }, { "epoch": 4.281519544870618, "grad_norm": 10.859227180480957, "learning_rate": 9.608863827546142e-06, "loss": 0.2568, "num_input_tokens_seen": 50320128, "step": 23330 }, { "epoch": 4.282437144430171, "grad_norm": 11.619739532470703, "learning_rate": 9.60855329062677e-06, "loss": 0.3295, "num_input_tokens_seen": 50330944, "step": 23335 }, { "epoch": 4.283354743989723, "grad_norm": 13.47713565826416, "learning_rate": 9.60824263550515e-06, "loss": 0.2252, "num_input_tokens_seen": 50341920, "step": 23340 }, { "epoch": 4.284272343549275, "grad_norm": 1.4645198583602905, "learning_rate": 9.607931862189246e-06, "loss": 0.2374, "num_input_tokens_seen": 50353344, "step": 23345 }, { "epoch": 4.285189943108827, "grad_norm": 13.29472541809082, "learning_rate": 9.607620970687032e-06, "loss": 0.2646, "num_input_tokens_seen": 50364096, "step": 23350 }, { "epoch": 4.28610754266838, "grad_norm": 6.468348503112793, "learning_rate": 9.607309961006484e-06, "loss": 0.3404, "num_input_tokens_seen": 50374592, "step": 23355 }, { "epoch": 4.2870251422279315, "grad_norm": 8.090575218200684, "learning_rate": 9.606998833155574e-06, "loss": 0.3507, "num_input_tokens_seen": 50385696, "step": 23360 }, { "epoch": 4.287942741787484, "grad_norm": 1.6746231317520142, "learning_rate": 9.606687587142284e-06, "loss": 0.2827, "num_input_tokens_seen": 50395552, "step": 23365 }, { "epoch": 4.2888603413470365, "grad_norm": 5.190488815307617, "learning_rate": 9.606376222974599e-06, "loss": 0.4142, "num_input_tokens_seen": 50407424, "step": 23370 }, { "epoch": 4.289777940906588, "grad_norm": 10.398826599121094, "learning_rate": 9.606064740660501e-06, "loss": 0.3352, "num_input_tokens_seen": 50418208, "step": 23375 }, { "epoch": 4.290695540466141, "grad_norm": 11.61768627166748, "learning_rate": 9.60575314020798e-06, "loss": 0.366, "num_input_tokens_seen": 50428128, "step": 23380 }, { "epoch": 4.291613140025693, "grad_norm": 4.949852466583252, "learning_rate": 9.605441421625032e-06, "loss": 0.3586, "num_input_tokens_seen": 50440000, "step": 23385 }, { "epoch": 4.292530739585245, "grad_norm": 3.3478643894195557, "learning_rate": 9.605129584919649e-06, "loss": 0.2646, "num_input_tokens_seen": 50451200, "step": 23390 }, { "epoch": 4.293448339144797, "grad_norm": 6.3639984130859375, "learning_rate": 9.604817630099827e-06, "loss": 0.3435, "num_input_tokens_seen": 50461632, "step": 23395 }, { "epoch": 4.29436593870435, "grad_norm": 2.233222723007202, "learning_rate": 9.604505557173573e-06, "loss": 0.2855, "num_input_tokens_seen": 50473728, "step": 23400 }, { "epoch": 4.295283538263901, "grad_norm": 1.1976993083953857, "learning_rate": 9.604193366148887e-06, "loss": 0.2349, "num_input_tokens_seen": 50485024, "step": 23405 }, { "epoch": 4.296201137823454, "grad_norm": 1.8393833637237549, "learning_rate": 9.603881057033775e-06, "loss": 0.2402, "num_input_tokens_seen": 50496256, "step": 23410 }, { "epoch": 4.297118737383006, "grad_norm": 6.114284038543701, "learning_rate": 9.60356862983625e-06, "loss": 0.319, "num_input_tokens_seen": 50506016, "step": 23415 }, { "epoch": 4.298036336942558, "grad_norm": 2.4438514709472656, "learning_rate": 9.603256084564325e-06, "loss": 0.2972, "num_input_tokens_seen": 50516224, "step": 23420 }, { "epoch": 4.2989539365021106, "grad_norm": 1.6700698137283325, "learning_rate": 9.602943421226017e-06, "loss": 0.1845, "num_input_tokens_seen": 50527136, "step": 23425 }, { "epoch": 4.299871536061663, "grad_norm": 14.685625076293945, "learning_rate": 9.602630639829343e-06, "loss": 0.1765, "num_input_tokens_seen": 50536992, "step": 23430 }, { "epoch": 4.300789135621215, "grad_norm": 1.7800326347351074, "learning_rate": 9.602317740382325e-06, "loss": 0.2859, "num_input_tokens_seen": 50548512, "step": 23435 }, { "epoch": 4.301706735180767, "grad_norm": 16.023334503173828, "learning_rate": 9.602004722892993e-06, "loss": 0.2219, "num_input_tokens_seen": 50560160, "step": 23440 }, { "epoch": 4.30262433474032, "grad_norm": 3.3907742500305176, "learning_rate": 9.60169158736937e-06, "loss": 0.2781, "num_input_tokens_seen": 50571552, "step": 23445 }, { "epoch": 4.303541934299871, "grad_norm": 21.362321853637695, "learning_rate": 9.60137833381949e-06, "loss": 0.2305, "num_input_tokens_seen": 50582272, "step": 23450 }, { "epoch": 4.304459533859424, "grad_norm": 3.0601654052734375, "learning_rate": 9.601064962251387e-06, "loss": 0.4531, "num_input_tokens_seen": 50592736, "step": 23455 }, { "epoch": 4.305377133418976, "grad_norm": 2.792661666870117, "learning_rate": 9.6007514726731e-06, "loss": 0.2718, "num_input_tokens_seen": 50603552, "step": 23460 }, { "epoch": 4.306294732978528, "grad_norm": 7.454261302947998, "learning_rate": 9.600437865092667e-06, "loss": 0.3441, "num_input_tokens_seen": 50613376, "step": 23465 }, { "epoch": 4.3072123325380804, "grad_norm": 2.584181308746338, "learning_rate": 9.600124139518134e-06, "loss": 0.2766, "num_input_tokens_seen": 50623392, "step": 23470 }, { "epoch": 4.308129932097633, "grad_norm": 2.7258219718933105, "learning_rate": 9.599810295957545e-06, "loss": 0.4186, "num_input_tokens_seen": 50634848, "step": 23475 }, { "epoch": 4.309047531657185, "grad_norm": 2.2531418800354004, "learning_rate": 9.599496334418952e-06, "loss": 0.2446, "num_input_tokens_seen": 50645120, "step": 23480 }, { "epoch": 4.309965131216737, "grad_norm": 1.1355493068695068, "learning_rate": 9.599182254910407e-06, "loss": 0.1607, "num_input_tokens_seen": 50656032, "step": 23485 }, { "epoch": 4.31088273077629, "grad_norm": 2.8630688190460205, "learning_rate": 9.598868057439965e-06, "loss": 0.3552, "num_input_tokens_seen": 50667616, "step": 23490 }, { "epoch": 4.311800330335841, "grad_norm": 4.2768659591674805, "learning_rate": 9.598553742015685e-06, "loss": 0.2939, "num_input_tokens_seen": 50679072, "step": 23495 }, { "epoch": 4.312717929895394, "grad_norm": 5.558958053588867, "learning_rate": 9.598239308645627e-06, "loss": 0.2806, "num_input_tokens_seen": 50688928, "step": 23500 }, { "epoch": 4.313635529454946, "grad_norm": 3.5466301441192627, "learning_rate": 9.59792475733786e-06, "loss": 0.1847, "num_input_tokens_seen": 50700160, "step": 23505 }, { "epoch": 4.314553129014498, "grad_norm": 10.623424530029297, "learning_rate": 9.597610088100446e-06, "loss": 0.2091, "num_input_tokens_seen": 50711296, "step": 23510 }, { "epoch": 4.31547072857405, "grad_norm": 3.1261448860168457, "learning_rate": 9.59729530094146e-06, "loss": 0.293, "num_input_tokens_seen": 50721024, "step": 23515 }, { "epoch": 4.316388328133603, "grad_norm": 5.554008483886719, "learning_rate": 9.596980395868976e-06, "loss": 0.2771, "num_input_tokens_seen": 50732608, "step": 23520 }, { "epoch": 4.3173059276931545, "grad_norm": 3.4314463138580322, "learning_rate": 9.59666537289107e-06, "loss": 0.3348, "num_input_tokens_seen": 50743232, "step": 23525 }, { "epoch": 4.318223527252707, "grad_norm": 1.0864490270614624, "learning_rate": 9.59635023201582e-06, "loss": 0.1139, "num_input_tokens_seen": 50753600, "step": 23530 }, { "epoch": 4.3191411268122595, "grad_norm": 2.0603315830230713, "learning_rate": 9.59603497325131e-06, "loss": 0.2617, "num_input_tokens_seen": 50764768, "step": 23535 }, { "epoch": 4.320058726371811, "grad_norm": 5.019749164581299, "learning_rate": 9.595719596605626e-06, "loss": 0.3292, "num_input_tokens_seen": 50775296, "step": 23540 }, { "epoch": 4.320976325931364, "grad_norm": 5.370241641998291, "learning_rate": 9.595404102086858e-06, "loss": 0.2895, "num_input_tokens_seen": 50785088, "step": 23545 }, { "epoch": 4.321893925490916, "grad_norm": 1.7398898601531982, "learning_rate": 9.595088489703096e-06, "loss": 0.2491, "num_input_tokens_seen": 50795552, "step": 23550 }, { "epoch": 4.322811525050468, "grad_norm": 11.044583320617676, "learning_rate": 9.594772759462436e-06, "loss": 0.3913, "num_input_tokens_seen": 50807680, "step": 23555 }, { "epoch": 4.32372912461002, "grad_norm": 13.354517936706543, "learning_rate": 9.594456911372978e-06, "loss": 0.2676, "num_input_tokens_seen": 50818976, "step": 23560 }, { "epoch": 4.324646724169573, "grad_norm": 5.523928165435791, "learning_rate": 9.594140945442819e-06, "loss": 0.2022, "num_input_tokens_seen": 50829504, "step": 23565 }, { "epoch": 4.325564323729124, "grad_norm": 1.8523080348968506, "learning_rate": 9.593824861680065e-06, "loss": 0.2051, "num_input_tokens_seen": 50839904, "step": 23570 }, { "epoch": 4.326481923288677, "grad_norm": 7.751831531524658, "learning_rate": 9.593508660092824e-06, "loss": 0.3827, "num_input_tokens_seen": 50849984, "step": 23575 }, { "epoch": 4.327399522848229, "grad_norm": 14.941081047058105, "learning_rate": 9.593192340689204e-06, "loss": 0.2551, "num_input_tokens_seen": 50861824, "step": 23580 }, { "epoch": 4.328317122407781, "grad_norm": 1.474661111831665, "learning_rate": 9.592875903477321e-06, "loss": 0.2078, "num_input_tokens_seen": 50872224, "step": 23585 }, { "epoch": 4.3292347219673335, "grad_norm": 1.158443808555603, "learning_rate": 9.592559348465289e-06, "loss": 0.2054, "num_input_tokens_seen": 50882720, "step": 23590 }, { "epoch": 4.330152321526886, "grad_norm": 10.992081642150879, "learning_rate": 9.592242675661227e-06, "loss": 0.4121, "num_input_tokens_seen": 50892800, "step": 23595 }, { "epoch": 4.331069921086438, "grad_norm": 9.848660469055176, "learning_rate": 9.591925885073257e-06, "loss": 0.4122, "num_input_tokens_seen": 50903520, "step": 23600 }, { "epoch": 4.33198752064599, "grad_norm": 5.644924640655518, "learning_rate": 9.591608976709505e-06, "loss": 0.1809, "num_input_tokens_seen": 50914432, "step": 23605 }, { "epoch": 4.332905120205543, "grad_norm": 8.835258483886719, "learning_rate": 9.5912919505781e-06, "loss": 0.2753, "num_input_tokens_seen": 50925856, "step": 23610 }, { "epoch": 4.333822719765094, "grad_norm": 0.7265892028808594, "learning_rate": 9.590974806687173e-06, "loss": 0.2346, "num_input_tokens_seen": 50935904, "step": 23615 }, { "epoch": 4.334740319324647, "grad_norm": 4.836278438568115, "learning_rate": 9.590657545044856e-06, "loss": 0.2063, "num_input_tokens_seen": 50948320, "step": 23620 }, { "epoch": 4.335657918884199, "grad_norm": 2.065675973892212, "learning_rate": 9.590340165659288e-06, "loss": 0.3474, "num_input_tokens_seen": 50957824, "step": 23625 }, { "epoch": 4.336575518443751, "grad_norm": 17.13572120666504, "learning_rate": 9.59002266853861e-06, "loss": 0.2774, "num_input_tokens_seen": 50968320, "step": 23630 }, { "epoch": 4.337493118003303, "grad_norm": 6.530811786651611, "learning_rate": 9.589705053690963e-06, "loss": 0.3809, "num_input_tokens_seen": 50979424, "step": 23635 }, { "epoch": 4.338410717562856, "grad_norm": 11.385183334350586, "learning_rate": 9.589387321124496e-06, "loss": 0.2559, "num_input_tokens_seen": 50989728, "step": 23640 }, { "epoch": 4.3393283171224075, "grad_norm": 5.08544921875, "learning_rate": 9.589069470847358e-06, "loss": 0.2932, "num_input_tokens_seen": 50999232, "step": 23645 }, { "epoch": 4.34024591668196, "grad_norm": 10.34582233428955, "learning_rate": 9.588751502867699e-06, "loss": 0.3378, "num_input_tokens_seen": 51011072, "step": 23650 }, { "epoch": 4.3411635162415125, "grad_norm": 1.2287627458572388, "learning_rate": 9.588433417193677e-06, "loss": 0.3259, "num_input_tokens_seen": 51022688, "step": 23655 }, { "epoch": 4.342081115801064, "grad_norm": 11.689382553100586, "learning_rate": 9.588115213833447e-06, "loss": 0.1423, "num_input_tokens_seen": 51033792, "step": 23660 }, { "epoch": 4.342998715360617, "grad_norm": 4.942392349243164, "learning_rate": 9.587796892795175e-06, "loss": 0.3134, "num_input_tokens_seen": 51045056, "step": 23665 }, { "epoch": 4.343916314920169, "grad_norm": 4.276736736297607, "learning_rate": 9.587478454087022e-06, "loss": 0.3556, "num_input_tokens_seen": 51056096, "step": 23670 }, { "epoch": 4.344833914479721, "grad_norm": 6.009681701660156, "learning_rate": 9.587159897717157e-06, "loss": 0.3218, "num_input_tokens_seen": 51066528, "step": 23675 }, { "epoch": 4.345751514039273, "grad_norm": 3.4541308879852295, "learning_rate": 9.586841223693751e-06, "loss": 0.3079, "num_input_tokens_seen": 51077952, "step": 23680 }, { "epoch": 4.346669113598826, "grad_norm": 11.057889938354492, "learning_rate": 9.586522432024974e-06, "loss": 0.2831, "num_input_tokens_seen": 51088608, "step": 23685 }, { "epoch": 4.347586713158377, "grad_norm": 2.6887881755828857, "learning_rate": 9.586203522719007e-06, "loss": 0.4081, "num_input_tokens_seen": 51099360, "step": 23690 }, { "epoch": 4.34850431271793, "grad_norm": 6.097958087921143, "learning_rate": 9.585884495784027e-06, "loss": 0.2762, "num_input_tokens_seen": 51110080, "step": 23695 }, { "epoch": 4.349421912277482, "grad_norm": 3.000917911529541, "learning_rate": 9.585565351228218e-06, "loss": 0.2395, "num_input_tokens_seen": 51120288, "step": 23700 }, { "epoch": 4.350339511837034, "grad_norm": 4.396874904632568, "learning_rate": 9.585246089059765e-06, "loss": 0.2698, "num_input_tokens_seen": 51131136, "step": 23705 }, { "epoch": 4.3512571113965866, "grad_norm": 19.296972274780273, "learning_rate": 9.584926709286855e-06, "loss": 0.2469, "num_input_tokens_seen": 51141760, "step": 23710 }, { "epoch": 4.352174710956139, "grad_norm": 3.9940919876098633, "learning_rate": 9.584607211917681e-06, "loss": 0.3241, "num_input_tokens_seen": 51152640, "step": 23715 }, { "epoch": 4.353092310515691, "grad_norm": 10.316180229187012, "learning_rate": 9.584287596960437e-06, "loss": 0.1726, "num_input_tokens_seen": 51162944, "step": 23720 }, { "epoch": 4.354009910075243, "grad_norm": 1.5195879936218262, "learning_rate": 9.583967864423323e-06, "loss": 0.2139, "num_input_tokens_seen": 51173600, "step": 23725 }, { "epoch": 4.354927509634796, "grad_norm": 5.710203170776367, "learning_rate": 9.583648014314537e-06, "loss": 0.2626, "num_input_tokens_seen": 51184544, "step": 23730 }, { "epoch": 4.355845109194347, "grad_norm": 8.375776290893555, "learning_rate": 9.583328046642283e-06, "loss": 0.2573, "num_input_tokens_seen": 51195168, "step": 23735 }, { "epoch": 4.3567627087539, "grad_norm": 4.905691146850586, "learning_rate": 9.583007961414769e-06, "loss": 0.3466, "num_input_tokens_seen": 51204480, "step": 23740 }, { "epoch": 4.357680308313452, "grad_norm": 2.975684404373169, "learning_rate": 9.582687758640204e-06, "loss": 0.1434, "num_input_tokens_seen": 51216256, "step": 23745 }, { "epoch": 4.358597907873004, "grad_norm": 5.253269195556641, "learning_rate": 9.582367438326799e-06, "loss": 0.2797, "num_input_tokens_seen": 51226656, "step": 23750 }, { "epoch": 4.3595155074325564, "grad_norm": 2.1584904193878174, "learning_rate": 9.58204700048277e-06, "loss": 0.4253, "num_input_tokens_seen": 51237760, "step": 23755 }, { "epoch": 4.360433106992109, "grad_norm": 3.852555274963379, "learning_rate": 9.58172644511634e-06, "loss": 0.3343, "num_input_tokens_seen": 51248768, "step": 23760 }, { "epoch": 4.361350706551661, "grad_norm": 14.924681663513184, "learning_rate": 9.581405772235726e-06, "loss": 0.2093, "num_input_tokens_seen": 51260224, "step": 23765 }, { "epoch": 4.362268306111213, "grad_norm": 2.9751994609832764, "learning_rate": 9.581084981849156e-06, "loss": 0.2541, "num_input_tokens_seen": 51271200, "step": 23770 }, { "epoch": 4.363185905670766, "grad_norm": 8.650030136108398, "learning_rate": 9.580764073964855e-06, "loss": 0.2145, "num_input_tokens_seen": 51281696, "step": 23775 }, { "epoch": 4.364103505230317, "grad_norm": 8.251240730285645, "learning_rate": 9.580443048591055e-06, "loss": 0.3219, "num_input_tokens_seen": 51292512, "step": 23780 }, { "epoch": 4.36502110478987, "grad_norm": 7.550590991973877, "learning_rate": 9.58012190573599e-06, "loss": 0.3711, "num_input_tokens_seen": 51302976, "step": 23785 }, { "epoch": 4.365938704349422, "grad_norm": 6.526272296905518, "learning_rate": 9.579800645407897e-06, "loss": 0.2706, "num_input_tokens_seen": 51313824, "step": 23790 }, { "epoch": 4.366856303908974, "grad_norm": 7.666321277618408, "learning_rate": 9.579479267615016e-06, "loss": 0.2163, "num_input_tokens_seen": 51324416, "step": 23795 }, { "epoch": 4.367773903468526, "grad_norm": 1.1739429235458374, "learning_rate": 9.579157772365589e-06, "loss": 0.273, "num_input_tokens_seen": 51335232, "step": 23800 }, { "epoch": 4.368691503028079, "grad_norm": 3.6406917572021484, "learning_rate": 9.578836159667861e-06, "loss": 0.3163, "num_input_tokens_seen": 51345632, "step": 23805 }, { "epoch": 4.3696091025876305, "grad_norm": 7.037081718444824, "learning_rate": 9.578514429530084e-06, "loss": 0.1824, "num_input_tokens_seen": 51356544, "step": 23810 }, { "epoch": 4.370526702147183, "grad_norm": 5.604095458984375, "learning_rate": 9.578192581960506e-06, "loss": 0.257, "num_input_tokens_seen": 51368000, "step": 23815 }, { "epoch": 4.3714443017067355, "grad_norm": 14.991826057434082, "learning_rate": 9.577870616967386e-06, "loss": 0.293, "num_input_tokens_seen": 51379008, "step": 23820 }, { "epoch": 4.372361901266287, "grad_norm": 12.487054824829102, "learning_rate": 9.577548534558979e-06, "loss": 0.4296, "num_input_tokens_seen": 51388864, "step": 23825 }, { "epoch": 4.37327950082584, "grad_norm": 0.8704041838645935, "learning_rate": 9.577226334743546e-06, "loss": 0.2677, "num_input_tokens_seen": 51400160, "step": 23830 }, { "epoch": 4.374197100385392, "grad_norm": 10.39426326751709, "learning_rate": 9.576904017529351e-06, "loss": 0.2261, "num_input_tokens_seen": 51409472, "step": 23835 }, { "epoch": 4.375114699944944, "grad_norm": 1.1469510793685913, "learning_rate": 9.576581582924663e-06, "loss": 0.1855, "num_input_tokens_seen": 51419360, "step": 23840 }, { "epoch": 4.376032299504496, "grad_norm": 5.267359256744385, "learning_rate": 9.57625903093775e-06, "loss": 0.258, "num_input_tokens_seen": 51429792, "step": 23845 }, { "epoch": 4.376949899064049, "grad_norm": 4.95540714263916, "learning_rate": 9.575936361576884e-06, "loss": 0.2375, "num_input_tokens_seen": 51440160, "step": 23850 }, { "epoch": 4.3778674986236, "grad_norm": 4.366537570953369, "learning_rate": 9.575613574850344e-06, "loss": 0.2703, "num_input_tokens_seen": 51450304, "step": 23855 }, { "epoch": 4.378785098183153, "grad_norm": 26.35650634765625, "learning_rate": 9.575290670766406e-06, "loss": 0.3069, "num_input_tokens_seen": 51460768, "step": 23860 }, { "epoch": 4.379702697742705, "grad_norm": 5.961838722229004, "learning_rate": 9.574967649333354e-06, "loss": 0.3533, "num_input_tokens_seen": 51470560, "step": 23865 }, { "epoch": 4.380620297302257, "grad_norm": 5.1764726638793945, "learning_rate": 9.574644510559472e-06, "loss": 0.2666, "num_input_tokens_seen": 51480384, "step": 23870 }, { "epoch": 4.3815378968618095, "grad_norm": 2.5676660537719727, "learning_rate": 9.57432125445305e-06, "loss": 0.2917, "num_input_tokens_seen": 51490848, "step": 23875 }, { "epoch": 4.382455496421362, "grad_norm": 10.29787540435791, "learning_rate": 9.573997881022377e-06, "loss": 0.3303, "num_input_tokens_seen": 51500288, "step": 23880 }, { "epoch": 4.383373095980914, "grad_norm": 5.836280822753906, "learning_rate": 9.573674390275746e-06, "loss": 0.3161, "num_input_tokens_seen": 51509888, "step": 23885 }, { "epoch": 4.384290695540466, "grad_norm": 2.7931158542633057, "learning_rate": 9.573350782221456e-06, "loss": 0.2664, "num_input_tokens_seen": 51519520, "step": 23890 }, { "epoch": 4.385208295100019, "grad_norm": 1.648515224456787, "learning_rate": 9.573027056867807e-06, "loss": 0.1714, "num_input_tokens_seen": 51529152, "step": 23895 }, { "epoch": 4.38612589465957, "grad_norm": 3.401010036468506, "learning_rate": 9.5727032142231e-06, "loss": 0.3007, "num_input_tokens_seen": 51541056, "step": 23900 }, { "epoch": 4.387043494219123, "grad_norm": 3.225330114364624, "learning_rate": 9.572379254295645e-06, "loss": 0.2113, "num_input_tokens_seen": 51550368, "step": 23905 }, { "epoch": 4.387961093778675, "grad_norm": 6.414811611175537, "learning_rate": 9.572055177093747e-06, "loss": 0.2662, "num_input_tokens_seen": 51561728, "step": 23910 }, { "epoch": 4.388878693338227, "grad_norm": 2.334543466567993, "learning_rate": 9.57173098262572e-06, "loss": 0.426, "num_input_tokens_seen": 51572864, "step": 23915 }, { "epoch": 4.389796292897779, "grad_norm": 5.943380832672119, "learning_rate": 9.571406670899879e-06, "loss": 0.2806, "num_input_tokens_seen": 51584032, "step": 23920 }, { "epoch": 4.390713892457332, "grad_norm": 4.763813018798828, "learning_rate": 9.57108224192454e-06, "loss": 0.3065, "num_input_tokens_seen": 51594528, "step": 23925 }, { "epoch": 4.3916314920168835, "grad_norm": 1.407142996788025, "learning_rate": 9.57075769570803e-06, "loss": 0.2737, "num_input_tokens_seen": 51605344, "step": 23930 }, { "epoch": 4.392549091576436, "grad_norm": 2.5865437984466553, "learning_rate": 9.570433032258666e-06, "loss": 0.2337, "num_input_tokens_seen": 51615072, "step": 23935 }, { "epoch": 4.3934666911359885, "grad_norm": 2.711660623550415, "learning_rate": 9.57010825158478e-06, "loss": 0.3026, "num_input_tokens_seen": 51627328, "step": 23940 }, { "epoch": 4.39438429069554, "grad_norm": 2.8349242210388184, "learning_rate": 9.569783353694699e-06, "loss": 0.2881, "num_input_tokens_seen": 51639104, "step": 23945 }, { "epoch": 4.395301890255093, "grad_norm": 7.389303207397461, "learning_rate": 9.56945833859676e-06, "loss": 0.2507, "num_input_tokens_seen": 51649632, "step": 23950 }, { "epoch": 4.396219489814645, "grad_norm": 5.537904262542725, "learning_rate": 9.569133206299294e-06, "loss": 0.528, "num_input_tokens_seen": 51660288, "step": 23955 }, { "epoch": 4.397137089374197, "grad_norm": 2.991814613342285, "learning_rate": 9.568807956810645e-06, "loss": 0.1479, "num_input_tokens_seen": 51671840, "step": 23960 }, { "epoch": 4.398054688933749, "grad_norm": 3.792978525161743, "learning_rate": 9.56848259013915e-06, "loss": 0.2695, "num_input_tokens_seen": 51681792, "step": 23965 }, { "epoch": 4.398972288493302, "grad_norm": 17.753738403320312, "learning_rate": 9.568157106293158e-06, "loss": 0.3214, "num_input_tokens_seen": 51692352, "step": 23970 }, { "epoch": 4.399889888052853, "grad_norm": 4.625783920288086, "learning_rate": 9.567831505281018e-06, "loss": 0.3083, "num_input_tokens_seen": 51701856, "step": 23975 }, { "epoch": 4.400807487612406, "grad_norm": 3.882580518722534, "learning_rate": 9.567505787111078e-06, "loss": 0.1896, "num_input_tokens_seen": 51711680, "step": 23980 }, { "epoch": 4.401725087171958, "grad_norm": 3.9585554599761963, "learning_rate": 9.567179951791695e-06, "loss": 0.4536, "num_input_tokens_seen": 51724288, "step": 23985 }, { "epoch": 4.40264268673151, "grad_norm": 6.620211124420166, "learning_rate": 9.566853999331222e-06, "loss": 0.2488, "num_input_tokens_seen": 51735968, "step": 23990 }, { "epoch": 4.4035602862910626, "grad_norm": 2.188222885131836, "learning_rate": 9.566527929738023e-06, "loss": 0.2697, "num_input_tokens_seen": 51747200, "step": 23995 }, { "epoch": 4.404477885850615, "grad_norm": 3.4888246059417725, "learning_rate": 9.56620174302046e-06, "loss": 0.2491, "num_input_tokens_seen": 51759008, "step": 24000 }, { "epoch": 4.405395485410167, "grad_norm": 1.0962680578231812, "learning_rate": 9.565875439186901e-06, "loss": 0.3898, "num_input_tokens_seen": 51768960, "step": 24005 }, { "epoch": 4.406313084969719, "grad_norm": 2.1427788734436035, "learning_rate": 9.565549018245712e-06, "loss": 0.2092, "num_input_tokens_seen": 51780352, "step": 24010 }, { "epoch": 4.407230684529272, "grad_norm": 4.322109699249268, "learning_rate": 9.565222480205268e-06, "loss": 0.241, "num_input_tokens_seen": 51789152, "step": 24015 }, { "epoch": 4.408148284088823, "grad_norm": 10.16307544708252, "learning_rate": 9.56489582507394e-06, "loss": 0.3767, "num_input_tokens_seen": 51800864, "step": 24020 }, { "epoch": 4.409065883648376, "grad_norm": 4.176291465759277, "learning_rate": 9.564569052860111e-06, "loss": 0.3251, "num_input_tokens_seen": 51811488, "step": 24025 }, { "epoch": 4.409983483207928, "grad_norm": 4.460374355316162, "learning_rate": 9.56424216357216e-06, "loss": 0.2222, "num_input_tokens_seen": 51822432, "step": 24030 }, { "epoch": 4.41090108276748, "grad_norm": 7.8647141456604, "learning_rate": 9.56391515721847e-06, "loss": 0.2461, "num_input_tokens_seen": 51833120, "step": 24035 }, { "epoch": 4.4118186823270324, "grad_norm": 1.590415120124817, "learning_rate": 9.563588033807431e-06, "loss": 0.2987, "num_input_tokens_seen": 51844512, "step": 24040 }, { "epoch": 4.412736281886585, "grad_norm": 3.0271360874176025, "learning_rate": 9.563260793347433e-06, "loss": 0.4239, "num_input_tokens_seen": 51855392, "step": 24045 }, { "epoch": 4.413653881446137, "grad_norm": 2.838681936264038, "learning_rate": 9.562933435846868e-06, "loss": 0.3806, "num_input_tokens_seen": 51866144, "step": 24050 }, { "epoch": 4.414571481005689, "grad_norm": 10.034987449645996, "learning_rate": 9.56260596131413e-06, "loss": 0.2853, "num_input_tokens_seen": 51877792, "step": 24055 }, { "epoch": 4.415489080565242, "grad_norm": 1.4807775020599365, "learning_rate": 9.562278369757623e-06, "loss": 0.2346, "num_input_tokens_seen": 51887808, "step": 24060 }, { "epoch": 4.416406680124793, "grad_norm": 2.2392005920410156, "learning_rate": 9.561950661185744e-06, "loss": 0.1196, "num_input_tokens_seen": 51899040, "step": 24065 }, { "epoch": 4.417324279684346, "grad_norm": 7.2964606285095215, "learning_rate": 9.561622835606903e-06, "loss": 0.4254, "num_input_tokens_seen": 51909760, "step": 24070 }, { "epoch": 4.418241879243898, "grad_norm": 14.410778999328613, "learning_rate": 9.561294893029504e-06, "loss": 0.4366, "num_input_tokens_seen": 51920960, "step": 24075 }, { "epoch": 4.41915947880345, "grad_norm": 4.0431365966796875, "learning_rate": 9.560966833461964e-06, "loss": 0.223, "num_input_tokens_seen": 51932160, "step": 24080 }, { "epoch": 4.420077078363002, "grad_norm": 3.4878203868865967, "learning_rate": 9.56063865691269e-06, "loss": 0.1972, "num_input_tokens_seen": 51942592, "step": 24085 }, { "epoch": 4.420994677922555, "grad_norm": 3.662106990814209, "learning_rate": 9.560310363390105e-06, "loss": 0.276, "num_input_tokens_seen": 51954112, "step": 24090 }, { "epoch": 4.4219122774821065, "grad_norm": 11.836435317993164, "learning_rate": 9.559981952902626e-06, "loss": 0.2962, "num_input_tokens_seen": 51965120, "step": 24095 }, { "epoch": 4.422829877041659, "grad_norm": 3.4717166423797607, "learning_rate": 9.559653425458677e-06, "loss": 0.2729, "num_input_tokens_seen": 51974656, "step": 24100 }, { "epoch": 4.4237474766012115, "grad_norm": 11.653332710266113, "learning_rate": 9.559324781066686e-06, "loss": 0.3129, "num_input_tokens_seen": 51985344, "step": 24105 }, { "epoch": 4.424665076160763, "grad_norm": 1.1322206258773804, "learning_rate": 9.558996019735078e-06, "loss": 0.2805, "num_input_tokens_seen": 51995936, "step": 24110 }, { "epoch": 4.425582675720316, "grad_norm": 4.211614608764648, "learning_rate": 9.55866714147229e-06, "loss": 0.2076, "num_input_tokens_seen": 52005792, "step": 24115 }, { "epoch": 4.426500275279868, "grad_norm": 7.429007530212402, "learning_rate": 9.558338146286756e-06, "loss": 0.3268, "num_input_tokens_seen": 52016544, "step": 24120 }, { "epoch": 4.42741787483942, "grad_norm": 7.406566619873047, "learning_rate": 9.55800903418691e-06, "loss": 0.4653, "num_input_tokens_seen": 52027424, "step": 24125 }, { "epoch": 4.428335474398972, "grad_norm": 4.255368232727051, "learning_rate": 9.5576798051812e-06, "loss": 0.2301, "num_input_tokens_seen": 52039424, "step": 24130 }, { "epoch": 4.429253073958525, "grad_norm": 1.513807773590088, "learning_rate": 9.557350459278065e-06, "loss": 0.3876, "num_input_tokens_seen": 52049184, "step": 24135 }, { "epoch": 4.430170673518076, "grad_norm": 6.25029182434082, "learning_rate": 9.557020996485954e-06, "loss": 0.2529, "num_input_tokens_seen": 52060800, "step": 24140 }, { "epoch": 4.431088273077629, "grad_norm": 7.185837745666504, "learning_rate": 9.556691416813317e-06, "loss": 0.3224, "num_input_tokens_seen": 52071712, "step": 24145 }, { "epoch": 4.432005872637181, "grad_norm": 4.267635345458984, "learning_rate": 9.556361720268609e-06, "loss": 0.3481, "num_input_tokens_seen": 52082592, "step": 24150 }, { "epoch": 4.432923472196733, "grad_norm": 7.446473121643066, "learning_rate": 9.556031906860283e-06, "loss": 0.3028, "num_input_tokens_seen": 52093984, "step": 24155 }, { "epoch": 4.4338410717562855, "grad_norm": 3.1916282176971436, "learning_rate": 9.555701976596801e-06, "loss": 0.3397, "num_input_tokens_seen": 52105248, "step": 24160 }, { "epoch": 4.434758671315838, "grad_norm": 4.145855903625488, "learning_rate": 9.555371929486621e-06, "loss": 0.2114, "num_input_tokens_seen": 52115552, "step": 24165 }, { "epoch": 4.43567627087539, "grad_norm": 2.5394067764282227, "learning_rate": 9.555041765538215e-06, "loss": 0.3596, "num_input_tokens_seen": 52127232, "step": 24170 }, { "epoch": 4.436593870434942, "grad_norm": 4.362191200256348, "learning_rate": 9.554711484760046e-06, "loss": 0.3063, "num_input_tokens_seen": 52138592, "step": 24175 }, { "epoch": 4.437511469994495, "grad_norm": 3.2457687854766846, "learning_rate": 9.554381087160586e-06, "loss": 0.2315, "num_input_tokens_seen": 52149216, "step": 24180 }, { "epoch": 4.438429069554046, "grad_norm": 4.668839454650879, "learning_rate": 9.55405057274831e-06, "loss": 0.3203, "num_input_tokens_seen": 52159456, "step": 24185 }, { "epoch": 4.439346669113599, "grad_norm": 1.875443696975708, "learning_rate": 9.553719941531695e-06, "loss": 0.2422, "num_input_tokens_seen": 52168576, "step": 24190 }, { "epoch": 4.440264268673151, "grad_norm": 3.1501646041870117, "learning_rate": 9.553389193519221e-06, "loss": 0.1978, "num_input_tokens_seen": 52179616, "step": 24195 }, { "epoch": 4.441181868232703, "grad_norm": 4.139534950256348, "learning_rate": 9.553058328719372e-06, "loss": 0.3173, "num_input_tokens_seen": 52191200, "step": 24200 }, { "epoch": 4.442099467792255, "grad_norm": 10.308931350708008, "learning_rate": 9.552727347140634e-06, "loss": 0.2877, "num_input_tokens_seen": 52202400, "step": 24205 }, { "epoch": 4.443017067351808, "grad_norm": 2.595888137817383, "learning_rate": 9.552396248791494e-06, "loss": 0.2526, "num_input_tokens_seen": 52213472, "step": 24210 }, { "epoch": 4.4439346669113595, "grad_norm": 9.001760482788086, "learning_rate": 9.552065033680446e-06, "loss": 0.3201, "num_input_tokens_seen": 52224512, "step": 24215 }, { "epoch": 4.444852266470912, "grad_norm": 6.432249546051025, "learning_rate": 9.551733701815985e-06, "loss": 0.4364, "num_input_tokens_seen": 52234912, "step": 24220 }, { "epoch": 4.4457698660304645, "grad_norm": 2.049123764038086, "learning_rate": 9.55140225320661e-06, "loss": 0.2412, "num_input_tokens_seen": 52245632, "step": 24225 }, { "epoch": 4.446687465590016, "grad_norm": 14.06724739074707, "learning_rate": 9.55107068786082e-06, "loss": 0.3668, "num_input_tokens_seen": 52255776, "step": 24230 }, { "epoch": 4.447605065149569, "grad_norm": 29.153919219970703, "learning_rate": 9.550739005787122e-06, "loss": 0.2849, "num_input_tokens_seen": 52267872, "step": 24235 }, { "epoch": 4.448522664709121, "grad_norm": 11.471261978149414, "learning_rate": 9.55040720699402e-06, "loss": 0.3336, "num_input_tokens_seen": 52279424, "step": 24240 }, { "epoch": 4.449440264268673, "grad_norm": 12.035400390625, "learning_rate": 9.550075291490026e-06, "loss": 0.4459, "num_input_tokens_seen": 52290592, "step": 24245 }, { "epoch": 4.450357863828225, "grad_norm": 2.37558913230896, "learning_rate": 9.549743259283653e-06, "loss": 0.1893, "num_input_tokens_seen": 52301952, "step": 24250 }, { "epoch": 4.451275463387778, "grad_norm": 7.36936616897583, "learning_rate": 9.549411110383418e-06, "loss": 0.1921, "num_input_tokens_seen": 52313344, "step": 24255 }, { "epoch": 4.452193062947329, "grad_norm": 8.04607105255127, "learning_rate": 9.549078844797837e-06, "loss": 0.2947, "num_input_tokens_seen": 52323168, "step": 24260 }, { "epoch": 4.453110662506882, "grad_norm": 1.9515587091445923, "learning_rate": 9.548746462535434e-06, "loss": 0.2471, "num_input_tokens_seen": 52333472, "step": 24265 }, { "epoch": 4.454028262066434, "grad_norm": 10.141246795654297, "learning_rate": 9.548413963604736e-06, "loss": 0.2194, "num_input_tokens_seen": 52344704, "step": 24270 }, { "epoch": 4.454945861625986, "grad_norm": 6.884393215179443, "learning_rate": 9.548081348014268e-06, "loss": 0.1937, "num_input_tokens_seen": 52354528, "step": 24275 }, { "epoch": 4.4558634611855386, "grad_norm": 4.1444830894470215, "learning_rate": 9.547748615772563e-06, "loss": 0.2519, "num_input_tokens_seen": 52365056, "step": 24280 }, { "epoch": 4.456781060745091, "grad_norm": 3.7548060417175293, "learning_rate": 9.547415766888152e-06, "loss": 0.1888, "num_input_tokens_seen": 52375040, "step": 24285 }, { "epoch": 4.457698660304643, "grad_norm": 5.308456897735596, "learning_rate": 9.547082801369578e-06, "loss": 0.2518, "num_input_tokens_seen": 52385280, "step": 24290 }, { "epoch": 4.458616259864195, "grad_norm": 24.08344268798828, "learning_rate": 9.546749719225376e-06, "loss": 0.5028, "num_input_tokens_seen": 52395488, "step": 24295 }, { "epoch": 4.459533859423748, "grad_norm": 10.243464469909668, "learning_rate": 9.54641652046409e-06, "loss": 0.3606, "num_input_tokens_seen": 52406592, "step": 24300 }, { "epoch": 4.460451458983299, "grad_norm": 4.999638080596924, "learning_rate": 9.546083205094268e-06, "loss": 0.274, "num_input_tokens_seen": 52416960, "step": 24305 }, { "epoch": 4.461369058542852, "grad_norm": 3.3184189796447754, "learning_rate": 9.545749773124455e-06, "loss": 0.3067, "num_input_tokens_seen": 52428096, "step": 24310 }, { "epoch": 4.462286658102404, "grad_norm": 4.816166877746582, "learning_rate": 9.545416224563207e-06, "loss": 0.2722, "num_input_tokens_seen": 52439168, "step": 24315 }, { "epoch": 4.463204257661956, "grad_norm": 4.186143398284912, "learning_rate": 9.545082559419078e-06, "loss": 0.3518, "num_input_tokens_seen": 52450432, "step": 24320 }, { "epoch": 4.4641218572215084, "grad_norm": 8.617265701293945, "learning_rate": 9.544748777700626e-06, "loss": 0.4121, "num_input_tokens_seen": 52461376, "step": 24325 }, { "epoch": 4.465039456781061, "grad_norm": 9.004462242126465, "learning_rate": 9.54441487941641e-06, "loss": 0.3365, "num_input_tokens_seen": 52471648, "step": 24330 }, { "epoch": 4.465957056340613, "grad_norm": 2.0507235527038574, "learning_rate": 9.544080864574995e-06, "loss": 0.3954, "num_input_tokens_seen": 52483168, "step": 24335 }, { "epoch": 4.466874655900165, "grad_norm": 4.2549309730529785, "learning_rate": 9.543746733184952e-06, "loss": 0.322, "num_input_tokens_seen": 52494208, "step": 24340 }, { "epoch": 4.467792255459718, "grad_norm": 2.837817430496216, "learning_rate": 9.543412485254845e-06, "loss": 0.3535, "num_input_tokens_seen": 52504768, "step": 24345 }, { "epoch": 4.468709855019269, "grad_norm": 10.477034568786621, "learning_rate": 9.54307812079325e-06, "loss": 0.3522, "num_input_tokens_seen": 52516000, "step": 24350 }, { "epoch": 4.469627454578822, "grad_norm": 8.358033180236816, "learning_rate": 9.542743639808743e-06, "loss": 0.4372, "num_input_tokens_seen": 52527040, "step": 24355 }, { "epoch": 4.470545054138374, "grad_norm": 5.045813083648682, "learning_rate": 9.5424090423099e-06, "loss": 0.3768, "num_input_tokens_seen": 52537600, "step": 24360 }, { "epoch": 4.471462653697926, "grad_norm": 4.462034702301025, "learning_rate": 9.542074328305307e-06, "loss": 0.42, "num_input_tokens_seen": 52548960, "step": 24365 }, { "epoch": 4.472380253257478, "grad_norm": 2.0089375972747803, "learning_rate": 9.541739497803546e-06, "loss": 0.3297, "num_input_tokens_seen": 52560576, "step": 24370 }, { "epoch": 4.473297852817031, "grad_norm": 3.4299895763397217, "learning_rate": 9.541404550813207e-06, "loss": 0.2645, "num_input_tokens_seen": 52571776, "step": 24375 }, { "epoch": 4.4742154523765825, "grad_norm": 5.019701957702637, "learning_rate": 9.54106948734288e-06, "loss": 0.3554, "num_input_tokens_seen": 52582368, "step": 24380 }, { "epoch": 4.475133051936135, "grad_norm": 4.618330478668213, "learning_rate": 9.540734307401158e-06, "loss": 0.3024, "num_input_tokens_seen": 52592672, "step": 24385 }, { "epoch": 4.4760506514956875, "grad_norm": 2.2389416694641113, "learning_rate": 9.540399010996639e-06, "loss": 0.2182, "num_input_tokens_seen": 52603328, "step": 24390 }, { "epoch": 4.476968251055239, "grad_norm": 5.443319320678711, "learning_rate": 9.54006359813792e-06, "loss": 0.369, "num_input_tokens_seen": 52614528, "step": 24395 }, { "epoch": 4.477885850614792, "grad_norm": 4.137149810791016, "learning_rate": 9.539728068833608e-06, "loss": 0.2564, "num_input_tokens_seen": 52625504, "step": 24400 }, { "epoch": 4.478803450174344, "grad_norm": 2.4053709506988525, "learning_rate": 9.539392423092309e-06, "loss": 0.3482, "num_input_tokens_seen": 52636960, "step": 24405 }, { "epoch": 4.479721049733896, "grad_norm": 4.76399564743042, "learning_rate": 9.539056660922627e-06, "loss": 0.2758, "num_input_tokens_seen": 52648384, "step": 24410 }, { "epoch": 4.480638649293448, "grad_norm": 2.8814167976379395, "learning_rate": 9.538720782333178e-06, "loss": 0.2294, "num_input_tokens_seen": 52659552, "step": 24415 }, { "epoch": 4.481556248853001, "grad_norm": 6.9121623039245605, "learning_rate": 9.538384787332572e-06, "loss": 0.3067, "num_input_tokens_seen": 52670592, "step": 24420 }, { "epoch": 4.482473848412552, "grad_norm": 2.287750005722046, "learning_rate": 9.538048675929434e-06, "loss": 0.2506, "num_input_tokens_seen": 52682400, "step": 24425 }, { "epoch": 4.483391447972105, "grad_norm": 3.045032024383545, "learning_rate": 9.53771244813238e-06, "loss": 0.1587, "num_input_tokens_seen": 52692960, "step": 24430 }, { "epoch": 4.484309047531657, "grad_norm": 1.970085620880127, "learning_rate": 9.537376103950034e-06, "loss": 0.3038, "num_input_tokens_seen": 52703584, "step": 24435 }, { "epoch": 4.485226647091209, "grad_norm": 1.2254804372787476, "learning_rate": 9.537039643391025e-06, "loss": 0.4249, "num_input_tokens_seen": 52714976, "step": 24440 }, { "epoch": 4.4861442466507615, "grad_norm": 2.4496312141418457, "learning_rate": 9.536703066463976e-06, "loss": 0.3466, "num_input_tokens_seen": 52725632, "step": 24445 }, { "epoch": 4.487061846210314, "grad_norm": 6.416898727416992, "learning_rate": 9.536366373177529e-06, "loss": 0.2622, "num_input_tokens_seen": 52735520, "step": 24450 }, { "epoch": 4.487979445769866, "grad_norm": 1.0728782415390015, "learning_rate": 9.536029563540314e-06, "loss": 0.2404, "num_input_tokens_seen": 52745440, "step": 24455 }, { "epoch": 4.488897045329418, "grad_norm": 2.457221508026123, "learning_rate": 9.535692637560972e-06, "loss": 0.329, "num_input_tokens_seen": 52756704, "step": 24460 }, { "epoch": 4.489814644888971, "grad_norm": 5.4529194831848145, "learning_rate": 9.535355595248142e-06, "loss": 0.3716, "num_input_tokens_seen": 52767488, "step": 24465 }, { "epoch": 4.490732244448522, "grad_norm": 2.6043877601623535, "learning_rate": 9.53501843661047e-06, "loss": 0.2624, "num_input_tokens_seen": 52778720, "step": 24470 }, { "epoch": 4.491649844008075, "grad_norm": 1.7470821142196655, "learning_rate": 9.534681161656606e-06, "loss": 0.3323, "num_input_tokens_seen": 52789376, "step": 24475 }, { "epoch": 4.492567443567627, "grad_norm": 3.931748390197754, "learning_rate": 9.534343770395196e-06, "loss": 0.3041, "num_input_tokens_seen": 52800672, "step": 24480 }, { "epoch": 4.493485043127179, "grad_norm": 2.1827175617218018, "learning_rate": 9.534006262834896e-06, "loss": 0.306, "num_input_tokens_seen": 52811264, "step": 24485 }, { "epoch": 4.494402642686731, "grad_norm": 2.363197088241577, "learning_rate": 9.533668638984363e-06, "loss": 0.2136, "num_input_tokens_seen": 52821824, "step": 24490 }, { "epoch": 4.495320242246284, "grad_norm": 1.0752284526824951, "learning_rate": 9.533330898852256e-06, "loss": 0.261, "num_input_tokens_seen": 52832064, "step": 24495 }, { "epoch": 4.4962378418058355, "grad_norm": 1.3060901165008545, "learning_rate": 9.532993042447238e-06, "loss": 0.2578, "num_input_tokens_seen": 52843744, "step": 24500 }, { "epoch": 4.497155441365388, "grad_norm": 3.7142529487609863, "learning_rate": 9.532655069777972e-06, "loss": 0.3088, "num_input_tokens_seen": 52854720, "step": 24505 }, { "epoch": 4.4980730409249405, "grad_norm": 12.119291305541992, "learning_rate": 9.532316980853132e-06, "loss": 0.4206, "num_input_tokens_seen": 52866112, "step": 24510 }, { "epoch": 4.498990640484492, "grad_norm": 6.098724842071533, "learning_rate": 9.531978775681383e-06, "loss": 0.21, "num_input_tokens_seen": 52876032, "step": 24515 }, { "epoch": 4.499908240044045, "grad_norm": 4.603750228881836, "learning_rate": 9.531640454271403e-06, "loss": 0.2497, "num_input_tokens_seen": 52886464, "step": 24520 }, { "epoch": 4.500825839603597, "grad_norm": 9.425682067871094, "learning_rate": 9.53130201663187e-06, "loss": 0.2204, "num_input_tokens_seen": 52897760, "step": 24525 }, { "epoch": 4.50174343916315, "grad_norm": 2.7964746952056885, "learning_rate": 9.530963462771461e-06, "loss": 0.3559, "num_input_tokens_seen": 52907936, "step": 24530 }, { "epoch": 4.502661038722701, "grad_norm": 2.623236656188965, "learning_rate": 9.530624792698862e-06, "loss": 0.3398, "num_input_tokens_seen": 52919264, "step": 24535 }, { "epoch": 4.503578638282254, "grad_norm": 4.931210517883301, "learning_rate": 9.53028600642276e-06, "loss": 0.3874, "num_input_tokens_seen": 52930304, "step": 24540 }, { "epoch": 4.504496237841806, "grad_norm": 2.3489022254943848, "learning_rate": 9.529947103951843e-06, "loss": 0.2085, "num_input_tokens_seen": 52941440, "step": 24545 }, { "epoch": 4.505413837401358, "grad_norm": 6.82802677154541, "learning_rate": 9.529608085294803e-06, "loss": 0.4489, "num_input_tokens_seen": 52953216, "step": 24550 }, { "epoch": 4.50633143696091, "grad_norm": 3.787059783935547, "learning_rate": 9.529268950460335e-06, "loss": 0.2976, "num_input_tokens_seen": 52965216, "step": 24555 }, { "epoch": 4.507249036520463, "grad_norm": 1.2422714233398438, "learning_rate": 9.528929699457138e-06, "loss": 0.2055, "num_input_tokens_seen": 52974912, "step": 24560 }, { "epoch": 4.5081666360800146, "grad_norm": 1.5131555795669556, "learning_rate": 9.528590332293917e-06, "loss": 0.2431, "num_input_tokens_seen": 52986112, "step": 24565 }, { "epoch": 4.509084235639567, "grad_norm": 8.619180679321289, "learning_rate": 9.52825084897937e-06, "loss": 0.2464, "num_input_tokens_seen": 52996544, "step": 24570 }, { "epoch": 4.51000183519912, "grad_norm": 4.961188793182373, "learning_rate": 9.527911249522207e-06, "loss": 0.2, "num_input_tokens_seen": 53006976, "step": 24575 }, { "epoch": 4.510919434758671, "grad_norm": 5.038387775421143, "learning_rate": 9.527571533931137e-06, "loss": 0.4658, "num_input_tokens_seen": 53018784, "step": 24580 }, { "epoch": 4.511837034318224, "grad_norm": 4.179665565490723, "learning_rate": 9.527231702214876e-06, "loss": 0.2377, "num_input_tokens_seen": 53030624, "step": 24585 }, { "epoch": 4.512754633877776, "grad_norm": 1.2174149751663208, "learning_rate": 9.52689175438214e-06, "loss": 0.3444, "num_input_tokens_seen": 53040576, "step": 24590 }, { "epoch": 4.513672233437328, "grad_norm": 2.4377951622009277, "learning_rate": 9.526551690441643e-06, "loss": 0.2596, "num_input_tokens_seen": 53052672, "step": 24595 }, { "epoch": 4.51458983299688, "grad_norm": 6.800613880157471, "learning_rate": 9.526211510402112e-06, "loss": 0.24, "num_input_tokens_seen": 53064160, "step": 24600 }, { "epoch": 4.515507432556433, "grad_norm": 3.958049774169922, "learning_rate": 9.525871214272272e-06, "loss": 0.3476, "num_input_tokens_seen": 53075040, "step": 24605 }, { "epoch": 4.5164250321159845, "grad_norm": 3.389866352081299, "learning_rate": 9.525530802060847e-06, "loss": 0.2394, "num_input_tokens_seen": 53085632, "step": 24610 }, { "epoch": 4.517342631675537, "grad_norm": 3.664492607116699, "learning_rate": 9.525190273776574e-06, "loss": 0.207, "num_input_tokens_seen": 53095296, "step": 24615 }, { "epoch": 4.5182602312350895, "grad_norm": 8.703035354614258, "learning_rate": 9.52484962942818e-06, "loss": 0.3066, "num_input_tokens_seen": 53104704, "step": 24620 }, { "epoch": 4.519177830794641, "grad_norm": 4.155395984649658, "learning_rate": 9.52450886902441e-06, "loss": 0.1106, "num_input_tokens_seen": 53115712, "step": 24625 }, { "epoch": 4.520095430354194, "grad_norm": 23.604446411132812, "learning_rate": 9.524167992573998e-06, "loss": 0.452, "num_input_tokens_seen": 53126592, "step": 24630 }, { "epoch": 4.521013029913746, "grad_norm": 8.057770729064941, "learning_rate": 9.52382700008569e-06, "loss": 0.3861, "num_input_tokens_seen": 53138240, "step": 24635 }, { "epoch": 4.521930629473298, "grad_norm": 4.4662766456604, "learning_rate": 9.523485891568229e-06, "loss": 0.2959, "num_input_tokens_seen": 53149344, "step": 24640 }, { "epoch": 4.52284822903285, "grad_norm": 8.022137641906738, "learning_rate": 9.523144667030366e-06, "loss": 0.1787, "num_input_tokens_seen": 53160640, "step": 24645 }, { "epoch": 4.523765828592403, "grad_norm": 11.43196964263916, "learning_rate": 9.522803326480853e-06, "loss": 0.2877, "num_input_tokens_seen": 53170976, "step": 24650 }, { "epoch": 4.524683428151954, "grad_norm": 2.7541348934173584, "learning_rate": 9.522461869928445e-06, "loss": 0.2315, "num_input_tokens_seen": 53181824, "step": 24655 }, { "epoch": 4.525601027711507, "grad_norm": 7.088068008422852, "learning_rate": 9.522120297381898e-06, "loss": 0.3028, "num_input_tokens_seen": 53193120, "step": 24660 }, { "epoch": 4.526518627271059, "grad_norm": 2.9048516750335693, "learning_rate": 9.521778608849973e-06, "loss": 0.267, "num_input_tokens_seen": 53203840, "step": 24665 }, { "epoch": 4.527436226830611, "grad_norm": 8.616168022155762, "learning_rate": 9.521436804341438e-06, "loss": 0.2155, "num_input_tokens_seen": 53214976, "step": 24670 }, { "epoch": 4.5283538263901635, "grad_norm": 2.089127540588379, "learning_rate": 9.521094883865055e-06, "loss": 0.2575, "num_input_tokens_seen": 53226336, "step": 24675 }, { "epoch": 4.529271425949716, "grad_norm": 4.397508144378662, "learning_rate": 9.520752847429595e-06, "loss": 0.3762, "num_input_tokens_seen": 53238432, "step": 24680 }, { "epoch": 4.530189025509268, "grad_norm": 4.649923324584961, "learning_rate": 9.520410695043832e-06, "loss": 0.2859, "num_input_tokens_seen": 53250048, "step": 24685 }, { "epoch": 4.53110662506882, "grad_norm": 2.3718490600585938, "learning_rate": 9.52006842671654e-06, "loss": 0.39, "num_input_tokens_seen": 53261184, "step": 24690 }, { "epoch": 4.532024224628373, "grad_norm": 3.156299352645874, "learning_rate": 9.519726042456499e-06, "loss": 0.196, "num_input_tokens_seen": 53272064, "step": 24695 }, { "epoch": 4.532941824187924, "grad_norm": 3.033487558364868, "learning_rate": 9.519383542272488e-06, "loss": 0.1935, "num_input_tokens_seen": 53283168, "step": 24700 }, { "epoch": 4.533859423747477, "grad_norm": 4.6836748123168945, "learning_rate": 9.519040926173295e-06, "loss": 0.3358, "num_input_tokens_seen": 53292384, "step": 24705 }, { "epoch": 4.534777023307029, "grad_norm": 3.7053635120391846, "learning_rate": 9.518698194167706e-06, "loss": 0.2158, "num_input_tokens_seen": 53304000, "step": 24710 }, { "epoch": 4.535694622866581, "grad_norm": 6.240157127380371, "learning_rate": 9.518355346264511e-06, "loss": 0.2312, "num_input_tokens_seen": 53314560, "step": 24715 }, { "epoch": 4.536612222426133, "grad_norm": 13.690195083618164, "learning_rate": 9.518012382472505e-06, "loss": 0.3506, "num_input_tokens_seen": 53325088, "step": 24720 }, { "epoch": 4.537529821985686, "grad_norm": 13.874639511108398, "learning_rate": 9.517669302800483e-06, "loss": 0.271, "num_input_tokens_seen": 53335488, "step": 24725 }, { "epoch": 4.5384474215452375, "grad_norm": 15.838520050048828, "learning_rate": 9.517326107257245e-06, "loss": 0.2165, "num_input_tokens_seen": 53345984, "step": 24730 }, { "epoch": 4.53936502110479, "grad_norm": 5.316636085510254, "learning_rate": 9.516982795851594e-06, "loss": 0.2593, "num_input_tokens_seen": 53355296, "step": 24735 }, { "epoch": 4.5402826206643425, "grad_norm": 1.8705307245254517, "learning_rate": 9.516639368592335e-06, "loss": 0.1484, "num_input_tokens_seen": 53366208, "step": 24740 }, { "epoch": 4.541200220223894, "grad_norm": 2.9062561988830566, "learning_rate": 9.516295825488278e-06, "loss": 0.1449, "num_input_tokens_seen": 53377088, "step": 24745 }, { "epoch": 4.542117819783447, "grad_norm": 9.305188179016113, "learning_rate": 9.51595216654823e-06, "loss": 0.3145, "num_input_tokens_seen": 53387168, "step": 24750 }, { "epoch": 4.543035419342999, "grad_norm": 8.0132417678833, "learning_rate": 9.51560839178101e-06, "loss": 0.2633, "num_input_tokens_seen": 53398880, "step": 24755 }, { "epoch": 4.543953018902551, "grad_norm": 11.644488334655762, "learning_rate": 9.515264501195431e-06, "loss": 0.3917, "num_input_tokens_seen": 53409568, "step": 24760 }, { "epoch": 4.544870618462103, "grad_norm": 5.859004497528076, "learning_rate": 9.514920494800318e-06, "loss": 0.2308, "num_input_tokens_seen": 53420480, "step": 24765 }, { "epoch": 4.545788218021656, "grad_norm": 5.200327396392822, "learning_rate": 9.51457637260449e-06, "loss": 0.3961, "num_input_tokens_seen": 53431776, "step": 24770 }, { "epoch": 4.546705817581207, "grad_norm": 4.497376441955566, "learning_rate": 9.514232134616777e-06, "loss": 0.247, "num_input_tokens_seen": 53442944, "step": 24775 }, { "epoch": 4.54762341714076, "grad_norm": 12.57028865814209, "learning_rate": 9.513887780846004e-06, "loss": 0.276, "num_input_tokens_seen": 53453568, "step": 24780 }, { "epoch": 4.548541016700312, "grad_norm": 4.040666580200195, "learning_rate": 9.513543311301007e-06, "loss": 0.2323, "num_input_tokens_seen": 53464544, "step": 24785 }, { "epoch": 4.549458616259864, "grad_norm": 6.096155166625977, "learning_rate": 9.513198725990618e-06, "loss": 0.3521, "num_input_tokens_seen": 53474496, "step": 24790 }, { "epoch": 4.5503762158194165, "grad_norm": 5.05198860168457, "learning_rate": 9.512854024923678e-06, "loss": 0.2214, "num_input_tokens_seen": 53482368, "step": 24795 }, { "epoch": 4.551293815378969, "grad_norm": 3.040432929992676, "learning_rate": 9.512509208109026e-06, "loss": 0.4008, "num_input_tokens_seen": 53493600, "step": 24800 }, { "epoch": 4.552211414938521, "grad_norm": 6.9594621658325195, "learning_rate": 9.512164275555507e-06, "loss": 0.1973, "num_input_tokens_seen": 53504352, "step": 24805 }, { "epoch": 4.553129014498073, "grad_norm": 3.7802600860595703, "learning_rate": 9.511819227271965e-06, "loss": 0.3289, "num_input_tokens_seen": 53516096, "step": 24810 }, { "epoch": 4.554046614057626, "grad_norm": 7.7883782386779785, "learning_rate": 9.511474063267255e-06, "loss": 0.1961, "num_input_tokens_seen": 53525408, "step": 24815 }, { "epoch": 4.554964213617177, "grad_norm": 8.440230369567871, "learning_rate": 9.511128783550228e-06, "loss": 0.2969, "num_input_tokens_seen": 53536096, "step": 24820 }, { "epoch": 4.55588181317673, "grad_norm": 2.807445764541626, "learning_rate": 9.510783388129737e-06, "loss": 0.3847, "num_input_tokens_seen": 53546176, "step": 24825 }, { "epoch": 4.556799412736282, "grad_norm": 2.0943186283111572, "learning_rate": 9.510437877014645e-06, "loss": 0.3315, "num_input_tokens_seen": 53557600, "step": 24830 }, { "epoch": 4.557717012295834, "grad_norm": 8.875231742858887, "learning_rate": 9.510092250213811e-06, "loss": 0.4223, "num_input_tokens_seen": 53568288, "step": 24835 }, { "epoch": 4.558634611855386, "grad_norm": 2.28381085395813, "learning_rate": 9.509746507736101e-06, "loss": 0.3014, "num_input_tokens_seen": 53578912, "step": 24840 }, { "epoch": 4.559552211414939, "grad_norm": 4.739549160003662, "learning_rate": 9.509400649590383e-06, "loss": 0.1888, "num_input_tokens_seen": 53590560, "step": 24845 }, { "epoch": 4.560469810974491, "grad_norm": 5.244546413421631, "learning_rate": 9.509054675785528e-06, "loss": 0.2045, "num_input_tokens_seen": 53601536, "step": 24850 }, { "epoch": 4.561387410534043, "grad_norm": 5.214475631713867, "learning_rate": 9.508708586330407e-06, "loss": 0.2486, "num_input_tokens_seen": 53612352, "step": 24855 }, { "epoch": 4.562305010093596, "grad_norm": 21.193429946899414, "learning_rate": 9.508362381233898e-06, "loss": 0.3548, "num_input_tokens_seen": 53622624, "step": 24860 }, { "epoch": 4.563222609653147, "grad_norm": 1.0685850381851196, "learning_rate": 9.508016060504883e-06, "loss": 0.1699, "num_input_tokens_seen": 53633856, "step": 24865 }, { "epoch": 4.5641402092127, "grad_norm": 3.0762782096862793, "learning_rate": 9.507669624152242e-06, "loss": 0.2574, "num_input_tokens_seen": 53644448, "step": 24870 }, { "epoch": 4.565057808772252, "grad_norm": 1.158963918685913, "learning_rate": 9.507323072184864e-06, "loss": 0.2051, "num_input_tokens_seen": 53655680, "step": 24875 }, { "epoch": 4.565975408331804, "grad_norm": 7.780467510223389, "learning_rate": 9.506976404611632e-06, "loss": 0.4091, "num_input_tokens_seen": 53667264, "step": 24880 }, { "epoch": 4.566893007891356, "grad_norm": 4.023841857910156, "learning_rate": 9.506629621441442e-06, "loss": 0.5388, "num_input_tokens_seen": 53676288, "step": 24885 }, { "epoch": 4.567810607450909, "grad_norm": 12.363677024841309, "learning_rate": 9.506282722683186e-06, "loss": 0.2736, "num_input_tokens_seen": 53686080, "step": 24890 }, { "epoch": 4.5687282070104605, "grad_norm": 6.68340539932251, "learning_rate": 9.505935708345762e-06, "loss": 0.4163, "num_input_tokens_seen": 53696928, "step": 24895 }, { "epoch": 4.569645806570013, "grad_norm": 7.351390838623047, "learning_rate": 9.505588578438073e-06, "loss": 0.3049, "num_input_tokens_seen": 53707776, "step": 24900 }, { "epoch": 4.5705634061295655, "grad_norm": 4.5614542961120605, "learning_rate": 9.505241332969016e-06, "loss": 0.3442, "num_input_tokens_seen": 53719552, "step": 24905 }, { "epoch": 4.571481005689117, "grad_norm": 14.313932418823242, "learning_rate": 9.504893971947503e-06, "loss": 0.2385, "num_input_tokens_seen": 53729376, "step": 24910 }, { "epoch": 4.57239860524867, "grad_norm": 2.3917949199676514, "learning_rate": 9.504546495382443e-06, "loss": 0.4524, "num_input_tokens_seen": 53740416, "step": 24915 }, { "epoch": 4.573316204808222, "grad_norm": 4.703078746795654, "learning_rate": 9.504198903282746e-06, "loss": 0.4219, "num_input_tokens_seen": 53751520, "step": 24920 }, { "epoch": 4.574233804367774, "grad_norm": 3.2058727741241455, "learning_rate": 9.503851195657328e-06, "loss": 0.3614, "num_input_tokens_seen": 53761664, "step": 24925 }, { "epoch": 4.575151403927326, "grad_norm": 12.183920860290527, "learning_rate": 9.503503372515107e-06, "loss": 0.3967, "num_input_tokens_seen": 53773376, "step": 24930 }, { "epoch": 4.576069003486879, "grad_norm": 1.4953547716140747, "learning_rate": 9.503155433865003e-06, "loss": 0.2372, "num_input_tokens_seen": 53784192, "step": 24935 }, { "epoch": 4.57698660304643, "grad_norm": 11.914592742919922, "learning_rate": 9.502807379715943e-06, "loss": 0.2154, "num_input_tokens_seen": 53794272, "step": 24940 }, { "epoch": 4.577904202605983, "grad_norm": 2.7788634300231934, "learning_rate": 9.502459210076853e-06, "loss": 0.364, "num_input_tokens_seen": 53803808, "step": 24945 }, { "epoch": 4.578821802165535, "grad_norm": 4.565908908843994, "learning_rate": 9.50211092495666e-06, "loss": 0.2903, "num_input_tokens_seen": 53813472, "step": 24950 }, { "epoch": 4.579739401725087, "grad_norm": 5.67878532409668, "learning_rate": 9.501762524364301e-06, "loss": 0.2855, "num_input_tokens_seen": 53825728, "step": 24955 }, { "epoch": 4.5806570012846395, "grad_norm": 9.374978065490723, "learning_rate": 9.50141400830871e-06, "loss": 0.3117, "num_input_tokens_seen": 53836288, "step": 24960 }, { "epoch": 4.581574600844192, "grad_norm": 1.5273635387420654, "learning_rate": 9.501065376798828e-06, "loss": 0.2975, "num_input_tokens_seen": 53846688, "step": 24965 }, { "epoch": 4.582492200403744, "grad_norm": 3.859362840652466, "learning_rate": 9.500716629843594e-06, "loss": 0.228, "num_input_tokens_seen": 53857152, "step": 24970 }, { "epoch": 4.583409799963296, "grad_norm": 12.520477294921875, "learning_rate": 9.500367767451952e-06, "loss": 0.4104, "num_input_tokens_seen": 53867808, "step": 24975 }, { "epoch": 4.584327399522849, "grad_norm": 2.971897840499878, "learning_rate": 9.500018789632855e-06, "loss": 0.3422, "num_input_tokens_seen": 53878240, "step": 24980 }, { "epoch": 4.5852449990824, "grad_norm": 1.073720097541809, "learning_rate": 9.499669696395248e-06, "loss": 0.2073, "num_input_tokens_seen": 53887808, "step": 24985 }, { "epoch": 4.586162598641953, "grad_norm": 2.0240609645843506, "learning_rate": 9.499320487748087e-06, "loss": 0.3026, "num_input_tokens_seen": 53897120, "step": 24990 }, { "epoch": 4.587080198201505, "grad_norm": 2.601841449737549, "learning_rate": 9.49897116370033e-06, "loss": 0.2908, "num_input_tokens_seen": 53909760, "step": 24995 }, { "epoch": 4.587997797761057, "grad_norm": 2.293834686279297, "learning_rate": 9.498621724260934e-06, "loss": 0.241, "num_input_tokens_seen": 53921088, "step": 25000 }, { "epoch": 4.588915397320609, "grad_norm": 3.7866291999816895, "learning_rate": 9.498272169438865e-06, "loss": 0.354, "num_input_tokens_seen": 53931360, "step": 25005 }, { "epoch": 4.589832996880162, "grad_norm": 7.92766809463501, "learning_rate": 9.497922499243085e-06, "loss": 0.3893, "num_input_tokens_seen": 53943648, "step": 25010 }, { "epoch": 4.5907505964397135, "grad_norm": 14.323405265808105, "learning_rate": 9.497572713682565e-06, "loss": 0.306, "num_input_tokens_seen": 53954432, "step": 25015 }, { "epoch": 4.591668195999266, "grad_norm": 4.305089473724365, "learning_rate": 9.497222812766276e-06, "loss": 0.2723, "num_input_tokens_seen": 53965344, "step": 25020 }, { "epoch": 4.5925857955588185, "grad_norm": 9.671747207641602, "learning_rate": 9.49687279650319e-06, "loss": 0.3168, "num_input_tokens_seen": 53976640, "step": 25025 }, { "epoch": 4.59350339511837, "grad_norm": 3.2966086864471436, "learning_rate": 9.496522664902288e-06, "loss": 0.3433, "num_input_tokens_seen": 53987744, "step": 25030 }, { "epoch": 4.594420994677923, "grad_norm": 6.549583911895752, "learning_rate": 9.496172417972547e-06, "loss": 0.2554, "num_input_tokens_seen": 53998688, "step": 25035 }, { "epoch": 4.595338594237475, "grad_norm": 2.192786693572998, "learning_rate": 9.495822055722953e-06, "loss": 0.3128, "num_input_tokens_seen": 54010464, "step": 25040 }, { "epoch": 4.596256193797027, "grad_norm": 2.799490213394165, "learning_rate": 9.495471578162492e-06, "loss": 0.3189, "num_input_tokens_seen": 54020864, "step": 25045 }, { "epoch": 4.597173793356579, "grad_norm": 2.069364547729492, "learning_rate": 9.495120985300152e-06, "loss": 0.2118, "num_input_tokens_seen": 54031104, "step": 25050 }, { "epoch": 4.598091392916132, "grad_norm": 6.846571922302246, "learning_rate": 9.494770277144925e-06, "loss": 0.3115, "num_input_tokens_seen": 54042368, "step": 25055 }, { "epoch": 4.599008992475683, "grad_norm": 2.4616646766662598, "learning_rate": 9.494419453705806e-06, "loss": 0.2839, "num_input_tokens_seen": 54052384, "step": 25060 }, { "epoch": 4.599926592035236, "grad_norm": 3.5598199367523193, "learning_rate": 9.494068514991794e-06, "loss": 0.3987, "num_input_tokens_seen": 54063008, "step": 25065 }, { "epoch": 4.600844191594788, "grad_norm": 2.1234843730926514, "learning_rate": 9.493717461011891e-06, "loss": 0.3242, "num_input_tokens_seen": 54074176, "step": 25070 }, { "epoch": 4.60176179115434, "grad_norm": 5.448391437530518, "learning_rate": 9.493366291775098e-06, "loss": 0.425, "num_input_tokens_seen": 54085760, "step": 25075 }, { "epoch": 4.6026793907138925, "grad_norm": 2.5225603580474854, "learning_rate": 9.493015007290424e-06, "loss": 0.262, "num_input_tokens_seen": 54097184, "step": 25080 }, { "epoch": 4.603596990273445, "grad_norm": 1.5047634840011597, "learning_rate": 9.49266360756688e-06, "loss": 0.2734, "num_input_tokens_seen": 54107616, "step": 25085 }, { "epoch": 4.604514589832997, "grad_norm": 3.344851493835449, "learning_rate": 9.492312092613476e-06, "loss": 0.2231, "num_input_tokens_seen": 54118080, "step": 25090 }, { "epoch": 4.605432189392549, "grad_norm": 5.564931392669678, "learning_rate": 9.49196046243923e-06, "loss": 0.3012, "num_input_tokens_seen": 54128704, "step": 25095 }, { "epoch": 4.606349788952102, "grad_norm": 17.752635955810547, "learning_rate": 9.49160871705316e-06, "loss": 0.18, "num_input_tokens_seen": 54140128, "step": 25100 }, { "epoch": 4.607267388511653, "grad_norm": 5.873232841491699, "learning_rate": 9.491256856464288e-06, "loss": 0.2762, "num_input_tokens_seen": 54150560, "step": 25105 }, { "epoch": 4.608184988071206, "grad_norm": 7.797729015350342, "learning_rate": 9.490904880681638e-06, "loss": 0.3062, "num_input_tokens_seen": 54161600, "step": 25110 }, { "epoch": 4.609102587630758, "grad_norm": 3.3986306190490723, "learning_rate": 9.490552789714238e-06, "loss": 0.1886, "num_input_tokens_seen": 54172832, "step": 25115 }, { "epoch": 4.61002018719031, "grad_norm": 3.6160151958465576, "learning_rate": 9.490200583571119e-06, "loss": 0.2914, "num_input_tokens_seen": 54182912, "step": 25120 }, { "epoch": 4.610937786749862, "grad_norm": 3.93038272857666, "learning_rate": 9.489848262261314e-06, "loss": 0.2932, "num_input_tokens_seen": 54194240, "step": 25125 }, { "epoch": 4.611855386309415, "grad_norm": 1.1068669557571411, "learning_rate": 9.48949582579386e-06, "loss": 0.1814, "num_input_tokens_seen": 54204800, "step": 25130 }, { "epoch": 4.612772985868967, "grad_norm": 4.16361665725708, "learning_rate": 9.489143274177797e-06, "loss": 0.2253, "num_input_tokens_seen": 54215808, "step": 25135 }, { "epoch": 4.613690585428519, "grad_norm": 4.80509090423584, "learning_rate": 9.488790607422165e-06, "loss": 0.3044, "num_input_tokens_seen": 54226240, "step": 25140 }, { "epoch": 4.614608184988072, "grad_norm": 2.7560203075408936, "learning_rate": 9.488437825536012e-06, "loss": 0.2295, "num_input_tokens_seen": 54235936, "step": 25145 }, { "epoch": 4.615525784547623, "grad_norm": 3.2847156524658203, "learning_rate": 9.488084928528385e-06, "loss": 0.3404, "num_input_tokens_seen": 54246272, "step": 25150 }, { "epoch": 4.616443384107176, "grad_norm": 4.032003879547119, "learning_rate": 9.487731916408334e-06, "loss": 0.2975, "num_input_tokens_seen": 54257792, "step": 25155 }, { "epoch": 4.617360983666728, "grad_norm": 6.558960914611816, "learning_rate": 9.487378789184915e-06, "loss": 0.2812, "num_input_tokens_seen": 54268832, "step": 25160 }, { "epoch": 4.61827858322628, "grad_norm": 4.114383220672607, "learning_rate": 9.487025546867187e-06, "loss": 0.3315, "num_input_tokens_seen": 54279744, "step": 25165 }, { "epoch": 4.619196182785832, "grad_norm": 3.639566659927368, "learning_rate": 9.486672189464206e-06, "loss": 0.4627, "num_input_tokens_seen": 54291520, "step": 25170 }, { "epoch": 4.620113782345385, "grad_norm": 2.030663013458252, "learning_rate": 9.48631871698504e-06, "loss": 0.2404, "num_input_tokens_seen": 54301504, "step": 25175 }, { "epoch": 4.6210313819049365, "grad_norm": 3.815420150756836, "learning_rate": 9.485965129438748e-06, "loss": 0.229, "num_input_tokens_seen": 54312352, "step": 25180 }, { "epoch": 4.621948981464489, "grad_norm": 4.957703590393066, "learning_rate": 9.485611426834405e-06, "loss": 0.3075, "num_input_tokens_seen": 54323296, "step": 25185 }, { "epoch": 4.6228665810240415, "grad_norm": 8.84874153137207, "learning_rate": 9.48525760918108e-06, "loss": 0.225, "num_input_tokens_seen": 54333696, "step": 25190 }, { "epoch": 4.623784180583593, "grad_norm": 2.4071204662323, "learning_rate": 9.484903676487852e-06, "loss": 0.3269, "num_input_tokens_seen": 54343552, "step": 25195 }, { "epoch": 4.624701780143146, "grad_norm": 7.318710803985596, "learning_rate": 9.484549628763793e-06, "loss": 0.3375, "num_input_tokens_seen": 54354304, "step": 25200 }, { "epoch": 4.625619379702698, "grad_norm": 2.622269630432129, "learning_rate": 9.484195466017986e-06, "loss": 0.3273, "num_input_tokens_seen": 54365312, "step": 25205 }, { "epoch": 4.62653697926225, "grad_norm": 3.484578847885132, "learning_rate": 9.483841188259516e-06, "loss": 0.398, "num_input_tokens_seen": 54375808, "step": 25210 }, { "epoch": 4.627454578821802, "grad_norm": 7.087005615234375, "learning_rate": 9.483486795497469e-06, "loss": 0.3969, "num_input_tokens_seen": 54387072, "step": 25215 }, { "epoch": 4.628372178381355, "grad_norm": 2.5722415447235107, "learning_rate": 9.483132287740934e-06, "loss": 0.2942, "num_input_tokens_seen": 54398624, "step": 25220 }, { "epoch": 4.629289777940906, "grad_norm": 6.50424337387085, "learning_rate": 9.482777664999005e-06, "loss": 0.3905, "num_input_tokens_seen": 54409728, "step": 25225 }, { "epoch": 4.630207377500459, "grad_norm": 2.7285521030426025, "learning_rate": 9.482422927280775e-06, "loss": 0.2166, "num_input_tokens_seen": 54420288, "step": 25230 }, { "epoch": 4.631124977060011, "grad_norm": 3.5235908031463623, "learning_rate": 9.482068074595345e-06, "loss": 0.2447, "num_input_tokens_seen": 54430368, "step": 25235 }, { "epoch": 4.632042576619563, "grad_norm": 3.048677444458008, "learning_rate": 9.481713106951816e-06, "loss": 0.2324, "num_input_tokens_seen": 54440672, "step": 25240 }, { "epoch": 4.6329601761791155, "grad_norm": 7.308864593505859, "learning_rate": 9.48135802435929e-06, "loss": 0.2823, "num_input_tokens_seen": 54452224, "step": 25245 }, { "epoch": 4.633877775738668, "grad_norm": 3.8569111824035645, "learning_rate": 9.481002826826878e-06, "loss": 0.3204, "num_input_tokens_seen": 54462272, "step": 25250 }, { "epoch": 4.63479537529822, "grad_norm": 1.506127119064331, "learning_rate": 9.480647514363689e-06, "loss": 0.3002, "num_input_tokens_seen": 54474176, "step": 25255 }, { "epoch": 4.635712974857772, "grad_norm": 2.1680240631103516, "learning_rate": 9.480292086978835e-06, "loss": 0.2571, "num_input_tokens_seen": 54483136, "step": 25260 }, { "epoch": 4.636630574417325, "grad_norm": 8.835474967956543, "learning_rate": 9.47993654468143e-06, "loss": 0.2498, "num_input_tokens_seen": 54493856, "step": 25265 }, { "epoch": 4.637548173976876, "grad_norm": 5.742399215698242, "learning_rate": 9.4795808874806e-06, "loss": 0.1753, "num_input_tokens_seen": 54503520, "step": 25270 }, { "epoch": 4.638465773536429, "grad_norm": 8.177569389343262, "learning_rate": 9.47922511538546e-06, "loss": 0.333, "num_input_tokens_seen": 54513184, "step": 25275 }, { "epoch": 4.639383373095981, "grad_norm": 3.617522716522217, "learning_rate": 9.478869228405138e-06, "loss": 0.1976, "num_input_tokens_seen": 54523872, "step": 25280 }, { "epoch": 4.640300972655533, "grad_norm": 4.945554733276367, "learning_rate": 9.478513226548765e-06, "loss": 0.3024, "num_input_tokens_seen": 54534528, "step": 25285 }, { "epoch": 4.641218572215085, "grad_norm": 2.2914552688598633, "learning_rate": 9.478157109825466e-06, "loss": 0.2422, "num_input_tokens_seen": 54546144, "step": 25290 }, { "epoch": 4.642136171774638, "grad_norm": 14.631083488464355, "learning_rate": 9.47780087824438e-06, "loss": 0.2551, "num_input_tokens_seen": 54557536, "step": 25295 }, { "epoch": 4.6430537713341895, "grad_norm": 9.69826889038086, "learning_rate": 9.477444531814639e-06, "loss": 0.4016, "num_input_tokens_seen": 54568544, "step": 25300 }, { "epoch": 4.643971370893742, "grad_norm": 17.326366424560547, "learning_rate": 9.477088070545386e-06, "loss": 0.4074, "num_input_tokens_seen": 54579552, "step": 25305 }, { "epoch": 4.6448889704532945, "grad_norm": 5.373518943786621, "learning_rate": 9.476731494445762e-06, "loss": 0.3303, "num_input_tokens_seen": 54590016, "step": 25310 }, { "epoch": 4.645806570012846, "grad_norm": 3.9859840869903564, "learning_rate": 9.476374803524915e-06, "loss": 0.3659, "num_input_tokens_seen": 54601024, "step": 25315 }, { "epoch": 4.646724169572399, "grad_norm": 8.067803382873535, "learning_rate": 9.476017997791991e-06, "loss": 0.206, "num_input_tokens_seen": 54612096, "step": 25320 }, { "epoch": 4.647641769131951, "grad_norm": 5.62422513961792, "learning_rate": 9.475661077256144e-06, "loss": 0.3447, "num_input_tokens_seen": 54622560, "step": 25325 }, { "epoch": 4.648559368691503, "grad_norm": 9.251867294311523, "learning_rate": 9.475304041926525e-06, "loss": 0.2992, "num_input_tokens_seen": 54634208, "step": 25330 }, { "epoch": 4.649476968251055, "grad_norm": 3.126873731613159, "learning_rate": 9.474946891812295e-06, "loss": 0.3638, "num_input_tokens_seen": 54645248, "step": 25335 }, { "epoch": 4.650394567810608, "grad_norm": 1.7876054048538208, "learning_rate": 9.474589626922612e-06, "loss": 0.2067, "num_input_tokens_seen": 54655584, "step": 25340 }, { "epoch": 4.651312167370159, "grad_norm": 8.657245635986328, "learning_rate": 9.47423224726664e-06, "loss": 0.2243, "num_input_tokens_seen": 54668224, "step": 25345 }, { "epoch": 4.652229766929712, "grad_norm": 6.962724685668945, "learning_rate": 9.473874752853544e-06, "loss": 0.2812, "num_input_tokens_seen": 54680320, "step": 25350 }, { "epoch": 4.653147366489264, "grad_norm": 4.443596839904785, "learning_rate": 9.473517143692497e-06, "loss": 0.243, "num_input_tokens_seen": 54691136, "step": 25355 }, { "epoch": 4.654064966048816, "grad_norm": 3.3957924842834473, "learning_rate": 9.473159419792668e-06, "loss": 0.3771, "num_input_tokens_seen": 54701888, "step": 25360 }, { "epoch": 4.6549825656083685, "grad_norm": 3.2977283000946045, "learning_rate": 9.472801581163232e-06, "loss": 0.2392, "num_input_tokens_seen": 54712512, "step": 25365 }, { "epoch": 4.655900165167921, "grad_norm": 2.0745532512664795, "learning_rate": 9.472443627813369e-06, "loss": 0.2856, "num_input_tokens_seen": 54722976, "step": 25370 }, { "epoch": 4.656817764727473, "grad_norm": 1.8326550722122192, "learning_rate": 9.472085559752256e-06, "loss": 0.3442, "num_input_tokens_seen": 54733888, "step": 25375 }, { "epoch": 4.657735364287025, "grad_norm": 2.7156529426574707, "learning_rate": 9.471727376989081e-06, "loss": 0.2505, "num_input_tokens_seen": 54743360, "step": 25380 }, { "epoch": 4.658652963846578, "grad_norm": 2.0742530822753906, "learning_rate": 9.47136907953303e-06, "loss": 0.2076, "num_input_tokens_seen": 54754272, "step": 25385 }, { "epoch": 4.659570563406129, "grad_norm": 8.160486221313477, "learning_rate": 9.47101066739329e-06, "loss": 0.3681, "num_input_tokens_seen": 54765472, "step": 25390 }, { "epoch": 4.660488162965682, "grad_norm": 3.5778698921203613, "learning_rate": 9.470652140579057e-06, "loss": 0.3589, "num_input_tokens_seen": 54776288, "step": 25395 }, { "epoch": 4.661405762525234, "grad_norm": 4.224736213684082, "learning_rate": 9.470293499099526e-06, "loss": 0.224, "num_input_tokens_seen": 54788192, "step": 25400 }, { "epoch": 4.662323362084786, "grad_norm": 9.431407928466797, "learning_rate": 9.469934742963896e-06, "loss": 0.2557, "num_input_tokens_seen": 54797696, "step": 25405 }, { "epoch": 4.663240961644338, "grad_norm": 8.648791313171387, "learning_rate": 9.469575872181366e-06, "loss": 0.2474, "num_input_tokens_seen": 54807200, "step": 25410 }, { "epoch": 4.664158561203891, "grad_norm": 4.201943397521973, "learning_rate": 9.469216886761142e-06, "loss": 0.3819, "num_input_tokens_seen": 54818208, "step": 25415 }, { "epoch": 4.665076160763443, "grad_norm": 1.4609838724136353, "learning_rate": 9.468857786712434e-06, "loss": 0.3355, "num_input_tokens_seen": 54829408, "step": 25420 }, { "epoch": 4.665993760322995, "grad_norm": 9.63943862915039, "learning_rate": 9.468498572044447e-06, "loss": 0.2322, "num_input_tokens_seen": 54840704, "step": 25425 }, { "epoch": 4.666911359882548, "grad_norm": 2.9699366092681885, "learning_rate": 9.468139242766397e-06, "loss": 0.2785, "num_input_tokens_seen": 54851840, "step": 25430 }, { "epoch": 4.667828959442099, "grad_norm": 2.5661730766296387, "learning_rate": 9.467779798887502e-06, "loss": 0.1554, "num_input_tokens_seen": 54862848, "step": 25435 }, { "epoch": 4.668746559001652, "grad_norm": 5.447769641876221, "learning_rate": 9.467420240416978e-06, "loss": 0.1908, "num_input_tokens_seen": 54873792, "step": 25440 }, { "epoch": 4.669664158561204, "grad_norm": 6.45830774307251, "learning_rate": 9.46706056736405e-06, "loss": 0.3768, "num_input_tokens_seen": 54884224, "step": 25445 }, { "epoch": 4.670581758120756, "grad_norm": 3.498304843902588, "learning_rate": 9.466700779737942e-06, "loss": 0.2587, "num_input_tokens_seen": 54892896, "step": 25450 }, { "epoch": 4.671499357680308, "grad_norm": 6.781832695007324, "learning_rate": 9.466340877547882e-06, "loss": 0.2109, "num_input_tokens_seen": 54902752, "step": 25455 }, { "epoch": 4.672416957239861, "grad_norm": 3.483630895614624, "learning_rate": 9.465980860803098e-06, "loss": 0.3416, "num_input_tokens_seen": 54913696, "step": 25460 }, { "epoch": 4.6733345567994125, "grad_norm": 9.488380432128906, "learning_rate": 9.46562072951283e-06, "loss": 0.1664, "num_input_tokens_seen": 54924992, "step": 25465 }, { "epoch": 4.674252156358965, "grad_norm": 8.136301040649414, "learning_rate": 9.465260483686309e-06, "loss": 0.34, "num_input_tokens_seen": 54935136, "step": 25470 }, { "epoch": 4.6751697559185175, "grad_norm": 3.4505512714385986, "learning_rate": 9.46490012333278e-06, "loss": 0.1485, "num_input_tokens_seen": 54946336, "step": 25475 }, { "epoch": 4.676087355478069, "grad_norm": 8.219185829162598, "learning_rate": 9.46453964846148e-06, "loss": 0.4408, "num_input_tokens_seen": 54957408, "step": 25480 }, { "epoch": 4.677004955037622, "grad_norm": 9.406825065612793, "learning_rate": 9.464179059081657e-06, "loss": 0.3515, "num_input_tokens_seen": 54968768, "step": 25485 }, { "epoch": 4.677922554597174, "grad_norm": 4.5340142250061035, "learning_rate": 9.463818355202562e-06, "loss": 0.299, "num_input_tokens_seen": 54979552, "step": 25490 }, { "epoch": 4.678840154156726, "grad_norm": 3.6946768760681152, "learning_rate": 9.463457536833443e-06, "loss": 0.2932, "num_input_tokens_seen": 54990112, "step": 25495 }, { "epoch": 4.679757753716278, "grad_norm": 3.801403045654297, "learning_rate": 9.463096603983557e-06, "loss": 0.1607, "num_input_tokens_seen": 55000736, "step": 25500 }, { "epoch": 4.680675353275831, "grad_norm": 2.634288787841797, "learning_rate": 9.46273555666216e-06, "loss": 0.3801, "num_input_tokens_seen": 55012416, "step": 25505 }, { "epoch": 4.681592952835382, "grad_norm": 11.13072395324707, "learning_rate": 9.462374394878513e-06, "loss": 0.1511, "num_input_tokens_seen": 55022656, "step": 25510 }, { "epoch": 4.682510552394935, "grad_norm": 6.57916784286499, "learning_rate": 9.462013118641878e-06, "loss": 0.3433, "num_input_tokens_seen": 55033568, "step": 25515 }, { "epoch": 4.683428151954487, "grad_norm": 1.3402882814407349, "learning_rate": 9.461651727961523e-06, "loss": 0.1352, "num_input_tokens_seen": 55043616, "step": 25520 }, { "epoch": 4.684345751514039, "grad_norm": 9.136510848999023, "learning_rate": 9.461290222846716e-06, "loss": 0.2629, "num_input_tokens_seen": 55053472, "step": 25525 }, { "epoch": 4.6852633510735915, "grad_norm": 9.581138610839844, "learning_rate": 9.460928603306728e-06, "loss": 0.2896, "num_input_tokens_seen": 55064352, "step": 25530 }, { "epoch": 4.686180950633144, "grad_norm": 3.198476552963257, "learning_rate": 9.460566869350835e-06, "loss": 0.2716, "num_input_tokens_seen": 55074976, "step": 25535 }, { "epoch": 4.687098550192696, "grad_norm": 9.002999305725098, "learning_rate": 9.460205020988316e-06, "loss": 0.3174, "num_input_tokens_seen": 55087008, "step": 25540 }, { "epoch": 4.688016149752248, "grad_norm": 3.476306915283203, "learning_rate": 9.459843058228451e-06, "loss": 0.392, "num_input_tokens_seen": 55098336, "step": 25545 }, { "epoch": 4.688933749311801, "grad_norm": 2.1799917221069336, "learning_rate": 9.459480981080523e-06, "loss": 0.3447, "num_input_tokens_seen": 55108480, "step": 25550 }, { "epoch": 4.689851348871352, "grad_norm": 5.45437479019165, "learning_rate": 9.459118789553818e-06, "loss": 0.277, "num_input_tokens_seen": 55119264, "step": 25555 }, { "epoch": 4.690768948430905, "grad_norm": 3.272061824798584, "learning_rate": 9.45875648365763e-06, "loss": 0.422, "num_input_tokens_seen": 55130560, "step": 25560 }, { "epoch": 4.691686547990457, "grad_norm": 11.048601150512695, "learning_rate": 9.458394063401249e-06, "loss": 0.4262, "num_input_tokens_seen": 55140896, "step": 25565 }, { "epoch": 4.692604147550009, "grad_norm": 2.936340093612671, "learning_rate": 9.458031528793968e-06, "loss": 0.2867, "num_input_tokens_seen": 55150784, "step": 25570 }, { "epoch": 4.693521747109561, "grad_norm": 4.6383376121521, "learning_rate": 9.457668879845088e-06, "loss": 0.3277, "num_input_tokens_seen": 55161792, "step": 25575 }, { "epoch": 4.694439346669114, "grad_norm": 9.53497314453125, "learning_rate": 9.457306116563909e-06, "loss": 0.3102, "num_input_tokens_seen": 55173056, "step": 25580 }, { "epoch": 4.6953569462286655, "grad_norm": 10.351850509643555, "learning_rate": 9.456943238959738e-06, "loss": 0.2164, "num_input_tokens_seen": 55183616, "step": 25585 }, { "epoch": 4.696274545788218, "grad_norm": 8.42113208770752, "learning_rate": 9.45658024704188e-06, "loss": 0.204, "num_input_tokens_seen": 55194144, "step": 25590 }, { "epoch": 4.6971921453477705, "grad_norm": 4.281341075897217, "learning_rate": 9.456217140819645e-06, "loss": 0.4212, "num_input_tokens_seen": 55204096, "step": 25595 }, { "epoch": 4.698109744907322, "grad_norm": 3.268224000930786, "learning_rate": 9.45585392030235e-06, "loss": 0.2754, "num_input_tokens_seen": 55214976, "step": 25600 }, { "epoch": 4.699027344466875, "grad_norm": 2.649686813354492, "learning_rate": 9.455490585499304e-06, "loss": 0.2195, "num_input_tokens_seen": 55225472, "step": 25605 }, { "epoch": 4.699944944026427, "grad_norm": 1.5602470636367798, "learning_rate": 9.455127136419832e-06, "loss": 0.2301, "num_input_tokens_seen": 55237312, "step": 25610 }, { "epoch": 4.700862543585979, "grad_norm": 8.140501976013184, "learning_rate": 9.454763573073253e-06, "loss": 0.3156, "num_input_tokens_seen": 55248736, "step": 25615 }, { "epoch": 4.701780143145531, "grad_norm": 3.584346055984497, "learning_rate": 9.454399895468893e-06, "loss": 0.3159, "num_input_tokens_seen": 55259232, "step": 25620 }, { "epoch": 4.702697742705084, "grad_norm": 0.6489643454551697, "learning_rate": 9.454036103616078e-06, "loss": 0.1929, "num_input_tokens_seen": 55268544, "step": 25625 }, { "epoch": 4.703615342264635, "grad_norm": 7.145849227905273, "learning_rate": 9.453672197524142e-06, "loss": 0.3187, "num_input_tokens_seen": 55278752, "step": 25630 }, { "epoch": 4.704532941824188, "grad_norm": 1.9006452560424805, "learning_rate": 9.453308177202416e-06, "loss": 0.2177, "num_input_tokens_seen": 55290560, "step": 25635 }, { "epoch": 4.70545054138374, "grad_norm": 6.934666633605957, "learning_rate": 9.452944042660238e-06, "loss": 0.4366, "num_input_tokens_seen": 55300800, "step": 25640 }, { "epoch": 4.706368140943292, "grad_norm": 1.9689147472381592, "learning_rate": 9.452579793906945e-06, "loss": 0.3044, "num_input_tokens_seen": 55311808, "step": 25645 }, { "epoch": 4.7072857405028445, "grad_norm": 6.850203990936279, "learning_rate": 9.452215430951883e-06, "loss": 0.2222, "num_input_tokens_seen": 55322144, "step": 25650 }, { "epoch": 4.708203340062397, "grad_norm": 1.1899250745773315, "learning_rate": 9.451850953804393e-06, "loss": 0.3352, "num_input_tokens_seen": 55332224, "step": 25655 }, { "epoch": 4.709120939621949, "grad_norm": 5.072474479675293, "learning_rate": 9.451486362473829e-06, "loss": 0.2037, "num_input_tokens_seen": 55342400, "step": 25660 }, { "epoch": 4.710038539181501, "grad_norm": 9.426180839538574, "learning_rate": 9.451121656969537e-06, "loss": 0.3214, "num_input_tokens_seen": 55352992, "step": 25665 }, { "epoch": 4.710956138741054, "grad_norm": 7.165760040283203, "learning_rate": 9.450756837300873e-06, "loss": 0.2864, "num_input_tokens_seen": 55364384, "step": 25670 }, { "epoch": 4.711873738300605, "grad_norm": 2.850010871887207, "learning_rate": 9.450391903477196e-06, "loss": 0.2, "num_input_tokens_seen": 55374656, "step": 25675 }, { "epoch": 4.712791337860158, "grad_norm": 0.5923324823379517, "learning_rate": 9.450026855507861e-06, "loss": 0.1356, "num_input_tokens_seen": 55386080, "step": 25680 }, { "epoch": 4.71370893741971, "grad_norm": 5.2656378746032715, "learning_rate": 9.449661693402237e-06, "loss": 0.3522, "num_input_tokens_seen": 55394688, "step": 25685 }, { "epoch": 4.714626536979262, "grad_norm": 6.301571846008301, "learning_rate": 9.449296417169685e-06, "loss": 0.2675, "num_input_tokens_seen": 55406368, "step": 25690 }, { "epoch": 4.715544136538814, "grad_norm": 9.681878089904785, "learning_rate": 9.448931026819577e-06, "loss": 0.4314, "num_input_tokens_seen": 55416256, "step": 25695 }, { "epoch": 4.716461736098367, "grad_norm": 2.2333154678344727, "learning_rate": 9.448565522361282e-06, "loss": 0.3548, "num_input_tokens_seen": 55427424, "step": 25700 }, { "epoch": 4.717379335657919, "grad_norm": 11.356388092041016, "learning_rate": 9.448199903804178e-06, "loss": 0.416, "num_input_tokens_seen": 55437280, "step": 25705 }, { "epoch": 4.718296935217471, "grad_norm": 2.2749757766723633, "learning_rate": 9.44783417115764e-06, "loss": 0.2583, "num_input_tokens_seen": 55447296, "step": 25710 }, { "epoch": 4.719214534777024, "grad_norm": 1.806891918182373, "learning_rate": 9.447468324431049e-06, "loss": 0.189, "num_input_tokens_seen": 55457856, "step": 25715 }, { "epoch": 4.720132134336575, "grad_norm": 6.916558742523193, "learning_rate": 9.447102363633787e-06, "loss": 0.3043, "num_input_tokens_seen": 55468512, "step": 25720 }, { "epoch": 4.721049733896128, "grad_norm": 2.6669297218322754, "learning_rate": 9.446736288775242e-06, "loss": 0.2087, "num_input_tokens_seen": 55479488, "step": 25725 }, { "epoch": 4.72196733345568, "grad_norm": 4.406946182250977, "learning_rate": 9.446370099864803e-06, "loss": 0.2662, "num_input_tokens_seen": 55489792, "step": 25730 }, { "epoch": 4.722884933015232, "grad_norm": 4.152502536773682, "learning_rate": 9.446003796911864e-06, "loss": 0.3164, "num_input_tokens_seen": 55500512, "step": 25735 }, { "epoch": 4.723802532574784, "grad_norm": 13.71589469909668, "learning_rate": 9.445637379925816e-06, "loss": 0.2372, "num_input_tokens_seen": 55511456, "step": 25740 }, { "epoch": 4.724720132134337, "grad_norm": 3.8634626865386963, "learning_rate": 9.445270848916061e-06, "loss": 0.1979, "num_input_tokens_seen": 55522624, "step": 25745 }, { "epoch": 4.7256377316938885, "grad_norm": 5.2467451095581055, "learning_rate": 9.444904203891999e-06, "loss": 0.5065, "num_input_tokens_seen": 55533440, "step": 25750 }, { "epoch": 4.726555331253441, "grad_norm": 1.8990569114685059, "learning_rate": 9.44453744486303e-06, "loss": 0.2249, "num_input_tokens_seen": 55544320, "step": 25755 }, { "epoch": 4.7274729308129935, "grad_norm": 3.3464386463165283, "learning_rate": 9.444170571838566e-06, "loss": 0.2797, "num_input_tokens_seen": 55555200, "step": 25760 }, { "epoch": 4.728390530372545, "grad_norm": 13.110651016235352, "learning_rate": 9.443803584828016e-06, "loss": 0.3213, "num_input_tokens_seen": 55566368, "step": 25765 }, { "epoch": 4.729308129932098, "grad_norm": 12.325387001037598, "learning_rate": 9.443436483840788e-06, "loss": 0.3016, "num_input_tokens_seen": 55576320, "step": 25770 }, { "epoch": 4.73022572949165, "grad_norm": 3.1129353046417236, "learning_rate": 9.443069268886304e-06, "loss": 0.3633, "num_input_tokens_seen": 55586432, "step": 25775 }, { "epoch": 4.731143329051202, "grad_norm": 4.143754005432129, "learning_rate": 9.442701939973978e-06, "loss": 0.2605, "num_input_tokens_seen": 55597760, "step": 25780 }, { "epoch": 4.732060928610754, "grad_norm": 2.317783832550049, "learning_rate": 9.442334497113233e-06, "loss": 0.2561, "num_input_tokens_seen": 55608032, "step": 25785 }, { "epoch": 4.732978528170307, "grad_norm": 8.090363502502441, "learning_rate": 9.441966940313493e-06, "loss": 0.2141, "num_input_tokens_seen": 55619296, "step": 25790 }, { "epoch": 4.733896127729858, "grad_norm": 16.121023178100586, "learning_rate": 9.441599269584185e-06, "loss": 0.3071, "num_input_tokens_seen": 55628352, "step": 25795 }, { "epoch": 4.734813727289411, "grad_norm": 6.394625186920166, "learning_rate": 9.44123148493474e-06, "loss": 0.2319, "num_input_tokens_seen": 55639904, "step": 25800 }, { "epoch": 4.735731326848963, "grad_norm": 4.130856037139893, "learning_rate": 9.44086358637459e-06, "loss": 0.4636, "num_input_tokens_seen": 55651456, "step": 25805 }, { "epoch": 4.736648926408515, "grad_norm": 5.347288131713867, "learning_rate": 9.440495573913174e-06, "loss": 0.326, "num_input_tokens_seen": 55662400, "step": 25810 }, { "epoch": 4.7375665259680675, "grad_norm": 4.615303993225098, "learning_rate": 9.440127447559926e-06, "loss": 0.3347, "num_input_tokens_seen": 55673920, "step": 25815 }, { "epoch": 4.73848412552762, "grad_norm": 22.8259334564209, "learning_rate": 9.439759207324292e-06, "loss": 0.4087, "num_input_tokens_seen": 55685024, "step": 25820 }, { "epoch": 4.739401725087172, "grad_norm": 5.818353652954102, "learning_rate": 9.439390853215716e-06, "loss": 0.1907, "num_input_tokens_seen": 55696160, "step": 25825 }, { "epoch": 4.740319324646724, "grad_norm": 2.822514057159424, "learning_rate": 9.439022385243643e-06, "loss": 0.1863, "num_input_tokens_seen": 55707200, "step": 25830 }, { "epoch": 4.741236924206277, "grad_norm": 6.348196506500244, "learning_rate": 9.438653803417526e-06, "loss": 0.2873, "num_input_tokens_seen": 55718080, "step": 25835 }, { "epoch": 4.742154523765828, "grad_norm": 8.90600872039795, "learning_rate": 9.438285107746819e-06, "loss": 0.2562, "num_input_tokens_seen": 55729600, "step": 25840 }, { "epoch": 4.743072123325381, "grad_norm": 9.239771842956543, "learning_rate": 9.437916298240979e-06, "loss": 0.323, "num_input_tokens_seen": 55741440, "step": 25845 }, { "epoch": 4.743989722884933, "grad_norm": 2.6161587238311768, "learning_rate": 9.437547374909462e-06, "loss": 0.296, "num_input_tokens_seen": 55752032, "step": 25850 }, { "epoch": 4.744907322444485, "grad_norm": 4.507023334503174, "learning_rate": 9.437178337761733e-06, "loss": 0.4011, "num_input_tokens_seen": 55762496, "step": 25855 }, { "epoch": 4.745824922004037, "grad_norm": 8.56574535369873, "learning_rate": 9.436809186807257e-06, "loss": 0.3968, "num_input_tokens_seen": 55774560, "step": 25860 }, { "epoch": 4.74674252156359, "grad_norm": 3.291977643966675, "learning_rate": 9.436439922055502e-06, "loss": 0.2775, "num_input_tokens_seen": 55784224, "step": 25865 }, { "epoch": 4.7476601211231415, "grad_norm": 4.948385715484619, "learning_rate": 9.436070543515939e-06, "loss": 0.2256, "num_input_tokens_seen": 55795744, "step": 25870 }, { "epoch": 4.748577720682694, "grad_norm": 2.817228078842163, "learning_rate": 9.43570105119804e-06, "loss": 0.2454, "num_input_tokens_seen": 55807328, "step": 25875 }, { "epoch": 4.7494953202422465, "grad_norm": 6.095754146575928, "learning_rate": 9.435331445111285e-06, "loss": 0.2008, "num_input_tokens_seen": 55817792, "step": 25880 }, { "epoch": 4.750412919801798, "grad_norm": 0.8818953633308411, "learning_rate": 9.434961725265153e-06, "loss": 0.1472, "num_input_tokens_seen": 55828896, "step": 25885 }, { "epoch": 4.751330519361351, "grad_norm": 13.628582000732422, "learning_rate": 9.434591891669125e-06, "loss": 0.3314, "num_input_tokens_seen": 55838496, "step": 25890 }, { "epoch": 4.752248118920903, "grad_norm": 10.688754081726074, "learning_rate": 9.43422194433269e-06, "loss": 0.3122, "num_input_tokens_seen": 55850272, "step": 25895 }, { "epoch": 4.753165718480455, "grad_norm": 3.9288411140441895, "learning_rate": 9.433851883265334e-06, "loss": 0.2143, "num_input_tokens_seen": 55860352, "step": 25900 }, { "epoch": 4.754083318040007, "grad_norm": 1.135769248008728, "learning_rate": 9.433481708476548e-06, "loss": 0.2128, "num_input_tokens_seen": 55871616, "step": 25905 }, { "epoch": 4.75500091759956, "grad_norm": 3.5774550437927246, "learning_rate": 9.433111419975828e-06, "loss": 0.295, "num_input_tokens_seen": 55881792, "step": 25910 }, { "epoch": 4.755918517159111, "grad_norm": 2.6637563705444336, "learning_rate": 9.432741017772671e-06, "loss": 0.1753, "num_input_tokens_seen": 55891456, "step": 25915 }, { "epoch": 4.756836116718664, "grad_norm": 2.7914960384368896, "learning_rate": 9.432370501876577e-06, "loss": 0.1993, "num_input_tokens_seen": 55902784, "step": 25920 }, { "epoch": 4.757753716278216, "grad_norm": 5.000894546508789, "learning_rate": 9.431999872297048e-06, "loss": 0.358, "num_input_tokens_seen": 55913728, "step": 25925 }, { "epoch": 4.758671315837768, "grad_norm": 2.499415636062622, "learning_rate": 9.431629129043593e-06, "loss": 0.228, "num_input_tokens_seen": 55924192, "step": 25930 }, { "epoch": 4.7595889153973205, "grad_norm": 4.378442764282227, "learning_rate": 9.431258272125718e-06, "loss": 0.2384, "num_input_tokens_seen": 55935456, "step": 25935 }, { "epoch": 4.760506514956873, "grad_norm": 5.5613813400268555, "learning_rate": 9.430887301552936e-06, "loss": 0.223, "num_input_tokens_seen": 55944000, "step": 25940 }, { "epoch": 4.761424114516425, "grad_norm": 9.006390571594238, "learning_rate": 9.430516217334762e-06, "loss": 0.2012, "num_input_tokens_seen": 55955008, "step": 25945 }, { "epoch": 4.762341714075977, "grad_norm": 6.012295722961426, "learning_rate": 9.430145019480715e-06, "loss": 0.2735, "num_input_tokens_seen": 55965632, "step": 25950 }, { "epoch": 4.76325931363553, "grad_norm": 4.060110092163086, "learning_rate": 9.429773708000314e-06, "loss": 0.1499, "num_input_tokens_seen": 55976736, "step": 25955 }, { "epoch": 4.764176913195081, "grad_norm": 2.237424612045288, "learning_rate": 9.429402282903082e-06, "loss": 0.389, "num_input_tokens_seen": 55987936, "step": 25960 }, { "epoch": 4.765094512754634, "grad_norm": 1.518609642982483, "learning_rate": 9.429030744198547e-06, "loss": 0.1692, "num_input_tokens_seen": 55998496, "step": 25965 }, { "epoch": 4.766012112314186, "grad_norm": 2.336477756500244, "learning_rate": 9.428659091896237e-06, "loss": 0.223, "num_input_tokens_seen": 56009856, "step": 25970 }, { "epoch": 4.766929711873738, "grad_norm": 8.639717102050781, "learning_rate": 9.428287326005687e-06, "loss": 0.2023, "num_input_tokens_seen": 56020736, "step": 25975 }, { "epoch": 4.76784731143329, "grad_norm": 1.078254222869873, "learning_rate": 9.427915446536428e-06, "loss": 0.3211, "num_input_tokens_seen": 56030528, "step": 25980 }, { "epoch": 4.768764910992843, "grad_norm": 6.035226821899414, "learning_rate": 9.427543453498003e-06, "loss": 0.2463, "num_input_tokens_seen": 56041120, "step": 25985 }, { "epoch": 4.769682510552395, "grad_norm": 4.761897563934326, "learning_rate": 9.427171346899949e-06, "loss": 0.2706, "num_input_tokens_seen": 56052320, "step": 25990 }, { "epoch": 4.770600110111947, "grad_norm": 3.9337689876556396, "learning_rate": 9.426799126751811e-06, "loss": 0.2546, "num_input_tokens_seen": 56063232, "step": 25995 }, { "epoch": 4.7715177096715, "grad_norm": 2.2726798057556152, "learning_rate": 9.426426793063136e-06, "loss": 0.4035, "num_input_tokens_seen": 56073920, "step": 26000 }, { "epoch": 4.772435309231051, "grad_norm": 2.2117905616760254, "learning_rate": 9.426054345843476e-06, "loss": 0.3023, "num_input_tokens_seen": 56084064, "step": 26005 }, { "epoch": 4.773352908790604, "grad_norm": 5.356832504272461, "learning_rate": 9.42568178510238e-06, "loss": 0.2716, "num_input_tokens_seen": 56095200, "step": 26010 }, { "epoch": 4.774270508350156, "grad_norm": 1.5693002939224243, "learning_rate": 9.425309110849407e-06, "loss": 0.2415, "num_input_tokens_seen": 56106848, "step": 26015 }, { "epoch": 4.775188107909708, "grad_norm": 5.341166973114014, "learning_rate": 9.424936323094111e-06, "loss": 0.2474, "num_input_tokens_seen": 56118400, "step": 26020 }, { "epoch": 4.77610570746926, "grad_norm": 0.9825311899185181, "learning_rate": 9.42456342184606e-06, "loss": 0.1632, "num_input_tokens_seen": 56128224, "step": 26025 }, { "epoch": 4.777023307028813, "grad_norm": 6.688595294952393, "learning_rate": 9.424190407114812e-06, "loss": 0.44, "num_input_tokens_seen": 56139776, "step": 26030 }, { "epoch": 4.7779409065883645, "grad_norm": 24.4301700592041, "learning_rate": 9.42381727890994e-06, "loss": 0.2618, "num_input_tokens_seen": 56149856, "step": 26035 }, { "epoch": 4.778858506147917, "grad_norm": 0.9237750172615051, "learning_rate": 9.423444037241007e-06, "loss": 0.219, "num_input_tokens_seen": 56159456, "step": 26040 }, { "epoch": 4.7797761057074695, "grad_norm": 0.8011351823806763, "learning_rate": 9.423070682117592e-06, "loss": 0.3464, "num_input_tokens_seen": 56168448, "step": 26045 }, { "epoch": 4.780693705267021, "grad_norm": 0.9564129114151001, "learning_rate": 9.42269721354927e-06, "loss": 0.1779, "num_input_tokens_seen": 56178624, "step": 26050 }, { "epoch": 4.781611304826574, "grad_norm": 6.72743558883667, "learning_rate": 9.422323631545618e-06, "loss": 0.2944, "num_input_tokens_seen": 56189664, "step": 26055 }, { "epoch": 4.782528904386126, "grad_norm": 6.315370559692383, "learning_rate": 9.42194993611622e-06, "loss": 0.3742, "num_input_tokens_seen": 56200128, "step": 26060 }, { "epoch": 4.783446503945678, "grad_norm": 18.281526565551758, "learning_rate": 9.421576127270658e-06, "loss": 0.2314, "num_input_tokens_seen": 56211296, "step": 26065 }, { "epoch": 4.78436410350523, "grad_norm": 14.094537734985352, "learning_rate": 9.421202205018522e-06, "loss": 0.3141, "num_input_tokens_seen": 56220832, "step": 26070 }, { "epoch": 4.785281703064783, "grad_norm": 13.170684814453125, "learning_rate": 9.420828169369403e-06, "loss": 0.3425, "num_input_tokens_seen": 56232384, "step": 26075 }, { "epoch": 4.786199302624334, "grad_norm": 13.641843795776367, "learning_rate": 9.42045402033289e-06, "loss": 0.2202, "num_input_tokens_seen": 56242976, "step": 26080 }, { "epoch": 4.787116902183887, "grad_norm": 10.980340003967285, "learning_rate": 9.420079757918585e-06, "loss": 0.4048, "num_input_tokens_seen": 56253984, "step": 26085 }, { "epoch": 4.788034501743439, "grad_norm": 0.731786847114563, "learning_rate": 9.419705382136084e-06, "loss": 0.3638, "num_input_tokens_seen": 56265600, "step": 26090 }, { "epoch": 4.788952101302991, "grad_norm": 2.582420587539673, "learning_rate": 9.41933089299499e-06, "loss": 0.237, "num_input_tokens_seen": 56277728, "step": 26095 }, { "epoch": 4.7898697008625435, "grad_norm": 0.803958535194397, "learning_rate": 9.418956290504908e-06, "loss": 0.2304, "num_input_tokens_seen": 56288320, "step": 26100 }, { "epoch": 4.790787300422096, "grad_norm": 4.118902683258057, "learning_rate": 9.418581574675445e-06, "loss": 0.2825, "num_input_tokens_seen": 56299936, "step": 26105 }, { "epoch": 4.791704899981648, "grad_norm": 7.6139960289001465, "learning_rate": 9.418206745516213e-06, "loss": 0.3133, "num_input_tokens_seen": 56310560, "step": 26110 }, { "epoch": 4.7926224995412, "grad_norm": 1.4172393083572388, "learning_rate": 9.417831803036826e-06, "loss": 0.308, "num_input_tokens_seen": 56321280, "step": 26115 }, { "epoch": 4.793540099100753, "grad_norm": 5.2952399253845215, "learning_rate": 9.4174567472469e-06, "loss": 0.3479, "num_input_tokens_seen": 56332480, "step": 26120 }, { "epoch": 4.794457698660304, "grad_norm": 7.379637241363525, "learning_rate": 9.417081578156055e-06, "loss": 0.2694, "num_input_tokens_seen": 56343616, "step": 26125 }, { "epoch": 4.795375298219857, "grad_norm": 1.1475375890731812, "learning_rate": 9.416706295773914e-06, "loss": 0.2204, "num_input_tokens_seen": 56354624, "step": 26130 }, { "epoch": 4.796292897779409, "grad_norm": 14.25490951538086, "learning_rate": 9.416330900110102e-06, "loss": 0.3086, "num_input_tokens_seen": 56365344, "step": 26135 }, { "epoch": 4.797210497338961, "grad_norm": 10.09211540222168, "learning_rate": 9.415955391174245e-06, "loss": 0.1614, "num_input_tokens_seen": 56375264, "step": 26140 }, { "epoch": 4.798128096898513, "grad_norm": 19.033065795898438, "learning_rate": 9.415579768975979e-06, "loss": 0.1717, "num_input_tokens_seen": 56385728, "step": 26145 }, { "epoch": 4.799045696458066, "grad_norm": 3.969425678253174, "learning_rate": 9.415204033524934e-06, "loss": 0.3348, "num_input_tokens_seen": 56397472, "step": 26150 }, { "epoch": 4.7999632960176175, "grad_norm": 21.711048126220703, "learning_rate": 9.41482818483075e-06, "loss": 0.3222, "num_input_tokens_seen": 56408832, "step": 26155 }, { "epoch": 4.80088089557717, "grad_norm": 4.81239652633667, "learning_rate": 9.414452222903064e-06, "loss": 0.3715, "num_input_tokens_seen": 56420576, "step": 26160 }, { "epoch": 4.8017984951367225, "grad_norm": 9.0165376663208, "learning_rate": 9.41407614775152e-06, "loss": 0.3522, "num_input_tokens_seen": 56431744, "step": 26165 }, { "epoch": 4.802716094696274, "grad_norm": 10.338902473449707, "learning_rate": 9.413699959385762e-06, "loss": 0.3589, "num_input_tokens_seen": 56443424, "step": 26170 }, { "epoch": 4.803633694255827, "grad_norm": 7.757254600524902, "learning_rate": 9.413323657815444e-06, "loss": 0.315, "num_input_tokens_seen": 56454336, "step": 26175 }, { "epoch": 4.804551293815379, "grad_norm": 5.629192352294922, "learning_rate": 9.412947243050213e-06, "loss": 0.3223, "num_input_tokens_seen": 56464416, "step": 26180 }, { "epoch": 4.805468893374931, "grad_norm": 4.125792980194092, "learning_rate": 9.412570715099725e-06, "loss": 0.3057, "num_input_tokens_seen": 56475392, "step": 26185 }, { "epoch": 4.806386492934483, "grad_norm": 1.9703325033187866, "learning_rate": 9.412194073973637e-06, "loss": 0.193, "num_input_tokens_seen": 56486848, "step": 26190 }, { "epoch": 4.807304092494036, "grad_norm": 7.5750017166137695, "learning_rate": 9.411817319681608e-06, "loss": 0.3339, "num_input_tokens_seen": 56499456, "step": 26195 }, { "epoch": 4.808221692053587, "grad_norm": 8.519920349121094, "learning_rate": 9.411440452233305e-06, "loss": 0.3361, "num_input_tokens_seen": 56510336, "step": 26200 }, { "epoch": 4.80913929161314, "grad_norm": 10.99833869934082, "learning_rate": 9.41106347163839e-06, "loss": 0.2484, "num_input_tokens_seen": 56521760, "step": 26205 }, { "epoch": 4.810056891172692, "grad_norm": 11.16801929473877, "learning_rate": 9.410686377906532e-06, "loss": 0.2679, "num_input_tokens_seen": 56532256, "step": 26210 }, { "epoch": 4.810974490732244, "grad_norm": 4.292949676513672, "learning_rate": 9.410309171047407e-06, "loss": 0.1967, "num_input_tokens_seen": 56543520, "step": 26215 }, { "epoch": 4.8118920902917965, "grad_norm": 5.241404056549072, "learning_rate": 9.409931851070687e-06, "loss": 0.3444, "num_input_tokens_seen": 56554592, "step": 26220 }, { "epoch": 4.812809689851349, "grad_norm": 13.831949234008789, "learning_rate": 9.40955441798605e-06, "loss": 0.3008, "num_input_tokens_seen": 56564416, "step": 26225 }, { "epoch": 4.813727289410901, "grad_norm": 13.421677589416504, "learning_rate": 9.409176871803175e-06, "loss": 0.2807, "num_input_tokens_seen": 56575104, "step": 26230 }, { "epoch": 4.814644888970453, "grad_norm": 4.21735954284668, "learning_rate": 9.408799212531745e-06, "loss": 0.2477, "num_input_tokens_seen": 56586080, "step": 26235 }, { "epoch": 4.815562488530006, "grad_norm": 4.862264156341553, "learning_rate": 9.40842144018145e-06, "loss": 0.4285, "num_input_tokens_seen": 56597024, "step": 26240 }, { "epoch": 4.816480088089557, "grad_norm": 12.18935489654541, "learning_rate": 9.408043554761979e-06, "loss": 0.3651, "num_input_tokens_seen": 56607136, "step": 26245 }, { "epoch": 4.81739768764911, "grad_norm": 3.364845037460327, "learning_rate": 9.40766555628302e-06, "loss": 0.2794, "num_input_tokens_seen": 56617504, "step": 26250 }, { "epoch": 4.818315287208662, "grad_norm": 9.316247940063477, "learning_rate": 9.407287444754275e-06, "loss": 0.3856, "num_input_tokens_seen": 56628416, "step": 26255 }, { "epoch": 4.819232886768214, "grad_norm": 3.096149206161499, "learning_rate": 9.406909220185435e-06, "loss": 0.2953, "num_input_tokens_seen": 56639776, "step": 26260 }, { "epoch": 4.820150486327766, "grad_norm": 3.5858612060546875, "learning_rate": 9.406530882586202e-06, "loss": 0.3391, "num_input_tokens_seen": 56649024, "step": 26265 }, { "epoch": 4.821068085887319, "grad_norm": 4.581793308258057, "learning_rate": 9.406152431966283e-06, "loss": 0.1787, "num_input_tokens_seen": 56660416, "step": 26270 }, { "epoch": 4.821985685446871, "grad_norm": 4.72875452041626, "learning_rate": 9.405773868335384e-06, "loss": 0.3241, "num_input_tokens_seen": 56671776, "step": 26275 }, { "epoch": 4.822903285006423, "grad_norm": 5.045322418212891, "learning_rate": 9.405395191703212e-06, "loss": 0.3322, "num_input_tokens_seen": 56682464, "step": 26280 }, { "epoch": 4.823820884565976, "grad_norm": 9.275459289550781, "learning_rate": 9.40501640207948e-06, "loss": 0.3724, "num_input_tokens_seen": 56693024, "step": 26285 }, { "epoch": 4.824738484125527, "grad_norm": 5.61154842376709, "learning_rate": 9.404637499473907e-06, "loss": 0.3644, "num_input_tokens_seen": 56704192, "step": 26290 }, { "epoch": 4.82565608368508, "grad_norm": 2.910485029220581, "learning_rate": 9.404258483896208e-06, "loss": 0.3127, "num_input_tokens_seen": 56714304, "step": 26295 }, { "epoch": 4.826573683244632, "grad_norm": 4.8373260498046875, "learning_rate": 9.403879355356104e-06, "loss": 0.2322, "num_input_tokens_seen": 56724960, "step": 26300 }, { "epoch": 4.827491282804184, "grad_norm": 4.558058261871338, "learning_rate": 9.403500113863321e-06, "loss": 0.2565, "num_input_tokens_seen": 56735264, "step": 26305 }, { "epoch": 4.828408882363736, "grad_norm": 4.028566360473633, "learning_rate": 9.403120759427583e-06, "loss": 0.3014, "num_input_tokens_seen": 56744768, "step": 26310 }, { "epoch": 4.829326481923289, "grad_norm": 4.278326511383057, "learning_rate": 9.402741292058625e-06, "loss": 0.2871, "num_input_tokens_seen": 56756160, "step": 26315 }, { "epoch": 4.8302440814828405, "grad_norm": 7.1906561851501465, "learning_rate": 9.402361711766175e-06, "loss": 0.3163, "num_input_tokens_seen": 56766944, "step": 26320 }, { "epoch": 4.831161681042393, "grad_norm": 8.264822006225586, "learning_rate": 9.401982018559969e-06, "loss": 0.2397, "num_input_tokens_seen": 56778784, "step": 26325 }, { "epoch": 4.8320792806019455, "grad_norm": 4.78572940826416, "learning_rate": 9.401602212449748e-06, "loss": 0.2827, "num_input_tokens_seen": 56790432, "step": 26330 }, { "epoch": 4.832996880161497, "grad_norm": 2.3266992568969727, "learning_rate": 9.401222293445252e-06, "loss": 0.2322, "num_input_tokens_seen": 56800352, "step": 26335 }, { "epoch": 4.83391447972105, "grad_norm": 9.550124168395996, "learning_rate": 9.400842261556225e-06, "loss": 0.215, "num_input_tokens_seen": 56809536, "step": 26340 }, { "epoch": 4.834832079280602, "grad_norm": 2.5392673015594482, "learning_rate": 9.400462116792415e-06, "loss": 0.2294, "num_input_tokens_seen": 56821024, "step": 26345 }, { "epoch": 4.835749678840154, "grad_norm": 5.189125061035156, "learning_rate": 9.400081859163572e-06, "loss": 0.244, "num_input_tokens_seen": 56831456, "step": 26350 }, { "epoch": 4.836667278399706, "grad_norm": 7.4933037757873535, "learning_rate": 9.399701488679447e-06, "loss": 0.2517, "num_input_tokens_seen": 56841152, "step": 26355 }, { "epoch": 4.837584877959259, "grad_norm": 10.780450820922852, "learning_rate": 9.3993210053498e-06, "loss": 0.2597, "num_input_tokens_seen": 56851488, "step": 26360 }, { "epoch": 4.83850247751881, "grad_norm": 4.361547470092773, "learning_rate": 9.398940409184387e-06, "loss": 0.1593, "num_input_tokens_seen": 56861472, "step": 26365 }, { "epoch": 4.839420077078363, "grad_norm": 7.839821815490723, "learning_rate": 9.398559700192969e-06, "loss": 0.4182, "num_input_tokens_seen": 56871072, "step": 26370 }, { "epoch": 4.840337676637915, "grad_norm": 8.857744216918945, "learning_rate": 9.398178878385313e-06, "loss": 0.2138, "num_input_tokens_seen": 56883072, "step": 26375 }, { "epoch": 4.841255276197467, "grad_norm": 5.270936489105225, "learning_rate": 9.397797943771184e-06, "loss": 0.398, "num_input_tokens_seen": 56893376, "step": 26380 }, { "epoch": 4.8421728757570195, "grad_norm": 6.712385654449463, "learning_rate": 9.397416896360354e-06, "loss": 0.3258, "num_input_tokens_seen": 56904640, "step": 26385 }, { "epoch": 4.843090475316572, "grad_norm": 11.631603240966797, "learning_rate": 9.397035736162598e-06, "loss": 0.2956, "num_input_tokens_seen": 56915136, "step": 26390 }, { "epoch": 4.844008074876124, "grad_norm": 7.819456100463867, "learning_rate": 9.396654463187689e-06, "loss": 0.3331, "num_input_tokens_seen": 56926528, "step": 26395 }, { "epoch": 4.844925674435676, "grad_norm": 3.594017267227173, "learning_rate": 9.396273077445406e-06, "loss": 0.293, "num_input_tokens_seen": 56938688, "step": 26400 }, { "epoch": 4.845843273995229, "grad_norm": 2.9347355365753174, "learning_rate": 9.395891578945535e-06, "loss": 0.1783, "num_input_tokens_seen": 56949984, "step": 26405 }, { "epoch": 4.84676087355478, "grad_norm": 3.786672592163086, "learning_rate": 9.395509967697856e-06, "loss": 0.2561, "num_input_tokens_seen": 56961280, "step": 26410 }, { "epoch": 4.847678473114333, "grad_norm": 5.072165012359619, "learning_rate": 9.39512824371216e-06, "loss": 0.2391, "num_input_tokens_seen": 56971040, "step": 26415 }, { "epoch": 4.848596072673885, "grad_norm": 12.784424781799316, "learning_rate": 9.394746406998234e-06, "loss": 0.3401, "num_input_tokens_seen": 56980896, "step": 26420 }, { "epoch": 4.849513672233437, "grad_norm": 4.3277459144592285, "learning_rate": 9.394364457565876e-06, "loss": 0.3083, "num_input_tokens_seen": 56991424, "step": 26425 }, { "epoch": 4.850431271792989, "grad_norm": 12.442761421203613, "learning_rate": 9.39398239542488e-06, "loss": 0.2865, "num_input_tokens_seen": 57003072, "step": 26430 }, { "epoch": 4.851348871352542, "grad_norm": 8.16612720489502, "learning_rate": 9.393600220585044e-06, "loss": 0.2591, "num_input_tokens_seen": 57014272, "step": 26435 }, { "epoch": 4.8522664709120935, "grad_norm": 7.215559959411621, "learning_rate": 9.393217933056173e-06, "loss": 0.2381, "num_input_tokens_seen": 57024960, "step": 26440 }, { "epoch": 4.853184070471646, "grad_norm": 5.37131404876709, "learning_rate": 9.392835532848071e-06, "loss": 0.3051, "num_input_tokens_seen": 57036128, "step": 26445 }, { "epoch": 4.8541016700311985, "grad_norm": 3.3418638706207275, "learning_rate": 9.392453019970547e-06, "loss": 0.1789, "num_input_tokens_seen": 57047040, "step": 26450 }, { "epoch": 4.85501926959075, "grad_norm": 3.5338711738586426, "learning_rate": 9.392070394433408e-06, "loss": 0.2801, "num_input_tokens_seen": 57057664, "step": 26455 }, { "epoch": 4.855936869150303, "grad_norm": 5.024675369262695, "learning_rate": 9.391687656246473e-06, "loss": 0.271, "num_input_tokens_seen": 57068768, "step": 26460 }, { "epoch": 4.856854468709855, "grad_norm": 9.612539291381836, "learning_rate": 9.391304805419554e-06, "loss": 0.3092, "num_input_tokens_seen": 57079456, "step": 26465 }, { "epoch": 4.857772068269407, "grad_norm": 16.63933753967285, "learning_rate": 9.390921841962475e-06, "loss": 0.2856, "num_input_tokens_seen": 57089760, "step": 26470 }, { "epoch": 4.858689667828959, "grad_norm": 14.000500679016113, "learning_rate": 9.390538765885055e-06, "loss": 0.2639, "num_input_tokens_seen": 57100000, "step": 26475 }, { "epoch": 4.859607267388512, "grad_norm": 7.036573886871338, "learning_rate": 9.39015557719712e-06, "loss": 0.3951, "num_input_tokens_seen": 57109920, "step": 26480 }, { "epoch": 4.860524866948063, "grad_norm": 7.986185073852539, "learning_rate": 9.389772275908499e-06, "loss": 0.2229, "num_input_tokens_seen": 57119328, "step": 26485 }, { "epoch": 4.861442466507616, "grad_norm": 6.628552436828613, "learning_rate": 9.389388862029023e-06, "loss": 0.3298, "num_input_tokens_seen": 57128320, "step": 26490 }, { "epoch": 4.862360066067168, "grad_norm": 1.2207955121994019, "learning_rate": 9.389005335568524e-06, "loss": 0.2189, "num_input_tokens_seen": 57140640, "step": 26495 }, { "epoch": 4.86327766562672, "grad_norm": 9.081201553344727, "learning_rate": 9.388621696536842e-06, "loss": 0.3533, "num_input_tokens_seen": 57151936, "step": 26500 }, { "epoch": 4.8641952651862725, "grad_norm": 6.859666347503662, "learning_rate": 9.388237944943814e-06, "loss": 0.4363, "num_input_tokens_seen": 57163392, "step": 26505 }, { "epoch": 4.865112864745825, "grad_norm": 3.8399319648742676, "learning_rate": 9.387854080799286e-06, "loss": 0.2791, "num_input_tokens_seen": 57174784, "step": 26510 }, { "epoch": 4.866030464305377, "grad_norm": 1.935528039932251, "learning_rate": 9.3874701041131e-06, "loss": 0.1796, "num_input_tokens_seen": 57186176, "step": 26515 }, { "epoch": 4.866948063864929, "grad_norm": 25.886138916015625, "learning_rate": 9.387086014895105e-06, "loss": 0.4261, "num_input_tokens_seen": 57197312, "step": 26520 }, { "epoch": 4.867865663424482, "grad_norm": 3.9489176273345947, "learning_rate": 9.386701813155155e-06, "loss": 0.4339, "num_input_tokens_seen": 57207296, "step": 26525 }, { "epoch": 4.868783262984033, "grad_norm": 5.929258823394775, "learning_rate": 9.3863174989031e-06, "loss": 0.1858, "num_input_tokens_seen": 57217728, "step": 26530 }, { "epoch": 4.869700862543586, "grad_norm": 4.303304672241211, "learning_rate": 9.3859330721488e-06, "loss": 0.2548, "num_input_tokens_seen": 57228288, "step": 26535 }, { "epoch": 4.870618462103138, "grad_norm": 10.933419227600098, "learning_rate": 9.385548532902115e-06, "loss": 0.3133, "num_input_tokens_seen": 57239520, "step": 26540 }, { "epoch": 4.87153606166269, "grad_norm": 2.236231803894043, "learning_rate": 9.385163881172907e-06, "loss": 0.3613, "num_input_tokens_seen": 57250400, "step": 26545 }, { "epoch": 4.872453661222242, "grad_norm": 7.8156609535217285, "learning_rate": 9.384779116971042e-06, "loss": 0.339, "num_input_tokens_seen": 57260352, "step": 26550 }, { "epoch": 4.873371260781795, "grad_norm": 7.870728015899658, "learning_rate": 9.384394240306388e-06, "loss": 0.3281, "num_input_tokens_seen": 57271264, "step": 26555 }, { "epoch": 4.874288860341347, "grad_norm": 5.846837043762207, "learning_rate": 9.384009251188816e-06, "loss": 0.4285, "num_input_tokens_seen": 57279968, "step": 26560 }, { "epoch": 4.875206459900899, "grad_norm": 9.20400619506836, "learning_rate": 9.383624149628202e-06, "loss": 0.209, "num_input_tokens_seen": 57289344, "step": 26565 }, { "epoch": 4.876124059460452, "grad_norm": 1.403277039527893, "learning_rate": 9.383238935634424e-06, "loss": 0.3083, "num_input_tokens_seen": 57299776, "step": 26570 }, { "epoch": 4.877041659020003, "grad_norm": 21.15199851989746, "learning_rate": 9.38285360921736e-06, "loss": 0.1579, "num_input_tokens_seen": 57312000, "step": 26575 }, { "epoch": 4.877959258579556, "grad_norm": 8.372400283813477, "learning_rate": 9.382468170386894e-06, "loss": 0.2494, "num_input_tokens_seen": 57323008, "step": 26580 }, { "epoch": 4.878876858139108, "grad_norm": 1.224334478378296, "learning_rate": 9.38208261915291e-06, "loss": 0.282, "num_input_tokens_seen": 57333856, "step": 26585 }, { "epoch": 4.87979445769866, "grad_norm": 2.0005524158477783, "learning_rate": 9.3816969555253e-06, "loss": 0.4351, "num_input_tokens_seen": 57345056, "step": 26590 }, { "epoch": 4.880712057258212, "grad_norm": 20.25326156616211, "learning_rate": 9.381311179513954e-06, "loss": 0.2405, "num_input_tokens_seen": 57355264, "step": 26595 }, { "epoch": 4.881629656817765, "grad_norm": 14.41641902923584, "learning_rate": 9.380925291128767e-06, "loss": 0.2634, "num_input_tokens_seen": 57365888, "step": 26600 }, { "epoch": 4.8825472563773165, "grad_norm": 2.731623411178589, "learning_rate": 9.380539290379634e-06, "loss": 0.2193, "num_input_tokens_seen": 57377344, "step": 26605 }, { "epoch": 4.883464855936869, "grad_norm": 6.450376987457275, "learning_rate": 9.380153177276459e-06, "loss": 0.3545, "num_input_tokens_seen": 57388800, "step": 26610 }, { "epoch": 4.8843824554964215, "grad_norm": 2.8385438919067383, "learning_rate": 9.379766951829144e-06, "loss": 0.3124, "num_input_tokens_seen": 57398432, "step": 26615 }, { "epoch": 4.885300055055973, "grad_norm": 13.245796203613281, "learning_rate": 9.379380614047594e-06, "loss": 0.4604, "num_input_tokens_seen": 57409024, "step": 26620 }, { "epoch": 4.886217654615526, "grad_norm": 10.917572975158691, "learning_rate": 9.378994163941719e-06, "loss": 0.3795, "num_input_tokens_seen": 57419072, "step": 26625 }, { "epoch": 4.887135254175078, "grad_norm": 3.1700141429901123, "learning_rate": 9.37860760152143e-06, "loss": 0.2806, "num_input_tokens_seen": 57429984, "step": 26630 }, { "epoch": 4.88805285373463, "grad_norm": 4.768980979919434, "learning_rate": 9.378220926796641e-06, "loss": 0.3861, "num_input_tokens_seen": 57440672, "step": 26635 }, { "epoch": 4.888970453294182, "grad_norm": 5.4188032150268555, "learning_rate": 9.377834139777274e-06, "loss": 0.296, "num_input_tokens_seen": 57451936, "step": 26640 }, { "epoch": 4.889888052853735, "grad_norm": 2.449685573577881, "learning_rate": 9.377447240473245e-06, "loss": 0.2452, "num_input_tokens_seen": 57462368, "step": 26645 }, { "epoch": 4.890805652413286, "grad_norm": 4.6656646728515625, "learning_rate": 9.377060228894478e-06, "loss": 0.2536, "num_input_tokens_seen": 57472768, "step": 26650 }, { "epoch": 4.891723251972839, "grad_norm": 2.7125539779663086, "learning_rate": 9.3766731050509e-06, "loss": 0.2576, "num_input_tokens_seen": 57481792, "step": 26655 }, { "epoch": 4.892640851532391, "grad_norm": 9.712935447692871, "learning_rate": 9.376285868952441e-06, "loss": 0.2966, "num_input_tokens_seen": 57492992, "step": 26660 }, { "epoch": 4.893558451091943, "grad_norm": 6.9709882736206055, "learning_rate": 9.375898520609032e-06, "loss": 0.2351, "num_input_tokens_seen": 57503872, "step": 26665 }, { "epoch": 4.8944760506514955, "grad_norm": 5.284501075744629, "learning_rate": 9.375511060030606e-06, "loss": 0.178, "num_input_tokens_seen": 57514752, "step": 26670 }, { "epoch": 4.895393650211048, "grad_norm": 1.8454331159591675, "learning_rate": 9.375123487227104e-06, "loss": 0.2412, "num_input_tokens_seen": 57525664, "step": 26675 }, { "epoch": 4.8963112497706, "grad_norm": 10.084166526794434, "learning_rate": 9.374735802208468e-06, "loss": 0.3008, "num_input_tokens_seen": 57535424, "step": 26680 }, { "epoch": 4.897228849330152, "grad_norm": 3.9300849437713623, "learning_rate": 9.374348004984635e-06, "loss": 0.276, "num_input_tokens_seen": 57546304, "step": 26685 }, { "epoch": 4.898146448889705, "grad_norm": 7.9368767738342285, "learning_rate": 9.373960095565558e-06, "loss": 0.2707, "num_input_tokens_seen": 57557824, "step": 26690 }, { "epoch": 4.899064048449256, "grad_norm": 4.683889389038086, "learning_rate": 9.373572073961182e-06, "loss": 0.2289, "num_input_tokens_seen": 57567872, "step": 26695 }, { "epoch": 4.899981648008809, "grad_norm": 5.369232654571533, "learning_rate": 9.37318394018146e-06, "loss": 0.2474, "num_input_tokens_seen": 57578720, "step": 26700 }, { "epoch": 4.900899247568361, "grad_norm": 11.577917098999023, "learning_rate": 9.37279569423635e-06, "loss": 0.2311, "num_input_tokens_seen": 57589248, "step": 26705 }, { "epoch": 4.901816847127913, "grad_norm": 8.877406120300293, "learning_rate": 9.372407336135807e-06, "loss": 0.366, "num_input_tokens_seen": 57600512, "step": 26710 }, { "epoch": 4.902734446687465, "grad_norm": 6.831262588500977, "learning_rate": 9.372018865889792e-06, "loss": 0.3056, "num_input_tokens_seen": 57611744, "step": 26715 }, { "epoch": 4.903652046247018, "grad_norm": 7.134085178375244, "learning_rate": 9.371630283508269e-06, "loss": 0.2373, "num_input_tokens_seen": 57622240, "step": 26720 }, { "epoch": 4.90456964580657, "grad_norm": 2.210218906402588, "learning_rate": 9.371241589001205e-06, "loss": 0.2339, "num_input_tokens_seen": 57632864, "step": 26725 }, { "epoch": 4.905487245366122, "grad_norm": 10.647278785705566, "learning_rate": 9.370852782378567e-06, "loss": 0.2557, "num_input_tokens_seen": 57642912, "step": 26730 }, { "epoch": 4.9064048449256745, "grad_norm": 7.181154727935791, "learning_rate": 9.370463863650333e-06, "loss": 0.2009, "num_input_tokens_seen": 57653312, "step": 26735 }, { "epoch": 4.907322444485227, "grad_norm": 2.66145920753479, "learning_rate": 9.370074832826473e-06, "loss": 0.1297, "num_input_tokens_seen": 57663104, "step": 26740 }, { "epoch": 4.908240044044779, "grad_norm": 0.5430107116699219, "learning_rate": 9.369685689916965e-06, "loss": 0.1223, "num_input_tokens_seen": 57674752, "step": 26745 }, { "epoch": 4.909157643604331, "grad_norm": 4.688313961029053, "learning_rate": 9.369296434931794e-06, "loss": 0.2764, "num_input_tokens_seen": 57685216, "step": 26750 }, { "epoch": 4.910075243163884, "grad_norm": 4.23723840713501, "learning_rate": 9.36890706788094e-06, "loss": 0.388, "num_input_tokens_seen": 57696768, "step": 26755 }, { "epoch": 4.910992842723435, "grad_norm": 7.405388832092285, "learning_rate": 9.36851758877439e-06, "loss": 0.2226, "num_input_tokens_seen": 57707392, "step": 26760 }, { "epoch": 4.911910442282988, "grad_norm": 3.7746877670288086, "learning_rate": 9.368127997622135e-06, "loss": 0.2237, "num_input_tokens_seen": 57718400, "step": 26765 }, { "epoch": 4.91282804184254, "grad_norm": 5.886713981628418, "learning_rate": 9.367738294434167e-06, "loss": 0.3882, "num_input_tokens_seen": 57729376, "step": 26770 }, { "epoch": 4.913745641402092, "grad_norm": 2.7145678997039795, "learning_rate": 9.367348479220481e-06, "loss": 0.3635, "num_input_tokens_seen": 57741152, "step": 26775 }, { "epoch": 4.914663240961644, "grad_norm": 0.7121067643165588, "learning_rate": 9.366958551991077e-06, "loss": 0.2298, "num_input_tokens_seen": 57751968, "step": 26780 }, { "epoch": 4.915580840521197, "grad_norm": 1.0882182121276855, "learning_rate": 9.366568512755952e-06, "loss": 0.2403, "num_input_tokens_seen": 57762816, "step": 26785 }, { "epoch": 4.9164984400807485, "grad_norm": 14.14788818359375, "learning_rate": 9.366178361525114e-06, "loss": 0.4589, "num_input_tokens_seen": 57773568, "step": 26790 }, { "epoch": 4.917416039640301, "grad_norm": 10.061004638671875, "learning_rate": 9.365788098308566e-06, "loss": 0.3893, "num_input_tokens_seen": 57783360, "step": 26795 }, { "epoch": 4.918333639199854, "grad_norm": 1.6304103136062622, "learning_rate": 9.36539772311632e-06, "loss": 0.248, "num_input_tokens_seen": 57794400, "step": 26800 }, { "epoch": 4.919251238759405, "grad_norm": 10.630772590637207, "learning_rate": 9.36500723595839e-06, "loss": 0.3124, "num_input_tokens_seen": 57805248, "step": 26805 }, { "epoch": 4.920168838318958, "grad_norm": 3.7302234172821045, "learning_rate": 9.364616636844788e-06, "loss": 0.1487, "num_input_tokens_seen": 57814848, "step": 26810 }, { "epoch": 4.92108643787851, "grad_norm": 4.795755863189697, "learning_rate": 9.364225925785533e-06, "loss": 0.3122, "num_input_tokens_seen": 57825184, "step": 26815 }, { "epoch": 4.922004037438062, "grad_norm": 9.245687484741211, "learning_rate": 9.363835102790649e-06, "loss": 0.2218, "num_input_tokens_seen": 57836480, "step": 26820 }, { "epoch": 4.922921636997614, "grad_norm": 6.654991149902344, "learning_rate": 9.363444167870158e-06, "loss": 0.2864, "num_input_tokens_seen": 57847264, "step": 26825 }, { "epoch": 4.923839236557167, "grad_norm": 9.974323272705078, "learning_rate": 9.363053121034084e-06, "loss": 0.2228, "num_input_tokens_seen": 57857024, "step": 26830 }, { "epoch": 4.924756836116718, "grad_norm": 0.942800760269165, "learning_rate": 9.362661962292464e-06, "loss": 0.2057, "num_input_tokens_seen": 57868032, "step": 26835 }, { "epoch": 4.925674435676271, "grad_norm": 8.098884582519531, "learning_rate": 9.362270691655322e-06, "loss": 0.2247, "num_input_tokens_seen": 57879072, "step": 26840 }, { "epoch": 4.9265920352358235, "grad_norm": 6.20599365234375, "learning_rate": 9.361879309132699e-06, "loss": 0.2391, "num_input_tokens_seen": 57889568, "step": 26845 }, { "epoch": 4.927509634795375, "grad_norm": 3.939537763595581, "learning_rate": 9.361487814734633e-06, "loss": 0.2557, "num_input_tokens_seen": 57901152, "step": 26850 }, { "epoch": 4.928427234354928, "grad_norm": 21.861371994018555, "learning_rate": 9.361096208471163e-06, "loss": 0.3098, "num_input_tokens_seen": 57911776, "step": 26855 }, { "epoch": 4.92934483391448, "grad_norm": 13.683601379394531, "learning_rate": 9.360704490352334e-06, "loss": 0.4664, "num_input_tokens_seen": 57922208, "step": 26860 }, { "epoch": 4.930262433474032, "grad_norm": 10.975329399108887, "learning_rate": 9.360312660388196e-06, "loss": 0.3243, "num_input_tokens_seen": 57933632, "step": 26865 }, { "epoch": 4.931180033033584, "grad_norm": 9.34784984588623, "learning_rate": 9.359920718588793e-06, "loss": 0.2366, "num_input_tokens_seen": 57944928, "step": 26870 }, { "epoch": 4.932097632593137, "grad_norm": 2.8269801139831543, "learning_rate": 9.359528664964183e-06, "loss": 0.2097, "num_input_tokens_seen": 57956000, "step": 26875 }, { "epoch": 4.933015232152688, "grad_norm": 11.371831893920898, "learning_rate": 9.359136499524418e-06, "loss": 0.2211, "num_input_tokens_seen": 57965952, "step": 26880 }, { "epoch": 4.933932831712241, "grad_norm": 9.159039497375488, "learning_rate": 9.35874422227956e-06, "loss": 0.2969, "num_input_tokens_seen": 57976736, "step": 26885 }, { "epoch": 4.934850431271793, "grad_norm": 0.4904453754425049, "learning_rate": 9.358351833239666e-06, "loss": 0.231, "num_input_tokens_seen": 57987776, "step": 26890 }, { "epoch": 4.935768030831345, "grad_norm": 2.534194231033325, "learning_rate": 9.357959332414803e-06, "loss": 0.3477, "num_input_tokens_seen": 57998912, "step": 26895 }, { "epoch": 4.9366856303908975, "grad_norm": 14.125435829162598, "learning_rate": 9.357566719815036e-06, "loss": 0.4161, "num_input_tokens_seen": 58010240, "step": 26900 }, { "epoch": 4.93760322995045, "grad_norm": 5.991829872131348, "learning_rate": 9.357173995450438e-06, "loss": 0.2322, "num_input_tokens_seen": 58022944, "step": 26905 }, { "epoch": 4.938520829510002, "grad_norm": 6.86874532699585, "learning_rate": 9.356781159331078e-06, "loss": 0.271, "num_input_tokens_seen": 58033312, "step": 26910 }, { "epoch": 4.939438429069554, "grad_norm": 8.50064754486084, "learning_rate": 9.356388211467037e-06, "loss": 0.3116, "num_input_tokens_seen": 58045120, "step": 26915 }, { "epoch": 4.940356028629107, "grad_norm": 6.805362224578857, "learning_rate": 9.355995151868387e-06, "loss": 0.2103, "num_input_tokens_seen": 58056000, "step": 26920 }, { "epoch": 4.941273628188658, "grad_norm": 21.53473663330078, "learning_rate": 9.355601980545215e-06, "loss": 0.3212, "num_input_tokens_seen": 58066816, "step": 26925 }, { "epoch": 4.942191227748211, "grad_norm": 5.298315525054932, "learning_rate": 9.3552086975076e-06, "loss": 0.2536, "num_input_tokens_seen": 58077856, "step": 26930 }, { "epoch": 4.943108827307763, "grad_norm": 1.4760594367980957, "learning_rate": 9.354815302765634e-06, "loss": 0.3217, "num_input_tokens_seen": 58089280, "step": 26935 }, { "epoch": 4.944026426867315, "grad_norm": 12.842513084411621, "learning_rate": 9.354421796329405e-06, "loss": 0.1539, "num_input_tokens_seen": 58100352, "step": 26940 }, { "epoch": 4.944944026426867, "grad_norm": 3.409444808959961, "learning_rate": 9.354028178209005e-06, "loss": 0.2475, "num_input_tokens_seen": 58110400, "step": 26945 }, { "epoch": 4.94586162598642, "grad_norm": 14.304766654968262, "learning_rate": 9.35363444841453e-06, "loss": 0.386, "num_input_tokens_seen": 58121856, "step": 26950 }, { "epoch": 4.9467792255459715, "grad_norm": 20.057703018188477, "learning_rate": 9.35324060695608e-06, "loss": 0.3828, "num_input_tokens_seen": 58134080, "step": 26955 }, { "epoch": 4.947696825105524, "grad_norm": 4.956549644470215, "learning_rate": 9.352846653843755e-06, "loss": 0.2468, "num_input_tokens_seen": 58143680, "step": 26960 }, { "epoch": 4.9486144246650765, "grad_norm": 9.568831443786621, "learning_rate": 9.352452589087658e-06, "loss": 0.3152, "num_input_tokens_seen": 58154880, "step": 26965 }, { "epoch": 4.949532024224628, "grad_norm": 4.0602521896362305, "learning_rate": 9.3520584126979e-06, "loss": 0.4101, "num_input_tokens_seen": 58166464, "step": 26970 }, { "epoch": 4.950449623784181, "grad_norm": 3.5041890144348145, "learning_rate": 9.351664124684587e-06, "loss": 0.307, "num_input_tokens_seen": 58178208, "step": 26975 }, { "epoch": 4.951367223343733, "grad_norm": 5.554853916168213, "learning_rate": 9.351269725057834e-06, "loss": 0.3228, "num_input_tokens_seen": 58189920, "step": 26980 }, { "epoch": 4.952284822903285, "grad_norm": 7.651604175567627, "learning_rate": 9.350875213827757e-06, "loss": 0.4367, "num_input_tokens_seen": 58199712, "step": 26985 }, { "epoch": 4.953202422462837, "grad_norm": 3.1867594718933105, "learning_rate": 9.350480591004474e-06, "loss": 0.2632, "num_input_tokens_seen": 58211424, "step": 26990 }, { "epoch": 4.95412002202239, "grad_norm": 5.563750267028809, "learning_rate": 9.350085856598107e-06, "loss": 0.2284, "num_input_tokens_seen": 58223104, "step": 26995 }, { "epoch": 4.955037621581941, "grad_norm": 7.698718070983887, "learning_rate": 9.34969101061878e-06, "loss": 0.3825, "num_input_tokens_seen": 58235200, "step": 27000 }, { "epoch": 4.955955221141494, "grad_norm": 5.579156875610352, "learning_rate": 9.349296053076617e-06, "loss": 0.2771, "num_input_tokens_seen": 58245600, "step": 27005 }, { "epoch": 4.956872820701046, "grad_norm": 6.22976541519165, "learning_rate": 9.348900983981754e-06, "loss": 0.2367, "num_input_tokens_seen": 58256416, "step": 27010 }, { "epoch": 4.957790420260598, "grad_norm": 2.8523077964782715, "learning_rate": 9.348505803344322e-06, "loss": 0.2648, "num_input_tokens_seen": 58267840, "step": 27015 }, { "epoch": 4.9587080198201505, "grad_norm": 6.14574670791626, "learning_rate": 9.348110511174453e-06, "loss": 0.2969, "num_input_tokens_seen": 58279168, "step": 27020 }, { "epoch": 4.959625619379703, "grad_norm": 4.471270561218262, "learning_rate": 9.347715107482289e-06, "loss": 0.3, "num_input_tokens_seen": 58289664, "step": 27025 }, { "epoch": 4.960543218939255, "grad_norm": 8.437618255615234, "learning_rate": 9.347319592277971e-06, "loss": 0.4068, "num_input_tokens_seen": 58300352, "step": 27030 }, { "epoch": 4.961460818498807, "grad_norm": 1.553253173828125, "learning_rate": 9.346923965571644e-06, "loss": 0.2647, "num_input_tokens_seen": 58310976, "step": 27035 }, { "epoch": 4.96237841805836, "grad_norm": 3.8523874282836914, "learning_rate": 9.346528227373453e-06, "loss": 0.319, "num_input_tokens_seen": 58322784, "step": 27040 }, { "epoch": 4.963296017617911, "grad_norm": 3.079205274581909, "learning_rate": 9.346132377693549e-06, "loss": 0.2987, "num_input_tokens_seen": 58334624, "step": 27045 }, { "epoch": 4.964213617177464, "grad_norm": 3.4878246784210205, "learning_rate": 9.345736416542087e-06, "loss": 0.3593, "num_input_tokens_seen": 58343680, "step": 27050 }, { "epoch": 4.965131216737016, "grad_norm": 5.296670436859131, "learning_rate": 9.345340343929222e-06, "loss": 0.1988, "num_input_tokens_seen": 58354752, "step": 27055 }, { "epoch": 4.966048816296568, "grad_norm": 3.4076895713806152, "learning_rate": 9.34494415986511e-06, "loss": 0.3432, "num_input_tokens_seen": 58364256, "step": 27060 }, { "epoch": 4.96696641585612, "grad_norm": 1.8607181310653687, "learning_rate": 9.344547864359915e-06, "loss": 0.2077, "num_input_tokens_seen": 58374272, "step": 27065 }, { "epoch": 4.967884015415673, "grad_norm": 1.9024479389190674, "learning_rate": 9.344151457423801e-06, "loss": 0.2332, "num_input_tokens_seen": 58385344, "step": 27070 }, { "epoch": 4.9688016149752245, "grad_norm": 9.677631378173828, "learning_rate": 9.343754939066934e-06, "loss": 0.2429, "num_input_tokens_seen": 58396384, "step": 27075 }, { "epoch": 4.969719214534777, "grad_norm": 1.1159553527832031, "learning_rate": 9.343358309299484e-06, "loss": 0.2949, "num_input_tokens_seen": 58406976, "step": 27080 }, { "epoch": 4.97063681409433, "grad_norm": 1.3909474611282349, "learning_rate": 9.342961568131627e-06, "loss": 0.2148, "num_input_tokens_seen": 58418432, "step": 27085 }, { "epoch": 4.971554413653881, "grad_norm": 2.858712911605835, "learning_rate": 9.342564715573536e-06, "loss": 0.4119, "num_input_tokens_seen": 58429120, "step": 27090 }, { "epoch": 4.972472013213434, "grad_norm": 16.1066951751709, "learning_rate": 9.342167751635392e-06, "loss": 0.4974, "num_input_tokens_seen": 58439936, "step": 27095 }, { "epoch": 4.973389612772986, "grad_norm": 8.11895751953125, "learning_rate": 9.341770676327372e-06, "loss": 0.5114, "num_input_tokens_seen": 58450752, "step": 27100 }, { "epoch": 4.974307212332538, "grad_norm": 2.5969250202178955, "learning_rate": 9.341373489659667e-06, "loss": 0.2303, "num_input_tokens_seen": 58462432, "step": 27105 }, { "epoch": 4.97522481189209, "grad_norm": 8.099502563476562, "learning_rate": 9.340976191642458e-06, "loss": 0.2839, "num_input_tokens_seen": 58473824, "step": 27110 }, { "epoch": 4.976142411451643, "grad_norm": 10.537073135375977, "learning_rate": 9.340578782285938e-06, "loss": 0.2512, "num_input_tokens_seen": 58484032, "step": 27115 }, { "epoch": 4.977060011011194, "grad_norm": 16.07234764099121, "learning_rate": 9.3401812616003e-06, "loss": 0.2932, "num_input_tokens_seen": 58494720, "step": 27120 }, { "epoch": 4.977977610570747, "grad_norm": 5.970976829528809, "learning_rate": 9.33978362959574e-06, "loss": 0.3758, "num_input_tokens_seen": 58504576, "step": 27125 }, { "epoch": 4.9788952101302995, "grad_norm": 5.359224319458008, "learning_rate": 9.339385886282453e-06, "loss": 0.3036, "num_input_tokens_seen": 58516672, "step": 27130 }, { "epoch": 4.979812809689851, "grad_norm": 5.575088977813721, "learning_rate": 9.338988031670645e-06, "loss": 0.1926, "num_input_tokens_seen": 58528224, "step": 27135 }, { "epoch": 4.980730409249404, "grad_norm": 6.2354207038879395, "learning_rate": 9.33859006577052e-06, "loss": 0.2613, "num_input_tokens_seen": 58538496, "step": 27140 }, { "epoch": 4.981648008808956, "grad_norm": 6.368210315704346, "learning_rate": 9.338191988592282e-06, "loss": 0.375, "num_input_tokens_seen": 58549312, "step": 27145 }, { "epoch": 4.982565608368508, "grad_norm": 5.321725368499756, "learning_rate": 9.337793800146145e-06, "loss": 0.2333, "num_input_tokens_seen": 58560480, "step": 27150 }, { "epoch": 4.98348320792806, "grad_norm": 2.193474292755127, "learning_rate": 9.33739550044232e-06, "loss": 0.2943, "num_input_tokens_seen": 58571584, "step": 27155 }, { "epoch": 4.984400807487613, "grad_norm": 3.5984385013580322, "learning_rate": 9.33699708949102e-06, "loss": 0.3541, "num_input_tokens_seen": 58583360, "step": 27160 }, { "epoch": 4.985318407047164, "grad_norm": 1.413889765739441, "learning_rate": 9.336598567302469e-06, "loss": 0.2267, "num_input_tokens_seen": 58594368, "step": 27165 }, { "epoch": 4.986236006606717, "grad_norm": 9.710661888122559, "learning_rate": 9.336199933886885e-06, "loss": 0.2974, "num_input_tokens_seen": 58605408, "step": 27170 }, { "epoch": 4.987153606166269, "grad_norm": 1.179306983947754, "learning_rate": 9.335801189254495e-06, "loss": 0.2985, "num_input_tokens_seen": 58615488, "step": 27175 }, { "epoch": 4.988071205725821, "grad_norm": 2.211768865585327, "learning_rate": 9.335402333415522e-06, "loss": 0.2748, "num_input_tokens_seen": 58626976, "step": 27180 }, { "epoch": 4.9889888052853735, "grad_norm": 3.0950210094451904, "learning_rate": 9.3350033663802e-06, "loss": 0.211, "num_input_tokens_seen": 58635776, "step": 27185 }, { "epoch": 4.989906404844926, "grad_norm": 7.746952533721924, "learning_rate": 9.33460428815876e-06, "loss": 0.2105, "num_input_tokens_seen": 58646656, "step": 27190 }, { "epoch": 4.990824004404478, "grad_norm": 2.9023497104644775, "learning_rate": 9.334205098761436e-06, "loss": 0.3915, "num_input_tokens_seen": 58656672, "step": 27195 }, { "epoch": 4.99174160396403, "grad_norm": 3.9023313522338867, "learning_rate": 9.33380579819847e-06, "loss": 0.1734, "num_input_tokens_seen": 58666880, "step": 27200 }, { "epoch": 4.992659203523583, "grad_norm": 3.1952929496765137, "learning_rate": 9.333406386480103e-06, "loss": 0.4909, "num_input_tokens_seen": 58677792, "step": 27205 }, { "epoch": 4.993576803083134, "grad_norm": 6.793649673461914, "learning_rate": 9.333006863616577e-06, "loss": 0.3696, "num_input_tokens_seen": 58688928, "step": 27210 }, { "epoch": 4.994494402642687, "grad_norm": 0.7637962698936462, "learning_rate": 9.332607229618142e-06, "loss": 0.1908, "num_input_tokens_seen": 58700320, "step": 27215 }, { "epoch": 4.995412002202239, "grad_norm": 3.274925708770752, "learning_rate": 9.332207484495046e-06, "loss": 0.2502, "num_input_tokens_seen": 58710304, "step": 27220 }, { "epoch": 4.996329601761791, "grad_norm": 10.236943244934082, "learning_rate": 9.33180762825754e-06, "loss": 0.3428, "num_input_tokens_seen": 58721504, "step": 27225 }, { "epoch": 4.997247201321343, "grad_norm": 5.223082065582275, "learning_rate": 9.331407660915886e-06, "loss": 0.3568, "num_input_tokens_seen": 58731040, "step": 27230 }, { "epoch": 4.998164800880896, "grad_norm": 4.181958198547363, "learning_rate": 9.331007582480336e-06, "loss": 0.301, "num_input_tokens_seen": 58741280, "step": 27235 }, { "epoch": 4.9990824004404475, "grad_norm": 0.9840342402458191, "learning_rate": 9.330607392961153e-06, "loss": 0.1559, "num_input_tokens_seen": 58752352, "step": 27240 }, { "epoch": 5.0, "grad_norm": 3.9441299438476562, "learning_rate": 9.330207092368604e-06, "loss": 0.3376, "num_input_tokens_seen": 58762464, "step": 27245 }, { "epoch": 5.0009175995595525, "grad_norm": 2.568679094314575, "learning_rate": 9.329806680712954e-06, "loss": 0.278, "num_input_tokens_seen": 58773632, "step": 27250 }, { "epoch": 5.001835199119104, "grad_norm": 1.6202962398529053, "learning_rate": 9.329406158004473e-06, "loss": 0.2205, "num_input_tokens_seen": 58784032, "step": 27255 }, { "epoch": 5.002752798678657, "grad_norm": 4.308501720428467, "learning_rate": 9.329005524253435e-06, "loss": 0.3022, "num_input_tokens_seen": 58794432, "step": 27260 }, { "epoch": 5.003670398238209, "grad_norm": 12.693188667297363, "learning_rate": 9.328604779470115e-06, "loss": 0.1977, "num_input_tokens_seen": 58804320, "step": 27265 }, { "epoch": 5.004587997797761, "grad_norm": 4.6389031410217285, "learning_rate": 9.328203923664789e-06, "loss": 0.1281, "num_input_tokens_seen": 58815008, "step": 27270 }, { "epoch": 5.005505597357313, "grad_norm": 7.5408124923706055, "learning_rate": 9.327802956847741e-06, "loss": 0.2355, "num_input_tokens_seen": 58826144, "step": 27275 }, { "epoch": 5.006423196916866, "grad_norm": 1.1789332628250122, "learning_rate": 9.327401879029257e-06, "loss": 0.371, "num_input_tokens_seen": 58836256, "step": 27280 }, { "epoch": 5.007340796476417, "grad_norm": 7.584331035614014, "learning_rate": 9.327000690219619e-06, "loss": 0.3638, "num_input_tokens_seen": 58847744, "step": 27285 }, { "epoch": 5.00825839603597, "grad_norm": 12.367388725280762, "learning_rate": 9.326599390429119e-06, "loss": 0.2259, "num_input_tokens_seen": 58859840, "step": 27290 }, { "epoch": 5.009175995595522, "grad_norm": 1.5974730253219604, "learning_rate": 9.326197979668052e-06, "loss": 0.1666, "num_input_tokens_seen": 58870720, "step": 27295 }, { "epoch": 5.010093595155074, "grad_norm": 2.831760883331299, "learning_rate": 9.325796457946712e-06, "loss": 0.345, "num_input_tokens_seen": 58881824, "step": 27300 }, { "epoch": 5.0110111947146265, "grad_norm": 5.377204895019531, "learning_rate": 9.325394825275396e-06, "loss": 0.2307, "num_input_tokens_seen": 58892992, "step": 27305 }, { "epoch": 5.011928794274179, "grad_norm": 8.517900466918945, "learning_rate": 9.324993081664407e-06, "loss": 0.3835, "num_input_tokens_seen": 58904544, "step": 27310 }, { "epoch": 5.012846393833731, "grad_norm": 3.272080421447754, "learning_rate": 9.324591227124049e-06, "loss": 0.44, "num_input_tokens_seen": 58915168, "step": 27315 }, { "epoch": 5.013763993393283, "grad_norm": 1.9221386909484863, "learning_rate": 9.32418926166463e-06, "loss": 0.1635, "num_input_tokens_seen": 58925728, "step": 27320 }, { "epoch": 5.014681592952836, "grad_norm": 8.81533145904541, "learning_rate": 9.323787185296456e-06, "loss": 0.3077, "num_input_tokens_seen": 58936416, "step": 27325 }, { "epoch": 5.015599192512387, "grad_norm": 5.458537578582764, "learning_rate": 9.323384998029842e-06, "loss": 0.1774, "num_input_tokens_seen": 58948192, "step": 27330 }, { "epoch": 5.01651679207194, "grad_norm": 9.66301155090332, "learning_rate": 9.322982699875104e-06, "loss": 0.2461, "num_input_tokens_seen": 58959840, "step": 27335 }, { "epoch": 5.017434391631492, "grad_norm": 5.155001640319824, "learning_rate": 9.32258029084256e-06, "loss": 0.4268, "num_input_tokens_seen": 58969984, "step": 27340 }, { "epoch": 5.018351991191044, "grad_norm": 5.174427032470703, "learning_rate": 9.322177770942532e-06, "loss": 0.2263, "num_input_tokens_seen": 58981856, "step": 27345 }, { "epoch": 5.019269590750596, "grad_norm": 7.550772666931152, "learning_rate": 9.32177514018534e-06, "loss": 0.3235, "num_input_tokens_seen": 58992320, "step": 27350 }, { "epoch": 5.020187190310149, "grad_norm": 10.844990730285645, "learning_rate": 9.321372398581315e-06, "loss": 0.1773, "num_input_tokens_seen": 59003776, "step": 27355 }, { "epoch": 5.0211047898697005, "grad_norm": 7.5273756980896, "learning_rate": 9.320969546140786e-06, "loss": 0.3628, "num_input_tokens_seen": 59013856, "step": 27360 }, { "epoch": 5.022022389429253, "grad_norm": 3.72009539604187, "learning_rate": 9.320566582874085e-06, "loss": 0.2077, "num_input_tokens_seen": 59025120, "step": 27365 }, { "epoch": 5.022939988988806, "grad_norm": 4.572055816650391, "learning_rate": 9.320163508791546e-06, "loss": 0.235, "num_input_tokens_seen": 59035648, "step": 27370 }, { "epoch": 5.023857588548357, "grad_norm": 4.765300750732422, "learning_rate": 9.319760323903511e-06, "loss": 0.2036, "num_input_tokens_seen": 59046272, "step": 27375 }, { "epoch": 5.02477518810791, "grad_norm": 7.584940433502197, "learning_rate": 9.319357028220319e-06, "loss": 0.3864, "num_input_tokens_seen": 59058016, "step": 27380 }, { "epoch": 5.025692787667462, "grad_norm": 7.258271217346191, "learning_rate": 9.318953621752312e-06, "loss": 0.2734, "num_input_tokens_seen": 59069344, "step": 27385 }, { "epoch": 5.026610387227014, "grad_norm": 4.499054908752441, "learning_rate": 9.318550104509838e-06, "loss": 0.2203, "num_input_tokens_seen": 59077920, "step": 27390 }, { "epoch": 5.027527986786566, "grad_norm": 12.987319946289062, "learning_rate": 9.318146476503249e-06, "loss": 0.3428, "num_input_tokens_seen": 59090368, "step": 27395 }, { "epoch": 5.028445586346119, "grad_norm": 6.301766395568848, "learning_rate": 9.317742737742894e-06, "loss": 0.1465, "num_input_tokens_seen": 59100736, "step": 27400 }, { "epoch": 5.02936318590567, "grad_norm": 0.7829353213310242, "learning_rate": 9.317338888239129e-06, "loss": 0.0988, "num_input_tokens_seen": 59109056, "step": 27405 }, { "epoch": 5.030280785465223, "grad_norm": 6.73265266418457, "learning_rate": 9.316934928002313e-06, "loss": 0.2703, "num_input_tokens_seen": 59120096, "step": 27410 }, { "epoch": 5.0311983850247755, "grad_norm": 15.760417938232422, "learning_rate": 9.316530857042807e-06, "loss": 0.4174, "num_input_tokens_seen": 59129248, "step": 27415 }, { "epoch": 5.032115984584327, "grad_norm": 0.661227285861969, "learning_rate": 9.316126675370975e-06, "loss": 0.305, "num_input_tokens_seen": 59138912, "step": 27420 }, { "epoch": 5.03303358414388, "grad_norm": 12.604962348937988, "learning_rate": 9.315722382997184e-06, "loss": 0.2749, "num_input_tokens_seen": 59149792, "step": 27425 }, { "epoch": 5.033951183703432, "grad_norm": 8.269655227661133, "learning_rate": 9.315317979931802e-06, "loss": 0.372, "num_input_tokens_seen": 59160832, "step": 27430 }, { "epoch": 5.034868783262984, "grad_norm": 4.5486626625061035, "learning_rate": 9.314913466185201e-06, "loss": 0.3768, "num_input_tokens_seen": 59171200, "step": 27435 }, { "epoch": 5.035786382822536, "grad_norm": 1.3726568222045898, "learning_rate": 9.314508841767757e-06, "loss": 0.2607, "num_input_tokens_seen": 59182016, "step": 27440 }, { "epoch": 5.036703982382089, "grad_norm": 2.1299033164978027, "learning_rate": 9.31410410668985e-06, "loss": 0.2427, "num_input_tokens_seen": 59193184, "step": 27445 }, { "epoch": 5.03762158194164, "grad_norm": 2.069692611694336, "learning_rate": 9.313699260961856e-06, "loss": 0.2842, "num_input_tokens_seen": 59202976, "step": 27450 }, { "epoch": 5.038539181501193, "grad_norm": 6.223118305206299, "learning_rate": 9.313294304594164e-06, "loss": 0.2552, "num_input_tokens_seen": 59213344, "step": 27455 }, { "epoch": 5.039456781060745, "grad_norm": 5.0909318923950195, "learning_rate": 9.312889237597158e-06, "loss": 0.3513, "num_input_tokens_seen": 59223776, "step": 27460 }, { "epoch": 5.040374380620297, "grad_norm": 0.9614860415458679, "learning_rate": 9.312484059981226e-06, "loss": 0.1466, "num_input_tokens_seen": 59235200, "step": 27465 }, { "epoch": 5.0412919801798495, "grad_norm": 6.091808319091797, "learning_rate": 9.312078771756763e-06, "loss": 0.2999, "num_input_tokens_seen": 59245664, "step": 27470 }, { "epoch": 5.042209579739402, "grad_norm": 4.245639801025391, "learning_rate": 9.311673372934162e-06, "loss": 0.2744, "num_input_tokens_seen": 59257376, "step": 27475 }, { "epoch": 5.043127179298954, "grad_norm": 7.982977390289307, "learning_rate": 9.311267863523821e-06, "loss": 0.226, "num_input_tokens_seen": 59269056, "step": 27480 }, { "epoch": 5.044044778858506, "grad_norm": 4.165440082550049, "learning_rate": 9.310862243536142e-06, "loss": 0.1064, "num_input_tokens_seen": 59279936, "step": 27485 }, { "epoch": 5.044962378418059, "grad_norm": 5.001402854919434, "learning_rate": 9.310456512981526e-06, "loss": 0.3087, "num_input_tokens_seen": 59290528, "step": 27490 }, { "epoch": 5.04587997797761, "grad_norm": 1.4609415531158447, "learning_rate": 9.310050671870383e-06, "loss": 0.2093, "num_input_tokens_seen": 59301504, "step": 27495 }, { "epoch": 5.046797577537163, "grad_norm": 12.88697624206543, "learning_rate": 9.30964472021312e-06, "loss": 0.3901, "num_input_tokens_seen": 59312416, "step": 27500 }, { "epoch": 5.047715177096715, "grad_norm": 8.257553100585938, "learning_rate": 9.309238658020148e-06, "loss": 0.1721, "num_input_tokens_seen": 59323488, "step": 27505 }, { "epoch": 5.048632776656267, "grad_norm": 7.941009521484375, "learning_rate": 9.308832485301885e-06, "loss": 0.3947, "num_input_tokens_seen": 59332864, "step": 27510 }, { "epoch": 5.049550376215819, "grad_norm": 4.846401214599609, "learning_rate": 9.308426202068746e-06, "loss": 0.3122, "num_input_tokens_seen": 59344224, "step": 27515 }, { "epoch": 5.050467975775372, "grad_norm": 2.3152365684509277, "learning_rate": 9.308019808331153e-06, "loss": 0.2108, "num_input_tokens_seen": 59355552, "step": 27520 }, { "epoch": 5.0513855753349235, "grad_norm": 14.487022399902344, "learning_rate": 9.307613304099527e-06, "loss": 0.2932, "num_input_tokens_seen": 59366848, "step": 27525 }, { "epoch": 5.052303174894476, "grad_norm": 1.9640851020812988, "learning_rate": 9.307206689384298e-06, "loss": 0.262, "num_input_tokens_seen": 59377312, "step": 27530 }, { "epoch": 5.0532207744540285, "grad_norm": 4.173678874969482, "learning_rate": 9.30679996419589e-06, "loss": 0.2521, "num_input_tokens_seen": 59388256, "step": 27535 }, { "epoch": 5.05413837401358, "grad_norm": 7.917609214782715, "learning_rate": 9.306393128544741e-06, "loss": 0.1949, "num_input_tokens_seen": 59399552, "step": 27540 }, { "epoch": 5.055055973573133, "grad_norm": 2.7285499572753906, "learning_rate": 9.305986182441282e-06, "loss": 0.2525, "num_input_tokens_seen": 59411040, "step": 27545 }, { "epoch": 5.055973573132685, "grad_norm": 19.584415435791016, "learning_rate": 9.305579125895949e-06, "loss": 0.2054, "num_input_tokens_seen": 59421920, "step": 27550 }, { "epoch": 5.056891172692237, "grad_norm": 5.564090251922607, "learning_rate": 9.305171958919185e-06, "loss": 0.1949, "num_input_tokens_seen": 59433920, "step": 27555 }, { "epoch": 5.057808772251789, "grad_norm": 9.750178337097168, "learning_rate": 9.304764681521435e-06, "loss": 0.2451, "num_input_tokens_seen": 59445504, "step": 27560 }, { "epoch": 5.058726371811342, "grad_norm": 10.58462142944336, "learning_rate": 9.304357293713141e-06, "loss": 0.3045, "num_input_tokens_seen": 59455264, "step": 27565 }, { "epoch": 5.059643971370893, "grad_norm": 12.64331340789795, "learning_rate": 9.303949795504755e-06, "loss": 0.2556, "num_input_tokens_seen": 59466144, "step": 27570 }, { "epoch": 5.060561570930446, "grad_norm": 7.230830669403076, "learning_rate": 9.303542186906724e-06, "loss": 0.28, "num_input_tokens_seen": 59476544, "step": 27575 }, { "epoch": 5.061479170489998, "grad_norm": 6.758540153503418, "learning_rate": 9.303134467929508e-06, "loss": 0.1616, "num_input_tokens_seen": 59486944, "step": 27580 }, { "epoch": 5.06239677004955, "grad_norm": 0.8666322231292725, "learning_rate": 9.302726638583563e-06, "loss": 0.2406, "num_input_tokens_seen": 59497056, "step": 27585 }, { "epoch": 5.0633143696091025, "grad_norm": 13.56991195678711, "learning_rate": 9.302318698879346e-06, "loss": 0.2768, "num_input_tokens_seen": 59507680, "step": 27590 }, { "epoch": 5.064231969168655, "grad_norm": 2.567830801010132, "learning_rate": 9.301910648827325e-06, "loss": 0.3489, "num_input_tokens_seen": 59517504, "step": 27595 }, { "epoch": 5.065149568728207, "grad_norm": 16.822086334228516, "learning_rate": 9.30150248843796e-06, "loss": 0.4041, "num_input_tokens_seen": 59528224, "step": 27600 }, { "epoch": 5.066067168287759, "grad_norm": 6.990515232086182, "learning_rate": 9.301094217721727e-06, "loss": 0.2108, "num_input_tokens_seen": 59539872, "step": 27605 }, { "epoch": 5.066984767847312, "grad_norm": 25.94373893737793, "learning_rate": 9.30068583668909e-06, "loss": 0.3583, "num_input_tokens_seen": 59550592, "step": 27610 }, { "epoch": 5.067902367406863, "grad_norm": 9.083600044250488, "learning_rate": 9.300277345350528e-06, "loss": 0.2663, "num_input_tokens_seen": 59562464, "step": 27615 }, { "epoch": 5.068819966966416, "grad_norm": 5.834156513214111, "learning_rate": 9.299868743716518e-06, "loss": 0.2077, "num_input_tokens_seen": 59572736, "step": 27620 }, { "epoch": 5.069737566525968, "grad_norm": 11.847580909729004, "learning_rate": 9.299460031797537e-06, "loss": 0.2763, "num_input_tokens_seen": 59582976, "step": 27625 }, { "epoch": 5.07065516608552, "grad_norm": 22.946117401123047, "learning_rate": 9.29905120960407e-06, "loss": 0.3158, "num_input_tokens_seen": 59593856, "step": 27630 }, { "epoch": 5.071572765645072, "grad_norm": 1.5971152782440186, "learning_rate": 9.298642277146603e-06, "loss": 0.3275, "num_input_tokens_seen": 59604192, "step": 27635 }, { "epoch": 5.072490365204625, "grad_norm": 2.5921199321746826, "learning_rate": 9.298233234435625e-06, "loss": 0.3896, "num_input_tokens_seen": 59615360, "step": 27640 }, { "epoch": 5.0734079647641765, "grad_norm": 6.290018081665039, "learning_rate": 9.297824081481625e-06, "loss": 0.2298, "num_input_tokens_seen": 59626816, "step": 27645 }, { "epoch": 5.074325564323729, "grad_norm": 6.977802753448486, "learning_rate": 9.297414818295098e-06, "loss": 0.4015, "num_input_tokens_seen": 59637760, "step": 27650 }, { "epoch": 5.075243163883282, "grad_norm": 2.6226441860198975, "learning_rate": 9.297005444886542e-06, "loss": 0.3238, "num_input_tokens_seen": 59649792, "step": 27655 }, { "epoch": 5.076160763442833, "grad_norm": 3.0335376262664795, "learning_rate": 9.296595961266456e-06, "loss": 0.2815, "num_input_tokens_seen": 59659808, "step": 27660 }, { "epoch": 5.077078363002386, "grad_norm": 7.902709484100342, "learning_rate": 9.296186367445343e-06, "loss": 0.2746, "num_input_tokens_seen": 59670560, "step": 27665 }, { "epoch": 5.077995962561938, "grad_norm": 5.879814147949219, "learning_rate": 9.295776663433707e-06, "loss": 0.3915, "num_input_tokens_seen": 59682080, "step": 27670 }, { "epoch": 5.07891356212149, "grad_norm": 3.7362782955169678, "learning_rate": 9.295366849242058e-06, "loss": 0.2675, "num_input_tokens_seen": 59692096, "step": 27675 }, { "epoch": 5.079831161681042, "grad_norm": 3.700235366821289, "learning_rate": 9.294956924880907e-06, "loss": 0.2693, "num_input_tokens_seen": 59701760, "step": 27680 }, { "epoch": 5.080748761240595, "grad_norm": 6.1707987785339355, "learning_rate": 9.294546890360768e-06, "loss": 0.2153, "num_input_tokens_seen": 59712928, "step": 27685 }, { "epoch": 5.081666360800146, "grad_norm": 4.2242841720581055, "learning_rate": 9.294136745692155e-06, "loss": 0.3077, "num_input_tokens_seen": 59724384, "step": 27690 }, { "epoch": 5.082583960359699, "grad_norm": 1.8407738208770752, "learning_rate": 9.293726490885591e-06, "loss": 0.235, "num_input_tokens_seen": 59734784, "step": 27695 }, { "epoch": 5.0835015599192515, "grad_norm": 3.1415586471557617, "learning_rate": 9.293316125951597e-06, "loss": 0.2777, "num_input_tokens_seen": 59745312, "step": 27700 }, { "epoch": 5.084419159478803, "grad_norm": 3.53395414352417, "learning_rate": 9.292905650900699e-06, "loss": 0.227, "num_input_tokens_seen": 59756448, "step": 27705 }, { "epoch": 5.085336759038356, "grad_norm": 6.6284661293029785, "learning_rate": 9.292495065743424e-06, "loss": 0.2901, "num_input_tokens_seen": 59766816, "step": 27710 }, { "epoch": 5.086254358597908, "grad_norm": 4.065348148345947, "learning_rate": 9.292084370490304e-06, "loss": 0.1331, "num_input_tokens_seen": 59777216, "step": 27715 }, { "epoch": 5.08717195815746, "grad_norm": 15.665816307067871, "learning_rate": 9.29167356515187e-06, "loss": 0.2864, "num_input_tokens_seen": 59787200, "step": 27720 }, { "epoch": 5.088089557717012, "grad_norm": 5.841418743133545, "learning_rate": 9.291262649738663e-06, "loss": 0.2295, "num_input_tokens_seen": 59797632, "step": 27725 }, { "epoch": 5.089007157276565, "grad_norm": 6.393078327178955, "learning_rate": 9.29085162426122e-06, "loss": 0.3311, "num_input_tokens_seen": 59809088, "step": 27730 }, { "epoch": 5.089924756836116, "grad_norm": 5.6632561683654785, "learning_rate": 9.29044048873008e-06, "loss": 0.2339, "num_input_tokens_seen": 59820032, "step": 27735 }, { "epoch": 5.090842356395669, "grad_norm": 12.736222267150879, "learning_rate": 9.290029243155793e-06, "loss": 0.2834, "num_input_tokens_seen": 59830560, "step": 27740 }, { "epoch": 5.091759955955221, "grad_norm": 3.553414821624756, "learning_rate": 9.289617887548905e-06, "loss": 0.2686, "num_input_tokens_seen": 59840480, "step": 27745 }, { "epoch": 5.092677555514773, "grad_norm": 7.818282604217529, "learning_rate": 9.289206421919966e-06, "loss": 0.2346, "num_input_tokens_seen": 59850592, "step": 27750 }, { "epoch": 5.0935951550743255, "grad_norm": 9.742478370666504, "learning_rate": 9.28879484627953e-06, "loss": 0.156, "num_input_tokens_seen": 59861920, "step": 27755 }, { "epoch": 5.094512754633878, "grad_norm": 9.655875205993652, "learning_rate": 9.288383160638154e-06, "loss": 0.2024, "num_input_tokens_seen": 59872576, "step": 27760 }, { "epoch": 5.09543035419343, "grad_norm": 1.2162582874298096, "learning_rate": 9.287971365006396e-06, "loss": 0.1906, "num_input_tokens_seen": 59883680, "step": 27765 }, { "epoch": 5.096347953752982, "grad_norm": 19.885147094726562, "learning_rate": 9.287559459394818e-06, "loss": 0.3036, "num_input_tokens_seen": 59894368, "step": 27770 }, { "epoch": 5.097265553312535, "grad_norm": 26.230266571044922, "learning_rate": 9.287147443813985e-06, "loss": 0.3545, "num_input_tokens_seen": 59905536, "step": 27775 }, { "epoch": 5.098183152872086, "grad_norm": 7.80070161819458, "learning_rate": 9.286735318274464e-06, "loss": 0.1087, "num_input_tokens_seen": 59915808, "step": 27780 }, { "epoch": 5.099100752431639, "grad_norm": 11.515979766845703, "learning_rate": 9.286323082786828e-06, "loss": 0.3608, "num_input_tokens_seen": 59926688, "step": 27785 }, { "epoch": 5.100018351991191, "grad_norm": 6.8507161140441895, "learning_rate": 9.285910737361645e-06, "loss": 0.2867, "num_input_tokens_seen": 59937536, "step": 27790 }, { "epoch": 5.100935951550743, "grad_norm": 18.2251033782959, "learning_rate": 9.285498282009497e-06, "loss": 0.3516, "num_input_tokens_seen": 59948128, "step": 27795 }, { "epoch": 5.101853551110295, "grad_norm": 5.734992980957031, "learning_rate": 9.285085716740958e-06, "loss": 0.2603, "num_input_tokens_seen": 59958976, "step": 27800 }, { "epoch": 5.102771150669848, "grad_norm": 12.176619529724121, "learning_rate": 9.284673041566613e-06, "loss": 0.2574, "num_input_tokens_seen": 59969696, "step": 27805 }, { "epoch": 5.1036887502293995, "grad_norm": 17.318010330200195, "learning_rate": 9.284260256497044e-06, "loss": 0.2217, "num_input_tokens_seen": 59980384, "step": 27810 }, { "epoch": 5.104606349788952, "grad_norm": 3.704730272293091, "learning_rate": 9.28384736154284e-06, "loss": 0.3339, "num_input_tokens_seen": 59990560, "step": 27815 }, { "epoch": 5.1055239493485045, "grad_norm": 10.866930961608887, "learning_rate": 9.283434356714591e-06, "loss": 0.2011, "num_input_tokens_seen": 60000704, "step": 27820 }, { "epoch": 5.106441548908056, "grad_norm": 2.734797716140747, "learning_rate": 9.28302124202289e-06, "loss": 0.2707, "num_input_tokens_seen": 60011296, "step": 27825 }, { "epoch": 5.107359148467609, "grad_norm": 9.452630043029785, "learning_rate": 9.28260801747833e-06, "loss": 0.2064, "num_input_tokens_seen": 60022560, "step": 27830 }, { "epoch": 5.108276748027161, "grad_norm": 3.8227717876434326, "learning_rate": 9.282194683091512e-06, "loss": 0.3029, "num_input_tokens_seen": 60032416, "step": 27835 }, { "epoch": 5.109194347586713, "grad_norm": 3.5984487533569336, "learning_rate": 9.281781238873038e-06, "loss": 0.3789, "num_input_tokens_seen": 60043264, "step": 27840 }, { "epoch": 5.110111947146265, "grad_norm": 6.087731838226318, "learning_rate": 9.281367684833512e-06, "loss": 0.2537, "num_input_tokens_seen": 60054176, "step": 27845 }, { "epoch": 5.111029546705818, "grad_norm": 12.209630966186523, "learning_rate": 9.280954020983541e-06, "loss": 0.2191, "num_input_tokens_seen": 60065728, "step": 27850 }, { "epoch": 5.111947146265369, "grad_norm": 10.287036895751953, "learning_rate": 9.280540247333732e-06, "loss": 0.2605, "num_input_tokens_seen": 60077504, "step": 27855 }, { "epoch": 5.112864745824922, "grad_norm": 3.0645906925201416, "learning_rate": 9.280126363894701e-06, "loss": 0.2641, "num_input_tokens_seen": 60088896, "step": 27860 }, { "epoch": 5.113782345384474, "grad_norm": 1.2948070764541626, "learning_rate": 9.279712370677062e-06, "loss": 0.3318, "num_input_tokens_seen": 60098464, "step": 27865 }, { "epoch": 5.114699944944026, "grad_norm": 8.780696868896484, "learning_rate": 9.279298267691436e-06, "loss": 0.2119, "num_input_tokens_seen": 60109216, "step": 27870 }, { "epoch": 5.1156175445035785, "grad_norm": 3.8998913764953613, "learning_rate": 9.27888405494844e-06, "loss": 0.2058, "num_input_tokens_seen": 60119296, "step": 27875 }, { "epoch": 5.116535144063131, "grad_norm": 13.550979614257812, "learning_rate": 9.2784697324587e-06, "loss": 0.2596, "num_input_tokens_seen": 60129728, "step": 27880 }, { "epoch": 5.117452743622683, "grad_norm": 2.1315724849700928, "learning_rate": 9.27805530023284e-06, "loss": 0.1917, "num_input_tokens_seen": 60139296, "step": 27885 }, { "epoch": 5.118370343182235, "grad_norm": 3.1794307231903076, "learning_rate": 9.277640758281494e-06, "loss": 0.2687, "num_input_tokens_seen": 60150240, "step": 27890 }, { "epoch": 5.119287942741788, "grad_norm": 11.704000473022461, "learning_rate": 9.277226106615293e-06, "loss": 0.359, "num_input_tokens_seen": 60160000, "step": 27895 }, { "epoch": 5.120205542301339, "grad_norm": 10.685401916503906, "learning_rate": 9.276811345244869e-06, "loss": 0.2046, "num_input_tokens_seen": 60170304, "step": 27900 }, { "epoch": 5.121123141860892, "grad_norm": 10.04411792755127, "learning_rate": 9.276396474180864e-06, "loss": 0.2118, "num_input_tokens_seen": 60181504, "step": 27905 }, { "epoch": 5.122040741420444, "grad_norm": 15.719732284545898, "learning_rate": 9.27598149343392e-06, "loss": 0.2797, "num_input_tokens_seen": 60192160, "step": 27910 }, { "epoch": 5.122958340979996, "grad_norm": 9.69278621673584, "learning_rate": 9.275566403014673e-06, "loss": 0.3226, "num_input_tokens_seen": 60204000, "step": 27915 }, { "epoch": 5.123875940539548, "grad_norm": 4.653435230255127, "learning_rate": 9.275151202933776e-06, "loss": 0.2599, "num_input_tokens_seen": 60214080, "step": 27920 }, { "epoch": 5.124793540099101, "grad_norm": 3.778266191482544, "learning_rate": 9.274735893201878e-06, "loss": 0.2661, "num_input_tokens_seen": 60225440, "step": 27925 }, { "epoch": 5.1257111396586525, "grad_norm": 0.838598906993866, "learning_rate": 9.274320473829628e-06, "loss": 0.268, "num_input_tokens_seen": 60235456, "step": 27930 }, { "epoch": 5.126628739218205, "grad_norm": 18.550050735473633, "learning_rate": 9.273904944827684e-06, "loss": 0.3501, "num_input_tokens_seen": 60245856, "step": 27935 }, { "epoch": 5.127546338777758, "grad_norm": 4.400610446929932, "learning_rate": 9.2734893062067e-06, "loss": 0.2119, "num_input_tokens_seen": 60256320, "step": 27940 }, { "epoch": 5.128463938337309, "grad_norm": 14.713492393493652, "learning_rate": 9.27307355797734e-06, "loss": 0.3448, "num_input_tokens_seen": 60266144, "step": 27945 }, { "epoch": 5.129381537896862, "grad_norm": 9.767011642456055, "learning_rate": 9.272657700150264e-06, "loss": 0.3108, "num_input_tokens_seen": 60277760, "step": 27950 }, { "epoch": 5.130299137456414, "grad_norm": 2.4843478202819824, "learning_rate": 9.272241732736144e-06, "loss": 0.2233, "num_input_tokens_seen": 60288960, "step": 27955 }, { "epoch": 5.131216737015966, "grad_norm": 1.0398136377334595, "learning_rate": 9.271825655745642e-06, "loss": 0.1855, "num_input_tokens_seen": 60299456, "step": 27960 }, { "epoch": 5.132134336575518, "grad_norm": 6.304870128631592, "learning_rate": 9.271409469189432e-06, "loss": 0.2747, "num_input_tokens_seen": 60309696, "step": 27965 }, { "epoch": 5.133051936135071, "grad_norm": 4.216471195220947, "learning_rate": 9.270993173078192e-06, "loss": 0.1898, "num_input_tokens_seen": 60320320, "step": 27970 }, { "epoch": 5.133969535694622, "grad_norm": 6.4855499267578125, "learning_rate": 9.270576767422594e-06, "loss": 0.3292, "num_input_tokens_seen": 60331136, "step": 27975 }, { "epoch": 5.134887135254175, "grad_norm": 5.034763336181641, "learning_rate": 9.270160252233322e-06, "loss": 0.1762, "num_input_tokens_seen": 60343360, "step": 27980 }, { "epoch": 5.1358047348137275, "grad_norm": 1.5918141603469849, "learning_rate": 9.269743627521057e-06, "loss": 0.1057, "num_input_tokens_seen": 60354592, "step": 27985 }, { "epoch": 5.136722334373279, "grad_norm": 8.39922046661377, "learning_rate": 9.269326893296486e-06, "loss": 0.224, "num_input_tokens_seen": 60365696, "step": 27990 }, { "epoch": 5.137639933932832, "grad_norm": 3.282723903656006, "learning_rate": 9.268910049570297e-06, "loss": 0.2385, "num_input_tokens_seen": 60376032, "step": 27995 }, { "epoch": 5.138557533492384, "grad_norm": 16.939708709716797, "learning_rate": 9.268493096353181e-06, "loss": 0.242, "num_input_tokens_seen": 60385504, "step": 28000 }, { "epoch": 5.139475133051936, "grad_norm": 16.52045440673828, "learning_rate": 9.268076033655832e-06, "loss": 0.2504, "num_input_tokens_seen": 60397216, "step": 28005 }, { "epoch": 5.140392732611488, "grad_norm": 16.84238052368164, "learning_rate": 9.26765886148895e-06, "loss": 0.1877, "num_input_tokens_seen": 60407744, "step": 28010 }, { "epoch": 5.141310332171041, "grad_norm": 11.216991424560547, "learning_rate": 9.267241579863232e-06, "loss": 0.1684, "num_input_tokens_seen": 60418816, "step": 28015 }, { "epoch": 5.142227931730593, "grad_norm": 5.399565696716309, "learning_rate": 9.266824188789378e-06, "loss": 0.1605, "num_input_tokens_seen": 60430400, "step": 28020 }, { "epoch": 5.143145531290145, "grad_norm": 5.518280506134033, "learning_rate": 9.2664066882781e-06, "loss": 0.3462, "num_input_tokens_seen": 60441600, "step": 28025 }, { "epoch": 5.144063130849697, "grad_norm": 5.267577171325684, "learning_rate": 9.265989078340101e-06, "loss": 0.5058, "num_input_tokens_seen": 60451872, "step": 28030 }, { "epoch": 5.14498073040925, "grad_norm": 6.188251972198486, "learning_rate": 9.265571358986094e-06, "loss": 0.2231, "num_input_tokens_seen": 60463904, "step": 28035 }, { "epoch": 5.1458983299688015, "grad_norm": 15.510404586791992, "learning_rate": 9.265153530226794e-06, "loss": 0.4237, "num_input_tokens_seen": 60474944, "step": 28040 }, { "epoch": 5.146815929528354, "grad_norm": 10.931070327758789, "learning_rate": 9.264735592072915e-06, "loss": 0.224, "num_input_tokens_seen": 60484544, "step": 28045 }, { "epoch": 5.1477335290879065, "grad_norm": 7.695959568023682, "learning_rate": 9.264317544535178e-06, "loss": 0.3274, "num_input_tokens_seen": 60495488, "step": 28050 }, { "epoch": 5.148651128647458, "grad_norm": 7.661043643951416, "learning_rate": 9.263899387624305e-06, "loss": 0.3125, "num_input_tokens_seen": 60506944, "step": 28055 }, { "epoch": 5.149568728207011, "grad_norm": 6.336050510406494, "learning_rate": 9.26348112135102e-06, "loss": 0.2475, "num_input_tokens_seen": 60516512, "step": 28060 }, { "epoch": 5.150486327766563, "grad_norm": 3.41070294380188, "learning_rate": 9.263062745726054e-06, "loss": 0.2137, "num_input_tokens_seen": 60527488, "step": 28065 }, { "epoch": 5.151403927326115, "grad_norm": 1.626463770866394, "learning_rate": 9.262644260760137e-06, "loss": 0.3488, "num_input_tokens_seen": 60537568, "step": 28070 }, { "epoch": 5.152321526885667, "grad_norm": 5.12426233291626, "learning_rate": 9.262225666463998e-06, "loss": 0.2417, "num_input_tokens_seen": 60548800, "step": 28075 }, { "epoch": 5.15323912644522, "grad_norm": 1.4468716382980347, "learning_rate": 9.261806962848379e-06, "loss": 0.2293, "num_input_tokens_seen": 60558848, "step": 28080 }, { "epoch": 5.154156726004771, "grad_norm": 3.998678684234619, "learning_rate": 9.261388149924015e-06, "loss": 0.3332, "num_input_tokens_seen": 60568896, "step": 28085 }, { "epoch": 5.155074325564324, "grad_norm": 8.373278617858887, "learning_rate": 9.26096922770165e-06, "loss": 0.2672, "num_input_tokens_seen": 60579392, "step": 28090 }, { "epoch": 5.155991925123876, "grad_norm": 0.7289019227027893, "learning_rate": 9.260550196192027e-06, "loss": 0.1632, "num_input_tokens_seen": 60587712, "step": 28095 }, { "epoch": 5.156909524683428, "grad_norm": 3.7200305461883545, "learning_rate": 9.260131055405897e-06, "loss": 0.3113, "num_input_tokens_seen": 60598016, "step": 28100 }, { "epoch": 5.1578271242429805, "grad_norm": 10.0325927734375, "learning_rate": 9.259711805354006e-06, "loss": 0.3289, "num_input_tokens_seen": 60610240, "step": 28105 }, { "epoch": 5.158744723802533, "grad_norm": 11.413534164428711, "learning_rate": 9.25929244604711e-06, "loss": 0.3301, "num_input_tokens_seen": 60620640, "step": 28110 }, { "epoch": 5.159662323362085, "grad_norm": 0.8059895038604736, "learning_rate": 9.258872977495964e-06, "loss": 0.229, "num_input_tokens_seen": 60631040, "step": 28115 }, { "epoch": 5.160579922921637, "grad_norm": 4.822997093200684, "learning_rate": 9.258453399711327e-06, "loss": 0.2571, "num_input_tokens_seen": 60641664, "step": 28120 }, { "epoch": 5.16149752248119, "grad_norm": 12.2971830368042, "learning_rate": 9.25803371270396e-06, "loss": 0.2301, "num_input_tokens_seen": 60651296, "step": 28125 }, { "epoch": 5.162415122040741, "grad_norm": 1.676830530166626, "learning_rate": 9.257613916484628e-06, "loss": 0.2499, "num_input_tokens_seen": 60662144, "step": 28130 }, { "epoch": 5.163332721600294, "grad_norm": 0.8250247240066528, "learning_rate": 9.257194011064097e-06, "loss": 0.3919, "num_input_tokens_seen": 60673504, "step": 28135 }, { "epoch": 5.164250321159846, "grad_norm": 15.468364715576172, "learning_rate": 9.256773996453139e-06, "loss": 0.1939, "num_input_tokens_seen": 60685728, "step": 28140 }, { "epoch": 5.165167920719398, "grad_norm": 3.5614609718322754, "learning_rate": 9.256353872662524e-06, "loss": 0.286, "num_input_tokens_seen": 60694976, "step": 28145 }, { "epoch": 5.16608552027895, "grad_norm": 5.976461410522461, "learning_rate": 9.25593363970303e-06, "loss": 0.321, "num_input_tokens_seen": 60705696, "step": 28150 }, { "epoch": 5.167003119838503, "grad_norm": 13.94484806060791, "learning_rate": 9.255513297585434e-06, "loss": 0.4118, "num_input_tokens_seen": 60716320, "step": 28155 }, { "epoch": 5.1679207193980545, "grad_norm": 6.3334126472473145, "learning_rate": 9.255092846320517e-06, "loss": 0.2984, "num_input_tokens_seen": 60727200, "step": 28160 }, { "epoch": 5.168838318957607, "grad_norm": 17.358678817749023, "learning_rate": 9.254672285919064e-06, "loss": 0.2959, "num_input_tokens_seen": 60737376, "step": 28165 }, { "epoch": 5.1697559185171595, "grad_norm": 3.1776678562164307, "learning_rate": 9.25425161639186e-06, "loss": 0.2273, "num_input_tokens_seen": 60748672, "step": 28170 }, { "epoch": 5.170673518076711, "grad_norm": 6.96806526184082, "learning_rate": 9.253830837749695e-06, "loss": 0.2151, "num_input_tokens_seen": 60759584, "step": 28175 }, { "epoch": 5.171591117636264, "grad_norm": 8.905856132507324, "learning_rate": 9.253409950003363e-06, "loss": 0.317, "num_input_tokens_seen": 60770688, "step": 28180 }, { "epoch": 5.172508717195816, "grad_norm": 3.571540355682373, "learning_rate": 9.252988953163658e-06, "loss": 0.1797, "num_input_tokens_seen": 60781440, "step": 28185 }, { "epoch": 5.173426316755368, "grad_norm": 4.773742198944092, "learning_rate": 9.252567847241378e-06, "loss": 0.253, "num_input_tokens_seen": 60792608, "step": 28190 }, { "epoch": 5.17434391631492, "grad_norm": 13.930411338806152, "learning_rate": 9.252146632247323e-06, "loss": 0.2242, "num_input_tokens_seen": 60803680, "step": 28195 }, { "epoch": 5.175261515874473, "grad_norm": 9.99024772644043, "learning_rate": 9.251725308192299e-06, "loss": 0.2841, "num_input_tokens_seen": 60813312, "step": 28200 }, { "epoch": 5.176179115434024, "grad_norm": 5.676161766052246, "learning_rate": 9.251303875087108e-06, "loss": 0.3322, "num_input_tokens_seen": 60825472, "step": 28205 }, { "epoch": 5.177096714993577, "grad_norm": 3.492283344268799, "learning_rate": 9.250882332942562e-06, "loss": 0.3383, "num_input_tokens_seen": 60836352, "step": 28210 }, { "epoch": 5.178014314553129, "grad_norm": 3.065241813659668, "learning_rate": 9.250460681769473e-06, "loss": 0.2677, "num_input_tokens_seen": 60848352, "step": 28215 }, { "epoch": 5.178931914112681, "grad_norm": 11.904293060302734, "learning_rate": 9.250038921578655e-06, "loss": 0.3139, "num_input_tokens_seen": 60858912, "step": 28220 }, { "epoch": 5.179849513672234, "grad_norm": 6.676523208618164, "learning_rate": 9.249617052380926e-06, "loss": 0.3838, "num_input_tokens_seen": 60869120, "step": 28225 }, { "epoch": 5.180767113231786, "grad_norm": 7.679541110992432, "learning_rate": 9.249195074187105e-06, "loss": 0.2066, "num_input_tokens_seen": 60877696, "step": 28230 }, { "epoch": 5.181684712791338, "grad_norm": 8.04545783996582, "learning_rate": 9.248772987008015e-06, "loss": 0.2503, "num_input_tokens_seen": 60888320, "step": 28235 }, { "epoch": 5.18260231235089, "grad_norm": 4.05026912689209, "learning_rate": 9.248350790854486e-06, "loss": 0.1893, "num_input_tokens_seen": 60897504, "step": 28240 }, { "epoch": 5.183519911910443, "grad_norm": 6.688884735107422, "learning_rate": 9.24792848573734e-06, "loss": 0.2828, "num_input_tokens_seen": 60908224, "step": 28245 }, { "epoch": 5.184437511469994, "grad_norm": 3.1695032119750977, "learning_rate": 9.247506071667416e-06, "loss": 0.258, "num_input_tokens_seen": 60919072, "step": 28250 }, { "epoch": 5.185355111029547, "grad_norm": 4.40892219543457, "learning_rate": 9.247083548655542e-06, "loss": 0.2926, "num_input_tokens_seen": 60929824, "step": 28255 }, { "epoch": 5.186272710589099, "grad_norm": 9.273987770080566, "learning_rate": 9.246660916712557e-06, "loss": 0.2922, "num_input_tokens_seen": 60939040, "step": 28260 }, { "epoch": 5.187190310148651, "grad_norm": 7.4761247634887695, "learning_rate": 9.246238175849302e-06, "loss": 0.264, "num_input_tokens_seen": 60949664, "step": 28265 }, { "epoch": 5.1881079097082035, "grad_norm": 12.203021049499512, "learning_rate": 9.245815326076619e-06, "loss": 0.2717, "num_input_tokens_seen": 60960096, "step": 28270 }, { "epoch": 5.189025509267756, "grad_norm": 2.8764052391052246, "learning_rate": 9.245392367405353e-06, "loss": 0.3173, "num_input_tokens_seen": 60971584, "step": 28275 }, { "epoch": 5.189943108827308, "grad_norm": 1.102613925933838, "learning_rate": 9.244969299846352e-06, "loss": 0.1981, "num_input_tokens_seen": 60982336, "step": 28280 }, { "epoch": 5.19086070838686, "grad_norm": 3.774988889694214, "learning_rate": 9.244546123410468e-06, "loss": 0.1572, "num_input_tokens_seen": 60993152, "step": 28285 }, { "epoch": 5.191778307946413, "grad_norm": 9.008590698242188, "learning_rate": 9.244122838108554e-06, "loss": 0.402, "num_input_tokens_seen": 61002368, "step": 28290 }, { "epoch": 5.192695907505964, "grad_norm": 2.837944746017456, "learning_rate": 9.243699443951469e-06, "loss": 0.2562, "num_input_tokens_seen": 61013728, "step": 28295 }, { "epoch": 5.193613507065517, "grad_norm": 1.4474343061447144, "learning_rate": 9.243275940950067e-06, "loss": 0.1915, "num_input_tokens_seen": 61024992, "step": 28300 }, { "epoch": 5.194531106625069, "grad_norm": 12.197395324707031, "learning_rate": 9.242852329115215e-06, "loss": 0.3669, "num_input_tokens_seen": 61036096, "step": 28305 }, { "epoch": 5.195448706184621, "grad_norm": 12.400628089904785, "learning_rate": 9.24242860845778e-06, "loss": 0.2256, "num_input_tokens_seen": 61047520, "step": 28310 }, { "epoch": 5.196366305744173, "grad_norm": 3.0945236682891846, "learning_rate": 9.242004778988622e-06, "loss": 0.2578, "num_input_tokens_seen": 61058880, "step": 28315 }, { "epoch": 5.197283905303726, "grad_norm": 8.509782791137695, "learning_rate": 9.241580840718617e-06, "loss": 0.2313, "num_input_tokens_seen": 61069120, "step": 28320 }, { "epoch": 5.1982015048632775, "grad_norm": 12.6871919631958, "learning_rate": 9.241156793658638e-06, "loss": 0.2967, "num_input_tokens_seen": 61080640, "step": 28325 }, { "epoch": 5.19911910442283, "grad_norm": 11.299798011779785, "learning_rate": 9.240732637819559e-06, "loss": 0.4082, "num_input_tokens_seen": 61091712, "step": 28330 }, { "epoch": 5.2000367039823825, "grad_norm": 3.8063864707946777, "learning_rate": 9.240308373212261e-06, "loss": 0.1875, "num_input_tokens_seen": 61103456, "step": 28335 }, { "epoch": 5.200954303541934, "grad_norm": 1.7171214818954468, "learning_rate": 9.239883999847626e-06, "loss": 0.3126, "num_input_tokens_seen": 61113600, "step": 28340 }, { "epoch": 5.201871903101487, "grad_norm": 16.679706573486328, "learning_rate": 9.239459517736537e-06, "loss": 0.3166, "num_input_tokens_seen": 61124128, "step": 28345 }, { "epoch": 5.202789502661039, "grad_norm": 14.284003257751465, "learning_rate": 9.239034926889882e-06, "loss": 0.2704, "num_input_tokens_seen": 61134528, "step": 28350 }, { "epoch": 5.203707102220591, "grad_norm": 2.7086589336395264, "learning_rate": 9.238610227318551e-06, "loss": 0.264, "num_input_tokens_seen": 61144864, "step": 28355 }, { "epoch": 5.204624701780143, "grad_norm": 12.742646217346191, "learning_rate": 9.238185419033438e-06, "loss": 0.1946, "num_input_tokens_seen": 61155552, "step": 28360 }, { "epoch": 5.205542301339696, "grad_norm": 6.417068004608154, "learning_rate": 9.237760502045436e-06, "loss": 0.1892, "num_input_tokens_seen": 61165440, "step": 28365 }, { "epoch": 5.206459900899247, "grad_norm": 8.497502326965332, "learning_rate": 9.237335476365447e-06, "loss": 0.2213, "num_input_tokens_seen": 61176736, "step": 28370 }, { "epoch": 5.2073775004588, "grad_norm": 5.2089762687683105, "learning_rate": 9.236910342004367e-06, "loss": 0.311, "num_input_tokens_seen": 61187776, "step": 28375 }, { "epoch": 5.208295100018352, "grad_norm": 4.25374698638916, "learning_rate": 9.236485098973107e-06, "loss": 0.256, "num_input_tokens_seen": 61198464, "step": 28380 }, { "epoch": 5.209212699577904, "grad_norm": 5.407290458679199, "learning_rate": 9.236059747282569e-06, "loss": 0.2011, "num_input_tokens_seen": 61209952, "step": 28385 }, { "epoch": 5.2101302991374565, "grad_norm": 1.7967568635940552, "learning_rate": 9.235634286943663e-06, "loss": 0.3024, "num_input_tokens_seen": 61220896, "step": 28390 }, { "epoch": 5.211047898697009, "grad_norm": 2.0789012908935547, "learning_rate": 9.235208717967301e-06, "loss": 0.1427, "num_input_tokens_seen": 61230944, "step": 28395 }, { "epoch": 5.211965498256561, "grad_norm": 5.207965850830078, "learning_rate": 9.234783040364402e-06, "loss": 0.2856, "num_input_tokens_seen": 61242944, "step": 28400 }, { "epoch": 5.212883097816113, "grad_norm": 8.961100578308105, "learning_rate": 9.23435725414588e-06, "loss": 0.4137, "num_input_tokens_seen": 61253472, "step": 28405 }, { "epoch": 5.213800697375666, "grad_norm": 8.302109718322754, "learning_rate": 9.233931359322658e-06, "loss": 0.3584, "num_input_tokens_seen": 61264224, "step": 28410 }, { "epoch": 5.214718296935217, "grad_norm": 5.313312530517578, "learning_rate": 9.233505355905658e-06, "loss": 0.2133, "num_input_tokens_seen": 61274784, "step": 28415 }, { "epoch": 5.21563589649477, "grad_norm": 15.117197036743164, "learning_rate": 9.233079243905806e-06, "loss": 0.4069, "num_input_tokens_seen": 61285824, "step": 28420 }, { "epoch": 5.216553496054322, "grad_norm": 14.717450141906738, "learning_rate": 9.232653023334033e-06, "loss": 0.1923, "num_input_tokens_seen": 61297664, "step": 28425 }, { "epoch": 5.217471095613874, "grad_norm": 7.010638236999512, "learning_rate": 9.23222669420127e-06, "loss": 0.3993, "num_input_tokens_seen": 61308896, "step": 28430 }, { "epoch": 5.218388695173426, "grad_norm": 9.234624862670898, "learning_rate": 9.231800256518451e-06, "loss": 0.2215, "num_input_tokens_seen": 61319616, "step": 28435 }, { "epoch": 5.219306294732979, "grad_norm": 1.9351869821548462, "learning_rate": 9.231373710296516e-06, "loss": 0.342, "num_input_tokens_seen": 61330624, "step": 28440 }, { "epoch": 5.2202238942925305, "grad_norm": 11.284725189208984, "learning_rate": 9.230947055546402e-06, "loss": 0.2593, "num_input_tokens_seen": 61341024, "step": 28445 }, { "epoch": 5.221141493852083, "grad_norm": 4.734067440032959, "learning_rate": 9.230520292279053e-06, "loss": 0.257, "num_input_tokens_seen": 61352736, "step": 28450 }, { "epoch": 5.2220590934116355, "grad_norm": 4.981573581695557, "learning_rate": 9.230093420505415e-06, "loss": 0.2989, "num_input_tokens_seen": 61363552, "step": 28455 }, { "epoch": 5.222976692971187, "grad_norm": 6.974640846252441, "learning_rate": 9.229666440236438e-06, "loss": 0.2305, "num_input_tokens_seen": 61375168, "step": 28460 }, { "epoch": 5.22389429253074, "grad_norm": 1.7100225687026978, "learning_rate": 9.229239351483072e-06, "loss": 0.3146, "num_input_tokens_seen": 61385696, "step": 28465 }, { "epoch": 5.224811892090292, "grad_norm": 1.264033317565918, "learning_rate": 9.22881215425627e-06, "loss": 0.3139, "num_input_tokens_seen": 61396512, "step": 28470 }, { "epoch": 5.225729491649844, "grad_norm": 15.389921188354492, "learning_rate": 9.22838484856699e-06, "loss": 0.2268, "num_input_tokens_seen": 61407168, "step": 28475 }, { "epoch": 5.226647091209396, "grad_norm": 4.533607006072998, "learning_rate": 9.227957434426196e-06, "loss": 0.2542, "num_input_tokens_seen": 61419104, "step": 28480 }, { "epoch": 5.227564690768949, "grad_norm": 3.8643884658813477, "learning_rate": 9.227529911844844e-06, "loss": 0.3528, "num_input_tokens_seen": 61428768, "step": 28485 }, { "epoch": 5.2284822903285, "grad_norm": 7.9977335929870605, "learning_rate": 9.227102280833901e-06, "loss": 0.3749, "num_input_tokens_seen": 61439520, "step": 28490 }, { "epoch": 5.229399889888053, "grad_norm": 12.305541038513184, "learning_rate": 9.226674541404337e-06, "loss": 0.2638, "num_input_tokens_seen": 61449664, "step": 28495 }, { "epoch": 5.230317489447605, "grad_norm": 8.673149108886719, "learning_rate": 9.22624669356712e-06, "loss": 0.284, "num_input_tokens_seen": 61460960, "step": 28500 }, { "epoch": 5.231235089007157, "grad_norm": 4.4321513175964355, "learning_rate": 9.225818737333227e-06, "loss": 0.2094, "num_input_tokens_seen": 61470848, "step": 28505 }, { "epoch": 5.23215268856671, "grad_norm": 3.911405563354492, "learning_rate": 9.225390672713635e-06, "loss": 0.3177, "num_input_tokens_seen": 61481024, "step": 28510 }, { "epoch": 5.233070288126262, "grad_norm": 3.9936342239379883, "learning_rate": 9.224962499719317e-06, "loss": 0.2264, "num_input_tokens_seen": 61493152, "step": 28515 }, { "epoch": 5.233987887685814, "grad_norm": 3.3026859760284424, "learning_rate": 9.224534218361261e-06, "loss": 0.1794, "num_input_tokens_seen": 61504832, "step": 28520 }, { "epoch": 5.234905487245366, "grad_norm": 4.146224498748779, "learning_rate": 9.22410582865045e-06, "loss": 0.1869, "num_input_tokens_seen": 61514208, "step": 28525 }, { "epoch": 5.235823086804919, "grad_norm": 4.0616607666015625, "learning_rate": 9.22367733059787e-06, "loss": 0.2301, "num_input_tokens_seen": 61524992, "step": 28530 }, { "epoch": 5.23674068636447, "grad_norm": 14.001602172851562, "learning_rate": 9.223248724214513e-06, "loss": 0.3284, "num_input_tokens_seen": 61535616, "step": 28535 }, { "epoch": 5.237658285924023, "grad_norm": 5.227431297302246, "learning_rate": 9.222820009511373e-06, "loss": 0.173, "num_input_tokens_seen": 61545152, "step": 28540 }, { "epoch": 5.238575885483575, "grad_norm": 6.5515971183776855, "learning_rate": 9.222391186499442e-06, "loss": 0.3134, "num_input_tokens_seen": 61555392, "step": 28545 }, { "epoch": 5.239493485043127, "grad_norm": 6.133193016052246, "learning_rate": 9.221962255189723e-06, "loss": 0.2269, "num_input_tokens_seen": 61566240, "step": 28550 }, { "epoch": 5.2404110846026795, "grad_norm": 3.5868537425994873, "learning_rate": 9.221533215593214e-06, "loss": 0.1521, "num_input_tokens_seen": 61577728, "step": 28555 }, { "epoch": 5.241328684162232, "grad_norm": 1.8507012128829956, "learning_rate": 9.221104067720923e-06, "loss": 0.1241, "num_input_tokens_seen": 61587584, "step": 28560 }, { "epoch": 5.242246283721784, "grad_norm": 2.068830728530884, "learning_rate": 9.220674811583855e-06, "loss": 0.1474, "num_input_tokens_seen": 61598784, "step": 28565 }, { "epoch": 5.243163883281336, "grad_norm": 14.193105697631836, "learning_rate": 9.220245447193016e-06, "loss": 0.2669, "num_input_tokens_seen": 61608928, "step": 28570 }, { "epoch": 5.244081482840889, "grad_norm": 8.764437675476074, "learning_rate": 9.219815974559425e-06, "loss": 0.1845, "num_input_tokens_seen": 61619296, "step": 28575 }, { "epoch": 5.24499908240044, "grad_norm": 7.771762371063232, "learning_rate": 9.219386393694094e-06, "loss": 0.4244, "num_input_tokens_seen": 61630688, "step": 28580 }, { "epoch": 5.245916681959993, "grad_norm": 5.159433364868164, "learning_rate": 9.218956704608042e-06, "loss": 0.3686, "num_input_tokens_seen": 61640768, "step": 28585 }, { "epoch": 5.246834281519545, "grad_norm": 21.644969940185547, "learning_rate": 9.218526907312289e-06, "loss": 0.3495, "num_input_tokens_seen": 61649984, "step": 28590 }, { "epoch": 5.247751881079097, "grad_norm": 11.19908618927002, "learning_rate": 9.218097001817857e-06, "loss": 0.3468, "num_input_tokens_seen": 61660512, "step": 28595 }, { "epoch": 5.248669480638649, "grad_norm": 1.016344666481018, "learning_rate": 9.217666988135777e-06, "loss": 0.1957, "num_input_tokens_seen": 61671104, "step": 28600 }, { "epoch": 5.249587080198202, "grad_norm": 6.402462482452393, "learning_rate": 9.217236866277072e-06, "loss": 0.2756, "num_input_tokens_seen": 61681664, "step": 28605 }, { "epoch": 5.2505046797577535, "grad_norm": 1.4612212181091309, "learning_rate": 9.216806636252782e-06, "loss": 0.0891, "num_input_tokens_seen": 61690848, "step": 28610 }, { "epoch": 5.251422279317306, "grad_norm": 30.134475708007812, "learning_rate": 9.216376298073935e-06, "loss": 0.3084, "num_input_tokens_seen": 61701280, "step": 28615 }, { "epoch": 5.2523398788768585, "grad_norm": 11.119623184204102, "learning_rate": 9.21594585175157e-06, "loss": 0.4193, "num_input_tokens_seen": 61713472, "step": 28620 }, { "epoch": 5.25325747843641, "grad_norm": 2.591717004776001, "learning_rate": 9.21551529729673e-06, "loss": 0.1617, "num_input_tokens_seen": 61723616, "step": 28625 }, { "epoch": 5.254175077995963, "grad_norm": 13.281366348266602, "learning_rate": 9.215084634720455e-06, "loss": 0.3581, "num_input_tokens_seen": 61736320, "step": 28630 }, { "epoch": 5.255092677555515, "grad_norm": 13.420868873596191, "learning_rate": 9.214653864033791e-06, "loss": 0.3439, "num_input_tokens_seen": 61747488, "step": 28635 }, { "epoch": 5.256010277115067, "grad_norm": 8.423672676086426, "learning_rate": 9.21422298524779e-06, "loss": 0.1977, "num_input_tokens_seen": 61757248, "step": 28640 }, { "epoch": 5.256927876674619, "grad_norm": 11.383398056030273, "learning_rate": 9.213791998373498e-06, "loss": 0.2791, "num_input_tokens_seen": 61767808, "step": 28645 }, { "epoch": 5.257845476234172, "grad_norm": 1.1518833637237549, "learning_rate": 9.213360903421973e-06, "loss": 0.2844, "num_input_tokens_seen": 61778048, "step": 28650 }, { "epoch": 5.258763075793723, "grad_norm": 5.563906192779541, "learning_rate": 9.212929700404272e-06, "loss": 0.3272, "num_input_tokens_seen": 61788320, "step": 28655 }, { "epoch": 5.259680675353276, "grad_norm": 22.188533782958984, "learning_rate": 9.212498389331452e-06, "loss": 0.1948, "num_input_tokens_seen": 61799776, "step": 28660 }, { "epoch": 5.260598274912828, "grad_norm": 25.421222686767578, "learning_rate": 9.212066970214579e-06, "loss": 0.3472, "num_input_tokens_seen": 61808320, "step": 28665 }, { "epoch": 5.26151587447238, "grad_norm": 5.307572364807129, "learning_rate": 9.211635443064714e-06, "loss": 0.1747, "num_input_tokens_seen": 61819680, "step": 28670 }, { "epoch": 5.2624334740319325, "grad_norm": 5.773427486419678, "learning_rate": 9.21120380789293e-06, "loss": 0.2774, "num_input_tokens_seen": 61830880, "step": 28675 }, { "epoch": 5.263351073591485, "grad_norm": 4.443333148956299, "learning_rate": 9.210772064710293e-06, "loss": 0.2368, "num_input_tokens_seen": 61840672, "step": 28680 }, { "epoch": 5.264268673151037, "grad_norm": 2.674041509628296, "learning_rate": 9.21034021352788e-06, "loss": 0.3405, "num_input_tokens_seen": 61850912, "step": 28685 }, { "epoch": 5.265186272710589, "grad_norm": 0.3931063115596771, "learning_rate": 9.209908254356765e-06, "loss": 0.2347, "num_input_tokens_seen": 61862176, "step": 28690 }, { "epoch": 5.266103872270142, "grad_norm": 17.516077041625977, "learning_rate": 9.209476187208029e-06, "loss": 0.3483, "num_input_tokens_seen": 61873664, "step": 28695 }, { "epoch": 5.267021471829693, "grad_norm": 5.073994159698486, "learning_rate": 9.209044012092752e-06, "loss": 0.1644, "num_input_tokens_seen": 61885664, "step": 28700 }, { "epoch": 5.267939071389246, "grad_norm": 7.274850368499756, "learning_rate": 9.208611729022019e-06, "loss": 0.2206, "num_input_tokens_seen": 61897088, "step": 28705 }, { "epoch": 5.268856670948798, "grad_norm": 2.812086582183838, "learning_rate": 9.208179338006919e-06, "loss": 0.2383, "num_input_tokens_seen": 61907680, "step": 28710 }, { "epoch": 5.26977427050835, "grad_norm": 5.595332145690918, "learning_rate": 9.20774683905854e-06, "loss": 0.2569, "num_input_tokens_seen": 61918912, "step": 28715 }, { "epoch": 5.270691870067902, "grad_norm": 9.233187675476074, "learning_rate": 9.207314232187978e-06, "loss": 0.3957, "num_input_tokens_seen": 61930272, "step": 28720 }, { "epoch": 5.271609469627455, "grad_norm": 11.498154640197754, "learning_rate": 9.206881517406325e-06, "loss": 0.2026, "num_input_tokens_seen": 61941312, "step": 28725 }, { "epoch": 5.2725270691870065, "grad_norm": 3.2451701164245605, "learning_rate": 9.206448694724682e-06, "loss": 0.2643, "num_input_tokens_seen": 61952288, "step": 28730 }, { "epoch": 5.273444668746559, "grad_norm": 15.246417999267578, "learning_rate": 9.206015764154149e-06, "loss": 0.2979, "num_input_tokens_seen": 61964160, "step": 28735 }, { "epoch": 5.2743622683061115, "grad_norm": 9.287569046020508, "learning_rate": 9.205582725705831e-06, "loss": 0.2864, "num_input_tokens_seen": 61973984, "step": 28740 }, { "epoch": 5.275279867865663, "grad_norm": 7.606593132019043, "learning_rate": 9.205149579390833e-06, "loss": 0.2537, "num_input_tokens_seen": 61984864, "step": 28745 }, { "epoch": 5.276197467425216, "grad_norm": 3.2251768112182617, "learning_rate": 9.204716325220266e-06, "loss": 0.25, "num_input_tokens_seen": 61994400, "step": 28750 }, { "epoch": 5.277115066984768, "grad_norm": 10.194856643676758, "learning_rate": 9.204282963205242e-06, "loss": 0.2307, "num_input_tokens_seen": 62006176, "step": 28755 }, { "epoch": 5.27803266654432, "grad_norm": 7.249310493469238, "learning_rate": 9.203849493356875e-06, "loss": 0.2222, "num_input_tokens_seen": 62016768, "step": 28760 }, { "epoch": 5.278950266103872, "grad_norm": 2.907773733139038, "learning_rate": 9.203415915686287e-06, "loss": 0.1809, "num_input_tokens_seen": 62028288, "step": 28765 }, { "epoch": 5.279867865663425, "grad_norm": 8.030747413635254, "learning_rate": 9.202982230204594e-06, "loss": 0.3669, "num_input_tokens_seen": 62038784, "step": 28770 }, { "epoch": 5.280785465222976, "grad_norm": 10.554821968078613, "learning_rate": 9.20254843692292e-06, "loss": 0.2474, "num_input_tokens_seen": 62049312, "step": 28775 }, { "epoch": 5.281703064782529, "grad_norm": 11.455031394958496, "learning_rate": 9.202114535852392e-06, "loss": 0.2711, "num_input_tokens_seen": 62060256, "step": 28780 }, { "epoch": 5.282620664342081, "grad_norm": 9.912631034851074, "learning_rate": 9.201680527004139e-06, "loss": 0.1994, "num_input_tokens_seen": 62071104, "step": 28785 }, { "epoch": 5.283538263901633, "grad_norm": 7.446699619293213, "learning_rate": 9.201246410389293e-06, "loss": 0.2568, "num_input_tokens_seen": 62082752, "step": 28790 }, { "epoch": 5.284455863461186, "grad_norm": 6.149369239807129, "learning_rate": 9.200812186018987e-06, "loss": 0.2639, "num_input_tokens_seen": 62094240, "step": 28795 }, { "epoch": 5.285373463020738, "grad_norm": 1.7814348936080933, "learning_rate": 9.20037785390436e-06, "loss": 0.1162, "num_input_tokens_seen": 62103808, "step": 28800 }, { "epoch": 5.28629106258029, "grad_norm": 14.186139106750488, "learning_rate": 9.19994341405655e-06, "loss": 0.2934, "num_input_tokens_seen": 62114080, "step": 28805 }, { "epoch": 5.287208662139842, "grad_norm": 4.45753812789917, "learning_rate": 9.199508866486701e-06, "loss": 0.2358, "num_input_tokens_seen": 62124000, "step": 28810 }, { "epoch": 5.288126261699395, "grad_norm": 9.449620246887207, "learning_rate": 9.19907421120596e-06, "loss": 0.1491, "num_input_tokens_seen": 62134912, "step": 28815 }, { "epoch": 5.289043861258946, "grad_norm": 12.23009204864502, "learning_rate": 9.198639448225472e-06, "loss": 0.2571, "num_input_tokens_seen": 62146048, "step": 28820 }, { "epoch": 5.289961460818499, "grad_norm": 1.9010330438613892, "learning_rate": 9.198204577556388e-06, "loss": 0.1835, "num_input_tokens_seen": 62157600, "step": 28825 }, { "epoch": 5.290879060378051, "grad_norm": 9.991787910461426, "learning_rate": 9.197769599209867e-06, "loss": 0.2559, "num_input_tokens_seen": 62168032, "step": 28830 }, { "epoch": 5.291796659937603, "grad_norm": 2.0243029594421387, "learning_rate": 9.19733451319706e-06, "loss": 0.2189, "num_input_tokens_seen": 62179840, "step": 28835 }, { "epoch": 5.2927142594971555, "grad_norm": 11.189555168151855, "learning_rate": 9.196899319529126e-06, "loss": 0.4021, "num_input_tokens_seen": 62189664, "step": 28840 }, { "epoch": 5.293631859056708, "grad_norm": 9.733259201049805, "learning_rate": 9.19646401821723e-06, "loss": 0.4202, "num_input_tokens_seen": 62199232, "step": 28845 }, { "epoch": 5.29454945861626, "grad_norm": 1.5686951875686646, "learning_rate": 9.196028609272538e-06, "loss": 0.3068, "num_input_tokens_seen": 62209504, "step": 28850 }, { "epoch": 5.295467058175812, "grad_norm": 1.6009461879730225, "learning_rate": 9.195593092706214e-06, "loss": 0.0963, "num_input_tokens_seen": 62220384, "step": 28855 }, { "epoch": 5.296384657735365, "grad_norm": 4.285977840423584, "learning_rate": 9.19515746852943e-06, "loss": 0.2076, "num_input_tokens_seen": 62230336, "step": 28860 }, { "epoch": 5.297302257294916, "grad_norm": 20.35727310180664, "learning_rate": 9.194721736753358e-06, "loss": 0.2728, "num_input_tokens_seen": 62241504, "step": 28865 }, { "epoch": 5.298219856854469, "grad_norm": 5.638705730438232, "learning_rate": 9.194285897389175e-06, "loss": 0.2469, "num_input_tokens_seen": 62251936, "step": 28870 }, { "epoch": 5.299137456414021, "grad_norm": 3.280578374862671, "learning_rate": 9.19384995044806e-06, "loss": 0.2064, "num_input_tokens_seen": 62263136, "step": 28875 }, { "epoch": 5.300055055973573, "grad_norm": 11.8799409866333, "learning_rate": 9.193413895941192e-06, "loss": 0.2557, "num_input_tokens_seen": 62274016, "step": 28880 }, { "epoch": 5.300972655533125, "grad_norm": 3.61163592338562, "learning_rate": 9.192977733879758e-06, "loss": 0.4368, "num_input_tokens_seen": 62284416, "step": 28885 }, { "epoch": 5.301890255092678, "grad_norm": 0.30444473028182983, "learning_rate": 9.192541464274944e-06, "loss": 0.2492, "num_input_tokens_seen": 62294720, "step": 28890 }, { "epoch": 5.3028078546522295, "grad_norm": 9.549060821533203, "learning_rate": 9.192105087137938e-06, "loss": 0.5185, "num_input_tokens_seen": 62305024, "step": 28895 }, { "epoch": 5.303725454211782, "grad_norm": 11.6326904296875, "learning_rate": 9.191668602479935e-06, "loss": 0.3644, "num_input_tokens_seen": 62316544, "step": 28900 }, { "epoch": 5.3046430537713345, "grad_norm": 3.956143856048584, "learning_rate": 9.191232010312128e-06, "loss": 0.2909, "num_input_tokens_seen": 62326528, "step": 28905 }, { "epoch": 5.305560653330886, "grad_norm": 3.6706244945526123, "learning_rate": 9.190795310645716e-06, "loss": 0.2482, "num_input_tokens_seen": 62337376, "step": 28910 }, { "epoch": 5.306478252890439, "grad_norm": 16.22526741027832, "learning_rate": 9.190358503491901e-06, "loss": 0.3542, "num_input_tokens_seen": 62348576, "step": 28915 }, { "epoch": 5.307395852449991, "grad_norm": 1.3497164249420166, "learning_rate": 9.189921588861883e-06, "loss": 0.2568, "num_input_tokens_seen": 62359456, "step": 28920 }, { "epoch": 5.308313452009543, "grad_norm": 18.347476959228516, "learning_rate": 9.18948456676687e-06, "loss": 0.252, "num_input_tokens_seen": 62371392, "step": 28925 }, { "epoch": 5.309231051569095, "grad_norm": 1.1131953001022339, "learning_rate": 9.189047437218072e-06, "loss": 0.4088, "num_input_tokens_seen": 62383200, "step": 28930 }, { "epoch": 5.310148651128648, "grad_norm": 8.030713081359863, "learning_rate": 9.1886102002267e-06, "loss": 0.3078, "num_input_tokens_seen": 62394336, "step": 28935 }, { "epoch": 5.311066250688199, "grad_norm": 5.050251483917236, "learning_rate": 9.188172855803966e-06, "loss": 0.314, "num_input_tokens_seen": 62404832, "step": 28940 }, { "epoch": 5.311983850247752, "grad_norm": 2.7979679107666016, "learning_rate": 9.187735403961091e-06, "loss": 0.2913, "num_input_tokens_seen": 62415712, "step": 28945 }, { "epoch": 5.312901449807304, "grad_norm": 8.927008628845215, "learning_rate": 9.187297844709293e-06, "loss": 0.2228, "num_input_tokens_seen": 62425856, "step": 28950 }, { "epoch": 5.313819049366856, "grad_norm": 11.244982719421387, "learning_rate": 9.186860178059794e-06, "loss": 0.1559, "num_input_tokens_seen": 62435808, "step": 28955 }, { "epoch": 5.3147366489264085, "grad_norm": 2.309501886367798, "learning_rate": 9.186422404023822e-06, "loss": 0.3708, "num_input_tokens_seen": 62446368, "step": 28960 }, { "epoch": 5.315654248485961, "grad_norm": 5.147299289703369, "learning_rate": 9.185984522612602e-06, "loss": 0.3094, "num_input_tokens_seen": 62457472, "step": 28965 }, { "epoch": 5.316571848045513, "grad_norm": 6.7453765869140625, "learning_rate": 9.185546533837368e-06, "loss": 0.2768, "num_input_tokens_seen": 62468640, "step": 28970 }, { "epoch": 5.317489447605065, "grad_norm": 4.115233421325684, "learning_rate": 9.185108437709354e-06, "loss": 0.1966, "num_input_tokens_seen": 62480288, "step": 28975 }, { "epoch": 5.318407047164618, "grad_norm": 4.693488597869873, "learning_rate": 9.184670234239792e-06, "loss": 0.4081, "num_input_tokens_seen": 62491296, "step": 28980 }, { "epoch": 5.319324646724169, "grad_norm": 2.545858144760132, "learning_rate": 9.184231923439924e-06, "loss": 0.2842, "num_input_tokens_seen": 62501600, "step": 28985 }, { "epoch": 5.320242246283722, "grad_norm": 5.283565044403076, "learning_rate": 9.183793505320996e-06, "loss": 0.1804, "num_input_tokens_seen": 62512576, "step": 28990 }, { "epoch": 5.321159845843274, "grad_norm": 8.084365844726562, "learning_rate": 9.183354979894244e-06, "loss": 0.162, "num_input_tokens_seen": 62523456, "step": 28995 }, { "epoch": 5.322077445402826, "grad_norm": 0.7180259823799133, "learning_rate": 9.182916347170923e-06, "loss": 0.1434, "num_input_tokens_seen": 62533952, "step": 29000 }, { "epoch": 5.322995044962378, "grad_norm": 7.143066883087158, "learning_rate": 9.182477607162281e-06, "loss": 0.286, "num_input_tokens_seen": 62544608, "step": 29005 }, { "epoch": 5.323912644521931, "grad_norm": 4.289830207824707, "learning_rate": 9.18203875987957e-06, "loss": 0.3417, "num_input_tokens_seen": 62555904, "step": 29010 }, { "epoch": 5.3248302440814825, "grad_norm": 4.2092204093933105, "learning_rate": 9.181599805334045e-06, "loss": 0.3239, "num_input_tokens_seen": 62566752, "step": 29015 }, { "epoch": 5.325747843641035, "grad_norm": 2.369760751724243, "learning_rate": 9.181160743536968e-06, "loss": 0.1135, "num_input_tokens_seen": 62577856, "step": 29020 }, { "epoch": 5.3266654432005875, "grad_norm": 8.929920196533203, "learning_rate": 9.180721574499598e-06, "loss": 0.4135, "num_input_tokens_seen": 62588480, "step": 29025 }, { "epoch": 5.327583042760139, "grad_norm": 3.0758297443389893, "learning_rate": 9.180282298233197e-06, "loss": 0.1208, "num_input_tokens_seen": 62598976, "step": 29030 }, { "epoch": 5.328500642319692, "grad_norm": 1.8258038759231567, "learning_rate": 9.179842914749035e-06, "loss": 0.1212, "num_input_tokens_seen": 62610624, "step": 29035 }, { "epoch": 5.329418241879244, "grad_norm": 19.497468948364258, "learning_rate": 9.17940342405838e-06, "loss": 0.3393, "num_input_tokens_seen": 62620384, "step": 29040 }, { "epoch": 5.330335841438796, "grad_norm": 18.637067794799805, "learning_rate": 9.178963826172506e-06, "loss": 0.1814, "num_input_tokens_seen": 62630752, "step": 29045 }, { "epoch": 5.331253440998348, "grad_norm": 14.125188827514648, "learning_rate": 9.178524121102687e-06, "loss": 0.2189, "num_input_tokens_seen": 62641696, "step": 29050 }, { "epoch": 5.332171040557901, "grad_norm": 6.580478191375732, "learning_rate": 9.178084308860199e-06, "loss": 0.6116, "num_input_tokens_seen": 62651936, "step": 29055 }, { "epoch": 5.333088640117452, "grad_norm": 7.471675395965576, "learning_rate": 9.177644389456324e-06, "loss": 0.3928, "num_input_tokens_seen": 62661376, "step": 29060 }, { "epoch": 5.334006239677005, "grad_norm": 8.883646965026855, "learning_rate": 9.177204362902345e-06, "loss": 0.3583, "num_input_tokens_seen": 62672096, "step": 29065 }, { "epoch": 5.334923839236557, "grad_norm": 7.832742691040039, "learning_rate": 9.17676422920955e-06, "loss": 0.4001, "num_input_tokens_seen": 62683392, "step": 29070 }, { "epoch": 5.335841438796109, "grad_norm": 8.48029613494873, "learning_rate": 9.176323988389224e-06, "loss": 0.3063, "num_input_tokens_seen": 62692704, "step": 29075 }, { "epoch": 5.336759038355662, "grad_norm": 1.5967411994934082, "learning_rate": 9.17588364045266e-06, "loss": 0.2024, "num_input_tokens_seen": 62703584, "step": 29080 }, { "epoch": 5.337676637915214, "grad_norm": 23.15115737915039, "learning_rate": 9.175443185411155e-06, "loss": 0.2305, "num_input_tokens_seen": 62714400, "step": 29085 }, { "epoch": 5.338594237474766, "grad_norm": 30.04216766357422, "learning_rate": 9.175002623276e-06, "loss": 0.4734, "num_input_tokens_seen": 62725280, "step": 29090 }, { "epoch": 5.339511837034318, "grad_norm": 0.9670719504356384, "learning_rate": 9.174561954058503e-06, "loss": 0.1469, "num_input_tokens_seen": 62736416, "step": 29095 }, { "epoch": 5.340429436593871, "grad_norm": 2.2912542819976807, "learning_rate": 9.174121177769959e-06, "loss": 0.2431, "num_input_tokens_seen": 62748352, "step": 29100 }, { "epoch": 5.341347036153422, "grad_norm": 2.1709694862365723, "learning_rate": 9.173680294421678e-06, "loss": 0.2509, "num_input_tokens_seen": 62759712, "step": 29105 }, { "epoch": 5.342264635712975, "grad_norm": 9.32803726196289, "learning_rate": 9.173239304024964e-06, "loss": 0.353, "num_input_tokens_seen": 62770784, "step": 29110 }, { "epoch": 5.343182235272527, "grad_norm": 18.7833194732666, "learning_rate": 9.17279820659113e-06, "loss": 0.4081, "num_input_tokens_seen": 62781792, "step": 29115 }, { "epoch": 5.344099834832079, "grad_norm": 0.787797749042511, "learning_rate": 9.17235700213149e-06, "loss": 0.2661, "num_input_tokens_seen": 62792416, "step": 29120 }, { "epoch": 5.3450174343916315, "grad_norm": 3.8132894039154053, "learning_rate": 9.171915690657359e-06, "loss": 0.2058, "num_input_tokens_seen": 62802304, "step": 29125 }, { "epoch": 5.345935033951184, "grad_norm": 8.407398223876953, "learning_rate": 9.171474272180057e-06, "loss": 0.392, "num_input_tokens_seen": 62813120, "step": 29130 }, { "epoch": 5.346852633510736, "grad_norm": 4.489461898803711, "learning_rate": 9.171032746710905e-06, "loss": 0.2161, "num_input_tokens_seen": 62824128, "step": 29135 }, { "epoch": 5.347770233070288, "grad_norm": 5.648756980895996, "learning_rate": 9.170591114261226e-06, "loss": 0.3156, "num_input_tokens_seen": 62835584, "step": 29140 }, { "epoch": 5.348687832629841, "grad_norm": 7.268819332122803, "learning_rate": 9.170149374842352e-06, "loss": 0.2879, "num_input_tokens_seen": 62847296, "step": 29145 }, { "epoch": 5.349605432189392, "grad_norm": 6.547806262969971, "learning_rate": 9.169707528465606e-06, "loss": 0.1325, "num_input_tokens_seen": 62859200, "step": 29150 }, { "epoch": 5.350523031748945, "grad_norm": 0.9221236705780029, "learning_rate": 9.169265575142328e-06, "loss": 0.1515, "num_input_tokens_seen": 62871072, "step": 29155 }, { "epoch": 5.351440631308497, "grad_norm": 2.3922595977783203, "learning_rate": 9.168823514883846e-06, "loss": 0.3155, "num_input_tokens_seen": 62881536, "step": 29160 }, { "epoch": 5.352358230868049, "grad_norm": 1.2508351802825928, "learning_rate": 9.168381347701505e-06, "loss": 0.2108, "num_input_tokens_seen": 62893024, "step": 29165 }, { "epoch": 5.353275830427601, "grad_norm": 40.6347541809082, "learning_rate": 9.167939073606642e-06, "loss": 0.2112, "num_input_tokens_seen": 62904288, "step": 29170 }, { "epoch": 5.354193429987154, "grad_norm": 8.172061920166016, "learning_rate": 9.1674966926106e-06, "loss": 0.3645, "num_input_tokens_seen": 62914304, "step": 29175 }, { "epoch": 5.3551110295467055, "grad_norm": 19.059574127197266, "learning_rate": 9.16705420472473e-06, "loss": 0.2924, "num_input_tokens_seen": 62926272, "step": 29180 }, { "epoch": 5.356028629106258, "grad_norm": 13.89327335357666, "learning_rate": 9.166611609960375e-06, "loss": 0.4112, "num_input_tokens_seen": 62938016, "step": 29185 }, { "epoch": 5.3569462286658105, "grad_norm": 0.3690129518508911, "learning_rate": 9.166168908328891e-06, "loss": 0.1286, "num_input_tokens_seen": 62948800, "step": 29190 }, { "epoch": 5.357863828225362, "grad_norm": 15.200090408325195, "learning_rate": 9.16572609984163e-06, "loss": 0.2824, "num_input_tokens_seen": 62958720, "step": 29195 }, { "epoch": 5.358781427784915, "grad_norm": 7.809762954711914, "learning_rate": 9.165283184509953e-06, "loss": 0.5857, "num_input_tokens_seen": 62970240, "step": 29200 }, { "epoch": 5.359699027344467, "grad_norm": 6.704598426818848, "learning_rate": 9.164840162345216e-06, "loss": 0.3549, "num_input_tokens_seen": 62981536, "step": 29205 }, { "epoch": 5.360616626904019, "grad_norm": 8.992522239685059, "learning_rate": 9.164397033358787e-06, "loss": 0.3416, "num_input_tokens_seen": 62992544, "step": 29210 }, { "epoch": 5.361534226463571, "grad_norm": 5.833464622497559, "learning_rate": 9.163953797562026e-06, "loss": 0.4052, "num_input_tokens_seen": 63004032, "step": 29215 }, { "epoch": 5.362451826023124, "grad_norm": 26.118080139160156, "learning_rate": 9.163510454966304e-06, "loss": 0.2981, "num_input_tokens_seen": 63014080, "step": 29220 }, { "epoch": 5.363369425582675, "grad_norm": 6.2467427253723145, "learning_rate": 9.16306700558299e-06, "loss": 0.198, "num_input_tokens_seen": 63025216, "step": 29225 }, { "epoch": 5.364287025142228, "grad_norm": 4.618954658508301, "learning_rate": 9.162623449423463e-06, "loss": 0.166, "num_input_tokens_seen": 63035936, "step": 29230 }, { "epoch": 5.36520462470178, "grad_norm": 12.63574504852295, "learning_rate": 9.162179786499093e-06, "loss": 0.2231, "num_input_tokens_seen": 63047712, "step": 29235 }, { "epoch": 5.366122224261332, "grad_norm": 3.0851147174835205, "learning_rate": 9.161736016821264e-06, "loss": 0.3001, "num_input_tokens_seen": 63057792, "step": 29240 }, { "epoch": 5.3670398238208845, "grad_norm": 7.258671760559082, "learning_rate": 9.161292140401354e-06, "loss": 0.2993, "num_input_tokens_seen": 63067904, "step": 29245 }, { "epoch": 5.367957423380437, "grad_norm": 7.988420486450195, "learning_rate": 9.160848157250752e-06, "loss": 0.2309, "num_input_tokens_seen": 63079328, "step": 29250 }, { "epoch": 5.368875022939989, "grad_norm": 2.667240619659424, "learning_rate": 9.160404067380843e-06, "loss": 0.3306, "num_input_tokens_seen": 63090208, "step": 29255 }, { "epoch": 5.369792622499541, "grad_norm": 2.089168071746826, "learning_rate": 9.159959870803018e-06, "loss": 0.1917, "num_input_tokens_seen": 63101216, "step": 29260 }, { "epoch": 5.370710222059094, "grad_norm": 13.48814582824707, "learning_rate": 9.15951556752867e-06, "loss": 0.2804, "num_input_tokens_seen": 63113536, "step": 29265 }, { "epoch": 5.371627821618645, "grad_norm": 4.924554347991943, "learning_rate": 9.159071157569193e-06, "loss": 0.2314, "num_input_tokens_seen": 63123520, "step": 29270 }, { "epoch": 5.372545421178198, "grad_norm": 3.5064752101898193, "learning_rate": 9.158626640935987e-06, "loss": 0.2055, "num_input_tokens_seen": 63135264, "step": 29275 }, { "epoch": 5.37346302073775, "grad_norm": 5.899367809295654, "learning_rate": 9.158182017640453e-06, "loss": 0.3289, "num_input_tokens_seen": 63147296, "step": 29280 }, { "epoch": 5.374380620297302, "grad_norm": 4.574054718017578, "learning_rate": 9.157737287693997e-06, "loss": 0.1568, "num_input_tokens_seen": 63158240, "step": 29285 }, { "epoch": 5.375298219856854, "grad_norm": 5.8464884757995605, "learning_rate": 9.157292451108022e-06, "loss": 0.2238, "num_input_tokens_seen": 63169216, "step": 29290 }, { "epoch": 5.376215819416407, "grad_norm": 4.902029037475586, "learning_rate": 9.156847507893937e-06, "loss": 0.243, "num_input_tokens_seen": 63178720, "step": 29295 }, { "epoch": 5.3771334189759585, "grad_norm": 6.314154148101807, "learning_rate": 9.156402458063158e-06, "loss": 0.2628, "num_input_tokens_seen": 63189824, "step": 29300 }, { "epoch": 5.378051018535511, "grad_norm": 1.8174617290496826, "learning_rate": 9.155957301627098e-06, "loss": 0.1229, "num_input_tokens_seen": 63200480, "step": 29305 }, { "epoch": 5.3789686180950635, "grad_norm": 13.020270347595215, "learning_rate": 9.155512038597174e-06, "loss": 0.5604, "num_input_tokens_seen": 63211072, "step": 29310 }, { "epoch": 5.379886217654615, "grad_norm": 3.856490135192871, "learning_rate": 9.155066668984806e-06, "loss": 0.2965, "num_input_tokens_seen": 63221760, "step": 29315 }, { "epoch": 5.380803817214168, "grad_norm": 12.839054107666016, "learning_rate": 9.154621192801419e-06, "loss": 0.2425, "num_input_tokens_seen": 63233120, "step": 29320 }, { "epoch": 5.38172141677372, "grad_norm": 1.6404495239257812, "learning_rate": 9.154175610058437e-06, "loss": 0.2762, "num_input_tokens_seen": 63243936, "step": 29325 }, { "epoch": 5.382639016333272, "grad_norm": 9.553844451904297, "learning_rate": 9.153729920767288e-06, "loss": 0.2651, "num_input_tokens_seen": 63253856, "step": 29330 }, { "epoch": 5.383556615892824, "grad_norm": 13.828847885131836, "learning_rate": 9.153284124939405e-06, "loss": 0.3597, "num_input_tokens_seen": 63264256, "step": 29335 }, { "epoch": 5.384474215452377, "grad_norm": 3.1833767890930176, "learning_rate": 9.15283822258622e-06, "loss": 0.2169, "num_input_tokens_seen": 63275456, "step": 29340 }, { "epoch": 5.385391815011928, "grad_norm": 7.600213050842285, "learning_rate": 9.152392213719173e-06, "loss": 0.202, "num_input_tokens_seen": 63286048, "step": 29345 }, { "epoch": 5.386309414571481, "grad_norm": 4.569200038909912, "learning_rate": 9.1519460983497e-06, "loss": 0.3209, "num_input_tokens_seen": 63296448, "step": 29350 }, { "epoch": 5.387227014131033, "grad_norm": 2.442084312438965, "learning_rate": 9.151499876489244e-06, "loss": 0.3142, "num_input_tokens_seen": 63306496, "step": 29355 }, { "epoch": 5.388144613690585, "grad_norm": 10.53449821472168, "learning_rate": 9.151053548149253e-06, "loss": 0.207, "num_input_tokens_seen": 63316512, "step": 29360 }, { "epoch": 5.389062213250138, "grad_norm": 8.680343627929688, "learning_rate": 9.15060711334117e-06, "loss": 0.3121, "num_input_tokens_seen": 63327904, "step": 29365 }, { "epoch": 5.38997981280969, "grad_norm": 2.0126237869262695, "learning_rate": 9.150160572076447e-06, "loss": 0.1886, "num_input_tokens_seen": 63338144, "step": 29370 }, { "epoch": 5.390897412369242, "grad_norm": 7.994683742523193, "learning_rate": 9.149713924366539e-06, "loss": 0.2226, "num_input_tokens_seen": 63348768, "step": 29375 }, { "epoch": 5.391815011928794, "grad_norm": 4.8203229904174805, "learning_rate": 9.1492671702229e-06, "loss": 0.2912, "num_input_tokens_seen": 63360928, "step": 29380 }, { "epoch": 5.392732611488347, "grad_norm": 5.648495197296143, "learning_rate": 9.148820309656988e-06, "loss": 0.1581, "num_input_tokens_seen": 63372704, "step": 29385 }, { "epoch": 5.393650211047898, "grad_norm": 11.853872299194336, "learning_rate": 9.148373342680265e-06, "loss": 0.3321, "num_input_tokens_seen": 63383392, "step": 29390 }, { "epoch": 5.394567810607451, "grad_norm": 6.471208572387695, "learning_rate": 9.147926269304195e-06, "loss": 0.2028, "num_input_tokens_seen": 63393600, "step": 29395 }, { "epoch": 5.395485410167003, "grad_norm": 4.760984420776367, "learning_rate": 9.147479089540247e-06, "loss": 0.3217, "num_input_tokens_seen": 63405088, "step": 29400 }, { "epoch": 5.396403009726555, "grad_norm": 16.64960479736328, "learning_rate": 9.147031803399887e-06, "loss": 0.2631, "num_input_tokens_seen": 63416512, "step": 29405 }, { "epoch": 5.3973206092861075, "grad_norm": 2.601691246032715, "learning_rate": 9.146584410894588e-06, "loss": 0.3682, "num_input_tokens_seen": 63426912, "step": 29410 }, { "epoch": 5.39823820884566, "grad_norm": 1.8959977626800537, "learning_rate": 9.146136912035825e-06, "loss": 0.2374, "num_input_tokens_seen": 63438592, "step": 29415 }, { "epoch": 5.399155808405212, "grad_norm": 6.276991367340088, "learning_rate": 9.145689306835077e-06, "loss": 0.2644, "num_input_tokens_seen": 63450688, "step": 29420 }, { "epoch": 5.400073407964764, "grad_norm": 12.225031852722168, "learning_rate": 9.145241595303824e-06, "loss": 0.3537, "num_input_tokens_seen": 63460448, "step": 29425 }, { "epoch": 5.400991007524317, "grad_norm": 4.190361022949219, "learning_rate": 9.14479377745355e-06, "loss": 0.2985, "num_input_tokens_seen": 63470752, "step": 29430 }, { "epoch": 5.401908607083868, "grad_norm": 5.496159553527832, "learning_rate": 9.144345853295736e-06, "loss": 0.2013, "num_input_tokens_seen": 63481504, "step": 29435 }, { "epoch": 5.402826206643421, "grad_norm": 4.044693470001221, "learning_rate": 9.143897822841877e-06, "loss": 0.2892, "num_input_tokens_seen": 63493152, "step": 29440 }, { "epoch": 5.403743806202973, "grad_norm": 8.694664001464844, "learning_rate": 9.143449686103459e-06, "loss": 0.2794, "num_input_tokens_seen": 63503904, "step": 29445 }, { "epoch": 5.404661405762525, "grad_norm": 1.623992681503296, "learning_rate": 9.143001443091979e-06, "loss": 0.1835, "num_input_tokens_seen": 63514528, "step": 29450 }, { "epoch": 5.405579005322077, "grad_norm": 2.902606725692749, "learning_rate": 9.142553093818934e-06, "loss": 0.2034, "num_input_tokens_seen": 63524640, "step": 29455 }, { "epoch": 5.40649660488163, "grad_norm": 3.806713819503784, "learning_rate": 9.14210463829582e-06, "loss": 0.3822, "num_input_tokens_seen": 63535264, "step": 29460 }, { "epoch": 5.4074142044411815, "grad_norm": 1.8189432621002197, "learning_rate": 9.141656076534144e-06, "loss": 0.232, "num_input_tokens_seen": 63545632, "step": 29465 }, { "epoch": 5.408331804000734, "grad_norm": 2.7932140827178955, "learning_rate": 9.14120740854541e-06, "loss": 0.198, "num_input_tokens_seen": 63556832, "step": 29470 }, { "epoch": 5.4092494035602865, "grad_norm": 3.417931318283081, "learning_rate": 9.140758634341123e-06, "loss": 0.1684, "num_input_tokens_seen": 63567872, "step": 29475 }, { "epoch": 5.410167003119838, "grad_norm": 12.34681510925293, "learning_rate": 9.140309753932794e-06, "loss": 0.36, "num_input_tokens_seen": 63577984, "step": 29480 }, { "epoch": 5.411084602679391, "grad_norm": 7.432911396026611, "learning_rate": 9.139860767331937e-06, "loss": 0.3777, "num_input_tokens_seen": 63589280, "step": 29485 }, { "epoch": 5.412002202238943, "grad_norm": 10.928447723388672, "learning_rate": 9.139411674550067e-06, "loss": 0.3828, "num_input_tokens_seen": 63600320, "step": 29490 }, { "epoch": 5.412919801798495, "grad_norm": 12.071490287780762, "learning_rate": 9.138962475598703e-06, "loss": 0.2795, "num_input_tokens_seen": 63611904, "step": 29495 }, { "epoch": 5.413837401358047, "grad_norm": 9.314895629882812, "learning_rate": 9.138513170489366e-06, "loss": 0.2433, "num_input_tokens_seen": 63622208, "step": 29500 }, { "epoch": 5.4147550009176, "grad_norm": 8.43525505065918, "learning_rate": 9.138063759233581e-06, "loss": 0.3678, "num_input_tokens_seen": 63633408, "step": 29505 }, { "epoch": 5.415672600477151, "grad_norm": 1.881682276725769, "learning_rate": 9.137614241842876e-06, "loss": 0.1516, "num_input_tokens_seen": 63644288, "step": 29510 }, { "epoch": 5.416590200036704, "grad_norm": 7.5502190589904785, "learning_rate": 9.137164618328776e-06, "loss": 0.1974, "num_input_tokens_seen": 63654528, "step": 29515 }, { "epoch": 5.417507799596256, "grad_norm": 3.0512869358062744, "learning_rate": 9.136714888702816e-06, "loss": 0.1471, "num_input_tokens_seen": 63666016, "step": 29520 }, { "epoch": 5.418425399155808, "grad_norm": 4.528174877166748, "learning_rate": 9.136265052976529e-06, "loss": 0.3505, "num_input_tokens_seen": 63674912, "step": 29525 }, { "epoch": 5.4193429987153605, "grad_norm": 7.723391532897949, "learning_rate": 9.135815111161456e-06, "loss": 0.2743, "num_input_tokens_seen": 63685920, "step": 29530 }, { "epoch": 5.420260598274913, "grad_norm": 6.836340427398682, "learning_rate": 9.135365063269134e-06, "loss": 0.3016, "num_input_tokens_seen": 63697728, "step": 29535 }, { "epoch": 5.421178197834465, "grad_norm": 3.610291004180908, "learning_rate": 9.134914909311109e-06, "loss": 0.2031, "num_input_tokens_seen": 63708384, "step": 29540 }, { "epoch": 5.422095797394017, "grad_norm": 6.625733375549316, "learning_rate": 9.134464649298923e-06, "loss": 0.3121, "num_input_tokens_seen": 63719424, "step": 29545 }, { "epoch": 5.42301339695357, "grad_norm": 2.299170970916748, "learning_rate": 9.134014283244129e-06, "loss": 0.1347, "num_input_tokens_seen": 63729728, "step": 29550 }, { "epoch": 5.423930996513121, "grad_norm": 12.561629295349121, "learning_rate": 9.133563811158275e-06, "loss": 0.2117, "num_input_tokens_seen": 63740288, "step": 29555 }, { "epoch": 5.424848596072674, "grad_norm": 4.30233097076416, "learning_rate": 9.133113233052914e-06, "loss": 0.2329, "num_input_tokens_seen": 63751552, "step": 29560 }, { "epoch": 5.425766195632226, "grad_norm": 1.5044591426849365, "learning_rate": 9.132662548939606e-06, "loss": 0.1275, "num_input_tokens_seen": 63762400, "step": 29565 }, { "epoch": 5.426683795191778, "grad_norm": 1.210548758506775, "learning_rate": 9.13221175882991e-06, "loss": 0.2116, "num_input_tokens_seen": 63773888, "step": 29570 }, { "epoch": 5.42760139475133, "grad_norm": 8.83597183227539, "learning_rate": 9.131760862735383e-06, "loss": 0.2325, "num_input_tokens_seen": 63785760, "step": 29575 }, { "epoch": 5.428518994310883, "grad_norm": 13.209210395812988, "learning_rate": 9.131309860667596e-06, "loss": 0.451, "num_input_tokens_seen": 63796608, "step": 29580 }, { "epoch": 5.4294365938704345, "grad_norm": 5.050354480743408, "learning_rate": 9.130858752638114e-06, "loss": 0.2918, "num_input_tokens_seen": 63807776, "step": 29585 }, { "epoch": 5.430354193429987, "grad_norm": 2.3827202320098877, "learning_rate": 9.130407538658506e-06, "loss": 0.3017, "num_input_tokens_seen": 63817920, "step": 29590 }, { "epoch": 5.4312717929895395, "grad_norm": 2.277127981185913, "learning_rate": 9.129956218740348e-06, "loss": 0.2053, "num_input_tokens_seen": 63828896, "step": 29595 }, { "epoch": 5.432189392549091, "grad_norm": 5.5949273109436035, "learning_rate": 9.129504792895211e-06, "loss": 0.2087, "num_input_tokens_seen": 63839904, "step": 29600 }, { "epoch": 5.433106992108644, "grad_norm": 11.750736236572266, "learning_rate": 9.12905326113468e-06, "loss": 0.3103, "num_input_tokens_seen": 63851840, "step": 29605 }, { "epoch": 5.434024591668196, "grad_norm": 4.891606330871582, "learning_rate": 9.12860162347033e-06, "loss": 0.3402, "num_input_tokens_seen": 63863040, "step": 29610 }, { "epoch": 5.434942191227748, "grad_norm": 5.927898406982422, "learning_rate": 9.128149879913749e-06, "loss": 0.1632, "num_input_tokens_seen": 63873408, "step": 29615 }, { "epoch": 5.4358597907873, "grad_norm": 14.365559577941895, "learning_rate": 9.127698030476518e-06, "loss": 0.2309, "num_input_tokens_seen": 63883840, "step": 29620 }, { "epoch": 5.436777390346853, "grad_norm": 6.6554460525512695, "learning_rate": 9.127246075170232e-06, "loss": 0.3511, "num_input_tokens_seen": 63894624, "step": 29625 }, { "epoch": 5.437694989906404, "grad_norm": 24.961393356323242, "learning_rate": 9.126794014006482e-06, "loss": 0.3164, "num_input_tokens_seen": 63906304, "step": 29630 }, { "epoch": 5.438612589465957, "grad_norm": 3.1765475273132324, "learning_rate": 9.12634184699686e-06, "loss": 0.2047, "num_input_tokens_seen": 63917760, "step": 29635 }, { "epoch": 5.439530189025509, "grad_norm": 0.5285593271255493, "learning_rate": 9.125889574152964e-06, "loss": 0.326, "num_input_tokens_seen": 63928064, "step": 29640 }, { "epoch": 5.440447788585061, "grad_norm": 5.390133857727051, "learning_rate": 9.125437195486397e-06, "loss": 0.3323, "num_input_tokens_seen": 63938656, "step": 29645 }, { "epoch": 5.441365388144614, "grad_norm": 3.8622093200683594, "learning_rate": 9.12498471100876e-06, "loss": 0.1566, "num_input_tokens_seen": 63948608, "step": 29650 }, { "epoch": 5.442282987704166, "grad_norm": 6.679734706878662, "learning_rate": 9.124532120731656e-06, "loss": 0.3052, "num_input_tokens_seen": 63958688, "step": 29655 }, { "epoch": 5.443200587263718, "grad_norm": 2.2747139930725098, "learning_rate": 9.124079424666696e-06, "loss": 0.3337, "num_input_tokens_seen": 63970464, "step": 29660 }, { "epoch": 5.44411818682327, "grad_norm": 5.310300350189209, "learning_rate": 9.123626622825492e-06, "loss": 0.2577, "num_input_tokens_seen": 63980512, "step": 29665 }, { "epoch": 5.445035786382823, "grad_norm": 10.68344497680664, "learning_rate": 9.123173715219656e-06, "loss": 0.3614, "num_input_tokens_seen": 63992000, "step": 29670 }, { "epoch": 5.445953385942374, "grad_norm": 9.641231536865234, "learning_rate": 9.122720701860804e-06, "loss": 0.3591, "num_input_tokens_seen": 64002656, "step": 29675 }, { "epoch": 5.446870985501927, "grad_norm": 2.216663360595703, "learning_rate": 9.122267582760555e-06, "loss": 0.2733, "num_input_tokens_seen": 64014176, "step": 29680 }, { "epoch": 5.447788585061479, "grad_norm": 6.916276454925537, "learning_rate": 9.121814357930533e-06, "loss": 0.3859, "num_input_tokens_seen": 64026368, "step": 29685 }, { "epoch": 5.448706184621031, "grad_norm": 5.4266510009765625, "learning_rate": 9.121361027382358e-06, "loss": 0.2881, "num_input_tokens_seen": 64035936, "step": 29690 }, { "epoch": 5.4496237841805835, "grad_norm": 2.748129367828369, "learning_rate": 9.120907591127663e-06, "loss": 0.3247, "num_input_tokens_seen": 64046368, "step": 29695 }, { "epoch": 5.450541383740136, "grad_norm": 6.468038082122803, "learning_rate": 9.120454049178075e-06, "loss": 0.2491, "num_input_tokens_seen": 64055456, "step": 29700 }, { "epoch": 5.451458983299688, "grad_norm": 3.9466919898986816, "learning_rate": 9.120000401545226e-06, "loss": 0.1636, "num_input_tokens_seen": 64066176, "step": 29705 }, { "epoch": 5.45237658285924, "grad_norm": 8.377907752990723, "learning_rate": 9.11954664824075e-06, "loss": 0.2274, "num_input_tokens_seen": 64076960, "step": 29710 }, { "epoch": 5.453294182418793, "grad_norm": 1.5656119585037231, "learning_rate": 9.119092789276292e-06, "loss": 0.1763, "num_input_tokens_seen": 64087488, "step": 29715 }, { "epoch": 5.454211781978344, "grad_norm": 2.110109567642212, "learning_rate": 9.118638824663483e-06, "loss": 0.2298, "num_input_tokens_seen": 64098688, "step": 29720 }, { "epoch": 5.455129381537897, "grad_norm": 6.642563819885254, "learning_rate": 9.118184754413975e-06, "loss": 0.5326, "num_input_tokens_seen": 64107456, "step": 29725 }, { "epoch": 5.456046981097449, "grad_norm": 11.73363971710205, "learning_rate": 9.11773057853941e-06, "loss": 0.3714, "num_input_tokens_seen": 64116320, "step": 29730 }, { "epoch": 5.456964580657001, "grad_norm": 9.186223030090332, "learning_rate": 9.117276297051437e-06, "loss": 0.299, "num_input_tokens_seen": 64127104, "step": 29735 }, { "epoch": 5.457882180216553, "grad_norm": 5.2105631828308105, "learning_rate": 9.116821909961708e-06, "loss": 0.3304, "num_input_tokens_seen": 64138464, "step": 29740 }, { "epoch": 5.458799779776106, "grad_norm": 12.42223834991455, "learning_rate": 9.116367417281877e-06, "loss": 0.271, "num_input_tokens_seen": 64148032, "step": 29745 }, { "epoch": 5.4597173793356575, "grad_norm": 2.438166856765747, "learning_rate": 9.115912819023602e-06, "loss": 0.2165, "num_input_tokens_seen": 64158784, "step": 29750 }, { "epoch": 5.46063497889521, "grad_norm": 1.2392386198043823, "learning_rate": 9.115458115198544e-06, "loss": 0.3485, "num_input_tokens_seen": 64170208, "step": 29755 }, { "epoch": 5.4615525784547625, "grad_norm": 5.61447811126709, "learning_rate": 9.115003305818362e-06, "loss": 0.2354, "num_input_tokens_seen": 64182080, "step": 29760 }, { "epoch": 5.462470178014314, "grad_norm": 2.687889814376831, "learning_rate": 9.114548390894723e-06, "loss": 0.2069, "num_input_tokens_seen": 64192480, "step": 29765 }, { "epoch": 5.463387777573867, "grad_norm": 7.7000732421875, "learning_rate": 9.114093370439294e-06, "loss": 0.3828, "num_input_tokens_seen": 64202912, "step": 29770 }, { "epoch": 5.464305377133419, "grad_norm": 5.8602190017700195, "learning_rate": 9.113638244463749e-06, "loss": 0.3248, "num_input_tokens_seen": 64213856, "step": 29775 }, { "epoch": 5.465222976692971, "grad_norm": 6.581956386566162, "learning_rate": 9.113183012979756e-06, "loss": 0.4211, "num_input_tokens_seen": 64223840, "step": 29780 }, { "epoch": 5.466140576252523, "grad_norm": 8.37090015411377, "learning_rate": 9.112727675998993e-06, "loss": 0.25, "num_input_tokens_seen": 64235744, "step": 29785 }, { "epoch": 5.467058175812076, "grad_norm": 6.936907768249512, "learning_rate": 9.11227223353314e-06, "loss": 0.4415, "num_input_tokens_seen": 64245664, "step": 29790 }, { "epoch": 5.467975775371627, "grad_norm": 1.5610140562057495, "learning_rate": 9.11181668559388e-06, "loss": 0.184, "num_input_tokens_seen": 64257984, "step": 29795 }, { "epoch": 5.46889337493118, "grad_norm": 4.236198902130127, "learning_rate": 9.111361032192894e-06, "loss": 0.2223, "num_input_tokens_seen": 64269280, "step": 29800 }, { "epoch": 5.469810974490732, "grad_norm": 6.116049289703369, "learning_rate": 9.110905273341869e-06, "loss": 0.2448, "num_input_tokens_seen": 64282016, "step": 29805 }, { "epoch": 5.470728574050284, "grad_norm": 4.5098724365234375, "learning_rate": 9.110449409052492e-06, "loss": 0.4139, "num_input_tokens_seen": 64292256, "step": 29810 }, { "epoch": 5.4716461736098365, "grad_norm": 7.551184177398682, "learning_rate": 9.109993439336462e-06, "loss": 0.3954, "num_input_tokens_seen": 64300768, "step": 29815 }, { "epoch": 5.472563773169389, "grad_norm": 7.240324974060059, "learning_rate": 9.10953736420547e-06, "loss": 0.2965, "num_input_tokens_seen": 64311808, "step": 29820 }, { "epoch": 5.473481372728941, "grad_norm": 3.754134178161621, "learning_rate": 9.109081183671212e-06, "loss": 0.259, "num_input_tokens_seen": 64320672, "step": 29825 }, { "epoch": 5.474398972288493, "grad_norm": 4.551209926605225, "learning_rate": 9.108624897745391e-06, "loss": 0.3211, "num_input_tokens_seen": 64332192, "step": 29830 }, { "epoch": 5.475316571848046, "grad_norm": 11.913309097290039, "learning_rate": 9.10816850643971e-06, "loss": 0.2726, "num_input_tokens_seen": 64342688, "step": 29835 }, { "epoch": 5.476234171407597, "grad_norm": 3.636107921600342, "learning_rate": 9.107712009765872e-06, "loss": 0.2643, "num_input_tokens_seen": 64353472, "step": 29840 }, { "epoch": 5.47715177096715, "grad_norm": 4.991424083709717, "learning_rate": 9.10725540773559e-06, "loss": 0.2814, "num_input_tokens_seen": 64364736, "step": 29845 }, { "epoch": 5.478069370526702, "grad_norm": 2.5628855228424072, "learning_rate": 9.106798700360571e-06, "loss": 0.2537, "num_input_tokens_seen": 64375296, "step": 29850 }, { "epoch": 5.478986970086254, "grad_norm": 4.652215003967285, "learning_rate": 9.106341887652531e-06, "loss": 0.3414, "num_input_tokens_seen": 64386496, "step": 29855 }, { "epoch": 5.479904569645806, "grad_norm": 2.3572998046875, "learning_rate": 9.105884969623184e-06, "loss": 0.2489, "num_input_tokens_seen": 64397728, "step": 29860 }, { "epoch": 5.480822169205359, "grad_norm": 25.993576049804688, "learning_rate": 9.105427946284251e-06, "loss": 0.2731, "num_input_tokens_seen": 64407936, "step": 29865 }, { "epoch": 5.4817397687649105, "grad_norm": 5.234388828277588, "learning_rate": 9.104970817647456e-06, "loss": 0.1821, "num_input_tokens_seen": 64418240, "step": 29870 }, { "epoch": 5.482657368324463, "grad_norm": 15.287841796875, "learning_rate": 9.104513583724522e-06, "loss": 0.2192, "num_input_tokens_seen": 64429312, "step": 29875 }, { "epoch": 5.4835749678840155, "grad_norm": 4.09333610534668, "learning_rate": 9.104056244527173e-06, "loss": 0.2924, "num_input_tokens_seen": 64440960, "step": 29880 }, { "epoch": 5.484492567443567, "grad_norm": 5.693175315856934, "learning_rate": 9.103598800067144e-06, "loss": 0.1876, "num_input_tokens_seen": 64452384, "step": 29885 }, { "epoch": 5.48541016700312, "grad_norm": 4.092442989349365, "learning_rate": 9.103141250356166e-06, "loss": 0.1551, "num_input_tokens_seen": 64462592, "step": 29890 }, { "epoch": 5.486327766562672, "grad_norm": 9.742166519165039, "learning_rate": 9.102683595405973e-06, "loss": 0.2887, "num_input_tokens_seen": 64472928, "step": 29895 }, { "epoch": 5.487245366122224, "grad_norm": 12.793736457824707, "learning_rate": 9.102225835228306e-06, "loss": 0.2634, "num_input_tokens_seen": 64484576, "step": 29900 }, { "epoch": 5.488162965681776, "grad_norm": 3.0185840129852295, "learning_rate": 9.101767969834903e-06, "loss": 0.2855, "num_input_tokens_seen": 64496096, "step": 29905 }, { "epoch": 5.489080565241329, "grad_norm": 6.102469444274902, "learning_rate": 9.101309999237509e-06, "loss": 0.189, "num_input_tokens_seen": 64508000, "step": 29910 }, { "epoch": 5.48999816480088, "grad_norm": 9.292366981506348, "learning_rate": 9.100851923447871e-06, "loss": 0.2843, "num_input_tokens_seen": 64519712, "step": 29915 }, { "epoch": 5.490915764360433, "grad_norm": 9.26910400390625, "learning_rate": 9.100393742477736e-06, "loss": 0.4215, "num_input_tokens_seen": 64530240, "step": 29920 }, { "epoch": 5.4918333639199854, "grad_norm": 4.290946006774902, "learning_rate": 9.099935456338856e-06, "loss": 0.151, "num_input_tokens_seen": 64541856, "step": 29925 }, { "epoch": 5.492750963479537, "grad_norm": 0.9097123146057129, "learning_rate": 9.099477065042986e-06, "loss": 0.3009, "num_input_tokens_seen": 64553376, "step": 29930 }, { "epoch": 5.49366856303909, "grad_norm": 7.400552272796631, "learning_rate": 9.099018568601884e-06, "loss": 0.3016, "num_input_tokens_seen": 64563360, "step": 29935 }, { "epoch": 5.494586162598642, "grad_norm": 12.960561752319336, "learning_rate": 9.098559967027308e-06, "loss": 0.1596, "num_input_tokens_seen": 64572960, "step": 29940 }, { "epoch": 5.495503762158194, "grad_norm": 0.49205413460731506, "learning_rate": 9.09810126033102e-06, "loss": 0.3717, "num_input_tokens_seen": 64584096, "step": 29945 }, { "epoch": 5.496421361717746, "grad_norm": 7.017162322998047, "learning_rate": 9.097642448524788e-06, "loss": 0.2319, "num_input_tokens_seen": 64594848, "step": 29950 }, { "epoch": 5.497338961277299, "grad_norm": 9.675261497497559, "learning_rate": 9.097183531620377e-06, "loss": 0.1963, "num_input_tokens_seen": 64604672, "step": 29955 }, { "epoch": 5.498256560836851, "grad_norm": 15.36375617980957, "learning_rate": 9.09672450962956e-06, "loss": 0.3301, "num_input_tokens_seen": 64615424, "step": 29960 }, { "epoch": 5.499174160396403, "grad_norm": 1.3789477348327637, "learning_rate": 9.096265382564105e-06, "loss": 0.3504, "num_input_tokens_seen": 64626176, "step": 29965 }, { "epoch": 5.500091759955955, "grad_norm": 8.082880020141602, "learning_rate": 9.095806150435796e-06, "loss": 0.2674, "num_input_tokens_seen": 64638592, "step": 29970 }, { "epoch": 5.501009359515507, "grad_norm": 3.2652194499969482, "learning_rate": 9.095346813256404e-06, "loss": 0.2435, "num_input_tokens_seen": 64650240, "step": 29975 }, { "epoch": 5.5019269590750595, "grad_norm": 7.752130508422852, "learning_rate": 9.094887371037713e-06, "loss": 0.2831, "num_input_tokens_seen": 64661920, "step": 29980 }, { "epoch": 5.502844558634612, "grad_norm": 11.950374603271484, "learning_rate": 9.09442782379151e-06, "loss": 0.3519, "num_input_tokens_seen": 64672864, "step": 29985 }, { "epoch": 5.503762158194164, "grad_norm": 9.491239547729492, "learning_rate": 9.093968171529578e-06, "loss": 0.2372, "num_input_tokens_seen": 64682048, "step": 29990 }, { "epoch": 5.504679757753716, "grad_norm": 0.8613620400428772, "learning_rate": 9.093508414263708e-06, "loss": 0.1908, "num_input_tokens_seen": 64693632, "step": 29995 }, { "epoch": 5.505597357313269, "grad_norm": 7.415971279144287, "learning_rate": 9.09304855200569e-06, "loss": 0.2119, "num_input_tokens_seen": 64704064, "step": 30000 }, { "epoch": 5.50651495687282, "grad_norm": 5.936736106872559, "learning_rate": 9.09258858476732e-06, "loss": 0.2964, "num_input_tokens_seen": 64715648, "step": 30005 }, { "epoch": 5.507432556432373, "grad_norm": 11.328234672546387, "learning_rate": 9.0921285125604e-06, "loss": 0.2836, "num_input_tokens_seen": 64726560, "step": 30010 }, { "epoch": 5.508350155991925, "grad_norm": 6.894524097442627, "learning_rate": 9.091668335396721e-06, "loss": 0.4053, "num_input_tokens_seen": 64737632, "step": 30015 }, { "epoch": 5.509267755551477, "grad_norm": 1.9008430242538452, "learning_rate": 9.091208053288093e-06, "loss": 0.1635, "num_input_tokens_seen": 64747904, "step": 30020 }, { "epoch": 5.510185355111029, "grad_norm": 3.961198329925537, "learning_rate": 9.090747666246319e-06, "loss": 0.156, "num_input_tokens_seen": 64758016, "step": 30025 }, { "epoch": 5.511102954670582, "grad_norm": 3.1066815853118896, "learning_rate": 9.090287174283208e-06, "loss": 0.232, "num_input_tokens_seen": 64768704, "step": 30030 }, { "epoch": 5.5120205542301335, "grad_norm": 10.490439414978027, "learning_rate": 9.08982657741057e-06, "loss": 0.236, "num_input_tokens_seen": 64779456, "step": 30035 }, { "epoch": 5.512938153789686, "grad_norm": 9.359946250915527, "learning_rate": 9.089365875640217e-06, "loss": 0.2122, "num_input_tokens_seen": 64791136, "step": 30040 }, { "epoch": 5.5138557533492385, "grad_norm": 8.360873222351074, "learning_rate": 9.088905068983968e-06, "loss": 0.3414, "num_input_tokens_seen": 64802048, "step": 30045 }, { "epoch": 5.51477335290879, "grad_norm": 1.2603248357772827, "learning_rate": 9.088444157453643e-06, "loss": 0.2081, "num_input_tokens_seen": 64812960, "step": 30050 }, { "epoch": 5.515690952468343, "grad_norm": 1.6647167205810547, "learning_rate": 9.08798314106106e-06, "loss": 0.2278, "num_input_tokens_seen": 64823968, "step": 30055 }, { "epoch": 5.516608552027895, "grad_norm": 1.0164008140563965, "learning_rate": 9.087522019818048e-06, "loss": 0.3874, "num_input_tokens_seen": 64834688, "step": 30060 }, { "epoch": 5.517526151587447, "grad_norm": 0.9414782524108887, "learning_rate": 9.087060793736428e-06, "loss": 0.1218, "num_input_tokens_seen": 64844448, "step": 30065 }, { "epoch": 5.518443751146999, "grad_norm": 6.327815055847168, "learning_rate": 9.086599462828034e-06, "loss": 0.3352, "num_input_tokens_seen": 64853216, "step": 30070 }, { "epoch": 5.519361350706552, "grad_norm": 0.4229185879230499, "learning_rate": 9.0861380271047e-06, "loss": 0.1817, "num_input_tokens_seen": 64864256, "step": 30075 }, { "epoch": 5.520278950266103, "grad_norm": 9.140106201171875, "learning_rate": 9.085676486578256e-06, "loss": 0.1954, "num_input_tokens_seen": 64875392, "step": 30080 }, { "epoch": 5.521196549825656, "grad_norm": 5.495337963104248, "learning_rate": 9.085214841260542e-06, "loss": 0.3569, "num_input_tokens_seen": 64886272, "step": 30085 }, { "epoch": 5.522114149385208, "grad_norm": 4.073338508605957, "learning_rate": 9.0847530911634e-06, "loss": 0.1626, "num_input_tokens_seen": 64896448, "step": 30090 }, { "epoch": 5.52303174894476, "grad_norm": 15.78303050994873, "learning_rate": 9.084291236298671e-06, "loss": 0.3804, "num_input_tokens_seen": 64907136, "step": 30095 }, { "epoch": 5.5239493485043125, "grad_norm": 12.335434913635254, "learning_rate": 9.083829276678202e-06, "loss": 0.3828, "num_input_tokens_seen": 64917312, "step": 30100 }, { "epoch": 5.524866948063865, "grad_norm": 19.23911476135254, "learning_rate": 9.083367212313843e-06, "loss": 0.1824, "num_input_tokens_seen": 64929184, "step": 30105 }, { "epoch": 5.525784547623417, "grad_norm": 6.03046178817749, "learning_rate": 9.082905043217443e-06, "loss": 0.3903, "num_input_tokens_seen": 64942048, "step": 30110 }, { "epoch": 5.526702147182969, "grad_norm": 17.237058639526367, "learning_rate": 9.082442769400854e-06, "loss": 0.4491, "num_input_tokens_seen": 64952512, "step": 30115 }, { "epoch": 5.527619746742522, "grad_norm": 6.7092814445495605, "learning_rate": 9.081980390875938e-06, "loss": 0.2595, "num_input_tokens_seen": 64962560, "step": 30120 }, { "epoch": 5.528537346302074, "grad_norm": 6.04686164855957, "learning_rate": 9.08151790765455e-06, "loss": 0.2328, "num_input_tokens_seen": 64973792, "step": 30125 }, { "epoch": 5.529454945861626, "grad_norm": 11.96484661102295, "learning_rate": 9.081055319748555e-06, "loss": 0.2365, "num_input_tokens_seen": 64985056, "step": 30130 }, { "epoch": 5.530372545421178, "grad_norm": 5.995643138885498, "learning_rate": 9.080592627169815e-06, "loss": 0.2044, "num_input_tokens_seen": 64997152, "step": 30135 }, { "epoch": 5.531290144980731, "grad_norm": 8.407464027404785, "learning_rate": 9.080129829930199e-06, "loss": 0.2873, "num_input_tokens_seen": 65007872, "step": 30140 }, { "epoch": 5.532207744540282, "grad_norm": 7.137453556060791, "learning_rate": 9.079666928041577e-06, "loss": 0.3738, "num_input_tokens_seen": 65018880, "step": 30145 }, { "epoch": 5.533125344099835, "grad_norm": 9.56857967376709, "learning_rate": 9.07920392151582e-06, "loss": 0.258, "num_input_tokens_seen": 65028608, "step": 30150 }, { "epoch": 5.534042943659387, "grad_norm": 18.699052810668945, "learning_rate": 9.078740810364806e-06, "loss": 0.2891, "num_input_tokens_seen": 65039712, "step": 30155 }, { "epoch": 5.534960543218939, "grad_norm": 6.435752868652344, "learning_rate": 9.07827759460041e-06, "loss": 0.3001, "num_input_tokens_seen": 65050112, "step": 30160 }, { "epoch": 5.5358781427784916, "grad_norm": 19.675819396972656, "learning_rate": 9.077814274234516e-06, "loss": 0.3914, "num_input_tokens_seen": 65060480, "step": 30165 }, { "epoch": 5.536795742338044, "grad_norm": 2.781442642211914, "learning_rate": 9.077350849279005e-06, "loss": 0.2969, "num_input_tokens_seen": 65071552, "step": 30170 }, { "epoch": 5.537713341897596, "grad_norm": 5.867318630218506, "learning_rate": 9.076887319745763e-06, "loss": 0.2684, "num_input_tokens_seen": 65083296, "step": 30175 }, { "epoch": 5.538630941457148, "grad_norm": 1.8652327060699463, "learning_rate": 9.07642368564668e-06, "loss": 0.2035, "num_input_tokens_seen": 65094016, "step": 30180 }, { "epoch": 5.539548541016701, "grad_norm": 2.71354603767395, "learning_rate": 9.075959946993649e-06, "loss": 0.1375, "num_input_tokens_seen": 65104416, "step": 30185 }, { "epoch": 5.540466140576252, "grad_norm": 1.2821458578109741, "learning_rate": 9.075496103798562e-06, "loss": 0.213, "num_input_tokens_seen": 65115584, "step": 30190 }, { "epoch": 5.541383740135805, "grad_norm": 12.907920837402344, "learning_rate": 9.075032156073316e-06, "loss": 0.3724, "num_input_tokens_seen": 65127552, "step": 30195 }, { "epoch": 5.542301339695357, "grad_norm": 9.194496154785156, "learning_rate": 9.074568103829812e-06, "loss": 0.1367, "num_input_tokens_seen": 65138208, "step": 30200 }, { "epoch": 5.543218939254909, "grad_norm": 3.6536223888397217, "learning_rate": 9.07410394707995e-06, "loss": 0.2986, "num_input_tokens_seen": 65150208, "step": 30205 }, { "epoch": 5.5441365388144614, "grad_norm": 7.725154399871826, "learning_rate": 9.073639685835636e-06, "loss": 0.3974, "num_input_tokens_seen": 65160992, "step": 30210 }, { "epoch": 5.545054138374014, "grad_norm": 7.729605674743652, "learning_rate": 9.073175320108779e-06, "loss": 0.3898, "num_input_tokens_seen": 65170976, "step": 30215 }, { "epoch": 5.545971737933566, "grad_norm": 8.549029350280762, "learning_rate": 9.072710849911287e-06, "loss": 0.3902, "num_input_tokens_seen": 65180384, "step": 30220 }, { "epoch": 5.546889337493118, "grad_norm": 5.588630199432373, "learning_rate": 9.072246275255073e-06, "loss": 0.2371, "num_input_tokens_seen": 65190912, "step": 30225 }, { "epoch": 5.547806937052671, "grad_norm": 3.640444040298462, "learning_rate": 9.071781596152054e-06, "loss": 0.456, "num_input_tokens_seen": 65201920, "step": 30230 }, { "epoch": 5.548724536612222, "grad_norm": 4.347113609313965, "learning_rate": 9.071316812614147e-06, "loss": 0.2395, "num_input_tokens_seen": 65212096, "step": 30235 }, { "epoch": 5.549642136171775, "grad_norm": 1.0265860557556152, "learning_rate": 9.070851924653275e-06, "loss": 0.1444, "num_input_tokens_seen": 65222720, "step": 30240 }, { "epoch": 5.550559735731327, "grad_norm": 8.328314781188965, "learning_rate": 9.07038693228136e-06, "loss": 0.3532, "num_input_tokens_seen": 65233664, "step": 30245 }, { "epoch": 5.551477335290879, "grad_norm": 4.826756000518799, "learning_rate": 9.06992183551033e-06, "loss": 0.2905, "num_input_tokens_seen": 65245120, "step": 30250 }, { "epoch": 5.552394934850431, "grad_norm": 1.4474811553955078, "learning_rate": 9.06945663435211e-06, "loss": 0.2565, "num_input_tokens_seen": 65255936, "step": 30255 }, { "epoch": 5.553312534409984, "grad_norm": 8.257841110229492, "learning_rate": 9.068991328818637e-06, "loss": 0.3621, "num_input_tokens_seen": 65265728, "step": 30260 }, { "epoch": 5.5542301339695355, "grad_norm": 2.9489448070526123, "learning_rate": 9.068525918921841e-06, "loss": 0.1262, "num_input_tokens_seen": 65276352, "step": 30265 }, { "epoch": 5.555147733529088, "grad_norm": 3.915461540222168, "learning_rate": 9.068060404673663e-06, "loss": 0.2747, "num_input_tokens_seen": 65285408, "step": 30270 }, { "epoch": 5.5560653330886405, "grad_norm": 2.281752109527588, "learning_rate": 9.067594786086038e-06, "loss": 0.2938, "num_input_tokens_seen": 65296448, "step": 30275 }, { "epoch": 5.556982932648192, "grad_norm": 0.6366900205612183, "learning_rate": 9.067129063170912e-06, "loss": 0.1594, "num_input_tokens_seen": 65306304, "step": 30280 }, { "epoch": 5.557900532207745, "grad_norm": 8.960639953613281, "learning_rate": 9.066663235940229e-06, "loss": 0.3792, "num_input_tokens_seen": 65317824, "step": 30285 }, { "epoch": 5.558818131767297, "grad_norm": 8.553919792175293, "learning_rate": 9.066197304405936e-06, "loss": 0.3094, "num_input_tokens_seen": 65328448, "step": 30290 }, { "epoch": 5.559735731326849, "grad_norm": 7.779367923736572, "learning_rate": 9.065731268579985e-06, "loss": 0.3176, "num_input_tokens_seen": 65338944, "step": 30295 }, { "epoch": 5.560653330886401, "grad_norm": 4.193559169769287, "learning_rate": 9.065265128474327e-06, "loss": 0.2962, "num_input_tokens_seen": 65349536, "step": 30300 }, { "epoch": 5.561570930445954, "grad_norm": 2.8578152656555176, "learning_rate": 9.064798884100921e-06, "loss": 0.2538, "num_input_tokens_seen": 65360704, "step": 30305 }, { "epoch": 5.562488530005505, "grad_norm": 16.74209213256836, "learning_rate": 9.064332535471723e-06, "loss": 0.2467, "num_input_tokens_seen": 65370592, "step": 30310 }, { "epoch": 5.563406129565058, "grad_norm": 10.132415771484375, "learning_rate": 9.063866082598694e-06, "loss": 0.304, "num_input_tokens_seen": 65382592, "step": 30315 }, { "epoch": 5.56432372912461, "grad_norm": 4.2907891273498535, "learning_rate": 9.063399525493798e-06, "loss": 0.1597, "num_input_tokens_seen": 65394336, "step": 30320 }, { "epoch": 5.565241328684162, "grad_norm": 8.440269470214844, "learning_rate": 9.062932864169003e-06, "loss": 0.1888, "num_input_tokens_seen": 65405056, "step": 30325 }, { "epoch": 5.5661589282437145, "grad_norm": 7.4088053703308105, "learning_rate": 9.062466098636277e-06, "loss": 0.235, "num_input_tokens_seen": 65415808, "step": 30330 }, { "epoch": 5.567076527803267, "grad_norm": 4.328369617462158, "learning_rate": 9.061999228907592e-06, "loss": 0.2719, "num_input_tokens_seen": 65426208, "step": 30335 }, { "epoch": 5.567994127362819, "grad_norm": 19.586612701416016, "learning_rate": 9.061532254994922e-06, "loss": 0.2735, "num_input_tokens_seen": 65438464, "step": 30340 }, { "epoch": 5.568911726922371, "grad_norm": 0.8619778752326965, "learning_rate": 9.061065176910244e-06, "loss": 0.2822, "num_input_tokens_seen": 65448928, "step": 30345 }, { "epoch": 5.569829326481924, "grad_norm": 10.513836860656738, "learning_rate": 9.06059799466554e-06, "loss": 0.1051, "num_input_tokens_seen": 65458560, "step": 30350 }, { "epoch": 5.570746926041475, "grad_norm": 5.351761817932129, "learning_rate": 9.060130708272788e-06, "loss": 0.325, "num_input_tokens_seen": 65469184, "step": 30355 }, { "epoch": 5.571664525601028, "grad_norm": 5.625452518463135, "learning_rate": 9.059663317743976e-06, "loss": 0.1915, "num_input_tokens_seen": 65479520, "step": 30360 }, { "epoch": 5.57258212516058, "grad_norm": 4.225148677825928, "learning_rate": 9.059195823091094e-06, "loss": 0.4632, "num_input_tokens_seen": 65489664, "step": 30365 }, { "epoch": 5.573499724720132, "grad_norm": 4.3521647453308105, "learning_rate": 9.058728224326129e-06, "loss": 0.3917, "num_input_tokens_seen": 65500800, "step": 30370 }, { "epoch": 5.574417324279684, "grad_norm": 5.394655227661133, "learning_rate": 9.058260521461075e-06, "loss": 0.2187, "num_input_tokens_seen": 65512064, "step": 30375 }, { "epoch": 5.575334923839237, "grad_norm": 9.035003662109375, "learning_rate": 9.05779271450793e-06, "loss": 0.2446, "num_input_tokens_seen": 65522528, "step": 30380 }, { "epoch": 5.5762525233987885, "grad_norm": 4.314700603485107, "learning_rate": 9.05732480347869e-06, "loss": 0.337, "num_input_tokens_seen": 65534208, "step": 30385 }, { "epoch": 5.577170122958341, "grad_norm": 8.855003356933594, "learning_rate": 9.056856788385358e-06, "loss": 0.1775, "num_input_tokens_seen": 65546016, "step": 30390 }, { "epoch": 5.5780877225178935, "grad_norm": 8.31261920928955, "learning_rate": 9.056388669239934e-06, "loss": 0.1552, "num_input_tokens_seen": 65557664, "step": 30395 }, { "epoch": 5.579005322077445, "grad_norm": 4.261944770812988, "learning_rate": 9.055920446054432e-06, "loss": 0.1248, "num_input_tokens_seen": 65568480, "step": 30400 }, { "epoch": 5.579922921636998, "grad_norm": 9.501677513122559, "learning_rate": 9.055452118840852e-06, "loss": 0.1925, "num_input_tokens_seen": 65579424, "step": 30405 }, { "epoch": 5.58084052119655, "grad_norm": 15.833301544189453, "learning_rate": 9.054983687611213e-06, "loss": 0.2534, "num_input_tokens_seen": 65590048, "step": 30410 }, { "epoch": 5.581758120756102, "grad_norm": 11.554676055908203, "learning_rate": 9.054515152377528e-06, "loss": 0.317, "num_input_tokens_seen": 65601216, "step": 30415 }, { "epoch": 5.582675720315654, "grad_norm": 4.342592239379883, "learning_rate": 9.054046513151813e-06, "loss": 0.2252, "num_input_tokens_seen": 65611648, "step": 30420 }, { "epoch": 5.583593319875207, "grad_norm": 2.545794725418091, "learning_rate": 9.053577769946088e-06, "loss": 0.2894, "num_input_tokens_seen": 65622880, "step": 30425 }, { "epoch": 5.584510919434758, "grad_norm": 8.211151123046875, "learning_rate": 9.053108922772374e-06, "loss": 0.4074, "num_input_tokens_seen": 65633856, "step": 30430 }, { "epoch": 5.585428518994311, "grad_norm": 1.0507718324661255, "learning_rate": 9.052639971642699e-06, "loss": 0.29, "num_input_tokens_seen": 65646048, "step": 30435 }, { "epoch": 5.586346118553863, "grad_norm": 13.820143699645996, "learning_rate": 9.052170916569088e-06, "loss": 0.3712, "num_input_tokens_seen": 65654144, "step": 30440 }, { "epoch": 5.587263718113415, "grad_norm": 9.400226593017578, "learning_rate": 9.051701757563575e-06, "loss": 0.2024, "num_input_tokens_seen": 65664608, "step": 30445 }, { "epoch": 5.5881813176729676, "grad_norm": 7.827149868011475, "learning_rate": 9.051232494638191e-06, "loss": 0.2684, "num_input_tokens_seen": 65675520, "step": 30450 }, { "epoch": 5.58909891723252, "grad_norm": 11.063289642333984, "learning_rate": 9.050763127804973e-06, "loss": 0.4176, "num_input_tokens_seen": 65686432, "step": 30455 }, { "epoch": 5.590016516792072, "grad_norm": 13.956883430480957, "learning_rate": 9.050293657075959e-06, "loss": 0.1688, "num_input_tokens_seen": 65696608, "step": 30460 }, { "epoch": 5.590934116351624, "grad_norm": 9.701118469238281, "learning_rate": 9.04982408246319e-06, "loss": 0.236, "num_input_tokens_seen": 65707840, "step": 30465 }, { "epoch": 5.591851715911177, "grad_norm": 5.119300842285156, "learning_rate": 9.04935440397871e-06, "loss": 0.3336, "num_input_tokens_seen": 65718240, "step": 30470 }, { "epoch": 5.592769315470728, "grad_norm": 2.0691773891448975, "learning_rate": 9.048884621634563e-06, "loss": 0.2874, "num_input_tokens_seen": 65728864, "step": 30475 }, { "epoch": 5.593686915030281, "grad_norm": 15.598081588745117, "learning_rate": 9.048414735442804e-06, "loss": 0.3072, "num_input_tokens_seen": 65740256, "step": 30480 }, { "epoch": 5.594604514589833, "grad_norm": 14.09078598022461, "learning_rate": 9.047944745415481e-06, "loss": 0.2226, "num_input_tokens_seen": 65750784, "step": 30485 }, { "epoch": 5.595522114149385, "grad_norm": 18.27501678466797, "learning_rate": 9.047474651564646e-06, "loss": 0.2321, "num_input_tokens_seen": 65760160, "step": 30490 }, { "epoch": 5.5964397137089374, "grad_norm": 11.062871932983398, "learning_rate": 9.047004453902364e-06, "loss": 0.477, "num_input_tokens_seen": 65771168, "step": 30495 }, { "epoch": 5.59735731326849, "grad_norm": 1.9425312280654907, "learning_rate": 9.046534152440687e-06, "loss": 0.1858, "num_input_tokens_seen": 65782368, "step": 30500 }, { "epoch": 5.598274912828042, "grad_norm": 2.5923311710357666, "learning_rate": 9.04606374719168e-06, "loss": 0.2991, "num_input_tokens_seen": 65793056, "step": 30505 }, { "epoch": 5.599192512387594, "grad_norm": 7.931814193725586, "learning_rate": 9.04559323816741e-06, "loss": 0.2613, "num_input_tokens_seen": 65804896, "step": 30510 }, { "epoch": 5.600110111947147, "grad_norm": 3.5354905128479004, "learning_rate": 9.045122625379944e-06, "loss": 0.2709, "num_input_tokens_seen": 65815424, "step": 30515 }, { "epoch": 5.601027711506698, "grad_norm": 4.648916721343994, "learning_rate": 9.04465190884135e-06, "loss": 0.2154, "num_input_tokens_seen": 65824800, "step": 30520 }, { "epoch": 5.601945311066251, "grad_norm": 1.9552937746047974, "learning_rate": 9.044181088563705e-06, "loss": 0.1865, "num_input_tokens_seen": 65835232, "step": 30525 }, { "epoch": 5.602862910625803, "grad_norm": 5.367273807525635, "learning_rate": 9.043710164559083e-06, "loss": 0.2742, "num_input_tokens_seen": 65846080, "step": 30530 }, { "epoch": 5.603780510185355, "grad_norm": 2.1535909175872803, "learning_rate": 9.043239136839562e-06, "loss": 0.1316, "num_input_tokens_seen": 65856736, "step": 30535 }, { "epoch": 5.604698109744907, "grad_norm": 9.508365631103516, "learning_rate": 9.042768005417225e-06, "loss": 0.1541, "num_input_tokens_seen": 65867328, "step": 30540 }, { "epoch": 5.60561570930446, "grad_norm": 8.354763984680176, "learning_rate": 9.042296770304151e-06, "loss": 0.1866, "num_input_tokens_seen": 65877024, "step": 30545 }, { "epoch": 5.6065333088640115, "grad_norm": 5.461110591888428, "learning_rate": 9.041825431512433e-06, "loss": 0.2921, "num_input_tokens_seen": 65888064, "step": 30550 }, { "epoch": 5.607450908423564, "grad_norm": 4.1876091957092285, "learning_rate": 9.041353989054156e-06, "loss": 0.4579, "num_input_tokens_seen": 65898880, "step": 30555 }, { "epoch": 5.6083685079831165, "grad_norm": 1.5812031030654907, "learning_rate": 9.040882442941412e-06, "loss": 0.1232, "num_input_tokens_seen": 65910752, "step": 30560 }, { "epoch": 5.609286107542668, "grad_norm": 11.171096801757812, "learning_rate": 9.040410793186297e-06, "loss": 0.403, "num_input_tokens_seen": 65920896, "step": 30565 }, { "epoch": 5.610203707102221, "grad_norm": 3.6874747276306152, "learning_rate": 9.039939039800907e-06, "loss": 0.5529, "num_input_tokens_seen": 65932480, "step": 30570 }, { "epoch": 5.611121306661773, "grad_norm": 9.348947525024414, "learning_rate": 9.039467182797342e-06, "loss": 0.4292, "num_input_tokens_seen": 65943520, "step": 30575 }, { "epoch": 5.612038906221325, "grad_norm": 4.738325595855713, "learning_rate": 9.038995222187703e-06, "loss": 0.1728, "num_input_tokens_seen": 65955328, "step": 30580 }, { "epoch": 5.612956505780877, "grad_norm": 6.089955806732178, "learning_rate": 9.038523157984099e-06, "loss": 0.2655, "num_input_tokens_seen": 65964928, "step": 30585 }, { "epoch": 5.61387410534043, "grad_norm": 16.192686080932617, "learning_rate": 9.038050990198633e-06, "loss": 0.2816, "num_input_tokens_seen": 65975072, "step": 30590 }, { "epoch": 5.614791704899981, "grad_norm": 5.68488073348999, "learning_rate": 9.037578718843418e-06, "loss": 0.358, "num_input_tokens_seen": 65985632, "step": 30595 }, { "epoch": 5.615709304459534, "grad_norm": 16.676618576049805, "learning_rate": 9.037106343930566e-06, "loss": 0.3113, "num_input_tokens_seen": 65996768, "step": 30600 }, { "epoch": 5.616626904019086, "grad_norm": 10.24699878692627, "learning_rate": 9.036633865472195e-06, "loss": 0.2603, "num_input_tokens_seen": 66008416, "step": 30605 }, { "epoch": 5.617544503578638, "grad_norm": 5.00665283203125, "learning_rate": 9.03616128348042e-06, "loss": 0.2562, "num_input_tokens_seen": 66018400, "step": 30610 }, { "epoch": 5.6184621031381905, "grad_norm": 2.261774778366089, "learning_rate": 9.035688597967364e-06, "loss": 0.2216, "num_input_tokens_seen": 66029248, "step": 30615 }, { "epoch": 5.619379702697743, "grad_norm": 8.09789752960205, "learning_rate": 9.03521580894515e-06, "loss": 0.3035, "num_input_tokens_seen": 66038816, "step": 30620 }, { "epoch": 5.620297302257295, "grad_norm": 3.3210363388061523, "learning_rate": 9.034742916425905e-06, "loss": 0.3606, "num_input_tokens_seen": 66049024, "step": 30625 }, { "epoch": 5.621214901816847, "grad_norm": 10.649744033813477, "learning_rate": 9.034269920421758e-06, "loss": 0.2131, "num_input_tokens_seen": 66059744, "step": 30630 }, { "epoch": 5.6221325013764, "grad_norm": 1.431130051612854, "learning_rate": 9.03379682094484e-06, "loss": 0.2798, "num_input_tokens_seen": 66071488, "step": 30635 }, { "epoch": 5.623050100935951, "grad_norm": 8.737886428833008, "learning_rate": 9.033323618007283e-06, "loss": 0.2734, "num_input_tokens_seen": 66081728, "step": 30640 }, { "epoch": 5.623967700495504, "grad_norm": 2.5189199447631836, "learning_rate": 9.032850311621229e-06, "loss": 0.2614, "num_input_tokens_seen": 66092992, "step": 30645 }, { "epoch": 5.624885300055056, "grad_norm": 5.147494792938232, "learning_rate": 9.032376901798814e-06, "loss": 0.3177, "num_input_tokens_seen": 66104608, "step": 30650 }, { "epoch": 5.625802899614608, "grad_norm": 4.5348405838012695, "learning_rate": 9.03190338855218e-06, "loss": 0.2433, "num_input_tokens_seen": 66115296, "step": 30655 }, { "epoch": 5.62672049917416, "grad_norm": 1.8780306577682495, "learning_rate": 9.031429771893473e-06, "loss": 0.3509, "num_input_tokens_seen": 66126592, "step": 30660 }, { "epoch": 5.627638098733713, "grad_norm": 7.266695976257324, "learning_rate": 9.030956051834842e-06, "loss": 0.2656, "num_input_tokens_seen": 66136352, "step": 30665 }, { "epoch": 5.6285556982932645, "grad_norm": 2.750152826309204, "learning_rate": 9.030482228388436e-06, "loss": 0.2446, "num_input_tokens_seen": 66146880, "step": 30670 }, { "epoch": 5.629473297852817, "grad_norm": 5.99625301361084, "learning_rate": 9.030008301566406e-06, "loss": 0.2351, "num_input_tokens_seen": 66158560, "step": 30675 }, { "epoch": 5.6303908974123695, "grad_norm": 5.09860897064209, "learning_rate": 9.029534271380912e-06, "loss": 0.3184, "num_input_tokens_seen": 66167232, "step": 30680 }, { "epoch": 5.631308496971921, "grad_norm": 4.804082870483398, "learning_rate": 9.029060137844106e-06, "loss": 0.2851, "num_input_tokens_seen": 66177312, "step": 30685 }, { "epoch": 5.632226096531474, "grad_norm": 4.357847690582275, "learning_rate": 9.028585900968152e-06, "loss": 0.2052, "num_input_tokens_seen": 66187712, "step": 30690 }, { "epoch": 5.633143696091026, "grad_norm": 5.605047702789307, "learning_rate": 9.028111560765214e-06, "loss": 0.2146, "num_input_tokens_seen": 66198048, "step": 30695 }, { "epoch": 5.634061295650578, "grad_norm": 5.046430587768555, "learning_rate": 9.027637117247459e-06, "loss": 0.2929, "num_input_tokens_seen": 66209344, "step": 30700 }, { "epoch": 5.63497889521013, "grad_norm": 6.465850830078125, "learning_rate": 9.027162570427052e-06, "loss": 0.3109, "num_input_tokens_seen": 66220608, "step": 30705 }, { "epoch": 5.635896494769683, "grad_norm": 1.9087404012680054, "learning_rate": 9.026687920316168e-06, "loss": 0.2387, "num_input_tokens_seen": 66231296, "step": 30710 }, { "epoch": 5.636814094329234, "grad_norm": 1.2358120679855347, "learning_rate": 9.026213166926977e-06, "loss": 0.2314, "num_input_tokens_seen": 66242336, "step": 30715 }, { "epoch": 5.637731693888787, "grad_norm": 3.082172155380249, "learning_rate": 9.025738310271663e-06, "loss": 0.18, "num_input_tokens_seen": 66252832, "step": 30720 }, { "epoch": 5.638649293448339, "grad_norm": 6.9131693840026855, "learning_rate": 9.025263350362397e-06, "loss": 0.1945, "num_input_tokens_seen": 66263424, "step": 30725 }, { "epoch": 5.639566893007891, "grad_norm": 5.6552581787109375, "learning_rate": 9.024788287211365e-06, "loss": 0.2311, "num_input_tokens_seen": 66274208, "step": 30730 }, { "epoch": 5.6404844925674436, "grad_norm": 8.252635955810547, "learning_rate": 9.024313120830754e-06, "loss": 0.1909, "num_input_tokens_seen": 66284160, "step": 30735 }, { "epoch": 5.641402092126996, "grad_norm": 2.5109124183654785, "learning_rate": 9.023837851232746e-06, "loss": 0.2122, "num_input_tokens_seen": 66296288, "step": 30740 }, { "epoch": 5.642319691686548, "grad_norm": 13.740878105163574, "learning_rate": 9.023362478429533e-06, "loss": 0.2538, "num_input_tokens_seen": 66305888, "step": 30745 }, { "epoch": 5.6432372912461, "grad_norm": 1.662726879119873, "learning_rate": 9.02288700243331e-06, "loss": 0.1159, "num_input_tokens_seen": 66316576, "step": 30750 }, { "epoch": 5.644154890805653, "grad_norm": 15.953649520874023, "learning_rate": 9.02241142325627e-06, "loss": 0.2304, "num_input_tokens_seen": 66327424, "step": 30755 }, { "epoch": 5.645072490365204, "grad_norm": 12.874183654785156, "learning_rate": 9.02193574091061e-06, "loss": 0.3053, "num_input_tokens_seen": 66338144, "step": 30760 }, { "epoch": 5.645990089924757, "grad_norm": 13.905436515808105, "learning_rate": 9.021459955408532e-06, "loss": 0.2622, "num_input_tokens_seen": 66349952, "step": 30765 }, { "epoch": 5.646907689484309, "grad_norm": 5.180163383483887, "learning_rate": 9.020984066762239e-06, "loss": 0.2062, "num_input_tokens_seen": 66360480, "step": 30770 }, { "epoch": 5.647825289043861, "grad_norm": 4.932943820953369, "learning_rate": 9.020508074983939e-06, "loss": 0.3255, "num_input_tokens_seen": 66372384, "step": 30775 }, { "epoch": 5.6487428886034134, "grad_norm": 13.640936851501465, "learning_rate": 9.020031980085835e-06, "loss": 0.3715, "num_input_tokens_seen": 66384000, "step": 30780 }, { "epoch": 5.649660488162966, "grad_norm": 10.005221366882324, "learning_rate": 9.019555782080143e-06, "loss": 0.355, "num_input_tokens_seen": 66394400, "step": 30785 }, { "epoch": 5.650578087722518, "grad_norm": 6.377419471740723, "learning_rate": 9.019079480979074e-06, "loss": 0.2267, "num_input_tokens_seen": 66405216, "step": 30790 }, { "epoch": 5.65149568728207, "grad_norm": 5.399049282073975, "learning_rate": 9.018603076794845e-06, "loss": 0.2242, "num_input_tokens_seen": 66414464, "step": 30795 }, { "epoch": 5.652413286841623, "grad_norm": 6.638850212097168, "learning_rate": 9.018126569539675e-06, "loss": 0.2171, "num_input_tokens_seen": 66425600, "step": 30800 }, { "epoch": 5.653330886401174, "grad_norm": 4.153999328613281, "learning_rate": 9.017649959225787e-06, "loss": 0.4377, "num_input_tokens_seen": 66435552, "step": 30805 }, { "epoch": 5.654248485960727, "grad_norm": 7.732017993927002, "learning_rate": 9.017173245865404e-06, "loss": 0.2996, "num_input_tokens_seen": 66446560, "step": 30810 }, { "epoch": 5.655166085520279, "grad_norm": 8.090109825134277, "learning_rate": 9.016696429470753e-06, "loss": 0.1079, "num_input_tokens_seen": 66457408, "step": 30815 }, { "epoch": 5.656083685079831, "grad_norm": 8.934765815734863, "learning_rate": 9.016219510054064e-06, "loss": 0.2379, "num_input_tokens_seen": 66466816, "step": 30820 }, { "epoch": 5.657001284639383, "grad_norm": 1.023118019104004, "learning_rate": 9.01574248762757e-06, "loss": 0.1254, "num_input_tokens_seen": 66478176, "step": 30825 }, { "epoch": 5.657918884198936, "grad_norm": 2.0820631980895996, "learning_rate": 9.015265362203505e-06, "loss": 0.2347, "num_input_tokens_seen": 66489120, "step": 30830 }, { "epoch": 5.6588364837584875, "grad_norm": 6.376258373260498, "learning_rate": 9.014788133794105e-06, "loss": 0.1236, "num_input_tokens_seen": 66498816, "step": 30835 }, { "epoch": 5.65975408331804, "grad_norm": 10.6781005859375, "learning_rate": 9.014310802411613e-06, "loss": 0.2775, "num_input_tokens_seen": 66509856, "step": 30840 }, { "epoch": 5.6606716828775925, "grad_norm": 7.528177738189697, "learning_rate": 9.01383336806827e-06, "loss": 0.4141, "num_input_tokens_seen": 66520544, "step": 30845 }, { "epoch": 5.661589282437144, "grad_norm": 1.4278786182403564, "learning_rate": 9.013355830776323e-06, "loss": 0.2746, "num_input_tokens_seen": 66532224, "step": 30850 }, { "epoch": 5.662506881996697, "grad_norm": 0.9971939325332642, "learning_rate": 9.012878190548018e-06, "loss": 0.3699, "num_input_tokens_seen": 66544160, "step": 30855 }, { "epoch": 5.663424481556249, "grad_norm": 3.539780855178833, "learning_rate": 9.012400447395607e-06, "loss": 0.2882, "num_input_tokens_seen": 66555008, "step": 30860 }, { "epoch": 5.664342081115801, "grad_norm": 2.936652660369873, "learning_rate": 9.011922601331345e-06, "loss": 0.218, "num_input_tokens_seen": 66565984, "step": 30865 }, { "epoch": 5.665259680675353, "grad_norm": 4.462067127227783, "learning_rate": 9.011444652367483e-06, "loss": 0.4852, "num_input_tokens_seen": 66576576, "step": 30870 }, { "epoch": 5.666177280234906, "grad_norm": 12.335482597351074, "learning_rate": 9.010966600516284e-06, "loss": 0.4105, "num_input_tokens_seen": 66587968, "step": 30875 }, { "epoch": 5.667094879794457, "grad_norm": 0.34413042664527893, "learning_rate": 9.010488445790008e-06, "loss": 0.1986, "num_input_tokens_seen": 66598880, "step": 30880 }, { "epoch": 5.66801247935401, "grad_norm": 6.5009074211120605, "learning_rate": 9.010010188200922e-06, "loss": 0.5005, "num_input_tokens_seen": 66609056, "step": 30885 }, { "epoch": 5.668930078913562, "grad_norm": 7.668620586395264, "learning_rate": 9.009531827761286e-06, "loss": 0.52, "num_input_tokens_seen": 66620064, "step": 30890 }, { "epoch": 5.669847678473114, "grad_norm": 5.248694896697998, "learning_rate": 9.009053364483374e-06, "loss": 0.2487, "num_input_tokens_seen": 66630464, "step": 30895 }, { "epoch": 5.6707652780326665, "grad_norm": 6.47257137298584, "learning_rate": 9.008574798379457e-06, "loss": 0.2332, "num_input_tokens_seen": 66640544, "step": 30900 }, { "epoch": 5.671682877592219, "grad_norm": 8.922889709472656, "learning_rate": 9.008096129461808e-06, "loss": 0.2128, "num_input_tokens_seen": 66649984, "step": 30905 }, { "epoch": 5.672600477151771, "grad_norm": 9.771597862243652, "learning_rate": 9.007617357742707e-06, "loss": 0.2758, "num_input_tokens_seen": 66661248, "step": 30910 }, { "epoch": 5.673518076711323, "grad_norm": 6.449770450592041, "learning_rate": 9.00713848323443e-06, "loss": 0.2803, "num_input_tokens_seen": 66670272, "step": 30915 }, { "epoch": 5.674435676270876, "grad_norm": 10.577116966247559, "learning_rate": 9.006659505949264e-06, "loss": 0.1706, "num_input_tokens_seen": 66682112, "step": 30920 }, { "epoch": 5.675353275830427, "grad_norm": 7.895167827606201, "learning_rate": 9.00618042589949e-06, "loss": 0.3742, "num_input_tokens_seen": 66692192, "step": 30925 }, { "epoch": 5.67627087538998, "grad_norm": 0.8394784331321716, "learning_rate": 9.005701243097397e-06, "loss": 0.2067, "num_input_tokens_seen": 66701568, "step": 30930 }, { "epoch": 5.677188474949532, "grad_norm": 13.948771476745605, "learning_rate": 9.005221957555274e-06, "loss": 0.265, "num_input_tokens_seen": 66713184, "step": 30935 }, { "epoch": 5.678106074509084, "grad_norm": 8.939054489135742, "learning_rate": 9.004742569285418e-06, "loss": 0.215, "num_input_tokens_seen": 66725184, "step": 30940 }, { "epoch": 5.679023674068636, "grad_norm": 12.862039566040039, "learning_rate": 9.00426307830012e-06, "loss": 0.3761, "num_input_tokens_seen": 66734944, "step": 30945 }, { "epoch": 5.679941273628189, "grad_norm": 8.365309715270996, "learning_rate": 9.003783484611681e-06, "loss": 0.2435, "num_input_tokens_seen": 66745984, "step": 30950 }, { "epoch": 5.6808588731877405, "grad_norm": 3.728210687637329, "learning_rate": 9.0033037882324e-06, "loss": 0.2236, "num_input_tokens_seen": 66757152, "step": 30955 }, { "epoch": 5.681776472747293, "grad_norm": 0.7314704656600952, "learning_rate": 9.002823989174582e-06, "loss": 0.2784, "num_input_tokens_seen": 66768384, "step": 30960 }, { "epoch": 5.6826940723068455, "grad_norm": 7.235638618469238, "learning_rate": 9.002344087450535e-06, "loss": 0.2667, "num_input_tokens_seen": 66778976, "step": 30965 }, { "epoch": 5.683611671866397, "grad_norm": 1.8492740392684937, "learning_rate": 9.00186408307256e-06, "loss": 0.1609, "num_input_tokens_seen": 66790240, "step": 30970 }, { "epoch": 5.68452927142595, "grad_norm": 7.839957237243652, "learning_rate": 9.001383976052977e-06, "loss": 0.2489, "num_input_tokens_seen": 66800896, "step": 30975 }, { "epoch": 5.685446870985502, "grad_norm": 14.708844184875488, "learning_rate": 9.000903766404097e-06, "loss": 0.1832, "num_input_tokens_seen": 66812032, "step": 30980 }, { "epoch": 5.686364470545054, "grad_norm": 5.389851093292236, "learning_rate": 9.000423454138235e-06, "loss": 0.3203, "num_input_tokens_seen": 66821984, "step": 30985 }, { "epoch": 5.687282070104606, "grad_norm": 19.398765563964844, "learning_rate": 8.999943039267711e-06, "loss": 0.2851, "num_input_tokens_seen": 66833664, "step": 30990 }, { "epoch": 5.688199669664159, "grad_norm": 3.7302587032318115, "learning_rate": 8.999462521804849e-06, "loss": 0.1979, "num_input_tokens_seen": 66845024, "step": 30995 }, { "epoch": 5.68911726922371, "grad_norm": 7.592618465423584, "learning_rate": 8.998981901761971e-06, "loss": 0.3839, "num_input_tokens_seen": 66855584, "step": 31000 }, { "epoch": 5.690034868783263, "grad_norm": 19.55974578857422, "learning_rate": 8.998501179151405e-06, "loss": 0.2013, "num_input_tokens_seen": 66867136, "step": 31005 }, { "epoch": 5.690952468342815, "grad_norm": 2.9050345420837402, "learning_rate": 8.998020353985481e-06, "loss": 0.2647, "num_input_tokens_seen": 66878816, "step": 31010 }, { "epoch": 5.691870067902367, "grad_norm": 12.49510669708252, "learning_rate": 8.997539426276532e-06, "loss": 0.5282, "num_input_tokens_seen": 66890560, "step": 31015 }, { "epoch": 5.6927876674619196, "grad_norm": 5.775964736938477, "learning_rate": 8.997058396036891e-06, "loss": 0.297, "num_input_tokens_seen": 66901856, "step": 31020 }, { "epoch": 5.693705267021472, "grad_norm": 3.478684186935425, "learning_rate": 8.996577263278897e-06, "loss": 0.3815, "num_input_tokens_seen": 66911872, "step": 31025 }, { "epoch": 5.694622866581024, "grad_norm": 3.338038206100464, "learning_rate": 8.99609602801489e-06, "loss": 0.2985, "num_input_tokens_seen": 66921792, "step": 31030 }, { "epoch": 5.695540466140576, "grad_norm": 1.1565427780151367, "learning_rate": 8.995614690257216e-06, "loss": 0.2046, "num_input_tokens_seen": 66932032, "step": 31035 }, { "epoch": 5.696458065700129, "grad_norm": 7.063148498535156, "learning_rate": 8.995133250018215e-06, "loss": 0.3018, "num_input_tokens_seen": 66942144, "step": 31040 }, { "epoch": 5.69737566525968, "grad_norm": 0.6327551603317261, "learning_rate": 8.994651707310241e-06, "loss": 0.3666, "num_input_tokens_seen": 66953344, "step": 31045 }, { "epoch": 5.698293264819233, "grad_norm": 6.043720722198486, "learning_rate": 8.994170062145639e-06, "loss": 0.3517, "num_input_tokens_seen": 66963200, "step": 31050 }, { "epoch": 5.699210864378785, "grad_norm": 7.4965081214904785, "learning_rate": 8.993688314536766e-06, "loss": 0.1935, "num_input_tokens_seen": 66974464, "step": 31055 }, { "epoch": 5.700128463938337, "grad_norm": 1.0698493719100952, "learning_rate": 8.993206464495979e-06, "loss": 0.3368, "num_input_tokens_seen": 66985856, "step": 31060 }, { "epoch": 5.7010460634978894, "grad_norm": 6.40511417388916, "learning_rate": 8.992724512035632e-06, "loss": 0.3067, "num_input_tokens_seen": 66995232, "step": 31065 }, { "epoch": 5.701963663057442, "grad_norm": 3.407414436340332, "learning_rate": 8.992242457168091e-06, "loss": 0.2042, "num_input_tokens_seen": 67004736, "step": 31070 }, { "epoch": 5.702881262616994, "grad_norm": 17.74281120300293, "learning_rate": 8.991760299905718e-06, "loss": 0.3622, "num_input_tokens_seen": 67016512, "step": 31075 }, { "epoch": 5.703798862176546, "grad_norm": 18.927602767944336, "learning_rate": 8.991278040260882e-06, "loss": 0.3596, "num_input_tokens_seen": 67027200, "step": 31080 }, { "epoch": 5.704716461736099, "grad_norm": 5.771481513977051, "learning_rate": 8.990795678245949e-06, "loss": 0.1948, "num_input_tokens_seen": 67037056, "step": 31085 }, { "epoch": 5.70563406129565, "grad_norm": 2.9044547080993652, "learning_rate": 8.990313213873291e-06, "loss": 0.2132, "num_input_tokens_seen": 67047648, "step": 31090 }, { "epoch": 5.706551660855203, "grad_norm": 4.2110443115234375, "learning_rate": 8.989830647155285e-06, "loss": 0.2936, "num_input_tokens_seen": 67059456, "step": 31095 }, { "epoch": 5.707469260414755, "grad_norm": 4.773096084594727, "learning_rate": 8.989347978104305e-06, "loss": 0.2687, "num_input_tokens_seen": 67070752, "step": 31100 }, { "epoch": 5.708386859974307, "grad_norm": 10.776630401611328, "learning_rate": 8.988865206732733e-06, "loss": 0.1239, "num_input_tokens_seen": 67080544, "step": 31105 }, { "epoch": 5.709304459533859, "grad_norm": 8.699507713317871, "learning_rate": 8.988382333052951e-06, "loss": 0.3728, "num_input_tokens_seen": 67090944, "step": 31110 }, { "epoch": 5.710222059093412, "grad_norm": 4.717658042907715, "learning_rate": 8.987899357077343e-06, "loss": 0.198, "num_input_tokens_seen": 67101888, "step": 31115 }, { "epoch": 5.7111396586529635, "grad_norm": 10.242571830749512, "learning_rate": 8.987416278818298e-06, "loss": 0.2322, "num_input_tokens_seen": 67112576, "step": 31120 }, { "epoch": 5.712057258212516, "grad_norm": 14.693575859069824, "learning_rate": 8.986933098288205e-06, "loss": 0.271, "num_input_tokens_seen": 67124256, "step": 31125 }, { "epoch": 5.7129748577720685, "grad_norm": 11.137954711914062, "learning_rate": 8.986449815499456e-06, "loss": 0.338, "num_input_tokens_seen": 67135520, "step": 31130 }, { "epoch": 5.71389245733162, "grad_norm": 9.583542823791504, "learning_rate": 8.98596643046445e-06, "loss": 0.3182, "num_input_tokens_seen": 67146304, "step": 31135 }, { "epoch": 5.714810056891173, "grad_norm": 4.256266117095947, "learning_rate": 8.985482943195581e-06, "loss": 0.1381, "num_input_tokens_seen": 67156672, "step": 31140 }, { "epoch": 5.715727656450725, "grad_norm": 9.279648780822754, "learning_rate": 8.984999353705252e-06, "loss": 0.3778, "num_input_tokens_seen": 67166784, "step": 31145 }, { "epoch": 5.716645256010277, "grad_norm": 4.057147979736328, "learning_rate": 8.984515662005865e-06, "loss": 0.3907, "num_input_tokens_seen": 67176672, "step": 31150 }, { "epoch": 5.717562855569829, "grad_norm": 9.935009956359863, "learning_rate": 8.984031868109828e-06, "loss": 0.3024, "num_input_tokens_seen": 67186528, "step": 31155 }, { "epoch": 5.718480455129382, "grad_norm": 6.1743340492248535, "learning_rate": 8.983547972029547e-06, "loss": 0.1706, "num_input_tokens_seen": 67196832, "step": 31160 }, { "epoch": 5.719398054688933, "grad_norm": 6.600044250488281, "learning_rate": 8.983063973777436e-06, "loss": 0.2584, "num_input_tokens_seen": 67206208, "step": 31165 }, { "epoch": 5.720315654248486, "grad_norm": 3.232367515563965, "learning_rate": 8.982579873365906e-06, "loss": 0.2499, "num_input_tokens_seen": 67216896, "step": 31170 }, { "epoch": 5.721233253808038, "grad_norm": 4.722378730773926, "learning_rate": 8.982095670807376e-06, "loss": 0.3406, "num_input_tokens_seen": 67226784, "step": 31175 }, { "epoch": 5.72215085336759, "grad_norm": 5.958011150360107, "learning_rate": 8.981611366114263e-06, "loss": 0.3291, "num_input_tokens_seen": 67237184, "step": 31180 }, { "epoch": 5.7230684529271425, "grad_norm": 4.112259387969971, "learning_rate": 8.981126959298988e-06, "loss": 0.2468, "num_input_tokens_seen": 67246560, "step": 31185 }, { "epoch": 5.723986052486695, "grad_norm": 4.7269816398620605, "learning_rate": 8.980642450373977e-06, "loss": 0.2534, "num_input_tokens_seen": 67256480, "step": 31190 }, { "epoch": 5.7249036520462475, "grad_norm": 3.4070892333984375, "learning_rate": 8.98015783935166e-06, "loss": 0.2294, "num_input_tokens_seen": 67267776, "step": 31195 }, { "epoch": 5.725821251605799, "grad_norm": 3.635256052017212, "learning_rate": 8.97967312624446e-06, "loss": 0.3649, "num_input_tokens_seen": 67278976, "step": 31200 }, { "epoch": 5.726738851165352, "grad_norm": 6.703744888305664, "learning_rate": 8.979188311064812e-06, "loss": 0.1486, "num_input_tokens_seen": 67289888, "step": 31205 }, { "epoch": 5.727656450724904, "grad_norm": 4.196514129638672, "learning_rate": 8.978703393825152e-06, "loss": 0.2891, "num_input_tokens_seen": 67299840, "step": 31210 }, { "epoch": 5.728574050284456, "grad_norm": 3.0621302127838135, "learning_rate": 8.978218374537917e-06, "loss": 0.2216, "num_input_tokens_seen": 67310784, "step": 31215 }, { "epoch": 5.729491649844008, "grad_norm": 6.99836540222168, "learning_rate": 8.977733253215545e-06, "loss": 0.2976, "num_input_tokens_seen": 67321472, "step": 31220 }, { "epoch": 5.730409249403561, "grad_norm": 5.453486442565918, "learning_rate": 8.97724802987048e-06, "loss": 0.1774, "num_input_tokens_seen": 67331712, "step": 31225 }, { "epoch": 5.731326848963112, "grad_norm": 7.952020645141602, "learning_rate": 8.97676270451517e-06, "loss": 0.4965, "num_input_tokens_seen": 67341728, "step": 31230 }, { "epoch": 5.732244448522665, "grad_norm": 6.838696479797363, "learning_rate": 8.976277277162055e-06, "loss": 0.2561, "num_input_tokens_seen": 67353120, "step": 31235 }, { "epoch": 5.733162048082217, "grad_norm": 9.1903657913208, "learning_rate": 8.975791747823595e-06, "loss": 0.1874, "num_input_tokens_seen": 67364064, "step": 31240 }, { "epoch": 5.734079647641769, "grad_norm": 2.0511093139648438, "learning_rate": 8.975306116512236e-06, "loss": 0.4356, "num_input_tokens_seen": 67374688, "step": 31245 }, { "epoch": 5.7349972472013215, "grad_norm": 6.15565824508667, "learning_rate": 8.974820383240439e-06, "loss": 0.3068, "num_input_tokens_seen": 67385440, "step": 31250 }, { "epoch": 5.735914846760874, "grad_norm": 4.4474687576293945, "learning_rate": 8.974334548020657e-06, "loss": 0.2088, "num_input_tokens_seen": 67396320, "step": 31255 }, { "epoch": 5.736832446320426, "grad_norm": 5.6132893562316895, "learning_rate": 8.973848610865354e-06, "loss": 0.261, "num_input_tokens_seen": 67406240, "step": 31260 }, { "epoch": 5.737750045879978, "grad_norm": 10.342777252197266, "learning_rate": 8.973362571786993e-06, "loss": 0.266, "num_input_tokens_seen": 67417568, "step": 31265 }, { "epoch": 5.738667645439531, "grad_norm": 6.198208808898926, "learning_rate": 8.972876430798041e-06, "loss": 0.3072, "num_input_tokens_seen": 67428640, "step": 31270 }, { "epoch": 5.739585244999082, "grad_norm": 1.0124770402908325, "learning_rate": 8.972390187910966e-06, "loss": 0.2018, "num_input_tokens_seen": 67438848, "step": 31275 }, { "epoch": 5.740502844558635, "grad_norm": 10.25199031829834, "learning_rate": 8.971903843138238e-06, "loss": 0.206, "num_input_tokens_seen": 67449536, "step": 31280 }, { "epoch": 5.741420444118187, "grad_norm": 1.1036618947982788, "learning_rate": 8.971417396492333e-06, "loss": 0.1557, "num_input_tokens_seen": 67460480, "step": 31285 }, { "epoch": 5.742338043677739, "grad_norm": 5.804757595062256, "learning_rate": 8.970930847985727e-06, "loss": 0.2489, "num_input_tokens_seen": 67471456, "step": 31290 }, { "epoch": 5.743255643237291, "grad_norm": 9.564481735229492, "learning_rate": 8.970444197630898e-06, "loss": 0.2071, "num_input_tokens_seen": 67481920, "step": 31295 }, { "epoch": 5.744173242796844, "grad_norm": 11.232348442077637, "learning_rate": 8.969957445440332e-06, "loss": 0.2262, "num_input_tokens_seen": 67493728, "step": 31300 }, { "epoch": 5.7450908423563956, "grad_norm": 1.2256039381027222, "learning_rate": 8.969470591426507e-06, "loss": 0.2341, "num_input_tokens_seen": 67502976, "step": 31305 }, { "epoch": 5.746008441915948, "grad_norm": 4.437829494476318, "learning_rate": 8.968983635601916e-06, "loss": 0.3136, "num_input_tokens_seen": 67514464, "step": 31310 }, { "epoch": 5.746926041475501, "grad_norm": 8.801020622253418, "learning_rate": 8.968496577979045e-06, "loss": 0.1847, "num_input_tokens_seen": 67524800, "step": 31315 }, { "epoch": 5.747843641035052, "grad_norm": 8.59488582611084, "learning_rate": 8.968009418570388e-06, "loss": 0.3958, "num_input_tokens_seen": 67535680, "step": 31320 }, { "epoch": 5.748761240594605, "grad_norm": 16.740428924560547, "learning_rate": 8.967522157388439e-06, "loss": 0.2903, "num_input_tokens_seen": 67546816, "step": 31325 }, { "epoch": 5.749678840154157, "grad_norm": 6.9402079582214355, "learning_rate": 8.967034794445695e-06, "loss": 0.3908, "num_input_tokens_seen": 67557152, "step": 31330 }, { "epoch": 5.750596439713709, "grad_norm": 12.712435722351074, "learning_rate": 8.966547329754658e-06, "loss": 0.2246, "num_input_tokens_seen": 67568256, "step": 31335 }, { "epoch": 5.751514039273261, "grad_norm": 0.9004251956939697, "learning_rate": 8.966059763327828e-06, "loss": 0.152, "num_input_tokens_seen": 67578560, "step": 31340 }, { "epoch": 5.752431638832814, "grad_norm": 9.490478515625, "learning_rate": 8.965572095177714e-06, "loss": 0.2899, "num_input_tokens_seen": 67589696, "step": 31345 }, { "epoch": 5.7533492383923655, "grad_norm": 14.4307861328125, "learning_rate": 8.96508432531682e-06, "loss": 0.3725, "num_input_tokens_seen": 67600032, "step": 31350 }, { "epoch": 5.754266837951918, "grad_norm": 5.761959552764893, "learning_rate": 8.964596453757659e-06, "loss": 0.2972, "num_input_tokens_seen": 67611776, "step": 31355 }, { "epoch": 5.7551844375114705, "grad_norm": 4.792989253997803, "learning_rate": 8.964108480512744e-06, "loss": 0.2918, "num_input_tokens_seen": 67623968, "step": 31360 }, { "epoch": 5.756102037071022, "grad_norm": 2.9831719398498535, "learning_rate": 8.963620405594591e-06, "loss": 0.4095, "num_input_tokens_seen": 67634752, "step": 31365 }, { "epoch": 5.757019636630575, "grad_norm": 11.176779747009277, "learning_rate": 8.963132229015716e-06, "loss": 0.2225, "num_input_tokens_seen": 67645056, "step": 31370 }, { "epoch": 5.757937236190127, "grad_norm": 8.890013694763184, "learning_rate": 8.962643950788643e-06, "loss": 0.2451, "num_input_tokens_seen": 67657024, "step": 31375 }, { "epoch": 5.758854835749679, "grad_norm": 9.024591445922852, "learning_rate": 8.962155570925892e-06, "loss": 0.2726, "num_input_tokens_seen": 67668160, "step": 31380 }, { "epoch": 5.759772435309231, "grad_norm": 4.402358531951904, "learning_rate": 8.961667089439993e-06, "loss": 0.2992, "num_input_tokens_seen": 67679648, "step": 31385 }, { "epoch": 5.760690034868784, "grad_norm": 9.780187606811523, "learning_rate": 8.961178506343473e-06, "loss": 0.2514, "num_input_tokens_seen": 67691776, "step": 31390 }, { "epoch": 5.761607634428335, "grad_norm": 2.1962692737579346, "learning_rate": 8.960689821648864e-06, "loss": 0.2869, "num_input_tokens_seen": 67701632, "step": 31395 }, { "epoch": 5.762525233987888, "grad_norm": 1.2168725728988647, "learning_rate": 8.960201035368698e-06, "loss": 0.2055, "num_input_tokens_seen": 67712768, "step": 31400 }, { "epoch": 5.76344283354744, "grad_norm": 8.261598587036133, "learning_rate": 8.959712147515515e-06, "loss": 0.1772, "num_input_tokens_seen": 67723488, "step": 31405 }, { "epoch": 5.764360433106992, "grad_norm": 8.107905387878418, "learning_rate": 8.959223158101852e-06, "loss": 0.3674, "num_input_tokens_seen": 67733952, "step": 31410 }, { "epoch": 5.7652780326665445, "grad_norm": 9.130719184875488, "learning_rate": 8.95873406714025e-06, "loss": 0.1498, "num_input_tokens_seen": 67744800, "step": 31415 }, { "epoch": 5.766195632226097, "grad_norm": 13.093875885009766, "learning_rate": 8.958244874643255e-06, "loss": 0.3697, "num_input_tokens_seen": 67756736, "step": 31420 }, { "epoch": 5.767113231785649, "grad_norm": 0.4083816111087799, "learning_rate": 8.957755580623416e-06, "loss": 0.0878, "num_input_tokens_seen": 67768480, "step": 31425 }, { "epoch": 5.768030831345201, "grad_norm": 8.507538795471191, "learning_rate": 8.957266185093279e-06, "loss": 0.4767, "num_input_tokens_seen": 67778880, "step": 31430 }, { "epoch": 5.768948430904754, "grad_norm": 11.70125675201416, "learning_rate": 8.956776688065397e-06, "loss": 0.2403, "num_input_tokens_seen": 67789280, "step": 31435 }, { "epoch": 5.769866030464305, "grad_norm": 2.746734142303467, "learning_rate": 8.956287089552325e-06, "loss": 0.1837, "num_input_tokens_seen": 67800736, "step": 31440 }, { "epoch": 5.770783630023858, "grad_norm": 22.429994583129883, "learning_rate": 8.955797389566622e-06, "loss": 0.3494, "num_input_tokens_seen": 67811872, "step": 31445 }, { "epoch": 5.77170122958341, "grad_norm": 5.467024326324463, "learning_rate": 8.955307588120847e-06, "loss": 0.369, "num_input_tokens_seen": 67821920, "step": 31450 }, { "epoch": 5.772618829142962, "grad_norm": 1.0101269483566284, "learning_rate": 8.954817685227561e-06, "loss": 0.2188, "num_input_tokens_seen": 67831488, "step": 31455 }, { "epoch": 5.773536428702514, "grad_norm": 10.371167182922363, "learning_rate": 8.954327680899333e-06, "loss": 0.1327, "num_input_tokens_seen": 67841504, "step": 31460 }, { "epoch": 5.774454028262067, "grad_norm": 6.4214348793029785, "learning_rate": 8.953837575148726e-06, "loss": 0.3407, "num_input_tokens_seen": 67852544, "step": 31465 }, { "epoch": 5.7753716278216185, "grad_norm": 17.074628829956055, "learning_rate": 8.953347367988314e-06, "loss": 0.2678, "num_input_tokens_seen": 67861888, "step": 31470 }, { "epoch": 5.776289227381171, "grad_norm": 3.1883833408355713, "learning_rate": 8.95285705943067e-06, "loss": 0.2455, "num_input_tokens_seen": 67872096, "step": 31475 }, { "epoch": 5.7772068269407235, "grad_norm": 9.02784538269043, "learning_rate": 8.952366649488368e-06, "loss": 0.221, "num_input_tokens_seen": 67884384, "step": 31480 }, { "epoch": 5.778124426500275, "grad_norm": 9.204316139221191, "learning_rate": 8.951876138173988e-06, "loss": 0.6116, "num_input_tokens_seen": 67895264, "step": 31485 }, { "epoch": 5.779042026059828, "grad_norm": 8.012147903442383, "learning_rate": 8.951385525500109e-06, "loss": 0.2107, "num_input_tokens_seen": 67904256, "step": 31490 }, { "epoch": 5.77995962561938, "grad_norm": 8.009357452392578, "learning_rate": 8.950894811479317e-06, "loss": 0.2683, "num_input_tokens_seen": 67914880, "step": 31495 }, { "epoch": 5.780877225178932, "grad_norm": 1.0964981317520142, "learning_rate": 8.950403996124193e-06, "loss": 0.3037, "num_input_tokens_seen": 67925344, "step": 31500 }, { "epoch": 5.781794824738484, "grad_norm": 3.76119065284729, "learning_rate": 8.949913079447333e-06, "loss": 0.2947, "num_input_tokens_seen": 67935616, "step": 31505 }, { "epoch": 5.782712424298037, "grad_norm": 12.20234203338623, "learning_rate": 8.949422061461322e-06, "loss": 0.3718, "num_input_tokens_seen": 67945280, "step": 31510 }, { "epoch": 5.783630023857588, "grad_norm": 8.530158996582031, "learning_rate": 8.948930942178756e-06, "loss": 0.3076, "num_input_tokens_seen": 67956352, "step": 31515 }, { "epoch": 5.784547623417141, "grad_norm": 4.1127753257751465, "learning_rate": 8.948439721612232e-06, "loss": 0.3675, "num_input_tokens_seen": 67967712, "step": 31520 }, { "epoch": 5.785465222976693, "grad_norm": 2.423933506011963, "learning_rate": 8.94794839977435e-06, "loss": 0.3482, "num_input_tokens_seen": 67979264, "step": 31525 }, { "epoch": 5.786382822536245, "grad_norm": 8.94726848602295, "learning_rate": 8.94745697667771e-06, "loss": 0.2295, "num_input_tokens_seen": 67988896, "step": 31530 }, { "epoch": 5.7873004220957975, "grad_norm": 2.954535484313965, "learning_rate": 8.946965452334915e-06, "loss": 0.1932, "num_input_tokens_seen": 67999840, "step": 31535 }, { "epoch": 5.78821802165535, "grad_norm": 3.752012252807617, "learning_rate": 8.946473826758574e-06, "loss": 0.2671, "num_input_tokens_seen": 68009568, "step": 31540 }, { "epoch": 5.789135621214902, "grad_norm": 10.194947242736816, "learning_rate": 8.945982099961297e-06, "loss": 0.2773, "num_input_tokens_seen": 68019520, "step": 31545 }, { "epoch": 5.790053220774454, "grad_norm": 9.828535079956055, "learning_rate": 8.945490271955693e-06, "loss": 0.3174, "num_input_tokens_seen": 68031200, "step": 31550 }, { "epoch": 5.790970820334007, "grad_norm": 0.31759345531463623, "learning_rate": 8.944998342754379e-06, "loss": 0.2228, "num_input_tokens_seen": 68041376, "step": 31555 }, { "epoch": 5.791888419893558, "grad_norm": 1.8939450979232788, "learning_rate": 8.944506312369971e-06, "loss": 0.2406, "num_input_tokens_seen": 68053376, "step": 31560 }, { "epoch": 5.792806019453111, "grad_norm": 7.80426549911499, "learning_rate": 8.944014180815093e-06, "loss": 0.4348, "num_input_tokens_seen": 68063392, "step": 31565 }, { "epoch": 5.793723619012663, "grad_norm": 6.9134016036987305, "learning_rate": 8.943521948102361e-06, "loss": 0.3702, "num_input_tokens_seen": 68074016, "step": 31570 }, { "epoch": 5.794641218572215, "grad_norm": 5.707268238067627, "learning_rate": 8.943029614244404e-06, "loss": 0.3772, "num_input_tokens_seen": 68083872, "step": 31575 }, { "epoch": 5.795558818131767, "grad_norm": 7.335583686828613, "learning_rate": 8.942537179253848e-06, "loss": 0.1603, "num_input_tokens_seen": 68094944, "step": 31580 }, { "epoch": 5.79647641769132, "grad_norm": 4.87169075012207, "learning_rate": 8.942044643143323e-06, "loss": 0.2047, "num_input_tokens_seen": 68105632, "step": 31585 }, { "epoch": 5.797394017250872, "grad_norm": 2.0022215843200684, "learning_rate": 8.941552005925463e-06, "loss": 0.2685, "num_input_tokens_seen": 68116256, "step": 31590 }, { "epoch": 5.798311616810424, "grad_norm": 1.831583023071289, "learning_rate": 8.941059267612903e-06, "loss": 0.212, "num_input_tokens_seen": 68126560, "step": 31595 }, { "epoch": 5.799229216369977, "grad_norm": 3.93748140335083, "learning_rate": 8.94056642821828e-06, "loss": 0.3176, "num_input_tokens_seen": 68137472, "step": 31600 }, { "epoch": 5.800146815929528, "grad_norm": 3.5620687007904053, "learning_rate": 8.940073487754236e-06, "loss": 0.1843, "num_input_tokens_seen": 68150144, "step": 31605 }, { "epoch": 5.801064415489081, "grad_norm": 10.882197380065918, "learning_rate": 8.939580446233416e-06, "loss": 0.2995, "num_input_tokens_seen": 68162496, "step": 31610 }, { "epoch": 5.801982015048633, "grad_norm": 2.6567254066467285, "learning_rate": 8.93908730366846e-06, "loss": 0.2509, "num_input_tokens_seen": 68173248, "step": 31615 }, { "epoch": 5.802899614608185, "grad_norm": 7.056206703186035, "learning_rate": 8.938594060072022e-06, "loss": 0.1772, "num_input_tokens_seen": 68184192, "step": 31620 }, { "epoch": 5.803817214167737, "grad_norm": 3.377812147140503, "learning_rate": 8.93810071545675e-06, "loss": 0.3683, "num_input_tokens_seen": 68194880, "step": 31625 }, { "epoch": 5.80473481372729, "grad_norm": 11.899008750915527, "learning_rate": 8.937607269835298e-06, "loss": 0.2073, "num_input_tokens_seen": 68205664, "step": 31630 }, { "epoch": 5.8056524132868415, "grad_norm": 9.598428726196289, "learning_rate": 8.937113723220322e-06, "loss": 0.2511, "num_input_tokens_seen": 68216896, "step": 31635 }, { "epoch": 5.806570012846394, "grad_norm": 3.5631942749023438, "learning_rate": 8.93662007562448e-06, "loss": 0.2726, "num_input_tokens_seen": 68227680, "step": 31640 }, { "epoch": 5.8074876124059465, "grad_norm": 14.080196380615234, "learning_rate": 8.936126327060438e-06, "loss": 0.4837, "num_input_tokens_seen": 68238880, "step": 31645 }, { "epoch": 5.808405211965498, "grad_norm": 4.571241855621338, "learning_rate": 8.935632477540852e-06, "loss": 0.3064, "num_input_tokens_seen": 68248992, "step": 31650 }, { "epoch": 5.809322811525051, "grad_norm": 9.041975021362305, "learning_rate": 8.935138527078394e-06, "loss": 0.3336, "num_input_tokens_seen": 68259488, "step": 31655 }, { "epoch": 5.810240411084603, "grad_norm": 4.437520980834961, "learning_rate": 8.934644475685732e-06, "loss": 0.1707, "num_input_tokens_seen": 68269472, "step": 31660 }, { "epoch": 5.811158010644155, "grad_norm": 1.8081618547439575, "learning_rate": 8.934150323375537e-06, "loss": 0.2561, "num_input_tokens_seen": 68279936, "step": 31665 }, { "epoch": 5.812075610203707, "grad_norm": 8.03672981262207, "learning_rate": 8.933656070160483e-06, "loss": 0.3352, "num_input_tokens_seen": 68290784, "step": 31670 }, { "epoch": 5.81299320976326, "grad_norm": 2.264108657836914, "learning_rate": 8.933161716053249e-06, "loss": 0.2625, "num_input_tokens_seen": 68301280, "step": 31675 }, { "epoch": 5.813910809322811, "grad_norm": 2.825160264968872, "learning_rate": 8.932667261066511e-06, "loss": 0.2536, "num_input_tokens_seen": 68311904, "step": 31680 }, { "epoch": 5.814828408882364, "grad_norm": 0.996908962726593, "learning_rate": 8.932172705212954e-06, "loss": 0.1858, "num_input_tokens_seen": 68322944, "step": 31685 }, { "epoch": 5.815746008441916, "grad_norm": 1.944732904434204, "learning_rate": 8.931678048505263e-06, "loss": 0.1931, "num_input_tokens_seen": 68333664, "step": 31690 }, { "epoch": 5.816663608001468, "grad_norm": 2.745875120162964, "learning_rate": 8.931183290956121e-06, "loss": 0.2782, "num_input_tokens_seen": 68343840, "step": 31695 }, { "epoch": 5.8175812075610205, "grad_norm": 1.8607220649719238, "learning_rate": 8.930688432578221e-06, "loss": 0.2333, "num_input_tokens_seen": 68355392, "step": 31700 }, { "epoch": 5.818498807120573, "grad_norm": 8.185256004333496, "learning_rate": 8.930193473384255e-06, "loss": 0.4743, "num_input_tokens_seen": 68366368, "step": 31705 }, { "epoch": 5.819416406680125, "grad_norm": 19.378076553344727, "learning_rate": 8.929698413386918e-06, "loss": 0.2936, "num_input_tokens_seen": 68376832, "step": 31710 }, { "epoch": 5.820334006239677, "grad_norm": 5.792255878448486, "learning_rate": 8.929203252598907e-06, "loss": 0.1865, "num_input_tokens_seen": 68387296, "step": 31715 }, { "epoch": 5.82125160579923, "grad_norm": 5.763931751251221, "learning_rate": 8.928707991032923e-06, "loss": 0.2769, "num_input_tokens_seen": 68397504, "step": 31720 }, { "epoch": 5.822169205358781, "grad_norm": 7.878177642822266, "learning_rate": 8.928212628701667e-06, "loss": 0.2254, "num_input_tokens_seen": 68406880, "step": 31725 }, { "epoch": 5.823086804918334, "grad_norm": 2.670147180557251, "learning_rate": 8.927717165617844e-06, "loss": 0.1969, "num_input_tokens_seen": 68416864, "step": 31730 }, { "epoch": 5.824004404477886, "grad_norm": 10.86068058013916, "learning_rate": 8.927221601794165e-06, "loss": 0.3178, "num_input_tokens_seen": 68427584, "step": 31735 }, { "epoch": 5.824922004037438, "grad_norm": 1.657796025276184, "learning_rate": 8.926725937243337e-06, "loss": 0.1859, "num_input_tokens_seen": 68437888, "step": 31740 }, { "epoch": 5.82583960359699, "grad_norm": 8.165653228759766, "learning_rate": 8.926230171978076e-06, "loss": 0.3568, "num_input_tokens_seen": 68447616, "step": 31745 }, { "epoch": 5.826757203156543, "grad_norm": 1.1160194873809814, "learning_rate": 8.925734306011096e-06, "loss": 0.2209, "num_input_tokens_seen": 68458688, "step": 31750 }, { "epoch": 5.8276748027160945, "grad_norm": 2.381728172302246, "learning_rate": 8.925238339355115e-06, "loss": 0.3682, "num_input_tokens_seen": 68470368, "step": 31755 }, { "epoch": 5.828592402275647, "grad_norm": 3.2693841457366943, "learning_rate": 8.924742272022855e-06, "loss": 0.1607, "num_input_tokens_seen": 68481088, "step": 31760 }, { "epoch": 5.8295100018351995, "grad_norm": 12.322501182556152, "learning_rate": 8.924246104027036e-06, "loss": 0.2526, "num_input_tokens_seen": 68491360, "step": 31765 }, { "epoch": 5.830427601394751, "grad_norm": 11.232376098632812, "learning_rate": 8.92374983538039e-06, "loss": 0.314, "num_input_tokens_seen": 68501376, "step": 31770 }, { "epoch": 5.831345200954304, "grad_norm": 16.725906372070312, "learning_rate": 8.92325346609564e-06, "loss": 0.3658, "num_input_tokens_seen": 68512800, "step": 31775 }, { "epoch": 5.832262800513856, "grad_norm": 4.700488567352295, "learning_rate": 8.92275699618552e-06, "loss": 0.379, "num_input_tokens_seen": 68523904, "step": 31780 }, { "epoch": 5.833180400073408, "grad_norm": 4.680229187011719, "learning_rate": 8.922260425662762e-06, "loss": 0.2603, "num_input_tokens_seen": 68533760, "step": 31785 }, { "epoch": 5.83409799963296, "grad_norm": 10.291678428649902, "learning_rate": 8.921763754540102e-06, "loss": 0.2698, "num_input_tokens_seen": 68543424, "step": 31790 }, { "epoch": 5.835015599192513, "grad_norm": 11.673328399658203, "learning_rate": 8.921266982830282e-06, "loss": 0.4347, "num_input_tokens_seen": 68554336, "step": 31795 }, { "epoch": 5.835933198752064, "grad_norm": 2.1492247581481934, "learning_rate": 8.92077011054604e-06, "loss": 0.2267, "num_input_tokens_seen": 68564640, "step": 31800 }, { "epoch": 5.836850798311617, "grad_norm": 4.435303688049316, "learning_rate": 8.920273137700121e-06, "loss": 0.3206, "num_input_tokens_seen": 68574272, "step": 31805 }, { "epoch": 5.837768397871169, "grad_norm": 5.866128444671631, "learning_rate": 8.919776064305274e-06, "loss": 0.2958, "num_input_tokens_seen": 68585440, "step": 31810 }, { "epoch": 5.838685997430721, "grad_norm": 5.1049957275390625, "learning_rate": 8.919278890374243e-06, "loss": 0.3164, "num_input_tokens_seen": 68595040, "step": 31815 }, { "epoch": 5.8396035969902735, "grad_norm": 4.895611763000488, "learning_rate": 8.918781615919785e-06, "loss": 0.2135, "num_input_tokens_seen": 68604864, "step": 31820 }, { "epoch": 5.840521196549826, "grad_norm": 3.1869113445281982, "learning_rate": 8.918284240954653e-06, "loss": 0.1562, "num_input_tokens_seen": 68613920, "step": 31825 }, { "epoch": 5.841438796109378, "grad_norm": 18.229583740234375, "learning_rate": 8.9177867654916e-06, "loss": 0.5182, "num_input_tokens_seen": 68625792, "step": 31830 }, { "epoch": 5.84235639566893, "grad_norm": 11.950606346130371, "learning_rate": 8.91728918954339e-06, "loss": 0.2409, "num_input_tokens_seen": 68636608, "step": 31835 }, { "epoch": 5.843273995228483, "grad_norm": 10.6542329788208, "learning_rate": 8.916791513122784e-06, "loss": 0.2693, "num_input_tokens_seen": 68648032, "step": 31840 }, { "epoch": 5.844191594788034, "grad_norm": 2.934675455093384, "learning_rate": 8.916293736242546e-06, "loss": 0.2196, "num_input_tokens_seen": 68659360, "step": 31845 }, { "epoch": 5.845109194347587, "grad_norm": 10.93692684173584, "learning_rate": 8.915795858915444e-06, "loss": 0.3874, "num_input_tokens_seen": 68671424, "step": 31850 }, { "epoch": 5.846026793907139, "grad_norm": 6.091270923614502, "learning_rate": 8.915297881154246e-06, "loss": 0.1455, "num_input_tokens_seen": 68681696, "step": 31855 }, { "epoch": 5.846944393466691, "grad_norm": 10.241564750671387, "learning_rate": 8.914799802971725e-06, "loss": 0.2607, "num_input_tokens_seen": 68692672, "step": 31860 }, { "epoch": 5.847861993026243, "grad_norm": 6.327685356140137, "learning_rate": 8.914301624380657e-06, "loss": 0.356, "num_input_tokens_seen": 68702688, "step": 31865 }, { "epoch": 5.848779592585796, "grad_norm": 8.736257553100586, "learning_rate": 8.91380334539382e-06, "loss": 0.2258, "num_input_tokens_seen": 68713312, "step": 31870 }, { "epoch": 5.849697192145348, "grad_norm": 21.404499053955078, "learning_rate": 8.913304966023993e-06, "loss": 0.3276, "num_input_tokens_seen": 68723776, "step": 31875 }, { "epoch": 5.8506147917049, "grad_norm": 4.834427356719971, "learning_rate": 8.912806486283956e-06, "loss": 0.2516, "num_input_tokens_seen": 68732832, "step": 31880 }, { "epoch": 5.851532391264453, "grad_norm": 5.306298732757568, "learning_rate": 8.9123079061865e-06, "loss": 0.3035, "num_input_tokens_seen": 68742880, "step": 31885 }, { "epoch": 5.852449990824004, "grad_norm": 1.8349367380142212, "learning_rate": 8.911809225744407e-06, "loss": 0.2658, "num_input_tokens_seen": 68753728, "step": 31890 }, { "epoch": 5.853367590383557, "grad_norm": 4.358051776885986, "learning_rate": 8.911310444970473e-06, "loss": 0.4728, "num_input_tokens_seen": 68764512, "step": 31895 }, { "epoch": 5.854285189943109, "grad_norm": 3.481762409210205, "learning_rate": 8.910811563877486e-06, "loss": 0.1533, "num_input_tokens_seen": 68775840, "step": 31900 }, { "epoch": 5.855202789502661, "grad_norm": 9.809226989746094, "learning_rate": 8.910312582478245e-06, "loss": 0.2939, "num_input_tokens_seen": 68788064, "step": 31905 }, { "epoch": 5.856120389062213, "grad_norm": 3.62251353263855, "learning_rate": 8.909813500785546e-06, "loss": 0.3275, "num_input_tokens_seen": 68798976, "step": 31910 }, { "epoch": 5.857037988621766, "grad_norm": 2.9148614406585693, "learning_rate": 8.90931431881219e-06, "loss": 0.2317, "num_input_tokens_seen": 68809568, "step": 31915 }, { "epoch": 5.8579555881813175, "grad_norm": 2.5310721397399902, "learning_rate": 8.90881503657098e-06, "loss": 0.2175, "num_input_tokens_seen": 68819520, "step": 31920 }, { "epoch": 5.85887318774087, "grad_norm": 12.129361152648926, "learning_rate": 8.908315654074724e-06, "loss": 0.2168, "num_input_tokens_seen": 68829248, "step": 31925 }, { "epoch": 5.8597907873004225, "grad_norm": 10.54216194152832, "learning_rate": 8.907816171336229e-06, "loss": 0.2236, "num_input_tokens_seen": 68839296, "step": 31930 }, { "epoch": 5.860708386859974, "grad_norm": 7.995594501495361, "learning_rate": 8.907316588368305e-06, "loss": 0.174, "num_input_tokens_seen": 68850112, "step": 31935 }, { "epoch": 5.861625986419527, "grad_norm": 4.35649299621582, "learning_rate": 8.906816905183766e-06, "loss": 0.2241, "num_input_tokens_seen": 68861952, "step": 31940 }, { "epoch": 5.862543585979079, "grad_norm": 15.890336036682129, "learning_rate": 8.90631712179543e-06, "loss": 0.2691, "num_input_tokens_seen": 68873024, "step": 31945 }, { "epoch": 5.863461185538631, "grad_norm": 5.188009262084961, "learning_rate": 8.905817238216112e-06, "loss": 0.231, "num_input_tokens_seen": 68883264, "step": 31950 }, { "epoch": 5.864378785098183, "grad_norm": 10.23843002319336, "learning_rate": 8.905317254458636e-06, "loss": 0.2373, "num_input_tokens_seen": 68895968, "step": 31955 }, { "epoch": 5.865296384657736, "grad_norm": 8.326930046081543, "learning_rate": 8.904817170535825e-06, "loss": 0.2035, "num_input_tokens_seen": 68907296, "step": 31960 }, { "epoch": 5.866213984217287, "grad_norm": 6.203354835510254, "learning_rate": 8.904316986460506e-06, "loss": 0.2151, "num_input_tokens_seen": 68919392, "step": 31965 }, { "epoch": 5.86713158377684, "grad_norm": 19.98744773864746, "learning_rate": 8.903816702245507e-06, "loss": 0.3275, "num_input_tokens_seen": 68929600, "step": 31970 }, { "epoch": 5.868049183336392, "grad_norm": 3.0705811977386475, "learning_rate": 8.90331631790366e-06, "loss": 0.2512, "num_input_tokens_seen": 68941376, "step": 31975 }, { "epoch": 5.868966782895944, "grad_norm": 15.837177276611328, "learning_rate": 8.902815833447799e-06, "loss": 0.213, "num_input_tokens_seen": 68951424, "step": 31980 }, { "epoch": 5.8698843824554965, "grad_norm": 10.459942817687988, "learning_rate": 8.90231524889076e-06, "loss": 0.2102, "num_input_tokens_seen": 68963360, "step": 31985 }, { "epoch": 5.870801982015049, "grad_norm": 6.7176055908203125, "learning_rate": 8.901814564245385e-06, "loss": 0.2539, "num_input_tokens_seen": 68974848, "step": 31990 }, { "epoch": 5.871719581574601, "grad_norm": 8.735099792480469, "learning_rate": 8.901313779524512e-06, "loss": 0.2744, "num_input_tokens_seen": 68984544, "step": 31995 }, { "epoch": 5.872637181134153, "grad_norm": 5.395994663238525, "learning_rate": 8.900812894740986e-06, "loss": 0.3413, "num_input_tokens_seen": 68993120, "step": 32000 }, { "epoch": 5.873554780693706, "grad_norm": 18.43574333190918, "learning_rate": 8.900311909907658e-06, "loss": 0.4164, "num_input_tokens_seen": 69004352, "step": 32005 }, { "epoch": 5.874472380253257, "grad_norm": 7.478888034820557, "learning_rate": 8.899810825037373e-06, "loss": 0.2548, "num_input_tokens_seen": 69015392, "step": 32010 }, { "epoch": 5.87538997981281, "grad_norm": 6.896108150482178, "learning_rate": 8.899309640142984e-06, "loss": 0.2092, "num_input_tokens_seen": 69026368, "step": 32015 }, { "epoch": 5.876307579372362, "grad_norm": 1.3991461992263794, "learning_rate": 8.898808355237346e-06, "loss": 0.335, "num_input_tokens_seen": 69035648, "step": 32020 }, { "epoch": 5.877225178931914, "grad_norm": 4.918111801147461, "learning_rate": 8.898306970333316e-06, "loss": 0.2297, "num_input_tokens_seen": 69047328, "step": 32025 }, { "epoch": 5.878142778491466, "grad_norm": 12.936844825744629, "learning_rate": 8.897805485443756e-06, "loss": 0.3743, "num_input_tokens_seen": 69058528, "step": 32030 }, { "epoch": 5.879060378051019, "grad_norm": 7.937311172485352, "learning_rate": 8.897303900581525e-06, "loss": 0.2198, "num_input_tokens_seen": 69070656, "step": 32035 }, { "epoch": 5.8799779776105705, "grad_norm": 5.884500026702881, "learning_rate": 8.89680221575949e-06, "loss": 0.3788, "num_input_tokens_seen": 69081632, "step": 32040 }, { "epoch": 5.880895577170123, "grad_norm": 8.976335525512695, "learning_rate": 8.896300430990516e-06, "loss": 0.2144, "num_input_tokens_seen": 69091584, "step": 32045 }, { "epoch": 5.8818131767296755, "grad_norm": 15.261255264282227, "learning_rate": 8.895798546287477e-06, "loss": 0.2627, "num_input_tokens_seen": 69103264, "step": 32050 }, { "epoch": 5.882730776289227, "grad_norm": 12.286794662475586, "learning_rate": 8.895296561663241e-06, "loss": 0.2876, "num_input_tokens_seen": 69113504, "step": 32055 }, { "epoch": 5.88364837584878, "grad_norm": 9.547778129577637, "learning_rate": 8.894794477130686e-06, "loss": 0.2807, "num_input_tokens_seen": 69123104, "step": 32060 }, { "epoch": 5.884565975408332, "grad_norm": 6.209046363830566, "learning_rate": 8.89429229270269e-06, "loss": 0.1768, "num_input_tokens_seen": 69134688, "step": 32065 }, { "epoch": 5.885483574967884, "grad_norm": 10.18857192993164, "learning_rate": 8.893790008392132e-06, "loss": 0.2675, "num_input_tokens_seen": 69144960, "step": 32070 }, { "epoch": 5.886401174527436, "grad_norm": 6.973294258117676, "learning_rate": 8.893287624211896e-06, "loss": 0.3173, "num_input_tokens_seen": 69155456, "step": 32075 }, { "epoch": 5.887318774086989, "grad_norm": 3.186757802963257, "learning_rate": 8.892785140174864e-06, "loss": 0.1137, "num_input_tokens_seen": 69166016, "step": 32080 }, { "epoch": 5.88823637364654, "grad_norm": 21.316997528076172, "learning_rate": 8.892282556293928e-06, "loss": 0.375, "num_input_tokens_seen": 69177088, "step": 32085 }, { "epoch": 5.889153973206093, "grad_norm": 4.820840358734131, "learning_rate": 8.891779872581976e-06, "loss": 0.1337, "num_input_tokens_seen": 69187136, "step": 32090 }, { "epoch": 5.890071572765645, "grad_norm": 4.936785697937012, "learning_rate": 8.891277089051902e-06, "loss": 0.3191, "num_input_tokens_seen": 69197120, "step": 32095 }, { "epoch": 5.890989172325197, "grad_norm": 4.153290748596191, "learning_rate": 8.890774205716602e-06, "loss": 0.2989, "num_input_tokens_seen": 69208032, "step": 32100 }, { "epoch": 5.8919067718847495, "grad_norm": 5.077254295349121, "learning_rate": 8.890271222588974e-06, "loss": 0.3513, "num_input_tokens_seen": 69219328, "step": 32105 }, { "epoch": 5.892824371444302, "grad_norm": 11.131360054016113, "learning_rate": 8.88976813968192e-06, "loss": 0.2494, "num_input_tokens_seen": 69230496, "step": 32110 }, { "epoch": 5.893741971003854, "grad_norm": 2.4928812980651855, "learning_rate": 8.889264957008339e-06, "loss": 0.3117, "num_input_tokens_seen": 69240576, "step": 32115 }, { "epoch": 5.894659570563406, "grad_norm": 1.1781831979751587, "learning_rate": 8.888761674581141e-06, "loss": 0.1984, "num_input_tokens_seen": 69251680, "step": 32120 }, { "epoch": 5.895577170122959, "grad_norm": 2.6716525554656982, "learning_rate": 8.888258292413233e-06, "loss": 0.1885, "num_input_tokens_seen": 69263712, "step": 32125 }, { "epoch": 5.89649476968251, "grad_norm": 22.458261489868164, "learning_rate": 8.887754810517525e-06, "loss": 0.3207, "num_input_tokens_seen": 69274144, "step": 32130 }, { "epoch": 5.897412369242063, "grad_norm": 8.266545295715332, "learning_rate": 8.887251228906934e-06, "loss": 0.2917, "num_input_tokens_seen": 69285472, "step": 32135 }, { "epoch": 5.898329968801615, "grad_norm": 4.224808692932129, "learning_rate": 8.886747547594372e-06, "loss": 0.3501, "num_input_tokens_seen": 69296864, "step": 32140 }, { "epoch": 5.899247568361167, "grad_norm": 13.576416015625, "learning_rate": 8.88624376659276e-06, "loss": 0.259, "num_input_tokens_seen": 69308896, "step": 32145 }, { "epoch": 5.900165167920719, "grad_norm": 2.122666358947754, "learning_rate": 8.885739885915018e-06, "loss": 0.1032, "num_input_tokens_seen": 69320224, "step": 32150 }, { "epoch": 5.901082767480272, "grad_norm": 21.31815528869629, "learning_rate": 8.88523590557407e-06, "loss": 0.4491, "num_input_tokens_seen": 69331776, "step": 32155 }, { "epoch": 5.902000367039824, "grad_norm": 24.998550415039062, "learning_rate": 8.884731825582844e-06, "loss": 0.3828, "num_input_tokens_seen": 69343360, "step": 32160 }, { "epoch": 5.902917966599376, "grad_norm": 3.5522425174713135, "learning_rate": 8.884227645954267e-06, "loss": 0.1914, "num_input_tokens_seen": 69354432, "step": 32165 }, { "epoch": 5.903835566158929, "grad_norm": 3.5741360187530518, "learning_rate": 8.883723366701268e-06, "loss": 0.2942, "num_input_tokens_seen": 69365760, "step": 32170 }, { "epoch": 5.90475316571848, "grad_norm": 3.9793431758880615, "learning_rate": 8.883218987836788e-06, "loss": 0.1176, "num_input_tokens_seen": 69376288, "step": 32175 }, { "epoch": 5.905670765278033, "grad_norm": 8.499490737915039, "learning_rate": 8.882714509373758e-06, "loss": 0.2066, "num_input_tokens_seen": 69386592, "step": 32180 }, { "epoch": 5.906588364837585, "grad_norm": 54.865570068359375, "learning_rate": 8.882209931325118e-06, "loss": 0.3005, "num_input_tokens_seen": 69396768, "step": 32185 }, { "epoch": 5.907505964397137, "grad_norm": 14.504080772399902, "learning_rate": 8.88170525370381e-06, "loss": 0.3462, "num_input_tokens_seen": 69407680, "step": 32190 }, { "epoch": 5.908423563956689, "grad_norm": 10.621869087219238, "learning_rate": 8.881200476522778e-06, "loss": 0.4363, "num_input_tokens_seen": 69418560, "step": 32195 }, { "epoch": 5.909341163516242, "grad_norm": 6.76815938949585, "learning_rate": 8.88069559979497e-06, "loss": 0.4473, "num_input_tokens_seen": 69429888, "step": 32200 }, { "epoch": 5.9102587630757935, "grad_norm": 1.0394487380981445, "learning_rate": 8.880190623533334e-06, "loss": 0.357, "num_input_tokens_seen": 69440384, "step": 32205 }, { "epoch": 5.911176362635346, "grad_norm": 4.717196464538574, "learning_rate": 8.879685547750823e-06, "loss": 0.2169, "num_input_tokens_seen": 69451968, "step": 32210 }, { "epoch": 5.9120939621948985, "grad_norm": 1.4320557117462158, "learning_rate": 8.87918037246039e-06, "loss": 0.164, "num_input_tokens_seen": 69463200, "step": 32215 }, { "epoch": 5.91301156175445, "grad_norm": 11.998836517333984, "learning_rate": 8.878675097674991e-06, "loss": 0.3702, "num_input_tokens_seen": 69474240, "step": 32220 }, { "epoch": 5.913929161314003, "grad_norm": 10.858144760131836, "learning_rate": 8.87816972340759e-06, "loss": 0.2889, "num_input_tokens_seen": 69485824, "step": 32225 }, { "epoch": 5.914846760873555, "grad_norm": 10.089937210083008, "learning_rate": 8.877664249671145e-06, "loss": 0.2712, "num_input_tokens_seen": 69497056, "step": 32230 }, { "epoch": 5.915764360433107, "grad_norm": 3.0636184215545654, "learning_rate": 8.877158676478622e-06, "loss": 0.2871, "num_input_tokens_seen": 69508288, "step": 32235 }, { "epoch": 5.916681959992659, "grad_norm": 4.603169918060303, "learning_rate": 8.876653003842987e-06, "loss": 0.2863, "num_input_tokens_seen": 69518304, "step": 32240 }, { "epoch": 5.917599559552212, "grad_norm": 4.6642022132873535, "learning_rate": 8.876147231777212e-06, "loss": 0.179, "num_input_tokens_seen": 69528288, "step": 32245 }, { "epoch": 5.918517159111763, "grad_norm": 15.094995498657227, "learning_rate": 8.875641360294267e-06, "loss": 0.2064, "num_input_tokens_seen": 69537888, "step": 32250 }, { "epoch": 5.919434758671316, "grad_norm": 11.428691864013672, "learning_rate": 8.87513538940713e-06, "loss": 0.2851, "num_input_tokens_seen": 69548800, "step": 32255 }, { "epoch": 5.920352358230868, "grad_norm": 12.379149436950684, "learning_rate": 8.874629319128774e-06, "loss": 0.2672, "num_input_tokens_seen": 69559520, "step": 32260 }, { "epoch": 5.92126995779042, "grad_norm": 3.424124240875244, "learning_rate": 8.87412314947218e-06, "loss": 0.2043, "num_input_tokens_seen": 69570944, "step": 32265 }, { "epoch": 5.9221875573499725, "grad_norm": 8.17475700378418, "learning_rate": 8.873616880450335e-06, "loss": 0.2188, "num_input_tokens_seen": 69581216, "step": 32270 }, { "epoch": 5.923105156909525, "grad_norm": 6.302539825439453, "learning_rate": 8.873110512076218e-06, "loss": 0.2542, "num_input_tokens_seen": 69590976, "step": 32275 }, { "epoch": 5.924022756469077, "grad_norm": 18.67901611328125, "learning_rate": 8.87260404436282e-06, "loss": 0.2491, "num_input_tokens_seen": 69600736, "step": 32280 }, { "epoch": 5.924940356028629, "grad_norm": 13.27451229095459, "learning_rate": 8.87209747732313e-06, "loss": 0.2743, "num_input_tokens_seen": 69611520, "step": 32285 }, { "epoch": 5.925857955588182, "grad_norm": 11.013703346252441, "learning_rate": 8.871590810970143e-06, "loss": 0.3121, "num_input_tokens_seen": 69623136, "step": 32290 }, { "epoch": 5.926775555147733, "grad_norm": 3.5137624740600586, "learning_rate": 8.871084045316849e-06, "loss": 0.2661, "num_input_tokens_seen": 69634112, "step": 32295 }, { "epoch": 5.927693154707286, "grad_norm": 6.60158634185791, "learning_rate": 8.87057718037625e-06, "loss": 0.3419, "num_input_tokens_seen": 69644224, "step": 32300 }, { "epoch": 5.928610754266838, "grad_norm": 7.299474716186523, "learning_rate": 8.870070216161346e-06, "loss": 0.3063, "num_input_tokens_seen": 69654880, "step": 32305 }, { "epoch": 5.92952835382639, "grad_norm": 6.156376838684082, "learning_rate": 8.869563152685139e-06, "loss": 0.1875, "num_input_tokens_seen": 69666272, "step": 32310 }, { "epoch": 5.930445953385942, "grad_norm": 4.406830787658691, "learning_rate": 8.869055989960633e-06, "loss": 0.2482, "num_input_tokens_seen": 69677408, "step": 32315 }, { "epoch": 5.931363552945495, "grad_norm": 1.730648159980774, "learning_rate": 8.868548728000838e-06, "loss": 0.1615, "num_input_tokens_seen": 69688768, "step": 32320 }, { "epoch": 5.9322811525050465, "grad_norm": 9.1436128616333, "learning_rate": 8.868041366818762e-06, "loss": 0.3523, "num_input_tokens_seen": 69699712, "step": 32325 }, { "epoch": 5.933198752064599, "grad_norm": 1.703811526298523, "learning_rate": 8.867533906427424e-06, "loss": 0.255, "num_input_tokens_seen": 69711328, "step": 32330 }, { "epoch": 5.9341163516241515, "grad_norm": 3.19218111038208, "learning_rate": 8.867026346839832e-06, "loss": 0.3312, "num_input_tokens_seen": 69722272, "step": 32335 }, { "epoch": 5.935033951183703, "grad_norm": 5.825297832489014, "learning_rate": 8.866518688069008e-06, "loss": 0.3604, "num_input_tokens_seen": 69731616, "step": 32340 }, { "epoch": 5.935951550743256, "grad_norm": 1.4092893600463867, "learning_rate": 8.866010930127974e-06, "loss": 0.3223, "num_input_tokens_seen": 69742624, "step": 32345 }, { "epoch": 5.936869150302808, "grad_norm": 8.252368927001953, "learning_rate": 8.865503073029751e-06, "loss": 0.3549, "num_input_tokens_seen": 69752800, "step": 32350 }, { "epoch": 5.93778674986236, "grad_norm": 2.3056228160858154, "learning_rate": 8.864995116787363e-06, "loss": 0.1606, "num_input_tokens_seen": 69764256, "step": 32355 }, { "epoch": 5.938704349421912, "grad_norm": 7.2803239822387695, "learning_rate": 8.864487061413842e-06, "loss": 0.1982, "num_input_tokens_seen": 69774944, "step": 32360 }, { "epoch": 5.939621948981465, "grad_norm": 5.045107841491699, "learning_rate": 8.863978906922219e-06, "loss": 0.375, "num_input_tokens_seen": 69786176, "step": 32365 }, { "epoch": 5.940539548541016, "grad_norm": 2.6516966819763184, "learning_rate": 8.863470653325523e-06, "loss": 0.1541, "num_input_tokens_seen": 69795872, "step": 32370 }, { "epoch": 5.941457148100569, "grad_norm": 0.8332914113998413, "learning_rate": 8.862962300636793e-06, "loss": 0.2102, "num_input_tokens_seen": 69807648, "step": 32375 }, { "epoch": 5.942374747660121, "grad_norm": 10.158830642700195, "learning_rate": 8.862453848869067e-06, "loss": 0.289, "num_input_tokens_seen": 69818880, "step": 32380 }, { "epoch": 5.943292347219673, "grad_norm": 23.457595825195312, "learning_rate": 8.861945298035389e-06, "loss": 0.3523, "num_input_tokens_seen": 69829024, "step": 32385 }, { "epoch": 5.9442099467792255, "grad_norm": 9.058063507080078, "learning_rate": 8.861436648148796e-06, "loss": 0.3653, "num_input_tokens_seen": 69839744, "step": 32390 }, { "epoch": 5.945127546338778, "grad_norm": 3.3996870517730713, "learning_rate": 8.86092789922234e-06, "loss": 0.2473, "num_input_tokens_seen": 69850656, "step": 32395 }, { "epoch": 5.94604514589833, "grad_norm": 15.134533882141113, "learning_rate": 8.860419051269064e-06, "loss": 0.2566, "num_input_tokens_seen": 69860896, "step": 32400 }, { "epoch": 5.946962745457882, "grad_norm": 0.4464537799358368, "learning_rate": 8.859910104302025e-06, "loss": 0.1214, "num_input_tokens_seen": 69872864, "step": 32405 }, { "epoch": 5.947880345017435, "grad_norm": 20.083499908447266, "learning_rate": 8.859401058334274e-06, "loss": 0.3367, "num_input_tokens_seen": 69884160, "step": 32410 }, { "epoch": 5.948797944576986, "grad_norm": 8.493526458740234, "learning_rate": 8.858891913378866e-06, "loss": 0.3307, "num_input_tokens_seen": 69894304, "step": 32415 }, { "epoch": 5.949715544136539, "grad_norm": 2.514765977859497, "learning_rate": 8.858382669448863e-06, "loss": 0.2005, "num_input_tokens_seen": 69905248, "step": 32420 }, { "epoch": 5.950633143696091, "grad_norm": 10.32254409790039, "learning_rate": 8.857873326557325e-06, "loss": 0.2883, "num_input_tokens_seen": 69914816, "step": 32425 }, { "epoch": 5.951550743255643, "grad_norm": 12.716124534606934, "learning_rate": 8.857363884717314e-06, "loss": 0.4151, "num_input_tokens_seen": 69926144, "step": 32430 }, { "epoch": 5.952468342815195, "grad_norm": 15.460444450378418, "learning_rate": 8.856854343941896e-06, "loss": 0.181, "num_input_tokens_seen": 69937920, "step": 32435 }, { "epoch": 5.953385942374748, "grad_norm": 17.559356689453125, "learning_rate": 8.856344704244143e-06, "loss": 0.2291, "num_input_tokens_seen": 69947264, "step": 32440 }, { "epoch": 5.9543035419343, "grad_norm": 12.135049819946289, "learning_rate": 8.855834965637127e-06, "loss": 0.2322, "num_input_tokens_seen": 69957792, "step": 32445 }, { "epoch": 5.955221141493852, "grad_norm": 11.656749725341797, "learning_rate": 8.855325128133918e-06, "loss": 0.197, "num_input_tokens_seen": 69969408, "step": 32450 }, { "epoch": 5.956138741053405, "grad_norm": 6.789177894592285, "learning_rate": 8.854815191747596e-06, "loss": 0.2583, "num_input_tokens_seen": 69979488, "step": 32455 }, { "epoch": 5.957056340612956, "grad_norm": 6.891666412353516, "learning_rate": 8.854305156491239e-06, "loss": 0.2714, "num_input_tokens_seen": 69990240, "step": 32460 }, { "epoch": 5.957973940172509, "grad_norm": 3.883694648742676, "learning_rate": 8.853795022377927e-06, "loss": 0.3929, "num_input_tokens_seen": 70000512, "step": 32465 }, { "epoch": 5.958891539732061, "grad_norm": 7.213171482086182, "learning_rate": 8.853284789420746e-06, "loss": 0.371, "num_input_tokens_seen": 70010656, "step": 32470 }, { "epoch": 5.959809139291613, "grad_norm": 3.372316837310791, "learning_rate": 8.852774457632782e-06, "loss": 0.2474, "num_input_tokens_seen": 70021216, "step": 32475 }, { "epoch": 5.960726738851165, "grad_norm": 2.5376298427581787, "learning_rate": 8.852264027027124e-06, "loss": 0.3341, "num_input_tokens_seen": 70032480, "step": 32480 }, { "epoch": 5.961644338410718, "grad_norm": 6.102280139923096, "learning_rate": 8.851753497616866e-06, "loss": 0.3537, "num_input_tokens_seen": 70042528, "step": 32485 }, { "epoch": 5.9625619379702695, "grad_norm": 9.975178718566895, "learning_rate": 8.851242869415099e-06, "loss": 0.2676, "num_input_tokens_seen": 70053440, "step": 32490 }, { "epoch": 5.963479537529822, "grad_norm": 1.6489362716674805, "learning_rate": 8.850732142434921e-06, "loss": 0.2065, "num_input_tokens_seen": 70065120, "step": 32495 }, { "epoch": 5.9643971370893745, "grad_norm": 4.304116249084473, "learning_rate": 8.850221316689432e-06, "loss": 0.1619, "num_input_tokens_seen": 70076992, "step": 32500 }, { "epoch": 5.965314736648926, "grad_norm": 11.087782859802246, "learning_rate": 8.849710392191735e-06, "loss": 0.1651, "num_input_tokens_seen": 70088256, "step": 32505 }, { "epoch": 5.966232336208479, "grad_norm": 7.112191677093506, "learning_rate": 8.84919936895493e-06, "loss": 0.2641, "num_input_tokens_seen": 70098752, "step": 32510 }, { "epoch": 5.967149935768031, "grad_norm": 5.506443023681641, "learning_rate": 8.84868824699213e-06, "loss": 0.2368, "num_input_tokens_seen": 70109088, "step": 32515 }, { "epoch": 5.968067535327583, "grad_norm": 11.126543045043945, "learning_rate": 8.84817702631644e-06, "loss": 0.3037, "num_input_tokens_seen": 70119360, "step": 32520 }, { "epoch": 5.968985134887135, "grad_norm": 12.194693565368652, "learning_rate": 8.847665706940973e-06, "loss": 0.331, "num_input_tokens_seen": 70130912, "step": 32525 }, { "epoch": 5.969902734446688, "grad_norm": 1.937133550643921, "learning_rate": 8.847154288878845e-06, "loss": 0.1136, "num_input_tokens_seen": 70140864, "step": 32530 }, { "epoch": 5.970820334006239, "grad_norm": 2.1668264865875244, "learning_rate": 8.84664277214317e-06, "loss": 0.2301, "num_input_tokens_seen": 70149536, "step": 32535 }, { "epoch": 5.971737933565792, "grad_norm": 3.28816819190979, "learning_rate": 8.84613115674707e-06, "loss": 0.2496, "num_input_tokens_seen": 70160288, "step": 32540 }, { "epoch": 5.972655533125344, "grad_norm": 11.707781791687012, "learning_rate": 8.84561944270367e-06, "loss": 0.5163, "num_input_tokens_seen": 70171136, "step": 32545 }, { "epoch": 5.973573132684896, "grad_norm": 9.086692810058594, "learning_rate": 8.845107630026088e-06, "loss": 0.3221, "num_input_tokens_seen": 70182144, "step": 32550 }, { "epoch": 5.9744907322444485, "grad_norm": 7.860579490661621, "learning_rate": 8.844595718727457e-06, "loss": 0.3426, "num_input_tokens_seen": 70193408, "step": 32555 }, { "epoch": 5.975408331804001, "grad_norm": 3.995086193084717, "learning_rate": 8.844083708820903e-06, "loss": 0.3639, "num_input_tokens_seen": 70203488, "step": 32560 }, { "epoch": 5.976325931363553, "grad_norm": 7.289380073547363, "learning_rate": 8.84357160031956e-06, "loss": 0.2713, "num_input_tokens_seen": 70212928, "step": 32565 }, { "epoch": 5.977243530923105, "grad_norm": 0.9375603795051575, "learning_rate": 8.843059393236563e-06, "loss": 0.0995, "num_input_tokens_seen": 70222400, "step": 32570 }, { "epoch": 5.978161130482658, "grad_norm": 6.474207401275635, "learning_rate": 8.842547087585047e-06, "loss": 0.2073, "num_input_tokens_seen": 70233440, "step": 32575 }, { "epoch": 5.979078730042209, "grad_norm": 15.5761079788208, "learning_rate": 8.842034683378156e-06, "loss": 0.1669, "num_input_tokens_seen": 70244864, "step": 32580 }, { "epoch": 5.979996329601762, "grad_norm": 15.448195457458496, "learning_rate": 8.841522180629029e-06, "loss": 0.345, "num_input_tokens_seen": 70256416, "step": 32585 }, { "epoch": 5.980913929161314, "grad_norm": 2.5536742210388184, "learning_rate": 8.84100957935081e-06, "loss": 0.2293, "num_input_tokens_seen": 70266720, "step": 32590 }, { "epoch": 5.981831528720866, "grad_norm": 4.455763816833496, "learning_rate": 8.840496879556651e-06, "loss": 0.2114, "num_input_tokens_seen": 70278464, "step": 32595 }, { "epoch": 5.982749128280418, "grad_norm": 6.1631178855896, "learning_rate": 8.839984081259699e-06, "loss": 0.2808, "num_input_tokens_seen": 70289632, "step": 32600 }, { "epoch": 5.983666727839971, "grad_norm": 12.273063659667969, "learning_rate": 8.839471184473105e-06, "loss": 0.1293, "num_input_tokens_seen": 70300960, "step": 32605 }, { "epoch": 5.9845843273995225, "grad_norm": 5.92336368560791, "learning_rate": 8.838958189210028e-06, "loss": 0.1484, "num_input_tokens_seen": 70312256, "step": 32610 }, { "epoch": 5.985501926959075, "grad_norm": 13.40535831451416, "learning_rate": 8.838445095483622e-06, "loss": 0.4511, "num_input_tokens_seen": 70322560, "step": 32615 }, { "epoch": 5.9864195265186275, "grad_norm": 26.143774032592773, "learning_rate": 8.837931903307047e-06, "loss": 0.3352, "num_input_tokens_seen": 70332896, "step": 32620 }, { "epoch": 5.987337126078179, "grad_norm": 2.880679130554199, "learning_rate": 8.83741861269347e-06, "loss": 0.2001, "num_input_tokens_seen": 70343072, "step": 32625 }, { "epoch": 5.988254725637732, "grad_norm": 0.8497637510299683, "learning_rate": 8.836905223656052e-06, "loss": 0.1706, "num_input_tokens_seen": 70354528, "step": 32630 }, { "epoch": 5.989172325197284, "grad_norm": 1.0201131105422974, "learning_rate": 8.83639173620796e-06, "loss": 0.2383, "num_input_tokens_seen": 70364896, "step": 32635 }, { "epoch": 5.990089924756836, "grad_norm": 33.04788589477539, "learning_rate": 8.835878150362368e-06, "loss": 0.3877, "num_input_tokens_seen": 70377152, "step": 32640 }, { "epoch": 5.991007524316388, "grad_norm": 25.43934440612793, "learning_rate": 8.835364466132443e-06, "loss": 0.4307, "num_input_tokens_seen": 70386624, "step": 32645 }, { "epoch": 5.991925123875941, "grad_norm": 11.581665992736816, "learning_rate": 8.834850683531367e-06, "loss": 0.5195, "num_input_tokens_seen": 70396832, "step": 32650 }, { "epoch": 5.992842723435492, "grad_norm": 7.269350051879883, "learning_rate": 8.834336802572314e-06, "loss": 0.2005, "num_input_tokens_seen": 70408992, "step": 32655 }, { "epoch": 5.993760322995045, "grad_norm": 4.636247634887695, "learning_rate": 8.833822823268466e-06, "loss": 0.262, "num_input_tokens_seen": 70419648, "step": 32660 }, { "epoch": 5.994677922554597, "grad_norm": 4.2654290199279785, "learning_rate": 8.833308745633001e-06, "loss": 0.1545, "num_input_tokens_seen": 70428736, "step": 32665 }, { "epoch": 5.995595522114149, "grad_norm": 2.8611905574798584, "learning_rate": 8.83279456967911e-06, "loss": 0.2551, "num_input_tokens_seen": 70439744, "step": 32670 }, { "epoch": 5.9965131216737015, "grad_norm": 6.222018718719482, "learning_rate": 8.832280295419978e-06, "loss": 0.3615, "num_input_tokens_seen": 70449856, "step": 32675 }, { "epoch": 5.997430721233254, "grad_norm": 9.846790313720703, "learning_rate": 8.831765922868796e-06, "loss": 0.2142, "num_input_tokens_seen": 70460896, "step": 32680 }, { "epoch": 5.998348320792806, "grad_norm": 1.932141661643982, "learning_rate": 8.831251452038757e-06, "loss": 0.3365, "num_input_tokens_seen": 70473120, "step": 32685 }, { "epoch": 5.999265920352358, "grad_norm": 9.625370979309082, "learning_rate": 8.830736882943057e-06, "loss": 0.186, "num_input_tokens_seen": 70483840, "step": 32690 }, { "epoch": 6.0, "eval_loss": 0.27814048528671265, "eval_runtime": 178.8905, "eval_samples_per_second": 30.46, "eval_steps_per_second": 7.619, "num_input_tokens_seen": 70491792, "step": 32694 }, { "epoch": 6.000183519911911, "grad_norm": 1.7156938314437866, "learning_rate": 8.83022221559489e-06, "loss": 0.3897, "num_input_tokens_seen": 70494192, "step": 32695 }, { "epoch": 6.001101119471462, "grad_norm": 10.54978084564209, "learning_rate": 8.829707450007463e-06, "loss": 0.2524, "num_input_tokens_seen": 70504336, "step": 32700 }, { "epoch": 6.002018719031015, "grad_norm": 6.62769079208374, "learning_rate": 8.829192586193972e-06, "loss": 0.269, "num_input_tokens_seen": 70515184, "step": 32705 }, { "epoch": 6.002936318590567, "grad_norm": 5.778395652770996, "learning_rate": 8.82867762416763e-06, "loss": 0.1911, "num_input_tokens_seen": 70526224, "step": 32710 }, { "epoch": 6.003853918150119, "grad_norm": 3.598629951477051, "learning_rate": 8.828162563941638e-06, "loss": 0.292, "num_input_tokens_seen": 70537744, "step": 32715 }, { "epoch": 6.004771517709671, "grad_norm": 3.7638566493988037, "learning_rate": 8.827647405529209e-06, "loss": 0.2045, "num_input_tokens_seen": 70549104, "step": 32720 }, { "epoch": 6.005689117269224, "grad_norm": 2.6395959854125977, "learning_rate": 8.827132148943557e-06, "loss": 0.2264, "num_input_tokens_seen": 70560112, "step": 32725 }, { "epoch": 6.006606716828776, "grad_norm": 1.8404759168624878, "learning_rate": 8.826616794197898e-06, "loss": 0.1999, "num_input_tokens_seen": 70570992, "step": 32730 }, { "epoch": 6.007524316388328, "grad_norm": 1.5201796293258667, "learning_rate": 8.82610134130545e-06, "loss": 0.3271, "num_input_tokens_seen": 70581936, "step": 32735 }, { "epoch": 6.008441915947881, "grad_norm": 1.1589974164962769, "learning_rate": 8.825585790279429e-06, "loss": 0.1506, "num_input_tokens_seen": 70593008, "step": 32740 }, { "epoch": 6.009359515507432, "grad_norm": 9.367097854614258, "learning_rate": 8.825070141133064e-06, "loss": 0.151, "num_input_tokens_seen": 70603376, "step": 32745 }, { "epoch": 6.010277115066985, "grad_norm": 29.356130599975586, "learning_rate": 8.824554393879578e-06, "loss": 0.4643, "num_input_tokens_seen": 70614416, "step": 32750 }, { "epoch": 6.011194714626537, "grad_norm": 2.84635591506958, "learning_rate": 8.824038548532199e-06, "loss": 0.2065, "num_input_tokens_seen": 70624336, "step": 32755 }, { "epoch": 6.012112314186089, "grad_norm": 6.600661754608154, "learning_rate": 8.823522605104157e-06, "loss": 0.2411, "num_input_tokens_seen": 70635152, "step": 32760 }, { "epoch": 6.013029913745641, "grad_norm": 12.204617500305176, "learning_rate": 8.823006563608687e-06, "loss": 0.2298, "num_input_tokens_seen": 70645808, "step": 32765 }, { "epoch": 6.013947513305194, "grad_norm": 10.541616439819336, "learning_rate": 8.822490424059025e-06, "loss": 0.2611, "num_input_tokens_seen": 70655376, "step": 32770 }, { "epoch": 6.0148651128647455, "grad_norm": 8.779311180114746, "learning_rate": 8.821974186468405e-06, "loss": 0.1463, "num_input_tokens_seen": 70666736, "step": 32775 }, { "epoch": 6.015782712424298, "grad_norm": 3.1982297897338867, "learning_rate": 8.821457850850073e-06, "loss": 0.5176, "num_input_tokens_seen": 70677808, "step": 32780 }, { "epoch": 6.0167003119838505, "grad_norm": 4.196584701538086, "learning_rate": 8.820941417217269e-06, "loss": 0.1785, "num_input_tokens_seen": 70689008, "step": 32785 }, { "epoch": 6.017617911543402, "grad_norm": 6.401419162750244, "learning_rate": 8.82042488558324e-06, "loss": 0.1317, "num_input_tokens_seen": 70699952, "step": 32790 }, { "epoch": 6.018535511102955, "grad_norm": 9.629251480102539, "learning_rate": 8.819908255961234e-06, "loss": 0.2558, "num_input_tokens_seen": 70711120, "step": 32795 }, { "epoch": 6.019453110662507, "grad_norm": 5.480619430541992, "learning_rate": 8.8193915283645e-06, "loss": 0.2516, "num_input_tokens_seen": 70721968, "step": 32800 }, { "epoch": 6.020370710222059, "grad_norm": 13.013922691345215, "learning_rate": 8.818874702806294e-06, "loss": 0.2314, "num_input_tokens_seen": 70731344, "step": 32805 }, { "epoch": 6.021288309781611, "grad_norm": 8.775097846984863, "learning_rate": 8.81835777929987e-06, "loss": 0.2868, "num_input_tokens_seen": 70742416, "step": 32810 }, { "epoch": 6.022205909341164, "grad_norm": 6.413057804107666, "learning_rate": 8.817840757858487e-06, "loss": 0.3766, "num_input_tokens_seen": 70752656, "step": 32815 }, { "epoch": 6.023123508900715, "grad_norm": 5.336539268493652, "learning_rate": 8.817323638495408e-06, "loss": 0.1964, "num_input_tokens_seen": 70763792, "step": 32820 }, { "epoch": 6.024041108460268, "grad_norm": 30.36350440979004, "learning_rate": 8.81680642122389e-06, "loss": 0.3923, "num_input_tokens_seen": 70775760, "step": 32825 }, { "epoch": 6.02495870801982, "grad_norm": 1.4380362033843994, "learning_rate": 8.816289106057206e-06, "loss": 0.1713, "num_input_tokens_seen": 70787088, "step": 32830 }, { "epoch": 6.025876307579372, "grad_norm": 17.782175064086914, "learning_rate": 8.815771693008619e-06, "loss": 0.2318, "num_input_tokens_seen": 70796080, "step": 32835 }, { "epoch": 6.0267939071389245, "grad_norm": 11.318697929382324, "learning_rate": 8.815254182091403e-06, "loss": 0.1911, "num_input_tokens_seen": 70808144, "step": 32840 }, { "epoch": 6.027711506698477, "grad_norm": 18.14368438720703, "learning_rate": 8.81473657331883e-06, "loss": 0.3254, "num_input_tokens_seen": 70818928, "step": 32845 }, { "epoch": 6.028629106258029, "grad_norm": 2.4563446044921875, "learning_rate": 8.814218866704176e-06, "loss": 0.0909, "num_input_tokens_seen": 70828816, "step": 32850 }, { "epoch": 6.029546705817581, "grad_norm": 18.394180297851562, "learning_rate": 8.813701062260722e-06, "loss": 0.3464, "num_input_tokens_seen": 70839632, "step": 32855 }, { "epoch": 6.030464305377134, "grad_norm": 2.2420763969421387, "learning_rate": 8.813183160001743e-06, "loss": 0.1689, "num_input_tokens_seen": 70850992, "step": 32860 }, { "epoch": 6.031381904936685, "grad_norm": 2.6782751083374023, "learning_rate": 8.812665159940527e-06, "loss": 0.2378, "num_input_tokens_seen": 70862032, "step": 32865 }, { "epoch": 6.032299504496238, "grad_norm": 1.5574898719787598, "learning_rate": 8.812147062090361e-06, "loss": 0.3172, "num_input_tokens_seen": 70873552, "step": 32870 }, { "epoch": 6.03321710405579, "grad_norm": 27.093891143798828, "learning_rate": 8.811628866464529e-06, "loss": 0.4762, "num_input_tokens_seen": 70883920, "step": 32875 }, { "epoch": 6.034134703615342, "grad_norm": 16.26984405517578, "learning_rate": 8.811110573076324e-06, "loss": 0.1826, "num_input_tokens_seen": 70894896, "step": 32880 }, { "epoch": 6.035052303174894, "grad_norm": 0.5386794209480286, "learning_rate": 8.81059218193904e-06, "loss": 0.2704, "num_input_tokens_seen": 70904528, "step": 32885 }, { "epoch": 6.035969902734447, "grad_norm": 4.2730841636657715, "learning_rate": 8.810073693065973e-06, "loss": 0.2433, "num_input_tokens_seen": 70915440, "step": 32890 }, { "epoch": 6.0368875022939985, "grad_norm": 4.829817771911621, "learning_rate": 8.809555106470418e-06, "loss": 0.3006, "num_input_tokens_seen": 70925616, "step": 32895 }, { "epoch": 6.037805101853551, "grad_norm": 13.716513633728027, "learning_rate": 8.809036422165681e-06, "loss": 0.2306, "num_input_tokens_seen": 70936496, "step": 32900 }, { "epoch": 6.0387227014131035, "grad_norm": 2.7280077934265137, "learning_rate": 8.808517640165064e-06, "loss": 0.1794, "num_input_tokens_seen": 70947536, "step": 32905 }, { "epoch": 6.039640300972655, "grad_norm": 9.738554954528809, "learning_rate": 8.807998760481871e-06, "loss": 0.1794, "num_input_tokens_seen": 70958352, "step": 32910 }, { "epoch": 6.040557900532208, "grad_norm": 0.611847996711731, "learning_rate": 8.807479783129411e-06, "loss": 0.22, "num_input_tokens_seen": 70968848, "step": 32915 }, { "epoch": 6.04147550009176, "grad_norm": 4.633967399597168, "learning_rate": 8.806960708120997e-06, "loss": 0.1159, "num_input_tokens_seen": 70980720, "step": 32920 }, { "epoch": 6.042393099651312, "grad_norm": 19.930347442626953, "learning_rate": 8.806441535469941e-06, "loss": 0.1831, "num_input_tokens_seen": 70991920, "step": 32925 }, { "epoch": 6.043310699210864, "grad_norm": 1.353808045387268, "learning_rate": 8.80592226518956e-06, "loss": 0.1468, "num_input_tokens_seen": 71001712, "step": 32930 }, { "epoch": 6.044228298770417, "grad_norm": 1.3149253129959106, "learning_rate": 8.80540289729317e-06, "loss": 0.1297, "num_input_tokens_seen": 71012400, "step": 32935 }, { "epoch": 6.045145898329968, "grad_norm": 6.4986701011657715, "learning_rate": 8.804883431794094e-06, "loss": 0.3241, "num_input_tokens_seen": 71022896, "step": 32940 }, { "epoch": 6.046063497889521, "grad_norm": 12.461764335632324, "learning_rate": 8.804363868705654e-06, "loss": 0.317, "num_input_tokens_seen": 71032464, "step": 32945 }, { "epoch": 6.046981097449073, "grad_norm": 12.082388877868652, "learning_rate": 8.803844208041177e-06, "loss": 0.4588, "num_input_tokens_seen": 71042864, "step": 32950 }, { "epoch": 6.047898697008625, "grad_norm": 6.5766096115112305, "learning_rate": 8.803324449813992e-06, "loss": 0.4747, "num_input_tokens_seen": 71054224, "step": 32955 }, { "epoch": 6.0488162965681775, "grad_norm": 33.8420524597168, "learning_rate": 8.80280459403743e-06, "loss": 0.2924, "num_input_tokens_seen": 71064240, "step": 32960 }, { "epoch": 6.04973389612773, "grad_norm": 4.814281940460205, "learning_rate": 8.802284640724825e-06, "loss": 0.123, "num_input_tokens_seen": 71075120, "step": 32965 }, { "epoch": 6.050651495687282, "grad_norm": 3.1980135440826416, "learning_rate": 8.80176458988951e-06, "loss": 0.2772, "num_input_tokens_seen": 71085968, "step": 32970 }, { "epoch": 6.051569095246834, "grad_norm": 0.6465420126914978, "learning_rate": 8.801244441544828e-06, "loss": 0.0805, "num_input_tokens_seen": 71097360, "step": 32975 }, { "epoch": 6.052486694806387, "grad_norm": 5.688056468963623, "learning_rate": 8.800724195704114e-06, "loss": 0.2007, "num_input_tokens_seen": 71107920, "step": 32980 }, { "epoch": 6.053404294365938, "grad_norm": 10.834610939025879, "learning_rate": 8.800203852380718e-06, "loss": 0.1636, "num_input_tokens_seen": 71117968, "step": 32985 }, { "epoch": 6.054321893925491, "grad_norm": 16.97230339050293, "learning_rate": 8.79968341158798e-06, "loss": 0.2476, "num_input_tokens_seen": 71129360, "step": 32990 }, { "epoch": 6.055239493485043, "grad_norm": 14.658267974853516, "learning_rate": 8.799162873339253e-06, "loss": 0.3099, "num_input_tokens_seen": 71140304, "step": 32995 }, { "epoch": 6.056157093044595, "grad_norm": 16.736684799194336, "learning_rate": 8.798642237647888e-06, "loss": 0.2306, "num_input_tokens_seen": 71150864, "step": 33000 }, { "epoch": 6.057074692604147, "grad_norm": 11.678757667541504, "learning_rate": 8.798121504527235e-06, "loss": 0.3194, "num_input_tokens_seen": 71161520, "step": 33005 }, { "epoch": 6.0579922921637, "grad_norm": 8.970335006713867, "learning_rate": 8.797600673990652e-06, "loss": 0.2248, "num_input_tokens_seen": 71171856, "step": 33010 }, { "epoch": 6.058909891723252, "grad_norm": 8.652607917785645, "learning_rate": 8.797079746051497e-06, "loss": 0.2276, "num_input_tokens_seen": 71182608, "step": 33015 }, { "epoch": 6.059827491282804, "grad_norm": 14.495512962341309, "learning_rate": 8.796558720723133e-06, "loss": 0.165, "num_input_tokens_seen": 71192944, "step": 33020 }, { "epoch": 6.060745090842357, "grad_norm": 5.525396347045898, "learning_rate": 8.796037598018923e-06, "loss": 0.2225, "num_input_tokens_seen": 71204784, "step": 33025 }, { "epoch": 6.061662690401908, "grad_norm": 1.039292812347412, "learning_rate": 8.79551637795223e-06, "loss": 0.1898, "num_input_tokens_seen": 71215664, "step": 33030 }, { "epoch": 6.062580289961461, "grad_norm": 11.719962120056152, "learning_rate": 8.794995060536424e-06, "loss": 0.1685, "num_input_tokens_seen": 71228080, "step": 33035 }, { "epoch": 6.063497889521013, "grad_norm": 2.6265225410461426, "learning_rate": 8.794473645784878e-06, "loss": 0.3771, "num_input_tokens_seen": 71238032, "step": 33040 }, { "epoch": 6.064415489080565, "grad_norm": 1.4739917516708374, "learning_rate": 8.793952133710962e-06, "loss": 0.2832, "num_input_tokens_seen": 71248816, "step": 33045 }, { "epoch": 6.065333088640117, "grad_norm": 13.12682819366455, "learning_rate": 8.793430524328055e-06, "loss": 0.218, "num_input_tokens_seen": 71259152, "step": 33050 }, { "epoch": 6.06625068819967, "grad_norm": 6.690230846405029, "learning_rate": 8.792908817649534e-06, "loss": 0.3013, "num_input_tokens_seen": 71270256, "step": 33055 }, { "epoch": 6.0671682877592215, "grad_norm": 2.2148337364196777, "learning_rate": 8.792387013688781e-06, "loss": 0.2224, "num_input_tokens_seen": 71281808, "step": 33060 }, { "epoch": 6.068085887318774, "grad_norm": 9.371444702148438, "learning_rate": 8.791865112459178e-06, "loss": 0.3257, "num_input_tokens_seen": 71292848, "step": 33065 }, { "epoch": 6.0690034868783265, "grad_norm": 2.126249313354492, "learning_rate": 8.791343113974112e-06, "loss": 0.3058, "num_input_tokens_seen": 71301712, "step": 33070 }, { "epoch": 6.069921086437878, "grad_norm": 12.994230270385742, "learning_rate": 8.790821018246971e-06, "loss": 0.2225, "num_input_tokens_seen": 71313168, "step": 33075 }, { "epoch": 6.070838685997431, "grad_norm": 7.636841297149658, "learning_rate": 8.790298825291145e-06, "loss": 0.0955, "num_input_tokens_seen": 71322416, "step": 33080 }, { "epoch": 6.071756285556983, "grad_norm": 1.0830495357513428, "learning_rate": 8.789776535120032e-06, "loss": 0.3238, "num_input_tokens_seen": 71333936, "step": 33085 }, { "epoch": 6.072673885116535, "grad_norm": 0.6894646286964417, "learning_rate": 8.789254147747022e-06, "loss": 0.0857, "num_input_tokens_seen": 71343728, "step": 33090 }, { "epoch": 6.073591484676087, "grad_norm": 8.182804107666016, "learning_rate": 8.788731663185516e-06, "loss": 0.3605, "num_input_tokens_seen": 71354224, "step": 33095 }, { "epoch": 6.07450908423564, "grad_norm": 5.189671039581299, "learning_rate": 8.788209081448916e-06, "loss": 0.125, "num_input_tokens_seen": 71364048, "step": 33100 }, { "epoch": 6.075426683795191, "grad_norm": 17.54810333251953, "learning_rate": 8.787686402550622e-06, "loss": 0.1679, "num_input_tokens_seen": 71373488, "step": 33105 }, { "epoch": 6.076344283354744, "grad_norm": 8.151691436767578, "learning_rate": 8.787163626504043e-06, "loss": 0.0998, "num_input_tokens_seen": 71383760, "step": 33110 }, { "epoch": 6.077261882914296, "grad_norm": 28.06276512145996, "learning_rate": 8.786640753322588e-06, "loss": 0.1921, "num_input_tokens_seen": 71394288, "step": 33115 }, { "epoch": 6.078179482473848, "grad_norm": 0.7602683901786804, "learning_rate": 8.786117783019666e-06, "loss": 0.1155, "num_input_tokens_seen": 71404688, "step": 33120 }, { "epoch": 6.0790970820334005, "grad_norm": 37.28805160522461, "learning_rate": 8.78559471560869e-06, "loss": 0.4215, "num_input_tokens_seen": 71415376, "step": 33125 }, { "epoch": 6.080014681592953, "grad_norm": 2.916837692260742, "learning_rate": 8.785071551103075e-06, "loss": 0.3167, "num_input_tokens_seen": 71426064, "step": 33130 }, { "epoch": 6.080932281152505, "grad_norm": 2.7196364402770996, "learning_rate": 8.784548289516244e-06, "loss": 0.2428, "num_input_tokens_seen": 71437744, "step": 33135 }, { "epoch": 6.081849880712057, "grad_norm": 2.9479501247406006, "learning_rate": 8.784024930861614e-06, "loss": 0.2908, "num_input_tokens_seen": 71449328, "step": 33140 }, { "epoch": 6.08276748027161, "grad_norm": 2.195436954498291, "learning_rate": 8.78350147515261e-06, "loss": 0.3261, "num_input_tokens_seen": 71459280, "step": 33145 }, { "epoch": 6.083685079831161, "grad_norm": 16.474702835083008, "learning_rate": 8.782977922402656e-06, "loss": 0.2543, "num_input_tokens_seen": 71469296, "step": 33150 }, { "epoch": 6.084602679390714, "grad_norm": 37.22083282470703, "learning_rate": 8.782454272625181e-06, "loss": 0.1817, "num_input_tokens_seen": 71479952, "step": 33155 }, { "epoch": 6.085520278950266, "grad_norm": 5.231707572937012, "learning_rate": 8.781930525833617e-06, "loss": 0.398, "num_input_tokens_seen": 71491952, "step": 33160 }, { "epoch": 6.086437878509818, "grad_norm": 9.216586112976074, "learning_rate": 8.781406682041395e-06, "loss": 0.3491, "num_input_tokens_seen": 71502352, "step": 33165 }, { "epoch": 6.08735547806937, "grad_norm": 0.46144554018974304, "learning_rate": 8.780882741261954e-06, "loss": 0.3356, "num_input_tokens_seen": 71512080, "step": 33170 }, { "epoch": 6.088273077628923, "grad_norm": 1.0763782262802124, "learning_rate": 8.780358703508732e-06, "loss": 0.2691, "num_input_tokens_seen": 71521968, "step": 33175 }, { "epoch": 6.0891906771884745, "grad_norm": 1.422452688217163, "learning_rate": 8.779834568795165e-06, "loss": 0.2759, "num_input_tokens_seen": 71532688, "step": 33180 }, { "epoch": 6.090108276748027, "grad_norm": 15.823577880859375, "learning_rate": 8.779310337134702e-06, "loss": 0.3724, "num_input_tokens_seen": 71543568, "step": 33185 }, { "epoch": 6.0910258763075795, "grad_norm": 26.382509231567383, "learning_rate": 8.778786008540785e-06, "loss": 0.261, "num_input_tokens_seen": 71553840, "step": 33190 }, { "epoch": 6.091943475867131, "grad_norm": 8.573965072631836, "learning_rate": 8.778261583026864e-06, "loss": 0.378, "num_input_tokens_seen": 71565232, "step": 33195 }, { "epoch": 6.092861075426684, "grad_norm": 6.516139507293701, "learning_rate": 8.777737060606388e-06, "loss": 0.2071, "num_input_tokens_seen": 71575664, "step": 33200 }, { "epoch": 6.093778674986236, "grad_norm": 10.899785995483398, "learning_rate": 8.777212441292811e-06, "loss": 0.2579, "num_input_tokens_seen": 71587728, "step": 33205 }, { "epoch": 6.094696274545788, "grad_norm": 1.995434284210205, "learning_rate": 8.776687725099591e-06, "loss": 0.1471, "num_input_tokens_seen": 71598608, "step": 33210 }, { "epoch": 6.09561387410534, "grad_norm": 10.213563919067383, "learning_rate": 8.776162912040183e-06, "loss": 0.5712, "num_input_tokens_seen": 71608592, "step": 33215 }, { "epoch": 6.096531473664893, "grad_norm": 32.56828689575195, "learning_rate": 8.775638002128048e-06, "loss": 0.3021, "num_input_tokens_seen": 71621200, "step": 33220 }, { "epoch": 6.097449073224444, "grad_norm": 4.518642902374268, "learning_rate": 8.775112995376653e-06, "loss": 0.3217, "num_input_tokens_seen": 71631696, "step": 33225 }, { "epoch": 6.098366672783997, "grad_norm": 2.0605783462524414, "learning_rate": 8.774587891799457e-06, "loss": 0.2913, "num_input_tokens_seen": 71642864, "step": 33230 }, { "epoch": 6.099284272343549, "grad_norm": 17.714107513427734, "learning_rate": 8.774062691409934e-06, "loss": 0.2046, "num_input_tokens_seen": 71652944, "step": 33235 }, { "epoch": 6.100201871903101, "grad_norm": 9.109479904174805, "learning_rate": 8.773537394221552e-06, "loss": 0.2252, "num_input_tokens_seen": 71663664, "step": 33240 }, { "epoch": 6.1011194714626535, "grad_norm": 3.4525439739227295, "learning_rate": 8.773012000247784e-06, "loss": 0.1959, "num_input_tokens_seen": 71675120, "step": 33245 }, { "epoch": 6.102037071022206, "grad_norm": 11.406947135925293, "learning_rate": 8.772486509502105e-06, "loss": 0.2793, "num_input_tokens_seen": 71686704, "step": 33250 }, { "epoch": 6.102954670581759, "grad_norm": 1.2953850030899048, "learning_rate": 8.771960921997995e-06, "loss": 0.301, "num_input_tokens_seen": 71696656, "step": 33255 }, { "epoch": 6.10387227014131, "grad_norm": 39.798240661621094, "learning_rate": 8.771435237748932e-06, "loss": 0.1773, "num_input_tokens_seen": 71707792, "step": 33260 }, { "epoch": 6.104789869700863, "grad_norm": 11.485376358032227, "learning_rate": 8.770909456768401e-06, "loss": 0.2978, "num_input_tokens_seen": 71717776, "step": 33265 }, { "epoch": 6.105707469260415, "grad_norm": 35.949344635009766, "learning_rate": 8.770383579069888e-06, "loss": 0.2506, "num_input_tokens_seen": 71727632, "step": 33270 }, { "epoch": 6.106625068819967, "grad_norm": 4.354643821716309, "learning_rate": 8.769857604666878e-06, "loss": 0.4127, "num_input_tokens_seen": 71738128, "step": 33275 }, { "epoch": 6.107542668379519, "grad_norm": 23.685327529907227, "learning_rate": 8.769331533572864e-06, "loss": 0.1626, "num_input_tokens_seen": 71747152, "step": 33280 }, { "epoch": 6.108460267939072, "grad_norm": 6.075535774230957, "learning_rate": 8.768805365801338e-06, "loss": 0.2328, "num_input_tokens_seen": 71757040, "step": 33285 }, { "epoch": 6.109377867498623, "grad_norm": 6.1861748695373535, "learning_rate": 8.768279101365796e-06, "loss": 0.21, "num_input_tokens_seen": 71767504, "step": 33290 }, { "epoch": 6.110295467058176, "grad_norm": 15.216475486755371, "learning_rate": 8.767752740279736e-06, "loss": 0.3476, "num_input_tokens_seen": 71777936, "step": 33295 }, { "epoch": 6.1112130666177285, "grad_norm": 12.16093635559082, "learning_rate": 8.767226282556656e-06, "loss": 0.4519, "num_input_tokens_seen": 71788432, "step": 33300 }, { "epoch": 6.11213066617728, "grad_norm": 0.4557192325592041, "learning_rate": 8.76669972821006e-06, "loss": 0.1338, "num_input_tokens_seen": 71799824, "step": 33305 }, { "epoch": 6.113048265736833, "grad_norm": 18.406124114990234, "learning_rate": 8.766173077253456e-06, "loss": 0.3024, "num_input_tokens_seen": 71811088, "step": 33310 }, { "epoch": 6.113965865296385, "grad_norm": 6.413926124572754, "learning_rate": 8.765646329700348e-06, "loss": 0.3088, "num_input_tokens_seen": 71821264, "step": 33315 }, { "epoch": 6.114883464855937, "grad_norm": 6.490668296813965, "learning_rate": 8.765119485564248e-06, "loss": 0.1587, "num_input_tokens_seen": 71832208, "step": 33320 }, { "epoch": 6.115801064415489, "grad_norm": 4.67224645614624, "learning_rate": 8.76459254485867e-06, "loss": 0.1453, "num_input_tokens_seen": 71842032, "step": 33325 }, { "epoch": 6.116718663975042, "grad_norm": 21.838613510131836, "learning_rate": 8.764065507597125e-06, "loss": 0.1197, "num_input_tokens_seen": 71852720, "step": 33330 }, { "epoch": 6.117636263534593, "grad_norm": 5.482916831970215, "learning_rate": 8.763538373793136e-06, "loss": 0.1781, "num_input_tokens_seen": 71862416, "step": 33335 }, { "epoch": 6.118553863094146, "grad_norm": 19.195161819458008, "learning_rate": 8.763011143460221e-06, "loss": 0.2279, "num_input_tokens_seen": 71872880, "step": 33340 }, { "epoch": 6.119471462653698, "grad_norm": 11.69224739074707, "learning_rate": 8.762483816611901e-06, "loss": 0.1285, "num_input_tokens_seen": 71884080, "step": 33345 }, { "epoch": 6.12038906221325, "grad_norm": 1.9829996824264526, "learning_rate": 8.761956393261703e-06, "loss": 0.2631, "num_input_tokens_seen": 71894416, "step": 33350 }, { "epoch": 6.1213066617728025, "grad_norm": 1.7583345174789429, "learning_rate": 8.761428873423155e-06, "loss": 0.2363, "num_input_tokens_seen": 71905200, "step": 33355 }, { "epoch": 6.122224261332355, "grad_norm": 24.669418334960938, "learning_rate": 8.760901257109784e-06, "loss": 0.3819, "num_input_tokens_seen": 71916176, "step": 33360 }, { "epoch": 6.123141860891907, "grad_norm": 15.268752098083496, "learning_rate": 8.760373544335125e-06, "loss": 0.3057, "num_input_tokens_seen": 71927728, "step": 33365 }, { "epoch": 6.124059460451459, "grad_norm": 9.828463554382324, "learning_rate": 8.759845735112714e-06, "loss": 0.3294, "num_input_tokens_seen": 71938768, "step": 33370 }, { "epoch": 6.124977060011012, "grad_norm": 16.77505874633789, "learning_rate": 8.759317829456086e-06, "loss": 0.274, "num_input_tokens_seen": 71949200, "step": 33375 }, { "epoch": 6.125894659570563, "grad_norm": 1.2512696981430054, "learning_rate": 8.758789827378782e-06, "loss": 0.2423, "num_input_tokens_seen": 71960240, "step": 33380 }, { "epoch": 6.126812259130116, "grad_norm": 3.35430645942688, "learning_rate": 8.758261728894345e-06, "loss": 0.1997, "num_input_tokens_seen": 71970288, "step": 33385 }, { "epoch": 6.127729858689668, "grad_norm": 6.530426025390625, "learning_rate": 8.757733534016319e-06, "loss": 0.1797, "num_input_tokens_seen": 71981200, "step": 33390 }, { "epoch": 6.12864745824922, "grad_norm": 17.995697021484375, "learning_rate": 8.757205242758252e-06, "loss": 0.1504, "num_input_tokens_seen": 71990192, "step": 33395 }, { "epoch": 6.129565057808772, "grad_norm": 32.17768096923828, "learning_rate": 8.756676855133694e-06, "loss": 0.2289, "num_input_tokens_seen": 72002064, "step": 33400 }, { "epoch": 6.130482657368325, "grad_norm": 5.244203567504883, "learning_rate": 8.756148371156197e-06, "loss": 0.4615, "num_input_tokens_seen": 72013136, "step": 33405 }, { "epoch": 6.1314002569278765, "grad_norm": 10.026082038879395, "learning_rate": 8.755619790839316e-06, "loss": 0.1569, "num_input_tokens_seen": 72023248, "step": 33410 }, { "epoch": 6.132317856487429, "grad_norm": 2.7424981594085693, "learning_rate": 8.755091114196608e-06, "loss": 0.1921, "num_input_tokens_seen": 72034480, "step": 33415 }, { "epoch": 6.1332354560469815, "grad_norm": 4.2199907302856445, "learning_rate": 8.754562341241631e-06, "loss": 0.14, "num_input_tokens_seen": 72045680, "step": 33420 }, { "epoch": 6.134153055606533, "grad_norm": 18.198144912719727, "learning_rate": 8.75403347198795e-06, "loss": 0.4503, "num_input_tokens_seen": 72055888, "step": 33425 }, { "epoch": 6.135070655166086, "grad_norm": 0.6895683407783508, "learning_rate": 8.75350450644913e-06, "loss": 0.1326, "num_input_tokens_seen": 72067120, "step": 33430 }, { "epoch": 6.135988254725638, "grad_norm": 24.72648048400879, "learning_rate": 8.752975444638737e-06, "loss": 0.1737, "num_input_tokens_seen": 72078256, "step": 33435 }, { "epoch": 6.13690585428519, "grad_norm": 5.467484474182129, "learning_rate": 8.752446286570337e-06, "loss": 0.3169, "num_input_tokens_seen": 72089488, "step": 33440 }, { "epoch": 6.137823453844742, "grad_norm": 1.602051019668579, "learning_rate": 8.751917032257508e-06, "loss": 0.0997, "num_input_tokens_seen": 72100208, "step": 33445 }, { "epoch": 6.138741053404295, "grad_norm": 23.969932556152344, "learning_rate": 8.751387681713822e-06, "loss": 0.1153, "num_input_tokens_seen": 72111056, "step": 33450 }, { "epoch": 6.139658652963846, "grad_norm": 4.173307418823242, "learning_rate": 8.750858234952856e-06, "loss": 0.2125, "num_input_tokens_seen": 72122480, "step": 33455 }, { "epoch": 6.140576252523399, "grad_norm": 6.502249240875244, "learning_rate": 8.750328691988191e-06, "loss": 0.75, "num_input_tokens_seen": 72133648, "step": 33460 }, { "epoch": 6.141493852082951, "grad_norm": 16.180999755859375, "learning_rate": 8.749799052833405e-06, "loss": 0.4304, "num_input_tokens_seen": 72144016, "step": 33465 }, { "epoch": 6.142411451642503, "grad_norm": 26.90887451171875, "learning_rate": 8.749269317502085e-06, "loss": 0.3111, "num_input_tokens_seen": 72154032, "step": 33470 }, { "epoch": 6.1433290512020555, "grad_norm": 8.9912748336792, "learning_rate": 8.74873948600782e-06, "loss": 0.2041, "num_input_tokens_seen": 72164144, "step": 33475 }, { "epoch": 6.144246650761608, "grad_norm": 6.224349021911621, "learning_rate": 8.748209558364195e-06, "loss": 0.2819, "num_input_tokens_seen": 72174768, "step": 33480 }, { "epoch": 6.14516425032116, "grad_norm": 8.044365882873535, "learning_rate": 8.747679534584806e-06, "loss": 0.336, "num_input_tokens_seen": 72185488, "step": 33485 }, { "epoch": 6.146081849880712, "grad_norm": 19.014097213745117, "learning_rate": 8.747149414683245e-06, "loss": 0.1553, "num_input_tokens_seen": 72197520, "step": 33490 }, { "epoch": 6.146999449440265, "grad_norm": 5.208014965057373, "learning_rate": 8.746619198673108e-06, "loss": 0.2084, "num_input_tokens_seen": 72209648, "step": 33495 }, { "epoch": 6.147917048999816, "grad_norm": 22.54482078552246, "learning_rate": 8.746088886567996e-06, "loss": 0.3405, "num_input_tokens_seen": 72219760, "step": 33500 }, { "epoch": 6.148834648559369, "grad_norm": 1.8360745906829834, "learning_rate": 8.745558478381508e-06, "loss": 0.3556, "num_input_tokens_seen": 72231056, "step": 33505 }, { "epoch": 6.149752248118921, "grad_norm": 4.414877891540527, "learning_rate": 8.745027974127253e-06, "loss": 0.2721, "num_input_tokens_seen": 72241744, "step": 33510 }, { "epoch": 6.150669847678473, "grad_norm": 21.3192195892334, "learning_rate": 8.744497373818834e-06, "loss": 0.1283, "num_input_tokens_seen": 72254000, "step": 33515 }, { "epoch": 6.151587447238025, "grad_norm": 5.538733005523682, "learning_rate": 8.74396667746986e-06, "loss": 0.4196, "num_input_tokens_seen": 72264976, "step": 33520 }, { "epoch": 6.152505046797578, "grad_norm": 26.432239532470703, "learning_rate": 8.743435885093945e-06, "loss": 0.1901, "num_input_tokens_seen": 72276688, "step": 33525 }, { "epoch": 6.1534226463571295, "grad_norm": 13.251628875732422, "learning_rate": 8.742904996704699e-06, "loss": 0.2438, "num_input_tokens_seen": 72288624, "step": 33530 }, { "epoch": 6.154340245916682, "grad_norm": 14.953575134277344, "learning_rate": 8.742374012315742e-06, "loss": 0.1953, "num_input_tokens_seen": 72299344, "step": 33535 }, { "epoch": 6.155257845476235, "grad_norm": 13.07089900970459, "learning_rate": 8.741842931940692e-06, "loss": 0.0753, "num_input_tokens_seen": 72310576, "step": 33540 }, { "epoch": 6.156175445035786, "grad_norm": 16.363962173461914, "learning_rate": 8.74131175559317e-06, "loss": 0.2434, "num_input_tokens_seen": 72321936, "step": 33545 }, { "epoch": 6.157093044595339, "grad_norm": 43.99884033203125, "learning_rate": 8.7407804832868e-06, "loss": 0.3092, "num_input_tokens_seen": 72332592, "step": 33550 }, { "epoch": 6.158010644154891, "grad_norm": 27.233367919921875, "learning_rate": 8.740249115035206e-06, "loss": 0.1845, "num_input_tokens_seen": 72343344, "step": 33555 }, { "epoch": 6.158928243714443, "grad_norm": 35.533817291259766, "learning_rate": 8.739717650852023e-06, "loss": 0.226, "num_input_tokens_seen": 72352656, "step": 33560 }, { "epoch": 6.159845843273995, "grad_norm": 8.149599075317383, "learning_rate": 8.739186090750875e-06, "loss": 0.3356, "num_input_tokens_seen": 72362640, "step": 33565 }, { "epoch": 6.160763442833548, "grad_norm": 22.641786575317383, "learning_rate": 8.738654434745402e-06, "loss": 0.2392, "num_input_tokens_seen": 72373520, "step": 33570 }, { "epoch": 6.161681042393099, "grad_norm": 0.9543883800506592, "learning_rate": 8.738122682849235e-06, "loss": 0.0557, "num_input_tokens_seen": 72383376, "step": 33575 }, { "epoch": 6.162598641952652, "grad_norm": 0.9596734642982483, "learning_rate": 8.737590835076015e-06, "loss": 0.3952, "num_input_tokens_seen": 72393776, "step": 33580 }, { "epoch": 6.1635162415122045, "grad_norm": 12.39971923828125, "learning_rate": 8.737058891439383e-06, "loss": 0.2716, "num_input_tokens_seen": 72404528, "step": 33585 }, { "epoch": 6.164433841071756, "grad_norm": 1.8458226919174194, "learning_rate": 8.736526851952982e-06, "loss": 0.1529, "num_input_tokens_seen": 72415408, "step": 33590 }, { "epoch": 6.165351440631309, "grad_norm": 10.454972267150879, "learning_rate": 8.735994716630457e-06, "loss": 0.2974, "num_input_tokens_seen": 72426864, "step": 33595 }, { "epoch": 6.166269040190861, "grad_norm": 44.69896697998047, "learning_rate": 8.735462485485462e-06, "loss": 0.1559, "num_input_tokens_seen": 72438256, "step": 33600 }, { "epoch": 6.167186639750413, "grad_norm": 2.7862086296081543, "learning_rate": 8.73493015853164e-06, "loss": 0.4455, "num_input_tokens_seen": 72450000, "step": 33605 }, { "epoch": 6.168104239309965, "grad_norm": 0.38416191935539246, "learning_rate": 8.73439773578265e-06, "loss": 0.274, "num_input_tokens_seen": 72461552, "step": 33610 }, { "epoch": 6.169021838869518, "grad_norm": 22.75653076171875, "learning_rate": 8.733865217252144e-06, "loss": 0.4796, "num_input_tokens_seen": 72471696, "step": 33615 }, { "epoch": 6.169939438429069, "grad_norm": 12.635534286499023, "learning_rate": 8.733332602953784e-06, "loss": 0.3542, "num_input_tokens_seen": 72483472, "step": 33620 }, { "epoch": 6.170857037988622, "grad_norm": 7.644508361816406, "learning_rate": 8.73279989290123e-06, "loss": 0.1839, "num_input_tokens_seen": 72495184, "step": 33625 }, { "epoch": 6.171774637548174, "grad_norm": 38.8824348449707, "learning_rate": 8.732267087108142e-06, "loss": 0.3244, "num_input_tokens_seen": 72504752, "step": 33630 }, { "epoch": 6.172692237107726, "grad_norm": 1.1918503046035767, "learning_rate": 8.731734185588186e-06, "loss": 0.4709, "num_input_tokens_seen": 72516272, "step": 33635 }, { "epoch": 6.1736098366672785, "grad_norm": 14.450275421142578, "learning_rate": 8.731201188355035e-06, "loss": 0.264, "num_input_tokens_seen": 72528112, "step": 33640 }, { "epoch": 6.174527436226831, "grad_norm": 9.86398983001709, "learning_rate": 8.730668095422354e-06, "loss": 0.3436, "num_input_tokens_seen": 72538704, "step": 33645 }, { "epoch": 6.175445035786383, "grad_norm": 0.6488649249076843, "learning_rate": 8.73013490680382e-06, "loss": 0.1717, "num_input_tokens_seen": 72549392, "step": 33650 }, { "epoch": 6.176362635345935, "grad_norm": 12.835917472839355, "learning_rate": 8.729601622513107e-06, "loss": 0.3417, "num_input_tokens_seen": 72560272, "step": 33655 }, { "epoch": 6.177280234905488, "grad_norm": 23.270463943481445, "learning_rate": 8.729068242563892e-06, "loss": 0.1818, "num_input_tokens_seen": 72570640, "step": 33660 }, { "epoch": 6.178197834465039, "grad_norm": 18.120128631591797, "learning_rate": 8.728534766969856e-06, "loss": 0.1988, "num_input_tokens_seen": 72581744, "step": 33665 }, { "epoch": 6.179115434024592, "grad_norm": 12.7843599319458, "learning_rate": 8.728001195744682e-06, "loss": 0.2193, "num_input_tokens_seen": 72591696, "step": 33670 }, { "epoch": 6.180033033584144, "grad_norm": 8.400177001953125, "learning_rate": 8.727467528902055e-06, "loss": 0.2191, "num_input_tokens_seen": 72602544, "step": 33675 }, { "epoch": 6.180950633143696, "grad_norm": 15.954157829284668, "learning_rate": 8.726933766455663e-06, "loss": 0.3038, "num_input_tokens_seen": 72613808, "step": 33680 }, { "epoch": 6.181868232703248, "grad_norm": 10.705291748046875, "learning_rate": 8.726399908419196e-06, "loss": 0.1928, "num_input_tokens_seen": 72624240, "step": 33685 }, { "epoch": 6.182785832262801, "grad_norm": 12.662598609924316, "learning_rate": 8.725865954806348e-06, "loss": 0.3319, "num_input_tokens_seen": 72636112, "step": 33690 }, { "epoch": 6.1837034318223525, "grad_norm": 10.059844017028809, "learning_rate": 8.725331905630811e-06, "loss": 0.1871, "num_input_tokens_seen": 72646608, "step": 33695 }, { "epoch": 6.184621031381905, "grad_norm": 10.00013542175293, "learning_rate": 8.724797760906285e-06, "loss": 0.1923, "num_input_tokens_seen": 72658064, "step": 33700 }, { "epoch": 6.1855386309414575, "grad_norm": 31.15730857849121, "learning_rate": 8.72426352064647e-06, "loss": 0.24, "num_input_tokens_seen": 72669552, "step": 33705 }, { "epoch": 6.186456230501009, "grad_norm": 0.26597681641578674, "learning_rate": 8.723729184865068e-06, "loss": 0.169, "num_input_tokens_seen": 72681360, "step": 33710 }, { "epoch": 6.187373830060562, "grad_norm": 7.72059965133667, "learning_rate": 8.723194753575782e-06, "loss": 0.3087, "num_input_tokens_seen": 72692560, "step": 33715 }, { "epoch": 6.188291429620114, "grad_norm": 6.525753498077393, "learning_rate": 8.722660226792324e-06, "loss": 0.288, "num_input_tokens_seen": 72704208, "step": 33720 }, { "epoch": 6.189209029179666, "grad_norm": 10.562926292419434, "learning_rate": 8.722125604528398e-06, "loss": 0.3479, "num_input_tokens_seen": 72714096, "step": 33725 }, { "epoch": 6.190126628739218, "grad_norm": 28.491724014282227, "learning_rate": 8.72159088679772e-06, "loss": 0.3801, "num_input_tokens_seen": 72726640, "step": 33730 }, { "epoch": 6.191044228298771, "grad_norm": 27.255443572998047, "learning_rate": 8.721056073614002e-06, "loss": 0.324, "num_input_tokens_seen": 72737840, "step": 33735 }, { "epoch": 6.191961827858322, "grad_norm": 4.1755828857421875, "learning_rate": 8.720521164990964e-06, "loss": 0.1728, "num_input_tokens_seen": 72748784, "step": 33740 }, { "epoch": 6.192879427417875, "grad_norm": 1.7878323793411255, "learning_rate": 8.719986160942326e-06, "loss": 0.0695, "num_input_tokens_seen": 72760624, "step": 33745 }, { "epoch": 6.193797026977427, "grad_norm": 37.541194915771484, "learning_rate": 8.719451061481808e-06, "loss": 0.2136, "num_input_tokens_seen": 72772112, "step": 33750 }, { "epoch": 6.194714626536979, "grad_norm": 17.80178451538086, "learning_rate": 8.718915866623134e-06, "loss": 0.5241, "num_input_tokens_seen": 72782352, "step": 33755 }, { "epoch": 6.1956322260965315, "grad_norm": 24.18625259399414, "learning_rate": 8.718380576380032e-06, "loss": 0.3535, "num_input_tokens_seen": 72792176, "step": 33760 }, { "epoch": 6.196549825656084, "grad_norm": 31.733129501342773, "learning_rate": 8.717845190766229e-06, "loss": 0.1831, "num_input_tokens_seen": 72803984, "step": 33765 }, { "epoch": 6.197467425215636, "grad_norm": 17.187602996826172, "learning_rate": 8.717309709795463e-06, "loss": 0.3645, "num_input_tokens_seen": 72815152, "step": 33770 }, { "epoch": 6.198385024775188, "grad_norm": 99.89041137695312, "learning_rate": 8.716774133481462e-06, "loss": 0.6266, "num_input_tokens_seen": 72824496, "step": 33775 }, { "epoch": 6.199302624334741, "grad_norm": 7.242213249206543, "learning_rate": 8.716238461837964e-06, "loss": 0.2014, "num_input_tokens_seen": 72835664, "step": 33780 }, { "epoch": 6.200220223894292, "grad_norm": 35.58095169067383, "learning_rate": 8.715702694878712e-06, "loss": 0.3159, "num_input_tokens_seen": 72845136, "step": 33785 }, { "epoch": 6.201137823453845, "grad_norm": 4.033657073974609, "learning_rate": 8.715166832617444e-06, "loss": 0.1688, "num_input_tokens_seen": 72855536, "step": 33790 }, { "epoch": 6.202055423013397, "grad_norm": 1.9878458976745605, "learning_rate": 8.714630875067901e-06, "loss": 0.2392, "num_input_tokens_seen": 72866896, "step": 33795 }, { "epoch": 6.202973022572949, "grad_norm": 9.434977531433105, "learning_rate": 8.714094822243837e-06, "loss": 0.3044, "num_input_tokens_seen": 72877616, "step": 33800 }, { "epoch": 6.203890622132501, "grad_norm": 2.2284460067749023, "learning_rate": 8.713558674158997e-06, "loss": 0.2074, "num_input_tokens_seen": 72889616, "step": 33805 }, { "epoch": 6.204808221692054, "grad_norm": 6.765059947967529, "learning_rate": 8.713022430827132e-06, "loss": 0.2213, "num_input_tokens_seen": 72900400, "step": 33810 }, { "epoch": 6.2057258212516055, "grad_norm": 14.969019889831543, "learning_rate": 8.712486092261997e-06, "loss": 0.265, "num_input_tokens_seen": 72910256, "step": 33815 }, { "epoch": 6.206643420811158, "grad_norm": 23.784358978271484, "learning_rate": 8.711949658477346e-06, "loss": 0.142, "num_input_tokens_seen": 72921200, "step": 33820 }, { "epoch": 6.207561020370711, "grad_norm": 3.128398895263672, "learning_rate": 8.711413129486938e-06, "loss": 0.1597, "num_input_tokens_seen": 72931856, "step": 33825 }, { "epoch": 6.208478619930262, "grad_norm": 14.423249244689941, "learning_rate": 8.710876505304538e-06, "loss": 0.3659, "num_input_tokens_seen": 72943056, "step": 33830 }, { "epoch": 6.209396219489815, "grad_norm": 1.957833170890808, "learning_rate": 8.710339785943906e-06, "loss": 0.1426, "num_input_tokens_seen": 72952784, "step": 33835 }, { "epoch": 6.210313819049367, "grad_norm": 3.0465121269226074, "learning_rate": 8.70980297141881e-06, "loss": 0.2143, "num_input_tokens_seen": 72963280, "step": 33840 }, { "epoch": 6.211231418608919, "grad_norm": 18.454214096069336, "learning_rate": 8.709266061743015e-06, "loss": 0.3844, "num_input_tokens_seen": 72974352, "step": 33845 }, { "epoch": 6.212149018168471, "grad_norm": 8.828197479248047, "learning_rate": 8.708729056930297e-06, "loss": 0.284, "num_input_tokens_seen": 72985744, "step": 33850 }, { "epoch": 6.213066617728024, "grad_norm": 23.557676315307617, "learning_rate": 8.708191956994425e-06, "loss": 0.2745, "num_input_tokens_seen": 72996080, "step": 33855 }, { "epoch": 6.213984217287575, "grad_norm": 22.760316848754883, "learning_rate": 8.707654761949178e-06, "loss": 0.4384, "num_input_tokens_seen": 73006224, "step": 33860 }, { "epoch": 6.214901816847128, "grad_norm": 7.571423053741455, "learning_rate": 8.707117471808332e-06, "loss": 0.2674, "num_input_tokens_seen": 73017008, "step": 33865 }, { "epoch": 6.2158194164066805, "grad_norm": 22.246803283691406, "learning_rate": 8.706580086585667e-06, "loss": 0.3453, "num_input_tokens_seen": 73028752, "step": 33870 }, { "epoch": 6.216737015966232, "grad_norm": 7.327118396759033, "learning_rate": 8.70604260629497e-06, "loss": 0.3118, "num_input_tokens_seen": 73039568, "step": 33875 }, { "epoch": 6.217654615525785, "grad_norm": 1.4188330173492432, "learning_rate": 8.705505030950022e-06, "loss": 0.1476, "num_input_tokens_seen": 73052336, "step": 33880 }, { "epoch": 6.218572215085337, "grad_norm": 13.439730644226074, "learning_rate": 8.704967360564614e-06, "loss": 0.3592, "num_input_tokens_seen": 73063440, "step": 33885 }, { "epoch": 6.219489814644889, "grad_norm": 23.44438934326172, "learning_rate": 8.704429595152535e-06, "loss": 0.2117, "num_input_tokens_seen": 73074000, "step": 33890 }, { "epoch": 6.220407414204441, "grad_norm": 3.166506290435791, "learning_rate": 8.703891734727578e-06, "loss": 0.2881, "num_input_tokens_seen": 73083824, "step": 33895 }, { "epoch": 6.221325013763994, "grad_norm": 5.809389591217041, "learning_rate": 8.70335377930354e-06, "loss": 0.5277, "num_input_tokens_seen": 73093200, "step": 33900 }, { "epoch": 6.222242613323545, "grad_norm": 1.172899603843689, "learning_rate": 8.702815728894216e-06, "loss": 0.3255, "num_input_tokens_seen": 73104112, "step": 33905 }, { "epoch": 6.223160212883098, "grad_norm": 8.631399154663086, "learning_rate": 8.702277583513408e-06, "loss": 0.4375, "num_input_tokens_seen": 73114544, "step": 33910 }, { "epoch": 6.22407781244265, "grad_norm": 5.592848777770996, "learning_rate": 8.701739343174916e-06, "loss": 0.1856, "num_input_tokens_seen": 73125008, "step": 33915 }, { "epoch": 6.224995412002202, "grad_norm": 14.12292766571045, "learning_rate": 8.70120100789255e-06, "loss": 0.2733, "num_input_tokens_seen": 73136816, "step": 33920 }, { "epoch": 6.2259130115617545, "grad_norm": 4.677799701690674, "learning_rate": 8.700662577680113e-06, "loss": 0.3219, "num_input_tokens_seen": 73147728, "step": 33925 }, { "epoch": 6.226830611121307, "grad_norm": 12.8119535446167, "learning_rate": 8.700124052551415e-06, "loss": 0.2748, "num_input_tokens_seen": 73158800, "step": 33930 }, { "epoch": 6.227748210680859, "grad_norm": 11.61180591583252, "learning_rate": 8.69958543252027e-06, "loss": 0.2213, "num_input_tokens_seen": 73169584, "step": 33935 }, { "epoch": 6.228665810240411, "grad_norm": 8.999907493591309, "learning_rate": 8.699046717600494e-06, "loss": 0.1846, "num_input_tokens_seen": 73181328, "step": 33940 }, { "epoch": 6.229583409799964, "grad_norm": 17.833463668823242, "learning_rate": 8.698507907805903e-06, "loss": 0.2244, "num_input_tokens_seen": 73192368, "step": 33945 }, { "epoch": 6.230501009359515, "grad_norm": 9.825766563415527, "learning_rate": 8.697969003150314e-06, "loss": 0.2846, "num_input_tokens_seen": 73203376, "step": 33950 }, { "epoch": 6.231418608919068, "grad_norm": 1.3219131231307983, "learning_rate": 8.697430003647554e-06, "loss": 0.2469, "num_input_tokens_seen": 73214320, "step": 33955 }, { "epoch": 6.23233620847862, "grad_norm": 6.24299955368042, "learning_rate": 8.696890909311443e-06, "loss": 0.2856, "num_input_tokens_seen": 73224496, "step": 33960 }, { "epoch": 6.233253808038172, "grad_norm": 37.49713134765625, "learning_rate": 8.69635172015581e-06, "loss": 0.2439, "num_input_tokens_seen": 73235216, "step": 33965 }, { "epoch": 6.234171407597724, "grad_norm": 12.107063293457031, "learning_rate": 8.695812436194487e-06, "loss": 0.2524, "num_input_tokens_seen": 73245904, "step": 33970 }, { "epoch": 6.235089007157277, "grad_norm": 2.362694263458252, "learning_rate": 8.6952730574413e-06, "loss": 0.1771, "num_input_tokens_seen": 73257296, "step": 33975 }, { "epoch": 6.2360066067168285, "grad_norm": 8.26502513885498, "learning_rate": 8.694733583910089e-06, "loss": 0.1387, "num_input_tokens_seen": 73269712, "step": 33980 }, { "epoch": 6.236924206276381, "grad_norm": 4.592140197753906, "learning_rate": 8.694194015614686e-06, "loss": 0.4049, "num_input_tokens_seen": 73279664, "step": 33985 }, { "epoch": 6.2378418058359335, "grad_norm": 12.831296920776367, "learning_rate": 8.693654352568932e-06, "loss": 0.333, "num_input_tokens_seen": 73290512, "step": 33990 }, { "epoch": 6.238759405395485, "grad_norm": 3.8476693630218506, "learning_rate": 8.693114594786667e-06, "loss": 0.3227, "num_input_tokens_seen": 73301424, "step": 33995 }, { "epoch": 6.239677004955038, "grad_norm": 36.33852005004883, "learning_rate": 8.692574742281739e-06, "loss": 0.2369, "num_input_tokens_seen": 73312400, "step": 34000 }, { "epoch": 6.24059460451459, "grad_norm": 11.932642936706543, "learning_rate": 8.692034795067991e-06, "loss": 0.1066, "num_input_tokens_seen": 73323632, "step": 34005 }, { "epoch": 6.241512204074142, "grad_norm": 6.746516227722168, "learning_rate": 8.691494753159272e-06, "loss": 0.2212, "num_input_tokens_seen": 73334192, "step": 34010 }, { "epoch": 6.242429803633694, "grad_norm": 10.046058654785156, "learning_rate": 8.690954616569434e-06, "loss": 0.4064, "num_input_tokens_seen": 73343696, "step": 34015 }, { "epoch": 6.243347403193247, "grad_norm": 1.670390009880066, "learning_rate": 8.69041438531233e-06, "loss": 0.1294, "num_input_tokens_seen": 73354992, "step": 34020 }, { "epoch": 6.244265002752798, "grad_norm": 7.381054401397705, "learning_rate": 8.68987405940182e-06, "loss": 0.1845, "num_input_tokens_seen": 73365008, "step": 34025 }, { "epoch": 6.245182602312351, "grad_norm": 0.9685163497924805, "learning_rate": 8.689333638851756e-06, "loss": 0.1529, "num_input_tokens_seen": 73375760, "step": 34030 }, { "epoch": 6.246100201871903, "grad_norm": 16.21347427368164, "learning_rate": 8.688793123676002e-06, "loss": 0.3824, "num_input_tokens_seen": 73386672, "step": 34035 }, { "epoch": 6.247017801431455, "grad_norm": 10.456464767456055, "learning_rate": 8.688252513888423e-06, "loss": 0.3874, "num_input_tokens_seen": 73397616, "step": 34040 }, { "epoch": 6.2479354009910075, "grad_norm": 7.220827579498291, "learning_rate": 8.68771180950288e-06, "loss": 0.2734, "num_input_tokens_seen": 73407888, "step": 34045 }, { "epoch": 6.24885300055056, "grad_norm": 5.849499225616455, "learning_rate": 8.687171010533249e-06, "loss": 0.2172, "num_input_tokens_seen": 73419824, "step": 34050 }, { "epoch": 6.249770600110112, "grad_norm": 10.13736343383789, "learning_rate": 8.686630116993395e-06, "loss": 0.2288, "num_input_tokens_seen": 73430384, "step": 34055 }, { "epoch": 6.250688199669664, "grad_norm": 13.164109230041504, "learning_rate": 8.686089128897191e-06, "loss": 0.2606, "num_input_tokens_seen": 73440496, "step": 34060 }, { "epoch": 6.251605799229217, "grad_norm": 41.9191780090332, "learning_rate": 8.685548046258514e-06, "loss": 0.3151, "num_input_tokens_seen": 73451952, "step": 34065 }, { "epoch": 6.252523398788768, "grad_norm": 7.32934045791626, "learning_rate": 8.68500686909124e-06, "loss": 0.5384, "num_input_tokens_seen": 73462896, "step": 34070 }, { "epoch": 6.253440998348321, "grad_norm": 8.62772274017334, "learning_rate": 8.684465597409255e-06, "loss": 0.2893, "num_input_tokens_seen": 73473424, "step": 34075 }, { "epoch": 6.254358597907873, "grad_norm": 1.407397747039795, "learning_rate": 8.683924231226436e-06, "loss": 0.1881, "num_input_tokens_seen": 73483728, "step": 34080 }, { "epoch": 6.255276197467425, "grad_norm": 6.6344990730285645, "learning_rate": 8.68338277055667e-06, "loss": 0.4934, "num_input_tokens_seen": 73495632, "step": 34085 }, { "epoch": 6.256193797026977, "grad_norm": 6.6889777183532715, "learning_rate": 8.682841215413844e-06, "loss": 0.2042, "num_input_tokens_seen": 73507376, "step": 34090 }, { "epoch": 6.25711139658653, "grad_norm": 9.838288307189941, "learning_rate": 8.68229956581185e-06, "loss": 0.3512, "num_input_tokens_seen": 73518096, "step": 34095 }, { "epoch": 6.2580289961460815, "grad_norm": 2.5985097885131836, "learning_rate": 8.681757821764578e-06, "loss": 0.1322, "num_input_tokens_seen": 73529392, "step": 34100 }, { "epoch": 6.258946595705634, "grad_norm": 2.2850918769836426, "learning_rate": 8.681215983285924e-06, "loss": 0.1683, "num_input_tokens_seen": 73540752, "step": 34105 }, { "epoch": 6.259864195265187, "grad_norm": 1.6804150342941284, "learning_rate": 8.680674050389787e-06, "loss": 0.0617, "num_input_tokens_seen": 73551216, "step": 34110 }, { "epoch": 6.260781794824738, "grad_norm": 10.9625244140625, "learning_rate": 8.680132023090065e-06, "loss": 0.355, "num_input_tokens_seen": 73562000, "step": 34115 }, { "epoch": 6.261699394384291, "grad_norm": 4.57906436920166, "learning_rate": 8.679589901400657e-06, "loss": 0.1768, "num_input_tokens_seen": 73572048, "step": 34120 }, { "epoch": 6.262616993943843, "grad_norm": 9.416275978088379, "learning_rate": 8.679047685335474e-06, "loss": 0.2006, "num_input_tokens_seen": 73583536, "step": 34125 }, { "epoch": 6.263534593503395, "grad_norm": 13.562826156616211, "learning_rate": 8.67850537490842e-06, "loss": 0.3863, "num_input_tokens_seen": 73594192, "step": 34130 }, { "epoch": 6.264452193062947, "grad_norm": 5.244032859802246, "learning_rate": 8.677962970133403e-06, "loss": 0.1439, "num_input_tokens_seen": 73605168, "step": 34135 }, { "epoch": 6.2653697926225, "grad_norm": 9.468151092529297, "learning_rate": 8.677420471024336e-06, "loss": 0.245, "num_input_tokens_seen": 73615600, "step": 34140 }, { "epoch": 6.266287392182051, "grad_norm": 30.205398559570312, "learning_rate": 8.676877877595135e-06, "loss": 0.1901, "num_input_tokens_seen": 73626960, "step": 34145 }, { "epoch": 6.267204991741604, "grad_norm": 7.711582660675049, "learning_rate": 8.676335189859712e-06, "loss": 0.3103, "num_input_tokens_seen": 73639248, "step": 34150 }, { "epoch": 6.2681225913011565, "grad_norm": 30.409069061279297, "learning_rate": 8.675792407831994e-06, "loss": 0.1079, "num_input_tokens_seen": 73648816, "step": 34155 }, { "epoch": 6.269040190860708, "grad_norm": 11.104389190673828, "learning_rate": 8.675249531525894e-06, "loss": 0.0668, "num_input_tokens_seen": 73658320, "step": 34160 }, { "epoch": 6.269957790420261, "grad_norm": 10.160953521728516, "learning_rate": 8.67470656095534e-06, "loss": 0.1782, "num_input_tokens_seen": 73668976, "step": 34165 }, { "epoch": 6.270875389979813, "grad_norm": 6.566156387329102, "learning_rate": 8.67416349613426e-06, "loss": 0.6935, "num_input_tokens_seen": 73680528, "step": 34170 }, { "epoch": 6.271792989539365, "grad_norm": 2.605759620666504, "learning_rate": 8.673620337076578e-06, "loss": 0.1668, "num_input_tokens_seen": 73691536, "step": 34175 }, { "epoch": 6.272710589098917, "grad_norm": 20.527360916137695, "learning_rate": 8.67307708379623e-06, "loss": 0.2487, "num_input_tokens_seen": 73701808, "step": 34180 }, { "epoch": 6.27362818865847, "grad_norm": 9.38135814666748, "learning_rate": 8.672533736307148e-06, "loss": 0.3564, "num_input_tokens_seen": 73712272, "step": 34185 }, { "epoch": 6.274545788218021, "grad_norm": 24.72007179260254, "learning_rate": 8.671990294623265e-06, "loss": 0.358, "num_input_tokens_seen": 73723984, "step": 34190 }, { "epoch": 6.275463387777574, "grad_norm": 3.470027208328247, "learning_rate": 8.671446758758523e-06, "loss": 0.2602, "num_input_tokens_seen": 73733648, "step": 34195 }, { "epoch": 6.276380987337126, "grad_norm": 19.972827911376953, "learning_rate": 8.670903128726862e-06, "loss": 0.2664, "num_input_tokens_seen": 73744496, "step": 34200 }, { "epoch": 6.277298586896678, "grad_norm": 14.654345512390137, "learning_rate": 8.670359404542226e-06, "loss": 0.2267, "num_input_tokens_seen": 73753936, "step": 34205 }, { "epoch": 6.2782161864562305, "grad_norm": 0.7499194741249084, "learning_rate": 8.669815586218559e-06, "loss": 0.2274, "num_input_tokens_seen": 73765264, "step": 34210 }, { "epoch": 6.279133786015783, "grad_norm": 1.513833999633789, "learning_rate": 8.669271673769811e-06, "loss": 0.0345, "num_input_tokens_seen": 73776240, "step": 34215 }, { "epoch": 6.280051385575335, "grad_norm": 5.679277420043945, "learning_rate": 8.66872766720993e-06, "loss": 0.2849, "num_input_tokens_seen": 73788272, "step": 34220 }, { "epoch": 6.280968985134887, "grad_norm": 16.839981079101562, "learning_rate": 8.66818356655287e-06, "loss": 0.4363, "num_input_tokens_seen": 73799088, "step": 34225 }, { "epoch": 6.28188658469444, "grad_norm": 10.07079792022705, "learning_rate": 8.667639371812588e-06, "loss": 0.166, "num_input_tokens_seen": 73809808, "step": 34230 }, { "epoch": 6.282804184253991, "grad_norm": 19.480762481689453, "learning_rate": 8.66709508300304e-06, "loss": 0.2097, "num_input_tokens_seen": 73821360, "step": 34235 }, { "epoch": 6.283721783813544, "grad_norm": 0.9361969828605652, "learning_rate": 8.666550700138187e-06, "loss": 0.2787, "num_input_tokens_seen": 73832304, "step": 34240 }, { "epoch": 6.284639383373096, "grad_norm": 46.80681610107422, "learning_rate": 8.66600622323199e-06, "loss": 0.3467, "num_input_tokens_seen": 73842704, "step": 34245 }, { "epoch": 6.285556982932648, "grad_norm": 9.234562873840332, "learning_rate": 8.665461652298416e-06, "loss": 0.2951, "num_input_tokens_seen": 73854448, "step": 34250 }, { "epoch": 6.2864745824922, "grad_norm": 7.296894550323486, "learning_rate": 8.664916987351432e-06, "loss": 0.2087, "num_input_tokens_seen": 73866160, "step": 34255 }, { "epoch": 6.287392182051753, "grad_norm": 0.8629394173622131, "learning_rate": 8.664372228405007e-06, "loss": 0.297, "num_input_tokens_seen": 73877360, "step": 34260 }, { "epoch": 6.2883097816113045, "grad_norm": 5.063983917236328, "learning_rate": 8.663827375473114e-06, "loss": 0.3465, "num_input_tokens_seen": 73888112, "step": 34265 }, { "epoch": 6.289227381170857, "grad_norm": 4.8725810050964355, "learning_rate": 8.663282428569728e-06, "loss": 0.4339, "num_input_tokens_seen": 73897808, "step": 34270 }, { "epoch": 6.2901449807304095, "grad_norm": 10.199779510498047, "learning_rate": 8.662737387708822e-06, "loss": 0.3008, "num_input_tokens_seen": 73909168, "step": 34275 }, { "epoch": 6.291062580289961, "grad_norm": 14.578518867492676, "learning_rate": 8.662192252904381e-06, "loss": 0.2479, "num_input_tokens_seen": 73920144, "step": 34280 }, { "epoch": 6.291980179849514, "grad_norm": 5.801905155181885, "learning_rate": 8.661647024170385e-06, "loss": 0.2894, "num_input_tokens_seen": 73930384, "step": 34285 }, { "epoch": 6.292897779409066, "grad_norm": 12.91928768157959, "learning_rate": 8.661101701520817e-06, "loss": 0.2808, "num_input_tokens_seen": 73941616, "step": 34290 }, { "epoch": 6.293815378968618, "grad_norm": 12.069855690002441, "learning_rate": 8.660556284969665e-06, "loss": 0.3598, "num_input_tokens_seen": 73952688, "step": 34295 }, { "epoch": 6.29473297852817, "grad_norm": 5.317779541015625, "learning_rate": 8.660010774530917e-06, "loss": 0.2218, "num_input_tokens_seen": 73963728, "step": 34300 }, { "epoch": 6.295650578087723, "grad_norm": 6.509701251983643, "learning_rate": 8.659465170218565e-06, "loss": 0.3196, "num_input_tokens_seen": 73974096, "step": 34305 }, { "epoch": 6.296568177647274, "grad_norm": 6.378002643585205, "learning_rate": 8.658919472046606e-06, "loss": 0.1357, "num_input_tokens_seen": 73984880, "step": 34310 }, { "epoch": 6.297485777206827, "grad_norm": 7.030650615692139, "learning_rate": 8.658373680029029e-06, "loss": 0.2191, "num_input_tokens_seen": 73996848, "step": 34315 }, { "epoch": 6.298403376766379, "grad_norm": 25.3587646484375, "learning_rate": 8.65782779417984e-06, "loss": 0.2537, "num_input_tokens_seen": 74007952, "step": 34320 }, { "epoch": 6.299320976325931, "grad_norm": 35.82129669189453, "learning_rate": 8.657281814513037e-06, "loss": 0.2106, "num_input_tokens_seen": 74018544, "step": 34325 }, { "epoch": 6.3002385758854835, "grad_norm": 3.746781826019287, "learning_rate": 8.65673574104262e-06, "loss": 0.2759, "num_input_tokens_seen": 74029520, "step": 34330 }, { "epoch": 6.301156175445036, "grad_norm": 8.88820743560791, "learning_rate": 8.656189573782602e-06, "loss": 0.3547, "num_input_tokens_seen": 74039664, "step": 34335 }, { "epoch": 6.302073775004588, "grad_norm": 7.582515239715576, "learning_rate": 8.655643312746988e-06, "loss": 0.5153, "num_input_tokens_seen": 74051152, "step": 34340 }, { "epoch": 6.30299137456414, "grad_norm": 7.509541034698486, "learning_rate": 8.65509695794979e-06, "loss": 0.253, "num_input_tokens_seen": 74061840, "step": 34345 }, { "epoch": 6.303908974123693, "grad_norm": 10.98784065246582, "learning_rate": 8.654550509405018e-06, "loss": 0.2808, "num_input_tokens_seen": 74071984, "step": 34350 }, { "epoch": 6.304826573683244, "grad_norm": 1.435011625289917, "learning_rate": 8.65400396712669e-06, "loss": 0.2957, "num_input_tokens_seen": 74082768, "step": 34355 }, { "epoch": 6.305744173242797, "grad_norm": 1.072344183921814, "learning_rate": 8.653457331128823e-06, "loss": 0.115, "num_input_tokens_seen": 74094064, "step": 34360 }, { "epoch": 6.306661772802349, "grad_norm": 3.7931790351867676, "learning_rate": 8.652910601425438e-06, "loss": 0.2102, "num_input_tokens_seen": 74105872, "step": 34365 }, { "epoch": 6.307579372361901, "grad_norm": 14.326773643493652, "learning_rate": 8.652363778030558e-06, "loss": 0.401, "num_input_tokens_seen": 74115824, "step": 34370 }, { "epoch": 6.308496971921453, "grad_norm": 1.0552982091903687, "learning_rate": 8.651816860958207e-06, "loss": 0.1604, "num_input_tokens_seen": 74125328, "step": 34375 }, { "epoch": 6.309414571481006, "grad_norm": 4.308262348175049, "learning_rate": 8.651269850222414e-06, "loss": 0.3389, "num_input_tokens_seen": 74136112, "step": 34380 }, { "epoch": 6.3103321710405575, "grad_norm": 11.109164237976074, "learning_rate": 8.650722745837208e-06, "loss": 0.3563, "num_input_tokens_seen": 74147600, "step": 34385 }, { "epoch": 6.31124977060011, "grad_norm": 0.4222208559513092, "learning_rate": 8.650175547816621e-06, "loss": 0.2593, "num_input_tokens_seen": 74158480, "step": 34390 }, { "epoch": 6.312167370159663, "grad_norm": 2.9570415019989014, "learning_rate": 8.649628256174689e-06, "loss": 0.162, "num_input_tokens_seen": 74169744, "step": 34395 }, { "epoch": 6.313084969719214, "grad_norm": 10.843975067138672, "learning_rate": 8.64908087092545e-06, "loss": 0.1181, "num_input_tokens_seen": 74180336, "step": 34400 }, { "epoch": 6.314002569278767, "grad_norm": 13.577634811401367, "learning_rate": 8.648533392082941e-06, "loss": 0.254, "num_input_tokens_seen": 74193072, "step": 34405 }, { "epoch": 6.314920168838319, "grad_norm": 5.2242045402526855, "learning_rate": 8.647985819661204e-06, "loss": 0.2319, "num_input_tokens_seen": 74204368, "step": 34410 }, { "epoch": 6.315837768397871, "grad_norm": 1.5733951330184937, "learning_rate": 8.647438153674286e-06, "loss": 0.1645, "num_input_tokens_seen": 74214000, "step": 34415 }, { "epoch": 6.316755367957423, "grad_norm": 4.563241004943848, "learning_rate": 8.646890394136232e-06, "loss": 0.2649, "num_input_tokens_seen": 74225008, "step": 34420 }, { "epoch": 6.317672967516976, "grad_norm": 4.7878313064575195, "learning_rate": 8.646342541061093e-06, "loss": 0.1892, "num_input_tokens_seen": 74235632, "step": 34425 }, { "epoch": 6.318590567076527, "grad_norm": 1.3529151678085327, "learning_rate": 8.645794594462918e-06, "loss": 0.1947, "num_input_tokens_seen": 74247696, "step": 34430 }, { "epoch": 6.31950816663608, "grad_norm": 2.142679214477539, "learning_rate": 8.645246554355761e-06, "loss": 0.2142, "num_input_tokens_seen": 74258544, "step": 34435 }, { "epoch": 6.3204257661956325, "grad_norm": 13.416200637817383, "learning_rate": 8.64469842075368e-06, "loss": 0.2221, "num_input_tokens_seen": 74269392, "step": 34440 }, { "epoch": 6.321343365755184, "grad_norm": 4.037795066833496, "learning_rate": 8.644150193670735e-06, "loss": 0.1986, "num_input_tokens_seen": 74280272, "step": 34445 }, { "epoch": 6.322260965314737, "grad_norm": 12.85984992980957, "learning_rate": 8.643601873120983e-06, "loss": 0.327, "num_input_tokens_seen": 74291408, "step": 34450 }, { "epoch": 6.323178564874289, "grad_norm": 3.873490571975708, "learning_rate": 8.643053459118492e-06, "loss": 0.2166, "num_input_tokens_seen": 74301520, "step": 34455 }, { "epoch": 6.324096164433841, "grad_norm": 1.4436068534851074, "learning_rate": 8.642504951677325e-06, "loss": 0.3262, "num_input_tokens_seen": 74312144, "step": 34460 }, { "epoch": 6.325013763993393, "grad_norm": 1.8303693532943726, "learning_rate": 8.64195635081155e-06, "loss": 0.238, "num_input_tokens_seen": 74324080, "step": 34465 }, { "epoch": 6.325931363552946, "grad_norm": 0.9801138043403625, "learning_rate": 8.641407656535242e-06, "loss": 0.1698, "num_input_tokens_seen": 74336144, "step": 34470 }, { "epoch": 6.326848963112497, "grad_norm": 0.4554586708545685, "learning_rate": 8.64085886886247e-06, "loss": 0.1399, "num_input_tokens_seen": 74345936, "step": 34475 }, { "epoch": 6.32776656267205, "grad_norm": 0.7528419494628906, "learning_rate": 8.640309987807311e-06, "loss": 0.1974, "num_input_tokens_seen": 74355504, "step": 34480 }, { "epoch": 6.328684162231602, "grad_norm": 7.759350776672363, "learning_rate": 8.639761013383842e-06, "loss": 0.3272, "num_input_tokens_seen": 74366992, "step": 34485 }, { "epoch": 6.329601761791154, "grad_norm": 3.346144676208496, "learning_rate": 8.639211945606146e-06, "loss": 0.4664, "num_input_tokens_seen": 74377840, "step": 34490 }, { "epoch": 6.3305193613507065, "grad_norm": 18.911212921142578, "learning_rate": 8.638662784488302e-06, "loss": 0.4589, "num_input_tokens_seen": 74389360, "step": 34495 }, { "epoch": 6.331436960910259, "grad_norm": 14.308477401733398, "learning_rate": 8.638113530044397e-06, "loss": 0.3067, "num_input_tokens_seen": 74398864, "step": 34500 }, { "epoch": 6.332354560469811, "grad_norm": 5.2037811279296875, "learning_rate": 8.63756418228852e-06, "loss": 0.1801, "num_input_tokens_seen": 74408976, "step": 34505 }, { "epoch": 6.333272160029363, "grad_norm": 5.638893127441406, "learning_rate": 8.637014741234758e-06, "loss": 0.3273, "num_input_tokens_seen": 74419792, "step": 34510 }, { "epoch": 6.334189759588916, "grad_norm": 5.578425884246826, "learning_rate": 8.636465206897207e-06, "loss": 0.2563, "num_input_tokens_seen": 74430576, "step": 34515 }, { "epoch": 6.335107359148467, "grad_norm": 15.379647254943848, "learning_rate": 8.635915579289957e-06, "loss": 0.2925, "num_input_tokens_seen": 74442896, "step": 34520 }, { "epoch": 6.33602495870802, "grad_norm": 22.32973289489746, "learning_rate": 8.635365858427107e-06, "loss": 0.244, "num_input_tokens_seen": 74454192, "step": 34525 }, { "epoch": 6.336942558267572, "grad_norm": 8.853982925415039, "learning_rate": 8.634816044322759e-06, "loss": 0.2409, "num_input_tokens_seen": 74464752, "step": 34530 }, { "epoch": 6.337860157827124, "grad_norm": 3.847970724105835, "learning_rate": 8.634266136991011e-06, "loss": 0.1348, "num_input_tokens_seen": 74476112, "step": 34535 }, { "epoch": 6.338777757386676, "grad_norm": 2.0080063343048096, "learning_rate": 8.633716136445971e-06, "loss": 0.4133, "num_input_tokens_seen": 74486224, "step": 34540 }, { "epoch": 6.339695356946229, "grad_norm": 15.954817771911621, "learning_rate": 8.633166042701744e-06, "loss": 0.2191, "num_input_tokens_seen": 74497040, "step": 34545 }, { "epoch": 6.3406129565057805, "grad_norm": 22.007600784301758, "learning_rate": 8.632615855772439e-06, "loss": 0.196, "num_input_tokens_seen": 74508720, "step": 34550 }, { "epoch": 6.341530556065333, "grad_norm": 0.8520638346672058, "learning_rate": 8.632065575672166e-06, "loss": 0.1122, "num_input_tokens_seen": 74519568, "step": 34555 }, { "epoch": 6.3424481556248855, "grad_norm": 1.2585382461547852, "learning_rate": 8.631515202415041e-06, "loss": 0.1193, "num_input_tokens_seen": 74529776, "step": 34560 }, { "epoch": 6.343365755184437, "grad_norm": 0.6895467042922974, "learning_rate": 8.630964736015179e-06, "loss": 0.1812, "num_input_tokens_seen": 74541104, "step": 34565 }, { "epoch": 6.34428335474399, "grad_norm": 15.14358139038086, "learning_rate": 8.630414176486699e-06, "loss": 0.1427, "num_input_tokens_seen": 74551280, "step": 34570 }, { "epoch": 6.345200954303542, "grad_norm": 0.5781453251838684, "learning_rate": 8.629863523843722e-06, "loss": 0.1896, "num_input_tokens_seen": 74563248, "step": 34575 }, { "epoch": 6.346118553863094, "grad_norm": 6.485296249389648, "learning_rate": 8.629312778100371e-06, "loss": 0.2935, "num_input_tokens_seen": 74574480, "step": 34580 }, { "epoch": 6.347036153422646, "grad_norm": 22.551342010498047, "learning_rate": 8.628761939270774e-06, "loss": 0.3095, "num_input_tokens_seen": 74584432, "step": 34585 }, { "epoch": 6.347953752982199, "grad_norm": 14.924776077270508, "learning_rate": 8.628211007369056e-06, "loss": 0.5847, "num_input_tokens_seen": 74594800, "step": 34590 }, { "epoch": 6.34887135254175, "grad_norm": 10.895851135253906, "learning_rate": 8.62765998240935e-06, "loss": 0.1331, "num_input_tokens_seen": 74605168, "step": 34595 }, { "epoch": 6.349788952101303, "grad_norm": 11.851900100708008, "learning_rate": 8.627108864405784e-06, "loss": 0.4748, "num_input_tokens_seen": 74616208, "step": 34600 }, { "epoch": 6.350706551660855, "grad_norm": 12.215657234191895, "learning_rate": 8.6265576533725e-06, "loss": 0.3075, "num_input_tokens_seen": 74626768, "step": 34605 }, { "epoch": 6.351624151220407, "grad_norm": 32.97916030883789, "learning_rate": 8.626006349323633e-06, "loss": 0.2717, "num_input_tokens_seen": 74636624, "step": 34610 }, { "epoch": 6.3525417507799595, "grad_norm": 6.234999179840088, "learning_rate": 8.625454952273323e-06, "loss": 0.4985, "num_input_tokens_seen": 74647088, "step": 34615 }, { "epoch": 6.353459350339512, "grad_norm": 5.30214786529541, "learning_rate": 8.624903462235713e-06, "loss": 0.2205, "num_input_tokens_seen": 74658320, "step": 34620 }, { "epoch": 6.354376949899064, "grad_norm": 5.538553714752197, "learning_rate": 8.624351879224945e-06, "loss": 0.2128, "num_input_tokens_seen": 74669264, "step": 34625 }, { "epoch": 6.355294549458616, "grad_norm": 27.61943244934082, "learning_rate": 8.62380020325517e-06, "loss": 0.1082, "num_input_tokens_seen": 74680208, "step": 34630 }, { "epoch": 6.356212149018169, "grad_norm": 8.613398551940918, "learning_rate": 8.623248434340537e-06, "loss": 0.2643, "num_input_tokens_seen": 74689808, "step": 34635 }, { "epoch": 6.35712974857772, "grad_norm": 9.034622192382812, "learning_rate": 8.622696572495195e-06, "loss": 0.2542, "num_input_tokens_seen": 74700912, "step": 34640 }, { "epoch": 6.358047348137273, "grad_norm": 10.844810485839844, "learning_rate": 8.622144617733302e-06, "loss": 0.1574, "num_input_tokens_seen": 74711888, "step": 34645 }, { "epoch": 6.358964947696825, "grad_norm": 11.978307723999023, "learning_rate": 8.621592570069013e-06, "loss": 0.2264, "num_input_tokens_seen": 74722896, "step": 34650 }, { "epoch": 6.359882547256377, "grad_norm": 13.237751960754395, "learning_rate": 8.621040429516488e-06, "loss": 0.3504, "num_input_tokens_seen": 74734288, "step": 34655 }, { "epoch": 6.360800146815929, "grad_norm": 0.6893316507339478, "learning_rate": 8.620488196089888e-06, "loss": 0.1863, "num_input_tokens_seen": 74745936, "step": 34660 }, { "epoch": 6.361717746375482, "grad_norm": 1.1266766786575317, "learning_rate": 8.619935869803378e-06, "loss": 0.2236, "num_input_tokens_seen": 74756368, "step": 34665 }, { "epoch": 6.3626353459350335, "grad_norm": 21.183530807495117, "learning_rate": 8.619383450671121e-06, "loss": 0.2891, "num_input_tokens_seen": 74769168, "step": 34670 }, { "epoch": 6.363552945494586, "grad_norm": 8.966829299926758, "learning_rate": 8.61883093870729e-06, "loss": 0.1981, "num_input_tokens_seen": 74780560, "step": 34675 }, { "epoch": 6.364470545054139, "grad_norm": 6.872433662414551, "learning_rate": 8.618278333926053e-06, "loss": 0.3525, "num_input_tokens_seen": 74792272, "step": 34680 }, { "epoch": 6.36538814461369, "grad_norm": 5.168094635009766, "learning_rate": 8.617725636341585e-06, "loss": 0.0977, "num_input_tokens_seen": 74803184, "step": 34685 }, { "epoch": 6.366305744173243, "grad_norm": 12.258829116821289, "learning_rate": 8.61717284596806e-06, "loss": 0.2971, "num_input_tokens_seen": 74814352, "step": 34690 }, { "epoch": 6.367223343732795, "grad_norm": 1.4989664554595947, "learning_rate": 8.61661996281966e-06, "loss": 0.429, "num_input_tokens_seen": 74824528, "step": 34695 }, { "epoch": 6.368140943292347, "grad_norm": 3.6361136436462402, "learning_rate": 8.616066986910561e-06, "loss": 0.1255, "num_input_tokens_seen": 74835472, "step": 34700 }, { "epoch": 6.369058542851899, "grad_norm": 9.849392890930176, "learning_rate": 8.61551391825495e-06, "loss": 0.3488, "num_input_tokens_seen": 74846672, "step": 34705 }, { "epoch": 6.369976142411452, "grad_norm": 9.94106674194336, "learning_rate": 8.614960756867009e-06, "loss": 0.368, "num_input_tokens_seen": 74857008, "step": 34710 }, { "epoch": 6.370893741971003, "grad_norm": 12.547865867614746, "learning_rate": 8.614407502760928e-06, "loss": 0.2067, "num_input_tokens_seen": 74867152, "step": 34715 }, { "epoch": 6.371811341530556, "grad_norm": 5.764678478240967, "learning_rate": 8.613854155950897e-06, "loss": 0.1707, "num_input_tokens_seen": 74876784, "step": 34720 }, { "epoch": 6.3727289410901085, "grad_norm": 0.875232994556427, "learning_rate": 8.613300716451107e-06, "loss": 0.2348, "num_input_tokens_seen": 74887792, "step": 34725 }, { "epoch": 6.37364654064966, "grad_norm": 19.574352264404297, "learning_rate": 8.612747184275753e-06, "loss": 0.2479, "num_input_tokens_seen": 74898640, "step": 34730 }, { "epoch": 6.374564140209213, "grad_norm": 16.6701717376709, "learning_rate": 8.612193559439035e-06, "loss": 0.2734, "num_input_tokens_seen": 74909808, "step": 34735 }, { "epoch": 6.375481739768765, "grad_norm": 16.848312377929688, "learning_rate": 8.61163984195515e-06, "loss": 0.2486, "num_input_tokens_seen": 74920048, "step": 34740 }, { "epoch": 6.376399339328317, "grad_norm": 30.97053337097168, "learning_rate": 8.6110860318383e-06, "loss": 0.2676, "num_input_tokens_seen": 74930960, "step": 34745 }, { "epoch": 6.377316938887869, "grad_norm": 17.783573150634766, "learning_rate": 8.610532129102689e-06, "loss": 0.2968, "num_input_tokens_seen": 74942128, "step": 34750 }, { "epoch": 6.378234538447422, "grad_norm": 8.462260246276855, "learning_rate": 8.609978133762527e-06, "loss": 0.3388, "num_input_tokens_seen": 74953456, "step": 34755 }, { "epoch": 6.379152138006974, "grad_norm": 19.637720108032227, "learning_rate": 8.60942404583202e-06, "loss": 0.2545, "num_input_tokens_seen": 74966128, "step": 34760 }, { "epoch": 6.380069737566526, "grad_norm": 25.959823608398438, "learning_rate": 8.60886986532538e-06, "loss": 0.2716, "num_input_tokens_seen": 74976912, "step": 34765 }, { "epoch": 6.380987337126078, "grad_norm": 1.795961618423462, "learning_rate": 8.60831559225682e-06, "loss": 0.2192, "num_input_tokens_seen": 74989072, "step": 34770 }, { "epoch": 6.381904936685631, "grad_norm": 4.417482376098633, "learning_rate": 8.607761226640559e-06, "loss": 0.1716, "num_input_tokens_seen": 75000048, "step": 34775 }, { "epoch": 6.3828225362451825, "grad_norm": 6.226463317871094, "learning_rate": 8.607206768490815e-06, "loss": 0.2982, "num_input_tokens_seen": 75009968, "step": 34780 }, { "epoch": 6.383740135804735, "grad_norm": 1.4989672899246216, "learning_rate": 8.606652217821806e-06, "loss": 0.1516, "num_input_tokens_seen": 75020656, "step": 34785 }, { "epoch": 6.3846577353642875, "grad_norm": 11.591711044311523, "learning_rate": 8.606097574647759e-06, "loss": 0.3379, "num_input_tokens_seen": 75031216, "step": 34790 }, { "epoch": 6.385575334923839, "grad_norm": 0.8488574624061584, "learning_rate": 8.605542838982896e-06, "loss": 0.1953, "num_input_tokens_seen": 75042992, "step": 34795 }, { "epoch": 6.386492934483392, "grad_norm": 5.91269588470459, "learning_rate": 8.604988010841448e-06, "loss": 0.2816, "num_input_tokens_seen": 75054832, "step": 34800 }, { "epoch": 6.387410534042944, "grad_norm": 12.479416847229004, "learning_rate": 8.604433090237646e-06, "loss": 0.3856, "num_input_tokens_seen": 75065648, "step": 34805 }, { "epoch": 6.388328133602496, "grad_norm": 15.769739151000977, "learning_rate": 8.60387807718572e-06, "loss": 0.36, "num_input_tokens_seen": 75076944, "step": 34810 }, { "epoch": 6.389245733162048, "grad_norm": 5.9879913330078125, "learning_rate": 8.603322971699908e-06, "loss": 0.3267, "num_input_tokens_seen": 75087632, "step": 34815 }, { "epoch": 6.390163332721601, "grad_norm": 13.806763648986816, "learning_rate": 8.602767773794447e-06, "loss": 0.3442, "num_input_tokens_seen": 75099056, "step": 34820 }, { "epoch": 6.391080932281152, "grad_norm": 13.160304069519043, "learning_rate": 8.602212483483575e-06, "loss": 0.1989, "num_input_tokens_seen": 75108912, "step": 34825 }, { "epoch": 6.391998531840705, "grad_norm": 21.580181121826172, "learning_rate": 8.601657100781537e-06, "loss": 0.339, "num_input_tokens_seen": 75120880, "step": 34830 }, { "epoch": 6.392916131400257, "grad_norm": 16.973299026489258, "learning_rate": 8.601101625702575e-06, "loss": 0.4544, "num_input_tokens_seen": 75132720, "step": 34835 }, { "epoch": 6.393833730959809, "grad_norm": 7.156024932861328, "learning_rate": 8.60054605826094e-06, "loss": 0.1624, "num_input_tokens_seen": 75144144, "step": 34840 }, { "epoch": 6.3947513305193615, "grad_norm": 6.453733921051025, "learning_rate": 8.599990398470875e-06, "loss": 0.1839, "num_input_tokens_seen": 75155664, "step": 34845 }, { "epoch": 6.395668930078914, "grad_norm": 15.911724090576172, "learning_rate": 8.599434646346638e-06, "loss": 0.2081, "num_input_tokens_seen": 75167536, "step": 34850 }, { "epoch": 6.396586529638466, "grad_norm": 12.174631118774414, "learning_rate": 8.598878801902481e-06, "loss": 0.2598, "num_input_tokens_seen": 75178032, "step": 34855 }, { "epoch": 6.397504129198018, "grad_norm": 8.612403869628906, "learning_rate": 8.598322865152661e-06, "loss": 0.4413, "num_input_tokens_seen": 75187760, "step": 34860 }, { "epoch": 6.398421728757571, "grad_norm": 1.6705392599105835, "learning_rate": 8.597766836111434e-06, "loss": 0.1821, "num_input_tokens_seen": 75199248, "step": 34865 }, { "epoch": 6.399339328317122, "grad_norm": 14.283385276794434, "learning_rate": 8.597210714793068e-06, "loss": 0.2898, "num_input_tokens_seen": 75210736, "step": 34870 }, { "epoch": 6.400256927876675, "grad_norm": 1.3138881921768188, "learning_rate": 8.596654501211819e-06, "loss": 0.0792, "num_input_tokens_seen": 75221328, "step": 34875 }, { "epoch": 6.401174527436227, "grad_norm": 4.763573169708252, "learning_rate": 8.596098195381956e-06, "loss": 0.2436, "num_input_tokens_seen": 75231920, "step": 34880 }, { "epoch": 6.402092126995779, "grad_norm": 15.481524467468262, "learning_rate": 8.595541797317751e-06, "loss": 0.2077, "num_input_tokens_seen": 75242768, "step": 34885 }, { "epoch": 6.403009726555331, "grad_norm": 13.84090805053711, "learning_rate": 8.594985307033467e-06, "loss": 0.2215, "num_input_tokens_seen": 75253904, "step": 34890 }, { "epoch": 6.403927326114884, "grad_norm": 17.234830856323242, "learning_rate": 8.594428724543384e-06, "loss": 0.2496, "num_input_tokens_seen": 75266448, "step": 34895 }, { "epoch": 6.4048449256744355, "grad_norm": 7.429708003997803, "learning_rate": 8.593872049861776e-06, "loss": 0.3104, "num_input_tokens_seen": 75276304, "step": 34900 }, { "epoch": 6.405762525233988, "grad_norm": 6.503519535064697, "learning_rate": 8.593315283002919e-06, "loss": 0.179, "num_input_tokens_seen": 75287792, "step": 34905 }, { "epoch": 6.4066801247935405, "grad_norm": 12.528441429138184, "learning_rate": 8.592758423981093e-06, "loss": 0.2476, "num_input_tokens_seen": 75298928, "step": 34910 }, { "epoch": 6.407597724353092, "grad_norm": 4.260528087615967, "learning_rate": 8.592201472810584e-06, "loss": 0.1505, "num_input_tokens_seen": 75310288, "step": 34915 }, { "epoch": 6.408515323912645, "grad_norm": 0.8395993709564209, "learning_rate": 8.591644429505672e-06, "loss": 0.1732, "num_input_tokens_seen": 75322160, "step": 34920 }, { "epoch": 6.409432923472197, "grad_norm": 15.266879081726074, "learning_rate": 8.591087294080648e-06, "loss": 0.3453, "num_input_tokens_seen": 75332048, "step": 34925 }, { "epoch": 6.410350523031749, "grad_norm": 1.750220537185669, "learning_rate": 8.590530066549802e-06, "loss": 0.1559, "num_input_tokens_seen": 75342864, "step": 34930 }, { "epoch": 6.411268122591301, "grad_norm": 2.9313862323760986, "learning_rate": 8.589972746927425e-06, "loss": 0.4774, "num_input_tokens_seen": 75353168, "step": 34935 }, { "epoch": 6.412185722150854, "grad_norm": 17.31282615661621, "learning_rate": 8.58941533522781e-06, "loss": 0.2621, "num_input_tokens_seen": 75363312, "step": 34940 }, { "epoch": 6.413103321710405, "grad_norm": 10.368977546691895, "learning_rate": 8.588857831465252e-06, "loss": 0.2171, "num_input_tokens_seen": 75372304, "step": 34945 }, { "epoch": 6.414020921269958, "grad_norm": 13.16063117980957, "learning_rate": 8.588300235654055e-06, "loss": 0.1389, "num_input_tokens_seen": 75383120, "step": 34950 }, { "epoch": 6.41493852082951, "grad_norm": 8.317540168762207, "learning_rate": 8.587742547808519e-06, "loss": 0.1694, "num_input_tokens_seen": 75393456, "step": 34955 }, { "epoch": 6.415856120389062, "grad_norm": 0.7783006429672241, "learning_rate": 8.587184767942946e-06, "loss": 0.1641, "num_input_tokens_seen": 75403952, "step": 34960 }, { "epoch": 6.416773719948615, "grad_norm": 2.5179271697998047, "learning_rate": 8.586626896071643e-06, "loss": 0.3134, "num_input_tokens_seen": 75413968, "step": 34965 }, { "epoch": 6.417691319508167, "grad_norm": 3.956176280975342, "learning_rate": 8.586068932208922e-06, "loss": 0.1773, "num_input_tokens_seen": 75425776, "step": 34970 }, { "epoch": 6.418608919067719, "grad_norm": 9.981300354003906, "learning_rate": 8.585510876369088e-06, "loss": 0.3245, "num_input_tokens_seen": 75436336, "step": 34975 }, { "epoch": 6.419526518627271, "grad_norm": 15.280652046203613, "learning_rate": 8.584952728566459e-06, "loss": 0.2216, "num_input_tokens_seen": 75446608, "step": 34980 }, { "epoch": 6.420444118186824, "grad_norm": 0.8022050857543945, "learning_rate": 8.584394488815347e-06, "loss": 0.3276, "num_input_tokens_seen": 75458256, "step": 34985 }, { "epoch": 6.421361717746375, "grad_norm": 5.374061107635498, "learning_rate": 8.58383615713007e-06, "loss": 0.2363, "num_input_tokens_seen": 75468688, "step": 34990 }, { "epoch": 6.422279317305928, "grad_norm": 18.475223541259766, "learning_rate": 8.583277733524952e-06, "loss": 0.1317, "num_input_tokens_seen": 75479824, "step": 34995 }, { "epoch": 6.42319691686548, "grad_norm": 30.19614028930664, "learning_rate": 8.582719218014314e-06, "loss": 0.2499, "num_input_tokens_seen": 75490960, "step": 35000 }, { "epoch": 6.424114516425032, "grad_norm": 20.68231773376465, "learning_rate": 8.58216061061248e-06, "loss": 0.2868, "num_input_tokens_seen": 75502416, "step": 35005 }, { "epoch": 6.4250321159845845, "grad_norm": 5.299926280975342, "learning_rate": 8.581601911333778e-06, "loss": 0.3328, "num_input_tokens_seen": 75512656, "step": 35010 }, { "epoch": 6.425949715544137, "grad_norm": 17.621248245239258, "learning_rate": 8.581043120192541e-06, "loss": 0.721, "num_input_tokens_seen": 75523440, "step": 35015 }, { "epoch": 6.426867315103689, "grad_norm": 75.50369262695312, "learning_rate": 8.580484237203095e-06, "loss": 0.3225, "num_input_tokens_seen": 75534224, "step": 35020 }, { "epoch": 6.427784914663241, "grad_norm": 4.696043014526367, "learning_rate": 8.579925262379778e-06, "loss": 0.3768, "num_input_tokens_seen": 75543504, "step": 35025 }, { "epoch": 6.428702514222794, "grad_norm": 5.430944442749023, "learning_rate": 8.579366195736927e-06, "loss": 0.2338, "num_input_tokens_seen": 75553840, "step": 35030 }, { "epoch": 6.429620113782345, "grad_norm": 7.961238384246826, "learning_rate": 8.57880703728888e-06, "loss": 0.3447, "num_input_tokens_seen": 75562800, "step": 35035 }, { "epoch": 6.430537713341898, "grad_norm": 11.356358528137207, "learning_rate": 8.57824778704998e-06, "loss": 0.2576, "num_input_tokens_seen": 75573712, "step": 35040 }, { "epoch": 6.43145531290145, "grad_norm": 10.329620361328125, "learning_rate": 8.57768844503457e-06, "loss": 0.1515, "num_input_tokens_seen": 75584240, "step": 35045 }, { "epoch": 6.432372912461002, "grad_norm": 36.22211456298828, "learning_rate": 8.577129011256996e-06, "loss": 0.2692, "num_input_tokens_seen": 75595504, "step": 35050 }, { "epoch": 6.433290512020554, "grad_norm": 10.75595760345459, "learning_rate": 8.576569485731605e-06, "loss": 0.1949, "num_input_tokens_seen": 75606736, "step": 35055 }, { "epoch": 6.434208111580107, "grad_norm": 13.805514335632324, "learning_rate": 8.57600986847275e-06, "loss": 0.2059, "num_input_tokens_seen": 75615440, "step": 35060 }, { "epoch": 6.4351257111396585, "grad_norm": 9.320586204528809, "learning_rate": 8.575450159494787e-06, "loss": 0.1915, "num_input_tokens_seen": 75625200, "step": 35065 }, { "epoch": 6.436043310699211, "grad_norm": 8.806697845458984, "learning_rate": 8.574890358812066e-06, "loss": 0.083, "num_input_tokens_seen": 75635856, "step": 35070 }, { "epoch": 6.4369609102587635, "grad_norm": 1.7917317152023315, "learning_rate": 8.57433046643895e-06, "loss": 0.1774, "num_input_tokens_seen": 75646352, "step": 35075 }, { "epoch": 6.437878509818315, "grad_norm": 17.712247848510742, "learning_rate": 8.573770482389799e-06, "loss": 0.2151, "num_input_tokens_seen": 75656848, "step": 35080 }, { "epoch": 6.438796109377868, "grad_norm": 12.481095314025879, "learning_rate": 8.573210406678972e-06, "loss": 0.1591, "num_input_tokens_seen": 75668144, "step": 35085 }, { "epoch": 6.43971370893742, "grad_norm": 12.98702621459961, "learning_rate": 8.572650239320835e-06, "loss": 0.1958, "num_input_tokens_seen": 75678768, "step": 35090 }, { "epoch": 6.440631308496972, "grad_norm": 47.30796813964844, "learning_rate": 8.572089980329757e-06, "loss": 0.3537, "num_input_tokens_seen": 75690192, "step": 35095 }, { "epoch": 6.441548908056524, "grad_norm": 25.31382179260254, "learning_rate": 8.571529629720107e-06, "loss": 0.3165, "num_input_tokens_seen": 75699600, "step": 35100 }, { "epoch": 6.442466507616077, "grad_norm": 11.038637161254883, "learning_rate": 8.570969187506257e-06, "loss": 0.2976, "num_input_tokens_seen": 75708816, "step": 35105 }, { "epoch": 6.443384107175628, "grad_norm": 13.07610034942627, "learning_rate": 8.570408653702582e-06, "loss": 0.2535, "num_input_tokens_seen": 75719408, "step": 35110 }, { "epoch": 6.444301706735181, "grad_norm": 13.176599502563477, "learning_rate": 8.56984802832346e-06, "loss": 0.4678, "num_input_tokens_seen": 75730672, "step": 35115 }, { "epoch": 6.445219306294733, "grad_norm": 3.489161491394043, "learning_rate": 8.569287311383268e-06, "loss": 0.2856, "num_input_tokens_seen": 75741840, "step": 35120 }, { "epoch": 6.446136905854285, "grad_norm": 4.2403154373168945, "learning_rate": 8.568726502896389e-06, "loss": 0.3499, "num_input_tokens_seen": 75752496, "step": 35125 }, { "epoch": 6.4470545054138375, "grad_norm": 16.38365936279297, "learning_rate": 8.568165602877206e-06, "loss": 0.2534, "num_input_tokens_seen": 75762928, "step": 35130 }, { "epoch": 6.44797210497339, "grad_norm": 2.503265619277954, "learning_rate": 8.567604611340104e-06, "loss": 0.2902, "num_input_tokens_seen": 75773552, "step": 35135 }, { "epoch": 6.448889704532942, "grad_norm": 18.992156982421875, "learning_rate": 8.567043528299474e-06, "loss": 0.3028, "num_input_tokens_seen": 75784496, "step": 35140 }, { "epoch": 6.449807304092494, "grad_norm": 3.0145232677459717, "learning_rate": 8.566482353769708e-06, "loss": 0.2902, "num_input_tokens_seen": 75795696, "step": 35145 }, { "epoch": 6.450724903652047, "grad_norm": 4.560806751251221, "learning_rate": 8.565921087765195e-06, "loss": 0.2088, "num_input_tokens_seen": 75806800, "step": 35150 }, { "epoch": 6.451642503211598, "grad_norm": 15.183513641357422, "learning_rate": 8.565359730300332e-06, "loss": 0.2372, "num_input_tokens_seen": 75818384, "step": 35155 }, { "epoch": 6.452560102771151, "grad_norm": 3.75162410736084, "learning_rate": 8.56479828138952e-06, "loss": 0.1086, "num_input_tokens_seen": 75829680, "step": 35160 }, { "epoch": 6.453477702330703, "grad_norm": 13.36301326751709, "learning_rate": 8.564236741047154e-06, "loss": 0.2814, "num_input_tokens_seen": 75839120, "step": 35165 }, { "epoch": 6.454395301890255, "grad_norm": 12.496785163879395, "learning_rate": 8.56367510928764e-06, "loss": 0.3764, "num_input_tokens_seen": 75848848, "step": 35170 }, { "epoch": 6.455312901449807, "grad_norm": 1.3066540956497192, "learning_rate": 8.563113386125385e-06, "loss": 0.1385, "num_input_tokens_seen": 75860176, "step": 35175 }, { "epoch": 6.45623050100936, "grad_norm": 0.9340277314186096, "learning_rate": 8.562551571574793e-06, "loss": 0.3609, "num_input_tokens_seen": 75870832, "step": 35180 }, { "epoch": 6.4571481005689115, "grad_norm": 14.21999740600586, "learning_rate": 8.561989665650276e-06, "loss": 0.2589, "num_input_tokens_seen": 75881424, "step": 35185 }, { "epoch": 6.458065700128464, "grad_norm": 5.699329376220703, "learning_rate": 8.561427668366243e-06, "loss": 0.1666, "num_input_tokens_seen": 75892464, "step": 35190 }, { "epoch": 6.4589832996880165, "grad_norm": 0.3506600260734558, "learning_rate": 8.56086557973711e-06, "loss": 0.4008, "num_input_tokens_seen": 75901904, "step": 35195 }, { "epoch": 6.459900899247568, "grad_norm": 7.838793754577637, "learning_rate": 8.560303399777294e-06, "loss": 0.1958, "num_input_tokens_seen": 75912496, "step": 35200 }, { "epoch": 6.460818498807121, "grad_norm": 1.5252323150634766, "learning_rate": 8.559741128501214e-06, "loss": 0.4652, "num_input_tokens_seen": 75923824, "step": 35205 }, { "epoch": 6.461736098366673, "grad_norm": 3.026189088821411, "learning_rate": 8.55917876592329e-06, "loss": 0.135, "num_input_tokens_seen": 75933872, "step": 35210 }, { "epoch": 6.462653697926225, "grad_norm": 22.851787567138672, "learning_rate": 8.558616312057948e-06, "loss": 0.2246, "num_input_tokens_seen": 75944976, "step": 35215 }, { "epoch": 6.463571297485777, "grad_norm": 0.5357487201690674, "learning_rate": 8.558053766919614e-06, "loss": 0.2116, "num_input_tokens_seen": 75956080, "step": 35220 }, { "epoch": 6.46448889704533, "grad_norm": 0.9959874749183655, "learning_rate": 8.557491130522713e-06, "loss": 0.1551, "num_input_tokens_seen": 75966320, "step": 35225 }, { "epoch": 6.465406496604881, "grad_norm": 1.2794641256332397, "learning_rate": 8.55692840288168e-06, "loss": 0.259, "num_input_tokens_seen": 75977168, "step": 35230 }, { "epoch": 6.466324096164434, "grad_norm": 30.673551559448242, "learning_rate": 8.556365584010946e-06, "loss": 0.2431, "num_input_tokens_seen": 75988880, "step": 35235 }, { "epoch": 6.467241695723986, "grad_norm": 7.159262180328369, "learning_rate": 8.555802673924945e-06, "loss": 0.1973, "num_input_tokens_seen": 75999088, "step": 35240 }, { "epoch": 6.468159295283538, "grad_norm": 10.162837028503418, "learning_rate": 8.555239672638119e-06, "loss": 0.2649, "num_input_tokens_seen": 76010960, "step": 35245 }, { "epoch": 6.469076894843091, "grad_norm": 1.1933842897415161, "learning_rate": 8.554676580164903e-06, "loss": 0.3712, "num_input_tokens_seen": 76022896, "step": 35250 }, { "epoch": 6.469994494402643, "grad_norm": 17.94766616821289, "learning_rate": 8.554113396519744e-06, "loss": 0.3357, "num_input_tokens_seen": 76033648, "step": 35255 }, { "epoch": 6.470912093962195, "grad_norm": 11.269551277160645, "learning_rate": 8.553550121717083e-06, "loss": 0.3758, "num_input_tokens_seen": 76044432, "step": 35260 }, { "epoch": 6.471829693521747, "grad_norm": 8.715999603271484, "learning_rate": 8.55298675577137e-06, "loss": 0.4138, "num_input_tokens_seen": 76054416, "step": 35265 }, { "epoch": 6.4727472930813, "grad_norm": 0.5106893181800842, "learning_rate": 8.552423298697052e-06, "loss": 0.3471, "num_input_tokens_seen": 76065456, "step": 35270 }, { "epoch": 6.473664892640851, "grad_norm": 30.15365982055664, "learning_rate": 8.551859750508584e-06, "loss": 0.3078, "num_input_tokens_seen": 76075856, "step": 35275 }, { "epoch": 6.474582492200404, "grad_norm": 16.238327026367188, "learning_rate": 8.551296111220418e-06, "loss": 0.2514, "num_input_tokens_seen": 76086640, "step": 35280 }, { "epoch": 6.475500091759956, "grad_norm": 18.116085052490234, "learning_rate": 8.55073238084701e-06, "loss": 0.1655, "num_input_tokens_seen": 76098288, "step": 35285 }, { "epoch": 6.476417691319508, "grad_norm": 14.263367652893066, "learning_rate": 8.550168559402819e-06, "loss": 0.2725, "num_input_tokens_seen": 76109264, "step": 35290 }, { "epoch": 6.4773352908790605, "grad_norm": 10.826844215393066, "learning_rate": 8.549604646902307e-06, "loss": 0.2411, "num_input_tokens_seen": 76119984, "step": 35295 }, { "epoch": 6.478252890438613, "grad_norm": 43.93132781982422, "learning_rate": 8.549040643359938e-06, "loss": 0.2933, "num_input_tokens_seen": 76130352, "step": 35300 }, { "epoch": 6.479170489998165, "grad_norm": 8.601897239685059, "learning_rate": 8.548476548790177e-06, "loss": 0.1715, "num_input_tokens_seen": 76141168, "step": 35305 }, { "epoch": 6.480088089557717, "grad_norm": 12.883208274841309, "learning_rate": 8.547912363207492e-06, "loss": 0.134, "num_input_tokens_seen": 76152208, "step": 35310 }, { "epoch": 6.48100568911727, "grad_norm": 3.821533203125, "learning_rate": 8.547348086626354e-06, "loss": 0.1729, "num_input_tokens_seen": 76162544, "step": 35315 }, { "epoch": 6.481923288676821, "grad_norm": 3.9801595211029053, "learning_rate": 8.546783719061234e-06, "loss": 0.2678, "num_input_tokens_seen": 76172784, "step": 35320 }, { "epoch": 6.482840888236374, "grad_norm": 3.600942611694336, "learning_rate": 8.54621926052661e-06, "loss": 0.2932, "num_input_tokens_seen": 76183664, "step": 35325 }, { "epoch": 6.483758487795926, "grad_norm": 9.702526092529297, "learning_rate": 8.545654711036957e-06, "loss": 0.189, "num_input_tokens_seen": 76194064, "step": 35330 }, { "epoch": 6.484676087355478, "grad_norm": 0.9935020208358765, "learning_rate": 8.545090070606757e-06, "loss": 0.2199, "num_input_tokens_seen": 76203920, "step": 35335 }, { "epoch": 6.48559368691503, "grad_norm": 6.2115478515625, "learning_rate": 8.544525339250491e-06, "loss": 0.2584, "num_input_tokens_seen": 76214672, "step": 35340 }, { "epoch": 6.486511286474583, "grad_norm": 3.374283790588379, "learning_rate": 8.543960516982643e-06, "loss": 0.2952, "num_input_tokens_seen": 76227024, "step": 35345 }, { "epoch": 6.4874288860341345, "grad_norm": 25.793556213378906, "learning_rate": 8.5433956038177e-06, "loss": 0.4065, "num_input_tokens_seen": 76238512, "step": 35350 }, { "epoch": 6.488346485593687, "grad_norm": 29.161806106567383, "learning_rate": 8.542830599770153e-06, "loss": 0.2795, "num_input_tokens_seen": 76249040, "step": 35355 }, { "epoch": 6.4892640851532395, "grad_norm": 8.425565719604492, "learning_rate": 8.542265504854492e-06, "loss": 0.1548, "num_input_tokens_seen": 76259344, "step": 35360 }, { "epoch": 6.490181684712791, "grad_norm": 15.51317310333252, "learning_rate": 8.541700319085209e-06, "loss": 0.1031, "num_input_tokens_seen": 76270224, "step": 35365 }, { "epoch": 6.491099284272344, "grad_norm": 4.5521440505981445, "learning_rate": 8.541135042476804e-06, "loss": 0.1845, "num_input_tokens_seen": 76281392, "step": 35370 }, { "epoch": 6.492016883831896, "grad_norm": 13.4790678024292, "learning_rate": 8.54056967504377e-06, "loss": 0.2479, "num_input_tokens_seen": 76291664, "step": 35375 }, { "epoch": 6.492934483391448, "grad_norm": 46.10129165649414, "learning_rate": 8.540004216800614e-06, "loss": 0.2454, "num_input_tokens_seen": 76303024, "step": 35380 }, { "epoch": 6.493852082951, "grad_norm": 6.046929359436035, "learning_rate": 8.539438667761836e-06, "loss": 0.2678, "num_input_tokens_seen": 76313584, "step": 35385 }, { "epoch": 6.494769682510553, "grad_norm": 7.389369487762451, "learning_rate": 8.538873027941943e-06, "loss": 0.3703, "num_input_tokens_seen": 76324720, "step": 35390 }, { "epoch": 6.495687282070104, "grad_norm": 50.858123779296875, "learning_rate": 8.53830729735544e-06, "loss": 0.4491, "num_input_tokens_seen": 76334928, "step": 35395 }, { "epoch": 6.496604881629657, "grad_norm": 27.555301666259766, "learning_rate": 8.537741476016838e-06, "loss": 0.1221, "num_input_tokens_seen": 76345072, "step": 35400 }, { "epoch": 6.497522481189209, "grad_norm": 12.48149585723877, "learning_rate": 8.537175563940652e-06, "loss": 0.2242, "num_input_tokens_seen": 76356784, "step": 35405 }, { "epoch": 6.498440080748761, "grad_norm": 11.02448844909668, "learning_rate": 8.536609561141394e-06, "loss": 0.2096, "num_input_tokens_seen": 76367920, "step": 35410 }, { "epoch": 6.4993576803083135, "grad_norm": 5.306981086730957, "learning_rate": 8.536043467633582e-06, "loss": 0.3866, "num_input_tokens_seen": 76378288, "step": 35415 }, { "epoch": 6.500275279867866, "grad_norm": 30.61592674255371, "learning_rate": 8.535477283431736e-06, "loss": 0.2099, "num_input_tokens_seen": 76388400, "step": 35420 }, { "epoch": 6.501192879427418, "grad_norm": 13.252050399780273, "learning_rate": 8.534911008550378e-06, "loss": 0.1212, "num_input_tokens_seen": 76398992, "step": 35425 }, { "epoch": 6.50211047898697, "grad_norm": 15.080839157104492, "learning_rate": 8.53434464300403e-06, "loss": 0.4739, "num_input_tokens_seen": 76409712, "step": 35430 }, { "epoch": 6.503028078546523, "grad_norm": 3.0646557807922363, "learning_rate": 8.533778186807217e-06, "loss": 0.1373, "num_input_tokens_seen": 76419792, "step": 35435 }, { "epoch": 6.503945678106074, "grad_norm": 23.346599578857422, "learning_rate": 8.533211639974474e-06, "loss": 0.0825, "num_input_tokens_seen": 76430864, "step": 35440 }, { "epoch": 6.504863277665627, "grad_norm": 7.996065139770508, "learning_rate": 8.532645002520328e-06, "loss": 0.2903, "num_input_tokens_seen": 76442352, "step": 35445 }, { "epoch": 6.505780877225179, "grad_norm": 1.4578627347946167, "learning_rate": 8.532078274459313e-06, "loss": 0.3096, "num_input_tokens_seen": 76452432, "step": 35450 }, { "epoch": 6.506698476784731, "grad_norm": 29.749780654907227, "learning_rate": 8.531511455805964e-06, "loss": 0.1321, "num_input_tokens_seen": 76463888, "step": 35455 }, { "epoch": 6.507616076344283, "grad_norm": 11.252808570861816, "learning_rate": 8.530944546574818e-06, "loss": 0.2805, "num_input_tokens_seen": 76475024, "step": 35460 }, { "epoch": 6.508533675903836, "grad_norm": 8.608383178710938, "learning_rate": 8.530377546780417e-06, "loss": 0.1917, "num_input_tokens_seen": 76485968, "step": 35465 }, { "epoch": 6.5094512754633875, "grad_norm": 25.79638671875, "learning_rate": 8.529810456437303e-06, "loss": 0.2816, "num_input_tokens_seen": 76497104, "step": 35470 }, { "epoch": 6.51036887502294, "grad_norm": 13.509115219116211, "learning_rate": 8.529243275560025e-06, "loss": 0.3844, "num_input_tokens_seen": 76508048, "step": 35475 }, { "epoch": 6.5112864745824925, "grad_norm": 14.092370986938477, "learning_rate": 8.528676004163124e-06, "loss": 0.1478, "num_input_tokens_seen": 76519088, "step": 35480 }, { "epoch": 6.512204074142044, "grad_norm": 49.37205123901367, "learning_rate": 8.528108642261154e-06, "loss": 0.1729, "num_input_tokens_seen": 76531472, "step": 35485 }, { "epoch": 6.513121673701597, "grad_norm": 20.96441078186035, "learning_rate": 8.527541189868664e-06, "loss": 0.3876, "num_input_tokens_seen": 76542128, "step": 35490 }, { "epoch": 6.514039273261149, "grad_norm": 3.6302151679992676, "learning_rate": 8.526973647000212e-06, "loss": 0.3697, "num_input_tokens_seen": 76552176, "step": 35495 }, { "epoch": 6.514956872820701, "grad_norm": 46.8869514465332, "learning_rate": 8.52640601367035e-06, "loss": 0.3475, "num_input_tokens_seen": 76563216, "step": 35500 }, { "epoch": 6.515874472380253, "grad_norm": 8.208907127380371, "learning_rate": 8.52583828989364e-06, "loss": 0.2905, "num_input_tokens_seen": 76574320, "step": 35505 }, { "epoch": 6.516792071939806, "grad_norm": 13.76321792602539, "learning_rate": 8.525270475684642e-06, "loss": 0.3509, "num_input_tokens_seen": 76586448, "step": 35510 }, { "epoch": 6.517709671499357, "grad_norm": 0.9144593477249146, "learning_rate": 8.52470257105792e-06, "loss": 0.1992, "num_input_tokens_seen": 76596688, "step": 35515 }, { "epoch": 6.51862727105891, "grad_norm": 0.9348101615905762, "learning_rate": 8.52413457602804e-06, "loss": 0.1214, "num_input_tokens_seen": 76606960, "step": 35520 }, { "epoch": 6.519544870618462, "grad_norm": 7.172868728637695, "learning_rate": 8.52356649060957e-06, "loss": 0.3901, "num_input_tokens_seen": 76617808, "step": 35525 }, { "epoch": 6.520462470178014, "grad_norm": 10.00924015045166, "learning_rate": 8.52299831481708e-06, "loss": 0.2836, "num_input_tokens_seen": 76628464, "step": 35530 }, { "epoch": 6.521380069737567, "grad_norm": 37.7280387878418, "learning_rate": 8.522430048665145e-06, "loss": 0.2646, "num_input_tokens_seen": 76638832, "step": 35535 }, { "epoch": 6.522297669297119, "grad_norm": 5.417015075683594, "learning_rate": 8.521861692168337e-06, "loss": 0.2929, "num_input_tokens_seen": 76650704, "step": 35540 }, { "epoch": 6.523215268856671, "grad_norm": 69.74219512939453, "learning_rate": 8.521293245341235e-06, "loss": 0.2679, "num_input_tokens_seen": 76660784, "step": 35545 }, { "epoch": 6.524132868416223, "grad_norm": 11.479623794555664, "learning_rate": 8.52072470819842e-06, "loss": 0.2271, "num_input_tokens_seen": 76670544, "step": 35550 }, { "epoch": 6.525050467975776, "grad_norm": 19.373437881469727, "learning_rate": 8.520156080754471e-06, "loss": 0.535, "num_input_tokens_seen": 76681232, "step": 35555 }, { "epoch": 6.525968067535327, "grad_norm": 1.5334014892578125, "learning_rate": 8.519587363023978e-06, "loss": 0.2222, "num_input_tokens_seen": 76691824, "step": 35560 }, { "epoch": 6.52688566709488, "grad_norm": 6.431143760681152, "learning_rate": 8.51901855502152e-06, "loss": 0.2598, "num_input_tokens_seen": 76703664, "step": 35565 }, { "epoch": 6.527803266654432, "grad_norm": 44.41065979003906, "learning_rate": 8.518449656761692e-06, "loss": 0.2828, "num_input_tokens_seen": 76715088, "step": 35570 }, { "epoch": 6.528720866213984, "grad_norm": 22.317556381225586, "learning_rate": 8.517880668259082e-06, "loss": 0.4269, "num_input_tokens_seen": 76726448, "step": 35575 }, { "epoch": 6.5296384657735365, "grad_norm": 0.4495997726917267, "learning_rate": 8.517311589528286e-06, "loss": 0.0709, "num_input_tokens_seen": 76737040, "step": 35580 }, { "epoch": 6.530556065333089, "grad_norm": 0.54774010181427, "learning_rate": 8.516742420583899e-06, "loss": 0.1418, "num_input_tokens_seen": 76748176, "step": 35585 }, { "epoch": 6.531473664892641, "grad_norm": 19.806612014770508, "learning_rate": 8.51617316144052e-06, "loss": 0.4137, "num_input_tokens_seen": 76759696, "step": 35590 }, { "epoch": 6.532391264452193, "grad_norm": 17.72535514831543, "learning_rate": 8.515603812112749e-06, "loss": 0.4803, "num_input_tokens_seen": 76771376, "step": 35595 }, { "epoch": 6.533308864011746, "grad_norm": 10.445899963378906, "learning_rate": 8.515034372615188e-06, "loss": 0.4026, "num_input_tokens_seen": 76781680, "step": 35600 }, { "epoch": 6.534226463571297, "grad_norm": 18.95955467224121, "learning_rate": 8.514464842962442e-06, "loss": 0.2672, "num_input_tokens_seen": 76792624, "step": 35605 }, { "epoch": 6.53514406313085, "grad_norm": 23.5615177154541, "learning_rate": 8.513895223169122e-06, "loss": 0.3721, "num_input_tokens_seen": 76803984, "step": 35610 }, { "epoch": 6.536061662690402, "grad_norm": 2.478954315185547, "learning_rate": 8.513325513249835e-06, "loss": 0.1827, "num_input_tokens_seen": 76814512, "step": 35615 }, { "epoch": 6.536979262249954, "grad_norm": 12.237139701843262, "learning_rate": 8.512755713219193e-06, "loss": 0.284, "num_input_tokens_seen": 76825040, "step": 35620 }, { "epoch": 6.537896861809506, "grad_norm": 1.1289347410202026, "learning_rate": 8.512185823091812e-06, "loss": 0.1046, "num_input_tokens_seen": 76835248, "step": 35625 }, { "epoch": 6.538814461369059, "grad_norm": 17.342571258544922, "learning_rate": 8.511615842882307e-06, "loss": 0.448, "num_input_tokens_seen": 76845392, "step": 35630 }, { "epoch": 6.5397320609286105, "grad_norm": 3.6300296783447266, "learning_rate": 8.511045772605299e-06, "loss": 0.2491, "num_input_tokens_seen": 76857168, "step": 35635 }, { "epoch": 6.540649660488163, "grad_norm": 24.792179107666016, "learning_rate": 8.510475612275409e-06, "loss": 0.2752, "num_input_tokens_seen": 76866512, "step": 35640 }, { "epoch": 6.5415672600477155, "grad_norm": 1.3816639184951782, "learning_rate": 8.50990536190726e-06, "loss": 0.3085, "num_input_tokens_seen": 76876816, "step": 35645 }, { "epoch": 6.542484859607267, "grad_norm": 38.23713302612305, "learning_rate": 8.509335021515476e-06, "loss": 0.1021, "num_input_tokens_seen": 76887504, "step": 35650 }, { "epoch": 6.54340245916682, "grad_norm": 1.1180247068405151, "learning_rate": 8.50876459111469e-06, "loss": 0.2621, "num_input_tokens_seen": 76898608, "step": 35655 }, { "epoch": 6.544320058726372, "grad_norm": 7.241423606872559, "learning_rate": 8.50819407071953e-06, "loss": 0.2359, "num_input_tokens_seen": 76909680, "step": 35660 }, { "epoch": 6.545237658285924, "grad_norm": 1.3410019874572754, "learning_rate": 8.50762346034463e-06, "loss": 0.2556, "num_input_tokens_seen": 76920848, "step": 35665 }, { "epoch": 6.546155257845476, "grad_norm": 23.435401916503906, "learning_rate": 8.507052760004626e-06, "loss": 0.3233, "num_input_tokens_seen": 76932560, "step": 35670 }, { "epoch": 6.547072857405029, "grad_norm": 3.0388453006744385, "learning_rate": 8.50648196971415e-06, "loss": 0.4452, "num_input_tokens_seen": 76943952, "step": 35675 }, { "epoch": 6.54799045696458, "grad_norm": 0.34337282180786133, "learning_rate": 8.505911089487848e-06, "loss": 0.1557, "num_input_tokens_seen": 76954832, "step": 35680 }, { "epoch": 6.548908056524133, "grad_norm": 20.25291633605957, "learning_rate": 8.505340119340362e-06, "loss": 0.1174, "num_input_tokens_seen": 76964944, "step": 35685 }, { "epoch": 6.549825656083685, "grad_norm": 5.638761520385742, "learning_rate": 8.504769059286332e-06, "loss": 0.1573, "num_input_tokens_seen": 76975952, "step": 35690 }, { "epoch": 6.550743255643237, "grad_norm": 3.298910140991211, "learning_rate": 8.504197909340409e-06, "loss": 0.3697, "num_input_tokens_seen": 76984528, "step": 35695 }, { "epoch": 6.5516608552027895, "grad_norm": 7.905529022216797, "learning_rate": 8.50362666951724e-06, "loss": 0.2949, "num_input_tokens_seen": 76995568, "step": 35700 }, { "epoch": 6.552578454762342, "grad_norm": 13.00118637084961, "learning_rate": 8.503055339831477e-06, "loss": 0.4186, "num_input_tokens_seen": 77007664, "step": 35705 }, { "epoch": 6.553496054321894, "grad_norm": 13.333768844604492, "learning_rate": 8.502483920297774e-06, "loss": 0.3223, "num_input_tokens_seen": 77018512, "step": 35710 }, { "epoch": 6.554413653881446, "grad_norm": 4.782689571380615, "learning_rate": 8.501912410930786e-06, "loss": 0.2279, "num_input_tokens_seen": 77029488, "step": 35715 }, { "epoch": 6.555331253440999, "grad_norm": 1.3468230962753296, "learning_rate": 8.501340811745174e-06, "loss": 0.0482, "num_input_tokens_seen": 77039568, "step": 35720 }, { "epoch": 6.55624885300055, "grad_norm": 1.9134454727172852, "learning_rate": 8.500769122755596e-06, "loss": 0.319, "num_input_tokens_seen": 77049936, "step": 35725 }, { "epoch": 6.557166452560103, "grad_norm": 15.371513366699219, "learning_rate": 8.500197343976714e-06, "loss": 0.3686, "num_input_tokens_seen": 77060784, "step": 35730 }, { "epoch": 6.558084052119655, "grad_norm": 9.972671508789062, "learning_rate": 8.499625475423197e-06, "loss": 0.2946, "num_input_tokens_seen": 77071376, "step": 35735 }, { "epoch": 6.559001651679207, "grad_norm": 0.9662387371063232, "learning_rate": 8.49905351710971e-06, "loss": 0.2932, "num_input_tokens_seen": 77080880, "step": 35740 }, { "epoch": 6.559919251238759, "grad_norm": 0.41556745767593384, "learning_rate": 8.498481469050923e-06, "loss": 0.3362, "num_input_tokens_seen": 77091920, "step": 35745 }, { "epoch": 6.560836850798312, "grad_norm": 21.765960693359375, "learning_rate": 8.49790933126151e-06, "loss": 0.4794, "num_input_tokens_seen": 77102800, "step": 35750 }, { "epoch": 6.5617544503578635, "grad_norm": 5.980466365814209, "learning_rate": 8.497337103756142e-06, "loss": 0.4218, "num_input_tokens_seen": 77113360, "step": 35755 }, { "epoch": 6.562672049917416, "grad_norm": 4.450167655944824, "learning_rate": 8.496764786549499e-06, "loss": 0.3175, "num_input_tokens_seen": 77123376, "step": 35760 }, { "epoch": 6.5635896494769685, "grad_norm": 12.596720695495605, "learning_rate": 8.496192379656257e-06, "loss": 0.219, "num_input_tokens_seen": 77133936, "step": 35765 }, { "epoch": 6.56450724903652, "grad_norm": 4.683165550231934, "learning_rate": 8.4956198830911e-06, "loss": 0.4099, "num_input_tokens_seen": 77144848, "step": 35770 }, { "epoch": 6.565424848596073, "grad_norm": 11.76482105255127, "learning_rate": 8.49504729686871e-06, "loss": 0.13, "num_input_tokens_seen": 77155600, "step": 35775 }, { "epoch": 6.566342448155625, "grad_norm": 4.244116306304932, "learning_rate": 8.494474621003776e-06, "loss": 0.2651, "num_input_tokens_seen": 77166512, "step": 35780 }, { "epoch": 6.567260047715177, "grad_norm": 15.027005195617676, "learning_rate": 8.493901855510983e-06, "loss": 0.3089, "num_input_tokens_seen": 77177488, "step": 35785 }, { "epoch": 6.568177647274729, "grad_norm": 15.199058532714844, "learning_rate": 8.493329000405019e-06, "loss": 0.2283, "num_input_tokens_seen": 77188368, "step": 35790 }, { "epoch": 6.569095246834282, "grad_norm": 11.511472702026367, "learning_rate": 8.492756055700584e-06, "loss": 0.1615, "num_input_tokens_seen": 77199888, "step": 35795 }, { "epoch": 6.570012846393833, "grad_norm": 12.717501640319824, "learning_rate": 8.492183021412368e-06, "loss": 0.2189, "num_input_tokens_seen": 77211376, "step": 35800 }, { "epoch": 6.570930445953386, "grad_norm": 30.649526596069336, "learning_rate": 8.49160989755507e-06, "loss": 0.3117, "num_input_tokens_seen": 77220432, "step": 35805 }, { "epoch": 6.571848045512938, "grad_norm": 31.4195613861084, "learning_rate": 8.491036684143391e-06, "loss": 0.1964, "num_input_tokens_seen": 77230800, "step": 35810 }, { "epoch": 6.57276564507249, "grad_norm": 15.041671752929688, "learning_rate": 8.490463381192031e-06, "loss": 0.3641, "num_input_tokens_seen": 77241712, "step": 35815 }, { "epoch": 6.573683244632043, "grad_norm": 12.315461158752441, "learning_rate": 8.489889988715696e-06, "loss": 0.2882, "num_input_tokens_seen": 77253776, "step": 35820 }, { "epoch": 6.574600844191595, "grad_norm": 2.7552108764648438, "learning_rate": 8.48931650672909e-06, "loss": 0.2133, "num_input_tokens_seen": 77263280, "step": 35825 }, { "epoch": 6.575518443751147, "grad_norm": 4.591350555419922, "learning_rate": 8.488742935246923e-06, "loss": 0.3125, "num_input_tokens_seen": 77273520, "step": 35830 }, { "epoch": 6.576436043310699, "grad_norm": 19.25555992126465, "learning_rate": 8.488169274283908e-06, "loss": 0.3197, "num_input_tokens_seen": 77285328, "step": 35835 }, { "epoch": 6.577353642870252, "grad_norm": 14.825587272644043, "learning_rate": 8.487595523854758e-06, "loss": 0.4267, "num_input_tokens_seen": 77295312, "step": 35840 }, { "epoch": 6.578271242429803, "grad_norm": 23.703479766845703, "learning_rate": 8.487021683974186e-06, "loss": 0.3599, "num_input_tokens_seen": 77304016, "step": 35845 }, { "epoch": 6.579188841989356, "grad_norm": 11.908703804016113, "learning_rate": 8.486447754656912e-06, "loss": 0.3893, "num_input_tokens_seen": 77314928, "step": 35850 }, { "epoch": 6.580106441548908, "grad_norm": 5.618305683135986, "learning_rate": 8.48587373591766e-06, "loss": 0.2663, "num_input_tokens_seen": 77325808, "step": 35855 }, { "epoch": 6.58102404110846, "grad_norm": 8.61541748046875, "learning_rate": 8.485299627771146e-06, "loss": 0.2175, "num_input_tokens_seen": 77337296, "step": 35860 }, { "epoch": 6.5819416406680125, "grad_norm": 8.613652229309082, "learning_rate": 8.4847254302321e-06, "loss": 0.3387, "num_input_tokens_seen": 77347856, "step": 35865 }, { "epoch": 6.582859240227565, "grad_norm": 9.587224006652832, "learning_rate": 8.484151143315247e-06, "loss": 0.085, "num_input_tokens_seen": 77359248, "step": 35870 }, { "epoch": 6.583776839787117, "grad_norm": 10.693305969238281, "learning_rate": 8.483576767035318e-06, "loss": 0.3621, "num_input_tokens_seen": 77369200, "step": 35875 }, { "epoch": 6.584694439346669, "grad_norm": 3.611884832382202, "learning_rate": 8.483002301407042e-06, "loss": 0.2699, "num_input_tokens_seen": 77380624, "step": 35880 }, { "epoch": 6.585612038906222, "grad_norm": 3.322844982147217, "learning_rate": 8.482427746445156e-06, "loss": 0.3094, "num_input_tokens_seen": 77392720, "step": 35885 }, { "epoch": 6.586529638465773, "grad_norm": 5.760677814483643, "learning_rate": 8.481853102164397e-06, "loss": 0.2093, "num_input_tokens_seen": 77403152, "step": 35890 }, { "epoch": 6.587447238025326, "grad_norm": 7.145930767059326, "learning_rate": 8.4812783685795e-06, "loss": 0.296, "num_input_tokens_seen": 77412048, "step": 35895 }, { "epoch": 6.588364837584878, "grad_norm": 7.192882537841797, "learning_rate": 8.48070354570521e-06, "loss": 0.1355, "num_input_tokens_seen": 77421040, "step": 35900 }, { "epoch": 6.58928243714443, "grad_norm": 5.867382526397705, "learning_rate": 8.480128633556269e-06, "loss": 0.3007, "num_input_tokens_seen": 77431984, "step": 35905 }, { "epoch": 6.590200036703982, "grad_norm": 1.869038462638855, "learning_rate": 8.47955363214742e-06, "loss": 0.1686, "num_input_tokens_seen": 77442448, "step": 35910 }, { "epoch": 6.591117636263535, "grad_norm": 4.725600242614746, "learning_rate": 8.478978541493414e-06, "loss": 0.2609, "num_input_tokens_seen": 77452688, "step": 35915 }, { "epoch": 6.5920352358230865, "grad_norm": 0.9523090124130249, "learning_rate": 8.478403361609002e-06, "loss": 0.2183, "num_input_tokens_seen": 77463440, "step": 35920 }, { "epoch": 6.592952835382639, "grad_norm": 1.1380856037139893, "learning_rate": 8.477828092508932e-06, "loss": 0.1652, "num_input_tokens_seen": 77474512, "step": 35925 }, { "epoch": 6.5938704349421915, "grad_norm": 18.229660034179688, "learning_rate": 8.477252734207965e-06, "loss": 0.1937, "num_input_tokens_seen": 77485616, "step": 35930 }, { "epoch": 6.594788034501743, "grad_norm": 12.77236557006836, "learning_rate": 8.476677286720853e-06, "loss": 0.0778, "num_input_tokens_seen": 77496016, "step": 35935 }, { "epoch": 6.595705634061296, "grad_norm": 9.774027824401855, "learning_rate": 8.476101750062357e-06, "loss": 0.1915, "num_input_tokens_seen": 77506544, "step": 35940 }, { "epoch": 6.596623233620848, "grad_norm": 5.654873371124268, "learning_rate": 8.475526124247238e-06, "loss": 0.2459, "num_input_tokens_seen": 77517424, "step": 35945 }, { "epoch": 6.5975408331804, "grad_norm": 19.417110443115234, "learning_rate": 8.47495040929026e-06, "loss": 0.4052, "num_input_tokens_seen": 77528944, "step": 35950 }, { "epoch": 6.598458432739952, "grad_norm": 7.017580509185791, "learning_rate": 8.474374605206191e-06, "loss": 0.4827, "num_input_tokens_seen": 77540048, "step": 35955 }, { "epoch": 6.599376032299505, "grad_norm": 0.31835025548934937, "learning_rate": 8.473798712009798e-06, "loss": 0.0978, "num_input_tokens_seen": 77551280, "step": 35960 }, { "epoch": 6.600293631859056, "grad_norm": 21.98101043701172, "learning_rate": 8.473222729715852e-06, "loss": 0.4433, "num_input_tokens_seen": 77561872, "step": 35965 }, { "epoch": 6.601211231418609, "grad_norm": 1.3042023181915283, "learning_rate": 8.472646658339126e-06, "loss": 0.0498, "num_input_tokens_seen": 77574128, "step": 35970 }, { "epoch": 6.602128830978161, "grad_norm": 1.083333134651184, "learning_rate": 8.472070497894394e-06, "loss": 0.114, "num_input_tokens_seen": 77585264, "step": 35975 }, { "epoch": 6.603046430537713, "grad_norm": 2.2554454803466797, "learning_rate": 8.471494248396437e-06, "loss": 0.2241, "num_input_tokens_seen": 77596432, "step": 35980 }, { "epoch": 6.6039640300972655, "grad_norm": 13.735586166381836, "learning_rate": 8.47091790986003e-06, "loss": 0.1415, "num_input_tokens_seen": 77606352, "step": 35985 }, { "epoch": 6.604881629656818, "grad_norm": 18.58674430847168, "learning_rate": 8.47034148229996e-06, "loss": 0.4808, "num_input_tokens_seen": 77617808, "step": 35990 }, { "epoch": 6.60579922921637, "grad_norm": 4.928508281707764, "learning_rate": 8.469764965731011e-06, "loss": 0.2553, "num_input_tokens_seen": 77627984, "step": 35995 }, { "epoch": 6.606716828775922, "grad_norm": 11.323168754577637, "learning_rate": 8.469188360167966e-06, "loss": 0.2891, "num_input_tokens_seen": 77637648, "step": 36000 }, { "epoch": 6.607634428335475, "grad_norm": 5.088058948516846, "learning_rate": 8.468611665625616e-06, "loss": 0.5481, "num_input_tokens_seen": 77648976, "step": 36005 }, { "epoch": 6.608552027895026, "grad_norm": 9.450152397155762, "learning_rate": 8.468034882118753e-06, "loss": 0.3504, "num_input_tokens_seen": 77659504, "step": 36010 }, { "epoch": 6.609469627454579, "grad_norm": 5.051367282867432, "learning_rate": 8.467458009662173e-06, "loss": 0.3122, "num_input_tokens_seen": 77671280, "step": 36015 }, { "epoch": 6.610387227014131, "grad_norm": 7.482090950012207, "learning_rate": 8.466881048270666e-06, "loss": 0.2946, "num_input_tokens_seen": 77682512, "step": 36020 }, { "epoch": 6.611304826573683, "grad_norm": 8.140429496765137, "learning_rate": 8.466303997959035e-06, "loss": 0.1156, "num_input_tokens_seen": 77694000, "step": 36025 }, { "epoch": 6.612222426133235, "grad_norm": 12.830901145935059, "learning_rate": 8.465726858742079e-06, "loss": 0.4455, "num_input_tokens_seen": 77704240, "step": 36030 }, { "epoch": 6.613140025692788, "grad_norm": 2.296226978302002, "learning_rate": 8.465149630634598e-06, "loss": 0.2422, "num_input_tokens_seen": 77715056, "step": 36035 }, { "epoch": 6.6140576252523395, "grad_norm": 0.9572895765304565, "learning_rate": 8.464572313651401e-06, "loss": 0.4552, "num_input_tokens_seen": 77724944, "step": 36040 }, { "epoch": 6.614975224811892, "grad_norm": 10.301694869995117, "learning_rate": 8.463994907807294e-06, "loss": 0.143, "num_input_tokens_seen": 77735728, "step": 36045 }, { "epoch": 6.6158928243714445, "grad_norm": 52.61489486694336, "learning_rate": 8.463417413117087e-06, "loss": 0.3832, "num_input_tokens_seen": 77744528, "step": 36050 }, { "epoch": 6.616810423930996, "grad_norm": 14.677163124084473, "learning_rate": 8.46283982959559e-06, "loss": 0.2528, "num_input_tokens_seen": 77755248, "step": 36055 }, { "epoch": 6.617728023490549, "grad_norm": 7.401126384735107, "learning_rate": 8.462262157257618e-06, "loss": 0.2668, "num_input_tokens_seen": 77766960, "step": 36060 }, { "epoch": 6.618645623050101, "grad_norm": 11.490029335021973, "learning_rate": 8.461684396117989e-06, "loss": 0.1718, "num_input_tokens_seen": 77777008, "step": 36065 }, { "epoch": 6.619563222609653, "grad_norm": 9.723563194274902, "learning_rate": 8.461106546191518e-06, "loss": 0.0902, "num_input_tokens_seen": 77787536, "step": 36070 }, { "epoch": 6.620480822169205, "grad_norm": 0.37993714213371277, "learning_rate": 8.46052860749303e-06, "loss": 0.2389, "num_input_tokens_seen": 77797552, "step": 36075 }, { "epoch": 6.621398421728758, "grad_norm": 18.192075729370117, "learning_rate": 8.459950580037346e-06, "loss": 0.3064, "num_input_tokens_seen": 77807312, "step": 36080 }, { "epoch": 6.622316021288309, "grad_norm": 5.703252792358398, "learning_rate": 8.459372463839293e-06, "loss": 0.1742, "num_input_tokens_seen": 77816240, "step": 36085 }, { "epoch": 6.623233620847862, "grad_norm": 18.324186325073242, "learning_rate": 8.458794258913697e-06, "loss": 0.263, "num_input_tokens_seen": 77825456, "step": 36090 }, { "epoch": 6.624151220407414, "grad_norm": 6.150895595550537, "learning_rate": 8.45821596527539e-06, "loss": 0.4489, "num_input_tokens_seen": 77836752, "step": 36095 }, { "epoch": 6.625068819966966, "grad_norm": 22.943302154541016, "learning_rate": 8.457637582939202e-06, "loss": 0.2522, "num_input_tokens_seen": 77846160, "step": 36100 }, { "epoch": 6.625986419526519, "grad_norm": 13.326762199401855, "learning_rate": 8.45705911191997e-06, "loss": 0.2759, "num_input_tokens_seen": 77857296, "step": 36105 }, { "epoch": 6.626904019086071, "grad_norm": 10.778120040893555, "learning_rate": 8.456480552232528e-06, "loss": 0.5904, "num_input_tokens_seen": 77867312, "step": 36110 }, { "epoch": 6.627821618645623, "grad_norm": 9.572990417480469, "learning_rate": 8.455901903891719e-06, "loss": 0.4742, "num_input_tokens_seen": 77877968, "step": 36115 }, { "epoch": 6.628739218205175, "grad_norm": 4.1719160079956055, "learning_rate": 8.45532316691238e-06, "loss": 0.1954, "num_input_tokens_seen": 77889424, "step": 36120 }, { "epoch": 6.629656817764728, "grad_norm": 1.6631486415863037, "learning_rate": 8.454744341309359e-06, "loss": 0.2203, "num_input_tokens_seen": 77899568, "step": 36125 }, { "epoch": 6.630574417324279, "grad_norm": 13.158647537231445, "learning_rate": 8.454165427097499e-06, "loss": 0.3725, "num_input_tokens_seen": 77910384, "step": 36130 }, { "epoch": 6.631492016883832, "grad_norm": 11.400093078613281, "learning_rate": 8.45358642429165e-06, "loss": 0.2909, "num_input_tokens_seen": 77921552, "step": 36135 }, { "epoch": 6.632409616443384, "grad_norm": 6.176295757293701, "learning_rate": 8.453007332906662e-06, "loss": 0.2906, "num_input_tokens_seen": 77932432, "step": 36140 }, { "epoch": 6.633327216002936, "grad_norm": 4.672567844390869, "learning_rate": 8.452428152957386e-06, "loss": 0.087, "num_input_tokens_seen": 77943792, "step": 36145 }, { "epoch": 6.6342448155624885, "grad_norm": 1.6102343797683716, "learning_rate": 8.45184888445868e-06, "loss": 0.2003, "num_input_tokens_seen": 77954672, "step": 36150 }, { "epoch": 6.635162415122041, "grad_norm": 8.210760116577148, "learning_rate": 8.451269527425399e-06, "loss": 0.2979, "num_input_tokens_seen": 77963664, "step": 36155 }, { "epoch": 6.636080014681593, "grad_norm": 9.924396514892578, "learning_rate": 8.450690081872405e-06, "loss": 0.0818, "num_input_tokens_seen": 77974448, "step": 36160 }, { "epoch": 6.636997614241145, "grad_norm": 11.98731803894043, "learning_rate": 8.450110547814557e-06, "loss": 0.2629, "num_input_tokens_seen": 77984656, "step": 36165 }, { "epoch": 6.637915213800698, "grad_norm": 19.479686737060547, "learning_rate": 8.449530925266721e-06, "loss": 0.2394, "num_input_tokens_seen": 77995344, "step": 36170 }, { "epoch": 6.638832813360249, "grad_norm": 13.30691909790039, "learning_rate": 8.448951214243763e-06, "loss": 0.2281, "num_input_tokens_seen": 78005584, "step": 36175 }, { "epoch": 6.639750412919802, "grad_norm": 15.862508773803711, "learning_rate": 8.448371414760553e-06, "loss": 0.3138, "num_input_tokens_seen": 78016368, "step": 36180 }, { "epoch": 6.640668012479354, "grad_norm": 11.032122611999512, "learning_rate": 8.44779152683196e-06, "loss": 0.1577, "num_input_tokens_seen": 78027824, "step": 36185 }, { "epoch": 6.641585612038906, "grad_norm": 4.538224697113037, "learning_rate": 8.447211550472858e-06, "loss": 0.2203, "num_input_tokens_seen": 78040176, "step": 36190 }, { "epoch": 6.642503211598458, "grad_norm": 14.94363021850586, "learning_rate": 8.44663148569812e-06, "loss": 0.2497, "num_input_tokens_seen": 78051376, "step": 36195 }, { "epoch": 6.643420811158011, "grad_norm": 19.435123443603516, "learning_rate": 8.44605133252263e-06, "loss": 0.1299, "num_input_tokens_seen": 78063792, "step": 36200 }, { "epoch": 6.6443384107175625, "grad_norm": 32.88593673706055, "learning_rate": 8.445471090961262e-06, "loss": 0.4283, "num_input_tokens_seen": 78072656, "step": 36205 }, { "epoch": 6.645256010277115, "grad_norm": 16.67753791809082, "learning_rate": 8.444890761028902e-06, "loss": 0.5631, "num_input_tokens_seen": 78084112, "step": 36210 }, { "epoch": 6.6461736098366675, "grad_norm": 1.3476723432540894, "learning_rate": 8.444310342740432e-06, "loss": 0.4066, "num_input_tokens_seen": 78093680, "step": 36215 }, { "epoch": 6.647091209396219, "grad_norm": 4.557727813720703, "learning_rate": 8.44372983611074e-06, "loss": 0.3125, "num_input_tokens_seen": 78102928, "step": 36220 }, { "epoch": 6.648008808955772, "grad_norm": 5.49085807800293, "learning_rate": 8.443149241154716e-06, "loss": 0.2638, "num_input_tokens_seen": 78115184, "step": 36225 }, { "epoch": 6.648926408515324, "grad_norm": 11.797054290771484, "learning_rate": 8.442568557887248e-06, "loss": 0.4685, "num_input_tokens_seen": 78126896, "step": 36230 }, { "epoch": 6.649844008074876, "grad_norm": 4.647130012512207, "learning_rate": 8.441987786323234e-06, "loss": 0.3253, "num_input_tokens_seen": 78137776, "step": 36235 }, { "epoch": 6.650761607634428, "grad_norm": 2.809983015060425, "learning_rate": 8.441406926477567e-06, "loss": 0.2805, "num_input_tokens_seen": 78148528, "step": 36240 }, { "epoch": 6.651679207193981, "grad_norm": 8.313600540161133, "learning_rate": 8.440825978365145e-06, "loss": 0.1325, "num_input_tokens_seen": 78157520, "step": 36245 }, { "epoch": 6.652596806753532, "grad_norm": 4.444033145904541, "learning_rate": 8.440244942000873e-06, "loss": 0.2463, "num_input_tokens_seen": 78167952, "step": 36250 }, { "epoch": 6.653514406313085, "grad_norm": 10.201903343200684, "learning_rate": 8.439663817399647e-06, "loss": 0.1789, "num_input_tokens_seen": 78178832, "step": 36255 }, { "epoch": 6.654432005872637, "grad_norm": 12.02037525177002, "learning_rate": 8.439082604576376e-06, "loss": 0.2082, "num_input_tokens_seen": 78189936, "step": 36260 }, { "epoch": 6.655349605432189, "grad_norm": 9.636922836303711, "learning_rate": 8.438501303545966e-06, "loss": 0.2363, "num_input_tokens_seen": 78200208, "step": 36265 }, { "epoch": 6.6562672049917415, "grad_norm": 3.554213762283325, "learning_rate": 8.437919914323326e-06, "loss": 0.2902, "num_input_tokens_seen": 78211088, "step": 36270 }, { "epoch": 6.657184804551294, "grad_norm": 11.156784057617188, "learning_rate": 8.43733843692337e-06, "loss": 0.3386, "num_input_tokens_seen": 78221072, "step": 36275 }, { "epoch": 6.658102404110846, "grad_norm": 14.881430625915527, "learning_rate": 8.43675687136101e-06, "loss": 0.383, "num_input_tokens_seen": 78231920, "step": 36280 }, { "epoch": 6.659020003670398, "grad_norm": 4.291914939880371, "learning_rate": 8.436175217651164e-06, "loss": 0.2419, "num_input_tokens_seen": 78242480, "step": 36285 }, { "epoch": 6.659937603229951, "grad_norm": 13.861113548278809, "learning_rate": 8.435593475808747e-06, "loss": 0.309, "num_input_tokens_seen": 78253968, "step": 36290 }, { "epoch": 6.660855202789502, "grad_norm": 25.619247436523438, "learning_rate": 8.435011645848683e-06, "loss": 0.3908, "num_input_tokens_seen": 78266128, "step": 36295 }, { "epoch": 6.661772802349055, "grad_norm": 8.21577262878418, "learning_rate": 8.434429727785895e-06, "loss": 0.2662, "num_input_tokens_seen": 78277392, "step": 36300 }, { "epoch": 6.662690401908607, "grad_norm": 6.586075782775879, "learning_rate": 8.433847721635307e-06, "loss": 0.3611, "num_input_tokens_seen": 78288048, "step": 36305 }, { "epoch": 6.663608001468159, "grad_norm": 6.460374355316162, "learning_rate": 8.433265627411846e-06, "loss": 0.231, "num_input_tokens_seen": 78300080, "step": 36310 }, { "epoch": 6.664525601027711, "grad_norm": 7.467911243438721, "learning_rate": 8.432683445130444e-06, "loss": 0.3572, "num_input_tokens_seen": 78310352, "step": 36315 }, { "epoch": 6.665443200587264, "grad_norm": 8.639981269836426, "learning_rate": 8.432101174806031e-06, "loss": 0.1886, "num_input_tokens_seen": 78319984, "step": 36320 }, { "epoch": 6.6663608001468155, "grad_norm": 7.865184783935547, "learning_rate": 8.431518816453541e-06, "loss": 0.2251, "num_input_tokens_seen": 78329488, "step": 36325 }, { "epoch": 6.667278399706368, "grad_norm": 8.97371768951416, "learning_rate": 8.430936370087911e-06, "loss": 0.2409, "num_input_tokens_seen": 78339824, "step": 36330 }, { "epoch": 6.6681959992659205, "grad_norm": 1.780527114868164, "learning_rate": 8.430353835724085e-06, "loss": 0.1395, "num_input_tokens_seen": 78349776, "step": 36335 }, { "epoch": 6.669113598825472, "grad_norm": 6.03251314163208, "learning_rate": 8.429771213376996e-06, "loss": 0.119, "num_input_tokens_seen": 78360592, "step": 36340 }, { "epoch": 6.670031198385025, "grad_norm": 7.403782844543457, "learning_rate": 8.429188503061593e-06, "loss": 0.2178, "num_input_tokens_seen": 78371248, "step": 36345 }, { "epoch": 6.670948797944577, "grad_norm": 6.016415596008301, "learning_rate": 8.428605704792818e-06, "loss": 0.2604, "num_input_tokens_seen": 78382032, "step": 36350 }, { "epoch": 6.671866397504129, "grad_norm": 1.7177554368972778, "learning_rate": 8.428022818585622e-06, "loss": 0.2288, "num_input_tokens_seen": 78393456, "step": 36355 }, { "epoch": 6.672783997063681, "grad_norm": 2.6560018062591553, "learning_rate": 8.427439844454952e-06, "loss": 0.1902, "num_input_tokens_seen": 78403856, "step": 36360 }, { "epoch": 6.673701596623234, "grad_norm": 2.471586227416992, "learning_rate": 8.426856782415765e-06, "loss": 0.224, "num_input_tokens_seen": 78413296, "step": 36365 }, { "epoch": 6.674619196182785, "grad_norm": 14.035717010498047, "learning_rate": 8.426273632483009e-06, "loss": 0.1157, "num_input_tokens_seen": 78423152, "step": 36370 }, { "epoch": 6.675536795742338, "grad_norm": 13.954767227172852, "learning_rate": 8.425690394671646e-06, "loss": 0.1694, "num_input_tokens_seen": 78433776, "step": 36375 }, { "epoch": 6.67645439530189, "grad_norm": 50.00187301635742, "learning_rate": 8.425107068996635e-06, "loss": 0.2564, "num_input_tokens_seen": 78444016, "step": 36380 }, { "epoch": 6.677371994861442, "grad_norm": 4.990281581878662, "learning_rate": 8.424523655472934e-06, "loss": 0.1714, "num_input_tokens_seen": 78454352, "step": 36385 }, { "epoch": 6.678289594420995, "grad_norm": 19.29231071472168, "learning_rate": 8.42394015411551e-06, "loss": 0.229, "num_input_tokens_seen": 78465072, "step": 36390 }, { "epoch": 6.679207193980547, "grad_norm": 4.814748287200928, "learning_rate": 8.423356564939328e-06, "loss": 0.365, "num_input_tokens_seen": 78474960, "step": 36395 }, { "epoch": 6.680124793540099, "grad_norm": 13.078696250915527, "learning_rate": 8.422772887959355e-06, "loss": 0.1958, "num_input_tokens_seen": 78486160, "step": 36400 }, { "epoch": 6.681042393099651, "grad_norm": 10.175670623779297, "learning_rate": 8.422189123190563e-06, "loss": 0.3165, "num_input_tokens_seen": 78496976, "step": 36405 }, { "epoch": 6.681959992659204, "grad_norm": 17.814760208129883, "learning_rate": 8.421605270647924e-06, "loss": 0.3878, "num_input_tokens_seen": 78506000, "step": 36410 }, { "epoch": 6.682877592218755, "grad_norm": 17.466707229614258, "learning_rate": 8.421021330346412e-06, "loss": 0.4984, "num_input_tokens_seen": 78517328, "step": 36415 }, { "epoch": 6.683795191778308, "grad_norm": 21.62596893310547, "learning_rate": 8.420437302301005e-06, "loss": 0.3505, "num_input_tokens_seen": 78528624, "step": 36420 }, { "epoch": 6.68471279133786, "grad_norm": 2.44423508644104, "learning_rate": 8.419853186526682e-06, "loss": 0.2845, "num_input_tokens_seen": 78540944, "step": 36425 }, { "epoch": 6.685630390897412, "grad_norm": 8.046889305114746, "learning_rate": 8.419268983038427e-06, "loss": 0.321, "num_input_tokens_seen": 78552560, "step": 36430 }, { "epoch": 6.6865479904569645, "grad_norm": 3.0044636726379395, "learning_rate": 8.41868469185122e-06, "loss": 0.3436, "num_input_tokens_seen": 78562704, "step": 36435 }, { "epoch": 6.687465590016517, "grad_norm": 6.176224708557129, "learning_rate": 8.41810031298005e-06, "loss": 0.2145, "num_input_tokens_seen": 78572848, "step": 36440 }, { "epoch": 6.688383189576069, "grad_norm": 7.671430587768555, "learning_rate": 8.417515846439904e-06, "loss": 0.2187, "num_input_tokens_seen": 78582736, "step": 36445 }, { "epoch": 6.689300789135621, "grad_norm": 4.581944465637207, "learning_rate": 8.416931292245773e-06, "loss": 0.527, "num_input_tokens_seen": 78593104, "step": 36450 }, { "epoch": 6.690218388695174, "grad_norm": 22.077224731445312, "learning_rate": 8.416346650412651e-06, "loss": 0.4133, "num_input_tokens_seen": 78603472, "step": 36455 }, { "epoch": 6.691135988254725, "grad_norm": 0.6690900921821594, "learning_rate": 8.415761920955532e-06, "loss": 0.1992, "num_input_tokens_seen": 78614128, "step": 36460 }, { "epoch": 6.692053587814278, "grad_norm": 1.2166095972061157, "learning_rate": 8.415177103889413e-06, "loss": 0.2307, "num_input_tokens_seen": 78625552, "step": 36465 }, { "epoch": 6.69297118737383, "grad_norm": 1.8855538368225098, "learning_rate": 8.414592199229297e-06, "loss": 0.1469, "num_input_tokens_seen": 78636400, "step": 36470 }, { "epoch": 6.693888786933382, "grad_norm": 17.22719383239746, "learning_rate": 8.41400720699018e-06, "loss": 0.2526, "num_input_tokens_seen": 78647248, "step": 36475 }, { "epoch": 6.694806386492934, "grad_norm": 14.311716079711914, "learning_rate": 8.413422127187072e-06, "loss": 0.554, "num_input_tokens_seen": 78658288, "step": 36480 }, { "epoch": 6.695723986052487, "grad_norm": 2.88443922996521, "learning_rate": 8.412836959834975e-06, "loss": 0.3395, "num_input_tokens_seen": 78669744, "step": 36485 }, { "epoch": 6.6966415856120385, "grad_norm": 3.5319690704345703, "learning_rate": 8.4122517049489e-06, "loss": 0.1431, "num_input_tokens_seen": 78679344, "step": 36490 }, { "epoch": 6.697559185171591, "grad_norm": 4.715252876281738, "learning_rate": 8.411666362543857e-06, "loss": 0.2272, "num_input_tokens_seen": 78690000, "step": 36495 }, { "epoch": 6.6984767847311435, "grad_norm": 1.8678290843963623, "learning_rate": 8.41108093263486e-06, "loss": 0.3314, "num_input_tokens_seen": 78698800, "step": 36500 }, { "epoch": 6.699394384290695, "grad_norm": 0.9275680184364319, "learning_rate": 8.410495415236923e-06, "loss": 0.1556, "num_input_tokens_seen": 78709456, "step": 36505 }, { "epoch": 6.700311983850248, "grad_norm": 9.642060279846191, "learning_rate": 8.409909810365064e-06, "loss": 0.1102, "num_input_tokens_seen": 78721680, "step": 36510 }, { "epoch": 6.7012295834098, "grad_norm": 6.29718017578125, "learning_rate": 8.409324118034304e-06, "loss": 0.2363, "num_input_tokens_seen": 78733008, "step": 36515 }, { "epoch": 6.702147182969352, "grad_norm": 12.5189790725708, "learning_rate": 8.408738338259663e-06, "loss": 0.3014, "num_input_tokens_seen": 78744208, "step": 36520 }, { "epoch": 6.703064782528904, "grad_norm": 6.396328926086426, "learning_rate": 8.408152471056168e-06, "loss": 0.261, "num_input_tokens_seen": 78754256, "step": 36525 }, { "epoch": 6.703982382088457, "grad_norm": 7.828024387359619, "learning_rate": 8.407566516438845e-06, "loss": 0.4514, "num_input_tokens_seen": 78764848, "step": 36530 }, { "epoch": 6.704899981648008, "grad_norm": 7.07710599899292, "learning_rate": 8.406980474422721e-06, "loss": 0.1036, "num_input_tokens_seen": 78776080, "step": 36535 }, { "epoch": 6.705817581207561, "grad_norm": 16.424232482910156, "learning_rate": 8.406394345022828e-06, "loss": 0.2737, "num_input_tokens_seen": 78786288, "step": 36540 }, { "epoch": 6.706735180767113, "grad_norm": 9.319113731384277, "learning_rate": 8.4058081282542e-06, "loss": 0.2887, "num_input_tokens_seen": 78797104, "step": 36545 }, { "epoch": 6.707652780326665, "grad_norm": 2.6065258979797363, "learning_rate": 8.405221824131873e-06, "loss": 0.1543, "num_input_tokens_seen": 78809584, "step": 36550 }, { "epoch": 6.7085703798862175, "grad_norm": 13.183588027954102, "learning_rate": 8.404635432670882e-06, "loss": 0.2176, "num_input_tokens_seen": 78820784, "step": 36555 }, { "epoch": 6.70948797944577, "grad_norm": 11.413079261779785, "learning_rate": 8.40404895388627e-06, "loss": 0.3069, "num_input_tokens_seen": 78832464, "step": 36560 }, { "epoch": 6.710405579005322, "grad_norm": 6.549863815307617, "learning_rate": 8.403462387793077e-06, "loss": 0.3089, "num_input_tokens_seen": 78843952, "step": 36565 }, { "epoch": 6.711323178564874, "grad_norm": 8.051431655883789, "learning_rate": 8.402875734406351e-06, "loss": 0.353, "num_input_tokens_seen": 78856496, "step": 36570 }, { "epoch": 6.712240778124427, "grad_norm": 6.062917232513428, "learning_rate": 8.402288993741134e-06, "loss": 0.13, "num_input_tokens_seen": 78868144, "step": 36575 }, { "epoch": 6.713158377683978, "grad_norm": 8.92528247833252, "learning_rate": 8.401702165812478e-06, "loss": 0.1347, "num_input_tokens_seen": 78878608, "step": 36580 }, { "epoch": 6.714075977243531, "grad_norm": 4.446948528289795, "learning_rate": 8.401115250635434e-06, "loss": 0.2525, "num_input_tokens_seen": 78890416, "step": 36585 }, { "epoch": 6.714993576803083, "grad_norm": 1.485896110534668, "learning_rate": 8.400528248225055e-06, "loss": 0.2734, "num_input_tokens_seen": 78901136, "step": 36590 }, { "epoch": 6.715911176362635, "grad_norm": 8.945725440979004, "learning_rate": 8.399941158596397e-06, "loss": 0.2856, "num_input_tokens_seen": 78911312, "step": 36595 }, { "epoch": 6.716828775922187, "grad_norm": 0.8837042450904846, "learning_rate": 8.399353981764516e-06, "loss": 0.2703, "num_input_tokens_seen": 78922512, "step": 36600 }, { "epoch": 6.71774637548174, "grad_norm": 5.613224983215332, "learning_rate": 8.398766717744476e-06, "loss": 0.3257, "num_input_tokens_seen": 78932752, "step": 36605 }, { "epoch": 6.7186639750412915, "grad_norm": 6.915438175201416, "learning_rate": 8.398179366551336e-06, "loss": 0.3243, "num_input_tokens_seen": 78945008, "step": 36610 }, { "epoch": 6.719581574600844, "grad_norm": 1.8417835235595703, "learning_rate": 8.397591928200163e-06, "loss": 0.1301, "num_input_tokens_seen": 78954512, "step": 36615 }, { "epoch": 6.7204991741603965, "grad_norm": 13.634084701538086, "learning_rate": 8.397004402706021e-06, "loss": 0.4061, "num_input_tokens_seen": 78965008, "step": 36620 }, { "epoch": 6.721416773719948, "grad_norm": 10.81894588470459, "learning_rate": 8.396416790083983e-06, "loss": 0.3497, "num_input_tokens_seen": 78976368, "step": 36625 }, { "epoch": 6.722334373279501, "grad_norm": 0.5542864799499512, "learning_rate": 8.395829090349118e-06, "loss": 0.3605, "num_input_tokens_seen": 78987312, "step": 36630 }, { "epoch": 6.723251972839053, "grad_norm": 11.33669376373291, "learning_rate": 8.395241303516499e-06, "loss": 0.3905, "num_input_tokens_seen": 78998096, "step": 36635 }, { "epoch": 6.724169572398605, "grad_norm": 29.386024475097656, "learning_rate": 8.394653429601203e-06, "loss": 0.2542, "num_input_tokens_seen": 79009456, "step": 36640 }, { "epoch": 6.725087171958157, "grad_norm": 9.813275337219238, "learning_rate": 8.394065468618309e-06, "loss": 0.324, "num_input_tokens_seen": 79021040, "step": 36645 }, { "epoch": 6.72600477151771, "grad_norm": 8.274032592773438, "learning_rate": 8.393477420582894e-06, "loss": 0.2944, "num_input_tokens_seen": 79031696, "step": 36650 }, { "epoch": 6.726922371077261, "grad_norm": 11.300191879272461, "learning_rate": 8.392889285510045e-06, "loss": 0.485, "num_input_tokens_seen": 79042864, "step": 36655 }, { "epoch": 6.727839970636814, "grad_norm": 6.207554340362549, "learning_rate": 8.392301063414843e-06, "loss": 0.3933, "num_input_tokens_seen": 79052752, "step": 36660 }, { "epoch": 6.7287575701963664, "grad_norm": 15.967100143432617, "learning_rate": 8.391712754312375e-06, "loss": 0.1548, "num_input_tokens_seen": 79062736, "step": 36665 }, { "epoch": 6.729675169755918, "grad_norm": 2.142072916030884, "learning_rate": 8.391124358217732e-06, "loss": 0.0585, "num_input_tokens_seen": 79073136, "step": 36670 }, { "epoch": 6.730592769315471, "grad_norm": 0.7732707858085632, "learning_rate": 8.390535875146006e-06, "loss": 0.1261, "num_input_tokens_seen": 79084208, "step": 36675 }, { "epoch": 6.731510368875023, "grad_norm": 9.249924659729004, "learning_rate": 8.38994730511229e-06, "loss": 0.1933, "num_input_tokens_seen": 79094256, "step": 36680 }, { "epoch": 6.732427968434575, "grad_norm": 1.3220903873443604, "learning_rate": 8.38935864813168e-06, "loss": 0.3154, "num_input_tokens_seen": 79105648, "step": 36685 }, { "epoch": 6.733345567994127, "grad_norm": 3.0441222190856934, "learning_rate": 8.388769904219272e-06, "loss": 0.2948, "num_input_tokens_seen": 79115696, "step": 36690 }, { "epoch": 6.73426316755368, "grad_norm": 0.9573777914047241, "learning_rate": 8.388181073390169e-06, "loss": 0.2842, "num_input_tokens_seen": 79125232, "step": 36695 }, { "epoch": 6.735180767113231, "grad_norm": 1.2287733554840088, "learning_rate": 8.387592155659472e-06, "loss": 0.1622, "num_input_tokens_seen": 79136240, "step": 36700 }, { "epoch": 6.736098366672784, "grad_norm": 0.9183265566825867, "learning_rate": 8.387003151042288e-06, "loss": 0.2612, "num_input_tokens_seen": 79147536, "step": 36705 }, { "epoch": 6.737015966232336, "grad_norm": 12.64716625213623, "learning_rate": 8.38641405955372e-06, "loss": 0.2206, "num_input_tokens_seen": 79158608, "step": 36710 }, { "epoch": 6.737933565791888, "grad_norm": 17.058271408081055, "learning_rate": 8.385824881208881e-06, "loss": 0.61, "num_input_tokens_seen": 79169200, "step": 36715 }, { "epoch": 6.7388511653514405, "grad_norm": 1.1432127952575684, "learning_rate": 8.385235616022883e-06, "loss": 0.1298, "num_input_tokens_seen": 79180784, "step": 36720 }, { "epoch": 6.739768764910993, "grad_norm": 0.8613370656967163, "learning_rate": 8.384646264010836e-06, "loss": 0.3577, "num_input_tokens_seen": 79191152, "step": 36725 }, { "epoch": 6.740686364470545, "grad_norm": 16.58624839782715, "learning_rate": 8.384056825187859e-06, "loss": 0.2124, "num_input_tokens_seen": 79202128, "step": 36730 }, { "epoch": 6.741603964030097, "grad_norm": 0.6869890689849854, "learning_rate": 8.383467299569068e-06, "loss": 0.1474, "num_input_tokens_seen": 79214032, "step": 36735 }, { "epoch": 6.74252156358965, "grad_norm": 8.871127128601074, "learning_rate": 8.382877687169586e-06, "loss": 0.424, "num_input_tokens_seen": 79225040, "step": 36740 }, { "epoch": 6.743439163149201, "grad_norm": 26.417316436767578, "learning_rate": 8.382287988004534e-06, "loss": 0.2865, "num_input_tokens_seen": 79235088, "step": 36745 }, { "epoch": 6.744356762708754, "grad_norm": 0.588508129119873, "learning_rate": 8.381698202089036e-06, "loss": 0.1213, "num_input_tokens_seen": 79246512, "step": 36750 }, { "epoch": 6.745274362268306, "grad_norm": 4.366723537445068, "learning_rate": 8.381108329438224e-06, "loss": 0.1713, "num_input_tokens_seen": 79256336, "step": 36755 }, { "epoch": 6.746191961827858, "grad_norm": 18.222131729125977, "learning_rate": 8.38051837006722e-06, "loss": 0.1767, "num_input_tokens_seen": 79265584, "step": 36760 }, { "epoch": 6.74710956138741, "grad_norm": 0.370789498090744, "learning_rate": 8.379928323991162e-06, "loss": 0.2631, "num_input_tokens_seen": 79277776, "step": 36765 }, { "epoch": 6.748027160946963, "grad_norm": 5.470444202423096, "learning_rate": 8.379338191225177e-06, "loss": 0.2752, "num_input_tokens_seen": 79287376, "step": 36770 }, { "epoch": 6.7489447605065145, "grad_norm": 11.901711463928223, "learning_rate": 8.378747971784407e-06, "loss": 0.3478, "num_input_tokens_seen": 79298064, "step": 36775 }, { "epoch": 6.749862360066067, "grad_norm": 0.6650668978691101, "learning_rate": 8.378157665683987e-06, "loss": 0.3246, "num_input_tokens_seen": 79309424, "step": 36780 }, { "epoch": 6.7507799596256195, "grad_norm": 17.346920013427734, "learning_rate": 8.377567272939063e-06, "loss": 0.3828, "num_input_tokens_seen": 79319888, "step": 36785 }, { "epoch": 6.751697559185171, "grad_norm": 4.015365123748779, "learning_rate": 8.376976793564769e-06, "loss": 0.2765, "num_input_tokens_seen": 79330480, "step": 36790 }, { "epoch": 6.752615158744724, "grad_norm": 7.900008678436279, "learning_rate": 8.376386227576254e-06, "loss": 0.2537, "num_input_tokens_seen": 79341328, "step": 36795 }, { "epoch": 6.753532758304276, "grad_norm": 0.8344241976737976, "learning_rate": 8.375795574988667e-06, "loss": 0.3047, "num_input_tokens_seen": 79351632, "step": 36800 }, { "epoch": 6.754450357863828, "grad_norm": 7.319640636444092, "learning_rate": 8.375204835817155e-06, "loss": 0.2771, "num_input_tokens_seen": 79362320, "step": 36805 }, { "epoch": 6.75536795742338, "grad_norm": 50.190887451171875, "learning_rate": 8.374614010076869e-06, "loss": 0.3469, "num_input_tokens_seen": 79373520, "step": 36810 }, { "epoch": 6.756285556982933, "grad_norm": 4.720483303070068, "learning_rate": 8.374023097782963e-06, "loss": 0.296, "num_input_tokens_seen": 79384720, "step": 36815 }, { "epoch": 6.757203156542484, "grad_norm": 7.695402145385742, "learning_rate": 8.373432098950595e-06, "loss": 0.2765, "num_input_tokens_seen": 79394864, "step": 36820 }, { "epoch": 6.758120756102037, "grad_norm": 5.362732887268066, "learning_rate": 8.372841013594924e-06, "loss": 0.3137, "num_input_tokens_seen": 79405680, "step": 36825 }, { "epoch": 6.759038355661589, "grad_norm": 20.008460998535156, "learning_rate": 8.372249841731105e-06, "loss": 0.2566, "num_input_tokens_seen": 79417008, "step": 36830 }, { "epoch": 6.759955955221141, "grad_norm": 3.9478559494018555, "learning_rate": 8.371658583374306e-06, "loss": 0.3157, "num_input_tokens_seen": 79427312, "step": 36835 }, { "epoch": 6.7608735547806935, "grad_norm": 7.687046527862549, "learning_rate": 8.371067238539692e-06, "loss": 0.2453, "num_input_tokens_seen": 79438480, "step": 36840 }, { "epoch": 6.761791154340246, "grad_norm": 4.660411834716797, "learning_rate": 8.370475807242425e-06, "loss": 0.2069, "num_input_tokens_seen": 79450096, "step": 36845 }, { "epoch": 6.762708753899798, "grad_norm": 0.9904753565788269, "learning_rate": 8.369884289497678e-06, "loss": 0.2919, "num_input_tokens_seen": 79460624, "step": 36850 }, { "epoch": 6.76362635345935, "grad_norm": 11.627387046813965, "learning_rate": 8.369292685320623e-06, "loss": 0.2363, "num_input_tokens_seen": 79472336, "step": 36855 }, { "epoch": 6.764543953018903, "grad_norm": 3.2370123863220215, "learning_rate": 8.368700994726432e-06, "loss": 0.0814, "num_input_tokens_seen": 79484368, "step": 36860 }, { "epoch": 6.765461552578454, "grad_norm": 9.3468599319458, "learning_rate": 8.36810921773028e-06, "loss": 0.2413, "num_input_tokens_seen": 79494256, "step": 36865 }, { "epoch": 6.766379152138007, "grad_norm": 5.321096420288086, "learning_rate": 8.367517354347347e-06, "loss": 0.1658, "num_input_tokens_seen": 79506384, "step": 36870 }, { "epoch": 6.767296751697559, "grad_norm": 11.790637969970703, "learning_rate": 8.366925404592814e-06, "loss": 0.3054, "num_input_tokens_seen": 79516912, "step": 36875 }, { "epoch": 6.768214351257111, "grad_norm": 22.06077766418457, "learning_rate": 8.366333368481862e-06, "loss": 0.3454, "num_input_tokens_seen": 79527792, "step": 36880 }, { "epoch": 6.769131950816663, "grad_norm": 22.209457397460938, "learning_rate": 8.365741246029677e-06, "loss": 0.3955, "num_input_tokens_seen": 79538544, "step": 36885 }, { "epoch": 6.770049550376216, "grad_norm": 7.058590412139893, "learning_rate": 8.365149037251445e-06, "loss": 0.2245, "num_input_tokens_seen": 79548432, "step": 36890 }, { "epoch": 6.7709671499357675, "grad_norm": 6.176314830780029, "learning_rate": 8.364556742162355e-06, "loss": 0.1063, "num_input_tokens_seen": 79559376, "step": 36895 }, { "epoch": 6.77188474949532, "grad_norm": 4.016136646270752, "learning_rate": 8.363964360777602e-06, "loss": 0.1895, "num_input_tokens_seen": 79570512, "step": 36900 }, { "epoch": 6.7728023490548726, "grad_norm": 0.7275692820549011, "learning_rate": 8.363371893112372e-06, "loss": 0.1614, "num_input_tokens_seen": 79581264, "step": 36905 }, { "epoch": 6.773719948614424, "grad_norm": 5.636078357696533, "learning_rate": 8.36277933918187e-06, "loss": 0.3663, "num_input_tokens_seen": 79590960, "step": 36910 }, { "epoch": 6.774637548173977, "grad_norm": 16.212528228759766, "learning_rate": 8.36218669900129e-06, "loss": 0.2173, "num_input_tokens_seen": 79602032, "step": 36915 }, { "epoch": 6.775555147733529, "grad_norm": 21.17627716064453, "learning_rate": 8.36159397258583e-06, "loss": 0.241, "num_input_tokens_seen": 79612560, "step": 36920 }, { "epoch": 6.776472747293081, "grad_norm": 19.32586669921875, "learning_rate": 8.361001159950694e-06, "loss": 0.2981, "num_input_tokens_seen": 79623568, "step": 36925 }, { "epoch": 6.777390346852633, "grad_norm": 18.62322235107422, "learning_rate": 8.360408261111088e-06, "loss": 0.3173, "num_input_tokens_seen": 79634192, "step": 36930 }, { "epoch": 6.778307946412186, "grad_norm": 3.5698306560516357, "learning_rate": 8.359815276082219e-06, "loss": 0.2977, "num_input_tokens_seen": 79644560, "step": 36935 }, { "epoch": 6.779225545971738, "grad_norm": 13.670241355895996, "learning_rate": 8.359222204879296e-06, "loss": 0.226, "num_input_tokens_seen": 79656016, "step": 36940 }, { "epoch": 6.78014314553129, "grad_norm": 9.926196098327637, "learning_rate": 8.358629047517528e-06, "loss": 0.3597, "num_input_tokens_seen": 79667952, "step": 36945 }, { "epoch": 6.7810607450908424, "grad_norm": 2.7143213748931885, "learning_rate": 8.358035804012131e-06, "loss": 0.1935, "num_input_tokens_seen": 79677072, "step": 36950 }, { "epoch": 6.781978344650395, "grad_norm": 27.180212020874023, "learning_rate": 8.35744247437832e-06, "loss": 0.312, "num_input_tokens_seen": 79687952, "step": 36955 }, { "epoch": 6.782895944209947, "grad_norm": 6.545809745788574, "learning_rate": 8.356849058631314e-06, "loss": 0.208, "num_input_tokens_seen": 79699440, "step": 36960 }, { "epoch": 6.783813543769499, "grad_norm": 9.021157264709473, "learning_rate": 8.356255556786332e-06, "loss": 0.1239, "num_input_tokens_seen": 79710224, "step": 36965 }, { "epoch": 6.784731143329052, "grad_norm": 2.2018351554870605, "learning_rate": 8.355661968858595e-06, "loss": 0.3271, "num_input_tokens_seen": 79722960, "step": 36970 }, { "epoch": 6.785648742888603, "grad_norm": 10.951251983642578, "learning_rate": 8.355068294863331e-06, "loss": 0.1947, "num_input_tokens_seen": 79733744, "step": 36975 }, { "epoch": 6.786566342448156, "grad_norm": 3.4220690727233887, "learning_rate": 8.354474534815764e-06, "loss": 0.296, "num_input_tokens_seen": 79744144, "step": 36980 }, { "epoch": 6.787483942007708, "grad_norm": 38.97854232788086, "learning_rate": 8.353880688731126e-06, "loss": 0.3658, "num_input_tokens_seen": 79753616, "step": 36985 }, { "epoch": 6.78840154156726, "grad_norm": 4.748682498931885, "learning_rate": 8.353286756624645e-06, "loss": 0.26, "num_input_tokens_seen": 79765104, "step": 36990 }, { "epoch": 6.789319141126812, "grad_norm": 12.428865432739258, "learning_rate": 8.352692738511556e-06, "loss": 0.2385, "num_input_tokens_seen": 79775792, "step": 36995 }, { "epoch": 6.790236740686365, "grad_norm": 4.383593559265137, "learning_rate": 8.352098634407095e-06, "loss": 0.3707, "num_input_tokens_seen": 79785200, "step": 37000 }, { "epoch": 6.7911543402459165, "grad_norm": 6.845480918884277, "learning_rate": 8.3515044443265e-06, "loss": 0.2709, "num_input_tokens_seen": 79794960, "step": 37005 }, { "epoch": 6.792071939805469, "grad_norm": 0.758646547794342, "learning_rate": 8.350910168285008e-06, "loss": 0.1341, "num_input_tokens_seen": 79807088, "step": 37010 }, { "epoch": 6.7929895393650215, "grad_norm": 8.763187408447266, "learning_rate": 8.350315806297865e-06, "loss": 0.1149, "num_input_tokens_seen": 79817072, "step": 37015 }, { "epoch": 6.793907138924573, "grad_norm": 11.770079612731934, "learning_rate": 8.349721358380314e-06, "loss": 0.3571, "num_input_tokens_seen": 79828624, "step": 37020 }, { "epoch": 6.794824738484126, "grad_norm": 0.7009132504463196, "learning_rate": 8.349126824547603e-06, "loss": 0.3055, "num_input_tokens_seen": 79837680, "step": 37025 }, { "epoch": 6.795742338043678, "grad_norm": 11.777411460876465, "learning_rate": 8.348532204814976e-06, "loss": 0.1336, "num_input_tokens_seen": 79848400, "step": 37030 }, { "epoch": 6.79665993760323, "grad_norm": 3.761697292327881, "learning_rate": 8.347937499197691e-06, "loss": 0.3466, "num_input_tokens_seen": 79860368, "step": 37035 }, { "epoch": 6.797577537162782, "grad_norm": 16.03070068359375, "learning_rate": 8.347342707710997e-06, "loss": 0.1903, "num_input_tokens_seen": 79871120, "step": 37040 }, { "epoch": 6.798495136722335, "grad_norm": 20.02899742126465, "learning_rate": 8.34674783037015e-06, "loss": 0.3363, "num_input_tokens_seen": 79881232, "step": 37045 }, { "epoch": 6.799412736281886, "grad_norm": 19.710596084594727, "learning_rate": 8.346152867190409e-06, "loss": 0.2349, "num_input_tokens_seen": 79892112, "step": 37050 }, { "epoch": 6.800330335841439, "grad_norm": 1.1412770748138428, "learning_rate": 8.345557818187033e-06, "loss": 0.3665, "num_input_tokens_seen": 79902608, "step": 37055 }, { "epoch": 6.801247935400991, "grad_norm": 1.5250648260116577, "learning_rate": 8.344962683375284e-06, "loss": 0.2809, "num_input_tokens_seen": 79914000, "step": 37060 }, { "epoch": 6.802165534960543, "grad_norm": 6.720069885253906, "learning_rate": 8.344367462770426e-06, "loss": 0.4218, "num_input_tokens_seen": 79924080, "step": 37065 }, { "epoch": 6.8030831345200955, "grad_norm": 5.91660213470459, "learning_rate": 8.343772156387725e-06, "loss": 0.2788, "num_input_tokens_seen": 79934960, "step": 37070 }, { "epoch": 6.804000734079648, "grad_norm": 3.5878312587738037, "learning_rate": 8.343176764242452e-06, "loss": 0.248, "num_input_tokens_seen": 79946320, "step": 37075 }, { "epoch": 6.8049183336392, "grad_norm": 13.053787231445312, "learning_rate": 8.342581286349876e-06, "loss": 0.2978, "num_input_tokens_seen": 79957648, "step": 37080 }, { "epoch": 6.805835933198752, "grad_norm": 5.166977405548096, "learning_rate": 8.34198572272527e-06, "loss": 0.2607, "num_input_tokens_seen": 79969488, "step": 37085 }, { "epoch": 6.806753532758305, "grad_norm": 2.3561816215515137, "learning_rate": 8.341390073383911e-06, "loss": 0.3302, "num_input_tokens_seen": 79980240, "step": 37090 }, { "epoch": 6.807671132317856, "grad_norm": 11.571708679199219, "learning_rate": 8.340794338341075e-06, "loss": 0.3356, "num_input_tokens_seen": 79991120, "step": 37095 }, { "epoch": 6.808588731877409, "grad_norm": 2.2079906463623047, "learning_rate": 8.340198517612042e-06, "loss": 0.2909, "num_input_tokens_seen": 80003088, "step": 37100 }, { "epoch": 6.809506331436961, "grad_norm": 12.895278930664062, "learning_rate": 8.339602611212093e-06, "loss": 0.3241, "num_input_tokens_seen": 80014224, "step": 37105 }, { "epoch": 6.810423930996513, "grad_norm": 3.615431785583496, "learning_rate": 8.339006619156513e-06, "loss": 0.201, "num_input_tokens_seen": 80025680, "step": 37110 }, { "epoch": 6.811341530556065, "grad_norm": 4.00222110748291, "learning_rate": 8.338410541460589e-06, "loss": 0.2439, "num_input_tokens_seen": 80037200, "step": 37115 }, { "epoch": 6.812259130115618, "grad_norm": 0.8914550542831421, "learning_rate": 8.337814378139607e-06, "loss": 0.2199, "num_input_tokens_seen": 80048016, "step": 37120 }, { "epoch": 6.8131767296751695, "grad_norm": 2.5553958415985107, "learning_rate": 8.337218129208862e-06, "loss": 0.1729, "num_input_tokens_seen": 80057776, "step": 37125 }, { "epoch": 6.814094329234722, "grad_norm": 10.64294719696045, "learning_rate": 8.336621794683643e-06, "loss": 0.2493, "num_input_tokens_seen": 80069136, "step": 37130 }, { "epoch": 6.8150119287942745, "grad_norm": 0.580510139465332, "learning_rate": 8.336025374579246e-06, "loss": 0.1696, "num_input_tokens_seen": 80079920, "step": 37135 }, { "epoch": 6.815929528353826, "grad_norm": 11.805498123168945, "learning_rate": 8.335428868910968e-06, "loss": 0.3637, "num_input_tokens_seen": 80090384, "step": 37140 }, { "epoch": 6.816847127913379, "grad_norm": 12.353370666503906, "learning_rate": 8.33483227769411e-06, "loss": 0.2732, "num_input_tokens_seen": 80101392, "step": 37145 }, { "epoch": 6.817764727472931, "grad_norm": 7.077300548553467, "learning_rate": 8.334235600943972e-06, "loss": 0.2862, "num_input_tokens_seen": 80112400, "step": 37150 }, { "epoch": 6.818682327032483, "grad_norm": 6.788750171661377, "learning_rate": 8.33363883867586e-06, "loss": 0.2066, "num_input_tokens_seen": 80122672, "step": 37155 }, { "epoch": 6.819599926592035, "grad_norm": 3.056764602661133, "learning_rate": 8.333041990905076e-06, "loss": 0.3032, "num_input_tokens_seen": 80133712, "step": 37160 }, { "epoch": 6.820517526151588, "grad_norm": 4.7023162841796875, "learning_rate": 8.332445057646931e-06, "loss": 0.2766, "num_input_tokens_seen": 80142672, "step": 37165 }, { "epoch": 6.821435125711139, "grad_norm": 5.390524387359619, "learning_rate": 8.331848038916737e-06, "loss": 0.2064, "num_input_tokens_seen": 80153424, "step": 37170 }, { "epoch": 6.822352725270692, "grad_norm": 1.922349452972412, "learning_rate": 8.331250934729805e-06, "loss": 0.2525, "num_input_tokens_seen": 80164624, "step": 37175 }, { "epoch": 6.823270324830244, "grad_norm": 12.249918937683105, "learning_rate": 8.330653745101447e-06, "loss": 0.1449, "num_input_tokens_seen": 80174896, "step": 37180 }, { "epoch": 6.824187924389796, "grad_norm": 2.80535626411438, "learning_rate": 8.330056470046983e-06, "loss": 0.2172, "num_input_tokens_seen": 80185360, "step": 37185 }, { "epoch": 6.8251055239493486, "grad_norm": 10.865224838256836, "learning_rate": 8.329459109581731e-06, "loss": 0.2303, "num_input_tokens_seen": 80196560, "step": 37190 }, { "epoch": 6.826023123508901, "grad_norm": 0.6832436919212341, "learning_rate": 8.328861663721017e-06, "loss": 0.1452, "num_input_tokens_seen": 80207760, "step": 37195 }, { "epoch": 6.826940723068453, "grad_norm": 1.0553010702133179, "learning_rate": 8.328264132480157e-06, "loss": 0.1624, "num_input_tokens_seen": 80218704, "step": 37200 }, { "epoch": 6.827858322628005, "grad_norm": 2.642725944519043, "learning_rate": 8.32766651587448e-06, "loss": 0.2622, "num_input_tokens_seen": 80229744, "step": 37205 }, { "epoch": 6.828775922187558, "grad_norm": 16.13677978515625, "learning_rate": 8.327068813919317e-06, "loss": 0.2429, "num_input_tokens_seen": 80240272, "step": 37210 }, { "epoch": 6.829693521747109, "grad_norm": 10.47223949432373, "learning_rate": 8.326471026629994e-06, "loss": 0.3014, "num_input_tokens_seen": 80251600, "step": 37215 }, { "epoch": 6.830611121306662, "grad_norm": 10.926518440246582, "learning_rate": 8.325873154021844e-06, "loss": 0.356, "num_input_tokens_seen": 80262480, "step": 37220 }, { "epoch": 6.831528720866214, "grad_norm": 11.868793487548828, "learning_rate": 8.325275196110202e-06, "loss": 0.3294, "num_input_tokens_seen": 80273168, "step": 37225 }, { "epoch": 6.832446320425766, "grad_norm": 1.677975058555603, "learning_rate": 8.324677152910406e-06, "loss": 0.163, "num_input_tokens_seen": 80282608, "step": 37230 }, { "epoch": 6.8333639199853184, "grad_norm": 1.146077036857605, "learning_rate": 8.324079024437795e-06, "loss": 0.1082, "num_input_tokens_seen": 80294384, "step": 37235 }, { "epoch": 6.834281519544871, "grad_norm": 1.0859723091125488, "learning_rate": 8.323480810707707e-06, "loss": 0.2737, "num_input_tokens_seen": 80303344, "step": 37240 }, { "epoch": 6.835199119104423, "grad_norm": 0.65693199634552, "learning_rate": 8.322882511735489e-06, "loss": 0.2589, "num_input_tokens_seen": 80314544, "step": 37245 }, { "epoch": 6.836116718663975, "grad_norm": 0.42517802119255066, "learning_rate": 8.322284127536481e-06, "loss": 0.171, "num_input_tokens_seen": 80325520, "step": 37250 }, { "epoch": 6.837034318223528, "grad_norm": 5.055140018463135, "learning_rate": 8.321685658126037e-06, "loss": 0.2671, "num_input_tokens_seen": 80334288, "step": 37255 }, { "epoch": 6.837951917783079, "grad_norm": 2.6693332195281982, "learning_rate": 8.321087103519503e-06, "loss": 0.2339, "num_input_tokens_seen": 80344272, "step": 37260 }, { "epoch": 6.838869517342632, "grad_norm": 6.031564235687256, "learning_rate": 8.320488463732232e-06, "loss": 0.2956, "num_input_tokens_seen": 80356560, "step": 37265 }, { "epoch": 6.839787116902184, "grad_norm": 9.062698364257812, "learning_rate": 8.31988973877958e-06, "loss": 0.2899, "num_input_tokens_seen": 80366384, "step": 37270 }, { "epoch": 6.840704716461736, "grad_norm": 0.9060702919960022, "learning_rate": 8.319290928676899e-06, "loss": 0.25, "num_input_tokens_seen": 80377424, "step": 37275 }, { "epoch": 6.841622316021288, "grad_norm": 18.20264434814453, "learning_rate": 8.318692033439553e-06, "loss": 0.5041, "num_input_tokens_seen": 80388400, "step": 37280 }, { "epoch": 6.842539915580841, "grad_norm": 0.7322723269462585, "learning_rate": 8.318093053082898e-06, "loss": 0.3207, "num_input_tokens_seen": 80398608, "step": 37285 }, { "epoch": 6.8434575151403925, "grad_norm": 3.194673776626587, "learning_rate": 8.317493987622299e-06, "loss": 0.3289, "num_input_tokens_seen": 80410160, "step": 37290 }, { "epoch": 6.844375114699945, "grad_norm": 7.246490001678467, "learning_rate": 8.316894837073119e-06, "loss": 0.236, "num_input_tokens_seen": 80421072, "step": 37295 }, { "epoch": 6.8452927142594975, "grad_norm": 1.935093879699707, "learning_rate": 8.31629560145073e-06, "loss": 0.1925, "num_input_tokens_seen": 80430864, "step": 37300 }, { "epoch": 6.846210313819049, "grad_norm": 35.989498138427734, "learning_rate": 8.315696280770498e-06, "loss": 0.295, "num_input_tokens_seen": 80441424, "step": 37305 }, { "epoch": 6.847127913378602, "grad_norm": 8.720141410827637, "learning_rate": 8.315096875047795e-06, "loss": 0.4562, "num_input_tokens_seen": 80451920, "step": 37310 }, { "epoch": 6.848045512938154, "grad_norm": 2.6346657276153564, "learning_rate": 8.314497384297994e-06, "loss": 0.2992, "num_input_tokens_seen": 80462672, "step": 37315 }, { "epoch": 6.848963112497706, "grad_norm": 17.253278732299805, "learning_rate": 8.313897808536472e-06, "loss": 0.2448, "num_input_tokens_seen": 80473744, "step": 37320 }, { "epoch": 6.849880712057258, "grad_norm": 7.719501972198486, "learning_rate": 8.31329814777861e-06, "loss": 0.4089, "num_input_tokens_seen": 80484848, "step": 37325 }, { "epoch": 6.850798311616811, "grad_norm": 3.151381254196167, "learning_rate": 8.312698402039783e-06, "loss": 0.1604, "num_input_tokens_seen": 80496208, "step": 37330 }, { "epoch": 6.851715911176362, "grad_norm": 3.118424415588379, "learning_rate": 8.312098571335377e-06, "loss": 0.3489, "num_input_tokens_seen": 80506384, "step": 37335 }, { "epoch": 6.852633510735915, "grad_norm": 20.815187454223633, "learning_rate": 8.311498655680777e-06, "loss": 0.3459, "num_input_tokens_seen": 80518352, "step": 37340 }, { "epoch": 6.853551110295467, "grad_norm": 9.463088989257812, "learning_rate": 8.310898655091368e-06, "loss": 0.2499, "num_input_tokens_seen": 80529264, "step": 37345 }, { "epoch": 6.854468709855019, "grad_norm": 1.2214739322662354, "learning_rate": 8.310298569582539e-06, "loss": 0.1167, "num_input_tokens_seen": 80540048, "step": 37350 }, { "epoch": 6.8553863094145715, "grad_norm": 7.303586959838867, "learning_rate": 8.309698399169683e-06, "loss": 0.1689, "num_input_tokens_seen": 80552048, "step": 37355 }, { "epoch": 6.856303908974124, "grad_norm": 25.543285369873047, "learning_rate": 8.309098143868193e-06, "loss": 0.2345, "num_input_tokens_seen": 80561328, "step": 37360 }, { "epoch": 6.857221508533676, "grad_norm": 2.353032112121582, "learning_rate": 8.308497803693463e-06, "loss": 0.2953, "num_input_tokens_seen": 80572304, "step": 37365 }, { "epoch": 6.858139108093228, "grad_norm": 16.76913070678711, "learning_rate": 8.307897378660894e-06, "loss": 0.1881, "num_input_tokens_seen": 80580816, "step": 37370 }, { "epoch": 6.859056707652781, "grad_norm": 23.235815048217773, "learning_rate": 8.307296868785882e-06, "loss": 0.2762, "num_input_tokens_seen": 80590928, "step": 37375 }, { "epoch": 6.859974307212332, "grad_norm": 15.125642776489258, "learning_rate": 8.306696274083833e-06, "loss": 0.3208, "num_input_tokens_seen": 80602544, "step": 37380 }, { "epoch": 6.860891906771885, "grad_norm": 1.0121188163757324, "learning_rate": 8.306095594570149e-06, "loss": 0.2218, "num_input_tokens_seen": 80613520, "step": 37385 }, { "epoch": 6.861809506331437, "grad_norm": 1.0883749723434448, "learning_rate": 8.305494830260237e-06, "loss": 0.2541, "num_input_tokens_seen": 80624464, "step": 37390 }, { "epoch": 6.862727105890989, "grad_norm": 2.901479482650757, "learning_rate": 8.304893981169503e-06, "loss": 0.2839, "num_input_tokens_seen": 80636560, "step": 37395 }, { "epoch": 6.863644705450541, "grad_norm": 11.919904708862305, "learning_rate": 8.304293047313363e-06, "loss": 0.3978, "num_input_tokens_seen": 80647376, "step": 37400 }, { "epoch": 6.864562305010094, "grad_norm": 18.85370445251465, "learning_rate": 8.303692028707229e-06, "loss": 0.1309, "num_input_tokens_seen": 80658384, "step": 37405 }, { "epoch": 6.8654799045696455, "grad_norm": 0.7614285945892334, "learning_rate": 8.303090925366513e-06, "loss": 0.3109, "num_input_tokens_seen": 80668432, "step": 37410 }, { "epoch": 6.866397504129198, "grad_norm": 5.8913750648498535, "learning_rate": 8.302489737306634e-06, "loss": 0.1951, "num_input_tokens_seen": 80679440, "step": 37415 }, { "epoch": 6.8673151036887505, "grad_norm": 9.951282501220703, "learning_rate": 8.30188846454301e-06, "loss": 0.391, "num_input_tokens_seen": 80690704, "step": 37420 }, { "epoch": 6.868232703248302, "grad_norm": 22.427642822265625, "learning_rate": 8.301287107091067e-06, "loss": 0.4378, "num_input_tokens_seen": 80701424, "step": 37425 }, { "epoch": 6.869150302807855, "grad_norm": 29.18395233154297, "learning_rate": 8.300685664966226e-06, "loss": 0.1848, "num_input_tokens_seen": 80711696, "step": 37430 }, { "epoch": 6.870067902367407, "grad_norm": 15.344293594360352, "learning_rate": 8.300084138183913e-06, "loss": 0.3116, "num_input_tokens_seen": 80722032, "step": 37435 }, { "epoch": 6.870985501926959, "grad_norm": 11.574642181396484, "learning_rate": 8.299482526759554e-06, "loss": 0.479, "num_input_tokens_seen": 80731568, "step": 37440 }, { "epoch": 6.871903101486511, "grad_norm": 13.614300727844238, "learning_rate": 8.298880830708586e-06, "loss": 0.2217, "num_input_tokens_seen": 80742448, "step": 37445 }, { "epoch": 6.872820701046064, "grad_norm": 7.921148777008057, "learning_rate": 8.298279050046434e-06, "loss": 0.3042, "num_input_tokens_seen": 80753136, "step": 37450 }, { "epoch": 6.873738300605615, "grad_norm": 1.4595680236816406, "learning_rate": 8.297677184788539e-06, "loss": 0.3179, "num_input_tokens_seen": 80762992, "step": 37455 }, { "epoch": 6.874655900165168, "grad_norm": 14.746747970581055, "learning_rate": 8.297075234950333e-06, "loss": 0.2148, "num_input_tokens_seen": 80774736, "step": 37460 }, { "epoch": 6.87557349972472, "grad_norm": 7.017154693603516, "learning_rate": 8.29647320054726e-06, "loss": 0.3215, "num_input_tokens_seen": 80785712, "step": 37465 }, { "epoch": 6.876491099284272, "grad_norm": 15.697115898132324, "learning_rate": 8.295871081594755e-06, "loss": 0.4076, "num_input_tokens_seen": 80796944, "step": 37470 }, { "epoch": 6.8774086988438246, "grad_norm": 3.897469997406006, "learning_rate": 8.295268878108266e-06, "loss": 0.4835, "num_input_tokens_seen": 80806672, "step": 37475 }, { "epoch": 6.878326298403377, "grad_norm": 2.7218453884124756, "learning_rate": 8.29466659010324e-06, "loss": 0.5156, "num_input_tokens_seen": 80817968, "step": 37480 }, { "epoch": 6.879243897962929, "grad_norm": 0.8698859810829163, "learning_rate": 8.29406421759512e-06, "loss": 0.1525, "num_input_tokens_seen": 80829200, "step": 37485 }, { "epoch": 6.880161497522481, "grad_norm": 8.478529930114746, "learning_rate": 8.293461760599357e-06, "loss": 0.204, "num_input_tokens_seen": 80840848, "step": 37490 }, { "epoch": 6.881079097082034, "grad_norm": 11.385581970214844, "learning_rate": 8.292859219131406e-06, "loss": 0.263, "num_input_tokens_seen": 80851600, "step": 37495 }, { "epoch": 6.881996696641585, "grad_norm": 5.560281276702881, "learning_rate": 8.292256593206719e-06, "loss": 0.1112, "num_input_tokens_seen": 80862128, "step": 37500 }, { "epoch": 6.882914296201138, "grad_norm": 6.4480881690979, "learning_rate": 8.291653882840754e-06, "loss": 0.2373, "num_input_tokens_seen": 80872528, "step": 37505 }, { "epoch": 6.88383189576069, "grad_norm": 8.895123481750488, "learning_rate": 8.291051088048967e-06, "loss": 0.186, "num_input_tokens_seen": 80883600, "step": 37510 }, { "epoch": 6.884749495320242, "grad_norm": 22.430395126342773, "learning_rate": 8.290448208846823e-06, "loss": 0.3103, "num_input_tokens_seen": 80894256, "step": 37515 }, { "epoch": 6.8856670948797944, "grad_norm": 1.5543667078018188, "learning_rate": 8.289845245249779e-06, "loss": 0.2523, "num_input_tokens_seen": 80904528, "step": 37520 }, { "epoch": 6.886584694439347, "grad_norm": 2.4661238193511963, "learning_rate": 8.289242197273303e-06, "loss": 0.3253, "num_input_tokens_seen": 80915952, "step": 37525 }, { "epoch": 6.887502293998899, "grad_norm": 4.445474624633789, "learning_rate": 8.288639064932864e-06, "loss": 0.3678, "num_input_tokens_seen": 80927184, "step": 37530 }, { "epoch": 6.888419893558451, "grad_norm": 11.398106575012207, "learning_rate": 8.28803584824393e-06, "loss": 0.2193, "num_input_tokens_seen": 80937936, "step": 37535 }, { "epoch": 6.889337493118004, "grad_norm": 4.388995170593262, "learning_rate": 8.287432547221972e-06, "loss": 0.1991, "num_input_tokens_seen": 80948176, "step": 37540 }, { "epoch": 6.890255092677555, "grad_norm": 5.8506083488464355, "learning_rate": 8.286829161882463e-06, "loss": 0.2966, "num_input_tokens_seen": 80959088, "step": 37545 }, { "epoch": 6.891172692237108, "grad_norm": 1.5415401458740234, "learning_rate": 8.286225692240883e-06, "loss": 0.1296, "num_input_tokens_seen": 80970544, "step": 37550 }, { "epoch": 6.89209029179666, "grad_norm": 4.767662525177002, "learning_rate": 8.285622138312705e-06, "loss": 0.3509, "num_input_tokens_seen": 80981680, "step": 37555 }, { "epoch": 6.893007891356212, "grad_norm": 4.213907241821289, "learning_rate": 8.285018500113413e-06, "loss": 0.2118, "num_input_tokens_seen": 80992048, "step": 37560 }, { "epoch": 6.893925490915764, "grad_norm": 17.70275115966797, "learning_rate": 8.284414777658487e-06, "loss": 0.4059, "num_input_tokens_seen": 81002672, "step": 37565 }, { "epoch": 6.894843090475317, "grad_norm": 17.75473976135254, "learning_rate": 8.283810970963411e-06, "loss": 0.3502, "num_input_tokens_seen": 81015216, "step": 37570 }, { "epoch": 6.8957606900348685, "grad_norm": 13.041672706604004, "learning_rate": 8.283207080043675e-06, "loss": 0.4054, "num_input_tokens_seen": 81027184, "step": 37575 }, { "epoch": 6.896678289594421, "grad_norm": 1.395337462425232, "learning_rate": 8.282603104914765e-06, "loss": 0.0633, "num_input_tokens_seen": 81038000, "step": 37580 }, { "epoch": 6.8975958891539735, "grad_norm": 2.6817879676818848, "learning_rate": 8.281999045592172e-06, "loss": 0.1664, "num_input_tokens_seen": 81048304, "step": 37585 }, { "epoch": 6.898513488713525, "grad_norm": 22.165376663208008, "learning_rate": 8.281394902091392e-06, "loss": 0.3248, "num_input_tokens_seen": 81058448, "step": 37590 }, { "epoch": 6.899431088273078, "grad_norm": 8.659002304077148, "learning_rate": 8.280790674427917e-06, "loss": 0.2758, "num_input_tokens_seen": 81069168, "step": 37595 }, { "epoch": 6.90034868783263, "grad_norm": 0.7933565974235535, "learning_rate": 8.280186362617247e-06, "loss": 0.1826, "num_input_tokens_seen": 81080336, "step": 37600 }, { "epoch": 6.901266287392182, "grad_norm": 15.31432819366455, "learning_rate": 8.279581966674881e-06, "loss": 0.1937, "num_input_tokens_seen": 81090416, "step": 37605 }, { "epoch": 6.902183886951734, "grad_norm": 9.728704452514648, "learning_rate": 8.27897748661632e-06, "loss": 0.4154, "num_input_tokens_seen": 81100176, "step": 37610 }, { "epoch": 6.903101486511287, "grad_norm": 10.815613746643066, "learning_rate": 8.278372922457067e-06, "loss": 0.3171, "num_input_tokens_seen": 81110896, "step": 37615 }, { "epoch": 6.904019086070838, "grad_norm": 0.37496694922447205, "learning_rate": 8.27776827421263e-06, "loss": 0.1686, "num_input_tokens_seen": 81120912, "step": 37620 }, { "epoch": 6.904936685630391, "grad_norm": 9.505131721496582, "learning_rate": 8.277163541898518e-06, "loss": 0.3981, "num_input_tokens_seen": 81131056, "step": 37625 }, { "epoch": 6.905854285189943, "grad_norm": 2.8788490295410156, "learning_rate": 8.27655872553024e-06, "loss": 0.2988, "num_input_tokens_seen": 81141008, "step": 37630 }, { "epoch": 6.906771884749495, "grad_norm": 1.4593638181686401, "learning_rate": 8.275953825123308e-06, "loss": 0.1973, "num_input_tokens_seen": 81152016, "step": 37635 }, { "epoch": 6.9076894843090475, "grad_norm": 10.233607292175293, "learning_rate": 8.275348840693241e-06, "loss": 0.3655, "num_input_tokens_seen": 81163376, "step": 37640 }, { "epoch": 6.9086070838686, "grad_norm": 2.7908966541290283, "learning_rate": 8.274743772255549e-06, "loss": 0.2081, "num_input_tokens_seen": 81173232, "step": 37645 }, { "epoch": 6.909524683428152, "grad_norm": 9.926161766052246, "learning_rate": 8.274138619825756e-06, "loss": 0.1799, "num_input_tokens_seen": 81184560, "step": 37650 }, { "epoch": 6.910442282987704, "grad_norm": 20.390399932861328, "learning_rate": 8.27353338341938e-06, "loss": 0.3538, "num_input_tokens_seen": 81195568, "step": 37655 }, { "epoch": 6.911359882547257, "grad_norm": 0.750754177570343, "learning_rate": 8.272928063051948e-06, "loss": 0.266, "num_input_tokens_seen": 81206448, "step": 37660 }, { "epoch": 6.912277482106808, "grad_norm": 5.356266975402832, "learning_rate": 8.272322658738984e-06, "loss": 0.389, "num_input_tokens_seen": 81217328, "step": 37665 }, { "epoch": 6.913195081666361, "grad_norm": 1.2807279825210571, "learning_rate": 8.271717170496013e-06, "loss": 0.1826, "num_input_tokens_seen": 81228624, "step": 37670 }, { "epoch": 6.914112681225913, "grad_norm": 7.026865482330322, "learning_rate": 8.271111598338571e-06, "loss": 0.1745, "num_input_tokens_seen": 81238512, "step": 37675 }, { "epoch": 6.915030280785465, "grad_norm": 0.971001386642456, "learning_rate": 8.270505942282184e-06, "loss": 0.2822, "num_input_tokens_seen": 81249200, "step": 37680 }, { "epoch": 6.915947880345017, "grad_norm": 5.268797874450684, "learning_rate": 8.269900202342388e-06, "loss": 0.0755, "num_input_tokens_seen": 81260112, "step": 37685 }, { "epoch": 6.91686547990457, "grad_norm": 10.956745147705078, "learning_rate": 8.269294378534722e-06, "loss": 0.2485, "num_input_tokens_seen": 81271568, "step": 37690 }, { "epoch": 6.9177830794641215, "grad_norm": 6.398222923278809, "learning_rate": 8.268688470874719e-06, "loss": 0.3104, "num_input_tokens_seen": 81283216, "step": 37695 }, { "epoch": 6.918700679023674, "grad_norm": 8.955700874328613, "learning_rate": 8.268082479377926e-06, "loss": 0.3066, "num_input_tokens_seen": 81293040, "step": 37700 }, { "epoch": 6.9196182785832265, "grad_norm": 24.55470085144043, "learning_rate": 8.26747640405988e-06, "loss": 0.4158, "num_input_tokens_seen": 81304464, "step": 37705 }, { "epoch": 6.920535878142778, "grad_norm": 0.8057831525802612, "learning_rate": 8.26687024493613e-06, "loss": 0.157, "num_input_tokens_seen": 81315120, "step": 37710 }, { "epoch": 6.921453477702331, "grad_norm": 6.853453636169434, "learning_rate": 8.26626400202222e-06, "loss": 0.2956, "num_input_tokens_seen": 81325680, "step": 37715 }, { "epoch": 6.922371077261883, "grad_norm": 15.568110466003418, "learning_rate": 8.2656576753337e-06, "loss": 0.1561, "num_input_tokens_seen": 81337200, "step": 37720 }, { "epoch": 6.923288676821435, "grad_norm": 13.291322708129883, "learning_rate": 8.265051264886124e-06, "loss": 0.1763, "num_input_tokens_seen": 81347472, "step": 37725 }, { "epoch": 6.924206276380987, "grad_norm": 0.7511348724365234, "learning_rate": 8.264444770695043e-06, "loss": 0.278, "num_input_tokens_seen": 81358224, "step": 37730 }, { "epoch": 6.92512387594054, "grad_norm": 2.539106845855713, "learning_rate": 8.263838192776014e-06, "loss": 0.3238, "num_input_tokens_seen": 81369456, "step": 37735 }, { "epoch": 6.926041475500091, "grad_norm": 14.372078895568848, "learning_rate": 8.26323153114459e-06, "loss": 0.3608, "num_input_tokens_seen": 81381904, "step": 37740 }, { "epoch": 6.926959075059644, "grad_norm": 12.432210922241211, "learning_rate": 8.262624785816338e-06, "loss": 0.2519, "num_input_tokens_seen": 81393808, "step": 37745 }, { "epoch": 6.927876674619196, "grad_norm": 11.487756729125977, "learning_rate": 8.262017956806818e-06, "loss": 0.3905, "num_input_tokens_seen": 81404688, "step": 37750 }, { "epoch": 6.928794274178748, "grad_norm": 23.048362731933594, "learning_rate": 8.261411044131591e-06, "loss": 0.2443, "num_input_tokens_seen": 81414896, "step": 37755 }, { "epoch": 6.9297118737383006, "grad_norm": 0.8838080763816833, "learning_rate": 8.260804047806226e-06, "loss": 0.2742, "num_input_tokens_seen": 81425584, "step": 37760 }, { "epoch": 6.930629473297853, "grad_norm": 0.6008715629577637, "learning_rate": 8.26019696784629e-06, "loss": 0.1113, "num_input_tokens_seen": 81437744, "step": 37765 }, { "epoch": 6.931547072857405, "grad_norm": 8.204492568969727, "learning_rate": 8.259589804267354e-06, "loss": 0.3303, "num_input_tokens_seen": 81448944, "step": 37770 }, { "epoch": 6.932464672416957, "grad_norm": 2.034013509750366, "learning_rate": 8.258982557084993e-06, "loss": 0.2559, "num_input_tokens_seen": 81459056, "step": 37775 }, { "epoch": 6.93338227197651, "grad_norm": 10.802020072937012, "learning_rate": 8.258375226314781e-06, "loss": 0.4769, "num_input_tokens_seen": 81470800, "step": 37780 }, { "epoch": 6.934299871536061, "grad_norm": 8.127385139465332, "learning_rate": 8.257767811972292e-06, "loss": 0.3421, "num_input_tokens_seen": 81479920, "step": 37785 }, { "epoch": 6.935217471095614, "grad_norm": 0.567926287651062, "learning_rate": 8.25716031407311e-06, "loss": 0.12, "num_input_tokens_seen": 81490992, "step": 37790 }, { "epoch": 6.936135070655166, "grad_norm": 8.59365177154541, "learning_rate": 8.256552732632813e-06, "loss": 0.4677, "num_input_tokens_seen": 81502192, "step": 37795 }, { "epoch": 6.937052670214718, "grad_norm": 2.999291181564331, "learning_rate": 8.255945067666987e-06, "loss": 0.1225, "num_input_tokens_seen": 81512816, "step": 37800 }, { "epoch": 6.9379702697742704, "grad_norm": 6.431742191314697, "learning_rate": 8.255337319191215e-06, "loss": 0.283, "num_input_tokens_seen": 81524080, "step": 37805 }, { "epoch": 6.938887869333823, "grad_norm": 9.405827522277832, "learning_rate": 8.254729487221086e-06, "loss": 0.2658, "num_input_tokens_seen": 81534960, "step": 37810 }, { "epoch": 6.939805468893375, "grad_norm": 7.215823650360107, "learning_rate": 8.25412157177219e-06, "loss": 0.234, "num_input_tokens_seen": 81545808, "step": 37815 }, { "epoch": 6.940723068452927, "grad_norm": 17.011323928833008, "learning_rate": 8.253513572860119e-06, "loss": 0.3836, "num_input_tokens_seen": 81556752, "step": 37820 }, { "epoch": 6.94164066801248, "grad_norm": 7.073508262634277, "learning_rate": 8.25290549050047e-06, "loss": 0.2405, "num_input_tokens_seen": 81566544, "step": 37825 }, { "epoch": 6.942558267572031, "grad_norm": 51.58307647705078, "learning_rate": 8.252297324708834e-06, "loss": 0.2579, "num_input_tokens_seen": 81577264, "step": 37830 }, { "epoch": 6.943475867131584, "grad_norm": 1.3935099840164185, "learning_rate": 8.251689075500811e-06, "loss": 0.215, "num_input_tokens_seen": 81588656, "step": 37835 }, { "epoch": 6.944393466691136, "grad_norm": 7.298877716064453, "learning_rate": 8.251080742892005e-06, "loss": 0.2707, "num_input_tokens_seen": 81598864, "step": 37840 }, { "epoch": 6.945311066250688, "grad_norm": 10.579069137573242, "learning_rate": 8.250472326898016e-06, "loss": 0.2122, "num_input_tokens_seen": 81609072, "step": 37845 }, { "epoch": 6.94622866581024, "grad_norm": 6.709453582763672, "learning_rate": 8.24986382753445e-06, "loss": 0.2296, "num_input_tokens_seen": 81620208, "step": 37850 }, { "epoch": 6.947146265369793, "grad_norm": 11.819411277770996, "learning_rate": 8.249255244816914e-06, "loss": 0.2768, "num_input_tokens_seen": 81629040, "step": 37855 }, { "epoch": 6.9480638649293445, "grad_norm": 11.693320274353027, "learning_rate": 8.248646578761016e-06, "loss": 0.2378, "num_input_tokens_seen": 81640272, "step": 37860 }, { "epoch": 6.948981464488897, "grad_norm": 10.717169761657715, "learning_rate": 8.248037829382369e-06, "loss": 0.2096, "num_input_tokens_seen": 81650224, "step": 37865 }, { "epoch": 6.9498990640484495, "grad_norm": 0.7338463068008423, "learning_rate": 8.247428996696584e-06, "loss": 0.3166, "num_input_tokens_seen": 81660592, "step": 37870 }, { "epoch": 6.950816663608001, "grad_norm": 1.0061116218566895, "learning_rate": 8.24682008071928e-06, "loss": 0.2358, "num_input_tokens_seen": 81670960, "step": 37875 }, { "epoch": 6.951734263167554, "grad_norm": 6.550140857696533, "learning_rate": 8.246211081466073e-06, "loss": 0.2782, "num_input_tokens_seen": 81681296, "step": 37880 }, { "epoch": 6.952651862727106, "grad_norm": 1.6886547803878784, "learning_rate": 8.245601998952583e-06, "loss": 0.2592, "num_input_tokens_seen": 81690992, "step": 37885 }, { "epoch": 6.953569462286658, "grad_norm": 7.982189655303955, "learning_rate": 8.244992833194431e-06, "loss": 0.2004, "num_input_tokens_seen": 81702160, "step": 37890 }, { "epoch": 6.95448706184621, "grad_norm": 6.293023586273193, "learning_rate": 8.244383584207244e-06, "loss": 0.2865, "num_input_tokens_seen": 81713136, "step": 37895 }, { "epoch": 6.955404661405763, "grad_norm": 18.428903579711914, "learning_rate": 8.243774252006643e-06, "loss": 0.2361, "num_input_tokens_seen": 81724144, "step": 37900 }, { "epoch": 6.956322260965314, "grad_norm": 15.110496520996094, "learning_rate": 8.243164836608261e-06, "loss": 0.2851, "num_input_tokens_seen": 81734352, "step": 37905 }, { "epoch": 6.957239860524867, "grad_norm": 23.416776657104492, "learning_rate": 8.242555338027729e-06, "loss": 0.1941, "num_input_tokens_seen": 81744848, "step": 37910 }, { "epoch": 6.958157460084419, "grad_norm": 4.149850368499756, "learning_rate": 8.241945756280676e-06, "loss": 0.3572, "num_input_tokens_seen": 81755408, "step": 37915 }, { "epoch": 6.959075059643971, "grad_norm": 4.351869106292725, "learning_rate": 8.241336091382741e-06, "loss": 0.2868, "num_input_tokens_seen": 81766704, "step": 37920 }, { "epoch": 6.9599926592035235, "grad_norm": 17.258569717407227, "learning_rate": 8.240726343349559e-06, "loss": 0.2967, "num_input_tokens_seen": 81777872, "step": 37925 }, { "epoch": 6.960910258763076, "grad_norm": 1.0768566131591797, "learning_rate": 8.240116512196767e-06, "loss": 0.175, "num_input_tokens_seen": 81788080, "step": 37930 }, { "epoch": 6.961827858322628, "grad_norm": 1.5474187135696411, "learning_rate": 8.23950659794001e-06, "loss": 0.3166, "num_input_tokens_seen": 81799472, "step": 37935 }, { "epoch": 6.96274545788218, "grad_norm": 17.782485961914062, "learning_rate": 8.238896600594928e-06, "loss": 0.2748, "num_input_tokens_seen": 81809328, "step": 37940 }, { "epoch": 6.963663057441733, "grad_norm": 9.334488868713379, "learning_rate": 8.23828652017717e-06, "loss": 0.3677, "num_input_tokens_seen": 81820528, "step": 37945 }, { "epoch": 6.964580657001284, "grad_norm": 11.060689926147461, "learning_rate": 8.23767635670238e-06, "loss": 0.2426, "num_input_tokens_seen": 81831408, "step": 37950 }, { "epoch": 6.965498256560837, "grad_norm": 13.397388458251953, "learning_rate": 8.23706611018621e-06, "loss": 0.2906, "num_input_tokens_seen": 81842736, "step": 37955 }, { "epoch": 6.966415856120389, "grad_norm": 16.344640731811523, "learning_rate": 8.23645578064431e-06, "loss": 0.2458, "num_input_tokens_seen": 81854256, "step": 37960 }, { "epoch": 6.967333455679941, "grad_norm": 8.346494674682617, "learning_rate": 8.235845368092336e-06, "loss": 0.2767, "num_input_tokens_seen": 81864816, "step": 37965 }, { "epoch": 6.968251055239493, "grad_norm": 30.966310501098633, "learning_rate": 8.235234872545946e-06, "loss": 0.1226, "num_input_tokens_seen": 81875024, "step": 37970 }, { "epoch": 6.969168654799046, "grad_norm": 5.8606061935424805, "learning_rate": 8.234624294020792e-06, "loss": 0.2178, "num_input_tokens_seen": 81886768, "step": 37975 }, { "epoch": 6.9700862543585975, "grad_norm": 8.28218936920166, "learning_rate": 8.23401363253254e-06, "loss": 0.1475, "num_input_tokens_seen": 81897360, "step": 37980 }, { "epoch": 6.97100385391815, "grad_norm": 5.052970886230469, "learning_rate": 8.23340288809685e-06, "loss": 0.2521, "num_input_tokens_seen": 81909232, "step": 37985 }, { "epoch": 6.9719214534777025, "grad_norm": 14.706514358520508, "learning_rate": 8.232792060729386e-06, "loss": 0.3735, "num_input_tokens_seen": 81921584, "step": 37990 }, { "epoch": 6.972839053037255, "grad_norm": 17.35789680480957, "learning_rate": 8.23218115044582e-06, "loss": 0.3425, "num_input_tokens_seen": 81932816, "step": 37995 }, { "epoch": 6.973756652596807, "grad_norm": 17.753828048706055, "learning_rate": 8.231570157261813e-06, "loss": 0.2033, "num_input_tokens_seen": 81944528, "step": 38000 }, { "epoch": 6.974674252156359, "grad_norm": 21.57781982421875, "learning_rate": 8.230959081193042e-06, "loss": 0.2402, "num_input_tokens_seen": 81955888, "step": 38005 }, { "epoch": 6.975591851715912, "grad_norm": 12.938467979431152, "learning_rate": 8.230347922255177e-06, "loss": 0.3442, "num_input_tokens_seen": 81967056, "step": 38010 }, { "epoch": 6.976509451275463, "grad_norm": 2.130842685699463, "learning_rate": 8.229736680463893e-06, "loss": 0.2178, "num_input_tokens_seen": 81977424, "step": 38015 }, { "epoch": 6.977427050835016, "grad_norm": 22.4941463470459, "learning_rate": 8.229125355834872e-06, "loss": 0.2621, "num_input_tokens_seen": 81988624, "step": 38020 }, { "epoch": 6.978344650394568, "grad_norm": 3.07096266746521, "learning_rate": 8.22851394838379e-06, "loss": 0.2604, "num_input_tokens_seen": 81999888, "step": 38025 }, { "epoch": 6.97926224995412, "grad_norm": 43.0995979309082, "learning_rate": 8.227902458126326e-06, "loss": 0.1655, "num_input_tokens_seen": 82010032, "step": 38030 }, { "epoch": 6.980179849513672, "grad_norm": 6.325157165527344, "learning_rate": 8.22729088507817e-06, "loss": 0.1962, "num_input_tokens_seen": 82019184, "step": 38035 }, { "epoch": 6.981097449073225, "grad_norm": 14.512238502502441, "learning_rate": 8.226679229255001e-06, "loss": 0.3813, "num_input_tokens_seen": 82030256, "step": 38040 }, { "epoch": 6.9820150486327766, "grad_norm": 20.5076961517334, "learning_rate": 8.226067490672514e-06, "loss": 0.3757, "num_input_tokens_seen": 82041968, "step": 38045 }, { "epoch": 6.982932648192329, "grad_norm": 2.5894558429718018, "learning_rate": 8.225455669346394e-06, "loss": 0.3466, "num_input_tokens_seen": 82053456, "step": 38050 }, { "epoch": 6.983850247751882, "grad_norm": 25.254226684570312, "learning_rate": 8.224843765292335e-06, "loss": 0.1654, "num_input_tokens_seen": 82064592, "step": 38055 }, { "epoch": 6.984767847311433, "grad_norm": 10.903646469116211, "learning_rate": 8.224231778526034e-06, "loss": 0.1973, "num_input_tokens_seen": 82074480, "step": 38060 }, { "epoch": 6.985685446870986, "grad_norm": 0.621453583240509, "learning_rate": 8.223619709063182e-06, "loss": 0.2077, "num_input_tokens_seen": 82086032, "step": 38065 }, { "epoch": 6.986603046430538, "grad_norm": 0.6585515737533569, "learning_rate": 8.223007556919482e-06, "loss": 0.1623, "num_input_tokens_seen": 82096752, "step": 38070 }, { "epoch": 6.98752064599009, "grad_norm": 16.207109451293945, "learning_rate": 8.222395322110634e-06, "loss": 0.2589, "num_input_tokens_seen": 82107568, "step": 38075 }, { "epoch": 6.988438245549642, "grad_norm": 23.674875259399414, "learning_rate": 8.22178300465234e-06, "loss": 0.3027, "num_input_tokens_seen": 82118992, "step": 38080 }, { "epoch": 6.989355845109195, "grad_norm": 1.8471423387527466, "learning_rate": 8.221170604560305e-06, "loss": 0.2168, "num_input_tokens_seen": 82128080, "step": 38085 }, { "epoch": 6.9902734446687464, "grad_norm": 3.6379470825195312, "learning_rate": 8.220558121850235e-06, "loss": 0.3694, "num_input_tokens_seen": 82138992, "step": 38090 }, { "epoch": 6.991191044228299, "grad_norm": 9.257548332214355, "learning_rate": 8.219945556537842e-06, "loss": 0.1673, "num_input_tokens_seen": 82151280, "step": 38095 }, { "epoch": 6.9921086437878515, "grad_norm": 3.543215751647949, "learning_rate": 8.219332908638835e-06, "loss": 0.1482, "num_input_tokens_seen": 82162032, "step": 38100 }, { "epoch": 6.993026243347403, "grad_norm": 9.866215705871582, "learning_rate": 8.21872017816893e-06, "loss": 0.1862, "num_input_tokens_seen": 82173424, "step": 38105 }, { "epoch": 6.993943842906956, "grad_norm": 12.42190170288086, "learning_rate": 8.21810736514384e-06, "loss": 0.3189, "num_input_tokens_seen": 82184112, "step": 38110 }, { "epoch": 6.994861442466508, "grad_norm": 0.5199564695358276, "learning_rate": 8.217494469579283e-06, "loss": 0.2718, "num_input_tokens_seen": 82195216, "step": 38115 }, { "epoch": 6.99577904202606, "grad_norm": 4.946572780609131, "learning_rate": 8.21688149149098e-06, "loss": 0.1966, "num_input_tokens_seen": 82206768, "step": 38120 }, { "epoch": 6.996696641585612, "grad_norm": 13.492544174194336, "learning_rate": 8.216268430894651e-06, "loss": 0.3203, "num_input_tokens_seen": 82217744, "step": 38125 }, { "epoch": 6.997614241145165, "grad_norm": 4.636890888214111, "learning_rate": 8.215655287806024e-06, "loss": 0.131, "num_input_tokens_seen": 82228592, "step": 38130 }, { "epoch": 6.998531840704716, "grad_norm": 1.8132169246673584, "learning_rate": 8.215042062240823e-06, "loss": 0.209, "num_input_tokens_seen": 82240112, "step": 38135 }, { "epoch": 6.999449440264269, "grad_norm": 15.165705680847168, "learning_rate": 8.214428754214774e-06, "loss": 0.3391, "num_input_tokens_seen": 82251248, "step": 38140 }, { "epoch": 7.0003670398238205, "grad_norm": 10.049514770507812, "learning_rate": 8.213815363743612e-06, "loss": 0.2716, "num_input_tokens_seen": 82261104, "step": 38145 }, { "epoch": 7.001284639383373, "grad_norm": 1.2826340198516846, "learning_rate": 8.213201890843064e-06, "loss": 0.1191, "num_input_tokens_seen": 82271024, "step": 38150 }, { "epoch": 7.0022022389429255, "grad_norm": 5.256178379058838, "learning_rate": 8.212588335528868e-06, "loss": 0.3532, "num_input_tokens_seen": 82282480, "step": 38155 }, { "epoch": 7.003119838502477, "grad_norm": 20.371334075927734, "learning_rate": 8.21197469781676e-06, "loss": 0.2483, "num_input_tokens_seen": 82294800, "step": 38160 }, { "epoch": 7.00403743806203, "grad_norm": 5.578940391540527, "learning_rate": 8.211360977722482e-06, "loss": 0.1051, "num_input_tokens_seen": 82305936, "step": 38165 }, { "epoch": 7.004955037621582, "grad_norm": 6.895158290863037, "learning_rate": 8.210747175261768e-06, "loss": 0.1727, "num_input_tokens_seen": 82315536, "step": 38170 }, { "epoch": 7.005872637181134, "grad_norm": 20.604473114013672, "learning_rate": 8.210133290450369e-06, "loss": 0.199, "num_input_tokens_seen": 82327024, "step": 38175 }, { "epoch": 7.006790236740686, "grad_norm": 10.146439552307129, "learning_rate": 8.209519323304025e-06, "loss": 0.3133, "num_input_tokens_seen": 82337136, "step": 38180 }, { "epoch": 7.007707836300239, "grad_norm": 6.6734395027160645, "learning_rate": 8.208905273838483e-06, "loss": 0.2948, "num_input_tokens_seen": 82346352, "step": 38185 }, { "epoch": 7.00862543585979, "grad_norm": 0.7950156927108765, "learning_rate": 8.208291142069495e-06, "loss": 0.2396, "num_input_tokens_seen": 82357488, "step": 38190 }, { "epoch": 7.009543035419343, "grad_norm": 10.94304084777832, "learning_rate": 8.207676928012813e-06, "loss": 0.2787, "num_input_tokens_seen": 82368048, "step": 38195 }, { "epoch": 7.010460634978895, "grad_norm": 16.396190643310547, "learning_rate": 8.207062631684186e-06, "loss": 0.1443, "num_input_tokens_seen": 82379984, "step": 38200 }, { "epoch": 7.011378234538447, "grad_norm": 22.50387191772461, "learning_rate": 8.206448253099377e-06, "loss": 0.2575, "num_input_tokens_seen": 82389488, "step": 38205 }, { "epoch": 7.0122958340979995, "grad_norm": 10.122906684875488, "learning_rate": 8.205833792274136e-06, "loss": 0.1694, "num_input_tokens_seen": 82400240, "step": 38210 }, { "epoch": 7.013213433657552, "grad_norm": 1.0934609174728394, "learning_rate": 8.20521924922423e-06, "loss": 0.1544, "num_input_tokens_seen": 82411760, "step": 38215 }, { "epoch": 7.014131033217104, "grad_norm": 0.13620750606060028, "learning_rate": 8.204604623965417e-06, "loss": 0.0578, "num_input_tokens_seen": 82421584, "step": 38220 }, { "epoch": 7.015048632776656, "grad_norm": 6.044722557067871, "learning_rate": 8.203989916513462e-06, "loss": 0.2352, "num_input_tokens_seen": 82432400, "step": 38225 }, { "epoch": 7.015966232336209, "grad_norm": 7.06267786026001, "learning_rate": 8.20337512688413e-06, "loss": 0.0476, "num_input_tokens_seen": 82442512, "step": 38230 }, { "epoch": 7.01688383189576, "grad_norm": 0.6123321652412415, "learning_rate": 8.202760255093192e-06, "loss": 0.0946, "num_input_tokens_seen": 82452464, "step": 38235 }, { "epoch": 7.017801431455313, "grad_norm": 5.994657516479492, "learning_rate": 8.202145301156417e-06, "loss": 0.2552, "num_input_tokens_seen": 82463600, "step": 38240 }, { "epoch": 7.018719031014865, "grad_norm": 7.005747318267822, "learning_rate": 8.201530265089579e-06, "loss": 0.0154, "num_input_tokens_seen": 82475024, "step": 38245 }, { "epoch": 7.019636630574417, "grad_norm": 11.061650276184082, "learning_rate": 8.20091514690845e-06, "loss": 0.4259, "num_input_tokens_seen": 82485456, "step": 38250 }, { "epoch": 7.020554230133969, "grad_norm": 34.2844352722168, "learning_rate": 8.20029994662881e-06, "loss": 0.0829, "num_input_tokens_seen": 82496816, "step": 38255 }, { "epoch": 7.021471829693522, "grad_norm": 17.59709358215332, "learning_rate": 8.199684664266436e-06, "loss": 0.2665, "num_input_tokens_seen": 82508048, "step": 38260 }, { "epoch": 7.0223894292530735, "grad_norm": 5.280510902404785, "learning_rate": 8.199069299837108e-06, "loss": 0.3682, "num_input_tokens_seen": 82518928, "step": 38265 }, { "epoch": 7.023307028812626, "grad_norm": 40.299171447753906, "learning_rate": 8.198453853356612e-06, "loss": 0.4015, "num_input_tokens_seen": 82528560, "step": 38270 }, { "epoch": 7.0242246283721785, "grad_norm": 11.02038860321045, "learning_rate": 8.19783832484073e-06, "loss": 0.0689, "num_input_tokens_seen": 82539344, "step": 38275 }, { "epoch": 7.02514222793173, "grad_norm": 9.674540519714355, "learning_rate": 8.197222714305253e-06, "loss": 0.1963, "num_input_tokens_seen": 82550032, "step": 38280 }, { "epoch": 7.026059827491283, "grad_norm": 0.2407209426164627, "learning_rate": 8.196607021765968e-06, "loss": 0.0569, "num_input_tokens_seen": 82561232, "step": 38285 }, { "epoch": 7.026977427050835, "grad_norm": 2.9031081199645996, "learning_rate": 8.195991247238668e-06, "loss": 0.334, "num_input_tokens_seen": 82571792, "step": 38290 }, { "epoch": 7.027895026610387, "grad_norm": 1.4256929159164429, "learning_rate": 8.195375390739146e-06, "loss": 0.2813, "num_input_tokens_seen": 82581136, "step": 38295 }, { "epoch": 7.028812626169939, "grad_norm": 58.121524810791016, "learning_rate": 8.194759452283196e-06, "loss": 0.194, "num_input_tokens_seen": 82592016, "step": 38300 }, { "epoch": 7.029730225729492, "grad_norm": 15.121862411499023, "learning_rate": 8.194143431886619e-06, "loss": 0.0863, "num_input_tokens_seen": 82603824, "step": 38305 }, { "epoch": 7.030647825289044, "grad_norm": 25.717395782470703, "learning_rate": 8.193527329565211e-06, "loss": 0.3372, "num_input_tokens_seen": 82613776, "step": 38310 }, { "epoch": 7.031565424848596, "grad_norm": 0.47613540291786194, "learning_rate": 8.19291114533478e-06, "loss": 0.2972, "num_input_tokens_seen": 82624400, "step": 38315 }, { "epoch": 7.032483024408148, "grad_norm": 0.45621684193611145, "learning_rate": 8.192294879211124e-06, "loss": 0.3368, "num_input_tokens_seen": 82635440, "step": 38320 }, { "epoch": 7.033400623967701, "grad_norm": 15.526268005371094, "learning_rate": 8.191678531210055e-06, "loss": 0.2634, "num_input_tokens_seen": 82645584, "step": 38325 }, { "epoch": 7.034318223527253, "grad_norm": 1.1378618478775024, "learning_rate": 8.191062101347375e-06, "loss": 0.5028, "num_input_tokens_seen": 82655888, "step": 38330 }, { "epoch": 7.035235823086805, "grad_norm": 3.8189635276794434, "learning_rate": 8.190445589638898e-06, "loss": 0.2143, "num_input_tokens_seen": 82665552, "step": 38335 }, { "epoch": 7.036153422646358, "grad_norm": 0.8203499913215637, "learning_rate": 8.189828996100437e-06, "loss": 0.1557, "num_input_tokens_seen": 82677264, "step": 38340 }, { "epoch": 7.037071022205909, "grad_norm": 18.075658798217773, "learning_rate": 8.189212320747807e-06, "loss": 0.2528, "num_input_tokens_seen": 82687568, "step": 38345 }, { "epoch": 7.037988621765462, "grad_norm": 27.283851623535156, "learning_rate": 8.188595563596824e-06, "loss": 0.5398, "num_input_tokens_seen": 82698256, "step": 38350 }, { "epoch": 7.038906221325014, "grad_norm": 10.72409725189209, "learning_rate": 8.187978724663305e-06, "loss": 0.6137, "num_input_tokens_seen": 82708912, "step": 38355 }, { "epoch": 7.039823820884566, "grad_norm": 5.942599296569824, "learning_rate": 8.187361803963074e-06, "loss": 0.2911, "num_input_tokens_seen": 82718960, "step": 38360 }, { "epoch": 7.040741420444118, "grad_norm": 32.858211517333984, "learning_rate": 8.186744801511953e-06, "loss": 0.1171, "num_input_tokens_seen": 82729776, "step": 38365 }, { "epoch": 7.041659020003671, "grad_norm": 0.9245325326919556, "learning_rate": 8.186127717325765e-06, "loss": 0.0192, "num_input_tokens_seen": 82738000, "step": 38370 }, { "epoch": 7.0425766195632225, "grad_norm": 34.41877365112305, "learning_rate": 8.185510551420341e-06, "loss": 0.211, "num_input_tokens_seen": 82747952, "step": 38375 }, { "epoch": 7.043494219122775, "grad_norm": 12.529680252075195, "learning_rate": 8.184893303811507e-06, "loss": 0.1462, "num_input_tokens_seen": 82758224, "step": 38380 }, { "epoch": 7.0444118186823275, "grad_norm": 2.761751651763916, "learning_rate": 8.184275974515096e-06, "loss": 0.0978, "num_input_tokens_seen": 82769136, "step": 38385 }, { "epoch": 7.045329418241879, "grad_norm": 20.620689392089844, "learning_rate": 8.183658563546942e-06, "loss": 0.2024, "num_input_tokens_seen": 82780080, "step": 38390 }, { "epoch": 7.046247017801432, "grad_norm": 0.34686774015426636, "learning_rate": 8.18304107092288e-06, "loss": 0.1382, "num_input_tokens_seen": 82790768, "step": 38395 }, { "epoch": 7.047164617360984, "grad_norm": 14.501688003540039, "learning_rate": 8.182423496658749e-06, "loss": 0.1986, "num_input_tokens_seen": 82802512, "step": 38400 }, { "epoch": 7.048082216920536, "grad_norm": 65.33301544189453, "learning_rate": 8.181805840770386e-06, "loss": 0.4219, "num_input_tokens_seen": 82812400, "step": 38405 }, { "epoch": 7.048999816480088, "grad_norm": 22.371597290039062, "learning_rate": 8.181188103273634e-06, "loss": 0.0516, "num_input_tokens_seen": 82822928, "step": 38410 }, { "epoch": 7.049917416039641, "grad_norm": 24.554859161376953, "learning_rate": 8.18057028418434e-06, "loss": 0.4053, "num_input_tokens_seen": 82834480, "step": 38415 }, { "epoch": 7.050835015599192, "grad_norm": 45.44984817504883, "learning_rate": 8.179952383518346e-06, "loss": 0.3327, "num_input_tokens_seen": 82846672, "step": 38420 }, { "epoch": 7.051752615158745, "grad_norm": 46.72211456298828, "learning_rate": 8.1793344012915e-06, "loss": 0.3436, "num_input_tokens_seen": 82858384, "step": 38425 }, { "epoch": 7.052670214718297, "grad_norm": 27.097021102905273, "learning_rate": 8.178716337519657e-06, "loss": 0.2878, "num_input_tokens_seen": 82870032, "step": 38430 }, { "epoch": 7.053587814277849, "grad_norm": 40.559383392333984, "learning_rate": 8.178098192218666e-06, "loss": 0.2261, "num_input_tokens_seen": 82881648, "step": 38435 }, { "epoch": 7.0545054138374015, "grad_norm": 0.08850899338722229, "learning_rate": 8.177479965404382e-06, "loss": 0.1707, "num_input_tokens_seen": 82892816, "step": 38440 }, { "epoch": 7.055423013396954, "grad_norm": 14.239452362060547, "learning_rate": 8.176861657092661e-06, "loss": 0.3151, "num_input_tokens_seen": 82903568, "step": 38445 }, { "epoch": 7.056340612956506, "grad_norm": 1.6589393615722656, "learning_rate": 8.176243267299362e-06, "loss": 0.1836, "num_input_tokens_seen": 82911824, "step": 38450 }, { "epoch": 7.057258212516058, "grad_norm": 7.203428745269775, "learning_rate": 8.175624796040347e-06, "loss": 0.2963, "num_input_tokens_seen": 82921968, "step": 38455 }, { "epoch": 7.058175812075611, "grad_norm": 0.2526934742927551, "learning_rate": 8.175006243331477e-06, "loss": 0.0979, "num_input_tokens_seen": 82932848, "step": 38460 }, { "epoch": 7.059093411635162, "grad_norm": 15.761656761169434, "learning_rate": 8.174387609188618e-06, "loss": 0.1969, "num_input_tokens_seen": 82943152, "step": 38465 }, { "epoch": 7.060011011194715, "grad_norm": 76.52474212646484, "learning_rate": 8.173768893627635e-06, "loss": 0.3943, "num_input_tokens_seen": 82955376, "step": 38470 }, { "epoch": 7.060928610754267, "grad_norm": 55.614906311035156, "learning_rate": 8.173150096664401e-06, "loss": 0.2906, "num_input_tokens_seen": 82966512, "step": 38475 }, { "epoch": 7.061846210313819, "grad_norm": 30.297460556030273, "learning_rate": 8.172531218314783e-06, "loss": 0.2226, "num_input_tokens_seen": 82977200, "step": 38480 }, { "epoch": 7.062763809873371, "grad_norm": 9.906913757324219, "learning_rate": 8.17191225859466e-06, "loss": 0.1623, "num_input_tokens_seen": 82987344, "step": 38485 }, { "epoch": 7.063681409432924, "grad_norm": 5.945491313934326, "learning_rate": 8.171293217519899e-06, "loss": 0.1489, "num_input_tokens_seen": 82997488, "step": 38490 }, { "epoch": 7.0645990089924755, "grad_norm": 17.148788452148438, "learning_rate": 8.170674095106384e-06, "loss": 0.5818, "num_input_tokens_seen": 83007696, "step": 38495 }, { "epoch": 7.065516608552028, "grad_norm": 20.333206176757812, "learning_rate": 8.170054891369991e-06, "loss": 0.3474, "num_input_tokens_seen": 83018448, "step": 38500 }, { "epoch": 7.0664342081115805, "grad_norm": 80.3776626586914, "learning_rate": 8.169435606326605e-06, "loss": 0.3869, "num_input_tokens_seen": 83029072, "step": 38505 }, { "epoch": 7.067351807671132, "grad_norm": 45.0032958984375, "learning_rate": 8.168816239992109e-06, "loss": 0.348, "num_input_tokens_seen": 83041040, "step": 38510 }, { "epoch": 7.068269407230685, "grad_norm": 10.217435836791992, "learning_rate": 8.168196792382386e-06, "loss": 0.3344, "num_input_tokens_seen": 83051312, "step": 38515 }, { "epoch": 7.069187006790237, "grad_norm": 16.096166610717773, "learning_rate": 8.167577263513325e-06, "loss": 0.3151, "num_input_tokens_seen": 83063312, "step": 38520 }, { "epoch": 7.070104606349789, "grad_norm": 15.288174629211426, "learning_rate": 8.166957653400818e-06, "loss": 0.2145, "num_input_tokens_seen": 83074608, "step": 38525 }, { "epoch": 7.071022205909341, "grad_norm": 2.0785670280456543, "learning_rate": 8.166337962060755e-06, "loss": 0.1849, "num_input_tokens_seen": 83085232, "step": 38530 }, { "epoch": 7.071939805468894, "grad_norm": 17.173442840576172, "learning_rate": 8.16571818950903e-06, "loss": 0.269, "num_input_tokens_seen": 83097328, "step": 38535 }, { "epoch": 7.072857405028445, "grad_norm": 1.7298717498779297, "learning_rate": 8.165098335761541e-06, "loss": 0.1154, "num_input_tokens_seen": 83106928, "step": 38540 }, { "epoch": 7.073775004587998, "grad_norm": 0.827531635761261, "learning_rate": 8.164478400834184e-06, "loss": 0.1639, "num_input_tokens_seen": 83116656, "step": 38545 }, { "epoch": 7.07469260414755, "grad_norm": 12.302451133728027, "learning_rate": 8.16385838474286e-06, "loss": 0.3581, "num_input_tokens_seen": 83125232, "step": 38550 }, { "epoch": 7.075610203707102, "grad_norm": 31.346879959106445, "learning_rate": 8.163238287503473e-06, "loss": 0.4737, "num_input_tokens_seen": 83135888, "step": 38555 }, { "epoch": 7.0765278032666545, "grad_norm": 9.82655143737793, "learning_rate": 8.162618109131928e-06, "loss": 0.2392, "num_input_tokens_seen": 83146224, "step": 38560 }, { "epoch": 7.077445402826207, "grad_norm": 2.671600580215454, "learning_rate": 8.161997849644127e-06, "loss": 0.201, "num_input_tokens_seen": 83157168, "step": 38565 }, { "epoch": 7.078363002385759, "grad_norm": 22.462854385375977, "learning_rate": 8.161377509055983e-06, "loss": 0.107, "num_input_tokens_seen": 83166480, "step": 38570 }, { "epoch": 7.079280601945311, "grad_norm": 23.225187301635742, "learning_rate": 8.160757087383406e-06, "loss": 0.1519, "num_input_tokens_seen": 83178096, "step": 38575 }, { "epoch": 7.080198201504864, "grad_norm": 2.9220786094665527, "learning_rate": 8.160136584642308e-06, "loss": 0.1275, "num_input_tokens_seen": 83189008, "step": 38580 }, { "epoch": 7.081115801064415, "grad_norm": 3.2980685234069824, "learning_rate": 8.159516000848606e-06, "loss": 0.2256, "num_input_tokens_seen": 83200048, "step": 38585 }, { "epoch": 7.082033400623968, "grad_norm": 3.780518054962158, "learning_rate": 8.158895336018213e-06, "loss": 0.3199, "num_input_tokens_seen": 83209552, "step": 38590 }, { "epoch": 7.08295100018352, "grad_norm": 22.02280044555664, "learning_rate": 8.158274590167052e-06, "loss": 0.493, "num_input_tokens_seen": 83219888, "step": 38595 }, { "epoch": 7.083868599743072, "grad_norm": 14.79416561126709, "learning_rate": 8.157653763311041e-06, "loss": 0.4127, "num_input_tokens_seen": 83231088, "step": 38600 }, { "epoch": 7.084786199302624, "grad_norm": 12.639470100402832, "learning_rate": 8.157032855466106e-06, "loss": 0.211, "num_input_tokens_seen": 83242800, "step": 38605 }, { "epoch": 7.085703798862177, "grad_norm": 16.67135238647461, "learning_rate": 8.156411866648172e-06, "loss": 0.2535, "num_input_tokens_seen": 83254448, "step": 38610 }, { "epoch": 7.086621398421729, "grad_norm": 13.36862564086914, "learning_rate": 8.155790796873167e-06, "loss": 0.4973, "num_input_tokens_seen": 83265456, "step": 38615 }, { "epoch": 7.087538997981281, "grad_norm": 26.356958389282227, "learning_rate": 8.155169646157017e-06, "loss": 0.2479, "num_input_tokens_seen": 83277200, "step": 38620 }, { "epoch": 7.088456597540834, "grad_norm": 5.786741256713867, "learning_rate": 8.154548414515655e-06, "loss": 0.2205, "num_input_tokens_seen": 83288240, "step": 38625 }, { "epoch": 7.089374197100385, "grad_norm": 1.432823657989502, "learning_rate": 8.153927101965015e-06, "loss": 0.4842, "num_input_tokens_seen": 83298672, "step": 38630 }, { "epoch": 7.090291796659938, "grad_norm": 12.302202224731445, "learning_rate": 8.153305708521035e-06, "loss": 0.1251, "num_input_tokens_seen": 83310160, "step": 38635 }, { "epoch": 7.09120939621949, "grad_norm": 0.7207170128822327, "learning_rate": 8.15268423419965e-06, "loss": 0.1365, "num_input_tokens_seen": 83320944, "step": 38640 }, { "epoch": 7.092126995779042, "grad_norm": 1.0148595571517944, "learning_rate": 8.1520626790168e-06, "loss": 0.1453, "num_input_tokens_seen": 83330288, "step": 38645 }, { "epoch": 7.093044595338594, "grad_norm": 22.115478515625, "learning_rate": 8.151441042988428e-06, "loss": 0.2008, "num_input_tokens_seen": 83340880, "step": 38650 }, { "epoch": 7.093962194898147, "grad_norm": 23.89981460571289, "learning_rate": 8.150819326130477e-06, "loss": 0.312, "num_input_tokens_seen": 83351984, "step": 38655 }, { "epoch": 7.0948797944576985, "grad_norm": 3.1262130737304688, "learning_rate": 8.150197528458894e-06, "loss": 0.1139, "num_input_tokens_seen": 83363152, "step": 38660 }, { "epoch": 7.095797394017251, "grad_norm": 0.789393961429596, "learning_rate": 8.149575649989627e-06, "loss": 0.1086, "num_input_tokens_seen": 83373456, "step": 38665 }, { "epoch": 7.0967149935768035, "grad_norm": 1.9257464408874512, "learning_rate": 8.148953690738625e-06, "loss": 0.1346, "num_input_tokens_seen": 83383504, "step": 38670 }, { "epoch": 7.097632593136355, "grad_norm": 19.258268356323242, "learning_rate": 8.148331650721843e-06, "loss": 0.2526, "num_input_tokens_seen": 83394288, "step": 38675 }, { "epoch": 7.098550192695908, "grad_norm": 2.529149055480957, "learning_rate": 8.147709529955233e-06, "loss": 0.2726, "num_input_tokens_seen": 83405264, "step": 38680 }, { "epoch": 7.09946779225546, "grad_norm": 38.20964431762695, "learning_rate": 8.14708732845475e-06, "loss": 0.3061, "num_input_tokens_seen": 83415440, "step": 38685 }, { "epoch": 7.100385391815012, "grad_norm": 25.271854400634766, "learning_rate": 8.146465046236357e-06, "loss": 0.256, "num_input_tokens_seen": 83425200, "step": 38690 }, { "epoch": 7.101302991374564, "grad_norm": 7.342050552368164, "learning_rate": 8.145842683316013e-06, "loss": 0.1384, "num_input_tokens_seen": 83436944, "step": 38695 }, { "epoch": 7.102220590934117, "grad_norm": 13.78803825378418, "learning_rate": 8.145220239709676e-06, "loss": 0.4431, "num_input_tokens_seen": 83448848, "step": 38700 }, { "epoch": 7.103138190493668, "grad_norm": 5.294398307800293, "learning_rate": 8.144597715433316e-06, "loss": 0.1673, "num_input_tokens_seen": 83460784, "step": 38705 }, { "epoch": 7.104055790053221, "grad_norm": 2.02719783782959, "learning_rate": 8.1439751105029e-06, "loss": 0.1389, "num_input_tokens_seen": 83471184, "step": 38710 }, { "epoch": 7.104973389612773, "grad_norm": 1.8540657758712769, "learning_rate": 8.143352424934394e-06, "loss": 0.2737, "num_input_tokens_seen": 83481520, "step": 38715 }, { "epoch": 7.105890989172325, "grad_norm": 4.062963485717773, "learning_rate": 8.142729658743771e-06, "loss": 0.1229, "num_input_tokens_seen": 83492816, "step": 38720 }, { "epoch": 7.1068085887318775, "grad_norm": 24.047948837280273, "learning_rate": 8.142106811947002e-06, "loss": 0.1332, "num_input_tokens_seen": 83504496, "step": 38725 }, { "epoch": 7.10772618829143, "grad_norm": 36.93534851074219, "learning_rate": 8.141483884560063e-06, "loss": 0.3964, "num_input_tokens_seen": 83515472, "step": 38730 }, { "epoch": 7.108643787850982, "grad_norm": 6.305418968200684, "learning_rate": 8.140860876598931e-06, "loss": 0.2316, "num_input_tokens_seen": 83526352, "step": 38735 }, { "epoch": 7.109561387410534, "grad_norm": 0.9253206253051758, "learning_rate": 8.140237788079586e-06, "loss": 0.2093, "num_input_tokens_seen": 83538096, "step": 38740 }, { "epoch": 7.110478986970087, "grad_norm": 1.8542872667312622, "learning_rate": 8.139614619018011e-06, "loss": 0.3111, "num_input_tokens_seen": 83549072, "step": 38745 }, { "epoch": 7.111396586529638, "grad_norm": 2.002469778060913, "learning_rate": 8.138991369430182e-06, "loss": 0.1803, "num_input_tokens_seen": 83560432, "step": 38750 }, { "epoch": 7.112314186089191, "grad_norm": 3.2420382499694824, "learning_rate": 8.138368039332092e-06, "loss": 0.2789, "num_input_tokens_seen": 83571920, "step": 38755 }, { "epoch": 7.113231785648743, "grad_norm": 10.646785736083984, "learning_rate": 8.137744628739726e-06, "loss": 0.1946, "num_input_tokens_seen": 83583312, "step": 38760 }, { "epoch": 7.114149385208295, "grad_norm": 38.55412673950195, "learning_rate": 8.137121137669072e-06, "loss": 0.2323, "num_input_tokens_seen": 83592368, "step": 38765 }, { "epoch": 7.115066984767847, "grad_norm": 1.982857584953308, "learning_rate": 8.136497566136126e-06, "loss": 0.225, "num_input_tokens_seen": 83603952, "step": 38770 }, { "epoch": 7.1159845843274, "grad_norm": 0.5721944570541382, "learning_rate": 8.135873914156875e-06, "loss": 0.278, "num_input_tokens_seen": 83613616, "step": 38775 }, { "epoch": 7.1169021838869515, "grad_norm": 0.4601473808288574, "learning_rate": 8.13525018174732e-06, "loss": 0.0557, "num_input_tokens_seen": 83623344, "step": 38780 }, { "epoch": 7.117819783446504, "grad_norm": 8.771303176879883, "learning_rate": 8.134626368923458e-06, "loss": 0.3432, "num_input_tokens_seen": 83634416, "step": 38785 }, { "epoch": 7.1187373830060565, "grad_norm": 37.22168731689453, "learning_rate": 8.134002475701287e-06, "loss": 0.2429, "num_input_tokens_seen": 83645520, "step": 38790 }, { "epoch": 7.119654982565608, "grad_norm": 17.834333419799805, "learning_rate": 8.13337850209681e-06, "loss": 0.2634, "num_input_tokens_seen": 83654064, "step": 38795 }, { "epoch": 7.120572582125161, "grad_norm": 45.65095138549805, "learning_rate": 8.13275444812603e-06, "loss": 0.1769, "num_input_tokens_seen": 83665680, "step": 38800 }, { "epoch": 7.121490181684713, "grad_norm": 49.2367057800293, "learning_rate": 8.132130313804953e-06, "loss": 0.3609, "num_input_tokens_seen": 83676336, "step": 38805 }, { "epoch": 7.122407781244265, "grad_norm": 23.542085647583008, "learning_rate": 8.131506099149589e-06, "loss": 0.2841, "num_input_tokens_seen": 83687504, "step": 38810 }, { "epoch": 7.123325380803817, "grad_norm": 22.096290588378906, "learning_rate": 8.130881804175948e-06, "loss": 0.2091, "num_input_tokens_seen": 83697904, "step": 38815 }, { "epoch": 7.12424298036337, "grad_norm": 56.92855453491211, "learning_rate": 8.130257428900039e-06, "loss": 0.2096, "num_input_tokens_seen": 83708080, "step": 38820 }, { "epoch": 7.125160579922921, "grad_norm": 13.890748977661133, "learning_rate": 8.129632973337879e-06, "loss": 0.2874, "num_input_tokens_seen": 83719888, "step": 38825 }, { "epoch": 7.126078179482474, "grad_norm": 0.6809630393981934, "learning_rate": 8.129008437505485e-06, "loss": 0.1714, "num_input_tokens_seen": 83730576, "step": 38830 }, { "epoch": 7.126995779042026, "grad_norm": 28.42023468017578, "learning_rate": 8.128383821418873e-06, "loss": 0.2866, "num_input_tokens_seen": 83741200, "step": 38835 }, { "epoch": 7.127913378601578, "grad_norm": 1.7364344596862793, "learning_rate": 8.127759125094064e-06, "loss": 0.4181, "num_input_tokens_seen": 83753680, "step": 38840 }, { "epoch": 7.1288309781611305, "grad_norm": 7.272566318511963, "learning_rate": 8.127134348547082e-06, "loss": 0.3287, "num_input_tokens_seen": 83763856, "step": 38845 }, { "epoch": 7.129748577720683, "grad_norm": 7.190592288970947, "learning_rate": 8.12650949179395e-06, "loss": 0.1288, "num_input_tokens_seen": 83775632, "step": 38850 }, { "epoch": 7.130666177280235, "grad_norm": 4.277838230133057, "learning_rate": 8.125884554850696e-06, "loss": 0.2235, "num_input_tokens_seen": 83785424, "step": 38855 }, { "epoch": 7.131583776839787, "grad_norm": 7.911693096160889, "learning_rate": 8.125259537733347e-06, "loss": 0.3492, "num_input_tokens_seen": 83796912, "step": 38860 }, { "epoch": 7.13250137639934, "grad_norm": 0.46516674757003784, "learning_rate": 8.124634440457935e-06, "loss": 0.1438, "num_input_tokens_seen": 83806704, "step": 38865 }, { "epoch": 7.133418975958891, "grad_norm": 6.101799011230469, "learning_rate": 8.124009263040491e-06, "loss": 0.3832, "num_input_tokens_seen": 83817648, "step": 38870 }, { "epoch": 7.134336575518444, "grad_norm": 42.454612731933594, "learning_rate": 8.123384005497053e-06, "loss": 0.5117, "num_input_tokens_seen": 83829424, "step": 38875 }, { "epoch": 7.135254175077996, "grad_norm": 0.639845073223114, "learning_rate": 8.122758667843655e-06, "loss": 0.2334, "num_input_tokens_seen": 83840016, "step": 38880 }, { "epoch": 7.136171774637548, "grad_norm": 37.65176010131836, "learning_rate": 8.122133250096337e-06, "loss": 0.1964, "num_input_tokens_seen": 83851952, "step": 38885 }, { "epoch": 7.1370893741971, "grad_norm": 5.286468982696533, "learning_rate": 8.121507752271142e-06, "loss": 0.2074, "num_input_tokens_seen": 83861680, "step": 38890 }, { "epoch": 7.138006973756653, "grad_norm": 15.245847702026367, "learning_rate": 8.120882174384109e-06, "loss": 0.4152, "num_input_tokens_seen": 83873168, "step": 38895 }, { "epoch": 7.138924573316205, "grad_norm": 35.7620849609375, "learning_rate": 8.120256516451286e-06, "loss": 0.5782, "num_input_tokens_seen": 83883504, "step": 38900 }, { "epoch": 7.139842172875757, "grad_norm": 25.185077667236328, "learning_rate": 8.119630778488718e-06, "loss": 0.5718, "num_input_tokens_seen": 83892336, "step": 38905 }, { "epoch": 7.14075977243531, "grad_norm": 1.2793209552764893, "learning_rate": 8.119004960512457e-06, "loss": 0.142, "num_input_tokens_seen": 83903536, "step": 38910 }, { "epoch": 7.141677371994861, "grad_norm": 5.5373992919921875, "learning_rate": 8.118379062538553e-06, "loss": 0.3067, "num_input_tokens_seen": 83914704, "step": 38915 }, { "epoch": 7.142594971554414, "grad_norm": 15.417181015014648, "learning_rate": 8.117753084583057e-06, "loss": 0.2603, "num_input_tokens_seen": 83925648, "step": 38920 }, { "epoch": 7.143512571113966, "grad_norm": 27.469348907470703, "learning_rate": 8.117127026662028e-06, "loss": 0.2271, "num_input_tokens_seen": 83937168, "step": 38925 }, { "epoch": 7.144430170673518, "grad_norm": 0.5007756352424622, "learning_rate": 8.116500888791523e-06, "loss": 0.2189, "num_input_tokens_seen": 83949552, "step": 38930 }, { "epoch": 7.14534777023307, "grad_norm": 35.63847351074219, "learning_rate": 8.115874670987598e-06, "loss": 0.3251, "num_input_tokens_seen": 83960048, "step": 38935 }, { "epoch": 7.146265369792623, "grad_norm": 6.423951148986816, "learning_rate": 8.11524837326632e-06, "loss": 0.2001, "num_input_tokens_seen": 83970832, "step": 38940 }, { "epoch": 7.1471829693521745, "grad_norm": 35.03335952758789, "learning_rate": 8.114621995643746e-06, "loss": 0.2148, "num_input_tokens_seen": 83982128, "step": 38945 }, { "epoch": 7.148100568911727, "grad_norm": 3.098123788833618, "learning_rate": 8.113995538135946e-06, "loss": 0.2752, "num_input_tokens_seen": 83992656, "step": 38950 }, { "epoch": 7.1490181684712795, "grad_norm": 4.025151252746582, "learning_rate": 8.113369000758988e-06, "loss": 0.2228, "num_input_tokens_seen": 84003600, "step": 38955 }, { "epoch": 7.149935768030831, "grad_norm": 1.011543869972229, "learning_rate": 8.112742383528939e-06, "loss": 0.1653, "num_input_tokens_seen": 84014192, "step": 38960 }, { "epoch": 7.150853367590384, "grad_norm": 1.1709778308868408, "learning_rate": 8.112115686461873e-06, "loss": 0.1658, "num_input_tokens_seen": 84024816, "step": 38965 }, { "epoch": 7.151770967149936, "grad_norm": 8.12764835357666, "learning_rate": 8.111488909573863e-06, "loss": 0.3318, "num_input_tokens_seen": 84035024, "step": 38970 }, { "epoch": 7.152688566709488, "grad_norm": 8.67572021484375, "learning_rate": 8.110862052880983e-06, "loss": 0.2027, "num_input_tokens_seen": 84046096, "step": 38975 }, { "epoch": 7.15360616626904, "grad_norm": 5.317435264587402, "learning_rate": 8.110235116399315e-06, "loss": 0.2686, "num_input_tokens_seen": 84057392, "step": 38980 }, { "epoch": 7.154523765828593, "grad_norm": 22.858108520507812, "learning_rate": 8.109608100144935e-06, "loss": 0.2813, "num_input_tokens_seen": 84068144, "step": 38985 }, { "epoch": 7.155441365388144, "grad_norm": 21.450719833374023, "learning_rate": 8.108981004133929e-06, "loss": 0.5373, "num_input_tokens_seen": 84078384, "step": 38990 }, { "epoch": 7.156358964947697, "grad_norm": 25.13863182067871, "learning_rate": 8.108353828382376e-06, "loss": 0.1273, "num_input_tokens_seen": 84089040, "step": 38995 }, { "epoch": 7.157276564507249, "grad_norm": 0.9205318689346313, "learning_rate": 8.107726572906366e-06, "loss": 0.1447, "num_input_tokens_seen": 84098768, "step": 39000 }, { "epoch": 7.158194164066801, "grad_norm": 0.7294172644615173, "learning_rate": 8.107099237721987e-06, "loss": 0.1635, "num_input_tokens_seen": 84108848, "step": 39005 }, { "epoch": 7.1591117636263535, "grad_norm": 13.658778190612793, "learning_rate": 8.106471822845327e-06, "loss": 0.2589, "num_input_tokens_seen": 84118640, "step": 39010 }, { "epoch": 7.160029363185906, "grad_norm": 20.951431274414062, "learning_rate": 8.10584432829248e-06, "loss": 0.4235, "num_input_tokens_seen": 84128912, "step": 39015 }, { "epoch": 7.160946962745458, "grad_norm": 39.60874557495117, "learning_rate": 8.105216754079538e-06, "loss": 0.2451, "num_input_tokens_seen": 84139440, "step": 39020 }, { "epoch": 7.16186456230501, "grad_norm": 5.144970893859863, "learning_rate": 8.1045891002226e-06, "loss": 0.4568, "num_input_tokens_seen": 84150640, "step": 39025 }, { "epoch": 7.162782161864563, "grad_norm": 24.996278762817383, "learning_rate": 8.103961366737765e-06, "loss": 0.3763, "num_input_tokens_seen": 84161168, "step": 39030 }, { "epoch": 7.163699761424114, "grad_norm": 0.49623262882232666, "learning_rate": 8.10333355364113e-06, "loss": 0.2152, "num_input_tokens_seen": 84171632, "step": 39035 }, { "epoch": 7.164617360983667, "grad_norm": 1.6446342468261719, "learning_rate": 8.102705660948799e-06, "loss": 0.1333, "num_input_tokens_seen": 84183152, "step": 39040 }, { "epoch": 7.165534960543219, "grad_norm": 0.8166462182998657, "learning_rate": 8.102077688676875e-06, "loss": 0.132, "num_input_tokens_seen": 84194992, "step": 39045 }, { "epoch": 7.166452560102771, "grad_norm": 0.5571412444114685, "learning_rate": 8.101449636841468e-06, "loss": 0.278, "num_input_tokens_seen": 84206096, "step": 39050 }, { "epoch": 7.167370159662323, "grad_norm": 19.986724853515625, "learning_rate": 8.100821505458684e-06, "loss": 0.2396, "num_input_tokens_seen": 84216688, "step": 39055 }, { "epoch": 7.168287759221876, "grad_norm": 11.145785331726074, "learning_rate": 8.100193294544637e-06, "loss": 0.3608, "num_input_tokens_seen": 84226672, "step": 39060 }, { "epoch": 7.1692053587814275, "grad_norm": 26.570764541625977, "learning_rate": 8.099565004115432e-06, "loss": 0.242, "num_input_tokens_seen": 84237264, "step": 39065 }, { "epoch": 7.17012295834098, "grad_norm": 14.035603523254395, "learning_rate": 8.098936634187193e-06, "loss": 0.318, "num_input_tokens_seen": 84247472, "step": 39070 }, { "epoch": 7.1710405579005325, "grad_norm": 16.13722038269043, "learning_rate": 8.09830818477603e-06, "loss": 0.1539, "num_input_tokens_seen": 84257488, "step": 39075 }, { "epoch": 7.171958157460084, "grad_norm": 15.504168510437012, "learning_rate": 8.097679655898063e-06, "loss": 0.1679, "num_input_tokens_seen": 84269392, "step": 39080 }, { "epoch": 7.172875757019637, "grad_norm": 9.94910717010498, "learning_rate": 8.097051047569416e-06, "loss": 0.2751, "num_input_tokens_seen": 84280304, "step": 39085 }, { "epoch": 7.173793356579189, "grad_norm": 0.5301753282546997, "learning_rate": 8.096422359806209e-06, "loss": 0.1911, "num_input_tokens_seen": 84292272, "step": 39090 }, { "epoch": 7.174710956138741, "grad_norm": 2.9318549633026123, "learning_rate": 8.095793592624566e-06, "loss": 0.3468, "num_input_tokens_seen": 84301808, "step": 39095 }, { "epoch": 7.175628555698293, "grad_norm": 1.0786672830581665, "learning_rate": 8.095164746040618e-06, "loss": 0.299, "num_input_tokens_seen": 84312272, "step": 39100 }, { "epoch": 7.176546155257846, "grad_norm": 26.784629821777344, "learning_rate": 8.094535820070488e-06, "loss": 0.1978, "num_input_tokens_seen": 84322640, "step": 39105 }, { "epoch": 7.177463754817397, "grad_norm": 5.179018020629883, "learning_rate": 8.093906814730313e-06, "loss": 0.3435, "num_input_tokens_seen": 84332848, "step": 39110 }, { "epoch": 7.17838135437695, "grad_norm": 0.696190595626831, "learning_rate": 8.093277730036221e-06, "loss": 0.2254, "num_input_tokens_seen": 84343472, "step": 39115 }, { "epoch": 7.179298953936502, "grad_norm": 2.4760661125183105, "learning_rate": 8.092648566004352e-06, "loss": 0.5338, "num_input_tokens_seen": 84354512, "step": 39120 }, { "epoch": 7.180216553496054, "grad_norm": 28.119365692138672, "learning_rate": 8.09201932265084e-06, "loss": 0.2632, "num_input_tokens_seen": 84365648, "step": 39125 }, { "epoch": 7.1811341530556065, "grad_norm": 2.4452402591705322, "learning_rate": 8.091389999991824e-06, "loss": 0.1717, "num_input_tokens_seen": 84376944, "step": 39130 }, { "epoch": 7.182051752615159, "grad_norm": 11.256343841552734, "learning_rate": 8.090760598043444e-06, "loss": 0.4629, "num_input_tokens_seen": 84387568, "step": 39135 }, { "epoch": 7.182969352174711, "grad_norm": 11.887953758239746, "learning_rate": 8.090131116821846e-06, "loss": 0.1939, "num_input_tokens_seen": 84396976, "step": 39140 }, { "epoch": 7.183886951734263, "grad_norm": 0.541348934173584, "learning_rate": 8.089501556343175e-06, "loss": 0.3258, "num_input_tokens_seen": 84406864, "step": 39145 }, { "epoch": 7.184804551293816, "grad_norm": 9.106996536254883, "learning_rate": 8.088871916623577e-06, "loss": 0.2517, "num_input_tokens_seen": 84418160, "step": 39150 }, { "epoch": 7.185722150853367, "grad_norm": 20.537546157836914, "learning_rate": 8.0882421976792e-06, "loss": 0.2493, "num_input_tokens_seen": 84428624, "step": 39155 }, { "epoch": 7.18663975041292, "grad_norm": 40.50090789794922, "learning_rate": 8.087612399526201e-06, "loss": 0.3073, "num_input_tokens_seen": 84439472, "step": 39160 }, { "epoch": 7.187557349972472, "grad_norm": 5.961398124694824, "learning_rate": 8.086982522180726e-06, "loss": 0.255, "num_input_tokens_seen": 84450192, "step": 39165 }, { "epoch": 7.188474949532024, "grad_norm": 3.768059730529785, "learning_rate": 8.086352565658934e-06, "loss": 0.2526, "num_input_tokens_seen": 84461616, "step": 39170 }, { "epoch": 7.189392549091576, "grad_norm": 2.4720144271850586, "learning_rate": 8.085722529976985e-06, "loss": 0.2212, "num_input_tokens_seen": 84472048, "step": 39175 }, { "epoch": 7.190310148651129, "grad_norm": 8.62882137298584, "learning_rate": 8.085092415151032e-06, "loss": 0.1551, "num_input_tokens_seen": 84482768, "step": 39180 }, { "epoch": 7.191227748210681, "grad_norm": 5.066779136657715, "learning_rate": 8.084462221197241e-06, "loss": 0.205, "num_input_tokens_seen": 84493712, "step": 39185 }, { "epoch": 7.192145347770233, "grad_norm": 3.6694984436035156, "learning_rate": 8.083831948131774e-06, "loss": 0.3261, "num_input_tokens_seen": 84504752, "step": 39190 }, { "epoch": 7.193062947329786, "grad_norm": 11.457208633422852, "learning_rate": 8.0832015959708e-06, "loss": 0.3722, "num_input_tokens_seen": 84515504, "step": 39195 }, { "epoch": 7.193980546889337, "grad_norm": 26.126008987426758, "learning_rate": 8.082571164730482e-06, "loss": 0.2557, "num_input_tokens_seen": 84525168, "step": 39200 }, { "epoch": 7.19489814644889, "grad_norm": 1.0381194353103638, "learning_rate": 8.08194065442699e-06, "loss": 0.2154, "num_input_tokens_seen": 84535728, "step": 39205 }, { "epoch": 7.195815746008442, "grad_norm": 27.336164474487305, "learning_rate": 8.081310065076497e-06, "loss": 0.1836, "num_input_tokens_seen": 84546544, "step": 39210 }, { "epoch": 7.196733345567994, "grad_norm": 3.490541934967041, "learning_rate": 8.080679396695177e-06, "loss": 0.1342, "num_input_tokens_seen": 84556560, "step": 39215 }, { "epoch": 7.197650945127546, "grad_norm": 11.145432472229004, "learning_rate": 8.080048649299203e-06, "loss": 0.1699, "num_input_tokens_seen": 84567248, "step": 39220 }, { "epoch": 7.198568544687099, "grad_norm": 2.772474527359009, "learning_rate": 8.079417822904759e-06, "loss": 0.2232, "num_input_tokens_seen": 84578544, "step": 39225 }, { "epoch": 7.1994861442466505, "grad_norm": 23.72833824157715, "learning_rate": 8.078786917528016e-06, "loss": 0.1803, "num_input_tokens_seen": 84588016, "step": 39230 }, { "epoch": 7.200403743806203, "grad_norm": 0.9925110340118408, "learning_rate": 8.078155933185163e-06, "loss": 0.1892, "num_input_tokens_seen": 84598160, "step": 39235 }, { "epoch": 7.2013213433657555, "grad_norm": 25.190526962280273, "learning_rate": 8.077524869892382e-06, "loss": 0.372, "num_input_tokens_seen": 84609360, "step": 39240 }, { "epoch": 7.202238942925307, "grad_norm": 24.746492385864258, "learning_rate": 8.076893727665855e-06, "loss": 0.1803, "num_input_tokens_seen": 84620144, "step": 39245 }, { "epoch": 7.20315654248486, "grad_norm": 16.14093780517578, "learning_rate": 8.076262506521774e-06, "loss": 0.4788, "num_input_tokens_seen": 84631600, "step": 39250 }, { "epoch": 7.204074142044412, "grad_norm": 11.477705001831055, "learning_rate": 8.075631206476328e-06, "loss": 0.289, "num_input_tokens_seen": 84641360, "step": 39255 }, { "epoch": 7.204991741603964, "grad_norm": 1.6742876768112183, "learning_rate": 8.074999827545708e-06, "loss": 0.1766, "num_input_tokens_seen": 84651984, "step": 39260 }, { "epoch": 7.205909341163516, "grad_norm": 0.5269910097122192, "learning_rate": 8.074368369746107e-06, "loss": 0.2888, "num_input_tokens_seen": 84662704, "step": 39265 }, { "epoch": 7.206826940723069, "grad_norm": 23.354106903076172, "learning_rate": 8.073736833093725e-06, "loss": 0.3231, "num_input_tokens_seen": 84673072, "step": 39270 }, { "epoch": 7.20774454028262, "grad_norm": 6.631023406982422, "learning_rate": 8.073105217604754e-06, "loss": 0.0774, "num_input_tokens_seen": 84685456, "step": 39275 }, { "epoch": 7.208662139842173, "grad_norm": 17.973041534423828, "learning_rate": 8.072473523295398e-06, "loss": 0.3115, "num_input_tokens_seen": 84695600, "step": 39280 }, { "epoch": 7.209579739401725, "grad_norm": 18.871259689331055, "learning_rate": 8.071841750181858e-06, "loss": 0.2219, "num_input_tokens_seen": 84707280, "step": 39285 }, { "epoch": 7.210497338961277, "grad_norm": 5.608584403991699, "learning_rate": 8.071209898280339e-06, "loss": 0.3741, "num_input_tokens_seen": 84718064, "step": 39290 }, { "epoch": 7.2114149385208295, "grad_norm": 9.6630859375, "learning_rate": 8.070577967607044e-06, "loss": 0.2164, "num_input_tokens_seen": 84729712, "step": 39295 }, { "epoch": 7.212332538080382, "grad_norm": 36.928550720214844, "learning_rate": 8.069945958178187e-06, "loss": 0.23, "num_input_tokens_seen": 84741264, "step": 39300 }, { "epoch": 7.213250137639934, "grad_norm": 10.40417766571045, "learning_rate": 8.069313870009971e-06, "loss": 0.1012, "num_input_tokens_seen": 84752976, "step": 39305 }, { "epoch": 7.214167737199486, "grad_norm": 27.38776969909668, "learning_rate": 8.068681703118611e-06, "loss": 0.0956, "num_input_tokens_seen": 84764144, "step": 39310 }, { "epoch": 7.215085336759039, "grad_norm": 42.81570053100586, "learning_rate": 8.068049457520322e-06, "loss": 0.2755, "num_input_tokens_seen": 84776048, "step": 39315 }, { "epoch": 7.21600293631859, "grad_norm": 25.690765380859375, "learning_rate": 8.06741713323132e-06, "loss": 0.1126, "num_input_tokens_seen": 84788080, "step": 39320 }, { "epoch": 7.216920535878143, "grad_norm": 2.677173614501953, "learning_rate": 8.066784730267822e-06, "loss": 0.2191, "num_input_tokens_seen": 84798768, "step": 39325 }, { "epoch": 7.217838135437695, "grad_norm": 52.92029571533203, "learning_rate": 8.06615224864605e-06, "loss": 0.3714, "num_input_tokens_seen": 84809296, "step": 39330 }, { "epoch": 7.218755734997247, "grad_norm": 59.4455680847168, "learning_rate": 8.065519688382224e-06, "loss": 0.3412, "num_input_tokens_seen": 84818480, "step": 39335 }, { "epoch": 7.219673334556799, "grad_norm": 10.811457633972168, "learning_rate": 8.06488704949257e-06, "loss": 0.323, "num_input_tokens_seen": 84829552, "step": 39340 }, { "epoch": 7.220590934116352, "grad_norm": 0.39317387342453003, "learning_rate": 8.064254331993311e-06, "loss": 0.182, "num_input_tokens_seen": 84839760, "step": 39345 }, { "epoch": 7.2215085336759035, "grad_norm": 29.8445987701416, "learning_rate": 8.063621535900679e-06, "loss": 0.2465, "num_input_tokens_seen": 84850288, "step": 39350 }, { "epoch": 7.222426133235456, "grad_norm": 41.41569900512695, "learning_rate": 8.062988661230903e-06, "loss": 0.2739, "num_input_tokens_seen": 84861680, "step": 39355 }, { "epoch": 7.2233437327950085, "grad_norm": 10.689459800720215, "learning_rate": 8.062355708000215e-06, "loss": 0.2027, "num_input_tokens_seen": 84872208, "step": 39360 }, { "epoch": 7.22426133235456, "grad_norm": 0.46724212169647217, "learning_rate": 8.061722676224848e-06, "loss": 0.1359, "num_input_tokens_seen": 84883120, "step": 39365 }, { "epoch": 7.225178931914113, "grad_norm": 13.9129056930542, "learning_rate": 8.061089565921042e-06, "loss": 0.4408, "num_input_tokens_seen": 84894256, "step": 39370 }, { "epoch": 7.226096531473665, "grad_norm": 6.509185314178467, "learning_rate": 8.060456377105031e-06, "loss": 0.3114, "num_input_tokens_seen": 84905200, "step": 39375 }, { "epoch": 7.227014131033217, "grad_norm": 28.19145965576172, "learning_rate": 8.059823109793058e-06, "loss": 0.2483, "num_input_tokens_seen": 84916784, "step": 39380 }, { "epoch": 7.227931730592769, "grad_norm": 1.0879347324371338, "learning_rate": 8.059189764001366e-06, "loss": 0.36, "num_input_tokens_seen": 84927056, "step": 39385 }, { "epoch": 7.228849330152322, "grad_norm": 12.341802597045898, "learning_rate": 8.058556339746195e-06, "loss": 0.6396, "num_input_tokens_seen": 84937552, "step": 39390 }, { "epoch": 7.229766929711873, "grad_norm": 30.353450775146484, "learning_rate": 8.057922837043796e-06, "loss": 0.2256, "num_input_tokens_seen": 84947792, "step": 39395 }, { "epoch": 7.230684529271426, "grad_norm": 8.766671180725098, "learning_rate": 8.057289255910415e-06, "loss": 0.1819, "num_input_tokens_seen": 84959984, "step": 39400 }, { "epoch": 7.231602128830978, "grad_norm": 29.938289642333984, "learning_rate": 8.056655596362302e-06, "loss": 0.1426, "num_input_tokens_seen": 84971184, "step": 39405 }, { "epoch": 7.23251972839053, "grad_norm": 50.23788070678711, "learning_rate": 8.056021858415711e-06, "loss": 0.2322, "num_input_tokens_seen": 84982416, "step": 39410 }, { "epoch": 7.2334373279500825, "grad_norm": 15.074295997619629, "learning_rate": 8.055388042086895e-06, "loss": 0.368, "num_input_tokens_seen": 84994448, "step": 39415 }, { "epoch": 7.234354927509635, "grad_norm": 0.4051225781440735, "learning_rate": 8.054754147392114e-06, "loss": 0.2699, "num_input_tokens_seen": 85004752, "step": 39420 }, { "epoch": 7.235272527069187, "grad_norm": 24.16618537902832, "learning_rate": 8.054120174347622e-06, "loss": 0.2248, "num_input_tokens_seen": 85015696, "step": 39425 }, { "epoch": 7.236190126628739, "grad_norm": 30.062129974365234, "learning_rate": 8.05348612296968e-06, "loss": 0.1412, "num_input_tokens_seen": 85028432, "step": 39430 }, { "epoch": 7.237107726188292, "grad_norm": 3.4284701347351074, "learning_rate": 8.052851993274552e-06, "loss": 0.0884, "num_input_tokens_seen": 85038992, "step": 39435 }, { "epoch": 7.238025325747843, "grad_norm": 0.9003222584724426, "learning_rate": 8.052217785278503e-06, "loss": 0.1853, "num_input_tokens_seen": 85047984, "step": 39440 }, { "epoch": 7.238942925307396, "grad_norm": 14.068648338317871, "learning_rate": 8.051583498997797e-06, "loss": 0.2921, "num_input_tokens_seen": 85059408, "step": 39445 }, { "epoch": 7.239860524866948, "grad_norm": 12.342755317687988, "learning_rate": 8.050949134448703e-06, "loss": 0.3446, "num_input_tokens_seen": 85070320, "step": 39450 }, { "epoch": 7.2407781244265, "grad_norm": 31.3696231842041, "learning_rate": 8.050314691647494e-06, "loss": 0.2671, "num_input_tokens_seen": 85081296, "step": 39455 }, { "epoch": 7.241695723986052, "grad_norm": 2.899158239364624, "learning_rate": 8.04968017061044e-06, "loss": 0.4443, "num_input_tokens_seen": 85091888, "step": 39460 }, { "epoch": 7.242613323545605, "grad_norm": 7.634820938110352, "learning_rate": 8.049045571353816e-06, "loss": 0.3131, "num_input_tokens_seen": 85104528, "step": 39465 }, { "epoch": 7.243530923105157, "grad_norm": 42.46074676513672, "learning_rate": 8.048410893893898e-06, "loss": 0.4112, "num_input_tokens_seen": 85116432, "step": 39470 }, { "epoch": 7.244448522664709, "grad_norm": 19.809282302856445, "learning_rate": 8.047776138246968e-06, "loss": 0.0666, "num_input_tokens_seen": 85127408, "step": 39475 }, { "epoch": 7.245366122224262, "grad_norm": 19.503864288330078, "learning_rate": 8.047141304429301e-06, "loss": 0.4748, "num_input_tokens_seen": 85137968, "step": 39480 }, { "epoch": 7.246283721783813, "grad_norm": 1.0782685279846191, "learning_rate": 8.046506392457183e-06, "loss": 0.1686, "num_input_tokens_seen": 85149456, "step": 39485 }, { "epoch": 7.247201321343366, "grad_norm": 8.222166061401367, "learning_rate": 8.045871402346897e-06, "loss": 0.1541, "num_input_tokens_seen": 85160368, "step": 39490 }, { "epoch": 7.248118920902918, "grad_norm": 20.731420516967773, "learning_rate": 8.045236334114732e-06, "loss": 0.2121, "num_input_tokens_seen": 85170736, "step": 39495 }, { "epoch": 7.24903652046247, "grad_norm": 18.246896743774414, "learning_rate": 8.044601187776973e-06, "loss": 0.2751, "num_input_tokens_seen": 85181616, "step": 39500 }, { "epoch": 7.249954120022022, "grad_norm": 5.56240177154541, "learning_rate": 8.043965963349914e-06, "loss": 0.2079, "num_input_tokens_seen": 85192496, "step": 39505 }, { "epoch": 7.250871719581575, "grad_norm": 1.3710426092147827, "learning_rate": 8.043330660849844e-06, "loss": 0.0834, "num_input_tokens_seen": 85204048, "step": 39510 }, { "epoch": 7.2517893191411265, "grad_norm": 6.185701370239258, "learning_rate": 8.04269528029306e-06, "loss": 0.2183, "num_input_tokens_seen": 85214928, "step": 39515 }, { "epoch": 7.252706918700679, "grad_norm": 36.175254821777344, "learning_rate": 8.042059821695857e-06, "loss": 0.2719, "num_input_tokens_seen": 85224912, "step": 39520 }, { "epoch": 7.2536245182602315, "grad_norm": 6.183506011962891, "learning_rate": 8.041424285074535e-06, "loss": 0.2033, "num_input_tokens_seen": 85235280, "step": 39525 }, { "epoch": 7.254542117819783, "grad_norm": 34.74939727783203, "learning_rate": 8.040788670445394e-06, "loss": 0.4146, "num_input_tokens_seen": 85245904, "step": 39530 }, { "epoch": 7.255459717379336, "grad_norm": 15.849849700927734, "learning_rate": 8.040152977824736e-06, "loss": 0.3422, "num_input_tokens_seen": 85256976, "step": 39535 }, { "epoch": 7.256377316938888, "grad_norm": 9.071948051452637, "learning_rate": 8.039517207228867e-06, "loss": 0.506, "num_input_tokens_seen": 85266704, "step": 39540 }, { "epoch": 7.25729491649844, "grad_norm": 2.2001407146453857, "learning_rate": 8.038881358674092e-06, "loss": 0.316, "num_input_tokens_seen": 85277456, "step": 39545 }, { "epoch": 7.258212516057992, "grad_norm": 8.124303817749023, "learning_rate": 8.038245432176718e-06, "loss": 0.1786, "num_input_tokens_seen": 85288336, "step": 39550 }, { "epoch": 7.259130115617545, "grad_norm": 3.0089759826660156, "learning_rate": 8.037609427753062e-06, "loss": 0.2332, "num_input_tokens_seen": 85299344, "step": 39555 }, { "epoch": 7.260047715177096, "grad_norm": 9.981029510498047, "learning_rate": 8.036973345419428e-06, "loss": 0.2809, "num_input_tokens_seen": 85310448, "step": 39560 }, { "epoch": 7.260965314736649, "grad_norm": 10.585942268371582, "learning_rate": 8.036337185192135e-06, "loss": 0.1592, "num_input_tokens_seen": 85321456, "step": 39565 }, { "epoch": 7.261882914296201, "grad_norm": 77.44208526611328, "learning_rate": 8.0357009470875e-06, "loss": 0.3403, "num_input_tokens_seen": 85332176, "step": 39570 }, { "epoch": 7.262800513855753, "grad_norm": 2.662299633026123, "learning_rate": 8.03506463112184e-06, "loss": 0.1446, "num_input_tokens_seen": 85342032, "step": 39575 }, { "epoch": 7.2637181134153055, "grad_norm": 27.57468032836914, "learning_rate": 8.034428237311478e-06, "loss": 0.2423, "num_input_tokens_seen": 85351440, "step": 39580 }, { "epoch": 7.264635712974858, "grad_norm": 16.56031608581543, "learning_rate": 8.033791765672732e-06, "loss": 0.5404, "num_input_tokens_seen": 85362512, "step": 39585 }, { "epoch": 7.26555331253441, "grad_norm": 4.851760387420654, "learning_rate": 8.033155216221931e-06, "loss": 0.295, "num_input_tokens_seen": 85373264, "step": 39590 }, { "epoch": 7.266470912093962, "grad_norm": 19.629024505615234, "learning_rate": 8.0325185889754e-06, "loss": 0.2026, "num_input_tokens_seen": 85383792, "step": 39595 }, { "epoch": 7.267388511653515, "grad_norm": 17.826236724853516, "learning_rate": 8.031881883949467e-06, "loss": 0.2377, "num_input_tokens_seen": 85394672, "step": 39600 }, { "epoch": 7.268306111213066, "grad_norm": 1.904333233833313, "learning_rate": 8.03124510116046e-06, "loss": 0.1335, "num_input_tokens_seen": 85406608, "step": 39605 }, { "epoch": 7.269223710772619, "grad_norm": 9.05300235748291, "learning_rate": 8.030608240624717e-06, "loss": 0.3172, "num_input_tokens_seen": 85417232, "step": 39610 }, { "epoch": 7.270141310332171, "grad_norm": 5.638969898223877, "learning_rate": 8.029971302358568e-06, "loss": 0.2101, "num_input_tokens_seen": 85428688, "step": 39615 }, { "epoch": 7.271058909891723, "grad_norm": 1.6587153673171997, "learning_rate": 8.029334286378352e-06, "loss": 0.2298, "num_input_tokens_seen": 85438512, "step": 39620 }, { "epoch": 7.271976509451275, "grad_norm": 16.38833999633789, "learning_rate": 8.028697192700407e-06, "loss": 0.2466, "num_input_tokens_seen": 85449520, "step": 39625 }, { "epoch": 7.272894109010828, "grad_norm": 26.903043746948242, "learning_rate": 8.028060021341074e-06, "loss": 0.3396, "num_input_tokens_seen": 85460656, "step": 39630 }, { "epoch": 7.2738117085703795, "grad_norm": 28.65750503540039, "learning_rate": 8.027422772316692e-06, "loss": 0.327, "num_input_tokens_seen": 85471952, "step": 39635 }, { "epoch": 7.274729308129932, "grad_norm": 7.586109161376953, "learning_rate": 8.026785445643608e-06, "loss": 0.3421, "num_input_tokens_seen": 85482544, "step": 39640 }, { "epoch": 7.2756469076894845, "grad_norm": 8.311342239379883, "learning_rate": 8.026148041338171e-06, "loss": 0.1794, "num_input_tokens_seen": 85492048, "step": 39645 }, { "epoch": 7.276564507249036, "grad_norm": 11.968737602233887, "learning_rate": 8.025510559416725e-06, "loss": 0.3185, "num_input_tokens_seen": 85502704, "step": 39650 }, { "epoch": 7.277482106808589, "grad_norm": 47.377784729003906, "learning_rate": 8.024872999895623e-06, "loss": 0.3559, "num_input_tokens_seen": 85513552, "step": 39655 }, { "epoch": 7.278399706368141, "grad_norm": 10.871438980102539, "learning_rate": 8.024235362791216e-06, "loss": 0.1797, "num_input_tokens_seen": 85523536, "step": 39660 }, { "epoch": 7.279317305927693, "grad_norm": 22.734331130981445, "learning_rate": 8.023597648119859e-06, "loss": 0.2424, "num_input_tokens_seen": 85534416, "step": 39665 }, { "epoch": 7.280234905487245, "grad_norm": 11.438923835754395, "learning_rate": 8.02295985589791e-06, "loss": 0.2448, "num_input_tokens_seen": 85546800, "step": 39670 }, { "epoch": 7.281152505046798, "grad_norm": 13.060967445373535, "learning_rate": 8.022321986141724e-06, "loss": 0.4445, "num_input_tokens_seen": 85557904, "step": 39675 }, { "epoch": 7.282070104606349, "grad_norm": 36.969234466552734, "learning_rate": 8.021684038867663e-06, "loss": 0.321, "num_input_tokens_seen": 85568720, "step": 39680 }, { "epoch": 7.282987704165902, "grad_norm": 1.4283416271209717, "learning_rate": 8.021046014092091e-06, "loss": 0.2853, "num_input_tokens_seen": 85578768, "step": 39685 }, { "epoch": 7.283905303725454, "grad_norm": 15.947187423706055, "learning_rate": 8.02040791183137e-06, "loss": 0.258, "num_input_tokens_seen": 85588816, "step": 39690 }, { "epoch": 7.284822903285006, "grad_norm": 53.46385192871094, "learning_rate": 8.019769732101868e-06, "loss": 0.1447, "num_input_tokens_seen": 85598512, "step": 39695 }, { "epoch": 7.2857405028445585, "grad_norm": 27.65129852294922, "learning_rate": 8.019131474919953e-06, "loss": 0.2321, "num_input_tokens_seen": 85610384, "step": 39700 }, { "epoch": 7.286658102404111, "grad_norm": 29.118837356567383, "learning_rate": 8.018493140301994e-06, "loss": 0.2777, "num_input_tokens_seen": 85621104, "step": 39705 }, { "epoch": 7.287575701963663, "grad_norm": 18.982097625732422, "learning_rate": 8.017854728264363e-06, "loss": 0.1939, "num_input_tokens_seen": 85631984, "step": 39710 }, { "epoch": 7.288493301523215, "grad_norm": 12.95595932006836, "learning_rate": 8.017216238823437e-06, "loss": 0.2778, "num_input_tokens_seen": 85642864, "step": 39715 }, { "epoch": 7.289410901082768, "grad_norm": 0.9732431769371033, "learning_rate": 8.016577671995592e-06, "loss": 0.2148, "num_input_tokens_seen": 85653488, "step": 39720 }, { "epoch": 7.290328500642319, "grad_norm": 13.986525535583496, "learning_rate": 8.015939027797203e-06, "loss": 0.2181, "num_input_tokens_seen": 85663760, "step": 39725 }, { "epoch": 7.291246100201872, "grad_norm": 8.394669532775879, "learning_rate": 8.015300306244653e-06, "loss": 0.3034, "num_input_tokens_seen": 85673936, "step": 39730 }, { "epoch": 7.292163699761424, "grad_norm": 7.362354755401611, "learning_rate": 8.014661507354322e-06, "loss": 0.165, "num_input_tokens_seen": 85684496, "step": 39735 }, { "epoch": 7.293081299320976, "grad_norm": 4.832427501678467, "learning_rate": 8.014022631142599e-06, "loss": 0.1427, "num_input_tokens_seen": 85695280, "step": 39740 }, { "epoch": 7.293998898880528, "grad_norm": 1.2920918464660645, "learning_rate": 8.013383677625864e-06, "loss": 0.1065, "num_input_tokens_seen": 85706224, "step": 39745 }, { "epoch": 7.294916498440081, "grad_norm": 7.001616477966309, "learning_rate": 8.012744646820509e-06, "loss": 0.2298, "num_input_tokens_seen": 85716432, "step": 39750 }, { "epoch": 7.295834097999633, "grad_norm": 20.506654739379883, "learning_rate": 8.012105538742922e-06, "loss": 0.1635, "num_input_tokens_seen": 85728304, "step": 39755 }, { "epoch": 7.296751697559185, "grad_norm": 14.344520568847656, "learning_rate": 8.011466353409498e-06, "loss": 0.1075, "num_input_tokens_seen": 85739696, "step": 39760 }, { "epoch": 7.297669297118738, "grad_norm": 8.419336318969727, "learning_rate": 8.010827090836628e-06, "loss": 0.3463, "num_input_tokens_seen": 85750000, "step": 39765 }, { "epoch": 7.298586896678289, "grad_norm": 4.0712103843688965, "learning_rate": 8.01018775104071e-06, "loss": 0.2322, "num_input_tokens_seen": 85760656, "step": 39770 }, { "epoch": 7.299504496237842, "grad_norm": 12.716968536376953, "learning_rate": 8.00954833403814e-06, "loss": 0.181, "num_input_tokens_seen": 85771760, "step": 39775 }, { "epoch": 7.300422095797394, "grad_norm": 5.530653953552246, "learning_rate": 8.008908839845321e-06, "loss": 0.2577, "num_input_tokens_seen": 85782928, "step": 39780 }, { "epoch": 7.301339695356946, "grad_norm": 23.38453483581543, "learning_rate": 8.008269268478654e-06, "loss": 0.2983, "num_input_tokens_seen": 85793264, "step": 39785 }, { "epoch": 7.302257294916498, "grad_norm": 26.055166244506836, "learning_rate": 8.007629619954544e-06, "loss": 0.2607, "num_input_tokens_seen": 85803952, "step": 39790 }, { "epoch": 7.303174894476051, "grad_norm": 39.740684509277344, "learning_rate": 8.006989894289394e-06, "loss": 0.2096, "num_input_tokens_seen": 85815600, "step": 39795 }, { "epoch": 7.3040924940356025, "grad_norm": 17.136577606201172, "learning_rate": 8.006350091499613e-06, "loss": 0.3087, "num_input_tokens_seen": 85825488, "step": 39800 }, { "epoch": 7.305010093595155, "grad_norm": 15.828527450561523, "learning_rate": 8.005710211601613e-06, "loss": 0.2235, "num_input_tokens_seen": 85835312, "step": 39805 }, { "epoch": 7.3059276931547075, "grad_norm": 4.448218822479248, "learning_rate": 8.005070254611802e-06, "loss": 0.2369, "num_input_tokens_seen": 85846576, "step": 39810 }, { "epoch": 7.306845292714259, "grad_norm": 7.30742073059082, "learning_rate": 8.0044302205466e-06, "loss": 0.1074, "num_input_tokens_seen": 85857776, "step": 39815 }, { "epoch": 7.307762892273812, "grad_norm": 11.693455696105957, "learning_rate": 8.003790109422417e-06, "loss": 0.2537, "num_input_tokens_seen": 85868080, "step": 39820 }, { "epoch": 7.308680491833364, "grad_norm": 1.3149302005767822, "learning_rate": 8.003149921255673e-06, "loss": 0.1184, "num_input_tokens_seen": 85879280, "step": 39825 }, { "epoch": 7.309598091392916, "grad_norm": 14.084232330322266, "learning_rate": 8.00250965606279e-06, "loss": 0.0784, "num_input_tokens_seen": 85890096, "step": 39830 }, { "epoch": 7.310515690952468, "grad_norm": 2.036607265472412, "learning_rate": 8.001869313860185e-06, "loss": 0.2963, "num_input_tokens_seen": 85901296, "step": 39835 }, { "epoch": 7.311433290512021, "grad_norm": 26.86720848083496, "learning_rate": 8.001228894664287e-06, "loss": 0.4802, "num_input_tokens_seen": 85912624, "step": 39840 }, { "epoch": 7.312350890071572, "grad_norm": 11.66576862335205, "learning_rate": 8.000588398491519e-06, "loss": 0.264, "num_input_tokens_seen": 85924432, "step": 39845 }, { "epoch": 7.313268489631125, "grad_norm": 14.68399715423584, "learning_rate": 7.999947825358307e-06, "loss": 0.5153, "num_input_tokens_seen": 85935792, "step": 39850 }, { "epoch": 7.314186089190677, "grad_norm": 5.981970310211182, "learning_rate": 7.999307175281084e-06, "loss": 0.2635, "num_input_tokens_seen": 85946928, "step": 39855 }, { "epoch": 7.315103688750229, "grad_norm": 9.123069763183594, "learning_rate": 7.99866644827628e-06, "loss": 0.242, "num_input_tokens_seen": 85957712, "step": 39860 }, { "epoch": 7.3160212883097815, "grad_norm": 25.214590072631836, "learning_rate": 7.998025644360332e-06, "loss": 0.1378, "num_input_tokens_seen": 85969072, "step": 39865 }, { "epoch": 7.316938887869334, "grad_norm": 0.36158713698387146, "learning_rate": 7.99738476354967e-06, "loss": 0.0446, "num_input_tokens_seen": 85979792, "step": 39870 }, { "epoch": 7.317856487428886, "grad_norm": 20.74321937561035, "learning_rate": 7.996743805860734e-06, "loss": 0.1228, "num_input_tokens_seen": 85990640, "step": 39875 }, { "epoch": 7.318774086988438, "grad_norm": 14.008186340332031, "learning_rate": 7.996102771309965e-06, "loss": 0.3221, "num_input_tokens_seen": 86001808, "step": 39880 }, { "epoch": 7.319691686547991, "grad_norm": 42.8815803527832, "learning_rate": 7.995461659913803e-06, "loss": 0.2407, "num_input_tokens_seen": 86013232, "step": 39885 }, { "epoch": 7.320609286107542, "grad_norm": 8.783638000488281, "learning_rate": 7.994820471688692e-06, "loss": 0.421, "num_input_tokens_seen": 86025136, "step": 39890 }, { "epoch": 7.321526885667095, "grad_norm": 16.16294288635254, "learning_rate": 7.994179206651078e-06, "loss": 0.4013, "num_input_tokens_seen": 86034928, "step": 39895 }, { "epoch": 7.322444485226647, "grad_norm": 19.59066390991211, "learning_rate": 7.993537864817407e-06, "loss": 0.196, "num_input_tokens_seen": 86045552, "step": 39900 }, { "epoch": 7.323362084786199, "grad_norm": 15.323495864868164, "learning_rate": 7.992896446204131e-06, "loss": 0.1311, "num_input_tokens_seen": 86056816, "step": 39905 }, { "epoch": 7.324279684345751, "grad_norm": 13.863497734069824, "learning_rate": 7.992254950827698e-06, "loss": 0.3324, "num_input_tokens_seen": 86067824, "step": 39910 }, { "epoch": 7.325197283905304, "grad_norm": 30.666833877563477, "learning_rate": 7.991613378704564e-06, "loss": 0.2025, "num_input_tokens_seen": 86077904, "step": 39915 }, { "epoch": 7.3261148834648555, "grad_norm": 20.616914749145508, "learning_rate": 7.990971729851183e-06, "loss": 0.2092, "num_input_tokens_seen": 86090224, "step": 39920 }, { "epoch": 7.327032483024408, "grad_norm": 13.368470191955566, "learning_rate": 7.990330004284012e-06, "loss": 0.4235, "num_input_tokens_seen": 86102384, "step": 39925 }, { "epoch": 7.3279500825839605, "grad_norm": 19.612701416015625, "learning_rate": 7.989688202019512e-06, "loss": 0.2501, "num_input_tokens_seen": 86113392, "step": 39930 }, { "epoch": 7.328867682143512, "grad_norm": 4.598055839538574, "learning_rate": 7.989046323074143e-06, "loss": 0.0982, "num_input_tokens_seen": 86122864, "step": 39935 }, { "epoch": 7.329785281703065, "grad_norm": 24.493633270263672, "learning_rate": 7.988404367464369e-06, "loss": 0.2225, "num_input_tokens_seen": 86135184, "step": 39940 }, { "epoch": 7.330702881262617, "grad_norm": 8.9852294921875, "learning_rate": 7.987762335206653e-06, "loss": 0.2073, "num_input_tokens_seen": 86145520, "step": 39945 }, { "epoch": 7.331620480822169, "grad_norm": 25.927387237548828, "learning_rate": 7.987120226317466e-06, "loss": 0.3457, "num_input_tokens_seen": 86156368, "step": 39950 }, { "epoch": 7.332538080381721, "grad_norm": 28.116647720336914, "learning_rate": 7.986478040813273e-06, "loss": 0.133, "num_input_tokens_seen": 86166832, "step": 39955 }, { "epoch": 7.333455679941274, "grad_norm": 35.718257904052734, "learning_rate": 7.985835778710546e-06, "loss": 0.1686, "num_input_tokens_seen": 86178704, "step": 39960 }, { "epoch": 7.334373279500825, "grad_norm": 7.11460018157959, "learning_rate": 7.985193440025761e-06, "loss": 0.349, "num_input_tokens_seen": 86189424, "step": 39965 }, { "epoch": 7.335290879060378, "grad_norm": 8.182977676391602, "learning_rate": 7.98455102477539e-06, "loss": 0.2969, "num_input_tokens_seen": 86201456, "step": 39970 }, { "epoch": 7.33620847861993, "grad_norm": 21.36870002746582, "learning_rate": 7.98390853297591e-06, "loss": 0.1358, "num_input_tokens_seen": 86210512, "step": 39975 }, { "epoch": 7.337126078179482, "grad_norm": 20.21390724182129, "learning_rate": 7.983265964643802e-06, "loss": 0.374, "num_input_tokens_seen": 86220496, "step": 39980 }, { "epoch": 7.3380436777390345, "grad_norm": 19.90821075439453, "learning_rate": 7.982623319795546e-06, "loss": 0.2921, "num_input_tokens_seen": 86229328, "step": 39985 }, { "epoch": 7.338961277298587, "grad_norm": 9.934195518493652, "learning_rate": 7.981980598447623e-06, "loss": 0.258, "num_input_tokens_seen": 86238032, "step": 39990 }, { "epoch": 7.339878876858139, "grad_norm": 22.418210983276367, "learning_rate": 7.981337800616521e-06, "loss": 0.3692, "num_input_tokens_seen": 86247856, "step": 39995 }, { "epoch": 7.340796476417691, "grad_norm": 22.494529724121094, "learning_rate": 7.980694926318724e-06, "loss": 0.3296, "num_input_tokens_seen": 86258320, "step": 40000 }, { "epoch": 7.341714075977244, "grad_norm": 3.785048246383667, "learning_rate": 7.980051975570721e-06, "loss": 0.2859, "num_input_tokens_seen": 86269328, "step": 40005 }, { "epoch": 7.342631675536795, "grad_norm": 17.042741775512695, "learning_rate": 7.979408948389007e-06, "loss": 0.3369, "num_input_tokens_seen": 86280720, "step": 40010 }, { "epoch": 7.343549275096348, "grad_norm": 34.14393615722656, "learning_rate": 7.978765844790068e-06, "loss": 0.1919, "num_input_tokens_seen": 86291952, "step": 40015 }, { "epoch": 7.3444668746559, "grad_norm": 26.62105941772461, "learning_rate": 7.978122664790403e-06, "loss": 0.0859, "num_input_tokens_seen": 86302576, "step": 40020 }, { "epoch": 7.345384474215452, "grad_norm": 0.281695693731308, "learning_rate": 7.977479408406507e-06, "loss": 0.3478, "num_input_tokens_seen": 86314672, "step": 40025 }, { "epoch": 7.346302073775004, "grad_norm": 9.152100563049316, "learning_rate": 7.976836075654879e-06, "loss": 0.4328, "num_input_tokens_seen": 86325488, "step": 40030 }, { "epoch": 7.347219673334557, "grad_norm": 2.6210098266601562, "learning_rate": 7.976192666552018e-06, "loss": 0.1863, "num_input_tokens_seen": 86336144, "step": 40035 }, { "epoch": 7.348137272894109, "grad_norm": 7.047579765319824, "learning_rate": 7.975549181114429e-06, "loss": 0.3048, "num_input_tokens_seen": 86346832, "step": 40040 }, { "epoch": 7.349054872453661, "grad_norm": 17.487258911132812, "learning_rate": 7.974905619358615e-06, "loss": 0.2213, "num_input_tokens_seen": 86358448, "step": 40045 }, { "epoch": 7.349972472013214, "grad_norm": 30.673694610595703, "learning_rate": 7.974261981301082e-06, "loss": 0.1851, "num_input_tokens_seen": 86369808, "step": 40050 }, { "epoch": 7.350890071572765, "grad_norm": 11.39187240600586, "learning_rate": 7.97361826695834e-06, "loss": 0.1176, "num_input_tokens_seen": 86380784, "step": 40055 }, { "epoch": 7.351807671132318, "grad_norm": 9.223291397094727, "learning_rate": 7.972974476346898e-06, "loss": 0.328, "num_input_tokens_seen": 86391504, "step": 40060 }, { "epoch": 7.35272527069187, "grad_norm": 6.4084625244140625, "learning_rate": 7.972330609483266e-06, "loss": 0.3512, "num_input_tokens_seen": 86401904, "step": 40065 }, { "epoch": 7.353642870251422, "grad_norm": 7.482303142547607, "learning_rate": 7.971686666383963e-06, "loss": 0.27, "num_input_tokens_seen": 86412624, "step": 40070 }, { "epoch": 7.354560469810974, "grad_norm": 50.771080017089844, "learning_rate": 7.971042647065503e-06, "loss": 0.2977, "num_input_tokens_seen": 86423824, "step": 40075 }, { "epoch": 7.355478069370527, "grad_norm": 4.9212646484375, "learning_rate": 7.970398551544403e-06, "loss": 0.3093, "num_input_tokens_seen": 86433968, "step": 40080 }, { "epoch": 7.3563956689300785, "grad_norm": 2.7051117420196533, "learning_rate": 7.969754379837184e-06, "loss": 0.121, "num_input_tokens_seen": 86445232, "step": 40085 }, { "epoch": 7.357313268489631, "grad_norm": 1.131311297416687, "learning_rate": 7.969110131960368e-06, "loss": 0.1638, "num_input_tokens_seen": 86456848, "step": 40090 }, { "epoch": 7.3582308680491835, "grad_norm": 13.705482482910156, "learning_rate": 7.968465807930477e-06, "loss": 0.3129, "num_input_tokens_seen": 86468112, "step": 40095 }, { "epoch": 7.359148467608735, "grad_norm": 0.8216562867164612, "learning_rate": 7.96782140776404e-06, "loss": 0.2183, "num_input_tokens_seen": 86478672, "step": 40100 }, { "epoch": 7.360066067168288, "grad_norm": 1.3346480131149292, "learning_rate": 7.967176931477583e-06, "loss": 0.1903, "num_input_tokens_seen": 86489456, "step": 40105 }, { "epoch": 7.36098366672784, "grad_norm": 18.854000091552734, "learning_rate": 7.966532379087639e-06, "loss": 0.1211, "num_input_tokens_seen": 86500752, "step": 40110 }, { "epoch": 7.361901266287392, "grad_norm": 14.988192558288574, "learning_rate": 7.965887750610735e-06, "loss": 0.2366, "num_input_tokens_seen": 86512656, "step": 40115 }, { "epoch": 7.362818865846944, "grad_norm": 17.7205753326416, "learning_rate": 7.965243046063407e-06, "loss": 0.2621, "num_input_tokens_seen": 86522960, "step": 40120 }, { "epoch": 7.363736465406497, "grad_norm": 31.38924217224121, "learning_rate": 7.964598265462192e-06, "loss": 0.1903, "num_input_tokens_seen": 86533808, "step": 40125 }, { "epoch": 7.364654064966048, "grad_norm": 33.6756477355957, "learning_rate": 7.963953408823623e-06, "loss": 0.4261, "num_input_tokens_seen": 86545136, "step": 40130 }, { "epoch": 7.365571664525601, "grad_norm": 4.429157257080078, "learning_rate": 7.963308476164246e-06, "loss": 0.1953, "num_input_tokens_seen": 86556080, "step": 40135 }, { "epoch": 7.366489264085153, "grad_norm": 4.775152683258057, "learning_rate": 7.962663467500597e-06, "loss": 0.1481, "num_input_tokens_seen": 86565936, "step": 40140 }, { "epoch": 7.367406863644705, "grad_norm": 46.82748794555664, "learning_rate": 7.962018382849224e-06, "loss": 0.2044, "num_input_tokens_seen": 86574576, "step": 40145 }, { "epoch": 7.3683244632042575, "grad_norm": 3.0693397521972656, "learning_rate": 7.961373222226669e-06, "loss": 0.146, "num_input_tokens_seen": 86584848, "step": 40150 }, { "epoch": 7.36924206276381, "grad_norm": 1.0297378301620483, "learning_rate": 7.960727985649481e-06, "loss": 0.2691, "num_input_tokens_seen": 86593904, "step": 40155 }, { "epoch": 7.370159662323362, "grad_norm": 8.715145111083984, "learning_rate": 7.960082673134208e-06, "loss": 0.1333, "num_input_tokens_seen": 86604304, "step": 40160 }, { "epoch": 7.371077261882914, "grad_norm": 9.534635543823242, "learning_rate": 7.959437284697403e-06, "loss": 0.3474, "num_input_tokens_seen": 86615888, "step": 40165 }, { "epoch": 7.371994861442467, "grad_norm": 18.558317184448242, "learning_rate": 7.958791820355619e-06, "loss": 0.3184, "num_input_tokens_seen": 86627440, "step": 40170 }, { "epoch": 7.372912461002018, "grad_norm": 17.626140594482422, "learning_rate": 7.95814628012541e-06, "loss": 0.2894, "num_input_tokens_seen": 86637904, "step": 40175 }, { "epoch": 7.373830060561571, "grad_norm": 13.41164779663086, "learning_rate": 7.957500664023332e-06, "loss": 0.4139, "num_input_tokens_seen": 86649008, "step": 40180 }, { "epoch": 7.374747660121123, "grad_norm": 1.6051630973815918, "learning_rate": 7.956854972065948e-06, "loss": 0.2123, "num_input_tokens_seen": 86659664, "step": 40185 }, { "epoch": 7.375665259680676, "grad_norm": 15.20649528503418, "learning_rate": 7.956209204269815e-06, "loss": 0.2105, "num_input_tokens_seen": 86669616, "step": 40190 }, { "epoch": 7.376582859240227, "grad_norm": 11.538326263427734, "learning_rate": 7.955563360651499e-06, "loss": 0.0959, "num_input_tokens_seen": 86680304, "step": 40195 }, { "epoch": 7.37750045879978, "grad_norm": 22.914323806762695, "learning_rate": 7.95491744122756e-06, "loss": 0.3154, "num_input_tokens_seen": 86690672, "step": 40200 }, { "epoch": 7.378418058359332, "grad_norm": 26.932279586791992, "learning_rate": 7.954271446014572e-06, "loss": 0.227, "num_input_tokens_seen": 86700368, "step": 40205 }, { "epoch": 7.379335657918884, "grad_norm": 7.464587688446045, "learning_rate": 7.953625375029099e-06, "loss": 0.1721, "num_input_tokens_seen": 86711248, "step": 40210 }, { "epoch": 7.3802532574784365, "grad_norm": 11.349729537963867, "learning_rate": 7.952979228287715e-06, "loss": 0.5124, "num_input_tokens_seen": 86721104, "step": 40215 }, { "epoch": 7.381170857037989, "grad_norm": 1.4181742668151855, "learning_rate": 7.952333005806987e-06, "loss": 0.254, "num_input_tokens_seen": 86732336, "step": 40220 }, { "epoch": 7.382088456597541, "grad_norm": 22.740591049194336, "learning_rate": 7.951686707603495e-06, "loss": 0.3476, "num_input_tokens_seen": 86742832, "step": 40225 }, { "epoch": 7.383006056157093, "grad_norm": 32.90613555908203, "learning_rate": 7.951040333693813e-06, "loss": 0.4262, "num_input_tokens_seen": 86752016, "step": 40230 }, { "epoch": 7.383923655716646, "grad_norm": 2.3907032012939453, "learning_rate": 7.95039388409452e-06, "loss": 0.3441, "num_input_tokens_seen": 86763920, "step": 40235 }, { "epoch": 7.384841255276197, "grad_norm": 9.514843940734863, "learning_rate": 7.949747358822197e-06, "loss": 0.1948, "num_input_tokens_seen": 86774928, "step": 40240 }, { "epoch": 7.38575885483575, "grad_norm": 0.8241187334060669, "learning_rate": 7.949100757893426e-06, "loss": 0.2772, "num_input_tokens_seen": 86785776, "step": 40245 }, { "epoch": 7.386676454395302, "grad_norm": 10.411612510681152, "learning_rate": 7.948454081324793e-06, "loss": 0.3747, "num_input_tokens_seen": 86797008, "step": 40250 }, { "epoch": 7.387594053954854, "grad_norm": 11.802131652832031, "learning_rate": 7.94780732913288e-06, "loss": 0.2955, "num_input_tokens_seen": 86805680, "step": 40255 }, { "epoch": 7.388511653514406, "grad_norm": 0.6592656373977661, "learning_rate": 7.947160501334278e-06, "loss": 0.1954, "num_input_tokens_seen": 86816336, "step": 40260 }, { "epoch": 7.389429253073959, "grad_norm": 11.263285636901855, "learning_rate": 7.946513597945577e-06, "loss": 0.2805, "num_input_tokens_seen": 86826992, "step": 40265 }, { "epoch": 7.3903468526335105, "grad_norm": 1.9325644969940186, "learning_rate": 7.945866618983368e-06, "loss": 0.2368, "num_input_tokens_seen": 86837872, "step": 40270 }, { "epoch": 7.391264452193063, "grad_norm": 0.7620349526405334, "learning_rate": 7.945219564464249e-06, "loss": 0.2335, "num_input_tokens_seen": 86848976, "step": 40275 }, { "epoch": 7.392182051752616, "grad_norm": 6.4880290031433105, "learning_rate": 7.94457243440481e-06, "loss": 0.1024, "num_input_tokens_seen": 86858544, "step": 40280 }, { "epoch": 7.393099651312167, "grad_norm": 8.755306243896484, "learning_rate": 7.943925228821652e-06, "loss": 0.1514, "num_input_tokens_seen": 86869680, "step": 40285 }, { "epoch": 7.39401725087172, "grad_norm": 1.232153058052063, "learning_rate": 7.943277947731374e-06, "loss": 0.1683, "num_input_tokens_seen": 86880976, "step": 40290 }, { "epoch": 7.394934850431272, "grad_norm": 16.053403854370117, "learning_rate": 7.94263059115058e-06, "loss": 0.3743, "num_input_tokens_seen": 86892464, "step": 40295 }, { "epoch": 7.395852449990824, "grad_norm": 11.113155364990234, "learning_rate": 7.941983159095872e-06, "loss": 0.3594, "num_input_tokens_seen": 86903888, "step": 40300 }, { "epoch": 7.396770049550376, "grad_norm": 7.156048774719238, "learning_rate": 7.941335651583856e-06, "loss": 0.2582, "num_input_tokens_seen": 86914512, "step": 40305 }, { "epoch": 7.397687649109929, "grad_norm": 1.1093941926956177, "learning_rate": 7.940688068631136e-06, "loss": 0.1854, "num_input_tokens_seen": 86925776, "step": 40310 }, { "epoch": 7.39860524866948, "grad_norm": 0.756811797618866, "learning_rate": 7.940040410254328e-06, "loss": 0.2656, "num_input_tokens_seen": 86936272, "step": 40315 }, { "epoch": 7.399522848229033, "grad_norm": 39.25615692138672, "learning_rate": 7.93939267647004e-06, "loss": 0.1106, "num_input_tokens_seen": 86946544, "step": 40320 }, { "epoch": 7.4004404477885855, "grad_norm": 4.10837984085083, "learning_rate": 7.938744867294883e-06, "loss": 0.2287, "num_input_tokens_seen": 86957712, "step": 40325 }, { "epoch": 7.401358047348137, "grad_norm": 39.399322509765625, "learning_rate": 7.938096982745478e-06, "loss": 0.2634, "num_input_tokens_seen": 86968464, "step": 40330 }, { "epoch": 7.40227564690769, "grad_norm": 24.207077026367188, "learning_rate": 7.937449022838438e-06, "loss": 0.1894, "num_input_tokens_seen": 86979728, "step": 40335 }, { "epoch": 7.403193246467242, "grad_norm": 33.9066047668457, "learning_rate": 7.93680098759038e-06, "loss": 0.2921, "num_input_tokens_seen": 86990992, "step": 40340 }, { "epoch": 7.404110846026794, "grad_norm": 30.329837799072266, "learning_rate": 7.936152877017933e-06, "loss": 0.2833, "num_input_tokens_seen": 87002544, "step": 40345 }, { "epoch": 7.405028445586346, "grad_norm": 9.584502220153809, "learning_rate": 7.935504691137712e-06, "loss": 0.298, "num_input_tokens_seen": 87013680, "step": 40350 }, { "epoch": 7.405946045145899, "grad_norm": 2.0781643390655518, "learning_rate": 7.934856429966347e-06, "loss": 0.2044, "num_input_tokens_seen": 87024048, "step": 40355 }, { "epoch": 7.40686364470545, "grad_norm": 30.943675994873047, "learning_rate": 7.934208093520462e-06, "loss": 0.3965, "num_input_tokens_seen": 87033616, "step": 40360 }, { "epoch": 7.407781244265003, "grad_norm": 28.426551818847656, "learning_rate": 7.933559681816687e-06, "loss": 0.199, "num_input_tokens_seen": 87044144, "step": 40365 }, { "epoch": 7.408698843824555, "grad_norm": 11.234179496765137, "learning_rate": 7.932911194871656e-06, "loss": 0.2131, "num_input_tokens_seen": 87055312, "step": 40370 }, { "epoch": 7.409616443384107, "grad_norm": 16.648237228393555, "learning_rate": 7.932262632701995e-06, "loss": 0.4419, "num_input_tokens_seen": 87064656, "step": 40375 }, { "epoch": 7.4105340429436595, "grad_norm": 6.481751918792725, "learning_rate": 7.931613995324343e-06, "loss": 0.3373, "num_input_tokens_seen": 87074864, "step": 40380 }, { "epoch": 7.411451642503212, "grad_norm": 31.801450729370117, "learning_rate": 7.930965282755334e-06, "loss": 0.1174, "num_input_tokens_seen": 87086896, "step": 40385 }, { "epoch": 7.412369242062764, "grad_norm": 23.922771453857422, "learning_rate": 7.930316495011609e-06, "loss": 0.1859, "num_input_tokens_seen": 87099056, "step": 40390 }, { "epoch": 7.413286841622316, "grad_norm": 27.03713035583496, "learning_rate": 7.92966763210981e-06, "loss": 0.3416, "num_input_tokens_seen": 87109328, "step": 40395 }, { "epoch": 7.414204441181869, "grad_norm": 14.787009239196777, "learning_rate": 7.929018694066575e-06, "loss": 0.314, "num_input_tokens_seen": 87121072, "step": 40400 }, { "epoch": 7.41512204074142, "grad_norm": 18.660531997680664, "learning_rate": 7.92836968089855e-06, "loss": 0.2901, "num_input_tokens_seen": 87132368, "step": 40405 }, { "epoch": 7.416039640300973, "grad_norm": 0.6781501173973083, "learning_rate": 7.927720592622382e-06, "loss": 0.1393, "num_input_tokens_seen": 87142736, "step": 40410 }, { "epoch": 7.416957239860525, "grad_norm": 0.40741223096847534, "learning_rate": 7.927071429254715e-06, "loss": 0.1888, "num_input_tokens_seen": 87151632, "step": 40415 }, { "epoch": 7.417874839420077, "grad_norm": 16.215553283691406, "learning_rate": 7.926422190812206e-06, "loss": 0.3314, "num_input_tokens_seen": 87162128, "step": 40420 }, { "epoch": 7.418792438979629, "grad_norm": 14.278325080871582, "learning_rate": 7.925772877311503e-06, "loss": 0.3363, "num_input_tokens_seen": 87174192, "step": 40425 }, { "epoch": 7.419710038539182, "grad_norm": 13.526715278625488, "learning_rate": 7.92512348876926e-06, "loss": 0.4715, "num_input_tokens_seen": 87185328, "step": 40430 }, { "epoch": 7.4206276380987335, "grad_norm": 4.270859241485596, "learning_rate": 7.924474025202131e-06, "loss": 0.2268, "num_input_tokens_seen": 87196560, "step": 40435 }, { "epoch": 7.421545237658286, "grad_norm": 4.494729518890381, "learning_rate": 7.923824486626778e-06, "loss": 0.3367, "num_input_tokens_seen": 87207696, "step": 40440 }, { "epoch": 7.4224628372178385, "grad_norm": 2.539093255996704, "learning_rate": 7.923174873059859e-06, "loss": 0.1287, "num_input_tokens_seen": 87217744, "step": 40445 }, { "epoch": 7.42338043677739, "grad_norm": 16.043127059936523, "learning_rate": 7.922525184518032e-06, "loss": 0.2665, "num_input_tokens_seen": 87228592, "step": 40450 }, { "epoch": 7.424298036336943, "grad_norm": 2.327709197998047, "learning_rate": 7.921875421017966e-06, "loss": 0.203, "num_input_tokens_seen": 87240304, "step": 40455 }, { "epoch": 7.425215635896495, "grad_norm": 11.895960807800293, "learning_rate": 7.921225582576323e-06, "loss": 0.1671, "num_input_tokens_seen": 87250928, "step": 40460 }, { "epoch": 7.426133235456047, "grad_norm": 19.05278205871582, "learning_rate": 7.920575669209774e-06, "loss": 0.1648, "num_input_tokens_seen": 87261936, "step": 40465 }, { "epoch": 7.427050835015599, "grad_norm": 7.5982136726379395, "learning_rate": 7.91992568093498e-06, "loss": 0.1883, "num_input_tokens_seen": 87273840, "step": 40470 }, { "epoch": 7.427968434575152, "grad_norm": 12.58369255065918, "learning_rate": 7.919275617768622e-06, "loss": 0.0762, "num_input_tokens_seen": 87284816, "step": 40475 }, { "epoch": 7.428886034134703, "grad_norm": 12.53950309753418, "learning_rate": 7.918625479727368e-06, "loss": 0.272, "num_input_tokens_seen": 87294928, "step": 40480 }, { "epoch": 7.429803633694256, "grad_norm": 21.75238800048828, "learning_rate": 7.917975266827893e-06, "loss": 0.3019, "num_input_tokens_seen": 87305776, "step": 40485 }, { "epoch": 7.430721233253808, "grad_norm": 10.662430763244629, "learning_rate": 7.917324979086878e-06, "loss": 0.1276, "num_input_tokens_seen": 87316400, "step": 40490 }, { "epoch": 7.43163883281336, "grad_norm": 18.409690856933594, "learning_rate": 7.916674616520995e-06, "loss": 0.4624, "num_input_tokens_seen": 87326800, "step": 40495 }, { "epoch": 7.4325564323729125, "grad_norm": 5.932810306549072, "learning_rate": 7.91602417914693e-06, "loss": 0.229, "num_input_tokens_seen": 87337520, "step": 40500 }, { "epoch": 7.433474031932465, "grad_norm": 13.1992769241333, "learning_rate": 7.915373666981364e-06, "loss": 0.5274, "num_input_tokens_seen": 87348848, "step": 40505 }, { "epoch": 7.434391631492017, "grad_norm": 55.847206115722656, "learning_rate": 7.914723080040982e-06, "loss": 0.3173, "num_input_tokens_seen": 87360880, "step": 40510 }, { "epoch": 7.435309231051569, "grad_norm": 22.949935913085938, "learning_rate": 7.91407241834247e-06, "loss": 0.2624, "num_input_tokens_seen": 87371088, "step": 40515 }, { "epoch": 7.436226830611122, "grad_norm": 27.80573272705078, "learning_rate": 7.913421681902518e-06, "loss": 0.296, "num_input_tokens_seen": 87380784, "step": 40520 }, { "epoch": 7.437144430170673, "grad_norm": 33.56031036376953, "learning_rate": 7.912770870737814e-06, "loss": 0.282, "num_input_tokens_seen": 87392848, "step": 40525 }, { "epoch": 7.438062029730226, "grad_norm": 2.6851422786712646, "learning_rate": 7.912119984865052e-06, "loss": 0.112, "num_input_tokens_seen": 87402928, "step": 40530 }, { "epoch": 7.438979629289778, "grad_norm": 14.822802543640137, "learning_rate": 7.911469024300927e-06, "loss": 0.1485, "num_input_tokens_seen": 87413200, "step": 40535 }, { "epoch": 7.43989722884933, "grad_norm": 41.21881866455078, "learning_rate": 7.910817989062131e-06, "loss": 0.3383, "num_input_tokens_seen": 87423728, "step": 40540 }, { "epoch": 7.440814828408882, "grad_norm": 39.77649688720703, "learning_rate": 7.910166879165367e-06, "loss": 0.3421, "num_input_tokens_seen": 87434352, "step": 40545 }, { "epoch": 7.441732427968435, "grad_norm": 10.26754379272461, "learning_rate": 7.909515694627333e-06, "loss": 0.2425, "num_input_tokens_seen": 87444144, "step": 40550 }, { "epoch": 7.4426500275279865, "grad_norm": 20.2640438079834, "learning_rate": 7.908864435464728e-06, "loss": 0.11, "num_input_tokens_seen": 87454352, "step": 40555 }, { "epoch": 7.443567627087539, "grad_norm": 9.366510391235352, "learning_rate": 7.908213101694263e-06, "loss": 0.2284, "num_input_tokens_seen": 87466192, "step": 40560 }, { "epoch": 7.444485226647092, "grad_norm": 9.929484367370605, "learning_rate": 7.907561693332638e-06, "loss": 0.1795, "num_input_tokens_seen": 87477072, "step": 40565 }, { "epoch": 7.445402826206643, "grad_norm": 18.717897415161133, "learning_rate": 7.90691021039656e-06, "loss": 0.1506, "num_input_tokens_seen": 87487248, "step": 40570 }, { "epoch": 7.446320425766196, "grad_norm": 13.372432708740234, "learning_rate": 7.906258652902741e-06, "loss": 0.3053, "num_input_tokens_seen": 87497360, "step": 40575 }, { "epoch": 7.447238025325748, "grad_norm": 15.297942161560059, "learning_rate": 7.905607020867892e-06, "loss": 0.1523, "num_input_tokens_seen": 87508656, "step": 40580 }, { "epoch": 7.4481556248853, "grad_norm": 3.1082491874694824, "learning_rate": 7.904955314308726e-06, "loss": 0.2015, "num_input_tokens_seen": 87519024, "step": 40585 }, { "epoch": 7.449073224444852, "grad_norm": 3.6123952865600586, "learning_rate": 7.90430353324196e-06, "loss": 0.1818, "num_input_tokens_seen": 87529424, "step": 40590 }, { "epoch": 7.449990824004405, "grad_norm": 12.7933931350708, "learning_rate": 7.903651677684308e-06, "loss": 0.3305, "num_input_tokens_seen": 87540720, "step": 40595 }, { "epoch": 7.450908423563956, "grad_norm": 2.3889498710632324, "learning_rate": 7.902999747652492e-06, "loss": 0.1656, "num_input_tokens_seen": 87551536, "step": 40600 }, { "epoch": 7.451826023123509, "grad_norm": 1.0331579446792603, "learning_rate": 7.90234774316323e-06, "loss": 0.3715, "num_input_tokens_seen": 87562192, "step": 40605 }, { "epoch": 7.4527436226830615, "grad_norm": 5.399735450744629, "learning_rate": 7.901695664233248e-06, "loss": 0.1516, "num_input_tokens_seen": 87573168, "step": 40610 }, { "epoch": 7.453661222242613, "grad_norm": 2.8568525314331055, "learning_rate": 7.90104351087927e-06, "loss": 0.1879, "num_input_tokens_seen": 87583472, "step": 40615 }, { "epoch": 7.454578821802166, "grad_norm": 26.32591438293457, "learning_rate": 7.90039128311802e-06, "loss": 0.1438, "num_input_tokens_seen": 87594064, "step": 40620 }, { "epoch": 7.455496421361718, "grad_norm": 49.578250885009766, "learning_rate": 7.899738980966231e-06, "loss": 0.3958, "num_input_tokens_seen": 87604944, "step": 40625 }, { "epoch": 7.45641402092127, "grad_norm": 1.13224458694458, "learning_rate": 7.89908660444063e-06, "loss": 0.0854, "num_input_tokens_seen": 87616624, "step": 40630 }, { "epoch": 7.457331620480822, "grad_norm": 22.96198081970215, "learning_rate": 7.89843415355795e-06, "loss": 0.2484, "num_input_tokens_seen": 87627312, "step": 40635 }, { "epoch": 7.458249220040375, "grad_norm": 26.30442237854004, "learning_rate": 7.897781628334928e-06, "loss": 0.2644, "num_input_tokens_seen": 87638480, "step": 40640 }, { "epoch": 7.459166819599926, "grad_norm": 2.111609697341919, "learning_rate": 7.897129028788297e-06, "loss": 0.2177, "num_input_tokens_seen": 87649232, "step": 40645 }, { "epoch": 7.460084419159479, "grad_norm": 0.5496144890785217, "learning_rate": 7.896476354934798e-06, "loss": 0.1207, "num_input_tokens_seen": 87660176, "step": 40650 }, { "epoch": 7.461002018719031, "grad_norm": 23.846574783325195, "learning_rate": 7.895823606791169e-06, "loss": 0.3319, "num_input_tokens_seen": 87670512, "step": 40655 }, { "epoch": 7.461919618278583, "grad_norm": 16.546571731567383, "learning_rate": 7.895170784374152e-06, "loss": 0.5544, "num_input_tokens_seen": 87682160, "step": 40660 }, { "epoch": 7.4628372178381355, "grad_norm": 5.004534721374512, "learning_rate": 7.894517887700492e-06, "loss": 0.0829, "num_input_tokens_seen": 87693008, "step": 40665 }, { "epoch": 7.463754817397688, "grad_norm": 0.795651376247406, "learning_rate": 7.893864916786934e-06, "loss": 0.2483, "num_input_tokens_seen": 87703792, "step": 40670 }, { "epoch": 7.46467241695724, "grad_norm": 38.820552825927734, "learning_rate": 7.893211871650226e-06, "loss": 0.2641, "num_input_tokens_seen": 87714000, "step": 40675 }, { "epoch": 7.465590016516792, "grad_norm": 8.876546859741211, "learning_rate": 7.892558752307118e-06, "loss": 0.2297, "num_input_tokens_seen": 87724208, "step": 40680 }, { "epoch": 7.466507616076345, "grad_norm": 15.14470386505127, "learning_rate": 7.891905558774359e-06, "loss": 0.3401, "num_input_tokens_seen": 87732656, "step": 40685 }, { "epoch": 7.467425215635896, "grad_norm": 17.236967086791992, "learning_rate": 7.891252291068707e-06, "loss": 0.4106, "num_input_tokens_seen": 87744464, "step": 40690 }, { "epoch": 7.468342815195449, "grad_norm": 19.29170036315918, "learning_rate": 7.890598949206915e-06, "loss": 0.2248, "num_input_tokens_seen": 87756368, "step": 40695 }, { "epoch": 7.469260414755001, "grad_norm": 4.466846942901611, "learning_rate": 7.889945533205738e-06, "loss": 0.182, "num_input_tokens_seen": 87766224, "step": 40700 }, { "epoch": 7.470178014314553, "grad_norm": 5.163993835449219, "learning_rate": 7.88929204308194e-06, "loss": 0.3576, "num_input_tokens_seen": 87775920, "step": 40705 }, { "epoch": 7.471095613874105, "grad_norm": 12.622753143310547, "learning_rate": 7.888638478852275e-06, "loss": 0.4284, "num_input_tokens_seen": 87787376, "step": 40710 }, { "epoch": 7.472013213433658, "grad_norm": 0.9375430345535278, "learning_rate": 7.887984840533514e-06, "loss": 0.212, "num_input_tokens_seen": 87796720, "step": 40715 }, { "epoch": 7.4729308129932095, "grad_norm": 0.7235944271087646, "learning_rate": 7.887331128142415e-06, "loss": 0.2766, "num_input_tokens_seen": 87808208, "step": 40720 }, { "epoch": 7.473848412552762, "grad_norm": 0.685938835144043, "learning_rate": 7.88667734169575e-06, "loss": 0.1152, "num_input_tokens_seen": 87819536, "step": 40725 }, { "epoch": 7.4747660121123145, "grad_norm": 48.41343307495117, "learning_rate": 7.886023481210281e-06, "loss": 0.1578, "num_input_tokens_seen": 87830192, "step": 40730 }, { "epoch": 7.475683611671866, "grad_norm": 0.857576847076416, "learning_rate": 7.885369546702785e-06, "loss": 0.1947, "num_input_tokens_seen": 87842288, "step": 40735 }, { "epoch": 7.476601211231419, "grad_norm": 17.77408790588379, "learning_rate": 7.884715538190034e-06, "loss": 0.3043, "num_input_tokens_seen": 87854064, "step": 40740 }, { "epoch": 7.477518810790971, "grad_norm": 14.422045707702637, "learning_rate": 7.884061455688797e-06, "loss": 0.4053, "num_input_tokens_seen": 87864496, "step": 40745 }, { "epoch": 7.478436410350523, "grad_norm": 4.025254726409912, "learning_rate": 7.883407299215856e-06, "loss": 0.2313, "num_input_tokens_seen": 87875248, "step": 40750 }, { "epoch": 7.479354009910075, "grad_norm": 11.888897895812988, "learning_rate": 7.882753068787984e-06, "loss": 0.5372, "num_input_tokens_seen": 87886896, "step": 40755 }, { "epoch": 7.480271609469628, "grad_norm": 30.159595489501953, "learning_rate": 7.882098764421968e-06, "loss": 0.1417, "num_input_tokens_seen": 87897488, "step": 40760 }, { "epoch": 7.481189209029179, "grad_norm": 15.355894088745117, "learning_rate": 7.881444386134583e-06, "loss": 0.2699, "num_input_tokens_seen": 87908976, "step": 40765 }, { "epoch": 7.482106808588732, "grad_norm": 9.078551292419434, "learning_rate": 7.880789933942614e-06, "loss": 0.06, "num_input_tokens_seen": 87920368, "step": 40770 }, { "epoch": 7.483024408148284, "grad_norm": 18.33244514465332, "learning_rate": 7.88013540786285e-06, "loss": 0.247, "num_input_tokens_seen": 87930192, "step": 40775 }, { "epoch": 7.483942007707836, "grad_norm": 59.2277717590332, "learning_rate": 7.879480807912077e-06, "loss": 0.2224, "num_input_tokens_seen": 87941776, "step": 40780 }, { "epoch": 7.4848596072673885, "grad_norm": 6.736498832702637, "learning_rate": 7.878826134107082e-06, "loss": 0.2506, "num_input_tokens_seen": 87952496, "step": 40785 }, { "epoch": 7.485777206826941, "grad_norm": 0.461505264043808, "learning_rate": 7.87817138646466e-06, "loss": 0.1945, "num_input_tokens_seen": 87964048, "step": 40790 }, { "epoch": 7.486694806386493, "grad_norm": 2.397270679473877, "learning_rate": 7.877516565001602e-06, "loss": 0.0235, "num_input_tokens_seen": 87974416, "step": 40795 }, { "epoch": 7.487612405946045, "grad_norm": 12.336344718933105, "learning_rate": 7.876861669734703e-06, "loss": 0.2876, "num_input_tokens_seen": 87985776, "step": 40800 }, { "epoch": 7.488530005505598, "grad_norm": 52.60045623779297, "learning_rate": 7.876206700680762e-06, "loss": 0.2293, "num_input_tokens_seen": 87996304, "step": 40805 }, { "epoch": 7.489447605065149, "grad_norm": 5.732468605041504, "learning_rate": 7.875551657856577e-06, "loss": 0.5028, "num_input_tokens_seen": 88007728, "step": 40810 }, { "epoch": 7.490365204624702, "grad_norm": 1.8161709308624268, "learning_rate": 7.87489654127895e-06, "loss": 0.2922, "num_input_tokens_seen": 88018288, "step": 40815 }, { "epoch": 7.491282804184254, "grad_norm": 4.579884052276611, "learning_rate": 7.87424135096468e-06, "loss": 0.0453, "num_input_tokens_seen": 88027888, "step": 40820 }, { "epoch": 7.492200403743806, "grad_norm": 0.4060969352722168, "learning_rate": 7.873586086930573e-06, "loss": 0.0774, "num_input_tokens_seen": 88038832, "step": 40825 }, { "epoch": 7.493118003303358, "grad_norm": 2.5505619049072266, "learning_rate": 7.87293074919344e-06, "loss": 0.0952, "num_input_tokens_seen": 88050096, "step": 40830 }, { "epoch": 7.494035602862911, "grad_norm": 22.921110153198242, "learning_rate": 7.872275337770084e-06, "loss": 0.2967, "num_input_tokens_seen": 88060912, "step": 40835 }, { "epoch": 7.4949532024224625, "grad_norm": 0.3563733994960785, "learning_rate": 7.871619852677317e-06, "loss": 0.2503, "num_input_tokens_seen": 88071696, "step": 40840 }, { "epoch": 7.495870801982015, "grad_norm": 10.813596725463867, "learning_rate": 7.870964293931952e-06, "loss": 0.2779, "num_input_tokens_seen": 88083152, "step": 40845 }, { "epoch": 7.496788401541568, "grad_norm": 6.738819599151611, "learning_rate": 7.870308661550802e-06, "loss": 0.3909, "num_input_tokens_seen": 88093264, "step": 40850 }, { "epoch": 7.497706001101119, "grad_norm": 33.05017852783203, "learning_rate": 7.869652955550684e-06, "loss": 0.2371, "num_input_tokens_seen": 88104144, "step": 40855 }, { "epoch": 7.498623600660672, "grad_norm": 25.780258178710938, "learning_rate": 7.868997175948417e-06, "loss": 0.3483, "num_input_tokens_seen": 88114768, "step": 40860 }, { "epoch": 7.499541200220224, "grad_norm": 15.413848876953125, "learning_rate": 7.868341322760815e-06, "loss": 0.4077, "num_input_tokens_seen": 88126832, "step": 40865 }, { "epoch": 7.500458799779776, "grad_norm": 6.768989086151123, "learning_rate": 7.867685396004704e-06, "loss": 0.4369, "num_input_tokens_seen": 88137360, "step": 40870 }, { "epoch": 7.501376399339328, "grad_norm": 3.9841647148132324, "learning_rate": 7.86702939569691e-06, "loss": 0.1641, "num_input_tokens_seen": 88147568, "step": 40875 }, { "epoch": 7.502293998898881, "grad_norm": 25.893165588378906, "learning_rate": 7.866373321854255e-06, "loss": 0.2425, "num_input_tokens_seen": 88158320, "step": 40880 }, { "epoch": 7.503211598458432, "grad_norm": 10.698966026306152, "learning_rate": 7.865717174493566e-06, "loss": 0.3015, "num_input_tokens_seen": 88167664, "step": 40885 }, { "epoch": 7.504129198017985, "grad_norm": 11.144908905029297, "learning_rate": 7.865060953631672e-06, "loss": 0.1885, "num_input_tokens_seen": 88178736, "step": 40890 }, { "epoch": 7.5050467975775375, "grad_norm": 6.80944299697876, "learning_rate": 7.864404659285406e-06, "loss": 0.2263, "num_input_tokens_seen": 88189488, "step": 40895 }, { "epoch": 7.505964397137089, "grad_norm": 4.145636558532715, "learning_rate": 7.8637482914716e-06, "loss": 0.2015, "num_input_tokens_seen": 88199472, "step": 40900 }, { "epoch": 7.506881996696642, "grad_norm": 4.600327968597412, "learning_rate": 7.863091850207088e-06, "loss": 0.1411, "num_input_tokens_seen": 88210544, "step": 40905 }, { "epoch": 7.507799596256194, "grad_norm": 0.9787130951881409, "learning_rate": 7.862435335508709e-06, "loss": 0.2018, "num_input_tokens_seen": 88221968, "step": 40910 }, { "epoch": 7.508717195815746, "grad_norm": 14.493002891540527, "learning_rate": 7.861778747393299e-06, "loss": 0.316, "num_input_tokens_seen": 88233520, "step": 40915 }, { "epoch": 7.509634795375298, "grad_norm": 2.174360990524292, "learning_rate": 7.8611220858777e-06, "loss": 0.0858, "num_input_tokens_seen": 88243728, "step": 40920 }, { "epoch": 7.510552394934851, "grad_norm": 9.732678413391113, "learning_rate": 7.860465350978752e-06, "loss": 0.134, "num_input_tokens_seen": 88254640, "step": 40925 }, { "epoch": 7.511469994494402, "grad_norm": 17.903539657592773, "learning_rate": 7.859808542713304e-06, "loss": 0.3898, "num_input_tokens_seen": 88265968, "step": 40930 }, { "epoch": 7.512387594053955, "grad_norm": 16.490442276000977, "learning_rate": 7.859151661098197e-06, "loss": 0.5958, "num_input_tokens_seen": 88276624, "step": 40935 }, { "epoch": 7.513305193613507, "grad_norm": 36.44301223754883, "learning_rate": 7.858494706150282e-06, "loss": 0.2155, "num_input_tokens_seen": 88287344, "step": 40940 }, { "epoch": 7.514222793173059, "grad_norm": 6.818782806396484, "learning_rate": 7.857837677886406e-06, "loss": 0.2901, "num_input_tokens_seen": 88297584, "step": 40945 }, { "epoch": 7.5151403927326115, "grad_norm": 0.6658656001091003, "learning_rate": 7.857180576323425e-06, "loss": 0.2523, "num_input_tokens_seen": 88308368, "step": 40950 }, { "epoch": 7.516057992292164, "grad_norm": 6.2558159828186035, "learning_rate": 7.85652340147819e-06, "loss": 0.1537, "num_input_tokens_seen": 88320336, "step": 40955 }, { "epoch": 7.516975591851716, "grad_norm": 9.968925476074219, "learning_rate": 7.855866153367557e-06, "loss": 0.2279, "num_input_tokens_seen": 88330928, "step": 40960 }, { "epoch": 7.517893191411268, "grad_norm": 24.865047454833984, "learning_rate": 7.855208832008383e-06, "loss": 0.1175, "num_input_tokens_seen": 88341424, "step": 40965 }, { "epoch": 7.518810790970821, "grad_norm": 1.7247177362442017, "learning_rate": 7.854551437417528e-06, "loss": 0.1244, "num_input_tokens_seen": 88351440, "step": 40970 }, { "epoch": 7.519728390530372, "grad_norm": 27.066959381103516, "learning_rate": 7.853893969611852e-06, "loss": 0.2531, "num_input_tokens_seen": 88363120, "step": 40975 }, { "epoch": 7.520645990089925, "grad_norm": 14.401097297668457, "learning_rate": 7.853236428608219e-06, "loss": 0.2548, "num_input_tokens_seen": 88374704, "step": 40980 }, { "epoch": 7.521563589649477, "grad_norm": 21.448463439941406, "learning_rate": 7.852578814423497e-06, "loss": 0.1738, "num_input_tokens_seen": 88386032, "step": 40985 }, { "epoch": 7.522481189209029, "grad_norm": 38.96174621582031, "learning_rate": 7.851921127074545e-06, "loss": 0.2425, "num_input_tokens_seen": 88397296, "step": 40990 }, { "epoch": 7.523398788768581, "grad_norm": 10.303441047668457, "learning_rate": 7.851263366578239e-06, "loss": 0.2302, "num_input_tokens_seen": 88407856, "step": 40995 }, { "epoch": 7.524316388328134, "grad_norm": 8.466526985168457, "learning_rate": 7.850605532951446e-06, "loss": 0.0641, "num_input_tokens_seen": 88418192, "step": 41000 }, { "epoch": 7.5252339878876855, "grad_norm": 7.234395503997803, "learning_rate": 7.849947626211037e-06, "loss": 0.3031, "num_input_tokens_seen": 88430288, "step": 41005 }, { "epoch": 7.526151587447238, "grad_norm": 14.909868240356445, "learning_rate": 7.84928964637389e-06, "loss": 0.244, "num_input_tokens_seen": 88440624, "step": 41010 }, { "epoch": 7.5270691870067905, "grad_norm": 11.383251190185547, "learning_rate": 7.848631593456881e-06, "loss": 0.2154, "num_input_tokens_seen": 88451376, "step": 41015 }, { "epoch": 7.527986786566342, "grad_norm": 44.47955322265625, "learning_rate": 7.847973467476885e-06, "loss": 0.1759, "num_input_tokens_seen": 88463312, "step": 41020 }, { "epoch": 7.528904386125895, "grad_norm": 13.306769371032715, "learning_rate": 7.847315268450783e-06, "loss": 0.3643, "num_input_tokens_seen": 88474160, "step": 41025 }, { "epoch": 7.529821985685447, "grad_norm": 1.5664242506027222, "learning_rate": 7.84665699639546e-06, "loss": 0.0994, "num_input_tokens_seen": 88484560, "step": 41030 }, { "epoch": 7.530739585244999, "grad_norm": 14.33426570892334, "learning_rate": 7.845998651327794e-06, "loss": 0.2513, "num_input_tokens_seen": 88494864, "step": 41035 }, { "epoch": 7.531657184804551, "grad_norm": 18.056394577026367, "learning_rate": 7.845340233264675e-06, "loss": 0.4446, "num_input_tokens_seen": 88505648, "step": 41040 }, { "epoch": 7.532574784364104, "grad_norm": 16.468273162841797, "learning_rate": 7.844681742222989e-06, "loss": 0.4055, "num_input_tokens_seen": 88517456, "step": 41045 }, { "epoch": 7.533492383923655, "grad_norm": 3.7879116535186768, "learning_rate": 7.844023178219624e-06, "loss": 0.3287, "num_input_tokens_seen": 88528080, "step": 41050 }, { "epoch": 7.534409983483208, "grad_norm": 0.5116102695465088, "learning_rate": 7.843364541271475e-06, "loss": 0.1645, "num_input_tokens_seen": 88538608, "step": 41055 }, { "epoch": 7.53532758304276, "grad_norm": 8.267159461975098, "learning_rate": 7.842705831395429e-06, "loss": 0.173, "num_input_tokens_seen": 88548368, "step": 41060 }, { "epoch": 7.536245182602312, "grad_norm": 0.2942846417427063, "learning_rate": 7.842047048608386e-06, "loss": 0.2889, "num_input_tokens_seen": 88558480, "step": 41065 }, { "epoch": 7.5371627821618645, "grad_norm": 2.4633588790893555, "learning_rate": 7.841388192927239e-06, "loss": 0.1807, "num_input_tokens_seen": 88568368, "step": 41070 }, { "epoch": 7.538080381721417, "grad_norm": 4.992413520812988, "learning_rate": 7.84072926436889e-06, "loss": 0.5434, "num_input_tokens_seen": 88579184, "step": 41075 }, { "epoch": 7.538997981280969, "grad_norm": 1.236264944076538, "learning_rate": 7.84007026295024e-06, "loss": 0.2387, "num_input_tokens_seen": 88588848, "step": 41080 }, { "epoch": 7.539915580840521, "grad_norm": 24.166526794433594, "learning_rate": 7.839411188688187e-06, "loss": 0.2454, "num_input_tokens_seen": 88598736, "step": 41085 }, { "epoch": 7.540833180400074, "grad_norm": 24.18166732788086, "learning_rate": 7.838752041599637e-06, "loss": 0.2461, "num_input_tokens_seen": 88609808, "step": 41090 }, { "epoch": 7.541750779959625, "grad_norm": 1.3882583379745483, "learning_rate": 7.838092821701499e-06, "loss": 0.2165, "num_input_tokens_seen": 88620720, "step": 41095 }, { "epoch": 7.542668379519178, "grad_norm": 16.97504997253418, "learning_rate": 7.837433529010679e-06, "loss": 0.4077, "num_input_tokens_seen": 88631344, "step": 41100 }, { "epoch": 7.54358597907873, "grad_norm": 17.4217472076416, "learning_rate": 7.836774163544084e-06, "loss": 0.1611, "num_input_tokens_seen": 88643760, "step": 41105 }, { "epoch": 7.544503578638282, "grad_norm": 0.43445003032684326, "learning_rate": 7.83611472531863e-06, "loss": 0.1482, "num_input_tokens_seen": 88655792, "step": 41110 }, { "epoch": 7.545421178197834, "grad_norm": 5.445255279541016, "learning_rate": 7.835455214351228e-06, "loss": 0.1012, "num_input_tokens_seen": 88666576, "step": 41115 }, { "epoch": 7.546338777757387, "grad_norm": 50.97035598754883, "learning_rate": 7.834795630658797e-06, "loss": 0.3809, "num_input_tokens_seen": 88678480, "step": 41120 }, { "epoch": 7.5472563773169385, "grad_norm": 2.3702309131622314, "learning_rate": 7.834135974258249e-06, "loss": 0.3919, "num_input_tokens_seen": 88689424, "step": 41125 }, { "epoch": 7.548173976876491, "grad_norm": 31.452808380126953, "learning_rate": 7.833476245166507e-06, "loss": 0.2611, "num_input_tokens_seen": 88701104, "step": 41130 }, { "epoch": 7.549091576436044, "grad_norm": 3.3782224655151367, "learning_rate": 7.83281644340049e-06, "loss": 0.3793, "num_input_tokens_seen": 88711056, "step": 41135 }, { "epoch": 7.550009175995595, "grad_norm": 0.3304215669631958, "learning_rate": 7.832156568977122e-06, "loss": 0.0857, "num_input_tokens_seen": 88721968, "step": 41140 }, { "epoch": 7.550926775555148, "grad_norm": 7.199509143829346, "learning_rate": 7.831496621913327e-06, "loss": 0.0345, "num_input_tokens_seen": 88731280, "step": 41145 }, { "epoch": 7.5518443751147, "grad_norm": 0.4610369801521301, "learning_rate": 7.830836602226032e-06, "loss": 0.0995, "num_input_tokens_seen": 88742608, "step": 41150 }, { "epoch": 7.552761974674252, "grad_norm": 1.964524507522583, "learning_rate": 7.830176509932167e-06, "loss": 0.1601, "num_input_tokens_seen": 88753040, "step": 41155 }, { "epoch": 7.553679574233804, "grad_norm": 9.932466506958008, "learning_rate": 7.82951634504866e-06, "loss": 0.3751, "num_input_tokens_seen": 88764528, "step": 41160 }, { "epoch": 7.554597173793357, "grad_norm": 22.636503219604492, "learning_rate": 7.828856107592443e-06, "loss": 0.5245, "num_input_tokens_seen": 88774416, "step": 41165 }, { "epoch": 7.555514773352908, "grad_norm": 0.5687596201896667, "learning_rate": 7.828195797580454e-06, "loss": 0.158, "num_input_tokens_seen": 88784880, "step": 41170 }, { "epoch": 7.556432372912461, "grad_norm": 75.06647491455078, "learning_rate": 7.827535415029624e-06, "loss": 0.2246, "num_input_tokens_seen": 88795088, "step": 41175 }, { "epoch": 7.5573499724720135, "grad_norm": 37.222232818603516, "learning_rate": 7.826874959956891e-06, "loss": 0.1286, "num_input_tokens_seen": 88804880, "step": 41180 }, { "epoch": 7.558267572031565, "grad_norm": 40.76626205444336, "learning_rate": 7.8262144323792e-06, "loss": 0.2145, "num_input_tokens_seen": 88816496, "step": 41185 }, { "epoch": 7.559185171591118, "grad_norm": 0.4222452640533447, "learning_rate": 7.825553832313486e-06, "loss": 0.271, "num_input_tokens_seen": 88828944, "step": 41190 }, { "epoch": 7.56010277115067, "grad_norm": 42.06650924682617, "learning_rate": 7.824893159776698e-06, "loss": 0.3202, "num_input_tokens_seen": 88838768, "step": 41195 }, { "epoch": 7.561020370710222, "grad_norm": 17.84971809387207, "learning_rate": 7.824232414785778e-06, "loss": 0.0965, "num_input_tokens_seen": 88850864, "step": 41200 }, { "epoch": 7.561937970269774, "grad_norm": 4.15632438659668, "learning_rate": 7.823571597357675e-06, "loss": 0.3016, "num_input_tokens_seen": 88861648, "step": 41205 }, { "epoch": 7.562855569829327, "grad_norm": 8.117081642150879, "learning_rate": 7.822910707509335e-06, "loss": 0.5229, "num_input_tokens_seen": 88872368, "step": 41210 }, { "epoch": 7.563773169388878, "grad_norm": 0.3486824035644531, "learning_rate": 7.82224974525771e-06, "loss": 0.2565, "num_input_tokens_seen": 88882832, "step": 41215 }, { "epoch": 7.564690768948431, "grad_norm": 0.9252259731292725, "learning_rate": 7.821588710619753e-06, "loss": 0.2588, "num_input_tokens_seen": 88894160, "step": 41220 }, { "epoch": 7.565608368507983, "grad_norm": 101.83052825927734, "learning_rate": 7.820927603612421e-06, "loss": 0.4441, "num_input_tokens_seen": 88905552, "step": 41225 }, { "epoch": 7.566525968067535, "grad_norm": 5.221270561218262, "learning_rate": 7.820266424252665e-06, "loss": 0.3686, "num_input_tokens_seen": 88917360, "step": 41230 }, { "epoch": 7.5674435676270875, "grad_norm": 82.89717102050781, "learning_rate": 7.819605172557448e-06, "loss": 0.2894, "num_input_tokens_seen": 88929008, "step": 41235 }, { "epoch": 7.56836116718664, "grad_norm": 29.613636016845703, "learning_rate": 7.818943848543729e-06, "loss": 0.2269, "num_input_tokens_seen": 88940240, "step": 41240 }, { "epoch": 7.569278766746192, "grad_norm": 0.9174361824989319, "learning_rate": 7.818282452228466e-06, "loss": 0.2082, "num_input_tokens_seen": 88951440, "step": 41245 }, { "epoch": 7.570196366305744, "grad_norm": 0.26947861909866333, "learning_rate": 7.817620983628629e-06, "loss": 0.137, "num_input_tokens_seen": 88963152, "step": 41250 }, { "epoch": 7.571113965865297, "grad_norm": 38.634559631347656, "learning_rate": 7.81695944276118e-06, "loss": 0.301, "num_input_tokens_seen": 88974928, "step": 41255 }, { "epoch": 7.572031565424848, "grad_norm": 24.641765594482422, "learning_rate": 7.816297829643088e-06, "loss": 0.2734, "num_input_tokens_seen": 88986704, "step": 41260 }, { "epoch": 7.572949164984401, "grad_norm": 11.804685592651367, "learning_rate": 7.815636144291321e-06, "loss": 0.3167, "num_input_tokens_seen": 88998160, "step": 41265 }, { "epoch": 7.573866764543953, "grad_norm": 0.5501947999000549, "learning_rate": 7.81497438672285e-06, "loss": 0.2899, "num_input_tokens_seen": 89007952, "step": 41270 }, { "epoch": 7.574784364103505, "grad_norm": 22.711545944213867, "learning_rate": 7.814312556954648e-06, "loss": 0.4651, "num_input_tokens_seen": 89018928, "step": 41275 }, { "epoch": 7.575701963663057, "grad_norm": 12.30671215057373, "learning_rate": 7.813650655003693e-06, "loss": 0.2519, "num_input_tokens_seen": 89031248, "step": 41280 }, { "epoch": 7.57661956322261, "grad_norm": 14.975008964538574, "learning_rate": 7.812988680886959e-06, "loss": 0.2674, "num_input_tokens_seen": 89041168, "step": 41285 }, { "epoch": 7.5775371627821615, "grad_norm": 1.3553129434585571, "learning_rate": 7.812326634621424e-06, "loss": 0.1451, "num_input_tokens_seen": 89052080, "step": 41290 }, { "epoch": 7.578454762341714, "grad_norm": 10.978583335876465, "learning_rate": 7.811664516224069e-06, "loss": 0.2957, "num_input_tokens_seen": 89061968, "step": 41295 }, { "epoch": 7.5793723619012665, "grad_norm": 3.9424538612365723, "learning_rate": 7.811002325711879e-06, "loss": 0.4304, "num_input_tokens_seen": 89071408, "step": 41300 }, { "epoch": 7.580289961460818, "grad_norm": 2.7102766036987305, "learning_rate": 7.810340063101835e-06, "loss": 0.1647, "num_input_tokens_seen": 89082736, "step": 41305 }, { "epoch": 7.581207561020371, "grad_norm": 23.331565856933594, "learning_rate": 7.809677728410922e-06, "loss": 0.3327, "num_input_tokens_seen": 89095248, "step": 41310 }, { "epoch": 7.582125160579923, "grad_norm": 16.71365737915039, "learning_rate": 7.809015321656132e-06, "loss": 0.2528, "num_input_tokens_seen": 89106352, "step": 41315 }, { "epoch": 7.583042760139475, "grad_norm": 9.632549285888672, "learning_rate": 7.808352842854454e-06, "loss": 0.1701, "num_input_tokens_seen": 89117584, "step": 41320 }, { "epoch": 7.583960359699027, "grad_norm": 31.63303565979004, "learning_rate": 7.807690292022876e-06, "loss": 0.43, "num_input_tokens_seen": 89128368, "step": 41325 }, { "epoch": 7.58487795925858, "grad_norm": 16.82042121887207, "learning_rate": 7.807027669178394e-06, "loss": 0.1897, "num_input_tokens_seen": 89139248, "step": 41330 }, { "epoch": 7.585795558818131, "grad_norm": 1.6216187477111816, "learning_rate": 7.806364974338001e-06, "loss": 0.085, "num_input_tokens_seen": 89149872, "step": 41335 }, { "epoch": 7.586713158377684, "grad_norm": 0.7307427525520325, "learning_rate": 7.805702207518699e-06, "loss": 0.3361, "num_input_tokens_seen": 89160976, "step": 41340 }, { "epoch": 7.587630757937236, "grad_norm": 19.80877113342285, "learning_rate": 7.805039368737483e-06, "loss": 0.2567, "num_input_tokens_seen": 89172432, "step": 41345 }, { "epoch": 7.588548357496788, "grad_norm": 13.987131118774414, "learning_rate": 7.804376458011354e-06, "loss": 0.3269, "num_input_tokens_seen": 89183600, "step": 41350 }, { "epoch": 7.5894659570563405, "grad_norm": 8.5684175491333, "learning_rate": 7.803713475357316e-06, "loss": 0.2338, "num_input_tokens_seen": 89194480, "step": 41355 }, { "epoch": 7.590383556615893, "grad_norm": 11.782665252685547, "learning_rate": 7.803050420792371e-06, "loss": 0.2553, "num_input_tokens_seen": 89204752, "step": 41360 }, { "epoch": 7.591301156175445, "grad_norm": 1.2218095064163208, "learning_rate": 7.802387294333528e-06, "loss": 0.2891, "num_input_tokens_seen": 89215600, "step": 41365 }, { "epoch": 7.592218755734997, "grad_norm": 0.7555840015411377, "learning_rate": 7.801724095997794e-06, "loss": 0.122, "num_input_tokens_seen": 89225520, "step": 41370 }, { "epoch": 7.59313635529455, "grad_norm": 1.2590113878250122, "learning_rate": 7.801060825802181e-06, "loss": 0.1992, "num_input_tokens_seen": 89235120, "step": 41375 }, { "epoch": 7.594053954854101, "grad_norm": 17.968223571777344, "learning_rate": 7.800397483763697e-06, "loss": 0.38, "num_input_tokens_seen": 89246160, "step": 41380 }, { "epoch": 7.594971554413654, "grad_norm": 0.8298125863075256, "learning_rate": 7.799734069899357e-06, "loss": 0.1592, "num_input_tokens_seen": 89257552, "step": 41385 }, { "epoch": 7.595889153973206, "grad_norm": 11.857087135314941, "learning_rate": 7.799070584226179e-06, "loss": 0.2312, "num_input_tokens_seen": 89268688, "step": 41390 }, { "epoch": 7.596806753532758, "grad_norm": 27.255109786987305, "learning_rate": 7.798407026761178e-06, "loss": 0.2048, "num_input_tokens_seen": 89278768, "step": 41395 }, { "epoch": 7.59772435309231, "grad_norm": 0.9320698976516724, "learning_rate": 7.797743397521376e-06, "loss": 0.0799, "num_input_tokens_seen": 89289712, "step": 41400 }, { "epoch": 7.598641952651863, "grad_norm": 1.0114185810089111, "learning_rate": 7.797079696523788e-06, "loss": 0.268, "num_input_tokens_seen": 89299984, "step": 41405 }, { "epoch": 7.599559552211415, "grad_norm": 36.50450134277344, "learning_rate": 7.796415923785443e-06, "loss": 0.2387, "num_input_tokens_seen": 89311120, "step": 41410 }, { "epoch": 7.600477151770967, "grad_norm": 10.059686660766602, "learning_rate": 7.795752079323364e-06, "loss": 0.133, "num_input_tokens_seen": 89323312, "step": 41415 }, { "epoch": 7.60139475133052, "grad_norm": 7.7129340171813965, "learning_rate": 7.795088163154578e-06, "loss": 0.3898, "num_input_tokens_seen": 89334288, "step": 41420 }, { "epoch": 7.602312350890072, "grad_norm": 0.5797349810600281, "learning_rate": 7.794424175296111e-06, "loss": 0.2622, "num_input_tokens_seen": 89344528, "step": 41425 }, { "epoch": 7.603229950449624, "grad_norm": 45.690364837646484, "learning_rate": 7.793760115764995e-06, "loss": 0.2599, "num_input_tokens_seen": 89355952, "step": 41430 }, { "epoch": 7.604147550009176, "grad_norm": 5.246316909790039, "learning_rate": 7.793095984578263e-06, "loss": 0.1461, "num_input_tokens_seen": 89366960, "step": 41435 }, { "epoch": 7.605065149568729, "grad_norm": 23.366962432861328, "learning_rate": 7.792431781752946e-06, "loss": 0.3126, "num_input_tokens_seen": 89376688, "step": 41440 }, { "epoch": 7.60598274912828, "grad_norm": 13.070926666259766, "learning_rate": 7.791767507306083e-06, "loss": 0.3209, "num_input_tokens_seen": 89386864, "step": 41445 }, { "epoch": 7.606900348687833, "grad_norm": 17.86489486694336, "learning_rate": 7.791103161254711e-06, "loss": 0.4007, "num_input_tokens_seen": 89397680, "step": 41450 }, { "epoch": 7.607817948247385, "grad_norm": 28.854900360107422, "learning_rate": 7.790438743615867e-06, "loss": 0.3929, "num_input_tokens_seen": 89407792, "step": 41455 }, { "epoch": 7.608735547806937, "grad_norm": 4.534712314605713, "learning_rate": 7.789774254406595e-06, "loss": 0.0759, "num_input_tokens_seen": 89418224, "step": 41460 }, { "epoch": 7.6096531473664895, "grad_norm": 24.903568267822266, "learning_rate": 7.789109693643936e-06, "loss": 0.1558, "num_input_tokens_seen": 89429072, "step": 41465 }, { "epoch": 7.610570746926042, "grad_norm": 61.4766960144043, "learning_rate": 7.788445061344938e-06, "loss": 0.1048, "num_input_tokens_seen": 89439504, "step": 41470 }, { "epoch": 7.611488346485594, "grad_norm": 9.245072364807129, "learning_rate": 7.787780357526646e-06, "loss": 0.6379, "num_input_tokens_seen": 89450480, "step": 41475 }, { "epoch": 7.612405946045146, "grad_norm": 4.7121262550354, "learning_rate": 7.787115582206105e-06, "loss": 0.2577, "num_input_tokens_seen": 89459344, "step": 41480 }, { "epoch": 7.613323545604699, "grad_norm": 4.86020565032959, "learning_rate": 7.786450735400373e-06, "loss": 0.157, "num_input_tokens_seen": 89470000, "step": 41485 }, { "epoch": 7.61424114516425, "grad_norm": 30.119976043701172, "learning_rate": 7.785785817126497e-06, "loss": 0.199, "num_input_tokens_seen": 89479440, "step": 41490 }, { "epoch": 7.615158744723803, "grad_norm": 15.020466804504395, "learning_rate": 7.785120827401531e-06, "loss": 0.0846, "num_input_tokens_seen": 89489232, "step": 41495 }, { "epoch": 7.616076344283355, "grad_norm": 14.29654598236084, "learning_rate": 7.784455766242535e-06, "loss": 0.3703, "num_input_tokens_seen": 89500464, "step": 41500 }, { "epoch": 7.616993943842907, "grad_norm": 9.922901153564453, "learning_rate": 7.783790633666562e-06, "loss": 0.2407, "num_input_tokens_seen": 89510160, "step": 41505 }, { "epoch": 7.617911543402459, "grad_norm": 0.8232864737510681, "learning_rate": 7.783125429690675e-06, "loss": 0.3478, "num_input_tokens_seen": 89520560, "step": 41510 }, { "epoch": 7.618829142962012, "grad_norm": 18.884246826171875, "learning_rate": 7.782460154331932e-06, "loss": 0.4286, "num_input_tokens_seen": 89531536, "step": 41515 }, { "epoch": 7.6197467425215635, "grad_norm": 62.21073532104492, "learning_rate": 7.7817948076074e-06, "loss": 0.2638, "num_input_tokens_seen": 89542800, "step": 41520 }, { "epoch": 7.620664342081116, "grad_norm": 49.649208068847656, "learning_rate": 7.781129389534144e-06, "loss": 0.5241, "num_input_tokens_seen": 89552528, "step": 41525 }, { "epoch": 7.6215819416406685, "grad_norm": 25.996322631835938, "learning_rate": 7.780463900129228e-06, "loss": 0.183, "num_input_tokens_seen": 89563408, "step": 41530 }, { "epoch": 7.62249954120022, "grad_norm": 40.54090118408203, "learning_rate": 7.779798339409721e-06, "loss": 0.2841, "num_input_tokens_seen": 89573232, "step": 41535 }, { "epoch": 7.623417140759773, "grad_norm": 21.73257064819336, "learning_rate": 7.779132707392695e-06, "loss": 0.1744, "num_input_tokens_seen": 89583664, "step": 41540 }, { "epoch": 7.624334740319325, "grad_norm": 19.775869369506836, "learning_rate": 7.778467004095225e-06, "loss": 0.3814, "num_input_tokens_seen": 89594288, "step": 41545 }, { "epoch": 7.625252339878877, "grad_norm": 21.93951416015625, "learning_rate": 7.77780122953438e-06, "loss": 0.3706, "num_input_tokens_seen": 89604816, "step": 41550 }, { "epoch": 7.626169939438429, "grad_norm": 27.002635955810547, "learning_rate": 7.77713538372724e-06, "loss": 0.1896, "num_input_tokens_seen": 89615856, "step": 41555 }, { "epoch": 7.627087538997982, "grad_norm": 10.262258529663086, "learning_rate": 7.77646946669088e-06, "loss": 0.29, "num_input_tokens_seen": 89626416, "step": 41560 }, { "epoch": 7.628005138557533, "grad_norm": 43.446380615234375, "learning_rate": 7.775803478442384e-06, "loss": 0.2952, "num_input_tokens_seen": 89637136, "step": 41565 }, { "epoch": 7.628922738117086, "grad_norm": 7.065023422241211, "learning_rate": 7.77513741899883e-06, "loss": 0.2416, "num_input_tokens_seen": 89648176, "step": 41570 }, { "epoch": 7.629840337676638, "grad_norm": 50.88926315307617, "learning_rate": 7.7744712883773e-06, "loss": 0.1795, "num_input_tokens_seen": 89658224, "step": 41575 }, { "epoch": 7.63075793723619, "grad_norm": 4.748318195343018, "learning_rate": 7.773805086594884e-06, "loss": 0.1405, "num_input_tokens_seen": 89668944, "step": 41580 }, { "epoch": 7.6316755367957425, "grad_norm": 4.884249687194824, "learning_rate": 7.773138813668666e-06, "loss": 0.1728, "num_input_tokens_seen": 89680144, "step": 41585 }, { "epoch": 7.632593136355295, "grad_norm": 4.4919562339782715, "learning_rate": 7.772472469615734e-06, "loss": 0.3157, "num_input_tokens_seen": 89690768, "step": 41590 }, { "epoch": 7.633510735914847, "grad_norm": 17.331417083740234, "learning_rate": 7.771806054453182e-06, "loss": 0.519, "num_input_tokens_seen": 89701392, "step": 41595 }, { "epoch": 7.634428335474399, "grad_norm": 11.634039878845215, "learning_rate": 7.771139568198101e-06, "loss": 0.3108, "num_input_tokens_seen": 89711472, "step": 41600 }, { "epoch": 7.635345935033952, "grad_norm": 3.001646041870117, "learning_rate": 7.770473010867582e-06, "loss": 0.1778, "num_input_tokens_seen": 89720432, "step": 41605 }, { "epoch": 7.636263534593503, "grad_norm": 9.7599515914917, "learning_rate": 7.769806382478728e-06, "loss": 0.1323, "num_input_tokens_seen": 89731216, "step": 41610 }, { "epoch": 7.637181134153056, "grad_norm": 21.80433464050293, "learning_rate": 7.76913968304863e-06, "loss": 0.2309, "num_input_tokens_seen": 89740624, "step": 41615 }, { "epoch": 7.638098733712608, "grad_norm": 9.012589454650879, "learning_rate": 7.768472912594392e-06, "loss": 0.1526, "num_input_tokens_seen": 89751280, "step": 41620 }, { "epoch": 7.63901633327216, "grad_norm": 4.701495170593262, "learning_rate": 7.767806071133116e-06, "loss": 0.1297, "num_input_tokens_seen": 89761648, "step": 41625 }, { "epoch": 7.639933932831712, "grad_norm": 9.748592376708984, "learning_rate": 7.767139158681901e-06, "loss": 0.1721, "num_input_tokens_seen": 89772752, "step": 41630 }, { "epoch": 7.640851532391265, "grad_norm": 14.061262130737305, "learning_rate": 7.766472175257857e-06, "loss": 0.334, "num_input_tokens_seen": 89783344, "step": 41635 }, { "epoch": 7.6417691319508165, "grad_norm": 26.775468826293945, "learning_rate": 7.76580512087809e-06, "loss": 0.35, "num_input_tokens_seen": 89794576, "step": 41640 }, { "epoch": 7.642686731510369, "grad_norm": 0.45693835616111755, "learning_rate": 7.765137995559706e-06, "loss": 0.3418, "num_input_tokens_seen": 89805040, "step": 41645 }, { "epoch": 7.6436043310699215, "grad_norm": 25.61665916442871, "learning_rate": 7.76447079931982e-06, "loss": 0.3364, "num_input_tokens_seen": 89815504, "step": 41650 }, { "epoch": 7.644521930629473, "grad_norm": 27.73756217956543, "learning_rate": 7.76380353217554e-06, "loss": 0.3179, "num_input_tokens_seen": 89827056, "step": 41655 }, { "epoch": 7.645439530189026, "grad_norm": 8.255729675292969, "learning_rate": 7.763136194143987e-06, "loss": 0.0579, "num_input_tokens_seen": 89838832, "step": 41660 }, { "epoch": 7.646357129748578, "grad_norm": 0.2841930091381073, "learning_rate": 7.76246878524227e-06, "loss": 0.2937, "num_input_tokens_seen": 89849712, "step": 41665 }, { "epoch": 7.64727472930813, "grad_norm": 37.14554214477539, "learning_rate": 7.761801305487511e-06, "loss": 0.4612, "num_input_tokens_seen": 89859760, "step": 41670 }, { "epoch": 7.648192328867682, "grad_norm": 11.247065544128418, "learning_rate": 7.76113375489683e-06, "loss": 0.3921, "num_input_tokens_seen": 89871056, "step": 41675 }, { "epoch": 7.649109928427235, "grad_norm": 3.398176431655884, "learning_rate": 7.760466133487346e-06, "loss": 0.217, "num_input_tokens_seen": 89880336, "step": 41680 }, { "epoch": 7.650027527986786, "grad_norm": 26.10242462158203, "learning_rate": 7.759798441276184e-06, "loss": 0.1421, "num_input_tokens_seen": 89890992, "step": 41685 }, { "epoch": 7.650945127546339, "grad_norm": 10.517354011535645, "learning_rate": 7.75913067828047e-06, "loss": 0.376, "num_input_tokens_seen": 89902832, "step": 41690 }, { "epoch": 7.651862727105891, "grad_norm": 6.0879669189453125, "learning_rate": 7.75846284451733e-06, "loss": 0.1448, "num_input_tokens_seen": 89914128, "step": 41695 }, { "epoch": 7.652780326665443, "grad_norm": 27.611907958984375, "learning_rate": 7.757794940003892e-06, "loss": 0.1941, "num_input_tokens_seen": 89925552, "step": 41700 }, { "epoch": 7.653697926224996, "grad_norm": 25.65779685974121, "learning_rate": 7.757126964757291e-06, "loss": 0.4964, "num_input_tokens_seen": 89936624, "step": 41705 }, { "epoch": 7.654615525784548, "grad_norm": 14.597596168518066, "learning_rate": 7.756458918794655e-06, "loss": 0.0975, "num_input_tokens_seen": 89947216, "step": 41710 }, { "epoch": 7.6555331253441, "grad_norm": 14.535293579101562, "learning_rate": 7.755790802133119e-06, "loss": 0.4136, "num_input_tokens_seen": 89958736, "step": 41715 }, { "epoch": 7.656450724903652, "grad_norm": 3.4352238178253174, "learning_rate": 7.75512261478982e-06, "loss": 0.4269, "num_input_tokens_seen": 89968912, "step": 41720 }, { "epoch": 7.657368324463205, "grad_norm": 10.209319114685059, "learning_rate": 7.754454356781898e-06, "loss": 0.4737, "num_input_tokens_seen": 89979216, "step": 41725 }, { "epoch": 7.658285924022756, "grad_norm": 24.26261329650879, "learning_rate": 7.753786028126488e-06, "loss": 0.1628, "num_input_tokens_seen": 89990960, "step": 41730 }, { "epoch": 7.659203523582309, "grad_norm": 8.702451705932617, "learning_rate": 7.753117628840736e-06, "loss": 0.1722, "num_input_tokens_seen": 90002064, "step": 41735 }, { "epoch": 7.660121123141861, "grad_norm": 11.27281379699707, "learning_rate": 7.752449158941785e-06, "loss": 0.2299, "num_input_tokens_seen": 90013232, "step": 41740 }, { "epoch": 7.661038722701413, "grad_norm": 10.035390853881836, "learning_rate": 7.751780618446778e-06, "loss": 0.3227, "num_input_tokens_seen": 90024368, "step": 41745 }, { "epoch": 7.6619563222609655, "grad_norm": 10.063578605651855, "learning_rate": 7.751112007372862e-06, "loss": 0.1232, "num_input_tokens_seen": 90034544, "step": 41750 }, { "epoch": 7.662873921820518, "grad_norm": 14.065406799316406, "learning_rate": 7.750443325737186e-06, "loss": 0.276, "num_input_tokens_seen": 90045424, "step": 41755 }, { "epoch": 7.66379152138007, "grad_norm": 0.16862252354621887, "learning_rate": 7.749774573556905e-06, "loss": 0.254, "num_input_tokens_seen": 90056240, "step": 41760 }, { "epoch": 7.664709120939622, "grad_norm": 41.32894515991211, "learning_rate": 7.749105750849165e-06, "loss": 0.1289, "num_input_tokens_seen": 90065744, "step": 41765 }, { "epoch": 7.665626720499175, "grad_norm": 17.476449966430664, "learning_rate": 7.748436857631125e-06, "loss": 0.0887, "num_input_tokens_seen": 90075984, "step": 41770 }, { "epoch": 7.666544320058726, "grad_norm": 49.612579345703125, "learning_rate": 7.747767893919938e-06, "loss": 0.2422, "num_input_tokens_seen": 90086480, "step": 41775 }, { "epoch": 7.667461919618279, "grad_norm": 17.019445419311523, "learning_rate": 7.747098859732762e-06, "loss": 0.085, "num_input_tokens_seen": 90097712, "step": 41780 }, { "epoch": 7.668379519177831, "grad_norm": 0.5877202153205872, "learning_rate": 7.74642975508676e-06, "loss": 0.0333, "num_input_tokens_seen": 90108144, "step": 41785 }, { "epoch": 7.669297118737383, "grad_norm": 21.244747161865234, "learning_rate": 7.74576057999909e-06, "loss": 0.157, "num_input_tokens_seen": 90118480, "step": 41790 }, { "epoch": 7.670214718296935, "grad_norm": 19.440216064453125, "learning_rate": 7.74509133448692e-06, "loss": 0.304, "num_input_tokens_seen": 90129872, "step": 41795 }, { "epoch": 7.671132317856488, "grad_norm": 13.553196907043457, "learning_rate": 7.744422018567408e-06, "loss": 0.2363, "num_input_tokens_seen": 90140336, "step": 41800 }, { "epoch": 7.6720499174160395, "grad_norm": 108.94821166992188, "learning_rate": 7.743752632257725e-06, "loss": 0.3858, "num_input_tokens_seen": 90148976, "step": 41805 }, { "epoch": 7.672967516975592, "grad_norm": 0.19426120817661285, "learning_rate": 7.743083175575041e-06, "loss": 0.3642, "num_input_tokens_seen": 90159440, "step": 41810 }, { "epoch": 7.6738851165351445, "grad_norm": 13.798185348510742, "learning_rate": 7.742413648536524e-06, "loss": 0.1882, "num_input_tokens_seen": 90171152, "step": 41815 }, { "epoch": 7.674802716094696, "grad_norm": 0.365093469619751, "learning_rate": 7.74174405115935e-06, "loss": 0.3643, "num_input_tokens_seen": 90182736, "step": 41820 }, { "epoch": 7.675720315654249, "grad_norm": 0.9305203557014465, "learning_rate": 7.741074383460687e-06, "loss": 0.234, "num_input_tokens_seen": 90193168, "step": 41825 }, { "epoch": 7.676637915213801, "grad_norm": 2.0383734703063965, "learning_rate": 7.740404645457716e-06, "loss": 0.2666, "num_input_tokens_seen": 90202960, "step": 41830 }, { "epoch": 7.677555514773353, "grad_norm": 49.690086364746094, "learning_rate": 7.739734837167612e-06, "loss": 0.0543, "num_input_tokens_seen": 90213424, "step": 41835 }, { "epoch": 7.678473114332905, "grad_norm": 15.032483100891113, "learning_rate": 7.739064958607556e-06, "loss": 0.3212, "num_input_tokens_seen": 90223472, "step": 41840 }, { "epoch": 7.679390713892458, "grad_norm": 0.20047275722026825, "learning_rate": 7.738395009794728e-06, "loss": 0.4695, "num_input_tokens_seen": 90234032, "step": 41845 }, { "epoch": 7.680308313452009, "grad_norm": 17.780826568603516, "learning_rate": 7.737724990746313e-06, "loss": 0.3861, "num_input_tokens_seen": 90245712, "step": 41850 }, { "epoch": 7.681225913011562, "grad_norm": 9.760285377502441, "learning_rate": 7.737054901479497e-06, "loss": 0.4055, "num_input_tokens_seen": 90256464, "step": 41855 }, { "epoch": 7.682143512571114, "grad_norm": 5.84177303314209, "learning_rate": 7.736384742011462e-06, "loss": 0.2237, "num_input_tokens_seen": 90267792, "step": 41860 }, { "epoch": 7.683061112130666, "grad_norm": 36.31642532348633, "learning_rate": 7.7357145123594e-06, "loss": 0.2519, "num_input_tokens_seen": 90277776, "step": 41865 }, { "epoch": 7.6839787116902185, "grad_norm": 41.10787582397461, "learning_rate": 7.7350442125405e-06, "loss": 0.213, "num_input_tokens_seen": 90287056, "step": 41870 }, { "epoch": 7.684896311249771, "grad_norm": 2.325871467590332, "learning_rate": 7.734373842571958e-06, "loss": 0.4731, "num_input_tokens_seen": 90297424, "step": 41875 }, { "epoch": 7.685813910809323, "grad_norm": 11.989400863647461, "learning_rate": 7.733703402470963e-06, "loss": 0.2396, "num_input_tokens_seen": 90307888, "step": 41880 }, { "epoch": 7.686731510368875, "grad_norm": 13.393136024475098, "learning_rate": 7.733032892254711e-06, "loss": 0.3416, "num_input_tokens_seen": 90317584, "step": 41885 }, { "epoch": 7.687649109928428, "grad_norm": 44.08612060546875, "learning_rate": 7.732362311940403e-06, "loss": 0.3032, "num_input_tokens_seen": 90327792, "step": 41890 }, { "epoch": 7.688566709487979, "grad_norm": 37.86954116821289, "learning_rate": 7.731691661545237e-06, "loss": 0.1438, "num_input_tokens_seen": 90339216, "step": 41895 }, { "epoch": 7.689484309047532, "grad_norm": 32.77317810058594, "learning_rate": 7.731020941086412e-06, "loss": 0.2858, "num_input_tokens_seen": 90348784, "step": 41900 }, { "epoch": 7.690401908607084, "grad_norm": 20.665494918823242, "learning_rate": 7.730350150581134e-06, "loss": 0.4609, "num_input_tokens_seen": 90359984, "step": 41905 }, { "epoch": 7.691319508166636, "grad_norm": 10.605206489562988, "learning_rate": 7.729679290046606e-06, "loss": 0.2101, "num_input_tokens_seen": 90370704, "step": 41910 }, { "epoch": 7.692237107726188, "grad_norm": 16.21033477783203, "learning_rate": 7.729008359500033e-06, "loss": 0.1165, "num_input_tokens_seen": 90382192, "step": 41915 }, { "epoch": 7.693154707285741, "grad_norm": 0.6750629544258118, "learning_rate": 7.728337358958627e-06, "loss": 0.4168, "num_input_tokens_seen": 90392656, "step": 41920 }, { "epoch": 7.6940723068452925, "grad_norm": 11.32298469543457, "learning_rate": 7.727666288439595e-06, "loss": 0.2704, "num_input_tokens_seen": 90402928, "step": 41925 }, { "epoch": 7.694989906404845, "grad_norm": 3.953756809234619, "learning_rate": 7.726995147960153e-06, "loss": 0.1973, "num_input_tokens_seen": 90413904, "step": 41930 }, { "epoch": 7.6959075059643975, "grad_norm": 74.23918151855469, "learning_rate": 7.726323937537508e-06, "loss": 0.1901, "num_input_tokens_seen": 90425712, "step": 41935 }, { "epoch": 7.696825105523949, "grad_norm": 2.143305778503418, "learning_rate": 7.725652657188883e-06, "loss": 0.1221, "num_input_tokens_seen": 90436912, "step": 41940 }, { "epoch": 7.697742705083502, "grad_norm": 13.401360511779785, "learning_rate": 7.72498130693149e-06, "loss": 0.2805, "num_input_tokens_seen": 90448080, "step": 41945 }, { "epoch": 7.698660304643054, "grad_norm": 13.443305969238281, "learning_rate": 7.724309886782548e-06, "loss": 0.363, "num_input_tokens_seen": 90459408, "step": 41950 }, { "epoch": 7.699577904202606, "grad_norm": 6.665759563446045, "learning_rate": 7.723638396759283e-06, "loss": 0.3221, "num_input_tokens_seen": 90470544, "step": 41955 }, { "epoch": 7.700495503762158, "grad_norm": 0.8358678817749023, "learning_rate": 7.722966836878914e-06, "loss": 0.2522, "num_input_tokens_seen": 90480240, "step": 41960 }, { "epoch": 7.701413103321711, "grad_norm": 24.484098434448242, "learning_rate": 7.722295207158663e-06, "loss": 0.2799, "num_input_tokens_seen": 90491088, "step": 41965 }, { "epoch": 7.702330702881262, "grad_norm": 49.773189544677734, "learning_rate": 7.721623507615761e-06, "loss": 0.179, "num_input_tokens_seen": 90501392, "step": 41970 }, { "epoch": 7.703248302440815, "grad_norm": 69.70437622070312, "learning_rate": 7.720951738267434e-06, "loss": 0.3762, "num_input_tokens_seen": 90512880, "step": 41975 }, { "epoch": 7.704165902000367, "grad_norm": 21.86023712158203, "learning_rate": 7.720279899130914e-06, "loss": 0.3889, "num_input_tokens_seen": 90523024, "step": 41980 }, { "epoch": 7.705083501559919, "grad_norm": 3.1832327842712402, "learning_rate": 7.719607990223427e-06, "loss": 0.0966, "num_input_tokens_seen": 90534096, "step": 41985 }, { "epoch": 7.706001101119472, "grad_norm": 0.7235594987869263, "learning_rate": 7.718936011562213e-06, "loss": 0.3406, "num_input_tokens_seen": 90544048, "step": 41990 }, { "epoch": 7.706918700679024, "grad_norm": 40.592689514160156, "learning_rate": 7.718263963164502e-06, "loss": 0.1604, "num_input_tokens_seen": 90554736, "step": 41995 }, { "epoch": 7.707836300238576, "grad_norm": 20.114917755126953, "learning_rate": 7.717591845047533e-06, "loss": 0.1019, "num_input_tokens_seen": 90565968, "step": 42000 }, { "epoch": 7.708753899798128, "grad_norm": 17.859615325927734, "learning_rate": 7.716919657228548e-06, "loss": 0.2236, "num_input_tokens_seen": 90576496, "step": 42005 }, { "epoch": 7.709671499357681, "grad_norm": 7.431556701660156, "learning_rate": 7.716247399724783e-06, "loss": 0.2591, "num_input_tokens_seen": 90587056, "step": 42010 }, { "epoch": 7.710589098917232, "grad_norm": 34.946022033691406, "learning_rate": 7.715575072553482e-06, "loss": 0.2595, "num_input_tokens_seen": 90597680, "step": 42015 }, { "epoch": 7.711506698476785, "grad_norm": 12.379385948181152, "learning_rate": 7.71490267573189e-06, "loss": 0.2751, "num_input_tokens_seen": 90609104, "step": 42020 }, { "epoch": 7.712424298036337, "grad_norm": 16.457683563232422, "learning_rate": 7.71423020927725e-06, "loss": 0.2253, "num_input_tokens_seen": 90618960, "step": 42025 }, { "epoch": 7.713341897595889, "grad_norm": 5.4487738609313965, "learning_rate": 7.713557673206813e-06, "loss": 0.3188, "num_input_tokens_seen": 90628272, "step": 42030 }, { "epoch": 7.7142594971554415, "grad_norm": 2.855053663253784, "learning_rate": 7.712885067537827e-06, "loss": 0.2172, "num_input_tokens_seen": 90638128, "step": 42035 }, { "epoch": 7.715177096714994, "grad_norm": 15.995429039001465, "learning_rate": 7.712212392287546e-06, "loss": 0.2955, "num_input_tokens_seen": 90649328, "step": 42040 }, { "epoch": 7.716094696274546, "grad_norm": 4.996705532073975, "learning_rate": 7.711539647473219e-06, "loss": 0.2194, "num_input_tokens_seen": 90659088, "step": 42045 }, { "epoch": 7.717012295834098, "grad_norm": 2.7203500270843506, "learning_rate": 7.710866833112101e-06, "loss": 0.1566, "num_input_tokens_seen": 90669392, "step": 42050 }, { "epoch": 7.717929895393651, "grad_norm": 22.005569458007812, "learning_rate": 7.710193949221452e-06, "loss": 0.4658, "num_input_tokens_seen": 90680496, "step": 42055 }, { "epoch": 7.718847494953202, "grad_norm": 24.728456497192383, "learning_rate": 7.709520995818527e-06, "loss": 0.292, "num_input_tokens_seen": 90691504, "step": 42060 }, { "epoch": 7.719765094512755, "grad_norm": 0.6023089289665222, "learning_rate": 7.70884797292059e-06, "loss": 0.0281, "num_input_tokens_seen": 90702640, "step": 42065 }, { "epoch": 7.720682694072307, "grad_norm": 18.16162109375, "learning_rate": 7.708174880544899e-06, "loss": 0.1892, "num_input_tokens_seen": 90712528, "step": 42070 }, { "epoch": 7.721600293631859, "grad_norm": 4.1876044273376465, "learning_rate": 7.707501718708721e-06, "loss": 0.2607, "num_input_tokens_seen": 90722032, "step": 42075 }, { "epoch": 7.722517893191411, "grad_norm": 16.862350463867188, "learning_rate": 7.706828487429318e-06, "loss": 0.2119, "num_input_tokens_seen": 90731440, "step": 42080 }, { "epoch": 7.723435492750964, "grad_norm": 8.04917049407959, "learning_rate": 7.706155186723962e-06, "loss": 0.3592, "num_input_tokens_seen": 90742192, "step": 42085 }, { "epoch": 7.7243530923105155, "grad_norm": 6.026425838470459, "learning_rate": 7.705481816609918e-06, "loss": 0.2948, "num_input_tokens_seen": 90753520, "step": 42090 }, { "epoch": 7.725270691870068, "grad_norm": 1.7046663761138916, "learning_rate": 7.70480837710446e-06, "loss": 0.2579, "num_input_tokens_seen": 90762928, "step": 42095 }, { "epoch": 7.7261882914296205, "grad_norm": 7.35628604888916, "learning_rate": 7.704134868224857e-06, "loss": 0.0907, "num_input_tokens_seen": 90773648, "step": 42100 }, { "epoch": 7.727105890989172, "grad_norm": 4.199678421020508, "learning_rate": 7.703461289988387e-06, "loss": 0.1844, "num_input_tokens_seen": 90784816, "step": 42105 }, { "epoch": 7.728023490548725, "grad_norm": 6.613985061645508, "learning_rate": 7.702787642412326e-06, "loss": 0.2831, "num_input_tokens_seen": 90796048, "step": 42110 }, { "epoch": 7.728941090108277, "grad_norm": 3.6762149333953857, "learning_rate": 7.70211392551395e-06, "loss": 0.2409, "num_input_tokens_seen": 90807504, "step": 42115 }, { "epoch": 7.729858689667829, "grad_norm": 15.663844108581543, "learning_rate": 7.701440139310538e-06, "loss": 0.3238, "num_input_tokens_seen": 90818512, "step": 42120 }, { "epoch": 7.730776289227381, "grad_norm": 4.280213832855225, "learning_rate": 7.700766283819376e-06, "loss": 0.052, "num_input_tokens_seen": 90829008, "step": 42125 }, { "epoch": 7.731693888786934, "grad_norm": 30.87662124633789, "learning_rate": 7.700092359057743e-06, "loss": 0.4672, "num_input_tokens_seen": 90839760, "step": 42130 }, { "epoch": 7.732611488346485, "grad_norm": 37.329444885253906, "learning_rate": 7.699418365042928e-06, "loss": 0.1543, "num_input_tokens_seen": 90850608, "step": 42135 }, { "epoch": 7.733529087906038, "grad_norm": 7.490741729736328, "learning_rate": 7.698744301792213e-06, "loss": 0.19, "num_input_tokens_seen": 90861104, "step": 42140 }, { "epoch": 7.73444668746559, "grad_norm": 15.7926607131958, "learning_rate": 7.69807016932289e-06, "loss": 0.4535, "num_input_tokens_seen": 90870640, "step": 42145 }, { "epoch": 7.735364287025142, "grad_norm": 18.392253875732422, "learning_rate": 7.697395967652248e-06, "loss": 0.1394, "num_input_tokens_seen": 90880112, "step": 42150 }, { "epoch": 7.7362818865846945, "grad_norm": 7.713744163513184, "learning_rate": 7.696721696797583e-06, "loss": 0.1326, "num_input_tokens_seen": 90891696, "step": 42155 }, { "epoch": 7.737199486144247, "grad_norm": 11.738924980163574, "learning_rate": 7.696047356776184e-06, "loss": 0.4231, "num_input_tokens_seen": 90902896, "step": 42160 }, { "epoch": 7.738117085703799, "grad_norm": 24.69038200378418, "learning_rate": 7.69537294760535e-06, "loss": 0.0635, "num_input_tokens_seen": 90912688, "step": 42165 }, { "epoch": 7.739034685263351, "grad_norm": 68.0609359741211, "learning_rate": 7.694698469302373e-06, "loss": 0.2906, "num_input_tokens_seen": 90923280, "step": 42170 }, { "epoch": 7.739952284822904, "grad_norm": 0.4272153079509735, "learning_rate": 7.694023921884562e-06, "loss": 0.2058, "num_input_tokens_seen": 90935280, "step": 42175 }, { "epoch": 7.740869884382455, "grad_norm": 13.247282028198242, "learning_rate": 7.693349305369208e-06, "loss": 0.4401, "num_input_tokens_seen": 90945200, "step": 42180 }, { "epoch": 7.741787483942008, "grad_norm": 8.56750202178955, "learning_rate": 7.692674619773622e-06, "loss": 0.0307, "num_input_tokens_seen": 90956080, "step": 42185 }, { "epoch": 7.74270508350156, "grad_norm": 18.81876564025879, "learning_rate": 7.691999865115106e-06, "loss": 0.3308, "num_input_tokens_seen": 90966928, "step": 42190 }, { "epoch": 7.743622683061112, "grad_norm": 14.954582214355469, "learning_rate": 7.691325041410962e-06, "loss": 0.5001, "num_input_tokens_seen": 90977552, "step": 42195 }, { "epoch": 7.744540282620664, "grad_norm": 5.962055206298828, "learning_rate": 7.690650148678505e-06, "loss": 0.2695, "num_input_tokens_seen": 90988176, "step": 42200 }, { "epoch": 7.745457882180217, "grad_norm": 1.4995450973510742, "learning_rate": 7.689975186935041e-06, "loss": 0.3864, "num_input_tokens_seen": 90998448, "step": 42205 }, { "epoch": 7.7463754817397685, "grad_norm": 3.9708447456359863, "learning_rate": 7.68930015619788e-06, "loss": 0.4086, "num_input_tokens_seen": 91008752, "step": 42210 }, { "epoch": 7.747293081299321, "grad_norm": 15.723808288574219, "learning_rate": 7.688625056484343e-06, "loss": 0.4975, "num_input_tokens_seen": 91017808, "step": 42215 }, { "epoch": 7.7482106808588735, "grad_norm": 20.838890075683594, "learning_rate": 7.687949887811736e-06, "loss": 0.2618, "num_input_tokens_seen": 91028176, "step": 42220 }, { "epoch": 7.749128280418425, "grad_norm": 0.7303740978240967, "learning_rate": 7.687274650197383e-06, "loss": 0.3568, "num_input_tokens_seen": 91039120, "step": 42225 }, { "epoch": 7.750045879977978, "grad_norm": 2.315281629562378, "learning_rate": 7.686599343658598e-06, "loss": 0.1177, "num_input_tokens_seen": 91049040, "step": 42230 }, { "epoch": 7.75096347953753, "grad_norm": 25.83125877380371, "learning_rate": 7.685923968212704e-06, "loss": 0.1911, "num_input_tokens_seen": 91059856, "step": 42235 }, { "epoch": 7.751881079097082, "grad_norm": 11.603554725646973, "learning_rate": 7.685248523877025e-06, "loss": 0.2256, "num_input_tokens_seen": 91071920, "step": 42240 }, { "epoch": 7.752798678656634, "grad_norm": 0.43092960119247437, "learning_rate": 7.684573010668884e-06, "loss": 0.1879, "num_input_tokens_seen": 91082192, "step": 42245 }, { "epoch": 7.753716278216187, "grad_norm": 12.959521293640137, "learning_rate": 7.683897428605603e-06, "loss": 0.3551, "num_input_tokens_seen": 91092656, "step": 42250 }, { "epoch": 7.754633877775738, "grad_norm": 8.112160682678223, "learning_rate": 7.683221777704512e-06, "loss": 0.2008, "num_input_tokens_seen": 91103376, "step": 42255 }, { "epoch": 7.755551477335291, "grad_norm": 46.11621856689453, "learning_rate": 7.682546057982943e-06, "loss": 0.3501, "num_input_tokens_seen": 91114608, "step": 42260 }, { "epoch": 7.756469076894843, "grad_norm": 3.164118528366089, "learning_rate": 7.681870269458226e-06, "loss": 0.249, "num_input_tokens_seen": 91124368, "step": 42265 }, { "epoch": 7.757386676454395, "grad_norm": 15.029585838317871, "learning_rate": 7.681194412147691e-06, "loss": 0.3322, "num_input_tokens_seen": 91135344, "step": 42270 }, { "epoch": 7.758304276013948, "grad_norm": 5.549605369567871, "learning_rate": 7.680518486068677e-06, "loss": 0.0839, "num_input_tokens_seen": 91145296, "step": 42275 }, { "epoch": 7.7592218755735, "grad_norm": 0.7232574224472046, "learning_rate": 7.679842491238517e-06, "loss": 0.0727, "num_input_tokens_seen": 91157072, "step": 42280 }, { "epoch": 7.760139475133052, "grad_norm": 1.4772145748138428, "learning_rate": 7.67916642767455e-06, "loss": 0.0274, "num_input_tokens_seen": 91168304, "step": 42285 }, { "epoch": 7.761057074692604, "grad_norm": 1.2376295328140259, "learning_rate": 7.678490295394116e-06, "loss": 0.2174, "num_input_tokens_seen": 91179376, "step": 42290 }, { "epoch": 7.761974674252157, "grad_norm": 1.155466079711914, "learning_rate": 7.677814094414557e-06, "loss": 0.1992, "num_input_tokens_seen": 91190544, "step": 42295 }, { "epoch": 7.762892273811708, "grad_norm": 10.245113372802734, "learning_rate": 7.677137824753219e-06, "loss": 0.4221, "num_input_tokens_seen": 91201232, "step": 42300 }, { "epoch": 7.763809873371261, "grad_norm": 14.048638343811035, "learning_rate": 7.676461486427444e-06, "loss": 0.4501, "num_input_tokens_seen": 91212080, "step": 42305 }, { "epoch": 7.764727472930813, "grad_norm": 22.286638259887695, "learning_rate": 7.67578507945458e-06, "loss": 0.4959, "num_input_tokens_seen": 91222736, "step": 42310 }, { "epoch": 7.765645072490365, "grad_norm": 49.807655334472656, "learning_rate": 7.675108603851976e-06, "loss": 0.2564, "num_input_tokens_seen": 91232912, "step": 42315 }, { "epoch": 7.7665626720499175, "grad_norm": 80.65010833740234, "learning_rate": 7.67443205963698e-06, "loss": 0.3865, "num_input_tokens_seen": 91243792, "step": 42320 }, { "epoch": 7.76748027160947, "grad_norm": 13.16765308380127, "learning_rate": 7.673755446826949e-06, "loss": 0.4587, "num_input_tokens_seen": 91253808, "step": 42325 }, { "epoch": 7.768397871169022, "grad_norm": 9.001132011413574, "learning_rate": 7.673078765439235e-06, "loss": 0.1138, "num_input_tokens_seen": 91265136, "step": 42330 }, { "epoch": 7.769315470728574, "grad_norm": 4.542629241943359, "learning_rate": 7.672402015491194e-06, "loss": 0.2627, "num_input_tokens_seen": 91275568, "step": 42335 }, { "epoch": 7.770233070288127, "grad_norm": 20.60993766784668, "learning_rate": 7.67172519700018e-06, "loss": 0.1924, "num_input_tokens_seen": 91286544, "step": 42340 }, { "epoch": 7.771150669847678, "grad_norm": 3.514575958251953, "learning_rate": 7.671048309983558e-06, "loss": 0.3459, "num_input_tokens_seen": 91297520, "step": 42345 }, { "epoch": 7.772068269407231, "grad_norm": 5.520260334014893, "learning_rate": 7.670371354458686e-06, "loss": 0.3403, "num_input_tokens_seen": 91308816, "step": 42350 }, { "epoch": 7.772985868966783, "grad_norm": 25.544504165649414, "learning_rate": 7.669694330442929e-06, "loss": 0.156, "num_input_tokens_seen": 91318096, "step": 42355 }, { "epoch": 7.773903468526335, "grad_norm": 15.46055793762207, "learning_rate": 7.669017237953648e-06, "loss": 0.2547, "num_input_tokens_seen": 91327632, "step": 42360 }, { "epoch": 7.774821068085887, "grad_norm": 5.491621017456055, "learning_rate": 7.668340077008212e-06, "loss": 0.2998, "num_input_tokens_seen": 91339728, "step": 42365 }, { "epoch": 7.77573866764544, "grad_norm": 1.403964877128601, "learning_rate": 7.667662847623989e-06, "loss": 0.0357, "num_input_tokens_seen": 91350064, "step": 42370 }, { "epoch": 7.7766562672049915, "grad_norm": 13.216615676879883, "learning_rate": 7.66698554981835e-06, "loss": 0.2902, "num_input_tokens_seen": 91361808, "step": 42375 }, { "epoch": 7.777573866764544, "grad_norm": 1.0029575824737549, "learning_rate": 7.666308183608662e-06, "loss": 0.41, "num_input_tokens_seen": 91371536, "step": 42380 }, { "epoch": 7.7784914663240965, "grad_norm": 1.1232563257217407, "learning_rate": 7.665630749012303e-06, "loss": 0.1474, "num_input_tokens_seen": 91381616, "step": 42385 }, { "epoch": 7.779409065883648, "grad_norm": 3.8040120601654053, "learning_rate": 7.664953246046644e-06, "loss": 0.0993, "num_input_tokens_seen": 91391184, "step": 42390 }, { "epoch": 7.780326665443201, "grad_norm": 0.4982285499572754, "learning_rate": 7.664275674729068e-06, "loss": 0.3941, "num_input_tokens_seen": 91402480, "step": 42395 }, { "epoch": 7.781244265002753, "grad_norm": 4.1174845695495605, "learning_rate": 7.663598035076949e-06, "loss": 0.047, "num_input_tokens_seen": 91414096, "step": 42400 }, { "epoch": 7.782161864562305, "grad_norm": 16.136934280395508, "learning_rate": 7.662920327107669e-06, "loss": 0.2715, "num_input_tokens_seen": 91426256, "step": 42405 }, { "epoch": 7.783079464121857, "grad_norm": 13.674408912658691, "learning_rate": 7.66224255083861e-06, "loss": 0.0943, "num_input_tokens_seen": 91437456, "step": 42410 }, { "epoch": 7.78399706368141, "grad_norm": 43.482418060302734, "learning_rate": 7.661564706287155e-06, "loss": 0.2706, "num_input_tokens_seen": 91449904, "step": 42415 }, { "epoch": 7.784914663240961, "grad_norm": 5.6390156745910645, "learning_rate": 7.66088679347069e-06, "loss": 0.1992, "num_input_tokens_seen": 91460208, "step": 42420 }, { "epoch": 7.785832262800514, "grad_norm": 3.0176703929901123, "learning_rate": 7.660208812406605e-06, "loss": 0.0267, "num_input_tokens_seen": 91471056, "step": 42425 }, { "epoch": 7.786749862360066, "grad_norm": 27.488367080688477, "learning_rate": 7.659530763112284e-06, "loss": 0.3746, "num_input_tokens_seen": 91482256, "step": 42430 }, { "epoch": 7.787667461919618, "grad_norm": 5.932308673858643, "learning_rate": 7.658852645605122e-06, "loss": 0.1826, "num_input_tokens_seen": 91493744, "step": 42435 }, { "epoch": 7.7885850614791705, "grad_norm": 14.850872039794922, "learning_rate": 7.65817445990251e-06, "loss": 0.2309, "num_input_tokens_seen": 91503856, "step": 42440 }, { "epoch": 7.789502661038723, "grad_norm": 0.9779007434844971, "learning_rate": 7.657496206021843e-06, "loss": 0.215, "num_input_tokens_seen": 91515728, "step": 42445 }, { "epoch": 7.790420260598275, "grad_norm": 18.77098274230957, "learning_rate": 7.656817883980518e-06, "loss": 0.4325, "num_input_tokens_seen": 91526928, "step": 42450 }, { "epoch": 7.791337860157827, "grad_norm": 11.986090660095215, "learning_rate": 7.656139493795932e-06, "loss": 0.3625, "num_input_tokens_seen": 91537392, "step": 42455 }, { "epoch": 7.79225545971738, "grad_norm": 27.22732162475586, "learning_rate": 7.655461035485483e-06, "loss": 0.5534, "num_input_tokens_seen": 91547440, "step": 42460 }, { "epoch": 7.793173059276931, "grad_norm": 48.828304290771484, "learning_rate": 7.654782509066577e-06, "loss": 0.3328, "num_input_tokens_seen": 91558352, "step": 42465 }, { "epoch": 7.794090658836484, "grad_norm": 15.755846977233887, "learning_rate": 7.654103914556611e-06, "loss": 0.4361, "num_input_tokens_seen": 91569392, "step": 42470 }, { "epoch": 7.795008258396036, "grad_norm": 9.637175559997559, "learning_rate": 7.653425251972995e-06, "loss": 0.2427, "num_input_tokens_seen": 91579952, "step": 42475 }, { "epoch": 7.795925857955588, "grad_norm": 14.56929874420166, "learning_rate": 7.652746521333132e-06, "loss": 0.3656, "num_input_tokens_seen": 91590128, "step": 42480 }, { "epoch": 7.79684345751514, "grad_norm": 0.35117867588996887, "learning_rate": 7.652067722654435e-06, "loss": 0.3133, "num_input_tokens_seen": 91601072, "step": 42485 }, { "epoch": 7.797761057074693, "grad_norm": 20.603717803955078, "learning_rate": 7.651388855954308e-06, "loss": 0.4523, "num_input_tokens_seen": 91611472, "step": 42490 }, { "epoch": 7.7986786566342445, "grad_norm": 22.733482360839844, "learning_rate": 7.650709921250168e-06, "loss": 0.3591, "num_input_tokens_seen": 91622800, "step": 42495 }, { "epoch": 7.799596256193797, "grad_norm": 5.91981315612793, "learning_rate": 7.650030918559426e-06, "loss": 0.3054, "num_input_tokens_seen": 91633840, "step": 42500 }, { "epoch": 7.8005138557533495, "grad_norm": 10.9781494140625, "learning_rate": 7.649351847899498e-06, "loss": 0.2636, "num_input_tokens_seen": 91642608, "step": 42505 }, { "epoch": 7.801431455312901, "grad_norm": 6.733226299285889, "learning_rate": 7.648672709287802e-06, "loss": 0.3163, "num_input_tokens_seen": 91651376, "step": 42510 }, { "epoch": 7.802349054872454, "grad_norm": 23.260862350463867, "learning_rate": 7.647993502741755e-06, "loss": 0.2355, "num_input_tokens_seen": 91661904, "step": 42515 }, { "epoch": 7.803266654432006, "grad_norm": 31.68190574645996, "learning_rate": 7.64731422827878e-06, "loss": 0.1079, "num_input_tokens_seen": 91673680, "step": 42520 }, { "epoch": 7.804184253991558, "grad_norm": 9.336584091186523, "learning_rate": 7.646634885916298e-06, "loss": 0.1981, "num_input_tokens_seen": 91685232, "step": 42525 }, { "epoch": 7.80510185355111, "grad_norm": 1.9001271724700928, "learning_rate": 7.64595547567173e-06, "loss": 0.1997, "num_input_tokens_seen": 91696304, "step": 42530 }, { "epoch": 7.806019453110663, "grad_norm": 5.433056831359863, "learning_rate": 7.645275997562509e-06, "loss": 0.1305, "num_input_tokens_seen": 91707440, "step": 42535 }, { "epoch": 7.806937052670214, "grad_norm": 0.7479037642478943, "learning_rate": 7.644596451606057e-06, "loss": 0.1879, "num_input_tokens_seen": 91718320, "step": 42540 }, { "epoch": 7.807854652229767, "grad_norm": 3.1393957138061523, "learning_rate": 7.643916837819804e-06, "loss": 0.2643, "num_input_tokens_seen": 91729712, "step": 42545 }, { "epoch": 7.808772251789319, "grad_norm": 50.44803237915039, "learning_rate": 7.643237156221183e-06, "loss": 0.2691, "num_input_tokens_seen": 91740496, "step": 42550 }, { "epoch": 7.809689851348871, "grad_norm": 29.534698486328125, "learning_rate": 7.642557406827625e-06, "loss": 0.4636, "num_input_tokens_seen": 91752400, "step": 42555 }, { "epoch": 7.810607450908424, "grad_norm": 10.77176570892334, "learning_rate": 7.641877589656566e-06, "loss": 0.1756, "num_input_tokens_seen": 91763888, "step": 42560 }, { "epoch": 7.811525050467976, "grad_norm": 16.50542449951172, "learning_rate": 7.64119770472544e-06, "loss": 0.5902, "num_input_tokens_seen": 91773968, "step": 42565 }, { "epoch": 7.812442650027528, "grad_norm": 0.7428107261657715, "learning_rate": 7.640517752051686e-06, "loss": 0.2305, "num_input_tokens_seen": 91785488, "step": 42570 }, { "epoch": 7.81336024958708, "grad_norm": 14.933897018432617, "learning_rate": 7.639837731652745e-06, "loss": 0.2984, "num_input_tokens_seen": 91796688, "step": 42575 }, { "epoch": 7.814277849146633, "grad_norm": 0.5260871052742004, "learning_rate": 7.639157643546059e-06, "loss": 0.2063, "num_input_tokens_seen": 91806416, "step": 42580 }, { "epoch": 7.815195448706184, "grad_norm": 18.79364585876465, "learning_rate": 7.638477487749068e-06, "loss": 0.2018, "num_input_tokens_seen": 91817328, "step": 42585 }, { "epoch": 7.816113048265737, "grad_norm": 17.84011459350586, "learning_rate": 7.637797264279218e-06, "loss": 0.2768, "num_input_tokens_seen": 91828336, "step": 42590 }, { "epoch": 7.817030647825289, "grad_norm": 11.499497413635254, "learning_rate": 7.637116973153958e-06, "loss": 0.2218, "num_input_tokens_seen": 91838960, "step": 42595 }, { "epoch": 7.817948247384841, "grad_norm": 26.243566513061523, "learning_rate": 7.636436614390734e-06, "loss": 0.1798, "num_input_tokens_seen": 91849360, "step": 42600 }, { "epoch": 7.8188658469443935, "grad_norm": 4.787023067474365, "learning_rate": 7.635756188006998e-06, "loss": 0.2919, "num_input_tokens_seen": 91860688, "step": 42605 }, { "epoch": 7.819783446503946, "grad_norm": 11.589967727661133, "learning_rate": 7.6350756940202e-06, "loss": 0.4671, "num_input_tokens_seen": 91871792, "step": 42610 }, { "epoch": 7.820701046063498, "grad_norm": 22.265708923339844, "learning_rate": 7.634395132447793e-06, "loss": 0.3812, "num_input_tokens_seen": 91882768, "step": 42615 }, { "epoch": 7.82161864562305, "grad_norm": 12.623394012451172, "learning_rate": 7.633714503307236e-06, "loss": 0.3243, "num_input_tokens_seen": 91893392, "step": 42620 }, { "epoch": 7.822536245182603, "grad_norm": 1.4707660675048828, "learning_rate": 7.633033806615982e-06, "loss": 0.3586, "num_input_tokens_seen": 91903056, "step": 42625 }, { "epoch": 7.823453844742154, "grad_norm": 22.963623046875, "learning_rate": 7.632353042391493e-06, "loss": 0.2445, "num_input_tokens_seen": 91914032, "step": 42630 }, { "epoch": 7.824371444301707, "grad_norm": 1.727339744567871, "learning_rate": 7.63167221065123e-06, "loss": 0.1487, "num_input_tokens_seen": 91925808, "step": 42635 }, { "epoch": 7.825289043861259, "grad_norm": 18.35921287536621, "learning_rate": 7.63099131141265e-06, "loss": 0.2183, "num_input_tokens_seen": 91936656, "step": 42640 }, { "epoch": 7.826206643420811, "grad_norm": 9.925250053405762, "learning_rate": 7.630310344693222e-06, "loss": 0.3023, "num_input_tokens_seen": 91947376, "step": 42645 }, { "epoch": 7.827124242980363, "grad_norm": 36.175662994384766, "learning_rate": 7.62962931051041e-06, "loss": 0.1994, "num_input_tokens_seen": 91958384, "step": 42650 }, { "epoch": 7.828041842539916, "grad_norm": 5.942753791809082, "learning_rate": 7.628948208881683e-06, "loss": 0.2794, "num_input_tokens_seen": 91969264, "step": 42655 }, { "epoch": 7.8289594420994675, "grad_norm": 22.816505432128906, "learning_rate": 7.628267039824508e-06, "loss": 0.1285, "num_input_tokens_seen": 91980464, "step": 42660 }, { "epoch": 7.82987704165902, "grad_norm": 10.128803253173828, "learning_rate": 7.627585803356355e-06, "loss": 0.0627, "num_input_tokens_seen": 91991024, "step": 42665 }, { "epoch": 7.8307946412185725, "grad_norm": 0.7143713235855103, "learning_rate": 7.626904499494702e-06, "loss": 0.2296, "num_input_tokens_seen": 92000432, "step": 42670 }, { "epoch": 7.831712240778124, "grad_norm": 10.907705307006836, "learning_rate": 7.626223128257018e-06, "loss": 0.3212, "num_input_tokens_seen": 92012720, "step": 42675 }, { "epoch": 7.832629840337677, "grad_norm": 1.1294610500335693, "learning_rate": 7.6255416896607814e-06, "loss": 0.1703, "num_input_tokens_seen": 92024752, "step": 42680 }, { "epoch": 7.833547439897229, "grad_norm": 32.58766555786133, "learning_rate": 7.62486018372347e-06, "loss": 0.2335, "num_input_tokens_seen": 92036016, "step": 42685 }, { "epoch": 7.834465039456781, "grad_norm": 3.802903890609741, "learning_rate": 7.624178610462563e-06, "loss": 0.1095, "num_input_tokens_seen": 92046672, "step": 42690 }, { "epoch": 7.835382639016333, "grad_norm": 9.613334655761719, "learning_rate": 7.623496969895541e-06, "loss": 0.3625, "num_input_tokens_seen": 92058032, "step": 42695 }, { "epoch": 7.836300238575886, "grad_norm": 1.3569960594177246, "learning_rate": 7.622815262039889e-06, "loss": 0.2246, "num_input_tokens_seen": 92069168, "step": 42700 }, { "epoch": 7.837217838135437, "grad_norm": 4.475591659545898, "learning_rate": 7.622133486913089e-06, "loss": 0.2423, "num_input_tokens_seen": 92079760, "step": 42705 }, { "epoch": 7.83813543769499, "grad_norm": 3.5955467224121094, "learning_rate": 7.621451644532629e-06, "loss": 0.3422, "num_input_tokens_seen": 92090032, "step": 42710 }, { "epoch": 7.839053037254542, "grad_norm": 11.918219566345215, "learning_rate": 7.620769734915998e-06, "loss": 0.3289, "num_input_tokens_seen": 92100240, "step": 42715 }, { "epoch": 7.839970636814094, "grad_norm": 13.052288055419922, "learning_rate": 7.620087758080685e-06, "loss": 0.3209, "num_input_tokens_seen": 92111312, "step": 42720 }, { "epoch": 7.8408882363736465, "grad_norm": 1.5156854391098022, "learning_rate": 7.6194057140441825e-06, "loss": 0.1448, "num_input_tokens_seen": 92122416, "step": 42725 }, { "epoch": 7.841805835933199, "grad_norm": 4.813201427459717, "learning_rate": 7.618723602823983e-06, "loss": 0.4139, "num_input_tokens_seen": 92133072, "step": 42730 }, { "epoch": 7.842723435492751, "grad_norm": 3.3103158473968506, "learning_rate": 7.618041424437581e-06, "loss": 0.1006, "num_input_tokens_seen": 92143696, "step": 42735 }, { "epoch": 7.843641035052303, "grad_norm": 1.4679994583129883, "learning_rate": 7.617359178902475e-06, "loss": 0.0739, "num_input_tokens_seen": 92155696, "step": 42740 }, { "epoch": 7.844558634611856, "grad_norm": 14.994767189025879, "learning_rate": 7.616676866236161e-06, "loss": 0.1455, "num_input_tokens_seen": 92165584, "step": 42745 }, { "epoch": 7.845476234171407, "grad_norm": 58.84405517578125, "learning_rate": 7.615994486456142e-06, "loss": 0.1481, "num_input_tokens_seen": 92176016, "step": 42750 }, { "epoch": 7.84639383373096, "grad_norm": 0.7141845226287842, "learning_rate": 7.6153120395799185e-06, "loss": 0.3029, "num_input_tokens_seen": 92186704, "step": 42755 }, { "epoch": 7.847311433290512, "grad_norm": 23.87287139892578, "learning_rate": 7.6146295256249944e-06, "loss": 0.4758, "num_input_tokens_seen": 92196976, "step": 42760 }, { "epoch": 7.848229032850064, "grad_norm": 0.1853751689195633, "learning_rate": 7.613946944608875e-06, "loss": 0.3333, "num_input_tokens_seen": 92208688, "step": 42765 }, { "epoch": 7.849146632409616, "grad_norm": 4.728427886962891, "learning_rate": 7.613264296549068e-06, "loss": 0.1837, "num_input_tokens_seen": 92219312, "step": 42770 }, { "epoch": 7.850064231969169, "grad_norm": 16.45721435546875, "learning_rate": 7.612581581463082e-06, "loss": 0.4326, "num_input_tokens_seen": 92229072, "step": 42775 }, { "epoch": 7.8509818315287205, "grad_norm": 51.831050872802734, "learning_rate": 7.611898799368429e-06, "loss": 0.1009, "num_input_tokens_seen": 92240720, "step": 42780 }, { "epoch": 7.851899431088273, "grad_norm": 7.213172435760498, "learning_rate": 7.611215950282619e-06, "loss": 0.354, "num_input_tokens_seen": 92250960, "step": 42785 }, { "epoch": 7.8528170306478255, "grad_norm": 1.3281430006027222, "learning_rate": 7.6105330342231665e-06, "loss": 0.1027, "num_input_tokens_seen": 92262192, "step": 42790 }, { "epoch": 7.853734630207377, "grad_norm": 48.152740478515625, "learning_rate": 7.609850051207588e-06, "loss": 0.4466, "num_input_tokens_seen": 92272176, "step": 42795 }, { "epoch": 7.85465222976693, "grad_norm": 0.391433447599411, "learning_rate": 7.609167001253399e-06, "loss": 0.2665, "num_input_tokens_seen": 92283024, "step": 42800 }, { "epoch": 7.855569829326482, "grad_norm": 79.0921859741211, "learning_rate": 7.608483884378123e-06, "loss": 0.3875, "num_input_tokens_seen": 92293808, "step": 42805 }, { "epoch": 7.856487428886034, "grad_norm": 18.529586791992188, "learning_rate": 7.607800700599276e-06, "loss": 0.1885, "num_input_tokens_seen": 92304016, "step": 42810 }, { "epoch": 7.857405028445586, "grad_norm": 24.551992416381836, "learning_rate": 7.607117449934384e-06, "loss": 0.323, "num_input_tokens_seen": 92313296, "step": 42815 }, { "epoch": 7.858322628005139, "grad_norm": 1.1424283981323242, "learning_rate": 7.606434132400968e-06, "loss": 0.0216, "num_input_tokens_seen": 92323760, "step": 42820 }, { "epoch": 7.85924022756469, "grad_norm": 16.903095245361328, "learning_rate": 7.605750748016558e-06, "loss": 0.2515, "num_input_tokens_seen": 92335120, "step": 42825 }, { "epoch": 7.860157827124243, "grad_norm": 15.3985013961792, "learning_rate": 7.60506729679868e-06, "loss": 0.2452, "num_input_tokens_seen": 92346736, "step": 42830 }, { "epoch": 7.861075426683795, "grad_norm": 0.46080535650253296, "learning_rate": 7.604383778764863e-06, "loss": 0.3477, "num_input_tokens_seen": 92357200, "step": 42835 }, { "epoch": 7.861993026243347, "grad_norm": 33.34782791137695, "learning_rate": 7.603700193932638e-06, "loss": 0.3946, "num_input_tokens_seen": 92367344, "step": 42840 }, { "epoch": 7.8629106258029, "grad_norm": 15.130355834960938, "learning_rate": 7.603016542319539e-06, "loss": 0.2974, "num_input_tokens_seen": 92378896, "step": 42845 }, { "epoch": 7.863828225362452, "grad_norm": 0.31121954321861267, "learning_rate": 7.602332823943099e-06, "loss": 0.0559, "num_input_tokens_seen": 92389392, "step": 42850 }, { "epoch": 7.864745824922004, "grad_norm": 0.20193259418010712, "learning_rate": 7.601649038820857e-06, "loss": 0.2277, "num_input_tokens_seen": 92400912, "step": 42855 }, { "epoch": 7.865663424481556, "grad_norm": 3.268829584121704, "learning_rate": 7.6009651869703485e-06, "loss": 0.3751, "num_input_tokens_seen": 92411856, "step": 42860 }, { "epoch": 7.866581024041109, "grad_norm": 3.337498188018799, "learning_rate": 7.600281268409113e-06, "loss": 0.1997, "num_input_tokens_seen": 92423280, "step": 42865 }, { "epoch": 7.86749862360066, "grad_norm": 5.243459224700928, "learning_rate": 7.599597283154694e-06, "loss": 0.3021, "num_input_tokens_seen": 92433168, "step": 42870 }, { "epoch": 7.868416223160213, "grad_norm": 4.077928066253662, "learning_rate": 7.598913231224634e-06, "loss": 0.1568, "num_input_tokens_seen": 92444624, "step": 42875 }, { "epoch": 7.869333822719765, "grad_norm": 13.19941520690918, "learning_rate": 7.598229112636477e-06, "loss": 0.4544, "num_input_tokens_seen": 92455056, "step": 42880 }, { "epoch": 7.870251422279317, "grad_norm": 1.405957818031311, "learning_rate": 7.59754492740777e-06, "loss": 0.3069, "num_input_tokens_seen": 92467280, "step": 42885 }, { "epoch": 7.8711690218388695, "grad_norm": 26.61372184753418, "learning_rate": 7.5968606755560625e-06, "loss": 0.3293, "num_input_tokens_seen": 92478448, "step": 42890 }, { "epoch": 7.872086621398422, "grad_norm": 19.75468635559082, "learning_rate": 7.596176357098904e-06, "loss": 0.3845, "num_input_tokens_seen": 92489072, "step": 42895 }, { "epoch": 7.873004220957974, "grad_norm": 0.46773236989974976, "learning_rate": 7.595491972053843e-06, "loss": 0.2371, "num_input_tokens_seen": 92499984, "step": 42900 }, { "epoch": 7.873921820517526, "grad_norm": 44.9691047668457, "learning_rate": 7.5948075204384385e-06, "loss": 0.2397, "num_input_tokens_seen": 92511984, "step": 42905 }, { "epoch": 7.874839420077079, "grad_norm": 1.5483803749084473, "learning_rate": 7.594123002270239e-06, "loss": 0.3189, "num_input_tokens_seen": 92522576, "step": 42910 }, { "epoch": 7.87575701963663, "grad_norm": 28.313627243041992, "learning_rate": 7.59343841756681e-06, "loss": 0.3075, "num_input_tokens_seen": 92533008, "step": 42915 }, { "epoch": 7.876674619196183, "grad_norm": 3.077298641204834, "learning_rate": 7.592753766345701e-06, "loss": 0.2011, "num_input_tokens_seen": 92543792, "step": 42920 }, { "epoch": 7.877592218755735, "grad_norm": 9.534914016723633, "learning_rate": 7.592069048624478e-06, "loss": 0.291, "num_input_tokens_seen": 92554384, "step": 42925 }, { "epoch": 7.878509818315287, "grad_norm": 16.336761474609375, "learning_rate": 7.5913842644207005e-06, "loss": 0.2539, "num_input_tokens_seen": 92566352, "step": 42930 }, { "epoch": 7.879427417874839, "grad_norm": 11.932656288146973, "learning_rate": 7.590699413751932e-06, "loss": 0.1367, "num_input_tokens_seen": 92576304, "step": 42935 }, { "epoch": 7.880345017434392, "grad_norm": 6.950238227844238, "learning_rate": 7.59001449663574e-06, "loss": 0.3037, "num_input_tokens_seen": 92587024, "step": 42940 }, { "epoch": 7.8812626169939435, "grad_norm": 7.980741500854492, "learning_rate": 7.589329513089692e-06, "loss": 0.1811, "num_input_tokens_seen": 92597008, "step": 42945 }, { "epoch": 7.882180216553496, "grad_norm": 0.7124529480934143, "learning_rate": 7.5886444631313525e-06, "loss": 0.2037, "num_input_tokens_seen": 92608848, "step": 42950 }, { "epoch": 7.8830978161130485, "grad_norm": 2.2175419330596924, "learning_rate": 7.587959346778295e-06, "loss": 0.2937, "num_input_tokens_seen": 92619280, "step": 42955 }, { "epoch": 7.8840154156726, "grad_norm": 28.4649658203125, "learning_rate": 7.587274164048092e-06, "loss": 0.2094, "num_input_tokens_seen": 92630160, "step": 42960 }, { "epoch": 7.884933015232153, "grad_norm": 4.184412002563477, "learning_rate": 7.5865889149583176e-06, "loss": 0.1973, "num_input_tokens_seen": 92639792, "step": 42965 }, { "epoch": 7.885850614791705, "grad_norm": 17.697368621826172, "learning_rate": 7.5859035995265425e-06, "loss": 0.3846, "num_input_tokens_seen": 92650064, "step": 42970 }, { "epoch": 7.886768214351257, "grad_norm": 12.314677238464355, "learning_rate": 7.585218217770351e-06, "loss": 0.31, "num_input_tokens_seen": 92658544, "step": 42975 }, { "epoch": 7.887685813910809, "grad_norm": 14.376219749450684, "learning_rate": 7.584532769707319e-06, "loss": 0.2931, "num_input_tokens_seen": 92668848, "step": 42980 }, { "epoch": 7.888603413470362, "grad_norm": 10.06814956665039, "learning_rate": 7.5838472553550255e-06, "loss": 0.3918, "num_input_tokens_seen": 92679600, "step": 42985 }, { "epoch": 7.889521013029913, "grad_norm": 0.7619074583053589, "learning_rate": 7.5831616747310565e-06, "loss": 0.1319, "num_input_tokens_seen": 92691920, "step": 42990 }, { "epoch": 7.890438612589466, "grad_norm": 1.1559582948684692, "learning_rate": 7.582476027852992e-06, "loss": 0.2018, "num_input_tokens_seen": 92702960, "step": 42995 }, { "epoch": 7.891356212149018, "grad_norm": 10.081012725830078, "learning_rate": 7.581790314738422e-06, "loss": 0.1903, "num_input_tokens_seen": 92713520, "step": 43000 }, { "epoch": 7.89227381170857, "grad_norm": 15.071898460388184, "learning_rate": 7.58110453540493e-06, "loss": 0.1274, "num_input_tokens_seen": 92724688, "step": 43005 }, { "epoch": 7.8931914112681225, "grad_norm": 0.4215269088745117, "learning_rate": 7.5804186898701085e-06, "loss": 0.2977, "num_input_tokens_seen": 92735568, "step": 43010 }, { "epoch": 7.894109010827675, "grad_norm": 25.882495880126953, "learning_rate": 7.5797327781515475e-06, "loss": 0.2551, "num_input_tokens_seen": 92745744, "step": 43015 }, { "epoch": 7.895026610387227, "grad_norm": 17.250957489013672, "learning_rate": 7.579046800266836e-06, "loss": 0.3042, "num_input_tokens_seen": 92756848, "step": 43020 }, { "epoch": 7.895944209946779, "grad_norm": 1.5903961658477783, "learning_rate": 7.578360756233574e-06, "loss": 0.2772, "num_input_tokens_seen": 92767728, "step": 43025 }, { "epoch": 7.896861809506332, "grad_norm": 3.689661979675293, "learning_rate": 7.5776746460693525e-06, "loss": 0.2688, "num_input_tokens_seen": 92778192, "step": 43030 }, { "epoch": 7.897779409065883, "grad_norm": 11.181744575500488, "learning_rate": 7.5769884697917726e-06, "loss": 0.2979, "num_input_tokens_seen": 92788624, "step": 43035 }, { "epoch": 7.898697008625436, "grad_norm": 10.79654312133789, "learning_rate": 7.576302227418433e-06, "loss": 0.1063, "num_input_tokens_seen": 92799760, "step": 43040 }, { "epoch": 7.899614608184988, "grad_norm": 3.708214044570923, "learning_rate": 7.5756159189669325e-06, "loss": 0.2046, "num_input_tokens_seen": 92809968, "step": 43045 }, { "epoch": 7.90053220774454, "grad_norm": 51.22536087036133, "learning_rate": 7.574929544454877e-06, "loss": 0.3047, "num_input_tokens_seen": 92821456, "step": 43050 }, { "epoch": 7.901449807304092, "grad_norm": 1.6461328268051147, "learning_rate": 7.574243103899869e-06, "loss": 0.3041, "num_input_tokens_seen": 92831184, "step": 43055 }, { "epoch": 7.902367406863645, "grad_norm": 37.57052993774414, "learning_rate": 7.573556597319516e-06, "loss": 0.3908, "num_input_tokens_seen": 92842576, "step": 43060 }, { "epoch": 7.9032850064231965, "grad_norm": 14.955761909484863, "learning_rate": 7.572870024731423e-06, "loss": 0.3459, "num_input_tokens_seen": 92852944, "step": 43065 }, { "epoch": 7.904202605982749, "grad_norm": 0.8175199031829834, "learning_rate": 7.572183386153203e-06, "loss": 0.1863, "num_input_tokens_seen": 92862864, "step": 43070 }, { "epoch": 7.9051202055423015, "grad_norm": 9.48062515258789, "learning_rate": 7.571496681602464e-06, "loss": 0.3089, "num_input_tokens_seen": 92872976, "step": 43075 }, { "epoch": 7.906037805101853, "grad_norm": 148.9627685546875, "learning_rate": 7.5708099110968214e-06, "loss": 0.2744, "num_input_tokens_seen": 92884240, "step": 43080 }, { "epoch": 7.906955404661406, "grad_norm": 0.3646297752857208, "learning_rate": 7.57012307465389e-06, "loss": 0.3044, "num_input_tokens_seen": 92896304, "step": 43085 }, { "epoch": 7.907873004220958, "grad_norm": 39.98339080810547, "learning_rate": 7.569436172291284e-06, "loss": 0.2061, "num_input_tokens_seen": 92907856, "step": 43090 }, { "epoch": 7.90879060378051, "grad_norm": 3.935495615005493, "learning_rate": 7.568749204026622e-06, "loss": 0.3523, "num_input_tokens_seen": 92919952, "step": 43095 }, { "epoch": 7.909708203340062, "grad_norm": 79.2145767211914, "learning_rate": 7.568062169877526e-06, "loss": 0.4885, "num_input_tokens_seen": 92930704, "step": 43100 }, { "epoch": 7.910625802899615, "grad_norm": 0.5184888243675232, "learning_rate": 7.567375069861614e-06, "loss": 0.2726, "num_input_tokens_seen": 92940592, "step": 43105 }, { "epoch": 7.911543402459166, "grad_norm": 0.6314509510993958, "learning_rate": 7.56668790399651e-06, "loss": 0.226, "num_input_tokens_seen": 92952336, "step": 43110 }, { "epoch": 7.912461002018719, "grad_norm": 62.75861740112305, "learning_rate": 7.56600067229984e-06, "loss": 0.1827, "num_input_tokens_seen": 92963280, "step": 43115 }, { "epoch": 7.913378601578271, "grad_norm": 1.4270647764205933, "learning_rate": 7.56531337478923e-06, "loss": 0.0971, "num_input_tokens_seen": 92975056, "step": 43120 }, { "epoch": 7.914296201137823, "grad_norm": 7.415470123291016, "learning_rate": 7.564626011482308e-06, "loss": 0.1736, "num_input_tokens_seen": 92987216, "step": 43125 }, { "epoch": 7.915213800697376, "grad_norm": 11.198803901672363, "learning_rate": 7.563938582396704e-06, "loss": 0.4973, "num_input_tokens_seen": 92999184, "step": 43130 }, { "epoch": 7.916131400256928, "grad_norm": 13.533016204833984, "learning_rate": 7.563251087550047e-06, "loss": 0.2769, "num_input_tokens_seen": 93010672, "step": 43135 }, { "epoch": 7.91704899981648, "grad_norm": 0.6294947266578674, "learning_rate": 7.562563526959974e-06, "loss": 0.1429, "num_input_tokens_seen": 93022032, "step": 43140 }, { "epoch": 7.917966599376032, "grad_norm": 21.199848175048828, "learning_rate": 7.5618759006441175e-06, "loss": 0.3866, "num_input_tokens_seen": 93033008, "step": 43145 }, { "epoch": 7.918884198935585, "grad_norm": 28.010501861572266, "learning_rate": 7.561188208620116e-06, "loss": 0.5741, "num_input_tokens_seen": 93043152, "step": 43150 }, { "epoch": 7.919801798495136, "grad_norm": 40.048030853271484, "learning_rate": 7.560500450905605e-06, "loss": 0.3108, "num_input_tokens_seen": 93054864, "step": 43155 }, { "epoch": 7.920719398054689, "grad_norm": 13.917119979858398, "learning_rate": 7.559812627518226e-06, "loss": 0.2062, "num_input_tokens_seen": 93066896, "step": 43160 }, { "epoch": 7.921636997614241, "grad_norm": 11.265028953552246, "learning_rate": 7.559124738475621e-06, "loss": 0.0965, "num_input_tokens_seen": 93076560, "step": 43165 }, { "epoch": 7.922554597173793, "grad_norm": 60.073097229003906, "learning_rate": 7.558436783795432e-06, "loss": 0.4099, "num_input_tokens_seen": 93087280, "step": 43170 }, { "epoch": 7.9234721967333455, "grad_norm": 23.09903335571289, "learning_rate": 7.557748763495305e-06, "loss": 0.2362, "num_input_tokens_seen": 93096912, "step": 43175 }, { "epoch": 7.924389796292898, "grad_norm": 2.1459219455718994, "learning_rate": 7.557060677592887e-06, "loss": 0.1257, "num_input_tokens_seen": 93107440, "step": 43180 }, { "epoch": 7.92530739585245, "grad_norm": 24.878429412841797, "learning_rate": 7.556372526105825e-06, "loss": 0.2338, "num_input_tokens_seen": 93116432, "step": 43185 }, { "epoch": 7.926224995412002, "grad_norm": 43.24748611450195, "learning_rate": 7.55568430905177e-06, "loss": 0.1375, "num_input_tokens_seen": 93127536, "step": 43190 }, { "epoch": 7.927142594971555, "grad_norm": 97.36808776855469, "learning_rate": 7.554996026448374e-06, "loss": 0.3391, "num_input_tokens_seen": 93137744, "step": 43195 }, { "epoch": 7.928060194531106, "grad_norm": 0.40682363510131836, "learning_rate": 7.554307678313289e-06, "loss": 0.3533, "num_input_tokens_seen": 93148400, "step": 43200 }, { "epoch": 7.928977794090659, "grad_norm": 4.476165771484375, "learning_rate": 7.553619264664169e-06, "loss": 0.3156, "num_input_tokens_seen": 93159280, "step": 43205 }, { "epoch": 7.929895393650211, "grad_norm": 68.96530151367188, "learning_rate": 7.552930785518676e-06, "loss": 0.5103, "num_input_tokens_seen": 93169744, "step": 43210 }, { "epoch": 7.930812993209763, "grad_norm": 53.655418395996094, "learning_rate": 7.552242240894465e-06, "loss": 0.1441, "num_input_tokens_seen": 93180496, "step": 43215 }, { "epoch": 7.931730592769315, "grad_norm": 6.824683666229248, "learning_rate": 7.551553630809194e-06, "loss": 0.3167, "num_input_tokens_seen": 93191024, "step": 43220 }, { "epoch": 7.932648192328868, "grad_norm": 0.4321858882904053, "learning_rate": 7.550864955280528e-06, "loss": 0.2193, "num_input_tokens_seen": 93202832, "step": 43225 }, { "epoch": 7.9335657918884195, "grad_norm": 4.490972995758057, "learning_rate": 7.5501762143261285e-06, "loss": 0.187, "num_input_tokens_seen": 93214064, "step": 43230 }, { "epoch": 7.934483391447972, "grad_norm": 0.36104267835617065, "learning_rate": 7.549487407963663e-06, "loss": 0.4025, "num_input_tokens_seen": 93225104, "step": 43235 }, { "epoch": 7.9354009910075245, "grad_norm": 1.1380730867385864, "learning_rate": 7.548798536210795e-06, "loss": 0.1, "num_input_tokens_seen": 93235984, "step": 43240 }, { "epoch": 7.936318590567076, "grad_norm": 70.9664077758789, "learning_rate": 7.5481095990851975e-06, "loss": 0.3373, "num_input_tokens_seen": 93246192, "step": 43245 }, { "epoch": 7.937236190126629, "grad_norm": 35.06941223144531, "learning_rate": 7.5474205966045356e-06, "loss": 0.128, "num_input_tokens_seen": 93257232, "step": 43250 }, { "epoch": 7.938153789686181, "grad_norm": 8.590425491333008, "learning_rate": 7.546731528786484e-06, "loss": 0.1546, "num_input_tokens_seen": 93268208, "step": 43255 }, { "epoch": 7.939071389245733, "grad_norm": 2.4687602519989014, "learning_rate": 7.546042395648716e-06, "loss": 0.129, "num_input_tokens_seen": 93278288, "step": 43260 }, { "epoch": 7.939988988805285, "grad_norm": 5.767090797424316, "learning_rate": 7.5453531972089064e-06, "loss": 0.2306, "num_input_tokens_seen": 93289264, "step": 43265 }, { "epoch": 7.940906588364838, "grad_norm": 30.130699157714844, "learning_rate": 7.544663933484733e-06, "loss": 0.5014, "num_input_tokens_seen": 93300048, "step": 43270 }, { "epoch": 7.941824187924389, "grad_norm": 9.187445640563965, "learning_rate": 7.543974604493873e-06, "loss": 0.1174, "num_input_tokens_seen": 93311472, "step": 43275 }, { "epoch": 7.942741787483942, "grad_norm": 11.572796821594238, "learning_rate": 7.5432852102540055e-06, "loss": 0.1343, "num_input_tokens_seen": 93323248, "step": 43280 }, { "epoch": 7.943659387043494, "grad_norm": 16.56703758239746, "learning_rate": 7.542595750782817e-06, "loss": 0.2534, "num_input_tokens_seen": 93333936, "step": 43285 }, { "epoch": 7.944576986603046, "grad_norm": 10.06590747833252, "learning_rate": 7.541906226097986e-06, "loss": 0.2693, "num_input_tokens_seen": 93344592, "step": 43290 }, { "epoch": 7.9454945861625985, "grad_norm": 37.792484283447266, "learning_rate": 7.541216636217201e-06, "loss": 0.2068, "num_input_tokens_seen": 93355888, "step": 43295 }, { "epoch": 7.946412185722151, "grad_norm": 16.440542221069336, "learning_rate": 7.540526981158147e-06, "loss": 0.0901, "num_input_tokens_seen": 93365456, "step": 43300 }, { "epoch": 7.947329785281703, "grad_norm": 15.634572982788086, "learning_rate": 7.539837260938514e-06, "loss": 0.2978, "num_input_tokens_seen": 93375984, "step": 43305 }, { "epoch": 7.948247384841255, "grad_norm": 45.32862854003906, "learning_rate": 7.539147475575992e-06, "loss": 0.3011, "num_input_tokens_seen": 93387472, "step": 43310 }, { "epoch": 7.949164984400808, "grad_norm": 2.245609998703003, "learning_rate": 7.5384576250882725e-06, "loss": 0.2546, "num_input_tokens_seen": 93398384, "step": 43315 }, { "epoch": 7.950082583960359, "grad_norm": 20.522695541381836, "learning_rate": 7.537767709493049e-06, "loss": 0.1654, "num_input_tokens_seen": 93409168, "step": 43320 }, { "epoch": 7.951000183519912, "grad_norm": 24.901409149169922, "learning_rate": 7.537077728808018e-06, "loss": 0.1012, "num_input_tokens_seen": 93420432, "step": 43325 }, { "epoch": 7.951917783079464, "grad_norm": 28.586280822753906, "learning_rate": 7.536387683050874e-06, "loss": 0.2591, "num_input_tokens_seen": 93431632, "step": 43330 }, { "epoch": 7.952835382639016, "grad_norm": 0.3450715243816376, "learning_rate": 7.535697572239318e-06, "loss": 0.1228, "num_input_tokens_seen": 93442896, "step": 43335 }, { "epoch": 7.953752982198568, "grad_norm": 5.706108093261719, "learning_rate": 7.535007396391047e-06, "loss": 0.459, "num_input_tokens_seen": 93453328, "step": 43340 }, { "epoch": 7.954670581758121, "grad_norm": 7.324375152587891, "learning_rate": 7.534317155523767e-06, "loss": 0.408, "num_input_tokens_seen": 93463952, "step": 43345 }, { "epoch": 7.9555881813176725, "grad_norm": 20.837081909179688, "learning_rate": 7.5336268496551805e-06, "loss": 0.1109, "num_input_tokens_seen": 93475600, "step": 43350 }, { "epoch": 7.956505780877225, "grad_norm": 16.63170623779297, "learning_rate": 7.5329364788029905e-06, "loss": 0.3382, "num_input_tokens_seen": 93486416, "step": 43355 }, { "epoch": 7.9574233804367775, "grad_norm": 6.133291244506836, "learning_rate": 7.532246042984906e-06, "loss": 0.4444, "num_input_tokens_seen": 93497904, "step": 43360 }, { "epoch": 7.958340979996329, "grad_norm": 0.580961287021637, "learning_rate": 7.5315555422186335e-06, "loss": 0.2886, "num_input_tokens_seen": 93509648, "step": 43365 }, { "epoch": 7.959258579555882, "grad_norm": 9.017151832580566, "learning_rate": 7.530864976521888e-06, "loss": 0.0913, "num_input_tokens_seen": 93519760, "step": 43370 }, { "epoch": 7.960176179115434, "grad_norm": 7.55728006362915, "learning_rate": 7.5301743459123755e-06, "loss": 0.4652, "num_input_tokens_seen": 93530096, "step": 43375 }, { "epoch": 7.961093778674986, "grad_norm": 0.5637047290802002, "learning_rate": 7.529483650407815e-06, "loss": 0.3055, "num_input_tokens_seen": 93541584, "step": 43380 }, { "epoch": 7.962011378234538, "grad_norm": 0.0684506967663765, "learning_rate": 7.528792890025918e-06, "loss": 0.3776, "num_input_tokens_seen": 93552272, "step": 43385 }, { "epoch": 7.962928977794091, "grad_norm": 11.3829984664917, "learning_rate": 7.5281020647844015e-06, "loss": 0.2111, "num_input_tokens_seen": 93562416, "step": 43390 }, { "epoch": 7.963846577353642, "grad_norm": 3.5446879863739014, "learning_rate": 7.527411174700987e-06, "loss": 0.303, "num_input_tokens_seen": 93573200, "step": 43395 }, { "epoch": 7.964764176913195, "grad_norm": 0.4573604464530945, "learning_rate": 7.526720219793393e-06, "loss": 0.0763, "num_input_tokens_seen": 93582640, "step": 43400 }, { "epoch": 7.965681776472747, "grad_norm": 10.60019588470459, "learning_rate": 7.526029200079341e-06, "loss": 0.2864, "num_input_tokens_seen": 93594128, "step": 43405 }, { "epoch": 7.966599376032299, "grad_norm": 55.50629425048828, "learning_rate": 7.525338115576555e-06, "loss": 0.3603, "num_input_tokens_seen": 93604144, "step": 43410 }, { "epoch": 7.967516975591852, "grad_norm": 2.3540024757385254, "learning_rate": 7.524646966302759e-06, "loss": 0.1622, "num_input_tokens_seen": 93615504, "step": 43415 }, { "epoch": 7.968434575151404, "grad_norm": 2.017400026321411, "learning_rate": 7.523955752275682e-06, "loss": 0.1243, "num_input_tokens_seen": 93628688, "step": 43420 }, { "epoch": 7.969352174710956, "grad_norm": 8.303692817687988, "learning_rate": 7.523264473513052e-06, "loss": 0.2748, "num_input_tokens_seen": 93637808, "step": 43425 }, { "epoch": 7.970269774270508, "grad_norm": 10.027059555053711, "learning_rate": 7.5225731300326e-06, "loss": 0.2521, "num_input_tokens_seen": 93646928, "step": 43430 }, { "epoch": 7.971187373830061, "grad_norm": 2.2866005897521973, "learning_rate": 7.521881721852056e-06, "loss": 0.0934, "num_input_tokens_seen": 93657616, "step": 43435 }, { "epoch": 7.972104973389612, "grad_norm": 6.954025745391846, "learning_rate": 7.521190248989154e-06, "loss": 0.0804, "num_input_tokens_seen": 93668464, "step": 43440 }, { "epoch": 7.973022572949165, "grad_norm": 0.3301714360713959, "learning_rate": 7.52049871146163e-06, "loss": 0.1092, "num_input_tokens_seen": 93677968, "step": 43445 }, { "epoch": 7.973940172508717, "grad_norm": 3.787607431411743, "learning_rate": 7.51980710928722e-06, "loss": 0.2759, "num_input_tokens_seen": 93688400, "step": 43450 }, { "epoch": 7.974857772068269, "grad_norm": 30.289159774780273, "learning_rate": 7.519115442483664e-06, "loss": 0.4198, "num_input_tokens_seen": 93697680, "step": 43455 }, { "epoch": 7.9757753716278215, "grad_norm": 0.6926225423812866, "learning_rate": 7.5184237110686995e-06, "loss": 0.2257, "num_input_tokens_seen": 93708496, "step": 43460 }, { "epoch": 7.976692971187374, "grad_norm": 2.4430248737335205, "learning_rate": 7.5177319150600714e-06, "loss": 0.0886, "num_input_tokens_seen": 93718576, "step": 43465 }, { "epoch": 7.977610570746926, "grad_norm": 2.295365333557129, "learning_rate": 7.517040054475522e-06, "loss": 0.216, "num_input_tokens_seen": 93729616, "step": 43470 }, { "epoch": 7.978528170306478, "grad_norm": 12.272199630737305, "learning_rate": 7.516348129332794e-06, "loss": 0.4543, "num_input_tokens_seen": 93740592, "step": 43475 }, { "epoch": 7.979445769866031, "grad_norm": 25.704580307006836, "learning_rate": 7.515656139649639e-06, "loss": 0.4139, "num_input_tokens_seen": 93751152, "step": 43480 }, { "epoch": 7.980363369425582, "grad_norm": 4.514914035797119, "learning_rate": 7.514964085443801e-06, "loss": 0.2342, "num_input_tokens_seen": 93761584, "step": 43485 }, { "epoch": 7.981280968985135, "grad_norm": 1.4224967956542969, "learning_rate": 7.514271966733034e-06, "loss": 0.2411, "num_input_tokens_seen": 93771088, "step": 43490 }, { "epoch": 7.982198568544687, "grad_norm": 10.245036125183105, "learning_rate": 7.513579783535088e-06, "loss": 0.2566, "num_input_tokens_seen": 93781520, "step": 43495 }, { "epoch": 7.983116168104239, "grad_norm": 14.99707317352295, "learning_rate": 7.512887535867713e-06, "loss": 0.7051, "num_input_tokens_seen": 93791056, "step": 43500 }, { "epoch": 7.984033767663791, "grad_norm": 0.47494933009147644, "learning_rate": 7.51219522374867e-06, "loss": 0.2182, "num_input_tokens_seen": 93802704, "step": 43505 }, { "epoch": 7.984951367223344, "grad_norm": 37.893821716308594, "learning_rate": 7.511502847195713e-06, "loss": 0.1715, "num_input_tokens_seen": 93813776, "step": 43510 }, { "epoch": 7.9858689667828955, "grad_norm": 13.650439262390137, "learning_rate": 7.510810406226601e-06, "loss": 0.3374, "num_input_tokens_seen": 93823984, "step": 43515 }, { "epoch": 7.986786566342448, "grad_norm": 7.775912284851074, "learning_rate": 7.510117900859091e-06, "loss": 0.4292, "num_input_tokens_seen": 93833904, "step": 43520 }, { "epoch": 7.9877041659020005, "grad_norm": 22.39940071105957, "learning_rate": 7.509425331110949e-06, "loss": 0.2981, "num_input_tokens_seen": 93844976, "step": 43525 }, { "epoch": 7.988621765461552, "grad_norm": 26.983631134033203, "learning_rate": 7.508732696999937e-06, "loss": 0.2687, "num_input_tokens_seen": 93855600, "step": 43530 }, { "epoch": 7.989539365021105, "grad_norm": 27.213708877563477, "learning_rate": 7.508039998543817e-06, "loss": 0.3588, "num_input_tokens_seen": 93865968, "step": 43535 }, { "epoch": 7.990456964580657, "grad_norm": 0.2556130588054657, "learning_rate": 7.507347235760361e-06, "loss": 0.037, "num_input_tokens_seen": 93875568, "step": 43540 }, { "epoch": 7.991374564140209, "grad_norm": 13.397899627685547, "learning_rate": 7.5066544086673335e-06, "loss": 0.2883, "num_input_tokens_seen": 93886192, "step": 43545 }, { "epoch": 7.992292163699761, "grad_norm": 72.31537628173828, "learning_rate": 7.505961517282505e-06, "loss": 0.2754, "num_input_tokens_seen": 93897040, "step": 43550 }, { "epoch": 7.993209763259314, "grad_norm": 5.047935962677002, "learning_rate": 7.505268561623647e-06, "loss": 0.4265, "num_input_tokens_seen": 93907376, "step": 43555 }, { "epoch": 7.994127362818865, "grad_norm": 21.17313003540039, "learning_rate": 7.504575541708534e-06, "loss": 0.2922, "num_input_tokens_seen": 93917520, "step": 43560 }, { "epoch": 7.995044962378418, "grad_norm": 11.951384544372559, "learning_rate": 7.503882457554941e-06, "loss": 0.371, "num_input_tokens_seen": 93928368, "step": 43565 }, { "epoch": 7.99596256193797, "grad_norm": 2.957901954650879, "learning_rate": 7.503189309180642e-06, "loss": 0.2118, "num_input_tokens_seen": 93937968, "step": 43570 }, { "epoch": 7.996880161497522, "grad_norm": 11.793187141418457, "learning_rate": 7.502496096603417e-06, "loss": 0.1745, "num_input_tokens_seen": 93947088, "step": 43575 }, { "epoch": 7.9977977610570745, "grad_norm": 6.887354850769043, "learning_rate": 7.501802819841046e-06, "loss": 0.4193, "num_input_tokens_seen": 93957808, "step": 43580 }, { "epoch": 7.998715360616627, "grad_norm": 21.30345344543457, "learning_rate": 7.50110947891131e-06, "loss": 0.2678, "num_input_tokens_seen": 93967472, "step": 43585 }, { "epoch": 7.999632960176179, "grad_norm": 2.25521183013916, "learning_rate": 7.5004160738319934e-06, "loss": 0.2792, "num_input_tokens_seen": 93978000, "step": 43590 }, { "epoch": 8.0, "eval_loss": 0.3070063889026642, "eval_runtime": 178.9933, "eval_samples_per_second": 30.442, "eval_steps_per_second": 7.615, "num_input_tokens_seen": 93981376, "step": 43592 }, { "epoch": 8.000550559735732, "grad_norm": 8.1637544631958, "learning_rate": 7.499722604620878e-06, "loss": 0.1029, "num_input_tokens_seen": 93989024, "step": 43595 }, { "epoch": 8.001468159295284, "grad_norm": 18.686098098754883, "learning_rate": 7.4990290712957515e-06, "loss": 0.2346, "num_input_tokens_seen": 93999264, "step": 43600 }, { "epoch": 8.002385758854835, "grad_norm": 4.5904107093811035, "learning_rate": 7.498335473874405e-06, "loss": 0.1482, "num_input_tokens_seen": 94010144, "step": 43605 }, { "epoch": 8.003303358414389, "grad_norm": 28.312503814697266, "learning_rate": 7.497641812374623e-06, "loss": 0.4613, "num_input_tokens_seen": 94020032, "step": 43610 }, { "epoch": 8.00422095797394, "grad_norm": 7.1299967765808105, "learning_rate": 7.496948086814202e-06, "loss": 0.1398, "num_input_tokens_seen": 94031968, "step": 43615 }, { "epoch": 8.005138557533492, "grad_norm": 18.107715606689453, "learning_rate": 7.496254297210931e-06, "loss": 0.2796, "num_input_tokens_seen": 94041888, "step": 43620 }, { "epoch": 8.006056157093045, "grad_norm": 22.96294593811035, "learning_rate": 7.495560443582606e-06, "loss": 0.0897, "num_input_tokens_seen": 94052224, "step": 43625 }, { "epoch": 8.006973756652597, "grad_norm": 8.571648597717285, "learning_rate": 7.494866525947024e-06, "loss": 0.0974, "num_input_tokens_seen": 94063328, "step": 43630 }, { "epoch": 8.007891356212149, "grad_norm": 12.97227954864502, "learning_rate": 7.494172544321982e-06, "loss": 0.2029, "num_input_tokens_seen": 94073728, "step": 43635 }, { "epoch": 8.008808955771702, "grad_norm": 8.449463844299316, "learning_rate": 7.4934784987252805e-06, "loss": 0.2238, "num_input_tokens_seen": 94084800, "step": 43640 }, { "epoch": 8.009726555331254, "grad_norm": 14.619339942932129, "learning_rate": 7.49278438917472e-06, "loss": 0.2125, "num_input_tokens_seen": 94096256, "step": 43645 }, { "epoch": 8.010644154890805, "grad_norm": 21.4246768951416, "learning_rate": 7.492090215688103e-06, "loss": 0.2646, "num_input_tokens_seen": 94107520, "step": 43650 }, { "epoch": 8.011561754450359, "grad_norm": 0.71149080991745, "learning_rate": 7.491395978283235e-06, "loss": 0.2961, "num_input_tokens_seen": 94118400, "step": 43655 }, { "epoch": 8.01247935400991, "grad_norm": 23.06046485900879, "learning_rate": 7.4907016769779206e-06, "loss": 0.407, "num_input_tokens_seen": 94128960, "step": 43660 }, { "epoch": 8.013396953569462, "grad_norm": 12.771419525146484, "learning_rate": 7.4900073117899686e-06, "loss": 0.2471, "num_input_tokens_seen": 94138944, "step": 43665 }, { "epoch": 8.014314553129015, "grad_norm": 11.754528045654297, "learning_rate": 7.4893128827371875e-06, "loss": 0.229, "num_input_tokens_seen": 94148928, "step": 43670 }, { "epoch": 8.015232152688567, "grad_norm": 6.214428424835205, "learning_rate": 7.48861838983739e-06, "loss": 0.3754, "num_input_tokens_seen": 94159296, "step": 43675 }, { "epoch": 8.016149752248118, "grad_norm": 13.854456901550293, "learning_rate": 7.487923833108388e-06, "loss": 0.1501, "num_input_tokens_seen": 94169216, "step": 43680 }, { "epoch": 8.017067351807672, "grad_norm": 35.24629211425781, "learning_rate": 7.487229212567995e-06, "loss": 0.1221, "num_input_tokens_seen": 94180224, "step": 43685 }, { "epoch": 8.017984951367223, "grad_norm": 3.1321284770965576, "learning_rate": 7.486534528234028e-06, "loss": 0.1887, "num_input_tokens_seen": 94190784, "step": 43690 }, { "epoch": 8.018902550926775, "grad_norm": 2.680208206176758, "learning_rate": 7.485839780124303e-06, "loss": 0.0585, "num_input_tokens_seen": 94202208, "step": 43695 }, { "epoch": 8.019820150486328, "grad_norm": 85.10100555419922, "learning_rate": 7.485144968256641e-06, "loss": 0.1194, "num_input_tokens_seen": 94212320, "step": 43700 }, { "epoch": 8.02073775004588, "grad_norm": 47.52214050292969, "learning_rate": 7.484450092648863e-06, "loss": 0.2324, "num_input_tokens_seen": 94223552, "step": 43705 }, { "epoch": 8.021655349605432, "grad_norm": 38.54783630371094, "learning_rate": 7.48375515331879e-06, "loss": 0.2547, "num_input_tokens_seen": 94234496, "step": 43710 }, { "epoch": 8.022572949164985, "grad_norm": 12.029515266418457, "learning_rate": 7.483060150284247e-06, "loss": 0.4639, "num_input_tokens_seen": 94245824, "step": 43715 }, { "epoch": 8.023490548724537, "grad_norm": 17.587116241455078, "learning_rate": 7.4823650835630594e-06, "loss": 0.0293, "num_input_tokens_seen": 94257248, "step": 43720 }, { "epoch": 8.024408148284088, "grad_norm": 2.8346266746520996, "learning_rate": 7.481669953173055e-06, "loss": 0.093, "num_input_tokens_seen": 94269824, "step": 43725 }, { "epoch": 8.025325747843642, "grad_norm": 19.91733169555664, "learning_rate": 7.480974759132061e-06, "loss": 0.1151, "num_input_tokens_seen": 94280192, "step": 43730 }, { "epoch": 8.026243347403193, "grad_norm": 2.1370959281921387, "learning_rate": 7.480279501457911e-06, "loss": 0.1242, "num_input_tokens_seen": 94291968, "step": 43735 }, { "epoch": 8.027160946962745, "grad_norm": 1.5962918996810913, "learning_rate": 7.479584180168437e-06, "loss": 0.0288, "num_input_tokens_seen": 94303424, "step": 43740 }, { "epoch": 8.028078546522298, "grad_norm": 1.889299750328064, "learning_rate": 7.47888879528147e-06, "loss": 0.2169, "num_input_tokens_seen": 94312768, "step": 43745 }, { "epoch": 8.02899614608185, "grad_norm": 38.16264724731445, "learning_rate": 7.478193346814848e-06, "loss": 0.3132, "num_input_tokens_seen": 94322464, "step": 43750 }, { "epoch": 8.029913745641402, "grad_norm": 122.96979522705078, "learning_rate": 7.477497834786408e-06, "loss": 0.1935, "num_input_tokens_seen": 94332960, "step": 43755 }, { "epoch": 8.030831345200955, "grad_norm": 1.9182677268981934, "learning_rate": 7.476802259213987e-06, "loss": 0.2124, "num_input_tokens_seen": 94343456, "step": 43760 }, { "epoch": 8.031748944760507, "grad_norm": 6.977285385131836, "learning_rate": 7.476106620115429e-06, "loss": 0.2424, "num_input_tokens_seen": 94353984, "step": 43765 }, { "epoch": 8.032666544320058, "grad_norm": 19.18617057800293, "learning_rate": 7.475410917508571e-06, "loss": 0.1791, "num_input_tokens_seen": 94365728, "step": 43770 }, { "epoch": 8.033584143879612, "grad_norm": 0.47964251041412354, "learning_rate": 7.47471515141126e-06, "loss": 0.019, "num_input_tokens_seen": 94376896, "step": 43775 }, { "epoch": 8.034501743439163, "grad_norm": 9.75123119354248, "learning_rate": 7.474019321841343e-06, "loss": 0.2361, "num_input_tokens_seen": 94387936, "step": 43780 }, { "epoch": 8.035419342998715, "grad_norm": 28.54258155822754, "learning_rate": 7.4733234288166625e-06, "loss": 0.3099, "num_input_tokens_seen": 94397728, "step": 43785 }, { "epoch": 8.036336942558268, "grad_norm": 6.781284809112549, "learning_rate": 7.472627472355071e-06, "loss": 0.2545, "num_input_tokens_seen": 94408896, "step": 43790 }, { "epoch": 8.03725454211782, "grad_norm": 2.101647138595581, "learning_rate": 7.471931452474414e-06, "loss": 0.0853, "num_input_tokens_seen": 94419648, "step": 43795 }, { "epoch": 8.038172141677371, "grad_norm": 0.17581118643283844, "learning_rate": 7.471235369192551e-06, "loss": 0.1227, "num_input_tokens_seen": 94431072, "step": 43800 }, { "epoch": 8.039089741236925, "grad_norm": 35.81757354736328, "learning_rate": 7.470539222527328e-06, "loss": 0.5823, "num_input_tokens_seen": 94443264, "step": 43805 }, { "epoch": 8.040007340796476, "grad_norm": 20.99687957763672, "learning_rate": 7.469843012496603e-06, "loss": 0.3737, "num_input_tokens_seen": 94454144, "step": 43810 }, { "epoch": 8.040924940356028, "grad_norm": 20.05990982055664, "learning_rate": 7.469146739118233e-06, "loss": 0.1827, "num_input_tokens_seen": 94463552, "step": 43815 }, { "epoch": 8.041842539915582, "grad_norm": 22.145551681518555, "learning_rate": 7.468450402410076e-06, "loss": 0.1341, "num_input_tokens_seen": 94474624, "step": 43820 }, { "epoch": 8.042760139475133, "grad_norm": 44.34401321411133, "learning_rate": 7.467754002389992e-06, "loss": 0.6048, "num_input_tokens_seen": 94484992, "step": 43825 }, { "epoch": 8.043677739034685, "grad_norm": 6.6188883781433105, "learning_rate": 7.467057539075842e-06, "loss": 0.2733, "num_input_tokens_seen": 94495488, "step": 43830 }, { "epoch": 8.044595338594238, "grad_norm": 25.261232376098633, "learning_rate": 7.466361012485491e-06, "loss": 0.1048, "num_input_tokens_seen": 94505728, "step": 43835 }, { "epoch": 8.04551293815379, "grad_norm": 54.71550369262695, "learning_rate": 7.465664422636801e-06, "loss": 0.193, "num_input_tokens_seen": 94515136, "step": 43840 }, { "epoch": 8.046430537713341, "grad_norm": 3.3707990646362305, "learning_rate": 7.464967769547641e-06, "loss": 0.2465, "num_input_tokens_seen": 94525600, "step": 43845 }, { "epoch": 8.047348137272895, "grad_norm": 1.7203717231750488, "learning_rate": 7.464271053235877e-06, "loss": 0.4708, "num_input_tokens_seen": 94536608, "step": 43850 }, { "epoch": 8.048265736832446, "grad_norm": 15.687417984008789, "learning_rate": 7.463574273719381e-06, "loss": 0.1796, "num_input_tokens_seen": 94548032, "step": 43855 }, { "epoch": 8.049183336391998, "grad_norm": 21.02165985107422, "learning_rate": 7.4628774310160235e-06, "loss": 0.2157, "num_input_tokens_seen": 94557344, "step": 43860 }, { "epoch": 8.050100935951551, "grad_norm": 0.5657174587249756, "learning_rate": 7.462180525143676e-06, "loss": 0.0993, "num_input_tokens_seen": 94566720, "step": 43865 }, { "epoch": 8.051018535511103, "grad_norm": 0.3252248466014862, "learning_rate": 7.461483556120214e-06, "loss": 0.0639, "num_input_tokens_seen": 94578144, "step": 43870 }, { "epoch": 8.051936135070655, "grad_norm": 32.16112518310547, "learning_rate": 7.4607865239635145e-06, "loss": 0.3903, "num_input_tokens_seen": 94589536, "step": 43875 }, { "epoch": 8.052853734630208, "grad_norm": 5.406974792480469, "learning_rate": 7.4600894286914535e-06, "loss": 0.2473, "num_input_tokens_seen": 94600736, "step": 43880 }, { "epoch": 8.05377133418976, "grad_norm": 12.932289123535156, "learning_rate": 7.4593922703219126e-06, "loss": 0.2154, "num_input_tokens_seen": 94611808, "step": 43885 }, { "epoch": 8.054688933749311, "grad_norm": 27.218076705932617, "learning_rate": 7.45869504887277e-06, "loss": 0.2152, "num_input_tokens_seen": 94622432, "step": 43890 }, { "epoch": 8.055606533308865, "grad_norm": 0.4944351613521576, "learning_rate": 7.4579977643619104e-06, "loss": 0.1684, "num_input_tokens_seen": 94633248, "step": 43895 }, { "epoch": 8.056524132868416, "grad_norm": 0.5296030640602112, "learning_rate": 7.457300416807219e-06, "loss": 0.153, "num_input_tokens_seen": 94644800, "step": 43900 }, { "epoch": 8.057441732427968, "grad_norm": 40.7379150390625, "learning_rate": 7.45660300622658e-06, "loss": 0.2714, "num_input_tokens_seen": 94655136, "step": 43905 }, { "epoch": 8.058359331987521, "grad_norm": 30.508628845214844, "learning_rate": 7.455905532637881e-06, "loss": 0.3978, "num_input_tokens_seen": 94665440, "step": 43910 }, { "epoch": 8.059276931547073, "grad_norm": 34.13716125488281, "learning_rate": 7.455207996059011e-06, "loss": 0.2945, "num_input_tokens_seen": 94676768, "step": 43915 }, { "epoch": 8.060194531106625, "grad_norm": 40.09669876098633, "learning_rate": 7.454510396507861e-06, "loss": 0.2361, "num_input_tokens_seen": 94687904, "step": 43920 }, { "epoch": 8.061112130666178, "grad_norm": 19.604610443115234, "learning_rate": 7.453812734002325e-06, "loss": 0.2532, "num_input_tokens_seen": 94699072, "step": 43925 }, { "epoch": 8.06202973022573, "grad_norm": 1.280616283416748, "learning_rate": 7.453115008560295e-06, "loss": 0.0661, "num_input_tokens_seen": 94709632, "step": 43930 }, { "epoch": 8.062947329785281, "grad_norm": 5.941107749938965, "learning_rate": 7.452417220199666e-06, "loss": 0.3897, "num_input_tokens_seen": 94721472, "step": 43935 }, { "epoch": 8.063864929344835, "grad_norm": 2.4548187255859375, "learning_rate": 7.4517193689383364e-06, "loss": 0.2491, "num_input_tokens_seen": 94731648, "step": 43940 }, { "epoch": 8.064782528904386, "grad_norm": 3.051579475402832, "learning_rate": 7.451021454794204e-06, "loss": 0.0922, "num_input_tokens_seen": 94743232, "step": 43945 }, { "epoch": 8.065700128463938, "grad_norm": 77.01422882080078, "learning_rate": 7.4503234777851716e-06, "loss": 0.1157, "num_input_tokens_seen": 94753824, "step": 43950 }, { "epoch": 8.066617728023491, "grad_norm": 12.67207145690918, "learning_rate": 7.449625437929139e-06, "loss": 0.3281, "num_input_tokens_seen": 94764768, "step": 43955 }, { "epoch": 8.067535327583043, "grad_norm": 10.93409252166748, "learning_rate": 7.448927335244012e-06, "loss": 0.2484, "num_input_tokens_seen": 94774176, "step": 43960 }, { "epoch": 8.068452927142594, "grad_norm": 4.130917072296143, "learning_rate": 7.448229169747692e-06, "loss": 0.2939, "num_input_tokens_seen": 94785152, "step": 43965 }, { "epoch": 8.069370526702148, "grad_norm": 1.5739227533340454, "learning_rate": 7.4475309414580896e-06, "loss": 0.1131, "num_input_tokens_seen": 94796224, "step": 43970 }, { "epoch": 8.0702881262617, "grad_norm": 59.26219177246094, "learning_rate": 7.446832650393112e-06, "loss": 0.2655, "num_input_tokens_seen": 94807424, "step": 43975 }, { "epoch": 8.071205725821251, "grad_norm": 30.949861526489258, "learning_rate": 7.446134296570669e-06, "loss": 0.1799, "num_input_tokens_seen": 94818944, "step": 43980 }, { "epoch": 8.072123325380804, "grad_norm": 0.7737221121788025, "learning_rate": 7.4454358800086715e-06, "loss": 0.2509, "num_input_tokens_seen": 94830176, "step": 43985 }, { "epoch": 8.073040924940356, "grad_norm": 36.0152587890625, "learning_rate": 7.444737400725034e-06, "loss": 0.2152, "num_input_tokens_seen": 94840672, "step": 43990 }, { "epoch": 8.073958524499908, "grad_norm": 69.902587890625, "learning_rate": 7.444038858737672e-06, "loss": 0.3317, "num_input_tokens_seen": 94850848, "step": 43995 }, { "epoch": 8.074876124059461, "grad_norm": 4.01743745803833, "learning_rate": 7.443340254064499e-06, "loss": 0.1595, "num_input_tokens_seen": 94862016, "step": 44000 }, { "epoch": 8.075793723619013, "grad_norm": 11.074621200561523, "learning_rate": 7.442641586723438e-06, "loss": 0.3252, "num_input_tokens_seen": 94872512, "step": 44005 }, { "epoch": 8.076711323178564, "grad_norm": 0.4711672067642212, "learning_rate": 7.441942856732405e-06, "loss": 0.1995, "num_input_tokens_seen": 94884640, "step": 44010 }, { "epoch": 8.077628922738118, "grad_norm": 16.3809871673584, "learning_rate": 7.441244064109322e-06, "loss": 0.2364, "num_input_tokens_seen": 94895296, "step": 44015 }, { "epoch": 8.07854652229767, "grad_norm": 10.465503692626953, "learning_rate": 7.440545208872114e-06, "loss": 0.1912, "num_input_tokens_seen": 94906816, "step": 44020 }, { "epoch": 8.079464121857221, "grad_norm": 32.13044738769531, "learning_rate": 7.4398462910387016e-06, "loss": 0.2634, "num_input_tokens_seen": 94917760, "step": 44025 }, { "epoch": 8.080381721416774, "grad_norm": 35.802974700927734, "learning_rate": 7.439147310627014e-06, "loss": 0.4383, "num_input_tokens_seen": 94927584, "step": 44030 }, { "epoch": 8.081299320976326, "grad_norm": 5.954806804656982, "learning_rate": 7.43844826765498e-06, "loss": 0.1159, "num_input_tokens_seen": 94939104, "step": 44035 }, { "epoch": 8.082216920535878, "grad_norm": 8.164630889892578, "learning_rate": 7.437749162140524e-06, "loss": 0.2414, "num_input_tokens_seen": 94949504, "step": 44040 }, { "epoch": 8.083134520095431, "grad_norm": 37.146793365478516, "learning_rate": 7.437049994101583e-06, "loss": 0.2493, "num_input_tokens_seen": 94961056, "step": 44045 }, { "epoch": 8.084052119654983, "grad_norm": 24.68697738647461, "learning_rate": 7.436350763556085e-06, "loss": 0.1991, "num_input_tokens_seen": 94972384, "step": 44050 }, { "epoch": 8.084969719214534, "grad_norm": 5.733673095703125, "learning_rate": 7.4356514705219664e-06, "loss": 0.0792, "num_input_tokens_seen": 94982688, "step": 44055 }, { "epoch": 8.085887318774088, "grad_norm": 1.0704904794692993, "learning_rate": 7.4349521150171634e-06, "loss": 0.0938, "num_input_tokens_seen": 94992064, "step": 44060 }, { "epoch": 8.08680491833364, "grad_norm": 4.485786437988281, "learning_rate": 7.434252697059611e-06, "loss": 0.2535, "num_input_tokens_seen": 95002656, "step": 44065 }, { "epoch": 8.08772251789319, "grad_norm": 17.895244598388672, "learning_rate": 7.433553216667251e-06, "loss": 0.2041, "num_input_tokens_seen": 95013312, "step": 44070 }, { "epoch": 8.088640117452744, "grad_norm": 7.0147786140441895, "learning_rate": 7.432853673858021e-06, "loss": 0.266, "num_input_tokens_seen": 95024000, "step": 44075 }, { "epoch": 8.089557717012296, "grad_norm": 2.3913333415985107, "learning_rate": 7.432154068649867e-06, "loss": 0.0283, "num_input_tokens_seen": 95035616, "step": 44080 }, { "epoch": 8.090475316571847, "grad_norm": 0.8928533792495728, "learning_rate": 7.4314544010607306e-06, "loss": 0.3429, "num_input_tokens_seen": 95048320, "step": 44085 }, { "epoch": 8.0913929161314, "grad_norm": 4.843916893005371, "learning_rate": 7.430754671108555e-06, "loss": 0.1353, "num_input_tokens_seen": 95058912, "step": 44090 }, { "epoch": 8.092310515690952, "grad_norm": 48.69519805908203, "learning_rate": 7.430054878811292e-06, "loss": 0.2, "num_input_tokens_seen": 95068416, "step": 44095 }, { "epoch": 8.093228115250504, "grad_norm": 35.18515396118164, "learning_rate": 7.429355024186885e-06, "loss": 0.3155, "num_input_tokens_seen": 95078304, "step": 44100 }, { "epoch": 8.094145714810058, "grad_norm": 8.345308303833008, "learning_rate": 7.428655107253288e-06, "loss": 0.2791, "num_input_tokens_seen": 95089248, "step": 44105 }, { "epoch": 8.09506331436961, "grad_norm": 20.798290252685547, "learning_rate": 7.427955128028452e-06, "loss": 0.1478, "num_input_tokens_seen": 95099840, "step": 44110 }, { "epoch": 8.09598091392916, "grad_norm": 4.128190040588379, "learning_rate": 7.42725508653033e-06, "loss": 0.1959, "num_input_tokens_seen": 95112704, "step": 44115 }, { "epoch": 8.096898513488714, "grad_norm": 34.64964294433594, "learning_rate": 7.4265549827768755e-06, "loss": 0.4366, "num_input_tokens_seen": 95123904, "step": 44120 }, { "epoch": 8.097816113048266, "grad_norm": 1.2143607139587402, "learning_rate": 7.425854816786048e-06, "loss": 0.2936, "num_input_tokens_seen": 95134912, "step": 44125 }, { "epoch": 8.098733712607817, "grad_norm": 21.60252571105957, "learning_rate": 7.425154588575803e-06, "loss": 0.304, "num_input_tokens_seen": 95146560, "step": 44130 }, { "epoch": 8.09965131216737, "grad_norm": 42.75450134277344, "learning_rate": 7.424454298164102e-06, "loss": 0.1824, "num_input_tokens_seen": 95156736, "step": 44135 }, { "epoch": 8.100568911726922, "grad_norm": 0.5956956744194031, "learning_rate": 7.4237539455689055e-06, "loss": 0.2107, "num_input_tokens_seen": 95168192, "step": 44140 }, { "epoch": 8.101486511286474, "grad_norm": 0.1656503975391388, "learning_rate": 7.423053530808178e-06, "loss": 0.137, "num_input_tokens_seen": 95180192, "step": 44145 }, { "epoch": 8.102404110846027, "grad_norm": 8.636752128601074, "learning_rate": 7.422353053899881e-06, "loss": 0.1684, "num_input_tokens_seen": 95189792, "step": 44150 }, { "epoch": 8.103321710405579, "grad_norm": 6.252366065979004, "learning_rate": 7.421652514861985e-06, "loss": 0.0452, "num_input_tokens_seen": 95199968, "step": 44155 }, { "epoch": 8.10423930996513, "grad_norm": 19.975133895874023, "learning_rate": 7.420951913712453e-06, "loss": 0.1414, "num_input_tokens_seen": 95210464, "step": 44160 }, { "epoch": 8.105156909524684, "grad_norm": 13.822908401489258, "learning_rate": 7.420251250469257e-06, "loss": 0.0746, "num_input_tokens_seen": 95221024, "step": 44165 }, { "epoch": 8.106074509084236, "grad_norm": 1.7563745975494385, "learning_rate": 7.419550525150367e-06, "loss": 0.5852, "num_input_tokens_seen": 95232320, "step": 44170 }, { "epoch": 8.106992108643787, "grad_norm": 97.8157730102539, "learning_rate": 7.4188497377737565e-06, "loss": 0.4617, "num_input_tokens_seen": 95242208, "step": 44175 }, { "epoch": 8.10790970820334, "grad_norm": 0.39983296394348145, "learning_rate": 7.4181488883574e-06, "loss": 0.1458, "num_input_tokens_seen": 95253376, "step": 44180 }, { "epoch": 8.108827307762892, "grad_norm": 0.08767339587211609, "learning_rate": 7.417447976919272e-06, "loss": 0.1781, "num_input_tokens_seen": 95263808, "step": 44185 }, { "epoch": 8.109744907322444, "grad_norm": 30.289600372314453, "learning_rate": 7.4167470034773505e-06, "loss": 0.3774, "num_input_tokens_seen": 95274656, "step": 44190 }, { "epoch": 8.110662506881997, "grad_norm": 7.746365547180176, "learning_rate": 7.416045968049613e-06, "loss": 0.2545, "num_input_tokens_seen": 95285248, "step": 44195 }, { "epoch": 8.111580106441549, "grad_norm": 15.446609497070312, "learning_rate": 7.415344870654041e-06, "loss": 0.3768, "num_input_tokens_seen": 95297120, "step": 44200 }, { "epoch": 8.1124977060011, "grad_norm": 6.443824291229248, "learning_rate": 7.4146437113086164e-06, "loss": 0.2226, "num_input_tokens_seen": 95308128, "step": 44205 }, { "epoch": 8.113415305560654, "grad_norm": 12.138888359069824, "learning_rate": 7.4139424900313225e-06, "loss": 0.3762, "num_input_tokens_seen": 95319904, "step": 44210 }, { "epoch": 8.114332905120206, "grad_norm": 0.5949898362159729, "learning_rate": 7.413241206840146e-06, "loss": 0.2468, "num_input_tokens_seen": 95331392, "step": 44215 }, { "epoch": 8.115250504679757, "grad_norm": 35.35400390625, "learning_rate": 7.412539861753073e-06, "loss": 0.4902, "num_input_tokens_seen": 95341984, "step": 44220 }, { "epoch": 8.11616810423931, "grad_norm": 2.602546453475952, "learning_rate": 7.41183845478809e-06, "loss": 0.1168, "num_input_tokens_seen": 95353888, "step": 44225 }, { "epoch": 8.117085703798862, "grad_norm": 0.6128736138343811, "learning_rate": 7.411136985963191e-06, "loss": 0.3789, "num_input_tokens_seen": 95364224, "step": 44230 }, { "epoch": 8.118003303358414, "grad_norm": 11.032026290893555, "learning_rate": 7.410435455296364e-06, "loss": 0.1063, "num_input_tokens_seen": 95375680, "step": 44235 }, { "epoch": 8.118920902917967, "grad_norm": 1.3074045181274414, "learning_rate": 7.409733862805603e-06, "loss": 0.0119, "num_input_tokens_seen": 95386208, "step": 44240 }, { "epoch": 8.119838502477519, "grad_norm": 0.34705638885498047, "learning_rate": 7.409032208508904e-06, "loss": 0.2663, "num_input_tokens_seen": 95398528, "step": 44245 }, { "epoch": 8.12075610203707, "grad_norm": 69.47299194335938, "learning_rate": 7.408330492424262e-06, "loss": 0.3924, "num_input_tokens_seen": 95409440, "step": 44250 }, { "epoch": 8.121673701596624, "grad_norm": 0.43191826343536377, "learning_rate": 7.407628714569676e-06, "loss": 0.3837, "num_input_tokens_seen": 95421280, "step": 44255 }, { "epoch": 8.122591301156175, "grad_norm": 21.746370315551758, "learning_rate": 7.406926874963144e-06, "loss": 0.5065, "num_input_tokens_seen": 95433632, "step": 44260 }, { "epoch": 8.123508900715727, "grad_norm": 29.597307205200195, "learning_rate": 7.4062249736226685e-06, "loss": 0.2016, "num_input_tokens_seen": 95443072, "step": 44265 }, { "epoch": 8.12442650027528, "grad_norm": 43.63833999633789, "learning_rate": 7.405523010566252e-06, "loss": 0.2158, "num_input_tokens_seen": 95451968, "step": 44270 }, { "epoch": 8.125344099834832, "grad_norm": 28.483516693115234, "learning_rate": 7.404820985811898e-06, "loss": 0.3145, "num_input_tokens_seen": 95461664, "step": 44275 }, { "epoch": 8.126261699394384, "grad_norm": 0.9632588028907776, "learning_rate": 7.404118899377612e-06, "loss": 0.2107, "num_input_tokens_seen": 95471744, "step": 44280 }, { "epoch": 8.127179298953937, "grad_norm": 0.20026978850364685, "learning_rate": 7.403416751281403e-06, "loss": 0.3343, "num_input_tokens_seen": 95482304, "step": 44285 }, { "epoch": 8.128096898513489, "grad_norm": 31.078996658325195, "learning_rate": 7.4027145415412816e-06, "loss": 0.0962, "num_input_tokens_seen": 95492832, "step": 44290 }, { "epoch": 8.12901449807304, "grad_norm": 39.168582916259766, "learning_rate": 7.402012270175254e-06, "loss": 0.4108, "num_input_tokens_seen": 95504928, "step": 44295 }, { "epoch": 8.129932097632594, "grad_norm": 0.697143018245697, "learning_rate": 7.401309937201334e-06, "loss": 0.162, "num_input_tokens_seen": 95516224, "step": 44300 }, { "epoch": 8.130849697192145, "grad_norm": 6.02606725692749, "learning_rate": 7.400607542637537e-06, "loss": 0.4825, "num_input_tokens_seen": 95526880, "step": 44305 }, { "epoch": 8.131767296751697, "grad_norm": 2.025048017501831, "learning_rate": 7.3999050865018764e-06, "loss": 0.1815, "num_input_tokens_seen": 95538496, "step": 44310 }, { "epoch": 8.13268489631125, "grad_norm": 3.298381805419922, "learning_rate": 7.39920256881237e-06, "loss": 0.3421, "num_input_tokens_seen": 95549408, "step": 44315 }, { "epoch": 8.133602495870802, "grad_norm": 1.4791871309280396, "learning_rate": 7.398499989587036e-06, "loss": 0.0921, "num_input_tokens_seen": 95559808, "step": 44320 }, { "epoch": 8.134520095430354, "grad_norm": 43.21430206298828, "learning_rate": 7.3977973488438945e-06, "loss": 0.3616, "num_input_tokens_seen": 95570720, "step": 44325 }, { "epoch": 8.135437694989907, "grad_norm": 1.7391918897628784, "learning_rate": 7.397094646600968e-06, "loss": 0.2338, "num_input_tokens_seen": 95580448, "step": 44330 }, { "epoch": 8.136355294549459, "grad_norm": 23.022113800048828, "learning_rate": 7.3963918828762785e-06, "loss": 0.2243, "num_input_tokens_seen": 95590464, "step": 44335 }, { "epoch": 8.13727289410901, "grad_norm": 13.472220420837402, "learning_rate": 7.3956890576878515e-06, "loss": 0.3312, "num_input_tokens_seen": 95601824, "step": 44340 }, { "epoch": 8.138190493668564, "grad_norm": 12.225330352783203, "learning_rate": 7.394986171053713e-06, "loss": 0.2474, "num_input_tokens_seen": 95613184, "step": 44345 }, { "epoch": 8.139108093228115, "grad_norm": 33.05497741699219, "learning_rate": 7.39428322299189e-06, "loss": 0.2474, "num_input_tokens_seen": 95624736, "step": 44350 }, { "epoch": 8.140025692787667, "grad_norm": 52.285301208496094, "learning_rate": 7.393580213520415e-06, "loss": 0.2615, "num_input_tokens_seen": 95634240, "step": 44355 }, { "epoch": 8.14094329234722, "grad_norm": 25.818593978881836, "learning_rate": 7.392877142657316e-06, "loss": 0.2159, "num_input_tokens_seen": 95645600, "step": 44360 }, { "epoch": 8.141860891906772, "grad_norm": 29.073917388916016, "learning_rate": 7.392174010420628e-06, "loss": 0.2303, "num_input_tokens_seen": 95655616, "step": 44365 }, { "epoch": 8.142778491466323, "grad_norm": 1.3502131700515747, "learning_rate": 7.3914708168283824e-06, "loss": 0.2504, "num_input_tokens_seen": 95665408, "step": 44370 }, { "epoch": 8.143696091025877, "grad_norm": 18.474742889404297, "learning_rate": 7.390767561898617e-06, "loss": 0.2579, "num_input_tokens_seen": 95677632, "step": 44375 }, { "epoch": 8.144613690585429, "grad_norm": 1.0760329961776733, "learning_rate": 7.390064245649371e-06, "loss": 0.1211, "num_input_tokens_seen": 95688768, "step": 44380 }, { "epoch": 8.14553129014498, "grad_norm": 20.08260726928711, "learning_rate": 7.389360868098679e-06, "loss": 0.5863, "num_input_tokens_seen": 95698688, "step": 44385 }, { "epoch": 8.146448889704534, "grad_norm": 0.08149389922618866, "learning_rate": 7.3886574292645865e-06, "loss": 0.2047, "num_input_tokens_seen": 95709280, "step": 44390 }, { "epoch": 8.147366489264085, "grad_norm": 17.813861846923828, "learning_rate": 7.38795392916513e-06, "loss": 0.3065, "num_input_tokens_seen": 95720128, "step": 44395 }, { "epoch": 8.148284088823637, "grad_norm": 12.517032623291016, "learning_rate": 7.38725036781836e-06, "loss": 0.3329, "num_input_tokens_seen": 95730656, "step": 44400 }, { "epoch": 8.14920168838319, "grad_norm": 3.2173678874969482, "learning_rate": 7.386546745242316e-06, "loss": 0.2727, "num_input_tokens_seen": 95741600, "step": 44405 }, { "epoch": 8.150119287942742, "grad_norm": 10.598125457763672, "learning_rate": 7.3858430614550455e-06, "loss": 0.2619, "num_input_tokens_seen": 95751488, "step": 44410 }, { "epoch": 8.151036887502293, "grad_norm": 4.819704055786133, "learning_rate": 7.3851393164746e-06, "loss": 0.3015, "num_input_tokens_seen": 95762848, "step": 44415 }, { "epoch": 8.151954487061847, "grad_norm": 6.954087734222412, "learning_rate": 7.384435510319027e-06, "loss": 0.0745, "num_input_tokens_seen": 95773184, "step": 44420 }, { "epoch": 8.152872086621398, "grad_norm": 4.358417987823486, "learning_rate": 7.383731643006379e-06, "loss": 0.1832, "num_input_tokens_seen": 95784992, "step": 44425 }, { "epoch": 8.15378968618095, "grad_norm": 53.05571365356445, "learning_rate": 7.383027714554708e-06, "loss": 0.1336, "num_input_tokens_seen": 95794720, "step": 44430 }, { "epoch": 8.154707285740503, "grad_norm": 33.69858932495117, "learning_rate": 7.38232372498207e-06, "loss": 0.1565, "num_input_tokens_seen": 95806496, "step": 44435 }, { "epoch": 8.155624885300055, "grad_norm": 17.413785934448242, "learning_rate": 7.381619674306521e-06, "loss": 0.2139, "num_input_tokens_seen": 95816736, "step": 44440 }, { "epoch": 8.156542484859607, "grad_norm": 7.621151924133301, "learning_rate": 7.380915562546117e-06, "loss": 0.1057, "num_input_tokens_seen": 95826912, "step": 44445 }, { "epoch": 8.15746008441916, "grad_norm": 13.947540283203125, "learning_rate": 7.380211389718921e-06, "loss": 0.0924, "num_input_tokens_seen": 95837216, "step": 44450 }, { "epoch": 8.158377683978712, "grad_norm": 41.799835205078125, "learning_rate": 7.379507155842991e-06, "loss": 0.3321, "num_input_tokens_seen": 95848096, "step": 44455 }, { "epoch": 8.159295283538263, "grad_norm": 1.341589331626892, "learning_rate": 7.378802860936389e-06, "loss": 0.1718, "num_input_tokens_seen": 95859776, "step": 44460 }, { "epoch": 8.160212883097817, "grad_norm": 4.457686424255371, "learning_rate": 7.378098505017183e-06, "loss": 0.3999, "num_input_tokens_seen": 95870752, "step": 44465 }, { "epoch": 8.161130482657368, "grad_norm": 0.7276471853256226, "learning_rate": 7.377394088103433e-06, "loss": 0.13, "num_input_tokens_seen": 95882208, "step": 44470 }, { "epoch": 8.16204808221692, "grad_norm": 22.79157257080078, "learning_rate": 7.376689610213212e-06, "loss": 0.3695, "num_input_tokens_seen": 95893088, "step": 44475 }, { "epoch": 8.162965681776473, "grad_norm": 20.386171340942383, "learning_rate": 7.375985071364585e-06, "loss": 0.1387, "num_input_tokens_seen": 95904864, "step": 44480 }, { "epoch": 8.163883281336025, "grad_norm": 9.098913192749023, "learning_rate": 7.375280471575624e-06, "loss": 0.2113, "num_input_tokens_seen": 95917440, "step": 44485 }, { "epoch": 8.164800880895577, "grad_norm": 20.50450325012207, "learning_rate": 7.3745758108643995e-06, "loss": 0.2323, "num_input_tokens_seen": 95928480, "step": 44490 }, { "epoch": 8.16571848045513, "grad_norm": 29.749536514282227, "learning_rate": 7.373871089248985e-06, "loss": 0.0998, "num_input_tokens_seen": 95938368, "step": 44495 }, { "epoch": 8.166636080014682, "grad_norm": 50.08067321777344, "learning_rate": 7.373166306747458e-06, "loss": 0.0903, "num_input_tokens_seen": 95948768, "step": 44500 }, { "epoch": 8.167553679574233, "grad_norm": 9.738625526428223, "learning_rate": 7.3724614633778925e-06, "loss": 0.219, "num_input_tokens_seen": 95960000, "step": 44505 }, { "epoch": 8.168471279133787, "grad_norm": 25.670629501342773, "learning_rate": 7.371756559158367e-06, "loss": 0.2112, "num_input_tokens_seen": 95971584, "step": 44510 }, { "epoch": 8.169388878693338, "grad_norm": 12.495015144348145, "learning_rate": 7.371051594106964e-06, "loss": 0.3674, "num_input_tokens_seen": 95982144, "step": 44515 }, { "epoch": 8.17030647825289, "grad_norm": 23.155433654785156, "learning_rate": 7.37034656824176e-06, "loss": 0.3205, "num_input_tokens_seen": 95992608, "step": 44520 }, { "epoch": 8.171224077812443, "grad_norm": 0.5633730292320251, "learning_rate": 7.369641481580841e-06, "loss": 0.1739, "num_input_tokens_seen": 96002496, "step": 44525 }, { "epoch": 8.172141677371995, "grad_norm": 11.3672456741333, "learning_rate": 7.368936334142289e-06, "loss": 0.3107, "num_input_tokens_seen": 96014112, "step": 44530 }, { "epoch": 8.173059276931546, "grad_norm": 35.47451400756836, "learning_rate": 7.368231125944193e-06, "loss": 0.3092, "num_input_tokens_seen": 96025184, "step": 44535 }, { "epoch": 8.1739768764911, "grad_norm": 7.532867431640625, "learning_rate": 7.3675258570046395e-06, "loss": 0.1089, "num_input_tokens_seen": 96036640, "step": 44540 }, { "epoch": 8.174894476050651, "grad_norm": 18.18372917175293, "learning_rate": 7.366820527341716e-06, "loss": 0.3765, "num_input_tokens_seen": 96047488, "step": 44545 }, { "epoch": 8.175812075610203, "grad_norm": 1.3427903652191162, "learning_rate": 7.366115136973515e-06, "loss": 0.2879, "num_input_tokens_seen": 96057664, "step": 44550 }, { "epoch": 8.176729675169756, "grad_norm": 25.790109634399414, "learning_rate": 7.365409685918128e-06, "loss": 0.1264, "num_input_tokens_seen": 96068416, "step": 44555 }, { "epoch": 8.177647274729308, "grad_norm": 4.735416412353516, "learning_rate": 7.364704174193646e-06, "loss": 0.2234, "num_input_tokens_seen": 96077504, "step": 44560 }, { "epoch": 8.17856487428886, "grad_norm": 6.698342323303223, "learning_rate": 7.363998601818171e-06, "loss": 0.3347, "num_input_tokens_seen": 96088512, "step": 44565 }, { "epoch": 8.179482473848413, "grad_norm": 30.37181854248047, "learning_rate": 7.363292968809793e-06, "loss": 0.1543, "num_input_tokens_seen": 96098208, "step": 44570 }, { "epoch": 8.180400073407965, "grad_norm": 24.64922332763672, "learning_rate": 7.362587275186614e-06, "loss": 0.2317, "num_input_tokens_seen": 96109248, "step": 44575 }, { "epoch": 8.181317672967516, "grad_norm": 0.3037783205509186, "learning_rate": 7.361881520966733e-06, "loss": 0.1214, "num_input_tokens_seen": 96120864, "step": 44580 }, { "epoch": 8.18223527252707, "grad_norm": 34.29084777832031, "learning_rate": 7.361175706168252e-06, "loss": 0.1728, "num_input_tokens_seen": 96131744, "step": 44585 }, { "epoch": 8.183152872086621, "grad_norm": 12.100143432617188, "learning_rate": 7.360469830809272e-06, "loss": 0.2033, "num_input_tokens_seen": 96142368, "step": 44590 }, { "epoch": 8.184070471646173, "grad_norm": 26.386436462402344, "learning_rate": 7.359763894907901e-06, "loss": 0.1178, "num_input_tokens_seen": 96153600, "step": 44595 }, { "epoch": 8.184988071205726, "grad_norm": 5.949862480163574, "learning_rate": 7.359057898482244e-06, "loss": 0.2155, "num_input_tokens_seen": 96165568, "step": 44600 }, { "epoch": 8.185905670765278, "grad_norm": 10.803995132446289, "learning_rate": 7.358351841550406e-06, "loss": 0.1029, "num_input_tokens_seen": 96176608, "step": 44605 }, { "epoch": 8.18682327032483, "grad_norm": 2.102036237716675, "learning_rate": 7.3576457241305e-06, "loss": 0.3164, "num_input_tokens_seen": 96187680, "step": 44610 }, { "epoch": 8.187740869884383, "grad_norm": 12.466869354248047, "learning_rate": 7.3569395462406335e-06, "loss": 0.3243, "num_input_tokens_seen": 96198176, "step": 44615 }, { "epoch": 8.188658469443935, "grad_norm": 14.803071022033691, "learning_rate": 7.356233307898922e-06, "loss": 0.4262, "num_input_tokens_seen": 96208704, "step": 44620 }, { "epoch": 8.189576069003486, "grad_norm": 4.6592936515808105, "learning_rate": 7.355527009123479e-06, "loss": 0.2918, "num_input_tokens_seen": 96218080, "step": 44625 }, { "epoch": 8.19049366856304, "grad_norm": 69.09712219238281, "learning_rate": 7.354820649932417e-06, "loss": 0.3162, "num_input_tokens_seen": 96228960, "step": 44630 }, { "epoch": 8.191411268122591, "grad_norm": 59.80573654174805, "learning_rate": 7.354114230343856e-06, "loss": 0.2819, "num_input_tokens_seen": 96240320, "step": 44635 }, { "epoch": 8.192328867682143, "grad_norm": 1.1475327014923096, "learning_rate": 7.3534077503759125e-06, "loss": 0.2506, "num_input_tokens_seen": 96250688, "step": 44640 }, { "epoch": 8.193246467241696, "grad_norm": 29.922466278076172, "learning_rate": 7.352701210046708e-06, "loss": 0.3421, "num_input_tokens_seen": 96261504, "step": 44645 }, { "epoch": 8.194164066801248, "grad_norm": 37.175174713134766, "learning_rate": 7.351994609374364e-06, "loss": 0.1727, "num_input_tokens_seen": 96273408, "step": 44650 }, { "epoch": 8.1950816663608, "grad_norm": 5.395966053009033, "learning_rate": 7.3512879483770035e-06, "loss": 0.1672, "num_input_tokens_seen": 96284640, "step": 44655 }, { "epoch": 8.195999265920353, "grad_norm": 12.630372047424316, "learning_rate": 7.350581227072752e-06, "loss": 0.1717, "num_input_tokens_seen": 96294560, "step": 44660 }, { "epoch": 8.196916865479905, "grad_norm": 1.5288199186325073, "learning_rate": 7.349874445479733e-06, "loss": 0.1456, "num_input_tokens_seen": 96305856, "step": 44665 }, { "epoch": 8.197834465039456, "grad_norm": 4.634335041046143, "learning_rate": 7.349167603616079e-06, "loss": 0.1035, "num_input_tokens_seen": 96316160, "step": 44670 }, { "epoch": 8.19875206459901, "grad_norm": 22.933326721191406, "learning_rate": 7.348460701499915e-06, "loss": 0.5268, "num_input_tokens_seen": 96326624, "step": 44675 }, { "epoch": 8.199669664158561, "grad_norm": 1.6251945495605469, "learning_rate": 7.3477537391493745e-06, "loss": 0.1598, "num_input_tokens_seen": 96338336, "step": 44680 }, { "epoch": 8.200587263718113, "grad_norm": 0.5936795473098755, "learning_rate": 7.34704671658259e-06, "loss": 0.2526, "num_input_tokens_seen": 96348384, "step": 44685 }, { "epoch": 8.201504863277666, "grad_norm": 6.300563812255859, "learning_rate": 7.346339633817694e-06, "loss": 0.2407, "num_input_tokens_seen": 96359520, "step": 44690 }, { "epoch": 8.202422462837218, "grad_norm": 44.238277435302734, "learning_rate": 7.345632490872821e-06, "loss": 0.3982, "num_input_tokens_seen": 96370624, "step": 44695 }, { "epoch": 8.20334006239677, "grad_norm": 29.369138717651367, "learning_rate": 7.344925287766114e-06, "loss": 0.3318, "num_input_tokens_seen": 96382976, "step": 44700 }, { "epoch": 8.204257661956323, "grad_norm": 18.35712242126465, "learning_rate": 7.344218024515704e-06, "loss": 0.273, "num_input_tokens_seen": 96392800, "step": 44705 }, { "epoch": 8.205175261515874, "grad_norm": 16.901044845581055, "learning_rate": 7.343510701139737e-06, "loss": 0.4289, "num_input_tokens_seen": 96404640, "step": 44710 }, { "epoch": 8.206092861075426, "grad_norm": 34.62877655029297, "learning_rate": 7.342803317656353e-06, "loss": 0.4952, "num_input_tokens_seen": 96414432, "step": 44715 }, { "epoch": 8.20701046063498, "grad_norm": 36.7763557434082, "learning_rate": 7.342095874083694e-06, "loss": 0.2011, "num_input_tokens_seen": 96423808, "step": 44720 }, { "epoch": 8.207928060194531, "grad_norm": 3.7755320072174072, "learning_rate": 7.341388370439907e-06, "loss": 0.2803, "num_input_tokens_seen": 96433888, "step": 44725 }, { "epoch": 8.208845659754083, "grad_norm": 5.903262615203857, "learning_rate": 7.340680806743135e-06, "loss": 0.2269, "num_input_tokens_seen": 96442272, "step": 44730 }, { "epoch": 8.209763259313636, "grad_norm": 15.112974166870117, "learning_rate": 7.33997318301153e-06, "loss": 0.0529, "num_input_tokens_seen": 96452576, "step": 44735 }, { "epoch": 8.210680858873188, "grad_norm": 0.27641037106513977, "learning_rate": 7.339265499263237e-06, "loss": 0.1246, "num_input_tokens_seen": 96463584, "step": 44740 }, { "epoch": 8.21159845843274, "grad_norm": 13.22353744506836, "learning_rate": 7.338557755516412e-06, "loss": 0.1746, "num_input_tokens_seen": 96475296, "step": 44745 }, { "epoch": 8.212516057992293, "grad_norm": 15.200532913208008, "learning_rate": 7.337849951789204e-06, "loss": 0.661, "num_input_tokens_seen": 96485696, "step": 44750 }, { "epoch": 8.213433657551844, "grad_norm": 20.01258659362793, "learning_rate": 7.337142088099767e-06, "loss": 0.0604, "num_input_tokens_seen": 96497024, "step": 44755 }, { "epoch": 8.214351257111396, "grad_norm": 1.042234182357788, "learning_rate": 7.33643416446626e-06, "loss": 0.1582, "num_input_tokens_seen": 96508160, "step": 44760 }, { "epoch": 8.21526885667095, "grad_norm": 9.034211158752441, "learning_rate": 7.335726180906836e-06, "loss": 0.4143, "num_input_tokens_seen": 96518944, "step": 44765 }, { "epoch": 8.216186456230501, "grad_norm": 19.71019744873047, "learning_rate": 7.335018137439657e-06, "loss": 0.3183, "num_input_tokens_seen": 96528640, "step": 44770 }, { "epoch": 8.217104055790053, "grad_norm": 2.1062159538269043, "learning_rate": 7.33431003408288e-06, "loss": 0.0991, "num_input_tokens_seen": 96540064, "step": 44775 }, { "epoch": 8.218021655349606, "grad_norm": 12.071126937866211, "learning_rate": 7.333601870854669e-06, "loss": 0.1974, "num_input_tokens_seen": 96551584, "step": 44780 }, { "epoch": 8.218939254909158, "grad_norm": 33.448238372802734, "learning_rate": 7.332893647773187e-06, "loss": 0.2136, "num_input_tokens_seen": 96562400, "step": 44785 }, { "epoch": 8.21985685446871, "grad_norm": 7.707973957061768, "learning_rate": 7.332185364856599e-06, "loss": 0.2658, "num_input_tokens_seen": 96572960, "step": 44790 }, { "epoch": 8.220774454028263, "grad_norm": 42.37853240966797, "learning_rate": 7.33147702212307e-06, "loss": 0.2474, "num_input_tokens_seen": 96582976, "step": 44795 }, { "epoch": 8.221692053587814, "grad_norm": 3.7977452278137207, "learning_rate": 7.330768619590769e-06, "loss": 0.0671, "num_input_tokens_seen": 96593792, "step": 44800 }, { "epoch": 8.222609653147366, "grad_norm": 52.74278259277344, "learning_rate": 7.3300601572778655e-06, "loss": 0.428, "num_input_tokens_seen": 96606016, "step": 44805 }, { "epoch": 8.22352725270692, "grad_norm": 10.122787475585938, "learning_rate": 7.32935163520253e-06, "loss": 0.28, "num_input_tokens_seen": 96617120, "step": 44810 }, { "epoch": 8.22444485226647, "grad_norm": 19.822195053100586, "learning_rate": 7.328643053382937e-06, "loss": 0.2103, "num_input_tokens_seen": 96628352, "step": 44815 }, { "epoch": 8.225362451826022, "grad_norm": 1.32039213180542, "learning_rate": 7.3279344118372575e-06, "loss": 0.1122, "num_input_tokens_seen": 96639456, "step": 44820 }, { "epoch": 8.226280051385576, "grad_norm": 56.62736511230469, "learning_rate": 7.327225710583668e-06, "loss": 0.3064, "num_input_tokens_seen": 96650464, "step": 44825 }, { "epoch": 8.227197650945127, "grad_norm": 1.3750609159469604, "learning_rate": 7.326516949640346e-06, "loss": 0.0151, "num_input_tokens_seen": 96661696, "step": 44830 }, { "epoch": 8.228115250504679, "grad_norm": 23.591506958007812, "learning_rate": 7.3258081290254715e-06, "loss": 0.4014, "num_input_tokens_seen": 96672384, "step": 44835 }, { "epoch": 8.229032850064232, "grad_norm": 0.3557870388031006, "learning_rate": 7.325099248757221e-06, "loss": 0.0868, "num_input_tokens_seen": 96683456, "step": 44840 }, { "epoch": 8.229950449623784, "grad_norm": 27.78290367126465, "learning_rate": 7.324390308853779e-06, "loss": 0.0899, "num_input_tokens_seen": 96694912, "step": 44845 }, { "epoch": 8.230868049183336, "grad_norm": 14.581938743591309, "learning_rate": 7.323681309333328e-06, "loss": 0.0932, "num_input_tokens_seen": 96705312, "step": 44850 }, { "epoch": 8.231785648742889, "grad_norm": 0.48631107807159424, "learning_rate": 7.322972250214054e-06, "loss": 0.1118, "num_input_tokens_seen": 96714784, "step": 44855 }, { "epoch": 8.23270324830244, "grad_norm": 36.862159729003906, "learning_rate": 7.32226313151414e-06, "loss": 0.0423, "num_input_tokens_seen": 96725344, "step": 44860 }, { "epoch": 8.233620847861992, "grad_norm": 5.344030380249023, "learning_rate": 7.321553953251777e-06, "loss": 0.3076, "num_input_tokens_seen": 96737024, "step": 44865 }, { "epoch": 8.234538447421546, "grad_norm": 32.20966720581055, "learning_rate": 7.320844715445153e-06, "loss": 0.4965, "num_input_tokens_seen": 96748032, "step": 44870 }, { "epoch": 8.235456046981097, "grad_norm": 18.397375106811523, "learning_rate": 7.32013541811246e-06, "loss": 0.358, "num_input_tokens_seen": 96758752, "step": 44875 }, { "epoch": 8.236373646540649, "grad_norm": 1.9074230194091797, "learning_rate": 7.319426061271888e-06, "loss": 0.2572, "num_input_tokens_seen": 96768864, "step": 44880 }, { "epoch": 8.237291246100202, "grad_norm": 31.650978088378906, "learning_rate": 7.318716644941633e-06, "loss": 0.2893, "num_input_tokens_seen": 96779936, "step": 44885 }, { "epoch": 8.238208845659754, "grad_norm": 0.38113933801651, "learning_rate": 7.318007169139889e-06, "loss": 0.162, "num_input_tokens_seen": 96790880, "step": 44890 }, { "epoch": 8.239126445219306, "grad_norm": 38.02662658691406, "learning_rate": 7.317297633884854e-06, "loss": 0.1745, "num_input_tokens_seen": 96800608, "step": 44895 }, { "epoch": 8.240044044778859, "grad_norm": 0.3784507215023041, "learning_rate": 7.316588039194726e-06, "loss": 0.2755, "num_input_tokens_seen": 96812288, "step": 44900 }, { "epoch": 8.24096164433841, "grad_norm": 0.18214568495750427, "learning_rate": 7.315878385087707e-06, "loss": 0.0589, "num_input_tokens_seen": 96823360, "step": 44905 }, { "epoch": 8.241879243897962, "grad_norm": 16.272836685180664, "learning_rate": 7.315168671581995e-06, "loss": 0.3189, "num_input_tokens_seen": 96833632, "step": 44910 }, { "epoch": 8.242796843457516, "grad_norm": 1.632253885269165, "learning_rate": 7.314458898695794e-06, "loss": 0.1962, "num_input_tokens_seen": 96845792, "step": 44915 }, { "epoch": 8.243714443017067, "grad_norm": 14.662149429321289, "learning_rate": 7.313749066447311e-06, "loss": 0.1803, "num_input_tokens_seen": 96857024, "step": 44920 }, { "epoch": 8.244632042576619, "grad_norm": 6.208530426025391, "learning_rate": 7.31303917485475e-06, "loss": 0.0201, "num_input_tokens_seen": 96867968, "step": 44925 }, { "epoch": 8.245549642136172, "grad_norm": 0.2635782063007355, "learning_rate": 7.31232922393632e-06, "loss": 0.3464, "num_input_tokens_seen": 96878080, "step": 44930 }, { "epoch": 8.246467241695724, "grad_norm": 28.192975997924805, "learning_rate": 7.311619213710227e-06, "loss": 0.2093, "num_input_tokens_seen": 96888864, "step": 44935 }, { "epoch": 8.247384841255275, "grad_norm": 0.6816564202308655, "learning_rate": 7.310909144194685e-06, "loss": 0.2006, "num_input_tokens_seen": 96899744, "step": 44940 }, { "epoch": 8.248302440814829, "grad_norm": 48.61027526855469, "learning_rate": 7.310199015407906e-06, "loss": 0.1205, "num_input_tokens_seen": 96910464, "step": 44945 }, { "epoch": 8.24922004037438, "grad_norm": 24.787887573242188, "learning_rate": 7.309488827368102e-06, "loss": 0.3513, "num_input_tokens_seen": 96921248, "step": 44950 }, { "epoch": 8.250137639933932, "grad_norm": 6.354525089263916, "learning_rate": 7.308778580093489e-06, "loss": 0.1056, "num_input_tokens_seen": 96931584, "step": 44955 }, { "epoch": 8.251055239493486, "grad_norm": 1.3224648237228394, "learning_rate": 7.308068273602283e-06, "loss": 0.2124, "num_input_tokens_seen": 96942304, "step": 44960 }, { "epoch": 8.251972839053037, "grad_norm": 0.29812461137771606, "learning_rate": 7.307357907912702e-06, "loss": 0.0675, "num_input_tokens_seen": 96952384, "step": 44965 }, { "epoch": 8.252890438612589, "grad_norm": 85.09071350097656, "learning_rate": 7.306647483042969e-06, "loss": 0.2932, "num_input_tokens_seen": 96962016, "step": 44970 }, { "epoch": 8.253808038172142, "grad_norm": 0.1894483119249344, "learning_rate": 7.305936999011303e-06, "loss": 0.3394, "num_input_tokens_seen": 96973600, "step": 44975 }, { "epoch": 8.254725637731694, "grad_norm": 0.18200254440307617, "learning_rate": 7.305226455835926e-06, "loss": 0.0085, "num_input_tokens_seen": 96983968, "step": 44980 }, { "epoch": 8.255643237291245, "grad_norm": 33.29436492919922, "learning_rate": 7.304515853535062e-06, "loss": 0.4086, "num_input_tokens_seen": 96995200, "step": 44985 }, { "epoch": 8.256560836850799, "grad_norm": 24.52219009399414, "learning_rate": 7.303805192126939e-06, "loss": 0.5735, "num_input_tokens_seen": 97005696, "step": 44990 }, { "epoch": 8.25747843641035, "grad_norm": 1.3092330694198608, "learning_rate": 7.303094471629785e-06, "loss": 0.4033, "num_input_tokens_seen": 97017408, "step": 44995 }, { "epoch": 8.258396035969902, "grad_norm": 0.7557641267776489, "learning_rate": 7.302383692061825e-06, "loss": 0.123, "num_input_tokens_seen": 97029088, "step": 45000 }, { "epoch": 8.259313635529455, "grad_norm": 0.41989612579345703, "learning_rate": 7.301672853441293e-06, "loss": 0.2293, "num_input_tokens_seen": 97040096, "step": 45005 }, { "epoch": 8.260231235089007, "grad_norm": 11.144266128540039, "learning_rate": 7.300961955786419e-06, "loss": 0.184, "num_input_tokens_seen": 97051104, "step": 45010 }, { "epoch": 8.261148834648559, "grad_norm": 11.963422775268555, "learning_rate": 7.300250999115437e-06, "loss": 0.3522, "num_input_tokens_seen": 97061440, "step": 45015 }, { "epoch": 8.262066434208112, "grad_norm": 9.207074165344238, "learning_rate": 7.299539983446582e-06, "loss": 0.1857, "num_input_tokens_seen": 97072256, "step": 45020 }, { "epoch": 8.262984033767664, "grad_norm": 21.754528045654297, "learning_rate": 7.29882890879809e-06, "loss": 0.0775, "num_input_tokens_seen": 97083296, "step": 45025 }, { "epoch": 8.263901633327215, "grad_norm": 0.28053638339042664, "learning_rate": 7.298117775188201e-06, "loss": 0.1044, "num_input_tokens_seen": 97094784, "step": 45030 }, { "epoch": 8.264819232886769, "grad_norm": 4.972821235656738, "learning_rate": 7.29740658263515e-06, "loss": 0.2921, "num_input_tokens_seen": 97106208, "step": 45035 }, { "epoch": 8.26573683244632, "grad_norm": 23.638525009155273, "learning_rate": 7.296695331157184e-06, "loss": 0.2636, "num_input_tokens_seen": 97115296, "step": 45040 }, { "epoch": 8.266654432005872, "grad_norm": 5.675787448883057, "learning_rate": 7.29598402077254e-06, "loss": 0.2463, "num_input_tokens_seen": 97126048, "step": 45045 }, { "epoch": 8.267572031565425, "grad_norm": 111.3604965209961, "learning_rate": 7.295272651499465e-06, "loss": 0.4136, "num_input_tokens_seen": 97137312, "step": 45050 }, { "epoch": 8.268489631124977, "grad_norm": 0.3410952389240265, "learning_rate": 7.2945612233562045e-06, "loss": 0.1056, "num_input_tokens_seen": 97148768, "step": 45055 }, { "epoch": 8.269407230684529, "grad_norm": 5.520234107971191, "learning_rate": 7.293849736361005e-06, "loss": 0.1231, "num_input_tokens_seen": 97160800, "step": 45060 }, { "epoch": 8.270324830244082, "grad_norm": 21.011323928833008, "learning_rate": 7.293138190532114e-06, "loss": 0.2174, "num_input_tokens_seen": 97172896, "step": 45065 }, { "epoch": 8.271242429803634, "grad_norm": 16.041397094726562, "learning_rate": 7.292426585887783e-06, "loss": 0.2747, "num_input_tokens_seen": 97182912, "step": 45070 }, { "epoch": 8.272160029363185, "grad_norm": 39.304405212402344, "learning_rate": 7.291714922446262e-06, "loss": 0.2321, "num_input_tokens_seen": 97193728, "step": 45075 }, { "epoch": 8.273077628922739, "grad_norm": 5.7848052978515625, "learning_rate": 7.291003200225806e-06, "loss": 0.1866, "num_input_tokens_seen": 97203616, "step": 45080 }, { "epoch": 8.27399522848229, "grad_norm": 0.7763028740882874, "learning_rate": 7.290291419244669e-06, "loss": 0.3067, "num_input_tokens_seen": 97214976, "step": 45085 }, { "epoch": 8.274912828041842, "grad_norm": 20.831377029418945, "learning_rate": 7.289579579521106e-06, "loss": 0.2916, "num_input_tokens_seen": 97226144, "step": 45090 }, { "epoch": 8.275830427601395, "grad_norm": 9.698351860046387, "learning_rate": 7.288867681073375e-06, "loss": 0.0578, "num_input_tokens_seen": 97236160, "step": 45095 }, { "epoch": 8.276748027160947, "grad_norm": 25.362632751464844, "learning_rate": 7.288155723919735e-06, "loss": 0.2595, "num_input_tokens_seen": 97247456, "step": 45100 }, { "epoch": 8.277665626720498, "grad_norm": 3.4140207767486572, "learning_rate": 7.287443708078448e-06, "loss": 0.2595, "num_input_tokens_seen": 97258080, "step": 45105 }, { "epoch": 8.278583226280052, "grad_norm": 14.23525619506836, "learning_rate": 7.286731633567775e-06, "loss": 0.3694, "num_input_tokens_seen": 97269472, "step": 45110 }, { "epoch": 8.279500825839603, "grad_norm": 7.213769435882568, "learning_rate": 7.2860195004059806e-06, "loss": 0.259, "num_input_tokens_seen": 97280800, "step": 45115 }, { "epoch": 8.280418425399155, "grad_norm": 0.719217836856842, "learning_rate": 7.285307308611327e-06, "loss": 0.2543, "num_input_tokens_seen": 97290752, "step": 45120 }, { "epoch": 8.281336024958708, "grad_norm": 38.67720031738281, "learning_rate": 7.2845950582020844e-06, "loss": 0.1921, "num_input_tokens_seen": 97302016, "step": 45125 }, { "epoch": 8.28225362451826, "grad_norm": 25.97420310974121, "learning_rate": 7.283882749196519e-06, "loss": 0.4385, "num_input_tokens_seen": 97312768, "step": 45130 }, { "epoch": 8.283171224077812, "grad_norm": 21.83955192565918, "learning_rate": 7.2831703816129e-06, "loss": 0.2576, "num_input_tokens_seen": 97324000, "step": 45135 }, { "epoch": 8.284088823637365, "grad_norm": 12.680859565734863, "learning_rate": 7.2824579554695e-06, "loss": 0.1627, "num_input_tokens_seen": 97333920, "step": 45140 }, { "epoch": 8.285006423196917, "grad_norm": 43.61531448364258, "learning_rate": 7.2817454707845914e-06, "loss": 0.3781, "num_input_tokens_seen": 97344640, "step": 45145 }, { "epoch": 8.285924022756468, "grad_norm": 63.779117584228516, "learning_rate": 7.281032927576448e-06, "loss": 0.1797, "num_input_tokens_seen": 97355392, "step": 45150 }, { "epoch": 8.286841622316022, "grad_norm": 11.221650123596191, "learning_rate": 7.280320325863344e-06, "loss": 0.2982, "num_input_tokens_seen": 97366464, "step": 45155 }, { "epoch": 8.287759221875573, "grad_norm": 21.785770416259766, "learning_rate": 7.27960766566356e-06, "loss": 0.2324, "num_input_tokens_seen": 97377440, "step": 45160 }, { "epoch": 8.288676821435125, "grad_norm": 15.186002731323242, "learning_rate": 7.27889494699537e-06, "loss": 0.259, "num_input_tokens_seen": 97388032, "step": 45165 }, { "epoch": 8.289594420994678, "grad_norm": 0.47217538952827454, "learning_rate": 7.278182169877057e-06, "loss": 0.0947, "num_input_tokens_seen": 97398976, "step": 45170 }, { "epoch": 8.29051202055423, "grad_norm": 31.9471492767334, "learning_rate": 7.277469334326903e-06, "loss": 0.3286, "num_input_tokens_seen": 97410688, "step": 45175 }, { "epoch": 8.291429620113782, "grad_norm": 1.9908839464187622, "learning_rate": 7.276756440363191e-06, "loss": 0.3703, "num_input_tokens_seen": 97420064, "step": 45180 }, { "epoch": 8.292347219673335, "grad_norm": 6.914261341094971, "learning_rate": 7.276043488004203e-06, "loss": 0.3714, "num_input_tokens_seen": 97430976, "step": 45185 }, { "epoch": 8.293264819232887, "grad_norm": 25.580331802368164, "learning_rate": 7.275330477268229e-06, "loss": 0.4697, "num_input_tokens_seen": 97442400, "step": 45190 }, { "epoch": 8.294182418792438, "grad_norm": 0.6354662179946899, "learning_rate": 7.274617408173552e-06, "loss": 0.1579, "num_input_tokens_seen": 97454016, "step": 45195 }, { "epoch": 8.295100018351992, "grad_norm": 4.536813735961914, "learning_rate": 7.273904280738466e-06, "loss": 0.3499, "num_input_tokens_seen": 97465184, "step": 45200 }, { "epoch": 8.296017617911543, "grad_norm": 0.3740100860595703, "learning_rate": 7.27319109498126e-06, "loss": 0.0438, "num_input_tokens_seen": 97475968, "step": 45205 }, { "epoch": 8.296935217471095, "grad_norm": 3.0111887454986572, "learning_rate": 7.2724778509202235e-06, "loss": 0.1245, "num_input_tokens_seen": 97487072, "step": 45210 }, { "epoch": 8.297852817030648, "grad_norm": 4.055325031280518, "learning_rate": 7.271764548573654e-06, "loss": 0.2129, "num_input_tokens_seen": 97497184, "step": 45215 }, { "epoch": 8.2987704165902, "grad_norm": 6.704037666320801, "learning_rate": 7.271051187959843e-06, "loss": 0.2972, "num_input_tokens_seen": 97509184, "step": 45220 }, { "epoch": 8.299688016149751, "grad_norm": 0.9421833753585815, "learning_rate": 7.270337769097091e-06, "loss": 0.287, "num_input_tokens_seen": 97519424, "step": 45225 }, { "epoch": 8.300605615709305, "grad_norm": 6.47460412979126, "learning_rate": 7.269624292003692e-06, "loss": 0.1532, "num_input_tokens_seen": 97530464, "step": 45230 }, { "epoch": 8.301523215268857, "grad_norm": 17.171226501464844, "learning_rate": 7.268910756697948e-06, "loss": 0.0595, "num_input_tokens_seen": 97542880, "step": 45235 }, { "epoch": 8.302440814828408, "grad_norm": 0.5032790899276733, "learning_rate": 7.268197163198161e-06, "loss": 0.1249, "num_input_tokens_seen": 97554176, "step": 45240 }, { "epoch": 8.303358414387962, "grad_norm": 9.296095848083496, "learning_rate": 7.26748351152263e-06, "loss": 0.119, "num_input_tokens_seen": 97565824, "step": 45245 }, { "epoch": 8.304276013947513, "grad_norm": 32.44639205932617, "learning_rate": 7.266769801689662e-06, "loss": 0.4771, "num_input_tokens_seen": 97577824, "step": 45250 }, { "epoch": 8.305193613507065, "grad_norm": 8.944246292114258, "learning_rate": 7.266056033717561e-06, "loss": 0.1947, "num_input_tokens_seen": 97588800, "step": 45255 }, { "epoch": 8.306111213066618, "grad_norm": 0.742310106754303, "learning_rate": 7.265342207624637e-06, "loss": 0.4189, "num_input_tokens_seen": 97599328, "step": 45260 }, { "epoch": 8.30702881262617, "grad_norm": 2.7487144470214844, "learning_rate": 7.264628323429196e-06, "loss": 0.3512, "num_input_tokens_seen": 97611136, "step": 45265 }, { "epoch": 8.307946412185721, "grad_norm": 31.66740608215332, "learning_rate": 7.263914381149546e-06, "loss": 0.2822, "num_input_tokens_seen": 97622688, "step": 45270 }, { "epoch": 8.308864011745275, "grad_norm": 1.771332859992981, "learning_rate": 7.263200380804003e-06, "loss": 0.2416, "num_input_tokens_seen": 97632800, "step": 45275 }, { "epoch": 8.309781611304826, "grad_norm": 17.06414794921875, "learning_rate": 7.2624863224108775e-06, "loss": 0.1101, "num_input_tokens_seen": 97645280, "step": 45280 }, { "epoch": 8.310699210864378, "grad_norm": 2.400722026824951, "learning_rate": 7.261772205988484e-06, "loss": 0.0763, "num_input_tokens_seen": 97656192, "step": 45285 }, { "epoch": 8.311616810423931, "grad_norm": 20.731597900390625, "learning_rate": 7.261058031555139e-06, "loss": 0.1129, "num_input_tokens_seen": 97666592, "step": 45290 }, { "epoch": 8.312534409983483, "grad_norm": 45.17441940307617, "learning_rate": 7.260343799129159e-06, "loss": 0.275, "num_input_tokens_seen": 97677856, "step": 45295 }, { "epoch": 8.313452009543035, "grad_norm": 0.4932657778263092, "learning_rate": 7.259629508728865e-06, "loss": 0.0368, "num_input_tokens_seen": 97690176, "step": 45300 }, { "epoch": 8.314369609102588, "grad_norm": 30.027727127075195, "learning_rate": 7.258915160372575e-06, "loss": 0.4527, "num_input_tokens_seen": 97701024, "step": 45305 }, { "epoch": 8.31528720866214, "grad_norm": 23.727148056030273, "learning_rate": 7.2582007540786125e-06, "loss": 0.3908, "num_input_tokens_seen": 97711616, "step": 45310 }, { "epoch": 8.316204808221691, "grad_norm": 27.025556564331055, "learning_rate": 7.257486289865302e-06, "loss": 0.4262, "num_input_tokens_seen": 97721632, "step": 45315 }, { "epoch": 8.317122407781245, "grad_norm": 0.7023254036903381, "learning_rate": 7.256771767750965e-06, "loss": 0.1711, "num_input_tokens_seen": 97732416, "step": 45320 }, { "epoch": 8.318040007340796, "grad_norm": 12.341402053833008, "learning_rate": 7.256057187753931e-06, "loss": 0.1535, "num_input_tokens_seen": 97744576, "step": 45325 }, { "epoch": 8.318957606900348, "grad_norm": 32.511451721191406, "learning_rate": 7.255342549892525e-06, "loss": 0.2554, "num_input_tokens_seen": 97754144, "step": 45330 }, { "epoch": 8.319875206459901, "grad_norm": 8.488600730895996, "learning_rate": 7.254627854185081e-06, "loss": 0.1313, "num_input_tokens_seen": 97765024, "step": 45335 }, { "epoch": 8.320792806019453, "grad_norm": 30.202064514160156, "learning_rate": 7.253913100649926e-06, "loss": 0.3102, "num_input_tokens_seen": 97775904, "step": 45340 }, { "epoch": 8.321710405579005, "grad_norm": 33.194610595703125, "learning_rate": 7.253198289305391e-06, "loss": 0.3688, "num_input_tokens_seen": 97787040, "step": 45345 }, { "epoch": 8.322628005138558, "grad_norm": 43.163211822509766, "learning_rate": 7.252483420169813e-06, "loss": 0.4857, "num_input_tokens_seen": 97798560, "step": 45350 }, { "epoch": 8.32354560469811, "grad_norm": 0.1464751958847046, "learning_rate": 7.251768493261527e-06, "loss": 0.4032, "num_input_tokens_seen": 97810784, "step": 45355 }, { "epoch": 8.324463204257661, "grad_norm": 7.320033550262451, "learning_rate": 7.2510535085988695e-06, "loss": 0.4808, "num_input_tokens_seen": 97820576, "step": 45360 }, { "epoch": 8.325380803817215, "grad_norm": 39.084415435791016, "learning_rate": 7.250338466200178e-06, "loss": 0.1243, "num_input_tokens_seen": 97831648, "step": 45365 }, { "epoch": 8.326298403376766, "grad_norm": 5.603810787200928, "learning_rate": 7.249623366083793e-06, "loss": 0.5168, "num_input_tokens_seen": 97843648, "step": 45370 }, { "epoch": 8.327216002936318, "grad_norm": 13.307881355285645, "learning_rate": 7.248908208268055e-06, "loss": 0.2341, "num_input_tokens_seen": 97854880, "step": 45375 }, { "epoch": 8.328133602495871, "grad_norm": 24.323564529418945, "learning_rate": 7.248192992771306e-06, "loss": 0.1523, "num_input_tokens_seen": 97864896, "step": 45380 }, { "epoch": 8.329051202055423, "grad_norm": 0.16920088231563568, "learning_rate": 7.247477719611893e-06, "loss": 0.3576, "num_input_tokens_seen": 97875040, "step": 45385 }, { "epoch": 8.329968801614974, "grad_norm": 0.7069584131240845, "learning_rate": 7.246762388808158e-06, "loss": 0.2423, "num_input_tokens_seen": 97884896, "step": 45390 }, { "epoch": 8.330886401174528, "grad_norm": 61.1592903137207, "learning_rate": 7.246047000378449e-06, "loss": 0.3451, "num_input_tokens_seen": 97895840, "step": 45395 }, { "epoch": 8.33180400073408, "grad_norm": 2.1709721088409424, "learning_rate": 7.245331554341118e-06, "loss": 0.1978, "num_input_tokens_seen": 97907200, "step": 45400 }, { "epoch": 8.332721600293631, "grad_norm": 12.195630073547363, "learning_rate": 7.24461605071451e-06, "loss": 0.1401, "num_input_tokens_seen": 97917408, "step": 45405 }, { "epoch": 8.333639199853184, "grad_norm": 39.36972427368164, "learning_rate": 7.243900489516982e-06, "loss": 0.2672, "num_input_tokens_seen": 97928064, "step": 45410 }, { "epoch": 8.334556799412736, "grad_norm": 1.3738008737564087, "learning_rate": 7.2431848707668815e-06, "loss": 0.1213, "num_input_tokens_seen": 97939296, "step": 45415 }, { "epoch": 8.335474398972288, "grad_norm": 45.454002380371094, "learning_rate": 7.242469194482566e-06, "loss": 0.3229, "num_input_tokens_seen": 97950240, "step": 45420 }, { "epoch": 8.336391998531841, "grad_norm": 22.41594696044922, "learning_rate": 7.241753460682393e-06, "loss": 0.4255, "num_input_tokens_seen": 97961728, "step": 45425 }, { "epoch": 8.337309598091393, "grad_norm": 24.624881744384766, "learning_rate": 7.241037669384716e-06, "loss": 0.3098, "num_input_tokens_seen": 97973248, "step": 45430 }, { "epoch": 8.338227197650944, "grad_norm": 0.1778283268213272, "learning_rate": 7.2403218206078985e-06, "loss": 0.3787, "num_input_tokens_seen": 97983776, "step": 45435 }, { "epoch": 8.339144797210498, "grad_norm": 16.886577606201172, "learning_rate": 7.239605914370297e-06, "loss": 0.0343, "num_input_tokens_seen": 97995552, "step": 45440 }, { "epoch": 8.34006239677005, "grad_norm": 33.93215560913086, "learning_rate": 7.238889950690275e-06, "loss": 0.1649, "num_input_tokens_seen": 98006048, "step": 45445 }, { "epoch": 8.340979996329601, "grad_norm": 20.70102310180664, "learning_rate": 7.238173929586196e-06, "loss": 0.6186, "num_input_tokens_seen": 98017728, "step": 45450 }, { "epoch": 8.341897595889154, "grad_norm": 16.272619247436523, "learning_rate": 7.237457851076424e-06, "loss": 0.1241, "num_input_tokens_seen": 98029792, "step": 45455 }, { "epoch": 8.342815195448706, "grad_norm": 0.1659684032201767, "learning_rate": 7.236741715179327e-06, "loss": 0.316, "num_input_tokens_seen": 98040896, "step": 45460 }, { "epoch": 8.343732795008258, "grad_norm": 14.067943572998047, "learning_rate": 7.2360255219132705e-06, "loss": 0.1361, "num_input_tokens_seen": 98050816, "step": 45465 }, { "epoch": 8.344650394567811, "grad_norm": 10.462875366210938, "learning_rate": 7.235309271296625e-06, "loss": 0.1997, "num_input_tokens_seen": 98062176, "step": 45470 }, { "epoch": 8.345567994127363, "grad_norm": 17.884050369262695, "learning_rate": 7.234592963347762e-06, "loss": 0.2009, "num_input_tokens_seen": 98072512, "step": 45475 }, { "epoch": 8.346485593686914, "grad_norm": 0.651104211807251, "learning_rate": 7.233876598085053e-06, "loss": 0.2714, "num_input_tokens_seen": 98083200, "step": 45480 }, { "epoch": 8.347403193246468, "grad_norm": 109.49971008300781, "learning_rate": 7.233160175526871e-06, "loss": 0.316, "num_input_tokens_seen": 98094240, "step": 45485 }, { "epoch": 8.34832079280602, "grad_norm": 0.5767163038253784, "learning_rate": 7.23244369569159e-06, "loss": 0.1567, "num_input_tokens_seen": 98104992, "step": 45490 }, { "epoch": 8.34923839236557, "grad_norm": 27.603015899658203, "learning_rate": 7.23172715859759e-06, "loss": 0.1087, "num_input_tokens_seen": 98116736, "step": 45495 }, { "epoch": 8.350155991925124, "grad_norm": 0.18399669229984283, "learning_rate": 7.2310105642632465e-06, "loss": 0.2084, "num_input_tokens_seen": 98126272, "step": 45500 }, { "epoch": 8.351073591484676, "grad_norm": 0.8452672958374023, "learning_rate": 7.23029391270694e-06, "loss": 0.2067, "num_input_tokens_seen": 98136672, "step": 45505 }, { "epoch": 8.351991191044227, "grad_norm": 73.80401611328125, "learning_rate": 7.229577203947051e-06, "loss": 0.1369, "num_input_tokens_seen": 98147264, "step": 45510 }, { "epoch": 8.35290879060378, "grad_norm": 6.621392250061035, "learning_rate": 7.228860438001962e-06, "loss": 0.1396, "num_input_tokens_seen": 98157472, "step": 45515 }, { "epoch": 8.353826390163333, "grad_norm": 36.152442932128906, "learning_rate": 7.228143614890058e-06, "loss": 0.2747, "num_input_tokens_seen": 98168032, "step": 45520 }, { "epoch": 8.354743989722884, "grad_norm": 1.3339059352874756, "learning_rate": 7.2274267346297235e-06, "loss": 0.2149, "num_input_tokens_seen": 98178880, "step": 45525 }, { "epoch": 8.355661589282438, "grad_norm": 2.0120835304260254, "learning_rate": 7.226709797239344e-06, "loss": 0.1567, "num_input_tokens_seen": 98189984, "step": 45530 }, { "epoch": 8.35657918884199, "grad_norm": 31.2883243560791, "learning_rate": 7.2259928027373116e-06, "loss": 0.2822, "num_input_tokens_seen": 98201056, "step": 45535 }, { "epoch": 8.35749678840154, "grad_norm": 28.19333839416504, "learning_rate": 7.225275751142013e-06, "loss": 0.2586, "num_input_tokens_seen": 98212000, "step": 45540 }, { "epoch": 8.358414387961094, "grad_norm": 4.842146873474121, "learning_rate": 7.22455864247184e-06, "loss": 0.2902, "num_input_tokens_seen": 98221440, "step": 45545 }, { "epoch": 8.359331987520646, "grad_norm": 0.5713520050048828, "learning_rate": 7.223841476745185e-06, "loss": 0.0226, "num_input_tokens_seen": 98232352, "step": 45550 }, { "epoch": 8.360249587080197, "grad_norm": 0.38091784715652466, "learning_rate": 7.2231242539804425e-06, "loss": 0.5291, "num_input_tokens_seen": 98243776, "step": 45555 }, { "epoch": 8.36116718663975, "grad_norm": 56.702232360839844, "learning_rate": 7.22240697419601e-06, "loss": 0.0693, "num_input_tokens_seen": 98254208, "step": 45560 }, { "epoch": 8.362084786199302, "grad_norm": 0.2319275438785553, "learning_rate": 7.221689637410282e-06, "loss": 0.1757, "num_input_tokens_seen": 98265152, "step": 45565 }, { "epoch": 8.363002385758854, "grad_norm": 63.31283950805664, "learning_rate": 7.220972243641658e-06, "loss": 0.4151, "num_input_tokens_seen": 98275456, "step": 45570 }, { "epoch": 8.363919985318407, "grad_norm": 15.175568580627441, "learning_rate": 7.220254792908539e-06, "loss": 0.3563, "num_input_tokens_seen": 98286176, "step": 45575 }, { "epoch": 8.364837584877959, "grad_norm": 0.8273910880088806, "learning_rate": 7.219537285229325e-06, "loss": 0.0438, "num_input_tokens_seen": 98297088, "step": 45580 }, { "epoch": 8.36575518443751, "grad_norm": 49.80023193359375, "learning_rate": 7.21881972062242e-06, "loss": 0.1399, "num_input_tokens_seen": 98307040, "step": 45585 }, { "epoch": 8.366672783997064, "grad_norm": 1.4458508491516113, "learning_rate": 7.218102099106228e-06, "loss": 0.2843, "num_input_tokens_seen": 98317408, "step": 45590 }, { "epoch": 8.367590383556616, "grad_norm": 1.4109737873077393, "learning_rate": 7.217384420699155e-06, "loss": 0.3727, "num_input_tokens_seen": 98328928, "step": 45595 }, { "epoch": 8.368507983116167, "grad_norm": 21.587772369384766, "learning_rate": 7.2166666854196075e-06, "loss": 0.3611, "num_input_tokens_seen": 98340096, "step": 45600 }, { "epoch": 8.36942558267572, "grad_norm": 9.925346374511719, "learning_rate": 7.215948893285996e-06, "loss": 0.3986, "num_input_tokens_seen": 98351104, "step": 45605 }, { "epoch": 8.370343182235272, "grad_norm": 14.684531211853027, "learning_rate": 7.215231044316728e-06, "loss": 0.4551, "num_input_tokens_seen": 98362752, "step": 45610 }, { "epoch": 8.371260781794824, "grad_norm": 0.15669801831245422, "learning_rate": 7.214513138530219e-06, "loss": 0.4213, "num_input_tokens_seen": 98374592, "step": 45615 }, { "epoch": 8.372178381354377, "grad_norm": 1.7784299850463867, "learning_rate": 7.21379517594488e-06, "loss": 0.2152, "num_input_tokens_seen": 98385632, "step": 45620 }, { "epoch": 8.373095980913929, "grad_norm": 22.29583168029785, "learning_rate": 7.213077156579125e-06, "loss": 0.306, "num_input_tokens_seen": 98397120, "step": 45625 }, { "epoch": 8.37401358047348, "grad_norm": 0.48306798934936523, "learning_rate": 7.21235908045137e-06, "loss": 0.1589, "num_input_tokens_seen": 98408320, "step": 45630 }, { "epoch": 8.374931180033034, "grad_norm": 60.85105895996094, "learning_rate": 7.2116409475800356e-06, "loss": 0.0788, "num_input_tokens_seen": 98418912, "step": 45635 }, { "epoch": 8.375848779592586, "grad_norm": 0.573580801486969, "learning_rate": 7.210922757983536e-06, "loss": 0.351, "num_input_tokens_seen": 98429888, "step": 45640 }, { "epoch": 8.376766379152137, "grad_norm": 111.07514953613281, "learning_rate": 7.210204511680296e-06, "loss": 0.186, "num_input_tokens_seen": 98440480, "step": 45645 }, { "epoch": 8.37768397871169, "grad_norm": 0.5051262974739075, "learning_rate": 7.209486208688736e-06, "loss": 0.1321, "num_input_tokens_seen": 98450784, "step": 45650 }, { "epoch": 8.378601578271242, "grad_norm": 1.4978580474853516, "learning_rate": 7.20876784902728e-06, "loss": 0.0745, "num_input_tokens_seen": 98460160, "step": 45655 }, { "epoch": 8.379519177830794, "grad_norm": 0.09658533334732056, "learning_rate": 7.20804943271435e-06, "loss": 0.0428, "num_input_tokens_seen": 98470624, "step": 45660 }, { "epoch": 8.380436777390347, "grad_norm": 74.3915786743164, "learning_rate": 7.207330959768375e-06, "loss": 0.3817, "num_input_tokens_seen": 98481280, "step": 45665 }, { "epoch": 8.381354376949899, "grad_norm": 71.19020080566406, "learning_rate": 7.206612430207782e-06, "loss": 0.1418, "num_input_tokens_seen": 98492576, "step": 45670 }, { "epoch": 8.38227197650945, "grad_norm": 32.516910552978516, "learning_rate": 7.205893844051e-06, "loss": 0.4573, "num_input_tokens_seen": 98502592, "step": 45675 }, { "epoch": 8.383189576069004, "grad_norm": 21.662817001342773, "learning_rate": 7.2051752013164585e-06, "loss": 0.3491, "num_input_tokens_seen": 98513376, "step": 45680 }, { "epoch": 8.384107175628555, "grad_norm": 34.76599884033203, "learning_rate": 7.204456502022592e-06, "loss": 0.2106, "num_input_tokens_seen": 98523584, "step": 45685 }, { "epoch": 8.385024775188107, "grad_norm": 17.701160430908203, "learning_rate": 7.2037377461878334e-06, "loss": 0.3488, "num_input_tokens_seen": 98532160, "step": 45690 }, { "epoch": 8.38594237474766, "grad_norm": 0.0847715362906456, "learning_rate": 7.203018933830617e-06, "loss": 0.3606, "num_input_tokens_seen": 98543104, "step": 45695 }, { "epoch": 8.386859974307212, "grad_norm": 362.5022277832031, "learning_rate": 7.202300064969378e-06, "loss": 0.3928, "num_input_tokens_seen": 98553824, "step": 45700 }, { "epoch": 8.387777573866764, "grad_norm": 3.491422414779663, "learning_rate": 7.2015811396225574e-06, "loss": 0.2881, "num_input_tokens_seen": 98563616, "step": 45705 }, { "epoch": 8.388695173426317, "grad_norm": 0.5264386534690857, "learning_rate": 7.200862157808593e-06, "loss": 0.3022, "num_input_tokens_seen": 98575232, "step": 45710 }, { "epoch": 8.389612772985869, "grad_norm": 15.888943672180176, "learning_rate": 7.200143119545922e-06, "loss": 0.4377, "num_input_tokens_seen": 98585984, "step": 45715 }, { "epoch": 8.39053037254542, "grad_norm": 9.237039566040039, "learning_rate": 7.199424024852993e-06, "loss": 0.1036, "num_input_tokens_seen": 98597600, "step": 45720 }, { "epoch": 8.391447972104974, "grad_norm": 0.590593159198761, "learning_rate": 7.198704873748245e-06, "loss": 0.2142, "num_input_tokens_seen": 98608864, "step": 45725 }, { "epoch": 8.392365571664525, "grad_norm": 0.704804539680481, "learning_rate": 7.197985666250126e-06, "loss": 0.2076, "num_input_tokens_seen": 98619808, "step": 45730 }, { "epoch": 8.393283171224077, "grad_norm": 0.10753312706947327, "learning_rate": 7.19726640237708e-06, "loss": 0.3044, "num_input_tokens_seen": 98631584, "step": 45735 }, { "epoch": 8.39420077078363, "grad_norm": 2.900909185409546, "learning_rate": 7.196547082147556e-06, "loss": 0.223, "num_input_tokens_seen": 98642752, "step": 45740 }, { "epoch": 8.395118370343182, "grad_norm": 0.7945612668991089, "learning_rate": 7.195827705580006e-06, "loss": 0.0422, "num_input_tokens_seen": 98654432, "step": 45745 }, { "epoch": 8.396035969902734, "grad_norm": 2.9742512702941895, "learning_rate": 7.195108272692874e-06, "loss": 0.43, "num_input_tokens_seen": 98666112, "step": 45750 }, { "epoch": 8.396953569462287, "grad_norm": 53.04445266723633, "learning_rate": 7.194388783504621e-06, "loss": 0.1943, "num_input_tokens_seen": 98676288, "step": 45755 }, { "epoch": 8.397871169021839, "grad_norm": 0.6018746495246887, "learning_rate": 7.193669238033696e-06, "loss": 0.2792, "num_input_tokens_seen": 98687008, "step": 45760 }, { "epoch": 8.39878876858139, "grad_norm": 34.285911560058594, "learning_rate": 7.192949636298554e-06, "loss": 0.2477, "num_input_tokens_seen": 98697280, "step": 45765 }, { "epoch": 8.399706368140944, "grad_norm": 15.514542579650879, "learning_rate": 7.192229978317653e-06, "loss": 0.2883, "num_input_tokens_seen": 98707648, "step": 45770 }, { "epoch": 8.400623967700495, "grad_norm": 26.22902488708496, "learning_rate": 7.191510264109451e-06, "loss": 0.2803, "num_input_tokens_seen": 98718848, "step": 45775 }, { "epoch": 8.401541567260047, "grad_norm": 58.83744812011719, "learning_rate": 7.190790493692407e-06, "loss": 0.1982, "num_input_tokens_seen": 98728544, "step": 45780 }, { "epoch": 8.4024591668196, "grad_norm": 9.797439575195312, "learning_rate": 7.1900706670849815e-06, "loss": 0.0159, "num_input_tokens_seen": 98737920, "step": 45785 }, { "epoch": 8.403376766379152, "grad_norm": 36.51246643066406, "learning_rate": 7.189350784305639e-06, "loss": 0.4084, "num_input_tokens_seen": 98748992, "step": 45790 }, { "epoch": 8.404294365938703, "grad_norm": 30.41817855834961, "learning_rate": 7.188630845372841e-06, "loss": 0.243, "num_input_tokens_seen": 98759584, "step": 45795 }, { "epoch": 8.405211965498257, "grad_norm": 157.1449432373047, "learning_rate": 7.187910850305055e-06, "loss": 0.3451, "num_input_tokens_seen": 98770176, "step": 45800 }, { "epoch": 8.406129565057809, "grad_norm": 0.49716663360595703, "learning_rate": 7.187190799120747e-06, "loss": 0.0383, "num_input_tokens_seen": 98781344, "step": 45805 }, { "epoch": 8.40704716461736, "grad_norm": 1.2434104681015015, "learning_rate": 7.186470691838383e-06, "loss": 0.166, "num_input_tokens_seen": 98790592, "step": 45810 }, { "epoch": 8.407964764176914, "grad_norm": 11.65662670135498, "learning_rate": 7.1857505284764365e-06, "loss": 0.1963, "num_input_tokens_seen": 98802528, "step": 45815 }, { "epoch": 8.408882363736465, "grad_norm": 24.95650291442871, "learning_rate": 7.185030309053374e-06, "loss": 0.2062, "num_input_tokens_seen": 98812832, "step": 45820 }, { "epoch": 8.409799963296017, "grad_norm": 0.8218289017677307, "learning_rate": 7.184310033587672e-06, "loss": 0.0652, "num_input_tokens_seen": 98822624, "step": 45825 }, { "epoch": 8.41071756285557, "grad_norm": 17.606800079345703, "learning_rate": 7.183589702097803e-06, "loss": 0.2043, "num_input_tokens_seen": 98834112, "step": 45830 }, { "epoch": 8.411635162415122, "grad_norm": 16.856121063232422, "learning_rate": 7.182869314602242e-06, "loss": 0.3823, "num_input_tokens_seen": 98844000, "step": 45835 }, { "epoch": 8.412552761974673, "grad_norm": 73.53711700439453, "learning_rate": 7.182148871119467e-06, "loss": 0.1132, "num_input_tokens_seen": 98855840, "step": 45840 }, { "epoch": 8.413470361534227, "grad_norm": 0.10857035964727402, "learning_rate": 7.181428371667954e-06, "loss": 0.0263, "num_input_tokens_seen": 98866592, "step": 45845 }, { "epoch": 8.414387961093778, "grad_norm": 0.5225886702537537, "learning_rate": 7.180707816266186e-06, "loss": 0.3431, "num_input_tokens_seen": 98877632, "step": 45850 }, { "epoch": 8.41530556065333, "grad_norm": 30.292438507080078, "learning_rate": 7.179987204932641e-06, "loss": 0.1699, "num_input_tokens_seen": 98886944, "step": 45855 }, { "epoch": 8.416223160212883, "grad_norm": 44.198726654052734, "learning_rate": 7.179266537685804e-06, "loss": 0.3096, "num_input_tokens_seen": 98897600, "step": 45860 }, { "epoch": 8.417140759772435, "grad_norm": 35.18069839477539, "learning_rate": 7.178545814544158e-06, "loss": 0.1712, "num_input_tokens_seen": 98907872, "step": 45865 }, { "epoch": 8.418058359331987, "grad_norm": 0.6968480348587036, "learning_rate": 7.177825035526187e-06, "loss": 0.3201, "num_input_tokens_seen": 98918752, "step": 45870 }, { "epoch": 8.41897595889154, "grad_norm": 3.419680595397949, "learning_rate": 7.1771042006503784e-06, "loss": 0.2115, "num_input_tokens_seen": 98929696, "step": 45875 }, { "epoch": 8.419893558451092, "grad_norm": 94.46510314941406, "learning_rate": 7.176383309935224e-06, "loss": 0.2591, "num_input_tokens_seen": 98941216, "step": 45880 }, { "epoch": 8.420811158010643, "grad_norm": 7.285913944244385, "learning_rate": 7.175662363399208e-06, "loss": 0.1622, "num_input_tokens_seen": 98951616, "step": 45885 }, { "epoch": 8.421728757570197, "grad_norm": 15.031009674072266, "learning_rate": 7.174941361060826e-06, "loss": 0.115, "num_input_tokens_seen": 98963488, "step": 45890 }, { "epoch": 8.422646357129748, "grad_norm": 1.1460026502609253, "learning_rate": 7.174220302938569e-06, "loss": 0.3408, "num_input_tokens_seen": 98975328, "step": 45895 }, { "epoch": 8.4235639566893, "grad_norm": 27.121051788330078, "learning_rate": 7.173499189050931e-06, "loss": 0.2269, "num_input_tokens_seen": 98986176, "step": 45900 }, { "epoch": 8.424481556248853, "grad_norm": 3.7311344146728516, "learning_rate": 7.172778019416407e-06, "loss": 0.3, "num_input_tokens_seen": 98996992, "step": 45905 }, { "epoch": 8.425399155808405, "grad_norm": 0.37890028953552246, "learning_rate": 7.1720567940534945e-06, "loss": 0.2, "num_input_tokens_seen": 99008608, "step": 45910 }, { "epoch": 8.426316755367957, "grad_norm": 81.11250305175781, "learning_rate": 7.1713355129806925e-06, "loss": 0.1458, "num_input_tokens_seen": 99019968, "step": 45915 }, { "epoch": 8.42723435492751, "grad_norm": 69.24382019042969, "learning_rate": 7.170614176216498e-06, "loss": 0.2097, "num_input_tokens_seen": 99031680, "step": 45920 }, { "epoch": 8.428151954487062, "grad_norm": 3.392540693283081, "learning_rate": 7.169892783779414e-06, "loss": 0.2156, "num_input_tokens_seen": 99042688, "step": 45925 }, { "epoch": 8.429069554046613, "grad_norm": 2.4361491203308105, "learning_rate": 7.1691713356879455e-06, "loss": 0.1167, "num_input_tokens_seen": 99053280, "step": 45930 }, { "epoch": 8.429987153606167, "grad_norm": 1.125092625617981, "learning_rate": 7.168449831960591e-06, "loss": 0.2862, "num_input_tokens_seen": 99063232, "step": 45935 }, { "epoch": 8.430904753165718, "grad_norm": 6.992030143737793, "learning_rate": 7.167728272615862e-06, "loss": 0.3265, "num_input_tokens_seen": 99074176, "step": 45940 }, { "epoch": 8.43182235272527, "grad_norm": 0.505827009677887, "learning_rate": 7.1670066576722605e-06, "loss": 0.1099, "num_input_tokens_seen": 99085952, "step": 45945 }, { "epoch": 8.432739952284823, "grad_norm": 27.427230834960938, "learning_rate": 7.166284987148299e-06, "loss": 0.2776, "num_input_tokens_seen": 99096512, "step": 45950 }, { "epoch": 8.433657551844375, "grad_norm": 22.91480255126953, "learning_rate": 7.165563261062482e-06, "loss": 0.1344, "num_input_tokens_seen": 99106304, "step": 45955 }, { "epoch": 8.434575151403926, "grad_norm": 0.552784264087677, "learning_rate": 7.164841479433326e-06, "loss": 0.1578, "num_input_tokens_seen": 99116512, "step": 45960 }, { "epoch": 8.43549275096348, "grad_norm": 6.477694988250732, "learning_rate": 7.16411964227934e-06, "loss": 0.1987, "num_input_tokens_seen": 99126784, "step": 45965 }, { "epoch": 8.436410350523031, "grad_norm": 2.0302786827087402, "learning_rate": 7.163397749619039e-06, "loss": 0.1905, "num_input_tokens_seen": 99136992, "step": 45970 }, { "epoch": 8.437327950082583, "grad_norm": 89.73612213134766, "learning_rate": 7.16267580147094e-06, "loss": 0.331, "num_input_tokens_seen": 99148704, "step": 45975 }, { "epoch": 8.438245549642136, "grad_norm": 24.825458526611328, "learning_rate": 7.161953797853558e-06, "loss": 0.2491, "num_input_tokens_seen": 99160000, "step": 45980 }, { "epoch": 8.439163149201688, "grad_norm": 0.4084710478782654, "learning_rate": 7.161231738785411e-06, "loss": 0.1046, "num_input_tokens_seen": 99171424, "step": 45985 }, { "epoch": 8.44008074876124, "grad_norm": 8.22743034362793, "learning_rate": 7.160509624285021e-06, "loss": 0.0282, "num_input_tokens_seen": 99182368, "step": 45990 }, { "epoch": 8.440998348320793, "grad_norm": 27.65538215637207, "learning_rate": 7.159787454370906e-06, "loss": 0.2328, "num_input_tokens_seen": 99193472, "step": 45995 }, { "epoch": 8.441915947880345, "grad_norm": 37.37873077392578, "learning_rate": 7.159065229061592e-06, "loss": 0.2445, "num_input_tokens_seen": 99204736, "step": 46000 }, { "epoch": 8.442833547439896, "grad_norm": 20.70696258544922, "learning_rate": 7.1583429483756e-06, "loss": 0.4599, "num_input_tokens_seen": 99216352, "step": 46005 }, { "epoch": 8.44375114699945, "grad_norm": 6.5497260093688965, "learning_rate": 7.157620612331457e-06, "loss": 0.187, "num_input_tokens_seen": 99226336, "step": 46010 }, { "epoch": 8.444668746559001, "grad_norm": 1.8709157705307007, "learning_rate": 7.1568982209476875e-06, "loss": 0.0656, "num_input_tokens_seen": 99236480, "step": 46015 }, { "epoch": 8.445586346118553, "grad_norm": 8.879852294921875, "learning_rate": 7.156175774242824e-06, "loss": 0.1054, "num_input_tokens_seen": 99249184, "step": 46020 }, { "epoch": 8.446503945678106, "grad_norm": 23.69839096069336, "learning_rate": 7.155453272235393e-06, "loss": 0.502, "num_input_tokens_seen": 99260320, "step": 46025 }, { "epoch": 8.447421545237658, "grad_norm": 10.36164379119873, "learning_rate": 7.1547307149439264e-06, "loss": 0.3765, "num_input_tokens_seen": 99270560, "step": 46030 }, { "epoch": 8.44833914479721, "grad_norm": 17.52354621887207, "learning_rate": 7.154008102386955e-06, "loss": 0.4828, "num_input_tokens_seen": 99281216, "step": 46035 }, { "epoch": 8.449256744356763, "grad_norm": 43.556365966796875, "learning_rate": 7.1532854345830146e-06, "loss": 0.3154, "num_input_tokens_seen": 99290720, "step": 46040 }, { "epoch": 8.450174343916315, "grad_norm": 14.268674850463867, "learning_rate": 7.152562711550642e-06, "loss": 0.2835, "num_input_tokens_seen": 99301408, "step": 46045 }, { "epoch": 8.451091943475866, "grad_norm": 97.18006896972656, "learning_rate": 7.15183993330837e-06, "loss": 0.3519, "num_input_tokens_seen": 99311904, "step": 46050 }, { "epoch": 8.45200954303542, "grad_norm": 6.4641499519348145, "learning_rate": 7.151117099874739e-06, "loss": 0.2603, "num_input_tokens_seen": 99323520, "step": 46055 }, { "epoch": 8.452927142594971, "grad_norm": 27.886625289916992, "learning_rate": 7.150394211268288e-06, "loss": 0.2305, "num_input_tokens_seen": 99332704, "step": 46060 }, { "epoch": 8.453844742154523, "grad_norm": 43.006988525390625, "learning_rate": 7.1496712675075595e-06, "loss": 0.2505, "num_input_tokens_seen": 99343552, "step": 46065 }, { "epoch": 8.454762341714076, "grad_norm": 7.483773708343506, "learning_rate": 7.148948268611094e-06, "loss": 0.1279, "num_input_tokens_seen": 99354560, "step": 46070 }, { "epoch": 8.455679941273628, "grad_norm": 18.754314422607422, "learning_rate": 7.1482252145974375e-06, "loss": 0.2704, "num_input_tokens_seen": 99365088, "step": 46075 }, { "epoch": 8.45659754083318, "grad_norm": 0.45837751030921936, "learning_rate": 7.1475021054851314e-06, "loss": 0.2085, "num_input_tokens_seen": 99375072, "step": 46080 }, { "epoch": 8.457515140392733, "grad_norm": 36.531558990478516, "learning_rate": 7.146778941292725e-06, "loss": 0.092, "num_input_tokens_seen": 99386016, "step": 46085 }, { "epoch": 8.458432739952285, "grad_norm": 0.36903074383735657, "learning_rate": 7.146055722038767e-06, "loss": 0.1759, "num_input_tokens_seen": 99395264, "step": 46090 }, { "epoch": 8.459350339511836, "grad_norm": 59.571041107177734, "learning_rate": 7.145332447741805e-06, "loss": 0.4491, "num_input_tokens_seen": 99406272, "step": 46095 }, { "epoch": 8.46026793907139, "grad_norm": 65.8482666015625, "learning_rate": 7.144609118420391e-06, "loss": 0.2958, "num_input_tokens_seen": 99417408, "step": 46100 }, { "epoch": 8.461185538630941, "grad_norm": 27.28424835205078, "learning_rate": 7.143885734093077e-06, "loss": 0.4692, "num_input_tokens_seen": 99428736, "step": 46105 }, { "epoch": 8.462103138190493, "grad_norm": 29.06342124938965, "learning_rate": 7.143162294778418e-06, "loss": 0.2949, "num_input_tokens_seen": 99439552, "step": 46110 }, { "epoch": 8.463020737750046, "grad_norm": 17.546838760375977, "learning_rate": 7.142438800494965e-06, "loss": 0.155, "num_input_tokens_seen": 99449824, "step": 46115 }, { "epoch": 8.463938337309598, "grad_norm": 34.91812515258789, "learning_rate": 7.14171525126128e-06, "loss": 0.1507, "num_input_tokens_seen": 99460640, "step": 46120 }, { "epoch": 8.46485593686915, "grad_norm": 28.016569137573242, "learning_rate": 7.140991647095916e-06, "loss": 0.1433, "num_input_tokens_seen": 99471296, "step": 46125 }, { "epoch": 8.465773536428703, "grad_norm": 44.58209228515625, "learning_rate": 7.140267988017435e-06, "loss": 0.2303, "num_input_tokens_seen": 99482880, "step": 46130 }, { "epoch": 8.466691135988254, "grad_norm": 2.4485974311828613, "learning_rate": 7.139544274044398e-06, "loss": 0.1732, "num_input_tokens_seen": 99494016, "step": 46135 }, { "epoch": 8.467608735547808, "grad_norm": 0.5901513695716858, "learning_rate": 7.138820505195366e-06, "loss": 0.0714, "num_input_tokens_seen": 99504320, "step": 46140 }, { "epoch": 8.46852633510736, "grad_norm": 24.25225257873535, "learning_rate": 7.138096681488902e-06, "loss": 0.2895, "num_input_tokens_seen": 99515424, "step": 46145 }, { "epoch": 8.469443934666911, "grad_norm": 59.501800537109375, "learning_rate": 7.137372802943574e-06, "loss": 0.4405, "num_input_tokens_seen": 99525888, "step": 46150 }, { "epoch": 8.470361534226464, "grad_norm": 43.45598602294922, "learning_rate": 7.136648869577945e-06, "loss": 0.223, "num_input_tokens_seen": 99536544, "step": 46155 }, { "epoch": 8.471279133786016, "grad_norm": 7.6546311378479, "learning_rate": 7.135924881410583e-06, "loss": 0.0543, "num_input_tokens_seen": 99547232, "step": 46160 }, { "epoch": 8.472196733345568, "grad_norm": 10.461328506469727, "learning_rate": 7.135200838460059e-06, "loss": 0.5021, "num_input_tokens_seen": 99557984, "step": 46165 }, { "epoch": 8.473114332905121, "grad_norm": 27.69342041015625, "learning_rate": 7.1344767407449426e-06, "loss": 0.2128, "num_input_tokens_seen": 99567840, "step": 46170 }, { "epoch": 8.474031932464673, "grad_norm": 63.735687255859375, "learning_rate": 7.133752588283807e-06, "loss": 0.3354, "num_input_tokens_seen": 99578208, "step": 46175 }, { "epoch": 8.474949532024224, "grad_norm": 37.06437301635742, "learning_rate": 7.133028381095223e-06, "loss": 0.1151, "num_input_tokens_seen": 99588096, "step": 46180 }, { "epoch": 8.475867131583778, "grad_norm": 2.71897029876709, "learning_rate": 7.132304119197768e-06, "loss": 0.2211, "num_input_tokens_seen": 99598464, "step": 46185 }, { "epoch": 8.47678473114333, "grad_norm": 23.437822341918945, "learning_rate": 7.131579802610016e-06, "loss": 0.3654, "num_input_tokens_seen": 99608192, "step": 46190 }, { "epoch": 8.477702330702881, "grad_norm": 30.575164794921875, "learning_rate": 7.130855431350546e-06, "loss": 0.2518, "num_input_tokens_seen": 99617408, "step": 46195 }, { "epoch": 8.478619930262434, "grad_norm": 19.972679138183594, "learning_rate": 7.130131005437937e-06, "loss": 0.2852, "num_input_tokens_seen": 99629280, "step": 46200 }, { "epoch": 8.479537529821986, "grad_norm": 3.2384674549102783, "learning_rate": 7.12940652489077e-06, "loss": 0.2237, "num_input_tokens_seen": 99640800, "step": 46205 }, { "epoch": 8.480455129381538, "grad_norm": 46.80668640136719, "learning_rate": 7.128681989727625e-06, "loss": 0.2244, "num_input_tokens_seen": 99651584, "step": 46210 }, { "epoch": 8.481372728941091, "grad_norm": 0.2439710646867752, "learning_rate": 7.127957399967086e-06, "loss": 0.2751, "num_input_tokens_seen": 99662688, "step": 46215 }, { "epoch": 8.482290328500643, "grad_norm": 0.3810347616672516, "learning_rate": 7.127232755627739e-06, "loss": 0.2409, "num_input_tokens_seen": 99673376, "step": 46220 }, { "epoch": 8.483207928060194, "grad_norm": 6.2374491691589355, "learning_rate": 7.126508056728166e-06, "loss": 0.1482, "num_input_tokens_seen": 99684576, "step": 46225 }, { "epoch": 8.484125527619748, "grad_norm": 58.24345016479492, "learning_rate": 7.125783303286959e-06, "loss": 0.2743, "num_input_tokens_seen": 99695584, "step": 46230 }, { "epoch": 8.4850431271793, "grad_norm": 37.42762756347656, "learning_rate": 7.125058495322706e-06, "loss": 0.3168, "num_input_tokens_seen": 99705472, "step": 46235 }, { "epoch": 8.48596072673885, "grad_norm": 24.306781768798828, "learning_rate": 7.1243336328539944e-06, "loss": 0.2654, "num_input_tokens_seen": 99715872, "step": 46240 }, { "epoch": 8.486878326298404, "grad_norm": 8.025386810302734, "learning_rate": 7.123608715899418e-06, "loss": 0.1561, "num_input_tokens_seen": 99725120, "step": 46245 }, { "epoch": 8.487795925857956, "grad_norm": 11.962811470031738, "learning_rate": 7.12288374447757e-06, "loss": 0.3561, "num_input_tokens_seen": 99733984, "step": 46250 }, { "epoch": 8.488713525417507, "grad_norm": 21.286123275756836, "learning_rate": 7.122158718607043e-06, "loss": 0.2375, "num_input_tokens_seen": 99744960, "step": 46255 }, { "epoch": 8.48963112497706, "grad_norm": 4.301759719848633, "learning_rate": 7.121433638306436e-06, "loss": 0.0149, "num_input_tokens_seen": 99755264, "step": 46260 }, { "epoch": 8.490548724536612, "grad_norm": 1.6920469999313354, "learning_rate": 7.120708503594341e-06, "loss": 0.304, "num_input_tokens_seen": 99765696, "step": 46265 }, { "epoch": 8.491466324096164, "grad_norm": 19.145183563232422, "learning_rate": 7.119983314489363e-06, "loss": 0.2602, "num_input_tokens_seen": 99776448, "step": 46270 }, { "epoch": 8.492383923655717, "grad_norm": 0.7841119170188904, "learning_rate": 7.119258071010096e-06, "loss": 0.1305, "num_input_tokens_seen": 99787776, "step": 46275 }, { "epoch": 8.493301523215269, "grad_norm": 13.813966751098633, "learning_rate": 7.118532773175144e-06, "loss": 0.1872, "num_input_tokens_seen": 99797696, "step": 46280 }, { "epoch": 8.49421912277482, "grad_norm": 1.457927942276001, "learning_rate": 7.1178074210031116e-06, "loss": 0.2824, "num_input_tokens_seen": 99810304, "step": 46285 }, { "epoch": 8.495136722334374, "grad_norm": 3.1835007667541504, "learning_rate": 7.1170820145126e-06, "loss": 0.2616, "num_input_tokens_seen": 99820768, "step": 46290 }, { "epoch": 8.496054321893926, "grad_norm": 4.702846527099609, "learning_rate": 7.116356553722217e-06, "loss": 0.3365, "num_input_tokens_seen": 99832288, "step": 46295 }, { "epoch": 8.496971921453477, "grad_norm": 1.8336373567581177, "learning_rate": 7.1156310386505665e-06, "loss": 0.2491, "num_input_tokens_seen": 99842688, "step": 46300 }, { "epoch": 8.49788952101303, "grad_norm": 10.866161346435547, "learning_rate": 7.11490546931626e-06, "loss": 0.5077, "num_input_tokens_seen": 99853248, "step": 46305 }, { "epoch": 8.498807120572582, "grad_norm": 1.2159335613250732, "learning_rate": 7.1141798457379055e-06, "loss": 0.0986, "num_input_tokens_seen": 99863552, "step": 46310 }, { "epoch": 8.499724720132134, "grad_norm": 0.9563838839530945, "learning_rate": 7.113454167934115e-06, "loss": 0.3526, "num_input_tokens_seen": 99874176, "step": 46315 }, { "epoch": 8.500642319691687, "grad_norm": 0.2140408605337143, "learning_rate": 7.112728435923502e-06, "loss": 0.1674, "num_input_tokens_seen": 99884192, "step": 46320 }, { "epoch": 8.501559919251239, "grad_norm": 58.96269607543945, "learning_rate": 7.112002649724676e-06, "loss": 0.1826, "num_input_tokens_seen": 99894272, "step": 46325 }, { "epoch": 8.50247751881079, "grad_norm": 39.237709045410156, "learning_rate": 7.111276809356258e-06, "loss": 0.0959, "num_input_tokens_seen": 99904000, "step": 46330 }, { "epoch": 8.503395118370344, "grad_norm": 69.81674194335938, "learning_rate": 7.1105509148368615e-06, "loss": 0.1124, "num_input_tokens_seen": 99914560, "step": 46335 }, { "epoch": 8.504312717929896, "grad_norm": 0.2893922030925751, "learning_rate": 7.109824966185105e-06, "loss": 0.1147, "num_input_tokens_seen": 99925248, "step": 46340 }, { "epoch": 8.505230317489447, "grad_norm": 10.994230270385742, "learning_rate": 7.109098963419608e-06, "loss": 0.1489, "num_input_tokens_seen": 99936192, "step": 46345 }, { "epoch": 8.506147917049, "grad_norm": 0.6386317014694214, "learning_rate": 7.108372906558991e-06, "loss": 0.1574, "num_input_tokens_seen": 99948288, "step": 46350 }, { "epoch": 8.507065516608552, "grad_norm": 67.89498138427734, "learning_rate": 7.107646795621876e-06, "loss": 0.0359, "num_input_tokens_seen": 99959616, "step": 46355 }, { "epoch": 8.507983116168104, "grad_norm": 20.54275131225586, "learning_rate": 7.106920630626889e-06, "loss": 0.2008, "num_input_tokens_seen": 99969280, "step": 46360 }, { "epoch": 8.508900715727657, "grad_norm": 48.57162857055664, "learning_rate": 7.106194411592652e-06, "loss": 0.2416, "num_input_tokens_seen": 99981248, "step": 46365 }, { "epoch": 8.509818315287209, "grad_norm": 83.98487091064453, "learning_rate": 7.105468138537793e-06, "loss": 0.3158, "num_input_tokens_seen": 99990656, "step": 46370 }, { "epoch": 8.51073591484676, "grad_norm": 18.871585845947266, "learning_rate": 7.1047418114809395e-06, "loss": 0.2668, "num_input_tokens_seen": 100000448, "step": 46375 }, { "epoch": 8.511653514406314, "grad_norm": 14.694859504699707, "learning_rate": 7.104015430440719e-06, "loss": 0.1258, "num_input_tokens_seen": 100010912, "step": 46380 }, { "epoch": 8.512571113965866, "grad_norm": 0.9795964360237122, "learning_rate": 7.1032889954357665e-06, "loss": 0.2801, "num_input_tokens_seen": 100022912, "step": 46385 }, { "epoch": 8.513488713525417, "grad_norm": 0.3168499171733856, "learning_rate": 7.102562506484709e-06, "loss": 0.0594, "num_input_tokens_seen": 100034016, "step": 46390 }, { "epoch": 8.51440631308497, "grad_norm": 10.957983016967773, "learning_rate": 7.101835963606183e-06, "loss": 0.0355, "num_input_tokens_seen": 100045728, "step": 46395 }, { "epoch": 8.515323912644522, "grad_norm": 35.4787483215332, "learning_rate": 7.101109366818822e-06, "loss": 0.3782, "num_input_tokens_seen": 100056000, "step": 46400 }, { "epoch": 8.516241512204074, "grad_norm": 1.168225884437561, "learning_rate": 7.100382716141262e-06, "loss": 0.0786, "num_input_tokens_seen": 100066720, "step": 46405 }, { "epoch": 8.517159111763627, "grad_norm": 2.0491557121276855, "learning_rate": 7.09965601159214e-06, "loss": 0.2413, "num_input_tokens_seen": 100075936, "step": 46410 }, { "epoch": 8.518076711323179, "grad_norm": 0.6440222263336182, "learning_rate": 7.098929253190095e-06, "loss": 0.1722, "num_input_tokens_seen": 100087808, "step": 46415 }, { "epoch": 8.51899431088273, "grad_norm": 69.09162139892578, "learning_rate": 7.09820244095377e-06, "loss": 0.2705, "num_input_tokens_seen": 100099136, "step": 46420 }, { "epoch": 8.519911910442284, "grad_norm": 26.680360794067383, "learning_rate": 7.097475574901802e-06, "loss": 0.4136, "num_input_tokens_seen": 100110400, "step": 46425 }, { "epoch": 8.520829510001835, "grad_norm": 0.2550133466720581, "learning_rate": 7.096748655052837e-06, "loss": 0.3085, "num_input_tokens_seen": 100121792, "step": 46430 }, { "epoch": 8.521747109561387, "grad_norm": 1.6380362510681152, "learning_rate": 7.0960216814255185e-06, "loss": 0.5036, "num_input_tokens_seen": 100131456, "step": 46435 }, { "epoch": 8.52266470912094, "grad_norm": 20.420299530029297, "learning_rate": 7.095294654038493e-06, "loss": 0.2176, "num_input_tokens_seen": 100141440, "step": 46440 }, { "epoch": 8.523582308680492, "grad_norm": 22.277034759521484, "learning_rate": 7.094567572910407e-06, "loss": 0.2795, "num_input_tokens_seen": 100152544, "step": 46445 }, { "epoch": 8.524499908240044, "grad_norm": 0.5817656517028809, "learning_rate": 7.093840438059909e-06, "loss": 0.3852, "num_input_tokens_seen": 100163904, "step": 46450 }, { "epoch": 8.525417507799597, "grad_norm": 0.250813364982605, "learning_rate": 7.09311324950565e-06, "loss": 0.6102, "num_input_tokens_seen": 100174080, "step": 46455 }, { "epoch": 8.526335107359149, "grad_norm": 23.136001586914062, "learning_rate": 7.092386007266279e-06, "loss": 0.197, "num_input_tokens_seen": 100183840, "step": 46460 }, { "epoch": 8.5272527069187, "grad_norm": 12.212803840637207, "learning_rate": 7.09165871136045e-06, "loss": 0.2752, "num_input_tokens_seen": 100193888, "step": 46465 }, { "epoch": 8.528170306478254, "grad_norm": 23.588106155395508, "learning_rate": 7.0909313618068166e-06, "loss": 0.1742, "num_input_tokens_seen": 100205120, "step": 46470 }, { "epoch": 8.529087906037805, "grad_norm": 18.98015594482422, "learning_rate": 7.090203958624033e-06, "loss": 0.175, "num_input_tokens_seen": 100215808, "step": 46475 }, { "epoch": 8.530005505597357, "grad_norm": 4.023942470550537, "learning_rate": 7.08947650183076e-06, "loss": 0.3202, "num_input_tokens_seen": 100226752, "step": 46480 }, { "epoch": 8.53092310515691, "grad_norm": 6.2281036376953125, "learning_rate": 7.088748991445651e-06, "loss": 0.165, "num_input_tokens_seen": 100236864, "step": 46485 }, { "epoch": 8.531840704716462, "grad_norm": 25.53042984008789, "learning_rate": 7.088021427487368e-06, "loss": 0.2874, "num_input_tokens_seen": 100247424, "step": 46490 }, { "epoch": 8.532758304276014, "grad_norm": 48.973960876464844, "learning_rate": 7.087293809974574e-06, "loss": 0.0622, "num_input_tokens_seen": 100258144, "step": 46495 }, { "epoch": 8.533675903835567, "grad_norm": 3.935264825820923, "learning_rate": 7.086566138925925e-06, "loss": 0.0888, "num_input_tokens_seen": 100268096, "step": 46500 }, { "epoch": 8.534593503395119, "grad_norm": 0.49736374616622925, "learning_rate": 7.085838414360091e-06, "loss": 0.26, "num_input_tokens_seen": 100279488, "step": 46505 }, { "epoch": 8.53551110295467, "grad_norm": 4.654253005981445, "learning_rate": 7.085110636295733e-06, "loss": 0.0559, "num_input_tokens_seen": 100290240, "step": 46510 }, { "epoch": 8.536428702514224, "grad_norm": 6.10927152633667, "learning_rate": 7.084382804751519e-06, "loss": 0.2731, "num_input_tokens_seen": 100301696, "step": 46515 }, { "epoch": 8.537346302073775, "grad_norm": 19.232086181640625, "learning_rate": 7.083654919746119e-06, "loss": 0.483, "num_input_tokens_seen": 100312576, "step": 46520 }, { "epoch": 8.538263901633327, "grad_norm": 69.76605987548828, "learning_rate": 7.082926981298197e-06, "loss": 0.1925, "num_input_tokens_seen": 100323328, "step": 46525 }, { "epoch": 8.53918150119288, "grad_norm": 0.4532153904438019, "learning_rate": 7.082198989426428e-06, "loss": 0.0344, "num_input_tokens_seen": 100334656, "step": 46530 }, { "epoch": 8.540099100752432, "grad_norm": 25.374637603759766, "learning_rate": 7.08147094414948e-06, "loss": 0.2371, "num_input_tokens_seen": 100344864, "step": 46535 }, { "epoch": 8.541016700311983, "grad_norm": 60.7896614074707, "learning_rate": 7.08074284548603e-06, "loss": 0.1232, "num_input_tokens_seen": 100355296, "step": 46540 }, { "epoch": 8.541934299871537, "grad_norm": 0.06003400683403015, "learning_rate": 7.08001469345475e-06, "loss": 0.1265, "num_input_tokens_seen": 100365568, "step": 46545 }, { "epoch": 8.542851899431088, "grad_norm": 0.10997772961854935, "learning_rate": 7.079286488074317e-06, "loss": 0.2966, "num_input_tokens_seen": 100374624, "step": 46550 }, { "epoch": 8.54376949899064, "grad_norm": 0.4503253400325775, "learning_rate": 7.078558229363408e-06, "loss": 0.1463, "num_input_tokens_seen": 100384960, "step": 46555 }, { "epoch": 8.544687098550193, "grad_norm": 0.6864220499992371, "learning_rate": 7.077829917340703e-06, "loss": 0.1851, "num_input_tokens_seen": 100395008, "step": 46560 }, { "epoch": 8.545604698109745, "grad_norm": 0.7825621962547302, "learning_rate": 7.07710155202488e-06, "loss": 0.1049, "num_input_tokens_seen": 100405952, "step": 46565 }, { "epoch": 8.546522297669297, "grad_norm": 27.737735748291016, "learning_rate": 7.076373133434621e-06, "loss": 0.1799, "num_input_tokens_seen": 100417440, "step": 46570 }, { "epoch": 8.54743989722885, "grad_norm": 79.69065856933594, "learning_rate": 7.07564466158861e-06, "loss": 0.2504, "num_input_tokens_seen": 100428576, "step": 46575 }, { "epoch": 8.548357496788402, "grad_norm": 12.065347671508789, "learning_rate": 7.0749161365055295e-06, "loss": 0.1178, "num_input_tokens_seen": 100438464, "step": 46580 }, { "epoch": 8.549275096347953, "grad_norm": 36.58781433105469, "learning_rate": 7.074187558204066e-06, "loss": 0.4459, "num_input_tokens_seen": 100449056, "step": 46585 }, { "epoch": 8.550192695907507, "grad_norm": 7.596084117889404, "learning_rate": 7.073458926702907e-06, "loss": 0.3699, "num_input_tokens_seen": 100458688, "step": 46590 }, { "epoch": 8.551110295467058, "grad_norm": 48.03327941894531, "learning_rate": 7.07273024202074e-06, "loss": 0.2077, "num_input_tokens_seen": 100470656, "step": 46595 }, { "epoch": 8.55202789502661, "grad_norm": 42.311954498291016, "learning_rate": 7.072001504176255e-06, "loss": 0.4011, "num_input_tokens_seen": 100480864, "step": 46600 }, { "epoch": 8.552945494586163, "grad_norm": 56.171958923339844, "learning_rate": 7.071272713188142e-06, "loss": 0.7721, "num_input_tokens_seen": 100489696, "step": 46605 }, { "epoch": 8.553863094145715, "grad_norm": 1.5715948343276978, "learning_rate": 7.070543869075095e-06, "loss": 0.4119, "num_input_tokens_seen": 100500896, "step": 46610 }, { "epoch": 8.554780693705267, "grad_norm": 42.669795989990234, "learning_rate": 7.069814971855806e-06, "loss": 0.1475, "num_input_tokens_seen": 100512384, "step": 46615 }, { "epoch": 8.55569829326482, "grad_norm": 1.075901985168457, "learning_rate": 7.069086021548971e-06, "loss": 0.121, "num_input_tokens_seen": 100522656, "step": 46620 }, { "epoch": 8.556615892824372, "grad_norm": 2.010593891143799, "learning_rate": 7.0683570181732865e-06, "loss": 0.1445, "num_input_tokens_seen": 100534080, "step": 46625 }, { "epoch": 8.557533492383923, "grad_norm": 16.7613525390625, "learning_rate": 7.06762796174745e-06, "loss": 0.6447, "num_input_tokens_seen": 100543680, "step": 46630 }, { "epoch": 8.558451091943477, "grad_norm": 40.84850311279297, "learning_rate": 7.06689885229016e-06, "loss": 0.1362, "num_input_tokens_seen": 100554848, "step": 46635 }, { "epoch": 8.559368691503028, "grad_norm": 34.52558517456055, "learning_rate": 7.06616968982012e-06, "loss": 0.1051, "num_input_tokens_seen": 100565824, "step": 46640 }, { "epoch": 8.56028629106258, "grad_norm": 0.513553261756897, "learning_rate": 7.065440474356028e-06, "loss": 0.3844, "num_input_tokens_seen": 100577696, "step": 46645 }, { "epoch": 8.561203890622133, "grad_norm": 10.545027732849121, "learning_rate": 7.06471120591659e-06, "loss": 0.3184, "num_input_tokens_seen": 100588192, "step": 46650 }, { "epoch": 8.562121490181685, "grad_norm": 2.365732192993164, "learning_rate": 7.063981884520509e-06, "loss": 0.1278, "num_input_tokens_seen": 100598688, "step": 46655 }, { "epoch": 8.563039089741237, "grad_norm": 14.758394241333008, "learning_rate": 7.063252510186493e-06, "loss": 0.3793, "num_input_tokens_seen": 100610240, "step": 46660 }, { "epoch": 8.56395668930079, "grad_norm": 0.48192110657691956, "learning_rate": 7.062523082933245e-06, "loss": 0.2653, "num_input_tokens_seen": 100621376, "step": 46665 }, { "epoch": 8.564874288860342, "grad_norm": 17.348920822143555, "learning_rate": 7.061793602779479e-06, "loss": 0.1359, "num_input_tokens_seen": 100631328, "step": 46670 }, { "epoch": 8.565791888419893, "grad_norm": 8.9627685546875, "learning_rate": 7.061064069743902e-06, "loss": 0.6168, "num_input_tokens_seen": 100642336, "step": 46675 }, { "epoch": 8.566709487979447, "grad_norm": 0.44243550300598145, "learning_rate": 7.060334483845225e-06, "loss": 0.3517, "num_input_tokens_seen": 100652992, "step": 46680 }, { "epoch": 8.567627087538998, "grad_norm": 17.83350944519043, "learning_rate": 7.059604845102161e-06, "loss": 0.2538, "num_input_tokens_seen": 100663584, "step": 46685 }, { "epoch": 8.56854468709855, "grad_norm": 0.15064772963523865, "learning_rate": 7.058875153533428e-06, "loss": 0.2413, "num_input_tokens_seen": 100674976, "step": 46690 }, { "epoch": 8.569462286658103, "grad_norm": 8.66602611541748, "learning_rate": 7.0581454091577354e-06, "loss": 0.2641, "num_input_tokens_seen": 100685728, "step": 46695 }, { "epoch": 8.570379886217655, "grad_norm": 25.76732063293457, "learning_rate": 7.057415611993803e-06, "loss": 0.1761, "num_input_tokens_seen": 100696544, "step": 46700 }, { "epoch": 8.571297485777206, "grad_norm": 11.373152732849121, "learning_rate": 7.0566857620603515e-06, "loss": 0.3633, "num_input_tokens_seen": 100707040, "step": 46705 }, { "epoch": 8.57221508533676, "grad_norm": 52.739776611328125, "learning_rate": 7.0559558593760944e-06, "loss": 0.0313, "num_input_tokens_seen": 100718496, "step": 46710 }, { "epoch": 8.573132684896311, "grad_norm": 22.21231460571289, "learning_rate": 7.055225903959759e-06, "loss": 0.1679, "num_input_tokens_seen": 100729728, "step": 46715 }, { "epoch": 8.574050284455863, "grad_norm": 1.9703316688537598, "learning_rate": 7.054495895830063e-06, "loss": 0.3459, "num_input_tokens_seen": 100740320, "step": 46720 }, { "epoch": 8.574967884015416, "grad_norm": 0.5832809209823608, "learning_rate": 7.053765835005732e-06, "loss": 0.2746, "num_input_tokens_seen": 100751264, "step": 46725 }, { "epoch": 8.575885483574968, "grad_norm": 10.987115859985352, "learning_rate": 7.053035721505489e-06, "loss": 0.3041, "num_input_tokens_seen": 100762528, "step": 46730 }, { "epoch": 8.57680308313452, "grad_norm": 18.023679733276367, "learning_rate": 7.052305555348062e-06, "loss": 0.2688, "num_input_tokens_seen": 100772000, "step": 46735 }, { "epoch": 8.577720682694073, "grad_norm": 8.258406639099121, "learning_rate": 7.051575336552179e-06, "loss": 0.2999, "num_input_tokens_seen": 100781792, "step": 46740 }, { "epoch": 8.578638282253625, "grad_norm": 4.071710586547852, "learning_rate": 7.050845065136568e-06, "loss": 0.1844, "num_input_tokens_seen": 100792896, "step": 46745 }, { "epoch": 8.579555881813176, "grad_norm": 18.65770721435547, "learning_rate": 7.05011474111996e-06, "loss": 0.2777, "num_input_tokens_seen": 100804608, "step": 46750 }, { "epoch": 8.58047348137273, "grad_norm": 0.7361364960670471, "learning_rate": 7.049384364521086e-06, "loss": 0.18, "num_input_tokens_seen": 100816480, "step": 46755 }, { "epoch": 8.581391080932281, "grad_norm": 6.727949142456055, "learning_rate": 7.048653935358681e-06, "loss": 0.2137, "num_input_tokens_seen": 100827616, "step": 46760 }, { "epoch": 8.582308680491833, "grad_norm": 76.05078125, "learning_rate": 7.047923453651474e-06, "loss": 0.3201, "num_input_tokens_seen": 100838976, "step": 46765 }, { "epoch": 8.583226280051386, "grad_norm": 34.002662658691406, "learning_rate": 7.047192919418207e-06, "loss": 0.1093, "num_input_tokens_seen": 100849920, "step": 46770 }, { "epoch": 8.584143879610938, "grad_norm": 15.99952220916748, "learning_rate": 7.046462332677614e-06, "loss": 0.1437, "num_input_tokens_seen": 100860000, "step": 46775 }, { "epoch": 8.58506147917049, "grad_norm": 21.1818790435791, "learning_rate": 7.045731693448434e-06, "loss": 0.2543, "num_input_tokens_seen": 100870208, "step": 46780 }, { "epoch": 8.585979078730043, "grad_norm": 45.963958740234375, "learning_rate": 7.045001001749406e-06, "loss": 0.1077, "num_input_tokens_seen": 100881664, "step": 46785 }, { "epoch": 8.586896678289595, "grad_norm": 39.136627197265625, "learning_rate": 7.044270257599273e-06, "loss": 0.1908, "num_input_tokens_seen": 100891488, "step": 46790 }, { "epoch": 8.587814277849146, "grad_norm": 40.71903610229492, "learning_rate": 7.043539461016775e-06, "loss": 0.4252, "num_input_tokens_seen": 100902176, "step": 46795 }, { "epoch": 8.5887318774087, "grad_norm": 0.9593797326087952, "learning_rate": 7.0428086120206575e-06, "loss": 0.0728, "num_input_tokens_seen": 100913344, "step": 46800 }, { "epoch": 8.589649476968251, "grad_norm": 51.31525802612305, "learning_rate": 7.0420777106296645e-06, "loss": 0.2572, "num_input_tokens_seen": 100923392, "step": 46805 }, { "epoch": 8.590567076527803, "grad_norm": 17.478199005126953, "learning_rate": 7.041346756862543e-06, "loss": 0.5106, "num_input_tokens_seen": 100935200, "step": 46810 }, { "epoch": 8.591484676087356, "grad_norm": 23.650976181030273, "learning_rate": 7.040615750738042e-06, "loss": 0.3997, "num_input_tokens_seen": 100946432, "step": 46815 }, { "epoch": 8.592402275646908, "grad_norm": 0.18955034017562866, "learning_rate": 7.03988469227491e-06, "loss": 0.1742, "num_input_tokens_seen": 100957600, "step": 46820 }, { "epoch": 8.59331987520646, "grad_norm": 0.5299834609031677, "learning_rate": 7.039153581491898e-06, "loss": 0.2558, "num_input_tokens_seen": 100967328, "step": 46825 }, { "epoch": 8.594237474766013, "grad_norm": 2.2362072467803955, "learning_rate": 7.038422418407754e-06, "loss": 0.2179, "num_input_tokens_seen": 100978176, "step": 46830 }, { "epoch": 8.595155074325564, "grad_norm": 24.30034828186035, "learning_rate": 7.037691203041236e-06, "loss": 0.1731, "num_input_tokens_seen": 100988896, "step": 46835 }, { "epoch": 8.596072673885116, "grad_norm": 79.50404357910156, "learning_rate": 7.036959935411096e-06, "loss": 0.3561, "num_input_tokens_seen": 101000288, "step": 46840 }, { "epoch": 8.59699027344467, "grad_norm": 6.463006019592285, "learning_rate": 7.036228615536091e-06, "loss": 0.4271, "num_input_tokens_seen": 101010816, "step": 46845 }, { "epoch": 8.597907873004221, "grad_norm": 18.107954025268555, "learning_rate": 7.035497243434979e-06, "loss": 0.216, "num_input_tokens_seen": 101021504, "step": 46850 }, { "epoch": 8.598825472563773, "grad_norm": 0.3339754343032837, "learning_rate": 7.034765819126517e-06, "loss": 0.2252, "num_input_tokens_seen": 101032416, "step": 46855 }, { "epoch": 8.599743072123326, "grad_norm": 36.00852584838867, "learning_rate": 7.034034342629464e-06, "loss": 0.412, "num_input_tokens_seen": 101043200, "step": 46860 }, { "epoch": 8.600660671682878, "grad_norm": 3.7178571224212646, "learning_rate": 7.0333028139625835e-06, "loss": 0.2348, "num_input_tokens_seen": 101053600, "step": 46865 }, { "epoch": 8.60157827124243, "grad_norm": 1.3460474014282227, "learning_rate": 7.032571233144638e-06, "loss": 0.0754, "num_input_tokens_seen": 101063296, "step": 46870 }, { "epoch": 8.602495870801983, "grad_norm": 14.607027053833008, "learning_rate": 7.03183960019439e-06, "loss": 0.38, "num_input_tokens_seen": 101074272, "step": 46875 }, { "epoch": 8.603413470361534, "grad_norm": 23.785717010498047, "learning_rate": 7.031107915130606e-06, "loss": 0.3154, "num_input_tokens_seen": 101086080, "step": 46880 }, { "epoch": 8.604331069921086, "grad_norm": 12.658291816711426, "learning_rate": 7.0303761779720516e-06, "loss": 0.1211, "num_input_tokens_seen": 101097440, "step": 46885 }, { "epoch": 8.60524866948064, "grad_norm": 18.74211311340332, "learning_rate": 7.029644388737493e-06, "loss": 0.429, "num_input_tokens_seen": 101106592, "step": 46890 }, { "epoch": 8.606166269040191, "grad_norm": 7.828802585601807, "learning_rate": 7.028912547445703e-06, "loss": 0.118, "num_input_tokens_seen": 101117632, "step": 46895 }, { "epoch": 8.607083868599743, "grad_norm": 2.6311259269714355, "learning_rate": 7.028180654115451e-06, "loss": 0.0478, "num_input_tokens_seen": 101128160, "step": 46900 }, { "epoch": 8.608001468159296, "grad_norm": 15.266012191772461, "learning_rate": 7.027448708765508e-06, "loss": 0.2468, "num_input_tokens_seen": 101138688, "step": 46905 }, { "epoch": 8.608919067718848, "grad_norm": 3.4664270877838135, "learning_rate": 7.026716711414648e-06, "loss": 0.1698, "num_input_tokens_seen": 101149312, "step": 46910 }, { "epoch": 8.6098366672784, "grad_norm": 43.80695343017578, "learning_rate": 7.025984662081645e-06, "loss": 0.2931, "num_input_tokens_seen": 101160512, "step": 46915 }, { "epoch": 8.610754266837953, "grad_norm": 16.515378952026367, "learning_rate": 7.025252560785276e-06, "loss": 0.1088, "num_input_tokens_seen": 101172480, "step": 46920 }, { "epoch": 8.611671866397504, "grad_norm": 22.198530197143555, "learning_rate": 7.024520407544319e-06, "loss": 0.4037, "num_input_tokens_seen": 101183008, "step": 46925 }, { "epoch": 8.612589465957056, "grad_norm": 13.61135482788086, "learning_rate": 7.023788202377549e-06, "loss": 0.2708, "num_input_tokens_seen": 101194272, "step": 46930 }, { "epoch": 8.61350706551661, "grad_norm": 20.988874435424805, "learning_rate": 7.02305594530375e-06, "loss": 0.3115, "num_input_tokens_seen": 101204576, "step": 46935 }, { "epoch": 8.614424665076161, "grad_norm": 3.0456199645996094, "learning_rate": 7.022323636341699e-06, "loss": 0.2193, "num_input_tokens_seen": 101214432, "step": 46940 }, { "epoch": 8.615342264635713, "grad_norm": 2.495818853378296, "learning_rate": 7.021591275510182e-06, "loss": 0.2482, "num_input_tokens_seen": 101224576, "step": 46945 }, { "epoch": 8.616259864195266, "grad_norm": 19.75491714477539, "learning_rate": 7.020858862827984e-06, "loss": 0.3239, "num_input_tokens_seen": 101235904, "step": 46950 }, { "epoch": 8.617177463754818, "grad_norm": 31.689062118530273, "learning_rate": 7.020126398313887e-06, "loss": 0.2405, "num_input_tokens_seen": 101246144, "step": 46955 }, { "epoch": 8.61809506331437, "grad_norm": 3.3795127868652344, "learning_rate": 7.019393881986678e-06, "loss": 0.1433, "num_input_tokens_seen": 101255872, "step": 46960 }, { "epoch": 8.619012662873923, "grad_norm": 13.227072715759277, "learning_rate": 7.018661313865147e-06, "loss": 0.2944, "num_input_tokens_seen": 101266656, "step": 46965 }, { "epoch": 8.619930262433474, "grad_norm": 25.330007553100586, "learning_rate": 7.01792869396808e-06, "loss": 0.4002, "num_input_tokens_seen": 101277408, "step": 46970 }, { "epoch": 8.620847861993026, "grad_norm": 12.226738929748535, "learning_rate": 7.017196022314272e-06, "loss": 0.4117, "num_input_tokens_seen": 101288000, "step": 46975 }, { "epoch": 8.62176546155258, "grad_norm": 0.09642227739095688, "learning_rate": 7.016463298922511e-06, "loss": 0.1547, "num_input_tokens_seen": 101297120, "step": 46980 }, { "epoch": 8.62268306111213, "grad_norm": 8.034829139709473, "learning_rate": 7.015730523811592e-06, "loss": 0.2222, "num_input_tokens_seen": 101308480, "step": 46985 }, { "epoch": 8.623600660671682, "grad_norm": 21.29810905456543, "learning_rate": 7.014997697000309e-06, "loss": 0.5408, "num_input_tokens_seen": 101319040, "step": 46990 }, { "epoch": 8.624518260231236, "grad_norm": 20.05063247680664, "learning_rate": 7.014264818507458e-06, "loss": 0.184, "num_input_tokens_seen": 101329888, "step": 46995 }, { "epoch": 8.625435859790787, "grad_norm": 1.9315279722213745, "learning_rate": 7.013531888351837e-06, "loss": 0.3422, "num_input_tokens_seen": 101341600, "step": 47000 }, { "epoch": 8.626353459350339, "grad_norm": 8.349811553955078, "learning_rate": 7.012798906552242e-06, "loss": 0.2398, "num_input_tokens_seen": 101353568, "step": 47005 }, { "epoch": 8.627271058909892, "grad_norm": 1.3694241046905518, "learning_rate": 7.012065873127476e-06, "loss": 0.4217, "num_input_tokens_seen": 101364352, "step": 47010 }, { "epoch": 8.628188658469444, "grad_norm": 61.36245346069336, "learning_rate": 7.011332788096338e-06, "loss": 0.2919, "num_input_tokens_seen": 101373952, "step": 47015 }, { "epoch": 8.629106258028996, "grad_norm": 2.296475410461426, "learning_rate": 7.010599651477632e-06, "loss": 0.2205, "num_input_tokens_seen": 101385504, "step": 47020 }, { "epoch": 8.630023857588549, "grad_norm": 0.44649818539619446, "learning_rate": 7.00986646329016e-06, "loss": 0.1688, "num_input_tokens_seen": 101396064, "step": 47025 }, { "epoch": 8.6309414571481, "grad_norm": 29.303112030029297, "learning_rate": 7.009133223552729e-06, "loss": 0.2373, "num_input_tokens_seen": 101407296, "step": 47030 }, { "epoch": 8.631859056707652, "grad_norm": 9.399794578552246, "learning_rate": 7.008399932284145e-06, "loss": 0.3073, "num_input_tokens_seen": 101417376, "step": 47035 }, { "epoch": 8.632776656267206, "grad_norm": 29.96837043762207, "learning_rate": 7.007666589503215e-06, "loss": 0.2018, "num_input_tokens_seen": 101428640, "step": 47040 }, { "epoch": 8.633694255826757, "grad_norm": 12.836272239685059, "learning_rate": 7.006933195228749e-06, "loss": 0.1225, "num_input_tokens_seen": 101440608, "step": 47045 }, { "epoch": 8.634611855386309, "grad_norm": 6.564709663391113, "learning_rate": 7.006199749479557e-06, "loss": 0.1891, "num_input_tokens_seen": 101450656, "step": 47050 }, { "epoch": 8.635529454945862, "grad_norm": 11.038847923278809, "learning_rate": 7.005466252274449e-06, "loss": 0.2395, "num_input_tokens_seen": 101461344, "step": 47055 }, { "epoch": 8.636447054505414, "grad_norm": 0.36907798051834106, "learning_rate": 7.004732703632242e-06, "loss": 0.1615, "num_input_tokens_seen": 101470976, "step": 47060 }, { "epoch": 8.637364654064966, "grad_norm": 14.018671989440918, "learning_rate": 7.003999103571747e-06, "loss": 0.5879, "num_input_tokens_seen": 101482176, "step": 47065 }, { "epoch": 8.638282253624519, "grad_norm": 7.2193074226379395, "learning_rate": 7.003265452111781e-06, "loss": 0.1256, "num_input_tokens_seen": 101494112, "step": 47070 }, { "epoch": 8.63919985318407, "grad_norm": 1.013270616531372, "learning_rate": 7.002531749271162e-06, "loss": 0.2405, "num_input_tokens_seen": 101505024, "step": 47075 }, { "epoch": 8.640117452743622, "grad_norm": 5.623209476470947, "learning_rate": 7.001797995068706e-06, "loss": 0.245, "num_input_tokens_seen": 101515104, "step": 47080 }, { "epoch": 8.641035052303176, "grad_norm": 2.1025941371917725, "learning_rate": 7.0010641895232345e-06, "loss": 0.1298, "num_input_tokens_seen": 101526016, "step": 47085 }, { "epoch": 8.641952651862727, "grad_norm": 31.229215621948242, "learning_rate": 7.000330332653569e-06, "loss": 0.4182, "num_input_tokens_seen": 101536192, "step": 47090 }, { "epoch": 8.642870251422279, "grad_norm": 0.5348070859909058, "learning_rate": 6.99959642447853e-06, "loss": 0.183, "num_input_tokens_seen": 101547360, "step": 47095 }, { "epoch": 8.643787850981832, "grad_norm": 0.3833472728729248, "learning_rate": 6.998862465016941e-06, "loss": 0.2826, "num_input_tokens_seen": 101557824, "step": 47100 }, { "epoch": 8.644705450541384, "grad_norm": 4.088527679443359, "learning_rate": 6.998128454287627e-06, "loss": 0.1852, "num_input_tokens_seen": 101570208, "step": 47105 }, { "epoch": 8.645623050100935, "grad_norm": 13.941957473754883, "learning_rate": 6.997394392309418e-06, "loss": 0.3549, "num_input_tokens_seen": 101581920, "step": 47110 }, { "epoch": 8.646540649660489, "grad_norm": 0.20531193912029266, "learning_rate": 6.996660279101135e-06, "loss": 0.2495, "num_input_tokens_seen": 101591872, "step": 47115 }, { "epoch": 8.64745824922004, "grad_norm": 13.841291427612305, "learning_rate": 6.995926114681612e-06, "loss": 0.2565, "num_input_tokens_seen": 101603648, "step": 47120 }, { "epoch": 8.648375848779592, "grad_norm": 5.236475467681885, "learning_rate": 6.995191899069678e-06, "loss": 0.0361, "num_input_tokens_seen": 101613408, "step": 47125 }, { "epoch": 8.649293448339145, "grad_norm": 8.139015197753906, "learning_rate": 6.994457632284164e-06, "loss": 0.1833, "num_input_tokens_seen": 101624928, "step": 47130 }, { "epoch": 8.650211047898697, "grad_norm": 18.163171768188477, "learning_rate": 6.993723314343903e-06, "loss": 0.4121, "num_input_tokens_seen": 101635712, "step": 47135 }, { "epoch": 8.651128647458249, "grad_norm": 25.644439697265625, "learning_rate": 6.992988945267728e-06, "loss": 0.1767, "num_input_tokens_seen": 101645408, "step": 47140 }, { "epoch": 8.652046247017802, "grad_norm": 0.7661779522895813, "learning_rate": 6.9922545250744754e-06, "loss": 0.2381, "num_input_tokens_seen": 101657056, "step": 47145 }, { "epoch": 8.652963846577354, "grad_norm": 4.166330337524414, "learning_rate": 6.991520053782983e-06, "loss": 0.1479, "num_input_tokens_seen": 101667232, "step": 47150 }, { "epoch": 8.653881446136905, "grad_norm": 7.774785041809082, "learning_rate": 6.990785531412087e-06, "loss": 0.2296, "num_input_tokens_seen": 101678432, "step": 47155 }, { "epoch": 8.654799045696459, "grad_norm": 0.8614110946655273, "learning_rate": 6.990050957980628e-06, "loss": 0.2872, "num_input_tokens_seen": 101687648, "step": 47160 }, { "epoch": 8.65571664525601, "grad_norm": 14.509252548217773, "learning_rate": 6.989316333507446e-06, "loss": 0.1577, "num_input_tokens_seen": 101698688, "step": 47165 }, { "epoch": 8.656634244815562, "grad_norm": 19.139728546142578, "learning_rate": 6.988581658011383e-06, "loss": 0.3567, "num_input_tokens_seen": 101710720, "step": 47170 }, { "epoch": 8.657551844375115, "grad_norm": 10.510132789611816, "learning_rate": 6.987846931511282e-06, "loss": 0.2901, "num_input_tokens_seen": 101721696, "step": 47175 }, { "epoch": 8.658469443934667, "grad_norm": 23.253671646118164, "learning_rate": 6.98711215402599e-06, "loss": 0.3333, "num_input_tokens_seen": 101732512, "step": 47180 }, { "epoch": 8.659387043494219, "grad_norm": 17.687463760375977, "learning_rate": 6.98637732557435e-06, "loss": 0.244, "num_input_tokens_seen": 101742624, "step": 47185 }, { "epoch": 8.660304643053772, "grad_norm": 8.056904792785645, "learning_rate": 6.985642446175209e-06, "loss": 0.0932, "num_input_tokens_seen": 101752992, "step": 47190 }, { "epoch": 8.661222242613324, "grad_norm": 23.217985153198242, "learning_rate": 6.984907515847418e-06, "loss": 0.0467, "num_input_tokens_seen": 101763264, "step": 47195 }, { "epoch": 8.662139842172875, "grad_norm": 1.2241497039794922, "learning_rate": 6.984172534609825e-06, "loss": 0.2077, "num_input_tokens_seen": 101773888, "step": 47200 }, { "epoch": 8.663057441732429, "grad_norm": 0.5641483068466187, "learning_rate": 6.983437502481283e-06, "loss": 0.0748, "num_input_tokens_seen": 101783680, "step": 47205 }, { "epoch": 8.66397504129198, "grad_norm": 6.0598344802856445, "learning_rate": 6.982702419480642e-06, "loss": 0.2218, "num_input_tokens_seen": 101794816, "step": 47210 }, { "epoch": 8.664892640851532, "grad_norm": 0.5796117186546326, "learning_rate": 6.981967285626756e-06, "loss": 0.5183, "num_input_tokens_seen": 101805760, "step": 47215 }, { "epoch": 8.665810240411085, "grad_norm": 22.63420295715332, "learning_rate": 6.981232100938482e-06, "loss": 0.2789, "num_input_tokens_seen": 101816800, "step": 47220 }, { "epoch": 8.666727839970637, "grad_norm": 33.792781829833984, "learning_rate": 6.980496865434675e-06, "loss": 0.252, "num_input_tokens_seen": 101826656, "step": 47225 }, { "epoch": 8.667645439530189, "grad_norm": 8.833284378051758, "learning_rate": 6.979761579134193e-06, "loss": 0.3583, "num_input_tokens_seen": 101837632, "step": 47230 }, { "epoch": 8.668563039089742, "grad_norm": 1.3543033599853516, "learning_rate": 6.979026242055895e-06, "loss": 0.2283, "num_input_tokens_seen": 101848160, "step": 47235 }, { "epoch": 8.669480638649294, "grad_norm": 57.10014724731445, "learning_rate": 6.97829085421864e-06, "loss": 0.2521, "num_input_tokens_seen": 101860128, "step": 47240 }, { "epoch": 8.670398238208845, "grad_norm": 19.89800262451172, "learning_rate": 6.9775554156412925e-06, "loss": 0.2083, "num_input_tokens_seen": 101871520, "step": 47245 }, { "epoch": 8.671315837768399, "grad_norm": 0.4416313171386719, "learning_rate": 6.976819926342712e-06, "loss": 0.2929, "num_input_tokens_seen": 101882912, "step": 47250 }, { "epoch": 8.67223343732795, "grad_norm": 1.110317587852478, "learning_rate": 6.976084386341766e-06, "loss": 0.3016, "num_input_tokens_seen": 101894432, "step": 47255 }, { "epoch": 8.673151036887502, "grad_norm": 8.703290939331055, "learning_rate": 6.975348795657316e-06, "loss": 0.3191, "num_input_tokens_seen": 101906368, "step": 47260 }, { "epoch": 8.674068636447055, "grad_norm": 5.410447120666504, "learning_rate": 6.974613154308232e-06, "loss": 0.3356, "num_input_tokens_seen": 101916640, "step": 47265 }, { "epoch": 8.674986236006607, "grad_norm": 29.565532684326172, "learning_rate": 6.973877462313381e-06, "loss": 0.2071, "num_input_tokens_seen": 101928896, "step": 47270 }, { "epoch": 8.675903835566158, "grad_norm": 19.559730529785156, "learning_rate": 6.973141719691632e-06, "loss": 0.304, "num_input_tokens_seen": 101940800, "step": 47275 }, { "epoch": 8.676821435125712, "grad_norm": 9.854049682617188, "learning_rate": 6.972405926461856e-06, "loss": 0.0876, "num_input_tokens_seen": 101951840, "step": 47280 }, { "epoch": 8.677739034685263, "grad_norm": 52.59219741821289, "learning_rate": 6.971670082642925e-06, "loss": 0.1443, "num_input_tokens_seen": 101962432, "step": 47285 }, { "epoch": 8.678656634244815, "grad_norm": 0.6278728246688843, "learning_rate": 6.9709341882537105e-06, "loss": 0.0324, "num_input_tokens_seen": 101974048, "step": 47290 }, { "epoch": 8.679574233804368, "grad_norm": 13.253408432006836, "learning_rate": 6.970198243313091e-06, "loss": 0.1969, "num_input_tokens_seen": 101984736, "step": 47295 }, { "epoch": 8.68049183336392, "grad_norm": 4.798233509063721, "learning_rate": 6.969462247839941e-06, "loss": 0.1607, "num_input_tokens_seen": 101996352, "step": 47300 }, { "epoch": 8.681409432923472, "grad_norm": 0.834413468837738, "learning_rate": 6.9687262018531345e-06, "loss": 0.3351, "num_input_tokens_seen": 102006464, "step": 47305 }, { "epoch": 8.682327032483025, "grad_norm": 0.28016212582588196, "learning_rate": 6.9679901053715536e-06, "loss": 0.1549, "num_input_tokens_seen": 102017632, "step": 47310 }, { "epoch": 8.683244632042577, "grad_norm": 3.5979955196380615, "learning_rate": 6.967253958414075e-06, "loss": 0.2882, "num_input_tokens_seen": 102028352, "step": 47315 }, { "epoch": 8.684162231602128, "grad_norm": 2.6270558834075928, "learning_rate": 6.966517760999584e-06, "loss": 0.1629, "num_input_tokens_seen": 102039488, "step": 47320 }, { "epoch": 8.685079831161682, "grad_norm": 14.003230094909668, "learning_rate": 6.965781513146957e-06, "loss": 0.3838, "num_input_tokens_seen": 102049152, "step": 47325 }, { "epoch": 8.685997430721233, "grad_norm": 4.712395191192627, "learning_rate": 6.965045214875083e-06, "loss": 0.3262, "num_input_tokens_seen": 102060448, "step": 47330 }, { "epoch": 8.686915030280785, "grad_norm": 0.5746816396713257, "learning_rate": 6.964308866202844e-06, "loss": 0.2879, "num_input_tokens_seen": 102070528, "step": 47335 }, { "epoch": 8.687832629840338, "grad_norm": 0.7343613505363464, "learning_rate": 6.963572467149128e-06, "loss": 0.1429, "num_input_tokens_seen": 102081632, "step": 47340 }, { "epoch": 8.68875022939989, "grad_norm": 12.840872764587402, "learning_rate": 6.96283601773282e-06, "loss": 0.1627, "num_input_tokens_seen": 102091264, "step": 47345 }, { "epoch": 8.689667828959442, "grad_norm": 16.5947322845459, "learning_rate": 6.962099517972811e-06, "loss": 0.11, "num_input_tokens_seen": 102102400, "step": 47350 }, { "epoch": 8.690585428518995, "grad_norm": 3.5437631607055664, "learning_rate": 6.96136296788799e-06, "loss": 0.0702, "num_input_tokens_seen": 102113504, "step": 47355 }, { "epoch": 8.691503028078547, "grad_norm": 0.3575475811958313, "learning_rate": 6.9606263674972485e-06, "loss": 0.3086, "num_input_tokens_seen": 102124160, "step": 47360 }, { "epoch": 8.692420627638098, "grad_norm": 39.98072814941406, "learning_rate": 6.959889716819481e-06, "loss": 0.2656, "num_input_tokens_seen": 102133280, "step": 47365 }, { "epoch": 8.693338227197652, "grad_norm": 0.18704065680503845, "learning_rate": 6.959153015873578e-06, "loss": 0.2947, "num_input_tokens_seen": 102143328, "step": 47370 }, { "epoch": 8.694255826757203, "grad_norm": 0.51894611120224, "learning_rate": 6.958416264678437e-06, "loss": 0.05, "num_input_tokens_seen": 102152160, "step": 47375 }, { "epoch": 8.695173426316755, "grad_norm": 37.49546813964844, "learning_rate": 6.9576794632529546e-06, "loss": 0.3044, "num_input_tokens_seen": 102160896, "step": 47380 }, { "epoch": 8.696091025876308, "grad_norm": 16.95137596130371, "learning_rate": 6.9569426116160275e-06, "loss": 0.2884, "num_input_tokens_seen": 102172160, "step": 47385 }, { "epoch": 8.69700862543586, "grad_norm": 8.797825813293457, "learning_rate": 6.956205709786556e-06, "loss": 0.1081, "num_input_tokens_seen": 102182464, "step": 47390 }, { "epoch": 8.697926224995411, "grad_norm": 27.993486404418945, "learning_rate": 6.955468757783439e-06, "loss": 0.3694, "num_input_tokens_seen": 102193952, "step": 47395 }, { "epoch": 8.698843824554965, "grad_norm": 41.72188186645508, "learning_rate": 6.9547317556255785e-06, "loss": 0.0815, "num_input_tokens_seen": 102204256, "step": 47400 }, { "epoch": 8.699761424114516, "grad_norm": 10.431255340576172, "learning_rate": 6.953994703331881e-06, "loss": 0.0454, "num_input_tokens_seen": 102213856, "step": 47405 }, { "epoch": 8.700679023674068, "grad_norm": 64.24150848388672, "learning_rate": 6.953257600921246e-06, "loss": 0.2311, "num_input_tokens_seen": 102225152, "step": 47410 }, { "epoch": 8.701596623233621, "grad_norm": 0.3615933954715729, "learning_rate": 6.952520448412581e-06, "loss": 0.2551, "num_input_tokens_seen": 102236576, "step": 47415 }, { "epoch": 8.702514222793173, "grad_norm": 0.4644511938095093, "learning_rate": 6.951783245824793e-06, "loss": 0.1044, "num_input_tokens_seen": 102247904, "step": 47420 }, { "epoch": 8.703431822352725, "grad_norm": 149.99928283691406, "learning_rate": 6.951045993176788e-06, "loss": 0.3537, "num_input_tokens_seen": 102259744, "step": 47425 }, { "epoch": 8.704349421912278, "grad_norm": 6.306975364685059, "learning_rate": 6.9503086904874795e-06, "loss": 0.41, "num_input_tokens_seen": 102270688, "step": 47430 }, { "epoch": 8.70526702147183, "grad_norm": 13.9313325881958, "learning_rate": 6.949571337775774e-06, "loss": 0.2434, "num_input_tokens_seen": 102281120, "step": 47435 }, { "epoch": 8.706184621031381, "grad_norm": 0.3855036199092865, "learning_rate": 6.948833935060586e-06, "loss": 0.3341, "num_input_tokens_seen": 102291584, "step": 47440 }, { "epoch": 8.707102220590935, "grad_norm": 80.01282501220703, "learning_rate": 6.948096482360827e-06, "loss": 0.2159, "num_input_tokens_seen": 102300832, "step": 47445 }, { "epoch": 8.708019820150486, "grad_norm": 12.508126258850098, "learning_rate": 6.947358979695413e-06, "loss": 0.2376, "num_input_tokens_seen": 102312544, "step": 47450 }, { "epoch": 8.708937419710038, "grad_norm": 0.5569498538970947, "learning_rate": 6.9466214270832596e-06, "loss": 0.1511, "num_input_tokens_seen": 102322624, "step": 47455 }, { "epoch": 8.709855019269591, "grad_norm": 14.21998119354248, "learning_rate": 6.945883824543283e-06, "loss": 0.2296, "num_input_tokens_seen": 102332960, "step": 47460 }, { "epoch": 8.710772618829143, "grad_norm": 6.2108049392700195, "learning_rate": 6.9451461720944035e-06, "loss": 0.2961, "num_input_tokens_seen": 102344704, "step": 47465 }, { "epoch": 8.711690218388695, "grad_norm": 73.74520874023438, "learning_rate": 6.944408469755539e-06, "loss": 0.3749, "num_input_tokens_seen": 102355808, "step": 47470 }, { "epoch": 8.712607817948248, "grad_norm": 0.5400476455688477, "learning_rate": 6.943670717545611e-06, "loss": 0.429, "num_input_tokens_seen": 102366784, "step": 47475 }, { "epoch": 8.7135254175078, "grad_norm": 14.739776611328125, "learning_rate": 6.94293291548354e-06, "loss": 0.3077, "num_input_tokens_seen": 102376736, "step": 47480 }, { "epoch": 8.714443017067351, "grad_norm": 3.7075297832489014, "learning_rate": 6.9421950635882514e-06, "loss": 0.2348, "num_input_tokens_seen": 102386016, "step": 47485 }, { "epoch": 8.715360616626905, "grad_norm": 16.999433517456055, "learning_rate": 6.941457161878671e-06, "loss": 0.2846, "num_input_tokens_seen": 102397920, "step": 47490 }, { "epoch": 8.716278216186456, "grad_norm": 1.9596158266067505, "learning_rate": 6.940719210373722e-06, "loss": 0.2723, "num_input_tokens_seen": 102407456, "step": 47495 }, { "epoch": 8.717195815746008, "grad_norm": 0.9075090289115906, "learning_rate": 6.939981209092334e-06, "loss": 0.2766, "num_input_tokens_seen": 102416608, "step": 47500 }, { "epoch": 8.718113415305561, "grad_norm": 11.085712432861328, "learning_rate": 6.939243158053434e-06, "loss": 0.2739, "num_input_tokens_seen": 102428064, "step": 47505 }, { "epoch": 8.719031014865113, "grad_norm": 25.696748733520508, "learning_rate": 6.938505057275951e-06, "loss": 0.0675, "num_input_tokens_seen": 102440032, "step": 47510 }, { "epoch": 8.719948614424665, "grad_norm": 51.928409576416016, "learning_rate": 6.937766906778821e-06, "loss": 0.3264, "num_input_tokens_seen": 102451520, "step": 47515 }, { "epoch": 8.720866213984218, "grad_norm": 51.030555725097656, "learning_rate": 6.937028706580973e-06, "loss": 0.2612, "num_input_tokens_seen": 102462272, "step": 47520 }, { "epoch": 8.72178381354377, "grad_norm": 43.48740005493164, "learning_rate": 6.9362904567013374e-06, "loss": 0.1833, "num_input_tokens_seen": 102472160, "step": 47525 }, { "epoch": 8.722701413103321, "grad_norm": 0.8848761320114136, "learning_rate": 6.935552157158856e-06, "loss": 0.4516, "num_input_tokens_seen": 102483712, "step": 47530 }, { "epoch": 8.723619012662875, "grad_norm": 3.843626022338867, "learning_rate": 6.934813807972459e-06, "loss": 0.1453, "num_input_tokens_seen": 102494176, "step": 47535 }, { "epoch": 8.724536612222426, "grad_norm": 1.8949599266052246, "learning_rate": 6.934075409161088e-06, "loss": 0.1456, "num_input_tokens_seen": 102505696, "step": 47540 }, { "epoch": 8.725454211781978, "grad_norm": 3.7786202430725098, "learning_rate": 6.933336960743679e-06, "loss": 0.4281, "num_input_tokens_seen": 102514752, "step": 47545 }, { "epoch": 8.726371811341531, "grad_norm": 17.47023582458496, "learning_rate": 6.932598462739176e-06, "loss": 0.1709, "num_input_tokens_seen": 102523808, "step": 47550 }, { "epoch": 8.727289410901083, "grad_norm": 2.8958663940429688, "learning_rate": 6.931859915166516e-06, "loss": 0.0671, "num_input_tokens_seen": 102533824, "step": 47555 }, { "epoch": 8.728207010460634, "grad_norm": 1.5893065929412842, "learning_rate": 6.931121318044642e-06, "loss": 0.3295, "num_input_tokens_seen": 102544320, "step": 47560 }, { "epoch": 8.729124610020188, "grad_norm": 10.146950721740723, "learning_rate": 6.930382671392502e-06, "loss": 0.4561, "num_input_tokens_seen": 102554816, "step": 47565 }, { "epoch": 8.73004220957974, "grad_norm": 4.700613498687744, "learning_rate": 6.929643975229036e-06, "loss": 0.2351, "num_input_tokens_seen": 102565024, "step": 47570 }, { "epoch": 8.730959809139291, "grad_norm": 2.69523549079895, "learning_rate": 6.928905229573194e-06, "loss": 0.2691, "num_input_tokens_seen": 102575552, "step": 47575 }, { "epoch": 8.731877408698844, "grad_norm": 5.704378128051758, "learning_rate": 6.9281664344439215e-06, "loss": 0.161, "num_input_tokens_seen": 102586784, "step": 47580 }, { "epoch": 8.732795008258396, "grad_norm": 48.00297164916992, "learning_rate": 6.927427589860167e-06, "loss": 0.1681, "num_input_tokens_seen": 102597472, "step": 47585 }, { "epoch": 8.733712607817948, "grad_norm": 20.187990188598633, "learning_rate": 6.9266886958408855e-06, "loss": 0.298, "num_input_tokens_seen": 102608992, "step": 47590 }, { "epoch": 8.734630207377501, "grad_norm": 30.83647918701172, "learning_rate": 6.9259497524050225e-06, "loss": 0.4716, "num_input_tokens_seen": 102618720, "step": 47595 }, { "epoch": 8.735547806937053, "grad_norm": 68.82733154296875, "learning_rate": 6.925210759571535e-06, "loss": 0.3299, "num_input_tokens_seen": 102630368, "step": 47600 }, { "epoch": 8.736465406496604, "grad_norm": 0.7773579955101013, "learning_rate": 6.924471717359373e-06, "loss": 0.2139, "num_input_tokens_seen": 102641344, "step": 47605 }, { "epoch": 8.737383006056158, "grad_norm": 1.5060982704162598, "learning_rate": 6.923732625787496e-06, "loss": 0.1727, "num_input_tokens_seen": 102653312, "step": 47610 }, { "epoch": 8.73830060561571, "grad_norm": 19.830081939697266, "learning_rate": 6.922993484874858e-06, "loss": 0.0824, "num_input_tokens_seen": 102665344, "step": 47615 }, { "epoch": 8.739218205175261, "grad_norm": 4.346163749694824, "learning_rate": 6.922254294640419e-06, "loss": 0.1317, "num_input_tokens_seen": 102676352, "step": 47620 }, { "epoch": 8.740135804734814, "grad_norm": 2.8589940071105957, "learning_rate": 6.921515055103134e-06, "loss": 0.2998, "num_input_tokens_seen": 102685824, "step": 47625 }, { "epoch": 8.741053404294366, "grad_norm": 19.512340545654297, "learning_rate": 6.920775766281968e-06, "loss": 0.3496, "num_input_tokens_seen": 102695840, "step": 47630 }, { "epoch": 8.741971003853918, "grad_norm": 20.691650390625, "learning_rate": 6.92003642819588e-06, "loss": 0.1008, "num_input_tokens_seen": 102707424, "step": 47635 }, { "epoch": 8.742888603413471, "grad_norm": 7.266117572784424, "learning_rate": 6.919297040863832e-06, "loss": 0.332, "num_input_tokens_seen": 102716608, "step": 47640 }, { "epoch": 8.743806202973023, "grad_norm": 2.0402350425720215, "learning_rate": 6.918557604304792e-06, "loss": 0.2937, "num_input_tokens_seen": 102727072, "step": 47645 }, { "epoch": 8.744723802532574, "grad_norm": 1.675865650177002, "learning_rate": 6.917818118537721e-06, "loss": 0.2868, "num_input_tokens_seen": 102738720, "step": 47650 }, { "epoch": 8.745641402092128, "grad_norm": 9.97654914855957, "learning_rate": 6.917078583581589e-06, "loss": 0.2111, "num_input_tokens_seen": 102750624, "step": 47655 }, { "epoch": 8.74655900165168, "grad_norm": 16.209285736083984, "learning_rate": 6.916338999455362e-06, "loss": 0.3256, "num_input_tokens_seen": 102760576, "step": 47660 }, { "epoch": 8.74747660121123, "grad_norm": 16.647171020507812, "learning_rate": 6.915599366178008e-06, "loss": 0.2626, "num_input_tokens_seen": 102771712, "step": 47665 }, { "epoch": 8.748394200770784, "grad_norm": 24.320146560668945, "learning_rate": 6.914859683768501e-06, "loss": 0.2707, "num_input_tokens_seen": 102782656, "step": 47670 }, { "epoch": 8.749311800330336, "grad_norm": 19.940689086914062, "learning_rate": 6.9141199522458115e-06, "loss": 0.4489, "num_input_tokens_seen": 102794432, "step": 47675 }, { "epoch": 8.750229399889887, "grad_norm": 8.676697731018066, "learning_rate": 6.913380171628909e-06, "loss": 0.1507, "num_input_tokens_seen": 102804032, "step": 47680 }, { "epoch": 8.75114699944944, "grad_norm": 0.819581925868988, "learning_rate": 6.912640341936774e-06, "loss": 0.1949, "num_input_tokens_seen": 102815264, "step": 47685 }, { "epoch": 8.752064599008992, "grad_norm": 9.624887466430664, "learning_rate": 6.911900463188377e-06, "loss": 0.2996, "num_input_tokens_seen": 102826176, "step": 47690 }, { "epoch": 8.752982198568544, "grad_norm": 48.74971389770508, "learning_rate": 6.911160535402694e-06, "loss": 0.1151, "num_input_tokens_seen": 102836480, "step": 47695 }, { "epoch": 8.753899798128097, "grad_norm": 29.9375057220459, "learning_rate": 6.910420558598709e-06, "loss": 0.3457, "num_input_tokens_seen": 102847296, "step": 47700 }, { "epoch": 8.754817397687649, "grad_norm": 7.065600395202637, "learning_rate": 6.9096805327953955e-06, "loss": 0.1817, "num_input_tokens_seen": 102858080, "step": 47705 }, { "epoch": 8.7557349972472, "grad_norm": 23.830490112304688, "learning_rate": 6.9089404580117365e-06, "loss": 0.3043, "num_input_tokens_seen": 102868256, "step": 47710 }, { "epoch": 8.756652596806754, "grad_norm": 42.36482238769531, "learning_rate": 6.9082003342667145e-06, "loss": 0.2276, "num_input_tokens_seen": 102879552, "step": 47715 }, { "epoch": 8.757570196366306, "grad_norm": 36.9723014831543, "learning_rate": 6.907460161579309e-06, "loss": 0.2258, "num_input_tokens_seen": 102890144, "step": 47720 }, { "epoch": 8.758487795925857, "grad_norm": 6.124765872955322, "learning_rate": 6.906719939968509e-06, "loss": 0.3191, "num_input_tokens_seen": 102898880, "step": 47725 }, { "epoch": 8.75940539548541, "grad_norm": 70.4590072631836, "learning_rate": 6.905979669453298e-06, "loss": 0.1274, "num_input_tokens_seen": 102909696, "step": 47730 }, { "epoch": 8.760322995044962, "grad_norm": 23.937162399291992, "learning_rate": 6.90523935005266e-06, "loss": 0.2009, "num_input_tokens_seen": 102921184, "step": 47735 }, { "epoch": 8.761240594604514, "grad_norm": 12.511819839477539, "learning_rate": 6.904498981785588e-06, "loss": 0.2912, "num_input_tokens_seen": 102931712, "step": 47740 }, { "epoch": 8.762158194164067, "grad_norm": 1.1327741146087646, "learning_rate": 6.903758564671067e-06, "loss": 0.3002, "num_input_tokens_seen": 102943136, "step": 47745 }, { "epoch": 8.763075793723619, "grad_norm": 81.97864532470703, "learning_rate": 6.90301809872809e-06, "loss": 0.2699, "num_input_tokens_seen": 102954496, "step": 47750 }, { "epoch": 8.76399339328317, "grad_norm": 24.59691047668457, "learning_rate": 6.902277583975647e-06, "loss": 0.3556, "num_input_tokens_seen": 102964736, "step": 47755 }, { "epoch": 8.764910992842724, "grad_norm": 2.4215426445007324, "learning_rate": 6.901537020432735e-06, "loss": 0.2902, "num_input_tokens_seen": 102976096, "step": 47760 }, { "epoch": 8.765828592402276, "grad_norm": 2.1614534854888916, "learning_rate": 6.900796408118343e-06, "loss": 0.235, "num_input_tokens_seen": 102987200, "step": 47765 }, { "epoch": 8.766746191961827, "grad_norm": 31.039405822753906, "learning_rate": 6.90005574705147e-06, "loss": 0.3643, "num_input_tokens_seen": 102998592, "step": 47770 }, { "epoch": 8.76766379152138, "grad_norm": 21.8255672454834, "learning_rate": 6.899315037251112e-06, "loss": 0.1552, "num_input_tokens_seen": 103010752, "step": 47775 }, { "epoch": 8.768581391080932, "grad_norm": 69.31684875488281, "learning_rate": 6.898574278736266e-06, "loss": 0.1712, "num_input_tokens_seen": 103022208, "step": 47780 }, { "epoch": 8.769498990640484, "grad_norm": 3.8159372806549072, "learning_rate": 6.897833471525934e-06, "loss": 0.0858, "num_input_tokens_seen": 103032736, "step": 47785 }, { "epoch": 8.770416590200037, "grad_norm": 7.944155216217041, "learning_rate": 6.897092615639114e-06, "loss": 0.2843, "num_input_tokens_seen": 103044544, "step": 47790 }, { "epoch": 8.771334189759589, "grad_norm": 4.615546703338623, "learning_rate": 6.896351711094809e-06, "loss": 0.3286, "num_input_tokens_seen": 103056128, "step": 47795 }, { "epoch": 8.77225178931914, "grad_norm": 18.598615646362305, "learning_rate": 6.895610757912021e-06, "loss": 0.1613, "num_input_tokens_seen": 103066880, "step": 47800 }, { "epoch": 8.773169388878694, "grad_norm": 15.036959648132324, "learning_rate": 6.894869756109754e-06, "loss": 0.3228, "num_input_tokens_seen": 103078272, "step": 47805 }, { "epoch": 8.774086988438246, "grad_norm": 0.24796880781650543, "learning_rate": 6.894128705707018e-06, "loss": 0.0999, "num_input_tokens_seen": 103089248, "step": 47810 }, { "epoch": 8.775004587997797, "grad_norm": 6.787639141082764, "learning_rate": 6.893387606722812e-06, "loss": 0.1083, "num_input_tokens_seen": 103099552, "step": 47815 }, { "epoch": 8.77592218755735, "grad_norm": 0.5740192532539368, "learning_rate": 6.892646459176152e-06, "loss": 0.0268, "num_input_tokens_seen": 103111424, "step": 47820 }, { "epoch": 8.776839787116902, "grad_norm": 22.197704315185547, "learning_rate": 6.891905263086042e-06, "loss": 0.2381, "num_input_tokens_seen": 103121856, "step": 47825 }, { "epoch": 8.777757386676454, "grad_norm": 34.13423156738281, "learning_rate": 6.891164018471493e-06, "loss": 0.2797, "num_input_tokens_seen": 103131616, "step": 47830 }, { "epoch": 8.778674986236007, "grad_norm": 3.004750967025757, "learning_rate": 6.89042272535152e-06, "loss": 0.3114, "num_input_tokens_seen": 103141824, "step": 47835 }, { "epoch": 8.779592585795559, "grad_norm": 15.697099685668945, "learning_rate": 6.889681383745135e-06, "loss": 0.2661, "num_input_tokens_seen": 103152640, "step": 47840 }, { "epoch": 8.78051018535511, "grad_norm": 16.30970573425293, "learning_rate": 6.888939993671349e-06, "loss": 0.2624, "num_input_tokens_seen": 103162048, "step": 47845 }, { "epoch": 8.781427784914664, "grad_norm": 35.3829460144043, "learning_rate": 6.888198555149181e-06, "loss": 0.2198, "num_input_tokens_seen": 103172800, "step": 47850 }, { "epoch": 8.782345384474215, "grad_norm": 0.4391857981681824, "learning_rate": 6.887457068197645e-06, "loss": 0.2387, "num_input_tokens_seen": 103182560, "step": 47855 }, { "epoch": 8.783262984033767, "grad_norm": 25.911880493164062, "learning_rate": 6.886715532835761e-06, "loss": 0.0426, "num_input_tokens_seen": 103193600, "step": 47860 }, { "epoch": 8.78418058359332, "grad_norm": 3.5618255138397217, "learning_rate": 6.8859739490825485e-06, "loss": 0.1974, "num_input_tokens_seen": 103204544, "step": 47865 }, { "epoch": 8.785098183152872, "grad_norm": 0.6749131679534912, "learning_rate": 6.885232316957027e-06, "loss": 0.1945, "num_input_tokens_seen": 103214464, "step": 47870 }, { "epoch": 8.786015782712424, "grad_norm": 44.73566818237305, "learning_rate": 6.884490636478217e-06, "loss": 0.2458, "num_input_tokens_seen": 103225152, "step": 47875 }, { "epoch": 8.786933382271977, "grad_norm": 26.595998764038086, "learning_rate": 6.883748907665144e-06, "loss": 0.3179, "num_input_tokens_seen": 103235232, "step": 47880 }, { "epoch": 8.787850981831529, "grad_norm": 43.18854522705078, "learning_rate": 6.883007130536832e-06, "loss": 0.178, "num_input_tokens_seen": 103244640, "step": 47885 }, { "epoch": 8.78876858139108, "grad_norm": 98.17923736572266, "learning_rate": 6.882265305112303e-06, "loss": 0.3918, "num_input_tokens_seen": 103255136, "step": 47890 }, { "epoch": 8.789686180950634, "grad_norm": 1.1477514505386353, "learning_rate": 6.881523431410589e-06, "loss": 0.0328, "num_input_tokens_seen": 103266368, "step": 47895 }, { "epoch": 8.790603780510185, "grad_norm": 68.12902069091797, "learning_rate": 6.880781509450714e-06, "loss": 0.4846, "num_input_tokens_seen": 103276672, "step": 47900 }, { "epoch": 8.791521380069737, "grad_norm": 16.31528663635254, "learning_rate": 6.8800395392517074e-06, "loss": 0.2454, "num_input_tokens_seen": 103286720, "step": 47905 }, { "epoch": 8.79243897962929, "grad_norm": 8.1959228515625, "learning_rate": 6.879297520832602e-06, "loss": 0.125, "num_input_tokens_seen": 103296960, "step": 47910 }, { "epoch": 8.793356579188842, "grad_norm": 18.901418685913086, "learning_rate": 6.878555454212426e-06, "loss": 0.1569, "num_input_tokens_seen": 103307744, "step": 47915 }, { "epoch": 8.794274178748394, "grad_norm": 3.6862082481384277, "learning_rate": 6.877813339410215e-06, "loss": 0.2379, "num_input_tokens_seen": 103318208, "step": 47920 }, { "epoch": 8.795191778307947, "grad_norm": 50.25375747680664, "learning_rate": 6.8770711764450026e-06, "loss": 0.3005, "num_input_tokens_seen": 103330336, "step": 47925 }, { "epoch": 8.796109377867499, "grad_norm": 4.130699634552002, "learning_rate": 6.876328965335822e-06, "loss": 0.2564, "num_input_tokens_seen": 103341440, "step": 47930 }, { "epoch": 8.79702697742705, "grad_norm": 11.042112350463867, "learning_rate": 6.875586706101713e-06, "loss": 0.3358, "num_input_tokens_seen": 103351200, "step": 47935 }, { "epoch": 8.797944576986604, "grad_norm": 35.93891525268555, "learning_rate": 6.874844398761712e-06, "loss": 0.2453, "num_input_tokens_seen": 103361728, "step": 47940 }, { "epoch": 8.798862176546155, "grad_norm": 1.303654432296753, "learning_rate": 6.874102043334858e-06, "loss": 0.1989, "num_input_tokens_seen": 103372224, "step": 47945 }, { "epoch": 8.799779776105707, "grad_norm": 2.236428737640381, "learning_rate": 6.873359639840191e-06, "loss": 0.2146, "num_input_tokens_seen": 103382720, "step": 47950 }, { "epoch": 8.80069737566526, "grad_norm": 6.411398887634277, "learning_rate": 6.872617188296753e-06, "loss": 0.296, "num_input_tokens_seen": 103394720, "step": 47955 }, { "epoch": 8.801614975224812, "grad_norm": 0.41554194688796997, "learning_rate": 6.871874688723586e-06, "loss": 0.2604, "num_input_tokens_seen": 103403808, "step": 47960 }, { "epoch": 8.802532574784363, "grad_norm": 7.532866954803467, "learning_rate": 6.871132141139734e-06, "loss": 0.1538, "num_input_tokens_seen": 103415104, "step": 47965 }, { "epoch": 8.803450174343917, "grad_norm": 24.0738582611084, "learning_rate": 6.870389545564243e-06, "loss": 0.1248, "num_input_tokens_seen": 103426720, "step": 47970 }, { "epoch": 8.804367773903468, "grad_norm": 8.25544548034668, "learning_rate": 6.869646902016158e-06, "loss": 0.0692, "num_input_tokens_seen": 103438976, "step": 47975 }, { "epoch": 8.80528537346302, "grad_norm": 21.731971740722656, "learning_rate": 6.8689042105145296e-06, "loss": 0.1311, "num_input_tokens_seen": 103449760, "step": 47980 }, { "epoch": 8.806202973022573, "grad_norm": 3.096045732498169, "learning_rate": 6.8681614710784025e-06, "loss": 0.2951, "num_input_tokens_seen": 103461888, "step": 47985 }, { "epoch": 8.807120572582125, "grad_norm": 40.97630310058594, "learning_rate": 6.867418683726831e-06, "loss": 0.1438, "num_input_tokens_seen": 103472000, "step": 47990 }, { "epoch": 8.808038172141677, "grad_norm": 33.65089797973633, "learning_rate": 6.8666758484788645e-06, "loss": 0.4093, "num_input_tokens_seen": 103482048, "step": 47995 }, { "epoch": 8.80895577170123, "grad_norm": 33.92933654785156, "learning_rate": 6.865932965353555e-06, "loss": 0.1624, "num_input_tokens_seen": 103489504, "step": 48000 }, { "epoch": 8.809873371260782, "grad_norm": 4.148365497589111, "learning_rate": 6.865190034369956e-06, "loss": 0.2932, "num_input_tokens_seen": 103499520, "step": 48005 }, { "epoch": 8.810790970820333, "grad_norm": 52.730709075927734, "learning_rate": 6.864447055547123e-06, "loss": 0.1595, "num_input_tokens_seen": 103510304, "step": 48010 }, { "epoch": 8.811708570379887, "grad_norm": 0.6594122052192688, "learning_rate": 6.8637040289041135e-06, "loss": 0.3038, "num_input_tokens_seen": 103520960, "step": 48015 }, { "epoch": 8.812626169939438, "grad_norm": 20.759950637817383, "learning_rate": 6.862960954459985e-06, "loss": 0.3225, "num_input_tokens_seen": 103531808, "step": 48020 }, { "epoch": 8.81354376949899, "grad_norm": 0.41866162419319153, "learning_rate": 6.862217832233795e-06, "loss": 0.146, "num_input_tokens_seen": 103543968, "step": 48025 }, { "epoch": 8.814461369058543, "grad_norm": 2.5443923473358154, "learning_rate": 6.861474662244604e-06, "loss": 0.0843, "num_input_tokens_seen": 103555840, "step": 48030 }, { "epoch": 8.815378968618095, "grad_norm": 16.55647087097168, "learning_rate": 6.860731444511471e-06, "loss": 0.0525, "num_input_tokens_seen": 103567424, "step": 48035 }, { "epoch": 8.816296568177647, "grad_norm": 41.292564392089844, "learning_rate": 6.859988179053461e-06, "loss": 0.3367, "num_input_tokens_seen": 103578304, "step": 48040 }, { "epoch": 8.8172141677372, "grad_norm": 49.82685852050781, "learning_rate": 6.859244865889639e-06, "loss": 0.2012, "num_input_tokens_seen": 103588032, "step": 48045 }, { "epoch": 8.818131767296752, "grad_norm": 0.8893808722496033, "learning_rate": 6.8585015050390655e-06, "loss": 0.2942, "num_input_tokens_seen": 103599488, "step": 48050 }, { "epoch": 8.819049366856303, "grad_norm": 52.5490837097168, "learning_rate": 6.85775809652081e-06, "loss": 0.39, "num_input_tokens_seen": 103608448, "step": 48055 }, { "epoch": 8.819966966415857, "grad_norm": 27.034381866455078, "learning_rate": 6.85701464035394e-06, "loss": 0.3591, "num_input_tokens_seen": 103617952, "step": 48060 }, { "epoch": 8.820884565975408, "grad_norm": 2.0266401767730713, "learning_rate": 6.85627113655752e-06, "loss": 0.3294, "num_input_tokens_seen": 103629248, "step": 48065 }, { "epoch": 8.82180216553496, "grad_norm": 0.3664933145046234, "learning_rate": 6.855527585150623e-06, "loss": 0.093, "num_input_tokens_seen": 103638240, "step": 48070 }, { "epoch": 8.822719765094513, "grad_norm": 37.20455551147461, "learning_rate": 6.85478398615232e-06, "loss": 0.476, "num_input_tokens_seen": 103649056, "step": 48075 }, { "epoch": 8.823637364654065, "grad_norm": 0.3978164792060852, "learning_rate": 6.854040339581683e-06, "loss": 0.111, "num_input_tokens_seen": 103659168, "step": 48080 }, { "epoch": 8.824554964213617, "grad_norm": 60.1739387512207, "learning_rate": 6.853296645457782e-06, "loss": 0.474, "num_input_tokens_seen": 103669856, "step": 48085 }, { "epoch": 8.82547256377317, "grad_norm": 9.289338111877441, "learning_rate": 6.852552903799697e-06, "loss": 0.2282, "num_input_tokens_seen": 103679392, "step": 48090 }, { "epoch": 8.826390163332722, "grad_norm": 11.725995063781738, "learning_rate": 6.8518091146264995e-06, "loss": 0.1596, "num_input_tokens_seen": 103689088, "step": 48095 }, { "epoch": 8.827307762892273, "grad_norm": 17.871986389160156, "learning_rate": 6.851065277957268e-06, "loss": 0.1002, "num_input_tokens_seen": 103701088, "step": 48100 }, { "epoch": 8.828225362451827, "grad_norm": 24.90865135192871, "learning_rate": 6.8503213938110835e-06, "loss": 0.6039, "num_input_tokens_seen": 103711904, "step": 48105 }, { "epoch": 8.829142962011378, "grad_norm": 5.841371059417725, "learning_rate": 6.849577462207021e-06, "loss": 0.1799, "num_input_tokens_seen": 103722720, "step": 48110 }, { "epoch": 8.83006056157093, "grad_norm": 1.2916767597198486, "learning_rate": 6.848833483164164e-06, "loss": 0.077, "num_input_tokens_seen": 103733312, "step": 48115 }, { "epoch": 8.830978161130483, "grad_norm": 49.26021957397461, "learning_rate": 6.8480894567015936e-06, "loss": 0.1939, "num_input_tokens_seen": 103743808, "step": 48120 }, { "epoch": 8.831895760690035, "grad_norm": 16.92588233947754, "learning_rate": 6.847345382838392e-06, "loss": 0.1627, "num_input_tokens_seen": 103755648, "step": 48125 }, { "epoch": 8.832813360249586, "grad_norm": 0.27578622102737427, "learning_rate": 6.846601261593645e-06, "loss": 0.0455, "num_input_tokens_seen": 103767104, "step": 48130 }, { "epoch": 8.83373095980914, "grad_norm": 13.42563247680664, "learning_rate": 6.845857092986437e-06, "loss": 0.0961, "num_input_tokens_seen": 103778784, "step": 48135 }, { "epoch": 8.834648559368691, "grad_norm": 2.065094470977783, "learning_rate": 6.8451128770358565e-06, "loss": 0.1107, "num_input_tokens_seen": 103789536, "step": 48140 }, { "epoch": 8.835566158928243, "grad_norm": 4.328145980834961, "learning_rate": 6.84436861376099e-06, "loss": 0.2112, "num_input_tokens_seen": 103800448, "step": 48145 }, { "epoch": 8.836483758487796, "grad_norm": 8.176202774047852, "learning_rate": 6.843624303180927e-06, "loss": 0.2985, "num_input_tokens_seen": 103810112, "step": 48150 }, { "epoch": 8.837401358047348, "grad_norm": 23.597394943237305, "learning_rate": 6.842879945314758e-06, "loss": 0.1563, "num_input_tokens_seen": 103820832, "step": 48155 }, { "epoch": 8.8383189576069, "grad_norm": 37.44876480102539, "learning_rate": 6.842135540181575e-06, "loss": 0.4832, "num_input_tokens_seen": 103832128, "step": 48160 }, { "epoch": 8.839236557166453, "grad_norm": 90.42942810058594, "learning_rate": 6.84139108780047e-06, "loss": 0.3148, "num_input_tokens_seen": 103842176, "step": 48165 }, { "epoch": 8.840154156726005, "grad_norm": 0.577962338924408, "learning_rate": 6.840646588190539e-06, "loss": 0.1379, "num_input_tokens_seen": 103853088, "step": 48170 }, { "epoch": 8.841071756285556, "grad_norm": 2.4199891090393066, "learning_rate": 6.839902041370873e-06, "loss": 0.2757, "num_input_tokens_seen": 103862784, "step": 48175 }, { "epoch": 8.84198935584511, "grad_norm": 0.5288499593734741, "learning_rate": 6.839157447360573e-06, "loss": 0.3225, "num_input_tokens_seen": 103873696, "step": 48180 }, { "epoch": 8.842906955404661, "grad_norm": 54.42566680908203, "learning_rate": 6.838412806178734e-06, "loss": 0.4567, "num_input_tokens_seen": 103883456, "step": 48185 }, { "epoch": 8.843824554964213, "grad_norm": 50.58745574951172, "learning_rate": 6.837668117844456e-06, "loss": 0.3537, "num_input_tokens_seen": 103894336, "step": 48190 }, { "epoch": 8.844742154523766, "grad_norm": 35.09788513183594, "learning_rate": 6.836923382376839e-06, "loss": 0.3674, "num_input_tokens_seen": 103905088, "step": 48195 }, { "epoch": 8.845659754083318, "grad_norm": 17.91348648071289, "learning_rate": 6.8361785997949835e-06, "loss": 0.5144, "num_input_tokens_seen": 103914912, "step": 48200 }, { "epoch": 8.84657735364287, "grad_norm": 22.73192024230957, "learning_rate": 6.835433770117994e-06, "loss": 0.7018, "num_input_tokens_seen": 103925280, "step": 48205 }, { "epoch": 8.847494953202423, "grad_norm": 24.0748291015625, "learning_rate": 6.834688893364972e-06, "loss": 0.2003, "num_input_tokens_seen": 103935680, "step": 48210 }, { "epoch": 8.848412552761975, "grad_norm": 0.8518433570861816, "learning_rate": 6.833943969555025e-06, "loss": 0.2715, "num_input_tokens_seen": 103947360, "step": 48215 }, { "epoch": 8.849330152321526, "grad_norm": 1.1327966451644897, "learning_rate": 6.833198998707256e-06, "loss": 0.1364, "num_input_tokens_seen": 103959200, "step": 48220 }, { "epoch": 8.85024775188108, "grad_norm": 34.714420318603516, "learning_rate": 6.832453980840774e-06, "loss": 0.257, "num_input_tokens_seen": 103968544, "step": 48225 }, { "epoch": 8.851165351440631, "grad_norm": 17.289419174194336, "learning_rate": 6.831708915974689e-06, "loss": 0.3718, "num_input_tokens_seen": 103979136, "step": 48230 }, { "epoch": 8.852082951000183, "grad_norm": 32.20241928100586, "learning_rate": 6.830963804128107e-06, "loss": 0.2428, "num_input_tokens_seen": 103990336, "step": 48235 }, { "epoch": 8.853000550559736, "grad_norm": 12.540009498596191, "learning_rate": 6.830218645320142e-06, "loss": 0.3382, "num_input_tokens_seen": 104000736, "step": 48240 }, { "epoch": 8.853918150119288, "grad_norm": 21.46393394470215, "learning_rate": 6.829473439569906e-06, "loss": 0.1414, "num_input_tokens_seen": 104013472, "step": 48245 }, { "epoch": 8.85483574967884, "grad_norm": 0.6307848691940308, "learning_rate": 6.8287281868965114e-06, "loss": 0.0613, "num_input_tokens_seen": 104024128, "step": 48250 }, { "epoch": 8.855753349238393, "grad_norm": 0.3584218919277191, "learning_rate": 6.827982887319072e-06, "loss": 0.0959, "num_input_tokens_seen": 104034592, "step": 48255 }, { "epoch": 8.856670948797944, "grad_norm": 72.0197525024414, "learning_rate": 6.827237540856705e-06, "loss": 0.2285, "num_input_tokens_seen": 104045408, "step": 48260 }, { "epoch": 8.857588548357496, "grad_norm": 11.048515319824219, "learning_rate": 6.8264921475285284e-06, "loss": 0.2844, "num_input_tokens_seen": 104054272, "step": 48265 }, { "epoch": 8.85850614791705, "grad_norm": 25.529891967773438, "learning_rate": 6.825746707353659e-06, "loss": 0.4137, "num_input_tokens_seen": 104065728, "step": 48270 }, { "epoch": 8.859423747476601, "grad_norm": 7.423262119293213, "learning_rate": 6.825001220351215e-06, "loss": 0.4053, "num_input_tokens_seen": 104077152, "step": 48275 }, { "epoch": 8.860341347036153, "grad_norm": 2.123488664627075, "learning_rate": 6.8242556865403185e-06, "loss": 0.1864, "num_input_tokens_seen": 104088352, "step": 48280 }, { "epoch": 8.861258946595706, "grad_norm": 0.9391509890556335, "learning_rate": 6.823510105940091e-06, "loss": 0.3931, "num_input_tokens_seen": 104099392, "step": 48285 }, { "epoch": 8.862176546155258, "grad_norm": 16.329879760742188, "learning_rate": 6.822764478569655e-06, "loss": 0.3041, "num_input_tokens_seen": 104110880, "step": 48290 }, { "epoch": 8.86309414571481, "grad_norm": 10.604148864746094, "learning_rate": 6.8220188044481364e-06, "loss": 0.1332, "num_input_tokens_seen": 104121152, "step": 48295 }, { "epoch": 8.864011745274363, "grad_norm": 3.7373766899108887, "learning_rate": 6.82127308359466e-06, "loss": 0.1173, "num_input_tokens_seen": 104130912, "step": 48300 }, { "epoch": 8.864929344833914, "grad_norm": 17.152088165283203, "learning_rate": 6.820527316028349e-06, "loss": 0.2308, "num_input_tokens_seen": 104142624, "step": 48305 }, { "epoch": 8.865846944393466, "grad_norm": 25.626962661743164, "learning_rate": 6.819781501768334e-06, "loss": 0.3059, "num_input_tokens_seen": 104153376, "step": 48310 }, { "epoch": 8.86676454395302, "grad_norm": 11.377333641052246, "learning_rate": 6.819035640833746e-06, "loss": 0.1083, "num_input_tokens_seen": 104164704, "step": 48315 }, { "epoch": 8.867682143512571, "grad_norm": 8.667713165283203, "learning_rate": 6.818289733243713e-06, "loss": 0.291, "num_input_tokens_seen": 104176448, "step": 48320 }, { "epoch": 8.868599743072123, "grad_norm": 29.51824378967285, "learning_rate": 6.817543779017365e-06, "loss": 0.2992, "num_input_tokens_seen": 104188416, "step": 48325 }, { "epoch": 8.869517342631676, "grad_norm": 3.1254308223724365, "learning_rate": 6.816797778173836e-06, "loss": 0.3038, "num_input_tokens_seen": 104200192, "step": 48330 }, { "epoch": 8.870434942191228, "grad_norm": 2.0364418029785156, "learning_rate": 6.816051730732261e-06, "loss": 0.0225, "num_input_tokens_seen": 104211648, "step": 48335 }, { "epoch": 8.87135254175078, "grad_norm": 0.6999585628509521, "learning_rate": 6.815305636711773e-06, "loss": 0.1967, "num_input_tokens_seen": 104222560, "step": 48340 }, { "epoch": 8.872270141310333, "grad_norm": 0.23581784963607788, "learning_rate": 6.814559496131509e-06, "loss": 0.1614, "num_input_tokens_seen": 104234048, "step": 48345 }, { "epoch": 8.873187740869884, "grad_norm": 49.775394439697266, "learning_rate": 6.813813309010606e-06, "loss": 0.3561, "num_input_tokens_seen": 104243168, "step": 48350 }, { "epoch": 8.874105340429436, "grad_norm": 84.31710052490234, "learning_rate": 6.813067075368203e-06, "loss": 0.1807, "num_input_tokens_seen": 104255008, "step": 48355 }, { "epoch": 8.87502293998899, "grad_norm": 8.504558563232422, "learning_rate": 6.812320795223439e-06, "loss": 0.0643, "num_input_tokens_seen": 104265120, "step": 48360 }, { "epoch": 8.875940539548541, "grad_norm": 10.771785736083984, "learning_rate": 6.811574468595457e-06, "loss": 0.2647, "num_input_tokens_seen": 104275840, "step": 48365 }, { "epoch": 8.876858139108093, "grad_norm": 0.7159941792488098, "learning_rate": 6.810828095503396e-06, "loss": 0.2864, "num_input_tokens_seen": 104287360, "step": 48370 }, { "epoch": 8.877775738667646, "grad_norm": 18.846111297607422, "learning_rate": 6.810081675966403e-06, "loss": 0.2739, "num_input_tokens_seen": 104299904, "step": 48375 }, { "epoch": 8.878693338227198, "grad_norm": 0.7612117528915405, "learning_rate": 6.809335210003618e-06, "loss": 0.1829, "num_input_tokens_seen": 104310176, "step": 48380 }, { "epoch": 8.87961093778675, "grad_norm": 0.36001092195510864, "learning_rate": 6.80858869763419e-06, "loss": 0.2462, "num_input_tokens_seen": 104320256, "step": 48385 }, { "epoch": 8.880528537346303, "grad_norm": 11.054055213928223, "learning_rate": 6.807842138877266e-06, "loss": 0.5055, "num_input_tokens_seen": 104331008, "step": 48390 }, { "epoch": 8.881446136905854, "grad_norm": 2.185722589492798, "learning_rate": 6.807095533751991e-06, "loss": 0.3837, "num_input_tokens_seen": 104340096, "step": 48395 }, { "epoch": 8.882363736465406, "grad_norm": 11.59762191772461, "learning_rate": 6.806348882277517e-06, "loss": 0.4425, "num_input_tokens_seen": 104351488, "step": 48400 }, { "epoch": 8.88328133602496, "grad_norm": 22.909221649169922, "learning_rate": 6.805602184472993e-06, "loss": 0.2664, "num_input_tokens_seen": 104362208, "step": 48405 }, { "epoch": 8.88419893558451, "grad_norm": 2.5882911682128906, "learning_rate": 6.804855440357574e-06, "loss": 0.3596, "num_input_tokens_seen": 104372704, "step": 48410 }, { "epoch": 8.885116535144062, "grad_norm": 51.57261657714844, "learning_rate": 6.8041086499504075e-06, "loss": 0.3281, "num_input_tokens_seen": 104383104, "step": 48415 }, { "epoch": 8.886034134703616, "grad_norm": 15.990274429321289, "learning_rate": 6.803361813270651e-06, "loss": 0.2488, "num_input_tokens_seen": 104394240, "step": 48420 }, { "epoch": 8.886951734263167, "grad_norm": 16.287681579589844, "learning_rate": 6.80261493033746e-06, "loss": 0.2371, "num_input_tokens_seen": 104404672, "step": 48425 }, { "epoch": 8.887869333822719, "grad_norm": 14.62651252746582, "learning_rate": 6.801868001169989e-06, "loss": 0.3153, "num_input_tokens_seen": 104416064, "step": 48430 }, { "epoch": 8.888786933382272, "grad_norm": 22.32321548461914, "learning_rate": 6.801121025787396e-06, "loss": 0.2905, "num_input_tokens_seen": 104425856, "step": 48435 }, { "epoch": 8.889704532941824, "grad_norm": 33.481868743896484, "learning_rate": 6.8003740042088405e-06, "loss": 0.1941, "num_input_tokens_seen": 104438080, "step": 48440 }, { "epoch": 8.890622132501376, "grad_norm": 14.989931106567383, "learning_rate": 6.799626936453483e-06, "loss": 0.3165, "num_input_tokens_seen": 104449056, "step": 48445 }, { "epoch": 8.891539732060929, "grad_norm": 0.7517880797386169, "learning_rate": 6.7988798225404825e-06, "loss": 0.278, "num_input_tokens_seen": 104459616, "step": 48450 }, { "epoch": 8.89245733162048, "grad_norm": 22.04067611694336, "learning_rate": 6.7981326624890034e-06, "loss": 0.2578, "num_input_tokens_seen": 104469920, "step": 48455 }, { "epoch": 8.893374931180032, "grad_norm": 12.987753868103027, "learning_rate": 6.797385456318208e-06, "loss": 0.3796, "num_input_tokens_seen": 104480512, "step": 48460 }, { "epoch": 8.894292530739586, "grad_norm": 43.82875061035156, "learning_rate": 6.7966382040472614e-06, "loss": 0.1125, "num_input_tokens_seen": 104492000, "step": 48465 }, { "epoch": 8.895210130299137, "grad_norm": 20.97743034362793, "learning_rate": 6.795890905695329e-06, "loss": 0.1212, "num_input_tokens_seen": 104503712, "step": 48470 }, { "epoch": 8.896127729858689, "grad_norm": 3.0644612312316895, "learning_rate": 6.795143561281579e-06, "loss": 0.0917, "num_input_tokens_seen": 104515616, "step": 48475 }, { "epoch": 8.897045329418242, "grad_norm": 0.3560504615306854, "learning_rate": 6.794396170825179e-06, "loss": 0.2925, "num_input_tokens_seen": 104524640, "step": 48480 }, { "epoch": 8.897962928977794, "grad_norm": 0.15010903775691986, "learning_rate": 6.793648734345298e-06, "loss": 0.407, "num_input_tokens_seen": 104536224, "step": 48485 }, { "epoch": 8.898880528537346, "grad_norm": 31.643400192260742, "learning_rate": 6.7929012518611074e-06, "loss": 0.1386, "num_input_tokens_seen": 104546016, "step": 48490 }, { "epoch": 8.899798128096899, "grad_norm": 9.185596466064453, "learning_rate": 6.792153723391776e-06, "loss": 0.1577, "num_input_tokens_seen": 104555936, "step": 48495 }, { "epoch": 8.90071572765645, "grad_norm": 35.72987747192383, "learning_rate": 6.791406148956482e-06, "loss": 0.1053, "num_input_tokens_seen": 104566816, "step": 48500 }, { "epoch": 8.901633327216002, "grad_norm": 0.4510985016822815, "learning_rate": 6.7906585285743954e-06, "loss": 0.3101, "num_input_tokens_seen": 104577472, "step": 48505 }, { "epoch": 8.902550926775556, "grad_norm": 0.5085439085960388, "learning_rate": 6.789910862264693e-06, "loss": 0.2479, "num_input_tokens_seen": 104588352, "step": 48510 }, { "epoch": 8.903468526335107, "grad_norm": 36.208770751953125, "learning_rate": 6.789163150046552e-06, "loss": 0.3441, "num_input_tokens_seen": 104599392, "step": 48515 }, { "epoch": 8.904386125894659, "grad_norm": 7.326928615570068, "learning_rate": 6.7884153919391475e-06, "loss": 0.2285, "num_input_tokens_seen": 104610784, "step": 48520 }, { "epoch": 8.905303725454212, "grad_norm": 83.91622161865234, "learning_rate": 6.787667587961662e-06, "loss": 0.2055, "num_input_tokens_seen": 104622400, "step": 48525 }, { "epoch": 8.906221325013764, "grad_norm": 8.537835121154785, "learning_rate": 6.786919738133271e-06, "loss": 0.4944, "num_input_tokens_seen": 104633376, "step": 48530 }, { "epoch": 8.907138924573315, "grad_norm": 1.2032605409622192, "learning_rate": 6.786171842473159e-06, "loss": 0.1255, "num_input_tokens_seen": 104643520, "step": 48535 }, { "epoch": 8.908056524132869, "grad_norm": 37.647361755371094, "learning_rate": 6.785423901000509e-06, "loss": 0.255, "num_input_tokens_seen": 104654592, "step": 48540 }, { "epoch": 8.90897412369242, "grad_norm": 27.627487182617188, "learning_rate": 6.784675913734502e-06, "loss": 0.5173, "num_input_tokens_seen": 104665248, "step": 48545 }, { "epoch": 8.909891723251972, "grad_norm": 1.9142913818359375, "learning_rate": 6.783927880694323e-06, "loss": 0.0907, "num_input_tokens_seen": 104677024, "step": 48550 }, { "epoch": 8.910809322811525, "grad_norm": 27.164953231811523, "learning_rate": 6.78317980189916e-06, "loss": 0.0925, "num_input_tokens_seen": 104687840, "step": 48555 }, { "epoch": 8.911726922371077, "grad_norm": 6.469851016998291, "learning_rate": 6.7824316773681985e-06, "loss": 0.2991, "num_input_tokens_seen": 104699232, "step": 48560 }, { "epoch": 8.912644521930629, "grad_norm": 59.16693115234375, "learning_rate": 6.781683507120627e-06, "loss": 0.3378, "num_input_tokens_seen": 104709344, "step": 48565 }, { "epoch": 8.913562121490182, "grad_norm": 0.4679892063140869, "learning_rate": 6.780935291175636e-06, "loss": 0.1641, "num_input_tokens_seen": 104720416, "step": 48570 }, { "epoch": 8.914479721049734, "grad_norm": 6.1498188972473145, "learning_rate": 6.780187029552412e-06, "loss": 0.3748, "num_input_tokens_seen": 104731136, "step": 48575 }, { "epoch": 8.915397320609285, "grad_norm": 7.126240253448486, "learning_rate": 6.7794387222701505e-06, "loss": 0.2691, "num_input_tokens_seen": 104740640, "step": 48580 }, { "epoch": 8.916314920168839, "grad_norm": 1.9186863899230957, "learning_rate": 6.778690369348047e-06, "loss": 0.4062, "num_input_tokens_seen": 104749760, "step": 48585 }, { "epoch": 8.91723251972839, "grad_norm": 23.14101219177246, "learning_rate": 6.777941970805288e-06, "loss": 0.4931, "num_input_tokens_seen": 104759712, "step": 48590 }, { "epoch": 8.918150119287942, "grad_norm": 2.8010470867156982, "learning_rate": 6.777193526661077e-06, "loss": 0.1085, "num_input_tokens_seen": 104771136, "step": 48595 }, { "epoch": 8.919067718847495, "grad_norm": 15.751725196838379, "learning_rate": 6.776445036934605e-06, "loss": 0.3981, "num_input_tokens_seen": 104782208, "step": 48600 }, { "epoch": 8.919985318407047, "grad_norm": 2.1315929889678955, "learning_rate": 6.775696501645069e-06, "loss": 0.5825, "num_input_tokens_seen": 104793376, "step": 48605 }, { "epoch": 8.920902917966599, "grad_norm": 3.8658504486083984, "learning_rate": 6.774947920811672e-06, "loss": 0.2326, "num_input_tokens_seen": 104804256, "step": 48610 }, { "epoch": 8.921820517526152, "grad_norm": 21.152437210083008, "learning_rate": 6.77419929445361e-06, "loss": 0.3376, "num_input_tokens_seen": 104815456, "step": 48615 }, { "epoch": 8.922738117085704, "grad_norm": 1.2843741178512573, "learning_rate": 6.7734506225900875e-06, "loss": 0.2754, "num_input_tokens_seen": 104826336, "step": 48620 }, { "epoch": 8.923655716645255, "grad_norm": 1.249419093132019, "learning_rate": 6.772701905240304e-06, "loss": 0.1394, "num_input_tokens_seen": 104835264, "step": 48625 }, { "epoch": 8.924573316204809, "grad_norm": 3.382612705230713, "learning_rate": 6.771953142423464e-06, "loss": 0.0886, "num_input_tokens_seen": 104845952, "step": 48630 }, { "epoch": 8.92549091576436, "grad_norm": 0.6385360956192017, "learning_rate": 6.771204334158773e-06, "loss": 0.2768, "num_input_tokens_seen": 104857312, "step": 48635 }, { "epoch": 8.926408515323912, "grad_norm": 0.4709765613079071, "learning_rate": 6.770455480465435e-06, "loss": 0.1904, "num_input_tokens_seen": 104866688, "step": 48640 }, { "epoch": 8.927326114883465, "grad_norm": 2.162060499191284, "learning_rate": 6.7697065813626595e-06, "loss": 0.1607, "num_input_tokens_seen": 104876960, "step": 48645 }, { "epoch": 8.928243714443017, "grad_norm": 10.453695297241211, "learning_rate": 6.768957636869652e-06, "loss": 0.204, "num_input_tokens_seen": 104886976, "step": 48650 }, { "epoch": 8.929161314002569, "grad_norm": 15.715230941772461, "learning_rate": 6.768208647005622e-06, "loss": 0.1413, "num_input_tokens_seen": 104897120, "step": 48655 }, { "epoch": 8.930078913562122, "grad_norm": 39.99949264526367, "learning_rate": 6.767459611789782e-06, "loss": 0.2657, "num_input_tokens_seen": 104908544, "step": 48660 }, { "epoch": 8.930996513121674, "grad_norm": 37.3134765625, "learning_rate": 6.766710531241341e-06, "loss": 0.4021, "num_input_tokens_seen": 104919808, "step": 48665 }, { "epoch": 8.931914112681225, "grad_norm": 49.417869567871094, "learning_rate": 6.7659614053795146e-06, "loss": 0.1999, "num_input_tokens_seen": 104930784, "step": 48670 }, { "epoch": 8.932831712240779, "grad_norm": 0.9426974058151245, "learning_rate": 6.7652122342235135e-06, "loss": 0.2583, "num_input_tokens_seen": 104940736, "step": 48675 }, { "epoch": 8.93374931180033, "grad_norm": 0.5151312947273254, "learning_rate": 6.764463017792555e-06, "loss": 0.2718, "num_input_tokens_seen": 104951392, "step": 48680 }, { "epoch": 8.934666911359882, "grad_norm": 28.94462013244629, "learning_rate": 6.763713756105855e-06, "loss": 0.2237, "num_input_tokens_seen": 104962048, "step": 48685 }, { "epoch": 8.935584510919435, "grad_norm": 62.0762825012207, "learning_rate": 6.762964449182631e-06, "loss": 0.3079, "num_input_tokens_seen": 104972864, "step": 48690 }, { "epoch": 8.936502110478987, "grad_norm": 14.845759391784668, "learning_rate": 6.762215097042101e-06, "loss": 0.3235, "num_input_tokens_seen": 104983328, "step": 48695 }, { "epoch": 8.937419710038538, "grad_norm": 22.681970596313477, "learning_rate": 6.761465699703485e-06, "loss": 0.3733, "num_input_tokens_seen": 104995232, "step": 48700 }, { "epoch": 8.938337309598092, "grad_norm": 4.625432968139648, "learning_rate": 6.760716257186004e-06, "loss": 0.0875, "num_input_tokens_seen": 105005120, "step": 48705 }, { "epoch": 8.939254909157643, "grad_norm": 8.070934295654297, "learning_rate": 6.75996676950888e-06, "loss": 0.1253, "num_input_tokens_seen": 105015968, "step": 48710 }, { "epoch": 8.940172508717195, "grad_norm": 9.145340919494629, "learning_rate": 6.759217236691335e-06, "loss": 0.4165, "num_input_tokens_seen": 105028224, "step": 48715 }, { "epoch": 8.941090108276748, "grad_norm": 4.137453079223633, "learning_rate": 6.7584676587525955e-06, "loss": 0.1809, "num_input_tokens_seen": 105038176, "step": 48720 }, { "epoch": 8.9420077078363, "grad_norm": 1.175076961517334, "learning_rate": 6.757718035711885e-06, "loss": 0.2125, "num_input_tokens_seen": 105049504, "step": 48725 }, { "epoch": 8.942925307395852, "grad_norm": 94.92337799072266, "learning_rate": 6.756968367588432e-06, "loss": 0.22, "num_input_tokens_seen": 105059232, "step": 48730 }, { "epoch": 8.943842906955405, "grad_norm": 18.94788932800293, "learning_rate": 6.756218654401463e-06, "loss": 0.0824, "num_input_tokens_seen": 105070304, "step": 48735 }, { "epoch": 8.944760506514957, "grad_norm": 2.9245898723602295, "learning_rate": 6.755468896170207e-06, "loss": 0.232, "num_input_tokens_seen": 105081056, "step": 48740 }, { "epoch": 8.945678106074508, "grad_norm": 27.69425392150879, "learning_rate": 6.754719092913895e-06, "loss": 0.6415, "num_input_tokens_seen": 105091840, "step": 48745 }, { "epoch": 8.946595705634062, "grad_norm": 57.90724563598633, "learning_rate": 6.753969244651757e-06, "loss": 0.1955, "num_input_tokens_seen": 105102816, "step": 48750 }, { "epoch": 8.947513305193613, "grad_norm": 18.334753036499023, "learning_rate": 6.753219351403027e-06, "loss": 0.1298, "num_input_tokens_seen": 105114176, "step": 48755 }, { "epoch": 8.948430904753165, "grad_norm": 5.910434722900391, "learning_rate": 6.752469413186938e-06, "loss": 0.3952, "num_input_tokens_seen": 105124960, "step": 48760 }, { "epoch": 8.949348504312718, "grad_norm": 0.9617211818695068, "learning_rate": 6.751719430022724e-06, "loss": 0.2213, "num_input_tokens_seen": 105135232, "step": 48765 }, { "epoch": 8.95026610387227, "grad_norm": 20.628414154052734, "learning_rate": 6.7509694019296226e-06, "loss": 0.3129, "num_input_tokens_seen": 105146304, "step": 48770 }, { "epoch": 8.951183703431822, "grad_norm": 13.581259727478027, "learning_rate": 6.750219328926868e-06, "loss": 0.1244, "num_input_tokens_seen": 105156928, "step": 48775 }, { "epoch": 8.952101302991375, "grad_norm": 10.619101524353027, "learning_rate": 6.749469211033702e-06, "loss": 0.4652, "num_input_tokens_seen": 105166880, "step": 48780 }, { "epoch": 8.953018902550927, "grad_norm": 2.3261892795562744, "learning_rate": 6.748719048269362e-06, "loss": 0.1486, "num_input_tokens_seen": 105177184, "step": 48785 }, { "epoch": 8.953936502110478, "grad_norm": 45.771446228027344, "learning_rate": 6.747968840653087e-06, "loss": 0.1458, "num_input_tokens_seen": 105187584, "step": 48790 }, { "epoch": 8.954854101670032, "grad_norm": 1.1537450551986694, "learning_rate": 6.747218588204123e-06, "loss": 0.0737, "num_input_tokens_seen": 105197824, "step": 48795 }, { "epoch": 8.955771701229583, "grad_norm": 0.7648068070411682, "learning_rate": 6.746468290941708e-06, "loss": 0.2214, "num_input_tokens_seen": 105207936, "step": 48800 }, { "epoch": 8.956689300789135, "grad_norm": 5.689183235168457, "learning_rate": 6.74571794888509e-06, "loss": 0.2359, "num_input_tokens_seen": 105217920, "step": 48805 }, { "epoch": 8.957606900348688, "grad_norm": 4.549508571624756, "learning_rate": 6.744967562053512e-06, "loss": 0.1135, "num_input_tokens_seen": 105228512, "step": 48810 }, { "epoch": 8.95852449990824, "grad_norm": 30.23515510559082, "learning_rate": 6.744217130466219e-06, "loss": 0.379, "num_input_tokens_seen": 105239328, "step": 48815 }, { "epoch": 8.959442099467791, "grad_norm": 0.3951495885848999, "learning_rate": 6.743466654142461e-06, "loss": 0.2494, "num_input_tokens_seen": 105250560, "step": 48820 }, { "epoch": 8.960359699027345, "grad_norm": 59.95658493041992, "learning_rate": 6.7427161331014845e-06, "loss": 0.1283, "num_input_tokens_seen": 105260480, "step": 48825 }, { "epoch": 8.961277298586896, "grad_norm": 8.659576416015625, "learning_rate": 6.74196556736254e-06, "loss": 0.3448, "num_input_tokens_seen": 105271360, "step": 48830 }, { "epoch": 8.962194898146448, "grad_norm": 21.163469314575195, "learning_rate": 6.741214956944879e-06, "loss": 0.0929, "num_input_tokens_seen": 105281632, "step": 48835 }, { "epoch": 8.963112497706001, "grad_norm": 0.4428344964981079, "learning_rate": 6.740464301867753e-06, "loss": 0.0937, "num_input_tokens_seen": 105291968, "step": 48840 }, { "epoch": 8.964030097265553, "grad_norm": 0.26427677273750305, "learning_rate": 6.739713602150416e-06, "loss": 0.2797, "num_input_tokens_seen": 105303232, "step": 48845 }, { "epoch": 8.964947696825105, "grad_norm": 28.834871292114258, "learning_rate": 6.73896285781212e-06, "loss": 0.1829, "num_input_tokens_seen": 105313856, "step": 48850 }, { "epoch": 8.965865296384658, "grad_norm": 28.086637496948242, "learning_rate": 6.738212068872123e-06, "loss": 0.3359, "num_input_tokens_seen": 105324832, "step": 48855 }, { "epoch": 8.96678289594421, "grad_norm": 16.016088485717773, "learning_rate": 6.73746123534968e-06, "loss": 0.3244, "num_input_tokens_seen": 105335488, "step": 48860 }, { "epoch": 8.967700495503761, "grad_norm": 0.7250165939331055, "learning_rate": 6.736710357264049e-06, "loss": 0.0677, "num_input_tokens_seen": 105346112, "step": 48865 }, { "epoch": 8.968618095063315, "grad_norm": 18.803810119628906, "learning_rate": 6.735959434634489e-06, "loss": 0.5257, "num_input_tokens_seen": 105356320, "step": 48870 }, { "epoch": 8.969535694622866, "grad_norm": 3.889866590499878, "learning_rate": 6.73520846748026e-06, "loss": 0.2246, "num_input_tokens_seen": 105368352, "step": 48875 }, { "epoch": 8.970453294182418, "grad_norm": 5.87479829788208, "learning_rate": 6.734457455820623e-06, "loss": 0.2034, "num_input_tokens_seen": 105378880, "step": 48880 }, { "epoch": 8.971370893741971, "grad_norm": 0.5058286786079407, "learning_rate": 6.733706399674841e-06, "loss": 0.0956, "num_input_tokens_seen": 105390400, "step": 48885 }, { "epoch": 8.972288493301523, "grad_norm": 3.8767237663269043, "learning_rate": 6.732955299062176e-06, "loss": 0.3154, "num_input_tokens_seen": 105400960, "step": 48890 }, { "epoch": 8.973206092861075, "grad_norm": 2.308337450027466, "learning_rate": 6.732204154001895e-06, "loss": 0.1048, "num_input_tokens_seen": 105411136, "step": 48895 }, { "epoch": 8.974123692420628, "grad_norm": 0.34558358788490295, "learning_rate": 6.7314529645132595e-06, "loss": 0.1725, "num_input_tokens_seen": 105422912, "step": 48900 }, { "epoch": 8.97504129198018, "grad_norm": 19.623641967773438, "learning_rate": 6.730701730615541e-06, "loss": 0.1043, "num_input_tokens_seen": 105432576, "step": 48905 }, { "epoch": 8.975958891539731, "grad_norm": 5.265310764312744, "learning_rate": 6.729950452328004e-06, "loss": 0.1745, "num_input_tokens_seen": 105444544, "step": 48910 }, { "epoch": 8.976876491099285, "grad_norm": 5.180106163024902, "learning_rate": 6.729199129669921e-06, "loss": 0.2216, "num_input_tokens_seen": 105454528, "step": 48915 }, { "epoch": 8.977794090658836, "grad_norm": 93.89127349853516, "learning_rate": 6.72844776266056e-06, "loss": 0.2561, "num_input_tokens_seen": 105464960, "step": 48920 }, { "epoch": 8.978711690218388, "grad_norm": 53.75203323364258, "learning_rate": 6.727696351319192e-06, "loss": 0.3766, "num_input_tokens_seen": 105474976, "step": 48925 }, { "epoch": 8.979629289777941, "grad_norm": 1.01802396774292, "learning_rate": 6.726944895665091e-06, "loss": 0.2268, "num_input_tokens_seen": 105485600, "step": 48930 }, { "epoch": 8.980546889337493, "grad_norm": 13.050411224365234, "learning_rate": 6.726193395717528e-06, "loss": 0.2451, "num_input_tokens_seen": 105494144, "step": 48935 }, { "epoch": 8.981464488897045, "grad_norm": 0.2784479558467865, "learning_rate": 6.725441851495782e-06, "loss": 0.1246, "num_input_tokens_seen": 105505632, "step": 48940 }, { "epoch": 8.982382088456598, "grad_norm": 14.98910140991211, "learning_rate": 6.724690263019126e-06, "loss": 0.084, "num_input_tokens_seen": 105515104, "step": 48945 }, { "epoch": 8.98329968801615, "grad_norm": 7.460315227508545, "learning_rate": 6.723938630306837e-06, "loss": 0.2975, "num_input_tokens_seen": 105526528, "step": 48950 }, { "epoch": 8.984217287575701, "grad_norm": 64.513916015625, "learning_rate": 6.723186953378195e-06, "loss": 0.4767, "num_input_tokens_seen": 105535712, "step": 48955 }, { "epoch": 8.985134887135255, "grad_norm": 30.576969146728516, "learning_rate": 6.722435232252478e-06, "loss": 0.0265, "num_input_tokens_seen": 105547040, "step": 48960 }, { "epoch": 8.986052486694806, "grad_norm": 9.605542182922363, "learning_rate": 6.721683466948966e-06, "loss": 0.3069, "num_input_tokens_seen": 105556992, "step": 48965 }, { "epoch": 8.986970086254358, "grad_norm": 5.1591339111328125, "learning_rate": 6.720931657486942e-06, "loss": 0.2927, "num_input_tokens_seen": 105568160, "step": 48970 }, { "epoch": 8.987887685813911, "grad_norm": 26.405603408813477, "learning_rate": 6.720179803885688e-06, "loss": 0.2268, "num_input_tokens_seen": 105579104, "step": 48975 }, { "epoch": 8.988805285373463, "grad_norm": 7.593339920043945, "learning_rate": 6.7194279061644885e-06, "loss": 0.1171, "num_input_tokens_seen": 105589376, "step": 48980 }, { "epoch": 8.989722884933014, "grad_norm": 6.012564659118652, "learning_rate": 6.718675964342628e-06, "loss": 0.0823, "num_input_tokens_seen": 105600224, "step": 48985 }, { "epoch": 8.990640484492568, "grad_norm": 0.5464692115783691, "learning_rate": 6.717923978439393e-06, "loss": 0.292, "num_input_tokens_seen": 105611008, "step": 48990 }, { "epoch": 8.99155808405212, "grad_norm": 0.5642911791801453, "learning_rate": 6.717171948474071e-06, "loss": 0.1733, "num_input_tokens_seen": 105622080, "step": 48995 }, { "epoch": 8.992475683611671, "grad_norm": 0.6669049263000488, "learning_rate": 6.716419874465948e-06, "loss": 0.2625, "num_input_tokens_seen": 105633024, "step": 49000 }, { "epoch": 8.993393283171224, "grad_norm": 0.25504070520401, "learning_rate": 6.715667756434316e-06, "loss": 0.138, "num_input_tokens_seen": 105643936, "step": 49005 }, { "epoch": 8.994310882730776, "grad_norm": 24.7802734375, "learning_rate": 6.714915594398466e-06, "loss": 0.2926, "num_input_tokens_seen": 105654816, "step": 49010 }, { "epoch": 8.995228482290328, "grad_norm": 104.37139129638672, "learning_rate": 6.714163388377689e-06, "loss": 0.2046, "num_input_tokens_seen": 105666496, "step": 49015 }, { "epoch": 8.996146081849881, "grad_norm": 0.6172763705253601, "learning_rate": 6.713411138391277e-06, "loss": 0.1295, "num_input_tokens_seen": 105677152, "step": 49020 }, { "epoch": 8.997063681409433, "grad_norm": 0.20535564422607422, "learning_rate": 6.712658844458526e-06, "loss": 0.1657, "num_input_tokens_seen": 105688672, "step": 49025 }, { "epoch": 8.997981280968984, "grad_norm": 28.28862953186035, "learning_rate": 6.711906506598728e-06, "loss": 0.2017, "num_input_tokens_seen": 105699520, "step": 49030 }, { "epoch": 8.998898880528538, "grad_norm": 22.57305908203125, "learning_rate": 6.711154124831183e-06, "loss": 0.2011, "num_input_tokens_seen": 105710432, "step": 49035 }, { "epoch": 8.99981648008809, "grad_norm": 1.2035717964172363, "learning_rate": 6.710401699175187e-06, "loss": 0.3717, "num_input_tokens_seen": 105721248, "step": 49040 }, { "epoch": 9.000734079647641, "grad_norm": 202.7554931640625, "learning_rate": 6.709649229650037e-06, "loss": 0.2541, "num_input_tokens_seen": 105730800, "step": 49045 }, { "epoch": 9.001651679207194, "grad_norm": 39.62700271606445, "learning_rate": 6.708896716275035e-06, "loss": 0.1947, "num_input_tokens_seen": 105742736, "step": 49050 }, { "epoch": 9.002569278766746, "grad_norm": 36.268558502197266, "learning_rate": 6.70814415906948e-06, "loss": 0.1396, "num_input_tokens_seen": 105752624, "step": 49055 }, { "epoch": 9.003486878326298, "grad_norm": 39.4511833190918, "learning_rate": 6.707391558052675e-06, "loss": 0.6693, "num_input_tokens_seen": 105764112, "step": 49060 }, { "epoch": 9.004404477885851, "grad_norm": 37.752960205078125, "learning_rate": 6.706638913243924e-06, "loss": 0.1212, "num_input_tokens_seen": 105774064, "step": 49065 }, { "epoch": 9.005322077445403, "grad_norm": 0.2005835920572281, "learning_rate": 6.705886224662528e-06, "loss": 0.1565, "num_input_tokens_seen": 105784816, "step": 49070 }, { "epoch": 9.006239677004954, "grad_norm": 0.3400339186191559, "learning_rate": 6.7051334923277945e-06, "loss": 0.354, "num_input_tokens_seen": 105794960, "step": 49075 }, { "epoch": 9.007157276564508, "grad_norm": 23.774063110351562, "learning_rate": 6.70438071625903e-06, "loss": 0.3231, "num_input_tokens_seen": 105805104, "step": 49080 }, { "epoch": 9.00807487612406, "grad_norm": 68.54368591308594, "learning_rate": 6.7036278964755404e-06, "loss": 0.3686, "num_input_tokens_seen": 105816112, "step": 49085 }, { "epoch": 9.00899247568361, "grad_norm": 121.65113830566406, "learning_rate": 6.702875032996638e-06, "loss": 0.1999, "num_input_tokens_seen": 105825200, "step": 49090 }, { "epoch": 9.009910075243164, "grad_norm": 21.521068572998047, "learning_rate": 6.702122125841629e-06, "loss": 0.2764, "num_input_tokens_seen": 105835856, "step": 49095 }, { "epoch": 9.010827674802716, "grad_norm": 7.123946666717529, "learning_rate": 6.701369175029826e-06, "loss": 0.1155, "num_input_tokens_seen": 105847440, "step": 49100 }, { "epoch": 9.011745274362267, "grad_norm": 25.31403160095215, "learning_rate": 6.70061618058054e-06, "loss": 0.1314, "num_input_tokens_seen": 105858096, "step": 49105 }, { "epoch": 9.01266287392182, "grad_norm": 0.580936074256897, "learning_rate": 6.699863142513085e-06, "loss": 0.0546, "num_input_tokens_seen": 105867600, "step": 49110 }, { "epoch": 9.013580473481372, "grad_norm": 0.2879772484302521, "learning_rate": 6.699110060846775e-06, "loss": 0.1914, "num_input_tokens_seen": 105877936, "step": 49115 }, { "epoch": 9.014498073040924, "grad_norm": 0.1470874547958374, "learning_rate": 6.698356935600924e-06, "loss": 0.208, "num_input_tokens_seen": 105888784, "step": 49120 }, { "epoch": 9.015415672600477, "grad_norm": 76.0343246459961, "learning_rate": 6.697603766794853e-06, "loss": 0.148, "num_input_tokens_seen": 105899056, "step": 49125 }, { "epoch": 9.016333272160029, "grad_norm": 0.2888229489326477, "learning_rate": 6.696850554447873e-06, "loss": 0.192, "num_input_tokens_seen": 105909104, "step": 49130 }, { "epoch": 9.01725087171958, "grad_norm": 0.10675150156021118, "learning_rate": 6.696097298579308e-06, "loss": 0.1131, "num_input_tokens_seen": 105920080, "step": 49135 }, { "epoch": 9.018168471279134, "grad_norm": 0.3382607102394104, "learning_rate": 6.695343999208477e-06, "loss": 0.1866, "num_input_tokens_seen": 105931120, "step": 49140 }, { "epoch": 9.019086070838686, "grad_norm": 0.49254339933395386, "learning_rate": 6.694590656354698e-06, "loss": 0.1859, "num_input_tokens_seen": 105942736, "step": 49145 }, { "epoch": 9.020003670398237, "grad_norm": 0.11398289352655411, "learning_rate": 6.693837270037296e-06, "loss": 0.0393, "num_input_tokens_seen": 105952784, "step": 49150 }, { "epoch": 9.02092126995779, "grad_norm": 22.081151962280273, "learning_rate": 6.693083840275592e-06, "loss": 0.2576, "num_input_tokens_seen": 105963600, "step": 49155 }, { "epoch": 9.021838869517342, "grad_norm": 0.4126353859901428, "learning_rate": 6.692330367088913e-06, "loss": 0.1348, "num_input_tokens_seen": 105974320, "step": 49160 }, { "epoch": 9.022756469076894, "grad_norm": 5.41163969039917, "learning_rate": 6.691576850496582e-06, "loss": 0.1628, "num_input_tokens_seen": 105985520, "step": 49165 }, { "epoch": 9.023674068636447, "grad_norm": 5.361701965332031, "learning_rate": 6.690823290517926e-06, "loss": 0.0504, "num_input_tokens_seen": 105995824, "step": 49170 }, { "epoch": 9.024591668195999, "grad_norm": 76.72930908203125, "learning_rate": 6.690069687172275e-06, "loss": 0.2455, "num_input_tokens_seen": 106007056, "step": 49175 }, { "epoch": 9.02550926775555, "grad_norm": 11.19260025024414, "learning_rate": 6.689316040478955e-06, "loss": 0.1826, "num_input_tokens_seen": 106017936, "step": 49180 }, { "epoch": 9.026426867315104, "grad_norm": 0.4488185942173004, "learning_rate": 6.688562350457297e-06, "loss": 0.243, "num_input_tokens_seen": 106030192, "step": 49185 }, { "epoch": 9.027344466874656, "grad_norm": 16.300687789916992, "learning_rate": 6.68780861712663e-06, "loss": 0.5542, "num_input_tokens_seen": 106042000, "step": 49190 }, { "epoch": 9.028262066434207, "grad_norm": 0.03961017727851868, "learning_rate": 6.687054840506288e-06, "loss": 0.4677, "num_input_tokens_seen": 106051344, "step": 49195 }, { "epoch": 9.02917966599376, "grad_norm": 0.516991913318634, "learning_rate": 6.686301020615606e-06, "loss": 0.339, "num_input_tokens_seen": 106061488, "step": 49200 }, { "epoch": 9.030097265553312, "grad_norm": 0.23832465708255768, "learning_rate": 6.685547157473916e-06, "loss": 0.2786, "num_input_tokens_seen": 106072048, "step": 49205 }, { "epoch": 9.031014865112864, "grad_norm": 12.959467887878418, "learning_rate": 6.684793251100554e-06, "loss": 0.159, "num_input_tokens_seen": 106083824, "step": 49210 }, { "epoch": 9.031932464672417, "grad_norm": 10.674479484558105, "learning_rate": 6.6840393015148555e-06, "loss": 0.3508, "num_input_tokens_seen": 106093136, "step": 49215 }, { "epoch": 9.032850064231969, "grad_norm": 2.4648377895355225, "learning_rate": 6.6832853087361586e-06, "loss": 0.1509, "num_input_tokens_seen": 106103344, "step": 49220 }, { "epoch": 9.03376766379152, "grad_norm": 1.2645593881607056, "learning_rate": 6.6825312727838035e-06, "loss": 0.1355, "num_input_tokens_seen": 106115120, "step": 49225 }, { "epoch": 9.034685263351074, "grad_norm": 0.03923153877258301, "learning_rate": 6.681777193677128e-06, "loss": 0.1625, "num_input_tokens_seen": 106125840, "step": 49230 }, { "epoch": 9.035602862910626, "grad_norm": 46.765625, "learning_rate": 6.681023071435475e-06, "loss": 0.2522, "num_input_tokens_seen": 106137360, "step": 49235 }, { "epoch": 9.036520462470177, "grad_norm": 13.084019660949707, "learning_rate": 6.680268906078184e-06, "loss": 0.3339, "num_input_tokens_seen": 106148016, "step": 49240 }, { "epoch": 9.03743806202973, "grad_norm": 0.8265621662139893, "learning_rate": 6.679514697624601e-06, "loss": 0.1801, "num_input_tokens_seen": 106157488, "step": 49245 }, { "epoch": 9.038355661589282, "grad_norm": 0.5675358176231384, "learning_rate": 6.678760446094069e-06, "loss": 0.2452, "num_input_tokens_seen": 106168848, "step": 49250 }, { "epoch": 9.039273261148834, "grad_norm": 0.6002575159072876, "learning_rate": 6.678006151505934e-06, "loss": 0.1808, "num_input_tokens_seen": 106180592, "step": 49255 }, { "epoch": 9.040190860708387, "grad_norm": 0.7631350755691528, "learning_rate": 6.67725181387954e-06, "loss": 0.1454, "num_input_tokens_seen": 106191120, "step": 49260 }, { "epoch": 9.041108460267939, "grad_norm": 0.7478806972503662, "learning_rate": 6.676497433234237e-06, "loss": 0.3626, "num_input_tokens_seen": 106202672, "step": 49265 }, { "epoch": 9.04202605982749, "grad_norm": 2.6077537536621094, "learning_rate": 6.675743009589374e-06, "loss": 0.1003, "num_input_tokens_seen": 106214192, "step": 49270 }, { "epoch": 9.042943659387044, "grad_norm": 5.592828273773193, "learning_rate": 6.6749885429643e-06, "loss": 0.1708, "num_input_tokens_seen": 106224848, "step": 49275 }, { "epoch": 9.043861258946595, "grad_norm": 26.819053649902344, "learning_rate": 6.674234033378365e-06, "loss": 0.1799, "num_input_tokens_seen": 106235152, "step": 49280 }, { "epoch": 9.044778858506147, "grad_norm": 19.37751007080078, "learning_rate": 6.673479480850923e-06, "loss": 0.1701, "num_input_tokens_seen": 106246256, "step": 49285 }, { "epoch": 9.0456964580657, "grad_norm": 0.45412200689315796, "learning_rate": 6.672724885401325e-06, "loss": 0.1013, "num_input_tokens_seen": 106256208, "step": 49290 }, { "epoch": 9.046614057625252, "grad_norm": 0.26085367798805237, "learning_rate": 6.6719702470489255e-06, "loss": 0.1154, "num_input_tokens_seen": 106266384, "step": 49295 }, { "epoch": 9.047531657184804, "grad_norm": 1.5462597608566284, "learning_rate": 6.6712155658130815e-06, "loss": 0.2507, "num_input_tokens_seen": 106277328, "step": 49300 }, { "epoch": 9.048449256744357, "grad_norm": 0.0669201910495758, "learning_rate": 6.670460841713149e-06, "loss": 0.0058, "num_input_tokens_seen": 106288784, "step": 49305 }, { "epoch": 9.049366856303909, "grad_norm": 260.3807678222656, "learning_rate": 6.669706074768484e-06, "loss": 0.2736, "num_input_tokens_seen": 106298896, "step": 49310 }, { "epoch": 9.05028445586346, "grad_norm": 2.0259385108947754, "learning_rate": 6.6689512649984454e-06, "loss": 0.0034, "num_input_tokens_seen": 106310448, "step": 49315 }, { "epoch": 9.051202055423014, "grad_norm": 3.2100024223327637, "learning_rate": 6.668196412422395e-06, "loss": 0.3182, "num_input_tokens_seen": 106321392, "step": 49320 }, { "epoch": 9.052119654982565, "grad_norm": 0.32413730025291443, "learning_rate": 6.667441517059692e-06, "loss": 0.4385, "num_input_tokens_seen": 106331344, "step": 49325 }, { "epoch": 9.053037254542117, "grad_norm": 24.148727416992188, "learning_rate": 6.666686578929696e-06, "loss": 0.4149, "num_input_tokens_seen": 106343600, "step": 49330 }, { "epoch": 9.05395485410167, "grad_norm": 0.2781832814216614, "learning_rate": 6.665931598051776e-06, "loss": 0.2226, "num_input_tokens_seen": 106355824, "step": 49335 }, { "epoch": 9.054872453661222, "grad_norm": 47.808719635009766, "learning_rate": 6.665176574445289e-06, "loss": 0.2059, "num_input_tokens_seen": 106367280, "step": 49340 }, { "epoch": 9.055790053220774, "grad_norm": 2.3542628288269043, "learning_rate": 6.664421508129606e-06, "loss": 0.0196, "num_input_tokens_seen": 106378512, "step": 49345 }, { "epoch": 9.056707652780327, "grad_norm": 0.5693297386169434, "learning_rate": 6.663666399124091e-06, "loss": 0.0791, "num_input_tokens_seen": 106390064, "step": 49350 }, { "epoch": 9.057625252339879, "grad_norm": 4.914988994598389, "learning_rate": 6.662911247448111e-06, "loss": 0.1367, "num_input_tokens_seen": 106400560, "step": 49355 }, { "epoch": 9.05854285189943, "grad_norm": 57.82925033569336, "learning_rate": 6.6621560531210335e-06, "loss": 0.1188, "num_input_tokens_seen": 106411056, "step": 49360 }, { "epoch": 9.059460451458984, "grad_norm": 29.137367248535156, "learning_rate": 6.66140081616223e-06, "loss": 0.2744, "num_input_tokens_seen": 106422224, "step": 49365 }, { "epoch": 9.060378051018535, "grad_norm": 0.19431909918785095, "learning_rate": 6.660645536591072e-06, "loss": 0.1494, "num_input_tokens_seen": 106432528, "step": 49370 }, { "epoch": 9.061295650578089, "grad_norm": 0.43157315254211426, "learning_rate": 6.659890214426927e-06, "loss": 0.2475, "num_input_tokens_seen": 106442416, "step": 49375 }, { "epoch": 9.06221325013764, "grad_norm": 5.980693817138672, "learning_rate": 6.659134849689173e-06, "loss": 0.1139, "num_input_tokens_seen": 106452944, "step": 49380 }, { "epoch": 9.063130849697192, "grad_norm": 75.68913269042969, "learning_rate": 6.658379442397181e-06, "loss": 0.6533, "num_input_tokens_seen": 106463728, "step": 49385 }, { "epoch": 9.064048449256745, "grad_norm": 0.11998674273490906, "learning_rate": 6.657623992570325e-06, "loss": 0.2737, "num_input_tokens_seen": 106474416, "step": 49390 }, { "epoch": 9.064966048816297, "grad_norm": 14.933524131774902, "learning_rate": 6.656868500227984e-06, "loss": 0.3782, "num_input_tokens_seen": 106485456, "step": 49395 }, { "epoch": 9.065883648375848, "grad_norm": 0.11396254599094391, "learning_rate": 6.656112965389534e-06, "loss": 0.2121, "num_input_tokens_seen": 106495984, "step": 49400 }, { "epoch": 9.066801247935402, "grad_norm": 33.30820846557617, "learning_rate": 6.6553573880743516e-06, "loss": 0.2727, "num_input_tokens_seen": 106507376, "step": 49405 }, { "epoch": 9.067718847494953, "grad_norm": 0.30244579911231995, "learning_rate": 6.65460176830182e-06, "loss": 0.2534, "num_input_tokens_seen": 106518608, "step": 49410 }, { "epoch": 9.068636447054505, "grad_norm": 0.1770896464586258, "learning_rate": 6.653846106091316e-06, "loss": 0.0873, "num_input_tokens_seen": 106527632, "step": 49415 }, { "epoch": 9.069554046614059, "grad_norm": 117.1689682006836, "learning_rate": 6.6530904014622234e-06, "loss": 0.1114, "num_input_tokens_seen": 106538704, "step": 49420 }, { "epoch": 9.07047164617361, "grad_norm": 5.7395148277282715, "learning_rate": 6.652334654433923e-06, "loss": 0.2632, "num_input_tokens_seen": 106549392, "step": 49425 }, { "epoch": 9.071389245733162, "grad_norm": 1.539260983467102, "learning_rate": 6.6515788650258005e-06, "loss": 0.256, "num_input_tokens_seen": 106560560, "step": 49430 }, { "epoch": 9.072306845292715, "grad_norm": 0.41950753331184387, "learning_rate": 6.65082303325724e-06, "loss": 0.2494, "num_input_tokens_seen": 106571312, "step": 49435 }, { "epoch": 9.073224444852267, "grad_norm": 39.66523742675781, "learning_rate": 6.650067159147626e-06, "loss": 0.2708, "num_input_tokens_seen": 106581680, "step": 49440 }, { "epoch": 9.074142044411818, "grad_norm": 9.430423736572266, "learning_rate": 6.649311242716348e-06, "loss": 0.1621, "num_input_tokens_seen": 106593008, "step": 49445 }, { "epoch": 9.075059643971372, "grad_norm": 0.31505152583122253, "learning_rate": 6.648555283982793e-06, "loss": 0.1512, "num_input_tokens_seen": 106603280, "step": 49450 }, { "epoch": 9.075977243530923, "grad_norm": 0.15456219017505646, "learning_rate": 6.647799282966349e-06, "loss": 0.086, "num_input_tokens_seen": 106615024, "step": 49455 }, { "epoch": 9.076894843090475, "grad_norm": 1.9287127256393433, "learning_rate": 6.647043239686409e-06, "loss": 0.135, "num_input_tokens_seen": 106625808, "step": 49460 }, { "epoch": 9.077812442650028, "grad_norm": 1.265641212463379, "learning_rate": 6.646287154162361e-06, "loss": 0.3382, "num_input_tokens_seen": 106637552, "step": 49465 }, { "epoch": 9.07873004220958, "grad_norm": 36.147979736328125, "learning_rate": 6.6455310264136e-06, "loss": 0.0618, "num_input_tokens_seen": 106647152, "step": 49470 }, { "epoch": 9.079647641769132, "grad_norm": 0.19876740872859955, "learning_rate": 6.644774856459517e-06, "loss": 0.0927, "num_input_tokens_seen": 106656976, "step": 49475 }, { "epoch": 9.080565241328685, "grad_norm": 67.10933685302734, "learning_rate": 6.64401864431951e-06, "loss": 0.515, "num_input_tokens_seen": 106668432, "step": 49480 }, { "epoch": 9.081482840888237, "grad_norm": 21.172414779663086, "learning_rate": 6.643262390012971e-06, "loss": 0.2666, "num_input_tokens_seen": 106679792, "step": 49485 }, { "epoch": 9.082400440447788, "grad_norm": 0.3428875207901001, "learning_rate": 6.642506093559299e-06, "loss": 0.2999, "num_input_tokens_seen": 106690096, "step": 49490 }, { "epoch": 9.083318040007342, "grad_norm": 42.86486053466797, "learning_rate": 6.641749754977892e-06, "loss": 0.3319, "num_input_tokens_seen": 106700784, "step": 49495 }, { "epoch": 9.084235639566893, "grad_norm": 26.739728927612305, "learning_rate": 6.640993374288147e-06, "loss": 0.5898, "num_input_tokens_seen": 106711280, "step": 49500 }, { "epoch": 9.085153239126445, "grad_norm": 0.27996373176574707, "learning_rate": 6.640236951509467e-06, "loss": 0.1873, "num_input_tokens_seen": 106721840, "step": 49505 }, { "epoch": 9.086070838685998, "grad_norm": 0.4404313266277313, "learning_rate": 6.639480486661249e-06, "loss": 0.0187, "num_input_tokens_seen": 106733584, "step": 49510 }, { "epoch": 9.08698843824555, "grad_norm": 2.185455560684204, "learning_rate": 6.638723979762899e-06, "loss": 0.013, "num_input_tokens_seen": 106744848, "step": 49515 }, { "epoch": 9.087906037805102, "grad_norm": 10.703291893005371, "learning_rate": 6.63796743083382e-06, "loss": 0.1424, "num_input_tokens_seen": 106756048, "step": 49520 }, { "epoch": 9.088823637364655, "grad_norm": 0.14082825183868408, "learning_rate": 6.637210839893412e-06, "loss": 0.2125, "num_input_tokens_seen": 106767216, "step": 49525 }, { "epoch": 9.089741236924207, "grad_norm": 0.21297670900821686, "learning_rate": 6.636454206961086e-06, "loss": 0.1114, "num_input_tokens_seen": 106777296, "step": 49530 }, { "epoch": 9.090658836483758, "grad_norm": 0.3881256580352783, "learning_rate": 6.6356975320562445e-06, "loss": 0.1255, "num_input_tokens_seen": 106788720, "step": 49535 }, { "epoch": 9.091576436043312, "grad_norm": 0.29507774114608765, "learning_rate": 6.6349408151982965e-06, "loss": 0.0911, "num_input_tokens_seen": 106799120, "step": 49540 }, { "epoch": 9.092494035602863, "grad_norm": 21.716419219970703, "learning_rate": 6.634184056406652e-06, "loss": 0.267, "num_input_tokens_seen": 106809296, "step": 49545 }, { "epoch": 9.093411635162415, "grad_norm": 0.8863794803619385, "learning_rate": 6.6334272557007175e-06, "loss": 0.0994, "num_input_tokens_seen": 106820112, "step": 49550 }, { "epoch": 9.094329234721968, "grad_norm": 10.980046272277832, "learning_rate": 6.632670413099906e-06, "loss": 0.4153, "num_input_tokens_seen": 106831088, "step": 49555 }, { "epoch": 9.09524683428152, "grad_norm": 0.5918951630592346, "learning_rate": 6.631913528623628e-06, "loss": 0.209, "num_input_tokens_seen": 106841264, "step": 49560 }, { "epoch": 9.096164433841071, "grad_norm": 0.0705617293715477, "learning_rate": 6.631156602291299e-06, "loss": 0.1548, "num_input_tokens_seen": 106852080, "step": 49565 }, { "epoch": 9.097082033400625, "grad_norm": 154.52682495117188, "learning_rate": 6.630399634122331e-06, "loss": 0.1838, "num_input_tokens_seen": 106862224, "step": 49570 }, { "epoch": 9.097999632960176, "grad_norm": 0.2989825904369354, "learning_rate": 6.629642624136138e-06, "loss": 0.1717, "num_input_tokens_seen": 106871920, "step": 49575 }, { "epoch": 9.098917232519728, "grad_norm": 15.945183753967285, "learning_rate": 6.628885572352139e-06, "loss": 0.1126, "num_input_tokens_seen": 106883632, "step": 49580 }, { "epoch": 9.099834832079281, "grad_norm": 18.54706382751465, "learning_rate": 6.628128478789747e-06, "loss": 0.078, "num_input_tokens_seen": 106894192, "step": 49585 }, { "epoch": 9.100752431638833, "grad_norm": 0.4251696765422821, "learning_rate": 6.627371343468385e-06, "loss": 0.034, "num_input_tokens_seen": 106905072, "step": 49590 }, { "epoch": 9.101670031198385, "grad_norm": 10.212345123291016, "learning_rate": 6.6266141664074704e-06, "loss": 0.1899, "num_input_tokens_seen": 106916976, "step": 49595 }, { "epoch": 9.102587630757938, "grad_norm": 7.454704284667969, "learning_rate": 6.625856947626421e-06, "loss": 0.2672, "num_input_tokens_seen": 106928944, "step": 49600 }, { "epoch": 9.10350523031749, "grad_norm": 3.837185859680176, "learning_rate": 6.625099687144664e-06, "loss": 0.1462, "num_input_tokens_seen": 106941648, "step": 49605 }, { "epoch": 9.104422829877041, "grad_norm": 71.99468994140625, "learning_rate": 6.624342384981617e-06, "loss": 0.3516, "num_input_tokens_seen": 106952336, "step": 49610 }, { "epoch": 9.105340429436595, "grad_norm": 0.3342324197292328, "learning_rate": 6.623585041156706e-06, "loss": 0.0177, "num_input_tokens_seen": 106961648, "step": 49615 }, { "epoch": 9.106258028996146, "grad_norm": 12.728874206542969, "learning_rate": 6.622827655689353e-06, "loss": 0.2653, "num_input_tokens_seen": 106972464, "step": 49620 }, { "epoch": 9.107175628555698, "grad_norm": 11.025938034057617, "learning_rate": 6.622070228598987e-06, "loss": 0.2774, "num_input_tokens_seen": 106983600, "step": 49625 }, { "epoch": 9.108093228115251, "grad_norm": 2.5726630687713623, "learning_rate": 6.621312759905035e-06, "loss": 0.2506, "num_input_tokens_seen": 106994864, "step": 49630 }, { "epoch": 9.109010827674803, "grad_norm": 1.1951727867126465, "learning_rate": 6.6205552496269225e-06, "loss": 0.0921, "num_input_tokens_seen": 107005648, "step": 49635 }, { "epoch": 9.109928427234355, "grad_norm": 0.26459744572639465, "learning_rate": 6.619797697784079e-06, "loss": 0.2182, "num_input_tokens_seen": 107016592, "step": 49640 }, { "epoch": 9.110846026793908, "grad_norm": 1.673198938369751, "learning_rate": 6.619040104395935e-06, "loss": 0.0908, "num_input_tokens_seen": 107026576, "step": 49645 }, { "epoch": 9.11176362635346, "grad_norm": 14.131970405578613, "learning_rate": 6.618282469481922e-06, "loss": 0.2776, "num_input_tokens_seen": 107037296, "step": 49650 }, { "epoch": 9.112681225913011, "grad_norm": 17.43274688720703, "learning_rate": 6.617524793061473e-06, "loss": 0.3584, "num_input_tokens_seen": 107049136, "step": 49655 }, { "epoch": 9.113598825472565, "grad_norm": 0.16816489398479462, "learning_rate": 6.616767075154018e-06, "loss": 0.1086, "num_input_tokens_seen": 107059152, "step": 49660 }, { "epoch": 9.114516425032116, "grad_norm": 60.78178405761719, "learning_rate": 6.616009315778995e-06, "loss": 0.2114, "num_input_tokens_seen": 107071088, "step": 49665 }, { "epoch": 9.115434024591668, "grad_norm": 40.3692626953125, "learning_rate": 6.615251514955837e-06, "loss": 0.1486, "num_input_tokens_seen": 107080976, "step": 49670 }, { "epoch": 9.116351624151221, "grad_norm": 6.851154327392578, "learning_rate": 6.61449367270398e-06, "loss": 0.4019, "num_input_tokens_seen": 107090256, "step": 49675 }, { "epoch": 9.117269223710773, "grad_norm": 20.76548957824707, "learning_rate": 6.613735789042864e-06, "loss": 0.2935, "num_input_tokens_seen": 107100880, "step": 49680 }, { "epoch": 9.118186823270324, "grad_norm": 1.2571430206298828, "learning_rate": 6.6129778639919254e-06, "loss": 0.1518, "num_input_tokens_seen": 107111216, "step": 49685 }, { "epoch": 9.119104422829878, "grad_norm": 0.4936884343624115, "learning_rate": 6.612219897570604e-06, "loss": 0.055, "num_input_tokens_seen": 107122480, "step": 49690 }, { "epoch": 9.12002202238943, "grad_norm": 0.1279589980840683, "learning_rate": 6.6114618897983415e-06, "loss": 0.0972, "num_input_tokens_seen": 107132048, "step": 49695 }, { "epoch": 9.120939621948981, "grad_norm": 0.8501405119895935, "learning_rate": 6.610703840694579e-06, "loss": 0.1884, "num_input_tokens_seen": 107143600, "step": 49700 }, { "epoch": 9.121857221508535, "grad_norm": 0.43976160883903503, "learning_rate": 6.609945750278759e-06, "loss": 0.1709, "num_input_tokens_seen": 107154352, "step": 49705 }, { "epoch": 9.122774821068086, "grad_norm": 6.594326019287109, "learning_rate": 6.609187618570327e-06, "loss": 0.3688, "num_input_tokens_seen": 107163824, "step": 49710 }, { "epoch": 9.123692420627638, "grad_norm": 29.415996551513672, "learning_rate": 6.608429445588725e-06, "loss": 0.4573, "num_input_tokens_seen": 107174480, "step": 49715 }, { "epoch": 9.124610020187191, "grad_norm": 0.1557459682226181, "learning_rate": 6.6076712313534006e-06, "loss": 0.0482, "num_input_tokens_seen": 107184080, "step": 49720 }, { "epoch": 9.125527619746743, "grad_norm": 0.2952401340007782, "learning_rate": 6.606912975883801e-06, "loss": 0.0726, "num_input_tokens_seen": 107194800, "step": 49725 }, { "epoch": 9.126445219306294, "grad_norm": 123.26498413085938, "learning_rate": 6.606154679199375e-06, "loss": 0.1417, "num_input_tokens_seen": 107204784, "step": 49730 }, { "epoch": 9.127362818865848, "grad_norm": 20.64110565185547, "learning_rate": 6.60539634131957e-06, "loss": 0.2051, "num_input_tokens_seen": 107215696, "step": 49735 }, { "epoch": 9.1282804184254, "grad_norm": 0.9648005366325378, "learning_rate": 6.604637962263838e-06, "loss": 0.0264, "num_input_tokens_seen": 107226288, "step": 49740 }, { "epoch": 9.129198017984951, "grad_norm": 10.898307800292969, "learning_rate": 6.603879542051628e-06, "loss": 0.0208, "num_input_tokens_seen": 107236944, "step": 49745 }, { "epoch": 9.130115617544504, "grad_norm": 10.206061363220215, "learning_rate": 6.6031210807023925e-06, "loss": 0.3454, "num_input_tokens_seen": 107247024, "step": 49750 }, { "epoch": 9.131033217104056, "grad_norm": 0.2521689832210541, "learning_rate": 6.602362578235588e-06, "loss": 0.1395, "num_input_tokens_seen": 107258160, "step": 49755 }, { "epoch": 9.131950816663608, "grad_norm": 27.531620025634766, "learning_rate": 6.601604034670667e-06, "loss": 0.473, "num_input_tokens_seen": 107270736, "step": 49760 }, { "epoch": 9.132868416223161, "grad_norm": 21.47064208984375, "learning_rate": 6.600845450027085e-06, "loss": 0.1371, "num_input_tokens_seen": 107281296, "step": 49765 }, { "epoch": 9.133786015782713, "grad_norm": 25.99781608581543, "learning_rate": 6.600086824324295e-06, "loss": 0.1075, "num_input_tokens_seen": 107291376, "step": 49770 }, { "epoch": 9.134703615342264, "grad_norm": 150.4736785888672, "learning_rate": 6.599328157581762e-06, "loss": 0.3734, "num_input_tokens_seen": 107301424, "step": 49775 }, { "epoch": 9.135621214901818, "grad_norm": 31.410436630249023, "learning_rate": 6.598569449818939e-06, "loss": 0.3741, "num_input_tokens_seen": 107312560, "step": 49780 }, { "epoch": 9.13653881446137, "grad_norm": 19.896339416503906, "learning_rate": 6.597810701055286e-06, "loss": 0.1019, "num_input_tokens_seen": 107323856, "step": 49785 }, { "epoch": 9.137456414020921, "grad_norm": 0.10181759297847748, "learning_rate": 6.597051911310266e-06, "loss": 0.2586, "num_input_tokens_seen": 107334096, "step": 49790 }, { "epoch": 9.138374013580474, "grad_norm": 0.2588094472885132, "learning_rate": 6.596293080603338e-06, "loss": 0.2841, "num_input_tokens_seen": 107344432, "step": 49795 }, { "epoch": 9.139291613140026, "grad_norm": 6.75532865524292, "learning_rate": 6.595534208953969e-06, "loss": 0.2154, "num_input_tokens_seen": 107354128, "step": 49800 }, { "epoch": 9.140209212699578, "grad_norm": 0.033512067049741745, "learning_rate": 6.594775296381619e-06, "loss": 0.1068, "num_input_tokens_seen": 107365456, "step": 49805 }, { "epoch": 9.141126812259131, "grad_norm": 68.91947174072266, "learning_rate": 6.5940163429057544e-06, "loss": 0.2849, "num_input_tokens_seen": 107376368, "step": 49810 }, { "epoch": 9.142044411818683, "grad_norm": 71.93878936767578, "learning_rate": 6.5932573485458415e-06, "loss": 0.5773, "num_input_tokens_seen": 107387888, "step": 49815 }, { "epoch": 9.142962011378234, "grad_norm": 110.89867401123047, "learning_rate": 6.5924983133213475e-06, "loss": 0.1669, "num_input_tokens_seen": 107397808, "step": 49820 }, { "epoch": 9.143879610937788, "grad_norm": 23.047361373901367, "learning_rate": 6.591739237251739e-06, "loss": 0.0651, "num_input_tokens_seen": 107408624, "step": 49825 }, { "epoch": 9.14479721049734, "grad_norm": 32.15603256225586, "learning_rate": 6.590980120356485e-06, "loss": 0.3071, "num_input_tokens_seen": 107418672, "step": 49830 }, { "epoch": 9.14571481005689, "grad_norm": 0.24127016961574554, "learning_rate": 6.5902209626550585e-06, "loss": 0.3987, "num_input_tokens_seen": 107428272, "step": 49835 }, { "epoch": 9.146632409616444, "grad_norm": 17.83673858642578, "learning_rate": 6.589461764166929e-06, "loss": 0.2293, "num_input_tokens_seen": 107439824, "step": 49840 }, { "epoch": 9.147550009175996, "grad_norm": 1.5586084127426147, "learning_rate": 6.588702524911566e-06, "loss": 0.2393, "num_input_tokens_seen": 107450000, "step": 49845 }, { "epoch": 9.148467608735547, "grad_norm": 58.44391632080078, "learning_rate": 6.587943244908449e-06, "loss": 0.2389, "num_input_tokens_seen": 107459888, "step": 49850 }, { "epoch": 9.1493852082951, "grad_norm": 0.13342314958572388, "learning_rate": 6.587183924177048e-06, "loss": 0.1007, "num_input_tokens_seen": 107470160, "step": 49855 }, { "epoch": 9.150302807854652, "grad_norm": 12.575285911560059, "learning_rate": 6.5864245627368375e-06, "loss": 0.2078, "num_input_tokens_seen": 107480784, "step": 49860 }, { "epoch": 9.151220407414204, "grad_norm": 24.513132095336914, "learning_rate": 6.585665160607297e-06, "loss": 0.3095, "num_input_tokens_seen": 107492208, "step": 49865 }, { "epoch": 9.152138006973757, "grad_norm": 8.167261123657227, "learning_rate": 6.5849057178079014e-06, "loss": 0.1913, "num_input_tokens_seen": 107503184, "step": 49870 }, { "epoch": 9.153055606533309, "grad_norm": 0.7187064290046692, "learning_rate": 6.584146234358133e-06, "loss": 0.1273, "num_input_tokens_seen": 107514512, "step": 49875 }, { "epoch": 9.15397320609286, "grad_norm": 0.9291367530822754, "learning_rate": 6.583386710277467e-06, "loss": 0.0279, "num_input_tokens_seen": 107526160, "step": 49880 }, { "epoch": 9.154890805652414, "grad_norm": 6.18255615234375, "learning_rate": 6.582627145585387e-06, "loss": 0.1297, "num_input_tokens_seen": 107537904, "step": 49885 }, { "epoch": 9.155808405211966, "grad_norm": 0.5740725994110107, "learning_rate": 6.5818675403013735e-06, "loss": 0.1738, "num_input_tokens_seen": 107548688, "step": 49890 }, { "epoch": 9.156726004771517, "grad_norm": 25.84664535522461, "learning_rate": 6.581107894444908e-06, "loss": 0.2846, "num_input_tokens_seen": 107560304, "step": 49895 }, { "epoch": 9.15764360433107, "grad_norm": 0.09732766449451447, "learning_rate": 6.580348208035476e-06, "loss": 0.0569, "num_input_tokens_seen": 107571184, "step": 49900 }, { "epoch": 9.158561203890622, "grad_norm": 15.630026817321777, "learning_rate": 6.579588481092563e-06, "loss": 0.0747, "num_input_tokens_seen": 107582096, "step": 49905 }, { "epoch": 9.159478803450174, "grad_norm": 24.41004180908203, "learning_rate": 6.578828713635652e-06, "loss": 0.543, "num_input_tokens_seen": 107593072, "step": 49910 }, { "epoch": 9.160396403009727, "grad_norm": 74.00625610351562, "learning_rate": 6.5780689056842314e-06, "loss": 0.5828, "num_input_tokens_seen": 107604272, "step": 49915 }, { "epoch": 9.161314002569279, "grad_norm": 0.7673946619033813, "learning_rate": 6.57730905725779e-06, "loss": 0.4689, "num_input_tokens_seen": 107613968, "step": 49920 }, { "epoch": 9.16223160212883, "grad_norm": 0.27931225299835205, "learning_rate": 6.576549168375817e-06, "loss": 0.0431, "num_input_tokens_seen": 107624880, "step": 49925 }, { "epoch": 9.163149201688384, "grad_norm": 8.14447021484375, "learning_rate": 6.575789239057799e-06, "loss": 0.3059, "num_input_tokens_seen": 107635184, "step": 49930 }, { "epoch": 9.164066801247936, "grad_norm": 59.82026672363281, "learning_rate": 6.575029269323231e-06, "loss": 0.404, "num_input_tokens_seen": 107646288, "step": 49935 }, { "epoch": 9.164984400807487, "grad_norm": 22.564617156982422, "learning_rate": 6.5742692591916025e-06, "loss": 0.239, "num_input_tokens_seen": 107657072, "step": 49940 }, { "epoch": 9.16590200036704, "grad_norm": 8.604001998901367, "learning_rate": 6.573509208682407e-06, "loss": 0.2556, "num_input_tokens_seen": 107667120, "step": 49945 }, { "epoch": 9.166819599926592, "grad_norm": 28.38437271118164, "learning_rate": 6.57274911781514e-06, "loss": 0.114, "num_input_tokens_seen": 107677936, "step": 49950 }, { "epoch": 9.167737199486144, "grad_norm": 52.23186492919922, "learning_rate": 6.571988986609295e-06, "loss": 0.1258, "num_input_tokens_seen": 107688976, "step": 49955 }, { "epoch": 9.168654799045697, "grad_norm": 0.32614001631736755, "learning_rate": 6.5712288150843695e-06, "loss": 0.1205, "num_input_tokens_seen": 107698736, "step": 49960 }, { "epoch": 9.169572398605249, "grad_norm": 0.6954612731933594, "learning_rate": 6.5704686032598584e-06, "loss": 0.0977, "num_input_tokens_seen": 107709136, "step": 49965 }, { "epoch": 9.1704899981648, "grad_norm": 7.116802215576172, "learning_rate": 6.569708351155263e-06, "loss": 0.1352, "num_input_tokens_seen": 107719376, "step": 49970 }, { "epoch": 9.171407597724354, "grad_norm": 0.9863874912261963, "learning_rate": 6.568948058790081e-06, "loss": 0.0372, "num_input_tokens_seen": 107732176, "step": 49975 }, { "epoch": 9.172325197283905, "grad_norm": 14.73297119140625, "learning_rate": 6.568187726183812e-06, "loss": 0.29, "num_input_tokens_seen": 107743984, "step": 49980 }, { "epoch": 9.173242796843457, "grad_norm": 2.121845245361328, "learning_rate": 6.567427353355961e-06, "loss": 0.745, "num_input_tokens_seen": 107754064, "step": 49985 }, { "epoch": 9.17416039640301, "grad_norm": 1.7092927694320679, "learning_rate": 6.566666940326026e-06, "loss": 0.148, "num_input_tokens_seen": 107764336, "step": 49990 }, { "epoch": 9.175077995962562, "grad_norm": 44.053184509277344, "learning_rate": 6.565906487113511e-06, "loss": 0.3263, "num_input_tokens_seen": 107775344, "step": 49995 }, { "epoch": 9.175995595522114, "grad_norm": 17.117403030395508, "learning_rate": 6.565145993737924e-06, "loss": 0.324, "num_input_tokens_seen": 107785200, "step": 50000 }, { "epoch": 9.176913195081667, "grad_norm": 21.375093460083008, "learning_rate": 6.564385460218766e-06, "loss": 0.3568, "num_input_tokens_seen": 107795728, "step": 50005 }, { "epoch": 9.177830794641219, "grad_norm": 68.8187484741211, "learning_rate": 6.563624886575547e-06, "loss": 0.2222, "num_input_tokens_seen": 107807600, "step": 50010 }, { "epoch": 9.17874839420077, "grad_norm": 0.5705974698066711, "learning_rate": 6.562864272827772e-06, "loss": 0.2034, "num_input_tokens_seen": 107818352, "step": 50015 }, { "epoch": 9.179665993760324, "grad_norm": 0.19395719468593597, "learning_rate": 6.562103618994951e-06, "loss": 0.1492, "num_input_tokens_seen": 107829648, "step": 50020 }, { "epoch": 9.180583593319875, "grad_norm": 147.50579833984375, "learning_rate": 6.561342925096592e-06, "loss": 0.0897, "num_input_tokens_seen": 107841136, "step": 50025 }, { "epoch": 9.181501192879427, "grad_norm": 18.54852867126465, "learning_rate": 6.560582191152207e-06, "loss": 0.209, "num_input_tokens_seen": 107851728, "step": 50030 }, { "epoch": 9.18241879243898, "grad_norm": 58.39794921875, "learning_rate": 6.55982141718131e-06, "loss": 0.1576, "num_input_tokens_seen": 107861104, "step": 50035 }, { "epoch": 9.183336391998532, "grad_norm": 22.132761001586914, "learning_rate": 6.559060603203409e-06, "loss": 0.2667, "num_input_tokens_seen": 107871120, "step": 50040 }, { "epoch": 9.184253991558084, "grad_norm": 19.114500045776367, "learning_rate": 6.55829974923802e-06, "loss": 0.4796, "num_input_tokens_seen": 107881104, "step": 50045 }, { "epoch": 9.185171591117637, "grad_norm": 29.868587493896484, "learning_rate": 6.557538855304658e-06, "loss": 0.0691, "num_input_tokens_seen": 107892560, "step": 50050 }, { "epoch": 9.186089190677189, "grad_norm": 69.03948211669922, "learning_rate": 6.5567779214228375e-06, "loss": 0.1204, "num_input_tokens_seen": 107902928, "step": 50055 }, { "epoch": 9.18700679023674, "grad_norm": 91.46784210205078, "learning_rate": 6.556016947612078e-06, "loss": 0.2796, "num_input_tokens_seen": 107915344, "step": 50060 }, { "epoch": 9.187924389796294, "grad_norm": 2.670194625854492, "learning_rate": 6.555255933891893e-06, "loss": 0.3835, "num_input_tokens_seen": 107925392, "step": 50065 }, { "epoch": 9.188841989355845, "grad_norm": 9.756869316101074, "learning_rate": 6.554494880281805e-06, "loss": 0.1259, "num_input_tokens_seen": 107936624, "step": 50070 }, { "epoch": 9.189759588915397, "grad_norm": 0.16119179129600525, "learning_rate": 6.553733786801333e-06, "loss": 0.3664, "num_input_tokens_seen": 107948208, "step": 50075 }, { "epoch": 9.19067718847495, "grad_norm": 0.3068768084049225, "learning_rate": 6.552972653469997e-06, "loss": 0.3923, "num_input_tokens_seen": 107960496, "step": 50080 }, { "epoch": 9.191594788034502, "grad_norm": 28.97663688659668, "learning_rate": 6.55221148030732e-06, "loss": 0.1871, "num_input_tokens_seen": 107971152, "step": 50085 }, { "epoch": 9.192512387594054, "grad_norm": 1.243949294090271, "learning_rate": 6.551450267332823e-06, "loss": 0.1536, "num_input_tokens_seen": 107981744, "step": 50090 }, { "epoch": 9.193429987153607, "grad_norm": 27.231372833251953, "learning_rate": 6.5506890145660314e-06, "loss": 0.2617, "num_input_tokens_seen": 107992656, "step": 50095 }, { "epoch": 9.194347586713159, "grad_norm": 0.9989280700683594, "learning_rate": 6.54992772202647e-06, "loss": 0.1756, "num_input_tokens_seen": 108003216, "step": 50100 }, { "epoch": 9.19526518627271, "grad_norm": 0.12510095536708832, "learning_rate": 6.549166389733665e-06, "loss": 0.1371, "num_input_tokens_seen": 108013680, "step": 50105 }, { "epoch": 9.196182785832264, "grad_norm": 155.0074005126953, "learning_rate": 6.548405017707144e-06, "loss": 0.2336, "num_input_tokens_seen": 108025552, "step": 50110 }, { "epoch": 9.197100385391815, "grad_norm": 0.6827141046524048, "learning_rate": 6.5476436059664336e-06, "loss": 0.3005, "num_input_tokens_seen": 108036784, "step": 50115 }, { "epoch": 9.198017984951367, "grad_norm": 0.614418625831604, "learning_rate": 6.546882154531064e-06, "loss": 0.2644, "num_input_tokens_seen": 108048208, "step": 50120 }, { "epoch": 9.19893558451092, "grad_norm": 1.2315794229507446, "learning_rate": 6.546120663420562e-06, "loss": 0.2129, "num_input_tokens_seen": 108058576, "step": 50125 }, { "epoch": 9.199853184070472, "grad_norm": 0.5488136410713196, "learning_rate": 6.545359132654463e-06, "loss": 0.0649, "num_input_tokens_seen": 108070096, "step": 50130 }, { "epoch": 9.200770783630023, "grad_norm": 8.052865982055664, "learning_rate": 6.5445975622522975e-06, "loss": 0.3645, "num_input_tokens_seen": 108080752, "step": 50135 }, { "epoch": 9.201688383189577, "grad_norm": 43.13269805908203, "learning_rate": 6.543835952233597e-06, "loss": 0.0407, "num_input_tokens_seen": 108092336, "step": 50140 }, { "epoch": 9.202605982749128, "grad_norm": 0.18766288459300995, "learning_rate": 6.543074302617899e-06, "loss": 0.2166, "num_input_tokens_seen": 108102960, "step": 50145 }, { "epoch": 9.20352358230868, "grad_norm": 15.743634223937988, "learning_rate": 6.542312613424735e-06, "loss": 0.2381, "num_input_tokens_seen": 108114448, "step": 50150 }, { "epoch": 9.204441181868233, "grad_norm": 0.15213952958583832, "learning_rate": 6.541550884673643e-06, "loss": 0.3049, "num_input_tokens_seen": 108125744, "step": 50155 }, { "epoch": 9.205358781427785, "grad_norm": 0.2509092390537262, "learning_rate": 6.540789116384162e-06, "loss": 0.0892, "num_input_tokens_seen": 108136048, "step": 50160 }, { "epoch": 9.206276380987337, "grad_norm": 0.40996474027633667, "learning_rate": 6.540027308575826e-06, "loss": 0.1173, "num_input_tokens_seen": 108146992, "step": 50165 }, { "epoch": 9.20719398054689, "grad_norm": 0.12174802273511887, "learning_rate": 6.539265461268178e-06, "loss": 0.2473, "num_input_tokens_seen": 108156784, "step": 50170 }, { "epoch": 9.208111580106442, "grad_norm": 20.194997787475586, "learning_rate": 6.5385035744807545e-06, "loss": 0.6169, "num_input_tokens_seen": 108167984, "step": 50175 }, { "epoch": 9.209029179665993, "grad_norm": 131.3778533935547, "learning_rate": 6.5377416482331005e-06, "loss": 0.2693, "num_input_tokens_seen": 108178288, "step": 50180 }, { "epoch": 9.209946779225547, "grad_norm": 33.20677947998047, "learning_rate": 6.536979682544755e-06, "loss": 0.4772, "num_input_tokens_seen": 108189616, "step": 50185 }, { "epoch": 9.210864378785098, "grad_norm": 0.2629305124282837, "learning_rate": 6.536217677435264e-06, "loss": 0.2373, "num_input_tokens_seen": 108200080, "step": 50190 }, { "epoch": 9.21178197834465, "grad_norm": 109.53851318359375, "learning_rate": 6.53545563292417e-06, "loss": 0.2178, "num_input_tokens_seen": 108211024, "step": 50195 }, { "epoch": 9.212699577904203, "grad_norm": 0.1943480372428894, "learning_rate": 6.534693549031019e-06, "loss": 0.1545, "num_input_tokens_seen": 108221872, "step": 50200 }, { "epoch": 9.213617177463755, "grad_norm": 1.061552882194519, "learning_rate": 6.533931425775357e-06, "loss": 0.2415, "num_input_tokens_seen": 108233136, "step": 50205 }, { "epoch": 9.214534777023307, "grad_norm": 15.697077751159668, "learning_rate": 6.533169263176733e-06, "loss": 0.5592, "num_input_tokens_seen": 108244464, "step": 50210 }, { "epoch": 9.21545237658286, "grad_norm": 39.02156066894531, "learning_rate": 6.5324070612546905e-06, "loss": 0.4087, "num_input_tokens_seen": 108255184, "step": 50215 }, { "epoch": 9.216369976142412, "grad_norm": 0.7593382000923157, "learning_rate": 6.531644820028784e-06, "loss": 0.1711, "num_input_tokens_seen": 108265776, "step": 50220 }, { "epoch": 9.217287575701963, "grad_norm": 0.5089708566665649, "learning_rate": 6.530882539518562e-06, "loss": 0.0183, "num_input_tokens_seen": 108275536, "step": 50225 }, { "epoch": 9.218205175261517, "grad_norm": 25.1219425201416, "learning_rate": 6.530120219743574e-06, "loss": 0.1796, "num_input_tokens_seen": 108285904, "step": 50230 }, { "epoch": 9.219122774821068, "grad_norm": 135.7313232421875, "learning_rate": 6.529357860723374e-06, "loss": 0.2639, "num_input_tokens_seen": 108296560, "step": 50235 }, { "epoch": 9.22004037438062, "grad_norm": 8.809819221496582, "learning_rate": 6.528595462477515e-06, "loss": 0.2969, "num_input_tokens_seen": 108307248, "step": 50240 }, { "epoch": 9.220957973940173, "grad_norm": 0.37707430124282837, "learning_rate": 6.527833025025553e-06, "loss": 0.0849, "num_input_tokens_seen": 108317008, "step": 50245 }, { "epoch": 9.221875573499725, "grad_norm": 6.92089319229126, "learning_rate": 6.52707054838704e-06, "loss": 0.1504, "num_input_tokens_seen": 108326736, "step": 50250 }, { "epoch": 9.222793173059276, "grad_norm": 0.1953650712966919, "learning_rate": 6.5263080325815356e-06, "loss": 0.4235, "num_input_tokens_seen": 108338544, "step": 50255 }, { "epoch": 9.22371077261883, "grad_norm": 0.4444701373577118, "learning_rate": 6.525545477628594e-06, "loss": 0.2237, "num_input_tokens_seen": 108349104, "step": 50260 }, { "epoch": 9.224628372178381, "grad_norm": 0.2754751145839691, "learning_rate": 6.524782883547777e-06, "loss": 0.1067, "num_input_tokens_seen": 108359248, "step": 50265 }, { "epoch": 9.225545971737933, "grad_norm": 66.74954223632812, "learning_rate": 6.5240202503586415e-06, "loss": 0.0917, "num_input_tokens_seen": 108369744, "step": 50270 }, { "epoch": 9.226463571297487, "grad_norm": 22.16542625427246, "learning_rate": 6.5232575780807484e-06, "loss": 0.0493, "num_input_tokens_seen": 108381296, "step": 50275 }, { "epoch": 9.227381170857038, "grad_norm": 36.203731536865234, "learning_rate": 6.522494866733661e-06, "loss": 0.3814, "num_input_tokens_seen": 108391440, "step": 50280 }, { "epoch": 9.22829877041659, "grad_norm": 0.07099103927612305, "learning_rate": 6.521732116336938e-06, "loss": 0.086, "num_input_tokens_seen": 108401968, "step": 50285 }, { "epoch": 9.229216369976143, "grad_norm": 42.942596435546875, "learning_rate": 6.5209693269101435e-06, "loss": 0.3965, "num_input_tokens_seen": 108413712, "step": 50290 }, { "epoch": 9.230133969535695, "grad_norm": 0.1856914609670639, "learning_rate": 6.520206498472846e-06, "loss": 0.0323, "num_input_tokens_seen": 108424336, "step": 50295 }, { "epoch": 9.231051569095246, "grad_norm": 6.650506973266602, "learning_rate": 6.519443631044607e-06, "loss": 0.0442, "num_input_tokens_seen": 108435024, "step": 50300 }, { "epoch": 9.2319691686548, "grad_norm": 10.63240909576416, "learning_rate": 6.5186807246449935e-06, "loss": 0.2351, "num_input_tokens_seen": 108447536, "step": 50305 }, { "epoch": 9.232886768214351, "grad_norm": 59.17129898071289, "learning_rate": 6.517917779293572e-06, "loss": 0.2625, "num_input_tokens_seen": 108458928, "step": 50310 }, { "epoch": 9.233804367773903, "grad_norm": 0.5854228734970093, "learning_rate": 6.517154795009914e-06, "loss": 0.4294, "num_input_tokens_seen": 108470224, "step": 50315 }, { "epoch": 9.234721967333456, "grad_norm": 4.76383113861084, "learning_rate": 6.516391771813587e-06, "loss": 0.4455, "num_input_tokens_seen": 108481552, "step": 50320 }, { "epoch": 9.235639566893008, "grad_norm": 16.787660598754883, "learning_rate": 6.51562870972416e-06, "loss": 0.0307, "num_input_tokens_seen": 108493040, "step": 50325 }, { "epoch": 9.23655716645256, "grad_norm": 132.55963134765625, "learning_rate": 6.514865608761206e-06, "loss": 0.2468, "num_input_tokens_seen": 108504304, "step": 50330 }, { "epoch": 9.237474766012113, "grad_norm": 1.046190857887268, "learning_rate": 6.514102468944297e-06, "loss": 0.0941, "num_input_tokens_seen": 108514544, "step": 50335 }, { "epoch": 9.238392365571665, "grad_norm": 0.250939279794693, "learning_rate": 6.513339290293005e-06, "loss": 0.2379, "num_input_tokens_seen": 108525776, "step": 50340 }, { "epoch": 9.239309965131216, "grad_norm": 21.740354537963867, "learning_rate": 6.512576072826907e-06, "loss": 0.2413, "num_input_tokens_seen": 108536304, "step": 50345 }, { "epoch": 9.24022756469077, "grad_norm": 30.11273193359375, "learning_rate": 6.5118128165655766e-06, "loss": 0.1446, "num_input_tokens_seen": 108547920, "step": 50350 }, { "epoch": 9.241145164250321, "grad_norm": 45.131553649902344, "learning_rate": 6.511049521528592e-06, "loss": 0.1474, "num_input_tokens_seen": 108559312, "step": 50355 }, { "epoch": 9.242062763809873, "grad_norm": 36.67545700073242, "learning_rate": 6.510286187735527e-06, "loss": 0.2352, "num_input_tokens_seen": 108570128, "step": 50360 }, { "epoch": 9.242980363369426, "grad_norm": 39.4826774597168, "learning_rate": 6.509522815205962e-06, "loss": 0.1695, "num_input_tokens_seen": 108581168, "step": 50365 }, { "epoch": 9.243897962928978, "grad_norm": 2.9220008850097656, "learning_rate": 6.508759403959478e-06, "loss": 0.0267, "num_input_tokens_seen": 108592976, "step": 50370 }, { "epoch": 9.24481556248853, "grad_norm": 10.783160209655762, "learning_rate": 6.507995954015654e-06, "loss": 0.2142, "num_input_tokens_seen": 108603632, "step": 50375 }, { "epoch": 9.245733162048083, "grad_norm": 1.0581310987472534, "learning_rate": 6.507232465394069e-06, "loss": 0.3478, "num_input_tokens_seen": 108613648, "step": 50380 }, { "epoch": 9.246650761607635, "grad_norm": 96.67570495605469, "learning_rate": 6.506468938114307e-06, "loss": 0.2798, "num_input_tokens_seen": 108625360, "step": 50385 }, { "epoch": 9.247568361167186, "grad_norm": 0.14247767627239227, "learning_rate": 6.505705372195954e-06, "loss": 0.2202, "num_input_tokens_seen": 108635184, "step": 50390 }, { "epoch": 9.24848596072674, "grad_norm": 0.5824381113052368, "learning_rate": 6.50494176765859e-06, "loss": 0.0094, "num_input_tokens_seen": 108647408, "step": 50395 }, { "epoch": 9.249403560286291, "grad_norm": 53.33034896850586, "learning_rate": 6.504178124521803e-06, "loss": 0.2638, "num_input_tokens_seen": 108657808, "step": 50400 }, { "epoch": 9.250321159845843, "grad_norm": 15.846263885498047, "learning_rate": 6.5034144428051784e-06, "loss": 0.2064, "num_input_tokens_seen": 108667568, "step": 50405 }, { "epoch": 9.251238759405396, "grad_norm": 2.1467349529266357, "learning_rate": 6.502650722528302e-06, "loss": 0.0656, "num_input_tokens_seen": 108679024, "step": 50410 }, { "epoch": 9.252156358964948, "grad_norm": 0.30515795946121216, "learning_rate": 6.5018869637107655e-06, "loss": 0.1937, "num_input_tokens_seen": 108689424, "step": 50415 }, { "epoch": 9.2530739585245, "grad_norm": 32.47549057006836, "learning_rate": 6.501123166372154e-06, "loss": 0.1578, "num_input_tokens_seen": 108700528, "step": 50420 }, { "epoch": 9.253991558084053, "grad_norm": 0.8909618258476257, "learning_rate": 6.500359330532062e-06, "loss": 0.1869, "num_input_tokens_seen": 108712208, "step": 50425 }, { "epoch": 9.254909157643604, "grad_norm": 0.24634501338005066, "learning_rate": 6.499595456210077e-06, "loss": 0.1678, "num_input_tokens_seen": 108722736, "step": 50430 }, { "epoch": 9.255826757203156, "grad_norm": 1.55814528465271, "learning_rate": 6.498831543425793e-06, "loss": 0.2809, "num_input_tokens_seen": 108733648, "step": 50435 }, { "epoch": 9.25674435676271, "grad_norm": 0.05000041425228119, "learning_rate": 6.498067592198804e-06, "loss": 0.2026, "num_input_tokens_seen": 108744240, "step": 50440 }, { "epoch": 9.257661956322261, "grad_norm": 7.870036602020264, "learning_rate": 6.497303602548701e-06, "loss": 0.4878, "num_input_tokens_seen": 108755856, "step": 50445 }, { "epoch": 9.258579555881813, "grad_norm": 0.49257320165634155, "learning_rate": 6.4965395744950825e-06, "loss": 0.2165, "num_input_tokens_seen": 108766032, "step": 50450 }, { "epoch": 9.259497155441366, "grad_norm": 12.765321731567383, "learning_rate": 6.495775508057543e-06, "loss": 0.2439, "num_input_tokens_seen": 108778000, "step": 50455 }, { "epoch": 9.260414755000918, "grad_norm": 0.47482436895370483, "learning_rate": 6.49501140325568e-06, "loss": 0.4074, "num_input_tokens_seen": 108788560, "step": 50460 }, { "epoch": 9.26133235456047, "grad_norm": 3.830359697341919, "learning_rate": 6.494247260109092e-06, "loss": 0.4437, "num_input_tokens_seen": 108799408, "step": 50465 }, { "epoch": 9.262249954120023, "grad_norm": 50.27894973754883, "learning_rate": 6.4934830786373775e-06, "loss": 0.1197, "num_input_tokens_seen": 108809040, "step": 50470 }, { "epoch": 9.263167553679574, "grad_norm": 39.907081604003906, "learning_rate": 6.492718858860135e-06, "loss": 0.2541, "num_input_tokens_seen": 108819088, "step": 50475 }, { "epoch": 9.264085153239126, "grad_norm": 44.81835174560547, "learning_rate": 6.49195460079697e-06, "loss": 0.1979, "num_input_tokens_seen": 108829872, "step": 50480 }, { "epoch": 9.26500275279868, "grad_norm": 0.44380417466163635, "learning_rate": 6.491190304467481e-06, "loss": 0.175, "num_input_tokens_seen": 108841232, "step": 50485 }, { "epoch": 9.265920352358231, "grad_norm": 21.149185180664062, "learning_rate": 6.490425969891271e-06, "loss": 0.0225, "num_input_tokens_seen": 108851600, "step": 50490 }, { "epoch": 9.266837951917783, "grad_norm": 0.12375738471746445, "learning_rate": 6.489661597087945e-06, "loss": 0.0811, "num_input_tokens_seen": 108863056, "step": 50495 }, { "epoch": 9.267755551477336, "grad_norm": 0.3060609698295593, "learning_rate": 6.4888971860771075e-06, "loss": 0.2224, "num_input_tokens_seen": 108874160, "step": 50500 }, { "epoch": 9.268673151036888, "grad_norm": 0.0868462473154068, "learning_rate": 6.4881327368783655e-06, "loss": 0.3989, "num_input_tokens_seen": 108883440, "step": 50505 }, { "epoch": 9.26959075059644, "grad_norm": 0.14741502702236176, "learning_rate": 6.487368249511324e-06, "loss": 0.1254, "num_input_tokens_seen": 108894128, "step": 50510 }, { "epoch": 9.270508350155993, "grad_norm": 0.17063552141189575, "learning_rate": 6.486603723995595e-06, "loss": 0.2002, "num_input_tokens_seen": 108904784, "step": 50515 }, { "epoch": 9.271425949715544, "grad_norm": 43.40365982055664, "learning_rate": 6.485839160350782e-06, "loss": 0.5991, "num_input_tokens_seen": 108915408, "step": 50520 }, { "epoch": 9.272343549275096, "grad_norm": 0.04353270307183266, "learning_rate": 6.485074558596498e-06, "loss": 0.0793, "num_input_tokens_seen": 108926832, "step": 50525 }, { "epoch": 9.27326114883465, "grad_norm": 0.21652543544769287, "learning_rate": 6.484309918752353e-06, "loss": 0.0853, "num_input_tokens_seen": 108938640, "step": 50530 }, { "epoch": 9.2741787483942, "grad_norm": 0.24102646112442017, "learning_rate": 6.483545240837959e-06, "loss": 0.0202, "num_input_tokens_seen": 108949904, "step": 50535 }, { "epoch": 9.275096347953752, "grad_norm": 155.1842498779297, "learning_rate": 6.48278052487293e-06, "loss": 0.2836, "num_input_tokens_seen": 108960624, "step": 50540 }, { "epoch": 9.276013947513306, "grad_norm": 0.552241861820221, "learning_rate": 6.4820157708768775e-06, "loss": 0.1581, "num_input_tokens_seen": 108971664, "step": 50545 }, { "epoch": 9.276931547072857, "grad_norm": 13.915631294250488, "learning_rate": 6.4812509788694174e-06, "loss": 0.2616, "num_input_tokens_seen": 108982608, "step": 50550 }, { "epoch": 9.27784914663241, "grad_norm": 132.3822479248047, "learning_rate": 6.4804861488701665e-06, "loss": 0.274, "num_input_tokens_seen": 108994416, "step": 50555 }, { "epoch": 9.278766746191963, "grad_norm": 15.304900169372559, "learning_rate": 6.4797212808987385e-06, "loss": 0.3094, "num_input_tokens_seen": 109005296, "step": 50560 }, { "epoch": 9.279684345751514, "grad_norm": 30.22734832763672, "learning_rate": 6.478956374974755e-06, "loss": 0.1955, "num_input_tokens_seen": 109016048, "step": 50565 }, { "epoch": 9.280601945311066, "grad_norm": 0.1683432161808014, "learning_rate": 6.478191431117832e-06, "loss": 0.3872, "num_input_tokens_seen": 109026896, "step": 50570 }, { "epoch": 9.28151954487062, "grad_norm": 1.8622517585754395, "learning_rate": 6.477426449347589e-06, "loss": 0.1381, "num_input_tokens_seen": 109037808, "step": 50575 }, { "epoch": 9.28243714443017, "grad_norm": 6.321095943450928, "learning_rate": 6.476661429683649e-06, "loss": 0.0231, "num_input_tokens_seen": 109049168, "step": 50580 }, { "epoch": 9.283354743989722, "grad_norm": 0.1531488597393036, "learning_rate": 6.475896372145629e-06, "loss": 0.1221, "num_input_tokens_seen": 109060816, "step": 50585 }, { "epoch": 9.284272343549276, "grad_norm": 20.464563369750977, "learning_rate": 6.475131276753157e-06, "loss": 0.2935, "num_input_tokens_seen": 109071408, "step": 50590 }, { "epoch": 9.285189943108827, "grad_norm": 13.378255844116211, "learning_rate": 6.474366143525853e-06, "loss": 0.1727, "num_input_tokens_seen": 109082480, "step": 50595 }, { "epoch": 9.286107542668379, "grad_norm": 14.076356887817383, "learning_rate": 6.473600972483344e-06, "loss": 0.1757, "num_input_tokens_seen": 109093712, "step": 50600 }, { "epoch": 9.287025142227932, "grad_norm": 0.288513720035553, "learning_rate": 6.472835763645252e-06, "loss": 0.1139, "num_input_tokens_seen": 109104144, "step": 50605 }, { "epoch": 9.287942741787484, "grad_norm": 0.17372658848762512, "learning_rate": 6.472070517031206e-06, "loss": 0.1985, "num_input_tokens_seen": 109113776, "step": 50610 }, { "epoch": 9.288860341347036, "grad_norm": 41.934322357177734, "learning_rate": 6.471305232660833e-06, "loss": 0.2239, "num_input_tokens_seen": 109125520, "step": 50615 }, { "epoch": 9.289777940906589, "grad_norm": 0.22423815727233887, "learning_rate": 6.47053991055376e-06, "loss": 0.1969, "num_input_tokens_seen": 109135344, "step": 50620 }, { "epoch": 9.29069554046614, "grad_norm": 0.13937298953533173, "learning_rate": 6.4697745507296194e-06, "loss": 0.237, "num_input_tokens_seen": 109146032, "step": 50625 }, { "epoch": 9.291613140025692, "grad_norm": 0.2882250249385834, "learning_rate": 6.469009153208038e-06, "loss": 0.319, "num_input_tokens_seen": 109156208, "step": 50630 }, { "epoch": 9.292530739585246, "grad_norm": 0.299312025308609, "learning_rate": 6.46824371800865e-06, "loss": 0.2029, "num_input_tokens_seen": 109167376, "step": 50635 }, { "epoch": 9.293448339144797, "grad_norm": 10.305268287658691, "learning_rate": 6.4674782451510845e-06, "loss": 0.223, "num_input_tokens_seen": 109177744, "step": 50640 }, { "epoch": 9.294365938704349, "grad_norm": 10.436077117919922, "learning_rate": 6.4667127346549785e-06, "loss": 0.1814, "num_input_tokens_seen": 109188176, "step": 50645 }, { "epoch": 9.295283538263902, "grad_norm": 0.9084427952766418, "learning_rate": 6.465947186539962e-06, "loss": 0.1756, "num_input_tokens_seen": 109199056, "step": 50650 }, { "epoch": 9.296201137823454, "grad_norm": 0.42891228199005127, "learning_rate": 6.465181600825673e-06, "loss": 0.1714, "num_input_tokens_seen": 109209040, "step": 50655 }, { "epoch": 9.297118737383006, "grad_norm": 0.2780957520008087, "learning_rate": 6.464415977531746e-06, "loss": 0.0352, "num_input_tokens_seen": 109220208, "step": 50660 }, { "epoch": 9.298036336942559, "grad_norm": 40.530174255371094, "learning_rate": 6.4636503166778206e-06, "loss": 0.0494, "num_input_tokens_seen": 109231760, "step": 50665 }, { "epoch": 9.29895393650211, "grad_norm": 0.4191213846206665, "learning_rate": 6.462884618283531e-06, "loss": 0.006, "num_input_tokens_seen": 109240880, "step": 50670 }, { "epoch": 9.299871536061662, "grad_norm": 0.27139580249786377, "learning_rate": 6.4621188823685195e-06, "loss": 0.1978, "num_input_tokens_seen": 109251664, "step": 50675 }, { "epoch": 9.300789135621216, "grad_norm": 53.121707916259766, "learning_rate": 6.4613531089524236e-06, "loss": 0.4801, "num_input_tokens_seen": 109261712, "step": 50680 }, { "epoch": 9.301706735180767, "grad_norm": 40.93522262573242, "learning_rate": 6.460587298054887e-06, "loss": 0.2243, "num_input_tokens_seen": 109273200, "step": 50685 }, { "epoch": 9.302624334740319, "grad_norm": 317.69720458984375, "learning_rate": 6.459821449695549e-06, "loss": 0.3362, "num_input_tokens_seen": 109284080, "step": 50690 }, { "epoch": 9.303541934299872, "grad_norm": 0.0851326510310173, "learning_rate": 6.459055563894053e-06, "loss": 0.2085, "num_input_tokens_seen": 109295632, "step": 50695 }, { "epoch": 9.304459533859424, "grad_norm": 0.40320056676864624, "learning_rate": 6.458289640670044e-06, "loss": 0.426, "num_input_tokens_seen": 109307312, "step": 50700 }, { "epoch": 9.305377133418975, "grad_norm": 3.466809034347534, "learning_rate": 6.457523680043165e-06, "loss": 0.3147, "num_input_tokens_seen": 109318992, "step": 50705 }, { "epoch": 9.306294732978529, "grad_norm": 124.6534652709961, "learning_rate": 6.456757682033063e-06, "loss": 0.4426, "num_input_tokens_seen": 109331568, "step": 50710 }, { "epoch": 9.30721233253808, "grad_norm": 0.21896037459373474, "learning_rate": 6.455991646659384e-06, "loss": 0.3643, "num_input_tokens_seen": 109342320, "step": 50715 }, { "epoch": 9.308129932097632, "grad_norm": 0.6416930556297302, "learning_rate": 6.455225573941776e-06, "loss": 0.2485, "num_input_tokens_seen": 109353136, "step": 50720 }, { "epoch": 9.309047531657185, "grad_norm": 24.920425415039062, "learning_rate": 6.454459463899887e-06, "loss": 0.2556, "num_input_tokens_seen": 109362480, "step": 50725 }, { "epoch": 9.309965131216737, "grad_norm": 23.98002052307129, "learning_rate": 6.453693316553368e-06, "loss": 0.4211, "num_input_tokens_seen": 109373488, "step": 50730 }, { "epoch": 9.310882730776289, "grad_norm": 0.09228786826133728, "learning_rate": 6.452927131921868e-06, "loss": 0.0386, "num_input_tokens_seen": 109384912, "step": 50735 }, { "epoch": 9.311800330335842, "grad_norm": 0.20000381767749786, "learning_rate": 6.452160910025038e-06, "loss": 0.0769, "num_input_tokens_seen": 109396624, "step": 50740 }, { "epoch": 9.312717929895394, "grad_norm": 48.857669830322266, "learning_rate": 6.451394650882532e-06, "loss": 0.1589, "num_input_tokens_seen": 109408336, "step": 50745 }, { "epoch": 9.313635529454945, "grad_norm": 33.485801696777344, "learning_rate": 6.450628354514004e-06, "loss": 0.1208, "num_input_tokens_seen": 109419536, "step": 50750 }, { "epoch": 9.314553129014499, "grad_norm": 46.99049758911133, "learning_rate": 6.449862020939105e-06, "loss": 0.0916, "num_input_tokens_seen": 109430288, "step": 50755 }, { "epoch": 9.31547072857405, "grad_norm": 16.218029022216797, "learning_rate": 6.449095650177494e-06, "loss": 0.1764, "num_input_tokens_seen": 109440656, "step": 50760 }, { "epoch": 9.316388328133602, "grad_norm": 45.741764068603516, "learning_rate": 6.448329242248823e-06, "loss": 0.384, "num_input_tokens_seen": 109451856, "step": 50765 }, { "epoch": 9.317305927693155, "grad_norm": 29.030031204223633, "learning_rate": 6.447562797172753e-06, "loss": 0.2112, "num_input_tokens_seen": 109460656, "step": 50770 }, { "epoch": 9.318223527252707, "grad_norm": 24.4605770111084, "learning_rate": 6.446796314968942e-06, "loss": 0.0465, "num_input_tokens_seen": 109471600, "step": 50775 }, { "epoch": 9.319141126812259, "grad_norm": 0.3953210115432739, "learning_rate": 6.446029795657045e-06, "loss": 0.0187, "num_input_tokens_seen": 109481360, "step": 50780 }, { "epoch": 9.320058726371812, "grad_norm": 0.3878239393234253, "learning_rate": 6.445263239256727e-06, "loss": 0.3474, "num_input_tokens_seen": 109492944, "step": 50785 }, { "epoch": 9.320976325931364, "grad_norm": 42.868141174316406, "learning_rate": 6.444496645787647e-06, "loss": 0.3695, "num_input_tokens_seen": 109503248, "step": 50790 }, { "epoch": 9.321893925490915, "grad_norm": 69.13304138183594, "learning_rate": 6.443730015269465e-06, "loss": 0.1429, "num_input_tokens_seen": 109516368, "step": 50795 }, { "epoch": 9.322811525050469, "grad_norm": 0.4726465046405792, "learning_rate": 6.4429633477218475e-06, "loss": 0.0895, "num_input_tokens_seen": 109527600, "step": 50800 }, { "epoch": 9.32372912461002, "grad_norm": 0.18476799130439758, "learning_rate": 6.442196643164455e-06, "loss": 0.0907, "num_input_tokens_seen": 109539632, "step": 50805 }, { "epoch": 9.324646724169572, "grad_norm": 0.8762583136558533, "learning_rate": 6.441429901616956e-06, "loss": 0.2175, "num_input_tokens_seen": 109550544, "step": 50810 }, { "epoch": 9.325564323729125, "grad_norm": 0.3371628522872925, "learning_rate": 6.440663123099012e-06, "loss": 0.4704, "num_input_tokens_seen": 109560976, "step": 50815 }, { "epoch": 9.326481923288677, "grad_norm": 10.572380065917969, "learning_rate": 6.439896307630293e-06, "loss": 0.2862, "num_input_tokens_seen": 109572464, "step": 50820 }, { "epoch": 9.327399522848228, "grad_norm": 25.884876251220703, "learning_rate": 6.439129455230465e-06, "loss": 0.0576, "num_input_tokens_seen": 109584048, "step": 50825 }, { "epoch": 9.328317122407782, "grad_norm": 83.05305480957031, "learning_rate": 6.4383625659191964e-06, "loss": 0.2887, "num_input_tokens_seen": 109593392, "step": 50830 }, { "epoch": 9.329234721967333, "grad_norm": 4.645565032958984, "learning_rate": 6.437595639716158e-06, "loss": 0.1456, "num_input_tokens_seen": 109602480, "step": 50835 }, { "epoch": 9.330152321526885, "grad_norm": 0.8460313677787781, "learning_rate": 6.436828676641018e-06, "loss": 0.2128, "num_input_tokens_seen": 109612112, "step": 50840 }, { "epoch": 9.331069921086439, "grad_norm": 103.70723724365234, "learning_rate": 6.436061676713451e-06, "loss": 0.1595, "num_input_tokens_seen": 109622960, "step": 50845 }, { "epoch": 9.33198752064599, "grad_norm": 0.9850046634674072, "learning_rate": 6.435294639953127e-06, "loss": 0.088, "num_input_tokens_seen": 109634896, "step": 50850 }, { "epoch": 9.332905120205542, "grad_norm": 0.28715232014656067, "learning_rate": 6.43452756637972e-06, "loss": 0.2137, "num_input_tokens_seen": 109645808, "step": 50855 }, { "epoch": 9.333822719765095, "grad_norm": 8.972850799560547, "learning_rate": 6.433760456012905e-06, "loss": 0.245, "num_input_tokens_seen": 109655696, "step": 50860 }, { "epoch": 9.334740319324647, "grad_norm": 37.85494613647461, "learning_rate": 6.432993308872356e-06, "loss": 0.2006, "num_input_tokens_seen": 109666736, "step": 50865 }, { "epoch": 9.335657918884198, "grad_norm": 35.36281204223633, "learning_rate": 6.43222612497775e-06, "loss": 0.1162, "num_input_tokens_seen": 109677520, "step": 50870 }, { "epoch": 9.336575518443752, "grad_norm": 0.6282047629356384, "learning_rate": 6.431458904348762e-06, "loss": 0.2856, "num_input_tokens_seen": 109688368, "step": 50875 }, { "epoch": 9.337493118003303, "grad_norm": 180.6682586669922, "learning_rate": 6.430691647005072e-06, "loss": 0.2229, "num_input_tokens_seen": 109700016, "step": 50880 }, { "epoch": 9.338410717562855, "grad_norm": 0.7234307527542114, "learning_rate": 6.42992435296636e-06, "loss": 0.4059, "num_input_tokens_seen": 109709424, "step": 50885 }, { "epoch": 9.339328317122408, "grad_norm": 0.5169504880905151, "learning_rate": 6.4291570222523035e-06, "loss": 0.0587, "num_input_tokens_seen": 109721104, "step": 50890 }, { "epoch": 9.34024591668196, "grad_norm": 140.40029907226562, "learning_rate": 6.4283896548825856e-06, "loss": 0.1857, "num_input_tokens_seen": 109731184, "step": 50895 }, { "epoch": 9.341163516241512, "grad_norm": 60.95565414428711, "learning_rate": 6.427622250876885e-06, "loss": 0.1976, "num_input_tokens_seen": 109741968, "step": 50900 }, { "epoch": 9.342081115801065, "grad_norm": 53.4586067199707, "learning_rate": 6.426854810254887e-06, "loss": 0.2126, "num_input_tokens_seen": 109753296, "step": 50905 }, { "epoch": 9.342998715360617, "grad_norm": 38.4622688293457, "learning_rate": 6.426087333036275e-06, "loss": 0.5134, "num_input_tokens_seen": 109763344, "step": 50910 }, { "epoch": 9.343916314920168, "grad_norm": 0.10366889834403992, "learning_rate": 6.425319819240733e-06, "loss": 0.0348, "num_input_tokens_seen": 109772816, "step": 50915 }, { "epoch": 9.344833914479722, "grad_norm": 18.518529891967773, "learning_rate": 6.424552268887947e-06, "loss": 0.2993, "num_input_tokens_seen": 109783184, "step": 50920 }, { "epoch": 9.345751514039273, "grad_norm": 83.79815673828125, "learning_rate": 6.423784681997602e-06, "loss": 0.1088, "num_input_tokens_seen": 109793808, "step": 50925 }, { "epoch": 9.346669113598825, "grad_norm": 14.363746643066406, "learning_rate": 6.423017058589387e-06, "loss": 0.255, "num_input_tokens_seen": 109804080, "step": 50930 }, { "epoch": 9.347586713158378, "grad_norm": 77.35987091064453, "learning_rate": 6.4222493986829906e-06, "loss": 0.6509, "num_input_tokens_seen": 109815504, "step": 50935 }, { "epoch": 9.34850431271793, "grad_norm": 0.3357776403427124, "learning_rate": 6.4214817022981e-06, "loss": 0.1533, "num_input_tokens_seen": 109827152, "step": 50940 }, { "epoch": 9.349421912277482, "grad_norm": 0.30279478430747986, "learning_rate": 6.420713969454408e-06, "loss": 0.1072, "num_input_tokens_seen": 109836528, "step": 50945 }, { "epoch": 9.350339511837035, "grad_norm": 115.3345947265625, "learning_rate": 6.419946200171605e-06, "loss": 0.1576, "num_input_tokens_seen": 109847312, "step": 50950 }, { "epoch": 9.351257111396587, "grad_norm": 0.210774227976799, "learning_rate": 6.419178394469383e-06, "loss": 0.4726, "num_input_tokens_seen": 109859056, "step": 50955 }, { "epoch": 9.352174710956138, "grad_norm": 13.319643020629883, "learning_rate": 6.418410552367433e-06, "loss": 0.0084, "num_input_tokens_seen": 109870352, "step": 50960 }, { "epoch": 9.353092310515692, "grad_norm": 0.1359768658876419, "learning_rate": 6.417642673885452e-06, "loss": 0.2609, "num_input_tokens_seen": 109881008, "step": 50965 }, { "epoch": 9.354009910075243, "grad_norm": 0.5817341208457947, "learning_rate": 6.416874759043133e-06, "loss": 0.2737, "num_input_tokens_seen": 109892720, "step": 50970 }, { "epoch": 9.354927509634795, "grad_norm": 36.57483673095703, "learning_rate": 6.416106807860173e-06, "loss": 0.2139, "num_input_tokens_seen": 109902000, "step": 50975 }, { "epoch": 9.355845109194348, "grad_norm": 36.456172943115234, "learning_rate": 6.415338820356267e-06, "loss": 0.2009, "num_input_tokens_seen": 109912272, "step": 50980 }, { "epoch": 9.3567627087539, "grad_norm": 39.513153076171875, "learning_rate": 6.414570796551115e-06, "loss": 0.3506, "num_input_tokens_seen": 109923376, "step": 50985 }, { "epoch": 9.357680308313451, "grad_norm": 0.31927600502967834, "learning_rate": 6.413802736464414e-06, "loss": 0.136, "num_input_tokens_seen": 109934416, "step": 50990 }, { "epoch": 9.358597907873005, "grad_norm": 19.72304916381836, "learning_rate": 6.413034640115864e-06, "loss": 0.2056, "num_input_tokens_seen": 109945936, "step": 50995 }, { "epoch": 9.359515507432556, "grad_norm": 0.2871418297290802, "learning_rate": 6.412266507525165e-06, "loss": 0.2695, "num_input_tokens_seen": 109956176, "step": 51000 }, { "epoch": 9.360433106992108, "grad_norm": 38.34895324707031, "learning_rate": 6.4114983387120185e-06, "loss": 0.309, "num_input_tokens_seen": 109965232, "step": 51005 }, { "epoch": 9.361350706551661, "grad_norm": 26.631860733032227, "learning_rate": 6.410730133696128e-06, "loss": 0.2432, "num_input_tokens_seen": 109976400, "step": 51010 }, { "epoch": 9.362268306111213, "grad_norm": 4.178284168243408, "learning_rate": 6.409961892497196e-06, "loss": 0.1807, "num_input_tokens_seen": 109987984, "step": 51015 }, { "epoch": 9.363185905670765, "grad_norm": 59.30471420288086, "learning_rate": 6.409193615134928e-06, "loss": 0.1848, "num_input_tokens_seen": 109998032, "step": 51020 }, { "epoch": 9.364103505230318, "grad_norm": 70.12590026855469, "learning_rate": 6.408425301629026e-06, "loss": 0.1966, "num_input_tokens_seen": 110008048, "step": 51025 }, { "epoch": 9.36502110478987, "grad_norm": 12.180809020996094, "learning_rate": 6.407656951999198e-06, "loss": 0.2355, "num_input_tokens_seen": 110018320, "step": 51030 }, { "epoch": 9.365938704349421, "grad_norm": 21.46169090270996, "learning_rate": 6.406888566265152e-06, "loss": 0.3897, "num_input_tokens_seen": 110029296, "step": 51035 }, { "epoch": 9.366856303908975, "grad_norm": 0.541463315486908, "learning_rate": 6.406120144446593e-06, "loss": 0.34, "num_input_tokens_seen": 110042608, "step": 51040 }, { "epoch": 9.367773903468526, "grad_norm": 61.54062271118164, "learning_rate": 6.405351686563233e-06, "loss": 0.478, "num_input_tokens_seen": 110052752, "step": 51045 }, { "epoch": 9.368691503028078, "grad_norm": 3.296196937561035, "learning_rate": 6.404583192634779e-06, "loss": 0.1957, "num_input_tokens_seen": 110063056, "step": 51050 }, { "epoch": 9.369609102587631, "grad_norm": 3.9933924674987793, "learning_rate": 6.403814662680945e-06, "loss": 0.1665, "num_input_tokens_seen": 110075504, "step": 51055 }, { "epoch": 9.370526702147183, "grad_norm": 7.956409931182861, "learning_rate": 6.403046096721439e-06, "loss": 0.5327, "num_input_tokens_seen": 110086224, "step": 51060 }, { "epoch": 9.371444301706735, "grad_norm": 0.6317203044891357, "learning_rate": 6.402277494775977e-06, "loss": 0.3944, "num_input_tokens_seen": 110096432, "step": 51065 }, { "epoch": 9.372361901266288, "grad_norm": 10.379986763000488, "learning_rate": 6.401508856864268e-06, "loss": 0.1023, "num_input_tokens_seen": 110108144, "step": 51070 }, { "epoch": 9.37327950082584, "grad_norm": 5.910195350646973, "learning_rate": 6.400740183006031e-06, "loss": 0.2278, "num_input_tokens_seen": 110118320, "step": 51075 }, { "epoch": 9.374197100385391, "grad_norm": 45.465335845947266, "learning_rate": 6.39997147322098e-06, "loss": 0.4203, "num_input_tokens_seen": 110130288, "step": 51080 }, { "epoch": 9.375114699944945, "grad_norm": 4.312929153442383, "learning_rate": 6.399202727528828e-06, "loss": 0.2415, "num_input_tokens_seen": 110141584, "step": 51085 }, { "epoch": 9.376032299504496, "grad_norm": 25.395652770996094, "learning_rate": 6.398433945949295e-06, "loss": 0.2126, "num_input_tokens_seen": 110153040, "step": 51090 }, { "epoch": 9.376949899064048, "grad_norm": 19.231115341186523, "learning_rate": 6.397665128502099e-06, "loss": 0.0973, "num_input_tokens_seen": 110164592, "step": 51095 }, { "epoch": 9.377867498623601, "grad_norm": 91.76619720458984, "learning_rate": 6.39689627520696e-06, "loss": 0.3115, "num_input_tokens_seen": 110175792, "step": 51100 }, { "epoch": 9.378785098183153, "grad_norm": 0.4008762538433075, "learning_rate": 6.396127386083595e-06, "loss": 0.1214, "num_input_tokens_seen": 110186960, "step": 51105 }, { "epoch": 9.379702697742704, "grad_norm": 7.200952529907227, "learning_rate": 6.395358461151726e-06, "loss": 0.0713, "num_input_tokens_seen": 110197680, "step": 51110 }, { "epoch": 9.380620297302258, "grad_norm": 33.382747650146484, "learning_rate": 6.394589500431076e-06, "loss": 0.2188, "num_input_tokens_seen": 110209616, "step": 51115 }, { "epoch": 9.38153789686181, "grad_norm": 0.6298224329948425, "learning_rate": 6.393820503941367e-06, "loss": 0.3042, "num_input_tokens_seen": 110221648, "step": 51120 }, { "epoch": 9.382455496421361, "grad_norm": 4.110081672668457, "learning_rate": 6.393051471702322e-06, "loss": 0.4579, "num_input_tokens_seen": 110232688, "step": 51125 }, { "epoch": 9.383373095980915, "grad_norm": 6.053100109100342, "learning_rate": 6.3922824037336665e-06, "loss": 0.1488, "num_input_tokens_seen": 110243632, "step": 51130 }, { "epoch": 9.384290695540466, "grad_norm": 5.576217174530029, "learning_rate": 6.391513300055123e-06, "loss": 0.0932, "num_input_tokens_seen": 110254896, "step": 51135 }, { "epoch": 9.385208295100018, "grad_norm": 6.118638038635254, "learning_rate": 6.390744160686422e-06, "loss": 0.1196, "num_input_tokens_seen": 110265392, "step": 51140 }, { "epoch": 9.386125894659571, "grad_norm": 34.054115295410156, "learning_rate": 6.389974985647288e-06, "loss": 0.1518, "num_input_tokens_seen": 110275632, "step": 51145 }, { "epoch": 9.387043494219123, "grad_norm": 32.33790969848633, "learning_rate": 6.38920577495745e-06, "loss": 0.1734, "num_input_tokens_seen": 110286736, "step": 51150 }, { "epoch": 9.387961093778674, "grad_norm": 1.2079042196273804, "learning_rate": 6.388436528636637e-06, "loss": 0.4138, "num_input_tokens_seen": 110296880, "step": 51155 }, { "epoch": 9.388878693338228, "grad_norm": 10.061140060424805, "learning_rate": 6.387667246704579e-06, "loss": 0.0807, "num_input_tokens_seen": 110307824, "step": 51160 }, { "epoch": 9.38979629289778, "grad_norm": 17.3299503326416, "learning_rate": 6.386897929181006e-06, "loss": 0.074, "num_input_tokens_seen": 110319248, "step": 51165 }, { "epoch": 9.390713892457331, "grad_norm": 0.9969613552093506, "learning_rate": 6.386128576085652e-06, "loss": 0.1157, "num_input_tokens_seen": 110329360, "step": 51170 }, { "epoch": 9.391631492016884, "grad_norm": 5.541379928588867, "learning_rate": 6.385359187438248e-06, "loss": 0.2454, "num_input_tokens_seen": 110339280, "step": 51175 }, { "epoch": 9.392549091576436, "grad_norm": 0.4516574740409851, "learning_rate": 6.384589763258526e-06, "loss": 0.2414, "num_input_tokens_seen": 110348304, "step": 51180 }, { "epoch": 9.393466691135988, "grad_norm": 6.216357707977295, "learning_rate": 6.383820303566226e-06, "loss": 0.4626, "num_input_tokens_seen": 110359408, "step": 51185 }, { "epoch": 9.394384290695541, "grad_norm": 0.2848949134349823, "learning_rate": 6.383050808381079e-06, "loss": 0.3021, "num_input_tokens_seen": 110369936, "step": 51190 }, { "epoch": 9.395301890255093, "grad_norm": 5.26408052444458, "learning_rate": 6.382281277722819e-06, "loss": 0.2994, "num_input_tokens_seen": 110380304, "step": 51195 }, { "epoch": 9.396219489814644, "grad_norm": 38.58151626586914, "learning_rate": 6.381511711611189e-06, "loss": 0.2982, "num_input_tokens_seen": 110390544, "step": 51200 }, { "epoch": 9.397137089374198, "grad_norm": 0.03897133469581604, "learning_rate": 6.380742110065925e-06, "loss": 0.1832, "num_input_tokens_seen": 110400816, "step": 51205 }, { "epoch": 9.39805468893375, "grad_norm": 0.5116576552391052, "learning_rate": 6.3799724731067654e-06, "loss": 0.1818, "num_input_tokens_seen": 110411568, "step": 51210 }, { "epoch": 9.398972288493301, "grad_norm": 33.67477798461914, "learning_rate": 6.379202800753451e-06, "loss": 0.4402, "num_input_tokens_seen": 110421936, "step": 51215 }, { "epoch": 9.399889888052854, "grad_norm": 20.9688720703125, "learning_rate": 6.378433093025722e-06, "loss": 0.2267, "num_input_tokens_seen": 110433296, "step": 51220 }, { "epoch": 9.400807487612406, "grad_norm": 0.1530160754919052, "learning_rate": 6.377663349943319e-06, "loss": 0.1646, "num_input_tokens_seen": 110443472, "step": 51225 }, { "epoch": 9.401725087171958, "grad_norm": 27.26816749572754, "learning_rate": 6.376893571525989e-06, "loss": 0.3472, "num_input_tokens_seen": 110453808, "step": 51230 }, { "epoch": 9.402642686731511, "grad_norm": 1.0869312286376953, "learning_rate": 6.376123757793472e-06, "loss": 0.3757, "num_input_tokens_seen": 110463792, "step": 51235 }, { "epoch": 9.403560286291063, "grad_norm": 24.141952514648438, "learning_rate": 6.375353908765514e-06, "loss": 0.3113, "num_input_tokens_seen": 110474576, "step": 51240 }, { "epoch": 9.404477885850614, "grad_norm": 46.2431640625, "learning_rate": 6.374584024461859e-06, "loss": 0.233, "num_input_tokens_seen": 110485168, "step": 51245 }, { "epoch": 9.405395485410168, "grad_norm": 0.4595543146133423, "learning_rate": 6.373814104902253e-06, "loss": 0.128, "num_input_tokens_seen": 110496464, "step": 51250 }, { "epoch": 9.40631308496972, "grad_norm": 0.5544381141662598, "learning_rate": 6.373044150106446e-06, "loss": 0.0191, "num_input_tokens_seen": 110507920, "step": 51255 }, { "epoch": 9.40723068452927, "grad_norm": 32.783843994140625, "learning_rate": 6.372274160094183e-06, "loss": 0.4574, "num_input_tokens_seen": 110518384, "step": 51260 }, { "epoch": 9.408148284088824, "grad_norm": 70.52617645263672, "learning_rate": 6.371504134885217e-06, "loss": 0.2903, "num_input_tokens_seen": 110529712, "step": 51265 }, { "epoch": 9.409065883648376, "grad_norm": 3.201314926147461, "learning_rate": 6.370734074499294e-06, "loss": 0.2557, "num_input_tokens_seen": 110539376, "step": 51270 }, { "epoch": 9.409983483207927, "grad_norm": 3.8151872158050537, "learning_rate": 6.369963978956168e-06, "loss": 0.0301, "num_input_tokens_seen": 110550032, "step": 51275 }, { "epoch": 9.41090108276748, "grad_norm": 42.75324249267578, "learning_rate": 6.369193848275587e-06, "loss": 0.2524, "num_input_tokens_seen": 110561552, "step": 51280 }, { "epoch": 9.411818682327032, "grad_norm": 0.23996852338314056, "learning_rate": 6.368423682477307e-06, "loss": 0.0166, "num_input_tokens_seen": 110571760, "step": 51285 }, { "epoch": 9.412736281886584, "grad_norm": 0.7772381901741028, "learning_rate": 6.367653481581081e-06, "loss": 0.2944, "num_input_tokens_seen": 110583312, "step": 51290 }, { "epoch": 9.413653881446137, "grad_norm": 54.21050262451172, "learning_rate": 6.366883245606661e-06, "loss": 0.3042, "num_input_tokens_seen": 110593712, "step": 51295 }, { "epoch": 9.414571481005689, "grad_norm": 29.255218505859375, "learning_rate": 6.366112974573806e-06, "loss": 0.3172, "num_input_tokens_seen": 110605200, "step": 51300 }, { "epoch": 9.41548908056524, "grad_norm": 22.622032165527344, "learning_rate": 6.36534266850227e-06, "loss": 0.3319, "num_input_tokens_seen": 110616432, "step": 51305 }, { "epoch": 9.416406680124794, "grad_norm": 109.28401947021484, "learning_rate": 6.36457232741181e-06, "loss": 0.2426, "num_input_tokens_seen": 110628144, "step": 51310 }, { "epoch": 9.417324279684346, "grad_norm": 19.386690139770508, "learning_rate": 6.363801951322186e-06, "loss": 0.3056, "num_input_tokens_seen": 110639696, "step": 51315 }, { "epoch": 9.418241879243897, "grad_norm": 13.731718063354492, "learning_rate": 6.363031540253154e-06, "loss": 0.2369, "num_input_tokens_seen": 110650416, "step": 51320 }, { "epoch": 9.41915947880345, "grad_norm": 48.9889030456543, "learning_rate": 6.362261094224477e-06, "loss": 0.3883, "num_input_tokens_seen": 110661968, "step": 51325 }, { "epoch": 9.420077078363002, "grad_norm": 74.13874053955078, "learning_rate": 6.361490613255913e-06, "loss": 0.1601, "num_input_tokens_seen": 110673136, "step": 51330 }, { "epoch": 9.420994677922554, "grad_norm": 26.915624618530273, "learning_rate": 6.360720097367225e-06, "loss": 0.4126, "num_input_tokens_seen": 110684304, "step": 51335 }, { "epoch": 9.421912277482107, "grad_norm": 0.4228857755661011, "learning_rate": 6.359949546578176e-06, "loss": 0.073, "num_input_tokens_seen": 110695664, "step": 51340 }, { "epoch": 9.422829877041659, "grad_norm": 3.972754716873169, "learning_rate": 6.359178960908528e-06, "loss": 0.2178, "num_input_tokens_seen": 110706320, "step": 51345 }, { "epoch": 9.42374747660121, "grad_norm": 26.48715591430664, "learning_rate": 6.358408340378049e-06, "loss": 0.3599, "num_input_tokens_seen": 110716592, "step": 51350 }, { "epoch": 9.424665076160764, "grad_norm": 0.6718342304229736, "learning_rate": 6.357637685006498e-06, "loss": 0.0312, "num_input_tokens_seen": 110727216, "step": 51355 }, { "epoch": 9.425582675720316, "grad_norm": 0.567454993724823, "learning_rate": 6.356866994813645e-06, "loss": 0.155, "num_input_tokens_seen": 110737744, "step": 51360 }, { "epoch": 9.426500275279867, "grad_norm": 60.29177474975586, "learning_rate": 6.356096269819259e-06, "loss": 0.1754, "num_input_tokens_seen": 110748560, "step": 51365 }, { "epoch": 9.42741787483942, "grad_norm": 8.434722900390625, "learning_rate": 6.3553255100431025e-06, "loss": 0.1833, "num_input_tokens_seen": 110759024, "step": 51370 }, { "epoch": 9.428335474398972, "grad_norm": 95.94206237792969, "learning_rate": 6.354554715504949e-06, "loss": 0.3558, "num_input_tokens_seen": 110769008, "step": 51375 }, { "epoch": 9.429253073958524, "grad_norm": 0.6058737635612488, "learning_rate": 6.353783886224565e-06, "loss": 0.2202, "num_input_tokens_seen": 110780144, "step": 51380 }, { "epoch": 9.430170673518077, "grad_norm": 0.328287810087204, "learning_rate": 6.3530130222217244e-06, "loss": 0.3612, "num_input_tokens_seen": 110790768, "step": 51385 }, { "epoch": 9.431088273077629, "grad_norm": 37.101932525634766, "learning_rate": 6.352242123516196e-06, "loss": 0.3364, "num_input_tokens_seen": 110801424, "step": 51390 }, { "epoch": 9.43200587263718, "grad_norm": 3.4973082542419434, "learning_rate": 6.351471190127753e-06, "loss": 0.1664, "num_input_tokens_seen": 110811888, "step": 51395 }, { "epoch": 9.432923472196734, "grad_norm": 56.63044738769531, "learning_rate": 6.35070022207617e-06, "loss": 0.1519, "num_input_tokens_seen": 110822608, "step": 51400 }, { "epoch": 9.433841071756286, "grad_norm": 6.699093818664551, "learning_rate": 6.349929219381217e-06, "loss": 0.1359, "num_input_tokens_seen": 110833360, "step": 51405 }, { "epoch": 9.434758671315837, "grad_norm": 80.75675964355469, "learning_rate": 6.349158182062671e-06, "loss": 0.1035, "num_input_tokens_seen": 110842832, "step": 51410 }, { "epoch": 9.43567627087539, "grad_norm": 0.10558579117059708, "learning_rate": 6.348387110140312e-06, "loss": 0.6259, "num_input_tokens_seen": 110853776, "step": 51415 }, { "epoch": 9.436593870434942, "grad_norm": 85.85831451416016, "learning_rate": 6.347616003633911e-06, "loss": 0.8468, "num_input_tokens_seen": 110865040, "step": 51420 }, { "epoch": 9.437511469994494, "grad_norm": 0.34067097306251526, "learning_rate": 6.346844862563249e-06, "loss": 0.2136, "num_input_tokens_seen": 110873872, "step": 51425 }, { "epoch": 9.438429069554047, "grad_norm": 1.8307337760925293, "learning_rate": 6.346073686948103e-06, "loss": 0.1001, "num_input_tokens_seen": 110884944, "step": 51430 }, { "epoch": 9.439346669113599, "grad_norm": 35.5213623046875, "learning_rate": 6.345302476808254e-06, "loss": 0.0328, "num_input_tokens_seen": 110894928, "step": 51435 }, { "epoch": 9.44026426867315, "grad_norm": 0.15246529877185822, "learning_rate": 6.344531232163482e-06, "loss": 0.4933, "num_input_tokens_seen": 110906160, "step": 51440 }, { "epoch": 9.441181868232704, "grad_norm": 181.46592712402344, "learning_rate": 6.343759953033566e-06, "loss": 0.4384, "num_input_tokens_seen": 110917680, "step": 51445 }, { "epoch": 9.442099467792255, "grad_norm": 21.538188934326172, "learning_rate": 6.342988639438292e-06, "loss": 0.1379, "num_input_tokens_seen": 110929232, "step": 51450 }, { "epoch": 9.443017067351807, "grad_norm": 0.7334667444229126, "learning_rate": 6.342217291397439e-06, "loss": 0.2623, "num_input_tokens_seen": 110938768, "step": 51455 }, { "epoch": 9.44393466691136, "grad_norm": 1.0285327434539795, "learning_rate": 6.341445908930794e-06, "loss": 0.2791, "num_input_tokens_seen": 110949328, "step": 51460 }, { "epoch": 9.444852266470912, "grad_norm": 0.2104339748620987, "learning_rate": 6.34067449205814e-06, "loss": 0.2088, "num_input_tokens_seen": 110961680, "step": 51465 }, { "epoch": 9.445769866030464, "grad_norm": 6.334305286407471, "learning_rate": 6.339903040799262e-06, "loss": 0.1291, "num_input_tokens_seen": 110972656, "step": 51470 }, { "epoch": 9.446687465590017, "grad_norm": 0.3908885717391968, "learning_rate": 6.3391315551739495e-06, "loss": 0.2999, "num_input_tokens_seen": 110984880, "step": 51475 }, { "epoch": 9.447605065149569, "grad_norm": 0.31908029317855835, "learning_rate": 6.338360035201987e-06, "loss": 0.0542, "num_input_tokens_seen": 110995504, "step": 51480 }, { "epoch": 9.44852266470912, "grad_norm": 0.4160906672477722, "learning_rate": 6.337588480903164e-06, "loss": 0.257, "num_input_tokens_seen": 111005904, "step": 51485 }, { "epoch": 9.449440264268674, "grad_norm": 10.045438766479492, "learning_rate": 6.3368168922972695e-06, "loss": 0.1713, "num_input_tokens_seen": 111017232, "step": 51490 }, { "epoch": 9.450357863828225, "grad_norm": 8.95207691192627, "learning_rate": 6.336045269404094e-06, "loss": 0.2857, "num_input_tokens_seen": 111027472, "step": 51495 }, { "epoch": 9.451275463387777, "grad_norm": 20.77802848815918, "learning_rate": 6.335273612243428e-06, "loss": 0.3962, "num_input_tokens_seen": 111037776, "step": 51500 }, { "epoch": 9.45219306294733, "grad_norm": 5.957510471343994, "learning_rate": 6.334501920835063e-06, "loss": 0.1126, "num_input_tokens_seen": 111048336, "step": 51505 }, { "epoch": 9.453110662506882, "grad_norm": 43.853336334228516, "learning_rate": 6.333730195198793e-06, "loss": 0.22, "num_input_tokens_seen": 111057808, "step": 51510 }, { "epoch": 9.454028262066434, "grad_norm": 0.34313148260116577, "learning_rate": 6.332958435354409e-06, "loss": 0.0233, "num_input_tokens_seen": 111066896, "step": 51515 }, { "epoch": 9.454945861625987, "grad_norm": 0.17901885509490967, "learning_rate": 6.3321866413217085e-06, "loss": 0.0619, "num_input_tokens_seen": 111079408, "step": 51520 }, { "epoch": 9.455863461185539, "grad_norm": 0.5623128414154053, "learning_rate": 6.331414813120485e-06, "loss": 0.0949, "num_input_tokens_seen": 111090000, "step": 51525 }, { "epoch": 9.45678106074509, "grad_norm": 4.2218122482299805, "learning_rate": 6.330642950770533e-06, "loss": 0.1346, "num_input_tokens_seen": 111101360, "step": 51530 }, { "epoch": 9.457698660304644, "grad_norm": 6.429824352264404, "learning_rate": 6.329871054291654e-06, "loss": 0.1369, "num_input_tokens_seen": 111112080, "step": 51535 }, { "epoch": 9.458616259864195, "grad_norm": 1.8998608589172363, "learning_rate": 6.329099123703643e-06, "loss": 0.2804, "num_input_tokens_seen": 111122896, "step": 51540 }, { "epoch": 9.459533859423747, "grad_norm": 2.163315773010254, "learning_rate": 6.328327159026299e-06, "loss": 0.2116, "num_input_tokens_seen": 111133872, "step": 51545 }, { "epoch": 9.4604514589833, "grad_norm": 77.22673034667969, "learning_rate": 6.327555160279423e-06, "loss": 0.0165, "num_input_tokens_seen": 111144304, "step": 51550 }, { "epoch": 9.461369058542852, "grad_norm": 0.06125148385763168, "learning_rate": 6.326783127482814e-06, "loss": 0.0031, "num_input_tokens_seen": 111154864, "step": 51555 }, { "epoch": 9.462286658102403, "grad_norm": 0.8497493267059326, "learning_rate": 6.326011060656274e-06, "loss": 0.3597, "num_input_tokens_seen": 111166288, "step": 51560 }, { "epoch": 9.463204257661957, "grad_norm": 0.12256263941526413, "learning_rate": 6.325238959819605e-06, "loss": 0.1015, "num_input_tokens_seen": 111178672, "step": 51565 }, { "epoch": 9.464121857221508, "grad_norm": 81.05287170410156, "learning_rate": 6.324466824992611e-06, "loss": 0.3588, "num_input_tokens_seen": 111190128, "step": 51570 }, { "epoch": 9.46503945678106, "grad_norm": 7.837271690368652, "learning_rate": 6.3236946561950965e-06, "loss": 0.307, "num_input_tokens_seen": 111201648, "step": 51575 }, { "epoch": 9.465957056340613, "grad_norm": 0.09211243689060211, "learning_rate": 6.322922453446865e-06, "loss": 0.3507, "num_input_tokens_seen": 111212080, "step": 51580 }, { "epoch": 9.466874655900165, "grad_norm": 36.74462127685547, "learning_rate": 6.322150216767723e-06, "loss": 0.2768, "num_input_tokens_seen": 111222192, "step": 51585 }, { "epoch": 9.467792255459717, "grad_norm": 0.3877871334552765, "learning_rate": 6.321377946177476e-06, "loss": 0.2524, "num_input_tokens_seen": 111232848, "step": 51590 }, { "epoch": 9.46870985501927, "grad_norm": 0.7269972562789917, "learning_rate": 6.320605641695934e-06, "loss": 0.2593, "num_input_tokens_seen": 111244816, "step": 51595 }, { "epoch": 9.469627454578822, "grad_norm": 18.731292724609375, "learning_rate": 6.319833303342904e-06, "loss": 0.4085, "num_input_tokens_seen": 111255312, "step": 51600 }, { "epoch": 9.470545054138373, "grad_norm": 9.288063049316406, "learning_rate": 6.319060931138194e-06, "loss": 0.1322, "num_input_tokens_seen": 111266128, "step": 51605 }, { "epoch": 9.471462653697927, "grad_norm": 18.50462532043457, "learning_rate": 6.318288525101617e-06, "loss": 0.322, "num_input_tokens_seen": 111276240, "step": 51610 }, { "epoch": 9.472380253257478, "grad_norm": 48.57337188720703, "learning_rate": 6.317516085252982e-06, "loss": 0.3087, "num_input_tokens_seen": 111286576, "step": 51615 }, { "epoch": 9.47329785281703, "grad_norm": 0.4564974308013916, "learning_rate": 6.3167436116121015e-06, "loss": 0.274, "num_input_tokens_seen": 111297648, "step": 51620 }, { "epoch": 9.474215452376583, "grad_norm": 1.7292020320892334, "learning_rate": 6.315971104198788e-06, "loss": 0.1994, "num_input_tokens_seen": 111308624, "step": 51625 }, { "epoch": 9.475133051936135, "grad_norm": 9.761493682861328, "learning_rate": 6.315198563032855e-06, "loss": 0.3073, "num_input_tokens_seen": 111318672, "step": 51630 }, { "epoch": 9.476050651495687, "grad_norm": 55.391082763671875, "learning_rate": 6.314425988134118e-06, "loss": 0.166, "num_input_tokens_seen": 111329488, "step": 51635 }, { "epoch": 9.47696825105524, "grad_norm": 0.2893235385417938, "learning_rate": 6.313653379522391e-06, "loss": 0.0962, "num_input_tokens_seen": 111340304, "step": 51640 }, { "epoch": 9.477885850614792, "grad_norm": 2.221850633621216, "learning_rate": 6.31288073721749e-06, "loss": 0.0743, "num_input_tokens_seen": 111351184, "step": 51645 }, { "epoch": 9.478803450174343, "grad_norm": 59.888771057128906, "learning_rate": 6.312108061239234e-06, "loss": 0.3512, "num_input_tokens_seen": 111361680, "step": 51650 }, { "epoch": 9.479721049733897, "grad_norm": 5.999875545501709, "learning_rate": 6.3113353516074396e-06, "loss": 0.3391, "num_input_tokens_seen": 111372496, "step": 51655 }, { "epoch": 9.480638649293448, "grad_norm": 18.377111434936523, "learning_rate": 6.310562608341926e-06, "loss": 0.1384, "num_input_tokens_seen": 111383152, "step": 51660 }, { "epoch": 9.481556248853, "grad_norm": 8.45901107788086, "learning_rate": 6.3097898314625115e-06, "loss": 0.3332, "num_input_tokens_seen": 111394160, "step": 51665 }, { "epoch": 9.482473848412553, "grad_norm": 1.1152576208114624, "learning_rate": 6.309017020989019e-06, "loss": 0.2131, "num_input_tokens_seen": 111405008, "step": 51670 }, { "epoch": 9.483391447972105, "grad_norm": 28.50342559814453, "learning_rate": 6.308244176941268e-06, "loss": 0.3197, "num_input_tokens_seen": 111416400, "step": 51675 }, { "epoch": 9.484309047531656, "grad_norm": 15.835916519165039, "learning_rate": 6.307471299339082e-06, "loss": 0.2713, "num_input_tokens_seen": 111426448, "step": 51680 }, { "epoch": 9.48522664709121, "grad_norm": 17.62522315979004, "learning_rate": 6.306698388202284e-06, "loss": 0.3555, "num_input_tokens_seen": 111436752, "step": 51685 }, { "epoch": 9.486144246650762, "grad_norm": 0.8717684149742126, "learning_rate": 6.305925443550695e-06, "loss": 0.2701, "num_input_tokens_seen": 111445968, "step": 51690 }, { "epoch": 9.487061846210313, "grad_norm": 5.993112564086914, "learning_rate": 6.3051524654041466e-06, "loss": 0.3516, "num_input_tokens_seen": 111456688, "step": 51695 }, { "epoch": 9.487979445769867, "grad_norm": 11.161178588867188, "learning_rate": 6.304379453782457e-06, "loss": 0.2599, "num_input_tokens_seen": 111466544, "step": 51700 }, { "epoch": 9.488897045329418, "grad_norm": 0.33896157145500183, "learning_rate": 6.303606408705459e-06, "loss": 0.0411, "num_input_tokens_seen": 111476496, "step": 51705 }, { "epoch": 9.48981464488897, "grad_norm": 0.2024257928133011, "learning_rate": 6.302833330192973e-06, "loss": 0.1316, "num_input_tokens_seen": 111488432, "step": 51710 }, { "epoch": 9.490732244448523, "grad_norm": 78.17469787597656, "learning_rate": 6.302060218264834e-06, "loss": 0.0959, "num_input_tokens_seen": 111497584, "step": 51715 }, { "epoch": 9.491649844008075, "grad_norm": 44.137176513671875, "learning_rate": 6.301287072940867e-06, "loss": 0.1887, "num_input_tokens_seen": 111508176, "step": 51720 }, { "epoch": 9.492567443567626, "grad_norm": 10.902071952819824, "learning_rate": 6.300513894240905e-06, "loss": 0.0888, "num_input_tokens_seen": 111517616, "step": 51725 }, { "epoch": 9.49348504312718, "grad_norm": 8.375199317932129, "learning_rate": 6.299740682184776e-06, "loss": 0.2682, "num_input_tokens_seen": 111528304, "step": 51730 }, { "epoch": 9.494402642686731, "grad_norm": 9.279455184936523, "learning_rate": 6.298967436792314e-06, "loss": 0.0553, "num_input_tokens_seen": 111540240, "step": 51735 }, { "epoch": 9.495320242246283, "grad_norm": 9.576292037963867, "learning_rate": 6.298194158083349e-06, "loss": 0.3332, "num_input_tokens_seen": 111550224, "step": 51740 }, { "epoch": 9.496237841805836, "grad_norm": 80.5328369140625, "learning_rate": 6.2974208460777175e-06, "loss": 0.4883, "num_input_tokens_seen": 111562160, "step": 51745 }, { "epoch": 9.497155441365388, "grad_norm": 3.0093533992767334, "learning_rate": 6.2966475007952495e-06, "loss": 0.1672, "num_input_tokens_seen": 111572592, "step": 51750 }, { "epoch": 9.49807304092494, "grad_norm": 23.177305221557617, "learning_rate": 6.295874122255785e-06, "loss": 0.3566, "num_input_tokens_seen": 111582480, "step": 51755 }, { "epoch": 9.498990640484493, "grad_norm": 21.566020965576172, "learning_rate": 6.2951007104791576e-06, "loss": 0.3002, "num_input_tokens_seen": 111592592, "step": 51760 }, { "epoch": 9.499908240044045, "grad_norm": 0.21410471200942993, "learning_rate": 6.294327265485203e-06, "loss": 0.0102, "num_input_tokens_seen": 111604112, "step": 51765 }, { "epoch": 9.500825839603596, "grad_norm": 17.389558792114258, "learning_rate": 6.2935537872937616e-06, "loss": 0.2316, "num_input_tokens_seen": 111615312, "step": 51770 }, { "epoch": 9.50174343916315, "grad_norm": 19.042964935302734, "learning_rate": 6.292780275924669e-06, "loss": 0.2497, "num_input_tokens_seen": 111626032, "step": 51775 }, { "epoch": 9.502661038722701, "grad_norm": 1.7896019220352173, "learning_rate": 6.2920067313977675e-06, "loss": 0.0324, "num_input_tokens_seen": 111636368, "step": 51780 }, { "epoch": 9.503578638282253, "grad_norm": 0.16735771298408508, "learning_rate": 6.291233153732894e-06, "loss": 0.1487, "num_input_tokens_seen": 111648112, "step": 51785 }, { "epoch": 9.504496237841806, "grad_norm": 7.703943252563477, "learning_rate": 6.290459542949892e-06, "loss": 0.1707, "num_input_tokens_seen": 111657936, "step": 51790 }, { "epoch": 9.505413837401358, "grad_norm": 173.57286071777344, "learning_rate": 6.289685899068603e-06, "loss": 0.3496, "num_input_tokens_seen": 111668656, "step": 51795 }, { "epoch": 9.50633143696091, "grad_norm": 33.00698471069336, "learning_rate": 6.28891222210887e-06, "loss": 0.2333, "num_input_tokens_seen": 111679440, "step": 51800 }, { "epoch": 9.507249036520463, "grad_norm": 1.9153908491134644, "learning_rate": 6.288138512090536e-06, "loss": 0.2392, "num_input_tokens_seen": 111688368, "step": 51805 }, { "epoch": 9.508166636080015, "grad_norm": 14.784729957580566, "learning_rate": 6.287364769033444e-06, "loss": 0.1367, "num_input_tokens_seen": 111698736, "step": 51810 }, { "epoch": 9.509084235639566, "grad_norm": 11.900676727294922, "learning_rate": 6.2865909929574424e-06, "loss": 0.1194, "num_input_tokens_seen": 111710416, "step": 51815 }, { "epoch": 9.51000183519912, "grad_norm": 13.480340957641602, "learning_rate": 6.285817183882376e-06, "loss": 0.1599, "num_input_tokens_seen": 111722000, "step": 51820 }, { "epoch": 9.510919434758671, "grad_norm": 6.167736053466797, "learning_rate": 6.285043341828091e-06, "loss": 0.5683, "num_input_tokens_seen": 111732880, "step": 51825 }, { "epoch": 9.511837034318223, "grad_norm": 6.59025239944458, "learning_rate": 6.284269466814437e-06, "loss": 0.2499, "num_input_tokens_seen": 111743216, "step": 51830 }, { "epoch": 9.512754633877776, "grad_norm": 0.2324535846710205, "learning_rate": 6.28349555886126e-06, "loss": 0.0812, "num_input_tokens_seen": 111755024, "step": 51835 }, { "epoch": 9.513672233437328, "grad_norm": 1.3967844247817993, "learning_rate": 6.282721617988411e-06, "loss": 0.0186, "num_input_tokens_seen": 111765328, "step": 51840 }, { "epoch": 9.51458983299688, "grad_norm": 6.3738603591918945, "learning_rate": 6.281947644215742e-06, "loss": 0.1528, "num_input_tokens_seen": 111776848, "step": 51845 }, { "epoch": 9.515507432556433, "grad_norm": 13.345561981201172, "learning_rate": 6.281173637563102e-06, "loss": 0.1328, "num_input_tokens_seen": 111785680, "step": 51850 }, { "epoch": 9.516425032115984, "grad_norm": 9.295381546020508, "learning_rate": 6.280399598050344e-06, "loss": 0.2976, "num_input_tokens_seen": 111795984, "step": 51855 }, { "epoch": 9.517342631675536, "grad_norm": 0.5248417258262634, "learning_rate": 6.279625525697322e-06, "loss": 0.2445, "num_input_tokens_seen": 111806576, "step": 51860 }, { "epoch": 9.51826023123509, "grad_norm": 10.503412246704102, "learning_rate": 6.278851420523886e-06, "loss": 0.2742, "num_input_tokens_seen": 111817424, "step": 51865 }, { "epoch": 9.519177830794641, "grad_norm": 4.796370029449463, "learning_rate": 6.278077282549895e-06, "loss": 0.2725, "num_input_tokens_seen": 111827504, "step": 51870 }, { "epoch": 9.520095430354193, "grad_norm": 5.232590675354004, "learning_rate": 6.277303111795201e-06, "loss": 0.1134, "num_input_tokens_seen": 111838960, "step": 51875 }, { "epoch": 9.521013029913746, "grad_norm": 5.400026798248291, "learning_rate": 6.276528908279663e-06, "loss": 0.4919, "num_input_tokens_seen": 111848880, "step": 51880 }, { "epoch": 9.521930629473298, "grad_norm": 31.23567008972168, "learning_rate": 6.275754672023137e-06, "loss": 0.0717, "num_input_tokens_seen": 111859952, "step": 51885 }, { "epoch": 9.52284822903285, "grad_norm": 38.53148651123047, "learning_rate": 6.27498040304548e-06, "loss": 0.0935, "num_input_tokens_seen": 111870512, "step": 51890 }, { "epoch": 9.523765828592403, "grad_norm": 18.42142677307129, "learning_rate": 6.274206101366553e-06, "loss": 0.3533, "num_input_tokens_seen": 111881808, "step": 51895 }, { "epoch": 9.524683428151954, "grad_norm": 7.267979621887207, "learning_rate": 6.273431767006213e-06, "loss": 0.2343, "num_input_tokens_seen": 111892656, "step": 51900 }, { "epoch": 9.525601027711506, "grad_norm": 24.2264404296875, "learning_rate": 6.272657399984323e-06, "loss": 0.4027, "num_input_tokens_seen": 111903568, "step": 51905 }, { "epoch": 9.52651862727106, "grad_norm": 0.808668315410614, "learning_rate": 6.271883000320742e-06, "loss": 0.3123, "num_input_tokens_seen": 111914352, "step": 51910 }, { "epoch": 9.527436226830611, "grad_norm": 16.215667724609375, "learning_rate": 6.2711085680353345e-06, "loss": 0.3128, "num_input_tokens_seen": 111924432, "step": 51915 }, { "epoch": 9.528353826390163, "grad_norm": 0.6470946073532104, "learning_rate": 6.270334103147961e-06, "loss": 0.1739, "num_input_tokens_seen": 111935088, "step": 51920 }, { "epoch": 9.529271425949716, "grad_norm": 0.5824972987174988, "learning_rate": 6.269559605678488e-06, "loss": 0.1872, "num_input_tokens_seen": 111946576, "step": 51925 }, { "epoch": 9.530189025509268, "grad_norm": 35.897464752197266, "learning_rate": 6.268785075646777e-06, "loss": 0.2368, "num_input_tokens_seen": 111958192, "step": 51930 }, { "epoch": 9.53110662506882, "grad_norm": 26.965415954589844, "learning_rate": 6.268010513072698e-06, "loss": 0.0971, "num_input_tokens_seen": 111968720, "step": 51935 }, { "epoch": 9.532024224628373, "grad_norm": 9.441949844360352, "learning_rate": 6.267235917976112e-06, "loss": 0.3087, "num_input_tokens_seen": 111979536, "step": 51940 }, { "epoch": 9.532941824187924, "grad_norm": 21.174362182617188, "learning_rate": 6.26646129037689e-06, "loss": 0.1315, "num_input_tokens_seen": 111991152, "step": 51945 }, { "epoch": 9.533859423747476, "grad_norm": 0.9415481090545654, "learning_rate": 6.265686630294897e-06, "loss": 0.0957, "num_input_tokens_seen": 112002128, "step": 51950 }, { "epoch": 9.53477702330703, "grad_norm": 11.481755256652832, "learning_rate": 6.264911937750006e-06, "loss": 0.1679, "num_input_tokens_seen": 112012688, "step": 51955 }, { "epoch": 9.53569462286658, "grad_norm": 10.54175853729248, "learning_rate": 6.264137212762083e-06, "loss": 0.083, "num_input_tokens_seen": 112022928, "step": 51960 }, { "epoch": 9.536612222426132, "grad_norm": 2.0553503036499023, "learning_rate": 6.263362455350999e-06, "loss": 0.2059, "num_input_tokens_seen": 112033648, "step": 51965 }, { "epoch": 9.537529821985686, "grad_norm": 11.457708358764648, "learning_rate": 6.262587665536627e-06, "loss": 0.2398, "num_input_tokens_seen": 112044688, "step": 51970 }, { "epoch": 9.538447421545238, "grad_norm": 0.10613207519054413, "learning_rate": 6.261812843338837e-06, "loss": 0.3074, "num_input_tokens_seen": 112054512, "step": 51975 }, { "epoch": 9.53936502110479, "grad_norm": 8.012506484985352, "learning_rate": 6.261037988777505e-06, "loss": 0.2818, "num_input_tokens_seen": 112064752, "step": 51980 }, { "epoch": 9.540282620664343, "grad_norm": 99.33716583251953, "learning_rate": 6.260263101872502e-06, "loss": 0.176, "num_input_tokens_seen": 112076688, "step": 51985 }, { "epoch": 9.541200220223894, "grad_norm": 0.12287541478872299, "learning_rate": 6.259488182643705e-06, "loss": 0.2764, "num_input_tokens_seen": 112087248, "step": 51990 }, { "epoch": 9.542117819783446, "grad_norm": 55.87800216674805, "learning_rate": 6.258713231110987e-06, "loss": 0.2132, "num_input_tokens_seen": 112097456, "step": 51995 }, { "epoch": 9.543035419343, "grad_norm": 32.14698028564453, "learning_rate": 6.257938247294224e-06, "loss": 0.3506, "num_input_tokens_seen": 112107728, "step": 52000 }, { "epoch": 9.54395301890255, "grad_norm": 0.12936177849769592, "learning_rate": 6.2571632312132966e-06, "loss": 0.2904, "num_input_tokens_seen": 112117904, "step": 52005 }, { "epoch": 9.544870618462102, "grad_norm": 73.21305084228516, "learning_rate": 6.25638818288808e-06, "loss": 0.1998, "num_input_tokens_seen": 112129872, "step": 52010 }, { "epoch": 9.545788218021656, "grad_norm": 26.767663955688477, "learning_rate": 6.255613102338455e-06, "loss": 0.3119, "num_input_tokens_seen": 112140944, "step": 52015 }, { "epoch": 9.546705817581207, "grad_norm": 33.66529083251953, "learning_rate": 6.254837989584299e-06, "loss": 0.2386, "num_input_tokens_seen": 112152560, "step": 52020 }, { "epoch": 9.547623417140759, "grad_norm": 1.8485807180404663, "learning_rate": 6.254062844645493e-06, "loss": 0.3836, "num_input_tokens_seen": 112162384, "step": 52025 }, { "epoch": 9.548541016700312, "grad_norm": 5.414773941040039, "learning_rate": 6.253287667541918e-06, "loss": 0.3286, "num_input_tokens_seen": 112174576, "step": 52030 }, { "epoch": 9.549458616259864, "grad_norm": 40.32355499267578, "learning_rate": 6.25251245829346e-06, "loss": 0.3407, "num_input_tokens_seen": 112185616, "step": 52035 }, { "epoch": 9.550376215819416, "grad_norm": 0.40088772773742676, "learning_rate": 6.251737216919996e-06, "loss": 0.2008, "num_input_tokens_seen": 112195632, "step": 52040 }, { "epoch": 9.551293815378969, "grad_norm": 31.655357360839844, "learning_rate": 6.250961943441412e-06, "loss": 0.4464, "num_input_tokens_seen": 112207056, "step": 52045 }, { "epoch": 9.55221141493852, "grad_norm": 81.18919372558594, "learning_rate": 6.250186637877594e-06, "loss": 0.1729, "num_input_tokens_seen": 112218992, "step": 52050 }, { "epoch": 9.553129014498072, "grad_norm": 0.5222409963607788, "learning_rate": 6.249411300248427e-06, "loss": 0.2253, "num_input_tokens_seen": 112230128, "step": 52055 }, { "epoch": 9.554046614057626, "grad_norm": 18.048030853271484, "learning_rate": 6.248635930573796e-06, "loss": 0.1747, "num_input_tokens_seen": 112240176, "step": 52060 }, { "epoch": 9.554964213617177, "grad_norm": 14.688020706176758, "learning_rate": 6.247860528873588e-06, "loss": 0.2449, "num_input_tokens_seen": 112251440, "step": 52065 }, { "epoch": 9.555881813176729, "grad_norm": 1.2587462663650513, "learning_rate": 6.247085095167691e-06, "loss": 0.1318, "num_input_tokens_seen": 112262224, "step": 52070 }, { "epoch": 9.556799412736282, "grad_norm": 32.00927734375, "learning_rate": 6.246309629475995e-06, "loss": 0.4073, "num_input_tokens_seen": 112272816, "step": 52075 }, { "epoch": 9.557717012295834, "grad_norm": 33.18796157836914, "learning_rate": 6.245534131818388e-06, "loss": 0.0234, "num_input_tokens_seen": 112284720, "step": 52080 }, { "epoch": 9.558634611855386, "grad_norm": 0.35679224133491516, "learning_rate": 6.244758602214761e-06, "loss": 0.0593, "num_input_tokens_seen": 112294896, "step": 52085 }, { "epoch": 9.559552211414939, "grad_norm": 8.410780906677246, "learning_rate": 6.243983040685007e-06, "loss": 0.2558, "num_input_tokens_seen": 112306224, "step": 52090 }, { "epoch": 9.56046981097449, "grad_norm": 10.714713096618652, "learning_rate": 6.2432074472490135e-06, "loss": 0.1061, "num_input_tokens_seen": 112317136, "step": 52095 }, { "epoch": 9.561387410534042, "grad_norm": 0.7230662107467651, "learning_rate": 6.242431821926678e-06, "loss": 0.1906, "num_input_tokens_seen": 112328048, "step": 52100 }, { "epoch": 9.562305010093596, "grad_norm": 7.466335296630859, "learning_rate": 6.241656164737889e-06, "loss": 0.1437, "num_input_tokens_seen": 112339312, "step": 52105 }, { "epoch": 9.563222609653147, "grad_norm": 31.6859188079834, "learning_rate": 6.2408804757025455e-06, "loss": 0.2105, "num_input_tokens_seen": 112350160, "step": 52110 }, { "epoch": 9.564140209212699, "grad_norm": 1.2216763496398926, "learning_rate": 6.240104754840541e-06, "loss": 0.2248, "num_input_tokens_seen": 112361104, "step": 52115 }, { "epoch": 9.565057808772252, "grad_norm": 46.78203582763672, "learning_rate": 6.239329002171771e-06, "loss": 0.3657, "num_input_tokens_seen": 112372528, "step": 52120 }, { "epoch": 9.565975408331804, "grad_norm": 0.6025442481040955, "learning_rate": 6.238553217716135e-06, "loss": 0.2387, "num_input_tokens_seen": 112382928, "step": 52125 }, { "epoch": 9.566893007891355, "grad_norm": 18.808101654052734, "learning_rate": 6.237777401493526e-06, "loss": 0.4675, "num_input_tokens_seen": 112391952, "step": 52130 }, { "epoch": 9.567810607450909, "grad_norm": 6.483550548553467, "learning_rate": 6.237001553523846e-06, "loss": 0.4926, "num_input_tokens_seen": 112402800, "step": 52135 }, { "epoch": 9.56872820701046, "grad_norm": 72.81396484375, "learning_rate": 6.236225673826992e-06, "loss": 0.3469, "num_input_tokens_seen": 112414992, "step": 52140 }, { "epoch": 9.569645806570012, "grad_norm": 0.34787318110466003, "learning_rate": 6.235449762422867e-06, "loss": 0.1723, "num_input_tokens_seen": 112425904, "step": 52145 }, { "epoch": 9.570563406129565, "grad_norm": 20.58812713623047, "learning_rate": 6.23467381933137e-06, "loss": 0.2241, "num_input_tokens_seen": 112438032, "step": 52150 }, { "epoch": 9.571481005689117, "grad_norm": 1.1865726709365845, "learning_rate": 6.2338978445724045e-06, "loss": 0.0944, "num_input_tokens_seen": 112448112, "step": 52155 }, { "epoch": 9.572398605248669, "grad_norm": 3.1096103191375732, "learning_rate": 6.233121838165869e-06, "loss": 0.0801, "num_input_tokens_seen": 112458224, "step": 52160 }, { "epoch": 9.573316204808222, "grad_norm": 40.48969268798828, "learning_rate": 6.232345800131672e-06, "loss": 0.3298, "num_input_tokens_seen": 112468944, "step": 52165 }, { "epoch": 9.574233804367774, "grad_norm": 0.1841508001089096, "learning_rate": 6.231569730489713e-06, "loss": 0.2303, "num_input_tokens_seen": 112479088, "step": 52170 }, { "epoch": 9.575151403927325, "grad_norm": 12.422585487365723, "learning_rate": 6.2307936292599e-06, "loss": 0.1527, "num_input_tokens_seen": 112489456, "step": 52175 }, { "epoch": 9.576069003486879, "grad_norm": 31.58595085144043, "learning_rate": 6.230017496462138e-06, "loss": 0.2377, "num_input_tokens_seen": 112499600, "step": 52180 }, { "epoch": 9.57698660304643, "grad_norm": 0.41090813279151917, "learning_rate": 6.229241332116334e-06, "loss": 0.1913, "num_input_tokens_seen": 112510096, "step": 52185 }, { "epoch": 9.577904202605982, "grad_norm": 3.560215473175049, "learning_rate": 6.228465136242394e-06, "loss": 0.0257, "num_input_tokens_seen": 112520976, "step": 52190 }, { "epoch": 9.578821802165535, "grad_norm": 19.990541458129883, "learning_rate": 6.227688908860228e-06, "loss": 0.246, "num_input_tokens_seen": 112531600, "step": 52195 }, { "epoch": 9.579739401725087, "grad_norm": 1.016561508178711, "learning_rate": 6.2269126499897445e-06, "loss": 0.2174, "num_input_tokens_seen": 112542672, "step": 52200 }, { "epoch": 9.580657001284639, "grad_norm": 12.075236320495605, "learning_rate": 6.2261363596508515e-06, "loss": 0.1788, "num_input_tokens_seen": 112554192, "step": 52205 }, { "epoch": 9.581574600844192, "grad_norm": 0.6729776263237, "learning_rate": 6.225360037863462e-06, "loss": 0.3026, "num_input_tokens_seen": 112565840, "step": 52210 }, { "epoch": 9.582492200403744, "grad_norm": 28.908203125, "learning_rate": 6.224583684647488e-06, "loss": 0.2927, "num_input_tokens_seen": 112577040, "step": 52215 }, { "epoch": 9.583409799963295, "grad_norm": 0.4700721502304077, "learning_rate": 6.22380730002284e-06, "loss": 0.0716, "num_input_tokens_seen": 112587600, "step": 52220 }, { "epoch": 9.584327399522849, "grad_norm": 0.8008263111114502, "learning_rate": 6.223030884009431e-06, "loss": 0.2909, "num_input_tokens_seen": 112598032, "step": 52225 }, { "epoch": 9.5852449990824, "grad_norm": 3.4654769897460938, "learning_rate": 6.2222544366271745e-06, "loss": 0.071, "num_input_tokens_seen": 112608048, "step": 52230 }, { "epoch": 9.586162598641952, "grad_norm": 2.861572504043579, "learning_rate": 6.221477957895987e-06, "loss": 0.0591, "num_input_tokens_seen": 112618800, "step": 52235 }, { "epoch": 9.587080198201505, "grad_norm": 37.40300750732422, "learning_rate": 6.220701447835782e-06, "loss": 0.2329, "num_input_tokens_seen": 112629968, "step": 52240 }, { "epoch": 9.587997797761057, "grad_norm": 36.462894439697266, "learning_rate": 6.219924906466479e-06, "loss": 0.3166, "num_input_tokens_seen": 112640688, "step": 52245 }, { "epoch": 9.588915397320608, "grad_norm": 0.22493194043636322, "learning_rate": 6.219148333807991e-06, "loss": 0.2319, "num_input_tokens_seen": 112651952, "step": 52250 }, { "epoch": 9.589832996880162, "grad_norm": 25.838708877563477, "learning_rate": 6.218371729880238e-06, "loss": 0.3266, "num_input_tokens_seen": 112662448, "step": 52255 }, { "epoch": 9.590750596439714, "grad_norm": 1.4275336265563965, "learning_rate": 6.217595094703138e-06, "loss": 0.1371, "num_input_tokens_seen": 112673392, "step": 52260 }, { "epoch": 9.591668195999265, "grad_norm": 0.79146808385849, "learning_rate": 6.216818428296613e-06, "loss": 0.0457, "num_input_tokens_seen": 112685776, "step": 52265 }, { "epoch": 9.592585795558819, "grad_norm": 0.41644608974456787, "learning_rate": 6.216041730680579e-06, "loss": 0.1347, "num_input_tokens_seen": 112695664, "step": 52270 }, { "epoch": 9.59350339511837, "grad_norm": 62.12761306762695, "learning_rate": 6.2152650018749605e-06, "loss": 0.4315, "num_input_tokens_seen": 112706736, "step": 52275 }, { "epoch": 9.594420994677922, "grad_norm": 46.73654556274414, "learning_rate": 6.214488241899677e-06, "loss": 0.6878, "num_input_tokens_seen": 112716176, "step": 52280 }, { "epoch": 9.595338594237475, "grad_norm": 82.39179992675781, "learning_rate": 6.213711450774654e-06, "loss": 0.082, "num_input_tokens_seen": 112728336, "step": 52285 }, { "epoch": 9.596256193797027, "grad_norm": 17.427688598632812, "learning_rate": 6.212934628519812e-06, "loss": 0.2567, "num_input_tokens_seen": 112738608, "step": 52290 }, { "epoch": 9.597173793356578, "grad_norm": 0.224739670753479, "learning_rate": 6.212157775155077e-06, "loss": 0.0482, "num_input_tokens_seen": 112747376, "step": 52295 }, { "epoch": 9.598091392916132, "grad_norm": 17.850326538085938, "learning_rate": 6.211380890700374e-06, "loss": 0.2188, "num_input_tokens_seen": 112756528, "step": 52300 }, { "epoch": 9.599008992475683, "grad_norm": 1.6701792478561401, "learning_rate": 6.210603975175629e-06, "loss": 0.2687, "num_input_tokens_seen": 112765424, "step": 52305 }, { "epoch": 9.599926592035235, "grad_norm": 3.0276622772216797, "learning_rate": 6.209827028600768e-06, "loss": 0.5567, "num_input_tokens_seen": 112775536, "step": 52310 }, { "epoch": 9.600844191594788, "grad_norm": 115.63980102539062, "learning_rate": 6.209050050995717e-06, "loss": 0.2102, "num_input_tokens_seen": 112786832, "step": 52315 }, { "epoch": 9.60176179115434, "grad_norm": 0.312033474445343, "learning_rate": 6.208273042380408e-06, "loss": 0.1827, "num_input_tokens_seen": 112797520, "step": 52320 }, { "epoch": 9.602679390713892, "grad_norm": 35.882999420166016, "learning_rate": 6.207496002774769e-06, "loss": 0.3696, "num_input_tokens_seen": 112808400, "step": 52325 }, { "epoch": 9.603596990273445, "grad_norm": 0.30814746022224426, "learning_rate": 6.2067189321987265e-06, "loss": 0.1393, "num_input_tokens_seen": 112819088, "step": 52330 }, { "epoch": 9.604514589832997, "grad_norm": 5.819243907928467, "learning_rate": 6.205941830672215e-06, "loss": 0.1579, "num_input_tokens_seen": 112830704, "step": 52335 }, { "epoch": 9.605432189392548, "grad_norm": 79.7420425415039, "learning_rate": 6.205164698215165e-06, "loss": 0.1978, "num_input_tokens_seen": 112842192, "step": 52340 }, { "epoch": 9.606349788952102, "grad_norm": 0.9335730075836182, "learning_rate": 6.204387534847507e-06, "loss": 0.1838, "num_input_tokens_seen": 112853488, "step": 52345 }, { "epoch": 9.607267388511653, "grad_norm": 0.2864829897880554, "learning_rate": 6.203610340589177e-06, "loss": 0.007, "num_input_tokens_seen": 112864208, "step": 52350 }, { "epoch": 9.608184988071205, "grad_norm": 52.870357513427734, "learning_rate": 6.202833115460106e-06, "loss": 0.3011, "num_input_tokens_seen": 112874544, "step": 52355 }, { "epoch": 9.609102587630758, "grad_norm": 18.505455017089844, "learning_rate": 6.202055859480231e-06, "loss": 0.4394, "num_input_tokens_seen": 112885744, "step": 52360 }, { "epoch": 9.61002018719031, "grad_norm": 17.770099639892578, "learning_rate": 6.201278572669485e-06, "loss": 0.199, "num_input_tokens_seen": 112898128, "step": 52365 }, { "epoch": 9.610937786749862, "grad_norm": 33.35258102416992, "learning_rate": 6.200501255047806e-06, "loss": 0.1688, "num_input_tokens_seen": 112909936, "step": 52370 }, { "epoch": 9.611855386309415, "grad_norm": 1.562164306640625, "learning_rate": 6.19972390663513e-06, "loss": 0.0388, "num_input_tokens_seen": 112921776, "step": 52375 }, { "epoch": 9.612772985868967, "grad_norm": 0.39326241612434387, "learning_rate": 6.198946527451395e-06, "loss": 0.2376, "num_input_tokens_seen": 112931728, "step": 52380 }, { "epoch": 9.613690585428518, "grad_norm": 0.5983107686042786, "learning_rate": 6.198169117516542e-06, "loss": 0.3341, "num_input_tokens_seen": 112942256, "step": 52385 }, { "epoch": 9.614608184988072, "grad_norm": 1.913586139678955, "learning_rate": 6.197391676850505e-06, "loss": 0.353, "num_input_tokens_seen": 112951984, "step": 52390 }, { "epoch": 9.615525784547623, "grad_norm": 0.2568036913871765, "learning_rate": 6.1966142054732295e-06, "loss": 0.1932, "num_input_tokens_seen": 112961712, "step": 52395 }, { "epoch": 9.616443384107175, "grad_norm": 6.219420433044434, "learning_rate": 6.195836703404652e-06, "loss": 0.0665, "num_input_tokens_seen": 112972144, "step": 52400 }, { "epoch": 9.617360983666728, "grad_norm": 35.04374694824219, "learning_rate": 6.195059170664718e-06, "loss": 0.3317, "num_input_tokens_seen": 112983248, "step": 52405 }, { "epoch": 9.61827858322628, "grad_norm": 23.75613784790039, "learning_rate": 6.194281607273368e-06, "loss": 0.5143, "num_input_tokens_seen": 112993808, "step": 52410 }, { "epoch": 9.619196182785831, "grad_norm": 7.4129228591918945, "learning_rate": 6.193504013250546e-06, "loss": 0.3801, "num_input_tokens_seen": 113004432, "step": 52415 }, { "epoch": 9.620113782345385, "grad_norm": 4.6447930335998535, "learning_rate": 6.192726388616196e-06, "loss": 0.2424, "num_input_tokens_seen": 113013328, "step": 52420 }, { "epoch": 9.621031381904936, "grad_norm": 0.1850312501192093, "learning_rate": 6.1919487333902616e-06, "loss": 0.2305, "num_input_tokens_seen": 113023344, "step": 52425 }, { "epoch": 9.621948981464488, "grad_norm": 0.8842471837997437, "learning_rate": 6.19117104759269e-06, "loss": 0.1687, "num_input_tokens_seen": 113033872, "step": 52430 }, { "epoch": 9.622866581024041, "grad_norm": 0.4681273400783539, "learning_rate": 6.1903933312434285e-06, "loss": 0.1146, "num_input_tokens_seen": 113045776, "step": 52435 }, { "epoch": 9.623784180583593, "grad_norm": 8.656757354736328, "learning_rate": 6.1896155843624215e-06, "loss": 0.3462, "num_input_tokens_seen": 113056400, "step": 52440 }, { "epoch": 9.624701780143145, "grad_norm": 21.32419204711914, "learning_rate": 6.18883780696962e-06, "loss": 0.1515, "num_input_tokens_seen": 113066544, "step": 52445 }, { "epoch": 9.625619379702698, "grad_norm": 0.26428335905075073, "learning_rate": 6.18805999908497e-06, "loss": 0.2232, "num_input_tokens_seen": 113077904, "step": 52450 }, { "epoch": 9.62653697926225, "grad_norm": 0.8082166910171509, "learning_rate": 6.187282160728422e-06, "loss": 0.2061, "num_input_tokens_seen": 113090064, "step": 52455 }, { "epoch": 9.627454578821801, "grad_norm": 39.85143280029297, "learning_rate": 6.186504291919928e-06, "loss": 0.2111, "num_input_tokens_seen": 113100688, "step": 52460 }, { "epoch": 9.628372178381355, "grad_norm": 29.748798370361328, "learning_rate": 6.185726392679437e-06, "loss": 0.477, "num_input_tokens_seen": 113111920, "step": 52465 }, { "epoch": 9.629289777940906, "grad_norm": 0.41605910658836365, "learning_rate": 6.184948463026902e-06, "loss": 0.2427, "num_input_tokens_seen": 113121936, "step": 52470 }, { "epoch": 9.630207377500458, "grad_norm": 10.008445739746094, "learning_rate": 6.184170502982276e-06, "loss": 0.1412, "num_input_tokens_seen": 113131472, "step": 52475 }, { "epoch": 9.631124977060011, "grad_norm": 29.359582901000977, "learning_rate": 6.1833925125655104e-06, "loss": 0.1445, "num_input_tokens_seen": 113141712, "step": 52480 }, { "epoch": 9.632042576619563, "grad_norm": 27.903268814086914, "learning_rate": 6.1826144917965625e-06, "loss": 0.0875, "num_input_tokens_seen": 113152784, "step": 52485 }, { "epoch": 9.632960176179115, "grad_norm": 20.35698699951172, "learning_rate": 6.181836440695384e-06, "loss": 0.8175, "num_input_tokens_seen": 113162320, "step": 52490 }, { "epoch": 9.633877775738668, "grad_norm": 0.2853409945964813, "learning_rate": 6.181058359281935e-06, "loss": 0.0963, "num_input_tokens_seen": 113172784, "step": 52495 }, { "epoch": 9.63479537529822, "grad_norm": 24.2043514251709, "learning_rate": 6.180280247576168e-06, "loss": 0.4388, "num_input_tokens_seen": 113184976, "step": 52500 }, { "epoch": 9.635712974857771, "grad_norm": 10.105939865112305, "learning_rate": 6.179502105598041e-06, "loss": 0.2854, "num_input_tokens_seen": 113196272, "step": 52505 }, { "epoch": 9.636630574417325, "grad_norm": 0.253454327583313, "learning_rate": 6.178723933367515e-06, "loss": 0.2876, "num_input_tokens_seen": 113206800, "step": 52510 }, { "epoch": 9.637548173976876, "grad_norm": 0.49235254526138306, "learning_rate": 6.177945730904545e-06, "loss": 0.1502, "num_input_tokens_seen": 113217872, "step": 52515 }, { "epoch": 9.638465773536428, "grad_norm": 10.385824203491211, "learning_rate": 6.177167498229095e-06, "loss": 0.3208, "num_input_tokens_seen": 113229872, "step": 52520 }, { "epoch": 9.639383373095981, "grad_norm": 28.611906051635742, "learning_rate": 6.176389235361121e-06, "loss": 0.1398, "num_input_tokens_seen": 113240560, "step": 52525 }, { "epoch": 9.640300972655533, "grad_norm": 0.45590129494667053, "learning_rate": 6.175610942320588e-06, "loss": 0.3946, "num_input_tokens_seen": 113251568, "step": 52530 }, { "epoch": 9.641218572215084, "grad_norm": 36.24102783203125, "learning_rate": 6.174832619127455e-06, "loss": 0.2254, "num_input_tokens_seen": 113262544, "step": 52535 }, { "epoch": 9.642136171774638, "grad_norm": 56.444679260253906, "learning_rate": 6.174054265801686e-06, "loss": 0.1224, "num_input_tokens_seen": 113273744, "step": 52540 }, { "epoch": 9.64305377133419, "grad_norm": 1.4108556509017944, "learning_rate": 6.173275882363245e-06, "loss": 0.0999, "num_input_tokens_seen": 113284368, "step": 52545 }, { "epoch": 9.643971370893741, "grad_norm": 0.5118162035942078, "learning_rate": 6.172497468832097e-06, "loss": 0.438, "num_input_tokens_seen": 113295056, "step": 52550 }, { "epoch": 9.644888970453295, "grad_norm": 27.571792602539062, "learning_rate": 6.171719025228206e-06, "loss": 0.2897, "num_input_tokens_seen": 113304912, "step": 52555 }, { "epoch": 9.645806570012846, "grad_norm": 0.7630364298820496, "learning_rate": 6.170940551571537e-06, "loss": 0.2184, "num_input_tokens_seen": 113314992, "step": 52560 }, { "epoch": 9.646724169572398, "grad_norm": 47.04683303833008, "learning_rate": 6.170162047882059e-06, "loss": 0.0987, "num_input_tokens_seen": 113325680, "step": 52565 }, { "epoch": 9.647641769131951, "grad_norm": 0.274435818195343, "learning_rate": 6.169383514179737e-06, "loss": 0.0573, "num_input_tokens_seen": 113335920, "step": 52570 }, { "epoch": 9.648559368691503, "grad_norm": 1.7370356321334839, "learning_rate": 6.168604950484541e-06, "loss": 0.0147, "num_input_tokens_seen": 113347056, "step": 52575 }, { "epoch": 9.649476968251054, "grad_norm": 36.70925521850586, "learning_rate": 6.167826356816437e-06, "loss": 0.0761, "num_input_tokens_seen": 113357424, "step": 52580 }, { "epoch": 9.650394567810608, "grad_norm": 23.158567428588867, "learning_rate": 6.1670477331954e-06, "loss": 0.2955, "num_input_tokens_seen": 113367280, "step": 52585 }, { "epoch": 9.65131216737016, "grad_norm": 0.6541820764541626, "learning_rate": 6.166269079641396e-06, "loss": 0.2018, "num_input_tokens_seen": 113376688, "step": 52590 }, { "epoch": 9.652229766929711, "grad_norm": 24.691734313964844, "learning_rate": 6.165490396174398e-06, "loss": 0.2781, "num_input_tokens_seen": 113388176, "step": 52595 }, { "epoch": 9.653147366489264, "grad_norm": 1.7597148418426514, "learning_rate": 6.1647116828143776e-06, "loss": 0.2316, "num_input_tokens_seen": 113399376, "step": 52600 }, { "epoch": 9.654064966048816, "grad_norm": 0.24837680160999298, "learning_rate": 6.163932939581307e-06, "loss": 0.2536, "num_input_tokens_seen": 113409104, "step": 52605 }, { "epoch": 9.654982565608368, "grad_norm": 5.796307563781738, "learning_rate": 6.163154166495161e-06, "loss": 0.0146, "num_input_tokens_seen": 113419888, "step": 52610 }, { "epoch": 9.655900165167921, "grad_norm": 48.823360443115234, "learning_rate": 6.162375363575912e-06, "loss": 0.1663, "num_input_tokens_seen": 113429264, "step": 52615 }, { "epoch": 9.656817764727473, "grad_norm": 26.631820678710938, "learning_rate": 6.1615965308435384e-06, "loss": 0.2379, "num_input_tokens_seen": 113440432, "step": 52620 }, { "epoch": 9.657735364287024, "grad_norm": 18.491777420043945, "learning_rate": 6.160817668318013e-06, "loss": 0.0251, "num_input_tokens_seen": 113450832, "step": 52625 }, { "epoch": 9.658652963846578, "grad_norm": 6.277571201324463, "learning_rate": 6.160038776019314e-06, "loss": 0.1344, "num_input_tokens_seen": 113460592, "step": 52630 }, { "epoch": 9.65957056340613, "grad_norm": 43.80912399291992, "learning_rate": 6.159259853967419e-06, "loss": 0.6747, "num_input_tokens_seen": 113471504, "step": 52635 }, { "epoch": 9.660488162965681, "grad_norm": 19.46863555908203, "learning_rate": 6.1584809021823035e-06, "loss": 0.3315, "num_input_tokens_seen": 113482832, "step": 52640 }, { "epoch": 9.661405762525234, "grad_norm": 28.199792861938477, "learning_rate": 6.157701920683949e-06, "loss": 0.0901, "num_input_tokens_seen": 113493840, "step": 52645 }, { "epoch": 9.662323362084786, "grad_norm": 1.8407970666885376, "learning_rate": 6.156922909492336e-06, "loss": 0.3012, "num_input_tokens_seen": 113504848, "step": 52650 }, { "epoch": 9.663240961644338, "grad_norm": 25.864559173583984, "learning_rate": 6.156143868627442e-06, "loss": 0.2108, "num_input_tokens_seen": 113516240, "step": 52655 }, { "epoch": 9.664158561203891, "grad_norm": 22.10101318359375, "learning_rate": 6.15536479810925e-06, "loss": 0.2502, "num_input_tokens_seen": 113528528, "step": 52660 }, { "epoch": 9.665076160763443, "grad_norm": 76.79175567626953, "learning_rate": 6.1545856979577414e-06, "loss": 0.6735, "num_input_tokens_seen": 113539536, "step": 52665 }, { "epoch": 9.665993760322994, "grad_norm": 0.013324988074600697, "learning_rate": 6.1538065681928994e-06, "loss": 0.0074, "num_input_tokens_seen": 113550256, "step": 52670 }, { "epoch": 9.666911359882548, "grad_norm": 47.95029067993164, "learning_rate": 6.1530274088347075e-06, "loss": 0.1608, "num_input_tokens_seen": 113561136, "step": 52675 }, { "epoch": 9.6678289594421, "grad_norm": 3.048384189605713, "learning_rate": 6.152248219903149e-06, "loss": 0.2459, "num_input_tokens_seen": 113572976, "step": 52680 }, { "epoch": 9.66874655900165, "grad_norm": 3.323042869567871, "learning_rate": 6.15146900141821e-06, "loss": 0.07, "num_input_tokens_seen": 113583696, "step": 52685 }, { "epoch": 9.669664158561204, "grad_norm": 103.17117309570312, "learning_rate": 6.150689753399875e-06, "loss": 0.2437, "num_input_tokens_seen": 113595088, "step": 52690 }, { "epoch": 9.670581758120756, "grad_norm": 0.787893533706665, "learning_rate": 6.149910475868133e-06, "loss": 0.0627, "num_input_tokens_seen": 113606416, "step": 52695 }, { "epoch": 9.671499357680307, "grad_norm": 1.1020770072937012, "learning_rate": 6.149131168842967e-06, "loss": 0.2166, "num_input_tokens_seen": 113618320, "step": 52700 }, { "epoch": 9.67241695723986, "grad_norm": 35.55779266357422, "learning_rate": 6.148351832344369e-06, "loss": 0.6785, "num_input_tokens_seen": 113629360, "step": 52705 }, { "epoch": 9.673334556799412, "grad_norm": 37.940155029296875, "learning_rate": 6.147572466392325e-06, "loss": 0.4008, "num_input_tokens_seen": 113641648, "step": 52710 }, { "epoch": 9.674252156358964, "grad_norm": 28.292713165283203, "learning_rate": 6.146793071006828e-06, "loss": 0.1692, "num_input_tokens_seen": 113653328, "step": 52715 }, { "epoch": 9.675169755918517, "grad_norm": 0.3827644884586334, "learning_rate": 6.146013646207864e-06, "loss": 0.0215, "num_input_tokens_seen": 113663760, "step": 52720 }, { "epoch": 9.676087355478069, "grad_norm": 36.69986343383789, "learning_rate": 6.145234192015427e-06, "loss": 0.6586, "num_input_tokens_seen": 113674640, "step": 52725 }, { "epoch": 9.67700495503762, "grad_norm": 45.334815979003906, "learning_rate": 6.1444547084495075e-06, "loss": 0.3799, "num_input_tokens_seen": 113686000, "step": 52730 }, { "epoch": 9.677922554597174, "grad_norm": 87.17575073242188, "learning_rate": 6.143675195530099e-06, "loss": 0.3743, "num_input_tokens_seen": 113697040, "step": 52735 }, { "epoch": 9.678840154156726, "grad_norm": 18.22649574279785, "learning_rate": 6.142895653277194e-06, "loss": 0.15, "num_input_tokens_seen": 113706672, "step": 52740 }, { "epoch": 9.679757753716277, "grad_norm": 5.023112773895264, "learning_rate": 6.142116081710787e-06, "loss": 0.0377, "num_input_tokens_seen": 113716752, "step": 52745 }, { "epoch": 9.68067535327583, "grad_norm": 32.75153732299805, "learning_rate": 6.141336480850873e-06, "loss": 0.2763, "num_input_tokens_seen": 113727664, "step": 52750 }, { "epoch": 9.681592952835382, "grad_norm": 11.249719619750977, "learning_rate": 6.140556850717446e-06, "loss": 0.2967, "num_input_tokens_seen": 113737808, "step": 52755 }, { "epoch": 9.682510552394934, "grad_norm": 66.485107421875, "learning_rate": 6.139777191330504e-06, "loss": 0.2109, "num_input_tokens_seen": 113748624, "step": 52760 }, { "epoch": 9.683428151954487, "grad_norm": 43.99803924560547, "learning_rate": 6.1389975027100445e-06, "loss": 0.3259, "num_input_tokens_seen": 113758000, "step": 52765 }, { "epoch": 9.684345751514039, "grad_norm": 7.961881160736084, "learning_rate": 6.1382177848760625e-06, "loss": 0.5818, "num_input_tokens_seen": 113768464, "step": 52770 }, { "epoch": 9.68526335107359, "grad_norm": 1.1647164821624756, "learning_rate": 6.13743803784856e-06, "loss": 0.108, "num_input_tokens_seen": 113779088, "step": 52775 }, { "epoch": 9.686180950633144, "grad_norm": 0.1338956356048584, "learning_rate": 6.136658261647535e-06, "loss": 0.0962, "num_input_tokens_seen": 113789776, "step": 52780 }, { "epoch": 9.687098550192696, "grad_norm": 1.2595219612121582, "learning_rate": 6.135878456292986e-06, "loss": 0.2208, "num_input_tokens_seen": 113799792, "step": 52785 }, { "epoch": 9.688016149752247, "grad_norm": 1.9217833280563354, "learning_rate": 6.135098621804915e-06, "loss": 0.1272, "num_input_tokens_seen": 113811024, "step": 52790 }, { "epoch": 9.6889337493118, "grad_norm": 31.87211036682129, "learning_rate": 6.134318758203325e-06, "loss": 0.3262, "num_input_tokens_seen": 113820304, "step": 52795 }, { "epoch": 9.689851348871352, "grad_norm": 9.69437313079834, "learning_rate": 6.133538865508215e-06, "loss": 0.1611, "num_input_tokens_seen": 113830896, "step": 52800 }, { "epoch": 9.690768948430904, "grad_norm": 39.48651123046875, "learning_rate": 6.132758943739592e-06, "loss": 0.2443, "num_input_tokens_seen": 113841200, "step": 52805 }, { "epoch": 9.691686547990457, "grad_norm": 4.914370059967041, "learning_rate": 6.131978992917455e-06, "loss": 0.6385, "num_input_tokens_seen": 113851920, "step": 52810 }, { "epoch": 9.692604147550009, "grad_norm": 11.423584938049316, "learning_rate": 6.1311990130618135e-06, "loss": 0.1611, "num_input_tokens_seen": 113861680, "step": 52815 }, { "epoch": 9.69352174710956, "grad_norm": 2.395862102508545, "learning_rate": 6.13041900419267e-06, "loss": 0.1699, "num_input_tokens_seen": 113872400, "step": 52820 }, { "epoch": 9.694439346669114, "grad_norm": 21.323699951171875, "learning_rate": 6.12963896633003e-06, "loss": 0.3056, "num_input_tokens_seen": 113883216, "step": 52825 }, { "epoch": 9.695356946228666, "grad_norm": 0.5455843806266785, "learning_rate": 6.128858899493903e-06, "loss": 0.2006, "num_input_tokens_seen": 113894448, "step": 52830 }, { "epoch": 9.696274545788217, "grad_norm": 3.013486862182617, "learning_rate": 6.128078803704294e-06, "loss": 0.2327, "num_input_tokens_seen": 113906128, "step": 52835 }, { "epoch": 9.69719214534777, "grad_norm": 1.721657633781433, "learning_rate": 6.1272986789812125e-06, "loss": 0.1976, "num_input_tokens_seen": 113916944, "step": 52840 }, { "epoch": 9.698109744907322, "grad_norm": 0.7641655206680298, "learning_rate": 6.126518525344666e-06, "loss": 0.3038, "num_input_tokens_seen": 113927856, "step": 52845 }, { "epoch": 9.699027344466874, "grad_norm": 0.7442042827606201, "learning_rate": 6.125738342814667e-06, "loss": 0.1504, "num_input_tokens_seen": 113938992, "step": 52850 }, { "epoch": 9.699944944026427, "grad_norm": 16.860273361206055, "learning_rate": 6.1249581314112214e-06, "loss": 0.2271, "num_input_tokens_seen": 113949840, "step": 52855 }, { "epoch": 9.700862543585979, "grad_norm": 0.783839762210846, "learning_rate": 6.124177891154345e-06, "loss": 0.2455, "num_input_tokens_seen": 113960336, "step": 52860 }, { "epoch": 9.70178014314553, "grad_norm": 1.9555678367614746, "learning_rate": 6.123397622064049e-06, "loss": 0.0881, "num_input_tokens_seen": 113971472, "step": 52865 }, { "epoch": 9.702697742705084, "grad_norm": 76.20020294189453, "learning_rate": 6.122617324160343e-06, "loss": 0.1244, "num_input_tokens_seen": 113982160, "step": 52870 }, { "epoch": 9.703615342264635, "grad_norm": 3.1280364990234375, "learning_rate": 6.121836997463244e-06, "loss": 0.2664, "num_input_tokens_seen": 113993584, "step": 52875 }, { "epoch": 9.704532941824187, "grad_norm": 12.756114959716797, "learning_rate": 6.121056641992764e-06, "loss": 0.2335, "num_input_tokens_seen": 114002864, "step": 52880 }, { "epoch": 9.70545054138374, "grad_norm": 6.681281566619873, "learning_rate": 6.120276257768918e-06, "loss": 0.1893, "num_input_tokens_seen": 114014288, "step": 52885 }, { "epoch": 9.706368140943292, "grad_norm": 21.14278793334961, "learning_rate": 6.1194958448117246e-06, "loss": 0.205, "num_input_tokens_seen": 114023856, "step": 52890 }, { "epoch": 9.707285740502844, "grad_norm": 0.4426317811012268, "learning_rate": 6.118715403141197e-06, "loss": 0.1152, "num_input_tokens_seen": 114034224, "step": 52895 }, { "epoch": 9.708203340062397, "grad_norm": 0.24104641377925873, "learning_rate": 6.117934932777352e-06, "loss": 0.3744, "num_input_tokens_seen": 114044912, "step": 52900 }, { "epoch": 9.709120939621949, "grad_norm": 0.4468812346458435, "learning_rate": 6.117154433740209e-06, "loss": 0.1773, "num_input_tokens_seen": 114055088, "step": 52905 }, { "epoch": 9.7100385391815, "grad_norm": 0.762971818447113, "learning_rate": 6.1163739060497865e-06, "loss": 0.0303, "num_input_tokens_seen": 114064336, "step": 52910 }, { "epoch": 9.710956138741054, "grad_norm": 75.46697998046875, "learning_rate": 6.115593349726104e-06, "loss": 0.249, "num_input_tokens_seen": 114074608, "step": 52915 }, { "epoch": 9.711873738300605, "grad_norm": 0.6979792714118958, "learning_rate": 6.114812764789179e-06, "loss": 0.0599, "num_input_tokens_seen": 114086416, "step": 52920 }, { "epoch": 9.712791337860157, "grad_norm": 22.503433227539062, "learning_rate": 6.114032151259038e-06, "loss": 0.4368, "num_input_tokens_seen": 114096592, "step": 52925 }, { "epoch": 9.71370893741971, "grad_norm": 0.07763238251209259, "learning_rate": 6.113251509155695e-06, "loss": 0.2514, "num_input_tokens_seen": 114107920, "step": 52930 }, { "epoch": 9.714626536979262, "grad_norm": 20.331928253173828, "learning_rate": 6.112470838499178e-06, "loss": 0.0249, "num_input_tokens_seen": 114118032, "step": 52935 }, { "epoch": 9.715544136538814, "grad_norm": 1.830776572227478, "learning_rate": 6.1116901393095086e-06, "loss": 0.1421, "num_input_tokens_seen": 114130160, "step": 52940 }, { "epoch": 9.716461736098367, "grad_norm": 45.834739685058594, "learning_rate": 6.110909411606709e-06, "loss": 0.4158, "num_input_tokens_seen": 114140144, "step": 52945 }, { "epoch": 9.717379335657919, "grad_norm": 27.990020751953125, "learning_rate": 6.110128655410806e-06, "loss": 0.0411, "num_input_tokens_seen": 114149648, "step": 52950 }, { "epoch": 9.71829693521747, "grad_norm": 3.512382984161377, "learning_rate": 6.109347870741823e-06, "loss": 0.1092, "num_input_tokens_seen": 114159952, "step": 52955 }, { "epoch": 9.719214534777024, "grad_norm": 17.62140655517578, "learning_rate": 6.1085670576197855e-06, "loss": 0.2356, "num_input_tokens_seen": 114171568, "step": 52960 }, { "epoch": 9.720132134336575, "grad_norm": 58.86243438720703, "learning_rate": 6.107786216064723e-06, "loss": 0.4022, "num_input_tokens_seen": 114182704, "step": 52965 }, { "epoch": 9.721049733896127, "grad_norm": 0.25446847081184387, "learning_rate": 6.107005346096659e-06, "loss": 0.1063, "num_input_tokens_seen": 114194192, "step": 52970 }, { "epoch": 9.72196733345568, "grad_norm": 7.963291645050049, "learning_rate": 6.106224447735625e-06, "loss": 0.0968, "num_input_tokens_seen": 114204752, "step": 52975 }, { "epoch": 9.722884933015232, "grad_norm": 136.8043212890625, "learning_rate": 6.105443521001648e-06, "loss": 0.2805, "num_input_tokens_seen": 114214960, "step": 52980 }, { "epoch": 9.723802532574783, "grad_norm": 11.667642593383789, "learning_rate": 6.104662565914758e-06, "loss": 0.1207, "num_input_tokens_seen": 114226032, "step": 52985 }, { "epoch": 9.724720132134337, "grad_norm": 80.73779296875, "learning_rate": 6.1038815824949845e-06, "loss": 0.3549, "num_input_tokens_seen": 114237136, "step": 52990 }, { "epoch": 9.725637731693888, "grad_norm": 0.2588914632797241, "learning_rate": 6.1031005707623595e-06, "loss": 0.364, "num_input_tokens_seen": 114248400, "step": 52995 }, { "epoch": 9.72655533125344, "grad_norm": 38.14986038208008, "learning_rate": 6.102319530736916e-06, "loss": 0.4169, "num_input_tokens_seen": 114259536, "step": 53000 }, { "epoch": 9.727472930812993, "grad_norm": 11.819968223571777, "learning_rate": 6.101538462438684e-06, "loss": 0.4489, "num_input_tokens_seen": 114271248, "step": 53005 }, { "epoch": 9.728390530372545, "grad_norm": 7.440634727478027, "learning_rate": 6.100757365887698e-06, "loss": 0.1358, "num_input_tokens_seen": 114281936, "step": 53010 }, { "epoch": 9.729308129932097, "grad_norm": 12.689192771911621, "learning_rate": 6.099976241103991e-06, "loss": 0.2312, "num_input_tokens_seen": 114292368, "step": 53015 }, { "epoch": 9.73022572949165, "grad_norm": 30.916521072387695, "learning_rate": 6.099195088107598e-06, "loss": 0.193, "num_input_tokens_seen": 114302800, "step": 53020 }, { "epoch": 9.731143329051202, "grad_norm": 26.82723617553711, "learning_rate": 6.098413906918556e-06, "loss": 0.3308, "num_input_tokens_seen": 114313904, "step": 53025 }, { "epoch": 9.732060928610753, "grad_norm": 3.437765121459961, "learning_rate": 6.097632697556898e-06, "loss": 0.2839, "num_input_tokens_seen": 114324176, "step": 53030 }, { "epoch": 9.732978528170307, "grad_norm": 7.789649486541748, "learning_rate": 6.096851460042663e-06, "loss": 0.274, "num_input_tokens_seen": 114335760, "step": 53035 }, { "epoch": 9.733896127729858, "grad_norm": 1.2434282302856445, "learning_rate": 6.096070194395888e-06, "loss": 0.1152, "num_input_tokens_seen": 114345200, "step": 53040 }, { "epoch": 9.73481372728941, "grad_norm": 12.754304885864258, "learning_rate": 6.095288900636611e-06, "loss": 0.2078, "num_input_tokens_seen": 114353488, "step": 53045 }, { "epoch": 9.735731326848963, "grad_norm": 23.37908172607422, "learning_rate": 6.094507578784872e-06, "loss": 0.0719, "num_input_tokens_seen": 114362000, "step": 53050 }, { "epoch": 9.736648926408515, "grad_norm": 14.30306625366211, "learning_rate": 6.093726228860709e-06, "loss": 0.0258, "num_input_tokens_seen": 114371824, "step": 53055 }, { "epoch": 9.737566525968067, "grad_norm": 23.704727172851562, "learning_rate": 6.092944850884165e-06, "loss": 0.3051, "num_input_tokens_seen": 114381104, "step": 53060 }, { "epoch": 9.73848412552762, "grad_norm": 71.88140106201172, "learning_rate": 6.092163444875278e-06, "loss": 0.2895, "num_input_tokens_seen": 114392208, "step": 53065 }, { "epoch": 9.739401725087172, "grad_norm": 13.453338623046875, "learning_rate": 6.091382010854091e-06, "loss": 0.0269, "num_input_tokens_seen": 114402064, "step": 53070 }, { "epoch": 9.740319324646723, "grad_norm": 1.0381020307540894, "learning_rate": 6.0906005488406485e-06, "loss": 0.1302, "num_input_tokens_seen": 114413296, "step": 53075 }, { "epoch": 9.741236924206277, "grad_norm": 1.1943787336349487, "learning_rate": 6.089819058854991e-06, "loss": 0.1843, "num_input_tokens_seen": 114424368, "step": 53080 }, { "epoch": 9.742154523765828, "grad_norm": 53.388641357421875, "learning_rate": 6.089037540917165e-06, "loss": 0.1937, "num_input_tokens_seen": 114435376, "step": 53085 }, { "epoch": 9.74307212332538, "grad_norm": 0.3649942874908447, "learning_rate": 6.088255995047212e-06, "loss": 0.4052, "num_input_tokens_seen": 114446288, "step": 53090 }, { "epoch": 9.743989722884933, "grad_norm": 0.8265279531478882, "learning_rate": 6.087474421265179e-06, "loss": 0.1016, "num_input_tokens_seen": 114456432, "step": 53095 }, { "epoch": 9.744907322444485, "grad_norm": 2.599191904067993, "learning_rate": 6.0866928195911155e-06, "loss": 0.0059, "num_input_tokens_seen": 114466384, "step": 53100 }, { "epoch": 9.745824922004036, "grad_norm": 27.844717025756836, "learning_rate": 6.085911190045064e-06, "loss": 0.2358, "num_input_tokens_seen": 114477520, "step": 53105 }, { "epoch": 9.74674252156359, "grad_norm": 3.54063081741333, "learning_rate": 6.0851295326470726e-06, "loss": 0.146, "num_input_tokens_seen": 114488272, "step": 53110 }, { "epoch": 9.747660121123142, "grad_norm": 22.803085327148438, "learning_rate": 6.0843478474171925e-06, "loss": 0.3306, "num_input_tokens_seen": 114500112, "step": 53115 }, { "epoch": 9.748577720682695, "grad_norm": 0.008670293726027012, "learning_rate": 6.083566134375468e-06, "loss": 0.1314, "num_input_tokens_seen": 114510832, "step": 53120 }, { "epoch": 9.749495320242247, "grad_norm": 27.257823944091797, "learning_rate": 6.082784393541954e-06, "loss": 0.1234, "num_input_tokens_seen": 114521552, "step": 53125 }, { "epoch": 9.750412919801798, "grad_norm": 5.902767181396484, "learning_rate": 6.082002624936697e-06, "loss": 0.3401, "num_input_tokens_seen": 114532400, "step": 53130 }, { "epoch": 9.751330519361352, "grad_norm": 0.32116058468818665, "learning_rate": 6.08122082857975e-06, "loss": 0.3693, "num_input_tokens_seen": 114543888, "step": 53135 }, { "epoch": 9.752248118920903, "grad_norm": 0.42813193798065186, "learning_rate": 6.080439004491165e-06, "loss": 0.2787, "num_input_tokens_seen": 114553936, "step": 53140 }, { "epoch": 9.753165718480455, "grad_norm": 11.539250373840332, "learning_rate": 6.079657152690993e-06, "loss": 0.2469, "num_input_tokens_seen": 114565040, "step": 53145 }, { "epoch": 9.754083318040008, "grad_norm": 29.5216121673584, "learning_rate": 6.07887527319929e-06, "loss": 0.1594, "num_input_tokens_seen": 114575600, "step": 53150 }, { "epoch": 9.75500091759956, "grad_norm": 18.925188064575195, "learning_rate": 6.078093366036105e-06, "loss": 0.1002, "num_input_tokens_seen": 114587056, "step": 53155 }, { "epoch": 9.755918517159111, "grad_norm": 0.3407424986362457, "learning_rate": 6.0773114312215e-06, "loss": 0.0233, "num_input_tokens_seen": 114598576, "step": 53160 }, { "epoch": 9.756836116718665, "grad_norm": 32.25423049926758, "learning_rate": 6.076529468775524e-06, "loss": 0.3397, "num_input_tokens_seen": 114608816, "step": 53165 }, { "epoch": 9.757753716278216, "grad_norm": 18.15895652770996, "learning_rate": 6.075747478718237e-06, "loss": 0.1123, "num_input_tokens_seen": 114619568, "step": 53170 }, { "epoch": 9.758671315837768, "grad_norm": 2.460005044937134, "learning_rate": 6.074965461069693e-06, "loss": 0.0913, "num_input_tokens_seen": 114630512, "step": 53175 }, { "epoch": 9.759588915397321, "grad_norm": 12.652441024780273, "learning_rate": 6.074183415849952e-06, "loss": 0.0931, "num_input_tokens_seen": 114641776, "step": 53180 }, { "epoch": 9.760506514956873, "grad_norm": 6.04133939743042, "learning_rate": 6.073401343079071e-06, "loss": 0.185, "num_input_tokens_seen": 114652240, "step": 53185 }, { "epoch": 9.761424114516425, "grad_norm": 17.94666862487793, "learning_rate": 6.072619242777109e-06, "loss": 0.0112, "num_input_tokens_seen": 114663024, "step": 53190 }, { "epoch": 9.762341714075978, "grad_norm": 0.8825531005859375, "learning_rate": 6.0718371149641266e-06, "loss": 0.2477, "num_input_tokens_seen": 114674992, "step": 53195 }, { "epoch": 9.76325931363553, "grad_norm": 59.62461853027344, "learning_rate": 6.071054959660182e-06, "loss": 0.4368, "num_input_tokens_seen": 114685936, "step": 53200 }, { "epoch": 9.764176913195081, "grad_norm": 0.2693750262260437, "learning_rate": 6.070272776885338e-06, "loss": 0.3052, "num_input_tokens_seen": 114697168, "step": 53205 }, { "epoch": 9.765094512754635, "grad_norm": 10.479473114013672, "learning_rate": 6.069490566659657e-06, "loss": 0.2022, "num_input_tokens_seen": 114708400, "step": 53210 }, { "epoch": 9.766012112314186, "grad_norm": 0.32864269614219666, "learning_rate": 6.068708329003201e-06, "loss": 0.2825, "num_input_tokens_seen": 114718928, "step": 53215 }, { "epoch": 9.766929711873738, "grad_norm": 0.21077613532543182, "learning_rate": 6.067926063936031e-06, "loss": 0.1003, "num_input_tokens_seen": 114728944, "step": 53220 }, { "epoch": 9.767847311433291, "grad_norm": 91.0827865600586, "learning_rate": 6.067143771478213e-06, "loss": 0.7885, "num_input_tokens_seen": 114738832, "step": 53225 }, { "epoch": 9.768764910992843, "grad_norm": 0.3329285681247711, "learning_rate": 6.066361451649812e-06, "loss": 0.2602, "num_input_tokens_seen": 114749648, "step": 53230 }, { "epoch": 9.769682510552395, "grad_norm": 4.3495192527771, "learning_rate": 6.065579104470892e-06, "loss": 0.2987, "num_input_tokens_seen": 114760944, "step": 53235 }, { "epoch": 9.770600110111948, "grad_norm": 0.23235417902469635, "learning_rate": 6.06479672996152e-06, "loss": 0.2294, "num_input_tokens_seen": 114772240, "step": 53240 }, { "epoch": 9.7715177096715, "grad_norm": 0.8576857447624207, "learning_rate": 6.064014328141762e-06, "loss": 0.0983, "num_input_tokens_seen": 114783312, "step": 53245 }, { "epoch": 9.772435309231051, "grad_norm": 11.493413925170898, "learning_rate": 6.063231899031684e-06, "loss": 0.4327, "num_input_tokens_seen": 114793808, "step": 53250 }, { "epoch": 9.773352908790605, "grad_norm": 0.042923785746097565, "learning_rate": 6.062449442651357e-06, "loss": 0.0621, "num_input_tokens_seen": 114805328, "step": 53255 }, { "epoch": 9.774270508350156, "grad_norm": 25.28897476196289, "learning_rate": 6.061666959020849e-06, "loss": 0.374, "num_input_tokens_seen": 114815888, "step": 53260 }, { "epoch": 9.775188107909708, "grad_norm": 22.208620071411133, "learning_rate": 6.060884448160227e-06, "loss": 0.2195, "num_input_tokens_seen": 114825904, "step": 53265 }, { "epoch": 9.776105707469261, "grad_norm": 6.939566612243652, "learning_rate": 6.0601019100895654e-06, "loss": 0.4579, "num_input_tokens_seen": 114837072, "step": 53270 }, { "epoch": 9.777023307028813, "grad_norm": 21.533443450927734, "learning_rate": 6.0593193448289315e-06, "loss": 0.0757, "num_input_tokens_seen": 114847216, "step": 53275 }, { "epoch": 9.777940906588364, "grad_norm": 169.69139099121094, "learning_rate": 6.058536752398398e-06, "loss": 0.0604, "num_input_tokens_seen": 114857200, "step": 53280 }, { "epoch": 9.778858506147918, "grad_norm": 104.07671356201172, "learning_rate": 6.057754132818038e-06, "loss": 0.1776, "num_input_tokens_seen": 114868176, "step": 53285 }, { "epoch": 9.77977610570747, "grad_norm": 0.5182855129241943, "learning_rate": 6.056971486107923e-06, "loss": 0.0536, "num_input_tokens_seen": 114878544, "step": 53290 }, { "epoch": 9.780693705267021, "grad_norm": 81.9991455078125, "learning_rate": 6.056188812288129e-06, "loss": 0.2105, "num_input_tokens_seen": 114889616, "step": 53295 }, { "epoch": 9.781611304826574, "grad_norm": 157.64901733398438, "learning_rate": 6.055406111378727e-06, "loss": 0.2481, "num_input_tokens_seen": 114900144, "step": 53300 }, { "epoch": 9.782528904386126, "grad_norm": 5.8726654052734375, "learning_rate": 6.0546233833997956e-06, "loss": 0.1067, "num_input_tokens_seen": 114911856, "step": 53305 }, { "epoch": 9.783446503945678, "grad_norm": 25.235950469970703, "learning_rate": 6.053840628371408e-06, "loss": 0.1472, "num_input_tokens_seen": 114921296, "step": 53310 }, { "epoch": 9.784364103505231, "grad_norm": 25.638553619384766, "learning_rate": 6.053057846313642e-06, "loss": 0.2034, "num_input_tokens_seen": 114932464, "step": 53315 }, { "epoch": 9.785281703064783, "grad_norm": 7.961571216583252, "learning_rate": 6.052275037246575e-06, "loss": 0.2752, "num_input_tokens_seen": 114944208, "step": 53320 }, { "epoch": 9.786199302624334, "grad_norm": 8.497056007385254, "learning_rate": 6.051492201190285e-06, "loss": 0.2183, "num_input_tokens_seen": 114954320, "step": 53325 }, { "epoch": 9.787116902183888, "grad_norm": 0.5424695611000061, "learning_rate": 6.050709338164846e-06, "loss": 0.4183, "num_input_tokens_seen": 114963728, "step": 53330 }, { "epoch": 9.78803450174344, "grad_norm": 47.9976921081543, "learning_rate": 6.049926448190344e-06, "loss": 0.3878, "num_input_tokens_seen": 114975792, "step": 53335 }, { "epoch": 9.788952101302991, "grad_norm": 1.0861890316009521, "learning_rate": 6.049143531286855e-06, "loss": 0.3887, "num_input_tokens_seen": 114987408, "step": 53340 }, { "epoch": 9.789869700862544, "grad_norm": 0.2242838591337204, "learning_rate": 6.048360587474461e-06, "loss": 0.2829, "num_input_tokens_seen": 114998416, "step": 53345 }, { "epoch": 9.790787300422096, "grad_norm": 0.4701036214828491, "learning_rate": 6.047577616773242e-06, "loss": 0.3794, "num_input_tokens_seen": 115009456, "step": 53350 }, { "epoch": 9.791704899981648, "grad_norm": 0.6331658363342285, "learning_rate": 6.0467946192032815e-06, "loss": 0.1181, "num_input_tokens_seen": 115020080, "step": 53355 }, { "epoch": 9.792622499541201, "grad_norm": 0.13619455695152283, "learning_rate": 6.04601159478466e-06, "loss": 0.1836, "num_input_tokens_seen": 115031568, "step": 53360 }, { "epoch": 9.793540099100753, "grad_norm": 19.836647033691406, "learning_rate": 6.045228543537463e-06, "loss": 0.4447, "num_input_tokens_seen": 115041488, "step": 53365 }, { "epoch": 9.794457698660304, "grad_norm": 135.7831573486328, "learning_rate": 6.044445465481774e-06, "loss": 0.0983, "num_input_tokens_seen": 115052624, "step": 53370 }, { "epoch": 9.795375298219858, "grad_norm": 4.1820969581604, "learning_rate": 6.043662360637678e-06, "loss": 0.2888, "num_input_tokens_seen": 115062928, "step": 53375 }, { "epoch": 9.79629289777941, "grad_norm": 31.969728469848633, "learning_rate": 6.0428792290252595e-06, "loss": 0.2982, "num_input_tokens_seen": 115074128, "step": 53380 }, { "epoch": 9.79721049733896, "grad_norm": 8.76567268371582, "learning_rate": 6.0420960706646046e-06, "loss": 0.4854, "num_input_tokens_seen": 115085264, "step": 53385 }, { "epoch": 9.798128096898514, "grad_norm": 24.657329559326172, "learning_rate": 6.0413128855758e-06, "loss": 0.4291, "num_input_tokens_seen": 115096272, "step": 53390 }, { "epoch": 9.799045696458066, "grad_norm": 0.26072463393211365, "learning_rate": 6.040529673778936e-06, "loss": 0.3095, "num_input_tokens_seen": 115106960, "step": 53395 }, { "epoch": 9.799963296017618, "grad_norm": 21.210914611816406, "learning_rate": 6.039746435294097e-06, "loss": 0.2637, "num_input_tokens_seen": 115118032, "step": 53400 }, { "epoch": 9.800880895577171, "grad_norm": 0.4057970345020294, "learning_rate": 6.038963170141374e-06, "loss": 0.0142, "num_input_tokens_seen": 115130160, "step": 53405 }, { "epoch": 9.801798495136723, "grad_norm": 19.93916893005371, "learning_rate": 6.0381798783408555e-06, "loss": 0.343, "num_input_tokens_seen": 115141840, "step": 53410 }, { "epoch": 9.802716094696274, "grad_norm": 8.11959171295166, "learning_rate": 6.037396559912631e-06, "loss": 0.2517, "num_input_tokens_seen": 115151632, "step": 53415 }, { "epoch": 9.803633694255828, "grad_norm": 4.22977876663208, "learning_rate": 6.036613214876795e-06, "loss": 0.2186, "num_input_tokens_seen": 115161744, "step": 53420 }, { "epoch": 9.80455129381538, "grad_norm": 16.461631774902344, "learning_rate": 6.0358298432534355e-06, "loss": 0.0253, "num_input_tokens_seen": 115172304, "step": 53425 }, { "epoch": 9.80546889337493, "grad_norm": 24.997901916503906, "learning_rate": 6.035046445062647e-06, "loss": 0.1949, "num_input_tokens_seen": 115183152, "step": 53430 }, { "epoch": 9.806386492934484, "grad_norm": 0.07984629273414612, "learning_rate": 6.0342630203245204e-06, "loss": 0.1207, "num_input_tokens_seen": 115194800, "step": 53435 }, { "epoch": 9.807304092494036, "grad_norm": 0.9276477694511414, "learning_rate": 6.03347956905915e-06, "loss": 0.2181, "num_input_tokens_seen": 115205968, "step": 53440 }, { "epoch": 9.808221692053587, "grad_norm": 26.78583335876465, "learning_rate": 6.03269609128663e-06, "loss": 0.2893, "num_input_tokens_seen": 115216144, "step": 53445 }, { "epoch": 9.80913929161314, "grad_norm": 0.9584678411483765, "learning_rate": 6.031912587027057e-06, "loss": 0.2395, "num_input_tokens_seen": 115227536, "step": 53450 }, { "epoch": 9.810056891172692, "grad_norm": 0.3902648389339447, "learning_rate": 6.031129056300526e-06, "loss": 0.3849, "num_input_tokens_seen": 115238064, "step": 53455 }, { "epoch": 9.810974490732244, "grad_norm": 1.1302251815795898, "learning_rate": 6.030345499127131e-06, "loss": 0.2162, "num_input_tokens_seen": 115247984, "step": 53460 }, { "epoch": 9.811892090291797, "grad_norm": 34.22707748413086, "learning_rate": 6.029561915526971e-06, "loss": 0.4471, "num_input_tokens_seen": 115258864, "step": 53465 }, { "epoch": 9.812809689851349, "grad_norm": 8.47825813293457, "learning_rate": 6.0287783055201445e-06, "loss": 0.2445, "num_input_tokens_seen": 115269968, "step": 53470 }, { "epoch": 9.8137272894109, "grad_norm": 0.30437153577804565, "learning_rate": 6.027994669126748e-06, "loss": 0.2698, "num_input_tokens_seen": 115279888, "step": 53475 }, { "epoch": 9.814644888970454, "grad_norm": 94.85050201416016, "learning_rate": 6.027211006366882e-06, "loss": 0.5459, "num_input_tokens_seen": 115289072, "step": 53480 }, { "epoch": 9.815562488530006, "grad_norm": 4.913049697875977, "learning_rate": 6.026427317260645e-06, "loss": 0.088, "num_input_tokens_seen": 115300592, "step": 53485 }, { "epoch": 9.816480088089557, "grad_norm": 1.2968953847885132, "learning_rate": 6.0256436018281395e-06, "loss": 0.195, "num_input_tokens_seen": 115311376, "step": 53490 }, { "epoch": 9.81739768764911, "grad_norm": 8.113346099853516, "learning_rate": 6.024859860089464e-06, "loss": 0.1868, "num_input_tokens_seen": 115321968, "step": 53495 }, { "epoch": 9.818315287208662, "grad_norm": 0.21596232056617737, "learning_rate": 6.0240760920647215e-06, "loss": 0.0414, "num_input_tokens_seen": 115331984, "step": 53500 }, { "epoch": 9.819232886768214, "grad_norm": 3.1471269130706787, "learning_rate": 6.023292297774015e-06, "loss": 0.2052, "num_input_tokens_seen": 115343888, "step": 53505 }, { "epoch": 9.820150486327767, "grad_norm": 9.394716262817383, "learning_rate": 6.022508477237447e-06, "loss": 0.188, "num_input_tokens_seen": 115353712, "step": 53510 }, { "epoch": 9.821068085887319, "grad_norm": 0.24024848639965057, "learning_rate": 6.021724630475122e-06, "loss": 0.2895, "num_input_tokens_seen": 115363984, "step": 53515 }, { "epoch": 9.82198568544687, "grad_norm": 0.40183940529823303, "learning_rate": 6.020940757507142e-06, "loss": 0.1576, "num_input_tokens_seen": 115375184, "step": 53520 }, { "epoch": 9.822903285006424, "grad_norm": 0.24934692680835724, "learning_rate": 6.020156858353614e-06, "loss": 0.0968, "num_input_tokens_seen": 115385904, "step": 53525 }, { "epoch": 9.823820884565976, "grad_norm": 60.22050094604492, "learning_rate": 6.0193729330346465e-06, "loss": 0.1431, "num_input_tokens_seen": 115396464, "step": 53530 }, { "epoch": 9.824738484125527, "grad_norm": 30.31083869934082, "learning_rate": 6.01858898157034e-06, "loss": 0.1804, "num_input_tokens_seen": 115407568, "step": 53535 }, { "epoch": 9.82565608368508, "grad_norm": 0.14964470267295837, "learning_rate": 6.017805003980806e-06, "loss": 0.287, "num_input_tokens_seen": 115417040, "step": 53540 }, { "epoch": 9.826573683244632, "grad_norm": 14.442998886108398, "learning_rate": 6.0170210002861515e-06, "loss": 0.5363, "num_input_tokens_seen": 115427472, "step": 53545 }, { "epoch": 9.827491282804184, "grad_norm": 16.13530921936035, "learning_rate": 6.016236970506485e-06, "loss": 0.2242, "num_input_tokens_seen": 115438704, "step": 53550 }, { "epoch": 9.828408882363737, "grad_norm": 1.6900030374526978, "learning_rate": 6.015452914661914e-06, "loss": 0.3176, "num_input_tokens_seen": 115450224, "step": 53555 }, { "epoch": 9.829326481923289, "grad_norm": 8.654397964477539, "learning_rate": 6.01466883277255e-06, "loss": 0.4245, "num_input_tokens_seen": 115461200, "step": 53560 }, { "epoch": 9.83024408148284, "grad_norm": 68.57755279541016, "learning_rate": 6.013884724858503e-06, "loss": 0.5101, "num_input_tokens_seen": 115470992, "step": 53565 }, { "epoch": 9.831161681042394, "grad_norm": 0.27486929297447205, "learning_rate": 6.013100590939885e-06, "loss": 0.304, "num_input_tokens_seen": 115481552, "step": 53570 }, { "epoch": 9.832079280601945, "grad_norm": 0.29868796467781067, "learning_rate": 6.012316431036805e-06, "loss": 0.1545, "num_input_tokens_seen": 115491664, "step": 53575 }, { "epoch": 9.832996880161497, "grad_norm": 68.38802337646484, "learning_rate": 6.011532245169379e-06, "loss": 0.2308, "num_input_tokens_seen": 115501584, "step": 53580 }, { "epoch": 9.83391447972105, "grad_norm": 34.72713088989258, "learning_rate": 6.010748033357718e-06, "loss": 0.122, "num_input_tokens_seen": 115511856, "step": 53585 }, { "epoch": 9.834832079280602, "grad_norm": 21.920135498046875, "learning_rate": 6.009963795621938e-06, "loss": 0.2804, "num_input_tokens_seen": 115521808, "step": 53590 }, { "epoch": 9.835749678840154, "grad_norm": 25.527280807495117, "learning_rate": 6.00917953198215e-06, "loss": 0.3138, "num_input_tokens_seen": 115530512, "step": 53595 }, { "epoch": 9.836667278399707, "grad_norm": 86.09088134765625, "learning_rate": 6.008395242458472e-06, "loss": 0.2748, "num_input_tokens_seen": 115541968, "step": 53600 }, { "epoch": 9.837584877959259, "grad_norm": 0.706398069858551, "learning_rate": 6.007610927071018e-06, "loss": 0.1918, "num_input_tokens_seen": 115553360, "step": 53605 }, { "epoch": 9.83850247751881, "grad_norm": 1.055692434310913, "learning_rate": 6.006826585839907e-06, "loss": 0.2332, "num_input_tokens_seen": 115565584, "step": 53610 }, { "epoch": 9.839420077078364, "grad_norm": 4.7918853759765625, "learning_rate": 6.006042218785253e-06, "loss": 0.2904, "num_input_tokens_seen": 115576016, "step": 53615 }, { "epoch": 9.840337676637915, "grad_norm": 13.162363052368164, "learning_rate": 6.0052578259271755e-06, "loss": 0.3127, "num_input_tokens_seen": 115587248, "step": 53620 }, { "epoch": 9.841255276197467, "grad_norm": 26.029008865356445, "learning_rate": 6.004473407285794e-06, "loss": 0.1962, "num_input_tokens_seen": 115599056, "step": 53625 }, { "epoch": 9.84217287575702, "grad_norm": 1.6159881353378296, "learning_rate": 6.0036889628812245e-06, "loss": 0.0835, "num_input_tokens_seen": 115608752, "step": 53630 }, { "epoch": 9.843090475316572, "grad_norm": 15.360734939575195, "learning_rate": 6.00290449273359e-06, "loss": 0.3318, "num_input_tokens_seen": 115618480, "step": 53635 }, { "epoch": 9.844008074876124, "grad_norm": 1.0020030736923218, "learning_rate": 6.0021199968630095e-06, "loss": 0.335, "num_input_tokens_seen": 115628560, "step": 53640 }, { "epoch": 9.844925674435677, "grad_norm": 1.0572760105133057, "learning_rate": 6.0013354752896045e-06, "loss": 0.3055, "num_input_tokens_seen": 115639568, "step": 53645 }, { "epoch": 9.845843273995229, "grad_norm": 22.665508270263672, "learning_rate": 6.000550928033496e-06, "loss": 0.3493, "num_input_tokens_seen": 115649520, "step": 53650 }, { "epoch": 9.84676087355478, "grad_norm": 3.359086751937866, "learning_rate": 5.9997663551148074e-06, "loss": 0.3403, "num_input_tokens_seen": 115658352, "step": 53655 }, { "epoch": 9.847678473114334, "grad_norm": 21.433076858520508, "learning_rate": 5.998981756553661e-06, "loss": 0.2374, "num_input_tokens_seen": 115668624, "step": 53660 }, { "epoch": 9.848596072673885, "grad_norm": 115.2333984375, "learning_rate": 5.998197132370181e-06, "loss": 0.1149, "num_input_tokens_seen": 115679952, "step": 53665 }, { "epoch": 9.849513672233437, "grad_norm": 10.742620468139648, "learning_rate": 5.997412482584491e-06, "loss": 0.5033, "num_input_tokens_seen": 115691056, "step": 53670 }, { "epoch": 9.85043127179299, "grad_norm": 4.485722541809082, "learning_rate": 5.996627807216717e-06, "loss": 0.0857, "num_input_tokens_seen": 115703056, "step": 53675 }, { "epoch": 9.851348871352542, "grad_norm": 0.8663500547409058, "learning_rate": 5.995843106286985e-06, "loss": 0.2371, "num_input_tokens_seen": 115713456, "step": 53680 }, { "epoch": 9.852266470912094, "grad_norm": 2.878117799758911, "learning_rate": 5.9950583798154195e-06, "loss": 0.1389, "num_input_tokens_seen": 115723792, "step": 53685 }, { "epoch": 9.853184070471647, "grad_norm": 9.052054405212402, "learning_rate": 5.99427362782215e-06, "loss": 0.0866, "num_input_tokens_seen": 115734480, "step": 53690 }, { "epoch": 9.854101670031199, "grad_norm": 0.39358067512512207, "learning_rate": 5.9934888503273015e-06, "loss": 0.1052, "num_input_tokens_seen": 115745456, "step": 53695 }, { "epoch": 9.85501926959075, "grad_norm": 0.23019053041934967, "learning_rate": 5.9927040473510056e-06, "loss": 0.155, "num_input_tokens_seen": 115756784, "step": 53700 }, { "epoch": 9.855936869150304, "grad_norm": 1.4825334548950195, "learning_rate": 5.991919218913388e-06, "loss": 0.1823, "num_input_tokens_seen": 115767344, "step": 53705 }, { "epoch": 9.856854468709855, "grad_norm": 0.13174335658550262, "learning_rate": 5.991134365034579e-06, "loss": 0.1297, "num_input_tokens_seen": 115778160, "step": 53710 }, { "epoch": 9.857772068269407, "grad_norm": 121.02751159667969, "learning_rate": 5.990349485734712e-06, "loss": 0.1677, "num_input_tokens_seen": 115789104, "step": 53715 }, { "epoch": 9.85868966782896, "grad_norm": 266.8532409667969, "learning_rate": 5.989564581033914e-06, "loss": 0.366, "num_input_tokens_seen": 115799216, "step": 53720 }, { "epoch": 9.859607267388512, "grad_norm": 29.193777084350586, "learning_rate": 5.988779650952319e-06, "loss": 0.3702, "num_input_tokens_seen": 115810736, "step": 53725 }, { "epoch": 9.860524866948063, "grad_norm": 51.412864685058594, "learning_rate": 5.987994695510058e-06, "loss": 0.1776, "num_input_tokens_seen": 115821168, "step": 53730 }, { "epoch": 9.861442466507617, "grad_norm": 1.536812424659729, "learning_rate": 5.987209714727264e-06, "loss": 0.0857, "num_input_tokens_seen": 115829904, "step": 53735 }, { "epoch": 9.862360066067168, "grad_norm": 1.8786958456039429, "learning_rate": 5.986424708624071e-06, "loss": 0.1842, "num_input_tokens_seen": 115840336, "step": 53740 }, { "epoch": 9.86327766562672, "grad_norm": 9.53310775756836, "learning_rate": 5.985639677220613e-06, "loss": 0.3269, "num_input_tokens_seen": 115851856, "step": 53745 }, { "epoch": 9.864195265186273, "grad_norm": 64.48640441894531, "learning_rate": 5.984854620537026e-06, "loss": 0.3062, "num_input_tokens_seen": 115861072, "step": 53750 }, { "epoch": 9.865112864745825, "grad_norm": 12.268331527709961, "learning_rate": 5.984069538593444e-06, "loss": 0.1273, "num_input_tokens_seen": 115871792, "step": 53755 }, { "epoch": 9.866030464305377, "grad_norm": 71.76753234863281, "learning_rate": 5.983284431410003e-06, "loss": 0.3387, "num_input_tokens_seen": 115881264, "step": 53760 }, { "epoch": 9.86694806386493, "grad_norm": 3.929399013519287, "learning_rate": 5.982499299006841e-06, "loss": 0.2283, "num_input_tokens_seen": 115891824, "step": 53765 }, { "epoch": 9.867865663424482, "grad_norm": 22.7825984954834, "learning_rate": 5.981714141404093e-06, "loss": 0.1235, "num_input_tokens_seen": 115903056, "step": 53770 }, { "epoch": 9.868783262984033, "grad_norm": 37.698944091796875, "learning_rate": 5.9809289586219e-06, "loss": 0.3937, "num_input_tokens_seen": 115914736, "step": 53775 }, { "epoch": 9.869700862543587, "grad_norm": 1.9769091606140137, "learning_rate": 5.9801437506804e-06, "loss": 0.3086, "num_input_tokens_seen": 115926192, "step": 53780 }, { "epoch": 9.870618462103138, "grad_norm": 10.079570770263672, "learning_rate": 5.9793585175997316e-06, "loss": 0.3703, "num_input_tokens_seen": 115936400, "step": 53785 }, { "epoch": 9.87153606166269, "grad_norm": 20.808183670043945, "learning_rate": 5.978573259400034e-06, "loss": 0.3059, "num_input_tokens_seen": 115947984, "step": 53790 }, { "epoch": 9.872453661222243, "grad_norm": 1.1675740480422974, "learning_rate": 5.977787976101449e-06, "loss": 0.3477, "num_input_tokens_seen": 115959856, "step": 53795 }, { "epoch": 9.873371260781795, "grad_norm": 0.7644763588905334, "learning_rate": 5.9770026677241185e-06, "loss": 0.0172, "num_input_tokens_seen": 115969872, "step": 53800 }, { "epoch": 9.874288860341347, "grad_norm": 2.1661112308502197, "learning_rate": 5.9762173342881835e-06, "loss": 0.2073, "num_input_tokens_seen": 115981136, "step": 53805 }, { "epoch": 9.8752064599009, "grad_norm": 0.23185127973556519, "learning_rate": 5.975431975813788e-06, "loss": 0.0627, "num_input_tokens_seen": 115993264, "step": 53810 }, { "epoch": 9.876124059460452, "grad_norm": 14.48588752746582, "learning_rate": 5.974646592321073e-06, "loss": 0.1935, "num_input_tokens_seen": 116004624, "step": 53815 }, { "epoch": 9.877041659020003, "grad_norm": 27.55916404724121, "learning_rate": 5.973861183830183e-06, "loss": 0.3706, "num_input_tokens_seen": 116016112, "step": 53820 }, { "epoch": 9.877959258579557, "grad_norm": 9.27796459197998, "learning_rate": 5.973075750361265e-06, "loss": 0.2933, "num_input_tokens_seen": 116027280, "step": 53825 }, { "epoch": 9.878876858139108, "grad_norm": 0.3060198426246643, "learning_rate": 5.9722902919344595e-06, "loss": 0.2201, "num_input_tokens_seen": 116036816, "step": 53830 }, { "epoch": 9.87979445769866, "grad_norm": 0.3220742344856262, "learning_rate": 5.971504808569917e-06, "loss": 0.2558, "num_input_tokens_seen": 116046832, "step": 53835 }, { "epoch": 9.880712057258213, "grad_norm": 0.6108644008636475, "learning_rate": 5.970719300287781e-06, "loss": 0.3281, "num_input_tokens_seen": 116057520, "step": 53840 }, { "epoch": 9.881629656817765, "grad_norm": 1.1548064947128296, "learning_rate": 5.9699337671081996e-06, "loss": 0.201, "num_input_tokens_seen": 116069744, "step": 53845 }, { "epoch": 9.882547256377316, "grad_norm": 109.63814544677734, "learning_rate": 5.96914820905132e-06, "loss": 0.1964, "num_input_tokens_seen": 116080464, "step": 53850 }, { "epoch": 9.88346485593687, "grad_norm": 2.3087158203125, "learning_rate": 5.9683626261372905e-06, "loss": 0.1761, "num_input_tokens_seen": 116091888, "step": 53855 }, { "epoch": 9.884382455496421, "grad_norm": 0.5516282916069031, "learning_rate": 5.967577018386263e-06, "loss": 0.1346, "num_input_tokens_seen": 116103056, "step": 53860 }, { "epoch": 9.885300055055973, "grad_norm": 1.2844792604446411, "learning_rate": 5.966791385818383e-06, "loss": 0.127, "num_input_tokens_seen": 116112784, "step": 53865 }, { "epoch": 9.886217654615526, "grad_norm": 41.513916015625, "learning_rate": 5.966005728453801e-06, "loss": 0.3354, "num_input_tokens_seen": 116124080, "step": 53870 }, { "epoch": 9.887135254175078, "grad_norm": 41.47449493408203, "learning_rate": 5.9652200463126705e-06, "loss": 0.1007, "num_input_tokens_seen": 116134896, "step": 53875 }, { "epoch": 9.88805285373463, "grad_norm": 5.182509422302246, "learning_rate": 5.964434339415141e-06, "loss": 0.128, "num_input_tokens_seen": 116145744, "step": 53880 }, { "epoch": 9.888970453294183, "grad_norm": 0.034992050379514694, "learning_rate": 5.963648607781367e-06, "loss": 0.0207, "num_input_tokens_seen": 116156880, "step": 53885 }, { "epoch": 9.889888052853735, "grad_norm": 30.253759384155273, "learning_rate": 5.962862851431498e-06, "loss": 0.5573, "num_input_tokens_seen": 116166128, "step": 53890 }, { "epoch": 9.890805652413286, "grad_norm": 0.5266863107681274, "learning_rate": 5.96207707038569e-06, "loss": 0.0721, "num_input_tokens_seen": 116175888, "step": 53895 }, { "epoch": 9.89172325197284, "grad_norm": 18.116487503051758, "learning_rate": 5.961291264664096e-06, "loss": 0.3626, "num_input_tokens_seen": 116187408, "step": 53900 }, { "epoch": 9.892640851532391, "grad_norm": 120.11521911621094, "learning_rate": 5.9605054342868705e-06, "loss": 0.2143, "num_input_tokens_seen": 116198768, "step": 53905 }, { "epoch": 9.893558451091943, "grad_norm": 20.739784240722656, "learning_rate": 5.959719579274172e-06, "loss": 0.1081, "num_input_tokens_seen": 116209328, "step": 53910 }, { "epoch": 9.894476050651496, "grad_norm": 89.58892822265625, "learning_rate": 5.95893369964615e-06, "loss": 0.3927, "num_input_tokens_seen": 116219792, "step": 53915 }, { "epoch": 9.895393650211048, "grad_norm": 62.21110534667969, "learning_rate": 5.958147795422967e-06, "loss": 0.4027, "num_input_tokens_seen": 116231120, "step": 53920 }, { "epoch": 9.8963112497706, "grad_norm": 0.4043424427509308, "learning_rate": 5.957361866624777e-06, "loss": 0.1459, "num_input_tokens_seen": 116242096, "step": 53925 }, { "epoch": 9.897228849330153, "grad_norm": 60.74137878417969, "learning_rate": 5.956575913271738e-06, "loss": 0.0765, "num_input_tokens_seen": 116253776, "step": 53930 }, { "epoch": 9.898146448889705, "grad_norm": 0.20968525111675262, "learning_rate": 5.955789935384012e-06, "loss": 0.1512, "num_input_tokens_seen": 116262992, "step": 53935 }, { "epoch": 9.899064048449256, "grad_norm": 30.73708152770996, "learning_rate": 5.9550039329817536e-06, "loss": 0.2275, "num_input_tokens_seen": 116274128, "step": 53940 }, { "epoch": 9.89998164800881, "grad_norm": 24.594606399536133, "learning_rate": 5.954217906085126e-06, "loss": 0.3522, "num_input_tokens_seen": 116283024, "step": 53945 }, { "epoch": 9.900899247568361, "grad_norm": 13.284384727478027, "learning_rate": 5.953431854714287e-06, "loss": 0.3777, "num_input_tokens_seen": 116294224, "step": 53950 }, { "epoch": 9.901816847127913, "grad_norm": 11.900606155395508, "learning_rate": 5.9526457788893975e-06, "loss": 0.2828, "num_input_tokens_seen": 116305584, "step": 53955 }, { "epoch": 9.902734446687466, "grad_norm": 4.2665252685546875, "learning_rate": 5.951859678630621e-06, "loss": 0.1676, "num_input_tokens_seen": 116316112, "step": 53960 }, { "epoch": 9.903652046247018, "grad_norm": 14.129137992858887, "learning_rate": 5.95107355395812e-06, "loss": 0.217, "num_input_tokens_seen": 116326736, "step": 53965 }, { "epoch": 9.90456964580657, "grad_norm": 32.16103744506836, "learning_rate": 5.950287404892057e-06, "loss": 0.165, "num_input_tokens_seen": 116337584, "step": 53970 }, { "epoch": 9.905487245366123, "grad_norm": 19.34681510925293, "learning_rate": 5.949501231452594e-06, "loss": 0.4756, "num_input_tokens_seen": 116348336, "step": 53975 }, { "epoch": 9.906404844925675, "grad_norm": 0.584256649017334, "learning_rate": 5.948715033659894e-06, "loss": 0.2872, "num_input_tokens_seen": 116358992, "step": 53980 }, { "epoch": 9.907322444485226, "grad_norm": 10.434576034545898, "learning_rate": 5.947928811534127e-06, "loss": 0.0292, "num_input_tokens_seen": 116370256, "step": 53985 }, { "epoch": 9.90824004404478, "grad_norm": 20.723270416259766, "learning_rate": 5.9471425650954525e-06, "loss": 0.172, "num_input_tokens_seen": 116380112, "step": 53990 }, { "epoch": 9.909157643604331, "grad_norm": 72.91748046875, "learning_rate": 5.946356294364041e-06, "loss": 0.4231, "num_input_tokens_seen": 116390224, "step": 53995 }, { "epoch": 9.910075243163883, "grad_norm": 0.3082893490791321, "learning_rate": 5.9455699993600555e-06, "loss": 0.1009, "num_input_tokens_seen": 116401840, "step": 54000 }, { "epoch": 9.910992842723436, "grad_norm": 0.4575690031051636, "learning_rate": 5.944783680103666e-06, "loss": 0.1934, "num_input_tokens_seen": 116413872, "step": 54005 }, { "epoch": 9.911910442282988, "grad_norm": 26.259597778320312, "learning_rate": 5.94399733661504e-06, "loss": 0.1352, "num_input_tokens_seen": 116424112, "step": 54010 }, { "epoch": 9.91282804184254, "grad_norm": 8.6768798828125, "learning_rate": 5.943210968914343e-06, "loss": 0.2049, "num_input_tokens_seen": 116434160, "step": 54015 }, { "epoch": 9.913745641402093, "grad_norm": 0.23022575676441193, "learning_rate": 5.942424577021751e-06, "loss": 0.0028, "num_input_tokens_seen": 116444880, "step": 54020 }, { "epoch": 9.914663240961644, "grad_norm": 21.749584197998047, "learning_rate": 5.9416381609574246e-06, "loss": 0.1908, "num_input_tokens_seen": 116455696, "step": 54025 }, { "epoch": 9.915580840521196, "grad_norm": 25.35110855102539, "learning_rate": 5.9408517207415404e-06, "loss": 0.2458, "num_input_tokens_seen": 116465904, "step": 54030 }, { "epoch": 9.91649844008075, "grad_norm": 104.36212158203125, "learning_rate": 5.940065256394269e-06, "loss": 0.3666, "num_input_tokens_seen": 116476720, "step": 54035 }, { "epoch": 9.917416039640301, "grad_norm": 0.4044044017791748, "learning_rate": 5.939278767935779e-06, "loss": 0.205, "num_input_tokens_seen": 116487920, "step": 54040 }, { "epoch": 9.918333639199853, "grad_norm": 0.3644515872001648, "learning_rate": 5.938492255386246e-06, "loss": 0.3257, "num_input_tokens_seen": 116498544, "step": 54045 }, { "epoch": 9.919251238759406, "grad_norm": 0.5946871638298035, "learning_rate": 5.9377057187658395e-06, "loss": 0.1719, "num_input_tokens_seen": 116508880, "step": 54050 }, { "epoch": 9.920168838318958, "grad_norm": 6.638724327087402, "learning_rate": 5.936919158094736e-06, "loss": 0.1017, "num_input_tokens_seen": 116519536, "step": 54055 }, { "epoch": 9.92108643787851, "grad_norm": 22.9567813873291, "learning_rate": 5.936132573393106e-06, "loss": 0.1134, "num_input_tokens_seen": 116530864, "step": 54060 }, { "epoch": 9.922004037438063, "grad_norm": 0.5369488000869751, "learning_rate": 5.935345964681129e-06, "loss": 0.1881, "num_input_tokens_seen": 116541456, "step": 54065 }, { "epoch": 9.922921636997614, "grad_norm": 27.066192626953125, "learning_rate": 5.934559331978976e-06, "loss": 0.269, "num_input_tokens_seen": 116551728, "step": 54070 }, { "epoch": 9.923839236557166, "grad_norm": 1.1804416179656982, "learning_rate": 5.933772675306825e-06, "loss": 0.1003, "num_input_tokens_seen": 116562864, "step": 54075 }, { "epoch": 9.92475683611672, "grad_norm": 0.10783829540014267, "learning_rate": 5.932985994684854e-06, "loss": 0.1546, "num_input_tokens_seen": 116574256, "step": 54080 }, { "epoch": 9.925674435676271, "grad_norm": 6.9819416999816895, "learning_rate": 5.932199290133236e-06, "loss": 0.3512, "num_input_tokens_seen": 116584080, "step": 54085 }, { "epoch": 9.926592035235823, "grad_norm": 15.58352279663086, "learning_rate": 5.931412561672151e-06, "loss": 0.4141, "num_input_tokens_seen": 116594832, "step": 54090 }, { "epoch": 9.927509634795376, "grad_norm": 0.2951546013355255, "learning_rate": 5.930625809321778e-06, "loss": 0.1615, "num_input_tokens_seen": 116605296, "step": 54095 }, { "epoch": 9.928427234354928, "grad_norm": 41.323394775390625, "learning_rate": 5.9298390331022945e-06, "loss": 0.2508, "num_input_tokens_seen": 116617008, "step": 54100 }, { "epoch": 9.92934483391448, "grad_norm": 6.2816619873046875, "learning_rate": 5.9290522330338825e-06, "loss": 0.1355, "num_input_tokens_seen": 116628048, "step": 54105 }, { "epoch": 9.930262433474033, "grad_norm": 58.194061279296875, "learning_rate": 5.9282654091367194e-06, "loss": 0.3652, "num_input_tokens_seen": 116639344, "step": 54110 }, { "epoch": 9.931180033033584, "grad_norm": 1.359477162361145, "learning_rate": 5.927478561430987e-06, "loss": 0.3358, "num_input_tokens_seen": 116650672, "step": 54115 }, { "epoch": 9.932097632593136, "grad_norm": 116.659423828125, "learning_rate": 5.926691689936869e-06, "loss": 0.2521, "num_input_tokens_seen": 116661296, "step": 54120 }, { "epoch": 9.93301523215269, "grad_norm": 56.23267364501953, "learning_rate": 5.925904794674543e-06, "loss": 0.168, "num_input_tokens_seen": 116670704, "step": 54125 }, { "epoch": 9.93393283171224, "grad_norm": 1.7499018907546997, "learning_rate": 5.925117875664195e-06, "loss": 0.1596, "num_input_tokens_seen": 116681520, "step": 54130 }, { "epoch": 9.934850431271792, "grad_norm": 0.3904193639755249, "learning_rate": 5.924330932926007e-06, "loss": 0.227, "num_input_tokens_seen": 116692752, "step": 54135 }, { "epoch": 9.935768030831346, "grad_norm": 10.29963207244873, "learning_rate": 5.923543966480163e-06, "loss": 0.2721, "num_input_tokens_seen": 116703792, "step": 54140 }, { "epoch": 9.936685630390897, "grad_norm": 2.0675759315490723, "learning_rate": 5.922756976346848e-06, "loss": 0.2224, "num_input_tokens_seen": 116715216, "step": 54145 }, { "epoch": 9.937603229950449, "grad_norm": 23.938600540161133, "learning_rate": 5.921969962546247e-06, "loss": 0.436, "num_input_tokens_seen": 116726192, "step": 54150 }, { "epoch": 9.938520829510002, "grad_norm": 10.360345840454102, "learning_rate": 5.9211829250985455e-06, "loss": 0.1787, "num_input_tokens_seen": 116737360, "step": 54155 }, { "epoch": 9.939438429069554, "grad_norm": 8.754630088806152, "learning_rate": 5.920395864023929e-06, "loss": 0.0212, "num_input_tokens_seen": 116749360, "step": 54160 }, { "epoch": 9.940356028629106, "grad_norm": 26.25882339477539, "learning_rate": 5.919608779342585e-06, "loss": 0.4997, "num_input_tokens_seen": 116760176, "step": 54165 }, { "epoch": 9.94127362818866, "grad_norm": 64.27362060546875, "learning_rate": 5.918821671074702e-06, "loss": 0.4199, "num_input_tokens_seen": 116770448, "step": 54170 }, { "epoch": 9.94219122774821, "grad_norm": 0.9181256294250488, "learning_rate": 5.918034539240466e-06, "loss": 0.0055, "num_input_tokens_seen": 116780368, "step": 54175 }, { "epoch": 9.943108827307762, "grad_norm": 50.54423904418945, "learning_rate": 5.917247383860067e-06, "loss": 0.4868, "num_input_tokens_seen": 116792304, "step": 54180 }, { "epoch": 9.944026426867316, "grad_norm": 0.38863128423690796, "learning_rate": 5.916460204953696e-06, "loss": 0.3143, "num_input_tokens_seen": 116802160, "step": 54185 }, { "epoch": 9.944944026426867, "grad_norm": 0.16799579560756683, "learning_rate": 5.9156730025415396e-06, "loss": 0.2104, "num_input_tokens_seen": 116812048, "step": 54190 }, { "epoch": 9.945861625986419, "grad_norm": 1.1040687561035156, "learning_rate": 5.914885776643791e-06, "loss": 0.0453, "num_input_tokens_seen": 116822128, "step": 54195 }, { "epoch": 9.946779225545972, "grad_norm": 54.150245666503906, "learning_rate": 5.914098527280638e-06, "loss": 0.1582, "num_input_tokens_seen": 116833424, "step": 54200 }, { "epoch": 9.947696825105524, "grad_norm": 0.4516400396823883, "learning_rate": 5.913311254472276e-06, "loss": 0.1349, "num_input_tokens_seen": 116844848, "step": 54205 }, { "epoch": 9.948614424665076, "grad_norm": 20.835987091064453, "learning_rate": 5.912523958238896e-06, "loss": 0.3415, "num_input_tokens_seen": 116854288, "step": 54210 }, { "epoch": 9.949532024224629, "grad_norm": 0.361807644367218, "learning_rate": 5.91173663860069e-06, "loss": 0.2805, "num_input_tokens_seen": 116866096, "step": 54215 }, { "epoch": 9.95044962378418, "grad_norm": 0.7758792638778687, "learning_rate": 5.910949295577853e-06, "loss": 0.0083, "num_input_tokens_seen": 116877296, "step": 54220 }, { "epoch": 9.951367223343732, "grad_norm": 11.280906677246094, "learning_rate": 5.910161929190577e-06, "loss": 0.0414, "num_input_tokens_seen": 116887984, "step": 54225 }, { "epoch": 9.952284822903286, "grad_norm": 0.03643772006034851, "learning_rate": 5.9093745394590594e-06, "loss": 0.3878, "num_input_tokens_seen": 116898416, "step": 54230 }, { "epoch": 9.953202422462837, "grad_norm": 37.022369384765625, "learning_rate": 5.908587126403494e-06, "loss": 0.3874, "num_input_tokens_seen": 116909456, "step": 54235 }, { "epoch": 9.954120022022389, "grad_norm": 47.96883010864258, "learning_rate": 5.907799690044078e-06, "loss": 0.2403, "num_input_tokens_seen": 116920176, "step": 54240 }, { "epoch": 9.955037621581942, "grad_norm": 61.6645622253418, "learning_rate": 5.907012230401005e-06, "loss": 0.1476, "num_input_tokens_seen": 116932304, "step": 54245 }, { "epoch": 9.955955221141494, "grad_norm": 0.08192428946495056, "learning_rate": 5.9062247474944745e-06, "loss": 0.3335, "num_input_tokens_seen": 116943024, "step": 54250 }, { "epoch": 9.956872820701046, "grad_norm": 10.771759986877441, "learning_rate": 5.905437241344685e-06, "loss": 0.3815, "num_input_tokens_seen": 116952880, "step": 54255 }, { "epoch": 9.957790420260599, "grad_norm": 21.27892303466797, "learning_rate": 5.904649711971833e-06, "loss": 0.3126, "num_input_tokens_seen": 116963472, "step": 54260 }, { "epoch": 9.95870801982015, "grad_norm": 103.39595794677734, "learning_rate": 5.9038621593961175e-06, "loss": 0.0421, "num_input_tokens_seen": 116974032, "step": 54265 }, { "epoch": 9.959625619379702, "grad_norm": 25.54296112060547, "learning_rate": 5.903074583637738e-06, "loss": 0.4071, "num_input_tokens_seen": 116984944, "step": 54270 }, { "epoch": 9.960543218939256, "grad_norm": 4.717447280883789, "learning_rate": 5.902286984716895e-06, "loss": 0.1187, "num_input_tokens_seen": 116995920, "step": 54275 }, { "epoch": 9.961460818498807, "grad_norm": 0.6660788059234619, "learning_rate": 5.901499362653791e-06, "loss": 0.4377, "num_input_tokens_seen": 117008016, "step": 54280 }, { "epoch": 9.962378418058359, "grad_norm": 0.17740511894226074, "learning_rate": 5.9007117174686245e-06, "loss": 0.0167, "num_input_tokens_seen": 117019504, "step": 54285 }, { "epoch": 9.963296017617912, "grad_norm": 82.21281433105469, "learning_rate": 5.899924049181599e-06, "loss": 0.426, "num_input_tokens_seen": 117029552, "step": 54290 }, { "epoch": 9.964213617177464, "grad_norm": 64.53428649902344, "learning_rate": 5.899136357812917e-06, "loss": 0.1675, "num_input_tokens_seen": 117041744, "step": 54295 }, { "epoch": 9.965131216737015, "grad_norm": 0.12680746614933014, "learning_rate": 5.898348643382779e-06, "loss": 0.1031, "num_input_tokens_seen": 117053264, "step": 54300 }, { "epoch": 9.966048816296569, "grad_norm": 0.35179007053375244, "learning_rate": 5.897560905911391e-06, "loss": 0.2064, "num_input_tokens_seen": 117064688, "step": 54305 }, { "epoch": 9.96696641585612, "grad_norm": 147.95550537109375, "learning_rate": 5.896773145418958e-06, "loss": 0.1754, "num_input_tokens_seen": 117076368, "step": 54310 }, { "epoch": 9.967884015415672, "grad_norm": 12.900568962097168, "learning_rate": 5.895985361925684e-06, "loss": 0.4154, "num_input_tokens_seen": 117087536, "step": 54315 }, { "epoch": 9.968801614975225, "grad_norm": 10.36353588104248, "learning_rate": 5.895197555451771e-06, "loss": 0.4673, "num_input_tokens_seen": 117098000, "step": 54320 }, { "epoch": 9.969719214534777, "grad_norm": 5.629691123962402, "learning_rate": 5.894409726017431e-06, "loss": 0.0176, "num_input_tokens_seen": 117108176, "step": 54325 }, { "epoch": 9.970636814094329, "grad_norm": 9.695547103881836, "learning_rate": 5.893621873642867e-06, "loss": 0.1947, "num_input_tokens_seen": 117117744, "step": 54330 }, { "epoch": 9.971554413653882, "grad_norm": 2.9055120944976807, "learning_rate": 5.892833998348286e-06, "loss": 0.1011, "num_input_tokens_seen": 117128048, "step": 54335 }, { "epoch": 9.972472013213434, "grad_norm": 42.239112854003906, "learning_rate": 5.892046100153899e-06, "loss": 0.2253, "num_input_tokens_seen": 117138480, "step": 54340 }, { "epoch": 9.973389612772985, "grad_norm": 30.673460006713867, "learning_rate": 5.891258179079911e-06, "loss": 0.2698, "num_input_tokens_seen": 117149808, "step": 54345 }, { "epoch": 9.974307212332539, "grad_norm": 0.78598952293396, "learning_rate": 5.89047023514653e-06, "loss": 0.0927, "num_input_tokens_seen": 117160944, "step": 54350 }, { "epoch": 9.97522481189209, "grad_norm": 0.3912857472896576, "learning_rate": 5.88968226837397e-06, "loss": 0.0889, "num_input_tokens_seen": 117171856, "step": 54355 }, { "epoch": 9.976142411451642, "grad_norm": 0.2325403243303299, "learning_rate": 5.888894278782438e-06, "loss": 0.1229, "num_input_tokens_seen": 117182768, "step": 54360 }, { "epoch": 9.977060011011195, "grad_norm": 0.4642106592655182, "learning_rate": 5.888106266392146e-06, "loss": 0.094, "num_input_tokens_seen": 117193904, "step": 54365 }, { "epoch": 9.977977610570747, "grad_norm": 0.5709195733070374, "learning_rate": 5.887318231223303e-06, "loss": 0.1378, "num_input_tokens_seen": 117205488, "step": 54370 }, { "epoch": 9.978895210130299, "grad_norm": 0.21383830904960632, "learning_rate": 5.886530173296126e-06, "loss": 0.317, "num_input_tokens_seen": 117216944, "step": 54375 }, { "epoch": 9.979812809689852, "grad_norm": 41.22708511352539, "learning_rate": 5.885742092630821e-06, "loss": 0.358, "num_input_tokens_seen": 117225808, "step": 54380 }, { "epoch": 9.980730409249404, "grad_norm": 62.1047477722168, "learning_rate": 5.884953989247604e-06, "loss": 0.3835, "num_input_tokens_seen": 117236528, "step": 54385 }, { "epoch": 9.981648008808955, "grad_norm": 0.5310618877410889, "learning_rate": 5.88416586316669e-06, "loss": 0.0924, "num_input_tokens_seen": 117247568, "step": 54390 }, { "epoch": 9.982565608368509, "grad_norm": 24.54043960571289, "learning_rate": 5.883377714408292e-06, "loss": 0.088, "num_input_tokens_seen": 117259216, "step": 54395 }, { "epoch": 9.98348320792806, "grad_norm": 15.700474739074707, "learning_rate": 5.882589542992624e-06, "loss": 0.1339, "num_input_tokens_seen": 117270160, "step": 54400 }, { "epoch": 9.984400807487612, "grad_norm": 33.83756637573242, "learning_rate": 5.881801348939903e-06, "loss": 0.6226, "num_input_tokens_seen": 117280240, "step": 54405 }, { "epoch": 9.985318407047165, "grad_norm": 14.260344505310059, "learning_rate": 5.881013132270343e-06, "loss": 0.1434, "num_input_tokens_seen": 117291056, "step": 54410 }, { "epoch": 9.986236006606717, "grad_norm": 5.425861358642578, "learning_rate": 5.880224893004163e-06, "loss": 0.1752, "num_input_tokens_seen": 117302544, "step": 54415 }, { "epoch": 9.987153606166268, "grad_norm": 0.650748074054718, "learning_rate": 5.879436631161577e-06, "loss": 0.2355, "num_input_tokens_seen": 117312208, "step": 54420 }, { "epoch": 9.988071205725822, "grad_norm": 123.47877502441406, "learning_rate": 5.8786483467628054e-06, "loss": 0.3064, "num_input_tokens_seen": 117322704, "step": 54425 }, { "epoch": 9.988988805285373, "grad_norm": 43.51580047607422, "learning_rate": 5.877860039828065e-06, "loss": 0.3581, "num_input_tokens_seen": 117331184, "step": 54430 }, { "epoch": 9.989906404844925, "grad_norm": 35.36125946044922, "learning_rate": 5.877071710377575e-06, "loss": 0.3334, "num_input_tokens_seen": 117343536, "step": 54435 }, { "epoch": 9.990824004404478, "grad_norm": 42.044273376464844, "learning_rate": 5.876283358431556e-06, "loss": 0.1671, "num_input_tokens_seen": 117353136, "step": 54440 }, { "epoch": 9.99174160396403, "grad_norm": 32.0958251953125, "learning_rate": 5.875494984010226e-06, "loss": 0.1716, "num_input_tokens_seen": 117364592, "step": 54445 }, { "epoch": 9.992659203523582, "grad_norm": 0.3915075361728668, "learning_rate": 5.874706587133807e-06, "loss": 0.1035, "num_input_tokens_seen": 117374416, "step": 54450 }, { "epoch": 9.993576803083135, "grad_norm": 0.19062790274620056, "learning_rate": 5.87391816782252e-06, "loss": 0.1704, "num_input_tokens_seen": 117385648, "step": 54455 }, { "epoch": 9.994494402642687, "grad_norm": 8.788503646850586, "learning_rate": 5.873129726096585e-06, "loss": 0.2426, "num_input_tokens_seen": 117397072, "step": 54460 }, { "epoch": 9.995412002202238, "grad_norm": 7.037080764770508, "learning_rate": 5.872341261976228e-06, "loss": 0.1745, "num_input_tokens_seen": 117407856, "step": 54465 }, { "epoch": 9.996329601761792, "grad_norm": 0.7180137038230896, "learning_rate": 5.871552775481668e-06, "loss": 0.1314, "num_input_tokens_seen": 117418416, "step": 54470 }, { "epoch": 9.997247201321343, "grad_norm": 6.057301998138428, "learning_rate": 5.870764266633131e-06, "loss": 0.1607, "num_input_tokens_seen": 117429552, "step": 54475 }, { "epoch": 9.998164800880895, "grad_norm": 66.53952026367188, "learning_rate": 5.8699757354508395e-06, "loss": 0.1757, "num_input_tokens_seen": 117440176, "step": 54480 }, { "epoch": 9.999082400440448, "grad_norm": 73.81494903564453, "learning_rate": 5.869187181955018e-06, "loss": 0.192, "num_input_tokens_seen": 117450896, "step": 54485 }, { "epoch": 10.0, "grad_norm": 0.6788842082023621, "learning_rate": 5.868398606165894e-06, "loss": 0.1407, "num_input_tokens_seen": 117461104, "step": 54490 }, { "epoch": 10.0, "eval_loss": 0.4987534284591675, "eval_runtime": 179.1361, "eval_samples_per_second": 30.418, "eval_steps_per_second": 7.609, "num_input_tokens_seen": 117461104, "step": 54490 }, { "epoch": 10.000917599559552, "grad_norm": 6.7287445068359375, "learning_rate": 5.86761000810369e-06, "loss": 0.0094, "num_input_tokens_seen": 117473200, "step": 54495 }, { "epoch": 10.001835199119105, "grad_norm": 16.507570266723633, "learning_rate": 5.866821387788636e-06, "loss": 0.1613, "num_input_tokens_seen": 117484528, "step": 54500 }, { "epoch": 10.002752798678657, "grad_norm": 74.6456527709961, "learning_rate": 5.8660327452409545e-06, "loss": 0.1274, "num_input_tokens_seen": 117495248, "step": 54505 }, { "epoch": 10.003670398238208, "grad_norm": 0.10511040687561035, "learning_rate": 5.8652440804808775e-06, "loss": 0.2434, "num_input_tokens_seen": 117507088, "step": 54510 }, { "epoch": 10.004587997797762, "grad_norm": 0.6238600015640259, "learning_rate": 5.8644553935286305e-06, "loss": 0.1526, "num_input_tokens_seen": 117517296, "step": 54515 }, { "epoch": 10.005505597357313, "grad_norm": 149.32534790039062, "learning_rate": 5.863666684404442e-06, "loss": 0.1478, "num_input_tokens_seen": 117529168, "step": 54520 }, { "epoch": 10.006423196916865, "grad_norm": 10.904871940612793, "learning_rate": 5.862877953128542e-06, "loss": 0.1561, "num_input_tokens_seen": 117540080, "step": 54525 }, { "epoch": 10.007340796476418, "grad_norm": 0.1951894611120224, "learning_rate": 5.862089199721159e-06, "loss": 0.4562, "num_input_tokens_seen": 117551472, "step": 54530 }, { "epoch": 10.00825839603597, "grad_norm": 25.419008255004883, "learning_rate": 5.861300424202525e-06, "loss": 0.0977, "num_input_tokens_seen": 117562384, "step": 54535 }, { "epoch": 10.009175995595522, "grad_norm": 73.3738784790039, "learning_rate": 5.8605116265928685e-06, "loss": 0.499, "num_input_tokens_seen": 117573680, "step": 54540 }, { "epoch": 10.010093595155075, "grad_norm": 0.2738893926143646, "learning_rate": 5.859722806912424e-06, "loss": 0.073, "num_input_tokens_seen": 117585136, "step": 54545 }, { "epoch": 10.011011194714627, "grad_norm": 6.979682445526123, "learning_rate": 5.858933965181421e-06, "loss": 0.1796, "num_input_tokens_seen": 117596016, "step": 54550 }, { "epoch": 10.011928794274178, "grad_norm": 0.0957670658826828, "learning_rate": 5.858145101420093e-06, "loss": 0.0066, "num_input_tokens_seen": 117606096, "step": 54555 }, { "epoch": 10.012846393833732, "grad_norm": 27.592613220214844, "learning_rate": 5.857356215648674e-06, "loss": 0.1872, "num_input_tokens_seen": 117617360, "step": 54560 }, { "epoch": 10.013763993393283, "grad_norm": 1.4114407300949097, "learning_rate": 5.856567307887397e-06, "loss": 0.1378, "num_input_tokens_seen": 117628240, "step": 54565 }, { "epoch": 10.014681592952835, "grad_norm": 13.884635925292969, "learning_rate": 5.8557783781564945e-06, "loss": 0.1802, "num_input_tokens_seen": 117639600, "step": 54570 }, { "epoch": 10.015599192512388, "grad_norm": 0.7458795309066772, "learning_rate": 5.854989426476204e-06, "loss": 0.0045, "num_input_tokens_seen": 117650736, "step": 54575 }, { "epoch": 10.01651679207194, "grad_norm": 0.20458129048347473, "learning_rate": 5.85420045286676e-06, "loss": 0.1287, "num_input_tokens_seen": 117660304, "step": 54580 }, { "epoch": 10.017434391631491, "grad_norm": 22.998981475830078, "learning_rate": 5.853411457348398e-06, "loss": 0.1799, "num_input_tokens_seen": 117670736, "step": 54585 }, { "epoch": 10.018351991191045, "grad_norm": 0.7411282062530518, "learning_rate": 5.852622439941355e-06, "loss": 0.1195, "num_input_tokens_seen": 117681392, "step": 54590 }, { "epoch": 10.019269590750596, "grad_norm": 0.1860477179288864, "learning_rate": 5.8518334006658675e-06, "loss": 0.3299, "num_input_tokens_seen": 117692112, "step": 54595 }, { "epoch": 10.020187190310148, "grad_norm": 31.244596481323242, "learning_rate": 5.8510443395421735e-06, "loss": 0.0563, "num_input_tokens_seen": 117702416, "step": 54600 }, { "epoch": 10.021104789869701, "grad_norm": 28.797887802124023, "learning_rate": 5.850255256590512e-06, "loss": 0.4824, "num_input_tokens_seen": 117713808, "step": 54605 }, { "epoch": 10.022022389429253, "grad_norm": 0.8358645439147949, "learning_rate": 5.84946615183112e-06, "loss": 0.1718, "num_input_tokens_seen": 117723344, "step": 54610 }, { "epoch": 10.022939988988805, "grad_norm": 1.3697338104248047, "learning_rate": 5.8486770252842376e-06, "loss": 0.0991, "num_input_tokens_seen": 117735824, "step": 54615 }, { "epoch": 10.023857588548358, "grad_norm": 1.9127089977264404, "learning_rate": 5.847887876970106e-06, "loss": 0.0421, "num_input_tokens_seen": 117747056, "step": 54620 }, { "epoch": 10.02477518810791, "grad_norm": 0.20406009256839752, "learning_rate": 5.847098706908964e-06, "loss": 0.009, "num_input_tokens_seen": 117757840, "step": 54625 }, { "epoch": 10.025692787667461, "grad_norm": 0.2661026418209076, "learning_rate": 5.846309515121052e-06, "loss": 0.0304, "num_input_tokens_seen": 117768528, "step": 54630 }, { "epoch": 10.026610387227015, "grad_norm": 4.336870193481445, "learning_rate": 5.845520301626615e-06, "loss": 0.0065, "num_input_tokens_seen": 117778608, "step": 54635 }, { "epoch": 10.027527986786566, "grad_norm": 13.917033195495605, "learning_rate": 5.84473106644589e-06, "loss": 0.3758, "num_input_tokens_seen": 117787920, "step": 54640 }, { "epoch": 10.028445586346118, "grad_norm": 0.17012542486190796, "learning_rate": 5.843941809599123e-06, "loss": 0.007, "num_input_tokens_seen": 117799088, "step": 54645 }, { "epoch": 10.029363185905671, "grad_norm": 3.060364246368408, "learning_rate": 5.8431525311065585e-06, "loss": 0.3443, "num_input_tokens_seen": 117811312, "step": 54650 }, { "epoch": 10.030280785465223, "grad_norm": 85.49861145019531, "learning_rate": 5.842363230988436e-06, "loss": 0.4149, "num_input_tokens_seen": 117821200, "step": 54655 }, { "epoch": 10.031198385024775, "grad_norm": 2.514037847518921, "learning_rate": 5.841573909265004e-06, "loss": 0.0395, "num_input_tokens_seen": 117832336, "step": 54660 }, { "epoch": 10.032115984584328, "grad_norm": 12.424843788146973, "learning_rate": 5.840784565956504e-06, "loss": 0.055, "num_input_tokens_seen": 117843376, "step": 54665 }, { "epoch": 10.03303358414388, "grad_norm": 0.057502489537000656, "learning_rate": 5.8399952010831836e-06, "loss": 0.0989, "num_input_tokens_seen": 117854640, "step": 54670 }, { "epoch": 10.033951183703431, "grad_norm": 24.981061935424805, "learning_rate": 5.8392058146652885e-06, "loss": 0.07, "num_input_tokens_seen": 117864784, "step": 54675 }, { "epoch": 10.034868783262985, "grad_norm": 3.012883186340332, "learning_rate": 5.838416406723064e-06, "loss": 0.1584, "num_input_tokens_seen": 117875632, "step": 54680 }, { "epoch": 10.035786382822536, "grad_norm": 0.35349810123443604, "learning_rate": 5.837626977276759e-06, "loss": 0.4024, "num_input_tokens_seen": 117886960, "step": 54685 }, { "epoch": 10.036703982382088, "grad_norm": 66.36579895019531, "learning_rate": 5.836837526346619e-06, "loss": 0.5015, "num_input_tokens_seen": 117897680, "step": 54690 }, { "epoch": 10.037621581941641, "grad_norm": 67.2986831665039, "learning_rate": 5.836048053952895e-06, "loss": 0.0899, "num_input_tokens_seen": 117907536, "step": 54695 }, { "epoch": 10.038539181501193, "grad_norm": 25.717422485351562, "learning_rate": 5.835258560115834e-06, "loss": 0.3405, "num_input_tokens_seen": 117917936, "step": 54700 }, { "epoch": 10.039456781060744, "grad_norm": 1.52198326587677, "learning_rate": 5.834469044855684e-06, "loss": 0.1535, "num_input_tokens_seen": 117928304, "step": 54705 }, { "epoch": 10.040374380620298, "grad_norm": 7.459471225738525, "learning_rate": 5.833679508192698e-06, "loss": 0.2296, "num_input_tokens_seen": 117939312, "step": 54710 }, { "epoch": 10.04129198017985, "grad_norm": 3.03769850730896, "learning_rate": 5.8328899501471235e-06, "loss": 0.0241, "num_input_tokens_seen": 117949680, "step": 54715 }, { "epoch": 10.042209579739401, "grad_norm": 8.212925910949707, "learning_rate": 5.832100370739214e-06, "loss": 0.1284, "num_input_tokens_seen": 117960560, "step": 54720 }, { "epoch": 10.043127179298954, "grad_norm": 0.8164841532707214, "learning_rate": 5.831310769989219e-06, "loss": 0.328, "num_input_tokens_seen": 117970640, "step": 54725 }, { "epoch": 10.044044778858506, "grad_norm": 43.49618911743164, "learning_rate": 5.830521147917391e-06, "loss": 0.1714, "num_input_tokens_seen": 117980752, "step": 54730 }, { "epoch": 10.044962378418058, "grad_norm": 3.720682382583618, "learning_rate": 5.8297315045439826e-06, "loss": 0.063, "num_input_tokens_seen": 117991824, "step": 54735 }, { "epoch": 10.045879977977611, "grad_norm": 0.1026415005326271, "learning_rate": 5.8289418398892474e-06, "loss": 0.2533, "num_input_tokens_seen": 118003248, "step": 54740 }, { "epoch": 10.046797577537163, "grad_norm": 40.127830505371094, "learning_rate": 5.828152153973439e-06, "loss": 0.1868, "num_input_tokens_seen": 118013424, "step": 54745 }, { "epoch": 10.047715177096714, "grad_norm": 237.5101318359375, "learning_rate": 5.82736244681681e-06, "loss": 0.3175, "num_input_tokens_seen": 118023984, "step": 54750 }, { "epoch": 10.048632776656268, "grad_norm": 0.27600690722465515, "learning_rate": 5.826572718439617e-06, "loss": 0.0038, "num_input_tokens_seen": 118034640, "step": 54755 }, { "epoch": 10.04955037621582, "grad_norm": 120.39439392089844, "learning_rate": 5.825782968862116e-06, "loss": 0.1448, "num_input_tokens_seen": 118045168, "step": 54760 }, { "epoch": 10.050467975775371, "grad_norm": 14.585856437683105, "learning_rate": 5.8249931981045605e-06, "loss": 0.1301, "num_input_tokens_seen": 118055056, "step": 54765 }, { "epoch": 10.051385575334924, "grad_norm": 7.286336898803711, "learning_rate": 5.824203406187209e-06, "loss": 0.0594, "num_input_tokens_seen": 118064720, "step": 54770 }, { "epoch": 10.052303174894476, "grad_norm": 0.5534464120864868, "learning_rate": 5.823413593130317e-06, "loss": 0.1947, "num_input_tokens_seen": 118076016, "step": 54775 }, { "epoch": 10.053220774454028, "grad_norm": 1.0539599657058716, "learning_rate": 5.822623758954143e-06, "loss": 0.1918, "num_input_tokens_seen": 118086352, "step": 54780 }, { "epoch": 10.054138374013581, "grad_norm": 5.103592395782471, "learning_rate": 5.821833903678944e-06, "loss": 0.0126, "num_input_tokens_seen": 118096656, "step": 54785 }, { "epoch": 10.055055973573133, "grad_norm": 6.8893890380859375, "learning_rate": 5.821044027324978e-06, "loss": 0.3356, "num_input_tokens_seen": 118107984, "step": 54790 }, { "epoch": 10.055973573132684, "grad_norm": 234.1068115234375, "learning_rate": 5.820254129912507e-06, "loss": 0.5999, "num_input_tokens_seen": 118119280, "step": 54795 }, { "epoch": 10.056891172692238, "grad_norm": 4.548795700073242, "learning_rate": 5.819464211461789e-06, "loss": 0.082, "num_input_tokens_seen": 118130992, "step": 54800 }, { "epoch": 10.05780877225179, "grad_norm": 26.31423568725586, "learning_rate": 5.818674271993082e-06, "loss": 0.2682, "num_input_tokens_seen": 118141744, "step": 54805 }, { "epoch": 10.05872637181134, "grad_norm": 14.077000617980957, "learning_rate": 5.8178843115266505e-06, "loss": 0.1297, "num_input_tokens_seen": 118153744, "step": 54810 }, { "epoch": 10.059643971370894, "grad_norm": 0.4888745844364166, "learning_rate": 5.8170943300827536e-06, "loss": 0.1691, "num_input_tokens_seen": 118165296, "step": 54815 }, { "epoch": 10.060561570930446, "grad_norm": 27.61795997619629, "learning_rate": 5.816304327681653e-06, "loss": 0.2156, "num_input_tokens_seen": 118175760, "step": 54820 }, { "epoch": 10.061479170489998, "grad_norm": 65.60144805908203, "learning_rate": 5.815514304343612e-06, "loss": 0.293, "num_input_tokens_seen": 118185648, "step": 54825 }, { "epoch": 10.062396770049551, "grad_norm": 0.41363489627838135, "learning_rate": 5.814724260088894e-06, "loss": 0.159, "num_input_tokens_seen": 118195952, "step": 54830 }, { "epoch": 10.063314369609103, "grad_norm": 1.651161551475525, "learning_rate": 5.813934194937762e-06, "loss": 0.1376, "num_input_tokens_seen": 118206960, "step": 54835 }, { "epoch": 10.064231969168654, "grad_norm": 169.40853881835938, "learning_rate": 5.813144108910476e-06, "loss": 0.2984, "num_input_tokens_seen": 118215888, "step": 54840 }, { "epoch": 10.065149568728208, "grad_norm": 0.27913179993629456, "learning_rate": 5.812354002027307e-06, "loss": 0.1677, "num_input_tokens_seen": 118227440, "step": 54845 }, { "epoch": 10.06606716828776, "grad_norm": 116.57926940917969, "learning_rate": 5.811563874308514e-06, "loss": 0.2327, "num_input_tokens_seen": 118239056, "step": 54850 }, { "epoch": 10.06698476784731, "grad_norm": 0.09938051551580429, "learning_rate": 5.810773725774369e-06, "loss": 0.1174, "num_input_tokens_seen": 118249776, "step": 54855 }, { "epoch": 10.067902367406864, "grad_norm": 66.84769439697266, "learning_rate": 5.809983556445131e-06, "loss": 0.243, "num_input_tokens_seen": 118260432, "step": 54860 }, { "epoch": 10.068819966966416, "grad_norm": 33.959712982177734, "learning_rate": 5.8091933663410714e-06, "loss": 0.0378, "num_input_tokens_seen": 118270736, "step": 54865 }, { "epoch": 10.069737566525967, "grad_norm": 54.57726287841797, "learning_rate": 5.808403155482457e-06, "loss": 0.2099, "num_input_tokens_seen": 118280240, "step": 54870 }, { "epoch": 10.07065516608552, "grad_norm": 6.260673522949219, "learning_rate": 5.807612923889554e-06, "loss": 0.1197, "num_input_tokens_seen": 118291504, "step": 54875 }, { "epoch": 10.071572765645072, "grad_norm": 0.19150035083293915, "learning_rate": 5.806822671582631e-06, "loss": 0.13, "num_input_tokens_seen": 118301808, "step": 54880 }, { "epoch": 10.072490365204624, "grad_norm": 93.46305847167969, "learning_rate": 5.806032398581958e-06, "loss": 0.2096, "num_input_tokens_seen": 118313232, "step": 54885 }, { "epoch": 10.073407964764177, "grad_norm": 10.813727378845215, "learning_rate": 5.805242104907801e-06, "loss": 0.2195, "num_input_tokens_seen": 118324528, "step": 54890 }, { "epoch": 10.074325564323729, "grad_norm": 4.123713970184326, "learning_rate": 5.804451790580434e-06, "loss": 0.1359, "num_input_tokens_seen": 118333808, "step": 54895 }, { "epoch": 10.07524316388328, "grad_norm": 0.18364988267421722, "learning_rate": 5.803661455620124e-06, "loss": 0.1564, "num_input_tokens_seen": 118344976, "step": 54900 }, { "epoch": 10.076160763442834, "grad_norm": 0.837328314781189, "learning_rate": 5.802871100047145e-06, "loss": 0.1956, "num_input_tokens_seen": 118355664, "step": 54905 }, { "epoch": 10.077078363002386, "grad_norm": 1.1385313272476196, "learning_rate": 5.802080723881766e-06, "loss": 0.1733, "num_input_tokens_seen": 118366832, "step": 54910 }, { "epoch": 10.077995962561937, "grad_norm": 16.136615753173828, "learning_rate": 5.801290327144258e-06, "loss": 0.309, "num_input_tokens_seen": 118377168, "step": 54915 }, { "epoch": 10.07891356212149, "grad_norm": 0.33293765783309937, "learning_rate": 5.800499909854896e-06, "loss": 0.2277, "num_input_tokens_seen": 118388112, "step": 54920 }, { "epoch": 10.079831161681042, "grad_norm": 44.9013671875, "learning_rate": 5.799709472033952e-06, "loss": 0.2272, "num_input_tokens_seen": 118398288, "step": 54925 }, { "epoch": 10.080748761240594, "grad_norm": 105.81441497802734, "learning_rate": 5.798919013701701e-06, "loss": 0.0857, "num_input_tokens_seen": 118408272, "step": 54930 }, { "epoch": 10.081666360800147, "grad_norm": 0.7620564699172974, "learning_rate": 5.798128534878413e-06, "loss": 0.0587, "num_input_tokens_seen": 118420304, "step": 54935 }, { "epoch": 10.082583960359699, "grad_norm": 94.46601867675781, "learning_rate": 5.797338035584367e-06, "loss": 0.3553, "num_input_tokens_seen": 118430800, "step": 54940 }, { "epoch": 10.08350155991925, "grad_norm": 0.24935200810432434, "learning_rate": 5.796547515839836e-06, "loss": 0.2367, "num_input_tokens_seen": 118442928, "step": 54945 }, { "epoch": 10.084419159478804, "grad_norm": 22.298160552978516, "learning_rate": 5.7957569756650944e-06, "loss": 0.2963, "num_input_tokens_seen": 118453520, "step": 54950 }, { "epoch": 10.085336759038356, "grad_norm": 68.59322357177734, "learning_rate": 5.794966415080421e-06, "loss": 0.2712, "num_input_tokens_seen": 118462864, "step": 54955 }, { "epoch": 10.086254358597907, "grad_norm": 0.1942375749349594, "learning_rate": 5.79417583410609e-06, "loss": 0.2186, "num_input_tokens_seen": 118474096, "step": 54960 }, { "epoch": 10.08717195815746, "grad_norm": 0.37343427538871765, "learning_rate": 5.793385232762379e-06, "loss": 0.0624, "num_input_tokens_seen": 118483632, "step": 54965 }, { "epoch": 10.088089557717012, "grad_norm": 7.759340763092041, "learning_rate": 5.792594611069569e-06, "loss": 0.065, "num_input_tokens_seen": 118494576, "step": 54970 }, { "epoch": 10.089007157276564, "grad_norm": 28.289270401000977, "learning_rate": 5.791803969047933e-06, "loss": 0.3423, "num_input_tokens_seen": 118504496, "step": 54975 }, { "epoch": 10.089924756836117, "grad_norm": 11.053460121154785, "learning_rate": 5.7910133067177544e-06, "loss": 0.3726, "num_input_tokens_seen": 118514928, "step": 54980 }, { "epoch": 10.090842356395669, "grad_norm": 1.1546183824539185, "learning_rate": 5.790222624099309e-06, "loss": 0.0566, "num_input_tokens_seen": 118526800, "step": 54985 }, { "epoch": 10.09175995595522, "grad_norm": 77.62129211425781, "learning_rate": 5.789431921212879e-06, "loss": 0.0234, "num_input_tokens_seen": 118537264, "step": 54990 }, { "epoch": 10.092677555514774, "grad_norm": 0.03921540826559067, "learning_rate": 5.788641198078744e-06, "loss": 0.0063, "num_input_tokens_seen": 118547760, "step": 54995 }, { "epoch": 10.093595155074325, "grad_norm": 0.31066736578941345, "learning_rate": 5.787850454717183e-06, "loss": 0.0934, "num_input_tokens_seen": 118558064, "step": 55000 }, { "epoch": 10.094512754633877, "grad_norm": 6.224015235900879, "learning_rate": 5.78705969114848e-06, "loss": 0.089, "num_input_tokens_seen": 118569808, "step": 55005 }, { "epoch": 10.09543035419343, "grad_norm": 4.537929058074951, "learning_rate": 5.786268907392916e-06, "loss": 0.2782, "num_input_tokens_seen": 118580080, "step": 55010 }, { "epoch": 10.096347953752982, "grad_norm": 74.79744720458984, "learning_rate": 5.785478103470773e-06, "loss": 0.1656, "num_input_tokens_seen": 118591792, "step": 55015 }, { "epoch": 10.097265553312534, "grad_norm": 0.5695539712905884, "learning_rate": 5.784687279402332e-06, "loss": 0.1181, "num_input_tokens_seen": 118603120, "step": 55020 }, { "epoch": 10.098183152872087, "grad_norm": 45.789764404296875, "learning_rate": 5.783896435207881e-06, "loss": 0.0846, "num_input_tokens_seen": 118615504, "step": 55025 }, { "epoch": 10.099100752431639, "grad_norm": 0.19537103176116943, "learning_rate": 5.783105570907701e-06, "loss": 0.2922, "num_input_tokens_seen": 118626288, "step": 55030 }, { "epoch": 10.10001835199119, "grad_norm": 0.16905388236045837, "learning_rate": 5.782314686522076e-06, "loss": 0.1429, "num_input_tokens_seen": 118637968, "step": 55035 }, { "epoch": 10.100935951550744, "grad_norm": 0.25808465480804443, "learning_rate": 5.781523782071292e-06, "loss": 0.3312, "num_input_tokens_seen": 118648464, "step": 55040 }, { "epoch": 10.101853551110295, "grad_norm": 12.347699165344238, "learning_rate": 5.780732857575634e-06, "loss": 0.3631, "num_input_tokens_seen": 118659728, "step": 55045 }, { "epoch": 10.102771150669847, "grad_norm": 0.656587541103363, "learning_rate": 5.779941913055389e-06, "loss": 0.2534, "num_input_tokens_seen": 118671216, "step": 55050 }, { "epoch": 10.1036887502294, "grad_norm": 0.039481423795223236, "learning_rate": 5.779150948530844e-06, "loss": 0.3954, "num_input_tokens_seen": 118682480, "step": 55055 }, { "epoch": 10.104606349788952, "grad_norm": 0.32598787546157837, "learning_rate": 5.778359964022282e-06, "loss": 0.321, "num_input_tokens_seen": 118693392, "step": 55060 }, { "epoch": 10.105523949348504, "grad_norm": 52.10832214355469, "learning_rate": 5.777568959549995e-06, "loss": 0.3193, "num_input_tokens_seen": 118703952, "step": 55065 }, { "epoch": 10.106441548908057, "grad_norm": 34.86666488647461, "learning_rate": 5.776777935134268e-06, "loss": 0.2761, "num_input_tokens_seen": 118713872, "step": 55070 }, { "epoch": 10.107359148467609, "grad_norm": 5.191481590270996, "learning_rate": 5.775986890795391e-06, "loss": 0.183, "num_input_tokens_seen": 118724464, "step": 55075 }, { "epoch": 10.10827674802716, "grad_norm": 0.05977505072951317, "learning_rate": 5.775195826553655e-06, "loss": 0.2097, "num_input_tokens_seen": 118734992, "step": 55080 }, { "epoch": 10.109194347586714, "grad_norm": 74.92143249511719, "learning_rate": 5.774404742429345e-06, "loss": 0.1523, "num_input_tokens_seen": 118746672, "step": 55085 }, { "epoch": 10.110111947146265, "grad_norm": 0.998877227306366, "learning_rate": 5.773613638442754e-06, "loss": 0.0358, "num_input_tokens_seen": 118758160, "step": 55090 }, { "epoch": 10.111029546705817, "grad_norm": 0.5369656085968018, "learning_rate": 5.772822514614174e-06, "loss": 0.1389, "num_input_tokens_seen": 118769392, "step": 55095 }, { "epoch": 10.11194714626537, "grad_norm": 0.6226139068603516, "learning_rate": 5.772031370963892e-06, "loss": 0.0041, "num_input_tokens_seen": 118779952, "step": 55100 }, { "epoch": 10.112864745824922, "grad_norm": 174.3236846923828, "learning_rate": 5.771240207512203e-06, "loss": 0.2958, "num_input_tokens_seen": 118791792, "step": 55105 }, { "epoch": 10.113782345384474, "grad_norm": 5.9981913566589355, "learning_rate": 5.770449024279398e-06, "loss": 0.098, "num_input_tokens_seen": 118801616, "step": 55110 }, { "epoch": 10.114699944944027, "grad_norm": 10.03951358795166, "learning_rate": 5.769657821285769e-06, "loss": 0.0864, "num_input_tokens_seen": 118810800, "step": 55115 }, { "epoch": 10.115617544503579, "grad_norm": 134.94451904296875, "learning_rate": 5.76886659855161e-06, "loss": 0.2053, "num_input_tokens_seen": 118822224, "step": 55120 }, { "epoch": 10.11653514406313, "grad_norm": 51.97688293457031, "learning_rate": 5.7680753560972155e-06, "loss": 0.2149, "num_input_tokens_seen": 118832880, "step": 55125 }, { "epoch": 10.117452743622684, "grad_norm": 0.32848605513572693, "learning_rate": 5.767284093942877e-06, "loss": 0.0268, "num_input_tokens_seen": 118844400, "step": 55130 }, { "epoch": 10.118370343182235, "grad_norm": 0.2935238480567932, "learning_rate": 5.766492812108891e-06, "loss": 0.1265, "num_input_tokens_seen": 118856144, "step": 55135 }, { "epoch": 10.119287942741787, "grad_norm": 100.96698760986328, "learning_rate": 5.7657015106155536e-06, "loss": 0.0209, "num_input_tokens_seen": 118867408, "step": 55140 }, { "epoch": 10.12020554230134, "grad_norm": 1.4899752140045166, "learning_rate": 5.764910189483157e-06, "loss": 0.2014, "num_input_tokens_seen": 118878288, "step": 55145 }, { "epoch": 10.121123141860892, "grad_norm": 0.15922603011131287, "learning_rate": 5.764118848732001e-06, "loss": 0.0837, "num_input_tokens_seen": 118888112, "step": 55150 }, { "epoch": 10.122040741420443, "grad_norm": 0.1002245843410492, "learning_rate": 5.76332748838238e-06, "loss": 0.0037, "num_input_tokens_seen": 118899216, "step": 55155 }, { "epoch": 10.122958340979997, "grad_norm": 44.2214241027832, "learning_rate": 5.762536108454593e-06, "loss": 0.096, "num_input_tokens_seen": 118909968, "step": 55160 }, { "epoch": 10.123875940539548, "grad_norm": 0.164220929145813, "learning_rate": 5.761744708968937e-06, "loss": 0.2592, "num_input_tokens_seen": 118922096, "step": 55165 }, { "epoch": 10.1247935400991, "grad_norm": 0.09110518544912338, "learning_rate": 5.760953289945709e-06, "loss": 0.0067, "num_input_tokens_seen": 118932976, "step": 55170 }, { "epoch": 10.125711139658653, "grad_norm": 14.059033393859863, "learning_rate": 5.760161851405208e-06, "loss": 0.1517, "num_input_tokens_seen": 118943632, "step": 55175 }, { "epoch": 10.126628739218205, "grad_norm": 10.706949234008789, "learning_rate": 5.759370393367733e-06, "loss": 0.1272, "num_input_tokens_seen": 118955536, "step": 55180 }, { "epoch": 10.127546338777757, "grad_norm": 257.2161865234375, "learning_rate": 5.7585789158535865e-06, "loss": 0.277, "num_input_tokens_seen": 118966704, "step": 55185 }, { "epoch": 10.12846393833731, "grad_norm": 63.842987060546875, "learning_rate": 5.757787418883065e-06, "loss": 0.3066, "num_input_tokens_seen": 118977264, "step": 55190 }, { "epoch": 10.129381537896862, "grad_norm": 23.388025283813477, "learning_rate": 5.756995902476471e-06, "loss": 0.113, "num_input_tokens_seen": 118988688, "step": 55195 }, { "epoch": 10.130299137456413, "grad_norm": 12.056617736816406, "learning_rate": 5.756204366654107e-06, "loss": 0.0735, "num_input_tokens_seen": 118999856, "step": 55200 }, { "epoch": 10.131216737015967, "grad_norm": 178.13104248046875, "learning_rate": 5.75541281143627e-06, "loss": 0.3591, "num_input_tokens_seen": 119010224, "step": 55205 }, { "epoch": 10.132134336575518, "grad_norm": 131.09974670410156, "learning_rate": 5.7546212368432665e-06, "loss": 0.1207, "num_input_tokens_seen": 119020688, "step": 55210 }, { "epoch": 10.13305193613507, "grad_norm": 0.06083831563591957, "learning_rate": 5.753829642895399e-06, "loss": 0.2792, "num_input_tokens_seen": 119030928, "step": 55215 }, { "epoch": 10.133969535694623, "grad_norm": 62.55105209350586, "learning_rate": 5.753038029612968e-06, "loss": 0.2439, "num_input_tokens_seen": 119041264, "step": 55220 }, { "epoch": 10.134887135254175, "grad_norm": 41.754337310791016, "learning_rate": 5.752246397016279e-06, "loss": 0.2178, "num_input_tokens_seen": 119052112, "step": 55225 }, { "epoch": 10.135804734813727, "grad_norm": 4.328640937805176, "learning_rate": 5.751454745125636e-06, "loss": 0.2356, "num_input_tokens_seen": 119063632, "step": 55230 }, { "epoch": 10.13672233437328, "grad_norm": 31.5950870513916, "learning_rate": 5.750663073961343e-06, "loss": 0.1636, "num_input_tokens_seen": 119074768, "step": 55235 }, { "epoch": 10.137639933932832, "grad_norm": 0.22334176301956177, "learning_rate": 5.749871383543706e-06, "loss": 0.2073, "num_input_tokens_seen": 119086032, "step": 55240 }, { "epoch": 10.138557533492383, "grad_norm": 10.06862735748291, "learning_rate": 5.7490796738930285e-06, "loss": 0.2326, "num_input_tokens_seen": 119097392, "step": 55245 }, { "epoch": 10.139475133051937, "grad_norm": 0.33989399671554565, "learning_rate": 5.748287945029621e-06, "loss": 0.0058, "num_input_tokens_seen": 119108688, "step": 55250 }, { "epoch": 10.140392732611488, "grad_norm": 0.06682690978050232, "learning_rate": 5.747496196973786e-06, "loss": 0.4126, "num_input_tokens_seen": 119118544, "step": 55255 }, { "epoch": 10.14131033217104, "grad_norm": 0.19182048738002777, "learning_rate": 5.746704429745833e-06, "loss": 0.0054, "num_input_tokens_seen": 119128368, "step": 55260 }, { "epoch": 10.142227931730593, "grad_norm": 38.17781448364258, "learning_rate": 5.7459126433660696e-06, "loss": 0.2075, "num_input_tokens_seen": 119140336, "step": 55265 }, { "epoch": 10.143145531290145, "grad_norm": 1.6198161840438843, "learning_rate": 5.745120837854801e-06, "loss": 0.1293, "num_input_tokens_seen": 119150928, "step": 55270 }, { "epoch": 10.144063130849696, "grad_norm": 9.07133960723877, "learning_rate": 5.744329013232338e-06, "loss": 0.3041, "num_input_tokens_seen": 119161584, "step": 55275 }, { "epoch": 10.14498073040925, "grad_norm": 0.11068877577781677, "learning_rate": 5.743537169518989e-06, "loss": 0.1336, "num_input_tokens_seen": 119173520, "step": 55280 }, { "epoch": 10.145898329968801, "grad_norm": 0.47378599643707275, "learning_rate": 5.742745306735066e-06, "loss": 0.0048, "num_input_tokens_seen": 119183952, "step": 55285 }, { "epoch": 10.146815929528353, "grad_norm": 0.25294598937034607, "learning_rate": 5.7419534249008745e-06, "loss": 0.0883, "num_input_tokens_seen": 119194512, "step": 55290 }, { "epoch": 10.147733529087906, "grad_norm": 0.22924445569515228, "learning_rate": 5.741161524036728e-06, "loss": 0.1147, "num_input_tokens_seen": 119205040, "step": 55295 }, { "epoch": 10.148651128647458, "grad_norm": 0.0961250588297844, "learning_rate": 5.740369604162939e-06, "loss": 0.1673, "num_input_tokens_seen": 119215632, "step": 55300 }, { "epoch": 10.14956872820701, "grad_norm": 17.4473819732666, "learning_rate": 5.739577665299815e-06, "loss": 0.1848, "num_input_tokens_seen": 119227280, "step": 55305 }, { "epoch": 10.150486327766563, "grad_norm": 114.88501739501953, "learning_rate": 5.738785707467671e-06, "loss": 0.5103, "num_input_tokens_seen": 119237104, "step": 55310 }, { "epoch": 10.151403927326115, "grad_norm": 0.3793889284133911, "learning_rate": 5.737993730686819e-06, "loss": 0.0016, "num_input_tokens_seen": 119247376, "step": 55315 }, { "epoch": 10.152321526885666, "grad_norm": 1.1972187757492065, "learning_rate": 5.737201734977571e-06, "loss": 0.0773, "num_input_tokens_seen": 119257872, "step": 55320 }, { "epoch": 10.15323912644522, "grad_norm": 0.2993435859680176, "learning_rate": 5.736409720360241e-06, "loss": 0.0095, "num_input_tokens_seen": 119268528, "step": 55325 }, { "epoch": 10.154156726004771, "grad_norm": 0.28555092215538025, "learning_rate": 5.735617686855144e-06, "loss": 0.3443, "num_input_tokens_seen": 119279632, "step": 55330 }, { "epoch": 10.155074325564323, "grad_norm": 8.445377349853516, "learning_rate": 5.734825634482593e-06, "loss": 0.1596, "num_input_tokens_seen": 119289072, "step": 55335 }, { "epoch": 10.155991925123876, "grad_norm": 1.6036213636398315, "learning_rate": 5.734033563262902e-06, "loss": 0.242, "num_input_tokens_seen": 119299120, "step": 55340 }, { "epoch": 10.156909524683428, "grad_norm": 15.281368255615234, "learning_rate": 5.7332414732163885e-06, "loss": 0.1608, "num_input_tokens_seen": 119310608, "step": 55345 }, { "epoch": 10.15782712424298, "grad_norm": 6.5578179359436035, "learning_rate": 5.732449364363368e-06, "loss": 0.5738, "num_input_tokens_seen": 119320848, "step": 55350 }, { "epoch": 10.158744723802533, "grad_norm": 15.510287284851074, "learning_rate": 5.731657236724156e-06, "loss": 0.1765, "num_input_tokens_seen": 119331024, "step": 55355 }, { "epoch": 10.159662323362085, "grad_norm": 52.3806037902832, "learning_rate": 5.730865090319072e-06, "loss": 0.2103, "num_input_tokens_seen": 119342160, "step": 55360 }, { "epoch": 10.160579922921636, "grad_norm": 0.5049178004264832, "learning_rate": 5.730072925168429e-06, "loss": 0.1461, "num_input_tokens_seen": 119353808, "step": 55365 }, { "epoch": 10.16149752248119, "grad_norm": 0.5652400255203247, "learning_rate": 5.729280741292548e-06, "loss": 0.238, "num_input_tokens_seen": 119363728, "step": 55370 }, { "epoch": 10.162415122040741, "grad_norm": 48.94697952270508, "learning_rate": 5.7284885387117465e-06, "loss": 0.3711, "num_input_tokens_seen": 119374896, "step": 55375 }, { "epoch": 10.163332721600293, "grad_norm": 149.4152374267578, "learning_rate": 5.727696317446342e-06, "loss": 0.6591, "num_input_tokens_seen": 119384208, "step": 55380 }, { "epoch": 10.164250321159846, "grad_norm": 47.60770034790039, "learning_rate": 5.726904077516655e-06, "loss": 0.0323, "num_input_tokens_seen": 119394384, "step": 55385 }, { "epoch": 10.165167920719398, "grad_norm": 3.929699659347534, "learning_rate": 5.726111818943004e-06, "loss": 0.1406, "num_input_tokens_seen": 119405872, "step": 55390 }, { "epoch": 10.16608552027895, "grad_norm": 6.599491119384766, "learning_rate": 5.725319541745712e-06, "loss": 0.326, "num_input_tokens_seen": 119415888, "step": 55395 }, { "epoch": 10.167003119838503, "grad_norm": 39.70110321044922, "learning_rate": 5.724527245945097e-06, "loss": 0.0234, "num_input_tokens_seen": 119426512, "step": 55400 }, { "epoch": 10.167920719398055, "grad_norm": 10.967939376831055, "learning_rate": 5.723734931561481e-06, "loss": 0.072, "num_input_tokens_seen": 119438352, "step": 55405 }, { "epoch": 10.168838318957606, "grad_norm": 0.05598342791199684, "learning_rate": 5.722942598615187e-06, "loss": 0.0147, "num_input_tokens_seen": 119450032, "step": 55410 }, { "epoch": 10.16975591851716, "grad_norm": 0.14586585760116577, "learning_rate": 5.7221502471265345e-06, "loss": 0.0145, "num_input_tokens_seen": 119459920, "step": 55415 }, { "epoch": 10.170673518076711, "grad_norm": 75.06470489501953, "learning_rate": 5.7213578771158465e-06, "loss": 0.7876, "num_input_tokens_seen": 119471312, "step": 55420 }, { "epoch": 10.171591117636263, "grad_norm": 48.88796615600586, "learning_rate": 5.720565488603449e-06, "loss": 0.1578, "num_input_tokens_seen": 119483792, "step": 55425 }, { "epoch": 10.172508717195816, "grad_norm": 0.09250901639461517, "learning_rate": 5.719773081609662e-06, "loss": 0.1994, "num_input_tokens_seen": 119493680, "step": 55430 }, { "epoch": 10.173426316755368, "grad_norm": 6.632504463195801, "learning_rate": 5.718980656154812e-06, "loss": 0.1821, "num_input_tokens_seen": 119503728, "step": 55435 }, { "epoch": 10.17434391631492, "grad_norm": 0.30459967255592346, "learning_rate": 5.7181882122592215e-06, "loss": 0.4444, "num_input_tokens_seen": 119515472, "step": 55440 }, { "epoch": 10.175261515874473, "grad_norm": 48.40122985839844, "learning_rate": 5.717395749943217e-06, "loss": 0.3531, "num_input_tokens_seen": 119524624, "step": 55445 }, { "epoch": 10.176179115434024, "grad_norm": 0.06575437635183334, "learning_rate": 5.716603269227124e-06, "loss": 0.0008, "num_input_tokens_seen": 119535504, "step": 55450 }, { "epoch": 10.177096714993576, "grad_norm": 47.499351501464844, "learning_rate": 5.715810770131267e-06, "loss": 0.1154, "num_input_tokens_seen": 119545968, "step": 55455 }, { "epoch": 10.17801431455313, "grad_norm": 0.4868830442428589, "learning_rate": 5.715018252675974e-06, "loss": 0.2353, "num_input_tokens_seen": 119557648, "step": 55460 }, { "epoch": 10.178931914112681, "grad_norm": 144.8456268310547, "learning_rate": 5.71422571688157e-06, "loss": 0.1734, "num_input_tokens_seen": 119568112, "step": 55465 }, { "epoch": 10.179849513672233, "grad_norm": 2.7207183837890625, "learning_rate": 5.713433162768383e-06, "loss": 0.3082, "num_input_tokens_seen": 119579792, "step": 55470 }, { "epoch": 10.180767113231786, "grad_norm": 48.42640686035156, "learning_rate": 5.712640590356742e-06, "loss": 0.3991, "num_input_tokens_seen": 119590288, "step": 55475 }, { "epoch": 10.181684712791338, "grad_norm": 0.699607789516449, "learning_rate": 5.711847999666974e-06, "loss": 0.0197, "num_input_tokens_seen": 119601840, "step": 55480 }, { "epoch": 10.18260231235089, "grad_norm": 15.700643539428711, "learning_rate": 5.711055390719409e-06, "loss": 0.4893, "num_input_tokens_seen": 119614576, "step": 55485 }, { "epoch": 10.183519911910443, "grad_norm": 29.204160690307617, "learning_rate": 5.710262763534374e-06, "loss": 0.1388, "num_input_tokens_seen": 119624272, "step": 55490 }, { "epoch": 10.184437511469994, "grad_norm": 27.179162979125977, "learning_rate": 5.709470118132201e-06, "loss": 0.2251, "num_input_tokens_seen": 119635312, "step": 55495 }, { "epoch": 10.185355111029546, "grad_norm": 0.47860264778137207, "learning_rate": 5.708677454533218e-06, "loss": 0.2986, "num_input_tokens_seen": 119646000, "step": 55500 }, { "epoch": 10.1862727105891, "grad_norm": 13.964299201965332, "learning_rate": 5.707884772757757e-06, "loss": 0.4488, "num_input_tokens_seen": 119655536, "step": 55505 }, { "epoch": 10.187190310148651, "grad_norm": 0.043019432574510574, "learning_rate": 5.707092072826149e-06, "loss": 0.2842, "num_input_tokens_seen": 119665424, "step": 55510 }, { "epoch": 10.188107909708203, "grad_norm": 76.16844177246094, "learning_rate": 5.7062993547587246e-06, "loss": 0.2579, "num_input_tokens_seen": 119675440, "step": 55515 }, { "epoch": 10.189025509267756, "grad_norm": 0.24353255331516266, "learning_rate": 5.705506618575818e-06, "loss": 0.2271, "num_input_tokens_seen": 119684144, "step": 55520 }, { "epoch": 10.189943108827308, "grad_norm": 0.29081979393959045, "learning_rate": 5.704713864297758e-06, "loss": 0.0627, "num_input_tokens_seen": 119695888, "step": 55525 }, { "epoch": 10.19086070838686, "grad_norm": 0.7747740149497986, "learning_rate": 5.70392109194488e-06, "loss": 0.3764, "num_input_tokens_seen": 119706736, "step": 55530 }, { "epoch": 10.191778307946413, "grad_norm": 0.6977134943008423, "learning_rate": 5.7031283015375175e-06, "loss": 0.0177, "num_input_tokens_seen": 119717296, "step": 55535 }, { "epoch": 10.192695907505964, "grad_norm": 17.8636474609375, "learning_rate": 5.702335493096003e-06, "loss": 0.3085, "num_input_tokens_seen": 119728464, "step": 55540 }, { "epoch": 10.193613507065516, "grad_norm": 1.8673514127731323, "learning_rate": 5.701542666640674e-06, "loss": 0.3635, "num_input_tokens_seen": 119739184, "step": 55545 }, { "epoch": 10.19453110662507, "grad_norm": 10.522501945495605, "learning_rate": 5.70074982219186e-06, "loss": 0.1854, "num_input_tokens_seen": 119750960, "step": 55550 }, { "epoch": 10.19544870618462, "grad_norm": 0.21219377219676971, "learning_rate": 5.6999569597699e-06, "loss": 0.418, "num_input_tokens_seen": 119762384, "step": 55555 }, { "epoch": 10.196366305744172, "grad_norm": 0.5264925956726074, "learning_rate": 5.69916407939513e-06, "loss": 0.0074, "num_input_tokens_seen": 119773424, "step": 55560 }, { "epoch": 10.197283905303726, "grad_norm": 17.61590003967285, "learning_rate": 5.698371181087884e-06, "loss": 0.0789, "num_input_tokens_seen": 119783312, "step": 55565 }, { "epoch": 10.198201504863277, "grad_norm": 0.1393614262342453, "learning_rate": 5.6975782648684995e-06, "loss": 0.1996, "num_input_tokens_seen": 119793968, "step": 55570 }, { "epoch": 10.199119104422829, "grad_norm": 0.1869068741798401, "learning_rate": 5.696785330757314e-06, "loss": 0.1123, "num_input_tokens_seen": 119805808, "step": 55575 }, { "epoch": 10.200036703982382, "grad_norm": 0.29390662908554077, "learning_rate": 5.695992378774665e-06, "loss": 0.1284, "num_input_tokens_seen": 119817424, "step": 55580 }, { "epoch": 10.200954303541934, "grad_norm": 17.430675506591797, "learning_rate": 5.695199408940889e-06, "loss": 0.297, "num_input_tokens_seen": 119828368, "step": 55585 }, { "epoch": 10.201871903101486, "grad_norm": 0.049313001334667206, "learning_rate": 5.694406421276327e-06, "loss": 0.2787, "num_input_tokens_seen": 119838064, "step": 55590 }, { "epoch": 10.20278950266104, "grad_norm": 0.3770930767059326, "learning_rate": 5.693613415801317e-06, "loss": 0.0027, "num_input_tokens_seen": 119849136, "step": 55595 }, { "epoch": 10.20370710222059, "grad_norm": 33.03369140625, "learning_rate": 5.692820392536196e-06, "loss": 0.2311, "num_input_tokens_seen": 119860784, "step": 55600 }, { "epoch": 10.204624701780142, "grad_norm": 29.700462341308594, "learning_rate": 5.692027351501307e-06, "loss": 0.4256, "num_input_tokens_seen": 119872176, "step": 55605 }, { "epoch": 10.205542301339696, "grad_norm": 0.08284975588321686, "learning_rate": 5.691234292716988e-06, "loss": 0.3246, "num_input_tokens_seen": 119883056, "step": 55610 }, { "epoch": 10.206459900899247, "grad_norm": 50.06081771850586, "learning_rate": 5.69044121620358e-06, "loss": 0.135, "num_input_tokens_seen": 119893168, "step": 55615 }, { "epoch": 10.207377500458799, "grad_norm": 117.12812805175781, "learning_rate": 5.689648121981427e-06, "loss": 0.3662, "num_input_tokens_seen": 119903184, "step": 55620 }, { "epoch": 10.208295100018352, "grad_norm": 0.5095745921134949, "learning_rate": 5.688855010070867e-06, "loss": 0.2121, "num_input_tokens_seen": 119914256, "step": 55625 }, { "epoch": 10.209212699577904, "grad_norm": 0.15873338282108307, "learning_rate": 5.688061880492245e-06, "loss": 0.0066, "num_input_tokens_seen": 119924240, "step": 55630 }, { "epoch": 10.210130299137456, "grad_norm": 3.5066945552825928, "learning_rate": 5.687268733265901e-06, "loss": 0.2064, "num_input_tokens_seen": 119935568, "step": 55635 }, { "epoch": 10.211047898697009, "grad_norm": 7.397615909576416, "learning_rate": 5.68647556841218e-06, "loss": 0.0106, "num_input_tokens_seen": 119945936, "step": 55640 }, { "epoch": 10.21196549825656, "grad_norm": 34.569698333740234, "learning_rate": 5.685682385951424e-06, "loss": 0.1603, "num_input_tokens_seen": 119956208, "step": 55645 }, { "epoch": 10.212883097816112, "grad_norm": 3.6006107330322266, "learning_rate": 5.684889185903977e-06, "loss": 0.2971, "num_input_tokens_seen": 119967344, "step": 55650 }, { "epoch": 10.213800697375666, "grad_norm": 31.87009048461914, "learning_rate": 5.684095968290185e-06, "loss": 0.1362, "num_input_tokens_seen": 119977808, "step": 55655 }, { "epoch": 10.214718296935217, "grad_norm": 0.7755733728408813, "learning_rate": 5.683302733130391e-06, "loss": 0.2516, "num_input_tokens_seen": 119990608, "step": 55660 }, { "epoch": 10.215635896494769, "grad_norm": 71.44164276123047, "learning_rate": 5.682509480444941e-06, "loss": 0.1406, "num_input_tokens_seen": 120002000, "step": 55665 }, { "epoch": 10.216553496054322, "grad_norm": 42.56148910522461, "learning_rate": 5.681716210254181e-06, "loss": 0.1578, "num_input_tokens_seen": 120013488, "step": 55670 }, { "epoch": 10.217471095613874, "grad_norm": 11.621359825134277, "learning_rate": 5.680922922578456e-06, "loss": 0.2083, "num_input_tokens_seen": 120023920, "step": 55675 }, { "epoch": 10.218388695173426, "grad_norm": 1.04301118850708, "learning_rate": 5.6801296174381145e-06, "loss": 0.0763, "num_input_tokens_seen": 120035120, "step": 55680 }, { "epoch": 10.219306294732979, "grad_norm": 37.17194747924805, "learning_rate": 5.6793362948535015e-06, "loss": 0.0283, "num_input_tokens_seen": 120046224, "step": 55685 }, { "epoch": 10.22022389429253, "grad_norm": 110.91500854492188, "learning_rate": 5.678542954844967e-06, "loss": 0.201, "num_input_tokens_seen": 120057680, "step": 55690 }, { "epoch": 10.221141493852082, "grad_norm": 0.025758571922779083, "learning_rate": 5.677749597432854e-06, "loss": 0.131, "num_input_tokens_seen": 120068816, "step": 55695 }, { "epoch": 10.222059093411636, "grad_norm": 0.16955025494098663, "learning_rate": 5.6769562226375175e-06, "loss": 0.1247, "num_input_tokens_seen": 120079696, "step": 55700 }, { "epoch": 10.222976692971187, "grad_norm": 0.055060975253582, "learning_rate": 5.676162830479303e-06, "loss": 0.0014, "num_input_tokens_seen": 120090224, "step": 55705 }, { "epoch": 10.223894292530739, "grad_norm": 3.725576877593994, "learning_rate": 5.675369420978558e-06, "loss": 0.3988, "num_input_tokens_seen": 120102896, "step": 55710 }, { "epoch": 10.224811892090292, "grad_norm": 2.734341859817505, "learning_rate": 5.6745759941556345e-06, "loss": 0.2618, "num_input_tokens_seen": 120114896, "step": 55715 }, { "epoch": 10.225729491649844, "grad_norm": 0.5783483386039734, "learning_rate": 5.673782550030883e-06, "loss": 0.0028, "num_input_tokens_seen": 120126224, "step": 55720 }, { "epoch": 10.226647091209395, "grad_norm": 17.19048500061035, "learning_rate": 5.672989088624652e-06, "loss": 0.2655, "num_input_tokens_seen": 120135536, "step": 55725 }, { "epoch": 10.227564690768949, "grad_norm": 0.13568595051765442, "learning_rate": 5.6721956099572965e-06, "loss": 0.0014, "num_input_tokens_seen": 120146928, "step": 55730 }, { "epoch": 10.2284822903285, "grad_norm": 28.48432731628418, "learning_rate": 5.671402114049163e-06, "loss": 0.4165, "num_input_tokens_seen": 120158064, "step": 55735 }, { "epoch": 10.229399889888052, "grad_norm": 0.16438966989517212, "learning_rate": 5.670608600920607e-06, "loss": 0.0016, "num_input_tokens_seen": 120168464, "step": 55740 }, { "epoch": 10.230317489447605, "grad_norm": 0.09739512950181961, "learning_rate": 5.669815070591979e-06, "loss": 0.1895, "num_input_tokens_seen": 120179024, "step": 55745 }, { "epoch": 10.231235089007157, "grad_norm": 1.389870285987854, "learning_rate": 5.669021523083632e-06, "loss": 0.0412, "num_input_tokens_seen": 120188816, "step": 55750 }, { "epoch": 10.232152688566709, "grad_norm": 26.89318084716797, "learning_rate": 5.668227958415921e-06, "loss": 0.2159, "num_input_tokens_seen": 120199920, "step": 55755 }, { "epoch": 10.233070288126262, "grad_norm": 0.25580930709838867, "learning_rate": 5.6674343766091974e-06, "loss": 0.1283, "num_input_tokens_seen": 120211248, "step": 55760 }, { "epoch": 10.233987887685814, "grad_norm": 216.0419921875, "learning_rate": 5.666640777683818e-06, "loss": 0.374, "num_input_tokens_seen": 120222352, "step": 55765 }, { "epoch": 10.234905487245365, "grad_norm": 55.84391403198242, "learning_rate": 5.6658471616601355e-06, "loss": 0.4052, "num_input_tokens_seen": 120234064, "step": 55770 }, { "epoch": 10.235823086804919, "grad_norm": 0.5044029951095581, "learning_rate": 5.665053528558504e-06, "loss": 0.2313, "num_input_tokens_seen": 120244944, "step": 55775 }, { "epoch": 10.23674068636447, "grad_norm": 3.398303747177124, "learning_rate": 5.664259878399282e-06, "loss": 0.3524, "num_input_tokens_seen": 120256144, "step": 55780 }, { "epoch": 10.237658285924022, "grad_norm": 2.324655532836914, "learning_rate": 5.6634662112028225e-06, "loss": 0.1153, "num_input_tokens_seen": 120267856, "step": 55785 }, { "epoch": 10.238575885483575, "grad_norm": 0.2160048484802246, "learning_rate": 5.662672526989484e-06, "loss": 0.1103, "num_input_tokens_seen": 120279152, "step": 55790 }, { "epoch": 10.239493485043127, "grad_norm": 19.452836990356445, "learning_rate": 5.661878825779621e-06, "loss": 0.1147, "num_input_tokens_seen": 120289712, "step": 55795 }, { "epoch": 10.240411084602679, "grad_norm": 101.91586303710938, "learning_rate": 5.661085107593593e-06, "loss": 0.2904, "num_input_tokens_seen": 120301488, "step": 55800 }, { "epoch": 10.241328684162232, "grad_norm": 41.91647720336914, "learning_rate": 5.660291372451756e-06, "loss": 0.3915, "num_input_tokens_seen": 120312688, "step": 55805 }, { "epoch": 10.242246283721784, "grad_norm": 0.14959940314292908, "learning_rate": 5.659497620374469e-06, "loss": 0.1348, "num_input_tokens_seen": 120324048, "step": 55810 }, { "epoch": 10.243163883281335, "grad_norm": 2.7241852283477783, "learning_rate": 5.65870385138209e-06, "loss": 0.1596, "num_input_tokens_seen": 120333744, "step": 55815 }, { "epoch": 10.244081482840889, "grad_norm": 14.030442237854004, "learning_rate": 5.657910065494978e-06, "loss": 0.2693, "num_input_tokens_seen": 120344688, "step": 55820 }, { "epoch": 10.24499908240044, "grad_norm": 16.36949920654297, "learning_rate": 5.657116262733492e-06, "loss": 0.2858, "num_input_tokens_seen": 120353648, "step": 55825 }, { "epoch": 10.245916681959992, "grad_norm": 0.13534103333950043, "learning_rate": 5.656322443117993e-06, "loss": 0.0583, "num_input_tokens_seen": 120365232, "step": 55830 }, { "epoch": 10.246834281519545, "grad_norm": 30.839147567749023, "learning_rate": 5.655528606668839e-06, "loss": 0.0969, "num_input_tokens_seen": 120374992, "step": 55835 }, { "epoch": 10.247751881079097, "grad_norm": 36.5782470703125, "learning_rate": 5.654734753406394e-06, "loss": 0.277, "num_input_tokens_seen": 120386736, "step": 55840 }, { "epoch": 10.248669480638648, "grad_norm": 29.081104278564453, "learning_rate": 5.653940883351017e-06, "loss": 0.3005, "num_input_tokens_seen": 120398000, "step": 55845 }, { "epoch": 10.249587080198202, "grad_norm": 40.29538345336914, "learning_rate": 5.653146996523069e-06, "loss": 0.2783, "num_input_tokens_seen": 120409488, "step": 55850 }, { "epoch": 10.250504679757753, "grad_norm": 10.143563270568848, "learning_rate": 5.6523530929429145e-06, "loss": 0.0087, "num_input_tokens_seen": 120420528, "step": 55855 }, { "epoch": 10.251422279317305, "grad_norm": 0.3276146650314331, "learning_rate": 5.6515591726309124e-06, "loss": 0.2471, "num_input_tokens_seen": 120431920, "step": 55860 }, { "epoch": 10.252339878876858, "grad_norm": 1.0152040719985962, "learning_rate": 5.65076523560743e-06, "loss": 0.1525, "num_input_tokens_seen": 120443184, "step": 55865 }, { "epoch": 10.25325747843641, "grad_norm": 278.9244689941406, "learning_rate": 5.649971281892826e-06, "loss": 0.2137, "num_input_tokens_seen": 120453808, "step": 55870 }, { "epoch": 10.254175077995962, "grad_norm": 0.9052873849868774, "learning_rate": 5.649177311507465e-06, "loss": 0.2055, "num_input_tokens_seen": 120463952, "step": 55875 }, { "epoch": 10.255092677555515, "grad_norm": 2.5905046463012695, "learning_rate": 5.6483833244717136e-06, "loss": 0.2298, "num_input_tokens_seen": 120473648, "step": 55880 }, { "epoch": 10.256010277115067, "grad_norm": 0.0948898121714592, "learning_rate": 5.6475893208059325e-06, "loss": 0.2861, "num_input_tokens_seen": 120483440, "step": 55885 }, { "epoch": 10.256927876674618, "grad_norm": 0.3933251202106476, "learning_rate": 5.646795300530492e-06, "loss": 0.1456, "num_input_tokens_seen": 120494928, "step": 55890 }, { "epoch": 10.257845476234172, "grad_norm": 0.040115952491760254, "learning_rate": 5.646001263665753e-06, "loss": 0.0622, "num_input_tokens_seen": 120506128, "step": 55895 }, { "epoch": 10.258763075793723, "grad_norm": 0.23043300211429596, "learning_rate": 5.645207210232084e-06, "loss": 0.1467, "num_input_tokens_seen": 120516752, "step": 55900 }, { "epoch": 10.259680675353275, "grad_norm": 0.08370637148618698, "learning_rate": 5.644413140249849e-06, "loss": 0.4688, "num_input_tokens_seen": 120526992, "step": 55905 }, { "epoch": 10.260598274912828, "grad_norm": 0.32105451822280884, "learning_rate": 5.643619053739417e-06, "loss": 0.2208, "num_input_tokens_seen": 120535984, "step": 55910 }, { "epoch": 10.26151587447238, "grad_norm": 5.890213966369629, "learning_rate": 5.642824950721153e-06, "loss": 0.0509, "num_input_tokens_seen": 120547152, "step": 55915 }, { "epoch": 10.262433474031932, "grad_norm": 10.549745559692383, "learning_rate": 5.642030831215423e-06, "loss": 0.1005, "num_input_tokens_seen": 120559280, "step": 55920 }, { "epoch": 10.263351073591485, "grad_norm": 13.153858184814453, "learning_rate": 5.641236695242601e-06, "loss": 0.2014, "num_input_tokens_seen": 120568688, "step": 55925 }, { "epoch": 10.264268673151037, "grad_norm": 0.17351050674915314, "learning_rate": 5.640442542823049e-06, "loss": 0.099, "num_input_tokens_seen": 120579664, "step": 55930 }, { "epoch": 10.265186272710588, "grad_norm": 87.28777313232422, "learning_rate": 5.639648373977139e-06, "loss": 0.0544, "num_input_tokens_seen": 120590352, "step": 55935 }, { "epoch": 10.266103872270142, "grad_norm": 1.6782350540161133, "learning_rate": 5.63885418872524e-06, "loss": 0.1751, "num_input_tokens_seen": 120601552, "step": 55940 }, { "epoch": 10.267021471829693, "grad_norm": 3.1188805103302, "learning_rate": 5.6380599870877205e-06, "loss": 0.1658, "num_input_tokens_seen": 120612528, "step": 55945 }, { "epoch": 10.267939071389245, "grad_norm": 0.7322880625724792, "learning_rate": 5.637265769084953e-06, "loss": 0.5358, "num_input_tokens_seen": 120623344, "step": 55950 }, { "epoch": 10.268856670948798, "grad_norm": 0.6088904738426208, "learning_rate": 5.6364715347373045e-06, "loss": 0.1634, "num_input_tokens_seen": 120635312, "step": 55955 }, { "epoch": 10.26977427050835, "grad_norm": 6.875274181365967, "learning_rate": 5.635677284065147e-06, "loss": 0.2841, "num_input_tokens_seen": 120645712, "step": 55960 }, { "epoch": 10.270691870067902, "grad_norm": 0.24570058286190033, "learning_rate": 5.6348830170888535e-06, "loss": 0.087, "num_input_tokens_seen": 120656976, "step": 55965 }, { "epoch": 10.271609469627455, "grad_norm": 9.127150535583496, "learning_rate": 5.634088733828794e-06, "loss": 0.0943, "num_input_tokens_seen": 120667920, "step": 55970 }, { "epoch": 10.272527069187007, "grad_norm": 0.13758721947669983, "learning_rate": 5.633294434305343e-06, "loss": 0.2264, "num_input_tokens_seen": 120678256, "step": 55975 }, { "epoch": 10.273444668746558, "grad_norm": 31.407068252563477, "learning_rate": 5.632500118538869e-06, "loss": 0.1486, "num_input_tokens_seen": 120689552, "step": 55980 }, { "epoch": 10.274362268306112, "grad_norm": 2.0379300117492676, "learning_rate": 5.631705786549748e-06, "loss": 0.319, "num_input_tokens_seen": 120700336, "step": 55985 }, { "epoch": 10.275279867865663, "grad_norm": 5.197236061096191, "learning_rate": 5.630911438358353e-06, "loss": 0.731, "num_input_tokens_seen": 120712272, "step": 55990 }, { "epoch": 10.276197467425215, "grad_norm": 134.00323486328125, "learning_rate": 5.630117073985057e-06, "loss": 0.3563, "num_input_tokens_seen": 120721744, "step": 55995 }, { "epoch": 10.277115066984768, "grad_norm": 0.08906597644090652, "learning_rate": 5.629322693450236e-06, "loss": 0.2601, "num_input_tokens_seen": 120732432, "step": 56000 }, { "epoch": 10.27803266654432, "grad_norm": 0.1085018739104271, "learning_rate": 5.6285282967742615e-06, "loss": 0.0019, "num_input_tokens_seen": 120743056, "step": 56005 }, { "epoch": 10.278950266103871, "grad_norm": 7.142361640930176, "learning_rate": 5.6277338839775104e-06, "loss": 0.1976, "num_input_tokens_seen": 120754224, "step": 56010 }, { "epoch": 10.279867865663425, "grad_norm": 130.3538055419922, "learning_rate": 5.626939455080359e-06, "loss": 0.1031, "num_input_tokens_seen": 120765584, "step": 56015 }, { "epoch": 10.280785465222976, "grad_norm": 0.03463088721036911, "learning_rate": 5.626145010103182e-06, "loss": 0.2277, "num_input_tokens_seen": 120776080, "step": 56020 }, { "epoch": 10.281703064782528, "grad_norm": 0.9061025381088257, "learning_rate": 5.625350549066357e-06, "loss": 0.2091, "num_input_tokens_seen": 120786864, "step": 56025 }, { "epoch": 10.282620664342081, "grad_norm": 0.04513450339436531, "learning_rate": 5.6245560719902585e-06, "loss": 0.0007, "num_input_tokens_seen": 120798608, "step": 56030 }, { "epoch": 10.283538263901633, "grad_norm": 0.3737891614437103, "learning_rate": 5.6237615788952634e-06, "loss": 0.2255, "num_input_tokens_seen": 120809328, "step": 56035 }, { "epoch": 10.284455863461186, "grad_norm": 0.8282175064086914, "learning_rate": 5.622967069801752e-06, "loss": 0.2158, "num_input_tokens_seen": 120818672, "step": 56040 }, { "epoch": 10.285373463020738, "grad_norm": 0.06342390924692154, "learning_rate": 5.622172544730101e-06, "loss": 0.1018, "num_input_tokens_seen": 120829744, "step": 56045 }, { "epoch": 10.28629106258029, "grad_norm": 5.548817157745361, "learning_rate": 5.6213780037006885e-06, "loss": 0.1754, "num_input_tokens_seen": 120840176, "step": 56050 }, { "epoch": 10.287208662139843, "grad_norm": 35.44438171386719, "learning_rate": 5.6205834467338925e-06, "loss": 0.1219, "num_input_tokens_seen": 120850896, "step": 56055 }, { "epoch": 10.288126261699395, "grad_norm": 7.869778633117676, "learning_rate": 5.619788873850094e-06, "loss": 0.1466, "num_input_tokens_seen": 120861936, "step": 56060 }, { "epoch": 10.289043861258946, "grad_norm": 18.960006713867188, "learning_rate": 5.6189942850696695e-06, "loss": 0.2249, "num_input_tokens_seen": 120871984, "step": 56065 }, { "epoch": 10.2899614608185, "grad_norm": 0.049411483108997345, "learning_rate": 5.618199680413003e-06, "loss": 0.1731, "num_input_tokens_seen": 120882096, "step": 56070 }, { "epoch": 10.290879060378051, "grad_norm": 0.5087757110595703, "learning_rate": 5.617405059900472e-06, "loss": 0.0133, "num_input_tokens_seen": 120893712, "step": 56075 }, { "epoch": 10.291796659937603, "grad_norm": 0.033844731748104095, "learning_rate": 5.616610423552458e-06, "loss": 0.2374, "num_input_tokens_seen": 120905680, "step": 56080 }, { "epoch": 10.292714259497156, "grad_norm": 0.3381367325782776, "learning_rate": 5.615815771389342e-06, "loss": 0.1943, "num_input_tokens_seen": 120914896, "step": 56085 }, { "epoch": 10.293631859056708, "grad_norm": 0.09062238782644272, "learning_rate": 5.615021103431506e-06, "loss": 0.2543, "num_input_tokens_seen": 120925040, "step": 56090 }, { "epoch": 10.29454945861626, "grad_norm": 0.10037863254547119, "learning_rate": 5.614226419699332e-06, "loss": 0.0016, "num_input_tokens_seen": 120936112, "step": 56095 }, { "epoch": 10.295467058175813, "grad_norm": 0.06434272229671478, "learning_rate": 5.613431720213203e-06, "loss": 0.1782, "num_input_tokens_seen": 120947376, "step": 56100 }, { "epoch": 10.296384657735365, "grad_norm": 208.9337158203125, "learning_rate": 5.6126370049935e-06, "loss": 0.1836, "num_input_tokens_seen": 120958096, "step": 56105 }, { "epoch": 10.297302257294916, "grad_norm": 0.17693868279457092, "learning_rate": 5.611842274060609e-06, "loss": 0.2726, "num_input_tokens_seen": 120969552, "step": 56110 }, { "epoch": 10.29821985685447, "grad_norm": 0.34607094526290894, "learning_rate": 5.611047527434909e-06, "loss": 0.2028, "num_input_tokens_seen": 120981104, "step": 56115 }, { "epoch": 10.299137456414021, "grad_norm": 2.0886142253875732, "learning_rate": 5.61025276513679e-06, "loss": 0.0226, "num_input_tokens_seen": 120992336, "step": 56120 }, { "epoch": 10.300055055973573, "grad_norm": 0.5507020950317383, "learning_rate": 5.609457987186631e-06, "loss": 0.1147, "num_input_tokens_seen": 121002416, "step": 56125 }, { "epoch": 10.300972655533126, "grad_norm": 10.493326187133789, "learning_rate": 5.608663193604822e-06, "loss": 0.2737, "num_input_tokens_seen": 121013104, "step": 56130 }, { "epoch": 10.301890255092678, "grad_norm": 349.55413818359375, "learning_rate": 5.607868384411744e-06, "loss": 0.3415, "num_input_tokens_seen": 121023312, "step": 56135 }, { "epoch": 10.30280785465223, "grad_norm": 14.787537574768066, "learning_rate": 5.607073559627784e-06, "loss": 0.2972, "num_input_tokens_seen": 121034224, "step": 56140 }, { "epoch": 10.303725454211783, "grad_norm": 71.56895446777344, "learning_rate": 5.606278719273327e-06, "loss": 0.3658, "num_input_tokens_seen": 121045264, "step": 56145 }, { "epoch": 10.304643053771334, "grad_norm": 216.3774871826172, "learning_rate": 5.605483863368762e-06, "loss": 0.0792, "num_input_tokens_seen": 121056976, "step": 56150 }, { "epoch": 10.305560653330886, "grad_norm": 29.937671661376953, "learning_rate": 5.604688991934474e-06, "loss": 0.2571, "num_input_tokens_seen": 121067472, "step": 56155 }, { "epoch": 10.30647825289044, "grad_norm": 0.10676368325948715, "learning_rate": 5.60389410499085e-06, "loss": 0.0261, "num_input_tokens_seen": 121078384, "step": 56160 }, { "epoch": 10.307395852449991, "grad_norm": 0.49605822563171387, "learning_rate": 5.603099202558279e-06, "loss": 0.133, "num_input_tokens_seen": 121089904, "step": 56165 }, { "epoch": 10.308313452009543, "grad_norm": 0.41444694995880127, "learning_rate": 5.602304284657146e-06, "loss": 0.009, "num_input_tokens_seen": 121101392, "step": 56170 }, { "epoch": 10.309231051569096, "grad_norm": 141.67758178710938, "learning_rate": 5.601509351307844e-06, "loss": 0.7584, "num_input_tokens_seen": 121112464, "step": 56175 }, { "epoch": 10.310148651128648, "grad_norm": 0.4777647852897644, "learning_rate": 5.600714402530759e-06, "loss": 0.193, "num_input_tokens_seen": 121123376, "step": 56180 }, { "epoch": 10.3110662506882, "grad_norm": 107.40147399902344, "learning_rate": 5.5999194383462806e-06, "loss": 0.022, "num_input_tokens_seen": 121134576, "step": 56185 }, { "epoch": 10.311983850247753, "grad_norm": 119.3224105834961, "learning_rate": 5.599124458774797e-06, "loss": 0.4576, "num_input_tokens_seen": 121145968, "step": 56190 }, { "epoch": 10.312901449807304, "grad_norm": 37.003448486328125, "learning_rate": 5.598329463836702e-06, "loss": 0.1936, "num_input_tokens_seen": 121155760, "step": 56195 }, { "epoch": 10.313819049366856, "grad_norm": 20.299034118652344, "learning_rate": 5.597534453552381e-06, "loss": 0.1495, "num_input_tokens_seen": 121167920, "step": 56200 }, { "epoch": 10.31473664892641, "grad_norm": 0.25035223364830017, "learning_rate": 5.5967394279422286e-06, "loss": 0.278, "num_input_tokens_seen": 121178704, "step": 56205 }, { "epoch": 10.315654248485961, "grad_norm": 0.42673757672309875, "learning_rate": 5.595944387026635e-06, "loss": 0.0953, "num_input_tokens_seen": 121189808, "step": 56210 }, { "epoch": 10.316571848045513, "grad_norm": 29.09532928466797, "learning_rate": 5.595149330825991e-06, "loss": 0.468, "num_input_tokens_seen": 121202192, "step": 56215 }, { "epoch": 10.317489447605066, "grad_norm": 8.514286994934082, "learning_rate": 5.594354259360689e-06, "loss": 0.1836, "num_input_tokens_seen": 121213808, "step": 56220 }, { "epoch": 10.318407047164618, "grad_norm": 2.178194046020508, "learning_rate": 5.593559172651122e-06, "loss": 0.1025, "num_input_tokens_seen": 121224592, "step": 56225 }, { "epoch": 10.31932464672417, "grad_norm": 0.34945136308670044, "learning_rate": 5.592764070717682e-06, "loss": 0.0419, "num_input_tokens_seen": 121235408, "step": 56230 }, { "epoch": 10.320242246283723, "grad_norm": 0.3100269138813019, "learning_rate": 5.591968953580762e-06, "loss": 0.1252, "num_input_tokens_seen": 121246288, "step": 56235 }, { "epoch": 10.321159845843274, "grad_norm": 0.14254207909107208, "learning_rate": 5.5911738212607554e-06, "loss": 0.0854, "num_input_tokens_seen": 121257584, "step": 56240 }, { "epoch": 10.322077445402826, "grad_norm": 136.52496337890625, "learning_rate": 5.590378673778059e-06, "loss": 0.1883, "num_input_tokens_seen": 121267920, "step": 56245 }, { "epoch": 10.32299504496238, "grad_norm": 0.06272414326667786, "learning_rate": 5.589583511153061e-06, "loss": 0.232, "num_input_tokens_seen": 121277264, "step": 56250 }, { "epoch": 10.323912644521931, "grad_norm": 67.04232025146484, "learning_rate": 5.588788333406162e-06, "loss": 0.0747, "num_input_tokens_seen": 121289200, "step": 56255 }, { "epoch": 10.324830244081483, "grad_norm": 0.295840859413147, "learning_rate": 5.587993140557755e-06, "loss": 0.0943, "num_input_tokens_seen": 121300784, "step": 56260 }, { "epoch": 10.325747843641036, "grad_norm": 0.0754590705037117, "learning_rate": 5.5871979326282335e-06, "loss": 0.0443, "num_input_tokens_seen": 121312656, "step": 56265 }, { "epoch": 10.326665443200588, "grad_norm": 0.26762473583221436, "learning_rate": 5.586402709637997e-06, "loss": 0.0973, "num_input_tokens_seen": 121324176, "step": 56270 }, { "epoch": 10.32758304276014, "grad_norm": 0.026999982073903084, "learning_rate": 5.585607471607438e-06, "loss": 0.048, "num_input_tokens_seen": 121334896, "step": 56275 }, { "epoch": 10.328500642319693, "grad_norm": 87.89253997802734, "learning_rate": 5.584812218556955e-06, "loss": 0.1688, "num_input_tokens_seen": 121345776, "step": 56280 }, { "epoch": 10.329418241879244, "grad_norm": 59.42790985107422, "learning_rate": 5.584016950506947e-06, "loss": 0.1041, "num_input_tokens_seen": 121357552, "step": 56285 }, { "epoch": 10.330335841438796, "grad_norm": 0.13285544514656067, "learning_rate": 5.583221667477807e-06, "loss": 0.3505, "num_input_tokens_seen": 121366576, "step": 56290 }, { "epoch": 10.33125344099835, "grad_norm": 0.7953180074691772, "learning_rate": 5.582426369489937e-06, "loss": 0.0801, "num_input_tokens_seen": 121377616, "step": 56295 }, { "epoch": 10.3321710405579, "grad_norm": 0.03679677098989487, "learning_rate": 5.581631056563732e-06, "loss": 0.0995, "num_input_tokens_seen": 121387088, "step": 56300 }, { "epoch": 10.333088640117452, "grad_norm": 0.05177481099963188, "learning_rate": 5.580835728719593e-06, "loss": 0.0101, "num_input_tokens_seen": 121397200, "step": 56305 }, { "epoch": 10.334006239677006, "grad_norm": 140.63990783691406, "learning_rate": 5.5800403859779175e-06, "loss": 0.1972, "num_input_tokens_seen": 121407888, "step": 56310 }, { "epoch": 10.334923839236557, "grad_norm": 0.053941525518894196, "learning_rate": 5.579245028359104e-06, "loss": 0.0757, "num_input_tokens_seen": 121418864, "step": 56315 }, { "epoch": 10.335841438796109, "grad_norm": 92.04214477539062, "learning_rate": 5.5784496558835545e-06, "loss": 0.1389, "num_input_tokens_seen": 121429584, "step": 56320 }, { "epoch": 10.336759038355662, "grad_norm": 3.3592991828918457, "learning_rate": 5.5776542685716665e-06, "loss": 0.0143, "num_input_tokens_seen": 121440880, "step": 56325 }, { "epoch": 10.337676637915214, "grad_norm": 0.2026277482509613, "learning_rate": 5.576858866443844e-06, "loss": 0.1449, "num_input_tokens_seen": 121450832, "step": 56330 }, { "epoch": 10.338594237474766, "grad_norm": 0.40074050426483154, "learning_rate": 5.576063449520484e-06, "loss": 0.57, "num_input_tokens_seen": 121462416, "step": 56335 }, { "epoch": 10.339511837034319, "grad_norm": 18.83780860900879, "learning_rate": 5.57526801782199e-06, "loss": 0.1367, "num_input_tokens_seen": 121473168, "step": 56340 }, { "epoch": 10.34042943659387, "grad_norm": 57.719627380371094, "learning_rate": 5.574472571368763e-06, "loss": 0.1277, "num_input_tokens_seen": 121483792, "step": 56345 }, { "epoch": 10.341347036153422, "grad_norm": 112.09634399414062, "learning_rate": 5.573677110181204e-06, "loss": 0.3125, "num_input_tokens_seen": 121494768, "step": 56350 }, { "epoch": 10.342264635712976, "grad_norm": 21.393049240112305, "learning_rate": 5.572881634279716e-06, "loss": 0.4054, "num_input_tokens_seen": 121504912, "step": 56355 }, { "epoch": 10.343182235272527, "grad_norm": 0.11029738187789917, "learning_rate": 5.572086143684703e-06, "loss": 0.0092, "num_input_tokens_seen": 121516048, "step": 56360 }, { "epoch": 10.344099834832079, "grad_norm": 28.649065017700195, "learning_rate": 5.571290638416566e-06, "loss": 0.4115, "num_input_tokens_seen": 121527824, "step": 56365 }, { "epoch": 10.345017434391632, "grad_norm": 0.07012643665075302, "learning_rate": 5.570495118495711e-06, "loss": 0.124, "num_input_tokens_seen": 121537456, "step": 56370 }, { "epoch": 10.345935033951184, "grad_norm": 0.06364556401968002, "learning_rate": 5.569699583942539e-06, "loss": 0.1007, "num_input_tokens_seen": 121548592, "step": 56375 }, { "epoch": 10.346852633510736, "grad_norm": 0.1374201625585556, "learning_rate": 5.568904034777458e-06, "loss": 0.4758, "num_input_tokens_seen": 121558416, "step": 56380 }, { "epoch": 10.347770233070289, "grad_norm": 1.249243974685669, "learning_rate": 5.5681084710208675e-06, "loss": 0.1966, "num_input_tokens_seen": 121567952, "step": 56385 }, { "epoch": 10.34868783262984, "grad_norm": 0.2608333230018616, "learning_rate": 5.567312892693176e-06, "loss": 0.0465, "num_input_tokens_seen": 121578608, "step": 56390 }, { "epoch": 10.349605432189392, "grad_norm": 15.363096237182617, "learning_rate": 5.566517299814789e-06, "loss": 0.5042, "num_input_tokens_seen": 121588880, "step": 56395 }, { "epoch": 10.350523031748946, "grad_norm": 28.05228042602539, "learning_rate": 5.565721692406109e-06, "loss": 0.1363, "num_input_tokens_seen": 121599344, "step": 56400 }, { "epoch": 10.351440631308497, "grad_norm": 16.15683364868164, "learning_rate": 5.564926070487548e-06, "loss": 0.1766, "num_input_tokens_seen": 121609424, "step": 56405 }, { "epoch": 10.352358230868049, "grad_norm": 0.8924899101257324, "learning_rate": 5.564130434079506e-06, "loss": 0.011, "num_input_tokens_seen": 121621072, "step": 56410 }, { "epoch": 10.353275830427602, "grad_norm": 0.23306407034397125, "learning_rate": 5.563334783202393e-06, "loss": 0.1908, "num_input_tokens_seen": 121632976, "step": 56415 }, { "epoch": 10.354193429987154, "grad_norm": 0.8861615061759949, "learning_rate": 5.5625391178766164e-06, "loss": 0.2595, "num_input_tokens_seen": 121643696, "step": 56420 }, { "epoch": 10.355111029546705, "grad_norm": 0.0726039856672287, "learning_rate": 5.561743438122583e-06, "loss": 0.1759, "num_input_tokens_seen": 121654032, "step": 56425 }, { "epoch": 10.356028629106259, "grad_norm": 13.505610466003418, "learning_rate": 5.560947743960702e-06, "loss": 0.1929, "num_input_tokens_seen": 121665552, "step": 56430 }, { "epoch": 10.35694622866581, "grad_norm": 24.187665939331055, "learning_rate": 5.5601520354113805e-06, "loss": 0.3418, "num_input_tokens_seen": 121675824, "step": 56435 }, { "epoch": 10.357863828225362, "grad_norm": 28.82449722290039, "learning_rate": 5.559356312495027e-06, "loss": 0.3194, "num_input_tokens_seen": 121687248, "step": 56440 }, { "epoch": 10.358781427784916, "grad_norm": 0.27946051955223083, "learning_rate": 5.55856057523205e-06, "loss": 0.0529, "num_input_tokens_seen": 121698384, "step": 56445 }, { "epoch": 10.359699027344467, "grad_norm": 78.41863250732422, "learning_rate": 5.557764823642862e-06, "loss": 0.1691, "num_input_tokens_seen": 121710160, "step": 56450 }, { "epoch": 10.360616626904019, "grad_norm": 0.562627375125885, "learning_rate": 5.556969057747871e-06, "loss": 0.1193, "num_input_tokens_seen": 121720208, "step": 56455 }, { "epoch": 10.361534226463572, "grad_norm": 0.6583737730979919, "learning_rate": 5.556173277567485e-06, "loss": 0.1384, "num_input_tokens_seen": 121729104, "step": 56460 }, { "epoch": 10.362451826023124, "grad_norm": 8.54208755493164, "learning_rate": 5.555377483122117e-06, "loss": 0.0895, "num_input_tokens_seen": 121740240, "step": 56465 }, { "epoch": 10.363369425582675, "grad_norm": 99.38226318359375, "learning_rate": 5.554581674432177e-06, "loss": 0.3168, "num_input_tokens_seen": 121750480, "step": 56470 }, { "epoch": 10.364287025142229, "grad_norm": 11.178359031677246, "learning_rate": 5.553785851518076e-06, "loss": 0.264, "num_input_tokens_seen": 121760688, "step": 56475 }, { "epoch": 10.36520462470178, "grad_norm": 0.016666065901517868, "learning_rate": 5.552990014400228e-06, "loss": 0.2611, "num_input_tokens_seen": 121771600, "step": 56480 }, { "epoch": 10.366122224261332, "grad_norm": 80.94310760498047, "learning_rate": 5.552194163099042e-06, "loss": 0.0657, "num_input_tokens_seen": 121782480, "step": 56485 }, { "epoch": 10.367039823820885, "grad_norm": 150.7017822265625, "learning_rate": 5.551398297634931e-06, "loss": 0.3464, "num_input_tokens_seen": 121793584, "step": 56490 }, { "epoch": 10.367957423380437, "grad_norm": 19.2080135345459, "learning_rate": 5.55060241802831e-06, "loss": 0.2549, "num_input_tokens_seen": 121805488, "step": 56495 }, { "epoch": 10.368875022939989, "grad_norm": 15.347696304321289, "learning_rate": 5.549806524299589e-06, "loss": 0.2077, "num_input_tokens_seen": 121816752, "step": 56500 }, { "epoch": 10.369792622499542, "grad_norm": 1.4423952102661133, "learning_rate": 5.5490106164691835e-06, "loss": 0.2181, "num_input_tokens_seen": 121827792, "step": 56505 }, { "epoch": 10.370710222059094, "grad_norm": 82.73638916015625, "learning_rate": 5.548214694557506e-06, "loss": 0.184, "num_input_tokens_seen": 121836336, "step": 56510 }, { "epoch": 10.371627821618645, "grad_norm": 10.53766918182373, "learning_rate": 5.547418758584973e-06, "loss": 0.1687, "num_input_tokens_seen": 121847664, "step": 56515 }, { "epoch": 10.372545421178199, "grad_norm": 3.2780559062957764, "learning_rate": 5.546622808571994e-06, "loss": 0.2319, "num_input_tokens_seen": 121859504, "step": 56520 }, { "epoch": 10.37346302073775, "grad_norm": 12.57208251953125, "learning_rate": 5.545826844538988e-06, "loss": 0.1197, "num_input_tokens_seen": 121870352, "step": 56525 }, { "epoch": 10.374380620297302, "grad_norm": 0.1229618489742279, "learning_rate": 5.545030866506373e-06, "loss": 0.1151, "num_input_tokens_seen": 121881040, "step": 56530 }, { "epoch": 10.375298219856855, "grad_norm": 77.31729888916016, "learning_rate": 5.544234874494557e-06, "loss": 0.1583, "num_input_tokens_seen": 121891632, "step": 56535 }, { "epoch": 10.376215819416407, "grad_norm": 0.19102738797664642, "learning_rate": 5.543438868523961e-06, "loss": 0.1364, "num_input_tokens_seen": 121902736, "step": 56540 }, { "epoch": 10.377133418975959, "grad_norm": 2.156967878341675, "learning_rate": 5.542642848615001e-06, "loss": 0.0516, "num_input_tokens_seen": 121913264, "step": 56545 }, { "epoch": 10.378051018535512, "grad_norm": 30.793428421020508, "learning_rate": 5.541846814788094e-06, "loss": 0.3608, "num_input_tokens_seen": 121924048, "step": 56550 }, { "epoch": 10.378968618095064, "grad_norm": 9.149581909179688, "learning_rate": 5.541050767063653e-06, "loss": 0.2139, "num_input_tokens_seen": 121935280, "step": 56555 }, { "epoch": 10.379886217654615, "grad_norm": 40.96630096435547, "learning_rate": 5.5402547054621e-06, "loss": 0.267, "num_input_tokens_seen": 121945168, "step": 56560 }, { "epoch": 10.380803817214169, "grad_norm": 12.775116920471191, "learning_rate": 5.5394586300038524e-06, "loss": 0.1942, "num_input_tokens_seen": 121955984, "step": 56565 }, { "epoch": 10.38172141677372, "grad_norm": 137.84349060058594, "learning_rate": 5.538662540709324e-06, "loss": 0.418, "num_input_tokens_seen": 121966768, "step": 56570 }, { "epoch": 10.382639016333272, "grad_norm": 0.35406339168548584, "learning_rate": 5.537866437598938e-06, "loss": 0.005, "num_input_tokens_seen": 121977136, "step": 56575 }, { "epoch": 10.383556615892825, "grad_norm": 139.17431640625, "learning_rate": 5.537070320693112e-06, "loss": 0.4571, "num_input_tokens_seen": 121988944, "step": 56580 }, { "epoch": 10.384474215452377, "grad_norm": 0.15871796011924744, "learning_rate": 5.536274190012264e-06, "loss": 0.3556, "num_input_tokens_seen": 122000208, "step": 56585 }, { "epoch": 10.385391815011928, "grad_norm": 0.14317180216312408, "learning_rate": 5.535478045576814e-06, "loss": 0.0189, "num_input_tokens_seen": 122011696, "step": 56590 }, { "epoch": 10.386309414571482, "grad_norm": 0.14672255516052246, "learning_rate": 5.534681887407183e-06, "loss": 0.1751, "num_input_tokens_seen": 122021872, "step": 56595 }, { "epoch": 10.387227014131033, "grad_norm": 106.12101745605469, "learning_rate": 5.533885715523788e-06, "loss": 0.3604, "num_input_tokens_seen": 122033456, "step": 56600 }, { "epoch": 10.388144613690585, "grad_norm": 0.20382563769817352, "learning_rate": 5.533089529947054e-06, "loss": 0.238, "num_input_tokens_seen": 122044752, "step": 56605 }, { "epoch": 10.389062213250138, "grad_norm": 3.8019325733184814, "learning_rate": 5.532293330697399e-06, "loss": 0.1224, "num_input_tokens_seen": 122055376, "step": 56610 }, { "epoch": 10.38997981280969, "grad_norm": 0.1317048966884613, "learning_rate": 5.531497117795246e-06, "loss": 0.4458, "num_input_tokens_seen": 122065744, "step": 56615 }, { "epoch": 10.390897412369242, "grad_norm": 1.9959816932678223, "learning_rate": 5.530700891261014e-06, "loss": 0.0055, "num_input_tokens_seen": 122076560, "step": 56620 }, { "epoch": 10.391815011928795, "grad_norm": 19.44171142578125, "learning_rate": 5.529904651115128e-06, "loss": 0.4059, "num_input_tokens_seen": 122087088, "step": 56625 }, { "epoch": 10.392732611488347, "grad_norm": 25.40212059020996, "learning_rate": 5.529108397378008e-06, "loss": 0.0989, "num_input_tokens_seen": 122098512, "step": 56630 }, { "epoch": 10.393650211047898, "grad_norm": 0.8365833759307861, "learning_rate": 5.528312130070078e-06, "loss": 0.1545, "num_input_tokens_seen": 122109136, "step": 56635 }, { "epoch": 10.394567810607452, "grad_norm": 1.0174238681793213, "learning_rate": 5.527515849211762e-06, "loss": 0.0052, "num_input_tokens_seen": 122119472, "step": 56640 }, { "epoch": 10.395485410167003, "grad_norm": 0.306313693523407, "learning_rate": 5.52671955482348e-06, "loss": 0.0058, "num_input_tokens_seen": 122128528, "step": 56645 }, { "epoch": 10.396403009726555, "grad_norm": 41.600860595703125, "learning_rate": 5.525923246925659e-06, "loss": 0.2582, "num_input_tokens_seen": 122138448, "step": 56650 }, { "epoch": 10.397320609286108, "grad_norm": 0.22406992316246033, "learning_rate": 5.5251269255387206e-06, "loss": 0.418, "num_input_tokens_seen": 122149584, "step": 56655 }, { "epoch": 10.39823820884566, "grad_norm": 0.2447538524866104, "learning_rate": 5.524330590683092e-06, "loss": 0.3715, "num_input_tokens_seen": 122158256, "step": 56660 }, { "epoch": 10.399155808405212, "grad_norm": 1.5641471147537231, "learning_rate": 5.523534242379196e-06, "loss": 0.1081, "num_input_tokens_seen": 122168208, "step": 56665 }, { "epoch": 10.400073407964765, "grad_norm": 0.06650350987911224, "learning_rate": 5.5227378806474574e-06, "loss": 0.2501, "num_input_tokens_seen": 122178736, "step": 56670 }, { "epoch": 10.400991007524317, "grad_norm": 9.046112060546875, "learning_rate": 5.521941505508303e-06, "loss": 0.2946, "num_input_tokens_seen": 122189872, "step": 56675 }, { "epoch": 10.401908607083868, "grad_norm": 7.952470302581787, "learning_rate": 5.5211451169821586e-06, "loss": 0.3014, "num_input_tokens_seen": 122200560, "step": 56680 }, { "epoch": 10.402826206643422, "grad_norm": 47.9034538269043, "learning_rate": 5.520348715089448e-06, "loss": 0.4836, "num_input_tokens_seen": 122211824, "step": 56685 }, { "epoch": 10.403743806202973, "grad_norm": 0.16067391633987427, "learning_rate": 5.5195522998506e-06, "loss": 0.008, "num_input_tokens_seen": 122223216, "step": 56690 }, { "epoch": 10.404661405762525, "grad_norm": 0.4235765337944031, "learning_rate": 5.51875587128604e-06, "loss": 0.1352, "num_input_tokens_seen": 122234640, "step": 56695 }, { "epoch": 10.405579005322078, "grad_norm": 47.148738861083984, "learning_rate": 5.517959429416198e-06, "loss": 0.0529, "num_input_tokens_seen": 122246160, "step": 56700 }, { "epoch": 10.40649660488163, "grad_norm": 12.163764953613281, "learning_rate": 5.517162974261498e-06, "loss": 0.2833, "num_input_tokens_seen": 122259280, "step": 56705 }, { "epoch": 10.407414204441181, "grad_norm": 24.13054656982422, "learning_rate": 5.516366505842368e-06, "loss": 0.3307, "num_input_tokens_seen": 122271120, "step": 56710 }, { "epoch": 10.408331804000735, "grad_norm": 0.060448940843343735, "learning_rate": 5.51557002417924e-06, "loss": 0.3774, "num_input_tokens_seen": 122282896, "step": 56715 }, { "epoch": 10.409249403560286, "grad_norm": 31.737581253051758, "learning_rate": 5.514773529292537e-06, "loss": 0.0675, "num_input_tokens_seen": 122293136, "step": 56720 }, { "epoch": 10.410167003119838, "grad_norm": 0.1290387660264969, "learning_rate": 5.513977021202693e-06, "loss": 0.27, "num_input_tokens_seen": 122304656, "step": 56725 }, { "epoch": 10.411084602679392, "grad_norm": 0.03866322711110115, "learning_rate": 5.513180499930134e-06, "loss": 0.1942, "num_input_tokens_seen": 122315280, "step": 56730 }, { "epoch": 10.412002202238943, "grad_norm": 25.364913940429688, "learning_rate": 5.512383965495292e-06, "loss": 0.2127, "num_input_tokens_seen": 122325680, "step": 56735 }, { "epoch": 10.412919801798495, "grad_norm": 91.40364837646484, "learning_rate": 5.511587417918593e-06, "loss": 0.4075, "num_input_tokens_seen": 122336496, "step": 56740 }, { "epoch": 10.413837401358048, "grad_norm": 10.616617202758789, "learning_rate": 5.510790857220472e-06, "loss": 0.2354, "num_input_tokens_seen": 122347536, "step": 56745 }, { "epoch": 10.4147550009176, "grad_norm": 137.79547119140625, "learning_rate": 5.509994283421356e-06, "loss": 0.0876, "num_input_tokens_seen": 122358832, "step": 56750 }, { "epoch": 10.415672600477151, "grad_norm": 0.20582816004753113, "learning_rate": 5.509197696541677e-06, "loss": 0.0673, "num_input_tokens_seen": 122369488, "step": 56755 }, { "epoch": 10.416590200036705, "grad_norm": 0.20898881554603577, "learning_rate": 5.508401096601867e-06, "loss": 0.1175, "num_input_tokens_seen": 122379344, "step": 56760 }, { "epoch": 10.417507799596256, "grad_norm": 0.3829173743724823, "learning_rate": 5.5076044836223565e-06, "loss": 0.1006, "num_input_tokens_seen": 122389488, "step": 56765 }, { "epoch": 10.418425399155808, "grad_norm": 6.435290336608887, "learning_rate": 5.5068078576235776e-06, "loss": 0.0217, "num_input_tokens_seen": 122399824, "step": 56770 }, { "epoch": 10.419342998715361, "grad_norm": 0.3604898452758789, "learning_rate": 5.506011218625962e-06, "loss": 0.3048, "num_input_tokens_seen": 122411312, "step": 56775 }, { "epoch": 10.420260598274913, "grad_norm": 0.1982629895210266, "learning_rate": 5.505214566649944e-06, "loss": 0.3843, "num_input_tokens_seen": 122422448, "step": 56780 }, { "epoch": 10.421178197834465, "grad_norm": 0.12530003488063812, "learning_rate": 5.504417901715955e-06, "loss": 0.0863, "num_input_tokens_seen": 122433392, "step": 56785 }, { "epoch": 10.422095797394018, "grad_norm": 0.0626271590590477, "learning_rate": 5.503621223844429e-06, "loss": 0.0814, "num_input_tokens_seen": 122443856, "step": 56790 }, { "epoch": 10.42301339695357, "grad_norm": 0.02502431534230709, "learning_rate": 5.5028245330557975e-06, "loss": 0.0753, "num_input_tokens_seen": 122454352, "step": 56795 }, { "epoch": 10.423930996513121, "grad_norm": 0.07784724980592728, "learning_rate": 5.502027829370497e-06, "loss": 0.1789, "num_input_tokens_seen": 122464432, "step": 56800 }, { "epoch": 10.424848596072675, "grad_norm": 0.1355687826871872, "learning_rate": 5.5012311128089615e-06, "loss": 0.111, "num_input_tokens_seen": 122474320, "step": 56805 }, { "epoch": 10.425766195632226, "grad_norm": 8.319219589233398, "learning_rate": 5.500434383391624e-06, "loss": 0.2977, "num_input_tokens_seen": 122484496, "step": 56810 }, { "epoch": 10.426683795191778, "grad_norm": 0.18022185564041138, "learning_rate": 5.4996376411389205e-06, "loss": 0.0019, "num_input_tokens_seen": 122495472, "step": 56815 }, { "epoch": 10.427601394751331, "grad_norm": 4.620625972747803, "learning_rate": 5.498840886071285e-06, "loss": 0.0824, "num_input_tokens_seen": 122507120, "step": 56820 }, { "epoch": 10.428518994310883, "grad_norm": 2.139619827270508, "learning_rate": 5.498044118209155e-06, "loss": 0.1052, "num_input_tokens_seen": 122518768, "step": 56825 }, { "epoch": 10.429436593870435, "grad_norm": 35.246437072753906, "learning_rate": 5.497247337572964e-06, "loss": 0.0176, "num_input_tokens_seen": 122529392, "step": 56830 }, { "epoch": 10.430354193429988, "grad_norm": 83.77840423583984, "learning_rate": 5.496450544183151e-06, "loss": 0.1915, "num_input_tokens_seen": 122538704, "step": 56835 }, { "epoch": 10.43127179298954, "grad_norm": 12.343972206115723, "learning_rate": 5.495653738060151e-06, "loss": 0.128, "num_input_tokens_seen": 122549232, "step": 56840 }, { "epoch": 10.432189392549091, "grad_norm": 12.480758666992188, "learning_rate": 5.494856919224398e-06, "loss": 0.3792, "num_input_tokens_seen": 122559440, "step": 56845 }, { "epoch": 10.433106992108645, "grad_norm": 0.19861957430839539, "learning_rate": 5.494060087696336e-06, "loss": 0.2459, "num_input_tokens_seen": 122569456, "step": 56850 }, { "epoch": 10.434024591668196, "grad_norm": 5.845062732696533, "learning_rate": 5.493263243496396e-06, "loss": 0.0466, "num_input_tokens_seen": 122580848, "step": 56855 }, { "epoch": 10.434942191227748, "grad_norm": 12.433643341064453, "learning_rate": 5.492466386645019e-06, "loss": 0.2154, "num_input_tokens_seen": 122592176, "step": 56860 }, { "epoch": 10.435859790787301, "grad_norm": 22.172447204589844, "learning_rate": 5.491669517162642e-06, "loss": 0.1842, "num_input_tokens_seen": 122602576, "step": 56865 }, { "epoch": 10.436777390346853, "grad_norm": 0.5058908462524414, "learning_rate": 5.490872635069705e-06, "loss": 0.24, "num_input_tokens_seen": 122612368, "step": 56870 }, { "epoch": 10.437694989906404, "grad_norm": 0.519682765007019, "learning_rate": 5.490075740386644e-06, "loss": 0.0259, "num_input_tokens_seen": 122622320, "step": 56875 }, { "epoch": 10.438612589465958, "grad_norm": 84.79620361328125, "learning_rate": 5.4892788331339005e-06, "loss": 0.2105, "num_input_tokens_seen": 122633968, "step": 56880 }, { "epoch": 10.43953018902551, "grad_norm": 42.44394302368164, "learning_rate": 5.488481913331914e-06, "loss": 0.1786, "num_input_tokens_seen": 122645104, "step": 56885 }, { "epoch": 10.440447788585061, "grad_norm": 0.30099213123321533, "learning_rate": 5.487684981001124e-06, "loss": 0.2479, "num_input_tokens_seen": 122656528, "step": 56890 }, { "epoch": 10.441365388144614, "grad_norm": 4.729825973510742, "learning_rate": 5.486888036161968e-06, "loss": 0.0093, "num_input_tokens_seen": 122667440, "step": 56895 }, { "epoch": 10.442282987704166, "grad_norm": 7.608314037322998, "learning_rate": 5.48609107883489e-06, "loss": 0.0829, "num_input_tokens_seen": 122678640, "step": 56900 }, { "epoch": 10.443200587263718, "grad_norm": 87.95653533935547, "learning_rate": 5.485294109040328e-06, "loss": 0.3741, "num_input_tokens_seen": 122690096, "step": 56905 }, { "epoch": 10.444118186823271, "grad_norm": 26.935041427612305, "learning_rate": 5.4844971267987255e-06, "loss": 0.2937, "num_input_tokens_seen": 122700528, "step": 56910 }, { "epoch": 10.445035786382823, "grad_norm": 1.2178069353103638, "learning_rate": 5.483700132130522e-06, "loss": 0.0272, "num_input_tokens_seen": 122710608, "step": 56915 }, { "epoch": 10.445953385942374, "grad_norm": 0.1893003284931183, "learning_rate": 5.482903125056159e-06, "loss": 0.1032, "num_input_tokens_seen": 122721328, "step": 56920 }, { "epoch": 10.446870985501928, "grad_norm": 7.463572025299072, "learning_rate": 5.482106105596081e-06, "loss": 0.7063, "num_input_tokens_seen": 122731312, "step": 56925 }, { "epoch": 10.44778858506148, "grad_norm": 100.54508972167969, "learning_rate": 5.481309073770728e-06, "loss": 0.2407, "num_input_tokens_seen": 122742736, "step": 56930 }, { "epoch": 10.448706184621031, "grad_norm": 0.21945466101169586, "learning_rate": 5.480512029600542e-06, "loss": 0.2961, "num_input_tokens_seen": 122753520, "step": 56935 }, { "epoch": 10.449623784180584, "grad_norm": 53.6382942199707, "learning_rate": 5.479714973105968e-06, "loss": 0.3374, "num_input_tokens_seen": 122765008, "step": 56940 }, { "epoch": 10.450541383740136, "grad_norm": 94.5385971069336, "learning_rate": 5.478917904307448e-06, "loss": 0.251, "num_input_tokens_seen": 122776464, "step": 56945 }, { "epoch": 10.451458983299688, "grad_norm": 27.553146362304688, "learning_rate": 5.478120823225427e-06, "loss": 0.4062, "num_input_tokens_seen": 122787280, "step": 56950 }, { "epoch": 10.452376582859241, "grad_norm": 21.729963302612305, "learning_rate": 5.477323729880347e-06, "loss": 0.0126, "num_input_tokens_seen": 122798192, "step": 56955 }, { "epoch": 10.453294182418793, "grad_norm": 0.9562312960624695, "learning_rate": 5.476526624292654e-06, "loss": 0.317, "num_input_tokens_seen": 122810128, "step": 56960 }, { "epoch": 10.454211781978344, "grad_norm": 56.78528594970703, "learning_rate": 5.475729506482791e-06, "loss": 0.0542, "num_input_tokens_seen": 122820720, "step": 56965 }, { "epoch": 10.455129381537898, "grad_norm": 4.003304958343506, "learning_rate": 5.474932376471204e-06, "loss": 0.1593, "num_input_tokens_seen": 122832304, "step": 56970 }, { "epoch": 10.45604698109745, "grad_norm": 33.32883071899414, "learning_rate": 5.474135234278337e-06, "loss": 0.1981, "num_input_tokens_seen": 122841584, "step": 56975 }, { "epoch": 10.456964580657, "grad_norm": 0.3168873190879822, "learning_rate": 5.473338079924637e-06, "loss": 0.0181, "num_input_tokens_seen": 122853040, "step": 56980 }, { "epoch": 10.457882180216554, "grad_norm": 26.0012149810791, "learning_rate": 5.472540913430547e-06, "loss": 0.0629, "num_input_tokens_seen": 122864304, "step": 56985 }, { "epoch": 10.458799779776106, "grad_norm": 0.45088472962379456, "learning_rate": 5.471743734816517e-06, "loss": 0.1434, "num_input_tokens_seen": 122877072, "step": 56990 }, { "epoch": 10.459717379335657, "grad_norm": 1.7537506818771362, "learning_rate": 5.470946544102992e-06, "loss": 0.1092, "num_input_tokens_seen": 122888656, "step": 56995 }, { "epoch": 10.46063497889521, "grad_norm": 9.243552207946777, "learning_rate": 5.470149341310415e-06, "loss": 0.1309, "num_input_tokens_seen": 122900144, "step": 57000 }, { "epoch": 10.461552578454762, "grad_norm": 63.4184455871582, "learning_rate": 5.469352126459237e-06, "loss": 0.2978, "num_input_tokens_seen": 122912560, "step": 57005 }, { "epoch": 10.462470178014314, "grad_norm": 55.3713493347168, "learning_rate": 5.468554899569905e-06, "loss": 0.4395, "num_input_tokens_seen": 122923088, "step": 57010 }, { "epoch": 10.463387777573868, "grad_norm": 27.50432586669922, "learning_rate": 5.4677576606628665e-06, "loss": 0.3107, "num_input_tokens_seen": 122934256, "step": 57015 }, { "epoch": 10.46430537713342, "grad_norm": 0.5284703969955444, "learning_rate": 5.466960409758569e-06, "loss": 0.1838, "num_input_tokens_seen": 122944848, "step": 57020 }, { "epoch": 10.46522297669297, "grad_norm": 0.10953918844461441, "learning_rate": 5.46616314687746e-06, "loss": 0.2142, "num_input_tokens_seen": 122955152, "step": 57025 }, { "epoch": 10.466140576252524, "grad_norm": 0.25408869981765747, "learning_rate": 5.4653658720399885e-06, "loss": 0.1643, "num_input_tokens_seen": 122964752, "step": 57030 }, { "epoch": 10.467058175812076, "grad_norm": 0.2231239527463913, "learning_rate": 5.4645685852666045e-06, "loss": 0.3105, "num_input_tokens_seen": 122976912, "step": 57035 }, { "epoch": 10.467975775371627, "grad_norm": 0.12362876534461975, "learning_rate": 5.463771286577755e-06, "loss": 0.1123, "num_input_tokens_seen": 122986320, "step": 57040 }, { "epoch": 10.46889337493118, "grad_norm": 4.1027703285217285, "learning_rate": 5.4629739759938926e-06, "loss": 0.0044, "num_input_tokens_seen": 122996944, "step": 57045 }, { "epoch": 10.469810974490732, "grad_norm": 26.14212989807129, "learning_rate": 5.462176653535464e-06, "loss": 0.2671, "num_input_tokens_seen": 123007376, "step": 57050 }, { "epoch": 10.470728574050284, "grad_norm": 0.4076352119445801, "learning_rate": 5.46137931922292e-06, "loss": 0.2269, "num_input_tokens_seen": 123016880, "step": 57055 }, { "epoch": 10.471646173609837, "grad_norm": 0.5217763781547546, "learning_rate": 5.460581973076713e-06, "loss": 0.2395, "num_input_tokens_seen": 123028848, "step": 57060 }, { "epoch": 10.472563773169389, "grad_norm": 66.38932800292969, "learning_rate": 5.459784615117292e-06, "loss": 0.2777, "num_input_tokens_seen": 123038896, "step": 57065 }, { "epoch": 10.47348137272894, "grad_norm": 11.102012634277344, "learning_rate": 5.458987245365108e-06, "loss": 0.4336, "num_input_tokens_seen": 123050064, "step": 57070 }, { "epoch": 10.474398972288494, "grad_norm": 10.253864288330078, "learning_rate": 5.4581898638406115e-06, "loss": 0.2762, "num_input_tokens_seen": 123059952, "step": 57075 }, { "epoch": 10.475316571848046, "grad_norm": 0.1391567438840866, "learning_rate": 5.4573924705642565e-06, "loss": 0.1635, "num_input_tokens_seen": 123069840, "step": 57080 }, { "epoch": 10.476234171407597, "grad_norm": 0.21156081557273865, "learning_rate": 5.456595065556493e-06, "loss": 0.1633, "num_input_tokens_seen": 123080848, "step": 57085 }, { "epoch": 10.47715177096715, "grad_norm": 46.794620513916016, "learning_rate": 5.455797648837774e-06, "loss": 0.2609, "num_input_tokens_seen": 123092784, "step": 57090 }, { "epoch": 10.478069370526702, "grad_norm": 166.01565551757812, "learning_rate": 5.455000220428551e-06, "loss": 0.1607, "num_input_tokens_seen": 123103440, "step": 57095 }, { "epoch": 10.478986970086254, "grad_norm": 0.1826128214597702, "learning_rate": 5.454202780349279e-06, "loss": 0.1103, "num_input_tokens_seen": 123114320, "step": 57100 }, { "epoch": 10.479904569645807, "grad_norm": 144.26031494140625, "learning_rate": 5.4534053286204084e-06, "loss": 0.441, "num_input_tokens_seen": 123124016, "step": 57105 }, { "epoch": 10.480822169205359, "grad_norm": 0.29515737295150757, "learning_rate": 5.452607865262394e-06, "loss": 0.2773, "num_input_tokens_seen": 123134864, "step": 57110 }, { "epoch": 10.48173976876491, "grad_norm": 3.059459924697876, "learning_rate": 5.451810390295689e-06, "loss": 0.326, "num_input_tokens_seen": 123146544, "step": 57115 }, { "epoch": 10.482657368324464, "grad_norm": 39.02914810180664, "learning_rate": 5.451012903740749e-06, "loss": 0.1341, "num_input_tokens_seen": 123157296, "step": 57120 }, { "epoch": 10.483574967884016, "grad_norm": 0.3998371958732605, "learning_rate": 5.4502154056180255e-06, "loss": 0.0047, "num_input_tokens_seen": 123169360, "step": 57125 }, { "epoch": 10.484492567443567, "grad_norm": 0.06018265336751938, "learning_rate": 5.449417895947976e-06, "loss": 0.1483, "num_input_tokens_seen": 123181104, "step": 57130 }, { "epoch": 10.48541016700312, "grad_norm": 29.2845458984375, "learning_rate": 5.4486203747510525e-06, "loss": 0.2653, "num_input_tokens_seen": 123191632, "step": 57135 }, { "epoch": 10.486327766562672, "grad_norm": 0.19769588112831116, "learning_rate": 5.447822842047712e-06, "loss": 0.1695, "num_input_tokens_seen": 123201040, "step": 57140 }, { "epoch": 10.487245366122224, "grad_norm": 157.19485473632812, "learning_rate": 5.447025297858411e-06, "loss": 0.1372, "num_input_tokens_seen": 123212144, "step": 57145 }, { "epoch": 10.488162965681777, "grad_norm": 2.099517583847046, "learning_rate": 5.446227742203603e-06, "loss": 0.1207, "num_input_tokens_seen": 123223856, "step": 57150 }, { "epoch": 10.489080565241329, "grad_norm": 0.7764292359352112, "learning_rate": 5.445430175103745e-06, "loss": 0.0816, "num_input_tokens_seen": 123235408, "step": 57155 }, { "epoch": 10.48999816480088, "grad_norm": 0.18993893265724182, "learning_rate": 5.444632596579293e-06, "loss": 0.373, "num_input_tokens_seen": 123246864, "step": 57160 }, { "epoch": 10.490915764360434, "grad_norm": 0.41534674167633057, "learning_rate": 5.443835006650704e-06, "loss": 0.1789, "num_input_tokens_seen": 123256688, "step": 57165 }, { "epoch": 10.491833363919985, "grad_norm": 25.43950843811035, "learning_rate": 5.443037405338436e-06, "loss": 0.134, "num_input_tokens_seen": 123268432, "step": 57170 }, { "epoch": 10.492750963479537, "grad_norm": 0.06691039353609085, "learning_rate": 5.442239792662944e-06, "loss": 0.1304, "num_input_tokens_seen": 123280368, "step": 57175 }, { "epoch": 10.49366856303909, "grad_norm": 0.22083912789821625, "learning_rate": 5.441442168644688e-06, "loss": 0.175, "num_input_tokens_seen": 123292656, "step": 57180 }, { "epoch": 10.494586162598642, "grad_norm": 2.7455313205718994, "learning_rate": 5.4406445333041235e-06, "loss": 0.1009, "num_input_tokens_seen": 123303536, "step": 57185 }, { "epoch": 10.495503762158194, "grad_norm": 0.08706307411193848, "learning_rate": 5.439846886661711e-06, "loss": 0.0055, "num_input_tokens_seen": 123314416, "step": 57190 }, { "epoch": 10.496421361717747, "grad_norm": 1.5186694860458374, "learning_rate": 5.439049228737906e-06, "loss": 0.0075, "num_input_tokens_seen": 123325072, "step": 57195 }, { "epoch": 10.497338961277299, "grad_norm": 51.11337661743164, "learning_rate": 5.4382515595531695e-06, "loss": 0.2506, "num_input_tokens_seen": 123336272, "step": 57200 }, { "epoch": 10.49825656083685, "grad_norm": 0.03405759483575821, "learning_rate": 5.43745387912796e-06, "loss": 0.2938, "num_input_tokens_seen": 123346640, "step": 57205 }, { "epoch": 10.499174160396404, "grad_norm": 0.2713707685470581, "learning_rate": 5.436656187482736e-06, "loss": 0.1577, "num_input_tokens_seen": 123357424, "step": 57210 }, { "epoch": 10.500091759955955, "grad_norm": 10.333247184753418, "learning_rate": 5.435858484637957e-06, "loss": 0.3296, "num_input_tokens_seen": 123367248, "step": 57215 }, { "epoch": 10.501009359515507, "grad_norm": 12.155434608459473, "learning_rate": 5.435060770614085e-06, "loss": 0.2975, "num_input_tokens_seen": 123377232, "step": 57220 }, { "epoch": 10.50192695907506, "grad_norm": 57.99118423461914, "learning_rate": 5.434263045431577e-06, "loss": 0.223, "num_input_tokens_seen": 123388336, "step": 57225 }, { "epoch": 10.502844558634612, "grad_norm": 0.44835636019706726, "learning_rate": 5.433465309110896e-06, "loss": 0.3588, "num_input_tokens_seen": 123398544, "step": 57230 }, { "epoch": 10.503762158194164, "grad_norm": 5.040431022644043, "learning_rate": 5.4326675616725e-06, "loss": 0.01, "num_input_tokens_seen": 123409232, "step": 57235 }, { "epoch": 10.504679757753717, "grad_norm": 80.09420776367188, "learning_rate": 5.431869803136852e-06, "loss": 0.3514, "num_input_tokens_seen": 123420944, "step": 57240 }, { "epoch": 10.505597357313269, "grad_norm": 32.33331298828125, "learning_rate": 5.431072033524415e-06, "loss": 0.2195, "num_input_tokens_seen": 123431792, "step": 57245 }, { "epoch": 10.50651495687282, "grad_norm": 121.03985595703125, "learning_rate": 5.430274252855646e-06, "loss": 0.4881, "num_input_tokens_seen": 123442608, "step": 57250 }, { "epoch": 10.507432556432374, "grad_norm": 0.23285149037837982, "learning_rate": 5.429476461151011e-06, "loss": 0.3991, "num_input_tokens_seen": 123453712, "step": 57255 }, { "epoch": 10.508350155991925, "grad_norm": 19.67788314819336, "learning_rate": 5.428678658430969e-06, "loss": 0.2498, "num_input_tokens_seen": 123464848, "step": 57260 }, { "epoch": 10.509267755551477, "grad_norm": 0.7847046852111816, "learning_rate": 5.427880844715986e-06, "loss": 0.1558, "num_input_tokens_seen": 123476304, "step": 57265 }, { "epoch": 10.51018535511103, "grad_norm": 1.4788246154785156, "learning_rate": 5.4270830200265205e-06, "loss": 0.0174, "num_input_tokens_seen": 123485936, "step": 57270 }, { "epoch": 10.511102954670582, "grad_norm": 0.046712588518857956, "learning_rate": 5.4262851843830364e-06, "loss": 0.2738, "num_input_tokens_seen": 123496656, "step": 57275 }, { "epoch": 10.512020554230133, "grad_norm": 14.373722076416016, "learning_rate": 5.425487337806001e-06, "loss": 0.2028, "num_input_tokens_seen": 123506896, "step": 57280 }, { "epoch": 10.512938153789687, "grad_norm": 6.346559047698975, "learning_rate": 5.424689480315872e-06, "loss": 0.2357, "num_input_tokens_seen": 123517392, "step": 57285 }, { "epoch": 10.513855753349238, "grad_norm": 41.280269622802734, "learning_rate": 5.423891611933119e-06, "loss": 0.2092, "num_input_tokens_seen": 123528912, "step": 57290 }, { "epoch": 10.51477335290879, "grad_norm": 2.698603868484497, "learning_rate": 5.423093732678201e-06, "loss": 0.0416, "num_input_tokens_seen": 123538864, "step": 57295 }, { "epoch": 10.515690952468344, "grad_norm": 10.685619354248047, "learning_rate": 5.422295842571585e-06, "loss": 0.3025, "num_input_tokens_seen": 123549392, "step": 57300 }, { "epoch": 10.516608552027895, "grad_norm": 30.29268455505371, "learning_rate": 5.421497941633735e-06, "loss": 0.2292, "num_input_tokens_seen": 123560080, "step": 57305 }, { "epoch": 10.517526151587447, "grad_norm": 28.191457748413086, "learning_rate": 5.420700029885118e-06, "loss": 0.2601, "num_input_tokens_seen": 123571248, "step": 57310 }, { "epoch": 10.518443751147, "grad_norm": 34.83803939819336, "learning_rate": 5.419902107346195e-06, "loss": 0.2268, "num_input_tokens_seen": 123582352, "step": 57315 }, { "epoch": 10.519361350706552, "grad_norm": 15.694845199584961, "learning_rate": 5.419104174037434e-06, "loss": 0.1032, "num_input_tokens_seen": 123593520, "step": 57320 }, { "epoch": 10.520278950266103, "grad_norm": 0.3939194083213806, "learning_rate": 5.4183062299793e-06, "loss": 0.2177, "num_input_tokens_seen": 123604432, "step": 57325 }, { "epoch": 10.521196549825657, "grad_norm": 21.29606056213379, "learning_rate": 5.417508275192261e-06, "loss": 0.1874, "num_input_tokens_seen": 123615632, "step": 57330 }, { "epoch": 10.522114149385208, "grad_norm": 24.747684478759766, "learning_rate": 5.416710309696781e-06, "loss": 0.1057, "num_input_tokens_seen": 123626992, "step": 57335 }, { "epoch": 10.52303174894476, "grad_norm": 31.645673751831055, "learning_rate": 5.415912333513328e-06, "loss": 0.3657, "num_input_tokens_seen": 123636208, "step": 57340 }, { "epoch": 10.523949348504313, "grad_norm": 14.601751327514648, "learning_rate": 5.415114346662366e-06, "loss": 0.4313, "num_input_tokens_seen": 123646064, "step": 57345 }, { "epoch": 10.524866948063865, "grad_norm": 1.3945353031158447, "learning_rate": 5.414316349164367e-06, "loss": 0.1633, "num_input_tokens_seen": 123656496, "step": 57350 }, { "epoch": 10.525784547623417, "grad_norm": 0.16031286120414734, "learning_rate": 5.413518341039795e-06, "loss": 0.3084, "num_input_tokens_seen": 123667024, "step": 57355 }, { "epoch": 10.52670214718297, "grad_norm": 135.54193115234375, "learning_rate": 5.4127203223091176e-06, "loss": 0.2337, "num_input_tokens_seen": 123678224, "step": 57360 }, { "epoch": 10.527619746742522, "grad_norm": 2.83882737159729, "learning_rate": 5.411922292992805e-06, "loss": 0.0152, "num_input_tokens_seen": 123688912, "step": 57365 }, { "epoch": 10.528537346302073, "grad_norm": 0.3406602740287781, "learning_rate": 5.4111242531113225e-06, "loss": 0.0742, "num_input_tokens_seen": 123700240, "step": 57370 }, { "epoch": 10.529454945861627, "grad_norm": 0.5755403637886047, "learning_rate": 5.410326202685141e-06, "loss": 0.0331, "num_input_tokens_seen": 123710128, "step": 57375 }, { "epoch": 10.530372545421178, "grad_norm": 206.91494750976562, "learning_rate": 5.409528141734729e-06, "loss": 0.2648, "num_input_tokens_seen": 123721648, "step": 57380 }, { "epoch": 10.53129014498073, "grad_norm": 0.21266894042491913, "learning_rate": 5.408730070280553e-06, "loss": 0.1041, "num_input_tokens_seen": 123733872, "step": 57385 }, { "epoch": 10.532207744540283, "grad_norm": 0.8757924437522888, "learning_rate": 5.4079319883430855e-06, "loss": 0.0844, "num_input_tokens_seen": 123745712, "step": 57390 }, { "epoch": 10.533125344099835, "grad_norm": 33.78752517700195, "learning_rate": 5.407133895942794e-06, "loss": 0.4249, "num_input_tokens_seen": 123756368, "step": 57395 }, { "epoch": 10.534042943659387, "grad_norm": 19.365888595581055, "learning_rate": 5.406335793100151e-06, "loss": 0.3171, "num_input_tokens_seen": 123767408, "step": 57400 }, { "epoch": 10.53496054321894, "grad_norm": 0.008206403814256191, "learning_rate": 5.405537679835623e-06, "loss": 0.0126, "num_input_tokens_seen": 123777808, "step": 57405 }, { "epoch": 10.535878142778492, "grad_norm": 0.2367582768201828, "learning_rate": 5.404739556169683e-06, "loss": 0.2412, "num_input_tokens_seen": 123787984, "step": 57410 }, { "epoch": 10.536795742338043, "grad_norm": 0.13188400864601135, "learning_rate": 5.403941422122799e-06, "loss": 0.1374, "num_input_tokens_seen": 123799440, "step": 57415 }, { "epoch": 10.537713341897597, "grad_norm": 1.022085189819336, "learning_rate": 5.403143277715446e-06, "loss": 0.189, "num_input_tokens_seen": 123809616, "step": 57420 }, { "epoch": 10.538630941457148, "grad_norm": 45.8175163269043, "learning_rate": 5.402345122968091e-06, "loss": 0.2392, "num_input_tokens_seen": 123820976, "step": 57425 }, { "epoch": 10.5395485410167, "grad_norm": 196.91954040527344, "learning_rate": 5.401546957901207e-06, "loss": 0.3397, "num_input_tokens_seen": 123830032, "step": 57430 }, { "epoch": 10.540466140576253, "grad_norm": 0.07899367809295654, "learning_rate": 5.4007487825352655e-06, "loss": 0.136, "num_input_tokens_seen": 123839568, "step": 57435 }, { "epoch": 10.541383740135805, "grad_norm": 213.28115844726562, "learning_rate": 5.399950596890741e-06, "loss": 0.1466, "num_input_tokens_seen": 123850864, "step": 57440 }, { "epoch": 10.542301339695356, "grad_norm": 20.791786193847656, "learning_rate": 5.399152400988101e-06, "loss": 0.287, "num_input_tokens_seen": 123862576, "step": 57445 }, { "epoch": 10.54321893925491, "grad_norm": 7.547214031219482, "learning_rate": 5.3983541948478236e-06, "loss": 0.1866, "num_input_tokens_seen": 123871184, "step": 57450 }, { "epoch": 10.544136538814461, "grad_norm": 47.28047561645508, "learning_rate": 5.397555978490375e-06, "loss": 0.2967, "num_input_tokens_seen": 123882800, "step": 57455 }, { "epoch": 10.545054138374013, "grad_norm": 20.12661361694336, "learning_rate": 5.3967577519362335e-06, "loss": 0.185, "num_input_tokens_seen": 123893424, "step": 57460 }, { "epoch": 10.545971737933566, "grad_norm": 16.770671844482422, "learning_rate": 5.395959515205871e-06, "loss": 0.264, "num_input_tokens_seen": 123903664, "step": 57465 }, { "epoch": 10.546889337493118, "grad_norm": 0.7973344326019287, "learning_rate": 5.395161268319759e-06, "loss": 0.2863, "num_input_tokens_seen": 123914704, "step": 57470 }, { "epoch": 10.54780693705267, "grad_norm": 0.177337184548378, "learning_rate": 5.394363011298376e-06, "loss": 0.2056, "num_input_tokens_seen": 123925424, "step": 57475 }, { "epoch": 10.548724536612223, "grad_norm": 9.962496757507324, "learning_rate": 5.393564744162189e-06, "loss": 0.2293, "num_input_tokens_seen": 123936464, "step": 57480 }, { "epoch": 10.549642136171775, "grad_norm": 0.46718451380729675, "learning_rate": 5.392766466931678e-06, "loss": 0.1973, "num_input_tokens_seen": 123946832, "step": 57485 }, { "epoch": 10.550559735731326, "grad_norm": 6.12708854675293, "learning_rate": 5.391968179627317e-06, "loss": 0.552, "num_input_tokens_seen": 123958224, "step": 57490 }, { "epoch": 10.55147733529088, "grad_norm": 15.929864883422852, "learning_rate": 5.391169882269579e-06, "loss": 0.2319, "num_input_tokens_seen": 123967760, "step": 57495 }, { "epoch": 10.552394934850431, "grad_norm": 8.421990394592285, "learning_rate": 5.3903715748789395e-06, "loss": 0.1768, "num_input_tokens_seen": 123978192, "step": 57500 }, { "epoch": 10.553312534409983, "grad_norm": 0.6073961853981018, "learning_rate": 5.389573257475874e-06, "loss": 0.1347, "num_input_tokens_seen": 123988944, "step": 57505 }, { "epoch": 10.554230133969536, "grad_norm": 25.072093963623047, "learning_rate": 5.388774930080858e-06, "loss": 0.0142, "num_input_tokens_seen": 124000400, "step": 57510 }, { "epoch": 10.555147733529088, "grad_norm": 1.0398845672607422, "learning_rate": 5.387976592714369e-06, "loss": 0.0082, "num_input_tokens_seen": 124011984, "step": 57515 }, { "epoch": 10.55606533308864, "grad_norm": 0.1868029087781906, "learning_rate": 5.387178245396881e-06, "loss": 0.3132, "num_input_tokens_seen": 124023280, "step": 57520 }, { "epoch": 10.556982932648193, "grad_norm": 25.758481979370117, "learning_rate": 5.386379888148871e-06, "loss": 0.0954, "num_input_tokens_seen": 124034352, "step": 57525 }, { "epoch": 10.557900532207745, "grad_norm": 2.587507486343384, "learning_rate": 5.3855815209908156e-06, "loss": 0.0268, "num_input_tokens_seen": 124044432, "step": 57530 }, { "epoch": 10.558818131767296, "grad_norm": 0.2257251739501953, "learning_rate": 5.384783143943191e-06, "loss": 0.1407, "num_input_tokens_seen": 124054000, "step": 57535 }, { "epoch": 10.55973573132685, "grad_norm": 62.96283721923828, "learning_rate": 5.383984757026476e-06, "loss": 0.1161, "num_input_tokens_seen": 124064624, "step": 57540 }, { "epoch": 10.560653330886401, "grad_norm": 30.528995513916016, "learning_rate": 5.383186360261147e-06, "loss": 0.1894, "num_input_tokens_seen": 124076368, "step": 57545 }, { "epoch": 10.561570930445953, "grad_norm": 0.443362832069397, "learning_rate": 5.3823879536676815e-06, "loss": 0.0055, "num_input_tokens_seen": 124087152, "step": 57550 }, { "epoch": 10.562488530005506, "grad_norm": 39.16862487792969, "learning_rate": 5.381589537266559e-06, "loss": 0.1912, "num_input_tokens_seen": 124098448, "step": 57555 }, { "epoch": 10.563406129565058, "grad_norm": 432.2403564453125, "learning_rate": 5.380791111078253e-06, "loss": 0.1649, "num_input_tokens_seen": 124108272, "step": 57560 }, { "epoch": 10.56432372912461, "grad_norm": 2.7675697803497314, "learning_rate": 5.3799926751232475e-06, "loss": 0.2426, "num_input_tokens_seen": 124121136, "step": 57565 }, { "epoch": 10.565241328684163, "grad_norm": 0.18461893498897552, "learning_rate": 5.379194229422019e-06, "loss": 0.1094, "num_input_tokens_seen": 124131952, "step": 57570 }, { "epoch": 10.566158928243714, "grad_norm": 9.804028511047363, "learning_rate": 5.378395773995047e-06, "loss": 0.1835, "num_input_tokens_seen": 124142992, "step": 57575 }, { "epoch": 10.567076527803266, "grad_norm": 48.37653350830078, "learning_rate": 5.377597308862807e-06, "loss": 0.2683, "num_input_tokens_seen": 124153584, "step": 57580 }, { "epoch": 10.56799412736282, "grad_norm": 13.18234920501709, "learning_rate": 5.3767988340457835e-06, "loss": 0.4175, "num_input_tokens_seen": 124165392, "step": 57585 }, { "epoch": 10.568911726922371, "grad_norm": 88.97007751464844, "learning_rate": 5.376000349564453e-06, "loss": 0.1942, "num_input_tokens_seen": 124175536, "step": 57590 }, { "epoch": 10.569829326481923, "grad_norm": 178.47088623046875, "learning_rate": 5.375201855439296e-06, "loss": 0.298, "num_input_tokens_seen": 124185936, "step": 57595 }, { "epoch": 10.570746926041476, "grad_norm": 5.404532432556152, "learning_rate": 5.374403351690795e-06, "loss": 0.1217, "num_input_tokens_seen": 124196208, "step": 57600 }, { "epoch": 10.571664525601028, "grad_norm": 6.371824741363525, "learning_rate": 5.373604838339426e-06, "loss": 0.1075, "num_input_tokens_seen": 124207408, "step": 57605 }, { "epoch": 10.57258212516058, "grad_norm": 11.124366760253906, "learning_rate": 5.3728063154056735e-06, "loss": 0.2552, "num_input_tokens_seen": 124218640, "step": 57610 }, { "epoch": 10.573499724720133, "grad_norm": 8.852418899536133, "learning_rate": 5.372007782910015e-06, "loss": 0.2474, "num_input_tokens_seen": 124228720, "step": 57615 }, { "epoch": 10.574417324279684, "grad_norm": 57.595455169677734, "learning_rate": 5.371209240872934e-06, "loss": 0.3442, "num_input_tokens_seen": 124238608, "step": 57620 }, { "epoch": 10.575334923839236, "grad_norm": 30.018407821655273, "learning_rate": 5.370410689314912e-06, "loss": 0.1438, "num_input_tokens_seen": 124249904, "step": 57625 }, { "epoch": 10.57625252339879, "grad_norm": 1.4617195129394531, "learning_rate": 5.369612128256431e-06, "loss": 0.0079, "num_input_tokens_seen": 124260720, "step": 57630 }, { "epoch": 10.577170122958341, "grad_norm": 6.612835884094238, "learning_rate": 5.368813557717969e-06, "loss": 0.3685, "num_input_tokens_seen": 124270832, "step": 57635 }, { "epoch": 10.578087722517893, "grad_norm": 0.5599152445793152, "learning_rate": 5.368014977720013e-06, "loss": 0.0443, "num_input_tokens_seen": 124282704, "step": 57640 }, { "epoch": 10.579005322077446, "grad_norm": 0.44502440094947815, "learning_rate": 5.367216388283042e-06, "loss": 0.2065, "num_input_tokens_seen": 124293296, "step": 57645 }, { "epoch": 10.579922921636998, "grad_norm": 1.8698762655258179, "learning_rate": 5.366417789427541e-06, "loss": 0.0651, "num_input_tokens_seen": 124303728, "step": 57650 }, { "epoch": 10.58084052119655, "grad_norm": 0.19343380630016327, "learning_rate": 5.36561918117399e-06, "loss": 0.2378, "num_input_tokens_seen": 124313616, "step": 57655 }, { "epoch": 10.581758120756103, "grad_norm": 0.7557722330093384, "learning_rate": 5.364820563542875e-06, "loss": 0.087, "num_input_tokens_seen": 124324592, "step": 57660 }, { "epoch": 10.582675720315654, "grad_norm": 0.7921286225318909, "learning_rate": 5.364021936554678e-06, "loss": 0.1952, "num_input_tokens_seen": 124335184, "step": 57665 }, { "epoch": 10.583593319875206, "grad_norm": 6.529714584350586, "learning_rate": 5.3632233002298805e-06, "loss": 0.1123, "num_input_tokens_seen": 124345776, "step": 57670 }, { "epoch": 10.58451091943476, "grad_norm": 6.349965572357178, "learning_rate": 5.36242465458897e-06, "loss": 0.1384, "num_input_tokens_seen": 124356528, "step": 57675 }, { "epoch": 10.585428518994311, "grad_norm": 48.394351959228516, "learning_rate": 5.361625999652429e-06, "loss": 0.1411, "num_input_tokens_seen": 124367600, "step": 57680 }, { "epoch": 10.586346118553863, "grad_norm": 0.16421960294246674, "learning_rate": 5.360827335440741e-06, "loss": 0.2727, "num_input_tokens_seen": 124378352, "step": 57685 }, { "epoch": 10.587263718113416, "grad_norm": 0.1916200816631317, "learning_rate": 5.360028661974391e-06, "loss": 0.1493, "num_input_tokens_seen": 124389776, "step": 57690 }, { "epoch": 10.588181317672968, "grad_norm": 24.87924575805664, "learning_rate": 5.359229979273863e-06, "loss": 0.1617, "num_input_tokens_seen": 124399984, "step": 57695 }, { "epoch": 10.58909891723252, "grad_norm": 97.92723083496094, "learning_rate": 5.358431287359646e-06, "loss": 0.2263, "num_input_tokens_seen": 124412496, "step": 57700 }, { "epoch": 10.590016516792073, "grad_norm": 2.2060024738311768, "learning_rate": 5.357632586252218e-06, "loss": 0.0935, "num_input_tokens_seen": 124421232, "step": 57705 }, { "epoch": 10.590934116351624, "grad_norm": 0.504237174987793, "learning_rate": 5.356833875972071e-06, "loss": 0.008, "num_input_tokens_seen": 124432176, "step": 57710 }, { "epoch": 10.591851715911176, "grad_norm": 0.6008332371711731, "learning_rate": 5.356035156539687e-06, "loss": 0.1667, "num_input_tokens_seen": 124443088, "step": 57715 }, { "epoch": 10.59276931547073, "grad_norm": 0.15310166776180267, "learning_rate": 5.355236427975553e-06, "loss": 0.1279, "num_input_tokens_seen": 124454320, "step": 57720 }, { "epoch": 10.59368691503028, "grad_norm": 0.11637569963932037, "learning_rate": 5.354437690300156e-06, "loss": 0.4581, "num_input_tokens_seen": 124465680, "step": 57725 }, { "epoch": 10.594604514589832, "grad_norm": 425.3074951171875, "learning_rate": 5.353638943533982e-06, "loss": 0.2575, "num_input_tokens_seen": 124475984, "step": 57730 }, { "epoch": 10.595522114149386, "grad_norm": 110.22848510742188, "learning_rate": 5.3528401876975155e-06, "loss": 0.3216, "num_input_tokens_seen": 124486512, "step": 57735 }, { "epoch": 10.596439713708937, "grad_norm": 60.98768997192383, "learning_rate": 5.352041422811247e-06, "loss": 0.0302, "num_input_tokens_seen": 124497936, "step": 57740 }, { "epoch": 10.597357313268489, "grad_norm": 0.2334892898797989, "learning_rate": 5.3512426488956605e-06, "loss": 0.1612, "num_input_tokens_seen": 124507984, "step": 57745 }, { "epoch": 10.598274912828042, "grad_norm": 0.21994560956954956, "learning_rate": 5.350443865971244e-06, "loss": 0.2141, "num_input_tokens_seen": 124519184, "step": 57750 }, { "epoch": 10.599192512387594, "grad_norm": 50.87982177734375, "learning_rate": 5.349645074058486e-06, "loss": 0.241, "num_input_tokens_seen": 124530224, "step": 57755 }, { "epoch": 10.600110111947146, "grad_norm": 1.5358282327651978, "learning_rate": 5.348846273177874e-06, "loss": 0.2022, "num_input_tokens_seen": 124540784, "step": 57760 }, { "epoch": 10.601027711506699, "grad_norm": 0.6521271467208862, "learning_rate": 5.348047463349896e-06, "loss": 0.2095, "num_input_tokens_seen": 124550896, "step": 57765 }, { "epoch": 10.60194531106625, "grad_norm": 12.99777889251709, "learning_rate": 5.34724864459504e-06, "loss": 0.3775, "num_input_tokens_seen": 124561072, "step": 57770 }, { "epoch": 10.602862910625802, "grad_norm": 32.6251220703125, "learning_rate": 5.346449816933793e-06, "loss": 0.0714, "num_input_tokens_seen": 124571568, "step": 57775 }, { "epoch": 10.603780510185356, "grad_norm": 0.3540794849395752, "learning_rate": 5.345650980386645e-06, "loss": 0.3892, "num_input_tokens_seen": 124581648, "step": 57780 }, { "epoch": 10.604698109744907, "grad_norm": 40.27004623413086, "learning_rate": 5.344852134974087e-06, "loss": 0.1491, "num_input_tokens_seen": 124592144, "step": 57785 }, { "epoch": 10.605615709304459, "grad_norm": 0.6232443451881409, "learning_rate": 5.344053280716604e-06, "loss": 0.315, "num_input_tokens_seen": 124603152, "step": 57790 }, { "epoch": 10.606533308864012, "grad_norm": 0.0026920561213046312, "learning_rate": 5.34325441763469e-06, "loss": 0.0036, "num_input_tokens_seen": 124614128, "step": 57795 }, { "epoch": 10.607450908423564, "grad_norm": 263.0655212402344, "learning_rate": 5.3424555457488314e-06, "loss": 0.1955, "num_input_tokens_seen": 124624912, "step": 57800 }, { "epoch": 10.608368507983116, "grad_norm": 45.3116340637207, "learning_rate": 5.341656665079518e-06, "loss": 0.2168, "num_input_tokens_seen": 124635856, "step": 57805 }, { "epoch": 10.609286107542669, "grad_norm": 0.10210269689559937, "learning_rate": 5.340857775647241e-06, "loss": 0.1575, "num_input_tokens_seen": 124646512, "step": 57810 }, { "epoch": 10.61020370710222, "grad_norm": 0.36544108390808105, "learning_rate": 5.340058877472491e-06, "loss": 0.0862, "num_input_tokens_seen": 124657264, "step": 57815 }, { "epoch": 10.611121306661772, "grad_norm": 0.1037883311510086, "learning_rate": 5.339259970575757e-06, "loss": 0.1465, "num_input_tokens_seen": 124669008, "step": 57820 }, { "epoch": 10.612038906221326, "grad_norm": 24.44932746887207, "learning_rate": 5.33846105497753e-06, "loss": 0.2003, "num_input_tokens_seen": 124680976, "step": 57825 }, { "epoch": 10.612956505780877, "grad_norm": 1.5655007362365723, "learning_rate": 5.337662130698303e-06, "loss": 0.303, "num_input_tokens_seen": 124691568, "step": 57830 }, { "epoch": 10.613874105340429, "grad_norm": 1.8866394758224487, "learning_rate": 5.336863197758565e-06, "loss": 0.2662, "num_input_tokens_seen": 124701968, "step": 57835 }, { "epoch": 10.614791704899982, "grad_norm": 0.29070883989334106, "learning_rate": 5.336064256178809e-06, "loss": 0.2501, "num_input_tokens_seen": 124712688, "step": 57840 }, { "epoch": 10.615709304459534, "grad_norm": 20.63779640197754, "learning_rate": 5.335265305979524e-06, "loss": 0.4059, "num_input_tokens_seen": 124723632, "step": 57845 }, { "epoch": 10.616626904019085, "grad_norm": 1.442021369934082, "learning_rate": 5.334466347181205e-06, "loss": 0.2094, "num_input_tokens_seen": 124734064, "step": 57850 }, { "epoch": 10.617544503578639, "grad_norm": 0.17921772599220276, "learning_rate": 5.3336673798043414e-06, "loss": 0.1495, "num_input_tokens_seen": 124744848, "step": 57855 }, { "epoch": 10.61846210313819, "grad_norm": 305.8851013183594, "learning_rate": 5.332868403869428e-06, "loss": 0.2284, "num_input_tokens_seen": 124755952, "step": 57860 }, { "epoch": 10.619379702697742, "grad_norm": 30.717430114746094, "learning_rate": 5.332069419396955e-06, "loss": 0.4159, "num_input_tokens_seen": 124765296, "step": 57865 }, { "epoch": 10.620297302257296, "grad_norm": 15.69703483581543, "learning_rate": 5.331270426407416e-06, "loss": 0.1727, "num_input_tokens_seen": 124776464, "step": 57870 }, { "epoch": 10.621214901816847, "grad_norm": 99.3559341430664, "learning_rate": 5.330471424921304e-06, "loss": 0.3935, "num_input_tokens_seen": 124787408, "step": 57875 }, { "epoch": 10.622132501376399, "grad_norm": 182.947265625, "learning_rate": 5.329672414959112e-06, "loss": 0.1262, "num_input_tokens_seen": 124796816, "step": 57880 }, { "epoch": 10.623050100935952, "grad_norm": 13.320871353149414, "learning_rate": 5.328873396541334e-06, "loss": 0.2211, "num_input_tokens_seen": 124806608, "step": 57885 }, { "epoch": 10.623967700495504, "grad_norm": 0.3474747836589813, "learning_rate": 5.328074369688463e-06, "loss": 0.3332, "num_input_tokens_seen": 124817072, "step": 57890 }, { "epoch": 10.624885300055055, "grad_norm": 1.0206761360168457, "learning_rate": 5.327275334420993e-06, "loss": 0.1814, "num_input_tokens_seen": 124828304, "step": 57895 }, { "epoch": 10.625802899614609, "grad_norm": 0.2764597237110138, "learning_rate": 5.326476290759417e-06, "loss": 0.3575, "num_input_tokens_seen": 124840080, "step": 57900 }, { "epoch": 10.62672049917416, "grad_norm": 41.87726974487305, "learning_rate": 5.325677238724231e-06, "loss": 0.1043, "num_input_tokens_seen": 124850256, "step": 57905 }, { "epoch": 10.627638098733712, "grad_norm": 0.30341559648513794, "learning_rate": 5.324878178335928e-06, "loss": 0.2011, "num_input_tokens_seen": 124862512, "step": 57910 }, { "epoch": 10.628555698293265, "grad_norm": 0.3023887872695923, "learning_rate": 5.324079109615003e-06, "loss": 0.0055, "num_input_tokens_seen": 124872592, "step": 57915 }, { "epoch": 10.629473297852817, "grad_norm": 67.34017944335938, "learning_rate": 5.323280032581952e-06, "loss": 0.2888, "num_input_tokens_seen": 124882800, "step": 57920 }, { "epoch": 10.630390897412369, "grad_norm": 8.76710319519043, "learning_rate": 5.322480947257269e-06, "loss": 0.1767, "num_input_tokens_seen": 124893744, "step": 57925 }, { "epoch": 10.631308496971922, "grad_norm": 65.34626770019531, "learning_rate": 5.32168185366145e-06, "loss": 0.1102, "num_input_tokens_seen": 124904752, "step": 57930 }, { "epoch": 10.632226096531474, "grad_norm": 7.7853522300720215, "learning_rate": 5.320882751814989e-06, "loss": 0.2236, "num_input_tokens_seen": 124914544, "step": 57935 }, { "epoch": 10.633143696091025, "grad_norm": 91.38453674316406, "learning_rate": 5.320083641738383e-06, "loss": 0.295, "num_input_tokens_seen": 124926096, "step": 57940 }, { "epoch": 10.634061295650579, "grad_norm": 1.9446314573287964, "learning_rate": 5.319284523452128e-06, "loss": 0.304, "num_input_tokens_seen": 124936528, "step": 57945 }, { "epoch": 10.63497889521013, "grad_norm": 0.06442036479711533, "learning_rate": 5.31848539697672e-06, "loss": 0.1963, "num_input_tokens_seen": 124946768, "step": 57950 }, { "epoch": 10.635896494769682, "grad_norm": 91.77118682861328, "learning_rate": 5.3176862623326555e-06, "loss": 0.1263, "num_input_tokens_seen": 124958768, "step": 57955 }, { "epoch": 10.636814094329235, "grad_norm": 0.10682490468025208, "learning_rate": 5.31688711954043e-06, "loss": 0.0049, "num_input_tokens_seen": 124968976, "step": 57960 }, { "epoch": 10.637731693888787, "grad_norm": 1.1980793476104736, "learning_rate": 5.31608796862054e-06, "loss": 0.285, "num_input_tokens_seen": 124978832, "step": 57965 }, { "epoch": 10.638649293448339, "grad_norm": 33.917572021484375, "learning_rate": 5.3152888095934865e-06, "loss": 0.0068, "num_input_tokens_seen": 124990064, "step": 57970 }, { "epoch": 10.639566893007892, "grad_norm": 0.25870344042778015, "learning_rate": 5.314489642479761e-06, "loss": 0.1192, "num_input_tokens_seen": 125001008, "step": 57975 }, { "epoch": 10.640484492567444, "grad_norm": 1.6306037902832031, "learning_rate": 5.313690467299865e-06, "loss": 0.4306, "num_input_tokens_seen": 125011440, "step": 57980 }, { "epoch": 10.641402092126995, "grad_norm": 91.59869384765625, "learning_rate": 5.312891284074293e-06, "loss": 0.0669, "num_input_tokens_seen": 125023376, "step": 57985 }, { "epoch": 10.642319691686549, "grad_norm": 10.592501640319824, "learning_rate": 5.312092092823546e-06, "loss": 0.152, "num_input_tokens_seen": 125033232, "step": 57990 }, { "epoch": 10.6432372912461, "grad_norm": 0.3458353281021118, "learning_rate": 5.311292893568119e-06, "loss": 0.3316, "num_input_tokens_seen": 125044656, "step": 57995 }, { "epoch": 10.644154890805652, "grad_norm": 59.766815185546875, "learning_rate": 5.310493686328513e-06, "loss": 0.2861, "num_input_tokens_seen": 125054416, "step": 58000 }, { "epoch": 10.645072490365205, "grad_norm": 45.901336669921875, "learning_rate": 5.3096944711252255e-06, "loss": 0.554, "num_input_tokens_seen": 125064560, "step": 58005 }, { "epoch": 10.645990089924757, "grad_norm": 16.67586326599121, "learning_rate": 5.308895247978754e-06, "loss": 0.4327, "num_input_tokens_seen": 125074160, "step": 58010 }, { "epoch": 10.646907689484308, "grad_norm": 0.48262137174606323, "learning_rate": 5.308096016909597e-06, "loss": 0.1255, "num_input_tokens_seen": 125085072, "step": 58015 }, { "epoch": 10.647825289043862, "grad_norm": 166.28111267089844, "learning_rate": 5.307296777938258e-06, "loss": 0.2629, "num_input_tokens_seen": 125094288, "step": 58020 }, { "epoch": 10.648742888603413, "grad_norm": 21.403934478759766, "learning_rate": 5.30649753108523e-06, "loss": 0.5203, "num_input_tokens_seen": 125105584, "step": 58025 }, { "epoch": 10.649660488162965, "grad_norm": 0.34716132283210754, "learning_rate": 5.305698276371017e-06, "loss": 0.0177, "num_input_tokens_seen": 125115728, "step": 58030 }, { "epoch": 10.650578087722518, "grad_norm": 0.4133302569389343, "learning_rate": 5.304899013816116e-06, "loss": 0.1478, "num_input_tokens_seen": 125126672, "step": 58035 }, { "epoch": 10.65149568728207, "grad_norm": 0.1155976951122284, "learning_rate": 5.304099743441029e-06, "loss": 0.2965, "num_input_tokens_seen": 125136880, "step": 58040 }, { "epoch": 10.652413286841622, "grad_norm": 26.6246280670166, "learning_rate": 5.303300465266254e-06, "loss": 0.4798, "num_input_tokens_seen": 125147952, "step": 58045 }, { "epoch": 10.653330886401175, "grad_norm": 0.569566547870636, "learning_rate": 5.3025011793122915e-06, "loss": 0.0242, "num_input_tokens_seen": 125158992, "step": 58050 }, { "epoch": 10.654248485960727, "grad_norm": 1.6792426109313965, "learning_rate": 5.301701885599644e-06, "loss": 0.2461, "num_input_tokens_seen": 125169392, "step": 58055 }, { "epoch": 10.655166085520278, "grad_norm": 0.4348016083240509, "learning_rate": 5.3009025841488105e-06, "loss": 0.145, "num_input_tokens_seen": 125179888, "step": 58060 }, { "epoch": 10.656083685079832, "grad_norm": 121.7416000366211, "learning_rate": 5.300103274980291e-06, "loss": 0.3987, "num_input_tokens_seen": 125190512, "step": 58065 }, { "epoch": 10.657001284639383, "grad_norm": 0.3210870027542114, "learning_rate": 5.299303958114589e-06, "loss": 0.2182, "num_input_tokens_seen": 125201680, "step": 58070 }, { "epoch": 10.657918884198935, "grad_norm": 12.504899024963379, "learning_rate": 5.2985046335722025e-06, "loss": 0.0056, "num_input_tokens_seen": 125212528, "step": 58075 }, { "epoch": 10.658836483758488, "grad_norm": 2.2651052474975586, "learning_rate": 5.297705301373637e-06, "loss": 0.0911, "num_input_tokens_seen": 125222896, "step": 58080 }, { "epoch": 10.65975408331804, "grad_norm": 9.735651016235352, "learning_rate": 5.2969059615393906e-06, "loss": 0.4681, "num_input_tokens_seen": 125233008, "step": 58085 }, { "epoch": 10.660671682877592, "grad_norm": 20.533288955688477, "learning_rate": 5.296106614089966e-06, "loss": 0.5789, "num_input_tokens_seen": 125242800, "step": 58090 }, { "epoch": 10.661589282437145, "grad_norm": 0.37217170000076294, "learning_rate": 5.295307259045866e-06, "loss": 0.2723, "num_input_tokens_seen": 125253552, "step": 58095 }, { "epoch": 10.662506881996697, "grad_norm": 40.378326416015625, "learning_rate": 5.294507896427593e-06, "loss": 0.1938, "num_input_tokens_seen": 125265744, "step": 58100 }, { "epoch": 10.663424481556248, "grad_norm": 48.40074920654297, "learning_rate": 5.2937085262556486e-06, "loss": 0.2246, "num_input_tokens_seen": 125276560, "step": 58105 }, { "epoch": 10.664342081115802, "grad_norm": 0.8208114504814148, "learning_rate": 5.292909148550535e-06, "loss": 0.1028, "num_input_tokens_seen": 125287312, "step": 58110 }, { "epoch": 10.665259680675353, "grad_norm": 55.20475387573242, "learning_rate": 5.292109763332758e-06, "loss": 0.215, "num_input_tokens_seen": 125296528, "step": 58115 }, { "epoch": 10.666177280234905, "grad_norm": 8.759366035461426, "learning_rate": 5.291310370622816e-06, "loss": 0.2021, "num_input_tokens_seen": 125307216, "step": 58120 }, { "epoch": 10.667094879794458, "grad_norm": 0.36852502822875977, "learning_rate": 5.2905109704412146e-06, "loss": 0.2148, "num_input_tokens_seen": 125316944, "step": 58125 }, { "epoch": 10.66801247935401, "grad_norm": 19.927236557006836, "learning_rate": 5.28971156280846e-06, "loss": 0.1601, "num_input_tokens_seen": 125328912, "step": 58130 }, { "epoch": 10.668930078913561, "grad_norm": 29.537456512451172, "learning_rate": 5.288912147745049e-06, "loss": 0.1088, "num_input_tokens_seen": 125338768, "step": 58135 }, { "epoch": 10.669847678473115, "grad_norm": 0.15509295463562012, "learning_rate": 5.2881127252714916e-06, "loss": 0.0676, "num_input_tokens_seen": 125349104, "step": 58140 }, { "epoch": 10.670765278032667, "grad_norm": 1.4269688129425049, "learning_rate": 5.2873132954082875e-06, "loss": 0.1224, "num_input_tokens_seen": 125360240, "step": 58145 }, { "epoch": 10.671682877592218, "grad_norm": 6.847769737243652, "learning_rate": 5.286513858175943e-06, "loss": 0.3615, "num_input_tokens_seen": 125371120, "step": 58150 }, { "epoch": 10.672600477151772, "grad_norm": 0.06334857642650604, "learning_rate": 5.285714413594963e-06, "loss": 0.0705, "num_input_tokens_seen": 125381488, "step": 58155 }, { "epoch": 10.673518076711323, "grad_norm": 0.2388664335012436, "learning_rate": 5.284914961685852e-06, "loss": 0.3767, "num_input_tokens_seen": 125393040, "step": 58160 }, { "epoch": 10.674435676270875, "grad_norm": 0.2022722065448761, "learning_rate": 5.284115502469113e-06, "loss": 0.0618, "num_input_tokens_seen": 125403312, "step": 58165 }, { "epoch": 10.675353275830428, "grad_norm": 0.15784473717212677, "learning_rate": 5.283316035965251e-06, "loss": 0.6855, "num_input_tokens_seen": 125414160, "step": 58170 }, { "epoch": 10.67627087538998, "grad_norm": 23.34947967529297, "learning_rate": 5.282516562194773e-06, "loss": 0.1049, "num_input_tokens_seen": 125425168, "step": 58175 }, { "epoch": 10.677188474949531, "grad_norm": 35.009647369384766, "learning_rate": 5.281717081178183e-06, "loss": 0.4569, "num_input_tokens_seen": 125435152, "step": 58180 }, { "epoch": 10.678106074509085, "grad_norm": 0.3615284860134125, "learning_rate": 5.280917592935985e-06, "loss": 0.1153, "num_input_tokens_seen": 125445648, "step": 58185 }, { "epoch": 10.679023674068636, "grad_norm": 105.83631896972656, "learning_rate": 5.280118097488687e-06, "loss": 0.3432, "num_input_tokens_seen": 125456944, "step": 58190 }, { "epoch": 10.679941273628188, "grad_norm": 27.92143440246582, "learning_rate": 5.279318594856792e-06, "loss": 0.3495, "num_input_tokens_seen": 125467632, "step": 58195 }, { "epoch": 10.680858873187741, "grad_norm": 107.33586883544922, "learning_rate": 5.278519085060811e-06, "loss": 0.2539, "num_input_tokens_seen": 125478064, "step": 58200 }, { "epoch": 10.681776472747293, "grad_norm": 0.12584878504276276, "learning_rate": 5.277719568121245e-06, "loss": 0.1979, "num_input_tokens_seen": 125488272, "step": 58205 }, { "epoch": 10.682694072306845, "grad_norm": 10.274200439453125, "learning_rate": 5.276920044058603e-06, "loss": 0.2563, "num_input_tokens_seen": 125498896, "step": 58210 }, { "epoch": 10.683611671866398, "grad_norm": 6.34161376953125, "learning_rate": 5.276120512893392e-06, "loss": 0.2046, "num_input_tokens_seen": 125509520, "step": 58215 }, { "epoch": 10.68452927142595, "grad_norm": 37.120609283447266, "learning_rate": 5.275320974646118e-06, "loss": 0.2508, "num_input_tokens_seen": 125521168, "step": 58220 }, { "epoch": 10.685446870985501, "grad_norm": 14.943595886230469, "learning_rate": 5.2745214293372874e-06, "loss": 0.1182, "num_input_tokens_seen": 125531632, "step": 58225 }, { "epoch": 10.686364470545055, "grad_norm": 0.42807450890541077, "learning_rate": 5.273721876987405e-06, "loss": 0.3214, "num_input_tokens_seen": 125542736, "step": 58230 }, { "epoch": 10.687282070104606, "grad_norm": 9.027971267700195, "learning_rate": 5.272922317616983e-06, "loss": 0.1872, "num_input_tokens_seen": 125553136, "step": 58235 }, { "epoch": 10.688199669664158, "grad_norm": 0.5701388120651245, "learning_rate": 5.272122751246526e-06, "loss": 0.0219, "num_input_tokens_seen": 125565328, "step": 58240 }, { "epoch": 10.689117269223711, "grad_norm": 10.950879096984863, "learning_rate": 5.271323177896543e-06, "loss": 0.0867, "num_input_tokens_seen": 125576176, "step": 58245 }, { "epoch": 10.690034868783263, "grad_norm": 0.49327510595321655, "learning_rate": 5.2705235975875416e-06, "loss": 0.2894, "num_input_tokens_seen": 125587792, "step": 58250 }, { "epoch": 10.690952468342815, "grad_norm": 0.11434277892112732, "learning_rate": 5.269724010340027e-06, "loss": 0.318, "num_input_tokens_seen": 125598448, "step": 58255 }, { "epoch": 10.691870067902368, "grad_norm": 20.442447662353516, "learning_rate": 5.26892441617451e-06, "loss": 0.0202, "num_input_tokens_seen": 125608208, "step": 58260 }, { "epoch": 10.69278766746192, "grad_norm": 29.91371726989746, "learning_rate": 5.2681248151115e-06, "loss": 0.2557, "num_input_tokens_seen": 125618928, "step": 58265 }, { "epoch": 10.693705267021471, "grad_norm": 50.65059280395508, "learning_rate": 5.267325207171504e-06, "loss": 0.0244, "num_input_tokens_seen": 125629616, "step": 58270 }, { "epoch": 10.694622866581025, "grad_norm": 0.38837704062461853, "learning_rate": 5.266525592375031e-06, "loss": 0.216, "num_input_tokens_seen": 125641936, "step": 58275 }, { "epoch": 10.695540466140576, "grad_norm": 24.00206756591797, "learning_rate": 5.265725970742588e-06, "loss": 0.2967, "num_input_tokens_seen": 125652720, "step": 58280 }, { "epoch": 10.696458065700128, "grad_norm": 142.47950744628906, "learning_rate": 5.264926342294686e-06, "loss": 0.3854, "num_input_tokens_seen": 125664432, "step": 58285 }, { "epoch": 10.697375665259681, "grad_norm": 1.6223480701446533, "learning_rate": 5.264126707051836e-06, "loss": 0.144, "num_input_tokens_seen": 125674608, "step": 58290 }, { "epoch": 10.698293264819233, "grad_norm": 6.470914363861084, "learning_rate": 5.263327065034542e-06, "loss": 0.066, "num_input_tokens_seen": 125685328, "step": 58295 }, { "epoch": 10.699210864378784, "grad_norm": 1.1022610664367676, "learning_rate": 5.262527416263319e-06, "loss": 0.0187, "num_input_tokens_seen": 125696176, "step": 58300 }, { "epoch": 10.700128463938338, "grad_norm": 23.94074058532715, "learning_rate": 5.261727760758674e-06, "loss": 0.3713, "num_input_tokens_seen": 125707344, "step": 58305 }, { "epoch": 10.70104606349789, "grad_norm": 124.52925872802734, "learning_rate": 5.260928098541117e-06, "loss": 0.5042, "num_input_tokens_seen": 125718768, "step": 58310 }, { "epoch": 10.701963663057441, "grad_norm": 11.652654647827148, "learning_rate": 5.260128429631159e-06, "loss": 0.3093, "num_input_tokens_seen": 125730096, "step": 58315 }, { "epoch": 10.702881262616994, "grad_norm": 126.95130920410156, "learning_rate": 5.259328754049311e-06, "loss": 0.1884, "num_input_tokens_seen": 125741072, "step": 58320 }, { "epoch": 10.703798862176546, "grad_norm": 0.6897289156913757, "learning_rate": 5.258529071816082e-06, "loss": 0.0417, "num_input_tokens_seen": 125751472, "step": 58325 }, { "epoch": 10.704716461736098, "grad_norm": 43.213279724121094, "learning_rate": 5.257729382951983e-06, "loss": 0.2141, "num_input_tokens_seen": 125762064, "step": 58330 }, { "epoch": 10.705634061295651, "grad_norm": 10.016470909118652, "learning_rate": 5.256929687477524e-06, "loss": 0.1128, "num_input_tokens_seen": 125771888, "step": 58335 }, { "epoch": 10.706551660855203, "grad_norm": 2.4428460597991943, "learning_rate": 5.256129985413218e-06, "loss": 0.2294, "num_input_tokens_seen": 125783248, "step": 58340 }, { "epoch": 10.707469260414754, "grad_norm": 1.15652596950531, "learning_rate": 5.255330276779572e-06, "loss": 0.0938, "num_input_tokens_seen": 125793904, "step": 58345 }, { "epoch": 10.708386859974308, "grad_norm": 3.592219114303589, "learning_rate": 5.254530561597103e-06, "loss": 0.1703, "num_input_tokens_seen": 125803760, "step": 58350 }, { "epoch": 10.70930445953386, "grad_norm": 1.5662107467651367, "learning_rate": 5.253730839886318e-06, "loss": 0.1021, "num_input_tokens_seen": 125815344, "step": 58355 }, { "epoch": 10.710222059093411, "grad_norm": 22.961132049560547, "learning_rate": 5.252931111667731e-06, "loss": 0.1993, "num_input_tokens_seen": 125827504, "step": 58360 }, { "epoch": 10.711139658652964, "grad_norm": 3.8960886001586914, "learning_rate": 5.252131376961853e-06, "loss": 0.2798, "num_input_tokens_seen": 125838896, "step": 58365 }, { "epoch": 10.712057258212516, "grad_norm": 1.5234318971633911, "learning_rate": 5.251331635789196e-06, "loss": 0.1812, "num_input_tokens_seen": 125850224, "step": 58370 }, { "epoch": 10.712974857772068, "grad_norm": 23.936567306518555, "learning_rate": 5.250531888170273e-06, "loss": 0.4081, "num_input_tokens_seen": 125860112, "step": 58375 }, { "epoch": 10.713892457331621, "grad_norm": 59.684654235839844, "learning_rate": 5.2497321341255944e-06, "loss": 0.0568, "num_input_tokens_seen": 125870576, "step": 58380 }, { "epoch": 10.714810056891173, "grad_norm": 1.873387098312378, "learning_rate": 5.248932373675673e-06, "loss": 0.0811, "num_input_tokens_seen": 125882128, "step": 58385 }, { "epoch": 10.715727656450724, "grad_norm": 8.165058135986328, "learning_rate": 5.248132606841023e-06, "loss": 0.0543, "num_input_tokens_seen": 125892976, "step": 58390 }, { "epoch": 10.716645256010278, "grad_norm": 48.3574104309082, "learning_rate": 5.247332833642156e-06, "loss": 0.4662, "num_input_tokens_seen": 125904624, "step": 58395 }, { "epoch": 10.71756285556983, "grad_norm": 1.2714048624038696, "learning_rate": 5.246533054099585e-06, "loss": 0.1522, "num_input_tokens_seen": 125916400, "step": 58400 }, { "epoch": 10.71848045512938, "grad_norm": 145.58065795898438, "learning_rate": 5.245733268233822e-06, "loss": 0.1048, "num_input_tokens_seen": 125926288, "step": 58405 }, { "epoch": 10.719398054688934, "grad_norm": 0.12720376253128052, "learning_rate": 5.244933476065384e-06, "loss": 0.0092, "num_input_tokens_seen": 125937776, "step": 58410 }, { "epoch": 10.720315654248486, "grad_norm": 2.727569341659546, "learning_rate": 5.24413367761478e-06, "loss": 0.2212, "num_input_tokens_seen": 125948944, "step": 58415 }, { "epoch": 10.721233253808037, "grad_norm": 48.666446685791016, "learning_rate": 5.243333872902527e-06, "loss": 0.1491, "num_input_tokens_seen": 125959568, "step": 58420 }, { "epoch": 10.72215085336759, "grad_norm": 3.7980401515960693, "learning_rate": 5.242534061949136e-06, "loss": 0.0072, "num_input_tokens_seen": 125970704, "step": 58425 }, { "epoch": 10.723068452927143, "grad_norm": 0.5437886118888855, "learning_rate": 5.241734244775122e-06, "loss": 0.3066, "num_input_tokens_seen": 125981040, "step": 58430 }, { "epoch": 10.723986052486694, "grad_norm": 53.95943069458008, "learning_rate": 5.240934421401e-06, "loss": 0.2926, "num_input_tokens_seen": 125991824, "step": 58435 }, { "epoch": 10.724903652046248, "grad_norm": 0.15168827772140503, "learning_rate": 5.2401345918472835e-06, "loss": 0.0104, "num_input_tokens_seen": 126002896, "step": 58440 }, { "epoch": 10.7258212516058, "grad_norm": 22.398406982421875, "learning_rate": 5.239334756134486e-06, "loss": 0.337, "num_input_tokens_seen": 126014960, "step": 58445 }, { "epoch": 10.72673885116535, "grad_norm": 1034.0987548828125, "learning_rate": 5.238534914283125e-06, "loss": 0.2359, "num_input_tokens_seen": 126025872, "step": 58450 }, { "epoch": 10.727656450724904, "grad_norm": 207.4638214111328, "learning_rate": 5.237735066313712e-06, "loss": 0.2478, "num_input_tokens_seen": 126036432, "step": 58455 }, { "epoch": 10.728574050284456, "grad_norm": 10.770035743713379, "learning_rate": 5.236935212246763e-06, "loss": 0.1307, "num_input_tokens_seen": 126048592, "step": 58460 }, { "epoch": 10.729491649844007, "grad_norm": 0.2125825732946396, "learning_rate": 5.236135352102793e-06, "loss": 0.1026, "num_input_tokens_seen": 126060016, "step": 58465 }, { "epoch": 10.73040924940356, "grad_norm": 0.44085970520973206, "learning_rate": 5.235335485902317e-06, "loss": 0.0014, "num_input_tokens_seen": 126069616, "step": 58470 }, { "epoch": 10.731326848963112, "grad_norm": 116.52574920654297, "learning_rate": 5.23453561366585e-06, "loss": 0.1157, "num_input_tokens_seen": 126079696, "step": 58475 }, { "epoch": 10.732244448522664, "grad_norm": 0.03617039695382118, "learning_rate": 5.233735735413909e-06, "loss": 0.2432, "num_input_tokens_seen": 126090480, "step": 58480 }, { "epoch": 10.733162048082217, "grad_norm": 38.278202056884766, "learning_rate": 5.232935851167008e-06, "loss": 0.2027, "num_input_tokens_seen": 126100016, "step": 58485 }, { "epoch": 10.734079647641769, "grad_norm": 11.099066734313965, "learning_rate": 5.232135960945664e-06, "loss": 0.2653, "num_input_tokens_seen": 126110096, "step": 58490 }, { "epoch": 10.73499724720132, "grad_norm": 0.17595575749874115, "learning_rate": 5.231336064770392e-06, "loss": 0.5216, "num_input_tokens_seen": 126120240, "step": 58495 }, { "epoch": 10.735914846760874, "grad_norm": 23.428218841552734, "learning_rate": 5.2305361626617104e-06, "loss": 0.1122, "num_input_tokens_seen": 126131408, "step": 58500 }, { "epoch": 10.736832446320426, "grad_norm": 0.11768923699855804, "learning_rate": 5.229736254640131e-06, "loss": 0.0521, "num_input_tokens_seen": 126142128, "step": 58505 }, { "epoch": 10.737750045879977, "grad_norm": 0.7253932356834412, "learning_rate": 5.228936340726174e-06, "loss": 0.1636, "num_input_tokens_seen": 126152976, "step": 58510 }, { "epoch": 10.73866764543953, "grad_norm": 54.289649963378906, "learning_rate": 5.228136420940353e-06, "loss": 0.7754, "num_input_tokens_seen": 126164592, "step": 58515 }, { "epoch": 10.739585244999082, "grad_norm": 42.50551986694336, "learning_rate": 5.227336495303188e-06, "loss": 0.5498, "num_input_tokens_seen": 126175728, "step": 58520 }, { "epoch": 10.740502844558634, "grad_norm": 100.02410888671875, "learning_rate": 5.2265365638351936e-06, "loss": 0.0643, "num_input_tokens_seen": 126186704, "step": 58525 }, { "epoch": 10.741420444118187, "grad_norm": 0.30173227190971375, "learning_rate": 5.225736626556888e-06, "loss": 0.1198, "num_input_tokens_seen": 126198096, "step": 58530 }, { "epoch": 10.742338043677739, "grad_norm": 7.357442378997803, "learning_rate": 5.224936683488787e-06, "loss": 0.3076, "num_input_tokens_seen": 126208848, "step": 58535 }, { "epoch": 10.74325564323729, "grad_norm": 39.91717529296875, "learning_rate": 5.224136734651409e-06, "loss": 0.3742, "num_input_tokens_seen": 126219568, "step": 58540 }, { "epoch": 10.744173242796844, "grad_norm": 0.05334579944610596, "learning_rate": 5.223336780065271e-06, "loss": 0.0989, "num_input_tokens_seen": 126231024, "step": 58545 }, { "epoch": 10.745090842356396, "grad_norm": 29.30890464782715, "learning_rate": 5.222536819750891e-06, "loss": 0.3277, "num_input_tokens_seen": 126242608, "step": 58550 }, { "epoch": 10.746008441915947, "grad_norm": 0.07213207334280014, "learning_rate": 5.221736853728786e-06, "loss": 0.1826, "num_input_tokens_seen": 126253968, "step": 58555 }, { "epoch": 10.7469260414755, "grad_norm": 12.029130935668945, "learning_rate": 5.220936882019475e-06, "loss": 0.1287, "num_input_tokens_seen": 126265392, "step": 58560 }, { "epoch": 10.747843641035052, "grad_norm": 0.32957392930984497, "learning_rate": 5.220136904643475e-06, "loss": 0.2739, "num_input_tokens_seen": 126277264, "step": 58565 }, { "epoch": 10.748761240594604, "grad_norm": 0.20083306729793549, "learning_rate": 5.219336921621305e-06, "loss": 0.0903, "num_input_tokens_seen": 126288240, "step": 58570 }, { "epoch": 10.749678840154157, "grad_norm": 0.18157058954238892, "learning_rate": 5.218536932973483e-06, "loss": 0.1445, "num_input_tokens_seen": 126300432, "step": 58575 }, { "epoch": 10.750596439713709, "grad_norm": 0.20247101783752441, "learning_rate": 5.217736938720527e-06, "loss": 0.4445, "num_input_tokens_seen": 126311824, "step": 58580 }, { "epoch": 10.75151403927326, "grad_norm": 0.2509680390357971, "learning_rate": 5.216936938882956e-06, "loss": 0.2265, "num_input_tokens_seen": 126321936, "step": 58585 }, { "epoch": 10.752431638832814, "grad_norm": 0.37831220030784607, "learning_rate": 5.216136933481288e-06, "loss": 0.1136, "num_input_tokens_seen": 126332592, "step": 58590 }, { "epoch": 10.753349238392365, "grad_norm": 118.19898223876953, "learning_rate": 5.215336922536044e-06, "loss": 0.5311, "num_input_tokens_seen": 126342192, "step": 58595 }, { "epoch": 10.754266837951917, "grad_norm": 0.21366943418979645, "learning_rate": 5.214536906067742e-06, "loss": 0.1185, "num_input_tokens_seen": 126353552, "step": 58600 }, { "epoch": 10.75518443751147, "grad_norm": 1.1324987411499023, "learning_rate": 5.213736884096899e-06, "loss": 0.1076, "num_input_tokens_seen": 126363888, "step": 58605 }, { "epoch": 10.756102037071022, "grad_norm": 21.408531188964844, "learning_rate": 5.2129368566440385e-06, "loss": 0.1556, "num_input_tokens_seen": 126375184, "step": 58610 }, { "epoch": 10.757019636630574, "grad_norm": 0.716606616973877, "learning_rate": 5.2121368237296756e-06, "loss": 0.1813, "num_input_tokens_seen": 126385456, "step": 58615 }, { "epoch": 10.757937236190127, "grad_norm": 106.09951782226562, "learning_rate": 5.211336785374334e-06, "loss": 0.3008, "num_input_tokens_seen": 126395824, "step": 58620 }, { "epoch": 10.758854835749679, "grad_norm": 81.94055938720703, "learning_rate": 5.210536741598528e-06, "loss": 0.304, "num_input_tokens_seen": 126405456, "step": 58625 }, { "epoch": 10.75977243530923, "grad_norm": 0.1686573326587677, "learning_rate": 5.209736692422783e-06, "loss": 0.3691, "num_input_tokens_seen": 126416240, "step": 58630 }, { "epoch": 10.760690034868784, "grad_norm": 1.1144788265228271, "learning_rate": 5.2089366378676176e-06, "loss": 0.1792, "num_input_tokens_seen": 126426832, "step": 58635 }, { "epoch": 10.761607634428335, "grad_norm": 5.598828315734863, "learning_rate": 5.20813657795355e-06, "loss": 0.1547, "num_input_tokens_seen": 126439184, "step": 58640 }, { "epoch": 10.762525233987887, "grad_norm": 0.2439357489347458, "learning_rate": 5.207336512701102e-06, "loss": 0.2612, "num_input_tokens_seen": 126451056, "step": 58645 }, { "epoch": 10.76344283354744, "grad_norm": 0.10759678483009338, "learning_rate": 5.206536442130794e-06, "loss": 0.1741, "num_input_tokens_seen": 126461776, "step": 58650 }, { "epoch": 10.764360433106992, "grad_norm": 12.698287963867188, "learning_rate": 5.205736366263148e-06, "loss": 0.1065, "num_input_tokens_seen": 126471632, "step": 58655 }, { "epoch": 10.765278032666544, "grad_norm": 83.26693725585938, "learning_rate": 5.2049362851186805e-06, "loss": 0.2337, "num_input_tokens_seen": 126481808, "step": 58660 }, { "epoch": 10.766195632226097, "grad_norm": 0.1988382637500763, "learning_rate": 5.204136198717915e-06, "loss": 0.1943, "num_input_tokens_seen": 126492816, "step": 58665 }, { "epoch": 10.767113231785649, "grad_norm": 0.2743883728981018, "learning_rate": 5.203336107081374e-06, "loss": 0.1612, "num_input_tokens_seen": 126503632, "step": 58670 }, { "epoch": 10.7680308313452, "grad_norm": 10.83786678314209, "learning_rate": 5.202536010229575e-06, "loss": 0.1159, "num_input_tokens_seen": 126515216, "step": 58675 }, { "epoch": 10.768948430904754, "grad_norm": 95.59062957763672, "learning_rate": 5.201735908183043e-06, "loss": 0.2443, "num_input_tokens_seen": 126525072, "step": 58680 }, { "epoch": 10.769866030464305, "grad_norm": 10.186612129211426, "learning_rate": 5.200935800962297e-06, "loss": 0.1969, "num_input_tokens_seen": 126534768, "step": 58685 }, { "epoch": 10.770783630023857, "grad_norm": 0.17186321318149567, "learning_rate": 5.20013568858786e-06, "loss": 0.1047, "num_input_tokens_seen": 126546416, "step": 58690 }, { "epoch": 10.77170122958341, "grad_norm": 1.7911537885665894, "learning_rate": 5.199335571080252e-06, "loss": 0.2026, "num_input_tokens_seen": 126557360, "step": 58695 }, { "epoch": 10.772618829142962, "grad_norm": 163.11819458007812, "learning_rate": 5.198535448459996e-06, "loss": 0.3639, "num_input_tokens_seen": 126568656, "step": 58700 }, { "epoch": 10.773536428702513, "grad_norm": 0.9924390912055969, "learning_rate": 5.197735320747612e-06, "loss": 0.3586, "num_input_tokens_seen": 126578384, "step": 58705 }, { "epoch": 10.774454028262067, "grad_norm": 0.11616850644350052, "learning_rate": 5.196935187963625e-06, "loss": 0.1928, "num_input_tokens_seen": 126588784, "step": 58710 }, { "epoch": 10.775371627821619, "grad_norm": 339.44927978515625, "learning_rate": 5.196135050128554e-06, "loss": 0.3337, "num_input_tokens_seen": 126598064, "step": 58715 }, { "epoch": 10.77628922738117, "grad_norm": 28.179969787597656, "learning_rate": 5.1953349072629255e-06, "loss": 0.2614, "num_input_tokens_seen": 126607696, "step": 58720 }, { "epoch": 10.777206826940724, "grad_norm": 9.251447677612305, "learning_rate": 5.194534759387257e-06, "loss": 0.1248, "num_input_tokens_seen": 126619760, "step": 58725 }, { "epoch": 10.778124426500275, "grad_norm": 0.32112735509872437, "learning_rate": 5.193734606522075e-06, "loss": 0.0935, "num_input_tokens_seen": 126629712, "step": 58730 }, { "epoch": 10.779042026059827, "grad_norm": 22.20134735107422, "learning_rate": 5.1929344486878995e-06, "loss": 0.0959, "num_input_tokens_seen": 126640464, "step": 58735 }, { "epoch": 10.77995962561938, "grad_norm": 43.70927047729492, "learning_rate": 5.192134285905255e-06, "loss": 0.1879, "num_input_tokens_seen": 126650992, "step": 58740 }, { "epoch": 10.780877225178932, "grad_norm": 5.829128742218018, "learning_rate": 5.191334118194664e-06, "loss": 0.1247, "num_input_tokens_seen": 126661424, "step": 58745 }, { "epoch": 10.781794824738483, "grad_norm": 0.19571872055530548, "learning_rate": 5.190533945576649e-06, "loss": 0.167, "num_input_tokens_seen": 126672720, "step": 58750 }, { "epoch": 10.782712424298037, "grad_norm": 1.5497580766677856, "learning_rate": 5.1897337680717345e-06, "loss": 0.0066, "num_input_tokens_seen": 126682704, "step": 58755 }, { "epoch": 10.783630023857588, "grad_norm": 0.254172682762146, "learning_rate": 5.188933585700442e-06, "loss": 0.0077, "num_input_tokens_seen": 126694032, "step": 58760 }, { "epoch": 10.78454762341714, "grad_norm": 0.4615357518196106, "learning_rate": 5.188133398483295e-06, "loss": 0.1488, "num_input_tokens_seen": 126704240, "step": 58765 }, { "epoch": 10.785465222976693, "grad_norm": 61.175533294677734, "learning_rate": 5.18733320644082e-06, "loss": 0.4227, "num_input_tokens_seen": 126714288, "step": 58770 }, { "epoch": 10.786382822536245, "grad_norm": 0.23691347241401672, "learning_rate": 5.186533009593536e-06, "loss": 0.0952, "num_input_tokens_seen": 126724464, "step": 58775 }, { "epoch": 10.787300422095797, "grad_norm": 49.5820426940918, "learning_rate": 5.185732807961971e-06, "loss": 0.2291, "num_input_tokens_seen": 126735504, "step": 58780 }, { "epoch": 10.78821802165535, "grad_norm": 53.8134880065918, "learning_rate": 5.184932601566648e-06, "loss": 0.4196, "num_input_tokens_seen": 126746736, "step": 58785 }, { "epoch": 10.789135621214902, "grad_norm": 152.8816680908203, "learning_rate": 5.18413239042809e-06, "loss": 0.3289, "num_input_tokens_seen": 126757200, "step": 58790 }, { "epoch": 10.790053220774453, "grad_norm": 0.017084088176488876, "learning_rate": 5.183332174566821e-06, "loss": 0.0846, "num_input_tokens_seen": 126767408, "step": 58795 }, { "epoch": 10.790970820334007, "grad_norm": 74.67913818359375, "learning_rate": 5.182531954003365e-06, "loss": 0.4714, "num_input_tokens_seen": 126779280, "step": 58800 }, { "epoch": 10.791888419893558, "grad_norm": 4.255267143249512, "learning_rate": 5.181731728758249e-06, "loss": 0.2331, "num_input_tokens_seen": 126790224, "step": 58805 }, { "epoch": 10.79280601945311, "grad_norm": 1.5134844779968262, "learning_rate": 5.180931498851995e-06, "loss": 0.2036, "num_input_tokens_seen": 126801328, "step": 58810 }, { "epoch": 10.793723619012663, "grad_norm": 78.0770263671875, "learning_rate": 5.18013126430513e-06, "loss": 0.6933, "num_input_tokens_seen": 126813072, "step": 58815 }, { "epoch": 10.794641218572215, "grad_norm": 0.08461259305477142, "learning_rate": 5.1793310251381755e-06, "loss": 0.2334, "num_input_tokens_seen": 126822352, "step": 58820 }, { "epoch": 10.795558818131767, "grad_norm": 0.4365752935409546, "learning_rate": 5.178530781371658e-06, "loss": 0.208, "num_input_tokens_seen": 126833520, "step": 58825 }, { "epoch": 10.79647641769132, "grad_norm": 0.29046013951301575, "learning_rate": 5.177730533026104e-06, "loss": 0.0204, "num_input_tokens_seen": 126844464, "step": 58830 }, { "epoch": 10.797394017250872, "grad_norm": 54.59844207763672, "learning_rate": 5.1769302801220355e-06, "loss": 0.0758, "num_input_tokens_seen": 126856432, "step": 58835 }, { "epoch": 10.798311616810423, "grad_norm": 33.1734619140625, "learning_rate": 5.176130022679981e-06, "loss": 0.2975, "num_input_tokens_seen": 126866704, "step": 58840 }, { "epoch": 10.799229216369977, "grad_norm": 3.6363561153411865, "learning_rate": 5.175329760720463e-06, "loss": 0.0187, "num_input_tokens_seen": 126877776, "step": 58845 }, { "epoch": 10.800146815929528, "grad_norm": 97.55205535888672, "learning_rate": 5.174529494264009e-06, "loss": 0.0498, "num_input_tokens_seen": 126889104, "step": 58850 }, { "epoch": 10.80106441548908, "grad_norm": 0.18803666532039642, "learning_rate": 5.173729223331146e-06, "loss": 0.4622, "num_input_tokens_seen": 126900336, "step": 58855 }, { "epoch": 10.801982015048633, "grad_norm": 0.40633106231689453, "learning_rate": 5.172928947942395e-06, "loss": 0.151, "num_input_tokens_seen": 126909936, "step": 58860 }, { "epoch": 10.802899614608185, "grad_norm": 0.14201563596725464, "learning_rate": 5.172128668118286e-06, "loss": 0.1346, "num_input_tokens_seen": 126921520, "step": 58865 }, { "epoch": 10.803817214167736, "grad_norm": 36.32660675048828, "learning_rate": 5.171328383879341e-06, "loss": 0.0223, "num_input_tokens_seen": 126932592, "step": 58870 }, { "epoch": 10.80473481372729, "grad_norm": 24.210416793823242, "learning_rate": 5.170528095246091e-06, "loss": 0.0134, "num_input_tokens_seen": 126942160, "step": 58875 }, { "epoch": 10.805652413286841, "grad_norm": 50.875465393066406, "learning_rate": 5.1697278022390595e-06, "loss": 0.2122, "num_input_tokens_seen": 126953648, "step": 58880 }, { "epoch": 10.806570012846393, "grad_norm": 11.147385597229004, "learning_rate": 5.1689275048787725e-06, "loss": 0.362, "num_input_tokens_seen": 126964112, "step": 58885 }, { "epoch": 10.807487612405946, "grad_norm": 71.25296020507812, "learning_rate": 5.168127203185756e-06, "loss": 0.4295, "num_input_tokens_seen": 126974352, "step": 58890 }, { "epoch": 10.808405211965498, "grad_norm": 0.36900368332862854, "learning_rate": 5.1673268971805376e-06, "loss": 0.2854, "num_input_tokens_seen": 126984656, "step": 58895 }, { "epoch": 10.80932281152505, "grad_norm": 0.1051117554306984, "learning_rate": 5.166526586883644e-06, "loss": 0.2989, "num_input_tokens_seen": 126994960, "step": 58900 }, { "epoch": 10.810240411084603, "grad_norm": 8.963241577148438, "learning_rate": 5.165726272315602e-06, "loss": 0.3452, "num_input_tokens_seen": 127006096, "step": 58905 }, { "epoch": 10.811158010644155, "grad_norm": 0.32851871848106384, "learning_rate": 5.164925953496937e-06, "loss": 0.1039, "num_input_tokens_seen": 127016496, "step": 58910 }, { "epoch": 10.812075610203706, "grad_norm": 0.14102742075920105, "learning_rate": 5.164125630448178e-06, "loss": 0.0665, "num_input_tokens_seen": 127027120, "step": 58915 }, { "epoch": 10.81299320976326, "grad_norm": 0.42919328808784485, "learning_rate": 5.163325303189851e-06, "loss": 0.1134, "num_input_tokens_seen": 127037552, "step": 58920 }, { "epoch": 10.813910809322811, "grad_norm": 0.64743572473526, "learning_rate": 5.162524971742483e-06, "loss": 0.0089, "num_input_tokens_seen": 127047472, "step": 58925 }, { "epoch": 10.814828408882363, "grad_norm": 0.38018926978111267, "learning_rate": 5.161724636126602e-06, "loss": 0.3783, "num_input_tokens_seen": 127058288, "step": 58930 }, { "epoch": 10.815746008441916, "grad_norm": 0.42860135436058044, "learning_rate": 5.160924296362733e-06, "loss": 0.2387, "num_input_tokens_seen": 127069520, "step": 58935 }, { "epoch": 10.816663608001468, "grad_norm": 14.419641494750977, "learning_rate": 5.160123952471406e-06, "loss": 0.0936, "num_input_tokens_seen": 127079824, "step": 58940 }, { "epoch": 10.81758120756102, "grad_norm": 106.97333526611328, "learning_rate": 5.159323604473146e-06, "loss": 0.4921, "num_input_tokens_seen": 127090768, "step": 58945 }, { "epoch": 10.818498807120573, "grad_norm": 0.12091974169015884, "learning_rate": 5.158523252388486e-06, "loss": 0.2486, "num_input_tokens_seen": 127101616, "step": 58950 }, { "epoch": 10.819416406680125, "grad_norm": 9.50290298461914, "learning_rate": 5.1577228962379475e-06, "loss": 0.2253, "num_input_tokens_seen": 127112240, "step": 58955 }, { "epoch": 10.820334006239676, "grad_norm": 0.13802507519721985, "learning_rate": 5.156922536042061e-06, "loss": 0.0626, "num_input_tokens_seen": 127121360, "step": 58960 }, { "epoch": 10.82125160579923, "grad_norm": 0.22090163826942444, "learning_rate": 5.156122171821356e-06, "loss": 0.4023, "num_input_tokens_seen": 127131696, "step": 58965 }, { "epoch": 10.822169205358781, "grad_norm": 1.5515505075454712, "learning_rate": 5.1553218035963595e-06, "loss": 0.4243, "num_input_tokens_seen": 127143376, "step": 58970 }, { "epoch": 10.823086804918333, "grad_norm": 0.6714327335357666, "learning_rate": 5.154521431387599e-06, "loss": 0.272, "num_input_tokens_seen": 127153520, "step": 58975 }, { "epoch": 10.824004404477886, "grad_norm": 0.5605478882789612, "learning_rate": 5.153721055215602e-06, "loss": 0.0079, "num_input_tokens_seen": 127163984, "step": 58980 }, { "epoch": 10.824922004037438, "grad_norm": 4.766635894775391, "learning_rate": 5.152920675100899e-06, "loss": 0.3071, "num_input_tokens_seen": 127173968, "step": 58985 }, { "epoch": 10.82583960359699, "grad_norm": 11.067984580993652, "learning_rate": 5.152120291064019e-06, "loss": 0.2142, "num_input_tokens_seen": 127184240, "step": 58990 }, { "epoch": 10.826757203156543, "grad_norm": 0.565906286239624, "learning_rate": 5.151319903125488e-06, "loss": 0.1784, "num_input_tokens_seen": 127194608, "step": 58995 }, { "epoch": 10.827674802716095, "grad_norm": 46.68721389770508, "learning_rate": 5.150519511305837e-06, "loss": 0.3526, "num_input_tokens_seen": 127205360, "step": 59000 }, { "epoch": 10.828592402275646, "grad_norm": 207.6312255859375, "learning_rate": 5.149719115625592e-06, "loss": 0.2585, "num_input_tokens_seen": 127216720, "step": 59005 }, { "epoch": 10.8295100018352, "grad_norm": 0.37590038776397705, "learning_rate": 5.148918716105284e-06, "loss": 0.357, "num_input_tokens_seen": 127226384, "step": 59010 }, { "epoch": 10.830427601394751, "grad_norm": 7.359084606170654, "learning_rate": 5.1481183127654444e-06, "loss": 0.4913, "num_input_tokens_seen": 127237584, "step": 59015 }, { "epoch": 10.831345200954303, "grad_norm": 0.2730198800563812, "learning_rate": 5.147317905626598e-06, "loss": 0.1816, "num_input_tokens_seen": 127248880, "step": 59020 }, { "epoch": 10.832262800513856, "grad_norm": 44.02875900268555, "learning_rate": 5.146517494709276e-06, "loss": 0.3527, "num_input_tokens_seen": 127259376, "step": 59025 }, { "epoch": 10.833180400073408, "grad_norm": 25.720197677612305, "learning_rate": 5.145717080034007e-06, "loss": 0.2867, "num_input_tokens_seen": 127270000, "step": 59030 }, { "epoch": 10.83409799963296, "grad_norm": 31.526100158691406, "learning_rate": 5.14491666162132e-06, "loss": 0.0257, "num_input_tokens_seen": 127280304, "step": 59035 }, { "epoch": 10.835015599192513, "grad_norm": 15.029850006103516, "learning_rate": 5.144116239491746e-06, "loss": 0.2298, "num_input_tokens_seen": 127291600, "step": 59040 }, { "epoch": 10.835933198752064, "grad_norm": 71.15484619140625, "learning_rate": 5.143315813665814e-06, "loss": 0.2113, "num_input_tokens_seen": 127302096, "step": 59045 }, { "epoch": 10.836850798311616, "grad_norm": 1.5772228240966797, "learning_rate": 5.142515384164053e-06, "loss": 0.0126, "num_input_tokens_seen": 127312976, "step": 59050 }, { "epoch": 10.83776839787117, "grad_norm": 15.826126098632812, "learning_rate": 5.141714951006993e-06, "loss": 0.2884, "num_input_tokens_seen": 127323056, "step": 59055 }, { "epoch": 10.838685997430721, "grad_norm": 90.91761779785156, "learning_rate": 5.140914514215164e-06, "loss": 0.2126, "num_input_tokens_seen": 127333616, "step": 59060 }, { "epoch": 10.839603596990273, "grad_norm": 0.2905605137348175, "learning_rate": 5.140114073809097e-06, "loss": 0.2878, "num_input_tokens_seen": 127344560, "step": 59065 }, { "epoch": 10.840521196549826, "grad_norm": 0.8931220173835754, "learning_rate": 5.139313629809321e-06, "loss": 0.44, "num_input_tokens_seen": 127355088, "step": 59070 }, { "epoch": 10.841438796109378, "grad_norm": 17.26144790649414, "learning_rate": 5.138513182236367e-06, "loss": 0.3091, "num_input_tokens_seen": 127366064, "step": 59075 }, { "epoch": 10.84235639566893, "grad_norm": 3.340751886367798, "learning_rate": 5.137712731110764e-06, "loss": 0.0812, "num_input_tokens_seen": 127377040, "step": 59080 }, { "epoch": 10.843273995228483, "grad_norm": 38.486595153808594, "learning_rate": 5.136912276453041e-06, "loss": 0.2212, "num_input_tokens_seen": 127388432, "step": 59085 }, { "epoch": 10.844191594788034, "grad_norm": 70.48543548583984, "learning_rate": 5.1361118182837325e-06, "loss": 0.2924, "num_input_tokens_seen": 127398704, "step": 59090 }, { "epoch": 10.845109194347586, "grad_norm": 23.83293342590332, "learning_rate": 5.135311356623366e-06, "loss": 0.0553, "num_input_tokens_seen": 127409872, "step": 59095 }, { "epoch": 10.84602679390714, "grad_norm": 23.292259216308594, "learning_rate": 5.134510891492474e-06, "loss": 0.0753, "num_input_tokens_seen": 127420688, "step": 59100 }, { "epoch": 10.846944393466691, "grad_norm": 0.3472526967525482, "learning_rate": 5.133710422911584e-06, "loss": 0.1076, "num_input_tokens_seen": 127430640, "step": 59105 }, { "epoch": 10.847861993026243, "grad_norm": 0.5823059678077698, "learning_rate": 5.132909950901231e-06, "loss": 0.0059, "num_input_tokens_seen": 127442160, "step": 59110 }, { "epoch": 10.848779592585796, "grad_norm": 79.42728424072266, "learning_rate": 5.132109475481942e-06, "loss": 0.2356, "num_input_tokens_seen": 127452560, "step": 59115 }, { "epoch": 10.849697192145348, "grad_norm": 18.452402114868164, "learning_rate": 5.1313089966742504e-06, "loss": 0.2824, "num_input_tokens_seen": 127463216, "step": 59120 }, { "epoch": 10.8506147917049, "grad_norm": 0.14996376633644104, "learning_rate": 5.130508514498687e-06, "loss": 0.0862, "num_input_tokens_seen": 127473680, "step": 59125 }, { "epoch": 10.851532391264453, "grad_norm": 0.14735519886016846, "learning_rate": 5.129708028975782e-06, "loss": 0.1065, "num_input_tokens_seen": 127484144, "step": 59130 }, { "epoch": 10.852449990824004, "grad_norm": 28.725502014160156, "learning_rate": 5.128907540126068e-06, "loss": 0.4007, "num_input_tokens_seen": 127494768, "step": 59135 }, { "epoch": 10.853367590383556, "grad_norm": 60.71332550048828, "learning_rate": 5.1281070479700746e-06, "loss": 0.1201, "num_input_tokens_seen": 127504624, "step": 59140 }, { "epoch": 10.85428518994311, "grad_norm": 35.27273941040039, "learning_rate": 5.1273065525283335e-06, "loss": 0.317, "num_input_tokens_seen": 127516368, "step": 59145 }, { "epoch": 10.85520278950266, "grad_norm": 2.708763837814331, "learning_rate": 5.126506053821379e-06, "loss": 0.1257, "num_input_tokens_seen": 127527344, "step": 59150 }, { "epoch": 10.856120389062212, "grad_norm": 0.06396245956420898, "learning_rate": 5.125705551869737e-06, "loss": 0.2565, "num_input_tokens_seen": 127538128, "step": 59155 }, { "epoch": 10.857037988621766, "grad_norm": 0.5734118223190308, "learning_rate": 5.124905046693944e-06, "loss": 0.2127, "num_input_tokens_seen": 127549328, "step": 59160 }, { "epoch": 10.857955588181317, "grad_norm": 0.4977262616157532, "learning_rate": 5.1241045383145295e-06, "loss": 0.5645, "num_input_tokens_seen": 127561136, "step": 59165 }, { "epoch": 10.858873187740869, "grad_norm": 1.479494571685791, "learning_rate": 5.123304026752026e-06, "loss": 0.0117, "num_input_tokens_seen": 127571504, "step": 59170 }, { "epoch": 10.859790787300422, "grad_norm": 0.2595210373401642, "learning_rate": 5.122503512026966e-06, "loss": 0.2475, "num_input_tokens_seen": 127583184, "step": 59175 }, { "epoch": 10.860708386859974, "grad_norm": 61.01392364501953, "learning_rate": 5.121702994159881e-06, "loss": 0.1814, "num_input_tokens_seen": 127593872, "step": 59180 }, { "epoch": 10.861625986419526, "grad_norm": 60.68842697143555, "learning_rate": 5.1209024731713035e-06, "loss": 0.1713, "num_input_tokens_seen": 127603952, "step": 59185 }, { "epoch": 10.862543585979079, "grad_norm": 14.788366317749023, "learning_rate": 5.120101949081763e-06, "loss": 0.2553, "num_input_tokens_seen": 127614064, "step": 59190 }, { "epoch": 10.86346118553863, "grad_norm": 135.18226623535156, "learning_rate": 5.119301421911793e-06, "loss": 0.4742, "num_input_tokens_seen": 127624688, "step": 59195 }, { "epoch": 10.864378785098182, "grad_norm": 6.7270097732543945, "learning_rate": 5.118500891681929e-06, "loss": 0.4543, "num_input_tokens_seen": 127635024, "step": 59200 }, { "epoch": 10.865296384657736, "grad_norm": 1.9865878820419312, "learning_rate": 5.1177003584127e-06, "loss": 0.1808, "num_input_tokens_seen": 127646096, "step": 59205 }, { "epoch": 10.866213984217287, "grad_norm": 10.133636474609375, "learning_rate": 5.116899822124639e-06, "loss": 0.2277, "num_input_tokens_seen": 127657104, "step": 59210 }, { "epoch": 10.867131583776839, "grad_norm": 0.43955761194229126, "learning_rate": 5.116099282838277e-06, "loss": 0.2117, "num_input_tokens_seen": 127667888, "step": 59215 }, { "epoch": 10.868049183336392, "grad_norm": 16.829504013061523, "learning_rate": 5.11529874057415e-06, "loss": 0.2709, "num_input_tokens_seen": 127678768, "step": 59220 }, { "epoch": 10.868966782895944, "grad_norm": 0.6653433442115784, "learning_rate": 5.1144981953527895e-06, "loss": 0.0976, "num_input_tokens_seen": 127690288, "step": 59225 }, { "epoch": 10.869884382455496, "grad_norm": 0.9088265895843506, "learning_rate": 5.113697647194726e-06, "loss": 0.2129, "num_input_tokens_seen": 127701616, "step": 59230 }, { "epoch": 10.870801982015049, "grad_norm": 17.225828170776367, "learning_rate": 5.1128970961204975e-06, "loss": 0.1537, "num_input_tokens_seen": 127712656, "step": 59235 }, { "epoch": 10.8717195815746, "grad_norm": 0.26491108536720276, "learning_rate": 5.1120965421506305e-06, "loss": 0.1808, "num_input_tokens_seen": 127724080, "step": 59240 }, { "epoch": 10.872637181134152, "grad_norm": 5.156569480895996, "learning_rate": 5.111295985305662e-06, "loss": 0.2832, "num_input_tokens_seen": 127734896, "step": 59245 }, { "epoch": 10.873554780693706, "grad_norm": 5.532195091247559, "learning_rate": 5.110495425606124e-06, "loss": 0.0802, "num_input_tokens_seen": 127745648, "step": 59250 }, { "epoch": 10.874472380253257, "grad_norm": 0.7166719436645508, "learning_rate": 5.1096948630725484e-06, "loss": 0.1848, "num_input_tokens_seen": 127755952, "step": 59255 }, { "epoch": 10.875389979812809, "grad_norm": 36.65029525756836, "learning_rate": 5.108894297725472e-06, "loss": 0.3557, "num_input_tokens_seen": 127767312, "step": 59260 }, { "epoch": 10.876307579372362, "grad_norm": 1.1320040225982666, "learning_rate": 5.1080937295854225e-06, "loss": 0.0095, "num_input_tokens_seen": 127777808, "step": 59265 }, { "epoch": 10.877225178931914, "grad_norm": 14.978120803833008, "learning_rate": 5.107293158672939e-06, "loss": 0.2183, "num_input_tokens_seen": 127789712, "step": 59270 }, { "epoch": 10.878142778491465, "grad_norm": 45.13206481933594, "learning_rate": 5.10649258500855e-06, "loss": 0.3584, "num_input_tokens_seen": 127800368, "step": 59275 }, { "epoch": 10.879060378051019, "grad_norm": 8.496565818786621, "learning_rate": 5.105692008612793e-06, "loss": 0.2555, "num_input_tokens_seen": 127810768, "step": 59280 }, { "epoch": 10.87997797761057, "grad_norm": 0.9981994032859802, "learning_rate": 5.104891429506199e-06, "loss": 0.3769, "num_input_tokens_seen": 127820464, "step": 59285 }, { "epoch": 10.880895577170122, "grad_norm": 21.22968292236328, "learning_rate": 5.104090847709302e-06, "loss": 0.1795, "num_input_tokens_seen": 127831280, "step": 59290 }, { "epoch": 10.881813176729676, "grad_norm": 26.405689239501953, "learning_rate": 5.1032902632426375e-06, "loss": 0.0982, "num_input_tokens_seen": 127840720, "step": 59295 }, { "epoch": 10.882730776289227, "grad_norm": 52.08687973022461, "learning_rate": 5.1024896761267366e-06, "loss": 0.1157, "num_input_tokens_seen": 127850096, "step": 59300 }, { "epoch": 10.883648375848779, "grad_norm": 0.8983449339866638, "learning_rate": 5.101689086382134e-06, "loss": 0.3042, "num_input_tokens_seen": 127860880, "step": 59305 }, { "epoch": 10.884565975408332, "grad_norm": 14.156625747680664, "learning_rate": 5.1008884940293655e-06, "loss": 0.2526, "num_input_tokens_seen": 127871024, "step": 59310 }, { "epoch": 10.885483574967884, "grad_norm": 12.477684020996094, "learning_rate": 5.100087899088962e-06, "loss": 0.0391, "num_input_tokens_seen": 127881648, "step": 59315 }, { "epoch": 10.886401174527435, "grad_norm": 0.12642870843410492, "learning_rate": 5.09928730158146e-06, "loss": 0.153, "num_input_tokens_seen": 127893328, "step": 59320 }, { "epoch": 10.887318774086989, "grad_norm": 0.34690749645233154, "learning_rate": 5.098486701527392e-06, "loss": 0.3471, "num_input_tokens_seen": 127904752, "step": 59325 }, { "epoch": 10.88823637364654, "grad_norm": 1.7580832242965698, "learning_rate": 5.097686098947293e-06, "loss": 0.1951, "num_input_tokens_seen": 127915728, "step": 59330 }, { "epoch": 10.889153973206092, "grad_norm": 12.916593551635742, "learning_rate": 5.096885493861698e-06, "loss": 0.155, "num_input_tokens_seen": 127926192, "step": 59335 }, { "epoch": 10.890071572765645, "grad_norm": 121.06604766845703, "learning_rate": 5.096084886291139e-06, "loss": 0.2382, "num_input_tokens_seen": 127936176, "step": 59340 }, { "epoch": 10.890989172325197, "grad_norm": 278.1086730957031, "learning_rate": 5.0952842762561515e-06, "loss": 0.4999, "num_input_tokens_seen": 127947824, "step": 59345 }, { "epoch": 10.891906771884749, "grad_norm": 1.3396186828613281, "learning_rate": 5.094483663777271e-06, "loss": 0.1359, "num_input_tokens_seen": 127959824, "step": 59350 }, { "epoch": 10.892824371444302, "grad_norm": 37.83332824707031, "learning_rate": 5.09368304887503e-06, "loss": 0.277, "num_input_tokens_seen": 127971888, "step": 59355 }, { "epoch": 10.893741971003854, "grad_norm": 217.4571990966797, "learning_rate": 5.0928824315699645e-06, "loss": 0.1761, "num_input_tokens_seen": 127983760, "step": 59360 }, { "epoch": 10.894659570563405, "grad_norm": 0.9426348805427551, "learning_rate": 5.092081811882608e-06, "loss": 0.1581, "num_input_tokens_seen": 127994032, "step": 59365 }, { "epoch": 10.895577170122959, "grad_norm": 78.84291076660156, "learning_rate": 5.091281189833497e-06, "loss": 0.3333, "num_input_tokens_seen": 128005552, "step": 59370 }, { "epoch": 10.89649476968251, "grad_norm": 0.5851572751998901, "learning_rate": 5.090480565443163e-06, "loss": 0.0854, "num_input_tokens_seen": 128015440, "step": 59375 }, { "epoch": 10.897412369242062, "grad_norm": 98.82893371582031, "learning_rate": 5.0896799387321435e-06, "loss": 0.1669, "num_input_tokens_seen": 128026704, "step": 59380 }, { "epoch": 10.898329968801615, "grad_norm": 0.058168429881334305, "learning_rate": 5.088879309720973e-06, "loss": 0.1389, "num_input_tokens_seen": 128038736, "step": 59385 }, { "epoch": 10.899247568361167, "grad_norm": 46.480926513671875, "learning_rate": 5.088078678430186e-06, "loss": 0.2847, "num_input_tokens_seen": 128050480, "step": 59390 }, { "epoch": 10.900165167920719, "grad_norm": 0.7388197779655457, "learning_rate": 5.087278044880317e-06, "loss": 0.1233, "num_input_tokens_seen": 128061072, "step": 59395 }, { "epoch": 10.901082767480272, "grad_norm": 0.16768009960651398, "learning_rate": 5.086477409091902e-06, "loss": 0.4528, "num_input_tokens_seen": 128072208, "step": 59400 }, { "epoch": 10.902000367039824, "grad_norm": 11.614563941955566, "learning_rate": 5.085676771085476e-06, "loss": 0.3613, "num_input_tokens_seen": 128083024, "step": 59405 }, { "epoch": 10.902917966599375, "grad_norm": 0.044762253761291504, "learning_rate": 5.084876130881572e-06, "loss": 0.2806, "num_input_tokens_seen": 128094064, "step": 59410 }, { "epoch": 10.903835566158929, "grad_norm": 0.2689396142959595, "learning_rate": 5.084075488500727e-06, "loss": 0.2247, "num_input_tokens_seen": 128105328, "step": 59415 }, { "epoch": 10.90475316571848, "grad_norm": 22.196760177612305, "learning_rate": 5.0832748439634775e-06, "loss": 0.4126, "num_input_tokens_seen": 128116048, "step": 59420 }, { "epoch": 10.905670765278032, "grad_norm": 216.58169555664062, "learning_rate": 5.082474197290356e-06, "loss": 0.1927, "num_input_tokens_seen": 128125840, "step": 59425 }, { "epoch": 10.906588364837585, "grad_norm": 0.5938689112663269, "learning_rate": 5.081673548501899e-06, "loss": 0.0039, "num_input_tokens_seen": 128137200, "step": 59430 }, { "epoch": 10.907505964397137, "grad_norm": 43.13487243652344, "learning_rate": 5.0808728976186426e-06, "loss": 0.1237, "num_input_tokens_seen": 128148720, "step": 59435 }, { "epoch": 10.908423563956688, "grad_norm": 19.846036911010742, "learning_rate": 5.080072244661121e-06, "loss": 0.1784, "num_input_tokens_seen": 128159024, "step": 59440 }, { "epoch": 10.909341163516242, "grad_norm": 0.7243208885192871, "learning_rate": 5.079271589649872e-06, "loss": 0.0994, "num_input_tokens_seen": 128170608, "step": 59445 }, { "epoch": 10.910258763075793, "grad_norm": 67.6209716796875, "learning_rate": 5.078470932605428e-06, "loss": 0.1514, "num_input_tokens_seen": 128181584, "step": 59450 }, { "epoch": 10.911176362635345, "grad_norm": 0.3203677237033844, "learning_rate": 5.077670273548327e-06, "loss": 0.072, "num_input_tokens_seen": 128193392, "step": 59455 }, { "epoch": 10.912093962194898, "grad_norm": 29.213668823242188, "learning_rate": 5.076869612499105e-06, "loss": 0.4443, "num_input_tokens_seen": 128204272, "step": 59460 }, { "epoch": 10.91301156175445, "grad_norm": 17.560394287109375, "learning_rate": 5.076068949478294e-06, "loss": 0.415, "num_input_tokens_seen": 128215024, "step": 59465 }, { "epoch": 10.913929161314002, "grad_norm": 181.99713134765625, "learning_rate": 5.075268284506435e-06, "loss": 0.417, "num_input_tokens_seen": 128225040, "step": 59470 }, { "epoch": 10.914846760873555, "grad_norm": 73.79377746582031, "learning_rate": 5.07446761760406e-06, "loss": 0.1244, "num_input_tokens_seen": 128235952, "step": 59475 }, { "epoch": 10.915764360433107, "grad_norm": 15.587486267089844, "learning_rate": 5.073666948791706e-06, "loss": 0.1205, "num_input_tokens_seen": 128247312, "step": 59480 }, { "epoch": 10.916681959992658, "grad_norm": 0.06999356299638748, "learning_rate": 5.072866278089908e-06, "loss": 0.2071, "num_input_tokens_seen": 128257328, "step": 59485 }, { "epoch": 10.917599559552212, "grad_norm": 1.1255748271942139, "learning_rate": 5.072065605519203e-06, "loss": 0.3917, "num_input_tokens_seen": 128268976, "step": 59490 }, { "epoch": 10.918517159111763, "grad_norm": 31.16545867919922, "learning_rate": 5.071264931100129e-06, "loss": 0.2965, "num_input_tokens_seen": 128280080, "step": 59495 }, { "epoch": 10.919434758671315, "grad_norm": 0.47352179884910583, "learning_rate": 5.070464254853218e-06, "loss": 0.1891, "num_input_tokens_seen": 128290800, "step": 59500 }, { "epoch": 10.920352358230868, "grad_norm": 1.7473912239074707, "learning_rate": 5.069663576799009e-06, "loss": 0.1383, "num_input_tokens_seen": 128301104, "step": 59505 }, { "epoch": 10.92126995779042, "grad_norm": 142.84693908691406, "learning_rate": 5.068862896958036e-06, "loss": 0.2705, "num_input_tokens_seen": 128312080, "step": 59510 }, { "epoch": 10.922187557349972, "grad_norm": 8.07854175567627, "learning_rate": 5.0680622153508365e-06, "loss": 0.01, "num_input_tokens_seen": 128321808, "step": 59515 }, { "epoch": 10.923105156909525, "grad_norm": 52.40623474121094, "learning_rate": 5.067261531997948e-06, "loss": 0.4531, "num_input_tokens_seen": 128332592, "step": 59520 }, { "epoch": 10.924022756469077, "grad_norm": 3.324554443359375, "learning_rate": 5.066460846919905e-06, "loss": 0.0994, "num_input_tokens_seen": 128343088, "step": 59525 }, { "epoch": 10.924940356028628, "grad_norm": 9.634564399719238, "learning_rate": 5.065660160137245e-06, "loss": 0.2017, "num_input_tokens_seen": 128353040, "step": 59530 }, { "epoch": 10.925857955588182, "grad_norm": 30.97278594970703, "learning_rate": 5.0648594716705024e-06, "loss": 0.0891, "num_input_tokens_seen": 128363664, "step": 59535 }, { "epoch": 10.926775555147733, "grad_norm": 49.156856536865234, "learning_rate": 5.0640587815402145e-06, "loss": 0.013, "num_input_tokens_seen": 128375152, "step": 59540 }, { "epoch": 10.927693154707285, "grad_norm": 212.8072967529297, "learning_rate": 5.063258089766919e-06, "loss": 0.1609, "num_input_tokens_seen": 128384752, "step": 59545 }, { "epoch": 10.928610754266838, "grad_norm": 33.50523376464844, "learning_rate": 5.062457396371151e-06, "loss": 0.224, "num_input_tokens_seen": 128396176, "step": 59550 }, { "epoch": 10.92952835382639, "grad_norm": 43.87344741821289, "learning_rate": 5.061656701373449e-06, "loss": 0.1047, "num_input_tokens_seen": 128407952, "step": 59555 }, { "epoch": 10.930445953385941, "grad_norm": 2.2942259311676025, "learning_rate": 5.060856004794347e-06, "loss": 0.0281, "num_input_tokens_seen": 128419088, "step": 59560 }, { "epoch": 10.931363552945495, "grad_norm": 31.199174880981445, "learning_rate": 5.060055306654383e-06, "loss": 0.366, "num_input_tokens_seen": 128428272, "step": 59565 }, { "epoch": 10.932281152505047, "grad_norm": 1.9093081951141357, "learning_rate": 5.0592546069740945e-06, "loss": 0.2183, "num_input_tokens_seen": 128439152, "step": 59570 }, { "epoch": 10.933198752064598, "grad_norm": 0.16285163164138794, "learning_rate": 5.058453905774015e-06, "loss": 0.1723, "num_input_tokens_seen": 128450032, "step": 59575 }, { "epoch": 10.934116351624152, "grad_norm": 0.1797659546136856, "learning_rate": 5.057653203074686e-06, "loss": 0.008, "num_input_tokens_seen": 128460080, "step": 59580 }, { "epoch": 10.935033951183703, "grad_norm": 0.044786449521780014, "learning_rate": 5.0568524988966395e-06, "loss": 0.1089, "num_input_tokens_seen": 128471760, "step": 59585 }, { "epoch": 10.935951550743255, "grad_norm": 16.929006576538086, "learning_rate": 5.056051793260416e-06, "loss": 0.3154, "num_input_tokens_seen": 128484048, "step": 59590 }, { "epoch": 10.936869150302808, "grad_norm": 0.1914278268814087, "learning_rate": 5.05525108618655e-06, "loss": 0.2414, "num_input_tokens_seen": 128493808, "step": 59595 }, { "epoch": 10.93778674986236, "grad_norm": 27.72692108154297, "learning_rate": 5.054450377695579e-06, "loss": 0.1503, "num_input_tokens_seen": 128504016, "step": 59600 }, { "epoch": 10.938704349421911, "grad_norm": 18.382434844970703, "learning_rate": 5.053649667808041e-06, "loss": 0.1929, "num_input_tokens_seen": 128515376, "step": 59605 }, { "epoch": 10.939621948981465, "grad_norm": 0.39157411456108093, "learning_rate": 5.052848956544471e-06, "loss": 0.0952, "num_input_tokens_seen": 128525904, "step": 59610 }, { "epoch": 10.940539548541016, "grad_norm": 0.09210263937711716, "learning_rate": 5.05204824392541e-06, "loss": 0.3897, "num_input_tokens_seen": 128536720, "step": 59615 }, { "epoch": 10.941457148100568, "grad_norm": 1.211108922958374, "learning_rate": 5.051247529971388e-06, "loss": 0.1379, "num_input_tokens_seen": 128548944, "step": 59620 }, { "epoch": 10.942374747660121, "grad_norm": 16.129152297973633, "learning_rate": 5.050446814702948e-06, "loss": 0.2196, "num_input_tokens_seen": 128559792, "step": 59625 }, { "epoch": 10.943292347219673, "grad_norm": 14.947534561157227, "learning_rate": 5.049646098140627e-06, "loss": 0.1139, "num_input_tokens_seen": 128570768, "step": 59630 }, { "epoch": 10.944209946779225, "grad_norm": 8.13005256652832, "learning_rate": 5.048845380304959e-06, "loss": 0.2863, "num_input_tokens_seen": 128580400, "step": 59635 }, { "epoch": 10.945127546338778, "grad_norm": 0.11663145571947098, "learning_rate": 5.048044661216484e-06, "loss": 0.1595, "num_input_tokens_seen": 128591568, "step": 59640 }, { "epoch": 10.94604514589833, "grad_norm": 0.5744967460632324, "learning_rate": 5.047243940895736e-06, "loss": 0.0173, "num_input_tokens_seen": 128601072, "step": 59645 }, { "epoch": 10.946962745457881, "grad_norm": 29.569141387939453, "learning_rate": 5.046443219363255e-06, "loss": 0.2888, "num_input_tokens_seen": 128611248, "step": 59650 }, { "epoch": 10.947880345017435, "grad_norm": 0.7525213360786438, "learning_rate": 5.045642496639578e-06, "loss": 0.1022, "num_input_tokens_seen": 128621680, "step": 59655 }, { "epoch": 10.948797944576986, "grad_norm": 0.7076815366744995, "learning_rate": 5.044841772745241e-06, "loss": 0.185, "num_input_tokens_seen": 128631472, "step": 59660 }, { "epoch": 10.949715544136538, "grad_norm": 0.31256726384162903, "learning_rate": 5.044041047700783e-06, "loss": 0.2648, "num_input_tokens_seen": 128641072, "step": 59665 }, { "epoch": 10.950633143696091, "grad_norm": 0.7147632241249084, "learning_rate": 5.043240321526739e-06, "loss": 0.1965, "num_input_tokens_seen": 128650640, "step": 59670 }, { "epoch": 10.951550743255643, "grad_norm": 59.49819564819336, "learning_rate": 5.042439594243649e-06, "loss": 0.3597, "num_input_tokens_seen": 128661616, "step": 59675 }, { "epoch": 10.952468342815195, "grad_norm": 35.30986022949219, "learning_rate": 5.041638865872048e-06, "loss": 0.2876, "num_input_tokens_seen": 128671312, "step": 59680 }, { "epoch": 10.953385942374748, "grad_norm": 147.6184844970703, "learning_rate": 5.040838136432475e-06, "loss": 0.3137, "num_input_tokens_seen": 128681584, "step": 59685 }, { "epoch": 10.9543035419343, "grad_norm": 90.59898376464844, "learning_rate": 5.040037405945468e-06, "loss": 0.4865, "num_input_tokens_seen": 128692784, "step": 59690 }, { "epoch": 10.955221141493851, "grad_norm": 34.295387268066406, "learning_rate": 5.039236674431562e-06, "loss": 0.0719, "num_input_tokens_seen": 128704592, "step": 59695 }, { "epoch": 10.956138741053405, "grad_norm": 21.854787826538086, "learning_rate": 5.038435941911297e-06, "loss": 0.9268, "num_input_tokens_seen": 128715600, "step": 59700 }, { "epoch": 10.957056340612956, "grad_norm": 0.2155952900648117, "learning_rate": 5.03763520840521e-06, "loss": 0.1139, "num_input_tokens_seen": 128725136, "step": 59705 }, { "epoch": 10.957973940172508, "grad_norm": 84.527587890625, "learning_rate": 5.036834473933838e-06, "loss": 0.2454, "num_input_tokens_seen": 128736944, "step": 59710 }, { "epoch": 10.958891539732061, "grad_norm": 83.09577941894531, "learning_rate": 5.036033738517719e-06, "loss": 0.1938, "num_input_tokens_seen": 128748112, "step": 59715 }, { "epoch": 10.959809139291613, "grad_norm": 24.99148178100586, "learning_rate": 5.03523300217739e-06, "loss": 0.2528, "num_input_tokens_seen": 128759728, "step": 59720 }, { "epoch": 10.960726738851164, "grad_norm": 0.23160545527935028, "learning_rate": 5.03443226493339e-06, "loss": 0.1947, "num_input_tokens_seen": 128769968, "step": 59725 }, { "epoch": 10.961644338410718, "grad_norm": 1.6795152425765991, "learning_rate": 5.033631526806254e-06, "loss": 0.0364, "num_input_tokens_seen": 128781392, "step": 59730 }, { "epoch": 10.96256193797027, "grad_norm": 0.03037402406334877, "learning_rate": 5.032830787816523e-06, "loss": 0.279, "num_input_tokens_seen": 128791952, "step": 59735 }, { "epoch": 10.963479537529821, "grad_norm": 16.974029541015625, "learning_rate": 5.032030047984733e-06, "loss": 0.205, "num_input_tokens_seen": 128802512, "step": 59740 }, { "epoch": 10.964397137089374, "grad_norm": 1.1098328828811646, "learning_rate": 5.031229307331421e-06, "loss": 0.1741, "num_input_tokens_seen": 128811856, "step": 59745 }, { "epoch": 10.965314736648926, "grad_norm": 19.964792251586914, "learning_rate": 5.030428565877127e-06, "loss": 0.3454, "num_input_tokens_seen": 128823440, "step": 59750 }, { "epoch": 10.966232336208478, "grad_norm": 0.3425499498844147, "learning_rate": 5.0296278236423855e-06, "loss": 0.0997, "num_input_tokens_seen": 128833872, "step": 59755 }, { "epoch": 10.967149935768031, "grad_norm": 69.0265121459961, "learning_rate": 5.028827080647738e-06, "loss": 0.1897, "num_input_tokens_seen": 128844816, "step": 59760 }, { "epoch": 10.968067535327583, "grad_norm": 29.730497360229492, "learning_rate": 5.0280263369137205e-06, "loss": 0.1143, "num_input_tokens_seen": 128856688, "step": 59765 }, { "epoch": 10.968985134887134, "grad_norm": 0.32171276211738586, "learning_rate": 5.027225592460869e-06, "loss": 0.1317, "num_input_tokens_seen": 128866416, "step": 59770 }, { "epoch": 10.969902734446688, "grad_norm": 67.30635070800781, "learning_rate": 5.026424847309725e-06, "loss": 0.2487, "num_input_tokens_seen": 128876592, "step": 59775 }, { "epoch": 10.97082033400624, "grad_norm": 1.7418925762176514, "learning_rate": 5.025624101480826e-06, "loss": 0.0027, "num_input_tokens_seen": 128888304, "step": 59780 }, { "epoch": 10.971737933565791, "grad_norm": 0.15579527616500854, "learning_rate": 5.024823354994707e-06, "loss": 0.2796, "num_input_tokens_seen": 128899056, "step": 59785 }, { "epoch": 10.972655533125344, "grad_norm": 31.51556968688965, "learning_rate": 5.024022607871907e-06, "loss": 0.1788, "num_input_tokens_seen": 128908816, "step": 59790 }, { "epoch": 10.973573132684896, "grad_norm": 43.800636291503906, "learning_rate": 5.023221860132964e-06, "loss": 0.5204, "num_input_tokens_seen": 128918928, "step": 59795 }, { "epoch": 10.974490732244448, "grad_norm": 2.774991512298584, "learning_rate": 5.022421111798418e-06, "loss": 0.0293, "num_input_tokens_seen": 128930992, "step": 59800 }, { "epoch": 10.975408331804001, "grad_norm": 0.44914260506629944, "learning_rate": 5.021620362888803e-06, "loss": 0.0737, "num_input_tokens_seen": 128942288, "step": 59805 }, { "epoch": 10.976325931363553, "grad_norm": 0.10163122415542603, "learning_rate": 5.02081961342466e-06, "loss": 0.0866, "num_input_tokens_seen": 128952624, "step": 59810 }, { "epoch": 10.977243530923104, "grad_norm": 43.66987228393555, "learning_rate": 5.0200188634265265e-06, "loss": 0.2983, "num_input_tokens_seen": 128963120, "step": 59815 }, { "epoch": 10.978161130482658, "grad_norm": 40.8248291015625, "learning_rate": 5.019218112914939e-06, "loss": 0.3311, "num_input_tokens_seen": 128973520, "step": 59820 }, { "epoch": 10.97907873004221, "grad_norm": 59.143497467041016, "learning_rate": 5.018417361910439e-06, "loss": 0.2973, "num_input_tokens_seen": 128984528, "step": 59825 }, { "epoch": 10.97999632960176, "grad_norm": 73.98495483398438, "learning_rate": 5.017616610433561e-06, "loss": 0.2975, "num_input_tokens_seen": 128994448, "step": 59830 }, { "epoch": 10.980913929161314, "grad_norm": 0.11711196601390839, "learning_rate": 5.016815858504844e-06, "loss": 0.107, "num_input_tokens_seen": 129005456, "step": 59835 }, { "epoch": 10.981831528720866, "grad_norm": 128.47393798828125, "learning_rate": 5.016015106144827e-06, "loss": 0.1362, "num_input_tokens_seen": 129016688, "step": 59840 }, { "epoch": 10.982749128280417, "grad_norm": 0.3240130841732025, "learning_rate": 5.015214353374046e-06, "loss": 0.2598, "num_input_tokens_seen": 129028048, "step": 59845 }, { "epoch": 10.983666727839971, "grad_norm": 42.25433349609375, "learning_rate": 5.014413600213043e-06, "loss": 0.2523, "num_input_tokens_seen": 129038896, "step": 59850 }, { "epoch": 10.984584327399523, "grad_norm": 26.29018211364746, "learning_rate": 5.013612846682351e-06, "loss": 0.3009, "num_input_tokens_seen": 129050128, "step": 59855 }, { "epoch": 10.985501926959074, "grad_norm": 28.94146156311035, "learning_rate": 5.01281209280251e-06, "loss": 0.2347, "num_input_tokens_seen": 129061136, "step": 59860 }, { "epoch": 10.986419526518628, "grad_norm": 30.521541595458984, "learning_rate": 5.012011338594061e-06, "loss": 0.3796, "num_input_tokens_seen": 129071088, "step": 59865 }, { "epoch": 10.98733712607818, "grad_norm": 217.93527221679688, "learning_rate": 5.011210584077538e-06, "loss": 0.31, "num_input_tokens_seen": 129081936, "step": 59870 }, { "epoch": 10.98825472563773, "grad_norm": 1.883720874786377, "learning_rate": 5.010409829273483e-06, "loss": 0.1715, "num_input_tokens_seen": 129094160, "step": 59875 }, { "epoch": 10.989172325197284, "grad_norm": 0.7415862083435059, "learning_rate": 5.009609074202431e-06, "loss": 0.2748, "num_input_tokens_seen": 129105104, "step": 59880 }, { "epoch": 10.990089924756836, "grad_norm": 0.9071094393730164, "learning_rate": 5.008808318884921e-06, "loss": 0.1537, "num_input_tokens_seen": 129115184, "step": 59885 }, { "epoch": 10.991007524316387, "grad_norm": 148.06529235839844, "learning_rate": 5.008007563341491e-06, "loss": 0.2873, "num_input_tokens_seen": 129126448, "step": 59890 }, { "epoch": 10.99192512387594, "grad_norm": 27.95200538635254, "learning_rate": 5.007206807592679e-06, "loss": 0.0388, "num_input_tokens_seen": 129138416, "step": 59895 }, { "epoch": 10.992842723435492, "grad_norm": 0.3589503765106201, "learning_rate": 5.006406051659025e-06, "loss": 0.279, "num_input_tokens_seen": 129147824, "step": 59900 }, { "epoch": 10.993760322995044, "grad_norm": 29.14742660522461, "learning_rate": 5.005605295561065e-06, "loss": 0.2127, "num_input_tokens_seen": 129157360, "step": 59905 }, { "epoch": 10.994677922554597, "grad_norm": 0.5721133947372437, "learning_rate": 5.004804539319338e-06, "loss": 0.0156, "num_input_tokens_seen": 129168272, "step": 59910 }, { "epoch": 10.995595522114149, "grad_norm": 191.25390625, "learning_rate": 5.004003782954382e-06, "loss": 0.627, "num_input_tokens_seen": 129178928, "step": 59915 }, { "epoch": 10.996513121673702, "grad_norm": 5.216413974761963, "learning_rate": 5.0032030264867335e-06, "loss": 0.3135, "num_input_tokens_seen": 129188560, "step": 59920 }, { "epoch": 10.997430721233254, "grad_norm": 1.143853783607483, "learning_rate": 5.002402269936935e-06, "loss": 0.1995, "num_input_tokens_seen": 129199536, "step": 59925 }, { "epoch": 10.998348320792806, "grad_norm": 81.62122344970703, "learning_rate": 5.00160151332552e-06, "loss": 0.2, "num_input_tokens_seen": 129211184, "step": 59930 }, { "epoch": 10.999265920352359, "grad_norm": 0.39607611298561096, "learning_rate": 5.00080075667303e-06, "loss": 0.0749, "num_input_tokens_seen": 129222000, "step": 59935 }, { "epoch": 11.00018351991191, "grad_norm": 94.28694152832031, "learning_rate": 5e-06, "loss": 0.3048, "num_input_tokens_seen": 129230000, "step": 59940 }, { "epoch": 11.001101119471462, "grad_norm": 25.183168411254883, "learning_rate": 4.999199243326973e-06, "loss": 0.1291, "num_input_tokens_seen": 129239792, "step": 59945 }, { "epoch": 11.002018719031016, "grad_norm": 69.17977142333984, "learning_rate": 4.9983984866744806e-06, "loss": 0.3799, "num_input_tokens_seen": 129249968, "step": 59950 }, { "epoch": 11.002936318590567, "grad_norm": 3.1018476486206055, "learning_rate": 4.997597730063067e-06, "loss": 0.0673, "num_input_tokens_seen": 129260752, "step": 59955 }, { "epoch": 11.003853918150119, "grad_norm": 0.25326627492904663, "learning_rate": 4.996796973513267e-06, "loss": 0.0043, "num_input_tokens_seen": 129271024, "step": 59960 }, { "epoch": 11.004771517709672, "grad_norm": 0.5971193909645081, "learning_rate": 4.9959962170456215e-06, "loss": 0.013, "num_input_tokens_seen": 129281840, "step": 59965 }, { "epoch": 11.005689117269224, "grad_norm": 0.33765774965286255, "learning_rate": 4.995195460680663e-06, "loss": 0.2067, "num_input_tokens_seen": 129292880, "step": 59970 }, { "epoch": 11.006606716828776, "grad_norm": 1.527826189994812, "learning_rate": 4.994394704438936e-06, "loss": 0.1889, "num_input_tokens_seen": 129304016, "step": 59975 }, { "epoch": 11.007524316388329, "grad_norm": 0.33818385004997253, "learning_rate": 4.993593948340977e-06, "loss": 0.174, "num_input_tokens_seen": 129314928, "step": 59980 }, { "epoch": 11.00844191594788, "grad_norm": 0.061991702765226364, "learning_rate": 4.992793192407322e-06, "loss": 0.1033, "num_input_tokens_seen": 129325872, "step": 59985 }, { "epoch": 11.009359515507432, "grad_norm": 67.38846588134766, "learning_rate": 4.9919924366585096e-06, "loss": 0.0774, "num_input_tokens_seen": 129337040, "step": 59990 }, { "epoch": 11.010277115066986, "grad_norm": 0.09005719423294067, "learning_rate": 4.991191681115081e-06, "loss": 0.1613, "num_input_tokens_seen": 129347152, "step": 59995 }, { "epoch": 11.011194714626537, "grad_norm": 0.12299671769142151, "learning_rate": 4.990390925797569e-06, "loss": 0.1835, "num_input_tokens_seen": 129358512, "step": 60000 }, { "epoch": 11.012112314186089, "grad_norm": 10.395956039428711, "learning_rate": 4.989590170726518e-06, "loss": 0.1207, "num_input_tokens_seen": 129369040, "step": 60005 }, { "epoch": 11.013029913745642, "grad_norm": 86.55232238769531, "learning_rate": 4.988789415922463e-06, "loss": 0.2097, "num_input_tokens_seen": 129380176, "step": 60010 }, { "epoch": 11.013947513305194, "grad_norm": 0.9147047996520996, "learning_rate": 4.987988661405941e-06, "loss": 0.0248, "num_input_tokens_seen": 129390576, "step": 60015 }, { "epoch": 11.014865112864745, "grad_norm": 0.1959591656923294, "learning_rate": 4.98718790719749e-06, "loss": 0.0836, "num_input_tokens_seen": 129400784, "step": 60020 }, { "epoch": 11.015782712424299, "grad_norm": 0.10758606344461441, "learning_rate": 4.986387153317651e-06, "loss": 0.1297, "num_input_tokens_seen": 129411728, "step": 60025 }, { "epoch": 11.01670031198385, "grad_norm": 0.3532769978046417, "learning_rate": 4.98558639978696e-06, "loss": 0.2057, "num_input_tokens_seen": 129422224, "step": 60030 }, { "epoch": 11.017617911543402, "grad_norm": 0.1494893580675125, "learning_rate": 4.984785646625954e-06, "loss": 0.017, "num_input_tokens_seen": 129432944, "step": 60035 }, { "epoch": 11.018535511102955, "grad_norm": 0.32020604610443115, "learning_rate": 4.983984893855174e-06, "loss": 0.0423, "num_input_tokens_seen": 129443920, "step": 60040 }, { "epoch": 11.019453110662507, "grad_norm": 44.783504486083984, "learning_rate": 4.983184141495158e-06, "loss": 0.0969, "num_input_tokens_seen": 129453072, "step": 60045 }, { "epoch": 11.020370710222059, "grad_norm": 19.862167358398438, "learning_rate": 4.9823833895664394e-06, "loss": 0.0504, "num_input_tokens_seen": 129464176, "step": 60050 }, { "epoch": 11.021288309781612, "grad_norm": 0.4175940155982971, "learning_rate": 4.9815826380895625e-06, "loss": 0.1143, "num_input_tokens_seen": 129474928, "step": 60055 }, { "epoch": 11.022205909341164, "grad_norm": 180.0690460205078, "learning_rate": 4.9807818870850614e-06, "loss": 0.4607, "num_input_tokens_seen": 129486896, "step": 60060 }, { "epoch": 11.023123508900715, "grad_norm": 0.21568216383457184, "learning_rate": 4.979981136573476e-06, "loss": 0.1234, "num_input_tokens_seen": 129497872, "step": 60065 }, { "epoch": 11.024041108460269, "grad_norm": 57.12556457519531, "learning_rate": 4.97918038657534e-06, "loss": 0.0358, "num_input_tokens_seen": 129509008, "step": 60070 }, { "epoch": 11.02495870801982, "grad_norm": 0.04488819092512131, "learning_rate": 4.978379637111198e-06, "loss": 0.0015, "num_input_tokens_seen": 129520592, "step": 60075 }, { "epoch": 11.025876307579372, "grad_norm": 1.6383144855499268, "learning_rate": 4.9775788882015854e-06, "loss": 0.0043, "num_input_tokens_seen": 129531024, "step": 60080 }, { "epoch": 11.026793907138925, "grad_norm": 0.04093346744775772, "learning_rate": 4.976778139867037e-06, "loss": 0.0075, "num_input_tokens_seen": 129541712, "step": 60085 }, { "epoch": 11.027711506698477, "grad_norm": 2.471971035003662, "learning_rate": 4.975977392128095e-06, "loss": 0.171, "num_input_tokens_seen": 129553264, "step": 60090 }, { "epoch": 11.028629106258029, "grad_norm": 0.03006277233362198, "learning_rate": 4.975176645005295e-06, "loss": 0.1033, "num_input_tokens_seen": 129565456, "step": 60095 }, { "epoch": 11.029546705817582, "grad_norm": 0.5162056088447571, "learning_rate": 4.974375898519177e-06, "loss": 0.3515, "num_input_tokens_seen": 129575184, "step": 60100 }, { "epoch": 11.030464305377134, "grad_norm": 0.20797351002693176, "learning_rate": 4.973575152690276e-06, "loss": 0.241, "num_input_tokens_seen": 129588048, "step": 60105 }, { "epoch": 11.031381904936685, "grad_norm": 0.09651530534029007, "learning_rate": 4.9727744075391315e-06, "loss": 0.1357, "num_input_tokens_seen": 129598480, "step": 60110 }, { "epoch": 11.032299504496239, "grad_norm": 0.05221414938569069, "learning_rate": 4.971973663086281e-06, "loss": 0.1345, "num_input_tokens_seen": 129610128, "step": 60115 }, { "epoch": 11.03321710405579, "grad_norm": 0.06525302678346634, "learning_rate": 4.971172919352263e-06, "loss": 0.3054, "num_input_tokens_seen": 129622544, "step": 60120 }, { "epoch": 11.034134703615342, "grad_norm": 0.023301731795072556, "learning_rate": 4.970372176357615e-06, "loss": 0.1326, "num_input_tokens_seen": 129632432, "step": 60125 }, { "epoch": 11.035052303174895, "grad_norm": 9.12221908569336, "learning_rate": 4.969571434122876e-06, "loss": 0.0157, "num_input_tokens_seen": 129642992, "step": 60130 }, { "epoch": 11.035969902734447, "grad_norm": 3.285532236099243, "learning_rate": 4.968770692668579e-06, "loss": 0.1144, "num_input_tokens_seen": 129653680, "step": 60135 }, { "epoch": 11.036887502293999, "grad_norm": 73.02664947509766, "learning_rate": 4.967969952015269e-06, "loss": 0.2722, "num_input_tokens_seen": 129663664, "step": 60140 }, { "epoch": 11.037805101853552, "grad_norm": 0.07009148597717285, "learning_rate": 4.9671692121834785e-06, "loss": 0.1469, "num_input_tokens_seen": 129674192, "step": 60145 }, { "epoch": 11.038722701413104, "grad_norm": 80.1358642578125, "learning_rate": 4.966368473193748e-06, "loss": 0.1285, "num_input_tokens_seen": 129683920, "step": 60150 }, { "epoch": 11.039640300972655, "grad_norm": 15.849618911743164, "learning_rate": 4.965567735066611e-06, "loss": 0.2188, "num_input_tokens_seen": 129695856, "step": 60155 }, { "epoch": 11.040557900532209, "grad_norm": 76.50403594970703, "learning_rate": 4.964766997822611e-06, "loss": 0.3293, "num_input_tokens_seen": 129706576, "step": 60160 }, { "epoch": 11.04147550009176, "grad_norm": 37.75981903076172, "learning_rate": 4.963966261482283e-06, "loss": 0.0989, "num_input_tokens_seen": 129716848, "step": 60165 }, { "epoch": 11.042393099651312, "grad_norm": 5.879606246948242, "learning_rate": 4.9631655260661624e-06, "loss": 0.33, "num_input_tokens_seen": 129728048, "step": 60170 }, { "epoch": 11.043310699210865, "grad_norm": 22.950246810913086, "learning_rate": 4.962364791594791e-06, "loss": 0.0239, "num_input_tokens_seen": 129738576, "step": 60175 }, { "epoch": 11.044228298770417, "grad_norm": 20.705581665039062, "learning_rate": 4.961564058088705e-06, "loss": 0.3663, "num_input_tokens_seen": 129748592, "step": 60180 }, { "epoch": 11.045145898329968, "grad_norm": 0.2571945786476135, "learning_rate": 4.960763325568438e-06, "loss": 0.2154, "num_input_tokens_seen": 129759024, "step": 60185 }, { "epoch": 11.046063497889522, "grad_norm": 19.79087257385254, "learning_rate": 4.959962594054533e-06, "loss": 0.2475, "num_input_tokens_seen": 129769872, "step": 60190 }, { "epoch": 11.046981097449073, "grad_norm": 26.088287353515625, "learning_rate": 4.959161863567526e-06, "loss": 0.1469, "num_input_tokens_seen": 129780368, "step": 60195 }, { "epoch": 11.047898697008625, "grad_norm": 65.77926635742188, "learning_rate": 4.958361134127953e-06, "loss": 0.1124, "num_input_tokens_seen": 129791280, "step": 60200 }, { "epoch": 11.048816296568178, "grad_norm": 1.9337400197982788, "learning_rate": 4.957560405756352e-06, "loss": 0.1201, "num_input_tokens_seen": 129802960, "step": 60205 }, { "epoch": 11.04973389612773, "grad_norm": 11.904644966125488, "learning_rate": 4.956759678473263e-06, "loss": 0.1216, "num_input_tokens_seen": 129812944, "step": 60210 }, { "epoch": 11.050651495687282, "grad_norm": 0.20433460175991058, "learning_rate": 4.955958952299219e-06, "loss": 0.2105, "num_input_tokens_seen": 129824624, "step": 60215 }, { "epoch": 11.051569095246835, "grad_norm": 166.44032287597656, "learning_rate": 4.95515822725476e-06, "loss": 0.2758, "num_input_tokens_seen": 129836464, "step": 60220 }, { "epoch": 11.052486694806387, "grad_norm": 2.164182424545288, "learning_rate": 4.954357503360424e-06, "loss": 0.0142, "num_input_tokens_seen": 129846704, "step": 60225 }, { "epoch": 11.053404294365938, "grad_norm": 0.21277081966400146, "learning_rate": 4.953556780636747e-06, "loss": 0.0043, "num_input_tokens_seen": 129858288, "step": 60230 }, { "epoch": 11.054321893925492, "grad_norm": 102.18009185791016, "learning_rate": 4.952756059104265e-06, "loss": 0.0323, "num_input_tokens_seen": 129867696, "step": 60235 }, { "epoch": 11.055239493485043, "grad_norm": 0.530479907989502, "learning_rate": 4.951955338783518e-06, "loss": 0.0096, "num_input_tokens_seen": 129879824, "step": 60240 }, { "epoch": 11.056157093044595, "grad_norm": 34.47114562988281, "learning_rate": 4.951154619695043e-06, "loss": 0.1052, "num_input_tokens_seen": 129890320, "step": 60245 }, { "epoch": 11.057074692604148, "grad_norm": 14.670193672180176, "learning_rate": 4.9503539018593755e-06, "loss": 0.137, "num_input_tokens_seen": 129899504, "step": 60250 }, { "epoch": 11.0579922921637, "grad_norm": 22.92082405090332, "learning_rate": 4.949553185297052e-06, "loss": 0.26, "num_input_tokens_seen": 129910416, "step": 60255 }, { "epoch": 11.058909891723252, "grad_norm": 0.019077323377132416, "learning_rate": 4.9487524700286125e-06, "loss": 0.2424, "num_input_tokens_seen": 129922192, "step": 60260 }, { "epoch": 11.059827491282805, "grad_norm": 0.012131175957620144, "learning_rate": 4.947951756074594e-06, "loss": 0.0535, "num_input_tokens_seen": 129933072, "step": 60265 }, { "epoch": 11.060745090842357, "grad_norm": 88.05252075195312, "learning_rate": 4.9471510434555295e-06, "loss": 0.2943, "num_input_tokens_seen": 129944304, "step": 60270 }, { "epoch": 11.061662690401908, "grad_norm": 0.092357337474823, "learning_rate": 4.94635033219196e-06, "loss": 0.0359, "num_input_tokens_seen": 129954608, "step": 60275 }, { "epoch": 11.062580289961462, "grad_norm": 168.7581787109375, "learning_rate": 4.945549622304422e-06, "loss": 0.2596, "num_input_tokens_seen": 129964176, "step": 60280 }, { "epoch": 11.063497889521013, "grad_norm": 91.9985580444336, "learning_rate": 4.944748913813453e-06, "loss": 0.1415, "num_input_tokens_seen": 129974672, "step": 60285 }, { "epoch": 11.064415489080565, "grad_norm": 15.37507438659668, "learning_rate": 4.943948206739586e-06, "loss": 0.3307, "num_input_tokens_seen": 129986256, "step": 60290 }, { "epoch": 11.065333088640118, "grad_norm": 0.059879425913095474, "learning_rate": 4.943147501103362e-06, "loss": 0.001, "num_input_tokens_seen": 129996592, "step": 60295 }, { "epoch": 11.06625068819967, "grad_norm": 0.27400997281074524, "learning_rate": 4.942346796925317e-06, "loss": 0.1228, "num_input_tokens_seen": 130007568, "step": 60300 }, { "epoch": 11.067168287759221, "grad_norm": 4.248898506164551, "learning_rate": 4.941546094225986e-06, "loss": 0.0029, "num_input_tokens_seen": 130017616, "step": 60305 }, { "epoch": 11.068085887318775, "grad_norm": 19.461956024169922, "learning_rate": 4.940745393025907e-06, "loss": 0.2994, "num_input_tokens_seen": 130028432, "step": 60310 }, { "epoch": 11.069003486878326, "grad_norm": 63.04201126098633, "learning_rate": 4.939944693345618e-06, "loss": 0.2322, "num_input_tokens_seen": 130039728, "step": 60315 }, { "epoch": 11.069921086437878, "grad_norm": 0.2147897332906723, "learning_rate": 4.939143995205654e-06, "loss": 0.1855, "num_input_tokens_seen": 130050768, "step": 60320 }, { "epoch": 11.070838685997431, "grad_norm": 119.29681396484375, "learning_rate": 4.938343298626552e-06, "loss": 0.0597, "num_input_tokens_seen": 130062544, "step": 60325 }, { "epoch": 11.071756285556983, "grad_norm": 0.2250010222196579, "learning_rate": 4.93754260362885e-06, "loss": 0.0889, "num_input_tokens_seen": 130072624, "step": 60330 }, { "epoch": 11.072673885116535, "grad_norm": 0.8769525289535522, "learning_rate": 4.936741910233082e-06, "loss": 0.2382, "num_input_tokens_seen": 130082768, "step": 60335 }, { "epoch": 11.073591484676088, "grad_norm": 0.30838993191719055, "learning_rate": 4.935941218459786e-06, "loss": 0.0037, "num_input_tokens_seen": 130093488, "step": 60340 }, { "epoch": 11.07450908423564, "grad_norm": 95.58515167236328, "learning_rate": 4.935140528329499e-06, "loss": 0.0551, "num_input_tokens_seen": 130103600, "step": 60345 }, { "epoch": 11.075426683795191, "grad_norm": 199.08758544921875, "learning_rate": 4.934339839862758e-06, "loss": 0.2333, "num_input_tokens_seen": 130115536, "step": 60350 }, { "epoch": 11.076344283354745, "grad_norm": 11.233996391296387, "learning_rate": 4.933539153080095e-06, "loss": 0.3238, "num_input_tokens_seen": 130126512, "step": 60355 }, { "epoch": 11.077261882914296, "grad_norm": 0.031649086624383926, "learning_rate": 4.932738468002053e-06, "loss": 0.0065, "num_input_tokens_seen": 130138288, "step": 60360 }, { "epoch": 11.078179482473848, "grad_norm": 138.84811401367188, "learning_rate": 4.931937784649164e-06, "loss": 0.1097, "num_input_tokens_seen": 130148496, "step": 60365 }, { "epoch": 11.079097082033401, "grad_norm": 0.7079049944877625, "learning_rate": 4.931137103041964e-06, "loss": 0.1124, "num_input_tokens_seen": 130159760, "step": 60370 }, { "epoch": 11.080014681592953, "grad_norm": 1.4302831888198853, "learning_rate": 4.930336423200993e-06, "loss": 0.044, "num_input_tokens_seen": 130170832, "step": 60375 }, { "epoch": 11.080932281152505, "grad_norm": 0.046020667999982834, "learning_rate": 4.929535745146784e-06, "loss": 0.1258, "num_input_tokens_seen": 130182864, "step": 60380 }, { "epoch": 11.081849880712058, "grad_norm": 27.826066970825195, "learning_rate": 4.928735068899874e-06, "loss": 0.0711, "num_input_tokens_seen": 130194064, "step": 60385 }, { "epoch": 11.08276748027161, "grad_norm": 0.26012685894966125, "learning_rate": 4.927934394480797e-06, "loss": 0.0817, "num_input_tokens_seen": 130203920, "step": 60390 }, { "epoch": 11.083685079831161, "grad_norm": 9.144523620605469, "learning_rate": 4.927133721910093e-06, "loss": 0.2822, "num_input_tokens_seen": 130215344, "step": 60395 }, { "epoch": 11.084602679390715, "grad_norm": 3.2421107292175293, "learning_rate": 4.926333051208297e-06, "loss": 0.0766, "num_input_tokens_seen": 130226448, "step": 60400 }, { "epoch": 11.085520278950266, "grad_norm": 1.448570728302002, "learning_rate": 4.925532382395941e-06, "loss": 0.2891, "num_input_tokens_seen": 130236144, "step": 60405 }, { "epoch": 11.086437878509818, "grad_norm": 0.20209263265132904, "learning_rate": 4.9247317154935665e-06, "loss": 0.2355, "num_input_tokens_seen": 130248400, "step": 60410 }, { "epoch": 11.087355478069371, "grad_norm": 0.1557602435350418, "learning_rate": 4.923931050521707e-06, "loss": 0.0265, "num_input_tokens_seen": 130259376, "step": 60415 }, { "epoch": 11.088273077628923, "grad_norm": 0.45808032155036926, "learning_rate": 4.923130387500898e-06, "loss": 0.2062, "num_input_tokens_seen": 130270256, "step": 60420 }, { "epoch": 11.089190677188475, "grad_norm": 30.362688064575195, "learning_rate": 4.922329726451674e-06, "loss": 0.2507, "num_input_tokens_seen": 130281072, "step": 60425 }, { "epoch": 11.090108276748028, "grad_norm": 101.53662109375, "learning_rate": 4.921529067394574e-06, "loss": 0.1232, "num_input_tokens_seen": 130292144, "step": 60430 }, { "epoch": 11.09102587630758, "grad_norm": 0.48763009905815125, "learning_rate": 4.920728410350129e-06, "loss": 0.2538, "num_input_tokens_seen": 130302512, "step": 60435 }, { "epoch": 11.091943475867131, "grad_norm": 48.368473052978516, "learning_rate": 4.919927755338879e-06, "loss": 0.2213, "num_input_tokens_seen": 130313904, "step": 60440 }, { "epoch": 11.092861075426685, "grad_norm": 0.16292345523834229, "learning_rate": 4.919127102381359e-06, "loss": 0.1799, "num_input_tokens_seen": 130325296, "step": 60445 }, { "epoch": 11.093778674986236, "grad_norm": 0.47617536783218384, "learning_rate": 4.918326451498103e-06, "loss": 0.1774, "num_input_tokens_seen": 130336080, "step": 60450 }, { "epoch": 11.094696274545788, "grad_norm": 55.04326629638672, "learning_rate": 4.917525802709645e-06, "loss": 0.2971, "num_input_tokens_seen": 130347344, "step": 60455 }, { "epoch": 11.095613874105341, "grad_norm": 0.018608959391713142, "learning_rate": 4.916725156036525e-06, "loss": 0.1113, "num_input_tokens_seen": 130358320, "step": 60460 }, { "epoch": 11.096531473664893, "grad_norm": 62.229469299316406, "learning_rate": 4.9159245114992746e-06, "loss": 0.3243, "num_input_tokens_seen": 130369808, "step": 60465 }, { "epoch": 11.097449073224444, "grad_norm": 18.911046981811523, "learning_rate": 4.915123869118431e-06, "loss": 0.0382, "num_input_tokens_seen": 130381456, "step": 60470 }, { "epoch": 11.098366672783998, "grad_norm": 90.90491485595703, "learning_rate": 4.914323228914526e-06, "loss": 0.2043, "num_input_tokens_seen": 130393136, "step": 60475 }, { "epoch": 11.09928427234355, "grad_norm": 2.9577436447143555, "learning_rate": 4.913522590908099e-06, "loss": 0.1188, "num_input_tokens_seen": 130404528, "step": 60480 }, { "epoch": 11.100201871903101, "grad_norm": 0.19842176139354706, "learning_rate": 4.912721955119685e-06, "loss": 0.0035, "num_input_tokens_seen": 130414544, "step": 60485 }, { "epoch": 11.101119471462654, "grad_norm": 70.04365539550781, "learning_rate": 4.911921321569814e-06, "loss": 0.1074, "num_input_tokens_seen": 130424016, "step": 60490 }, { "epoch": 11.102037071022206, "grad_norm": 48.86405944824219, "learning_rate": 4.911120690279028e-06, "loss": 0.1927, "num_input_tokens_seen": 130434352, "step": 60495 }, { "epoch": 11.102954670581758, "grad_norm": 140.0458984375, "learning_rate": 4.910320061267857e-06, "loss": 0.049, "num_input_tokens_seen": 130445392, "step": 60500 }, { "epoch": 11.103872270141311, "grad_norm": 0.016662057489156723, "learning_rate": 4.909519434556837e-06, "loss": 0.2022, "num_input_tokens_seen": 130456400, "step": 60505 }, { "epoch": 11.104789869700863, "grad_norm": 0.9944435954093933, "learning_rate": 4.908718810166504e-06, "loss": 0.121, "num_input_tokens_seen": 130468112, "step": 60510 }, { "epoch": 11.105707469260414, "grad_norm": 24.31882667541504, "learning_rate": 4.907918188117393e-06, "loss": 0.1016, "num_input_tokens_seen": 130478320, "step": 60515 }, { "epoch": 11.106625068819968, "grad_norm": 0.06888740509748459, "learning_rate": 4.907117568430038e-06, "loss": 0.1289, "num_input_tokens_seen": 130489808, "step": 60520 }, { "epoch": 11.10754266837952, "grad_norm": 74.9175033569336, "learning_rate": 4.906316951124971e-06, "loss": 0.1392, "num_input_tokens_seen": 130500944, "step": 60525 }, { "epoch": 11.108460267939071, "grad_norm": 25.800806045532227, "learning_rate": 4.9055163362227305e-06, "loss": 0.1532, "num_input_tokens_seen": 130510768, "step": 60530 }, { "epoch": 11.109377867498624, "grad_norm": 15.885695457458496, "learning_rate": 4.90471572374385e-06, "loss": 0.4402, "num_input_tokens_seen": 130521904, "step": 60535 }, { "epoch": 11.110295467058176, "grad_norm": 43.159263610839844, "learning_rate": 4.903915113708862e-06, "loss": 0.1475, "num_input_tokens_seen": 130532592, "step": 60540 }, { "epoch": 11.111213066617728, "grad_norm": 77.18950653076172, "learning_rate": 4.903114506138304e-06, "loss": 0.1114, "num_input_tokens_seen": 130543056, "step": 60545 }, { "epoch": 11.112130666177281, "grad_norm": 0.5208377838134766, "learning_rate": 4.902313901052709e-06, "loss": 0.0664, "num_input_tokens_seen": 130554192, "step": 60550 }, { "epoch": 11.113048265736833, "grad_norm": 92.39788818359375, "learning_rate": 4.90151329847261e-06, "loss": 0.4163, "num_input_tokens_seen": 130564848, "step": 60555 }, { "epoch": 11.113965865296384, "grad_norm": 0.0605207197368145, "learning_rate": 4.900712698418541e-06, "loss": 0.0028, "num_input_tokens_seen": 130574832, "step": 60560 }, { "epoch": 11.114883464855938, "grad_norm": 1.4695545434951782, "learning_rate": 4.899912100911039e-06, "loss": 0.2281, "num_input_tokens_seen": 130585872, "step": 60565 }, { "epoch": 11.11580106441549, "grad_norm": 0.24558739364147186, "learning_rate": 4.899111505970637e-06, "loss": 0.2271, "num_input_tokens_seen": 130596720, "step": 60570 }, { "epoch": 11.11671866397504, "grad_norm": 3.0648081302642822, "learning_rate": 4.898310913617866e-06, "loss": 0.0639, "num_input_tokens_seen": 130607536, "step": 60575 }, { "epoch": 11.117636263534594, "grad_norm": 0.20406386256217957, "learning_rate": 4.897510323873264e-06, "loss": 0.1685, "num_input_tokens_seen": 130617328, "step": 60580 }, { "epoch": 11.118553863094146, "grad_norm": 0.08168358355760574, "learning_rate": 4.896709736757365e-06, "loss": 0.0821, "num_input_tokens_seen": 130629232, "step": 60585 }, { "epoch": 11.119471462653697, "grad_norm": 8.796847343444824, "learning_rate": 4.8959091522906985e-06, "loss": 0.1469, "num_input_tokens_seen": 130639728, "step": 60590 }, { "epoch": 11.12038906221325, "grad_norm": 8.847652435302734, "learning_rate": 4.895108570493802e-06, "loss": 0.1978, "num_input_tokens_seen": 130649520, "step": 60595 }, { "epoch": 11.121306661772802, "grad_norm": 0.11249925196170807, "learning_rate": 4.894307991387209e-06, "loss": 0.1109, "num_input_tokens_seen": 130660720, "step": 60600 }, { "epoch": 11.122224261332354, "grad_norm": 36.1983757019043, "learning_rate": 4.893507414991452e-06, "loss": 0.3118, "num_input_tokens_seen": 130672304, "step": 60605 }, { "epoch": 11.123141860891907, "grad_norm": 0.5432150959968567, "learning_rate": 4.892706841327063e-06, "loss": 0.023, "num_input_tokens_seen": 130683472, "step": 60610 }, { "epoch": 11.124059460451459, "grad_norm": 74.96449279785156, "learning_rate": 4.891906270414578e-06, "loss": 0.0584, "num_input_tokens_seen": 130695056, "step": 60615 }, { "epoch": 11.12497706001101, "grad_norm": 41.501399993896484, "learning_rate": 4.891105702274531e-06, "loss": 0.2772, "num_input_tokens_seen": 130704272, "step": 60620 }, { "epoch": 11.125894659570564, "grad_norm": 0.2104743868112564, "learning_rate": 4.890305136927453e-06, "loss": 0.1758, "num_input_tokens_seen": 130714160, "step": 60625 }, { "epoch": 11.126812259130116, "grad_norm": 117.54241180419922, "learning_rate": 4.889504574393877e-06, "loss": 0.0233, "num_input_tokens_seen": 130724464, "step": 60630 }, { "epoch": 11.127729858689667, "grad_norm": 4.044859886169434, "learning_rate": 4.88870401469434e-06, "loss": 0.1741, "num_input_tokens_seen": 130734864, "step": 60635 }, { "epoch": 11.12864745824922, "grad_norm": 53.91539001464844, "learning_rate": 4.88790345784937e-06, "loss": 0.2506, "num_input_tokens_seen": 130745456, "step": 60640 }, { "epoch": 11.129565057808772, "grad_norm": 0.37849077582359314, "learning_rate": 4.887102903879505e-06, "loss": 0.0115, "num_input_tokens_seen": 130756528, "step": 60645 }, { "epoch": 11.130482657368324, "grad_norm": 0.16558924317359924, "learning_rate": 4.886302352805274e-06, "loss": 0.3634, "num_input_tokens_seen": 130766288, "step": 60650 }, { "epoch": 11.131400256927877, "grad_norm": 0.5606865882873535, "learning_rate": 4.885501804647212e-06, "loss": 0.1827, "num_input_tokens_seen": 130777072, "step": 60655 }, { "epoch": 11.132317856487429, "grad_norm": 0.44252824783325195, "learning_rate": 4.8847012594258505e-06, "loss": 0.1329, "num_input_tokens_seen": 130786864, "step": 60660 }, { "epoch": 11.13323545604698, "grad_norm": 47.017459869384766, "learning_rate": 4.883900717161724e-06, "loss": 0.1355, "num_input_tokens_seen": 130798320, "step": 60665 }, { "epoch": 11.134153055606534, "grad_norm": 63.0424690246582, "learning_rate": 4.883100177875364e-06, "loss": 0.1203, "num_input_tokens_seen": 130808560, "step": 60670 }, { "epoch": 11.135070655166086, "grad_norm": 0.24389593303203583, "learning_rate": 4.882299641587301e-06, "loss": 0.0051, "num_input_tokens_seen": 130820976, "step": 60675 }, { "epoch": 11.135988254725637, "grad_norm": 0.1014801487326622, "learning_rate": 4.881499108318072e-06, "loss": 0.0335, "num_input_tokens_seen": 130831536, "step": 60680 }, { "epoch": 11.13690585428519, "grad_norm": 97.08942413330078, "learning_rate": 4.880698578088209e-06, "loss": 0.124, "num_input_tokens_seen": 130842736, "step": 60685 }, { "epoch": 11.137823453844742, "grad_norm": 221.9440155029297, "learning_rate": 4.879898050918238e-06, "loss": 0.1092, "num_input_tokens_seen": 130853520, "step": 60690 }, { "epoch": 11.138741053404294, "grad_norm": 0.28602540493011475, "learning_rate": 4.879097526828699e-06, "loss": 0.1547, "num_input_tokens_seen": 130864816, "step": 60695 }, { "epoch": 11.139658652963847, "grad_norm": 0.21640151739120483, "learning_rate": 4.878297005840121e-06, "loss": 0.2482, "num_input_tokens_seen": 130875280, "step": 60700 }, { "epoch": 11.140576252523399, "grad_norm": 0.008138383738696575, "learning_rate": 4.877496487973036e-06, "loss": 0.2414, "num_input_tokens_seen": 130885552, "step": 60705 }, { "epoch": 11.14149385208295, "grad_norm": 2.400932788848877, "learning_rate": 4.876695973247974e-06, "loss": 0.2043, "num_input_tokens_seen": 130893904, "step": 60710 }, { "epoch": 11.142411451642504, "grad_norm": 26.3603458404541, "learning_rate": 4.875895461685471e-06, "loss": 0.5491, "num_input_tokens_seen": 130905232, "step": 60715 }, { "epoch": 11.143329051202056, "grad_norm": 0.12671567499637604, "learning_rate": 4.875094953306058e-06, "loss": 0.0516, "num_input_tokens_seen": 130916560, "step": 60720 }, { "epoch": 11.144246650761607, "grad_norm": 1.4167263507843018, "learning_rate": 4.874294448130264e-06, "loss": 0.0047, "num_input_tokens_seen": 130925808, "step": 60725 }, { "epoch": 11.14516425032116, "grad_norm": 0.05855076014995575, "learning_rate": 4.873493946178624e-06, "loss": 0.049, "num_input_tokens_seen": 130937392, "step": 60730 }, { "epoch": 11.146081849880712, "grad_norm": 0.278576135635376, "learning_rate": 4.872693447471667e-06, "loss": 0.1967, "num_input_tokens_seen": 130948144, "step": 60735 }, { "epoch": 11.146999449440264, "grad_norm": 68.1477279663086, "learning_rate": 4.871892952029928e-06, "loss": 0.1761, "num_input_tokens_seen": 130959824, "step": 60740 }, { "epoch": 11.147917048999817, "grad_norm": 221.02273559570312, "learning_rate": 4.8710924598739336e-06, "loss": 0.1272, "num_input_tokens_seen": 130970768, "step": 60745 }, { "epoch": 11.148834648559369, "grad_norm": 19.567161560058594, "learning_rate": 4.87029197102422e-06, "loss": 0.7829, "num_input_tokens_seen": 130982480, "step": 60750 }, { "epoch": 11.14975224811892, "grad_norm": 7.464865684509277, "learning_rate": 4.869491485501314e-06, "loss": 0.1212, "num_input_tokens_seen": 130993584, "step": 60755 }, { "epoch": 11.150669847678474, "grad_norm": 10.009806632995605, "learning_rate": 4.86869100332575e-06, "loss": 0.228, "num_input_tokens_seen": 131004656, "step": 60760 }, { "epoch": 11.151587447238025, "grad_norm": 58.85107421875, "learning_rate": 4.867890524518059e-06, "loss": 0.1298, "num_input_tokens_seen": 131015856, "step": 60765 }, { "epoch": 11.152505046797577, "grad_norm": 0.9381405711174011, "learning_rate": 4.867090049098772e-06, "loss": 0.0168, "num_input_tokens_seen": 131027664, "step": 60770 }, { "epoch": 11.15342264635713, "grad_norm": 1.0329489707946777, "learning_rate": 4.866289577088416e-06, "loss": 0.2025, "num_input_tokens_seen": 131037840, "step": 60775 }, { "epoch": 11.154340245916682, "grad_norm": 12.658197402954102, "learning_rate": 4.865489108507529e-06, "loss": 0.3055, "num_input_tokens_seen": 131048720, "step": 60780 }, { "epoch": 11.155257845476234, "grad_norm": 0.3822014629840851, "learning_rate": 4.864688643376636e-06, "loss": 0.0084, "num_input_tokens_seen": 131060304, "step": 60785 }, { "epoch": 11.156175445035787, "grad_norm": 22.750110626220703, "learning_rate": 4.86388818171627e-06, "loss": 0.1401, "num_input_tokens_seen": 131072400, "step": 60790 }, { "epoch": 11.157093044595339, "grad_norm": 0.3032435178756714, "learning_rate": 4.863087723546959e-06, "loss": 0.1183, "num_input_tokens_seen": 131083632, "step": 60795 }, { "epoch": 11.15801064415489, "grad_norm": 0.5122281312942505, "learning_rate": 4.862287268889239e-06, "loss": 0.0685, "num_input_tokens_seen": 131095792, "step": 60800 }, { "epoch": 11.158928243714444, "grad_norm": 1.0156207084655762, "learning_rate": 4.861486817763636e-06, "loss": 0.2732, "num_input_tokens_seen": 131107088, "step": 60805 }, { "epoch": 11.159845843273995, "grad_norm": 9.451354026794434, "learning_rate": 4.860686370190679e-06, "loss": 0.051, "num_input_tokens_seen": 131117072, "step": 60810 }, { "epoch": 11.160763442833547, "grad_norm": 72.45458984375, "learning_rate": 4.859885926190904e-06, "loss": 0.1833, "num_input_tokens_seen": 131128208, "step": 60815 }, { "epoch": 11.1616810423931, "grad_norm": 59.767948150634766, "learning_rate": 4.859085485784837e-06, "loss": 0.3208, "num_input_tokens_seen": 131139472, "step": 60820 }, { "epoch": 11.162598641952652, "grad_norm": 0.03151943162083626, "learning_rate": 4.858285048993007e-06, "loss": 0.2113, "num_input_tokens_seen": 131150320, "step": 60825 }, { "epoch": 11.163516241512204, "grad_norm": 0.11544191837310791, "learning_rate": 4.857484615835948e-06, "loss": 0.2504, "num_input_tokens_seen": 131161488, "step": 60830 }, { "epoch": 11.164433841071757, "grad_norm": 0.12120092660188675, "learning_rate": 4.856684186334188e-06, "loss": 0.0615, "num_input_tokens_seen": 131171760, "step": 60835 }, { "epoch": 11.165351440631309, "grad_norm": 0.4319928288459778, "learning_rate": 4.855883760508256e-06, "loss": 0.0554, "num_input_tokens_seen": 131182288, "step": 60840 }, { "epoch": 11.16626904019086, "grad_norm": 0.4096331000328064, "learning_rate": 4.855083338378682e-06, "loss": 0.0548, "num_input_tokens_seen": 131192464, "step": 60845 }, { "epoch": 11.167186639750414, "grad_norm": 18.821208953857422, "learning_rate": 4.854282919965994e-06, "loss": 0.1627, "num_input_tokens_seen": 131202928, "step": 60850 }, { "epoch": 11.168104239309965, "grad_norm": 28.745319366455078, "learning_rate": 4.853482505290726e-06, "loss": 0.287, "num_input_tokens_seen": 131213648, "step": 60855 }, { "epoch": 11.169021838869517, "grad_norm": 0.09185738116502762, "learning_rate": 4.852682094373403e-06, "loss": 0.1575, "num_input_tokens_seen": 131223536, "step": 60860 }, { "epoch": 11.16993943842907, "grad_norm": 0.19842715561389923, "learning_rate": 4.851881687234557e-06, "loss": 0.2588, "num_input_tokens_seen": 131235472, "step": 60865 }, { "epoch": 11.170857037988622, "grad_norm": 0.5517474412918091, "learning_rate": 4.851081283894717e-06, "loss": 0.0388, "num_input_tokens_seen": 131246448, "step": 60870 }, { "epoch": 11.171774637548173, "grad_norm": 0.5440510511398315, "learning_rate": 4.8502808843744085e-06, "loss": 0.0672, "num_input_tokens_seen": 131256912, "step": 60875 }, { "epoch": 11.172692237107727, "grad_norm": 0.19052965939044952, "learning_rate": 4.849480488694164e-06, "loss": 0.1697, "num_input_tokens_seen": 131267312, "step": 60880 }, { "epoch": 11.173609836667278, "grad_norm": 26.78118896484375, "learning_rate": 4.848680096874514e-06, "loss": 0.2806, "num_input_tokens_seen": 131278384, "step": 60885 }, { "epoch": 11.17452743622683, "grad_norm": 49.27652359008789, "learning_rate": 4.8478797089359836e-06, "loss": 0.1382, "num_input_tokens_seen": 131288400, "step": 60890 }, { "epoch": 11.175445035786383, "grad_norm": 4.512237548828125, "learning_rate": 4.8470793248991014e-06, "loss": 0.0044, "num_input_tokens_seen": 131299408, "step": 60895 }, { "epoch": 11.176362635345935, "grad_norm": 40.95497131347656, "learning_rate": 4.846278944784399e-06, "loss": 0.3777, "num_input_tokens_seen": 131308272, "step": 60900 }, { "epoch": 11.177280234905487, "grad_norm": 0.1625555157661438, "learning_rate": 4.845478568612404e-06, "loss": 0.0635, "num_input_tokens_seen": 131318832, "step": 60905 }, { "epoch": 11.17819783446504, "grad_norm": 7.606888294219971, "learning_rate": 4.844678196403641e-06, "loss": 0.2521, "num_input_tokens_seen": 131329648, "step": 60910 }, { "epoch": 11.179115434024592, "grad_norm": 0.5613723397254944, "learning_rate": 4.843877828178645e-06, "loss": 0.0023, "num_input_tokens_seen": 131339984, "step": 60915 }, { "epoch": 11.180033033584143, "grad_norm": 169.95497131347656, "learning_rate": 4.84307746395794e-06, "loss": 0.0308, "num_input_tokens_seen": 131352016, "step": 60920 }, { "epoch": 11.180950633143697, "grad_norm": 67.34752655029297, "learning_rate": 4.842277103762055e-06, "loss": 0.1153, "num_input_tokens_seen": 131361840, "step": 60925 }, { "epoch": 11.181868232703248, "grad_norm": 32.63119888305664, "learning_rate": 4.841476747611516e-06, "loss": 0.0195, "num_input_tokens_seen": 131373168, "step": 60930 }, { "epoch": 11.1827858322628, "grad_norm": 0.15632395446300507, "learning_rate": 4.840676395526855e-06, "loss": 0.0913, "num_input_tokens_seen": 131385392, "step": 60935 }, { "epoch": 11.183703431822353, "grad_norm": 0.5870499610900879, "learning_rate": 4.839876047528597e-06, "loss": 0.042, "num_input_tokens_seen": 131395408, "step": 60940 }, { "epoch": 11.184621031381905, "grad_norm": 2.384413242340088, "learning_rate": 4.839075703637268e-06, "loss": 0.15, "num_input_tokens_seen": 131407248, "step": 60945 }, { "epoch": 11.185538630941457, "grad_norm": 0.2548062801361084, "learning_rate": 4.838275363873401e-06, "loss": 0.0934, "num_input_tokens_seen": 131416880, "step": 60950 }, { "epoch": 11.18645623050101, "grad_norm": 90.80322265625, "learning_rate": 4.837475028257519e-06, "loss": 0.2488, "num_input_tokens_seen": 131428592, "step": 60955 }, { "epoch": 11.187373830060562, "grad_norm": 0.12615425884723663, "learning_rate": 4.83667469681015e-06, "loss": 0.1444, "num_input_tokens_seen": 131438288, "step": 60960 }, { "epoch": 11.188291429620113, "grad_norm": 0.09677281975746155, "learning_rate": 4.835874369551823e-06, "loss": 0.0378, "num_input_tokens_seen": 131448912, "step": 60965 }, { "epoch": 11.189209029179667, "grad_norm": 0.027735156938433647, "learning_rate": 4.835074046503064e-06, "loss": 0.1621, "num_input_tokens_seen": 131459952, "step": 60970 }, { "epoch": 11.190126628739218, "grad_norm": 14.753817558288574, "learning_rate": 4.834273727684399e-06, "loss": 0.1375, "num_input_tokens_seen": 131471120, "step": 60975 }, { "epoch": 11.19104422829877, "grad_norm": 4.334967136383057, "learning_rate": 4.8334734131163565e-06, "loss": 0.1582, "num_input_tokens_seen": 131482224, "step": 60980 }, { "epoch": 11.191961827858323, "grad_norm": 0.22762548923492432, "learning_rate": 4.832673102819463e-06, "loss": 0.1739, "num_input_tokens_seen": 131493904, "step": 60985 }, { "epoch": 11.192879427417875, "grad_norm": 3.499911308288574, "learning_rate": 4.831872796814246e-06, "loss": 0.2071, "num_input_tokens_seen": 131505040, "step": 60990 }, { "epoch": 11.193797026977427, "grad_norm": 6.5755791664123535, "learning_rate": 4.831072495121228e-06, "loss": 0.3196, "num_input_tokens_seen": 131516752, "step": 60995 }, { "epoch": 11.19471462653698, "grad_norm": 0.3605022728443146, "learning_rate": 4.830272197760942e-06, "loss": 0.0021, "num_input_tokens_seen": 131527792, "step": 61000 }, { "epoch": 11.195632226096532, "grad_norm": 0.16672416031360626, "learning_rate": 4.829471904753911e-06, "loss": 0.0013, "num_input_tokens_seen": 131538032, "step": 61005 }, { "epoch": 11.196549825656083, "grad_norm": 0.05358646810054779, "learning_rate": 4.8286716161206586e-06, "loss": 0.0146, "num_input_tokens_seen": 131548752, "step": 61010 }, { "epoch": 11.197467425215637, "grad_norm": 0.35023149847984314, "learning_rate": 4.827871331881716e-06, "loss": 0.3002, "num_input_tokens_seen": 131559408, "step": 61015 }, { "epoch": 11.198385024775188, "grad_norm": 0.08201000839471817, "learning_rate": 4.827071052057607e-06, "loss": 0.1416, "num_input_tokens_seen": 131570544, "step": 61020 }, { "epoch": 11.19930262433474, "grad_norm": 0.20332132279872894, "learning_rate": 4.826270776668857e-06, "loss": 0.0742, "num_input_tokens_seen": 131582064, "step": 61025 }, { "epoch": 11.200220223894293, "grad_norm": 0.0197765976190567, "learning_rate": 4.825470505735991e-06, "loss": 0.0215, "num_input_tokens_seen": 131593424, "step": 61030 }, { "epoch": 11.201137823453845, "grad_norm": 0.13695520162582397, "learning_rate": 4.824670239279538e-06, "loss": 0.0658, "num_input_tokens_seen": 131603280, "step": 61035 }, { "epoch": 11.202055423013396, "grad_norm": 97.42302703857422, "learning_rate": 4.823869977320021e-06, "loss": 0.3537, "num_input_tokens_seen": 131615632, "step": 61040 }, { "epoch": 11.20297302257295, "grad_norm": 1.2809553146362305, "learning_rate": 4.8230697198779645e-06, "loss": 0.1796, "num_input_tokens_seen": 131626032, "step": 61045 }, { "epoch": 11.203890622132501, "grad_norm": 71.82721710205078, "learning_rate": 4.822269466973898e-06, "loss": 0.1741, "num_input_tokens_seen": 131637680, "step": 61050 }, { "epoch": 11.204808221692053, "grad_norm": 142.88681030273438, "learning_rate": 4.821469218628344e-06, "loss": 0.4143, "num_input_tokens_seen": 131648016, "step": 61055 }, { "epoch": 11.205725821251606, "grad_norm": 0.05731075257062912, "learning_rate": 4.820668974861827e-06, "loss": 0.3199, "num_input_tokens_seen": 131658512, "step": 61060 }, { "epoch": 11.206643420811158, "grad_norm": 0.2736373841762543, "learning_rate": 4.819868735694873e-06, "loss": 0.0104, "num_input_tokens_seen": 131668944, "step": 61065 }, { "epoch": 11.20756102037071, "grad_norm": 87.95475006103516, "learning_rate": 4.819068501148006e-06, "loss": 0.1263, "num_input_tokens_seen": 131678896, "step": 61070 }, { "epoch": 11.208478619930263, "grad_norm": 0.20390069484710693, "learning_rate": 4.818268271241752e-06, "loss": 0.174, "num_input_tokens_seen": 131690256, "step": 61075 }, { "epoch": 11.209396219489815, "grad_norm": 6.568652153015137, "learning_rate": 4.817468045996635e-06, "loss": 0.0996, "num_input_tokens_seen": 131701104, "step": 61080 }, { "epoch": 11.210313819049366, "grad_norm": 120.59329223632812, "learning_rate": 4.816667825433181e-06, "loss": 0.1044, "num_input_tokens_seen": 131712752, "step": 61085 }, { "epoch": 11.21123141860892, "grad_norm": 2.716834306716919, "learning_rate": 4.815867609571913e-06, "loss": 0.1338, "num_input_tokens_seen": 131722800, "step": 61090 }, { "epoch": 11.212149018168471, "grad_norm": 0.0896860659122467, "learning_rate": 4.815067398433353e-06, "loss": 0.0008, "num_input_tokens_seen": 131734320, "step": 61095 }, { "epoch": 11.213066617728023, "grad_norm": 37.89378356933594, "learning_rate": 4.8142671920380295e-06, "loss": 0.4209, "num_input_tokens_seen": 131744976, "step": 61100 }, { "epoch": 11.213984217287576, "grad_norm": 0.07376790791749954, "learning_rate": 4.813466990406465e-06, "loss": 0.1141, "num_input_tokens_seen": 131756176, "step": 61105 }, { "epoch": 11.214901816847128, "grad_norm": 24.75901985168457, "learning_rate": 4.812666793559183e-06, "loss": 0.3741, "num_input_tokens_seen": 131766640, "step": 61110 }, { "epoch": 11.21581941640668, "grad_norm": 16.217199325561523, "learning_rate": 4.811866601516705e-06, "loss": 0.1659, "num_input_tokens_seen": 131777136, "step": 61115 }, { "epoch": 11.216737015966233, "grad_norm": 11.548065185546875, "learning_rate": 4.81106641429956e-06, "loss": 0.2745, "num_input_tokens_seen": 131789488, "step": 61120 }, { "epoch": 11.217654615525785, "grad_norm": 21.945737838745117, "learning_rate": 4.810266231928268e-06, "loss": 0.0059, "num_input_tokens_seen": 131799536, "step": 61125 }, { "epoch": 11.218572215085336, "grad_norm": 0.12407859414815903, "learning_rate": 4.8094660544233515e-06, "loss": 0.118, "num_input_tokens_seen": 131810064, "step": 61130 }, { "epoch": 11.21948981464489, "grad_norm": 0.3699682652950287, "learning_rate": 4.808665881805337e-06, "loss": 0.2468, "num_input_tokens_seen": 131821904, "step": 61135 }, { "epoch": 11.220407414204441, "grad_norm": 0.014909216202795506, "learning_rate": 4.807865714094747e-06, "loss": 0.1987, "num_input_tokens_seen": 131832208, "step": 61140 }, { "epoch": 11.221325013763993, "grad_norm": 0.6245982050895691, "learning_rate": 4.8070655513121005e-06, "loss": 0.1606, "num_input_tokens_seen": 131842704, "step": 61145 }, { "epoch": 11.222242613323546, "grad_norm": 87.65470886230469, "learning_rate": 4.806265393477926e-06, "loss": 0.112, "num_input_tokens_seen": 131852912, "step": 61150 }, { "epoch": 11.223160212883098, "grad_norm": 1.8402910232543945, "learning_rate": 4.805465240612744e-06, "loss": 0.0261, "num_input_tokens_seen": 131864400, "step": 61155 }, { "epoch": 11.22407781244265, "grad_norm": 0.1172180324792862, "learning_rate": 4.804665092737077e-06, "loss": 0.0942, "num_input_tokens_seen": 131875152, "step": 61160 }, { "epoch": 11.224995412002203, "grad_norm": 0.06174661964178085, "learning_rate": 4.803864949871447e-06, "loss": 0.0055, "num_input_tokens_seen": 131886160, "step": 61165 }, { "epoch": 11.225913011561754, "grad_norm": 0.2725986838340759, "learning_rate": 4.803064812036376e-06, "loss": 0.1067, "num_input_tokens_seen": 131897264, "step": 61170 }, { "epoch": 11.226830611121306, "grad_norm": 0.06268768757581711, "learning_rate": 4.802264679252389e-06, "loss": 0.2339, "num_input_tokens_seen": 131907792, "step": 61175 }, { "epoch": 11.22774821068086, "grad_norm": 30.615581512451172, "learning_rate": 4.801464551540005e-06, "loss": 0.2308, "num_input_tokens_seen": 131919056, "step": 61180 }, { "epoch": 11.228665810240411, "grad_norm": 224.5870361328125, "learning_rate": 4.80066442891975e-06, "loss": 0.2306, "num_input_tokens_seen": 131930512, "step": 61185 }, { "epoch": 11.229583409799963, "grad_norm": 5.819912433624268, "learning_rate": 4.799864311412143e-06, "loss": 0.2374, "num_input_tokens_seen": 131943056, "step": 61190 }, { "epoch": 11.230501009359516, "grad_norm": 31.362621307373047, "learning_rate": 4.799064199037704e-06, "loss": 0.1462, "num_input_tokens_seen": 131954480, "step": 61195 }, { "epoch": 11.231418608919068, "grad_norm": 24.16092300415039, "learning_rate": 4.798264091816958e-06, "loss": 0.1234, "num_input_tokens_seen": 131965904, "step": 61200 }, { "epoch": 11.23233620847862, "grad_norm": 0.44128209352493286, "learning_rate": 4.7974639897704255e-06, "loss": 0.1103, "num_input_tokens_seen": 131974768, "step": 61205 }, { "epoch": 11.233253808038173, "grad_norm": 1.0001552104949951, "learning_rate": 4.7966638929186285e-06, "loss": 0.4874, "num_input_tokens_seen": 131985808, "step": 61210 }, { "epoch": 11.234171407597724, "grad_norm": 77.60566711425781, "learning_rate": 4.795863801282085e-06, "loss": 0.3308, "num_input_tokens_seen": 131997040, "step": 61215 }, { "epoch": 11.235089007157276, "grad_norm": 74.98126983642578, "learning_rate": 4.795063714881321e-06, "loss": 0.1929, "num_input_tokens_seen": 132008976, "step": 61220 }, { "epoch": 11.23600660671683, "grad_norm": 50.39100646972656, "learning_rate": 4.794263633736856e-06, "loss": 0.1468, "num_input_tokens_seen": 132020240, "step": 61225 }, { "epoch": 11.236924206276381, "grad_norm": 0.4237615764141083, "learning_rate": 4.793463557869206e-06, "loss": 0.0853, "num_input_tokens_seen": 132030928, "step": 61230 }, { "epoch": 11.237841805835933, "grad_norm": 53.95436096191406, "learning_rate": 4.7926634872988985e-06, "loss": 0.1673, "num_input_tokens_seen": 132041872, "step": 61235 }, { "epoch": 11.238759405395486, "grad_norm": 27.20348358154297, "learning_rate": 4.791863422046452e-06, "loss": 0.0107, "num_input_tokens_seen": 132052400, "step": 61240 }, { "epoch": 11.239677004955038, "grad_norm": 0.33053332567214966, "learning_rate": 4.791063362132386e-06, "loss": 0.1337, "num_input_tokens_seen": 132063568, "step": 61245 }, { "epoch": 11.24059460451459, "grad_norm": 0.009735411033034325, "learning_rate": 4.790263307577218e-06, "loss": 0.0104, "num_input_tokens_seen": 132073904, "step": 61250 }, { "epoch": 11.241512204074143, "grad_norm": 33.019691467285156, "learning_rate": 4.789463258401472e-06, "loss": 0.2278, "num_input_tokens_seen": 132084624, "step": 61255 }, { "epoch": 11.242429803633694, "grad_norm": 0.036994632333517075, "learning_rate": 4.7886632146256695e-06, "loss": 0.0282, "num_input_tokens_seen": 132094992, "step": 61260 }, { "epoch": 11.243347403193246, "grad_norm": 7.415787696838379, "learning_rate": 4.787863176270324e-06, "loss": 0.1101, "num_input_tokens_seen": 132106384, "step": 61265 }, { "epoch": 11.2442650027528, "grad_norm": 0.718233048915863, "learning_rate": 4.787063143355963e-06, "loss": 0.1898, "num_input_tokens_seen": 132116880, "step": 61270 }, { "epoch": 11.245182602312351, "grad_norm": 12.576833724975586, "learning_rate": 4.786263115903102e-06, "loss": 0.3059, "num_input_tokens_seen": 132126864, "step": 61275 }, { "epoch": 11.246100201871903, "grad_norm": 16.81988525390625, "learning_rate": 4.78546309393226e-06, "loss": 0.0983, "num_input_tokens_seen": 132136528, "step": 61280 }, { "epoch": 11.247017801431456, "grad_norm": 0.3832431137561798, "learning_rate": 4.784663077463957e-06, "loss": 0.1016, "num_input_tokens_seen": 132147088, "step": 61285 }, { "epoch": 11.247935400991008, "grad_norm": 0.09040835499763489, "learning_rate": 4.783863066518713e-06, "loss": 0.0029, "num_input_tokens_seen": 132157104, "step": 61290 }, { "epoch": 11.24885300055056, "grad_norm": 0.17951035499572754, "learning_rate": 4.783063061117045e-06, "loss": 0.3404, "num_input_tokens_seen": 132167696, "step": 61295 }, { "epoch": 11.249770600110113, "grad_norm": 1.4808173179626465, "learning_rate": 4.782263061279474e-06, "loss": 0.0057, "num_input_tokens_seen": 132179504, "step": 61300 }, { "epoch": 11.250688199669664, "grad_norm": 9.450860977172852, "learning_rate": 4.781463067026519e-06, "loss": 0.0322, "num_input_tokens_seen": 132189840, "step": 61305 }, { "epoch": 11.251605799229216, "grad_norm": 0.0981721580028534, "learning_rate": 4.7806630783786965e-06, "loss": 0.2742, "num_input_tokens_seen": 132200656, "step": 61310 }, { "epoch": 11.25252339878877, "grad_norm": 36.91482162475586, "learning_rate": 4.779863095356525e-06, "loss": 0.2481, "num_input_tokens_seen": 132210960, "step": 61315 }, { "epoch": 11.25344099834832, "grad_norm": 0.28923821449279785, "learning_rate": 4.779063117980526e-06, "loss": 0.2726, "num_input_tokens_seen": 132221456, "step": 61320 }, { "epoch": 11.254358597907872, "grad_norm": 0.12684831023216248, "learning_rate": 4.778263146271215e-06, "loss": 0.0906, "num_input_tokens_seen": 132233520, "step": 61325 }, { "epoch": 11.255276197467426, "grad_norm": 0.6154196858406067, "learning_rate": 4.777463180249111e-06, "loss": 0.1298, "num_input_tokens_seen": 132245456, "step": 61330 }, { "epoch": 11.256193797026977, "grad_norm": 0.04570317268371582, "learning_rate": 4.77666321993473e-06, "loss": 0.4603, "num_input_tokens_seen": 132256112, "step": 61335 }, { "epoch": 11.257111396586529, "grad_norm": 17.282705307006836, "learning_rate": 4.7758632653485925e-06, "loss": 0.0317, "num_input_tokens_seen": 132266640, "step": 61340 }, { "epoch": 11.258028996146082, "grad_norm": 94.61308288574219, "learning_rate": 4.7750633165112155e-06, "loss": 0.0493, "num_input_tokens_seen": 132277840, "step": 61345 }, { "epoch": 11.258946595705634, "grad_norm": 0.25113144516944885, "learning_rate": 4.774263373443113e-06, "loss": 0.3006, "num_input_tokens_seen": 132289072, "step": 61350 }, { "epoch": 11.259864195265186, "grad_norm": 0.09731687605381012, "learning_rate": 4.773463436164807e-06, "loss": 0.1786, "num_input_tokens_seen": 132300048, "step": 61355 }, { "epoch": 11.260781794824739, "grad_norm": 3.596111536026001, "learning_rate": 4.772663504696814e-06, "loss": 0.111, "num_input_tokens_seen": 132310416, "step": 61360 }, { "epoch": 11.26169939438429, "grad_norm": 15.750292778015137, "learning_rate": 4.7718635790596465e-06, "loss": 0.1148, "num_input_tokens_seen": 132321904, "step": 61365 }, { "epoch": 11.262616993943842, "grad_norm": 0.06751327961683273, "learning_rate": 4.771063659273828e-06, "loss": 0.0106, "num_input_tokens_seen": 132332752, "step": 61370 }, { "epoch": 11.263534593503396, "grad_norm": 212.31251525878906, "learning_rate": 4.770263745359871e-06, "loss": 0.1963, "num_input_tokens_seen": 132342832, "step": 61375 }, { "epoch": 11.264452193062947, "grad_norm": 65.8836898803711, "learning_rate": 4.769463837338293e-06, "loss": 0.379, "num_input_tokens_seen": 132354224, "step": 61380 }, { "epoch": 11.265369792622499, "grad_norm": 0.22670544683933258, "learning_rate": 4.7686639352296085e-06, "loss": 0.2666, "num_input_tokens_seen": 132365072, "step": 61385 }, { "epoch": 11.266287392182052, "grad_norm": 0.03673817217350006, "learning_rate": 4.7678640390543365e-06, "loss": 0.0435, "num_input_tokens_seen": 132376080, "step": 61390 }, { "epoch": 11.267204991741604, "grad_norm": 0.42919808626174927, "learning_rate": 4.767064148832993e-06, "loss": 0.055, "num_input_tokens_seen": 132385712, "step": 61395 }, { "epoch": 11.268122591301156, "grad_norm": 0.08748047053813934, "learning_rate": 4.766264264586092e-06, "loss": 0.1012, "num_input_tokens_seen": 132395984, "step": 61400 }, { "epoch": 11.269040190860709, "grad_norm": 45.64393615722656, "learning_rate": 4.765464386334151e-06, "loss": 0.4851, "num_input_tokens_seen": 132406672, "step": 61405 }, { "epoch": 11.26995779042026, "grad_norm": 79.47622680664062, "learning_rate": 4.764664514097686e-06, "loss": 0.0174, "num_input_tokens_seen": 132418544, "step": 61410 }, { "epoch": 11.270875389979812, "grad_norm": 0.5799350142478943, "learning_rate": 4.763864647897208e-06, "loss": 0.2001, "num_input_tokens_seen": 132428016, "step": 61415 }, { "epoch": 11.271792989539366, "grad_norm": 162.40689086914062, "learning_rate": 4.763064787753239e-06, "loss": 0.1901, "num_input_tokens_seen": 132438576, "step": 61420 }, { "epoch": 11.272710589098917, "grad_norm": 0.10898267477750778, "learning_rate": 4.7622649336862905e-06, "loss": 0.1026, "num_input_tokens_seen": 132450192, "step": 61425 }, { "epoch": 11.273628188658469, "grad_norm": 2.214456796646118, "learning_rate": 4.761465085716877e-06, "loss": 0.1096, "num_input_tokens_seen": 132460752, "step": 61430 }, { "epoch": 11.274545788218022, "grad_norm": 4.2070817947387695, "learning_rate": 4.760665243865514e-06, "loss": 0.126, "num_input_tokens_seen": 132471600, "step": 61435 }, { "epoch": 11.275463387777574, "grad_norm": 0.2620723843574524, "learning_rate": 4.759865408152718e-06, "loss": 0.0099, "num_input_tokens_seen": 132481488, "step": 61440 }, { "epoch": 11.276380987337125, "grad_norm": 31.954498291015625, "learning_rate": 4.759065578599002e-06, "loss": 0.2499, "num_input_tokens_seen": 132492176, "step": 61445 }, { "epoch": 11.277298586896679, "grad_norm": 8.50667953491211, "learning_rate": 4.758265755224878e-06, "loss": 0.032, "num_input_tokens_seen": 132501776, "step": 61450 }, { "epoch": 11.27821618645623, "grad_norm": 0.40584614872932434, "learning_rate": 4.757465938050866e-06, "loss": 0.1883, "num_input_tokens_seen": 132512848, "step": 61455 }, { "epoch": 11.279133786015782, "grad_norm": 14.639679908752441, "learning_rate": 4.756666127097476e-06, "loss": 0.2462, "num_input_tokens_seen": 132522896, "step": 61460 }, { "epoch": 11.280051385575335, "grad_norm": 4.7578206062316895, "learning_rate": 4.7558663223852205e-06, "loss": 0.1697, "num_input_tokens_seen": 132533744, "step": 61465 }, { "epoch": 11.280968985134887, "grad_norm": 0.19446825981140137, "learning_rate": 4.755066523934617e-06, "loss": 0.1026, "num_input_tokens_seen": 132543152, "step": 61470 }, { "epoch": 11.281886584694439, "grad_norm": 0.20648598670959473, "learning_rate": 4.754266731766179e-06, "loss": 0.2135, "num_input_tokens_seen": 132554384, "step": 61475 }, { "epoch": 11.282804184253992, "grad_norm": 0.15908664464950562, "learning_rate": 4.753466945900417e-06, "loss": 0.1785, "num_input_tokens_seen": 132565488, "step": 61480 }, { "epoch": 11.283721783813544, "grad_norm": 0.2623654007911682, "learning_rate": 4.752667166357845e-06, "loss": 0.0013, "num_input_tokens_seen": 132575120, "step": 61485 }, { "epoch": 11.284639383373095, "grad_norm": 15.657684326171875, "learning_rate": 4.751867393158978e-06, "loss": 0.4493, "num_input_tokens_seen": 132586096, "step": 61490 }, { "epoch": 11.285556982932649, "grad_norm": 18.759069442749023, "learning_rate": 4.751067626324328e-06, "loss": 0.1535, "num_input_tokens_seen": 132598160, "step": 61495 }, { "epoch": 11.2864745824922, "grad_norm": 53.108184814453125, "learning_rate": 4.750267865874406e-06, "loss": 0.6535, "num_input_tokens_seen": 132608944, "step": 61500 }, { "epoch": 11.287392182051752, "grad_norm": 0.8734857439994812, "learning_rate": 4.749468111829729e-06, "loss": 0.0758, "num_input_tokens_seen": 132619536, "step": 61505 }, { "epoch": 11.288309781611305, "grad_norm": 2.394531726837158, "learning_rate": 4.748668364210805e-06, "loss": 0.0062, "num_input_tokens_seen": 132630672, "step": 61510 }, { "epoch": 11.289227381170857, "grad_norm": 0.13332438468933105, "learning_rate": 4.747868623038148e-06, "loss": 0.2081, "num_input_tokens_seen": 132640720, "step": 61515 }, { "epoch": 11.290144980730409, "grad_norm": 0.16612274944782257, "learning_rate": 4.7470688883322695e-06, "loss": 0.2864, "num_input_tokens_seen": 132650064, "step": 61520 }, { "epoch": 11.291062580289962, "grad_norm": 104.92338562011719, "learning_rate": 4.7462691601136825e-06, "loss": 0.234, "num_input_tokens_seen": 132661264, "step": 61525 }, { "epoch": 11.291980179849514, "grad_norm": 36.027992248535156, "learning_rate": 4.7454694384029e-06, "loss": 0.1385, "num_input_tokens_seen": 132672784, "step": 61530 }, { "epoch": 11.292897779409065, "grad_norm": 0.07729914784431458, "learning_rate": 4.744669723220428e-06, "loss": 0.2462, "num_input_tokens_seen": 132684112, "step": 61535 }, { "epoch": 11.293815378968619, "grad_norm": 20.46040153503418, "learning_rate": 4.743870014586784e-06, "loss": 0.2347, "num_input_tokens_seen": 132694864, "step": 61540 }, { "epoch": 11.29473297852817, "grad_norm": 42.10369873046875, "learning_rate": 4.743070312522478e-06, "loss": 0.1141, "num_input_tokens_seen": 132705744, "step": 61545 }, { "epoch": 11.295650578087722, "grad_norm": 0.07244369387626648, "learning_rate": 4.742270617048018e-06, "loss": 0.1228, "num_input_tokens_seen": 132717264, "step": 61550 }, { "epoch": 11.296568177647275, "grad_norm": 0.09257400780916214, "learning_rate": 4.7414709281839195e-06, "loss": 0.3883, "num_input_tokens_seen": 132727696, "step": 61555 }, { "epoch": 11.297485777206827, "grad_norm": 21.552051544189453, "learning_rate": 4.740671245950691e-06, "loss": 0.1761, "num_input_tokens_seen": 132738384, "step": 61560 }, { "epoch": 11.298403376766379, "grad_norm": 1.897253394126892, "learning_rate": 4.739871570368842e-06, "loss": 0.1399, "num_input_tokens_seen": 132747952, "step": 61565 }, { "epoch": 11.299320976325932, "grad_norm": 9.602995872497559, "learning_rate": 4.739071901458883e-06, "loss": 0.2179, "num_input_tokens_seen": 132759248, "step": 61570 }, { "epoch": 11.300238575885484, "grad_norm": 25.077245712280273, "learning_rate": 4.738272239241328e-06, "loss": 0.2763, "num_input_tokens_seen": 132769040, "step": 61575 }, { "epoch": 11.301156175445035, "grad_norm": 0.28657084703445435, "learning_rate": 4.737472583736683e-06, "loss": 0.0575, "num_input_tokens_seen": 132780656, "step": 61580 }, { "epoch": 11.302073775004589, "grad_norm": 147.66880798339844, "learning_rate": 4.7366729349654585e-06, "loss": 0.3832, "num_input_tokens_seen": 132791408, "step": 61585 }, { "epoch": 11.30299137456414, "grad_norm": 14.598010063171387, "learning_rate": 4.735873292948167e-06, "loss": 0.1364, "num_input_tokens_seen": 132801584, "step": 61590 }, { "epoch": 11.303908974123692, "grad_norm": 77.30318450927734, "learning_rate": 4.735073657705315e-06, "loss": 0.2323, "num_input_tokens_seen": 132812304, "step": 61595 }, { "epoch": 11.304826573683245, "grad_norm": 2.3076648712158203, "learning_rate": 4.734274029257414e-06, "loss": 0.1225, "num_input_tokens_seen": 132823376, "step": 61600 }, { "epoch": 11.305744173242797, "grad_norm": 0.7541425824165344, "learning_rate": 4.733474407624972e-06, "loss": 0.0864, "num_input_tokens_seen": 132833968, "step": 61605 }, { "epoch": 11.306661772802348, "grad_norm": 15.719139099121094, "learning_rate": 4.732674792828497e-06, "loss": 0.0998, "num_input_tokens_seen": 132845584, "step": 61610 }, { "epoch": 11.307579372361902, "grad_norm": 107.87965393066406, "learning_rate": 4.7318751848885004e-06, "loss": 0.289, "num_input_tokens_seen": 132856816, "step": 61615 }, { "epoch": 11.308496971921453, "grad_norm": 9.94608211517334, "learning_rate": 4.73107558382549e-06, "loss": 0.193, "num_input_tokens_seen": 132868080, "step": 61620 }, { "epoch": 11.309414571481005, "grad_norm": 33.3323860168457, "learning_rate": 4.730275989659974e-06, "loss": 0.1382, "num_input_tokens_seen": 132879536, "step": 61625 }, { "epoch": 11.310332171040558, "grad_norm": 0.4739283323287964, "learning_rate": 4.729476402412461e-06, "loss": 0.2571, "num_input_tokens_seen": 132891216, "step": 61630 }, { "epoch": 11.31124977060011, "grad_norm": 18.533306121826172, "learning_rate": 4.728676822103457e-06, "loss": 0.1301, "num_input_tokens_seen": 132902032, "step": 61635 }, { "epoch": 11.312167370159662, "grad_norm": 0.08942966163158417, "learning_rate": 4.7278772487534745e-06, "loss": 0.1462, "num_input_tokens_seen": 132911568, "step": 61640 }, { "epoch": 11.313084969719215, "grad_norm": 19.59427833557129, "learning_rate": 4.727077682383018e-06, "loss": 0.4597, "num_input_tokens_seen": 132922288, "step": 61645 }, { "epoch": 11.314002569278767, "grad_norm": 16.895116806030273, "learning_rate": 4.7262781230125966e-06, "loss": 0.0623, "num_input_tokens_seen": 132931504, "step": 61650 }, { "epoch": 11.314920168838318, "grad_norm": 0.29362624883651733, "learning_rate": 4.725478570662715e-06, "loss": 0.2474, "num_input_tokens_seen": 132941456, "step": 61655 }, { "epoch": 11.315837768397872, "grad_norm": 0.25624990463256836, "learning_rate": 4.724679025353885e-06, "loss": 0.1561, "num_input_tokens_seen": 132952016, "step": 61660 }, { "epoch": 11.316755367957423, "grad_norm": 35.133541107177734, "learning_rate": 4.7238794871066105e-06, "loss": 0.2917, "num_input_tokens_seen": 132963344, "step": 61665 }, { "epoch": 11.317672967516975, "grad_norm": 2.175147771835327, "learning_rate": 4.723079955941397e-06, "loss": 0.238, "num_input_tokens_seen": 132973840, "step": 61670 }, { "epoch": 11.318590567076528, "grad_norm": 23.5845947265625, "learning_rate": 4.7222804318787555e-06, "loss": 0.1148, "num_input_tokens_seen": 132983696, "step": 61675 }, { "epoch": 11.31950816663608, "grad_norm": 140.2607879638672, "learning_rate": 4.7214809149391914e-06, "loss": 0.6085, "num_input_tokens_seen": 132994288, "step": 61680 }, { "epoch": 11.320425766195632, "grad_norm": 43.039039611816406, "learning_rate": 4.720681405143207e-06, "loss": 0.2584, "num_input_tokens_seen": 133002960, "step": 61685 }, { "epoch": 11.321343365755185, "grad_norm": 0.31276366114616394, "learning_rate": 4.719881902511315e-06, "loss": 0.0049, "num_input_tokens_seen": 133012304, "step": 61690 }, { "epoch": 11.322260965314737, "grad_norm": 0.04328326880931854, "learning_rate": 4.7190824070640176e-06, "loss": 0.0902, "num_input_tokens_seen": 133020944, "step": 61695 }, { "epoch": 11.323178564874288, "grad_norm": 0.21663694083690643, "learning_rate": 4.7182829188218205e-06, "loss": 0.1579, "num_input_tokens_seen": 133033776, "step": 61700 }, { "epoch": 11.324096164433842, "grad_norm": 0.12237731367349625, "learning_rate": 4.7174834378052294e-06, "loss": 0.1153, "num_input_tokens_seen": 133044240, "step": 61705 }, { "epoch": 11.325013763993393, "grad_norm": 139.5925750732422, "learning_rate": 4.71668396403475e-06, "loss": 0.0861, "num_input_tokens_seen": 133055888, "step": 61710 }, { "epoch": 11.325931363552945, "grad_norm": 31.804967880249023, "learning_rate": 4.715884497530889e-06, "loss": 0.0864, "num_input_tokens_seen": 133066160, "step": 61715 }, { "epoch": 11.326848963112498, "grad_norm": 0.027766775339841843, "learning_rate": 4.7150850383141485e-06, "loss": 0.155, "num_input_tokens_seen": 133077648, "step": 61720 }, { "epoch": 11.32776656267205, "grad_norm": 109.90568542480469, "learning_rate": 4.7142855864050375e-06, "loss": 0.1985, "num_input_tokens_seen": 133087952, "step": 61725 }, { "epoch": 11.328684162231601, "grad_norm": 0.07935614883899689, "learning_rate": 4.713486141824058e-06, "loss": 0.1195, "num_input_tokens_seen": 133098416, "step": 61730 }, { "epoch": 11.329601761791155, "grad_norm": 19.46822738647461, "learning_rate": 4.7126867045917125e-06, "loss": 0.0446, "num_input_tokens_seen": 133108528, "step": 61735 }, { "epoch": 11.330519361350706, "grad_norm": 0.30916929244995117, "learning_rate": 4.71188727472851e-06, "loss": 0.0019, "num_input_tokens_seen": 133119728, "step": 61740 }, { "epoch": 11.331436960910258, "grad_norm": 19.83965301513672, "learning_rate": 4.711087852254953e-06, "loss": 0.2275, "num_input_tokens_seen": 133130640, "step": 61745 }, { "epoch": 11.332354560469811, "grad_norm": 48.635597229003906, "learning_rate": 4.710288437191544e-06, "loss": 0.1722, "num_input_tokens_seen": 133141488, "step": 61750 }, { "epoch": 11.333272160029363, "grad_norm": 65.26424407958984, "learning_rate": 4.709489029558785e-06, "loss": 0.2161, "num_input_tokens_seen": 133152272, "step": 61755 }, { "epoch": 11.334189759588915, "grad_norm": 0.0077873957343399525, "learning_rate": 4.708689629377185e-06, "loss": 0.3987, "num_input_tokens_seen": 133162768, "step": 61760 }, { "epoch": 11.335107359148468, "grad_norm": 46.73546600341797, "learning_rate": 4.707890236667244e-06, "loss": 0.1879, "num_input_tokens_seen": 133174256, "step": 61765 }, { "epoch": 11.33602495870802, "grad_norm": 5.131185531616211, "learning_rate": 4.707090851449465e-06, "loss": 0.1143, "num_input_tokens_seen": 133184752, "step": 61770 }, { "epoch": 11.336942558267571, "grad_norm": 92.56732940673828, "learning_rate": 4.706291473744352e-06, "loss": 0.1884, "num_input_tokens_seen": 133195280, "step": 61775 }, { "epoch": 11.337860157827125, "grad_norm": 16.53589630126953, "learning_rate": 4.705492103572409e-06, "loss": 0.2083, "num_input_tokens_seen": 133205680, "step": 61780 }, { "epoch": 11.338777757386676, "grad_norm": 1.0113366842269897, "learning_rate": 4.7046927409541356e-06, "loss": 0.2099, "num_input_tokens_seen": 133216272, "step": 61785 }, { "epoch": 11.339695356946228, "grad_norm": 0.11087367683649063, "learning_rate": 4.703893385910035e-06, "loss": 0.2332, "num_input_tokens_seen": 133227504, "step": 61790 }, { "epoch": 11.340612956505781, "grad_norm": 1.1185685396194458, "learning_rate": 4.703094038460612e-06, "loss": 0.1225, "num_input_tokens_seen": 133238384, "step": 61795 }, { "epoch": 11.341530556065333, "grad_norm": 1.7930430173873901, "learning_rate": 4.7022946986263655e-06, "loss": 0.0188, "num_input_tokens_seen": 133248944, "step": 61800 }, { "epoch": 11.342448155624885, "grad_norm": 0.13376614451408386, "learning_rate": 4.7014953664277975e-06, "loss": 0.0236, "num_input_tokens_seen": 133258384, "step": 61805 }, { "epoch": 11.343365755184438, "grad_norm": 36.32100296020508, "learning_rate": 4.700696041885413e-06, "loss": 0.0571, "num_input_tokens_seen": 133268976, "step": 61810 }, { "epoch": 11.34428335474399, "grad_norm": 19.53675079345703, "learning_rate": 4.699896725019711e-06, "loss": 0.1787, "num_input_tokens_seen": 133280240, "step": 61815 }, { "epoch": 11.345200954303541, "grad_norm": 1.693803071975708, "learning_rate": 4.699097415851191e-06, "loss": 0.0782, "num_input_tokens_seen": 133291984, "step": 61820 }, { "epoch": 11.346118553863095, "grad_norm": 0.147267147898674, "learning_rate": 4.698298114400358e-06, "loss": 0.002, "num_input_tokens_seen": 133303472, "step": 61825 }, { "epoch": 11.347036153422646, "grad_norm": 0.30836424231529236, "learning_rate": 4.69749882068771e-06, "loss": 0.4105, "num_input_tokens_seen": 133314704, "step": 61830 }, { "epoch": 11.347953752982198, "grad_norm": 1.7055702209472656, "learning_rate": 4.696699534733748e-06, "loss": 0.3047, "num_input_tokens_seen": 133326736, "step": 61835 }, { "epoch": 11.348871352541751, "grad_norm": 16.21973419189453, "learning_rate": 4.695900256558973e-06, "loss": 0.0107, "num_input_tokens_seen": 133337904, "step": 61840 }, { "epoch": 11.349788952101303, "grad_norm": 20.56903648376465, "learning_rate": 4.695100986183885e-06, "loss": 0.2509, "num_input_tokens_seen": 133348592, "step": 61845 }, { "epoch": 11.350706551660855, "grad_norm": 0.2302914559841156, "learning_rate": 4.694301723628986e-06, "loss": 0.0032, "num_input_tokens_seen": 133359056, "step": 61850 }, { "epoch": 11.351624151220408, "grad_norm": 201.83523559570312, "learning_rate": 4.69350246891477e-06, "loss": 0.157, "num_input_tokens_seen": 133369488, "step": 61855 }, { "epoch": 11.35254175077996, "grad_norm": 0.21730723977088928, "learning_rate": 4.6927032220617445e-06, "loss": 0.0771, "num_input_tokens_seen": 133381968, "step": 61860 }, { "epoch": 11.353459350339511, "grad_norm": 4.855137348175049, "learning_rate": 4.6919039830904035e-06, "loss": 0.3638, "num_input_tokens_seen": 133392816, "step": 61865 }, { "epoch": 11.354376949899065, "grad_norm": 10.421005249023438, "learning_rate": 4.691104752021246e-06, "loss": 0.0374, "num_input_tokens_seen": 133403984, "step": 61870 }, { "epoch": 11.355294549458616, "grad_norm": 0.1061350628733635, "learning_rate": 4.690305528874775e-06, "loss": 0.2307, "num_input_tokens_seen": 133415184, "step": 61875 }, { "epoch": 11.356212149018168, "grad_norm": 42.13624954223633, "learning_rate": 4.689506313671489e-06, "loss": 0.4517, "num_input_tokens_seen": 133425328, "step": 61880 }, { "epoch": 11.357129748577721, "grad_norm": 0.21103046834468842, "learning_rate": 4.6887071064318825e-06, "loss": 0.1027, "num_input_tokens_seen": 133434896, "step": 61885 }, { "epoch": 11.358047348137273, "grad_norm": 0.3599526584148407, "learning_rate": 4.687907907176455e-06, "loss": 0.0228, "num_input_tokens_seen": 133443792, "step": 61890 }, { "epoch": 11.358964947696824, "grad_norm": 4.994693756103516, "learning_rate": 4.687108715925709e-06, "loss": 0.3014, "num_input_tokens_seen": 133455184, "step": 61895 }, { "epoch": 11.359882547256378, "grad_norm": 1.7498605251312256, "learning_rate": 4.686309532700138e-06, "loss": 0.1663, "num_input_tokens_seen": 133466736, "step": 61900 }, { "epoch": 11.36080014681593, "grad_norm": 0.7072308659553528, "learning_rate": 4.68551035752024e-06, "loss": 0.1441, "num_input_tokens_seen": 133478064, "step": 61905 }, { "epoch": 11.361717746375481, "grad_norm": 0.11824370920658112, "learning_rate": 4.684711190406516e-06, "loss": 0.1305, "num_input_tokens_seen": 133489200, "step": 61910 }, { "epoch": 11.362635345935034, "grad_norm": 169.5522003173828, "learning_rate": 4.683912031379461e-06, "loss": 0.1232, "num_input_tokens_seen": 133499184, "step": 61915 }, { "epoch": 11.363552945494586, "grad_norm": 2.656012773513794, "learning_rate": 4.6831128804595715e-06, "loss": 0.0892, "num_input_tokens_seen": 133509968, "step": 61920 }, { "epoch": 11.364470545054138, "grad_norm": 6.0678911209106445, "learning_rate": 4.682313737667347e-06, "loss": 0.0377, "num_input_tokens_seen": 133520688, "step": 61925 }, { "epoch": 11.365388144613691, "grad_norm": 1.1261075735092163, "learning_rate": 4.681514603023281e-06, "loss": 0.0219, "num_input_tokens_seen": 133530224, "step": 61930 }, { "epoch": 11.366305744173243, "grad_norm": 38.89134979248047, "learning_rate": 4.680715476547873e-06, "loss": 0.0658, "num_input_tokens_seen": 133541776, "step": 61935 }, { "epoch": 11.367223343732794, "grad_norm": 0.6059232354164124, "learning_rate": 4.679916358261617e-06, "loss": 0.1481, "num_input_tokens_seen": 133551568, "step": 61940 }, { "epoch": 11.368140943292348, "grad_norm": 4.0156168937683105, "learning_rate": 4.679117248185012e-06, "loss": 0.1248, "num_input_tokens_seen": 133562672, "step": 61945 }, { "epoch": 11.3690585428519, "grad_norm": 25.071754455566406, "learning_rate": 4.6783181463385524e-06, "loss": 0.1107, "num_input_tokens_seen": 133572944, "step": 61950 }, { "epoch": 11.369976142411451, "grad_norm": 11.308279037475586, "learning_rate": 4.677519052742731e-06, "loss": 0.108, "num_input_tokens_seen": 133584528, "step": 61955 }, { "epoch": 11.370893741971004, "grad_norm": 0.38201436400413513, "learning_rate": 4.676719967418049e-06, "loss": 0.4348, "num_input_tokens_seen": 133596208, "step": 61960 }, { "epoch": 11.371811341530556, "grad_norm": 24.687618255615234, "learning_rate": 4.675920890384998e-06, "loss": 0.3381, "num_input_tokens_seen": 133608400, "step": 61965 }, { "epoch": 11.372728941090108, "grad_norm": 24.884336471557617, "learning_rate": 4.6751218216640745e-06, "loss": 0.0701, "num_input_tokens_seen": 133619472, "step": 61970 }, { "epoch": 11.373646540649661, "grad_norm": 2.6575770378112793, "learning_rate": 4.67432276127577e-06, "loss": 0.0088, "num_input_tokens_seen": 133628592, "step": 61975 }, { "epoch": 11.374564140209213, "grad_norm": 3.4000911712646484, "learning_rate": 4.673523709240584e-06, "loss": 0.0388, "num_input_tokens_seen": 133639536, "step": 61980 }, { "epoch": 11.375481739768764, "grad_norm": 34.10251235961914, "learning_rate": 4.67272466557901e-06, "loss": 0.1338, "num_input_tokens_seen": 133649680, "step": 61985 }, { "epoch": 11.376399339328318, "grad_norm": 38.61806869506836, "learning_rate": 4.671925630311538e-06, "loss": 0.1847, "num_input_tokens_seen": 133660720, "step": 61990 }, { "epoch": 11.37731693888787, "grad_norm": 36.02619171142578, "learning_rate": 4.671126603458668e-06, "loss": 0.0896, "num_input_tokens_seen": 133672208, "step": 61995 }, { "epoch": 11.37823453844742, "grad_norm": 132.7288360595703, "learning_rate": 4.67032758504089e-06, "loss": 0.2448, "num_input_tokens_seen": 133683440, "step": 62000 }, { "epoch": 11.379152138006974, "grad_norm": 0.22110511362552643, "learning_rate": 4.669528575078696e-06, "loss": 0.1387, "num_input_tokens_seen": 133694224, "step": 62005 }, { "epoch": 11.380069737566526, "grad_norm": 91.36714172363281, "learning_rate": 4.668729573592585e-06, "loss": 0.0972, "num_input_tokens_seen": 133705328, "step": 62010 }, { "epoch": 11.380987337126077, "grad_norm": 1.171434998512268, "learning_rate": 4.667930580603047e-06, "loss": 0.0058, "num_input_tokens_seen": 133715600, "step": 62015 }, { "epoch": 11.38190493668563, "grad_norm": 0.04181436449289322, "learning_rate": 4.667131596130575e-06, "loss": 0.0183, "num_input_tokens_seen": 133725808, "step": 62020 }, { "epoch": 11.382822536245182, "grad_norm": 0.15960437059402466, "learning_rate": 4.6663326201956585e-06, "loss": 0.1138, "num_input_tokens_seen": 133735664, "step": 62025 }, { "epoch": 11.383740135804734, "grad_norm": 0.11066946387290955, "learning_rate": 4.665533652818796e-06, "loss": 0.1804, "num_input_tokens_seen": 133744880, "step": 62030 }, { "epoch": 11.384657735364287, "grad_norm": 5.662609100341797, "learning_rate": 4.664734694020477e-06, "loss": 0.239, "num_input_tokens_seen": 133755792, "step": 62035 }, { "epoch": 11.385575334923839, "grad_norm": 12.72167682647705, "learning_rate": 4.663935743821193e-06, "loss": 0.2587, "num_input_tokens_seen": 133766960, "step": 62040 }, { "epoch": 11.38649293448339, "grad_norm": 0.23185691237449646, "learning_rate": 4.663136802241436e-06, "loss": 0.1034, "num_input_tokens_seen": 133777232, "step": 62045 }, { "epoch": 11.387410534042944, "grad_norm": 58.529510498046875, "learning_rate": 4.662337869301699e-06, "loss": 0.1548, "num_input_tokens_seen": 133787504, "step": 62050 }, { "epoch": 11.388328133602496, "grad_norm": 0.0815734714269638, "learning_rate": 4.6615389450224696e-06, "loss": 0.2327, "num_input_tokens_seen": 133798448, "step": 62055 }, { "epoch": 11.389245733162047, "grad_norm": 0.1213693916797638, "learning_rate": 4.6607400294242444e-06, "loss": 0.002, "num_input_tokens_seen": 133809264, "step": 62060 }, { "epoch": 11.3901633327216, "grad_norm": 10.683683395385742, "learning_rate": 4.659941122527511e-06, "loss": 0.3872, "num_input_tokens_seen": 133820720, "step": 62065 }, { "epoch": 11.391080932281152, "grad_norm": 55.69926452636719, "learning_rate": 4.659142224352761e-06, "loss": 0.3746, "num_input_tokens_seen": 133831472, "step": 62070 }, { "epoch": 11.391998531840704, "grad_norm": 38.75717544555664, "learning_rate": 4.658343334920482e-06, "loss": 0.0348, "num_input_tokens_seen": 133841520, "step": 62075 }, { "epoch": 11.392916131400257, "grad_norm": 3.4652435779571533, "learning_rate": 4.65754445425117e-06, "loss": 0.1956, "num_input_tokens_seen": 133851056, "step": 62080 }, { "epoch": 11.393833730959809, "grad_norm": 5.231935977935791, "learning_rate": 4.656745582365312e-06, "loss": 0.0635, "num_input_tokens_seen": 133863216, "step": 62085 }, { "epoch": 11.39475133051936, "grad_norm": 1.1335968971252441, "learning_rate": 4.6559467192833956e-06, "loss": 0.1482, "num_input_tokens_seen": 133873360, "step": 62090 }, { "epoch": 11.395668930078914, "grad_norm": 60.319862365722656, "learning_rate": 4.655147865025914e-06, "loss": 0.0626, "num_input_tokens_seen": 133884944, "step": 62095 }, { "epoch": 11.396586529638466, "grad_norm": 0.38736751675605774, "learning_rate": 4.6543490196133566e-06, "loss": 0.0104, "num_input_tokens_seen": 133895920, "step": 62100 }, { "epoch": 11.397504129198017, "grad_norm": 18.621318817138672, "learning_rate": 4.65355018306621e-06, "loss": 0.1027, "num_input_tokens_seen": 133907312, "step": 62105 }, { "epoch": 11.39842172875757, "grad_norm": 10.125468254089355, "learning_rate": 4.652751355404962e-06, "loss": 0.0079, "num_input_tokens_seen": 133917360, "step": 62110 }, { "epoch": 11.399339328317122, "grad_norm": 0.034479156136512756, "learning_rate": 4.6519525366501066e-06, "loss": 0.4979, "num_input_tokens_seen": 133928880, "step": 62115 }, { "epoch": 11.400256927876674, "grad_norm": 0.12345065921545029, "learning_rate": 4.651153726822128e-06, "loss": 0.0889, "num_input_tokens_seen": 133939056, "step": 62120 }, { "epoch": 11.401174527436227, "grad_norm": 62.237266540527344, "learning_rate": 4.650354925941515e-06, "loss": 0.1534, "num_input_tokens_seen": 133948560, "step": 62125 }, { "epoch": 11.402092126995779, "grad_norm": 18.412281036376953, "learning_rate": 4.649556134028757e-06, "loss": 0.2269, "num_input_tokens_seen": 133959632, "step": 62130 }, { "epoch": 11.40300972655533, "grad_norm": 25.765644073486328, "learning_rate": 4.648757351104341e-06, "loss": 0.2804, "num_input_tokens_seen": 133971344, "step": 62135 }, { "epoch": 11.403927326114884, "grad_norm": 32.9318733215332, "learning_rate": 4.647958577188754e-06, "loss": 0.147, "num_input_tokens_seen": 133981712, "step": 62140 }, { "epoch": 11.404844925674436, "grad_norm": 0.10775226354598999, "learning_rate": 4.647159812302485e-06, "loss": 0.1522, "num_input_tokens_seen": 133992816, "step": 62145 }, { "epoch": 11.405762525233987, "grad_norm": 33.09447479248047, "learning_rate": 4.646361056466019e-06, "loss": 0.1006, "num_input_tokens_seen": 134003312, "step": 62150 }, { "epoch": 11.40668012479354, "grad_norm": 2.6053342819213867, "learning_rate": 4.6455623096998445e-06, "loss": 0.1171, "num_input_tokens_seen": 134015632, "step": 62155 }, { "epoch": 11.407597724353092, "grad_norm": 0.2548231780529022, "learning_rate": 4.6447635720244475e-06, "loss": 0.3174, "num_input_tokens_seen": 134027120, "step": 62160 }, { "epoch": 11.408515323912644, "grad_norm": 2.220276355743408, "learning_rate": 4.643964843460314e-06, "loss": 0.1696, "num_input_tokens_seen": 134038416, "step": 62165 }, { "epoch": 11.409432923472197, "grad_norm": 0.6328121423721313, "learning_rate": 4.643166124027931e-06, "loss": 0.1152, "num_input_tokens_seen": 134049104, "step": 62170 }, { "epoch": 11.410350523031749, "grad_norm": 72.57461547851562, "learning_rate": 4.642367413747782e-06, "loss": 0.0915, "num_input_tokens_seen": 134059440, "step": 62175 }, { "epoch": 11.4112681225913, "grad_norm": 0.2812580466270447, "learning_rate": 4.641568712640357e-06, "loss": 0.2363, "num_input_tokens_seen": 134070608, "step": 62180 }, { "epoch": 11.412185722150854, "grad_norm": 1.8354203701019287, "learning_rate": 4.640770020726137e-06, "loss": 0.0478, "num_input_tokens_seen": 134081392, "step": 62185 }, { "epoch": 11.413103321710405, "grad_norm": 0.1126677393913269, "learning_rate": 4.639971338025609e-06, "loss": 0.0566, "num_input_tokens_seen": 134091952, "step": 62190 }, { "epoch": 11.414020921269957, "grad_norm": 26.56327247619629, "learning_rate": 4.63917266455926e-06, "loss": 0.1737, "num_input_tokens_seen": 134104016, "step": 62195 }, { "epoch": 11.41493852082951, "grad_norm": 1.4390010833740234, "learning_rate": 4.638374000347573e-06, "loss": 0.3047, "num_input_tokens_seen": 134115184, "step": 62200 }, { "epoch": 11.415856120389062, "grad_norm": 0.03149227797985077, "learning_rate": 4.637575345411031e-06, "loss": 0.0055, "num_input_tokens_seen": 134125712, "step": 62205 }, { "epoch": 11.416773719948614, "grad_norm": 165.1276092529297, "learning_rate": 4.6367766997701195e-06, "loss": 0.178, "num_input_tokens_seen": 134136528, "step": 62210 }, { "epoch": 11.417691319508167, "grad_norm": 11.333499908447266, "learning_rate": 4.635978063445324e-06, "loss": 0.2296, "num_input_tokens_seen": 134148496, "step": 62215 }, { "epoch": 11.418608919067719, "grad_norm": 0.09848079830408096, "learning_rate": 4.635179436457127e-06, "loss": 0.4299, "num_input_tokens_seen": 134158320, "step": 62220 }, { "epoch": 11.41952651862727, "grad_norm": 44.46583557128906, "learning_rate": 4.63438081882601e-06, "loss": 0.1756, "num_input_tokens_seen": 134170544, "step": 62225 }, { "epoch": 11.420444118186824, "grad_norm": 1.0967192649841309, "learning_rate": 4.63358221057246e-06, "loss": 0.1097, "num_input_tokens_seen": 134181520, "step": 62230 }, { "epoch": 11.421361717746375, "grad_norm": 2.0410521030426025, "learning_rate": 4.632783611716959e-06, "loss": 0.0085, "num_input_tokens_seen": 134192016, "step": 62235 }, { "epoch": 11.422279317305927, "grad_norm": 0.2628186345100403, "learning_rate": 4.631985022279989e-06, "loss": 0.0251, "num_input_tokens_seen": 134202736, "step": 62240 }, { "epoch": 11.42319691686548, "grad_norm": 0.02050851844251156, "learning_rate": 4.631186442282032e-06, "loss": 0.1123, "num_input_tokens_seen": 134212816, "step": 62245 }, { "epoch": 11.424114516425032, "grad_norm": 0.42905235290527344, "learning_rate": 4.630387871743572e-06, "loss": 0.1327, "num_input_tokens_seen": 134222640, "step": 62250 }, { "epoch": 11.425032115984584, "grad_norm": 30.907085418701172, "learning_rate": 4.629589310685089e-06, "loss": 0.1654, "num_input_tokens_seen": 134234416, "step": 62255 }, { "epoch": 11.425949715544137, "grad_norm": 12.710488319396973, "learning_rate": 4.6287907591270665e-06, "loss": 0.0925, "num_input_tokens_seen": 134245680, "step": 62260 }, { "epoch": 11.426867315103689, "grad_norm": 6.616883754730225, "learning_rate": 4.627992217089987e-06, "loss": 0.0074, "num_input_tokens_seen": 134256784, "step": 62265 }, { "epoch": 11.42778491466324, "grad_norm": 262.388671875, "learning_rate": 4.62719368459433e-06, "loss": 0.2213, "num_input_tokens_seen": 134267248, "step": 62270 }, { "epoch": 11.428702514222794, "grad_norm": 54.97127914428711, "learning_rate": 4.626395161660575e-06, "loss": 0.0755, "num_input_tokens_seen": 134279344, "step": 62275 }, { "epoch": 11.429620113782345, "grad_norm": 0.27287212014198303, "learning_rate": 4.625596648309208e-06, "loss": 0.0211, "num_input_tokens_seen": 134289584, "step": 62280 }, { "epoch": 11.430537713341897, "grad_norm": 11.975672721862793, "learning_rate": 4.6247981445607055e-06, "loss": 0.1977, "num_input_tokens_seen": 134301520, "step": 62285 }, { "epoch": 11.43145531290145, "grad_norm": 12.587098121643066, "learning_rate": 4.62399965043555e-06, "loss": 0.2889, "num_input_tokens_seen": 134312144, "step": 62290 }, { "epoch": 11.432372912461002, "grad_norm": 0.4137974679470062, "learning_rate": 4.623201165954217e-06, "loss": 0.1579, "num_input_tokens_seen": 134323984, "step": 62295 }, { "epoch": 11.433290512020553, "grad_norm": 0.04730743542313576, "learning_rate": 4.6224026911371945e-06, "loss": 0.1299, "num_input_tokens_seen": 134334864, "step": 62300 }, { "epoch": 11.434208111580107, "grad_norm": 9.88096809387207, "learning_rate": 4.621604226004957e-06, "loss": 0.227, "num_input_tokens_seen": 134345456, "step": 62305 }, { "epoch": 11.435125711139658, "grad_norm": 44.540618896484375, "learning_rate": 4.620805770577982e-06, "loss": 0.4139, "num_input_tokens_seen": 134356816, "step": 62310 }, { "epoch": 11.43604331069921, "grad_norm": 18.11176872253418, "learning_rate": 4.620007324876753e-06, "loss": 0.1417, "num_input_tokens_seen": 134367312, "step": 62315 }, { "epoch": 11.436960910258763, "grad_norm": 1.6484341621398926, "learning_rate": 4.619208888921748e-06, "loss": 0.0117, "num_input_tokens_seen": 134377776, "step": 62320 }, { "epoch": 11.437878509818315, "grad_norm": 213.58042907714844, "learning_rate": 4.618410462733442e-06, "loss": 0.045, "num_input_tokens_seen": 134388880, "step": 62325 }, { "epoch": 11.438796109377867, "grad_norm": 31.863698959350586, "learning_rate": 4.617612046332319e-06, "loss": 0.3375, "num_input_tokens_seen": 134399088, "step": 62330 }, { "epoch": 11.43971370893742, "grad_norm": 1.4527491331100464, "learning_rate": 4.616813639738855e-06, "loss": 0.1031, "num_input_tokens_seen": 134410480, "step": 62335 }, { "epoch": 11.440631308496972, "grad_norm": 0.04105237126350403, "learning_rate": 4.616015242973526e-06, "loss": 0.1337, "num_input_tokens_seen": 134420784, "step": 62340 }, { "epoch": 11.441548908056523, "grad_norm": 2.7542076110839844, "learning_rate": 4.615216856056809e-06, "loss": 0.1888, "num_input_tokens_seen": 134431600, "step": 62345 }, { "epoch": 11.442466507616077, "grad_norm": 0.009545535780489445, "learning_rate": 4.614418479009186e-06, "loss": 0.322, "num_input_tokens_seen": 134442992, "step": 62350 }, { "epoch": 11.443384107175628, "grad_norm": 17.54336929321289, "learning_rate": 4.613620111851132e-06, "loss": 0.1042, "num_input_tokens_seen": 134452112, "step": 62355 }, { "epoch": 11.44430170673518, "grad_norm": 21.145795822143555, "learning_rate": 4.6128217546031205e-06, "loss": 0.108, "num_input_tokens_seen": 134461712, "step": 62360 }, { "epoch": 11.445219306294733, "grad_norm": 38.76097106933594, "learning_rate": 4.612023407285633e-06, "loss": 0.0238, "num_input_tokens_seen": 134473168, "step": 62365 }, { "epoch": 11.446136905854285, "grad_norm": 3.5518338680267334, "learning_rate": 4.611225069919144e-06, "loss": 0.0066, "num_input_tokens_seen": 134482800, "step": 62370 }, { "epoch": 11.447054505413837, "grad_norm": 52.412471771240234, "learning_rate": 4.610426742524126e-06, "loss": 0.4353, "num_input_tokens_seen": 134494384, "step": 62375 }, { "epoch": 11.44797210497339, "grad_norm": 0.04505039006471634, "learning_rate": 4.609628425121061e-06, "loss": 0.0426, "num_input_tokens_seen": 134505840, "step": 62380 }, { "epoch": 11.448889704532942, "grad_norm": 0.4335530996322632, "learning_rate": 4.6088301177304225e-06, "loss": 0.5457, "num_input_tokens_seen": 134516784, "step": 62385 }, { "epoch": 11.449807304092493, "grad_norm": 0.06274052709341049, "learning_rate": 4.608031820372686e-06, "loss": 0.1162, "num_input_tokens_seen": 134528048, "step": 62390 }, { "epoch": 11.450724903652047, "grad_norm": 0.37133824825286865, "learning_rate": 4.607233533068322e-06, "loss": 0.0024, "num_input_tokens_seen": 134539152, "step": 62395 }, { "epoch": 11.451642503211598, "grad_norm": 39.83402633666992, "learning_rate": 4.6064352558378115e-06, "loss": 0.6461, "num_input_tokens_seen": 134549680, "step": 62400 }, { "epoch": 11.45256010277115, "grad_norm": 0.19604606926441193, "learning_rate": 4.605636988701627e-06, "loss": 0.1348, "num_input_tokens_seen": 134561424, "step": 62405 }, { "epoch": 11.453477702330703, "grad_norm": 0.32776227593421936, "learning_rate": 4.604838731680241e-06, "loss": 0.0047, "num_input_tokens_seen": 134573360, "step": 62410 }, { "epoch": 11.454395301890255, "grad_norm": 139.81100463867188, "learning_rate": 4.60404048479413e-06, "loss": 0.1105, "num_input_tokens_seen": 134584112, "step": 62415 }, { "epoch": 11.455312901449807, "grad_norm": 0.09546017646789551, "learning_rate": 4.603242248063768e-06, "loss": 0.1311, "num_input_tokens_seen": 134593776, "step": 62420 }, { "epoch": 11.45623050100936, "grad_norm": 40.311283111572266, "learning_rate": 4.602444021509626e-06, "loss": 0.1456, "num_input_tokens_seen": 134605520, "step": 62425 }, { "epoch": 11.457148100568912, "grad_norm": 20.950122833251953, "learning_rate": 4.601645805152178e-06, "loss": 0.2413, "num_input_tokens_seen": 134616208, "step": 62430 }, { "epoch": 11.458065700128463, "grad_norm": 0.022273117676377296, "learning_rate": 4.6008475990119e-06, "loss": 0.0786, "num_input_tokens_seen": 134626192, "step": 62435 }, { "epoch": 11.458983299688017, "grad_norm": 0.02987794578075409, "learning_rate": 4.600049403109262e-06, "loss": 0.0052, "num_input_tokens_seen": 134637488, "step": 62440 }, { "epoch": 11.459900899247568, "grad_norm": 13.699726104736328, "learning_rate": 4.5992512174647345e-06, "loss": 0.1578, "num_input_tokens_seen": 134648208, "step": 62445 }, { "epoch": 11.46081849880712, "grad_norm": 0.08710109442472458, "learning_rate": 4.598453042098794e-06, "loss": 0.1537, "num_input_tokens_seen": 134658576, "step": 62450 }, { "epoch": 11.461736098366673, "grad_norm": 0.02266668528318405, "learning_rate": 4.597654877031911e-06, "loss": 0.2461, "num_input_tokens_seen": 134669136, "step": 62455 }, { "epoch": 11.462653697926225, "grad_norm": 31.106081008911133, "learning_rate": 4.596856722284556e-06, "loss": 0.154, "num_input_tokens_seen": 134679728, "step": 62460 }, { "epoch": 11.463571297485776, "grad_norm": 38.58162307739258, "learning_rate": 4.5960585778772025e-06, "loss": 0.1758, "num_input_tokens_seen": 134690096, "step": 62465 }, { "epoch": 11.46448889704533, "grad_norm": 3.758310556411743, "learning_rate": 4.595260443830319e-06, "loss": 0.008, "num_input_tokens_seen": 134701296, "step": 62470 }, { "epoch": 11.465406496604881, "grad_norm": 43.829654693603516, "learning_rate": 4.594462320164378e-06, "loss": 0.192, "num_input_tokens_seen": 134711632, "step": 62475 }, { "epoch": 11.466324096164433, "grad_norm": 0.05704789236187935, "learning_rate": 4.59366420689985e-06, "loss": 0.2845, "num_input_tokens_seen": 134723184, "step": 62480 }, { "epoch": 11.467241695723986, "grad_norm": 35.53194046020508, "learning_rate": 4.5928661040572065e-06, "loss": 0.2315, "num_input_tokens_seen": 134732560, "step": 62485 }, { "epoch": 11.468159295283538, "grad_norm": 0.38579532504081726, "learning_rate": 4.592068011656916e-06, "loss": 0.0074, "num_input_tokens_seen": 134742768, "step": 62490 }, { "epoch": 11.46907689484309, "grad_norm": 3.461235761642456, "learning_rate": 4.591269929719447e-06, "loss": 0.1829, "num_input_tokens_seen": 134754576, "step": 62495 }, { "epoch": 11.469994494402643, "grad_norm": 109.8883285522461, "learning_rate": 4.590471858265273e-06, "loss": 0.371, "num_input_tokens_seen": 134764880, "step": 62500 }, { "epoch": 11.470912093962195, "grad_norm": 0.31168824434280396, "learning_rate": 4.589673797314861e-06, "loss": 0.0027, "num_input_tokens_seen": 134775568, "step": 62505 }, { "epoch": 11.471829693521746, "grad_norm": 0.051445744931697845, "learning_rate": 4.5888757468886774e-06, "loss": 0.1444, "num_input_tokens_seen": 134785392, "step": 62510 }, { "epoch": 11.4727472930813, "grad_norm": 0.07621587812900543, "learning_rate": 4.588077707007196e-06, "loss": 0.4837, "num_input_tokens_seen": 134796112, "step": 62515 }, { "epoch": 11.473664892640851, "grad_norm": 1.4289755821228027, "learning_rate": 4.587279677690883e-06, "loss": 0.1155, "num_input_tokens_seen": 134805904, "step": 62520 }, { "epoch": 11.474582492200403, "grad_norm": 178.8018798828125, "learning_rate": 4.586481658960208e-06, "loss": 0.275, "num_input_tokens_seen": 134817616, "step": 62525 }, { "epoch": 11.475500091759956, "grad_norm": 0.38273802399635315, "learning_rate": 4.585683650835634e-06, "loss": 0.1676, "num_input_tokens_seen": 134828336, "step": 62530 }, { "epoch": 11.476417691319508, "grad_norm": 0.06197620555758476, "learning_rate": 4.584885653337634e-06, "loss": 0.2231, "num_input_tokens_seen": 134839664, "step": 62535 }, { "epoch": 11.47733529087906, "grad_norm": 0.2847987115383148, "learning_rate": 4.584087666486675e-06, "loss": 0.0712, "num_input_tokens_seen": 134849904, "step": 62540 }, { "epoch": 11.478252890438613, "grad_norm": 8.654703140258789, "learning_rate": 4.5832896903032195e-06, "loss": 0.0146, "num_input_tokens_seen": 134861904, "step": 62545 }, { "epoch": 11.479170489998165, "grad_norm": 0.04595477133989334, "learning_rate": 4.582491724807741e-06, "loss": 0.0563, "num_input_tokens_seen": 134871120, "step": 62550 }, { "epoch": 11.480088089557716, "grad_norm": 63.82405090332031, "learning_rate": 4.581693770020701e-06, "loss": 0.1847, "num_input_tokens_seen": 134882160, "step": 62555 }, { "epoch": 11.48100568911727, "grad_norm": 0.07344600558280945, "learning_rate": 4.580895825962568e-06, "loss": 0.075, "num_input_tokens_seen": 134892976, "step": 62560 }, { "epoch": 11.481923288676821, "grad_norm": 189.6013946533203, "learning_rate": 4.580097892653808e-06, "loss": 0.5064, "num_input_tokens_seen": 134904688, "step": 62565 }, { "epoch": 11.482840888236373, "grad_norm": 77.46107482910156, "learning_rate": 4.579299970114884e-06, "loss": 0.1515, "num_input_tokens_seen": 134916016, "step": 62570 }, { "epoch": 11.483758487795926, "grad_norm": 0.05599026009440422, "learning_rate": 4.578502058366266e-06, "loss": 0.0007, "num_input_tokens_seen": 134926992, "step": 62575 }, { "epoch": 11.484676087355478, "grad_norm": 0.02542080543935299, "learning_rate": 4.577704157428416e-06, "loss": 0.2291, "num_input_tokens_seen": 134936752, "step": 62580 }, { "epoch": 11.48559368691503, "grad_norm": 0.22875458002090454, "learning_rate": 4.576906267321801e-06, "loss": 0.2913, "num_input_tokens_seen": 134946032, "step": 62585 }, { "epoch": 11.486511286474583, "grad_norm": 36.71541976928711, "learning_rate": 4.576108388066884e-06, "loss": 0.1218, "num_input_tokens_seen": 134958320, "step": 62590 }, { "epoch": 11.487428886034134, "grad_norm": 638.3435668945312, "learning_rate": 4.575310519684127e-06, "loss": 0.1366, "num_input_tokens_seen": 134968496, "step": 62595 }, { "epoch": 11.488346485593686, "grad_norm": 75.13896942138672, "learning_rate": 4.574512662194001e-06, "loss": 0.1989, "num_input_tokens_seen": 134980144, "step": 62600 }, { "epoch": 11.48926408515324, "grad_norm": 65.36732482910156, "learning_rate": 4.573714815616964e-06, "loss": 0.2005, "num_input_tokens_seen": 134990416, "step": 62605 }, { "epoch": 11.490181684712791, "grad_norm": 0.5637589693069458, "learning_rate": 4.572916979973482e-06, "loss": 0.0611, "num_input_tokens_seen": 135001456, "step": 62610 }, { "epoch": 11.491099284272343, "grad_norm": 0.010305611416697502, "learning_rate": 4.572119155284016e-06, "loss": 0.2023, "num_input_tokens_seen": 135011472, "step": 62615 }, { "epoch": 11.492016883831896, "grad_norm": 50.30260467529297, "learning_rate": 4.571321341569032e-06, "loss": 0.0225, "num_input_tokens_seen": 135022256, "step": 62620 }, { "epoch": 11.492934483391448, "grad_norm": 0.057668473571538925, "learning_rate": 4.5705235388489914e-06, "loss": 0.2351, "num_input_tokens_seen": 135032784, "step": 62625 }, { "epoch": 11.493852082951, "grad_norm": 142.69639587402344, "learning_rate": 4.569725747144354e-06, "loss": 0.1521, "num_input_tokens_seen": 135044528, "step": 62630 }, { "epoch": 11.494769682510553, "grad_norm": 93.22688293457031, "learning_rate": 4.5689279664755866e-06, "loss": 0.0345, "num_input_tokens_seen": 135053776, "step": 62635 }, { "epoch": 11.495687282070104, "grad_norm": 0.13575196266174316, "learning_rate": 4.568130196863149e-06, "loss": 0.0952, "num_input_tokens_seen": 135064880, "step": 62640 }, { "epoch": 11.496604881629656, "grad_norm": 22.99969482421875, "learning_rate": 4.5673324383275e-06, "loss": 0.2704, "num_input_tokens_seen": 135075344, "step": 62645 }, { "epoch": 11.49752248118921, "grad_norm": 37.225643157958984, "learning_rate": 4.566534690889106e-06, "loss": 0.2227, "num_input_tokens_seen": 135087344, "step": 62650 }, { "epoch": 11.498440080748761, "grad_norm": 25.863725662231445, "learning_rate": 4.5657369545684245e-06, "loss": 0.1395, "num_input_tokens_seen": 135098512, "step": 62655 }, { "epoch": 11.499357680308313, "grad_norm": 28.537498474121094, "learning_rate": 4.5649392293859175e-06, "loss": 0.0142, "num_input_tokens_seen": 135107856, "step": 62660 }, { "epoch": 11.500275279867866, "grad_norm": 0.06939030438661575, "learning_rate": 4.564141515362043e-06, "loss": 0.3295, "num_input_tokens_seen": 135118256, "step": 62665 }, { "epoch": 11.501192879427418, "grad_norm": 23.444631576538086, "learning_rate": 4.563343812517265e-06, "loss": 0.2548, "num_input_tokens_seen": 135128944, "step": 62670 }, { "epoch": 11.50211047898697, "grad_norm": 0.2115328311920166, "learning_rate": 4.562546120872042e-06, "loss": 0.2461, "num_input_tokens_seen": 135140400, "step": 62675 }, { "epoch": 11.503028078546523, "grad_norm": 0.07356103509664536, "learning_rate": 4.561748440446831e-06, "loss": 0.0006, "num_input_tokens_seen": 135151152, "step": 62680 }, { "epoch": 11.503945678106074, "grad_norm": 0.022443166002631187, "learning_rate": 4.560950771262096e-06, "loss": 0.1277, "num_input_tokens_seen": 135161616, "step": 62685 }, { "epoch": 11.504863277665628, "grad_norm": 0.18792767822742462, "learning_rate": 4.560153113338291e-06, "loss": 0.0028, "num_input_tokens_seen": 135171440, "step": 62690 }, { "epoch": 11.50578087722518, "grad_norm": 101.78207397460938, "learning_rate": 4.559355466695878e-06, "loss": 0.3642, "num_input_tokens_seen": 135182768, "step": 62695 }, { "epoch": 11.506698476784731, "grad_norm": 0.0422501303255558, "learning_rate": 4.558557831355313e-06, "loss": 0.1396, "num_input_tokens_seen": 135192528, "step": 62700 }, { "epoch": 11.507616076344284, "grad_norm": 0.17840223014354706, "learning_rate": 4.557760207337057e-06, "loss": 0.1172, "num_input_tokens_seen": 135202640, "step": 62705 }, { "epoch": 11.508533675903836, "grad_norm": 0.051946233958005905, "learning_rate": 4.556962594661567e-06, "loss": 0.153, "num_input_tokens_seen": 135213008, "step": 62710 }, { "epoch": 11.509451275463388, "grad_norm": 0.1405607908964157, "learning_rate": 4.556164993349296e-06, "loss": 0.2111, "num_input_tokens_seen": 135224272, "step": 62715 }, { "epoch": 11.510368875022941, "grad_norm": 30.939496994018555, "learning_rate": 4.555367403420709e-06, "loss": 0.2507, "num_input_tokens_seen": 135234896, "step": 62720 }, { "epoch": 11.511286474582493, "grad_norm": 0.03830881789326668, "learning_rate": 4.554569824896258e-06, "loss": 0.1933, "num_input_tokens_seen": 135244656, "step": 62725 }, { "epoch": 11.512204074142044, "grad_norm": 235.1367950439453, "learning_rate": 4.553772257796398e-06, "loss": 0.0401, "num_input_tokens_seen": 135254768, "step": 62730 }, { "epoch": 11.513121673701598, "grad_norm": 255.63214111328125, "learning_rate": 4.55297470214159e-06, "loss": 0.4858, "num_input_tokens_seen": 135265680, "step": 62735 }, { "epoch": 11.51403927326115, "grad_norm": 15.33263874053955, "learning_rate": 4.55217715795229e-06, "loss": 0.2386, "num_input_tokens_seen": 135274480, "step": 62740 }, { "epoch": 11.5149568728207, "grad_norm": 0.15947896242141724, "learning_rate": 4.55137962524895e-06, "loss": 0.3021, "num_input_tokens_seen": 135284432, "step": 62745 }, { "epoch": 11.515874472380254, "grad_norm": 0.18040651082992554, "learning_rate": 4.550582104052025e-06, "loss": 0.29, "num_input_tokens_seen": 135295920, "step": 62750 }, { "epoch": 11.516792071939806, "grad_norm": 73.20712280273438, "learning_rate": 4.549784594381976e-06, "loss": 0.1876, "num_input_tokens_seen": 135306704, "step": 62755 }, { "epoch": 11.517709671499357, "grad_norm": 0.27070674300193787, "learning_rate": 4.548987096259254e-06, "loss": 0.0146, "num_input_tokens_seen": 135317200, "step": 62760 }, { "epoch": 11.51862727105891, "grad_norm": 117.12837219238281, "learning_rate": 4.548189609704311e-06, "loss": 0.3838, "num_input_tokens_seen": 135328208, "step": 62765 }, { "epoch": 11.519544870618462, "grad_norm": 155.75228881835938, "learning_rate": 4.547392134737607e-06, "loss": 0.1703, "num_input_tokens_seen": 135339248, "step": 62770 }, { "epoch": 11.520462470178014, "grad_norm": 0.05186642333865166, "learning_rate": 4.546594671379594e-06, "loss": 0.0192, "num_input_tokens_seen": 135349040, "step": 62775 }, { "epoch": 11.521380069737567, "grad_norm": 5.242249011993408, "learning_rate": 4.545797219650723e-06, "loss": 0.1468, "num_input_tokens_seen": 135360848, "step": 62780 }, { "epoch": 11.522297669297119, "grad_norm": 90.9189224243164, "learning_rate": 4.54499977957145e-06, "loss": 0.1188, "num_input_tokens_seen": 135369872, "step": 62785 }, { "epoch": 11.52321526885667, "grad_norm": 127.51036834716797, "learning_rate": 4.544202351162227e-06, "loss": 0.1262, "num_input_tokens_seen": 135380848, "step": 62790 }, { "epoch": 11.524132868416224, "grad_norm": 33.092411041259766, "learning_rate": 4.543404934443509e-06, "loss": 0.1544, "num_input_tokens_seen": 135390800, "step": 62795 }, { "epoch": 11.525050467975776, "grad_norm": 275.57476806640625, "learning_rate": 4.542607529435744e-06, "loss": 0.3488, "num_input_tokens_seen": 135402896, "step": 62800 }, { "epoch": 11.525968067535327, "grad_norm": 36.923179626464844, "learning_rate": 4.541810136159389e-06, "loss": 0.1707, "num_input_tokens_seen": 135413712, "step": 62805 }, { "epoch": 11.52688566709488, "grad_norm": 0.7828507423400879, "learning_rate": 4.541012754634895e-06, "loss": 0.3641, "num_input_tokens_seen": 135423824, "step": 62810 }, { "epoch": 11.527803266654432, "grad_norm": 12.60604190826416, "learning_rate": 4.540215384882709e-06, "loss": 0.0911, "num_input_tokens_seen": 135435376, "step": 62815 }, { "epoch": 11.528720866213984, "grad_norm": 0.2567209005355835, "learning_rate": 4.539418026923289e-06, "loss": 0.0049, "num_input_tokens_seen": 135447216, "step": 62820 }, { "epoch": 11.529638465773537, "grad_norm": 3.453981637954712, "learning_rate": 4.538620680777081e-06, "loss": 0.2432, "num_input_tokens_seen": 135458480, "step": 62825 }, { "epoch": 11.530556065333089, "grad_norm": 11.966495513916016, "learning_rate": 4.537823346464536e-06, "loss": 0.1981, "num_input_tokens_seen": 135468880, "step": 62830 }, { "epoch": 11.53147366489264, "grad_norm": 0.5991246700286865, "learning_rate": 4.537026024006109e-06, "loss": 0.3124, "num_input_tokens_seen": 135480464, "step": 62835 }, { "epoch": 11.532391264452194, "grad_norm": 0.16053467988967896, "learning_rate": 4.536228713422246e-06, "loss": 0.0068, "num_input_tokens_seen": 135490352, "step": 62840 }, { "epoch": 11.533308864011746, "grad_norm": 5.088559627532959, "learning_rate": 4.535431414733398e-06, "loss": 0.2291, "num_input_tokens_seen": 135501904, "step": 62845 }, { "epoch": 11.534226463571297, "grad_norm": 0.22705814242362976, "learning_rate": 4.534634127960012e-06, "loss": 0.1725, "num_input_tokens_seen": 135511408, "step": 62850 }, { "epoch": 11.53514406313085, "grad_norm": 12.904370307922363, "learning_rate": 4.533836853122542e-06, "loss": 0.0061, "num_input_tokens_seen": 135522896, "step": 62855 }, { "epoch": 11.536061662690402, "grad_norm": 1.7178915739059448, "learning_rate": 4.5330395902414335e-06, "loss": 0.3367, "num_input_tokens_seen": 135533424, "step": 62860 }, { "epoch": 11.536979262249954, "grad_norm": 0.09585757553577423, "learning_rate": 4.532242339337134e-06, "loss": 0.0503, "num_input_tokens_seen": 135543760, "step": 62865 }, { "epoch": 11.537896861809507, "grad_norm": 0.3075709342956543, "learning_rate": 4.5314451004300954e-06, "loss": 0.1387, "num_input_tokens_seen": 135553904, "step": 62870 }, { "epoch": 11.538814461369059, "grad_norm": 0.15623149275779724, "learning_rate": 4.530647873540764e-06, "loss": 0.1276, "num_input_tokens_seen": 135564080, "step": 62875 }, { "epoch": 11.53973206092861, "grad_norm": 0.5243682861328125, "learning_rate": 4.529850658689587e-06, "loss": 0.3761, "num_input_tokens_seen": 135574480, "step": 62880 }, { "epoch": 11.540649660488164, "grad_norm": 68.14977264404297, "learning_rate": 4.52905345589701e-06, "loss": 0.4021, "num_input_tokens_seen": 135585168, "step": 62885 }, { "epoch": 11.541567260047715, "grad_norm": 15.918371200561523, "learning_rate": 4.528256265183484e-06, "loss": 0.2861, "num_input_tokens_seen": 135596528, "step": 62890 }, { "epoch": 11.542484859607267, "grad_norm": 14.357588768005371, "learning_rate": 4.527459086569454e-06, "loss": 0.0621, "num_input_tokens_seen": 135607536, "step": 62895 }, { "epoch": 11.54340245916682, "grad_norm": 123.29981994628906, "learning_rate": 4.526661920075365e-06, "loss": 0.0559, "num_input_tokens_seen": 135617776, "step": 62900 }, { "epoch": 11.544320058726372, "grad_norm": 0.19620750844478607, "learning_rate": 4.525864765721665e-06, "loss": 0.1908, "num_input_tokens_seen": 135627440, "step": 62905 }, { "epoch": 11.545237658285924, "grad_norm": 0.19982373714447021, "learning_rate": 4.5250676235287985e-06, "loss": 0.1418, "num_input_tokens_seen": 135637840, "step": 62910 }, { "epoch": 11.546155257845477, "grad_norm": 23.438547134399414, "learning_rate": 4.52427049351721e-06, "loss": 0.1588, "num_input_tokens_seen": 135647952, "step": 62915 }, { "epoch": 11.547072857405029, "grad_norm": 52.29136276245117, "learning_rate": 4.5234733757073475e-06, "loss": 0.3826, "num_input_tokens_seen": 135658288, "step": 62920 }, { "epoch": 11.54799045696458, "grad_norm": 50.65336227416992, "learning_rate": 4.522676270119654e-06, "loss": 0.2344, "num_input_tokens_seen": 135668816, "step": 62925 }, { "epoch": 11.548908056524134, "grad_norm": 56.13872528076172, "learning_rate": 4.521879176774575e-06, "loss": 0.1465, "num_input_tokens_seen": 135680336, "step": 62930 }, { "epoch": 11.549825656083685, "grad_norm": 0.30947262048721313, "learning_rate": 4.5210820956925525e-06, "loss": 0.2141, "num_input_tokens_seen": 135692688, "step": 62935 }, { "epoch": 11.550743255643237, "grad_norm": 0.16503703594207764, "learning_rate": 4.520285026894033e-06, "loss": 0.002, "num_input_tokens_seen": 135702608, "step": 62940 }, { "epoch": 11.55166085520279, "grad_norm": 207.40573120117188, "learning_rate": 4.51948797039946e-06, "loss": 0.2177, "num_input_tokens_seen": 135713968, "step": 62945 }, { "epoch": 11.552578454762342, "grad_norm": 423.5896911621094, "learning_rate": 4.518690926229273e-06, "loss": 0.1597, "num_input_tokens_seen": 135724880, "step": 62950 }, { "epoch": 11.553496054321894, "grad_norm": 0.11027649790048599, "learning_rate": 4.517893894403921e-06, "loss": 0.2466, "num_input_tokens_seen": 135736112, "step": 62955 }, { "epoch": 11.554413653881447, "grad_norm": 0.17531687021255493, "learning_rate": 4.517096874943842e-06, "loss": 0.0166, "num_input_tokens_seen": 135748208, "step": 62960 }, { "epoch": 11.555331253440999, "grad_norm": 0.09209995716810226, "learning_rate": 4.516299867869478e-06, "loss": 0.011, "num_input_tokens_seen": 135759632, "step": 62965 }, { "epoch": 11.55624885300055, "grad_norm": 0.26285800337791443, "learning_rate": 4.515502873201275e-06, "loss": 0.0138, "num_input_tokens_seen": 135770672, "step": 62970 }, { "epoch": 11.557166452560104, "grad_norm": 10.763567924499512, "learning_rate": 4.514705890959673e-06, "loss": 0.0231, "num_input_tokens_seen": 135780112, "step": 62975 }, { "epoch": 11.558084052119655, "grad_norm": 25.257144927978516, "learning_rate": 4.513908921165113e-06, "loss": 0.3446, "num_input_tokens_seen": 135790960, "step": 62980 }, { "epoch": 11.559001651679207, "grad_norm": 38.81678771972656, "learning_rate": 4.513111963838033e-06, "loss": 0.1471, "num_input_tokens_seen": 135801392, "step": 62985 }, { "epoch": 11.55991925123876, "grad_norm": 0.012501344084739685, "learning_rate": 4.512315018998878e-06, "loss": 0.1016, "num_input_tokens_seen": 135811984, "step": 62990 }, { "epoch": 11.560836850798312, "grad_norm": 60.94754409790039, "learning_rate": 4.511518086668088e-06, "loss": 0.1814, "num_input_tokens_seen": 135822640, "step": 62995 }, { "epoch": 11.561754450357864, "grad_norm": 117.29102325439453, "learning_rate": 4.5107211668661e-06, "loss": 0.2947, "num_input_tokens_seen": 135833264, "step": 63000 }, { "epoch": 11.562672049917417, "grad_norm": 73.44229125976562, "learning_rate": 4.5099242596133575e-06, "loss": 0.2273, "num_input_tokens_seen": 135844176, "step": 63005 }, { "epoch": 11.563589649476969, "grad_norm": 176.27427673339844, "learning_rate": 4.509127364930297e-06, "loss": 0.039, "num_input_tokens_seen": 135854128, "step": 63010 }, { "epoch": 11.56450724903652, "grad_norm": 0.38147586584091187, "learning_rate": 4.50833048283736e-06, "loss": 0.0797, "num_input_tokens_seen": 135863664, "step": 63015 }, { "epoch": 11.565424848596074, "grad_norm": 214.6881561279297, "learning_rate": 4.5075336133549825e-06, "loss": 0.1905, "num_input_tokens_seen": 135874704, "step": 63020 }, { "epoch": 11.566342448155625, "grad_norm": 50.6773796081543, "learning_rate": 4.5067367565036055e-06, "loss": 0.3218, "num_input_tokens_seen": 135884528, "step": 63025 }, { "epoch": 11.567260047715177, "grad_norm": 0.011315557174384594, "learning_rate": 4.505939912303667e-06, "loss": 0.1742, "num_input_tokens_seen": 135895504, "step": 63030 }, { "epoch": 11.56817764727473, "grad_norm": 0.12663477659225464, "learning_rate": 4.505143080775602e-06, "loss": 0.2607, "num_input_tokens_seen": 135905840, "step": 63035 }, { "epoch": 11.569095246834282, "grad_norm": 56.67988967895508, "learning_rate": 4.504346261939851e-06, "loss": 0.3027, "num_input_tokens_seen": 135917264, "step": 63040 }, { "epoch": 11.570012846393833, "grad_norm": 0.10126882791519165, "learning_rate": 4.503549455816851e-06, "loss": 0.1546, "num_input_tokens_seen": 135927920, "step": 63045 }, { "epoch": 11.570930445953387, "grad_norm": 0.42595967650413513, "learning_rate": 4.502752662427036e-06, "loss": 0.4003, "num_input_tokens_seen": 135939824, "step": 63050 }, { "epoch": 11.571848045512938, "grad_norm": 83.01103210449219, "learning_rate": 4.501955881790846e-06, "loss": 0.087, "num_input_tokens_seen": 135948880, "step": 63055 }, { "epoch": 11.57276564507249, "grad_norm": 93.86760711669922, "learning_rate": 4.501159113928716e-06, "loss": 0.3675, "num_input_tokens_seen": 135959184, "step": 63060 }, { "epoch": 11.573683244632043, "grad_norm": 28.41305160522461, "learning_rate": 4.500362358861082e-06, "loss": 0.2773, "num_input_tokens_seen": 135970192, "step": 63065 }, { "epoch": 11.574600844191595, "grad_norm": 0.1299891173839569, "learning_rate": 4.499565616608377e-06, "loss": 0.1618, "num_input_tokens_seen": 135981296, "step": 63070 }, { "epoch": 11.575518443751147, "grad_norm": 2.0119810104370117, "learning_rate": 4.49876888719104e-06, "loss": 0.4696, "num_input_tokens_seen": 135991888, "step": 63075 }, { "epoch": 11.5764360433107, "grad_norm": 0.43124204874038696, "learning_rate": 4.497972170629505e-06, "loss": 0.0583, "num_input_tokens_seen": 136002032, "step": 63080 }, { "epoch": 11.577353642870252, "grad_norm": 0.07921065390110016, "learning_rate": 4.4971754669442025e-06, "loss": 0.1977, "num_input_tokens_seen": 136011824, "step": 63085 }, { "epoch": 11.578271242429803, "grad_norm": 0.3066217303276062, "learning_rate": 4.496378776155573e-06, "loss": 0.0016, "num_input_tokens_seen": 136022032, "step": 63090 }, { "epoch": 11.579188841989357, "grad_norm": 6.505852222442627, "learning_rate": 4.495582098284047e-06, "loss": 0.0044, "num_input_tokens_seen": 136033936, "step": 63095 }, { "epoch": 11.580106441548908, "grad_norm": 0.6445594429969788, "learning_rate": 4.494785433350057e-06, "loss": 0.0021, "num_input_tokens_seen": 136044752, "step": 63100 }, { "epoch": 11.58102404110846, "grad_norm": 0.2097565084695816, "learning_rate": 4.493988781374039e-06, "loss": 0.1071, "num_input_tokens_seen": 136054608, "step": 63105 }, { "epoch": 11.581941640668013, "grad_norm": 0.1621207892894745, "learning_rate": 4.493192142376423e-06, "loss": 0.1351, "num_input_tokens_seen": 136065488, "step": 63110 }, { "epoch": 11.582859240227565, "grad_norm": 7.368414878845215, "learning_rate": 4.492395516377645e-06, "loss": 0.2822, "num_input_tokens_seen": 136075600, "step": 63115 }, { "epoch": 11.583776839787117, "grad_norm": 0.6490809917449951, "learning_rate": 4.4915989033981346e-06, "loss": 0.636, "num_input_tokens_seen": 136085744, "step": 63120 }, { "epoch": 11.58469443934667, "grad_norm": 0.03321089595556259, "learning_rate": 4.490802303458324e-06, "loss": 0.1259, "num_input_tokens_seen": 136097360, "step": 63125 }, { "epoch": 11.585612038906222, "grad_norm": 0.8141503930091858, "learning_rate": 4.490005716578646e-06, "loss": 0.118, "num_input_tokens_seen": 136107856, "step": 63130 }, { "epoch": 11.586529638465773, "grad_norm": 32.08818054199219, "learning_rate": 4.489209142779528e-06, "loss": 0.6058, "num_input_tokens_seen": 136118768, "step": 63135 }, { "epoch": 11.587447238025327, "grad_norm": 7.785744667053223, "learning_rate": 4.488412582081408e-06, "loss": 0.0058, "num_input_tokens_seen": 136129072, "step": 63140 }, { "epoch": 11.588364837584878, "grad_norm": 0.021942952647805214, "learning_rate": 4.48761603450471e-06, "loss": 0.2967, "num_input_tokens_seen": 136138960, "step": 63145 }, { "epoch": 11.58928243714443, "grad_norm": 0.6823955774307251, "learning_rate": 4.486819500069866e-06, "loss": 0.0934, "num_input_tokens_seen": 136149424, "step": 63150 }, { "epoch": 11.590200036703983, "grad_norm": 0.16627417504787445, "learning_rate": 4.486022978797308e-06, "loss": 0.1239, "num_input_tokens_seen": 136161168, "step": 63155 }, { "epoch": 11.591117636263535, "grad_norm": 1.1328306198120117, "learning_rate": 4.4852264707074635e-06, "loss": 0.0276, "num_input_tokens_seen": 136171504, "step": 63160 }, { "epoch": 11.592035235823086, "grad_norm": 0.15139029920101166, "learning_rate": 4.484429975820763e-06, "loss": 0.2421, "num_input_tokens_seen": 136181264, "step": 63165 }, { "epoch": 11.59295283538264, "grad_norm": 0.13324107229709625, "learning_rate": 4.483633494157632e-06, "loss": 0.0894, "num_input_tokens_seen": 136192368, "step": 63170 }, { "epoch": 11.593870434942191, "grad_norm": 0.2597803771495819, "learning_rate": 4.482837025738504e-06, "loss": 0.2042, "num_input_tokens_seen": 136204208, "step": 63175 }, { "epoch": 11.594788034501743, "grad_norm": 0.10485722869634628, "learning_rate": 4.482040570583805e-06, "loss": 0.0645, "num_input_tokens_seen": 136215952, "step": 63180 }, { "epoch": 11.595705634061297, "grad_norm": 0.16758456826210022, "learning_rate": 4.481244128713959e-06, "loss": 0.0745, "num_input_tokens_seen": 136226896, "step": 63185 }, { "epoch": 11.596623233620848, "grad_norm": 51.281097412109375, "learning_rate": 4.4804477001494015e-06, "loss": 0.1665, "num_input_tokens_seen": 136237584, "step": 63190 }, { "epoch": 11.5975408331804, "grad_norm": 0.3553333878517151, "learning_rate": 4.479651284910554e-06, "loss": 0.0926, "num_input_tokens_seen": 136248784, "step": 63195 }, { "epoch": 11.598458432739953, "grad_norm": 105.8153076171875, "learning_rate": 4.478854883017845e-06, "loss": 0.1335, "num_input_tokens_seen": 136259184, "step": 63200 }, { "epoch": 11.599376032299505, "grad_norm": 3.9979002475738525, "learning_rate": 4.4780584944916975e-06, "loss": 0.0099, "num_input_tokens_seen": 136270192, "step": 63205 }, { "epoch": 11.600293631859056, "grad_norm": 10.333678245544434, "learning_rate": 4.477262119352543e-06, "loss": 0.2729, "num_input_tokens_seen": 136281392, "step": 63210 }, { "epoch": 11.60121123141861, "grad_norm": 0.2508717179298401, "learning_rate": 4.476465757620806e-06, "loss": 0.2408, "num_input_tokens_seen": 136291600, "step": 63215 }, { "epoch": 11.602128830978161, "grad_norm": 0.19112533330917358, "learning_rate": 4.4756694093169086e-06, "loss": 0.1576, "num_input_tokens_seen": 136302640, "step": 63220 }, { "epoch": 11.603046430537713, "grad_norm": 123.37966918945312, "learning_rate": 4.47487307446128e-06, "loss": 0.4823, "num_input_tokens_seen": 136313200, "step": 63225 }, { "epoch": 11.603964030097266, "grad_norm": 0.07207900285720825, "learning_rate": 4.474076753074342e-06, "loss": 0.0926, "num_input_tokens_seen": 136322224, "step": 63230 }, { "epoch": 11.604881629656818, "grad_norm": 203.5211944580078, "learning_rate": 4.47328044517652e-06, "loss": 0.1951, "num_input_tokens_seen": 136331792, "step": 63235 }, { "epoch": 11.60579922921637, "grad_norm": 0.3625737726688385, "learning_rate": 4.47248415078824e-06, "loss": 0.1623, "num_input_tokens_seen": 136342320, "step": 63240 }, { "epoch": 11.606716828775923, "grad_norm": 0.24917016923427582, "learning_rate": 4.471687869929923e-06, "loss": 0.2385, "num_input_tokens_seen": 136352816, "step": 63245 }, { "epoch": 11.607634428335475, "grad_norm": 74.81476593017578, "learning_rate": 4.470891602621994e-06, "loss": 0.1788, "num_input_tokens_seen": 136364208, "step": 63250 }, { "epoch": 11.608552027895026, "grad_norm": 0.31050848960876465, "learning_rate": 4.470095348884873e-06, "loss": 0.0093, "num_input_tokens_seen": 136374000, "step": 63255 }, { "epoch": 11.60946962745458, "grad_norm": 0.18015235662460327, "learning_rate": 4.4692991087389866e-06, "loss": 0.1041, "num_input_tokens_seen": 136383792, "step": 63260 }, { "epoch": 11.610387227014131, "grad_norm": 0.0814664289355278, "learning_rate": 4.468502882204756e-06, "loss": 0.0187, "num_input_tokens_seen": 136394864, "step": 63265 }, { "epoch": 11.611304826573683, "grad_norm": 0.2905625104904175, "learning_rate": 4.467706669302601e-06, "loss": 0.0039, "num_input_tokens_seen": 136404816, "step": 63270 }, { "epoch": 11.612222426133236, "grad_norm": 18.747051239013672, "learning_rate": 4.466910470052947e-06, "loss": 0.1643, "num_input_tokens_seen": 136416400, "step": 63275 }, { "epoch": 11.613140025692788, "grad_norm": 17.994905471801758, "learning_rate": 4.466114284476213e-06, "loss": 0.1656, "num_input_tokens_seen": 136426992, "step": 63280 }, { "epoch": 11.61405762525234, "grad_norm": 0.049880485981702805, "learning_rate": 4.465318112592818e-06, "loss": 0.1942, "num_input_tokens_seen": 136436880, "step": 63285 }, { "epoch": 11.614975224811893, "grad_norm": 0.034773241728544235, "learning_rate": 4.464521954423187e-06, "loss": 0.3436, "num_input_tokens_seen": 136448528, "step": 63290 }, { "epoch": 11.615892824371445, "grad_norm": 22.952238082885742, "learning_rate": 4.463725809987738e-06, "loss": 0.1914, "num_input_tokens_seen": 136459024, "step": 63295 }, { "epoch": 11.616810423930996, "grad_norm": 0.23237968981266022, "learning_rate": 4.46292967930689e-06, "loss": 0.1078, "num_input_tokens_seen": 136470512, "step": 63300 }, { "epoch": 11.61772802349055, "grad_norm": 33.99633026123047, "learning_rate": 4.462133562401062e-06, "loss": 0.4794, "num_input_tokens_seen": 136482384, "step": 63305 }, { "epoch": 11.618645623050101, "grad_norm": 68.5499496459961, "learning_rate": 4.461337459290677e-06, "loss": 0.3747, "num_input_tokens_seen": 136494000, "step": 63310 }, { "epoch": 11.619563222609653, "grad_norm": 21.693328857421875, "learning_rate": 4.460541369996151e-06, "loss": 0.2732, "num_input_tokens_seen": 136505712, "step": 63315 }, { "epoch": 11.620480822169206, "grad_norm": 5.122961044311523, "learning_rate": 4.459745294537901e-06, "loss": 0.0034, "num_input_tokens_seen": 136515504, "step": 63320 }, { "epoch": 11.621398421728758, "grad_norm": 10.507898330688477, "learning_rate": 4.458949232936349e-06, "loss": 0.1115, "num_input_tokens_seen": 136525264, "step": 63325 }, { "epoch": 11.62231602128831, "grad_norm": 0.2983440160751343, "learning_rate": 4.458153185211909e-06, "loss": 0.1276, "num_input_tokens_seen": 136536944, "step": 63330 }, { "epoch": 11.623233620847863, "grad_norm": 24.471418380737305, "learning_rate": 4.457357151385e-06, "loss": 0.541, "num_input_tokens_seen": 136546928, "step": 63335 }, { "epoch": 11.624151220407414, "grad_norm": 0.1422826051712036, "learning_rate": 4.45656113147604e-06, "loss": 0.1204, "num_input_tokens_seen": 136557840, "step": 63340 }, { "epoch": 11.625068819966966, "grad_norm": 1.224053144454956, "learning_rate": 4.455765125505444e-06, "loss": 0.0177, "num_input_tokens_seen": 136569200, "step": 63345 }, { "epoch": 11.62598641952652, "grad_norm": 8.88491439819336, "learning_rate": 4.454969133493631e-06, "loss": 0.4301, "num_input_tokens_seen": 136580592, "step": 63350 }, { "epoch": 11.626904019086071, "grad_norm": 0.5514826774597168, "learning_rate": 4.454173155461011e-06, "loss": 0.1175, "num_input_tokens_seen": 136592976, "step": 63355 }, { "epoch": 11.627821618645623, "grad_norm": 7.663570880889893, "learning_rate": 4.453377191428007e-06, "loss": 0.4384, "num_input_tokens_seen": 136603568, "step": 63360 }, { "epoch": 11.628739218205176, "grad_norm": 0.13909469544887543, "learning_rate": 4.4525812414150304e-06, "loss": 0.0196, "num_input_tokens_seen": 136614992, "step": 63365 }, { "epoch": 11.629656817764728, "grad_norm": 0.2010200470685959, "learning_rate": 4.451785305442494e-06, "loss": 0.0096, "num_input_tokens_seen": 136625328, "step": 63370 }, { "epoch": 11.63057441732428, "grad_norm": 0.3867344856262207, "learning_rate": 4.450989383530817e-06, "loss": 0.2165, "num_input_tokens_seen": 136636880, "step": 63375 }, { "epoch": 11.631492016883833, "grad_norm": 1.422072410583496, "learning_rate": 4.450193475700412e-06, "loss": 0.3037, "num_input_tokens_seen": 136647664, "step": 63380 }, { "epoch": 11.632409616443384, "grad_norm": 119.95315551757812, "learning_rate": 4.449397581971693e-06, "loss": 0.147, "num_input_tokens_seen": 136659152, "step": 63385 }, { "epoch": 11.633327216002936, "grad_norm": 0.15186186134815216, "learning_rate": 4.448601702365069e-06, "loss": 0.0017, "num_input_tokens_seen": 136670928, "step": 63390 }, { "epoch": 11.63424481556249, "grad_norm": 21.90850257873535, "learning_rate": 4.44780583690096e-06, "loss": 0.2708, "num_input_tokens_seen": 136682960, "step": 63395 }, { "epoch": 11.635162415122041, "grad_norm": 7.722622394561768, "learning_rate": 4.447009985599775e-06, "loss": 0.0386, "num_input_tokens_seen": 136694352, "step": 63400 }, { "epoch": 11.636080014681593, "grad_norm": 0.10977289080619812, "learning_rate": 4.446214148481924e-06, "loss": 0.1016, "num_input_tokens_seen": 136704336, "step": 63405 }, { "epoch": 11.636997614241146, "grad_norm": 0.4155241847038269, "learning_rate": 4.445418325567825e-06, "loss": 0.1051, "num_input_tokens_seen": 136714992, "step": 63410 }, { "epoch": 11.637915213800698, "grad_norm": 33.02285385131836, "learning_rate": 4.444622516877886e-06, "loss": 0.1533, "num_input_tokens_seen": 136727216, "step": 63415 }, { "epoch": 11.63883281336025, "grad_norm": 130.5894012451172, "learning_rate": 4.443826722432517e-06, "loss": 0.1351, "num_input_tokens_seen": 136737296, "step": 63420 }, { "epoch": 11.639750412919803, "grad_norm": 8.346866607666016, "learning_rate": 4.4430309422521315e-06, "loss": 0.1146, "num_input_tokens_seen": 136748944, "step": 63425 }, { "epoch": 11.640668012479354, "grad_norm": 22.750085830688477, "learning_rate": 4.442235176357139e-06, "loss": 0.1694, "num_input_tokens_seen": 136759504, "step": 63430 }, { "epoch": 11.641585612038906, "grad_norm": 0.2223229855298996, "learning_rate": 4.441439424767951e-06, "loss": 0.0314, "num_input_tokens_seen": 136770800, "step": 63435 }, { "epoch": 11.64250321159846, "grad_norm": 10.570555686950684, "learning_rate": 4.440643687504975e-06, "loss": 0.0985, "num_input_tokens_seen": 136782544, "step": 63440 }, { "epoch": 11.64342081115801, "grad_norm": 4.70916223526001, "learning_rate": 4.439847964588622e-06, "loss": 0.0053, "num_input_tokens_seen": 136792816, "step": 63445 }, { "epoch": 11.644338410717562, "grad_norm": 0.44932058453559875, "learning_rate": 4.4390522560393e-06, "loss": 0.0821, "num_input_tokens_seen": 136804080, "step": 63450 }, { "epoch": 11.645256010277116, "grad_norm": 0.06745312362909317, "learning_rate": 4.438256561877417e-06, "loss": 0.1164, "num_input_tokens_seen": 136813584, "step": 63455 }, { "epoch": 11.646173609836667, "grad_norm": 0.00836959294974804, "learning_rate": 4.437460882123384e-06, "loss": 0.5106, "num_input_tokens_seen": 136822960, "step": 63460 }, { "epoch": 11.64709120939622, "grad_norm": 115.67793273925781, "learning_rate": 4.436665216797609e-06, "loss": 0.5309, "num_input_tokens_seen": 136831280, "step": 63465 }, { "epoch": 11.648008808955773, "grad_norm": 1.5567686557769775, "learning_rate": 4.435869565920497e-06, "loss": 0.1632, "num_input_tokens_seen": 136842416, "step": 63470 }, { "epoch": 11.648926408515324, "grad_norm": 0.5556510090827942, "learning_rate": 4.435073929512454e-06, "loss": 0.0092, "num_input_tokens_seen": 136853424, "step": 63475 }, { "epoch": 11.649844008074876, "grad_norm": 2.2792410850524902, "learning_rate": 4.4342783075938915e-06, "loss": 0.0726, "num_input_tokens_seen": 136864016, "step": 63480 }, { "epoch": 11.65076160763443, "grad_norm": 0.6181173324584961, "learning_rate": 4.433482700185213e-06, "loss": 0.1025, "num_input_tokens_seen": 136875536, "step": 63485 }, { "epoch": 11.65167920719398, "grad_norm": 12.677038192749023, "learning_rate": 4.432687107306825e-06, "loss": 0.0074, "num_input_tokens_seen": 136886128, "step": 63490 }, { "epoch": 11.652596806753532, "grad_norm": 228.62176513671875, "learning_rate": 4.431891528979133e-06, "loss": 0.528, "num_input_tokens_seen": 136897008, "step": 63495 }, { "epoch": 11.653514406313086, "grad_norm": 0.20282162725925446, "learning_rate": 4.431095965222545e-06, "loss": 0.2415, "num_input_tokens_seen": 136907856, "step": 63500 }, { "epoch": 11.654432005872637, "grad_norm": 306.38543701171875, "learning_rate": 4.430300416057461e-06, "loss": 0.4536, "num_input_tokens_seen": 136919568, "step": 63505 }, { "epoch": 11.655349605432189, "grad_norm": 0.17875653505325317, "learning_rate": 4.42950488150429e-06, "loss": 0.1483, "num_input_tokens_seen": 136930416, "step": 63510 }, { "epoch": 11.656267204991742, "grad_norm": 18.92283821105957, "learning_rate": 4.428709361583435e-06, "loss": 0.0583, "num_input_tokens_seen": 136941360, "step": 63515 }, { "epoch": 11.657184804551294, "grad_norm": 0.1297411471605301, "learning_rate": 4.427913856315299e-06, "loss": 0.0178, "num_input_tokens_seen": 136951312, "step": 63520 }, { "epoch": 11.658102404110846, "grad_norm": 0.348334401845932, "learning_rate": 4.427118365720285e-06, "loss": 0.1551, "num_input_tokens_seen": 136962416, "step": 63525 }, { "epoch": 11.659020003670399, "grad_norm": 0.10450474172830582, "learning_rate": 4.426322889818798e-06, "loss": 0.0037, "num_input_tokens_seen": 136973360, "step": 63530 }, { "epoch": 11.65993760322995, "grad_norm": 0.14609165489673615, "learning_rate": 4.42552742863124e-06, "loss": 0.1021, "num_input_tokens_seen": 136983632, "step": 63535 }, { "epoch": 11.660855202789502, "grad_norm": 97.85501861572266, "learning_rate": 4.424731982178012e-06, "loss": 0.4992, "num_input_tokens_seen": 136994000, "step": 63540 }, { "epoch": 11.661772802349056, "grad_norm": 0.06853219121694565, "learning_rate": 4.423936550479519e-06, "loss": 0.1359, "num_input_tokens_seen": 137004912, "step": 63545 }, { "epoch": 11.662690401908607, "grad_norm": 182.4296875, "learning_rate": 4.423141133556158e-06, "loss": 0.1092, "num_input_tokens_seen": 137015504, "step": 63550 }, { "epoch": 11.663608001468159, "grad_norm": 0.04267273470759392, "learning_rate": 4.4223457314283335e-06, "loss": 0.111, "num_input_tokens_seen": 137026064, "step": 63555 }, { "epoch": 11.664525601027712, "grad_norm": 28.917999267578125, "learning_rate": 4.421550344116447e-06, "loss": 0.0518, "num_input_tokens_seen": 137036432, "step": 63560 }, { "epoch": 11.665443200587264, "grad_norm": 0.7874571084976196, "learning_rate": 4.420754971640898e-06, "loss": 0.2415, "num_input_tokens_seen": 137048880, "step": 63565 }, { "epoch": 11.666360800146816, "grad_norm": 5.757078647613525, "learning_rate": 4.419959614022086e-06, "loss": 0.0776, "num_input_tokens_seen": 137059600, "step": 63570 }, { "epoch": 11.667278399706369, "grad_norm": 0.04748307168483734, "learning_rate": 4.419164271280408e-06, "loss": 0.169, "num_input_tokens_seen": 137069232, "step": 63575 }, { "epoch": 11.66819599926592, "grad_norm": 129.1869354248047, "learning_rate": 4.418368943436269e-06, "loss": 0.1909, "num_input_tokens_seen": 137079984, "step": 63580 }, { "epoch": 11.669113598825472, "grad_norm": 49.208221435546875, "learning_rate": 4.4175736305100656e-06, "loss": 0.3179, "num_input_tokens_seen": 137090768, "step": 63585 }, { "epoch": 11.670031198385026, "grad_norm": 13.988231658935547, "learning_rate": 4.416778332522193e-06, "loss": 0.4019, "num_input_tokens_seen": 137101776, "step": 63590 }, { "epoch": 11.670948797944577, "grad_norm": 6.421355724334717, "learning_rate": 4.415983049493055e-06, "loss": 0.2362, "num_input_tokens_seen": 137113296, "step": 63595 }, { "epoch": 11.671866397504129, "grad_norm": 1.880340337753296, "learning_rate": 4.415187781443047e-06, "loss": 0.1864, "num_input_tokens_seen": 137125136, "step": 63600 }, { "epoch": 11.672783997063682, "grad_norm": 0.24280552566051483, "learning_rate": 4.414392528392563e-06, "loss": 0.0024, "num_input_tokens_seen": 137136528, "step": 63605 }, { "epoch": 11.673701596623234, "grad_norm": 2.0389726161956787, "learning_rate": 4.413597290362005e-06, "loss": 0.0742, "num_input_tokens_seen": 137148464, "step": 63610 }, { "epoch": 11.674619196182785, "grad_norm": 0.16922178864479065, "learning_rate": 4.412802067371768e-06, "loss": 0.3058, "num_input_tokens_seen": 137158928, "step": 63615 }, { "epoch": 11.675536795742339, "grad_norm": 18.87472915649414, "learning_rate": 4.4120068594422485e-06, "loss": 0.2621, "num_input_tokens_seen": 137167792, "step": 63620 }, { "epoch": 11.67645439530189, "grad_norm": 0.02507263980805874, "learning_rate": 4.411211666593839e-06, "loss": 0.2792, "num_input_tokens_seen": 137178160, "step": 63625 }, { "epoch": 11.677371994861442, "grad_norm": 16.11110496520996, "learning_rate": 4.41041648884694e-06, "loss": 0.1977, "num_input_tokens_seen": 137188016, "step": 63630 }, { "epoch": 11.678289594420995, "grad_norm": 0.17615845799446106, "learning_rate": 4.4096213262219436e-06, "loss": 0.2684, "num_input_tokens_seen": 137199152, "step": 63635 }, { "epoch": 11.679207193980547, "grad_norm": 8.963133811950684, "learning_rate": 4.408826178739245e-06, "loss": 0.2834, "num_input_tokens_seen": 137209488, "step": 63640 }, { "epoch": 11.680124793540099, "grad_norm": 0.49177467823028564, "learning_rate": 4.408031046419239e-06, "loss": 0.0059, "num_input_tokens_seen": 137219152, "step": 63645 }, { "epoch": 11.681042393099652, "grad_norm": 0.05142221227288246, "learning_rate": 4.407235929282319e-06, "loss": 0.2978, "num_input_tokens_seen": 137230448, "step": 63650 }, { "epoch": 11.681959992659204, "grad_norm": 0.03256025165319443, "learning_rate": 4.406440827348879e-06, "loss": 0.0265, "num_input_tokens_seen": 137241968, "step": 63655 }, { "epoch": 11.682877592218755, "grad_norm": 0.09782814234495163, "learning_rate": 4.405645740639312e-06, "loss": 0.0156, "num_input_tokens_seen": 137252208, "step": 63660 }, { "epoch": 11.683795191778309, "grad_norm": 233.996826171875, "learning_rate": 4.404850669174011e-06, "loss": 0.2149, "num_input_tokens_seen": 137262608, "step": 63665 }, { "epoch": 11.68471279133786, "grad_norm": 0.18489953875541687, "learning_rate": 4.404055612973367e-06, "loss": 0.0687, "num_input_tokens_seen": 137271856, "step": 63670 }, { "epoch": 11.685630390897412, "grad_norm": 0.06500566750764847, "learning_rate": 4.403260572057772e-06, "loss": 0.1088, "num_input_tokens_seen": 137282256, "step": 63675 }, { "epoch": 11.686547990456965, "grad_norm": 5.890315532684326, "learning_rate": 4.4024655464476204e-06, "loss": 0.0037, "num_input_tokens_seen": 137293328, "step": 63680 }, { "epoch": 11.687465590016517, "grad_norm": 0.04761845991015434, "learning_rate": 4.401670536163301e-06, "loss": 0.1046, "num_input_tokens_seen": 137303632, "step": 63685 }, { "epoch": 11.688383189576069, "grad_norm": 4.981605052947998, "learning_rate": 4.400875541225203e-06, "loss": 0.0405, "num_input_tokens_seen": 137314672, "step": 63690 }, { "epoch": 11.689300789135622, "grad_norm": 18.98691177368164, "learning_rate": 4.400080561653721e-06, "loss": 0.1137, "num_input_tokens_seen": 137325712, "step": 63695 }, { "epoch": 11.690218388695174, "grad_norm": 0.10906599462032318, "learning_rate": 4.399285597469242e-06, "loss": 0.0413, "num_input_tokens_seen": 137336144, "step": 63700 }, { "epoch": 11.691135988254725, "grad_norm": 0.033872559666633606, "learning_rate": 4.398490648692158e-06, "loss": 0.0918, "num_input_tokens_seen": 137347280, "step": 63705 }, { "epoch": 11.692053587814279, "grad_norm": 1.2394357919692993, "learning_rate": 4.397695715342854e-06, "loss": 0.4111, "num_input_tokens_seen": 137359120, "step": 63710 }, { "epoch": 11.69297118737383, "grad_norm": 0.01573873497545719, "learning_rate": 4.396900797441723e-06, "loss": 0.0057, "num_input_tokens_seen": 137370736, "step": 63715 }, { "epoch": 11.693888786933382, "grad_norm": 0.944221019744873, "learning_rate": 4.396105895009151e-06, "loss": 0.4898, "num_input_tokens_seen": 137382096, "step": 63720 }, { "epoch": 11.694806386492935, "grad_norm": 0.5953672528266907, "learning_rate": 4.395311008065527e-06, "loss": 0.2139, "num_input_tokens_seen": 137393872, "step": 63725 }, { "epoch": 11.695723986052487, "grad_norm": 3.4390485286712646, "learning_rate": 4.394516136631239e-06, "loss": 0.2302, "num_input_tokens_seen": 137405552, "step": 63730 }, { "epoch": 11.696641585612038, "grad_norm": 0.27608489990234375, "learning_rate": 4.393721280726674e-06, "loss": 0.0419, "num_input_tokens_seen": 137415984, "step": 63735 }, { "epoch": 11.697559185171592, "grad_norm": 158.30963134765625, "learning_rate": 4.392926440372218e-06, "loss": 0.2568, "num_input_tokens_seen": 137426864, "step": 63740 }, { "epoch": 11.698476784731143, "grad_norm": 54.61904525756836, "learning_rate": 4.3921316155882575e-06, "loss": 0.1869, "num_input_tokens_seen": 137437936, "step": 63745 }, { "epoch": 11.699394384290695, "grad_norm": 0.0657561719417572, "learning_rate": 4.3913368063951795e-06, "loss": 0.1047, "num_input_tokens_seen": 137448752, "step": 63750 }, { "epoch": 11.700311983850249, "grad_norm": 0.15180757641792297, "learning_rate": 4.390542012813369e-06, "loss": 0.1613, "num_input_tokens_seen": 137460112, "step": 63755 }, { "epoch": 11.7012295834098, "grad_norm": 0.9454267024993896, "learning_rate": 4.389747234863211e-06, "loss": 0.0041, "num_input_tokens_seen": 137471056, "step": 63760 }, { "epoch": 11.702147182969352, "grad_norm": 0.042987141758203506, "learning_rate": 4.3889524725650915e-06, "loss": 0.21, "num_input_tokens_seen": 137482640, "step": 63765 }, { "epoch": 11.703064782528905, "grad_norm": 70.8838882446289, "learning_rate": 4.388157725939392e-06, "loss": 0.0611, "num_input_tokens_seen": 137493264, "step": 63770 }, { "epoch": 11.703982382088457, "grad_norm": 10.222946166992188, "learning_rate": 4.3873629950065e-06, "loss": 0.0362, "num_input_tokens_seen": 137504112, "step": 63775 }, { "epoch": 11.704899981648008, "grad_norm": 1.748909592628479, "learning_rate": 4.386568279786799e-06, "loss": 0.2459, "num_input_tokens_seen": 137514864, "step": 63780 }, { "epoch": 11.705817581207562, "grad_norm": 0.07229208201169968, "learning_rate": 4.3857735803006695e-06, "loss": 0.1389, "num_input_tokens_seen": 137527760, "step": 63785 }, { "epoch": 11.706735180767113, "grad_norm": 0.3388797342777252, "learning_rate": 4.384978896568496e-06, "loss": 0.1857, "num_input_tokens_seen": 137537872, "step": 63790 }, { "epoch": 11.707652780326665, "grad_norm": 0.27827420830726624, "learning_rate": 4.384184228610659e-06, "loss": 0.2835, "num_input_tokens_seen": 137547696, "step": 63795 }, { "epoch": 11.708570379886218, "grad_norm": 0.1178940162062645, "learning_rate": 4.383389576447544e-06, "loss": 0.23, "num_input_tokens_seen": 137558864, "step": 63800 }, { "epoch": 11.70948797944577, "grad_norm": 29.933082580566406, "learning_rate": 4.382594940099531e-06, "loss": 0.1345, "num_input_tokens_seen": 137570000, "step": 63805 }, { "epoch": 11.710405579005322, "grad_norm": 31.3678035736084, "learning_rate": 4.381800319586998e-06, "loss": 0.2243, "num_input_tokens_seen": 137579760, "step": 63810 }, { "epoch": 11.711323178564875, "grad_norm": 26.436885833740234, "learning_rate": 4.381005714930331e-06, "loss": 0.2391, "num_input_tokens_seen": 137590096, "step": 63815 }, { "epoch": 11.712240778124427, "grad_norm": 0.9575844407081604, "learning_rate": 4.380211126149909e-06, "loss": 0.005, "num_input_tokens_seen": 137600912, "step": 63820 }, { "epoch": 11.713158377683978, "grad_norm": 0.3786145746707916, "learning_rate": 4.379416553266108e-06, "loss": 0.167, "num_input_tokens_seen": 137611376, "step": 63825 }, { "epoch": 11.714075977243532, "grad_norm": 111.39634704589844, "learning_rate": 4.378621996299313e-06, "loss": 0.3745, "num_input_tokens_seen": 137621936, "step": 63830 }, { "epoch": 11.714993576803083, "grad_norm": 0.13060259819030762, "learning_rate": 4.377827455269901e-06, "loss": 0.0012, "num_input_tokens_seen": 137633008, "step": 63835 }, { "epoch": 11.715911176362635, "grad_norm": 0.07339904457330704, "learning_rate": 4.37703293019825e-06, "loss": 0.116, "num_input_tokens_seen": 137644848, "step": 63840 }, { "epoch": 11.716828775922188, "grad_norm": 27.651809692382812, "learning_rate": 4.3762384211047365e-06, "loss": 0.2307, "num_input_tokens_seen": 137655664, "step": 63845 }, { "epoch": 11.71774637548174, "grad_norm": 0.1415269374847412, "learning_rate": 4.375443928009743e-06, "loss": 0.0051, "num_input_tokens_seen": 137666128, "step": 63850 }, { "epoch": 11.718663975041292, "grad_norm": 0.046997398138046265, "learning_rate": 4.3746494509336455e-06, "loss": 0.0995, "num_input_tokens_seen": 137677072, "step": 63855 }, { "epoch": 11.719581574600845, "grad_norm": 0.13717420399188995, "learning_rate": 4.373854989896819e-06, "loss": 0.0015, "num_input_tokens_seen": 137689168, "step": 63860 }, { "epoch": 11.720499174160397, "grad_norm": 0.20909228920936584, "learning_rate": 4.373060544919642e-06, "loss": 0.3121, "num_input_tokens_seen": 137700624, "step": 63865 }, { "epoch": 11.721416773719948, "grad_norm": 105.53632354736328, "learning_rate": 4.37226611602249e-06, "loss": 0.2824, "num_input_tokens_seen": 137711280, "step": 63870 }, { "epoch": 11.722334373279502, "grad_norm": 126.82746124267578, "learning_rate": 4.371471703225739e-06, "loss": 0.2297, "num_input_tokens_seen": 137721680, "step": 63875 }, { "epoch": 11.723251972839053, "grad_norm": 0.13562339544296265, "learning_rate": 4.370677306549766e-06, "loss": 0.1052, "num_input_tokens_seen": 137732400, "step": 63880 }, { "epoch": 11.724169572398605, "grad_norm": 20.408784866333008, "learning_rate": 4.369882926014945e-06, "loss": 0.0109, "num_input_tokens_seen": 137743632, "step": 63885 }, { "epoch": 11.725087171958158, "grad_norm": 0.20912504196166992, "learning_rate": 4.369088561641649e-06, "loss": 0.0522, "num_input_tokens_seen": 137755344, "step": 63890 }, { "epoch": 11.72600477151771, "grad_norm": 0.13312453031539917, "learning_rate": 4.3682942134502525e-06, "loss": 0.0112, "num_input_tokens_seen": 137765360, "step": 63895 }, { "epoch": 11.726922371077261, "grad_norm": 0.2675487995147705, "learning_rate": 4.367499881461132e-06, "loss": 0.055, "num_input_tokens_seen": 137775984, "step": 63900 }, { "epoch": 11.727839970636815, "grad_norm": 0.4903300106525421, "learning_rate": 4.36670556569466e-06, "loss": 0.0072, "num_input_tokens_seen": 137787216, "step": 63905 }, { "epoch": 11.728757570196366, "grad_norm": 34.516048431396484, "learning_rate": 4.365911266171206e-06, "loss": 0.3768, "num_input_tokens_seen": 137796592, "step": 63910 }, { "epoch": 11.729675169755918, "grad_norm": 196.53643798828125, "learning_rate": 4.365116982911147e-06, "loss": 0.2638, "num_input_tokens_seen": 137806224, "step": 63915 }, { "epoch": 11.730592769315471, "grad_norm": 82.30178833007812, "learning_rate": 4.364322715934854e-06, "loss": 0.0427, "num_input_tokens_seen": 137817456, "step": 63920 }, { "epoch": 11.731510368875023, "grad_norm": 150.3838653564453, "learning_rate": 4.363528465262699e-06, "loss": 0.293, "num_input_tokens_seen": 137828016, "step": 63925 }, { "epoch": 11.732427968434575, "grad_norm": 0.18690016865730286, "learning_rate": 4.3627342309150485e-06, "loss": 0.0048, "num_input_tokens_seen": 137839120, "step": 63930 }, { "epoch": 11.733345567994128, "grad_norm": 0.3573915362358093, "learning_rate": 4.361940012912281e-06, "loss": 0.0228, "num_input_tokens_seen": 137849168, "step": 63935 }, { "epoch": 11.73426316755368, "grad_norm": 11.080777168273926, "learning_rate": 4.361145811274762e-06, "loss": 0.4465, "num_input_tokens_seen": 137860272, "step": 63940 }, { "epoch": 11.735180767113231, "grad_norm": 14.631512641906738, "learning_rate": 4.360351626022861e-06, "loss": 0.4065, "num_input_tokens_seen": 137871216, "step": 63945 }, { "epoch": 11.736098366672785, "grad_norm": 89.96369934082031, "learning_rate": 4.359557457176952e-06, "loss": 0.2809, "num_input_tokens_seen": 137881360, "step": 63950 }, { "epoch": 11.737015966232336, "grad_norm": 123.85948944091797, "learning_rate": 4.358763304757402e-06, "loss": 0.1024, "num_input_tokens_seen": 137890864, "step": 63955 }, { "epoch": 11.737933565791888, "grad_norm": 1.2094330787658691, "learning_rate": 4.357969168784577e-06, "loss": 0.0788, "num_input_tokens_seen": 137901744, "step": 63960 }, { "epoch": 11.738851165351441, "grad_norm": 0.0026633108500391245, "learning_rate": 4.357175049278849e-06, "loss": 0.0298, "num_input_tokens_seen": 137911088, "step": 63965 }, { "epoch": 11.739768764910993, "grad_norm": 2.4560866355895996, "learning_rate": 4.356380946260585e-06, "loss": 0.1998, "num_input_tokens_seen": 137922224, "step": 63970 }, { "epoch": 11.740686364470545, "grad_norm": 2.39381742477417, "learning_rate": 4.355586859750153e-06, "loss": 0.0919, "num_input_tokens_seen": 137934224, "step": 63975 }, { "epoch": 11.741603964030098, "grad_norm": 0.3115837872028351, "learning_rate": 4.354792789767918e-06, "loss": 0.133, "num_input_tokens_seen": 137944176, "step": 63980 }, { "epoch": 11.74252156358965, "grad_norm": 28.188295364379883, "learning_rate": 4.353998736334248e-06, "loss": 0.3688, "num_input_tokens_seen": 137955472, "step": 63985 }, { "epoch": 11.743439163149201, "grad_norm": 46.794639587402344, "learning_rate": 4.35320469946951e-06, "loss": 0.3933, "num_input_tokens_seen": 137967536, "step": 63990 }, { "epoch": 11.744356762708755, "grad_norm": 3.089357376098633, "learning_rate": 4.352410679194067e-06, "loss": 0.0164, "num_input_tokens_seen": 137978096, "step": 63995 }, { "epoch": 11.745274362268306, "grad_norm": 19.89522361755371, "learning_rate": 4.351616675528288e-06, "loss": 0.4483, "num_input_tokens_seen": 137989808, "step": 64000 }, { "epoch": 11.746191961827858, "grad_norm": 0.1276625543832779, "learning_rate": 4.350822688492537e-06, "loss": 0.2401, "num_input_tokens_seen": 138000144, "step": 64005 }, { "epoch": 11.747109561387411, "grad_norm": 0.10665781050920486, "learning_rate": 4.350028718107175e-06, "loss": 0.4489, "num_input_tokens_seen": 138011088, "step": 64010 }, { "epoch": 11.748027160946963, "grad_norm": 0.2526776194572449, "learning_rate": 4.349234764392572e-06, "loss": 0.031, "num_input_tokens_seen": 138022000, "step": 64015 }, { "epoch": 11.748944760506514, "grad_norm": 0.22605711221694946, "learning_rate": 4.348440827369088e-06, "loss": 0.002, "num_input_tokens_seen": 138033232, "step": 64020 }, { "epoch": 11.749862360066068, "grad_norm": 0.38713160157203674, "learning_rate": 4.347646907057088e-06, "loss": 0.0036, "num_input_tokens_seen": 138042800, "step": 64025 }, { "epoch": 11.75077995962562, "grad_norm": 0.19472841918468475, "learning_rate": 4.346853003476931e-06, "loss": 0.0036, "num_input_tokens_seen": 138055504, "step": 64030 }, { "epoch": 11.751697559185171, "grad_norm": 0.2683993875980377, "learning_rate": 4.346059116648984e-06, "loss": 0.1598, "num_input_tokens_seen": 138065968, "step": 64035 }, { "epoch": 11.752615158744725, "grad_norm": 4.051147937774658, "learning_rate": 4.3452652465936075e-06, "loss": 0.3636, "num_input_tokens_seen": 138076816, "step": 64040 }, { "epoch": 11.753532758304276, "grad_norm": 0.01878349855542183, "learning_rate": 4.344471393331161e-06, "loss": 0.2857, "num_input_tokens_seen": 138086128, "step": 64045 }, { "epoch": 11.754450357863828, "grad_norm": 0.02713751792907715, "learning_rate": 4.3436775568820085e-06, "loss": 0.0545, "num_input_tokens_seen": 138095376, "step": 64050 }, { "epoch": 11.755367957423381, "grad_norm": 1.3896641731262207, "learning_rate": 4.34288373726651e-06, "loss": 0.2805, "num_input_tokens_seen": 138105968, "step": 64055 }, { "epoch": 11.756285556982933, "grad_norm": 0.14344461262226105, "learning_rate": 4.3420899345050235e-06, "loss": 0.1201, "num_input_tokens_seen": 138117360, "step": 64060 }, { "epoch": 11.757203156542484, "grad_norm": 0.1270601451396942, "learning_rate": 4.341296148617911e-06, "loss": 0.0348, "num_input_tokens_seen": 138127280, "step": 64065 }, { "epoch": 11.758120756102038, "grad_norm": 0.18395696580410004, "learning_rate": 4.340502379625533e-06, "loss": 0.0811, "num_input_tokens_seen": 138138256, "step": 64070 }, { "epoch": 11.75903835566159, "grad_norm": 0.042284801602363586, "learning_rate": 4.339708627548246e-06, "loss": 0.5227, "num_input_tokens_seen": 138148944, "step": 64075 }, { "epoch": 11.759955955221141, "grad_norm": 21.919710159301758, "learning_rate": 4.338914892406408e-06, "loss": 0.2875, "num_input_tokens_seen": 138161296, "step": 64080 }, { "epoch": 11.760873554780694, "grad_norm": 0.6849342584609985, "learning_rate": 4.33812117422038e-06, "loss": 0.108, "num_input_tokens_seen": 138172112, "step": 64085 }, { "epoch": 11.761791154340246, "grad_norm": 0.32391947507858276, "learning_rate": 4.337327473010518e-06, "loss": 0.0873, "num_input_tokens_seen": 138182096, "step": 64090 }, { "epoch": 11.762708753899798, "grad_norm": 0.06913618743419647, "learning_rate": 4.3365337887971775e-06, "loss": 0.017, "num_input_tokens_seen": 138192368, "step": 64095 }, { "epoch": 11.763626353459351, "grad_norm": 26.40842056274414, "learning_rate": 4.335740121600719e-06, "loss": 0.3188, "num_input_tokens_seen": 138204336, "step": 64100 }, { "epoch": 11.764543953018903, "grad_norm": 70.50814819335938, "learning_rate": 4.334946471441497e-06, "loss": 0.0437, "num_input_tokens_seen": 138213968, "step": 64105 }, { "epoch": 11.765461552578454, "grad_norm": 0.09202143549919128, "learning_rate": 4.334152838339867e-06, "loss": 0.017, "num_input_tokens_seen": 138223952, "step": 64110 }, { "epoch": 11.766379152138008, "grad_norm": 0.0941067636013031, "learning_rate": 4.3333592223161826e-06, "loss": 0.0028, "num_input_tokens_seen": 138235728, "step": 64115 }, { "epoch": 11.76729675169756, "grad_norm": 0.19916732609272003, "learning_rate": 4.332565623390803e-06, "loss": 0.1888, "num_input_tokens_seen": 138246832, "step": 64120 }, { "epoch": 11.768214351257111, "grad_norm": 0.11912013590335846, "learning_rate": 4.331772041584081e-06, "loss": 0.0231, "num_input_tokens_seen": 138257712, "step": 64125 }, { "epoch": 11.769131950816664, "grad_norm": 0.04672480374574661, "learning_rate": 4.3309784769163685e-06, "loss": 0.1801, "num_input_tokens_seen": 138267760, "step": 64130 }, { "epoch": 11.770049550376216, "grad_norm": 50.798667907714844, "learning_rate": 4.330184929408023e-06, "loss": 0.2934, "num_input_tokens_seen": 138277712, "step": 64135 }, { "epoch": 11.770967149935768, "grad_norm": 10.36389446258545, "learning_rate": 4.329391399079396e-06, "loss": 0.2179, "num_input_tokens_seen": 138289392, "step": 64140 }, { "epoch": 11.771884749495321, "grad_norm": 24.48588752746582, "learning_rate": 4.328597885950838e-06, "loss": 0.0636, "num_input_tokens_seen": 138300496, "step": 64145 }, { "epoch": 11.772802349054873, "grad_norm": 0.005296232644468546, "learning_rate": 4.327804390042706e-06, "loss": 0.0969, "num_input_tokens_seen": 138311280, "step": 64150 }, { "epoch": 11.773719948614424, "grad_norm": 8.686566352844238, "learning_rate": 4.327010911375349e-06, "loss": 0.3233, "num_input_tokens_seen": 138322608, "step": 64155 }, { "epoch": 11.774637548173978, "grad_norm": 53.50019073486328, "learning_rate": 4.3262174499691195e-06, "loss": 0.1522, "num_input_tokens_seen": 138333744, "step": 64160 }, { "epoch": 11.77555514773353, "grad_norm": 0.1308681070804596, "learning_rate": 4.3254240058443655e-06, "loss": 0.1947, "num_input_tokens_seen": 138344400, "step": 64165 }, { "epoch": 11.77647274729308, "grad_norm": 90.16612243652344, "learning_rate": 4.324630579021444e-06, "loss": 0.2635, "num_input_tokens_seen": 138355664, "step": 64170 }, { "epoch": 11.777390346852634, "grad_norm": 35.835567474365234, "learning_rate": 4.3238371695207e-06, "loss": 0.2506, "num_input_tokens_seen": 138366736, "step": 64175 }, { "epoch": 11.778307946412186, "grad_norm": 0.08119111508131027, "learning_rate": 4.323043777362483e-06, "loss": 0.3261, "num_input_tokens_seen": 138378704, "step": 64180 }, { "epoch": 11.779225545971737, "grad_norm": 47.26165008544922, "learning_rate": 4.3222504025671466e-06, "loss": 0.2281, "num_input_tokens_seen": 138388912, "step": 64185 }, { "epoch": 11.78014314553129, "grad_norm": 0.09524200111627579, "learning_rate": 4.321457045155035e-06, "loss": 0.1421, "num_input_tokens_seen": 138400208, "step": 64190 }, { "epoch": 11.781060745090842, "grad_norm": 213.82794189453125, "learning_rate": 4.3206637051464985e-06, "loss": 0.2329, "num_input_tokens_seen": 138411344, "step": 64195 }, { "epoch": 11.781978344650394, "grad_norm": 19.81290054321289, "learning_rate": 4.319870382561887e-06, "loss": 0.185, "num_input_tokens_seen": 138421008, "step": 64200 }, { "epoch": 11.782895944209947, "grad_norm": 4.09744119644165, "learning_rate": 4.319077077421545e-06, "loss": 0.0116, "num_input_tokens_seen": 138432240, "step": 64205 }, { "epoch": 11.783813543769499, "grad_norm": 0.02523723989725113, "learning_rate": 4.318283789745821e-06, "loss": 0.2075, "num_input_tokens_seen": 138443088, "step": 64210 }, { "epoch": 11.78473114332905, "grad_norm": 0.026407761499285698, "learning_rate": 4.31749051955506e-06, "loss": 0.0154, "num_input_tokens_seen": 138453040, "step": 64215 }, { "epoch": 11.785648742888604, "grad_norm": 0.5218796133995056, "learning_rate": 4.31669726686961e-06, "loss": 0.0015, "num_input_tokens_seen": 138462224, "step": 64220 }, { "epoch": 11.786566342448156, "grad_norm": 0.09128120541572571, "learning_rate": 4.315904031709817e-06, "loss": 0.1376, "num_input_tokens_seen": 138472720, "step": 64225 }, { "epoch": 11.787483942007707, "grad_norm": 40.35872268676758, "learning_rate": 4.315110814096023e-06, "loss": 0.4031, "num_input_tokens_seen": 138482992, "step": 64230 }, { "epoch": 11.78840154156726, "grad_norm": 69.83525848388672, "learning_rate": 4.314317614048577e-06, "loss": 0.1282, "num_input_tokens_seen": 138493552, "step": 64235 }, { "epoch": 11.789319141126812, "grad_norm": 0.03269801661372185, "learning_rate": 4.313524431587822e-06, "loss": 0.0266, "num_input_tokens_seen": 138503952, "step": 64240 }, { "epoch": 11.790236740686364, "grad_norm": 0.08517343550920486, "learning_rate": 4.312731266734101e-06, "loss": 0.0756, "num_input_tokens_seen": 138514736, "step": 64245 }, { "epoch": 11.791154340245917, "grad_norm": 1.062139868736267, "learning_rate": 4.311938119507756e-06, "loss": 0.2304, "num_input_tokens_seen": 138524528, "step": 64250 }, { "epoch": 11.792071939805469, "grad_norm": 152.7067413330078, "learning_rate": 4.311144989929134e-06, "loss": 0.0952, "num_input_tokens_seen": 138535120, "step": 64255 }, { "epoch": 11.79298953936502, "grad_norm": 0.06855988502502441, "learning_rate": 4.310351878018575e-06, "loss": 0.1322, "num_input_tokens_seen": 138546064, "step": 64260 }, { "epoch": 11.793907138924574, "grad_norm": 146.2969512939453, "learning_rate": 4.30955878379642e-06, "loss": 0.649, "num_input_tokens_seen": 138557200, "step": 64265 }, { "epoch": 11.794824738484126, "grad_norm": 27.424720764160156, "learning_rate": 4.3087657072830136e-06, "loss": 0.4371, "num_input_tokens_seen": 138568272, "step": 64270 }, { "epoch": 11.795742338043677, "grad_norm": 55.97309112548828, "learning_rate": 4.307972648498696e-06, "loss": 0.1335, "num_input_tokens_seen": 138578608, "step": 64275 }, { "epoch": 11.79665993760323, "grad_norm": 0.18225671350955963, "learning_rate": 4.307179607463806e-06, "loss": 0.0019, "num_input_tokens_seen": 138589168, "step": 64280 }, { "epoch": 11.797577537162782, "grad_norm": 0.24949653446674347, "learning_rate": 4.306386584198685e-06, "loss": 0.0017, "num_input_tokens_seen": 138599344, "step": 64285 }, { "epoch": 11.798495136722334, "grad_norm": 0.07964964956045151, "learning_rate": 4.305593578723674e-06, "loss": 0.0487, "num_input_tokens_seen": 138610320, "step": 64290 }, { "epoch": 11.799412736281887, "grad_norm": 1.2586994171142578, "learning_rate": 4.304800591059113e-06, "loss": 0.2454, "num_input_tokens_seen": 138620432, "step": 64295 }, { "epoch": 11.800330335841439, "grad_norm": 1.1598892211914062, "learning_rate": 4.304007621225336e-06, "loss": 0.106, "num_input_tokens_seen": 138630928, "step": 64300 }, { "epoch": 11.80124793540099, "grad_norm": 0.08004838228225708, "learning_rate": 4.303214669242688e-06, "loss": 0.9053, "num_input_tokens_seen": 138642192, "step": 64305 }, { "epoch": 11.802165534960544, "grad_norm": 121.54176330566406, "learning_rate": 4.302421735131502e-06, "loss": 0.1914, "num_input_tokens_seen": 138652880, "step": 64310 }, { "epoch": 11.803083134520095, "grad_norm": 0.7309561967849731, "learning_rate": 4.301628818912117e-06, "loss": 0.0037, "num_input_tokens_seen": 138664816, "step": 64315 }, { "epoch": 11.804000734079647, "grad_norm": 27.34302520751953, "learning_rate": 4.3008359206048716e-06, "loss": 0.4213, "num_input_tokens_seen": 138675216, "step": 64320 }, { "epoch": 11.8049183336392, "grad_norm": 12.714159965515137, "learning_rate": 4.300043040230101e-06, "loss": 0.2726, "num_input_tokens_seen": 138686896, "step": 64325 }, { "epoch": 11.805835933198752, "grad_norm": 0.11569102108478546, "learning_rate": 4.29925017780814e-06, "loss": 0.0488, "num_input_tokens_seen": 138698032, "step": 64330 }, { "epoch": 11.806753532758304, "grad_norm": 23.55223274230957, "learning_rate": 4.298457333359328e-06, "loss": 0.1264, "num_input_tokens_seen": 138709584, "step": 64335 }, { "epoch": 11.807671132317857, "grad_norm": 0.2895291745662689, "learning_rate": 4.297664506903998e-06, "loss": 0.1304, "num_input_tokens_seen": 138719248, "step": 64340 }, { "epoch": 11.808588731877409, "grad_norm": 0.05284033715724945, "learning_rate": 4.296871698462485e-06, "loss": 0.2065, "num_input_tokens_seen": 138731760, "step": 64345 }, { "epoch": 11.80950633143696, "grad_norm": 0.05661604553461075, "learning_rate": 4.296078908055121e-06, "loss": 0.4891, "num_input_tokens_seen": 138741424, "step": 64350 }, { "epoch": 11.810423930996514, "grad_norm": 142.94900512695312, "learning_rate": 4.295286135702243e-06, "loss": 0.211, "num_input_tokens_seen": 138751856, "step": 64355 }, { "epoch": 11.811341530556065, "grad_norm": 354.736328125, "learning_rate": 4.294493381424185e-06, "loss": 0.1926, "num_input_tokens_seen": 138762704, "step": 64360 }, { "epoch": 11.812259130115617, "grad_norm": 59.3294563293457, "learning_rate": 4.293700645241276e-06, "loss": 0.3853, "num_input_tokens_seen": 138774608, "step": 64365 }, { "epoch": 11.81317672967517, "grad_norm": 37.728816986083984, "learning_rate": 4.292907927173853e-06, "loss": 0.1616, "num_input_tokens_seen": 138785264, "step": 64370 }, { "epoch": 11.814094329234722, "grad_norm": 85.61576080322266, "learning_rate": 4.292115227242245e-06, "loss": 0.3833, "num_input_tokens_seen": 138796016, "step": 64375 }, { "epoch": 11.815011928794274, "grad_norm": 0.5498696565628052, "learning_rate": 4.2913225454667844e-06, "loss": 0.1896, "num_input_tokens_seen": 138805616, "step": 64380 }, { "epoch": 11.815929528353827, "grad_norm": 31.598527908325195, "learning_rate": 4.290529881867801e-06, "loss": 0.2863, "num_input_tokens_seen": 138816208, "step": 64385 }, { "epoch": 11.816847127913379, "grad_norm": 30.40782356262207, "learning_rate": 4.289737236465627e-06, "loss": 0.2144, "num_input_tokens_seen": 138827152, "step": 64390 }, { "epoch": 11.81776472747293, "grad_norm": 1.6407567262649536, "learning_rate": 4.288944609280593e-06, "loss": 0.0854, "num_input_tokens_seen": 138838512, "step": 64395 }, { "epoch": 11.818682327032484, "grad_norm": 0.3724452555179596, "learning_rate": 4.2881520003330265e-06, "loss": 0.3011, "num_input_tokens_seen": 138849296, "step": 64400 }, { "epoch": 11.819599926592035, "grad_norm": 0.27147573232650757, "learning_rate": 4.28735940964326e-06, "loss": 0.2749, "num_input_tokens_seen": 138858960, "step": 64405 }, { "epoch": 11.820517526151587, "grad_norm": 0.30703747272491455, "learning_rate": 4.286566837231617e-06, "loss": 0.003, "num_input_tokens_seen": 138871696, "step": 64410 }, { "epoch": 11.82143512571114, "grad_norm": 25.07834815979004, "learning_rate": 4.285774283118431e-06, "loss": 0.0178, "num_input_tokens_seen": 138883472, "step": 64415 }, { "epoch": 11.822352725270692, "grad_norm": 0.18791913986206055, "learning_rate": 4.284981747324028e-06, "loss": 0.2312, "num_input_tokens_seen": 138893456, "step": 64420 }, { "epoch": 11.823270324830244, "grad_norm": 1.4952726364135742, "learning_rate": 4.284189229868735e-06, "loss": 0.416, "num_input_tokens_seen": 138903472, "step": 64425 }, { "epoch": 11.824187924389797, "grad_norm": 0.40742331743240356, "learning_rate": 4.283396730772879e-06, "loss": 0.0805, "num_input_tokens_seen": 138914672, "step": 64430 }, { "epoch": 11.825105523949349, "grad_norm": 0.5728297233581543, "learning_rate": 4.282604250056784e-06, "loss": 0.0019, "num_input_tokens_seen": 138925360, "step": 64435 }, { "epoch": 11.8260231235089, "grad_norm": 35.19437026977539, "learning_rate": 4.281811787740779e-06, "loss": 0.3343, "num_input_tokens_seen": 138936400, "step": 64440 }, { "epoch": 11.826940723068454, "grad_norm": 0.09610838443040848, "learning_rate": 4.28101934384519e-06, "loss": 0.3962, "num_input_tokens_seen": 138947408, "step": 64445 }, { "epoch": 11.827858322628005, "grad_norm": 0.388510137796402, "learning_rate": 4.280226918390338e-06, "loss": 0.2471, "num_input_tokens_seen": 138956528, "step": 64450 }, { "epoch": 11.828775922187557, "grad_norm": 143.162109375, "learning_rate": 4.2794345113965524e-06, "loss": 0.2059, "num_input_tokens_seen": 138966256, "step": 64455 }, { "epoch": 11.82969352174711, "grad_norm": 61.775367736816406, "learning_rate": 4.278642122884154e-06, "loss": 0.1247, "num_input_tokens_seen": 138976592, "step": 64460 }, { "epoch": 11.830611121306662, "grad_norm": 117.20590209960938, "learning_rate": 4.277849752873466e-06, "loss": 0.4025, "num_input_tokens_seen": 138986736, "step": 64465 }, { "epoch": 11.831528720866213, "grad_norm": 4.204562187194824, "learning_rate": 4.277057401384815e-06, "loss": 0.0829, "num_input_tokens_seen": 138996592, "step": 64470 }, { "epoch": 11.832446320425767, "grad_norm": 0.7519938945770264, "learning_rate": 4.276265068438521e-06, "loss": 0.2203, "num_input_tokens_seen": 139006864, "step": 64475 }, { "epoch": 11.833363919985318, "grad_norm": 12.24411678314209, "learning_rate": 4.2754727540549056e-06, "loss": 0.1881, "num_input_tokens_seen": 139017040, "step": 64480 }, { "epoch": 11.83428151954487, "grad_norm": 16.528005599975586, "learning_rate": 4.2746804582542886e-06, "loss": 0.1593, "num_input_tokens_seen": 139027408, "step": 64485 }, { "epoch": 11.835199119104423, "grad_norm": 24.871654510498047, "learning_rate": 4.273888181056997e-06, "loss": 0.3164, "num_input_tokens_seen": 139039824, "step": 64490 }, { "epoch": 11.836116718663975, "grad_norm": 0.27437180280685425, "learning_rate": 4.2730959224833475e-06, "loss": 0.0913, "num_input_tokens_seen": 139050160, "step": 64495 }, { "epoch": 11.837034318223527, "grad_norm": 1.609165906906128, "learning_rate": 4.27230368255366e-06, "loss": 0.3657, "num_input_tokens_seen": 139060400, "step": 64500 }, { "epoch": 11.83795191778308, "grad_norm": 103.13422393798828, "learning_rate": 4.271511461288255e-06, "loss": 0.2394, "num_input_tokens_seen": 139070352, "step": 64505 }, { "epoch": 11.838869517342632, "grad_norm": 6.171080112457275, "learning_rate": 4.270719258707454e-06, "loss": 0.0923, "num_input_tokens_seen": 139081872, "step": 64510 }, { "epoch": 11.839787116902183, "grad_norm": 23.47849464416504, "learning_rate": 4.269927074831571e-06, "loss": 0.0954, "num_input_tokens_seen": 139093136, "step": 64515 }, { "epoch": 11.840704716461737, "grad_norm": 0.3613206148147583, "learning_rate": 4.26913490968093e-06, "loss": 0.0681, "num_input_tokens_seen": 139105136, "step": 64520 }, { "epoch": 11.841622316021288, "grad_norm": 0.06553644686937332, "learning_rate": 4.268342763275844e-06, "loss": 0.0037, "num_input_tokens_seen": 139116912, "step": 64525 }, { "epoch": 11.84253991558084, "grad_norm": 108.51764678955078, "learning_rate": 4.2675506356366336e-06, "loss": 0.1246, "num_input_tokens_seen": 139127600, "step": 64530 }, { "epoch": 11.843457515140393, "grad_norm": 2.850383758544922, "learning_rate": 4.2667585267836114e-06, "loss": 0.243, "num_input_tokens_seen": 139138320, "step": 64535 }, { "epoch": 11.844375114699945, "grad_norm": 52.02326202392578, "learning_rate": 4.265966436737099e-06, "loss": 0.3951, "num_input_tokens_seen": 139148976, "step": 64540 }, { "epoch": 11.845292714259497, "grad_norm": 0.09563177824020386, "learning_rate": 4.2651743655174096e-06, "loss": 0.4576, "num_input_tokens_seen": 139159760, "step": 64545 }, { "epoch": 11.84621031381905, "grad_norm": 22.59520721435547, "learning_rate": 4.264382313144857e-06, "loss": 0.1252, "num_input_tokens_seen": 139171088, "step": 64550 }, { "epoch": 11.847127913378602, "grad_norm": 3.6196513175964355, "learning_rate": 4.26359027963976e-06, "loss": 0.1139, "num_input_tokens_seen": 139181328, "step": 64555 }, { "epoch": 11.848045512938153, "grad_norm": 0.17615097761154175, "learning_rate": 4.262798265022431e-06, "loss": 0.1732, "num_input_tokens_seen": 139191248, "step": 64560 }, { "epoch": 11.848963112497707, "grad_norm": 0.23376305401325226, "learning_rate": 4.2620062693131834e-06, "loss": 0.0034, "num_input_tokens_seen": 139201552, "step": 64565 }, { "epoch": 11.849880712057258, "grad_norm": 45.659629821777344, "learning_rate": 4.2612142925323294e-06, "loss": 0.1573, "num_input_tokens_seen": 139213168, "step": 64570 }, { "epoch": 11.85079831161681, "grad_norm": 0.10352842509746552, "learning_rate": 4.260422334700187e-06, "loss": 0.172, "num_input_tokens_seen": 139223760, "step": 64575 }, { "epoch": 11.851715911176363, "grad_norm": 5.050448417663574, "learning_rate": 4.2596303958370635e-06, "loss": 0.0058, "num_input_tokens_seen": 139234736, "step": 64580 }, { "epoch": 11.852633510735915, "grad_norm": 0.11882060021162033, "learning_rate": 4.258838475963273e-06, "loss": 0.0026, "num_input_tokens_seen": 139245520, "step": 64585 }, { "epoch": 11.853551110295466, "grad_norm": 0.17292025685310364, "learning_rate": 4.258046575099126e-06, "loss": 0.1848, "num_input_tokens_seen": 139255600, "step": 64590 }, { "epoch": 11.85446870985502, "grad_norm": 51.9290771484375, "learning_rate": 4.257254693264937e-06, "loss": 0.2062, "num_input_tokens_seen": 139265968, "step": 64595 }, { "epoch": 11.855386309414571, "grad_norm": 0.8425067067146301, "learning_rate": 4.256462830481012e-06, "loss": 0.1855, "num_input_tokens_seen": 139276752, "step": 64600 }, { "epoch": 11.856303908974123, "grad_norm": 2.4903879165649414, "learning_rate": 4.255670986767664e-06, "loss": 0.1138, "num_input_tokens_seen": 139287472, "step": 64605 }, { "epoch": 11.857221508533677, "grad_norm": 28.578527450561523, "learning_rate": 4.254879162145201e-06, "loss": 0.0834, "num_input_tokens_seen": 139297584, "step": 64610 }, { "epoch": 11.858139108093228, "grad_norm": 0.634719967842102, "learning_rate": 4.254087356633934e-06, "loss": 0.2547, "num_input_tokens_seen": 139308560, "step": 64615 }, { "epoch": 11.85905670765278, "grad_norm": 312.0218200683594, "learning_rate": 4.2532955702541686e-06, "loss": 0.5564, "num_input_tokens_seen": 139317680, "step": 64620 }, { "epoch": 11.859974307212333, "grad_norm": 0.057837583124637604, "learning_rate": 4.2525038030262155e-06, "loss": 0.2617, "num_input_tokens_seen": 139328432, "step": 64625 }, { "epoch": 11.860891906771885, "grad_norm": 19.211688995361328, "learning_rate": 4.25171205497038e-06, "loss": 0.3005, "num_input_tokens_seen": 139339696, "step": 64630 }, { "epoch": 11.861809506331436, "grad_norm": 35.7518196105957, "learning_rate": 4.250920326106971e-06, "loss": 0.0904, "num_input_tokens_seen": 139351728, "step": 64635 }, { "epoch": 11.86272710589099, "grad_norm": 66.21636962890625, "learning_rate": 4.2501286164562965e-06, "loss": 0.1628, "num_input_tokens_seen": 139361616, "step": 64640 }, { "epoch": 11.863644705450541, "grad_norm": 1.5982792377471924, "learning_rate": 4.249336926038659e-06, "loss": 0.0014, "num_input_tokens_seen": 139373136, "step": 64645 }, { "epoch": 11.864562305010093, "grad_norm": 1.4799801111221313, "learning_rate": 4.248545254874365e-06, "loss": 0.0015, "num_input_tokens_seen": 139383120, "step": 64650 }, { "epoch": 11.865479904569646, "grad_norm": 0.08394435048103333, "learning_rate": 4.247753602983722e-06, "loss": 0.0145, "num_input_tokens_seen": 139394608, "step": 64655 }, { "epoch": 11.866397504129198, "grad_norm": 0.10852570086717606, "learning_rate": 4.246961970387034e-06, "loss": 0.0783, "num_input_tokens_seen": 139404144, "step": 64660 }, { "epoch": 11.86731510368875, "grad_norm": 77.84419250488281, "learning_rate": 4.246170357104604e-06, "loss": 0.359, "num_input_tokens_seen": 139414512, "step": 64665 }, { "epoch": 11.868232703248303, "grad_norm": 143.8889617919922, "learning_rate": 4.245378763156734e-06, "loss": 0.1188, "num_input_tokens_seen": 139426576, "step": 64670 }, { "epoch": 11.869150302807855, "grad_norm": 0.8992999196052551, "learning_rate": 4.2445871885637314e-06, "loss": 0.4441, "num_input_tokens_seen": 139437616, "step": 64675 }, { "epoch": 11.870067902367406, "grad_norm": 0.3947530686855316, "learning_rate": 4.2437956333458966e-06, "loss": 0.145, "num_input_tokens_seen": 139449488, "step": 64680 }, { "epoch": 11.87098550192696, "grad_norm": 19.906742095947266, "learning_rate": 4.243004097523529e-06, "loss": 0.318, "num_input_tokens_seen": 139461168, "step": 64685 }, { "epoch": 11.871903101486511, "grad_norm": 20.226137161254883, "learning_rate": 4.242212581116936e-06, "loss": 0.0108, "num_input_tokens_seen": 139470416, "step": 64690 }, { "epoch": 11.872820701046063, "grad_norm": 0.05243498459458351, "learning_rate": 4.241421084146415e-06, "loss": 0.0013, "num_input_tokens_seen": 139481808, "step": 64695 }, { "epoch": 11.873738300605616, "grad_norm": 0.19459126889705658, "learning_rate": 4.240629606632268e-06, "loss": 0.103, "num_input_tokens_seen": 139492080, "step": 64700 }, { "epoch": 11.874655900165168, "grad_norm": 0.18564599752426147, "learning_rate": 4.239838148594793e-06, "loss": 0.1489, "num_input_tokens_seen": 139503184, "step": 64705 }, { "epoch": 11.87557349972472, "grad_norm": 13.702156066894531, "learning_rate": 4.239046710054293e-06, "loss": 0.0305, "num_input_tokens_seen": 139514192, "step": 64710 }, { "epoch": 11.876491099284273, "grad_norm": 164.29144287109375, "learning_rate": 4.238255291031065e-06, "loss": 0.5367, "num_input_tokens_seen": 139524528, "step": 64715 }, { "epoch": 11.877408698843825, "grad_norm": 0.0907144695520401, "learning_rate": 4.237463891545408e-06, "loss": 0.2562, "num_input_tokens_seen": 139534352, "step": 64720 }, { "epoch": 11.878326298403376, "grad_norm": 0.5020838975906372, "learning_rate": 4.236672511617621e-06, "loss": 0.0042, "num_input_tokens_seen": 139545264, "step": 64725 }, { "epoch": 11.87924389796293, "grad_norm": 16.20353126525879, "learning_rate": 4.235881151268001e-06, "loss": 0.1137, "num_input_tokens_seen": 139556048, "step": 64730 }, { "epoch": 11.880161497522481, "grad_norm": 28.681785583496094, "learning_rate": 4.235089810516844e-06, "loss": 0.0608, "num_input_tokens_seen": 139567088, "step": 64735 }, { "epoch": 11.881079097082033, "grad_norm": 11.96777057647705, "learning_rate": 4.234298489384449e-06, "loss": 0.1877, "num_input_tokens_seen": 139577744, "step": 64740 }, { "epoch": 11.881996696641586, "grad_norm": 96.47161865234375, "learning_rate": 4.23350718789111e-06, "loss": 0.5254, "num_input_tokens_seen": 139589040, "step": 64745 }, { "epoch": 11.882914296201138, "grad_norm": 1.0368701219558716, "learning_rate": 4.232715906057126e-06, "loss": 0.0681, "num_input_tokens_seen": 139600272, "step": 64750 }, { "epoch": 11.88383189576069, "grad_norm": 0.3422321677207947, "learning_rate": 4.231924643902786e-06, "loss": 0.2761, "num_input_tokens_seen": 139611248, "step": 64755 }, { "epoch": 11.884749495320243, "grad_norm": 1.1593732833862305, "learning_rate": 4.231133401448391e-06, "loss": 0.0034, "num_input_tokens_seen": 139622192, "step": 64760 }, { "epoch": 11.885667094879794, "grad_norm": 12.414051055908203, "learning_rate": 4.230342178714233e-06, "loss": 0.2003, "num_input_tokens_seen": 139633008, "step": 64765 }, { "epoch": 11.886584694439346, "grad_norm": 0.17676496505737305, "learning_rate": 4.229550975720603e-06, "loss": 0.0018, "num_input_tokens_seen": 139642768, "step": 64770 }, { "epoch": 11.8875022939989, "grad_norm": 16.96769905090332, "learning_rate": 4.2287597924877986e-06, "loss": 0.1487, "num_input_tokens_seen": 139654000, "step": 64775 }, { "epoch": 11.888419893558451, "grad_norm": 67.89341735839844, "learning_rate": 4.22796862903611e-06, "loss": 0.2387, "num_input_tokens_seen": 139663696, "step": 64780 }, { "epoch": 11.889337493118003, "grad_norm": 42.47207260131836, "learning_rate": 4.227177485385827e-06, "loss": 0.301, "num_input_tokens_seen": 139674672, "step": 64785 }, { "epoch": 11.890255092677556, "grad_norm": 50.49677658081055, "learning_rate": 4.226386361557247e-06, "loss": 0.0352, "num_input_tokens_seen": 139685712, "step": 64790 }, { "epoch": 11.891172692237108, "grad_norm": 23.318296432495117, "learning_rate": 4.225595257570657e-06, "loss": 0.5988, "num_input_tokens_seen": 139697168, "step": 64795 }, { "epoch": 11.89209029179666, "grad_norm": 0.113178551197052, "learning_rate": 4.224804173446349e-06, "loss": 0.1738, "num_input_tokens_seen": 139707536, "step": 64800 }, { "epoch": 11.893007891356213, "grad_norm": 191.2975616455078, "learning_rate": 4.224013109204609e-06, "loss": 0.2182, "num_input_tokens_seen": 139717520, "step": 64805 }, { "epoch": 11.893925490915764, "grad_norm": 2.6352646350860596, "learning_rate": 4.223222064865733e-06, "loss": 0.0037, "num_input_tokens_seen": 139726704, "step": 64810 }, { "epoch": 11.894843090475316, "grad_norm": 0.5230844020843506, "learning_rate": 4.222431040450008e-06, "loss": 0.1331, "num_input_tokens_seen": 139737232, "step": 64815 }, { "epoch": 11.89576069003487, "grad_norm": 0.27608397603034973, "learning_rate": 4.221640035977719e-06, "loss": 0.1364, "num_input_tokens_seen": 139748080, "step": 64820 }, { "epoch": 11.896678289594421, "grad_norm": 0.6356008648872375, "learning_rate": 4.220849051469158e-06, "loss": 0.0783, "num_input_tokens_seen": 139759056, "step": 64825 }, { "epoch": 11.897595889153973, "grad_norm": 7.6525983810424805, "learning_rate": 4.220058086944611e-06, "loss": 0.0392, "num_input_tokens_seen": 139770832, "step": 64830 }, { "epoch": 11.898513488713526, "grad_norm": 0.5067511200904846, "learning_rate": 4.219267142424367e-06, "loss": 0.2243, "num_input_tokens_seen": 139783408, "step": 64835 }, { "epoch": 11.899431088273078, "grad_norm": 0.2574175298213959, "learning_rate": 4.218476217928709e-06, "loss": 0.2665, "num_input_tokens_seen": 139794384, "step": 64840 }, { "epoch": 11.90034868783263, "grad_norm": 0.4695391058921814, "learning_rate": 4.217685313477926e-06, "loss": 0.0147, "num_input_tokens_seen": 139804848, "step": 64845 }, { "epoch": 11.901266287392183, "grad_norm": 23.188251495361328, "learning_rate": 4.216894429092301e-06, "loss": 0.2687, "num_input_tokens_seen": 139816400, "step": 64850 }, { "epoch": 11.902183886951734, "grad_norm": 0.26555362343788147, "learning_rate": 4.21610356479212e-06, "loss": 0.157, "num_input_tokens_seen": 139828624, "step": 64855 }, { "epoch": 11.903101486511286, "grad_norm": 0.04408808425068855, "learning_rate": 4.215312720597669e-06, "loss": 0.0831, "num_input_tokens_seen": 139838480, "step": 64860 }, { "epoch": 11.90401908607084, "grad_norm": 0.6086057424545288, "learning_rate": 4.21452189652923e-06, "loss": 0.1205, "num_input_tokens_seen": 139847856, "step": 64865 }, { "epoch": 11.90493668563039, "grad_norm": 31.132246017456055, "learning_rate": 4.213731092607085e-06, "loss": 0.2017, "num_input_tokens_seen": 139857808, "step": 64870 }, { "epoch": 11.905854285189942, "grad_norm": 11.593988418579102, "learning_rate": 4.212940308851521e-06, "loss": 0.013, "num_input_tokens_seen": 139868112, "step": 64875 }, { "epoch": 11.906771884749496, "grad_norm": 0.0948096439242363, "learning_rate": 4.212149545282819e-06, "loss": 0.1305, "num_input_tokens_seen": 139879984, "step": 64880 }, { "epoch": 11.907689484309048, "grad_norm": 45.964969635009766, "learning_rate": 4.211358801921259e-06, "loss": 0.3353, "num_input_tokens_seen": 139891536, "step": 64885 }, { "epoch": 11.9086070838686, "grad_norm": 0.33666664361953735, "learning_rate": 4.210568078787122e-06, "loss": 0.0803, "num_input_tokens_seen": 139901456, "step": 64890 }, { "epoch": 11.909524683428153, "grad_norm": 37.601871490478516, "learning_rate": 4.209777375900692e-06, "loss": 0.0197, "num_input_tokens_seen": 139912560, "step": 64895 }, { "epoch": 11.910442282987704, "grad_norm": 104.49490356445312, "learning_rate": 4.208986693282248e-06, "loss": 0.3211, "num_input_tokens_seen": 139923184, "step": 64900 }, { "epoch": 11.911359882547256, "grad_norm": 0.05321900546550751, "learning_rate": 4.2081960309520676e-06, "loss": 0.2903, "num_input_tokens_seen": 139934992, "step": 64905 }, { "epoch": 11.91227748210681, "grad_norm": 0.08196781575679779, "learning_rate": 4.207405388930434e-06, "loss": 0.0048, "num_input_tokens_seen": 139945680, "step": 64910 }, { "epoch": 11.91319508166636, "grad_norm": 1.3935917615890503, "learning_rate": 4.206614767237622e-06, "loss": 0.1532, "num_input_tokens_seen": 139956720, "step": 64915 }, { "epoch": 11.914112681225912, "grad_norm": 84.53055572509766, "learning_rate": 4.205824165893912e-06, "loss": 0.3181, "num_input_tokens_seen": 139968144, "step": 64920 }, { "epoch": 11.915030280785466, "grad_norm": 0.441255658864975, "learning_rate": 4.205033584919581e-06, "loss": 0.1677, "num_input_tokens_seen": 139978224, "step": 64925 }, { "epoch": 11.915947880345017, "grad_norm": 35.21105194091797, "learning_rate": 4.204243024334907e-06, "loss": 0.1639, "num_input_tokens_seen": 139988688, "step": 64930 }, { "epoch": 11.916865479904569, "grad_norm": 0.9090508818626404, "learning_rate": 4.203452484160167e-06, "loss": 0.3356, "num_input_tokens_seen": 139999024, "step": 64935 }, { "epoch": 11.917783079464122, "grad_norm": 17.974998474121094, "learning_rate": 4.202661964415635e-06, "loss": 0.1952, "num_input_tokens_seen": 140010512, "step": 64940 }, { "epoch": 11.918700679023674, "grad_norm": 0.08835117518901825, "learning_rate": 4.201871465121589e-06, "loss": 0.0951, "num_input_tokens_seen": 140021168, "step": 64945 }, { "epoch": 11.919618278583226, "grad_norm": 2.911621332168579, "learning_rate": 4.201080986298302e-06, "loss": 0.0818, "num_input_tokens_seen": 140031536, "step": 64950 }, { "epoch": 11.920535878142779, "grad_norm": 6.677484035491943, "learning_rate": 4.200290527966048e-06, "loss": 0.005, "num_input_tokens_seen": 140041104, "step": 64955 }, { "epoch": 11.92145347770233, "grad_norm": 0.04502519220113754, "learning_rate": 4.199500090145105e-06, "loss": 0.289, "num_input_tokens_seen": 140052240, "step": 64960 }, { "epoch": 11.922371077261882, "grad_norm": 0.17008048295974731, "learning_rate": 4.1987096728557435e-06, "loss": 0.0021, "num_input_tokens_seen": 140063888, "step": 64965 }, { "epoch": 11.923288676821436, "grad_norm": 2.0930891036987305, "learning_rate": 4.197919276118235e-06, "loss": 0.0013, "num_input_tokens_seen": 140073552, "step": 64970 }, { "epoch": 11.924206276380987, "grad_norm": 0.13364751636981964, "learning_rate": 4.197128899952856e-06, "loss": 0.002, "num_input_tokens_seen": 140084560, "step": 64975 }, { "epoch": 11.925123875940539, "grad_norm": 49.617584228515625, "learning_rate": 4.196338544379877e-06, "loss": 0.3649, "num_input_tokens_seen": 140095248, "step": 64980 }, { "epoch": 11.926041475500092, "grad_norm": 267.06475830078125, "learning_rate": 4.195548209419568e-06, "loss": 0.2625, "num_input_tokens_seen": 140106576, "step": 64985 }, { "epoch": 11.926959075059644, "grad_norm": 0.9841338992118835, "learning_rate": 4.194757895092199e-06, "loss": 0.1503, "num_input_tokens_seen": 140116432, "step": 64990 }, { "epoch": 11.927876674619196, "grad_norm": 0.2614858150482178, "learning_rate": 4.193967601418044e-06, "loss": 0.0009, "num_input_tokens_seen": 140127664, "step": 64995 }, { "epoch": 11.928794274178749, "grad_norm": 0.04951991140842438, "learning_rate": 4.19317732841737e-06, "loss": 0.0043, "num_input_tokens_seen": 140138832, "step": 65000 }, { "epoch": 11.9297118737383, "grad_norm": 87.2059097290039, "learning_rate": 4.192387076110446e-06, "loss": 0.1156, "num_input_tokens_seen": 140150960, "step": 65005 }, { "epoch": 11.930629473297852, "grad_norm": 702.2507934570312, "learning_rate": 4.191596844517544e-06, "loss": 0.2264, "num_input_tokens_seen": 140161552, "step": 65010 }, { "epoch": 11.931547072857406, "grad_norm": 0.08494304865598679, "learning_rate": 4.190806633658929e-06, "loss": 0.2849, "num_input_tokens_seen": 140172816, "step": 65015 }, { "epoch": 11.932464672416957, "grad_norm": 35.79792404174805, "learning_rate": 4.190016443554871e-06, "loss": 0.4112, "num_input_tokens_seen": 140184240, "step": 65020 }, { "epoch": 11.933382271976509, "grad_norm": 0.16420584917068481, "learning_rate": 4.1892262742256325e-06, "loss": 0.0657, "num_input_tokens_seen": 140194256, "step": 65025 }, { "epoch": 11.934299871536062, "grad_norm": 63.33846664428711, "learning_rate": 4.1884361256914864e-06, "loss": 0.4405, "num_input_tokens_seen": 140204080, "step": 65030 }, { "epoch": 11.935217471095614, "grad_norm": 1.3452236652374268, "learning_rate": 4.187645997972696e-06, "loss": 0.5652, "num_input_tokens_seen": 140214704, "step": 65035 }, { "epoch": 11.936135070655165, "grad_norm": 56.98211669921875, "learning_rate": 4.186855891089525e-06, "loss": 0.0594, "num_input_tokens_seen": 140226288, "step": 65040 }, { "epoch": 11.937052670214719, "grad_norm": 2.804433584213257, "learning_rate": 4.186065805062241e-06, "loss": 0.0049, "num_input_tokens_seen": 140237168, "step": 65045 }, { "epoch": 11.93797026977427, "grad_norm": 0.09057193249464035, "learning_rate": 4.185275739911107e-06, "loss": 0.458, "num_input_tokens_seen": 140249072, "step": 65050 }, { "epoch": 11.938887869333822, "grad_norm": 109.075927734375, "learning_rate": 4.184485695656388e-06, "loss": 0.1983, "num_input_tokens_seen": 140260560, "step": 65055 }, { "epoch": 11.939805468893375, "grad_norm": 0.3481828570365906, "learning_rate": 4.183695672318348e-06, "loss": 0.0713, "num_input_tokens_seen": 140271600, "step": 65060 }, { "epoch": 11.940723068452927, "grad_norm": 201.31097412109375, "learning_rate": 4.182905669917248e-06, "loss": 0.1142, "num_input_tokens_seen": 140281488, "step": 65065 }, { "epoch": 11.941640668012479, "grad_norm": 38.1127815246582, "learning_rate": 4.182115688473352e-06, "loss": 0.1233, "num_input_tokens_seen": 140291056, "step": 65070 }, { "epoch": 11.942558267572032, "grad_norm": 32.678035736083984, "learning_rate": 4.181325728006919e-06, "loss": 0.0172, "num_input_tokens_seen": 140302640, "step": 65075 }, { "epoch": 11.943475867131584, "grad_norm": 0.8459295630455017, "learning_rate": 4.180535788538213e-06, "loss": 0.2397, "num_input_tokens_seen": 140313264, "step": 65080 }, { "epoch": 11.944393466691135, "grad_norm": 0.05692128837108612, "learning_rate": 4.179745870087495e-06, "loss": 0.4539, "num_input_tokens_seen": 140325424, "step": 65085 }, { "epoch": 11.945311066250689, "grad_norm": 155.9462127685547, "learning_rate": 4.178955972675022e-06, "loss": 0.3255, "num_input_tokens_seen": 140335952, "step": 65090 }, { "epoch": 11.94622866581024, "grad_norm": 0.1420583724975586, "learning_rate": 4.178166096321058e-06, "loss": 0.3671, "num_input_tokens_seen": 140346992, "step": 65095 }, { "epoch": 11.947146265369792, "grad_norm": 0.7086414694786072, "learning_rate": 4.17737624104586e-06, "loss": 0.5349, "num_input_tokens_seen": 140358448, "step": 65100 }, { "epoch": 11.948063864929345, "grad_norm": 46.57429122924805, "learning_rate": 4.1765864068696834e-06, "loss": 0.1389, "num_input_tokens_seen": 140368496, "step": 65105 }, { "epoch": 11.948981464488897, "grad_norm": 20.226329803466797, "learning_rate": 4.175796593812792e-06, "loss": 0.007, "num_input_tokens_seen": 140379536, "step": 65110 }, { "epoch": 11.949899064048449, "grad_norm": 3.436293125152588, "learning_rate": 4.175006801895441e-06, "loss": 0.0037, "num_input_tokens_seen": 140389232, "step": 65115 }, { "epoch": 11.950816663608002, "grad_norm": 1.048072338104248, "learning_rate": 4.174217031137886e-06, "loss": 0.0005, "num_input_tokens_seen": 140399472, "step": 65120 }, { "epoch": 11.951734263167554, "grad_norm": 0.38366374373435974, "learning_rate": 4.173427281560383e-06, "loss": 0.1126, "num_input_tokens_seen": 140411056, "step": 65125 }, { "epoch": 11.952651862727105, "grad_norm": 104.38811492919922, "learning_rate": 4.172637553183191e-06, "loss": 0.1851, "num_input_tokens_seen": 140422320, "step": 65130 }, { "epoch": 11.953569462286659, "grad_norm": 0.4635903239250183, "learning_rate": 4.1718478460265635e-06, "loss": 0.4838, "num_input_tokens_seen": 140431760, "step": 65135 }, { "epoch": 11.95448706184621, "grad_norm": 26.916854858398438, "learning_rate": 4.171058160110754e-06, "loss": 0.6067, "num_input_tokens_seen": 140443280, "step": 65140 }, { "epoch": 11.955404661405762, "grad_norm": 0.10899808257818222, "learning_rate": 4.170268495456018e-06, "loss": 0.162, "num_input_tokens_seen": 140454576, "step": 65145 }, { "epoch": 11.956322260965315, "grad_norm": 0.2716793715953827, "learning_rate": 4.169478852082611e-06, "loss": 0.2615, "num_input_tokens_seen": 140465616, "step": 65150 }, { "epoch": 11.957239860524867, "grad_norm": 22.655271530151367, "learning_rate": 4.168689230010783e-06, "loss": 0.3564, "num_input_tokens_seen": 140478704, "step": 65155 }, { "epoch": 11.958157460084418, "grad_norm": 38.32768630981445, "learning_rate": 4.167899629260788e-06, "loss": 0.0859, "num_input_tokens_seen": 140489296, "step": 65160 }, { "epoch": 11.959075059643972, "grad_norm": 224.30563354492188, "learning_rate": 4.167110049852878e-06, "loss": 0.1655, "num_input_tokens_seen": 140499536, "step": 65165 }, { "epoch": 11.959992659203524, "grad_norm": 0.24125036597251892, "learning_rate": 4.166320491807303e-06, "loss": 0.3398, "num_input_tokens_seen": 140510800, "step": 65170 }, { "epoch": 11.960910258763075, "grad_norm": 44.74311828613281, "learning_rate": 4.1655309551443165e-06, "loss": 0.1259, "num_input_tokens_seen": 140520912, "step": 65175 }, { "epoch": 11.961827858322629, "grad_norm": 3.1971850395202637, "learning_rate": 4.164741439884168e-06, "loss": 0.0267, "num_input_tokens_seen": 140531760, "step": 65180 }, { "epoch": 11.96274545788218, "grad_norm": 49.108787536621094, "learning_rate": 4.163951946047107e-06, "loss": 0.2209, "num_input_tokens_seen": 140542128, "step": 65185 }, { "epoch": 11.963663057441732, "grad_norm": 107.5323715209961, "learning_rate": 4.163162473653381e-06, "loss": 0.3667, "num_input_tokens_seen": 140553136, "step": 65190 }, { "epoch": 11.964580657001285, "grad_norm": 30.30510711669922, "learning_rate": 4.162373022723242e-06, "loss": 0.5557, "num_input_tokens_seen": 140562640, "step": 65195 }, { "epoch": 11.965498256560837, "grad_norm": 2.686046600341797, "learning_rate": 4.161583593276938e-06, "loss": 0.0619, "num_input_tokens_seen": 140573840, "step": 65200 }, { "epoch": 11.966415856120388, "grad_norm": 0.2015727162361145, "learning_rate": 4.160794185334715e-06, "loss": 0.0163, "num_input_tokens_seen": 140585264, "step": 65205 }, { "epoch": 11.967333455679942, "grad_norm": 85.14368438720703, "learning_rate": 4.160004798916817e-06, "loss": 0.0157, "num_input_tokens_seen": 140595696, "step": 65210 }, { "epoch": 11.968251055239493, "grad_norm": 1.1830328702926636, "learning_rate": 4.1592154340434975e-06, "loss": 0.1802, "num_input_tokens_seen": 140607568, "step": 65215 }, { "epoch": 11.969168654799045, "grad_norm": 19.1231632232666, "learning_rate": 4.158426090734999e-06, "loss": 0.2148, "num_input_tokens_seen": 140617584, "step": 65220 }, { "epoch": 11.970086254358598, "grad_norm": 1.0668630599975586, "learning_rate": 4.157636769011564e-06, "loss": 0.0034, "num_input_tokens_seen": 140628752, "step": 65225 }, { "epoch": 11.97100385391815, "grad_norm": 0.02786237932741642, "learning_rate": 4.156847468893443e-06, "loss": 0.4787, "num_input_tokens_seen": 140639600, "step": 65230 }, { "epoch": 11.971921453477702, "grad_norm": 33.29425811767578, "learning_rate": 4.156058190400878e-06, "loss": 0.2206, "num_input_tokens_seen": 140650640, "step": 65235 }, { "epoch": 11.972839053037255, "grad_norm": 2.438814640045166, "learning_rate": 4.15526893355411e-06, "loss": 0.0025, "num_input_tokens_seen": 140660784, "step": 65240 }, { "epoch": 11.973756652596807, "grad_norm": 10.861552238464355, "learning_rate": 4.154479698373386e-06, "loss": 0.2165, "num_input_tokens_seen": 140672112, "step": 65245 }, { "epoch": 11.974674252156358, "grad_norm": 5.766462326049805, "learning_rate": 4.153690484878949e-06, "loss": 0.0029, "num_input_tokens_seen": 140683056, "step": 65250 }, { "epoch": 11.975591851715912, "grad_norm": 42.747798919677734, "learning_rate": 4.152901293091038e-06, "loss": 0.2821, "num_input_tokens_seen": 140694736, "step": 65255 }, { "epoch": 11.976509451275463, "grad_norm": 35.21052932739258, "learning_rate": 4.152112123029896e-06, "loss": 0.1401, "num_input_tokens_seen": 140705008, "step": 65260 }, { "epoch": 11.977427050835015, "grad_norm": 35.38408279418945, "learning_rate": 4.151322974715763e-06, "loss": 0.1176, "num_input_tokens_seen": 140716400, "step": 65265 }, { "epoch": 11.978344650394568, "grad_norm": 0.09835958480834961, "learning_rate": 4.150533848168881e-06, "loss": 0.0033, "num_input_tokens_seen": 140727760, "step": 65270 }, { "epoch": 11.97926224995412, "grad_norm": 16.553171157836914, "learning_rate": 4.149744743409489e-06, "loss": 0.1486, "num_input_tokens_seen": 140738800, "step": 65275 }, { "epoch": 11.980179849513672, "grad_norm": 92.18096160888672, "learning_rate": 4.148955660457827e-06, "loss": 0.5128, "num_input_tokens_seen": 140749744, "step": 65280 }, { "epoch": 11.981097449073225, "grad_norm": 0.03316207975149155, "learning_rate": 4.148166599334135e-06, "loss": 0.0043, "num_input_tokens_seen": 140760624, "step": 65285 }, { "epoch": 11.982015048632777, "grad_norm": 36.996116638183594, "learning_rate": 4.147377560058645e-06, "loss": 0.2706, "num_input_tokens_seen": 140772496, "step": 65290 }, { "epoch": 11.982932648192328, "grad_norm": 0.5667874217033386, "learning_rate": 4.1465885426516025e-06, "loss": 0.0005, "num_input_tokens_seen": 140783216, "step": 65295 }, { "epoch": 11.983850247751882, "grad_norm": 0.49065420031547546, "learning_rate": 4.145799547133242e-06, "loss": 0.1549, "num_input_tokens_seen": 140794224, "step": 65300 }, { "epoch": 11.984767847311433, "grad_norm": 13.765572547912598, "learning_rate": 4.145010573523798e-06, "loss": 0.279, "num_input_tokens_seen": 140804848, "step": 65305 }, { "epoch": 11.985685446870985, "grad_norm": 7.947258949279785, "learning_rate": 4.144221621843506e-06, "loss": 0.3135, "num_input_tokens_seen": 140815792, "step": 65310 }, { "epoch": 11.986603046430538, "grad_norm": 0.056421179324388504, "learning_rate": 4.143432692112605e-06, "loss": 0.119, "num_input_tokens_seen": 140827728, "step": 65315 }, { "epoch": 11.98752064599009, "grad_norm": 0.2889289855957031, "learning_rate": 4.142643784351328e-06, "loss": 0.0024, "num_input_tokens_seen": 140837968, "step": 65320 }, { "epoch": 11.988438245549641, "grad_norm": 0.6215763092041016, "learning_rate": 4.141854898579907e-06, "loss": 0.2141, "num_input_tokens_seen": 140848464, "step": 65325 }, { "epoch": 11.989355845109195, "grad_norm": 1.8186366558074951, "learning_rate": 4.14106603481858e-06, "loss": 0.2253, "num_input_tokens_seen": 140860144, "step": 65330 }, { "epoch": 11.990273444668746, "grad_norm": 0.0767178013920784, "learning_rate": 4.140277193087579e-06, "loss": 0.009, "num_input_tokens_seen": 140870384, "step": 65335 }, { "epoch": 11.991191044228298, "grad_norm": 0.7470082640647888, "learning_rate": 4.139488373407133e-06, "loss": 0.3209, "num_input_tokens_seen": 140881392, "step": 65340 }, { "epoch": 11.992108643787851, "grad_norm": 0.3205488324165344, "learning_rate": 4.138699575797477e-06, "loss": 0.4431, "num_input_tokens_seen": 140890832, "step": 65345 }, { "epoch": 11.993026243347403, "grad_norm": 58.0802116394043, "learning_rate": 4.137910800278842e-06, "loss": 0.1953, "num_input_tokens_seen": 140901968, "step": 65350 }, { "epoch": 11.993943842906955, "grad_norm": 0.04361478239297867, "learning_rate": 4.137122046871461e-06, "loss": 0.2263, "num_input_tokens_seen": 140912208, "step": 65355 }, { "epoch": 11.994861442466508, "grad_norm": 0.10001377016305923, "learning_rate": 4.13633331559556e-06, "loss": 0.0007, "num_input_tokens_seen": 140922416, "step": 65360 }, { "epoch": 11.99577904202606, "grad_norm": 0.04641168564558029, "learning_rate": 4.13554460647137e-06, "loss": 0.1238, "num_input_tokens_seen": 140933520, "step": 65365 }, { "epoch": 11.996696641585611, "grad_norm": 0.22581499814987183, "learning_rate": 4.134755919519123e-06, "loss": 0.1839, "num_input_tokens_seen": 140944656, "step": 65370 }, { "epoch": 11.997614241145165, "grad_norm": 0.006385270971804857, "learning_rate": 4.1339672547590454e-06, "loss": 0.0948, "num_input_tokens_seen": 140955760, "step": 65375 }, { "epoch": 11.998531840704716, "grad_norm": 9.903300285339355, "learning_rate": 4.133178612211366e-06, "loss": 0.1517, "num_input_tokens_seen": 140966288, "step": 65380 }, { "epoch": 11.999449440264268, "grad_norm": 1.27226722240448, "learning_rate": 4.132389991896311e-06, "loss": 0.1463, "num_input_tokens_seen": 140976400, "step": 65385 }, { "epoch": 12.0, "eval_loss": 0.6823185086250305, "eval_runtime": 179.2069, "eval_samples_per_second": 30.406, "eval_steps_per_second": 7.606, "num_input_tokens_seen": 140981152, "step": 65388 }, { "epoch": 12.000367039823821, "grad_norm": 48.51678466796875, "learning_rate": 4.131601393834108e-06, "loss": 0.0702, "num_input_tokens_seen": 140985888, "step": 65390 }, { "epoch": 12.001284639383373, "grad_norm": 1.495759129524231, "learning_rate": 4.1308128180449826e-06, "loss": 0.1015, "num_input_tokens_seen": 140996096, "step": 65395 }, { "epoch": 12.002202238942925, "grad_norm": 53.62199020385742, "learning_rate": 4.130024264549162e-06, "loss": 0.1075, "num_input_tokens_seen": 141006336, "step": 65400 }, { "epoch": 12.003119838502478, "grad_norm": 15.117242813110352, "learning_rate": 4.129235733366872e-06, "loss": 0.0056, "num_input_tokens_seen": 141017280, "step": 65405 }, { "epoch": 12.00403743806203, "grad_norm": 13.7648344039917, "learning_rate": 4.128447224518333e-06, "loss": 0.3552, "num_input_tokens_seen": 141027744, "step": 65410 }, { "epoch": 12.004955037621581, "grad_norm": 0.2716011106967926, "learning_rate": 4.127658738023774e-06, "loss": 0.0025, "num_input_tokens_seen": 141038016, "step": 65415 }, { "epoch": 12.005872637181135, "grad_norm": 26.34610366821289, "learning_rate": 4.126870273903416e-06, "loss": 0.1891, "num_input_tokens_seen": 141049440, "step": 65420 }, { "epoch": 12.006790236740686, "grad_norm": 0.36067599058151245, "learning_rate": 4.126081832177481e-06, "loss": 0.0008, "num_input_tokens_seen": 141059776, "step": 65425 }, { "epoch": 12.007707836300238, "grad_norm": 0.18162336945533752, "learning_rate": 4.125293412866194e-06, "loss": 0.0252, "num_input_tokens_seen": 141069696, "step": 65430 }, { "epoch": 12.008625435859791, "grad_norm": 0.008020095527172089, "learning_rate": 4.124505015989775e-06, "loss": 0.0006, "num_input_tokens_seen": 141081440, "step": 65435 }, { "epoch": 12.009543035419343, "grad_norm": 0.08574366569519043, "learning_rate": 4.123716641568447e-06, "loss": 0.0009, "num_input_tokens_seen": 141092736, "step": 65440 }, { "epoch": 12.010460634978894, "grad_norm": 9.468419075012207, "learning_rate": 4.122928289622426e-06, "loss": 0.1856, "num_input_tokens_seen": 141103520, "step": 65445 }, { "epoch": 12.011378234538448, "grad_norm": 61.694252014160156, "learning_rate": 4.122139960171937e-06, "loss": 0.3854, "num_input_tokens_seen": 141113792, "step": 65450 }, { "epoch": 12.012295834098, "grad_norm": 0.14535608887672424, "learning_rate": 4.121351653237197e-06, "loss": 0.1228, "num_input_tokens_seen": 141124864, "step": 65455 }, { "epoch": 12.013213433657551, "grad_norm": 165.12008666992188, "learning_rate": 4.120563368838425e-06, "loss": 0.0596, "num_input_tokens_seen": 141135744, "step": 65460 }, { "epoch": 12.014131033217105, "grad_norm": 0.07602175325155258, "learning_rate": 4.119775106995839e-06, "loss": 0.1082, "num_input_tokens_seen": 141145088, "step": 65465 }, { "epoch": 12.015048632776656, "grad_norm": 0.44987720251083374, "learning_rate": 4.1189868677296585e-06, "loss": 0.0792, "num_input_tokens_seen": 141156640, "step": 65470 }, { "epoch": 12.015966232336208, "grad_norm": 0.16464895009994507, "learning_rate": 4.1181986510601e-06, "loss": 0.0088, "num_input_tokens_seen": 141165856, "step": 65475 }, { "epoch": 12.016883831895761, "grad_norm": 55.106956481933594, "learning_rate": 4.1174104570073775e-06, "loss": 0.03, "num_input_tokens_seen": 141178560, "step": 65480 }, { "epoch": 12.017801431455313, "grad_norm": 10.972777366638184, "learning_rate": 4.11662228559171e-06, "loss": 0.2023, "num_input_tokens_seen": 141188832, "step": 65485 }, { "epoch": 12.018719031014864, "grad_norm": 0.05706671252846718, "learning_rate": 4.115834136833312e-06, "loss": 0.1309, "num_input_tokens_seen": 141200128, "step": 65490 }, { "epoch": 12.019636630574418, "grad_norm": 2.0414938926696777, "learning_rate": 4.115046010752397e-06, "loss": 0.1443, "num_input_tokens_seen": 141212032, "step": 65495 }, { "epoch": 12.02055423013397, "grad_norm": 23.372133255004883, "learning_rate": 4.1142579073691815e-06, "loss": 0.1191, "num_input_tokens_seen": 141222624, "step": 65500 }, { "epoch": 12.021471829693521, "grad_norm": 0.005627406295388937, "learning_rate": 4.113469826703878e-06, "loss": 0.1856, "num_input_tokens_seen": 141233472, "step": 65505 }, { "epoch": 12.022389429253074, "grad_norm": 0.054214585572481155, "learning_rate": 4.1126817687766966e-06, "loss": 0.0688, "num_input_tokens_seen": 141242880, "step": 65510 }, { "epoch": 12.023307028812626, "grad_norm": 0.8605528473854065, "learning_rate": 4.111893733607856e-06, "loss": 0.0009, "num_input_tokens_seen": 141255328, "step": 65515 }, { "epoch": 12.024224628372178, "grad_norm": 0.794642984867096, "learning_rate": 4.111105721217563e-06, "loss": 0.2041, "num_input_tokens_seen": 141265856, "step": 65520 }, { "epoch": 12.025142227931731, "grad_norm": 5.53322696685791, "learning_rate": 4.110317731626032e-06, "loss": 0.2047, "num_input_tokens_seen": 141277152, "step": 65525 }, { "epoch": 12.026059827491283, "grad_norm": 0.17938487231731415, "learning_rate": 4.10952976485347e-06, "loss": 0.0337, "num_input_tokens_seen": 141287424, "step": 65530 }, { "epoch": 12.026977427050834, "grad_norm": 17.442659378051758, "learning_rate": 4.108741820920091e-06, "loss": 0.1245, "num_input_tokens_seen": 141297024, "step": 65535 }, { "epoch": 12.027895026610388, "grad_norm": 0.30336320400238037, "learning_rate": 4.107953899846103e-06, "loss": 0.1114, "num_input_tokens_seen": 141307168, "step": 65540 }, { "epoch": 12.02881262616994, "grad_norm": 0.026656607165932655, "learning_rate": 4.107166001651713e-06, "loss": 0.0018, "num_input_tokens_seen": 141318816, "step": 65545 }, { "epoch": 12.029730225729491, "grad_norm": 0.21421948075294495, "learning_rate": 4.106378126357134e-06, "loss": 0.0041, "num_input_tokens_seen": 141328000, "step": 65550 }, { "epoch": 12.030647825289044, "grad_norm": 0.3093150854110718, "learning_rate": 4.10559027398257e-06, "loss": 0.1233, "num_input_tokens_seen": 141338464, "step": 65555 }, { "epoch": 12.031565424848596, "grad_norm": 0.26968201994895935, "learning_rate": 4.1048024445482286e-06, "loss": 0.2782, "num_input_tokens_seen": 141348960, "step": 65560 }, { "epoch": 12.032483024408148, "grad_norm": 0.25252631306648254, "learning_rate": 4.104014638074319e-06, "loss": 0.1107, "num_input_tokens_seen": 141359648, "step": 65565 }, { "epoch": 12.033400623967701, "grad_norm": 0.06225820630788803, "learning_rate": 4.103226854581044e-06, "loss": 0.2973, "num_input_tokens_seen": 141369856, "step": 65570 }, { "epoch": 12.034318223527253, "grad_norm": 18.90713119506836, "learning_rate": 4.10243909408861e-06, "loss": 0.1604, "num_input_tokens_seen": 141380128, "step": 65575 }, { "epoch": 12.035235823086804, "grad_norm": 0.7317183017730713, "learning_rate": 4.101651356617223e-06, "loss": 0.2714, "num_input_tokens_seen": 141391648, "step": 65580 }, { "epoch": 12.036153422646358, "grad_norm": 5.998387336730957, "learning_rate": 4.100863642187085e-06, "loss": 0.002, "num_input_tokens_seen": 141401280, "step": 65585 }, { "epoch": 12.03707102220591, "grad_norm": 17.844018936157227, "learning_rate": 4.1000759508184025e-06, "loss": 0.1355, "num_input_tokens_seen": 141413056, "step": 65590 }, { "epoch": 12.03798862176546, "grad_norm": 0.4821203947067261, "learning_rate": 4.099288282531376e-06, "loss": 0.0013, "num_input_tokens_seen": 141424288, "step": 65595 }, { "epoch": 12.038906221325014, "grad_norm": 0.9234987497329712, "learning_rate": 4.09850063734621e-06, "loss": 0.0068, "num_input_tokens_seen": 141435104, "step": 65600 }, { "epoch": 12.039823820884566, "grad_norm": 70.23947143554688, "learning_rate": 4.0977130152831056e-06, "loss": 0.3376, "num_input_tokens_seen": 141445888, "step": 65605 }, { "epoch": 12.040741420444117, "grad_norm": 0.0316779650747776, "learning_rate": 4.096925416362264e-06, "loss": 0.3008, "num_input_tokens_seen": 141456416, "step": 65610 }, { "epoch": 12.04165902000367, "grad_norm": 0.22595147788524628, "learning_rate": 4.096137840603883e-06, "loss": 0.2082, "num_input_tokens_seen": 141467040, "step": 65615 }, { "epoch": 12.042576619563222, "grad_norm": 0.3201563358306885, "learning_rate": 4.09535028802817e-06, "loss": 0.0659, "num_input_tokens_seen": 141479840, "step": 65620 }, { "epoch": 12.043494219122774, "grad_norm": 0.22323566675186157, "learning_rate": 4.0945627586553176e-06, "loss": 0.2506, "num_input_tokens_seen": 141490624, "step": 65625 }, { "epoch": 12.044411818682327, "grad_norm": 12.333417892456055, "learning_rate": 4.0937752525055255e-06, "loss": 0.0092, "num_input_tokens_seen": 141502016, "step": 65630 }, { "epoch": 12.045329418241879, "grad_norm": 0.15384630858898163, "learning_rate": 4.092987769598996e-06, "loss": 0.0014, "num_input_tokens_seen": 141513600, "step": 65635 }, { "epoch": 12.04624701780143, "grad_norm": 0.09741385281085968, "learning_rate": 4.092200309955925e-06, "loss": 0.1128, "num_input_tokens_seen": 141522624, "step": 65640 }, { "epoch": 12.047164617360984, "grad_norm": 309.6442565917969, "learning_rate": 4.091412873596507e-06, "loss": 0.0824, "num_input_tokens_seen": 141534240, "step": 65645 }, { "epoch": 12.048082216920536, "grad_norm": 198.88766479492188, "learning_rate": 4.090625460540941e-06, "loss": 0.5197, "num_input_tokens_seen": 141544800, "step": 65650 }, { "epoch": 12.048999816480087, "grad_norm": 0.4647517800331116, "learning_rate": 4.089838070809424e-06, "loss": 0.0432, "num_input_tokens_seen": 141556096, "step": 65655 }, { "epoch": 12.04991741603964, "grad_norm": 0.046172842383384705, "learning_rate": 4.08905070442215e-06, "loss": 0.0061, "num_input_tokens_seen": 141566464, "step": 65660 }, { "epoch": 12.050835015599192, "grad_norm": 157.53463745117188, "learning_rate": 4.088263361399311e-06, "loss": 0.1568, "num_input_tokens_seen": 141577792, "step": 65665 }, { "epoch": 12.051752615158744, "grad_norm": 0.07234463095664978, "learning_rate": 4.087476041761106e-06, "loss": 0.1744, "num_input_tokens_seen": 141589024, "step": 65670 }, { "epoch": 12.052670214718297, "grad_norm": 0.05303151160478592, "learning_rate": 4.086688745527726e-06, "loss": 0.125, "num_input_tokens_seen": 141601024, "step": 65675 }, { "epoch": 12.053587814277849, "grad_norm": 0.0973137840628624, "learning_rate": 4.0859014727193634e-06, "loss": 0.2201, "num_input_tokens_seen": 141611296, "step": 65680 }, { "epoch": 12.0545054138374, "grad_norm": 0.02183501236140728, "learning_rate": 4.085114223356211e-06, "loss": 0.0006, "num_input_tokens_seen": 141622720, "step": 65685 }, { "epoch": 12.055423013396954, "grad_norm": 76.45506286621094, "learning_rate": 4.084326997458462e-06, "loss": 0.2234, "num_input_tokens_seen": 141632096, "step": 65690 }, { "epoch": 12.056340612956506, "grad_norm": 0.03397203981876373, "learning_rate": 4.083539795046305e-06, "loss": 0.2222, "num_input_tokens_seen": 141643648, "step": 65695 }, { "epoch": 12.057258212516057, "grad_norm": 0.025881364941596985, "learning_rate": 4.082752616139934e-06, "loss": 0.2857, "num_input_tokens_seen": 141653664, "step": 65700 }, { "epoch": 12.05817581207561, "grad_norm": 0.4075254201889038, "learning_rate": 4.081965460759536e-06, "loss": 0.2737, "num_input_tokens_seen": 141664864, "step": 65705 }, { "epoch": 12.059093411635162, "grad_norm": 0.35884377360343933, "learning_rate": 4.0811783289253e-06, "loss": 0.2025, "num_input_tokens_seen": 141675584, "step": 65710 }, { "epoch": 12.060011011194714, "grad_norm": 0.18737681210041046, "learning_rate": 4.080391220657416e-06, "loss": 0.0027, "num_input_tokens_seen": 141686336, "step": 65715 }, { "epoch": 12.060928610754267, "grad_norm": 18.416685104370117, "learning_rate": 4.079604135976073e-06, "loss": 0.2175, "num_input_tokens_seen": 141698048, "step": 65720 }, { "epoch": 12.061846210313819, "grad_norm": 44.92039489746094, "learning_rate": 4.078817074901457e-06, "loss": 0.247, "num_input_tokens_seen": 141709568, "step": 65725 }, { "epoch": 12.06276380987337, "grad_norm": 58.938499450683594, "learning_rate": 4.078030037453753e-06, "loss": 0.0576, "num_input_tokens_seen": 141721312, "step": 65730 }, { "epoch": 12.063681409432924, "grad_norm": 49.172393798828125, "learning_rate": 4.077243023653153e-06, "loss": 0.1989, "num_input_tokens_seen": 141732992, "step": 65735 }, { "epoch": 12.064599008992476, "grad_norm": 25.82725715637207, "learning_rate": 4.076456033519839e-06, "loss": 0.4871, "num_input_tokens_seen": 141742336, "step": 65740 }, { "epoch": 12.065516608552027, "grad_norm": 0.7269449234008789, "learning_rate": 4.075669067073994e-06, "loss": 0.112, "num_input_tokens_seen": 141752896, "step": 65745 }, { "epoch": 12.06643420811158, "grad_norm": 0.06693019717931747, "learning_rate": 4.074882124335806e-06, "loss": 0.09, "num_input_tokens_seen": 141763424, "step": 65750 }, { "epoch": 12.067351807671132, "grad_norm": 10.466327667236328, "learning_rate": 4.074095205325459e-06, "loss": 0.0104, "num_input_tokens_seen": 141775072, "step": 65755 }, { "epoch": 12.068269407230684, "grad_norm": 0.12905184924602509, "learning_rate": 4.073308310063134e-06, "loss": 0.004, "num_input_tokens_seen": 141786080, "step": 65760 }, { "epoch": 12.069187006790237, "grad_norm": 0.08287785202264786, "learning_rate": 4.0725214385690135e-06, "loss": 0.2381, "num_input_tokens_seen": 141798144, "step": 65765 }, { "epoch": 12.070104606349789, "grad_norm": 0.11370380967855453, "learning_rate": 4.071734590863282e-06, "loss": 0.0006, "num_input_tokens_seen": 141809536, "step": 65770 }, { "epoch": 12.07102220590934, "grad_norm": 0.12297119200229645, "learning_rate": 4.07094776696612e-06, "loss": 0.1075, "num_input_tokens_seen": 141821152, "step": 65775 }, { "epoch": 12.071939805468894, "grad_norm": 0.2655324339866638, "learning_rate": 4.070160966897705e-06, "loss": 0.0103, "num_input_tokens_seen": 141830944, "step": 65780 }, { "epoch": 12.072857405028445, "grad_norm": 0.8677747845649719, "learning_rate": 4.069374190678223e-06, "loss": 0.0045, "num_input_tokens_seen": 141842272, "step": 65785 }, { "epoch": 12.073775004587997, "grad_norm": 0.40243956446647644, "learning_rate": 4.06858743832785e-06, "loss": 0.1322, "num_input_tokens_seen": 141853952, "step": 65790 }, { "epoch": 12.07469260414755, "grad_norm": 32.04180145263672, "learning_rate": 4.0678007098667665e-06, "loss": 0.1574, "num_input_tokens_seen": 141864352, "step": 65795 }, { "epoch": 12.075610203707102, "grad_norm": 1.3943099975585938, "learning_rate": 4.067014005315149e-06, "loss": 0.1055, "num_input_tokens_seen": 141874944, "step": 65800 }, { "epoch": 12.076527803266654, "grad_norm": 0.017177224159240723, "learning_rate": 4.066227324693176e-06, "loss": 0.002, "num_input_tokens_seen": 141886400, "step": 65805 }, { "epoch": 12.077445402826207, "grad_norm": 90.64971923828125, "learning_rate": 4.065440668021025e-06, "loss": 0.197, "num_input_tokens_seen": 141896352, "step": 65810 }, { "epoch": 12.078363002385759, "grad_norm": 56.71798324584961, "learning_rate": 4.064654035318872e-06, "loss": 0.1847, "num_input_tokens_seen": 141906848, "step": 65815 }, { "epoch": 12.07928060194531, "grad_norm": 108.83287048339844, "learning_rate": 4.063867426606894e-06, "loss": 0.1703, "num_input_tokens_seen": 141917280, "step": 65820 }, { "epoch": 12.080198201504864, "grad_norm": 69.6761474609375, "learning_rate": 4.063080841905267e-06, "loss": 0.0628, "num_input_tokens_seen": 141928224, "step": 65825 }, { "epoch": 12.081115801064415, "grad_norm": 0.03212033212184906, "learning_rate": 4.0622942812341605e-06, "loss": 0.1352, "num_input_tokens_seen": 141938336, "step": 65830 }, { "epoch": 12.082033400623967, "grad_norm": 0.031163858249783516, "learning_rate": 4.061507744613756e-06, "loss": 0.1302, "num_input_tokens_seen": 141949184, "step": 65835 }, { "epoch": 12.08295100018352, "grad_norm": 1.0595413446426392, "learning_rate": 4.060721232064223e-06, "loss": 0.1674, "num_input_tokens_seen": 141959616, "step": 65840 }, { "epoch": 12.083868599743072, "grad_norm": 0.07490281760692596, "learning_rate": 4.059934743605734e-06, "loss": 0.3004, "num_input_tokens_seen": 141970528, "step": 65845 }, { "epoch": 12.084786199302624, "grad_norm": 0.14626175165176392, "learning_rate": 4.05914827925846e-06, "loss": 0.0884, "num_input_tokens_seen": 141980800, "step": 65850 }, { "epoch": 12.085703798862177, "grad_norm": 0.09003964811563492, "learning_rate": 4.058361839042576e-06, "loss": 0.0646, "num_input_tokens_seen": 141992896, "step": 65855 }, { "epoch": 12.086621398421729, "grad_norm": 0.35638800263404846, "learning_rate": 4.057575422978253e-06, "loss": 0.0012, "num_input_tokens_seen": 142004352, "step": 65860 }, { "epoch": 12.08753899798128, "grad_norm": 2.1728146076202393, "learning_rate": 4.056789031085656e-06, "loss": 0.0026, "num_input_tokens_seen": 142014720, "step": 65865 }, { "epoch": 12.088456597540834, "grad_norm": 0.05040794610977173, "learning_rate": 4.056002663384961e-06, "loss": 0.119, "num_input_tokens_seen": 142024448, "step": 65870 }, { "epoch": 12.089374197100385, "grad_norm": 54.36268615722656, "learning_rate": 4.0552163198963355e-06, "loss": 0.1885, "num_input_tokens_seen": 142034624, "step": 65875 }, { "epoch": 12.090291796659937, "grad_norm": 0.07300924509763718, "learning_rate": 4.0544300006399445e-06, "loss": 0.1475, "num_input_tokens_seen": 142045824, "step": 65880 }, { "epoch": 12.09120939621949, "grad_norm": 35.623634338378906, "learning_rate": 4.053643705635961e-06, "loss": 0.0934, "num_input_tokens_seen": 142055552, "step": 65885 }, { "epoch": 12.092126995779042, "grad_norm": 0.03313654288649559, "learning_rate": 4.052857434904549e-06, "loss": 0.0032, "num_input_tokens_seen": 142065440, "step": 65890 }, { "epoch": 12.093044595338593, "grad_norm": 0.03304493427276611, "learning_rate": 4.0520711884658755e-06, "loss": 0.1535, "num_input_tokens_seen": 142077600, "step": 65895 }, { "epoch": 12.093962194898147, "grad_norm": 30.881317138671875, "learning_rate": 4.051284966340107e-06, "loss": 0.1137, "num_input_tokens_seen": 142088128, "step": 65900 }, { "epoch": 12.094879794457698, "grad_norm": 32.69761276245117, "learning_rate": 4.050498768547408e-06, "loss": 0.2418, "num_input_tokens_seen": 142098144, "step": 65905 }, { "epoch": 12.09579739401725, "grad_norm": 0.03538275137543678, "learning_rate": 4.0497125951079455e-06, "loss": 0.0056, "num_input_tokens_seen": 142107808, "step": 65910 }, { "epoch": 12.096714993576803, "grad_norm": 18.514799118041992, "learning_rate": 4.048926446041881e-06, "loss": 0.1678, "num_input_tokens_seen": 142118784, "step": 65915 }, { "epoch": 12.097632593136355, "grad_norm": 23.3232421875, "learning_rate": 4.0481403213693795e-06, "loss": 0.1054, "num_input_tokens_seen": 142128576, "step": 65920 }, { "epoch": 12.098550192695907, "grad_norm": 0.038838475942611694, "learning_rate": 4.047354221110604e-06, "loss": 0.1287, "num_input_tokens_seen": 142138816, "step": 65925 }, { "epoch": 12.09946779225546, "grad_norm": 2.0997278690338135, "learning_rate": 4.0465681452857155e-06, "loss": 0.0012, "num_input_tokens_seen": 142149856, "step": 65930 }, { "epoch": 12.100385391815012, "grad_norm": 42.572879791259766, "learning_rate": 4.045782093914876e-06, "loss": 0.3908, "num_input_tokens_seen": 142159104, "step": 65935 }, { "epoch": 12.101302991374563, "grad_norm": 41.02971267700195, "learning_rate": 4.044996067018247e-06, "loss": 0.0491, "num_input_tokens_seen": 142168544, "step": 65940 }, { "epoch": 12.102220590934117, "grad_norm": 24.324846267700195, "learning_rate": 4.0442100646159906e-06, "loss": 0.1255, "num_input_tokens_seen": 142179456, "step": 65945 }, { "epoch": 12.103138190493668, "grad_norm": 0.12706193327903748, "learning_rate": 4.043424086728262e-06, "loss": 0.0182, "num_input_tokens_seen": 142190752, "step": 65950 }, { "epoch": 12.10405579005322, "grad_norm": 29.069883346557617, "learning_rate": 4.042638133375225e-06, "loss": 0.3524, "num_input_tokens_seen": 142201280, "step": 65955 }, { "epoch": 12.104973389612773, "grad_norm": 3.5151169300079346, "learning_rate": 4.041852204577035e-06, "loss": 0.1541, "num_input_tokens_seen": 142211552, "step": 65960 }, { "epoch": 12.105890989172325, "grad_norm": 113.24441528320312, "learning_rate": 4.04106630035385e-06, "loss": 0.1155, "num_input_tokens_seen": 142221280, "step": 65965 }, { "epoch": 12.106808588731877, "grad_norm": 5.178432941436768, "learning_rate": 4.040280420725831e-06, "loss": 0.1948, "num_input_tokens_seen": 142232832, "step": 65970 }, { "epoch": 12.10772618829143, "grad_norm": 3.3582170009613037, "learning_rate": 4.03949456571313e-06, "loss": 0.0857, "num_input_tokens_seen": 142243904, "step": 65975 }, { "epoch": 12.108643787850982, "grad_norm": 6.70267391204834, "learning_rate": 4.038708735335906e-06, "loss": 0.0073, "num_input_tokens_seen": 142255040, "step": 65980 }, { "epoch": 12.109561387410533, "grad_norm": 0.006774044595658779, "learning_rate": 4.037922929614311e-06, "loss": 0.0015, "num_input_tokens_seen": 142264928, "step": 65985 }, { "epoch": 12.110478986970087, "grad_norm": 0.07993657141923904, "learning_rate": 4.037137148568503e-06, "loss": 0.0026, "num_input_tokens_seen": 142275872, "step": 65990 }, { "epoch": 12.111396586529638, "grad_norm": 0.1731419861316681, "learning_rate": 4.036351392218635e-06, "loss": 0.0007, "num_input_tokens_seen": 142285280, "step": 65995 }, { "epoch": 12.11231418608919, "grad_norm": 0.9734752178192139, "learning_rate": 4.03556566058486e-06, "loss": 0.0011, "num_input_tokens_seen": 142294528, "step": 66000 }, { "epoch": 12.113231785648743, "grad_norm": 6.481621265411377, "learning_rate": 4.03477995368733e-06, "loss": 0.1702, "num_input_tokens_seen": 142305888, "step": 66005 }, { "epoch": 12.114149385208295, "grad_norm": 0.05125067010521889, "learning_rate": 4.033994271546201e-06, "loss": 0.001, "num_input_tokens_seen": 142317280, "step": 66010 }, { "epoch": 12.115066984767846, "grad_norm": 0.09561631828546524, "learning_rate": 4.033208614181619e-06, "loss": 0.0041, "num_input_tokens_seen": 142327232, "step": 66015 }, { "epoch": 12.1159845843274, "grad_norm": 0.02636437863111496, "learning_rate": 4.03242298161374e-06, "loss": 0.0155, "num_input_tokens_seen": 142336832, "step": 66020 }, { "epoch": 12.116902183886952, "grad_norm": 0.2708764970302582, "learning_rate": 4.031637373862711e-06, "loss": 0.0015, "num_input_tokens_seen": 142347392, "step": 66025 }, { "epoch": 12.117819783446503, "grad_norm": 136.19378662109375, "learning_rate": 4.030851790948681e-06, "loss": 0.3589, "num_input_tokens_seen": 142358496, "step": 66030 }, { "epoch": 12.118737383006057, "grad_norm": 0.0293223038315773, "learning_rate": 4.030066232891801e-06, "loss": 0.1694, "num_input_tokens_seen": 142370240, "step": 66035 }, { "epoch": 12.119654982565608, "grad_norm": 0.48090994358062744, "learning_rate": 4.029280699712221e-06, "loss": 0.3115, "num_input_tokens_seen": 142380832, "step": 66040 }, { "epoch": 12.12057258212516, "grad_norm": 0.013196224346756935, "learning_rate": 4.028495191430085e-06, "loss": 0.1189, "num_input_tokens_seen": 142391328, "step": 66045 }, { "epoch": 12.121490181684713, "grad_norm": 0.07565652579069138, "learning_rate": 4.0277097080655405e-06, "loss": 0.0304, "num_input_tokens_seen": 142402624, "step": 66050 }, { "epoch": 12.122407781244265, "grad_norm": 2.4754245281219482, "learning_rate": 4.026924249638737e-06, "loss": 0.0024, "num_input_tokens_seen": 142413664, "step": 66055 }, { "epoch": 12.123325380803816, "grad_norm": 146.52627563476562, "learning_rate": 4.026138816169819e-06, "loss": 0.0946, "num_input_tokens_seen": 142423616, "step": 66060 }, { "epoch": 12.12424298036337, "grad_norm": 0.02686845324933529, "learning_rate": 4.02535340767893e-06, "loss": 0.0485, "num_input_tokens_seen": 142434176, "step": 66065 }, { "epoch": 12.125160579922921, "grad_norm": 0.0687817633152008, "learning_rate": 4.024568024186213e-06, "loss": 0.0015, "num_input_tokens_seen": 142445216, "step": 66070 }, { "epoch": 12.126078179482473, "grad_norm": 64.46870422363281, "learning_rate": 4.023782665711818e-06, "loss": 0.277, "num_input_tokens_seen": 142456416, "step": 66075 }, { "epoch": 12.126995779042026, "grad_norm": 57.62832260131836, "learning_rate": 4.022997332275883e-06, "loss": 0.1452, "num_input_tokens_seen": 142468000, "step": 66080 }, { "epoch": 12.127913378601578, "grad_norm": 0.04555201157927513, "learning_rate": 4.0222120238985515e-06, "loss": 0.1323, "num_input_tokens_seen": 142478752, "step": 66085 }, { "epoch": 12.12883097816113, "grad_norm": 514.7463989257812, "learning_rate": 4.021426740599967e-06, "loss": 0.1862, "num_input_tokens_seen": 142489728, "step": 66090 }, { "epoch": 12.129748577720683, "grad_norm": 107.86756896972656, "learning_rate": 4.020641482400272e-06, "loss": 0.3266, "num_input_tokens_seen": 142500544, "step": 66095 }, { "epoch": 12.130666177280235, "grad_norm": 0.12368625402450562, "learning_rate": 4.019856249319601e-06, "loss": 0.0009, "num_input_tokens_seen": 142511840, "step": 66100 }, { "epoch": 12.131583776839786, "grad_norm": 54.14623260498047, "learning_rate": 4.019071041378101e-06, "loss": 0.2429, "num_input_tokens_seen": 142522528, "step": 66105 }, { "epoch": 12.13250137639934, "grad_norm": 0.2414887398481369, "learning_rate": 4.018285858595908e-06, "loss": 0.3798, "num_input_tokens_seen": 142532800, "step": 66110 }, { "epoch": 12.133418975958891, "grad_norm": 23.353601455688477, "learning_rate": 4.017500700993162e-06, "loss": 0.3489, "num_input_tokens_seen": 142544160, "step": 66115 }, { "epoch": 12.134336575518443, "grad_norm": 0.09752544015645981, "learning_rate": 4.016715568589999e-06, "loss": 0.1619, "num_input_tokens_seen": 142555456, "step": 66120 }, { "epoch": 12.135254175077996, "grad_norm": 0.05982189252972603, "learning_rate": 4.015930461406557e-06, "loss": 0.0009, "num_input_tokens_seen": 142566656, "step": 66125 }, { "epoch": 12.136171774637548, "grad_norm": 0.05910458788275719, "learning_rate": 4.015145379462976e-06, "loss": 0.1353, "num_input_tokens_seen": 142577632, "step": 66130 }, { "epoch": 12.1370893741971, "grad_norm": 0.11208608001470566, "learning_rate": 4.014360322779387e-06, "loss": 0.0743, "num_input_tokens_seen": 142589088, "step": 66135 }, { "epoch": 12.138006973756653, "grad_norm": 0.0307476706802845, "learning_rate": 4.01357529137593e-06, "loss": 0.3673, "num_input_tokens_seen": 142600352, "step": 66140 }, { "epoch": 12.138924573316205, "grad_norm": 0.49854201078414917, "learning_rate": 4.012790285272738e-06, "loss": 0.2276, "num_input_tokens_seen": 142612672, "step": 66145 }, { "epoch": 12.139842172875756, "grad_norm": 0.04750034958124161, "learning_rate": 4.012005304489943e-06, "loss": 0.0011, "num_input_tokens_seen": 142621632, "step": 66150 }, { "epoch": 12.14075977243531, "grad_norm": 0.013488177210092545, "learning_rate": 4.011220349047683e-06, "loss": 0.1621, "num_input_tokens_seen": 142633504, "step": 66155 }, { "epoch": 12.141677371994861, "grad_norm": 6.507943153381348, "learning_rate": 4.010435418966088e-06, "loss": 0.1088, "num_input_tokens_seen": 142644896, "step": 66160 }, { "epoch": 12.142594971554413, "grad_norm": 0.099006786942482, "learning_rate": 4.0096505142652905e-06, "loss": 0.0026, "num_input_tokens_seen": 142655104, "step": 66165 }, { "epoch": 12.143512571113966, "grad_norm": 0.022725796326994896, "learning_rate": 4.0088656349654205e-06, "loss": 0.0008, "num_input_tokens_seen": 142666336, "step": 66170 }, { "epoch": 12.144430170673518, "grad_norm": 0.1193658858537674, "learning_rate": 4.008080781086614e-06, "loss": 0.1675, "num_input_tokens_seen": 142677216, "step": 66175 }, { "epoch": 12.14534777023307, "grad_norm": 0.07566575706005096, "learning_rate": 4.007295952648998e-06, "loss": 0.0483, "num_input_tokens_seen": 142687072, "step": 66180 }, { "epoch": 12.146265369792623, "grad_norm": 0.9724907875061035, "learning_rate": 4.0065111496726985e-06, "loss": 0.0102, "num_input_tokens_seen": 142698944, "step": 66185 }, { "epoch": 12.147182969352174, "grad_norm": 15.798364639282227, "learning_rate": 4.0057263721778515e-06, "loss": 0.1019, "num_input_tokens_seen": 142710592, "step": 66190 }, { "epoch": 12.148100568911726, "grad_norm": 71.82058715820312, "learning_rate": 4.004941620184582e-06, "loss": 0.0858, "num_input_tokens_seen": 142719936, "step": 66195 }, { "epoch": 12.14901816847128, "grad_norm": 0.21603363752365112, "learning_rate": 4.004156893713016e-06, "loss": 0.0015, "num_input_tokens_seen": 142731296, "step": 66200 }, { "epoch": 12.149935768030831, "grad_norm": 0.1552787572145462, "learning_rate": 4.003372192783284e-06, "loss": 0.006, "num_input_tokens_seen": 142743136, "step": 66205 }, { "epoch": 12.150853367590383, "grad_norm": 37.97982406616211, "learning_rate": 4.00258751741551e-06, "loss": 0.4137, "num_input_tokens_seen": 142754304, "step": 66210 }, { "epoch": 12.151770967149936, "grad_norm": 0.24654221534729004, "learning_rate": 4.001802867629821e-06, "loss": 0.0012, "num_input_tokens_seen": 142764864, "step": 66215 }, { "epoch": 12.152688566709488, "grad_norm": 0.35539862513542175, "learning_rate": 4.00101824344634e-06, "loss": 0.2796, "num_input_tokens_seen": 142775136, "step": 66220 }, { "epoch": 12.15360616626904, "grad_norm": 14.520926475524902, "learning_rate": 4.000233644885193e-06, "loss": 0.2734, "num_input_tokens_seen": 142786208, "step": 66225 }, { "epoch": 12.154523765828593, "grad_norm": 0.4984385371208191, "learning_rate": 3.999449071966505e-06, "loss": 0.0024, "num_input_tokens_seen": 142797408, "step": 66230 }, { "epoch": 12.155441365388144, "grad_norm": 0.8962190747261047, "learning_rate": 3.998664524710396e-06, "loss": 0.0809, "num_input_tokens_seen": 142807904, "step": 66235 }, { "epoch": 12.156358964947696, "grad_norm": 0.10269280523061752, "learning_rate": 3.997880003136991e-06, "loss": 0.0801, "num_input_tokens_seen": 142818784, "step": 66240 }, { "epoch": 12.15727656450725, "grad_norm": 0.1565794199705124, "learning_rate": 3.9970955072664115e-06, "loss": 0.0007, "num_input_tokens_seen": 142830368, "step": 66245 }, { "epoch": 12.158194164066801, "grad_norm": 22.8237247467041, "learning_rate": 3.996311037118776e-06, "loss": 0.395, "num_input_tokens_seen": 142841856, "step": 66250 }, { "epoch": 12.159111763626353, "grad_norm": 35.47058868408203, "learning_rate": 3.995526592714207e-06, "loss": 0.0151, "num_input_tokens_seen": 142853568, "step": 66255 }, { "epoch": 12.160029363185906, "grad_norm": 1.2517211437225342, "learning_rate": 3.994742174072826e-06, "loss": 0.0006, "num_input_tokens_seen": 142864960, "step": 66260 }, { "epoch": 12.160946962745458, "grad_norm": 0.03934541717171669, "learning_rate": 3.993957781214749e-06, "loss": 0.0005, "num_input_tokens_seen": 142875360, "step": 66265 }, { "epoch": 12.16186456230501, "grad_norm": 1.0378484725952148, "learning_rate": 3.993173414160094e-06, "loss": 0.0024, "num_input_tokens_seen": 142886112, "step": 66270 }, { "epoch": 12.162782161864563, "grad_norm": 0.010829520411789417, "learning_rate": 3.992389072928983e-06, "loss": 0.0005, "num_input_tokens_seen": 142897184, "step": 66275 }, { "epoch": 12.163699761424114, "grad_norm": 252.949951171875, "learning_rate": 3.9916047575415304e-06, "loss": 0.1241, "num_input_tokens_seen": 142908224, "step": 66280 }, { "epoch": 12.164617360983666, "grad_norm": 0.04509894177317619, "learning_rate": 3.9908204680178505e-06, "loss": 0.0003, "num_input_tokens_seen": 142918464, "step": 66285 }, { "epoch": 12.16553496054322, "grad_norm": 0.05990088731050491, "learning_rate": 3.990036204378064e-06, "loss": 0.2974, "num_input_tokens_seen": 142929632, "step": 66290 }, { "epoch": 12.16645256010277, "grad_norm": 0.009940873831510544, "learning_rate": 3.989251966642284e-06, "loss": 0.0591, "num_input_tokens_seen": 142941536, "step": 66295 }, { "epoch": 12.167370159662322, "grad_norm": 0.00637377193197608, "learning_rate": 3.988467754830623e-06, "loss": 0.2676, "num_input_tokens_seen": 142951744, "step": 66300 }, { "epoch": 12.168287759221876, "grad_norm": 0.03646136075258255, "learning_rate": 3.9876835689631955e-06, "loss": 0.0593, "num_input_tokens_seen": 142962560, "step": 66305 }, { "epoch": 12.169205358781428, "grad_norm": 0.13971169292926788, "learning_rate": 3.986899409060117e-06, "loss": 0.154, "num_input_tokens_seen": 142973120, "step": 66310 }, { "epoch": 12.17012295834098, "grad_norm": 33.47465133666992, "learning_rate": 3.986115275141499e-06, "loss": 0.0157, "num_input_tokens_seen": 142983264, "step": 66315 }, { "epoch": 12.171040557900533, "grad_norm": 1.5693923234939575, "learning_rate": 3.98533116722745e-06, "loss": 0.0016, "num_input_tokens_seen": 142993632, "step": 66320 }, { "epoch": 12.171958157460084, "grad_norm": 20.44207191467285, "learning_rate": 3.984547085338087e-06, "loss": 0.2036, "num_input_tokens_seen": 143005152, "step": 66325 }, { "epoch": 12.172875757019636, "grad_norm": 0.4380476176738739, "learning_rate": 3.983763029493517e-06, "loss": 0.1469, "num_input_tokens_seen": 143016544, "step": 66330 }, { "epoch": 12.17379335657919, "grad_norm": 0.03692379221320152, "learning_rate": 3.982978999713849e-06, "loss": 0.3033, "num_input_tokens_seen": 143028480, "step": 66335 }, { "epoch": 12.17471095613874, "grad_norm": 0.004239671863615513, "learning_rate": 3.9821949960191944e-06, "loss": 0.1645, "num_input_tokens_seen": 143037728, "step": 66340 }, { "epoch": 12.175628555698292, "grad_norm": 0.03951042890548706, "learning_rate": 3.981411018429661e-06, "loss": 0.0105, "num_input_tokens_seen": 143048832, "step": 66345 }, { "epoch": 12.176546155257846, "grad_norm": 0.1270594298839569, "learning_rate": 3.980627066965356e-06, "loss": 0.1378, "num_input_tokens_seen": 143059552, "step": 66350 }, { "epoch": 12.177463754817397, "grad_norm": 2.7632057666778564, "learning_rate": 3.979843141646385e-06, "loss": 0.1505, "num_input_tokens_seen": 143071616, "step": 66355 }, { "epoch": 12.178381354376949, "grad_norm": 0.32860705256462097, "learning_rate": 3.9790592424928596e-06, "loss": 0.0005, "num_input_tokens_seen": 143082144, "step": 66360 }, { "epoch": 12.179298953936502, "grad_norm": 0.01025676541030407, "learning_rate": 3.978275369524881e-06, "loss": 0.0002, "num_input_tokens_seen": 143091552, "step": 66365 }, { "epoch": 12.180216553496054, "grad_norm": 0.01195339486002922, "learning_rate": 3.977491522762553e-06, "loss": 0.0004, "num_input_tokens_seen": 143102400, "step": 66370 }, { "epoch": 12.181134153055606, "grad_norm": 0.07837377488613129, "learning_rate": 3.976707702225986e-06, "loss": 0.4086, "num_input_tokens_seen": 143114304, "step": 66375 }, { "epoch": 12.182051752615159, "grad_norm": 78.13480377197266, "learning_rate": 3.97592390793528e-06, "loss": 0.2494, "num_input_tokens_seen": 143125120, "step": 66380 }, { "epoch": 12.18296935217471, "grad_norm": 0.9006883502006531, "learning_rate": 3.975140139910538e-06, "loss": 0.2508, "num_input_tokens_seen": 143135744, "step": 66385 }, { "epoch": 12.183886951734262, "grad_norm": 0.017039639875292778, "learning_rate": 3.974356398171862e-06, "loss": 0.0217, "num_input_tokens_seen": 143146336, "step": 66390 }, { "epoch": 12.184804551293816, "grad_norm": 0.12949296832084656, "learning_rate": 3.973572682739356e-06, "loss": 0.0006, "num_input_tokens_seen": 143158176, "step": 66395 }, { "epoch": 12.185722150853367, "grad_norm": 0.07850076258182526, "learning_rate": 3.972788993633121e-06, "loss": 0.003, "num_input_tokens_seen": 143169088, "step": 66400 }, { "epoch": 12.186639750412919, "grad_norm": 0.22673505544662476, "learning_rate": 3.972005330873253e-06, "loss": 0.0268, "num_input_tokens_seen": 143180224, "step": 66405 }, { "epoch": 12.187557349972472, "grad_norm": 0.2140919268131256, "learning_rate": 3.971221694479857e-06, "loss": 0.2037, "num_input_tokens_seen": 143189760, "step": 66410 }, { "epoch": 12.188474949532024, "grad_norm": 58.6078987121582, "learning_rate": 3.970438084473031e-06, "loss": 0.3455, "num_input_tokens_seen": 143201664, "step": 66415 }, { "epoch": 12.189392549091576, "grad_norm": 0.01998007670044899, "learning_rate": 3.96965450087287e-06, "loss": 0.1213, "num_input_tokens_seen": 143212576, "step": 66420 }, { "epoch": 12.190310148651129, "grad_norm": 0.0653904601931572, "learning_rate": 3.968870943699477e-06, "loss": 0.1246, "num_input_tokens_seen": 143223712, "step": 66425 }, { "epoch": 12.19122774821068, "grad_norm": 0.009563236497342587, "learning_rate": 3.968087412972945e-06, "loss": 0.1693, "num_input_tokens_seen": 143234336, "step": 66430 }, { "epoch": 12.192145347770232, "grad_norm": 0.8295366168022156, "learning_rate": 3.967303908713372e-06, "loss": 0.2352, "num_input_tokens_seen": 143244736, "step": 66435 }, { "epoch": 12.193062947329786, "grad_norm": 0.913171648979187, "learning_rate": 3.966520430940852e-06, "loss": 0.1306, "num_input_tokens_seen": 143255136, "step": 66440 }, { "epoch": 12.193980546889337, "grad_norm": 100.48078918457031, "learning_rate": 3.965736979675481e-06, "loss": 0.2557, "num_input_tokens_seen": 143265952, "step": 66445 }, { "epoch": 12.194898146448889, "grad_norm": 0.01971817947924137, "learning_rate": 3.9649535549373555e-06, "loss": 0.0859, "num_input_tokens_seen": 143276192, "step": 66450 }, { "epoch": 12.195815746008442, "grad_norm": 0.005437428131699562, "learning_rate": 3.964170156746565e-06, "loss": 0.1708, "num_input_tokens_seen": 143286656, "step": 66455 }, { "epoch": 12.196733345567994, "grad_norm": 25.37233543395996, "learning_rate": 3.963386785123207e-06, "loss": 0.0442, "num_input_tokens_seen": 143296192, "step": 66460 }, { "epoch": 12.197650945127545, "grad_norm": 1.0595593452453613, "learning_rate": 3.9626034400873695e-06, "loss": 0.0541, "num_input_tokens_seen": 143307872, "step": 66465 }, { "epoch": 12.198568544687099, "grad_norm": 0.028040414676070213, "learning_rate": 3.961820121659145e-06, "loss": 0.004, "num_input_tokens_seen": 143319072, "step": 66470 }, { "epoch": 12.19948614424665, "grad_norm": 0.266848623752594, "learning_rate": 3.9610368298586275e-06, "loss": 0.0011, "num_input_tokens_seen": 143330976, "step": 66475 }, { "epoch": 12.200403743806202, "grad_norm": 0.020721592009067535, "learning_rate": 3.960253564705905e-06, "loss": 0.0433, "num_input_tokens_seen": 143341056, "step": 66480 }, { "epoch": 12.201321343365755, "grad_norm": 0.05988878756761551, "learning_rate": 3.959470326221066e-06, "loss": 0.102, "num_input_tokens_seen": 143351584, "step": 66485 }, { "epoch": 12.202238942925307, "grad_norm": 25.406173706054688, "learning_rate": 3.9586871144242e-06, "loss": 0.0379, "num_input_tokens_seen": 143363008, "step": 66490 }, { "epoch": 12.20315654248486, "grad_norm": 173.92637634277344, "learning_rate": 3.957903929335397e-06, "loss": 0.1915, "num_input_tokens_seen": 143373696, "step": 66495 }, { "epoch": 12.204074142044412, "grad_norm": 38.75568771362305, "learning_rate": 3.957120770974743e-06, "loss": 0.2665, "num_input_tokens_seen": 143385024, "step": 66500 }, { "epoch": 12.204991741603964, "grad_norm": 0.06879424303770065, "learning_rate": 3.956337639362323e-06, "loss": 0.1806, "num_input_tokens_seen": 143396128, "step": 66505 }, { "epoch": 12.205909341163517, "grad_norm": 0.03571149706840515, "learning_rate": 3.955554534518227e-06, "loss": 0.1104, "num_input_tokens_seen": 143406336, "step": 66510 }, { "epoch": 12.206826940723069, "grad_norm": 0.17425639927387238, "learning_rate": 3.954771456462538e-06, "loss": 0.0007, "num_input_tokens_seen": 143415936, "step": 66515 }, { "epoch": 12.20774454028262, "grad_norm": 0.11923377215862274, "learning_rate": 3.953988405215342e-06, "loss": 0.1327, "num_input_tokens_seen": 143426368, "step": 66520 }, { "epoch": 12.208662139842174, "grad_norm": 59.329071044921875, "learning_rate": 3.953205380796719e-06, "loss": 0.2831, "num_input_tokens_seen": 143437792, "step": 66525 }, { "epoch": 12.209579739401725, "grad_norm": 44.45124053955078, "learning_rate": 3.952422383226759e-06, "loss": 0.2114, "num_input_tokens_seen": 143450048, "step": 66530 }, { "epoch": 12.210497338961277, "grad_norm": 29.832748413085938, "learning_rate": 3.951639412525541e-06, "loss": 0.1418, "num_input_tokens_seen": 143461440, "step": 66535 }, { "epoch": 12.21141493852083, "grad_norm": 145.96302795410156, "learning_rate": 3.9508564687131465e-06, "loss": 0.2949, "num_input_tokens_seen": 143471296, "step": 66540 }, { "epoch": 12.212332538080382, "grad_norm": 0.8904040455818176, "learning_rate": 3.950073551809657e-06, "loss": 0.0019, "num_input_tokens_seen": 143481568, "step": 66545 }, { "epoch": 12.213250137639934, "grad_norm": 0.008361387066543102, "learning_rate": 3.9492906618351545e-06, "loss": 0.0033, "num_input_tokens_seen": 143492640, "step": 66550 }, { "epoch": 12.214167737199487, "grad_norm": 0.036674074828624725, "learning_rate": 3.948507798809718e-06, "loss": 0.0108, "num_input_tokens_seen": 143503584, "step": 66555 }, { "epoch": 12.215085336759039, "grad_norm": 0.17145001888275146, "learning_rate": 3.9477249627534265e-06, "loss": 0.0647, "num_input_tokens_seen": 143514720, "step": 66560 }, { "epoch": 12.21600293631859, "grad_norm": 0.3018049895763397, "learning_rate": 3.9469421536863595e-06, "loss": 0.0176, "num_input_tokens_seen": 143526592, "step": 66565 }, { "epoch": 12.216920535878144, "grad_norm": 75.4849853515625, "learning_rate": 3.946159371628593e-06, "loss": 0.0717, "num_input_tokens_seen": 143537952, "step": 66570 }, { "epoch": 12.217838135437695, "grad_norm": 63.612548828125, "learning_rate": 3.945376616600205e-06, "loss": 0.2368, "num_input_tokens_seen": 143548416, "step": 66575 }, { "epoch": 12.218755734997247, "grad_norm": 0.035880107432603836, "learning_rate": 3.944593888621274e-06, "loss": 0.0017, "num_input_tokens_seen": 143558208, "step": 66580 }, { "epoch": 12.2196733345568, "grad_norm": 36.48590850830078, "learning_rate": 3.943811187711873e-06, "loss": 0.1341, "num_input_tokens_seen": 143569696, "step": 66585 }, { "epoch": 12.220590934116352, "grad_norm": 0.005886952392756939, "learning_rate": 3.943028513892078e-06, "loss": 0.1302, "num_input_tokens_seen": 143580832, "step": 66590 }, { "epoch": 12.221508533675904, "grad_norm": 0.18411144614219666, "learning_rate": 3.942245867181964e-06, "loss": 0.001, "num_input_tokens_seen": 143591776, "step": 66595 }, { "epoch": 12.222426133235457, "grad_norm": 0.009767523966729641, "learning_rate": 3.941463247601604e-06, "loss": 0.0012, "num_input_tokens_seen": 143602688, "step": 66600 }, { "epoch": 12.223343732795009, "grad_norm": 0.06681077927350998, "learning_rate": 3.940680655171069e-06, "loss": 0.218, "num_input_tokens_seen": 143612800, "step": 66605 }, { "epoch": 12.22426133235456, "grad_norm": 0.021934567019343376, "learning_rate": 3.939898089910436e-06, "loss": 0.1384, "num_input_tokens_seen": 143624128, "step": 66610 }, { "epoch": 12.225178931914114, "grad_norm": 1.999886155128479, "learning_rate": 3.939115551839774e-06, "loss": 0.0014, "num_input_tokens_seen": 143635616, "step": 66615 }, { "epoch": 12.226096531473665, "grad_norm": 0.01971767656505108, "learning_rate": 3.9383330409791545e-06, "loss": 0.0804, "num_input_tokens_seen": 143646784, "step": 66620 }, { "epoch": 12.227014131033217, "grad_norm": 0.06217344477772713, "learning_rate": 3.937550557348644e-06, "loss": 0.2278, "num_input_tokens_seen": 143658912, "step": 66625 }, { "epoch": 12.22793173059277, "grad_norm": 0.01443522796034813, "learning_rate": 3.936768100968317e-06, "loss": 0.1864, "num_input_tokens_seen": 143669600, "step": 66630 }, { "epoch": 12.228849330152322, "grad_norm": 0.0876263827085495, "learning_rate": 3.935985671858241e-06, "loss": 0.3279, "num_input_tokens_seen": 143680544, "step": 66635 }, { "epoch": 12.229766929711873, "grad_norm": 726.87646484375, "learning_rate": 3.935203270038481e-06, "loss": 0.0946, "num_input_tokens_seen": 143690496, "step": 66640 }, { "epoch": 12.230684529271427, "grad_norm": 0.06304068118333817, "learning_rate": 3.934420895529109e-06, "loss": 0.1565, "num_input_tokens_seen": 143701632, "step": 66645 }, { "epoch": 12.231602128830978, "grad_norm": 0.10531584918498993, "learning_rate": 3.933638548350189e-06, "loss": 0.0024, "num_input_tokens_seen": 143712736, "step": 66650 }, { "epoch": 12.23251972839053, "grad_norm": 0.12175416946411133, "learning_rate": 3.932856228521788e-06, "loss": 0.0005, "num_input_tokens_seen": 143722592, "step": 66655 }, { "epoch": 12.233437327950083, "grad_norm": 0.3309669494628906, "learning_rate": 3.93207393606397e-06, "loss": 0.0018, "num_input_tokens_seen": 143733760, "step": 66660 }, { "epoch": 12.234354927509635, "grad_norm": 0.040787726640701294, "learning_rate": 3.931291670996801e-06, "loss": 0.0597, "num_input_tokens_seen": 143745024, "step": 66665 }, { "epoch": 12.235272527069187, "grad_norm": 0.035680003464221954, "learning_rate": 3.930509433340344e-06, "loss": 0.6136, "num_input_tokens_seen": 143757248, "step": 66670 }, { "epoch": 12.23619012662874, "grad_norm": 0.18464520573616028, "learning_rate": 3.929727223114662e-06, "loss": 0.2101, "num_input_tokens_seen": 143768960, "step": 66675 }, { "epoch": 12.237107726188292, "grad_norm": 0.14111898839473724, "learning_rate": 3.928945040339819e-06, "loss": 0.0012, "num_input_tokens_seen": 143778944, "step": 66680 }, { "epoch": 12.238025325747843, "grad_norm": 0.030343862250447273, "learning_rate": 3.928162885035877e-06, "loss": 0.2098, "num_input_tokens_seen": 143788480, "step": 66685 }, { "epoch": 12.238942925307397, "grad_norm": 111.10367584228516, "learning_rate": 3.927380757222892e-06, "loss": 0.2759, "num_input_tokens_seen": 143799104, "step": 66690 }, { "epoch": 12.239860524866948, "grad_norm": 210.07662963867188, "learning_rate": 3.92659865692093e-06, "loss": 0.2253, "num_input_tokens_seen": 143809984, "step": 66695 }, { "epoch": 12.2407781244265, "grad_norm": 0.019977744668722153, "learning_rate": 3.92581658415005e-06, "loss": 0.1057, "num_input_tokens_seen": 143821472, "step": 66700 }, { "epoch": 12.241695723986053, "grad_norm": 0.023051010444760323, "learning_rate": 3.925034538930309e-06, "loss": 0.0893, "num_input_tokens_seen": 143831968, "step": 66705 }, { "epoch": 12.242613323545605, "grad_norm": 0.03390748053789139, "learning_rate": 3.9242525212817645e-06, "loss": 0.0119, "num_input_tokens_seen": 143843488, "step": 66710 }, { "epoch": 12.243530923105157, "grad_norm": 0.18202714622020721, "learning_rate": 3.923470531224478e-06, "loss": 0.1505, "num_input_tokens_seen": 143854752, "step": 66715 }, { "epoch": 12.24444852266471, "grad_norm": 0.01585945300757885, "learning_rate": 3.9226885687785035e-06, "loss": 0.0023, "num_input_tokens_seen": 143864992, "step": 66720 }, { "epoch": 12.245366122224262, "grad_norm": 10.945527076721191, "learning_rate": 3.921906633963894e-06, "loss": 0.0051, "num_input_tokens_seen": 143875488, "step": 66725 }, { "epoch": 12.246283721783813, "grad_norm": 1.1340487003326416, "learning_rate": 3.921124726800713e-06, "loss": 0.0066, "num_input_tokens_seen": 143886400, "step": 66730 }, { "epoch": 12.247201321343367, "grad_norm": 1.4755457639694214, "learning_rate": 3.920342847309009e-06, "loss": 0.1301, "num_input_tokens_seen": 143897344, "step": 66735 }, { "epoch": 12.248118920902918, "grad_norm": 73.76154327392578, "learning_rate": 3.919560995508836e-06, "loss": 0.0646, "num_input_tokens_seen": 143907712, "step": 66740 }, { "epoch": 12.24903652046247, "grad_norm": 0.7222134470939636, "learning_rate": 3.918779171420251e-06, "loss": 0.2788, "num_input_tokens_seen": 143918144, "step": 66745 }, { "epoch": 12.249954120022023, "grad_norm": 85.46927642822266, "learning_rate": 3.917997375063305e-06, "loss": 0.1692, "num_input_tokens_seen": 143929312, "step": 66750 }, { "epoch": 12.250871719581575, "grad_norm": 0.041314516216516495, "learning_rate": 3.917215606458049e-06, "loss": 0.1255, "num_input_tokens_seen": 143940640, "step": 66755 }, { "epoch": 12.251789319141126, "grad_norm": 54.950408935546875, "learning_rate": 3.916433865624533e-06, "loss": 0.119, "num_input_tokens_seen": 143951488, "step": 66760 }, { "epoch": 12.25270691870068, "grad_norm": 0.04891939461231232, "learning_rate": 3.915652152582809e-06, "loss": 0.1921, "num_input_tokens_seen": 143962496, "step": 66765 }, { "epoch": 12.253624518260231, "grad_norm": 0.11161620169878006, "learning_rate": 3.914870467352928e-06, "loss": 0.1505, "num_input_tokens_seen": 143973280, "step": 66770 }, { "epoch": 12.254542117819783, "grad_norm": 78.06318664550781, "learning_rate": 3.914088809954937e-06, "loss": 0.7693, "num_input_tokens_seen": 143983840, "step": 66775 }, { "epoch": 12.255459717379336, "grad_norm": 1.1717710494995117, "learning_rate": 3.913307180408886e-06, "loss": 0.0522, "num_input_tokens_seen": 143994784, "step": 66780 }, { "epoch": 12.256377316938888, "grad_norm": 1.1178240776062012, "learning_rate": 3.912525578734822e-06, "loss": 0.0015, "num_input_tokens_seen": 144005920, "step": 66785 }, { "epoch": 12.25729491649844, "grad_norm": 4.025257110595703, "learning_rate": 3.9117440049527885e-06, "loss": 0.0628, "num_input_tokens_seen": 144017120, "step": 66790 }, { "epoch": 12.258212516057993, "grad_norm": 0.3569372594356537, "learning_rate": 3.910962459082837e-06, "loss": 0.0019, "num_input_tokens_seen": 144028992, "step": 66795 }, { "epoch": 12.259130115617545, "grad_norm": 0.055844783782958984, "learning_rate": 3.910180941145011e-06, "loss": 0.0054, "num_input_tokens_seen": 144039360, "step": 66800 }, { "epoch": 12.260047715177096, "grad_norm": 628.2351684570312, "learning_rate": 3.909399451159354e-06, "loss": 0.0404, "num_input_tokens_seen": 144051136, "step": 66805 }, { "epoch": 12.26096531473665, "grad_norm": 70.64153289794922, "learning_rate": 3.90861798914591e-06, "loss": 0.3261, "num_input_tokens_seen": 144062272, "step": 66810 }, { "epoch": 12.261882914296201, "grad_norm": 8.433408737182617, "learning_rate": 3.907836555124724e-06, "loss": 0.1927, "num_input_tokens_seen": 144073376, "step": 66815 }, { "epoch": 12.262800513855753, "grad_norm": 0.04796452820301056, "learning_rate": 3.907055149115838e-06, "loss": 0.1194, "num_input_tokens_seen": 144083200, "step": 66820 }, { "epoch": 12.263718113415306, "grad_norm": 0.06501458585262299, "learning_rate": 3.906273771139291e-06, "loss": 0.0679, "num_input_tokens_seen": 144092544, "step": 66825 }, { "epoch": 12.264635712974858, "grad_norm": 0.35576972365379333, "learning_rate": 3.905492421215129e-06, "loss": 0.0316, "num_input_tokens_seen": 144103840, "step": 66830 }, { "epoch": 12.26555331253441, "grad_norm": 0.058486923575401306, "learning_rate": 3.9047110993633905e-06, "loss": 0.1325, "num_input_tokens_seen": 144114336, "step": 66835 }, { "epoch": 12.266470912093963, "grad_norm": 0.5449923872947693, "learning_rate": 3.9039298056041145e-06, "loss": 0.1444, "num_input_tokens_seen": 144125952, "step": 66840 }, { "epoch": 12.267388511653515, "grad_norm": 168.64678955078125, "learning_rate": 3.903148539957339e-06, "loss": 0.1668, "num_input_tokens_seen": 144136384, "step": 66845 }, { "epoch": 12.268306111213066, "grad_norm": 295.2964782714844, "learning_rate": 3.902367302443104e-06, "loss": 0.1105, "num_input_tokens_seen": 144147648, "step": 66850 }, { "epoch": 12.26922371077262, "grad_norm": 0.06551370024681091, "learning_rate": 3.901586093081447e-06, "loss": 0.0707, "num_input_tokens_seen": 144159008, "step": 66855 }, { "epoch": 12.270141310332171, "grad_norm": 0.028199275955557823, "learning_rate": 3.900804911892402e-06, "loss": 0.1744, "num_input_tokens_seen": 144169984, "step": 66860 }, { "epoch": 12.271058909891723, "grad_norm": 1.2079581022262573, "learning_rate": 3.900023758896011e-06, "loss": 0.0012, "num_input_tokens_seen": 144181536, "step": 66865 }, { "epoch": 12.271976509451276, "grad_norm": 0.11084003746509552, "learning_rate": 3.899242634112304e-06, "loss": 0.0957, "num_input_tokens_seen": 144192448, "step": 66870 }, { "epoch": 12.272894109010828, "grad_norm": 15.2765474319458, "learning_rate": 3.8984615375613175e-06, "loss": 0.1731, "num_input_tokens_seen": 144202880, "step": 66875 }, { "epoch": 12.27381170857038, "grad_norm": 89.20877838134766, "learning_rate": 3.897680469263085e-06, "loss": 0.2715, "num_input_tokens_seen": 144214656, "step": 66880 }, { "epoch": 12.274729308129933, "grad_norm": 0.024813728407025337, "learning_rate": 3.896899429237641e-06, "loss": 0.0112, "num_input_tokens_seen": 144224384, "step": 66885 }, { "epoch": 12.275646907689485, "grad_norm": 4.427950859069824, "learning_rate": 3.896118417505016e-06, "loss": 0.0012, "num_input_tokens_seen": 144234624, "step": 66890 }, { "epoch": 12.276564507249036, "grad_norm": 4.109556674957275, "learning_rate": 3.8953374340852435e-06, "loss": 0.115, "num_input_tokens_seen": 144245472, "step": 66895 }, { "epoch": 12.27748210680859, "grad_norm": 1.1491950750350952, "learning_rate": 3.8945564789983535e-06, "loss": 0.0358, "num_input_tokens_seen": 144256480, "step": 66900 }, { "epoch": 12.278399706368141, "grad_norm": 0.13400889933109283, "learning_rate": 3.893775552264377e-06, "loss": 0.0592, "num_input_tokens_seen": 144266432, "step": 66905 }, { "epoch": 12.279317305927693, "grad_norm": 0.00950360856950283, "learning_rate": 3.892994653903342e-06, "loss": 0.0008, "num_input_tokens_seen": 144276640, "step": 66910 }, { "epoch": 12.280234905487246, "grad_norm": 46.80269241333008, "learning_rate": 3.892213783935279e-06, "loss": 0.0117, "num_input_tokens_seen": 144286784, "step": 66915 }, { "epoch": 12.281152505046798, "grad_norm": 125.98372650146484, "learning_rate": 3.891432942380215e-06, "loss": 0.0707, "num_input_tokens_seen": 144297728, "step": 66920 }, { "epoch": 12.28207010460635, "grad_norm": 5.224872589111328, "learning_rate": 3.8906521292581775e-06, "loss": 0.0111, "num_input_tokens_seen": 144308736, "step": 66925 }, { "epoch": 12.282987704165903, "grad_norm": 0.004346576984971762, "learning_rate": 3.889871344589195e-06, "loss": 0.0023, "num_input_tokens_seen": 144319456, "step": 66930 }, { "epoch": 12.283905303725454, "grad_norm": 0.0956413671374321, "learning_rate": 3.8890905883932926e-06, "loss": 0.1629, "num_input_tokens_seen": 144330976, "step": 66935 }, { "epoch": 12.284822903285006, "grad_norm": 0.008097252808511257, "learning_rate": 3.888309860690493e-06, "loss": 0.0022, "num_input_tokens_seen": 144341664, "step": 66940 }, { "epoch": 12.28574050284456, "grad_norm": 0.11220362037420273, "learning_rate": 3.887529161500822e-06, "loss": 0.165, "num_input_tokens_seen": 144352960, "step": 66945 }, { "epoch": 12.286658102404111, "grad_norm": 1.0084235668182373, "learning_rate": 3.886748490844306e-06, "loss": 0.1442, "num_input_tokens_seen": 144362816, "step": 66950 }, { "epoch": 12.287575701963663, "grad_norm": 4.54594087600708, "learning_rate": 3.885967848740965e-06, "loss": 0.001, "num_input_tokens_seen": 144372736, "step": 66955 }, { "epoch": 12.288493301523216, "grad_norm": 1.354337215423584, "learning_rate": 3.885187235210821e-06, "loss": 0.184, "num_input_tokens_seen": 144384000, "step": 66960 }, { "epoch": 12.289410901082768, "grad_norm": 0.06139499694108963, "learning_rate": 3.884406650273897e-06, "loss": 0.31, "num_input_tokens_seen": 144395520, "step": 66965 }, { "epoch": 12.29032850064232, "grad_norm": 69.029296875, "learning_rate": 3.883626093950215e-06, "loss": 0.2379, "num_input_tokens_seen": 144406272, "step": 66970 }, { "epoch": 12.291246100201873, "grad_norm": 0.221946120262146, "learning_rate": 3.882845566259792e-06, "loss": 0.0016, "num_input_tokens_seen": 144418336, "step": 66975 }, { "epoch": 12.292163699761424, "grad_norm": 0.10484641045331955, "learning_rate": 3.88206506722265e-06, "loss": 0.2546, "num_input_tokens_seen": 144428864, "step": 66980 }, { "epoch": 12.293081299320976, "grad_norm": 0.38722357153892517, "learning_rate": 3.881284596858805e-06, "loss": 0.1797, "num_input_tokens_seen": 144439200, "step": 66985 }, { "epoch": 12.29399889888053, "grad_norm": 34.88205337524414, "learning_rate": 3.880504155188277e-06, "loss": 0.3871, "num_input_tokens_seen": 144449248, "step": 66990 }, { "epoch": 12.294916498440081, "grad_norm": 0.023558294400572777, "learning_rate": 3.879723742231082e-06, "loss": 0.1343, "num_input_tokens_seen": 144460000, "step": 66995 }, { "epoch": 12.295834097999633, "grad_norm": 0.2539394199848175, "learning_rate": 3.878943358007238e-06, "loss": 0.1076, "num_input_tokens_seen": 144471456, "step": 67000 }, { "epoch": 12.296751697559186, "grad_norm": 2.4387869834899902, "learning_rate": 3.878163002536759e-06, "loss": 0.2272, "num_input_tokens_seen": 144483008, "step": 67005 }, { "epoch": 12.297669297118738, "grad_norm": 1.0777405500411987, "learning_rate": 3.877382675839657e-06, "loss": 0.1907, "num_input_tokens_seen": 144494560, "step": 67010 }, { "epoch": 12.29858689667829, "grad_norm": 145.78152465820312, "learning_rate": 3.876602377935953e-06, "loss": 0.169, "num_input_tokens_seen": 144505760, "step": 67015 }, { "epoch": 12.299504496237843, "grad_norm": 15.433331489562988, "learning_rate": 3.875822108845657e-06, "loss": 0.4648, "num_input_tokens_seen": 144518048, "step": 67020 }, { "epoch": 12.300422095797394, "grad_norm": 0.09688184410333633, "learning_rate": 3.87504186858878e-06, "loss": 0.2511, "num_input_tokens_seen": 144528288, "step": 67025 }, { "epoch": 12.301339695356946, "grad_norm": 8.874441146850586, "learning_rate": 3.8742616571853355e-06, "loss": 0.0136, "num_input_tokens_seen": 144539104, "step": 67030 }, { "epoch": 12.3022572949165, "grad_norm": 59.37225341796875, "learning_rate": 3.873481474655336e-06, "loss": 0.2525, "num_input_tokens_seen": 144550080, "step": 67035 }, { "epoch": 12.30317489447605, "grad_norm": 442.05908203125, "learning_rate": 3.87270132101879e-06, "loss": 0.1197, "num_input_tokens_seen": 144561664, "step": 67040 }, { "epoch": 12.304092494035602, "grad_norm": 0.10812264680862427, "learning_rate": 3.871921196295706e-06, "loss": 0.2578, "num_input_tokens_seen": 144572800, "step": 67045 }, { "epoch": 12.305010093595156, "grad_norm": 0.06574761122465134, "learning_rate": 3.8711411005060985e-06, "loss": 0.0291, "num_input_tokens_seen": 144584000, "step": 67050 }, { "epoch": 12.305927693154707, "grad_norm": 0.1393354833126068, "learning_rate": 3.870361033669971e-06, "loss": 0.1717, "num_input_tokens_seen": 144595424, "step": 67055 }, { "epoch": 12.306845292714259, "grad_norm": 0.12796056270599365, "learning_rate": 3.869580995807331e-06, "loss": 0.1385, "num_input_tokens_seen": 144606080, "step": 67060 }, { "epoch": 12.307762892273812, "grad_norm": 0.1061374694108963, "learning_rate": 3.868800986938187e-06, "loss": 0.0862, "num_input_tokens_seen": 144616032, "step": 67065 }, { "epoch": 12.308680491833364, "grad_norm": 1.2096495628356934, "learning_rate": 3.868021007082546e-06, "loss": 0.1458, "num_input_tokens_seen": 144627200, "step": 67070 }, { "epoch": 12.309598091392916, "grad_norm": 68.02684783935547, "learning_rate": 3.867241056260411e-06, "loss": 0.3406, "num_input_tokens_seen": 144637184, "step": 67075 }, { "epoch": 12.31051569095247, "grad_norm": 1.0305761098861694, "learning_rate": 3.8664611344917865e-06, "loss": 0.0113, "num_input_tokens_seen": 144648352, "step": 67080 }, { "epoch": 12.31143329051202, "grad_norm": 41.912052154541016, "learning_rate": 3.865681241796677e-06, "loss": 0.3784, "num_input_tokens_seen": 144659072, "step": 67085 }, { "epoch": 12.312350890071572, "grad_norm": 0.03322684392333031, "learning_rate": 3.864901378195086e-06, "loss": 0.3918, "num_input_tokens_seen": 144670304, "step": 67090 }, { "epoch": 12.313268489631126, "grad_norm": 270.91143798828125, "learning_rate": 3.864121543707016e-06, "loss": 0.0339, "num_input_tokens_seen": 144681504, "step": 67095 }, { "epoch": 12.314186089190677, "grad_norm": 0.7826125025749207, "learning_rate": 3.863341738352468e-06, "loss": 0.0044, "num_input_tokens_seen": 144690496, "step": 67100 }, { "epoch": 12.315103688750229, "grad_norm": 40.79922103881836, "learning_rate": 3.862561962151442e-06, "loss": 0.1631, "num_input_tokens_seen": 144701280, "step": 67105 }, { "epoch": 12.316021288309782, "grad_norm": 0.03262440487742424, "learning_rate": 3.8617822151239374e-06, "loss": 0.0466, "num_input_tokens_seen": 144712704, "step": 67110 }, { "epoch": 12.316938887869334, "grad_norm": 13.454405784606934, "learning_rate": 3.861002497289957e-06, "loss": 0.091, "num_input_tokens_seen": 144723648, "step": 67115 }, { "epoch": 12.317856487428886, "grad_norm": 36.69767379760742, "learning_rate": 3.860222808669498e-06, "loss": 0.0881, "num_input_tokens_seen": 144733440, "step": 67120 }, { "epoch": 12.318774086988439, "grad_norm": 0.48379436135292053, "learning_rate": 3.859443149282556e-06, "loss": 0.2236, "num_input_tokens_seen": 144744000, "step": 67125 }, { "epoch": 12.31969168654799, "grad_norm": 0.3076326251029968, "learning_rate": 3.858663519149128e-06, "loss": 0.1354, "num_input_tokens_seen": 144754176, "step": 67130 }, { "epoch": 12.320609286107542, "grad_norm": 39.706016540527344, "learning_rate": 3.857883918289215e-06, "loss": 0.1683, "num_input_tokens_seen": 144764448, "step": 67135 }, { "epoch": 12.321526885667096, "grad_norm": 74.69841003417969, "learning_rate": 3.857104346722808e-06, "loss": 0.1638, "num_input_tokens_seen": 144775168, "step": 67140 }, { "epoch": 12.322444485226647, "grad_norm": 4.314133167266846, "learning_rate": 3.856324804469901e-06, "loss": 0.1415, "num_input_tokens_seen": 144787008, "step": 67145 }, { "epoch": 12.323362084786199, "grad_norm": 0.4068344235420227, "learning_rate": 3.855545291550493e-06, "loss": 0.2063, "num_input_tokens_seen": 144798144, "step": 67150 }, { "epoch": 12.324279684345752, "grad_norm": 38.94131851196289, "learning_rate": 3.854765807984575e-06, "loss": 0.1471, "num_input_tokens_seen": 144808640, "step": 67155 }, { "epoch": 12.325197283905304, "grad_norm": 0.005783087573945522, "learning_rate": 3.853986353792138e-06, "loss": 0.0016, "num_input_tokens_seen": 144819840, "step": 67160 }, { "epoch": 12.326114883464856, "grad_norm": 14.223989486694336, "learning_rate": 3.853206928993174e-06, "loss": 0.2547, "num_input_tokens_seen": 144831168, "step": 67165 }, { "epoch": 12.327032483024409, "grad_norm": 82.93292236328125, "learning_rate": 3.852427533607676e-06, "loss": 0.2748, "num_input_tokens_seen": 144842912, "step": 67170 }, { "epoch": 12.32795008258396, "grad_norm": 23.02483367919922, "learning_rate": 3.851648167655634e-06, "loss": 0.0214, "num_input_tokens_seen": 144854080, "step": 67175 }, { "epoch": 12.328867682143512, "grad_norm": 0.11230320483446121, "learning_rate": 3.850868831157034e-06, "loss": 0.0214, "num_input_tokens_seen": 144864608, "step": 67180 }, { "epoch": 12.329785281703066, "grad_norm": 2.7961061000823975, "learning_rate": 3.850089524131869e-06, "loss": 0.0432, "num_input_tokens_seen": 144875072, "step": 67185 }, { "epoch": 12.330702881262617, "grad_norm": 504.32354736328125, "learning_rate": 3.849310246600126e-06, "loss": 0.3003, "num_input_tokens_seen": 144886176, "step": 67190 }, { "epoch": 12.331620480822169, "grad_norm": 16.070711135864258, "learning_rate": 3.848530998581792e-06, "loss": 0.2122, "num_input_tokens_seen": 144897312, "step": 67195 }, { "epoch": 12.332538080381722, "grad_norm": 0.6741150617599487, "learning_rate": 3.847751780096852e-06, "loss": 0.0948, "num_input_tokens_seen": 144907424, "step": 67200 }, { "epoch": 12.333455679941274, "grad_norm": 0.4688475728034973, "learning_rate": 3.846972591165293e-06, "loss": 0.0224, "num_input_tokens_seen": 144918432, "step": 67205 }, { "epoch": 12.334373279500825, "grad_norm": 0.20490491390228271, "learning_rate": 3.846193431807102e-06, "loss": 0.0068, "num_input_tokens_seen": 144928992, "step": 67210 }, { "epoch": 12.335290879060379, "grad_norm": 0.20088988542556763, "learning_rate": 3.845414302042259e-06, "loss": 0.1651, "num_input_tokens_seen": 144939808, "step": 67215 }, { "epoch": 12.33620847861993, "grad_norm": 0.18417096138000488, "learning_rate": 3.844635201890751e-06, "loss": 0.0384, "num_input_tokens_seen": 144950752, "step": 67220 }, { "epoch": 12.337126078179482, "grad_norm": 0.03983057662844658, "learning_rate": 3.843856131372561e-06, "loss": 0.1845, "num_input_tokens_seen": 144961952, "step": 67225 }, { "epoch": 12.338043677739035, "grad_norm": 0.4298713505268097, "learning_rate": 3.843077090507664e-06, "loss": 0.3897, "num_input_tokens_seen": 144971968, "step": 67230 }, { "epoch": 12.338961277298587, "grad_norm": 0.2513311505317688, "learning_rate": 3.842298079316052e-06, "loss": 0.008, "num_input_tokens_seen": 144983712, "step": 67235 }, { "epoch": 12.339878876858139, "grad_norm": 193.71868896484375, "learning_rate": 3.841519097817698e-06, "loss": 0.2232, "num_input_tokens_seen": 144992928, "step": 67240 }, { "epoch": 12.340796476417692, "grad_norm": 2.6892809867858887, "learning_rate": 3.840740146032582e-06, "loss": 0.1325, "num_input_tokens_seen": 145003392, "step": 67245 }, { "epoch": 12.341714075977244, "grad_norm": 0.17117048799991608, "learning_rate": 3.839961223980686e-06, "loss": 0.0009, "num_input_tokens_seen": 145015072, "step": 67250 }, { "epoch": 12.342631675536795, "grad_norm": 0.07425336539745331, "learning_rate": 3.8391823316819886e-06, "loss": 0.0597, "num_input_tokens_seen": 145024032, "step": 67255 }, { "epoch": 12.343549275096349, "grad_norm": 0.2689017951488495, "learning_rate": 3.838403469156464e-06, "loss": 0.0005, "num_input_tokens_seen": 145035648, "step": 67260 }, { "epoch": 12.3444668746559, "grad_norm": 0.0627831444144249, "learning_rate": 3.837624636424088e-06, "loss": 0.0002, "num_input_tokens_seen": 145046624, "step": 67265 }, { "epoch": 12.345384474215452, "grad_norm": 0.06516527384519577, "learning_rate": 3.836845833504841e-06, "loss": 0.1101, "num_input_tokens_seen": 145057728, "step": 67270 }, { "epoch": 12.346302073775005, "grad_norm": 1.129313349723816, "learning_rate": 3.836067060418695e-06, "loss": 0.1565, "num_input_tokens_seen": 145068544, "step": 67275 }, { "epoch": 12.347219673334557, "grad_norm": 0.0714283138513565, "learning_rate": 3.835288317185623e-06, "loss": 0.0015, "num_input_tokens_seen": 145079776, "step": 67280 }, { "epoch": 12.348137272894109, "grad_norm": 2.3869590759277344, "learning_rate": 3.8345096038256035e-06, "loss": 0.0038, "num_input_tokens_seen": 145090080, "step": 67285 }, { "epoch": 12.349054872453662, "grad_norm": 0.2342715710401535, "learning_rate": 3.8337309203586055e-06, "loss": 0.001, "num_input_tokens_seen": 145101088, "step": 67290 }, { "epoch": 12.349972472013214, "grad_norm": 95.12577819824219, "learning_rate": 3.832952266804602e-06, "loss": 0.3089, "num_input_tokens_seen": 145112192, "step": 67295 }, { "epoch": 12.350890071572765, "grad_norm": 0.01547564473003149, "learning_rate": 3.832173643183564e-06, "loss": 0.0004, "num_input_tokens_seen": 145122720, "step": 67300 }, { "epoch": 12.351807671132319, "grad_norm": 0.10281708836555481, "learning_rate": 3.831395049515461e-06, "loss": 0.2818, "num_input_tokens_seen": 145132736, "step": 67305 }, { "epoch": 12.35272527069187, "grad_norm": 0.2869289815425873, "learning_rate": 3.830616485820264e-06, "loss": 0.1928, "num_input_tokens_seen": 145142432, "step": 67310 }, { "epoch": 12.353642870251422, "grad_norm": 0.01331274677067995, "learning_rate": 3.829837952117942e-06, "loss": 0.0825, "num_input_tokens_seen": 145151712, "step": 67315 }, { "epoch": 12.354560469810975, "grad_norm": 145.63331604003906, "learning_rate": 3.829059448428465e-06, "loss": 0.1243, "num_input_tokens_seen": 145162336, "step": 67320 }, { "epoch": 12.355478069370527, "grad_norm": 0.11459099501371384, "learning_rate": 3.828280974771796e-06, "loss": 0.1507, "num_input_tokens_seen": 145172768, "step": 67325 }, { "epoch": 12.356395668930078, "grad_norm": 0.21626731753349304, "learning_rate": 3.827502531167903e-06, "loss": 0.1224, "num_input_tokens_seen": 145184608, "step": 67330 }, { "epoch": 12.357313268489632, "grad_norm": 0.01723269373178482, "learning_rate": 3.826724117636756e-06, "loss": 0.0363, "num_input_tokens_seen": 145195488, "step": 67335 }, { "epoch": 12.358230868049183, "grad_norm": 0.21789288520812988, "learning_rate": 3.825945734198315e-06, "loss": 0.0019, "num_input_tokens_seen": 145207232, "step": 67340 }, { "epoch": 12.359148467608735, "grad_norm": 0.0028781157452613115, "learning_rate": 3.825167380872547e-06, "loss": 0.0213, "num_input_tokens_seen": 145215680, "step": 67345 }, { "epoch": 12.360066067168288, "grad_norm": 0.016832206398248672, "learning_rate": 3.824389057679413e-06, "loss": 0.5391, "num_input_tokens_seen": 145226944, "step": 67350 }, { "epoch": 12.36098366672784, "grad_norm": 0.031490348279476166, "learning_rate": 3.82361076463888e-06, "loss": 0.0005, "num_input_tokens_seen": 145237312, "step": 67355 }, { "epoch": 12.361901266287392, "grad_norm": 16.165523529052734, "learning_rate": 3.822832501770908e-06, "loss": 0.3646, "num_input_tokens_seen": 145247808, "step": 67360 }, { "epoch": 12.362818865846945, "grad_norm": 0.008325916714966297, "learning_rate": 3.822054269095455e-06, "loss": 0.1326, "num_input_tokens_seen": 145258304, "step": 67365 }, { "epoch": 12.363736465406497, "grad_norm": 0.2275274246931076, "learning_rate": 3.821276066632487e-06, "loss": 0.2022, "num_input_tokens_seen": 145269440, "step": 67370 }, { "epoch": 12.364654064966048, "grad_norm": 196.89308166503906, "learning_rate": 3.82049789440196e-06, "loss": 0.1021, "num_input_tokens_seen": 145281632, "step": 67375 }, { "epoch": 12.365571664525602, "grad_norm": 0.12631957232952118, "learning_rate": 3.819719752423833e-06, "loss": 0.1003, "num_input_tokens_seen": 145293376, "step": 67380 }, { "epoch": 12.366489264085153, "grad_norm": 0.07779613137245178, "learning_rate": 3.8189416407180665e-06, "loss": 0.16, "num_input_tokens_seen": 145302176, "step": 67385 }, { "epoch": 12.367406863644705, "grad_norm": 65.7892074584961, "learning_rate": 3.8181635593046165e-06, "loss": 0.0213, "num_input_tokens_seen": 145312288, "step": 67390 }, { "epoch": 12.368324463204258, "grad_norm": 45.65533447265625, "learning_rate": 3.81738550820344e-06, "loss": 0.1046, "num_input_tokens_seen": 145324128, "step": 67395 }, { "epoch": 12.36924206276381, "grad_norm": 0.04283927381038666, "learning_rate": 3.8166074874344895e-06, "loss": 0.1328, "num_input_tokens_seen": 145332992, "step": 67400 }, { "epoch": 12.370159662323362, "grad_norm": 0.011210865341126919, "learning_rate": 3.8158294970177256e-06, "loss": 0.0026, "num_input_tokens_seen": 145342208, "step": 67405 }, { "epoch": 12.371077261882915, "grad_norm": 0.1991562396287918, "learning_rate": 3.8150515369731e-06, "loss": 0.0019, "num_input_tokens_seen": 145352512, "step": 67410 }, { "epoch": 12.371994861442467, "grad_norm": 29.666810989379883, "learning_rate": 3.814273607320564e-06, "loss": 0.2023, "num_input_tokens_seen": 145362720, "step": 67415 }, { "epoch": 12.372912461002018, "grad_norm": 39.593353271484375, "learning_rate": 3.813495708080074e-06, "loss": 0.0815, "num_input_tokens_seen": 145373856, "step": 67420 }, { "epoch": 12.373830060561572, "grad_norm": 0.21047525107860565, "learning_rate": 3.8127178392715795e-06, "loss": 0.1602, "num_input_tokens_seen": 145384128, "step": 67425 }, { "epoch": 12.374747660121123, "grad_norm": 0.044703248888254166, "learning_rate": 3.8119400009150308e-06, "loss": 0.0564, "num_input_tokens_seen": 145394944, "step": 67430 }, { "epoch": 12.375665259680675, "grad_norm": 0.9301224946975708, "learning_rate": 3.811162193030382e-06, "loss": 0.2996, "num_input_tokens_seen": 145405440, "step": 67435 }, { "epoch": 12.376582859240228, "grad_norm": 51.469871520996094, "learning_rate": 3.81038441563758e-06, "loss": 0.1647, "num_input_tokens_seen": 145415616, "step": 67440 }, { "epoch": 12.37750045879978, "grad_norm": 0.11124691367149353, "learning_rate": 3.809606668756574e-06, "loss": 0.0014, "num_input_tokens_seen": 145426144, "step": 67445 }, { "epoch": 12.378418058359332, "grad_norm": 1.0677655935287476, "learning_rate": 3.80882895240731e-06, "loss": 0.1921, "num_input_tokens_seen": 145435136, "step": 67450 }, { "epoch": 12.379335657918885, "grad_norm": 0.3284108638763428, "learning_rate": 3.8080512666097393e-06, "loss": 0.1677, "num_input_tokens_seen": 145445312, "step": 67455 }, { "epoch": 12.380253257478437, "grad_norm": 0.10315696150064468, "learning_rate": 3.8072736113838068e-06, "loss": 0.0004, "num_input_tokens_seen": 145456480, "step": 67460 }, { "epoch": 12.381170857037988, "grad_norm": 66.36915588378906, "learning_rate": 3.8064959867494543e-06, "loss": 0.1941, "num_input_tokens_seen": 145467840, "step": 67465 }, { "epoch": 12.382088456597542, "grad_norm": 0.1887638419866562, "learning_rate": 3.805718392726633e-06, "loss": 0.0011, "num_input_tokens_seen": 145478176, "step": 67470 }, { "epoch": 12.383006056157093, "grad_norm": 54.881717681884766, "learning_rate": 3.804940829335284e-06, "loss": 0.163, "num_input_tokens_seen": 145488992, "step": 67475 }, { "epoch": 12.383923655716645, "grad_norm": 0.11318625509738922, "learning_rate": 3.80416329659535e-06, "loss": 0.005, "num_input_tokens_seen": 145498112, "step": 67480 }, { "epoch": 12.384841255276198, "grad_norm": 0.06500600278377533, "learning_rate": 3.8033857945267726e-06, "loss": 0.1996, "num_input_tokens_seen": 145509184, "step": 67485 }, { "epoch": 12.38575885483575, "grad_norm": 9.497533798217773, "learning_rate": 3.802608323149497e-06, "loss": 0.002, "num_input_tokens_seen": 145520256, "step": 67490 }, { "epoch": 12.386676454395301, "grad_norm": 112.72864532470703, "learning_rate": 3.8018308824834616e-06, "loss": 0.1848, "num_input_tokens_seen": 145531360, "step": 67495 }, { "epoch": 12.387594053954855, "grad_norm": 49.9652099609375, "learning_rate": 3.801053472548605e-06, "loss": 0.1134, "num_input_tokens_seen": 145542368, "step": 67500 }, { "epoch": 12.388511653514406, "grad_norm": 0.4650150239467621, "learning_rate": 3.800276093364871e-06, "loss": 0.1848, "num_input_tokens_seen": 145554016, "step": 67505 }, { "epoch": 12.389429253073958, "grad_norm": 55.07220458984375, "learning_rate": 3.799498744952196e-06, "loss": 0.1626, "num_input_tokens_seen": 145564256, "step": 67510 }, { "epoch": 12.390346852633511, "grad_norm": 0.02341149002313614, "learning_rate": 3.798721427330516e-06, "loss": 0.0005, "num_input_tokens_seen": 145574720, "step": 67515 }, { "epoch": 12.391264452193063, "grad_norm": 41.528987884521484, "learning_rate": 3.7979441405197713e-06, "loss": 0.3256, "num_input_tokens_seen": 145584992, "step": 67520 }, { "epoch": 12.392182051752615, "grad_norm": 0.2921004593372345, "learning_rate": 3.797166884539895e-06, "loss": 0.002, "num_input_tokens_seen": 145594976, "step": 67525 }, { "epoch": 12.393099651312168, "grad_norm": 0.1654176414012909, "learning_rate": 3.796389659410825e-06, "loss": 0.049, "num_input_tokens_seen": 145605952, "step": 67530 }, { "epoch": 12.39401725087172, "grad_norm": 0.35696902871131897, "learning_rate": 3.7956124651524934e-06, "loss": 0.187, "num_input_tokens_seen": 145616864, "step": 67535 }, { "epoch": 12.394934850431271, "grad_norm": 0.031076395884156227, "learning_rate": 3.794835301784837e-06, "loss": 0.0006, "num_input_tokens_seen": 145626624, "step": 67540 }, { "epoch": 12.395852449990825, "grad_norm": 32.648406982421875, "learning_rate": 3.7940581693277865e-06, "loss": 0.4168, "num_input_tokens_seen": 145637696, "step": 67545 }, { "epoch": 12.396770049550376, "grad_norm": 0.2692071795463562, "learning_rate": 3.793281067801274e-06, "loss": 0.3996, "num_input_tokens_seen": 145648352, "step": 67550 }, { "epoch": 12.397687649109928, "grad_norm": 0.008680242113769054, "learning_rate": 3.792503997225233e-06, "loss": 0.1259, "num_input_tokens_seen": 145659232, "step": 67555 }, { "epoch": 12.398605248669481, "grad_norm": 0.02353457547724247, "learning_rate": 3.7917269576195935e-06, "loss": 0.1664, "num_input_tokens_seen": 145670880, "step": 67560 }, { "epoch": 12.399522848229033, "grad_norm": 0.0530986562371254, "learning_rate": 3.7909499490042824e-06, "loss": 0.0008, "num_input_tokens_seen": 145681664, "step": 67565 }, { "epoch": 12.400440447788585, "grad_norm": 26.15850257873535, "learning_rate": 3.7901729713992338e-06, "loss": 0.2292, "num_input_tokens_seen": 145691328, "step": 67570 }, { "epoch": 12.401358047348138, "grad_norm": 0.13847346603870392, "learning_rate": 3.789396024824373e-06, "loss": 0.0045, "num_input_tokens_seen": 145702592, "step": 67575 }, { "epoch": 12.40227564690769, "grad_norm": 0.09055199474096298, "learning_rate": 3.7886191092996284e-06, "loss": 0.0756, "num_input_tokens_seen": 145712384, "step": 67580 }, { "epoch": 12.403193246467241, "grad_norm": 0.2788006663322449, "learning_rate": 3.7878422248449237e-06, "loss": 0.0213, "num_input_tokens_seen": 145722752, "step": 67585 }, { "epoch": 12.404110846026795, "grad_norm": 44.74748229980469, "learning_rate": 3.7870653714801897e-06, "loss": 0.2755, "num_input_tokens_seen": 145732672, "step": 67590 }, { "epoch": 12.405028445586346, "grad_norm": 20.642061233520508, "learning_rate": 3.786288549225349e-06, "loss": 0.1102, "num_input_tokens_seen": 145743456, "step": 67595 }, { "epoch": 12.405946045145898, "grad_norm": 0.16145257651805878, "learning_rate": 3.785511758100323e-06, "loss": 0.3967, "num_input_tokens_seen": 145755872, "step": 67600 }, { "epoch": 12.406863644705451, "grad_norm": 148.75210571289062, "learning_rate": 3.7847349981250408e-06, "loss": 0.3033, "num_input_tokens_seen": 145766880, "step": 67605 }, { "epoch": 12.407781244265003, "grad_norm": 27.10776138305664, "learning_rate": 3.7839582693194227e-06, "loss": 0.2297, "num_input_tokens_seen": 145778464, "step": 67610 }, { "epoch": 12.408698843824554, "grad_norm": 24.633020401000977, "learning_rate": 3.78318157170339e-06, "loss": 0.1133, "num_input_tokens_seen": 145789440, "step": 67615 }, { "epoch": 12.409616443384108, "grad_norm": 0.06567493081092834, "learning_rate": 3.7824049052968625e-06, "loss": 0.0543, "num_input_tokens_seen": 145799776, "step": 67620 }, { "epoch": 12.41053404294366, "grad_norm": 0.06622102856636047, "learning_rate": 3.781628270119763e-06, "loss": 0.1166, "num_input_tokens_seen": 145811456, "step": 67625 }, { "epoch": 12.411451642503211, "grad_norm": 27.720884323120117, "learning_rate": 3.780851666192011e-06, "loss": 0.2317, "num_input_tokens_seen": 145822528, "step": 67630 }, { "epoch": 12.412369242062764, "grad_norm": 0.42199644446372986, "learning_rate": 3.780075093533523e-06, "loss": 0.196, "num_input_tokens_seen": 145834496, "step": 67635 }, { "epoch": 12.413286841622316, "grad_norm": 0.1969112604856491, "learning_rate": 3.7792985521642187e-06, "loss": 0.0006, "num_input_tokens_seen": 145844960, "step": 67640 }, { "epoch": 12.414204441181868, "grad_norm": 63.78429412841797, "learning_rate": 3.7785220421040153e-06, "loss": 0.1053, "num_input_tokens_seen": 145855680, "step": 67645 }, { "epoch": 12.415122040741421, "grad_norm": 16.05515480041504, "learning_rate": 3.777745563372826e-06, "loss": 0.0896, "num_input_tokens_seen": 145867392, "step": 67650 }, { "epoch": 12.416039640300973, "grad_norm": 0.11985692381858826, "learning_rate": 3.7769691159905706e-06, "loss": 0.0513, "num_input_tokens_seen": 145876800, "step": 67655 }, { "epoch": 12.416957239860524, "grad_norm": 93.6630859375, "learning_rate": 3.7761926999771624e-06, "loss": 0.1259, "num_input_tokens_seen": 145886880, "step": 67660 }, { "epoch": 12.417874839420078, "grad_norm": 0.3197997212409973, "learning_rate": 3.7754163153525147e-06, "loss": 0.002, "num_input_tokens_seen": 145897504, "step": 67665 }, { "epoch": 12.41879243897963, "grad_norm": 0.004606259986758232, "learning_rate": 3.774639962136538e-06, "loss": 0.057, "num_input_tokens_seen": 145907744, "step": 67670 }, { "epoch": 12.419710038539181, "grad_norm": 0.5160474181175232, "learning_rate": 3.773863640349149e-06, "loss": 0.1238, "num_input_tokens_seen": 145917696, "step": 67675 }, { "epoch": 12.420627638098734, "grad_norm": 52.03166580200195, "learning_rate": 3.7730873500102584e-06, "loss": 0.3858, "num_input_tokens_seen": 145929472, "step": 67680 }, { "epoch": 12.421545237658286, "grad_norm": 0.07517613470554352, "learning_rate": 3.7723110911397727e-06, "loss": 0.2126, "num_input_tokens_seen": 145940032, "step": 67685 }, { "epoch": 12.422462837217838, "grad_norm": 0.039189789444208145, "learning_rate": 3.771534863757607e-06, "loss": 0.2072, "num_input_tokens_seen": 145951392, "step": 67690 }, { "epoch": 12.423380436777391, "grad_norm": 0.50531405210495, "learning_rate": 3.7707586678836685e-06, "loss": 0.2749, "num_input_tokens_seen": 145962656, "step": 67695 }, { "epoch": 12.424298036336943, "grad_norm": 0.23122745752334595, "learning_rate": 3.769982503537862e-06, "loss": 0.0585, "num_input_tokens_seen": 145971872, "step": 67700 }, { "epoch": 12.425215635896494, "grad_norm": 0.06032142788171768, "learning_rate": 3.7692063707401013e-06, "loss": 0.0253, "num_input_tokens_seen": 145982912, "step": 67705 }, { "epoch": 12.426133235456048, "grad_norm": 125.82364654541016, "learning_rate": 3.768430269510289e-06, "loss": 0.0992, "num_input_tokens_seen": 145993728, "step": 67710 }, { "epoch": 12.4270508350156, "grad_norm": 70.35487365722656, "learning_rate": 3.7676541998683315e-06, "loss": 0.1965, "num_input_tokens_seen": 146003104, "step": 67715 }, { "epoch": 12.42796843457515, "grad_norm": 0.026853932067751884, "learning_rate": 3.7668781618341315e-06, "loss": 0.115, "num_input_tokens_seen": 146013376, "step": 67720 }, { "epoch": 12.428886034134704, "grad_norm": 38.44748306274414, "learning_rate": 3.766102155427598e-06, "loss": 0.1798, "num_input_tokens_seen": 146025344, "step": 67725 }, { "epoch": 12.429803633694256, "grad_norm": 0.1105310246348381, "learning_rate": 3.7653261806686316e-06, "loss": 0.0007, "num_input_tokens_seen": 146036864, "step": 67730 }, { "epoch": 12.430721233253808, "grad_norm": 0.2855507433414459, "learning_rate": 3.7645502375771337e-06, "loss": 0.001, "num_input_tokens_seen": 146048096, "step": 67735 }, { "epoch": 12.431638832813361, "grad_norm": 0.04465379938483238, "learning_rate": 3.763774326173009e-06, "loss": 0.0794, "num_input_tokens_seen": 146058528, "step": 67740 }, { "epoch": 12.432556432372913, "grad_norm": 0.3304266333580017, "learning_rate": 3.762998446476156e-06, "loss": 0.1104, "num_input_tokens_seen": 146068864, "step": 67745 }, { "epoch": 12.433474031932464, "grad_norm": 59.76289367675781, "learning_rate": 3.7622225985064763e-06, "loss": 0.1526, "num_input_tokens_seen": 146079232, "step": 67750 }, { "epoch": 12.434391631492018, "grad_norm": 206.1075897216797, "learning_rate": 3.761446782283867e-06, "loss": 0.3042, "num_input_tokens_seen": 146090624, "step": 67755 }, { "epoch": 12.43530923105157, "grad_norm": 0.053100645542144775, "learning_rate": 3.76067099782823e-06, "loss": 0.0011, "num_input_tokens_seen": 146102112, "step": 67760 }, { "epoch": 12.43622683061112, "grad_norm": 1.50209379196167, "learning_rate": 3.759895245159461e-06, "loss": 0.2078, "num_input_tokens_seen": 146112192, "step": 67765 }, { "epoch": 12.437144430170674, "grad_norm": 0.3616706132888794, "learning_rate": 3.759119524297455e-06, "loss": 0.0656, "num_input_tokens_seen": 146122912, "step": 67770 }, { "epoch": 12.438062029730226, "grad_norm": 0.05727681145071983, "learning_rate": 3.758343835262112e-06, "loss": 0.001, "num_input_tokens_seen": 146133536, "step": 67775 }, { "epoch": 12.438979629289777, "grad_norm": 0.011792745441198349, "learning_rate": 3.7575681780733253e-06, "loss": 0.1347, "num_input_tokens_seen": 146143424, "step": 67780 }, { "epoch": 12.43989722884933, "grad_norm": 0.0762224942445755, "learning_rate": 3.756792552750987e-06, "loss": 0.3308, "num_input_tokens_seen": 146155328, "step": 67785 }, { "epoch": 12.440814828408882, "grad_norm": 10.084636688232422, "learning_rate": 3.756016959314995e-06, "loss": 0.1625, "num_input_tokens_seen": 146165024, "step": 67790 }, { "epoch": 12.441732427968434, "grad_norm": 0.23388269543647766, "learning_rate": 3.75524139778524e-06, "loss": 0.2721, "num_input_tokens_seen": 146174560, "step": 67795 }, { "epoch": 12.442650027527987, "grad_norm": 6.75385046005249, "learning_rate": 3.7544658681816137e-06, "loss": 0.0026, "num_input_tokens_seen": 146184512, "step": 67800 }, { "epoch": 12.443567627087539, "grad_norm": 1.0179036855697632, "learning_rate": 3.7536903705240057e-06, "loss": 0.0335, "num_input_tokens_seen": 146194176, "step": 67805 }, { "epoch": 12.44448522664709, "grad_norm": 25.015256881713867, "learning_rate": 3.7529149048323098e-06, "loss": 0.2245, "num_input_tokens_seen": 146203744, "step": 67810 }, { "epoch": 12.445402826206644, "grad_norm": 22.233171463012695, "learning_rate": 3.7521394711264148e-06, "loss": 0.2586, "num_input_tokens_seen": 146214176, "step": 67815 }, { "epoch": 12.446320425766196, "grad_norm": 33.202606201171875, "learning_rate": 3.7513640694262056e-06, "loss": 0.2018, "num_input_tokens_seen": 146225248, "step": 67820 }, { "epoch": 12.447238025325747, "grad_norm": 0.1417335867881775, "learning_rate": 3.7505886997515744e-06, "loss": 0.1807, "num_input_tokens_seen": 146236736, "step": 67825 }, { "epoch": 12.4481556248853, "grad_norm": 58.30647659301758, "learning_rate": 3.7498133621224073e-06, "loss": 0.3776, "num_input_tokens_seen": 146248704, "step": 67830 }, { "epoch": 12.449073224444852, "grad_norm": 10.982888221740723, "learning_rate": 3.7490380565585886e-06, "loss": 0.1051, "num_input_tokens_seen": 146259488, "step": 67835 }, { "epoch": 12.449990824004404, "grad_norm": 0.16327309608459473, "learning_rate": 3.748262783080006e-06, "loss": 0.1241, "num_input_tokens_seen": 146269728, "step": 67840 }, { "epoch": 12.450908423563957, "grad_norm": 0.08980560302734375, "learning_rate": 3.747487541706542e-06, "loss": 0.0026, "num_input_tokens_seen": 146280992, "step": 67845 }, { "epoch": 12.451826023123509, "grad_norm": 0.05930333212018013, "learning_rate": 3.746712332458082e-06, "loss": 0.0951, "num_input_tokens_seen": 146291872, "step": 67850 }, { "epoch": 12.45274362268306, "grad_norm": 43.450660705566406, "learning_rate": 3.7459371553545076e-06, "loss": 0.2731, "num_input_tokens_seen": 146302304, "step": 67855 }, { "epoch": 12.453661222242614, "grad_norm": 0.5428964495658875, "learning_rate": 3.745162010415703e-06, "loss": 0.0008, "num_input_tokens_seen": 146313760, "step": 67860 }, { "epoch": 12.454578821802166, "grad_norm": 0.36571890115737915, "learning_rate": 3.744386897661547e-06, "loss": 0.1233, "num_input_tokens_seen": 146324704, "step": 67865 }, { "epoch": 12.455496421361717, "grad_norm": 0.10127946734428406, "learning_rate": 3.7436118171119198e-06, "loss": 0.0013, "num_input_tokens_seen": 146334752, "step": 67870 }, { "epoch": 12.45641402092127, "grad_norm": 0.09283747524023056, "learning_rate": 3.742836768786704e-06, "loss": 0.1171, "num_input_tokens_seen": 146346048, "step": 67875 }, { "epoch": 12.457331620480822, "grad_norm": 0.05879441648721695, "learning_rate": 3.742061752705777e-06, "loss": 0.2257, "num_input_tokens_seen": 146357632, "step": 67880 }, { "epoch": 12.458249220040374, "grad_norm": 19.10021209716797, "learning_rate": 3.7412867688890144e-06, "loss": 0.0146, "num_input_tokens_seen": 146366880, "step": 67885 }, { "epoch": 12.459166819599927, "grad_norm": 0.049405891448259354, "learning_rate": 3.7405118173562964e-06, "loss": 0.0072, "num_input_tokens_seen": 146378272, "step": 67890 }, { "epoch": 12.460084419159479, "grad_norm": 4.196362018585205, "learning_rate": 3.7397368981274994e-06, "loss": 0.0037, "num_input_tokens_seen": 146390144, "step": 67895 }, { "epoch": 12.46100201871903, "grad_norm": 64.10368347167969, "learning_rate": 3.7389620112224966e-06, "loss": 0.2531, "num_input_tokens_seen": 146402112, "step": 67900 }, { "epoch": 12.461919618278584, "grad_norm": 0.5676708817481995, "learning_rate": 3.7381871566611626e-06, "loss": 0.5244, "num_input_tokens_seen": 146413984, "step": 67905 }, { "epoch": 12.462837217838135, "grad_norm": 1.6311248540878296, "learning_rate": 3.7374123344633745e-06, "loss": 0.1137, "num_input_tokens_seen": 146425568, "step": 67910 }, { "epoch": 12.463754817397687, "grad_norm": 0.12339959293603897, "learning_rate": 3.7366375446490028e-06, "loss": 0.1724, "num_input_tokens_seen": 146436544, "step": 67915 }, { "epoch": 12.46467241695724, "grad_norm": 8.2908296585083, "learning_rate": 3.7358627872379183e-06, "loss": 0.1767, "num_input_tokens_seen": 146447520, "step": 67920 }, { "epoch": 12.465590016516792, "grad_norm": 2.379570245742798, "learning_rate": 3.7350880622499956e-06, "loss": 0.0462, "num_input_tokens_seen": 146457632, "step": 67925 }, { "epoch": 12.466507616076344, "grad_norm": 2.993122100830078, "learning_rate": 3.734313369705104e-06, "loss": 0.0952, "num_input_tokens_seen": 146469440, "step": 67930 }, { "epoch": 12.467425215635897, "grad_norm": 0.1295623928308487, "learning_rate": 3.733538709623113e-06, "loss": 0.092, "num_input_tokens_seen": 146479200, "step": 67935 }, { "epoch": 12.468342815195449, "grad_norm": 24.94422149658203, "learning_rate": 3.7327640820238893e-06, "loss": 0.0764, "num_input_tokens_seen": 146490752, "step": 67940 }, { "epoch": 12.469260414755, "grad_norm": 0.029167262837290764, "learning_rate": 3.7319894869273043e-06, "loss": 0.0305, "num_input_tokens_seen": 146502848, "step": 67945 }, { "epoch": 12.470178014314554, "grad_norm": 210.2141571044922, "learning_rate": 3.731214924353224e-06, "loss": 0.1333, "num_input_tokens_seen": 146514304, "step": 67950 }, { "epoch": 12.471095613874105, "grad_norm": 74.4675064086914, "learning_rate": 3.730440394321514e-06, "loss": 0.2166, "num_input_tokens_seen": 146525472, "step": 67955 }, { "epoch": 12.472013213433657, "grad_norm": 0.01997234858572483, "learning_rate": 3.7296658968520406e-06, "loss": 0.0797, "num_input_tokens_seen": 146536064, "step": 67960 }, { "epoch": 12.47293081299321, "grad_norm": 0.22077319025993347, "learning_rate": 3.7288914319646684e-06, "loss": 0.1443, "num_input_tokens_seen": 146546048, "step": 67965 }, { "epoch": 12.473848412552762, "grad_norm": 0.0019413888221606612, "learning_rate": 3.728116999679259e-06, "loss": 0.0021, "num_input_tokens_seen": 146556352, "step": 67970 }, { "epoch": 12.474766012112314, "grad_norm": 0.004171686712652445, "learning_rate": 3.727342600015679e-06, "loss": 0.0073, "num_input_tokens_seen": 146567168, "step": 67975 }, { "epoch": 12.475683611671867, "grad_norm": 12.221719741821289, "learning_rate": 3.726568232993789e-06, "loss": 0.1475, "num_input_tokens_seen": 146578560, "step": 67980 }, { "epoch": 12.476601211231419, "grad_norm": 159.9145050048828, "learning_rate": 3.72579389863345e-06, "loss": 0.454, "num_input_tokens_seen": 146589696, "step": 67985 }, { "epoch": 12.47751881079097, "grad_norm": 0.29636234045028687, "learning_rate": 3.7250195969545213e-06, "loss": 0.2453, "num_input_tokens_seen": 146601344, "step": 67990 }, { "epoch": 12.478436410350524, "grad_norm": 32.10935592651367, "learning_rate": 3.724245327976865e-06, "loss": 0.6334, "num_input_tokens_seen": 146610944, "step": 67995 }, { "epoch": 12.479354009910075, "grad_norm": 2.8394522666931152, "learning_rate": 3.7234710917203387e-06, "loss": 0.0656, "num_input_tokens_seen": 146621216, "step": 68000 }, { "epoch": 12.480271609469627, "grad_norm": 40.94926834106445, "learning_rate": 3.722696888204799e-06, "loss": 0.0217, "num_input_tokens_seen": 146633056, "step": 68005 }, { "epoch": 12.48118920902918, "grad_norm": 65.4466781616211, "learning_rate": 3.721922717450107e-06, "loss": 0.2517, "num_input_tokens_seen": 146645056, "step": 68010 }, { "epoch": 12.482106808588732, "grad_norm": 254.35423278808594, "learning_rate": 3.721148579476116e-06, "loss": 0.1249, "num_input_tokens_seen": 146655456, "step": 68015 }, { "epoch": 12.483024408148284, "grad_norm": 0.22137996554374695, "learning_rate": 3.720374474302679e-06, "loss": 0.0006, "num_input_tokens_seen": 146667552, "step": 68020 }, { "epoch": 12.483942007707837, "grad_norm": 41.84296417236328, "learning_rate": 3.719600401949657e-06, "loss": 0.4474, "num_input_tokens_seen": 146678464, "step": 68025 }, { "epoch": 12.484859607267389, "grad_norm": 0.16445793211460114, "learning_rate": 3.718826362436899e-06, "loss": 0.1355, "num_input_tokens_seen": 146690144, "step": 68030 }, { "epoch": 12.48577720682694, "grad_norm": 0.3652712106704712, "learning_rate": 3.7180523557842603e-06, "loss": 0.0023, "num_input_tokens_seen": 146702144, "step": 68035 }, { "epoch": 12.486694806386494, "grad_norm": 33.82609558105469, "learning_rate": 3.717278382011589e-06, "loss": 0.0497, "num_input_tokens_seen": 146713024, "step": 68040 }, { "epoch": 12.487612405946045, "grad_norm": 0.21310357749462128, "learning_rate": 3.7165044411387417e-06, "loss": 0.0482, "num_input_tokens_seen": 146723296, "step": 68045 }, { "epoch": 12.488530005505597, "grad_norm": 0.44264018535614014, "learning_rate": 3.715730533185566e-06, "loss": 0.1184, "num_input_tokens_seen": 146733216, "step": 68050 }, { "epoch": 12.48944760506515, "grad_norm": 63.21300506591797, "learning_rate": 3.7149566581719105e-06, "loss": 0.1101, "num_input_tokens_seen": 146743776, "step": 68055 }, { "epoch": 12.490365204624702, "grad_norm": 0.32593342661857605, "learning_rate": 3.7141828161176265e-06, "loss": 0.0434, "num_input_tokens_seen": 146753888, "step": 68060 }, { "epoch": 12.491282804184253, "grad_norm": 2.550539970397949, "learning_rate": 3.713409007042559e-06, "loss": 0.0607, "num_input_tokens_seen": 146764960, "step": 68065 }, { "epoch": 12.492200403743807, "grad_norm": 0.322805255651474, "learning_rate": 3.7126352309665577e-06, "loss": 0.091, "num_input_tokens_seen": 146776320, "step": 68070 }, { "epoch": 12.493118003303358, "grad_norm": 1.3856409788131714, "learning_rate": 3.711861487909466e-06, "loss": 0.0008, "num_input_tokens_seen": 146787104, "step": 68075 }, { "epoch": 12.49403560286291, "grad_norm": 0.06487869471311569, "learning_rate": 3.7110877778911324e-06, "loss": 0.0219, "num_input_tokens_seen": 146798240, "step": 68080 }, { "epoch": 12.494953202422463, "grad_norm": 22.58487319946289, "learning_rate": 3.7103141009313993e-06, "loss": 0.1158, "num_input_tokens_seen": 146808928, "step": 68085 }, { "epoch": 12.495870801982015, "grad_norm": 0.09870145469903946, "learning_rate": 3.7095404570501087e-06, "loss": 0.0411, "num_input_tokens_seen": 146818944, "step": 68090 }, { "epoch": 12.496788401541567, "grad_norm": 0.0663105845451355, "learning_rate": 3.7087668462671074e-06, "loss": 0.0016, "num_input_tokens_seen": 146828704, "step": 68095 }, { "epoch": 12.49770600110112, "grad_norm": 3.2837328910827637, "learning_rate": 3.7079932686022354e-06, "loss": 0.137, "num_input_tokens_seen": 146838688, "step": 68100 }, { "epoch": 12.498623600660672, "grad_norm": 0.015140950679779053, "learning_rate": 3.7072197240753315e-06, "loss": 0.5366, "num_input_tokens_seen": 146850880, "step": 68105 }, { "epoch": 12.499541200220223, "grad_norm": 0.030107811093330383, "learning_rate": 3.7064462127062397e-06, "loss": 0.279, "num_input_tokens_seen": 146861664, "step": 68110 }, { "epoch": 12.500458799779777, "grad_norm": 55.51304626464844, "learning_rate": 3.705672734514798e-06, "loss": 0.062, "num_input_tokens_seen": 146871808, "step": 68115 }, { "epoch": 12.501376399339328, "grad_norm": 0.1445179581642151, "learning_rate": 3.7048992895208445e-06, "loss": 0.4216, "num_input_tokens_seen": 146882048, "step": 68120 }, { "epoch": 12.50229399889888, "grad_norm": 0.023819057270884514, "learning_rate": 3.704125877744216e-06, "loss": 0.0017, "num_input_tokens_seen": 146892128, "step": 68125 }, { "epoch": 12.503211598458433, "grad_norm": 182.9507293701172, "learning_rate": 3.703352499204751e-06, "loss": 0.2161, "num_input_tokens_seen": 146902240, "step": 68130 }, { "epoch": 12.504129198017985, "grad_norm": 0.0832466259598732, "learning_rate": 3.7025791539222855e-06, "loss": 0.0014, "num_input_tokens_seen": 146912096, "step": 68135 }, { "epoch": 12.505046797577537, "grad_norm": 0.07557385414838791, "learning_rate": 3.701805841916651e-06, "loss": 0.2449, "num_input_tokens_seen": 146922176, "step": 68140 }, { "epoch": 12.50596439713709, "grad_norm": 0.3187579810619354, "learning_rate": 3.7010325632076877e-06, "loss": 0.0004, "num_input_tokens_seen": 146933120, "step": 68145 }, { "epoch": 12.506881996696642, "grad_norm": 0.41761037707328796, "learning_rate": 3.7002593178152257e-06, "loss": 0.0056, "num_input_tokens_seen": 146943040, "step": 68150 }, { "epoch": 12.507799596256193, "grad_norm": 88.18990325927734, "learning_rate": 3.6994861057590963e-06, "loss": 0.3843, "num_input_tokens_seen": 146953760, "step": 68155 }, { "epoch": 12.508717195815747, "grad_norm": 0.06753844022750854, "learning_rate": 3.6987129270591337e-06, "loss": 0.0686, "num_input_tokens_seen": 146964384, "step": 68160 }, { "epoch": 12.509634795375298, "grad_norm": 63.65460968017578, "learning_rate": 3.697939781735167e-06, "loss": 0.3985, "num_input_tokens_seen": 146976352, "step": 68165 }, { "epoch": 12.51055239493485, "grad_norm": 0.6851288080215454, "learning_rate": 3.6971666698070285e-06, "loss": 0.0026, "num_input_tokens_seen": 146986560, "step": 68170 }, { "epoch": 12.511469994494403, "grad_norm": 0.16130122542381287, "learning_rate": 3.696393591294544e-06, "loss": 0.0698, "num_input_tokens_seen": 146996960, "step": 68175 }, { "epoch": 12.512387594053955, "grad_norm": 1.2953386306762695, "learning_rate": 3.6956205462175443e-06, "loss": 0.1887, "num_input_tokens_seen": 147007744, "step": 68180 }, { "epoch": 12.513305193613506, "grad_norm": 49.782470703125, "learning_rate": 3.694847534595857e-06, "loss": 0.0149, "num_input_tokens_seen": 147019616, "step": 68185 }, { "epoch": 12.51422279317306, "grad_norm": 51.8365592956543, "learning_rate": 3.6940745564493043e-06, "loss": 0.0556, "num_input_tokens_seen": 147030528, "step": 68190 }, { "epoch": 12.515140392732611, "grad_norm": 48.75376510620117, "learning_rate": 3.6933016117977177e-06, "loss": 0.0508, "num_input_tokens_seen": 147041440, "step": 68195 }, { "epoch": 12.516057992292163, "grad_norm": 0.261180579662323, "learning_rate": 3.69252870066092e-06, "loss": 0.0212, "num_input_tokens_seen": 147052288, "step": 68200 }, { "epoch": 12.516975591851716, "grad_norm": 0.04198162257671356, "learning_rate": 3.691755823058734e-06, "loss": 0.0592, "num_input_tokens_seen": 147062944, "step": 68205 }, { "epoch": 12.517893191411268, "grad_norm": 0.7570661902427673, "learning_rate": 3.690982979010982e-06, "loss": 0.0107, "num_input_tokens_seen": 147073216, "step": 68210 }, { "epoch": 12.51881079097082, "grad_norm": 0.027567630633711815, "learning_rate": 3.6902101685374897e-06, "loss": 0.1187, "num_input_tokens_seen": 147083648, "step": 68215 }, { "epoch": 12.519728390530373, "grad_norm": 0.006907041184604168, "learning_rate": 3.689437391658077e-06, "loss": 0.1546, "num_input_tokens_seen": 147093504, "step": 68220 }, { "epoch": 12.520645990089925, "grad_norm": 0.004103388637304306, "learning_rate": 3.6886646483925613e-06, "loss": 0.0007, "num_input_tokens_seen": 147104384, "step": 68225 }, { "epoch": 12.521563589649476, "grad_norm": 152.52072143554688, "learning_rate": 3.687891938760767e-06, "loss": 0.0334, "num_input_tokens_seen": 147116672, "step": 68230 }, { "epoch": 12.52248118920903, "grad_norm": 0.022382238879799843, "learning_rate": 3.6871192627825115e-06, "loss": 0.2704, "num_input_tokens_seen": 147128672, "step": 68235 }, { "epoch": 12.523398788768581, "grad_norm": 0.057238686829805374, "learning_rate": 3.68634662047761e-06, "loss": 0.2535, "num_input_tokens_seen": 147139584, "step": 68240 }, { "epoch": 12.524316388328133, "grad_norm": 23.7669620513916, "learning_rate": 3.6855740118658834e-06, "loss": 0.1641, "num_input_tokens_seen": 147150656, "step": 68245 }, { "epoch": 12.525233987887686, "grad_norm": 23.45777702331543, "learning_rate": 3.6848014369671464e-06, "loss": 0.0066, "num_input_tokens_seen": 147160384, "step": 68250 }, { "epoch": 12.526151587447238, "grad_norm": 1.6757813692092896, "learning_rate": 3.6840288958012137e-06, "loss": 0.0008, "num_input_tokens_seen": 147170400, "step": 68255 }, { "epoch": 12.52706918700679, "grad_norm": 28.573326110839844, "learning_rate": 3.6832563883878993e-06, "loss": 0.0151, "num_input_tokens_seen": 147179552, "step": 68260 }, { "epoch": 12.527986786566343, "grad_norm": 0.01747973822057247, "learning_rate": 3.6824839147470193e-06, "loss": 0.018, "num_input_tokens_seen": 147189440, "step": 68265 }, { "epoch": 12.528904386125895, "grad_norm": 0.018056832253932953, "learning_rate": 3.6817114748983845e-06, "loss": 0.0929, "num_input_tokens_seen": 147198656, "step": 68270 }, { "epoch": 12.529821985685446, "grad_norm": 231.9423065185547, "learning_rate": 3.680939068861806e-06, "loss": 0.038, "num_input_tokens_seen": 147209952, "step": 68275 }, { "epoch": 12.530739585245, "grad_norm": 18.908830642700195, "learning_rate": 3.6801666966570983e-06, "loss": 0.0697, "num_input_tokens_seen": 147220224, "step": 68280 }, { "epoch": 12.531657184804551, "grad_norm": 0.09937611222267151, "learning_rate": 3.679394358304067e-06, "loss": 0.1505, "num_input_tokens_seen": 147230624, "step": 68285 }, { "epoch": 12.532574784364103, "grad_norm": 35.4693489074707, "learning_rate": 3.678622053822524e-06, "loss": 0.0351, "num_input_tokens_seen": 147240896, "step": 68290 }, { "epoch": 12.533492383923656, "grad_norm": 0.046188343316316605, "learning_rate": 3.677849783232279e-06, "loss": 0.0085, "num_input_tokens_seen": 147251296, "step": 68295 }, { "epoch": 12.534409983483208, "grad_norm": 0.04998426139354706, "learning_rate": 3.677077546553137e-06, "loss": 0.1503, "num_input_tokens_seen": 147260832, "step": 68300 }, { "epoch": 12.53532758304276, "grad_norm": 0.03209249675273895, "learning_rate": 3.676305343804906e-06, "loss": 0.0005, "num_input_tokens_seen": 147270944, "step": 68305 }, { "epoch": 12.536245182602313, "grad_norm": 79.60533905029297, "learning_rate": 3.67553317500739e-06, "loss": 0.0127, "num_input_tokens_seen": 147281536, "step": 68310 }, { "epoch": 12.537162782161865, "grad_norm": 1.394882082939148, "learning_rate": 3.674761040180396e-06, "loss": 0.1342, "num_input_tokens_seen": 147290944, "step": 68315 }, { "epoch": 12.538080381721416, "grad_norm": 0.03543569892644882, "learning_rate": 3.673988939343728e-06, "loss": 0.1907, "num_input_tokens_seen": 147301408, "step": 68320 }, { "epoch": 12.53899798128097, "grad_norm": 0.04861463978886604, "learning_rate": 3.673216872517187e-06, "loss": 0.0003, "num_input_tokens_seen": 147312320, "step": 68325 }, { "epoch": 12.539915580840521, "grad_norm": 0.13189265131950378, "learning_rate": 3.6724448397205785e-06, "loss": 0.0006, "num_input_tokens_seen": 147323360, "step": 68330 }, { "epoch": 12.540833180400073, "grad_norm": 25.945335388183594, "learning_rate": 3.671672840973702e-06, "loss": 0.195, "num_input_tokens_seen": 147334272, "step": 68335 }, { "epoch": 12.541750779959626, "grad_norm": 0.009262319654226303, "learning_rate": 3.670900876296357e-06, "loss": 0.1907, "num_input_tokens_seen": 147344384, "step": 68340 }, { "epoch": 12.542668379519178, "grad_norm": 0.005256434436887503, "learning_rate": 3.6701289457083466e-06, "loss": 0.2421, "num_input_tokens_seen": 147355296, "step": 68345 }, { "epoch": 12.54358597907873, "grad_norm": 0.008614701218903065, "learning_rate": 3.669357049229468e-06, "loss": 0.0006, "num_input_tokens_seen": 147366400, "step": 68350 }, { "epoch": 12.544503578638283, "grad_norm": 0.0068854535929858685, "learning_rate": 3.668585186879518e-06, "loss": 0.0002, "num_input_tokens_seen": 147378464, "step": 68355 }, { "epoch": 12.545421178197834, "grad_norm": 0.005410974845290184, "learning_rate": 3.6678133586782928e-06, "loss": 0.0098, "num_input_tokens_seen": 147389696, "step": 68360 }, { "epoch": 12.546338777757386, "grad_norm": 0.09151981770992279, "learning_rate": 3.667041564645592e-06, "loss": 0.0005, "num_input_tokens_seen": 147399808, "step": 68365 }, { "epoch": 12.54725637731694, "grad_norm": 5.210691452026367, "learning_rate": 3.666269804801209e-06, "loss": 0.4563, "num_input_tokens_seen": 147410688, "step": 68370 }, { "epoch": 12.548173976876491, "grad_norm": 0.8071838021278381, "learning_rate": 3.665498079164938e-06, "loss": 0.0794, "num_input_tokens_seen": 147421760, "step": 68375 }, { "epoch": 12.549091576436043, "grad_norm": 119.76092529296875, "learning_rate": 3.6647263877565736e-06, "loss": 0.3155, "num_input_tokens_seen": 147432640, "step": 68380 }, { "epoch": 12.550009175995596, "grad_norm": 17.397132873535156, "learning_rate": 3.6639547305959066e-06, "loss": 0.1863, "num_input_tokens_seen": 147442912, "step": 68385 }, { "epoch": 12.550926775555148, "grad_norm": 0.028173260390758514, "learning_rate": 3.6631831077027317e-06, "loss": 0.0022, "num_input_tokens_seen": 147454272, "step": 68390 }, { "epoch": 12.5518443751147, "grad_norm": 1023.3016967773438, "learning_rate": 3.662411519096837e-06, "loss": 0.2804, "num_input_tokens_seen": 147464608, "step": 68395 }, { "epoch": 12.552761974674253, "grad_norm": 16.49570083618164, "learning_rate": 3.661639964798015e-06, "loss": 0.0066, "num_input_tokens_seen": 147475584, "step": 68400 }, { "epoch": 12.553679574233804, "grad_norm": 0.04240972548723221, "learning_rate": 3.6608684448260535e-06, "loss": 0.2637, "num_input_tokens_seen": 147485024, "step": 68405 }, { "epoch": 12.554597173793356, "grad_norm": 0.031915076076984406, "learning_rate": 3.6600969592007383e-06, "loss": 0.0007, "num_input_tokens_seen": 147495904, "step": 68410 }, { "epoch": 12.55551477335291, "grad_norm": 83.98650360107422, "learning_rate": 3.6593255079418622e-06, "loss": 0.2085, "num_input_tokens_seen": 147506560, "step": 68415 }, { "epoch": 12.556432372912461, "grad_norm": 0.010011670179665089, "learning_rate": 3.6585540910692085e-06, "loss": 0.0379, "num_input_tokens_seen": 147518912, "step": 68420 }, { "epoch": 12.557349972472013, "grad_norm": 46.8653678894043, "learning_rate": 3.6577827086025613e-06, "loss": 0.0979, "num_input_tokens_seen": 147530112, "step": 68425 }, { "epoch": 12.558267572031566, "grad_norm": 0.020106028765439987, "learning_rate": 3.6570113605617095e-06, "loss": 0.0004, "num_input_tokens_seen": 147539040, "step": 68430 }, { "epoch": 12.559185171591118, "grad_norm": 0.31844234466552734, "learning_rate": 3.656240046966435e-06, "loss": 0.7253, "num_input_tokens_seen": 147549824, "step": 68435 }, { "epoch": 12.56010277115067, "grad_norm": 404.37164306640625, "learning_rate": 3.6554687678365207e-06, "loss": 0.1562, "num_input_tokens_seen": 147560928, "step": 68440 }, { "epoch": 12.561020370710223, "grad_norm": 449.8999328613281, "learning_rate": 3.6546975231917466e-06, "loss": 0.2785, "num_input_tokens_seen": 147571072, "step": 68445 }, { "epoch": 12.561937970269774, "grad_norm": 26.60919189453125, "learning_rate": 3.6539263130518976e-06, "loss": 0.0522, "num_input_tokens_seen": 147582560, "step": 68450 }, { "epoch": 12.562855569829326, "grad_norm": 82.55382537841797, "learning_rate": 3.653155137436753e-06, "loss": 0.0934, "num_input_tokens_seen": 147591872, "step": 68455 }, { "epoch": 12.56377316938888, "grad_norm": 0.04149247705936432, "learning_rate": 3.6523839963660897e-06, "loss": 0.0422, "num_input_tokens_seen": 147603040, "step": 68460 }, { "epoch": 12.56469076894843, "grad_norm": 0.004286445211619139, "learning_rate": 3.6516128898596897e-06, "loss": 0.0298, "num_input_tokens_seen": 147614304, "step": 68465 }, { "epoch": 12.565608368507982, "grad_norm": 0.021217701956629753, "learning_rate": 3.6508418179373294e-06, "loss": 0.0177, "num_input_tokens_seen": 147624320, "step": 68470 }, { "epoch": 12.566525968067536, "grad_norm": 50.29833221435547, "learning_rate": 3.650070780618785e-06, "loss": 0.1519, "num_input_tokens_seen": 147634304, "step": 68475 }, { "epoch": 12.567443567627087, "grad_norm": 0.11011944711208344, "learning_rate": 3.6492997779238325e-06, "loss": 0.1126, "num_input_tokens_seen": 147645856, "step": 68480 }, { "epoch": 12.568361167186639, "grad_norm": 10.411852836608887, "learning_rate": 3.6485288098722484e-06, "loss": 0.021, "num_input_tokens_seen": 147657344, "step": 68485 }, { "epoch": 12.569278766746192, "grad_norm": 0.017802171409130096, "learning_rate": 3.6477578764838063e-06, "loss": 0.0004, "num_input_tokens_seen": 147667136, "step": 68490 }, { "epoch": 12.570196366305744, "grad_norm": 32.553958892822266, "learning_rate": 3.646986977778277e-06, "loss": 0.2536, "num_input_tokens_seen": 147677792, "step": 68495 }, { "epoch": 12.571113965865296, "grad_norm": 3.9692955017089844, "learning_rate": 3.646216113775436e-06, "loss": 0.199, "num_input_tokens_seen": 147687872, "step": 68500 }, { "epoch": 12.57203156542485, "grad_norm": 115.5379638671875, "learning_rate": 3.6454452844950533e-06, "loss": 0.0947, "num_input_tokens_seen": 147697344, "step": 68505 }, { "epoch": 12.5729491649844, "grad_norm": 175.34388732910156, "learning_rate": 3.6446744899568975e-06, "loss": 0.1331, "num_input_tokens_seen": 147708704, "step": 68510 }, { "epoch": 12.573866764543952, "grad_norm": 133.82223510742188, "learning_rate": 3.6439037301807434e-06, "loss": 0.131, "num_input_tokens_seen": 147719936, "step": 68515 }, { "epoch": 12.574784364103506, "grad_norm": 0.001059996779076755, "learning_rate": 3.6431330051863567e-06, "loss": 0.1399, "num_input_tokens_seen": 147731904, "step": 68520 }, { "epoch": 12.575701963663057, "grad_norm": 17.14415168762207, "learning_rate": 3.642362314993505e-06, "loss": 0.3222, "num_input_tokens_seen": 147743840, "step": 68525 }, { "epoch": 12.576619563222609, "grad_norm": 136.79318237304688, "learning_rate": 3.6415916596219538e-06, "loss": 0.2447, "num_input_tokens_seen": 147752960, "step": 68530 }, { "epoch": 12.577537162782162, "grad_norm": 0.014658159576356411, "learning_rate": 3.6408210390914733e-06, "loss": 0.0007, "num_input_tokens_seen": 147763936, "step": 68535 }, { "epoch": 12.578454762341714, "grad_norm": 15.250238418579102, "learning_rate": 3.640050453421826e-06, "loss": 0.0021, "num_input_tokens_seen": 147776128, "step": 68540 }, { "epoch": 12.579372361901266, "grad_norm": 0.5355355143547058, "learning_rate": 3.6392799026327757e-06, "loss": 0.0004, "num_input_tokens_seen": 147786208, "step": 68545 }, { "epoch": 12.580289961460819, "grad_norm": 0.527706503868103, "learning_rate": 3.638509386744089e-06, "loss": 0.2237, "num_input_tokens_seen": 147797312, "step": 68550 }, { "epoch": 12.58120756102037, "grad_norm": 0.07519873976707458, "learning_rate": 3.637738905775526e-06, "loss": 0.0002, "num_input_tokens_seen": 147808160, "step": 68555 }, { "epoch": 12.582125160579922, "grad_norm": 300.2745361328125, "learning_rate": 3.636968459746847e-06, "loss": 0.3801, "num_input_tokens_seen": 147819232, "step": 68560 }, { "epoch": 12.583042760139476, "grad_norm": 40.65065002441406, "learning_rate": 3.636198048677816e-06, "loss": 0.1093, "num_input_tokens_seen": 147830208, "step": 68565 }, { "epoch": 12.583960359699027, "grad_norm": 0.010376118123531342, "learning_rate": 3.635427672588192e-06, "loss": 0.1106, "num_input_tokens_seen": 147841024, "step": 68570 }, { "epoch": 12.584877959258579, "grad_norm": 285.6410217285156, "learning_rate": 3.634657331497733e-06, "loss": 0.4893, "num_input_tokens_seen": 147851872, "step": 68575 }, { "epoch": 12.585795558818132, "grad_norm": 0.6454151272773743, "learning_rate": 3.6338870254261947e-06, "loss": 0.1507, "num_input_tokens_seen": 147862080, "step": 68580 }, { "epoch": 12.586713158377684, "grad_norm": 0.059584084898233414, "learning_rate": 3.63311675439334e-06, "loss": 0.1837, "num_input_tokens_seen": 147872128, "step": 68585 }, { "epoch": 12.587630757937236, "grad_norm": 81.9687728881836, "learning_rate": 3.6323465184189217e-06, "loss": 0.0801, "num_input_tokens_seen": 147883616, "step": 68590 }, { "epoch": 12.588548357496789, "grad_norm": 0.029455985873937607, "learning_rate": 3.6315763175226948e-06, "loss": 0.0533, "num_input_tokens_seen": 147893728, "step": 68595 }, { "epoch": 12.58946595705634, "grad_norm": 38.01779556274414, "learning_rate": 3.630806151724415e-06, "loss": 0.1831, "num_input_tokens_seen": 147904800, "step": 68600 }, { "epoch": 12.590383556615892, "grad_norm": 78.46106719970703, "learning_rate": 3.6300360210438345e-06, "loss": 0.2826, "num_input_tokens_seen": 147916512, "step": 68605 }, { "epoch": 12.591301156175446, "grad_norm": 0.32539406418800354, "learning_rate": 3.6292659255007067e-06, "loss": 0.0566, "num_input_tokens_seen": 147927424, "step": 68610 }, { "epoch": 12.592218755734997, "grad_norm": 5.300488471984863, "learning_rate": 3.628495865114785e-06, "loss": 0.0032, "num_input_tokens_seen": 147937728, "step": 68615 }, { "epoch": 12.593136355294549, "grad_norm": 0.05380158871412277, "learning_rate": 3.627725839905818e-06, "loss": 0.1193, "num_input_tokens_seen": 147947808, "step": 68620 }, { "epoch": 12.594053954854102, "grad_norm": 1.6894451379776, "learning_rate": 3.6269558498935562e-06, "loss": 0.0185, "num_input_tokens_seen": 147957472, "step": 68625 }, { "epoch": 12.594971554413654, "grad_norm": 13.930986404418945, "learning_rate": 3.626185895097748e-06, "loss": 0.234, "num_input_tokens_seen": 147967904, "step": 68630 }, { "epoch": 12.595889153973205, "grad_norm": 27.35660171508789, "learning_rate": 3.6254159755381434e-06, "loss": 0.3238, "num_input_tokens_seen": 147979776, "step": 68635 }, { "epoch": 12.596806753532759, "grad_norm": 0.03692230209708214, "learning_rate": 3.6246460912344892e-06, "loss": 0.1766, "num_input_tokens_seen": 147989280, "step": 68640 }, { "epoch": 12.59772435309231, "grad_norm": 0.042449019849300385, "learning_rate": 3.623876242206529e-06, "loss": 0.1172, "num_input_tokens_seen": 148000480, "step": 68645 }, { "epoch": 12.598641952651862, "grad_norm": 1.2203112840652466, "learning_rate": 3.623106428474012e-06, "loss": 0.0084, "num_input_tokens_seen": 148011520, "step": 68650 }, { "epoch": 12.599559552211415, "grad_norm": 0.052285220474004745, "learning_rate": 3.6223366500566815e-06, "loss": 0.0514, "num_input_tokens_seen": 148022240, "step": 68655 }, { "epoch": 12.600477151770967, "grad_norm": 0.07055933028459549, "learning_rate": 3.6215669069742804e-06, "loss": 0.0386, "num_input_tokens_seen": 148033120, "step": 68660 }, { "epoch": 12.601394751330519, "grad_norm": 0.1900438815355301, "learning_rate": 3.6207971992465495e-06, "loss": 0.0121, "num_input_tokens_seen": 148044896, "step": 68665 }, { "epoch": 12.602312350890072, "grad_norm": 0.034208960831165314, "learning_rate": 3.6200275268932362e-06, "loss": 0.0055, "num_input_tokens_seen": 148054464, "step": 68670 }, { "epoch": 12.603229950449624, "grad_norm": 0.03836643695831299, "learning_rate": 3.619257889934077e-06, "loss": 0.1535, "num_input_tokens_seen": 148064736, "step": 68675 }, { "epoch": 12.604147550009175, "grad_norm": 0.14033865928649902, "learning_rate": 3.6184882883888106e-06, "loss": 0.0036, "num_input_tokens_seen": 148075968, "step": 68680 }, { "epoch": 12.605065149568729, "grad_norm": 0.6472831964492798, "learning_rate": 3.6177187222771814e-06, "loss": 0.0005, "num_input_tokens_seen": 148087360, "step": 68685 }, { "epoch": 12.60598274912828, "grad_norm": 0.00226596905849874, "learning_rate": 3.6169491916189243e-06, "loss": 0.1324, "num_input_tokens_seen": 148097536, "step": 68690 }, { "epoch": 12.606900348687832, "grad_norm": 0.0536385215818882, "learning_rate": 3.616179696433776e-06, "loss": 0.0061, "num_input_tokens_seen": 148107552, "step": 68695 }, { "epoch": 12.607817948247385, "grad_norm": 12.152436256408691, "learning_rate": 3.6154102367414744e-06, "loss": 0.1054, "num_input_tokens_seen": 148118848, "step": 68700 }, { "epoch": 12.608735547806937, "grad_norm": 103.81131744384766, "learning_rate": 3.614640812561754e-06, "loss": 0.1737, "num_input_tokens_seen": 148129856, "step": 68705 }, { "epoch": 12.609653147366489, "grad_norm": 196.37095642089844, "learning_rate": 3.61387142391435e-06, "loss": 0.1954, "num_input_tokens_seen": 148140640, "step": 68710 }, { "epoch": 12.610570746926042, "grad_norm": 0.024324974045157433, "learning_rate": 3.6131020708189947e-06, "loss": 0.0318, "num_input_tokens_seen": 148152032, "step": 68715 }, { "epoch": 12.611488346485594, "grad_norm": 0.24679790437221527, "learning_rate": 3.6123327532954233e-06, "loss": 0.2066, "num_input_tokens_seen": 148163200, "step": 68720 }, { "epoch": 12.612405946045145, "grad_norm": 0.042551930993795395, "learning_rate": 3.6115634713633653e-06, "loss": 0.1549, "num_input_tokens_seen": 148174048, "step": 68725 }, { "epoch": 12.613323545604699, "grad_norm": 0.007378222420811653, "learning_rate": 3.610794225042551e-06, "loss": 0.0947, "num_input_tokens_seen": 148185280, "step": 68730 }, { "epoch": 12.61424114516425, "grad_norm": 0.27588680386543274, "learning_rate": 3.610025014352714e-06, "loss": 0.2281, "num_input_tokens_seen": 148196224, "step": 68735 }, { "epoch": 12.615158744723802, "grad_norm": 0.00777659984305501, "learning_rate": 3.60925583931358e-06, "loss": 0.2762, "num_input_tokens_seen": 148207424, "step": 68740 }, { "epoch": 12.616076344283355, "grad_norm": 0.16623613238334656, "learning_rate": 3.608486699944877e-06, "loss": 0.2731, "num_input_tokens_seen": 148217408, "step": 68745 }, { "epoch": 12.616993943842907, "grad_norm": 0.05444016307592392, "learning_rate": 3.6077175962663356e-06, "loss": 0.1961, "num_input_tokens_seen": 148227392, "step": 68750 }, { "epoch": 12.617911543402458, "grad_norm": 1.4307754039764404, "learning_rate": 3.6069485282976796e-06, "loss": 0.1237, "num_input_tokens_seen": 148238784, "step": 68755 }, { "epoch": 12.618829142962012, "grad_norm": 0.15042050182819366, "learning_rate": 3.606179496058635e-06, "loss": 0.0767, "num_input_tokens_seen": 148249280, "step": 68760 }, { "epoch": 12.619746742521563, "grad_norm": 0.014299729838967323, "learning_rate": 3.6054104995689244e-06, "loss": 0.0828, "num_input_tokens_seen": 148261152, "step": 68765 }, { "epoch": 12.620664342081115, "grad_norm": 0.03224664553999901, "learning_rate": 3.6046415388482746e-06, "loss": 0.0013, "num_input_tokens_seen": 148272928, "step": 68770 }, { "epoch": 12.621581941640668, "grad_norm": 0.0019358256831765175, "learning_rate": 3.603872613916407e-06, "loss": 0.2732, "num_input_tokens_seen": 148282944, "step": 68775 }, { "epoch": 12.62249954120022, "grad_norm": 0.4339642822742462, "learning_rate": 3.603103724793041e-06, "loss": 0.2072, "num_input_tokens_seen": 148293824, "step": 68780 }, { "epoch": 12.623417140759772, "grad_norm": 0.13824224472045898, "learning_rate": 3.602334871497901e-06, "loss": 0.0017, "num_input_tokens_seen": 148304288, "step": 68785 }, { "epoch": 12.624334740319325, "grad_norm": 0.03185513988137245, "learning_rate": 3.601566054050706e-06, "loss": 0.0016, "num_input_tokens_seen": 148315136, "step": 68790 }, { "epoch": 12.625252339878877, "grad_norm": 0.2228115200996399, "learning_rate": 3.6007972724711737e-06, "loss": 0.1524, "num_input_tokens_seen": 148326784, "step": 68795 }, { "epoch": 12.626169939438428, "grad_norm": 69.25654602050781, "learning_rate": 3.6000285267790223e-06, "loss": 0.2386, "num_input_tokens_seen": 148336416, "step": 68800 }, { "epoch": 12.627087538997982, "grad_norm": 0.009266051463782787, "learning_rate": 3.5992598169939702e-06, "loss": 0.0054, "num_input_tokens_seen": 148346240, "step": 68805 }, { "epoch": 12.628005138557533, "grad_norm": 0.15163558721542358, "learning_rate": 3.598491143135733e-06, "loss": 0.069, "num_input_tokens_seen": 148357568, "step": 68810 }, { "epoch": 12.628922738117085, "grad_norm": 0.03054668754339218, "learning_rate": 3.597722505224025e-06, "loss": 0.1207, "num_input_tokens_seen": 148368384, "step": 68815 }, { "epoch": 12.629840337676638, "grad_norm": 0.06693893671035767, "learning_rate": 3.5969539032785622e-06, "loss": 0.2037, "num_input_tokens_seen": 148377856, "step": 68820 }, { "epoch": 12.63075793723619, "grad_norm": 3.1668686866760254, "learning_rate": 3.5961853373190566e-06, "loss": 0.2114, "num_input_tokens_seen": 148388416, "step": 68825 }, { "epoch": 12.631675536795742, "grad_norm": 18.67601203918457, "learning_rate": 3.5954168073652207e-06, "loss": 0.0114, "num_input_tokens_seen": 148399808, "step": 68830 }, { "epoch": 12.632593136355295, "grad_norm": 0.05403520539402962, "learning_rate": 3.5946483134367684e-06, "loss": 0.2489, "num_input_tokens_seen": 148409760, "step": 68835 }, { "epoch": 12.633510735914847, "grad_norm": 0.549847424030304, "learning_rate": 3.5938798555534084e-06, "loss": 0.1279, "num_input_tokens_seen": 148419616, "step": 68840 }, { "epoch": 12.634428335474398, "grad_norm": 0.1326902210712433, "learning_rate": 3.593111433734851e-06, "loss": 0.2097, "num_input_tokens_seen": 148430080, "step": 68845 }, { "epoch": 12.635345935033952, "grad_norm": 0.012426236644387245, "learning_rate": 3.5923430480008028e-06, "loss": 0.0285, "num_input_tokens_seen": 148440608, "step": 68850 }, { "epoch": 12.636263534593503, "grad_norm": 0.6349458694458008, "learning_rate": 3.591574698370976e-06, "loss": 0.3602, "num_input_tokens_seen": 148451456, "step": 68855 }, { "epoch": 12.637181134153055, "grad_norm": 40.39447021484375, "learning_rate": 3.5908063848650755e-06, "loss": 0.3137, "num_input_tokens_seen": 148461600, "step": 68860 }, { "epoch": 12.638098733712608, "grad_norm": 0.08092646300792694, "learning_rate": 3.5900381075028045e-06, "loss": 0.0544, "num_input_tokens_seen": 148471616, "step": 68865 }, { "epoch": 12.63901633327216, "grad_norm": 89.66602325439453, "learning_rate": 3.589269866303873e-06, "loss": 0.2988, "num_input_tokens_seen": 148482784, "step": 68870 }, { "epoch": 12.639933932831712, "grad_norm": 44.49826431274414, "learning_rate": 3.588501661287983e-06, "loss": 0.1603, "num_input_tokens_seen": 148494144, "step": 68875 }, { "epoch": 12.640851532391265, "grad_norm": 0.017970673739910126, "learning_rate": 3.587733492474835e-06, "loss": 0.2191, "num_input_tokens_seen": 148504128, "step": 68880 }, { "epoch": 12.641769131950817, "grad_norm": 0.5210508108139038, "learning_rate": 3.5869653598841376e-06, "loss": 0.1318, "num_input_tokens_seen": 148515104, "step": 68885 }, { "epoch": 12.642686731510368, "grad_norm": 0.42693135142326355, "learning_rate": 3.586197263535588e-06, "loss": 0.0004, "num_input_tokens_seen": 148525696, "step": 68890 }, { "epoch": 12.643604331069922, "grad_norm": 2.082634210586548, "learning_rate": 3.585429203448888e-06, "loss": 0.101, "num_input_tokens_seen": 148536288, "step": 68895 }, { "epoch": 12.644521930629473, "grad_norm": 0.4070821702480316, "learning_rate": 3.5846611796437337e-06, "loss": 0.1382, "num_input_tokens_seen": 148547584, "step": 68900 }, { "epoch": 12.645439530189025, "grad_norm": 75.35182189941406, "learning_rate": 3.583893192139829e-06, "loss": 0.1866, "num_input_tokens_seen": 148558048, "step": 68905 }, { "epoch": 12.646357129748578, "grad_norm": 0.1619628369808197, "learning_rate": 3.5831252409568683e-06, "loss": 0.021, "num_input_tokens_seen": 148569024, "step": 68910 }, { "epoch": 12.64727472930813, "grad_norm": 0.20938386023044586, "learning_rate": 3.5823573261145496e-06, "loss": 0.1632, "num_input_tokens_seen": 148579712, "step": 68915 }, { "epoch": 12.648192328867681, "grad_norm": 37.658443450927734, "learning_rate": 3.5815894476325685e-06, "loss": 0.2026, "num_input_tokens_seen": 148589120, "step": 68920 }, { "epoch": 12.649109928427235, "grad_norm": 0.03915315121412277, "learning_rate": 3.5808216055306187e-06, "loss": 0.0006, "num_input_tokens_seen": 148600736, "step": 68925 }, { "epoch": 12.650027527986786, "grad_norm": 0.004628991708159447, "learning_rate": 3.5800537998283957e-06, "loss": 0.0007, "num_input_tokens_seen": 148610688, "step": 68930 }, { "epoch": 12.650945127546338, "grad_norm": 5.693124294281006, "learning_rate": 3.5792860305455923e-06, "loss": 0.0704, "num_input_tokens_seen": 148621728, "step": 68935 }, { "epoch": 12.651862727105891, "grad_norm": 1.208288550376892, "learning_rate": 3.5785182977019005e-06, "loss": 0.0008, "num_input_tokens_seen": 148632352, "step": 68940 }, { "epoch": 12.652780326665443, "grad_norm": 0.1677228808403015, "learning_rate": 3.5777506013170115e-06, "loss": 0.0005, "num_input_tokens_seen": 148643328, "step": 68945 }, { "epoch": 12.653697926224995, "grad_norm": 0.03193558380007744, "learning_rate": 3.5769829414106137e-06, "loss": 0.2208, "num_input_tokens_seen": 148654304, "step": 68950 }, { "epoch": 12.654615525784548, "grad_norm": 64.23912048339844, "learning_rate": 3.5762153180023997e-06, "loss": 0.5199, "num_input_tokens_seen": 148664384, "step": 68955 }, { "epoch": 12.6555331253441, "grad_norm": 0.11253490298986435, "learning_rate": 3.5754477311120562e-06, "loss": 0.0036, "num_input_tokens_seen": 148674464, "step": 68960 }, { "epoch": 12.656450724903651, "grad_norm": 37.238075256347656, "learning_rate": 3.574680180759268e-06, "loss": 0.0209, "num_input_tokens_seen": 148685248, "step": 68965 }, { "epoch": 12.657368324463205, "grad_norm": 0.10665638744831085, "learning_rate": 3.573912666963726e-06, "loss": 0.0249, "num_input_tokens_seen": 148696032, "step": 68970 }, { "epoch": 12.658285924022756, "grad_norm": 0.8091866970062256, "learning_rate": 3.5731451897451146e-06, "loss": 0.1866, "num_input_tokens_seen": 148706720, "step": 68975 }, { "epoch": 12.659203523582308, "grad_norm": 0.06700006872415543, "learning_rate": 3.572377749123117e-06, "loss": 0.2421, "num_input_tokens_seen": 148717952, "step": 68980 }, { "epoch": 12.660121123141861, "grad_norm": 0.09591545164585114, "learning_rate": 3.571610345117416e-06, "loss": 0.1327, "num_input_tokens_seen": 148728704, "step": 68985 }, { "epoch": 12.661038722701413, "grad_norm": 0.055587511509656906, "learning_rate": 3.570842977747698e-06, "loss": 0.0658, "num_input_tokens_seen": 148739136, "step": 68990 }, { "epoch": 12.661956322260965, "grad_norm": 13.429603576660156, "learning_rate": 3.5700756470336422e-06, "loss": 0.1032, "num_input_tokens_seen": 148748512, "step": 68995 }, { "epoch": 12.662873921820518, "grad_norm": 229.9529266357422, "learning_rate": 3.569308352994928e-06, "loss": 0.0892, "num_input_tokens_seen": 148758464, "step": 69000 }, { "epoch": 12.66379152138007, "grad_norm": 0.060941431671381, "learning_rate": 3.568541095651239e-06, "loss": 0.1042, "num_input_tokens_seen": 148769472, "step": 69005 }, { "epoch": 12.664709120939621, "grad_norm": 0.1866534948348999, "learning_rate": 3.5677738750222526e-06, "loss": 0.007, "num_input_tokens_seen": 148780256, "step": 69010 }, { "epoch": 12.665626720499175, "grad_norm": 207.5383758544922, "learning_rate": 3.567006691127646e-06, "loss": 0.239, "num_input_tokens_seen": 148791648, "step": 69015 }, { "epoch": 12.666544320058726, "grad_norm": 86.57421112060547, "learning_rate": 3.5662395439870956e-06, "loss": 0.0243, "num_input_tokens_seen": 148802048, "step": 69020 }, { "epoch": 12.667461919618278, "grad_norm": 42.810977935791016, "learning_rate": 3.565472433620281e-06, "loss": 0.06, "num_input_tokens_seen": 148812992, "step": 69025 }, { "epoch": 12.668379519177831, "grad_norm": 0.12903998792171478, "learning_rate": 3.5647053600468747e-06, "loss": 0.0218, "num_input_tokens_seen": 148822784, "step": 69030 }, { "epoch": 12.669297118737383, "grad_norm": 24.436222076416016, "learning_rate": 3.56393832328655e-06, "loss": 0.1795, "num_input_tokens_seen": 148833248, "step": 69035 }, { "epoch": 12.670214718296934, "grad_norm": 0.3258926272392273, "learning_rate": 3.563171323358983e-06, "loss": 0.0008, "num_input_tokens_seen": 148843840, "step": 69040 }, { "epoch": 12.671132317856488, "grad_norm": 0.17483298480510712, "learning_rate": 3.5624043602838447e-06, "loss": 0.1352, "num_input_tokens_seen": 148854112, "step": 69045 }, { "epoch": 12.67204991741604, "grad_norm": 0.029911531135439873, "learning_rate": 3.561637434080805e-06, "loss": 0.0151, "num_input_tokens_seen": 148865504, "step": 69050 }, { "epoch": 12.672967516975591, "grad_norm": 0.05924814194440842, "learning_rate": 3.560870544769537e-06, "loss": 0.0964, "num_input_tokens_seen": 148876544, "step": 69055 }, { "epoch": 12.673885116535144, "grad_norm": 0.08323792368173599, "learning_rate": 3.560103692369709e-06, "loss": 0.0491, "num_input_tokens_seen": 148887872, "step": 69060 }, { "epoch": 12.674802716094696, "grad_norm": 0.005934738088399172, "learning_rate": 3.5593368769009884e-06, "loss": 0.132, "num_input_tokens_seen": 148898560, "step": 69065 }, { "epoch": 12.675720315654248, "grad_norm": 0.12791737914085388, "learning_rate": 3.558570098383045e-06, "loss": 0.1153, "num_input_tokens_seen": 148910176, "step": 69070 }, { "epoch": 12.676637915213801, "grad_norm": 4.747975826263428, "learning_rate": 3.557803356835546e-06, "loss": 0.0025, "num_input_tokens_seen": 148921600, "step": 69075 }, { "epoch": 12.677555514773353, "grad_norm": 40.570438385009766, "learning_rate": 3.5570366522781546e-06, "loss": 0.2265, "num_input_tokens_seen": 148932704, "step": 69080 }, { "epoch": 12.678473114332904, "grad_norm": 165.35482788085938, "learning_rate": 3.5562699847305347e-06, "loss": 0.1815, "num_input_tokens_seen": 148943680, "step": 69085 }, { "epoch": 12.679390713892458, "grad_norm": 0.05603761225938797, "learning_rate": 3.555503354212355e-06, "loss": 0.0004, "num_input_tokens_seen": 148953792, "step": 69090 }, { "epoch": 12.68030831345201, "grad_norm": 665.6756591796875, "learning_rate": 3.5547367607432748e-06, "loss": 0.3254, "num_input_tokens_seen": 148963776, "step": 69095 }, { "epoch": 12.681225913011561, "grad_norm": 0.1077134907245636, "learning_rate": 3.553970204342955e-06, "loss": 0.0006, "num_input_tokens_seen": 148975040, "step": 69100 }, { "epoch": 12.682143512571114, "grad_norm": 168.6891326904297, "learning_rate": 3.55320368503106e-06, "loss": 0.0189, "num_input_tokens_seen": 148986816, "step": 69105 }, { "epoch": 12.683061112130666, "grad_norm": 0.06387186050415039, "learning_rate": 3.552437202827248e-06, "loss": 0.1874, "num_input_tokens_seen": 148996992, "step": 69110 }, { "epoch": 12.683978711690218, "grad_norm": 203.6950225830078, "learning_rate": 3.551670757751178e-06, "loss": 0.2211, "num_input_tokens_seen": 149007552, "step": 69115 }, { "epoch": 12.684896311249771, "grad_norm": 45.308956146240234, "learning_rate": 3.5509043498225077e-06, "loss": 0.0771, "num_input_tokens_seen": 149017024, "step": 69120 }, { "epoch": 12.685813910809323, "grad_norm": 49.90207290649414, "learning_rate": 3.5501379790608957e-06, "loss": 0.4414, "num_input_tokens_seen": 149027904, "step": 69125 }, { "epoch": 12.686731510368874, "grad_norm": 0.24570080637931824, "learning_rate": 3.5493716454859985e-06, "loss": 0.2109, "num_input_tokens_seen": 149038752, "step": 69130 }, { "epoch": 12.687649109928428, "grad_norm": 0.038288362324237823, "learning_rate": 3.5486053491174687e-06, "loss": 0.0066, "num_input_tokens_seen": 149049920, "step": 69135 }, { "epoch": 12.68856670948798, "grad_norm": 0.3537328839302063, "learning_rate": 3.5478390899749633e-06, "loss": 0.1758, "num_input_tokens_seen": 149060256, "step": 69140 }, { "epoch": 12.68948430904753, "grad_norm": 35.24006652832031, "learning_rate": 3.5470728680781334e-06, "loss": 0.2526, "num_input_tokens_seen": 149071328, "step": 69145 }, { "epoch": 12.690401908607084, "grad_norm": 1.4092469215393066, "learning_rate": 3.546306683446633e-06, "loss": 0.0007, "num_input_tokens_seen": 149081952, "step": 69150 }, { "epoch": 12.691319508166636, "grad_norm": 118.70010375976562, "learning_rate": 3.5455405361001136e-06, "loss": 0.0947, "num_input_tokens_seen": 149092448, "step": 69155 }, { "epoch": 12.692237107726188, "grad_norm": 3.4176430702209473, "learning_rate": 3.544774426058226e-06, "loss": 0.1505, "num_input_tokens_seen": 149104288, "step": 69160 }, { "epoch": 12.693154707285741, "grad_norm": 62.00737380981445, "learning_rate": 3.544008353340618e-06, "loss": 0.2342, "num_input_tokens_seen": 149115552, "step": 69165 }, { "epoch": 12.694072306845293, "grad_norm": 0.4152607023715973, "learning_rate": 3.5432423179669384e-06, "loss": 0.0179, "num_input_tokens_seen": 149127712, "step": 69170 }, { "epoch": 12.694989906404844, "grad_norm": 0.13909819722175598, "learning_rate": 3.542476319956837e-06, "loss": 0.0008, "num_input_tokens_seen": 149139712, "step": 69175 }, { "epoch": 12.695907505964398, "grad_norm": 4.176516532897949, "learning_rate": 3.5417103593299586e-06, "loss": 0.1175, "num_input_tokens_seen": 149151424, "step": 69180 }, { "epoch": 12.69682510552395, "grad_norm": 0.045709218829870224, "learning_rate": 3.5409444361059474e-06, "loss": 0.0004, "num_input_tokens_seen": 149162208, "step": 69185 }, { "epoch": 12.6977427050835, "grad_norm": 150.22445678710938, "learning_rate": 3.5401785503044523e-06, "loss": 0.1103, "num_input_tokens_seen": 149171296, "step": 69190 }, { "epoch": 12.698660304643054, "grad_norm": 0.19517581164836884, "learning_rate": 3.5394127019451153e-06, "loss": 0.0101, "num_input_tokens_seen": 149182272, "step": 69195 }, { "epoch": 12.699577904202606, "grad_norm": 0.274427592754364, "learning_rate": 3.5386468910475756e-06, "loss": 0.0032, "num_input_tokens_seen": 149192832, "step": 69200 }, { "epoch": 12.700495503762157, "grad_norm": 0.19498242437839508, "learning_rate": 3.5378811176314813e-06, "loss": 0.0631, "num_input_tokens_seen": 149204736, "step": 69205 }, { "epoch": 12.70141310332171, "grad_norm": 412.05377197265625, "learning_rate": 3.5371153817164706e-06, "loss": 0.233, "num_input_tokens_seen": 149216128, "step": 69210 }, { "epoch": 12.702330702881262, "grad_norm": 0.012414233759045601, "learning_rate": 3.5363496833221824e-06, "loss": 0.2257, "num_input_tokens_seen": 149226432, "step": 69215 }, { "epoch": 12.703248302440814, "grad_norm": 21.97325325012207, "learning_rate": 3.5355840224682537e-06, "loss": 0.1479, "num_input_tokens_seen": 149236384, "step": 69220 }, { "epoch": 12.704165902000367, "grad_norm": 31.80832862854004, "learning_rate": 3.534818399174328e-06, "loss": 0.2165, "num_input_tokens_seen": 149247552, "step": 69225 }, { "epoch": 12.705083501559919, "grad_norm": 2.836310386657715, "learning_rate": 3.5340528134600393e-06, "loss": 0.0181, "num_input_tokens_seen": 149258272, "step": 69230 }, { "epoch": 12.70600110111947, "grad_norm": 0.24996063113212585, "learning_rate": 3.5332872653450236e-06, "loss": 0.0035, "num_input_tokens_seen": 149268256, "step": 69235 }, { "epoch": 12.706918700679024, "grad_norm": 0.04447806254029274, "learning_rate": 3.5325217548489167e-06, "loss": 0.1351, "num_input_tokens_seen": 149279296, "step": 69240 }, { "epoch": 12.707836300238576, "grad_norm": 88.13701629638672, "learning_rate": 3.531756281991352e-06, "loss": 0.4963, "num_input_tokens_seen": 149288992, "step": 69245 }, { "epoch": 12.708753899798127, "grad_norm": 77.07515716552734, "learning_rate": 3.5309908467919616e-06, "loss": 0.3353, "num_input_tokens_seen": 149299840, "step": 69250 }, { "epoch": 12.70967149935768, "grad_norm": 35.933509826660156, "learning_rate": 3.530225449270382e-06, "loss": 0.0937, "num_input_tokens_seen": 149310240, "step": 69255 }, { "epoch": 12.710589098917232, "grad_norm": 0.5554715394973755, "learning_rate": 3.5294600894462405e-06, "loss": 0.1071, "num_input_tokens_seen": 149321504, "step": 69260 }, { "epoch": 12.711506698476784, "grad_norm": 106.62171936035156, "learning_rate": 3.5286947673391693e-06, "loss": 0.2133, "num_input_tokens_seen": 149331968, "step": 69265 }, { "epoch": 12.712424298036337, "grad_norm": 0.1262957751750946, "learning_rate": 3.527929482968795e-06, "loss": 0.0003, "num_input_tokens_seen": 149343296, "step": 69270 }, { "epoch": 12.713341897595889, "grad_norm": 52.38169479370117, "learning_rate": 3.52716423635475e-06, "loss": 0.168, "num_input_tokens_seen": 149354560, "step": 69275 }, { "epoch": 12.71425949715544, "grad_norm": 63.20166015625, "learning_rate": 3.5263990275166585e-06, "loss": 0.229, "num_input_tokens_seen": 149366112, "step": 69280 }, { "epoch": 12.715177096714994, "grad_norm": 12.091425895690918, "learning_rate": 3.525633856474147e-06, "loss": 0.0062, "num_input_tokens_seen": 149377728, "step": 69285 }, { "epoch": 12.716094696274546, "grad_norm": 0.14355725049972534, "learning_rate": 3.5248687232468437e-06, "loss": 0.0999, "num_input_tokens_seen": 149387584, "step": 69290 }, { "epoch": 12.717012295834097, "grad_norm": 0.009086661040782928, "learning_rate": 3.5241036278543716e-06, "loss": 0.5188, "num_input_tokens_seen": 149397632, "step": 69295 }, { "epoch": 12.71792989539365, "grad_norm": 766.05126953125, "learning_rate": 3.5233385703163547e-06, "loss": 0.5553, "num_input_tokens_seen": 149408896, "step": 69300 }, { "epoch": 12.718847494953202, "grad_norm": 0.48156100511550903, "learning_rate": 3.522573550652412e-06, "loss": 0.1751, "num_input_tokens_seen": 149420000, "step": 69305 }, { "epoch": 12.719765094512754, "grad_norm": 10.785538673400879, "learning_rate": 3.52180856888217e-06, "loss": 0.1207, "num_input_tokens_seen": 149431456, "step": 69310 }, { "epoch": 12.720682694072307, "grad_norm": 0.39428189396858215, "learning_rate": 3.521043625025248e-06, "loss": 0.0054, "num_input_tokens_seen": 149442528, "step": 69315 }, { "epoch": 12.721600293631859, "grad_norm": 7.815688610076904, "learning_rate": 3.520278719101261e-06, "loss": 0.2681, "num_input_tokens_seen": 149453568, "step": 69320 }, { "epoch": 12.72251789319141, "grad_norm": 0.10526970028877258, "learning_rate": 3.5195138511298356e-06, "loss": 0.0009, "num_input_tokens_seen": 149464352, "step": 69325 }, { "epoch": 12.723435492750964, "grad_norm": 0.0145493745803833, "learning_rate": 3.5187490211305842e-06, "loss": 0.0126, "num_input_tokens_seen": 149475648, "step": 69330 }, { "epoch": 12.724353092310515, "grad_norm": 0.035724058747291565, "learning_rate": 3.5179842291231238e-06, "loss": 0.0008, "num_input_tokens_seen": 149487488, "step": 69335 }, { "epoch": 12.725270691870067, "grad_norm": 0.3210432827472687, "learning_rate": 3.517219475127071e-06, "loss": 0.182, "num_input_tokens_seen": 149498880, "step": 69340 }, { "epoch": 12.72618829142962, "grad_norm": 253.4755859375, "learning_rate": 3.5164547591620417e-06, "loss": 0.1132, "num_input_tokens_seen": 149510112, "step": 69345 }, { "epoch": 12.727105890989172, "grad_norm": 0.11441142857074738, "learning_rate": 3.5156900812476487e-06, "loss": 0.1647, "num_input_tokens_seen": 149520128, "step": 69350 }, { "epoch": 12.728023490548724, "grad_norm": 0.14290015399456024, "learning_rate": 3.5149254414035035e-06, "loss": 0.2775, "num_input_tokens_seen": 149531392, "step": 69355 }, { "epoch": 12.728941090108277, "grad_norm": 1.1525369882583618, "learning_rate": 3.51416083964922e-06, "loss": 0.0253, "num_input_tokens_seen": 149540480, "step": 69360 }, { "epoch": 12.729858689667829, "grad_norm": 0.8467138409614563, "learning_rate": 3.5133962760044073e-06, "loss": 0.0006, "num_input_tokens_seen": 149552512, "step": 69365 }, { "epoch": 12.73077628922738, "grad_norm": 56.38640213012695, "learning_rate": 3.512631750488675e-06, "loss": 0.212, "num_input_tokens_seen": 149562112, "step": 69370 }, { "epoch": 12.731693888786934, "grad_norm": 0.005080007947981358, "learning_rate": 3.5118672631216353e-06, "loss": 0.0083, "num_input_tokens_seen": 149573888, "step": 69375 }, { "epoch": 12.732611488346485, "grad_norm": 0.03826470300555229, "learning_rate": 3.511102813922894e-06, "loss": 0.015, "num_input_tokens_seen": 149585472, "step": 69380 }, { "epoch": 12.733529087906037, "grad_norm": 139.9932098388672, "learning_rate": 3.5103384029120557e-06, "loss": 0.2082, "num_input_tokens_seen": 149596160, "step": 69385 }, { "epoch": 12.73444668746559, "grad_norm": 0.012695299461483955, "learning_rate": 3.50957403010873e-06, "loss": 0.1349, "num_input_tokens_seen": 149605312, "step": 69390 }, { "epoch": 12.735364287025142, "grad_norm": 177.415283203125, "learning_rate": 3.5088096955325215e-06, "loss": 0.0372, "num_input_tokens_seen": 149615072, "step": 69395 }, { "epoch": 12.736281886584694, "grad_norm": 0.007840296253561974, "learning_rate": 3.5080453992030327e-06, "loss": 0.2422, "num_input_tokens_seen": 149627232, "step": 69400 }, { "epoch": 12.737199486144247, "grad_norm": 115.22298431396484, "learning_rate": 3.5072811411398646e-06, "loss": 0.0966, "num_input_tokens_seen": 149636160, "step": 69405 }, { "epoch": 12.738117085703799, "grad_norm": 39.97060775756836, "learning_rate": 3.506516921362624e-06, "loss": 0.0483, "num_input_tokens_seen": 149646560, "step": 69410 }, { "epoch": 12.73903468526335, "grad_norm": 62.206825256347656, "learning_rate": 3.5057527398909103e-06, "loss": 0.0188, "num_input_tokens_seen": 149657088, "step": 69415 }, { "epoch": 12.739952284822904, "grad_norm": 103.60966491699219, "learning_rate": 3.5049885967443205e-06, "loss": 0.2666, "num_input_tokens_seen": 149666176, "step": 69420 }, { "epoch": 12.740869884382455, "grad_norm": 0.007529311813414097, "learning_rate": 3.504224491942458e-06, "loss": 0.0708, "num_input_tokens_seen": 149677312, "step": 69425 }, { "epoch": 12.741787483942007, "grad_norm": 69.93539428710938, "learning_rate": 3.503460425504919e-06, "loss": 0.5224, "num_input_tokens_seen": 149686368, "step": 69430 }, { "epoch": 12.74270508350156, "grad_norm": 142.7610321044922, "learning_rate": 3.5026963974513007e-06, "loss": 0.0545, "num_input_tokens_seen": 149696480, "step": 69435 }, { "epoch": 12.743622683061112, "grad_norm": 0.049721475690603256, "learning_rate": 3.5019324078011973e-06, "loss": 0.1443, "num_input_tokens_seen": 149706720, "step": 69440 }, { "epoch": 12.744540282620664, "grad_norm": 0.08878152817487717, "learning_rate": 3.5011684565742083e-06, "loss": 0.0005, "num_input_tokens_seen": 149718624, "step": 69445 }, { "epoch": 12.745457882180217, "grad_norm": 0.0811871886253357, "learning_rate": 3.5004045437899248e-06, "loss": 0.3992, "num_input_tokens_seen": 149728224, "step": 69450 }, { "epoch": 12.746375481739769, "grad_norm": 1.1709941625595093, "learning_rate": 3.4996406694679395e-06, "loss": 0.1078, "num_input_tokens_seen": 149740160, "step": 69455 }, { "epoch": 12.74729308129932, "grad_norm": 0.1537131518125534, "learning_rate": 3.4988768336278473e-06, "loss": 0.0013, "num_input_tokens_seen": 149751040, "step": 69460 }, { "epoch": 12.748210680858874, "grad_norm": 0.05227890983223915, "learning_rate": 3.498113036289236e-06, "loss": 0.0406, "num_input_tokens_seen": 149762112, "step": 69465 }, { "epoch": 12.749128280418425, "grad_norm": 91.00231170654297, "learning_rate": 3.497349277471698e-06, "loss": 0.3526, "num_input_tokens_seen": 149773216, "step": 69470 }, { "epoch": 12.750045879977977, "grad_norm": 4.561721324920654, "learning_rate": 3.4965855571948236e-06, "loss": 0.0444, "num_input_tokens_seen": 149782080, "step": 69475 }, { "epoch": 12.75096347953753, "grad_norm": 0.04530675336718559, "learning_rate": 3.495821875478199e-06, "loss": 0.1253, "num_input_tokens_seen": 149791968, "step": 69480 }, { "epoch": 12.751881079097082, "grad_norm": 64.74153900146484, "learning_rate": 3.495058232341412e-06, "loss": 0.4021, "num_input_tokens_seen": 149803136, "step": 69485 }, { "epoch": 12.752798678656633, "grad_norm": 196.2676239013672, "learning_rate": 3.4942946278040475e-06, "loss": 0.1704, "num_input_tokens_seen": 149813760, "step": 69490 }, { "epoch": 12.753716278216187, "grad_norm": 80.75334930419922, "learning_rate": 3.4935310618856932e-06, "loss": 0.1715, "num_input_tokens_seen": 149825056, "step": 69495 }, { "epoch": 12.754633877775738, "grad_norm": 0.0037719260435551405, "learning_rate": 3.492767534605933e-06, "loss": 0.1198, "num_input_tokens_seen": 149835168, "step": 69500 }, { "epoch": 12.755551477335292, "grad_norm": 2.64849591255188, "learning_rate": 3.4920040459843475e-06, "loss": 0.0052, "num_input_tokens_seen": 149846144, "step": 69505 }, { "epoch": 12.756469076894843, "grad_norm": 9.737456321716309, "learning_rate": 3.4912405960405225e-06, "loss": 0.1534, "num_input_tokens_seen": 149857312, "step": 69510 }, { "epoch": 12.757386676454395, "grad_norm": 0.00978916697204113, "learning_rate": 3.490477184794039e-06, "loss": 0.1452, "num_input_tokens_seen": 149868000, "step": 69515 }, { "epoch": 12.758304276013948, "grad_norm": 0.03529202193021774, "learning_rate": 3.4897138122644737e-06, "loss": 0.0009, "num_input_tokens_seen": 149878752, "step": 69520 }, { "epoch": 12.7592218755735, "grad_norm": 0.0773160457611084, "learning_rate": 3.48895047847141e-06, "loss": 0.0884, "num_input_tokens_seen": 149888608, "step": 69525 }, { "epoch": 12.760139475133052, "grad_norm": 5.051259517669678, "learning_rate": 3.4881871834344242e-06, "loss": 0.1937, "num_input_tokens_seen": 149899200, "step": 69530 }, { "epoch": 12.761057074692605, "grad_norm": 0.7885854244232178, "learning_rate": 3.487423927173095e-06, "loss": 0.0006, "num_input_tokens_seen": 149910656, "step": 69535 }, { "epoch": 12.761974674252157, "grad_norm": 236.51596069335938, "learning_rate": 3.4866607097069948e-06, "loss": 0.1496, "num_input_tokens_seen": 149922496, "step": 69540 }, { "epoch": 12.762892273811708, "grad_norm": 0.04214801266789436, "learning_rate": 3.485897531055705e-06, "loss": 0.4343, "num_input_tokens_seen": 149934208, "step": 69545 }, { "epoch": 12.763809873371262, "grad_norm": 81.9190902709961, "learning_rate": 3.485134391238796e-06, "loss": 0.1263, "num_input_tokens_seen": 149945120, "step": 69550 }, { "epoch": 12.764727472930813, "grad_norm": 0.5817409753799438, "learning_rate": 3.484371290275842e-06, "loss": 0.0012, "num_input_tokens_seen": 149957280, "step": 69555 }, { "epoch": 12.765645072490365, "grad_norm": 0.12836386263370514, "learning_rate": 3.4836082281864148e-06, "loss": 0.1363, "num_input_tokens_seen": 149967968, "step": 69560 }, { "epoch": 12.766562672049918, "grad_norm": 127.83783721923828, "learning_rate": 3.4828452049900875e-06, "loss": 0.3969, "num_input_tokens_seen": 149978624, "step": 69565 }, { "epoch": 12.76748027160947, "grad_norm": 0.040615130215883255, "learning_rate": 3.4820822207064276e-06, "loss": 0.002, "num_input_tokens_seen": 149989024, "step": 69570 }, { "epoch": 12.768397871169022, "grad_norm": 0.020247163251042366, "learning_rate": 3.4813192753550086e-06, "loss": 0.0004, "num_input_tokens_seen": 150000128, "step": 69575 }, { "epoch": 12.769315470728575, "grad_norm": 0.004096606746315956, "learning_rate": 3.4805563689553954e-06, "loss": 0.1036, "num_input_tokens_seen": 150010624, "step": 69580 }, { "epoch": 12.770233070288127, "grad_norm": 220.70162963867188, "learning_rate": 3.4797935015271566e-06, "loss": 0.0563, "num_input_tokens_seen": 150022144, "step": 69585 }, { "epoch": 12.771150669847678, "grad_norm": 0.10198252648115158, "learning_rate": 3.479030673089856e-06, "loss": 0.1943, "num_input_tokens_seen": 150033696, "step": 69590 }, { "epoch": 12.772068269407232, "grad_norm": 223.02626037597656, "learning_rate": 3.478267883663064e-06, "loss": 0.4921, "num_input_tokens_seen": 150045216, "step": 69595 }, { "epoch": 12.772985868966783, "grad_norm": 109.39836120605469, "learning_rate": 3.4775051332663424e-06, "loss": 0.4681, "num_input_tokens_seen": 150055616, "step": 69600 }, { "epoch": 12.773903468526335, "grad_norm": 0.4287548065185547, "learning_rate": 3.476742421919252e-06, "loss": 0.1539, "num_input_tokens_seen": 150066976, "step": 69605 }, { "epoch": 12.774821068085888, "grad_norm": 0.03297165408730507, "learning_rate": 3.4759797496413593e-06, "loss": 0.1415, "num_input_tokens_seen": 150077216, "step": 69610 }, { "epoch": 12.77573866764544, "grad_norm": 0.19718770682811737, "learning_rate": 3.4752171164522254e-06, "loss": 0.136, "num_input_tokens_seen": 150088736, "step": 69615 }, { "epoch": 12.776656267204991, "grad_norm": 60.47304916381836, "learning_rate": 3.4744545223714078e-06, "loss": 0.0133, "num_input_tokens_seen": 150100064, "step": 69620 }, { "epoch": 12.777573866764545, "grad_norm": 0.2593124210834503, "learning_rate": 3.4736919674184657e-06, "loss": 0.2542, "num_input_tokens_seen": 150110848, "step": 69625 }, { "epoch": 12.778491466324096, "grad_norm": 0.03959601745009422, "learning_rate": 3.4729294516129614e-06, "loss": 0.1929, "num_input_tokens_seen": 150120864, "step": 69630 }, { "epoch": 12.779409065883648, "grad_norm": 0.015302443876862526, "learning_rate": 3.4721669749744502e-06, "loss": 0.1961, "num_input_tokens_seen": 150131104, "step": 69635 }, { "epoch": 12.780326665443202, "grad_norm": 0.3884548842906952, "learning_rate": 3.4714045375224852e-06, "loss": 0.2682, "num_input_tokens_seen": 150141696, "step": 69640 }, { "epoch": 12.781244265002753, "grad_norm": 243.3762664794922, "learning_rate": 3.4706421392766278e-06, "loss": 0.301, "num_input_tokens_seen": 150151776, "step": 69645 }, { "epoch": 12.782161864562305, "grad_norm": 0.20617428421974182, "learning_rate": 3.469879780256428e-06, "loss": 0.1105, "num_input_tokens_seen": 150163744, "step": 69650 }, { "epoch": 12.783079464121858, "grad_norm": 309.4895935058594, "learning_rate": 3.4691174604814406e-06, "loss": 0.2895, "num_input_tokens_seen": 150173856, "step": 69655 }, { "epoch": 12.78399706368141, "grad_norm": 0.17245103418827057, "learning_rate": 3.4683551799712167e-06, "loss": 0.0054, "num_input_tokens_seen": 150182944, "step": 69660 }, { "epoch": 12.784914663240961, "grad_norm": 0.10480564832687378, "learning_rate": 3.46759293874531e-06, "loss": 0.0962, "num_input_tokens_seen": 150192864, "step": 69665 }, { "epoch": 12.785832262800515, "grad_norm": 0.06661572307348251, "learning_rate": 3.46683073682327e-06, "loss": 0.3695, "num_input_tokens_seen": 150203072, "step": 69670 }, { "epoch": 12.786749862360066, "grad_norm": 0.0867738351225853, "learning_rate": 3.4660685742246436e-06, "loss": 0.289, "num_input_tokens_seen": 150213376, "step": 69675 }, { "epoch": 12.787667461919618, "grad_norm": 39.853759765625, "learning_rate": 3.465306450968982e-06, "loss": 0.148, "num_input_tokens_seen": 150225120, "step": 69680 }, { "epoch": 12.788585061479171, "grad_norm": 0.6312053203582764, "learning_rate": 3.4645443670758303e-06, "loss": 0.0078, "num_input_tokens_seen": 150236576, "step": 69685 }, { "epoch": 12.789502661038723, "grad_norm": 0.624314546585083, "learning_rate": 3.463782322564736e-06, "loss": 0.0021, "num_input_tokens_seen": 150247072, "step": 69690 }, { "epoch": 12.790420260598275, "grad_norm": 0.08310630917549133, "learning_rate": 3.4630203174552455e-06, "loss": 0.001, "num_input_tokens_seen": 150257952, "step": 69695 }, { "epoch": 12.791337860157828, "grad_norm": 0.039629917591810226, "learning_rate": 3.4622583517669016e-06, "loss": 0.0135, "num_input_tokens_seen": 150268832, "step": 69700 }, { "epoch": 12.79225545971738, "grad_norm": 0.10827458649873734, "learning_rate": 3.461496425519246e-06, "loss": 0.0021, "num_input_tokens_seen": 150279488, "step": 69705 }, { "epoch": 12.793173059276931, "grad_norm": 0.42743974924087524, "learning_rate": 3.4607345387318236e-06, "loss": 0.041, "num_input_tokens_seen": 150289248, "step": 69710 }, { "epoch": 12.794090658836485, "grad_norm": 0.05163043364882469, "learning_rate": 3.4599726914241755e-06, "loss": 0.2181, "num_input_tokens_seen": 150299680, "step": 69715 }, { "epoch": 12.795008258396036, "grad_norm": 0.013066365383565426, "learning_rate": 3.4592108836158413e-06, "loss": 0.0826, "num_input_tokens_seen": 150310208, "step": 69720 }, { "epoch": 12.795925857955588, "grad_norm": 193.7670440673828, "learning_rate": 3.4584491153263565e-06, "loss": 0.114, "num_input_tokens_seen": 150322176, "step": 69725 }, { "epoch": 12.796843457515141, "grad_norm": 93.02379608154297, "learning_rate": 3.457687386575266e-06, "loss": 0.0439, "num_input_tokens_seen": 150331680, "step": 69730 }, { "epoch": 12.797761057074693, "grad_norm": 1.5213173627853394, "learning_rate": 3.4569256973821036e-06, "loss": 0.3744, "num_input_tokens_seen": 150342976, "step": 69735 }, { "epoch": 12.798678656634245, "grad_norm": 3.5492639541625977, "learning_rate": 3.4561640477664028e-06, "loss": 0.0353, "num_input_tokens_seen": 150353760, "step": 69740 }, { "epoch": 12.799596256193798, "grad_norm": 0.5473513007164001, "learning_rate": 3.455402437747704e-06, "loss": 0.0023, "num_input_tokens_seen": 150364576, "step": 69745 }, { "epoch": 12.80051385575335, "grad_norm": 0.037672340869903564, "learning_rate": 3.454640867345539e-06, "loss": 0.0006, "num_input_tokens_seen": 150372864, "step": 69750 }, { "epoch": 12.801431455312901, "grad_norm": 1.7184374332427979, "learning_rate": 3.4538793365794397e-06, "loss": 0.3474, "num_input_tokens_seen": 150383392, "step": 69755 }, { "epoch": 12.802349054872455, "grad_norm": 0.019558770582079887, "learning_rate": 3.453117845468938e-06, "loss": 0.0253, "num_input_tokens_seen": 150393536, "step": 69760 }, { "epoch": 12.803266654432006, "grad_norm": 215.47911071777344, "learning_rate": 3.452356394033568e-06, "loss": 0.0801, "num_input_tokens_seen": 150404448, "step": 69765 }, { "epoch": 12.804184253991558, "grad_norm": 300.8030700683594, "learning_rate": 3.4515949822928573e-06, "loss": 0.1742, "num_input_tokens_seen": 150414816, "step": 69770 }, { "epoch": 12.805101853551111, "grad_norm": 0.1280716508626938, "learning_rate": 3.4508336102663353e-06, "loss": 0.1254, "num_input_tokens_seen": 150425056, "step": 69775 }, { "epoch": 12.806019453110663, "grad_norm": 52.33401107788086, "learning_rate": 3.450072277973531e-06, "loss": 0.1966, "num_input_tokens_seen": 150436000, "step": 69780 }, { "epoch": 12.806937052670214, "grad_norm": 0.09282416850328445, "learning_rate": 3.449310985433969e-06, "loss": 0.0734, "num_input_tokens_seen": 150447168, "step": 69785 }, { "epoch": 12.807854652229768, "grad_norm": 0.12211571633815765, "learning_rate": 3.4485497326671774e-06, "loss": 0.0014, "num_input_tokens_seen": 150457248, "step": 69790 }, { "epoch": 12.80877225178932, "grad_norm": 0.05195479467511177, "learning_rate": 3.4477885196926817e-06, "loss": 0.012, "num_input_tokens_seen": 150468480, "step": 69795 }, { "epoch": 12.809689851348871, "grad_norm": 0.2096378207206726, "learning_rate": 3.4470273465300043e-06, "loss": 0.0015, "num_input_tokens_seen": 150478144, "step": 69800 }, { "epoch": 12.810607450908424, "grad_norm": 37.695953369140625, "learning_rate": 3.446266213198669e-06, "loss": 0.3004, "num_input_tokens_seen": 150489728, "step": 69805 }, { "epoch": 12.811525050467976, "grad_norm": 13.28306770324707, "learning_rate": 3.4455051197181953e-06, "loss": 0.1438, "num_input_tokens_seen": 150500928, "step": 69810 }, { "epoch": 12.812442650027528, "grad_norm": 181.98492431640625, "learning_rate": 3.444744066108108e-06, "loss": 0.1555, "num_input_tokens_seen": 150512608, "step": 69815 }, { "epoch": 12.813360249587081, "grad_norm": 95.29254150390625, "learning_rate": 3.443983052387925e-06, "loss": 0.155, "num_input_tokens_seen": 150522336, "step": 69820 }, { "epoch": 12.814277849146633, "grad_norm": 19.184463500976562, "learning_rate": 3.443222078577162e-06, "loss": 0.0042, "num_input_tokens_seen": 150533344, "step": 69825 }, { "epoch": 12.815195448706184, "grad_norm": 0.017632553353905678, "learning_rate": 3.442461144695343e-06, "loss": 0.0435, "num_input_tokens_seen": 150543232, "step": 69830 }, { "epoch": 12.816113048265738, "grad_norm": 25.524574279785156, "learning_rate": 3.441700250761982e-06, "loss": 0.1214, "num_input_tokens_seen": 150554784, "step": 69835 }, { "epoch": 12.81703064782529, "grad_norm": 0.005755262449383736, "learning_rate": 3.4409393967965914e-06, "loss": 0.1287, "num_input_tokens_seen": 150565440, "step": 69840 }, { "epoch": 12.817948247384841, "grad_norm": 32.801841735839844, "learning_rate": 3.4401785828186918e-06, "loss": 0.0341, "num_input_tokens_seen": 150576544, "step": 69845 }, { "epoch": 12.818865846944394, "grad_norm": 0.0012865662574768066, "learning_rate": 3.4394178088477934e-06, "loss": 0.1909, "num_input_tokens_seen": 150587552, "step": 69850 }, { "epoch": 12.819783446503946, "grad_norm": 0.14752638339996338, "learning_rate": 3.43865707490341e-06, "loss": 0.1828, "num_input_tokens_seen": 150599072, "step": 69855 }, { "epoch": 12.820701046063498, "grad_norm": 21.56692886352539, "learning_rate": 3.4378963810050505e-06, "loss": 0.2395, "num_input_tokens_seen": 150610752, "step": 69860 }, { "epoch": 12.821618645623051, "grad_norm": 0.0369519367814064, "learning_rate": 3.43713572717223e-06, "loss": 0.0978, "num_input_tokens_seen": 150621440, "step": 69865 }, { "epoch": 12.822536245182603, "grad_norm": 0.04462261497974396, "learning_rate": 3.436375113424456e-06, "loss": 0.1078, "num_input_tokens_seen": 150630976, "step": 69870 }, { "epoch": 12.823453844742154, "grad_norm": 31.149917602539062, "learning_rate": 3.4356145397812355e-06, "loss": 0.1529, "num_input_tokens_seen": 150642400, "step": 69875 }, { "epoch": 12.824371444301708, "grad_norm": 0.03811141103506088, "learning_rate": 3.4348540062620772e-06, "loss": 0.0059, "num_input_tokens_seen": 150652320, "step": 69880 }, { "epoch": 12.82528904386126, "grad_norm": 0.1229742094874382, "learning_rate": 3.4340935128864895e-06, "loss": 0.03, "num_input_tokens_seen": 150663360, "step": 69885 }, { "epoch": 12.82620664342081, "grad_norm": 72.34635162353516, "learning_rate": 3.4333330596739765e-06, "loss": 0.2727, "num_input_tokens_seen": 150673952, "step": 69890 }, { "epoch": 12.827124242980364, "grad_norm": 0.6412113308906555, "learning_rate": 3.432572646644041e-06, "loss": 0.109, "num_input_tokens_seen": 150685440, "step": 69895 }, { "epoch": 12.828041842539916, "grad_norm": 19.659961700439453, "learning_rate": 3.4318122738161885e-06, "loss": 0.1193, "num_input_tokens_seen": 150696704, "step": 69900 }, { "epoch": 12.828959442099467, "grad_norm": 0.2932688891887665, "learning_rate": 3.4310519412099203e-06, "loss": 0.1318, "num_input_tokens_seen": 150706784, "step": 69905 }, { "epoch": 12.82987704165902, "grad_norm": 0.011510261334478855, "learning_rate": 3.430291648844738e-06, "loss": 0.566, "num_input_tokens_seen": 150717056, "step": 69910 }, { "epoch": 12.830794641218572, "grad_norm": 0.052339907735586166, "learning_rate": 3.429531396740143e-06, "loss": 0.2412, "num_input_tokens_seen": 150728000, "step": 69915 }, { "epoch": 12.831712240778124, "grad_norm": 0.02007756009697914, "learning_rate": 3.4287711849156335e-06, "loss": 0.0011, "num_input_tokens_seen": 150737952, "step": 69920 }, { "epoch": 12.832629840337678, "grad_norm": 0.09660492837429047, "learning_rate": 3.4280110133907063e-06, "loss": 0.0763, "num_input_tokens_seen": 150749120, "step": 69925 }, { "epoch": 12.83354743989723, "grad_norm": 41.52035140991211, "learning_rate": 3.4272508821848617e-06, "loss": 0.1861, "num_input_tokens_seen": 150758592, "step": 69930 }, { "epoch": 12.83446503945678, "grad_norm": 24.487756729125977, "learning_rate": 3.426490791317595e-06, "loss": 0.1933, "num_input_tokens_seen": 150770304, "step": 69935 }, { "epoch": 12.835382639016334, "grad_norm": 0.20153433084487915, "learning_rate": 3.425730740808401e-06, "loss": 0.3158, "num_input_tokens_seen": 150780096, "step": 69940 }, { "epoch": 12.836300238575886, "grad_norm": 37.529823303222656, "learning_rate": 3.4249707306767706e-06, "loss": 0.5586, "num_input_tokens_seen": 150790464, "step": 69945 }, { "epoch": 12.837217838135437, "grad_norm": 0.03792242705821991, "learning_rate": 3.4242107609422024e-06, "loss": 0.0059, "num_input_tokens_seen": 150801792, "step": 69950 }, { "epoch": 12.83813543769499, "grad_norm": 0.1196734756231308, "learning_rate": 3.4234508316241853e-06, "loss": 0.0007, "num_input_tokens_seen": 150812672, "step": 69955 }, { "epoch": 12.839053037254542, "grad_norm": 0.21288011968135834, "learning_rate": 3.42269094274221e-06, "loss": 0.0036, "num_input_tokens_seen": 150823392, "step": 69960 }, { "epoch": 12.839970636814094, "grad_norm": 0.10985221713781357, "learning_rate": 3.421931094315769e-06, "loss": 0.0122, "num_input_tokens_seen": 150834112, "step": 69965 }, { "epoch": 12.840888236373647, "grad_norm": 14.796717643737793, "learning_rate": 3.4211712863643497e-06, "loss": 0.1849, "num_input_tokens_seen": 150844256, "step": 69970 }, { "epoch": 12.841805835933199, "grad_norm": 135.73402404785156, "learning_rate": 3.4204115189074386e-06, "loss": 0.045, "num_input_tokens_seen": 150855424, "step": 69975 }, { "epoch": 12.84272343549275, "grad_norm": 0.04217695817351341, "learning_rate": 3.4196517919645247e-06, "loss": 0.0069, "num_input_tokens_seen": 150865312, "step": 69980 }, { "epoch": 12.843641035052304, "grad_norm": 0.07549618184566498, "learning_rate": 3.4188921055550934e-06, "loss": 0.3533, "num_input_tokens_seen": 150876000, "step": 69985 }, { "epoch": 12.844558634611856, "grad_norm": 0.28711557388305664, "learning_rate": 3.4181324596986294e-06, "loss": 0.0038, "num_input_tokens_seen": 150887424, "step": 69990 }, { "epoch": 12.845476234171407, "grad_norm": 31.08685874938965, "learning_rate": 3.4173728544146147e-06, "loss": 0.1643, "num_input_tokens_seen": 150899424, "step": 69995 }, { "epoch": 12.84639383373096, "grad_norm": 38.678977966308594, "learning_rate": 3.4166132897225346e-06, "loss": 0.2143, "num_input_tokens_seen": 150911008, "step": 70000 }, { "epoch": 12.847311433290512, "grad_norm": 0.16427309811115265, "learning_rate": 3.415853765641869e-06, "loss": 0.3224, "num_input_tokens_seen": 150922528, "step": 70005 }, { "epoch": 12.848229032850064, "grad_norm": 0.06557822972536087, "learning_rate": 3.4150942821920985e-06, "loss": 0.3621, "num_input_tokens_seen": 150933024, "step": 70010 }, { "epoch": 12.849146632409617, "grad_norm": 2.824129581451416, "learning_rate": 3.4143348393927043e-06, "loss": 0.169, "num_input_tokens_seen": 150944320, "step": 70015 }, { "epoch": 12.850064231969169, "grad_norm": 57.181400299072266, "learning_rate": 3.4135754372631646e-06, "loss": 0.1863, "num_input_tokens_seen": 150955392, "step": 70020 }, { "epoch": 12.85098183152872, "grad_norm": 307.7200622558594, "learning_rate": 3.4128160758229532e-06, "loss": 0.165, "num_input_tokens_seen": 150965120, "step": 70025 }, { "epoch": 12.851899431088274, "grad_norm": 0.5693977475166321, "learning_rate": 3.4120567550915527e-06, "loss": 0.1747, "num_input_tokens_seen": 150975872, "step": 70030 }, { "epoch": 12.852817030647826, "grad_norm": 0.08731398731470108, "learning_rate": 3.411297475088434e-06, "loss": 0.001, "num_input_tokens_seen": 150986336, "step": 70035 }, { "epoch": 12.853734630207377, "grad_norm": 35.43242645263672, "learning_rate": 3.410538235833074e-06, "loss": 0.0353, "num_input_tokens_seen": 150996608, "step": 70040 }, { "epoch": 12.85465222976693, "grad_norm": 16.375347137451172, "learning_rate": 3.4097790373449423e-06, "loss": 0.0901, "num_input_tokens_seen": 151006528, "step": 70045 }, { "epoch": 12.855569829326482, "grad_norm": 0.013153661973774433, "learning_rate": 3.409019879643516e-06, "loss": 0.1267, "num_input_tokens_seen": 151016576, "step": 70050 }, { "epoch": 12.856487428886034, "grad_norm": 0.015380659140646458, "learning_rate": 3.408260762748263e-06, "loss": 0.0959, "num_input_tokens_seen": 151028096, "step": 70055 }, { "epoch": 12.857405028445587, "grad_norm": 0.6939279437065125, "learning_rate": 3.4075016866786538e-06, "loss": 0.0992, "num_input_tokens_seen": 151038624, "step": 70060 }, { "epoch": 12.858322628005139, "grad_norm": 3.5009117126464844, "learning_rate": 3.4067426514541597e-06, "loss": 0.1365, "num_input_tokens_seen": 151050368, "step": 70065 }, { "epoch": 12.85924022756469, "grad_norm": 0.08246123790740967, "learning_rate": 3.4059836570942472e-06, "loss": 0.0015, "num_input_tokens_seen": 151061664, "step": 70070 }, { "epoch": 12.860157827124244, "grad_norm": 21.641448974609375, "learning_rate": 3.4052247036183827e-06, "loss": 0.0038, "num_input_tokens_seen": 151073568, "step": 70075 }, { "epoch": 12.861075426683795, "grad_norm": 0.011702739633619785, "learning_rate": 3.4044657910460323e-06, "loss": 0.1352, "num_input_tokens_seen": 151085216, "step": 70080 }, { "epoch": 12.861993026243347, "grad_norm": 0.002159201307222247, "learning_rate": 3.403706919396662e-06, "loss": 0.2281, "num_input_tokens_seen": 151096064, "step": 70085 }, { "epoch": 12.8629106258029, "grad_norm": 0.04660915210843086, "learning_rate": 3.402948088689736e-06, "loss": 0.0266, "num_input_tokens_seen": 151106976, "step": 70090 }, { "epoch": 12.863828225362452, "grad_norm": 0.9965593814849854, "learning_rate": 3.402189298944716e-06, "loss": 0.0023, "num_input_tokens_seen": 151117184, "step": 70095 }, { "epoch": 12.864745824922004, "grad_norm": 0.1190202459692955, "learning_rate": 3.401430550181063e-06, "loss": 0.3143, "num_input_tokens_seen": 151129184, "step": 70100 }, { "epoch": 12.865663424481557, "grad_norm": 129.66641235351562, "learning_rate": 3.40067184241824e-06, "loss": 0.119, "num_input_tokens_seen": 151140608, "step": 70105 }, { "epoch": 12.866581024041109, "grad_norm": 0.33171385526657104, "learning_rate": 3.3999131756757043e-06, "loss": 0.271, "num_input_tokens_seen": 151151264, "step": 70110 }, { "epoch": 12.86749862360066, "grad_norm": 0.2470325380563736, "learning_rate": 3.3991545499729175e-06, "loss": 0.2825, "num_input_tokens_seen": 151162464, "step": 70115 }, { "epoch": 12.868416223160214, "grad_norm": 0.7886437773704529, "learning_rate": 3.3983959653293353e-06, "loss": 0.0628, "num_input_tokens_seen": 151172672, "step": 70120 }, { "epoch": 12.869333822719765, "grad_norm": 0.3071341812610626, "learning_rate": 3.3976374217644138e-06, "loss": 0.0127, "num_input_tokens_seen": 151183520, "step": 70125 }, { "epoch": 12.870251422279317, "grad_norm": 2.7153573036193848, "learning_rate": 3.396878919297607e-06, "loss": 0.2056, "num_input_tokens_seen": 151194688, "step": 70130 }, { "epoch": 12.87116902183887, "grad_norm": 71.02947998046875, "learning_rate": 3.3961204579483736e-06, "loss": 0.0821, "num_input_tokens_seen": 151206144, "step": 70135 }, { "epoch": 12.872086621398422, "grad_norm": 35.62143325805664, "learning_rate": 3.3953620377361648e-06, "loss": 0.1866, "num_input_tokens_seen": 151217248, "step": 70140 }, { "epoch": 12.873004220957974, "grad_norm": 0.0918612852692604, "learning_rate": 3.3946036586804305e-06, "loss": 0.077, "num_input_tokens_seen": 151228128, "step": 70145 }, { "epoch": 12.873921820517527, "grad_norm": 23.539703369140625, "learning_rate": 3.3938453208006262e-06, "loss": 0.0534, "num_input_tokens_seen": 151239488, "step": 70150 }, { "epoch": 12.874839420077079, "grad_norm": 0.031180698424577713, "learning_rate": 3.3930870241162e-06, "loss": 0.0045, "num_input_tokens_seen": 151249440, "step": 70155 }, { "epoch": 12.87575701963663, "grad_norm": 0.10647181421518326, "learning_rate": 3.3923287686465994e-06, "loss": 0.0743, "num_input_tokens_seen": 151259712, "step": 70160 }, { "epoch": 12.876674619196184, "grad_norm": 126.63219451904297, "learning_rate": 3.3915705544112764e-06, "loss": 0.1467, "num_input_tokens_seen": 151269888, "step": 70165 }, { "epoch": 12.877592218755735, "grad_norm": 0.020319612696766853, "learning_rate": 3.390812381429676e-06, "loss": 0.0073, "num_input_tokens_seen": 151280768, "step": 70170 }, { "epoch": 12.878509818315287, "grad_norm": 0.049730122089385986, "learning_rate": 3.390054249721243e-06, "loss": 0.002, "num_input_tokens_seen": 151292512, "step": 70175 }, { "epoch": 12.87942741787484, "grad_norm": 2.272454023361206, "learning_rate": 3.389296159305422e-06, "loss": 0.0007, "num_input_tokens_seen": 151302240, "step": 70180 }, { "epoch": 12.880345017434392, "grad_norm": 13.046651840209961, "learning_rate": 3.38853811020166e-06, "loss": 0.1448, "num_input_tokens_seen": 151312160, "step": 70185 }, { "epoch": 12.881262616993943, "grad_norm": 2.603278160095215, "learning_rate": 3.387780102429398e-06, "loss": 0.0258, "num_input_tokens_seen": 151323520, "step": 70190 }, { "epoch": 12.882180216553497, "grad_norm": 0.05514898523688316, "learning_rate": 3.3870221360080766e-06, "loss": 0.0055, "num_input_tokens_seen": 151333760, "step": 70195 }, { "epoch": 12.883097816113048, "grad_norm": 0.44010359048843384, "learning_rate": 3.3862642109571376e-06, "loss": 0.0105, "num_input_tokens_seen": 151344896, "step": 70200 }, { "epoch": 12.8840154156726, "grad_norm": 2.5827529430389404, "learning_rate": 3.385506327296021e-06, "loss": 0.1633, "num_input_tokens_seen": 151353920, "step": 70205 }, { "epoch": 12.884933015232154, "grad_norm": 0.2831173837184906, "learning_rate": 3.384748485044166e-06, "loss": 0.0555, "num_input_tokens_seen": 151364544, "step": 70210 }, { "epoch": 12.885850614791705, "grad_norm": 0.1230289563536644, "learning_rate": 3.3839906842210068e-06, "loss": 0.1261, "num_input_tokens_seen": 151374720, "step": 70215 }, { "epoch": 12.886768214351257, "grad_norm": 226.18247985839844, "learning_rate": 3.383232924845984e-06, "loss": 0.2176, "num_input_tokens_seen": 151385664, "step": 70220 }, { "epoch": 12.88768581391081, "grad_norm": 127.87982177734375, "learning_rate": 3.3824752069385293e-06, "loss": 0.1811, "num_input_tokens_seen": 151397120, "step": 70225 }, { "epoch": 12.888603413470362, "grad_norm": 81.316162109375, "learning_rate": 3.3817175305180784e-06, "loss": 0.1189, "num_input_tokens_seen": 151408608, "step": 70230 }, { "epoch": 12.889521013029913, "grad_norm": 1.0209877490997314, "learning_rate": 3.3809598956040656e-06, "loss": 0.0491, "num_input_tokens_seen": 151417760, "step": 70235 }, { "epoch": 12.890438612589467, "grad_norm": 1.758173942565918, "learning_rate": 3.380202302215923e-06, "loss": 0.1899, "num_input_tokens_seen": 151429408, "step": 70240 }, { "epoch": 12.891356212149018, "grad_norm": 0.023596271872520447, "learning_rate": 3.3794447503730787e-06, "loss": 0.2134, "num_input_tokens_seen": 151441344, "step": 70245 }, { "epoch": 12.89227381170857, "grad_norm": 112.29127502441406, "learning_rate": 3.3786872400949666e-06, "loss": 0.1799, "num_input_tokens_seen": 151451584, "step": 70250 }, { "epoch": 12.893191411268123, "grad_norm": 0.02402012050151825, "learning_rate": 3.377929771401014e-06, "loss": 0.0018, "num_input_tokens_seen": 151463456, "step": 70255 }, { "epoch": 12.894109010827675, "grad_norm": 0.263085275888443, "learning_rate": 3.3771723443106486e-06, "loss": 0.1138, "num_input_tokens_seen": 151475136, "step": 70260 }, { "epoch": 12.895026610387227, "grad_norm": 56.915733337402344, "learning_rate": 3.376414958843296e-06, "loss": 0.0148, "num_input_tokens_seen": 151486720, "step": 70265 }, { "epoch": 12.89594420994678, "grad_norm": 0.35207462310791016, "learning_rate": 3.375657615018385e-06, "loss": 0.4348, "num_input_tokens_seen": 151497216, "step": 70270 }, { "epoch": 12.896861809506332, "grad_norm": 122.12578582763672, "learning_rate": 3.374900312855339e-06, "loss": 0.7005, "num_input_tokens_seen": 151509056, "step": 70275 }, { "epoch": 12.897779409065883, "grad_norm": 0.11830595135688782, "learning_rate": 3.3741430523735787e-06, "loss": 0.001, "num_input_tokens_seen": 151519520, "step": 70280 }, { "epoch": 12.898697008625437, "grad_norm": 0.008859033696353436, "learning_rate": 3.3733858335925317e-06, "loss": 0.5322, "num_input_tokens_seen": 151531680, "step": 70285 }, { "epoch": 12.899614608184988, "grad_norm": 31.920337677001953, "learning_rate": 3.372628656531617e-06, "loss": 0.1085, "num_input_tokens_seen": 151542784, "step": 70290 }, { "epoch": 12.90053220774454, "grad_norm": 124.37065124511719, "learning_rate": 3.371871521210253e-06, "loss": 0.1339, "num_input_tokens_seen": 151552928, "step": 70295 }, { "epoch": 12.901449807304093, "grad_norm": 142.16722106933594, "learning_rate": 3.371114427647863e-06, "loss": 0.11, "num_input_tokens_seen": 151564576, "step": 70300 }, { "epoch": 12.902367406863645, "grad_norm": 0.1561092734336853, "learning_rate": 3.3703573758638635e-06, "loss": 0.0022, "num_input_tokens_seen": 151575168, "step": 70305 }, { "epoch": 12.903285006423197, "grad_norm": 161.82382202148438, "learning_rate": 3.3696003658776717e-06, "loss": 0.0378, "num_input_tokens_seen": 151585120, "step": 70310 }, { "epoch": 12.90420260598275, "grad_norm": 0.054411254823207855, "learning_rate": 3.368843397708702e-06, "loss": 0.0388, "num_input_tokens_seen": 151595904, "step": 70315 }, { "epoch": 12.905120205542302, "grad_norm": 0.3828165531158447, "learning_rate": 3.368086471376373e-06, "loss": 0.0004, "num_input_tokens_seen": 151606400, "step": 70320 }, { "epoch": 12.906037805101853, "grad_norm": 0.1545300930738449, "learning_rate": 3.3673295869000956e-06, "loss": 0.0008, "num_input_tokens_seen": 151617824, "step": 70325 }, { "epoch": 12.906955404661407, "grad_norm": 0.22438542544841766, "learning_rate": 3.3665727442992833e-06, "loss": 0.1581, "num_input_tokens_seen": 151628288, "step": 70330 }, { "epoch": 12.907873004220958, "grad_norm": 2.1080400943756104, "learning_rate": 3.3658159435933503e-06, "loss": 0.2783, "num_input_tokens_seen": 151639904, "step": 70335 }, { "epoch": 12.90879060378051, "grad_norm": 112.01636505126953, "learning_rate": 3.365059184801705e-06, "loss": 0.0765, "num_input_tokens_seen": 151651328, "step": 70340 }, { "epoch": 12.909708203340063, "grad_norm": 65.41433715820312, "learning_rate": 3.364302467943758e-06, "loss": 0.0766, "num_input_tokens_seen": 151661696, "step": 70345 }, { "epoch": 12.910625802899615, "grad_norm": 0.018042579293251038, "learning_rate": 3.3635457930389153e-06, "loss": 0.0015, "num_input_tokens_seen": 151673440, "step": 70350 }, { "epoch": 12.911543402459166, "grad_norm": 0.03512114658951759, "learning_rate": 3.362789160106589e-06, "loss": 0.062, "num_input_tokens_seen": 151683136, "step": 70355 }, { "epoch": 12.91246100201872, "grad_norm": 0.5228894352912903, "learning_rate": 3.3620325691661833e-06, "loss": 0.0029, "num_input_tokens_seen": 151693632, "step": 70360 }, { "epoch": 12.913378601578271, "grad_norm": 51.50163650512695, "learning_rate": 3.3612760202371008e-06, "loss": 0.0322, "num_input_tokens_seen": 151703904, "step": 70365 }, { "epoch": 12.914296201137823, "grad_norm": 143.23451232910156, "learning_rate": 3.3605195133387516e-06, "loss": 0.2731, "num_input_tokens_seen": 151714592, "step": 70370 }, { "epoch": 12.915213800697376, "grad_norm": 26.071975708007812, "learning_rate": 3.3597630484905356e-06, "loss": 0.0398, "num_input_tokens_seen": 151725984, "step": 70375 }, { "epoch": 12.916131400256928, "grad_norm": 0.11078416556119919, "learning_rate": 3.359006625711854e-06, "loss": 0.266, "num_input_tokens_seen": 151735232, "step": 70380 }, { "epoch": 12.91704899981648, "grad_norm": 0.003925530705600977, "learning_rate": 3.35825024502211e-06, "loss": 0.2537, "num_input_tokens_seen": 151746400, "step": 70385 }, { "epoch": 12.917966599376033, "grad_norm": 0.2917770743370056, "learning_rate": 3.357493906440703e-06, "loss": 0.174, "num_input_tokens_seen": 151758656, "step": 70390 }, { "epoch": 12.918884198935585, "grad_norm": 24.784286499023438, "learning_rate": 3.3567376099870318e-06, "loss": 0.1305, "num_input_tokens_seen": 151769440, "step": 70395 }, { "epoch": 12.919801798495136, "grad_norm": 148.08360290527344, "learning_rate": 3.3559813556804922e-06, "loss": 0.4597, "num_input_tokens_seen": 151781056, "step": 70400 }, { "epoch": 12.92071939805469, "grad_norm": 0.10415638238191605, "learning_rate": 3.3552251435404844e-06, "loss": 0.0069, "num_input_tokens_seen": 151792480, "step": 70405 }, { "epoch": 12.921636997614241, "grad_norm": 0.018247678875923157, "learning_rate": 3.354468973586403e-06, "loss": 0.1564, "num_input_tokens_seen": 151802592, "step": 70410 }, { "epoch": 12.922554597173793, "grad_norm": 0.589182436466217, "learning_rate": 3.3537128458376407e-06, "loss": 0.2462, "num_input_tokens_seen": 151813728, "step": 70415 }, { "epoch": 12.923472196733346, "grad_norm": 0.014245716854929924, "learning_rate": 3.3529567603135925e-06, "loss": 0.5102, "num_input_tokens_seen": 151825952, "step": 70420 }, { "epoch": 12.924389796292898, "grad_norm": 0.09433307498693466, "learning_rate": 3.352200717033652e-06, "loss": 0.0045, "num_input_tokens_seen": 151835488, "step": 70425 }, { "epoch": 12.92530739585245, "grad_norm": 441.7731018066406, "learning_rate": 3.3514447160172077e-06, "loss": 0.0274, "num_input_tokens_seen": 151846848, "step": 70430 }, { "epoch": 12.926224995412003, "grad_norm": 100.26912689208984, "learning_rate": 3.350688757283653e-06, "loss": 0.0154, "num_input_tokens_seen": 151856960, "step": 70435 }, { "epoch": 12.927142594971555, "grad_norm": 0.4854274392127991, "learning_rate": 3.3499328408523748e-06, "loss": 0.0373, "num_input_tokens_seen": 151867648, "step": 70440 }, { "epoch": 12.928060194531106, "grad_norm": 0.26132386922836304, "learning_rate": 3.349176966742761e-06, "loss": 0.1384, "num_input_tokens_seen": 151877888, "step": 70445 }, { "epoch": 12.92897779409066, "grad_norm": 25.31478500366211, "learning_rate": 3.3484211349742003e-06, "loss": 0.0862, "num_input_tokens_seen": 151889312, "step": 70450 }, { "epoch": 12.929895393650211, "grad_norm": 521.334228515625, "learning_rate": 3.3476653455660777e-06, "loss": 0.2365, "num_input_tokens_seen": 151901280, "step": 70455 }, { "epoch": 12.930812993209763, "grad_norm": 0.013686268590390682, "learning_rate": 3.3469095985377786e-06, "loss": 0.1659, "num_input_tokens_seen": 151911840, "step": 70460 }, { "epoch": 12.931730592769316, "grad_norm": 136.36105346679688, "learning_rate": 3.3461538939086844e-06, "loss": 0.252, "num_input_tokens_seen": 151923776, "step": 70465 }, { "epoch": 12.932648192328868, "grad_norm": 1.8015400171279907, "learning_rate": 3.3453982316981815e-06, "loss": 0.135, "num_input_tokens_seen": 151935008, "step": 70470 }, { "epoch": 12.93356579188842, "grad_norm": 0.019710781052708626, "learning_rate": 3.3446426119256493e-06, "loss": 0.0016, "num_input_tokens_seen": 151944608, "step": 70475 }, { "epoch": 12.934483391447973, "grad_norm": 0.10527326166629791, "learning_rate": 3.343887034610467e-06, "loss": 0.0917, "num_input_tokens_seen": 151956128, "step": 70480 }, { "epoch": 12.935400991007524, "grad_norm": 33.947208404541016, "learning_rate": 3.343131499772017e-06, "loss": 0.0624, "num_input_tokens_seen": 151967072, "step": 70485 }, { "epoch": 12.936318590567076, "grad_norm": 1.010114073753357, "learning_rate": 3.3423760074296764e-06, "loss": 0.0009, "num_input_tokens_seen": 151977632, "step": 70490 }, { "epoch": 12.93723619012663, "grad_norm": 145.73492431640625, "learning_rate": 3.341620557602822e-06, "loss": 0.2223, "num_input_tokens_seen": 151988832, "step": 70495 }, { "epoch": 12.938153789686181, "grad_norm": 0.06924846023321152, "learning_rate": 3.340865150310828e-06, "loss": 0.0006, "num_input_tokens_seen": 152000128, "step": 70500 }, { "epoch": 12.939071389245733, "grad_norm": 0.026400946080684662, "learning_rate": 3.3401097855730735e-06, "loss": 0.1071, "num_input_tokens_seen": 152012512, "step": 70505 }, { "epoch": 12.939988988805286, "grad_norm": 0.0530654713511467, "learning_rate": 3.3393544634089304e-06, "loss": 0.2349, "num_input_tokens_seen": 152023776, "step": 70510 }, { "epoch": 12.940906588364838, "grad_norm": 1.4653931856155396, "learning_rate": 3.338599183837771e-06, "loss": 0.0016, "num_input_tokens_seen": 152034816, "step": 70515 }, { "epoch": 12.94182418792439, "grad_norm": 0.006738357245922089, "learning_rate": 3.337843946878967e-06, "loss": 0.1659, "num_input_tokens_seen": 152045568, "step": 70520 }, { "epoch": 12.942741787483943, "grad_norm": 32.56155014038086, "learning_rate": 3.337088752551891e-06, "loss": 0.2021, "num_input_tokens_seen": 152056320, "step": 70525 }, { "epoch": 12.943659387043494, "grad_norm": 71.08256530761719, "learning_rate": 3.3363336008759113e-06, "loss": 0.0423, "num_input_tokens_seen": 152067328, "step": 70530 }, { "epoch": 12.944576986603046, "grad_norm": 0.05708129331469536, "learning_rate": 3.335578491870395e-06, "loss": 0.0003, "num_input_tokens_seen": 152077856, "step": 70535 }, { "epoch": 12.9454945861626, "grad_norm": 0.07482288032770157, "learning_rate": 3.3348234255547117e-06, "loss": 0.0268, "num_input_tokens_seen": 152089216, "step": 70540 }, { "epoch": 12.946412185722151, "grad_norm": 0.027363091707229614, "learning_rate": 3.3340684019482263e-06, "loss": 0.1474, "num_input_tokens_seen": 152100928, "step": 70545 }, { "epoch": 12.947329785281703, "grad_norm": 328.4507751464844, "learning_rate": 3.333313421070303e-06, "loss": 0.292, "num_input_tokens_seen": 152112640, "step": 70550 }, { "epoch": 12.948247384841256, "grad_norm": 0.3489421308040619, "learning_rate": 3.33255848294031e-06, "loss": 0.0011, "num_input_tokens_seen": 152122880, "step": 70555 }, { "epoch": 12.949164984400808, "grad_norm": 0.15435577929019928, "learning_rate": 3.3318035875776066e-06, "loss": 0.0877, "num_input_tokens_seen": 152133312, "step": 70560 }, { "epoch": 12.95008258396036, "grad_norm": 4.806766033172607, "learning_rate": 3.331048735001554e-06, "loss": 0.0036, "num_input_tokens_seen": 152143808, "step": 70565 }, { "epoch": 12.951000183519913, "grad_norm": 10.607988357543945, "learning_rate": 3.330293925231517e-06, "loss": 0.4648, "num_input_tokens_seen": 152154592, "step": 70570 }, { "epoch": 12.951917783079464, "grad_norm": 52.64593505859375, "learning_rate": 3.3295391582868532e-06, "loss": 0.3254, "num_input_tokens_seen": 152165536, "step": 70575 }, { "epoch": 12.952835382639016, "grad_norm": 0.03429268300533295, "learning_rate": 3.32878443418692e-06, "loss": 0.002, "num_input_tokens_seen": 152176800, "step": 70580 }, { "epoch": 12.95375298219857, "grad_norm": 0.00202581356279552, "learning_rate": 3.3280297529510754e-06, "loss": 0.1457, "num_input_tokens_seen": 152186080, "step": 70585 }, { "epoch": 12.954670581758121, "grad_norm": 4.336963653564453, "learning_rate": 3.327275114598677e-06, "loss": 0.1099, "num_input_tokens_seen": 152197152, "step": 70590 }, { "epoch": 12.955588181317673, "grad_norm": 0.2614729404449463, "learning_rate": 3.32652051914908e-06, "loss": 0.0016, "num_input_tokens_seen": 152207968, "step": 70595 }, { "epoch": 12.956505780877226, "grad_norm": 0.07958325743675232, "learning_rate": 3.3257659666216358e-06, "loss": 0.0248, "num_input_tokens_seen": 152218592, "step": 70600 }, { "epoch": 12.957423380436778, "grad_norm": 0.49442538619041443, "learning_rate": 3.325011457035702e-06, "loss": 0.1929, "num_input_tokens_seen": 152230272, "step": 70605 }, { "epoch": 12.95834097999633, "grad_norm": 3.41382098197937, "learning_rate": 3.324256990410628e-06, "loss": 0.3668, "num_input_tokens_seen": 152241600, "step": 70610 }, { "epoch": 12.959258579555883, "grad_norm": 0.11856276541948318, "learning_rate": 3.323502566765763e-06, "loss": 0.1084, "num_input_tokens_seen": 152252896, "step": 70615 }, { "epoch": 12.960176179115434, "grad_norm": 0.10636494308710098, "learning_rate": 3.322748186120461e-06, "loss": 0.0346, "num_input_tokens_seen": 152264800, "step": 70620 }, { "epoch": 12.961093778674986, "grad_norm": 3.9922919273376465, "learning_rate": 3.3219938484940682e-06, "loss": 0.374, "num_input_tokens_seen": 152276832, "step": 70625 }, { "epoch": 12.96201137823454, "grad_norm": 0.043371379375457764, "learning_rate": 3.321239553905933e-06, "loss": 0.0127, "num_input_tokens_seen": 152287552, "step": 70630 }, { "epoch": 12.96292897779409, "grad_norm": 0.05964129790663719, "learning_rate": 3.3204853023754004e-06, "loss": 0.0009, "num_input_tokens_seen": 152298112, "step": 70635 }, { "epoch": 12.963846577353642, "grad_norm": 29.121713638305664, "learning_rate": 3.3197310939218164e-06, "loss": 0.2714, "num_input_tokens_seen": 152308736, "step": 70640 }, { "epoch": 12.964764176913196, "grad_norm": 22.357240676879883, "learning_rate": 3.3189769285645268e-06, "loss": 0.466, "num_input_tokens_seen": 152319232, "step": 70645 }, { "epoch": 12.965681776472747, "grad_norm": 33.25592041015625, "learning_rate": 3.3182228063228726e-06, "loss": 0.1416, "num_input_tokens_seen": 152329888, "step": 70650 }, { "epoch": 12.966599376032299, "grad_norm": 0.0535706952214241, "learning_rate": 3.317468727216198e-06, "loss": 0.0768, "num_input_tokens_seen": 152340832, "step": 70655 }, { "epoch": 12.967516975591852, "grad_norm": 0.6549474000930786, "learning_rate": 3.316714691263843e-06, "loss": 0.1607, "num_input_tokens_seen": 152351200, "step": 70660 }, { "epoch": 12.968434575151404, "grad_norm": 0.013794856145977974, "learning_rate": 3.315960698485147e-06, "loss": 0.13, "num_input_tokens_seen": 152362432, "step": 70665 }, { "epoch": 12.969352174710956, "grad_norm": 42.51392364501953, "learning_rate": 3.3152067488994477e-06, "loss": 0.3588, "num_input_tokens_seen": 152373664, "step": 70670 }, { "epoch": 12.970269774270509, "grad_norm": 0.004813210107386112, "learning_rate": 3.3144528425260854e-06, "loss": 0.0694, "num_input_tokens_seen": 152385280, "step": 70675 }, { "epoch": 12.97118737383006, "grad_norm": 9.599411010742188, "learning_rate": 3.3136989793843953e-06, "loss": 0.0016, "num_input_tokens_seen": 152396192, "step": 70680 }, { "epoch": 12.972104973389612, "grad_norm": 0.018045544624328613, "learning_rate": 3.312945159493711e-06, "loss": 0.1286, "num_input_tokens_seen": 152406272, "step": 70685 }, { "epoch": 12.973022572949166, "grad_norm": 0.260201096534729, "learning_rate": 3.312191382873371e-06, "loss": 0.0109, "num_input_tokens_seen": 152417568, "step": 70690 }, { "epoch": 12.973940172508717, "grad_norm": 50.73208999633789, "learning_rate": 3.3114376495427057e-06, "loss": 0.1561, "num_input_tokens_seen": 152426528, "step": 70695 }, { "epoch": 12.974857772068269, "grad_norm": 2.008357286453247, "learning_rate": 3.3106839595210462e-06, "loss": 0.0221, "num_input_tokens_seen": 152438464, "step": 70700 }, { "epoch": 12.975775371627822, "grad_norm": 0.11501166224479675, "learning_rate": 3.3099303128277266e-06, "loss": 0.2234, "num_input_tokens_seen": 152449216, "step": 70705 }, { "epoch": 12.976692971187374, "grad_norm": 0.03609498217701912, "learning_rate": 3.309176709482075e-06, "loss": 0.1945, "num_input_tokens_seen": 152459232, "step": 70710 }, { "epoch": 12.977610570746926, "grad_norm": 110.97469329833984, "learning_rate": 3.3084231495034204e-06, "loss": 0.4631, "num_input_tokens_seen": 152469792, "step": 70715 }, { "epoch": 12.978528170306479, "grad_norm": 0.8140546083450317, "learning_rate": 3.307669632911088e-06, "loss": 0.0005, "num_input_tokens_seen": 152480352, "step": 70720 }, { "epoch": 12.97944576986603, "grad_norm": 0.06552458554506302, "learning_rate": 3.306916159724409e-06, "loss": 0.0287, "num_input_tokens_seen": 152490464, "step": 70725 }, { "epoch": 12.980363369425582, "grad_norm": 0.053897127509117126, "learning_rate": 3.3061627299627063e-06, "loss": 0.0081, "num_input_tokens_seen": 152501888, "step": 70730 }, { "epoch": 12.981280968985136, "grad_norm": 0.07719957083463669, "learning_rate": 3.305409343645304e-06, "loss": 0.1726, "num_input_tokens_seen": 152513472, "step": 70735 }, { "epoch": 12.982198568544687, "grad_norm": 2.302788019180298, "learning_rate": 3.304656000791525e-06, "loss": 0.0782, "num_input_tokens_seen": 152524096, "step": 70740 }, { "epoch": 12.983116168104239, "grad_norm": 0.06438768655061722, "learning_rate": 3.303902701420693e-06, "loss": 0.4425, "num_input_tokens_seen": 152534656, "step": 70745 }, { "epoch": 12.984033767663792, "grad_norm": 0.07740583270788193, "learning_rate": 3.3031494455521273e-06, "loss": 0.0149, "num_input_tokens_seen": 152546752, "step": 70750 }, { "epoch": 12.984951367223344, "grad_norm": 9.023149490356445, "learning_rate": 3.3023962332051494e-06, "loss": 0.0027, "num_input_tokens_seen": 152556768, "step": 70755 }, { "epoch": 12.985868966782895, "grad_norm": 0.020832082256674767, "learning_rate": 3.3016430643990773e-06, "loss": 0.0004, "num_input_tokens_seen": 152566816, "step": 70760 }, { "epoch": 12.986786566342449, "grad_norm": 0.06473948061466217, "learning_rate": 3.3008899391532266e-06, "loss": 0.001, "num_input_tokens_seen": 152574880, "step": 70765 }, { "epoch": 12.987704165902, "grad_norm": 3.6048338413238525, "learning_rate": 3.3001368574869158e-06, "loss": 0.1085, "num_input_tokens_seen": 152585152, "step": 70770 }, { "epoch": 12.988621765461552, "grad_norm": 0.22312702238559723, "learning_rate": 3.2993838194194617e-06, "loss": 0.139, "num_input_tokens_seen": 152595264, "step": 70775 }, { "epoch": 12.989539365021106, "grad_norm": 0.32532644271850586, "learning_rate": 3.298630824970176e-06, "loss": 0.001, "num_input_tokens_seen": 152606272, "step": 70780 }, { "epoch": 12.990456964580657, "grad_norm": 0.031698886305093765, "learning_rate": 3.2978778741583717e-06, "loss": 0.2579, "num_input_tokens_seen": 152615872, "step": 70785 }, { "epoch": 12.991374564140209, "grad_norm": 0.0436612106859684, "learning_rate": 3.2971249670033633e-06, "loss": 0.0311, "num_input_tokens_seen": 152625984, "step": 70790 }, { "epoch": 12.992292163699762, "grad_norm": 0.8956969380378723, "learning_rate": 3.29637210352446e-06, "loss": 0.1288, "num_input_tokens_seen": 152637952, "step": 70795 }, { "epoch": 12.993209763259314, "grad_norm": 207.9210968017578, "learning_rate": 3.2956192837409705e-06, "loss": 0.1527, "num_input_tokens_seen": 152647968, "step": 70800 }, { "epoch": 12.994127362818865, "grad_norm": 0.024688780307769775, "learning_rate": 3.2948665076722064e-06, "loss": 0.159, "num_input_tokens_seen": 152659456, "step": 70805 }, { "epoch": 12.995044962378419, "grad_norm": 24.195810317993164, "learning_rate": 3.294113775337474e-06, "loss": 0.1265, "num_input_tokens_seen": 152670784, "step": 70810 }, { "epoch": 12.99596256193797, "grad_norm": 0.1166430413722992, "learning_rate": 3.2933610867560796e-06, "loss": 0.0007, "num_input_tokens_seen": 152680480, "step": 70815 }, { "epoch": 12.996880161497522, "grad_norm": 15.316377639770508, "learning_rate": 3.292608441947326e-06, "loss": 0.4484, "num_input_tokens_seen": 152692384, "step": 70820 }, { "epoch": 12.997797761057075, "grad_norm": 0.35666751861572266, "learning_rate": 3.2918558409305213e-06, "loss": 0.1211, "num_input_tokens_seen": 152702464, "step": 70825 }, { "epoch": 12.998715360616627, "grad_norm": 0.0789913684129715, "learning_rate": 3.291103283724967e-06, "loss": 0.0003, "num_input_tokens_seen": 152712544, "step": 70830 }, { "epoch": 12.999632960176179, "grad_norm": 0.1925784796476364, "learning_rate": 3.2903507703499625e-06, "loss": 0.0183, "num_input_tokens_seen": 152723360, "step": 70835 }, { "epoch": 13.000550559735732, "grad_norm": 0.19536691904067993, "learning_rate": 3.2895983008248144e-06, "loss": 0.4359, "num_input_tokens_seen": 152732656, "step": 70840 }, { "epoch": 13.001468159295284, "grad_norm": 0.09413024038076401, "learning_rate": 3.2888458751688177e-06, "loss": 0.0044, "num_input_tokens_seen": 152743760, "step": 70845 }, { "epoch": 13.002385758854835, "grad_norm": 0.15658441185951233, "learning_rate": 3.2880934934012733e-06, "loss": 0.3746, "num_input_tokens_seen": 152754992, "step": 70850 }, { "epoch": 13.003303358414389, "grad_norm": 33.727378845214844, "learning_rate": 3.2873411555414757e-06, "loss": 0.1228, "num_input_tokens_seen": 152765840, "step": 70855 }, { "epoch": 13.00422095797394, "grad_norm": 11.626641273498535, "learning_rate": 3.286588861608724e-06, "loss": 0.009, "num_input_tokens_seen": 152775952, "step": 70860 }, { "epoch": 13.005138557533492, "grad_norm": 0.7403726577758789, "learning_rate": 3.2858366116223124e-06, "loss": 0.0046, "num_input_tokens_seen": 152786608, "step": 70865 }, { "epoch": 13.006056157093045, "grad_norm": 0.6306434273719788, "learning_rate": 3.285084405601534e-06, "loss": 0.0015, "num_input_tokens_seen": 152798064, "step": 70870 }, { "epoch": 13.006973756652597, "grad_norm": 76.03296661376953, "learning_rate": 3.2843322435656844e-06, "loss": 0.1533, "num_input_tokens_seen": 152808848, "step": 70875 }, { "epoch": 13.007891356212149, "grad_norm": 0.07468630373477936, "learning_rate": 3.283580125534054e-06, "loss": 0.0515, "num_input_tokens_seen": 152819472, "step": 70880 }, { "epoch": 13.008808955771702, "grad_norm": 112.3718490600586, "learning_rate": 3.2828280515259303e-06, "loss": 0.0049, "num_input_tokens_seen": 152830192, "step": 70885 }, { "epoch": 13.009726555331254, "grad_norm": 0.04155024513602257, "learning_rate": 3.282076021560608e-06, "loss": 0.317, "num_input_tokens_seen": 152841392, "step": 70890 }, { "epoch": 13.010644154890805, "grad_norm": 0.05201868712902069, "learning_rate": 3.2813240356573732e-06, "loss": 0.001, "num_input_tokens_seen": 152851440, "step": 70895 }, { "epoch": 13.011561754450359, "grad_norm": 17.623292922973633, "learning_rate": 3.2805720938355136e-06, "loss": 0.1815, "num_input_tokens_seen": 152861904, "step": 70900 }, { "epoch": 13.01247935400991, "grad_norm": 0.030719242990016937, "learning_rate": 3.279820196114312e-06, "loss": 0.2176, "num_input_tokens_seen": 152873040, "step": 70905 }, { "epoch": 13.013396953569462, "grad_norm": 1.0506051778793335, "learning_rate": 3.279068342513059e-06, "loss": 0.0011, "num_input_tokens_seen": 152883696, "step": 70910 }, { "epoch": 13.014314553129015, "grad_norm": 60.1201057434082, "learning_rate": 3.2783165330510356e-06, "loss": 0.0465, "num_input_tokens_seen": 152894384, "step": 70915 }, { "epoch": 13.015232152688567, "grad_norm": 38.21762466430664, "learning_rate": 3.277564767747523e-06, "loss": 0.0387, "num_input_tokens_seen": 152905936, "step": 70920 }, { "epoch": 13.016149752248118, "grad_norm": 57.43684387207031, "learning_rate": 3.2768130466218063e-06, "loss": 0.1887, "num_input_tokens_seen": 152916464, "step": 70925 }, { "epoch": 13.017067351807672, "grad_norm": 0.1254017949104309, "learning_rate": 3.276061369693165e-06, "loss": 0.0961, "num_input_tokens_seen": 152927408, "step": 70930 }, { "epoch": 13.017984951367223, "grad_norm": 58.15895080566406, "learning_rate": 3.275309736980875e-06, "loss": 0.1299, "num_input_tokens_seen": 152938896, "step": 70935 }, { "epoch": 13.018902550926775, "grad_norm": 0.014472848735749722, "learning_rate": 3.274558148504219e-06, "loss": 0.3035, "num_input_tokens_seen": 152949232, "step": 70940 }, { "epoch": 13.019820150486328, "grad_norm": 5.466793537139893, "learning_rate": 3.273806604282473e-06, "loss": 0.0032, "num_input_tokens_seen": 152960496, "step": 70945 }, { "epoch": 13.02073775004588, "grad_norm": 0.0173049233853817, "learning_rate": 3.273055104334911e-06, "loss": 0.1125, "num_input_tokens_seen": 152971408, "step": 70950 }, { "epoch": 13.021655349605432, "grad_norm": 0.18588829040527344, "learning_rate": 3.2723036486808096e-06, "loss": 0.001, "num_input_tokens_seen": 152981584, "step": 70955 }, { "epoch": 13.022572949164985, "grad_norm": 4.219449520111084, "learning_rate": 3.2715522373394417e-06, "loss": 0.009, "num_input_tokens_seen": 152992688, "step": 70960 }, { "epoch": 13.023490548724537, "grad_norm": 77.83936309814453, "learning_rate": 3.27080087033008e-06, "loss": 0.0666, "num_input_tokens_seen": 153003952, "step": 70965 }, { "epoch": 13.024408148284088, "grad_norm": 0.060493435710668564, "learning_rate": 3.2700495476719956e-06, "loss": 0.0003, "num_input_tokens_seen": 153014736, "step": 70970 }, { "epoch": 13.025325747843642, "grad_norm": 0.006252165883779526, "learning_rate": 3.26929826938446e-06, "loss": 0.0004, "num_input_tokens_seen": 153026000, "step": 70975 }, { "epoch": 13.026243347403193, "grad_norm": 0.002906897570937872, "learning_rate": 3.2685470354867417e-06, "loss": 0.0005, "num_input_tokens_seen": 153034544, "step": 70980 }, { "epoch": 13.027160946962745, "grad_norm": 0.06119627133011818, "learning_rate": 3.2677958459981076e-06, "loss": 0.1663, "num_input_tokens_seen": 153044816, "step": 70985 }, { "epoch": 13.028078546522298, "grad_norm": 3.2021284103393555, "learning_rate": 3.267044700937825e-06, "loss": 0.0076, "num_input_tokens_seen": 153054896, "step": 70990 }, { "epoch": 13.02899614608185, "grad_norm": 0.0015118473675101995, "learning_rate": 3.266293600325161e-06, "loss": 0.0176, "num_input_tokens_seen": 153067600, "step": 70995 }, { "epoch": 13.029913745641402, "grad_norm": 17.86652183532715, "learning_rate": 3.2655425441793788e-06, "loss": 0.3079, "num_input_tokens_seen": 153077520, "step": 71000 }, { "epoch": 13.030831345200955, "grad_norm": 0.06849687546491623, "learning_rate": 3.264791532519741e-06, "loss": 0.0731, "num_input_tokens_seen": 153088336, "step": 71005 }, { "epoch": 13.031748944760507, "grad_norm": 0.025709431618452072, "learning_rate": 3.264040565365512e-06, "loss": 0.1165, "num_input_tokens_seen": 153099024, "step": 71010 }, { "epoch": 13.032666544320058, "grad_norm": 0.08515971899032593, "learning_rate": 3.2632896427359527e-06, "loss": 0.0002, "num_input_tokens_seen": 153109584, "step": 71015 }, { "epoch": 13.033584143879612, "grad_norm": 0.1262408047914505, "learning_rate": 3.2625387646503202e-06, "loss": 0.1919, "num_input_tokens_seen": 153120272, "step": 71020 }, { "epoch": 13.034501743439163, "grad_norm": 47.32099914550781, "learning_rate": 3.2617879311278776e-06, "loss": 0.0686, "num_input_tokens_seen": 153130288, "step": 71025 }, { "epoch": 13.035419342998715, "grad_norm": 405.1851501464844, "learning_rate": 3.2610371421878813e-06, "loss": 0.0776, "num_input_tokens_seen": 153141392, "step": 71030 }, { "epoch": 13.036336942558268, "grad_norm": 0.568810760974884, "learning_rate": 3.2602863978495864e-06, "loss": 0.0005, "num_input_tokens_seen": 153152208, "step": 71035 }, { "epoch": 13.03725454211782, "grad_norm": 0.07315277308225632, "learning_rate": 3.259535698132247e-06, "loss": 0.1732, "num_input_tokens_seen": 153163216, "step": 71040 }, { "epoch": 13.038172141677371, "grad_norm": 412.9668273925781, "learning_rate": 3.2587850430551216e-06, "loss": 0.0977, "num_input_tokens_seen": 153173904, "step": 71045 }, { "epoch": 13.039089741236925, "grad_norm": 0.13798919320106506, "learning_rate": 3.2580344326374613e-06, "loss": 0.0006, "num_input_tokens_seen": 153184208, "step": 71050 }, { "epoch": 13.040007340796476, "grad_norm": 0.2267061024904251, "learning_rate": 3.2572838668985176e-06, "loss": 0.0337, "num_input_tokens_seen": 153195088, "step": 71055 }, { "epoch": 13.040924940356028, "grad_norm": 36.718807220458984, "learning_rate": 3.256533345857541e-06, "loss": 0.0974, "num_input_tokens_seen": 153205872, "step": 71060 }, { "epoch": 13.041842539915582, "grad_norm": 0.048539914190769196, "learning_rate": 3.255782869533783e-06, "loss": 0.0002, "num_input_tokens_seen": 153217040, "step": 71065 }, { "epoch": 13.042760139475133, "grad_norm": 12.489019393920898, "learning_rate": 3.25503243794649e-06, "loss": 0.1937, "num_input_tokens_seen": 153227184, "step": 71070 }, { "epoch": 13.043677739034685, "grad_norm": 14.132513999938965, "learning_rate": 3.254282051114912e-06, "loss": 0.0138, "num_input_tokens_seen": 153237200, "step": 71075 }, { "epoch": 13.044595338594238, "grad_norm": 24.42144203186035, "learning_rate": 3.253531709058293e-06, "loss": 0.1892, "num_input_tokens_seen": 153248752, "step": 71080 }, { "epoch": 13.04551293815379, "grad_norm": 0.05124236270785332, "learning_rate": 3.2527814117958785e-06, "loss": 0.1011, "num_input_tokens_seen": 153260048, "step": 71085 }, { "epoch": 13.046430537713341, "grad_norm": 88.35138702392578, "learning_rate": 3.252031159346912e-06, "loss": 0.3292, "num_input_tokens_seen": 153269520, "step": 71090 }, { "epoch": 13.047348137272895, "grad_norm": 1.4048417806625366, "learning_rate": 3.2512809517306398e-06, "loss": 0.0033, "num_input_tokens_seen": 153281072, "step": 71095 }, { "epoch": 13.048265736832446, "grad_norm": 0.013531062752008438, "learning_rate": 3.2505307889662998e-06, "loss": 0.1116, "num_input_tokens_seen": 153291408, "step": 71100 }, { "epoch": 13.049183336391998, "grad_norm": 2.7374916076660156, "learning_rate": 3.2497806710731316e-06, "loss": 0.0139, "num_input_tokens_seen": 153301456, "step": 71105 }, { "epoch": 13.050100935951551, "grad_norm": 1.992936134338379, "learning_rate": 3.2490305980703787e-06, "loss": 0.084, "num_input_tokens_seen": 153311920, "step": 71110 }, { "epoch": 13.051018535511103, "grad_norm": 6.501984119415283, "learning_rate": 3.2482805699772774e-06, "loss": 0.0014, "num_input_tokens_seen": 153322160, "step": 71115 }, { "epoch": 13.051936135070655, "grad_norm": 0.060869693756103516, "learning_rate": 3.247530586813065e-06, "loss": 0.0004, "num_input_tokens_seen": 153332656, "step": 71120 }, { "epoch": 13.052853734630208, "grad_norm": 3.2106494903564453, "learning_rate": 3.2467806485969737e-06, "loss": 0.1328, "num_input_tokens_seen": 153342384, "step": 71125 }, { "epoch": 13.05377133418976, "grad_norm": 1.6709996461868286, "learning_rate": 3.2460307553482447e-06, "loss": 0.0037, "num_input_tokens_seen": 153353808, "step": 71130 }, { "epoch": 13.054688933749311, "grad_norm": 0.0033534332178533077, "learning_rate": 3.245280907086108e-06, "loss": 0.1508, "num_input_tokens_seen": 153364752, "step": 71135 }, { "epoch": 13.055606533308865, "grad_norm": 0.01194555964320898, "learning_rate": 3.2445311038297944e-06, "loss": 0.002, "num_input_tokens_seen": 153375568, "step": 71140 }, { "epoch": 13.056524132868416, "grad_norm": 0.11433334648609161, "learning_rate": 3.243781345598539e-06, "loss": 0.0005, "num_input_tokens_seen": 153385424, "step": 71145 }, { "epoch": 13.057441732427968, "grad_norm": 0.006767845246940851, "learning_rate": 3.243031632411571e-06, "loss": 0.001, "num_input_tokens_seen": 153396880, "step": 71150 }, { "epoch": 13.058359331987521, "grad_norm": 8.231095314025879, "learning_rate": 3.2422819642881154e-06, "loss": 0.1547, "num_input_tokens_seen": 153407728, "step": 71155 }, { "epoch": 13.059276931547073, "grad_norm": 0.0022662561386823654, "learning_rate": 3.2415323412474066e-06, "loss": 0.0379, "num_input_tokens_seen": 153419344, "step": 71160 }, { "epoch": 13.060194531106625, "grad_norm": 362.72576904296875, "learning_rate": 3.2407827633086662e-06, "loss": 0.196, "num_input_tokens_seen": 153430576, "step": 71165 }, { "epoch": 13.061112130666178, "grad_norm": 0.02159154787659645, "learning_rate": 3.240033230491123e-06, "loss": 0.1175, "num_input_tokens_seen": 153440592, "step": 71170 }, { "epoch": 13.06202973022573, "grad_norm": 12.196557998657227, "learning_rate": 3.239283742813998e-06, "loss": 0.0233, "num_input_tokens_seen": 153450768, "step": 71175 }, { "epoch": 13.062947329785281, "grad_norm": 12.724932670593262, "learning_rate": 3.2385343002965156e-06, "loss": 0.0262, "num_input_tokens_seen": 153460720, "step": 71180 }, { "epoch": 13.063864929344835, "grad_norm": 0.04288781061768532, "learning_rate": 3.2377849029579e-06, "loss": 0.1955, "num_input_tokens_seen": 153472272, "step": 71185 }, { "epoch": 13.064782528904386, "grad_norm": 0.051873695105314255, "learning_rate": 3.23703555081737e-06, "loss": 0.0021, "num_input_tokens_seen": 153483088, "step": 71190 }, { "epoch": 13.065700128463938, "grad_norm": 40.666786193847656, "learning_rate": 3.2362862438941458e-06, "loss": 0.6153, "num_input_tokens_seen": 153493424, "step": 71195 }, { "epoch": 13.066617728023491, "grad_norm": 0.3718501925468445, "learning_rate": 3.2355369822074467e-06, "loss": 0.0166, "num_input_tokens_seen": 153504272, "step": 71200 }, { "epoch": 13.067535327583043, "grad_norm": 0.027591412886977196, "learning_rate": 3.234787765776487e-06, "loss": 0.0215, "num_input_tokens_seen": 153514288, "step": 71205 }, { "epoch": 13.068452927142594, "grad_norm": 0.0033937108237296343, "learning_rate": 3.2340385946204867e-06, "loss": 0.1628, "num_input_tokens_seen": 153525808, "step": 71210 }, { "epoch": 13.069370526702148, "grad_norm": 195.10377502441406, "learning_rate": 3.2332894687586602e-06, "loss": 0.2506, "num_input_tokens_seen": 153537168, "step": 71215 }, { "epoch": 13.0702881262617, "grad_norm": 0.006028375122696161, "learning_rate": 3.2325403882102204e-06, "loss": 0.0004, "num_input_tokens_seen": 153547920, "step": 71220 }, { "epoch": 13.071205725821251, "grad_norm": 0.2571858763694763, "learning_rate": 3.2317913529943782e-06, "loss": 0.1556, "num_input_tokens_seen": 153558544, "step": 71225 }, { "epoch": 13.072123325380804, "grad_norm": 0.6758586764335632, "learning_rate": 3.23104236313035e-06, "loss": 0.0008, "num_input_tokens_seen": 153567312, "step": 71230 }, { "epoch": 13.073040924940356, "grad_norm": 0.1241457536816597, "learning_rate": 3.2302934186373426e-06, "loss": 0.0002, "num_input_tokens_seen": 153577872, "step": 71235 }, { "epoch": 13.073958524499908, "grad_norm": 0.02482284978032112, "learning_rate": 3.229544519534565e-06, "loss": 0.0001, "num_input_tokens_seen": 153588592, "step": 71240 }, { "epoch": 13.074876124059461, "grad_norm": 1.9628833532333374, "learning_rate": 3.228795665841228e-06, "loss": 0.0006, "num_input_tokens_seen": 153598704, "step": 71245 }, { "epoch": 13.075793723619013, "grad_norm": 0.05080031976103783, "learning_rate": 3.228046857576537e-06, "loss": 0.2939, "num_input_tokens_seen": 153609008, "step": 71250 }, { "epoch": 13.076711323178564, "grad_norm": 128.93919372558594, "learning_rate": 3.2272980947596967e-06, "loss": 0.0426, "num_input_tokens_seen": 153619472, "step": 71255 }, { "epoch": 13.077628922738118, "grad_norm": 28.750892639160156, "learning_rate": 3.2265493774099138e-06, "loss": 0.0027, "num_input_tokens_seen": 153629072, "step": 71260 }, { "epoch": 13.07854652229767, "grad_norm": 0.1898680329322815, "learning_rate": 3.2258007055463913e-06, "loss": 0.0146, "num_input_tokens_seen": 153640560, "step": 71265 }, { "epoch": 13.079464121857221, "grad_norm": 0.027984188869595528, "learning_rate": 3.225052079188331e-06, "loss": 0.2252, "num_input_tokens_seen": 153651568, "step": 71270 }, { "epoch": 13.080381721416774, "grad_norm": 0.020861364901065826, "learning_rate": 3.2243034983549326e-06, "loss": 0.3385, "num_input_tokens_seen": 153662384, "step": 71275 }, { "epoch": 13.081299320976326, "grad_norm": 0.22235965728759766, "learning_rate": 3.2235549630653974e-06, "loss": 0.1572, "num_input_tokens_seen": 153673040, "step": 71280 }, { "epoch": 13.082216920535878, "grad_norm": 0.08973953872919083, "learning_rate": 3.2228064733389254e-06, "loss": 0.0801, "num_input_tokens_seen": 153684688, "step": 71285 }, { "epoch": 13.083134520095431, "grad_norm": 0.14806221425533295, "learning_rate": 3.222058029194712e-06, "loss": 0.0008, "num_input_tokens_seen": 153695248, "step": 71290 }, { "epoch": 13.084052119654983, "grad_norm": 0.042588312178850174, "learning_rate": 3.2213096306519553e-06, "loss": 0.3006, "num_input_tokens_seen": 153705456, "step": 71295 }, { "epoch": 13.084969719214534, "grad_norm": 0.006257897708564997, "learning_rate": 3.22056127772985e-06, "loss": 0.007, "num_input_tokens_seen": 153715888, "step": 71300 }, { "epoch": 13.085887318774088, "grad_norm": 0.04071377217769623, "learning_rate": 3.219812970447589e-06, "loss": 0.0108, "num_input_tokens_seen": 153727344, "step": 71305 }, { "epoch": 13.08680491833364, "grad_norm": 0.03273070231080055, "learning_rate": 3.2190647088243665e-06, "loss": 0.0589, "num_input_tokens_seen": 153737712, "step": 71310 }, { "epoch": 13.08772251789319, "grad_norm": 0.020574698224663734, "learning_rate": 3.2183164928793746e-06, "loss": 0.0293, "num_input_tokens_seen": 153748464, "step": 71315 }, { "epoch": 13.088640117452744, "grad_norm": 85.66382598876953, "learning_rate": 3.217568322631803e-06, "loss": 0.0888, "num_input_tokens_seen": 153757776, "step": 71320 }, { "epoch": 13.089557717012296, "grad_norm": 0.05541342496871948, "learning_rate": 3.2168201981008406e-06, "loss": 0.0007, "num_input_tokens_seen": 153767376, "step": 71325 }, { "epoch": 13.090475316571847, "grad_norm": 0.00668299337849021, "learning_rate": 3.2160721193056774e-06, "loss": 0.1205, "num_input_tokens_seen": 153777968, "step": 71330 }, { "epoch": 13.0913929161314, "grad_norm": 2.02807354927063, "learning_rate": 3.2153240862655e-06, "loss": 0.003, "num_input_tokens_seen": 153790160, "step": 71335 }, { "epoch": 13.092310515690952, "grad_norm": 0.015952304005622864, "learning_rate": 3.2145760989994917e-06, "loss": 0.0537, "num_input_tokens_seen": 153800368, "step": 71340 }, { "epoch": 13.093228115250504, "grad_norm": 58.30707931518555, "learning_rate": 3.2138281575268414e-06, "loss": 0.3602, "num_input_tokens_seen": 153810832, "step": 71345 }, { "epoch": 13.094145714810058, "grad_norm": 0.0030232323333621025, "learning_rate": 3.2130802618667308e-06, "loss": 0.0029, "num_input_tokens_seen": 153820848, "step": 71350 }, { "epoch": 13.09506331436961, "grad_norm": 0.033963628113269806, "learning_rate": 3.2123324120383414e-06, "loss": 0.0823, "num_input_tokens_seen": 153830736, "step": 71355 }, { "epoch": 13.09598091392916, "grad_norm": 0.024662282317876816, "learning_rate": 3.2115846080608533e-06, "loss": 0.0099, "num_input_tokens_seen": 153842096, "step": 71360 }, { "epoch": 13.096898513488714, "grad_norm": 10.426054954528809, "learning_rate": 3.2108368499534503e-06, "loss": 0.1915, "num_input_tokens_seen": 153852368, "step": 71365 }, { "epoch": 13.097816113048266, "grad_norm": 77.57247161865234, "learning_rate": 3.2100891377353083e-06, "loss": 0.4384, "num_input_tokens_seen": 153862736, "step": 71370 }, { "epoch": 13.098733712607817, "grad_norm": 0.27654391527175903, "learning_rate": 3.209341471425605e-06, "loss": 0.0003, "num_input_tokens_seen": 153873232, "step": 71375 }, { "epoch": 13.09965131216737, "grad_norm": 59.96528244018555, "learning_rate": 3.2085938510435188e-06, "loss": 0.3624, "num_input_tokens_seen": 153884848, "step": 71380 }, { "epoch": 13.100568911726922, "grad_norm": 0.007201671600341797, "learning_rate": 3.207846276608224e-06, "loss": 0.0705, "num_input_tokens_seen": 153895824, "step": 71385 }, { "epoch": 13.101486511286474, "grad_norm": 0.04587050527334213, "learning_rate": 3.2070987481388942e-06, "loss": 0.059, "num_input_tokens_seen": 153906544, "step": 71390 }, { "epoch": 13.102404110846027, "grad_norm": 0.008700371719896793, "learning_rate": 3.2063512656547036e-06, "loss": 0.0143, "num_input_tokens_seen": 153918096, "step": 71395 }, { "epoch": 13.103321710405579, "grad_norm": 0.037495192140340805, "learning_rate": 3.205603829174823e-06, "loss": 0.0534, "num_input_tokens_seen": 153928496, "step": 71400 }, { "epoch": 13.10423930996513, "grad_norm": 1.045301079750061, "learning_rate": 3.204856438718422e-06, "loss": 0.0005, "num_input_tokens_seen": 153939760, "step": 71405 }, { "epoch": 13.105156909524684, "grad_norm": 0.19095827639102936, "learning_rate": 3.2041090943046715e-06, "loss": 0.0003, "num_input_tokens_seen": 153950480, "step": 71410 }, { "epoch": 13.106074509084236, "grad_norm": 0.397036075592041, "learning_rate": 3.20336179595274e-06, "loss": 0.0544, "num_input_tokens_seen": 153961936, "step": 71415 }, { "epoch": 13.106992108643787, "grad_norm": 49.38991928100586, "learning_rate": 3.202614543681794e-06, "loss": 0.1634, "num_input_tokens_seen": 153973200, "step": 71420 }, { "epoch": 13.10790970820334, "grad_norm": 3.4438321590423584, "learning_rate": 3.201867337510997e-06, "loss": 0.1628, "num_input_tokens_seen": 153983920, "step": 71425 }, { "epoch": 13.108827307762892, "grad_norm": 0.02740592695772648, "learning_rate": 3.2011201774595187e-06, "loss": 0.2998, "num_input_tokens_seen": 153994736, "step": 71430 }, { "epoch": 13.109744907322444, "grad_norm": 0.010026942007243633, "learning_rate": 3.2003730635465193e-06, "loss": 0.0004, "num_input_tokens_seen": 154006288, "step": 71435 }, { "epoch": 13.110662506881997, "grad_norm": 1.5046104192733765, "learning_rate": 3.199625995791161e-06, "loss": 0.0011, "num_input_tokens_seen": 154016304, "step": 71440 }, { "epoch": 13.111580106441549, "grad_norm": 0.027262644842267036, "learning_rate": 3.1988789742126046e-06, "loss": 0.0451, "num_input_tokens_seen": 154027248, "step": 71445 }, { "epoch": 13.1124977060011, "grad_norm": 5.023512363433838, "learning_rate": 3.198131998830013e-06, "loss": 0.065, "num_input_tokens_seen": 154035632, "step": 71450 }, { "epoch": 13.113415305560654, "grad_norm": 1.3392761945724487, "learning_rate": 3.1973850696625424e-06, "loss": 0.0106, "num_input_tokens_seen": 154046576, "step": 71455 }, { "epoch": 13.114332905120206, "grad_norm": 40.86336135864258, "learning_rate": 3.1966381867293494e-06, "loss": 0.0804, "num_input_tokens_seen": 154058352, "step": 71460 }, { "epoch": 13.115250504679757, "grad_norm": 0.1509907990694046, "learning_rate": 3.1958913500495937e-06, "loss": 0.0015, "num_input_tokens_seen": 154070192, "step": 71465 }, { "epoch": 13.11616810423931, "grad_norm": 0.017826585099101067, "learning_rate": 3.1951445596424293e-06, "loss": 0.0007, "num_input_tokens_seen": 154081680, "step": 71470 }, { "epoch": 13.117085703798862, "grad_norm": 0.006439792457967997, "learning_rate": 3.1943978155270066e-06, "loss": 0.1037, "num_input_tokens_seen": 154093072, "step": 71475 }, { "epoch": 13.118003303358414, "grad_norm": 135.95217895507812, "learning_rate": 3.193651117722484e-06, "loss": 0.1994, "num_input_tokens_seen": 154103984, "step": 71480 }, { "epoch": 13.118920902917967, "grad_norm": 0.6245043277740479, "learning_rate": 3.1929044662480115e-06, "loss": 0.0725, "num_input_tokens_seen": 154115120, "step": 71485 }, { "epoch": 13.119838502477519, "grad_norm": 0.10954917222261429, "learning_rate": 3.1921578611227377e-06, "loss": 0.0246, "num_input_tokens_seen": 154125456, "step": 71490 }, { "epoch": 13.12075610203707, "grad_norm": 0.33005183935165405, "learning_rate": 3.191411302365812e-06, "loss": 0.2341, "num_input_tokens_seen": 154136240, "step": 71495 }, { "epoch": 13.121673701596624, "grad_norm": 0.9581148624420166, "learning_rate": 3.1906647899963834e-06, "loss": 0.0916, "num_input_tokens_seen": 154147248, "step": 71500 }, { "epoch": 13.122591301156175, "grad_norm": 0.003194863675162196, "learning_rate": 3.1899183240335994e-06, "loss": 0.2132, "num_input_tokens_seen": 154159504, "step": 71505 }, { "epoch": 13.123508900715727, "grad_norm": 0.011856593191623688, "learning_rate": 3.1891719044966044e-06, "loss": 0.0006, "num_input_tokens_seen": 154171248, "step": 71510 }, { "epoch": 13.12442650027528, "grad_norm": 0.018848396837711334, "learning_rate": 3.188425531404545e-06, "loss": 0.0068, "num_input_tokens_seen": 154182512, "step": 71515 }, { "epoch": 13.125344099834832, "grad_norm": 0.22172439098358154, "learning_rate": 3.1876792047765627e-06, "loss": 0.0004, "num_input_tokens_seen": 154192336, "step": 71520 }, { "epoch": 13.126261699394384, "grad_norm": 0.051566146314144135, "learning_rate": 3.186932924631797e-06, "loss": 0.0485, "num_input_tokens_seen": 154202672, "step": 71525 }, { "epoch": 13.127179298953937, "grad_norm": 0.00454450910910964, "learning_rate": 3.1861866909893953e-06, "loss": 0.0212, "num_input_tokens_seen": 154213872, "step": 71530 }, { "epoch": 13.128096898513489, "grad_norm": 0.01965625211596489, "learning_rate": 3.185440503868493e-06, "loss": 0.0002, "num_input_tokens_seen": 154224528, "step": 71535 }, { "epoch": 13.12901449807304, "grad_norm": 0.9311009645462036, "learning_rate": 3.1846943632882294e-06, "loss": 0.0004, "num_input_tokens_seen": 154236144, "step": 71540 }, { "epoch": 13.129932097632594, "grad_norm": 0.007345850113779306, "learning_rate": 3.1839482692677405e-06, "loss": 0.0002, "num_input_tokens_seen": 154247504, "step": 71545 }, { "epoch": 13.130849697192145, "grad_norm": 10.047002792358398, "learning_rate": 3.1832022218261648e-06, "loss": 0.0025, "num_input_tokens_seen": 154258192, "step": 71550 }, { "epoch": 13.131767296751697, "grad_norm": 205.9230499267578, "learning_rate": 3.182456220982637e-06, "loss": 0.2695, "num_input_tokens_seen": 154270384, "step": 71555 }, { "epoch": 13.13268489631125, "grad_norm": 0.1537584513425827, "learning_rate": 3.1817102667562883e-06, "loss": 0.0014, "num_input_tokens_seen": 154280272, "step": 71560 }, { "epoch": 13.133602495870802, "grad_norm": 0.02530340477824211, "learning_rate": 3.1809643591662553e-06, "loss": 0.0017, "num_input_tokens_seen": 154290768, "step": 71565 }, { "epoch": 13.134520095430354, "grad_norm": 70.34294128417969, "learning_rate": 3.180218498231667e-06, "loss": 0.2262, "num_input_tokens_seen": 154300816, "step": 71570 }, { "epoch": 13.135437694989907, "grad_norm": 0.06833329796791077, "learning_rate": 3.179472683971654e-06, "loss": 0.2535, "num_input_tokens_seen": 154311472, "step": 71575 }, { "epoch": 13.136355294549459, "grad_norm": 6.867706775665283, "learning_rate": 3.1787269164053425e-06, "loss": 0.3115, "num_input_tokens_seen": 154322096, "step": 71580 }, { "epoch": 13.13727289410901, "grad_norm": 0.33730772137641907, "learning_rate": 3.1779811955518652e-06, "loss": 0.0003, "num_input_tokens_seen": 154332752, "step": 71585 }, { "epoch": 13.138190493668564, "grad_norm": 0.002314326586201787, "learning_rate": 3.1772355214303464e-06, "loss": 0.01, "num_input_tokens_seen": 154344080, "step": 71590 }, { "epoch": 13.139108093228115, "grad_norm": 1.053057312965393, "learning_rate": 3.176489894059911e-06, "loss": 0.0409, "num_input_tokens_seen": 154355120, "step": 71595 }, { "epoch": 13.140025692787667, "grad_norm": 1.0062634944915771, "learning_rate": 3.1757443134596827e-06, "loss": 0.003, "num_input_tokens_seen": 154366224, "step": 71600 }, { "epoch": 13.14094329234722, "grad_norm": 0.02333112619817257, "learning_rate": 3.174998779648787e-06, "loss": 0.2267, "num_input_tokens_seen": 154377392, "step": 71605 }, { "epoch": 13.141860891906772, "grad_norm": 37.78662109375, "learning_rate": 3.1742532926463427e-06, "loss": 0.062, "num_input_tokens_seen": 154387376, "step": 71610 }, { "epoch": 13.142778491466323, "grad_norm": 0.018107641488313675, "learning_rate": 3.173507852471473e-06, "loss": 0.1575, "num_input_tokens_seen": 154398640, "step": 71615 }, { "epoch": 13.143696091025877, "grad_norm": 44.685150146484375, "learning_rate": 3.1727624591432958e-06, "loss": 0.2671, "num_input_tokens_seen": 154408400, "step": 71620 }, { "epoch": 13.144613690585429, "grad_norm": 0.07006987184286118, "learning_rate": 3.172017112680929e-06, "loss": 0.0951, "num_input_tokens_seen": 154418320, "step": 71625 }, { "epoch": 13.14553129014498, "grad_norm": 0.15862394869327545, "learning_rate": 3.1712718131034902e-06, "loss": 0.0889, "num_input_tokens_seen": 154429936, "step": 71630 }, { "epoch": 13.146448889704534, "grad_norm": 0.05670516937971115, "learning_rate": 3.170526560430096e-06, "loss": 0.1848, "num_input_tokens_seen": 154441904, "step": 71635 }, { "epoch": 13.147366489264085, "grad_norm": 0.10186441242694855, "learning_rate": 3.16978135467986e-06, "loss": 0.0663, "num_input_tokens_seen": 154452112, "step": 71640 }, { "epoch": 13.148284088823637, "grad_norm": 0.025271093472838402, "learning_rate": 3.1690361958718935e-06, "loss": 0.0004, "num_input_tokens_seen": 154462864, "step": 71645 }, { "epoch": 13.14920168838319, "grad_norm": 0.06178475171327591, "learning_rate": 3.1682910840253132e-06, "loss": 0.0016, "num_input_tokens_seen": 154473488, "step": 71650 }, { "epoch": 13.150119287942742, "grad_norm": 0.13625985383987427, "learning_rate": 3.1675460191592277e-06, "loss": 0.1354, "num_input_tokens_seen": 154485456, "step": 71655 }, { "epoch": 13.151036887502293, "grad_norm": 20.435487747192383, "learning_rate": 3.166801001292744e-06, "loss": 0.1612, "num_input_tokens_seen": 154495472, "step": 71660 }, { "epoch": 13.151954487061847, "grad_norm": 0.5549809336662292, "learning_rate": 3.166056030444976e-06, "loss": 0.1567, "num_input_tokens_seen": 154506544, "step": 71665 }, { "epoch": 13.152872086621398, "grad_norm": 37.514930725097656, "learning_rate": 3.165311106635029e-06, "loss": 0.2716, "num_input_tokens_seen": 154516944, "step": 71670 }, { "epoch": 13.15378968618095, "grad_norm": 13.46055793762207, "learning_rate": 3.1645662298820077e-06, "loss": 0.0786, "num_input_tokens_seen": 154526640, "step": 71675 }, { "epoch": 13.154707285740503, "grad_norm": 2.4752745628356934, "learning_rate": 3.1638214002050165e-06, "loss": 0.0016, "num_input_tokens_seen": 154536496, "step": 71680 }, { "epoch": 13.155624885300055, "grad_norm": 20.726558685302734, "learning_rate": 3.1630766176231626e-06, "loss": 0.1378, "num_input_tokens_seen": 154548048, "step": 71685 }, { "epoch": 13.156542484859607, "grad_norm": 0.01828124187886715, "learning_rate": 3.162331882155546e-06, "loss": 0.0021, "num_input_tokens_seen": 154559536, "step": 71690 }, { "epoch": 13.15746008441916, "grad_norm": 0.1407509446144104, "learning_rate": 3.161587193821267e-06, "loss": 0.0921, "num_input_tokens_seen": 154570704, "step": 71695 }, { "epoch": 13.158377683978712, "grad_norm": 121.96760559082031, "learning_rate": 3.1608425526394286e-06, "loss": 0.0654, "num_input_tokens_seen": 154581104, "step": 71700 }, { "epoch": 13.159295283538263, "grad_norm": 147.0907440185547, "learning_rate": 3.160097958629129e-06, "loss": 0.0539, "num_input_tokens_seen": 154592976, "step": 71705 }, { "epoch": 13.160212883097817, "grad_norm": 0.01727571152150631, "learning_rate": 3.159353411809464e-06, "loss": 0.0007, "num_input_tokens_seen": 154603536, "step": 71710 }, { "epoch": 13.161130482657368, "grad_norm": 0.08274440467357635, "learning_rate": 3.1586089121995316e-06, "loss": 0.179, "num_input_tokens_seen": 154615472, "step": 71715 }, { "epoch": 13.16204808221692, "grad_norm": 0.0409957692027092, "learning_rate": 3.157864459818426e-06, "loss": 0.0006, "num_input_tokens_seen": 154625936, "step": 71720 }, { "epoch": 13.162965681776473, "grad_norm": 0.10735607147216797, "learning_rate": 3.1571200546852432e-06, "loss": 0.0785, "num_input_tokens_seen": 154637008, "step": 71725 }, { "epoch": 13.163883281336025, "grad_norm": 39.51313781738281, "learning_rate": 3.156375696819074e-06, "loss": 0.0123, "num_input_tokens_seen": 154648464, "step": 71730 }, { "epoch": 13.164800880895577, "grad_norm": 0.18228508532047272, "learning_rate": 3.1556313862390116e-06, "loss": 0.0921, "num_input_tokens_seen": 154659184, "step": 71735 }, { "epoch": 13.16571848045513, "grad_norm": 0.0023560260888189077, "learning_rate": 3.154887122964145e-06, "loss": 0.4285, "num_input_tokens_seen": 154669776, "step": 71740 }, { "epoch": 13.166636080014682, "grad_norm": 0.010637783445417881, "learning_rate": 3.154142907013563e-06, "loss": 0.0662, "num_input_tokens_seen": 154680528, "step": 71745 }, { "epoch": 13.167553679574233, "grad_norm": 0.006668445188552141, "learning_rate": 3.1533987384063565e-06, "loss": 0.0478, "num_input_tokens_seen": 154691120, "step": 71750 }, { "epoch": 13.168471279133787, "grad_norm": 0.024307282641530037, "learning_rate": 3.15265461716161e-06, "loss": 0.0006, "num_input_tokens_seen": 154702320, "step": 71755 }, { "epoch": 13.169388878693338, "grad_norm": 0.028562184423208237, "learning_rate": 3.1519105432984098e-06, "loss": 0.0002, "num_input_tokens_seen": 154711920, "step": 71760 }, { "epoch": 13.17030647825289, "grad_norm": 0.04542160779237747, "learning_rate": 3.1511665168358374e-06, "loss": 0.0429, "num_input_tokens_seen": 154722832, "step": 71765 }, { "epoch": 13.171224077812443, "grad_norm": 0.020125634968280792, "learning_rate": 3.150422537792981e-06, "loss": 0.2166, "num_input_tokens_seen": 154733968, "step": 71770 }, { "epoch": 13.172141677371995, "grad_norm": 0.0060592591762542725, "learning_rate": 3.149678606188919e-06, "loss": 0.0015, "num_input_tokens_seen": 154744752, "step": 71775 }, { "epoch": 13.173059276931546, "grad_norm": 0.0016048192046582699, "learning_rate": 3.148934722042731e-06, "loss": 0.1846, "num_input_tokens_seen": 154756016, "step": 71780 }, { "epoch": 13.1739768764911, "grad_norm": 0.04561295732855797, "learning_rate": 3.1481908853735018e-06, "loss": 0.1161, "num_input_tokens_seen": 154766992, "step": 71785 }, { "epoch": 13.174894476050651, "grad_norm": 46.52846908569336, "learning_rate": 3.147447096200306e-06, "loss": 0.1602, "num_input_tokens_seen": 154778096, "step": 71790 }, { "epoch": 13.175812075610203, "grad_norm": 0.013602696359157562, "learning_rate": 3.1467033545422184e-06, "loss": 0.1096, "num_input_tokens_seen": 154788624, "step": 71795 }, { "epoch": 13.176729675169756, "grad_norm": 0.07720315456390381, "learning_rate": 3.1459596604183197e-06, "loss": 0.0175, "num_input_tokens_seen": 154798864, "step": 71800 }, { "epoch": 13.177647274729308, "grad_norm": 0.24298204481601715, "learning_rate": 3.1452160138476817e-06, "loss": 0.0291, "num_input_tokens_seen": 154809456, "step": 71805 }, { "epoch": 13.17856487428886, "grad_norm": 48.66757583618164, "learning_rate": 3.1444724148493786e-06, "loss": 0.1043, "num_input_tokens_seen": 154819472, "step": 71810 }, { "epoch": 13.179482473848413, "grad_norm": 16.457305908203125, "learning_rate": 3.1437288634424814e-06, "loss": 0.4056, "num_input_tokens_seen": 154830768, "step": 71815 }, { "epoch": 13.180400073407965, "grad_norm": 0.059673406183719635, "learning_rate": 3.142985359646062e-06, "loss": 0.1031, "num_input_tokens_seen": 154841936, "step": 71820 }, { "epoch": 13.181317672967516, "grad_norm": 0.03671436011791229, "learning_rate": 3.1422419034791905e-06, "loss": 0.0978, "num_input_tokens_seen": 154854032, "step": 71825 }, { "epoch": 13.18223527252707, "grad_norm": 0.26289379596710205, "learning_rate": 3.1414984949609345e-06, "loss": 0.0006, "num_input_tokens_seen": 154863408, "step": 71830 }, { "epoch": 13.183152872086621, "grad_norm": 0.004184520337730646, "learning_rate": 3.1407551341103626e-06, "loss": 0.1149, "num_input_tokens_seen": 154874704, "step": 71835 }, { "epoch": 13.184070471646173, "grad_norm": 67.04694366455078, "learning_rate": 3.1400118209465395e-06, "loss": 0.1559, "num_input_tokens_seen": 154887056, "step": 71840 }, { "epoch": 13.184988071205726, "grad_norm": 69.83746337890625, "learning_rate": 3.139268555488529e-06, "loss": 0.1382, "num_input_tokens_seen": 154896784, "step": 71845 }, { "epoch": 13.185905670765278, "grad_norm": 0.07834108173847198, "learning_rate": 3.138525337755398e-06, "loss": 0.1135, "num_input_tokens_seen": 154907760, "step": 71850 }, { "epoch": 13.18682327032483, "grad_norm": 0.1990874707698822, "learning_rate": 3.137782167766207e-06, "loss": 0.0456, "num_input_tokens_seen": 154918416, "step": 71855 }, { "epoch": 13.187740869884383, "grad_norm": 7.2877960205078125, "learning_rate": 3.137039045540017e-06, "loss": 0.0421, "num_input_tokens_seen": 154929264, "step": 71860 }, { "epoch": 13.188658469443935, "grad_norm": 0.07813075184822083, "learning_rate": 3.136295971095886e-06, "loss": 0.02, "num_input_tokens_seen": 154939792, "step": 71865 }, { "epoch": 13.189576069003486, "grad_norm": 0.014907480217516422, "learning_rate": 3.1355529444528777e-06, "loss": 0.2627, "num_input_tokens_seen": 154950832, "step": 71870 }, { "epoch": 13.19049366856304, "grad_norm": 0.2159176766872406, "learning_rate": 3.134809965630047e-06, "loss": 0.1412, "num_input_tokens_seen": 154961744, "step": 71875 }, { "epoch": 13.191411268122591, "grad_norm": 0.132122203707695, "learning_rate": 3.1340670346464465e-06, "loss": 0.0019, "num_input_tokens_seen": 154971824, "step": 71880 }, { "epoch": 13.192328867682143, "grad_norm": 0.04152701795101166, "learning_rate": 3.1333241515211376e-06, "loss": 0.0539, "num_input_tokens_seen": 154982896, "step": 71885 }, { "epoch": 13.193246467241696, "grad_norm": 343.2847900390625, "learning_rate": 3.1325813162731705e-06, "loss": 0.1539, "num_input_tokens_seen": 154993200, "step": 71890 }, { "epoch": 13.194164066801248, "grad_norm": 23.29502296447754, "learning_rate": 3.131838528921599e-06, "loss": 0.0778, "num_input_tokens_seen": 155004912, "step": 71895 }, { "epoch": 13.1950816663608, "grad_norm": 0.23888035118579865, "learning_rate": 3.1310957894854717e-06, "loss": 0.003, "num_input_tokens_seen": 155016176, "step": 71900 }, { "epoch": 13.195999265920353, "grad_norm": 0.16609449684619904, "learning_rate": 3.1303530979838425e-06, "loss": 0.0005, "num_input_tokens_seen": 155027536, "step": 71905 }, { "epoch": 13.196916865479905, "grad_norm": 0.16073083877563477, "learning_rate": 3.1296104544357587e-06, "loss": 0.2317, "num_input_tokens_seen": 155038672, "step": 71910 }, { "epoch": 13.197834465039456, "grad_norm": 51.79633331298828, "learning_rate": 3.128867858860266e-06, "loss": 0.1501, "num_input_tokens_seen": 155048528, "step": 71915 }, { "epoch": 13.19875206459901, "grad_norm": 0.07998076826334, "learning_rate": 3.1281253112764154e-06, "loss": 0.0024, "num_input_tokens_seen": 155058608, "step": 71920 }, { "epoch": 13.199669664158561, "grad_norm": 0.057448841631412506, "learning_rate": 3.127382811703249e-06, "loss": 0.0044, "num_input_tokens_seen": 155069392, "step": 71925 }, { "epoch": 13.200587263718113, "grad_norm": 76.11669158935547, "learning_rate": 3.1266403601598094e-06, "loss": 0.0233, "num_input_tokens_seen": 155079728, "step": 71930 }, { "epoch": 13.201504863277666, "grad_norm": 305.6962585449219, "learning_rate": 3.1258979566651426e-06, "loss": 0.2304, "num_input_tokens_seen": 155090000, "step": 71935 }, { "epoch": 13.202422462837218, "grad_norm": 0.3834802210330963, "learning_rate": 3.125155601238289e-06, "loss": 0.1378, "num_input_tokens_seen": 155100720, "step": 71940 }, { "epoch": 13.20334006239677, "grad_norm": 0.027612753212451935, "learning_rate": 3.1244132938982873e-06, "loss": 0.0942, "num_input_tokens_seen": 155112080, "step": 71945 }, { "epoch": 13.204257661956323, "grad_norm": 0.042928699404001236, "learning_rate": 3.1236710346641776e-06, "loss": 0.1009, "num_input_tokens_seen": 155122736, "step": 71950 }, { "epoch": 13.205175261515874, "grad_norm": 0.0026086901780217886, "learning_rate": 3.122928823554999e-06, "loss": 0.0001, "num_input_tokens_seen": 155133808, "step": 71955 }, { "epoch": 13.206092861075426, "grad_norm": 0.040131647139787674, "learning_rate": 3.1221866605897868e-06, "loss": 0.001, "num_input_tokens_seen": 155144080, "step": 71960 }, { "epoch": 13.20701046063498, "grad_norm": 0.012731397524476051, "learning_rate": 3.121444545787574e-06, "loss": 0.0002, "num_input_tokens_seen": 155155696, "step": 71965 }, { "epoch": 13.207928060194531, "grad_norm": 0.007303131278604269, "learning_rate": 3.1207024791674e-06, "loss": 0.0001, "num_input_tokens_seen": 155166704, "step": 71970 }, { "epoch": 13.208845659754083, "grad_norm": 0.019916487857699394, "learning_rate": 3.1199604607482942e-06, "loss": 0.0002, "num_input_tokens_seen": 155177104, "step": 71975 }, { "epoch": 13.209763259313636, "grad_norm": 0.11593657732009888, "learning_rate": 3.1192184905492865e-06, "loss": 0.0979, "num_input_tokens_seen": 155188176, "step": 71980 }, { "epoch": 13.210680858873188, "grad_norm": 16.004594802856445, "learning_rate": 3.1184765685894125e-06, "loss": 0.0033, "num_input_tokens_seen": 155197904, "step": 71985 }, { "epoch": 13.21159845843274, "grad_norm": 0.03126319497823715, "learning_rate": 3.1177346948876974e-06, "loss": 0.0042, "num_input_tokens_seen": 155209008, "step": 71990 }, { "epoch": 13.212516057992293, "grad_norm": 0.013343901373445988, "learning_rate": 3.1169928694631706e-06, "loss": 0.0022, "num_input_tokens_seen": 155219984, "step": 71995 }, { "epoch": 13.213433657551844, "grad_norm": 0.011760561726987362, "learning_rate": 3.1162510923348564e-06, "loss": 0.0002, "num_input_tokens_seen": 155230992, "step": 72000 }, { "epoch": 13.214351257111396, "grad_norm": 0.017747623845934868, "learning_rate": 3.1155093635217836e-06, "loss": 0.0012, "num_input_tokens_seen": 155242224, "step": 72005 }, { "epoch": 13.21526885667095, "grad_norm": 0.18755674362182617, "learning_rate": 3.114767683042976e-06, "loss": 0.0185, "num_input_tokens_seen": 155252464, "step": 72010 }, { "epoch": 13.216186456230501, "grad_norm": 0.1778968721628189, "learning_rate": 3.1140260509174523e-06, "loss": 0.2514, "num_input_tokens_seen": 155264624, "step": 72015 }, { "epoch": 13.217104055790053, "grad_norm": 0.1345553696155548, "learning_rate": 3.1132844671642405e-06, "loss": 0.4002, "num_input_tokens_seen": 155275792, "step": 72020 }, { "epoch": 13.218021655349606, "grad_norm": 0.18622051179409027, "learning_rate": 3.112542931802357e-06, "loss": 0.0003, "num_input_tokens_seen": 155285936, "step": 72025 }, { "epoch": 13.218939254909158, "grad_norm": 0.053071971982717514, "learning_rate": 3.1118014448508223e-06, "loss": 0.0525, "num_input_tokens_seen": 155297584, "step": 72030 }, { "epoch": 13.21985685446871, "grad_norm": 87.69181823730469, "learning_rate": 3.111060006328653e-06, "loss": 0.1291, "num_input_tokens_seen": 155308656, "step": 72035 }, { "epoch": 13.220774454028263, "grad_norm": 0.005511841736733913, "learning_rate": 3.110318616254867e-06, "loss": 0.2614, "num_input_tokens_seen": 155319440, "step": 72040 }, { "epoch": 13.221692053587814, "grad_norm": 0.23519784212112427, "learning_rate": 3.109577274648481e-06, "loss": 0.0051, "num_input_tokens_seen": 155330896, "step": 72045 }, { "epoch": 13.222609653147366, "grad_norm": 0.09559135884046555, "learning_rate": 3.108835981528507e-06, "loss": 0.1604, "num_input_tokens_seen": 155342448, "step": 72050 }, { "epoch": 13.22352725270692, "grad_norm": 52.2868537902832, "learning_rate": 3.1080947369139603e-06, "loss": 0.3595, "num_input_tokens_seen": 155351888, "step": 72055 }, { "epoch": 13.22444485226647, "grad_norm": 0.026317914947867393, "learning_rate": 3.107353540823851e-06, "loss": 0.2456, "num_input_tokens_seen": 155361968, "step": 72060 }, { "epoch": 13.225362451826022, "grad_norm": 0.12580269575119019, "learning_rate": 3.1066123932771873e-06, "loss": 0.0002, "num_input_tokens_seen": 155372944, "step": 72065 }, { "epoch": 13.226280051385576, "grad_norm": 132.09292602539062, "learning_rate": 3.105871294292985e-06, "loss": 0.1503, "num_input_tokens_seen": 155383952, "step": 72070 }, { "epoch": 13.227197650945127, "grad_norm": 133.15689086914062, "learning_rate": 3.1051302438902463e-06, "loss": 0.174, "num_input_tokens_seen": 155395248, "step": 72075 }, { "epoch": 13.228115250504679, "grad_norm": 0.14448429644107819, "learning_rate": 3.1043892420879818e-06, "loss": 0.0003, "num_input_tokens_seen": 155406288, "step": 72080 }, { "epoch": 13.229032850064232, "grad_norm": 0.016545170918107033, "learning_rate": 3.1036482889051924e-06, "loss": 0.1598, "num_input_tokens_seen": 155416912, "step": 72085 }, { "epoch": 13.229950449623784, "grad_norm": 0.1755029559135437, "learning_rate": 3.1029073843608874e-06, "loss": 0.0938, "num_input_tokens_seen": 155427632, "step": 72090 }, { "epoch": 13.230868049183336, "grad_norm": 0.020655231550335884, "learning_rate": 3.102166528474068e-06, "loss": 0.0041, "num_input_tokens_seen": 155439440, "step": 72095 }, { "epoch": 13.231785648742889, "grad_norm": 2.656818151473999, "learning_rate": 3.101425721263734e-06, "loss": 0.0181, "num_input_tokens_seen": 155450128, "step": 72100 }, { "epoch": 13.23270324830244, "grad_norm": 0.006471544038504362, "learning_rate": 3.100684962748889e-06, "loss": 0.0001, "num_input_tokens_seen": 155461200, "step": 72105 }, { "epoch": 13.233620847861992, "grad_norm": 22.305431365966797, "learning_rate": 3.0999442529485314e-06, "loss": 0.2523, "num_input_tokens_seen": 155471760, "step": 72110 }, { "epoch": 13.234538447421546, "grad_norm": 0.005810482893139124, "learning_rate": 3.099203591881657e-06, "loss": 0.0948, "num_input_tokens_seen": 155482768, "step": 72115 }, { "epoch": 13.235456046981097, "grad_norm": 0.01207074522972107, "learning_rate": 3.0984629795672666e-06, "loss": 0.0004, "num_input_tokens_seen": 155492720, "step": 72120 }, { "epoch": 13.236373646540649, "grad_norm": 0.05924712121486664, "learning_rate": 3.0977224160243535e-06, "loss": 0.0015, "num_input_tokens_seen": 155502832, "step": 72125 }, { "epoch": 13.237291246100202, "grad_norm": 195.87774658203125, "learning_rate": 3.096981901271912e-06, "loss": 0.0384, "num_input_tokens_seen": 155513744, "step": 72130 }, { "epoch": 13.238208845659754, "grad_norm": 0.021713992580771446, "learning_rate": 3.096241435328935e-06, "loss": 0.0773, "num_input_tokens_seen": 155525104, "step": 72135 }, { "epoch": 13.239126445219306, "grad_norm": 0.41010817885398865, "learning_rate": 3.095501018214414e-06, "loss": 0.1335, "num_input_tokens_seen": 155535824, "step": 72140 }, { "epoch": 13.240044044778859, "grad_norm": 0.11471536755561829, "learning_rate": 3.0947606499473414e-06, "loss": 0.2081, "num_input_tokens_seen": 155547472, "step": 72145 }, { "epoch": 13.24096164433841, "grad_norm": 0.22496306896209717, "learning_rate": 3.0940203305467036e-06, "loss": 0.0037, "num_input_tokens_seen": 155556656, "step": 72150 }, { "epoch": 13.241879243897962, "grad_norm": 425.0603332519531, "learning_rate": 3.093280060031492e-06, "loss": 0.056, "num_input_tokens_seen": 155567888, "step": 72155 }, { "epoch": 13.242796843457516, "grad_norm": 0.015904366970062256, "learning_rate": 3.0925398384206915e-06, "loss": 0.0042, "num_input_tokens_seen": 155580112, "step": 72160 }, { "epoch": 13.243714443017067, "grad_norm": 0.004937696270644665, "learning_rate": 3.091799665733286e-06, "loss": 0.0001, "num_input_tokens_seen": 155590672, "step": 72165 }, { "epoch": 13.244632042576619, "grad_norm": 0.02136566862463951, "learning_rate": 3.091059541988264e-06, "loss": 0.0003, "num_input_tokens_seen": 155602160, "step": 72170 }, { "epoch": 13.245549642136172, "grad_norm": 0.24722692370414734, "learning_rate": 3.0903194672046053e-06, "loss": 0.0049, "num_input_tokens_seen": 155612176, "step": 72175 }, { "epoch": 13.246467241695724, "grad_norm": 3.869795799255371, "learning_rate": 3.089579441401293e-06, "loss": 0.2825, "num_input_tokens_seen": 155622832, "step": 72180 }, { "epoch": 13.247384841255275, "grad_norm": 0.002026798902079463, "learning_rate": 3.088839464597305e-06, "loss": 0.0013, "num_input_tokens_seen": 155632560, "step": 72185 }, { "epoch": 13.248302440814829, "grad_norm": 0.05365055799484253, "learning_rate": 3.088099536811625e-06, "loss": 0.009, "num_input_tokens_seen": 155643184, "step": 72190 }, { "epoch": 13.24922004037438, "grad_norm": 0.009267439134418964, "learning_rate": 3.0873596580632287e-06, "loss": 0.1053, "num_input_tokens_seen": 155654672, "step": 72195 }, { "epoch": 13.250137639933932, "grad_norm": 40.714698791503906, "learning_rate": 3.0866198283710904e-06, "loss": 0.261, "num_input_tokens_seen": 155665520, "step": 72200 }, { "epoch": 13.251055239493486, "grad_norm": 117.74003601074219, "learning_rate": 3.0858800477541906e-06, "loss": 0.2785, "num_input_tokens_seen": 155677136, "step": 72205 }, { "epoch": 13.251972839053037, "grad_norm": 0.0029991960618644953, "learning_rate": 3.085140316231501e-06, "loss": 0.1232, "num_input_tokens_seen": 155686352, "step": 72210 }, { "epoch": 13.252890438612589, "grad_norm": 21.364498138427734, "learning_rate": 3.0844006338219935e-06, "loss": 0.399, "num_input_tokens_seen": 155696688, "step": 72215 }, { "epoch": 13.253808038172142, "grad_norm": 0.022302253171801567, "learning_rate": 3.08366100054464e-06, "loss": 0.008, "num_input_tokens_seen": 155707472, "step": 72220 }, { "epoch": 13.254725637731694, "grad_norm": 0.13319212198257446, "learning_rate": 3.082921416418413e-06, "loss": 0.0251, "num_input_tokens_seen": 155718000, "step": 72225 }, { "epoch": 13.255643237291245, "grad_norm": 82.98605346679688, "learning_rate": 3.0821818814622806e-06, "loss": 0.2352, "num_input_tokens_seen": 155728496, "step": 72230 }, { "epoch": 13.256560836850799, "grad_norm": 0.013319922611117363, "learning_rate": 3.081442395695209e-06, "loss": 0.0762, "num_input_tokens_seen": 155738384, "step": 72235 }, { "epoch": 13.25747843641035, "grad_norm": 0.012940812855958939, "learning_rate": 3.080702959136168e-06, "loss": 0.0007, "num_input_tokens_seen": 155748528, "step": 72240 }, { "epoch": 13.258396035969902, "grad_norm": 0.0919223427772522, "learning_rate": 3.079963571804122e-06, "loss": 0.1678, "num_input_tokens_seen": 155759504, "step": 72245 }, { "epoch": 13.259313635529455, "grad_norm": 0.06009703502058983, "learning_rate": 3.0792242337180334e-06, "loss": 0.2191, "num_input_tokens_seen": 155772240, "step": 72250 }, { "epoch": 13.260231235089007, "grad_norm": 0.01935841515660286, "learning_rate": 3.0784849448968667e-06, "loss": 0.0004, "num_input_tokens_seen": 155783600, "step": 72255 }, { "epoch": 13.261148834648559, "grad_norm": 0.039257556200027466, "learning_rate": 3.0777457053595827e-06, "loss": 0.1772, "num_input_tokens_seen": 155795440, "step": 72260 }, { "epoch": 13.262066434208112, "grad_norm": 109.85474395751953, "learning_rate": 3.0770065151251427e-06, "loss": 0.082, "num_input_tokens_seen": 155806288, "step": 72265 }, { "epoch": 13.262984033767664, "grad_norm": 10.89828109741211, "learning_rate": 3.076267374212505e-06, "loss": 0.0193, "num_input_tokens_seen": 155816912, "step": 72270 }, { "epoch": 13.263901633327215, "grad_norm": 0.010773280635476112, "learning_rate": 3.0755282826406275e-06, "loss": 0.0002, "num_input_tokens_seen": 155826800, "step": 72275 }, { "epoch": 13.264819232886769, "grad_norm": 0.21791626513004303, "learning_rate": 3.0747892404284675e-06, "loss": 0.0005, "num_input_tokens_seen": 155837328, "step": 72280 }, { "epoch": 13.26573683244632, "grad_norm": 0.15260010957717896, "learning_rate": 3.0740502475949775e-06, "loss": 0.0226, "num_input_tokens_seen": 155847824, "step": 72285 }, { "epoch": 13.266654432005872, "grad_norm": 0.007075882516801357, "learning_rate": 3.073311304159116e-06, "loss": 0.0059, "num_input_tokens_seen": 155858672, "step": 72290 }, { "epoch": 13.267572031565425, "grad_norm": 6.227298736572266, "learning_rate": 3.0725724101398334e-06, "loss": 0.0014, "num_input_tokens_seen": 155869904, "step": 72295 }, { "epoch": 13.268489631124977, "grad_norm": 0.0410575233399868, "learning_rate": 3.0718335655560793e-06, "loss": 0.0017, "num_input_tokens_seen": 155881328, "step": 72300 }, { "epoch": 13.269407230684529, "grad_norm": 0.011221860535442829, "learning_rate": 3.071094770426808e-06, "loss": 0.0003, "num_input_tokens_seen": 155893040, "step": 72305 }, { "epoch": 13.270324830244082, "grad_norm": 4.521576404571533, "learning_rate": 3.0703560247709656e-06, "loss": 0.0727, "num_input_tokens_seen": 155903152, "step": 72310 }, { "epoch": 13.271242429803634, "grad_norm": 135.03738403320312, "learning_rate": 3.069617328607501e-06, "loss": 0.0739, "num_input_tokens_seen": 155914896, "step": 72315 }, { "epoch": 13.272160029363185, "grad_norm": 2.431976318359375, "learning_rate": 3.068878681955358e-06, "loss": 0.3155, "num_input_tokens_seen": 155926544, "step": 72320 }, { "epoch": 13.273077628922739, "grad_norm": 0.042890772223472595, "learning_rate": 3.068140084833486e-06, "loss": 0.1346, "num_input_tokens_seen": 155937584, "step": 72325 }, { "epoch": 13.27399522848229, "grad_norm": 0.3791961967945099, "learning_rate": 3.067401537260826e-06, "loss": 0.3605, "num_input_tokens_seen": 155948592, "step": 72330 }, { "epoch": 13.274912828041842, "grad_norm": 0.029703926295042038, "learning_rate": 3.0666630392563203e-06, "loss": 0.2972, "num_input_tokens_seen": 155960176, "step": 72335 }, { "epoch": 13.275830427601395, "grad_norm": 0.05546468496322632, "learning_rate": 3.0659245908389122e-06, "loss": 0.1502, "num_input_tokens_seen": 155971440, "step": 72340 }, { "epoch": 13.276748027160947, "grad_norm": 341.8291015625, "learning_rate": 3.0651861920275415e-06, "loss": 0.1434, "num_input_tokens_seen": 155981936, "step": 72345 }, { "epoch": 13.277665626720498, "grad_norm": 8.784483909606934, "learning_rate": 3.064447842841147e-06, "loss": 0.1871, "num_input_tokens_seen": 155993104, "step": 72350 }, { "epoch": 13.278583226280052, "grad_norm": 0.03419389948248863, "learning_rate": 3.063709543298663e-06, "loss": 0.0836, "num_input_tokens_seen": 156004592, "step": 72355 }, { "epoch": 13.279500825839603, "grad_norm": 11.35528564453125, "learning_rate": 3.0629712934190294e-06, "loss": 0.0115, "num_input_tokens_seen": 156014192, "step": 72360 }, { "epoch": 13.280418425399155, "grad_norm": 0.03727169707417488, "learning_rate": 3.062233093221181e-06, "loss": 0.0291, "num_input_tokens_seen": 156025808, "step": 72365 }, { "epoch": 13.281336024958708, "grad_norm": 0.15101028978824615, "learning_rate": 3.0614949427240483e-06, "loss": 0.021, "num_input_tokens_seen": 156036880, "step": 72370 }, { "epoch": 13.28225362451826, "grad_norm": 0.03797968104481697, "learning_rate": 3.060756841946568e-06, "loss": 0.0429, "num_input_tokens_seen": 156048144, "step": 72375 }, { "epoch": 13.283171224077812, "grad_norm": 0.11343809217214584, "learning_rate": 3.0600187909076683e-06, "loss": 0.0474, "num_input_tokens_seen": 156059984, "step": 72380 }, { "epoch": 13.284088823637365, "grad_norm": 0.33481940627098083, "learning_rate": 3.059280789626279e-06, "loss": 0.0018, "num_input_tokens_seen": 156070096, "step": 72385 }, { "epoch": 13.285006423196917, "grad_norm": 78.25948333740234, "learning_rate": 3.0585428381213305e-06, "loss": 0.215, "num_input_tokens_seen": 156080816, "step": 72390 }, { "epoch": 13.285924022756468, "grad_norm": 11.795368194580078, "learning_rate": 3.0578049364117502e-06, "loss": 0.0104, "num_input_tokens_seen": 156092048, "step": 72395 }, { "epoch": 13.286841622316022, "grad_norm": 1.2506940364837646, "learning_rate": 3.057067084516462e-06, "loss": 0.0007, "num_input_tokens_seen": 156103568, "step": 72400 }, { "epoch": 13.287759221875573, "grad_norm": 0.11071404814720154, "learning_rate": 3.0563292824543912e-06, "loss": 0.1473, "num_input_tokens_seen": 156115056, "step": 72405 }, { "epoch": 13.288676821435125, "grad_norm": 63.96345520019531, "learning_rate": 3.0555915302444626e-06, "loss": 0.1551, "num_input_tokens_seen": 156126736, "step": 72410 }, { "epoch": 13.289594420994678, "grad_norm": 0.005112020298838615, "learning_rate": 3.0548538279055986e-06, "loss": 0.2089, "num_input_tokens_seen": 156136816, "step": 72415 }, { "epoch": 13.29051202055423, "grad_norm": 0.008011452853679657, "learning_rate": 3.054116175456717e-06, "loss": 0.0001, "num_input_tokens_seen": 156148720, "step": 72420 }, { "epoch": 13.291429620113782, "grad_norm": 0.05763235688209534, "learning_rate": 3.053378572916741e-06, "loss": 0.0004, "num_input_tokens_seen": 156159600, "step": 72425 }, { "epoch": 13.292347219673335, "grad_norm": 0.06603285670280457, "learning_rate": 3.0526410203045888e-06, "loss": 0.2355, "num_input_tokens_seen": 156171504, "step": 72430 }, { "epoch": 13.293264819232887, "grad_norm": 0.005279562436044216, "learning_rate": 3.051903517639173e-06, "loss": 0.0021, "num_input_tokens_seen": 156181936, "step": 72435 }, { "epoch": 13.294182418792438, "grad_norm": 0.002397339092567563, "learning_rate": 3.0511660649394153e-06, "loss": 0.061, "num_input_tokens_seen": 156192656, "step": 72440 }, { "epoch": 13.295100018351992, "grad_norm": 0.10030408203601837, "learning_rate": 3.050428662224228e-06, "loss": 0.0005, "num_input_tokens_seen": 156202800, "step": 72445 }, { "epoch": 13.296017617911543, "grad_norm": 40.40519332885742, "learning_rate": 3.0496913095125235e-06, "loss": 0.2778, "num_input_tokens_seen": 156214416, "step": 72450 }, { "epoch": 13.296935217471095, "grad_norm": 0.020634159445762634, "learning_rate": 3.0489540068232124e-06, "loss": 0.032, "num_input_tokens_seen": 156225872, "step": 72455 }, { "epoch": 13.297852817030648, "grad_norm": 398.8023986816406, "learning_rate": 3.048216754175209e-06, "loss": 0.0535, "num_input_tokens_seen": 156236496, "step": 72460 }, { "epoch": 13.2987704165902, "grad_norm": 0.007568716071546078, "learning_rate": 3.0474795515874212e-06, "loss": 0.0245, "num_input_tokens_seen": 156246704, "step": 72465 }, { "epoch": 13.299688016149751, "grad_norm": 0.06653006374835968, "learning_rate": 3.0467423990787547e-06, "loss": 0.0003, "num_input_tokens_seen": 156257232, "step": 72470 }, { "epoch": 13.300605615709305, "grad_norm": 0.00281832879409194, "learning_rate": 3.046005296668121e-06, "loss": 0.0033, "num_input_tokens_seen": 156268272, "step": 72475 }, { "epoch": 13.301523215268857, "grad_norm": 0.019488152116537094, "learning_rate": 3.045268244374422e-06, "loss": 0.0764, "num_input_tokens_seen": 156278320, "step": 72480 }, { "epoch": 13.302440814828408, "grad_norm": 107.72821044921875, "learning_rate": 3.0445312422165616e-06, "loss": 0.0058, "num_input_tokens_seen": 156289744, "step": 72485 }, { "epoch": 13.303358414387962, "grad_norm": 0.008551247417926788, "learning_rate": 3.0437942902134453e-06, "loss": 0.0018, "num_input_tokens_seen": 156300400, "step": 72490 }, { "epoch": 13.304276013947513, "grad_norm": 132.21542358398438, "learning_rate": 3.043057388383974e-06, "loss": 0.1752, "num_input_tokens_seen": 156312752, "step": 72495 }, { "epoch": 13.305193613507065, "grad_norm": 0.020458336919546127, "learning_rate": 3.0423205367470475e-06, "loss": 0.0001, "num_input_tokens_seen": 156324016, "step": 72500 }, { "epoch": 13.306111213066618, "grad_norm": 0.004604538902640343, "learning_rate": 3.041583735321564e-06, "loss": 0.1434, "num_input_tokens_seen": 156335248, "step": 72505 }, { "epoch": 13.30702881262617, "grad_norm": 0.02409357577562332, "learning_rate": 3.0408469841264234e-06, "loss": 0.0002, "num_input_tokens_seen": 156345520, "step": 72510 }, { "epoch": 13.307946412185721, "grad_norm": 0.026210620999336243, "learning_rate": 3.040110283180522e-06, "loss": 0.0075, "num_input_tokens_seen": 156355760, "step": 72515 }, { "epoch": 13.308864011745275, "grad_norm": 0.00240498804487288, "learning_rate": 3.039373632502751e-06, "loss": 0.1844, "num_input_tokens_seen": 156366896, "step": 72520 }, { "epoch": 13.309781611304826, "grad_norm": 45.96982955932617, "learning_rate": 3.0386370321120105e-06, "loss": 0.2138, "num_input_tokens_seen": 156378160, "step": 72525 }, { "epoch": 13.310699210864378, "grad_norm": 0.16655012965202332, "learning_rate": 3.0379004820271906e-06, "loss": 0.0711, "num_input_tokens_seen": 156389904, "step": 72530 }, { "epoch": 13.311616810423931, "grad_norm": 0.07772848010063171, "learning_rate": 3.037163982267182e-06, "loss": 0.0021, "num_input_tokens_seen": 156399344, "step": 72535 }, { "epoch": 13.312534409983483, "grad_norm": 41.95719909667969, "learning_rate": 3.0364275328508736e-06, "loss": 0.0882, "num_input_tokens_seen": 156409968, "step": 72540 }, { "epoch": 13.313452009543035, "grad_norm": 0.00829792208969593, "learning_rate": 3.0356911337971575e-06, "loss": 0.0003, "num_input_tokens_seen": 156421360, "step": 72545 }, { "epoch": 13.314369609102588, "grad_norm": 0.1998467743396759, "learning_rate": 3.034954785124919e-06, "loss": 0.0088, "num_input_tokens_seen": 156432048, "step": 72550 }, { "epoch": 13.31528720866214, "grad_norm": 0.05814715474843979, "learning_rate": 3.0342184868530435e-06, "loss": 0.0067, "num_input_tokens_seen": 156443440, "step": 72555 }, { "epoch": 13.316204808221691, "grad_norm": 0.03260776773095131, "learning_rate": 3.0334822390004183e-06, "loss": 0.451, "num_input_tokens_seen": 156454032, "step": 72560 }, { "epoch": 13.317122407781245, "grad_norm": 0.002431329106912017, "learning_rate": 3.0327460415859255e-06, "loss": 0.0276, "num_input_tokens_seen": 156465072, "step": 72565 }, { "epoch": 13.318040007340796, "grad_norm": 0.23399251699447632, "learning_rate": 3.0320098946284477e-06, "loss": 0.0087, "num_input_tokens_seen": 156476496, "step": 72570 }, { "epoch": 13.318957606900348, "grad_norm": 0.15307781100273132, "learning_rate": 3.0312737981468663e-06, "loss": 0.0046, "num_input_tokens_seen": 156487792, "step": 72575 }, { "epoch": 13.319875206459901, "grad_norm": 0.1836773157119751, "learning_rate": 3.030537752160061e-06, "loss": 0.3326, "num_input_tokens_seen": 156497744, "step": 72580 }, { "epoch": 13.320792806019453, "grad_norm": 76.26123046875, "learning_rate": 3.0298017566869096e-06, "loss": 0.2882, "num_input_tokens_seen": 156509168, "step": 72585 }, { "epoch": 13.321710405579005, "grad_norm": 0.027648989111185074, "learning_rate": 3.029065811746289e-06, "loss": 0.0001, "num_input_tokens_seen": 156519568, "step": 72590 }, { "epoch": 13.322628005138558, "grad_norm": 0.01507857907563448, "learning_rate": 3.0283299173570768e-06, "loss": 0.1501, "num_input_tokens_seen": 156530192, "step": 72595 }, { "epoch": 13.32354560469811, "grad_norm": 0.035381920635700226, "learning_rate": 3.0275940735381463e-06, "loss": 0.1877, "num_input_tokens_seen": 156540592, "step": 72600 }, { "epoch": 13.324463204257661, "grad_norm": 247.6784210205078, "learning_rate": 3.026858280308369e-06, "loss": 0.1238, "num_input_tokens_seen": 156551568, "step": 72605 }, { "epoch": 13.325380803817215, "grad_norm": 0.20818038284778595, "learning_rate": 3.026122537686621e-06, "loss": 0.0005, "num_input_tokens_seen": 156561872, "step": 72610 }, { "epoch": 13.326298403376766, "grad_norm": 2.112487554550171, "learning_rate": 3.02538684569177e-06, "loss": 0.0595, "num_input_tokens_seen": 156572336, "step": 72615 }, { "epoch": 13.327216002936318, "grad_norm": 0.04822847247123718, "learning_rate": 3.0246512043426846e-06, "loss": 0.059, "num_input_tokens_seen": 156582800, "step": 72620 }, { "epoch": 13.328133602495871, "grad_norm": 36.008182525634766, "learning_rate": 3.023915613658236e-06, "loss": 0.1455, "num_input_tokens_seen": 156594352, "step": 72625 }, { "epoch": 13.329051202055423, "grad_norm": 0.424619197845459, "learning_rate": 3.0231800736572893e-06, "loss": 0.0008, "num_input_tokens_seen": 156606640, "step": 72630 }, { "epoch": 13.329968801614974, "grad_norm": 1.0090231895446777, "learning_rate": 3.0224445843587104e-06, "loss": 0.3508, "num_input_tokens_seen": 156615792, "step": 72635 }, { "epoch": 13.330886401174528, "grad_norm": 19.835208892822266, "learning_rate": 3.0217091457813598e-06, "loss": 0.0019, "num_input_tokens_seen": 156627344, "step": 72640 }, { "epoch": 13.33180400073408, "grad_norm": 0.4533199667930603, "learning_rate": 3.0209737579441067e-06, "loss": 0.0209, "num_input_tokens_seen": 156637680, "step": 72645 }, { "epoch": 13.332721600293631, "grad_norm": 640.7949829101562, "learning_rate": 3.0202384208658086e-06, "loss": 0.2034, "num_input_tokens_seen": 156648784, "step": 72650 }, { "epoch": 13.333639199853184, "grad_norm": 0.0037520588375627995, "learning_rate": 3.0195031345653252e-06, "loss": 0.0017, "num_input_tokens_seen": 156659664, "step": 72655 }, { "epoch": 13.334556799412736, "grad_norm": 0.006416249554604292, "learning_rate": 3.0187678990615187e-06, "loss": 0.1792, "num_input_tokens_seen": 156670992, "step": 72660 }, { "epoch": 13.335474398972288, "grad_norm": 0.004951311741024256, "learning_rate": 3.018032714373245e-06, "loss": 0.0122, "num_input_tokens_seen": 156681936, "step": 72665 }, { "epoch": 13.336391998531841, "grad_norm": 39.05876541137695, "learning_rate": 3.0172975805193604e-06, "loss": 0.221, "num_input_tokens_seen": 156691696, "step": 72670 }, { "epoch": 13.337309598091393, "grad_norm": 37.863853454589844, "learning_rate": 3.016562497518719e-06, "loss": 0.4272, "num_input_tokens_seen": 156702000, "step": 72675 }, { "epoch": 13.338227197650944, "grad_norm": 9.310601234436035, "learning_rate": 3.0158274653901756e-06, "loss": 0.0037, "num_input_tokens_seen": 156712112, "step": 72680 }, { "epoch": 13.339144797210498, "grad_norm": 0.45045992732048035, "learning_rate": 3.0150924841525837e-06, "loss": 0.2639, "num_input_tokens_seen": 156723600, "step": 72685 }, { "epoch": 13.34006239677005, "grad_norm": 0.019475413486361504, "learning_rate": 3.0143575538247915e-06, "loss": 0.1659, "num_input_tokens_seen": 156735280, "step": 72690 }, { "epoch": 13.340979996329601, "grad_norm": 0.020781464874744415, "learning_rate": 3.0136226744256524e-06, "loss": 0.0317, "num_input_tokens_seen": 156745104, "step": 72695 }, { "epoch": 13.341897595889154, "grad_norm": 0.05876972898840904, "learning_rate": 3.0128878459740128e-06, "loss": 0.2894, "num_input_tokens_seen": 156755856, "step": 72700 }, { "epoch": 13.342815195448706, "grad_norm": 0.021165460348129272, "learning_rate": 3.012153068488718e-06, "loss": 0.0004, "num_input_tokens_seen": 156768112, "step": 72705 }, { "epoch": 13.343732795008258, "grad_norm": 0.4669171869754791, "learning_rate": 3.0114183419886183e-06, "loss": 0.0004, "num_input_tokens_seen": 156779184, "step": 72710 }, { "epoch": 13.344650394567811, "grad_norm": 115.03748321533203, "learning_rate": 3.0106836664925565e-06, "loss": 0.1681, "num_input_tokens_seen": 156789360, "step": 72715 }, { "epoch": 13.345567994127363, "grad_norm": 0.5302047729492188, "learning_rate": 3.0099490420193746e-06, "loss": 0.0018, "num_input_tokens_seen": 156799856, "step": 72720 }, { "epoch": 13.346485593686914, "grad_norm": 0.002597132697701454, "learning_rate": 3.0092144685879144e-06, "loss": 0.1272, "num_input_tokens_seen": 156810288, "step": 72725 }, { "epoch": 13.347403193246468, "grad_norm": 0.07883463054895401, "learning_rate": 3.0084799462170187e-06, "loss": 0.0033, "num_input_tokens_seen": 156820880, "step": 72730 }, { "epoch": 13.34832079280602, "grad_norm": 0.009250273928046227, "learning_rate": 3.0077454749255262e-06, "loss": 0.0005, "num_input_tokens_seen": 156831472, "step": 72735 }, { "epoch": 13.34923839236557, "grad_norm": 0.12355703115463257, "learning_rate": 3.007011054732273e-06, "loss": 0.0006, "num_input_tokens_seen": 156842192, "step": 72740 }, { "epoch": 13.350155991925124, "grad_norm": 0.9483180642127991, "learning_rate": 3.006276685656099e-06, "loss": 0.0004, "num_input_tokens_seen": 156853424, "step": 72745 }, { "epoch": 13.351073591484676, "grad_norm": 20.90192985534668, "learning_rate": 3.005542367715838e-06, "loss": 0.0157, "num_input_tokens_seen": 156864752, "step": 72750 }, { "epoch": 13.351991191044227, "grad_norm": 0.024889299646019936, "learning_rate": 3.004808100930322e-06, "loss": 0.0005, "num_input_tokens_seen": 156876144, "step": 72755 }, { "epoch": 13.35290879060378, "grad_norm": 0.08201864361763, "learning_rate": 3.004073885318388e-06, "loss": 0.0001, "num_input_tokens_seen": 156887920, "step": 72760 }, { "epoch": 13.353826390163333, "grad_norm": 1.007995367050171, "learning_rate": 3.0033397208988656e-06, "loss": 0.3172, "num_input_tokens_seen": 156898224, "step": 72765 }, { "epoch": 13.354743989722884, "grad_norm": 142.48263549804688, "learning_rate": 3.002605607690585e-06, "loss": 0.1826, "num_input_tokens_seen": 156909264, "step": 72770 }, { "epoch": 13.355661589282438, "grad_norm": 0.014289308339357376, "learning_rate": 3.0018715457123725e-06, "loss": 0.0588, "num_input_tokens_seen": 156918928, "step": 72775 }, { "epoch": 13.35657918884199, "grad_norm": 0.061337847262620926, "learning_rate": 3.001137534983061e-06, "loss": 0.3154, "num_input_tokens_seen": 156929040, "step": 72780 }, { "epoch": 13.35749678840154, "grad_norm": 0.032376471906900406, "learning_rate": 3.000403575521472e-06, "loss": 0.0454, "num_input_tokens_seen": 156939504, "step": 72785 }, { "epoch": 13.358414387961094, "grad_norm": 92.2479019165039, "learning_rate": 2.999669667346432e-06, "loss": 0.0174, "num_input_tokens_seen": 156950800, "step": 72790 }, { "epoch": 13.359331987520646, "grad_norm": 0.02294294536113739, "learning_rate": 2.9989358104767663e-06, "loss": 0.1592, "num_input_tokens_seen": 156960368, "step": 72795 }, { "epoch": 13.360249587080197, "grad_norm": 0.21171726286411285, "learning_rate": 2.9982020049312945e-06, "loss": 0.369, "num_input_tokens_seen": 156971504, "step": 72800 }, { "epoch": 13.36116718663975, "grad_norm": 0.015459704212844372, "learning_rate": 2.99746825072884e-06, "loss": 0.0949, "num_input_tokens_seen": 156982448, "step": 72805 }, { "epoch": 13.362084786199302, "grad_norm": 0.03825229033827782, "learning_rate": 2.996734547888219e-06, "loss": 0.1051, "num_input_tokens_seen": 156993968, "step": 72810 }, { "epoch": 13.363002385758854, "grad_norm": 0.29562753438949585, "learning_rate": 2.9960008964282544e-06, "loss": 0.0008, "num_input_tokens_seen": 157005488, "step": 72815 }, { "epoch": 13.363919985318407, "grad_norm": 822.33984375, "learning_rate": 2.9952672963677604e-06, "loss": 0.2401, "num_input_tokens_seen": 157016432, "step": 72820 }, { "epoch": 13.364837584877959, "grad_norm": 0.07346825301647186, "learning_rate": 2.994533747725551e-06, "loss": 0.0738, "num_input_tokens_seen": 157027696, "step": 72825 }, { "epoch": 13.36575518443751, "grad_norm": 0.012765311636030674, "learning_rate": 2.9938002505204457e-06, "loss": 0.0176, "num_input_tokens_seen": 157038608, "step": 72830 }, { "epoch": 13.366672783997064, "grad_norm": 2.4959583282470703, "learning_rate": 2.9930668047712536e-06, "loss": 0.1135, "num_input_tokens_seen": 157048880, "step": 72835 }, { "epoch": 13.367590383556616, "grad_norm": 0.019266124814748764, "learning_rate": 2.992333410496786e-06, "loss": 0.019, "num_input_tokens_seen": 157059920, "step": 72840 }, { "epoch": 13.368507983116167, "grad_norm": 33.38072967529297, "learning_rate": 2.991600067715856e-06, "loss": 0.2042, "num_input_tokens_seen": 157071536, "step": 72845 }, { "epoch": 13.36942558267572, "grad_norm": 35.7130126953125, "learning_rate": 2.990866776447272e-06, "loss": 0.3539, "num_input_tokens_seen": 157082448, "step": 72850 }, { "epoch": 13.370343182235272, "grad_norm": 0.5710398554801941, "learning_rate": 2.9901335367098416e-06, "loss": 0.0008, "num_input_tokens_seen": 157093072, "step": 72855 }, { "epoch": 13.371260781794824, "grad_norm": 0.003195584751665592, "learning_rate": 2.9894003485223687e-06, "loss": 0.0001, "num_input_tokens_seen": 157103280, "step": 72860 }, { "epoch": 13.372178381354377, "grad_norm": 0.28284189105033875, "learning_rate": 2.988667211903663e-06, "loss": 0.163, "num_input_tokens_seen": 157114192, "step": 72865 }, { "epoch": 13.373095980913929, "grad_norm": 404.0315246582031, "learning_rate": 2.987934126872526e-06, "loss": 0.2948, "num_input_tokens_seen": 157124048, "step": 72870 }, { "epoch": 13.37401358047348, "grad_norm": 0.010658321902155876, "learning_rate": 2.987201093447758e-06, "loss": 0.0707, "num_input_tokens_seen": 157134608, "step": 72875 }, { "epoch": 13.374931180033034, "grad_norm": 0.013348967768251896, "learning_rate": 2.9864681116481655e-06, "loss": 0.2967, "num_input_tokens_seen": 157144656, "step": 72880 }, { "epoch": 13.375848779592586, "grad_norm": 212.38002014160156, "learning_rate": 2.985735181492544e-06, "loss": 0.096, "num_input_tokens_seen": 157155536, "step": 72885 }, { "epoch": 13.376766379152137, "grad_norm": 0.04757287725806236, "learning_rate": 2.9850023029996923e-06, "loss": 0.0334, "num_input_tokens_seen": 157166736, "step": 72890 }, { "epoch": 13.37768397871169, "grad_norm": 0.017440175637602806, "learning_rate": 2.9842694761884095e-06, "loss": 0.0004, "num_input_tokens_seen": 157178384, "step": 72895 }, { "epoch": 13.378601578271242, "grad_norm": 0.017802471294999123, "learning_rate": 2.9835367010774903e-06, "loss": 0.2347, "num_input_tokens_seen": 157189872, "step": 72900 }, { "epoch": 13.379519177830794, "grad_norm": 0.34351491928100586, "learning_rate": 2.98280397768573e-06, "loss": 0.1497, "num_input_tokens_seen": 157202032, "step": 72905 }, { "epoch": 13.380436777390347, "grad_norm": 0.005139901302754879, "learning_rate": 2.98207130603192e-06, "loss": 0.1846, "num_input_tokens_seen": 157212496, "step": 72910 }, { "epoch": 13.381354376949899, "grad_norm": 0.2654246687889099, "learning_rate": 2.981338686134855e-06, "loss": 0.0003, "num_input_tokens_seen": 157222384, "step": 72915 }, { "epoch": 13.38227197650945, "grad_norm": 0.1598934382200241, "learning_rate": 2.980606118013324e-06, "loss": 0.0536, "num_input_tokens_seen": 157233552, "step": 72920 }, { "epoch": 13.383189576069004, "grad_norm": 0.08384893089532852, "learning_rate": 2.979873601686114e-06, "loss": 0.0005, "num_input_tokens_seen": 157243568, "step": 72925 }, { "epoch": 13.384107175628555, "grad_norm": 252.2950439453125, "learning_rate": 2.9791411371720168e-06, "loss": 0.069, "num_input_tokens_seen": 157255248, "step": 72930 }, { "epoch": 13.385024775188107, "grad_norm": 0.00783136859536171, "learning_rate": 2.9784087244898184e-06, "loss": 0.1198, "num_input_tokens_seen": 157265296, "step": 72935 }, { "epoch": 13.38594237474766, "grad_norm": 1.3116096258163452, "learning_rate": 2.9776763636583007e-06, "loss": 0.151, "num_input_tokens_seen": 157275184, "step": 72940 }, { "epoch": 13.386859974307212, "grad_norm": 109.49420166015625, "learning_rate": 2.976944054696252e-06, "loss": 0.198, "num_input_tokens_seen": 157285904, "step": 72945 }, { "epoch": 13.387777573866764, "grad_norm": 113.00581359863281, "learning_rate": 2.9762117976224526e-06, "loss": 0.0293, "num_input_tokens_seen": 157296592, "step": 72950 }, { "epoch": 13.388695173426317, "grad_norm": 0.004079846199601889, "learning_rate": 2.975479592455684e-06, "loss": 0.0026, "num_input_tokens_seen": 157305744, "step": 72955 }, { "epoch": 13.389612772985869, "grad_norm": 0.028710316866636276, "learning_rate": 2.974747439214724e-06, "loss": 0.2077, "num_input_tokens_seen": 157317456, "step": 72960 }, { "epoch": 13.39053037254542, "grad_norm": 0.16547438502311707, "learning_rate": 2.9740153379183555e-06, "loss": 0.2419, "num_input_tokens_seen": 157326768, "step": 72965 }, { "epoch": 13.391447972104974, "grad_norm": 23.357406616210938, "learning_rate": 2.9732832885853535e-06, "loss": 0.1724, "num_input_tokens_seen": 157337456, "step": 72970 }, { "epoch": 13.392365571664525, "grad_norm": 0.040135931223630905, "learning_rate": 2.9725512912344923e-06, "loss": 0.188, "num_input_tokens_seen": 157348656, "step": 72975 }, { "epoch": 13.393283171224077, "grad_norm": 285.9239501953125, "learning_rate": 2.97181934588455e-06, "loss": 0.148, "num_input_tokens_seen": 157360400, "step": 72980 }, { "epoch": 13.39420077078363, "grad_norm": 0.1027376651763916, "learning_rate": 2.971087452554299e-06, "loss": 0.083, "num_input_tokens_seen": 157370640, "step": 72985 }, { "epoch": 13.395118370343182, "grad_norm": 0.015235438011586666, "learning_rate": 2.9703556112625086e-06, "loss": 0.0148, "num_input_tokens_seen": 157382480, "step": 72990 }, { "epoch": 13.396035969902734, "grad_norm": 0.012155861593782902, "learning_rate": 2.9696238220279505e-06, "loss": 0.1458, "num_input_tokens_seen": 157392720, "step": 72995 }, { "epoch": 13.396953569462287, "grad_norm": 0.002259942004457116, "learning_rate": 2.968892084869396e-06, "loss": 0.0013, "num_input_tokens_seen": 157404496, "step": 73000 }, { "epoch": 13.397871169021839, "grad_norm": 0.357363760471344, "learning_rate": 2.968160399805612e-06, "loss": 0.1042, "num_input_tokens_seen": 157415568, "step": 73005 }, { "epoch": 13.39878876858139, "grad_norm": 6.576331615447998, "learning_rate": 2.9674287668553624e-06, "loss": 0.1098, "num_input_tokens_seen": 157425968, "step": 73010 }, { "epoch": 13.399706368140944, "grad_norm": 0.8791052103042603, "learning_rate": 2.9666971860374173e-06, "loss": 0.0045, "num_input_tokens_seen": 157437424, "step": 73015 }, { "epoch": 13.400623967700495, "grad_norm": 14.546542167663574, "learning_rate": 2.9659656573705374e-06, "loss": 0.0312, "num_input_tokens_seen": 157448400, "step": 73020 }, { "epoch": 13.401541567260047, "grad_norm": 43.19173049926758, "learning_rate": 2.965234180873484e-06, "loss": 0.189, "num_input_tokens_seen": 157460016, "step": 73025 }, { "epoch": 13.4024591668196, "grad_norm": 101.73401641845703, "learning_rate": 2.964502756565022e-06, "loss": 0.0356, "num_input_tokens_seen": 157469904, "step": 73030 }, { "epoch": 13.403376766379152, "grad_norm": 2.2376933097839355, "learning_rate": 2.9637713844639092e-06, "loss": 0.001, "num_input_tokens_seen": 157482416, "step": 73035 }, { "epoch": 13.404294365938703, "grad_norm": 0.22176742553710938, "learning_rate": 2.9630400645889055e-06, "loss": 0.0015, "num_input_tokens_seen": 157493424, "step": 73040 }, { "epoch": 13.405211965498257, "grad_norm": 0.008208603598177433, "learning_rate": 2.9623087969587648e-06, "loss": 0.1377, "num_input_tokens_seen": 157503440, "step": 73045 }, { "epoch": 13.406129565057809, "grad_norm": 0.11132481694221497, "learning_rate": 2.961577581592247e-06, "loss": 0.0002, "num_input_tokens_seen": 157514640, "step": 73050 }, { "epoch": 13.40704716461736, "grad_norm": 0.0037890165112912655, "learning_rate": 2.9608464185081055e-06, "loss": 0.0975, "num_input_tokens_seen": 157525520, "step": 73055 }, { "epoch": 13.407964764176914, "grad_norm": 0.13325011730194092, "learning_rate": 2.9601153077250907e-06, "loss": 0.0008, "num_input_tokens_seen": 157537136, "step": 73060 }, { "epoch": 13.408882363736465, "grad_norm": 0.1605517864227295, "learning_rate": 2.9593842492619584e-06, "loss": 0.0101, "num_input_tokens_seen": 157548400, "step": 73065 }, { "epoch": 13.409799963296017, "grad_norm": 0.558962881565094, "learning_rate": 2.9586532431374583e-06, "loss": 0.1378, "num_input_tokens_seen": 157559664, "step": 73070 }, { "epoch": 13.41071756285557, "grad_norm": 0.3158941864967346, "learning_rate": 2.957922289370335e-06, "loss": 0.0009, "num_input_tokens_seen": 157570256, "step": 73075 }, { "epoch": 13.411635162415122, "grad_norm": 0.444802850484848, "learning_rate": 2.9571913879793433e-06, "loss": 0.103, "num_input_tokens_seen": 157580016, "step": 73080 }, { "epoch": 13.412552761974673, "grad_norm": 0.0868232250213623, "learning_rate": 2.9564605389832267e-06, "loss": 0.0018, "num_input_tokens_seen": 157589744, "step": 73085 }, { "epoch": 13.413470361534227, "grad_norm": 0.010461469180881977, "learning_rate": 2.9557297424007296e-06, "loss": 0.0013, "num_input_tokens_seen": 157600656, "step": 73090 }, { "epoch": 13.414387961093778, "grad_norm": 0.028916586190462112, "learning_rate": 2.9549989982505943e-06, "loss": 0.3543, "num_input_tokens_seen": 157612208, "step": 73095 }, { "epoch": 13.41530556065333, "grad_norm": 0.005035130772739649, "learning_rate": 2.9542683065515678e-06, "loss": 0.0012, "num_input_tokens_seen": 157622704, "step": 73100 }, { "epoch": 13.416223160212883, "grad_norm": 0.04782888665795326, "learning_rate": 2.953537667322388e-06, "loss": 0.4286, "num_input_tokens_seen": 157633872, "step": 73105 }, { "epoch": 13.417140759772435, "grad_norm": 0.22829340398311615, "learning_rate": 2.9528070805817945e-06, "loss": 0.0065, "num_input_tokens_seen": 157644912, "step": 73110 }, { "epoch": 13.418058359331987, "grad_norm": 0.08995696902275085, "learning_rate": 2.952076546348527e-06, "loss": 0.0002, "num_input_tokens_seen": 157655504, "step": 73115 }, { "epoch": 13.41897595889154, "grad_norm": 0.015630869194865227, "learning_rate": 2.9513460646413215e-06, "loss": 0.0007, "num_input_tokens_seen": 157666352, "step": 73120 }, { "epoch": 13.419893558451092, "grad_norm": 0.1942664235830307, "learning_rate": 2.9506156354789156e-06, "loss": 0.1472, "num_input_tokens_seen": 157677264, "step": 73125 }, { "epoch": 13.420811158010643, "grad_norm": 0.013227785937488079, "learning_rate": 2.949885258880041e-06, "loss": 0.1434, "num_input_tokens_seen": 157687472, "step": 73130 }, { "epoch": 13.421728757570197, "grad_norm": 0.10640265792608261, "learning_rate": 2.9491549348634335e-06, "loss": 0.0002, "num_input_tokens_seen": 157698608, "step": 73135 }, { "epoch": 13.422646357129748, "grad_norm": 0.016023946925997734, "learning_rate": 2.948424663447823e-06, "loss": 0.0001, "num_input_tokens_seen": 157710512, "step": 73140 }, { "epoch": 13.4235639566893, "grad_norm": 0.3088666796684265, "learning_rate": 2.9476944446519383e-06, "loss": 0.0014, "num_input_tokens_seen": 157722864, "step": 73145 }, { "epoch": 13.424481556248853, "grad_norm": 29.282066345214844, "learning_rate": 2.946964278494513e-06, "loss": 0.0078, "num_input_tokens_seen": 157732016, "step": 73150 }, { "epoch": 13.425399155808405, "grad_norm": 0.08143959939479828, "learning_rate": 2.946234164994271e-06, "loss": 0.0002, "num_input_tokens_seen": 157742064, "step": 73155 }, { "epoch": 13.426316755367957, "grad_norm": 0.0883394405245781, "learning_rate": 2.945504104169938e-06, "loss": 0.0002, "num_input_tokens_seen": 157751952, "step": 73160 }, { "epoch": 13.42723435492751, "grad_norm": 0.04088381677865982, "learning_rate": 2.9447740960402428e-06, "loss": 0.0005, "num_input_tokens_seen": 157761808, "step": 73165 }, { "epoch": 13.428151954487062, "grad_norm": 0.002495389897376299, "learning_rate": 2.9440441406239064e-06, "loss": 0.0285, "num_input_tokens_seen": 157772848, "step": 73170 }, { "epoch": 13.429069554046613, "grad_norm": 0.0496564619243145, "learning_rate": 2.943314237939652e-06, "loss": 0.0377, "num_input_tokens_seen": 157781712, "step": 73175 }, { "epoch": 13.429987153606167, "grad_norm": 147.0283660888672, "learning_rate": 2.9425843880061966e-06, "loss": 0.1787, "num_input_tokens_seen": 157791760, "step": 73180 }, { "epoch": 13.430904753165718, "grad_norm": 0.04841466620564461, "learning_rate": 2.941854590842266e-06, "loss": 0.2445, "num_input_tokens_seen": 157803792, "step": 73185 }, { "epoch": 13.43182235272527, "grad_norm": 0.015216930769383907, "learning_rate": 2.9411248464665748e-06, "loss": 0.0332, "num_input_tokens_seen": 157813712, "step": 73190 }, { "epoch": 13.432739952284823, "grad_norm": 1.1080647706985474, "learning_rate": 2.9403951548978382e-06, "loss": 0.0003, "num_input_tokens_seen": 157825104, "step": 73195 }, { "epoch": 13.433657551844375, "grad_norm": 0.2604433298110962, "learning_rate": 2.939665516154776e-06, "loss": 0.0003, "num_input_tokens_seen": 157835856, "step": 73200 }, { "epoch": 13.434575151403926, "grad_norm": 4.9357523918151855, "learning_rate": 2.9389359302561004e-06, "loss": 0.1707, "num_input_tokens_seen": 157847184, "step": 73205 }, { "epoch": 13.43549275096348, "grad_norm": 43.835872650146484, "learning_rate": 2.938206397220523e-06, "loss": 0.1165, "num_input_tokens_seen": 157858064, "step": 73210 }, { "epoch": 13.436410350523031, "grad_norm": 0.7661740183830261, "learning_rate": 2.937476917066756e-06, "loss": 0.0014, "num_input_tokens_seen": 157868464, "step": 73215 }, { "epoch": 13.437327950082583, "grad_norm": 0.024124750867486, "learning_rate": 2.9367474898135095e-06, "loss": 0.1318, "num_input_tokens_seen": 157879344, "step": 73220 }, { "epoch": 13.438245549642136, "grad_norm": 0.08342073857784271, "learning_rate": 2.9360181154794927e-06, "loss": 0.0962, "num_input_tokens_seen": 157890640, "step": 73225 }, { "epoch": 13.439163149201688, "grad_norm": 0.03348783031105995, "learning_rate": 2.9352887940834115e-06, "loss": 0.018, "num_input_tokens_seen": 157900304, "step": 73230 }, { "epoch": 13.44008074876124, "grad_norm": 0.049147360026836395, "learning_rate": 2.9345595256439727e-06, "loss": 0.1192, "num_input_tokens_seen": 157910544, "step": 73235 }, { "epoch": 13.440998348320793, "grad_norm": 0.16796502470970154, "learning_rate": 2.9338303101798825e-06, "loss": 0.1944, "num_input_tokens_seen": 157921936, "step": 73240 }, { "epoch": 13.441915947880345, "grad_norm": 0.14656510949134827, "learning_rate": 2.93310114770984e-06, "loss": 0.1336, "num_input_tokens_seen": 157932080, "step": 73245 }, { "epoch": 13.442833547439896, "grad_norm": 0.012011848390102386, "learning_rate": 2.932372038252551e-06, "loss": 0.0007, "num_input_tokens_seen": 157943664, "step": 73250 }, { "epoch": 13.44375114699945, "grad_norm": 2.588001012802124, "learning_rate": 2.9316429818267156e-06, "loss": 0.2383, "num_input_tokens_seen": 157954128, "step": 73255 }, { "epoch": 13.444668746559001, "grad_norm": 0.007969049736857414, "learning_rate": 2.9309139784510313e-06, "loss": 0.0017, "num_input_tokens_seen": 157965808, "step": 73260 }, { "epoch": 13.445586346118553, "grad_norm": 0.09879874438047409, "learning_rate": 2.9301850281441953e-06, "loss": 0.3663, "num_input_tokens_seen": 157976464, "step": 73265 }, { "epoch": 13.446503945678106, "grad_norm": 0.05125822126865387, "learning_rate": 2.929456130924907e-06, "loss": 0.0946, "num_input_tokens_seen": 157987376, "step": 73270 }, { "epoch": 13.447421545237658, "grad_norm": 0.12095505744218826, "learning_rate": 2.92872728681186e-06, "loss": 0.0004, "num_input_tokens_seen": 157997808, "step": 73275 }, { "epoch": 13.44833914479721, "grad_norm": 0.5460233688354492, "learning_rate": 2.9279984958237462e-06, "loss": 0.3098, "num_input_tokens_seen": 158008720, "step": 73280 }, { "epoch": 13.449256744356763, "grad_norm": 0.046536896377801895, "learning_rate": 2.927269757979261e-06, "loss": 0.1172, "num_input_tokens_seen": 158020048, "step": 73285 }, { "epoch": 13.450174343916315, "grad_norm": 0.23442091047763824, "learning_rate": 2.9265410732970943e-06, "loss": 0.0007, "num_input_tokens_seen": 158031216, "step": 73290 }, { "epoch": 13.451091943475866, "grad_norm": 0.015842994675040245, "learning_rate": 2.9258124417959337e-06, "loss": 0.0006, "num_input_tokens_seen": 158041360, "step": 73295 }, { "epoch": 13.45200954303542, "grad_norm": 1.1390037536621094, "learning_rate": 2.9250838634944713e-06, "loss": 0.5125, "num_input_tokens_seen": 158051664, "step": 73300 }, { "epoch": 13.452927142594971, "grad_norm": 21.617115020751953, "learning_rate": 2.924355338411392e-06, "loss": 0.3526, "num_input_tokens_seen": 158062832, "step": 73305 }, { "epoch": 13.453844742154523, "grad_norm": 0.006038839928805828, "learning_rate": 2.923626866565381e-06, "loss": 0.0002, "num_input_tokens_seen": 158072592, "step": 73310 }, { "epoch": 13.454762341714076, "grad_norm": 0.13346242904663086, "learning_rate": 2.922898447975121e-06, "loss": 0.1159, "num_input_tokens_seen": 158081008, "step": 73315 }, { "epoch": 13.455679941273628, "grad_norm": 0.025905197486281395, "learning_rate": 2.922170082659299e-06, "loss": 0.1041, "num_input_tokens_seen": 158091760, "step": 73320 }, { "epoch": 13.45659754083318, "grad_norm": 16.087297439575195, "learning_rate": 2.9214417706365933e-06, "loss": 0.1602, "num_input_tokens_seen": 158102544, "step": 73325 }, { "epoch": 13.457515140392733, "grad_norm": 2.702744245529175, "learning_rate": 2.920713511925684e-06, "loss": 0.0229, "num_input_tokens_seen": 158113840, "step": 73330 }, { "epoch": 13.458432739952285, "grad_norm": 0.03687264025211334, "learning_rate": 2.9199853065452515e-06, "loss": 0.0001, "num_input_tokens_seen": 158124656, "step": 73335 }, { "epoch": 13.459350339511836, "grad_norm": 49.35420227050781, "learning_rate": 2.9192571545139715e-06, "loss": 0.0132, "num_input_tokens_seen": 158134608, "step": 73340 }, { "epoch": 13.46026793907139, "grad_norm": 0.28312504291534424, "learning_rate": 2.918529055850519e-06, "loss": 0.0197, "num_input_tokens_seen": 158145872, "step": 73345 }, { "epoch": 13.461185538630941, "grad_norm": 0.07203646004199982, "learning_rate": 2.9178010105735725e-06, "loss": 0.001, "num_input_tokens_seen": 158156528, "step": 73350 }, { "epoch": 13.462103138190493, "grad_norm": 188.9155731201172, "learning_rate": 2.917073018701804e-06, "loss": 0.0204, "num_input_tokens_seen": 158167408, "step": 73355 }, { "epoch": 13.463020737750046, "grad_norm": 0.26706498861312866, "learning_rate": 2.916345080253883e-06, "loss": 0.0004, "num_input_tokens_seen": 158178384, "step": 73360 }, { "epoch": 13.463938337309598, "grad_norm": 0.02392319217324257, "learning_rate": 2.915617195248479e-06, "loss": 0.0314, "num_input_tokens_seen": 158189040, "step": 73365 }, { "epoch": 13.46485593686915, "grad_norm": 34.3828125, "learning_rate": 2.9148893637042663e-06, "loss": 0.1564, "num_input_tokens_seen": 158201552, "step": 73370 }, { "epoch": 13.465773536428703, "grad_norm": 0.020664788782596588, "learning_rate": 2.9141615856399095e-06, "loss": 0.1472, "num_input_tokens_seen": 158212144, "step": 73375 }, { "epoch": 13.466691135988254, "grad_norm": 24.448284149169922, "learning_rate": 2.9134338610740754e-06, "loss": 0.2346, "num_input_tokens_seen": 158223024, "step": 73380 }, { "epoch": 13.467608735547808, "grad_norm": 0.038377415388822556, "learning_rate": 2.9127061900254295e-06, "loss": 0.0004, "num_input_tokens_seen": 158234832, "step": 73385 }, { "epoch": 13.46852633510736, "grad_norm": 0.005329588428139687, "learning_rate": 2.9119785725126316e-06, "loss": 0.2052, "num_input_tokens_seen": 158245232, "step": 73390 }, { "epoch": 13.469443934666911, "grad_norm": 0.014798588119447231, "learning_rate": 2.9112510085543497e-06, "loss": 0.0104, "num_input_tokens_seen": 158256048, "step": 73395 }, { "epoch": 13.470361534226464, "grad_norm": 3.137219190597534, "learning_rate": 2.910523498169242e-06, "loss": 0.1963, "num_input_tokens_seen": 158267344, "step": 73400 }, { "epoch": 13.471279133786016, "grad_norm": 0.0023879408836364746, "learning_rate": 2.9097960413759683e-06, "loss": 0.0005, "num_input_tokens_seen": 158277616, "step": 73405 }, { "epoch": 13.472196733345568, "grad_norm": 80.74789428710938, "learning_rate": 2.9090686381931876e-06, "loss": 0.0057, "num_input_tokens_seen": 158287984, "step": 73410 }, { "epoch": 13.473114332905121, "grad_norm": 52.55048370361328, "learning_rate": 2.9083412886395522e-06, "loss": 0.1266, "num_input_tokens_seen": 158299056, "step": 73415 }, { "epoch": 13.474031932464673, "grad_norm": 0.031841132789850235, "learning_rate": 2.907613992733724e-06, "loss": 0.1735, "num_input_tokens_seen": 158310640, "step": 73420 }, { "epoch": 13.474949532024224, "grad_norm": 0.1858886033296585, "learning_rate": 2.906886750494353e-06, "loss": 0.2253, "num_input_tokens_seen": 158320048, "step": 73425 }, { "epoch": 13.475867131583778, "grad_norm": 115.25201416015625, "learning_rate": 2.9061595619400918e-06, "loss": 0.0542, "num_input_tokens_seen": 158329936, "step": 73430 }, { "epoch": 13.47678473114333, "grad_norm": 0.5162591934204102, "learning_rate": 2.905432427089594e-06, "loss": 0.0004, "num_input_tokens_seen": 158341872, "step": 73435 }, { "epoch": 13.477702330702881, "grad_norm": 3.168175220489502, "learning_rate": 2.9047053459615083e-06, "loss": 0.0036, "num_input_tokens_seen": 158352464, "step": 73440 }, { "epoch": 13.478619930262434, "grad_norm": 0.013966793194413185, "learning_rate": 2.903978318574483e-06, "loss": 0.0073, "num_input_tokens_seen": 158364368, "step": 73445 }, { "epoch": 13.479537529821986, "grad_norm": 20.122766494750977, "learning_rate": 2.903251344947164e-06, "loss": 0.1287, "num_input_tokens_seen": 158374704, "step": 73450 }, { "epoch": 13.480455129381538, "grad_norm": 0.24567021429538727, "learning_rate": 2.9025244250982e-06, "loss": 0.2628, "num_input_tokens_seen": 158386064, "step": 73455 }, { "epoch": 13.481372728941091, "grad_norm": 0.1372455656528473, "learning_rate": 2.9017975590462332e-06, "loss": 0.0534, "num_input_tokens_seen": 158396400, "step": 73460 }, { "epoch": 13.482290328500643, "grad_norm": 0.12277353554964066, "learning_rate": 2.9010707468099054e-06, "loss": 0.0293, "num_input_tokens_seen": 158406992, "step": 73465 }, { "epoch": 13.483207928060194, "grad_norm": 0.0018446645699441433, "learning_rate": 2.9003439884078615e-06, "loss": 0.2018, "num_input_tokens_seen": 158418384, "step": 73470 }, { "epoch": 13.484125527619748, "grad_norm": 0.028905916959047318, "learning_rate": 2.899617283858741e-06, "loss": 0.12, "num_input_tokens_seen": 158427632, "step": 73475 }, { "epoch": 13.4850431271793, "grad_norm": 0.31905508041381836, "learning_rate": 2.8988906331811788e-06, "loss": 0.0011, "num_input_tokens_seen": 158438288, "step": 73480 }, { "epoch": 13.48596072673885, "grad_norm": 0.04831746593117714, "learning_rate": 2.898164036393818e-06, "loss": 0.0081, "num_input_tokens_seen": 158448592, "step": 73485 }, { "epoch": 13.486878326298404, "grad_norm": 2.4881091117858887, "learning_rate": 2.897437493515293e-06, "loss": 0.0148, "num_input_tokens_seen": 158459632, "step": 73490 }, { "epoch": 13.487795925857956, "grad_norm": 0.021942900493741035, "learning_rate": 2.896711004564236e-06, "loss": 0.1181, "num_input_tokens_seen": 158472560, "step": 73495 }, { "epoch": 13.488713525417507, "grad_norm": 0.18197743594646454, "learning_rate": 2.8959845695592807e-06, "loss": 0.007, "num_input_tokens_seen": 158483632, "step": 73500 }, { "epoch": 13.48963112497706, "grad_norm": 2.6915969848632812, "learning_rate": 2.895258188519062e-06, "loss": 0.1559, "num_input_tokens_seen": 158493808, "step": 73505 }, { "epoch": 13.490548724536612, "grad_norm": 8.045494079589844, "learning_rate": 2.894531861462209e-06, "loss": 0.0305, "num_input_tokens_seen": 158504272, "step": 73510 }, { "epoch": 13.491466324096164, "grad_norm": 0.0843547135591507, "learning_rate": 2.8938055884073492e-06, "loss": 0.0003, "num_input_tokens_seen": 158515472, "step": 73515 }, { "epoch": 13.492383923655717, "grad_norm": 0.006886679213494062, "learning_rate": 2.893079369373113e-06, "loss": 0.0001, "num_input_tokens_seen": 158526704, "step": 73520 }, { "epoch": 13.493301523215269, "grad_norm": 0.3426934778690338, "learning_rate": 2.8923532043781254e-06, "loss": 0.0594, "num_input_tokens_seen": 158536816, "step": 73525 }, { "epoch": 13.49421912277482, "grad_norm": 0.12002637982368469, "learning_rate": 2.8916270934410097e-06, "loss": 0.0003, "num_input_tokens_seen": 158547120, "step": 73530 }, { "epoch": 13.495136722334374, "grad_norm": 0.04972129687666893, "learning_rate": 2.8909010365803934e-06, "loss": 0.1587, "num_input_tokens_seen": 158556368, "step": 73535 }, { "epoch": 13.496054321893926, "grad_norm": 0.05628898739814758, "learning_rate": 2.890175033814897e-06, "loss": 0.0033, "num_input_tokens_seen": 158567088, "step": 73540 }, { "epoch": 13.496971921453477, "grad_norm": 0.03786463662981987, "learning_rate": 2.8894490851631405e-06, "loss": 0.0022, "num_input_tokens_seen": 158578608, "step": 73545 }, { "epoch": 13.49788952101303, "grad_norm": 0.0035170684568583965, "learning_rate": 2.8887231906437417e-06, "loss": 0.0884, "num_input_tokens_seen": 158588496, "step": 73550 }, { "epoch": 13.498807120572582, "grad_norm": 485.2991943359375, "learning_rate": 2.887997350275324e-06, "loss": 0.1396, "num_input_tokens_seen": 158599792, "step": 73555 }, { "epoch": 13.499724720132134, "grad_norm": 0.007255214266479015, "learning_rate": 2.8872715640765003e-06, "loss": 0.1503, "num_input_tokens_seen": 158610320, "step": 73560 }, { "epoch": 13.500642319691687, "grad_norm": 0.020492233335971832, "learning_rate": 2.8865458320658844e-06, "loss": 0.3193, "num_input_tokens_seen": 158620880, "step": 73565 }, { "epoch": 13.501559919251239, "grad_norm": 0.14623799920082092, "learning_rate": 2.8858201542620945e-06, "loss": 0.0483, "num_input_tokens_seen": 158631536, "step": 73570 }, { "epoch": 13.50247751881079, "grad_norm": 0.034801557660102844, "learning_rate": 2.8850945306837406e-06, "loss": 0.0003, "num_input_tokens_seen": 158642256, "step": 73575 }, { "epoch": 13.503395118370344, "grad_norm": 0.01934225484728813, "learning_rate": 2.8843689613494352e-06, "loss": 0.0004, "num_input_tokens_seen": 158652048, "step": 73580 }, { "epoch": 13.504312717929896, "grad_norm": 0.003359117778018117, "learning_rate": 2.883643446277784e-06, "loss": 0.0006, "num_input_tokens_seen": 158662192, "step": 73585 }, { "epoch": 13.505230317489447, "grad_norm": 0.002558112842962146, "learning_rate": 2.8829179854874013e-06, "loss": 0.0891, "num_input_tokens_seen": 158672912, "step": 73590 }, { "epoch": 13.506147917049, "grad_norm": 0.0913178101181984, "learning_rate": 2.88219257899689e-06, "loss": 0.0005, "num_input_tokens_seen": 158683248, "step": 73595 }, { "epoch": 13.507065516608552, "grad_norm": 0.5584600567817688, "learning_rate": 2.881467226824858e-06, "loss": 0.0012, "num_input_tokens_seen": 158694768, "step": 73600 }, { "epoch": 13.507983116168104, "grad_norm": 5.547706604003906, "learning_rate": 2.880741928989907e-06, "loss": 0.0042, "num_input_tokens_seen": 158706608, "step": 73605 }, { "epoch": 13.508900715727657, "grad_norm": 0.01275892835110426, "learning_rate": 2.880016685510639e-06, "loss": 0.0823, "num_input_tokens_seen": 158717392, "step": 73610 }, { "epoch": 13.509818315287209, "grad_norm": 0.36364465951919556, "learning_rate": 2.87929149640566e-06, "loss": 0.0015, "num_input_tokens_seen": 158728848, "step": 73615 }, { "epoch": 13.51073591484676, "grad_norm": 41.45484924316406, "learning_rate": 2.878566361693567e-06, "loss": 0.1191, "num_input_tokens_seen": 158738832, "step": 73620 }, { "epoch": 13.511653514406314, "grad_norm": 230.52806091308594, "learning_rate": 2.877841281392959e-06, "loss": 0.0591, "num_input_tokens_seen": 158749040, "step": 73625 }, { "epoch": 13.512571113965866, "grad_norm": 0.01995791681110859, "learning_rate": 2.877116255522433e-06, "loss": 0.0001, "num_input_tokens_seen": 158760816, "step": 73630 }, { "epoch": 13.513488713525417, "grad_norm": 0.21265845000743866, "learning_rate": 2.8763912841005833e-06, "loss": 0.1814, "num_input_tokens_seen": 158771696, "step": 73635 }, { "epoch": 13.51440631308497, "grad_norm": 0.009385728277266026, "learning_rate": 2.8756663671460072e-06, "loss": 0.0331, "num_input_tokens_seen": 158782640, "step": 73640 }, { "epoch": 13.515323912644522, "grad_norm": 0.021259529516100883, "learning_rate": 2.8749415046772964e-06, "loss": 0.1346, "num_input_tokens_seen": 158793552, "step": 73645 }, { "epoch": 13.516241512204074, "grad_norm": 92.0199203491211, "learning_rate": 2.874216696713041e-06, "loss": 0.319, "num_input_tokens_seen": 158804720, "step": 73650 }, { "epoch": 13.517159111763627, "grad_norm": 324.225341796875, "learning_rate": 2.8734919432718343e-06, "loss": 0.223, "num_input_tokens_seen": 158816400, "step": 73655 }, { "epoch": 13.518076711323179, "grad_norm": 0.005005158483982086, "learning_rate": 2.8727672443722642e-06, "loss": 0.0001, "num_input_tokens_seen": 158827888, "step": 73660 }, { "epoch": 13.51899431088273, "grad_norm": 70.63372802734375, "learning_rate": 2.872042600032915e-06, "loss": 0.3391, "num_input_tokens_seen": 158840144, "step": 73665 }, { "epoch": 13.519911910442284, "grad_norm": 112.78107452392578, "learning_rate": 2.8713180102723764e-06, "loss": 0.2303, "num_input_tokens_seen": 158850960, "step": 73670 }, { "epoch": 13.520829510001835, "grad_norm": 40.4635124206543, "learning_rate": 2.8705934751092323e-06, "loss": 0.1483, "num_input_tokens_seen": 158861488, "step": 73675 }, { "epoch": 13.521747109561387, "grad_norm": 89.61929321289062, "learning_rate": 2.869868994562065e-06, "loss": 0.0799, "num_input_tokens_seen": 158873360, "step": 73680 }, { "epoch": 13.52266470912094, "grad_norm": 1.4886237382888794, "learning_rate": 2.8691445686494545e-06, "loss": 0.0007, "num_input_tokens_seen": 158883952, "step": 73685 }, { "epoch": 13.523582308680492, "grad_norm": 0.08651198446750641, "learning_rate": 2.8684201973899856e-06, "loss": 0.1802, "num_input_tokens_seen": 158894224, "step": 73690 }, { "epoch": 13.524499908240044, "grad_norm": 0.23967549204826355, "learning_rate": 2.8676958808022346e-06, "loss": 0.0027, "num_input_tokens_seen": 158905264, "step": 73695 }, { "epoch": 13.525417507799597, "grad_norm": 0.142361581325531, "learning_rate": 2.866971618904778e-06, "loss": 0.1322, "num_input_tokens_seen": 158914960, "step": 73700 }, { "epoch": 13.526335107359149, "grad_norm": 59.9813346862793, "learning_rate": 2.8662474117161955e-06, "loss": 0.0605, "num_input_tokens_seen": 158926608, "step": 73705 }, { "epoch": 13.5272527069187, "grad_norm": 0.7827543020248413, "learning_rate": 2.865523259255059e-06, "loss": 0.0011, "num_input_tokens_seen": 158936848, "step": 73710 }, { "epoch": 13.528170306478254, "grad_norm": 0.02122434228658676, "learning_rate": 2.8647991615399436e-06, "loss": 0.0017, "num_input_tokens_seen": 158947632, "step": 73715 }, { "epoch": 13.529087906037805, "grad_norm": 0.05679528787732124, "learning_rate": 2.8640751185894176e-06, "loss": 0.0002, "num_input_tokens_seen": 158958128, "step": 73720 }, { "epoch": 13.530005505597357, "grad_norm": 86.70923614501953, "learning_rate": 2.8633511304220574e-06, "loss": 0.1874, "num_input_tokens_seen": 158968368, "step": 73725 }, { "epoch": 13.53092310515691, "grad_norm": 133.8714599609375, "learning_rate": 2.862627197056429e-06, "loss": 0.1614, "num_input_tokens_seen": 158979600, "step": 73730 }, { "epoch": 13.531840704716462, "grad_norm": 56.20091247558594, "learning_rate": 2.8619033185110976e-06, "loss": 0.0156, "num_input_tokens_seen": 158990352, "step": 73735 }, { "epoch": 13.532758304276014, "grad_norm": 22.16942024230957, "learning_rate": 2.8611794948046357e-06, "loss": 0.1164, "num_input_tokens_seen": 158999536, "step": 73740 }, { "epoch": 13.533675903835567, "grad_norm": 0.02421480603516102, "learning_rate": 2.8604557259556037e-06, "loss": 0.1847, "num_input_tokens_seen": 159009840, "step": 73745 }, { "epoch": 13.534593503395119, "grad_norm": 25.167173385620117, "learning_rate": 2.8597320119825642e-06, "loss": 0.0028, "num_input_tokens_seen": 159020496, "step": 73750 }, { "epoch": 13.53551110295467, "grad_norm": 0.024522043764591217, "learning_rate": 2.8590083529040847e-06, "loss": 0.2415, "num_input_tokens_seen": 159031600, "step": 73755 }, { "epoch": 13.536428702514224, "grad_norm": 150.6254425048828, "learning_rate": 2.8582847487387224e-06, "loss": 0.0739, "num_input_tokens_seen": 159042672, "step": 73760 }, { "epoch": 13.537346302073775, "grad_norm": 0.8774486184120178, "learning_rate": 2.857561199505036e-06, "loss": 0.0024, "num_input_tokens_seen": 159053072, "step": 73765 }, { "epoch": 13.538263901633327, "grad_norm": 0.3180674910545349, "learning_rate": 2.8568377052215828e-06, "loss": 0.1307, "num_input_tokens_seen": 159062960, "step": 73770 }, { "epoch": 13.53918150119288, "grad_norm": 0.08818931877613068, "learning_rate": 2.856114265906923e-06, "loss": 0.0014, "num_input_tokens_seen": 159072688, "step": 73775 }, { "epoch": 13.540099100752432, "grad_norm": 0.022423505783081055, "learning_rate": 2.8553908815796095e-06, "loss": 0.0101, "num_input_tokens_seen": 159083344, "step": 73780 }, { "epoch": 13.541016700311983, "grad_norm": 34.8932991027832, "learning_rate": 2.8546675522581947e-06, "loss": 0.166, "num_input_tokens_seen": 159094064, "step": 73785 }, { "epoch": 13.541934299871537, "grad_norm": 82.219970703125, "learning_rate": 2.8539442779612332e-06, "loss": 0.3136, "num_input_tokens_seen": 159103152, "step": 73790 }, { "epoch": 13.542851899431088, "grad_norm": 0.06466986984014511, "learning_rate": 2.853221058707275e-06, "loss": 0.3472, "num_input_tokens_seen": 159113712, "step": 73795 }, { "epoch": 13.54376949899064, "grad_norm": 0.0771990716457367, "learning_rate": 2.8524978945148702e-06, "loss": 0.0016, "num_input_tokens_seen": 159123568, "step": 73800 }, { "epoch": 13.544687098550193, "grad_norm": 0.08412856608629227, "learning_rate": 2.8517747854025633e-06, "loss": 0.1912, "num_input_tokens_seen": 159134032, "step": 73805 }, { "epoch": 13.545604698109745, "grad_norm": 0.06714761257171631, "learning_rate": 2.8510517313889063e-06, "loss": 0.0431, "num_input_tokens_seen": 159145392, "step": 73810 }, { "epoch": 13.546522297669297, "grad_norm": 0.16486535966396332, "learning_rate": 2.8503287324924413e-06, "loss": 0.0652, "num_input_tokens_seen": 159156208, "step": 73815 }, { "epoch": 13.54743989722885, "grad_norm": 1.519816279411316, "learning_rate": 2.849605788731713e-06, "loss": 0.109, "num_input_tokens_seen": 159166736, "step": 73820 }, { "epoch": 13.548357496788402, "grad_norm": 0.04100726172327995, "learning_rate": 2.8488829001252632e-06, "loss": 0.0002, "num_input_tokens_seen": 159177872, "step": 73825 }, { "epoch": 13.549275096347953, "grad_norm": 27.843780517578125, "learning_rate": 2.848160066691633e-06, "loss": 0.2024, "num_input_tokens_seen": 159188944, "step": 73830 }, { "epoch": 13.550192695907507, "grad_norm": 0.007980064488947392, "learning_rate": 2.8474372884493605e-06, "loss": 0.0007, "num_input_tokens_seen": 159199568, "step": 73835 }, { "epoch": 13.551110295467058, "grad_norm": 0.014531990513205528, "learning_rate": 2.846714565416987e-06, "loss": 0.1626, "num_input_tokens_seen": 159212400, "step": 73840 }, { "epoch": 13.55202789502661, "grad_norm": 623.3294677734375, "learning_rate": 2.8459918976130474e-06, "loss": 0.1198, "num_input_tokens_seen": 159223120, "step": 73845 }, { "epoch": 13.552945494586163, "grad_norm": 0.19594068825244904, "learning_rate": 2.845269285056076e-06, "loss": 0.0269, "num_input_tokens_seen": 159233520, "step": 73850 }, { "epoch": 13.553863094145715, "grad_norm": 0.034618958830833435, "learning_rate": 2.844546727764609e-06, "loss": 0.1566, "num_input_tokens_seen": 159243056, "step": 73855 }, { "epoch": 13.554780693705267, "grad_norm": 0.004398564342409372, "learning_rate": 2.843824225757178e-06, "loss": 0.0053, "num_input_tokens_seen": 159253904, "step": 73860 }, { "epoch": 13.55569829326482, "grad_norm": 0.16516508162021637, "learning_rate": 2.843101779052314e-06, "loss": 0.1378, "num_input_tokens_seen": 159264560, "step": 73865 }, { "epoch": 13.556615892824372, "grad_norm": 0.007906070910394192, "learning_rate": 2.8423793876685444e-06, "loss": 0.1722, "num_input_tokens_seen": 159276496, "step": 73870 }, { "epoch": 13.557533492383923, "grad_norm": 0.4730050563812256, "learning_rate": 2.8416570516244018e-06, "loss": 0.0012, "num_input_tokens_seen": 159286736, "step": 73875 }, { "epoch": 13.558451091943477, "grad_norm": 0.09492307156324387, "learning_rate": 2.8409347709384103e-06, "loss": 0.2138, "num_input_tokens_seen": 159298736, "step": 73880 }, { "epoch": 13.559368691503028, "grad_norm": 0.14426390826702118, "learning_rate": 2.840212545629094e-06, "loss": 0.0002, "num_input_tokens_seen": 159309104, "step": 73885 }, { "epoch": 13.56028629106258, "grad_norm": 0.05452123284339905, "learning_rate": 2.8394903757149805e-06, "loss": 0.1662, "num_input_tokens_seen": 159319632, "step": 73890 }, { "epoch": 13.561203890622133, "grad_norm": 0.045270007103681564, "learning_rate": 2.8387682612145905e-06, "loss": 0.0433, "num_input_tokens_seen": 159330352, "step": 73895 }, { "epoch": 13.562121490181685, "grad_norm": 0.20164436101913452, "learning_rate": 2.838046202146445e-06, "loss": 0.0126, "num_input_tokens_seen": 159340848, "step": 73900 }, { "epoch": 13.563039089741237, "grad_norm": 0.05648680776357651, "learning_rate": 2.8373241985290613e-06, "loss": 0.0005, "num_input_tokens_seen": 159351824, "step": 73905 }, { "epoch": 13.56395668930079, "grad_norm": 0.044003285467624664, "learning_rate": 2.836602250380962e-06, "loss": 0.0006, "num_input_tokens_seen": 159361584, "step": 73910 }, { "epoch": 13.564874288860342, "grad_norm": 0.5490768551826477, "learning_rate": 2.8358803577206624e-06, "loss": 0.1952, "num_input_tokens_seen": 159373488, "step": 73915 }, { "epoch": 13.565791888419893, "grad_norm": 308.72900390625, "learning_rate": 2.8351585205666755e-06, "loss": 0.0441, "num_input_tokens_seen": 159384944, "step": 73920 }, { "epoch": 13.566709487979447, "grad_norm": 0.581575334072113, "learning_rate": 2.8344367389375193e-06, "loss": 0.0433, "num_input_tokens_seen": 159394704, "step": 73925 }, { "epoch": 13.567627087538998, "grad_norm": 3.1769120693206787, "learning_rate": 2.8337150128517042e-06, "loss": 0.129, "num_input_tokens_seen": 159405456, "step": 73930 }, { "epoch": 13.56854468709855, "grad_norm": 0.051546432077884674, "learning_rate": 2.83299334232774e-06, "loss": 0.0945, "num_input_tokens_seen": 159415888, "step": 73935 }, { "epoch": 13.569462286658103, "grad_norm": 0.0014459930825978518, "learning_rate": 2.832271727384139e-06, "loss": 0.0002, "num_input_tokens_seen": 159427120, "step": 73940 }, { "epoch": 13.570379886217655, "grad_norm": 11.135635375976562, "learning_rate": 2.8315501680394097e-06, "loss": 0.1444, "num_input_tokens_seen": 159436528, "step": 73945 }, { "epoch": 13.571297485777206, "grad_norm": 31.93780517578125, "learning_rate": 2.8308286643120574e-06, "loss": 0.2259, "num_input_tokens_seen": 159447408, "step": 73950 }, { "epoch": 13.57221508533676, "grad_norm": 67.44224548339844, "learning_rate": 2.8301072162205857e-06, "loss": 0.2228, "num_input_tokens_seen": 159458576, "step": 73955 }, { "epoch": 13.573132684896311, "grad_norm": 0.02507280930876732, "learning_rate": 2.8293858237835037e-06, "loss": 0.0001, "num_input_tokens_seen": 159470384, "step": 73960 }, { "epoch": 13.574050284455863, "grad_norm": 64.51466369628906, "learning_rate": 2.8286644870193104e-06, "loss": 0.3262, "num_input_tokens_seen": 159481840, "step": 73965 }, { "epoch": 13.574967884015416, "grad_norm": 29.97968101501465, "learning_rate": 2.8279432059465055e-06, "loss": 0.2715, "num_input_tokens_seen": 159492880, "step": 73970 }, { "epoch": 13.575885483574968, "grad_norm": 0.4003666341304779, "learning_rate": 2.8272219805835933e-06, "loss": 0.0011, "num_input_tokens_seen": 159504080, "step": 73975 }, { "epoch": 13.57680308313452, "grad_norm": 1.7804789543151855, "learning_rate": 2.82650081094907e-06, "loss": 0.0094, "num_input_tokens_seen": 159514960, "step": 73980 }, { "epoch": 13.577720682694073, "grad_norm": 0.5998281240463257, "learning_rate": 2.8257796970614303e-06, "loss": 0.129, "num_input_tokens_seen": 159526768, "step": 73985 }, { "epoch": 13.578638282253625, "grad_norm": 99.22362518310547, "learning_rate": 2.825058638939173e-06, "loss": 0.2693, "num_input_tokens_seen": 159537552, "step": 73990 }, { "epoch": 13.579555881813176, "grad_norm": 23.844303131103516, "learning_rate": 2.824337636600792e-06, "loss": 0.2476, "num_input_tokens_seen": 159548336, "step": 73995 }, { "epoch": 13.58047348137273, "grad_norm": 0.06606951355934143, "learning_rate": 2.823616690064778e-06, "loss": 0.0183, "num_input_tokens_seen": 159559568, "step": 74000 }, { "epoch": 13.581391080932281, "grad_norm": 62.29220199584961, "learning_rate": 2.8228957993496207e-06, "loss": 0.1622, "num_input_tokens_seen": 159571344, "step": 74005 }, { "epoch": 13.582308680491833, "grad_norm": 28.22193717956543, "learning_rate": 2.822174964473814e-06, "loss": 0.145, "num_input_tokens_seen": 159582256, "step": 74010 }, { "epoch": 13.583226280051386, "grad_norm": 0.05577938258647919, "learning_rate": 2.821454185455844e-06, "loss": 0.221, "num_input_tokens_seen": 159592112, "step": 74015 }, { "epoch": 13.584143879610938, "grad_norm": 0.05836043506860733, "learning_rate": 2.820733462314198e-06, "loss": 0.0885, "num_input_tokens_seen": 159602576, "step": 74020 }, { "epoch": 13.58506147917049, "grad_norm": 138.86097717285156, "learning_rate": 2.8200127950673608e-06, "loss": 0.0298, "num_input_tokens_seen": 159613168, "step": 74025 }, { "epoch": 13.585979078730043, "grad_norm": 0.1128048524260521, "learning_rate": 2.819292183733815e-06, "loss": 0.158, "num_input_tokens_seen": 159624176, "step": 74030 }, { "epoch": 13.586896678289595, "grad_norm": 0.06904340535402298, "learning_rate": 2.8185716283320462e-06, "loss": 0.3083, "num_input_tokens_seen": 159634640, "step": 74035 }, { "epoch": 13.587814277849146, "grad_norm": 0.015107713639736176, "learning_rate": 2.8178511288805355e-06, "loss": 0.0004, "num_input_tokens_seen": 159646480, "step": 74040 }, { "epoch": 13.5887318774087, "grad_norm": 27.754026412963867, "learning_rate": 2.8171306853977602e-06, "loss": 0.1663, "num_input_tokens_seen": 159657680, "step": 74045 }, { "epoch": 13.589649476968251, "grad_norm": 0.00401915842667222, "learning_rate": 2.8164102979022e-06, "loss": 0.2372, "num_input_tokens_seen": 159667952, "step": 74050 }, { "epoch": 13.590567076527803, "grad_norm": 34.02011489868164, "learning_rate": 2.8156899664123295e-06, "loss": 0.1453, "num_input_tokens_seen": 159677264, "step": 74055 }, { "epoch": 13.591484676087356, "grad_norm": 2.544455051422119, "learning_rate": 2.8149696909466285e-06, "loss": 0.0902, "num_input_tokens_seen": 159688784, "step": 74060 }, { "epoch": 13.592402275646908, "grad_norm": 1.239945888519287, "learning_rate": 2.814249471523568e-06, "loss": 0.1371, "num_input_tokens_seen": 159699280, "step": 74065 }, { "epoch": 13.59331987520646, "grad_norm": 180.3713836669922, "learning_rate": 2.813529308161619e-06, "loss": 0.2134, "num_input_tokens_seen": 159709616, "step": 74070 }, { "epoch": 13.594237474766013, "grad_norm": 1.3032044172286987, "learning_rate": 2.812809200879256e-06, "loss": 0.0008, "num_input_tokens_seen": 159720560, "step": 74075 }, { "epoch": 13.595155074325564, "grad_norm": 0.007436808198690414, "learning_rate": 2.812089149694948e-06, "loss": 0.2696, "num_input_tokens_seen": 159731216, "step": 74080 }, { "epoch": 13.596072673885116, "grad_norm": 0.012449676170945168, "learning_rate": 2.8113691546271614e-06, "loss": 0.0002, "num_input_tokens_seen": 159741744, "step": 74085 }, { "epoch": 13.59699027344467, "grad_norm": 0.10487135499715805, "learning_rate": 2.810649215694362e-06, "loss": 0.0287, "num_input_tokens_seen": 159751856, "step": 74090 }, { "epoch": 13.597907873004221, "grad_norm": 0.094377301633358, "learning_rate": 2.80992933291502e-06, "loss": 0.0035, "num_input_tokens_seen": 159763216, "step": 74095 }, { "epoch": 13.598825472563773, "grad_norm": 0.03613162040710449, "learning_rate": 2.8092095063075955e-06, "loss": 0.0843, "num_input_tokens_seen": 159773392, "step": 74100 }, { "epoch": 13.599743072123326, "grad_norm": 298.96539306640625, "learning_rate": 2.8084897358905506e-06, "loss": 0.1674, "num_input_tokens_seen": 159783632, "step": 74105 }, { "epoch": 13.600660671682878, "grad_norm": 0.007784544490277767, "learning_rate": 2.807770021682348e-06, "loss": 0.0329, "num_input_tokens_seen": 159793744, "step": 74110 }, { "epoch": 13.60157827124243, "grad_norm": 0.2454555630683899, "learning_rate": 2.8070503637014477e-06, "loss": 0.1567, "num_input_tokens_seen": 159804624, "step": 74115 }, { "epoch": 13.602495870801983, "grad_norm": 0.024923156946897507, "learning_rate": 2.8063307619663047e-06, "loss": 0.0011, "num_input_tokens_seen": 159814448, "step": 74120 }, { "epoch": 13.603413470361534, "grad_norm": 0.02019057609140873, "learning_rate": 2.80561121649538e-06, "loss": 0.0993, "num_input_tokens_seen": 159824336, "step": 74125 }, { "epoch": 13.604331069921086, "grad_norm": 0.07239864021539688, "learning_rate": 2.8048917273071263e-06, "loss": 0.0461, "num_input_tokens_seen": 159835440, "step": 74130 }, { "epoch": 13.60524866948064, "grad_norm": 0.7326725721359253, "learning_rate": 2.8041722944199977e-06, "loss": 0.0015, "num_input_tokens_seen": 159846736, "step": 74135 }, { "epoch": 13.606166269040191, "grad_norm": 0.0037243713159114122, "learning_rate": 2.803452917852445e-06, "loss": 0.0009, "num_input_tokens_seen": 159857488, "step": 74140 }, { "epoch": 13.607083868599743, "grad_norm": 0.013335811905562878, "learning_rate": 2.802733597622922e-06, "loss": 0.0002, "num_input_tokens_seen": 159868688, "step": 74145 }, { "epoch": 13.608001468159296, "grad_norm": 0.006322372704744339, "learning_rate": 2.802014333749877e-06, "loss": 0.0001, "num_input_tokens_seen": 159880240, "step": 74150 }, { "epoch": 13.608919067718848, "grad_norm": 0.006433958187699318, "learning_rate": 2.801295126251755e-06, "loss": 0.0006, "num_input_tokens_seen": 159890928, "step": 74155 }, { "epoch": 13.6098366672784, "grad_norm": 0.20157773792743683, "learning_rate": 2.8005759751470086e-06, "loss": 0.0004, "num_input_tokens_seen": 159902192, "step": 74160 }, { "epoch": 13.610754266837953, "grad_norm": 0.07357234507799149, "learning_rate": 2.7998568804540786e-06, "loss": 0.0002, "num_input_tokens_seen": 159914160, "step": 74165 }, { "epoch": 13.611671866397504, "grad_norm": 0.3582957983016968, "learning_rate": 2.7991378421914107e-06, "loss": 0.422, "num_input_tokens_seen": 159925904, "step": 74170 }, { "epoch": 13.612589465957056, "grad_norm": 0.25524184107780457, "learning_rate": 2.798418860377443e-06, "loss": 0.1609, "num_input_tokens_seen": 159937520, "step": 74175 }, { "epoch": 13.61350706551661, "grad_norm": 0.032618217170238495, "learning_rate": 2.797699935030622e-06, "loss": 0.1749, "num_input_tokens_seen": 159948400, "step": 74180 }, { "epoch": 13.614424665076161, "grad_norm": 38.75328063964844, "learning_rate": 2.7969810661693848e-06, "loss": 0.1929, "num_input_tokens_seen": 159959472, "step": 74185 }, { "epoch": 13.615342264635713, "grad_norm": 1082.1248779296875, "learning_rate": 2.7962622538121665e-06, "loss": 0.3228, "num_input_tokens_seen": 159970224, "step": 74190 }, { "epoch": 13.616259864195266, "grad_norm": 87.09615325927734, "learning_rate": 2.7955434979774077e-06, "loss": 0.0097, "num_input_tokens_seen": 159981072, "step": 74195 }, { "epoch": 13.617177463754818, "grad_norm": 0.03224766254425049, "learning_rate": 2.794824798683542e-06, "loss": 0.1486, "num_input_tokens_seen": 159990320, "step": 74200 }, { "epoch": 13.61809506331437, "grad_norm": 0.0031380271539092064, "learning_rate": 2.794106155949e-06, "loss": 0.0011, "num_input_tokens_seen": 160001104, "step": 74205 }, { "epoch": 13.619012662873923, "grad_norm": 0.004661582410335541, "learning_rate": 2.7933875697922184e-06, "loss": 0.1129, "num_input_tokens_seen": 160012816, "step": 74210 }, { "epoch": 13.619930262433474, "grad_norm": 65.89088439941406, "learning_rate": 2.7926690402316257e-06, "loss": 0.0228, "num_input_tokens_seen": 160023312, "step": 74215 }, { "epoch": 13.620847861993026, "grad_norm": 67.4089584350586, "learning_rate": 2.7919505672856518e-06, "loss": 0.0148, "num_input_tokens_seen": 160034960, "step": 74220 }, { "epoch": 13.62176546155258, "grad_norm": 0.02550489269196987, "learning_rate": 2.7912321509727208e-06, "loss": 0.1304, "num_input_tokens_seen": 160045040, "step": 74225 }, { "epoch": 13.62268306111213, "grad_norm": 2.3569884300231934, "learning_rate": 2.7905137913112647e-06, "loss": 0.0211, "num_input_tokens_seen": 160055088, "step": 74230 }, { "epoch": 13.623600660671682, "grad_norm": 0.031323809176683426, "learning_rate": 2.7897954883197042e-06, "loss": 0.0171, "num_input_tokens_seen": 160067600, "step": 74235 }, { "epoch": 13.624518260231236, "grad_norm": 0.013704859651625156, "learning_rate": 2.7890772420164646e-06, "loss": 0.1715, "num_input_tokens_seen": 160078512, "step": 74240 }, { "epoch": 13.625435859790787, "grad_norm": 0.030068103224039078, "learning_rate": 2.788359052419968e-06, "loss": 0.1988, "num_input_tokens_seen": 160090384, "step": 74245 }, { "epoch": 13.626353459350339, "grad_norm": 0.04832563176751137, "learning_rate": 2.7876409195486305e-06, "loss": 0.0006, "num_input_tokens_seen": 160101072, "step": 74250 }, { "epoch": 13.627271058909892, "grad_norm": 0.02814597822725773, "learning_rate": 2.7869228434208774e-06, "loss": 0.0002, "num_input_tokens_seen": 160111696, "step": 74255 }, { "epoch": 13.628188658469444, "grad_norm": 0.030581742525100708, "learning_rate": 2.786204824055123e-06, "loss": 0.0084, "num_input_tokens_seen": 160121456, "step": 74260 }, { "epoch": 13.629106258028996, "grad_norm": 217.66458129882812, "learning_rate": 2.785486861469784e-06, "loss": 0.0536, "num_input_tokens_seen": 160132912, "step": 74265 }, { "epoch": 13.630023857588549, "grad_norm": 0.49745166301727295, "learning_rate": 2.7847689556832745e-06, "loss": 0.0867, "num_input_tokens_seen": 160142768, "step": 74270 }, { "epoch": 13.6309414571481, "grad_norm": 366.240478515625, "learning_rate": 2.7840511067140065e-06, "loss": 0.3392, "num_input_tokens_seen": 160153392, "step": 74275 }, { "epoch": 13.631859056707652, "grad_norm": 0.01892010308802128, "learning_rate": 2.7833333145803946e-06, "loss": 0.0003, "num_input_tokens_seen": 160164784, "step": 74280 }, { "epoch": 13.632776656267206, "grad_norm": 0.027020925655961037, "learning_rate": 2.782615579300848e-06, "loss": 0.2002, "num_input_tokens_seen": 160175728, "step": 74285 }, { "epoch": 13.633694255826757, "grad_norm": 85.202392578125, "learning_rate": 2.7818979008937735e-06, "loss": 0.0084, "num_input_tokens_seen": 160187216, "step": 74290 }, { "epoch": 13.634611855386309, "grad_norm": 0.0038581418339163065, "learning_rate": 2.781180279377582e-06, "loss": 0.1136, "num_input_tokens_seen": 160198864, "step": 74295 }, { "epoch": 13.635529454945862, "grad_norm": 0.03360697999596596, "learning_rate": 2.7804627147706775e-06, "loss": 0.0404, "num_input_tokens_seen": 160208400, "step": 74300 }, { "epoch": 13.636447054505414, "grad_norm": 0.1268802285194397, "learning_rate": 2.7797452070914622e-06, "loss": 0.0614, "num_input_tokens_seen": 160218352, "step": 74305 }, { "epoch": 13.637364654064966, "grad_norm": 0.18919745087623596, "learning_rate": 2.7790277563583427e-06, "loss": 0.0004, "num_input_tokens_seen": 160228912, "step": 74310 }, { "epoch": 13.638282253624519, "grad_norm": 33.40460968017578, "learning_rate": 2.7783103625897194e-06, "loss": 0.5436, "num_input_tokens_seen": 160239440, "step": 74315 }, { "epoch": 13.63919985318407, "grad_norm": 0.11795951426029205, "learning_rate": 2.7775930258039925e-06, "loss": 0.0005, "num_input_tokens_seen": 160249200, "step": 74320 }, { "epoch": 13.640117452743622, "grad_norm": 0.6173234581947327, "learning_rate": 2.776875746019558e-06, "loss": 0.0011, "num_input_tokens_seen": 160259952, "step": 74325 }, { "epoch": 13.641035052303176, "grad_norm": 0.0022690489422529936, "learning_rate": 2.7761585232548165e-06, "loss": 0.0002, "num_input_tokens_seen": 160270288, "step": 74330 }, { "epoch": 13.641952651862727, "grad_norm": 70.10147857666016, "learning_rate": 2.7754413575281624e-06, "loss": 0.3728, "num_input_tokens_seen": 160280912, "step": 74335 }, { "epoch": 13.642870251422279, "grad_norm": 0.02554483152925968, "learning_rate": 2.7747242488579882e-06, "loss": 0.0002, "num_input_tokens_seen": 160291600, "step": 74340 }, { "epoch": 13.643787850981832, "grad_norm": 0.41497552394866943, "learning_rate": 2.7740071972626897e-06, "loss": 0.331, "num_input_tokens_seen": 160301744, "step": 74345 }, { "epoch": 13.644705450541384, "grad_norm": 0.00646289112046361, "learning_rate": 2.7732902027606568e-06, "loss": 0.0006, "num_input_tokens_seen": 160311760, "step": 74350 }, { "epoch": 13.645623050100935, "grad_norm": 0.3743710219860077, "learning_rate": 2.7725732653702786e-06, "loss": 0.1698, "num_input_tokens_seen": 160322032, "step": 74355 }, { "epoch": 13.646540649660489, "grad_norm": 262.6271057128906, "learning_rate": 2.771856385109943e-06, "loss": 0.171, "num_input_tokens_seen": 160332848, "step": 74360 }, { "epoch": 13.64745824922004, "grad_norm": 0.004358215257525444, "learning_rate": 2.7711395619980385e-06, "loss": 0.1257, "num_input_tokens_seen": 160342672, "step": 74365 }, { "epoch": 13.648375848779592, "grad_norm": 0.2665925920009613, "learning_rate": 2.7704227960529504e-06, "loss": 0.0215, "num_input_tokens_seen": 160353296, "step": 74370 }, { "epoch": 13.649293448339145, "grad_norm": 0.07322818040847778, "learning_rate": 2.7697060872930608e-06, "loss": 0.2753, "num_input_tokens_seen": 160362416, "step": 74375 }, { "epoch": 13.650211047898697, "grad_norm": 0.008075686171650887, "learning_rate": 2.7689894357367547e-06, "loss": 0.0001, "num_input_tokens_seen": 160373648, "step": 74380 }, { "epoch": 13.651128647458249, "grad_norm": 16.142253875732422, "learning_rate": 2.7682728414024117e-06, "loss": 0.1867, "num_input_tokens_seen": 160384976, "step": 74385 }, { "epoch": 13.652046247017802, "grad_norm": 0.21791252493858337, "learning_rate": 2.7675563043084096e-06, "loss": 0.0289, "num_input_tokens_seen": 160397136, "step": 74390 }, { "epoch": 13.652963846577354, "grad_norm": 0.04193601384758949, "learning_rate": 2.766839824473131e-06, "loss": 0.0013, "num_input_tokens_seen": 160407536, "step": 74395 }, { "epoch": 13.653881446136905, "grad_norm": 0.07948477566242218, "learning_rate": 2.766123401914949e-06, "loss": 0.0755, "num_input_tokens_seen": 160419088, "step": 74400 }, { "epoch": 13.654799045696459, "grad_norm": 0.013168847188353539, "learning_rate": 2.7654070366522403e-06, "loss": 0.0597, "num_input_tokens_seen": 160429776, "step": 74405 }, { "epoch": 13.65571664525601, "grad_norm": 44.25501251220703, "learning_rate": 2.7646907287033747e-06, "loss": 0.1772, "num_input_tokens_seen": 160438864, "step": 74410 }, { "epoch": 13.656634244815562, "grad_norm": 0.08183857053518295, "learning_rate": 2.76397447808673e-06, "loss": 0.0535, "num_input_tokens_seen": 160449904, "step": 74415 }, { "epoch": 13.657551844375115, "grad_norm": 418.55572509765625, "learning_rate": 2.7632582848206747e-06, "loss": 0.0647, "num_input_tokens_seen": 160461424, "step": 74420 }, { "epoch": 13.658469443934667, "grad_norm": 1.450003981590271, "learning_rate": 2.7625421489235753e-06, "loss": 0.0051, "num_input_tokens_seen": 160472496, "step": 74425 }, { "epoch": 13.659387043494219, "grad_norm": 0.46393558382987976, "learning_rate": 2.7618260704138043e-06, "loss": 0.0002, "num_input_tokens_seen": 160482256, "step": 74430 }, { "epoch": 13.660304643053772, "grad_norm": 0.1916068196296692, "learning_rate": 2.7611100493097253e-06, "loss": 0.0004, "num_input_tokens_seen": 160493232, "step": 74435 }, { "epoch": 13.661222242613324, "grad_norm": 0.007654041517525911, "learning_rate": 2.760394085629704e-06, "loss": 0.0001, "num_input_tokens_seen": 160503152, "step": 74440 }, { "epoch": 13.662139842172875, "grad_norm": 0.005450355354696512, "learning_rate": 2.759678179392102e-06, "loss": 0.1324, "num_input_tokens_seen": 160513744, "step": 74445 }, { "epoch": 13.663057441732429, "grad_norm": 195.64752197265625, "learning_rate": 2.7589623306152836e-06, "loss": 0.0139, "num_input_tokens_seen": 160525424, "step": 74450 }, { "epoch": 13.66397504129198, "grad_norm": 0.6410285234451294, "learning_rate": 2.758246539317608e-06, "loss": 0.0975, "num_input_tokens_seen": 160536688, "step": 74455 }, { "epoch": 13.664892640851532, "grad_norm": 0.0012974485289305449, "learning_rate": 2.7575308055174348e-06, "loss": 0.0479, "num_input_tokens_seen": 160548336, "step": 74460 }, { "epoch": 13.665810240411085, "grad_norm": 0.01690363883972168, "learning_rate": 2.756815129233121e-06, "loss": 0.1008, "num_input_tokens_seen": 160558864, "step": 74465 }, { "epoch": 13.666727839970637, "grad_norm": 0.006987473461776972, "learning_rate": 2.75609951048302e-06, "loss": 0.0002, "num_input_tokens_seen": 160569360, "step": 74470 }, { "epoch": 13.667645439530189, "grad_norm": 0.21433790028095245, "learning_rate": 2.755383949285491e-06, "loss": 0.0006, "num_input_tokens_seen": 160578736, "step": 74475 }, { "epoch": 13.668563039089742, "grad_norm": 0.024484319612383842, "learning_rate": 2.754668445658885e-06, "loss": 0.1918, "num_input_tokens_seen": 160589072, "step": 74480 }, { "epoch": 13.669480638649294, "grad_norm": 227.59750366210938, "learning_rate": 2.753952999621553e-06, "loss": 0.1378, "num_input_tokens_seen": 160599376, "step": 74485 }, { "epoch": 13.670398238208845, "grad_norm": 141.76275634765625, "learning_rate": 2.753237611191846e-06, "loss": 0.2165, "num_input_tokens_seen": 160610352, "step": 74490 }, { "epoch": 13.671315837768399, "grad_norm": 0.03684238716959953, "learning_rate": 2.7525222803881103e-06, "loss": 0.0002, "num_input_tokens_seen": 160621840, "step": 74495 }, { "epoch": 13.67223343732795, "grad_norm": 0.006895377300679684, "learning_rate": 2.751807007228696e-06, "loss": 0.0003, "num_input_tokens_seen": 160634000, "step": 74500 }, { "epoch": 13.673151036887502, "grad_norm": 0.14030790328979492, "learning_rate": 2.7510917917319485e-06, "loss": 0.0031, "num_input_tokens_seen": 160644816, "step": 74505 }, { "epoch": 13.674068636447055, "grad_norm": 0.020159367471933365, "learning_rate": 2.7503766339162086e-06, "loss": 0.0002, "num_input_tokens_seen": 160654896, "step": 74510 }, { "epoch": 13.674986236006607, "grad_norm": 50.43661117553711, "learning_rate": 2.7496615337998234e-06, "loss": 0.0822, "num_input_tokens_seen": 160665776, "step": 74515 }, { "epoch": 13.675903835566158, "grad_norm": 278.3178405761719, "learning_rate": 2.748946491401132e-06, "loss": 0.3006, "num_input_tokens_seen": 160675632, "step": 74520 }, { "epoch": 13.676821435125712, "grad_norm": 46.895896911621094, "learning_rate": 2.7482315067384725e-06, "loss": 0.2364, "num_input_tokens_seen": 160685712, "step": 74525 }, { "epoch": 13.677739034685263, "grad_norm": 0.513554036617279, "learning_rate": 2.7475165798301872e-06, "loss": 0.132, "num_input_tokens_seen": 160696624, "step": 74530 }, { "epoch": 13.678656634244815, "grad_norm": 0.09154373407363892, "learning_rate": 2.74680171069461e-06, "loss": 0.1439, "num_input_tokens_seen": 160707088, "step": 74535 }, { "epoch": 13.679574233804368, "grad_norm": 0.1297614425420761, "learning_rate": 2.7460868993500773e-06, "loss": 0.0018, "num_input_tokens_seen": 160718128, "step": 74540 }, { "epoch": 13.68049183336392, "grad_norm": 0.004495182074606419, "learning_rate": 2.7453721458149203e-06, "loss": 0.0708, "num_input_tokens_seen": 160729712, "step": 74545 }, { "epoch": 13.681409432923472, "grad_norm": 0.03070145845413208, "learning_rate": 2.744657450107475e-06, "loss": 0.0001, "num_input_tokens_seen": 160739312, "step": 74550 }, { "epoch": 13.682327032483025, "grad_norm": 0.17204003036022186, "learning_rate": 2.743942812246071e-06, "loss": 0.0007, "num_input_tokens_seen": 160749936, "step": 74555 }, { "epoch": 13.683244632042577, "grad_norm": 0.021223757416009903, "learning_rate": 2.7432282322490355e-06, "loss": 0.0005, "num_input_tokens_seen": 160760944, "step": 74560 }, { "epoch": 13.684162231602128, "grad_norm": 0.026317842304706573, "learning_rate": 2.7425137101347e-06, "loss": 0.1909, "num_input_tokens_seen": 160771152, "step": 74565 }, { "epoch": 13.685079831161682, "grad_norm": 140.61614990234375, "learning_rate": 2.7417992459213883e-06, "loss": 0.2424, "num_input_tokens_seen": 160784144, "step": 74570 }, { "epoch": 13.685997430721233, "grad_norm": 2.0477848052978516, "learning_rate": 2.741084839627425e-06, "loss": 0.0004, "num_input_tokens_seen": 160796400, "step": 74575 }, { "epoch": 13.686915030280785, "grad_norm": 0.03317119926214218, "learning_rate": 2.740370491271136e-06, "loss": 0.0067, "num_input_tokens_seen": 160807696, "step": 74580 }, { "epoch": 13.687832629840338, "grad_norm": 0.22504070401191711, "learning_rate": 2.7396562008708423e-06, "loss": 0.1733, "num_input_tokens_seen": 160818352, "step": 74585 }, { "epoch": 13.68875022939989, "grad_norm": 0.002408315660431981, "learning_rate": 2.7389419684448637e-06, "loss": 0.0051, "num_input_tokens_seen": 160830576, "step": 74590 }, { "epoch": 13.689667828959442, "grad_norm": 361.84918212890625, "learning_rate": 2.7382277940115172e-06, "loss": 0.0377, "num_input_tokens_seen": 160842000, "step": 74595 }, { "epoch": 13.690585428518995, "grad_norm": 0.27275538444519043, "learning_rate": 2.737513677589124e-06, "loss": 0.0002, "num_input_tokens_seen": 160853040, "step": 74600 }, { "epoch": 13.691503028078547, "grad_norm": 19.169939041137695, "learning_rate": 2.736799619195999e-06, "loss": 0.1291, "num_input_tokens_seen": 160863888, "step": 74605 }, { "epoch": 13.692420627638098, "grad_norm": 0.2824423909187317, "learning_rate": 2.7360856188504538e-06, "loss": 0.008, "num_input_tokens_seen": 160873936, "step": 74610 }, { "epoch": 13.693338227197652, "grad_norm": 0.17946727573871613, "learning_rate": 2.735371676570806e-06, "loss": 0.1326, "num_input_tokens_seen": 160884304, "step": 74615 }, { "epoch": 13.694255826757203, "grad_norm": 0.005475910846143961, "learning_rate": 2.7346577923753644e-06, "loss": 0.1566, "num_input_tokens_seen": 160896112, "step": 74620 }, { "epoch": 13.695173426316755, "grad_norm": 0.005492906551808119, "learning_rate": 2.7339439662824396e-06, "loss": 0.1629, "num_input_tokens_seen": 160907376, "step": 74625 }, { "epoch": 13.696091025876308, "grad_norm": 0.042115990072488785, "learning_rate": 2.733230198310338e-06, "loss": 0.0003, "num_input_tokens_seen": 160918320, "step": 74630 }, { "epoch": 13.69700862543586, "grad_norm": 0.08253137022256851, "learning_rate": 2.732516488477371e-06, "loss": 0.0002, "num_input_tokens_seen": 160927888, "step": 74635 }, { "epoch": 13.697926224995411, "grad_norm": 387.7668151855469, "learning_rate": 2.731802836801841e-06, "loss": 0.2076, "num_input_tokens_seen": 160938064, "step": 74640 }, { "epoch": 13.698843824554965, "grad_norm": 65.53970336914062, "learning_rate": 2.731089243302052e-06, "loss": 0.0026, "num_input_tokens_seen": 160949392, "step": 74645 }, { "epoch": 13.699761424114516, "grad_norm": 36.3685188293457, "learning_rate": 2.7303757079963083e-06, "loss": 0.36, "num_input_tokens_seen": 160960976, "step": 74650 }, { "epoch": 13.700679023674068, "grad_norm": 0.17957866191864014, "learning_rate": 2.7296622309029107e-06, "loss": 0.001, "num_input_tokens_seen": 160972816, "step": 74655 }, { "epoch": 13.701596623233621, "grad_norm": 0.019972411915659904, "learning_rate": 2.728948812040158e-06, "loss": 0.0002, "num_input_tokens_seen": 160983408, "step": 74660 }, { "epoch": 13.702514222793173, "grad_norm": 41.81761932373047, "learning_rate": 2.7282354514263464e-06, "loss": 0.1828, "num_input_tokens_seen": 160995472, "step": 74665 }, { "epoch": 13.703431822352725, "grad_norm": 10.930547714233398, "learning_rate": 2.7275221490797764e-06, "loss": 0.0034, "num_input_tokens_seen": 161005200, "step": 74670 }, { "epoch": 13.704349421912278, "grad_norm": 0.046877626329660416, "learning_rate": 2.7268089050187418e-06, "loss": 0.0001, "num_input_tokens_seen": 161016368, "step": 74675 }, { "epoch": 13.70526702147183, "grad_norm": 0.34761032462120056, "learning_rate": 2.7260957192615357e-06, "loss": 0.0003, "num_input_tokens_seen": 161026640, "step": 74680 }, { "epoch": 13.706184621031381, "grad_norm": 0.7221828699111938, "learning_rate": 2.72538259182645e-06, "loss": 0.001, "num_input_tokens_seen": 161038064, "step": 74685 }, { "epoch": 13.707102220590935, "grad_norm": 191.9327850341797, "learning_rate": 2.724669522731773e-06, "loss": 0.0536, "num_input_tokens_seen": 161048592, "step": 74690 }, { "epoch": 13.708019820150486, "grad_norm": 0.02977367863059044, "learning_rate": 2.723956511995799e-06, "loss": 0.1598, "num_input_tokens_seen": 161060944, "step": 74695 }, { "epoch": 13.708937419710038, "grad_norm": 92.11763000488281, "learning_rate": 2.7232435596368123e-06, "loss": 0.2198, "num_input_tokens_seen": 161071120, "step": 74700 }, { "epoch": 13.709855019269591, "grad_norm": 0.023613348603248596, "learning_rate": 2.7225306656730998e-06, "loss": 0.0511, "num_input_tokens_seen": 161083472, "step": 74705 }, { "epoch": 13.710772618829143, "grad_norm": 0.007270669564604759, "learning_rate": 2.7218178301229435e-06, "loss": 0.0209, "num_input_tokens_seen": 161093456, "step": 74710 }, { "epoch": 13.711690218388695, "grad_norm": 0.0066514271311461926, "learning_rate": 2.7211050530046325e-06, "loss": 0.0308, "num_input_tokens_seen": 161104688, "step": 74715 }, { "epoch": 13.712607817948248, "grad_norm": 24.007383346557617, "learning_rate": 2.7203923343364434e-06, "loss": 0.0011, "num_input_tokens_seen": 161116304, "step": 74720 }, { "epoch": 13.7135254175078, "grad_norm": 189.57315063476562, "learning_rate": 2.7196796741366583e-06, "loss": 0.0387, "num_input_tokens_seen": 161126928, "step": 74725 }, { "epoch": 13.714443017067351, "grad_norm": 0.007373057305812836, "learning_rate": 2.718967072423554e-06, "loss": 0.0003, "num_input_tokens_seen": 161137616, "step": 74730 }, { "epoch": 13.715360616626905, "grad_norm": 0.005983492825180292, "learning_rate": 2.7182545292154106e-06, "loss": 0.0008, "num_input_tokens_seen": 161148336, "step": 74735 }, { "epoch": 13.716278216186456, "grad_norm": 0.05923215299844742, "learning_rate": 2.7175420445305017e-06, "loss": 0.2857, "num_input_tokens_seen": 161157008, "step": 74740 }, { "epoch": 13.717195815746008, "grad_norm": 0.013683836907148361, "learning_rate": 2.7168296183871e-06, "loss": 0.1371, "num_input_tokens_seen": 161167760, "step": 74745 }, { "epoch": 13.718113415305561, "grad_norm": 158.32763671875, "learning_rate": 2.7161172508034826e-06, "loss": 0.167, "num_input_tokens_seen": 161180656, "step": 74750 }, { "epoch": 13.719031014865113, "grad_norm": 182.31338500976562, "learning_rate": 2.7154049417979176e-06, "loss": 0.2324, "num_input_tokens_seen": 161190352, "step": 74755 }, { "epoch": 13.719948614424665, "grad_norm": 0.005110797472298145, "learning_rate": 2.714692691388673e-06, "loss": 0.0026, "num_input_tokens_seen": 161201296, "step": 74760 }, { "epoch": 13.720866213984218, "grad_norm": 39.96390914916992, "learning_rate": 2.713980499594021e-06, "loss": 0.0534, "num_input_tokens_seen": 161212784, "step": 74765 }, { "epoch": 13.72178381354377, "grad_norm": 0.03125116229057312, "learning_rate": 2.7132683664322262e-06, "loss": 0.1533, "num_input_tokens_seen": 161223344, "step": 74770 }, { "epoch": 13.722701413103321, "grad_norm": 0.006165041588246822, "learning_rate": 2.7125562919215537e-06, "loss": 0.0006, "num_input_tokens_seen": 161234064, "step": 74775 }, { "epoch": 13.723619012662875, "grad_norm": 0.00463498430326581, "learning_rate": 2.7118442760802654e-06, "loss": 0.0003, "num_input_tokens_seen": 161244272, "step": 74780 }, { "epoch": 13.724536612222426, "grad_norm": 0.5966551303863525, "learning_rate": 2.711132318926627e-06, "loss": 0.0002, "num_input_tokens_seen": 161254000, "step": 74785 }, { "epoch": 13.725454211781978, "grad_norm": 0.035184819251298904, "learning_rate": 2.710420420478896e-06, "loss": 0.2568, "num_input_tokens_seen": 161263120, "step": 74790 }, { "epoch": 13.726371811341531, "grad_norm": 4.881939888000488, "learning_rate": 2.7097085807553326e-06, "loss": 0.1094, "num_input_tokens_seen": 161273648, "step": 74795 }, { "epoch": 13.727289410901083, "grad_norm": 107.69559478759766, "learning_rate": 2.708996799774195e-06, "loss": 0.2452, "num_input_tokens_seen": 161286768, "step": 74800 }, { "epoch": 13.728207010460634, "grad_norm": 0.014085110276937485, "learning_rate": 2.7082850775537397e-06, "loss": 0.0065, "num_input_tokens_seen": 161297072, "step": 74805 }, { "epoch": 13.729124610020188, "grad_norm": 2.3519811630249023, "learning_rate": 2.70757341411222e-06, "loss": 0.1072, "num_input_tokens_seen": 161307856, "step": 74810 }, { "epoch": 13.73004220957974, "grad_norm": 0.003142263274639845, "learning_rate": 2.7068618094678867e-06, "loss": 0.2079, "num_input_tokens_seen": 161317104, "step": 74815 }, { "epoch": 13.730959809139291, "grad_norm": 5.173829078674316, "learning_rate": 2.7061502636389967e-06, "loss": 0.1014, "num_input_tokens_seen": 161328272, "step": 74820 }, { "epoch": 13.731877408698844, "grad_norm": 0.7518013119697571, "learning_rate": 2.705438776643797e-06, "loss": 0.0009, "num_input_tokens_seen": 161338672, "step": 74825 }, { "epoch": 13.732795008258396, "grad_norm": 0.018174072727560997, "learning_rate": 2.7047273485005344e-06, "loss": 0.0735, "num_input_tokens_seen": 161349552, "step": 74830 }, { "epoch": 13.733712607817948, "grad_norm": 0.11427073925733566, "learning_rate": 2.70401597922746e-06, "loss": 0.068, "num_input_tokens_seen": 161361264, "step": 74835 }, { "epoch": 13.734630207377501, "grad_norm": 259.8937072753906, "learning_rate": 2.7033046688428177e-06, "loss": 0.0946, "num_input_tokens_seen": 161372208, "step": 74840 }, { "epoch": 13.735547806937053, "grad_norm": 27.9592342376709, "learning_rate": 2.7025934173648488e-06, "loss": 0.2212, "num_input_tokens_seen": 161382800, "step": 74845 }, { "epoch": 13.736465406496604, "grad_norm": 0.051575545221567154, "learning_rate": 2.7018822248118e-06, "loss": 0.1866, "num_input_tokens_seen": 161393328, "step": 74850 }, { "epoch": 13.737383006056158, "grad_norm": 0.038837332278490067, "learning_rate": 2.7011710912019106e-06, "loss": 0.3817, "num_input_tokens_seen": 161404272, "step": 74855 }, { "epoch": 13.73830060561571, "grad_norm": 0.07089199125766754, "learning_rate": 2.7004600165534188e-06, "loss": 0.0002, "num_input_tokens_seen": 161415856, "step": 74860 }, { "epoch": 13.739218205175261, "grad_norm": 0.718966007232666, "learning_rate": 2.699749000884563e-06, "loss": 0.0005, "num_input_tokens_seen": 161426672, "step": 74865 }, { "epoch": 13.740135804734814, "grad_norm": 5.944049835205078, "learning_rate": 2.6990380442135817e-06, "loss": 0.1216, "num_input_tokens_seen": 161438160, "step": 74870 }, { "epoch": 13.741053404294366, "grad_norm": 38.4362907409668, "learning_rate": 2.698327146558708e-06, "loss": 0.2913, "num_input_tokens_seen": 161449424, "step": 74875 }, { "epoch": 13.741971003853918, "grad_norm": 15.1460542678833, "learning_rate": 2.697616307938177e-06, "loss": 0.0025, "num_input_tokens_seen": 161459856, "step": 74880 }, { "epoch": 13.742888603413471, "grad_norm": 0.006217653863132, "learning_rate": 2.696905528370216e-06, "loss": 0.01, "num_input_tokens_seen": 161471248, "step": 74885 }, { "epoch": 13.743806202973023, "grad_norm": 2.895613670349121, "learning_rate": 2.6961948078730614e-06, "loss": 0.0064, "num_input_tokens_seen": 161481360, "step": 74890 }, { "epoch": 13.744723802532574, "grad_norm": 0.008333207108080387, "learning_rate": 2.695484146464939e-06, "loss": 0.2692, "num_input_tokens_seen": 161491792, "step": 74895 }, { "epoch": 13.745641402092128, "grad_norm": 0.05652916431427002, "learning_rate": 2.6947735441640764e-06, "loss": 0.0003, "num_input_tokens_seen": 161502576, "step": 74900 }, { "epoch": 13.74655900165168, "grad_norm": 0.034530848264694214, "learning_rate": 2.6940630009887003e-06, "loss": 0.0004, "num_input_tokens_seen": 161513040, "step": 74905 }, { "epoch": 13.74747660121123, "grad_norm": 6.49409294128418, "learning_rate": 2.693352516957034e-06, "loss": 0.087, "num_input_tokens_seen": 161523152, "step": 74910 }, { "epoch": 13.748394200770784, "grad_norm": 0.3936866819858551, "learning_rate": 2.6926420920872987e-06, "loss": 0.001, "num_input_tokens_seen": 161532464, "step": 74915 }, { "epoch": 13.749311800330336, "grad_norm": 0.3794471323490143, "learning_rate": 2.6919317263977198e-06, "loss": 0.2073, "num_input_tokens_seen": 161544016, "step": 74920 }, { "epoch": 13.750229399889887, "grad_norm": 0.12664055824279785, "learning_rate": 2.6912214199065146e-06, "loss": 0.0002, "num_input_tokens_seen": 161555440, "step": 74925 }, { "epoch": 13.75114699944944, "grad_norm": 0.2560325562953949, "learning_rate": 2.6905111726319e-06, "loss": 0.0003, "num_input_tokens_seen": 161564816, "step": 74930 }, { "epoch": 13.752064599008992, "grad_norm": 0.007453033234924078, "learning_rate": 2.6898009845920958e-06, "loss": 0.0245, "num_input_tokens_seen": 161575056, "step": 74935 }, { "epoch": 13.752982198568544, "grad_norm": 10.540006637573242, "learning_rate": 2.6890908558053163e-06, "loss": 0.1802, "num_input_tokens_seen": 161585936, "step": 74940 }, { "epoch": 13.753899798128097, "grad_norm": 0.10834833979606628, "learning_rate": 2.688380786289775e-06, "loss": 0.0008, "num_input_tokens_seen": 161596176, "step": 74945 }, { "epoch": 13.754817397687649, "grad_norm": 0.026441160589456558, "learning_rate": 2.687670776063682e-06, "loss": 0.0072, "num_input_tokens_seen": 161606576, "step": 74950 }, { "epoch": 13.7557349972472, "grad_norm": 0.013177919201552868, "learning_rate": 2.6869608251452517e-06, "loss": 0.0004, "num_input_tokens_seen": 161617840, "step": 74955 }, { "epoch": 13.756652596806754, "grad_norm": 0.04384305328130722, "learning_rate": 2.686250933552691e-06, "loss": 0.0009, "num_input_tokens_seen": 161628016, "step": 74960 }, { "epoch": 13.757570196366306, "grad_norm": 0.05852407217025757, "learning_rate": 2.6855411013042054e-06, "loss": 0.0012, "num_input_tokens_seen": 161639376, "step": 74965 }, { "epoch": 13.758487795925857, "grad_norm": 0.005813386291265488, "learning_rate": 2.684831328418006e-06, "loss": 0.2097, "num_input_tokens_seen": 161650736, "step": 74970 }, { "epoch": 13.75940539548541, "grad_norm": 4.4846086502075195, "learning_rate": 2.6841216149122953e-06, "loss": 0.0006, "num_input_tokens_seen": 161661232, "step": 74975 }, { "epoch": 13.760322995044962, "grad_norm": 10.391188621520996, "learning_rate": 2.683411960805273e-06, "loss": 0.0012, "num_input_tokens_seen": 161672368, "step": 74980 }, { "epoch": 13.761240594604514, "grad_norm": 0.009609201923012733, "learning_rate": 2.682702366115146e-06, "loss": 0.0002, "num_input_tokens_seen": 161681360, "step": 74985 }, { "epoch": 13.762158194164067, "grad_norm": 50.86515808105469, "learning_rate": 2.6819928308601123e-06, "loss": 0.0702, "num_input_tokens_seen": 161691568, "step": 74990 }, { "epoch": 13.763075793723619, "grad_norm": 0.9572790861129761, "learning_rate": 2.6812833550583694e-06, "loss": 0.0004, "num_input_tokens_seen": 161702064, "step": 74995 }, { "epoch": 13.76399339328317, "grad_norm": 0.03613198176026344, "learning_rate": 2.680573938728113e-06, "loss": 0.0097, "num_input_tokens_seen": 161712784, "step": 75000 }, { "epoch": 13.764910992842724, "grad_norm": 3.2305080890655518, "learning_rate": 2.6798645818875424e-06, "loss": 0.2695, "num_input_tokens_seen": 161723216, "step": 75005 }, { "epoch": 13.765828592402276, "grad_norm": 0.04706944525241852, "learning_rate": 2.6791552845548486e-06, "loss": 0.3267, "num_input_tokens_seen": 161733680, "step": 75010 }, { "epoch": 13.766746191961827, "grad_norm": 0.08163053542375565, "learning_rate": 2.678446046748223e-06, "loss": 0.5428, "num_input_tokens_seen": 161743600, "step": 75015 }, { "epoch": 13.76766379152138, "grad_norm": 1.4211452007293701, "learning_rate": 2.6777368684858608e-06, "loss": 0.0018, "num_input_tokens_seen": 161754608, "step": 75020 }, { "epoch": 13.768581391080932, "grad_norm": 0.07202304154634476, "learning_rate": 2.677027749785949e-06, "loss": 0.1627, "num_input_tokens_seen": 161764624, "step": 75025 }, { "epoch": 13.769498990640484, "grad_norm": 0.19066913425922394, "learning_rate": 2.676318690666672e-06, "loss": 0.0072, "num_input_tokens_seen": 161777200, "step": 75030 }, { "epoch": 13.770416590200037, "grad_norm": 0.033819712698459625, "learning_rate": 2.6756096911462216e-06, "loss": 0.0001, "num_input_tokens_seen": 161788080, "step": 75035 }, { "epoch": 13.771334189759589, "grad_norm": 0.16626514494419098, "learning_rate": 2.6749007512427807e-06, "loss": 0.0004, "num_input_tokens_seen": 161799600, "step": 75040 }, { "epoch": 13.77225178931914, "grad_norm": 0.005521997809410095, "learning_rate": 2.6741918709745314e-06, "loss": 0.0041, "num_input_tokens_seen": 161809552, "step": 75045 }, { "epoch": 13.773169388878694, "grad_norm": 78.7878189086914, "learning_rate": 2.6734830503596545e-06, "loss": 0.2844, "num_input_tokens_seen": 161820496, "step": 75050 }, { "epoch": 13.774086988438246, "grad_norm": 1.6108617782592773, "learning_rate": 2.6727742894163326e-06, "loss": 0.1971, "num_input_tokens_seen": 161831216, "step": 75055 }, { "epoch": 13.775004587997797, "grad_norm": 0.0029937594663351774, "learning_rate": 2.6720655881627437e-06, "loss": 0.0006, "num_input_tokens_seen": 161843696, "step": 75060 }, { "epoch": 13.77592218755735, "grad_norm": 0.006188991479575634, "learning_rate": 2.671356946617063e-06, "loss": 0.1192, "num_input_tokens_seen": 161853552, "step": 75065 }, { "epoch": 13.776839787116902, "grad_norm": 0.03787751495838165, "learning_rate": 2.6706483647974692e-06, "loss": 0.2289, "num_input_tokens_seen": 161864816, "step": 75070 }, { "epoch": 13.777757386676454, "grad_norm": 8.751214027404785, "learning_rate": 2.6699398427221345e-06, "loss": 0.1373, "num_input_tokens_seen": 161876272, "step": 75075 }, { "epoch": 13.778674986236007, "grad_norm": 0.00456229317933321, "learning_rate": 2.6692313804092297e-06, "loss": 0.0004, "num_input_tokens_seen": 161887984, "step": 75080 }, { "epoch": 13.779592585795559, "grad_norm": 0.022292857989668846, "learning_rate": 2.6685229778769296e-06, "loss": 0.1629, "num_input_tokens_seen": 161897808, "step": 75085 }, { "epoch": 13.78051018535511, "grad_norm": 0.02794255129992962, "learning_rate": 2.667814635143402e-06, "loss": 0.2589, "num_input_tokens_seen": 161909680, "step": 75090 }, { "epoch": 13.781427784914664, "grad_norm": 31.02202033996582, "learning_rate": 2.6671063522268143e-06, "loss": 0.2458, "num_input_tokens_seen": 161920688, "step": 75095 }, { "epoch": 13.782345384474215, "grad_norm": 0.012974433600902557, "learning_rate": 2.666398129145333e-06, "loss": 0.0704, "num_input_tokens_seen": 161931824, "step": 75100 }, { "epoch": 13.783262984033767, "grad_norm": 0.057803135365247726, "learning_rate": 2.6656899659171225e-06, "loss": 0.0001, "num_input_tokens_seen": 161942736, "step": 75105 }, { "epoch": 13.78418058359332, "grad_norm": 1.4875701665878296, "learning_rate": 2.6649818625603453e-06, "loss": 0.7226, "num_input_tokens_seen": 161952944, "step": 75110 }, { "epoch": 13.785098183152872, "grad_norm": 0.02037862129509449, "learning_rate": 2.6642738190931656e-06, "loss": 0.0014, "num_input_tokens_seen": 161964112, "step": 75115 }, { "epoch": 13.786015782712424, "grad_norm": 0.011561770923435688, "learning_rate": 2.663565835533742e-06, "loss": 0.0039, "num_input_tokens_seen": 161975920, "step": 75120 }, { "epoch": 13.786933382271977, "grad_norm": 0.014616825617849827, "learning_rate": 2.662857911900235e-06, "loss": 0.2158, "num_input_tokens_seen": 161986864, "step": 75125 }, { "epoch": 13.787850981831529, "grad_norm": 0.0037509952671825886, "learning_rate": 2.6621500482108e-06, "loss": 0.0148, "num_input_tokens_seen": 161997072, "step": 75130 }, { "epoch": 13.78876858139108, "grad_norm": 0.18113097548484802, "learning_rate": 2.6614422444835897e-06, "loss": 0.0002, "num_input_tokens_seen": 162007600, "step": 75135 }, { "epoch": 13.789686180950634, "grad_norm": 5.209569931030273, "learning_rate": 2.6607345007367645e-06, "loss": 0.0025, "num_input_tokens_seen": 162017072, "step": 75140 }, { "epoch": 13.790603780510185, "grad_norm": 0.09189028292894363, "learning_rate": 2.6600268169884737e-06, "loss": 0.2284, "num_input_tokens_seen": 162028048, "step": 75145 }, { "epoch": 13.791521380069737, "grad_norm": 0.005045147147029638, "learning_rate": 2.6593191932568663e-06, "loss": 0.0538, "num_input_tokens_seen": 162038992, "step": 75150 }, { "epoch": 13.79243897962929, "grad_norm": 74.09202575683594, "learning_rate": 2.6586116295600963e-06, "loss": 0.2213, "num_input_tokens_seen": 162048432, "step": 75155 }, { "epoch": 13.793356579188842, "grad_norm": 0.08892941474914551, "learning_rate": 2.657904125916308e-06, "loss": 0.0014, "num_input_tokens_seen": 162058544, "step": 75160 }, { "epoch": 13.794274178748394, "grad_norm": 0.22497588396072388, "learning_rate": 2.657196682343648e-06, "loss": 0.1568, "num_input_tokens_seen": 162070608, "step": 75165 }, { "epoch": 13.795191778307947, "grad_norm": 0.26574409008026123, "learning_rate": 2.6564892988602634e-06, "loss": 0.2381, "num_input_tokens_seen": 162081648, "step": 75170 }, { "epoch": 13.796109377867499, "grad_norm": 0.1378600150346756, "learning_rate": 2.6557819754842966e-06, "loss": 0.3398, "num_input_tokens_seen": 162093104, "step": 75175 }, { "epoch": 13.79702697742705, "grad_norm": 280.0533752441406, "learning_rate": 2.6550747122338886e-06, "loss": 0.0394, "num_input_tokens_seen": 162103120, "step": 75180 }, { "epoch": 13.797944576986604, "grad_norm": 0.02135927602648735, "learning_rate": 2.654367509127178e-06, "loss": 0.0176, "num_input_tokens_seen": 162114384, "step": 75185 }, { "epoch": 13.798862176546155, "grad_norm": 245.05259704589844, "learning_rate": 2.653660366182308e-06, "loss": 0.209, "num_input_tokens_seen": 162124464, "step": 75190 }, { "epoch": 13.799779776105707, "grad_norm": 30.229520797729492, "learning_rate": 2.6529532834174126e-06, "loss": 0.4982, "num_input_tokens_seen": 162135536, "step": 75195 }, { "epoch": 13.80069737566526, "grad_norm": 0.02443739026784897, "learning_rate": 2.652246260850626e-06, "loss": 0.0002, "num_input_tokens_seen": 162146192, "step": 75200 }, { "epoch": 13.801614975224812, "grad_norm": 0.022642867639660835, "learning_rate": 2.651539298500086e-06, "loss": 0.0905, "num_input_tokens_seen": 162157968, "step": 75205 }, { "epoch": 13.802532574784363, "grad_norm": 2.8440282344818115, "learning_rate": 2.6508323963839235e-06, "loss": 0.0008, "num_input_tokens_seen": 162168112, "step": 75210 }, { "epoch": 13.803450174343917, "grad_norm": 0.12594251334667206, "learning_rate": 2.6501255545202663e-06, "loss": 0.1878, "num_input_tokens_seen": 162180016, "step": 75215 }, { "epoch": 13.804367773903468, "grad_norm": 0.32158875465393066, "learning_rate": 2.64941877292725e-06, "loss": 0.2744, "num_input_tokens_seen": 162190256, "step": 75220 }, { "epoch": 13.80528537346302, "grad_norm": 0.29332655668258667, "learning_rate": 2.648712051622998e-06, "loss": 0.2712, "num_input_tokens_seen": 162199568, "step": 75225 }, { "epoch": 13.806202973022573, "grad_norm": 53.80107116699219, "learning_rate": 2.648005390625638e-06, "loss": 0.1836, "num_input_tokens_seen": 162210416, "step": 75230 }, { "epoch": 13.807120572582125, "grad_norm": 14.232495307922363, "learning_rate": 2.647298789953293e-06, "loss": 0.0015, "num_input_tokens_seen": 162220464, "step": 75235 }, { "epoch": 13.808038172141677, "grad_norm": 102.80398559570312, "learning_rate": 2.646592249624089e-06, "loss": 0.0255, "num_input_tokens_seen": 162230928, "step": 75240 }, { "epoch": 13.80895577170123, "grad_norm": 1.914100170135498, "learning_rate": 2.6458857696561468e-06, "loss": 0.0493, "num_input_tokens_seen": 162241616, "step": 75245 }, { "epoch": 13.809873371260782, "grad_norm": 1.0837748050689697, "learning_rate": 2.645179350067584e-06, "loss": 0.0916, "num_input_tokens_seen": 162251952, "step": 75250 }, { "epoch": 13.810790970820333, "grad_norm": 0.19486857950687408, "learning_rate": 2.6444729908765227e-06, "loss": 0.3162, "num_input_tokens_seen": 162263856, "step": 75255 }, { "epoch": 13.811708570379887, "grad_norm": 63.360626220703125, "learning_rate": 2.6437666921010784e-06, "loss": 0.2288, "num_input_tokens_seen": 162275248, "step": 75260 }, { "epoch": 13.812626169939438, "grad_norm": 0.4931160807609558, "learning_rate": 2.6430604537593673e-06, "loss": 0.0177, "num_input_tokens_seen": 162286096, "step": 75265 }, { "epoch": 13.81354376949899, "grad_norm": 0.029475264251232147, "learning_rate": 2.642354275869501e-06, "loss": 0.0001, "num_input_tokens_seen": 162296784, "step": 75270 }, { "epoch": 13.814461369058543, "grad_norm": 1.5926692485809326, "learning_rate": 2.6416481584495947e-06, "loss": 0.1009, "num_input_tokens_seen": 162306256, "step": 75275 }, { "epoch": 13.815378968618095, "grad_norm": 0.025759676471352577, "learning_rate": 2.6409421015177583e-06, "loss": 0.048, "num_input_tokens_seen": 162317200, "step": 75280 }, { "epoch": 13.816296568177647, "grad_norm": 0.2375672161579132, "learning_rate": 2.640236105092099e-06, "loss": 0.008, "num_input_tokens_seen": 162327920, "step": 75285 }, { "epoch": 13.8172141677372, "grad_norm": 0.021847551688551903, "learning_rate": 2.639530169190727e-06, "loss": 0.0034, "num_input_tokens_seen": 162338512, "step": 75290 }, { "epoch": 13.818131767296752, "grad_norm": 756.910400390625, "learning_rate": 2.6388242938317486e-06, "loss": 0.0707, "num_input_tokens_seen": 162348880, "step": 75295 }, { "epoch": 13.819049366856303, "grad_norm": 0.010433492250740528, "learning_rate": 2.638118479033268e-06, "loss": 0.0008, "num_input_tokens_seen": 162359760, "step": 75300 }, { "epoch": 13.819966966415857, "grad_norm": 0.010716591961681843, "learning_rate": 2.6374127248133858e-06, "loss": 0.0007, "num_input_tokens_seen": 162370896, "step": 75305 }, { "epoch": 13.820884565975408, "grad_norm": 0.32136449217796326, "learning_rate": 2.6367070311902075e-06, "loss": 0.322, "num_input_tokens_seen": 162381648, "step": 75310 }, { "epoch": 13.82180216553496, "grad_norm": 0.005038425792008638, "learning_rate": 2.636001398181831e-06, "loss": 0.0169, "num_input_tokens_seen": 162393136, "step": 75315 }, { "epoch": 13.822719765094513, "grad_norm": 0.18287217617034912, "learning_rate": 2.635295825806354e-06, "loss": 0.0002, "num_input_tokens_seen": 162404112, "step": 75320 }, { "epoch": 13.823637364654065, "grad_norm": 0.07595407962799072, "learning_rate": 2.634590314081875e-06, "loss": 0.0003, "num_input_tokens_seen": 162414736, "step": 75325 }, { "epoch": 13.824554964213617, "grad_norm": 0.024738037958741188, "learning_rate": 2.6338848630264864e-06, "loss": 0.011, "num_input_tokens_seen": 162424688, "step": 75330 }, { "epoch": 13.82547256377317, "grad_norm": 0.018728645518422127, "learning_rate": 2.6331794726582853e-06, "loss": 0.1136, "num_input_tokens_seen": 162435664, "step": 75335 }, { "epoch": 13.826390163332722, "grad_norm": 146.15155029296875, "learning_rate": 2.6324741429953626e-06, "loss": 0.1817, "num_input_tokens_seen": 162446320, "step": 75340 }, { "epoch": 13.827307762892273, "grad_norm": 0.018749365583062172, "learning_rate": 2.6317688740558096e-06, "loss": 0.0002, "num_input_tokens_seen": 162457264, "step": 75345 }, { "epoch": 13.828225362451827, "grad_norm": 0.004237958695739508, "learning_rate": 2.6310636658577114e-06, "loss": 0.0004, "num_input_tokens_seen": 162468752, "step": 75350 }, { "epoch": 13.829142962011378, "grad_norm": 0.5019974708557129, "learning_rate": 2.6303585184191614e-06, "loss": 0.113, "num_input_tokens_seen": 162479568, "step": 75355 }, { "epoch": 13.83006056157093, "grad_norm": 79.69256591796875, "learning_rate": 2.629653431758243e-06, "loss": 0.1181, "num_input_tokens_seen": 162490032, "step": 75360 }, { "epoch": 13.830978161130483, "grad_norm": 0.009616076946258545, "learning_rate": 2.6289484058930405e-06, "loss": 0.0005, "num_input_tokens_seen": 162499952, "step": 75365 }, { "epoch": 13.831895760690035, "grad_norm": 0.021591931581497192, "learning_rate": 2.6282434408416337e-06, "loss": 0.0008, "num_input_tokens_seen": 162510512, "step": 75370 }, { "epoch": 13.832813360249586, "grad_norm": 0.1047624796628952, "learning_rate": 2.627538536622109e-06, "loss": 0.0148, "num_input_tokens_seen": 162522128, "step": 75375 }, { "epoch": 13.83373095980914, "grad_norm": 3.3297271728515625, "learning_rate": 2.626833693252544e-06, "loss": 0.0008, "num_input_tokens_seen": 162533008, "step": 75380 }, { "epoch": 13.834648559368691, "grad_norm": 0.047127604484558105, "learning_rate": 2.6261289107510148e-06, "loss": 0.0002, "num_input_tokens_seen": 162543184, "step": 75385 }, { "epoch": 13.835566158928243, "grad_norm": 0.005986652337014675, "learning_rate": 2.6254241891356014e-06, "loss": 0.0012, "num_input_tokens_seen": 162554864, "step": 75390 }, { "epoch": 13.836483758487796, "grad_norm": 0.03827937692403793, "learning_rate": 2.6247195284243776e-06, "loss": 0.5004, "num_input_tokens_seen": 162565968, "step": 75395 }, { "epoch": 13.837401358047348, "grad_norm": 0.0168682262301445, "learning_rate": 2.6240149286354167e-06, "loss": 0.0006, "num_input_tokens_seen": 162576144, "step": 75400 }, { "epoch": 13.8383189576069, "grad_norm": 1.9534127712249756, "learning_rate": 2.6233103897867884e-06, "loss": 0.1792, "num_input_tokens_seen": 162587376, "step": 75405 }, { "epoch": 13.839236557166453, "grad_norm": 0.056966427713632584, "learning_rate": 2.6226059118965675e-06, "loss": 0.7441, "num_input_tokens_seen": 162597424, "step": 75410 }, { "epoch": 13.840154156726005, "grad_norm": 1.0760531425476074, "learning_rate": 2.62190149498282e-06, "loss": 0.0972, "num_input_tokens_seen": 162608112, "step": 75415 }, { "epoch": 13.841071756285556, "grad_norm": 0.20093056559562683, "learning_rate": 2.621197139063611e-06, "loss": 0.0044, "num_input_tokens_seen": 162618000, "step": 75420 }, { "epoch": 13.84198935584511, "grad_norm": 0.027865761891007423, "learning_rate": 2.620492844157011e-06, "loss": 0.2044, "num_input_tokens_seen": 162628080, "step": 75425 }, { "epoch": 13.842906955404661, "grad_norm": 0.011194982565939426, "learning_rate": 2.619788610281081e-06, "loss": 0.118, "num_input_tokens_seen": 162639664, "step": 75430 }, { "epoch": 13.843824554964213, "grad_norm": 0.003828317392617464, "learning_rate": 2.619084437453883e-06, "loss": 0.0001, "num_input_tokens_seen": 162649040, "step": 75435 }, { "epoch": 13.844742154523766, "grad_norm": 0.0169492419809103, "learning_rate": 2.61838032569348e-06, "loss": 0.0026, "num_input_tokens_seen": 162660496, "step": 75440 }, { "epoch": 13.845659754083318, "grad_norm": 0.0037414864636957645, "learning_rate": 2.617676275017932e-06, "loss": 0.0057, "num_input_tokens_seen": 162669200, "step": 75445 }, { "epoch": 13.84657735364287, "grad_norm": 0.029815392568707466, "learning_rate": 2.6169722854452944e-06, "loss": 0.0001, "num_input_tokens_seen": 162679888, "step": 75450 }, { "epoch": 13.847494953202423, "grad_norm": 0.009231084026396275, "learning_rate": 2.6162683569936224e-06, "loss": 0.0003, "num_input_tokens_seen": 162691248, "step": 75455 }, { "epoch": 13.848412552761975, "grad_norm": 0.008051702752709389, "learning_rate": 2.6155644896809745e-06, "loss": 0.1217, "num_input_tokens_seen": 162702768, "step": 75460 }, { "epoch": 13.849330152321526, "grad_norm": 0.002349613467231393, "learning_rate": 2.614860683525402e-06, "loss": 0.0021, "num_input_tokens_seen": 162713744, "step": 75465 }, { "epoch": 13.85024775188108, "grad_norm": 527.2653198242188, "learning_rate": 2.6141569385449545e-06, "loss": 0.9066, "num_input_tokens_seen": 162725200, "step": 75470 }, { "epoch": 13.851165351440631, "grad_norm": 0.011361189186573029, "learning_rate": 2.613453254757686e-06, "loss": 0.0001, "num_input_tokens_seen": 162736080, "step": 75475 }, { "epoch": 13.852082951000183, "grad_norm": 0.12214748561382294, "learning_rate": 2.612749632181642e-06, "loss": 0.0009, "num_input_tokens_seen": 162747888, "step": 75480 }, { "epoch": 13.853000550559736, "grad_norm": 0.08127538859844208, "learning_rate": 2.6120460708348685e-06, "loss": 0.0089, "num_input_tokens_seen": 162758512, "step": 75485 }, { "epoch": 13.853918150119288, "grad_norm": 0.01631391979753971, "learning_rate": 2.6113425707354147e-06, "loss": 0.0376, "num_input_tokens_seen": 162767440, "step": 75490 }, { "epoch": 13.85483574967884, "grad_norm": 48.903831481933594, "learning_rate": 2.6106391319013208e-06, "loss": 0.511, "num_input_tokens_seen": 162778928, "step": 75495 }, { "epoch": 13.855753349238393, "grad_norm": 0.007584002800285816, "learning_rate": 2.6099357543506302e-06, "loss": 0.0004, "num_input_tokens_seen": 162789584, "step": 75500 }, { "epoch": 13.856670948797944, "grad_norm": 110.47372436523438, "learning_rate": 2.6092324381013823e-06, "loss": 0.3657, "num_input_tokens_seen": 162801552, "step": 75505 }, { "epoch": 13.857588548357496, "grad_norm": 79.41256713867188, "learning_rate": 2.6085291831716175e-06, "loss": 0.2754, "num_input_tokens_seen": 162811728, "step": 75510 }, { "epoch": 13.85850614791705, "grad_norm": 0.005881652235984802, "learning_rate": 2.607825989579374e-06, "loss": 0.1439, "num_input_tokens_seen": 162821552, "step": 75515 }, { "epoch": 13.859423747476601, "grad_norm": 0.03604218363761902, "learning_rate": 2.6071228573426856e-06, "loss": 0.0011, "num_input_tokens_seen": 162832752, "step": 75520 }, { "epoch": 13.860341347036153, "grad_norm": 43.68164825439453, "learning_rate": 2.606419786479586e-06, "loss": 0.1578, "num_input_tokens_seen": 162842128, "step": 75525 }, { "epoch": 13.861258946595706, "grad_norm": 9.09618091583252, "learning_rate": 2.6057167770081104e-06, "loss": 0.0122, "num_input_tokens_seen": 162854224, "step": 75530 }, { "epoch": 13.862176546155258, "grad_norm": 0.014984115958213806, "learning_rate": 2.605013828946289e-06, "loss": 0.0002, "num_input_tokens_seen": 162864720, "step": 75535 }, { "epoch": 13.86309414571481, "grad_norm": 0.01381913386285305, "learning_rate": 2.6043109423121506e-06, "loss": 0.0026, "num_input_tokens_seen": 162876080, "step": 75540 }, { "epoch": 13.864011745274363, "grad_norm": 0.017635153606534004, "learning_rate": 2.6036081171237236e-06, "loss": 0.0161, "num_input_tokens_seen": 162885968, "step": 75545 }, { "epoch": 13.864929344833914, "grad_norm": 19.36491584777832, "learning_rate": 2.6029053533990333e-06, "loss": 0.2361, "num_input_tokens_seen": 162897360, "step": 75550 }, { "epoch": 13.865846944393466, "grad_norm": 0.189658060669899, "learning_rate": 2.6022026511561067e-06, "loss": 0.1244, "num_input_tokens_seen": 162908368, "step": 75555 }, { "epoch": 13.86676454395302, "grad_norm": 0.008813321590423584, "learning_rate": 2.601500010412966e-06, "loss": 0.1512, "num_input_tokens_seen": 162919632, "step": 75560 }, { "epoch": 13.867682143512571, "grad_norm": 0.021220725029706955, "learning_rate": 2.600797431187633e-06, "loss": 0.1257, "num_input_tokens_seen": 162930320, "step": 75565 }, { "epoch": 13.868599743072123, "grad_norm": 45.37392807006836, "learning_rate": 2.600094913498125e-06, "loss": 0.1291, "num_input_tokens_seen": 162942800, "step": 75570 }, { "epoch": 13.869517342631676, "grad_norm": 0.06409085541963577, "learning_rate": 2.599392457362465e-06, "loss": 0.0004, "num_input_tokens_seen": 162952880, "step": 75575 }, { "epoch": 13.870434942191228, "grad_norm": 0.30331048369407654, "learning_rate": 2.5986900627986677e-06, "loss": 0.0828, "num_input_tokens_seen": 162964560, "step": 75580 }, { "epoch": 13.87135254175078, "grad_norm": 0.9144293665885925, "learning_rate": 2.597987729824749e-06, "loss": 0.101, "num_input_tokens_seen": 162976048, "step": 75585 }, { "epoch": 13.872270141310333, "grad_norm": 48.85430145263672, "learning_rate": 2.5972854584587205e-06, "loss": 0.2266, "num_input_tokens_seen": 162987440, "step": 75590 }, { "epoch": 13.873187740869884, "grad_norm": 0.20705361664295197, "learning_rate": 2.596583248718597e-06, "loss": 0.0043, "num_input_tokens_seen": 162997680, "step": 75595 }, { "epoch": 13.874105340429436, "grad_norm": 237.56454467773438, "learning_rate": 2.5958811006223893e-06, "loss": 0.1081, "num_input_tokens_seen": 163010032, "step": 75600 }, { "epoch": 13.87502293998899, "grad_norm": 0.004946700297296047, "learning_rate": 2.5951790141881028e-06, "loss": 0.0704, "num_input_tokens_seen": 163019760, "step": 75605 }, { "epoch": 13.875940539548541, "grad_norm": 6.385077953338623, "learning_rate": 2.5944769894337496e-06, "loss": 0.0181, "num_input_tokens_seen": 163031664, "step": 75610 }, { "epoch": 13.876858139108093, "grad_norm": 0.061719149351119995, "learning_rate": 2.5937750263773336e-06, "loss": 0.0067, "num_input_tokens_seen": 163040272, "step": 75615 }, { "epoch": 13.877775738667646, "grad_norm": 1.0647677183151245, "learning_rate": 2.593073125036857e-06, "loss": 0.0005, "num_input_tokens_seen": 163049872, "step": 75620 }, { "epoch": 13.878693338227198, "grad_norm": 0.033961132168769836, "learning_rate": 2.5923712854303256e-06, "loss": 0.1193, "num_input_tokens_seen": 163060464, "step": 75625 }, { "epoch": 13.87961093778675, "grad_norm": 0.0009883990278467536, "learning_rate": 2.59166950757574e-06, "loss": 0.0001, "num_input_tokens_seen": 163070704, "step": 75630 }, { "epoch": 13.880528537346303, "grad_norm": 29.34687614440918, "learning_rate": 2.5909677914910987e-06, "loss": 0.0121, "num_input_tokens_seen": 163082256, "step": 75635 }, { "epoch": 13.881446136905854, "grad_norm": 111.75049591064453, "learning_rate": 2.5902661371943977e-06, "loss": 0.4874, "num_input_tokens_seen": 163093648, "step": 75640 }, { "epoch": 13.882363736465406, "grad_norm": 0.03730550408363342, "learning_rate": 2.5895645447036378e-06, "loss": 0.0287, "num_input_tokens_seen": 163103856, "step": 75645 }, { "epoch": 13.88328133602496, "grad_norm": 0.04919790104031563, "learning_rate": 2.588863014036811e-06, "loss": 0.0001, "num_input_tokens_seen": 163114320, "step": 75650 }, { "epoch": 13.88419893558451, "grad_norm": 47.31605529785156, "learning_rate": 2.5881615452119092e-06, "loss": 0.0057, "num_input_tokens_seen": 163124144, "step": 75655 }, { "epoch": 13.885116535144062, "grad_norm": 0.04095921665430069, "learning_rate": 2.5874601382469277e-06, "loss": 0.0004, "num_input_tokens_seen": 163134864, "step": 75660 }, { "epoch": 13.886034134703616, "grad_norm": 34.06021499633789, "learning_rate": 2.586758793159855e-06, "loss": 0.1311, "num_input_tokens_seen": 163146448, "step": 75665 }, { "epoch": 13.886951734263167, "grad_norm": 0.027546720579266548, "learning_rate": 2.586057509968677e-06, "loss": 0.0594, "num_input_tokens_seen": 163157712, "step": 75670 }, { "epoch": 13.887869333822719, "grad_norm": 6.1252617835998535, "learning_rate": 2.585356288691384e-06, "loss": 0.0013, "num_input_tokens_seen": 163168944, "step": 75675 }, { "epoch": 13.888786933382272, "grad_norm": 0.03728350251913071, "learning_rate": 2.58465512934596e-06, "loss": 0.129, "num_input_tokens_seen": 163179152, "step": 75680 }, { "epoch": 13.889704532941824, "grad_norm": 45.283634185791016, "learning_rate": 2.583954031950389e-06, "loss": 0.4644, "num_input_tokens_seen": 163190000, "step": 75685 }, { "epoch": 13.890622132501376, "grad_norm": 42.44152069091797, "learning_rate": 2.5832529965226503e-06, "loss": 0.1402, "num_input_tokens_seen": 163200336, "step": 75690 }, { "epoch": 13.891539732060929, "grad_norm": 0.037011146545410156, "learning_rate": 2.5825520230807288e-06, "loss": 0.0909, "num_input_tokens_seen": 163210000, "step": 75695 }, { "epoch": 13.89245733162048, "grad_norm": 343.163330078125, "learning_rate": 2.581851111642601e-06, "loss": 0.2569, "num_input_tokens_seen": 163219728, "step": 75700 }, { "epoch": 13.893374931180032, "grad_norm": 0.11624059826135635, "learning_rate": 2.581150262226242e-06, "loss": 0.0004, "num_input_tokens_seen": 163230128, "step": 75705 }, { "epoch": 13.894292530739586, "grad_norm": 0.013498655520379543, "learning_rate": 2.580449474849632e-06, "loss": 0.08, "num_input_tokens_seen": 163240464, "step": 75710 }, { "epoch": 13.895210130299137, "grad_norm": 604.9588623046875, "learning_rate": 2.579748749530744e-06, "loss": 0.1444, "num_input_tokens_seen": 163251312, "step": 75715 }, { "epoch": 13.896127729858689, "grad_norm": 0.16878512501716614, "learning_rate": 2.579048086287549e-06, "loss": 0.2098, "num_input_tokens_seen": 163261712, "step": 75720 }, { "epoch": 13.897045329418242, "grad_norm": 0.005665292497724295, "learning_rate": 2.5783474851380157e-06, "loss": 0.0377, "num_input_tokens_seen": 163272272, "step": 75725 }, { "epoch": 13.897962928977794, "grad_norm": 3.9459567070007324, "learning_rate": 2.5776469461001184e-06, "loss": 0.0802, "num_input_tokens_seen": 163283056, "step": 75730 }, { "epoch": 13.898880528537346, "grad_norm": 0.7938507795333862, "learning_rate": 2.5769464691918235e-06, "loss": 0.1927, "num_input_tokens_seen": 163294096, "step": 75735 }, { "epoch": 13.899798128096899, "grad_norm": 0.1493629813194275, "learning_rate": 2.5762460544310957e-06, "loss": 0.006, "num_input_tokens_seen": 163304976, "step": 75740 }, { "epoch": 13.90071572765645, "grad_norm": 0.025614267215132713, "learning_rate": 2.575545701835898e-06, "loss": 0.0066, "num_input_tokens_seen": 163315216, "step": 75745 }, { "epoch": 13.901633327216002, "grad_norm": 0.10740624368190765, "learning_rate": 2.574845411424198e-06, "loss": 0.3502, "num_input_tokens_seen": 163325680, "step": 75750 }, { "epoch": 13.902550926775556, "grad_norm": 0.006579930894076824, "learning_rate": 2.5741451832139543e-06, "loss": 0.2114, "num_input_tokens_seen": 163336528, "step": 75755 }, { "epoch": 13.903468526335107, "grad_norm": 0.2014276534318924, "learning_rate": 2.573445017223126e-06, "loss": 0.0148, "num_input_tokens_seen": 163347216, "step": 75760 }, { "epoch": 13.904386125894659, "grad_norm": 0.11428835242986679, "learning_rate": 2.5727449134696736e-06, "loss": 0.0299, "num_input_tokens_seen": 163358960, "step": 75765 }, { "epoch": 13.905303725454212, "grad_norm": 0.03961007669568062, "learning_rate": 2.5720448719715497e-06, "loss": 0.0247, "num_input_tokens_seen": 163370512, "step": 75770 }, { "epoch": 13.906221325013764, "grad_norm": 34.34634780883789, "learning_rate": 2.5713448927467134e-06, "loss": 0.1898, "num_input_tokens_seen": 163381680, "step": 75775 }, { "epoch": 13.907138924573315, "grad_norm": 0.0066383336670696735, "learning_rate": 2.570644975813117e-06, "loss": 0.0994, "num_input_tokens_seen": 163391792, "step": 75780 }, { "epoch": 13.908056524132869, "grad_norm": 0.2310454398393631, "learning_rate": 2.5699451211887116e-06, "loss": 0.0733, "num_input_tokens_seen": 163403728, "step": 75785 }, { "epoch": 13.90897412369242, "grad_norm": 0.02096412144601345, "learning_rate": 2.569245328891446e-06, "loss": 0.1537, "num_input_tokens_seen": 163414256, "step": 75790 }, { "epoch": 13.909891723251972, "grad_norm": 137.3048858642578, "learning_rate": 2.568545598939272e-06, "loss": 0.1664, "num_input_tokens_seen": 163425712, "step": 75795 }, { "epoch": 13.910809322811525, "grad_norm": 0.08575338870286942, "learning_rate": 2.567845931350135e-06, "loss": 0.0016, "num_input_tokens_seen": 163435376, "step": 75800 }, { "epoch": 13.911726922371077, "grad_norm": 0.0052848574705421925, "learning_rate": 2.567146326141979e-06, "loss": 0.1224, "num_input_tokens_seen": 163446096, "step": 75805 }, { "epoch": 13.912644521930629, "grad_norm": 0.049070242792367935, "learning_rate": 2.5664467833327498e-06, "loss": 0.0767, "num_input_tokens_seen": 163456432, "step": 75810 }, { "epoch": 13.913562121490182, "grad_norm": 0.04276615008711815, "learning_rate": 2.56574730294039e-06, "loss": 0.0868, "num_input_tokens_seen": 163465936, "step": 75815 }, { "epoch": 13.914479721049734, "grad_norm": 16.4774112701416, "learning_rate": 2.565047884982839e-06, "loss": 0.1849, "num_input_tokens_seen": 163477040, "step": 75820 }, { "epoch": 13.915397320609285, "grad_norm": 0.265249639749527, "learning_rate": 2.564348529478034e-06, "loss": 0.1256, "num_input_tokens_seen": 163487632, "step": 75825 }, { "epoch": 13.916314920168839, "grad_norm": 0.010953733697533607, "learning_rate": 2.5636492364439158e-06, "loss": 0.0648, "num_input_tokens_seen": 163498800, "step": 75830 }, { "epoch": 13.91723251972839, "grad_norm": 0.09346669167280197, "learning_rate": 2.562950005898419e-06, "loss": 0.0003, "num_input_tokens_seen": 163509168, "step": 75835 }, { "epoch": 13.918150119287942, "grad_norm": 0.23754733800888062, "learning_rate": 2.5622508378594757e-06, "loss": 0.0009, "num_input_tokens_seen": 163519984, "step": 75840 }, { "epoch": 13.919067718847495, "grad_norm": 475.5549621582031, "learning_rate": 2.5615517323450223e-06, "loss": 0.0352, "num_input_tokens_seen": 163531536, "step": 75845 }, { "epoch": 13.919985318407047, "grad_norm": 124.9565658569336, "learning_rate": 2.560852689372987e-06, "loss": 0.0655, "num_input_tokens_seen": 163542544, "step": 75850 }, { "epoch": 13.920902917966599, "grad_norm": 120.48066711425781, "learning_rate": 2.5601537089613005e-06, "loss": 0.1193, "num_input_tokens_seen": 163553392, "step": 75855 }, { "epoch": 13.921820517526152, "grad_norm": 0.04474617540836334, "learning_rate": 2.559454791127888e-06, "loss": 0.0088, "num_input_tokens_seen": 163564080, "step": 75860 }, { "epoch": 13.922738117085704, "grad_norm": 0.04435448721051216, "learning_rate": 2.5587559358906788e-06, "loss": 0.0001, "num_input_tokens_seen": 163575760, "step": 75865 }, { "epoch": 13.923655716645255, "grad_norm": 0.1840486228466034, "learning_rate": 2.558057143267597e-06, "loss": 0.1536, "num_input_tokens_seen": 163586928, "step": 75870 }, { "epoch": 13.924573316204809, "grad_norm": 151.58389282226562, "learning_rate": 2.5573584132765627e-06, "loss": 0.071, "num_input_tokens_seen": 163597616, "step": 75875 }, { "epoch": 13.92549091576436, "grad_norm": 0.21599023044109344, "learning_rate": 2.5566597459355013e-06, "loss": 0.1752, "num_input_tokens_seen": 163607696, "step": 75880 }, { "epoch": 13.926408515323912, "grad_norm": 0.09299205243587494, "learning_rate": 2.555961141262331e-06, "loss": 0.1522, "num_input_tokens_seen": 163619472, "step": 75885 }, { "epoch": 13.927326114883465, "grad_norm": 0.3535163104534149, "learning_rate": 2.555262599274967e-06, "loss": 0.0019, "num_input_tokens_seen": 163630768, "step": 75890 }, { "epoch": 13.928243714443017, "grad_norm": 0.01737922988831997, "learning_rate": 2.5545641199913297e-06, "loss": 0.0003, "num_input_tokens_seen": 163640560, "step": 75895 }, { "epoch": 13.929161314002569, "grad_norm": 0.17192357778549194, "learning_rate": 2.5538657034293335e-06, "loss": 0.0955, "num_input_tokens_seen": 163650224, "step": 75900 }, { "epoch": 13.930078913562122, "grad_norm": 1.793269395828247, "learning_rate": 2.553167349606891e-06, "loss": 0.1297, "num_input_tokens_seen": 163661776, "step": 75905 }, { "epoch": 13.930996513121674, "grad_norm": 0.13001751899719238, "learning_rate": 2.552469058541911e-06, "loss": 0.3166, "num_input_tokens_seen": 163673072, "step": 75910 }, { "epoch": 13.931914112681225, "grad_norm": 6.174599647521973, "learning_rate": 2.5517708302523092e-06, "loss": 0.0017, "num_input_tokens_seen": 163683184, "step": 75915 }, { "epoch": 13.932831712240779, "grad_norm": 0.07930375635623932, "learning_rate": 2.5510726647559904e-06, "loss": 0.0008, "num_input_tokens_seen": 163694448, "step": 75920 }, { "epoch": 13.93374931180033, "grad_norm": 0.04445364698767662, "learning_rate": 2.5503745620708607e-06, "loss": 0.0006, "num_input_tokens_seen": 163706064, "step": 75925 }, { "epoch": 13.934666911359882, "grad_norm": 0.018818054348230362, "learning_rate": 2.549676522214829e-06, "loss": 0.0002, "num_input_tokens_seen": 163717552, "step": 75930 }, { "epoch": 13.935584510919435, "grad_norm": 0.06410573422908783, "learning_rate": 2.5489785452057965e-06, "loss": 0.0033, "num_input_tokens_seen": 163728048, "step": 75935 }, { "epoch": 13.936502110478987, "grad_norm": 0.014448117464780807, "learning_rate": 2.5482806310616635e-06, "loss": 0.0012, "num_input_tokens_seen": 163738672, "step": 75940 }, { "epoch": 13.937419710038538, "grad_norm": 0.004129481036216021, "learning_rate": 2.547582779800335e-06, "loss": 0.0001, "num_input_tokens_seen": 163750000, "step": 75945 }, { "epoch": 13.938337309598092, "grad_norm": 0.17273089289665222, "learning_rate": 2.5468849914397067e-06, "loss": 0.1526, "num_input_tokens_seen": 163760048, "step": 75950 }, { "epoch": 13.939254909157643, "grad_norm": 0.05870095267891884, "learning_rate": 2.5461872659976766e-06, "loss": 0.0002, "num_input_tokens_seen": 163770576, "step": 75955 }, { "epoch": 13.940172508717195, "grad_norm": 0.1155984178185463, "learning_rate": 2.5454896034921402e-06, "loss": 0.1589, "num_input_tokens_seen": 163782512, "step": 75960 }, { "epoch": 13.941090108276748, "grad_norm": 37.91824722290039, "learning_rate": 2.544792003940989e-06, "loss": 0.2626, "num_input_tokens_seen": 163793328, "step": 75965 }, { "epoch": 13.9420077078363, "grad_norm": 0.04407312348484993, "learning_rate": 2.5440944673621204e-06, "loss": 0.1385, "num_input_tokens_seen": 163803760, "step": 75970 }, { "epoch": 13.942925307395852, "grad_norm": 2.0943002700805664, "learning_rate": 2.5433969937734216e-06, "loss": 0.1107, "num_input_tokens_seen": 163814576, "step": 75975 }, { "epoch": 13.943842906955405, "grad_norm": 0.08998653292655945, "learning_rate": 2.5426995831927827e-06, "loss": 0.0007, "num_input_tokens_seen": 163825456, "step": 75980 }, { "epoch": 13.944760506514957, "grad_norm": 1.1469546556472778, "learning_rate": 2.5420022356380912e-06, "loss": 0.0041, "num_input_tokens_seen": 163837456, "step": 75985 }, { "epoch": 13.945678106074508, "grad_norm": 0.09524469077587128, "learning_rate": 2.5413049511272307e-06, "loss": 0.0015, "num_input_tokens_seen": 163848304, "step": 75990 }, { "epoch": 13.946595705634062, "grad_norm": 0.004563395399600267, "learning_rate": 2.5406077296780895e-06, "loss": 0.0053, "num_input_tokens_seen": 163859152, "step": 75995 }, { "epoch": 13.947513305193613, "grad_norm": 0.015542258508503437, "learning_rate": 2.5399105713085486e-06, "loss": 0.0095, "num_input_tokens_seen": 163870800, "step": 76000 }, { "epoch": 13.948430904753165, "grad_norm": 0.004797368310391903, "learning_rate": 2.539213476036489e-06, "loss": 0.2733, "num_input_tokens_seen": 163881392, "step": 76005 }, { "epoch": 13.949348504312718, "grad_norm": 0.08693448454141617, "learning_rate": 2.5385164438797872e-06, "loss": 0.157, "num_input_tokens_seen": 163893168, "step": 76010 }, { "epoch": 13.95026610387227, "grad_norm": 41.910099029541016, "learning_rate": 2.5378194748563264e-06, "loss": 0.1006, "num_input_tokens_seen": 163903760, "step": 76015 }, { "epoch": 13.951183703431822, "grad_norm": 0.002162761054933071, "learning_rate": 2.5371225689839795e-06, "loss": 0.0006, "num_input_tokens_seen": 163915504, "step": 76020 }, { "epoch": 13.952101302991375, "grad_norm": 26.72365379333496, "learning_rate": 2.536425726280619e-06, "loss": 0.1818, "num_input_tokens_seen": 163926512, "step": 76025 }, { "epoch": 13.953018902550927, "grad_norm": 0.023692451417446136, "learning_rate": 2.535728946764123e-06, "loss": 0.3034, "num_input_tokens_seen": 163937232, "step": 76030 }, { "epoch": 13.953936502110478, "grad_norm": 0.05257529392838478, "learning_rate": 2.535032230452361e-06, "loss": 0.1596, "num_input_tokens_seen": 163947056, "step": 76035 }, { "epoch": 13.954854101670032, "grad_norm": 0.2379581332206726, "learning_rate": 2.534335577363201e-06, "loss": 0.0002, "num_input_tokens_seen": 163959312, "step": 76040 }, { "epoch": 13.955771701229583, "grad_norm": 0.014063620939850807, "learning_rate": 2.5336389875145105e-06, "loss": 0.2884, "num_input_tokens_seen": 163970576, "step": 76045 }, { "epoch": 13.956689300789135, "grad_norm": 0.15840694308280945, "learning_rate": 2.5329424609241593e-06, "loss": 0.016, "num_input_tokens_seen": 163981264, "step": 76050 }, { "epoch": 13.957606900348688, "grad_norm": 40.92569351196289, "learning_rate": 2.53224599761001e-06, "loss": 0.1573, "num_input_tokens_seen": 163992624, "step": 76055 }, { "epoch": 13.95852449990824, "grad_norm": 0.34002622961997986, "learning_rate": 2.531549597589925e-06, "loss": 0.1853, "num_input_tokens_seen": 164004016, "step": 76060 }, { "epoch": 13.959442099467791, "grad_norm": 0.01756122149527073, "learning_rate": 2.530853260881768e-06, "loss": 0.144, "num_input_tokens_seen": 164015056, "step": 76065 }, { "epoch": 13.960359699027345, "grad_norm": 0.5083768367767334, "learning_rate": 2.530156987503399e-06, "loss": 0.0557, "num_input_tokens_seen": 164025840, "step": 76070 }, { "epoch": 13.961277298586896, "grad_norm": 0.06575220078229904, "learning_rate": 2.529460777472673e-06, "loss": 0.0706, "num_input_tokens_seen": 164037008, "step": 76075 }, { "epoch": 13.962194898146448, "grad_norm": 0.1233002096414566, "learning_rate": 2.5287646308074507e-06, "loss": 0.001, "num_input_tokens_seen": 164048112, "step": 76080 }, { "epoch": 13.963112497706001, "grad_norm": 24.00982666015625, "learning_rate": 2.528068547525586e-06, "loss": 0.1257, "num_input_tokens_seen": 164059408, "step": 76085 }, { "epoch": 13.964030097265553, "grad_norm": 0.020474784076213837, "learning_rate": 2.5273725276449323e-06, "loss": 0.081, "num_input_tokens_seen": 164070544, "step": 76090 }, { "epoch": 13.964947696825105, "grad_norm": 0.0017044671112671494, "learning_rate": 2.5266765711833387e-06, "loss": 0.0008, "num_input_tokens_seen": 164081808, "step": 76095 }, { "epoch": 13.965865296384658, "grad_norm": 50.068511962890625, "learning_rate": 2.5259806781586595e-06, "loss": 0.2258, "num_input_tokens_seen": 164093136, "step": 76100 }, { "epoch": 13.96678289594421, "grad_norm": 0.025397270917892456, "learning_rate": 2.5252848485887416e-06, "loss": 0.1253, "num_input_tokens_seen": 164103920, "step": 76105 }, { "epoch": 13.967700495503761, "grad_norm": 191.8981170654297, "learning_rate": 2.52458908249143e-06, "loss": 0.2229, "num_input_tokens_seen": 164114320, "step": 76110 }, { "epoch": 13.968618095063315, "grad_norm": 28.385576248168945, "learning_rate": 2.5238933798845733e-06, "loss": 0.1182, "num_input_tokens_seen": 164124272, "step": 76115 }, { "epoch": 13.969535694622866, "grad_norm": 0.00860531534999609, "learning_rate": 2.523197740786014e-06, "loss": 0.0597, "num_input_tokens_seen": 164135792, "step": 76120 }, { "epoch": 13.970453294182418, "grad_norm": 0.04098925366997719, "learning_rate": 2.522502165213593e-06, "loss": 0.1225, "num_input_tokens_seen": 164145424, "step": 76125 }, { "epoch": 13.971370893741971, "grad_norm": 0.010680628009140491, "learning_rate": 2.5218066531851525e-06, "loss": 0.0004, "num_input_tokens_seen": 164155376, "step": 76130 }, { "epoch": 13.972288493301523, "grad_norm": 601.9771118164062, "learning_rate": 2.521111204718531e-06, "loss": 0.32, "num_input_tokens_seen": 164166096, "step": 76135 }, { "epoch": 13.973206092861075, "grad_norm": 0.0026142706628888845, "learning_rate": 2.5204158198315652e-06, "loss": 0.0002, "num_input_tokens_seen": 164175632, "step": 76140 }, { "epoch": 13.974123692420628, "grad_norm": 0.11993497610092163, "learning_rate": 2.5197204985420886e-06, "loss": 0.0008, "num_input_tokens_seen": 164186224, "step": 76145 }, { "epoch": 13.97504129198018, "grad_norm": 0.023596271872520447, "learning_rate": 2.519025240867938e-06, "loss": 0.1192, "num_input_tokens_seen": 164196208, "step": 76150 }, { "epoch": 13.975958891539731, "grad_norm": 0.20691533386707306, "learning_rate": 2.518330046826947e-06, "loss": 0.0004, "num_input_tokens_seen": 164209168, "step": 76155 }, { "epoch": 13.976876491099285, "grad_norm": 0.003738192142918706, "learning_rate": 2.5176349164369405e-06, "loss": 0.0767, "num_input_tokens_seen": 164219248, "step": 76160 }, { "epoch": 13.977794090658836, "grad_norm": 2.8788321018218994, "learning_rate": 2.516939849715754e-06, "loss": 0.2195, "num_input_tokens_seen": 164229584, "step": 76165 }, { "epoch": 13.978711690218388, "grad_norm": 0.006914441008120775, "learning_rate": 2.5162448466812106e-06, "loss": 0.1677, "num_input_tokens_seen": 164240304, "step": 76170 }, { "epoch": 13.979629289777941, "grad_norm": 0.6753994226455688, "learning_rate": 2.515549907351138e-06, "loss": 0.0381, "num_input_tokens_seen": 164251216, "step": 76175 }, { "epoch": 13.980546889337493, "grad_norm": 52.48416519165039, "learning_rate": 2.5148550317433606e-06, "loss": 0.3383, "num_input_tokens_seen": 164262000, "step": 76180 }, { "epoch": 13.981464488897045, "grad_norm": 0.006202551536262035, "learning_rate": 2.5141602198756993e-06, "loss": 0.0003, "num_input_tokens_seen": 164271344, "step": 76185 }, { "epoch": 13.982382088456598, "grad_norm": 0.04235232248902321, "learning_rate": 2.5134654717659735e-06, "loss": 0.0735, "num_input_tokens_seen": 164283056, "step": 76190 }, { "epoch": 13.98329968801615, "grad_norm": 0.0438196137547493, "learning_rate": 2.5127707874320066e-06, "loss": 0.0983, "num_input_tokens_seen": 164295056, "step": 76195 }, { "epoch": 13.984217287575701, "grad_norm": 0.07883860915899277, "learning_rate": 2.512076166891615e-06, "loss": 0.0246, "num_input_tokens_seen": 164305520, "step": 76200 }, { "epoch": 13.985134887135255, "grad_norm": 0.032311610877513885, "learning_rate": 2.5113816101626127e-06, "loss": 0.0006, "num_input_tokens_seen": 164316976, "step": 76205 }, { "epoch": 13.986052486694806, "grad_norm": 0.004099800251424313, "learning_rate": 2.5106871172628133e-06, "loss": 0.0175, "num_input_tokens_seen": 164326896, "step": 76210 }, { "epoch": 13.986970086254358, "grad_norm": 0.0010027717798948288, "learning_rate": 2.5099926882100335e-06, "loss": 0.0883, "num_input_tokens_seen": 164337552, "step": 76215 }, { "epoch": 13.987887685813911, "grad_norm": 0.0017447549616917968, "learning_rate": 2.5092983230220824e-06, "loss": 0.0657, "num_input_tokens_seen": 164349104, "step": 76220 }, { "epoch": 13.988805285373463, "grad_norm": 237.75962829589844, "learning_rate": 2.5086040217167683e-06, "loss": 0.112, "num_input_tokens_seen": 164360656, "step": 76225 }, { "epoch": 13.989722884933014, "grad_norm": 113.5037612915039, "learning_rate": 2.5079097843118984e-06, "loss": 0.1247, "num_input_tokens_seen": 164371504, "step": 76230 }, { "epoch": 13.990640484492568, "grad_norm": 0.009517919272184372, "learning_rate": 2.507215610825282e-06, "loss": 0.2501, "num_input_tokens_seen": 164382800, "step": 76235 }, { "epoch": 13.99155808405212, "grad_norm": 0.04766623303294182, "learning_rate": 2.506521501274722e-06, "loss": 0.1587, "num_input_tokens_seen": 164394416, "step": 76240 }, { "epoch": 13.992475683611671, "grad_norm": 0.01379014179110527, "learning_rate": 2.505827455678018e-06, "loss": 0.0098, "num_input_tokens_seen": 164405328, "step": 76245 }, { "epoch": 13.993393283171224, "grad_norm": 59.24734878540039, "learning_rate": 2.505133474052977e-06, "loss": 0.0331, "num_input_tokens_seen": 164415760, "step": 76250 }, { "epoch": 13.994310882730776, "grad_norm": 0.12718217074871063, "learning_rate": 2.504439556417395e-06, "loss": 0.1503, "num_input_tokens_seen": 164427184, "step": 76255 }, { "epoch": 13.995228482290328, "grad_norm": 0.05016421899199486, "learning_rate": 2.50374570278907e-06, "loss": 0.3535, "num_input_tokens_seen": 164437584, "step": 76260 }, { "epoch": 13.996146081849881, "grad_norm": 12.257607460021973, "learning_rate": 2.5030519131857994e-06, "loss": 0.0424, "num_input_tokens_seen": 164449232, "step": 76265 }, { "epoch": 13.997063681409433, "grad_norm": 0.01324087381362915, "learning_rate": 2.5023581876253776e-06, "loss": 0.2446, "num_input_tokens_seen": 164459824, "step": 76270 }, { "epoch": 13.997981280968984, "grad_norm": 0.09178577363491058, "learning_rate": 2.501664526125598e-06, "loss": 0.0038, "num_input_tokens_seen": 164470576, "step": 76275 }, { "epoch": 13.998898880528538, "grad_norm": 25.6173152923584, "learning_rate": 2.5009709287042485e-06, "loss": 0.0053, "num_input_tokens_seen": 164481328, "step": 76280 }, { "epoch": 13.99981648008809, "grad_norm": 0.07379653304815292, "learning_rate": 2.5002773953791238e-06, "loss": 0.0003, "num_input_tokens_seen": 164492304, "step": 76285 }, { "epoch": 14.0, "eval_loss": 1.000267505645752, "eval_runtime": 179.3989, "eval_samples_per_second": 30.374, "eval_steps_per_second": 7.598, "num_input_tokens_seen": 164493408, "step": 76286 }, { "epoch": 14.000734079647641, "grad_norm": 0.14381887018680573, "learning_rate": 2.49958392616801e-06, "loss": 0.3006, "num_input_tokens_seen": 164502016, "step": 76290 }, { "epoch": 14.001651679207194, "grad_norm": 0.06973625719547272, "learning_rate": 2.4988905210886904e-06, "loss": 0.0001, "num_input_tokens_seen": 164513376, "step": 76295 }, { "epoch": 14.002569278766746, "grad_norm": 0.03373198211193085, "learning_rate": 2.498197180158955e-06, "loss": 0.0763, "num_input_tokens_seen": 164525056, "step": 76300 }, { "epoch": 14.003486878326298, "grad_norm": 0.011661325581371784, "learning_rate": 2.4975039033965847e-06, "loss": 0.0002, "num_input_tokens_seen": 164534848, "step": 76305 }, { "epoch": 14.004404477885851, "grad_norm": 0.1873868703842163, "learning_rate": 2.496810690819361e-06, "loss": 0.0009, "num_input_tokens_seen": 164546592, "step": 76310 }, { "epoch": 14.005322077445403, "grad_norm": 0.02579222247004509, "learning_rate": 2.4961175424450608e-06, "loss": 0.0019, "num_input_tokens_seen": 164556864, "step": 76315 }, { "epoch": 14.006239677004954, "grad_norm": 406.9116516113281, "learning_rate": 2.4954244582914673e-06, "loss": 0.0382, "num_input_tokens_seen": 164566368, "step": 76320 }, { "epoch": 14.007157276564508, "grad_norm": 0.03838857635855675, "learning_rate": 2.4947314383763544e-06, "loss": 0.0021, "num_input_tokens_seen": 164577344, "step": 76325 }, { "epoch": 14.00807487612406, "grad_norm": 0.005336685571819544, "learning_rate": 2.4940384827174956e-06, "loss": 0.0002, "num_input_tokens_seen": 164588192, "step": 76330 }, { "epoch": 14.00899247568361, "grad_norm": 0.18339310586452484, "learning_rate": 2.4933455913326678e-06, "loss": 0.1319, "num_input_tokens_seen": 164598720, "step": 76335 }, { "epoch": 14.009910075243164, "grad_norm": 0.0044530173763632774, "learning_rate": 2.492652764239641e-06, "loss": 0.0001, "num_input_tokens_seen": 164608992, "step": 76340 }, { "epoch": 14.010827674802716, "grad_norm": 0.16114935278892517, "learning_rate": 2.4919600014561824e-06, "loss": 0.012, "num_input_tokens_seen": 164620352, "step": 76345 }, { "epoch": 14.011745274362267, "grad_norm": 0.11449867486953735, "learning_rate": 2.4912673030000646e-06, "loss": 0.0001, "num_input_tokens_seen": 164631232, "step": 76350 }, { "epoch": 14.01266287392182, "grad_norm": 0.00290657882578671, "learning_rate": 2.490574668889052e-06, "loss": 0.0506, "num_input_tokens_seen": 164642720, "step": 76355 }, { "epoch": 14.013580473481372, "grad_norm": 1.41975998878479, "learning_rate": 2.48988209914091e-06, "loss": 0.0009, "num_input_tokens_seen": 164654304, "step": 76360 }, { "epoch": 14.014498073040924, "grad_norm": 0.004186042118817568, "learning_rate": 2.4891895937734e-06, "loss": 0.0646, "num_input_tokens_seen": 164662720, "step": 76365 }, { "epoch": 14.015415672600477, "grad_norm": 0.694844663143158, "learning_rate": 2.4884971528042877e-06, "loss": 0.0002, "num_input_tokens_seen": 164674688, "step": 76370 }, { "epoch": 14.016333272160029, "grad_norm": 0.22920605540275574, "learning_rate": 2.487804776251331e-06, "loss": 0.0025, "num_input_tokens_seen": 164684928, "step": 76375 }, { "epoch": 14.01725087171958, "grad_norm": 0.01209102664142847, "learning_rate": 2.487112464132288e-06, "loss": 0.0006, "num_input_tokens_seen": 164695200, "step": 76380 }, { "epoch": 14.018168471279134, "grad_norm": 70.92121887207031, "learning_rate": 2.4864202164649136e-06, "loss": 0.0066, "num_input_tokens_seen": 164705888, "step": 76385 }, { "epoch": 14.019086070838686, "grad_norm": 0.007494543679058552, "learning_rate": 2.485728033266967e-06, "loss": 0.0004, "num_input_tokens_seen": 164716512, "step": 76390 }, { "epoch": 14.020003670398237, "grad_norm": 1.5689656734466553, "learning_rate": 2.4850359145562e-06, "loss": 0.1506, "num_input_tokens_seen": 164727808, "step": 76395 }, { "epoch": 14.02092126995779, "grad_norm": 0.19050481915473938, "learning_rate": 2.4843438603503633e-06, "loss": 0.0002, "num_input_tokens_seen": 164739104, "step": 76400 }, { "epoch": 14.021838869517342, "grad_norm": 0.07344729453325272, "learning_rate": 2.4836518706672076e-06, "loss": 0.3331, "num_input_tokens_seen": 164749856, "step": 76405 }, { "epoch": 14.022756469076894, "grad_norm": 0.04804683104157448, "learning_rate": 2.4829599455244803e-06, "loss": 0.0002, "num_input_tokens_seen": 164761504, "step": 76410 }, { "epoch": 14.023674068636447, "grad_norm": 0.2324841469526291, "learning_rate": 2.4822680849399306e-06, "loss": 0.3417, "num_input_tokens_seen": 164773312, "step": 76415 }, { "epoch": 14.024591668195999, "grad_norm": 0.033020537346601486, "learning_rate": 2.481576288931302e-06, "loss": 0.0003, "num_input_tokens_seen": 164783648, "step": 76420 }, { "epoch": 14.02550926775555, "grad_norm": 0.03158799186348915, "learning_rate": 2.4808845575163395e-06, "loss": 0.1101, "num_input_tokens_seen": 164794688, "step": 76425 }, { "epoch": 14.026426867315104, "grad_norm": 0.21430830657482147, "learning_rate": 2.4801928907127814e-06, "loss": 0.0002, "num_input_tokens_seen": 164804192, "step": 76430 }, { "epoch": 14.027344466874656, "grad_norm": 0.15331006050109863, "learning_rate": 2.479501288538372e-06, "loss": 0.0001, "num_input_tokens_seen": 164815296, "step": 76435 }, { "epoch": 14.028262066434207, "grad_norm": 0.029111580923199654, "learning_rate": 2.478809751010848e-06, "loss": 0.0003, "num_input_tokens_seen": 164825248, "step": 76440 }, { "epoch": 14.02917966599376, "grad_norm": 0.06229310855269432, "learning_rate": 2.478118278147945e-06, "loss": 0.0004, "num_input_tokens_seen": 164836064, "step": 76445 }, { "epoch": 14.030097265553312, "grad_norm": 0.17946256697177887, "learning_rate": 2.4774268699674016e-06, "loss": 0.2537, "num_input_tokens_seen": 164845504, "step": 76450 }, { "epoch": 14.031014865112864, "grad_norm": 0.0676635131239891, "learning_rate": 2.4767355264869493e-06, "loss": 0.012, "num_input_tokens_seen": 164856320, "step": 76455 }, { "epoch": 14.031932464672417, "grad_norm": 0.04549837112426758, "learning_rate": 2.4760442477243197e-06, "loss": 0.147, "num_input_tokens_seen": 164868064, "step": 76460 }, { "epoch": 14.032850064231969, "grad_norm": 0.0054890671744942665, "learning_rate": 2.4753530336972413e-06, "loss": 0.2032, "num_input_tokens_seen": 164879296, "step": 76465 }, { "epoch": 14.03376766379152, "grad_norm": 0.034440528601408005, "learning_rate": 2.474661884423447e-06, "loss": 0.0002, "num_input_tokens_seen": 164890560, "step": 76470 }, { "epoch": 14.034685263351074, "grad_norm": 0.5022167563438416, "learning_rate": 2.4739707999206613e-06, "loss": 0.0002, "num_input_tokens_seen": 164902880, "step": 76475 }, { "epoch": 14.035602862910626, "grad_norm": 0.025068597868084908, "learning_rate": 2.473279780206608e-06, "loss": 0.1755, "num_input_tokens_seen": 164913664, "step": 76480 }, { "epoch": 14.036520462470177, "grad_norm": 110.05088806152344, "learning_rate": 2.472588825299014e-06, "loss": 0.2437, "num_input_tokens_seen": 164923840, "step": 76485 }, { "epoch": 14.03743806202973, "grad_norm": 0.096973717212677, "learning_rate": 2.4718979352155993e-06, "loss": 0.1894, "num_input_tokens_seen": 164933824, "step": 76490 }, { "epoch": 14.038355661589282, "grad_norm": 33.05854415893555, "learning_rate": 2.471207109974085e-06, "loss": 0.2567, "num_input_tokens_seen": 164943136, "step": 76495 }, { "epoch": 14.039273261148834, "grad_norm": 0.004917914513498545, "learning_rate": 2.4705163495921864e-06, "loss": 0.0003, "num_input_tokens_seen": 164952256, "step": 76500 }, { "epoch": 14.040190860708387, "grad_norm": 0.05870610475540161, "learning_rate": 2.469825654087625e-06, "loss": 0.17, "num_input_tokens_seen": 164962240, "step": 76505 }, { "epoch": 14.041108460267939, "grad_norm": 0.05529661849141121, "learning_rate": 2.469135023478114e-06, "loss": 0.0452, "num_input_tokens_seen": 164973536, "step": 76510 }, { "epoch": 14.04202605982749, "grad_norm": 0.08632810413837433, "learning_rate": 2.468444457781366e-06, "loss": 0.3224, "num_input_tokens_seen": 164986144, "step": 76515 }, { "epoch": 14.042943659387044, "grad_norm": 55.12712860107422, "learning_rate": 2.4677539570150955e-06, "loss": 0.4429, "num_input_tokens_seen": 164997056, "step": 76520 }, { "epoch": 14.043861258946595, "grad_norm": 0.24414902925491333, "learning_rate": 2.4670635211970116e-06, "loss": 0.0005, "num_input_tokens_seen": 165007776, "step": 76525 }, { "epoch": 14.044778858506147, "grad_norm": 0.10632188618183136, "learning_rate": 2.4663731503448208e-06, "loss": 0.0002, "num_input_tokens_seen": 165018848, "step": 76530 }, { "epoch": 14.0456964580657, "grad_norm": 36.50412368774414, "learning_rate": 2.4656828444762337e-06, "loss": 0.1154, "num_input_tokens_seen": 165028800, "step": 76535 }, { "epoch": 14.046614057625252, "grad_norm": 0.2008466273546219, "learning_rate": 2.464992603608954e-06, "loss": 0.0013, "num_input_tokens_seen": 165037824, "step": 76540 }, { "epoch": 14.047531657184804, "grad_norm": 0.005735558923333883, "learning_rate": 2.4643024277606846e-06, "loss": 0.0002, "num_input_tokens_seen": 165049504, "step": 76545 }, { "epoch": 14.048449256744357, "grad_norm": 74.70722198486328, "learning_rate": 2.4636123169491265e-06, "loss": 0.0756, "num_input_tokens_seen": 165060352, "step": 76550 }, { "epoch": 14.049366856303909, "grad_norm": 0.01038387045264244, "learning_rate": 2.4629222711919836e-06, "loss": 0.0, "num_input_tokens_seen": 165069984, "step": 76555 }, { "epoch": 14.05028445586346, "grad_norm": 0.002392527414485812, "learning_rate": 2.4622322905069517e-06, "loss": 0.0108, "num_input_tokens_seen": 165080960, "step": 76560 }, { "epoch": 14.051202055423014, "grad_norm": 0.0077653429470956326, "learning_rate": 2.4615423749117266e-06, "loss": 0.0004, "num_input_tokens_seen": 165092160, "step": 76565 }, { "epoch": 14.052119654982565, "grad_norm": 22.332548141479492, "learning_rate": 2.460852524424008e-06, "loss": 0.0208, "num_input_tokens_seen": 165103776, "step": 76570 }, { "epoch": 14.053037254542117, "grad_norm": 0.047156572341918945, "learning_rate": 2.460162739061486e-06, "loss": 0.0922, "num_input_tokens_seen": 165115488, "step": 76575 }, { "epoch": 14.05395485410167, "grad_norm": 42.061546325683594, "learning_rate": 2.4594730188418513e-06, "loss": 0.0746, "num_input_tokens_seen": 165126336, "step": 76580 }, { "epoch": 14.054872453661222, "grad_norm": 2.026946544647217, "learning_rate": 2.4587833637827986e-06, "loss": 0.0773, "num_input_tokens_seen": 165138112, "step": 76585 }, { "epoch": 14.055790053220774, "grad_norm": 0.07164766639471054, "learning_rate": 2.458093773902014e-06, "loss": 0.0002, "num_input_tokens_seen": 165149280, "step": 76590 }, { "epoch": 14.056707652780327, "grad_norm": 0.03911105543375015, "learning_rate": 2.4574042492171844e-06, "loss": 0.0041, "num_input_tokens_seen": 165160288, "step": 76595 }, { "epoch": 14.057625252339879, "grad_norm": 0.03279199078679085, "learning_rate": 2.4567147897459954e-06, "loss": 0.0084, "num_input_tokens_seen": 165172352, "step": 76600 }, { "epoch": 14.05854285189943, "grad_norm": 0.12165536731481552, "learning_rate": 2.456025395506128e-06, "loss": 0.0035, "num_input_tokens_seen": 165182336, "step": 76605 }, { "epoch": 14.059460451458984, "grad_norm": 0.019664188846945763, "learning_rate": 2.4553360665152685e-06, "loss": 0.0288, "num_input_tokens_seen": 165191584, "step": 76610 }, { "epoch": 14.060378051018535, "grad_norm": 0.06240590661764145, "learning_rate": 2.4546468027910952e-06, "loss": 0.0005, "num_input_tokens_seen": 165201024, "step": 76615 }, { "epoch": 14.061295650578089, "grad_norm": 0.10517384111881256, "learning_rate": 2.4539576043512862e-06, "loss": 0.0002, "num_input_tokens_seen": 165212256, "step": 76620 }, { "epoch": 14.06221325013764, "grad_norm": 0.014087497256696224, "learning_rate": 2.453268471213519e-06, "loss": 0.0244, "num_input_tokens_seen": 165223008, "step": 76625 }, { "epoch": 14.063130849697192, "grad_norm": 0.33074402809143066, "learning_rate": 2.4525794033954657e-06, "loss": 0.0003, "num_input_tokens_seen": 165233920, "step": 76630 }, { "epoch": 14.064048449256745, "grad_norm": 0.4021071195602417, "learning_rate": 2.4518904009148054e-06, "loss": 0.0005, "num_input_tokens_seen": 165244480, "step": 76635 }, { "epoch": 14.064966048816297, "grad_norm": 0.08495653420686722, "learning_rate": 2.4512014637892067e-06, "loss": 0.2565, "num_input_tokens_seen": 165255264, "step": 76640 }, { "epoch": 14.065883648375848, "grad_norm": 0.16507382690906525, "learning_rate": 2.4505125920363403e-06, "loss": 0.0003, "num_input_tokens_seen": 165266208, "step": 76645 }, { "epoch": 14.066801247935402, "grad_norm": 0.006933143362402916, "learning_rate": 2.4498237856738728e-06, "loss": 0.1591, "num_input_tokens_seen": 165276128, "step": 76650 }, { "epoch": 14.067718847494953, "grad_norm": 0.023433972150087357, "learning_rate": 2.449135044719474e-06, "loss": 0.0706, "num_input_tokens_seen": 165287392, "step": 76655 }, { "epoch": 14.068636447054505, "grad_norm": 0.2841457724571228, "learning_rate": 2.4484463691908082e-06, "loss": 0.0004, "num_input_tokens_seen": 165298784, "step": 76660 }, { "epoch": 14.069554046614059, "grad_norm": 0.009372876025736332, "learning_rate": 2.4477577591055368e-06, "loss": 0.0001, "num_input_tokens_seen": 165309664, "step": 76665 }, { "epoch": 14.07047164617361, "grad_norm": 0.01320229284465313, "learning_rate": 2.4470692144813254e-06, "loss": 0.2283, "num_input_tokens_seen": 165320064, "step": 76670 }, { "epoch": 14.071389245733162, "grad_norm": 0.6420143842697144, "learning_rate": 2.4463807353358317e-06, "loss": 0.0003, "num_input_tokens_seen": 165330176, "step": 76675 }, { "epoch": 14.072306845292715, "grad_norm": 1.3779927492141724, "learning_rate": 2.445692321686714e-06, "loss": 0.0002, "num_input_tokens_seen": 165341440, "step": 76680 }, { "epoch": 14.073224444852267, "grad_norm": 0.7591878175735474, "learning_rate": 2.445003973551628e-06, "loss": 0.0003, "num_input_tokens_seen": 165351936, "step": 76685 }, { "epoch": 14.074142044411818, "grad_norm": 0.004228897858411074, "learning_rate": 2.4443156909482318e-06, "loss": 0.0002, "num_input_tokens_seen": 165362400, "step": 76690 }, { "epoch": 14.075059643971372, "grad_norm": 0.0028413350228220224, "learning_rate": 2.4436274738941773e-06, "loss": 0.1564, "num_input_tokens_seen": 165372800, "step": 76695 }, { "epoch": 14.075977243530923, "grad_norm": 0.010616573505103588, "learning_rate": 2.442939322407114e-06, "loss": 0.1349, "num_input_tokens_seen": 165383776, "step": 76700 }, { "epoch": 14.076894843090475, "grad_norm": 0.014703080989420414, "learning_rate": 2.4422512365046957e-06, "loss": 0.0002, "num_input_tokens_seen": 165393664, "step": 76705 }, { "epoch": 14.077812442650028, "grad_norm": 0.022640686482191086, "learning_rate": 2.4415632162045695e-06, "loss": 0.0001, "num_input_tokens_seen": 165404896, "step": 76710 }, { "epoch": 14.07873004220958, "grad_norm": 0.03995944559574127, "learning_rate": 2.4408752615243796e-06, "loss": 0.0823, "num_input_tokens_seen": 165415168, "step": 76715 }, { "epoch": 14.079647641769132, "grad_norm": 0.3038743734359741, "learning_rate": 2.440187372481775e-06, "loss": 0.1239, "num_input_tokens_seen": 165424800, "step": 76720 }, { "epoch": 14.080565241328685, "grad_norm": 0.047140687704086304, "learning_rate": 2.439499549094397e-06, "loss": 0.1295, "num_input_tokens_seen": 165435648, "step": 76725 }, { "epoch": 14.081482840888237, "grad_norm": 24.78993034362793, "learning_rate": 2.4388117913798866e-06, "loss": 0.056, "num_input_tokens_seen": 165446912, "step": 76730 }, { "epoch": 14.082400440447788, "grad_norm": 0.05681304261088371, "learning_rate": 2.4381240993558824e-06, "loss": 0.0002, "num_input_tokens_seen": 165458656, "step": 76735 }, { "epoch": 14.083318040007342, "grad_norm": 0.38053834438323975, "learning_rate": 2.4374364730400268e-06, "loss": 0.0053, "num_input_tokens_seen": 165470208, "step": 76740 }, { "epoch": 14.084235639566893, "grad_norm": 47.118045806884766, "learning_rate": 2.4367489124499544e-06, "loss": 0.0518, "num_input_tokens_seen": 165480192, "step": 76745 }, { "epoch": 14.085153239126445, "grad_norm": 184.80943298339844, "learning_rate": 2.436061417603297e-06, "loss": 0.0984, "num_input_tokens_seen": 165491104, "step": 76750 }, { "epoch": 14.086070838685998, "grad_norm": 22.164766311645508, "learning_rate": 2.435373988517693e-06, "loss": 0.0668, "num_input_tokens_seen": 165501824, "step": 76755 }, { "epoch": 14.08698843824555, "grad_norm": 0.16537818312644958, "learning_rate": 2.434686625210771e-06, "loss": 0.2202, "num_input_tokens_seen": 165513312, "step": 76760 }, { "epoch": 14.087906037805102, "grad_norm": 44.007659912109375, "learning_rate": 2.4339993277001597e-06, "loss": 0.0092, "num_input_tokens_seen": 165524864, "step": 76765 }, { "epoch": 14.088823637364655, "grad_norm": 0.0081296032294631, "learning_rate": 2.43331209600349e-06, "loss": 0.0078, "num_input_tokens_seen": 165533792, "step": 76770 }, { "epoch": 14.089741236924207, "grad_norm": 92.2472915649414, "learning_rate": 2.4326249301383876e-06, "loss": 0.1739, "num_input_tokens_seen": 165543552, "step": 76775 }, { "epoch": 14.090658836483758, "grad_norm": 0.02376566268503666, "learning_rate": 2.431937830122476e-06, "loss": 0.1684, "num_input_tokens_seen": 165553856, "step": 76780 }, { "epoch": 14.091576436043312, "grad_norm": 80.4717025756836, "learning_rate": 2.431250795973378e-06, "loss": 0.0298, "num_input_tokens_seen": 165565088, "step": 76785 }, { "epoch": 14.092494035602863, "grad_norm": 0.04569279029965401, "learning_rate": 2.430563827708717e-06, "loss": 0.1418, "num_input_tokens_seen": 165574912, "step": 76790 }, { "epoch": 14.093411635162415, "grad_norm": 0.0042461492121219635, "learning_rate": 2.429876925346112e-06, "loss": 0.0001, "num_input_tokens_seen": 165586144, "step": 76795 }, { "epoch": 14.094329234721968, "grad_norm": 64.97073364257812, "learning_rate": 2.429190088903178e-06, "loss": 0.1889, "num_input_tokens_seen": 165597728, "step": 76800 }, { "epoch": 14.09524683428152, "grad_norm": 0.5697104930877686, "learning_rate": 2.4285033183975364e-06, "loss": 0.0003, "num_input_tokens_seen": 165608192, "step": 76805 }, { "epoch": 14.096164433841071, "grad_norm": 0.017578771337866783, "learning_rate": 2.427816613846799e-06, "loss": 0.0001, "num_input_tokens_seen": 165618848, "step": 76810 }, { "epoch": 14.097082033400625, "grad_norm": 0.028767049312591553, "learning_rate": 2.427129975268579e-06, "loss": 0.0003, "num_input_tokens_seen": 165629888, "step": 76815 }, { "epoch": 14.097999632960176, "grad_norm": 0.0023487010039389133, "learning_rate": 2.426443402680487e-06, "loss": 0.1829, "num_input_tokens_seen": 165639680, "step": 76820 }, { "epoch": 14.098917232519728, "grad_norm": 0.07843511551618576, "learning_rate": 2.4257568961001316e-06, "loss": 0.1896, "num_input_tokens_seen": 165649184, "step": 76825 }, { "epoch": 14.099834832079281, "grad_norm": 0.0009451498626731336, "learning_rate": 2.4250704555451245e-06, "loss": 0.0306, "num_input_tokens_seen": 165660320, "step": 76830 }, { "epoch": 14.100752431638833, "grad_norm": 0.029864907264709473, "learning_rate": 2.424384081033069e-06, "loss": 0.0002, "num_input_tokens_seen": 165672160, "step": 76835 }, { "epoch": 14.101670031198385, "grad_norm": 155.43800354003906, "learning_rate": 2.4236977725815696e-06, "loss": 0.0335, "num_input_tokens_seen": 165680512, "step": 76840 }, { "epoch": 14.102587630757938, "grad_norm": 0.00989094190299511, "learning_rate": 2.4230115302082295e-06, "loss": 0.0001, "num_input_tokens_seen": 165691328, "step": 76845 }, { "epoch": 14.10350523031749, "grad_norm": 0.05111198127269745, "learning_rate": 2.4223253539306487e-06, "loss": 0.1384, "num_input_tokens_seen": 165702240, "step": 76850 }, { "epoch": 14.104422829877041, "grad_norm": 0.027509193867444992, "learning_rate": 2.4216392437664284e-06, "loss": 0.0011, "num_input_tokens_seen": 165713568, "step": 76855 }, { "epoch": 14.105340429436595, "grad_norm": 0.02138839289546013, "learning_rate": 2.420953199733166e-06, "loss": 0.0004, "num_input_tokens_seen": 165724352, "step": 76860 }, { "epoch": 14.106258028996146, "grad_norm": 0.0605839341878891, "learning_rate": 2.4202672218484563e-06, "loss": 0.0017, "num_input_tokens_seen": 165734080, "step": 76865 }, { "epoch": 14.107175628555698, "grad_norm": 0.09725257754325867, "learning_rate": 2.4195813101298928e-06, "loss": 0.0004, "num_input_tokens_seen": 165745248, "step": 76870 }, { "epoch": 14.108093228115251, "grad_norm": 0.02532116137444973, "learning_rate": 2.4188954645950715e-06, "loss": 0.0002, "num_input_tokens_seen": 165757312, "step": 76875 }, { "epoch": 14.109010827674803, "grad_norm": 0.0042810942977666855, "learning_rate": 2.4182096852615806e-06, "loss": 0.0019, "num_input_tokens_seen": 165767520, "step": 76880 }, { "epoch": 14.109928427234355, "grad_norm": 20.713104248046875, "learning_rate": 2.417523972147008e-06, "loss": 0.0976, "num_input_tokens_seen": 165778976, "step": 76885 }, { "epoch": 14.110846026793908, "grad_norm": 32.411712646484375, "learning_rate": 2.4168383252689447e-06, "loss": 0.0052, "num_input_tokens_seen": 165790240, "step": 76890 }, { "epoch": 14.11176362635346, "grad_norm": 0.0462665855884552, "learning_rate": 2.4161527446449757e-06, "loss": 0.0916, "num_input_tokens_seen": 165801216, "step": 76895 }, { "epoch": 14.112681225913011, "grad_norm": 0.1511201709508896, "learning_rate": 2.415467230292681e-06, "loss": 0.0004, "num_input_tokens_seen": 165812512, "step": 76900 }, { "epoch": 14.113598825472565, "grad_norm": 34.77143096923828, "learning_rate": 2.414781782229649e-06, "loss": 0.129, "num_input_tokens_seen": 165822400, "step": 76905 }, { "epoch": 14.114516425032116, "grad_norm": 0.09447044879198074, "learning_rate": 2.414096400473458e-06, "loss": 0.0027, "num_input_tokens_seen": 165831712, "step": 76910 }, { "epoch": 14.115434024591668, "grad_norm": 0.16092464327812195, "learning_rate": 2.413411085041686e-06, "loss": 0.2158, "num_input_tokens_seen": 165841408, "step": 76915 }, { "epoch": 14.116351624151221, "grad_norm": 0.15716637670993805, "learning_rate": 2.4127258359519083e-06, "loss": 0.1193, "num_input_tokens_seen": 165851968, "step": 76920 }, { "epoch": 14.117269223710773, "grad_norm": 0.0281792301684618, "learning_rate": 2.412040653221706e-06, "loss": 0.0002, "num_input_tokens_seen": 165863520, "step": 76925 }, { "epoch": 14.118186823270324, "grad_norm": 0.061655621975660324, "learning_rate": 2.411355536868649e-06, "loss": 0.0002, "num_input_tokens_seen": 165874720, "step": 76930 }, { "epoch": 14.119104422829878, "grad_norm": 0.016965024173259735, "learning_rate": 2.410670486910309e-06, "loss": 0.0431, "num_input_tokens_seen": 165885312, "step": 76935 }, { "epoch": 14.12002202238943, "grad_norm": 0.06877478957176208, "learning_rate": 2.40998550336426e-06, "loss": 0.1011, "num_input_tokens_seen": 165896416, "step": 76940 }, { "epoch": 14.120939621948981, "grad_norm": 57.371707916259766, "learning_rate": 2.409300586248069e-06, "loss": 0.1711, "num_input_tokens_seen": 165906816, "step": 76945 }, { "epoch": 14.121857221508535, "grad_norm": 243.60533142089844, "learning_rate": 2.408615735579302e-06, "loss": 0.0461, "num_input_tokens_seen": 165919040, "step": 76950 }, { "epoch": 14.122774821068086, "grad_norm": 0.01504202838987112, "learning_rate": 2.407930951375523e-06, "loss": 0.0004, "num_input_tokens_seen": 165929376, "step": 76955 }, { "epoch": 14.123692420627638, "grad_norm": 22.819965362548828, "learning_rate": 2.4072462336543007e-06, "loss": 0.0042, "num_input_tokens_seen": 165939168, "step": 76960 }, { "epoch": 14.124610020187191, "grad_norm": 0.015882452949881554, "learning_rate": 2.4065615824331936e-06, "loss": 0.0001, "num_input_tokens_seen": 165949696, "step": 76965 }, { "epoch": 14.125527619746743, "grad_norm": 0.013529806397855282, "learning_rate": 2.4058769977297604e-06, "loss": 0.0006, "num_input_tokens_seen": 165961632, "step": 76970 }, { "epoch": 14.126445219306294, "grad_norm": 0.03069990687072277, "learning_rate": 2.405192479561563e-06, "loss": 0.0656, "num_input_tokens_seen": 165972416, "step": 76975 }, { "epoch": 14.127362818865848, "grad_norm": 15.918705940246582, "learning_rate": 2.404508027946158e-06, "loss": 0.0056, "num_input_tokens_seen": 165982944, "step": 76980 }, { "epoch": 14.1282804184254, "grad_norm": 0.03125506639480591, "learning_rate": 2.403823642901097e-06, "loss": 0.0001, "num_input_tokens_seen": 165994208, "step": 76985 }, { "epoch": 14.129198017984951, "grad_norm": 89.8021240234375, "learning_rate": 2.403139324443938e-06, "loss": 0.0507, "num_input_tokens_seen": 166004128, "step": 76990 }, { "epoch": 14.130115617544504, "grad_norm": 0.021267419680953026, "learning_rate": 2.40245507259223e-06, "loss": 0.0006, "num_input_tokens_seen": 166016480, "step": 76995 }, { "epoch": 14.131033217104056, "grad_norm": 0.0032644239254295826, "learning_rate": 2.401770887363524e-06, "loss": 0.0004, "num_input_tokens_seen": 166027168, "step": 77000 }, { "epoch": 14.131950816663608, "grad_norm": 0.042395174503326416, "learning_rate": 2.401086768775366e-06, "loss": 0.0002, "num_input_tokens_seen": 166038912, "step": 77005 }, { "epoch": 14.132868416223161, "grad_norm": 0.07872933894395828, "learning_rate": 2.4004027168453063e-06, "loss": 0.0003, "num_input_tokens_seen": 166049088, "step": 77010 }, { "epoch": 14.133786015782713, "grad_norm": 31.731815338134766, "learning_rate": 2.3997187315908876e-06, "loss": 0.0104, "num_input_tokens_seen": 166058432, "step": 77015 }, { "epoch": 14.134703615342264, "grad_norm": 0.030533166602253914, "learning_rate": 2.399034813029652e-06, "loss": 0.0002, "num_input_tokens_seen": 166069600, "step": 77020 }, { "epoch": 14.135621214901818, "grad_norm": 0.015268353745341301, "learning_rate": 2.3983509611791437e-06, "loss": 0.1355, "num_input_tokens_seen": 166080992, "step": 77025 }, { "epoch": 14.13653881446137, "grad_norm": 0.5938348174095154, "learning_rate": 2.3976671760569014e-06, "loss": 0.0002, "num_input_tokens_seen": 166091712, "step": 77030 }, { "epoch": 14.137456414020921, "grad_norm": 26.45078468322754, "learning_rate": 2.3969834576804623e-06, "loss": 0.0061, "num_input_tokens_seen": 166104000, "step": 77035 }, { "epoch": 14.138374013580474, "grad_norm": 0.019968103617429733, "learning_rate": 2.396299806067364e-06, "loss": 0.0002, "num_input_tokens_seen": 166114368, "step": 77040 }, { "epoch": 14.139291613140026, "grad_norm": 0.01701059378683567, "learning_rate": 2.395616221235138e-06, "loss": 0.0001, "num_input_tokens_seen": 166125536, "step": 77045 }, { "epoch": 14.140209212699578, "grad_norm": 0.032485272735357285, "learning_rate": 2.3949327032013214e-06, "loss": 0.1069, "num_input_tokens_seen": 166135584, "step": 77050 }, { "epoch": 14.141126812259131, "grad_norm": 0.005675832275301218, "learning_rate": 2.3942492519834433e-06, "loss": 0.0001, "num_input_tokens_seen": 166146048, "step": 77055 }, { "epoch": 14.142044411818683, "grad_norm": 1.9017415046691895, "learning_rate": 2.393565867599033e-06, "loss": 0.0005, "num_input_tokens_seen": 166155744, "step": 77060 }, { "epoch": 14.142962011378234, "grad_norm": 0.0581093393266201, "learning_rate": 2.3928825500656192e-06, "loss": 0.0001, "num_input_tokens_seen": 166165792, "step": 77065 }, { "epoch": 14.143879610937788, "grad_norm": 0.21242912113666534, "learning_rate": 2.392199299400725e-06, "loss": 0.0434, "num_input_tokens_seen": 166176640, "step": 77070 }, { "epoch": 14.14479721049734, "grad_norm": 0.446505069732666, "learning_rate": 2.3915161156218797e-06, "loss": 0.1812, "num_input_tokens_seen": 166188224, "step": 77075 }, { "epoch": 14.14571481005689, "grad_norm": 26.229646682739258, "learning_rate": 2.390832998746603e-06, "loss": 0.0937, "num_input_tokens_seen": 166200192, "step": 77080 }, { "epoch": 14.146632409616444, "grad_norm": 0.18104319274425507, "learning_rate": 2.3901499487924155e-06, "loss": 0.0027, "num_input_tokens_seen": 166212352, "step": 77085 }, { "epoch": 14.147550009175996, "grad_norm": 0.0030167726799845695, "learning_rate": 2.3894669657768356e-06, "loss": 0.1253, "num_input_tokens_seen": 166223200, "step": 77090 }, { "epoch": 14.148467608735547, "grad_norm": 1.0880744457244873, "learning_rate": 2.3887840497173835e-06, "loss": 0.0012, "num_input_tokens_seen": 166233280, "step": 77095 }, { "epoch": 14.1493852082951, "grad_norm": 0.02861408330500126, "learning_rate": 2.3881012006315734e-06, "loss": 0.0079, "num_input_tokens_seen": 166243744, "step": 77100 }, { "epoch": 14.150302807854652, "grad_norm": 0.002941675251349807, "learning_rate": 2.387418418536918e-06, "loss": 0.0002, "num_input_tokens_seen": 166254848, "step": 77105 }, { "epoch": 14.151220407414204, "grad_norm": 0.06197408586740494, "learning_rate": 2.386735703450933e-06, "loss": 0.347, "num_input_tokens_seen": 166266976, "step": 77110 }, { "epoch": 14.152138006973757, "grad_norm": 82.96967315673828, "learning_rate": 2.3860530553911263e-06, "loss": 0.0066, "num_input_tokens_seen": 166278560, "step": 77115 }, { "epoch": 14.153055606533309, "grad_norm": 0.004595611244440079, "learning_rate": 2.385370474375007e-06, "loss": 0.0051, "num_input_tokens_seen": 166290848, "step": 77120 }, { "epoch": 14.15397320609286, "grad_norm": 0.0012541807955130935, "learning_rate": 2.3846879604200828e-06, "loss": 0.0003, "num_input_tokens_seen": 166302368, "step": 77125 }, { "epoch": 14.154890805652414, "grad_norm": 0.5744072198867798, "learning_rate": 2.38400551354386e-06, "loss": 0.1162, "num_input_tokens_seen": 166312768, "step": 77130 }, { "epoch": 14.155808405211966, "grad_norm": 99.39839172363281, "learning_rate": 2.3833231337638413e-06, "loss": 0.1417, "num_input_tokens_seen": 166323296, "step": 77135 }, { "epoch": 14.156726004771517, "grad_norm": 0.003494191449135542, "learning_rate": 2.382640821097527e-06, "loss": 0.2258, "num_input_tokens_seen": 166333920, "step": 77140 }, { "epoch": 14.15764360433107, "grad_norm": 0.03175666928291321, "learning_rate": 2.381958575562421e-06, "loss": 0.1253, "num_input_tokens_seen": 166344896, "step": 77145 }, { "epoch": 14.158561203890622, "grad_norm": 0.004177512135356665, "learning_rate": 2.3812763971760196e-06, "loss": 0.0622, "num_input_tokens_seen": 166355456, "step": 77150 }, { "epoch": 14.159478803450174, "grad_norm": 0.2290714532136917, "learning_rate": 2.3805942859558183e-06, "loss": 0.0011, "num_input_tokens_seen": 166365888, "step": 77155 }, { "epoch": 14.160396403009727, "grad_norm": 0.09109574556350708, "learning_rate": 2.3799122419193155e-06, "loss": 0.0883, "num_input_tokens_seen": 166377056, "step": 77160 }, { "epoch": 14.161314002569279, "grad_norm": 0.052369024604558945, "learning_rate": 2.3792302650840032e-06, "loss": 0.0617, "num_input_tokens_seen": 166388992, "step": 77165 }, { "epoch": 14.16223160212883, "grad_norm": 18.809606552124023, "learning_rate": 2.3785483554673707e-06, "loss": 0.0059, "num_input_tokens_seen": 166399968, "step": 77170 }, { "epoch": 14.163149201688384, "grad_norm": 0.006661712657660246, "learning_rate": 2.377866513086912e-06, "loss": 0.0003, "num_input_tokens_seen": 166410784, "step": 77175 }, { "epoch": 14.164066801247936, "grad_norm": 0.10666804015636444, "learning_rate": 2.377184737960113e-06, "loss": 0.0794, "num_input_tokens_seen": 166422144, "step": 77180 }, { "epoch": 14.164984400807487, "grad_norm": 0.36915287375450134, "learning_rate": 2.376503030104461e-06, "loss": 0.1627, "num_input_tokens_seen": 166433824, "step": 77185 }, { "epoch": 14.16590200036704, "grad_norm": 0.0799977257847786, "learning_rate": 2.3758213895374383e-06, "loss": 0.048, "num_input_tokens_seen": 166445536, "step": 77190 }, { "epoch": 14.166819599926592, "grad_norm": 49.45725631713867, "learning_rate": 2.375139816276531e-06, "loss": 0.3224, "num_input_tokens_seen": 166456160, "step": 77195 }, { "epoch": 14.167737199486144, "grad_norm": 0.16224254667758942, "learning_rate": 2.37445831033922e-06, "loss": 0.0648, "num_input_tokens_seen": 166464672, "step": 77200 }, { "epoch": 14.168654799045697, "grad_norm": 0.0636339783668518, "learning_rate": 2.3737768717429823e-06, "loss": 0.0002, "num_input_tokens_seen": 166475712, "step": 77205 }, { "epoch": 14.169572398605249, "grad_norm": 0.3599509000778198, "learning_rate": 2.373095500505299e-06, "loss": 0.0007, "num_input_tokens_seen": 166485856, "step": 77210 }, { "epoch": 14.1704899981648, "grad_norm": 0.008627123199403286, "learning_rate": 2.372414196643645e-06, "loss": 0.0006, "num_input_tokens_seen": 166494880, "step": 77215 }, { "epoch": 14.171407597724354, "grad_norm": 0.029530329629778862, "learning_rate": 2.3717329601754923e-06, "loss": 0.1385, "num_input_tokens_seen": 166506112, "step": 77220 }, { "epoch": 14.172325197283905, "grad_norm": 0.11482847481966019, "learning_rate": 2.371051791118318e-06, "loss": 0.0001, "num_input_tokens_seen": 166516960, "step": 77225 }, { "epoch": 14.173242796843457, "grad_norm": 4.670460224151611, "learning_rate": 2.3703706894895906e-06, "loss": 0.1663, "num_input_tokens_seen": 166528352, "step": 77230 }, { "epoch": 14.17416039640301, "grad_norm": 0.04888483136892319, "learning_rate": 2.3696896553067795e-06, "loss": 0.037, "num_input_tokens_seen": 166540192, "step": 77235 }, { "epoch": 14.175077995962562, "grad_norm": 0.2717210054397583, "learning_rate": 2.36900868858735e-06, "loss": 0.0069, "num_input_tokens_seen": 166551392, "step": 77240 }, { "epoch": 14.175995595522114, "grad_norm": 0.019166551530361176, "learning_rate": 2.3683277893487723e-06, "loss": 0.0005, "num_input_tokens_seen": 166562048, "step": 77245 }, { "epoch": 14.176913195081667, "grad_norm": 273.6065979003906, "learning_rate": 2.3676469576085076e-06, "loss": 0.1496, "num_input_tokens_seen": 166572320, "step": 77250 }, { "epoch": 14.177830794641219, "grad_norm": 0.04752260446548462, "learning_rate": 2.366966193384019e-06, "loss": 0.0005, "num_input_tokens_seen": 166583360, "step": 77255 }, { "epoch": 14.17874839420077, "grad_norm": 7.520537853240967, "learning_rate": 2.3662854966927662e-06, "loss": 0.0596, "num_input_tokens_seen": 166593760, "step": 77260 }, { "epoch": 14.179665993760324, "grad_norm": 0.004032780881971121, "learning_rate": 2.3656048675522094e-06, "loss": 0.0041, "num_input_tokens_seen": 166603616, "step": 77265 }, { "epoch": 14.180583593319875, "grad_norm": 0.626481294631958, "learning_rate": 2.364924305979802e-06, "loss": 0.0004, "num_input_tokens_seen": 166615744, "step": 77270 }, { "epoch": 14.181501192879427, "grad_norm": 0.6571733951568604, "learning_rate": 2.3642438119930046e-06, "loss": 0.0006, "num_input_tokens_seen": 166627456, "step": 77275 }, { "epoch": 14.18241879243898, "grad_norm": 0.005814298987388611, "learning_rate": 2.3635633856092684e-06, "loss": 0.0887, "num_input_tokens_seen": 166638496, "step": 77280 }, { "epoch": 14.183336391998532, "grad_norm": 0.0017815885366871953, "learning_rate": 2.3628830268460452e-06, "loss": 0.0005, "num_input_tokens_seen": 166648352, "step": 77285 }, { "epoch": 14.184253991558084, "grad_norm": 0.13272790610790253, "learning_rate": 2.3622027357207826e-06, "loss": 0.1663, "num_input_tokens_seen": 166660416, "step": 77290 }, { "epoch": 14.185171591117637, "grad_norm": 0.08346536755561829, "learning_rate": 2.3615225122509345e-06, "loss": 0.0053, "num_input_tokens_seen": 166672064, "step": 77295 }, { "epoch": 14.186089190677189, "grad_norm": 0.006880569271743298, "learning_rate": 2.360842356453944e-06, "loss": 0.12, "num_input_tokens_seen": 166682400, "step": 77300 }, { "epoch": 14.18700679023674, "grad_norm": 0.029668958857655525, "learning_rate": 2.3601622683472553e-06, "loss": 0.0008, "num_input_tokens_seen": 166693440, "step": 77305 }, { "epoch": 14.187924389796294, "grad_norm": 0.21883535385131836, "learning_rate": 2.359482247948315e-06, "loss": 0.0001, "num_input_tokens_seen": 166705344, "step": 77310 }, { "epoch": 14.188841989355845, "grad_norm": 0.0707990750670433, "learning_rate": 2.358802295274562e-06, "loss": 0.1457, "num_input_tokens_seen": 166717664, "step": 77315 }, { "epoch": 14.189759588915397, "grad_norm": 0.19471009075641632, "learning_rate": 2.3581224103434377e-06, "loss": 0.0001, "num_input_tokens_seen": 166728832, "step": 77320 }, { "epoch": 14.19067718847495, "grad_norm": 0.408096581697464, "learning_rate": 2.357442593172376e-06, "loss": 0.0091, "num_input_tokens_seen": 166740448, "step": 77325 }, { "epoch": 14.191594788034502, "grad_norm": 0.4879668951034546, "learning_rate": 2.356762843778819e-06, "loss": 0.008, "num_input_tokens_seen": 166751136, "step": 77330 }, { "epoch": 14.192512387594054, "grad_norm": 0.018958454951643944, "learning_rate": 2.3560831621801977e-06, "loss": 0.0001, "num_input_tokens_seen": 166762560, "step": 77335 }, { "epoch": 14.193429987153607, "grad_norm": 0.0069720130413770676, "learning_rate": 2.3554035483939437e-06, "loss": 0.04, "num_input_tokens_seen": 166772064, "step": 77340 }, { "epoch": 14.194347586713159, "grad_norm": 0.028763506561517715, "learning_rate": 2.3547240024374922e-06, "loss": 0.0884, "num_input_tokens_seen": 166782912, "step": 77345 }, { "epoch": 14.19526518627271, "grad_norm": 0.06974315643310547, "learning_rate": 2.35404452432827e-06, "loss": 0.0002, "num_input_tokens_seen": 166794112, "step": 77350 }, { "epoch": 14.196182785832264, "grad_norm": 51.35451889038086, "learning_rate": 2.3533651140837034e-06, "loss": 0.1348, "num_input_tokens_seen": 166804160, "step": 77355 }, { "epoch": 14.197100385391815, "grad_norm": 184.08920288085938, "learning_rate": 2.352685771721221e-06, "loss": 0.1231, "num_input_tokens_seen": 166815072, "step": 77360 }, { "epoch": 14.198017984951367, "grad_norm": 0.011387973092496395, "learning_rate": 2.3520064972582458e-06, "loss": 0.0001, "num_input_tokens_seen": 166826464, "step": 77365 }, { "epoch": 14.19893558451092, "grad_norm": 0.002757285488769412, "learning_rate": 2.3513272907122005e-06, "loss": 0.0755, "num_input_tokens_seen": 166838720, "step": 77370 }, { "epoch": 14.199853184070472, "grad_norm": 0.04594716429710388, "learning_rate": 2.3506481521005028e-06, "loss": 0.0001, "num_input_tokens_seen": 166848160, "step": 77375 }, { "epoch": 14.200770783630023, "grad_norm": 0.03830581530928612, "learning_rate": 2.349969081440575e-06, "loss": 0.0025, "num_input_tokens_seen": 166860192, "step": 77380 }, { "epoch": 14.201688383189577, "grad_norm": 0.006238402333110571, "learning_rate": 2.349290078749834e-06, "loss": 0.1503, "num_input_tokens_seen": 166870464, "step": 77385 }, { "epoch": 14.202605982749128, "grad_norm": 0.43557217717170715, "learning_rate": 2.348611144045692e-06, "loss": 0.3167, "num_input_tokens_seen": 166882112, "step": 77390 }, { "epoch": 14.20352358230868, "grad_norm": 20.5117130279541, "learning_rate": 2.347932277345567e-06, "loss": 0.0049, "num_input_tokens_seen": 166893600, "step": 77395 }, { "epoch": 14.204441181868233, "grad_norm": 55.63541030883789, "learning_rate": 2.347253478666869e-06, "loss": 0.2165, "num_input_tokens_seen": 166906592, "step": 77400 }, { "epoch": 14.205358781427785, "grad_norm": 0.019315727055072784, "learning_rate": 2.3465747480270072e-06, "loss": 0.1501, "num_input_tokens_seen": 166917312, "step": 77405 }, { "epoch": 14.206276380987337, "grad_norm": 0.01719236746430397, "learning_rate": 2.3458960854433895e-06, "loss": 0.012, "num_input_tokens_seen": 166927520, "step": 77410 }, { "epoch": 14.20719398054689, "grad_norm": 0.028927870094776154, "learning_rate": 2.3452174909334254e-06, "loss": 0.2631, "num_input_tokens_seen": 166938336, "step": 77415 }, { "epoch": 14.208111580106442, "grad_norm": 0.019921010360121727, "learning_rate": 2.344538964514518e-06, "loss": 0.1379, "num_input_tokens_seen": 166949088, "step": 77420 }, { "epoch": 14.209029179665993, "grad_norm": 0.0015638222685083747, "learning_rate": 2.3438605062040687e-06, "loss": 0.0026, "num_input_tokens_seen": 166959936, "step": 77425 }, { "epoch": 14.209946779225547, "grad_norm": 175.4614715576172, "learning_rate": 2.3431821160194824e-06, "loss": 0.0285, "num_input_tokens_seen": 166970656, "step": 77430 }, { "epoch": 14.210864378785098, "grad_norm": 0.12703272700309753, "learning_rate": 2.3425037939781576e-06, "loss": 0.0001, "num_input_tokens_seen": 166981792, "step": 77435 }, { "epoch": 14.21178197834465, "grad_norm": 0.00275598606094718, "learning_rate": 2.3418255400974893e-06, "loss": 0.0003, "num_input_tokens_seen": 166993440, "step": 77440 }, { "epoch": 14.212699577904203, "grad_norm": 93.96505737304688, "learning_rate": 2.3411473543948787e-06, "loss": 0.0285, "num_input_tokens_seen": 167004768, "step": 77445 }, { "epoch": 14.213617177463755, "grad_norm": 0.01602860726416111, "learning_rate": 2.340469236887717e-06, "loss": 0.0001, "num_input_tokens_seen": 167015744, "step": 77450 }, { "epoch": 14.214534777023307, "grad_norm": 0.01296901423484087, "learning_rate": 2.3397911875933978e-06, "loss": 0.0006, "num_input_tokens_seen": 167026048, "step": 77455 }, { "epoch": 14.21545237658286, "grad_norm": 0.0009254455799236894, "learning_rate": 2.3391132065293113e-06, "loss": 0.0003, "num_input_tokens_seen": 167035616, "step": 77460 }, { "epoch": 14.216369976142412, "grad_norm": 155.7975311279297, "learning_rate": 2.338435293712846e-06, "loss": 0.3689, "num_input_tokens_seen": 167046720, "step": 77465 }, { "epoch": 14.217287575701963, "grad_norm": 7.3998494148254395, "learning_rate": 2.3377574491613915e-06, "loss": 0.0038, "num_input_tokens_seen": 167056864, "step": 77470 }, { "epoch": 14.218205175261517, "grad_norm": 0.009384353645145893, "learning_rate": 2.3370796728923323e-06, "loss": 0.163, "num_input_tokens_seen": 167068640, "step": 77475 }, { "epoch": 14.219122774821068, "grad_norm": 0.1815713346004486, "learning_rate": 2.3364019649230526e-06, "loss": 0.1683, "num_input_tokens_seen": 167080704, "step": 77480 }, { "epoch": 14.22004037438062, "grad_norm": 0.027326779440045357, "learning_rate": 2.3357243252709345e-06, "loss": 0.0001, "num_input_tokens_seen": 167090944, "step": 77485 }, { "epoch": 14.220957973940173, "grad_norm": 0.33207523822784424, "learning_rate": 2.3350467539533557e-06, "loss": 0.0005, "num_input_tokens_seen": 167102080, "step": 77490 }, { "epoch": 14.221875573499725, "grad_norm": 0.006344163324683905, "learning_rate": 2.3343692509877e-06, "loss": 0.0004, "num_input_tokens_seen": 167111776, "step": 77495 }, { "epoch": 14.222793173059276, "grad_norm": 0.005433869082480669, "learning_rate": 2.333691816391341e-06, "loss": 0.0064, "num_input_tokens_seen": 167121952, "step": 77500 }, { "epoch": 14.22371077261883, "grad_norm": 94.66447448730469, "learning_rate": 2.3330144501816547e-06, "loss": 0.0196, "num_input_tokens_seen": 167131264, "step": 77505 }, { "epoch": 14.224628372178381, "grad_norm": 0.013785031624138355, "learning_rate": 2.3323371523760125e-06, "loss": 0.0, "num_input_tokens_seen": 167141824, "step": 77510 }, { "epoch": 14.225545971737933, "grad_norm": 0.003395227249711752, "learning_rate": 2.3316599229917898e-06, "loss": 0.038, "num_input_tokens_seen": 167152352, "step": 77515 }, { "epoch": 14.226463571297487, "grad_norm": 0.003572941292077303, "learning_rate": 2.3309827620463545e-06, "loss": 0.0002, "num_input_tokens_seen": 167163360, "step": 77520 }, { "epoch": 14.227381170857038, "grad_norm": 0.002127019688487053, "learning_rate": 2.330305669557073e-06, "loss": 0.2001, "num_input_tokens_seen": 167174880, "step": 77525 }, { "epoch": 14.22829877041659, "grad_norm": 0.02758149988949299, "learning_rate": 2.3296286455413148e-06, "loss": 0.0946, "num_input_tokens_seen": 167186624, "step": 77530 }, { "epoch": 14.229216369976143, "grad_norm": 0.4148574471473694, "learning_rate": 2.328951690016444e-06, "loss": 0.0003, "num_input_tokens_seen": 167197664, "step": 77535 }, { "epoch": 14.230133969535695, "grad_norm": 92.06391906738281, "learning_rate": 2.3282748029998213e-06, "loss": 0.1067, "num_input_tokens_seen": 167209184, "step": 77540 }, { "epoch": 14.231051569095246, "grad_norm": 0.018355848267674446, "learning_rate": 2.3275979845088083e-06, "loss": 0.0072, "num_input_tokens_seen": 167220256, "step": 77545 }, { "epoch": 14.2319691686548, "grad_norm": 0.015666097402572632, "learning_rate": 2.3269212345607666e-06, "loss": 0.0002, "num_input_tokens_seen": 167231168, "step": 77550 }, { "epoch": 14.232886768214351, "grad_norm": 0.0155301159247756, "learning_rate": 2.3262445531730526e-06, "loss": 0.0002, "num_input_tokens_seen": 167242944, "step": 77555 }, { "epoch": 14.233804367773903, "grad_norm": 0.01846730336546898, "learning_rate": 2.32556794036302e-06, "loss": 0.0001, "num_input_tokens_seen": 167253952, "step": 77560 }, { "epoch": 14.234721967333456, "grad_norm": 0.008778347633779049, "learning_rate": 2.3248913961480263e-06, "loss": 0.3474, "num_input_tokens_seen": 167262912, "step": 77565 }, { "epoch": 14.235639566893008, "grad_norm": 24.6779842376709, "learning_rate": 2.3242149205454223e-06, "loss": 0.0011, "num_input_tokens_seen": 167274016, "step": 77570 }, { "epoch": 14.23655716645256, "grad_norm": 0.04009794443845749, "learning_rate": 2.3235385135725567e-06, "loss": 0.0427, "num_input_tokens_seen": 167284992, "step": 77575 }, { "epoch": 14.237474766012113, "grad_norm": 1.605027198791504, "learning_rate": 2.322862175246782e-06, "loss": 0.0902, "num_input_tokens_seen": 167296224, "step": 77580 }, { "epoch": 14.238392365571665, "grad_norm": 0.03897086903452873, "learning_rate": 2.3221859055854433e-06, "loss": 0.0002, "num_input_tokens_seen": 167306528, "step": 77585 }, { "epoch": 14.239309965131216, "grad_norm": 0.08131545782089233, "learning_rate": 2.321509704605886e-06, "loss": 0.0002, "num_input_tokens_seen": 167316864, "step": 77590 }, { "epoch": 14.24022756469077, "grad_norm": 0.023870648816227913, "learning_rate": 2.3208335723254518e-06, "loss": 0.0019, "num_input_tokens_seen": 167328000, "step": 77595 }, { "epoch": 14.241145164250321, "grad_norm": 34.06802749633789, "learning_rate": 2.3201575087614854e-06, "loss": 0.003, "num_input_tokens_seen": 167339104, "step": 77600 }, { "epoch": 14.242062763809873, "grad_norm": 306.73626708984375, "learning_rate": 2.319481513931326e-06, "loss": 0.1631, "num_input_tokens_seen": 167350240, "step": 77605 }, { "epoch": 14.242980363369426, "grad_norm": 0.030514782294631004, "learning_rate": 2.3188055878523093e-06, "loss": 0.0001, "num_input_tokens_seen": 167359744, "step": 77610 }, { "epoch": 14.243897962928978, "grad_norm": 0.36203843355178833, "learning_rate": 2.3181297305417753e-06, "loss": 0.1288, "num_input_tokens_seen": 167370624, "step": 77615 }, { "epoch": 14.24481556248853, "grad_norm": 0.08599653840065002, "learning_rate": 2.317453942017058e-06, "loss": 0.0578, "num_input_tokens_seen": 167382176, "step": 77620 }, { "epoch": 14.245733162048083, "grad_norm": 2.226792097091675, "learning_rate": 2.316778222295487e-06, "loss": 0.0007, "num_input_tokens_seen": 167392512, "step": 77625 }, { "epoch": 14.246650761607635, "grad_norm": 0.07449281960725784, "learning_rate": 2.316102571394398e-06, "loss": 0.0009, "num_input_tokens_seen": 167402464, "step": 77630 }, { "epoch": 14.247568361167186, "grad_norm": 0.25552552938461304, "learning_rate": 2.3154269893311186e-06, "loss": 0.0003, "num_input_tokens_seen": 167412448, "step": 77635 }, { "epoch": 14.24848596072674, "grad_norm": 0.021526066586375237, "learning_rate": 2.314751476122976e-06, "loss": 0.0001, "num_input_tokens_seen": 167421760, "step": 77640 }, { "epoch": 14.249403560286291, "grad_norm": 0.0013839883031323552, "learning_rate": 2.3140760317872947e-06, "loss": 0.0002, "num_input_tokens_seen": 167432288, "step": 77645 }, { "epoch": 14.250321159845843, "grad_norm": 192.90118408203125, "learning_rate": 2.3134006563414017e-06, "loss": 0.048, "num_input_tokens_seen": 167444128, "step": 77650 }, { "epoch": 14.251238759405396, "grad_norm": 0.05930033326148987, "learning_rate": 2.312725349802618e-06, "loss": 0.0004, "num_input_tokens_seen": 167455424, "step": 77655 }, { "epoch": 14.252156358964948, "grad_norm": 0.5124499797821045, "learning_rate": 2.3120501121882634e-06, "loss": 0.1833, "num_input_tokens_seen": 167465600, "step": 77660 }, { "epoch": 14.2530739585245, "grad_norm": 17.154390335083008, "learning_rate": 2.311374943515658e-06, "loss": 0.0029, "num_input_tokens_seen": 167475872, "step": 77665 }, { "epoch": 14.253991558084053, "grad_norm": 27.01528549194336, "learning_rate": 2.3106998438021187e-06, "loss": 0.1204, "num_input_tokens_seen": 167485888, "step": 77670 }, { "epoch": 14.254909157643604, "grad_norm": 0.9361067414283752, "learning_rate": 2.310024813064961e-06, "loss": 0.0009, "num_input_tokens_seen": 167496256, "step": 77675 }, { "epoch": 14.255826757203156, "grad_norm": 0.024372821673750877, "learning_rate": 2.3093498513214974e-06, "loss": 0.2335, "num_input_tokens_seen": 167507296, "step": 77680 }, { "epoch": 14.25674435676271, "grad_norm": 0.003132425481453538, "learning_rate": 2.3086749585890377e-06, "loss": 0.2502, "num_input_tokens_seen": 167518016, "step": 77685 }, { "epoch": 14.257661956322261, "grad_norm": 0.007963505573570728, "learning_rate": 2.3080001348848966e-06, "loss": 0.0011, "num_input_tokens_seen": 167528032, "step": 77690 }, { "epoch": 14.258579555881813, "grad_norm": 0.0012544706696644425, "learning_rate": 2.3073253802263794e-06, "loss": 0.0018, "num_input_tokens_seen": 167537600, "step": 77695 }, { "epoch": 14.259497155441366, "grad_norm": 0.0812460333108902, "learning_rate": 2.306650694630793e-06, "loss": 0.009, "num_input_tokens_seen": 167547776, "step": 77700 }, { "epoch": 14.260414755000918, "grad_norm": 0.0071700322441756725, "learning_rate": 2.3059760781154424e-06, "loss": 0.0071, "num_input_tokens_seen": 167557984, "step": 77705 }, { "epoch": 14.26133235456047, "grad_norm": 5.847973346710205, "learning_rate": 2.305301530697628e-06, "loss": 0.0007, "num_input_tokens_seen": 167569280, "step": 77710 }, { "epoch": 14.262249954120023, "grad_norm": 0.010556341148912907, "learning_rate": 2.3046270523946545e-06, "loss": 0.0001, "num_input_tokens_seen": 167580000, "step": 77715 }, { "epoch": 14.263167553679574, "grad_norm": 0.03587052598595619, "learning_rate": 2.3039526432238197e-06, "loss": 0.0014, "num_input_tokens_seen": 167590016, "step": 77720 }, { "epoch": 14.264085153239126, "grad_norm": 0.10203081369400024, "learning_rate": 2.3032783032024208e-06, "loss": 0.0001, "num_input_tokens_seen": 167599872, "step": 77725 }, { "epoch": 14.26500275279868, "grad_norm": 0.0019750138744711876, "learning_rate": 2.3026040323477528e-06, "loss": 0.0825, "num_input_tokens_seen": 167611584, "step": 77730 }, { "epoch": 14.265920352358231, "grad_norm": 0.04426048323512077, "learning_rate": 2.301929830677112e-06, "loss": 0.0003, "num_input_tokens_seen": 167621952, "step": 77735 }, { "epoch": 14.266837951917783, "grad_norm": 10.512885093688965, "learning_rate": 2.30125569820779e-06, "loss": 0.0028, "num_input_tokens_seen": 167632576, "step": 77740 }, { "epoch": 14.267755551477336, "grad_norm": 0.6033229231834412, "learning_rate": 2.300581634957074e-06, "loss": 0.0021, "num_input_tokens_seen": 167643072, "step": 77745 }, { "epoch": 14.268673151036888, "grad_norm": 0.036980871111154556, "learning_rate": 2.2999076409422585e-06, "loss": 0.0546, "num_input_tokens_seen": 167654784, "step": 77750 }, { "epoch": 14.26959075059644, "grad_norm": 31.53419303894043, "learning_rate": 2.2992337161806262e-06, "loss": 0.0072, "num_input_tokens_seen": 167665888, "step": 77755 }, { "epoch": 14.270508350155993, "grad_norm": 95.9739990234375, "learning_rate": 2.2985598606894615e-06, "loss": 0.1943, "num_input_tokens_seen": 167676576, "step": 77760 }, { "epoch": 14.271425949715544, "grad_norm": 0.007094975095242262, "learning_rate": 2.2978860744860514e-06, "loss": 0.0001, "num_input_tokens_seen": 167688448, "step": 77765 }, { "epoch": 14.272343549275096, "grad_norm": 0.02696133404970169, "learning_rate": 2.2972123575876757e-06, "loss": 0.0823, "num_input_tokens_seen": 167698560, "step": 77770 }, { "epoch": 14.27326114883465, "grad_norm": 0.5557658076286316, "learning_rate": 2.2965387100116145e-06, "loss": 0.0002, "num_input_tokens_seen": 167709504, "step": 77775 }, { "epoch": 14.2741787483942, "grad_norm": 8.695050239562988, "learning_rate": 2.295865131775143e-06, "loss": 0.0062, "num_input_tokens_seen": 167719584, "step": 77780 }, { "epoch": 14.275096347953752, "grad_norm": 0.06129708141088486, "learning_rate": 2.2951916228955416e-06, "loss": 0.0002, "num_input_tokens_seen": 167730496, "step": 77785 }, { "epoch": 14.276013947513306, "grad_norm": 0.0032701706513762474, "learning_rate": 2.294518183390083e-06, "loss": 0.0, "num_input_tokens_seen": 167741632, "step": 77790 }, { "epoch": 14.276931547072857, "grad_norm": 511.6439208984375, "learning_rate": 2.293844813276039e-06, "loss": 0.029, "num_input_tokens_seen": 167752512, "step": 77795 }, { "epoch": 14.27784914663241, "grad_norm": 0.011614480055868626, "learning_rate": 2.293171512570682e-06, "loss": 0.1502, "num_input_tokens_seen": 167762048, "step": 77800 }, { "epoch": 14.278766746191963, "grad_norm": 20.204099655151367, "learning_rate": 2.292498281291281e-06, "loss": 0.0788, "num_input_tokens_seen": 167772032, "step": 77805 }, { "epoch": 14.279684345751514, "grad_norm": 0.005193828605115414, "learning_rate": 2.291825119455101e-06, "loss": 0.0973, "num_input_tokens_seen": 167783968, "step": 77810 }, { "epoch": 14.280601945311066, "grad_norm": 67.5462875366211, "learning_rate": 2.2911520270794114e-06, "loss": 0.3798, "num_input_tokens_seen": 167793760, "step": 77815 }, { "epoch": 14.28151954487062, "grad_norm": 0.19360031187534332, "learning_rate": 2.2904790041814734e-06, "loss": 0.0064, "num_input_tokens_seen": 167804832, "step": 77820 }, { "epoch": 14.28243714443017, "grad_norm": 0.020869459956884384, "learning_rate": 2.28980605077855e-06, "loss": 0.0, "num_input_tokens_seen": 167816032, "step": 77825 }, { "epoch": 14.283354743989722, "grad_norm": 0.011566486209630966, "learning_rate": 2.289133166887899e-06, "loss": 0.0323, "num_input_tokens_seen": 167827424, "step": 77830 }, { "epoch": 14.284272343549276, "grad_norm": 0.002547415206208825, "learning_rate": 2.288460352526783e-06, "loss": 0.2188, "num_input_tokens_seen": 167837792, "step": 77835 }, { "epoch": 14.285189943108827, "grad_norm": 0.4295458495616913, "learning_rate": 2.287787607712456e-06, "loss": 0.2253, "num_input_tokens_seen": 167849152, "step": 77840 }, { "epoch": 14.286107542668379, "grad_norm": 0.010238558985292912, "learning_rate": 2.287114932462172e-06, "loss": 0.0042, "num_input_tokens_seen": 167858944, "step": 77845 }, { "epoch": 14.287025142227932, "grad_norm": 0.003953773062676191, "learning_rate": 2.286442326793187e-06, "loss": 0.0002, "num_input_tokens_seen": 167869184, "step": 77850 }, { "epoch": 14.287942741787484, "grad_norm": 0.004645271226763725, "learning_rate": 2.2857697907227504e-06, "loss": 0.3478, "num_input_tokens_seen": 167880416, "step": 77855 }, { "epoch": 14.288860341347036, "grad_norm": 0.04385652393102646, "learning_rate": 2.285097324268112e-06, "loss": 0.0001, "num_input_tokens_seen": 167892416, "step": 77860 }, { "epoch": 14.289777940906589, "grad_norm": 0.032778557389974594, "learning_rate": 2.284424927446518e-06, "loss": 0.1439, "num_input_tokens_seen": 167905024, "step": 77865 }, { "epoch": 14.29069554046614, "grad_norm": 0.007814718410372734, "learning_rate": 2.2837526002752176e-06, "loss": 0.3751, "num_input_tokens_seen": 167915296, "step": 77870 }, { "epoch": 14.291613140025692, "grad_norm": 0.08175189048051834, "learning_rate": 2.2830803427714533e-06, "loss": 0.0003, "num_input_tokens_seen": 167925568, "step": 77875 }, { "epoch": 14.292530739585246, "grad_norm": 0.012674939818680286, "learning_rate": 2.2824081549524654e-06, "loss": 0.0479, "num_input_tokens_seen": 167936800, "step": 77880 }, { "epoch": 14.293448339144797, "grad_norm": 3.88775372505188, "learning_rate": 2.281736036835498e-06, "loss": 0.37, "num_input_tokens_seen": 167947520, "step": 77885 }, { "epoch": 14.294365938704349, "grad_norm": 0.05818713456392288, "learning_rate": 2.281063988437789e-06, "loss": 0.0001, "num_input_tokens_seen": 167959264, "step": 77890 }, { "epoch": 14.295283538263902, "grad_norm": 0.0031542873475700617, "learning_rate": 2.280392009776574e-06, "loss": 0.1067, "num_input_tokens_seen": 167969344, "step": 77895 }, { "epoch": 14.296201137823454, "grad_norm": 0.054910872131586075, "learning_rate": 2.2797201008690893e-06, "loss": 0.0003, "num_input_tokens_seen": 167980288, "step": 77900 }, { "epoch": 14.297118737383006, "grad_norm": 0.19582420587539673, "learning_rate": 2.279048261732566e-06, "loss": 0.0002, "num_input_tokens_seen": 167991040, "step": 77905 }, { "epoch": 14.298036336942559, "grad_norm": 0.001942023285664618, "learning_rate": 2.27837649238424e-06, "loss": 0.0279, "num_input_tokens_seen": 168002880, "step": 77910 }, { "epoch": 14.29895393650211, "grad_norm": 0.0028133539017289877, "learning_rate": 2.277704792841338e-06, "loss": 0.0001, "num_input_tokens_seen": 168014048, "step": 77915 }, { "epoch": 14.299871536061662, "grad_norm": 0.06083173304796219, "learning_rate": 2.2770331631210894e-06, "loss": 0.0886, "num_input_tokens_seen": 168025440, "step": 77920 }, { "epoch": 14.300789135621216, "grad_norm": 0.10206246376037598, "learning_rate": 2.27636160324072e-06, "loss": 0.0001, "num_input_tokens_seen": 168036320, "step": 77925 }, { "epoch": 14.301706735180767, "grad_norm": 0.10709013789892197, "learning_rate": 2.2756901132174525e-06, "loss": 0.155, "num_input_tokens_seen": 168047296, "step": 77930 }, { "epoch": 14.302624334740319, "grad_norm": 0.06999877840280533, "learning_rate": 2.2750186930685124e-06, "loss": 0.0352, "num_input_tokens_seen": 168058048, "step": 77935 }, { "epoch": 14.303541934299872, "grad_norm": 0.03389628976583481, "learning_rate": 2.27434734281112e-06, "loss": 0.1042, "num_input_tokens_seen": 168070144, "step": 77940 }, { "epoch": 14.304459533859424, "grad_norm": 4.661059856414795, "learning_rate": 2.273676062462492e-06, "loss": 0.0609, "num_input_tokens_seen": 168081216, "step": 77945 }, { "epoch": 14.305377133418975, "grad_norm": 1.6612721681594849, "learning_rate": 2.2730048520398494e-06, "loss": 0.0162, "num_input_tokens_seen": 168091296, "step": 77950 }, { "epoch": 14.306294732978529, "grad_norm": 0.1611967831850052, "learning_rate": 2.272333711560406e-06, "loss": 0.144, "num_input_tokens_seen": 168101536, "step": 77955 }, { "epoch": 14.30721233253808, "grad_norm": 0.007431416772305965, "learning_rate": 2.2716626410413755e-06, "loss": 0.2195, "num_input_tokens_seen": 168112608, "step": 77960 }, { "epoch": 14.308129932097632, "grad_norm": 31.774259567260742, "learning_rate": 2.2709916404999677e-06, "loss": 0.0508, "num_input_tokens_seen": 168124640, "step": 77965 }, { "epoch": 14.309047531657185, "grad_norm": 1.502852439880371, "learning_rate": 2.2703207099533963e-06, "loss": 0.0003, "num_input_tokens_seen": 168135360, "step": 77970 }, { "epoch": 14.309965131216737, "grad_norm": 0.07353806495666504, "learning_rate": 2.2696498494188685e-06, "loss": 0.0002, "num_input_tokens_seen": 168145792, "step": 77975 }, { "epoch": 14.310882730776289, "grad_norm": 0.4218945801258087, "learning_rate": 2.2689790589135884e-06, "loss": 0.0003, "num_input_tokens_seen": 168157824, "step": 77980 }, { "epoch": 14.311800330335842, "grad_norm": 7.027462482452393, "learning_rate": 2.268308338454765e-06, "loss": 0.0194, "num_input_tokens_seen": 168168640, "step": 77985 }, { "epoch": 14.312717929895394, "grad_norm": 0.013143893331289291, "learning_rate": 2.2676376880595985e-06, "loss": 0.0098, "num_input_tokens_seen": 168180192, "step": 77990 }, { "epoch": 14.313635529454945, "grad_norm": 79.23714447021484, "learning_rate": 2.2669671077452906e-06, "loss": 0.1238, "num_input_tokens_seen": 168191104, "step": 77995 }, { "epoch": 14.314553129014499, "grad_norm": 0.015515511855483055, "learning_rate": 2.2662965975290386e-06, "loss": 0.0006, "num_input_tokens_seen": 168201952, "step": 78000 }, { "epoch": 14.31547072857405, "grad_norm": 0.19657108187675476, "learning_rate": 2.265626157428044e-06, "loss": 0.0007, "num_input_tokens_seen": 168212416, "step": 78005 }, { "epoch": 14.316388328133602, "grad_norm": 0.01910465769469738, "learning_rate": 2.2649557874595007e-06, "loss": 0.0014, "num_input_tokens_seen": 168222464, "step": 78010 }, { "epoch": 14.317305927693155, "grad_norm": 0.005232697352766991, "learning_rate": 2.2642854876406e-06, "loss": 0.0001, "num_input_tokens_seen": 168232608, "step": 78015 }, { "epoch": 14.318223527252707, "grad_norm": 86.8827133178711, "learning_rate": 2.2636152579885395e-06, "loss": 0.1078, "num_input_tokens_seen": 168242208, "step": 78020 }, { "epoch": 14.319141126812259, "grad_norm": 0.00248767645098269, "learning_rate": 2.262945098520506e-06, "loss": 0.0026, "num_input_tokens_seen": 168253280, "step": 78025 }, { "epoch": 14.320058726371812, "grad_norm": 0.004199168644845486, "learning_rate": 2.2622750092536866e-06, "loss": 0.0002, "num_input_tokens_seen": 168264608, "step": 78030 }, { "epoch": 14.320976325931364, "grad_norm": 11.686163902282715, "learning_rate": 2.2616049902052723e-06, "loss": 0.1496, "num_input_tokens_seen": 168275520, "step": 78035 }, { "epoch": 14.321893925490915, "grad_norm": 0.004949287977069616, "learning_rate": 2.260935041392446e-06, "loss": 0.0001, "num_input_tokens_seen": 168285888, "step": 78040 }, { "epoch": 14.322811525050469, "grad_norm": 106.07421112060547, "learning_rate": 2.2602651628323905e-06, "loss": 0.2438, "num_input_tokens_seen": 168297888, "step": 78045 }, { "epoch": 14.32372912461002, "grad_norm": 42.39870834350586, "learning_rate": 2.2595953545422855e-06, "loss": 0.0026, "num_input_tokens_seen": 168309984, "step": 78050 }, { "epoch": 14.324646724169572, "grad_norm": 0.016588952392339706, "learning_rate": 2.258925616539314e-06, "loss": 0.1989, "num_input_tokens_seen": 168320832, "step": 78055 }, { "epoch": 14.325564323729125, "grad_norm": 0.019518226385116577, "learning_rate": 2.258255948840653e-06, "loss": 0.0033, "num_input_tokens_seen": 168332224, "step": 78060 }, { "epoch": 14.326481923288677, "grad_norm": 0.02272035926580429, "learning_rate": 2.2575863514634748e-06, "loss": 0.002, "num_input_tokens_seen": 168341856, "step": 78065 }, { "epoch": 14.327399522848228, "grad_norm": 0.0025402463506907225, "learning_rate": 2.256916824424959e-06, "loss": 0.0003, "num_input_tokens_seen": 168352864, "step": 78070 }, { "epoch": 14.328317122407782, "grad_norm": 0.013508453033864498, "learning_rate": 2.2562473677422745e-06, "loss": 0.0001, "num_input_tokens_seen": 168363424, "step": 78075 }, { "epoch": 14.329234721967333, "grad_norm": 0.02091580256819725, "learning_rate": 2.2555779814325913e-06, "loss": 0.0408, "num_input_tokens_seen": 168373632, "step": 78080 }, { "epoch": 14.330152321526885, "grad_norm": 0.044537827372550964, "learning_rate": 2.2549086655130815e-06, "loss": 0.0704, "num_input_tokens_seen": 168384576, "step": 78085 }, { "epoch": 14.331069921086439, "grad_norm": 0.03509775921702385, "learning_rate": 2.2542394200009095e-06, "loss": 0.0814, "num_input_tokens_seen": 168395488, "step": 78090 }, { "epoch": 14.33198752064599, "grad_norm": 0.004615663550794125, "learning_rate": 2.253570244913241e-06, "loss": 0.0001, "num_input_tokens_seen": 168406624, "step": 78095 }, { "epoch": 14.332905120205542, "grad_norm": 0.18161258101463318, "learning_rate": 2.2529011402672367e-06, "loss": 0.0004, "num_input_tokens_seen": 168416512, "step": 78100 }, { "epoch": 14.333822719765095, "grad_norm": 0.07220727950334549, "learning_rate": 2.252232106080063e-06, "loss": 0.0208, "num_input_tokens_seen": 168426880, "step": 78105 }, { "epoch": 14.334740319324647, "grad_norm": 0.0015523971524089575, "learning_rate": 2.2515631423688766e-06, "loss": 0.0003, "num_input_tokens_seen": 168437504, "step": 78110 }, { "epoch": 14.335657918884198, "grad_norm": 0.21977779269218445, "learning_rate": 2.2508942491508364e-06, "loss": 0.0004, "num_input_tokens_seen": 168448064, "step": 78115 }, { "epoch": 14.336575518443752, "grad_norm": 0.007198826875537634, "learning_rate": 2.250225426443098e-06, "loss": 0.1227, "num_input_tokens_seen": 168459584, "step": 78120 }, { "epoch": 14.337493118003303, "grad_norm": 47.85853958129883, "learning_rate": 2.2495566742628133e-06, "loss": 0.0031, "num_input_tokens_seen": 168470528, "step": 78125 }, { "epoch": 14.338410717562855, "grad_norm": 0.0272983368486166, "learning_rate": 2.2488879926271396e-06, "loss": 0.0051, "num_input_tokens_seen": 168480960, "step": 78130 }, { "epoch": 14.339328317122408, "grad_norm": 0.00813671387732029, "learning_rate": 2.2482193815532246e-06, "loss": 0.0003, "num_input_tokens_seen": 168492256, "step": 78135 }, { "epoch": 14.34024591668196, "grad_norm": 0.002358352532610297, "learning_rate": 2.2475508410582176e-06, "loss": 0.0, "num_input_tokens_seen": 168502784, "step": 78140 }, { "epoch": 14.341163516241512, "grad_norm": 0.002024336252361536, "learning_rate": 2.2468823711592656e-06, "loss": 0.3845, "num_input_tokens_seen": 168513152, "step": 78145 }, { "epoch": 14.342081115801065, "grad_norm": 39.42775344848633, "learning_rate": 2.246213971873512e-06, "loss": 0.1345, "num_input_tokens_seen": 168524128, "step": 78150 }, { "epoch": 14.342998715360617, "grad_norm": 0.02099403738975525, "learning_rate": 2.245545643218104e-06, "loss": 0.0001, "num_input_tokens_seen": 168534272, "step": 78155 }, { "epoch": 14.343916314920168, "grad_norm": 0.004962288308888674, "learning_rate": 2.2448773852101814e-06, "loss": 0.0, "num_input_tokens_seen": 168546048, "step": 78160 }, { "epoch": 14.344833914479722, "grad_norm": 0.008968953974545002, "learning_rate": 2.2442091978668817e-06, "loss": 0.1483, "num_input_tokens_seen": 168555328, "step": 78165 }, { "epoch": 14.345751514039273, "grad_norm": 173.8177032470703, "learning_rate": 2.243541081205347e-06, "loss": 0.1319, "num_input_tokens_seen": 168566592, "step": 78170 }, { "epoch": 14.346669113598825, "grad_norm": 0.0803305134177208, "learning_rate": 2.2428730352427112e-06, "loss": 0.0028, "num_input_tokens_seen": 168577568, "step": 78175 }, { "epoch": 14.347586713158378, "grad_norm": 0.09286992251873016, "learning_rate": 2.242205059996109e-06, "loss": 0.0004, "num_input_tokens_seen": 168588480, "step": 78180 }, { "epoch": 14.34850431271793, "grad_norm": 0.009350379928946495, "learning_rate": 2.2415371554826714e-06, "loss": 0.1333, "num_input_tokens_seen": 168597984, "step": 78185 }, { "epoch": 14.349421912277482, "grad_norm": 0.0024502365849912167, "learning_rate": 2.240869321719532e-06, "loss": 0.0002, "num_input_tokens_seen": 168608960, "step": 78190 }, { "epoch": 14.350339511837035, "grad_norm": 0.06135276332497597, "learning_rate": 2.240201558723818e-06, "loss": 0.0099, "num_input_tokens_seen": 168620288, "step": 78195 }, { "epoch": 14.351257111396587, "grad_norm": 0.02451394684612751, "learning_rate": 2.2395338665126554e-06, "loss": 0.0003, "num_input_tokens_seen": 168631200, "step": 78200 }, { "epoch": 14.352174710956138, "grad_norm": 0.019539128988981247, "learning_rate": 2.238866245103172e-06, "loss": 0.1347, "num_input_tokens_seen": 168641216, "step": 78205 }, { "epoch": 14.353092310515692, "grad_norm": 0.11194244772195816, "learning_rate": 2.2381986945124907e-06, "loss": 0.0012, "num_input_tokens_seen": 168651648, "step": 78210 }, { "epoch": 14.354009910075243, "grad_norm": 0.018506737425923347, "learning_rate": 2.23753121475773e-06, "loss": 0.0001, "num_input_tokens_seen": 168662368, "step": 78215 }, { "epoch": 14.354927509634795, "grad_norm": 0.004779094830155373, "learning_rate": 2.2368638058560145e-06, "loss": 0.0003, "num_input_tokens_seen": 168673728, "step": 78220 }, { "epoch": 14.355845109194348, "grad_norm": 0.03709897771477699, "learning_rate": 2.23619646782446e-06, "loss": 0.0891, "num_input_tokens_seen": 168685120, "step": 78225 }, { "epoch": 14.3567627087539, "grad_norm": 0.29348626732826233, "learning_rate": 2.235529200680182e-06, "loss": 0.1766, "num_input_tokens_seen": 168696800, "step": 78230 }, { "epoch": 14.357680308313451, "grad_norm": 0.007519652601331472, "learning_rate": 2.2348620044402943e-06, "loss": 0.0026, "num_input_tokens_seen": 168706240, "step": 78235 }, { "epoch": 14.358597907873005, "grad_norm": 0.3350641429424286, "learning_rate": 2.234194879121912e-06, "loss": 0.1491, "num_input_tokens_seen": 168715936, "step": 78240 }, { "epoch": 14.359515507432556, "grad_norm": 0.10188877582550049, "learning_rate": 2.233527824742145e-06, "loss": 0.0009, "num_input_tokens_seen": 168726176, "step": 78245 }, { "epoch": 14.360433106992108, "grad_norm": 28.483436584472656, "learning_rate": 2.232860841318099e-06, "loss": 0.002, "num_input_tokens_seen": 168737408, "step": 78250 }, { "epoch": 14.361350706551661, "grad_norm": 0.01404806412756443, "learning_rate": 2.232193928866886e-06, "loss": 0.1067, "num_input_tokens_seen": 168748672, "step": 78255 }, { "epoch": 14.362268306111213, "grad_norm": 2.7702691555023193, "learning_rate": 2.2315270874056088e-06, "loss": 0.0209, "num_input_tokens_seen": 168759712, "step": 78260 }, { "epoch": 14.363185905670765, "grad_norm": 0.001947698532603681, "learning_rate": 2.2308603169513698e-06, "loss": 0.0011, "num_input_tokens_seen": 168771296, "step": 78265 }, { "epoch": 14.364103505230318, "grad_norm": 0.014853877946734428, "learning_rate": 2.2301936175212737e-06, "loss": 0.1506, "num_input_tokens_seen": 168781056, "step": 78270 }, { "epoch": 14.36502110478987, "grad_norm": 0.8893593549728394, "learning_rate": 2.229526989132418e-06, "loss": 0.1864, "num_input_tokens_seen": 168791680, "step": 78275 }, { "epoch": 14.365938704349421, "grad_norm": 0.3654244840145111, "learning_rate": 2.228860431801901e-06, "loss": 0.0005, "num_input_tokens_seen": 168802272, "step": 78280 }, { "epoch": 14.366856303908975, "grad_norm": 8.194669723510742, "learning_rate": 2.228193945546818e-06, "loss": 0.0022, "num_input_tokens_seen": 168813152, "step": 78285 }, { "epoch": 14.367773903468526, "grad_norm": 0.005916507914662361, "learning_rate": 2.2275275303842654e-06, "loss": 0.0674, "num_input_tokens_seen": 168823680, "step": 78290 }, { "epoch": 14.368691503028078, "grad_norm": 89.55409240722656, "learning_rate": 2.226861186331335e-06, "loss": 0.2001, "num_input_tokens_seen": 168833696, "step": 78295 }, { "epoch": 14.369609102587631, "grad_norm": 0.008305019699037075, "learning_rate": 2.226194913405115e-06, "loss": 0.1067, "num_input_tokens_seen": 168844448, "step": 78300 }, { "epoch": 14.370526702147183, "grad_norm": 0.005386021453887224, "learning_rate": 2.2255287116226994e-06, "loss": 0.0734, "num_input_tokens_seen": 168854912, "step": 78305 }, { "epoch": 14.371444301706735, "grad_norm": 0.028532397001981735, "learning_rate": 2.2248625810011716e-06, "loss": 0.0208, "num_input_tokens_seen": 168865792, "step": 78310 }, { "epoch": 14.372361901266288, "grad_norm": 12.021190643310547, "learning_rate": 2.2241965215576173e-06, "loss": 0.0618, "num_input_tokens_seen": 168877824, "step": 78315 }, { "epoch": 14.37327950082584, "grad_norm": 0.020736100152134895, "learning_rate": 2.223530533309119e-06, "loss": 0.071, "num_input_tokens_seen": 168888768, "step": 78320 }, { "epoch": 14.374197100385391, "grad_norm": 0.06638490408658981, "learning_rate": 2.2228646162727606e-06, "loss": 0.0174, "num_input_tokens_seen": 168899136, "step": 78325 }, { "epoch": 14.375114699944945, "grad_norm": 0.22182819247245789, "learning_rate": 2.2221987704656204e-06, "loss": 0.0002, "num_input_tokens_seen": 168910912, "step": 78330 }, { "epoch": 14.376032299504496, "grad_norm": 0.030488261952996254, "learning_rate": 2.221532995904777e-06, "loss": 0.0004, "num_input_tokens_seen": 168921984, "step": 78335 }, { "epoch": 14.376949899064048, "grad_norm": 62.69768524169922, "learning_rate": 2.2208672926073062e-06, "loss": 0.1778, "num_input_tokens_seen": 168933664, "step": 78340 }, { "epoch": 14.377867498623601, "grad_norm": 0.0018100510351359844, "learning_rate": 2.2202016605902816e-06, "loss": 0.0002, "num_input_tokens_seen": 168944192, "step": 78345 }, { "epoch": 14.378785098183153, "grad_norm": 0.012436993420124054, "learning_rate": 2.2195360998707747e-06, "loss": 0.0177, "num_input_tokens_seen": 168954400, "step": 78350 }, { "epoch": 14.379702697742704, "grad_norm": 0.010586360469460487, "learning_rate": 2.218870610465859e-06, "loss": 0.0017, "num_input_tokens_seen": 168965408, "step": 78355 }, { "epoch": 14.380620297302258, "grad_norm": 0.002487974939867854, "learning_rate": 2.218205192392602e-06, "loss": 0.0051, "num_input_tokens_seen": 168976928, "step": 78360 }, { "epoch": 14.38153789686181, "grad_norm": 0.45668795704841614, "learning_rate": 2.21753984566807e-06, "loss": 0.0066, "num_input_tokens_seen": 168988800, "step": 78365 }, { "epoch": 14.382455496421361, "grad_norm": 0.034404635429382324, "learning_rate": 2.2168745703093273e-06, "loss": 0.0003, "num_input_tokens_seen": 168998880, "step": 78370 }, { "epoch": 14.383373095980915, "grad_norm": 0.06957761943340302, "learning_rate": 2.21620936633344e-06, "loss": 0.0003, "num_input_tokens_seen": 169010208, "step": 78375 }, { "epoch": 14.384290695540466, "grad_norm": 162.087890625, "learning_rate": 2.2155442337574677e-06, "loss": 0.0269, "num_input_tokens_seen": 169021728, "step": 78380 }, { "epoch": 14.385208295100018, "grad_norm": 35.06028747558594, "learning_rate": 2.214879172598469e-06, "loss": 0.257, "num_input_tokens_seen": 169033472, "step": 78385 }, { "epoch": 14.386125894659571, "grad_norm": 9.551383972167969, "learning_rate": 2.2142141828735047e-06, "loss": 0.0008, "num_input_tokens_seen": 169044480, "step": 78390 }, { "epoch": 14.387043494219123, "grad_norm": 0.14048999547958374, "learning_rate": 2.2135492645996286e-06, "loss": 0.0714, "num_input_tokens_seen": 169054880, "step": 78395 }, { "epoch": 14.387961093778674, "grad_norm": 0.06905052810907364, "learning_rate": 2.212884417793894e-06, "loss": 0.1439, "num_input_tokens_seen": 169066976, "step": 78400 }, { "epoch": 14.388878693338228, "grad_norm": 0.1876363754272461, "learning_rate": 2.212219642473356e-06, "loss": 0.1378, "num_input_tokens_seen": 169076928, "step": 78405 }, { "epoch": 14.38979629289778, "grad_norm": 0.004172143526375294, "learning_rate": 2.2115549386550635e-06, "loss": 0.0278, "num_input_tokens_seen": 169088448, "step": 78410 }, { "epoch": 14.390713892457331, "grad_norm": 0.0023314696736633778, "learning_rate": 2.2108903063560648e-06, "loss": 0.0436, "num_input_tokens_seen": 169098880, "step": 78415 }, { "epoch": 14.391631492016884, "grad_norm": 0.012347367592155933, "learning_rate": 2.2102257455934056e-06, "loss": 0.0021, "num_input_tokens_seen": 169109024, "step": 78420 }, { "epoch": 14.392549091576436, "grad_norm": 0.002826019888743758, "learning_rate": 2.209561256384134e-06, "loss": 0.0533, "num_input_tokens_seen": 169120384, "step": 78425 }, { "epoch": 14.393466691135988, "grad_norm": 12.831299781799316, "learning_rate": 2.2088968387452915e-06, "loss": 0.0026, "num_input_tokens_seen": 169130880, "step": 78430 }, { "epoch": 14.394384290695541, "grad_norm": 0.007777240127325058, "learning_rate": 2.208232492693917e-06, "loss": 0.1191, "num_input_tokens_seen": 169141984, "step": 78435 }, { "epoch": 14.395301890255093, "grad_norm": 0.012774557806551456, "learning_rate": 2.207568218247054e-06, "loss": 0.0001, "num_input_tokens_seen": 169152992, "step": 78440 }, { "epoch": 14.396219489814644, "grad_norm": 0.0030343160033226013, "learning_rate": 2.2069040154217392e-06, "loss": 0.0004, "num_input_tokens_seen": 169164640, "step": 78445 }, { "epoch": 14.397137089374198, "grad_norm": 4.37041711807251, "learning_rate": 2.2062398842350067e-06, "loss": 0.0105, "num_input_tokens_seen": 169175840, "step": 78450 }, { "epoch": 14.39805468893375, "grad_norm": 0.0057206223718822, "learning_rate": 2.20557582470389e-06, "loss": 0.0001, "num_input_tokens_seen": 169185696, "step": 78455 }, { "epoch": 14.398972288493301, "grad_norm": 0.0018794446950778365, "learning_rate": 2.2049118368454236e-06, "loss": 0.0012, "num_input_tokens_seen": 169194976, "step": 78460 }, { "epoch": 14.399889888052854, "grad_norm": 0.14009670913219452, "learning_rate": 2.2042479206766372e-06, "loss": 0.0004, "num_input_tokens_seen": 169205280, "step": 78465 }, { "epoch": 14.400807487612406, "grad_norm": 0.0029743195045739412, "learning_rate": 2.2035840762145566e-06, "loss": 0.0011, "num_input_tokens_seen": 169215136, "step": 78470 }, { "epoch": 14.401725087171958, "grad_norm": 0.003226021770387888, "learning_rate": 2.202920303476212e-06, "loss": 0.0003, "num_input_tokens_seen": 169226304, "step": 78475 }, { "epoch": 14.402642686731511, "grad_norm": 0.008031598292291164, "learning_rate": 2.202256602478627e-06, "loss": 0.1785, "num_input_tokens_seen": 169235904, "step": 78480 }, { "epoch": 14.403560286291063, "grad_norm": 73.89794158935547, "learning_rate": 2.2015929732388214e-06, "loss": 0.2127, "num_input_tokens_seen": 169246560, "step": 78485 }, { "epoch": 14.404477885850614, "grad_norm": 2.586515426635742, "learning_rate": 2.2009294157738214e-06, "loss": 0.0004, "num_input_tokens_seen": 169258016, "step": 78490 }, { "epoch": 14.405395485410168, "grad_norm": 0.32080313563346863, "learning_rate": 2.2002659301006434e-06, "loss": 0.0002, "num_input_tokens_seen": 169268960, "step": 78495 }, { "epoch": 14.40631308496972, "grad_norm": 0.0855884850025177, "learning_rate": 2.1996025162363056e-06, "loss": 0.1412, "num_input_tokens_seen": 169279488, "step": 78500 }, { "epoch": 14.40723068452927, "grad_norm": 0.15898749232292175, "learning_rate": 2.1989391741978206e-06, "loss": 0.0174, "num_input_tokens_seen": 169290976, "step": 78505 }, { "epoch": 14.408148284088824, "grad_norm": 81.42037963867188, "learning_rate": 2.1982759040022066e-06, "loss": 0.1392, "num_input_tokens_seen": 169301248, "step": 78510 }, { "epoch": 14.409065883648376, "grad_norm": 0.009946080856025219, "learning_rate": 2.1976127056664732e-06, "loss": 0.0002, "num_input_tokens_seen": 169310976, "step": 78515 }, { "epoch": 14.409983483207927, "grad_norm": 0.038604628294706345, "learning_rate": 2.196949579207629e-06, "loss": 0.0149, "num_input_tokens_seen": 169321312, "step": 78520 }, { "epoch": 14.41090108276748, "grad_norm": 0.03767183795571327, "learning_rate": 2.1962865246426857e-06, "loss": 0.0001, "num_input_tokens_seen": 169331968, "step": 78525 }, { "epoch": 14.411818682327032, "grad_norm": 0.44332972168922424, "learning_rate": 2.1956235419886475e-06, "loss": 0.0015, "num_input_tokens_seen": 169343008, "step": 78530 }, { "epoch": 14.412736281886584, "grad_norm": 0.01170083973556757, "learning_rate": 2.194960631262519e-06, "loss": 0.0003, "num_input_tokens_seen": 169353088, "step": 78535 }, { "epoch": 14.413653881446137, "grad_norm": 5.764627933502197, "learning_rate": 2.194297792481303e-06, "loss": 0.0293, "num_input_tokens_seen": 169364096, "step": 78540 }, { "epoch": 14.414571481005689, "grad_norm": 0.004907415714114904, "learning_rate": 2.193635025661998e-06, "loss": 0.0165, "num_input_tokens_seen": 169374688, "step": 78545 }, { "epoch": 14.41548908056524, "grad_norm": 0.0021025831811130047, "learning_rate": 2.1929723308216074e-06, "loss": 0.0001, "num_input_tokens_seen": 169384864, "step": 78550 }, { "epoch": 14.416406680124794, "grad_norm": 0.5023736953735352, "learning_rate": 2.192309707977126e-06, "loss": 0.0097, "num_input_tokens_seen": 169396032, "step": 78555 }, { "epoch": 14.417324279684346, "grad_norm": 0.07344950735569, "learning_rate": 2.191647157145549e-06, "loss": 0.0003, "num_input_tokens_seen": 169408192, "step": 78560 }, { "epoch": 14.418241879243897, "grad_norm": 0.004153443966060877, "learning_rate": 2.19098467834387e-06, "loss": 0.0003, "num_input_tokens_seen": 169418112, "step": 78565 }, { "epoch": 14.41915947880345, "grad_norm": 0.07499681413173676, "learning_rate": 2.190322271589078e-06, "loss": 0.0001, "num_input_tokens_seen": 169429216, "step": 78570 }, { "epoch": 14.420077078363002, "grad_norm": 0.0016658551758155227, "learning_rate": 2.1896599368981674e-06, "loss": 0.0008, "num_input_tokens_seen": 169440608, "step": 78575 }, { "epoch": 14.420994677922554, "grad_norm": 0.07204301655292511, "learning_rate": 2.1889976742881237e-06, "loss": 0.0154, "num_input_tokens_seen": 169450880, "step": 78580 }, { "epoch": 14.421912277482107, "grad_norm": 0.002698787022382021, "learning_rate": 2.1883354837759312e-06, "loss": 0.2655, "num_input_tokens_seen": 169462112, "step": 78585 }, { "epoch": 14.422829877041659, "grad_norm": 0.0072573148645460606, "learning_rate": 2.1876733653785776e-06, "loss": 0.0002, "num_input_tokens_seen": 169474016, "step": 78590 }, { "epoch": 14.42374747660121, "grad_norm": 0.0038806607481092215, "learning_rate": 2.187011319113044e-06, "loss": 0.0002, "num_input_tokens_seen": 169484928, "step": 78595 }, { "epoch": 14.424665076160764, "grad_norm": 0.016126364469528198, "learning_rate": 2.1863493449963098e-06, "loss": 0.0006, "num_input_tokens_seen": 169496256, "step": 78600 }, { "epoch": 14.425582675720316, "grad_norm": 1.1812360286712646, "learning_rate": 2.1856874430453522e-06, "loss": 0.0005, "num_input_tokens_seen": 169506272, "step": 78605 }, { "epoch": 14.426500275279867, "grad_norm": 0.026196923106908798, "learning_rate": 2.185025613277152e-06, "loss": 0.0001, "num_input_tokens_seen": 169515776, "step": 78610 }, { "epoch": 14.42741787483942, "grad_norm": 0.0018123979680240154, "learning_rate": 2.1843638557086816e-06, "loss": 0.0001, "num_input_tokens_seen": 169527904, "step": 78615 }, { "epoch": 14.428335474398972, "grad_norm": 0.0011980903800576925, "learning_rate": 2.1837021703569134e-06, "loss": 0.0003, "num_input_tokens_seen": 169538496, "step": 78620 }, { "epoch": 14.429253073958524, "grad_norm": 0.001076682354323566, "learning_rate": 2.1830405572388207e-06, "loss": 0.1036, "num_input_tokens_seen": 169548992, "step": 78625 }, { "epoch": 14.430170673518077, "grad_norm": 1202.8394775390625, "learning_rate": 2.182379016371372e-06, "loss": 0.0824, "num_input_tokens_seen": 169559136, "step": 78630 }, { "epoch": 14.431088273077629, "grad_norm": 1.6196659803390503, "learning_rate": 2.1817175477715352e-06, "loss": 0.0022, "num_input_tokens_seen": 169569536, "step": 78635 }, { "epoch": 14.43200587263718, "grad_norm": 0.007953650318086147, "learning_rate": 2.181056151456273e-06, "loss": 0.1067, "num_input_tokens_seen": 169580608, "step": 78640 }, { "epoch": 14.432923472196734, "grad_norm": 0.004442910198122263, "learning_rate": 2.1803948274425534e-06, "loss": 0.0002, "num_input_tokens_seen": 169592320, "step": 78645 }, { "epoch": 14.433841071756286, "grad_norm": 0.029106367379426956, "learning_rate": 2.1797335757473363e-06, "loss": 0.0001, "num_input_tokens_seen": 169602752, "step": 78650 }, { "epoch": 14.434758671315837, "grad_norm": 0.00271999672986567, "learning_rate": 2.1790723963875805e-06, "loss": 0.0853, "num_input_tokens_seen": 169613440, "step": 78655 }, { "epoch": 14.43567627087539, "grad_norm": 50.09831619262695, "learning_rate": 2.1784112893802474e-06, "loss": 0.0175, "num_input_tokens_seen": 169623744, "step": 78660 }, { "epoch": 14.436593870434942, "grad_norm": 0.0020355056039988995, "learning_rate": 2.1777502547422917e-06, "loss": 0.0081, "num_input_tokens_seen": 169633824, "step": 78665 }, { "epoch": 14.437511469994494, "grad_norm": 0.01521785743534565, "learning_rate": 2.1770892924906663e-06, "loss": 0.0001, "num_input_tokens_seen": 169643168, "step": 78670 }, { "epoch": 14.438429069554047, "grad_norm": 16.762357711791992, "learning_rate": 2.1764284026423266e-06, "loss": 0.3773, "num_input_tokens_seen": 169653376, "step": 78675 }, { "epoch": 14.439346669113599, "grad_norm": 0.07139110565185547, "learning_rate": 2.175767585214223e-06, "loss": 0.3392, "num_input_tokens_seen": 169663968, "step": 78680 }, { "epoch": 14.44026426867315, "grad_norm": 724.1702880859375, "learning_rate": 2.1751068402233033e-06, "loss": 0.1395, "num_input_tokens_seen": 169674912, "step": 78685 }, { "epoch": 14.441181868232704, "grad_norm": 161.8539581298828, "learning_rate": 2.174446167686513e-06, "loss": 0.1006, "num_input_tokens_seen": 169685984, "step": 78690 }, { "epoch": 14.442099467792255, "grad_norm": 0.031182119622826576, "learning_rate": 2.1737855676208016e-06, "loss": 0.0002, "num_input_tokens_seen": 169696736, "step": 78695 }, { "epoch": 14.443017067351807, "grad_norm": 0.00990872923284769, "learning_rate": 2.173125040043109e-06, "loss": 0.0001, "num_input_tokens_seen": 169707808, "step": 78700 }, { "epoch": 14.44393466691136, "grad_norm": 0.025732235983014107, "learning_rate": 2.1724645849703773e-06, "loss": 0.0001, "num_input_tokens_seen": 169718368, "step": 78705 }, { "epoch": 14.444852266470912, "grad_norm": 0.9853472709655762, "learning_rate": 2.171804202419548e-06, "loss": 0.2222, "num_input_tokens_seen": 169729056, "step": 78710 }, { "epoch": 14.445769866030464, "grad_norm": 0.001664361567236483, "learning_rate": 2.1711438924075578e-06, "loss": 0.2126, "num_input_tokens_seen": 169740640, "step": 78715 }, { "epoch": 14.446687465590017, "grad_norm": 42.2655143737793, "learning_rate": 2.1704836549513404e-06, "loss": 0.3821, "num_input_tokens_seen": 169752288, "step": 78720 }, { "epoch": 14.447605065149569, "grad_norm": 0.02733374573290348, "learning_rate": 2.169823490067834e-06, "loss": 0.0005, "num_input_tokens_seen": 169762816, "step": 78725 }, { "epoch": 14.44852266470912, "grad_norm": 0.011761598289012909, "learning_rate": 2.1691633977739683e-06, "loss": 0.2536, "num_input_tokens_seen": 169774464, "step": 78730 }, { "epoch": 14.449440264268674, "grad_norm": 0.3706357181072235, "learning_rate": 2.168503378086674e-06, "loss": 0.0002, "num_input_tokens_seen": 169784640, "step": 78735 }, { "epoch": 14.450357863828225, "grad_norm": 0.0006215580506250262, "learning_rate": 2.1678434310228787e-06, "loss": 0.001, "num_input_tokens_seen": 169794688, "step": 78740 }, { "epoch": 14.451275463387777, "grad_norm": 0.0013096454786136746, "learning_rate": 2.1671835565995107e-06, "loss": 0.0002, "num_input_tokens_seen": 169806368, "step": 78745 }, { "epoch": 14.45219306294733, "grad_norm": 0.010102134197950363, "learning_rate": 2.1665237548334943e-06, "loss": 0.0, "num_input_tokens_seen": 169817600, "step": 78750 }, { "epoch": 14.453110662506882, "grad_norm": 0.002366594271734357, "learning_rate": 2.1658640257417524e-06, "loss": 0.0005, "num_input_tokens_seen": 169829248, "step": 78755 }, { "epoch": 14.454028262066434, "grad_norm": 1.1856499910354614, "learning_rate": 2.1652043693412057e-06, "loss": 0.0004, "num_input_tokens_seen": 169839712, "step": 78760 }, { "epoch": 14.454945861625987, "grad_norm": 0.002989827888086438, "learning_rate": 2.1645447856487713e-06, "loss": 0.0119, "num_input_tokens_seen": 169850880, "step": 78765 }, { "epoch": 14.455863461185539, "grad_norm": 82.9876708984375, "learning_rate": 2.1638852746813706e-06, "loss": 0.0255, "num_input_tokens_seen": 169862048, "step": 78770 }, { "epoch": 14.45678106074509, "grad_norm": 3.2160661220550537, "learning_rate": 2.163225836455917e-06, "loss": 0.2671, "num_input_tokens_seen": 169872224, "step": 78775 }, { "epoch": 14.457698660304644, "grad_norm": 0.0042096273973584175, "learning_rate": 2.162566470989324e-06, "loss": 0.0001, "num_input_tokens_seen": 169882976, "step": 78780 }, { "epoch": 14.458616259864195, "grad_norm": 0.04345712810754776, "learning_rate": 2.1619071782985033e-06, "loss": 0.0001, "num_input_tokens_seen": 169892928, "step": 78785 }, { "epoch": 14.459533859423747, "grad_norm": 40.111305236816406, "learning_rate": 2.161247958400363e-06, "loss": 0.2221, "num_input_tokens_seen": 169903328, "step": 78790 }, { "epoch": 14.4604514589833, "grad_norm": 0.07975975424051285, "learning_rate": 2.160588811311815e-06, "loss": 0.1879, "num_input_tokens_seen": 169914048, "step": 78795 }, { "epoch": 14.461369058542852, "grad_norm": 0.12869718670845032, "learning_rate": 2.1599297370497637e-06, "loss": 0.0003, "num_input_tokens_seen": 169923616, "step": 78800 }, { "epoch": 14.462286658102403, "grad_norm": 0.005426549352705479, "learning_rate": 2.1592707356311103e-06, "loss": 0.0001, "num_input_tokens_seen": 169933984, "step": 78805 }, { "epoch": 14.463204257661957, "grad_norm": 0.02103770337998867, "learning_rate": 2.158611807072762e-06, "loss": 0.0001, "num_input_tokens_seen": 169944736, "step": 78810 }, { "epoch": 14.464121857221508, "grad_norm": 0.19388218224048615, "learning_rate": 2.157952951391617e-06, "loss": 0.1885, "num_input_tokens_seen": 169955872, "step": 78815 }, { "epoch": 14.46503945678106, "grad_norm": 0.2572786211967468, "learning_rate": 2.157294168604574e-06, "loss": 0.1441, "num_input_tokens_seen": 169967008, "step": 78820 }, { "epoch": 14.465957056340613, "grad_norm": 0.020110510289669037, "learning_rate": 2.1566354587285283e-06, "loss": 0.1292, "num_input_tokens_seen": 169978432, "step": 78825 }, { "epoch": 14.466874655900165, "grad_norm": 0.0052892714738845825, "learning_rate": 2.1559768217803777e-06, "loss": 0.0003, "num_input_tokens_seen": 169990272, "step": 78830 }, { "epoch": 14.467792255459717, "grad_norm": 0.33206620812416077, "learning_rate": 2.155318257777014e-06, "loss": 0.0332, "num_input_tokens_seen": 170001184, "step": 78835 }, { "epoch": 14.46870985501927, "grad_norm": 0.37003427743911743, "learning_rate": 2.1546597667353257e-06, "loss": 0.1933, "num_input_tokens_seen": 170011456, "step": 78840 }, { "epoch": 14.469627454578822, "grad_norm": 0.23610207438468933, "learning_rate": 2.1540013486722073e-06, "loss": 0.0026, "num_input_tokens_seen": 170021504, "step": 78845 }, { "epoch": 14.470545054138373, "grad_norm": 72.12947082519531, "learning_rate": 2.1533430036045427e-06, "loss": 0.0882, "num_input_tokens_seen": 170033120, "step": 78850 }, { "epoch": 14.471462653697927, "grad_norm": 0.013917336240410805, "learning_rate": 2.1526847315492165e-06, "loss": 0.0001, "num_input_tokens_seen": 170043744, "step": 78855 }, { "epoch": 14.472380253257478, "grad_norm": 318.4729919433594, "learning_rate": 2.152026532523116e-06, "loss": 0.1896, "num_input_tokens_seen": 170055200, "step": 78860 }, { "epoch": 14.47329785281703, "grad_norm": 14.233729362487793, "learning_rate": 2.1513684065431207e-06, "loss": 0.0018, "num_input_tokens_seen": 170064416, "step": 78865 }, { "epoch": 14.474215452376583, "grad_norm": 3.188579559326172, "learning_rate": 2.150710353626111e-06, "loss": 0.1171, "num_input_tokens_seen": 170075392, "step": 78870 }, { "epoch": 14.475133051936135, "grad_norm": 0.017574148252606392, "learning_rate": 2.150052373788963e-06, "loss": 0.1863, "num_input_tokens_seen": 170086176, "step": 78875 }, { "epoch": 14.476050651495687, "grad_norm": 0.032475944608449936, "learning_rate": 2.1493944670485562e-06, "loss": 0.0001, "num_input_tokens_seen": 170096544, "step": 78880 }, { "epoch": 14.47696825105524, "grad_norm": 0.00828227587044239, "learning_rate": 2.1487366334217628e-06, "loss": 0.0001, "num_input_tokens_seen": 170106496, "step": 78885 }, { "epoch": 14.477885850614792, "grad_norm": 164.62326049804688, "learning_rate": 2.148078872925455e-06, "loss": 0.1444, "num_input_tokens_seen": 170118144, "step": 78890 }, { "epoch": 14.478803450174343, "grad_norm": 135.7006072998047, "learning_rate": 2.1474211855765055e-06, "loss": 0.1124, "num_input_tokens_seen": 170129856, "step": 78895 }, { "epoch": 14.479721049733897, "grad_norm": 0.0006056732963770628, "learning_rate": 2.1467635713917807e-06, "loss": 0.1025, "num_input_tokens_seen": 170139872, "step": 78900 }, { "epoch": 14.480638649293448, "grad_norm": 0.026523537933826447, "learning_rate": 2.146106030388147e-06, "loss": 0.0002, "num_input_tokens_seen": 170150848, "step": 78905 }, { "epoch": 14.481556248853, "grad_norm": 0.0052807945758104324, "learning_rate": 2.1454485625824724e-06, "loss": 0.0034, "num_input_tokens_seen": 170161632, "step": 78910 }, { "epoch": 14.482473848412553, "grad_norm": 1.2657082080841064, "learning_rate": 2.1447911679916177e-06, "loss": 0.0003, "num_input_tokens_seen": 170171968, "step": 78915 }, { "epoch": 14.483391447972105, "grad_norm": 5.733610153198242, "learning_rate": 2.1441338466324445e-06, "loss": 0.1493, "num_input_tokens_seen": 170183424, "step": 78920 }, { "epoch": 14.484309047531656, "grad_norm": 0.005419709254056215, "learning_rate": 2.1434765985218103e-06, "loss": 0.0001, "num_input_tokens_seen": 170194912, "step": 78925 }, { "epoch": 14.48522664709121, "grad_norm": 0.02555202506482601, "learning_rate": 2.142819423676576e-06, "loss": 0.0003, "num_input_tokens_seen": 170205120, "step": 78930 }, { "epoch": 14.486144246650762, "grad_norm": 0.00471559539437294, "learning_rate": 2.1421623221135947e-06, "loss": 0.2314, "num_input_tokens_seen": 170216160, "step": 78935 }, { "epoch": 14.487061846210313, "grad_norm": 16.185789108276367, "learning_rate": 2.1415052938497195e-06, "loss": 0.0043, "num_input_tokens_seen": 170226880, "step": 78940 }, { "epoch": 14.487979445769867, "grad_norm": 0.030559014528989792, "learning_rate": 2.1408483389018043e-06, "loss": 0.0005, "num_input_tokens_seen": 170237024, "step": 78945 }, { "epoch": 14.488897045329418, "grad_norm": 0.005248154979199171, "learning_rate": 2.1401914572866983e-06, "loss": 0.2563, "num_input_tokens_seen": 170247328, "step": 78950 }, { "epoch": 14.48981464488897, "grad_norm": 0.010185244493186474, "learning_rate": 2.1395346490212493e-06, "loss": 0.269, "num_input_tokens_seen": 170257824, "step": 78955 }, { "epoch": 14.490732244448523, "grad_norm": 0.0015304874395951629, "learning_rate": 2.138877914122301e-06, "loss": 0.0001, "num_input_tokens_seen": 170267616, "step": 78960 }, { "epoch": 14.491649844008075, "grad_norm": 0.04565238580107689, "learning_rate": 2.138221252606702e-06, "loss": 0.065, "num_input_tokens_seen": 170278496, "step": 78965 }, { "epoch": 14.492567443567626, "grad_norm": 0.18648511171340942, "learning_rate": 2.1375646644912925e-06, "loss": 0.1848, "num_input_tokens_seen": 170289024, "step": 78970 }, { "epoch": 14.49348504312718, "grad_norm": 98.53815460205078, "learning_rate": 2.1369081497929127e-06, "loss": 0.0516, "num_input_tokens_seen": 170299296, "step": 78975 }, { "epoch": 14.494402642686731, "grad_norm": 0.005269615910947323, "learning_rate": 2.136251708528402e-06, "loss": 0.0005, "num_input_tokens_seen": 170310720, "step": 78980 }, { "epoch": 14.495320242246283, "grad_norm": 0.006155157927423716, "learning_rate": 2.1355953407145947e-06, "loss": 0.0001, "num_input_tokens_seen": 170322336, "step": 78985 }, { "epoch": 14.496237841805836, "grad_norm": 0.04099705070257187, "learning_rate": 2.134939046368329e-06, "loss": 0.0003, "num_input_tokens_seen": 170333760, "step": 78990 }, { "epoch": 14.497155441365388, "grad_norm": 0.0028557151090353727, "learning_rate": 2.1342828255064362e-06, "loss": 0.3376, "num_input_tokens_seen": 170344224, "step": 78995 }, { "epoch": 14.49807304092494, "grad_norm": 0.25471776723861694, "learning_rate": 2.1336266781457475e-06, "loss": 0.0022, "num_input_tokens_seen": 170354720, "step": 79000 }, { "epoch": 14.498990640484493, "grad_norm": 48.45550537109375, "learning_rate": 2.1329706043030924e-06, "loss": 0.3112, "num_input_tokens_seen": 170365184, "step": 79005 }, { "epoch": 14.499908240044045, "grad_norm": 0.0030786588322371244, "learning_rate": 2.132314603995296e-06, "loss": 0.0002, "num_input_tokens_seen": 170377728, "step": 79010 }, { "epoch": 14.500825839603596, "grad_norm": 0.0013366511557251215, "learning_rate": 2.1316586772391866e-06, "loss": 0.1037, "num_input_tokens_seen": 170387904, "step": 79015 }, { "epoch": 14.50174343916315, "grad_norm": 0.14142967760562897, "learning_rate": 2.1310028240515874e-06, "loss": 0.1128, "num_input_tokens_seen": 170397088, "step": 79020 }, { "epoch": 14.502661038722701, "grad_norm": 73.54893493652344, "learning_rate": 2.130347044449317e-06, "loss": 0.0408, "num_input_tokens_seen": 170407680, "step": 79025 }, { "epoch": 14.503578638282253, "grad_norm": 67.16110229492188, "learning_rate": 2.129691338449199e-06, "loss": 0.008, "num_input_tokens_seen": 170417696, "step": 79030 }, { "epoch": 14.504496237841806, "grad_norm": 0.006796781439334154, "learning_rate": 2.1290357060680498e-06, "loss": 0.0001, "num_input_tokens_seen": 170427136, "step": 79035 }, { "epoch": 14.505413837401358, "grad_norm": 1.4771324396133423, "learning_rate": 2.1283801473226835e-06, "loss": 0.0011, "num_input_tokens_seen": 170439200, "step": 79040 }, { "epoch": 14.50633143696091, "grad_norm": 0.36750295758247375, "learning_rate": 2.1277246622299176e-06, "loss": 0.1952, "num_input_tokens_seen": 170448992, "step": 79045 }, { "epoch": 14.507249036520463, "grad_norm": 0.008817260153591633, "learning_rate": 2.127069250806562e-06, "loss": 0.0018, "num_input_tokens_seen": 170458304, "step": 79050 }, { "epoch": 14.508166636080015, "grad_norm": 0.34809598326683044, "learning_rate": 2.126413913069428e-06, "loss": 0.0003, "num_input_tokens_seen": 170469248, "step": 79055 }, { "epoch": 14.509084235639566, "grad_norm": 0.771975040435791, "learning_rate": 2.1257586490353216e-06, "loss": 0.1013, "num_input_tokens_seen": 170479776, "step": 79060 }, { "epoch": 14.51000183519912, "grad_norm": 0.03138037025928497, "learning_rate": 2.1251034587210527e-06, "loss": 0.1527, "num_input_tokens_seen": 170488704, "step": 79065 }, { "epoch": 14.510919434758671, "grad_norm": 0.000988313928246498, "learning_rate": 2.124448342143425e-06, "loss": 0.0001, "num_input_tokens_seen": 170499584, "step": 79070 }, { "epoch": 14.511837034318223, "grad_norm": 0.01870134472846985, "learning_rate": 2.1237932993192385e-06, "loss": 0.0008, "num_input_tokens_seen": 170509120, "step": 79075 }, { "epoch": 14.512754633877776, "grad_norm": 0.04414267838001251, "learning_rate": 2.1231383302652975e-06, "loss": 0.0001, "num_input_tokens_seen": 170519712, "step": 79080 }, { "epoch": 14.513672233437328, "grad_norm": 0.07005231082439423, "learning_rate": 2.1224834349984e-06, "loss": 0.0001, "num_input_tokens_seen": 170530016, "step": 79085 }, { "epoch": 14.51458983299688, "grad_norm": 0.0042247474193573, "learning_rate": 2.1218286135353427e-06, "loss": 0.0001, "num_input_tokens_seen": 170541568, "step": 79090 }, { "epoch": 14.515507432556433, "grad_norm": 1.1976641416549683, "learning_rate": 2.121173865892919e-06, "loss": 0.0003, "num_input_tokens_seen": 170552320, "step": 79095 }, { "epoch": 14.516425032115984, "grad_norm": 67.31148529052734, "learning_rate": 2.1205191920879254e-06, "loss": 0.1681, "num_input_tokens_seen": 170562400, "step": 79100 }, { "epoch": 14.517342631675536, "grad_norm": 0.03922216594219208, "learning_rate": 2.1198645921371517e-06, "loss": 0.0001, "num_input_tokens_seen": 170572256, "step": 79105 }, { "epoch": 14.51826023123509, "grad_norm": 0.030850812792778015, "learning_rate": 2.119210066057386e-06, "loss": 0.0823, "num_input_tokens_seen": 170583328, "step": 79110 }, { "epoch": 14.519177830794641, "grad_norm": 0.047102026641368866, "learning_rate": 2.1185556138654184e-06, "loss": 0.2283, "num_input_tokens_seen": 170593760, "step": 79115 }, { "epoch": 14.520095430354193, "grad_norm": 0.01308823935687542, "learning_rate": 2.1179012355780344e-06, "loss": 0.0734, "num_input_tokens_seen": 170605376, "step": 79120 }, { "epoch": 14.521013029913746, "grad_norm": 0.014328528195619583, "learning_rate": 2.1172469312120144e-06, "loss": 0.0, "num_input_tokens_seen": 170616512, "step": 79125 }, { "epoch": 14.521930629473298, "grad_norm": 61.99618911743164, "learning_rate": 2.1165927007841445e-06, "loss": 0.0885, "num_input_tokens_seen": 170626880, "step": 79130 }, { "epoch": 14.52284822903285, "grad_norm": 0.174168661236763, "learning_rate": 2.1159385443112033e-06, "loss": 0.0003, "num_input_tokens_seen": 170637664, "step": 79135 }, { "epoch": 14.523765828592403, "grad_norm": 0.0009447218617424369, "learning_rate": 2.1152844618099682e-06, "loss": 0.0175, "num_input_tokens_seen": 170648736, "step": 79140 }, { "epoch": 14.524683428151954, "grad_norm": 5.0376152992248535, "learning_rate": 2.1146304532972144e-06, "loss": 0.0006, "num_input_tokens_seen": 170658720, "step": 79145 }, { "epoch": 14.525601027711506, "grad_norm": 0.0027308594435453415, "learning_rate": 2.1139765187897195e-06, "loss": 0.104, "num_input_tokens_seen": 170668448, "step": 79150 }, { "epoch": 14.52651862727106, "grad_norm": 0.017176270484924316, "learning_rate": 2.1133226583042534e-06, "loss": 0.002, "num_input_tokens_seen": 170679840, "step": 79155 }, { "epoch": 14.527436226830611, "grad_norm": 0.03731481730937958, "learning_rate": 2.1126688718575857e-06, "loss": 0.0214, "num_input_tokens_seen": 170689856, "step": 79160 }, { "epoch": 14.528353826390163, "grad_norm": 15.024507522583008, "learning_rate": 2.112015159466488e-06, "loss": 0.0098, "num_input_tokens_seen": 170700832, "step": 79165 }, { "epoch": 14.529271425949716, "grad_norm": 0.016584889963269234, "learning_rate": 2.111361521147725e-06, "loss": 0.0001, "num_input_tokens_seen": 170711840, "step": 79170 }, { "epoch": 14.530189025509268, "grad_norm": 0.011742822825908661, "learning_rate": 2.1107079569180626e-06, "loss": 0.0913, "num_input_tokens_seen": 170722752, "step": 79175 }, { "epoch": 14.53110662506882, "grad_norm": 0.0033492485526949167, "learning_rate": 2.1100544667942617e-06, "loss": 0.0001, "num_input_tokens_seen": 170733472, "step": 79180 }, { "epoch": 14.532024224628373, "grad_norm": 8.472270965576172, "learning_rate": 2.109401050793086e-06, "loss": 0.0354, "num_input_tokens_seen": 170743744, "step": 79185 }, { "epoch": 14.532941824187924, "grad_norm": 0.0015480444999411702, "learning_rate": 2.1087477089312938e-06, "loss": 0.0005, "num_input_tokens_seen": 170753792, "step": 79190 }, { "epoch": 14.533859423747476, "grad_norm": 0.0030767270363867283, "learning_rate": 2.108094441225641e-06, "loss": 0.0004, "num_input_tokens_seen": 170765472, "step": 79195 }, { "epoch": 14.53477702330703, "grad_norm": 0.2392675280570984, "learning_rate": 2.1074412476928845e-06, "loss": 0.3418, "num_input_tokens_seen": 170777056, "step": 79200 }, { "epoch": 14.53569462286658, "grad_norm": 3.2561802864074707, "learning_rate": 2.1067881283497763e-06, "loss": 0.0892, "num_input_tokens_seen": 170787136, "step": 79205 }, { "epoch": 14.536612222426132, "grad_norm": 5.143986701965332, "learning_rate": 2.1061350832130673e-06, "loss": 0.0003, "num_input_tokens_seen": 170799168, "step": 79210 }, { "epoch": 14.537529821985686, "grad_norm": 0.05310153216123581, "learning_rate": 2.10548211229951e-06, "loss": 0.0216, "num_input_tokens_seen": 170809600, "step": 79215 }, { "epoch": 14.538447421545238, "grad_norm": 0.03295830637216568, "learning_rate": 2.1048292156258506e-06, "loss": 0.2221, "num_input_tokens_seen": 170820576, "step": 79220 }, { "epoch": 14.53936502110479, "grad_norm": 0.07834198325872421, "learning_rate": 2.104176393208834e-06, "loss": 0.0002, "num_input_tokens_seen": 170832064, "step": 79225 }, { "epoch": 14.540282620664343, "grad_norm": 0.14578334987163544, "learning_rate": 2.1035236450652037e-06, "loss": 0.0003, "num_input_tokens_seen": 170843200, "step": 79230 }, { "epoch": 14.541200220223894, "grad_norm": 301.184814453125, "learning_rate": 2.1028709712117045e-06, "loss": 0.1473, "num_input_tokens_seen": 170852544, "step": 79235 }, { "epoch": 14.542117819783446, "grad_norm": 0.0791507214307785, "learning_rate": 2.1022183716650744e-06, "loss": 0.0006, "num_input_tokens_seen": 170863840, "step": 79240 }, { "epoch": 14.543035419343, "grad_norm": 0.0027333491016179323, "learning_rate": 2.1015658464420503e-06, "loss": 0.0009, "num_input_tokens_seen": 170874560, "step": 79245 }, { "epoch": 14.54395301890255, "grad_norm": 0.0905257984995842, "learning_rate": 2.1009133955593717e-06, "loss": 0.059, "num_input_tokens_seen": 170886624, "step": 79250 }, { "epoch": 14.544870618462102, "grad_norm": 0.003878234652802348, "learning_rate": 2.100261019033772e-06, "loss": 0.0001, "num_input_tokens_seen": 170899584, "step": 79255 }, { "epoch": 14.545788218021656, "grad_norm": 0.003358085174113512, "learning_rate": 2.0996087168819803e-06, "loss": 0.0001, "num_input_tokens_seen": 170910944, "step": 79260 }, { "epoch": 14.546705817581207, "grad_norm": 0.051635000854730606, "learning_rate": 2.098956489120732e-06, "loss": 0.0001, "num_input_tokens_seen": 170921184, "step": 79265 }, { "epoch": 14.547623417140759, "grad_norm": 0.007378509733825922, "learning_rate": 2.0983043357667537e-06, "loss": 0.2688, "num_input_tokens_seen": 170931360, "step": 79270 }, { "epoch": 14.548541016700312, "grad_norm": 0.14464351534843445, "learning_rate": 2.097652256836772e-06, "loss": 0.0122, "num_input_tokens_seen": 170942208, "step": 79275 }, { "epoch": 14.549458616259864, "grad_norm": 0.07584551721811295, "learning_rate": 2.0970002523475093e-06, "loss": 0.0084, "num_input_tokens_seen": 170953472, "step": 79280 }, { "epoch": 14.550376215819416, "grad_norm": 110.18987274169922, "learning_rate": 2.0963483223156933e-06, "loss": 0.0762, "num_input_tokens_seen": 170964960, "step": 79285 }, { "epoch": 14.551293815378969, "grad_norm": 0.0016582469688728452, "learning_rate": 2.095696466758042e-06, "loss": 0.0, "num_input_tokens_seen": 170975872, "step": 79290 }, { "epoch": 14.55221141493852, "grad_norm": 4.86859130859375, "learning_rate": 2.095044685691274e-06, "loss": 0.0004, "num_input_tokens_seen": 170987520, "step": 79295 }, { "epoch": 14.553129014498072, "grad_norm": 0.019359102472662926, "learning_rate": 2.0943929791321086e-06, "loss": 0.0145, "num_input_tokens_seen": 170998240, "step": 79300 }, { "epoch": 14.554046614057626, "grad_norm": 206.41761779785156, "learning_rate": 2.0937413470972603e-06, "loss": 0.1006, "num_input_tokens_seen": 171007104, "step": 79305 }, { "epoch": 14.554964213617177, "grad_norm": 0.03293346241116524, "learning_rate": 2.0930897896034403e-06, "loss": 0.0002, "num_input_tokens_seen": 171018208, "step": 79310 }, { "epoch": 14.555881813176729, "grad_norm": 0.029501015320420265, "learning_rate": 2.0924383066673636e-06, "loss": 0.0733, "num_input_tokens_seen": 171028800, "step": 79315 }, { "epoch": 14.556799412736282, "grad_norm": 0.002423988888040185, "learning_rate": 2.091786898305739e-06, "loss": 0.033, "num_input_tokens_seen": 171038784, "step": 79320 }, { "epoch": 14.557717012295834, "grad_norm": 0.08225829899311066, "learning_rate": 2.091135564535272e-06, "loss": 0.1025, "num_input_tokens_seen": 171049504, "step": 79325 }, { "epoch": 14.558634611855386, "grad_norm": 0.10529977828264236, "learning_rate": 2.090484305372668e-06, "loss": 0.0001, "num_input_tokens_seen": 171060736, "step": 79330 }, { "epoch": 14.559552211414939, "grad_norm": 0.0038894556928426027, "learning_rate": 2.089833120834634e-06, "loss": 0.0057, "num_input_tokens_seen": 171070848, "step": 79335 }, { "epoch": 14.56046981097449, "grad_norm": 1.3543314933776855, "learning_rate": 2.08918201093787e-06, "loss": 0.001, "num_input_tokens_seen": 171082208, "step": 79340 }, { "epoch": 14.561387410534042, "grad_norm": 44.62328338623047, "learning_rate": 2.0885309756990747e-06, "loss": 0.1384, "num_input_tokens_seen": 171093312, "step": 79345 }, { "epoch": 14.562305010093596, "grad_norm": 0.00090705108596012, "learning_rate": 2.087880015134949e-06, "loss": 0.1439, "num_input_tokens_seen": 171104352, "step": 79350 }, { "epoch": 14.563222609653147, "grad_norm": 0.0008869416196830571, "learning_rate": 2.087229129262187e-06, "loss": 0.0002, "num_input_tokens_seen": 171115520, "step": 79355 }, { "epoch": 14.564140209212699, "grad_norm": 0.0037144944071769714, "learning_rate": 2.0865783180974825e-06, "loss": 0.0003, "num_input_tokens_seen": 171125856, "step": 79360 }, { "epoch": 14.565057808772252, "grad_norm": 0.5295230746269226, "learning_rate": 2.08592758165753e-06, "loss": 0.0015, "num_input_tokens_seen": 171136864, "step": 79365 }, { "epoch": 14.565975408331804, "grad_norm": 0.00174701155629009, "learning_rate": 2.0852769199590187e-06, "loss": 0.1951, "num_input_tokens_seen": 171147456, "step": 79370 }, { "epoch": 14.566893007891355, "grad_norm": 0.00643757451325655, "learning_rate": 2.0846263330186373e-06, "loss": 0.0002, "num_input_tokens_seen": 171158496, "step": 79375 }, { "epoch": 14.567810607450909, "grad_norm": 0.003761391621083021, "learning_rate": 2.0839758208530704e-06, "loss": 0.0001, "num_input_tokens_seen": 171169536, "step": 79380 }, { "epoch": 14.56872820701046, "grad_norm": 0.07274309545755386, "learning_rate": 2.083325383479005e-06, "loss": 0.0001, "num_input_tokens_seen": 171180800, "step": 79385 }, { "epoch": 14.569645806570012, "grad_norm": 0.1434982419013977, "learning_rate": 2.082675020913124e-06, "loss": 0.0004, "num_input_tokens_seen": 171191136, "step": 79390 }, { "epoch": 14.570563406129565, "grad_norm": 0.012799750082194805, "learning_rate": 2.0820247331721073e-06, "loss": 0.0145, "num_input_tokens_seen": 171202144, "step": 79395 }, { "epoch": 14.571481005689117, "grad_norm": 0.0044133844785392284, "learning_rate": 2.0813745202726315e-06, "loss": 0.3365, "num_input_tokens_seen": 171213216, "step": 79400 }, { "epoch": 14.572398605248669, "grad_norm": 0.06356470286846161, "learning_rate": 2.0807243822313776e-06, "loss": 0.1254, "num_input_tokens_seen": 171223712, "step": 79405 }, { "epoch": 14.573316204808222, "grad_norm": 0.009264957159757614, "learning_rate": 2.08007431906502e-06, "loss": 0.0002, "num_input_tokens_seen": 171235104, "step": 79410 }, { "epoch": 14.574233804367774, "grad_norm": 0.9737133979797363, "learning_rate": 2.079424330790229e-06, "loss": 0.0003, "num_input_tokens_seen": 171246016, "step": 79415 }, { "epoch": 14.575151403927325, "grad_norm": 0.049805186688899994, "learning_rate": 2.0787744174236784e-06, "loss": 0.0004, "num_input_tokens_seen": 171256736, "step": 79420 }, { "epoch": 14.576069003486879, "grad_norm": 2.9355709552764893, "learning_rate": 2.078124578982036e-06, "loss": 0.0003, "num_input_tokens_seen": 171266720, "step": 79425 }, { "epoch": 14.57698660304643, "grad_norm": 291.99420166015625, "learning_rate": 2.077474815481968e-06, "loss": 0.0223, "num_input_tokens_seen": 171276480, "step": 79430 }, { "epoch": 14.577904202605982, "grad_norm": 0.012487299740314484, "learning_rate": 2.0768251269401435e-06, "loss": 0.2721, "num_input_tokens_seen": 171286976, "step": 79435 }, { "epoch": 14.578821802165535, "grad_norm": 0.055270109325647354, "learning_rate": 2.0761755133732236e-06, "loss": 0.1502, "num_input_tokens_seen": 171297376, "step": 79440 }, { "epoch": 14.579739401725087, "grad_norm": 1.1031240224838257, "learning_rate": 2.075525974797869e-06, "loss": 0.0007, "num_input_tokens_seen": 171308096, "step": 79445 }, { "epoch": 14.580657001284639, "grad_norm": 0.0030291969887912273, "learning_rate": 2.074876511230742e-06, "loss": 0.0177, "num_input_tokens_seen": 171318784, "step": 79450 }, { "epoch": 14.581574600844192, "grad_norm": 0.009240320883691311, "learning_rate": 2.074227122688499e-06, "loss": 0.0, "num_input_tokens_seen": 171329216, "step": 79455 }, { "epoch": 14.582492200403744, "grad_norm": 0.05556633695960045, "learning_rate": 2.0735778091877963e-06, "loss": 0.0002, "num_input_tokens_seen": 171339968, "step": 79460 }, { "epoch": 14.583409799963295, "grad_norm": 0.0018263913225382566, "learning_rate": 2.0729285707452846e-06, "loss": 0.1283, "num_input_tokens_seen": 171350336, "step": 79465 }, { "epoch": 14.584327399522849, "grad_norm": 0.0011210974771529436, "learning_rate": 2.072279407377621e-06, "loss": 0.0064, "num_input_tokens_seen": 171361760, "step": 79470 }, { "epoch": 14.5852449990824, "grad_norm": 0.888823390007019, "learning_rate": 2.0716303191014527e-06, "loss": 0.0153, "num_input_tokens_seen": 171373472, "step": 79475 }, { "epoch": 14.586162598641952, "grad_norm": 0.014774501323699951, "learning_rate": 2.070981305933426e-06, "loss": 0.0003, "num_input_tokens_seen": 171383520, "step": 79480 }, { "epoch": 14.587080198201505, "grad_norm": 0.006707038264721632, "learning_rate": 2.0703323678901915e-06, "loss": 0.0054, "num_input_tokens_seen": 171393568, "step": 79485 }, { "epoch": 14.587997797761057, "grad_norm": 0.010670321062207222, "learning_rate": 2.069683504988391e-06, "loss": 0.0022, "num_input_tokens_seen": 171403872, "step": 79490 }, { "epoch": 14.588915397320608, "grad_norm": 0.867918848991394, "learning_rate": 2.0690347172446655e-06, "loss": 0.0289, "num_input_tokens_seen": 171414048, "step": 79495 }, { "epoch": 14.589832996880162, "grad_norm": 0.022646354511380196, "learning_rate": 2.0683860046756587e-06, "loss": 0.0002, "num_input_tokens_seen": 171424448, "step": 79500 }, { "epoch": 14.590750596439714, "grad_norm": 17.009830474853516, "learning_rate": 2.067737367298007e-06, "loss": 0.198, "num_input_tokens_seen": 171435744, "step": 79505 }, { "epoch": 14.591668195999265, "grad_norm": 0.03838900104165077, "learning_rate": 2.067088805128348e-06, "loss": 0.0001, "num_input_tokens_seen": 171446560, "step": 79510 }, { "epoch": 14.592585795558819, "grad_norm": 0.006924964487552643, "learning_rate": 2.0664403181833125e-06, "loss": 0.0, "num_input_tokens_seen": 171456032, "step": 79515 }, { "epoch": 14.59350339511837, "grad_norm": 0.010849403217434883, "learning_rate": 2.065791906479539e-06, "loss": 0.1074, "num_input_tokens_seen": 171466848, "step": 79520 }, { "epoch": 14.594420994677922, "grad_norm": 66.2965087890625, "learning_rate": 2.0651435700336554e-06, "loss": 0.4272, "num_input_tokens_seen": 171477664, "step": 79525 }, { "epoch": 14.595338594237475, "grad_norm": 0.27948036789894104, "learning_rate": 2.0644953088622882e-06, "loss": 0.0837, "num_input_tokens_seen": 171488384, "step": 79530 }, { "epoch": 14.596256193797027, "grad_norm": 0.6475727558135986, "learning_rate": 2.0638471229820687e-06, "loss": 0.0002, "num_input_tokens_seen": 171498912, "step": 79535 }, { "epoch": 14.597173793356578, "grad_norm": 0.007409397512674332, "learning_rate": 2.0631990124096203e-06, "loss": 0.0017, "num_input_tokens_seen": 171509056, "step": 79540 }, { "epoch": 14.598091392916132, "grad_norm": 0.008826549164950848, "learning_rate": 2.0625509771615655e-06, "loss": 0.0002, "num_input_tokens_seen": 171520992, "step": 79545 }, { "epoch": 14.599008992475683, "grad_norm": 113.66842651367188, "learning_rate": 2.0619030172545236e-06, "loss": 0.224, "num_input_tokens_seen": 171532416, "step": 79550 }, { "epoch": 14.599926592035235, "grad_norm": 0.0018757464131340384, "learning_rate": 2.061255132705117e-06, "loss": 0.0436, "num_input_tokens_seen": 171543840, "step": 79555 }, { "epoch": 14.600844191594788, "grad_norm": 141.9740753173828, "learning_rate": 2.0606073235299625e-06, "loss": 0.113, "num_input_tokens_seen": 171554560, "step": 79560 }, { "epoch": 14.60176179115434, "grad_norm": 0.1257772594690323, "learning_rate": 2.059959589745672e-06, "loss": 0.0008, "num_input_tokens_seen": 171566048, "step": 79565 }, { "epoch": 14.602679390713892, "grad_norm": 0.01993226259946823, "learning_rate": 2.0593119313688635e-06, "loss": 0.0001, "num_input_tokens_seen": 171577184, "step": 79570 }, { "epoch": 14.603596990273445, "grad_norm": 34.450801849365234, "learning_rate": 2.058664348416146e-06, "loss": 0.0031, "num_input_tokens_seen": 171588032, "step": 79575 }, { "epoch": 14.604514589832997, "grad_norm": 0.12762735784053802, "learning_rate": 2.0580168409041278e-06, "loss": 0.0823, "num_input_tokens_seen": 171598752, "step": 79580 }, { "epoch": 14.605432189392548, "grad_norm": 0.10309897363185883, "learning_rate": 2.05736940884942e-06, "loss": 0.0001, "num_input_tokens_seen": 171610720, "step": 79585 }, { "epoch": 14.606349788952102, "grad_norm": 0.5722911953926086, "learning_rate": 2.056722052268626e-06, "loss": 0.113, "num_input_tokens_seen": 171622176, "step": 79590 }, { "epoch": 14.607267388511653, "grad_norm": 0.012630069628357887, "learning_rate": 2.0560747711783497e-06, "loss": 0.1466, "num_input_tokens_seen": 171633440, "step": 79595 }, { "epoch": 14.608184988071205, "grad_norm": 0.014025725424289703, "learning_rate": 2.0554275655951903e-06, "loss": 0.0001, "num_input_tokens_seen": 171643072, "step": 79600 }, { "epoch": 14.609102587630758, "grad_norm": 0.009752086363732815, "learning_rate": 2.054780435535753e-06, "loss": 0.0001, "num_input_tokens_seen": 171654272, "step": 79605 }, { "epoch": 14.61002018719031, "grad_norm": 0.0026099705137312412, "learning_rate": 2.0541333810166326e-06, "loss": 0.0001, "num_input_tokens_seen": 171664064, "step": 79610 }, { "epoch": 14.610937786749862, "grad_norm": 0.013969382271170616, "learning_rate": 2.0534864020544247e-06, "loss": 0.0001, "num_input_tokens_seen": 171674304, "step": 79615 }, { "epoch": 14.611855386309415, "grad_norm": 0.00851359497755766, "learning_rate": 2.0528394986657247e-06, "loss": 0.0001, "num_input_tokens_seen": 171685920, "step": 79620 }, { "epoch": 14.612772985868967, "grad_norm": 56.87653350830078, "learning_rate": 2.0521926708671215e-06, "loss": 0.1696, "num_input_tokens_seen": 171696896, "step": 79625 }, { "epoch": 14.613690585428518, "grad_norm": 0.027233589440584183, "learning_rate": 2.0515459186752094e-06, "loss": 0.0005, "num_input_tokens_seen": 171707328, "step": 79630 }, { "epoch": 14.614608184988072, "grad_norm": 0.018848855048418045, "learning_rate": 2.0508992421065755e-06, "loss": 0.0002, "num_input_tokens_seen": 171718848, "step": 79635 }, { "epoch": 14.615525784547623, "grad_norm": 59.954124450683594, "learning_rate": 2.0502526411778046e-06, "loss": 0.3292, "num_input_tokens_seen": 171729504, "step": 79640 }, { "epoch": 14.616443384107175, "grad_norm": 0.006389648653566837, "learning_rate": 2.049606115905482e-06, "loss": 0.0007, "num_input_tokens_seen": 171739488, "step": 79645 }, { "epoch": 14.617360983666728, "grad_norm": 0.036885447800159454, "learning_rate": 2.0489596663061882e-06, "loss": 0.1192, "num_input_tokens_seen": 171749920, "step": 79650 }, { "epoch": 14.61827858322628, "grad_norm": 0.008529786951839924, "learning_rate": 2.048313292396507e-06, "loss": 0.0084, "num_input_tokens_seen": 171760288, "step": 79655 }, { "epoch": 14.619196182785831, "grad_norm": 0.0011967545142397285, "learning_rate": 2.047666994193015e-06, "loss": 0.0001, "num_input_tokens_seen": 171770848, "step": 79660 }, { "epoch": 14.620113782345385, "grad_norm": 0.40642544627189636, "learning_rate": 2.0470207717122875e-06, "loss": 0.0004, "num_input_tokens_seen": 171781248, "step": 79665 }, { "epoch": 14.621031381904936, "grad_norm": 0.00579847302287817, "learning_rate": 2.0463746249709016e-06, "loss": 0.0001, "num_input_tokens_seen": 171793120, "step": 79670 }, { "epoch": 14.621948981464488, "grad_norm": 0.24498316645622253, "learning_rate": 2.0457285539854295e-06, "loss": 0.0005, "num_input_tokens_seen": 171803520, "step": 79675 }, { "epoch": 14.622866581024041, "grad_norm": 0.008947131223976612, "learning_rate": 2.045082558772441e-06, "loss": 0.0001, "num_input_tokens_seen": 171813984, "step": 79680 }, { "epoch": 14.623784180583593, "grad_norm": 0.08104805648326874, "learning_rate": 2.044436639348503e-06, "loss": 0.0006, "num_input_tokens_seen": 171825632, "step": 79685 }, { "epoch": 14.624701780143145, "grad_norm": 0.003050362691283226, "learning_rate": 2.0437907957301873e-06, "loss": 0.0001, "num_input_tokens_seen": 171835840, "step": 79690 }, { "epoch": 14.625619379702698, "grad_norm": 0.00109002273529768, "learning_rate": 2.0431450279340554e-06, "loss": 0.1253, "num_input_tokens_seen": 171847808, "step": 79695 }, { "epoch": 14.62653697926225, "grad_norm": 0.12449220567941666, "learning_rate": 2.0424993359766685e-06, "loss": 0.0004, "num_input_tokens_seen": 171858656, "step": 79700 }, { "epoch": 14.627454578821801, "grad_norm": 0.005455284379422665, "learning_rate": 2.0418537198745927e-06, "loss": 0.0001, "num_input_tokens_seen": 171869376, "step": 79705 }, { "epoch": 14.628372178381355, "grad_norm": 0.0733933225274086, "learning_rate": 2.0412081796443834e-06, "loss": 0.0001, "num_input_tokens_seen": 171880512, "step": 79710 }, { "epoch": 14.629289777940906, "grad_norm": 296.2636413574219, "learning_rate": 2.0405627153025974e-06, "loss": 0.1563, "num_input_tokens_seen": 171890208, "step": 79715 }, { "epoch": 14.630207377500458, "grad_norm": 0.002458822214975953, "learning_rate": 2.0399173268657923e-06, "loss": 0.0033, "num_input_tokens_seen": 171900256, "step": 79720 }, { "epoch": 14.631124977060011, "grad_norm": 0.002064560540020466, "learning_rate": 2.039272014350521e-06, "loss": 0.0002, "num_input_tokens_seen": 171911936, "step": 79725 }, { "epoch": 14.632042576619563, "grad_norm": 0.24058330059051514, "learning_rate": 2.0386267777733325e-06, "loss": 0.0208, "num_input_tokens_seen": 171921984, "step": 79730 }, { "epoch": 14.632960176179115, "grad_norm": 0.0030966969206929207, "learning_rate": 2.0379816171507764e-06, "loss": 0.0006, "num_input_tokens_seen": 171933760, "step": 79735 }, { "epoch": 14.633877775738668, "grad_norm": 0.002134110312908888, "learning_rate": 2.0373365324994033e-06, "loss": 0.0001, "num_input_tokens_seen": 171944896, "step": 79740 }, { "epoch": 14.63479537529822, "grad_norm": 0.0018513966351747513, "learning_rate": 2.036691523835756e-06, "loss": 0.0002, "num_input_tokens_seen": 171955360, "step": 79745 }, { "epoch": 14.635712974857771, "grad_norm": 0.005475654732435942, "learning_rate": 2.036046591176376e-06, "loss": 0.0207, "num_input_tokens_seen": 171965536, "step": 79750 }, { "epoch": 14.636630574417325, "grad_norm": 0.002995090326294303, "learning_rate": 2.0354017345378098e-06, "loss": 0.013, "num_input_tokens_seen": 171977504, "step": 79755 }, { "epoch": 14.637548173976876, "grad_norm": 0.023219967260956764, "learning_rate": 2.034756953936594e-06, "loss": 0.3005, "num_input_tokens_seen": 171988864, "step": 79760 }, { "epoch": 14.638465773536428, "grad_norm": 0.5238255858421326, "learning_rate": 2.034112249389265e-06, "loss": 0.0004, "num_input_tokens_seen": 171999744, "step": 79765 }, { "epoch": 14.639383373095981, "grad_norm": 0.007526444271206856, "learning_rate": 2.033467620912362e-06, "loss": 0.0207, "num_input_tokens_seen": 172010144, "step": 79770 }, { "epoch": 14.640300972655533, "grad_norm": 0.008702698163688183, "learning_rate": 2.032823068522417e-06, "loss": 0.0, "num_input_tokens_seen": 172020928, "step": 79775 }, { "epoch": 14.641218572215084, "grad_norm": 6.589729309082031, "learning_rate": 2.032178592235961e-06, "loss": 0.2809, "num_input_tokens_seen": 172031616, "step": 79780 }, { "epoch": 14.642136171774638, "grad_norm": 197.6610870361328, "learning_rate": 2.031534192069523e-06, "loss": 0.0247, "num_input_tokens_seen": 172041344, "step": 79785 }, { "epoch": 14.64305377133419, "grad_norm": 0.007162074092775583, "learning_rate": 2.030889868039634e-06, "loss": 0.0001, "num_input_tokens_seen": 172052384, "step": 79790 }, { "epoch": 14.643971370893741, "grad_norm": 13.329717636108398, "learning_rate": 2.030245620162818e-06, "loss": 0.0067, "num_input_tokens_seen": 172063872, "step": 79795 }, { "epoch": 14.644888970453295, "grad_norm": 475.21075439453125, "learning_rate": 2.0296014484555976e-06, "loss": 0.2972, "num_input_tokens_seen": 172074432, "step": 79800 }, { "epoch": 14.645806570012846, "grad_norm": 0.12389910221099854, "learning_rate": 2.028957352934498e-06, "loss": 0.0082, "num_input_tokens_seen": 172085056, "step": 79805 }, { "epoch": 14.646724169572398, "grad_norm": 0.018705716356635094, "learning_rate": 2.028313333616038e-06, "loss": 0.1133, "num_input_tokens_seen": 172096640, "step": 79810 }, { "epoch": 14.647641769131951, "grad_norm": 0.0013827778166159987, "learning_rate": 2.0276693905167344e-06, "loss": 0.0, "num_input_tokens_seen": 172105984, "step": 79815 }, { "epoch": 14.648559368691503, "grad_norm": 0.0031992134172469378, "learning_rate": 2.0270255236531032e-06, "loss": 0.0032, "num_input_tokens_seen": 172115872, "step": 79820 }, { "epoch": 14.649476968251054, "grad_norm": 22.103010177612305, "learning_rate": 2.0263817330416612e-06, "loss": 0.0079, "num_input_tokens_seen": 172127360, "step": 79825 }, { "epoch": 14.650394567810608, "grad_norm": 27.639057159423828, "learning_rate": 2.025738018698919e-06, "loss": 0.2199, "num_input_tokens_seen": 172138720, "step": 79830 }, { "epoch": 14.65131216737016, "grad_norm": 0.2470397800207138, "learning_rate": 2.025094380641387e-06, "loss": 0.0738, "num_input_tokens_seen": 172149056, "step": 79835 }, { "epoch": 14.652229766929711, "grad_norm": 0.008873963728547096, "learning_rate": 2.0244508188855733e-06, "loss": 0.0003, "num_input_tokens_seen": 172159552, "step": 79840 }, { "epoch": 14.653147366489264, "grad_norm": 0.0021195535082370043, "learning_rate": 2.023807333447983e-06, "loss": 0.0008, "num_input_tokens_seen": 172170464, "step": 79845 }, { "epoch": 14.654064966048816, "grad_norm": 0.006485641468316317, "learning_rate": 2.0231639243451235e-06, "loss": 0.0, "num_input_tokens_seen": 172181952, "step": 79850 }, { "epoch": 14.654982565608368, "grad_norm": 0.002728053368628025, "learning_rate": 2.022520591593495e-06, "loss": 0.0, "num_input_tokens_seen": 172192416, "step": 79855 }, { "epoch": 14.655900165167921, "grad_norm": 0.0024899106938391924, "learning_rate": 2.0218773352096e-06, "loss": 0.0001, "num_input_tokens_seen": 172202784, "step": 79860 }, { "epoch": 14.656817764727473, "grad_norm": 0.028900740668177605, "learning_rate": 2.0212341552099347e-06, "loss": 0.0, "num_input_tokens_seen": 172214208, "step": 79865 }, { "epoch": 14.657735364287024, "grad_norm": 0.027574092149734497, "learning_rate": 2.020591051610995e-06, "loss": 0.0645, "num_input_tokens_seen": 172225760, "step": 79870 }, { "epoch": 14.658652963846578, "grad_norm": 0.043350785970687866, "learning_rate": 2.019948024429279e-06, "loss": 0.0036, "num_input_tokens_seen": 172236576, "step": 79875 }, { "epoch": 14.65957056340613, "grad_norm": 0.09273143857717514, "learning_rate": 2.019305073681278e-06, "loss": 0.0001, "num_input_tokens_seen": 172247872, "step": 79880 }, { "epoch": 14.660488162965681, "grad_norm": 0.01131916232407093, "learning_rate": 2.01866219938348e-06, "loss": 0.0045, "num_input_tokens_seen": 172259072, "step": 79885 }, { "epoch": 14.661405762525234, "grad_norm": 0.00376533018425107, "learning_rate": 2.0180194015523772e-06, "loss": 0.1416, "num_input_tokens_seen": 172268640, "step": 79890 }, { "epoch": 14.662323362084786, "grad_norm": 91.13286590576172, "learning_rate": 2.017376680204456e-06, "loss": 0.0435, "num_input_tokens_seen": 172280768, "step": 79895 }, { "epoch": 14.663240961644338, "grad_norm": 0.0041833301074802876, "learning_rate": 2.016734035356198e-06, "loss": 0.186, "num_input_tokens_seen": 172292320, "step": 79900 }, { "epoch": 14.664158561203891, "grad_norm": 0.0045474739745259285, "learning_rate": 2.0160914670240906e-06, "loss": 0.0012, "num_input_tokens_seen": 172303648, "step": 79905 }, { "epoch": 14.665076160763443, "grad_norm": 0.002705871593207121, "learning_rate": 2.015448975224612e-06, "loss": 0.0763, "num_input_tokens_seen": 172313568, "step": 79910 }, { "epoch": 14.665993760322994, "grad_norm": 0.0064024170860648155, "learning_rate": 2.0148065599742416e-06, "loss": 0.0001, "num_input_tokens_seen": 172323936, "step": 79915 }, { "epoch": 14.666911359882548, "grad_norm": 0.0435633659362793, "learning_rate": 2.0141642212894547e-06, "loss": 0.0002, "num_input_tokens_seen": 172335328, "step": 79920 }, { "epoch": 14.6678289594421, "grad_norm": 0.01643240638077259, "learning_rate": 2.0135219591867293e-06, "loss": 0.0001, "num_input_tokens_seen": 172346336, "step": 79925 }, { "epoch": 14.66874655900165, "grad_norm": 0.041817761957645416, "learning_rate": 2.0128797736825375e-06, "loss": 0.1286, "num_input_tokens_seen": 172357760, "step": 79930 }, { "epoch": 14.669664158561204, "grad_norm": 0.22210811078548431, "learning_rate": 2.012237664793348e-06, "loss": 0.0164, "num_input_tokens_seen": 172370208, "step": 79935 }, { "epoch": 14.670581758120756, "grad_norm": 0.00467678764835, "learning_rate": 2.011595632535633e-06, "loss": 0.0001, "num_input_tokens_seen": 172381280, "step": 79940 }, { "epoch": 14.671499357680307, "grad_norm": 0.006969534792006016, "learning_rate": 2.010953676925859e-06, "loss": 0.1876, "num_input_tokens_seen": 172393120, "step": 79945 }, { "epoch": 14.67241695723986, "grad_norm": 0.001216835342347622, "learning_rate": 2.010311797980489e-06, "loss": 0.0001, "num_input_tokens_seen": 172403872, "step": 79950 }, { "epoch": 14.673334556799412, "grad_norm": 0.014586801640689373, "learning_rate": 2.0096699957159886e-06, "loss": 0.2847, "num_input_tokens_seen": 172415456, "step": 79955 }, { "epoch": 14.674252156358964, "grad_norm": 0.013548363000154495, "learning_rate": 2.009028270148819e-06, "loss": 0.0001, "num_input_tokens_seen": 172426944, "step": 79960 }, { "epoch": 14.675169755918517, "grad_norm": 0.008984597399830818, "learning_rate": 2.008386621295439e-06, "loss": 0.2802, "num_input_tokens_seen": 172438336, "step": 79965 }, { "epoch": 14.676087355478069, "grad_norm": 45.080711364746094, "learning_rate": 2.007745049172303e-06, "loss": 0.116, "num_input_tokens_seen": 172449312, "step": 79970 }, { "epoch": 14.67700495503762, "grad_norm": 67.51414489746094, "learning_rate": 2.007103553795871e-06, "loss": 0.3888, "num_input_tokens_seen": 172459520, "step": 79975 }, { "epoch": 14.677922554597174, "grad_norm": 0.07470300793647766, "learning_rate": 2.006462135182594e-06, "loss": 0.0019, "num_input_tokens_seen": 172469184, "step": 79980 }, { "epoch": 14.678840154156726, "grad_norm": 0.09382548183202744, "learning_rate": 2.005820793348923e-06, "loss": 0.0004, "num_input_tokens_seen": 172479488, "step": 79985 }, { "epoch": 14.679757753716277, "grad_norm": 0.0009521959000267088, "learning_rate": 2.0051795283113085e-06, "loss": 0.0002, "num_input_tokens_seen": 172490080, "step": 79990 }, { "epoch": 14.68067535327583, "grad_norm": 78.87899017333984, "learning_rate": 2.0045383400861985e-06, "loss": 0.0087, "num_input_tokens_seen": 172500544, "step": 79995 }, { "epoch": 14.681592952835382, "grad_norm": 0.0012343566631898284, "learning_rate": 2.003897228690037e-06, "loss": 0.1286, "num_input_tokens_seen": 172510272, "step": 80000 }, { "epoch": 14.682510552394934, "grad_norm": 0.010145827196538448, "learning_rate": 2.0032561941392663e-06, "loss": 0.0002, "num_input_tokens_seen": 172520672, "step": 80005 }, { "epoch": 14.683428151954487, "grad_norm": 145.87205505371094, "learning_rate": 2.0026152364503313e-06, "loss": 0.1504, "num_input_tokens_seen": 172531872, "step": 80010 }, { "epoch": 14.684345751514039, "grad_norm": 0.19458384811878204, "learning_rate": 2.0019743556396703e-06, "loss": 0.1985, "num_input_tokens_seen": 172543424, "step": 80015 }, { "epoch": 14.68526335107359, "grad_norm": 0.010057016275823116, "learning_rate": 2.0013335517237194e-06, "loss": 0.0065, "num_input_tokens_seen": 172555520, "step": 80020 }, { "epoch": 14.686180950633144, "grad_norm": 0.0015623457729816437, "learning_rate": 2.0006928247189162e-06, "loss": 0.0002, "num_input_tokens_seen": 172566144, "step": 80025 }, { "epoch": 14.687098550192696, "grad_norm": 0.09447991102933884, "learning_rate": 2.000052174641694e-06, "loss": 0.1453, "num_input_tokens_seen": 172577600, "step": 80030 }, { "epoch": 14.688016149752247, "grad_norm": 0.49758148193359375, "learning_rate": 1.9994116015084835e-06, "loss": 0.0006, "num_input_tokens_seen": 172590016, "step": 80035 }, { "epoch": 14.6889337493118, "grad_norm": 0.08929704129695892, "learning_rate": 1.9987711053357134e-06, "loss": 0.0021, "num_input_tokens_seen": 172600704, "step": 80040 }, { "epoch": 14.689851348871352, "grad_norm": 0.0028466356452554464, "learning_rate": 1.998130686139815e-06, "loss": 0.0008, "num_input_tokens_seen": 172611200, "step": 80045 }, { "epoch": 14.690768948430904, "grad_norm": 0.03492651507258415, "learning_rate": 1.9974903439372116e-06, "loss": 0.0013, "num_input_tokens_seen": 172623360, "step": 80050 }, { "epoch": 14.691686547990457, "grad_norm": 0.025558970868587494, "learning_rate": 1.996850078744328e-06, "loss": 0.0001, "num_input_tokens_seen": 172634112, "step": 80055 }, { "epoch": 14.692604147550009, "grad_norm": 0.0013847594382241368, "learning_rate": 1.996209890577585e-06, "loss": 0.0822, "num_input_tokens_seen": 172645184, "step": 80060 }, { "epoch": 14.69352174710956, "grad_norm": 0.027652030810713768, "learning_rate": 1.9955697794534012e-06, "loss": 0.1439, "num_input_tokens_seen": 172655104, "step": 80065 }, { "epoch": 14.694439346669114, "grad_norm": 0.025503193959593773, "learning_rate": 1.9949297453881977e-06, "loss": 0.0003, "num_input_tokens_seen": 172666464, "step": 80070 }, { "epoch": 14.695356946228666, "grad_norm": 0.1655907779932022, "learning_rate": 1.994289788398389e-06, "loss": 0.0004, "num_input_tokens_seen": 172676384, "step": 80075 }, { "epoch": 14.696274545788217, "grad_norm": 0.0013274718075990677, "learning_rate": 1.993649908500389e-06, "loss": 0.1358, "num_input_tokens_seen": 172686336, "step": 80080 }, { "epoch": 14.69719214534777, "grad_norm": 0.02729485183954239, "learning_rate": 1.9930101057106076e-06, "loss": 0.0431, "num_input_tokens_seen": 172697440, "step": 80085 }, { "epoch": 14.698109744907322, "grad_norm": 0.018505766987800598, "learning_rate": 1.992370380045458e-06, "loss": 0.1745, "num_input_tokens_seen": 172708640, "step": 80090 }, { "epoch": 14.699027344466874, "grad_norm": 0.0026469442527741194, "learning_rate": 1.9917307315213468e-06, "loss": 0.0002, "num_input_tokens_seen": 172719712, "step": 80095 }, { "epoch": 14.699944944026427, "grad_norm": 0.09792807698249817, "learning_rate": 1.99109116015468e-06, "loss": 0.0026, "num_input_tokens_seen": 172730176, "step": 80100 }, { "epoch": 14.700862543585979, "grad_norm": 56.94630813598633, "learning_rate": 1.99045166596186e-06, "loss": 0.2818, "num_input_tokens_seen": 172741792, "step": 80105 }, { "epoch": 14.70178014314553, "grad_norm": 0.002130415290594101, "learning_rate": 1.989812248959292e-06, "loss": 0.3295, "num_input_tokens_seen": 172751776, "step": 80110 }, { "epoch": 14.702697742705084, "grad_norm": 35.11542892456055, "learning_rate": 1.9891729091633743e-06, "loss": 0.0097, "num_input_tokens_seen": 172762432, "step": 80115 }, { "epoch": 14.703615342264635, "grad_norm": 0.0008395738550461829, "learning_rate": 1.9885336465905035e-06, "loss": 0.0103, "num_input_tokens_seen": 172774208, "step": 80120 }, { "epoch": 14.704532941824187, "grad_norm": 0.08189499378204346, "learning_rate": 1.9878944612570793e-06, "loss": 0.1721, "num_input_tokens_seen": 172784320, "step": 80125 }, { "epoch": 14.70545054138374, "grad_norm": 1.862363576889038, "learning_rate": 1.9872553531794936e-06, "loss": 0.0008, "num_input_tokens_seen": 172795296, "step": 80130 }, { "epoch": 14.706368140943292, "grad_norm": 31.16168975830078, "learning_rate": 1.9866163223741386e-06, "loss": 0.1073, "num_input_tokens_seen": 172805088, "step": 80135 }, { "epoch": 14.707285740502844, "grad_norm": 1.7988531589508057, "learning_rate": 1.985977368857403e-06, "loss": 0.0008, "num_input_tokens_seen": 172815360, "step": 80140 }, { "epoch": 14.708203340062397, "grad_norm": 0.0014681589091196656, "learning_rate": 1.9853384926456785e-06, "loss": 0.0001, "num_input_tokens_seen": 172825696, "step": 80145 }, { "epoch": 14.709120939621949, "grad_norm": 0.01936432532966137, "learning_rate": 1.984699693755349e-06, "loss": 0.0002, "num_input_tokens_seen": 172833888, "step": 80150 }, { "epoch": 14.7100385391815, "grad_norm": 0.037104953080415726, "learning_rate": 1.9840609722027976e-06, "loss": 0.0001, "num_input_tokens_seen": 172843744, "step": 80155 }, { "epoch": 14.710956138741054, "grad_norm": 0.007880684919655323, "learning_rate": 1.9834223280044097e-06, "loss": 0.048, "num_input_tokens_seen": 172855168, "step": 80160 }, { "epoch": 14.711873738300605, "grad_norm": 0.13318805396556854, "learning_rate": 1.9827837611765638e-06, "loss": 0.0001, "num_input_tokens_seen": 172864352, "step": 80165 }, { "epoch": 14.712791337860157, "grad_norm": 37.4540901184082, "learning_rate": 1.9821452717356367e-06, "loss": 0.0026, "num_input_tokens_seen": 172876160, "step": 80170 }, { "epoch": 14.71370893741971, "grad_norm": 0.0047067515552043915, "learning_rate": 1.9815068596980077e-06, "loss": 0.0004, "num_input_tokens_seen": 172886240, "step": 80175 }, { "epoch": 14.714626536979262, "grad_norm": 0.0016019950853660703, "learning_rate": 1.9808685250800493e-06, "loss": 0.0003, "num_input_tokens_seen": 172896832, "step": 80180 }, { "epoch": 14.715544136538814, "grad_norm": 1.0385780334472656, "learning_rate": 1.980230267898134e-06, "loss": 0.244, "num_input_tokens_seen": 172907008, "step": 80185 }, { "epoch": 14.716461736098367, "grad_norm": 0.0031924666836857796, "learning_rate": 1.9795920881686305e-06, "loss": 0.0005, "num_input_tokens_seen": 172918112, "step": 80190 }, { "epoch": 14.717379335657919, "grad_norm": 0.0030318109784275293, "learning_rate": 1.9789539859079103e-06, "loss": 0.1814, "num_input_tokens_seen": 172927648, "step": 80195 }, { "epoch": 14.71829693521747, "grad_norm": 107.93582153320312, "learning_rate": 1.9783159611323383e-06, "loss": 0.3027, "num_input_tokens_seen": 172939008, "step": 80200 }, { "epoch": 14.719214534777024, "grad_norm": 50.043128967285156, "learning_rate": 1.9776780138582768e-06, "loss": 0.0073, "num_input_tokens_seen": 172948576, "step": 80205 }, { "epoch": 14.720132134336575, "grad_norm": 0.05126412212848663, "learning_rate": 1.977040144102092e-06, "loss": 0.0006, "num_input_tokens_seen": 172958784, "step": 80210 }, { "epoch": 14.721049733896127, "grad_norm": 113.66812896728516, "learning_rate": 1.976402351880142e-06, "loss": 0.202, "num_input_tokens_seen": 172968992, "step": 80215 }, { "epoch": 14.72196733345568, "grad_norm": 139.5161590576172, "learning_rate": 1.9757646372087845e-06, "loss": 0.0336, "num_input_tokens_seen": 172979136, "step": 80220 }, { "epoch": 14.722884933015232, "grad_norm": 4.594954967498779, "learning_rate": 1.9751270001043782e-06, "loss": 0.0008, "num_input_tokens_seen": 172989408, "step": 80225 }, { "epoch": 14.723802532574783, "grad_norm": 0.0010036443127319217, "learning_rate": 1.974489440583276e-06, "loss": 0.0152, "num_input_tokens_seen": 172998912, "step": 80230 }, { "epoch": 14.724720132134337, "grad_norm": 0.0019411700777709484, "learning_rate": 1.973851958661831e-06, "loss": 0.0033, "num_input_tokens_seen": 173008736, "step": 80235 }, { "epoch": 14.725637731693888, "grad_norm": 185.26980590820312, "learning_rate": 1.9732145543563913e-06, "loss": 0.1821, "num_input_tokens_seen": 173018656, "step": 80240 }, { "epoch": 14.72655533125344, "grad_norm": 0.05678006634116173, "learning_rate": 1.9725772276833087e-06, "loss": 0.0007, "num_input_tokens_seen": 173029152, "step": 80245 }, { "epoch": 14.727472930812993, "grad_norm": 0.2967257797718048, "learning_rate": 1.971939978658928e-06, "loss": 0.0001, "num_input_tokens_seen": 173039424, "step": 80250 }, { "epoch": 14.728390530372545, "grad_norm": 0.0021222936920821667, "learning_rate": 1.9713028072995945e-06, "loss": 0.0002, "num_input_tokens_seen": 173050112, "step": 80255 }, { "epoch": 14.729308129932097, "grad_norm": 0.09543579816818237, "learning_rate": 1.9706657136216477e-06, "loss": 0.0003, "num_input_tokens_seen": 173059680, "step": 80260 }, { "epoch": 14.73022572949165, "grad_norm": 2.509708881378174, "learning_rate": 1.970028697641432e-06, "loss": 0.3754, "num_input_tokens_seen": 173071520, "step": 80265 }, { "epoch": 14.731143329051202, "grad_norm": 0.11144858598709106, "learning_rate": 1.9693917593752843e-06, "loss": 0.0002, "num_input_tokens_seen": 173082176, "step": 80270 }, { "epoch": 14.732060928610753, "grad_norm": 244.8929443359375, "learning_rate": 1.968754898839541e-06, "loss": 0.3245, "num_input_tokens_seen": 173093184, "step": 80275 }, { "epoch": 14.732978528170307, "grad_norm": 0.4444202482700348, "learning_rate": 1.9681181160505364e-06, "loss": 0.0002, "num_input_tokens_seen": 173104224, "step": 80280 }, { "epoch": 14.733896127729858, "grad_norm": 0.020995547994971275, "learning_rate": 1.967481411024603e-06, "loss": 0.0033, "num_input_tokens_seen": 173114944, "step": 80285 }, { "epoch": 14.73481372728941, "grad_norm": 0.026793222874403, "learning_rate": 1.9668447837780704e-06, "loss": 0.0001, "num_input_tokens_seen": 173125120, "step": 80290 }, { "epoch": 14.735731326848963, "grad_norm": 0.015116789378225803, "learning_rate": 1.9662082343272693e-06, "loss": 0.0005, "num_input_tokens_seen": 173136064, "step": 80295 }, { "epoch": 14.736648926408515, "grad_norm": 25.18524169921875, "learning_rate": 1.9655717626885244e-06, "loss": 0.0886, "num_input_tokens_seen": 173148896, "step": 80300 }, { "epoch": 14.737566525968067, "grad_norm": 1041.97265625, "learning_rate": 1.96493536887816e-06, "loss": 0.065, "num_input_tokens_seen": 173159776, "step": 80305 }, { "epoch": 14.73848412552762, "grad_norm": 1.8409854173660278, "learning_rate": 1.9642990529125013e-06, "loss": 0.0014, "num_input_tokens_seen": 173169760, "step": 80310 }, { "epoch": 14.739401725087172, "grad_norm": 0.031194789335131645, "learning_rate": 1.963662814807867e-06, "loss": 0.0005, "num_input_tokens_seen": 173180608, "step": 80315 }, { "epoch": 14.740319324646723, "grad_norm": 0.001918427529744804, "learning_rate": 1.963026654580575e-06, "loss": 0.0007, "num_input_tokens_seen": 173190624, "step": 80320 }, { "epoch": 14.741236924206277, "grad_norm": 0.25043556094169617, "learning_rate": 1.962390572246941e-06, "loss": 0.2311, "num_input_tokens_seen": 173202528, "step": 80325 }, { "epoch": 14.742154523765828, "grad_norm": 0.0006135873845778406, "learning_rate": 1.9617545678232826e-06, "loss": 0.0007, "num_input_tokens_seen": 173212672, "step": 80330 }, { "epoch": 14.74307212332538, "grad_norm": 0.10038501769304276, "learning_rate": 1.961118641325911e-06, "loss": 0.0001, "num_input_tokens_seen": 173223552, "step": 80335 }, { "epoch": 14.743989722884933, "grad_norm": 0.001977446023374796, "learning_rate": 1.960482792771134e-06, "loss": 0.0076, "num_input_tokens_seen": 173234880, "step": 80340 }, { "epoch": 14.744907322444485, "grad_norm": 0.002589557785540819, "learning_rate": 1.9598470221752646e-06, "loss": 0.0014, "num_input_tokens_seen": 173244544, "step": 80345 }, { "epoch": 14.745824922004036, "grad_norm": 0.005112529266625643, "learning_rate": 1.9592113295546077e-06, "loss": 0.1659, "num_input_tokens_seen": 173255552, "step": 80350 }, { "epoch": 14.74674252156359, "grad_norm": 0.006492343731224537, "learning_rate": 1.958575714925465e-06, "loss": 0.2563, "num_input_tokens_seen": 173266048, "step": 80355 }, { "epoch": 14.747660121123142, "grad_norm": 0.38123130798339844, "learning_rate": 1.9579401783041435e-06, "loss": 0.0002, "num_input_tokens_seen": 173276352, "step": 80360 }, { "epoch": 14.748577720682695, "grad_norm": 1.2119108438491821, "learning_rate": 1.9573047197069415e-06, "loss": 0.0428, "num_input_tokens_seen": 173287328, "step": 80365 }, { "epoch": 14.749495320242247, "grad_norm": 0.04386478662490845, "learning_rate": 1.9566693391501583e-06, "loss": 0.0, "num_input_tokens_seen": 173296384, "step": 80370 }, { "epoch": 14.750412919801798, "grad_norm": 0.05085451900959015, "learning_rate": 1.9560340366500874e-06, "loss": 0.0777, "num_input_tokens_seen": 173306208, "step": 80375 }, { "epoch": 14.751330519361352, "grad_norm": 43.362125396728516, "learning_rate": 1.955398812223028e-06, "loss": 0.5096, "num_input_tokens_seen": 173318144, "step": 80380 }, { "epoch": 14.752248118920903, "grad_norm": 0.01377151533961296, "learning_rate": 1.95476366588527e-06, "loss": 0.0, "num_input_tokens_seen": 173328928, "step": 80385 }, { "epoch": 14.753165718480455, "grad_norm": 0.037077222019433975, "learning_rate": 1.9541285976531026e-06, "loss": 0.0004, "num_input_tokens_seen": 173340640, "step": 80390 }, { "epoch": 14.754083318040008, "grad_norm": 0.0022088566329330206, "learning_rate": 1.953493607542818e-06, "loss": 0.0001, "num_input_tokens_seen": 173352416, "step": 80395 }, { "epoch": 14.75500091759956, "grad_norm": 0.028197521343827248, "learning_rate": 1.9528586955707e-06, "loss": 0.0002, "num_input_tokens_seen": 173363296, "step": 80400 }, { "epoch": 14.755918517159111, "grad_norm": 0.306360125541687, "learning_rate": 1.9522238617530324e-06, "loss": 0.0098, "num_input_tokens_seen": 173374240, "step": 80405 }, { "epoch": 14.756836116718665, "grad_norm": 0.0366511307656765, "learning_rate": 1.9515891061061016e-06, "loss": 0.0003, "num_input_tokens_seen": 173384928, "step": 80410 }, { "epoch": 14.757753716278216, "grad_norm": 0.3529798686504364, "learning_rate": 1.9509544286461852e-06, "loss": 0.1316, "num_input_tokens_seen": 173395648, "step": 80415 }, { "epoch": 14.758671315837768, "grad_norm": 0.005947529803961515, "learning_rate": 1.9503198293895615e-06, "loss": 0.3251, "num_input_tokens_seen": 173406528, "step": 80420 }, { "epoch": 14.759588915397321, "grad_norm": 0.0037663725670427084, "learning_rate": 1.9496853083525065e-06, "loss": 0.0003, "num_input_tokens_seen": 173417920, "step": 80425 }, { "epoch": 14.760506514956873, "grad_norm": 451.7841796875, "learning_rate": 1.9490508655512974e-06, "loss": 0.1767, "num_input_tokens_seen": 173429312, "step": 80430 }, { "epoch": 14.761424114516425, "grad_norm": 0.0007322100573219359, "learning_rate": 1.948416501002205e-06, "loss": 0.1565, "num_input_tokens_seen": 173439552, "step": 80435 }, { "epoch": 14.762341714075978, "grad_norm": 330.13714599609375, "learning_rate": 1.9477822147214983e-06, "loss": 0.1751, "num_input_tokens_seen": 173451424, "step": 80440 }, { "epoch": 14.76325931363553, "grad_norm": 0.0025729513727128506, "learning_rate": 1.9471480067254484e-06, "loss": 0.0244, "num_input_tokens_seen": 173461152, "step": 80445 }, { "epoch": 14.764176913195081, "grad_norm": 0.14906710386276245, "learning_rate": 1.946513877030321e-06, "loss": 0.0006, "num_input_tokens_seen": 173471488, "step": 80450 }, { "epoch": 14.765094512754635, "grad_norm": 0.20584100484848022, "learning_rate": 1.9458798256523804e-06, "loss": 0.0008, "num_input_tokens_seen": 173481888, "step": 80455 }, { "epoch": 14.766012112314186, "grad_norm": 8.972779273986816, "learning_rate": 1.9452458526078867e-06, "loss": 0.0007, "num_input_tokens_seen": 173493440, "step": 80460 }, { "epoch": 14.766929711873738, "grad_norm": 0.46246692538261414, "learning_rate": 1.9446119579131045e-06, "loss": 0.0004, "num_input_tokens_seen": 173504992, "step": 80465 }, { "epoch": 14.767847311433291, "grad_norm": 0.02440488710999489, "learning_rate": 1.9439781415842903e-06, "loss": 0.0001, "num_input_tokens_seen": 173517408, "step": 80470 }, { "epoch": 14.768764910992843, "grad_norm": 163.99427795410156, "learning_rate": 1.9433444036376997e-06, "loss": 0.0288, "num_input_tokens_seen": 173526912, "step": 80475 }, { "epoch": 14.769682510552395, "grad_norm": 0.00533498777076602, "learning_rate": 1.9427107440895865e-06, "loss": 0.0056, "num_input_tokens_seen": 173537440, "step": 80480 }, { "epoch": 14.770600110111948, "grad_norm": 0.014766114763915539, "learning_rate": 1.9420771629562057e-06, "loss": 0.0038, "num_input_tokens_seen": 173546752, "step": 80485 }, { "epoch": 14.7715177096715, "grad_norm": 0.006617967039346695, "learning_rate": 1.941443660253807e-06, "loss": 0.001, "num_input_tokens_seen": 173555488, "step": 80490 }, { "epoch": 14.772435309231051, "grad_norm": 0.04104379937052727, "learning_rate": 1.9408102359986375e-06, "loss": 0.0426, "num_input_tokens_seen": 173565216, "step": 80495 }, { "epoch": 14.773352908790605, "grad_norm": 111.94319915771484, "learning_rate": 1.940176890206944e-06, "loss": 0.0714, "num_input_tokens_seen": 173575072, "step": 80500 }, { "epoch": 14.774270508350156, "grad_norm": 0.006204945966601372, "learning_rate": 1.9395436228949715e-06, "loss": 0.2004, "num_input_tokens_seen": 173586336, "step": 80505 }, { "epoch": 14.775188107909708, "grad_norm": 0.18109212815761566, "learning_rate": 1.93891043407896e-06, "loss": 0.0001, "num_input_tokens_seen": 173597056, "step": 80510 }, { "epoch": 14.776105707469261, "grad_norm": 106.91799926757812, "learning_rate": 1.938277323775153e-06, "loss": 0.1316, "num_input_tokens_seen": 173609184, "step": 80515 }, { "epoch": 14.777023307028813, "grad_norm": 0.13677597045898438, "learning_rate": 1.937644291999788e-06, "loss": 0.0479, "num_input_tokens_seen": 173620192, "step": 80520 }, { "epoch": 14.777940906588364, "grad_norm": 0.0043410649523139, "learning_rate": 1.937011338769098e-06, "loss": 0.0703, "num_input_tokens_seen": 173631360, "step": 80525 }, { "epoch": 14.778858506147918, "grad_norm": 0.0009470980148762465, "learning_rate": 1.9363784640993223e-06, "loss": 0.2095, "num_input_tokens_seen": 173641856, "step": 80530 }, { "epoch": 14.77977610570747, "grad_norm": 0.00563337467610836, "learning_rate": 1.935745668006691e-06, "loss": 0.2538, "num_input_tokens_seen": 173652896, "step": 80535 }, { "epoch": 14.780693705267021, "grad_norm": 36.71870040893555, "learning_rate": 1.9351129505074317e-06, "loss": 0.1333, "num_input_tokens_seen": 173662144, "step": 80540 }, { "epoch": 14.781611304826574, "grad_norm": 0.028974058106541634, "learning_rate": 1.9344803116177772e-06, "loss": 0.1038, "num_input_tokens_seen": 173671648, "step": 80545 }, { "epoch": 14.782528904386126, "grad_norm": 0.003306586993858218, "learning_rate": 1.9338477513539523e-06, "loss": 0.0007, "num_input_tokens_seen": 173682240, "step": 80550 }, { "epoch": 14.783446503945678, "grad_norm": 0.009507604874670506, "learning_rate": 1.9332152697321793e-06, "loss": 0.0052, "num_input_tokens_seen": 173693536, "step": 80555 }, { "epoch": 14.784364103505231, "grad_norm": 0.017933467403054237, "learning_rate": 1.932582866768681e-06, "loss": 0.001, "num_input_tokens_seen": 173704288, "step": 80560 }, { "epoch": 14.785281703064783, "grad_norm": 355.9006652832031, "learning_rate": 1.9319505424796784e-06, "loss": 0.1848, "num_input_tokens_seen": 173716288, "step": 80565 }, { "epoch": 14.786199302624334, "grad_norm": 0.007206239737570286, "learning_rate": 1.9313182968813902e-06, "loss": 0.0001, "num_input_tokens_seen": 173727744, "step": 80570 }, { "epoch": 14.787116902183888, "grad_norm": 0.0016201260732486844, "learning_rate": 1.9306861299900303e-06, "loss": 0.0011, "num_input_tokens_seen": 173737280, "step": 80575 }, { "epoch": 14.78803450174344, "grad_norm": 0.15914654731750488, "learning_rate": 1.9300540418218155e-06, "loss": 0.0017, "num_input_tokens_seen": 173748960, "step": 80580 }, { "epoch": 14.788952101302991, "grad_norm": 212.86932373046875, "learning_rate": 1.9294220323929564e-06, "loss": 0.1252, "num_input_tokens_seen": 173759040, "step": 80585 }, { "epoch": 14.789869700862544, "grad_norm": 0.008077879436314106, "learning_rate": 1.9287901017196613e-06, "loss": 0.0705, "num_input_tokens_seen": 173771008, "step": 80590 }, { "epoch": 14.790787300422096, "grad_norm": 0.005541977472603321, "learning_rate": 1.9281582498181424e-06, "loss": 0.0003, "num_input_tokens_seen": 173781696, "step": 80595 }, { "epoch": 14.791704899981648, "grad_norm": 3.9993302822113037, "learning_rate": 1.927526476704603e-06, "loss": 0.0019, "num_input_tokens_seen": 173792704, "step": 80600 }, { "epoch": 14.792622499541201, "grad_norm": 0.02938207797706127, "learning_rate": 1.9268947823952476e-06, "loss": 0.1104, "num_input_tokens_seen": 173803968, "step": 80605 }, { "epoch": 14.793540099100753, "grad_norm": 0.047437284141778946, "learning_rate": 1.926263166906277e-06, "loss": 0.0763, "num_input_tokens_seen": 173814848, "step": 80610 }, { "epoch": 14.794457698660304, "grad_norm": 63.1693229675293, "learning_rate": 1.9256316302538935e-06, "loss": 0.0558, "num_input_tokens_seen": 173826464, "step": 80615 }, { "epoch": 14.795375298219858, "grad_norm": 0.017140517011284828, "learning_rate": 1.925000172454294e-06, "loss": 0.1097, "num_input_tokens_seen": 173836640, "step": 80620 }, { "epoch": 14.79629289777941, "grad_norm": 0.0018596493173390627, "learning_rate": 1.9243687935236725e-06, "loss": 0.3835, "num_input_tokens_seen": 173847136, "step": 80625 }, { "epoch": 14.79721049733896, "grad_norm": 0.03658166155219078, "learning_rate": 1.9237374934782266e-06, "loss": 0.1658, "num_input_tokens_seen": 173857184, "step": 80630 }, { "epoch": 14.798128096898514, "grad_norm": 0.012990247458219528, "learning_rate": 1.9231062723341458e-06, "loss": 0.0002, "num_input_tokens_seen": 173866976, "step": 80635 }, { "epoch": 14.799045696458066, "grad_norm": 0.014109146781265736, "learning_rate": 1.9224751301076206e-06, "loss": 0.0004, "num_input_tokens_seen": 173878496, "step": 80640 }, { "epoch": 14.799963296017618, "grad_norm": 141.0272674560547, "learning_rate": 1.9218440668148367e-06, "loss": 0.1632, "num_input_tokens_seen": 173888960, "step": 80645 }, { "epoch": 14.800880895577171, "grad_norm": 0.0033014665823429823, "learning_rate": 1.921213082471984e-06, "loss": 0.1627, "num_input_tokens_seen": 173898784, "step": 80650 }, { "epoch": 14.801798495136723, "grad_norm": 0.016774432733654976, "learning_rate": 1.9205821770952433e-06, "loss": 0.0005, "num_input_tokens_seen": 173909824, "step": 80655 }, { "epoch": 14.802716094696274, "grad_norm": 0.004881089553236961, "learning_rate": 1.9199513507007954e-06, "loss": 0.0, "num_input_tokens_seen": 173920512, "step": 80660 }, { "epoch": 14.803633694255828, "grad_norm": 0.009707590565085411, "learning_rate": 1.919320603304824e-06, "loss": 0.0519, "num_input_tokens_seen": 173929632, "step": 80665 }, { "epoch": 14.80455129381538, "grad_norm": 0.11000761389732361, "learning_rate": 1.9186899349235044e-06, "loss": 0.1596, "num_input_tokens_seen": 173940736, "step": 80670 }, { "epoch": 14.80546889337493, "grad_norm": 0.31487685441970825, "learning_rate": 1.9180593455730107e-06, "loss": 0.0001, "num_input_tokens_seen": 173950784, "step": 80675 }, { "epoch": 14.806386492934484, "grad_norm": 0.009873965755105019, "learning_rate": 1.9174288352695197e-06, "loss": 0.2134, "num_input_tokens_seen": 173962080, "step": 80680 }, { "epoch": 14.807304092494036, "grad_norm": 0.03157897666096687, "learning_rate": 1.9167984040292016e-06, "loss": 0.0523, "num_input_tokens_seen": 173971616, "step": 80685 }, { "epoch": 14.808221692053587, "grad_norm": 0.0024954047985374928, "learning_rate": 1.916168051868226e-06, "loss": 0.1738, "num_input_tokens_seen": 173983936, "step": 80690 }, { "epoch": 14.80913929161314, "grad_norm": 0.052246078848838806, "learning_rate": 1.91553777880276e-06, "loss": 0.1708, "num_input_tokens_seen": 173994464, "step": 80695 }, { "epoch": 14.810056891172692, "grad_norm": 0.0011645546182990074, "learning_rate": 1.9149075848489698e-06, "loss": 0.008, "num_input_tokens_seen": 174004128, "step": 80700 }, { "epoch": 14.810974490732244, "grad_norm": 241.60203552246094, "learning_rate": 1.9142774700230167e-06, "loss": 0.0211, "num_input_tokens_seen": 174014464, "step": 80705 }, { "epoch": 14.811892090291797, "grad_norm": 0.02159637212753296, "learning_rate": 1.913647434341066e-06, "loss": 0.0052, "num_input_tokens_seen": 174025824, "step": 80710 }, { "epoch": 14.812809689851349, "grad_norm": 0.0035766528453677893, "learning_rate": 1.913017477819275e-06, "loss": 0.001, "num_input_tokens_seen": 174036384, "step": 80715 }, { "epoch": 14.8137272894109, "grad_norm": 5.8814544677734375, "learning_rate": 1.912387600473801e-06, "loss": 0.007, "num_input_tokens_seen": 174047552, "step": 80720 }, { "epoch": 14.814644888970454, "grad_norm": 25.982765197753906, "learning_rate": 1.911757802320799e-06, "loss": 0.0011, "num_input_tokens_seen": 174058752, "step": 80725 }, { "epoch": 14.815562488530006, "grad_norm": 0.011014989577233791, "learning_rate": 1.911128083376424e-06, "loss": 0.0001, "num_input_tokens_seen": 174069152, "step": 80730 }, { "epoch": 14.816480088089557, "grad_norm": 0.005752825643867254, "learning_rate": 1.9104984436568263e-06, "loss": 0.0001, "num_input_tokens_seen": 174079872, "step": 80735 }, { "epoch": 14.81739768764911, "grad_norm": 1.0513215065002441, "learning_rate": 1.909868883178155e-06, "loss": 0.0004, "num_input_tokens_seen": 174090400, "step": 80740 }, { "epoch": 14.818315287208662, "grad_norm": 0.013605889864265919, "learning_rate": 1.9092394019565564e-06, "loss": 0.0001, "num_input_tokens_seen": 174101760, "step": 80745 }, { "epoch": 14.819232886768214, "grad_norm": 0.011778722517192364, "learning_rate": 1.9086100000081786e-06, "loss": 0.0033, "num_input_tokens_seen": 174113408, "step": 80750 }, { "epoch": 14.820150486327767, "grad_norm": 0.0713871419429779, "learning_rate": 1.9079806773491625e-06, "loss": 0.0, "num_input_tokens_seen": 174123360, "step": 80755 }, { "epoch": 14.821068085887319, "grad_norm": 0.06163103133440018, "learning_rate": 1.9073514339956487e-06, "loss": 0.0475, "num_input_tokens_seen": 174133888, "step": 80760 }, { "epoch": 14.82198568544687, "grad_norm": 171.8979949951172, "learning_rate": 1.9067222699637794e-06, "loss": 0.0671, "num_input_tokens_seen": 174145504, "step": 80765 }, { "epoch": 14.822903285006424, "grad_norm": 0.001345713739283383, "learning_rate": 1.906093185269689e-06, "loss": 0.002, "num_input_tokens_seen": 174156800, "step": 80770 }, { "epoch": 14.823820884565976, "grad_norm": 96.88983917236328, "learning_rate": 1.9054641799295136e-06, "loss": 0.3754, "num_input_tokens_seen": 174167616, "step": 80775 }, { "epoch": 14.824738484125527, "grad_norm": 90.26919555664062, "learning_rate": 1.9048352539593845e-06, "loss": 0.1314, "num_input_tokens_seen": 174179072, "step": 80780 }, { "epoch": 14.82565608368508, "grad_norm": 0.10941498726606369, "learning_rate": 1.9042064073754352e-06, "loss": 0.0003, "num_input_tokens_seen": 174189408, "step": 80785 }, { "epoch": 14.826573683244632, "grad_norm": 0.010063180699944496, "learning_rate": 1.9035776401937938e-06, "loss": 0.0001, "num_input_tokens_seen": 174200512, "step": 80790 }, { "epoch": 14.827491282804184, "grad_norm": 0.04632973298430443, "learning_rate": 1.9029489524305855e-06, "loss": 0.0, "num_input_tokens_seen": 174211648, "step": 80795 }, { "epoch": 14.828408882363737, "grad_norm": 0.04797447845339775, "learning_rate": 1.9023203441019377e-06, "loss": 0.0001, "num_input_tokens_seen": 174221568, "step": 80800 }, { "epoch": 14.829326481923289, "grad_norm": 0.0007675218512304127, "learning_rate": 1.9016918152239722e-06, "loss": 0.0001, "num_input_tokens_seen": 174232800, "step": 80805 }, { "epoch": 14.83024408148284, "grad_norm": 0.005097007378935814, "learning_rate": 1.901063365812808e-06, "loss": 0.0284, "num_input_tokens_seen": 174244000, "step": 80810 }, { "epoch": 14.831161681042394, "grad_norm": 0.016384722664952278, "learning_rate": 1.9004349958845676e-06, "loss": 0.1532, "num_input_tokens_seen": 174254176, "step": 80815 }, { "epoch": 14.832079280601945, "grad_norm": 0.004290775395929813, "learning_rate": 1.8998067054553654e-06, "loss": 0.0486, "num_input_tokens_seen": 174264384, "step": 80820 }, { "epoch": 14.832996880161497, "grad_norm": 47.97938537597656, "learning_rate": 1.8991784945413166e-06, "loss": 0.2445, "num_input_tokens_seen": 174275840, "step": 80825 }, { "epoch": 14.83391447972105, "grad_norm": 0.02481796033680439, "learning_rate": 1.8985503631585317e-06, "loss": 0.0003, "num_input_tokens_seen": 174286272, "step": 80830 }, { "epoch": 14.834832079280602, "grad_norm": 0.016780300065875053, "learning_rate": 1.8979223113231249e-06, "loss": 0.0013, "num_input_tokens_seen": 174296768, "step": 80835 }, { "epoch": 14.835749678840154, "grad_norm": 0.1688421368598938, "learning_rate": 1.8972943390512026e-06, "loss": 0.0033, "num_input_tokens_seen": 174307648, "step": 80840 }, { "epoch": 14.836667278399707, "grad_norm": 115.92244720458984, "learning_rate": 1.8966664463588707e-06, "loss": 0.3313, "num_input_tokens_seen": 174318880, "step": 80845 }, { "epoch": 14.837584877959259, "grad_norm": 0.012215943075716496, "learning_rate": 1.896038633262236e-06, "loss": 0.0589, "num_input_tokens_seen": 174329824, "step": 80850 }, { "epoch": 14.83850247751881, "grad_norm": 0.0017472797771915793, "learning_rate": 1.8954108997774002e-06, "loss": 0.1782, "num_input_tokens_seen": 174340960, "step": 80855 }, { "epoch": 14.839420077078364, "grad_norm": 381.0763244628906, "learning_rate": 1.8947832459204607e-06, "loss": 0.2194, "num_input_tokens_seen": 174351072, "step": 80860 }, { "epoch": 14.840337676637915, "grad_norm": 0.00248121190816164, "learning_rate": 1.8941556717075205e-06, "loss": 0.0, "num_input_tokens_seen": 174361856, "step": 80865 }, { "epoch": 14.841255276197467, "grad_norm": 0.004342268221080303, "learning_rate": 1.8935281771546737e-06, "loss": 0.0001, "num_input_tokens_seen": 174372128, "step": 80870 }, { "epoch": 14.84217287575702, "grad_norm": 0.00803342554718256, "learning_rate": 1.8929007622780143e-06, "loss": 0.2601, "num_input_tokens_seen": 174382208, "step": 80875 }, { "epoch": 14.843090475316572, "grad_norm": 0.016616014763712883, "learning_rate": 1.8922734270936333e-06, "loss": 0.0004, "num_input_tokens_seen": 174392832, "step": 80880 }, { "epoch": 14.844008074876124, "grad_norm": 0.0016212888294830918, "learning_rate": 1.8916461716176237e-06, "loss": 0.0, "num_input_tokens_seen": 174404032, "step": 80885 }, { "epoch": 14.844925674435677, "grad_norm": 44.80867385864258, "learning_rate": 1.8910189958660725e-06, "loss": 0.075, "num_input_tokens_seen": 174414720, "step": 80890 }, { "epoch": 14.845843273995229, "grad_norm": 79.93672943115234, "learning_rate": 1.8903918998550659e-06, "loss": 0.2021, "num_input_tokens_seen": 174424992, "step": 80895 }, { "epoch": 14.84676087355478, "grad_norm": 0.017918463796377182, "learning_rate": 1.8897648836006854e-06, "loss": 0.0428, "num_input_tokens_seen": 174435232, "step": 80900 }, { "epoch": 14.847678473114334, "grad_norm": 0.0018964129267260432, "learning_rate": 1.889137947119017e-06, "loss": 0.0001, "num_input_tokens_seen": 174447072, "step": 80905 }, { "epoch": 14.848596072673885, "grad_norm": 0.000760586466640234, "learning_rate": 1.8885110904261389e-06, "loss": 0.0003, "num_input_tokens_seen": 174457408, "step": 80910 }, { "epoch": 14.849513672233437, "grad_norm": 0.006825397722423077, "learning_rate": 1.8878843135381293e-06, "loss": 0.1252, "num_input_tokens_seen": 174468384, "step": 80915 }, { "epoch": 14.85043127179299, "grad_norm": 0.07702498883008957, "learning_rate": 1.8872576164710633e-06, "loss": 0.0051, "num_input_tokens_seen": 174479328, "step": 80920 }, { "epoch": 14.851348871352542, "grad_norm": 0.012692131102085114, "learning_rate": 1.8866309992410137e-06, "loss": 0.0003, "num_input_tokens_seen": 174490048, "step": 80925 }, { "epoch": 14.852266470912094, "grad_norm": 0.015057254582643509, "learning_rate": 1.886004461864055e-06, "loss": 0.0045, "num_input_tokens_seen": 174500224, "step": 80930 }, { "epoch": 14.853184070471647, "grad_norm": 179.22006225585938, "learning_rate": 1.885378004356256e-06, "loss": 0.1287, "num_input_tokens_seen": 174511616, "step": 80935 }, { "epoch": 14.854101670031199, "grad_norm": 0.013825329020619392, "learning_rate": 1.884751626733684e-06, "loss": 0.0564, "num_input_tokens_seen": 174522048, "step": 80940 }, { "epoch": 14.85501926959075, "grad_norm": 0.005083946511149406, "learning_rate": 1.8841253290124022e-06, "loss": 0.0966, "num_input_tokens_seen": 174533632, "step": 80945 }, { "epoch": 14.855936869150304, "grad_norm": 0.03491273149847984, "learning_rate": 1.8834991112084788e-06, "loss": 0.0052, "num_input_tokens_seen": 174545728, "step": 80950 }, { "epoch": 14.856854468709855, "grad_norm": 0.051901210099458694, "learning_rate": 1.882872973337973e-06, "loss": 0.0008, "num_input_tokens_seen": 174555968, "step": 80955 }, { "epoch": 14.857772068269407, "grad_norm": 0.06294716149568558, "learning_rate": 1.8822469154169448e-06, "loss": 0.2345, "num_input_tokens_seen": 174567008, "step": 80960 }, { "epoch": 14.85868966782896, "grad_norm": 0.8316290378570557, "learning_rate": 1.8816209374614487e-06, "loss": 0.0001, "num_input_tokens_seen": 174577664, "step": 80965 }, { "epoch": 14.859607267388512, "grad_norm": 37.44427490234375, "learning_rate": 1.8809950394875443e-06, "loss": 0.039, "num_input_tokens_seen": 174588192, "step": 80970 }, { "epoch": 14.860524866948063, "grad_norm": 0.2366013526916504, "learning_rate": 1.8803692215112834e-06, "loss": 0.2502, "num_input_tokens_seen": 174598752, "step": 80975 }, { "epoch": 14.861442466507617, "grad_norm": 15.835186004638672, "learning_rate": 1.879743483548715e-06, "loss": 0.0026, "num_input_tokens_seen": 174608960, "step": 80980 }, { "epoch": 14.862360066067168, "grad_norm": 0.024027524515986443, "learning_rate": 1.8791178256158926e-06, "loss": 0.1475, "num_input_tokens_seen": 174619552, "step": 80985 }, { "epoch": 14.86327766562672, "grad_norm": 0.01327424868941307, "learning_rate": 1.8784922477288602e-06, "loss": 0.0002, "num_input_tokens_seen": 174631072, "step": 80990 }, { "epoch": 14.864195265186273, "grad_norm": 0.0007171907927840948, "learning_rate": 1.8778667499036624e-06, "loss": 0.3523, "num_input_tokens_seen": 174643040, "step": 80995 }, { "epoch": 14.865112864745825, "grad_norm": 0.017046572640538216, "learning_rate": 1.8772413321563453e-06, "loss": 0.0001, "num_input_tokens_seen": 174654144, "step": 81000 }, { "epoch": 14.866030464305377, "grad_norm": 0.08099707961082458, "learning_rate": 1.8766159945029482e-06, "loss": 0.0002, "num_input_tokens_seen": 174663936, "step": 81005 }, { "epoch": 14.86694806386493, "grad_norm": 169.30648803710938, "learning_rate": 1.8759907369595104e-06, "loss": 0.3722, "num_input_tokens_seen": 174676384, "step": 81010 }, { "epoch": 14.867865663424482, "grad_norm": 1.4928958415985107, "learning_rate": 1.8753655595420661e-06, "loss": 0.2705, "num_input_tokens_seen": 174685856, "step": 81015 }, { "epoch": 14.868783262984033, "grad_norm": 0.0029862436931580305, "learning_rate": 1.874740462266655e-06, "loss": 0.0001, "num_input_tokens_seen": 174696576, "step": 81020 }, { "epoch": 14.869700862543587, "grad_norm": 0.02156444638967514, "learning_rate": 1.8741154451493065e-06, "loss": 0.0025, "num_input_tokens_seen": 174708288, "step": 81025 }, { "epoch": 14.870618462103138, "grad_norm": 0.04820816591382027, "learning_rate": 1.8734905082060505e-06, "loss": 0.0122, "num_input_tokens_seen": 174718432, "step": 81030 }, { "epoch": 14.87153606166269, "grad_norm": 12.400848388671875, "learning_rate": 1.8728656514529192e-06, "loss": 0.0068, "num_input_tokens_seen": 174729056, "step": 81035 }, { "epoch": 14.872453661222243, "grad_norm": 0.013780069537460804, "learning_rate": 1.8722408749059374e-06, "loss": 0.2972, "num_input_tokens_seen": 174739168, "step": 81040 }, { "epoch": 14.873371260781795, "grad_norm": 35.01416015625, "learning_rate": 1.8716161785811277e-06, "loss": 0.3854, "num_input_tokens_seen": 174750624, "step": 81045 }, { "epoch": 14.874288860341347, "grad_norm": 0.17328476905822754, "learning_rate": 1.8709915624945163e-06, "loss": 0.1881, "num_input_tokens_seen": 174761728, "step": 81050 }, { "epoch": 14.8752064599009, "grad_norm": 0.009513073600828648, "learning_rate": 1.8703670266621222e-06, "loss": 0.0591, "num_input_tokens_seen": 174771264, "step": 81055 }, { "epoch": 14.876124059460452, "grad_norm": 0.026547783985733986, "learning_rate": 1.8697425710999628e-06, "loss": 0.0001, "num_input_tokens_seen": 174782208, "step": 81060 }, { "epoch": 14.877041659020003, "grad_norm": 0.01030850037932396, "learning_rate": 1.8691181958240534e-06, "loss": 0.001, "num_input_tokens_seen": 174792256, "step": 81065 }, { "epoch": 14.877959258579557, "grad_norm": 2.2004997730255127, "learning_rate": 1.8684939008504116e-06, "loss": 0.0538, "num_input_tokens_seen": 174803168, "step": 81070 }, { "epoch": 14.878876858139108, "grad_norm": 0.02167789824306965, "learning_rate": 1.8678696861950478e-06, "loss": 0.0, "num_input_tokens_seen": 174813280, "step": 81075 }, { "epoch": 14.87979445769866, "grad_norm": 0.033490147441625595, "learning_rate": 1.8672455518739708e-06, "loss": 0.0119, "num_input_tokens_seen": 174824160, "step": 81080 }, { "epoch": 14.880712057258213, "grad_norm": 0.0381154827773571, "learning_rate": 1.866621497903191e-06, "loss": 0.0066, "num_input_tokens_seen": 174834272, "step": 81085 }, { "epoch": 14.881629656817765, "grad_norm": 0.02015509083867073, "learning_rate": 1.8659975242987143e-06, "loss": 0.1635, "num_input_tokens_seen": 174845344, "step": 81090 }, { "epoch": 14.882547256377316, "grad_norm": 0.02135821431875229, "learning_rate": 1.8653736310765435e-06, "loss": 0.0001, "num_input_tokens_seen": 174854720, "step": 81095 }, { "epoch": 14.88346485593687, "grad_norm": 0.022366521880030632, "learning_rate": 1.864749818252679e-06, "loss": 0.0013, "num_input_tokens_seen": 174866400, "step": 81100 }, { "epoch": 14.884382455496421, "grad_norm": 0.06110639497637749, "learning_rate": 1.8641260858431243e-06, "loss": 0.0009, "num_input_tokens_seen": 174877824, "step": 81105 }, { "epoch": 14.885300055055973, "grad_norm": 0.001032395288348198, "learning_rate": 1.8635024338638758e-06, "loss": 0.0, "num_input_tokens_seen": 174887712, "step": 81110 }, { "epoch": 14.886217654615526, "grad_norm": 0.15580077469348907, "learning_rate": 1.862878862330928e-06, "loss": 0.0002, "num_input_tokens_seen": 174896768, "step": 81115 }, { "epoch": 14.887135254175078, "grad_norm": 0.5997618436813354, "learning_rate": 1.8622553712602737e-06, "loss": 0.0002, "num_input_tokens_seen": 174908448, "step": 81120 }, { "epoch": 14.88805285373463, "grad_norm": 0.0069353012368083, "learning_rate": 1.861631960667908e-06, "loss": 0.1314, "num_input_tokens_seen": 174918400, "step": 81125 }, { "epoch": 14.888970453294183, "grad_norm": 0.05123625695705414, "learning_rate": 1.8610086305698184e-06, "loss": 0.0002, "num_input_tokens_seen": 174929600, "step": 81130 }, { "epoch": 14.889888052853735, "grad_norm": 6.805443286895752, "learning_rate": 1.8603853809819927e-06, "loss": 0.001, "num_input_tokens_seen": 174940160, "step": 81135 }, { "epoch": 14.890805652413286, "grad_norm": 0.01647215522825718, "learning_rate": 1.8597622119204156e-06, "loss": 0.001, "num_input_tokens_seen": 174950688, "step": 81140 }, { "epoch": 14.89172325197284, "grad_norm": 0.031412236392498016, "learning_rate": 1.859139123401069e-06, "loss": 0.1334, "num_input_tokens_seen": 174961152, "step": 81145 }, { "epoch": 14.892640851532391, "grad_norm": 0.2237144261598587, "learning_rate": 1.8585161154399383e-06, "loss": 0.0001, "num_input_tokens_seen": 174972896, "step": 81150 }, { "epoch": 14.893558451091943, "grad_norm": 2.9082789421081543, "learning_rate": 1.8578931880529998e-06, "loss": 0.0017, "num_input_tokens_seen": 174984992, "step": 81155 }, { "epoch": 14.894476050651496, "grad_norm": 0.06823506951332092, "learning_rate": 1.857270341256232e-06, "loss": 0.0001, "num_input_tokens_seen": 174996832, "step": 81160 }, { "epoch": 14.895393650211048, "grad_norm": 0.019129689782857895, "learning_rate": 1.8566475750656066e-06, "loss": 0.0001, "num_input_tokens_seen": 175007328, "step": 81165 }, { "epoch": 14.8963112497706, "grad_norm": 27.369558334350586, "learning_rate": 1.856024889497101e-06, "loss": 0.0014, "num_input_tokens_seen": 175018848, "step": 81170 }, { "epoch": 14.897228849330153, "grad_norm": 1.1589053869247437, "learning_rate": 1.8554022845666846e-06, "loss": 0.0012, "num_input_tokens_seen": 175028352, "step": 81175 }, { "epoch": 14.898146448889705, "grad_norm": 0.011050187982618809, "learning_rate": 1.8547797602903244e-06, "loss": 0.1815, "num_input_tokens_seen": 175038496, "step": 81180 }, { "epoch": 14.899064048449256, "grad_norm": 0.001230249647051096, "learning_rate": 1.8541573166839898e-06, "loss": 0.0001, "num_input_tokens_seen": 175050080, "step": 81185 }, { "epoch": 14.89998164800881, "grad_norm": 0.18556971848011017, "learning_rate": 1.8535349537636449e-06, "loss": 0.0001, "num_input_tokens_seen": 175060544, "step": 81190 }, { "epoch": 14.900899247568361, "grad_norm": 0.07554834336042404, "learning_rate": 1.8529126715452516e-06, "loss": 0.0002, "num_input_tokens_seen": 175071776, "step": 81195 }, { "epoch": 14.901816847127913, "grad_norm": 0.005802053026854992, "learning_rate": 1.852290470044769e-06, "loss": 0.0001, "num_input_tokens_seen": 175083008, "step": 81200 }, { "epoch": 14.902734446687466, "grad_norm": 0.003069412661716342, "learning_rate": 1.851668349278159e-06, "loss": 0.0001, "num_input_tokens_seen": 175093632, "step": 81205 }, { "epoch": 14.903652046247018, "grad_norm": 0.003475917037576437, "learning_rate": 1.8510463092613767e-06, "loss": 0.1191, "num_input_tokens_seen": 175104960, "step": 81210 }, { "epoch": 14.90456964580657, "grad_norm": 0.12728358805179596, "learning_rate": 1.8504243500103742e-06, "loss": 0.1659, "num_input_tokens_seen": 175115488, "step": 81215 }, { "epoch": 14.905487245366123, "grad_norm": 0.02329355478286743, "learning_rate": 1.8498024715411073e-06, "loss": 0.0001, "num_input_tokens_seen": 175125920, "step": 81220 }, { "epoch": 14.906404844925675, "grad_norm": 0.004042404238134623, "learning_rate": 1.8491806738695245e-06, "loss": 0.0001, "num_input_tokens_seen": 175137472, "step": 81225 }, { "epoch": 14.907322444485226, "grad_norm": 0.0167683195322752, "learning_rate": 1.8485589570115748e-06, "loss": 0.0001, "num_input_tokens_seen": 175147616, "step": 81230 }, { "epoch": 14.90824004404478, "grad_norm": 0.07064495235681534, "learning_rate": 1.8479373209832013e-06, "loss": 0.0001, "num_input_tokens_seen": 175158944, "step": 81235 }, { "epoch": 14.909157643604331, "grad_norm": 0.013279655948281288, "learning_rate": 1.847315765800352e-06, "loss": 0.0001, "num_input_tokens_seen": 175169728, "step": 81240 }, { "epoch": 14.910075243163883, "grad_norm": 0.014245130121707916, "learning_rate": 1.846694291478967e-06, "loss": 0.0479, "num_input_tokens_seen": 175180960, "step": 81245 }, { "epoch": 14.910992842723436, "grad_norm": 47.50242233276367, "learning_rate": 1.8460728980349845e-06, "loss": 0.0121, "num_input_tokens_seen": 175192192, "step": 81250 }, { "epoch": 14.911910442282988, "grad_norm": 0.013608155772089958, "learning_rate": 1.8454515854843463e-06, "loss": 0.0002, "num_input_tokens_seen": 175203168, "step": 81255 }, { "epoch": 14.91282804184254, "grad_norm": 0.0005402228562161326, "learning_rate": 1.8448303538429852e-06, "loss": 0.0174, "num_input_tokens_seen": 175213088, "step": 81260 }, { "epoch": 14.913745641402093, "grad_norm": 0.15030735731124878, "learning_rate": 1.8442092031268344e-06, "loss": 0.0001, "num_input_tokens_seen": 175224416, "step": 81265 }, { "epoch": 14.914663240961644, "grad_norm": 0.006989940069615841, "learning_rate": 1.8435881333518275e-06, "loss": 0.172, "num_input_tokens_seen": 175235744, "step": 81270 }, { "epoch": 14.915580840521196, "grad_norm": 2.16206693649292, "learning_rate": 1.8429671445338938e-06, "loss": 0.0003, "num_input_tokens_seen": 175246400, "step": 81275 }, { "epoch": 14.91649844008075, "grad_norm": 0.0036726349499076605, "learning_rate": 1.8423462366889587e-06, "loss": 0.1884, "num_input_tokens_seen": 175256192, "step": 81280 }, { "epoch": 14.917416039640301, "grad_norm": 301.209228515625, "learning_rate": 1.8417254098329479e-06, "loss": 0.3786, "num_input_tokens_seen": 175267680, "step": 81285 }, { "epoch": 14.918333639199853, "grad_norm": 0.012381880544126034, "learning_rate": 1.841104663981787e-06, "loss": 0.1006, "num_input_tokens_seen": 175279872, "step": 81290 }, { "epoch": 14.919251238759406, "grad_norm": 0.055218350142240524, "learning_rate": 1.8404839991513956e-06, "loss": 0.2256, "num_input_tokens_seen": 175291072, "step": 81295 }, { "epoch": 14.920168838318958, "grad_norm": 0.012701321393251419, "learning_rate": 1.8398634153576911e-06, "loss": 0.0001, "num_input_tokens_seen": 175303040, "step": 81300 }, { "epoch": 14.92108643787851, "grad_norm": 30.104263305664062, "learning_rate": 1.839242912616594e-06, "loss": 0.245, "num_input_tokens_seen": 175314432, "step": 81305 }, { "epoch": 14.922004037438063, "grad_norm": 3.344529390335083, "learning_rate": 1.8386224909440175e-06, "loss": 0.0038, "num_input_tokens_seen": 175324224, "step": 81310 }, { "epoch": 14.922921636997614, "grad_norm": 0.09301673620939255, "learning_rate": 1.8380021503558726e-06, "loss": 0.0007, "num_input_tokens_seen": 175334048, "step": 81315 }, { "epoch": 14.923839236557166, "grad_norm": 0.004861713387072086, "learning_rate": 1.8373818908680736e-06, "loss": 0.0005, "num_input_tokens_seen": 175345248, "step": 81320 }, { "epoch": 14.92475683611672, "grad_norm": 0.04542096331715584, "learning_rate": 1.836761712496527e-06, "loss": 0.2558, "num_input_tokens_seen": 175355296, "step": 81325 }, { "epoch": 14.925674435676271, "grad_norm": 0.12649254500865936, "learning_rate": 1.8361416152571403e-06, "loss": 0.0011, "num_input_tokens_seen": 175365088, "step": 81330 }, { "epoch": 14.926592035235823, "grad_norm": 0.035011764615774155, "learning_rate": 1.8355215991658183e-06, "loss": 0.0002, "num_input_tokens_seen": 175375168, "step": 81335 }, { "epoch": 14.927509634795376, "grad_norm": 0.002407003892585635, "learning_rate": 1.8349016642384604e-06, "loss": 0.0007, "num_input_tokens_seen": 175385536, "step": 81340 }, { "epoch": 14.928427234354928, "grad_norm": 0.20268967747688293, "learning_rate": 1.834281810490971e-06, "loss": 0.0001, "num_input_tokens_seen": 175396160, "step": 81345 }, { "epoch": 14.92934483391448, "grad_norm": 0.011469388380646706, "learning_rate": 1.8336620379392466e-06, "loss": 0.0004, "num_input_tokens_seen": 175407232, "step": 81350 }, { "epoch": 14.930262433474033, "grad_norm": 0.1651850938796997, "learning_rate": 1.8330423465991843e-06, "loss": 0.0946, "num_input_tokens_seen": 175418464, "step": 81355 }, { "epoch": 14.931180033033584, "grad_norm": 312.485107421875, "learning_rate": 1.832422736486677e-06, "loss": 0.0705, "num_input_tokens_seen": 175429408, "step": 81360 }, { "epoch": 14.932097632593136, "grad_norm": 0.014427448622882366, "learning_rate": 1.8318032076176167e-06, "loss": 0.1442, "num_input_tokens_seen": 175439712, "step": 81365 }, { "epoch": 14.93301523215269, "grad_norm": 0.003895497415214777, "learning_rate": 1.831183760007893e-06, "loss": 0.1264, "num_input_tokens_seen": 175450240, "step": 81370 }, { "epoch": 14.93393283171224, "grad_norm": 0.4482825994491577, "learning_rate": 1.8305643936733959e-06, "loss": 0.0003, "num_input_tokens_seen": 175460576, "step": 81375 }, { "epoch": 14.934850431271792, "grad_norm": 0.05439264327287674, "learning_rate": 1.8299451086300102e-06, "loss": 0.0001, "num_input_tokens_seen": 175471264, "step": 81380 }, { "epoch": 14.935768030831346, "grad_norm": 0.007653628941625357, "learning_rate": 1.8293259048936174e-06, "loss": 0.2032, "num_input_tokens_seen": 175482816, "step": 81385 }, { "epoch": 14.936685630390897, "grad_norm": 0.013587367720901966, "learning_rate": 1.828706782480103e-06, "loss": 0.0001, "num_input_tokens_seen": 175494432, "step": 81390 }, { "epoch": 14.937603229950449, "grad_norm": 203.99002075195312, "learning_rate": 1.8280877414053445e-06, "loss": 0.033, "num_input_tokens_seen": 175505696, "step": 81395 }, { "epoch": 14.938520829510002, "grad_norm": 0.15091373026371002, "learning_rate": 1.827468781685217e-06, "loss": 0.0002, "num_input_tokens_seen": 175516224, "step": 81400 }, { "epoch": 14.939438429069554, "grad_norm": 1.8974112272262573, "learning_rate": 1.8268499033356007e-06, "loss": 0.0042, "num_input_tokens_seen": 175526880, "step": 81405 }, { "epoch": 14.940356028629106, "grad_norm": 24.116329193115234, "learning_rate": 1.8262311063723664e-06, "loss": 0.0908, "num_input_tokens_seen": 175537984, "step": 81410 }, { "epoch": 14.94127362818866, "grad_norm": 26.82298469543457, "learning_rate": 1.8256123908113853e-06, "loss": 0.1067, "num_input_tokens_seen": 175549056, "step": 81415 }, { "epoch": 14.94219122774821, "grad_norm": 0.038604557514190674, "learning_rate": 1.8249937566685245e-06, "loss": 0.0001, "num_input_tokens_seen": 175560448, "step": 81420 }, { "epoch": 14.943108827307762, "grad_norm": 36.390804290771484, "learning_rate": 1.824375203959655e-06, "loss": 0.1091, "num_input_tokens_seen": 175572096, "step": 81425 }, { "epoch": 14.944026426867316, "grad_norm": 0.06464243680238724, "learning_rate": 1.82375673270064e-06, "loss": 0.071, "num_input_tokens_seen": 175581344, "step": 81430 }, { "epoch": 14.944944026426867, "grad_norm": 0.13273932039737701, "learning_rate": 1.8231383429073401e-06, "loss": 0.1102, "num_input_tokens_seen": 175591168, "step": 81435 }, { "epoch": 14.945861625986419, "grad_norm": 0.3203633725643158, "learning_rate": 1.8225200345956195e-06, "loss": 0.0008, "num_input_tokens_seen": 175601792, "step": 81440 }, { "epoch": 14.946779225545972, "grad_norm": 0.03136564418673515, "learning_rate": 1.8219018077813356e-06, "loss": 0.1534, "num_input_tokens_seen": 175611872, "step": 81445 }, { "epoch": 14.947696825105524, "grad_norm": 0.09348966926336288, "learning_rate": 1.8212836624803431e-06, "loss": 0.0001, "num_input_tokens_seen": 175622720, "step": 81450 }, { "epoch": 14.948614424665076, "grad_norm": 0.018087835982441902, "learning_rate": 1.8206655987084998e-06, "loss": 0.0001, "num_input_tokens_seen": 175633920, "step": 81455 }, { "epoch": 14.949532024224629, "grad_norm": 0.003039643634110689, "learning_rate": 1.820047616481656e-06, "loss": 0.0002, "num_input_tokens_seen": 175646080, "step": 81460 }, { "epoch": 14.95044962378418, "grad_norm": 227.00599670410156, "learning_rate": 1.8194297158156627e-06, "loss": 0.2743, "num_input_tokens_seen": 175657088, "step": 81465 }, { "epoch": 14.951367223343732, "grad_norm": 0.02222646214067936, "learning_rate": 1.8188118967263657e-06, "loss": 0.0006, "num_input_tokens_seen": 175667776, "step": 81470 }, { "epoch": 14.952284822903286, "grad_norm": 168.82321166992188, "learning_rate": 1.8181941592296155e-06, "loss": 0.0176, "num_input_tokens_seen": 175678976, "step": 81475 }, { "epoch": 14.953202422462837, "grad_norm": 0.021526386961340904, "learning_rate": 1.8175765033412534e-06, "loss": 0.1876, "num_input_tokens_seen": 175689184, "step": 81480 }, { "epoch": 14.954120022022389, "grad_norm": 0.0007064045057632029, "learning_rate": 1.81695892907712e-06, "loss": 0.0147, "num_input_tokens_seen": 175699264, "step": 81485 }, { "epoch": 14.955037621581942, "grad_norm": 0.018383072689175606, "learning_rate": 1.8163414364530585e-06, "loss": 0.1432, "num_input_tokens_seen": 175710304, "step": 81490 }, { "epoch": 14.955955221141494, "grad_norm": 0.16138528287410736, "learning_rate": 1.8157240254849046e-06, "loss": 0.1471, "num_input_tokens_seen": 175720000, "step": 81495 }, { "epoch": 14.956872820701046, "grad_norm": 0.14345206320285797, "learning_rate": 1.8151066961884927e-06, "loss": 0.1005, "num_input_tokens_seen": 175731488, "step": 81500 }, { "epoch": 14.957790420260599, "grad_norm": 0.031313035637140274, "learning_rate": 1.81448944857966e-06, "loss": 0.0003, "num_input_tokens_seen": 175742560, "step": 81505 }, { "epoch": 14.95870801982015, "grad_norm": 0.8693034052848816, "learning_rate": 1.8138722826742356e-06, "loss": 0.002, "num_input_tokens_seen": 175752544, "step": 81510 }, { "epoch": 14.959625619379702, "grad_norm": 0.13679227232933044, "learning_rate": 1.8132551984880491e-06, "loss": 0.0001, "num_input_tokens_seen": 175764288, "step": 81515 }, { "epoch": 14.960543218939256, "grad_norm": 0.003028841456398368, "learning_rate": 1.812638196036926e-06, "loss": 0.0008, "num_input_tokens_seen": 175774240, "step": 81520 }, { "epoch": 14.961460818498807, "grad_norm": 2.17417049407959, "learning_rate": 1.812021275336695e-06, "loss": 0.0012, "num_input_tokens_seen": 175786240, "step": 81525 }, { "epoch": 14.962378418058359, "grad_norm": 0.03232314810156822, "learning_rate": 1.8114044364031774e-06, "loss": 0.0002, "num_input_tokens_seen": 175797408, "step": 81530 }, { "epoch": 14.963296017617912, "grad_norm": 0.005227565765380859, "learning_rate": 1.8107876792521928e-06, "loss": 0.0, "num_input_tokens_seen": 175808640, "step": 81535 }, { "epoch": 14.964213617177464, "grad_norm": 1.8712899684906006, "learning_rate": 1.8101710038995623e-06, "loss": 0.0006, "num_input_tokens_seen": 175820352, "step": 81540 }, { "epoch": 14.965131216737015, "grad_norm": 0.30566853284835815, "learning_rate": 1.8095544103611024e-06, "loss": 0.236, "num_input_tokens_seen": 175831488, "step": 81545 }, { "epoch": 14.966048816296569, "grad_norm": 116.92521667480469, "learning_rate": 1.8089378986526268e-06, "loss": 0.6585, "num_input_tokens_seen": 175841728, "step": 81550 }, { "epoch": 14.96696641585612, "grad_norm": 0.015856605023145676, "learning_rate": 1.8083214687899487e-06, "loss": 0.1409, "num_input_tokens_seen": 175851584, "step": 81555 }, { "epoch": 14.967884015415672, "grad_norm": 0.13787296414375305, "learning_rate": 1.807705120788878e-06, "loss": 0.1253, "num_input_tokens_seen": 175862720, "step": 81560 }, { "epoch": 14.968801614975225, "grad_norm": 0.16009333729743958, "learning_rate": 1.8070888546652216e-06, "loss": 0.0038, "num_input_tokens_seen": 175873568, "step": 81565 }, { "epoch": 14.969719214534777, "grad_norm": 0.0027150234673172235, "learning_rate": 1.806472670434789e-06, "loss": 0.0004, "num_input_tokens_seen": 175885344, "step": 81570 }, { "epoch": 14.970636814094329, "grad_norm": 0.7859739661216736, "learning_rate": 1.8058565681133833e-06, "loss": 0.3236, "num_input_tokens_seen": 175898304, "step": 81575 }, { "epoch": 14.971554413653882, "grad_norm": 0.02437593974173069, "learning_rate": 1.8052405477168062e-06, "loss": 0.313, "num_input_tokens_seen": 175909504, "step": 81580 }, { "epoch": 14.972472013213434, "grad_norm": 0.16081929206848145, "learning_rate": 1.8046246092608555e-06, "loss": 0.1221, "num_input_tokens_seen": 175921248, "step": 81585 }, { "epoch": 14.973389612772985, "grad_norm": 0.04818464070558548, "learning_rate": 1.8040087527613331e-06, "loss": 0.0017, "num_input_tokens_seen": 175932480, "step": 81590 }, { "epoch": 14.974307212332539, "grad_norm": 0.0016174328047782183, "learning_rate": 1.8033929782340332e-06, "loss": 0.0001, "num_input_tokens_seen": 175943168, "step": 81595 }, { "epoch": 14.97522481189209, "grad_norm": 0.014683759771287441, "learning_rate": 1.802777285694749e-06, "loss": 0.0002, "num_input_tokens_seen": 175954688, "step": 81600 }, { "epoch": 14.976142411451642, "grad_norm": 0.005659925751388073, "learning_rate": 1.8021616751592702e-06, "loss": 0.0001, "num_input_tokens_seen": 175963680, "step": 81605 }, { "epoch": 14.977060011011195, "grad_norm": 0.015755577012896538, "learning_rate": 1.8015461466433898e-06, "loss": 0.0001, "num_input_tokens_seen": 175974016, "step": 81610 }, { "epoch": 14.977977610570747, "grad_norm": 1.0942455530166626, "learning_rate": 1.800930700162894e-06, "loss": 0.0002, "num_input_tokens_seen": 175984608, "step": 81615 }, { "epoch": 14.978895210130299, "grad_norm": 0.19695550203323364, "learning_rate": 1.8003153357335657e-06, "loss": 0.0591, "num_input_tokens_seen": 175995648, "step": 81620 }, { "epoch": 14.979812809689852, "grad_norm": 0.009117516689002514, "learning_rate": 1.7997000533711916e-06, "loss": 0.0002, "num_input_tokens_seen": 176007104, "step": 81625 }, { "epoch": 14.980730409249404, "grad_norm": 265.6150817871094, "learning_rate": 1.7990848530915512e-06, "loss": 0.0284, "num_input_tokens_seen": 176017536, "step": 81630 }, { "epoch": 14.981648008808955, "grad_norm": 0.03281928598880768, "learning_rate": 1.7984697349104218e-06, "loss": 0.2626, "num_input_tokens_seen": 176029216, "step": 81635 }, { "epoch": 14.982565608368509, "grad_norm": 2.959486246109009, "learning_rate": 1.7978546988435836e-06, "loss": 0.0706, "num_input_tokens_seen": 176037696, "step": 81640 }, { "epoch": 14.98348320792806, "grad_norm": 1.553670883178711, "learning_rate": 1.797239744906809e-06, "loss": 0.0006, "num_input_tokens_seen": 176048320, "step": 81645 }, { "epoch": 14.984400807487612, "grad_norm": 0.01402998249977827, "learning_rate": 1.7966248731158714e-06, "loss": 0.0823, "num_input_tokens_seen": 176058624, "step": 81650 }, { "epoch": 14.985318407047165, "grad_norm": 0.07983066886663437, "learning_rate": 1.7960100834865396e-06, "loss": 0.0383, "num_input_tokens_seen": 176069888, "step": 81655 }, { "epoch": 14.986236006606717, "grad_norm": 0.5987728834152222, "learning_rate": 1.795395376034585e-06, "loss": 0.0003, "num_input_tokens_seen": 176081376, "step": 81660 }, { "epoch": 14.987153606166268, "grad_norm": 0.022701075300574303, "learning_rate": 1.794780750775772e-06, "loss": 0.088, "num_input_tokens_seen": 176092256, "step": 81665 }, { "epoch": 14.988071205725822, "grad_norm": 20.99880027770996, "learning_rate": 1.7941662077258632e-06, "loss": 0.2045, "num_input_tokens_seen": 176103808, "step": 81670 }, { "epoch": 14.988988805285373, "grad_norm": 0.004097421653568745, "learning_rate": 1.7935517469006247e-06, "loss": 0.0003, "num_input_tokens_seen": 176115104, "step": 81675 }, { "epoch": 14.989906404844925, "grad_norm": 0.007753198035061359, "learning_rate": 1.7929373683158142e-06, "loss": 0.0, "num_input_tokens_seen": 176125760, "step": 81680 }, { "epoch": 14.990824004404478, "grad_norm": 0.002148024272173643, "learning_rate": 1.7923230719871897e-06, "loss": 0.0763, "num_input_tokens_seen": 176137440, "step": 81685 }, { "epoch": 14.99174160396403, "grad_norm": 0.13465234637260437, "learning_rate": 1.791708857930506e-06, "loss": 0.0002, "num_input_tokens_seen": 176147360, "step": 81690 }, { "epoch": 14.992659203523582, "grad_norm": 0.0050082881934940815, "learning_rate": 1.7910947261615186e-06, "loss": 0.0146, "num_input_tokens_seen": 176158656, "step": 81695 }, { "epoch": 14.993576803083135, "grad_norm": 0.005715776700526476, "learning_rate": 1.7904806766959782e-06, "loss": 0.0001, "num_input_tokens_seen": 176169632, "step": 81700 }, { "epoch": 14.994494402642687, "grad_norm": 0.0007603882695548236, "learning_rate": 1.7898667095496325e-06, "loss": 0.0027, "num_input_tokens_seen": 176181728, "step": 81705 }, { "epoch": 14.995412002202238, "grad_norm": 0.38031941652297974, "learning_rate": 1.7892528247382317e-06, "loss": 0.0247, "num_input_tokens_seen": 176192480, "step": 81710 }, { "epoch": 14.996329601761792, "grad_norm": 105.67337036132812, "learning_rate": 1.7886390222775202e-06, "loss": 0.2291, "num_input_tokens_seen": 176202016, "step": 81715 }, { "epoch": 14.997247201321343, "grad_norm": 0.27232110500335693, "learning_rate": 1.7880253021832388e-06, "loss": 0.0332, "num_input_tokens_seen": 176213152, "step": 81720 }, { "epoch": 14.998164800880895, "grad_norm": 5.654861927032471, "learning_rate": 1.7874116644711326e-06, "loss": 0.1585, "num_input_tokens_seen": 176224800, "step": 81725 }, { "epoch": 14.999082400440448, "grad_norm": 0.013957207091152668, "learning_rate": 1.7867981091569374e-06, "loss": 0.0, "num_input_tokens_seen": 176235936, "step": 81730 }, { "epoch": 15.0, "grad_norm": 0.011458776891231537, "learning_rate": 1.786184636256391e-06, "loss": 0.1007, "num_input_tokens_seen": 176244928, "step": 81735 }, { "epoch": 15.000917599559552, "grad_norm": 0.001969096018001437, "learning_rate": 1.7855712457852259e-06, "loss": 0.0001, "num_input_tokens_seen": 176255392, "step": 81740 }, { "epoch": 15.001835199119105, "grad_norm": 0.08543572574853897, "learning_rate": 1.784957937759178e-06, "loss": 0.0914, "num_input_tokens_seen": 176266080, "step": 81745 }, { "epoch": 15.002752798678657, "grad_norm": 0.015937993302941322, "learning_rate": 1.7843447121939767e-06, "loss": 0.0001, "num_input_tokens_seen": 176276032, "step": 81750 }, { "epoch": 15.003670398238208, "grad_norm": 0.006449286825954914, "learning_rate": 1.7837315691053474e-06, "loss": 0.1847, "num_input_tokens_seen": 176286656, "step": 81755 }, { "epoch": 15.004587997797762, "grad_norm": 0.08137983083724976, "learning_rate": 1.7831185085090201e-06, "loss": 0.0001, "num_input_tokens_seen": 176298272, "step": 81760 }, { "epoch": 15.005505597357313, "grad_norm": 0.0019117360934615135, "learning_rate": 1.7825055304207183e-06, "loss": 0.0002, "num_input_tokens_seen": 176308384, "step": 81765 }, { "epoch": 15.006423196916865, "grad_norm": 0.016254596412181854, "learning_rate": 1.781892634856162e-06, "loss": 0.0043, "num_input_tokens_seen": 176318432, "step": 81770 }, { "epoch": 15.007340796476418, "grad_norm": 0.011464547365903854, "learning_rate": 1.781279821831073e-06, "loss": 0.0004, "num_input_tokens_seen": 176329376, "step": 81775 }, { "epoch": 15.00825839603597, "grad_norm": 0.017884008586406708, "learning_rate": 1.7806670913611673e-06, "loss": 0.0008, "num_input_tokens_seen": 176339744, "step": 81780 }, { "epoch": 15.009175995595522, "grad_norm": 0.0473506785929203, "learning_rate": 1.7800544434621597e-06, "loss": 0.1321, "num_input_tokens_seen": 176351104, "step": 81785 }, { "epoch": 15.010093595155075, "grad_norm": 0.007694317027926445, "learning_rate": 1.7794418781497668e-06, "loss": 0.0, "num_input_tokens_seen": 176362240, "step": 81790 }, { "epoch": 15.011011194714627, "grad_norm": 0.004312656819820404, "learning_rate": 1.778829395439698e-06, "loss": 0.1318, "num_input_tokens_seen": 176372256, "step": 81795 }, { "epoch": 15.011928794274178, "grad_norm": 59.479087829589844, "learning_rate": 1.778216995347663e-06, "loss": 0.0535, "num_input_tokens_seen": 176384224, "step": 81800 }, { "epoch": 15.012846393833732, "grad_norm": 0.017960580065846443, "learning_rate": 1.7776046778893675e-06, "loss": 0.0002, "num_input_tokens_seen": 176394240, "step": 81805 }, { "epoch": 15.013763993393283, "grad_norm": 0.010684146545827389, "learning_rate": 1.776992443080519e-06, "loss": 0.0003, "num_input_tokens_seen": 176404384, "step": 81810 }, { "epoch": 15.014681592952835, "grad_norm": 0.08775539696216583, "learning_rate": 1.7763802909368194e-06, "loss": 0.0001, "num_input_tokens_seen": 176416352, "step": 81815 }, { "epoch": 15.015599192512388, "grad_norm": 0.02416408434510231, "learning_rate": 1.7757682214739692e-06, "loss": 0.0002, "num_input_tokens_seen": 176426528, "step": 81820 }, { "epoch": 15.01651679207194, "grad_norm": 0.004017157014459372, "learning_rate": 1.7751562347076651e-06, "loss": 0.0001, "num_input_tokens_seen": 176436512, "step": 81825 }, { "epoch": 15.017434391631491, "grad_norm": 0.009088202379643917, "learning_rate": 1.7745443306536075e-06, "loss": 0.0004, "num_input_tokens_seen": 176445440, "step": 81830 }, { "epoch": 15.018351991191045, "grad_norm": 0.06512060761451721, "learning_rate": 1.7739325093274883e-06, "loss": 0.0001, "num_input_tokens_seen": 176455584, "step": 81835 }, { "epoch": 15.019269590750596, "grad_norm": 0.0023829538840800524, "learning_rate": 1.773320770744999e-06, "loss": 0.0, "num_input_tokens_seen": 176466944, "step": 81840 }, { "epoch": 15.020187190310148, "grad_norm": 0.005112932063639164, "learning_rate": 1.7727091149218327e-06, "loss": 0.0021, "num_input_tokens_seen": 176476704, "step": 81845 }, { "epoch": 15.021104789869701, "grad_norm": 0.02301356941461563, "learning_rate": 1.7720975418736758e-06, "loss": 0.0001, "num_input_tokens_seen": 176487968, "step": 81850 }, { "epoch": 15.022022389429253, "grad_norm": 0.1981753557920456, "learning_rate": 1.7714860516162125e-06, "loss": 0.0026, "num_input_tokens_seen": 176499488, "step": 81855 }, { "epoch": 15.022939988988805, "grad_norm": 0.181207537651062, "learning_rate": 1.7708746441651293e-06, "loss": 0.0147, "num_input_tokens_seen": 176510944, "step": 81860 }, { "epoch": 15.023857588548358, "grad_norm": 0.0021527912467718124, "learning_rate": 1.7702633195361073e-06, "loss": 0.0097, "num_input_tokens_seen": 176521792, "step": 81865 }, { "epoch": 15.02477518810791, "grad_norm": 0.0011803907109424472, "learning_rate": 1.7696520777448256e-06, "loss": 0.3473, "num_input_tokens_seen": 176532320, "step": 81870 }, { "epoch": 15.025692787667461, "grad_norm": 0.018275829032063484, "learning_rate": 1.7690409188069595e-06, "loss": 0.0, "num_input_tokens_seen": 176541696, "step": 81875 }, { "epoch": 15.026610387227015, "grad_norm": 0.7244197130203247, "learning_rate": 1.7684298427381885e-06, "loss": 0.0001, "num_input_tokens_seen": 176553824, "step": 81880 }, { "epoch": 15.027527986786566, "grad_norm": 39.620819091796875, "learning_rate": 1.767818849554183e-06, "loss": 0.0244, "num_input_tokens_seen": 176564448, "step": 81885 }, { "epoch": 15.028445586346118, "grad_norm": 0.0665140375494957, "learning_rate": 1.7672079392706132e-06, "loss": 0.0646, "num_input_tokens_seen": 176574976, "step": 81890 }, { "epoch": 15.029363185905671, "grad_norm": 0.16234351694583893, "learning_rate": 1.7665971119031512e-06, "loss": 0.0001, "num_input_tokens_seen": 176585440, "step": 81895 }, { "epoch": 15.030280785465223, "grad_norm": 0.0020179457496851683, "learning_rate": 1.7659863674674615e-06, "loss": 0.0005, "num_input_tokens_seen": 176596096, "step": 81900 }, { "epoch": 15.031198385024775, "grad_norm": 74.86865997314453, "learning_rate": 1.7653757059792081e-06, "loss": 0.0329, "num_input_tokens_seen": 176605952, "step": 81905 }, { "epoch": 15.032115984584328, "grad_norm": 0.009983916766941547, "learning_rate": 1.764765127454056e-06, "loss": 0.0, "num_input_tokens_seen": 176615744, "step": 81910 }, { "epoch": 15.03303358414388, "grad_norm": 0.002913109725341201, "learning_rate": 1.764154631907664e-06, "loss": 0.0, "num_input_tokens_seen": 176625984, "step": 81915 }, { "epoch": 15.033951183703431, "grad_norm": 0.023464297875761986, "learning_rate": 1.7635442193556913e-06, "loss": 0.3251, "num_input_tokens_seen": 176636384, "step": 81920 }, { "epoch": 15.034868783262985, "grad_norm": 0.0062195900827646255, "learning_rate": 1.762933889813791e-06, "loss": 0.0001, "num_input_tokens_seen": 176648128, "step": 81925 }, { "epoch": 15.035786382822536, "grad_norm": 0.13122498989105225, "learning_rate": 1.7623236432976209e-06, "loss": 0.0002, "num_input_tokens_seen": 176658912, "step": 81930 }, { "epoch": 15.036703982382088, "grad_norm": 0.02392839640378952, "learning_rate": 1.7617134798228318e-06, "loss": 0.1036, "num_input_tokens_seen": 176668704, "step": 81935 }, { "epoch": 15.037621581941641, "grad_norm": 0.01174025610089302, "learning_rate": 1.7611033994050714e-06, "loss": 0.0001, "num_input_tokens_seen": 176679328, "step": 81940 }, { "epoch": 15.038539181501193, "grad_norm": 0.05315907299518585, "learning_rate": 1.7604934020599906e-06, "loss": 0.0001, "num_input_tokens_seen": 176690432, "step": 81945 }, { "epoch": 15.039456781060744, "grad_norm": 0.04584990069270134, "learning_rate": 1.7598834878032333e-06, "loss": 0.0002, "num_input_tokens_seen": 176701664, "step": 81950 }, { "epoch": 15.040374380620298, "grad_norm": 0.019989043474197388, "learning_rate": 1.7592736566504414e-06, "loss": 0.0, "num_input_tokens_seen": 176712768, "step": 81955 }, { "epoch": 15.04129198017985, "grad_norm": 0.05601102113723755, "learning_rate": 1.7586639086172585e-06, "loss": 0.0007, "num_input_tokens_seen": 176722944, "step": 81960 }, { "epoch": 15.042209579739401, "grad_norm": 0.011150919832289219, "learning_rate": 1.7580542437193231e-06, "loss": 0.0001, "num_input_tokens_seen": 176734560, "step": 81965 }, { "epoch": 15.043127179298954, "grad_norm": 0.02354290895164013, "learning_rate": 1.7574446619722723e-06, "loss": 0.0001, "num_input_tokens_seen": 176743744, "step": 81970 }, { "epoch": 15.044044778858506, "grad_norm": 0.23160311579704285, "learning_rate": 1.7568351633917396e-06, "loss": 0.0884, "num_input_tokens_seen": 176754816, "step": 81975 }, { "epoch": 15.044962378418058, "grad_norm": 0.2008069008588791, "learning_rate": 1.7562257479933576e-06, "loss": 0.0003, "num_input_tokens_seen": 176766432, "step": 81980 }, { "epoch": 15.045879977977611, "grad_norm": 0.46990033984184265, "learning_rate": 1.7556164157927586e-06, "loss": 0.0885, "num_input_tokens_seen": 176778016, "step": 81985 }, { "epoch": 15.046797577537163, "grad_norm": 0.008240148425102234, "learning_rate": 1.7550071668055708e-06, "loss": 0.1471, "num_input_tokens_seen": 176789280, "step": 81990 }, { "epoch": 15.047715177096714, "grad_norm": 257.72540283203125, "learning_rate": 1.7543980010474198e-06, "loss": 0.0378, "num_input_tokens_seen": 176800192, "step": 81995 }, { "epoch": 15.048632776656268, "grad_norm": 0.0051232390105724335, "learning_rate": 1.7537889185339296e-06, "loss": 0.0, "num_input_tokens_seen": 176811680, "step": 82000 }, { "epoch": 15.04955037621582, "grad_norm": 59.359561920166016, "learning_rate": 1.7531799192807208e-06, "loss": 0.1707, "num_input_tokens_seen": 176822400, "step": 82005 }, { "epoch": 15.050467975775371, "grad_norm": 0.00831909291446209, "learning_rate": 1.752571003303417e-06, "loss": 0.0, "num_input_tokens_seen": 176834464, "step": 82010 }, { "epoch": 15.051385575334924, "grad_norm": 0.06599006801843643, "learning_rate": 1.7519621706176337e-06, "loss": 0.0001, "num_input_tokens_seen": 176845504, "step": 82015 }, { "epoch": 15.052303174894476, "grad_norm": 0.01798744685947895, "learning_rate": 1.7513534212389865e-06, "loss": 0.0001, "num_input_tokens_seen": 176856256, "step": 82020 }, { "epoch": 15.053220774454028, "grad_norm": 2.4695944786071777, "learning_rate": 1.7507447551830875e-06, "loss": 0.001, "num_input_tokens_seen": 176868416, "step": 82025 }, { "epoch": 15.054138374013581, "grad_norm": 0.028471140190958977, "learning_rate": 1.7501361724655519e-06, "loss": 0.0005, "num_input_tokens_seen": 176879744, "step": 82030 }, { "epoch": 15.055055973573133, "grad_norm": 0.09612461924552917, "learning_rate": 1.7495276731019862e-06, "loss": 0.0001, "num_input_tokens_seen": 176890496, "step": 82035 }, { "epoch": 15.055973573132684, "grad_norm": 1.6900707483291626, "learning_rate": 1.748919257107996e-06, "loss": 0.0003, "num_input_tokens_seen": 176900672, "step": 82040 }, { "epoch": 15.056891172692238, "grad_norm": 0.008488933555781841, "learning_rate": 1.7483109244991896e-06, "loss": 0.1073, "num_input_tokens_seen": 176911424, "step": 82045 }, { "epoch": 15.05780877225179, "grad_norm": 0.0016580341616645455, "learning_rate": 1.7477026752911691e-06, "loss": 0.0001, "num_input_tokens_seen": 176922272, "step": 82050 }, { "epoch": 15.05872637181134, "grad_norm": 0.017124278470873833, "learning_rate": 1.747094509499534e-06, "loss": 0.0051, "num_input_tokens_seen": 176934368, "step": 82055 }, { "epoch": 15.059643971370894, "grad_norm": 0.20790599286556244, "learning_rate": 1.746486427139882e-06, "loss": 0.0001, "num_input_tokens_seen": 176945376, "step": 82060 }, { "epoch": 15.060561570930446, "grad_norm": 25.986482620239258, "learning_rate": 1.7458784282278112e-06, "loss": 0.0703, "num_input_tokens_seen": 176957120, "step": 82065 }, { "epoch": 15.061479170489998, "grad_norm": 0.0045522041618824005, "learning_rate": 1.745270512778916e-06, "loss": 0.0001, "num_input_tokens_seen": 176969248, "step": 82070 }, { "epoch": 15.062396770049551, "grad_norm": 0.14404307305812836, "learning_rate": 1.7446626808087864e-06, "loss": 0.1253, "num_input_tokens_seen": 176979744, "step": 82075 }, { "epoch": 15.063314369609103, "grad_norm": 0.010831811465322971, "learning_rate": 1.7440549323330148e-06, "loss": 0.0001, "num_input_tokens_seen": 176990464, "step": 82080 }, { "epoch": 15.064231969168654, "grad_norm": 0.005903265904635191, "learning_rate": 1.743447267367188e-06, "loss": 0.0, "num_input_tokens_seen": 177001504, "step": 82085 }, { "epoch": 15.065149568728208, "grad_norm": 0.013157490640878677, "learning_rate": 1.7428396859268903e-06, "loss": 0.0598, "num_input_tokens_seen": 177012320, "step": 82090 }, { "epoch": 15.06606716828776, "grad_norm": 40.59374237060547, "learning_rate": 1.7422321880277082e-06, "loss": 0.2759, "num_input_tokens_seen": 177021888, "step": 82095 }, { "epoch": 15.06698476784731, "grad_norm": 0.01959998719394207, "learning_rate": 1.741624773685221e-06, "loss": 0.0001, "num_input_tokens_seen": 177033152, "step": 82100 }, { "epoch": 15.067902367406864, "grad_norm": 0.004158825147897005, "learning_rate": 1.7410174429150085e-06, "loss": 0.0057, "num_input_tokens_seen": 177044384, "step": 82105 }, { "epoch": 15.068819966966416, "grad_norm": 0.0026063602417707443, "learning_rate": 1.7404101957326457e-06, "loss": 0.0002, "num_input_tokens_seen": 177055712, "step": 82110 }, { "epoch": 15.069737566525967, "grad_norm": 0.033672623336315155, "learning_rate": 1.7398030321537117e-06, "loss": 0.0001, "num_input_tokens_seen": 177066304, "step": 82115 }, { "epoch": 15.07065516608552, "grad_norm": 1.4950602054595947, "learning_rate": 1.7391959521937767e-06, "loss": 0.0003, "num_input_tokens_seen": 177076672, "step": 82120 }, { "epoch": 15.071572765645072, "grad_norm": 0.034370847046375275, "learning_rate": 1.73858895586841e-06, "loss": 0.0001, "num_input_tokens_seen": 177086560, "step": 82125 }, { "epoch": 15.072490365204624, "grad_norm": 0.029060687869787216, "learning_rate": 1.7379820431931838e-06, "loss": 0.0, "num_input_tokens_seen": 177096256, "step": 82130 }, { "epoch": 15.073407964764177, "grad_norm": 0.004537500906735659, "learning_rate": 1.7373752141836625e-06, "loss": 0.0002, "num_input_tokens_seen": 177108128, "step": 82135 }, { "epoch": 15.074325564323729, "grad_norm": 5.022153377532959, "learning_rate": 1.7367684688554103e-06, "loss": 0.111, "num_input_tokens_seen": 177118688, "step": 82140 }, { "epoch": 15.07524316388328, "grad_norm": 0.004528902471065521, "learning_rate": 1.7361618072239877e-06, "loss": 0.0001, "num_input_tokens_seen": 177129792, "step": 82145 }, { "epoch": 15.076160763442834, "grad_norm": 0.0012386500602588058, "learning_rate": 1.7355552293049578e-06, "loss": 0.0001, "num_input_tokens_seen": 177140480, "step": 82150 }, { "epoch": 15.077078363002386, "grad_norm": 0.011922472156584263, "learning_rate": 1.7349487351138766e-06, "loss": 0.0001, "num_input_tokens_seen": 177152352, "step": 82155 }, { "epoch": 15.077995962561937, "grad_norm": 0.004473143257200718, "learning_rate": 1.7343423246662988e-06, "loss": 0.0001, "num_input_tokens_seen": 177165120, "step": 82160 }, { "epoch": 15.07891356212149, "grad_norm": 0.003551177680492401, "learning_rate": 1.7337359979777802e-06, "loss": 0.0001, "num_input_tokens_seen": 177176736, "step": 82165 }, { "epoch": 15.079831161681042, "grad_norm": 63.38740539550781, "learning_rate": 1.7331297550638714e-06, "loss": 0.2862, "num_input_tokens_seen": 177186688, "step": 82170 }, { "epoch": 15.080748761240594, "grad_norm": 0.0016298528062179685, "learning_rate": 1.7325235959401194e-06, "loss": 0.0041, "num_input_tokens_seen": 177197280, "step": 82175 }, { "epoch": 15.081666360800147, "grad_norm": 0.049790158867836, "learning_rate": 1.7319175206220745e-06, "loss": 0.0001, "num_input_tokens_seen": 177208384, "step": 82180 }, { "epoch": 15.082583960359699, "grad_norm": 0.001825148006901145, "learning_rate": 1.7313115291252809e-06, "loss": 0.0969, "num_input_tokens_seen": 177218624, "step": 82185 }, { "epoch": 15.08350155991925, "grad_norm": 0.07169400155544281, "learning_rate": 1.7307056214652796e-06, "loss": 0.0002, "num_input_tokens_seen": 177229696, "step": 82190 }, { "epoch": 15.084419159478804, "grad_norm": 0.052027057856321335, "learning_rate": 1.7300997976576128e-06, "loss": 0.0001, "num_input_tokens_seen": 177240480, "step": 82195 }, { "epoch": 15.085336759038356, "grad_norm": 106.1734390258789, "learning_rate": 1.7294940577178164e-06, "loss": 0.1483, "num_input_tokens_seen": 177250592, "step": 82200 }, { "epoch": 15.086254358597907, "grad_norm": 0.5730850696563721, "learning_rate": 1.7288884016614305e-06, "loss": 0.0002, "num_input_tokens_seen": 177261440, "step": 82205 }, { "epoch": 15.08717195815746, "grad_norm": 7.978527069091797, "learning_rate": 1.7282828295039866e-06, "loss": 0.0016, "num_input_tokens_seen": 177272992, "step": 82210 }, { "epoch": 15.088089557717012, "grad_norm": 0.12767039239406586, "learning_rate": 1.7276773412610181e-06, "loss": 0.0002, "num_input_tokens_seen": 177282336, "step": 82215 }, { "epoch": 15.089007157276564, "grad_norm": 0.027448084205389023, "learning_rate": 1.7270719369480543e-06, "loss": 0.001, "num_input_tokens_seen": 177293280, "step": 82220 }, { "epoch": 15.089924756836117, "grad_norm": 0.021584009751677513, "learning_rate": 1.7264666165806205e-06, "loss": 0.0002, "num_input_tokens_seen": 177303776, "step": 82225 }, { "epoch": 15.090842356395669, "grad_norm": 0.000859816384036094, "learning_rate": 1.7258613801742463e-06, "loss": 0.0, "num_input_tokens_seen": 177314592, "step": 82230 }, { "epoch": 15.09175995595522, "grad_norm": 0.2596411108970642, "learning_rate": 1.7252562277444534e-06, "loss": 0.0002, "num_input_tokens_seen": 177326528, "step": 82235 }, { "epoch": 15.092677555514774, "grad_norm": 0.06807710230350494, "learning_rate": 1.7246511593067627e-06, "loss": 0.1689, "num_input_tokens_seen": 177337920, "step": 82240 }, { "epoch": 15.093595155074325, "grad_norm": 1.5803173780441284, "learning_rate": 1.7240461748766917e-06, "loss": 0.0009, "num_input_tokens_seen": 177348384, "step": 82245 }, { "epoch": 15.094512754633877, "grad_norm": 0.013936417177319527, "learning_rate": 1.723441274469761e-06, "loss": 0.0025, "num_input_tokens_seen": 177358784, "step": 82250 }, { "epoch": 15.09543035419343, "grad_norm": 0.001316092791967094, "learning_rate": 1.7228364581014834e-06, "loss": 0.1222, "num_input_tokens_seen": 177369216, "step": 82255 }, { "epoch": 15.096347953752982, "grad_norm": 0.9815775156021118, "learning_rate": 1.72223172578737e-06, "loss": 0.0427, "num_input_tokens_seen": 177379520, "step": 82260 }, { "epoch": 15.097265553312534, "grad_norm": 0.0007931165164336562, "learning_rate": 1.721627077542934e-06, "loss": 0.1068, "num_input_tokens_seen": 177390848, "step": 82265 }, { "epoch": 15.098183152872087, "grad_norm": 0.4850887656211853, "learning_rate": 1.7210225133836828e-06, "loss": 0.0375, "num_input_tokens_seen": 177401920, "step": 82270 }, { "epoch": 15.099100752431639, "grad_norm": 0.3114309012889862, "learning_rate": 1.720418033325122e-06, "loss": 0.0002, "num_input_tokens_seen": 177411136, "step": 82275 }, { "epoch": 15.10001835199119, "grad_norm": 0.002528927056118846, "learning_rate": 1.719813637382754e-06, "loss": 0.0209, "num_input_tokens_seen": 177422304, "step": 82280 }, { "epoch": 15.100935951550744, "grad_norm": 0.06245645880699158, "learning_rate": 1.7192093255720838e-06, "loss": 0.0001, "num_input_tokens_seen": 177431744, "step": 82285 }, { "epoch": 15.101853551110295, "grad_norm": 0.7227208018302917, "learning_rate": 1.71860509790861e-06, "loss": 0.0042, "num_input_tokens_seen": 177442720, "step": 82290 }, { "epoch": 15.102771150669847, "grad_norm": 0.005265285260975361, "learning_rate": 1.718000954407828e-06, "loss": 0.0001, "num_input_tokens_seen": 177454080, "step": 82295 }, { "epoch": 15.1036887502294, "grad_norm": 94.60623168945312, "learning_rate": 1.7173968950852366e-06, "loss": 0.0079, "num_input_tokens_seen": 177464800, "step": 82300 }, { "epoch": 15.104606349788952, "grad_norm": 0.0016141713131219149, "learning_rate": 1.7167929199563272e-06, "loss": 0.0063, "num_input_tokens_seen": 177475968, "step": 82305 }, { "epoch": 15.105523949348504, "grad_norm": 0.005532616283744574, "learning_rate": 1.716189029036589e-06, "loss": 0.0455, "num_input_tokens_seen": 177487456, "step": 82310 }, { "epoch": 15.106441548908057, "grad_norm": 0.018506942316889763, "learning_rate": 1.715585222341515e-06, "loss": 0.0207, "num_input_tokens_seen": 177497152, "step": 82315 }, { "epoch": 15.107359148467609, "grad_norm": 0.014367532916367054, "learning_rate": 1.7149814998865894e-06, "loss": 0.0001, "num_input_tokens_seen": 177507616, "step": 82320 }, { "epoch": 15.10827674802716, "grad_norm": 0.0018104311311617494, "learning_rate": 1.7143778616872968e-06, "loss": 0.0, "num_input_tokens_seen": 177519264, "step": 82325 }, { "epoch": 15.109194347586714, "grad_norm": 0.033486369997262955, "learning_rate": 1.7137743077591184e-06, "loss": 0.0428, "num_input_tokens_seen": 177529312, "step": 82330 }, { "epoch": 15.110111947146265, "grad_norm": 0.07297026365995407, "learning_rate": 1.713170838117537e-06, "loss": 0.2225, "num_input_tokens_seen": 177540800, "step": 82335 }, { "epoch": 15.111029546705817, "grad_norm": 0.014484788291156292, "learning_rate": 1.71256745277803e-06, "loss": 0.0001, "num_input_tokens_seen": 177552096, "step": 82340 }, { "epoch": 15.11194714626537, "grad_norm": 0.008549731224775314, "learning_rate": 1.7119641517560709e-06, "loss": 0.1005, "num_input_tokens_seen": 177563904, "step": 82345 }, { "epoch": 15.112864745824922, "grad_norm": 0.003186563029885292, "learning_rate": 1.7113609350671372e-06, "loss": 0.0, "num_input_tokens_seen": 177575104, "step": 82350 }, { "epoch": 15.113782345384474, "grad_norm": 148.54061889648438, "learning_rate": 1.710757802726698e-06, "loss": 0.2251, "num_input_tokens_seen": 177586368, "step": 82355 }, { "epoch": 15.114699944944027, "grad_norm": 0.04762899503111839, "learning_rate": 1.7101547547502223e-06, "loss": 0.0001, "num_input_tokens_seen": 177598144, "step": 82360 }, { "epoch": 15.115617544503579, "grad_norm": 0.0017227542120963335, "learning_rate": 1.70955179115318e-06, "loss": 0.2063, "num_input_tokens_seen": 177609920, "step": 82365 }, { "epoch": 15.11653514406313, "grad_norm": 0.00877801701426506, "learning_rate": 1.708948911951034e-06, "loss": 0.1397, "num_input_tokens_seen": 177620160, "step": 82370 }, { "epoch": 15.117452743622684, "grad_norm": 0.0015177605673670769, "learning_rate": 1.708346117159248e-06, "loss": 0.0, "num_input_tokens_seen": 177631744, "step": 82375 }, { "epoch": 15.118370343182235, "grad_norm": 0.002756072673946619, "learning_rate": 1.7077434067932808e-06, "loss": 0.0002, "num_input_tokens_seen": 177642304, "step": 82380 }, { "epoch": 15.119287942741787, "grad_norm": 0.004931187257170677, "learning_rate": 1.7071407808685946e-06, "loss": 0.0, "num_input_tokens_seen": 177652512, "step": 82385 }, { "epoch": 15.12020554230134, "grad_norm": 0.07347196340560913, "learning_rate": 1.7065382394006436e-06, "loss": 0.0002, "num_input_tokens_seen": 177663040, "step": 82390 }, { "epoch": 15.121123141860892, "grad_norm": 0.0032820424530655146, "learning_rate": 1.7059357824048805e-06, "loss": 0.0532, "num_input_tokens_seen": 177674944, "step": 82395 }, { "epoch": 15.122040741420443, "grad_norm": 0.006163898389786482, "learning_rate": 1.7053334098967616e-06, "loss": 0.0002, "num_input_tokens_seen": 177686208, "step": 82400 }, { "epoch": 15.122958340979997, "grad_norm": 0.24560195207595825, "learning_rate": 1.704731121891734e-06, "loss": 0.1023, "num_input_tokens_seen": 177698464, "step": 82405 }, { "epoch": 15.123875940539548, "grad_norm": 0.05055085942149162, "learning_rate": 1.7041289184052462e-06, "loss": 0.0002, "num_input_tokens_seen": 177709312, "step": 82410 }, { "epoch": 15.1247935400991, "grad_norm": 2.3026282787323, "learning_rate": 1.7035267994527433e-06, "loss": 0.132, "num_input_tokens_seen": 177719872, "step": 82415 }, { "epoch": 15.125711139658653, "grad_norm": 0.01171884872019291, "learning_rate": 1.7029247650496672e-06, "loss": 0.0, "num_input_tokens_seen": 177730208, "step": 82420 }, { "epoch": 15.126628739218205, "grad_norm": 1.9276065826416016, "learning_rate": 1.7023228152114625e-06, "loss": 0.0008, "num_input_tokens_seen": 177740224, "step": 82425 }, { "epoch": 15.127546338777757, "grad_norm": 19.501602172851562, "learning_rate": 1.7017209499535664e-06, "loss": 0.0041, "num_input_tokens_seen": 177749920, "step": 82430 }, { "epoch": 15.12846393833731, "grad_norm": 0.0994441881775856, "learning_rate": 1.7011191692914165e-06, "loss": 0.0001, "num_input_tokens_seen": 177760960, "step": 82435 }, { "epoch": 15.129381537896862, "grad_norm": 62.53740692138672, "learning_rate": 1.7005174732404473e-06, "loss": 0.291, "num_input_tokens_seen": 177772608, "step": 82440 }, { "epoch": 15.130299137456413, "grad_norm": 0.04747899994254112, "learning_rate": 1.6999158618160888e-06, "loss": 0.0003, "num_input_tokens_seen": 177782656, "step": 82445 }, { "epoch": 15.131216737015967, "grad_norm": 352.9674072265625, "learning_rate": 1.699314335033776e-06, "loss": 0.0245, "num_input_tokens_seen": 177793152, "step": 82450 }, { "epoch": 15.132134336575518, "grad_norm": 0.011696230620145798, "learning_rate": 1.6987128929089346e-06, "loss": 0.0, "num_input_tokens_seen": 177804192, "step": 82455 }, { "epoch": 15.13305193613507, "grad_norm": 0.010069622658193111, "learning_rate": 1.6981115354569915e-06, "loss": 0.0001, "num_input_tokens_seen": 177814752, "step": 82460 }, { "epoch": 15.133969535694623, "grad_norm": 153.19866943359375, "learning_rate": 1.6975102626933683e-06, "loss": 0.0823, "num_input_tokens_seen": 177824576, "step": 82465 }, { "epoch": 15.134887135254175, "grad_norm": 0.004985425621271133, "learning_rate": 1.6969090746334893e-06, "loss": 0.0001, "num_input_tokens_seen": 177835712, "step": 82470 }, { "epoch": 15.135804734813727, "grad_norm": 0.002193861175328493, "learning_rate": 1.6963079712927737e-06, "loss": 0.0001, "num_input_tokens_seen": 177845888, "step": 82475 }, { "epoch": 15.13672233437328, "grad_norm": 0.0024952145759016275, "learning_rate": 1.695706952686637e-06, "loss": 0.0008, "num_input_tokens_seen": 177855968, "step": 82480 }, { "epoch": 15.137639933932832, "grad_norm": 0.0629541426897049, "learning_rate": 1.6951060188304975e-06, "loss": 0.001, "num_input_tokens_seen": 177866240, "step": 82485 }, { "epoch": 15.138557533492383, "grad_norm": 0.009431771002709866, "learning_rate": 1.6945051697397658e-06, "loss": 0.0588, "num_input_tokens_seen": 177877632, "step": 82490 }, { "epoch": 15.139475133051937, "grad_norm": 136.73228454589844, "learning_rate": 1.693904405429852e-06, "loss": 0.189, "num_input_tokens_seen": 177888960, "step": 82495 }, { "epoch": 15.140392732611488, "grad_norm": 0.007165541406720877, "learning_rate": 1.6933037259161682e-06, "loss": 0.0175, "num_input_tokens_seen": 177900320, "step": 82500 }, { "epoch": 15.14131033217104, "grad_norm": 0.0015389409381896257, "learning_rate": 1.692703131214119e-06, "loss": 0.0964, "num_input_tokens_seen": 177912032, "step": 82505 }, { "epoch": 15.142227931730593, "grad_norm": 0.0018278737552464008, "learning_rate": 1.6921026213391083e-06, "loss": 0.0004, "num_input_tokens_seen": 177924512, "step": 82510 }, { "epoch": 15.143145531290145, "grad_norm": 0.004911134485155344, "learning_rate": 1.691502196306537e-06, "loss": 0.0001, "num_input_tokens_seen": 177935264, "step": 82515 }, { "epoch": 15.144063130849696, "grad_norm": 0.0026447142008692026, "learning_rate": 1.6909018561318086e-06, "loss": 0.0001, "num_input_tokens_seen": 177946208, "step": 82520 }, { "epoch": 15.14498073040925, "grad_norm": 0.014199168421328068, "learning_rate": 1.6903016008303187e-06, "loss": 0.0001, "num_input_tokens_seen": 177956704, "step": 82525 }, { "epoch": 15.145898329968801, "grad_norm": 0.0014139005215838552, "learning_rate": 1.6897014304174615e-06, "loss": 0.0, "num_input_tokens_seen": 177968832, "step": 82530 }, { "epoch": 15.146815929528353, "grad_norm": 2.457824230194092, "learning_rate": 1.6891013449086335e-06, "loss": 0.2339, "num_input_tokens_seen": 177978656, "step": 82535 }, { "epoch": 15.147733529087906, "grad_norm": 0.0025012483820319176, "learning_rate": 1.688501344319225e-06, "loss": 0.0, "num_input_tokens_seen": 177990560, "step": 82540 }, { "epoch": 15.148651128647458, "grad_norm": 0.038505811244249344, "learning_rate": 1.6879014286646228e-06, "loss": 0.0023, "num_input_tokens_seen": 178001088, "step": 82545 }, { "epoch": 15.14956872820701, "grad_norm": 0.20407430827617645, "learning_rate": 1.6873015979602176e-06, "loss": 0.1564, "num_input_tokens_seen": 178013952, "step": 82550 }, { "epoch": 15.150486327766563, "grad_norm": 0.010656686499714851, "learning_rate": 1.6867018522213918e-06, "loss": 0.0012, "num_input_tokens_seen": 178024384, "step": 82555 }, { "epoch": 15.151403927326115, "grad_norm": 0.013248526491224766, "learning_rate": 1.6861021914635289e-06, "loss": 0.0002, "num_input_tokens_seen": 178034400, "step": 82560 }, { "epoch": 15.152321526885666, "grad_norm": 0.03248917683959007, "learning_rate": 1.6855026157020066e-06, "loss": 0.0001, "num_input_tokens_seen": 178045216, "step": 82565 }, { "epoch": 15.15323912644522, "grad_norm": 0.011967364698648453, "learning_rate": 1.684903124952207e-06, "loss": 0.0002, "num_input_tokens_seen": 178056544, "step": 82570 }, { "epoch": 15.154156726004771, "grad_norm": 0.0050672623328864574, "learning_rate": 1.6843037192295042e-06, "loss": 0.0002, "num_input_tokens_seen": 178067616, "step": 82575 }, { "epoch": 15.155074325564323, "grad_norm": 39.04631423950195, "learning_rate": 1.6837043985492707e-06, "loss": 0.1617, "num_input_tokens_seen": 178078656, "step": 82580 }, { "epoch": 15.155991925123876, "grad_norm": 1.3140069246292114, "learning_rate": 1.6831051629268807e-06, "loss": 0.0004, "num_input_tokens_seen": 178088832, "step": 82585 }, { "epoch": 15.156909524683428, "grad_norm": 0.0008228334481827915, "learning_rate": 1.6825060123777032e-06, "loss": 0.0001, "num_input_tokens_seen": 178100096, "step": 82590 }, { "epoch": 15.15782712424298, "grad_norm": 128.63685607910156, "learning_rate": 1.6819069469171045e-06, "loss": 0.0174, "num_input_tokens_seen": 178110080, "step": 82595 }, { "epoch": 15.158744723802533, "grad_norm": 22.86453628540039, "learning_rate": 1.6813079665604487e-06, "loss": 0.0016, "num_input_tokens_seen": 178120640, "step": 82600 }, { "epoch": 15.159662323362085, "grad_norm": 0.009601379744708538, "learning_rate": 1.6807090713231012e-06, "loss": 0.0001, "num_input_tokens_seen": 178131616, "step": 82605 }, { "epoch": 15.160579922921636, "grad_norm": 0.08075924962759018, "learning_rate": 1.6801102612204218e-06, "loss": 0.1133, "num_input_tokens_seen": 178143136, "step": 82610 }, { "epoch": 15.16149752248119, "grad_norm": 0.007240658160299063, "learning_rate": 1.6795115362677671e-06, "loss": 0.0, "num_input_tokens_seen": 178154240, "step": 82615 }, { "epoch": 15.162415122040741, "grad_norm": 0.0010511704022064805, "learning_rate": 1.6789128964804973e-06, "loss": 0.0003, "num_input_tokens_seen": 178165728, "step": 82620 }, { "epoch": 15.163332721600293, "grad_norm": 0.01039028074592352, "learning_rate": 1.6783143418739639e-06, "loss": 0.0588, "num_input_tokens_seen": 178175680, "step": 82625 }, { "epoch": 15.164250321159846, "grad_norm": 0.01144421100616455, "learning_rate": 1.6777158724635202e-06, "loss": 0.0, "num_input_tokens_seen": 178186080, "step": 82630 }, { "epoch": 15.165167920719398, "grad_norm": 0.004531066864728928, "learning_rate": 1.6771174882645147e-06, "loss": 0.0001, "num_input_tokens_seen": 178196928, "step": 82635 }, { "epoch": 15.16608552027895, "grad_norm": 0.04533330351114273, "learning_rate": 1.6765191892922956e-06, "loss": 0.033, "num_input_tokens_seen": 178207680, "step": 82640 }, { "epoch": 15.167003119838503, "grad_norm": 0.061644863337278366, "learning_rate": 1.675920975562207e-06, "loss": 0.0024, "num_input_tokens_seen": 178219168, "step": 82645 }, { "epoch": 15.167920719398055, "grad_norm": 0.0018749091541394591, "learning_rate": 1.675322847089595e-06, "loss": 0.0001, "num_input_tokens_seen": 178228576, "step": 82650 }, { "epoch": 15.168838318957606, "grad_norm": 0.08565980941057205, "learning_rate": 1.674724803889799e-06, "loss": 0.0645, "num_input_tokens_seen": 178238400, "step": 82655 }, { "epoch": 15.16975591851716, "grad_norm": 0.004267342854291201, "learning_rate": 1.6741268459781584e-06, "loss": 0.0003, "num_input_tokens_seen": 178250368, "step": 82660 }, { "epoch": 15.170673518076711, "grad_norm": 0.22275419533252716, "learning_rate": 1.6735289733700078e-06, "loss": 0.0353, "num_input_tokens_seen": 178262272, "step": 82665 }, { "epoch": 15.171591117636263, "grad_norm": 26.851957321166992, "learning_rate": 1.6729311860806851e-06, "loss": 0.066, "num_input_tokens_seen": 178273696, "step": 82670 }, { "epoch": 15.172508717195816, "grad_norm": 0.024529265239834785, "learning_rate": 1.6723334841255212e-06, "loss": 0.0002, "num_input_tokens_seen": 178284960, "step": 82675 }, { "epoch": 15.173426316755368, "grad_norm": 1.3381116390228271, "learning_rate": 1.6717358675198442e-06, "loss": 0.0009, "num_input_tokens_seen": 178295584, "step": 82680 }, { "epoch": 15.17434391631492, "grad_norm": 0.007395194843411446, "learning_rate": 1.6711383362789857e-06, "loss": 0.002, "num_input_tokens_seen": 178305376, "step": 82685 }, { "epoch": 15.175261515874473, "grad_norm": 0.006196114234626293, "learning_rate": 1.6705408904182696e-06, "loss": 0.0002, "num_input_tokens_seen": 178315424, "step": 82690 }, { "epoch": 15.176179115434024, "grad_norm": 0.07353833317756653, "learning_rate": 1.6699435299530191e-06, "loss": 0.0006, "num_input_tokens_seen": 178325216, "step": 82695 }, { "epoch": 15.177096714993576, "grad_norm": 0.007313229609280825, "learning_rate": 1.6693462548985545e-06, "loss": 0.0, "num_input_tokens_seen": 178335936, "step": 82700 }, { "epoch": 15.17801431455313, "grad_norm": 0.08483010530471802, "learning_rate": 1.6687490652701982e-06, "loss": 0.0245, "num_input_tokens_seen": 178347072, "step": 82705 }, { "epoch": 15.178931914112681, "grad_norm": 0.024897532537579536, "learning_rate": 1.6681519610832653e-06, "loss": 0.0051, "num_input_tokens_seen": 178357536, "step": 82710 }, { "epoch": 15.179849513672233, "grad_norm": 0.0022095688618719578, "learning_rate": 1.6675549423530685e-06, "loss": 0.0001, "num_input_tokens_seen": 178367776, "step": 82715 }, { "epoch": 15.180767113231786, "grad_norm": 0.08275848627090454, "learning_rate": 1.666958009094925e-06, "loss": 0.1752, "num_input_tokens_seen": 178378624, "step": 82720 }, { "epoch": 15.181684712791338, "grad_norm": 0.005509087350219488, "learning_rate": 1.6663611613241427e-06, "loss": 0.0, "num_input_tokens_seen": 178389472, "step": 82725 }, { "epoch": 15.18260231235089, "grad_norm": 0.0006306191789917648, "learning_rate": 1.665764399056028e-06, "loss": 0.0328, "num_input_tokens_seen": 178400128, "step": 82730 }, { "epoch": 15.183519911910443, "grad_norm": 315.22015380859375, "learning_rate": 1.6651677223058909e-06, "loss": 0.0646, "num_input_tokens_seen": 178410624, "step": 82735 }, { "epoch": 15.184437511469994, "grad_norm": 0.007326366379857063, "learning_rate": 1.6645711310890328e-06, "loss": 0.099, "num_input_tokens_seen": 178421312, "step": 82740 }, { "epoch": 15.185355111029546, "grad_norm": 31.78135108947754, "learning_rate": 1.6639746254207562e-06, "loss": 0.0098, "num_input_tokens_seen": 178431392, "step": 82745 }, { "epoch": 15.1862727105891, "grad_norm": 1.4524179697036743, "learning_rate": 1.6633782053163578e-06, "loss": 0.0002, "num_input_tokens_seen": 178442144, "step": 82750 }, { "epoch": 15.187190310148651, "grad_norm": 0.0010317915584892035, "learning_rate": 1.6627818707911392e-06, "loss": 0.0002, "num_input_tokens_seen": 178452800, "step": 82755 }, { "epoch": 15.188107909708203, "grad_norm": 0.007202025968581438, "learning_rate": 1.6621856218603932e-06, "loss": 0.0001, "num_input_tokens_seen": 178463360, "step": 82760 }, { "epoch": 15.189025509267756, "grad_norm": 0.017327824607491493, "learning_rate": 1.6615894585394115e-06, "loss": 0.0104, "num_input_tokens_seen": 178473440, "step": 82765 }, { "epoch": 15.189943108827308, "grad_norm": 105.98445129394531, "learning_rate": 1.6609933808434875e-06, "loss": 0.0146, "num_input_tokens_seen": 178483872, "step": 82770 }, { "epoch": 15.19086070838686, "grad_norm": 0.005501676816493273, "learning_rate": 1.6603973887879088e-06, "loss": 0.1688, "num_input_tokens_seen": 178495520, "step": 82775 }, { "epoch": 15.191778307946413, "grad_norm": 0.002570894779637456, "learning_rate": 1.6598014823879604e-06, "loss": 0.0014, "num_input_tokens_seen": 178506336, "step": 82780 }, { "epoch": 15.192695907505964, "grad_norm": 0.02985028736293316, "learning_rate": 1.6592056616589258e-06, "loss": 0.0001, "num_input_tokens_seen": 178517504, "step": 82785 }, { "epoch": 15.193613507065516, "grad_norm": 0.022671585902571678, "learning_rate": 1.6586099266160904e-06, "loss": 0.0001, "num_input_tokens_seen": 178527200, "step": 82790 }, { "epoch": 15.19453110662507, "grad_norm": 0.061156854033470154, "learning_rate": 1.658014277274731e-06, "loss": 0.0001, "num_input_tokens_seen": 178537440, "step": 82795 }, { "epoch": 15.19544870618462, "grad_norm": 0.006714574992656708, "learning_rate": 1.6574187136501247e-06, "loss": 0.2629, "num_input_tokens_seen": 178548224, "step": 82800 }, { "epoch": 15.196366305744172, "grad_norm": 0.213509663939476, "learning_rate": 1.6568232357575486e-06, "loss": 0.0001, "num_input_tokens_seen": 178559552, "step": 82805 }, { "epoch": 15.197283905303726, "grad_norm": 112.33495330810547, "learning_rate": 1.6562278436122759e-06, "loss": 0.0677, "num_input_tokens_seen": 178569632, "step": 82810 }, { "epoch": 15.198201504863277, "grad_norm": 0.042620494961738586, "learning_rate": 1.6556325372295746e-06, "loss": 0.0001, "num_input_tokens_seen": 178580320, "step": 82815 }, { "epoch": 15.199119104422829, "grad_norm": 0.5056562423706055, "learning_rate": 1.6550373166247174e-06, "loss": 0.0561, "num_input_tokens_seen": 178590752, "step": 82820 }, { "epoch": 15.200036703982382, "grad_norm": 0.1626306027173996, "learning_rate": 1.6544421818129685e-06, "loss": 0.0002, "num_input_tokens_seen": 178601824, "step": 82825 }, { "epoch": 15.200954303541934, "grad_norm": 62.56706619262695, "learning_rate": 1.6538471328095922e-06, "loss": 0.2126, "num_input_tokens_seen": 178612672, "step": 82830 }, { "epoch": 15.201871903101486, "grad_norm": 0.002268389333039522, "learning_rate": 1.6532521696298515e-06, "loss": 0.0, "num_input_tokens_seen": 178623552, "step": 82835 }, { "epoch": 15.20278950266104, "grad_norm": 0.10700478404760361, "learning_rate": 1.6526572922890038e-06, "loss": 0.0001, "num_input_tokens_seen": 178633600, "step": 82840 }, { "epoch": 15.20370710222059, "grad_norm": 0.12882554531097412, "learning_rate": 1.6520625008023106e-06, "loss": 0.0004, "num_input_tokens_seen": 178644576, "step": 82845 }, { "epoch": 15.204624701780142, "grad_norm": 19.888004302978516, "learning_rate": 1.651467795185025e-06, "loss": 0.0754, "num_input_tokens_seen": 178653568, "step": 82850 }, { "epoch": 15.205542301339696, "grad_norm": 0.373834490776062, "learning_rate": 1.6508731754524004e-06, "loss": 0.0001, "num_input_tokens_seen": 178664000, "step": 82855 }, { "epoch": 15.206459900899247, "grad_norm": 0.005144889932125807, "learning_rate": 1.6502786416196887e-06, "loss": 0.0001, "num_input_tokens_seen": 178674240, "step": 82860 }, { "epoch": 15.207377500458799, "grad_norm": 0.019219277426600456, "learning_rate": 1.6496841937021363e-06, "loss": 0.0026, "num_input_tokens_seen": 178685120, "step": 82865 }, { "epoch": 15.208295100018352, "grad_norm": 1.138614296913147, "learning_rate": 1.649089831714994e-06, "loss": 0.001, "num_input_tokens_seen": 178696448, "step": 82870 }, { "epoch": 15.209212699577904, "grad_norm": 0.01022059004753828, "learning_rate": 1.6484955556735033e-06, "loss": 0.0, "num_input_tokens_seen": 178706112, "step": 82875 }, { "epoch": 15.210130299137456, "grad_norm": 0.0533733144402504, "learning_rate": 1.6479013655929077e-06, "loss": 0.0003, "num_input_tokens_seen": 178717216, "step": 82880 }, { "epoch": 15.211047898697009, "grad_norm": 0.1522795408964157, "learning_rate": 1.647307261488445e-06, "loss": 0.0618, "num_input_tokens_seen": 178728608, "step": 82885 }, { "epoch": 15.21196549825656, "grad_norm": 25.208145141601562, "learning_rate": 1.6467132433753568e-06, "loss": 0.0333, "num_input_tokens_seen": 178739744, "step": 82890 }, { "epoch": 15.212883097816112, "grad_norm": 0.021816711872816086, "learning_rate": 1.646119311268876e-06, "loss": 0.1523, "num_input_tokens_seen": 178750400, "step": 82895 }, { "epoch": 15.213800697375666, "grad_norm": 0.32966479659080505, "learning_rate": 1.6455254651842361e-06, "loss": 0.0001, "num_input_tokens_seen": 178760800, "step": 82900 }, { "epoch": 15.214718296935217, "grad_norm": 0.006311954464763403, "learning_rate": 1.6449317051366703e-06, "loss": 0.0008, "num_input_tokens_seen": 178771744, "step": 82905 }, { "epoch": 15.215635896494769, "grad_norm": 0.0007686231983825564, "learning_rate": 1.6443380311414065e-06, "loss": 0.0174, "num_input_tokens_seen": 178781408, "step": 82910 }, { "epoch": 15.216553496054322, "grad_norm": 0.0012934714322909713, "learning_rate": 1.6437444432136713e-06, "loss": 0.0001, "num_input_tokens_seen": 178792608, "step": 82915 }, { "epoch": 15.217471095613874, "grad_norm": 1.7534005641937256, "learning_rate": 1.6431509413686874e-06, "loss": 0.0034, "num_input_tokens_seen": 178803200, "step": 82920 }, { "epoch": 15.218388695173426, "grad_norm": 0.0016913916915655136, "learning_rate": 1.6425575256216815e-06, "loss": 0.0001, "num_input_tokens_seen": 178814432, "step": 82925 }, { "epoch": 15.219306294732979, "grad_norm": 0.004016158636659384, "learning_rate": 1.6419641959878712e-06, "loss": 0.0, "num_input_tokens_seen": 178825152, "step": 82930 }, { "epoch": 15.22022389429253, "grad_norm": 0.000682941114064306, "learning_rate": 1.6413709524824729e-06, "loss": 0.1098, "num_input_tokens_seen": 178836992, "step": 82935 }, { "epoch": 15.221141493852082, "grad_norm": 0.0031290280167013407, "learning_rate": 1.6407777951207065e-06, "loss": 0.0001, "num_input_tokens_seen": 178848032, "step": 82940 }, { "epoch": 15.222059093411636, "grad_norm": 0.0038799771573394537, "learning_rate": 1.6401847239177826e-06, "loss": 0.0, "num_input_tokens_seen": 178859136, "step": 82945 }, { "epoch": 15.222976692971187, "grad_norm": 31.56987762451172, "learning_rate": 1.6395917388889122e-06, "loss": 0.0825, "num_input_tokens_seen": 178870752, "step": 82950 }, { "epoch": 15.223894292530739, "grad_norm": 0.033121101558208466, "learning_rate": 1.6389988400493068e-06, "loss": 0.0, "num_input_tokens_seen": 178882944, "step": 82955 }, { "epoch": 15.224811892090292, "grad_norm": 0.3052295744419098, "learning_rate": 1.6384060274141729e-06, "loss": 0.0098, "num_input_tokens_seen": 178894240, "step": 82960 }, { "epoch": 15.225729491649844, "grad_norm": 0.003282353514805436, "learning_rate": 1.6378133009987134e-06, "loss": 0.0, "num_input_tokens_seen": 178905312, "step": 82965 }, { "epoch": 15.226647091209395, "grad_norm": 0.09145143628120422, "learning_rate": 1.6372206608181307e-06, "loss": 0.0001, "num_input_tokens_seen": 178915872, "step": 82970 }, { "epoch": 15.227564690768949, "grad_norm": 0.014928345568478107, "learning_rate": 1.6366281068876277e-06, "loss": 0.0, "num_input_tokens_seen": 178925792, "step": 82975 }, { "epoch": 15.2284822903285, "grad_norm": 0.6769409775733948, "learning_rate": 1.6360356392224009e-06, "loss": 0.0001, "num_input_tokens_seen": 178936352, "step": 82980 }, { "epoch": 15.229399889888052, "grad_norm": 0.001962611684575677, "learning_rate": 1.635443257837645e-06, "loss": 0.0001, "num_input_tokens_seen": 178947232, "step": 82985 }, { "epoch": 15.230317489447605, "grad_norm": 0.00956711731851101, "learning_rate": 1.6348509627485558e-06, "loss": 0.0001, "num_input_tokens_seen": 178957600, "step": 82990 }, { "epoch": 15.231235089007157, "grad_norm": 0.0024801059626042843, "learning_rate": 1.6342587539703247e-06, "loss": 0.0003, "num_input_tokens_seen": 178968000, "step": 82995 }, { "epoch": 15.232152688566709, "grad_norm": 0.022232661023736, "learning_rate": 1.6336666315181382e-06, "loss": 0.0001, "num_input_tokens_seen": 178979232, "step": 83000 }, { "epoch": 15.233070288126262, "grad_norm": 0.00546442624181509, "learning_rate": 1.6330745954071869e-06, "loss": 0.0762, "num_input_tokens_seen": 178988736, "step": 83005 }, { "epoch": 15.233987887685814, "grad_norm": 1.4304109811782837, "learning_rate": 1.6324826456526544e-06, "loss": 0.0001, "num_input_tokens_seen": 178998144, "step": 83010 }, { "epoch": 15.234905487245365, "grad_norm": 0.12579944729804993, "learning_rate": 1.6318907822697222e-06, "loss": 0.0001, "num_input_tokens_seen": 179007904, "step": 83015 }, { "epoch": 15.235823086804919, "grad_norm": 0.0023400995414704084, "learning_rate": 1.63129900527357e-06, "loss": 0.0, "num_input_tokens_seen": 179019200, "step": 83020 }, { "epoch": 15.23674068636447, "grad_norm": 0.0009367195307277143, "learning_rate": 1.6307073146793788e-06, "loss": 0.0, "num_input_tokens_seen": 179031168, "step": 83025 }, { "epoch": 15.237658285924022, "grad_norm": 0.0006286523421294987, "learning_rate": 1.630115710502323e-06, "loss": 0.0, "num_input_tokens_seen": 179040384, "step": 83030 }, { "epoch": 15.238575885483575, "grad_norm": 0.01403759140521288, "learning_rate": 1.629524192757575e-06, "loss": 0.0001, "num_input_tokens_seen": 179050752, "step": 83035 }, { "epoch": 15.239493485043127, "grad_norm": 0.0012981065083295107, "learning_rate": 1.6289327614603096e-06, "loss": 0.0003, "num_input_tokens_seen": 179061664, "step": 83040 }, { "epoch": 15.240411084602679, "grad_norm": 0.30126649141311646, "learning_rate": 1.6283414166256933e-06, "loss": 0.0003, "num_input_tokens_seen": 179071552, "step": 83045 }, { "epoch": 15.241328684162232, "grad_norm": 0.011922834441065788, "learning_rate": 1.6277501582688948e-06, "loss": 0.0, "num_input_tokens_seen": 179082624, "step": 83050 }, { "epoch": 15.242246283721784, "grad_norm": 0.0045197647996246815, "learning_rate": 1.627158986405078e-06, "loss": 0.0, "num_input_tokens_seen": 179092992, "step": 83055 }, { "epoch": 15.243163883281335, "grad_norm": 0.008527529425919056, "learning_rate": 1.626567901049404e-06, "loss": 0.0376, "num_input_tokens_seen": 179103872, "step": 83060 }, { "epoch": 15.244081482840889, "grad_norm": 0.0021936462726444006, "learning_rate": 1.6259769022170368e-06, "loss": 0.0001, "num_input_tokens_seen": 179113152, "step": 83065 }, { "epoch": 15.24499908240044, "grad_norm": 0.039642397314310074, "learning_rate": 1.6253859899231327e-06, "loss": 0.0064, "num_input_tokens_seen": 179124608, "step": 83070 }, { "epoch": 15.245916681959992, "grad_norm": 0.0013180830283090472, "learning_rate": 1.624795164182848e-06, "loss": 0.0, "num_input_tokens_seen": 179135840, "step": 83075 }, { "epoch": 15.246834281519545, "grad_norm": 0.0018473205855116248, "learning_rate": 1.624204425011336e-06, "loss": 0.0001, "num_input_tokens_seen": 179147264, "step": 83080 }, { "epoch": 15.247751881079097, "grad_norm": 0.22481046617031097, "learning_rate": 1.6236137724237473e-06, "loss": 0.0977, "num_input_tokens_seen": 179158112, "step": 83085 }, { "epoch": 15.248669480638648, "grad_norm": 0.0056792995892465115, "learning_rate": 1.6230232064352336e-06, "loss": 0.0002, "num_input_tokens_seen": 179168288, "step": 83090 }, { "epoch": 15.249587080198202, "grad_norm": 0.055807821452617645, "learning_rate": 1.6224327270609408e-06, "loss": 0.0001, "num_input_tokens_seen": 179179456, "step": 83095 }, { "epoch": 15.250504679757753, "grad_norm": 0.006350317038595676, "learning_rate": 1.621842334316014e-06, "loss": 0.0001, "num_input_tokens_seen": 179190976, "step": 83100 }, { "epoch": 15.251422279317305, "grad_norm": 0.003304117126390338, "learning_rate": 1.6212520282155935e-06, "loss": 0.0002, "num_input_tokens_seen": 179202944, "step": 83105 }, { "epoch": 15.252339878876858, "grad_norm": 0.0013543448876589537, "learning_rate": 1.6206618087748238e-06, "loss": 0.0, "num_input_tokens_seen": 179212576, "step": 83110 }, { "epoch": 15.25325747843641, "grad_norm": 4.871227741241455, "learning_rate": 1.6200716760088415e-06, "loss": 0.0011, "num_input_tokens_seen": 179223232, "step": 83115 }, { "epoch": 15.254175077995962, "grad_norm": 0.4207954406738281, "learning_rate": 1.619481629932781e-06, "loss": 0.0001, "num_input_tokens_seen": 179233824, "step": 83120 }, { "epoch": 15.255092677555515, "grad_norm": 0.004159770905971527, "learning_rate": 1.618891670561778e-06, "loss": 0.0, "num_input_tokens_seen": 179244704, "step": 83125 }, { "epoch": 15.256010277115067, "grad_norm": 0.0007571032037958503, "learning_rate": 1.618301797910964e-06, "loss": 0.3094, "num_input_tokens_seen": 179254304, "step": 83130 }, { "epoch": 15.256927876674618, "grad_norm": 14.565004348754883, "learning_rate": 1.617712011995466e-06, "loss": 0.0026, "num_input_tokens_seen": 179263776, "step": 83135 }, { "epoch": 15.257845476234172, "grad_norm": 0.0010578108485788107, "learning_rate": 1.6171223128304148e-06, "loss": 0.0, "num_input_tokens_seen": 179274880, "step": 83140 }, { "epoch": 15.258763075793723, "grad_norm": 0.009693123400211334, "learning_rate": 1.6165327004309328e-06, "loss": 0.0001, "num_input_tokens_seen": 179286496, "step": 83145 }, { "epoch": 15.259680675353275, "grad_norm": 71.54044342041016, "learning_rate": 1.615943174812143e-06, "loss": 0.2204, "num_input_tokens_seen": 179297760, "step": 83150 }, { "epoch": 15.260598274912828, "grad_norm": 0.024389272555708885, "learning_rate": 1.6153537359891647e-06, "loss": 0.0001, "num_input_tokens_seen": 179309344, "step": 83155 }, { "epoch": 15.26151587447238, "grad_norm": 0.004284786060452461, "learning_rate": 1.6147643839771188e-06, "loss": 0.0, "num_input_tokens_seen": 179320864, "step": 83160 }, { "epoch": 15.262433474031932, "grad_norm": 0.2561154365539551, "learning_rate": 1.6141751187911198e-06, "loss": 0.0002, "num_input_tokens_seen": 179332576, "step": 83165 }, { "epoch": 15.263351073591485, "grad_norm": 0.0022627406287938356, "learning_rate": 1.61358594044628e-06, "loss": 0.0003, "num_input_tokens_seen": 179343456, "step": 83170 }, { "epoch": 15.264268673151037, "grad_norm": 0.028971685096621513, "learning_rate": 1.6129968489577142e-06, "loss": 0.0002, "num_input_tokens_seen": 179354304, "step": 83175 }, { "epoch": 15.265186272710588, "grad_norm": 0.0020866405684500933, "learning_rate": 1.6124078443405294e-06, "loss": 0.0049, "num_input_tokens_seen": 179365600, "step": 83180 }, { "epoch": 15.266103872270142, "grad_norm": 0.00333421491086483, "learning_rate": 1.6118189266098315e-06, "loss": 0.0426, "num_input_tokens_seen": 179376608, "step": 83185 }, { "epoch": 15.267021471829693, "grad_norm": 0.41555172204971313, "learning_rate": 1.6112300957807286e-06, "loss": 0.0001, "num_input_tokens_seen": 179386720, "step": 83190 }, { "epoch": 15.267939071389245, "grad_norm": 0.00399054354056716, "learning_rate": 1.6106413518683217e-06, "loss": 0.0, "num_input_tokens_seen": 179398208, "step": 83195 }, { "epoch": 15.268856670948798, "grad_norm": 0.008605935610830784, "learning_rate": 1.6100526948877115e-06, "loss": 0.0001, "num_input_tokens_seen": 179409248, "step": 83200 }, { "epoch": 15.26977427050835, "grad_norm": 0.00306482776068151, "learning_rate": 1.6094641248539933e-06, "loss": 0.0, "num_input_tokens_seen": 179419584, "step": 83205 }, { "epoch": 15.270691870067902, "grad_norm": 0.0011602318845689297, "learning_rate": 1.6088756417822675e-06, "loss": 0.0006, "num_input_tokens_seen": 179431584, "step": 83210 }, { "epoch": 15.271609469627455, "grad_norm": 0.00047720340080559254, "learning_rate": 1.608287245687626e-06, "loss": 0.0001, "num_input_tokens_seen": 179442976, "step": 83215 }, { "epoch": 15.272527069187007, "grad_norm": 0.005486824084073305, "learning_rate": 1.6076989365851581e-06, "loss": 0.0, "num_input_tokens_seen": 179456096, "step": 83220 }, { "epoch": 15.273444668746558, "grad_norm": 0.010511940345168114, "learning_rate": 1.6071107144899562e-06, "loss": 0.0329, "num_input_tokens_seen": 179467712, "step": 83225 }, { "epoch": 15.274362268306112, "grad_norm": 0.0033526199404150248, "learning_rate": 1.6065225794171064e-06, "loss": 0.1595, "num_input_tokens_seen": 179477760, "step": 83230 }, { "epoch": 15.275279867865663, "grad_norm": 0.0028308045584708452, "learning_rate": 1.6059345313816927e-06, "loss": 0.0, "num_input_tokens_seen": 179489632, "step": 83235 }, { "epoch": 15.276197467425215, "grad_norm": 0.2918654680252075, "learning_rate": 1.6053465703987963e-06, "loss": 0.0001, "num_input_tokens_seen": 179499936, "step": 83240 }, { "epoch": 15.277115066984768, "grad_norm": 0.033178701996803284, "learning_rate": 1.604758696483501e-06, "loss": 0.2056, "num_input_tokens_seen": 179510208, "step": 83245 }, { "epoch": 15.27803266654432, "grad_norm": 0.005381174851208925, "learning_rate": 1.6041709096508828e-06, "loss": 0.0, "num_input_tokens_seen": 179520704, "step": 83250 }, { "epoch": 15.278950266103871, "grad_norm": 0.0006923695909790695, "learning_rate": 1.6035832099160165e-06, "loss": 0.0041, "num_input_tokens_seen": 179531872, "step": 83255 }, { "epoch": 15.279867865663425, "grad_norm": 0.7683460712432861, "learning_rate": 1.6029955972939782e-06, "loss": 0.0004, "num_input_tokens_seen": 179543168, "step": 83260 }, { "epoch": 15.280785465222976, "grad_norm": 0.0014031954342499375, "learning_rate": 1.602408071799838e-06, "loss": 0.056, "num_input_tokens_seen": 179553728, "step": 83265 }, { "epoch": 15.281703064782528, "grad_norm": 0.19204212725162506, "learning_rate": 1.6018206334486647e-06, "loss": 0.0005, "num_input_tokens_seen": 179565344, "step": 83270 }, { "epoch": 15.282620664342081, "grad_norm": 1.824903964996338, "learning_rate": 1.601233282255526e-06, "loss": 0.0003, "num_input_tokens_seen": 179576352, "step": 83275 }, { "epoch": 15.283538263901633, "grad_norm": 0.0031207548454403877, "learning_rate": 1.6006460182354839e-06, "loss": 0.0016, "num_input_tokens_seen": 179587296, "step": 83280 }, { "epoch": 15.284455863461186, "grad_norm": 0.003030451014637947, "learning_rate": 1.600058841403605e-06, "loss": 0.002, "num_input_tokens_seen": 179597312, "step": 83285 }, { "epoch": 15.285373463020738, "grad_norm": 0.000709334562998265, "learning_rate": 1.5994717517749469e-06, "loss": 0.0001, "num_input_tokens_seen": 179609152, "step": 83290 }, { "epoch": 15.28629106258029, "grad_norm": 0.009610304608941078, "learning_rate": 1.5988847493645682e-06, "loss": 0.0, "num_input_tokens_seen": 179620544, "step": 83295 }, { "epoch": 15.287208662139843, "grad_norm": 0.003545277751982212, "learning_rate": 1.5982978341875244e-06, "loss": 0.0001, "num_input_tokens_seen": 179631968, "step": 83300 }, { "epoch": 15.288126261699395, "grad_norm": 1.3356380462646484, "learning_rate": 1.5977110062588675e-06, "loss": 0.0006, "num_input_tokens_seen": 179641824, "step": 83305 }, { "epoch": 15.289043861258946, "grad_norm": 0.05954619497060776, "learning_rate": 1.5971242655936519e-06, "loss": 0.0435, "num_input_tokens_seen": 179652448, "step": 83310 }, { "epoch": 15.2899614608185, "grad_norm": 0.00802726112306118, "learning_rate": 1.5965376122069248e-06, "loss": 0.2017, "num_input_tokens_seen": 179662848, "step": 83315 }, { "epoch": 15.290879060378051, "grad_norm": 114.43379974365234, "learning_rate": 1.5959510461137312e-06, "loss": 0.2407, "num_input_tokens_seen": 179673856, "step": 83320 }, { "epoch": 15.291796659937603, "grad_norm": 0.003351264400407672, "learning_rate": 1.595364567329119e-06, "loss": 0.056, "num_input_tokens_seen": 179684288, "step": 83325 }, { "epoch": 15.292714259497156, "grad_norm": 0.022904211655259132, "learning_rate": 1.5947781758681297e-06, "loss": 0.0001, "num_input_tokens_seen": 179694816, "step": 83330 }, { "epoch": 15.293631859056708, "grad_norm": 0.0030286405235528946, "learning_rate": 1.594191871745802e-06, "loss": 0.0006, "num_input_tokens_seen": 179706624, "step": 83335 }, { "epoch": 15.29454945861626, "grad_norm": 0.11583206057548523, "learning_rate": 1.5936056549771728e-06, "loss": 0.0003, "num_input_tokens_seen": 179716384, "step": 83340 }, { "epoch": 15.295467058175813, "grad_norm": 180.23521423339844, "learning_rate": 1.5930195255772807e-06, "loss": 0.0369, "num_input_tokens_seen": 179727360, "step": 83345 }, { "epoch": 15.296384657735365, "grad_norm": 0.0006142791826277971, "learning_rate": 1.5924334835611572e-06, "loss": 0.0001, "num_input_tokens_seen": 179738464, "step": 83350 }, { "epoch": 15.297302257294916, "grad_norm": 0.010064266622066498, "learning_rate": 1.5918475289438323e-06, "loss": 0.0, "num_input_tokens_seen": 179749664, "step": 83355 }, { "epoch": 15.29821985685447, "grad_norm": 0.020731506869196892, "learning_rate": 1.5912616617403376e-06, "loss": 0.1378, "num_input_tokens_seen": 179758880, "step": 83360 }, { "epoch": 15.299137456414021, "grad_norm": 0.006537461653351784, "learning_rate": 1.5906758819656982e-06, "loss": 0.0001, "num_input_tokens_seen": 179770656, "step": 83365 }, { "epoch": 15.300055055973573, "grad_norm": 0.0015904020983725786, "learning_rate": 1.5900901896349386e-06, "loss": 0.0004, "num_input_tokens_seen": 179780128, "step": 83370 }, { "epoch": 15.300972655533126, "grad_norm": 0.011095498688519001, "learning_rate": 1.5895045847630792e-06, "loss": 0.0001, "num_input_tokens_seen": 179790464, "step": 83375 }, { "epoch": 15.301890255092678, "grad_norm": 0.008415346965193748, "learning_rate": 1.5889190673651427e-06, "loss": 0.0202, "num_input_tokens_seen": 179801504, "step": 83380 }, { "epoch": 15.30280785465223, "grad_norm": 0.0050679524429142475, "learning_rate": 1.5883336374561453e-06, "loss": 0.0001, "num_input_tokens_seen": 179812384, "step": 83385 }, { "epoch": 15.303725454211783, "grad_norm": 0.012426666915416718, "learning_rate": 1.5877482950511013e-06, "loss": 0.0001, "num_input_tokens_seen": 179824064, "step": 83390 }, { "epoch": 15.304643053771334, "grad_norm": 70.71508026123047, "learning_rate": 1.5871630401650268e-06, "loss": 0.0119, "num_input_tokens_seen": 179834784, "step": 83395 }, { "epoch": 15.305560653330886, "grad_norm": 0.008845069445669651, "learning_rate": 1.5865778728129305e-06, "loss": 0.0064, "num_input_tokens_seen": 179844864, "step": 83400 }, { "epoch": 15.30647825289044, "grad_norm": 0.00691385380923748, "learning_rate": 1.58599279300982e-06, "loss": 0.0, "num_input_tokens_seen": 179855040, "step": 83405 }, { "epoch": 15.307395852449991, "grad_norm": 3.9163920879364014, "learning_rate": 1.5854078007707047e-06, "loss": 0.0003, "num_input_tokens_seen": 179865312, "step": 83410 }, { "epoch": 15.308313452009543, "grad_norm": 0.023369286209344864, "learning_rate": 1.5848228961105872e-06, "loss": 0.0005, "num_input_tokens_seen": 179876288, "step": 83415 }, { "epoch": 15.309231051569096, "grad_norm": 0.26945167779922485, "learning_rate": 1.584238079044469e-06, "loss": 0.1222, "num_input_tokens_seen": 179886976, "step": 83420 }, { "epoch": 15.310148651128648, "grad_norm": 0.009061413817107677, "learning_rate": 1.583653349587349e-06, "loss": 0.0001, "num_input_tokens_seen": 179896576, "step": 83425 }, { "epoch": 15.3110662506882, "grad_norm": 0.019770996645092964, "learning_rate": 1.5830687077542272e-06, "loss": 0.0006, "num_input_tokens_seen": 179907392, "step": 83430 }, { "epoch": 15.311983850247753, "grad_norm": 0.0012861336581408978, "learning_rate": 1.582484153560097e-06, "loss": 0.0001, "num_input_tokens_seen": 179918560, "step": 83435 }, { "epoch": 15.312901449807304, "grad_norm": 0.057395171374082565, "learning_rate": 1.5818996870199505e-06, "loss": 0.1625, "num_input_tokens_seen": 179930656, "step": 83440 }, { "epoch": 15.313819049366856, "grad_norm": 0.003027279395610094, "learning_rate": 1.581315308148781e-06, "loss": 0.0478, "num_input_tokens_seen": 179940896, "step": 83445 }, { "epoch": 15.31473664892641, "grad_norm": 136.09512329101562, "learning_rate": 1.5807310169615747e-06, "loss": 0.0544, "num_input_tokens_seen": 179952000, "step": 83450 }, { "epoch": 15.315654248485961, "grad_norm": 0.039037324488162994, "learning_rate": 1.5801468134733171e-06, "loss": 0.0001, "num_input_tokens_seen": 179962240, "step": 83455 }, { "epoch": 15.316571848045513, "grad_norm": 0.0012751000467687845, "learning_rate": 1.5795626976989953e-06, "loss": 0.0066, "num_input_tokens_seen": 179972128, "step": 83460 }, { "epoch": 15.317489447605066, "grad_norm": 0.03361573442816734, "learning_rate": 1.5789786696535891e-06, "loss": 0.0001, "num_input_tokens_seen": 179983392, "step": 83465 }, { "epoch": 15.318407047164618, "grad_norm": 71.17462158203125, "learning_rate": 1.578394729352078e-06, "loss": 0.0329, "num_input_tokens_seen": 179993408, "step": 83470 }, { "epoch": 15.31932464672417, "grad_norm": 0.07785775512456894, "learning_rate": 1.5778108768094374e-06, "loss": 0.0001, "num_input_tokens_seen": 180002432, "step": 83475 }, { "epoch": 15.320242246283723, "grad_norm": 0.0047280266880989075, "learning_rate": 1.577227112040645e-06, "loss": 0.0002, "num_input_tokens_seen": 180012288, "step": 83480 }, { "epoch": 15.321159845843274, "grad_norm": 0.001265752362087369, "learning_rate": 1.576643435060673e-06, "loss": 0.0, "num_input_tokens_seen": 180023712, "step": 83485 }, { "epoch": 15.322077445402826, "grad_norm": 1.5825105905532837, "learning_rate": 1.576059845884491e-06, "loss": 0.0004, "num_input_tokens_seen": 180035200, "step": 83490 }, { "epoch": 15.32299504496238, "grad_norm": 0.25252461433410645, "learning_rate": 1.5754763445270677e-06, "loss": 0.0001, "num_input_tokens_seen": 180046144, "step": 83495 }, { "epoch": 15.323912644521931, "grad_norm": 0.0017223916947841644, "learning_rate": 1.5748929310033661e-06, "loss": 0.0001, "num_input_tokens_seen": 180058400, "step": 83500 }, { "epoch": 15.324830244081483, "grad_norm": 0.020901808515191078, "learning_rate": 1.5743096053283546e-06, "loss": 0.0209, "num_input_tokens_seen": 180069664, "step": 83505 }, { "epoch": 15.325747843641036, "grad_norm": 0.0034013648983091116, "learning_rate": 1.5737263675169922e-06, "loss": 0.0, "num_input_tokens_seen": 180079968, "step": 83510 }, { "epoch": 15.326665443200588, "grad_norm": 0.0011773309670388699, "learning_rate": 1.5731432175842386e-06, "loss": 0.1782, "num_input_tokens_seen": 180091200, "step": 83515 }, { "epoch": 15.32758304276014, "grad_norm": 0.00378456711769104, "learning_rate": 1.5725601555450498e-06, "loss": 0.0, "num_input_tokens_seen": 180101024, "step": 83520 }, { "epoch": 15.328500642319693, "grad_norm": 0.030565721914172173, "learning_rate": 1.5719771814143798e-06, "loss": 0.0003, "num_input_tokens_seen": 180111296, "step": 83525 }, { "epoch": 15.329418241879244, "grad_norm": 0.001048951642587781, "learning_rate": 1.5713942952071837e-06, "loss": 0.0002, "num_input_tokens_seen": 180122752, "step": 83530 }, { "epoch": 15.330335841438796, "grad_norm": 0.047151800245046616, "learning_rate": 1.5708114969384096e-06, "loss": 0.0377, "num_input_tokens_seen": 180134048, "step": 83535 }, { "epoch": 15.33125344099835, "grad_norm": 0.02510366402566433, "learning_rate": 1.5702287866230048e-06, "loss": 0.0065, "num_input_tokens_seen": 180145408, "step": 83540 }, { "epoch": 15.3321710405579, "grad_norm": 0.010126789100468159, "learning_rate": 1.5696461642759169e-06, "loss": 0.0, "num_input_tokens_seen": 180156224, "step": 83545 }, { "epoch": 15.333088640117452, "grad_norm": 0.006413393188267946, "learning_rate": 1.5690636299120893e-06, "loss": 0.0001, "num_input_tokens_seen": 180167328, "step": 83550 }, { "epoch": 15.334006239677006, "grad_norm": 0.016426589339971542, "learning_rate": 1.5684811835464613e-06, "loss": 0.0, "num_input_tokens_seen": 180179168, "step": 83555 }, { "epoch": 15.334923839236557, "grad_norm": 0.003318268107250333, "learning_rate": 1.5678988251939713e-06, "loss": 0.0381, "num_input_tokens_seen": 180188928, "step": 83560 }, { "epoch": 15.335841438796109, "grad_norm": 0.014146190136671066, "learning_rate": 1.5673165548695584e-06, "loss": 0.0, "num_input_tokens_seen": 180200416, "step": 83565 }, { "epoch": 15.336759038355662, "grad_norm": 0.682101845741272, "learning_rate": 1.566734372588156e-06, "loss": 0.0004, "num_input_tokens_seen": 180212096, "step": 83570 }, { "epoch": 15.337676637915214, "grad_norm": 0.010212963446974754, "learning_rate": 1.5661522783646943e-06, "loss": 0.0426, "num_input_tokens_seen": 180222624, "step": 83575 }, { "epoch": 15.338594237474766, "grad_norm": 805.1812744140625, "learning_rate": 1.5655702722141065e-06, "loss": 0.0883, "num_input_tokens_seen": 180232000, "step": 83580 }, { "epoch": 15.339511837034319, "grad_norm": 0.008836614899337292, "learning_rate": 1.5649883541513177e-06, "loss": 0.0, "num_input_tokens_seen": 180242592, "step": 83585 }, { "epoch": 15.34042943659387, "grad_norm": 0.0004708741034846753, "learning_rate": 1.5644065241912526e-06, "loss": 0.0, "num_input_tokens_seen": 180253088, "step": 83590 }, { "epoch": 15.341347036153422, "grad_norm": 0.0017728459788486362, "learning_rate": 1.5638247823488373e-06, "loss": 0.2094, "num_input_tokens_seen": 180262272, "step": 83595 }, { "epoch": 15.342264635712976, "grad_norm": 0.008202512748539448, "learning_rate": 1.5632431286389905e-06, "loss": 0.0001, "num_input_tokens_seen": 180272704, "step": 83600 }, { "epoch": 15.343182235272527, "grad_norm": 0.0017117485404014587, "learning_rate": 1.5626615630766312e-06, "loss": 0.0, "num_input_tokens_seen": 180283136, "step": 83605 }, { "epoch": 15.344099834832079, "grad_norm": 0.01789766363799572, "learning_rate": 1.5620800856766731e-06, "loss": 0.2188, "num_input_tokens_seen": 180292896, "step": 83610 }, { "epoch": 15.345017434391632, "grad_norm": 7.248016834259033, "learning_rate": 1.5614986964540346e-06, "loss": 0.0008, "num_input_tokens_seen": 180303552, "step": 83615 }, { "epoch": 15.345935033951184, "grad_norm": 0.001501670340076089, "learning_rate": 1.5609173954236256e-06, "loss": 0.0, "num_input_tokens_seen": 180312448, "step": 83620 }, { "epoch": 15.346852633510736, "grad_norm": 0.019688909873366356, "learning_rate": 1.5603361826003533e-06, "loss": 0.0041, "num_input_tokens_seen": 180322016, "step": 83625 }, { "epoch": 15.347770233070289, "grad_norm": 0.0015305222477763891, "learning_rate": 1.559755057999129e-06, "loss": 0.0616, "num_input_tokens_seen": 180333184, "step": 83630 }, { "epoch": 15.34868783262984, "grad_norm": 10.949930191040039, "learning_rate": 1.559174021634855e-06, "loss": 0.002, "num_input_tokens_seen": 180343392, "step": 83635 }, { "epoch": 15.349605432189392, "grad_norm": 0.009484147652983665, "learning_rate": 1.5585930735224332e-06, "loss": 0.0001, "num_input_tokens_seen": 180354176, "step": 83640 }, { "epoch": 15.350523031748946, "grad_norm": 0.0016642595874145627, "learning_rate": 1.5580122136767667e-06, "loss": 0.0044, "num_input_tokens_seen": 180364864, "step": 83645 }, { "epoch": 15.351440631308497, "grad_norm": 0.01687973365187645, "learning_rate": 1.5574314421127528e-06, "loss": 0.3751, "num_input_tokens_seen": 180376704, "step": 83650 }, { "epoch": 15.352358230868049, "grad_norm": 0.002353641903027892, "learning_rate": 1.5568507588452863e-06, "loss": 0.0, "num_input_tokens_seen": 180387776, "step": 83655 }, { "epoch": 15.353275830427602, "grad_norm": 0.15199227631092072, "learning_rate": 1.5562701638892608e-06, "loss": 0.0021, "num_input_tokens_seen": 180398880, "step": 83660 }, { "epoch": 15.354193429987154, "grad_norm": 0.0049193017184734344, "learning_rate": 1.5556896572595693e-06, "loss": 0.0, "num_input_tokens_seen": 180410688, "step": 83665 }, { "epoch": 15.355111029546705, "grad_norm": 0.002424299716949463, "learning_rate": 1.5551092389710998e-06, "loss": 0.0401, "num_input_tokens_seen": 180421440, "step": 83670 }, { "epoch": 15.356028629106259, "grad_norm": 0.006616578437387943, "learning_rate": 1.5545289090387378e-06, "loss": 0.4623, "num_input_tokens_seen": 180430912, "step": 83675 }, { "epoch": 15.35694622866581, "grad_norm": 0.0011772644938901067, "learning_rate": 1.5539486674773707e-06, "loss": 0.002, "num_input_tokens_seen": 180443936, "step": 83680 }, { "epoch": 15.357863828225362, "grad_norm": 0.2883220314979553, "learning_rate": 1.5533685143018795e-06, "loss": 0.0704, "num_input_tokens_seen": 180454592, "step": 83685 }, { "epoch": 15.358781427784916, "grad_norm": 0.0008962878491729498, "learning_rate": 1.5527884495271439e-06, "loss": 0.0, "num_input_tokens_seen": 180464704, "step": 83690 }, { "epoch": 15.359699027344467, "grad_norm": 0.0019420612370595336, "learning_rate": 1.5522084731680404e-06, "loss": 0.0, "num_input_tokens_seen": 180475584, "step": 83695 }, { "epoch": 15.360616626904019, "grad_norm": 0.015069747343659401, "learning_rate": 1.551628585239448e-06, "loss": 0.0, "num_input_tokens_seen": 180486592, "step": 83700 }, { "epoch": 15.361534226463572, "grad_norm": 0.0015807022573426366, "learning_rate": 1.5510487857562373e-06, "loss": 0.1128, "num_input_tokens_seen": 180496736, "step": 83705 }, { "epoch": 15.362451826023124, "grad_norm": 0.009079091250896454, "learning_rate": 1.55046907473328e-06, "loss": 0.0, "num_input_tokens_seen": 180507680, "step": 83710 }, { "epoch": 15.363369425582675, "grad_norm": 2.1397933959960938, "learning_rate": 1.5498894521854452e-06, "loss": 0.0007, "num_input_tokens_seen": 180519008, "step": 83715 }, { "epoch": 15.364287025142229, "grad_norm": 0.0870487317442894, "learning_rate": 1.5493099181275978e-06, "loss": 0.0028, "num_input_tokens_seen": 180529344, "step": 83720 }, { "epoch": 15.36520462470178, "grad_norm": 0.039066582918167114, "learning_rate": 1.5487304725746023e-06, "loss": 0.0001, "num_input_tokens_seen": 180540384, "step": 83725 }, { "epoch": 15.366122224261332, "grad_norm": 0.0036457511596381664, "learning_rate": 1.548151115541322e-06, "loss": 0.0244, "num_input_tokens_seen": 180551616, "step": 83730 }, { "epoch": 15.367039823820885, "grad_norm": 0.18996867537498474, "learning_rate": 1.547571847042616e-06, "loss": 0.0002, "num_input_tokens_seen": 180563328, "step": 83735 }, { "epoch": 15.367957423380437, "grad_norm": 0.0015404963633045554, "learning_rate": 1.5469926670933417e-06, "loss": 0.0656, "num_input_tokens_seen": 180574368, "step": 83740 }, { "epoch": 15.368875022939989, "grad_norm": 0.0008010902092792094, "learning_rate": 1.5464135757083516e-06, "loss": 0.001, "num_input_tokens_seen": 180585824, "step": 83745 }, { "epoch": 15.369792622499542, "grad_norm": 0.0014650706434622407, "learning_rate": 1.5458345729025025e-06, "loss": 0.0001, "num_input_tokens_seen": 180596224, "step": 83750 }, { "epoch": 15.370710222059094, "grad_norm": 0.01158769428730011, "learning_rate": 1.5452556586906437e-06, "loss": 0.1255, "num_input_tokens_seen": 180606720, "step": 83755 }, { "epoch": 15.371627821618645, "grad_norm": 0.003220628947019577, "learning_rate": 1.5446768330876204e-06, "loss": 0.001, "num_input_tokens_seen": 180617408, "step": 83760 }, { "epoch": 15.372545421178199, "grad_norm": 2.768411636352539, "learning_rate": 1.5440980961082835e-06, "loss": 0.0209, "num_input_tokens_seen": 180628000, "step": 83765 }, { "epoch": 15.37346302073775, "grad_norm": 0.007239856757223606, "learning_rate": 1.5435194477674737e-06, "loss": 0.0001, "num_input_tokens_seen": 180637408, "step": 83770 }, { "epoch": 15.374380620297302, "grad_norm": 0.09617451578378677, "learning_rate": 1.5429408880800317e-06, "loss": 0.0001, "num_input_tokens_seen": 180648192, "step": 83775 }, { "epoch": 15.375298219856855, "grad_norm": 0.208045095205307, "learning_rate": 1.5423624170607992e-06, "loss": 0.0001, "num_input_tokens_seen": 180659968, "step": 83780 }, { "epoch": 15.376215819416407, "grad_norm": 0.010538886301219463, "learning_rate": 1.5417840347246122e-06, "loss": 0.1628, "num_input_tokens_seen": 180671008, "step": 83785 }, { "epoch": 15.377133418975959, "grad_norm": 0.017095116898417473, "learning_rate": 1.5412057410863045e-06, "loss": 0.0, "num_input_tokens_seen": 180682752, "step": 83790 }, { "epoch": 15.378051018535512, "grad_norm": 0.003171816933900118, "learning_rate": 1.540627536160708e-06, "loss": 0.0, "num_input_tokens_seen": 180693632, "step": 83795 }, { "epoch": 15.378968618095064, "grad_norm": 0.00423911539837718, "learning_rate": 1.5400494199626547e-06, "loss": 0.0329, "num_input_tokens_seen": 180704736, "step": 83800 }, { "epoch": 15.379886217654615, "grad_norm": 0.006038064602762461, "learning_rate": 1.5394713925069715e-06, "loss": 0.0063, "num_input_tokens_seen": 180715488, "step": 83805 }, { "epoch": 15.380803817214169, "grad_norm": 0.009295043535530567, "learning_rate": 1.538893453808482e-06, "loss": 0.0001, "num_input_tokens_seen": 180724384, "step": 83810 }, { "epoch": 15.38172141677372, "grad_norm": 0.01804587058722973, "learning_rate": 1.5383156038820134e-06, "loss": 0.0001, "num_input_tokens_seen": 180736000, "step": 83815 }, { "epoch": 15.382639016333272, "grad_norm": 0.10386033356189728, "learning_rate": 1.5377378427423839e-06, "loss": 0.0013, "num_input_tokens_seen": 180746720, "step": 83820 }, { "epoch": 15.383556615892825, "grad_norm": 0.0052959369495511055, "learning_rate": 1.5371601704044125e-06, "loss": 0.0144, "num_input_tokens_seen": 180757248, "step": 83825 }, { "epoch": 15.384474215452377, "grad_norm": 0.0035231919027864933, "learning_rate": 1.536582586882915e-06, "loss": 0.0401, "num_input_tokens_seen": 180767424, "step": 83830 }, { "epoch": 15.385391815011928, "grad_norm": 0.004705518018454313, "learning_rate": 1.5360050921927072e-06, "loss": 0.0, "num_input_tokens_seen": 180778144, "step": 83835 }, { "epoch": 15.386309414571482, "grad_norm": 159.20545959472656, "learning_rate": 1.5354276863486006e-06, "loss": 0.1705, "num_input_tokens_seen": 180788416, "step": 83840 }, { "epoch": 15.387227014131033, "grad_norm": 0.26820677518844604, "learning_rate": 1.5348503693654021e-06, "loss": 0.0001, "num_input_tokens_seen": 180799296, "step": 83845 }, { "epoch": 15.388144613690585, "grad_norm": 0.009445139206945896, "learning_rate": 1.5342731412579232e-06, "loss": 0.0588, "num_input_tokens_seen": 180810592, "step": 83850 }, { "epoch": 15.389062213250138, "grad_norm": 0.0026657069101929665, "learning_rate": 1.5336960020409665e-06, "loss": 0.1878, "num_input_tokens_seen": 180821888, "step": 83855 }, { "epoch": 15.38997981280969, "grad_norm": 0.0032162931747734547, "learning_rate": 1.5331189517293337e-06, "loss": 0.0001, "num_input_tokens_seen": 180832960, "step": 83860 }, { "epoch": 15.390897412369242, "grad_norm": 0.006178963463753462, "learning_rate": 1.532541990337828e-06, "loss": 0.4639, "num_input_tokens_seen": 180844416, "step": 83865 }, { "epoch": 15.391815011928795, "grad_norm": 2.175938367843628, "learning_rate": 1.5319651178812462e-06, "loss": 0.0003, "num_input_tokens_seen": 180855712, "step": 83870 }, { "epoch": 15.392732611488347, "grad_norm": 0.013897974975407124, "learning_rate": 1.5313883343743846e-06, "loss": 0.0001, "num_input_tokens_seen": 180866624, "step": 83875 }, { "epoch": 15.393650211047898, "grad_norm": 0.8788021206855774, "learning_rate": 1.5308116398320343e-06, "loss": 0.0002, "num_input_tokens_seen": 180877120, "step": 83880 }, { "epoch": 15.394567810607452, "grad_norm": 0.006702060345560312, "learning_rate": 1.5302350342689904e-06, "loss": 0.0207, "num_input_tokens_seen": 180886848, "step": 83885 }, { "epoch": 15.395485410167003, "grad_norm": 0.036469414830207825, "learning_rate": 1.52965851770004e-06, "loss": 0.0, "num_input_tokens_seen": 180897312, "step": 83890 }, { "epoch": 15.396403009726555, "grad_norm": 0.02677585929632187, "learning_rate": 1.529082090139969e-06, "loss": 0.0, "num_input_tokens_seen": 180908256, "step": 83895 }, { "epoch": 15.397320609286108, "grad_norm": 0.27108335494995117, "learning_rate": 1.528505751603564e-06, "loss": 0.0001, "num_input_tokens_seen": 180918080, "step": 83900 }, { "epoch": 15.39823820884566, "grad_norm": 0.011442695744335651, "learning_rate": 1.5279295021056067e-06, "loss": 0.0003, "num_input_tokens_seen": 180928800, "step": 83905 }, { "epoch": 15.399155808405212, "grad_norm": 0.0006307945586740971, "learning_rate": 1.527353341660876e-06, "loss": 0.1753, "num_input_tokens_seen": 180940032, "step": 83910 }, { "epoch": 15.400073407964765, "grad_norm": 0.04121388494968414, "learning_rate": 1.52677727028415e-06, "loss": 0.0, "num_input_tokens_seen": 180951040, "step": 83915 }, { "epoch": 15.400991007524317, "grad_norm": 0.1201886534690857, "learning_rate": 1.5262012879902027e-06, "loss": 0.0001, "num_input_tokens_seen": 180960608, "step": 83920 }, { "epoch": 15.401908607083868, "grad_norm": 0.0025143988896161318, "learning_rate": 1.52562539479381e-06, "loss": 0.0, "num_input_tokens_seen": 180971008, "step": 83925 }, { "epoch": 15.402826206643422, "grad_norm": 0.0016098292544484138, "learning_rate": 1.5250495907097407e-06, "loss": 0.0001, "num_input_tokens_seen": 180982016, "step": 83930 }, { "epoch": 15.403743806202973, "grad_norm": 49.38062286376953, "learning_rate": 1.5244738757527645e-06, "loss": 0.0944, "num_input_tokens_seen": 180991840, "step": 83935 }, { "epoch": 15.404661405762525, "grad_norm": 0.0009043972822837532, "learning_rate": 1.5238982499376458e-06, "loss": 0.0001, "num_input_tokens_seen": 181002752, "step": 83940 }, { "epoch": 15.405579005322078, "grad_norm": 0.02142694592475891, "learning_rate": 1.523322713279149e-06, "loss": 0.0001, "num_input_tokens_seen": 181013696, "step": 83945 }, { "epoch": 15.40649660488163, "grad_norm": 0.0010238585527986288, "learning_rate": 1.5227472657920373e-06, "loss": 0.0001, "num_input_tokens_seen": 181025056, "step": 83950 }, { "epoch": 15.407414204441181, "grad_norm": 0.0011631682282313704, "learning_rate": 1.5221719074910691e-06, "loss": 0.043, "num_input_tokens_seen": 181035712, "step": 83955 }, { "epoch": 15.408331804000735, "grad_norm": 0.00485639413818717, "learning_rate": 1.5215966383910008e-06, "loss": 0.0001, "num_input_tokens_seen": 181046304, "step": 83960 }, { "epoch": 15.409249403560286, "grad_norm": 0.002401178702712059, "learning_rate": 1.521021458506587e-06, "loss": 0.0002, "num_input_tokens_seen": 181058176, "step": 83965 }, { "epoch": 15.410167003119838, "grad_norm": 0.0016175276832655072, "learning_rate": 1.5204463678525817e-06, "loss": 0.0001, "num_input_tokens_seen": 181070048, "step": 83970 }, { "epoch": 15.411084602679392, "grad_norm": 2.0773725509643555, "learning_rate": 1.5198713664437342e-06, "loss": 0.0008, "num_input_tokens_seen": 181081440, "step": 83975 }, { "epoch": 15.412002202238943, "grad_norm": 0.008055939339101315, "learning_rate": 1.5192964542947912e-06, "loss": 0.0, "num_input_tokens_seen": 181093024, "step": 83980 }, { "epoch": 15.412919801798495, "grad_norm": 0.007235941477119923, "learning_rate": 1.518721631420501e-06, "loss": 0.0009, "num_input_tokens_seen": 181103424, "step": 83985 }, { "epoch": 15.413837401358048, "grad_norm": 0.01240702997893095, "learning_rate": 1.5181468978356057e-06, "loss": 0.2376, "num_input_tokens_seen": 181113376, "step": 83990 }, { "epoch": 15.4147550009176, "grad_norm": 0.01679357700049877, "learning_rate": 1.5175722535548442e-06, "loss": 0.0009, "num_input_tokens_seen": 181125184, "step": 83995 }, { "epoch": 15.415672600477151, "grad_norm": 74.05956268310547, "learning_rate": 1.516997698592959e-06, "loss": 0.1283, "num_input_tokens_seen": 181134176, "step": 84000 }, { "epoch": 15.416590200036705, "grad_norm": 0.0004176981747150421, "learning_rate": 1.516423232964685e-06, "loss": 0.0, "num_input_tokens_seen": 181144544, "step": 84005 }, { "epoch": 15.417507799596256, "grad_norm": 89.76459503173828, "learning_rate": 1.5158488566847551e-06, "loss": 0.0588, "num_input_tokens_seen": 181156608, "step": 84010 }, { "epoch": 15.418425399155808, "grad_norm": 2.6270434856414795, "learning_rate": 1.5152745697679011e-06, "loss": 0.0385, "num_input_tokens_seen": 181167872, "step": 84015 }, { "epoch": 15.419342998715361, "grad_norm": 0.0072367992252111435, "learning_rate": 1.5147003722288551e-06, "loss": 0.0, "num_input_tokens_seen": 181179968, "step": 84020 }, { "epoch": 15.420260598274913, "grad_norm": 0.016894252970814705, "learning_rate": 1.5141262640823428e-06, "loss": 0.0002, "num_input_tokens_seen": 181190432, "step": 84025 }, { "epoch": 15.421178197834465, "grad_norm": 0.0007788505754433572, "learning_rate": 1.5135522453430874e-06, "loss": 0.0478, "num_input_tokens_seen": 181202592, "step": 84030 }, { "epoch": 15.422095797394018, "grad_norm": 0.0029090740717947483, "learning_rate": 1.5129783160258149e-06, "loss": 0.0001, "num_input_tokens_seen": 181212832, "step": 84035 }, { "epoch": 15.42301339695357, "grad_norm": 0.018704673275351524, "learning_rate": 1.5124044761452444e-06, "loss": 0.0025, "num_input_tokens_seen": 181221184, "step": 84040 }, { "epoch": 15.423930996513121, "grad_norm": 0.0022445234935730696, "learning_rate": 1.5118307257160925e-06, "loss": 0.0063, "num_input_tokens_seen": 181232032, "step": 84045 }, { "epoch": 15.424848596072675, "grad_norm": 17.188934326171875, "learning_rate": 1.5112570647530779e-06, "loss": 0.0378, "num_input_tokens_seen": 181243008, "step": 84050 }, { "epoch": 15.425766195632226, "grad_norm": 0.06648196280002594, "learning_rate": 1.510683493270912e-06, "loss": 0.1037, "num_input_tokens_seen": 181253728, "step": 84055 }, { "epoch": 15.426683795191778, "grad_norm": 0.061706461012363434, "learning_rate": 1.510110011284307e-06, "loss": 0.1876, "num_input_tokens_seen": 181265248, "step": 84060 }, { "epoch": 15.427601394751331, "grad_norm": 0.003574466099962592, "learning_rate": 1.5095366188079697e-06, "loss": 0.0002, "num_input_tokens_seen": 181275648, "step": 84065 }, { "epoch": 15.428518994310883, "grad_norm": 36.31907653808594, "learning_rate": 1.5089633158566098e-06, "loss": 0.16, "num_input_tokens_seen": 181286176, "step": 84070 }, { "epoch": 15.429436593870435, "grad_norm": 0.02576908841729164, "learning_rate": 1.5083901024449298e-06, "loss": 0.0028, "num_input_tokens_seen": 181297088, "step": 84075 }, { "epoch": 15.430354193429988, "grad_norm": 117.63793182373047, "learning_rate": 1.5078169785876312e-06, "loss": 0.2165, "num_input_tokens_seen": 181307232, "step": 84080 }, { "epoch": 15.43127179298954, "grad_norm": 0.025609983131289482, "learning_rate": 1.5072439442994163e-06, "loss": 0.5525, "num_input_tokens_seen": 181318848, "step": 84085 }, { "epoch": 15.432189392549091, "grad_norm": 0.32081571221351624, "learning_rate": 1.5066709995949808e-06, "loss": 0.0001, "num_input_tokens_seen": 181330432, "step": 84090 }, { "epoch": 15.433106992108645, "grad_norm": 0.1892051100730896, "learning_rate": 1.5060981444890187e-06, "loss": 0.0002, "num_input_tokens_seen": 181341472, "step": 84095 }, { "epoch": 15.434024591668196, "grad_norm": 0.01583120971918106, "learning_rate": 1.5055253789962255e-06, "loss": 0.0001, "num_input_tokens_seen": 181352480, "step": 84100 }, { "epoch": 15.434942191227748, "grad_norm": 0.004466533660888672, "learning_rate": 1.5049527031312906e-06, "loss": 0.0, "num_input_tokens_seen": 181362880, "step": 84105 }, { "epoch": 15.435859790787301, "grad_norm": 0.02865324541926384, "learning_rate": 1.5043801169089017e-06, "loss": 0.0002, "num_input_tokens_seen": 181373024, "step": 84110 }, { "epoch": 15.436777390346853, "grad_norm": 0.11403841525316238, "learning_rate": 1.5038076203437436e-06, "loss": 0.2376, "num_input_tokens_seen": 181384448, "step": 84115 }, { "epoch": 15.437694989906404, "grad_norm": 0.0008517528185620904, "learning_rate": 1.503235213450503e-06, "loss": 0.0001, "num_input_tokens_seen": 181395232, "step": 84120 }, { "epoch": 15.438612589465958, "grad_norm": 179.3031005859375, "learning_rate": 1.5026628962438594e-06, "loss": 0.0533, "num_input_tokens_seen": 181404608, "step": 84125 }, { "epoch": 15.43953018902551, "grad_norm": 0.01317079272121191, "learning_rate": 1.5020906687384923e-06, "loss": 0.2081, "num_input_tokens_seen": 181414624, "step": 84130 }, { "epoch": 15.440447788585061, "grad_norm": 0.001990178134292364, "learning_rate": 1.5015185309490786e-06, "loss": 0.0001, "num_input_tokens_seen": 181425312, "step": 84135 }, { "epoch": 15.441365388144614, "grad_norm": 3.7928946018218994, "learning_rate": 1.5009464828902902e-06, "loss": 0.0178, "num_input_tokens_seen": 181436672, "step": 84140 }, { "epoch": 15.442282987704166, "grad_norm": 0.0024593432899564505, "learning_rate": 1.5003745245768036e-06, "loss": 0.0022, "num_input_tokens_seen": 181448032, "step": 84145 }, { "epoch": 15.443200587263718, "grad_norm": 0.2985456585884094, "learning_rate": 1.4998026560232865e-06, "loss": 0.0427, "num_input_tokens_seen": 181457440, "step": 84150 }, { "epoch": 15.444118186823271, "grad_norm": 0.22451098263263702, "learning_rate": 1.4992308772444063e-06, "loss": 0.0031, "num_input_tokens_seen": 181467968, "step": 84155 }, { "epoch": 15.445035786382823, "grad_norm": 0.010611067526042461, "learning_rate": 1.4986591882548285e-06, "loss": 0.0001, "num_input_tokens_seen": 181479360, "step": 84160 }, { "epoch": 15.445953385942374, "grad_norm": 57.125675201416016, "learning_rate": 1.4980875890692143e-06, "loss": 0.3099, "num_input_tokens_seen": 181488896, "step": 84165 }, { "epoch": 15.446870985501928, "grad_norm": 0.8214685320854187, "learning_rate": 1.4975160797022276e-06, "loss": 0.0102, "num_input_tokens_seen": 181500320, "step": 84170 }, { "epoch": 15.44778858506148, "grad_norm": 0.07566548883914948, "learning_rate": 1.496944660168525e-06, "loss": 0.0027, "num_input_tokens_seen": 181511200, "step": 84175 }, { "epoch": 15.448706184621031, "grad_norm": 0.0283077210187912, "learning_rate": 1.4963733304827616e-06, "loss": 0.0981, "num_input_tokens_seen": 181522784, "step": 84180 }, { "epoch": 15.449623784180584, "grad_norm": 0.19011855125427246, "learning_rate": 1.4958020906595933e-06, "loss": 0.0018, "num_input_tokens_seen": 181533632, "step": 84185 }, { "epoch": 15.450541383740136, "grad_norm": 0.02438514493405819, "learning_rate": 1.49523094071367e-06, "loss": 0.0824, "num_input_tokens_seen": 181545536, "step": 84190 }, { "epoch": 15.451458983299688, "grad_norm": 0.37118420004844666, "learning_rate": 1.4946598806596413e-06, "loss": 0.0001, "num_input_tokens_seen": 181555648, "step": 84195 }, { "epoch": 15.452376582859241, "grad_norm": 0.004608359653502703, "learning_rate": 1.4940889105121526e-06, "loss": 0.0, "num_input_tokens_seen": 181565888, "step": 84200 }, { "epoch": 15.453294182418793, "grad_norm": 0.008899601176381111, "learning_rate": 1.493518030285851e-06, "loss": 0.0001, "num_input_tokens_seen": 181576128, "step": 84205 }, { "epoch": 15.454211781978344, "grad_norm": 0.018056675791740417, "learning_rate": 1.4929472399953775e-06, "loss": 0.0, "num_input_tokens_seen": 181586144, "step": 84210 }, { "epoch": 15.455129381537898, "grad_norm": 0.006526027340441942, "learning_rate": 1.4923765396553702e-06, "loss": 0.0001, "num_input_tokens_seen": 181596512, "step": 84215 }, { "epoch": 15.45604698109745, "grad_norm": 0.004552808590233326, "learning_rate": 1.4918059292804698e-06, "loss": 0.0001, "num_input_tokens_seen": 181606784, "step": 84220 }, { "epoch": 15.456964580657, "grad_norm": 0.005706022493541241, "learning_rate": 1.4912354088853103e-06, "loss": 0.0001, "num_input_tokens_seen": 181618112, "step": 84225 }, { "epoch": 15.457882180216554, "grad_norm": 0.005507391877472401, "learning_rate": 1.4906649784845234e-06, "loss": 0.0, "num_input_tokens_seen": 181628160, "step": 84230 }, { "epoch": 15.458799779776106, "grad_norm": 0.0008784847450442612, "learning_rate": 1.4900946380927416e-06, "loss": 0.0284, "num_input_tokens_seen": 181639328, "step": 84235 }, { "epoch": 15.459717379335657, "grad_norm": 0.009096273221075535, "learning_rate": 1.489524387724593e-06, "loss": 0.0, "num_input_tokens_seen": 181647264, "step": 84240 }, { "epoch": 15.46063497889521, "grad_norm": 58.844425201416016, "learning_rate": 1.4889542273947027e-06, "loss": 0.3189, "num_input_tokens_seen": 181657856, "step": 84245 }, { "epoch": 15.461552578454762, "grad_norm": 0.005596342962235212, "learning_rate": 1.4883841571176931e-06, "loss": 0.1938, "num_input_tokens_seen": 181668832, "step": 84250 }, { "epoch": 15.462470178014314, "grad_norm": 0.013150373473763466, "learning_rate": 1.4878141769081895e-06, "loss": 0.0002, "num_input_tokens_seen": 181679904, "step": 84255 }, { "epoch": 15.463387777573868, "grad_norm": 0.00762866111472249, "learning_rate": 1.4872442867808084e-06, "loss": 0.0004, "num_input_tokens_seen": 181690400, "step": 84260 }, { "epoch": 15.46430537713342, "grad_norm": 0.017143456265330315, "learning_rate": 1.486674486750166e-06, "loss": 0.1315, "num_input_tokens_seen": 181701792, "step": 84265 }, { "epoch": 15.46522297669297, "grad_norm": 0.0014162409352138638, "learning_rate": 1.486104776830879e-06, "loss": 0.1438, "num_input_tokens_seen": 181712032, "step": 84270 }, { "epoch": 15.466140576252524, "grad_norm": 0.0005986103788018227, "learning_rate": 1.4855351570375587e-06, "loss": 0.0001, "num_input_tokens_seen": 181722976, "step": 84275 }, { "epoch": 15.467058175812076, "grad_norm": 0.10256176441907883, "learning_rate": 1.4849656273848146e-06, "loss": 0.0824, "num_input_tokens_seen": 181733376, "step": 84280 }, { "epoch": 15.467975775371627, "grad_norm": 0.07828477025032043, "learning_rate": 1.4843961878872526e-06, "loss": 0.1253, "num_input_tokens_seen": 181743584, "step": 84285 }, { "epoch": 15.46889337493118, "grad_norm": 0.020549360662698746, "learning_rate": 1.4838268385594812e-06, "loss": 0.0028, "num_input_tokens_seen": 181754528, "step": 84290 }, { "epoch": 15.469810974490732, "grad_norm": 0.4059593081474304, "learning_rate": 1.4832575794161024e-06, "loss": 0.0002, "num_input_tokens_seen": 181765664, "step": 84295 }, { "epoch": 15.470728574050284, "grad_norm": 0.035761650651693344, "learning_rate": 1.4826884104717142e-06, "loss": 0.0003, "num_input_tokens_seen": 181777344, "step": 84300 }, { "epoch": 15.471646173609837, "grad_norm": 0.205540731549263, "learning_rate": 1.4821193317409182e-06, "loss": 0.2783, "num_input_tokens_seen": 181787808, "step": 84305 }, { "epoch": 15.472563773169389, "grad_norm": 0.31540822982788086, "learning_rate": 1.4815503432383099e-06, "loss": 0.0794, "num_input_tokens_seen": 181798560, "step": 84310 }, { "epoch": 15.47348137272894, "grad_norm": 0.004906892776489258, "learning_rate": 1.4809814449784803e-06, "loss": 0.0516, "num_input_tokens_seen": 181808192, "step": 84315 }, { "epoch": 15.474398972288494, "grad_norm": 0.008766948245465755, "learning_rate": 1.4804126369760241e-06, "loss": 0.0001, "num_input_tokens_seen": 181818272, "step": 84320 }, { "epoch": 15.475316571848046, "grad_norm": 0.015207890421152115, "learning_rate": 1.4798439192455288e-06, "loss": 0.0, "num_input_tokens_seen": 181829472, "step": 84325 }, { "epoch": 15.476234171407597, "grad_norm": 0.011721103452146053, "learning_rate": 1.4792752918015812e-06, "loss": 0.0005, "num_input_tokens_seen": 181839744, "step": 84330 }, { "epoch": 15.47715177096715, "grad_norm": 0.001340624992735684, "learning_rate": 1.4787067546587647e-06, "loss": 0.0, "num_input_tokens_seen": 181850112, "step": 84335 }, { "epoch": 15.478069370526702, "grad_norm": 0.045810725539922714, "learning_rate": 1.4781383078316636e-06, "loss": 0.0003, "num_input_tokens_seen": 181860448, "step": 84340 }, { "epoch": 15.478986970086254, "grad_norm": 0.004749234300106764, "learning_rate": 1.4775699513348563e-06, "loss": 0.0045, "num_input_tokens_seen": 181871328, "step": 84345 }, { "epoch": 15.479904569645807, "grad_norm": 0.0010747889755293727, "learning_rate": 1.477001685182921e-06, "loss": 0.0004, "num_input_tokens_seen": 181880800, "step": 84350 }, { "epoch": 15.480822169205359, "grad_norm": 0.001822006655856967, "learning_rate": 1.4764335093904319e-06, "loss": 0.0, "num_input_tokens_seen": 181892480, "step": 84355 }, { "epoch": 15.48173976876491, "grad_norm": 0.006564129609614611, "learning_rate": 1.4758654239719612e-06, "loss": 0.0588, "num_input_tokens_seen": 181904960, "step": 84360 }, { "epoch": 15.482657368324464, "grad_norm": 0.005342930089682341, "learning_rate": 1.4752974289420813e-06, "loss": 0.0, "num_input_tokens_seen": 181915936, "step": 84365 }, { "epoch": 15.483574967884016, "grad_norm": 0.908552348613739, "learning_rate": 1.4747295243153597e-06, "loss": 0.1441, "num_input_tokens_seen": 181924864, "step": 84370 }, { "epoch": 15.484492567443567, "grad_norm": 0.013245436362922192, "learning_rate": 1.4741617101063626e-06, "loss": 0.0119, "num_input_tokens_seen": 181936000, "step": 84375 }, { "epoch": 15.48541016700312, "grad_norm": 0.00971498154103756, "learning_rate": 1.4735939863296527e-06, "loss": 0.008, "num_input_tokens_seen": 181947360, "step": 84380 }, { "epoch": 15.486327766562672, "grad_norm": 0.016918037086725235, "learning_rate": 1.47302635299979e-06, "loss": 0.0002, "num_input_tokens_seen": 181957216, "step": 84385 }, { "epoch": 15.487245366122224, "grad_norm": 0.008908134885132313, "learning_rate": 1.4724588101313369e-06, "loss": 0.0001, "num_input_tokens_seen": 181968544, "step": 84390 }, { "epoch": 15.488162965681777, "grad_norm": 0.0010620567481964827, "learning_rate": 1.4718913577388483e-06, "loss": 0.0004, "num_input_tokens_seen": 181979872, "step": 84395 }, { "epoch": 15.489080565241329, "grad_norm": 0.01025411020964384, "learning_rate": 1.4713239958368763e-06, "loss": 0.0145, "num_input_tokens_seen": 181989120, "step": 84400 }, { "epoch": 15.48999816480088, "grad_norm": 0.0031661787070333958, "learning_rate": 1.4707567244399761e-06, "loss": 0.116, "num_input_tokens_seen": 182000352, "step": 84405 }, { "epoch": 15.490915764360434, "grad_norm": 0.5666534900665283, "learning_rate": 1.4701895435626967e-06, "loss": 0.0009, "num_input_tokens_seen": 182010880, "step": 84410 }, { "epoch": 15.491833363919985, "grad_norm": 194.50828552246094, "learning_rate": 1.4696224532195847e-06, "loss": 0.116, "num_input_tokens_seen": 182021504, "step": 84415 }, { "epoch": 15.492750963479537, "grad_norm": 0.03158808499574661, "learning_rate": 1.4690554534251838e-06, "loss": 0.1222, "num_input_tokens_seen": 182032864, "step": 84420 }, { "epoch": 15.49366856303909, "grad_norm": 0.0028841886669397354, "learning_rate": 1.4684885441940393e-06, "loss": 0.001, "num_input_tokens_seen": 182043072, "step": 84425 }, { "epoch": 15.494586162598642, "grad_norm": 0.03005189634859562, "learning_rate": 1.4679217255406902e-06, "loss": 0.0131, "num_input_tokens_seen": 182054464, "step": 84430 }, { "epoch": 15.495503762158194, "grad_norm": 0.017855819314718246, "learning_rate": 1.4673549974796735e-06, "loss": 0.0001, "num_input_tokens_seen": 182064928, "step": 84435 }, { "epoch": 15.496421361717747, "grad_norm": 0.0034639195073395967, "learning_rate": 1.4667883600255272e-06, "loss": 0.0002, "num_input_tokens_seen": 182075648, "step": 84440 }, { "epoch": 15.497338961277299, "grad_norm": 0.04385704547166824, "learning_rate": 1.4662218131927835e-06, "loss": 0.0, "num_input_tokens_seen": 182086304, "step": 84445 }, { "epoch": 15.49825656083685, "grad_norm": 0.001821714686229825, "learning_rate": 1.4656553569959719e-06, "loss": 0.0101, "num_input_tokens_seen": 182097216, "step": 84450 }, { "epoch": 15.499174160396404, "grad_norm": 0.26021909713745117, "learning_rate": 1.465088991449624e-06, "loss": 0.0004, "num_input_tokens_seen": 182107296, "step": 84455 }, { "epoch": 15.500091759955955, "grad_norm": 0.0018245349638164043, "learning_rate": 1.4645227165682652e-06, "loss": 0.0003, "num_input_tokens_seen": 182118432, "step": 84460 }, { "epoch": 15.501009359515507, "grad_norm": 0.021432017907500267, "learning_rate": 1.4639565323664196e-06, "loss": 0.1377, "num_input_tokens_seen": 182129856, "step": 84465 }, { "epoch": 15.50192695907506, "grad_norm": 0.04129711166024208, "learning_rate": 1.4633904388586063e-06, "loss": 0.0001, "num_input_tokens_seen": 182141440, "step": 84470 }, { "epoch": 15.502844558634612, "grad_norm": 0.0005128313205204904, "learning_rate": 1.4628244360593492e-06, "loss": 0.0291, "num_input_tokens_seen": 182152512, "step": 84475 }, { "epoch": 15.503762158194164, "grad_norm": 0.00566738098859787, "learning_rate": 1.4622585239831627e-06, "loss": 0.0, "num_input_tokens_seen": 182162720, "step": 84480 }, { "epoch": 15.504679757753717, "grad_norm": 0.006727907806634903, "learning_rate": 1.4616927026445604e-06, "loss": 0.0007, "num_input_tokens_seen": 182172864, "step": 84485 }, { "epoch": 15.505597357313269, "grad_norm": 1.110131859779358, "learning_rate": 1.4611269720580578e-06, "loss": 0.0001, "num_input_tokens_seen": 182184672, "step": 84490 }, { "epoch": 15.50651495687282, "grad_norm": 0.007020014338195324, "learning_rate": 1.4605613322381644e-06, "loss": 0.0002, "num_input_tokens_seen": 182196480, "step": 84495 }, { "epoch": 15.507432556432374, "grad_norm": 0.015932517126202583, "learning_rate": 1.459995783199385e-06, "loss": 0.0489, "num_input_tokens_seen": 182208352, "step": 84500 }, { "epoch": 15.508350155991925, "grad_norm": 0.0016400620806962252, "learning_rate": 1.459430324956229e-06, "loss": 0.0, "num_input_tokens_seen": 182218176, "step": 84505 }, { "epoch": 15.509267755551477, "grad_norm": 0.044162917882204056, "learning_rate": 1.4588649575231978e-06, "loss": 0.0005, "num_input_tokens_seen": 182228384, "step": 84510 }, { "epoch": 15.51018535511103, "grad_norm": 0.000717280141543597, "learning_rate": 1.458299680914792e-06, "loss": 0.0002, "num_input_tokens_seen": 182239520, "step": 84515 }, { "epoch": 15.511102954670582, "grad_norm": 0.008119816891849041, "learning_rate": 1.4577344951455092e-06, "loss": 0.0, "num_input_tokens_seen": 182250144, "step": 84520 }, { "epoch": 15.512020554230133, "grad_norm": 0.018523043021559715, "learning_rate": 1.4571694002298476e-06, "loss": 0.0026, "num_input_tokens_seen": 182260544, "step": 84525 }, { "epoch": 15.512938153789687, "grad_norm": 623.9894409179688, "learning_rate": 1.4566043961823001e-06, "loss": 0.1741, "num_input_tokens_seen": 182271776, "step": 84530 }, { "epoch": 15.513855753349238, "grad_norm": 0.00223379023373127, "learning_rate": 1.4560394830173569e-06, "loss": 0.1751, "num_input_tokens_seen": 182282496, "step": 84535 }, { "epoch": 15.51477335290879, "grad_norm": 0.008631939068436623, "learning_rate": 1.4554746607495097e-06, "loss": 0.1876, "num_input_tokens_seen": 182293184, "step": 84540 }, { "epoch": 15.515690952468344, "grad_norm": 0.12793588638305664, "learning_rate": 1.4549099293932439e-06, "loss": 0.0017, "num_input_tokens_seen": 182304576, "step": 84545 }, { "epoch": 15.516608552027895, "grad_norm": 0.02385830320417881, "learning_rate": 1.4543452889630438e-06, "loss": 0.0, "num_input_tokens_seen": 182315776, "step": 84550 }, { "epoch": 15.517526151587447, "grad_norm": 0.0010415613651275635, "learning_rate": 1.4537807394733905e-06, "loss": 0.0, "num_input_tokens_seen": 182326144, "step": 84555 }, { "epoch": 15.518443751147, "grad_norm": 0.0007644559373147786, "learning_rate": 1.4532162809387663e-06, "loss": 0.0002, "num_input_tokens_seen": 182336256, "step": 84560 }, { "epoch": 15.519361350706552, "grad_norm": 0.0004864541406277567, "learning_rate": 1.4526519133736477e-06, "loss": 0.1128, "num_input_tokens_seen": 182346784, "step": 84565 }, { "epoch": 15.520278950266103, "grad_norm": 0.040232934057712555, "learning_rate": 1.4520876367925097e-06, "loss": 0.0001, "num_input_tokens_seen": 182358080, "step": 84570 }, { "epoch": 15.521196549825657, "grad_norm": 31.536043167114258, "learning_rate": 1.4515234512098247e-06, "loss": 0.0883, "num_input_tokens_seen": 182368608, "step": 84575 }, { "epoch": 15.522114149385208, "grad_norm": 0.2016718089580536, "learning_rate": 1.4509593566400627e-06, "loss": 0.0, "num_input_tokens_seen": 182378656, "step": 84580 }, { "epoch": 15.52303174894476, "grad_norm": 0.4469243586063385, "learning_rate": 1.4503953530976933e-06, "loss": 0.0004, "num_input_tokens_seen": 182389952, "step": 84585 }, { "epoch": 15.523949348504313, "grad_norm": 0.0010837339796125889, "learning_rate": 1.4498314405971826e-06, "loss": 0.0, "num_input_tokens_seen": 182400896, "step": 84590 }, { "epoch": 15.524866948063865, "grad_norm": 0.08629666268825531, "learning_rate": 1.4492676191529926e-06, "loss": 0.0001, "num_input_tokens_seen": 182411072, "step": 84595 }, { "epoch": 15.525784547623417, "grad_norm": 0.025551259517669678, "learning_rate": 1.4487038887795851e-06, "loss": 0.019, "num_input_tokens_seen": 182422080, "step": 84600 }, { "epoch": 15.52670214718297, "grad_norm": 0.002216803841292858, "learning_rate": 1.4481402494914175e-06, "loss": 0.0079, "num_input_tokens_seen": 182432800, "step": 84605 }, { "epoch": 15.527619746742522, "grad_norm": 0.012558983638882637, "learning_rate": 1.447576701302949e-06, "loss": 0.0, "num_input_tokens_seen": 182442912, "step": 84610 }, { "epoch": 15.528537346302073, "grad_norm": 0.04581578075885773, "learning_rate": 1.4470132442286322e-06, "loss": 0.0005, "num_input_tokens_seen": 182453856, "step": 84615 }, { "epoch": 15.529454945861627, "grad_norm": 0.002016549464315176, "learning_rate": 1.4464498782829178e-06, "loss": 0.0764, "num_input_tokens_seen": 182464736, "step": 84620 }, { "epoch": 15.530372545421178, "grad_norm": 0.001718500629067421, "learning_rate": 1.4458866034802581e-06, "loss": 0.0, "num_input_tokens_seen": 182473728, "step": 84625 }, { "epoch": 15.53129014498073, "grad_norm": 0.011848493479192257, "learning_rate": 1.4453234198350986e-06, "loss": 0.0001, "num_input_tokens_seen": 182484992, "step": 84630 }, { "epoch": 15.532207744540283, "grad_norm": 0.039172884076833725, "learning_rate": 1.4447603273618826e-06, "loss": 0.002, "num_input_tokens_seen": 182496352, "step": 84635 }, { "epoch": 15.533125344099835, "grad_norm": 0.0013404028723016381, "learning_rate": 1.4441973260750553e-06, "loss": 0.1066, "num_input_tokens_seen": 182507616, "step": 84640 }, { "epoch": 15.534042943659387, "grad_norm": 0.0163415614515543, "learning_rate": 1.4436344159890559e-06, "loss": 0.0001, "num_input_tokens_seen": 182519776, "step": 84645 }, { "epoch": 15.53496054321894, "grad_norm": 0.040479399263858795, "learning_rate": 1.4430715971183218e-06, "loss": 0.0703, "num_input_tokens_seen": 182531488, "step": 84650 }, { "epoch": 15.535878142778492, "grad_norm": 10.380398750305176, "learning_rate": 1.442508869477287e-06, "loss": 0.0008, "num_input_tokens_seen": 182542848, "step": 84655 }, { "epoch": 15.536795742338043, "grad_norm": 0.16558675467967987, "learning_rate": 1.4419462330803879e-06, "loss": 0.0002, "num_input_tokens_seen": 182552256, "step": 84660 }, { "epoch": 15.537713341897597, "grad_norm": 0.03535643592476845, "learning_rate": 1.4413836879420528e-06, "loss": 0.0001, "num_input_tokens_seen": 182563296, "step": 84665 }, { "epoch": 15.538630941457148, "grad_norm": 0.002031240612268448, "learning_rate": 1.4408212340767096e-06, "loss": 0.0001, "num_input_tokens_seen": 182573248, "step": 84670 }, { "epoch": 15.5395485410167, "grad_norm": 0.03466155380010605, "learning_rate": 1.440258871498787e-06, "loss": 0.0, "num_input_tokens_seen": 182583264, "step": 84675 }, { "epoch": 15.540466140576253, "grad_norm": 0.2088111788034439, "learning_rate": 1.4396966002227075e-06, "loss": 0.002, "num_input_tokens_seen": 182594560, "step": 84680 }, { "epoch": 15.541383740135805, "grad_norm": 0.017964065074920654, "learning_rate": 1.4391344202628905e-06, "loss": 0.2113, "num_input_tokens_seen": 182605280, "step": 84685 }, { "epoch": 15.542301339695356, "grad_norm": 0.06846898049116135, "learning_rate": 1.438572331633758e-06, "loss": 0.0944, "num_input_tokens_seen": 182615808, "step": 84690 }, { "epoch": 15.54321893925491, "grad_norm": 0.005659322254359722, "learning_rate": 1.438010334349726e-06, "loss": 0.1005, "num_input_tokens_seen": 182627104, "step": 84695 }, { "epoch": 15.544136538814461, "grad_norm": 0.0015283324755728245, "learning_rate": 1.4374484284252077e-06, "loss": 0.0001, "num_input_tokens_seen": 182637824, "step": 84700 }, { "epoch": 15.545054138374013, "grad_norm": 0.006457183510065079, "learning_rate": 1.4368866138746147e-06, "loss": 0.0001, "num_input_tokens_seen": 182648800, "step": 84705 }, { "epoch": 15.545971737933566, "grad_norm": 0.006589433178305626, "learning_rate": 1.436324890712359e-06, "loss": 0.0208, "num_input_tokens_seen": 182660800, "step": 84710 }, { "epoch": 15.546889337493118, "grad_norm": 0.35688894987106323, "learning_rate": 1.435763258952847e-06, "loss": 0.0001, "num_input_tokens_seen": 182671008, "step": 84715 }, { "epoch": 15.54780693705267, "grad_norm": 44.52100372314453, "learning_rate": 1.4352017186104816e-06, "loss": 0.008, "num_input_tokens_seen": 182681984, "step": 84720 }, { "epoch": 15.548724536612223, "grad_norm": 0.0067816986702382565, "learning_rate": 1.4346402696996685e-06, "loss": 0.0013, "num_input_tokens_seen": 182692352, "step": 84725 }, { "epoch": 15.549642136171775, "grad_norm": 304.1855773925781, "learning_rate": 1.434078912234807e-06, "loss": 0.0822, "num_input_tokens_seen": 182704320, "step": 84730 }, { "epoch": 15.550559735731326, "grad_norm": 0.0015428311889991164, "learning_rate": 1.4335176462302947e-06, "loss": 0.0002, "num_input_tokens_seen": 182714592, "step": 84735 }, { "epoch": 15.55147733529088, "grad_norm": 0.0029395469464361668, "learning_rate": 1.4329564717005257e-06, "loss": 0.0022, "num_input_tokens_seen": 182726720, "step": 84740 }, { "epoch": 15.552394934850431, "grad_norm": 0.012913582846522331, "learning_rate": 1.4323953886598963e-06, "loss": 0.1782, "num_input_tokens_seen": 182737856, "step": 84745 }, { "epoch": 15.553312534409983, "grad_norm": 0.002488195663318038, "learning_rate": 1.431834397122796e-06, "loss": 0.0002, "num_input_tokens_seen": 182748544, "step": 84750 }, { "epoch": 15.554230133969536, "grad_norm": 71.0403060913086, "learning_rate": 1.4312734971036113e-06, "loss": 0.0705, "num_input_tokens_seen": 182757984, "step": 84755 }, { "epoch": 15.555147733529088, "grad_norm": 11.996294021606445, "learning_rate": 1.430712688616732e-06, "loss": 0.0031, "num_input_tokens_seen": 182769568, "step": 84760 }, { "epoch": 15.55606533308864, "grad_norm": 0.022289631888270378, "learning_rate": 1.4301519716765405e-06, "loss": 0.0041, "num_input_tokens_seen": 182781472, "step": 84765 }, { "epoch": 15.556982932648193, "grad_norm": 0.004396119154989719, "learning_rate": 1.4295913462974187e-06, "loss": 0.1314, "num_input_tokens_seen": 182792320, "step": 84770 }, { "epoch": 15.557900532207745, "grad_norm": 0.0077088624238967896, "learning_rate": 1.4290308124937429e-06, "loss": 0.0079, "num_input_tokens_seen": 182803008, "step": 84775 }, { "epoch": 15.558818131767296, "grad_norm": 0.011335275135934353, "learning_rate": 1.428470370279894e-06, "loss": 0.0001, "num_input_tokens_seen": 182812832, "step": 84780 }, { "epoch": 15.55973573132685, "grad_norm": 0.013796070590615273, "learning_rate": 1.427910019670245e-06, "loss": 0.1595, "num_input_tokens_seen": 182823968, "step": 84785 }, { "epoch": 15.560653330886401, "grad_norm": 0.256022185087204, "learning_rate": 1.4273497606791675e-06, "loss": 0.0001, "num_input_tokens_seen": 182835584, "step": 84790 }, { "epoch": 15.561570930445953, "grad_norm": 6.093649387359619, "learning_rate": 1.426789593321032e-06, "loss": 0.0038, "num_input_tokens_seen": 182846240, "step": 84795 }, { "epoch": 15.562488530005506, "grad_norm": 0.34745898842811584, "learning_rate": 1.4262295176102047e-06, "loss": 0.001, "num_input_tokens_seen": 182856352, "step": 84800 }, { "epoch": 15.563406129565058, "grad_norm": 0.13789334893226624, "learning_rate": 1.4256695335610504e-06, "loss": 0.0001, "num_input_tokens_seen": 182867200, "step": 84805 }, { "epoch": 15.56432372912461, "grad_norm": 0.008047761395573616, "learning_rate": 1.425109641187934e-06, "loss": 0.0588, "num_input_tokens_seen": 182879616, "step": 84810 }, { "epoch": 15.565241328684163, "grad_norm": 0.0019169216975569725, "learning_rate": 1.424549840505215e-06, "loss": 0.0001, "num_input_tokens_seen": 182889728, "step": 84815 }, { "epoch": 15.566158928243714, "grad_norm": 0.008944532833993435, "learning_rate": 1.4239901315272498e-06, "loss": 0.0207, "num_input_tokens_seen": 182901472, "step": 84820 }, { "epoch": 15.567076527803266, "grad_norm": 0.05463887006044388, "learning_rate": 1.4234305142683963e-06, "loss": 0.0001, "num_input_tokens_seen": 182912544, "step": 84825 }, { "epoch": 15.56799412736282, "grad_norm": 0.003061153693124652, "learning_rate": 1.4228709887430075e-06, "loss": 0.0003, "num_input_tokens_seen": 182922944, "step": 84830 }, { "epoch": 15.568911726922371, "grad_norm": 0.011907420121133327, "learning_rate": 1.4223115549654337e-06, "loss": 0.0002, "num_input_tokens_seen": 182933568, "step": 84835 }, { "epoch": 15.569829326481923, "grad_norm": 0.02652880549430847, "learning_rate": 1.4217522129500222e-06, "loss": 0.0544, "num_input_tokens_seen": 182944224, "step": 84840 }, { "epoch": 15.570746926041476, "grad_norm": 0.01251351647078991, "learning_rate": 1.421192962711122e-06, "loss": 0.0, "num_input_tokens_seen": 182954304, "step": 84845 }, { "epoch": 15.571664525601028, "grad_norm": 0.001972080674022436, "learning_rate": 1.4206338042630757e-06, "loss": 0.0001, "num_input_tokens_seen": 182965216, "step": 84850 }, { "epoch": 15.57258212516058, "grad_norm": 0.14663086831569672, "learning_rate": 1.4200747376202228e-06, "loss": 0.0001, "num_input_tokens_seen": 182976416, "step": 84855 }, { "epoch": 15.573499724720133, "grad_norm": 0.0013787787174805999, "learning_rate": 1.419515762796907e-06, "loss": 0.0, "num_input_tokens_seen": 182986880, "step": 84860 }, { "epoch": 15.574417324279684, "grad_norm": 0.0011468613520264626, "learning_rate": 1.4189568798074615e-06, "loss": 0.0032, "num_input_tokens_seen": 182996192, "step": 84865 }, { "epoch": 15.575334923839236, "grad_norm": 0.05172225460410118, "learning_rate": 1.4183980886662214e-06, "loss": 0.0, "num_input_tokens_seen": 183005536, "step": 84870 }, { "epoch": 15.57625252339879, "grad_norm": 0.006859919521957636, "learning_rate": 1.4178393893875204e-06, "loss": 0.0002, "num_input_tokens_seen": 183016896, "step": 84875 }, { "epoch": 15.577170122958341, "grad_norm": 3.306459426879883, "learning_rate": 1.4172807819856872e-06, "loss": 0.2573, "num_input_tokens_seen": 183026176, "step": 84880 }, { "epoch": 15.578087722517893, "grad_norm": 0.011133897118270397, "learning_rate": 1.4167222664750495e-06, "loss": 0.0002, "num_input_tokens_seen": 183036768, "step": 84885 }, { "epoch": 15.579005322077446, "grad_norm": 0.001228214823640883, "learning_rate": 1.4161638428699304e-06, "loss": 0.0001, "num_input_tokens_seen": 183047392, "step": 84890 }, { "epoch": 15.579922921636998, "grad_norm": 1.9560106992721558, "learning_rate": 1.4156055111846555e-06, "loss": 0.0002, "num_input_tokens_seen": 183057984, "step": 84895 }, { "epoch": 15.58084052119655, "grad_norm": 0.013631819747388363, "learning_rate": 1.4150472714335446e-06, "loss": 0.0, "num_input_tokens_seen": 183068480, "step": 84900 }, { "epoch": 15.581758120756103, "grad_norm": 8.04289722442627, "learning_rate": 1.4144891236309127e-06, "loss": 0.0008, "num_input_tokens_seen": 183077696, "step": 84905 }, { "epoch": 15.582675720315654, "grad_norm": 0.0007465972448699176, "learning_rate": 1.41393106779108e-06, "loss": 0.0001, "num_input_tokens_seen": 183089216, "step": 84910 }, { "epoch": 15.583593319875206, "grad_norm": 0.013170457445085049, "learning_rate": 1.413373103928357e-06, "loss": 0.0, "num_input_tokens_seen": 183099904, "step": 84915 }, { "epoch": 15.58451091943476, "grad_norm": 0.0012922221794724464, "learning_rate": 1.4128152320570553e-06, "loss": 0.0001, "num_input_tokens_seen": 183111232, "step": 84920 }, { "epoch": 15.585428518994311, "grad_norm": 132.185791015625, "learning_rate": 1.4122574521914818e-06, "loss": 0.2381, "num_input_tokens_seen": 183122112, "step": 84925 }, { "epoch": 15.586346118553863, "grad_norm": 0.0008980652783066034, "learning_rate": 1.4116997643459458e-06, "loss": 0.0, "num_input_tokens_seen": 183132736, "step": 84930 }, { "epoch": 15.587263718113416, "grad_norm": 0.4139372706413269, "learning_rate": 1.4111421685347493e-06, "loss": 0.0008, "num_input_tokens_seen": 183143232, "step": 84935 }, { "epoch": 15.588181317672968, "grad_norm": 0.0013358425348997116, "learning_rate": 1.4105846647721922e-06, "loss": 0.0009, "num_input_tokens_seen": 183153824, "step": 84940 }, { "epoch": 15.58909891723252, "grad_norm": 0.10926396399736404, "learning_rate": 1.4100272530725773e-06, "loss": 0.0001, "num_input_tokens_seen": 183163616, "step": 84945 }, { "epoch": 15.590016516792073, "grad_norm": 0.0024388369638472795, "learning_rate": 1.4094699334501988e-06, "loss": 0.0016, "num_input_tokens_seen": 183175360, "step": 84950 }, { "epoch": 15.590934116351624, "grad_norm": 0.43501725792884827, "learning_rate": 1.4089127059193507e-06, "loss": 0.0001, "num_input_tokens_seen": 183185952, "step": 84955 }, { "epoch": 15.591851715911176, "grad_norm": 0.009368333034217358, "learning_rate": 1.4083555704943275e-06, "loss": 0.0079, "num_input_tokens_seen": 183196768, "step": 84960 }, { "epoch": 15.59276931547073, "grad_norm": 0.01406269334256649, "learning_rate": 1.4077985271894173e-06, "loss": 0.0, "num_input_tokens_seen": 183208256, "step": 84965 }, { "epoch": 15.59368691503028, "grad_norm": 12.976774215698242, "learning_rate": 1.4072415760189074e-06, "loss": 0.0016, "num_input_tokens_seen": 183219968, "step": 84970 }, { "epoch": 15.594604514589832, "grad_norm": 0.0032532773911952972, "learning_rate": 1.4066847169970815e-06, "loss": 0.0, "num_input_tokens_seen": 183230592, "step": 84975 }, { "epoch": 15.595522114149386, "grad_norm": 0.009470576420426369, "learning_rate": 1.4061279501382247e-06, "loss": 0.0001, "num_input_tokens_seen": 183241248, "step": 84980 }, { "epoch": 15.596439713708937, "grad_norm": 0.018197519704699516, "learning_rate": 1.4055712754566164e-06, "loss": 0.0002, "num_input_tokens_seen": 183253344, "step": 84985 }, { "epoch": 15.597357313268489, "grad_norm": 0.11610674113035202, "learning_rate": 1.4050146929665337e-06, "loss": 0.0001, "num_input_tokens_seen": 183265440, "step": 84990 }, { "epoch": 15.598274912828042, "grad_norm": 0.0006787683232687414, "learning_rate": 1.4044582026822523e-06, "loss": 0.0001, "num_input_tokens_seen": 183276096, "step": 84995 }, { "epoch": 15.599192512387594, "grad_norm": 0.0008418670040555298, "learning_rate": 1.4039018046180442e-06, "loss": 0.0, "num_input_tokens_seen": 183286336, "step": 85000 }, { "epoch": 15.600110111947146, "grad_norm": 1.914503574371338, "learning_rate": 1.4033454987881828e-06, "loss": 0.0002, "num_input_tokens_seen": 183296192, "step": 85005 }, { "epoch": 15.601027711506699, "grad_norm": 0.002990350127220154, "learning_rate": 1.4027892852069347e-06, "loss": 0.1005, "num_input_tokens_seen": 183306016, "step": 85010 }, { "epoch": 15.60194531106625, "grad_norm": 0.000927097862586379, "learning_rate": 1.4022331638885666e-06, "loss": 0.0944, "num_input_tokens_seen": 183316672, "step": 85015 }, { "epoch": 15.602862910625802, "grad_norm": 568.740478515625, "learning_rate": 1.4016771348473418e-06, "loss": 0.0739, "num_input_tokens_seen": 183328160, "step": 85020 }, { "epoch": 15.603780510185356, "grad_norm": 0.00812471192330122, "learning_rate": 1.4011211980975198e-06, "loss": 0.0, "num_input_tokens_seen": 183337632, "step": 85025 }, { "epoch": 15.604698109744907, "grad_norm": 0.0027340250089764595, "learning_rate": 1.400565353653363e-06, "loss": 0.0285, "num_input_tokens_seen": 183347968, "step": 85030 }, { "epoch": 15.605615709304459, "grad_norm": 0.000995345413684845, "learning_rate": 1.4000096015291264e-06, "loss": 0.0001, "num_input_tokens_seen": 183358336, "step": 85035 }, { "epoch": 15.606533308864012, "grad_norm": 12.288564682006836, "learning_rate": 1.3994539417390623e-06, "loss": 0.0027, "num_input_tokens_seen": 183368704, "step": 85040 }, { "epoch": 15.607450908423564, "grad_norm": 0.002407799242064357, "learning_rate": 1.398898374297426e-06, "loss": 0.1252, "num_input_tokens_seen": 183379808, "step": 85045 }, { "epoch": 15.608368507983116, "grad_norm": 0.0015109034720808268, "learning_rate": 1.3983428992184656e-06, "loss": 0.0, "num_input_tokens_seen": 183390976, "step": 85050 }, { "epoch": 15.609286107542669, "grad_norm": 0.0015538263833150268, "learning_rate": 1.3977875165164273e-06, "loss": 0.0, "num_input_tokens_seen": 183401472, "step": 85055 }, { "epoch": 15.61020370710222, "grad_norm": 125.63611602783203, "learning_rate": 1.3972322262055543e-06, "loss": 0.0097, "num_input_tokens_seen": 183411840, "step": 85060 }, { "epoch": 15.611121306661772, "grad_norm": 0.0025291452184319496, "learning_rate": 1.396677028300093e-06, "loss": 0.0001, "num_input_tokens_seen": 183422304, "step": 85065 }, { "epoch": 15.612038906221326, "grad_norm": 0.0028395415283739567, "learning_rate": 1.3961219228142813e-06, "loss": 0.0401, "num_input_tokens_seen": 183433632, "step": 85070 }, { "epoch": 15.612956505780877, "grad_norm": 0.0005340155330486596, "learning_rate": 1.3955669097623548e-06, "loss": 0.0, "num_input_tokens_seen": 183445632, "step": 85075 }, { "epoch": 15.613874105340429, "grad_norm": 0.002165041398257017, "learning_rate": 1.3950119891585529e-06, "loss": 0.0645, "num_input_tokens_seen": 183455872, "step": 85080 }, { "epoch": 15.614791704899982, "grad_norm": 0.000985294347628951, "learning_rate": 1.394457161017106e-06, "loss": 0.0352, "num_input_tokens_seen": 183466528, "step": 85085 }, { "epoch": 15.615709304459534, "grad_norm": 0.005142576992511749, "learning_rate": 1.3939024253522432e-06, "loss": 0.0144, "num_input_tokens_seen": 183478080, "step": 85090 }, { "epoch": 15.616626904019085, "grad_norm": 0.011912966147065163, "learning_rate": 1.3933477821781954e-06, "loss": 0.0005, "num_input_tokens_seen": 183489856, "step": 85095 }, { "epoch": 15.617544503578639, "grad_norm": 0.004825850483030081, "learning_rate": 1.3927932315091874e-06, "loss": 0.1252, "num_input_tokens_seen": 183500832, "step": 85100 }, { "epoch": 15.61846210313819, "grad_norm": 0.003378859721124172, "learning_rate": 1.3922387733594428e-06, "loss": 0.0762, "num_input_tokens_seen": 183512640, "step": 85105 }, { "epoch": 15.619379702697742, "grad_norm": 0.005925287958234549, "learning_rate": 1.3916844077431802e-06, "loss": 0.0001, "num_input_tokens_seen": 183523104, "step": 85110 }, { "epoch": 15.620297302257296, "grad_norm": 5.703851699829102, "learning_rate": 1.391130134674622e-06, "loss": 0.0013, "num_input_tokens_seen": 183532928, "step": 85115 }, { "epoch": 15.621214901816847, "grad_norm": 0.0057950206100940704, "learning_rate": 1.3905759541679826e-06, "loss": 0.0, "num_input_tokens_seen": 183544096, "step": 85120 }, { "epoch": 15.622132501376399, "grad_norm": 0.09278705716133118, "learning_rate": 1.3900218662374737e-06, "loss": 0.2094, "num_input_tokens_seen": 183555424, "step": 85125 }, { "epoch": 15.623050100935952, "grad_norm": 0.0005379155627451837, "learning_rate": 1.3894678708973108e-06, "loss": 0.0, "num_input_tokens_seen": 183564768, "step": 85130 }, { "epoch": 15.623967700495504, "grad_norm": 0.008463275618851185, "learning_rate": 1.3889139681617014e-06, "loss": 0.0, "num_input_tokens_seen": 183576768, "step": 85135 }, { "epoch": 15.624885300055055, "grad_norm": 0.007738432846963406, "learning_rate": 1.3883601580448508e-06, "loss": 0.0, "num_input_tokens_seen": 183587424, "step": 85140 }, { "epoch": 15.625802899614609, "grad_norm": 0.05777401477098465, "learning_rate": 1.387806440560966e-06, "loss": 0.0001, "num_input_tokens_seen": 183597696, "step": 85145 }, { "epoch": 15.62672049917416, "grad_norm": 30.546018600463867, "learning_rate": 1.3872528157242471e-06, "loss": 0.0051, "num_input_tokens_seen": 183609024, "step": 85150 }, { "epoch": 15.627638098733712, "grad_norm": 0.00335870124399662, "learning_rate": 1.3866992835488945e-06, "loss": 0.0, "num_input_tokens_seen": 183620192, "step": 85155 }, { "epoch": 15.628555698293265, "grad_norm": 0.0016427749069407582, "learning_rate": 1.3861458440491038e-06, "loss": 0.0, "num_input_tokens_seen": 183629440, "step": 85160 }, { "epoch": 15.629473297852817, "grad_norm": 0.014247622340917587, "learning_rate": 1.3855924972390728e-06, "loss": 0.0, "num_input_tokens_seen": 183640064, "step": 85165 }, { "epoch": 15.630390897412369, "grad_norm": 0.009287353605031967, "learning_rate": 1.3850392431329918e-06, "loss": 0.0065, "num_input_tokens_seen": 183649600, "step": 85170 }, { "epoch": 15.631308496971922, "grad_norm": 0.021990180015563965, "learning_rate": 1.3844860817450507e-06, "loss": 0.0, "num_input_tokens_seen": 183659872, "step": 85175 }, { "epoch": 15.632226096531474, "grad_norm": 0.004074231255799532, "learning_rate": 1.383933013089439e-06, "loss": 0.0, "num_input_tokens_seen": 183670560, "step": 85180 }, { "epoch": 15.633143696091025, "grad_norm": 0.01769302599132061, "learning_rate": 1.3833800371803419e-06, "loss": 0.3445, "num_input_tokens_seen": 183682400, "step": 85185 }, { "epoch": 15.634061295650579, "grad_norm": 0.001243765465915203, "learning_rate": 1.382827154031941e-06, "loss": 0.1098, "num_input_tokens_seen": 183693600, "step": 85190 }, { "epoch": 15.63497889521013, "grad_norm": 0.00799565576016903, "learning_rate": 1.3822743636584162e-06, "loss": 0.0001, "num_input_tokens_seen": 183704384, "step": 85195 }, { "epoch": 15.635896494769682, "grad_norm": 0.003673915285617113, "learning_rate": 1.3817216660739486e-06, "loss": 0.225, "num_input_tokens_seen": 183715392, "step": 85200 }, { "epoch": 15.636814094329235, "grad_norm": 34.403076171875, "learning_rate": 1.381169061292712e-06, "loss": 0.2058, "num_input_tokens_seen": 183726528, "step": 85205 }, { "epoch": 15.637731693888787, "grad_norm": 0.0831117331981659, "learning_rate": 1.3806165493288808e-06, "loss": 0.0001, "num_input_tokens_seen": 183737920, "step": 85210 }, { "epoch": 15.638649293448339, "grad_norm": 45.77967071533203, "learning_rate": 1.380064130196625e-06, "loss": 0.1407, "num_input_tokens_seen": 183748608, "step": 85215 }, { "epoch": 15.639566893007892, "grad_norm": 3.6279613971710205, "learning_rate": 1.379511803910113e-06, "loss": 0.0015, "num_input_tokens_seen": 183758496, "step": 85220 }, { "epoch": 15.640484492567444, "grad_norm": 54.5866584777832, "learning_rate": 1.378959570483513e-06, "loss": 0.0014, "num_input_tokens_seen": 183769120, "step": 85225 }, { "epoch": 15.641402092126995, "grad_norm": 0.02541687712073326, "learning_rate": 1.3784074299309885e-06, "loss": 0.0042, "num_input_tokens_seen": 183780576, "step": 85230 }, { "epoch": 15.642319691686549, "grad_norm": 11.579901695251465, "learning_rate": 1.3778553822667e-06, "loss": 0.0014, "num_input_tokens_seen": 183791424, "step": 85235 }, { "epoch": 15.6432372912461, "grad_norm": 0.0014436275232583284, "learning_rate": 1.377303427504807e-06, "loss": 0.0, "num_input_tokens_seen": 183801408, "step": 85240 }, { "epoch": 15.644154890805652, "grad_norm": 0.0031338934786617756, "learning_rate": 1.3767515656594654e-06, "loss": 0.0002, "num_input_tokens_seen": 183811904, "step": 85245 }, { "epoch": 15.645072490365205, "grad_norm": 0.0028732174541801214, "learning_rate": 1.3761997967448316e-06, "loss": 0.0002, "num_input_tokens_seen": 183823008, "step": 85250 }, { "epoch": 15.645990089924757, "grad_norm": 0.06098918989300728, "learning_rate": 1.3756481207750572e-06, "loss": 0.0003, "num_input_tokens_seen": 183833536, "step": 85255 }, { "epoch": 15.646907689484308, "grad_norm": 0.0005023466073907912, "learning_rate": 1.3750965377642895e-06, "loss": 0.0001, "num_input_tokens_seen": 183844064, "step": 85260 }, { "epoch": 15.647825289043862, "grad_norm": 0.001072436454705894, "learning_rate": 1.3745450477266786e-06, "loss": 0.0, "num_input_tokens_seen": 183855136, "step": 85265 }, { "epoch": 15.648742888603413, "grad_norm": 18.23007583618164, "learning_rate": 1.3739936506763685e-06, "loss": 0.001, "num_input_tokens_seen": 183865664, "step": 85270 }, { "epoch": 15.649660488162965, "grad_norm": 0.0016261261189356446, "learning_rate": 1.3734423466275004e-06, "loss": 0.0, "num_input_tokens_seen": 183876320, "step": 85275 }, { "epoch": 15.650578087722518, "grad_norm": 0.0036243738140910864, "learning_rate": 1.3728911355942164e-06, "loss": 0.0, "num_input_tokens_seen": 183886752, "step": 85280 }, { "epoch": 15.65149568728207, "grad_norm": 0.007267351727932692, "learning_rate": 1.3723400175906536e-06, "loss": 0.25, "num_input_tokens_seen": 183897472, "step": 85285 }, { "epoch": 15.652413286841622, "grad_norm": 0.052948929369449615, "learning_rate": 1.3717889926309468e-06, "loss": 0.0, "num_input_tokens_seen": 183907744, "step": 85290 }, { "epoch": 15.653330886401175, "grad_norm": 0.009429671801626682, "learning_rate": 1.3712380607292281e-06, "loss": 0.2033, "num_input_tokens_seen": 183917664, "step": 85295 }, { "epoch": 15.654248485960727, "grad_norm": 0.01599561609327793, "learning_rate": 1.3706872218996299e-06, "loss": 0.2684, "num_input_tokens_seen": 183929760, "step": 85300 }, { "epoch": 15.655166085520278, "grad_norm": 0.010135414078831673, "learning_rate": 1.3701364761562801e-06, "loss": 0.0001, "num_input_tokens_seen": 183941312, "step": 85305 }, { "epoch": 15.656083685079832, "grad_norm": 137.38523864746094, "learning_rate": 1.3695858235133019e-06, "loss": 0.1626, "num_input_tokens_seen": 183953632, "step": 85310 }, { "epoch": 15.657001284639383, "grad_norm": 0.002958707744255662, "learning_rate": 1.3690352639848226e-06, "loss": 0.0001, "num_input_tokens_seen": 183965248, "step": 85315 }, { "epoch": 15.657918884198935, "grad_norm": 0.012880557216703892, "learning_rate": 1.3684847975849609e-06, "loss": 0.0001, "num_input_tokens_seen": 183976704, "step": 85320 }, { "epoch": 15.658836483758488, "grad_norm": 0.0018961323658004403, "learning_rate": 1.3679344243278348e-06, "loss": 0.0001, "num_input_tokens_seen": 183988512, "step": 85325 }, { "epoch": 15.65975408331804, "grad_norm": 0.0006901268498040736, "learning_rate": 1.3673841442275625e-06, "loss": 0.0003, "num_input_tokens_seen": 183999904, "step": 85330 }, { "epoch": 15.660671682877592, "grad_norm": 0.00681136641651392, "learning_rate": 1.366833957298257e-06, "loss": 0.0002, "num_input_tokens_seen": 184011328, "step": 85335 }, { "epoch": 15.661589282437145, "grad_norm": 0.0018266482511535287, "learning_rate": 1.3662838635540298e-06, "loss": 0.0002, "num_input_tokens_seen": 184021600, "step": 85340 }, { "epoch": 15.662506881996697, "grad_norm": 1.1560728549957275, "learning_rate": 1.3657338630089883e-06, "loss": 0.0003, "num_input_tokens_seen": 184032832, "step": 85345 }, { "epoch": 15.663424481556248, "grad_norm": 0.00711751589551568, "learning_rate": 1.3651839556772418e-06, "loss": 0.0001, "num_input_tokens_seen": 184044064, "step": 85350 }, { "epoch": 15.664342081115802, "grad_norm": 491.25311279296875, "learning_rate": 1.3646341415728936e-06, "loss": 0.3401, "num_input_tokens_seen": 184053632, "step": 85355 }, { "epoch": 15.665259680675353, "grad_norm": 0.0761583000421524, "learning_rate": 1.364084420710044e-06, "loss": 0.0, "num_input_tokens_seen": 184066112, "step": 85360 }, { "epoch": 15.666177280234905, "grad_norm": 185.7315216064453, "learning_rate": 1.3635347931027947e-06, "loss": 0.1067, "num_input_tokens_seen": 184076704, "step": 85365 }, { "epoch": 15.667094879794458, "grad_norm": 0.013832530938088894, "learning_rate": 1.3629852587652426e-06, "loss": 0.0001, "num_input_tokens_seen": 184087520, "step": 85370 }, { "epoch": 15.66801247935401, "grad_norm": 0.03503881022334099, "learning_rate": 1.3624358177114815e-06, "loss": 0.0432, "num_input_tokens_seen": 184099552, "step": 85375 }, { "epoch": 15.668930078913561, "grad_norm": 0.08538737893104553, "learning_rate": 1.3618864699556029e-06, "loss": 0.0002, "num_input_tokens_seen": 184111168, "step": 85380 }, { "epoch": 15.669847678473115, "grad_norm": 0.027980880811810493, "learning_rate": 1.3613372155116987e-06, "loss": 0.0002, "num_input_tokens_seen": 184121984, "step": 85385 }, { "epoch": 15.670765278032667, "grad_norm": 0.0014809183776378632, "learning_rate": 1.3607880543938557e-06, "loss": 0.0016, "num_input_tokens_seen": 184132384, "step": 85390 }, { "epoch": 15.671682877592218, "grad_norm": 0.45491915941238403, "learning_rate": 1.3602389866161575e-06, "loss": 0.0004, "num_input_tokens_seen": 184142176, "step": 85395 }, { "epoch": 15.672600477151772, "grad_norm": 56.57246398925781, "learning_rate": 1.3596900121926893e-06, "loss": 0.26, "num_input_tokens_seen": 184153152, "step": 85400 }, { "epoch": 15.673518076711323, "grad_norm": 0.12232185900211334, "learning_rate": 1.3591411311375307e-06, "loss": 0.0001, "num_input_tokens_seen": 184164704, "step": 85405 }, { "epoch": 15.674435676270875, "grad_norm": 0.17297054827213287, "learning_rate": 1.358592343464759e-06, "loss": 0.0004, "num_input_tokens_seen": 184175712, "step": 85410 }, { "epoch": 15.675353275830428, "grad_norm": 0.11413984000682831, "learning_rate": 1.3580436491884485e-06, "loss": 0.0426, "num_input_tokens_seen": 184186400, "step": 85415 }, { "epoch": 15.67627087538998, "grad_norm": 0.01661924086511135, "learning_rate": 1.3574950483226757e-06, "loss": 0.0, "num_input_tokens_seen": 184196992, "step": 85420 }, { "epoch": 15.677188474949531, "grad_norm": 0.012604719027876854, "learning_rate": 1.356946540881509e-06, "loss": 0.0, "num_input_tokens_seen": 184208608, "step": 85425 }, { "epoch": 15.678106074509085, "grad_norm": 34.68047332763672, "learning_rate": 1.3563981268790182e-06, "loss": 0.004, "num_input_tokens_seen": 184219776, "step": 85430 }, { "epoch": 15.679023674068636, "grad_norm": 0.0010493516456335783, "learning_rate": 1.3558498063292674e-06, "loss": 0.0, "num_input_tokens_seen": 184229088, "step": 85435 }, { "epoch": 15.679941273628188, "grad_norm": 0.09783808141946793, "learning_rate": 1.3553015792463202e-06, "loss": 0.0014, "num_input_tokens_seen": 184239456, "step": 85440 }, { "epoch": 15.680858873187741, "grad_norm": 0.0010430283145979047, "learning_rate": 1.35475344564424e-06, "loss": 0.0001, "num_input_tokens_seen": 184249600, "step": 85445 }, { "epoch": 15.681776472747293, "grad_norm": 0.11378571391105652, "learning_rate": 1.3542054055370846e-06, "loss": 0.0001, "num_input_tokens_seen": 184260992, "step": 85450 }, { "epoch": 15.682694072306845, "grad_norm": 0.0012214778689667583, "learning_rate": 1.35365745893891e-06, "loss": 0.0001, "num_input_tokens_seen": 184271808, "step": 85455 }, { "epoch": 15.683611671866398, "grad_norm": 0.0013280479470267892, "learning_rate": 1.3531096058637682e-06, "loss": 0.0013, "num_input_tokens_seen": 184283360, "step": 85460 }, { "epoch": 15.68452927142595, "grad_norm": 0.007942179217934608, "learning_rate": 1.3525618463257151e-06, "loss": 0.0451, "num_input_tokens_seen": 184293440, "step": 85465 }, { "epoch": 15.685446870985501, "grad_norm": 0.00042943726293742657, "learning_rate": 1.352014180338797e-06, "loss": 0.0003, "num_input_tokens_seen": 184302656, "step": 85470 }, { "epoch": 15.686364470545055, "grad_norm": 0.024759722873568535, "learning_rate": 1.3514666079170618e-06, "loss": 0.0002, "num_input_tokens_seen": 184311936, "step": 85475 }, { "epoch": 15.687282070104606, "grad_norm": 0.004330303519964218, "learning_rate": 1.3509191290745515e-06, "loss": 0.0001, "num_input_tokens_seen": 184322688, "step": 85480 }, { "epoch": 15.688199669664158, "grad_norm": 0.006461989600211382, "learning_rate": 1.3503717438253118e-06, "loss": 0.0, "num_input_tokens_seen": 184333248, "step": 85485 }, { "epoch": 15.689117269223711, "grad_norm": 0.0032341484911739826, "learning_rate": 1.3498244521833803e-06, "loss": 0.2379, "num_input_tokens_seen": 184343936, "step": 85490 }, { "epoch": 15.690034868783263, "grad_norm": 3.5923638343811035, "learning_rate": 1.349277254162793e-06, "loss": 0.0002, "num_input_tokens_seen": 184354592, "step": 85495 }, { "epoch": 15.690952468342815, "grad_norm": 0.02278495393693447, "learning_rate": 1.3487301497775874e-06, "loss": 0.0451, "num_input_tokens_seen": 184366304, "step": 85500 }, { "epoch": 15.691870067902368, "grad_norm": 0.0014118199469521642, "learning_rate": 1.3481831390417943e-06, "loss": 0.0001, "num_input_tokens_seen": 184377248, "step": 85505 }, { "epoch": 15.69278766746192, "grad_norm": 0.003040360752493143, "learning_rate": 1.3476362219694445e-06, "loss": 0.0, "num_input_tokens_seen": 184388704, "step": 85510 }, { "epoch": 15.693705267021471, "grad_norm": 0.12882256507873535, "learning_rate": 1.347089398574563e-06, "loss": 0.0001, "num_input_tokens_seen": 184399488, "step": 85515 }, { "epoch": 15.694622866581025, "grad_norm": 314.0205993652344, "learning_rate": 1.3465426688711786e-06, "loss": 0.0532, "num_input_tokens_seen": 184409568, "step": 85520 }, { "epoch": 15.695540466140576, "grad_norm": 0.0657276064157486, "learning_rate": 1.3459960328733118e-06, "loss": 0.0179, "num_input_tokens_seen": 184421056, "step": 85525 }, { "epoch": 15.696458065700128, "grad_norm": 0.014763440936803818, "learning_rate": 1.3454494905949829e-06, "loss": 0.0, "num_input_tokens_seen": 184431680, "step": 85530 }, { "epoch": 15.697375665259681, "grad_norm": 0.06784999370574951, "learning_rate": 1.3449030420502113e-06, "loss": 0.0144, "num_input_tokens_seen": 184443392, "step": 85535 }, { "epoch": 15.698293264819233, "grad_norm": 0.005522597581148148, "learning_rate": 1.3443566872530122e-06, "loss": 0.0001, "num_input_tokens_seen": 184454432, "step": 85540 }, { "epoch": 15.699210864378784, "grad_norm": 0.0011868749279528856, "learning_rate": 1.3438104262173968e-06, "loss": 0.0063, "num_input_tokens_seen": 184465696, "step": 85545 }, { "epoch": 15.700128463938338, "grad_norm": 0.03690226003527641, "learning_rate": 1.3432642589573792e-06, "loss": 0.0, "num_input_tokens_seen": 184476288, "step": 85550 }, { "epoch": 15.70104606349789, "grad_norm": 0.0007093124440871179, "learning_rate": 1.3427181854869653e-06, "loss": 0.2252, "num_input_tokens_seen": 184486880, "step": 85555 }, { "epoch": 15.701963663057441, "grad_norm": 0.0010600123787298799, "learning_rate": 1.342172205820162e-06, "loss": 0.0002, "num_input_tokens_seen": 184497568, "step": 85560 }, { "epoch": 15.702881262616994, "grad_norm": 0.0036109674256294966, "learning_rate": 1.341626319970971e-06, "loss": 0.4067, "num_input_tokens_seen": 184508928, "step": 85565 }, { "epoch": 15.703798862176546, "grad_norm": 0.0047749984078109264, "learning_rate": 1.341080527953396e-06, "loss": 0.0001, "num_input_tokens_seen": 184519712, "step": 85570 }, { "epoch": 15.704716461736098, "grad_norm": 0.023203091695904732, "learning_rate": 1.3405348297814353e-06, "loss": 0.0032, "num_input_tokens_seen": 184530208, "step": 85575 }, { "epoch": 15.705634061295651, "grad_norm": 0.0030374780762940645, "learning_rate": 1.3399892254690827e-06, "loss": 0.0001, "num_input_tokens_seen": 184540800, "step": 85580 }, { "epoch": 15.706551660855203, "grad_norm": 0.0005770763964392245, "learning_rate": 1.3394437150303358e-06, "loss": 0.0, "num_input_tokens_seen": 184551520, "step": 85585 }, { "epoch": 15.707469260414754, "grad_norm": 0.06633174419403076, "learning_rate": 1.3388982984791837e-06, "loss": 0.0002, "num_input_tokens_seen": 184560480, "step": 85590 }, { "epoch": 15.708386859974308, "grad_norm": 0.007409901358187199, "learning_rate": 1.3383529758296154e-06, "loss": 0.0001, "num_input_tokens_seen": 184570848, "step": 85595 }, { "epoch": 15.70930445953386, "grad_norm": 0.0031811718363314867, "learning_rate": 1.3378077470956192e-06, "loss": 0.0, "num_input_tokens_seen": 184580704, "step": 85600 }, { "epoch": 15.710222059093411, "grad_norm": 0.0016894949367269874, "learning_rate": 1.337262612291178e-06, "loss": 0.0, "num_input_tokens_seen": 184591104, "step": 85605 }, { "epoch": 15.711139658652964, "grad_norm": 0.026545614004135132, "learning_rate": 1.336717571430275e-06, "loss": 0.0005, "num_input_tokens_seen": 184603104, "step": 85610 }, { "epoch": 15.712057258212516, "grad_norm": 0.4574679434299469, "learning_rate": 1.3361726245268863e-06, "loss": 0.0001, "num_input_tokens_seen": 184613824, "step": 85615 }, { "epoch": 15.712974857772068, "grad_norm": 0.000886225258000195, "learning_rate": 1.3356277715949934e-06, "loss": 0.124, "num_input_tokens_seen": 184624896, "step": 85620 }, { "epoch": 15.713892457331621, "grad_norm": 0.011058717034757137, "learning_rate": 1.3350830126485686e-06, "loss": 0.004, "num_input_tokens_seen": 184636128, "step": 85625 }, { "epoch": 15.714810056891173, "grad_norm": 0.05683296546339989, "learning_rate": 1.3345383477015844e-06, "loss": 0.0001, "num_input_tokens_seen": 184647360, "step": 85630 }, { "epoch": 15.715727656450724, "grad_norm": 0.0016767930937930942, "learning_rate": 1.3339937767680094e-06, "loss": 0.0006, "num_input_tokens_seen": 184658368, "step": 85635 }, { "epoch": 15.716645256010278, "grad_norm": 0.03806395083665848, "learning_rate": 1.3334492998618137e-06, "loss": 0.0532, "num_input_tokens_seen": 184669376, "step": 85640 }, { "epoch": 15.71756285556983, "grad_norm": 0.014967190101742744, "learning_rate": 1.332904916996961e-06, "loss": 0.0, "num_input_tokens_seen": 184680832, "step": 85645 }, { "epoch": 15.71848045512938, "grad_norm": 0.0005661370232701302, "learning_rate": 1.3323606281874135e-06, "loss": 0.3752, "num_input_tokens_seen": 184690560, "step": 85650 }, { "epoch": 15.719398054688934, "grad_norm": 380.96435546875, "learning_rate": 1.3318164334471312e-06, "loss": 0.0479, "num_input_tokens_seen": 184701504, "step": 85655 }, { "epoch": 15.720315654248486, "grad_norm": 95.15066528320312, "learning_rate": 1.3312723327900711e-06, "loss": 0.0426, "num_input_tokens_seen": 184713056, "step": 85660 }, { "epoch": 15.721233253808037, "grad_norm": 0.0029237547423690557, "learning_rate": 1.3307283262301912e-06, "loss": 0.0, "num_input_tokens_seen": 184723328, "step": 85665 }, { "epoch": 15.72215085336759, "grad_norm": 0.009127935394644737, "learning_rate": 1.3301844137814428e-06, "loss": 0.0, "num_input_tokens_seen": 184734368, "step": 85670 }, { "epoch": 15.723068452927143, "grad_norm": 0.11061415821313858, "learning_rate": 1.3296405954577763e-06, "loss": 0.0041, "num_input_tokens_seen": 184744480, "step": 85675 }, { "epoch": 15.723986052486694, "grad_norm": 0.2485763132572174, "learning_rate": 1.3290968712731384e-06, "loss": 0.0004, "num_input_tokens_seen": 184754816, "step": 85680 }, { "epoch": 15.724903652046248, "grad_norm": 0.0018794122152030468, "learning_rate": 1.328553241241478e-06, "loss": 0.0478, "num_input_tokens_seen": 184766368, "step": 85685 }, { "epoch": 15.7258212516058, "grad_norm": 0.0028993277810513973, "learning_rate": 1.3280097053767372e-06, "loss": 0.0, "num_input_tokens_seen": 184775872, "step": 85690 }, { "epoch": 15.72673885116535, "grad_norm": 0.0023879974614828825, "learning_rate": 1.327466263692856e-06, "loss": 0.0002, "num_input_tokens_seen": 184787104, "step": 85695 }, { "epoch": 15.727656450724904, "grad_norm": 0.0067612952552735806, "learning_rate": 1.3269229162037716e-06, "loss": 0.1181, "num_input_tokens_seen": 184797728, "step": 85700 }, { "epoch": 15.728574050284456, "grad_norm": 0.0029541102703660727, "learning_rate": 1.3263796629234233e-06, "loss": 0.0, "num_input_tokens_seen": 184808736, "step": 85705 }, { "epoch": 15.729491649844007, "grad_norm": 0.004420126788318157, "learning_rate": 1.3258365038657433e-06, "loss": 0.0, "num_input_tokens_seen": 184819552, "step": 85710 }, { "epoch": 15.73040924940356, "grad_norm": 0.0006918581202626228, "learning_rate": 1.3252934390446603e-06, "loss": 0.0004, "num_input_tokens_seen": 184829664, "step": 85715 }, { "epoch": 15.731326848963112, "grad_norm": 0.003351076738908887, "learning_rate": 1.3247504684741075e-06, "loss": 0.1283, "num_input_tokens_seen": 184840544, "step": 85720 }, { "epoch": 15.732244448522664, "grad_norm": 0.0023898445069789886, "learning_rate": 1.3242075921680092e-06, "loss": 0.0001, "num_input_tokens_seen": 184850400, "step": 85725 }, { "epoch": 15.733162048082217, "grad_norm": 246.9579620361328, "learning_rate": 1.323664810140287e-06, "loss": 0.0079, "num_input_tokens_seen": 184861536, "step": 85730 }, { "epoch": 15.734079647641769, "grad_norm": 0.0023959483951330185, "learning_rate": 1.3231221224048668e-06, "loss": 0.1005, "num_input_tokens_seen": 184871648, "step": 85735 }, { "epoch": 15.73499724720132, "grad_norm": 164.82350158691406, "learning_rate": 1.322579528975665e-06, "loss": 0.0532, "num_input_tokens_seen": 184882496, "step": 85740 }, { "epoch": 15.735914846760874, "grad_norm": 0.012598118744790554, "learning_rate": 1.3220370298665992e-06, "loss": 0.0, "num_input_tokens_seen": 184893184, "step": 85745 }, { "epoch": 15.736832446320426, "grad_norm": 0.842059314250946, "learning_rate": 1.321494625091581e-06, "loss": 0.0001, "num_input_tokens_seen": 184903488, "step": 85750 }, { "epoch": 15.737750045879977, "grad_norm": 32.06353759765625, "learning_rate": 1.320952314664527e-06, "loss": 0.01, "num_input_tokens_seen": 184915104, "step": 85755 }, { "epoch": 15.73866764543953, "grad_norm": 0.0035087314900010824, "learning_rate": 1.3204100985993435e-06, "loss": 0.0212, "num_input_tokens_seen": 184924672, "step": 85760 }, { "epoch": 15.739585244999082, "grad_norm": 0.04520014300942421, "learning_rate": 1.3198679769099365e-06, "loss": 0.0001, "num_input_tokens_seen": 184935136, "step": 85765 }, { "epoch": 15.740502844558634, "grad_norm": 0.027427883818745613, "learning_rate": 1.319325949610214e-06, "loss": 0.2408, "num_input_tokens_seen": 184944928, "step": 85770 }, { "epoch": 15.741420444118187, "grad_norm": 0.0006440047873184085, "learning_rate": 1.3187840167140763e-06, "loss": 0.1938, "num_input_tokens_seen": 184956160, "step": 85775 }, { "epoch": 15.742338043677739, "grad_norm": 0.3778795599937439, "learning_rate": 1.318242178235422e-06, "loss": 0.0006, "num_input_tokens_seen": 184965696, "step": 85780 }, { "epoch": 15.74325564323729, "grad_norm": 0.011091889813542366, "learning_rate": 1.317700434188151e-06, "loss": 0.0, "num_input_tokens_seen": 184976576, "step": 85785 }, { "epoch": 15.744173242796844, "grad_norm": 0.005604318808764219, "learning_rate": 1.3171587845861566e-06, "loss": 0.0002, "num_input_tokens_seen": 184987392, "step": 85790 }, { "epoch": 15.745090842356396, "grad_norm": 0.01939781941473484, "learning_rate": 1.3166172294433315e-06, "loss": 0.1439, "num_input_tokens_seen": 184998400, "step": 85795 }, { "epoch": 15.746008441915947, "grad_norm": 0.07127156853675842, "learning_rate": 1.3160757687735642e-06, "loss": 0.0001, "num_input_tokens_seen": 185009888, "step": 85800 }, { "epoch": 15.7469260414755, "grad_norm": 0.002171758795157075, "learning_rate": 1.3155344025907458e-06, "loss": 0.0051, "num_input_tokens_seen": 185019968, "step": 85805 }, { "epoch": 15.747843641035052, "grad_norm": 0.0009595400770194829, "learning_rate": 1.3149931309087594e-06, "loss": 0.0017, "num_input_tokens_seen": 185030848, "step": 85810 }, { "epoch": 15.748761240594604, "grad_norm": 0.977213442325592, "learning_rate": 1.3144519537414862e-06, "loss": 0.0002, "num_input_tokens_seen": 185042432, "step": 85815 }, { "epoch": 15.749678840154157, "grad_norm": 0.0005106425960548222, "learning_rate": 1.3139108711028099e-06, "loss": 0.0, "num_input_tokens_seen": 185053568, "step": 85820 }, { "epoch": 15.750596439713709, "grad_norm": 14.295032501220703, "learning_rate": 1.3133698830066066e-06, "loss": 0.0014, "num_input_tokens_seen": 185062752, "step": 85825 }, { "epoch": 15.75151403927326, "grad_norm": 0.007823253981769085, "learning_rate": 1.3128289894667524e-06, "loss": 0.1252, "num_input_tokens_seen": 185072832, "step": 85830 }, { "epoch": 15.752431638832814, "grad_norm": 0.012601305730640888, "learning_rate": 1.3122881904971186e-06, "loss": 0.0002, "num_input_tokens_seen": 185083456, "step": 85835 }, { "epoch": 15.753349238392365, "grad_norm": 0.0015778528759256005, "learning_rate": 1.3117474861115786e-06, "loss": 0.1657, "num_input_tokens_seen": 185093824, "step": 85840 }, { "epoch": 15.754266837951917, "grad_norm": 0.0036310218274593353, "learning_rate": 1.3112068763239994e-06, "loss": 0.0, "num_input_tokens_seen": 185104064, "step": 85845 }, { "epoch": 15.75518443751147, "grad_norm": 0.0009136692970059812, "learning_rate": 1.3106663611482463e-06, "loss": 0.0, "num_input_tokens_seen": 185113408, "step": 85850 }, { "epoch": 15.756102037071022, "grad_norm": 0.0030930174980312586, "learning_rate": 1.310125940598182e-06, "loss": 0.0, "num_input_tokens_seen": 185125440, "step": 85855 }, { "epoch": 15.757019636630574, "grad_norm": 0.015858203172683716, "learning_rate": 1.3095856146876695e-06, "loss": 0.0001, "num_input_tokens_seen": 185135904, "step": 85860 }, { "epoch": 15.757937236190127, "grad_norm": 0.0012845885939896107, "learning_rate": 1.3090453834305672e-06, "loss": 0.1502, "num_input_tokens_seen": 185147456, "step": 85865 }, { "epoch": 15.758854835749679, "grad_norm": 0.0010437029413878918, "learning_rate": 1.30850524684073e-06, "loss": 0.0001, "num_input_tokens_seen": 185158816, "step": 85870 }, { "epoch": 15.75977243530923, "grad_norm": 2.0420007705688477, "learning_rate": 1.307965204932012e-06, "loss": 0.0004, "num_input_tokens_seen": 185169952, "step": 85875 }, { "epoch": 15.760690034868784, "grad_norm": 0.040292125195264816, "learning_rate": 1.3074252577182638e-06, "loss": 0.0009, "num_input_tokens_seen": 185181248, "step": 85880 }, { "epoch": 15.761607634428335, "grad_norm": 0.02736467495560646, "learning_rate": 1.306885405213334e-06, "loss": 0.0, "num_input_tokens_seen": 185191872, "step": 85885 }, { "epoch": 15.762525233987887, "grad_norm": 0.0017894090851768851, "learning_rate": 1.306345647431071e-06, "loss": 0.0025, "num_input_tokens_seen": 185202976, "step": 85890 }, { "epoch": 15.76344283354744, "grad_norm": 0.06753529608249664, "learning_rate": 1.3058059843853171e-06, "loss": 0.0, "num_input_tokens_seen": 185214336, "step": 85895 }, { "epoch": 15.764360433106992, "grad_norm": 0.069883331656456, "learning_rate": 1.3052664160899131e-06, "loss": 0.0001, "num_input_tokens_seen": 185224832, "step": 85900 }, { "epoch": 15.765278032666544, "grad_norm": 0.3364386558532715, "learning_rate": 1.3047269425587005e-06, "loss": 0.0001, "num_input_tokens_seen": 185236448, "step": 85905 }, { "epoch": 15.766195632226097, "grad_norm": 0.0013411957770586014, "learning_rate": 1.3041875638055152e-06, "loss": 0.0079, "num_input_tokens_seen": 185248160, "step": 85910 }, { "epoch": 15.767113231785649, "grad_norm": 0.021131034940481186, "learning_rate": 1.303648279844189e-06, "loss": 0.0944, "num_input_tokens_seen": 185259392, "step": 85915 }, { "epoch": 15.7680308313452, "grad_norm": 0.029246335849165916, "learning_rate": 1.303109090688557e-06, "loss": 0.1473, "num_input_tokens_seen": 185270304, "step": 85920 }, { "epoch": 15.768948430904754, "grad_norm": 0.005235101096332073, "learning_rate": 1.3025699963524475e-06, "loss": 0.0001, "num_input_tokens_seen": 185281600, "step": 85925 }, { "epoch": 15.769866030464305, "grad_norm": 0.001811923342756927, "learning_rate": 1.3020309968496869e-06, "loss": 0.1316, "num_input_tokens_seen": 185293664, "step": 85930 }, { "epoch": 15.770783630023857, "grad_norm": 0.002639465034008026, "learning_rate": 1.3014920921940983e-06, "loss": 0.0, "num_input_tokens_seen": 185304224, "step": 85935 }, { "epoch": 15.77170122958341, "grad_norm": 0.004478856455534697, "learning_rate": 1.300953282399507e-06, "loss": 0.0016, "num_input_tokens_seen": 185314432, "step": 85940 }, { "epoch": 15.772618829142962, "grad_norm": 0.0023598980624228716, "learning_rate": 1.3004145674797307e-06, "loss": 0.0002, "num_input_tokens_seen": 185326304, "step": 85945 }, { "epoch": 15.773536428702513, "grad_norm": 0.18312209844589233, "learning_rate": 1.2998759474485856e-06, "loss": 0.0001, "num_input_tokens_seen": 185337216, "step": 85950 }, { "epoch": 15.774454028262067, "grad_norm": 0.00417891563847661, "learning_rate": 1.2993374223198896e-06, "loss": 0.0, "num_input_tokens_seen": 185347584, "step": 85955 }, { "epoch": 15.775371627821619, "grad_norm": 0.005085065960884094, "learning_rate": 1.2987989921074528e-06, "loss": 0.2188, "num_input_tokens_seen": 185358496, "step": 85960 }, { "epoch": 15.77628922738117, "grad_norm": 0.001092280144803226, "learning_rate": 1.2982606568250856e-06, "loss": 0.0003, "num_input_tokens_seen": 185369856, "step": 85965 }, { "epoch": 15.777206826940724, "grad_norm": 0.04368880018591881, "learning_rate": 1.2977224164865943e-06, "loss": 0.0001, "num_input_tokens_seen": 185380512, "step": 85970 }, { "epoch": 15.778124426500275, "grad_norm": 0.0007957594352774322, "learning_rate": 1.2971842711057858e-06, "loss": 0.0041, "num_input_tokens_seen": 185391200, "step": 85975 }, { "epoch": 15.779042026059827, "grad_norm": 7.144662380218506, "learning_rate": 1.2966462206964624e-06, "loss": 0.0107, "num_input_tokens_seen": 185403040, "step": 85980 }, { "epoch": 15.77995962561938, "grad_norm": 0.0010236044181510806, "learning_rate": 1.296108265272422e-06, "loss": 0.0704, "num_input_tokens_seen": 185415008, "step": 85985 }, { "epoch": 15.780877225178932, "grad_norm": 0.012801669538021088, "learning_rate": 1.2955704048474655e-06, "loss": 0.1319, "num_input_tokens_seen": 185426112, "step": 85990 }, { "epoch": 15.781794824738483, "grad_norm": 0.0009202352375723422, "learning_rate": 1.2950326394353874e-06, "loss": 0.0, "num_input_tokens_seen": 185436224, "step": 85995 }, { "epoch": 15.782712424298037, "grad_norm": 0.02136032097041607, "learning_rate": 1.2944949690499776e-06, "loss": 0.0012, "num_input_tokens_seen": 185446336, "step": 86000 }, { "epoch": 15.783630023857588, "grad_norm": 0.007650852669030428, "learning_rate": 1.293957393705031e-06, "loss": 0.0004, "num_input_tokens_seen": 185457312, "step": 86005 }, { "epoch": 15.78454762341714, "grad_norm": 0.6130316853523254, "learning_rate": 1.2934199134143326e-06, "loss": 0.0001, "num_input_tokens_seen": 185467840, "step": 86010 }, { "epoch": 15.785465222976693, "grad_norm": 85.47830963134766, "learning_rate": 1.2928825281916697e-06, "loss": 0.4456, "num_input_tokens_seen": 185479200, "step": 86015 }, { "epoch": 15.786382822536245, "grad_norm": 322.09320068359375, "learning_rate": 1.2923452380508223e-06, "loss": 0.1714, "num_input_tokens_seen": 185491776, "step": 86020 }, { "epoch": 15.787300422095797, "grad_norm": 0.09057474881410599, "learning_rate": 1.291808043005575e-06, "loss": 0.0014, "num_input_tokens_seen": 185502176, "step": 86025 }, { "epoch": 15.78821802165535, "grad_norm": 0.0064119440503418446, "learning_rate": 1.291270943069704e-06, "loss": 0.0001, "num_input_tokens_seen": 185513472, "step": 86030 }, { "epoch": 15.789135621214902, "grad_norm": 29.736759185791016, "learning_rate": 1.290733938256984e-06, "loss": 0.0021, "num_input_tokens_seen": 185524192, "step": 86035 }, { "epoch": 15.790053220774453, "grad_norm": 0.008559548296034336, "learning_rate": 1.290197028581191e-06, "loss": 0.1191, "num_input_tokens_seen": 185535872, "step": 86040 }, { "epoch": 15.790970820334007, "grad_norm": 0.0021742056123912334, "learning_rate": 1.289660214056095e-06, "loss": 0.1688, "num_input_tokens_seen": 185546368, "step": 86045 }, { "epoch": 15.791888419893558, "grad_norm": 0.005577608477324247, "learning_rate": 1.2891234946954617e-06, "loss": 0.04, "num_input_tokens_seen": 185557792, "step": 86050 }, { "epoch": 15.79280601945311, "grad_norm": 0.0037148487754166126, "learning_rate": 1.2885868705130617e-06, "loss": 0.0001, "num_input_tokens_seen": 185569344, "step": 86055 }, { "epoch": 15.793723619012663, "grad_norm": 0.0029980172403156757, "learning_rate": 1.288050341522656e-06, "loss": 0.0003, "num_input_tokens_seen": 185581088, "step": 86060 }, { "epoch": 15.794641218572215, "grad_norm": 0.0025874129496514797, "learning_rate": 1.2875139077380055e-06, "loss": 0.0005, "num_input_tokens_seen": 185591584, "step": 86065 }, { "epoch": 15.795558818131767, "grad_norm": 0.0007069199928082526, "learning_rate": 1.2869775691728703e-06, "loss": 0.0, "num_input_tokens_seen": 185602240, "step": 86070 }, { "epoch": 15.79647641769132, "grad_norm": 0.0036973136011511087, "learning_rate": 1.2864413258410052e-06, "loss": 0.0001, "num_input_tokens_seen": 185613184, "step": 86075 }, { "epoch": 15.797394017250872, "grad_norm": 0.0009607571992091835, "learning_rate": 1.2859051777561631e-06, "loss": 0.0001, "num_input_tokens_seen": 185622976, "step": 86080 }, { "epoch": 15.798311616810423, "grad_norm": 0.005291840992867947, "learning_rate": 1.2853691249320988e-06, "loss": 0.0, "num_input_tokens_seen": 185633408, "step": 86085 }, { "epoch": 15.799229216369977, "grad_norm": 0.058925751596689224, "learning_rate": 1.2848331673825587e-06, "loss": 0.1627, "num_input_tokens_seen": 185644096, "step": 86090 }, { "epoch": 15.800146815929528, "grad_norm": 0.007556856609880924, "learning_rate": 1.2842973051212905e-06, "loss": 0.0001, "num_input_tokens_seen": 185654656, "step": 86095 }, { "epoch": 15.80106441548908, "grad_norm": 0.015569730661809444, "learning_rate": 1.2837615381620371e-06, "loss": 0.0883, "num_input_tokens_seen": 185667040, "step": 86100 }, { "epoch": 15.801982015048633, "grad_norm": 0.022338010370731354, "learning_rate": 1.2832258665185392e-06, "loss": 0.0001, "num_input_tokens_seen": 185678848, "step": 86105 }, { "epoch": 15.802899614608185, "grad_norm": 0.0022624805569648743, "learning_rate": 1.2826902902045391e-06, "loss": 0.0, "num_input_tokens_seen": 185689952, "step": 86110 }, { "epoch": 15.803817214167736, "grad_norm": 0.0010875773150473833, "learning_rate": 1.2821548092337716e-06, "loss": 0.0352, "num_input_tokens_seen": 185700288, "step": 86115 }, { "epoch": 15.80473481372729, "grad_norm": 0.20433466136455536, "learning_rate": 1.2816194236199697e-06, "loss": 0.0017, "num_input_tokens_seen": 185710528, "step": 86120 }, { "epoch": 15.805652413286841, "grad_norm": 0.002700559562072158, "learning_rate": 1.281084133376868e-06, "loss": 0.0, "num_input_tokens_seen": 185721184, "step": 86125 }, { "epoch": 15.806570012846393, "grad_norm": 0.006247624754905701, "learning_rate": 1.2805489385181946e-06, "loss": 0.0001, "num_input_tokens_seen": 185733120, "step": 86130 }, { "epoch": 15.807487612405946, "grad_norm": 0.0017469426384195685, "learning_rate": 1.280013839057675e-06, "loss": 0.0974, "num_input_tokens_seen": 185745280, "step": 86135 }, { "epoch": 15.808405211965498, "grad_norm": 0.020741119980812073, "learning_rate": 1.279478835009036e-06, "loss": 0.0919, "num_input_tokens_seen": 185755488, "step": 86140 }, { "epoch": 15.80932281152505, "grad_norm": 0.1135430857539177, "learning_rate": 1.2789439263859987e-06, "loss": 0.0004, "num_input_tokens_seen": 185766976, "step": 86145 }, { "epoch": 15.810240411084603, "grad_norm": 0.334737092256546, "learning_rate": 1.278409113202283e-06, "loss": 0.0002, "num_input_tokens_seen": 185777536, "step": 86150 }, { "epoch": 15.811158010644155, "grad_norm": 56.4202995300293, "learning_rate": 1.2778743954716032e-06, "loss": 0.2474, "num_input_tokens_seen": 185787872, "step": 86155 }, { "epoch": 15.812075610203706, "grad_norm": 0.004841167014092207, "learning_rate": 1.2773397732076787e-06, "loss": 0.0008, "num_input_tokens_seen": 185798912, "step": 86160 }, { "epoch": 15.81299320976326, "grad_norm": 0.011459569446742535, "learning_rate": 1.2768052464242193e-06, "loss": 0.0001, "num_input_tokens_seen": 185808384, "step": 86165 }, { "epoch": 15.813910809322811, "grad_norm": 0.044348593801259995, "learning_rate": 1.2762708151349335e-06, "loss": 0.0, "num_input_tokens_seen": 185816800, "step": 86170 }, { "epoch": 15.814828408882363, "grad_norm": 0.01047971099615097, "learning_rate": 1.275736479353531e-06, "loss": 0.0, "num_input_tokens_seen": 185827296, "step": 86175 }, { "epoch": 15.815746008441916, "grad_norm": 0.34827783703804016, "learning_rate": 1.2752022390937158e-06, "loss": 0.0081, "num_input_tokens_seen": 185837920, "step": 86180 }, { "epoch": 15.816663608001468, "grad_norm": 141.84205627441406, "learning_rate": 1.2746680943691892e-06, "loss": 0.0173, "num_input_tokens_seen": 185849120, "step": 86185 }, { "epoch": 15.81758120756102, "grad_norm": 0.05375067889690399, "learning_rate": 1.2741340451936535e-06, "loss": 0.1519, "num_input_tokens_seen": 185859360, "step": 86190 }, { "epoch": 15.818498807120573, "grad_norm": 0.00344512308947742, "learning_rate": 1.273600091580805e-06, "loss": 0.1314, "num_input_tokens_seen": 185870272, "step": 86195 }, { "epoch": 15.819416406680125, "grad_norm": 0.7039316892623901, "learning_rate": 1.2730662335443389e-06, "loss": 0.0003, "num_input_tokens_seen": 185881472, "step": 86200 }, { "epoch": 15.820334006239676, "grad_norm": 0.0044448585249483585, "learning_rate": 1.2725324710979459e-06, "loss": 0.0, "num_input_tokens_seen": 185892224, "step": 86205 }, { "epoch": 15.82125160579923, "grad_norm": 0.0009890738874673843, "learning_rate": 1.2719988042553194e-06, "loss": 0.0, "num_input_tokens_seen": 185903104, "step": 86210 }, { "epoch": 15.822169205358781, "grad_norm": 0.0057375384494662285, "learning_rate": 1.2714652330301457e-06, "loss": 0.002, "num_input_tokens_seen": 185914432, "step": 86215 }, { "epoch": 15.823086804918333, "grad_norm": 0.0541372150182724, "learning_rate": 1.2709317574361092e-06, "loss": 0.0, "num_input_tokens_seen": 185924640, "step": 86220 }, { "epoch": 15.824004404477886, "grad_norm": 0.09530878812074661, "learning_rate": 1.2703983774868945e-06, "loss": 0.0005, "num_input_tokens_seen": 185935776, "step": 86225 }, { "epoch": 15.824922004037438, "grad_norm": 79.9839859008789, "learning_rate": 1.2698650931961815e-06, "loss": 0.1488, "num_input_tokens_seen": 185946848, "step": 86230 }, { "epoch": 15.82583960359699, "grad_norm": 0.050165820866823196, "learning_rate": 1.269331904577646e-06, "loss": 0.0763, "num_input_tokens_seen": 185956608, "step": 86235 }, { "epoch": 15.826757203156543, "grad_norm": 0.010343071073293686, "learning_rate": 1.2687988116449663e-06, "loss": 0.0, "num_input_tokens_seen": 185967680, "step": 86240 }, { "epoch": 15.827674802716095, "grad_norm": 0.11178334057331085, "learning_rate": 1.2682658144118144e-06, "loss": 0.0006, "num_input_tokens_seen": 185979296, "step": 86245 }, { "epoch": 15.828592402275646, "grad_norm": 17.40485954284668, "learning_rate": 1.2677329128918608e-06, "loss": 0.1214, "num_input_tokens_seen": 185989920, "step": 86250 }, { "epoch": 15.8295100018352, "grad_norm": 194.33311462402344, "learning_rate": 1.2672001070987716e-06, "loss": 0.056, "num_input_tokens_seen": 186001984, "step": 86255 }, { "epoch": 15.830427601394751, "grad_norm": 5.271421432495117, "learning_rate": 1.2666673970462163e-06, "loss": 0.0008, "num_input_tokens_seen": 186012928, "step": 86260 }, { "epoch": 15.831345200954303, "grad_norm": 0.0012657645856961608, "learning_rate": 1.266134782747856e-06, "loss": 0.002, "num_input_tokens_seen": 186023168, "step": 86265 }, { "epoch": 15.832262800513856, "grad_norm": 0.0021034814417362213, "learning_rate": 1.2656022642173516e-06, "loss": 0.0002, "num_input_tokens_seen": 186034304, "step": 86270 }, { "epoch": 15.833180400073408, "grad_norm": 0.004010187461972237, "learning_rate": 1.2650698414683598e-06, "loss": 0.2167, "num_input_tokens_seen": 186045632, "step": 86275 }, { "epoch": 15.83409799963296, "grad_norm": 0.12341483682394028, "learning_rate": 1.2645375145145395e-06, "loss": 0.0001, "num_input_tokens_seen": 186057120, "step": 86280 }, { "epoch": 15.835015599192513, "grad_norm": 0.011872324161231518, "learning_rate": 1.2640052833695426e-06, "loss": 0.1412, "num_input_tokens_seen": 186068800, "step": 86285 }, { "epoch": 15.835933198752064, "grad_norm": 0.0025364202447235584, "learning_rate": 1.2634731480470197e-06, "loss": 0.0, "num_input_tokens_seen": 186079616, "step": 86290 }, { "epoch": 15.836850798311616, "grad_norm": 44.87118148803711, "learning_rate": 1.2629411085606196e-06, "loss": 0.0884, "num_input_tokens_seen": 186089344, "step": 86295 }, { "epoch": 15.83776839787117, "grad_norm": 0.18780650198459625, "learning_rate": 1.2624091649239867e-06, "loss": 0.001, "num_input_tokens_seen": 186100096, "step": 86300 }, { "epoch": 15.838685997430721, "grad_norm": 1.36320960521698, "learning_rate": 1.261877317150767e-06, "loss": 0.0003, "num_input_tokens_seen": 186110976, "step": 86305 }, { "epoch": 15.839603596990273, "grad_norm": 4.293115139007568, "learning_rate": 1.2613455652546009e-06, "loss": 0.0021, "num_input_tokens_seen": 186120480, "step": 86310 }, { "epoch": 15.840521196549826, "grad_norm": 0.19020088016986847, "learning_rate": 1.2608139092491268e-06, "loss": 0.0002, "num_input_tokens_seen": 186130848, "step": 86315 }, { "epoch": 15.841438796109378, "grad_norm": 0.0008986555621959269, "learning_rate": 1.2602823491479793e-06, "loss": 0.0016, "num_input_tokens_seen": 186141920, "step": 86320 }, { "epoch": 15.84235639566893, "grad_norm": 0.01373775489628315, "learning_rate": 1.2597508849647944e-06, "loss": 0.0001, "num_input_tokens_seen": 186152416, "step": 86325 }, { "epoch": 15.843273995228483, "grad_norm": 0.0006690094014629722, "learning_rate": 1.2592195167132032e-06, "loss": 0.113, "num_input_tokens_seen": 186164160, "step": 86330 }, { "epoch": 15.844191594788034, "grad_norm": 0.02506435476243496, "learning_rate": 1.2586882444068331e-06, "loss": 0.0207, "num_input_tokens_seen": 186174848, "step": 86335 }, { "epoch": 15.845109194347586, "grad_norm": 41.97090530395508, "learning_rate": 1.2581570680593097e-06, "loss": 0.0162, "num_input_tokens_seen": 186184992, "step": 86340 }, { "epoch": 15.84602679390714, "grad_norm": 0.032398633658885956, "learning_rate": 1.25762598768426e-06, "loss": 0.0, "num_input_tokens_seen": 186196704, "step": 86345 }, { "epoch": 15.846944393466691, "grad_norm": 0.0005956491804681718, "learning_rate": 1.2570950032953034e-06, "loss": 0.0001, "num_input_tokens_seen": 186207872, "step": 86350 }, { "epoch": 15.847861993026243, "grad_norm": 0.0064656962640583515, "learning_rate": 1.256564114906057e-06, "loss": 0.0079, "num_input_tokens_seen": 186218816, "step": 86355 }, { "epoch": 15.848779592585796, "grad_norm": 295.8567199707031, "learning_rate": 1.2560333225301413e-06, "loss": 0.0564, "num_input_tokens_seen": 186229920, "step": 86360 }, { "epoch": 15.849697192145348, "grad_norm": 73.92688751220703, "learning_rate": 1.2555026261811682e-06, "loss": 0.0703, "num_input_tokens_seen": 186241056, "step": 86365 }, { "epoch": 15.8506147917049, "grad_norm": 0.004912072326987982, "learning_rate": 1.2549720258727477e-06, "loss": 0.0003, "num_input_tokens_seen": 186252640, "step": 86370 }, { "epoch": 15.851532391264453, "grad_norm": 0.02274857833981514, "learning_rate": 1.2544415216184918e-06, "loss": 0.0, "num_input_tokens_seen": 186263072, "step": 86375 }, { "epoch": 15.852449990824004, "grad_norm": 0.007193038240075111, "learning_rate": 1.2539111134320058e-06, "loss": 0.0063, "num_input_tokens_seen": 186273920, "step": 86380 }, { "epoch": 15.853367590383556, "grad_norm": 80.5079574584961, "learning_rate": 1.2533808013268938e-06, "loss": 0.0051, "num_input_tokens_seen": 186283520, "step": 86385 }, { "epoch": 15.85428518994311, "grad_norm": 0.023307545110583305, "learning_rate": 1.2528505853167566e-06, "loss": 0.0, "num_input_tokens_seen": 186294528, "step": 86390 }, { "epoch": 15.85520278950266, "grad_norm": 0.006450916640460491, "learning_rate": 1.2523204654151955e-06, "loss": 0.0, "num_input_tokens_seen": 186306304, "step": 86395 }, { "epoch": 15.856120389062212, "grad_norm": 0.6831132769584656, "learning_rate": 1.2517904416358056e-06, "loss": 0.2359, "num_input_tokens_seen": 186316608, "step": 86400 }, { "epoch": 15.857037988621766, "grad_norm": 0.02096567675471306, "learning_rate": 1.2512605139921807e-06, "loss": 0.0023, "num_input_tokens_seen": 186327360, "step": 86405 }, { "epoch": 15.857955588181317, "grad_norm": 0.008158981800079346, "learning_rate": 1.2507306824979148e-06, "loss": 0.2757, "num_input_tokens_seen": 186338976, "step": 86410 }, { "epoch": 15.858873187740869, "grad_norm": 0.0010940160136669874, "learning_rate": 1.2502009471665966e-06, "loss": 0.0, "num_input_tokens_seen": 186349344, "step": 86415 }, { "epoch": 15.859790787300422, "grad_norm": 0.001541876350529492, "learning_rate": 1.2496713080118116e-06, "loss": 0.0, "num_input_tokens_seen": 186360224, "step": 86420 }, { "epoch": 15.860708386859974, "grad_norm": 0.0020728623494505882, "learning_rate": 1.2491417650471444e-06, "loss": 0.059, "num_input_tokens_seen": 186371872, "step": 86425 }, { "epoch": 15.861625986419526, "grad_norm": 0.08274060487747192, "learning_rate": 1.2486123182861788e-06, "loss": 0.0001, "num_input_tokens_seen": 186382272, "step": 86430 }, { "epoch": 15.862543585979079, "grad_norm": 0.015591448172926903, "learning_rate": 1.2480829677424933e-06, "loss": 0.0001, "num_input_tokens_seen": 186392128, "step": 86435 }, { "epoch": 15.86346118553863, "grad_norm": 25.462419509887695, "learning_rate": 1.2475537134296628e-06, "loss": 0.0005, "num_input_tokens_seen": 186402816, "step": 86440 }, { "epoch": 15.864378785098182, "grad_norm": 0.03937884420156479, "learning_rate": 1.2470245553612654e-06, "loss": 0.0007, "num_input_tokens_seen": 186414144, "step": 86445 }, { "epoch": 15.865296384657736, "grad_norm": 0.001079393783584237, "learning_rate": 1.2464954935508716e-06, "loss": 0.2126, "num_input_tokens_seen": 186424544, "step": 86450 }, { "epoch": 15.866213984217287, "grad_norm": 0.08333003520965576, "learning_rate": 1.2459665280120498e-06, "loss": 0.0006, "num_input_tokens_seen": 186436320, "step": 86455 }, { "epoch": 15.867131583776839, "grad_norm": 0.0014899306697770953, "learning_rate": 1.2454376587583693e-06, "loss": 0.0, "num_input_tokens_seen": 186446336, "step": 86460 }, { "epoch": 15.868049183336392, "grad_norm": 0.1342417299747467, "learning_rate": 1.244908885803394e-06, "loss": 0.0001, "num_input_tokens_seen": 186457920, "step": 86465 }, { "epoch": 15.868966782895944, "grad_norm": 0.0033065141178667545, "learning_rate": 1.244380209160686e-06, "loss": 0.0158, "num_input_tokens_seen": 186468800, "step": 86470 }, { "epoch": 15.869884382455496, "grad_norm": 0.010752749629318714, "learning_rate": 1.2438516288438036e-06, "loss": 0.0001, "num_input_tokens_seen": 186479040, "step": 86475 }, { "epoch": 15.870801982015049, "grad_norm": 0.1723480075597763, "learning_rate": 1.2433231448663069e-06, "loss": 0.3827, "num_input_tokens_seen": 186489696, "step": 86480 }, { "epoch": 15.8717195815746, "grad_norm": 0.0030802851542830467, "learning_rate": 1.2427947572417493e-06, "loss": 0.0119, "num_input_tokens_seen": 186500384, "step": 86485 }, { "epoch": 15.872637181134152, "grad_norm": 0.004196579102426767, "learning_rate": 1.2422664659836824e-06, "loss": 0.0004, "num_input_tokens_seen": 186511136, "step": 86490 }, { "epoch": 15.873554780693706, "grad_norm": 0.0011375208850950003, "learning_rate": 1.2417382711056558e-06, "loss": 0.0645, "num_input_tokens_seen": 186521632, "step": 86495 }, { "epoch": 15.874472380253257, "grad_norm": 0.08706703782081604, "learning_rate": 1.241210172621219e-06, "loss": 0.0001, "num_input_tokens_seen": 186533120, "step": 86500 }, { "epoch": 15.875389979812809, "grad_norm": 0.013851125724613667, "learning_rate": 1.240682170543916e-06, "loss": 0.0001, "num_input_tokens_seen": 186544160, "step": 86505 }, { "epoch": 15.876307579372362, "grad_norm": 0.006465247832238674, "learning_rate": 1.2401542648872883e-06, "loss": 0.0, "num_input_tokens_seen": 186554592, "step": 86510 }, { "epoch": 15.877225178931914, "grad_norm": 55.2061882019043, "learning_rate": 1.239626455664877e-06, "loss": 0.0532, "num_input_tokens_seen": 186565408, "step": 86515 }, { "epoch": 15.878142778491465, "grad_norm": 0.026133956387639046, "learning_rate": 1.2390987428902175e-06, "loss": 0.0006, "num_input_tokens_seen": 186577248, "step": 86520 }, { "epoch": 15.879060378051019, "grad_norm": 0.033344708383083344, "learning_rate": 1.2385711265768474e-06, "loss": 0.0003, "num_input_tokens_seen": 186588384, "step": 86525 }, { "epoch": 15.87997797761057, "grad_norm": 0.16072572767734528, "learning_rate": 1.238043606738299e-06, "loss": 0.1565, "num_input_tokens_seen": 186598048, "step": 86530 }, { "epoch": 15.880895577170122, "grad_norm": 0.010097011923789978, "learning_rate": 1.2375161833881011e-06, "loss": 0.0, "num_input_tokens_seen": 186608416, "step": 86535 }, { "epoch": 15.881813176729676, "grad_norm": 0.0114277433604002, "learning_rate": 1.2369888565397802e-06, "loss": 0.0, "num_input_tokens_seen": 186619584, "step": 86540 }, { "epoch": 15.882730776289227, "grad_norm": 107.89356231689453, "learning_rate": 1.2364616262068646e-06, "loss": 0.2815, "num_input_tokens_seen": 186630240, "step": 86545 }, { "epoch": 15.883648375848779, "grad_norm": 0.0023303001653403044, "learning_rate": 1.2359344924028754e-06, "loss": 0.0002, "num_input_tokens_seen": 186639776, "step": 86550 }, { "epoch": 15.884565975408332, "grad_norm": 268.445556640625, "learning_rate": 1.2354074551413314e-06, "loss": 0.1311, "num_input_tokens_seen": 186650176, "step": 86555 }, { "epoch": 15.885483574967884, "grad_norm": 15.07769775390625, "learning_rate": 1.2348805144357528e-06, "loss": 0.001, "num_input_tokens_seen": 186660576, "step": 86560 }, { "epoch": 15.886401174527435, "grad_norm": 0.04177932068705559, "learning_rate": 1.2343536702996534e-06, "loss": 0.1938, "num_input_tokens_seen": 186671680, "step": 86565 }, { "epoch": 15.887318774086989, "grad_norm": 0.026363667100667953, "learning_rate": 1.2338269227465467e-06, "loss": 0.0004, "num_input_tokens_seen": 186682496, "step": 86570 }, { "epoch": 15.88823637364654, "grad_norm": 0.009379317983984947, "learning_rate": 1.2333002717899405e-06, "loss": 0.0, "num_input_tokens_seen": 186693984, "step": 86575 }, { "epoch": 15.889153973206092, "grad_norm": 0.005137205123901367, "learning_rate": 1.2327737174433457e-06, "loss": 0.0004, "num_input_tokens_seen": 186704672, "step": 86580 }, { "epoch": 15.890071572765645, "grad_norm": 0.012525012716650963, "learning_rate": 1.2322472597202667e-06, "loss": 0.0008, "num_input_tokens_seen": 186714688, "step": 86585 }, { "epoch": 15.890989172325197, "grad_norm": 0.03485265001654625, "learning_rate": 1.231720898634205e-06, "loss": 0.0329, "num_input_tokens_seen": 186726080, "step": 86590 }, { "epoch": 15.891906771884749, "grad_norm": 0.007219233550131321, "learning_rate": 1.2311946341986624e-06, "loss": 0.0, "num_input_tokens_seen": 186738208, "step": 86595 }, { "epoch": 15.892824371444302, "grad_norm": 0.050784096121788025, "learning_rate": 1.2306684664271374e-06, "loss": 0.0013, "num_input_tokens_seen": 186748544, "step": 86600 }, { "epoch": 15.893741971003854, "grad_norm": 0.0025828711222857237, "learning_rate": 1.2301423953331237e-06, "loss": 0.0, "num_input_tokens_seen": 186759328, "step": 86605 }, { "epoch": 15.894659570563405, "grad_norm": 0.059385161846876144, "learning_rate": 1.2296164209301132e-06, "loss": 0.0018, "num_input_tokens_seen": 186770624, "step": 86610 }, { "epoch": 15.895577170122959, "grad_norm": 0.0007580426754429936, "learning_rate": 1.2290905432315997e-06, "loss": 0.0, "num_input_tokens_seen": 186780256, "step": 86615 }, { "epoch": 15.89649476968251, "grad_norm": 0.03015396185219288, "learning_rate": 1.2285647622510693e-06, "loss": 0.0, "num_input_tokens_seen": 186791584, "step": 86620 }, { "epoch": 15.897412369242062, "grad_norm": 0.0012891158694401383, "learning_rate": 1.2280390780020062e-06, "loss": 0.0674, "num_input_tokens_seen": 186803744, "step": 86625 }, { "epoch": 15.898329968801615, "grad_norm": 0.18885762989521027, "learning_rate": 1.2275134904978958e-06, "loss": 0.0001, "num_input_tokens_seen": 186816416, "step": 86630 }, { "epoch": 15.899247568361167, "grad_norm": 13.973527908325195, "learning_rate": 1.2269879997522182e-06, "loss": 0.0004, "num_input_tokens_seen": 186826240, "step": 86635 }, { "epoch": 15.900165167920719, "grad_norm": 0.0026866134721785784, "learning_rate": 1.226462605778449e-06, "loss": 0.0376, "num_input_tokens_seen": 186837312, "step": 86640 }, { "epoch": 15.901082767480272, "grad_norm": 0.0022029909305274487, "learning_rate": 1.2259373085900667e-06, "loss": 0.0, "num_input_tokens_seen": 186848896, "step": 86645 }, { "epoch": 15.902000367039824, "grad_norm": 0.08862262964248657, "learning_rate": 1.2254121082005431e-06, "loss": 0.0064, "num_input_tokens_seen": 186860480, "step": 86650 }, { "epoch": 15.902917966599375, "grad_norm": 0.004824578296393156, "learning_rate": 1.2248870046233495e-06, "loss": 0.0, "num_input_tokens_seen": 186872928, "step": 86655 }, { "epoch": 15.903835566158929, "grad_norm": 0.0029657993000000715, "learning_rate": 1.2243619978719518e-06, "loss": 0.0001, "num_input_tokens_seen": 186884000, "step": 86660 }, { "epoch": 15.90475316571848, "grad_norm": 0.0358220674097538, "learning_rate": 1.2238370879598183e-06, "loss": 0.0, "num_input_tokens_seen": 186894368, "step": 86665 }, { "epoch": 15.905670765278032, "grad_norm": 0.02652020938694477, "learning_rate": 1.2233122749004107e-06, "loss": 0.0001, "num_input_tokens_seen": 186907008, "step": 86670 }, { "epoch": 15.906588364837585, "grad_norm": 0.01135295256972313, "learning_rate": 1.2227875587071886e-06, "loss": 0.0001, "num_input_tokens_seen": 186917760, "step": 86675 }, { "epoch": 15.907505964397137, "grad_norm": 0.016807138919830322, "learning_rate": 1.222262939393613e-06, "loss": 0.0011, "num_input_tokens_seen": 186929472, "step": 86680 }, { "epoch": 15.908423563956688, "grad_norm": 79.59978485107422, "learning_rate": 1.2217384169731383e-06, "loss": 0.0733, "num_input_tokens_seen": 186939872, "step": 86685 }, { "epoch": 15.909341163516242, "grad_norm": 0.003552593756467104, "learning_rate": 1.2212139914592158e-06, "loss": 0.0666, "num_input_tokens_seen": 186951520, "step": 86690 }, { "epoch": 15.910258763075793, "grad_norm": 0.003296884708106518, "learning_rate": 1.2206896628652992e-06, "loss": 0.0, "num_input_tokens_seen": 186962560, "step": 86695 }, { "epoch": 15.911176362635345, "grad_norm": 0.0008502623531967402, "learning_rate": 1.2201654312048355e-06, "loss": 0.0001, "num_input_tokens_seen": 186974016, "step": 86700 }, { "epoch": 15.912093962194898, "grad_norm": 0.025653932243585587, "learning_rate": 1.2196412964912702e-06, "loss": 0.0, "num_input_tokens_seen": 186983712, "step": 86705 }, { "epoch": 15.91301156175445, "grad_norm": 0.00044407552923075855, "learning_rate": 1.2191172587380467e-06, "loss": 0.0, "num_input_tokens_seen": 186994624, "step": 86710 }, { "epoch": 15.913929161314002, "grad_norm": 0.0007265364984050393, "learning_rate": 1.218593317958604e-06, "loss": 0.0, "num_input_tokens_seen": 187005728, "step": 86715 }, { "epoch": 15.914846760873555, "grad_norm": 0.005816254299134016, "learning_rate": 1.218069474166384e-06, "loss": 0.0, "num_input_tokens_seen": 187015904, "step": 86720 }, { "epoch": 15.915764360433107, "grad_norm": 0.003157028229907155, "learning_rate": 1.2175457273748199e-06, "loss": 0.225, "num_input_tokens_seen": 187026976, "step": 86725 }, { "epoch": 15.916681959992658, "grad_norm": 0.07490724325180054, "learning_rate": 1.2170220775973462e-06, "loss": 0.0005, "num_input_tokens_seen": 187037376, "step": 86730 }, { "epoch": 15.917599559552212, "grad_norm": 0.0015692294109612703, "learning_rate": 1.2164985248473926e-06, "loss": 0.0, "num_input_tokens_seen": 187047872, "step": 86735 }, { "epoch": 15.918517159111763, "grad_norm": 1.9798016548156738, "learning_rate": 1.2159750691383865e-06, "loss": 0.0005, "num_input_tokens_seen": 187058560, "step": 86740 }, { "epoch": 15.919434758671315, "grad_norm": 0.007062522228807211, "learning_rate": 1.215451710483757e-06, "loss": 0.1314, "num_input_tokens_seen": 187069536, "step": 86745 }, { "epoch": 15.920352358230868, "grad_norm": 0.01167959813028574, "learning_rate": 1.2149284488969254e-06, "loss": 0.0003, "num_input_tokens_seen": 187080256, "step": 86750 }, { "epoch": 15.92126995779042, "grad_norm": 0.003328244900330901, "learning_rate": 1.214405284391313e-06, "loss": 0.0008, "num_input_tokens_seen": 187092576, "step": 86755 }, { "epoch": 15.922187557349972, "grad_norm": 0.021501094102859497, "learning_rate": 1.213882216980336e-06, "loss": 0.0003, "num_input_tokens_seen": 187104384, "step": 86760 }, { "epoch": 15.923105156909525, "grad_norm": 0.006944772321730852, "learning_rate": 1.213359246677414e-06, "loss": 0.0, "num_input_tokens_seen": 187116064, "step": 86765 }, { "epoch": 15.924022756469077, "grad_norm": 0.0022776841651648283, "learning_rate": 1.2128363734959585e-06, "loss": 0.0001, "num_input_tokens_seen": 187126304, "step": 86770 }, { "epoch": 15.924940356028628, "grad_norm": 417.8082275390625, "learning_rate": 1.2123135974493788e-06, "loss": 0.0823, "num_input_tokens_seen": 187135712, "step": 86775 }, { "epoch": 15.925857955588182, "grad_norm": 0.025557689368724823, "learning_rate": 1.2117909185510867e-06, "loss": 0.0016, "num_input_tokens_seen": 187146048, "step": 86780 }, { "epoch": 15.926775555147733, "grad_norm": 0.02238273434340954, "learning_rate": 1.2112683368144862e-06, "loss": 0.0046, "num_input_tokens_seen": 187158336, "step": 86785 }, { "epoch": 15.927693154707285, "grad_norm": 0.0024572822730988264, "learning_rate": 1.2107458522529808e-06, "loss": 0.033, "num_input_tokens_seen": 187170560, "step": 86790 }, { "epoch": 15.928610754266838, "grad_norm": 16.859291076660156, "learning_rate": 1.2102234648799699e-06, "loss": 0.0144, "num_input_tokens_seen": 187181376, "step": 86795 }, { "epoch": 15.92952835382639, "grad_norm": 52.51200485229492, "learning_rate": 1.2097011747088555e-06, "loss": 0.0063, "num_input_tokens_seen": 187192352, "step": 86800 }, { "epoch": 15.930445953385941, "grad_norm": 0.13227425515651703, "learning_rate": 1.2091789817530308e-06, "loss": 0.0001, "num_input_tokens_seen": 187203168, "step": 86805 }, { "epoch": 15.931363552945495, "grad_norm": 0.00254053995013237, "learning_rate": 1.208656886025889e-06, "loss": 0.0, "num_input_tokens_seen": 187214624, "step": 86810 }, { "epoch": 15.932281152505047, "grad_norm": 14.3026704788208, "learning_rate": 1.2081348875408233e-06, "loss": 0.002, "num_input_tokens_seen": 187226208, "step": 86815 }, { "epoch": 15.933198752064598, "grad_norm": 73.52391052246094, "learning_rate": 1.2076129863112213e-06, "loss": 0.0704, "num_input_tokens_seen": 187237568, "step": 86820 }, { "epoch": 15.934116351624152, "grad_norm": 0.00965859554708004, "learning_rate": 1.2070911823504667e-06, "loss": 0.0, "num_input_tokens_seen": 187248800, "step": 86825 }, { "epoch": 15.935033951183703, "grad_norm": 313.9062805175781, "learning_rate": 1.2065694756719459e-06, "loss": 0.0808, "num_input_tokens_seen": 187260448, "step": 86830 }, { "epoch": 15.935951550743255, "grad_norm": 0.01938052289187908, "learning_rate": 1.2060478662890396e-06, "loss": 0.2908, "num_input_tokens_seen": 187272096, "step": 86835 }, { "epoch": 15.936869150302808, "grad_norm": 0.03257603943347931, "learning_rate": 1.2055263542151246e-06, "loss": 0.0564, "num_input_tokens_seen": 187283424, "step": 86840 }, { "epoch": 15.93778674986236, "grad_norm": 0.0018939293222501874, "learning_rate": 1.2050049394635766e-06, "loss": 0.0376, "num_input_tokens_seen": 187294016, "step": 86845 }, { "epoch": 15.938704349421911, "grad_norm": 0.9381569623947144, "learning_rate": 1.2044836220477718e-06, "loss": 0.0002, "num_input_tokens_seen": 187305472, "step": 86850 }, { "epoch": 15.939621948981465, "grad_norm": 199.819091796875, "learning_rate": 1.2039624019810796e-06, "loss": 0.2948, "num_input_tokens_seen": 187315072, "step": 86855 }, { "epoch": 15.940539548541016, "grad_norm": 0.0013076436007395387, "learning_rate": 1.2034412792768668e-06, "loss": 0.0006, "num_input_tokens_seen": 187326048, "step": 86860 }, { "epoch": 15.941457148100568, "grad_norm": 0.013880347833037376, "learning_rate": 1.2029202539485025e-06, "loss": 0.1908, "num_input_tokens_seen": 187337504, "step": 86865 }, { "epoch": 15.942374747660121, "grad_norm": 272.0011291503906, "learning_rate": 1.2023993260093491e-06, "loss": 0.0589, "num_input_tokens_seen": 187348096, "step": 86870 }, { "epoch": 15.943292347219673, "grad_norm": 33.02472686767578, "learning_rate": 1.2018784954727669e-06, "loss": 0.0036, "num_input_tokens_seen": 187359424, "step": 86875 }, { "epoch": 15.944209946779225, "grad_norm": 0.015559163875877857, "learning_rate": 1.2013577623521132e-06, "loss": 0.0014, "num_input_tokens_seen": 187370912, "step": 86880 }, { "epoch": 15.945127546338778, "grad_norm": 0.03072150982916355, "learning_rate": 1.2008371266607471e-06, "loss": 0.0003, "num_input_tokens_seen": 187382112, "step": 86885 }, { "epoch": 15.94604514589833, "grad_norm": 0.004591940902173519, "learning_rate": 1.2003165884120205e-06, "loss": 0.0001, "num_input_tokens_seen": 187393248, "step": 86890 }, { "epoch": 15.946962745457881, "grad_norm": 0.012067156843841076, "learning_rate": 1.1997961476192832e-06, "loss": 0.0854, "num_input_tokens_seen": 187404064, "step": 86895 }, { "epoch": 15.947880345017435, "grad_norm": 0.009615495800971985, "learning_rate": 1.1992758042958864e-06, "loss": 0.0, "num_input_tokens_seen": 187412864, "step": 86900 }, { "epoch": 15.948797944576986, "grad_norm": 0.014468694105744362, "learning_rate": 1.1987555584551741e-06, "loss": 0.0329, "num_input_tokens_seen": 187422816, "step": 86905 }, { "epoch": 15.949715544136538, "grad_norm": 0.0022015240974724293, "learning_rate": 1.1982354101104892e-06, "loss": 0.0001, "num_input_tokens_seen": 187433376, "step": 86910 }, { "epoch": 15.950633143696091, "grad_norm": 0.03186031058430672, "learning_rate": 1.1977153592751755e-06, "loss": 0.0018, "num_input_tokens_seen": 187443808, "step": 86915 }, { "epoch": 15.951550743255643, "grad_norm": 0.004244111943989992, "learning_rate": 1.1971954059625696e-06, "loss": 0.0049, "num_input_tokens_seen": 187454368, "step": 86920 }, { "epoch": 15.952468342815195, "grad_norm": 0.0008646242204122245, "learning_rate": 1.1966755501860077e-06, "loss": 0.0, "num_input_tokens_seen": 187465376, "step": 86925 }, { "epoch": 15.953385942374748, "grad_norm": 0.000504167634062469, "learning_rate": 1.1961557919588234e-06, "loss": 0.0, "num_input_tokens_seen": 187475552, "step": 86930 }, { "epoch": 15.9543035419343, "grad_norm": 0.10346938669681549, "learning_rate": 1.1956361312943466e-06, "loss": 0.0001, "num_input_tokens_seen": 187486304, "step": 86935 }, { "epoch": 15.955221141493851, "grad_norm": 0.065660260617733, "learning_rate": 1.1951165682059073e-06, "loss": 0.0001, "num_input_tokens_seen": 187497632, "step": 86940 }, { "epoch": 15.956138741053405, "grad_norm": 0.003376849228516221, "learning_rate": 1.194597102706832e-06, "loss": 0.0006, "num_input_tokens_seen": 187509152, "step": 86945 }, { "epoch": 15.957056340612956, "grad_norm": 4.911361217498779, "learning_rate": 1.1940777348104427e-06, "loss": 0.0008, "num_input_tokens_seen": 187518912, "step": 86950 }, { "epoch": 15.957973940172508, "grad_norm": 0.004380465019494295, "learning_rate": 1.1935584645300607e-06, "loss": 0.1407, "num_input_tokens_seen": 187529376, "step": 86955 }, { "epoch": 15.958891539732061, "grad_norm": 3.2544212341308594, "learning_rate": 1.1930392918790035e-06, "loss": 0.0183, "num_input_tokens_seen": 187540288, "step": 86960 }, { "epoch": 15.959809139291613, "grad_norm": 0.04220809042453766, "learning_rate": 1.192520216870589e-06, "loss": 0.0001, "num_input_tokens_seen": 187551232, "step": 86965 }, { "epoch": 15.960726738851164, "grad_norm": 0.13401566445827484, "learning_rate": 1.1920012395181308e-06, "loss": 0.0001, "num_input_tokens_seen": 187559776, "step": 86970 }, { "epoch": 15.961644338410718, "grad_norm": 0.0018268482526764274, "learning_rate": 1.1914823598349384e-06, "loss": 0.1532, "num_input_tokens_seen": 187570784, "step": 86975 }, { "epoch": 15.96256193797027, "grad_norm": 0.0019516147440299392, "learning_rate": 1.1909635778343192e-06, "loss": 0.0001, "num_input_tokens_seen": 187581760, "step": 86980 }, { "epoch": 15.963479537529821, "grad_norm": 0.0016121844528242946, "learning_rate": 1.1904448935295825e-06, "loss": 0.0, "num_input_tokens_seen": 187592768, "step": 86985 }, { "epoch": 15.964397137089374, "grad_norm": 0.02130666933953762, "learning_rate": 1.18992630693403e-06, "loss": 0.0001, "num_input_tokens_seen": 187603584, "step": 86990 }, { "epoch": 15.965314736648926, "grad_norm": 0.13393186032772064, "learning_rate": 1.1894078180609614e-06, "loss": 0.0002, "num_input_tokens_seen": 187613728, "step": 86995 }, { "epoch": 15.966232336208478, "grad_norm": 0.0027484914753586054, "learning_rate": 1.1888894269236773e-06, "loss": 0.0001, "num_input_tokens_seen": 187624512, "step": 87000 }, { "epoch": 15.967149935768031, "grad_norm": 2.0488967895507812, "learning_rate": 1.188371133535473e-06, "loss": 0.0001, "num_input_tokens_seen": 187635808, "step": 87005 }, { "epoch": 15.968067535327583, "grad_norm": 0.11085492372512817, "learning_rate": 1.1878529379096405e-06, "loss": 0.0, "num_input_tokens_seen": 187646464, "step": 87010 }, { "epoch": 15.968985134887134, "grad_norm": 7.4275898933410645, "learning_rate": 1.1873348400594725e-06, "loss": 0.0748, "num_input_tokens_seen": 187656448, "step": 87015 }, { "epoch": 15.969902734446688, "grad_norm": 17.742921829223633, "learning_rate": 1.1868168399982578e-06, "loss": 0.001, "num_input_tokens_seen": 187666912, "step": 87020 }, { "epoch": 15.97082033400624, "grad_norm": 1.601529598236084, "learning_rate": 1.1862989377392802e-06, "loss": 0.0005, "num_input_tokens_seen": 187677664, "step": 87025 }, { "epoch": 15.971737933565791, "grad_norm": 0.0047347573563456535, "learning_rate": 1.1857811332958235e-06, "loss": 0.0001, "num_input_tokens_seen": 187687264, "step": 87030 }, { "epoch": 15.972655533125344, "grad_norm": 0.6817436814308167, "learning_rate": 1.1852634266811701e-06, "loss": 0.0067, "num_input_tokens_seen": 187698496, "step": 87035 }, { "epoch": 15.973573132684896, "grad_norm": 0.12953026592731476, "learning_rate": 1.184745817908598e-06, "loss": 0.0001, "num_input_tokens_seen": 187709440, "step": 87040 }, { "epoch": 15.974490732244448, "grad_norm": 0.2017122060060501, "learning_rate": 1.1842283069913807e-06, "loss": 0.0004, "num_input_tokens_seen": 187720960, "step": 87045 }, { "epoch": 15.975408331804001, "grad_norm": 0.23747265338897705, "learning_rate": 1.1837108939427955e-06, "loss": 0.0042, "num_input_tokens_seen": 187731136, "step": 87050 }, { "epoch": 15.976325931363553, "grad_norm": 0.0043304432183504105, "learning_rate": 1.1831935787761106e-06, "loss": 0.0001, "num_input_tokens_seen": 187741248, "step": 87055 }, { "epoch": 15.977243530923104, "grad_norm": 0.0021465716417878866, "learning_rate": 1.182676361504595e-06, "loss": 0.238, "num_input_tokens_seen": 187751712, "step": 87060 }, { "epoch": 15.978161130482658, "grad_norm": 1.1766144037246704, "learning_rate": 1.182159242141513e-06, "loss": 0.0921, "num_input_tokens_seen": 187761088, "step": 87065 }, { "epoch": 15.97907873004221, "grad_norm": 0.17039968073368073, "learning_rate": 1.1816422207001304e-06, "loss": 0.0001, "num_input_tokens_seen": 187772032, "step": 87070 }, { "epoch": 15.97999632960176, "grad_norm": 0.04917769134044647, "learning_rate": 1.1811252971937075e-06, "loss": 0.0004, "num_input_tokens_seen": 187783424, "step": 87075 }, { "epoch": 15.980913929161314, "grad_norm": 0.0059677897952497005, "learning_rate": 1.1806084716355003e-06, "loss": 0.0008, "num_input_tokens_seen": 187794144, "step": 87080 }, { "epoch": 15.981831528720866, "grad_norm": 0.005992555059492588, "learning_rate": 1.1800917440387677e-06, "loss": 0.197, "num_input_tokens_seen": 187804032, "step": 87085 }, { "epoch": 15.982749128280417, "grad_norm": 0.6180358529090881, "learning_rate": 1.1795751144167616e-06, "loss": 0.0002, "num_input_tokens_seen": 187813824, "step": 87090 }, { "epoch": 15.983666727839971, "grad_norm": 0.05896007642149925, "learning_rate": 1.179058582782731e-06, "loss": 0.1751, "num_input_tokens_seen": 187825472, "step": 87095 }, { "epoch": 15.984584327399523, "grad_norm": 0.1671428233385086, "learning_rate": 1.1785421491499277e-06, "loss": 0.0001, "num_input_tokens_seen": 187836608, "step": 87100 }, { "epoch": 15.985501926959074, "grad_norm": 0.8449310660362244, "learning_rate": 1.1780258135315953e-06, "loss": 0.0003, "num_input_tokens_seen": 187847616, "step": 87105 }, { "epoch": 15.986419526518628, "grad_norm": 0.029101325199007988, "learning_rate": 1.1775095759409776e-06, "loss": 0.0001, "num_input_tokens_seen": 187858624, "step": 87110 }, { "epoch": 15.98733712607818, "grad_norm": 0.10303152352571487, "learning_rate": 1.1769934363913132e-06, "loss": 0.0, "num_input_tokens_seen": 187868768, "step": 87115 }, { "epoch": 15.98825472563773, "grad_norm": 0.001954376231878996, "learning_rate": 1.1764773948958435e-06, "loss": 0.0001, "num_input_tokens_seen": 187879136, "step": 87120 }, { "epoch": 15.989172325197284, "grad_norm": 25.502277374267578, "learning_rate": 1.1759614514678024e-06, "loss": 0.0038, "num_input_tokens_seen": 187890816, "step": 87125 }, { "epoch": 15.990089924756836, "grad_norm": 0.002183606382459402, "learning_rate": 1.1754456061204227e-06, "loss": 0.0348, "num_input_tokens_seen": 187901696, "step": 87130 }, { "epoch": 15.991007524316387, "grad_norm": 0.020250625908374786, "learning_rate": 1.1749298588669366e-06, "loss": 0.0003, "num_input_tokens_seen": 187911776, "step": 87135 }, { "epoch": 15.99192512387594, "grad_norm": 0.022085392847657204, "learning_rate": 1.1744142097205713e-06, "loss": 0.0007, "num_input_tokens_seen": 187921664, "step": 87140 }, { "epoch": 15.992842723435492, "grad_norm": 0.057389140129089355, "learning_rate": 1.1738986586945523e-06, "loss": 0.1268, "num_input_tokens_seen": 187932832, "step": 87145 }, { "epoch": 15.993760322995044, "grad_norm": 0.0005715584848076105, "learning_rate": 1.1733832058021027e-06, "loss": 0.0, "num_input_tokens_seen": 187942880, "step": 87150 }, { "epoch": 15.994677922554597, "grad_norm": 0.0021237297914922237, "learning_rate": 1.1728678510564435e-06, "loss": 0.0003, "num_input_tokens_seen": 187954944, "step": 87155 }, { "epoch": 15.995595522114149, "grad_norm": 0.15490683913230896, "learning_rate": 1.1723525944707908e-06, "loss": 0.0793, "num_input_tokens_seen": 187965664, "step": 87160 }, { "epoch": 15.996513121673702, "grad_norm": 1.4565542936325073, "learning_rate": 1.1718374360583633e-06, "loss": 0.0004, "num_input_tokens_seen": 187978112, "step": 87165 }, { "epoch": 15.997430721233254, "grad_norm": 0.019784418866038322, "learning_rate": 1.1713223758323728e-06, "loss": 0.0001, "num_input_tokens_seen": 187988256, "step": 87170 }, { "epoch": 15.998348320792806, "grad_norm": 5.529341220855713, "learning_rate": 1.170807413806029e-06, "loss": 0.0006, "num_input_tokens_seen": 187999616, "step": 87175 }, { "epoch": 15.999265920352359, "grad_norm": 0.048593562096357346, "learning_rate": 1.1702925499925388e-06, "loss": 0.0001, "num_input_tokens_seen": 188011712, "step": 87180 }, { "epoch": 16.0, "eval_loss": 1.2439020872116089, "eval_runtime": 179.3626, "eval_samples_per_second": 30.38, "eval_steps_per_second": 7.599, "num_input_tokens_seen": 188018640, "step": 87184 }, { "epoch": 16.00018351991191, "grad_norm": 0.0042565190233290195, "learning_rate": 1.1697777844051105e-06, "loss": 0.0178, "num_input_tokens_seen": 188021200, "step": 87185 }, { "epoch": 16.001101119471464, "grad_norm": 0.010316613130271435, "learning_rate": 1.1692631170569457e-06, "loss": 0.0, "num_input_tokens_seen": 188031824, "step": 87190 }, { "epoch": 16.002018719031014, "grad_norm": 0.0157769825309515, "learning_rate": 1.1687485479612453e-06, "loss": 0.002, "num_input_tokens_seen": 188042512, "step": 87195 }, { "epoch": 16.002936318590567, "grad_norm": 0.0035345079377293587, "learning_rate": 1.1682340771312051e-06, "loss": 0.0, "num_input_tokens_seen": 188053520, "step": 87200 }, { "epoch": 16.00385391815012, "grad_norm": 0.0037230621092021465, "learning_rate": 1.1677197045800238e-06, "loss": 0.1688, "num_input_tokens_seen": 188063952, "step": 87205 }, { "epoch": 16.00477151770967, "grad_norm": 0.013761374168097973, "learning_rate": 1.1672054303208923e-06, "loss": 0.0064, "num_input_tokens_seen": 188074512, "step": 87210 }, { "epoch": 16.005689117269224, "grad_norm": 0.003367119235917926, "learning_rate": 1.1666912543669995e-06, "loss": 0.0001, "num_input_tokens_seen": 188085520, "step": 87215 }, { "epoch": 16.006606716828777, "grad_norm": 0.02295764721930027, "learning_rate": 1.1661771767315366e-06, "loss": 0.0478, "num_input_tokens_seen": 188098288, "step": 87220 }, { "epoch": 16.007524316388327, "grad_norm": 0.0026020228397101164, "learning_rate": 1.1656631974276878e-06, "loss": 0.0001, "num_input_tokens_seen": 188108880, "step": 87225 }, { "epoch": 16.00844191594788, "grad_norm": 59.28208541870117, "learning_rate": 1.1651493164686333e-06, "loss": 0.1005, "num_input_tokens_seen": 188120432, "step": 87230 }, { "epoch": 16.009359515507434, "grad_norm": 0.02778468281030655, "learning_rate": 1.1646355338675568e-06, "loss": 0.083, "num_input_tokens_seen": 188132336, "step": 87235 }, { "epoch": 16.010277115066984, "grad_norm": 0.03614906966686249, "learning_rate": 1.1641218496376345e-06, "loss": 0.0001, "num_input_tokens_seen": 188144496, "step": 87240 }, { "epoch": 16.011194714626537, "grad_norm": 0.005931049585342407, "learning_rate": 1.163608263792042e-06, "loss": 0.0003, "num_input_tokens_seen": 188155056, "step": 87245 }, { "epoch": 16.01211231418609, "grad_norm": 0.0005900880787521601, "learning_rate": 1.1630947763439498e-06, "loss": 0.0001, "num_input_tokens_seen": 188166672, "step": 87250 }, { "epoch": 16.01302991374564, "grad_norm": 0.012462516315281391, "learning_rate": 1.1625813873065317e-06, "loss": 0.0, "num_input_tokens_seen": 188178160, "step": 87255 }, { "epoch": 16.013947513305194, "grad_norm": 0.0028754156082868576, "learning_rate": 1.1620680966929538e-06, "loss": 0.0, "num_input_tokens_seen": 188189008, "step": 87260 }, { "epoch": 16.014865112864747, "grad_norm": 0.01396855991333723, "learning_rate": 1.1615549045163794e-06, "loss": 0.0, "num_input_tokens_seen": 188199696, "step": 87265 }, { "epoch": 16.015782712424297, "grad_norm": 2.0315475463867188, "learning_rate": 1.1610418107899734e-06, "loss": 0.0018, "num_input_tokens_seen": 188210416, "step": 87270 }, { "epoch": 16.01670031198385, "grad_norm": 0.01670764572918415, "learning_rate": 1.1605288155268958e-06, "loss": 0.0001, "num_input_tokens_seen": 188221136, "step": 87275 }, { "epoch": 16.017617911543404, "grad_norm": 0.006844247225672007, "learning_rate": 1.160015918740302e-06, "loss": 0.0, "num_input_tokens_seen": 188231568, "step": 87280 }, { "epoch": 16.018535511102954, "grad_norm": 0.002780299400910735, "learning_rate": 1.1595031204433493e-06, "loss": 0.0001, "num_input_tokens_seen": 188241584, "step": 87285 }, { "epoch": 16.019453110662507, "grad_norm": 2.0283329486846924, "learning_rate": 1.1589904206491898e-06, "loss": 0.0009, "num_input_tokens_seen": 188251536, "step": 87290 }, { "epoch": 16.02037071022206, "grad_norm": 0.0015045939944684505, "learning_rate": 1.1584778193709728e-06, "loss": 0.0, "num_input_tokens_seen": 188262576, "step": 87295 }, { "epoch": 16.02128830978161, "grad_norm": 0.015994848683476448, "learning_rate": 1.1579653166218447e-06, "loss": 0.0, "num_input_tokens_seen": 188273328, "step": 87300 }, { "epoch": 16.022205909341164, "grad_norm": 0.0021223558578640223, "learning_rate": 1.157452912414953e-06, "loss": 0.0, "num_input_tokens_seen": 188284240, "step": 87305 }, { "epoch": 16.023123508900717, "grad_norm": 0.0019376452546566725, "learning_rate": 1.1569406067634386e-06, "loss": 0.0007, "num_input_tokens_seen": 188294672, "step": 87310 }, { "epoch": 16.024041108460267, "grad_norm": 0.002798505127429962, "learning_rate": 1.1564283996804405e-06, "loss": 0.0002, "num_input_tokens_seen": 188306736, "step": 87315 }, { "epoch": 16.02495870801982, "grad_norm": 0.0009496959391981363, "learning_rate": 1.1559162911790978e-06, "loss": 0.0007, "num_input_tokens_seen": 188317712, "step": 87320 }, { "epoch": 16.025876307579374, "grad_norm": 3.0444164276123047, "learning_rate": 1.155404281272544e-06, "loss": 0.0005, "num_input_tokens_seen": 188328528, "step": 87325 }, { "epoch": 16.026793907138924, "grad_norm": 0.011637801304459572, "learning_rate": 1.1548923699739129e-06, "loss": 0.0, "num_input_tokens_seen": 188339632, "step": 87330 }, { "epoch": 16.027711506698477, "grad_norm": 0.028187526389956474, "learning_rate": 1.1543805572963307e-06, "loss": 0.0, "num_input_tokens_seen": 188350352, "step": 87335 }, { "epoch": 16.02862910625803, "grad_norm": 0.005319627933204174, "learning_rate": 1.1538688432529294e-06, "loss": 0.0, "num_input_tokens_seen": 188361392, "step": 87340 }, { "epoch": 16.02954670581758, "grad_norm": 0.010927276685833931, "learning_rate": 1.1533572278568306e-06, "loss": 0.2469, "num_input_tokens_seen": 188371792, "step": 87345 }, { "epoch": 16.030464305377134, "grad_norm": 0.005169369280338287, "learning_rate": 1.1528457111211572e-06, "loss": 0.0002, "num_input_tokens_seen": 188383184, "step": 87350 }, { "epoch": 16.031381904936687, "grad_norm": 1.5022929906845093, "learning_rate": 1.1523342930590276e-06, "loss": 0.0002, "num_input_tokens_seen": 188393904, "step": 87355 }, { "epoch": 16.032299504496237, "grad_norm": 0.0037036335561424494, "learning_rate": 1.1518229736835612e-06, "loss": 0.0001, "num_input_tokens_seen": 188403984, "step": 87360 }, { "epoch": 16.03321710405579, "grad_norm": 0.009239023551344872, "learning_rate": 1.1513117530078715e-06, "loss": 0.0289, "num_input_tokens_seen": 188415088, "step": 87365 }, { "epoch": 16.034134703615344, "grad_norm": 0.022681526839733124, "learning_rate": 1.150800631045071e-06, "loss": 0.0, "num_input_tokens_seen": 188426416, "step": 87370 }, { "epoch": 16.035052303174893, "grad_norm": 0.0027331800665706396, "learning_rate": 1.1502896078082682e-06, "loss": 0.0, "num_input_tokens_seen": 188435792, "step": 87375 }, { "epoch": 16.035969902734447, "grad_norm": 0.00036950086359865963, "learning_rate": 1.1497786833105685e-06, "loss": 0.0001, "num_input_tokens_seen": 188446128, "step": 87380 }, { "epoch": 16.036887502294, "grad_norm": 0.015512439422309399, "learning_rate": 1.1492678575650802e-06, "loss": 0.0, "num_input_tokens_seen": 188457392, "step": 87385 }, { "epoch": 16.03780510185355, "grad_norm": 0.005326480139046907, "learning_rate": 1.1487571305849032e-06, "loss": 0.0, "num_input_tokens_seen": 188466608, "step": 87390 }, { "epoch": 16.038722701413104, "grad_norm": 106.70632934570312, "learning_rate": 1.148246502383137e-06, "loss": 0.0284, "num_input_tokens_seen": 188476208, "step": 87395 }, { "epoch": 16.039640300972657, "grad_norm": 0.0032048921566456556, "learning_rate": 1.1477359729728765e-06, "loss": 0.0001, "num_input_tokens_seen": 188487408, "step": 87400 }, { "epoch": 16.040557900532207, "grad_norm": 0.017200376838445663, "learning_rate": 1.1472255423672196e-06, "loss": 0.0007, "num_input_tokens_seen": 188497520, "step": 87405 }, { "epoch": 16.04147550009176, "grad_norm": 0.020325856283307076, "learning_rate": 1.1467152105792563e-06, "loss": 0.0033, "num_input_tokens_seen": 188508368, "step": 87410 }, { "epoch": 16.042393099651314, "grad_norm": 0.0038941185921430588, "learning_rate": 1.146204977622074e-06, "loss": 0.0, "num_input_tokens_seen": 188520368, "step": 87415 }, { "epoch": 16.043310699210863, "grad_norm": 189.86639404296875, "learning_rate": 1.1456948435087633e-06, "loss": 0.0051, "num_input_tokens_seen": 188530768, "step": 87420 }, { "epoch": 16.044228298770417, "grad_norm": 0.009712209925055504, "learning_rate": 1.1451848082524059e-06, "loss": 0.0, "num_input_tokens_seen": 188543376, "step": 87425 }, { "epoch": 16.04514589832997, "grad_norm": 1.949501872062683, "learning_rate": 1.1446748718660834e-06, "loss": 0.0003, "num_input_tokens_seen": 188553584, "step": 87430 }, { "epoch": 16.04606349788952, "grad_norm": 0.0015173222636803985, "learning_rate": 1.144165034362874e-06, "loss": 0.2525, "num_input_tokens_seen": 188564432, "step": 87435 }, { "epoch": 16.046981097449073, "grad_norm": 0.0005645883502438664, "learning_rate": 1.1436552957558571e-06, "loss": 0.0, "num_input_tokens_seen": 188576016, "step": 87440 }, { "epoch": 16.047898697008627, "grad_norm": 0.0007568061701022089, "learning_rate": 1.1431456560581051e-06, "loss": 0.0, "num_input_tokens_seen": 188587760, "step": 87445 }, { "epoch": 16.048816296568177, "grad_norm": 2.5086264610290527, "learning_rate": 1.1426361152826876e-06, "loss": 0.0001, "num_input_tokens_seen": 188599088, "step": 87450 }, { "epoch": 16.04973389612773, "grad_norm": 0.008102190680801868, "learning_rate": 1.1421266734426773e-06, "loss": 0.0, "num_input_tokens_seen": 188608304, "step": 87455 }, { "epoch": 16.050651495687283, "grad_norm": 0.0005804848042316735, "learning_rate": 1.141617330551138e-06, "loss": 0.0001, "num_input_tokens_seen": 188619312, "step": 87460 }, { "epoch": 16.051569095246833, "grad_norm": 0.003991411533206701, "learning_rate": 1.1411080866211334e-06, "loss": 0.0, "num_input_tokens_seen": 188630576, "step": 87465 }, { "epoch": 16.052486694806387, "grad_norm": 0.0021406873129308224, "learning_rate": 1.140598941665727e-06, "loss": 0.0001, "num_input_tokens_seen": 188641968, "step": 87470 }, { "epoch": 16.05340429436594, "grad_norm": 0.07886632531881332, "learning_rate": 1.140089895697976e-06, "loss": 0.2063, "num_input_tokens_seen": 188652976, "step": 87475 }, { "epoch": 16.05432189392549, "grad_norm": 0.005583553109318018, "learning_rate": 1.1395809487309367e-06, "loss": 0.001, "num_input_tokens_seen": 188662736, "step": 87480 }, { "epoch": 16.055239493485043, "grad_norm": 0.0013963418314233422, "learning_rate": 1.1390721007776616e-06, "loss": 0.0, "num_input_tokens_seen": 188674480, "step": 87485 }, { "epoch": 16.056157093044597, "grad_norm": 17.87179183959961, "learning_rate": 1.1385633518512051e-06, "loss": 0.0003, "num_input_tokens_seen": 188685264, "step": 87490 }, { "epoch": 16.057074692604147, "grad_norm": 0.03290430083870888, "learning_rate": 1.1380547019646137e-06, "loss": 0.0376, "num_input_tokens_seen": 188696336, "step": 87495 }, { "epoch": 16.0579922921637, "grad_norm": 67.18572235107422, "learning_rate": 1.1375461511309322e-06, "loss": 0.0478, "num_input_tokens_seen": 188708272, "step": 87500 }, { "epoch": 16.058909891723253, "grad_norm": 0.11785827577114105, "learning_rate": 1.137037699363207e-06, "loss": 0.0, "num_input_tokens_seen": 188717424, "step": 87505 }, { "epoch": 16.059827491282803, "grad_norm": 40.306053161621094, "learning_rate": 1.1365293466744781e-06, "loss": 0.002, "num_input_tokens_seen": 188728752, "step": 87510 }, { "epoch": 16.060745090842357, "grad_norm": 0.014925467781722546, "learning_rate": 1.1360210930777836e-06, "loss": 0.0, "num_input_tokens_seen": 188738384, "step": 87515 }, { "epoch": 16.06166269040191, "grad_norm": 0.002961863996461034, "learning_rate": 1.135512938586158e-06, "loss": 0.0001, "num_input_tokens_seen": 188747792, "step": 87520 }, { "epoch": 16.06258028996146, "grad_norm": 0.006724996026605368, "learning_rate": 1.135004883212637e-06, "loss": 0.0001, "num_input_tokens_seen": 188758992, "step": 87525 }, { "epoch": 16.063497889521013, "grad_norm": 0.0005112868966534734, "learning_rate": 1.134496926970251e-06, "loss": 0.0887, "num_input_tokens_seen": 188770992, "step": 87530 }, { "epoch": 16.064415489080567, "grad_norm": 0.0020691484678536654, "learning_rate": 1.1339890698720263e-06, "loss": 0.0, "num_input_tokens_seen": 188783120, "step": 87535 }, { "epoch": 16.065333088640116, "grad_norm": 0.0004384573840070516, "learning_rate": 1.1334813119309918e-06, "loss": 0.0, "num_input_tokens_seen": 188793040, "step": 87540 }, { "epoch": 16.06625068819967, "grad_norm": 0.0030063267331570387, "learning_rate": 1.1329736531601687e-06, "loss": 0.0, "num_input_tokens_seen": 188803856, "step": 87545 }, { "epoch": 16.067168287759223, "grad_norm": 0.02265254408121109, "learning_rate": 1.1324660935725772e-06, "loss": 0.0, "num_input_tokens_seen": 188814800, "step": 87550 }, { "epoch": 16.068085887318773, "grad_norm": 0.0072656855918467045, "learning_rate": 1.1319586331812372e-06, "loss": 0.0, "num_input_tokens_seen": 188824752, "step": 87555 }, { "epoch": 16.069003486878326, "grad_norm": 0.0023034089244902134, "learning_rate": 1.1314512719991633e-06, "loss": 0.0, "num_input_tokens_seen": 188836048, "step": 87560 }, { "epoch": 16.06992108643788, "grad_norm": 3.049243927001953, "learning_rate": 1.1309440100393686e-06, "loss": 0.0006, "num_input_tokens_seen": 188846832, "step": 87565 }, { "epoch": 16.07083868599743, "grad_norm": 0.002754022367298603, "learning_rate": 1.1304368473148641e-06, "loss": 0.0, "num_input_tokens_seen": 188855888, "step": 87570 }, { "epoch": 16.071756285556983, "grad_norm": 0.0015948922373354435, "learning_rate": 1.1299297838386553e-06, "loss": 0.0823, "num_input_tokens_seen": 188867408, "step": 87575 }, { "epoch": 16.072673885116536, "grad_norm": 120.33334350585938, "learning_rate": 1.129422819623751e-06, "loss": 0.0883, "num_input_tokens_seen": 188877104, "step": 87580 }, { "epoch": 16.073591484676086, "grad_norm": 1.5944019556045532, "learning_rate": 1.1289159546831524e-06, "loss": 0.0001, "num_input_tokens_seen": 188887504, "step": 87585 }, { "epoch": 16.07450908423564, "grad_norm": 0.0009462162852287292, "learning_rate": 1.1284091890298599e-06, "loss": 0.0, "num_input_tokens_seen": 188897296, "step": 87590 }, { "epoch": 16.075426683795193, "grad_norm": 0.0005161752924323082, "learning_rate": 1.1279025226768713e-06, "loss": 0.0, "num_input_tokens_seen": 188908720, "step": 87595 }, { "epoch": 16.076344283354743, "grad_norm": 0.0687599927186966, "learning_rate": 1.1273959556371806e-06, "loss": 0.0041, "num_input_tokens_seen": 188919792, "step": 87600 }, { "epoch": 16.077261882914296, "grad_norm": 0.001604951568879187, "learning_rate": 1.1268894879237829e-06, "loss": 0.0, "num_input_tokens_seen": 188931056, "step": 87605 }, { "epoch": 16.07817948247385, "grad_norm": 0.0009719418012537062, "learning_rate": 1.1263831195496672e-06, "loss": 0.0, "num_input_tokens_seen": 188941296, "step": 87610 }, { "epoch": 16.0790970820334, "grad_norm": 0.09097570180892944, "learning_rate": 1.1258768505278205e-06, "loss": 0.0001, "num_input_tokens_seen": 188951888, "step": 87615 }, { "epoch": 16.080014681592953, "grad_norm": 0.020417846739292145, "learning_rate": 1.1253706808712272e-06, "loss": 0.0003, "num_input_tokens_seen": 188964432, "step": 87620 }, { "epoch": 16.080932281152506, "grad_norm": 4.0399322509765625, "learning_rate": 1.1248646105928724e-06, "loss": 0.0004, "num_input_tokens_seen": 188975792, "step": 87625 }, { "epoch": 16.081849880712056, "grad_norm": 0.011081917211413383, "learning_rate": 1.1243586397057343e-06, "loss": 0.0003, "num_input_tokens_seen": 188986224, "step": 87630 }, { "epoch": 16.08276748027161, "grad_norm": 0.003728324081748724, "learning_rate": 1.123852768222789e-06, "loss": 0.0944, "num_input_tokens_seen": 188995600, "step": 87635 }, { "epoch": 16.083685079831163, "grad_norm": 0.0011912016198039055, "learning_rate": 1.1233469961570138e-06, "loss": 0.0478, "num_input_tokens_seen": 189006192, "step": 87640 }, { "epoch": 16.084602679390713, "grad_norm": 0.000431322812801227, "learning_rate": 1.1228413235213799e-06, "loss": 0.0001, "num_input_tokens_seen": 189015728, "step": 87645 }, { "epoch": 16.085520278950266, "grad_norm": 0.0022126745898276567, "learning_rate": 1.1223357503288573e-06, "loss": 0.0, "num_input_tokens_seen": 189025808, "step": 87650 }, { "epoch": 16.08643787850982, "grad_norm": 0.17359668016433716, "learning_rate": 1.121830276592411e-06, "loss": 0.1962, "num_input_tokens_seen": 189036496, "step": 87655 }, { "epoch": 16.08735547806937, "grad_norm": 0.0014514877693727612, "learning_rate": 1.1213249023250094e-06, "loss": 0.0002, "num_input_tokens_seen": 189046128, "step": 87660 }, { "epoch": 16.088273077628923, "grad_norm": 3.9067955017089844, "learning_rate": 1.1208196275396128e-06, "loss": 0.0015, "num_input_tokens_seen": 189057616, "step": 87665 }, { "epoch": 16.089190677188476, "grad_norm": 0.000662347418256104, "learning_rate": 1.1203144522491789e-06, "loss": 0.1128, "num_input_tokens_seen": 189069296, "step": 87670 }, { "epoch": 16.090108276748026, "grad_norm": 0.006712658330798149, "learning_rate": 1.1198093764666673e-06, "loss": 0.0002, "num_input_tokens_seen": 189080080, "step": 87675 }, { "epoch": 16.09102587630758, "grad_norm": 0.008231474086642265, "learning_rate": 1.1193044002050318e-06, "loss": 0.0002, "num_input_tokens_seen": 189091440, "step": 87680 }, { "epoch": 16.091943475867133, "grad_norm": 0.003178352490067482, "learning_rate": 1.1187995234772224e-06, "loss": 0.0, "num_input_tokens_seen": 189102224, "step": 87685 }, { "epoch": 16.092861075426683, "grad_norm": 0.015390275977551937, "learning_rate": 1.1182947462961913e-06, "loss": 0.0144, "num_input_tokens_seen": 189113936, "step": 87690 }, { "epoch": 16.093778674986236, "grad_norm": 0.006508250720798969, "learning_rate": 1.1177900686748844e-06, "loss": 0.0005, "num_input_tokens_seen": 189124560, "step": 87695 }, { "epoch": 16.09469627454579, "grad_norm": 0.41671013832092285, "learning_rate": 1.1172854906262449e-06, "loss": 0.0001, "num_input_tokens_seen": 189135984, "step": 87700 }, { "epoch": 16.09561387410534, "grad_norm": 0.0008483942947350442, "learning_rate": 1.1167810121632133e-06, "loss": 0.0, "num_input_tokens_seen": 189147632, "step": 87705 }, { "epoch": 16.096531473664893, "grad_norm": 105.01607513427734, "learning_rate": 1.1162766332987318e-06, "loss": 0.0533, "num_input_tokens_seen": 189158832, "step": 87710 }, { "epoch": 16.097449073224446, "grad_norm": 0.018265752121806145, "learning_rate": 1.1157723540457354e-06, "loss": 0.0, "num_input_tokens_seen": 189168816, "step": 87715 }, { "epoch": 16.098366672783996, "grad_norm": 0.332711398601532, "learning_rate": 1.1152681744171573e-06, "loss": 0.0001, "num_input_tokens_seen": 189178928, "step": 87720 }, { "epoch": 16.09928427234355, "grad_norm": 0.0033210383262485266, "learning_rate": 1.1147640944259308e-06, "loss": 0.0, "num_input_tokens_seen": 189189872, "step": 87725 }, { "epoch": 16.100201871903103, "grad_norm": 0.0016299926210194826, "learning_rate": 1.1142601140849835e-06, "loss": 0.0002, "num_input_tokens_seen": 189200304, "step": 87730 }, { "epoch": 16.101119471462653, "grad_norm": 0.0075799692422151566, "learning_rate": 1.1137562334072405e-06, "loss": 0.0002, "num_input_tokens_seen": 189212976, "step": 87735 }, { "epoch": 16.102037071022206, "grad_norm": 0.008467462845146656, "learning_rate": 1.1132524524056287e-06, "loss": 0.0, "num_input_tokens_seen": 189222928, "step": 87740 }, { "epoch": 16.10295467058176, "grad_norm": 0.0018621631897985935, "learning_rate": 1.1127487710930673e-06, "loss": 0.0002, "num_input_tokens_seen": 189233232, "step": 87745 }, { "epoch": 16.10387227014131, "grad_norm": 0.06465279310941696, "learning_rate": 1.1122451894824753e-06, "loss": 0.0, "num_input_tokens_seen": 189243568, "step": 87750 }, { "epoch": 16.104789869700863, "grad_norm": 0.008302772417664528, "learning_rate": 1.1117417075867675e-06, "loss": 0.0289, "num_input_tokens_seen": 189254864, "step": 87755 }, { "epoch": 16.105707469260416, "grad_norm": 0.0014153153169900179, "learning_rate": 1.1112383254188598e-06, "loss": 0.0001, "num_input_tokens_seen": 189265616, "step": 87760 }, { "epoch": 16.106625068819966, "grad_norm": 0.0005111657083034515, "learning_rate": 1.110735042991662e-06, "loss": 0.0, "num_input_tokens_seen": 189276400, "step": 87765 }, { "epoch": 16.10754266837952, "grad_norm": 0.0011268308153375983, "learning_rate": 1.1102318603180811e-06, "loss": 0.0232, "num_input_tokens_seen": 189286544, "step": 87770 }, { "epoch": 16.108460267939073, "grad_norm": 0.0293971486389637, "learning_rate": 1.109728777411026e-06, "loss": 0.0, "num_input_tokens_seen": 189298288, "step": 87775 }, { "epoch": 16.109377867498623, "grad_norm": 0.0026091765612363815, "learning_rate": 1.1092257942833985e-06, "loss": 0.0, "num_input_tokens_seen": 189309936, "step": 87780 }, { "epoch": 16.110295467058176, "grad_norm": 2.672656536102295, "learning_rate": 1.108722910948099e-06, "loss": 0.0016, "num_input_tokens_seen": 189320784, "step": 87785 }, { "epoch": 16.11121306661773, "grad_norm": 0.0011773889418691397, "learning_rate": 1.1082201274180259e-06, "loss": 0.0287, "num_input_tokens_seen": 189329840, "step": 87790 }, { "epoch": 16.11213066617728, "grad_norm": 0.008465684950351715, "learning_rate": 1.1077174437060734e-06, "loss": 0.001, "num_input_tokens_seen": 189341360, "step": 87795 }, { "epoch": 16.113048265736833, "grad_norm": 0.002938999328762293, "learning_rate": 1.1072148598251375e-06, "loss": 0.004, "num_input_tokens_seen": 189352912, "step": 87800 }, { "epoch": 16.113965865296386, "grad_norm": 0.006480474956333637, "learning_rate": 1.106712375788107e-06, "loss": 0.0, "num_input_tokens_seen": 189363920, "step": 87805 }, { "epoch": 16.114883464855936, "grad_norm": 0.004770706407725811, "learning_rate": 1.1062099916078705e-06, "loss": 0.0001, "num_input_tokens_seen": 189374448, "step": 87810 }, { "epoch": 16.11580106441549, "grad_norm": 0.02485056035220623, "learning_rate": 1.1057077072973121e-06, "loss": 0.0001, "num_input_tokens_seen": 189385872, "step": 87815 }, { "epoch": 16.116718663975043, "grad_norm": 0.005078582093119621, "learning_rate": 1.1052055228693147e-06, "loss": 0.0, "num_input_tokens_seen": 189395952, "step": 87820 }, { "epoch": 16.117636263534592, "grad_norm": 0.16511932015419006, "learning_rate": 1.1047034383367606e-06, "loss": 0.0001, "num_input_tokens_seen": 189407248, "step": 87825 }, { "epoch": 16.118553863094146, "grad_norm": 0.02611834742128849, "learning_rate": 1.1042014537125256e-06, "loss": 0.0, "num_input_tokens_seen": 189416624, "step": 87830 }, { "epoch": 16.1194714626537, "grad_norm": 0.20937691628932953, "learning_rate": 1.1036995690094859e-06, "loss": 0.1097, "num_input_tokens_seen": 189427408, "step": 87835 }, { "epoch": 16.12038906221325, "grad_norm": 0.006458617746829987, "learning_rate": 1.1031977842405117e-06, "loss": 0.0, "num_input_tokens_seen": 189438416, "step": 87840 }, { "epoch": 16.121306661772802, "grad_norm": 0.040667545050382614, "learning_rate": 1.1026960994184766e-06, "loss": 0.1693, "num_input_tokens_seen": 189448784, "step": 87845 }, { "epoch": 16.122224261332356, "grad_norm": 0.0010968743590638041, "learning_rate": 1.1021945145562463e-06, "loss": 0.0008, "num_input_tokens_seen": 189459920, "step": 87850 }, { "epoch": 16.123141860891906, "grad_norm": 223.5819091796875, "learning_rate": 1.101693029666684e-06, "loss": 0.0645, "num_input_tokens_seen": 189470960, "step": 87855 }, { "epoch": 16.12405946045146, "grad_norm": 0.0010098045459017158, "learning_rate": 1.1011916447626548e-06, "loss": 0.0, "num_input_tokens_seen": 189481040, "step": 87860 }, { "epoch": 16.124977060011012, "grad_norm": 0.0040677618235349655, "learning_rate": 1.100690359857018e-06, "loss": 0.0032, "num_input_tokens_seen": 189490320, "step": 87865 }, { "epoch": 16.125894659570562, "grad_norm": 0.006535890977829695, "learning_rate": 1.1001891749626281e-06, "loss": 0.0001, "num_input_tokens_seen": 189501552, "step": 87870 }, { "epoch": 16.126812259130116, "grad_norm": 0.04784034192562103, "learning_rate": 1.0996880900923433e-06, "loss": 0.0, "num_input_tokens_seen": 189512752, "step": 87875 }, { "epoch": 16.12772985868967, "grad_norm": 0.011534005403518677, "learning_rate": 1.0991871052590141e-06, "loss": 0.1354, "num_input_tokens_seen": 189523440, "step": 87880 }, { "epoch": 16.12864745824922, "grad_norm": 0.002364290179684758, "learning_rate": 1.09868622047549e-06, "loss": 0.0, "num_input_tokens_seen": 189534096, "step": 87885 }, { "epoch": 16.129565057808772, "grad_norm": 0.0018865821184590459, "learning_rate": 1.0981854357546163e-06, "loss": 0.0001, "num_input_tokens_seen": 189545328, "step": 87890 }, { "epoch": 16.130482657368326, "grad_norm": 0.0035172065254300833, "learning_rate": 1.0976847511092403e-06, "loss": 0.0278, "num_input_tokens_seen": 189557744, "step": 87895 }, { "epoch": 16.131400256927876, "grad_norm": 0.00646352069452405, "learning_rate": 1.0971841665522026e-06, "loss": 0.0, "num_input_tokens_seen": 189567504, "step": 87900 }, { "epoch": 16.13231785648743, "grad_norm": 2.2234838008880615, "learning_rate": 1.0966836820963412e-06, "loss": 0.0002, "num_input_tokens_seen": 189578736, "step": 87905 }, { "epoch": 16.133235456046982, "grad_norm": 0.0027292997110635042, "learning_rate": 1.0961832977544944e-06, "loss": 0.0001, "num_input_tokens_seen": 189589968, "step": 87910 }, { "epoch": 16.134153055606532, "grad_norm": 0.015127342194318771, "learning_rate": 1.0956830135394959e-06, "loss": 0.0, "num_input_tokens_seen": 189600496, "step": 87915 }, { "epoch": 16.135070655166086, "grad_norm": 0.12726999819278717, "learning_rate": 1.0951828294641753e-06, "loss": 0.0001, "num_input_tokens_seen": 189611568, "step": 87920 }, { "epoch": 16.13598825472564, "grad_norm": 0.0015963830519467592, "learning_rate": 1.094682745541365e-06, "loss": 0.0, "num_input_tokens_seen": 189622960, "step": 87925 }, { "epoch": 16.13690585428519, "grad_norm": 0.002297128550708294, "learning_rate": 1.0941827617838897e-06, "loss": 0.0006, "num_input_tokens_seen": 189631920, "step": 87930 }, { "epoch": 16.137823453844742, "grad_norm": 0.0032055084593594074, "learning_rate": 1.0936828782045728e-06, "loss": 0.0284, "num_input_tokens_seen": 189642832, "step": 87935 }, { "epoch": 16.138741053404296, "grad_norm": 0.0007868026150390506, "learning_rate": 1.0931830948162342e-06, "loss": 0.0003, "num_input_tokens_seen": 189653936, "step": 87940 }, { "epoch": 16.139658652963845, "grad_norm": 0.0016748254420235753, "learning_rate": 1.0926834116316958e-06, "loss": 0.0008, "num_input_tokens_seen": 189665872, "step": 87945 }, { "epoch": 16.1405762525234, "grad_norm": 0.005573681090027094, "learning_rate": 1.0921838286637726e-06, "loss": 0.0, "num_input_tokens_seen": 189674928, "step": 87950 }, { "epoch": 16.141493852082952, "grad_norm": 0.011881189420819283, "learning_rate": 1.0916843459252756e-06, "loss": 0.0001, "num_input_tokens_seen": 189686960, "step": 87955 }, { "epoch": 16.142411451642502, "grad_norm": 38.930458068847656, "learning_rate": 1.0911849634290194e-06, "loss": 0.1166, "num_input_tokens_seen": 189698928, "step": 87960 }, { "epoch": 16.143329051202056, "grad_norm": 0.020572485402226448, "learning_rate": 1.0906856811878107e-06, "loss": 0.0, "num_input_tokens_seen": 189709904, "step": 87965 }, { "epoch": 16.14424665076161, "grad_norm": 0.0027663917280733585, "learning_rate": 1.0901864992144556e-06, "loss": 0.0002, "num_input_tokens_seen": 189721232, "step": 87970 }, { "epoch": 16.14516425032116, "grad_norm": 0.00186704914085567, "learning_rate": 1.089687417521756e-06, "loss": 0.0063, "num_input_tokens_seen": 189731984, "step": 87975 }, { "epoch": 16.146081849880712, "grad_norm": 0.1244499534368515, "learning_rate": 1.0891884361225147e-06, "loss": 0.0, "num_input_tokens_seen": 189743216, "step": 87980 }, { "epoch": 16.146999449440266, "grad_norm": 0.0015753270126879215, "learning_rate": 1.0886895550295284e-06, "loss": 0.0002, "num_input_tokens_seen": 189754544, "step": 87985 }, { "epoch": 16.147917048999815, "grad_norm": 0.020595630630850792, "learning_rate": 1.088190774255592e-06, "loss": 0.0145, "num_input_tokens_seen": 189765136, "step": 87990 }, { "epoch": 16.14883464855937, "grad_norm": 0.014679334126412868, "learning_rate": 1.087692093813501e-06, "loss": 0.0001, "num_input_tokens_seen": 189776528, "step": 87995 }, { "epoch": 16.149752248118922, "grad_norm": 0.033268578350543976, "learning_rate": 1.0871935137160444e-06, "loss": 0.0, "num_input_tokens_seen": 189786768, "step": 88000 }, { "epoch": 16.150669847678472, "grad_norm": 0.01533389650285244, "learning_rate": 1.0866950339760096e-06, "loss": 0.0, "num_input_tokens_seen": 189796528, "step": 88005 }, { "epoch": 16.151587447238025, "grad_norm": 0.00466945581138134, "learning_rate": 1.0861966546061819e-06, "loss": 0.0, "num_input_tokens_seen": 189807536, "step": 88010 }, { "epoch": 16.15250504679758, "grad_norm": 0.0004858967731706798, "learning_rate": 1.0856983756193435e-06, "loss": 0.0001, "num_input_tokens_seen": 189817648, "step": 88015 }, { "epoch": 16.15342264635713, "grad_norm": 0.38832226395606995, "learning_rate": 1.085200197028276e-06, "loss": 0.0001, "num_input_tokens_seen": 189828816, "step": 88020 }, { "epoch": 16.154340245916682, "grad_norm": 0.0036044323351234198, "learning_rate": 1.0847021188457563e-06, "loss": 0.0, "num_input_tokens_seen": 189839568, "step": 88025 }, { "epoch": 16.155257845476235, "grad_norm": 0.0010401044273748994, "learning_rate": 1.084204141084559e-06, "loss": 0.0, "num_input_tokens_seen": 189850992, "step": 88030 }, { "epoch": 16.156175445035785, "grad_norm": 0.0022900751791894436, "learning_rate": 1.0837062637574563e-06, "loss": 0.3086, "num_input_tokens_seen": 189861296, "step": 88035 }, { "epoch": 16.15709304459534, "grad_norm": 0.029377823695540428, "learning_rate": 1.083208486877217e-06, "loss": 0.0, "num_input_tokens_seen": 189871216, "step": 88040 }, { "epoch": 16.158010644154892, "grad_norm": 71.72298431396484, "learning_rate": 1.082710810456611e-06, "loss": 0.1719, "num_input_tokens_seen": 189881712, "step": 88045 }, { "epoch": 16.158928243714442, "grad_norm": 233.1261749267578, "learning_rate": 1.0822132345084014e-06, "loss": 0.0377, "num_input_tokens_seen": 189891472, "step": 88050 }, { "epoch": 16.159845843273995, "grad_norm": 0.019358810037374496, "learning_rate": 1.0817157590453487e-06, "loss": 0.0, "num_input_tokens_seen": 189902448, "step": 88055 }, { "epoch": 16.16076344283355, "grad_norm": 0.4358970820903778, "learning_rate": 1.0812183840802154e-06, "loss": 0.0001, "num_input_tokens_seen": 189912944, "step": 88060 }, { "epoch": 16.1616810423931, "grad_norm": 0.07460204511880875, "learning_rate": 1.0807211096257576e-06, "loss": 0.0001, "num_input_tokens_seen": 189923440, "step": 88065 }, { "epoch": 16.162598641952652, "grad_norm": 0.0013161711394786835, "learning_rate": 1.0802239356947285e-06, "loss": 0.0001, "num_input_tokens_seen": 189934800, "step": 88070 }, { "epoch": 16.163516241512205, "grad_norm": 0.0009792339988052845, "learning_rate": 1.0797268622998791e-06, "loss": 0.0, "num_input_tokens_seen": 189945808, "step": 88075 }, { "epoch": 16.164433841071755, "grad_norm": 0.00850234366953373, "learning_rate": 1.079229889453961e-06, "loss": 0.0, "num_input_tokens_seen": 189956688, "step": 88080 }, { "epoch": 16.16535144063131, "grad_norm": 0.019327254965901375, "learning_rate": 1.0787330171697197e-06, "loss": 0.0, "num_input_tokens_seen": 189967568, "step": 88085 }, { "epoch": 16.166269040190862, "grad_norm": 485.8197937011719, "learning_rate": 1.0782362454598978e-06, "loss": 0.2197, "num_input_tokens_seen": 189978288, "step": 88090 }, { "epoch": 16.167186639750412, "grad_norm": 0.0010130178416147828, "learning_rate": 1.0777395743372392e-06, "loss": 0.0, "num_input_tokens_seen": 189989264, "step": 88095 }, { "epoch": 16.168104239309965, "grad_norm": 23.816011428833008, "learning_rate": 1.0772430038144822e-06, "loss": 0.0012, "num_input_tokens_seen": 189998192, "step": 88100 }, { "epoch": 16.16902183886952, "grad_norm": 0.023587552830576897, "learning_rate": 1.0767465339043615e-06, "loss": 0.0, "num_input_tokens_seen": 190008016, "step": 88105 }, { "epoch": 16.16993943842907, "grad_norm": 0.0005692915292456746, "learning_rate": 1.076250164619611e-06, "loss": 0.0883, "num_input_tokens_seen": 190019376, "step": 88110 }, { "epoch": 16.170857037988622, "grad_norm": 0.19284848868846893, "learning_rate": 1.0757538959729635e-06, "loss": 0.0001, "num_input_tokens_seen": 190030352, "step": 88115 }, { "epoch": 16.171774637548175, "grad_norm": 0.016660498455166817, "learning_rate": 1.075257727977147e-06, "loss": 0.0, "num_input_tokens_seen": 190040176, "step": 88120 }, { "epoch": 16.172692237107725, "grad_norm": 0.008104418404400349, "learning_rate": 1.0747616606448853e-06, "loss": 0.0, "num_input_tokens_seen": 190051408, "step": 88125 }, { "epoch": 16.17360983666728, "grad_norm": 0.0015568817034363747, "learning_rate": 1.0742656939889046e-06, "loss": 0.0, "num_input_tokens_seen": 190061744, "step": 88130 }, { "epoch": 16.174527436226832, "grad_norm": 0.0026150429621338844, "learning_rate": 1.073769828021925e-06, "loss": 0.0, "num_input_tokens_seen": 190072048, "step": 88135 }, { "epoch": 16.17544503578638, "grad_norm": 0.1841852217912674, "learning_rate": 1.0732740627566623e-06, "loss": 0.0001, "num_input_tokens_seen": 190083504, "step": 88140 }, { "epoch": 16.176362635345935, "grad_norm": 0.0023789622355252504, "learning_rate": 1.072778398205836e-06, "loss": 0.0, "num_input_tokens_seen": 190094928, "step": 88145 }, { "epoch": 16.17728023490549, "grad_norm": 0.061929844319820404, "learning_rate": 1.0722828343821568e-06, "loss": 0.0119, "num_input_tokens_seen": 190106256, "step": 88150 }, { "epoch": 16.17819783446504, "grad_norm": 0.08492903411388397, "learning_rate": 1.0717873712983357e-06, "loss": 0.0001, "num_input_tokens_seen": 190117392, "step": 88155 }, { "epoch": 16.17911543402459, "grad_norm": 0.0333840511739254, "learning_rate": 1.0712920089670787e-06, "loss": 0.0001, "num_input_tokens_seen": 190127888, "step": 88160 }, { "epoch": 16.180033033584145, "grad_norm": 4.86981201171875, "learning_rate": 1.0707967474010937e-06, "loss": 0.0005, "num_input_tokens_seen": 190138512, "step": 88165 }, { "epoch": 16.180950633143695, "grad_norm": 6.325540542602539, "learning_rate": 1.0703015866130833e-06, "loss": 0.0025, "num_input_tokens_seen": 190150384, "step": 88170 }, { "epoch": 16.18186823270325, "grad_norm": 0.1896599680185318, "learning_rate": 1.0698065266157447e-06, "loss": 0.0001, "num_input_tokens_seen": 190160240, "step": 88175 }, { "epoch": 16.1827858322628, "grad_norm": 0.004378531128168106, "learning_rate": 1.069311567421779e-06, "loss": 0.0, "num_input_tokens_seen": 190170800, "step": 88180 }, { "epoch": 16.18370343182235, "grad_norm": 0.008454084396362305, "learning_rate": 1.06881670904388e-06, "loss": 0.0001, "num_input_tokens_seen": 190182224, "step": 88185 }, { "epoch": 16.184621031381905, "grad_norm": 0.021806800737977028, "learning_rate": 1.0683219514947379e-06, "loss": 0.0001, "num_input_tokens_seen": 190191664, "step": 88190 }, { "epoch": 16.18553863094146, "grad_norm": 0.002505839802324772, "learning_rate": 1.0678272947870455e-06, "loss": 0.0, "num_input_tokens_seen": 190202000, "step": 88195 }, { "epoch": 16.18645623050101, "grad_norm": 2.123443841934204, "learning_rate": 1.0673327389334886e-06, "loss": 0.0001, "num_input_tokens_seen": 190213744, "step": 88200 }, { "epoch": 16.18737383006056, "grad_norm": 0.0030744411051273346, "learning_rate": 1.0668382839467522e-06, "loss": 0.0002, "num_input_tokens_seen": 190224432, "step": 88205 }, { "epoch": 16.188291429620115, "grad_norm": 0.0010146782733500004, "learning_rate": 1.0663439298395162e-06, "loss": 0.0, "num_input_tokens_seen": 190236144, "step": 88210 }, { "epoch": 16.189209029179665, "grad_norm": 0.0011358902556821704, "learning_rate": 1.0658496766244636e-06, "loss": 0.0001, "num_input_tokens_seen": 190247760, "step": 88215 }, { "epoch": 16.19012662873922, "grad_norm": 0.002656973199918866, "learning_rate": 1.0653555243142694e-06, "loss": 0.0, "num_input_tokens_seen": 190257680, "step": 88220 }, { "epoch": 16.19104422829877, "grad_norm": 0.0010608406737446785, "learning_rate": 1.0648614729216072e-06, "loss": 0.0002, "num_input_tokens_seen": 190268816, "step": 88225 }, { "epoch": 16.19196182785832, "grad_norm": 0.4076545238494873, "learning_rate": 1.06436752245915e-06, "loss": 0.0001, "num_input_tokens_seen": 190278800, "step": 88230 }, { "epoch": 16.192879427417875, "grad_norm": 0.030979877337813377, "learning_rate": 1.063873672939566e-06, "loss": 0.0, "num_input_tokens_seen": 190288400, "step": 88235 }, { "epoch": 16.19379702697743, "grad_norm": 56.56578826904297, "learning_rate": 1.0633799243755199e-06, "loss": 0.0703, "num_input_tokens_seen": 190299408, "step": 88240 }, { "epoch": 16.194714626536978, "grad_norm": 0.0006495286943390965, "learning_rate": 1.0628862767796799e-06, "loss": 0.0086, "num_input_tokens_seen": 190309360, "step": 88245 }, { "epoch": 16.19563222609653, "grad_norm": 103.7862777709961, "learning_rate": 1.0623927301647042e-06, "loss": 0.0588, "num_input_tokens_seen": 190319408, "step": 88250 }, { "epoch": 16.196549825656085, "grad_norm": 0.0035263756290078163, "learning_rate": 1.0618992845432525e-06, "loss": 0.0, "num_input_tokens_seen": 190330544, "step": 88255 }, { "epoch": 16.197467425215635, "grad_norm": 0.001786565757356584, "learning_rate": 1.0614059399279792e-06, "loss": 0.2008, "num_input_tokens_seen": 190340656, "step": 88260 }, { "epoch": 16.198385024775188, "grad_norm": 0.0026011974550783634, "learning_rate": 1.0609126963315407e-06, "loss": 0.0, "num_input_tokens_seen": 190350992, "step": 88265 }, { "epoch": 16.19930262433474, "grad_norm": 0.009673390537500381, "learning_rate": 1.0604195537665861e-06, "loss": 0.0, "num_input_tokens_seen": 190362000, "step": 88270 }, { "epoch": 16.20022022389429, "grad_norm": 0.0021870615892112255, "learning_rate": 1.0599265122457637e-06, "loss": 0.0, "num_input_tokens_seen": 190372752, "step": 88275 }, { "epoch": 16.201137823453845, "grad_norm": 0.012604143470525742, "learning_rate": 1.0594335717817207e-06, "loss": 0.0001, "num_input_tokens_seen": 190383376, "step": 88280 }, { "epoch": 16.202055423013398, "grad_norm": 0.0490112341940403, "learning_rate": 1.0589407323870988e-06, "loss": 0.0, "num_input_tokens_seen": 190394512, "step": 88285 }, { "epoch": 16.202973022572948, "grad_norm": 0.0015835905214771628, "learning_rate": 1.058447994074539e-06, "loss": 0.0588, "num_input_tokens_seen": 190405296, "step": 88290 }, { "epoch": 16.2038906221325, "grad_norm": 0.01584267057478428, "learning_rate": 1.0579553568566787e-06, "loss": 0.0618, "num_input_tokens_seen": 190416848, "step": 88295 }, { "epoch": 16.204808221692055, "grad_norm": 0.001646780758164823, "learning_rate": 1.0574628207461546e-06, "loss": 0.0, "num_input_tokens_seen": 190427856, "step": 88300 }, { "epoch": 16.205725821251605, "grad_norm": 0.0029300032183527946, "learning_rate": 1.0569703857555992e-06, "loss": 0.0, "num_input_tokens_seen": 190439472, "step": 88305 }, { "epoch": 16.206643420811158, "grad_norm": 0.001295341644436121, "learning_rate": 1.0564780518976403e-06, "loss": 0.1438, "num_input_tokens_seen": 190450768, "step": 88310 }, { "epoch": 16.20756102037071, "grad_norm": 0.006055863574147224, "learning_rate": 1.0559858191849092e-06, "loss": 0.0, "num_input_tokens_seen": 190460752, "step": 88315 }, { "epoch": 16.20847861993026, "grad_norm": 0.0023765687365084887, "learning_rate": 1.0554936876300292e-06, "loss": 0.0001, "num_input_tokens_seen": 190471696, "step": 88320 }, { "epoch": 16.209396219489815, "grad_norm": 0.10329259932041168, "learning_rate": 1.0550016572456212e-06, "loss": 0.008, "num_input_tokens_seen": 190482576, "step": 88325 }, { "epoch": 16.210313819049368, "grad_norm": 0.0024878885596990585, "learning_rate": 1.0545097280443078e-06, "loss": 0.0, "num_input_tokens_seen": 190493872, "step": 88330 }, { "epoch": 16.211231418608918, "grad_norm": 0.0071829017251729965, "learning_rate": 1.0540179000387053e-06, "loss": 0.0001, "num_input_tokens_seen": 190504560, "step": 88335 }, { "epoch": 16.21214901816847, "grad_norm": 0.01514006033539772, "learning_rate": 1.0535261732414276e-06, "loss": 0.0023, "num_input_tokens_seen": 190515664, "step": 88340 }, { "epoch": 16.213066617728025, "grad_norm": 0.003701993729919195, "learning_rate": 1.053034547665086e-06, "loss": 0.1176, "num_input_tokens_seen": 190526864, "step": 88345 }, { "epoch": 16.213984217287575, "grad_norm": 0.021163837984204292, "learning_rate": 1.0525430233222922e-06, "loss": 0.0, "num_input_tokens_seen": 190537328, "step": 88350 }, { "epoch": 16.214901816847128, "grad_norm": 0.060209304094314575, "learning_rate": 1.052051600225652e-06, "loss": 0.0004, "num_input_tokens_seen": 190548304, "step": 88355 }, { "epoch": 16.21581941640668, "grad_norm": 0.0009058327414095402, "learning_rate": 1.0515602783877676e-06, "loss": 0.0002, "num_input_tokens_seen": 190558000, "step": 88360 }, { "epoch": 16.21673701596623, "grad_norm": 0.014865292236208916, "learning_rate": 1.0510690578212447e-06, "loss": 0.0823, "num_input_tokens_seen": 190568944, "step": 88365 }, { "epoch": 16.217654615525785, "grad_norm": 0.01541051920503378, "learning_rate": 1.0505779385386795e-06, "loss": 0.0001, "num_input_tokens_seen": 190578864, "step": 88370 }, { "epoch": 16.218572215085338, "grad_norm": 526.0963134765625, "learning_rate": 1.0500869205526681e-06, "loss": 0.0157, "num_input_tokens_seen": 190590896, "step": 88375 }, { "epoch": 16.219489814644888, "grad_norm": 0.021478857845067978, "learning_rate": 1.0495960038758063e-06, "loss": 0.1377, "num_input_tokens_seen": 190602320, "step": 88380 }, { "epoch": 16.22040741420444, "grad_norm": 0.14592795073986053, "learning_rate": 1.049105188520685e-06, "loss": 0.0002, "num_input_tokens_seen": 190613488, "step": 88385 }, { "epoch": 16.221325013763995, "grad_norm": 0.019216598942875862, "learning_rate": 1.0486144744998922e-06, "loss": 0.0002, "num_input_tokens_seen": 190624368, "step": 88390 }, { "epoch": 16.222242613323544, "grad_norm": 0.00590283889323473, "learning_rate": 1.0481238618260126e-06, "loss": 0.0001, "num_input_tokens_seen": 190635120, "step": 88395 }, { "epoch": 16.223160212883098, "grad_norm": 0.007071036379784346, "learning_rate": 1.047633350511632e-06, "loss": 0.0026, "num_input_tokens_seen": 190645872, "step": 88400 }, { "epoch": 16.22407781244265, "grad_norm": 0.0010051324497908354, "learning_rate": 1.0471429405693307e-06, "loss": 0.0, "num_input_tokens_seen": 190656336, "step": 88405 }, { "epoch": 16.2249954120022, "grad_norm": 0.0060779317282140255, "learning_rate": 1.0466526320116854e-06, "loss": 0.0329, "num_input_tokens_seen": 190667088, "step": 88410 }, { "epoch": 16.225913011561754, "grad_norm": 0.009873231872916222, "learning_rate": 1.0461624248512741e-06, "loss": 0.0001, "num_input_tokens_seen": 190677168, "step": 88415 }, { "epoch": 16.226830611121308, "grad_norm": 0.009424817748367786, "learning_rate": 1.045672319100669e-06, "loss": 0.0001, "num_input_tokens_seen": 190688368, "step": 88420 }, { "epoch": 16.227748210680858, "grad_norm": 0.000729814637452364, "learning_rate": 1.04518231477244e-06, "loss": 0.0001, "num_input_tokens_seen": 190699856, "step": 88425 }, { "epoch": 16.22866581024041, "grad_norm": 0.009127145633101463, "learning_rate": 1.0446924118791552e-06, "loss": 0.1693, "num_input_tokens_seen": 190710832, "step": 88430 }, { "epoch": 16.229583409799964, "grad_norm": 225.94189453125, "learning_rate": 1.0442026104333785e-06, "loss": 0.0144, "num_input_tokens_seen": 190721072, "step": 88435 }, { "epoch": 16.230501009359514, "grad_norm": 0.005038998555392027, "learning_rate": 1.0437129104476756e-06, "loss": 0.0945, "num_input_tokens_seen": 190731408, "step": 88440 }, { "epoch": 16.231418608919068, "grad_norm": 0.009598532691597939, "learning_rate": 1.0432233119346047e-06, "loss": 0.0001, "num_input_tokens_seen": 190742288, "step": 88445 }, { "epoch": 16.23233620847862, "grad_norm": 0.02295186184346676, "learning_rate": 1.042733814906723e-06, "loss": 0.0004, "num_input_tokens_seen": 190752560, "step": 88450 }, { "epoch": 16.23325380803817, "grad_norm": 0.0007476233877241611, "learning_rate": 1.0422444193765862e-06, "loss": 0.0, "num_input_tokens_seen": 190763216, "step": 88455 }, { "epoch": 16.234171407597724, "grad_norm": 0.0019500133348628879, "learning_rate": 1.0417551253567447e-06, "loss": 0.0001, "num_input_tokens_seen": 190775280, "step": 88460 }, { "epoch": 16.235089007157278, "grad_norm": 0.008068608120083809, "learning_rate": 1.0412659328597507e-06, "loss": 0.0013, "num_input_tokens_seen": 190784976, "step": 88465 }, { "epoch": 16.236006606716828, "grad_norm": 0.002704572631046176, "learning_rate": 1.0407768418981501e-06, "loss": 0.0001, "num_input_tokens_seen": 190795600, "step": 88470 }, { "epoch": 16.23692420627638, "grad_norm": 0.012794737704098225, "learning_rate": 1.0402878524844872e-06, "loss": 0.0001, "num_input_tokens_seen": 190806320, "step": 88475 }, { "epoch": 16.237841805835934, "grad_norm": 0.001313826534897089, "learning_rate": 1.0397989646313022e-06, "loss": 0.0001, "num_input_tokens_seen": 190817136, "step": 88480 }, { "epoch": 16.238759405395484, "grad_norm": 0.0017076913500204682, "learning_rate": 1.0393101783511377e-06, "loss": 0.0001, "num_input_tokens_seen": 190827504, "step": 88485 }, { "epoch": 16.239677004955038, "grad_norm": 0.013436207547783852, "learning_rate": 1.038821493656529e-06, "loss": 0.0, "num_input_tokens_seen": 190837584, "step": 88490 }, { "epoch": 16.24059460451459, "grad_norm": 0.0396946556866169, "learning_rate": 1.0383329105600082e-06, "loss": 0.0, "num_input_tokens_seen": 190848208, "step": 88495 }, { "epoch": 16.24151220407414, "grad_norm": 0.005731232464313507, "learning_rate": 1.0378444290741092e-06, "loss": 0.0, "num_input_tokens_seen": 190859024, "step": 88500 }, { "epoch": 16.242429803633694, "grad_norm": 0.0022813263349235058, "learning_rate": 1.0373560492113598e-06, "loss": 0.0386, "num_input_tokens_seen": 190868752, "step": 88505 }, { "epoch": 16.243347403193248, "grad_norm": 0.03683128207921982, "learning_rate": 1.036867770984285e-06, "loss": 0.0, "num_input_tokens_seen": 190879600, "step": 88510 }, { "epoch": 16.244265002752797, "grad_norm": 0.0036158019211143255, "learning_rate": 1.0363795944054112e-06, "loss": 0.0, "num_input_tokens_seen": 190890448, "step": 88515 }, { "epoch": 16.24518260231235, "grad_norm": 0.005346078425645828, "learning_rate": 1.0358915194872576e-06, "loss": 0.0, "num_input_tokens_seen": 190902032, "step": 88520 }, { "epoch": 16.246100201871904, "grad_norm": 0.03513750061392784, "learning_rate": 1.0354035462423423e-06, "loss": 0.0001, "num_input_tokens_seen": 190913680, "step": 88525 }, { "epoch": 16.247017801431454, "grad_norm": 0.0027371312025934458, "learning_rate": 1.0349156746831807e-06, "loss": 0.0057, "num_input_tokens_seen": 190923856, "step": 88530 }, { "epoch": 16.247935400991008, "grad_norm": 0.0009607424144633114, "learning_rate": 1.0344279048222877e-06, "loss": 0.1072, "num_input_tokens_seen": 190934480, "step": 88535 }, { "epoch": 16.24885300055056, "grad_norm": 0.011810027062892914, "learning_rate": 1.033940236672173e-06, "loss": 0.0002, "num_input_tokens_seen": 190945136, "step": 88540 }, { "epoch": 16.24977060011011, "grad_norm": 0.03907071053981781, "learning_rate": 1.0334526702453429e-06, "loss": 0.0051, "num_input_tokens_seen": 190955216, "step": 88545 }, { "epoch": 16.250688199669664, "grad_norm": 1.2481908798217773, "learning_rate": 1.032965205554306e-06, "loss": 0.0004, "num_input_tokens_seen": 190965712, "step": 88550 }, { "epoch": 16.251605799229218, "grad_norm": 0.0011887731961905956, "learning_rate": 1.0324778426115628e-06, "loss": 0.0645, "num_input_tokens_seen": 190977232, "step": 88555 }, { "epoch": 16.252523398788767, "grad_norm": 0.001128315576352179, "learning_rate": 1.031990581429614e-06, "loss": 0.0, "num_input_tokens_seen": 190987568, "step": 88560 }, { "epoch": 16.25344099834832, "grad_norm": 0.0010647614253684878, "learning_rate": 1.0315034220209553e-06, "loss": 0.0, "num_input_tokens_seen": 190997968, "step": 88565 }, { "epoch": 16.254358597907874, "grad_norm": 0.009220005944371223, "learning_rate": 1.0310163643980848e-06, "loss": 0.0078, "num_input_tokens_seen": 191009584, "step": 88570 }, { "epoch": 16.255276197467424, "grad_norm": 0.003188998671248555, "learning_rate": 1.0305294085734935e-06, "loss": 0.0329, "num_input_tokens_seen": 191019280, "step": 88575 }, { "epoch": 16.256193797026977, "grad_norm": 0.003143595764413476, "learning_rate": 1.0300425545596686e-06, "loss": 0.1751, "num_input_tokens_seen": 191029168, "step": 88580 }, { "epoch": 16.25711139658653, "grad_norm": 0.0074359648860991, "learning_rate": 1.0295558023691016e-06, "loss": 0.0001, "num_input_tokens_seen": 191040400, "step": 88585 }, { "epoch": 16.25802899614608, "grad_norm": 0.4464380443096161, "learning_rate": 1.0290691520142737e-06, "loss": 0.0003, "num_input_tokens_seen": 191050288, "step": 88590 }, { "epoch": 16.258946595705634, "grad_norm": 0.14288371801376343, "learning_rate": 1.0285826035076667e-06, "loss": 0.0005, "num_input_tokens_seen": 191061040, "step": 88595 }, { "epoch": 16.259864195265187, "grad_norm": 0.015341458842158318, "learning_rate": 1.0280961568617626e-06, "loss": 0.0001, "num_input_tokens_seen": 191071888, "step": 88600 }, { "epoch": 16.260781794824737, "grad_norm": 0.0016934433951973915, "learning_rate": 1.027609812089036e-06, "loss": 0.0001, "num_input_tokens_seen": 191082480, "step": 88605 }, { "epoch": 16.26169939438429, "grad_norm": 0.1382845938205719, "learning_rate": 1.0271235692019605e-06, "loss": 0.0001, "num_input_tokens_seen": 191094320, "step": 88610 }, { "epoch": 16.262616993943844, "grad_norm": 0.006110795307904482, "learning_rate": 1.026637428213007e-06, "loss": 0.0, "num_input_tokens_seen": 191105008, "step": 88615 }, { "epoch": 16.263534593503394, "grad_norm": 0.002813332946971059, "learning_rate": 1.0261513891346469e-06, "loss": 0.0, "num_input_tokens_seen": 191116464, "step": 88620 }, { "epoch": 16.264452193062947, "grad_norm": 0.0010971538722515106, "learning_rate": 1.0256654519793447e-06, "loss": 0.0, "num_input_tokens_seen": 191127152, "step": 88625 }, { "epoch": 16.2653697926225, "grad_norm": 0.0058724964037537575, "learning_rate": 1.0251796167595623e-06, "loss": 0.0001, "num_input_tokens_seen": 191137520, "step": 88630 }, { "epoch": 16.26628739218205, "grad_norm": 0.009175937622785568, "learning_rate": 1.024693883487764e-06, "loss": 0.0, "num_input_tokens_seen": 191148624, "step": 88635 }, { "epoch": 16.267204991741604, "grad_norm": 0.0017838738858699799, "learning_rate": 1.0242082521764062e-06, "loss": 0.001, "num_input_tokens_seen": 191158832, "step": 88640 }, { "epoch": 16.268122591301157, "grad_norm": 0.0010949814459308982, "learning_rate": 1.0237227228379448e-06, "loss": 0.0, "num_input_tokens_seen": 191169424, "step": 88645 }, { "epoch": 16.269040190860707, "grad_norm": 0.0007599171949550509, "learning_rate": 1.0232372954848335e-06, "loss": 0.0, "num_input_tokens_seen": 191178896, "step": 88650 }, { "epoch": 16.26995779042026, "grad_norm": 0.013288168236613274, "learning_rate": 1.0227519701295203e-06, "loss": 0.0003, "num_input_tokens_seen": 191190256, "step": 88655 }, { "epoch": 16.270875389979814, "grad_norm": 0.006512765306979418, "learning_rate": 1.022266746784456e-06, "loss": 0.0, "num_input_tokens_seen": 191200016, "step": 88660 }, { "epoch": 16.271792989539364, "grad_norm": 0.014492173679172993, "learning_rate": 1.021781625462085e-06, "loss": 0.0, "num_input_tokens_seen": 191210928, "step": 88665 }, { "epoch": 16.272710589098917, "grad_norm": 71.58689880371094, "learning_rate": 1.0212966061748497e-06, "loss": 0.0079, "num_input_tokens_seen": 191221648, "step": 88670 }, { "epoch": 16.27362818865847, "grad_norm": 0.14614975452423096, "learning_rate": 1.0208116889351899e-06, "loss": 0.0001, "num_input_tokens_seen": 191232496, "step": 88675 }, { "epoch": 16.27454578821802, "grad_norm": 0.0008835602784529328, "learning_rate": 1.0203268737555417e-06, "loss": 0.0, "num_input_tokens_seen": 191243600, "step": 88680 }, { "epoch": 16.275463387777574, "grad_norm": 0.003219225909560919, "learning_rate": 1.0198421606483427e-06, "loss": 0.0, "num_input_tokens_seen": 191253808, "step": 88685 }, { "epoch": 16.276380987337127, "grad_norm": 0.024831857532262802, "learning_rate": 1.0193575496260238e-06, "loss": 0.0, "num_input_tokens_seen": 191265264, "step": 88690 }, { "epoch": 16.277298586896677, "grad_norm": 0.006173988338559866, "learning_rate": 1.0188730407010129e-06, "loss": 0.0002, "num_input_tokens_seen": 191275568, "step": 88695 }, { "epoch": 16.27821618645623, "grad_norm": 3498.83056640625, "learning_rate": 1.018388633885739e-06, "loss": 0.1345, "num_input_tokens_seen": 191286288, "step": 88700 }, { "epoch": 16.279133786015784, "grad_norm": 0.005439115688204765, "learning_rate": 1.0179043291926267e-06, "loss": 0.0, "num_input_tokens_seen": 191297488, "step": 88705 }, { "epoch": 16.280051385575334, "grad_norm": 0.0062497081235051155, "learning_rate": 1.017420126634096e-06, "loss": 0.0001, "num_input_tokens_seen": 191309328, "step": 88710 }, { "epoch": 16.280968985134887, "grad_norm": 0.01665257103741169, "learning_rate": 1.0169360262225653e-06, "loss": 0.0, "num_input_tokens_seen": 191320272, "step": 88715 }, { "epoch": 16.28188658469444, "grad_norm": 0.0006563673377968371, "learning_rate": 1.0164520279704538e-06, "loss": 0.0944, "num_input_tokens_seen": 191331920, "step": 88720 }, { "epoch": 16.28280418425399, "grad_norm": 0.0058851041831076145, "learning_rate": 1.0159681318901738e-06, "loss": 0.0, "num_input_tokens_seen": 191342896, "step": 88725 }, { "epoch": 16.283721783813544, "grad_norm": 0.0009780973196029663, "learning_rate": 1.0154843379941354e-06, "loss": 0.0001, "num_input_tokens_seen": 191353488, "step": 88730 }, { "epoch": 16.284639383373097, "grad_norm": 0.016319693997502327, "learning_rate": 1.0150006462947493e-06, "loss": 0.0001, "num_input_tokens_seen": 191362512, "step": 88735 }, { "epoch": 16.285556982932647, "grad_norm": 0.0005842749960720539, "learning_rate": 1.014517056804421e-06, "loss": 0.0, "num_input_tokens_seen": 191374064, "step": 88740 }, { "epoch": 16.2864745824922, "grad_norm": 0.7561554908752441, "learning_rate": 1.0140335695355525e-06, "loss": 0.0005, "num_input_tokens_seen": 191384784, "step": 88745 }, { "epoch": 16.287392182051754, "grad_norm": 0.034946590662002563, "learning_rate": 1.0135501845005446e-06, "loss": 0.0, "num_input_tokens_seen": 191393808, "step": 88750 }, { "epoch": 16.288309781611304, "grad_norm": 0.009345787577331066, "learning_rate": 1.0130669017117967e-06, "loss": 0.0, "num_input_tokens_seen": 191405328, "step": 88755 }, { "epoch": 16.289227381170857, "grad_norm": 0.001888379454612732, "learning_rate": 1.0125837211817042e-06, "loss": 0.0001, "num_input_tokens_seen": 191416592, "step": 88760 }, { "epoch": 16.29014498073041, "grad_norm": 0.0029865363612771034, "learning_rate": 1.0121006429226575e-06, "loss": 0.0, "num_input_tokens_seen": 191427088, "step": 88765 }, { "epoch": 16.29106258028996, "grad_norm": 0.10571205615997314, "learning_rate": 1.01161766694705e-06, "loss": 0.0002, "num_input_tokens_seen": 191436912, "step": 88770 }, { "epoch": 16.291980179849514, "grad_norm": 0.002264385810121894, "learning_rate": 1.0111347932672682e-06, "loss": 0.0, "num_input_tokens_seen": 191446160, "step": 88775 }, { "epoch": 16.292897779409067, "grad_norm": 1.5251930952072144, "learning_rate": 1.0106520218956955e-06, "loss": 0.0002, "num_input_tokens_seen": 191457200, "step": 88780 }, { "epoch": 16.293815378968617, "grad_norm": 0.004635901190340519, "learning_rate": 1.0101693528447166e-06, "loss": 0.0, "num_input_tokens_seen": 191468304, "step": 88785 }, { "epoch": 16.29473297852817, "grad_norm": 0.028102710843086243, "learning_rate": 1.0096867861267102e-06, "loss": 0.1128, "num_input_tokens_seen": 191479440, "step": 88790 }, { "epoch": 16.295650578087724, "grad_norm": 0.004044036380946636, "learning_rate": 1.0092043217540536e-06, "loss": 0.0001, "num_input_tokens_seen": 191490032, "step": 88795 }, { "epoch": 16.296568177647274, "grad_norm": 0.1619858592748642, "learning_rate": 1.008721959739119e-06, "loss": 0.0, "num_input_tokens_seen": 191501936, "step": 88800 }, { "epoch": 16.297485777206827, "grad_norm": 0.0019391573732718825, "learning_rate": 1.0082397000942823e-06, "loss": 0.0377, "num_input_tokens_seen": 191512336, "step": 88805 }, { "epoch": 16.29840337676638, "grad_norm": 0.006155785173177719, "learning_rate": 1.0077575428319096e-06, "loss": 0.0001, "num_input_tokens_seen": 191524528, "step": 88810 }, { "epoch": 16.29932097632593, "grad_norm": 0.0011083661811426282, "learning_rate": 1.0072754879643682e-06, "loss": 0.0001, "num_input_tokens_seen": 191534768, "step": 88815 }, { "epoch": 16.300238575885484, "grad_norm": 0.006621636915951967, "learning_rate": 1.0067935355040231e-06, "loss": 0.0, "num_input_tokens_seen": 191545328, "step": 88820 }, { "epoch": 16.301156175445037, "grad_norm": 0.001461394247598946, "learning_rate": 1.0063116854632355e-06, "loss": 0.0, "num_input_tokens_seen": 191555280, "step": 88825 }, { "epoch": 16.302073775004587, "grad_norm": 0.014402581378817558, "learning_rate": 1.0058299378543617e-06, "loss": 0.0001, "num_input_tokens_seen": 191567184, "step": 88830 }, { "epoch": 16.30299137456414, "grad_norm": 0.002141202799975872, "learning_rate": 1.0053482926897607e-06, "loss": 0.2097, "num_input_tokens_seen": 191577808, "step": 88835 }, { "epoch": 16.303908974123694, "grad_norm": 0.018480338156223297, "learning_rate": 1.0048667499817854e-06, "loss": 0.0005, "num_input_tokens_seen": 191587728, "step": 88840 }, { "epoch": 16.304826573683243, "grad_norm": 0.0012703161919489503, "learning_rate": 1.0043853097427859e-06, "loss": 0.0, "num_input_tokens_seen": 191597776, "step": 88845 }, { "epoch": 16.305744173242797, "grad_norm": 51.1134033203125, "learning_rate": 1.003903971985109e-06, "loss": 0.2625, "num_input_tokens_seen": 191608208, "step": 88850 }, { "epoch": 16.30666177280235, "grad_norm": 0.0066910842433571815, "learning_rate": 1.0034227367211036e-06, "loss": 0.0008, "num_input_tokens_seen": 191618704, "step": 88855 }, { "epoch": 16.3075793723619, "grad_norm": 0.010073235258460045, "learning_rate": 1.0029416039631101e-06, "loss": 0.0071, "num_input_tokens_seen": 191629872, "step": 88860 }, { "epoch": 16.308496971921453, "grad_norm": 0.015380258671939373, "learning_rate": 1.0024605737234705e-06, "loss": 0.0001, "num_input_tokens_seen": 191640912, "step": 88865 }, { "epoch": 16.309414571481007, "grad_norm": 0.3763830065727234, "learning_rate": 1.0019796460145209e-06, "loss": 0.0485, "num_input_tokens_seen": 191650288, "step": 88870 }, { "epoch": 16.310332171040557, "grad_norm": 0.003853851230815053, "learning_rate": 1.001498820848596e-06, "loss": 0.0, "num_input_tokens_seen": 191660848, "step": 88875 }, { "epoch": 16.31124977060011, "grad_norm": 0.0025524068623781204, "learning_rate": 1.0010180982380303e-06, "loss": 0.0, "num_input_tokens_seen": 191673584, "step": 88880 }, { "epoch": 16.312167370159663, "grad_norm": 0.002351156435906887, "learning_rate": 1.0005374781951526e-06, "loss": 0.0, "num_input_tokens_seen": 191684368, "step": 88885 }, { "epoch": 16.313084969719213, "grad_norm": 0.0004434303555171937, "learning_rate": 1.0000569607322902e-06, "loss": 0.0, "num_input_tokens_seen": 191695984, "step": 88890 }, { "epoch": 16.314002569278767, "grad_norm": 0.003492841962724924, "learning_rate": 9.995765458617674e-07, "loss": 0.0, "num_input_tokens_seen": 191707312, "step": 88895 }, { "epoch": 16.31492016883832, "grad_norm": 0.1399068385362625, "learning_rate": 9.990962335959047e-07, "loss": 0.0001, "num_input_tokens_seen": 191718384, "step": 88900 }, { "epoch": 16.31583776839787, "grad_norm": 0.0006758483941666782, "learning_rate": 9.986160239470238e-07, "loss": 0.1481, "num_input_tokens_seen": 191729200, "step": 88905 }, { "epoch": 16.316755367957423, "grad_norm": 28.648406982421875, "learning_rate": 9.981359169274408e-07, "loss": 0.0012, "num_input_tokens_seen": 191739888, "step": 88910 }, { "epoch": 16.317672967516977, "grad_norm": 0.04629673808813095, "learning_rate": 9.976559125494673e-07, "loss": 0.0, "num_input_tokens_seen": 191751440, "step": 88915 }, { "epoch": 16.318590567076527, "grad_norm": 0.050038307905197144, "learning_rate": 9.971760108254185e-07, "loss": 0.0001, "num_input_tokens_seen": 191762288, "step": 88920 }, { "epoch": 16.31950816663608, "grad_norm": 0.004890902899205685, "learning_rate": 9.96696211767601e-07, "loss": 0.0, "num_input_tokens_seen": 191774192, "step": 88925 }, { "epoch": 16.320425766195633, "grad_norm": 0.001070573227480054, "learning_rate": 9.962165153883207e-07, "loss": 0.0174, "num_input_tokens_seen": 191784976, "step": 88930 }, { "epoch": 16.321343365755183, "grad_norm": 0.007955010049045086, "learning_rate": 9.957369216998807e-07, "loss": 0.0, "num_input_tokens_seen": 191794704, "step": 88935 }, { "epoch": 16.322260965314737, "grad_norm": 0.00927942618727684, "learning_rate": 9.952574307145834e-07, "loss": 0.0, "num_input_tokens_seen": 191805008, "step": 88940 }, { "epoch": 16.32317856487429, "grad_norm": 0.0020398981869220734, "learning_rate": 9.947780424447268e-07, "loss": 0.0, "num_input_tokens_seen": 191815664, "step": 88945 }, { "epoch": 16.32409616443384, "grad_norm": 0.014367582276463509, "learning_rate": 9.942987569026041e-07, "loss": 0.0, "num_input_tokens_seen": 191826224, "step": 88950 }, { "epoch": 16.325013763993393, "grad_norm": 0.0006207111291587353, "learning_rate": 9.938195741005119e-07, "loss": 0.0, "num_input_tokens_seen": 191836464, "step": 88955 }, { "epoch": 16.325931363552947, "grad_norm": 0.004141438286751509, "learning_rate": 9.93340494050738e-07, "loss": 0.0, "num_input_tokens_seen": 191846832, "step": 88960 }, { "epoch": 16.326848963112496, "grad_norm": 0.0015197091270238161, "learning_rate": 9.928615167655698e-07, "loss": 0.0, "num_input_tokens_seen": 191857360, "step": 88965 }, { "epoch": 16.32776656267205, "grad_norm": 0.002003895351663232, "learning_rate": 9.92382642257294e-07, "loss": 0.0001, "num_input_tokens_seen": 191868848, "step": 88970 }, { "epoch": 16.328684162231603, "grad_norm": 0.004546182695776224, "learning_rate": 9.91903870538193e-07, "loss": 0.0, "num_input_tokens_seen": 191878864, "step": 88975 }, { "epoch": 16.329601761791153, "grad_norm": 313.3265686035156, "learning_rate": 9.91425201620545e-07, "loss": 0.0144, "num_input_tokens_seen": 191889776, "step": 88980 }, { "epoch": 16.330519361350706, "grad_norm": 0.0013301712460815907, "learning_rate": 9.909466355166263e-07, "loss": 0.0352, "num_input_tokens_seen": 191899984, "step": 88985 }, { "epoch": 16.33143696091026, "grad_norm": 0.44702044129371643, "learning_rate": 9.904681722387149e-07, "loss": 0.0002, "num_input_tokens_seen": 191910416, "step": 88990 }, { "epoch": 16.33235456046981, "grad_norm": 0.010810693725943565, "learning_rate": 9.899898117990808e-07, "loss": 0.1314, "num_input_tokens_seen": 191920880, "step": 88995 }, { "epoch": 16.333272160029363, "grad_norm": 0.03991326317191124, "learning_rate": 9.895115542099915e-07, "loss": 0.119, "num_input_tokens_seen": 191932240, "step": 89000 }, { "epoch": 16.334189759588917, "grad_norm": 0.005892136599868536, "learning_rate": 9.890333994837159e-07, "loss": 0.1626, "num_input_tokens_seen": 191942032, "step": 89005 }, { "epoch": 16.335107359148466, "grad_norm": 0.004698703531175852, "learning_rate": 9.885553476325177e-07, "loss": 0.0001, "num_input_tokens_seen": 191953904, "step": 89010 }, { "epoch": 16.33602495870802, "grad_norm": 0.02004718966782093, "learning_rate": 9.880773986686576e-07, "loss": 0.0005, "num_input_tokens_seen": 191964912, "step": 89015 }, { "epoch": 16.336942558267573, "grad_norm": 0.0012203515507280827, "learning_rate": 9.87599552604393e-07, "loss": 0.002, "num_input_tokens_seen": 191976592, "step": 89020 }, { "epoch": 16.337860157827123, "grad_norm": 0.07420031726360321, "learning_rate": 9.871218094519824e-07, "loss": 0.0, "num_input_tokens_seen": 191988336, "step": 89025 }, { "epoch": 16.338777757386676, "grad_norm": 0.005371568258851767, "learning_rate": 9.866441692236784e-07, "loss": 0.0001, "num_input_tokens_seen": 191998256, "step": 89030 }, { "epoch": 16.33969535694623, "grad_norm": 0.10398612171411514, "learning_rate": 9.861666319317298e-07, "loss": 0.0174, "num_input_tokens_seen": 192009296, "step": 89035 }, { "epoch": 16.34061295650578, "grad_norm": 0.0037254332564771175, "learning_rate": 9.856891975883874e-07, "loss": 0.0, "num_input_tokens_seen": 192020944, "step": 89040 }, { "epoch": 16.341530556065333, "grad_norm": 0.010183945298194885, "learning_rate": 9.852118662058957e-07, "loss": 0.0013, "num_input_tokens_seen": 192031824, "step": 89045 }, { "epoch": 16.342448155624886, "grad_norm": 0.034913141280412674, "learning_rate": 9.847346377964956e-07, "loss": 0.0001, "num_input_tokens_seen": 192042800, "step": 89050 }, { "epoch": 16.343365755184436, "grad_norm": 0.04086591303348541, "learning_rate": 9.8425751237243e-07, "loss": 0.1067, "num_input_tokens_seen": 192053616, "step": 89055 }, { "epoch": 16.34428335474399, "grad_norm": 0.10822467505931854, "learning_rate": 9.837804899459364e-07, "loss": 0.0008, "num_input_tokens_seen": 192062000, "step": 89060 }, { "epoch": 16.345200954303543, "grad_norm": 0.012918263673782349, "learning_rate": 9.833035705292482e-07, "loss": 0.0, "num_input_tokens_seen": 192073712, "step": 89065 }, { "epoch": 16.346118553863093, "grad_norm": 0.001272738678380847, "learning_rate": 9.828267541345965e-07, "loss": 0.0001, "num_input_tokens_seen": 192083024, "step": 89070 }, { "epoch": 16.347036153422646, "grad_norm": 0.0036696672905236483, "learning_rate": 9.823500407742137e-07, "loss": 0.0, "num_input_tokens_seen": 192094064, "step": 89075 }, { "epoch": 16.3479537529822, "grad_norm": 0.002287587383762002, "learning_rate": 9.81873430460326e-07, "loss": 0.0018, "num_input_tokens_seen": 192105968, "step": 89080 }, { "epoch": 16.34887135254175, "grad_norm": 0.005547434091567993, "learning_rate": 9.813969232051573e-07, "loss": 0.0001, "num_input_tokens_seen": 192116400, "step": 89085 }, { "epoch": 16.349788952101303, "grad_norm": 0.006372054107487202, "learning_rate": 9.809205190209287e-07, "loss": 0.0, "num_input_tokens_seen": 192127312, "step": 89090 }, { "epoch": 16.350706551660856, "grad_norm": 6.439251899719238, "learning_rate": 9.804442179198593e-07, "loss": 0.0011, "num_input_tokens_seen": 192138512, "step": 89095 }, { "epoch": 16.351624151220406, "grad_norm": 0.004689769819378853, "learning_rate": 9.799680199141665e-07, "loss": 0.0001, "num_input_tokens_seen": 192148240, "step": 89100 }, { "epoch": 16.35254175077996, "grad_norm": 0.019047390669584274, "learning_rate": 9.79491925016064e-07, "loss": 0.0001, "num_input_tokens_seen": 192160080, "step": 89105 }, { "epoch": 16.353459350339513, "grad_norm": 0.0951729267835617, "learning_rate": 9.790159332377619e-07, "loss": 0.0, "num_input_tokens_seen": 192171184, "step": 89110 }, { "epoch": 16.354376949899063, "grad_norm": 0.00672524981200695, "learning_rate": 9.785400445914694e-07, "loss": 0.0001, "num_input_tokens_seen": 192182320, "step": 89115 }, { "epoch": 16.355294549458616, "grad_norm": 0.011830818839371204, "learning_rate": 9.780642590893908e-07, "loss": 0.0001, "num_input_tokens_seen": 192192912, "step": 89120 }, { "epoch": 16.35621214901817, "grad_norm": 0.0012614780571311712, "learning_rate": 9.77588576743732e-07, "loss": 0.0762, "num_input_tokens_seen": 192202992, "step": 89125 }, { "epoch": 16.35712974857772, "grad_norm": 0.043450985103845596, "learning_rate": 9.77112997566692e-07, "loss": 0.0, "num_input_tokens_seen": 192213296, "step": 89130 }, { "epoch": 16.358047348137273, "grad_norm": 115.92623901367188, "learning_rate": 9.76637521570467e-07, "loss": 0.0532, "num_input_tokens_seen": 192225392, "step": 89135 }, { "epoch": 16.358964947696826, "grad_norm": 0.0018263452220708132, "learning_rate": 9.761621487672558e-07, "loss": 0.2094, "num_input_tokens_seen": 192236464, "step": 89140 }, { "epoch": 16.359882547256376, "grad_norm": 0.16777603328227997, "learning_rate": 9.756868791692486e-07, "loss": 0.0002, "num_input_tokens_seen": 192247312, "step": 89145 }, { "epoch": 16.36080014681593, "grad_norm": 0.0010274735977873206, "learning_rate": 9.752117127886346e-07, "loss": 0.002, "num_input_tokens_seen": 192257968, "step": 89150 }, { "epoch": 16.361717746375483, "grad_norm": 0.05706793814897537, "learning_rate": 9.747366496376037e-07, "loss": 0.019, "num_input_tokens_seen": 192268144, "step": 89155 }, { "epoch": 16.362635345935033, "grad_norm": 0.007445815950632095, "learning_rate": 9.74261689728339e-07, "loss": 0.0, "num_input_tokens_seen": 192278800, "step": 89160 }, { "epoch": 16.363552945494586, "grad_norm": 0.39393121004104614, "learning_rate": 9.737868330730232e-07, "loss": 0.0854, "num_input_tokens_seen": 192290288, "step": 89165 }, { "epoch": 16.36447054505414, "grad_norm": 0.1786721646785736, "learning_rate": 9.73312079683833e-07, "loss": 0.0736, "num_input_tokens_seen": 192301200, "step": 89170 }, { "epoch": 16.36538814461369, "grad_norm": 0.10289500653743744, "learning_rate": 9.728374295729491e-07, "loss": 0.002, "num_input_tokens_seen": 192312592, "step": 89175 }, { "epoch": 16.366305744173243, "grad_norm": 0.0005038323579356074, "learning_rate": 9.723628827525433e-07, "loss": 0.0001, "num_input_tokens_seen": 192323920, "step": 89180 }, { "epoch": 16.367223343732796, "grad_norm": 0.005564535968005657, "learning_rate": 9.71888439234786e-07, "loss": 0.0, "num_input_tokens_seen": 192335632, "step": 89185 }, { "epoch": 16.368140943292346, "grad_norm": 0.002337467623874545, "learning_rate": 9.714140990318488e-07, "loss": 0.0001, "num_input_tokens_seen": 192345232, "step": 89190 }, { "epoch": 16.3690585428519, "grad_norm": 0.003386695636436343, "learning_rate": 9.709398621558958e-07, "loss": 0.0001, "num_input_tokens_seen": 192355824, "step": 89195 }, { "epoch": 16.369976142411453, "grad_norm": 0.019179929047822952, "learning_rate": 9.704657286190911e-07, "loss": 0.0, "num_input_tokens_seen": 192366384, "step": 89200 }, { "epoch": 16.370893741971003, "grad_norm": 0.1821037083864212, "learning_rate": 9.699916984335938e-07, "loss": 0.0001, "num_input_tokens_seen": 192378192, "step": 89205 }, { "epoch": 16.371811341530556, "grad_norm": 0.002765063429251313, "learning_rate": 9.695177716115645e-07, "loss": 0.0, "num_input_tokens_seen": 192388432, "step": 89210 }, { "epoch": 16.37272894109011, "grad_norm": 0.004415440373122692, "learning_rate": 9.690439481651582e-07, "loss": 0.0, "num_input_tokens_seen": 192400176, "step": 89215 }, { "epoch": 16.37364654064966, "grad_norm": 1.5470415353775024, "learning_rate": 9.685702281065258e-07, "loss": 0.0002, "num_input_tokens_seen": 192410864, "step": 89220 }, { "epoch": 16.374564140209213, "grad_norm": 0.0025392973329871893, "learning_rate": 9.680966114478202e-07, "loss": 0.0003, "num_input_tokens_seen": 192422000, "step": 89225 }, { "epoch": 16.375481739768766, "grad_norm": 0.01902841031551361, "learning_rate": 9.676230982011875e-07, "loss": 0.0953, "num_input_tokens_seen": 192433424, "step": 89230 }, { "epoch": 16.376399339328316, "grad_norm": 33.81521224975586, "learning_rate": 9.671496883787712e-07, "loss": 0.0053, "num_input_tokens_seen": 192444688, "step": 89235 }, { "epoch": 16.37731693888787, "grad_norm": 0.0008505427394993603, "learning_rate": 9.66676381992717e-07, "loss": 0.0, "num_input_tokens_seen": 192456304, "step": 89240 }, { "epoch": 16.378234538447423, "grad_norm": 0.001251522800885141, "learning_rate": 9.66203179055162e-07, "loss": 0.0021, "num_input_tokens_seen": 192466032, "step": 89245 }, { "epoch": 16.379152138006972, "grad_norm": 0.0009063648758456111, "learning_rate": 9.657300795782436e-07, "loss": 0.0016, "num_input_tokens_seen": 192477200, "step": 89250 }, { "epoch": 16.380069737566526, "grad_norm": 0.12479296326637268, "learning_rate": 9.652570835740954e-07, "loss": 0.0, "num_input_tokens_seen": 192487408, "step": 89255 }, { "epoch": 16.38098733712608, "grad_norm": 0.008807845413684845, "learning_rate": 9.647841910548505e-07, "loss": 0.1844, "num_input_tokens_seen": 192498416, "step": 89260 }, { "epoch": 16.38190493668563, "grad_norm": 0.0068282983265817165, "learning_rate": 9.643114020326371e-07, "loss": 0.0, "num_input_tokens_seen": 192509840, "step": 89265 }, { "epoch": 16.382822536245182, "grad_norm": 0.002682536607608199, "learning_rate": 9.6383871651958e-07, "loss": 0.0, "num_input_tokens_seen": 192521072, "step": 89270 }, { "epoch": 16.383740135804736, "grad_norm": 0.005504885222762823, "learning_rate": 9.63366134527806e-07, "loss": 0.0079, "num_input_tokens_seen": 192532400, "step": 89275 }, { "epoch": 16.384657735364286, "grad_norm": 0.0038605809677392244, "learning_rate": 9.628936560694347e-07, "loss": 0.0052, "num_input_tokens_seen": 192544592, "step": 89280 }, { "epoch": 16.38557533492384, "grad_norm": 0.002049566013738513, "learning_rate": 9.624212811565837e-07, "loss": 0.0, "num_input_tokens_seen": 192555984, "step": 89285 }, { "epoch": 16.386492934483393, "grad_norm": 0.009170260280370712, "learning_rate": 9.619490098013678e-07, "loss": 0.0002, "num_input_tokens_seen": 192566192, "step": 89290 }, { "epoch": 16.387410534042942, "grad_norm": 0.01256862934678793, "learning_rate": 9.614768420159031e-07, "loss": 0.0001, "num_input_tokens_seen": 192577360, "step": 89295 }, { "epoch": 16.388328133602496, "grad_norm": 0.008273271843791008, "learning_rate": 9.610047778122977e-07, "loss": 0.0002, "num_input_tokens_seen": 192589488, "step": 89300 }, { "epoch": 16.38924573316205, "grad_norm": 300.20135498046875, "learning_rate": 9.605328172026608e-07, "loss": 0.2313, "num_input_tokens_seen": 192599184, "step": 89305 }, { "epoch": 16.3901633327216, "grad_norm": 0.05782158300280571, "learning_rate": 9.600609601990957e-07, "loss": 0.0001, "num_input_tokens_seen": 192611248, "step": 89310 }, { "epoch": 16.391080932281152, "grad_norm": 0.0030220085754990578, "learning_rate": 9.59589206813706e-07, "loss": 0.0, "num_input_tokens_seen": 192620688, "step": 89315 }, { "epoch": 16.391998531840706, "grad_norm": 0.000537451880518347, "learning_rate": 9.591175570585892e-07, "loss": 0.0004, "num_input_tokens_seen": 192632272, "step": 89320 }, { "epoch": 16.392916131400256, "grad_norm": 0.1529783457517624, "learning_rate": 9.586460109458462e-07, "loss": 0.0001, "num_input_tokens_seen": 192641776, "step": 89325 }, { "epoch": 16.39383373095981, "grad_norm": 54.07426834106445, "learning_rate": 9.581745684875694e-07, "loss": 0.2563, "num_input_tokens_seen": 192651664, "step": 89330 }, { "epoch": 16.394751330519362, "grad_norm": 0.03381717577576637, "learning_rate": 9.577032296958504e-07, "loss": 0.0001, "num_input_tokens_seen": 192662192, "step": 89335 }, { "epoch": 16.395668930078912, "grad_norm": 0.004393579438328743, "learning_rate": 9.572319945827774e-07, "loss": 0.0003, "num_input_tokens_seen": 192672624, "step": 89340 }, { "epoch": 16.396586529638466, "grad_norm": 0.0016340194270014763, "learning_rate": 9.567608631604398e-07, "loss": 0.0001, "num_input_tokens_seen": 192683472, "step": 89345 }, { "epoch": 16.39750412919802, "grad_norm": 0.005149715580046177, "learning_rate": 9.56289835440919e-07, "loss": 0.0001, "num_input_tokens_seen": 192694416, "step": 89350 }, { "epoch": 16.39842172875757, "grad_norm": 0.0005980929126963019, "learning_rate": 9.558189114362954e-07, "loss": 0.1626, "num_input_tokens_seen": 192705680, "step": 89355 }, { "epoch": 16.399339328317122, "grad_norm": 0.0008778207702562213, "learning_rate": 9.553480911586504e-07, "loss": 0.0, "num_input_tokens_seen": 192717200, "step": 89360 }, { "epoch": 16.400256927876676, "grad_norm": 0.0012719203950837255, "learning_rate": 9.54877374620058e-07, "loss": 0.0002, "num_input_tokens_seen": 192726992, "step": 89365 }, { "epoch": 16.401174527436226, "grad_norm": 0.004184724297374487, "learning_rate": 9.544067618325904e-07, "loss": 0.0001, "num_input_tokens_seen": 192739376, "step": 89370 }, { "epoch": 16.40209212699578, "grad_norm": 0.01445387490093708, "learning_rate": 9.539362528083207e-07, "loss": 0.0, "num_input_tokens_seen": 192751120, "step": 89375 }, { "epoch": 16.403009726555332, "grad_norm": 0.2255474478006363, "learning_rate": 9.534658475593151e-07, "loss": 0.0, "num_input_tokens_seen": 192761584, "step": 89380 }, { "epoch": 16.403927326114882, "grad_norm": 0.041774503886699677, "learning_rate": 9.529955460976387e-07, "loss": 0.0005, "num_input_tokens_seen": 192772400, "step": 89385 }, { "epoch": 16.404844925674436, "grad_norm": 0.02867361716926098, "learning_rate": 9.52525348435353e-07, "loss": 0.0, "num_input_tokens_seen": 192783888, "step": 89390 }, { "epoch": 16.40576252523399, "grad_norm": 0.007609353866428137, "learning_rate": 9.520552545845208e-07, "loss": 0.007, "num_input_tokens_seen": 192794640, "step": 89395 }, { "epoch": 16.40668012479354, "grad_norm": 5.036041259765625, "learning_rate": 9.515852645571977e-07, "loss": 0.0007, "num_input_tokens_seen": 192806096, "step": 89400 }, { "epoch": 16.407597724353092, "grad_norm": 0.002473132684826851, "learning_rate": 9.511153783654365e-07, "loss": 0.0, "num_input_tokens_seen": 192816432, "step": 89405 }, { "epoch": 16.408515323912646, "grad_norm": 0.0012509430525824428, "learning_rate": 9.506455960212918e-07, "loss": 0.0001, "num_input_tokens_seen": 192827408, "step": 89410 }, { "epoch": 16.409432923472195, "grad_norm": 0.0013323508901521564, "learning_rate": 9.501759175368114e-07, "loss": 0.0002, "num_input_tokens_seen": 192838448, "step": 89415 }, { "epoch": 16.41035052303175, "grad_norm": 0.16522426903247833, "learning_rate": 9.497063429240411e-07, "loss": 0.0001, "num_input_tokens_seen": 192849904, "step": 89420 }, { "epoch": 16.411268122591302, "grad_norm": 0.0024317759089171886, "learning_rate": 9.492368721950274e-07, "loss": 0.0, "num_input_tokens_seen": 192860848, "step": 89425 }, { "epoch": 16.412185722150852, "grad_norm": 0.013270610943436623, "learning_rate": 9.487675053618095e-07, "loss": 0.0703, "num_input_tokens_seen": 192870800, "step": 89430 }, { "epoch": 16.413103321710405, "grad_norm": 0.0029391562566161156, "learning_rate": 9.482982424364262e-07, "loss": 0.0, "num_input_tokens_seen": 192880592, "step": 89435 }, { "epoch": 16.41402092126996, "grad_norm": 0.026593517512083054, "learning_rate": 9.478290834309117e-07, "loss": 0.0001, "num_input_tokens_seen": 192892368, "step": 89440 }, { "epoch": 16.41493852082951, "grad_norm": 0.0072141108103096485, "learning_rate": 9.473600283573026e-07, "loss": 0.0883, "num_input_tokens_seen": 192902032, "step": 89445 }, { "epoch": 16.415856120389062, "grad_norm": 0.00447677681222558, "learning_rate": 9.46891077227628e-07, "loss": 0.0, "num_input_tokens_seen": 192913840, "step": 89450 }, { "epoch": 16.416773719948615, "grad_norm": 0.005820410326123238, "learning_rate": 9.464222300539139e-07, "loss": 0.0, "num_input_tokens_seen": 192925200, "step": 89455 }, { "epoch": 16.417691319508165, "grad_norm": 0.025739653035998344, "learning_rate": 9.459534868481885e-07, "loss": 0.0011, "num_input_tokens_seen": 192936240, "step": 89460 }, { "epoch": 16.41860891906772, "grad_norm": 0.10222846269607544, "learning_rate": 9.454848476224732e-07, "loss": 0.0001, "num_input_tokens_seen": 192946608, "step": 89465 }, { "epoch": 16.419526518627272, "grad_norm": 0.0029588949400931597, "learning_rate": 9.450163123887873e-07, "loss": 0.0001, "num_input_tokens_seen": 192958416, "step": 89470 }, { "epoch": 16.420444118186822, "grad_norm": 0.5747010111808777, "learning_rate": 9.445478811591469e-07, "loss": 0.0, "num_input_tokens_seen": 192969200, "step": 89475 }, { "epoch": 16.421361717746375, "grad_norm": 0.007862536236643791, "learning_rate": 9.440795539455699e-07, "loss": 0.0001, "num_input_tokens_seen": 192980560, "step": 89480 }, { "epoch": 16.42227931730593, "grad_norm": 0.003469257615506649, "learning_rate": 9.436113307600658e-07, "loss": 0.0, "num_input_tokens_seen": 192990128, "step": 89485 }, { "epoch": 16.42319691686548, "grad_norm": 0.0013967240229249, "learning_rate": 9.431432116146427e-07, "loss": 0.0792, "num_input_tokens_seen": 193001936, "step": 89490 }, { "epoch": 16.424114516425032, "grad_norm": 6.162023067474365, "learning_rate": 9.426751965213105e-07, "loss": 0.0003, "num_input_tokens_seen": 193013392, "step": 89495 }, { "epoch": 16.425032115984585, "grad_norm": 0.006169095169752836, "learning_rate": 9.422072854920706e-07, "loss": 0.0001, "num_input_tokens_seen": 193023888, "step": 89500 }, { "epoch": 16.425949715544135, "grad_norm": 0.014115799218416214, "learning_rate": 9.417394785389255e-07, "loss": 0.0001, "num_input_tokens_seen": 193033872, "step": 89505 }, { "epoch": 16.42686731510369, "grad_norm": 0.004591655917465687, "learning_rate": 9.412717756738726e-07, "loss": 0.0, "num_input_tokens_seen": 193044912, "step": 89510 }, { "epoch": 16.427784914663242, "grad_norm": 0.015162966214120388, "learning_rate": 9.408041769089071e-07, "loss": 0.0001, "num_input_tokens_seen": 193056912, "step": 89515 }, { "epoch": 16.428702514222792, "grad_norm": 56.682334899902344, "learning_rate": 9.403366822560245e-07, "loss": 0.1128, "num_input_tokens_seen": 193067600, "step": 89520 }, { "epoch": 16.429620113782345, "grad_norm": 0.07950101047754288, "learning_rate": 9.39869291727214e-07, "loss": 0.2188, "num_input_tokens_seen": 193078704, "step": 89525 }, { "epoch": 16.4305377133419, "grad_norm": 0.0026402168441563845, "learning_rate": 9.394020053344638e-07, "loss": 0.0001, "num_input_tokens_seen": 193089072, "step": 89530 }, { "epoch": 16.43145531290145, "grad_norm": 0.010556320659816265, "learning_rate": 9.389348230897582e-07, "loss": 0.0018, "num_input_tokens_seen": 193100912, "step": 89535 }, { "epoch": 16.432372912461002, "grad_norm": 0.004201831296086311, "learning_rate": 9.384677450050794e-07, "loss": 0.0, "num_input_tokens_seen": 193112688, "step": 89540 }, { "epoch": 16.433290512020555, "grad_norm": 0.00889518577605486, "learning_rate": 9.380007710924099e-07, "loss": 0.0002, "num_input_tokens_seen": 193121872, "step": 89545 }, { "epoch": 16.434208111580105, "grad_norm": 42.99888610839844, "learning_rate": 9.375339013637247e-07, "loss": 0.004, "num_input_tokens_seen": 193133328, "step": 89550 }, { "epoch": 16.43512571113966, "grad_norm": 0.0007693480583839118, "learning_rate": 9.370671358309968e-07, "loss": 0.0003, "num_input_tokens_seen": 193145072, "step": 89555 }, { "epoch": 16.436043310699212, "grad_norm": 0.004067132715135813, "learning_rate": 9.366004745062018e-07, "loss": 0.0, "num_input_tokens_seen": 193155664, "step": 89560 }, { "epoch": 16.43696091025876, "grad_norm": 247.2462158203125, "learning_rate": 9.361339174013073e-07, "loss": 0.1376, "num_input_tokens_seen": 193165936, "step": 89565 }, { "epoch": 16.437878509818315, "grad_norm": 0.0387708954513073, "learning_rate": 9.356674645282787e-07, "loss": 0.0001, "num_input_tokens_seen": 193176656, "step": 89570 }, { "epoch": 16.43879610937787, "grad_norm": 0.0006119761383160949, "learning_rate": 9.352011158990793e-07, "loss": 0.0, "num_input_tokens_seen": 193188560, "step": 89575 }, { "epoch": 16.43971370893742, "grad_norm": 0.0009509269148111343, "learning_rate": 9.347348715256732e-07, "loss": 0.0, "num_input_tokens_seen": 193199696, "step": 89580 }, { "epoch": 16.44063130849697, "grad_norm": 0.0025560399517416954, "learning_rate": 9.342687314200166e-07, "loss": 0.0001, "num_input_tokens_seen": 193210352, "step": 89585 }, { "epoch": 16.441548908056525, "grad_norm": 0.03901718556880951, "learning_rate": 9.338026955940643e-07, "loss": 0.0001, "num_input_tokens_seen": 193221104, "step": 89590 }, { "epoch": 16.442466507616075, "grad_norm": 0.0026366952806711197, "learning_rate": 9.333367640597723e-07, "loss": 0.0, "num_input_tokens_seen": 193231280, "step": 89595 }, { "epoch": 16.44338410717563, "grad_norm": 0.0007484052330255508, "learning_rate": 9.328709368290901e-07, "loss": 0.0001, "num_input_tokens_seen": 193242960, "step": 89600 }, { "epoch": 16.44430170673518, "grad_norm": 1.999498724937439, "learning_rate": 9.324052139139628e-07, "loss": 0.0003, "num_input_tokens_seen": 193252848, "step": 89605 }, { "epoch": 16.44521930629473, "grad_norm": 46.80891036987305, "learning_rate": 9.319395953263393e-07, "loss": 0.1349, "num_input_tokens_seen": 193264816, "step": 89610 }, { "epoch": 16.446136905854285, "grad_norm": 0.11718079447746277, "learning_rate": 9.314740810781603e-07, "loss": 0.0, "num_input_tokens_seen": 193275440, "step": 89615 }, { "epoch": 16.44705450541384, "grad_norm": 0.0008116415119729936, "learning_rate": 9.310086711813649e-07, "loss": 0.0, "num_input_tokens_seen": 193285776, "step": 89620 }, { "epoch": 16.44797210497339, "grad_norm": 0.03640303388237953, "learning_rate": 9.305433656478902e-07, "loss": 0.0001, "num_input_tokens_seen": 193294896, "step": 89625 }, { "epoch": 16.44888970453294, "grad_norm": 0.0015435147797688842, "learning_rate": 9.300781644896717e-07, "loss": 0.0001, "num_input_tokens_seen": 193306032, "step": 89630 }, { "epoch": 16.449807304092495, "grad_norm": 0.0034354152157902718, "learning_rate": 9.29613067718641e-07, "loss": 0.002, "num_input_tokens_seen": 193316400, "step": 89635 }, { "epoch": 16.450724903652045, "grad_norm": 0.001774715376086533, "learning_rate": 9.291480753467247e-07, "loss": 0.0883, "num_input_tokens_seen": 193327888, "step": 89640 }, { "epoch": 16.4516425032116, "grad_norm": 0.15674224495887756, "learning_rate": 9.286831873858531e-07, "loss": 0.0001, "num_input_tokens_seen": 193338224, "step": 89645 }, { "epoch": 16.45256010277115, "grad_norm": 0.0018743152031674981, "learning_rate": 9.282184038479469e-07, "loss": 0.0119, "num_input_tokens_seen": 193349872, "step": 89650 }, { "epoch": 16.4534777023307, "grad_norm": 0.029749715700745583, "learning_rate": 9.277537247449286e-07, "loss": 0.0, "num_input_tokens_seen": 193359888, "step": 89655 }, { "epoch": 16.454395301890255, "grad_norm": 0.0006344974390231073, "learning_rate": 9.272891500887143e-07, "loss": 0.0, "num_input_tokens_seen": 193370224, "step": 89660 }, { "epoch": 16.45531290144981, "grad_norm": 0.0005780108622275293, "learning_rate": 9.268246798912228e-07, "loss": 0.0001, "num_input_tokens_seen": 193381328, "step": 89665 }, { "epoch": 16.456230501009358, "grad_norm": 0.0013106856495141983, "learning_rate": 9.263603141643651e-07, "loss": 0.0001, "num_input_tokens_seen": 193392144, "step": 89670 }, { "epoch": 16.45714810056891, "grad_norm": 0.0012966298963874578, "learning_rate": 9.258960529200505e-07, "loss": 0.0, "num_input_tokens_seen": 193403344, "step": 89675 }, { "epoch": 16.458065700128465, "grad_norm": 0.0012173106661066413, "learning_rate": 9.254318961701892e-07, "loss": 0.0, "num_input_tokens_seen": 193414736, "step": 89680 }, { "epoch": 16.458983299688015, "grad_norm": 0.0007856190204620361, "learning_rate": 9.249678439266846e-07, "loss": 0.0079, "num_input_tokens_seen": 193425040, "step": 89685 }, { "epoch": 16.459900899247568, "grad_norm": 0.002202183473855257, "learning_rate": 9.24503896201438e-07, "loss": 0.1128, "num_input_tokens_seen": 193436592, "step": 89690 }, { "epoch": 16.46081849880712, "grad_norm": 0.026892123743891716, "learning_rate": 9.240400530063509e-07, "loss": 0.0, "num_input_tokens_seen": 193446576, "step": 89695 }, { "epoch": 16.46173609836667, "grad_norm": 0.011292500421404839, "learning_rate": 9.2357631435332e-07, "loss": 0.0001, "num_input_tokens_seen": 193458160, "step": 89700 }, { "epoch": 16.462653697926225, "grad_norm": 0.12118710577487946, "learning_rate": 9.231126802542378e-07, "loss": 0.3018, "num_input_tokens_seen": 193469584, "step": 89705 }, { "epoch": 16.463571297485778, "grad_norm": 0.03835306689143181, "learning_rate": 9.226491507209961e-07, "loss": 0.0001, "num_input_tokens_seen": 193480176, "step": 89710 }, { "epoch": 16.464488897045328, "grad_norm": 0.0012653187150135636, "learning_rate": 9.221857257654859e-07, "loss": 0.0001, "num_input_tokens_seen": 193491888, "step": 89715 }, { "epoch": 16.46540649660488, "grad_norm": 0.00045983237214386463, "learning_rate": 9.21722405399591e-07, "loss": 0.0016, "num_input_tokens_seen": 193503184, "step": 89720 }, { "epoch": 16.466324096164435, "grad_norm": 0.007792070973664522, "learning_rate": 9.212591896351963e-07, "loss": 0.0002, "num_input_tokens_seen": 193513392, "step": 89725 }, { "epoch": 16.467241695723985, "grad_norm": 0.014535455033183098, "learning_rate": 9.207960784841818e-07, "loss": 0.0001, "num_input_tokens_seen": 193524336, "step": 89730 }, { "epoch": 16.468159295283538, "grad_norm": 56.77440643310547, "learning_rate": 9.203330719584241e-07, "loss": 0.0822, "num_input_tokens_seen": 193535408, "step": 89735 }, { "epoch": 16.46907689484309, "grad_norm": 0.9967179894447327, "learning_rate": 9.19870170069802e-07, "loss": 0.0001, "num_input_tokens_seen": 193545520, "step": 89740 }, { "epoch": 16.46999449440264, "grad_norm": 0.003427850781008601, "learning_rate": 9.194073728301861e-07, "loss": 0.0011, "num_input_tokens_seen": 193555248, "step": 89745 }, { "epoch": 16.470912093962195, "grad_norm": 0.11688897758722305, "learning_rate": 9.189446802514468e-07, "loss": 0.0011, "num_input_tokens_seen": 193566064, "step": 89750 }, { "epoch": 16.471829693521748, "grad_norm": 194.04296875, "learning_rate": 9.184820923454513e-07, "loss": 0.0144, "num_input_tokens_seen": 193575376, "step": 89755 }, { "epoch": 16.472747293081298, "grad_norm": 0.11027471721172333, "learning_rate": 9.180196091240628e-07, "loss": 0.0005, "num_input_tokens_seen": 193585744, "step": 89760 }, { "epoch": 16.47366489264085, "grad_norm": 0.09060779213905334, "learning_rate": 9.175572305991465e-07, "loss": 0.0016, "num_input_tokens_seen": 193596592, "step": 89765 }, { "epoch": 16.474582492200405, "grad_norm": 0.002816141815856099, "learning_rate": 9.170949567825599e-07, "loss": 0.0, "num_input_tokens_seen": 193607696, "step": 89770 }, { "epoch": 16.475500091759955, "grad_norm": 0.008820701390504837, "learning_rate": 9.166327876861586e-07, "loss": 0.0001, "num_input_tokens_seen": 193618960, "step": 89775 }, { "epoch": 16.476417691319508, "grad_norm": 0.10383614897727966, "learning_rate": 9.161707233217987e-07, "loss": 0.0041, "num_input_tokens_seen": 193630608, "step": 89780 }, { "epoch": 16.47733529087906, "grad_norm": 0.012902063317596912, "learning_rate": 9.157087637013307e-07, "loss": 0.0016, "num_input_tokens_seen": 193642128, "step": 89785 }, { "epoch": 16.47825289043861, "grad_norm": 0.0029515251517295837, "learning_rate": 9.152469088366023e-07, "loss": 0.0001, "num_input_tokens_seen": 193653040, "step": 89790 }, { "epoch": 16.479170489998165, "grad_norm": 0.08958809822797775, "learning_rate": 9.147851587394591e-07, "loss": 0.0004, "num_input_tokens_seen": 193662512, "step": 89795 }, { "epoch": 16.480088089557718, "grad_norm": 0.03519917279481888, "learning_rate": 9.14323513421746e-07, "loss": 0.0, "num_input_tokens_seen": 193672944, "step": 89800 }, { "epoch": 16.481005689117268, "grad_norm": 0.006088599096983671, "learning_rate": 9.13861972895303e-07, "loss": 0.0, "num_input_tokens_seen": 193684592, "step": 89805 }, { "epoch": 16.48192328867682, "grad_norm": 0.013195598497986794, "learning_rate": 9.134005371719656e-07, "loss": 0.0, "num_input_tokens_seen": 193696080, "step": 89810 }, { "epoch": 16.482840888236375, "grad_norm": 72.19019317626953, "learning_rate": 9.129392062635728e-07, "loss": 0.1241, "num_input_tokens_seen": 193707760, "step": 89815 }, { "epoch": 16.483758487795924, "grad_norm": 0.0005438540247268975, "learning_rate": 9.124779801819544e-07, "loss": 0.0, "num_input_tokens_seen": 193717392, "step": 89820 }, { "epoch": 16.484676087355478, "grad_norm": 0.0016931751742959023, "learning_rate": 9.120168589389395e-07, "loss": 0.0, "num_input_tokens_seen": 193729616, "step": 89825 }, { "epoch": 16.48559368691503, "grad_norm": 0.005853032227605581, "learning_rate": 9.115558425463577e-07, "loss": 0.0, "num_input_tokens_seen": 193739440, "step": 89830 }, { "epoch": 16.48651128647458, "grad_norm": 0.0008116248645819724, "learning_rate": 9.110949310160322e-07, "loss": 0.0001, "num_input_tokens_seen": 193749776, "step": 89835 }, { "epoch": 16.487428886034134, "grad_norm": 0.39006170630455017, "learning_rate": 9.106341243597844e-07, "loss": 0.0364, "num_input_tokens_seen": 193759024, "step": 89840 }, { "epoch": 16.488346485593688, "grad_norm": 0.001475230441428721, "learning_rate": 9.10173422589432e-07, "loss": 0.0004, "num_input_tokens_seen": 193769744, "step": 89845 }, { "epoch": 16.489264085153238, "grad_norm": 0.0019875983707606792, "learning_rate": 9.097128257167937e-07, "loss": 0.0001, "num_input_tokens_seen": 193781328, "step": 89850 }, { "epoch": 16.49018168471279, "grad_norm": 1.5848443508148193, "learning_rate": 9.092523337536824e-07, "loss": 0.0, "num_input_tokens_seen": 193792016, "step": 89855 }, { "epoch": 16.491099284272345, "grad_norm": 0.004513351246714592, "learning_rate": 9.087919467119071e-07, "loss": 0.0, "num_input_tokens_seen": 193802448, "step": 89860 }, { "epoch": 16.492016883831894, "grad_norm": 0.004111594520509243, "learning_rate": 9.083316646032791e-07, "loss": 0.0003, "num_input_tokens_seen": 193812720, "step": 89865 }, { "epoch": 16.492934483391448, "grad_norm": 0.003425885457545519, "learning_rate": 9.078714874396027e-07, "loss": 0.0001, "num_input_tokens_seen": 193824144, "step": 89870 }, { "epoch": 16.493852082951, "grad_norm": 0.0017214110121130943, "learning_rate": 9.074114152326785e-07, "loss": 0.0, "num_input_tokens_seen": 193835536, "step": 89875 }, { "epoch": 16.49476968251055, "grad_norm": 0.0039004269056022167, "learning_rate": 9.069514479943104e-07, "loss": 0.0001, "num_input_tokens_seen": 193846608, "step": 89880 }, { "epoch": 16.495687282070104, "grad_norm": 0.0010487941326573491, "learning_rate": 9.064915857362939e-07, "loss": 0.0001, "num_input_tokens_seen": 193858160, "step": 89885 }, { "epoch": 16.496604881629658, "grad_norm": 0.0015190686099231243, "learning_rate": 9.060318284704234e-07, "loss": 0.1314, "num_input_tokens_seen": 193869840, "step": 89890 }, { "epoch": 16.497522481189208, "grad_norm": 0.046276818960905075, "learning_rate": 9.055721762084907e-07, "loss": 0.0013, "num_input_tokens_seen": 193880624, "step": 89895 }, { "epoch": 16.49844008074876, "grad_norm": 0.03660232573747635, "learning_rate": 9.051126289622869e-07, "loss": 0.0001, "num_input_tokens_seen": 193891376, "step": 89900 }, { "epoch": 16.499357680308314, "grad_norm": 0.017806343734264374, "learning_rate": 9.046531867435976e-07, "loss": 0.0, "num_input_tokens_seen": 193902672, "step": 89905 }, { "epoch": 16.500275279867864, "grad_norm": 0.0053252410143613815, "learning_rate": 9.041938495642056e-07, "loss": 0.0001, "num_input_tokens_seen": 193913040, "step": 89910 }, { "epoch": 16.501192879427418, "grad_norm": 0.5039517283439636, "learning_rate": 9.037346174358946e-07, "loss": 0.0004, "num_input_tokens_seen": 193921712, "step": 89915 }, { "epoch": 16.50211047898697, "grad_norm": 0.006987251341342926, "learning_rate": 9.032754903704422e-07, "loss": 0.0, "num_input_tokens_seen": 193932592, "step": 89920 }, { "epoch": 16.50302807854652, "grad_norm": 0.010316450148820877, "learning_rate": 9.028164683796243e-07, "loss": 0.0, "num_input_tokens_seen": 193943888, "step": 89925 }, { "epoch": 16.503945678106074, "grad_norm": 0.010972080752253532, "learning_rate": 9.023575514752126e-07, "loss": 0.0, "num_input_tokens_seen": 193953520, "step": 89930 }, { "epoch": 16.504863277665628, "grad_norm": 0.006726501043885946, "learning_rate": 9.018987396689799e-07, "loss": 0.0001, "num_input_tokens_seen": 193964848, "step": 89935 }, { "epoch": 16.505780877225178, "grad_norm": 0.012297061271965504, "learning_rate": 9.01440032972693e-07, "loss": 0.0001, "num_input_tokens_seen": 193974416, "step": 89940 }, { "epoch": 16.50669847678473, "grad_norm": 0.049971193075180054, "learning_rate": 9.009814313981175e-07, "loss": 0.0, "num_input_tokens_seen": 193985200, "step": 89945 }, { "epoch": 16.507616076344284, "grad_norm": 0.0013301755534484982, "learning_rate": 9.005229349570155e-07, "loss": 0.0, "num_input_tokens_seen": 193997040, "step": 89950 }, { "epoch": 16.508533675903834, "grad_norm": 0.0013391553657129407, "learning_rate": 9.000645436611449e-07, "loss": 0.0001, "num_input_tokens_seen": 194008144, "step": 89955 }, { "epoch": 16.509451275463388, "grad_norm": 0.005652464460581541, "learning_rate": 8.996062575222659e-07, "loss": 0.1595, "num_input_tokens_seen": 194018832, "step": 89960 }, { "epoch": 16.51036887502294, "grad_norm": 0.009633148089051247, "learning_rate": 8.99148076552131e-07, "loss": 0.0, "num_input_tokens_seen": 194029840, "step": 89965 }, { "epoch": 16.51128647458249, "grad_norm": 0.027042658999562263, "learning_rate": 8.986900007624927e-07, "loss": 0.0, "num_input_tokens_seen": 194040880, "step": 89970 }, { "epoch": 16.512204074142044, "grad_norm": 0.0013190496247261763, "learning_rate": 8.982320301650988e-07, "loss": 0.0149, "num_input_tokens_seen": 194051696, "step": 89975 }, { "epoch": 16.513121673701598, "grad_norm": 0.0007591702742502093, "learning_rate": 8.977741647716953e-07, "loss": 0.0, "num_input_tokens_seen": 194061680, "step": 89980 }, { "epoch": 16.514039273261147, "grad_norm": 0.0020326324738562107, "learning_rate": 8.97316404594028e-07, "loss": 0.0, "num_input_tokens_seen": 194072400, "step": 89985 }, { "epoch": 16.5149568728207, "grad_norm": 0.004781431518495083, "learning_rate": 8.968587496438363e-07, "loss": 0.0, "num_input_tokens_seen": 194083408, "step": 89990 }, { "epoch": 16.515874472380254, "grad_norm": 0.005003937054425478, "learning_rate": 8.96401199932857e-07, "loss": 0.0284, "num_input_tokens_seen": 194094640, "step": 89995 }, { "epoch": 16.516792071939804, "grad_norm": 0.001023121178150177, "learning_rate": 8.959437554728279e-07, "loss": 0.0, "num_input_tokens_seen": 194105520, "step": 90000 }, { "epoch": 16.517709671499357, "grad_norm": 0.0010199298849329352, "learning_rate": 8.954864162754812e-07, "loss": 0.0, "num_input_tokens_seen": 194116688, "step": 90005 }, { "epoch": 16.51862727105891, "grad_norm": 0.002823412185534835, "learning_rate": 8.950291823525447e-07, "loss": 0.1376, "num_input_tokens_seen": 194127792, "step": 90010 }, { "epoch": 16.51954487061846, "grad_norm": 0.002209924627095461, "learning_rate": 8.945720537157493e-07, "loss": 0.0, "num_input_tokens_seen": 194139152, "step": 90015 }, { "epoch": 16.520462470178014, "grad_norm": 0.0015471770893782377, "learning_rate": 8.941150303768181e-07, "loss": 0.0763, "num_input_tokens_seen": 194149712, "step": 90020 }, { "epoch": 16.521380069737567, "grad_norm": 0.00085197709267959, "learning_rate": 8.936581123474725e-07, "loss": 0.0, "num_input_tokens_seen": 194160240, "step": 90025 }, { "epoch": 16.522297669297117, "grad_norm": 0.0024759990628808737, "learning_rate": 8.932012996394307e-07, "loss": 0.0244, "num_input_tokens_seen": 194171280, "step": 90030 }, { "epoch": 16.52321526885667, "grad_norm": 1.2089687585830688, "learning_rate": 8.927445922644118e-07, "loss": 0.0001, "num_input_tokens_seen": 194182096, "step": 90035 }, { "epoch": 16.524132868416224, "grad_norm": 0.0016554412432014942, "learning_rate": 8.922879902341286e-07, "loss": 0.0001, "num_input_tokens_seen": 194192336, "step": 90040 }, { "epoch": 16.525050467975774, "grad_norm": 38.12324905395508, "learning_rate": 8.918314935602912e-07, "loss": 0.0028, "num_input_tokens_seen": 194201648, "step": 90045 }, { "epoch": 16.525968067535327, "grad_norm": 0.002100370591506362, "learning_rate": 8.913751022546097e-07, "loss": 0.0, "num_input_tokens_seen": 194211952, "step": 90050 }, { "epoch": 16.52688566709488, "grad_norm": 0.0012643691152334213, "learning_rate": 8.909188163287891e-07, "loss": 0.0, "num_input_tokens_seen": 194222064, "step": 90055 }, { "epoch": 16.52780326665443, "grad_norm": 1.5794119834899902, "learning_rate": 8.904626357945312e-07, "loss": 0.0001, "num_input_tokens_seen": 194232016, "step": 90060 }, { "epoch": 16.528720866213984, "grad_norm": 0.36206987500190735, "learning_rate": 8.900065606635383e-07, "loss": 0.0001, "num_input_tokens_seen": 194241936, "step": 90065 }, { "epoch": 16.529638465773537, "grad_norm": 0.07044567912817001, "learning_rate": 8.895505909475077e-07, "loss": 0.0, "num_input_tokens_seen": 194252944, "step": 90070 }, { "epoch": 16.530556065333087, "grad_norm": 0.0022130445577204227, "learning_rate": 8.89094726658134e-07, "loss": 0.0079, "num_input_tokens_seen": 194264112, "step": 90075 }, { "epoch": 16.53147366489264, "grad_norm": 0.0006592866266146302, "learning_rate": 8.886389678071073e-07, "loss": 0.0001, "num_input_tokens_seen": 194275664, "step": 90080 }, { "epoch": 16.532391264452194, "grad_norm": 0.0020986972376704216, "learning_rate": 8.881833144061208e-07, "loss": 0.0006, "num_input_tokens_seen": 194286960, "step": 90085 }, { "epoch": 16.533308864011744, "grad_norm": 0.0024329645093530416, "learning_rate": 8.877277664668593e-07, "loss": 0.0207, "num_input_tokens_seen": 194298416, "step": 90090 }, { "epoch": 16.534226463571297, "grad_norm": 0.008771766908466816, "learning_rate": 8.872723240010061e-07, "loss": 0.0, "num_input_tokens_seen": 194308720, "step": 90095 }, { "epoch": 16.53514406313085, "grad_norm": 0.002712004352360964, "learning_rate": 8.868169870202447e-07, "loss": 0.0001, "num_input_tokens_seen": 194318448, "step": 90100 }, { "epoch": 16.5360616626904, "grad_norm": 0.01610410585999489, "learning_rate": 8.86361755536253e-07, "loss": 0.0001, "num_input_tokens_seen": 194328816, "step": 90105 }, { "epoch": 16.536979262249954, "grad_norm": 1.7588810920715332, "learning_rate": 8.859066295607066e-07, "loss": 0.0004, "num_input_tokens_seen": 194340336, "step": 90110 }, { "epoch": 16.537896861809507, "grad_norm": 0.005303609650582075, "learning_rate": 8.854516091052772e-07, "loss": 0.0, "num_input_tokens_seen": 194351632, "step": 90115 }, { "epoch": 16.538814461369057, "grad_norm": 0.024398934096097946, "learning_rate": 8.84996694181639e-07, "loss": 0.0, "num_input_tokens_seen": 194362384, "step": 90120 }, { "epoch": 16.53973206092861, "grad_norm": 0.007637404836714268, "learning_rate": 8.845418848014576e-07, "loss": 0.0329, "num_input_tokens_seen": 194373136, "step": 90125 }, { "epoch": 16.540649660488164, "grad_norm": 0.007020602002739906, "learning_rate": 8.840871809763973e-07, "loss": 0.0001, "num_input_tokens_seen": 194385040, "step": 90130 }, { "epoch": 16.541567260047714, "grad_norm": 0.005688077304512262, "learning_rate": 8.83632582718123e-07, "loss": 0.0157, "num_input_tokens_seen": 194396112, "step": 90135 }, { "epoch": 16.542484859607267, "grad_norm": 0.008605278097093105, "learning_rate": 8.83178090038293e-07, "loss": 0.0, "num_input_tokens_seen": 194407600, "step": 90140 }, { "epoch": 16.54340245916682, "grad_norm": 0.016149329021573067, "learning_rate": 8.827237029485647e-07, "loss": 0.0, "num_input_tokens_seen": 194419056, "step": 90145 }, { "epoch": 16.54432005872637, "grad_norm": 0.015772372484207153, "learning_rate": 8.822694214605904e-07, "loss": 0.0001, "num_input_tokens_seen": 194429552, "step": 90150 }, { "epoch": 16.545237658285924, "grad_norm": 0.0008954029181040823, "learning_rate": 8.818152455860251e-07, "loss": 0.0001, "num_input_tokens_seen": 194440816, "step": 90155 }, { "epoch": 16.546155257845477, "grad_norm": 0.007888159714639187, "learning_rate": 8.813611753365165e-07, "loss": 0.0, "num_input_tokens_seen": 194451760, "step": 90160 }, { "epoch": 16.547072857405027, "grad_norm": 0.004775665700435638, "learning_rate": 8.809072107237105e-07, "loss": 0.0001, "num_input_tokens_seen": 194461360, "step": 90165 }, { "epoch": 16.54799045696458, "grad_norm": 0.0038737922441214323, "learning_rate": 8.804533517592501e-07, "loss": 0.0016, "num_input_tokens_seen": 194472432, "step": 90170 }, { "epoch": 16.548908056524134, "grad_norm": 0.24493099749088287, "learning_rate": 8.799995984547754e-07, "loss": 0.0001, "num_input_tokens_seen": 194481968, "step": 90175 }, { "epoch": 16.549825656083684, "grad_norm": 0.02854134887456894, "learning_rate": 8.795459508219267e-07, "loss": 0.0, "num_input_tokens_seen": 194493168, "step": 90180 }, { "epoch": 16.550743255643237, "grad_norm": 0.2765294909477234, "learning_rate": 8.790924088723384e-07, "loss": 0.0208, "num_input_tokens_seen": 194503632, "step": 90185 }, { "epoch": 16.55166085520279, "grad_norm": 0.002865468617528677, "learning_rate": 8.78638972617643e-07, "loss": 0.0, "num_input_tokens_seen": 194513232, "step": 90190 }, { "epoch": 16.55257845476234, "grad_norm": 0.0016766446642577648, "learning_rate": 8.78185642069469e-07, "loss": 0.0, "num_input_tokens_seen": 194524560, "step": 90195 }, { "epoch": 16.553496054321894, "grad_norm": 0.0363948717713356, "learning_rate": 8.777324172394463e-07, "loss": 0.0002, "num_input_tokens_seen": 194535728, "step": 90200 }, { "epoch": 16.554413653881447, "grad_norm": 0.15718133747577667, "learning_rate": 8.772792981391981e-07, "loss": 0.0001, "num_input_tokens_seen": 194546896, "step": 90205 }, { "epoch": 16.555331253440997, "grad_norm": 0.0011988668702542782, "learning_rate": 8.768262847803466e-07, "loss": 0.0004, "num_input_tokens_seen": 194558224, "step": 90210 }, { "epoch": 16.55624885300055, "grad_norm": 0.010468191467225552, "learning_rate": 8.763733771745092e-07, "loss": 0.0003, "num_input_tokens_seen": 194569744, "step": 90215 }, { "epoch": 16.557166452560104, "grad_norm": 0.1351809799671173, "learning_rate": 8.75920575333305e-07, "loss": 0.0792, "num_input_tokens_seen": 194579792, "step": 90220 }, { "epoch": 16.558084052119654, "grad_norm": 0.012512320652604103, "learning_rate": 8.754678792683457e-07, "loss": 0.1221, "num_input_tokens_seen": 194590704, "step": 90225 }, { "epoch": 16.559001651679207, "grad_norm": 5.8038458824157715, "learning_rate": 8.750152889912422e-07, "loss": 0.0005, "num_input_tokens_seen": 194601552, "step": 90230 }, { "epoch": 16.55991925123876, "grad_norm": 0.0061522903852164745, "learning_rate": 8.745628045136045e-07, "loss": 0.1563, "num_input_tokens_seen": 194612656, "step": 90235 }, { "epoch": 16.56083685079831, "grad_norm": 0.011557064950466156, "learning_rate": 8.741104258470368e-07, "loss": 0.0, "num_input_tokens_seen": 194622352, "step": 90240 }, { "epoch": 16.561754450357864, "grad_norm": 0.002201853320002556, "learning_rate": 8.736581530031424e-07, "loss": 0.0, "num_input_tokens_seen": 194632208, "step": 90245 }, { "epoch": 16.562672049917417, "grad_norm": 0.0010036190506070852, "learning_rate": 8.7320598599352e-07, "loss": 0.0, "num_input_tokens_seen": 194641776, "step": 90250 }, { "epoch": 16.563589649476967, "grad_norm": 0.0025058239698410034, "learning_rate": 8.727539248297689e-07, "loss": 0.0008, "num_input_tokens_seen": 194651728, "step": 90255 }, { "epoch": 16.56450724903652, "grad_norm": 0.004631402436643839, "learning_rate": 8.72301969523483e-07, "loss": 0.0287, "num_input_tokens_seen": 194663440, "step": 90260 }, { "epoch": 16.565424848596074, "grad_norm": 0.9472517371177673, "learning_rate": 8.718501200862533e-07, "loss": 0.0005, "num_input_tokens_seen": 194674800, "step": 90265 }, { "epoch": 16.566342448155623, "grad_norm": 0.015216581523418427, "learning_rate": 8.713983765296713e-07, "loss": 0.0003, "num_input_tokens_seen": 194686256, "step": 90270 }, { "epoch": 16.567260047715177, "grad_norm": 0.01762322150170803, "learning_rate": 8.70946738865322e-07, "loss": 0.0012, "num_input_tokens_seen": 194698160, "step": 90275 }, { "epoch": 16.56817764727473, "grad_norm": 0.07197374105453491, "learning_rate": 8.704952071047879e-07, "loss": 0.0001, "num_input_tokens_seen": 194708272, "step": 90280 }, { "epoch": 16.56909524683428, "grad_norm": 0.002811331767588854, "learning_rate": 8.700437812596535e-07, "loss": 0.0, "num_input_tokens_seen": 194718992, "step": 90285 }, { "epoch": 16.570012846393833, "grad_norm": 9.442805290222168, "learning_rate": 8.695924613414946e-07, "loss": 0.0016, "num_input_tokens_seen": 194730352, "step": 90290 }, { "epoch": 16.570930445953387, "grad_norm": 0.0009408012847416103, "learning_rate": 8.691412473618876e-07, "loss": 0.0, "num_input_tokens_seen": 194739952, "step": 90295 }, { "epoch": 16.571848045512937, "grad_norm": 0.002884727204218507, "learning_rate": 8.686901393324043e-07, "loss": 0.1098, "num_input_tokens_seen": 194750416, "step": 90300 }, { "epoch": 16.57276564507249, "grad_norm": 0.19435593485832214, "learning_rate": 8.682391372646171e-07, "loss": 0.0001, "num_input_tokens_seen": 194762224, "step": 90305 }, { "epoch": 16.573683244632043, "grad_norm": 0.0011671569664031267, "learning_rate": 8.677882411700928e-07, "loss": 0.0, "num_input_tokens_seen": 194774064, "step": 90310 }, { "epoch": 16.574600844191593, "grad_norm": 0.00047487230040133, "learning_rate": 8.673374510603938e-07, "loss": 0.0, "num_input_tokens_seen": 194784048, "step": 90315 }, { "epoch": 16.575518443751147, "grad_norm": 0.0019511634018272161, "learning_rate": 8.668867669470859e-07, "loss": 0.0, "num_input_tokens_seen": 194794960, "step": 90320 }, { "epoch": 16.5764360433107, "grad_norm": 0.006808073725551367, "learning_rate": 8.664361888417267e-07, "loss": 0.0001, "num_input_tokens_seen": 194807376, "step": 90325 }, { "epoch": 16.57735364287025, "grad_norm": 0.000619700993411243, "learning_rate": 8.65985716755871e-07, "loss": 0.0001, "num_input_tokens_seen": 194817360, "step": 90330 }, { "epoch": 16.578271242429803, "grad_norm": 0.0033430270850658417, "learning_rate": 8.655353507010766e-07, "loss": 0.0, "num_input_tokens_seen": 194827728, "step": 90335 }, { "epoch": 16.579188841989357, "grad_norm": 0.0028748083859682083, "learning_rate": 8.650850906888919e-07, "loss": 0.0207, "num_input_tokens_seen": 194838736, "step": 90340 }, { "epoch": 16.580106441548907, "grad_norm": 0.0017520481487736106, "learning_rate": 8.646349367308666e-07, "loss": 0.0, "num_input_tokens_seen": 194849808, "step": 90345 }, { "epoch": 16.58102404110846, "grad_norm": 0.005665235687047243, "learning_rate": 8.641848888385446e-07, "loss": 0.0, "num_input_tokens_seen": 194861072, "step": 90350 }, { "epoch": 16.581941640668013, "grad_norm": 0.0006420230492949486, "learning_rate": 8.637349470234713e-07, "loss": 0.0, "num_input_tokens_seen": 194872336, "step": 90355 }, { "epoch": 16.582859240227563, "grad_norm": 0.0010070184944197536, "learning_rate": 8.632851112971857e-07, "loss": 0.0, "num_input_tokens_seen": 194882416, "step": 90360 }, { "epoch": 16.583776839787117, "grad_norm": 0.0006535726715810597, "learning_rate": 8.628353816712265e-07, "loss": 0.0002, "num_input_tokens_seen": 194893264, "step": 90365 }, { "epoch": 16.58469443934667, "grad_norm": 0.044577427208423615, "learning_rate": 8.62385758157126e-07, "loss": 0.172, "num_input_tokens_seen": 194904176, "step": 90370 }, { "epoch": 16.58561203890622, "grad_norm": 0.011214081197977066, "learning_rate": 8.619362407664195e-07, "loss": 0.0, "num_input_tokens_seen": 194915344, "step": 90375 }, { "epoch": 16.586529638465773, "grad_norm": 353.4297790527344, "learning_rate": 8.614868295106343e-07, "loss": 0.303, "num_input_tokens_seen": 194926704, "step": 90380 }, { "epoch": 16.587447238025327, "grad_norm": 0.0998634323477745, "learning_rate": 8.610375244012986e-07, "loss": 0.0, "num_input_tokens_seen": 194935984, "step": 90385 }, { "epoch": 16.588364837584876, "grad_norm": 0.0027484199963510036, "learning_rate": 8.605883254499353e-07, "loss": 0.0, "num_input_tokens_seen": 194946704, "step": 90390 }, { "epoch": 16.58928243714443, "grad_norm": 0.002033347962424159, "learning_rate": 8.601392326680664e-07, "loss": 0.0207, "num_input_tokens_seen": 194956528, "step": 90395 }, { "epoch": 16.590200036703983, "grad_norm": 0.05957028269767761, "learning_rate": 8.596902460672079e-07, "loss": 0.0, "num_input_tokens_seen": 194968336, "step": 90400 }, { "epoch": 16.591117636263533, "grad_norm": 0.004183938726782799, "learning_rate": 8.592413656588794e-07, "loss": 0.0, "num_input_tokens_seen": 194978160, "step": 90405 }, { "epoch": 16.592035235823086, "grad_norm": 0.013165384531021118, "learning_rate": 8.587925914545925e-07, "loss": 0.2758, "num_input_tokens_seen": 194989872, "step": 90410 }, { "epoch": 16.59295283538264, "grad_norm": 32.21806335449219, "learning_rate": 8.583439234658558e-07, "loss": 0.0007, "num_input_tokens_seen": 195000688, "step": 90415 }, { "epoch": 16.59387043494219, "grad_norm": 0.0006976668373681605, "learning_rate": 8.578953617041797e-07, "loss": 0.0002, "num_input_tokens_seen": 195010736, "step": 90420 }, { "epoch": 16.594788034501743, "grad_norm": 0.8339008688926697, "learning_rate": 8.574469061810681e-07, "loss": 0.0002, "num_input_tokens_seen": 195022288, "step": 90425 }, { "epoch": 16.595705634061297, "grad_norm": 0.7485107183456421, "learning_rate": 8.569985569080225e-07, "loss": 0.0001, "num_input_tokens_seen": 195033936, "step": 90430 }, { "epoch": 16.596623233620846, "grad_norm": 0.02633366361260414, "learning_rate": 8.56550313896542e-07, "loss": 0.0, "num_input_tokens_seen": 195043664, "step": 90435 }, { "epoch": 16.5975408331804, "grad_norm": 0.12525267899036407, "learning_rate": 8.561021771581257e-07, "loss": 0.0001, "num_input_tokens_seen": 195055056, "step": 90440 }, { "epoch": 16.598458432739953, "grad_norm": 0.006944332271814346, "learning_rate": 8.556541467042656e-07, "loss": 0.0, "num_input_tokens_seen": 195065424, "step": 90445 }, { "epoch": 16.599376032299503, "grad_norm": 0.000771279854234308, "learning_rate": 8.552062225464525e-07, "loss": 0.0883, "num_input_tokens_seen": 195073808, "step": 90450 }, { "epoch": 16.600293631859056, "grad_norm": 0.0011356078321114182, "learning_rate": 8.547584046961771e-07, "loss": 0.0001, "num_input_tokens_seen": 195084528, "step": 90455 }, { "epoch": 16.60121123141861, "grad_norm": 0.08499999344348907, "learning_rate": 8.543106931649236e-07, "loss": 0.0, "num_input_tokens_seen": 195093712, "step": 90460 }, { "epoch": 16.60212883097816, "grad_norm": 0.01553881075233221, "learning_rate": 8.538630879641752e-07, "loss": 0.0, "num_input_tokens_seen": 195104176, "step": 90465 }, { "epoch": 16.603046430537713, "grad_norm": 0.0007253449293784797, "learning_rate": 8.534155891054135e-07, "loss": 0.0001, "num_input_tokens_seen": 195115760, "step": 90470 }, { "epoch": 16.603964030097266, "grad_norm": 0.05465095117688179, "learning_rate": 8.529681966001152e-07, "loss": 0.0, "num_input_tokens_seen": 195125520, "step": 90475 }, { "epoch": 16.604881629656816, "grad_norm": 0.14936542510986328, "learning_rate": 8.525209104597553e-07, "loss": 0.0001, "num_input_tokens_seen": 195137360, "step": 90480 }, { "epoch": 16.60579922921637, "grad_norm": 0.004428161773830652, "learning_rate": 8.520737306958049e-07, "loss": 0.0, "num_input_tokens_seen": 195148848, "step": 90485 }, { "epoch": 16.606716828775923, "grad_norm": 0.3008459806442261, "learning_rate": 8.516266573197363e-07, "loss": 0.0001, "num_input_tokens_seen": 195159504, "step": 90490 }, { "epoch": 16.607634428335473, "grad_norm": 0.0013186698779463768, "learning_rate": 8.511796903430142e-07, "loss": 0.0, "num_input_tokens_seen": 195170768, "step": 90495 }, { "epoch": 16.608552027895026, "grad_norm": 0.0036297952756285667, "learning_rate": 8.507328297771017e-07, "loss": 0.0, "num_input_tokens_seen": 195180976, "step": 90500 }, { "epoch": 16.60946962745458, "grad_norm": 82.09978485107422, "learning_rate": 8.502860756334624e-07, "loss": 0.0246, "num_input_tokens_seen": 195192240, "step": 90505 }, { "epoch": 16.61038722701413, "grad_norm": 3.765239953994751, "learning_rate": 8.498394279235539e-07, "loss": 0.0004, "num_input_tokens_seen": 195202704, "step": 90510 }, { "epoch": 16.611304826573683, "grad_norm": 0.004582645837217569, "learning_rate": 8.493928866588308e-07, "loss": 0.0002, "num_input_tokens_seen": 195213584, "step": 90515 }, { "epoch": 16.612222426133236, "grad_norm": 0.3030933141708374, "learning_rate": 8.489464518507484e-07, "loss": 0.0001, "num_input_tokens_seen": 195224464, "step": 90520 }, { "epoch": 16.613140025692786, "grad_norm": 0.0008812754531390965, "learning_rate": 8.485001235107559e-07, "loss": 0.0002, "num_input_tokens_seen": 195233264, "step": 90525 }, { "epoch": 16.61405762525234, "grad_norm": 0.01798701472580433, "learning_rate": 8.480539016503009e-07, "loss": 0.1751, "num_input_tokens_seen": 195243920, "step": 90530 }, { "epoch": 16.614975224811893, "grad_norm": 0.2380378097295761, "learning_rate": 8.476077862808274e-07, "loss": 0.0, "num_input_tokens_seen": 195254096, "step": 90535 }, { "epoch": 16.615892824371443, "grad_norm": 0.01136383879929781, "learning_rate": 8.471617774137797e-07, "loss": 0.0, "num_input_tokens_seen": 195265104, "step": 90540 }, { "epoch": 16.616810423930996, "grad_norm": 0.003541007172316313, "learning_rate": 8.467158750605964e-07, "loss": 0.0, "num_input_tokens_seen": 195276496, "step": 90545 }, { "epoch": 16.61772802349055, "grad_norm": 0.04541609436273575, "learning_rate": 8.462700792327122e-07, "loss": 0.0, "num_input_tokens_seen": 195287088, "step": 90550 }, { "epoch": 16.6186456230501, "grad_norm": 0.01355646550655365, "learning_rate": 8.458243899415641e-07, "loss": 0.0588, "num_input_tokens_seen": 195298512, "step": 90555 }, { "epoch": 16.619563222609653, "grad_norm": 0.0012191908899694681, "learning_rate": 8.453788071985824e-07, "loss": 0.0, "num_input_tokens_seen": 195309136, "step": 90560 }, { "epoch": 16.620480822169206, "grad_norm": 0.1219090148806572, "learning_rate": 8.449333310151947e-07, "loss": 0.0011, "num_input_tokens_seen": 195321008, "step": 90565 }, { "epoch": 16.621398421728756, "grad_norm": 0.1374788135290146, "learning_rate": 8.444879614028262e-07, "loss": 0.0716, "num_input_tokens_seen": 195333136, "step": 90570 }, { "epoch": 16.62231602128831, "grad_norm": 0.20141422748565674, "learning_rate": 8.440426983729027e-07, "loss": 0.0003, "num_input_tokens_seen": 195344656, "step": 90575 }, { "epoch": 16.623233620847863, "grad_norm": 0.0017670025117695332, "learning_rate": 8.435975419368425e-07, "loss": 0.0, "num_input_tokens_seen": 195355952, "step": 90580 }, { "epoch": 16.624151220407413, "grad_norm": 0.004745585843920708, "learning_rate": 8.431524921060635e-07, "loss": 0.0, "num_input_tokens_seen": 195365296, "step": 90585 }, { "epoch": 16.625068819966966, "grad_norm": 0.04368383064866066, "learning_rate": 8.427075488919801e-07, "loss": 0.0, "num_input_tokens_seen": 195377104, "step": 90590 }, { "epoch": 16.62598641952652, "grad_norm": 0.08134032785892487, "learning_rate": 8.42262712306004e-07, "loss": 0.0207, "num_input_tokens_seen": 195387920, "step": 90595 }, { "epoch": 16.62690401908607, "grad_norm": 0.0064799049869179726, "learning_rate": 8.418179823595468e-07, "loss": 0.0008, "num_input_tokens_seen": 195400176, "step": 90600 }, { "epoch": 16.627821618645623, "grad_norm": 44.80997085571289, "learning_rate": 8.413733590640138e-07, "loss": 0.0436, "num_input_tokens_seen": 195411280, "step": 90605 }, { "epoch": 16.628739218205176, "grad_norm": 0.0010105166584253311, "learning_rate": 8.409288424308088e-07, "loss": 0.0025, "num_input_tokens_seen": 195422448, "step": 90610 }, { "epoch": 16.629656817764726, "grad_norm": 0.006502483040094376, "learning_rate": 8.40484432471333e-07, "loss": 0.0, "num_input_tokens_seen": 195433552, "step": 90615 }, { "epoch": 16.63057441732428, "grad_norm": 0.008282767608761787, "learning_rate": 8.400401291969834e-07, "loss": 0.0, "num_input_tokens_seen": 195444848, "step": 90620 }, { "epoch": 16.631492016883833, "grad_norm": 0.012818777933716774, "learning_rate": 8.395959326191583e-07, "loss": 0.0, "num_input_tokens_seen": 195455600, "step": 90625 }, { "epoch": 16.632409616443383, "grad_norm": 0.0023706622887402773, "learning_rate": 8.391518427492501e-07, "loss": 0.0245, "num_input_tokens_seen": 195466192, "step": 90630 }, { "epoch": 16.633327216002936, "grad_norm": 0.0013919181656092405, "learning_rate": 8.387078595986464e-07, "loss": 0.0001, "num_input_tokens_seen": 195476912, "step": 90635 }, { "epoch": 16.63424481556249, "grad_norm": 0.03158222883939743, "learning_rate": 8.382639831787387e-07, "loss": 0.0, "num_input_tokens_seen": 195488880, "step": 90640 }, { "epoch": 16.63516241512204, "grad_norm": 0.0015974874841049314, "learning_rate": 8.378202135009089e-07, "loss": 0.0, "num_input_tokens_seen": 195499312, "step": 90645 }, { "epoch": 16.636080014681593, "grad_norm": 0.0006341601256281137, "learning_rate": 8.373765505765391e-07, "loss": 0.0002, "num_input_tokens_seen": 195508816, "step": 90650 }, { "epoch": 16.636997614241146, "grad_norm": 0.0005238446174189448, "learning_rate": 8.369329944170107e-07, "loss": 0.0004, "num_input_tokens_seen": 195519504, "step": 90655 }, { "epoch": 16.637915213800696, "grad_norm": 0.031364865601062775, "learning_rate": 8.364895450336985e-07, "loss": 0.0, "num_input_tokens_seen": 195529008, "step": 90660 }, { "epoch": 16.63883281336025, "grad_norm": 0.0019884512294083834, "learning_rate": 8.360462024379762e-07, "loss": 0.0001, "num_input_tokens_seen": 195537872, "step": 90665 }, { "epoch": 16.639750412919803, "grad_norm": 0.08573759347200394, "learning_rate": 8.356029666412147e-07, "loss": 0.0002, "num_input_tokens_seen": 195548496, "step": 90670 }, { "epoch": 16.640668012479352, "grad_norm": 0.0022347222547978163, "learning_rate": 8.351598376547837e-07, "loss": 0.1863, "num_input_tokens_seen": 195559856, "step": 90675 }, { "epoch": 16.641585612038906, "grad_norm": 0.07710401713848114, "learning_rate": 8.347168154900481e-07, "loss": 0.0, "num_input_tokens_seen": 195569872, "step": 90680 }, { "epoch": 16.64250321159846, "grad_norm": 0.08398273587226868, "learning_rate": 8.342739001583699e-07, "loss": 0.0016, "num_input_tokens_seen": 195580848, "step": 90685 }, { "epoch": 16.64342081115801, "grad_norm": 0.005583843681961298, "learning_rate": 8.338310916711106e-07, "loss": 0.0, "num_input_tokens_seen": 195591696, "step": 90690 }, { "epoch": 16.644338410717562, "grad_norm": 0.020809371024370193, "learning_rate": 8.333883900396267e-07, "loss": 0.0002, "num_input_tokens_seen": 195602096, "step": 90695 }, { "epoch": 16.645256010277116, "grad_norm": 0.014597396366298199, "learning_rate": 8.329457952752729e-07, "loss": 0.0, "num_input_tokens_seen": 195613648, "step": 90700 }, { "epoch": 16.646173609836666, "grad_norm": 0.003998119849711657, "learning_rate": 8.325033073894001e-07, "loss": 0.0329, "num_input_tokens_seen": 195624080, "step": 90705 }, { "epoch": 16.64709120939622, "grad_norm": 0.005756031256169081, "learning_rate": 8.320609263933593e-07, "loss": 0.0852, "num_input_tokens_seen": 195634704, "step": 90710 }, { "epoch": 16.648008808955773, "grad_norm": 0.014024770818650723, "learning_rate": 8.316186522984964e-07, "loss": 0.0426, "num_input_tokens_seen": 195646192, "step": 90715 }, { "epoch": 16.648926408515322, "grad_norm": 105.1369857788086, "learning_rate": 8.311764851161535e-07, "loss": 0.0207, "num_input_tokens_seen": 195656496, "step": 90720 }, { "epoch": 16.649844008074876, "grad_norm": 0.1206565573811531, "learning_rate": 8.307344248576738e-07, "loss": 0.1565, "num_input_tokens_seen": 195667824, "step": 90725 }, { "epoch": 16.65076160763443, "grad_norm": 0.0027603795751929283, "learning_rate": 8.302924715343941e-07, "loss": 0.0001, "num_input_tokens_seen": 195676720, "step": 90730 }, { "epoch": 16.65167920719398, "grad_norm": 0.0013274761149659753, "learning_rate": 8.298506251576494e-07, "loss": 0.0, "num_input_tokens_seen": 195687024, "step": 90735 }, { "epoch": 16.652596806753532, "grad_norm": 0.0007765176706016064, "learning_rate": 8.294088857387733e-07, "loss": 0.0001, "num_input_tokens_seen": 195696656, "step": 90740 }, { "epoch": 16.653514406313086, "grad_norm": 0.029385492205619812, "learning_rate": 8.289672532890963e-07, "loss": 0.0, "num_input_tokens_seen": 195707632, "step": 90745 }, { "epoch": 16.654432005872636, "grad_norm": 0.0014248547377064824, "learning_rate": 8.285257278199443e-07, "loss": 0.0001, "num_input_tokens_seen": 195718288, "step": 90750 }, { "epoch": 16.65534960543219, "grad_norm": 0.000817815016489476, "learning_rate": 8.28084309342641e-07, "loss": 0.0, "num_input_tokens_seen": 195729072, "step": 90755 }, { "epoch": 16.656267204991742, "grad_norm": 0.006514077074825764, "learning_rate": 8.276429978685108e-07, "loss": 0.0, "num_input_tokens_seen": 195738736, "step": 90760 }, { "epoch": 16.657184804551292, "grad_norm": 23.69154930114746, "learning_rate": 8.272017934088706e-07, "loss": 0.1007, "num_input_tokens_seen": 195749104, "step": 90765 }, { "epoch": 16.658102404110846, "grad_norm": 0.15837454795837402, "learning_rate": 8.267606959750363e-07, "loss": 0.0003, "num_input_tokens_seen": 195758608, "step": 90770 }, { "epoch": 16.6590200036704, "grad_norm": 0.024546343833208084, "learning_rate": 8.263197055783234e-07, "loss": 0.0, "num_input_tokens_seen": 195768720, "step": 90775 }, { "epoch": 16.65993760322995, "grad_norm": 0.09288967400789261, "learning_rate": 8.258788222300413e-07, "loss": 0.0, "num_input_tokens_seen": 195780432, "step": 90780 }, { "epoch": 16.660855202789502, "grad_norm": 0.0348011814057827, "learning_rate": 8.254380459414984e-07, "loss": 0.0285, "num_input_tokens_seen": 195791792, "step": 90785 }, { "epoch": 16.661772802349056, "grad_norm": 0.6497382521629333, "learning_rate": 8.249973767239983e-07, "loss": 0.0, "num_input_tokens_seen": 195801264, "step": 90790 }, { "epoch": 16.662690401908606, "grad_norm": 0.0024393375497311354, "learning_rate": 8.24556814588846e-07, "loss": 0.0207, "num_input_tokens_seen": 195813232, "step": 90795 }, { "epoch": 16.66360800146816, "grad_norm": 0.2003280371427536, "learning_rate": 8.2411635954734e-07, "loss": 0.1533, "num_input_tokens_seen": 195824144, "step": 90800 }, { "epoch": 16.664525601027712, "grad_norm": 0.0012159928446635604, "learning_rate": 8.236760116107773e-07, "loss": 0.0, "num_input_tokens_seen": 195834704, "step": 90805 }, { "epoch": 16.665443200587262, "grad_norm": 0.009249920956790447, "learning_rate": 8.232357707904521e-07, "loss": 0.0, "num_input_tokens_seen": 195844976, "step": 90810 }, { "epoch": 16.666360800146816, "grad_norm": 0.007554563228040934, "learning_rate": 8.227956370976553e-07, "loss": 0.0008, "num_input_tokens_seen": 195856432, "step": 90815 }, { "epoch": 16.66727839970637, "grad_norm": 0.0010635642101988196, "learning_rate": 8.22355610543677e-07, "loss": 0.0944, "num_input_tokens_seen": 195867248, "step": 90820 }, { "epoch": 16.66819599926592, "grad_norm": 5.713583469390869, "learning_rate": 8.219156911398024e-07, "loss": 0.001, "num_input_tokens_seen": 195878480, "step": 90825 }, { "epoch": 16.669113598825472, "grad_norm": 0.008615891449153423, "learning_rate": 8.214758788973154e-07, "loss": 0.0, "num_input_tokens_seen": 195889104, "step": 90830 }, { "epoch": 16.670031198385026, "grad_norm": 0.006472376640886068, "learning_rate": 8.210361738274946e-07, "loss": 0.016, "num_input_tokens_seen": 195900400, "step": 90835 }, { "epoch": 16.670948797944575, "grad_norm": 0.020654521882534027, "learning_rate": 8.205965759416202e-07, "loss": 0.0001, "num_input_tokens_seen": 195911920, "step": 90840 }, { "epoch": 16.67186639750413, "grad_norm": 0.008517682552337646, "learning_rate": 8.201570852509661e-07, "loss": 0.0002, "num_input_tokens_seen": 195922320, "step": 90845 }, { "epoch": 16.672783997063682, "grad_norm": 96.65092468261719, "learning_rate": 8.197177017668051e-07, "loss": 0.0376, "num_input_tokens_seen": 195931824, "step": 90850 }, { "epoch": 16.673701596623232, "grad_norm": 0.06191125884652138, "learning_rate": 8.192784255004043e-07, "loss": 0.0002, "num_input_tokens_seen": 195942736, "step": 90855 }, { "epoch": 16.674619196182785, "grad_norm": 0.0032466743141412735, "learning_rate": 8.188392564630337e-07, "loss": 0.0532, "num_input_tokens_seen": 195954000, "step": 90860 }, { "epoch": 16.67553679574234, "grad_norm": 0.0010285666212439537, "learning_rate": 8.184001946659564e-07, "loss": 0.0033, "num_input_tokens_seen": 195964208, "step": 90865 }, { "epoch": 16.67645439530189, "grad_norm": 0.00597724225372076, "learning_rate": 8.179612401204317e-07, "loss": 0.0, "num_input_tokens_seen": 195975024, "step": 90870 }, { "epoch": 16.677371994861442, "grad_norm": 0.006184173282235861, "learning_rate": 8.175223928377207e-07, "loss": 0.0174, "num_input_tokens_seen": 195987344, "step": 90875 }, { "epoch": 16.678289594420995, "grad_norm": 0.0009346709703095257, "learning_rate": 8.170836528290783e-07, "loss": 0.0885, "num_input_tokens_seen": 195998544, "step": 90880 }, { "epoch": 16.679207193980545, "grad_norm": 0.0007965363911353052, "learning_rate": 8.166450201057574e-07, "loss": 0.0144, "num_input_tokens_seen": 196010128, "step": 90885 }, { "epoch": 16.6801247935401, "grad_norm": 0.0005966300959698856, "learning_rate": 8.162064946790066e-07, "loss": 0.0244, "num_input_tokens_seen": 196021040, "step": 90890 }, { "epoch": 16.681042393099652, "grad_norm": 0.0017076708609238267, "learning_rate": 8.157680765600762e-07, "loss": 0.0, "num_input_tokens_seen": 196033232, "step": 90895 }, { "epoch": 16.681959992659202, "grad_norm": 0.005279700271785259, "learning_rate": 8.1532976576021e-07, "loss": 0.018, "num_input_tokens_seen": 196043440, "step": 90900 }, { "epoch": 16.682877592218755, "grad_norm": 0.001465495559386909, "learning_rate": 8.148915622906478e-07, "loss": 0.0, "num_input_tokens_seen": 196055376, "step": 90905 }, { "epoch": 16.68379519177831, "grad_norm": 0.005856798496097326, "learning_rate": 8.144534661626324e-07, "loss": 0.1252, "num_input_tokens_seen": 196065104, "step": 90910 }, { "epoch": 16.68471279133786, "grad_norm": 0.0028674378991127014, "learning_rate": 8.140154773873988e-07, "loss": 0.0, "num_input_tokens_seen": 196074384, "step": 90915 }, { "epoch": 16.685630390897412, "grad_norm": 0.1993427872657776, "learning_rate": 8.135775959761788e-07, "loss": 0.0002, "num_input_tokens_seen": 196086128, "step": 90920 }, { "epoch": 16.686547990456965, "grad_norm": 142.40478515625, "learning_rate": 8.131398219402065e-07, "loss": 0.2943, "num_input_tokens_seen": 196097104, "step": 90925 }, { "epoch": 16.687465590016515, "grad_norm": 0.0009082212345674634, "learning_rate": 8.127021552907083e-07, "loss": 0.0, "num_input_tokens_seen": 196108144, "step": 90930 }, { "epoch": 16.68838318957607, "grad_norm": 0.0031995242461562157, "learning_rate": 8.122645960389108e-07, "loss": 0.0, "num_input_tokens_seen": 196119824, "step": 90935 }, { "epoch": 16.689300789135622, "grad_norm": 0.0006230033468455076, "learning_rate": 8.118271441960346e-07, "loss": 0.0, "num_input_tokens_seen": 196130992, "step": 90940 }, { "epoch": 16.690218388695172, "grad_norm": 0.14992690086364746, "learning_rate": 8.113897997733017e-07, "loss": 0.0, "num_input_tokens_seen": 196140912, "step": 90945 }, { "epoch": 16.691135988254725, "grad_norm": 0.0006418314296752214, "learning_rate": 8.109525627819293e-07, "loss": 0.0001, "num_input_tokens_seen": 196151888, "step": 90950 }, { "epoch": 16.69205358781428, "grad_norm": 0.017315136268734932, "learning_rate": 8.1051543323313e-07, "loss": 0.0, "num_input_tokens_seen": 196161936, "step": 90955 }, { "epoch": 16.69297118737383, "grad_norm": 0.01167511846870184, "learning_rate": 8.100784111381177e-07, "loss": 0.0, "num_input_tokens_seen": 196172752, "step": 90960 }, { "epoch": 16.693888786933382, "grad_norm": 0.001607775455340743, "learning_rate": 8.096414965081007e-07, "loss": 0.0, "num_input_tokens_seen": 196184496, "step": 90965 }, { "epoch": 16.694806386492935, "grad_norm": 0.03147733956575394, "learning_rate": 8.092046893542832e-07, "loss": 0.0, "num_input_tokens_seen": 196196208, "step": 90970 }, { "epoch": 16.695723986052485, "grad_norm": 0.0040649715811014175, "learning_rate": 8.087679896878715e-07, "loss": 0.0, "num_input_tokens_seen": 196205840, "step": 90975 }, { "epoch": 16.69664158561204, "grad_norm": 0.047735217958688736, "learning_rate": 8.083313975200651e-07, "loss": 0.0, "num_input_tokens_seen": 196216112, "step": 90980 }, { "epoch": 16.697559185171592, "grad_norm": 0.0022601268719881773, "learning_rate": 8.078949128620623e-07, "loss": 0.0, "num_input_tokens_seen": 196227760, "step": 90985 }, { "epoch": 16.69847678473114, "grad_norm": 0.0021660258062183857, "learning_rate": 8.074585357250564e-07, "loss": 0.0, "num_input_tokens_seen": 196238960, "step": 90990 }, { "epoch": 16.699394384290695, "grad_norm": 0.003898289753124118, "learning_rate": 8.07022266120242e-07, "loss": 0.0, "num_input_tokens_seen": 196249392, "step": 90995 }, { "epoch": 16.70031198385025, "grad_norm": 0.009282296523451805, "learning_rate": 8.065861040588086e-07, "loss": 0.0174, "num_input_tokens_seen": 196259856, "step": 91000 }, { "epoch": 16.7012295834098, "grad_norm": 0.0012705909321084619, "learning_rate": 8.06150049551942e-07, "loss": 0.1719, "num_input_tokens_seen": 196270736, "step": 91005 }, { "epoch": 16.70214718296935, "grad_norm": 0.0005237807054072618, "learning_rate": 8.057141026108256e-07, "loss": 0.0, "num_input_tokens_seen": 196281936, "step": 91010 }, { "epoch": 16.703064782528905, "grad_norm": 0.010283702053129673, "learning_rate": 8.052782632466427e-07, "loss": 0.0007, "num_input_tokens_seen": 196293488, "step": 91015 }, { "epoch": 16.703982382088455, "grad_norm": 0.004002475179731846, "learning_rate": 8.048425314705716e-07, "loss": 0.0006, "num_input_tokens_seen": 196304080, "step": 91020 }, { "epoch": 16.70489998164801, "grad_norm": 203.0802001953125, "learning_rate": 8.044069072937877e-07, "loss": 0.0131, "num_input_tokens_seen": 196315696, "step": 91025 }, { "epoch": 16.70581758120756, "grad_norm": 0.001340165501460433, "learning_rate": 8.039713907274643e-07, "loss": 0.0008, "num_input_tokens_seen": 196326320, "step": 91030 }, { "epoch": 16.70673518076711, "grad_norm": 0.0598638691008091, "learning_rate": 8.035359817827698e-07, "loss": 0.1252, "num_input_tokens_seen": 196337936, "step": 91035 }, { "epoch": 16.707652780326665, "grad_norm": 0.005161902867257595, "learning_rate": 8.031006804708746e-07, "loss": 0.0, "num_input_tokens_seen": 196347440, "step": 91040 }, { "epoch": 16.70857037988622, "grad_norm": 0.0008093648939393461, "learning_rate": 8.026654868029427e-07, "loss": 0.0003, "num_input_tokens_seen": 196358640, "step": 91045 }, { "epoch": 16.70948797944577, "grad_norm": 0.014193467795848846, "learning_rate": 8.022304007901355e-07, "loss": 0.0, "num_input_tokens_seen": 196368176, "step": 91050 }, { "epoch": 16.71040557900532, "grad_norm": 0.00039389514131471515, "learning_rate": 8.017954224436114e-07, "loss": 0.0003, "num_input_tokens_seen": 196379344, "step": 91055 }, { "epoch": 16.711323178564875, "grad_norm": 0.0013070147251710296, "learning_rate": 8.013605517745293e-07, "loss": 0.0, "num_input_tokens_seen": 196389840, "step": 91060 }, { "epoch": 16.712240778124425, "grad_norm": 0.041080161929130554, "learning_rate": 8.009257887940419e-07, "loss": 0.0006, "num_input_tokens_seen": 196400880, "step": 91065 }, { "epoch": 16.71315837768398, "grad_norm": 0.0006904122419655323, "learning_rate": 8.004911335132998e-07, "loss": 0.0, "num_input_tokens_seen": 196412432, "step": 91070 }, { "epoch": 16.71407597724353, "grad_norm": 177.7904510498047, "learning_rate": 8.000565859434506e-07, "loss": 0.0381, "num_input_tokens_seen": 196424240, "step": 91075 }, { "epoch": 16.71499357680308, "grad_norm": 0.003440719796344638, "learning_rate": 7.996221460956416e-07, "loss": 0.0, "num_input_tokens_seen": 196435312, "step": 91080 }, { "epoch": 16.715911176362635, "grad_norm": 0.02084927260875702, "learning_rate": 7.99187813981015e-07, "loss": 0.0, "num_input_tokens_seen": 196447120, "step": 91085 }, { "epoch": 16.71682877592219, "grad_norm": 0.015865257009863853, "learning_rate": 7.987535896107085e-07, "loss": 0.0001, "num_input_tokens_seen": 196458160, "step": 91090 }, { "epoch": 16.717746375481738, "grad_norm": 169.1433563232422, "learning_rate": 7.983194729958626e-07, "loss": 0.0056, "num_input_tokens_seen": 196469488, "step": 91095 }, { "epoch": 16.71866397504129, "grad_norm": 0.3910444378852844, "learning_rate": 7.978854641476102e-07, "loss": 0.0002, "num_input_tokens_seen": 196479728, "step": 91100 }, { "epoch": 16.719581574600845, "grad_norm": 0.005299154669046402, "learning_rate": 7.974515630770813e-07, "loss": 0.0589, "num_input_tokens_seen": 196491088, "step": 91105 }, { "epoch": 16.720499174160395, "grad_norm": 0.0020923782140016556, "learning_rate": 7.970177697954084e-07, "loss": 0.0943, "num_input_tokens_seen": 196501712, "step": 91110 }, { "epoch": 16.721416773719948, "grad_norm": 0.0035129121970385313, "learning_rate": 7.965840843137152e-07, "loss": 0.0, "num_input_tokens_seen": 196512976, "step": 91115 }, { "epoch": 16.7223343732795, "grad_norm": 0.004416747484356165, "learning_rate": 7.961505066431258e-07, "loss": 0.0001, "num_input_tokens_seen": 196523728, "step": 91120 }, { "epoch": 16.72325197283905, "grad_norm": 0.0018847519531846046, "learning_rate": 7.957170367947587e-07, "loss": 0.0, "num_input_tokens_seen": 196532848, "step": 91125 }, { "epoch": 16.724169572398605, "grad_norm": 0.0313459075987339, "learning_rate": 7.952836747797354e-07, "loss": 0.0001, "num_input_tokens_seen": 196542896, "step": 91130 }, { "epoch": 16.725087171958158, "grad_norm": 0.0025852255057543516, "learning_rate": 7.94850420609169e-07, "loss": 0.0, "num_input_tokens_seen": 196554352, "step": 91135 }, { "epoch": 16.726004771517708, "grad_norm": 0.09927059710025787, "learning_rate": 7.944172742941708e-07, "loss": 0.0, "num_input_tokens_seen": 196564400, "step": 91140 }, { "epoch": 16.72692237107726, "grad_norm": 0.009747988544404507, "learning_rate": 7.939842358458521e-07, "loss": 0.0009, "num_input_tokens_seen": 196575184, "step": 91145 }, { "epoch": 16.727839970636815, "grad_norm": 0.04275326430797577, "learning_rate": 7.935513052753197e-07, "loss": 0.0006, "num_input_tokens_seen": 196585008, "step": 91150 }, { "epoch": 16.728757570196365, "grad_norm": 0.0006166240200400352, "learning_rate": 7.931184825936766e-07, "loss": 0.0, "num_input_tokens_seen": 196595792, "step": 91155 }, { "epoch": 16.729675169755918, "grad_norm": 0.0008683218620717525, "learning_rate": 7.926857678120232e-07, "loss": 0.0051, "num_input_tokens_seen": 196605872, "step": 91160 }, { "epoch": 16.73059276931547, "grad_norm": 0.014179693534970284, "learning_rate": 7.922531609414602e-07, "loss": 0.0, "num_input_tokens_seen": 196616656, "step": 91165 }, { "epoch": 16.73151036887502, "grad_norm": 0.0024842943530529737, "learning_rate": 7.918206619930824e-07, "loss": 0.0, "num_input_tokens_seen": 196626288, "step": 91170 }, { "epoch": 16.732427968434575, "grad_norm": 0.0016531473957002163, "learning_rate": 7.913882709779813e-07, "loss": 0.0001, "num_input_tokens_seen": 196636784, "step": 91175 }, { "epoch": 16.733345567994128, "grad_norm": 0.008659818209707737, "learning_rate": 7.909559879072493e-07, "loss": 0.0, "num_input_tokens_seen": 196647664, "step": 91180 }, { "epoch": 16.734263167553678, "grad_norm": 8.17244815826416, "learning_rate": 7.905238127919729e-07, "loss": 0.001, "num_input_tokens_seen": 196658512, "step": 91185 }, { "epoch": 16.73518076711323, "grad_norm": 0.031950049102306366, "learning_rate": 7.900917456432355e-07, "loss": 0.222, "num_input_tokens_seen": 196668784, "step": 91190 }, { "epoch": 16.736098366672785, "grad_norm": 0.020425599068403244, "learning_rate": 7.896597864721212e-07, "loss": 0.0532, "num_input_tokens_seen": 196681008, "step": 91195 }, { "epoch": 16.737015966232335, "grad_norm": 0.19587144255638123, "learning_rate": 7.892279352897075e-07, "loss": 0.0, "num_input_tokens_seen": 196691152, "step": 91200 }, { "epoch": 16.737933565791888, "grad_norm": 0.003572298213839531, "learning_rate": 7.887961921070719e-07, "loss": 0.0004, "num_input_tokens_seen": 196702320, "step": 91205 }, { "epoch": 16.73885116535144, "grad_norm": 0.00350908562541008, "learning_rate": 7.883645569352854e-07, "loss": 0.0, "num_input_tokens_seen": 196712848, "step": 91210 }, { "epoch": 16.73976876491099, "grad_norm": 0.0012676736805588007, "learning_rate": 7.879330297854221e-07, "loss": 0.0, "num_input_tokens_seen": 196723728, "step": 91215 }, { "epoch": 16.740686364470545, "grad_norm": 0.001222264370881021, "learning_rate": 7.875016106685485e-07, "loss": 0.0, "num_input_tokens_seen": 196734384, "step": 91220 }, { "epoch": 16.741603964030098, "grad_norm": 77.40570831298828, "learning_rate": 7.870702995957297e-07, "loss": 0.1128, "num_input_tokens_seen": 196746096, "step": 91225 }, { "epoch": 16.742521563589648, "grad_norm": 0.4450254440307617, "learning_rate": 7.866390965780274e-07, "loss": 0.0001, "num_input_tokens_seen": 196757328, "step": 91230 }, { "epoch": 16.7434391631492, "grad_norm": 0.00496829254552722, "learning_rate": 7.862080016265028e-07, "loss": 0.0017, "num_input_tokens_seen": 196768304, "step": 91235 }, { "epoch": 16.744356762708755, "grad_norm": 0.0916224792599678, "learning_rate": 7.857770147522126e-07, "loss": 0.0, "num_input_tokens_seen": 196779440, "step": 91240 }, { "epoch": 16.745274362268304, "grad_norm": 0.0012005617609247565, "learning_rate": 7.853461359662101e-07, "loss": 0.0, "num_input_tokens_seen": 196789136, "step": 91245 }, { "epoch": 16.746191961827858, "grad_norm": 0.012319760397076607, "learning_rate": 7.849153652795472e-07, "loss": 0.0006, "num_input_tokens_seen": 196801168, "step": 91250 }, { "epoch": 16.74710956138741, "grad_norm": 0.019776804372668266, "learning_rate": 7.844847027032715e-07, "loss": 0.1252, "num_input_tokens_seen": 196812048, "step": 91255 }, { "epoch": 16.74802716094696, "grad_norm": 0.0038645807653665543, "learning_rate": 7.84054148248431e-07, "loss": 0.0119, "num_input_tokens_seen": 196822832, "step": 91260 }, { "epoch": 16.748944760506514, "grad_norm": 0.001899952651001513, "learning_rate": 7.836237019260667e-07, "loss": 0.0002, "num_input_tokens_seen": 196833872, "step": 91265 }, { "epoch": 16.749862360066068, "grad_norm": 0.05462019890546799, "learning_rate": 7.831933637472205e-07, "loss": 0.002, "num_input_tokens_seen": 196844272, "step": 91270 }, { "epoch": 16.750779959625618, "grad_norm": 0.00571186700835824, "learning_rate": 7.827631337229274e-07, "loss": 0.0, "num_input_tokens_seen": 196854704, "step": 91275 }, { "epoch": 16.75169755918517, "grad_norm": 0.0028004252817481756, "learning_rate": 7.823330118642253e-07, "loss": 0.0016, "num_input_tokens_seen": 196865744, "step": 91280 }, { "epoch": 16.752615158744725, "grad_norm": 5.5639729499816895, "learning_rate": 7.819029981821441e-07, "loss": 0.0126, "num_input_tokens_seen": 196876304, "step": 91285 }, { "epoch": 16.753532758304274, "grad_norm": 0.0013593476032838225, "learning_rate": 7.814730926877129e-07, "loss": 0.0, "num_input_tokens_seen": 196886928, "step": 91290 }, { "epoch": 16.754450357863828, "grad_norm": 0.0046011339873075485, "learning_rate": 7.8104329539196e-07, "loss": 0.0001, "num_input_tokens_seen": 196897552, "step": 91295 }, { "epoch": 16.75536795742338, "grad_norm": 0.0007775115082040429, "learning_rate": 7.806136063059072e-07, "loss": 0.0, "num_input_tokens_seen": 196908912, "step": 91300 }, { "epoch": 16.75628555698293, "grad_norm": 0.0030975916888564825, "learning_rate": 7.801840254405763e-07, "loss": 0.0, "num_input_tokens_seen": 196919632, "step": 91305 }, { "epoch": 16.757203156542484, "grad_norm": 0.001932831946760416, "learning_rate": 7.797545528069839e-07, "loss": 0.0, "num_input_tokens_seen": 196929808, "step": 91310 }, { "epoch": 16.758120756102038, "grad_norm": 0.02848479337990284, "learning_rate": 7.793251884161474e-07, "loss": 0.0028, "num_input_tokens_seen": 196941072, "step": 91315 }, { "epoch": 16.759038355661588, "grad_norm": 0.004269947297871113, "learning_rate": 7.788959322790784e-07, "loss": 0.0008, "num_input_tokens_seen": 196952336, "step": 91320 }, { "epoch": 16.75995595522114, "grad_norm": 0.0005332448636181653, "learning_rate": 7.784667844067856e-07, "loss": 0.1563, "num_input_tokens_seen": 196962512, "step": 91325 }, { "epoch": 16.760873554780694, "grad_norm": 0.03953023627400398, "learning_rate": 7.780377448102783e-07, "loss": 0.0, "num_input_tokens_seen": 196974320, "step": 91330 }, { "epoch": 16.761791154340244, "grad_norm": 0.0008683844353072345, "learning_rate": 7.776088135005594e-07, "loss": 0.0001, "num_input_tokens_seen": 196984656, "step": 91335 }, { "epoch": 16.762708753899798, "grad_norm": 0.006216045469045639, "learning_rate": 7.771799904886301e-07, "loss": 0.0, "num_input_tokens_seen": 196995856, "step": 91340 }, { "epoch": 16.76362635345935, "grad_norm": 0.01894180104136467, "learning_rate": 7.767512757854878e-07, "loss": 0.0, "num_input_tokens_seen": 197007312, "step": 91345 }, { "epoch": 16.7645439530189, "grad_norm": 0.0008220947347581387, "learning_rate": 7.763226694021314e-07, "loss": 0.0588, "num_input_tokens_seen": 197017296, "step": 91350 }, { "epoch": 16.765461552578454, "grad_norm": 0.0004054622258991003, "learning_rate": 7.758941713495522e-07, "loss": 0.0001, "num_input_tokens_seen": 197028560, "step": 91355 }, { "epoch": 16.766379152138008, "grad_norm": 0.004110845737159252, "learning_rate": 7.754657816387401e-07, "loss": 0.1719, "num_input_tokens_seen": 197041136, "step": 91360 }, { "epoch": 16.767296751697558, "grad_norm": 0.031095094978809357, "learning_rate": 7.750375002806837e-07, "loss": 0.093, "num_input_tokens_seen": 197052560, "step": 91365 }, { "epoch": 16.76821435125711, "grad_norm": 0.413229763507843, "learning_rate": 7.746093272863681e-07, "loss": 0.0001, "num_input_tokens_seen": 197061712, "step": 91370 }, { "epoch": 16.769131950816664, "grad_norm": 0.0011368467239663005, "learning_rate": 7.741812626667727e-07, "loss": 0.0, "num_input_tokens_seen": 197073744, "step": 91375 }, { "epoch": 16.770049550376214, "grad_norm": 0.014052903279662132, "learning_rate": 7.737533064328795e-07, "loss": 0.0005, "num_input_tokens_seen": 197085040, "step": 91380 }, { "epoch": 16.770967149935768, "grad_norm": 0.0047527183778584, "learning_rate": 7.733254585956646e-07, "loss": 0.0, "num_input_tokens_seen": 197096176, "step": 91385 }, { "epoch": 16.77188474949532, "grad_norm": 0.0728832483291626, "learning_rate": 7.728977191661002e-07, "loss": 0.0001, "num_input_tokens_seen": 197106288, "step": 91390 }, { "epoch": 16.77280234905487, "grad_norm": 0.01742025651037693, "learning_rate": 7.724700881551572e-07, "loss": 0.0, "num_input_tokens_seen": 197116624, "step": 91395 }, { "epoch": 16.773719948614424, "grad_norm": 0.0024375237990170717, "learning_rate": 7.720425655738056e-07, "loss": 0.0, "num_input_tokens_seen": 197126992, "step": 91400 }, { "epoch": 16.774637548173978, "grad_norm": 0.00859171710908413, "learning_rate": 7.716151514330094e-07, "loss": 0.0002, "num_input_tokens_seen": 197137296, "step": 91405 }, { "epoch": 16.775555147733527, "grad_norm": 0.002986755222082138, "learning_rate": 7.7118784574373e-07, "loss": 0.0733, "num_input_tokens_seen": 197148528, "step": 91410 }, { "epoch": 16.77647274729308, "grad_norm": 0.007815643213689327, "learning_rate": 7.707606485169289e-07, "loss": 0.0, "num_input_tokens_seen": 197159888, "step": 91415 }, { "epoch": 16.777390346852634, "grad_norm": 0.5170246362686157, "learning_rate": 7.703335597635631e-07, "loss": 0.0588, "num_input_tokens_seen": 197171120, "step": 91420 }, { "epoch": 16.778307946412184, "grad_norm": 0.0017278206069022417, "learning_rate": 7.699065794945848e-07, "loss": 0.0008, "num_input_tokens_seen": 197182064, "step": 91425 }, { "epoch": 16.779225545971737, "grad_norm": 0.021825691685080528, "learning_rate": 7.694797077209476e-07, "loss": 0.0, "num_input_tokens_seen": 197193968, "step": 91430 }, { "epoch": 16.78014314553129, "grad_norm": 0.018714481964707375, "learning_rate": 7.690529444535993e-07, "loss": 0.0532, "num_input_tokens_seen": 197203472, "step": 91435 }, { "epoch": 16.78106074509084, "grad_norm": 0.0006639250204898417, "learning_rate": 7.686262897034858e-07, "loss": 0.3943, "num_input_tokens_seen": 197214704, "step": 91440 }, { "epoch": 16.781978344650394, "grad_norm": 0.025825465098023415, "learning_rate": 7.681997434815497e-07, "loss": 0.0647, "num_input_tokens_seen": 197224976, "step": 91445 }, { "epoch": 16.782895944209947, "grad_norm": 0.001944826333783567, "learning_rate": 7.677733057987308e-07, "loss": 0.0, "num_input_tokens_seen": 197236592, "step": 91450 }, { "epoch": 16.783813543769497, "grad_norm": 0.0004065333923790604, "learning_rate": 7.67346976665968e-07, "loss": 0.0, "num_input_tokens_seen": 197247152, "step": 91455 }, { "epoch": 16.78473114332905, "grad_norm": 0.012368801049888134, "learning_rate": 7.66920756094195e-07, "loss": 0.0051, "num_input_tokens_seen": 197258128, "step": 91460 }, { "epoch": 16.785648742888604, "grad_norm": 3.789670705795288, "learning_rate": 7.664946440943444e-07, "loss": 0.0015, "num_input_tokens_seen": 197269040, "step": 91465 }, { "epoch": 16.786566342448154, "grad_norm": 0.002991480752825737, "learning_rate": 7.660686406773443e-07, "loss": 0.0119, "num_input_tokens_seen": 197279600, "step": 91470 }, { "epoch": 16.787483942007707, "grad_norm": 0.0004077666671946645, "learning_rate": 7.656427458541222e-07, "loss": 0.0, "num_input_tokens_seen": 197291184, "step": 91475 }, { "epoch": 16.78840154156726, "grad_norm": 0.0009275739430449903, "learning_rate": 7.652169596355997e-07, "loss": 0.0, "num_input_tokens_seen": 197300336, "step": 91480 }, { "epoch": 16.78931914112681, "grad_norm": 0.008856091648340225, "learning_rate": 7.647912820326997e-07, "loss": 0.0001, "num_input_tokens_seen": 197309584, "step": 91485 }, { "epoch": 16.790236740686364, "grad_norm": 0.0016234430950134993, "learning_rate": 7.643657130563392e-07, "loss": 0.0, "num_input_tokens_seen": 197321040, "step": 91490 }, { "epoch": 16.791154340245917, "grad_norm": 0.0013703631702810526, "learning_rate": 7.639402527174328e-07, "loss": 0.0, "num_input_tokens_seen": 197332400, "step": 91495 }, { "epoch": 16.792071939805467, "grad_norm": 0.0037527212407439947, "learning_rate": 7.635149010268944e-07, "loss": 0.0001, "num_input_tokens_seen": 197343376, "step": 91500 }, { "epoch": 16.79298953936502, "grad_norm": 112.38162231445312, "learning_rate": 7.630896579956331e-07, "loss": 0.0119, "num_input_tokens_seen": 197354768, "step": 91505 }, { "epoch": 16.793907138924574, "grad_norm": 0.0016409155214205384, "learning_rate": 7.626645236345543e-07, "loss": 0.0004, "num_input_tokens_seen": 197365776, "step": 91510 }, { "epoch": 16.794824738484124, "grad_norm": 0.0021259519271552563, "learning_rate": 7.622394979545644e-07, "loss": 0.0, "num_input_tokens_seen": 197375856, "step": 91515 }, { "epoch": 16.795742338043677, "grad_norm": 0.01128619909286499, "learning_rate": 7.618145809665634e-07, "loss": 0.1066, "num_input_tokens_seen": 197387408, "step": 91520 }, { "epoch": 16.79665993760323, "grad_norm": 0.007139102090150118, "learning_rate": 7.613897726814501e-07, "loss": 0.0, "num_input_tokens_seen": 197397776, "step": 91525 }, { "epoch": 16.79757753716278, "grad_norm": 0.0018696776824072003, "learning_rate": 7.609650731101181e-07, "loss": 0.0002, "num_input_tokens_seen": 197408528, "step": 91530 }, { "epoch": 16.798495136722334, "grad_norm": 0.0009139935136772692, "learning_rate": 7.605404822634637e-07, "loss": 0.0, "num_input_tokens_seen": 197419696, "step": 91535 }, { "epoch": 16.799412736281887, "grad_norm": 0.0010401054751127958, "learning_rate": 7.601160001523749e-07, "loss": 0.0, "num_input_tokens_seen": 197430544, "step": 91540 }, { "epoch": 16.800330335841437, "grad_norm": 0.012614122591912746, "learning_rate": 7.596916267877385e-07, "loss": 0.0057, "num_input_tokens_seen": 197440336, "step": 91545 }, { "epoch": 16.80124793540099, "grad_norm": 0.006166832055896521, "learning_rate": 7.592673621804414e-07, "loss": 0.0, "num_input_tokens_seen": 197451280, "step": 91550 }, { "epoch": 16.802165534960544, "grad_norm": 0.018524261191487312, "learning_rate": 7.588432063413637e-07, "loss": 0.0, "num_input_tokens_seen": 197461104, "step": 91555 }, { "epoch": 16.803083134520094, "grad_norm": 0.002005760557949543, "learning_rate": 7.584191592813839e-07, "loss": 0.0, "num_input_tokens_seen": 197471152, "step": 91560 }, { "epoch": 16.804000734079647, "grad_norm": 0.0016447808593511581, "learning_rate": 7.579952210113795e-07, "loss": 0.0, "num_input_tokens_seen": 197482416, "step": 91565 }, { "epoch": 16.8049183336392, "grad_norm": 0.01706874370574951, "learning_rate": 7.575713915422228e-07, "loss": 0.0, "num_input_tokens_seen": 197493200, "step": 91570 }, { "epoch": 16.80583593319875, "grad_norm": 0.0010100890649482608, "learning_rate": 7.571476708847853e-07, "loss": 0.0001, "num_input_tokens_seen": 197503696, "step": 91575 }, { "epoch": 16.806753532758304, "grad_norm": 0.04126507416367531, "learning_rate": 7.56724059049933e-07, "loss": 0.0, "num_input_tokens_seen": 197513968, "step": 91580 }, { "epoch": 16.807671132317857, "grad_norm": 0.0008158889249898493, "learning_rate": 7.563005560485332e-07, "loss": 0.0, "num_input_tokens_seen": 197524944, "step": 91585 }, { "epoch": 16.808588731877407, "grad_norm": 0.022053338587284088, "learning_rate": 7.558771618914468e-07, "loss": 0.0244, "num_input_tokens_seen": 197536336, "step": 91590 }, { "epoch": 16.80950633143696, "grad_norm": 0.0015771787147969007, "learning_rate": 7.554538765895325e-07, "loss": 0.0, "num_input_tokens_seen": 197547376, "step": 91595 }, { "epoch": 16.810423930996514, "grad_norm": 0.0030376266222447157, "learning_rate": 7.550307001536489e-07, "loss": 0.0001, "num_input_tokens_seen": 197558192, "step": 91600 }, { "epoch": 16.811341530556064, "grad_norm": 0.004032114986330271, "learning_rate": 7.546076325946489e-07, "loss": 0.0, "num_input_tokens_seen": 197569680, "step": 91605 }, { "epoch": 16.812259130115617, "grad_norm": 0.004111114889383316, "learning_rate": 7.541846739233832e-07, "loss": 0.0, "num_input_tokens_seen": 197580400, "step": 91610 }, { "epoch": 16.81317672967517, "grad_norm": 0.0006286129355430603, "learning_rate": 7.537618241506989e-07, "loss": 0.0, "num_input_tokens_seen": 197590704, "step": 91615 }, { "epoch": 16.81409432923472, "grad_norm": 0.014944261871278286, "learning_rate": 7.533390832874438e-07, "loss": 0.0119, "num_input_tokens_seen": 197601488, "step": 91620 }, { "epoch": 16.815011928794274, "grad_norm": 0.009242288768291473, "learning_rate": 7.529164513444598e-07, "loss": 0.0, "num_input_tokens_seen": 197613072, "step": 91625 }, { "epoch": 16.815929528353827, "grad_norm": 0.014938202686607838, "learning_rate": 7.524939283325849e-07, "loss": 0.0, "num_input_tokens_seen": 197624816, "step": 91630 }, { "epoch": 16.816847127913377, "grad_norm": 0.0012549484381452203, "learning_rate": 7.520715142626595e-07, "loss": 0.0, "num_input_tokens_seen": 197635632, "step": 91635 }, { "epoch": 16.81776472747293, "grad_norm": 0.023418327793478966, "learning_rate": 7.516492091455157e-07, "loss": 0.0, "num_input_tokens_seen": 197645968, "step": 91640 }, { "epoch": 16.818682327032484, "grad_norm": 0.0054061030969023705, "learning_rate": 7.51227012991984e-07, "loss": 0.0079, "num_input_tokens_seen": 197657104, "step": 91645 }, { "epoch": 16.819599926592034, "grad_norm": 0.0004243832372594625, "learning_rate": 7.508049258128958e-07, "loss": 0.0, "num_input_tokens_seen": 197666672, "step": 91650 }, { "epoch": 16.820517526151587, "grad_norm": 0.12824849784374237, "learning_rate": 7.503829476190754e-07, "loss": 0.0645, "num_input_tokens_seen": 197677456, "step": 91655 }, { "epoch": 16.82143512571114, "grad_norm": 0.0005987189360894263, "learning_rate": 7.499610784213468e-07, "loss": 0.0, "num_input_tokens_seen": 197688752, "step": 91660 }, { "epoch": 16.82235272527069, "grad_norm": 0.010203112848103046, "learning_rate": 7.495393182305288e-07, "loss": 0.0001, "num_input_tokens_seen": 197700560, "step": 91665 }, { "epoch": 16.823270324830244, "grad_norm": 134.34173583984375, "learning_rate": 7.491176670574396e-07, "loss": 0.0762, "num_input_tokens_seen": 197711792, "step": 91670 }, { "epoch": 16.824187924389797, "grad_norm": 0.003438713261857629, "learning_rate": 7.486961249128932e-07, "loss": 0.0, "num_input_tokens_seen": 197722992, "step": 91675 }, { "epoch": 16.825105523949347, "grad_norm": 0.12558548152446747, "learning_rate": 7.482746918077033e-07, "loss": 0.0097, "num_input_tokens_seen": 197734512, "step": 91680 }, { "epoch": 16.8260231235089, "grad_norm": 0.0012551293475553393, "learning_rate": 7.478533677526783e-07, "loss": 0.0, "num_input_tokens_seen": 197745104, "step": 91685 }, { "epoch": 16.826940723068454, "grad_norm": 0.0028429259546101093, "learning_rate": 7.474321527586237e-07, "loss": 0.0001, "num_input_tokens_seen": 197755568, "step": 91690 }, { "epoch": 16.827858322628003, "grad_norm": 0.0008160885190591216, "learning_rate": 7.470110468363428e-07, "loss": 0.0001, "num_input_tokens_seen": 197765712, "step": 91695 }, { "epoch": 16.828775922187557, "grad_norm": 0.0029964898712933064, "learning_rate": 7.465900499966378e-07, "loss": 0.0001, "num_input_tokens_seen": 197776464, "step": 91700 }, { "epoch": 16.82969352174711, "grad_norm": 0.0012938175350427628, "learning_rate": 7.461691622503059e-07, "loss": 0.0001, "num_input_tokens_seen": 197787280, "step": 91705 }, { "epoch": 16.83061112130666, "grad_norm": 0.2641587555408478, "learning_rate": 7.45748383608142e-07, "loss": 0.0001, "num_input_tokens_seen": 197797456, "step": 91710 }, { "epoch": 16.831528720866213, "grad_norm": 0.0034267238806933165, "learning_rate": 7.453277140809378e-07, "loss": 0.0, "num_input_tokens_seen": 197808560, "step": 91715 }, { "epoch": 16.832446320425767, "grad_norm": 0.6397250890731812, "learning_rate": 7.449071536794844e-07, "loss": 0.0001, "num_input_tokens_seen": 197818352, "step": 91720 }, { "epoch": 16.833363919985317, "grad_norm": 0.0033633606508374214, "learning_rate": 7.44486702414568e-07, "loss": 0.0, "num_input_tokens_seen": 197829168, "step": 91725 }, { "epoch": 16.83428151954487, "grad_norm": 0.0031723221763968468, "learning_rate": 7.440663602969711e-07, "loss": 0.0589, "num_input_tokens_seen": 197840432, "step": 91730 }, { "epoch": 16.835199119104423, "grad_norm": 0.911626935005188, "learning_rate": 7.436461273374768e-07, "loss": 0.0001, "num_input_tokens_seen": 197851728, "step": 91735 }, { "epoch": 16.836116718663973, "grad_norm": 0.0015707385027781129, "learning_rate": 7.432260035468625e-07, "loss": 0.0244, "num_input_tokens_seen": 197862544, "step": 91740 }, { "epoch": 16.837034318223527, "grad_norm": 0.0014856295892968774, "learning_rate": 7.428059889359029e-07, "loss": 0.0, "num_input_tokens_seen": 197872976, "step": 91745 }, { "epoch": 16.83795191778308, "grad_norm": 0.0038248514756560326, "learning_rate": 7.423860835153729e-07, "loss": 0.0, "num_input_tokens_seen": 197883696, "step": 91750 }, { "epoch": 16.83886951734263, "grad_norm": 8.945385932922363, "learning_rate": 7.419662872960409e-07, "loss": 0.0031, "num_input_tokens_seen": 197895024, "step": 91755 }, { "epoch": 16.839787116902183, "grad_norm": 0.0011580507270991802, "learning_rate": 7.415466002886745e-07, "loss": 0.1563, "num_input_tokens_seen": 197905552, "step": 91760 }, { "epoch": 16.840704716461737, "grad_norm": 0.008392835035920143, "learning_rate": 7.411270225040368e-07, "loss": 0.0005, "num_input_tokens_seen": 197914896, "step": 91765 }, { "epoch": 16.841622316021287, "grad_norm": 0.0027040361892431974, "learning_rate": 7.407075539528907e-07, "loss": 0.001, "num_input_tokens_seen": 197924016, "step": 91770 }, { "epoch": 16.84253991558084, "grad_norm": 0.0028261286206543446, "learning_rate": 7.402881946459956e-07, "loss": 0.0, "num_input_tokens_seen": 197935984, "step": 91775 }, { "epoch": 16.843457515140393, "grad_norm": 0.0007261421997100115, "learning_rate": 7.398689445941043e-07, "loss": 0.0003, "num_input_tokens_seen": 197946384, "step": 91780 }, { "epoch": 16.844375114699943, "grad_norm": 0.014174452051520348, "learning_rate": 7.394498038079734e-07, "loss": 0.0, "num_input_tokens_seen": 197956144, "step": 91785 }, { "epoch": 16.845292714259497, "grad_norm": 0.0027917944826185703, "learning_rate": 7.39030772298352e-07, "loss": 0.0, "num_input_tokens_seen": 197966448, "step": 91790 }, { "epoch": 16.84621031381905, "grad_norm": 0.03902242332696915, "learning_rate": 7.38611850075987e-07, "loss": 0.0007, "num_input_tokens_seen": 197975696, "step": 91795 }, { "epoch": 16.8471279133786, "grad_norm": 0.002107267966493964, "learning_rate": 7.381930371516227e-07, "loss": 0.0, "num_input_tokens_seen": 197986800, "step": 91800 }, { "epoch": 16.848045512938153, "grad_norm": 0.011019906029105186, "learning_rate": 7.377743335360027e-07, "loss": 0.0012, "num_input_tokens_seen": 197997104, "step": 91805 }, { "epoch": 16.848963112497707, "grad_norm": 0.4943808317184448, "learning_rate": 7.373557392398656e-07, "loss": 0.1285, "num_input_tokens_seen": 198006992, "step": 91810 }, { "epoch": 16.849880712057256, "grad_norm": 0.005262050777673721, "learning_rate": 7.369372542739456e-07, "loss": 0.0, "num_input_tokens_seen": 198019056, "step": 91815 }, { "epoch": 16.85079831161681, "grad_norm": 0.570947527885437, "learning_rate": 7.365188786489796e-07, "loss": 0.0003, "num_input_tokens_seen": 198029136, "step": 91820 }, { "epoch": 16.851715911176363, "grad_norm": 0.0024051612708717585, "learning_rate": 7.361006123756964e-07, "loss": 0.0, "num_input_tokens_seen": 198040368, "step": 91825 }, { "epoch": 16.852633510735913, "grad_norm": 0.001103961723856628, "learning_rate": 7.356824554648223e-07, "loss": 0.0, "num_input_tokens_seen": 198050032, "step": 91830 }, { "epoch": 16.853551110295466, "grad_norm": 0.013036997988820076, "learning_rate": 7.35264407927086e-07, "loss": 0.0, "num_input_tokens_seen": 198060720, "step": 91835 }, { "epoch": 16.85446870985502, "grad_norm": 0.2711966931819916, "learning_rate": 7.348464697732077e-07, "loss": 0.0002, "num_input_tokens_seen": 198071312, "step": 91840 }, { "epoch": 16.85538630941457, "grad_norm": 0.04422862455248833, "learning_rate": 7.344286410139067e-07, "loss": 0.0001, "num_input_tokens_seen": 198082896, "step": 91845 }, { "epoch": 16.856303908974123, "grad_norm": 0.002326906193047762, "learning_rate": 7.340109216598995e-07, "loss": 0.0, "num_input_tokens_seen": 198092784, "step": 91850 }, { "epoch": 16.857221508533677, "grad_norm": 0.0009046983323059976, "learning_rate": 7.335933117219013e-07, "loss": 0.0, "num_input_tokens_seen": 198102576, "step": 91855 }, { "epoch": 16.858139108093226, "grad_norm": 47.28551483154297, "learning_rate": 7.331758112106219e-07, "loss": 0.0173, "num_input_tokens_seen": 198113328, "step": 91860 }, { "epoch": 16.85905670765278, "grad_norm": 10.558107376098633, "learning_rate": 7.327584201367705e-07, "loss": 0.0002, "num_input_tokens_seen": 198124080, "step": 91865 }, { "epoch": 16.859974307212333, "grad_norm": 0.0006950670504011214, "learning_rate": 7.323411385110507e-07, "loss": 0.0, "num_input_tokens_seen": 198135408, "step": 91870 }, { "epoch": 16.860891906771883, "grad_norm": 0.12345618009567261, "learning_rate": 7.319239663441674e-07, "loss": 0.0001, "num_input_tokens_seen": 198145424, "step": 91875 }, { "epoch": 16.861809506331436, "grad_norm": 0.00036592225660569966, "learning_rate": 7.315069036468197e-07, "loss": 0.0001, "num_input_tokens_seen": 198155760, "step": 91880 }, { "epoch": 16.86272710589099, "grad_norm": 0.018676897510886192, "learning_rate": 7.310899504297042e-07, "loss": 0.0, "num_input_tokens_seen": 198166960, "step": 91885 }, { "epoch": 16.86364470545054, "grad_norm": 0.00223972974345088, "learning_rate": 7.306731067035155e-07, "loss": 0.0207, "num_input_tokens_seen": 198178256, "step": 91890 }, { "epoch": 16.864562305010093, "grad_norm": 0.001126040588133037, "learning_rate": 7.302563724789435e-07, "loss": 0.0001, "num_input_tokens_seen": 198189456, "step": 91895 }, { "epoch": 16.865479904569646, "grad_norm": 0.011110203340649605, "learning_rate": 7.298397477666791e-07, "loss": 0.0, "num_input_tokens_seen": 198199856, "step": 91900 }, { "epoch": 16.866397504129196, "grad_norm": 0.002559978747740388, "learning_rate": 7.29423232577407e-07, "loss": 0.0, "num_input_tokens_seen": 198211120, "step": 91905 }, { "epoch": 16.86731510368875, "grad_norm": 0.0009723313269205391, "learning_rate": 7.290068269218103e-07, "loss": 0.0001, "num_input_tokens_seen": 198220880, "step": 91910 }, { "epoch": 16.868232703248303, "grad_norm": 0.002637529978528619, "learning_rate": 7.285905308105678e-07, "loss": 0.0, "num_input_tokens_seen": 198231472, "step": 91915 }, { "epoch": 16.869150302807853, "grad_norm": 5.766641139984131, "learning_rate": 7.281743442543593e-07, "loss": 0.001, "num_input_tokens_seen": 198241616, "step": 91920 }, { "epoch": 16.870067902367406, "grad_norm": 0.0006244457908906043, "learning_rate": 7.277582672638583e-07, "loss": 0.1407, "num_input_tokens_seen": 198251952, "step": 91925 }, { "epoch": 16.87098550192696, "grad_norm": 0.0028461948968470097, "learning_rate": 7.273422998497365e-07, "loss": 0.0425, "num_input_tokens_seen": 198263344, "step": 91930 }, { "epoch": 16.87190310148651, "grad_norm": 0.0027893963269889355, "learning_rate": 7.269264420226613e-07, "loss": 0.0, "num_input_tokens_seen": 198273968, "step": 91935 }, { "epoch": 16.872820701046063, "grad_norm": 0.0017443117685616016, "learning_rate": 7.265106937933009e-07, "loss": 0.0, "num_input_tokens_seen": 198284144, "step": 91940 }, { "epoch": 16.873738300605616, "grad_norm": 0.2003295123577118, "learning_rate": 7.260950551723184e-07, "loss": 0.0589, "num_input_tokens_seen": 198295600, "step": 91945 }, { "epoch": 16.874655900165166, "grad_norm": 0.010072115808725357, "learning_rate": 7.256795261703725e-07, "loss": 0.0, "num_input_tokens_seen": 198306224, "step": 91950 }, { "epoch": 16.87557349972472, "grad_norm": 0.0010072379373013973, "learning_rate": 7.252641067981237e-07, "loss": 0.0002, "num_input_tokens_seen": 198317168, "step": 91955 }, { "epoch": 16.876491099284273, "grad_norm": 0.0025424137711524963, "learning_rate": 7.248487970662249e-07, "loss": 0.0001, "num_input_tokens_seen": 198328240, "step": 91960 }, { "epoch": 16.877408698843823, "grad_norm": 0.000542797555681318, "learning_rate": 7.244335969853272e-07, "loss": 0.0001, "num_input_tokens_seen": 198338864, "step": 91965 }, { "epoch": 16.878326298403376, "grad_norm": 0.06768178939819336, "learning_rate": 7.240185065660827e-07, "loss": 0.0001, "num_input_tokens_seen": 198349936, "step": 91970 }, { "epoch": 16.87924389796293, "grad_norm": 0.002627220470458269, "learning_rate": 7.236035258191365e-07, "loss": 0.0, "num_input_tokens_seen": 198359824, "step": 91975 }, { "epoch": 16.88016149752248, "grad_norm": 50.541080474853516, "learning_rate": 7.231886547551314e-07, "loss": 0.0079, "num_input_tokens_seen": 198371248, "step": 91980 }, { "epoch": 16.881079097082033, "grad_norm": 0.0017621765146031976, "learning_rate": 7.227738933847083e-07, "loss": 0.0, "num_input_tokens_seen": 198381296, "step": 91985 }, { "epoch": 16.881996696641586, "grad_norm": 0.002991016488522291, "learning_rate": 7.223592417185066e-07, "loss": 0.0, "num_input_tokens_seen": 198390832, "step": 91990 }, { "epoch": 16.882914296201136, "grad_norm": 0.0011362354271113873, "learning_rate": 7.219446997671609e-07, "loss": 0.0001, "num_input_tokens_seen": 198401968, "step": 91995 }, { "epoch": 16.88383189576069, "grad_norm": 0.0013168741716071963, "learning_rate": 7.215302675413022e-07, "loss": 0.0003, "num_input_tokens_seen": 198412784, "step": 92000 }, { "epoch": 16.884749495320243, "grad_norm": 0.003990480210632086, "learning_rate": 7.211159450515621e-07, "loss": 0.0145, "num_input_tokens_seen": 198423152, "step": 92005 }, { "epoch": 16.885667094879793, "grad_norm": 0.000864448375068605, "learning_rate": 7.207017323085658e-07, "loss": 0.1252, "num_input_tokens_seen": 198434416, "step": 92010 }, { "epoch": 16.886584694439346, "grad_norm": 0.011145208962261677, "learning_rate": 7.202876293229372e-07, "loss": 0.0, "num_input_tokens_seen": 198445232, "step": 92015 }, { "epoch": 16.8875022939989, "grad_norm": 74.95269012451172, "learning_rate": 7.198736361052989e-07, "loss": 0.0588, "num_input_tokens_seen": 198455312, "step": 92020 }, { "epoch": 16.88841989355845, "grad_norm": 0.46760451793670654, "learning_rate": 7.194597526662683e-07, "loss": 0.0001, "num_input_tokens_seen": 198466224, "step": 92025 }, { "epoch": 16.889337493118003, "grad_norm": 0.010457323864102364, "learning_rate": 7.190459790164605e-07, "loss": 0.0, "num_input_tokens_seen": 198478320, "step": 92030 }, { "epoch": 16.890255092677556, "grad_norm": 0.026418371126055717, "learning_rate": 7.186323151664881e-07, "loss": 0.0001, "num_input_tokens_seen": 198489136, "step": 92035 }, { "epoch": 16.891172692237106, "grad_norm": 0.004984764847904444, "learning_rate": 7.18218761126962e-07, "loss": 0.0, "num_input_tokens_seen": 198499344, "step": 92040 }, { "epoch": 16.89209029179666, "grad_norm": 0.005872638430446386, "learning_rate": 7.178053169084881e-07, "loss": 0.0001, "num_input_tokens_seen": 198510992, "step": 92045 }, { "epoch": 16.893007891356213, "grad_norm": 0.9854897856712341, "learning_rate": 7.173919825216702e-07, "loss": 0.0002, "num_input_tokens_seen": 198522160, "step": 92050 }, { "epoch": 16.893925490915763, "grad_norm": 0.04745199903845787, "learning_rate": 7.16978757977112e-07, "loss": 0.0, "num_input_tokens_seen": 198532848, "step": 92055 }, { "epoch": 16.894843090475316, "grad_norm": 0.03381720185279846, "learning_rate": 7.165656432854101e-07, "loss": 0.0, "num_input_tokens_seen": 198543984, "step": 92060 }, { "epoch": 16.89576069003487, "grad_norm": 0.018700402230024338, "learning_rate": 7.16152638457161e-07, "loss": 0.0001, "num_input_tokens_seen": 198554352, "step": 92065 }, { "epoch": 16.89667828959442, "grad_norm": 0.016049467027187347, "learning_rate": 7.157397435029561e-07, "loss": 0.0, "num_input_tokens_seen": 198564848, "step": 92070 }, { "epoch": 16.897595889153973, "grad_norm": 0.0019909830298274755, "learning_rate": 7.153269584333877e-07, "loss": 0.0674, "num_input_tokens_seen": 198575440, "step": 92075 }, { "epoch": 16.898513488713526, "grad_norm": 0.0008732783608138561, "learning_rate": 7.149142832590428e-07, "loss": 0.0008, "num_input_tokens_seen": 198586096, "step": 92080 }, { "epoch": 16.899431088273076, "grad_norm": 0.002101689809933305, "learning_rate": 7.145017179905045e-07, "loss": 0.0645, "num_input_tokens_seen": 198596848, "step": 92085 }, { "epoch": 16.90034868783263, "grad_norm": 0.0006469945074059069, "learning_rate": 7.140892626383544e-07, "loss": 0.0538, "num_input_tokens_seen": 198607216, "step": 92090 }, { "epoch": 16.901266287392183, "grad_norm": 0.0007805494242347777, "learning_rate": 7.136769172131736e-07, "loss": 0.0532, "num_input_tokens_seen": 198617040, "step": 92095 }, { "epoch": 16.902183886951732, "grad_norm": 0.014041961170732975, "learning_rate": 7.132646817255362e-07, "loss": 0.0, "num_input_tokens_seen": 198628464, "step": 92100 }, { "epoch": 16.903101486511286, "grad_norm": 0.027795806527137756, "learning_rate": 7.128525561860161e-07, "loss": 0.0703, "num_input_tokens_seen": 198638384, "step": 92105 }, { "epoch": 16.90401908607084, "grad_norm": 0.04781772941350937, "learning_rate": 7.124405406051837e-07, "loss": 0.0001, "num_input_tokens_seen": 198649296, "step": 92110 }, { "epoch": 16.90493668563039, "grad_norm": 0.0013044411316514015, "learning_rate": 7.12028634993605e-07, "loss": 0.0003, "num_input_tokens_seen": 198660368, "step": 92115 }, { "epoch": 16.905854285189942, "grad_norm": 0.0031862554606050253, "learning_rate": 7.116168393618473e-07, "loss": 0.0016, "num_input_tokens_seen": 198671312, "step": 92120 }, { "epoch": 16.906771884749496, "grad_norm": 0.0006120156613178551, "learning_rate": 7.112051537204706e-07, "loss": 0.0, "num_input_tokens_seen": 198682736, "step": 92125 }, { "epoch": 16.907689484309046, "grad_norm": 0.000540449284017086, "learning_rate": 7.107935780800351e-07, "loss": 0.0, "num_input_tokens_seen": 198693296, "step": 92130 }, { "epoch": 16.9086070838686, "grad_norm": 0.006241427734494209, "learning_rate": 7.103821124510957e-07, "loss": 0.0, "num_input_tokens_seen": 198704304, "step": 92135 }, { "epoch": 16.909524683428153, "grad_norm": 0.17647546529769897, "learning_rate": 7.099707568442083e-07, "loss": 0.0, "num_input_tokens_seen": 198714352, "step": 92140 }, { "epoch": 16.910442282987702, "grad_norm": 0.041181884706020355, "learning_rate": 7.095595112699211e-07, "loss": 0.375, "num_input_tokens_seen": 198725552, "step": 92145 }, { "epoch": 16.911359882547256, "grad_norm": 0.001713540405035019, "learning_rate": 7.091483757387824e-07, "loss": 0.0, "num_input_tokens_seen": 198736656, "step": 92150 }, { "epoch": 16.91227748210681, "grad_norm": 0.002239239402115345, "learning_rate": 7.087373502613387e-07, "loss": 0.0, "num_input_tokens_seen": 198747344, "step": 92155 }, { "epoch": 16.91319508166636, "grad_norm": 0.0007025338127277792, "learning_rate": 7.083264348481312e-07, "loss": 0.0001, "num_input_tokens_seen": 198757936, "step": 92160 }, { "epoch": 16.914112681225912, "grad_norm": 0.005124186165630817, "learning_rate": 7.079156295096983e-07, "loss": 0.0, "num_input_tokens_seen": 198768432, "step": 92165 }, { "epoch": 16.915030280785466, "grad_norm": 0.017790067940950394, "learning_rate": 7.075049342565771e-07, "loss": 0.0001, "num_input_tokens_seen": 198779184, "step": 92170 }, { "epoch": 16.915947880345016, "grad_norm": 0.01197532657533884, "learning_rate": 7.070943490993027e-07, "loss": 0.0002, "num_input_tokens_seen": 198790352, "step": 92175 }, { "epoch": 16.91686547990457, "grad_norm": 0.2006746232509613, "learning_rate": 7.066838740484044e-07, "loss": 0.0001, "num_input_tokens_seen": 198799888, "step": 92180 }, { "epoch": 16.917783079464122, "grad_norm": 1.352928638458252, "learning_rate": 7.062735091144102e-07, "loss": 0.0004, "num_input_tokens_seen": 198810352, "step": 92185 }, { "epoch": 16.918700679023672, "grad_norm": 0.008079559542238712, "learning_rate": 7.058632543078464e-07, "loss": 0.0, "num_input_tokens_seen": 198819824, "step": 92190 }, { "epoch": 16.919618278583226, "grad_norm": 0.0010449837427586317, "learning_rate": 7.054531096392347e-07, "loss": 0.0, "num_input_tokens_seen": 198831472, "step": 92195 }, { "epoch": 16.92053587814278, "grad_norm": 0.0025351017247885466, "learning_rate": 7.05043075119094e-07, "loss": 0.0041, "num_input_tokens_seen": 198841904, "step": 92200 }, { "epoch": 16.92145347770233, "grad_norm": 0.0028124619275331497, "learning_rate": 7.046331507579429e-07, "loss": 0.0, "num_input_tokens_seen": 198852432, "step": 92205 }, { "epoch": 16.922371077261882, "grad_norm": 0.008856427855789661, "learning_rate": 7.042233365662943e-07, "loss": 0.0, "num_input_tokens_seen": 198861680, "step": 92210 }, { "epoch": 16.923288676821436, "grad_norm": 0.0020349498372524977, "learning_rate": 7.038136325546597e-07, "loss": 0.0, "num_input_tokens_seen": 198873392, "step": 92215 }, { "epoch": 16.924206276380986, "grad_norm": 0.0030425165314227343, "learning_rate": 7.03404038733545e-07, "loss": 0.001, "num_input_tokens_seen": 198885648, "step": 92220 }, { "epoch": 16.92512387594054, "grad_norm": 0.0008817558991722763, "learning_rate": 7.029945551134592e-07, "loss": 0.0, "num_input_tokens_seen": 198895664, "step": 92225 }, { "epoch": 16.926041475500092, "grad_norm": 0.0020158332772552967, "learning_rate": 7.025851817049028e-07, "loss": 0.0, "num_input_tokens_seen": 198906224, "step": 92230 }, { "epoch": 16.926959075059642, "grad_norm": 0.025314228609204292, "learning_rate": 7.021759185183757e-07, "loss": 0.0, "num_input_tokens_seen": 198916112, "step": 92235 }, { "epoch": 16.927876674619196, "grad_norm": 0.060829974710941315, "learning_rate": 7.017667655643762e-07, "loss": 0.0, "num_input_tokens_seen": 198926096, "step": 92240 }, { "epoch": 16.92879427417875, "grad_norm": 72.74469757080078, "learning_rate": 7.013577228533975e-07, "loss": 0.0174, "num_input_tokens_seen": 198936784, "step": 92245 }, { "epoch": 16.9297118737383, "grad_norm": 0.013710799627006054, "learning_rate": 7.009487903959305e-07, "loss": 0.0, "num_input_tokens_seen": 198948464, "step": 92250 }, { "epoch": 16.930629473297852, "grad_norm": 0.011988222599029541, "learning_rate": 7.005399682024633e-07, "loss": 0.0, "num_input_tokens_seen": 198959888, "step": 92255 }, { "epoch": 16.931547072857406, "grad_norm": 0.0007148364675231278, "learning_rate": 7.001312562834834e-07, "loss": 0.0, "num_input_tokens_seen": 198971152, "step": 92260 }, { "epoch": 16.932464672416955, "grad_norm": 0.032578688114881516, "learning_rate": 6.997226546494723e-07, "loss": 0.0, "num_input_tokens_seen": 198981648, "step": 92265 }, { "epoch": 16.93338227197651, "grad_norm": 0.0006975295837037265, "learning_rate": 6.993141633109096e-07, "loss": 0.0921, "num_input_tokens_seen": 198992720, "step": 92270 }, { "epoch": 16.934299871536062, "grad_norm": 0.0006098411395214498, "learning_rate": 6.989057822782741e-07, "loss": 0.0016, "num_input_tokens_seen": 199004784, "step": 92275 }, { "epoch": 16.935217471095616, "grad_norm": 0.003911927808076143, "learning_rate": 6.984975115620396e-07, "loss": 0.0, "num_input_tokens_seen": 199015792, "step": 92280 }, { "epoch": 16.936135070655165, "grad_norm": 0.015141909010708332, "learning_rate": 6.980893511726756e-07, "loss": 0.0, "num_input_tokens_seen": 199026992, "step": 92285 }, { "epoch": 16.93705267021472, "grad_norm": 0.0003402248548809439, "learning_rate": 6.976813011206534e-07, "loss": 0.0, "num_input_tokens_seen": 199037968, "step": 92290 }, { "epoch": 16.937970269774272, "grad_norm": 0.0018870235653594136, "learning_rate": 6.972733614164378e-07, "loss": 0.0, "num_input_tokens_seen": 199049264, "step": 92295 }, { "epoch": 16.938887869333822, "grad_norm": 0.026972541585564613, "learning_rate": 6.968655320704926e-07, "loss": 0.0006, "num_input_tokens_seen": 199059120, "step": 92300 }, { "epoch": 16.939805468893375, "grad_norm": 0.0065588089637458324, "learning_rate": 6.964578130932764e-07, "loss": 0.0703, "num_input_tokens_seen": 199068976, "step": 92305 }, { "epoch": 16.94072306845293, "grad_norm": 0.004923665896058083, "learning_rate": 6.960502044952466e-07, "loss": 0.0001, "num_input_tokens_seen": 199079248, "step": 92310 }, { "epoch": 16.94164066801248, "grad_norm": 41.144996643066406, "learning_rate": 6.956427062868599e-07, "loss": 0.0466, "num_input_tokens_seen": 199089360, "step": 92315 }, { "epoch": 16.942558267572032, "grad_norm": 0.0008536336827091873, "learning_rate": 6.952353184785666e-07, "loss": 0.0, "num_input_tokens_seen": 199100688, "step": 92320 }, { "epoch": 16.943475867131585, "grad_norm": 0.0018095046980306506, "learning_rate": 6.948280410808156e-07, "loss": 0.0, "num_input_tokens_seen": 199111344, "step": 92325 }, { "epoch": 16.944393466691135, "grad_norm": 0.0047065261751413345, "learning_rate": 6.944208741040526e-07, "loss": 0.0, "num_input_tokens_seen": 199122704, "step": 92330 }, { "epoch": 16.94531106625069, "grad_norm": 0.2764163613319397, "learning_rate": 6.940138175587202e-07, "loss": 0.0027, "num_input_tokens_seen": 199134064, "step": 92335 }, { "epoch": 16.946228665810242, "grad_norm": 0.0012101103784516454, "learning_rate": 6.936068714552607e-07, "loss": 0.0, "num_input_tokens_seen": 199144816, "step": 92340 }, { "epoch": 16.947146265369792, "grad_norm": 69.81038665771484, "learning_rate": 6.932000358041107e-07, "loss": 0.2563, "num_input_tokens_seen": 199154192, "step": 92345 }, { "epoch": 16.948063864929345, "grad_norm": 0.026752278208732605, "learning_rate": 6.927933106157053e-07, "loss": 0.0174, "num_input_tokens_seen": 199164464, "step": 92350 }, { "epoch": 16.9489814644889, "grad_norm": 0.0653533786535263, "learning_rate": 6.923866959004743e-07, "loss": 0.0001, "num_input_tokens_seen": 199175184, "step": 92355 }, { "epoch": 16.94989906404845, "grad_norm": 19.14045524597168, "learning_rate": 6.919801916688495e-07, "loss": 0.003, "num_input_tokens_seen": 199185360, "step": 92360 }, { "epoch": 16.950816663608002, "grad_norm": 0.05226963013410568, "learning_rate": 6.915737979312559e-07, "loss": 0.0, "num_input_tokens_seen": 199195600, "step": 92365 }, { "epoch": 16.951734263167555, "grad_norm": 0.00218907929956913, "learning_rate": 6.911675146981161e-07, "loss": 0.0, "num_input_tokens_seen": 199206704, "step": 92370 }, { "epoch": 16.952651862727105, "grad_norm": 0.0013657459057867527, "learning_rate": 6.907613419798526e-07, "loss": 0.0, "num_input_tokens_seen": 199218320, "step": 92375 }, { "epoch": 16.95356946228666, "grad_norm": 0.010911294259130955, "learning_rate": 6.903552797868817e-07, "loss": 0.0, "num_input_tokens_seen": 199227792, "step": 92380 }, { "epoch": 16.954487061846212, "grad_norm": 0.0032029401045292616, "learning_rate": 6.899493281296182e-07, "loss": 0.0, "num_input_tokens_seen": 199238608, "step": 92385 }, { "epoch": 16.955404661405762, "grad_norm": 0.03875046595931053, "learning_rate": 6.895434870184742e-07, "loss": 0.0003, "num_input_tokens_seen": 199250992, "step": 92390 }, { "epoch": 16.956322260965315, "grad_norm": 0.0006463846657425165, "learning_rate": 6.891377564638596e-07, "loss": 0.0, "num_input_tokens_seen": 199261264, "step": 92395 }, { "epoch": 16.95723986052487, "grad_norm": 0.0006159143522381783, "learning_rate": 6.887321364761806e-07, "loss": 0.0005, "num_input_tokens_seen": 199271024, "step": 92400 }, { "epoch": 16.95815746008442, "grad_norm": 0.1317739486694336, "learning_rate": 6.883266270658395e-07, "loss": 0.0, "num_input_tokens_seen": 199281232, "step": 92405 }, { "epoch": 16.959075059643972, "grad_norm": 0.003615563502535224, "learning_rate": 6.879212282432385e-07, "loss": 0.0646, "num_input_tokens_seen": 199291184, "step": 92410 }, { "epoch": 16.959992659203525, "grad_norm": 0.0004462953074835241, "learning_rate": 6.875159400187753e-07, "loss": 0.0013, "num_input_tokens_seen": 199301712, "step": 92415 }, { "epoch": 16.960910258763075, "grad_norm": 0.0035615398082882166, "learning_rate": 6.871107624028434e-07, "loss": 0.0003, "num_input_tokens_seen": 199312080, "step": 92420 }, { "epoch": 16.96182785832263, "grad_norm": 0.0027951488737016916, "learning_rate": 6.867056954058371e-07, "loss": 0.0001, "num_input_tokens_seen": 199323824, "step": 92425 }, { "epoch": 16.962745457882182, "grad_norm": 0.013969358056783676, "learning_rate": 6.863007390381449e-07, "loss": 0.0, "num_input_tokens_seen": 199334384, "step": 92430 }, { "epoch": 16.96366305744173, "grad_norm": 0.05974463000893593, "learning_rate": 6.858958933101529e-07, "loss": 0.0, "num_input_tokens_seen": 199345040, "step": 92435 }, { "epoch": 16.964580657001285, "grad_norm": 0.04212094470858574, "learning_rate": 6.854911582322438e-07, "loss": 0.0, "num_input_tokens_seen": 199355856, "step": 92440 }, { "epoch": 16.96549825656084, "grad_norm": 0.006797450128942728, "learning_rate": 6.850865338148005e-07, "loss": 0.0, "num_input_tokens_seen": 199366128, "step": 92445 }, { "epoch": 16.96641585612039, "grad_norm": 0.005054150242358446, "learning_rate": 6.846820200682003e-07, "loss": 0.004, "num_input_tokens_seen": 199377616, "step": 92450 }, { "epoch": 16.967333455679942, "grad_norm": 0.0034225676208734512, "learning_rate": 6.84277617002817e-07, "loss": 0.0001, "num_input_tokens_seen": 199389552, "step": 92455 }, { "epoch": 16.968251055239495, "grad_norm": 0.047537606209516525, "learning_rate": 6.838733246290258e-07, "loss": 0.0, "num_input_tokens_seen": 199398800, "step": 92460 }, { "epoch": 16.969168654799045, "grad_norm": 0.03822662681341171, "learning_rate": 6.834691429571938e-07, "loss": 0.0, "num_input_tokens_seen": 199409552, "step": 92465 }, { "epoch": 16.9700862543586, "grad_norm": 0.0008841438102535903, "learning_rate": 6.830650719976872e-07, "loss": 0.0, "num_input_tokens_seen": 199421168, "step": 92470 }, { "epoch": 16.971003853918152, "grad_norm": 0.818185031414032, "learning_rate": 6.826611117608722e-07, "loss": 0.0006, "num_input_tokens_seen": 199432496, "step": 92475 }, { "epoch": 16.9719214534777, "grad_norm": 0.0002456452348269522, "learning_rate": 6.822572622571083e-07, "loss": 0.0, "num_input_tokens_seen": 199443120, "step": 92480 }, { "epoch": 16.972839053037255, "grad_norm": 0.08550505340099335, "learning_rate": 6.818535234967532e-07, "loss": 0.0, "num_input_tokens_seen": 199454128, "step": 92485 }, { "epoch": 16.97375665259681, "grad_norm": 0.05464693158864975, "learning_rate": 6.814498954901622e-07, "loss": 0.0001, "num_input_tokens_seen": 199464560, "step": 92490 }, { "epoch": 16.97467425215636, "grad_norm": 0.0016372303944081068, "learning_rate": 6.810463782476895e-07, "loss": 0.0, "num_input_tokens_seen": 199475856, "step": 92495 }, { "epoch": 16.97559185171591, "grad_norm": 0.01896694116294384, "learning_rate": 6.80642971779683e-07, "loss": 0.0, "num_input_tokens_seen": 199486000, "step": 92500 }, { "epoch": 16.976509451275465, "grad_norm": 0.0013198741944506764, "learning_rate": 6.802396760964891e-07, "loss": 0.0, "num_input_tokens_seen": 199495952, "step": 92505 }, { "epoch": 16.977427050835015, "grad_norm": 0.0013628919841721654, "learning_rate": 6.798364912084532e-07, "loss": 0.0002, "num_input_tokens_seen": 199507504, "step": 92510 }, { "epoch": 16.97834465039457, "grad_norm": 35.071136474609375, "learning_rate": 6.794334171259159e-07, "loss": 0.012, "num_input_tokens_seen": 199518928, "step": 92515 }, { "epoch": 16.97926224995412, "grad_norm": 0.16474051773548126, "learning_rate": 6.790304538592152e-07, "loss": 0.0, "num_input_tokens_seen": 199529904, "step": 92520 }, { "epoch": 16.98017984951367, "grad_norm": 0.0010558165377005935, "learning_rate": 6.786276014186866e-07, "loss": 0.0001, "num_input_tokens_seen": 199541008, "step": 92525 }, { "epoch": 16.981097449073225, "grad_norm": 0.001574684982188046, "learning_rate": 6.782248598146612e-07, "loss": 0.0097, "num_input_tokens_seen": 199552048, "step": 92530 }, { "epoch": 16.98201504863278, "grad_norm": 0.10097140073776245, "learning_rate": 6.778222290574709e-07, "loss": 0.0001, "num_input_tokens_seen": 199563024, "step": 92535 }, { "epoch": 16.982932648192328, "grad_norm": 0.001594717730768025, "learning_rate": 6.774197091574419e-07, "loss": 0.0003, "num_input_tokens_seen": 199573712, "step": 92540 }, { "epoch": 16.98385024775188, "grad_norm": 0.01272277906537056, "learning_rate": 6.770173001248981e-07, "loss": 0.0, "num_input_tokens_seen": 199584752, "step": 92545 }, { "epoch": 16.984767847311435, "grad_norm": 0.0031565241515636444, "learning_rate": 6.766150019701601e-07, "loss": 0.0, "num_input_tokens_seen": 199595632, "step": 92550 }, { "epoch": 16.985685446870985, "grad_norm": 0.005346042104065418, "learning_rate": 6.762128147035463e-07, "loss": 0.0, "num_input_tokens_seen": 199604400, "step": 92555 }, { "epoch": 16.986603046430538, "grad_norm": 0.0017776895547285676, "learning_rate": 6.758107383353729e-07, "loss": 0.0, "num_input_tokens_seen": 199613968, "step": 92560 }, { "epoch": 16.98752064599009, "grad_norm": 0.0009325437131337821, "learning_rate": 6.754087728759523e-07, "loss": 0.0, "num_input_tokens_seen": 199626032, "step": 92565 }, { "epoch": 16.98843824554964, "grad_norm": 0.0015736640198156238, "learning_rate": 6.750069183355946e-07, "loss": 0.0, "num_input_tokens_seen": 199636592, "step": 92570 }, { "epoch": 16.989355845109195, "grad_norm": 0.0029045548290014267, "learning_rate": 6.746051747246046e-07, "loss": 0.002, "num_input_tokens_seen": 199645680, "step": 92575 }, { "epoch": 16.99027344466875, "grad_norm": 0.005913168657571077, "learning_rate": 6.7420354205329e-07, "loss": 0.0, "num_input_tokens_seen": 199656880, "step": 92580 }, { "epoch": 16.991191044228298, "grad_norm": 0.0008023578557185829, "learning_rate": 6.738020203319495e-07, "loss": 0.2313, "num_input_tokens_seen": 199667024, "step": 92585 }, { "epoch": 16.99210864378785, "grad_norm": 0.0016164962435141206, "learning_rate": 6.734006095708811e-07, "loss": 0.0001, "num_input_tokens_seen": 199678224, "step": 92590 }, { "epoch": 16.993026243347405, "grad_norm": 0.0060030329041182995, "learning_rate": 6.729993097803828e-07, "loss": 0.0001, "num_input_tokens_seen": 199688656, "step": 92595 }, { "epoch": 16.993943842906955, "grad_norm": 0.010479334741830826, "learning_rate": 6.72598120970746e-07, "loss": 0.0001, "num_input_tokens_seen": 199698672, "step": 92600 }, { "epoch": 16.994861442466508, "grad_norm": 0.0019794958643615246, "learning_rate": 6.721970431522595e-07, "loss": 0.0, "num_input_tokens_seen": 199710000, "step": 92605 }, { "epoch": 16.99577904202606, "grad_norm": 0.05883181095123291, "learning_rate": 6.717960763352122e-07, "loss": 0.0, "num_input_tokens_seen": 199720560, "step": 92610 }, { "epoch": 16.99669664158561, "grad_norm": 0.27594465017318726, "learning_rate": 6.713952205298874e-07, "loss": 0.0001, "num_input_tokens_seen": 199732144, "step": 92615 }, { "epoch": 16.997614241145165, "grad_norm": 0.010295134969055653, "learning_rate": 6.709944757465664e-07, "loss": 0.0006, "num_input_tokens_seen": 199742608, "step": 92620 }, { "epoch": 16.998531840704718, "grad_norm": 0.003577941097319126, "learning_rate": 6.705938419955271e-07, "loss": 0.001, "num_input_tokens_seen": 199754128, "step": 92625 }, { "epoch": 16.999449440264268, "grad_norm": 0.022451546043157578, "learning_rate": 6.701933192870463e-07, "loss": 0.0001, "num_input_tokens_seen": 199765456, "step": 92630 }, { "epoch": 17.00036703982382, "grad_norm": 0.0002722199133131653, "learning_rate": 6.697929076313969e-07, "loss": 0.0, "num_input_tokens_seen": 199774448, "step": 92635 }, { "epoch": 17.001284639383375, "grad_norm": 0.014292432926595211, "learning_rate": 6.693926070388468e-07, "loss": 0.0001, "num_input_tokens_seen": 199784304, "step": 92640 }, { "epoch": 17.002202238942925, "grad_norm": 0.006876436062157154, "learning_rate": 6.689924175196655e-07, "loss": 0.0, "num_input_tokens_seen": 199794384, "step": 92645 }, { "epoch": 17.003119838502478, "grad_norm": 0.0009969213278964162, "learning_rate": 6.685923390841165e-07, "loss": 0.0, "num_input_tokens_seen": 199805648, "step": 92650 }, { "epoch": 17.00403743806203, "grad_norm": 0.0007875044830143452, "learning_rate": 6.681923717424593e-07, "loss": 0.0, "num_input_tokens_seen": 199817104, "step": 92655 }, { "epoch": 17.00495503762158, "grad_norm": 0.004568116273730993, "learning_rate": 6.677925155049559e-07, "loss": 0.0, "num_input_tokens_seen": 199827600, "step": 92660 }, { "epoch": 17.005872637181135, "grad_norm": 0.0007832362316548824, "learning_rate": 6.673927703818595e-07, "loss": 0.0006, "num_input_tokens_seen": 199838352, "step": 92665 }, { "epoch": 17.006790236740688, "grad_norm": 0.009884639643132687, "learning_rate": 6.669931363834242e-07, "loss": 0.0001, "num_input_tokens_seen": 199849616, "step": 92670 }, { "epoch": 17.007707836300238, "grad_norm": 1.1449964046478271, "learning_rate": 6.66593613519898e-07, "loss": 0.0002, "num_input_tokens_seen": 199859408, "step": 92675 }, { "epoch": 17.00862543585979, "grad_norm": 0.00235116109251976, "learning_rate": 6.661942018015304e-07, "loss": 0.0, "num_input_tokens_seen": 199870416, "step": 92680 }, { "epoch": 17.009543035419345, "grad_norm": 0.008438914082944393, "learning_rate": 6.65794901238565e-07, "loss": 0.0, "num_input_tokens_seen": 199881360, "step": 92685 }, { "epoch": 17.010460634978894, "grad_norm": 41.67253875732422, "learning_rate": 6.653957118412418e-07, "loss": 0.1437, "num_input_tokens_seen": 199891600, "step": 92690 }, { "epoch": 17.011378234538448, "grad_norm": 0.0020233141258358955, "learning_rate": 6.649966336198016e-07, "loss": 0.0, "num_input_tokens_seen": 199902672, "step": 92695 }, { "epoch": 17.012295834098, "grad_norm": 0.006207063794136047, "learning_rate": 6.645976665844788e-07, "loss": 0.0, "num_input_tokens_seen": 199914512, "step": 92700 }, { "epoch": 17.01321343365755, "grad_norm": 0.0007867213571444154, "learning_rate": 6.641988107455072e-07, "loss": 0.2063, "num_input_tokens_seen": 199925264, "step": 92705 }, { "epoch": 17.014131033217105, "grad_norm": 0.0030357143841683865, "learning_rate": 6.638000661131144e-07, "loss": 0.0, "num_input_tokens_seen": 199935696, "step": 92710 }, { "epoch": 17.015048632776658, "grad_norm": 0.0010219959076493979, "learning_rate": 6.634014326975313e-07, "loss": 0.0, "num_input_tokens_seen": 199946640, "step": 92715 }, { "epoch": 17.015966232336208, "grad_norm": 0.003479025326669216, "learning_rate": 6.630029105089797e-07, "loss": 0.0, "num_input_tokens_seen": 199957680, "step": 92720 }, { "epoch": 17.01688383189576, "grad_norm": 0.08293738216161728, "learning_rate": 6.626044995576808e-07, "loss": 0.0, "num_input_tokens_seen": 199967888, "step": 92725 }, { "epoch": 17.017801431455315, "grad_norm": 0.001564301666803658, "learning_rate": 6.622061998538554e-07, "loss": 0.0, "num_input_tokens_seen": 199979440, "step": 92730 }, { "epoch": 17.018719031014864, "grad_norm": 0.10497079789638519, "learning_rate": 6.61808011407718e-07, "loss": 0.0001, "num_input_tokens_seen": 199990736, "step": 92735 }, { "epoch": 17.019636630574418, "grad_norm": 22.398305892944336, "learning_rate": 6.614099342294816e-07, "loss": 0.0119, "num_input_tokens_seen": 199999984, "step": 92740 }, { "epoch": 17.02055423013397, "grad_norm": 0.002304412890225649, "learning_rate": 6.610119683293559e-07, "loss": 0.0, "num_input_tokens_seen": 200010736, "step": 92745 }, { "epoch": 17.02147182969352, "grad_norm": 0.000970889872405678, "learning_rate": 6.606141137175481e-07, "loss": 0.0002, "num_input_tokens_seen": 200022192, "step": 92750 }, { "epoch": 17.022389429253074, "grad_norm": 0.017702171579003334, "learning_rate": 6.602163704042625e-07, "loss": 0.0, "num_input_tokens_seen": 200032560, "step": 92755 }, { "epoch": 17.023307028812628, "grad_norm": 0.017216484993696213, "learning_rate": 6.598187383997017e-07, "loss": 0.0, "num_input_tokens_seen": 200043984, "step": 92760 }, { "epoch": 17.024224628372178, "grad_norm": 0.0011639874428510666, "learning_rate": 6.594212177140636e-07, "loss": 0.0, "num_input_tokens_seen": 200054608, "step": 92765 }, { "epoch": 17.02514222793173, "grad_norm": 0.008436836302280426, "learning_rate": 6.59023808357544e-07, "loss": 0.0001, "num_input_tokens_seen": 200065552, "step": 92770 }, { "epoch": 17.026059827491284, "grad_norm": 0.0004302864253986627, "learning_rate": 6.586265103403344e-07, "loss": 0.0, "num_input_tokens_seen": 200075376, "step": 92775 }, { "epoch": 17.026977427050834, "grad_norm": 0.0018760855309665203, "learning_rate": 6.582293236726278e-07, "loss": 0.0, "num_input_tokens_seen": 200086256, "step": 92780 }, { "epoch": 17.027895026610388, "grad_norm": 0.08136047422885895, "learning_rate": 6.5783224836461e-07, "loss": 0.0002, "num_input_tokens_seen": 200098352, "step": 92785 }, { "epoch": 17.02881262616994, "grad_norm": 0.009078186936676502, "learning_rate": 6.574352844264637e-07, "loss": 0.0002, "num_input_tokens_seen": 200109328, "step": 92790 }, { "epoch": 17.02973022572949, "grad_norm": 0.008814374916255474, "learning_rate": 6.570384318683731e-07, "loss": 0.0, "num_input_tokens_seen": 200120592, "step": 92795 }, { "epoch": 17.030647825289044, "grad_norm": 0.0019387396750971675, "learning_rate": 6.566416907005163e-07, "loss": 0.0001, "num_input_tokens_seen": 200130864, "step": 92800 }, { "epoch": 17.031565424848598, "grad_norm": 0.08168333768844604, "learning_rate": 6.562450609330678e-07, "loss": 0.0, "num_input_tokens_seen": 200142256, "step": 92805 }, { "epoch": 17.032483024408148, "grad_norm": 0.06483300775289536, "learning_rate": 6.558485425762007e-07, "loss": 0.0, "num_input_tokens_seen": 200153808, "step": 92810 }, { "epoch": 17.0334006239677, "grad_norm": 0.018105480819940567, "learning_rate": 6.554521356400867e-07, "loss": 0.0, "num_input_tokens_seen": 200164432, "step": 92815 }, { "epoch": 17.034318223527254, "grad_norm": 0.0015234688762575388, "learning_rate": 6.550558401348922e-07, "loss": 0.0, "num_input_tokens_seen": 200175728, "step": 92820 }, { "epoch": 17.035235823086804, "grad_norm": 0.008841411210596561, "learning_rate": 6.546596560707796e-07, "loss": 0.0, "num_input_tokens_seen": 200186224, "step": 92825 }, { "epoch": 17.036153422646358, "grad_norm": 0.0392981618642807, "learning_rate": 6.542635834579136e-07, "loss": 0.1128, "num_input_tokens_seen": 200197616, "step": 92830 }, { "epoch": 17.03707102220591, "grad_norm": 0.04982321336865425, "learning_rate": 6.538676223064516e-07, "loss": 0.0001, "num_input_tokens_seen": 200208944, "step": 92835 }, { "epoch": 17.03798862176546, "grad_norm": 0.0006834681844338775, "learning_rate": 6.534717726265489e-07, "loss": 0.0001, "num_input_tokens_seen": 200220432, "step": 92840 }, { "epoch": 17.038906221325014, "grad_norm": 8.860889434814453, "learning_rate": 6.530760344283583e-07, "loss": 0.0051, "num_input_tokens_seen": 200232432, "step": 92845 }, { "epoch": 17.039823820884568, "grad_norm": 0.002436032984405756, "learning_rate": 6.526804077220306e-07, "loss": 0.0003, "num_input_tokens_seen": 200243344, "step": 92850 }, { "epoch": 17.040741420444117, "grad_norm": 0.0016818179283291101, "learning_rate": 6.522848925177128e-07, "loss": 0.0, "num_input_tokens_seen": 200254000, "step": 92855 }, { "epoch": 17.04165902000367, "grad_norm": 0.0017950464971363544, "learning_rate": 6.518894888255483e-07, "loss": 0.0, "num_input_tokens_seen": 200264080, "step": 92860 }, { "epoch": 17.042576619563224, "grad_norm": 0.0047252122312784195, "learning_rate": 6.514941966556804e-07, "loss": 0.0, "num_input_tokens_seen": 200274768, "step": 92865 }, { "epoch": 17.043494219122774, "grad_norm": 0.01580999046564102, "learning_rate": 6.510990160182468e-07, "loss": 0.0, "num_input_tokens_seen": 200285392, "step": 92870 }, { "epoch": 17.044411818682327, "grad_norm": 0.006640731822699308, "learning_rate": 6.507039469233823e-07, "loss": 0.0, "num_input_tokens_seen": 200296560, "step": 92875 }, { "epoch": 17.04532941824188, "grad_norm": 0.003070200327783823, "learning_rate": 6.50308989381222e-07, "loss": 0.0, "num_input_tokens_seen": 200306224, "step": 92880 }, { "epoch": 17.04624701780143, "grad_norm": 0.003228087443858385, "learning_rate": 6.49914143401894e-07, "loss": 0.0, "num_input_tokens_seen": 200316496, "step": 92885 }, { "epoch": 17.047164617360984, "grad_norm": 0.005577651783823967, "learning_rate": 6.49519408995527e-07, "loss": 0.0001, "num_input_tokens_seen": 200327376, "step": 92890 }, { "epoch": 17.048082216920537, "grad_norm": 0.002045523375272751, "learning_rate": 6.491247861722427e-07, "loss": 0.0, "num_input_tokens_seen": 200337584, "step": 92895 }, { "epoch": 17.048999816480087, "grad_norm": 0.002194751054048538, "learning_rate": 6.487302749421664e-07, "loss": 0.0, "num_input_tokens_seen": 200349104, "step": 92900 }, { "epoch": 17.04991741603964, "grad_norm": 0.016750352457165718, "learning_rate": 6.48335875315414e-07, "loss": 0.0, "num_input_tokens_seen": 200359216, "step": 92905 }, { "epoch": 17.050835015599194, "grad_norm": 0.0005392802413553, "learning_rate": 6.479415873021011e-07, "loss": 0.0, "num_input_tokens_seen": 200370832, "step": 92910 }, { "epoch": 17.051752615158744, "grad_norm": 0.006491576321423054, "learning_rate": 6.475474109123425e-07, "loss": 0.0001, "num_input_tokens_seen": 200381776, "step": 92915 }, { "epoch": 17.052670214718297, "grad_norm": 0.007057395298033953, "learning_rate": 6.471533461562469e-07, "loss": 0.0, "num_input_tokens_seen": 200392848, "step": 92920 }, { "epoch": 17.05358781427785, "grad_norm": 0.0009444594033993781, "learning_rate": 6.467593930439209e-07, "loss": 0.0, "num_input_tokens_seen": 200403760, "step": 92925 }, { "epoch": 17.0545054138374, "grad_norm": 0.0018121261382475495, "learning_rate": 6.4636555158547e-07, "loss": 0.0, "num_input_tokens_seen": 200414832, "step": 92930 }, { "epoch": 17.055423013396954, "grad_norm": 0.11104980856180191, "learning_rate": 6.45971821790996e-07, "loss": 0.0, "num_input_tokens_seen": 200426352, "step": 92935 }, { "epoch": 17.056340612956507, "grad_norm": 0.0034395090769976377, "learning_rate": 6.45578203670596e-07, "loss": 0.0, "num_input_tokens_seen": 200436944, "step": 92940 }, { "epoch": 17.057258212516057, "grad_norm": 0.003394629340618849, "learning_rate": 6.451846972343668e-07, "loss": 0.0285, "num_input_tokens_seen": 200448368, "step": 92945 }, { "epoch": 17.05817581207561, "grad_norm": 0.40399497747421265, "learning_rate": 6.447913024923996e-07, "loss": 0.0017, "num_input_tokens_seen": 200459184, "step": 92950 }, { "epoch": 17.059093411635164, "grad_norm": 0.0010913043515756726, "learning_rate": 6.443980194547861e-07, "loss": 0.0, "num_input_tokens_seen": 200470288, "step": 92955 }, { "epoch": 17.060011011194714, "grad_norm": 0.0037857061251997948, "learning_rate": 6.440048481316136e-07, "loss": 0.0, "num_input_tokens_seen": 200481328, "step": 92960 }, { "epoch": 17.060928610754267, "grad_norm": 0.0006246789125725627, "learning_rate": 6.436117885329652e-07, "loss": 0.0, "num_input_tokens_seen": 200492080, "step": 92965 }, { "epoch": 17.06184621031382, "grad_norm": 0.0012388189788907766, "learning_rate": 6.432188406689227e-07, "loss": 0.0, "num_input_tokens_seen": 200503440, "step": 92970 }, { "epoch": 17.06276380987337, "grad_norm": 0.0009500053129158914, "learning_rate": 6.428260045495632e-07, "loss": 0.0, "num_input_tokens_seen": 200514128, "step": 92975 }, { "epoch": 17.063681409432924, "grad_norm": 0.0007579175289720297, "learning_rate": 6.424332801849648e-07, "loss": 0.0, "num_input_tokens_seen": 200525072, "step": 92980 }, { "epoch": 17.064599008992477, "grad_norm": 0.0020445603877305984, "learning_rate": 6.420406675851993e-07, "loss": 0.0, "num_input_tokens_seen": 200536112, "step": 92985 }, { "epoch": 17.065516608552027, "grad_norm": 5.349706649780273, "learning_rate": 6.416481667603363e-07, "loss": 0.002, "num_input_tokens_seen": 200547056, "step": 92990 }, { "epoch": 17.06643420811158, "grad_norm": 0.001795669086277485, "learning_rate": 6.412557777204426e-07, "loss": 0.0002, "num_input_tokens_seen": 200558672, "step": 92995 }, { "epoch": 17.067351807671134, "grad_norm": 0.0347900465130806, "learning_rate": 6.408635004755831e-07, "loss": 0.0, "num_input_tokens_seen": 200568176, "step": 93000 }, { "epoch": 17.068269407230684, "grad_norm": 0.07246869802474976, "learning_rate": 6.404713350358188e-07, "loss": 0.0, "num_input_tokens_seen": 200578864, "step": 93005 }, { "epoch": 17.069187006790237, "grad_norm": 0.025419646874070168, "learning_rate": 6.400792814112072e-07, "loss": 0.0001, "num_input_tokens_seen": 200589456, "step": 93010 }, { "epoch": 17.07010460634979, "grad_norm": 0.012164154089987278, "learning_rate": 6.396873396118059e-07, "loss": 0.0, "num_input_tokens_seen": 200599632, "step": 93015 }, { "epoch": 17.07102220590934, "grad_norm": 0.02616184391081333, "learning_rate": 6.392955096476667e-07, "loss": 0.0, "num_input_tokens_seen": 200610992, "step": 93020 }, { "epoch": 17.071939805468894, "grad_norm": 44.99949645996094, "learning_rate": 6.389037915288388e-07, "loss": 0.0705, "num_input_tokens_seen": 200620048, "step": 93025 }, { "epoch": 17.072857405028447, "grad_norm": 0.002309780567884445, "learning_rate": 6.385121852653686e-07, "loss": 0.0, "num_input_tokens_seen": 200630256, "step": 93030 }, { "epoch": 17.073775004587997, "grad_norm": 0.0018229843117296696, "learning_rate": 6.381206908673021e-07, "loss": 0.0001, "num_input_tokens_seen": 200639984, "step": 93035 }, { "epoch": 17.07469260414755, "grad_norm": 0.0003278654767200351, "learning_rate": 6.377293083446795e-07, "loss": 0.0376, "num_input_tokens_seen": 200649264, "step": 93040 }, { "epoch": 17.075610203707104, "grad_norm": 0.004461777396500111, "learning_rate": 6.373380377075383e-07, "loss": 0.0, "num_input_tokens_seen": 200659312, "step": 93045 }, { "epoch": 17.076527803266654, "grad_norm": 0.010553627274930477, "learning_rate": 6.369468789659161e-07, "loss": 0.0, "num_input_tokens_seen": 200669808, "step": 93050 }, { "epoch": 17.077445402826207, "grad_norm": 0.005393240135163069, "learning_rate": 6.365558321298443e-07, "loss": 0.0, "num_input_tokens_seen": 200679440, "step": 93055 }, { "epoch": 17.07836300238576, "grad_norm": 0.007504710927605629, "learning_rate": 6.361648972093515e-07, "loss": 0.0, "num_input_tokens_seen": 200691504, "step": 93060 }, { "epoch": 17.07928060194531, "grad_norm": 0.2763650715351105, "learning_rate": 6.357740742144669e-07, "loss": 0.0001, "num_input_tokens_seen": 200701008, "step": 93065 }, { "epoch": 17.080198201504864, "grad_norm": 7.013028621673584, "learning_rate": 6.353833631552137e-07, "loss": 0.0074, "num_input_tokens_seen": 200711952, "step": 93070 }, { "epoch": 17.081115801064417, "grad_norm": 0.025159336626529694, "learning_rate": 6.34992764041612e-07, "loss": 0.0, "num_input_tokens_seen": 200723184, "step": 93075 }, { "epoch": 17.082033400623967, "grad_norm": 0.0020845746621489525, "learning_rate": 6.346022768836802e-07, "loss": 0.0, "num_input_tokens_seen": 200734320, "step": 93080 }, { "epoch": 17.08295100018352, "grad_norm": 0.016878927126526833, "learning_rate": 6.34211901691435e-07, "loss": 0.0032, "num_input_tokens_seen": 200745424, "step": 93085 }, { "epoch": 17.083868599743074, "grad_norm": 0.008025266230106354, "learning_rate": 6.338216384748885e-07, "loss": 0.0, "num_input_tokens_seen": 200756720, "step": 93090 }, { "epoch": 17.084786199302624, "grad_norm": 0.02059922367334366, "learning_rate": 6.33431487244049e-07, "loss": 0.0, "num_input_tokens_seen": 200767728, "step": 93095 }, { "epoch": 17.085703798862177, "grad_norm": 0.07307168841362, "learning_rate": 6.330414480089248e-07, "loss": 0.0, "num_input_tokens_seen": 200776912, "step": 93100 }, { "epoch": 17.08662139842173, "grad_norm": 0.029866814613342285, "learning_rate": 6.326515207795198e-07, "loss": 0.0, "num_input_tokens_seen": 200787248, "step": 93105 }, { "epoch": 17.08753899798128, "grad_norm": 0.06765671074390411, "learning_rate": 6.322617055658331e-07, "loss": 0.0001, "num_input_tokens_seen": 200797808, "step": 93110 }, { "epoch": 17.088456597540834, "grad_norm": 0.0012909166980534792, "learning_rate": 6.318720023778651e-07, "loss": 0.0, "num_input_tokens_seen": 200808624, "step": 93115 }, { "epoch": 17.089374197100387, "grad_norm": 0.002386647742241621, "learning_rate": 6.314824112256107e-07, "loss": 0.0, "num_input_tokens_seen": 200819408, "step": 93120 }, { "epoch": 17.090291796659937, "grad_norm": 0.0007622579578310251, "learning_rate": 6.310929321190623e-07, "loss": 0.0, "num_input_tokens_seen": 200830128, "step": 93125 }, { "epoch": 17.09120939621949, "grad_norm": 0.005248410161584616, "learning_rate": 6.30703565068207e-07, "loss": 0.0, "num_input_tokens_seen": 200841520, "step": 93130 }, { "epoch": 17.092126995779044, "grad_norm": 0.000652359682135284, "learning_rate": 6.30314310083035e-07, "loss": 0.0, "num_input_tokens_seen": 200851344, "step": 93135 }, { "epoch": 17.093044595338593, "grad_norm": 1.5294585227966309, "learning_rate": 6.299251671735285e-07, "loss": 0.0001, "num_input_tokens_seen": 200861392, "step": 93140 }, { "epoch": 17.093962194898147, "grad_norm": 0.0032614979427307844, "learning_rate": 6.295361363496677e-07, "loss": 0.0, "num_input_tokens_seen": 200872112, "step": 93145 }, { "epoch": 17.0948797944577, "grad_norm": 0.0014036042848601937, "learning_rate": 6.29147217621432e-07, "loss": 0.0, "num_input_tokens_seen": 200883984, "step": 93150 }, { "epoch": 17.09579739401725, "grad_norm": 0.0027196125593036413, "learning_rate": 6.28758410998796e-07, "loss": 0.0001, "num_input_tokens_seen": 200894832, "step": 93155 }, { "epoch": 17.096714993576803, "grad_norm": 0.005988610442727804, "learning_rate": 6.283697164917324e-07, "loss": 0.0002, "num_input_tokens_seen": 200905584, "step": 93160 }, { "epoch": 17.097632593136357, "grad_norm": 0.0007733766688033938, "learning_rate": 6.279811341102099e-07, "loss": 0.0001, "num_input_tokens_seen": 200917232, "step": 93165 }, { "epoch": 17.098550192695907, "grad_norm": 0.0008967489120550454, "learning_rate": 6.275926638641938e-07, "loss": 0.0007, "num_input_tokens_seen": 200928752, "step": 93170 }, { "epoch": 17.09946779225546, "grad_norm": 0.08221224695444107, "learning_rate": 6.272043057636507e-07, "loss": 0.0, "num_input_tokens_seen": 200939600, "step": 93175 }, { "epoch": 17.100385391815013, "grad_norm": 0.0025907475501298904, "learning_rate": 6.268160598185402e-07, "loss": 0.0, "num_input_tokens_seen": 200949360, "step": 93180 }, { "epoch": 17.101302991374563, "grad_norm": 0.0005265487707220018, "learning_rate": 6.264279260388195e-07, "loss": 0.0, "num_input_tokens_seen": 200960912, "step": 93185 }, { "epoch": 17.102220590934117, "grad_norm": 0.0015807149466127157, "learning_rate": 6.260399044344445e-07, "loss": 0.0, "num_input_tokens_seen": 200971696, "step": 93190 }, { "epoch": 17.10313819049367, "grad_norm": 0.0005521868006326258, "learning_rate": 6.256519950153655e-07, "loss": 0.0, "num_input_tokens_seen": 200982640, "step": 93195 }, { "epoch": 17.10405579005322, "grad_norm": 0.0012819395633414388, "learning_rate": 6.252641977915341e-07, "loss": 0.0002, "num_input_tokens_seen": 200994768, "step": 93200 }, { "epoch": 17.104973389612773, "grad_norm": 0.0012459353310987353, "learning_rate": 6.248765127728961e-07, "loss": 0.0, "num_input_tokens_seen": 201005200, "step": 93205 }, { "epoch": 17.105890989172327, "grad_norm": 0.33694273233413696, "learning_rate": 6.244889399693948e-07, "loss": 0.0002, "num_input_tokens_seen": 201016880, "step": 93210 }, { "epoch": 17.106808588731877, "grad_norm": 0.0011196830309927464, "learning_rate": 6.241014793909694e-07, "loss": 0.0, "num_input_tokens_seen": 201028080, "step": 93215 }, { "epoch": 17.10772618829143, "grad_norm": 0.010088875889778137, "learning_rate": 6.237141310475603e-07, "loss": 0.0, "num_input_tokens_seen": 201038448, "step": 93220 }, { "epoch": 17.108643787850983, "grad_norm": 0.0005897062947042286, "learning_rate": 6.233268949491011e-07, "loss": 0.2469, "num_input_tokens_seen": 201049424, "step": 93225 }, { "epoch": 17.109561387410533, "grad_norm": 0.0007283881423063576, "learning_rate": 6.22939771105523e-07, "loss": 0.0001, "num_input_tokens_seen": 201060560, "step": 93230 }, { "epoch": 17.110478986970087, "grad_norm": 0.004573191981762648, "learning_rate": 6.225527595267567e-07, "loss": 0.0, "num_input_tokens_seen": 201071888, "step": 93235 }, { "epoch": 17.11139658652964, "grad_norm": 0.0035212880466133356, "learning_rate": 6.221658602227276e-07, "loss": 0.0, "num_input_tokens_seen": 201082640, "step": 93240 }, { "epoch": 17.11231418608919, "grad_norm": 0.0017674150876700878, "learning_rate": 6.217790732033586e-07, "loss": 0.0, "num_input_tokens_seen": 201093904, "step": 93245 }, { "epoch": 17.113231785648743, "grad_norm": 0.0012730630114674568, "learning_rate": 6.213923984785713e-07, "loss": 0.0001, "num_input_tokens_seen": 201103696, "step": 93250 }, { "epoch": 17.114149385208297, "grad_norm": 0.0016463653882965446, "learning_rate": 6.210058360582827e-07, "loss": 0.0, "num_input_tokens_seen": 201114320, "step": 93255 }, { "epoch": 17.115066984767846, "grad_norm": 0.0014956592349335551, "learning_rate": 6.206193859524079e-07, "loss": 0.0, "num_input_tokens_seen": 201124080, "step": 93260 }, { "epoch": 17.1159845843274, "grad_norm": 0.007501235231757164, "learning_rate": 6.202330481708574e-07, "loss": 0.0, "num_input_tokens_seen": 201135632, "step": 93265 }, { "epoch": 17.116902183886953, "grad_norm": 0.005526966881006956, "learning_rate": 6.198468227235421e-07, "loss": 0.0001, "num_input_tokens_seen": 201146064, "step": 93270 }, { "epoch": 17.117819783446503, "grad_norm": 0.017288926988840103, "learning_rate": 6.19460709620367e-07, "loss": 0.0, "num_input_tokens_seen": 201157456, "step": 93275 }, { "epoch": 17.118737383006057, "grad_norm": 0.16274969279766083, "learning_rate": 6.190747088712346e-07, "loss": 0.0001, "num_input_tokens_seen": 201168176, "step": 93280 }, { "epoch": 17.11965498256561, "grad_norm": 0.0019442709162831306, "learning_rate": 6.18688820486047e-07, "loss": 0.1066, "num_input_tokens_seen": 201179312, "step": 93285 }, { "epoch": 17.12057258212516, "grad_norm": 0.0010007089003920555, "learning_rate": 6.183030444747007e-07, "loss": 0.0001, "num_input_tokens_seen": 201190224, "step": 93290 }, { "epoch": 17.121490181684713, "grad_norm": 0.003159375162795186, "learning_rate": 6.179173808470906e-07, "loss": 0.0, "num_input_tokens_seen": 201201936, "step": 93295 }, { "epoch": 17.122407781244267, "grad_norm": 0.0016020640032365918, "learning_rate": 6.175318296131072e-07, "loss": 0.0, "num_input_tokens_seen": 201213328, "step": 93300 }, { "epoch": 17.123325380803816, "grad_norm": 0.006257109344005585, "learning_rate": 6.171463907826408e-07, "loss": 0.0, "num_input_tokens_seen": 201223504, "step": 93305 }, { "epoch": 17.12424298036337, "grad_norm": 0.03321877121925354, "learning_rate": 6.16761064365577e-07, "loss": 0.0, "num_input_tokens_seen": 201233648, "step": 93310 }, { "epoch": 17.125160579922923, "grad_norm": 0.0077863833867013454, "learning_rate": 6.163758503717971e-07, "loss": 0.0, "num_input_tokens_seen": 201244784, "step": 93315 }, { "epoch": 17.126078179482473, "grad_norm": 0.0006663997774012387, "learning_rate": 6.159907488111838e-07, "loss": 0.0, "num_input_tokens_seen": 201256080, "step": 93320 }, { "epoch": 17.126995779042026, "grad_norm": 0.011907366104424, "learning_rate": 6.156057596936133e-07, "loss": 0.0001, "num_input_tokens_seen": 201266768, "step": 93325 }, { "epoch": 17.12791337860158, "grad_norm": 0.004100109450519085, "learning_rate": 6.152208830289586e-07, "loss": 0.0, "num_input_tokens_seen": 201277456, "step": 93330 }, { "epoch": 17.12883097816113, "grad_norm": 0.0009135258733294904, "learning_rate": 6.148361188270934e-07, "loss": 0.0, "num_input_tokens_seen": 201288144, "step": 93335 }, { "epoch": 17.129748577720683, "grad_norm": 0.0009766039438545704, "learning_rate": 6.144514670978857e-07, "loss": 0.0, "num_input_tokens_seen": 201299248, "step": 93340 }, { "epoch": 17.130666177280236, "grad_norm": 0.00589552940800786, "learning_rate": 6.140669278512007e-07, "loss": 0.0, "num_input_tokens_seen": 201309808, "step": 93345 }, { "epoch": 17.131583776839786, "grad_norm": 0.0030854037031531334, "learning_rate": 6.136825010969006e-07, "loss": 0.0376, "num_input_tokens_seen": 201319088, "step": 93350 }, { "epoch": 17.13250137639934, "grad_norm": 0.002719457494094968, "learning_rate": 6.132981868448468e-07, "loss": 0.0, "num_input_tokens_seen": 201329136, "step": 93355 }, { "epoch": 17.133418975958893, "grad_norm": 0.0009002851438708603, "learning_rate": 6.129139851048959e-07, "loss": 0.0, "num_input_tokens_seen": 201339952, "step": 93360 }, { "epoch": 17.134336575518443, "grad_norm": 75.24848175048828, "learning_rate": 6.125298958869009e-07, "loss": 0.0376, "num_input_tokens_seen": 201350256, "step": 93365 }, { "epoch": 17.135254175077996, "grad_norm": 0.08246713876724243, "learning_rate": 6.121459192007156e-07, "loss": 0.0, "num_input_tokens_seen": 201361232, "step": 93370 }, { "epoch": 17.13617177463755, "grad_norm": 0.007913441397249699, "learning_rate": 6.117620550561865e-07, "loss": 0.0, "num_input_tokens_seen": 201372080, "step": 93375 }, { "epoch": 17.1370893741971, "grad_norm": 0.004224294796586037, "learning_rate": 6.113783034631593e-07, "loss": 0.0, "num_input_tokens_seen": 201381808, "step": 93380 }, { "epoch": 17.138006973756653, "grad_norm": 0.11095262318849564, "learning_rate": 6.109946644314774e-07, "loss": 0.0001, "num_input_tokens_seen": 201392464, "step": 93385 }, { "epoch": 17.138924573316206, "grad_norm": 0.0018522219033911824, "learning_rate": 6.106111379709784e-07, "loss": 0.0, "num_input_tokens_seen": 201403792, "step": 93390 }, { "epoch": 17.139842172875756, "grad_norm": 0.05820916220545769, "learning_rate": 6.102277240915022e-07, "loss": 0.0001, "num_input_tokens_seen": 201414256, "step": 93395 }, { "epoch": 17.14075977243531, "grad_norm": 0.0011275793658569455, "learning_rate": 6.098444228028816e-07, "loss": 0.1321, "num_input_tokens_seen": 201425776, "step": 93400 }, { "epoch": 17.141677371994863, "grad_norm": 0.0005929421749897301, "learning_rate": 6.09461234114947e-07, "loss": 0.0001, "num_input_tokens_seen": 201437744, "step": 93405 }, { "epoch": 17.142594971554413, "grad_norm": 0.0036313666496425867, "learning_rate": 6.090781580375271e-07, "loss": 0.0, "num_input_tokens_seen": 201447248, "step": 93410 }, { "epoch": 17.143512571113966, "grad_norm": 0.005709260702133179, "learning_rate": 6.086951945804459e-07, "loss": 0.0, "num_input_tokens_seen": 201457200, "step": 93415 }, { "epoch": 17.14443017067352, "grad_norm": 0.002035480458289385, "learning_rate": 6.083123437535282e-07, "loss": 0.0, "num_input_tokens_seen": 201468112, "step": 93420 }, { "epoch": 17.14534777023307, "grad_norm": 0.011466550640761852, "learning_rate": 6.079296055665929e-07, "loss": 0.0, "num_input_tokens_seen": 201478672, "step": 93425 }, { "epoch": 17.146265369792623, "grad_norm": 0.040446192026138306, "learning_rate": 6.075469800294548e-07, "loss": 0.0, "num_input_tokens_seen": 201489264, "step": 93430 }, { "epoch": 17.147182969352176, "grad_norm": 0.000619049125816673, "learning_rate": 6.071644671519295e-07, "loss": 0.0, "num_input_tokens_seen": 201500080, "step": 93435 }, { "epoch": 17.148100568911726, "grad_norm": 0.0008547335746698081, "learning_rate": 6.067820669438279e-07, "loss": 0.0001, "num_input_tokens_seen": 201511568, "step": 93440 }, { "epoch": 17.14901816847128, "grad_norm": 0.0006041281158104539, "learning_rate": 6.063997794149573e-07, "loss": 0.2063, "num_input_tokens_seen": 201522160, "step": 93445 }, { "epoch": 17.149935768030833, "grad_norm": 0.013876301236450672, "learning_rate": 6.060176045751215e-07, "loss": 0.0, "num_input_tokens_seen": 201533712, "step": 93450 }, { "epoch": 17.150853367590383, "grad_norm": 0.0015939654549583793, "learning_rate": 6.056355424341259e-07, "loss": 0.0284, "num_input_tokens_seen": 201545040, "step": 93455 }, { "epoch": 17.151770967149936, "grad_norm": 0.02021031640470028, "learning_rate": 6.052535930017672e-07, "loss": 0.0, "num_input_tokens_seen": 201553936, "step": 93460 }, { "epoch": 17.15268856670949, "grad_norm": 0.006051176693290472, "learning_rate": 6.04871756287842e-07, "loss": 0.0, "num_input_tokens_seen": 201565840, "step": 93465 }, { "epoch": 17.15360616626904, "grad_norm": 0.0006288531003519893, "learning_rate": 6.044900323021452e-07, "loss": 0.0014, "num_input_tokens_seen": 201576496, "step": 93470 }, { "epoch": 17.154523765828593, "grad_norm": 0.01485254056751728, "learning_rate": 6.041084210544668e-07, "loss": 0.0, "num_input_tokens_seen": 201587664, "step": 93475 }, { "epoch": 17.155441365388146, "grad_norm": 0.0014687059447169304, "learning_rate": 6.037269225545944e-07, "loss": 0.0001, "num_input_tokens_seen": 201598832, "step": 93480 }, { "epoch": 17.156358964947696, "grad_norm": 0.00499717378988862, "learning_rate": 6.03345536812312e-07, "loss": 0.0, "num_input_tokens_seen": 201610128, "step": 93485 }, { "epoch": 17.15727656450725, "grad_norm": 0.00472283735871315, "learning_rate": 6.029642638374028e-07, "loss": 0.0, "num_input_tokens_seen": 201619856, "step": 93490 }, { "epoch": 17.158194164066803, "grad_norm": 0.015438990667462349, "learning_rate": 6.025831036396462e-07, "loss": 0.0, "num_input_tokens_seen": 201630416, "step": 93495 }, { "epoch": 17.159111763626353, "grad_norm": 0.0033275617752224207, "learning_rate": 6.022020562288161e-07, "loss": 0.0, "num_input_tokens_seen": 201641200, "step": 93500 }, { "epoch": 17.160029363185906, "grad_norm": 0.058530595153570175, "learning_rate": 6.018211216146885e-07, "loss": 0.0, "num_input_tokens_seen": 201651184, "step": 93505 }, { "epoch": 17.16094696274546, "grad_norm": 0.0014518068637698889, "learning_rate": 6.014402998070323e-07, "loss": 0.0, "num_input_tokens_seen": 201662288, "step": 93510 }, { "epoch": 17.16186456230501, "grad_norm": 0.0024245502427220345, "learning_rate": 6.010595908156147e-07, "loss": 0.0, "num_input_tokens_seen": 201672496, "step": 93515 }, { "epoch": 17.162782161864563, "grad_norm": 0.0021132146939635277, "learning_rate": 6.006789946502017e-07, "loss": 0.0, "num_input_tokens_seen": 201682224, "step": 93520 }, { "epoch": 17.163699761424116, "grad_norm": 0.004705802537500858, "learning_rate": 6.002985113205539e-07, "loss": 0.0026, "num_input_tokens_seen": 201693008, "step": 93525 }, { "epoch": 17.164617360983666, "grad_norm": 0.005117497872561216, "learning_rate": 5.999181408364308e-07, "loss": 0.0, "num_input_tokens_seen": 201703952, "step": 93530 }, { "epoch": 17.16553496054322, "grad_norm": 0.0054263761267066, "learning_rate": 5.995378832075865e-07, "loss": 0.0, "num_input_tokens_seen": 201715280, "step": 93535 }, { "epoch": 17.166452560102773, "grad_norm": 0.0013374007539823651, "learning_rate": 5.991577384437764e-07, "loss": 0.0, "num_input_tokens_seen": 201726704, "step": 93540 }, { "epoch": 17.167370159662322, "grad_norm": 0.03584503009915352, "learning_rate": 5.987777065547495e-07, "loss": 0.0005, "num_input_tokens_seen": 201737392, "step": 93545 }, { "epoch": 17.168287759221876, "grad_norm": 0.0004921351792290807, "learning_rate": 5.983977875502528e-07, "loss": 0.0, "num_input_tokens_seen": 201748720, "step": 93550 }, { "epoch": 17.16920535878143, "grad_norm": 0.0017479303060099483, "learning_rate": 5.980179814400311e-07, "loss": 0.0001, "num_input_tokens_seen": 201759408, "step": 93555 }, { "epoch": 17.17012295834098, "grad_norm": 0.04135545715689659, "learning_rate": 5.976382882338266e-07, "loss": 0.0, "num_input_tokens_seen": 201771216, "step": 93560 }, { "epoch": 17.171040557900533, "grad_norm": 0.0011182588059455156, "learning_rate": 5.972587079413755e-07, "loss": 0.0, "num_input_tokens_seen": 201781200, "step": 93565 }, { "epoch": 17.171958157460086, "grad_norm": 0.0005354188615456223, "learning_rate": 5.968792405724161e-07, "loss": 0.0001, "num_input_tokens_seen": 201792592, "step": 93570 }, { "epoch": 17.172875757019636, "grad_norm": 0.07066982239484787, "learning_rate": 5.964998861366794e-07, "loss": 0.0016, "num_input_tokens_seen": 201803376, "step": 93575 }, { "epoch": 17.17379335657919, "grad_norm": 0.0016671076882630587, "learning_rate": 5.961206446438966e-07, "loss": 0.0001, "num_input_tokens_seen": 201814384, "step": 93580 }, { "epoch": 17.174710956138743, "grad_norm": 0.0009997630259022117, "learning_rate": 5.957415161037921e-07, "loss": 0.0004, "num_input_tokens_seen": 201825424, "step": 93585 }, { "epoch": 17.175628555698292, "grad_norm": 0.0002816384076140821, "learning_rate": 5.953625005260932e-07, "loss": 0.0, "num_input_tokens_seen": 201836432, "step": 93590 }, { "epoch": 17.176546155257846, "grad_norm": 0.022822290658950806, "learning_rate": 5.949835979205199e-07, "loss": 0.0, "num_input_tokens_seen": 201848048, "step": 93595 }, { "epoch": 17.1774637548174, "grad_norm": 0.05214863270521164, "learning_rate": 5.946048082967898e-07, "loss": 0.0, "num_input_tokens_seen": 201858736, "step": 93600 }, { "epoch": 17.17838135437695, "grad_norm": 0.014716479927301407, "learning_rate": 5.942261316646187e-07, "loss": 0.0001, "num_input_tokens_seen": 201867760, "step": 93605 }, { "epoch": 17.179298953936502, "grad_norm": 0.0009314889903180301, "learning_rate": 5.938475680337174e-07, "loss": 0.0, "num_input_tokens_seen": 201879152, "step": 93610 }, { "epoch": 17.180216553496056, "grad_norm": 0.002667234744876623, "learning_rate": 5.934691174137991e-07, "loss": 0.0, "num_input_tokens_seen": 201891472, "step": 93615 }, { "epoch": 17.181134153055606, "grad_norm": 0.0044857049360871315, "learning_rate": 5.930907798145674e-07, "loss": 0.0003, "num_input_tokens_seen": 201903024, "step": 93620 }, { "epoch": 17.18205175261516, "grad_norm": 0.12630517780780792, "learning_rate": 5.92712555245728e-07, "loss": 0.0, "num_input_tokens_seen": 201914096, "step": 93625 }, { "epoch": 17.182969352174712, "grad_norm": 0.04773550108075142, "learning_rate": 5.923344437169804e-07, "loss": 0.0, "num_input_tokens_seen": 201924944, "step": 93630 }, { "epoch": 17.183886951734262, "grad_norm": 0.0011571702780202031, "learning_rate": 5.919564452380222e-07, "loss": 0.0, "num_input_tokens_seen": 201936112, "step": 93635 }, { "epoch": 17.184804551293816, "grad_norm": 0.018993163481354713, "learning_rate": 5.915785598185503e-07, "loss": 0.0, "num_input_tokens_seen": 201947088, "step": 93640 }, { "epoch": 17.18572215085337, "grad_norm": 0.004962393082678318, "learning_rate": 5.912007874682557e-07, "loss": 0.0, "num_input_tokens_seen": 201959760, "step": 93645 }, { "epoch": 17.18663975041292, "grad_norm": 0.000668432330712676, "learning_rate": 5.908231281968274e-07, "loss": 0.0006, "num_input_tokens_seen": 201970864, "step": 93650 }, { "epoch": 17.187557349972472, "grad_norm": 0.014668374322354794, "learning_rate": 5.904455820139526e-07, "loss": 0.0, "num_input_tokens_seen": 201981296, "step": 93655 }, { "epoch": 17.188474949532026, "grad_norm": 0.0041391076520085335, "learning_rate": 5.900681489293147e-07, "loss": 0.0001, "num_input_tokens_seen": 201992720, "step": 93660 }, { "epoch": 17.189392549091576, "grad_norm": 0.0025842676404863596, "learning_rate": 5.896908289525943e-07, "loss": 0.0, "num_input_tokens_seen": 202002992, "step": 93665 }, { "epoch": 17.19031014865113, "grad_norm": 74.17920684814453, "learning_rate": 5.893136220934675e-07, "loss": 0.027, "num_input_tokens_seen": 202013904, "step": 93670 }, { "epoch": 17.191227748210682, "grad_norm": 0.000473243766464293, "learning_rate": 5.889365283616111e-07, "loss": 0.0, "num_input_tokens_seen": 202023344, "step": 93675 }, { "epoch": 17.192145347770232, "grad_norm": 0.014699973165988922, "learning_rate": 5.885595477666967e-07, "loss": 0.0, "num_input_tokens_seen": 202033872, "step": 93680 }, { "epoch": 17.193062947329786, "grad_norm": 0.00494243623688817, "learning_rate": 5.881826803183915e-07, "loss": 0.0, "num_input_tokens_seen": 202046096, "step": 93685 }, { "epoch": 17.19398054688934, "grad_norm": 0.002553394762799144, "learning_rate": 5.878059260263641e-07, "loss": 0.0, "num_input_tokens_seen": 202056336, "step": 93690 }, { "epoch": 17.19489814644889, "grad_norm": 0.012475285679101944, "learning_rate": 5.874292849002761e-07, "loss": 0.0, "num_input_tokens_seen": 202066256, "step": 93695 }, { "epoch": 17.195815746008442, "grad_norm": 0.0005482289125211537, "learning_rate": 5.870527569497875e-07, "loss": 0.0001, "num_input_tokens_seen": 202076176, "step": 93700 }, { "epoch": 17.196733345567996, "grad_norm": 0.04581458121538162, "learning_rate": 5.866763421845567e-07, "loss": 0.0001, "num_input_tokens_seen": 202087664, "step": 93705 }, { "epoch": 17.197650945127545, "grad_norm": 0.002380537101998925, "learning_rate": 5.863000406142383e-07, "loss": 0.0, "num_input_tokens_seen": 202098704, "step": 93710 }, { "epoch": 17.1985685446871, "grad_norm": 0.000957040989305824, "learning_rate": 5.859238522484828e-07, "loss": 0.0, "num_input_tokens_seen": 202109520, "step": 93715 }, { "epoch": 17.199486144246652, "grad_norm": 0.4026014804840088, "learning_rate": 5.855477770969381e-07, "loss": 0.0041, "num_input_tokens_seen": 202120048, "step": 93720 }, { "epoch": 17.200403743806202, "grad_norm": 0.0010464336955919862, "learning_rate": 5.851718151692526e-07, "loss": 0.0016, "num_input_tokens_seen": 202130672, "step": 93725 }, { "epoch": 17.201321343365755, "grad_norm": 0.0020955679938197136, "learning_rate": 5.847959664750674e-07, "loss": 0.0207, "num_input_tokens_seen": 202140304, "step": 93730 }, { "epoch": 17.20223894292531, "grad_norm": 0.0007527560810558498, "learning_rate": 5.844202310240222e-07, "loss": 0.0, "num_input_tokens_seen": 202151920, "step": 93735 }, { "epoch": 17.20315654248486, "grad_norm": 0.004697931930422783, "learning_rate": 5.840446088257551e-07, "loss": 0.0, "num_input_tokens_seen": 202163312, "step": 93740 }, { "epoch": 17.204074142044412, "grad_norm": 0.0006210441351868212, "learning_rate": 5.836690998898997e-07, "loss": 0.0, "num_input_tokens_seen": 202174608, "step": 93745 }, { "epoch": 17.204991741603965, "grad_norm": 0.0036372945178300142, "learning_rate": 5.832937042260872e-07, "loss": 0.0018, "num_input_tokens_seen": 202185712, "step": 93750 }, { "epoch": 17.205909341163515, "grad_norm": 0.001347015262581408, "learning_rate": 5.829184218439448e-07, "loss": 0.0, "num_input_tokens_seen": 202197008, "step": 93755 }, { "epoch": 17.20682694072307, "grad_norm": 0.012349171563982964, "learning_rate": 5.825432527531005e-07, "loss": 0.0, "num_input_tokens_seen": 202208592, "step": 93760 }, { "epoch": 17.207744540282622, "grad_norm": 0.0013134045293554664, "learning_rate": 5.821681969631749e-07, "loss": 0.0, "num_input_tokens_seen": 202219888, "step": 93765 }, { "epoch": 17.208662139842172, "grad_norm": 0.019968697801232338, "learning_rate": 5.817932544837873e-07, "loss": 0.0, "num_input_tokens_seen": 202230544, "step": 93770 }, { "epoch": 17.209579739401725, "grad_norm": 0.1504542976617813, "learning_rate": 5.814184253245558e-07, "loss": 0.0, "num_input_tokens_seen": 202241008, "step": 93775 }, { "epoch": 17.21049733896128, "grad_norm": 0.0037912381812930107, "learning_rate": 5.810437094950938e-07, "loss": 0.0001, "num_input_tokens_seen": 202252784, "step": 93780 }, { "epoch": 17.21141493852083, "grad_norm": 0.007994248531758785, "learning_rate": 5.806691070050108e-07, "loss": 0.0, "num_input_tokens_seen": 202262192, "step": 93785 }, { "epoch": 17.212332538080382, "grad_norm": 0.01648191548883915, "learning_rate": 5.802946178639168e-07, "loss": 0.0, "num_input_tokens_seen": 202272592, "step": 93790 }, { "epoch": 17.213250137639935, "grad_norm": 0.015444329008460045, "learning_rate": 5.79920242081416e-07, "loss": 0.0008, "num_input_tokens_seen": 202283376, "step": 93795 }, { "epoch": 17.214167737199485, "grad_norm": 0.0022698098327964544, "learning_rate": 5.795459796671105e-07, "loss": 0.0504, "num_input_tokens_seen": 202293264, "step": 93800 }, { "epoch": 17.21508533675904, "grad_norm": 2.926241397857666, "learning_rate": 5.791718306305982e-07, "loss": 0.0005, "num_input_tokens_seen": 202304976, "step": 93805 }, { "epoch": 17.216002936318592, "grad_norm": 0.0011088920291513205, "learning_rate": 5.787977949814783e-07, "loss": 0.0, "num_input_tokens_seen": 202316304, "step": 93810 }, { "epoch": 17.216920535878142, "grad_norm": 0.005916311871260405, "learning_rate": 5.784238727293423e-07, "loss": 0.0, "num_input_tokens_seen": 202329264, "step": 93815 }, { "epoch": 17.217838135437695, "grad_norm": 0.0015937864081934094, "learning_rate": 5.780500638837811e-07, "loss": 0.0, "num_input_tokens_seen": 202339248, "step": 93820 }, { "epoch": 17.21875573499725, "grad_norm": 0.003313209628686309, "learning_rate": 5.776763684543829e-07, "loss": 0.0, "num_input_tokens_seen": 202348592, "step": 93825 }, { "epoch": 17.2196733345568, "grad_norm": 0.0056819734163582325, "learning_rate": 5.773027864507313e-07, "loss": 0.0001, "num_input_tokens_seen": 202359440, "step": 93830 }, { "epoch": 17.220590934116352, "grad_norm": 0.0068668946623802185, "learning_rate": 5.769293178824081e-07, "loss": 0.0001, "num_input_tokens_seen": 202369680, "step": 93835 }, { "epoch": 17.221508533675905, "grad_norm": 0.0024523416068404913, "learning_rate": 5.765559627589934e-07, "loss": 0.0, "num_input_tokens_seen": 202381776, "step": 93840 }, { "epoch": 17.222426133235455, "grad_norm": 0.04997336119413376, "learning_rate": 5.761827210900628e-07, "loss": 0.0, "num_input_tokens_seen": 202392304, "step": 93845 }, { "epoch": 17.22334373279501, "grad_norm": 0.00649863388389349, "learning_rate": 5.758095928851893e-07, "loss": 0.0, "num_input_tokens_seen": 202402416, "step": 93850 }, { "epoch": 17.224261332354562, "grad_norm": 0.03556205704808235, "learning_rate": 5.754365781539412e-07, "loss": 0.0, "num_input_tokens_seen": 202412592, "step": 93855 }, { "epoch": 17.22517893191411, "grad_norm": 0.010165768675506115, "learning_rate": 5.750636769058893e-07, "loss": 0.0, "num_input_tokens_seen": 202421840, "step": 93860 }, { "epoch": 17.226096531473665, "grad_norm": 0.0009266072884202003, "learning_rate": 5.746908891505953e-07, "loss": 0.2181, "num_input_tokens_seen": 202433360, "step": 93865 }, { "epoch": 17.22701413103322, "grad_norm": 0.0028574420139193535, "learning_rate": 5.743182148976207e-07, "loss": 0.0003, "num_input_tokens_seen": 202445904, "step": 93870 }, { "epoch": 17.22793173059277, "grad_norm": 0.005103332456201315, "learning_rate": 5.739456541565258e-07, "loss": 0.0002, "num_input_tokens_seen": 202456656, "step": 93875 }, { "epoch": 17.228849330152322, "grad_norm": 0.0006674376782029867, "learning_rate": 5.735732069368649e-07, "loss": 0.0, "num_input_tokens_seen": 202467472, "step": 93880 }, { "epoch": 17.229766929711875, "grad_norm": 0.00802823156118393, "learning_rate": 5.732008732481897e-07, "loss": 0.0, "num_input_tokens_seen": 202478736, "step": 93885 }, { "epoch": 17.230684529271425, "grad_norm": 0.0005560779827646911, "learning_rate": 5.728286531000526e-07, "loss": 0.0, "num_input_tokens_seen": 202488624, "step": 93890 }, { "epoch": 17.23160212883098, "grad_norm": 0.0026975851505994797, "learning_rate": 5.724565465019988e-07, "loss": 0.0, "num_input_tokens_seen": 202498224, "step": 93895 }, { "epoch": 17.232519728390532, "grad_norm": 883.7008666992188, "learning_rate": 5.720845534635727e-07, "loss": 0.2063, "num_input_tokens_seen": 202509360, "step": 93900 }, { "epoch": 17.23343732795008, "grad_norm": 0.008612450212240219, "learning_rate": 5.717126739943141e-07, "loss": 0.0, "num_input_tokens_seen": 202521104, "step": 93905 }, { "epoch": 17.234354927509635, "grad_norm": 0.0021528892684727907, "learning_rate": 5.71340908103763e-07, "loss": 0.0, "num_input_tokens_seen": 202532368, "step": 93910 }, { "epoch": 17.23527252706919, "grad_norm": 0.041848666965961456, "learning_rate": 5.70969255801454e-07, "loss": 0.2188, "num_input_tokens_seen": 202542192, "step": 93915 }, { "epoch": 17.23619012662874, "grad_norm": 0.00908187497407198, "learning_rate": 5.705977170969184e-07, "loss": 0.0, "num_input_tokens_seen": 202552144, "step": 93920 }, { "epoch": 17.23710772618829, "grad_norm": 0.0011940315598621964, "learning_rate": 5.702262919996871e-07, "loss": 0.0004, "num_input_tokens_seen": 202562896, "step": 93925 }, { "epoch": 17.238025325747845, "grad_norm": 0.004983504302799702, "learning_rate": 5.69854980519286e-07, "loss": 0.0, "num_input_tokens_seen": 202573840, "step": 93930 }, { "epoch": 17.238942925307395, "grad_norm": 0.19582462310791016, "learning_rate": 5.694837826652383e-07, "loss": 0.0007, "num_input_tokens_seen": 202584560, "step": 93935 }, { "epoch": 17.23986052486695, "grad_norm": 0.3795088529586792, "learning_rate": 5.691126984470641e-07, "loss": 0.0001, "num_input_tokens_seen": 202595856, "step": 93940 }, { "epoch": 17.2407781244265, "grad_norm": 0.008693807758390903, "learning_rate": 5.68741727874283e-07, "loss": 0.0, "num_input_tokens_seen": 202605840, "step": 93945 }, { "epoch": 17.24169572398605, "grad_norm": 0.0011653879191726446, "learning_rate": 5.68370870956409e-07, "loss": 0.0, "num_input_tokens_seen": 202616944, "step": 93950 }, { "epoch": 17.242613323545605, "grad_norm": 0.01204605307430029, "learning_rate": 5.680001277029524e-07, "loss": 0.0, "num_input_tokens_seen": 202628016, "step": 93955 }, { "epoch": 17.24353092310516, "grad_norm": 0.004238656256347895, "learning_rate": 5.676294981234243e-07, "loss": 0.0016, "num_input_tokens_seen": 202638672, "step": 93960 }, { "epoch": 17.244448522664708, "grad_norm": 0.008859934285283089, "learning_rate": 5.672589822273305e-07, "loss": 0.0, "num_input_tokens_seen": 202648144, "step": 93965 }, { "epoch": 17.24536612222426, "grad_norm": 0.0026045457925647497, "learning_rate": 5.668885800241724e-07, "loss": 0.0, "num_input_tokens_seen": 202659760, "step": 93970 }, { "epoch": 17.246283721783815, "grad_norm": 0.0011527843307703733, "learning_rate": 5.66518291523453e-07, "loss": 0.0001, "num_input_tokens_seen": 202671280, "step": 93975 }, { "epoch": 17.247201321343365, "grad_norm": 0.009462754242122173, "learning_rate": 5.661481167346677e-07, "loss": 0.0, "num_input_tokens_seen": 202681776, "step": 93980 }, { "epoch": 17.248118920902918, "grad_norm": 0.04931372404098511, "learning_rate": 5.657780556673115e-07, "loss": 0.0001, "num_input_tokens_seen": 202692272, "step": 93985 }, { "epoch": 17.24903652046247, "grad_norm": 0.00037126822280697525, "learning_rate": 5.654081083308744e-07, "loss": 0.0, "num_input_tokens_seen": 202703120, "step": 93990 }, { "epoch": 17.24995412002202, "grad_norm": 0.005491376854479313, "learning_rate": 5.650382747348476e-07, "loss": 0.0, "num_input_tokens_seen": 202713616, "step": 93995 }, { "epoch": 17.250871719581575, "grad_norm": 0.003297293558716774, "learning_rate": 5.646685548887154e-07, "loss": 0.0, "num_input_tokens_seen": 202723696, "step": 94000 }, { "epoch": 17.25178931914113, "grad_norm": 0.013430032879114151, "learning_rate": 5.642989488019601e-07, "loss": 0.0, "num_input_tokens_seen": 202733488, "step": 94005 }, { "epoch": 17.252706918700678, "grad_norm": 0.008906770497560501, "learning_rate": 5.639294564840625e-07, "loss": 0.0, "num_input_tokens_seen": 202743376, "step": 94010 }, { "epoch": 17.25362451826023, "grad_norm": 0.0009028076310642064, "learning_rate": 5.635600779444995e-07, "loss": 0.0001, "num_input_tokens_seen": 202752976, "step": 94015 }, { "epoch": 17.254542117819785, "grad_norm": 0.00675802631303668, "learning_rate": 5.631908131927438e-07, "loss": 0.0, "num_input_tokens_seen": 202763888, "step": 94020 }, { "epoch": 17.255459717379335, "grad_norm": 1.7802997827529907, "learning_rate": 5.628216622382682e-07, "loss": 0.0001, "num_input_tokens_seen": 202772784, "step": 94025 }, { "epoch": 17.256377316938888, "grad_norm": 0.0012441971339285374, "learning_rate": 5.624526250905388e-07, "loss": 0.0, "num_input_tokens_seen": 202782832, "step": 94030 }, { "epoch": 17.25729491649844, "grad_norm": 0.01567579060792923, "learning_rate": 5.620837017590225e-07, "loss": 0.0, "num_input_tokens_seen": 202793648, "step": 94035 }, { "epoch": 17.25821251605799, "grad_norm": 0.009708206169307232, "learning_rate": 5.617148922531817e-07, "loss": 0.0, "num_input_tokens_seen": 202804368, "step": 94040 }, { "epoch": 17.259130115617545, "grad_norm": 0.0006544193020090461, "learning_rate": 5.613461965824746e-07, "loss": 0.0, "num_input_tokens_seen": 202815824, "step": 94045 }, { "epoch": 17.260047715177098, "grad_norm": 0.021394003182649612, "learning_rate": 5.609776147563589e-07, "loss": 0.0, "num_input_tokens_seen": 202825904, "step": 94050 }, { "epoch": 17.260965314736648, "grad_norm": 0.0015571395633742213, "learning_rate": 5.606091467842861e-07, "loss": 0.0, "num_input_tokens_seen": 202836208, "step": 94055 }, { "epoch": 17.2618829142962, "grad_norm": 0.0006466858903877437, "learning_rate": 5.602407926757092e-07, "loss": 0.0, "num_input_tokens_seen": 202845776, "step": 94060 }, { "epoch": 17.262800513855755, "grad_norm": 141.9770050048828, "learning_rate": 5.598725524400755e-07, "loss": 0.1823, "num_input_tokens_seen": 202856656, "step": 94065 }, { "epoch": 17.263718113415305, "grad_norm": 0.002626759698614478, "learning_rate": 5.595044260868288e-07, "loss": 0.0, "num_input_tokens_seen": 202867728, "step": 94070 }, { "epoch": 17.264635712974858, "grad_norm": 0.000368261884432286, "learning_rate": 5.591364136254107e-07, "loss": 0.0, "num_input_tokens_seen": 202879248, "step": 94075 }, { "epoch": 17.26555331253441, "grad_norm": 0.001282784272916615, "learning_rate": 5.587685150652616e-07, "loss": 0.0, "num_input_tokens_seen": 202890608, "step": 94080 }, { "epoch": 17.26647091209396, "grad_norm": 0.003788979258388281, "learning_rate": 5.584007304158168e-07, "loss": 0.0003, "num_input_tokens_seen": 202902032, "step": 94085 }, { "epoch": 17.267388511653515, "grad_norm": 0.0018588303355500102, "learning_rate": 5.580330596865085e-07, "loss": 0.0, "num_input_tokens_seen": 202912944, "step": 94090 }, { "epoch": 17.268306111213068, "grad_norm": 0.0031268277671188116, "learning_rate": 5.576655028867689e-07, "loss": 0.0, "num_input_tokens_seen": 202923536, "step": 94095 }, { "epoch": 17.269223710772618, "grad_norm": 0.003114779945462942, "learning_rate": 5.572980600260241e-07, "loss": 0.0, "num_input_tokens_seen": 202935952, "step": 94100 }, { "epoch": 17.27014131033217, "grad_norm": 0.008714506402611732, "learning_rate": 5.569307311136973e-07, "loss": 0.0, "num_input_tokens_seen": 202947152, "step": 94105 }, { "epoch": 17.271058909891725, "grad_norm": 0.0003961490001529455, "learning_rate": 5.56563516159212e-07, "loss": 0.0, "num_input_tokens_seen": 202957712, "step": 94110 }, { "epoch": 17.271976509451274, "grad_norm": 0.0004704141756519675, "learning_rate": 5.561964151719862e-07, "loss": 0.0, "num_input_tokens_seen": 202967760, "step": 94115 }, { "epoch": 17.272894109010828, "grad_norm": 0.019616421312093735, "learning_rate": 5.558294281614351e-07, "loss": 0.0001, "num_input_tokens_seen": 202977840, "step": 94120 }, { "epoch": 17.27381170857038, "grad_norm": 0.000588119903113693, "learning_rate": 5.554625551369702e-07, "loss": 0.0079, "num_input_tokens_seen": 202986736, "step": 94125 }, { "epoch": 17.27472930812993, "grad_norm": 0.008467999286949635, "learning_rate": 5.550957961080034e-07, "loss": 0.0, "num_input_tokens_seen": 202997648, "step": 94130 }, { "epoch": 17.275646907689485, "grad_norm": 0.0012682548258453608, "learning_rate": 5.547291510839404e-07, "loss": 0.0, "num_input_tokens_seen": 203008688, "step": 94135 }, { "epoch": 17.276564507249038, "grad_norm": 0.05102846026420593, "learning_rate": 5.543626200741842e-07, "loss": 0.0, "num_input_tokens_seen": 203019312, "step": 94140 }, { "epoch": 17.277482106808588, "grad_norm": 0.0016409457894042134, "learning_rate": 5.539962030881374e-07, "loss": 0.0001, "num_input_tokens_seen": 203031280, "step": 94145 }, { "epoch": 17.27839970636814, "grad_norm": 0.003732092445716262, "learning_rate": 5.536299001351975e-07, "loss": 0.0, "num_input_tokens_seen": 203043216, "step": 94150 }, { "epoch": 17.279317305927695, "grad_norm": 0.09125220030546188, "learning_rate": 5.532637112247585e-07, "loss": 0.1314, "num_input_tokens_seen": 203053744, "step": 94155 }, { "epoch": 17.280234905487244, "grad_norm": 0.0008499535033479333, "learning_rate": 5.528976363662142e-07, "loss": 0.0703, "num_input_tokens_seen": 203065136, "step": 94160 }, { "epoch": 17.281152505046798, "grad_norm": 0.0015046773478388786, "learning_rate": 5.525316755689536e-07, "loss": 0.0, "num_input_tokens_seen": 203076176, "step": 94165 }, { "epoch": 17.28207010460635, "grad_norm": 0.08175894618034363, "learning_rate": 5.52165828842362e-07, "loss": 0.0002, "num_input_tokens_seen": 203086896, "step": 94170 }, { "epoch": 17.2829877041659, "grad_norm": 0.0062120212242007256, "learning_rate": 5.518000961958231e-07, "loss": 0.0, "num_input_tokens_seen": 203098096, "step": 94175 }, { "epoch": 17.283905303725454, "grad_norm": 2.564012289047241, "learning_rate": 5.514344776387182e-07, "loss": 0.0004, "num_input_tokens_seen": 203108048, "step": 94180 }, { "epoch": 17.284822903285008, "grad_norm": 0.0017062791157513857, "learning_rate": 5.510689731804242e-07, "loss": 0.0001, "num_input_tokens_seen": 203117968, "step": 94185 }, { "epoch": 17.285740502844558, "grad_norm": 0.02875855378806591, "learning_rate": 5.507035828303148e-07, "loss": 0.0, "num_input_tokens_seen": 203128080, "step": 94190 }, { "epoch": 17.28665810240411, "grad_norm": 0.0008742682985030115, "learning_rate": 5.503383065977641e-07, "loss": 0.0, "num_input_tokens_seen": 203138064, "step": 94195 }, { "epoch": 17.287575701963664, "grad_norm": 0.008098372258245945, "learning_rate": 5.499731444921391e-07, "loss": 0.0, "num_input_tokens_seen": 203149200, "step": 94200 }, { "epoch": 17.288493301523214, "grad_norm": 0.011465389281511307, "learning_rate": 5.496080965228062e-07, "loss": 0.0, "num_input_tokens_seen": 203159824, "step": 94205 }, { "epoch": 17.289410901082768, "grad_norm": 2.508820056915283, "learning_rate": 5.492431626991274e-07, "loss": 0.0001, "num_input_tokens_seen": 203169616, "step": 94210 }, { "epoch": 17.29032850064232, "grad_norm": 0.3683812618255615, "learning_rate": 5.488783430304639e-07, "loss": 0.0, "num_input_tokens_seen": 203179504, "step": 94215 }, { "epoch": 17.29124610020187, "grad_norm": 0.018904533237218857, "learning_rate": 5.485136375261729e-07, "loss": 0.0, "num_input_tokens_seen": 203190640, "step": 94220 }, { "epoch": 17.292163699761424, "grad_norm": 0.007664654403924942, "learning_rate": 5.481490461956063e-07, "loss": 0.0001, "num_input_tokens_seen": 203202512, "step": 94225 }, { "epoch": 17.293081299320978, "grad_norm": 0.0007982035167515278, "learning_rate": 5.477845690481181e-07, "loss": 0.0, "num_input_tokens_seen": 203213616, "step": 94230 }, { "epoch": 17.293998898880528, "grad_norm": 0.002257933607324958, "learning_rate": 5.474202060930555e-07, "loss": 0.0, "num_input_tokens_seen": 203224656, "step": 94235 }, { "epoch": 17.29491649844008, "grad_norm": 0.0034851115196943283, "learning_rate": 5.470559573397638e-07, "loss": 0.0, "num_input_tokens_seen": 203235248, "step": 94240 }, { "epoch": 17.295834097999634, "grad_norm": 0.0013894912553951144, "learning_rate": 5.466918227975854e-07, "loss": 0.0, "num_input_tokens_seen": 203245904, "step": 94245 }, { "epoch": 17.296751697559184, "grad_norm": 0.0018356307409703732, "learning_rate": 5.463278024758584e-07, "loss": 0.0, "num_input_tokens_seen": 203256656, "step": 94250 }, { "epoch": 17.297669297118738, "grad_norm": 0.009272614493966103, "learning_rate": 5.45963896383922e-07, "loss": 0.0, "num_input_tokens_seen": 203267504, "step": 94255 }, { "epoch": 17.29858689667829, "grad_norm": 0.015528860501945019, "learning_rate": 5.456001045311088e-07, "loss": 0.0001, "num_input_tokens_seen": 203277424, "step": 94260 }, { "epoch": 17.29950449623784, "grad_norm": 0.006694850046187639, "learning_rate": 5.452364269267485e-07, "loss": 0.0, "num_input_tokens_seen": 203288304, "step": 94265 }, { "epoch": 17.300422095797394, "grad_norm": 0.0025274446234107018, "learning_rate": 5.448728635801703e-07, "loss": 0.0, "num_input_tokens_seen": 203298832, "step": 94270 }, { "epoch": 17.301339695356948, "grad_norm": 0.03240901604294777, "learning_rate": 5.445094145006968e-07, "loss": 0.0, "num_input_tokens_seen": 203308912, "step": 94275 }, { "epoch": 17.302257294916497, "grad_norm": 0.0009258058853447437, "learning_rate": 5.441460796976527e-07, "loss": 0.0, "num_input_tokens_seen": 203320272, "step": 94280 }, { "epoch": 17.30317489447605, "grad_norm": 0.10945470631122589, "learning_rate": 5.437828591803557e-07, "loss": 0.0002, "num_input_tokens_seen": 203330896, "step": 94285 }, { "epoch": 17.304092494035604, "grad_norm": 2.462000846862793, "learning_rate": 5.434197529581209e-07, "loss": 0.0005, "num_input_tokens_seen": 203341648, "step": 94290 }, { "epoch": 17.305010093595154, "grad_norm": 0.001299044000916183, "learning_rate": 5.430567610402632e-07, "loss": 0.0, "num_input_tokens_seen": 203352016, "step": 94295 }, { "epoch": 17.305927693154707, "grad_norm": 0.0034410974476486444, "learning_rate": 5.426938834360918e-07, "loss": 0.0425, "num_input_tokens_seen": 203362416, "step": 94300 }, { "epoch": 17.30684529271426, "grad_norm": 0.0004228614561725408, "learning_rate": 5.423311201549142e-07, "loss": 0.0, "num_input_tokens_seen": 203372944, "step": 94305 }, { "epoch": 17.30776289227381, "grad_norm": 0.0008593255188316107, "learning_rate": 5.419684712060336e-07, "loss": 0.0001, "num_input_tokens_seen": 203384976, "step": 94310 }, { "epoch": 17.308680491833364, "grad_norm": 0.004042597021907568, "learning_rate": 5.416059365987536e-07, "loss": 0.0, "num_input_tokens_seen": 203396240, "step": 94315 }, { "epoch": 17.309598091392917, "grad_norm": 0.010346346534788609, "learning_rate": 5.412435163423712e-07, "loss": 0.0, "num_input_tokens_seen": 203406928, "step": 94320 }, { "epoch": 17.310515690952467, "grad_norm": 0.037280671298503876, "learning_rate": 5.408812104461814e-07, "loss": 0.0, "num_input_tokens_seen": 203417072, "step": 94325 }, { "epoch": 17.31143329051202, "grad_norm": 159.5886688232422, "learning_rate": 5.405190189194786e-07, "loss": 0.1067, "num_input_tokens_seen": 203427280, "step": 94330 }, { "epoch": 17.312350890071574, "grad_norm": 0.0013811492826789618, "learning_rate": 5.401569417715513e-07, "loss": 0.04, "num_input_tokens_seen": 203438352, "step": 94335 }, { "epoch": 17.313268489631124, "grad_norm": 0.007526273839175701, "learning_rate": 5.397949790116852e-07, "loss": 0.0, "num_input_tokens_seen": 203450064, "step": 94340 }, { "epoch": 17.314186089190677, "grad_norm": 0.010305150412023067, "learning_rate": 5.394331306491662e-07, "loss": 0.0, "num_input_tokens_seen": 203460976, "step": 94345 }, { "epoch": 17.31510368875023, "grad_norm": 0.003329051425680518, "learning_rate": 5.390713966932743e-07, "loss": 0.0, "num_input_tokens_seen": 203471760, "step": 94350 }, { "epoch": 17.31602128830978, "grad_norm": 0.0009554717689752579, "learning_rate": 5.387097771532867e-07, "loss": 0.0, "num_input_tokens_seen": 203483184, "step": 94355 }, { "epoch": 17.316938887869334, "grad_norm": 0.00239206338301301, "learning_rate": 5.383482720384786e-07, "loss": 0.0, "num_input_tokens_seen": 203492976, "step": 94360 }, { "epoch": 17.317856487428887, "grad_norm": 0.1347510665655136, "learning_rate": 5.379868813581234e-07, "loss": 0.0003, "num_input_tokens_seen": 203504144, "step": 94365 }, { "epoch": 17.318774086988437, "grad_norm": 1.0600067377090454, "learning_rate": 5.37625605121489e-07, "loss": 0.0003, "num_input_tokens_seen": 203513744, "step": 94370 }, { "epoch": 17.31969168654799, "grad_norm": 0.003912283573299646, "learning_rate": 5.372644433378405e-07, "loss": 0.0, "num_input_tokens_seen": 203526000, "step": 94375 }, { "epoch": 17.320609286107544, "grad_norm": 0.00304652308113873, "learning_rate": 5.369033960164438e-07, "loss": 0.0, "num_input_tokens_seen": 203537712, "step": 94380 }, { "epoch": 17.321526885667094, "grad_norm": 0.21423107385635376, "learning_rate": 5.365424631665578e-07, "loss": 0.006, "num_input_tokens_seen": 203547728, "step": 94385 }, { "epoch": 17.322444485226647, "grad_norm": 0.008275897242128849, "learning_rate": 5.361816447974394e-07, "loss": 0.0, "num_input_tokens_seen": 203558832, "step": 94390 }, { "epoch": 17.3233620847862, "grad_norm": 0.0019624026026576757, "learning_rate": 5.358209409183429e-07, "loss": 0.0008, "num_input_tokens_seen": 203568752, "step": 94395 }, { "epoch": 17.32427968434575, "grad_norm": 0.010260806418955326, "learning_rate": 5.354603515385215e-07, "loss": 0.0, "num_input_tokens_seen": 203578512, "step": 94400 }, { "epoch": 17.325197283905304, "grad_norm": 0.1876896172761917, "learning_rate": 5.350998766672227e-07, "loss": 0.0, "num_input_tokens_seen": 203589232, "step": 94405 }, { "epoch": 17.326114883464857, "grad_norm": 0.0048029120080173016, "learning_rate": 5.34739516313691e-07, "loss": 0.0, "num_input_tokens_seen": 203601136, "step": 94410 }, { "epoch": 17.327032483024407, "grad_norm": 0.000563275592867285, "learning_rate": 5.343792704871714e-07, "loss": 0.0002, "num_input_tokens_seen": 203611376, "step": 94415 }, { "epoch": 17.32795008258396, "grad_norm": 0.009654377587139606, "learning_rate": 5.340191391969019e-07, "loss": 0.0, "num_input_tokens_seen": 203621872, "step": 94420 }, { "epoch": 17.328867682143514, "grad_norm": 0.0373077467083931, "learning_rate": 5.336591224521192e-07, "loss": 0.2438, "num_input_tokens_seen": 203632496, "step": 94425 }, { "epoch": 17.329785281703064, "grad_norm": 0.0017948400927707553, "learning_rate": 5.332992202620585e-07, "loss": 0.0001, "num_input_tokens_seen": 203644496, "step": 94430 }, { "epoch": 17.330702881262617, "grad_norm": 0.0004553144972305745, "learning_rate": 5.329394326359504e-07, "loss": 0.0, "num_input_tokens_seen": 203656784, "step": 94435 }, { "epoch": 17.33162048082217, "grad_norm": 0.014720512554049492, "learning_rate": 5.325797595830224e-07, "loss": 0.0, "num_input_tokens_seen": 203667088, "step": 94440 }, { "epoch": 17.33253808038172, "grad_norm": 0.0011680020252242684, "learning_rate": 5.322202011124989e-07, "loss": 0.0001, "num_input_tokens_seen": 203677328, "step": 94445 }, { "epoch": 17.333455679941274, "grad_norm": 0.02920852042734623, "learning_rate": 5.318607572336037e-07, "loss": 0.0, "num_input_tokens_seen": 203688144, "step": 94450 }, { "epoch": 17.334373279500827, "grad_norm": 0.07899951934814453, "learning_rate": 5.315014279555547e-07, "loss": 0.0063, "num_input_tokens_seen": 203699376, "step": 94455 }, { "epoch": 17.335290879060377, "grad_norm": 0.000859302410390228, "learning_rate": 5.311422132875688e-07, "loss": 0.0003, "num_input_tokens_seen": 203709648, "step": 94460 }, { "epoch": 17.33620847861993, "grad_norm": 0.006658988073468208, "learning_rate": 5.307831132388591e-07, "loss": 0.0, "num_input_tokens_seen": 203720816, "step": 94465 }, { "epoch": 17.337126078179484, "grad_norm": 0.0021102188620716333, "learning_rate": 5.304241278186351e-07, "loss": 0.0, "num_input_tokens_seen": 203731600, "step": 94470 }, { "epoch": 17.338043677739034, "grad_norm": 0.0008381744846701622, "learning_rate": 5.300652570361053e-07, "loss": 0.0003, "num_input_tokens_seen": 203742576, "step": 94475 }, { "epoch": 17.338961277298587, "grad_norm": 0.0035749259404838085, "learning_rate": 5.297065009004749e-07, "loss": 0.0, "num_input_tokens_seen": 203754256, "step": 94480 }, { "epoch": 17.33987887685814, "grad_norm": 0.0336761437356472, "learning_rate": 5.293478594209433e-07, "loss": 0.1439, "num_input_tokens_seen": 203765584, "step": 94485 }, { "epoch": 17.34079647641769, "grad_norm": 0.0013842369662597775, "learning_rate": 5.289893326067108e-07, "loss": 0.0, "num_input_tokens_seen": 203775728, "step": 94490 }, { "epoch": 17.341714075977244, "grad_norm": 0.0008645813213661313, "learning_rate": 5.286309204669715e-07, "loss": 0.0, "num_input_tokens_seen": 203787312, "step": 94495 }, { "epoch": 17.342631675536797, "grad_norm": 0.009267082437872887, "learning_rate": 5.282726230109203e-07, "loss": 0.0002, "num_input_tokens_seen": 203797520, "step": 94500 }, { "epoch": 17.343549275096347, "grad_norm": 0.041430987417697906, "learning_rate": 5.279144402477454e-07, "loss": 0.0, "num_input_tokens_seen": 203807248, "step": 94505 }, { "epoch": 17.3444668746559, "grad_norm": 0.0011600785655900836, "learning_rate": 5.275563721866334e-07, "loss": 0.1006, "num_input_tokens_seen": 203818288, "step": 94510 }, { "epoch": 17.345384474215454, "grad_norm": 0.0015393177745863795, "learning_rate": 5.271984188367695e-07, "loss": 0.0003, "num_input_tokens_seen": 203828592, "step": 94515 }, { "epoch": 17.346302073775004, "grad_norm": 306.1361389160156, "learning_rate": 5.26840580207334e-07, "loss": 0.0244, "num_input_tokens_seen": 203838864, "step": 94520 }, { "epoch": 17.347219673334557, "grad_norm": 0.006759596522897482, "learning_rate": 5.264828563075047e-07, "loss": 0.0, "num_input_tokens_seen": 203849136, "step": 94525 }, { "epoch": 17.34813727289411, "grad_norm": 0.026590799912810326, "learning_rate": 5.261252471464562e-07, "loss": 0.0479, "num_input_tokens_seen": 203858832, "step": 94530 }, { "epoch": 17.34905487245366, "grad_norm": 0.18138791620731354, "learning_rate": 5.257677527333616e-07, "loss": 0.0001, "num_input_tokens_seen": 203868432, "step": 94535 }, { "epoch": 17.349972472013214, "grad_norm": 40.79452133178711, "learning_rate": 5.254103730773901e-07, "loss": 0.0285, "num_input_tokens_seen": 203879952, "step": 94540 }, { "epoch": 17.350890071572767, "grad_norm": 18.56197166442871, "learning_rate": 5.250531081877064e-07, "loss": 0.0008, "num_input_tokens_seen": 203890480, "step": 94545 }, { "epoch": 17.351807671132317, "grad_norm": 0.06111011281609535, "learning_rate": 5.246959580734762e-07, "loss": 0.0002, "num_input_tokens_seen": 203901360, "step": 94550 }, { "epoch": 17.35272527069187, "grad_norm": 0.01510746218264103, "learning_rate": 5.243389227438584e-07, "loss": 0.0, "num_input_tokens_seen": 203911568, "step": 94555 }, { "epoch": 17.353642870251424, "grad_norm": 0.017225012183189392, "learning_rate": 5.2398200220801e-07, "loss": 0.0, "num_input_tokens_seen": 203922576, "step": 94560 }, { "epoch": 17.354560469810973, "grad_norm": 0.040031611919403076, "learning_rate": 5.236251964750866e-07, "loss": 0.0, "num_input_tokens_seen": 203931920, "step": 94565 }, { "epoch": 17.355478069370527, "grad_norm": 1.3188707828521729, "learning_rate": 5.232685055542391e-07, "loss": 0.0004, "num_input_tokens_seen": 203943248, "step": 94570 }, { "epoch": 17.35639566893008, "grad_norm": 0.0018345718272030354, "learning_rate": 5.229119294546164e-07, "loss": 0.0, "num_input_tokens_seen": 203953776, "step": 94575 }, { "epoch": 17.35731326848963, "grad_norm": 0.00101964152418077, "learning_rate": 5.225554681853623e-07, "loss": 0.0, "num_input_tokens_seen": 203964432, "step": 94580 }, { "epoch": 17.358230868049183, "grad_norm": 0.007280291058123112, "learning_rate": 5.221991217556227e-07, "loss": 0.0001, "num_input_tokens_seen": 203974320, "step": 94585 }, { "epoch": 17.359148467608737, "grad_norm": 0.0017328686080873013, "learning_rate": 5.218428901745353e-07, "loss": 0.0, "num_input_tokens_seen": 203984880, "step": 94590 }, { "epoch": 17.360066067168287, "grad_norm": 0.0008511111955158412, "learning_rate": 5.214867734512364e-07, "loss": 0.0, "num_input_tokens_seen": 203995376, "step": 94595 }, { "epoch": 17.36098366672784, "grad_norm": 0.001567014609463513, "learning_rate": 5.211307715948616e-07, "loss": 0.0001, "num_input_tokens_seen": 204006768, "step": 94600 }, { "epoch": 17.361901266287393, "grad_norm": 0.001545466249808669, "learning_rate": 5.20774884614541e-07, "loss": 0.0, "num_input_tokens_seen": 204017424, "step": 94605 }, { "epoch": 17.362818865846943, "grad_norm": 0.0015582252526655793, "learning_rate": 5.204191125194013e-07, "loss": 0.0, "num_input_tokens_seen": 204028432, "step": 94610 }, { "epoch": 17.363736465406497, "grad_norm": 0.004170033615082502, "learning_rate": 5.200634553185696e-07, "loss": 0.0, "num_input_tokens_seen": 204038224, "step": 94615 }, { "epoch": 17.36465406496605, "grad_norm": 0.0015548827359452844, "learning_rate": 5.197079130211674e-07, "loss": 0.0, "num_input_tokens_seen": 204048592, "step": 94620 }, { "epoch": 17.3655716645256, "grad_norm": 0.004026857204735279, "learning_rate": 5.19352485636313e-07, "loss": 0.0, "num_input_tokens_seen": 204059600, "step": 94625 }, { "epoch": 17.366489264085153, "grad_norm": 0.14514321088790894, "learning_rate": 5.189971731731219e-07, "loss": 0.0, "num_input_tokens_seen": 204070320, "step": 94630 }, { "epoch": 17.367406863644707, "grad_norm": 0.0083980867639184, "learning_rate": 5.186419756407096e-07, "loss": 0.0, "num_input_tokens_seen": 204079280, "step": 94635 }, { "epoch": 17.368324463204257, "grad_norm": 0.008769911713898182, "learning_rate": 5.182868930481855e-07, "loss": 0.0001, "num_input_tokens_seen": 204090192, "step": 94640 }, { "epoch": 17.36924206276381, "grad_norm": 0.0036562008317559958, "learning_rate": 5.17931925404655e-07, "loss": 0.0, "num_input_tokens_seen": 204101232, "step": 94645 }, { "epoch": 17.370159662323363, "grad_norm": 0.008005470968782902, "learning_rate": 5.175770727192253e-07, "loss": 0.0, "num_input_tokens_seen": 204112560, "step": 94650 }, { "epoch": 17.371077261882913, "grad_norm": 0.006008592434227467, "learning_rate": 5.172223350009963e-07, "loss": 0.0, "num_input_tokens_seen": 204122736, "step": 94655 }, { "epoch": 17.371994861442467, "grad_norm": 64.02213287353516, "learning_rate": 5.168677122590671e-07, "loss": 0.0097, "num_input_tokens_seen": 204133872, "step": 94660 }, { "epoch": 17.37291246100202, "grad_norm": 0.002669174922630191, "learning_rate": 5.165132045025317e-07, "loss": 0.0, "num_input_tokens_seen": 204145264, "step": 94665 }, { "epoch": 17.37383006056157, "grad_norm": 0.008546831086277962, "learning_rate": 5.161588117404848e-07, "loss": 0.0001, "num_input_tokens_seen": 204157200, "step": 94670 }, { "epoch": 17.374747660121123, "grad_norm": 0.000663300568703562, "learning_rate": 5.15804533982015e-07, "loss": 0.0, "num_input_tokens_seen": 204167088, "step": 94675 }, { "epoch": 17.375665259680677, "grad_norm": 0.0011148713529109955, "learning_rate": 5.154503712362092e-07, "loss": 0.0, "num_input_tokens_seen": 204177936, "step": 94680 }, { "epoch": 17.376582859240226, "grad_norm": 0.0008618682622909546, "learning_rate": 5.150963235121509e-07, "loss": 0.0174, "num_input_tokens_seen": 204189360, "step": 94685 }, { "epoch": 17.37750045879978, "grad_norm": 0.004750635474920273, "learning_rate": 5.147423908189198e-07, "loss": 0.0703, "num_input_tokens_seen": 204199600, "step": 94690 }, { "epoch": 17.378418058359333, "grad_norm": 0.0071782683953642845, "learning_rate": 5.143885731655962e-07, "loss": 0.0001, "num_input_tokens_seen": 204210896, "step": 94695 }, { "epoch": 17.379335657918883, "grad_norm": 0.003481256077066064, "learning_rate": 5.14034870561253e-07, "loss": 0.0, "num_input_tokens_seen": 204221392, "step": 94700 }, { "epoch": 17.380253257478437, "grad_norm": 0.000786258140578866, "learning_rate": 5.136812830149635e-07, "loss": 0.0, "num_input_tokens_seen": 204231888, "step": 94705 }, { "epoch": 17.38117085703799, "grad_norm": 0.0035475704353302717, "learning_rate": 5.133278105357952e-07, "loss": 0.0, "num_input_tokens_seen": 204241968, "step": 94710 }, { "epoch": 17.38208845659754, "grad_norm": 0.0013249950716271996, "learning_rate": 5.12974453132814e-07, "loss": 0.0001, "num_input_tokens_seen": 204252336, "step": 94715 }, { "epoch": 17.383006056157093, "grad_norm": 0.0007351971580646932, "learning_rate": 5.126212108150852e-07, "loss": 0.0, "num_input_tokens_seen": 204264144, "step": 94720 }, { "epoch": 17.383923655716647, "grad_norm": 0.0014817578485235572, "learning_rate": 5.122680835916677e-07, "loss": 0.0, "num_input_tokens_seen": 204274928, "step": 94725 }, { "epoch": 17.384841255276196, "grad_norm": 0.0005374709144234657, "learning_rate": 5.11915071471617e-07, "loss": 0.0003, "num_input_tokens_seen": 204285264, "step": 94730 }, { "epoch": 17.38575885483575, "grad_norm": 0.000696688424795866, "learning_rate": 5.115621744639898e-07, "loss": 0.0532, "num_input_tokens_seen": 204296912, "step": 94735 }, { "epoch": 17.386676454395303, "grad_norm": 0.0003987185191363096, "learning_rate": 5.112093925778366e-07, "loss": 0.0, "num_input_tokens_seen": 204308624, "step": 94740 }, { "epoch": 17.387594053954853, "grad_norm": 0.018014853820204735, "learning_rate": 5.108567258222047e-07, "loss": 0.0003, "num_input_tokens_seen": 204318608, "step": 94745 }, { "epoch": 17.388511653514406, "grad_norm": 0.0063411770388484, "learning_rate": 5.105041742061406e-07, "loss": 0.0002, "num_input_tokens_seen": 204328656, "step": 94750 }, { "epoch": 17.38942925307396, "grad_norm": 0.0006967367371544242, "learning_rate": 5.101517377386867e-07, "loss": 0.0, "num_input_tokens_seen": 204339536, "step": 94755 }, { "epoch": 17.39034685263351, "grad_norm": 0.0028391487430781126, "learning_rate": 5.097994164288822e-07, "loss": 0.0405, "num_input_tokens_seen": 204349776, "step": 94760 }, { "epoch": 17.391264452193063, "grad_norm": 0.01262464839965105, "learning_rate": 5.094472102857622e-07, "loss": 0.0146, "num_input_tokens_seen": 204361232, "step": 94765 }, { "epoch": 17.392182051752616, "grad_norm": 0.001450191717594862, "learning_rate": 5.090951193183629e-07, "loss": 0.0001, "num_input_tokens_seen": 204372144, "step": 94770 }, { "epoch": 17.393099651312166, "grad_norm": 0.0018073751125484705, "learning_rate": 5.087431435357132e-07, "loss": 0.0, "num_input_tokens_seen": 204383088, "step": 94775 }, { "epoch": 17.39401725087172, "grad_norm": 0.00030863541178405285, "learning_rate": 5.083912829468408e-07, "loss": 0.0, "num_input_tokens_seen": 204392976, "step": 94780 }, { "epoch": 17.394934850431273, "grad_norm": 0.0006750639877282083, "learning_rate": 5.080395375607705e-07, "loss": 0.0004, "num_input_tokens_seen": 204403216, "step": 94785 }, { "epoch": 17.395852449990823, "grad_norm": 0.009776934050023556, "learning_rate": 5.076879073865248e-07, "loss": 0.0, "num_input_tokens_seen": 204414544, "step": 94790 }, { "epoch": 17.396770049550376, "grad_norm": 0.08506936579942703, "learning_rate": 5.07336392433121e-07, "loss": 0.0, "num_input_tokens_seen": 204424144, "step": 94795 }, { "epoch": 17.39768764910993, "grad_norm": 0.06465969979763031, "learning_rate": 5.06984992709576e-07, "loss": 0.0, "num_input_tokens_seen": 204435152, "step": 94800 }, { "epoch": 17.39860524866948, "grad_norm": 1.2132295370101929, "learning_rate": 5.066337082249028e-07, "loss": 0.0001, "num_input_tokens_seen": 204445296, "step": 94805 }, { "epoch": 17.399522848229033, "grad_norm": 0.0027165301144123077, "learning_rate": 5.062825389881109e-07, "loss": 0.0, "num_input_tokens_seen": 204456240, "step": 94810 }, { "epoch": 17.400440447788586, "grad_norm": 0.006532602943480015, "learning_rate": 5.059314850082064e-07, "loss": 0.015, "num_input_tokens_seen": 204466224, "step": 94815 }, { "epoch": 17.401358047348136, "grad_norm": 0.08560274541378021, "learning_rate": 5.055805462941954e-07, "loss": 0.0, "num_input_tokens_seen": 204477616, "step": 94820 }, { "epoch": 17.40227564690769, "grad_norm": 0.0032469499856233597, "learning_rate": 5.052297228550768e-07, "loss": 0.0, "num_input_tokens_seen": 204489456, "step": 94825 }, { "epoch": 17.403193246467243, "grad_norm": 112.96710968017578, "learning_rate": 5.048790146998495e-07, "loss": 0.004, "num_input_tokens_seen": 204501200, "step": 94830 }, { "epoch": 17.404110846026793, "grad_norm": 0.031037699431180954, "learning_rate": 5.045284218375091e-07, "loss": 0.0, "num_input_tokens_seen": 204511152, "step": 94835 }, { "epoch": 17.405028445586346, "grad_norm": 0.012257256545126438, "learning_rate": 5.041779442770472e-07, "loss": 0.0063, "num_input_tokens_seen": 204521616, "step": 94840 }, { "epoch": 17.4059460451459, "grad_norm": 0.004108490888029337, "learning_rate": 5.038275820274536e-07, "loss": 0.0097, "num_input_tokens_seen": 204531056, "step": 94845 }, { "epoch": 17.40686364470545, "grad_norm": 0.001058755675330758, "learning_rate": 5.03477335097713e-07, "loss": 0.0, "num_input_tokens_seen": 204542736, "step": 94850 }, { "epoch": 17.407781244265003, "grad_norm": 0.0019617145881056786, "learning_rate": 5.031272034968104e-07, "loss": 0.0, "num_input_tokens_seen": 204554032, "step": 94855 }, { "epoch": 17.408698843824556, "grad_norm": 0.0017587102483958006, "learning_rate": 5.027771872337256e-07, "loss": 0.0, "num_input_tokens_seen": 204563952, "step": 94860 }, { "epoch": 17.409616443384106, "grad_norm": 0.0656302198767662, "learning_rate": 5.024272863174351e-07, "loss": 0.0, "num_input_tokens_seen": 204575376, "step": 94865 }, { "epoch": 17.41053404294366, "grad_norm": 0.004601922817528248, "learning_rate": 5.02077500756915e-07, "loss": 0.0, "num_input_tokens_seen": 204585360, "step": 94870 }, { "epoch": 17.411451642503213, "grad_norm": 0.0008088243776001036, "learning_rate": 5.017278305611357e-07, "loss": 0.0, "num_input_tokens_seen": 204595120, "step": 94875 }, { "epoch": 17.412369242062763, "grad_norm": 0.002366089029237628, "learning_rate": 5.013782757390662e-07, "loss": 0.0, "num_input_tokens_seen": 204604784, "step": 94880 }, { "epoch": 17.413286841622316, "grad_norm": 0.010296800173819065, "learning_rate": 5.010288362996707e-07, "loss": 0.0, "num_input_tokens_seen": 204616624, "step": 94885 }, { "epoch": 17.41420444118187, "grad_norm": 0.001439544023014605, "learning_rate": 5.006795122519131e-07, "loss": 0.0001, "num_input_tokens_seen": 204627888, "step": 94890 }, { "epoch": 17.41512204074142, "grad_norm": 0.0024589854292571545, "learning_rate": 5.003303036047536e-07, "loss": 0.0001, "num_input_tokens_seen": 204638992, "step": 94895 }, { "epoch": 17.416039640300973, "grad_norm": 0.004068603739142418, "learning_rate": 4.999812103671475e-07, "loss": 0.0, "num_input_tokens_seen": 204650864, "step": 94900 }, { "epoch": 17.416957239860526, "grad_norm": 0.029995791614055634, "learning_rate": 4.99632232548049e-07, "loss": 0.0, "num_input_tokens_seen": 204661232, "step": 94905 }, { "epoch": 17.417874839420076, "grad_norm": 0.0012305426644161344, "learning_rate": 4.992833701564087e-07, "loss": 0.0, "num_input_tokens_seen": 204670928, "step": 94910 }, { "epoch": 17.41879243897963, "grad_norm": 0.0015071278903633356, "learning_rate": 4.989346232011738e-07, "loss": 0.0823, "num_input_tokens_seen": 204681136, "step": 94915 }, { "epoch": 17.419710038539183, "grad_norm": 34.36524200439453, "learning_rate": 4.985859916912905e-07, "loss": 0.0097, "num_input_tokens_seen": 204691312, "step": 94920 }, { "epoch": 17.420627638098733, "grad_norm": 0.000649771245662123, "learning_rate": 4.982374756357e-07, "loss": 0.0, "num_input_tokens_seen": 204701424, "step": 94925 }, { "epoch": 17.421545237658286, "grad_norm": 0.0020507285371422768, "learning_rate": 4.978890750433401e-07, "loss": 0.0, "num_input_tokens_seen": 204712784, "step": 94930 }, { "epoch": 17.42246283721784, "grad_norm": 0.0005482390988618135, "learning_rate": 4.975407899231488e-07, "loss": 0.1066, "num_input_tokens_seen": 204724208, "step": 94935 }, { "epoch": 17.42338043677739, "grad_norm": 97.95549011230469, "learning_rate": 4.971926202840582e-07, "loss": 0.0352, "num_input_tokens_seen": 204735248, "step": 94940 }, { "epoch": 17.424298036336943, "grad_norm": 0.004939647391438484, "learning_rate": 4.96844566134998e-07, "loss": 0.1829, "num_input_tokens_seen": 204746128, "step": 94945 }, { "epoch": 17.425215635896496, "grad_norm": 0.023490557447075844, "learning_rate": 4.964966274848948e-07, "loss": 0.0001, "num_input_tokens_seen": 204757072, "step": 94950 }, { "epoch": 17.426133235456046, "grad_norm": 4.751147270202637, "learning_rate": 4.961488043426738e-07, "loss": 0.0001, "num_input_tokens_seen": 204767472, "step": 94955 }, { "epoch": 17.4270508350156, "grad_norm": 3.508418560028076, "learning_rate": 4.958010967172561e-07, "loss": 0.0007, "num_input_tokens_seen": 204777328, "step": 94960 }, { "epoch": 17.427968434575153, "grad_norm": 1.146570086479187, "learning_rate": 4.954535046175579e-07, "loss": 0.0004, "num_input_tokens_seen": 204788240, "step": 94965 }, { "epoch": 17.428886034134702, "grad_norm": 0.0009608532418496907, "learning_rate": 4.951060280524972e-07, "loss": 0.0, "num_input_tokens_seen": 204800304, "step": 94970 }, { "epoch": 17.429803633694256, "grad_norm": 0.01502312533557415, "learning_rate": 4.947586670309851e-07, "loss": 0.0, "num_input_tokens_seen": 204811024, "step": 94975 }, { "epoch": 17.43072123325381, "grad_norm": 0.048271145671606064, "learning_rate": 4.9441142156193e-07, "loss": 0.0008, "num_input_tokens_seen": 204821552, "step": 94980 }, { "epoch": 17.43163883281336, "grad_norm": 0.0005663923220708966, "learning_rate": 4.940642916542387e-07, "loss": 0.0, "num_input_tokens_seen": 204833232, "step": 94985 }, { "epoch": 17.432556432372913, "grad_norm": 0.0011962029384449124, "learning_rate": 4.937172773168153e-07, "loss": 0.0001, "num_input_tokens_seen": 204844464, "step": 94990 }, { "epoch": 17.433474031932466, "grad_norm": 0.05956551432609558, "learning_rate": 4.933703785585597e-07, "loss": 0.0006, "num_input_tokens_seen": 204854352, "step": 94995 }, { "epoch": 17.434391631492016, "grad_norm": 0.0012550980318337679, "learning_rate": 4.930235953883683e-07, "loss": 0.0, "num_input_tokens_seen": 204866800, "step": 95000 }, { "epoch": 17.43530923105157, "grad_norm": 0.004982823505997658, "learning_rate": 4.926769278151377e-07, "loss": 0.0, "num_input_tokens_seen": 204876304, "step": 95005 }, { "epoch": 17.436226830611123, "grad_norm": 0.006919412408024073, "learning_rate": 4.923303758477577e-07, "loss": 0.0, "num_input_tokens_seen": 204887984, "step": 95010 }, { "epoch": 17.437144430170672, "grad_norm": 0.0034068257082253695, "learning_rate": 4.91983939495117e-07, "loss": 0.0005, "num_input_tokens_seen": 204897872, "step": 95015 }, { "epoch": 17.438062029730226, "grad_norm": 0.46182990074157715, "learning_rate": 4.916376187661021e-07, "loss": 0.0001, "num_input_tokens_seen": 204909072, "step": 95020 }, { "epoch": 17.43897962928978, "grad_norm": 0.004529178142547607, "learning_rate": 4.912914136695945e-07, "loss": 0.0, "num_input_tokens_seen": 204919120, "step": 95025 }, { "epoch": 17.43989722884933, "grad_norm": 0.005541808437556028, "learning_rate": 4.909453242144746e-07, "loss": 0.0, "num_input_tokens_seen": 204929712, "step": 95030 }, { "epoch": 17.440814828408882, "grad_norm": 0.0008222558535635471, "learning_rate": 4.905993504096179e-07, "loss": 0.0, "num_input_tokens_seen": 204938928, "step": 95035 }, { "epoch": 17.441732427968436, "grad_norm": 0.10664674639701843, "learning_rate": 4.902534922639002e-07, "loss": 0.0, "num_input_tokens_seen": 204948464, "step": 95040 }, { "epoch": 17.442650027527986, "grad_norm": 0.001484328182414174, "learning_rate": 4.899077497861904e-07, "loss": 0.0, "num_input_tokens_seen": 204959632, "step": 95045 }, { "epoch": 17.44356762708754, "grad_norm": 0.0027925390750169754, "learning_rate": 4.895621229853558e-07, "loss": 0.0, "num_input_tokens_seen": 204970800, "step": 95050 }, { "epoch": 17.444485226647092, "grad_norm": 0.0008271036203950644, "learning_rate": 4.892166118702635e-07, "loss": 0.0029, "num_input_tokens_seen": 204982288, "step": 95055 }, { "epoch": 17.445402826206642, "grad_norm": 0.0011712866835296154, "learning_rate": 4.888712164497738e-07, "loss": 0.0, "num_input_tokens_seen": 204993840, "step": 95060 }, { "epoch": 17.446320425766196, "grad_norm": 0.011322364211082458, "learning_rate": 4.885259367327449e-07, "loss": 0.0, "num_input_tokens_seen": 205004880, "step": 95065 }, { "epoch": 17.44723802532575, "grad_norm": 25.910587310791016, "learning_rate": 4.881807727280346e-07, "loss": 0.0008, "num_input_tokens_seen": 205015760, "step": 95070 }, { "epoch": 17.4481556248853, "grad_norm": 0.0012264644028618932, "learning_rate": 4.878357244444947e-07, "loss": 0.0, "num_input_tokens_seen": 205027152, "step": 95075 }, { "epoch": 17.449073224444852, "grad_norm": 114.77666473388672, "learning_rate": 4.874907918909755e-07, "loss": 0.0119, "num_input_tokens_seen": 205037840, "step": 95080 }, { "epoch": 17.449990824004406, "grad_norm": 0.0008947462774813175, "learning_rate": 4.871459750763224e-07, "loss": 0.0, "num_input_tokens_seen": 205050192, "step": 95085 }, { "epoch": 17.450908423563956, "grad_norm": 0.00142926094122231, "learning_rate": 4.86801274009382e-07, "loss": 0.0002, "num_input_tokens_seen": 205058768, "step": 95090 }, { "epoch": 17.45182602312351, "grad_norm": 0.0059191822074353695, "learning_rate": 4.864566886989941e-07, "loss": 0.0, "num_input_tokens_seen": 205068848, "step": 95095 }, { "epoch": 17.452743622683062, "grad_norm": 0.0005372484447434545, "learning_rate": 4.861122191539969e-07, "loss": 0.0, "num_input_tokens_seen": 205079984, "step": 95100 }, { "epoch": 17.453661222242612, "grad_norm": 0.0026681304443627596, "learning_rate": 4.857678653832249e-07, "loss": 0.0001, "num_input_tokens_seen": 205091472, "step": 95105 }, { "epoch": 17.454578821802166, "grad_norm": 0.0016600459348410368, "learning_rate": 4.854236273955098e-07, "loss": 0.0, "num_input_tokens_seen": 205102224, "step": 95110 }, { "epoch": 17.45549642136172, "grad_norm": 0.01237125601619482, "learning_rate": 4.850795051996832e-07, "loss": 0.0, "num_input_tokens_seen": 205113296, "step": 95115 }, { "epoch": 17.45641402092127, "grad_norm": 0.00033890927443280816, "learning_rate": 4.847354988045694e-07, "loss": 0.0, "num_input_tokens_seen": 205124816, "step": 95120 }, { "epoch": 17.457331620480822, "grad_norm": 0.003324468620121479, "learning_rate": 4.84391608218992e-07, "loss": 0.0001, "num_input_tokens_seen": 205134992, "step": 95125 }, { "epoch": 17.458249220040376, "grad_norm": 0.0008290853584185243, "learning_rate": 4.840478334517712e-07, "loss": 0.0, "num_input_tokens_seen": 205145072, "step": 95130 }, { "epoch": 17.459166819599925, "grad_norm": 0.001980333821848035, "learning_rate": 4.837041745117238e-07, "loss": 0.0, "num_input_tokens_seen": 205156048, "step": 95135 }, { "epoch": 17.46008441915948, "grad_norm": 0.0005444943672046065, "learning_rate": 4.833606314076655e-07, "loss": 0.0, "num_input_tokens_seen": 205167344, "step": 95140 }, { "epoch": 17.461002018719032, "grad_norm": 0.015194658190011978, "learning_rate": 4.830172041484072e-07, "loss": 0.0, "num_input_tokens_seen": 205178448, "step": 95145 }, { "epoch": 17.461919618278582, "grad_norm": 0.0331013947725296, "learning_rate": 4.826738927427555e-07, "loss": 0.0, "num_input_tokens_seen": 205189776, "step": 95150 }, { "epoch": 17.462837217838135, "grad_norm": 0.012845478951931, "learning_rate": 4.823306971995179e-07, "loss": 0.0001, "num_input_tokens_seen": 205201104, "step": 95155 }, { "epoch": 17.46375481739769, "grad_norm": 0.03267194330692291, "learning_rate": 4.819876175274968e-07, "loss": 0.0001, "num_input_tokens_seen": 205211728, "step": 95160 }, { "epoch": 17.46467241695724, "grad_norm": 0.005568250082433224, "learning_rate": 4.816446537354907e-07, "loss": 0.0, "num_input_tokens_seen": 205221776, "step": 95165 }, { "epoch": 17.465590016516792, "grad_norm": 0.0006259544170461595, "learning_rate": 4.813018058322955e-07, "loss": 0.0005, "num_input_tokens_seen": 205231632, "step": 95170 }, { "epoch": 17.466507616076345, "grad_norm": 0.0022258111275732517, "learning_rate": 4.809590738267067e-07, "loss": 0.0, "num_input_tokens_seen": 205242640, "step": 95175 }, { "epoch": 17.467425215635895, "grad_norm": 0.0008738428587093949, "learning_rate": 4.806164577275135e-07, "loss": 0.0, "num_input_tokens_seen": 205253200, "step": 95180 }, { "epoch": 17.46834281519545, "grad_norm": 0.0005103258299641311, "learning_rate": 4.802739575435028e-07, "loss": 0.0, "num_input_tokens_seen": 205265488, "step": 95185 }, { "epoch": 17.469260414755002, "grad_norm": 0.003094688057899475, "learning_rate": 4.799315732834614e-07, "loss": 0.0, "num_input_tokens_seen": 205275536, "step": 95190 }, { "epoch": 17.470178014314552, "grad_norm": 48.45852279663086, "learning_rate": 4.795893049561695e-07, "loss": 0.0174, "num_input_tokens_seen": 205285968, "step": 95195 }, { "epoch": 17.471095613874105, "grad_norm": 0.0004976560594514012, "learning_rate": 4.792471525704051e-07, "loss": 0.0, "num_input_tokens_seen": 205297904, "step": 95200 }, { "epoch": 17.47201321343366, "grad_norm": 0.0014578928239643574, "learning_rate": 4.789051161349456e-07, "loss": 0.0, "num_input_tokens_seen": 205308912, "step": 95205 }, { "epoch": 17.47293081299321, "grad_norm": 0.007999245077371597, "learning_rate": 4.785631956585629e-07, "loss": 0.0, "num_input_tokens_seen": 205319152, "step": 95210 }, { "epoch": 17.473848412552762, "grad_norm": 0.0015394953079521656, "learning_rate": 4.782213911500266e-07, "loss": 0.0001, "num_input_tokens_seen": 205330288, "step": 95215 }, { "epoch": 17.474766012112315, "grad_norm": 0.001779714715667069, "learning_rate": 4.778797026181026e-07, "loss": 0.0001, "num_input_tokens_seen": 205339792, "step": 95220 }, { "epoch": 17.475683611671865, "grad_norm": 0.0008334093727171421, "learning_rate": 4.775381300715565e-07, "loss": 0.0, "num_input_tokens_seen": 205351280, "step": 95225 }, { "epoch": 17.47660121123142, "grad_norm": 0.1545589417219162, "learning_rate": 4.77196673519148e-07, "loss": 0.0001, "num_input_tokens_seen": 205362512, "step": 95230 }, { "epoch": 17.477518810790972, "grad_norm": 0.0014869138831272721, "learning_rate": 4.768553329696341e-07, "loss": 0.0, "num_input_tokens_seen": 205372784, "step": 95235 }, { "epoch": 17.478436410350522, "grad_norm": 0.00029078085208311677, "learning_rate": 4.76514108431772e-07, "loss": 0.0063, "num_input_tokens_seen": 205382928, "step": 95240 }, { "epoch": 17.479354009910075, "grad_norm": 0.006380932871252298, "learning_rate": 4.7617299991431164e-07, "loss": 0.0, "num_input_tokens_seen": 205393904, "step": 95245 }, { "epoch": 17.48027160946963, "grad_norm": 0.009195047430694103, "learning_rate": 4.75832007426002e-07, "loss": 0.0, "num_input_tokens_seen": 205405552, "step": 95250 }, { "epoch": 17.48118920902918, "grad_norm": 0.0018423892324790359, "learning_rate": 4.7549113097559053e-07, "loss": 0.0, "num_input_tokens_seen": 205415824, "step": 95255 }, { "epoch": 17.482106808588732, "grad_norm": 0.008754642680287361, "learning_rate": 4.751503705718191e-07, "loss": 0.0, "num_input_tokens_seen": 205426864, "step": 95260 }, { "epoch": 17.483024408148285, "grad_norm": 0.0005059525719843805, "learning_rate": 4.7480972622342803e-07, "loss": 0.0, "num_input_tokens_seen": 205438352, "step": 95265 }, { "epoch": 17.483942007707835, "grad_norm": 0.0013998812064528465, "learning_rate": 4.74469197939153e-07, "loss": 0.0, "num_input_tokens_seen": 205448624, "step": 95270 }, { "epoch": 17.48485960726739, "grad_norm": 0.3678150773048401, "learning_rate": 4.741287857277299e-07, "loss": 0.0001, "num_input_tokens_seen": 205457520, "step": 95275 }, { "epoch": 17.485777206826942, "grad_norm": 0.08966687321662903, "learning_rate": 4.7378848959788893e-07, "loss": 0.0883, "num_input_tokens_seen": 205467856, "step": 95280 }, { "epoch": 17.48669480638649, "grad_norm": 0.0035285567864775658, "learning_rate": 4.734483095583575e-07, "loss": 0.0, "num_input_tokens_seen": 205477136, "step": 95285 }, { "epoch": 17.487612405946045, "grad_norm": 0.004973819945007563, "learning_rate": 4.7310824561786206e-07, "loss": 0.0, "num_input_tokens_seen": 205489200, "step": 95290 }, { "epoch": 17.4885300055056, "grad_norm": 0.0009418317349627614, "learning_rate": 4.7276829778512445e-07, "loss": 0.0, "num_input_tokens_seen": 205499824, "step": 95295 }, { "epoch": 17.48944760506515, "grad_norm": 0.0011788411065936089, "learning_rate": 4.724284660688633e-07, "loss": 0.0, "num_input_tokens_seen": 205510800, "step": 95300 }, { "epoch": 17.490365204624702, "grad_norm": 0.0176292322576046, "learning_rate": 4.7208875047779377e-07, "loss": 0.0, "num_input_tokens_seen": 205520912, "step": 95305 }, { "epoch": 17.491282804184255, "grad_norm": 0.013791317120194435, "learning_rate": 4.7174915102063125e-07, "loss": 0.0, "num_input_tokens_seen": 205532432, "step": 95310 }, { "epoch": 17.492200403743805, "grad_norm": 0.08187764137983322, "learning_rate": 4.714096677060848e-07, "loss": 0.0, "num_input_tokens_seen": 205541840, "step": 95315 }, { "epoch": 17.49311800330336, "grad_norm": 0.000407299812650308, "learning_rate": 4.7107030054286185e-07, "loss": 0.1128, "num_input_tokens_seen": 205552752, "step": 95320 }, { "epoch": 17.494035602862912, "grad_norm": 0.006395230535417795, "learning_rate": 4.707310495396661e-07, "loss": 0.0, "num_input_tokens_seen": 205563408, "step": 95325 }, { "epoch": 17.49495320242246, "grad_norm": 0.009579021483659744, "learning_rate": 4.7039191470519884e-07, "loss": 0.0002, "num_input_tokens_seen": 205574352, "step": 95330 }, { "epoch": 17.495870801982015, "grad_norm": 0.001488109352067113, "learning_rate": 4.700528960481593e-07, "loss": 0.0, "num_input_tokens_seen": 205585360, "step": 95335 }, { "epoch": 17.49678840154157, "grad_norm": 0.002477886388078332, "learning_rate": 4.697139935772421e-07, "loss": 0.0, "num_input_tokens_seen": 205596112, "step": 95340 }, { "epoch": 17.49770600110112, "grad_norm": 0.0015330727910622954, "learning_rate": 4.693752073011398e-07, "loss": 0.0, "num_input_tokens_seen": 205608720, "step": 95345 }, { "epoch": 17.49862360066067, "grad_norm": 0.0013390496606007218, "learning_rate": 4.6903653722854157e-07, "loss": 0.0, "num_input_tokens_seen": 205619888, "step": 95350 }, { "epoch": 17.499541200220225, "grad_norm": 0.04114558547735214, "learning_rate": 4.6869798336813264e-07, "loss": 0.0, "num_input_tokens_seen": 205631248, "step": 95355 }, { "epoch": 17.500458799779775, "grad_norm": 0.0009411474457010627, "learning_rate": 4.683595457285989e-07, "loss": 0.0008, "num_input_tokens_seen": 205641712, "step": 95360 }, { "epoch": 17.50137639933933, "grad_norm": 0.02450540103018284, "learning_rate": 4.680212243186194e-07, "loss": 0.0207, "num_input_tokens_seen": 205652688, "step": 95365 }, { "epoch": 17.50229399889888, "grad_norm": 0.0005300983902998269, "learning_rate": 4.6768301914687007e-07, "loss": 0.0, "num_input_tokens_seen": 205662832, "step": 95370 }, { "epoch": 17.50321159845843, "grad_norm": 0.00037662091199308634, "learning_rate": 4.6734493022202845e-07, "loss": 0.0, "num_input_tokens_seen": 205673584, "step": 95375 }, { "epoch": 17.504129198017985, "grad_norm": 0.0005216243444010615, "learning_rate": 4.6700695755276414e-07, "loss": 0.0001, "num_input_tokens_seen": 205684688, "step": 95380 }, { "epoch": 17.50504679757754, "grad_norm": 0.0011447386350482702, "learning_rate": 4.666691011477448e-07, "loss": 0.0, "num_input_tokens_seen": 205694384, "step": 95385 }, { "epoch": 17.505964397137088, "grad_norm": 0.004289778880774975, "learning_rate": 4.663313610156378e-07, "loss": 0.0001, "num_input_tokens_seen": 205704912, "step": 95390 }, { "epoch": 17.50688199669664, "grad_norm": 0.002691199304535985, "learning_rate": 4.659937371651052e-07, "loss": 0.0001, "num_input_tokens_seen": 205715248, "step": 95395 }, { "epoch": 17.507799596256195, "grad_norm": 3.5916247367858887, "learning_rate": 4.656562296048062e-07, "loss": 0.0001, "num_input_tokens_seen": 205727120, "step": 95400 }, { "epoch": 17.508717195815745, "grad_norm": 0.008977985940873623, "learning_rate": 4.6531883834339595e-07, "loss": 0.0, "num_input_tokens_seen": 205738032, "step": 95405 }, { "epoch": 17.509634795375298, "grad_norm": 0.0023566826712340117, "learning_rate": 4.6498156338953047e-07, "loss": 0.0004, "num_input_tokens_seen": 205748944, "step": 95410 }, { "epoch": 17.51055239493485, "grad_norm": 0.01716538332402706, "learning_rate": 4.646444047518595e-07, "loss": 0.0, "num_input_tokens_seen": 205760016, "step": 95415 }, { "epoch": 17.5114699944944, "grad_norm": 89.62820434570312, "learning_rate": 4.643073624390293e-07, "loss": 0.1006, "num_input_tokens_seen": 205770768, "step": 95420 }, { "epoch": 17.512387594053955, "grad_norm": 0.00232643517665565, "learning_rate": 4.639704364596864e-07, "loss": 0.0, "num_input_tokens_seen": 205781808, "step": 95425 }, { "epoch": 17.51330519361351, "grad_norm": 0.0017464892007410526, "learning_rate": 4.636336268224717e-07, "loss": 0.0, "num_input_tokens_seen": 205792048, "step": 95430 }, { "epoch": 17.514222793173058, "grad_norm": 0.03873677924275398, "learning_rate": 4.632969335360238e-07, "loss": 0.0285, "num_input_tokens_seen": 205803024, "step": 95435 }, { "epoch": 17.51514039273261, "grad_norm": 0.0074301827698946, "learning_rate": 4.6296035660897744e-07, "loss": 0.0, "num_input_tokens_seen": 205814544, "step": 95440 }, { "epoch": 17.516057992292165, "grad_norm": 0.0005046168225817382, "learning_rate": 4.6262389604996684e-07, "loss": 0.0, "num_input_tokens_seen": 205824816, "step": 95445 }, { "epoch": 17.516975591851715, "grad_norm": 0.0338296964764595, "learning_rate": 4.622875518676212e-07, "loss": 0.0001, "num_input_tokens_seen": 205835344, "step": 95450 }, { "epoch": 17.517893191411268, "grad_norm": 0.0008044239366427064, "learning_rate": 4.6195132407056644e-07, "loss": 0.0703, "num_input_tokens_seen": 205845584, "step": 95455 }, { "epoch": 17.51881079097082, "grad_norm": 0.0015233848243951797, "learning_rate": 4.6161521266742726e-07, "loss": 0.0004, "num_input_tokens_seen": 205856528, "step": 95460 }, { "epoch": 17.51972839053037, "grad_norm": 0.030491260811686516, "learning_rate": 4.61279217666824e-07, "loss": 0.0002, "num_input_tokens_seen": 205868048, "step": 95465 }, { "epoch": 17.520645990089925, "grad_norm": 0.0032212105579674244, "learning_rate": 4.6094333907737375e-07, "loss": 0.0, "num_input_tokens_seen": 205878000, "step": 95470 }, { "epoch": 17.521563589649478, "grad_norm": 0.013040538877248764, "learning_rate": 4.606075769076929e-07, "loss": 0.0, "num_input_tokens_seen": 205889200, "step": 95475 }, { "epoch": 17.522481189209028, "grad_norm": 0.1117812767624855, "learning_rate": 4.6027193116639226e-07, "loss": 0.0033, "num_input_tokens_seen": 205900016, "step": 95480 }, { "epoch": 17.52339878876858, "grad_norm": 0.0009617287432774901, "learning_rate": 4.5993640186208054e-07, "loss": 0.0207, "num_input_tokens_seen": 205911088, "step": 95485 }, { "epoch": 17.524316388328135, "grad_norm": 0.009931118227541447, "learning_rate": 4.5960098900336256e-07, "loss": 0.0, "num_input_tokens_seen": 205921840, "step": 95490 }, { "epoch": 17.525233987887685, "grad_norm": 0.0030826798174530268, "learning_rate": 4.5926569259884313e-07, "loss": 0.0001, "num_input_tokens_seen": 205931824, "step": 95495 }, { "epoch": 17.526151587447238, "grad_norm": 0.019509313628077507, "learning_rate": 4.589305126571214e-07, "loss": 0.0, "num_input_tokens_seen": 205944464, "step": 95500 }, { "epoch": 17.52706918700679, "grad_norm": 0.0014325730735436082, "learning_rate": 4.585954491867933e-07, "loss": 0.0001, "num_input_tokens_seen": 205955952, "step": 95505 }, { "epoch": 17.52798678656634, "grad_norm": 0.0009198590996675193, "learning_rate": 4.5826050219645425e-07, "loss": 0.0, "num_input_tokens_seen": 205967888, "step": 95510 }, { "epoch": 17.528904386125895, "grad_norm": 0.0034555874299257994, "learning_rate": 4.579256716946939e-07, "loss": 0.0001, "num_input_tokens_seen": 205978128, "step": 95515 }, { "epoch": 17.529821985685448, "grad_norm": 88.14630126953125, "learning_rate": 4.5759095769010055e-07, "loss": 0.1314, "num_input_tokens_seen": 205988112, "step": 95520 }, { "epoch": 17.530739585244998, "grad_norm": 197.540771484375, "learning_rate": 4.572563601912583e-07, "loss": 0.0913, "num_input_tokens_seen": 205998480, "step": 95525 }, { "epoch": 17.53165718480455, "grad_norm": 0.0018929606303572655, "learning_rate": 4.5692187920675093e-07, "loss": 0.0, "num_input_tokens_seen": 206009296, "step": 95530 }, { "epoch": 17.532574784364105, "grad_norm": 0.0013468903489410877, "learning_rate": 4.565875147451559e-07, "loss": 0.0, "num_input_tokens_seen": 206020048, "step": 95535 }, { "epoch": 17.533492383923655, "grad_norm": 0.0014292197301983833, "learning_rate": 4.562532668150493e-07, "loss": 0.0703, "num_input_tokens_seen": 206031760, "step": 95540 }, { "epoch": 17.534409983483208, "grad_norm": 0.013585653156042099, "learning_rate": 4.5591913542500477e-07, "loss": 0.0001, "num_input_tokens_seen": 206043216, "step": 95545 }, { "epoch": 17.53532758304276, "grad_norm": 0.01404569111764431, "learning_rate": 4.555851205835904e-07, "loss": 0.0, "num_input_tokens_seen": 206054032, "step": 95550 }, { "epoch": 17.53624518260231, "grad_norm": 0.3290789723396301, "learning_rate": 4.5525122229937547e-07, "loss": 0.0002, "num_input_tokens_seen": 206064976, "step": 95555 }, { "epoch": 17.537162782161865, "grad_norm": 0.0020056518260389566, "learning_rate": 4.549174405809231e-07, "loss": 0.0005, "num_input_tokens_seen": 206076048, "step": 95560 }, { "epoch": 17.538080381721418, "grad_norm": 0.018634673207998276, "learning_rate": 4.545837754367938e-07, "loss": 0.0001, "num_input_tokens_seen": 206086416, "step": 95565 }, { "epoch": 17.538997981280968, "grad_norm": 70.9966049194336, "learning_rate": 4.5425022687554557e-07, "loss": 0.2064, "num_input_tokens_seen": 206096368, "step": 95570 }, { "epoch": 17.53991558084052, "grad_norm": 0.0014845718396827579, "learning_rate": 4.539167949057344e-07, "loss": 0.0, "num_input_tokens_seen": 206106672, "step": 95575 }, { "epoch": 17.540833180400075, "grad_norm": 0.0005830205627717078, "learning_rate": 4.535834795359112e-07, "loss": 0.0001, "num_input_tokens_seen": 206117808, "step": 95580 }, { "epoch": 17.541750779959624, "grad_norm": 0.0010730293579399586, "learning_rate": 4.5325028077462584e-07, "loss": 0.0001, "num_input_tokens_seen": 206129040, "step": 95585 }, { "epoch": 17.542668379519178, "grad_norm": 0.005011739209294319, "learning_rate": 4.529171986304232e-07, "loss": 0.0, "num_input_tokens_seen": 206139184, "step": 95590 }, { "epoch": 17.54358597907873, "grad_norm": 0.00149594247341156, "learning_rate": 4.5258423311184794e-07, "loss": 0.0, "num_input_tokens_seen": 206150224, "step": 95595 }, { "epoch": 17.54450357863828, "grad_norm": 0.041386839002370834, "learning_rate": 4.5225138422743897e-07, "loss": 0.0001, "num_input_tokens_seen": 206161296, "step": 95600 }, { "epoch": 17.545421178197834, "grad_norm": 0.001855007722042501, "learning_rate": 4.519186519857327e-07, "loss": 0.0, "num_input_tokens_seen": 206172880, "step": 95605 }, { "epoch": 17.546338777757388, "grad_norm": 0.004235436674207449, "learning_rate": 4.5158603639526565e-07, "loss": 0.0, "num_input_tokens_seen": 206184208, "step": 95610 }, { "epoch": 17.547256377316938, "grad_norm": 0.0010809764498844743, "learning_rate": 4.512535374645666e-07, "loss": 0.0, "num_input_tokens_seen": 206194480, "step": 95615 }, { "epoch": 17.54817397687649, "grad_norm": 0.0007015342125669122, "learning_rate": 4.509211552021647e-07, "loss": 0.0, "num_input_tokens_seen": 206204752, "step": 95620 }, { "epoch": 17.549091576436044, "grad_norm": 0.0008352193399332464, "learning_rate": 4.505888896165839e-07, "loss": 0.0001, "num_input_tokens_seen": 206215376, "step": 95625 }, { "epoch": 17.550009175995594, "grad_norm": 0.07042131572961807, "learning_rate": 4.502567407163477e-07, "loss": 0.0001, "num_input_tokens_seen": 206226736, "step": 95630 }, { "epoch": 17.550926775555148, "grad_norm": 0.001335029723122716, "learning_rate": 4.4992470850997506e-07, "loss": 0.0478, "num_input_tokens_seen": 206237424, "step": 95635 }, { "epoch": 17.5518443751147, "grad_norm": 0.05574249103665352, "learning_rate": 4.495927930059807e-07, "loss": 0.0001, "num_input_tokens_seen": 206248976, "step": 95640 }, { "epoch": 17.55276197467425, "grad_norm": 0.01483368780463934, "learning_rate": 4.492609942128795e-07, "loss": 0.0001, "num_input_tokens_seen": 206260848, "step": 95645 }, { "epoch": 17.553679574233804, "grad_norm": 0.008327300660312176, "learning_rate": 4.489293121391808e-07, "loss": 0.0426, "num_input_tokens_seen": 206271248, "step": 95650 }, { "epoch": 17.554597173793358, "grad_norm": 611.8167114257812, "learning_rate": 4.485977467933911e-07, "loss": 0.0883, "num_input_tokens_seen": 206281360, "step": 95655 }, { "epoch": 17.555514773352908, "grad_norm": 0.0020706362556666136, "learning_rate": 4.482662981840158e-07, "loss": 0.0872, "num_input_tokens_seen": 206290640, "step": 95660 }, { "epoch": 17.55643237291246, "grad_norm": 0.0034841804299503565, "learning_rate": 4.4793496631955533e-07, "loss": 0.0, "num_input_tokens_seen": 206301488, "step": 95665 }, { "epoch": 17.557349972472014, "grad_norm": 0.0033431267365813255, "learning_rate": 4.4760375120850797e-07, "loss": 0.0032, "num_input_tokens_seen": 206312528, "step": 95670 }, { "epoch": 17.558267572031564, "grad_norm": 0.009531269781291485, "learning_rate": 4.4727265285936796e-07, "loss": 0.0, "num_input_tokens_seen": 206323216, "step": 95675 }, { "epoch": 17.559185171591118, "grad_norm": 0.0013715780805796385, "learning_rate": 4.4694167128062903e-07, "loss": 0.0, "num_input_tokens_seen": 206333072, "step": 95680 }, { "epoch": 17.56010277115067, "grad_norm": 0.006668893154710531, "learning_rate": 4.4661080648078004e-07, "loss": 0.0, "num_input_tokens_seen": 206342736, "step": 95685 }, { "epoch": 17.56102037071022, "grad_norm": 0.0007321761222556233, "learning_rate": 4.4628005846830524e-07, "loss": 0.0, "num_input_tokens_seen": 206353200, "step": 95690 }, { "epoch": 17.561937970269774, "grad_norm": 0.007142455317080021, "learning_rate": 4.459494272516907e-07, "loss": 0.0, "num_input_tokens_seen": 206364304, "step": 95695 }, { "epoch": 17.562855569829328, "grad_norm": 0.0003262295213062316, "learning_rate": 4.4561891283941506e-07, "loss": 0.0, "num_input_tokens_seen": 206374480, "step": 95700 }, { "epoch": 17.563773169388877, "grad_norm": 0.013917204923927784, "learning_rate": 4.4528851523995496e-07, "loss": 0.0, "num_input_tokens_seen": 206385072, "step": 95705 }, { "epoch": 17.56469076894843, "grad_norm": 0.0005995052633807063, "learning_rate": 4.449582344617859e-07, "loss": 0.0, "num_input_tokens_seen": 206396240, "step": 95710 }, { "epoch": 17.565608368507984, "grad_norm": 0.001180009450763464, "learning_rate": 4.4462807051337875e-07, "loss": 0.0005, "num_input_tokens_seen": 206408240, "step": 95715 }, { "epoch": 17.566525968067534, "grad_norm": 0.0016174864722415805, "learning_rate": 4.442980234032007e-07, "loss": 0.0007, "num_input_tokens_seen": 206419536, "step": 95720 }, { "epoch": 17.567443567627087, "grad_norm": 0.0013581644743680954, "learning_rate": 4.4396809313971776e-07, "loss": 0.0, "num_input_tokens_seen": 206431952, "step": 95725 }, { "epoch": 17.56836116718664, "grad_norm": 0.0032157166860997677, "learning_rate": 4.4363827973139206e-07, "loss": 0.0822, "num_input_tokens_seen": 206443216, "step": 95730 }, { "epoch": 17.56927876674619, "grad_norm": 0.0017673266120254993, "learning_rate": 4.433085831866835e-07, "loss": 0.005, "num_input_tokens_seen": 206455024, "step": 95735 }, { "epoch": 17.570196366305744, "grad_norm": 0.0005862422985956073, "learning_rate": 4.429790035140469e-07, "loss": 0.0352, "num_input_tokens_seen": 206466800, "step": 95740 }, { "epoch": 17.571113965865298, "grad_norm": 0.0011497588129714131, "learning_rate": 4.4264954072193553e-07, "loss": 0.0003, "num_input_tokens_seen": 206477136, "step": 95745 }, { "epoch": 17.572031565424847, "grad_norm": 0.0511464960873127, "learning_rate": 4.4232019481880104e-07, "loss": 0.0001, "num_input_tokens_seen": 206487472, "step": 95750 }, { "epoch": 17.5729491649844, "grad_norm": 0.0020917747169733047, "learning_rate": 4.4199096581308996e-07, "loss": 0.0, "num_input_tokens_seen": 206497808, "step": 95755 }, { "epoch": 17.573866764543954, "grad_norm": 0.005448543932288885, "learning_rate": 4.416618537132461e-07, "loss": 0.0, "num_input_tokens_seen": 206508880, "step": 95760 }, { "epoch": 17.574784364103504, "grad_norm": 0.1383875608444214, "learning_rate": 4.4133285852771104e-07, "loss": 0.0001, "num_input_tokens_seen": 206519888, "step": 95765 }, { "epoch": 17.575701963663057, "grad_norm": 0.00963650457561016, "learning_rate": 4.4100398026492187e-07, "loss": 0.0, "num_input_tokens_seen": 206529904, "step": 95770 }, { "epoch": 17.57661956322261, "grad_norm": 0.0782654657959938, "learning_rate": 4.4067521893331576e-07, "loss": 0.0, "num_input_tokens_seen": 206540720, "step": 95775 }, { "epoch": 17.57753716278216, "grad_norm": 4.041144847869873, "learning_rate": 4.403465745413238e-07, "loss": 0.0016, "num_input_tokens_seen": 206551248, "step": 95780 }, { "epoch": 17.578454762341714, "grad_norm": 0.1473965346813202, "learning_rate": 4.400180470973753e-07, "loss": 0.0, "num_input_tokens_seen": 206562576, "step": 95785 }, { "epoch": 17.579372361901267, "grad_norm": 0.0005040266551077366, "learning_rate": 4.3968963660989627e-07, "loss": 0.0, "num_input_tokens_seen": 206573136, "step": 95790 }, { "epoch": 17.580289961460817, "grad_norm": 0.0019001801265403628, "learning_rate": 4.393613430873106e-07, "loss": 0.0001, "num_input_tokens_seen": 206583888, "step": 95795 }, { "epoch": 17.58120756102037, "grad_norm": 0.0009917090646922588, "learning_rate": 4.3903316653803816e-07, "loss": 0.0, "num_input_tokens_seen": 206593872, "step": 95800 }, { "epoch": 17.582125160579924, "grad_norm": 0.005774782504886389, "learning_rate": 4.387051069704962e-07, "loss": 0.0031, "num_input_tokens_seen": 206604784, "step": 95805 }, { "epoch": 17.583042760139474, "grad_norm": 0.003889924380928278, "learning_rate": 4.3837716439309843e-07, "loss": 0.0, "num_input_tokens_seen": 206614768, "step": 95810 }, { "epoch": 17.583960359699027, "grad_norm": 0.0012043101014569402, "learning_rate": 4.38049338814257e-07, "loss": 0.0, "num_input_tokens_seen": 206625264, "step": 95815 }, { "epoch": 17.58487795925858, "grad_norm": 0.00034284411231055856, "learning_rate": 4.3772163024237923e-07, "loss": 0.0001, "num_input_tokens_seen": 206636720, "step": 95820 }, { "epoch": 17.58579555881813, "grad_norm": 0.00043377073598094285, "learning_rate": 4.373940386858705e-07, "loss": 0.0, "num_input_tokens_seen": 206648368, "step": 95825 }, { "epoch": 17.586713158377684, "grad_norm": 0.7554532289505005, "learning_rate": 4.370665641531341e-07, "loss": 0.0002, "num_input_tokens_seen": 206660784, "step": 95830 }, { "epoch": 17.587630757937237, "grad_norm": 0.0021795069333165884, "learning_rate": 4.3673920665256833e-07, "loss": 0.0, "num_input_tokens_seen": 206671664, "step": 95835 }, { "epoch": 17.588548357496787, "grad_norm": 0.15690360963344574, "learning_rate": 4.3641196619256867e-07, "loss": 0.1813, "num_input_tokens_seen": 206683376, "step": 95840 }, { "epoch": 17.58946595705634, "grad_norm": 0.0011679118033498526, "learning_rate": 4.360848427815295e-07, "loss": 0.0, "num_input_tokens_seen": 206693104, "step": 95845 }, { "epoch": 17.590383556615894, "grad_norm": 0.003810885362327099, "learning_rate": 4.3575783642784144e-07, "loss": 0.0, "num_input_tokens_seen": 206703408, "step": 95850 }, { "epoch": 17.591301156175444, "grad_norm": 0.03452655300498009, "learning_rate": 4.354309471398904e-07, "loss": 0.0, "num_input_tokens_seen": 206714416, "step": 95855 }, { "epoch": 17.592218755734997, "grad_norm": 0.003700646571815014, "learning_rate": 4.351041749260604e-07, "loss": 0.0001, "num_input_tokens_seen": 206724400, "step": 95860 }, { "epoch": 17.59313635529455, "grad_norm": 0.003091315506026149, "learning_rate": 4.3477751979473457e-07, "loss": 0.0943, "num_input_tokens_seen": 206734768, "step": 95865 }, { "epoch": 17.5940539548541, "grad_norm": 0.0012303482508286834, "learning_rate": 4.3445098175428966e-07, "loss": 0.0, "num_input_tokens_seen": 206744912, "step": 95870 }, { "epoch": 17.594971554413654, "grad_norm": 0.002408055355772376, "learning_rate": 4.3412456081310006e-07, "loss": 0.0032, "num_input_tokens_seen": 206755888, "step": 95875 }, { "epoch": 17.595889153973207, "grad_norm": 0.0013401410542428493, "learning_rate": 4.3379825697954014e-07, "loss": 0.0, "num_input_tokens_seen": 206764880, "step": 95880 }, { "epoch": 17.596806753532757, "grad_norm": 0.001267329789698124, "learning_rate": 4.334720702619777e-07, "loss": 0.0, "num_input_tokens_seen": 206775440, "step": 95885 }, { "epoch": 17.59772435309231, "grad_norm": 0.0007667011232115328, "learning_rate": 4.3314600066877934e-07, "loss": 0.0, "num_input_tokens_seen": 206785968, "step": 95890 }, { "epoch": 17.598641952651864, "grad_norm": 0.005837936419993639, "learning_rate": 4.3282004820830726e-07, "loss": 0.0, "num_input_tokens_seen": 206797776, "step": 95895 }, { "epoch": 17.599559552211414, "grad_norm": 0.0016097206389531493, "learning_rate": 4.3249421288892313e-07, "loss": 0.0, "num_input_tokens_seen": 206809040, "step": 95900 }, { "epoch": 17.600477151770967, "grad_norm": 0.0006894854013808072, "learning_rate": 4.3216849471898356e-07, "loss": 0.0, "num_input_tokens_seen": 206820496, "step": 95905 }, { "epoch": 17.60139475133052, "grad_norm": 0.001981648150831461, "learning_rate": 4.3184289370684195e-07, "loss": 0.0, "num_input_tokens_seen": 206830544, "step": 95910 }, { "epoch": 17.60231235089007, "grad_norm": 0.0006016957340762019, "learning_rate": 4.31517409860851e-07, "loss": 0.0001, "num_input_tokens_seen": 206841168, "step": 95915 }, { "epoch": 17.603229950449624, "grad_norm": 0.0009030045475810766, "learning_rate": 4.311920431893579e-07, "loss": 0.0002, "num_input_tokens_seen": 206851696, "step": 95920 }, { "epoch": 17.604147550009177, "grad_norm": 0.0013608421431854367, "learning_rate": 4.3086679370070664e-07, "loss": 0.0001, "num_input_tokens_seen": 206861392, "step": 95925 }, { "epoch": 17.605065149568727, "grad_norm": 0.17314545810222626, "learning_rate": 4.305416614032415e-07, "loss": 0.0001, "num_input_tokens_seen": 206872816, "step": 95930 }, { "epoch": 17.60598274912828, "grad_norm": 0.04067061468958855, "learning_rate": 4.302166463053015e-07, "loss": 0.0, "num_input_tokens_seen": 206884304, "step": 95935 }, { "epoch": 17.606900348687834, "grad_norm": 0.005518024787306786, "learning_rate": 4.29891748415221e-07, "loss": 0.0, "num_input_tokens_seen": 206893936, "step": 95940 }, { "epoch": 17.607817948247384, "grad_norm": 0.2742181420326233, "learning_rate": 4.295669677413339e-07, "loss": 0.0001, "num_input_tokens_seen": 206905104, "step": 95945 }, { "epoch": 17.608735547806937, "grad_norm": 0.08265189826488495, "learning_rate": 4.2924230429197135e-07, "loss": 0.0, "num_input_tokens_seen": 206915248, "step": 95950 }, { "epoch": 17.60965314736649, "grad_norm": 0.020095139741897583, "learning_rate": 4.2891775807545944e-07, "loss": 0.0, "num_input_tokens_seen": 206925488, "step": 95955 }, { "epoch": 17.61057074692604, "grad_norm": 0.0192551389336586, "learning_rate": 4.2859332910012264e-07, "loss": 0.0, "num_input_tokens_seen": 206937264, "step": 95960 }, { "epoch": 17.611488346485594, "grad_norm": 0.011757627129554749, "learning_rate": 4.2826901737428093e-07, "loss": 0.0, "num_input_tokens_seen": 206947792, "step": 95965 }, { "epoch": 17.612405946045147, "grad_norm": 0.0017062807455658913, "learning_rate": 4.279448229062544e-07, "loss": 0.0, "num_input_tokens_seen": 206958640, "step": 95970 }, { "epoch": 17.613323545604697, "grad_norm": 0.0515545979142189, "learning_rate": 4.276207457043569e-07, "loss": 0.0146, "num_input_tokens_seen": 206968816, "step": 95975 }, { "epoch": 17.61424114516425, "grad_norm": 0.0025444249622523785, "learning_rate": 4.2729678577690117e-07, "loss": 0.0, "num_input_tokens_seen": 206979472, "step": 95980 }, { "epoch": 17.615158744723804, "grad_norm": 0.0005706365336664021, "learning_rate": 4.2697294313219564e-07, "loss": 0.0, "num_input_tokens_seen": 206990640, "step": 95985 }, { "epoch": 17.616076344283353, "grad_norm": 0.005599426105618477, "learning_rate": 4.266492177785464e-07, "loss": 0.0, "num_input_tokens_seen": 207000560, "step": 95990 }, { "epoch": 17.616993943842907, "grad_norm": 0.010104498825967312, "learning_rate": 4.263256097242557e-07, "loss": 0.0, "num_input_tokens_seen": 207012368, "step": 95995 }, { "epoch": 17.61791154340246, "grad_norm": 0.0029272176325321198, "learning_rate": 4.260021189776259e-07, "loss": 0.0, "num_input_tokens_seen": 207024112, "step": 96000 }, { "epoch": 17.61882914296201, "grad_norm": 0.04353564605116844, "learning_rate": 4.2567874554695187e-07, "loss": 0.0, "num_input_tokens_seen": 207035632, "step": 96005 }, { "epoch": 17.619746742521563, "grad_norm": 0.006433050148189068, "learning_rate": 4.2535548944052816e-07, "loss": 0.0, "num_input_tokens_seen": 207046128, "step": 96010 }, { "epoch": 17.620664342081117, "grad_norm": 0.006783795543015003, "learning_rate": 4.2503235066664705e-07, "loss": 0.0, "num_input_tokens_seen": 207057168, "step": 96015 }, { "epoch": 17.621581941640667, "grad_norm": 0.013228860683739185, "learning_rate": 4.2470932923359575e-07, "loss": 0.0, "num_input_tokens_seen": 207067504, "step": 96020 }, { "epoch": 17.62249954120022, "grad_norm": 182.1025848388672, "learning_rate": 4.2438642514965765e-07, "loss": 0.0207, "num_input_tokens_seen": 207078256, "step": 96025 }, { "epoch": 17.623417140759774, "grad_norm": 0.0011337142204865813, "learning_rate": 4.2406363842311727e-07, "loss": 0.0, "num_input_tokens_seen": 207089520, "step": 96030 }, { "epoch": 17.624334740319323, "grad_norm": 0.0006072754040360451, "learning_rate": 4.2374096906225293e-07, "loss": 0.0, "num_input_tokens_seen": 207100336, "step": 96035 }, { "epoch": 17.625252339878877, "grad_norm": 0.0009061470627784729, "learning_rate": 4.234184170753397e-07, "loss": 0.0, "num_input_tokens_seen": 207111248, "step": 96040 }, { "epoch": 17.62616993943843, "grad_norm": 0.017347749322652817, "learning_rate": 4.230959824706504e-07, "loss": 0.0001, "num_input_tokens_seen": 207122800, "step": 96045 }, { "epoch": 17.62708753899798, "grad_norm": 0.0014635998522862792, "learning_rate": 4.2277366525645625e-07, "loss": 0.0, "num_input_tokens_seen": 207133264, "step": 96050 }, { "epoch": 17.628005138557533, "grad_norm": 0.001163674984127283, "learning_rate": 4.224514654410233e-07, "loss": 0.0478, "num_input_tokens_seen": 207145168, "step": 96055 }, { "epoch": 17.628922738117087, "grad_norm": 0.02448223903775215, "learning_rate": 4.221293830326151e-07, "loss": 0.0, "num_input_tokens_seen": 207156784, "step": 96060 }, { "epoch": 17.629840337676637, "grad_norm": 0.0028447045478969812, "learning_rate": 4.2180741803949433e-07, "loss": 0.0, "num_input_tokens_seen": 207167920, "step": 96065 }, { "epoch": 17.63075793723619, "grad_norm": 0.001530398498289287, "learning_rate": 4.214855704699172e-07, "loss": 0.0, "num_input_tokens_seen": 207177872, "step": 96070 }, { "epoch": 17.631675536795743, "grad_norm": 0.000750972714740783, "learning_rate": 4.211638403321394e-07, "loss": 0.0, "num_input_tokens_seen": 207187792, "step": 96075 }, { "epoch": 17.632593136355293, "grad_norm": 0.029473476111888885, "learning_rate": 4.208422276344121e-07, "loss": 0.0001, "num_input_tokens_seen": 207198960, "step": 96080 }, { "epoch": 17.633510735914847, "grad_norm": 0.005870228633284569, "learning_rate": 4.205207323849847e-07, "loss": 0.0, "num_input_tokens_seen": 207208432, "step": 96085 }, { "epoch": 17.6344283354744, "grad_norm": 0.0051335617899894714, "learning_rate": 4.2019935459210346e-07, "loss": 0.0, "num_input_tokens_seen": 207219152, "step": 96090 }, { "epoch": 17.63534593503395, "grad_norm": 0.17149017751216888, "learning_rate": 4.1987809426400963e-07, "loss": 0.0, "num_input_tokens_seen": 207231184, "step": 96095 }, { "epoch": 17.636263534593503, "grad_norm": 0.002011901466175914, "learning_rate": 4.1955695140894537e-07, "loss": 0.0, "num_input_tokens_seen": 207241424, "step": 96100 }, { "epoch": 17.637181134153057, "grad_norm": 0.0033842630218714476, "learning_rate": 4.1923592603514586e-07, "loss": 0.0, "num_input_tokens_seen": 207251952, "step": 96105 }, { "epoch": 17.638098733712607, "grad_norm": 0.0030749875586479902, "learning_rate": 4.1891501815084447e-07, "loss": 0.0, "num_input_tokens_seen": 207263408, "step": 96110 }, { "epoch": 17.63901633327216, "grad_norm": 49.7833137512207, "learning_rate": 4.1859422776427404e-07, "loss": 0.0762, "num_input_tokens_seen": 207274768, "step": 96115 }, { "epoch": 17.639933932831713, "grad_norm": 0.114956334233284, "learning_rate": 4.1827355488366085e-07, "loss": 0.0005, "num_input_tokens_seen": 207285200, "step": 96120 }, { "epoch": 17.640851532391263, "grad_norm": 0.006685640197247267, "learning_rate": 4.179529995172299e-07, "loss": 0.0002, "num_input_tokens_seen": 207297296, "step": 96125 }, { "epoch": 17.641769131950817, "grad_norm": 0.05858352407813072, "learning_rate": 4.1763256167320245e-07, "loss": 0.0, "num_input_tokens_seen": 207306960, "step": 96130 }, { "epoch": 17.64268673151037, "grad_norm": 0.006219387520104647, "learning_rate": 4.17312241359798e-07, "loss": 0.0, "num_input_tokens_seen": 207317392, "step": 96135 }, { "epoch": 17.64360433106992, "grad_norm": 0.008944499306380749, "learning_rate": 4.1699203858523276e-07, "loss": 0.0, "num_input_tokens_seen": 207329168, "step": 96140 }, { "epoch": 17.644521930629473, "grad_norm": 0.002982508623972535, "learning_rate": 4.166719533577174e-07, "loss": 0.0, "num_input_tokens_seen": 207340240, "step": 96145 }, { "epoch": 17.645439530189027, "grad_norm": 0.004783967509865761, "learning_rate": 4.1635198568546363e-07, "loss": 0.0001, "num_input_tokens_seen": 207350928, "step": 96150 }, { "epoch": 17.646357129748576, "grad_norm": 0.008784701116383076, "learning_rate": 4.160321355766778e-07, "loss": 0.0, "num_input_tokens_seen": 207360912, "step": 96155 }, { "epoch": 17.64727472930813, "grad_norm": 0.0023525531869381666, "learning_rate": 4.1571240303956204e-07, "loss": 0.0, "num_input_tokens_seen": 207372144, "step": 96160 }, { "epoch": 17.648192328867683, "grad_norm": 0.0015190563863143325, "learning_rate": 4.1539278808231944e-07, "loss": 0.004, "num_input_tokens_seen": 207382736, "step": 96165 }, { "epoch": 17.649109928427233, "grad_norm": 0.0845465138554573, "learning_rate": 4.1507329071314604e-07, "loss": 0.0, "num_input_tokens_seen": 207394512, "step": 96170 }, { "epoch": 17.650027527986786, "grad_norm": 0.0013681409182026982, "learning_rate": 4.1475391094023656e-07, "loss": 0.0, "num_input_tokens_seen": 207404976, "step": 96175 }, { "epoch": 17.65094512754634, "grad_norm": 0.0018896532710641623, "learning_rate": 4.144346487717832e-07, "loss": 0.0, "num_input_tokens_seen": 207415632, "step": 96180 }, { "epoch": 17.65186272710589, "grad_norm": 0.0019305844325572252, "learning_rate": 4.141155042159739e-07, "loss": 0.0, "num_input_tokens_seen": 207427184, "step": 96185 }, { "epoch": 17.652780326665443, "grad_norm": 0.0674409493803978, "learning_rate": 4.137964772809938e-07, "loss": 0.0, "num_input_tokens_seen": 207439344, "step": 96190 }, { "epoch": 17.653697926224996, "grad_norm": 0.0030336412601172924, "learning_rate": 4.134775679750264e-07, "loss": 0.0, "num_input_tokens_seen": 207449552, "step": 96195 }, { "epoch": 17.654615525784546, "grad_norm": 0.0005227068904787302, "learning_rate": 4.131587763062511e-07, "loss": 0.0, "num_input_tokens_seen": 207460976, "step": 96200 }, { "epoch": 17.6555331253441, "grad_norm": 0.02363806962966919, "learning_rate": 4.128401022828449e-07, "loss": 0.0001, "num_input_tokens_seen": 207472816, "step": 96205 }, { "epoch": 17.656450724903653, "grad_norm": 0.16515813767910004, "learning_rate": 4.1252154591298e-07, "loss": 0.0001, "num_input_tokens_seen": 207484176, "step": 96210 }, { "epoch": 17.657368324463203, "grad_norm": 0.002480296418070793, "learning_rate": 4.122031072048266e-07, "loss": 0.0, "num_input_tokens_seen": 207495856, "step": 96215 }, { "epoch": 17.658285924022756, "grad_norm": 0.06450270116329193, "learning_rate": 4.118847861665537e-07, "loss": 0.0, "num_input_tokens_seen": 207506640, "step": 96220 }, { "epoch": 17.65920352358231, "grad_norm": 0.020983338356018066, "learning_rate": 4.115665828063259e-07, "loss": 0.0002, "num_input_tokens_seen": 207518256, "step": 96225 }, { "epoch": 17.66012112314186, "grad_norm": 0.0012592572020366788, "learning_rate": 4.112484971323022e-07, "loss": 0.0, "num_input_tokens_seen": 207528304, "step": 96230 }, { "epoch": 17.661038722701413, "grad_norm": 0.0008347777184098959, "learning_rate": 4.1093052915264386e-07, "loss": 0.0002, "num_input_tokens_seen": 207539728, "step": 96235 }, { "epoch": 17.661956322260966, "grad_norm": 0.0014615828404203057, "learning_rate": 4.106126788755049e-07, "loss": 0.0001, "num_input_tokens_seen": 207550960, "step": 96240 }, { "epoch": 17.662873921820516, "grad_norm": 0.07756983488798141, "learning_rate": 4.102949463090372e-07, "loss": 0.0, "num_input_tokens_seen": 207561808, "step": 96245 }, { "epoch": 17.66379152138007, "grad_norm": 0.033405277878046036, "learning_rate": 4.099773314613914e-07, "loss": 0.0, "num_input_tokens_seen": 207571920, "step": 96250 }, { "epoch": 17.664709120939623, "grad_norm": 0.002394711831584573, "learning_rate": 4.096598343407132e-07, "loss": 0.0144, "num_input_tokens_seen": 207583216, "step": 96255 }, { "epoch": 17.665626720499173, "grad_norm": 0.001163336681202054, "learning_rate": 4.093424549551456e-07, "loss": 0.0, "num_input_tokens_seen": 207593520, "step": 96260 }, { "epoch": 17.666544320058726, "grad_norm": 0.002272007055580616, "learning_rate": 4.0902519331282863e-07, "loss": 0.0004, "num_input_tokens_seen": 207603952, "step": 96265 }, { "epoch": 17.66746191961828, "grad_norm": 0.25332626700401306, "learning_rate": 4.087080494219009e-07, "loss": 0.0, "num_input_tokens_seen": 207615312, "step": 96270 }, { "epoch": 17.66837951917783, "grad_norm": 0.006540936883538961, "learning_rate": 4.0839102329049585e-07, "loss": 0.0, "num_input_tokens_seen": 207626064, "step": 96275 }, { "epoch": 17.669297118737383, "grad_norm": 0.00411625113338232, "learning_rate": 4.0807411492674364e-07, "loss": 0.0, "num_input_tokens_seen": 207636656, "step": 96280 }, { "epoch": 17.670214718296936, "grad_norm": 0.0009643612429499626, "learning_rate": 4.0775732433877504e-07, "loss": 0.0, "num_input_tokens_seen": 207647600, "step": 96285 }, { "epoch": 17.671132317856486, "grad_norm": 0.005085665732622147, "learning_rate": 4.0744065153471293e-07, "loss": 0.0032, "num_input_tokens_seen": 207658544, "step": 96290 }, { "epoch": 17.67204991741604, "grad_norm": 0.0019877918530255556, "learning_rate": 4.071240965226797e-07, "loss": 0.0, "num_input_tokens_seen": 207670512, "step": 96295 }, { "epoch": 17.672967516975593, "grad_norm": 0.001380312372930348, "learning_rate": 4.0680765931079614e-07, "loss": 0.0025, "num_input_tokens_seen": 207681168, "step": 96300 }, { "epoch": 17.673885116535143, "grad_norm": 0.0008034820784814656, "learning_rate": 4.064913399071774e-07, "loss": 0.0001, "num_input_tokens_seen": 207693424, "step": 96305 }, { "epoch": 17.674802716094696, "grad_norm": 0.016938867047429085, "learning_rate": 4.061751383199358e-07, "loss": 0.0, "num_input_tokens_seen": 207704208, "step": 96310 }, { "epoch": 17.67572031565425, "grad_norm": 0.0011797567130997777, "learning_rate": 4.058590545571817e-07, "loss": 0.0, "num_input_tokens_seen": 207714512, "step": 96315 }, { "epoch": 17.6766379152138, "grad_norm": 0.0004617877129931003, "learning_rate": 4.055430886270234e-07, "loss": 0.004, "num_input_tokens_seen": 207725488, "step": 96320 }, { "epoch": 17.677555514773353, "grad_norm": 22.020645141601562, "learning_rate": 4.0522724053756457e-07, "loss": 0.1551, "num_input_tokens_seen": 207736624, "step": 96325 }, { "epoch": 17.678473114332906, "grad_norm": 0.013931899331510067, "learning_rate": 4.0491151029690423e-07, "loss": 0.0, "num_input_tokens_seen": 207746512, "step": 96330 }, { "epoch": 17.679390713892456, "grad_norm": 0.0007289614295586944, "learning_rate": 4.0459589791314313e-07, "loss": 0.0001, "num_input_tokens_seen": 207756752, "step": 96335 }, { "epoch": 17.68030831345201, "grad_norm": 0.009032624773681164, "learning_rate": 4.042804033943748e-07, "loss": 0.0174, "num_input_tokens_seen": 207768368, "step": 96340 }, { "epoch": 17.681225913011563, "grad_norm": 0.0004753676475957036, "learning_rate": 4.039650267486911e-07, "loss": 0.0097, "num_input_tokens_seen": 207779376, "step": 96345 }, { "epoch": 17.682143512571113, "grad_norm": 0.0014535870868712664, "learning_rate": 4.0364976798418166e-07, "loss": 0.0, "num_input_tokens_seen": 207789136, "step": 96350 }, { "epoch": 17.683061112130666, "grad_norm": 0.006218998227268457, "learning_rate": 4.0333462710893167e-07, "loss": 0.0, "num_input_tokens_seen": 207801200, "step": 96355 }, { "epoch": 17.68397871169022, "grad_norm": 0.005681033246219158, "learning_rate": 4.030196041310247e-07, "loss": 0.0013, "num_input_tokens_seen": 207811568, "step": 96360 }, { "epoch": 17.68489631124977, "grad_norm": 0.0013401148607954383, "learning_rate": 4.0270469905853927e-07, "loss": 0.0, "num_input_tokens_seen": 207822576, "step": 96365 }, { "epoch": 17.685813910809323, "grad_norm": 0.043634723871946335, "learning_rate": 4.0238991189955443e-07, "loss": 0.0001, "num_input_tokens_seen": 207834256, "step": 96370 }, { "epoch": 17.686731510368876, "grad_norm": 0.030462203547358513, "learning_rate": 4.0207524266214213e-07, "loss": 0.0, "num_input_tokens_seen": 207845744, "step": 96375 }, { "epoch": 17.687649109928426, "grad_norm": 0.005600954405963421, "learning_rate": 4.017606913543737e-07, "loss": 0.0, "num_input_tokens_seen": 207856112, "step": 96380 }, { "epoch": 17.68856670948798, "grad_norm": 0.008327946998178959, "learning_rate": 4.0144625798431646e-07, "loss": 0.0, "num_input_tokens_seen": 207866768, "step": 96385 }, { "epoch": 17.689484309047533, "grad_norm": 0.0027389039751142263, "learning_rate": 4.011319425600363e-07, "loss": 0.0001, "num_input_tokens_seen": 207876464, "step": 96390 }, { "epoch": 17.690401908607083, "grad_norm": 0.007545880042016506, "learning_rate": 4.0081774508959394e-07, "loss": 0.0, "num_input_tokens_seen": 207887856, "step": 96395 }, { "epoch": 17.691319508166636, "grad_norm": 0.0018681030487641692, "learning_rate": 4.005036655810485e-07, "loss": 0.0, "num_input_tokens_seen": 207899376, "step": 96400 }, { "epoch": 17.69223710772619, "grad_norm": 0.0018238305347040296, "learning_rate": 4.001897040424557e-07, "loss": 0.0, "num_input_tokens_seen": 207910384, "step": 96405 }, { "epoch": 17.69315470728574, "grad_norm": 0.004431906156241894, "learning_rate": 3.99875860481867e-07, "loss": 0.0, "num_input_tokens_seen": 207921808, "step": 96410 }, { "epoch": 17.694072306845293, "grad_norm": 61.990234375, "learning_rate": 3.995621349073336e-07, "loss": 0.0426, "num_input_tokens_seen": 207932080, "step": 96415 }, { "epoch": 17.694989906404846, "grad_norm": 0.001396095845848322, "learning_rate": 3.9924852732690144e-07, "loss": 0.0645, "num_input_tokens_seen": 207942320, "step": 96420 }, { "epoch": 17.695907505964396, "grad_norm": 1.3950610160827637, "learning_rate": 3.9893503774861396e-07, "loss": 0.0916, "num_input_tokens_seen": 207952240, "step": 96425 }, { "epoch": 17.69682510552395, "grad_norm": 0.007049701642245054, "learning_rate": 3.9862166618051147e-07, "loss": 0.0, "num_input_tokens_seen": 207962704, "step": 96430 }, { "epoch": 17.697742705083503, "grad_norm": 0.001966596581041813, "learning_rate": 3.983084126306319e-07, "loss": 0.0, "num_input_tokens_seen": 207972912, "step": 96435 }, { "epoch": 17.698660304643052, "grad_norm": 0.0005912979249842465, "learning_rate": 3.9799527710701e-07, "loss": 0.0, "num_input_tokens_seen": 207982512, "step": 96440 }, { "epoch": 17.699577904202606, "grad_norm": 0.0019350546645000577, "learning_rate": 3.976822596176766e-07, "loss": 0.0, "num_input_tokens_seen": 207993360, "step": 96445 }, { "epoch": 17.70049550376216, "grad_norm": 0.005812793970108032, "learning_rate": 3.9736936017065906e-07, "loss": 0.0004, "num_input_tokens_seen": 208006128, "step": 96450 }, { "epoch": 17.70141310332171, "grad_norm": 0.0005382089293561876, "learning_rate": 3.970565787739855e-07, "loss": 0.0, "num_input_tokens_seen": 208017328, "step": 96455 }, { "epoch": 17.702330702881262, "grad_norm": 0.00040228667785413563, "learning_rate": 3.967439154356767e-07, "loss": 0.0, "num_input_tokens_seen": 208027120, "step": 96460 }, { "epoch": 17.703248302440816, "grad_norm": 0.0024566540960222483, "learning_rate": 3.9643137016375064e-07, "loss": 0.0, "num_input_tokens_seen": 208038352, "step": 96465 }, { "epoch": 17.704165902000366, "grad_norm": 0.002181225223466754, "learning_rate": 3.9611894296622653e-07, "loss": 0.0, "num_input_tokens_seen": 208047888, "step": 96470 }, { "epoch": 17.70508350155992, "grad_norm": 0.012384168803691864, "learning_rate": 3.958066338511157e-07, "loss": 0.0, "num_input_tokens_seen": 208058736, "step": 96475 }, { "epoch": 17.706001101119472, "grad_norm": 0.0014443043619394302, "learning_rate": 3.9549444282642847e-07, "loss": 0.0, "num_input_tokens_seen": 208069392, "step": 96480 }, { "epoch": 17.706918700679022, "grad_norm": 0.013344969600439072, "learning_rate": 3.9518236990017276e-07, "loss": 0.0001, "num_input_tokens_seen": 208080688, "step": 96485 }, { "epoch": 17.707836300238576, "grad_norm": 0.0012842401629313827, "learning_rate": 3.9487041508035284e-07, "loss": 0.0, "num_input_tokens_seen": 208090800, "step": 96490 }, { "epoch": 17.70875389979813, "grad_norm": 6.388703346252441, "learning_rate": 3.9455857837496945e-07, "loss": 0.0004, "num_input_tokens_seen": 208100848, "step": 96495 }, { "epoch": 17.70967149935768, "grad_norm": 0.013603661209344864, "learning_rate": 3.9424685979202013e-07, "loss": 0.0, "num_input_tokens_seen": 208110992, "step": 96500 }, { "epoch": 17.710589098917232, "grad_norm": 0.022434739395976067, "learning_rate": 3.939352593395007e-07, "loss": 0.0, "num_input_tokens_seen": 208121296, "step": 96505 }, { "epoch": 17.711506698476786, "grad_norm": 0.002385573461651802, "learning_rate": 3.9362377702540367e-07, "loss": 0.0, "num_input_tokens_seen": 208132368, "step": 96510 }, { "epoch": 17.712424298036336, "grad_norm": 5.076231002807617, "learning_rate": 3.933124128577165e-07, "loss": 0.0006, "num_input_tokens_seen": 208142096, "step": 96515 }, { "epoch": 17.71334189759589, "grad_norm": 0.0014259492745622993, "learning_rate": 3.9300116684442724e-07, "loss": 0.0, "num_input_tokens_seen": 208152432, "step": 96520 }, { "epoch": 17.714259497155442, "grad_norm": 0.0058043766766786575, "learning_rate": 3.9269003899351786e-07, "loss": 0.0, "num_input_tokens_seen": 208163888, "step": 96525 }, { "epoch": 17.715177096714992, "grad_norm": 0.10187377035617828, "learning_rate": 3.9237902931296813e-07, "loss": 0.0001, "num_input_tokens_seen": 208175728, "step": 96530 }, { "epoch": 17.716094696274546, "grad_norm": 0.00219776201993227, "learning_rate": 3.920681378107544e-07, "loss": 0.0, "num_input_tokens_seen": 208185680, "step": 96535 }, { "epoch": 17.7170122958341, "grad_norm": 0.002270181430503726, "learning_rate": 3.917573644948519e-07, "loss": 0.0, "num_input_tokens_seen": 208196592, "step": 96540 }, { "epoch": 17.71792989539365, "grad_norm": 0.0020497897639870644, "learning_rate": 3.9144670937323104e-07, "loss": 0.0, "num_input_tokens_seen": 208207184, "step": 96545 }, { "epoch": 17.718847494953202, "grad_norm": 0.003633238608017564, "learning_rate": 3.911361724538587e-07, "loss": 0.0, "num_input_tokens_seen": 208218992, "step": 96550 }, { "epoch": 17.719765094512756, "grad_norm": 0.004287021700292826, "learning_rate": 3.908257537447008e-07, "loss": 0.0, "num_input_tokens_seen": 208230032, "step": 96555 }, { "epoch": 17.720682694072305, "grad_norm": 0.0010257562389597297, "learning_rate": 3.905154532537192e-07, "loss": 0.0089, "num_input_tokens_seen": 208240592, "step": 96560 }, { "epoch": 17.72160029363186, "grad_norm": 0.07169397175312042, "learning_rate": 3.902052709888715e-07, "loss": 0.0, "num_input_tokens_seen": 208250480, "step": 96565 }, { "epoch": 17.722517893191412, "grad_norm": 0.0012610359117388725, "learning_rate": 3.898952069581147e-07, "loss": 0.0097, "num_input_tokens_seen": 208261040, "step": 96570 }, { "epoch": 17.723435492750962, "grad_norm": 0.0036544871982187033, "learning_rate": 3.8958526116940066e-07, "loss": 0.0, "num_input_tokens_seen": 208271920, "step": 96575 }, { "epoch": 17.724353092310515, "grad_norm": 0.04136835038661957, "learning_rate": 3.892754336306792e-07, "loss": 0.0, "num_input_tokens_seen": 208283376, "step": 96580 }, { "epoch": 17.72527069187007, "grad_norm": 0.0041059996001422405, "learning_rate": 3.889657243498962e-07, "loss": 0.0, "num_input_tokens_seen": 208293712, "step": 96585 }, { "epoch": 17.72618829142962, "grad_norm": 0.0006167622632347047, "learning_rate": 3.8865613333499697e-07, "loss": 0.0, "num_input_tokens_seen": 208304304, "step": 96590 }, { "epoch": 17.727105890989172, "grad_norm": 29.98048973083496, "learning_rate": 3.8834666059392067e-07, "loss": 0.0087, "num_input_tokens_seen": 208315216, "step": 96595 }, { "epoch": 17.728023490548726, "grad_norm": 0.00581563962623477, "learning_rate": 3.8803730613460544e-07, "loss": 0.0001, "num_input_tokens_seen": 208325616, "step": 96600 }, { "epoch": 17.728941090108275, "grad_norm": 0.005639002192765474, "learning_rate": 3.877280699649838e-07, "loss": 0.0, "num_input_tokens_seen": 208336112, "step": 96605 }, { "epoch": 17.72985868966783, "grad_norm": 0.019729454070329666, "learning_rate": 3.8741895209299053e-07, "loss": 0.0, "num_input_tokens_seen": 208347472, "step": 96610 }, { "epoch": 17.730776289227382, "grad_norm": 0.002279460895806551, "learning_rate": 3.87109952526552e-07, "loss": 0.0, "num_input_tokens_seen": 208358000, "step": 96615 }, { "epoch": 17.731693888786932, "grad_norm": 0.000860240834299475, "learning_rate": 3.8680107127359367e-07, "loss": 0.0, "num_input_tokens_seen": 208370192, "step": 96620 }, { "epoch": 17.732611488346485, "grad_norm": 0.025168776512145996, "learning_rate": 3.86492308342038e-07, "loss": 0.0, "num_input_tokens_seen": 208381744, "step": 96625 }, { "epoch": 17.73352908790604, "grad_norm": 0.013553456403315067, "learning_rate": 3.8618366373980364e-07, "loss": 0.0, "num_input_tokens_seen": 208393008, "step": 96630 }, { "epoch": 17.73444668746559, "grad_norm": 0.00502223102375865, "learning_rate": 3.858751374748082e-07, "loss": 0.0, "num_input_tokens_seen": 208403024, "step": 96635 }, { "epoch": 17.735364287025142, "grad_norm": 23.266939163208008, "learning_rate": 3.855667295549648e-07, "loss": 0.0131, "num_input_tokens_seen": 208412816, "step": 96640 }, { "epoch": 17.736281886584695, "grad_norm": 0.0031878540758043528, "learning_rate": 3.8525843998818257e-07, "loss": 0.0, "num_input_tokens_seen": 208424336, "step": 96645 }, { "epoch": 17.737199486144245, "grad_norm": 0.013946859166026115, "learning_rate": 3.8495026878236805e-07, "loss": 0.0001, "num_input_tokens_seen": 208434640, "step": 96650 }, { "epoch": 17.7381170857038, "grad_norm": 0.0015763207338750362, "learning_rate": 3.846422159454277e-07, "loss": 0.0, "num_input_tokens_seen": 208446416, "step": 96655 }, { "epoch": 17.739034685263352, "grad_norm": 0.0030844174325466156, "learning_rate": 3.8433428148526133e-07, "loss": 0.0, "num_input_tokens_seen": 208455920, "step": 96660 }, { "epoch": 17.739952284822902, "grad_norm": 0.024979861453175545, "learning_rate": 3.84026465409767e-07, "loss": 0.0, "num_input_tokens_seen": 208467024, "step": 96665 }, { "epoch": 17.740869884382455, "grad_norm": 0.009727811440825462, "learning_rate": 3.837187677268389e-07, "loss": 0.0002, "num_input_tokens_seen": 208478512, "step": 96670 }, { "epoch": 17.74178748394201, "grad_norm": 0.0027303339447826147, "learning_rate": 3.834111884443709e-07, "loss": 0.0, "num_input_tokens_seen": 208489232, "step": 96675 }, { "epoch": 17.74270508350156, "grad_norm": 0.015127621591091156, "learning_rate": 3.831037275702504e-07, "loss": 0.0, "num_input_tokens_seen": 208501136, "step": 96680 }, { "epoch": 17.743622683061112, "grad_norm": 0.0008288266253657639, "learning_rate": 3.827963851123634e-07, "loss": 0.0, "num_input_tokens_seen": 208511472, "step": 96685 }, { "epoch": 17.744540282620665, "grad_norm": 0.002962191356346011, "learning_rate": 3.824891610785936e-07, "loss": 0.0, "num_input_tokens_seen": 208522000, "step": 96690 }, { "epoch": 17.745457882180215, "grad_norm": 0.0019909327384084463, "learning_rate": 3.821820554768202e-07, "loss": 0.0, "num_input_tokens_seen": 208532752, "step": 96695 }, { "epoch": 17.74637548173977, "grad_norm": 0.007260518614202738, "learning_rate": 3.8187506831491973e-07, "loss": 0.0, "num_input_tokens_seen": 208543472, "step": 96700 }, { "epoch": 17.747293081299322, "grad_norm": 0.0006170664564706385, "learning_rate": 3.815681996007664e-07, "loss": 0.0, "num_input_tokens_seen": 208554192, "step": 96705 }, { "epoch": 17.74821068085887, "grad_norm": 0.0011533385841175914, "learning_rate": 3.812614493422312e-07, "loss": 0.0, "num_input_tokens_seen": 208563856, "step": 96710 }, { "epoch": 17.749128280418425, "grad_norm": 0.019751232117414474, "learning_rate": 3.809548175471817e-07, "loss": 0.0001, "num_input_tokens_seen": 208574512, "step": 96715 }, { "epoch": 17.75004587997798, "grad_norm": 0.009486228227615356, "learning_rate": 3.8064830422348154e-07, "loss": 0.0, "num_input_tokens_seen": 208585104, "step": 96720 }, { "epoch": 17.75096347953753, "grad_norm": 0.002191985724493861, "learning_rate": 3.803419093789934e-07, "loss": 0.0063, "num_input_tokens_seen": 208595216, "step": 96725 }, { "epoch": 17.751881079097082, "grad_norm": 0.004837655927985907, "learning_rate": 3.800356330215754e-07, "loss": 0.0001, "num_input_tokens_seen": 208605840, "step": 96730 }, { "epoch": 17.752798678656635, "grad_norm": 0.8919597864151001, "learning_rate": 3.7972947515908245e-07, "loss": 0.0132, "num_input_tokens_seen": 208616464, "step": 96735 }, { "epoch": 17.753716278216185, "grad_norm": 0.022972365841269493, "learning_rate": 3.7942343579936867e-07, "loss": 0.0001, "num_input_tokens_seen": 208626704, "step": 96740 }, { "epoch": 17.75463387777574, "grad_norm": 0.0015210092533379793, "learning_rate": 3.7911751495028235e-07, "loss": 0.0, "num_input_tokens_seen": 208637712, "step": 96745 }, { "epoch": 17.755551477335292, "grad_norm": 0.006444994360208511, "learning_rate": 3.788117126196694e-07, "loss": 0.0, "num_input_tokens_seen": 208649360, "step": 96750 }, { "epoch": 17.75646907689484, "grad_norm": 0.0019136605551466346, "learning_rate": 3.785060288153747e-07, "loss": 0.001, "num_input_tokens_seen": 208660944, "step": 96755 }, { "epoch": 17.757386676454395, "grad_norm": 0.0008477607043460011, "learning_rate": 3.782004635452374e-07, "loss": 0.0001, "num_input_tokens_seen": 208671856, "step": 96760 }, { "epoch": 17.75830427601395, "grad_norm": 0.0018860719865188003, "learning_rate": 3.778950168170953e-07, "loss": 0.0, "num_input_tokens_seen": 208682672, "step": 96765 }, { "epoch": 17.7592218755735, "grad_norm": 0.00048754503950476646, "learning_rate": 3.7758968863878144e-07, "loss": 0.0, "num_input_tokens_seen": 208693168, "step": 96770 }, { "epoch": 17.76013947513305, "grad_norm": 0.005820625461637974, "learning_rate": 3.7728447901812904e-07, "loss": 0.0, "num_input_tokens_seen": 208703632, "step": 96775 }, { "epoch": 17.761057074692605, "grad_norm": 0.0027582915499806404, "learning_rate": 3.7697938796296516e-07, "loss": 0.0, "num_input_tokens_seen": 208714736, "step": 96780 }, { "epoch": 17.761974674252155, "grad_norm": 0.001205416745506227, "learning_rate": 3.7667441548111363e-07, "loss": 0.0, "num_input_tokens_seen": 208725168, "step": 96785 }, { "epoch": 17.76289227381171, "grad_norm": 0.01606065034866333, "learning_rate": 3.763695615803992e-07, "loss": 0.0, "num_input_tokens_seen": 208735440, "step": 96790 }, { "epoch": 17.76380987337126, "grad_norm": 0.0010915757156908512, "learning_rate": 3.760648262686395e-07, "loss": 0.0001, "num_input_tokens_seen": 208746032, "step": 96795 }, { "epoch": 17.76472747293081, "grad_norm": 0.0011514045763760805, "learning_rate": 3.7576020955364947e-07, "loss": 0.0, "num_input_tokens_seen": 208756304, "step": 96800 }, { "epoch": 17.765645072490365, "grad_norm": 0.00038234874955378473, "learning_rate": 3.754557114432439e-07, "loss": 0.0016, "num_input_tokens_seen": 208765360, "step": 96805 }, { "epoch": 17.76656267204992, "grad_norm": 0.009323938749730587, "learning_rate": 3.751513319452321e-07, "loss": 0.0, "num_input_tokens_seen": 208774832, "step": 96810 }, { "epoch": 17.767480271609468, "grad_norm": 0.0038836125750094652, "learning_rate": 3.748470710674207e-07, "loss": 0.0, "num_input_tokens_seen": 208785168, "step": 96815 }, { "epoch": 17.76839787116902, "grad_norm": 0.017170647159218788, "learning_rate": 3.745429288176139e-07, "loss": 0.0001, "num_input_tokens_seen": 208796656, "step": 96820 }, { "epoch": 17.769315470728575, "grad_norm": 0.000648653891403228, "learning_rate": 3.7423890520361104e-07, "loss": 0.0003, "num_input_tokens_seen": 208807024, "step": 96825 }, { "epoch": 17.770233070288125, "grad_norm": 0.0018127465154975653, "learning_rate": 3.7393500023321204e-07, "loss": 0.001, "num_input_tokens_seen": 208816592, "step": 96830 }, { "epoch": 17.771150669847678, "grad_norm": 0.015439560636878014, "learning_rate": 3.7363121391421007e-07, "loss": 0.0005, "num_input_tokens_seen": 208826032, "step": 96835 }, { "epoch": 17.77206826940723, "grad_norm": 2.1683077812194824, "learning_rate": 3.7332754625439784e-07, "loss": 0.0002, "num_input_tokens_seen": 208837008, "step": 96840 }, { "epoch": 17.77298586896678, "grad_norm": 0.0010016944725066423, "learning_rate": 3.730239972615629e-07, "loss": 0.0, "num_input_tokens_seen": 208849168, "step": 96845 }, { "epoch": 17.773903468526335, "grad_norm": 0.8355688452720642, "learning_rate": 3.727205669434908e-07, "loss": 0.0001, "num_input_tokens_seen": 208860080, "step": 96850 }, { "epoch": 17.77482106808589, "grad_norm": 0.0013705219607800245, "learning_rate": 3.7241725530796524e-07, "loss": 0.0, "num_input_tokens_seen": 208871600, "step": 96855 }, { "epoch": 17.775738667645438, "grad_norm": 0.0005557620897889137, "learning_rate": 3.7211406236276503e-07, "loss": 0.0, "num_input_tokens_seen": 208881200, "step": 96860 }, { "epoch": 17.77665626720499, "grad_norm": 0.00667659193277359, "learning_rate": 3.718109881156662e-07, "loss": 0.0, "num_input_tokens_seen": 208892464, "step": 96865 }, { "epoch": 17.777573866764545, "grad_norm": 0.0032492391765117645, "learning_rate": 3.715080325744419e-07, "loss": 0.0, "num_input_tokens_seen": 208903280, "step": 96870 }, { "epoch": 17.778491466324095, "grad_norm": 0.0012751863105222583, "learning_rate": 3.7120519574686377e-07, "loss": 0.0051, "num_input_tokens_seen": 208915088, "step": 96875 }, { "epoch": 17.779409065883648, "grad_norm": 0.0022418727166950703, "learning_rate": 3.709024776406983e-07, "loss": 0.0001, "num_input_tokens_seen": 208926224, "step": 96880 }, { "epoch": 17.7803266654432, "grad_norm": 0.001054746680893004, "learning_rate": 3.7059987826370934e-07, "loss": 0.0, "num_input_tokens_seen": 208935728, "step": 96885 }, { "epoch": 17.78124426500275, "grad_norm": 0.011279482394456863, "learning_rate": 3.7029739762365904e-07, "loss": 0.0, "num_input_tokens_seen": 208946448, "step": 96890 }, { "epoch": 17.782161864562305, "grad_norm": 0.02267606370151043, "learning_rate": 3.6999503572830555e-07, "loss": 0.0, "num_input_tokens_seen": 208957552, "step": 96895 }, { "epoch": 17.783079464121858, "grad_norm": 0.0004980983794666827, "learning_rate": 3.6969279258540325e-07, "loss": 0.0, "num_input_tokens_seen": 208968496, "step": 96900 }, { "epoch": 17.783997063681408, "grad_norm": 0.002658986020833254, "learning_rate": 3.6939066820270376e-07, "loss": 0.0002, "num_input_tokens_seen": 208978480, "step": 96905 }, { "epoch": 17.78491466324096, "grad_norm": 0.0003946924116462469, "learning_rate": 3.690886625879575e-07, "loss": 0.0001, "num_input_tokens_seen": 208988400, "step": 96910 }, { "epoch": 17.785832262800515, "grad_norm": 0.0015151889529079199, "learning_rate": 3.6878677574890996e-07, "loss": 0.004, "num_input_tokens_seen": 209000048, "step": 96915 }, { "epoch": 17.786749862360065, "grad_norm": 0.0015089692315086722, "learning_rate": 3.6848500769330275e-07, "loss": 0.002, "num_input_tokens_seen": 209010480, "step": 96920 }, { "epoch": 17.787667461919618, "grad_norm": 0.02095417119562626, "learning_rate": 3.68183358428878e-07, "loss": 0.0, "num_input_tokens_seen": 209021136, "step": 96925 }, { "epoch": 17.78858506147917, "grad_norm": 0.00045466399751603603, "learning_rate": 3.678818279633717e-07, "loss": 0.0, "num_input_tokens_seen": 209033520, "step": 96930 }, { "epoch": 17.78950266103872, "grad_norm": 0.2554513216018677, "learning_rate": 3.67580416304516e-07, "loss": 0.0, "num_input_tokens_seen": 209044560, "step": 96935 }, { "epoch": 17.790420260598275, "grad_norm": 0.0008957334794104099, "learning_rate": 3.672791234600442e-07, "loss": 0.0, "num_input_tokens_seen": 209055952, "step": 96940 }, { "epoch": 17.791337860157828, "grad_norm": 1.0652587413787842, "learning_rate": 3.6697794943768293e-07, "loss": 0.0002, "num_input_tokens_seen": 209065936, "step": 96945 }, { "epoch": 17.792255459717378, "grad_norm": 0.010608877055346966, "learning_rate": 3.666768942451565e-07, "loss": 0.0, "num_input_tokens_seen": 209075664, "step": 96950 }, { "epoch": 17.79317305927693, "grad_norm": 0.0048574344255030155, "learning_rate": 3.6637595789018597e-07, "loss": 0.0, "num_input_tokens_seen": 209086064, "step": 96955 }, { "epoch": 17.794090658836485, "grad_norm": 0.0020568245090544224, "learning_rate": 3.660751403804913e-07, "loss": 0.0, "num_input_tokens_seen": 209097104, "step": 96960 }, { "epoch": 17.795008258396035, "grad_norm": 0.0030218258034437895, "learning_rate": 3.657744417237874e-07, "loss": 0.0, "num_input_tokens_seen": 209107856, "step": 96965 }, { "epoch": 17.795925857955588, "grad_norm": 0.0038684403989464045, "learning_rate": 3.6547386192778587e-07, "loss": 0.0, "num_input_tokens_seen": 209118352, "step": 96970 }, { "epoch": 17.79684345751514, "grad_norm": 0.001485778484493494, "learning_rate": 3.651734010001978e-07, "loss": 0.0, "num_input_tokens_seen": 209130032, "step": 96975 }, { "epoch": 17.79776105707469, "grad_norm": 0.0564056932926178, "learning_rate": 3.648730589487287e-07, "loss": 0.0, "num_input_tokens_seen": 209140560, "step": 96980 }, { "epoch": 17.798678656634245, "grad_norm": 0.09649606049060822, "learning_rate": 3.645728357810818e-07, "loss": 0.1907, "num_input_tokens_seen": 209150320, "step": 96985 }, { "epoch": 17.799596256193798, "grad_norm": 0.0010652098571881652, "learning_rate": 3.6427273150495655e-07, "loss": 0.0, "num_input_tokens_seen": 209160976, "step": 96990 }, { "epoch": 17.800513855753348, "grad_norm": 0.000541045272257179, "learning_rate": 3.639727461280518e-07, "loss": 0.0, "num_input_tokens_seen": 209171856, "step": 96995 }, { "epoch": 17.8014314553129, "grad_norm": 0.0023805834352970123, "learning_rate": 3.636728796580613e-07, "loss": 0.0, "num_input_tokens_seen": 209182992, "step": 97000 }, { "epoch": 17.802349054872455, "grad_norm": 0.001284866244532168, "learning_rate": 3.6337313210267456e-07, "loss": 0.0001, "num_input_tokens_seen": 209194544, "step": 97005 }, { "epoch": 17.803266654432004, "grad_norm": 0.03943607583642006, "learning_rate": 3.63073503469582e-07, "loss": 0.0, "num_input_tokens_seen": 209204880, "step": 97010 }, { "epoch": 17.804184253991558, "grad_norm": 0.001281015807762742, "learning_rate": 3.6277399376646703e-07, "loss": 0.0, "num_input_tokens_seen": 209216144, "step": 97015 }, { "epoch": 17.80510185355111, "grad_norm": 0.004458367358893156, "learning_rate": 3.624746030010112e-07, "loss": 0.0, "num_input_tokens_seen": 209226544, "step": 97020 }, { "epoch": 17.80601945311066, "grad_norm": 0.0011567143956199288, "learning_rate": 3.6217533118089567e-07, "loss": 0.0, "num_input_tokens_seen": 209238096, "step": 97025 }, { "epoch": 17.806937052670214, "grad_norm": 0.015663983300328255, "learning_rate": 3.6187617831379475e-07, "loss": 0.0, "num_input_tokens_seen": 209248240, "step": 97030 }, { "epoch": 17.807854652229768, "grad_norm": 0.009729614481329918, "learning_rate": 3.6157714440738124e-07, "loss": 0.0, "num_input_tokens_seen": 209257616, "step": 97035 }, { "epoch": 17.808772251789318, "grad_norm": 0.020738843828439713, "learning_rate": 3.612782294693251e-07, "loss": 0.0, "num_input_tokens_seen": 209266096, "step": 97040 }, { "epoch": 17.80968985134887, "grad_norm": 0.009499269537627697, "learning_rate": 3.609794335072925e-07, "loss": 0.0, "num_input_tokens_seen": 209276400, "step": 97045 }, { "epoch": 17.810607450908424, "grad_norm": 0.015561972744762897, "learning_rate": 3.6068075652894774e-07, "loss": 0.0, "num_input_tokens_seen": 209286480, "step": 97050 }, { "epoch": 17.811525050467974, "grad_norm": 0.002660853788256645, "learning_rate": 3.60382198541952e-07, "loss": 0.0, "num_input_tokens_seen": 209296400, "step": 97055 }, { "epoch": 17.812442650027528, "grad_norm": 0.026730936020612717, "learning_rate": 3.600837595539619e-07, "loss": 0.0, "num_input_tokens_seen": 209305712, "step": 97060 }, { "epoch": 17.81336024958708, "grad_norm": 0.0020430537406355143, "learning_rate": 3.597854395726319e-07, "loss": 0.0376, "num_input_tokens_seen": 209317072, "step": 97065 }, { "epoch": 17.81427784914663, "grad_norm": 0.021597446873784065, "learning_rate": 3.594872386056131e-07, "loss": 0.0011, "num_input_tokens_seen": 209329104, "step": 97070 }, { "epoch": 17.815195448706184, "grad_norm": 0.0023365411907434464, "learning_rate": 3.5918915666055487e-07, "loss": 0.0, "num_input_tokens_seen": 209340304, "step": 97075 }, { "epoch": 17.816113048265738, "grad_norm": 0.0054372744634747505, "learning_rate": 3.5889119374510285e-07, "loss": 0.0, "num_input_tokens_seen": 209352528, "step": 97080 }, { "epoch": 17.817030647825288, "grad_norm": 0.0005183866596780717, "learning_rate": 3.585933498668981e-07, "loss": 0.0001, "num_input_tokens_seen": 209363696, "step": 97085 }, { "epoch": 17.81794824738484, "grad_norm": 0.007775448728352785, "learning_rate": 3.582956250335801e-07, "loss": 0.0, "num_input_tokens_seen": 209375344, "step": 97090 }, { "epoch": 17.818865846944394, "grad_norm": 43.28266525268555, "learning_rate": 3.5799801925278546e-07, "loss": 0.1994, "num_input_tokens_seen": 209386640, "step": 97095 }, { "epoch": 17.819783446503944, "grad_norm": 0.002286844188347459, "learning_rate": 3.5770053253214755e-07, "loss": 0.0, "num_input_tokens_seen": 209396976, "step": 97100 }, { "epoch": 17.820701046063498, "grad_norm": 0.0006549006793648005, "learning_rate": 3.5740316487929527e-07, "loss": 0.0, "num_input_tokens_seen": 209407568, "step": 97105 }, { "epoch": 17.82161864562305, "grad_norm": 0.014526784420013428, "learning_rate": 3.571059163018575e-07, "loss": 0.0, "num_input_tokens_seen": 209419408, "step": 97110 }, { "epoch": 17.8225362451826, "grad_norm": 0.000581288302782923, "learning_rate": 3.5680878680745657e-07, "loss": 0.0, "num_input_tokens_seen": 209429776, "step": 97115 }, { "epoch": 17.823453844742154, "grad_norm": 0.00041570229222998023, "learning_rate": 3.565117764037146e-07, "loss": 0.0, "num_input_tokens_seen": 209440400, "step": 97120 }, { "epoch": 17.824371444301708, "grad_norm": 0.00902053713798523, "learning_rate": 3.5621488509824775e-07, "loss": 0.0, "num_input_tokens_seen": 209451280, "step": 97125 }, { "epoch": 17.825289043861257, "grad_norm": 0.0007373050320893526, "learning_rate": 3.5591811289867274e-07, "loss": 0.0822, "num_input_tokens_seen": 209462512, "step": 97130 }, { "epoch": 17.82620664342081, "grad_norm": 0.00801867712289095, "learning_rate": 3.556214598126001e-07, "loss": 0.0001, "num_input_tokens_seen": 209473680, "step": 97135 }, { "epoch": 17.827124242980364, "grad_norm": 0.003941255621612072, "learning_rate": 3.553249258476382e-07, "loss": 0.0002, "num_input_tokens_seen": 209485200, "step": 97140 }, { "epoch": 17.828041842539914, "grad_norm": 0.0008622638997621834, "learning_rate": 3.5502851101139436e-07, "loss": 0.0001, "num_input_tokens_seen": 209495280, "step": 97145 }, { "epoch": 17.828959442099467, "grad_norm": 0.0011673702392727137, "learning_rate": 3.5473221531147015e-07, "loss": 0.0, "num_input_tokens_seen": 209507152, "step": 97150 }, { "epoch": 17.82987704165902, "grad_norm": 0.019047057256102562, "learning_rate": 3.5443603875546404e-07, "loss": 0.0, "num_input_tokens_seen": 209517424, "step": 97155 }, { "epoch": 17.83079464121857, "grad_norm": 0.0002554699603933841, "learning_rate": 3.5413998135097493e-07, "loss": 0.0, "num_input_tokens_seen": 209527440, "step": 97160 }, { "epoch": 17.831712240778124, "grad_norm": 0.00830992590636015, "learning_rate": 3.538440431055945e-07, "loss": 0.0, "num_input_tokens_seen": 209537424, "step": 97165 }, { "epoch": 17.832629840337678, "grad_norm": 0.00047654626541770995, "learning_rate": 3.5354822402691336e-07, "loss": 0.0, "num_input_tokens_seen": 209548240, "step": 97170 }, { "epoch": 17.833547439897227, "grad_norm": 0.0028141809161752462, "learning_rate": 3.532525241225182e-07, "loss": 0.0, "num_input_tokens_seen": 209559440, "step": 97175 }, { "epoch": 17.83446503945678, "grad_norm": 0.2575829327106476, "learning_rate": 3.5295694339999467e-07, "loss": 0.0001, "num_input_tokens_seen": 209570736, "step": 97180 }, { "epoch": 17.835382639016334, "grad_norm": 0.015736756846308708, "learning_rate": 3.526614818669233e-07, "loss": 0.0, "num_input_tokens_seen": 209582128, "step": 97185 }, { "epoch": 17.836300238575884, "grad_norm": 0.004210598301142454, "learning_rate": 3.5236613953088197e-07, "loss": 0.0, "num_input_tokens_seen": 209593104, "step": 97190 }, { "epoch": 17.837217838135437, "grad_norm": 0.030035831034183502, "learning_rate": 3.520709163994462e-07, "loss": 0.0, "num_input_tokens_seen": 209603824, "step": 97195 }, { "epoch": 17.83813543769499, "grad_norm": 0.06497413665056229, "learning_rate": 3.517758124801879e-07, "loss": 0.0002, "num_input_tokens_seen": 209614352, "step": 97200 }, { "epoch": 17.83905303725454, "grad_norm": 0.0014420701190829277, "learning_rate": 3.514808277806753e-07, "loss": 0.0012, "num_input_tokens_seen": 209625200, "step": 97205 }, { "epoch": 17.839970636814094, "grad_norm": 0.0010909150587394834, "learning_rate": 3.5118596230847513e-07, "loss": 0.0, "num_input_tokens_seen": 209635440, "step": 97210 }, { "epoch": 17.840888236373647, "grad_norm": 0.009604073129594326, "learning_rate": 3.508912160711508e-07, "loss": 0.0, "num_input_tokens_seen": 209646768, "step": 97215 }, { "epoch": 17.841805835933197, "grad_norm": 0.010249233804643154, "learning_rate": 3.505965890762608e-07, "loss": 0.0, "num_input_tokens_seen": 209657648, "step": 97220 }, { "epoch": 17.84272343549275, "grad_norm": 0.00045709218829870224, "learning_rate": 3.5030208133136176e-07, "loss": 0.0001, "num_input_tokens_seen": 209668304, "step": 97225 }, { "epoch": 17.843641035052304, "grad_norm": 0.04004031419754028, "learning_rate": 3.500076928440088e-07, "loss": 0.0001, "num_input_tokens_seen": 209679984, "step": 97230 }, { "epoch": 17.844558634611854, "grad_norm": 0.001795963034965098, "learning_rate": 3.497134236217514e-07, "loss": 0.0, "num_input_tokens_seen": 209691056, "step": 97235 }, { "epoch": 17.845476234171407, "grad_norm": 0.0037823806051164865, "learning_rate": 3.494192736721369e-07, "loss": 0.0002, "num_input_tokens_seen": 209700592, "step": 97240 }, { "epoch": 17.84639383373096, "grad_norm": 89.02852630615234, "learning_rate": 3.491252430027109e-07, "loss": 0.1657, "num_input_tokens_seen": 209710352, "step": 97245 }, { "epoch": 17.84731143329051, "grad_norm": 0.0979842022061348, "learning_rate": 3.488313316210146e-07, "loss": 0.0003, "num_input_tokens_seen": 209722128, "step": 97250 }, { "epoch": 17.848229032850064, "grad_norm": 0.0004537039785645902, "learning_rate": 3.485375395345858e-07, "loss": 0.0, "num_input_tokens_seen": 209732752, "step": 97255 }, { "epoch": 17.849146632409617, "grad_norm": 0.010936297476291656, "learning_rate": 3.4824386675095966e-07, "loss": 0.0, "num_input_tokens_seen": 209743344, "step": 97260 }, { "epoch": 17.850064231969167, "grad_norm": 0.01748962514102459, "learning_rate": 3.4795031327766906e-07, "loss": 0.0, "num_input_tokens_seen": 209754032, "step": 97265 }, { "epoch": 17.85098183152872, "grad_norm": 0.0009329367894679308, "learning_rate": 3.4765687912224177e-07, "loss": 0.0478, "num_input_tokens_seen": 209765200, "step": 97270 }, { "epoch": 17.851899431088274, "grad_norm": 0.0011381752556189895, "learning_rate": 3.473635642922063e-07, "loss": 0.0, "num_input_tokens_seen": 209776400, "step": 97275 }, { "epoch": 17.852817030647824, "grad_norm": 0.0015098198782652617, "learning_rate": 3.4707036879508437e-07, "loss": 0.0, "num_input_tokens_seen": 209787184, "step": 97280 }, { "epoch": 17.853734630207377, "grad_norm": 0.004552244208753109, "learning_rate": 3.467772926383961e-07, "loss": 0.0, "num_input_tokens_seen": 209797008, "step": 97285 }, { "epoch": 17.85465222976693, "grad_norm": 140.2563934326172, "learning_rate": 3.4648433582965767e-07, "loss": 0.011, "num_input_tokens_seen": 209808368, "step": 97290 }, { "epoch": 17.85556982932648, "grad_norm": 0.03195090964436531, "learning_rate": 3.461914983763842e-07, "loss": 0.0, "num_input_tokens_seen": 209819376, "step": 97295 }, { "epoch": 17.856487428886034, "grad_norm": 0.003915483132004738, "learning_rate": 3.4589878028608635e-07, "loss": 0.0426, "num_input_tokens_seen": 209830256, "step": 97300 }, { "epoch": 17.857405028445587, "grad_norm": 0.002637972356751561, "learning_rate": 3.4560618156627144e-07, "loss": 0.0, "num_input_tokens_seen": 209840432, "step": 97305 }, { "epoch": 17.858322628005137, "grad_norm": 0.008794572204351425, "learning_rate": 3.453137022244435e-07, "loss": 0.0001, "num_input_tokens_seen": 209852016, "step": 97310 }, { "epoch": 17.85924022756469, "grad_norm": 0.00092637276975438, "learning_rate": 3.450213422681059e-07, "loss": 0.0, "num_input_tokens_seen": 209863856, "step": 97315 }, { "epoch": 17.860157827124244, "grad_norm": 0.001369811245240271, "learning_rate": 3.4472910170475604e-07, "loss": 0.0, "num_input_tokens_seen": 209873904, "step": 97320 }, { "epoch": 17.861075426683794, "grad_norm": 0.02401636354625225, "learning_rate": 3.444369805418896e-07, "loss": 0.0, "num_input_tokens_seen": 209884496, "step": 97325 }, { "epoch": 17.861993026243347, "grad_norm": 0.020536888390779495, "learning_rate": 3.441449787869994e-07, "loss": 0.0, "num_input_tokens_seen": 209895024, "step": 97330 }, { "epoch": 17.8629106258029, "grad_norm": 0.003897816641256213, "learning_rate": 3.438530964475745e-07, "loss": 0.0, "num_input_tokens_seen": 209905936, "step": 97335 }, { "epoch": 17.86382822536245, "grad_norm": 0.0005157154519110918, "learning_rate": 3.4356133353110057e-07, "loss": 0.0, "num_input_tokens_seen": 209917040, "step": 97340 }, { "epoch": 17.864745824922004, "grad_norm": 0.0013312376104295254, "learning_rate": 3.432696900450627e-07, "loss": 0.0284, "num_input_tokens_seen": 209928304, "step": 97345 }, { "epoch": 17.865663424481557, "grad_norm": 0.0037558507174253464, "learning_rate": 3.4297816599693944e-07, "loss": 0.0001, "num_input_tokens_seen": 209939472, "step": 97350 }, { "epoch": 17.866581024041107, "grad_norm": 0.0012949910014867783, "learning_rate": 3.426867613942092e-07, "loss": 0.0, "num_input_tokens_seen": 209950704, "step": 97355 }, { "epoch": 17.86749862360066, "grad_norm": 0.0009189906413666904, "learning_rate": 3.4239547624434375e-07, "loss": 0.0051, "num_input_tokens_seen": 209961936, "step": 97360 }, { "epoch": 17.868416223160214, "grad_norm": 0.0062719108536839485, "learning_rate": 3.421043105548172e-07, "loss": 0.0, "num_input_tokens_seen": 209973168, "step": 97365 }, { "epoch": 17.869333822719764, "grad_norm": 0.00716853141784668, "learning_rate": 3.418132643330957e-07, "loss": 0.001, "num_input_tokens_seen": 209982384, "step": 97370 }, { "epoch": 17.870251422279317, "grad_norm": 0.001292941626161337, "learning_rate": 3.4152233758664386e-07, "loss": 0.0, "num_input_tokens_seen": 209993872, "step": 97375 }, { "epoch": 17.87116902183887, "grad_norm": 0.0008823396638035774, "learning_rate": 3.412315303229247e-07, "loss": 0.0001, "num_input_tokens_seen": 210004688, "step": 97380 }, { "epoch": 17.87208662139842, "grad_norm": 0.205950066447258, "learning_rate": 3.4094084254939596e-07, "loss": 0.0, "num_input_tokens_seen": 210015856, "step": 97385 }, { "epoch": 17.873004220957974, "grad_norm": 0.002451239852234721, "learning_rate": 3.406502742735135e-07, "loss": 0.0002, "num_input_tokens_seen": 210027312, "step": 97390 }, { "epoch": 17.873921820517527, "grad_norm": 0.013852853327989578, "learning_rate": 3.4035982550273074e-07, "loss": 0.0, "num_input_tokens_seen": 210037936, "step": 97395 }, { "epoch": 17.874839420077077, "grad_norm": 0.11394085735082626, "learning_rate": 3.400694962444967e-07, "loss": 0.0001, "num_input_tokens_seen": 210048688, "step": 97400 }, { "epoch": 17.87575701963663, "grad_norm": 0.006490771193057299, "learning_rate": 3.3977928650625766e-07, "loss": 0.0001, "num_input_tokens_seen": 210059952, "step": 97405 }, { "epoch": 17.876674619196184, "grad_norm": 0.0005520940176211298, "learning_rate": 3.394891962954566e-07, "loss": 0.0005, "num_input_tokens_seen": 210070416, "step": 97410 }, { "epoch": 17.877592218755733, "grad_norm": 0.3351230025291443, "learning_rate": 3.391992256195353e-07, "loss": 0.0001, "num_input_tokens_seen": 210080240, "step": 97415 }, { "epoch": 17.878509818315287, "grad_norm": 0.0168177280575037, "learning_rate": 3.3890937448593064e-07, "loss": 0.0, "num_input_tokens_seen": 210090864, "step": 97420 }, { "epoch": 17.87942741787484, "grad_norm": 0.000752436404582113, "learning_rate": 3.386196429020749e-07, "loss": 0.0, "num_input_tokens_seen": 210102576, "step": 97425 }, { "epoch": 17.88034501743439, "grad_norm": 0.0036778924986720085, "learning_rate": 3.383300308754023e-07, "loss": 0.0, "num_input_tokens_seen": 210113584, "step": 97430 }, { "epoch": 17.881262616993943, "grad_norm": 0.0012336736544966698, "learning_rate": 3.380405384133395e-07, "loss": 0.0002, "num_input_tokens_seen": 210124912, "step": 97435 }, { "epoch": 17.882180216553497, "grad_norm": 0.02778763510286808, "learning_rate": 3.377511655233112e-07, "loss": 0.0002, "num_input_tokens_seen": 210135248, "step": 97440 }, { "epoch": 17.883097816113047, "grad_norm": 0.07855354994535446, "learning_rate": 3.3746191221273874e-07, "loss": 0.0001, "num_input_tokens_seen": 210146320, "step": 97445 }, { "epoch": 17.8840154156726, "grad_norm": 0.010946663096547127, "learning_rate": 3.3717277848904327e-07, "loss": 0.0, "num_input_tokens_seen": 210157776, "step": 97450 }, { "epoch": 17.884933015232154, "grad_norm": 0.007207099348306656, "learning_rate": 3.36883764359639e-07, "loss": 0.0, "num_input_tokens_seen": 210168464, "step": 97455 }, { "epoch": 17.885850614791703, "grad_norm": 0.003642900614067912, "learning_rate": 3.3659486983193877e-07, "loss": 0.0, "num_input_tokens_seen": 210179824, "step": 97460 }, { "epoch": 17.886768214351257, "grad_norm": 0.00351080484688282, "learning_rate": 3.3630609491335175e-07, "loss": 0.0004, "num_input_tokens_seen": 210190192, "step": 97465 }, { "epoch": 17.88768581391081, "grad_norm": 0.05969512090086937, "learning_rate": 3.360174396112864e-07, "loss": 0.0, "num_input_tokens_seen": 210201104, "step": 97470 }, { "epoch": 17.88860341347036, "grad_norm": 0.0014997324906289577, "learning_rate": 3.3572890393314517e-07, "loss": 0.0, "num_input_tokens_seen": 210212752, "step": 97475 }, { "epoch": 17.889521013029913, "grad_norm": 0.0021223346702754498, "learning_rate": 3.354404878863288e-07, "loss": 0.0, "num_input_tokens_seen": 210223856, "step": 97480 }, { "epoch": 17.890438612589467, "grad_norm": 0.013993867672979832, "learning_rate": 3.351521914782341e-07, "loss": 0.0, "num_input_tokens_seen": 210235664, "step": 97485 }, { "epoch": 17.891356212149017, "grad_norm": 0.002397489733994007, "learning_rate": 3.3486401471625516e-07, "loss": 0.0, "num_input_tokens_seen": 210247280, "step": 97490 }, { "epoch": 17.89227381170857, "grad_norm": 0.2562287747859955, "learning_rate": 3.345759576077845e-07, "loss": 0.0, "num_input_tokens_seen": 210257104, "step": 97495 }, { "epoch": 17.893191411268123, "grad_norm": 0.006392518524080515, "learning_rate": 3.3428802016021e-07, "loss": 0.0025, "num_input_tokens_seen": 210266480, "step": 97500 }, { "epoch": 17.894109010827673, "grad_norm": 0.054467231035232544, "learning_rate": 3.340002023809169e-07, "loss": 0.0001, "num_input_tokens_seen": 210277968, "step": 97505 }, { "epoch": 17.895026610387227, "grad_norm": 0.0024642767384648323, "learning_rate": 3.337125042772854e-07, "loss": 0.0, "num_input_tokens_seen": 210288656, "step": 97510 }, { "epoch": 17.89594420994678, "grad_norm": 0.0005318488110788167, "learning_rate": 3.334249258566974e-07, "loss": 0.0144, "num_input_tokens_seen": 210299504, "step": 97515 }, { "epoch": 17.89686180950633, "grad_norm": 0.02686859667301178, "learning_rate": 3.331374671265275e-07, "loss": 0.0001, "num_input_tokens_seen": 210311056, "step": 97520 }, { "epoch": 17.897779409065883, "grad_norm": 0.006837198045104742, "learning_rate": 3.328501280941476e-07, "loss": 0.0119, "num_input_tokens_seen": 210322064, "step": 97525 }, { "epoch": 17.898697008625437, "grad_norm": 0.0021597235463559628, "learning_rate": 3.3256290876692965e-07, "loss": 0.0004, "num_input_tokens_seen": 210332368, "step": 97530 }, { "epoch": 17.899614608184987, "grad_norm": 0.0005362852825783193, "learning_rate": 3.3227580915223877e-07, "loss": 0.1626, "num_input_tokens_seen": 210342544, "step": 97535 }, { "epoch": 17.90053220774454, "grad_norm": 0.0009872298687696457, "learning_rate": 3.319888292574397e-07, "loss": 0.0, "num_input_tokens_seen": 210353200, "step": 97540 }, { "epoch": 17.901449807304093, "grad_norm": 0.003792103147134185, "learning_rate": 3.3170196908989093e-07, "loss": 0.0006, "num_input_tokens_seen": 210361968, "step": 97545 }, { "epoch": 17.902367406863643, "grad_norm": 0.00036669487599283457, "learning_rate": 3.3141522865695276e-07, "loss": 0.0001, "num_input_tokens_seen": 210373200, "step": 97550 }, { "epoch": 17.903285006423197, "grad_norm": 0.0028055293951183558, "learning_rate": 3.3112860796597813e-07, "loss": 0.0051, "num_input_tokens_seen": 210384208, "step": 97555 }, { "epoch": 17.90420260598275, "grad_norm": 0.0020065291319042444, "learning_rate": 3.308421070243173e-07, "loss": 0.0, "num_input_tokens_seen": 210394704, "step": 97560 }, { "epoch": 17.9051202055423, "grad_norm": 0.038018591701984406, "learning_rate": 3.3055572583932163e-07, "loss": 0.0, "num_input_tokens_seen": 210405136, "step": 97565 }, { "epoch": 17.906037805101853, "grad_norm": 0.037316229194402695, "learning_rate": 3.302694644183341e-07, "loss": 0.0, "num_input_tokens_seen": 210414480, "step": 97570 }, { "epoch": 17.906955404661407, "grad_norm": 0.0017313924618065357, "learning_rate": 3.2998332276869714e-07, "loss": 0.0, "num_input_tokens_seen": 210423472, "step": 97575 }, { "epoch": 17.907873004220956, "grad_norm": 0.0017391646979376674, "learning_rate": 3.296973008977494e-07, "loss": 0.0, "num_input_tokens_seen": 210434480, "step": 97580 }, { "epoch": 17.90879060378051, "grad_norm": 0.0006204760284163058, "learning_rate": 3.2941139881282893e-07, "loss": 0.0883, "num_input_tokens_seen": 210445616, "step": 97585 }, { "epoch": 17.909708203340063, "grad_norm": 0.004951986018568277, "learning_rate": 3.291256165212664e-07, "loss": 0.0001, "num_input_tokens_seen": 210455536, "step": 97590 }, { "epoch": 17.910625802899613, "grad_norm": 3.9783082008361816, "learning_rate": 3.288399540303927e-07, "loss": 0.0006, "num_input_tokens_seen": 210466544, "step": 97595 }, { "epoch": 17.911543402459166, "grad_norm": 0.029339171946048737, "learning_rate": 3.2855441134753473e-07, "loss": 0.0, "num_input_tokens_seen": 210478800, "step": 97600 }, { "epoch": 17.91246100201872, "grad_norm": 95.15288543701172, "learning_rate": 3.2826898848001664e-07, "loss": 0.1128, "num_input_tokens_seen": 210489168, "step": 97605 }, { "epoch": 17.91337860157827, "grad_norm": 0.0012302675750106573, "learning_rate": 3.27983685435157e-07, "loss": 0.0003, "num_input_tokens_seen": 210500528, "step": 97610 }, { "epoch": 17.914296201137823, "grad_norm": 0.00042975819087587297, "learning_rate": 3.2769850222027613e-07, "loss": 0.0005, "num_input_tokens_seen": 210510704, "step": 97615 }, { "epoch": 17.915213800697376, "grad_norm": 0.29837527871131897, "learning_rate": 3.2741343884268695e-07, "loss": 0.0001, "num_input_tokens_seen": 210520720, "step": 97620 }, { "epoch": 17.916131400256926, "grad_norm": 0.00460836524143815, "learning_rate": 3.271284953097015e-07, "loss": 0.0, "num_input_tokens_seen": 210531792, "step": 97625 }, { "epoch": 17.91704899981648, "grad_norm": 0.4983326494693756, "learning_rate": 3.268436716286266e-07, "loss": 0.0, "num_input_tokens_seen": 210542320, "step": 97630 }, { "epoch": 17.917966599376033, "grad_norm": 0.04962621256709099, "learning_rate": 3.2655896780676986e-07, "loss": 0.0, "num_input_tokens_seen": 210554000, "step": 97635 }, { "epoch": 17.918884198935583, "grad_norm": 0.0016866520745679736, "learning_rate": 3.2627438385143264e-07, "loss": 0.0144, "num_input_tokens_seen": 210564656, "step": 97640 }, { "epoch": 17.919801798495136, "grad_norm": 0.0010976452613249421, "learning_rate": 3.259899197699129e-07, "loss": 0.0, "num_input_tokens_seen": 210573840, "step": 97645 }, { "epoch": 17.92071939805469, "grad_norm": 0.0026643695309758186, "learning_rate": 3.257055755695082e-07, "loss": 0.0, "num_input_tokens_seen": 210583344, "step": 97650 }, { "epoch": 17.92163699761424, "grad_norm": 0.004329211078584194, "learning_rate": 3.254213512575111e-07, "loss": 0.0, "num_input_tokens_seen": 210594576, "step": 97655 }, { "epoch": 17.922554597173793, "grad_norm": 0.0017034957418218255, "learning_rate": 3.2513724684121063e-07, "loss": 0.0, "num_input_tokens_seen": 210604528, "step": 97660 }, { "epoch": 17.923472196733346, "grad_norm": 0.00032250682124868035, "learning_rate": 3.248532623278955e-07, "loss": 0.0, "num_input_tokens_seen": 210615312, "step": 97665 }, { "epoch": 17.924389796292896, "grad_norm": 0.0020236880518496037, "learning_rate": 3.2456939772484816e-07, "loss": 0.0, "num_input_tokens_seen": 210626960, "step": 97670 }, { "epoch": 17.92530739585245, "grad_norm": 0.01014088187366724, "learning_rate": 3.2428565303934956e-07, "loss": 0.0, "num_input_tokens_seen": 210638256, "step": 97675 }, { "epoch": 17.926224995412003, "grad_norm": 0.0015104329213500023, "learning_rate": 3.240020282786771e-07, "loss": 0.0, "num_input_tokens_seen": 210648176, "step": 97680 }, { "epoch": 17.927142594971553, "grad_norm": 0.003943383693695068, "learning_rate": 3.2371852345010445e-07, "loss": 0.0016, "num_input_tokens_seen": 210660016, "step": 97685 }, { "epoch": 17.928060194531106, "grad_norm": 0.0017022439278662205, "learning_rate": 3.2343513856090525e-07, "loss": 0.0, "num_input_tokens_seen": 210671056, "step": 97690 }, { "epoch": 17.92897779409066, "grad_norm": 0.13639503717422485, "learning_rate": 3.2315187361834697e-07, "loss": 0.0, "num_input_tokens_seen": 210681264, "step": 97695 }, { "epoch": 17.92989539365021, "grad_norm": 0.05721140280365944, "learning_rate": 3.2286872862969443e-07, "loss": 0.0, "num_input_tokens_seen": 210694096, "step": 97700 }, { "epoch": 17.930812993209763, "grad_norm": 0.06181161850690842, "learning_rate": 3.225857036022101e-07, "loss": 0.0025, "num_input_tokens_seen": 210704656, "step": 97705 }, { "epoch": 17.931730592769316, "grad_norm": 0.0004828503297176212, "learning_rate": 3.2230279854315205e-07, "loss": 0.0, "num_input_tokens_seen": 210715600, "step": 97710 }, { "epoch": 17.932648192328866, "grad_norm": 0.012215475551784039, "learning_rate": 3.220200134597784e-07, "loss": 0.0588, "num_input_tokens_seen": 210725968, "step": 97715 }, { "epoch": 17.93356579188842, "grad_norm": 0.0009076262358576059, "learning_rate": 3.217373483593417e-07, "loss": 0.0, "num_input_tokens_seen": 210736400, "step": 97720 }, { "epoch": 17.934483391447973, "grad_norm": 0.005981669295579195, "learning_rate": 3.214548032490905e-07, "loss": 0.0, "num_input_tokens_seen": 210747312, "step": 97725 }, { "epoch": 17.935400991007523, "grad_norm": 0.0008247286314144731, "learning_rate": 3.2117237813627247e-07, "loss": 0.0008, "num_input_tokens_seen": 210759184, "step": 97730 }, { "epoch": 17.936318590567076, "grad_norm": 0.007159966044127941, "learning_rate": 3.2089007302813167e-07, "loss": 0.0001, "num_input_tokens_seen": 210769360, "step": 97735 }, { "epoch": 17.93723619012663, "grad_norm": 0.017096875235438347, "learning_rate": 3.206078879319086e-07, "loss": 0.0822, "num_input_tokens_seen": 210780368, "step": 97740 }, { "epoch": 17.93815378968618, "grad_norm": 0.0017213834216818213, "learning_rate": 3.203258228548406e-07, "loss": 0.0, "num_input_tokens_seen": 210791504, "step": 97745 }, { "epoch": 17.939071389245733, "grad_norm": 0.004910605028271675, "learning_rate": 3.200438778041626e-07, "loss": 0.0, "num_input_tokens_seen": 210802352, "step": 97750 }, { "epoch": 17.939988988805286, "grad_norm": 0.01136617362499237, "learning_rate": 3.1976205278710593e-07, "loss": 0.0, "num_input_tokens_seen": 210813680, "step": 97755 }, { "epoch": 17.940906588364836, "grad_norm": 0.06262599676847458, "learning_rate": 3.194803478108993e-07, "loss": 0.0, "num_input_tokens_seen": 210825488, "step": 97760 }, { "epoch": 17.94182418792439, "grad_norm": 0.0007437322055920959, "learning_rate": 3.191987628827664e-07, "loss": 0.0051, "num_input_tokens_seen": 210837040, "step": 97765 }, { "epoch": 17.942741787483943, "grad_norm": 0.002014890545979142, "learning_rate": 3.1891729800993145e-07, "loss": 0.0, "num_input_tokens_seen": 210849424, "step": 97770 }, { "epoch": 17.943659387043493, "grad_norm": 0.0011322003556415439, "learning_rate": 3.1863595319961304e-07, "loss": 0.0, "num_input_tokens_seen": 210861200, "step": 97775 }, { "epoch": 17.944576986603046, "grad_norm": 0.0010889299446716905, "learning_rate": 3.1835472845902604e-07, "loss": 0.0, "num_input_tokens_seen": 210872720, "step": 97780 }, { "epoch": 17.9454945861626, "grad_norm": 0.015417122282087803, "learning_rate": 3.1807362379538464e-07, "loss": 0.0, "num_input_tokens_seen": 210884688, "step": 97785 }, { "epoch": 17.94641218572215, "grad_norm": 0.0009780953405424953, "learning_rate": 3.177926392158992e-07, "loss": 0.0004, "num_input_tokens_seen": 210896272, "step": 97790 }, { "epoch": 17.947329785281703, "grad_norm": 0.010828704573214054, "learning_rate": 3.1751177472777507e-07, "loss": 0.0, "num_input_tokens_seen": 210906800, "step": 97795 }, { "epoch": 17.948247384841256, "grad_norm": 0.009328685700893402, "learning_rate": 3.17231030338217e-07, "loss": 0.001, "num_input_tokens_seen": 210917520, "step": 97800 }, { "epoch": 17.949164984400806, "grad_norm": 0.0005757699254900217, "learning_rate": 3.169504060544254e-07, "loss": 0.0, "num_input_tokens_seen": 210928144, "step": 97805 }, { "epoch": 17.95008258396036, "grad_norm": 0.0022779277060180902, "learning_rate": 3.166699018835978e-07, "loss": 0.0, "num_input_tokens_seen": 210939824, "step": 97810 }, { "epoch": 17.951000183519913, "grad_norm": 0.0027437943499535322, "learning_rate": 3.163895178329285e-07, "loss": 0.0244, "num_input_tokens_seen": 210951184, "step": 97815 }, { "epoch": 17.951917783079463, "grad_norm": 0.0754927322268486, "learning_rate": 3.1610925390960944e-07, "loss": 0.0, "num_input_tokens_seen": 210962320, "step": 97820 }, { "epoch": 17.952835382639016, "grad_norm": 0.0005819338839501143, "learning_rate": 3.158291101208288e-07, "loss": 0.0, "num_input_tokens_seen": 210973776, "step": 97825 }, { "epoch": 17.95375298219857, "grad_norm": 0.0010483680525794625, "learning_rate": 3.155490864737709e-07, "loss": 0.0, "num_input_tokens_seen": 210984144, "step": 97830 }, { "epoch": 17.95467058175812, "grad_norm": 0.0007809029775671661, "learning_rate": 3.1526918297561937e-07, "loss": 0.0, "num_input_tokens_seen": 210994064, "step": 97835 }, { "epoch": 17.955588181317673, "grad_norm": 0.0402764230966568, "learning_rate": 3.1498939963355236e-07, "loss": 0.0, "num_input_tokens_seen": 211004688, "step": 97840 }, { "epoch": 17.956505780877226, "grad_norm": 0.0035707862116396427, "learning_rate": 3.1470973645474577e-07, "loss": 0.0, "num_input_tokens_seen": 211016016, "step": 97845 }, { "epoch": 17.957423380436776, "grad_norm": 0.004145248327404261, "learning_rate": 3.144301934463734e-07, "loss": 0.0, "num_input_tokens_seen": 211027888, "step": 97850 }, { "epoch": 17.95834097999633, "grad_norm": 0.00424703024327755, "learning_rate": 3.1415077061560494e-07, "loss": 0.0, "num_input_tokens_seen": 211037840, "step": 97855 }, { "epoch": 17.959258579555883, "grad_norm": 0.14641442894935608, "learning_rate": 3.138714679696064e-07, "loss": 0.0532, "num_input_tokens_seen": 211048368, "step": 97860 }, { "epoch": 17.960176179115432, "grad_norm": 0.18845896422863007, "learning_rate": 3.1359228551554154e-07, "loss": 0.0001, "num_input_tokens_seen": 211058416, "step": 97865 }, { "epoch": 17.961093778674986, "grad_norm": 1.4129323959350586, "learning_rate": 3.133132232605718e-07, "loss": 0.0003, "num_input_tokens_seen": 211069264, "step": 97870 }, { "epoch": 17.96201137823454, "grad_norm": 0.18782775104045868, "learning_rate": 3.1303428121185417e-07, "loss": 0.0001, "num_input_tokens_seen": 211079952, "step": 97875 }, { "epoch": 17.96292897779409, "grad_norm": 0.0013032086426392198, "learning_rate": 3.127554593765425e-07, "loss": 0.0, "num_input_tokens_seen": 211092400, "step": 97880 }, { "epoch": 17.963846577353642, "grad_norm": 0.004351399838924408, "learning_rate": 3.1247675776178934e-07, "loss": 0.0, "num_input_tokens_seen": 211103888, "step": 97885 }, { "epoch": 17.964764176913196, "grad_norm": 0.01176906656473875, "learning_rate": 3.1219817637474226e-07, "loss": 0.0, "num_input_tokens_seen": 211114576, "step": 97890 }, { "epoch": 17.965681776472746, "grad_norm": 0.0005906817968934774, "learning_rate": 3.119197152225467e-07, "loss": 0.0, "num_input_tokens_seen": 211125968, "step": 97895 }, { "epoch": 17.9665993760323, "grad_norm": 0.0006773141794838011, "learning_rate": 3.116413743123442e-07, "loss": 0.0, "num_input_tokens_seen": 211137072, "step": 97900 }, { "epoch": 17.967516975591852, "grad_norm": 0.0025473034474998713, "learning_rate": 3.11363153651274e-07, "loss": 0.0, "num_input_tokens_seen": 211147184, "step": 97905 }, { "epoch": 17.968434575151402, "grad_norm": 0.004253873601555824, "learning_rate": 3.1108505324647263e-07, "loss": 0.0, "num_input_tokens_seen": 211157616, "step": 97910 }, { "epoch": 17.969352174710956, "grad_norm": 73.4068374633789, "learning_rate": 3.108070731050722e-07, "loss": 0.1594, "num_input_tokens_seen": 211168464, "step": 97915 }, { "epoch": 17.97026977427051, "grad_norm": 0.0013989870203658938, "learning_rate": 3.1052921323420304e-07, "loss": 0.0, "num_input_tokens_seen": 211178960, "step": 97920 }, { "epoch": 17.97118737383006, "grad_norm": 0.005115610081702471, "learning_rate": 3.102514736409917e-07, "loss": 0.0, "num_input_tokens_seen": 211190224, "step": 97925 }, { "epoch": 17.972104973389612, "grad_norm": 0.09990856051445007, "learning_rate": 3.099738543325609e-07, "loss": 0.0, "num_input_tokens_seen": 211199888, "step": 97930 }, { "epoch": 17.973022572949166, "grad_norm": 0.0014101306442171335, "learning_rate": 3.0969635531603206e-07, "loss": 0.1345, "num_input_tokens_seen": 211210288, "step": 97935 }, { "epoch": 17.973940172508716, "grad_norm": 0.11027691513299942, "learning_rate": 3.094189765985228e-07, "loss": 0.0001, "num_input_tokens_seen": 211220592, "step": 97940 }, { "epoch": 17.97485777206827, "grad_norm": 0.16691984236240387, "learning_rate": 3.09141718187147e-07, "loss": 0.0001, "num_input_tokens_seen": 211229680, "step": 97945 }, { "epoch": 17.975775371627822, "grad_norm": 124.3837661743164, "learning_rate": 3.08864580089015e-07, "loss": 0.0478, "num_input_tokens_seen": 211238480, "step": 97950 }, { "epoch": 17.976692971187372, "grad_norm": 0.004228989128023386, "learning_rate": 3.085875623112372e-07, "loss": 0.0, "num_input_tokens_seen": 211248720, "step": 97955 }, { "epoch": 17.977610570746926, "grad_norm": 0.0029118380043655634, "learning_rate": 3.083106648609169e-07, "loss": 0.0001, "num_input_tokens_seen": 211259376, "step": 97960 }, { "epoch": 17.97852817030648, "grad_norm": 0.0066221412271261215, "learning_rate": 3.0803388774515606e-07, "loss": 0.0, "num_input_tokens_seen": 211271152, "step": 97965 }, { "epoch": 17.97944576986603, "grad_norm": 0.0013781499583274126, "learning_rate": 3.077572309710547e-07, "loss": 0.0, "num_input_tokens_seen": 211283216, "step": 97970 }, { "epoch": 17.980363369425582, "grad_norm": 0.42538824677467346, "learning_rate": 3.0748069454570815e-07, "loss": 0.0001, "num_input_tokens_seen": 211292624, "step": 97975 }, { "epoch": 17.981280968985136, "grad_norm": 0.005373696330934763, "learning_rate": 3.07204278476208e-07, "loss": 0.0001, "num_input_tokens_seen": 211303216, "step": 97980 }, { "epoch": 17.982198568544685, "grad_norm": 0.009588117711246014, "learning_rate": 3.0692798276964584e-07, "loss": 0.0001, "num_input_tokens_seen": 211313744, "step": 97985 }, { "epoch": 17.98311616810424, "grad_norm": 0.0015216436004266143, "learning_rate": 3.0665180743310764e-07, "loss": 0.0008, "num_input_tokens_seen": 211323472, "step": 97990 }, { "epoch": 17.984033767663792, "grad_norm": 0.002029755152761936, "learning_rate": 3.0637575247367656e-07, "loss": 0.0, "num_input_tokens_seen": 211334160, "step": 97995 }, { "epoch": 17.984951367223342, "grad_norm": 0.0005043526762165129, "learning_rate": 3.06099817898432e-07, "loss": 0.0, "num_input_tokens_seen": 211344528, "step": 98000 }, { "epoch": 17.985868966782895, "grad_norm": 0.000722613069228828, "learning_rate": 3.0582400371445274e-07, "loss": 0.0, "num_input_tokens_seen": 211355408, "step": 98005 }, { "epoch": 17.98678656634245, "grad_norm": 0.03488507494330406, "learning_rate": 3.055483099288126e-07, "loss": 0.0, "num_input_tokens_seen": 211365456, "step": 98010 }, { "epoch": 17.987704165902, "grad_norm": 0.0007777579012326896, "learning_rate": 3.052727365485819e-07, "loss": 0.0001, "num_input_tokens_seen": 211376976, "step": 98015 }, { "epoch": 17.988621765461552, "grad_norm": 0.0011295926524326205, "learning_rate": 3.049972835808301e-07, "loss": 0.0001, "num_input_tokens_seen": 211388176, "step": 98020 }, { "epoch": 17.989539365021106, "grad_norm": 0.0046707685105502605, "learning_rate": 3.047219510326216e-07, "loss": 0.0001, "num_input_tokens_seen": 211400080, "step": 98025 }, { "epoch": 17.990456964580655, "grad_norm": 0.004389796406030655, "learning_rate": 3.0444673891101784e-07, "loss": 0.0, "num_input_tokens_seen": 211411280, "step": 98030 }, { "epoch": 17.99137456414021, "grad_norm": 0.002619657199829817, "learning_rate": 3.0417164722307713e-07, "loss": 0.0, "num_input_tokens_seen": 211421648, "step": 98035 }, { "epoch": 17.992292163699762, "grad_norm": 0.0004316147824283689, "learning_rate": 3.0389667597585657e-07, "loss": 0.0, "num_input_tokens_seen": 211433008, "step": 98040 }, { "epoch": 17.993209763259312, "grad_norm": 0.00429860083386302, "learning_rate": 3.036218251764078e-07, "loss": 0.0, "num_input_tokens_seen": 211443984, "step": 98045 }, { "epoch": 17.994127362818865, "grad_norm": 0.016499321907758713, "learning_rate": 3.033470948317796e-07, "loss": 0.0, "num_input_tokens_seen": 211454704, "step": 98050 }, { "epoch": 17.99504496237842, "grad_norm": 0.003312972141429782, "learning_rate": 3.030724849490202e-07, "loss": 0.0, "num_input_tokens_seen": 211465648, "step": 98055 }, { "epoch": 17.99596256193797, "grad_norm": 0.10149257630109787, "learning_rate": 3.0279799553517174e-07, "loss": 0.0, "num_input_tokens_seen": 211476912, "step": 98060 }, { "epoch": 17.996880161497522, "grad_norm": 0.0013423549244180322, "learning_rate": 3.025236265972742e-07, "loss": 0.0, "num_input_tokens_seen": 211487120, "step": 98065 }, { "epoch": 17.997797761057075, "grad_norm": 0.01205402985215187, "learning_rate": 3.022493781423663e-07, "loss": 0.0001, "num_input_tokens_seen": 211498928, "step": 98070 }, { "epoch": 17.998715360616625, "grad_norm": 0.12182163447141647, "learning_rate": 3.0197525017748033e-07, "loss": 0.0, "num_input_tokens_seen": 211508432, "step": 98075 }, { "epoch": 17.99963296017618, "grad_norm": 0.0017333321738988161, "learning_rate": 3.017012427096483e-07, "loss": 0.2313, "num_input_tokens_seen": 211519184, "step": 98080 }, { "epoch": 18.0, "eval_loss": 1.4315905570983887, "eval_runtime": 179.3655, "eval_samples_per_second": 30.379, "eval_steps_per_second": 7.599, "num_input_tokens_seen": 211522080, "step": 98082 }, { "epoch": 18.000550559735732, "grad_norm": 0.002541355788707733, "learning_rate": 3.014273557458969e-07, "loss": 0.0451, "num_input_tokens_seen": 211529184, "step": 98085 }, { "epoch": 18.001468159295282, "grad_norm": 0.07023646682500839, "learning_rate": 3.0115358929325267e-07, "loss": 0.0, "num_input_tokens_seen": 211540384, "step": 98090 }, { "epoch": 18.002385758854835, "grad_norm": 0.0006391472998075187, "learning_rate": 3.008799433587356e-07, "loss": 0.0, "num_input_tokens_seen": 211551584, "step": 98095 }, { "epoch": 18.00330335841439, "grad_norm": 0.0010155508061870933, "learning_rate": 3.006064179493651e-07, "loss": 0.0, "num_input_tokens_seen": 211562880, "step": 98100 }, { "epoch": 18.00422095797394, "grad_norm": 0.0007468487601727247, "learning_rate": 3.003330130721566e-07, "loss": 0.0, "num_input_tokens_seen": 211573120, "step": 98105 }, { "epoch": 18.005138557533492, "grad_norm": 0.0071990229189395905, "learning_rate": 3.000597287341228e-07, "loss": 0.0, "num_input_tokens_seen": 211583840, "step": 98110 }, { "epoch": 18.006056157093045, "grad_norm": 0.0005271460977382958, "learning_rate": 2.997865649422732e-07, "loss": 0.0, "num_input_tokens_seen": 211595168, "step": 98115 }, { "epoch": 18.006973756652595, "grad_norm": 0.001059034955687821, "learning_rate": 2.995135217036127e-07, "loss": 0.0001, "num_input_tokens_seen": 211606368, "step": 98120 }, { "epoch": 18.00789135621215, "grad_norm": 0.0067564635537564754, "learning_rate": 2.9924059902514515e-07, "loss": 0.0, "num_input_tokens_seen": 211617216, "step": 98125 }, { "epoch": 18.008808955771702, "grad_norm": 0.00270900409668684, "learning_rate": 2.9896779691387103e-07, "loss": 0.0, "num_input_tokens_seen": 211627840, "step": 98130 }, { "epoch": 18.00972655533125, "grad_norm": 0.03352328762412071, "learning_rate": 2.9869511537678753e-07, "loss": 0.0, "num_input_tokens_seen": 211638848, "step": 98135 }, { "epoch": 18.010644154890805, "grad_norm": 0.0009139025933109224, "learning_rate": 2.9842255442088744e-07, "loss": 0.0, "num_input_tokens_seen": 211649792, "step": 98140 }, { "epoch": 18.01156175445036, "grad_norm": 0.0016654691426083446, "learning_rate": 2.9815011405316227e-07, "loss": 0.0, "num_input_tokens_seen": 211660512, "step": 98145 }, { "epoch": 18.01247935400991, "grad_norm": 0.01163670513778925, "learning_rate": 2.9787779428059825e-07, "loss": 0.0001, "num_input_tokens_seen": 211671168, "step": 98150 }, { "epoch": 18.013396953569462, "grad_norm": 0.0011459727538749576, "learning_rate": 2.976055951101825e-07, "loss": 0.0, "num_input_tokens_seen": 211682080, "step": 98155 }, { "epoch": 18.014314553129015, "grad_norm": 0.004858075175434351, "learning_rate": 2.9733351654889495e-07, "loss": 0.0, "num_input_tokens_seen": 211693856, "step": 98160 }, { "epoch": 18.015232152688565, "grad_norm": 0.004564350470900536, "learning_rate": 2.9706155860371344e-07, "loss": 0.0, "num_input_tokens_seen": 211704448, "step": 98165 }, { "epoch": 18.01614975224812, "grad_norm": 0.029503192752599716, "learning_rate": 2.967897212816151e-07, "loss": 0.0, "num_input_tokens_seen": 211715392, "step": 98170 }, { "epoch": 18.017067351807672, "grad_norm": 0.003961889538913965, "learning_rate": 2.96518004589571e-07, "loss": 0.0, "num_input_tokens_seen": 211725888, "step": 98175 }, { "epoch": 18.01798495136722, "grad_norm": 0.00431374367326498, "learning_rate": 2.962464085345501e-07, "loss": 0.0, "num_input_tokens_seen": 211736064, "step": 98180 }, { "epoch": 18.018902550926775, "grad_norm": 0.003178943879902363, "learning_rate": 2.9597493312351844e-07, "loss": 0.0, "num_input_tokens_seen": 211747328, "step": 98185 }, { "epoch": 18.01982015048633, "grad_norm": 0.004002952482551336, "learning_rate": 2.9570357836343933e-07, "loss": 0.0, "num_input_tokens_seen": 211757760, "step": 98190 }, { "epoch": 18.02073775004588, "grad_norm": 0.0022135514300316572, "learning_rate": 2.9543234426127274e-07, "loss": 0.0, "num_input_tokens_seen": 211768960, "step": 98195 }, { "epoch": 18.02165534960543, "grad_norm": 0.022843101993203163, "learning_rate": 2.951612308239743e-07, "loss": 0.002, "num_input_tokens_seen": 211780864, "step": 98200 }, { "epoch": 18.022572949164985, "grad_norm": 0.003515342017635703, "learning_rate": 2.9489023805849893e-07, "loss": 0.0, "num_input_tokens_seen": 211791808, "step": 98205 }, { "epoch": 18.023490548724535, "grad_norm": 0.0024863164871931076, "learning_rate": 2.946193659717972e-07, "loss": 0.0, "num_input_tokens_seen": 211802816, "step": 98210 }, { "epoch": 18.02440814828409, "grad_norm": 0.0036876609083265066, "learning_rate": 2.9434861457081575e-07, "loss": 0.0, "num_input_tokens_seen": 211814144, "step": 98215 }, { "epoch": 18.02532574784364, "grad_norm": 0.004481277894228697, "learning_rate": 2.9407798386249854e-07, "loss": 0.0, "num_input_tokens_seen": 211824288, "step": 98220 }, { "epoch": 18.02624334740319, "grad_norm": 0.0015670389402657747, "learning_rate": 2.9380747385378825e-07, "loss": 0.0001, "num_input_tokens_seen": 211835744, "step": 98225 }, { "epoch": 18.027160946962745, "grad_norm": 0.0012665004469454288, "learning_rate": 2.9353708455162224e-07, "loss": 0.0, "num_input_tokens_seen": 211847680, "step": 98230 }, { "epoch": 18.0280785465223, "grad_norm": 0.0016059150220826268, "learning_rate": 2.932668159629348e-07, "loss": 0.002, "num_input_tokens_seen": 211857536, "step": 98235 }, { "epoch": 18.028996146081848, "grad_norm": 0.0007260391139425337, "learning_rate": 2.9299666809466e-07, "loss": 0.0, "num_input_tokens_seen": 211867552, "step": 98240 }, { "epoch": 18.0299137456414, "grad_norm": 0.0320594422519207, "learning_rate": 2.9272664095372494e-07, "loss": 0.004, "num_input_tokens_seen": 211878880, "step": 98245 }, { "epoch": 18.030831345200955, "grad_norm": 0.001434195670299232, "learning_rate": 2.9245673454705525e-07, "loss": 0.0, "num_input_tokens_seen": 211890560, "step": 98250 }, { "epoch": 18.031748944760505, "grad_norm": 0.0078544020652771, "learning_rate": 2.921869488815754e-07, "loss": 0.0, "num_input_tokens_seen": 211899904, "step": 98255 }, { "epoch": 18.032666544320058, "grad_norm": 0.010520552285015583, "learning_rate": 2.9191728396420373e-07, "loss": 0.0, "num_input_tokens_seen": 211911776, "step": 98260 }, { "epoch": 18.03358414387961, "grad_norm": 0.0003836450632661581, "learning_rate": 2.916477398018569e-07, "loss": 0.0, "num_input_tokens_seen": 211923328, "step": 98265 }, { "epoch": 18.03450174343916, "grad_norm": 0.0003005701000802219, "learning_rate": 2.9137831640144723e-07, "loss": 0.0, "num_input_tokens_seen": 211934656, "step": 98270 }, { "epoch": 18.035419342998715, "grad_norm": 0.0014575712848454714, "learning_rate": 2.9110901376988686e-07, "loss": 0.0, "num_input_tokens_seen": 211946112, "step": 98275 }, { "epoch": 18.03633694255827, "grad_norm": 0.0014547668397426605, "learning_rate": 2.9083983191408206e-07, "loss": 0.0, "num_input_tokens_seen": 211956000, "step": 98280 }, { "epoch": 18.037254542117818, "grad_norm": 0.0006797256646677852, "learning_rate": 2.9057077084093667e-07, "loss": 0.0, "num_input_tokens_seen": 211968032, "step": 98285 }, { "epoch": 18.03817214167737, "grad_norm": 0.0010938850464299321, "learning_rate": 2.903018305573524e-07, "loss": 0.0, "num_input_tokens_seen": 211979776, "step": 98290 }, { "epoch": 18.039089741236925, "grad_norm": 0.0016749324277043343, "learning_rate": 2.9003301107022705e-07, "loss": 0.0, "num_input_tokens_seen": 211991008, "step": 98295 }, { "epoch": 18.040007340796475, "grad_norm": 0.0008496878435835242, "learning_rate": 2.8976431238645465e-07, "loss": 0.0, "num_input_tokens_seen": 212000448, "step": 98300 }, { "epoch": 18.040924940356028, "grad_norm": 0.020552175119519234, "learning_rate": 2.8949573451292787e-07, "loss": 0.0, "num_input_tokens_seen": 212012416, "step": 98305 }, { "epoch": 18.04184253991558, "grad_norm": 0.004422272555530071, "learning_rate": 2.892272774565352e-07, "loss": 0.1252, "num_input_tokens_seen": 212022400, "step": 98310 }, { "epoch": 18.04276013947513, "grad_norm": 0.012314221821725368, "learning_rate": 2.889589412241611e-07, "loss": 0.0, "num_input_tokens_seen": 212033536, "step": 98315 }, { "epoch": 18.043677739034685, "grad_norm": 0.0015708421124145389, "learning_rate": 2.8869072582268844e-07, "loss": 0.0, "num_input_tokens_seen": 212044512, "step": 98320 }, { "epoch": 18.044595338594238, "grad_norm": 0.05367867276072502, "learning_rate": 2.8842263125899774e-07, "loss": 0.0, "num_input_tokens_seen": 212055616, "step": 98325 }, { "epoch": 18.045512938153788, "grad_norm": 0.31522464752197266, "learning_rate": 2.881546575399641e-07, "loss": 0.0001, "num_input_tokens_seen": 212066432, "step": 98330 }, { "epoch": 18.04643053771334, "grad_norm": 0.10619248449802399, "learning_rate": 2.878868046724609e-07, "loss": 0.0001, "num_input_tokens_seen": 212078080, "step": 98335 }, { "epoch": 18.047348137272895, "grad_norm": 0.0008889620075933635, "learning_rate": 2.8761907266335766e-07, "loss": 0.0001, "num_input_tokens_seen": 212090528, "step": 98340 }, { "epoch": 18.048265736832445, "grad_norm": 0.02674531191587448, "learning_rate": 2.873514615195222e-07, "loss": 0.0001, "num_input_tokens_seen": 212101568, "step": 98345 }, { "epoch": 18.049183336391998, "grad_norm": 0.0007814443088136613, "learning_rate": 2.870839712478163e-07, "loss": 0.0, "num_input_tokens_seen": 212112288, "step": 98350 }, { "epoch": 18.05010093595155, "grad_norm": 0.005176194477826357, "learning_rate": 2.868166018551038e-07, "loss": 0.0, "num_input_tokens_seen": 212122784, "step": 98355 }, { "epoch": 18.0510185355111, "grad_norm": 0.006186498329043388, "learning_rate": 2.8654935334824e-07, "loss": 0.0, "num_input_tokens_seen": 212134016, "step": 98360 }, { "epoch": 18.051936135070655, "grad_norm": 0.0026804720982909203, "learning_rate": 2.862822257340803e-07, "loss": 0.0001, "num_input_tokens_seen": 212145664, "step": 98365 }, { "epoch": 18.052853734630208, "grad_norm": 0.056318771094083786, "learning_rate": 2.860152190194754e-07, "loss": 0.0001, "num_input_tokens_seen": 212155808, "step": 98370 }, { "epoch": 18.053771334189758, "grad_norm": 0.0026596179232001305, "learning_rate": 2.857483332112748e-07, "loss": 0.0, "num_input_tokens_seen": 212166336, "step": 98375 }, { "epoch": 18.05468893374931, "grad_norm": 0.020996859297156334, "learning_rate": 2.8548156831632314e-07, "loss": 0.0, "num_input_tokens_seen": 212177440, "step": 98380 }, { "epoch": 18.055606533308865, "grad_norm": 0.004181446507573128, "learning_rate": 2.852149243414615e-07, "loss": 0.0, "num_input_tokens_seen": 212187008, "step": 98385 }, { "epoch": 18.056524132868415, "grad_norm": 80.93045043945312, "learning_rate": 2.849484012935305e-07, "loss": 0.0016, "num_input_tokens_seen": 212197536, "step": 98390 }, { "epoch": 18.057441732427968, "grad_norm": 0.009502902626991272, "learning_rate": 2.8468199917936535e-07, "loss": 0.0, "num_input_tokens_seen": 212208672, "step": 98395 }, { "epoch": 18.05835933198752, "grad_norm": 0.009868144989013672, "learning_rate": 2.844157180057988e-07, "loss": 0.0, "num_input_tokens_seen": 212220128, "step": 98400 }, { "epoch": 18.05927693154707, "grad_norm": 0.003657070454210043, "learning_rate": 2.841495577796599e-07, "loss": 0.0, "num_input_tokens_seen": 212231808, "step": 98405 }, { "epoch": 18.060194531106625, "grad_norm": 0.09478561580181122, "learning_rate": 2.8388351850777653e-07, "loss": 0.0, "num_input_tokens_seen": 212242080, "step": 98410 }, { "epoch": 18.061112130666178, "grad_norm": 0.003723635571077466, "learning_rate": 2.836176001969715e-07, "loss": 0.0001, "num_input_tokens_seen": 212253600, "step": 98415 }, { "epoch": 18.062029730225728, "grad_norm": 0.004970932379364967, "learning_rate": 2.8335180285406494e-07, "loss": 0.1501, "num_input_tokens_seen": 212264096, "step": 98420 }, { "epoch": 18.06294732978528, "grad_norm": 0.0008526599267497659, "learning_rate": 2.830861264858753e-07, "loss": 0.0, "num_input_tokens_seen": 212275104, "step": 98425 }, { "epoch": 18.063864929344835, "grad_norm": 0.0009205247624777257, "learning_rate": 2.8282057109921545e-07, "loss": 0.0, "num_input_tokens_seen": 212286240, "step": 98430 }, { "epoch": 18.064782528904384, "grad_norm": 0.09889121353626251, "learning_rate": 2.825551367008966e-07, "loss": 0.0001, "num_input_tokens_seen": 212297152, "step": 98435 }, { "epoch": 18.065700128463938, "grad_norm": 0.00036638081655837595, "learning_rate": 2.822898232977278e-07, "loss": 0.0001, "num_input_tokens_seen": 212307936, "step": 98440 }, { "epoch": 18.06661772802349, "grad_norm": 17.289276123046875, "learning_rate": 2.8202463089651354e-07, "loss": 0.0032, "num_input_tokens_seen": 212319264, "step": 98445 }, { "epoch": 18.06753532758304, "grad_norm": 0.0013088694540783763, "learning_rate": 2.8175955950405453e-07, "loss": 0.0001, "num_input_tokens_seen": 212330304, "step": 98450 }, { "epoch": 18.068452927142594, "grad_norm": 0.0017296234145760536, "learning_rate": 2.8149460912715034e-07, "loss": 0.0, "num_input_tokens_seen": 212342144, "step": 98455 }, { "epoch": 18.069370526702148, "grad_norm": 0.030180271714925766, "learning_rate": 2.8122977977259657e-07, "loss": 0.0, "num_input_tokens_seen": 212351616, "step": 98460 }, { "epoch": 18.070288126261698, "grad_norm": 0.0006029154174029827, "learning_rate": 2.8096507144718567e-07, "loss": 0.0, "num_input_tokens_seen": 212362368, "step": 98465 }, { "epoch": 18.07120572582125, "grad_norm": 0.001986447721719742, "learning_rate": 2.80700484157706e-07, "loss": 0.0, "num_input_tokens_seen": 212373024, "step": 98470 }, { "epoch": 18.072123325380804, "grad_norm": 0.005774929653853178, "learning_rate": 2.8043601791094555e-07, "loss": 0.0, "num_input_tokens_seen": 212385088, "step": 98475 }, { "epoch": 18.073040924940354, "grad_norm": 0.012008707970380783, "learning_rate": 2.801716727136866e-07, "loss": 0.0, "num_input_tokens_seen": 212394432, "step": 98480 }, { "epoch": 18.073958524499908, "grad_norm": 0.06798950582742691, "learning_rate": 2.7990744857270823e-07, "loss": 0.0, "num_input_tokens_seen": 212404768, "step": 98485 }, { "epoch": 18.07487612405946, "grad_norm": 0.0021536515560001135, "learning_rate": 2.796433454947894e-07, "loss": 0.0, "num_input_tokens_seen": 212416480, "step": 98490 }, { "epoch": 18.07579372361901, "grad_norm": 0.0007197662489488721, "learning_rate": 2.7937936348670256e-07, "loss": 0.0, "num_input_tokens_seen": 212427392, "step": 98495 }, { "epoch": 18.076711323178564, "grad_norm": 0.014870725572109222, "learning_rate": 2.791155025552189e-07, "loss": 0.0001, "num_input_tokens_seen": 212438176, "step": 98500 }, { "epoch": 18.077628922738118, "grad_norm": 0.006605201400816441, "learning_rate": 2.7885176270710525e-07, "loss": 0.0, "num_input_tokens_seen": 212448352, "step": 98505 }, { "epoch": 18.078546522297668, "grad_norm": 0.0013313032686710358, "learning_rate": 2.7858814394912725e-07, "loss": 0.0, "num_input_tokens_seen": 212459328, "step": 98510 }, { "epoch": 18.07946412185722, "grad_norm": 0.002849243115633726, "learning_rate": 2.7832464628804624e-07, "loss": 0.0, "num_input_tokens_seen": 212471392, "step": 98515 }, { "epoch": 18.080381721416774, "grad_norm": 0.344287246465683, "learning_rate": 2.78061269730619e-07, "loss": 0.0097, "num_input_tokens_seen": 212481728, "step": 98520 }, { "epoch": 18.081299320976324, "grad_norm": 0.02675693854689598, "learning_rate": 2.777980142836029e-07, "loss": 0.0, "num_input_tokens_seen": 212492640, "step": 98525 }, { "epoch": 18.082216920535878, "grad_norm": 0.0035677780397236347, "learning_rate": 2.775348799537486e-07, "loss": 0.0, "num_input_tokens_seen": 212503232, "step": 98530 }, { "epoch": 18.08313452009543, "grad_norm": 0.009361231699585915, "learning_rate": 2.7727186674780583e-07, "loss": 0.0001, "num_input_tokens_seen": 212513728, "step": 98535 }, { "epoch": 18.08405211965498, "grad_norm": 0.0007234492804855108, "learning_rate": 2.7700897467251965e-07, "loss": 0.0002, "num_input_tokens_seen": 212524160, "step": 98540 }, { "epoch": 18.084969719214534, "grad_norm": 0.000446991907665506, "learning_rate": 2.7674620373463303e-07, "loss": 0.0, "num_input_tokens_seen": 212534912, "step": 98545 }, { "epoch": 18.085887318774088, "grad_norm": 0.00484229763969779, "learning_rate": 2.7648355394088666e-07, "loss": 0.0, "num_input_tokens_seen": 212546304, "step": 98550 }, { "epoch": 18.086804918333637, "grad_norm": 0.001113352132961154, "learning_rate": 2.7622102529801576e-07, "loss": 0.0, "num_input_tokens_seen": 212557696, "step": 98555 }, { "epoch": 18.08772251789319, "grad_norm": 0.0016194147756323218, "learning_rate": 2.759586178127549e-07, "loss": 0.0, "num_input_tokens_seen": 212567936, "step": 98560 }, { "epoch": 18.088640117452744, "grad_norm": 0.0017118031391873956, "learning_rate": 2.7569633149183375e-07, "loss": 0.0, "num_input_tokens_seen": 212579392, "step": 98565 }, { "epoch": 18.089557717012294, "grad_norm": 0.009371001273393631, "learning_rate": 2.7543416634197907e-07, "loss": 0.0, "num_input_tokens_seen": 212589856, "step": 98570 }, { "epoch": 18.090475316571847, "grad_norm": 0.024292834103107452, "learning_rate": 2.751721223699161e-07, "loss": 0.0, "num_input_tokens_seen": 212600288, "step": 98575 }, { "epoch": 18.0913929161314, "grad_norm": 0.0009020119905471802, "learning_rate": 2.7491019958236554e-07, "loss": 0.0, "num_input_tokens_seen": 212611552, "step": 98580 }, { "epoch": 18.09231051569095, "grad_norm": 0.00525266956537962, "learning_rate": 2.746483979860448e-07, "loss": 0.0, "num_input_tokens_seen": 212622176, "step": 98585 }, { "epoch": 18.093228115250504, "grad_norm": 0.01832096092402935, "learning_rate": 2.743867175876691e-07, "loss": 0.0, "num_input_tokens_seen": 212633344, "step": 98590 }, { "epoch": 18.094145714810058, "grad_norm": 0.010505327954888344, "learning_rate": 2.7412515839395025e-07, "loss": 0.0, "num_input_tokens_seen": 212644352, "step": 98595 }, { "epoch": 18.095063314369607, "grad_norm": 0.011343441903591156, "learning_rate": 2.7386372041159627e-07, "loss": 0.0001, "num_input_tokens_seen": 212654112, "step": 98600 }, { "epoch": 18.09598091392916, "grad_norm": 0.0011957737151533365, "learning_rate": 2.7360240364731285e-07, "loss": 0.0, "num_input_tokens_seen": 212664864, "step": 98605 }, { "epoch": 18.096898513488714, "grad_norm": 0.00040802324656397104, "learning_rate": 2.7334120810780297e-07, "loss": 0.0, "num_input_tokens_seen": 212676576, "step": 98610 }, { "epoch": 18.097816113048264, "grad_norm": 0.02710609696805477, "learning_rate": 2.730801337997657e-07, "loss": 0.0, "num_input_tokens_seen": 212687936, "step": 98615 }, { "epoch": 18.098733712607817, "grad_norm": 0.001971584977582097, "learning_rate": 2.728191807298958e-07, "loss": 0.0, "num_input_tokens_seen": 212698528, "step": 98620 }, { "epoch": 18.09965131216737, "grad_norm": 0.07525510340929031, "learning_rate": 2.7255834890488883e-07, "loss": 0.0, "num_input_tokens_seen": 212709664, "step": 98625 }, { "epoch": 18.10056891172692, "grad_norm": 0.0019256735686212778, "learning_rate": 2.7229763833143296e-07, "loss": 0.0, "num_input_tokens_seen": 212719712, "step": 98630 }, { "epoch": 18.101486511286474, "grad_norm": 0.0010809747036546469, "learning_rate": 2.7203704901621495e-07, "loss": 0.0, "num_input_tokens_seen": 212730080, "step": 98635 }, { "epoch": 18.102404110846027, "grad_norm": 0.0019187008729204535, "learning_rate": 2.7177658096591894e-07, "loss": 0.0, "num_input_tokens_seen": 212741056, "step": 98640 }, { "epoch": 18.103321710405577, "grad_norm": 0.0020860114600509405, "learning_rate": 2.715162341872257e-07, "loss": 0.0, "num_input_tokens_seen": 212751040, "step": 98645 }, { "epoch": 18.10423930996513, "grad_norm": 0.00036001246189698577, "learning_rate": 2.7125600868681326e-07, "loss": 0.04, "num_input_tokens_seen": 212762176, "step": 98650 }, { "epoch": 18.105156909524684, "grad_norm": 0.0004820743633899838, "learning_rate": 2.709959044713539e-07, "loss": 0.0, "num_input_tokens_seen": 212772768, "step": 98655 }, { "epoch": 18.106074509084234, "grad_norm": 0.0005178332212381065, "learning_rate": 2.707359215475214e-07, "loss": 0.0, "num_input_tokens_seen": 212784544, "step": 98660 }, { "epoch": 18.106992108643787, "grad_norm": 0.002757691778242588, "learning_rate": 2.70476059921983e-07, "loss": 0.0016, "num_input_tokens_seen": 212795904, "step": 98665 }, { "epoch": 18.10790970820334, "grad_norm": 0.004554024897515774, "learning_rate": 2.7021631960140296e-07, "loss": 0.0, "num_input_tokens_seen": 212805024, "step": 98670 }, { "epoch": 18.10882730776289, "grad_norm": 0.011644942685961723, "learning_rate": 2.699567005924436e-07, "loss": 0.0001, "num_input_tokens_seen": 212815552, "step": 98675 }, { "epoch": 18.109744907322444, "grad_norm": 0.003375434782356024, "learning_rate": 2.6969720290176414e-07, "loss": 0.0001, "num_input_tokens_seen": 212826272, "step": 98680 }, { "epoch": 18.110662506881997, "grad_norm": 0.0015670678112655878, "learning_rate": 2.694378265360209e-07, "loss": 0.0, "num_input_tokens_seen": 212837952, "step": 98685 }, { "epoch": 18.111580106441547, "grad_norm": 0.0016061075730249286, "learning_rate": 2.691785715018647e-07, "loss": 0.0, "num_input_tokens_seen": 212848832, "step": 98690 }, { "epoch": 18.1124977060011, "grad_norm": 0.035873640328645706, "learning_rate": 2.689194378059462e-07, "loss": 0.0, "num_input_tokens_seen": 212859648, "step": 98695 }, { "epoch": 18.113415305560654, "grad_norm": 0.02927371859550476, "learning_rate": 2.6866042545491247e-07, "loss": 0.0001, "num_input_tokens_seen": 212870112, "step": 98700 }, { "epoch": 18.114332905120204, "grad_norm": 0.006511656567454338, "learning_rate": 2.684015344554047e-07, "loss": 0.0, "num_input_tokens_seen": 212880896, "step": 98705 }, { "epoch": 18.115250504679757, "grad_norm": 0.0015329972375184298, "learning_rate": 2.68142764814065e-07, "loss": 0.0, "num_input_tokens_seen": 212891136, "step": 98710 }, { "epoch": 18.11616810423931, "grad_norm": 0.00036320561775937676, "learning_rate": 2.6788411653752953e-07, "loss": 0.0, "num_input_tokens_seen": 212901184, "step": 98715 }, { "epoch": 18.11708570379886, "grad_norm": 0.004166834522038698, "learning_rate": 2.6762558963243255e-07, "loss": 0.0, "num_input_tokens_seen": 212910464, "step": 98720 }, { "epoch": 18.118003303358414, "grad_norm": 0.0063199992291629314, "learning_rate": 2.6736718410540377e-07, "loss": 0.0, "num_input_tokens_seen": 212921376, "step": 98725 }, { "epoch": 18.118920902917967, "grad_norm": 0.002771284431219101, "learning_rate": 2.6710889996307275e-07, "loss": 0.0001, "num_input_tokens_seen": 212932960, "step": 98730 }, { "epoch": 18.11983850247752, "grad_norm": 0.0028335191309452057, "learning_rate": 2.6685073721206323e-07, "loss": 0.0, "num_input_tokens_seen": 212943488, "step": 98735 }, { "epoch": 18.12075610203707, "grad_norm": 0.0035453124437481165, "learning_rate": 2.6659269585899595e-07, "loss": 0.0, "num_input_tokens_seen": 212953632, "step": 98740 }, { "epoch": 18.121673701596624, "grad_norm": 0.04175587370991707, "learning_rate": 2.6633477591049005e-07, "loss": 0.0, "num_input_tokens_seen": 212965280, "step": 98745 }, { "epoch": 18.122591301156177, "grad_norm": 0.07205428928136826, "learning_rate": 2.6607697737316084e-07, "loss": 0.0, "num_input_tokens_seen": 212976512, "step": 98750 }, { "epoch": 18.123508900715727, "grad_norm": 0.0005450650351122022, "learning_rate": 2.658193002536208e-07, "loss": 0.0, "num_input_tokens_seen": 212987392, "step": 98755 }, { "epoch": 18.12442650027528, "grad_norm": 0.0029409374110400677, "learning_rate": 2.655617445584779e-07, "loss": 0.0, "num_input_tokens_seen": 212997824, "step": 98760 }, { "epoch": 18.125344099834834, "grad_norm": 29.265422821044922, "learning_rate": 2.653043102943376e-07, "loss": 0.0079, "num_input_tokens_seen": 213009152, "step": 98765 }, { "epoch": 18.126261699394384, "grad_norm": 0.0016881515039131045, "learning_rate": 2.6504699746780493e-07, "loss": 0.0, "num_input_tokens_seen": 213019200, "step": 98770 }, { "epoch": 18.127179298953937, "grad_norm": 0.001509851892478764, "learning_rate": 2.6478980608547755e-07, "loss": 0.0, "num_input_tokens_seen": 213030272, "step": 98775 }, { "epoch": 18.12809689851349, "grad_norm": 0.0007963434909470379, "learning_rate": 2.6453273615395345e-07, "loss": 0.0, "num_input_tokens_seen": 213039968, "step": 98780 }, { "epoch": 18.12901449807304, "grad_norm": 0.000730946019757539, "learning_rate": 2.642757876798252e-07, "loss": 0.0, "num_input_tokens_seen": 213049760, "step": 98785 }, { "epoch": 18.129932097632594, "grad_norm": 0.007225068286061287, "learning_rate": 2.6401896066968245e-07, "loss": 0.0, "num_input_tokens_seen": 213060000, "step": 98790 }, { "epoch": 18.130849697192147, "grad_norm": 0.001325366087257862, "learning_rate": 2.637622551301139e-07, "loss": 0.0045, "num_input_tokens_seen": 213072064, "step": 98795 }, { "epoch": 18.131767296751697, "grad_norm": 0.0034421079326421022, "learning_rate": 2.635056710677031e-07, "loss": 0.002, "num_input_tokens_seen": 213082112, "step": 98800 }, { "epoch": 18.13268489631125, "grad_norm": 0.0012731592869386077, "learning_rate": 2.632492084890309e-07, "loss": 0.0, "num_input_tokens_seen": 213093152, "step": 98805 }, { "epoch": 18.133602495870804, "grad_norm": 0.002348913112655282, "learning_rate": 2.629928674006743e-07, "loss": 0.0004, "num_input_tokens_seen": 213102176, "step": 98810 }, { "epoch": 18.134520095430354, "grad_norm": 0.0004269052587915212, "learning_rate": 2.627366478092097e-07, "loss": 0.0, "num_input_tokens_seen": 213113344, "step": 98815 }, { "epoch": 18.135437694989907, "grad_norm": 0.03837200254201889, "learning_rate": 2.624805497212085e-07, "loss": 0.0001, "num_input_tokens_seen": 213124960, "step": 98820 }, { "epoch": 18.13635529454946, "grad_norm": 0.008546363562345505, "learning_rate": 2.6222457314323713e-07, "loss": 0.0, "num_input_tokens_seen": 213135968, "step": 98825 }, { "epoch": 18.13727289410901, "grad_norm": 0.003604313125833869, "learning_rate": 2.619687180818642e-07, "loss": 0.0, "num_input_tokens_seen": 213147424, "step": 98830 }, { "epoch": 18.138190493668564, "grad_norm": 0.00555443437770009, "learning_rate": 2.6171298454365e-07, "loss": 0.04, "num_input_tokens_seen": 213158912, "step": 98835 }, { "epoch": 18.139108093228117, "grad_norm": 0.0067162648774683475, "learning_rate": 2.6145737253515325e-07, "loss": 0.0, "num_input_tokens_seen": 213170112, "step": 98840 }, { "epoch": 18.140025692787667, "grad_norm": 0.0008967073517851532, "learning_rate": 2.612018820629314e-07, "loss": 0.0, "num_input_tokens_seen": 213181024, "step": 98845 }, { "epoch": 18.14094329234722, "grad_norm": 134.02577209472656, "learning_rate": 2.6094651313353704e-07, "loss": 0.1005, "num_input_tokens_seen": 213191744, "step": 98850 }, { "epoch": 18.141860891906774, "grad_norm": 0.019098784774541855, "learning_rate": 2.606912657535199e-07, "loss": 0.0, "num_input_tokens_seen": 213202496, "step": 98855 }, { "epoch": 18.142778491466323, "grad_norm": 0.0007603216217830777, "learning_rate": 2.604361399294253e-07, "loss": 0.0, "num_input_tokens_seen": 213211232, "step": 98860 }, { "epoch": 18.143696091025877, "grad_norm": 0.13799059391021729, "learning_rate": 2.601811356677991e-07, "loss": 0.0, "num_input_tokens_seen": 213220736, "step": 98865 }, { "epoch": 18.14461369058543, "grad_norm": 0.0009551394614391029, "learning_rate": 2.599262529751806e-07, "loss": 0.0, "num_input_tokens_seen": 213230944, "step": 98870 }, { "epoch": 18.14553129014498, "grad_norm": 0.0015096045099198818, "learning_rate": 2.596714918581067e-07, "loss": 0.0, "num_input_tokens_seen": 213241504, "step": 98875 }, { "epoch": 18.146448889704534, "grad_norm": 0.00977738481014967, "learning_rate": 2.594168523231133e-07, "loss": 0.0, "num_input_tokens_seen": 213252352, "step": 98880 }, { "epoch": 18.147366489264087, "grad_norm": 0.005313169676810503, "learning_rate": 2.5916233437672965e-07, "loss": 0.0, "num_input_tokens_seen": 213262496, "step": 98885 }, { "epoch": 18.148284088823637, "grad_norm": 0.0018143514171242714, "learning_rate": 2.5890793802548443e-07, "loss": 0.0001, "num_input_tokens_seen": 213273088, "step": 98890 }, { "epoch": 18.14920168838319, "grad_norm": 0.0014729895628988743, "learning_rate": 2.5865366327590293e-07, "loss": 0.0, "num_input_tokens_seen": 213283232, "step": 98895 }, { "epoch": 18.150119287942744, "grad_norm": 0.0005258586024865508, "learning_rate": 2.583995101345066e-07, "loss": 0.0, "num_input_tokens_seen": 213294176, "step": 98900 }, { "epoch": 18.151036887502293, "grad_norm": 0.09286816418170929, "learning_rate": 2.5814547860781356e-07, "loss": 0.0, "num_input_tokens_seen": 213305920, "step": 98905 }, { "epoch": 18.151954487061847, "grad_norm": 0.00037488286034204066, "learning_rate": 2.578915687023398e-07, "loss": 0.0, "num_input_tokens_seen": 213316768, "step": 98910 }, { "epoch": 18.1528720866214, "grad_norm": 0.0015597627498209476, "learning_rate": 2.5763778042459773e-07, "loss": 0.0002, "num_input_tokens_seen": 213328320, "step": 98915 }, { "epoch": 18.15378968618095, "grad_norm": 0.038373321294784546, "learning_rate": 2.573841137810973e-07, "loss": 0.0, "num_input_tokens_seen": 213339296, "step": 98920 }, { "epoch": 18.154707285740503, "grad_norm": 0.001289501553401351, "learning_rate": 2.571305687783426e-07, "loss": 0.0001, "num_input_tokens_seen": 213349920, "step": 98925 }, { "epoch": 18.155624885300057, "grad_norm": 0.0018656144384294748, "learning_rate": 2.5687714542283914e-07, "loss": 0.0, "num_input_tokens_seen": 213361664, "step": 98930 }, { "epoch": 18.156542484859607, "grad_norm": 0.0016458260361105204, "learning_rate": 2.566238437210861e-07, "loss": 0.0, "num_input_tokens_seen": 213372288, "step": 98935 }, { "epoch": 18.15746008441916, "grad_norm": 0.007953151129186153, "learning_rate": 2.5637066367957817e-07, "loss": 0.0, "num_input_tokens_seen": 213383040, "step": 98940 }, { "epoch": 18.158377683978713, "grad_norm": 0.0015901625156402588, "learning_rate": 2.561176053048126e-07, "loss": 0.0, "num_input_tokens_seen": 213393888, "step": 98945 }, { "epoch": 18.159295283538263, "grad_norm": 0.0009537332807667553, "learning_rate": 2.558646686032773e-07, "loss": 0.0, "num_input_tokens_seen": 213404832, "step": 98950 }, { "epoch": 18.160212883097817, "grad_norm": 0.03211139142513275, "learning_rate": 2.556118535814606e-07, "loss": 0.0, "num_input_tokens_seen": 213416320, "step": 98955 }, { "epoch": 18.16113048265737, "grad_norm": 0.006695800460875034, "learning_rate": 2.553591602458461e-07, "loss": 0.0, "num_input_tokens_seen": 213428064, "step": 98960 }, { "epoch": 18.16204808221692, "grad_norm": 0.0005110188503749669, "learning_rate": 2.551065886029164e-07, "loss": 0.0, "num_input_tokens_seen": 213437280, "step": 98965 }, { "epoch": 18.162965681776473, "grad_norm": 0.0007926420075818896, "learning_rate": 2.548541386591491e-07, "loss": 0.0001, "num_input_tokens_seen": 213447872, "step": 98970 }, { "epoch": 18.163883281336027, "grad_norm": 0.0022916009183973074, "learning_rate": 2.546018104210185e-07, "loss": 0.0, "num_input_tokens_seen": 213458784, "step": 98975 }, { "epoch": 18.164800880895577, "grad_norm": 0.001008176594041288, "learning_rate": 2.5434960389499665e-07, "loss": 0.0, "num_input_tokens_seen": 213469472, "step": 98980 }, { "epoch": 18.16571848045513, "grad_norm": 1.2811543941497803, "learning_rate": 2.5409751908755163e-07, "loss": 0.0002, "num_input_tokens_seen": 213480608, "step": 98985 }, { "epoch": 18.166636080014683, "grad_norm": 0.0012067072093486786, "learning_rate": 2.538455560051506e-07, "loss": 0.0, "num_input_tokens_seen": 213490752, "step": 98990 }, { "epoch": 18.167553679574233, "grad_norm": 0.0012465286999940872, "learning_rate": 2.535937146542555e-07, "loss": 0.0, "num_input_tokens_seen": 213501312, "step": 98995 }, { "epoch": 18.168471279133787, "grad_norm": 0.004858991596847773, "learning_rate": 2.533419950413246e-07, "loss": 0.0, "num_input_tokens_seen": 213512864, "step": 99000 }, { "epoch": 18.16938887869334, "grad_norm": 0.0216732919216156, "learning_rate": 2.530903971728155e-07, "loss": 0.0, "num_input_tokens_seen": 213523648, "step": 99005 }, { "epoch": 18.17030647825289, "grad_norm": 0.003742686938494444, "learning_rate": 2.5283892105518016e-07, "loss": 0.0001, "num_input_tokens_seen": 213534048, "step": 99010 }, { "epoch": 18.171224077812443, "grad_norm": 0.0018042548326775432, "learning_rate": 2.5258756669486906e-07, "loss": 0.0, "num_input_tokens_seen": 213545408, "step": 99015 }, { "epoch": 18.172141677371997, "grad_norm": 0.000731047592125833, "learning_rate": 2.5233633409832923e-07, "loss": 0.0, "num_input_tokens_seen": 213555072, "step": 99020 }, { "epoch": 18.173059276931546, "grad_norm": 0.05020079016685486, "learning_rate": 2.520852232720039e-07, "loss": 0.0, "num_input_tokens_seen": 213566528, "step": 99025 }, { "epoch": 18.1739768764911, "grad_norm": 0.0005507571622729301, "learning_rate": 2.5183423422233456e-07, "loss": 0.0, "num_input_tokens_seen": 213577888, "step": 99030 }, { "epoch": 18.174894476050653, "grad_norm": 0.18022525310516357, "learning_rate": 2.515833669557577e-07, "loss": 0.0001, "num_input_tokens_seen": 213588672, "step": 99035 }, { "epoch": 18.175812075610203, "grad_norm": 0.005240928381681442, "learning_rate": 2.5133262147870876e-07, "loss": 0.0, "num_input_tokens_seen": 213598208, "step": 99040 }, { "epoch": 18.176729675169756, "grad_norm": 0.0019825128838419914, "learning_rate": 2.51081997797617e-07, "loss": 0.0, "num_input_tokens_seen": 213609216, "step": 99045 }, { "epoch": 18.17764727472931, "grad_norm": 0.0005983128794468939, "learning_rate": 2.5083149591891285e-07, "loss": 0.0, "num_input_tokens_seen": 213620352, "step": 99050 }, { "epoch": 18.17856487428886, "grad_norm": 0.0007230317569337785, "learning_rate": 2.5058111584902065e-07, "loss": 0.0, "num_input_tokens_seen": 213631680, "step": 99055 }, { "epoch": 18.179482473848413, "grad_norm": 0.0007383258198387921, "learning_rate": 2.503308575943608e-07, "loss": 0.001, "num_input_tokens_seen": 213642944, "step": 99060 }, { "epoch": 18.180400073407966, "grad_norm": 0.0012249319115653634, "learning_rate": 2.5008072116135374e-07, "loss": 0.0, "num_input_tokens_seen": 213653120, "step": 99065 }, { "epoch": 18.181317672967516, "grad_norm": 0.03125978261232376, "learning_rate": 2.498307065564143e-07, "loss": 0.0, "num_input_tokens_seen": 213663360, "step": 99070 }, { "epoch": 18.18223527252707, "grad_norm": 0.0075561413541436195, "learning_rate": 2.495808137859551e-07, "loss": 0.0, "num_input_tokens_seen": 213674304, "step": 99075 }, { "epoch": 18.183152872086623, "grad_norm": 0.005681710783392191, "learning_rate": 2.4933104285638556e-07, "loss": 0.0, "num_input_tokens_seen": 213685408, "step": 99080 }, { "epoch": 18.184070471646173, "grad_norm": 0.0009411806240677834, "learning_rate": 2.490813937741121e-07, "loss": 0.0, "num_input_tokens_seen": 213695328, "step": 99085 }, { "epoch": 18.184988071205726, "grad_norm": 0.010892228223383427, "learning_rate": 2.4883186654553806e-07, "loss": 0.0, "num_input_tokens_seen": 213705536, "step": 99090 }, { "epoch": 18.18590567076528, "grad_norm": 0.012644332833588123, "learning_rate": 2.4858246117706207e-07, "loss": 0.0, "num_input_tokens_seen": 213715968, "step": 99095 }, { "epoch": 18.18682327032483, "grad_norm": 0.0004838698368985206, "learning_rate": 2.483331776750825e-07, "loss": 0.0, "num_input_tokens_seen": 213726752, "step": 99100 }, { "epoch": 18.187740869884383, "grad_norm": 0.01931666024029255, "learning_rate": 2.480840160459924e-07, "loss": 0.0001, "num_input_tokens_seen": 213735936, "step": 99105 }, { "epoch": 18.188658469443936, "grad_norm": 0.0016344207106158137, "learning_rate": 2.478349762961818e-07, "loss": 0.0, "num_input_tokens_seen": 213748032, "step": 99110 }, { "epoch": 18.189576069003486, "grad_norm": 0.013335163705050945, "learning_rate": 2.4758605843203996e-07, "loss": 0.0, "num_input_tokens_seen": 213758784, "step": 99115 }, { "epoch": 18.19049366856304, "grad_norm": 0.021133752539753914, "learning_rate": 2.473372624599496e-07, "loss": 0.0, "num_input_tokens_seen": 213770048, "step": 99120 }, { "epoch": 18.191411268122593, "grad_norm": 0.0020136344246566296, "learning_rate": 2.470885883862928e-07, "loss": 0.0001, "num_input_tokens_seen": 213779776, "step": 99125 }, { "epoch": 18.192328867682143, "grad_norm": 0.0026482518296688795, "learning_rate": 2.468400362174467e-07, "loss": 0.0, "num_input_tokens_seen": 213789536, "step": 99130 }, { "epoch": 18.193246467241696, "grad_norm": 0.005329255945980549, "learning_rate": 2.4659160595978784e-07, "loss": 0.0003, "num_input_tokens_seen": 213800544, "step": 99135 }, { "epoch": 18.19416406680125, "grad_norm": 0.009265410713851452, "learning_rate": 2.4634329761968667e-07, "loss": 0.0, "num_input_tokens_seen": 213811168, "step": 99140 }, { "epoch": 18.1950816663608, "grad_norm": 0.000813785707578063, "learning_rate": 2.46095111203512e-07, "loss": 0.0, "num_input_tokens_seen": 213820960, "step": 99145 }, { "epoch": 18.195999265920353, "grad_norm": 0.001604833989404142, "learning_rate": 2.458470467176305e-07, "loss": 0.0, "num_input_tokens_seen": 213831520, "step": 99150 }, { "epoch": 18.196916865479906, "grad_norm": 0.0016121537191793323, "learning_rate": 2.4559910416840413e-07, "loss": 0.0, "num_input_tokens_seen": 213842176, "step": 99155 }, { "epoch": 18.197834465039456, "grad_norm": 0.0072929747402668, "learning_rate": 2.4535128356219075e-07, "loss": 0.0, "num_input_tokens_seen": 213852480, "step": 99160 }, { "epoch": 18.19875206459901, "grad_norm": 0.006527238991111517, "learning_rate": 2.451035849053496e-07, "loss": 0.0, "num_input_tokens_seen": 213862528, "step": 99165 }, { "epoch": 18.199669664158563, "grad_norm": 0.0015707206912338734, "learning_rate": 2.4485600820423114e-07, "loss": 0.0003, "num_input_tokens_seen": 213874176, "step": 99170 }, { "epoch": 18.200587263718113, "grad_norm": 0.04063817486166954, "learning_rate": 2.4460855346518706e-07, "loss": 0.0, "num_input_tokens_seen": 213885344, "step": 99175 }, { "epoch": 18.201504863277666, "grad_norm": 0.00046076090075075626, "learning_rate": 2.4436122069456223e-07, "loss": 0.0, "num_input_tokens_seen": 213895296, "step": 99180 }, { "epoch": 18.20242246283722, "grad_norm": 0.017503956332802773, "learning_rate": 2.4411400989870213e-07, "loss": 0.0079, "num_input_tokens_seen": 213906944, "step": 99185 }, { "epoch": 18.20334006239677, "grad_norm": 0.0006188242114149034, "learning_rate": 2.438669210839467e-07, "loss": 0.0, "num_input_tokens_seen": 213917248, "step": 99190 }, { "epoch": 18.204257661956323, "grad_norm": 0.0011229591909796, "learning_rate": 2.4361995425663367e-07, "loss": 0.0, "num_input_tokens_seen": 213929440, "step": 99195 }, { "epoch": 18.205175261515876, "grad_norm": 0.0016813792753964663, "learning_rate": 2.4337310942309734e-07, "loss": 0.0, "num_input_tokens_seen": 213940224, "step": 99200 }, { "epoch": 18.206092861075426, "grad_norm": 0.004864241927862167, "learning_rate": 2.4312638658966823e-07, "loss": 0.0, "num_input_tokens_seen": 213950336, "step": 99205 }, { "epoch": 18.20701046063498, "grad_norm": 0.012352264486253262, "learning_rate": 2.428797857626741e-07, "loss": 0.0, "num_input_tokens_seen": 213962080, "step": 99210 }, { "epoch": 18.207928060194533, "grad_norm": 0.009630316868424416, "learning_rate": 2.4263330694844156e-07, "loss": 0.0, "num_input_tokens_seen": 213972864, "step": 99215 }, { "epoch": 18.208845659754083, "grad_norm": 0.006319269072264433, "learning_rate": 2.423869501532916e-07, "loss": 0.0, "num_input_tokens_seen": 213984608, "step": 99220 }, { "epoch": 18.209763259313636, "grad_norm": 0.0021643887739628553, "learning_rate": 2.42140715383542e-07, "loss": 0.0, "num_input_tokens_seen": 213995424, "step": 99225 }, { "epoch": 18.21068085887319, "grad_norm": 0.004748830106109381, "learning_rate": 2.418946026455088e-07, "loss": 0.0, "num_input_tokens_seen": 214006336, "step": 99230 }, { "epoch": 18.21159845843274, "grad_norm": 0.07738978415727615, "learning_rate": 2.416486119455053e-07, "loss": 0.0, "num_input_tokens_seen": 214017056, "step": 99235 }, { "epoch": 18.212516057992293, "grad_norm": 0.0022151339799165726, "learning_rate": 2.4140274328984025e-07, "loss": 0.0001, "num_input_tokens_seen": 214028512, "step": 99240 }, { "epoch": 18.213433657551846, "grad_norm": 0.003386304248124361, "learning_rate": 2.411569966848193e-07, "loss": 0.0, "num_input_tokens_seen": 214038304, "step": 99245 }, { "epoch": 18.214351257111396, "grad_norm": 0.0018410860793665051, "learning_rate": 2.4091137213674564e-07, "loss": 0.0, "num_input_tokens_seen": 214048288, "step": 99250 }, { "epoch": 18.21526885667095, "grad_norm": 0.00035468945861794055, "learning_rate": 2.4066586965191985e-07, "loss": 0.0, "num_input_tokens_seen": 214059872, "step": 99255 }, { "epoch": 18.216186456230503, "grad_norm": 0.005569640547037125, "learning_rate": 2.404204892366385e-07, "loss": 0.0, "num_input_tokens_seen": 214071104, "step": 99260 }, { "epoch": 18.217104055790053, "grad_norm": 0.027355115860700607, "learning_rate": 2.4017523089719385e-07, "loss": 0.0, "num_input_tokens_seen": 214082496, "step": 99265 }, { "epoch": 18.218021655349606, "grad_norm": 0.0067952945828437805, "learning_rate": 2.399300946398786e-07, "loss": 0.0002, "num_input_tokens_seen": 214093600, "step": 99270 }, { "epoch": 18.21893925490916, "grad_norm": 0.0014048820594325662, "learning_rate": 2.3968508047097826e-07, "loss": 0.0, "num_input_tokens_seen": 214104384, "step": 99275 }, { "epoch": 18.21985685446871, "grad_norm": 2.9139912128448486, "learning_rate": 2.3944018839677784e-07, "loss": 0.0001, "num_input_tokens_seen": 214114144, "step": 99280 }, { "epoch": 18.220774454028263, "grad_norm": 0.0021646691020578146, "learning_rate": 2.3919541842355843e-07, "loss": 0.0, "num_input_tokens_seen": 214124192, "step": 99285 }, { "epoch": 18.221692053587816, "grad_norm": 0.018914366140961647, "learning_rate": 2.389507705575983e-07, "loss": 0.0, "num_input_tokens_seen": 214135264, "step": 99290 }, { "epoch": 18.222609653147366, "grad_norm": 0.0129140829667449, "learning_rate": 2.3870624480517134e-07, "loss": 0.0, "num_input_tokens_seen": 214146496, "step": 99295 }, { "epoch": 18.22352725270692, "grad_norm": 0.00042483050492592156, "learning_rate": 2.3846184117255034e-07, "loss": 0.0, "num_input_tokens_seen": 214156864, "step": 99300 }, { "epoch": 18.224444852266473, "grad_norm": 0.00133827468380332, "learning_rate": 2.3821755966600357e-07, "loss": 0.0, "num_input_tokens_seen": 214167776, "step": 99305 }, { "epoch": 18.225362451826022, "grad_norm": 0.0011864337138831615, "learning_rate": 2.3797340029179605e-07, "loss": 0.0001, "num_input_tokens_seen": 214179264, "step": 99310 }, { "epoch": 18.226280051385576, "grad_norm": 0.0036479008849710226, "learning_rate": 2.3772936305618999e-07, "loss": 0.0002, "num_input_tokens_seen": 214190016, "step": 99315 }, { "epoch": 18.22719765094513, "grad_norm": 0.15149489045143127, "learning_rate": 2.3748544796544537e-07, "loss": 0.0, "num_input_tokens_seen": 214200096, "step": 99320 }, { "epoch": 18.22811525050468, "grad_norm": 0.01803789846599102, "learning_rate": 2.3724165502581774e-07, "loss": 0.0052, "num_input_tokens_seen": 214210112, "step": 99325 }, { "epoch": 18.229032850064232, "grad_norm": 0.001679604989476502, "learning_rate": 2.3699798424355936e-07, "loss": 0.0097, "num_input_tokens_seen": 214220928, "step": 99330 }, { "epoch": 18.229950449623786, "grad_norm": 0.007944930344820023, "learning_rate": 2.367544356249213e-07, "loss": 0.0, "num_input_tokens_seen": 214231168, "step": 99335 }, { "epoch": 18.230868049183336, "grad_norm": 0.004787420388311148, "learning_rate": 2.3651100917615021e-07, "loss": 0.0, "num_input_tokens_seen": 214241920, "step": 99340 }, { "epoch": 18.23178564874289, "grad_norm": 0.004287375137209892, "learning_rate": 2.3626770490348782e-07, "loss": 0.0588, "num_input_tokens_seen": 214251936, "step": 99345 }, { "epoch": 18.232703248302442, "grad_norm": 0.016916977241635323, "learning_rate": 2.3602452281317634e-07, "loss": 0.0, "num_input_tokens_seen": 214262752, "step": 99350 }, { "epoch": 18.233620847861992, "grad_norm": 0.00021961091260891408, "learning_rate": 2.3578146291145242e-07, "loss": 0.0, "num_input_tokens_seen": 214274784, "step": 99355 }, { "epoch": 18.234538447421546, "grad_norm": 0.0008359605562873185, "learning_rate": 2.3553852520455e-07, "loss": 0.0, "num_input_tokens_seen": 214286048, "step": 99360 }, { "epoch": 18.2354560469811, "grad_norm": 0.029491251334547997, "learning_rate": 2.3529570969869963e-07, "loss": 0.0, "num_input_tokens_seen": 214296800, "step": 99365 }, { "epoch": 18.23637364654065, "grad_norm": 0.019187066704034805, "learning_rate": 2.3505301640013022e-07, "loss": 0.0, "num_input_tokens_seen": 214307072, "step": 99370 }, { "epoch": 18.237291246100202, "grad_norm": 0.009210402145981789, "learning_rate": 2.3481044531506626e-07, "loss": 0.0207, "num_input_tokens_seen": 214317920, "step": 99375 }, { "epoch": 18.238208845659756, "grad_norm": 0.018822865560650826, "learning_rate": 2.3456799644972828e-07, "loss": 0.0, "num_input_tokens_seen": 214328864, "step": 99380 }, { "epoch": 18.239126445219306, "grad_norm": 0.006222471594810486, "learning_rate": 2.3432566981033577e-07, "loss": 0.1938, "num_input_tokens_seen": 214339456, "step": 99385 }, { "epoch": 18.24004404477886, "grad_norm": 0.0007484977832064033, "learning_rate": 2.3408346540310379e-07, "loss": 0.0001, "num_input_tokens_seen": 214350432, "step": 99390 }, { "epoch": 18.240961644338412, "grad_norm": 0.0017224116018041968, "learning_rate": 2.3384138323424455e-07, "loss": 0.0, "num_input_tokens_seen": 214360384, "step": 99395 }, { "epoch": 18.241879243897962, "grad_norm": 0.03614836931228638, "learning_rate": 2.3359942330996644e-07, "loss": 0.0, "num_input_tokens_seen": 214371360, "step": 99400 }, { "epoch": 18.242796843457516, "grad_norm": 0.0013908493565395474, "learning_rate": 2.3335758563647614e-07, "loss": 0.0, "num_input_tokens_seen": 214382368, "step": 99405 }, { "epoch": 18.24371444301707, "grad_norm": 0.001721281441859901, "learning_rate": 2.331158702199765e-07, "loss": 0.0001, "num_input_tokens_seen": 214393056, "step": 99410 }, { "epoch": 18.24463204257662, "grad_norm": 0.000926452805288136, "learning_rate": 2.32874277066667e-07, "loss": 0.0, "num_input_tokens_seen": 214403232, "step": 99415 }, { "epoch": 18.245549642136172, "grad_norm": 0.0038384173531085253, "learning_rate": 2.326328061827432e-07, "loss": 0.0, "num_input_tokens_seen": 214413120, "step": 99420 }, { "epoch": 18.246467241695726, "grad_norm": 0.0023358718026429415, "learning_rate": 2.3239145757439961e-07, "loss": 0.0, "num_input_tokens_seen": 214423552, "step": 99425 }, { "epoch": 18.247384841255275, "grad_norm": 0.00039313171873800457, "learning_rate": 2.321502312478252e-07, "loss": 0.0, "num_input_tokens_seen": 214434816, "step": 99430 }, { "epoch": 18.24830244081483, "grad_norm": 0.0013043892104178667, "learning_rate": 2.3190912720920888e-07, "loss": 0.0, "num_input_tokens_seen": 214444544, "step": 99435 }, { "epoch": 18.249220040374382, "grad_norm": 0.0004409948014654219, "learning_rate": 2.316681454647335e-07, "loss": 0.0, "num_input_tokens_seen": 214454944, "step": 99440 }, { "epoch": 18.250137639933932, "grad_norm": 0.00035415636375546455, "learning_rate": 2.3142728602057962e-07, "loss": 0.0, "num_input_tokens_seen": 214466240, "step": 99445 }, { "epoch": 18.251055239493486, "grad_norm": 0.3325973451137543, "learning_rate": 2.3118654888292458e-07, "loss": 0.0001, "num_input_tokens_seen": 214476000, "step": 99450 }, { "epoch": 18.25197283905304, "grad_norm": 0.01743900403380394, "learning_rate": 2.3094593405794453e-07, "loss": 0.0, "num_input_tokens_seen": 214487200, "step": 99455 }, { "epoch": 18.25289043861259, "grad_norm": 0.011057189665734768, "learning_rate": 2.3070544155180952e-07, "loss": 0.0001, "num_input_tokens_seen": 214497984, "step": 99460 }, { "epoch": 18.253808038172142, "grad_norm": 0.0012813190696761012, "learning_rate": 2.3046507137068797e-07, "loss": 0.0001, "num_input_tokens_seen": 214508320, "step": 99465 }, { "epoch": 18.254725637731696, "grad_norm": 0.0011587959015741944, "learning_rate": 2.3022482352074548e-07, "loss": 0.0, "num_input_tokens_seen": 214519328, "step": 99470 }, { "epoch": 18.255643237291245, "grad_norm": 0.0010084352688863873, "learning_rate": 2.2998469800814382e-07, "loss": 0.0, "num_input_tokens_seen": 214528128, "step": 99475 }, { "epoch": 18.2565608368508, "grad_norm": 0.028615746647119522, "learning_rate": 2.2974469483904138e-07, "loss": 0.0, "num_input_tokens_seen": 214539360, "step": 99480 }, { "epoch": 18.257478436410352, "grad_norm": 0.0015678085619583726, "learning_rate": 2.295048140195949e-07, "loss": 0.0, "num_input_tokens_seen": 214549664, "step": 99485 }, { "epoch": 18.258396035969902, "grad_norm": 0.003561969380825758, "learning_rate": 2.292650555559567e-07, "loss": 0.0, "num_input_tokens_seen": 214560736, "step": 99490 }, { "epoch": 18.259313635529455, "grad_norm": 0.10277245938777924, "learning_rate": 2.2902541945427514e-07, "loss": 0.0002, "num_input_tokens_seen": 214569760, "step": 99495 }, { "epoch": 18.26023123508901, "grad_norm": 0.003976739011704922, "learning_rate": 2.2878590572069702e-07, "loss": 0.0, "num_input_tokens_seen": 214580224, "step": 99500 }, { "epoch": 18.26114883464856, "grad_norm": 0.0025417166762053967, "learning_rate": 2.2854651436136633e-07, "loss": 0.0, "num_input_tokens_seen": 214591072, "step": 99505 }, { "epoch": 18.262066434208112, "grad_norm": 0.0007869977853260934, "learning_rate": 2.28307245382422e-07, "loss": 0.0, "num_input_tokens_seen": 214601856, "step": 99510 }, { "epoch": 18.262984033767665, "grad_norm": 0.0008126685861498117, "learning_rate": 2.2806809879000136e-07, "loss": 0.0, "num_input_tokens_seen": 214612128, "step": 99515 }, { "epoch": 18.263901633327215, "grad_norm": 0.2668035924434662, "learning_rate": 2.278290745902384e-07, "loss": 0.0001, "num_input_tokens_seen": 214623392, "step": 99520 }, { "epoch": 18.26481923288677, "grad_norm": 0.000949936977121979, "learning_rate": 2.2759017278926377e-07, "loss": 0.0, "num_input_tokens_seen": 214634848, "step": 99525 }, { "epoch": 18.265736832446322, "grad_norm": 0.016123276203870773, "learning_rate": 2.2735139339320366e-07, "loss": 0.0, "num_input_tokens_seen": 214646144, "step": 99530 }, { "epoch": 18.266654432005872, "grad_norm": 0.0791386216878891, "learning_rate": 2.2711273640818433e-07, "loss": 0.0, "num_input_tokens_seen": 214656256, "step": 99535 }, { "epoch": 18.267572031565425, "grad_norm": 0.0008767378167249262, "learning_rate": 2.2687420184032583e-07, "loss": 0.0, "num_input_tokens_seen": 214667488, "step": 99540 }, { "epoch": 18.26848963112498, "grad_norm": 0.0038752395194023848, "learning_rate": 2.266357896957466e-07, "loss": 0.0, "num_input_tokens_seen": 214678272, "step": 99545 }, { "epoch": 18.26940723068453, "grad_norm": 0.007218678016215563, "learning_rate": 2.263974999805607e-07, "loss": 0.001, "num_input_tokens_seen": 214689568, "step": 99550 }, { "epoch": 18.270324830244082, "grad_norm": 0.002622386673465371, "learning_rate": 2.2615933270088098e-07, "loss": 0.0, "num_input_tokens_seen": 214700384, "step": 99555 }, { "epoch": 18.271242429803635, "grad_norm": 0.013776563107967377, "learning_rate": 2.259212878628153e-07, "loss": 0.0, "num_input_tokens_seen": 214711744, "step": 99560 }, { "epoch": 18.272160029363185, "grad_norm": 0.001802499289624393, "learning_rate": 2.256833654724694e-07, "loss": 0.0, "num_input_tokens_seen": 214722240, "step": 99565 }, { "epoch": 18.27307762892274, "grad_norm": 0.0036844986025243998, "learning_rate": 2.254455655359461e-07, "loss": 0.0, "num_input_tokens_seen": 214732704, "step": 99570 }, { "epoch": 18.273995228482292, "grad_norm": 0.0016613492043688893, "learning_rate": 2.252078880593439e-07, "loss": 0.0, "num_input_tokens_seen": 214744576, "step": 99575 }, { "epoch": 18.274912828041842, "grad_norm": 0.0018861287971958518, "learning_rate": 2.2497033304875903e-07, "loss": 0.0, "num_input_tokens_seen": 214756736, "step": 99580 }, { "epoch": 18.275830427601395, "grad_norm": 0.0006113802082836628, "learning_rate": 2.247329005102844e-07, "loss": 0.0, "num_input_tokens_seen": 214767872, "step": 99585 }, { "epoch": 18.27674802716095, "grad_norm": 0.0007229563198052347, "learning_rate": 2.2449559045001012e-07, "loss": 0.0, "num_input_tokens_seen": 214776800, "step": 99590 }, { "epoch": 18.2776656267205, "grad_norm": 0.003936468157917261, "learning_rate": 2.2425840287402246e-07, "loss": 0.0, "num_input_tokens_seen": 214788000, "step": 99595 }, { "epoch": 18.278583226280052, "grad_norm": 0.043438881635665894, "learning_rate": 2.2402133778840484e-07, "loss": 0.0, "num_input_tokens_seen": 214799168, "step": 99600 }, { "epoch": 18.279500825839605, "grad_norm": 0.005268283654004335, "learning_rate": 2.23784395199238e-07, "loss": 0.0, "num_input_tokens_seen": 214810464, "step": 99605 }, { "epoch": 18.280418425399155, "grad_norm": 0.0043062856420874596, "learning_rate": 2.2354757511259927e-07, "loss": 0.0001, "num_input_tokens_seen": 214822368, "step": 99610 }, { "epoch": 18.28133602495871, "grad_norm": 0.10270769894123077, "learning_rate": 2.2331087753456216e-07, "loss": 0.0, "num_input_tokens_seen": 214832448, "step": 99615 }, { "epoch": 18.282253624518262, "grad_norm": 0.008708532899618149, "learning_rate": 2.2307430247119788e-07, "loss": 0.0, "num_input_tokens_seen": 214843744, "step": 99620 }, { "epoch": 18.28317122407781, "grad_norm": 0.1061030700802803, "learning_rate": 2.2283784992857383e-07, "loss": 0.0, "num_input_tokens_seen": 214854848, "step": 99625 }, { "epoch": 18.284088823637365, "grad_norm": 0.3986675441265106, "learning_rate": 2.226015199127557e-07, "loss": 0.0001, "num_input_tokens_seen": 214865824, "step": 99630 }, { "epoch": 18.28500642319692, "grad_norm": 0.004783234558999538, "learning_rate": 2.2236531242980364e-07, "loss": 0.0014, "num_input_tokens_seen": 214876896, "step": 99635 }, { "epoch": 18.28592402275647, "grad_norm": 0.0011204455513507128, "learning_rate": 2.2212922748577725e-07, "loss": 0.0427, "num_input_tokens_seen": 214888160, "step": 99640 }, { "epoch": 18.28684162231602, "grad_norm": 0.008005650714039803, "learning_rate": 2.2189326508673114e-07, "loss": 0.0001, "num_input_tokens_seen": 214898656, "step": 99645 }, { "epoch": 18.287759221875575, "grad_norm": 0.012646994553506374, "learning_rate": 2.2165742523871603e-07, "loss": 0.0, "num_input_tokens_seen": 214909280, "step": 99650 }, { "epoch": 18.288676821435125, "grad_norm": 0.000965370680205524, "learning_rate": 2.2142170794778374e-07, "loss": 0.0, "num_input_tokens_seen": 214919360, "step": 99655 }, { "epoch": 18.28959442099468, "grad_norm": 0.00046887921052984893, "learning_rate": 2.211861132199783e-07, "loss": 0.1626, "num_input_tokens_seen": 214930176, "step": 99660 }, { "epoch": 18.29051202055423, "grad_norm": 0.0029404854867607355, "learning_rate": 2.2095064106134157e-07, "loss": 0.0, "num_input_tokens_seen": 214940832, "step": 99665 }, { "epoch": 18.29142962011378, "grad_norm": 0.0010521009098738432, "learning_rate": 2.207152914779148e-07, "loss": 0.0, "num_input_tokens_seen": 214952160, "step": 99670 }, { "epoch": 18.292347219673335, "grad_norm": 0.003393852384760976, "learning_rate": 2.2048006447573377e-07, "loss": 0.0, "num_input_tokens_seen": 214962880, "step": 99675 }, { "epoch": 18.29326481923289, "grad_norm": 0.0006888586794957519, "learning_rate": 2.202449600608314e-07, "loss": 0.0, "num_input_tokens_seen": 214974112, "step": 99680 }, { "epoch": 18.29418241879244, "grad_norm": 0.00666103744879365, "learning_rate": 2.200099782392373e-07, "loss": 0.0002, "num_input_tokens_seen": 214983616, "step": 99685 }, { "epoch": 18.29510001835199, "grad_norm": 0.006400640122592449, "learning_rate": 2.1977511901697947e-07, "loss": 0.0, "num_input_tokens_seen": 214993728, "step": 99690 }, { "epoch": 18.296017617911545, "grad_norm": 0.0066263689659535885, "learning_rate": 2.195403824000808e-07, "loss": 0.0, "num_input_tokens_seen": 215004032, "step": 99695 }, { "epoch": 18.296935217471095, "grad_norm": 0.0021745162084698677, "learning_rate": 2.1930576839456208e-07, "loss": 0.001, "num_input_tokens_seen": 215013152, "step": 99700 }, { "epoch": 18.29785281703065, "grad_norm": 0.0011489520547911525, "learning_rate": 2.190712770064418e-07, "loss": 0.0, "num_input_tokens_seen": 215022080, "step": 99705 }, { "epoch": 18.2987704165902, "grad_norm": 0.0006395181408151984, "learning_rate": 2.1883690824173354e-07, "loss": 0.0, "num_input_tokens_seen": 215031712, "step": 99710 }, { "epoch": 18.29968801614975, "grad_norm": 0.26624491810798645, "learning_rate": 2.18602662106448e-07, "loss": 0.0001, "num_input_tokens_seen": 215042752, "step": 99715 }, { "epoch": 18.300605615709305, "grad_norm": 0.0026988782919943333, "learning_rate": 2.1836853860659312e-07, "loss": 0.0, "num_input_tokens_seen": 215054016, "step": 99720 }, { "epoch": 18.30152321526886, "grad_norm": 0.00037395363324321806, "learning_rate": 2.1813453774817528e-07, "loss": 0.0, "num_input_tokens_seen": 215064160, "step": 99725 }, { "epoch": 18.302440814828408, "grad_norm": 0.011136166751384735, "learning_rate": 2.179006595371952e-07, "loss": 0.0, "num_input_tokens_seen": 215075200, "step": 99730 }, { "epoch": 18.30335841438796, "grad_norm": 0.00044939364306628704, "learning_rate": 2.1766690397965084e-07, "loss": 0.0, "num_input_tokens_seen": 215085856, "step": 99735 }, { "epoch": 18.304276013947515, "grad_norm": 0.0007515016477555037, "learning_rate": 2.174332710815391e-07, "loss": 0.0, "num_input_tokens_seen": 215096352, "step": 99740 }, { "epoch": 18.305193613507065, "grad_norm": 0.0008373263408429921, "learning_rate": 2.1719976084885186e-07, "loss": 0.0144, "num_input_tokens_seen": 215106496, "step": 99745 }, { "epoch": 18.306111213066618, "grad_norm": 0.00991752278059721, "learning_rate": 2.1696637328757707e-07, "loss": 0.0, "num_input_tokens_seen": 215116192, "step": 99750 }, { "epoch": 18.30702881262617, "grad_norm": 0.0004697747644968331, "learning_rate": 2.1673310840370277e-07, "loss": 0.0, "num_input_tokens_seen": 215126720, "step": 99755 }, { "epoch": 18.30794641218572, "grad_norm": 0.0009367287857457995, "learning_rate": 2.164999662032108e-07, "loss": 0.0, "num_input_tokens_seen": 215136736, "step": 99760 }, { "epoch": 18.308864011745275, "grad_norm": 0.00236879987642169, "learning_rate": 2.1626694669208082e-07, "loss": 0.0004, "num_input_tokens_seen": 215147232, "step": 99765 }, { "epoch": 18.309781611304828, "grad_norm": 0.0018377845408394933, "learning_rate": 2.1603404987628918e-07, "loss": 0.0, "num_input_tokens_seen": 215157696, "step": 99770 }, { "epoch": 18.310699210864378, "grad_norm": 0.002980533055961132, "learning_rate": 2.1580127576180998e-07, "loss": 0.0065, "num_input_tokens_seen": 215168768, "step": 99775 }, { "epoch": 18.31161681042393, "grad_norm": 0.011272242292761803, "learning_rate": 2.1556862435461344e-07, "loss": 0.0, "num_input_tokens_seen": 215179776, "step": 99780 }, { "epoch": 18.312534409983485, "grad_norm": 0.000980100710876286, "learning_rate": 2.1533609566066594e-07, "loss": 0.0, "num_input_tokens_seen": 215191168, "step": 99785 }, { "epoch": 18.313452009543035, "grad_norm": 0.013749286532402039, "learning_rate": 2.1510368968593266e-07, "loss": 0.0002, "num_input_tokens_seen": 215202592, "step": 99790 }, { "epoch": 18.314369609102588, "grad_norm": 0.21845784783363342, "learning_rate": 2.1487140643637328e-07, "loss": 0.147, "num_input_tokens_seen": 215213600, "step": 99795 }, { "epoch": 18.31528720866214, "grad_norm": 0.014952726662158966, "learning_rate": 2.1463924591794581e-07, "loss": 0.0, "num_input_tokens_seen": 215225440, "step": 99800 }, { "epoch": 18.31620480822169, "grad_norm": 0.001000676304101944, "learning_rate": 2.144072081366061e-07, "loss": 0.0, "num_input_tokens_seen": 215237344, "step": 99805 }, { "epoch": 18.317122407781245, "grad_norm": 0.0007724332972429693, "learning_rate": 2.1417529309830376e-07, "loss": 0.0001, "num_input_tokens_seen": 215249344, "step": 99810 }, { "epoch": 18.318040007340798, "grad_norm": 0.001039920374751091, "learning_rate": 2.1394350080898795e-07, "loss": 0.0, "num_input_tokens_seen": 215259520, "step": 99815 }, { "epoch": 18.318957606900348, "grad_norm": 0.0016714282101020217, "learning_rate": 2.1371183127460337e-07, "loss": 0.0, "num_input_tokens_seen": 215268800, "step": 99820 }, { "epoch": 18.3198752064599, "grad_norm": 0.02021147310733795, "learning_rate": 2.134802845010925e-07, "loss": 0.0, "num_input_tokens_seen": 215279584, "step": 99825 }, { "epoch": 18.320792806019455, "grad_norm": 0.0007088620914146304, "learning_rate": 2.1324886049439442e-07, "loss": 0.0, "num_input_tokens_seen": 215290816, "step": 99830 }, { "epoch": 18.321710405579005, "grad_norm": 0.002847075229510665, "learning_rate": 2.1301755926044386e-07, "loss": 0.0, "num_input_tokens_seen": 215301216, "step": 99835 }, { "epoch": 18.322628005138558, "grad_norm": 0.0039939237758517265, "learning_rate": 2.127863808051739e-07, "loss": 0.0, "num_input_tokens_seen": 215312352, "step": 99840 }, { "epoch": 18.32354560469811, "grad_norm": 0.005387315060943365, "learning_rate": 2.1255532513451304e-07, "loss": 0.0001, "num_input_tokens_seen": 215322976, "step": 99845 }, { "epoch": 18.32446320425766, "grad_norm": 0.2253834754228592, "learning_rate": 2.1232439225438883e-07, "loss": 0.0001, "num_input_tokens_seen": 215332896, "step": 99850 }, { "epoch": 18.325380803817215, "grad_norm": 0.010644475929439068, "learning_rate": 2.1209358217072374e-07, "loss": 0.0, "num_input_tokens_seen": 215343136, "step": 99855 }, { "epoch": 18.326298403376768, "grad_norm": 0.0061900438740849495, "learning_rate": 2.1186289488943746e-07, "loss": 0.0001, "num_input_tokens_seen": 215354176, "step": 99860 }, { "epoch": 18.327216002936318, "grad_norm": 0.0014411596348509192, "learning_rate": 2.1163233041644749e-07, "loss": 0.0008, "num_input_tokens_seen": 215365056, "step": 99865 }, { "epoch": 18.32813360249587, "grad_norm": 0.000957743963226676, "learning_rate": 2.1140188875766575e-07, "loss": 0.0, "num_input_tokens_seen": 215374432, "step": 99870 }, { "epoch": 18.329051202055425, "grad_norm": 0.002879425650462508, "learning_rate": 2.1117156991900534e-07, "loss": 0.0, "num_input_tokens_seen": 215385120, "step": 99875 }, { "epoch": 18.329968801614974, "grad_norm": 0.00212383852340281, "learning_rate": 2.1094137390637148e-07, "loss": 0.0, "num_input_tokens_seen": 215394944, "step": 99880 }, { "epoch": 18.330886401174528, "grad_norm": 0.001188394264318049, "learning_rate": 2.1071130072566836e-07, "loss": 0.0, "num_input_tokens_seen": 215405184, "step": 99885 }, { "epoch": 18.33180400073408, "grad_norm": 0.011856642551720142, "learning_rate": 2.1048135038279848e-07, "loss": 0.0, "num_input_tokens_seen": 215415296, "step": 99890 }, { "epoch": 18.33272160029363, "grad_norm": 0.0014405437977984548, "learning_rate": 2.102515228836588e-07, "loss": 0.0, "num_input_tokens_seen": 215425536, "step": 99895 }, { "epoch": 18.333639199853184, "grad_norm": 0.0015240359352901578, "learning_rate": 2.1002181823414458e-07, "loss": 0.0, "num_input_tokens_seen": 215436128, "step": 99900 }, { "epoch": 18.334556799412738, "grad_norm": 0.006611634511500597, "learning_rate": 2.0979223644014557e-07, "loss": 0.0, "num_input_tokens_seen": 215447008, "step": 99905 }, { "epoch": 18.335474398972288, "grad_norm": 0.0030414096545428038, "learning_rate": 2.0956277750755262e-07, "loss": 0.0001, "num_input_tokens_seen": 215459296, "step": 99910 }, { "epoch": 18.33639199853184, "grad_norm": 0.0010324838804081082, "learning_rate": 2.0933344144224988e-07, "loss": 0.0, "num_input_tokens_seen": 215469312, "step": 99915 }, { "epoch": 18.337309598091394, "grad_norm": 0.04747201129794121, "learning_rate": 2.0910422825011877e-07, "loss": 0.0, "num_input_tokens_seen": 215480416, "step": 99920 }, { "epoch": 18.338227197650944, "grad_norm": 0.0026544274296611547, "learning_rate": 2.088751379370396e-07, "loss": 0.0, "num_input_tokens_seen": 215490752, "step": 99925 }, { "epoch": 18.339144797210498, "grad_norm": 0.008547734469175339, "learning_rate": 2.086461705088877e-07, "loss": 0.1626, "num_input_tokens_seen": 215502976, "step": 99930 }, { "epoch": 18.34006239677005, "grad_norm": 0.0011721611954271793, "learning_rate": 2.0841732597153498e-07, "loss": 0.0, "num_input_tokens_seen": 215513824, "step": 99935 }, { "epoch": 18.3409799963296, "grad_norm": 0.00037443332257680595, "learning_rate": 2.0818860433085232e-07, "loss": 0.0, "num_input_tokens_seen": 215525056, "step": 99940 }, { "epoch": 18.341897595889154, "grad_norm": 0.010740518569946289, "learning_rate": 2.0796000559270501e-07, "loss": 0.0001, "num_input_tokens_seen": 215535488, "step": 99945 }, { "epoch": 18.342815195448708, "grad_norm": 0.0008257973822765052, "learning_rate": 2.0773152976295673e-07, "loss": 0.0, "num_input_tokens_seen": 215546784, "step": 99950 }, { "epoch": 18.343732795008258, "grad_norm": 0.007104033604264259, "learning_rate": 2.0750317684746669e-07, "loss": 0.0, "num_input_tokens_seen": 215557760, "step": 99955 }, { "epoch": 18.34465039456781, "grad_norm": 0.0016051706625148654, "learning_rate": 2.072749468520935e-07, "loss": 0.0, "num_input_tokens_seen": 215567264, "step": 99960 }, { "epoch": 18.345567994127364, "grad_norm": 0.07998736202716827, "learning_rate": 2.0704683978268913e-07, "loss": 0.0, "num_input_tokens_seen": 215577472, "step": 99965 }, { "epoch": 18.346485593686914, "grad_norm": 0.006721511948853731, "learning_rate": 2.068188556451045e-07, "loss": 0.0, "num_input_tokens_seen": 215587680, "step": 99970 }, { "epoch": 18.347403193246468, "grad_norm": 0.003547585802152753, "learning_rate": 2.0659099444518827e-07, "loss": 0.0, "num_input_tokens_seen": 215598112, "step": 99975 }, { "epoch": 18.34832079280602, "grad_norm": 0.0025408416986465454, "learning_rate": 2.0636325618878406e-07, "loss": 0.0, "num_input_tokens_seen": 215607904, "step": 99980 }, { "epoch": 18.34923839236557, "grad_norm": 0.000662921171169728, "learning_rate": 2.061356408817322e-07, "loss": 0.0, "num_input_tokens_seen": 215618752, "step": 99985 }, { "epoch": 18.350155991925124, "grad_norm": 0.001103843911550939, "learning_rate": 2.0590814852987194e-07, "loss": 0.0006, "num_input_tokens_seen": 215629504, "step": 99990 }, { "epoch": 18.351073591484678, "grad_norm": 0.17237749695777893, "learning_rate": 2.056807791390375e-07, "loss": 0.0001, "num_input_tokens_seen": 215639104, "step": 99995 }, { "epoch": 18.351991191044227, "grad_norm": 0.0069947694428265095, "learning_rate": 2.054535327150603e-07, "loss": 0.0008, "num_input_tokens_seen": 215650432, "step": 100000 }, { "epoch": 18.35290879060378, "grad_norm": 15.038540840148926, "learning_rate": 2.0522640926376846e-07, "loss": 0.0032, "num_input_tokens_seen": 215661408, "step": 100005 }, { "epoch": 18.353826390163334, "grad_norm": 0.0728534683585167, "learning_rate": 2.04999408790989e-07, "loss": 0.0001, "num_input_tokens_seen": 215673216, "step": 100010 }, { "epoch": 18.354743989722884, "grad_norm": 0.008272519335150719, "learning_rate": 2.0477253130254283e-07, "loss": 0.0, "num_input_tokens_seen": 215683904, "step": 100015 }, { "epoch": 18.355661589282438, "grad_norm": 0.0011076341615989804, "learning_rate": 2.045457768042486e-07, "loss": 0.0, "num_input_tokens_seen": 215694976, "step": 100020 }, { "epoch": 18.35657918884199, "grad_norm": 0.0008807246340438724, "learning_rate": 2.0431914530192388e-07, "loss": 0.0, "num_input_tokens_seen": 215705120, "step": 100025 }, { "epoch": 18.35749678840154, "grad_norm": 0.0065014115534722805, "learning_rate": 2.0409263680138015e-07, "loss": 0.0, "num_input_tokens_seen": 215715360, "step": 100030 }, { "epoch": 18.358414387961094, "grad_norm": 0.0008236976573243737, "learning_rate": 2.0386625130842774e-07, "loss": 0.0001, "num_input_tokens_seen": 215725760, "step": 100035 }, { "epoch": 18.359331987520648, "grad_norm": 0.007433771155774593, "learning_rate": 2.0363998882887147e-07, "loss": 0.0, "num_input_tokens_seen": 215737120, "step": 100040 }, { "epoch": 18.360249587080197, "grad_norm": 0.009300637058913708, "learning_rate": 2.0341384936851672e-07, "loss": 0.001, "num_input_tokens_seen": 215748096, "step": 100045 }, { "epoch": 18.36116718663975, "grad_norm": 0.009392848238348961, "learning_rate": 2.0318783293316268e-07, "loss": 0.0011, "num_input_tokens_seen": 215759104, "step": 100050 }, { "epoch": 18.362084786199304, "grad_norm": 0.0014426123816519976, "learning_rate": 2.0296193952860643e-07, "loss": 0.0, "num_input_tokens_seen": 215770560, "step": 100055 }, { "epoch": 18.363002385758854, "grad_norm": 0.007807470858097076, "learning_rate": 2.0273616916064165e-07, "loss": 0.0, "num_input_tokens_seen": 215781824, "step": 100060 }, { "epoch": 18.363919985318407, "grad_norm": 0.0041268630884587765, "learning_rate": 2.0251052183505814e-07, "loss": 0.0, "num_input_tokens_seen": 215793088, "step": 100065 }, { "epoch": 18.36483758487796, "grad_norm": 0.18591666221618652, "learning_rate": 2.022849975576452e-07, "loss": 0.0001, "num_input_tokens_seen": 215804384, "step": 100070 }, { "epoch": 18.36575518443751, "grad_norm": 0.010040665045380592, "learning_rate": 2.020595963341865e-07, "loss": 0.0023, "num_input_tokens_seen": 215814784, "step": 100075 }, { "epoch": 18.366672783997064, "grad_norm": 0.003261634847149253, "learning_rate": 2.0183431817046238e-07, "loss": 0.0, "num_input_tokens_seen": 215826688, "step": 100080 }, { "epoch": 18.367590383556617, "grad_norm": 0.032214969396591187, "learning_rate": 2.016091630722522e-07, "loss": 0.0003, "num_input_tokens_seen": 215837920, "step": 100085 }, { "epoch": 18.368507983116167, "grad_norm": 0.0014626802876591682, "learning_rate": 2.0138413104532904e-07, "loss": 0.0, "num_input_tokens_seen": 215848672, "step": 100090 }, { "epoch": 18.36942558267572, "grad_norm": 0.004466693848371506, "learning_rate": 2.0115922209546667e-07, "loss": 0.0, "num_input_tokens_seen": 215859232, "step": 100095 }, { "epoch": 18.370343182235274, "grad_norm": 0.006669644266366959, "learning_rate": 2.0093443622843267e-07, "loss": 0.0, "num_input_tokens_seen": 215869920, "step": 100100 }, { "epoch": 18.371260781794824, "grad_norm": 1.2829962968826294, "learning_rate": 2.0070977344999186e-07, "loss": 0.0, "num_input_tokens_seen": 215882016, "step": 100105 }, { "epoch": 18.372178381354377, "grad_norm": 0.002516436856240034, "learning_rate": 2.0048523376590745e-07, "loss": 0.0003, "num_input_tokens_seen": 215891872, "step": 100110 }, { "epoch": 18.37309598091393, "grad_norm": 0.0014460984384641051, "learning_rate": 2.0026081718193867e-07, "loss": 0.0, "num_input_tokens_seen": 215902336, "step": 100115 }, { "epoch": 18.37401358047348, "grad_norm": 0.001496853306889534, "learning_rate": 2.0003652370383985e-07, "loss": 0.0006, "num_input_tokens_seen": 215912064, "step": 100120 }, { "epoch": 18.374931180033034, "grad_norm": 0.0009824475273489952, "learning_rate": 1.9981235333736637e-07, "loss": 0.0, "num_input_tokens_seen": 215922592, "step": 100125 }, { "epoch": 18.375848779592587, "grad_norm": 0.0006643718224950135, "learning_rate": 1.9958830608826586e-07, "loss": 0.0, "num_input_tokens_seen": 215933152, "step": 100130 }, { "epoch": 18.376766379152137, "grad_norm": 0.1645764410495758, "learning_rate": 1.9936438196228535e-07, "loss": 0.0, "num_input_tokens_seen": 215944192, "step": 100135 }, { "epoch": 18.37768397871169, "grad_norm": 0.001002386910840869, "learning_rate": 1.9914058096516753e-07, "loss": 0.0, "num_input_tokens_seen": 215956096, "step": 100140 }, { "epoch": 18.378601578271244, "grad_norm": 0.005581335164606571, "learning_rate": 1.9891690310265388e-07, "loss": 0.0944, "num_input_tokens_seen": 215968352, "step": 100145 }, { "epoch": 18.379519177830794, "grad_norm": 0.0021939296275377274, "learning_rate": 1.9869334838048038e-07, "loss": 0.0002, "num_input_tokens_seen": 215980096, "step": 100150 }, { "epoch": 18.380436777390347, "grad_norm": 0.004614022094756365, "learning_rate": 1.9846991680438078e-07, "loss": 0.0, "num_input_tokens_seen": 215990976, "step": 100155 }, { "epoch": 18.3813543769499, "grad_norm": 0.0014355439925566316, "learning_rate": 1.9824660838008658e-07, "loss": 0.0, "num_input_tokens_seen": 216002208, "step": 100160 }, { "epoch": 18.38227197650945, "grad_norm": 0.010850794613361359, "learning_rate": 1.980234231133249e-07, "loss": 0.0, "num_input_tokens_seen": 216012320, "step": 100165 }, { "epoch": 18.383189576069004, "grad_norm": 0.0010486375540494919, "learning_rate": 1.9780036100981946e-07, "loss": 0.0, "num_input_tokens_seen": 216025120, "step": 100170 }, { "epoch": 18.384107175628557, "grad_norm": 2.359900712966919, "learning_rate": 1.9757742207529296e-07, "loss": 0.0005, "num_input_tokens_seen": 216036992, "step": 100175 }, { "epoch": 18.385024775188107, "grad_norm": 0.010760163888335228, "learning_rate": 1.973546063154619e-07, "loss": 0.0, "num_input_tokens_seen": 216047360, "step": 100180 }, { "epoch": 18.38594237474766, "grad_norm": 0.020560372620821, "learning_rate": 1.9713191373604225e-07, "loss": 0.0, "num_input_tokens_seen": 216058400, "step": 100185 }, { "epoch": 18.386859974307214, "grad_norm": 0.0036155679263174534, "learning_rate": 1.9690934434274445e-07, "loss": 0.0, "num_input_tokens_seen": 216068832, "step": 100190 }, { "epoch": 18.387777573866764, "grad_norm": 0.011215955018997192, "learning_rate": 1.966868981412784e-07, "loss": 0.0, "num_input_tokens_seen": 216078976, "step": 100195 }, { "epoch": 18.388695173426317, "grad_norm": 0.0012577250599861145, "learning_rate": 1.9646457513734896e-07, "loss": 0.0, "num_input_tokens_seen": 216089280, "step": 100200 }, { "epoch": 18.38961277298587, "grad_norm": 0.0009609548142179847, "learning_rate": 1.962423753366577e-07, "loss": 0.0, "num_input_tokens_seen": 216100256, "step": 100205 }, { "epoch": 18.39053037254542, "grad_norm": 0.001288735307753086, "learning_rate": 1.9602029874490502e-07, "loss": 0.0003, "num_input_tokens_seen": 216110880, "step": 100210 }, { "epoch": 18.391447972104974, "grad_norm": 0.005143612623214722, "learning_rate": 1.9579834536778642e-07, "loss": 0.0, "num_input_tokens_seen": 216121280, "step": 100215 }, { "epoch": 18.392365571664527, "grad_norm": 0.04276218265295029, "learning_rate": 1.9557651521099397e-07, "loss": 0.0, "num_input_tokens_seen": 216132288, "step": 100220 }, { "epoch": 18.393283171224077, "grad_norm": 0.001271514454856515, "learning_rate": 1.9535480828021757e-07, "loss": 0.0, "num_input_tokens_seen": 216143296, "step": 100225 }, { "epoch": 18.39420077078363, "grad_norm": 0.005472711753100157, "learning_rate": 1.9513322458114438e-07, "loss": 0.1813, "num_input_tokens_seen": 216154304, "step": 100230 }, { "epoch": 18.395118370343184, "grad_norm": 0.0004167113220319152, "learning_rate": 1.94911764119457e-07, "loss": 0.0, "num_input_tokens_seen": 216164640, "step": 100235 }, { "epoch": 18.396035969902734, "grad_norm": 0.0014705201610922813, "learning_rate": 1.9469042690083483e-07, "loss": 0.0, "num_input_tokens_seen": 216175168, "step": 100240 }, { "epoch": 18.396953569462287, "grad_norm": 0.3312124013900757, "learning_rate": 1.9446921293095667e-07, "loss": 0.0002, "num_input_tokens_seen": 216187136, "step": 100245 }, { "epoch": 18.39787116902184, "grad_norm": 0.008112393319606781, "learning_rate": 1.942481222154946e-07, "loss": 0.0, "num_input_tokens_seen": 216197152, "step": 100250 }, { "epoch": 18.39878876858139, "grad_norm": 0.23152227699756622, "learning_rate": 1.940271547601208e-07, "loss": 0.0001, "num_input_tokens_seen": 216207488, "step": 100255 }, { "epoch": 18.399706368140944, "grad_norm": 0.02167489379644394, "learning_rate": 1.938063105705007e-07, "loss": 0.0, "num_input_tokens_seen": 216217952, "step": 100260 }, { "epoch": 18.400623967700497, "grad_norm": 0.010429240763187408, "learning_rate": 1.935855896523009e-07, "loss": 0.0001, "num_input_tokens_seen": 216228768, "step": 100265 }, { "epoch": 18.401541567260047, "grad_norm": 0.001624930533580482, "learning_rate": 1.9336499201118076e-07, "loss": 0.0, "num_input_tokens_seen": 216238912, "step": 100270 }, { "epoch": 18.4024591668196, "grad_norm": 0.001773879979737103, "learning_rate": 1.9314451765279963e-07, "loss": 0.0, "num_input_tokens_seen": 216249696, "step": 100275 }, { "epoch": 18.403376766379154, "grad_norm": 0.06362852454185486, "learning_rate": 1.9292416658281132e-07, "loss": 0.0, "num_input_tokens_seen": 216260128, "step": 100280 }, { "epoch": 18.404294365938703, "grad_norm": 0.0013017421588301659, "learning_rate": 1.9270393880686798e-07, "loss": 0.0, "num_input_tokens_seen": 216271936, "step": 100285 }, { "epoch": 18.405211965498257, "grad_norm": 0.001155270729213953, "learning_rate": 1.9248383433061734e-07, "loss": 0.0, "num_input_tokens_seen": 216283040, "step": 100290 }, { "epoch": 18.40612956505781, "grad_norm": 0.13363000750541687, "learning_rate": 1.9226385315970597e-07, "loss": 0.0, "num_input_tokens_seen": 216294880, "step": 100295 }, { "epoch": 18.40704716461736, "grad_norm": 0.007182112894952297, "learning_rate": 1.9204399529977547e-07, "loss": 0.0, "num_input_tokens_seen": 216306752, "step": 100300 }, { "epoch": 18.407964764176914, "grad_norm": 0.0032185588497668505, "learning_rate": 1.918242607564641e-07, "loss": 0.0, "num_input_tokens_seen": 216316992, "step": 100305 }, { "epoch": 18.408882363736467, "grad_norm": 0.06276591867208481, "learning_rate": 1.9160464953540958e-07, "loss": 0.0, "num_input_tokens_seen": 216328448, "step": 100310 }, { "epoch": 18.409799963296017, "grad_norm": 274.07073974609375, "learning_rate": 1.9138516164224298e-07, "loss": 0.2313, "num_input_tokens_seen": 216338528, "step": 100315 }, { "epoch": 18.41071756285557, "grad_norm": 0.001763076987117529, "learning_rate": 1.911657970825942e-07, "loss": 0.0001, "num_input_tokens_seen": 216349696, "step": 100320 }, { "epoch": 18.411635162415124, "grad_norm": 0.0012551648542284966, "learning_rate": 1.9094655586208932e-07, "loss": 0.0, "num_input_tokens_seen": 216362048, "step": 100325 }, { "epoch": 18.412552761974673, "grad_norm": 0.006643407978117466, "learning_rate": 1.9072743798635275e-07, "loss": 0.0, "num_input_tokens_seen": 216371488, "step": 100330 }, { "epoch": 18.413470361534227, "grad_norm": 0.03142411261796951, "learning_rate": 1.9050844346100329e-07, "loss": 0.0, "num_input_tokens_seen": 216382848, "step": 100335 }, { "epoch": 18.41438796109378, "grad_norm": 0.0030533636454492807, "learning_rate": 1.9028957229165756e-07, "loss": 0.0, "num_input_tokens_seen": 216394368, "step": 100340 }, { "epoch": 18.41530556065333, "grad_norm": 0.004519413225352764, "learning_rate": 1.900708244839311e-07, "loss": 0.0, "num_input_tokens_seen": 216404256, "step": 100345 }, { "epoch": 18.416223160212883, "grad_norm": 0.0009613633155822754, "learning_rate": 1.8985220004343274e-07, "loss": 0.0005, "num_input_tokens_seen": 216414720, "step": 100350 }, { "epoch": 18.417140759772437, "grad_norm": 0.0007593695190735161, "learning_rate": 1.8963369897577076e-07, "loss": 0.0, "num_input_tokens_seen": 216425600, "step": 100355 }, { "epoch": 18.418058359331987, "grad_norm": 0.0007493634475395083, "learning_rate": 1.8941532128654794e-07, "loss": 0.0, "num_input_tokens_seen": 216436448, "step": 100360 }, { "epoch": 18.41897595889154, "grad_norm": 0.0014800593489781022, "learning_rate": 1.8919706698136753e-07, "loss": 0.0001, "num_input_tokens_seen": 216446752, "step": 100365 }, { "epoch": 18.419893558451093, "grad_norm": 0.005904957186430693, "learning_rate": 1.8897893606582562e-07, "loss": 0.0001, "num_input_tokens_seen": 216458592, "step": 100370 }, { "epoch": 18.420811158010643, "grad_norm": 0.0012654840247705579, "learning_rate": 1.8876092854551776e-07, "loss": 0.0, "num_input_tokens_seen": 216469472, "step": 100375 }, { "epoch": 18.421728757570197, "grad_norm": 0.001017279108054936, "learning_rate": 1.8854304442603555e-07, "loss": 0.0334, "num_input_tokens_seen": 216480480, "step": 100380 }, { "epoch": 18.42264635712975, "grad_norm": 0.007657642476260662, "learning_rate": 1.8832528371296733e-07, "loss": 0.132, "num_input_tokens_seen": 216491424, "step": 100385 }, { "epoch": 18.4235639566893, "grad_norm": 0.005376006476581097, "learning_rate": 1.8810764641189695e-07, "loss": 0.0, "num_input_tokens_seen": 216502112, "step": 100390 }, { "epoch": 18.424481556248853, "grad_norm": 0.004023566376417875, "learning_rate": 1.8789013252840882e-07, "loss": 0.0, "num_input_tokens_seen": 216514112, "step": 100395 }, { "epoch": 18.425399155808407, "grad_norm": 0.004063790198415518, "learning_rate": 1.8767274206808073e-07, "loss": 0.0, "num_input_tokens_seen": 216524800, "step": 100400 }, { "epoch": 18.426316755367957, "grad_norm": 0.0077999127097427845, "learning_rate": 1.874554750364882e-07, "loss": 0.0, "num_input_tokens_seen": 216534784, "step": 100405 }, { "epoch": 18.42723435492751, "grad_norm": 0.0007521247607655823, "learning_rate": 1.8723833143920345e-07, "loss": 0.0, "num_input_tokens_seen": 216544640, "step": 100410 }, { "epoch": 18.428151954487063, "grad_norm": 0.007475764956325293, "learning_rate": 1.8702131128179702e-07, "loss": 0.0, "num_input_tokens_seen": 216556160, "step": 100415 }, { "epoch": 18.429069554046613, "grad_norm": 0.001458787708543241, "learning_rate": 1.8680441456983446e-07, "loss": 0.0, "num_input_tokens_seen": 216567360, "step": 100420 }, { "epoch": 18.429987153606167, "grad_norm": 0.007472719997167587, "learning_rate": 1.8658764130887853e-07, "loss": 0.0, "num_input_tokens_seen": 216578176, "step": 100425 }, { "epoch": 18.43090475316572, "grad_norm": 0.003807791043072939, "learning_rate": 1.863709915044898e-07, "loss": 0.0, "num_input_tokens_seen": 216589248, "step": 100430 }, { "epoch": 18.43182235272527, "grad_norm": 16.535030364990234, "learning_rate": 1.861544651622249e-07, "loss": 0.0032, "num_input_tokens_seen": 216599968, "step": 100435 }, { "epoch": 18.432739952284823, "grad_norm": 0.0006683430401608348, "learning_rate": 1.859380622876361e-07, "loss": 0.001, "num_input_tokens_seen": 216611744, "step": 100440 }, { "epoch": 18.433657551844377, "grad_norm": 0.017810139805078506, "learning_rate": 1.8572178288627617e-07, "loss": 0.0, "num_input_tokens_seen": 216622848, "step": 100445 }, { "epoch": 18.434575151403926, "grad_norm": 0.0009690466686151922, "learning_rate": 1.8550562696369068e-07, "loss": 0.0, "num_input_tokens_seen": 216632320, "step": 100450 }, { "epoch": 18.43549275096348, "grad_norm": 0.010820173658430576, "learning_rate": 1.8528959452542406e-07, "loss": 0.0, "num_input_tokens_seen": 216643936, "step": 100455 }, { "epoch": 18.436410350523033, "grad_norm": 0.0011485220165923238, "learning_rate": 1.8507368557701687e-07, "loss": 0.0, "num_input_tokens_seen": 216655264, "step": 100460 }, { "epoch": 18.437327950082583, "grad_norm": 0.021929319947957993, "learning_rate": 1.848579001240075e-07, "loss": 0.0, "num_input_tokens_seen": 216665952, "step": 100465 }, { "epoch": 18.438245549642136, "grad_norm": 0.00938921794295311, "learning_rate": 1.8464223817193039e-07, "loss": 0.0, "num_input_tokens_seen": 216676416, "step": 100470 }, { "epoch": 18.43916314920169, "grad_norm": 0.0013895935844630003, "learning_rate": 1.8442669972631665e-07, "loss": 0.0, "num_input_tokens_seen": 216687872, "step": 100475 }, { "epoch": 18.44008074876124, "grad_norm": 0.0025716982781887054, "learning_rate": 1.8421128479269357e-07, "loss": 0.0, "num_input_tokens_seen": 216698272, "step": 100480 }, { "epoch": 18.440998348320793, "grad_norm": 0.0006950312526896596, "learning_rate": 1.8399599337658836e-07, "loss": 0.0045, "num_input_tokens_seen": 216708992, "step": 100485 }, { "epoch": 18.441915947880346, "grad_norm": 0.0028155790641903877, "learning_rate": 1.837808254835216e-07, "loss": 0.0032, "num_input_tokens_seen": 216719232, "step": 100490 }, { "epoch": 18.442833547439896, "grad_norm": 0.0007376270368695259, "learning_rate": 1.8356578111901225e-07, "loss": 0.0, "num_input_tokens_seen": 216731328, "step": 100495 }, { "epoch": 18.44375114699945, "grad_norm": 0.0011029386660084128, "learning_rate": 1.8335086028857585e-07, "loss": 0.0, "num_input_tokens_seen": 216741728, "step": 100500 }, { "epoch": 18.444668746559003, "grad_norm": 0.019969025626778603, "learning_rate": 1.8313606299772468e-07, "loss": 0.0, "num_input_tokens_seen": 216752608, "step": 100505 }, { "epoch": 18.445586346118553, "grad_norm": 0.0018397633684799075, "learning_rate": 1.8292138925196767e-07, "loss": 0.0, "num_input_tokens_seen": 216764128, "step": 100510 }, { "epoch": 18.446503945678106, "grad_norm": 0.012838112190365791, "learning_rate": 1.8270683905681153e-07, "loss": 0.0, "num_input_tokens_seen": 216775040, "step": 100515 }, { "epoch": 18.44742154523766, "grad_norm": 0.0322430245578289, "learning_rate": 1.8249241241775906e-07, "loss": 0.0, "num_input_tokens_seen": 216785760, "step": 100520 }, { "epoch": 18.44833914479721, "grad_norm": 0.0003474969125818461, "learning_rate": 1.822781093403092e-07, "loss": 0.0, "num_input_tokens_seen": 216797408, "step": 100525 }, { "epoch": 18.449256744356763, "grad_norm": 0.06964940577745438, "learning_rate": 1.8206392982995924e-07, "loss": 0.001, "num_input_tokens_seen": 216808352, "step": 100530 }, { "epoch": 18.450174343916316, "grad_norm": 0.08238834887742996, "learning_rate": 1.818498738922031e-07, "loss": 0.0, "num_input_tokens_seen": 216819072, "step": 100535 }, { "epoch": 18.451091943475866, "grad_norm": 0.0031464158091694117, "learning_rate": 1.8163594153252972e-07, "loss": 0.0, "num_input_tokens_seen": 216830656, "step": 100540 }, { "epoch": 18.45200954303542, "grad_norm": 0.030843917280435562, "learning_rate": 1.8142213275642583e-07, "loss": 0.0, "num_input_tokens_seen": 216841824, "step": 100545 }, { "epoch": 18.452927142594973, "grad_norm": 0.0012422185391187668, "learning_rate": 1.8120844756937705e-07, "loss": 0.0, "num_input_tokens_seen": 216853600, "step": 100550 }, { "epoch": 18.453844742154523, "grad_norm": 0.0010235451627522707, "learning_rate": 1.8099488597686287e-07, "loss": 0.0, "num_input_tokens_seen": 216864576, "step": 100555 }, { "epoch": 18.454762341714076, "grad_norm": 0.00027992986724711955, "learning_rate": 1.8078144798436114e-07, "loss": 0.0, "num_input_tokens_seen": 216876064, "step": 100560 }, { "epoch": 18.45567994127363, "grad_norm": 0.0008603301830589771, "learning_rate": 1.8056813359734583e-07, "loss": 0.0, "num_input_tokens_seen": 216885440, "step": 100565 }, { "epoch": 18.45659754083318, "grad_norm": 0.010902618058025837, "learning_rate": 1.8035494282128918e-07, "loss": 0.0, "num_input_tokens_seen": 216896096, "step": 100570 }, { "epoch": 18.457515140392733, "grad_norm": 0.007553392089903355, "learning_rate": 1.8014187566165743e-07, "loss": 0.0, "num_input_tokens_seen": 216906592, "step": 100575 }, { "epoch": 18.458432739952286, "grad_norm": 0.0010863040806725621, "learning_rate": 1.7992893212391725e-07, "loss": 0.0, "num_input_tokens_seen": 216917024, "step": 100580 }, { "epoch": 18.459350339511836, "grad_norm": 0.0021973394323140383, "learning_rate": 1.7971611221352991e-07, "loss": 0.0, "num_input_tokens_seen": 216928512, "step": 100585 }, { "epoch": 18.46026793907139, "grad_norm": 0.0032686556223779917, "learning_rate": 1.7950341593595265e-07, "loss": 0.0, "num_input_tokens_seen": 216939680, "step": 100590 }, { "epoch": 18.461185538630943, "grad_norm": 0.1164197251200676, "learning_rate": 1.792908432966417e-07, "loss": 0.0, "num_input_tokens_seen": 216951328, "step": 100595 }, { "epoch": 18.462103138190493, "grad_norm": 0.001792636001482606, "learning_rate": 1.790783943010499e-07, "loss": 0.0, "num_input_tokens_seen": 216962400, "step": 100600 }, { "epoch": 18.463020737750046, "grad_norm": 0.0010045290691778064, "learning_rate": 1.788660689546251e-07, "loss": 0.0, "num_input_tokens_seen": 216974016, "step": 100605 }, { "epoch": 18.4639383373096, "grad_norm": 0.0011097067035734653, "learning_rate": 1.7865386726281352e-07, "loss": 0.0, "num_input_tokens_seen": 216984704, "step": 100610 }, { "epoch": 18.46485593686915, "grad_norm": 0.0015703849494457245, "learning_rate": 1.7844178923105805e-07, "loss": 0.0, "num_input_tokens_seen": 216995648, "step": 100615 }, { "epoch": 18.465773536428703, "grad_norm": 0.005740107968449593, "learning_rate": 1.7822983486479762e-07, "loss": 0.0, "num_input_tokens_seen": 217006368, "step": 100620 }, { "epoch": 18.466691135988256, "grad_norm": 0.0022983881644904613, "learning_rate": 1.7801800416946902e-07, "loss": 0.0, "num_input_tokens_seen": 217015680, "step": 100625 }, { "epoch": 18.467608735547806, "grad_norm": 0.0045633274130523205, "learning_rate": 1.7780629715050512e-07, "loss": 0.0, "num_input_tokens_seen": 217027200, "step": 100630 }, { "epoch": 18.46852633510736, "grad_norm": 0.0007946513942442834, "learning_rate": 1.775947138133366e-07, "loss": 0.0001, "num_input_tokens_seen": 217036736, "step": 100635 }, { "epoch": 18.469443934666913, "grad_norm": 0.0005032007466070354, "learning_rate": 1.773832541633891e-07, "loss": 0.0, "num_input_tokens_seen": 217048224, "step": 100640 }, { "epoch": 18.470361534226463, "grad_norm": 0.000408514664741233, "learning_rate": 1.7717191820608604e-07, "loss": 0.0, "num_input_tokens_seen": 217059520, "step": 100645 }, { "epoch": 18.471279133786016, "grad_norm": 0.03823965787887573, "learning_rate": 1.7696070594684978e-07, "loss": 0.0, "num_input_tokens_seen": 217070784, "step": 100650 }, { "epoch": 18.47219673334557, "grad_norm": 0.004327031783759594, "learning_rate": 1.7674961739109597e-07, "loss": 0.0, "num_input_tokens_seen": 217081280, "step": 100655 }, { "epoch": 18.47311433290512, "grad_norm": 0.00045239206519909203, "learning_rate": 1.7653865254423863e-07, "loss": 0.0, "num_input_tokens_seen": 217092576, "step": 100660 }, { "epoch": 18.474031932464673, "grad_norm": 0.0011835714103654027, "learning_rate": 1.7632781141168953e-07, "loss": 0.0005, "num_input_tokens_seen": 217104032, "step": 100665 }, { "epoch": 18.474949532024226, "grad_norm": 0.0015305844135582447, "learning_rate": 1.7611709399885657e-07, "loss": 0.0, "num_input_tokens_seen": 217114976, "step": 100670 }, { "epoch": 18.475867131583776, "grad_norm": 0.01968279853463173, "learning_rate": 1.759065003111432e-07, "loss": 0.0, "num_input_tokens_seen": 217126880, "step": 100675 }, { "epoch": 18.47678473114333, "grad_norm": 0.011724993586540222, "learning_rate": 1.756960303539512e-07, "loss": 0.0, "num_input_tokens_seen": 217137632, "step": 100680 }, { "epoch": 18.477702330702883, "grad_norm": 0.0071720234118402, "learning_rate": 1.7548568413267964e-07, "loss": 0.0001, "num_input_tokens_seen": 217148800, "step": 100685 }, { "epoch": 18.478619930262433, "grad_norm": 0.0007033540168777108, "learning_rate": 1.7527546165272302e-07, "loss": 0.0, "num_input_tokens_seen": 217159680, "step": 100690 }, { "epoch": 18.479537529821986, "grad_norm": 0.015049451030790806, "learning_rate": 1.750653629194732e-07, "loss": 0.0478, "num_input_tokens_seen": 217170304, "step": 100695 }, { "epoch": 18.48045512938154, "grad_norm": 0.0013476330786943436, "learning_rate": 1.7485538793831858e-07, "loss": 0.0001, "num_input_tokens_seen": 217181024, "step": 100700 }, { "epoch": 18.48137272894109, "grad_norm": 0.004644996486604214, "learning_rate": 1.7464553671464434e-07, "loss": 0.0016, "num_input_tokens_seen": 217192256, "step": 100705 }, { "epoch": 18.482290328500643, "grad_norm": 0.0962112620472908, "learning_rate": 1.7443580925383397e-07, "loss": 0.0001, "num_input_tokens_seen": 217204032, "step": 100710 }, { "epoch": 18.483207928060196, "grad_norm": 0.0010832694824784994, "learning_rate": 1.7422620556126647e-07, "loss": 0.0001, "num_input_tokens_seen": 217213856, "step": 100715 }, { "epoch": 18.484125527619746, "grad_norm": 0.0874362364411354, "learning_rate": 1.7401672564231752e-07, "loss": 0.0, "num_input_tokens_seen": 217225152, "step": 100720 }, { "epoch": 18.4850431271793, "grad_norm": 0.012347395531833172, "learning_rate": 1.738073695023601e-07, "loss": 0.0, "num_input_tokens_seen": 217235008, "step": 100725 }, { "epoch": 18.485960726738853, "grad_norm": 0.10594785958528519, "learning_rate": 1.7359813714676266e-07, "loss": 0.0, "num_input_tokens_seen": 217246624, "step": 100730 }, { "epoch": 18.486878326298402, "grad_norm": 0.003058156231418252, "learning_rate": 1.7338902858089367e-07, "loss": 0.0, "num_input_tokens_seen": 217258016, "step": 100735 }, { "epoch": 18.487795925857956, "grad_norm": 0.0006937890429981053, "learning_rate": 1.7318004381011556e-07, "loss": 0.0001, "num_input_tokens_seen": 217266656, "step": 100740 }, { "epoch": 18.48871352541751, "grad_norm": 0.0020365531090646982, "learning_rate": 1.7297118283978731e-07, "loss": 0.0, "num_input_tokens_seen": 217276768, "step": 100745 }, { "epoch": 18.48963112497706, "grad_norm": 0.0017056778306141496, "learning_rate": 1.72762445675268e-07, "loss": 0.0001, "num_input_tokens_seen": 217287136, "step": 100750 }, { "epoch": 18.490548724536612, "grad_norm": 0.2236199527978897, "learning_rate": 1.7255383232191058e-07, "loss": 0.0001, "num_input_tokens_seen": 217298272, "step": 100755 }, { "epoch": 18.491466324096166, "grad_norm": 0.0006535235443152487, "learning_rate": 1.7234534278506465e-07, "loss": 0.0, "num_input_tokens_seen": 217309984, "step": 100760 }, { "epoch": 18.492383923655716, "grad_norm": 0.16156533360481262, "learning_rate": 1.7213697707007927e-07, "loss": 0.0, "num_input_tokens_seen": 217320352, "step": 100765 }, { "epoch": 18.49330152321527, "grad_norm": 0.0016173311742022634, "learning_rate": 1.7192873518229792e-07, "loss": 0.0, "num_input_tokens_seen": 217331232, "step": 100770 }, { "epoch": 18.494219122774822, "grad_norm": 0.0015713232569396496, "learning_rate": 1.717206171270619e-07, "loss": 0.0, "num_input_tokens_seen": 217342496, "step": 100775 }, { "epoch": 18.495136722334372, "grad_norm": 0.01313171535730362, "learning_rate": 1.715126229097075e-07, "loss": 0.0, "num_input_tokens_seen": 217353248, "step": 100780 }, { "epoch": 18.496054321893926, "grad_norm": 0.0006448589847423136, "learning_rate": 1.7130475253557211e-07, "loss": 0.0, "num_input_tokens_seen": 217363360, "step": 100785 }, { "epoch": 18.49697192145348, "grad_norm": 0.009692201390862465, "learning_rate": 1.7109700600998592e-07, "loss": 0.0001, "num_input_tokens_seen": 217374912, "step": 100790 }, { "epoch": 18.49788952101303, "grad_norm": 0.0011047846637666225, "learning_rate": 1.7088938333827688e-07, "loss": 0.0, "num_input_tokens_seen": 217385760, "step": 100795 }, { "epoch": 18.498807120572582, "grad_norm": 0.0009327915031462908, "learning_rate": 1.706818845257713e-07, "loss": 0.0001, "num_input_tokens_seen": 217395936, "step": 100800 }, { "epoch": 18.499724720132136, "grad_norm": 0.027578303590416908, "learning_rate": 1.7047450957779044e-07, "loss": 0.0001, "num_input_tokens_seen": 217407840, "step": 100805 }, { "epoch": 18.500642319691686, "grad_norm": 0.0006773778586648405, "learning_rate": 1.7026725849965341e-07, "loss": 0.0, "num_input_tokens_seen": 217418752, "step": 100810 }, { "epoch": 18.50155991925124, "grad_norm": 0.006838913541287184, "learning_rate": 1.7006013129667488e-07, "loss": 0.0001, "num_input_tokens_seen": 217429664, "step": 100815 }, { "epoch": 18.502477518810792, "grad_norm": 0.005520177539438009, "learning_rate": 1.698531279741694e-07, "loss": 0.0, "num_input_tokens_seen": 217441024, "step": 100820 }, { "epoch": 18.503395118370342, "grad_norm": 0.0012706302804872394, "learning_rate": 1.6964624853744448e-07, "loss": 0.0, "num_input_tokens_seen": 217451200, "step": 100825 }, { "epoch": 18.504312717929896, "grad_norm": 0.005481433589011431, "learning_rate": 1.6943949299180694e-07, "loss": 0.0, "num_input_tokens_seen": 217462272, "step": 100830 }, { "epoch": 18.50523031748945, "grad_norm": 0.8431439995765686, "learning_rate": 1.6923286134255978e-07, "loss": 0.0003, "num_input_tokens_seen": 217473696, "step": 100835 }, { "epoch": 18.506147917049, "grad_norm": 0.04161141812801361, "learning_rate": 1.690263535950032e-07, "loss": 0.0, "num_input_tokens_seen": 217486016, "step": 100840 }, { "epoch": 18.507065516608552, "grad_norm": 0.0013443378265947104, "learning_rate": 1.6881996975443239e-07, "loss": 0.0, "num_input_tokens_seen": 217497984, "step": 100845 }, { "epoch": 18.507983116168106, "grad_norm": 0.00042423070408403873, "learning_rate": 1.6861370982614255e-07, "loss": 0.0001, "num_input_tokens_seen": 217508832, "step": 100850 }, { "epoch": 18.508900715727655, "grad_norm": 0.012753455899655819, "learning_rate": 1.6840757381542284e-07, "loss": 0.0, "num_input_tokens_seen": 217520864, "step": 100855 }, { "epoch": 18.50981831528721, "grad_norm": 0.0005993960658088326, "learning_rate": 1.6820156172756063e-07, "loss": 0.0, "num_input_tokens_seen": 217532608, "step": 100860 }, { "epoch": 18.510735914846762, "grad_norm": 0.01425278838723898, "learning_rate": 1.6799567356783953e-07, "loss": 0.0, "num_input_tokens_seen": 217544000, "step": 100865 }, { "epoch": 18.511653514406312, "grad_norm": 0.0008846475393511355, "learning_rate": 1.6778990934154082e-07, "loss": 0.0001, "num_input_tokens_seen": 217554656, "step": 100870 }, { "epoch": 18.512571113965866, "grad_norm": 0.0008978463592939079, "learning_rate": 1.6758426905394144e-07, "loss": 0.0, "num_input_tokens_seen": 217564384, "step": 100875 }, { "epoch": 18.51348871352542, "grad_norm": 0.00983649305999279, "learning_rate": 1.6737875271031546e-07, "loss": 0.0001, "num_input_tokens_seen": 217575360, "step": 100880 }, { "epoch": 18.51440631308497, "grad_norm": 0.0005194146651774645, "learning_rate": 1.6717336031593534e-07, "loss": 0.0001, "num_input_tokens_seen": 217585120, "step": 100885 }, { "epoch": 18.515323912644522, "grad_norm": 0.07072018831968307, "learning_rate": 1.6696809187606855e-07, "loss": 0.0, "num_input_tokens_seen": 217596448, "step": 100890 }, { "epoch": 18.516241512204076, "grad_norm": 0.0012563243508338928, "learning_rate": 1.667629473959792e-07, "loss": 0.0007, "num_input_tokens_seen": 217606272, "step": 100895 }, { "epoch": 18.517159111763625, "grad_norm": 0.24585627019405365, "learning_rate": 1.665579268809292e-07, "loss": 0.0001, "num_input_tokens_seen": 217616992, "step": 100900 }, { "epoch": 18.51807671132318, "grad_norm": 0.01635388843715191, "learning_rate": 1.6635303033617767e-07, "loss": 0.0733, "num_input_tokens_seen": 217627776, "step": 100905 }, { "epoch": 18.518994310882732, "grad_norm": 0.003001973731443286, "learning_rate": 1.661482577669793e-07, "loss": 0.0, "num_input_tokens_seen": 217639072, "step": 100910 }, { "epoch": 18.519911910442282, "grad_norm": 0.011585232801735401, "learning_rate": 1.6594360917858655e-07, "loss": 0.0, "num_input_tokens_seen": 217652096, "step": 100915 }, { "epoch": 18.520829510001835, "grad_norm": 0.0010490409331396222, "learning_rate": 1.65739084576248e-07, "loss": 0.0001, "num_input_tokens_seen": 217661024, "step": 100920 }, { "epoch": 18.52174710956139, "grad_norm": 0.0020102255512028933, "learning_rate": 1.655346839652089e-07, "loss": 0.0, "num_input_tokens_seen": 217671872, "step": 100925 }, { "epoch": 18.52266470912094, "grad_norm": 0.000577244907617569, "learning_rate": 1.6533040735071336e-07, "loss": 0.0, "num_input_tokens_seen": 217683424, "step": 100930 }, { "epoch": 18.523582308680492, "grad_norm": 0.009803385473787785, "learning_rate": 1.651262547379995e-07, "loss": 0.0, "num_input_tokens_seen": 217693984, "step": 100935 }, { "epoch": 18.524499908240045, "grad_norm": 0.004421310964971781, "learning_rate": 1.6492222613230358e-07, "loss": 0.0, "num_input_tokens_seen": 217705120, "step": 100940 }, { "epoch": 18.525417507799595, "grad_norm": 0.0021431632339954376, "learning_rate": 1.6471832153885926e-07, "loss": 0.0, "num_input_tokens_seen": 217716704, "step": 100945 }, { "epoch": 18.52633510735915, "grad_norm": 0.011741110123693943, "learning_rate": 1.645145409628951e-07, "loss": 0.0, "num_input_tokens_seen": 217725536, "step": 100950 }, { "epoch": 18.527252706918702, "grad_norm": 0.0029439611826092005, "learning_rate": 1.6431088440963972e-07, "loss": 0.0002, "num_input_tokens_seen": 217736608, "step": 100955 }, { "epoch": 18.528170306478252, "grad_norm": 0.004515169188380241, "learning_rate": 1.6410735188431503e-07, "loss": 0.0, "num_input_tokens_seen": 217747808, "step": 100960 }, { "epoch": 18.529087906037805, "grad_norm": 0.0021401583217084408, "learning_rate": 1.6390394339214133e-07, "loss": 0.0, "num_input_tokens_seen": 217759104, "step": 100965 }, { "epoch": 18.53000550559736, "grad_norm": 0.0013731115031987429, "learning_rate": 1.637006589383372e-07, "loss": 0.0, "num_input_tokens_seen": 217769664, "step": 100970 }, { "epoch": 18.53092310515691, "grad_norm": 0.0008255605935119092, "learning_rate": 1.6349749852811515e-07, "loss": 0.0, "num_input_tokens_seen": 217780800, "step": 100975 }, { "epoch": 18.531840704716462, "grad_norm": 0.20576804876327515, "learning_rate": 1.6329446216668543e-07, "loss": 0.0001, "num_input_tokens_seen": 217791968, "step": 100980 }, { "epoch": 18.532758304276015, "grad_norm": 0.0011151510989293456, "learning_rate": 1.630915498592578e-07, "loss": 0.0, "num_input_tokens_seen": 217802368, "step": 100985 }, { "epoch": 18.533675903835565, "grad_norm": 0.011371924541890621, "learning_rate": 1.6288876161103528e-07, "loss": 0.0, "num_input_tokens_seen": 217814368, "step": 100990 }, { "epoch": 18.53459350339512, "grad_norm": 0.0012785412836819887, "learning_rate": 1.6268609742721874e-07, "loss": 0.0004, "num_input_tokens_seen": 217823488, "step": 100995 }, { "epoch": 18.535511102954672, "grad_norm": 0.010264198295772076, "learning_rate": 1.624835573130068e-07, "loss": 0.0003, "num_input_tokens_seen": 217834912, "step": 101000 }, { "epoch": 18.536428702514222, "grad_norm": 0.004824074450880289, "learning_rate": 1.622811412735942e-07, "loss": 0.0, "num_input_tokens_seen": 217845504, "step": 101005 }, { "epoch": 18.537346302073775, "grad_norm": 0.0005741377244703472, "learning_rate": 1.620788493141723e-07, "loss": 0.0, "num_input_tokens_seen": 217856384, "step": 101010 }, { "epoch": 18.53826390163333, "grad_norm": 0.049435585737228394, "learning_rate": 1.6187668143992974e-07, "loss": 0.0, "num_input_tokens_seen": 217867008, "step": 101015 }, { "epoch": 18.53918150119288, "grad_norm": 3.7185659408569336, "learning_rate": 1.616746376560524e-07, "loss": 0.0014, "num_input_tokens_seen": 217877216, "step": 101020 }, { "epoch": 18.540099100752432, "grad_norm": 0.028192942962050438, "learning_rate": 1.6147271796772168e-07, "loss": 0.0, "num_input_tokens_seen": 217887680, "step": 101025 }, { "epoch": 18.541016700311985, "grad_norm": 0.0011625881306827068, "learning_rate": 1.6127092238011622e-07, "loss": 0.0, "num_input_tokens_seen": 217899072, "step": 101030 }, { "epoch": 18.541934299871535, "grad_norm": 0.0005142543232068419, "learning_rate": 1.6106925089841296e-07, "loss": 0.0, "num_input_tokens_seen": 217909568, "step": 101035 }, { "epoch": 18.54285189943109, "grad_norm": 0.015027469024062157, "learning_rate": 1.6086770352778336e-07, "loss": 0.0, "num_input_tokens_seen": 217920320, "step": 101040 }, { "epoch": 18.543769498990642, "grad_norm": 0.0019231573678553104, "learning_rate": 1.606662802733977e-07, "loss": 0.0, "num_input_tokens_seen": 217930912, "step": 101045 }, { "epoch": 18.54468709855019, "grad_norm": 0.14790524542331696, "learning_rate": 1.6046498114042076e-07, "loss": 0.0, "num_input_tokens_seen": 217940832, "step": 101050 }, { "epoch": 18.545604698109745, "grad_norm": 244.93206787109375, "learning_rate": 1.6026380613401726e-07, "loss": 0.0285, "num_input_tokens_seen": 217951456, "step": 101055 }, { "epoch": 18.5465222976693, "grad_norm": 0.0008065261645242572, "learning_rate": 1.600627552593459e-07, "loss": 0.0, "num_input_tokens_seen": 217962176, "step": 101060 }, { "epoch": 18.54743989722885, "grad_norm": 1089.7283935546875, "learning_rate": 1.5986182852156307e-07, "loss": 0.0588, "num_input_tokens_seen": 217972224, "step": 101065 }, { "epoch": 18.5483574967884, "grad_norm": 0.0012732008472085, "learning_rate": 1.5966102592582356e-07, "loss": 0.0, "num_input_tokens_seen": 217982304, "step": 101070 }, { "epoch": 18.549275096347955, "grad_norm": 0.007144922856241465, "learning_rate": 1.5946034747727711e-07, "loss": 0.0306, "num_input_tokens_seen": 217993952, "step": 101075 }, { "epoch": 18.550192695907505, "grad_norm": 0.030127275735139847, "learning_rate": 1.5925979318106965e-07, "loss": 0.0001, "num_input_tokens_seen": 218004096, "step": 101080 }, { "epoch": 18.55111029546706, "grad_norm": 0.004612539429217577, "learning_rate": 1.5905936304234703e-07, "loss": 0.0001, "num_input_tokens_seen": 218014688, "step": 101085 }, { "epoch": 18.55202789502661, "grad_norm": 0.0005238136509433389, "learning_rate": 1.5885905706624849e-07, "loss": 0.0, "num_input_tokens_seen": 218025888, "step": 101090 }, { "epoch": 18.55294549458616, "grad_norm": 0.001099891378544271, "learning_rate": 1.5865887525791212e-07, "loss": 0.0, "num_input_tokens_seen": 218037888, "step": 101095 }, { "epoch": 18.553863094145715, "grad_norm": 0.006808013655245304, "learning_rate": 1.5845881762247162e-07, "loss": 0.0, "num_input_tokens_seen": 218048384, "step": 101100 }, { "epoch": 18.55478069370527, "grad_norm": 0.004057817626744509, "learning_rate": 1.5825888416505953e-07, "loss": 0.0, "num_input_tokens_seen": 218058688, "step": 101105 }, { "epoch": 18.55569829326482, "grad_norm": 0.0007692627841606736, "learning_rate": 1.5805907489080285e-07, "loss": 0.0008, "num_input_tokens_seen": 218069376, "step": 101110 }, { "epoch": 18.55661589282437, "grad_norm": 0.005666009616106749, "learning_rate": 1.5785938980482696e-07, "loss": 0.0, "num_input_tokens_seen": 218079424, "step": 101115 }, { "epoch": 18.557533492383925, "grad_norm": 0.03481980785727501, "learning_rate": 1.576598289122522e-07, "loss": 0.2188, "num_input_tokens_seen": 218089632, "step": 101120 }, { "epoch": 18.558451091943475, "grad_norm": 0.013088942505419254, "learning_rate": 1.574603922181983e-07, "loss": 0.0, "num_input_tokens_seen": 218101312, "step": 101125 }, { "epoch": 18.55936869150303, "grad_norm": 0.0021919175051152706, "learning_rate": 1.5726107972778015e-07, "loss": 0.0, "num_input_tokens_seen": 218112064, "step": 101130 }, { "epoch": 18.56028629106258, "grad_norm": 0.0018117169383913279, "learning_rate": 1.570618914461103e-07, "loss": 0.0, "num_input_tokens_seen": 218123520, "step": 101135 }, { "epoch": 18.56120389062213, "grad_norm": 0.006508298218250275, "learning_rate": 1.5686282737829627e-07, "loss": 0.0, "num_input_tokens_seen": 218134656, "step": 101140 }, { "epoch": 18.562121490181685, "grad_norm": 0.014191629365086555, "learning_rate": 1.566638875294446e-07, "loss": 0.0, "num_input_tokens_seen": 218144096, "step": 101145 }, { "epoch": 18.56303908974124, "grad_norm": 0.0005791594157926738, "learning_rate": 1.5646507190465788e-07, "loss": 0.0032, "num_input_tokens_seen": 218156096, "step": 101150 }, { "epoch": 18.563956689300788, "grad_norm": 0.025391340255737305, "learning_rate": 1.562663805090353e-07, "loss": 0.0, "num_input_tokens_seen": 218164928, "step": 101155 }, { "epoch": 18.56487428886034, "grad_norm": 0.002427412196993828, "learning_rate": 1.5606781334767285e-07, "loss": 0.0, "num_input_tokens_seen": 218176384, "step": 101160 }, { "epoch": 18.565791888419895, "grad_norm": 0.08696345239877701, "learning_rate": 1.5586937042566365e-07, "loss": 0.0, "num_input_tokens_seen": 218186720, "step": 101165 }, { "epoch": 18.566709487979445, "grad_norm": 0.0021223530638962984, "learning_rate": 1.5567105174809748e-07, "loss": 0.0, "num_input_tokens_seen": 218196672, "step": 101170 }, { "epoch": 18.567627087538998, "grad_norm": 0.002089753979817033, "learning_rate": 1.5547285732006034e-07, "loss": 0.0, "num_input_tokens_seen": 218206816, "step": 101175 }, { "epoch": 18.56854468709855, "grad_norm": 0.0014297396410256624, "learning_rate": 1.55274787146637e-07, "loss": 0.0, "num_input_tokens_seen": 218217280, "step": 101180 }, { "epoch": 18.5694622866581, "grad_norm": 0.0014812403824180365, "learning_rate": 1.5507684123290567e-07, "loss": 0.0, "num_input_tokens_seen": 218228544, "step": 101185 }, { "epoch": 18.570379886217655, "grad_norm": 0.01050709281116724, "learning_rate": 1.5487901958394503e-07, "loss": 0.0, "num_input_tokens_seen": 218238400, "step": 101190 }, { "epoch": 18.571297485777208, "grad_norm": 0.0024998376611620188, "learning_rate": 1.5468132220482822e-07, "loss": 0.0, "num_input_tokens_seen": 218249824, "step": 101195 }, { "epoch": 18.572215085336758, "grad_norm": 0.001923222909681499, "learning_rate": 1.5448374910062514e-07, "loss": 0.0, "num_input_tokens_seen": 218260288, "step": 101200 }, { "epoch": 18.57313268489631, "grad_norm": 0.0007000521291047335, "learning_rate": 1.5428630027640502e-07, "loss": 0.0, "num_input_tokens_seen": 218271936, "step": 101205 }, { "epoch": 18.574050284455865, "grad_norm": 0.0005263655330054462, "learning_rate": 1.5408897573723102e-07, "loss": 0.0, "num_input_tokens_seen": 218282752, "step": 101210 }, { "epoch": 18.574967884015415, "grad_norm": 0.001151670585386455, "learning_rate": 1.5389177548816358e-07, "loss": 0.0001, "num_input_tokens_seen": 218293536, "step": 101215 }, { "epoch": 18.575885483574968, "grad_norm": 0.018746402114629745, "learning_rate": 1.5369469953426198e-07, "loss": 0.0, "num_input_tokens_seen": 218305056, "step": 101220 }, { "epoch": 18.57680308313452, "grad_norm": 0.001559945405460894, "learning_rate": 1.5349774788058048e-07, "loss": 0.0, "num_input_tokens_seen": 218314240, "step": 101225 }, { "epoch": 18.57772068269407, "grad_norm": 0.0012621359201148152, "learning_rate": 1.5330092053217005e-07, "loss": 0.0, "num_input_tokens_seen": 218326272, "step": 101230 }, { "epoch": 18.578638282253625, "grad_norm": 0.0009894140530377626, "learning_rate": 1.5310421749407888e-07, "loss": 0.0, "num_input_tokens_seen": 218336192, "step": 101235 }, { "epoch": 18.579555881813178, "grad_norm": 0.019570330157876015, "learning_rate": 1.5290763877135295e-07, "loss": 0.0, "num_input_tokens_seen": 218347008, "step": 101240 }, { "epoch": 18.580473481372728, "grad_norm": 0.01371130719780922, "learning_rate": 1.5271118436903376e-07, "loss": 0.0, "num_input_tokens_seen": 218356576, "step": 101245 }, { "epoch": 18.58139108093228, "grad_norm": 0.09015518426895142, "learning_rate": 1.5251485429215952e-07, "loss": 0.0, "num_input_tokens_seen": 218367744, "step": 101250 }, { "epoch": 18.582308680491835, "grad_norm": 0.0015669568674638867, "learning_rate": 1.5231864854576728e-07, "loss": 0.0001, "num_input_tokens_seen": 218377376, "step": 101255 }, { "epoch": 18.583226280051385, "grad_norm": 0.0008205912308767438, "learning_rate": 1.5212256713488805e-07, "loss": 0.0, "num_input_tokens_seen": 218388576, "step": 101260 }, { "epoch": 18.584143879610938, "grad_norm": 0.0010219283867627382, "learning_rate": 1.5192661006455166e-07, "loss": 0.0, "num_input_tokens_seen": 218398464, "step": 101265 }, { "epoch": 18.58506147917049, "grad_norm": 0.0062521654181182384, "learning_rate": 1.5173077733978304e-07, "loss": 0.0, "num_input_tokens_seen": 218408096, "step": 101270 }, { "epoch": 18.58597907873004, "grad_norm": 0.0029207521583884954, "learning_rate": 1.5153506896560644e-07, "loss": 0.0, "num_input_tokens_seen": 218419456, "step": 101275 }, { "epoch": 18.586896678289595, "grad_norm": 0.0013452712446451187, "learning_rate": 1.513394849470412e-07, "loss": 0.0, "num_input_tokens_seen": 218431424, "step": 101280 }, { "epoch": 18.587814277849148, "grad_norm": 0.003032262669876218, "learning_rate": 1.5114402528910278e-07, "loss": 0.0, "num_input_tokens_seen": 218441632, "step": 101285 }, { "epoch": 18.588731877408698, "grad_norm": 0.0006193109438754618, "learning_rate": 1.5094868999680547e-07, "loss": 0.0, "num_input_tokens_seen": 218452160, "step": 101290 }, { "epoch": 18.58964947696825, "grad_norm": 0.0018634330481290817, "learning_rate": 1.5075347907515913e-07, "loss": 0.0, "num_input_tokens_seen": 218462816, "step": 101295 }, { "epoch": 18.590567076527805, "grad_norm": 0.0003569081600289792, "learning_rate": 1.5055839252916981e-07, "loss": 0.0, "num_input_tokens_seen": 218472544, "step": 101300 }, { "epoch": 18.591484676087354, "grad_norm": 0.00265571684576571, "learning_rate": 1.5036343036384182e-07, "loss": 0.0, "num_input_tokens_seen": 218482720, "step": 101305 }, { "epoch": 18.592402275646908, "grad_norm": 0.0008139490382745862, "learning_rate": 1.5016859258417615e-07, "loss": 0.0, "num_input_tokens_seen": 218493632, "step": 101310 }, { "epoch": 18.59331987520646, "grad_norm": 0.058337800204753876, "learning_rate": 1.4997387919516936e-07, "loss": 0.0001, "num_input_tokens_seen": 218505088, "step": 101315 }, { "epoch": 18.59423747476601, "grad_norm": 0.0044619617983698845, "learning_rate": 1.4977929020181526e-07, "loss": 0.0, "num_input_tokens_seen": 218515552, "step": 101320 }, { "epoch": 18.595155074325564, "grad_norm": 0.001359464949928224, "learning_rate": 1.4958482560910592e-07, "loss": 0.0, "num_input_tokens_seen": 218526304, "step": 101325 }, { "epoch": 18.596072673885118, "grad_norm": 0.006816911976784468, "learning_rate": 1.4939048542202795e-07, "loss": 0.0, "num_input_tokens_seen": 218537920, "step": 101330 }, { "epoch": 18.596990273444668, "grad_norm": 0.005767472553998232, "learning_rate": 1.4919626964556622e-07, "loss": 0.0, "num_input_tokens_seen": 218549312, "step": 101335 }, { "epoch": 18.59790787300422, "grad_norm": 0.00576007179915905, "learning_rate": 1.4900217828470176e-07, "loss": 0.0244, "num_input_tokens_seen": 218560480, "step": 101340 }, { "epoch": 18.598825472563774, "grad_norm": 0.001045905170030892, "learning_rate": 1.488082113444139e-07, "loss": 0.0001, "num_input_tokens_seen": 218572544, "step": 101345 }, { "epoch": 18.599743072123324, "grad_norm": 0.0010539016220718622, "learning_rate": 1.486143688296765e-07, "loss": 0.0001, "num_input_tokens_seen": 218584064, "step": 101350 }, { "epoch": 18.600660671682878, "grad_norm": 0.001765984925441444, "learning_rate": 1.4842065074546107e-07, "loss": 0.0, "num_input_tokens_seen": 218595904, "step": 101355 }, { "epoch": 18.60157827124243, "grad_norm": 0.0008116160170175135, "learning_rate": 1.4822705709673756e-07, "loss": 0.0, "num_input_tokens_seen": 218607552, "step": 101360 }, { "epoch": 18.60249587080198, "grad_norm": 0.001570007298141718, "learning_rate": 1.4803358788846977e-07, "loss": 0.0063, "num_input_tokens_seen": 218618720, "step": 101365 }, { "epoch": 18.603413470361534, "grad_norm": 0.00034227941068820655, "learning_rate": 1.478402431256204e-07, "loss": 0.0, "num_input_tokens_seen": 218629824, "step": 101370 }, { "epoch": 18.604331069921088, "grad_norm": 0.0009316173382103443, "learning_rate": 1.476470228131488e-07, "loss": 0.0, "num_input_tokens_seen": 218641760, "step": 101375 }, { "epoch": 18.605248669480638, "grad_norm": 0.0019787373021245003, "learning_rate": 1.4745392695601103e-07, "loss": 0.0, "num_input_tokens_seen": 218653600, "step": 101380 }, { "epoch": 18.60616626904019, "grad_norm": 0.0027541781309992075, "learning_rate": 1.4726095555915864e-07, "loss": 0.0002, "num_input_tokens_seen": 218664608, "step": 101385 }, { "epoch": 18.607083868599744, "grad_norm": 0.01110859215259552, "learning_rate": 1.4706810862754217e-07, "loss": 0.0, "num_input_tokens_seen": 218675232, "step": 101390 }, { "epoch": 18.608001468159294, "grad_norm": 0.000755091430619359, "learning_rate": 1.4687538616610707e-07, "loss": 0.0, "num_input_tokens_seen": 218686048, "step": 101395 }, { "epoch": 18.608919067718848, "grad_norm": 0.05004248395562172, "learning_rate": 1.4668278817979718e-07, "loss": 0.0944, "num_input_tokens_seen": 218696416, "step": 101400 }, { "epoch": 18.6098366672784, "grad_norm": 0.007826696150004864, "learning_rate": 1.464903146735508e-07, "loss": 0.0, "num_input_tokens_seen": 218707232, "step": 101405 }, { "epoch": 18.61075426683795, "grad_norm": 0.00298443459905684, "learning_rate": 1.462979656523067e-07, "loss": 0.0, "num_input_tokens_seen": 218717664, "step": 101410 }, { "epoch": 18.611671866397504, "grad_norm": 0.013789504766464233, "learning_rate": 1.4610574112099652e-07, "loss": 0.0, "num_input_tokens_seen": 218728288, "step": 101415 }, { "epoch": 18.612589465957058, "grad_norm": 0.008627185598015785, "learning_rate": 1.459136410845513e-07, "loss": 0.0, "num_input_tokens_seen": 218737824, "step": 101420 }, { "epoch": 18.613507065516607, "grad_norm": 0.18360765278339386, "learning_rate": 1.4572166554789825e-07, "loss": 0.0, "num_input_tokens_seen": 218749152, "step": 101425 }, { "epoch": 18.61442466507616, "grad_norm": 0.004629181232303381, "learning_rate": 1.4552981451596117e-07, "loss": 0.0, "num_input_tokens_seen": 218760704, "step": 101430 }, { "epoch": 18.615342264635714, "grad_norm": 0.0002677477605175227, "learning_rate": 1.4533808799366001e-07, "loss": 0.0, "num_input_tokens_seen": 218772416, "step": 101435 }, { "epoch": 18.616259864195264, "grad_norm": 0.04299093782901764, "learning_rate": 1.4514648598591309e-07, "loss": 0.0, "num_input_tokens_seen": 218783136, "step": 101440 }, { "epoch": 18.617177463754818, "grad_norm": 0.0002865278802346438, "learning_rate": 1.449550084976348e-07, "loss": 0.0, "num_input_tokens_seen": 218794752, "step": 101445 }, { "epoch": 18.61809506331437, "grad_norm": 0.010470651090145111, "learning_rate": 1.4476365553373616e-07, "loss": 0.0, "num_input_tokens_seen": 218805440, "step": 101450 }, { "epoch": 18.61901266287392, "grad_norm": 0.00427252845838666, "learning_rate": 1.445724270991239e-07, "loss": 0.0, "num_input_tokens_seen": 218815680, "step": 101455 }, { "epoch": 18.619930262433474, "grad_norm": 0.0012312085600569844, "learning_rate": 1.44381323198704e-07, "loss": 0.0001, "num_input_tokens_seen": 218826944, "step": 101460 }, { "epoch": 18.620847861993028, "grad_norm": 0.022030089050531387, "learning_rate": 1.4419034383737817e-07, "loss": 0.0, "num_input_tokens_seen": 218838944, "step": 101465 }, { "epoch": 18.621765461552577, "grad_norm": 0.0005658252048306167, "learning_rate": 1.4399948902004301e-07, "loss": 0.0, "num_input_tokens_seen": 218850240, "step": 101470 }, { "epoch": 18.62268306111213, "grad_norm": 0.0027605206705629826, "learning_rate": 1.4380875875159572e-07, "loss": 0.0, "num_input_tokens_seen": 218860256, "step": 101475 }, { "epoch": 18.623600660671684, "grad_norm": 0.06887336820363998, "learning_rate": 1.436181530369274e-07, "loss": 0.0, "num_input_tokens_seen": 218870400, "step": 101480 }, { "epoch": 18.624518260231234, "grad_norm": 0.0017733799759298563, "learning_rate": 1.434276718809263e-07, "loss": 0.0, "num_input_tokens_seen": 218880224, "step": 101485 }, { "epoch": 18.625435859790787, "grad_norm": 0.002720634685829282, "learning_rate": 1.4323731528847862e-07, "loss": 0.0, "num_input_tokens_seen": 218891040, "step": 101490 }, { "epoch": 18.62635345935034, "grad_norm": 0.0023533387575298548, "learning_rate": 1.4304708326446704e-07, "loss": 0.0001, "num_input_tokens_seen": 218901888, "step": 101495 }, { "epoch": 18.62727105890989, "grad_norm": 0.0016304586315527558, "learning_rate": 1.428569758137699e-07, "loss": 0.002, "num_input_tokens_seen": 218914368, "step": 101500 }, { "epoch": 18.628188658469444, "grad_norm": 0.02470366284251213, "learning_rate": 1.4266699294126273e-07, "loss": 0.0, "num_input_tokens_seen": 218924768, "step": 101505 }, { "epoch": 18.629106258028997, "grad_norm": 0.0003880863660015166, "learning_rate": 1.4247713465181946e-07, "loss": 0.0, "num_input_tokens_seen": 218934528, "step": 101510 }, { "epoch": 18.630023857588547, "grad_norm": 0.0024874878581613302, "learning_rate": 1.4228740095030945e-07, "loss": 0.0, "num_input_tokens_seen": 218945600, "step": 101515 }, { "epoch": 18.6309414571481, "grad_norm": 0.04203111305832863, "learning_rate": 1.4209779184159832e-07, "loss": 0.0, "num_input_tokens_seen": 218955744, "step": 101520 }, { "epoch": 18.631859056707654, "grad_norm": 0.03629906848073006, "learning_rate": 1.419083073305505e-07, "loss": 0.0, "num_input_tokens_seen": 218966304, "step": 101525 }, { "epoch": 18.632776656267204, "grad_norm": 0.0009296549251303077, "learning_rate": 1.4171894742202487e-07, "loss": 0.0, "num_input_tokens_seen": 218977664, "step": 101530 }, { "epoch": 18.633694255826757, "grad_norm": 0.015970503911376, "learning_rate": 1.4152971212087807e-07, "loss": 0.0, "num_input_tokens_seen": 218988928, "step": 101535 }, { "epoch": 18.63461185538631, "grad_norm": 0.0012362959096208215, "learning_rate": 1.413406014319646e-07, "loss": 0.0, "num_input_tokens_seen": 219001024, "step": 101540 }, { "epoch": 18.63552945494586, "grad_norm": 0.008839292451739311, "learning_rate": 1.41151615360135e-07, "loss": 0.0002, "num_input_tokens_seen": 219011776, "step": 101545 }, { "epoch": 18.636447054505414, "grad_norm": 0.23461245000362396, "learning_rate": 1.409627539102354e-07, "loss": 0.0, "num_input_tokens_seen": 219022720, "step": 101550 }, { "epoch": 18.637364654064967, "grad_norm": 0.001097190543077886, "learning_rate": 1.407740170871108e-07, "loss": 0.0, "num_input_tokens_seen": 219032576, "step": 101555 }, { "epoch": 18.638282253624517, "grad_norm": 0.01878550834953785, "learning_rate": 1.4058540489560123e-07, "loss": 0.0, "num_input_tokens_seen": 219042816, "step": 101560 }, { "epoch": 18.63919985318407, "grad_norm": 0.000784571107942611, "learning_rate": 1.4039691734054396e-07, "loss": 0.0, "num_input_tokens_seen": 219053344, "step": 101565 }, { "epoch": 18.640117452743624, "grad_norm": 0.0008811433799564838, "learning_rate": 1.4020855442677507e-07, "loss": 0.0, "num_input_tokens_seen": 219064480, "step": 101570 }, { "epoch": 18.641035052303174, "grad_norm": 0.01084791962057352, "learning_rate": 1.400203161591246e-07, "loss": 0.0, "num_input_tokens_seen": 219074784, "step": 101575 }, { "epoch": 18.641952651862727, "grad_norm": 0.00429982366040349, "learning_rate": 1.3983220254242036e-07, "loss": 0.0, "num_input_tokens_seen": 219085248, "step": 101580 }, { "epoch": 18.64287025142228, "grad_norm": 0.04764697700738907, "learning_rate": 1.3964421358148794e-07, "loss": 0.0, "num_input_tokens_seen": 219096832, "step": 101585 }, { "epoch": 18.64378785098183, "grad_norm": 0.002674899762496352, "learning_rate": 1.3945634928114794e-07, "loss": 0.002, "num_input_tokens_seen": 219107360, "step": 101590 }, { "epoch": 18.644705450541384, "grad_norm": 0.0014459947124123573, "learning_rate": 1.392686096462198e-07, "loss": 0.0, "num_input_tokens_seen": 219117280, "step": 101595 }, { "epoch": 18.645623050100937, "grad_norm": 0.10770747065544128, "learning_rate": 1.3908099468151858e-07, "loss": 0.0001, "num_input_tokens_seen": 219127712, "step": 101600 }, { "epoch": 18.646540649660487, "grad_norm": 0.00033342858660034835, "learning_rate": 1.3889350439185544e-07, "loss": 0.0, "num_input_tokens_seen": 219138144, "step": 101605 }, { "epoch": 18.64745824922004, "grad_norm": 0.014065902680158615, "learning_rate": 1.3870613878204042e-07, "loss": 0.0, "num_input_tokens_seen": 219148128, "step": 101610 }, { "epoch": 18.648375848779594, "grad_norm": 0.016308480873703957, "learning_rate": 1.3851889785687856e-07, "loss": 0.0, "num_input_tokens_seen": 219157344, "step": 101615 }, { "epoch": 18.649293448339144, "grad_norm": 0.001286112004891038, "learning_rate": 1.383317816211721e-07, "loss": 0.0, "num_input_tokens_seen": 219167168, "step": 101620 }, { "epoch": 18.650211047898697, "grad_norm": 0.0025042505003511906, "learning_rate": 1.381447900797206e-07, "loss": 0.0, "num_input_tokens_seen": 219178496, "step": 101625 }, { "epoch": 18.65112864745825, "grad_norm": 0.006039291620254517, "learning_rate": 1.3795792323732072e-07, "loss": 0.0, "num_input_tokens_seen": 219188512, "step": 101630 }, { "epoch": 18.6520462470178, "grad_norm": 0.0008083164575509727, "learning_rate": 1.377711810987642e-07, "loss": 0.0, "num_input_tokens_seen": 219198784, "step": 101635 }, { "epoch": 18.652963846577354, "grad_norm": 0.00041401092312298715, "learning_rate": 1.3758456366884054e-07, "loss": 0.0002, "num_input_tokens_seen": 219209440, "step": 101640 }, { "epoch": 18.653881446136907, "grad_norm": 0.24517136812210083, "learning_rate": 1.373980709523376e-07, "loss": 0.0001, "num_input_tokens_seen": 219220160, "step": 101645 }, { "epoch": 18.654799045696457, "grad_norm": 0.0021042837761342525, "learning_rate": 1.3721170295403709e-07, "loss": 0.0001, "num_input_tokens_seen": 219231232, "step": 101650 }, { "epoch": 18.65571664525601, "grad_norm": 0.0019019978353753686, "learning_rate": 1.3702545967872016e-07, "loss": 0.0001, "num_input_tokens_seen": 219241376, "step": 101655 }, { "epoch": 18.656634244815564, "grad_norm": 0.025710957124829292, "learning_rate": 1.3683934113116304e-07, "loss": 0.0, "num_input_tokens_seen": 219251712, "step": 101660 }, { "epoch": 18.657551844375114, "grad_norm": 0.003960969392210245, "learning_rate": 1.366533473161402e-07, "loss": 0.0144, "num_input_tokens_seen": 219261920, "step": 101665 }, { "epoch": 18.658469443934667, "grad_norm": 0.1199842095375061, "learning_rate": 1.3646747823842065e-07, "loss": 0.0, "num_input_tokens_seen": 219273536, "step": 101670 }, { "epoch": 18.65938704349422, "grad_norm": 0.0007134536281228065, "learning_rate": 1.3628173390277278e-07, "loss": 0.0, "num_input_tokens_seen": 219283424, "step": 101675 }, { "epoch": 18.66030464305377, "grad_norm": 0.0007453728467226028, "learning_rate": 1.3609611431396054e-07, "loss": 0.0, "num_input_tokens_seen": 219293760, "step": 101680 }, { "epoch": 18.661222242613324, "grad_norm": 0.0061399624682962894, "learning_rate": 1.3591061947674455e-07, "loss": 0.0, "num_input_tokens_seen": 219304032, "step": 101685 }, { "epoch": 18.662139842172877, "grad_norm": 0.0006257953937165439, "learning_rate": 1.3572524939588217e-07, "loss": 0.0, "num_input_tokens_seen": 219313920, "step": 101690 }, { "epoch": 18.663057441732427, "grad_norm": 0.0008958342368714511, "learning_rate": 1.35540004076129e-07, "loss": 0.0, "num_input_tokens_seen": 219326048, "step": 101695 }, { "epoch": 18.66397504129198, "grad_norm": 0.0006035355618223548, "learning_rate": 1.3535488352223513e-07, "loss": 0.0001, "num_input_tokens_seen": 219337568, "step": 101700 }, { "epoch": 18.664892640851534, "grad_norm": 0.005822923965752125, "learning_rate": 1.3516988773894845e-07, "loss": 0.0, "num_input_tokens_seen": 219348448, "step": 101705 }, { "epoch": 18.665810240411083, "grad_norm": 0.00902754906564951, "learning_rate": 1.349850167310146e-07, "loss": 0.0, "num_input_tokens_seen": 219357760, "step": 101710 }, { "epoch": 18.666727839970637, "grad_norm": 0.0021271624136716127, "learning_rate": 1.3480027050317533e-07, "loss": 0.0, "num_input_tokens_seen": 219368864, "step": 101715 }, { "epoch": 18.66764543953019, "grad_norm": 0.0010577947832643986, "learning_rate": 1.346156490601691e-07, "loss": 0.1376, "num_input_tokens_seen": 219379136, "step": 101720 }, { "epoch": 18.66856303908974, "grad_norm": 1.2433534860610962, "learning_rate": 1.3443115240672989e-07, "loss": 0.0001, "num_input_tokens_seen": 219390176, "step": 101725 }, { "epoch": 18.669480638649294, "grad_norm": 0.0051982891745865345, "learning_rate": 1.342467805475911e-07, "loss": 0.0, "num_input_tokens_seen": 219400800, "step": 101730 }, { "epoch": 18.670398238208847, "grad_norm": 0.0005821503582410514, "learning_rate": 1.3406253348748123e-07, "loss": 0.0001, "num_input_tokens_seen": 219411424, "step": 101735 }, { "epoch": 18.671315837768397, "grad_norm": 0.01034778542816639, "learning_rate": 1.3387841123112534e-07, "loss": 0.0, "num_input_tokens_seen": 219421632, "step": 101740 }, { "epoch": 18.67223343732795, "grad_norm": 0.005994245409965515, "learning_rate": 1.336944137832469e-07, "loss": 0.0, "num_input_tokens_seen": 219432064, "step": 101745 }, { "epoch": 18.673151036887504, "grad_norm": 0.005118048749864101, "learning_rate": 1.3351054114856488e-07, "loss": 0.0, "num_input_tokens_seen": 219442688, "step": 101750 }, { "epoch": 18.674068636447053, "grad_norm": 0.005810581613332033, "learning_rate": 1.3332679333179443e-07, "loss": 0.0, "num_input_tokens_seen": 219454368, "step": 101755 }, { "epoch": 18.674986236006607, "grad_norm": 0.0041021681390702724, "learning_rate": 1.3314317033764957e-07, "loss": 0.0, "num_input_tokens_seen": 219466592, "step": 101760 }, { "epoch": 18.67590383556616, "grad_norm": 0.0014963565627112985, "learning_rate": 1.3295967217083926e-07, "loss": 0.0, "num_input_tokens_seen": 219477280, "step": 101765 }, { "epoch": 18.67682143512571, "grad_norm": 0.0015905102482065558, "learning_rate": 1.3277629883607035e-07, "loss": 0.0, "num_input_tokens_seen": 219487296, "step": 101770 }, { "epoch": 18.677739034685263, "grad_norm": 0.3481852412223816, "learning_rate": 1.325930503380457e-07, "loss": 0.0001, "num_input_tokens_seen": 219498848, "step": 101775 }, { "epoch": 18.678656634244817, "grad_norm": 0.0008069563773460686, "learning_rate": 1.3240992668146546e-07, "loss": 0.0, "num_input_tokens_seen": 219510016, "step": 101780 }, { "epoch": 18.679574233804367, "grad_norm": 0.002986834617331624, "learning_rate": 1.322269278710264e-07, "loss": 0.0, "num_input_tokens_seen": 219520480, "step": 101785 }, { "epoch": 18.68049183336392, "grad_norm": 0.003948657773435116, "learning_rate": 1.320440539114226e-07, "loss": 0.0, "num_input_tokens_seen": 219530144, "step": 101790 }, { "epoch": 18.681409432923473, "grad_norm": 0.000986860366538167, "learning_rate": 1.3186130480734417e-07, "loss": 0.0, "num_input_tokens_seen": 219539904, "step": 101795 }, { "epoch": 18.682327032483023, "grad_norm": 0.0010317896958440542, "learning_rate": 1.3167868056347843e-07, "loss": 0.0, "num_input_tokens_seen": 219550592, "step": 101800 }, { "epoch": 18.683244632042577, "grad_norm": 0.0030613981653004885, "learning_rate": 1.3149618118450836e-07, "loss": 0.0, "num_input_tokens_seen": 219560800, "step": 101805 }, { "epoch": 18.68416223160213, "grad_norm": 0.003589488798752427, "learning_rate": 1.3131380667511683e-07, "loss": 0.0, "num_input_tokens_seen": 219570688, "step": 101810 }, { "epoch": 18.68507983116168, "grad_norm": 0.008475780487060547, "learning_rate": 1.3113155703998016e-07, "loss": 0.0, "num_input_tokens_seen": 219580480, "step": 101815 }, { "epoch": 18.685997430721233, "grad_norm": 0.0004958877107128501, "learning_rate": 1.309494322837729e-07, "loss": 0.0284, "num_input_tokens_seen": 219592896, "step": 101820 }, { "epoch": 18.686915030280787, "grad_norm": 0.00381623231805861, "learning_rate": 1.3076743241116573e-07, "loss": 0.0, "num_input_tokens_seen": 219603840, "step": 101825 }, { "epoch": 18.687832629840337, "grad_norm": 0.10053873062133789, "learning_rate": 1.3058555742682777e-07, "loss": 0.0, "num_input_tokens_seen": 219614848, "step": 101830 }, { "epoch": 18.68875022939989, "grad_norm": 0.0020684218034148216, "learning_rate": 1.3040380733542356e-07, "loss": 0.0002, "num_input_tokens_seen": 219625888, "step": 101835 }, { "epoch": 18.689667828959443, "grad_norm": 0.04734455421566963, "learning_rate": 1.3022218214161442e-07, "loss": 0.0, "num_input_tokens_seen": 219636576, "step": 101840 }, { "epoch": 18.690585428518993, "grad_norm": 0.0005649802042171359, "learning_rate": 1.3004068185005881e-07, "loss": 0.0, "num_input_tokens_seen": 219648096, "step": 101845 }, { "epoch": 18.691503028078547, "grad_norm": 0.0006460019503720105, "learning_rate": 1.2985930646541188e-07, "loss": 0.0, "num_input_tokens_seen": 219659200, "step": 101850 }, { "epoch": 18.6924206276381, "grad_norm": 0.010517427697777748, "learning_rate": 1.296780559923261e-07, "loss": 0.0, "num_input_tokens_seen": 219670688, "step": 101855 }, { "epoch": 18.69333822719765, "grad_norm": 0.00040434024413116276, "learning_rate": 1.2949693043544875e-07, "loss": 0.0, "num_input_tokens_seen": 219681664, "step": 101860 }, { "epoch": 18.694255826757203, "grad_norm": 0.0007018629112280905, "learning_rate": 1.2931592979942787e-07, "loss": 0.0, "num_input_tokens_seen": 219691616, "step": 101865 }, { "epoch": 18.695173426316757, "grad_norm": 0.0005942288553342223, "learning_rate": 1.2913505408890414e-07, "loss": 0.0, "num_input_tokens_seen": 219703008, "step": 101870 }, { "epoch": 18.696091025876306, "grad_norm": 0.004663021769374609, "learning_rate": 1.289543033085161e-07, "loss": 0.0, "num_input_tokens_seen": 219713184, "step": 101875 }, { "epoch": 18.69700862543586, "grad_norm": 1.0147984027862549, "learning_rate": 1.287736774629017e-07, "loss": 0.0001, "num_input_tokens_seen": 219723968, "step": 101880 }, { "epoch": 18.697926224995413, "grad_norm": 0.008464708924293518, "learning_rate": 1.2859317655669279e-07, "loss": 0.0003, "num_input_tokens_seen": 219733728, "step": 101885 }, { "epoch": 18.698843824554963, "grad_norm": 0.020435677841305733, "learning_rate": 1.2841280059451844e-07, "loss": 0.0, "num_input_tokens_seen": 219744960, "step": 101890 }, { "epoch": 18.699761424114516, "grad_norm": 0.0016339393332600594, "learning_rate": 1.2823254958100606e-07, "loss": 0.0, "num_input_tokens_seen": 219755872, "step": 101895 }, { "epoch": 18.70067902367407, "grad_norm": 0.02822817862033844, "learning_rate": 1.2805242352077808e-07, "loss": 0.0, "num_input_tokens_seen": 219767040, "step": 101900 }, { "epoch": 18.70159662323362, "grad_norm": 0.001606718054972589, "learning_rate": 1.2787242241845465e-07, "loss": 0.0, "num_input_tokens_seen": 219777536, "step": 101905 }, { "epoch": 18.702514222793173, "grad_norm": 0.005903270095586777, "learning_rate": 1.2769254627865213e-07, "loss": 0.0, "num_input_tokens_seen": 219787936, "step": 101910 }, { "epoch": 18.703431822352726, "grad_norm": 0.0066660018637776375, "learning_rate": 1.2751279510598458e-07, "loss": 0.0, "num_input_tokens_seen": 219798720, "step": 101915 }, { "epoch": 18.704349421912276, "grad_norm": 0.0004236837849020958, "learning_rate": 1.273331689050622e-07, "loss": 0.0, "num_input_tokens_seen": 219809632, "step": 101920 }, { "epoch": 18.70526702147183, "grad_norm": 0.02469508908689022, "learning_rate": 1.2715366768049186e-07, "loss": 0.0, "num_input_tokens_seen": 219820160, "step": 101925 }, { "epoch": 18.706184621031383, "grad_norm": 0.003223787760362029, "learning_rate": 1.2697429143687768e-07, "loss": 0.0, "num_input_tokens_seen": 219831008, "step": 101930 }, { "epoch": 18.707102220590933, "grad_norm": 0.1292901337146759, "learning_rate": 1.2679504017882094e-07, "loss": 0.0032, "num_input_tokens_seen": 219842048, "step": 101935 }, { "epoch": 18.708019820150486, "grad_norm": 0.006780498661100864, "learning_rate": 1.2661591391091797e-07, "loss": 0.0, "num_input_tokens_seen": 219851840, "step": 101940 }, { "epoch": 18.70893741971004, "grad_norm": 9.250652313232422, "learning_rate": 1.2643691263776404e-07, "loss": 0.0026, "num_input_tokens_seen": 219861632, "step": 101945 }, { "epoch": 18.70985501926959, "grad_norm": 0.058511339128017426, "learning_rate": 1.262580363639504e-07, "loss": 0.0, "num_input_tokens_seen": 219872320, "step": 101950 }, { "epoch": 18.710772618829143, "grad_norm": 0.004082696978002787, "learning_rate": 1.2607928509406452e-07, "loss": 0.0, "num_input_tokens_seen": 219883520, "step": 101955 }, { "epoch": 18.711690218388696, "grad_norm": 0.3768559992313385, "learning_rate": 1.2590065883269e-07, "loss": 0.0001, "num_input_tokens_seen": 219894272, "step": 101960 }, { "epoch": 18.712607817948246, "grad_norm": 0.003062684088945389, "learning_rate": 1.257221575844103e-07, "loss": 0.0, "num_input_tokens_seen": 219904192, "step": 101965 }, { "epoch": 18.7135254175078, "grad_norm": 0.0011211959645152092, "learning_rate": 1.2554378135380296e-07, "loss": 0.0, "num_input_tokens_seen": 219914208, "step": 101970 }, { "epoch": 18.714443017067353, "grad_norm": 0.009926092810928822, "learning_rate": 1.2536553014544263e-07, "loss": 0.0, "num_input_tokens_seen": 219924736, "step": 101975 }, { "epoch": 18.715360616626903, "grad_norm": 0.06178281456232071, "learning_rate": 1.2518740396390115e-07, "loss": 0.0012, "num_input_tokens_seen": 219935520, "step": 101980 }, { "epoch": 18.716278216186456, "grad_norm": 0.0056024049408733845, "learning_rate": 1.2500940281374774e-07, "loss": 0.002, "num_input_tokens_seen": 219947712, "step": 101985 }, { "epoch": 18.71719581574601, "grad_norm": 0.0005124022136442363, "learning_rate": 1.2483152669954756e-07, "loss": 0.0, "num_input_tokens_seen": 219957696, "step": 101990 }, { "epoch": 18.71811341530556, "grad_norm": 48.80084228515625, "learning_rate": 1.2465377562586312e-07, "loss": 0.0674, "num_input_tokens_seen": 219967968, "step": 101995 }, { "epoch": 18.719031014865113, "grad_norm": 0.00234771054238081, "learning_rate": 1.244761495972535e-07, "loss": 0.0005, "num_input_tokens_seen": 219978400, "step": 102000 }, { "epoch": 18.719948614424666, "grad_norm": 0.017070069909095764, "learning_rate": 1.2429864861827345e-07, "loss": 0.0, "num_input_tokens_seen": 219989344, "step": 102005 }, { "epoch": 18.720866213984216, "grad_norm": 0.019046416506171227, "learning_rate": 1.2412127269347653e-07, "loss": 0.0, "num_input_tokens_seen": 220000032, "step": 102010 }, { "epoch": 18.72178381354377, "grad_norm": 0.10773654282093048, "learning_rate": 1.239440218274124e-07, "loss": 0.0, "num_input_tokens_seen": 220011616, "step": 102015 }, { "epoch": 18.722701413103323, "grad_norm": 0.0022146590054035187, "learning_rate": 1.237668960246269e-07, "loss": 0.0, "num_input_tokens_seen": 220023488, "step": 102020 }, { "epoch": 18.723619012662873, "grad_norm": 0.009851591661572456, "learning_rate": 1.2358989528966303e-07, "loss": 0.0, "num_input_tokens_seen": 220034048, "step": 102025 }, { "epoch": 18.724536612222426, "grad_norm": 0.011667526327073574, "learning_rate": 1.2341301962706054e-07, "loss": 0.0, "num_input_tokens_seen": 220045440, "step": 102030 }, { "epoch": 18.72545421178198, "grad_norm": 0.0009032362722791731, "learning_rate": 1.2323626904135578e-07, "loss": 0.0001, "num_input_tokens_seen": 220055968, "step": 102035 }, { "epoch": 18.72637181134153, "grad_norm": 0.0008739873883314431, "learning_rate": 1.2305964353708289e-07, "loss": 0.0025, "num_input_tokens_seen": 220066080, "step": 102040 }, { "epoch": 18.727289410901083, "grad_norm": 0.005729724187403917, "learning_rate": 1.2288314311877103e-07, "loss": 0.04, "num_input_tokens_seen": 220076288, "step": 102045 }, { "epoch": 18.728207010460636, "grad_norm": 0.0027389719616621733, "learning_rate": 1.2270676779094827e-07, "loss": 0.0, "num_input_tokens_seen": 220088096, "step": 102050 }, { "epoch": 18.729124610020186, "grad_norm": 0.0020872459281235933, "learning_rate": 1.225305175581376e-07, "loss": 0.0, "num_input_tokens_seen": 220099168, "step": 102055 }, { "epoch": 18.73004220957974, "grad_norm": 0.003802412888035178, "learning_rate": 1.2235439242485937e-07, "loss": 0.0001, "num_input_tokens_seen": 220109536, "step": 102060 }, { "epoch": 18.730959809139293, "grad_norm": 0.0024070965591818094, "learning_rate": 1.2217839239563156e-07, "loss": 0.0, "num_input_tokens_seen": 220120096, "step": 102065 }, { "epoch": 18.731877408698843, "grad_norm": 0.0013766525080427527, "learning_rate": 1.2200251747496838e-07, "loss": 0.0, "num_input_tokens_seen": 220130880, "step": 102070 }, { "epoch": 18.732795008258396, "grad_norm": 0.0010769772343337536, "learning_rate": 1.2182676766738012e-07, "loss": 0.0, "num_input_tokens_seen": 220142464, "step": 102075 }, { "epoch": 18.73371260781795, "grad_norm": 0.0023115817457437515, "learning_rate": 1.216511429773748e-07, "loss": 0.0, "num_input_tokens_seen": 220153504, "step": 102080 }, { "epoch": 18.7346302073775, "grad_norm": 0.38588330149650574, "learning_rate": 1.2147564340945718e-07, "loss": 0.0001, "num_input_tokens_seen": 220164864, "step": 102085 }, { "epoch": 18.735547806937053, "grad_norm": 0.004885051399469376, "learning_rate": 1.2130026896812809e-07, "loss": 0.0001, "num_input_tokens_seen": 220174976, "step": 102090 }, { "epoch": 18.736465406496606, "grad_norm": 0.0026563957799226046, "learning_rate": 1.2112501965788558e-07, "loss": 0.0, "num_input_tokens_seen": 220185536, "step": 102095 }, { "epoch": 18.737383006056156, "grad_norm": 0.0007352734683081508, "learning_rate": 1.20949895483225e-07, "loss": 0.0097, "num_input_tokens_seen": 220197376, "step": 102100 }, { "epoch": 18.73830060561571, "grad_norm": 0.006170389708131552, "learning_rate": 1.207748964486377e-07, "loss": 0.0, "num_input_tokens_seen": 220208160, "step": 102105 }, { "epoch": 18.739218205175263, "grad_norm": 0.017715567722916603, "learning_rate": 1.206000225586118e-07, "loss": 0.0, "num_input_tokens_seen": 220220032, "step": 102110 }, { "epoch": 18.740135804734813, "grad_norm": 0.010197550058364868, "learning_rate": 1.2042527381763313e-07, "loss": 0.0, "num_input_tokens_seen": 220231680, "step": 102115 }, { "epoch": 18.741053404294366, "grad_norm": 1.7856441736221313, "learning_rate": 1.2025065023018423e-07, "loss": 0.0027, "num_input_tokens_seen": 220242240, "step": 102120 }, { "epoch": 18.74197100385392, "grad_norm": 0.007096848450601101, "learning_rate": 1.2007615180074206e-07, "loss": 0.0048, "num_input_tokens_seen": 220253568, "step": 102125 }, { "epoch": 18.74288860341347, "grad_norm": 3.4737143516540527, "learning_rate": 1.1990177853378415e-07, "loss": 0.0003, "num_input_tokens_seen": 220264448, "step": 102130 }, { "epoch": 18.743806202973023, "grad_norm": 0.0028525942470878363, "learning_rate": 1.197275304337825e-07, "loss": 0.0, "num_input_tokens_seen": 220272896, "step": 102135 }, { "epoch": 18.744723802532576, "grad_norm": 0.0006872274097986519, "learning_rate": 1.1955340750520516e-07, "loss": 0.0, "num_input_tokens_seen": 220285120, "step": 102140 }, { "epoch": 18.745641402092126, "grad_norm": 0.0017946073785424232, "learning_rate": 1.1937940975251916e-07, "loss": 0.0, "num_input_tokens_seen": 220295232, "step": 102145 }, { "epoch": 18.74655900165168, "grad_norm": 0.0014710662653669715, "learning_rate": 1.1920553718018702e-07, "loss": 0.0, "num_input_tokens_seen": 220306816, "step": 102150 }, { "epoch": 18.747476601211233, "grad_norm": 0.013483013026416302, "learning_rate": 1.1903178979266905e-07, "loss": 0.0063, "num_input_tokens_seen": 220318560, "step": 102155 }, { "epoch": 18.748394200770782, "grad_norm": 0.001892958884127438, "learning_rate": 1.1885816759441948e-07, "loss": 0.0, "num_input_tokens_seen": 220329696, "step": 102160 }, { "epoch": 18.749311800330336, "grad_norm": 0.000664330436848104, "learning_rate": 1.1868467058989364e-07, "loss": 0.0, "num_input_tokens_seen": 220339808, "step": 102165 }, { "epoch": 18.75022939988989, "grad_norm": 0.0011008190922439098, "learning_rate": 1.1851129878354072e-07, "loss": 0.0, "num_input_tokens_seen": 220351680, "step": 102170 }, { "epoch": 18.75114699944944, "grad_norm": 0.000687193707562983, "learning_rate": 1.183380521798072e-07, "loss": 0.0, "num_input_tokens_seen": 220363328, "step": 102175 }, { "epoch": 18.752064599008992, "grad_norm": 0.007409111596643925, "learning_rate": 1.1816493078313674e-07, "loss": 0.0, "num_input_tokens_seen": 220374784, "step": 102180 }, { "epoch": 18.752982198568546, "grad_norm": 0.0003045650664716959, "learning_rate": 1.1799193459796965e-07, "loss": 0.0, "num_input_tokens_seen": 220384928, "step": 102185 }, { "epoch": 18.753899798128096, "grad_norm": 0.0008187273633666337, "learning_rate": 1.1781906362874296e-07, "loss": 0.0, "num_input_tokens_seen": 220395520, "step": 102190 }, { "epoch": 18.75481739768765, "grad_norm": 0.0026869173161685467, "learning_rate": 1.176463178798909e-07, "loss": 0.0, "num_input_tokens_seen": 220406784, "step": 102195 }, { "epoch": 18.755734997247203, "grad_norm": 0.005735037848353386, "learning_rate": 1.1747369735584324e-07, "loss": 0.0, "num_input_tokens_seen": 220416704, "step": 102200 }, { "epoch": 18.756652596806752, "grad_norm": 0.0012540871975943446, "learning_rate": 1.1730120206102869e-07, "loss": 0.0, "num_input_tokens_seen": 220426816, "step": 102205 }, { "epoch": 18.757570196366306, "grad_norm": 0.028771867975592613, "learning_rate": 1.1712883199987035e-07, "loss": 0.0, "num_input_tokens_seen": 220436576, "step": 102210 }, { "epoch": 18.75848779592586, "grad_norm": 0.0035756677389144897, "learning_rate": 1.1695658717679026e-07, "loss": 0.0003, "num_input_tokens_seen": 220448256, "step": 102215 }, { "epoch": 18.75940539548541, "grad_norm": 0.007488701492547989, "learning_rate": 1.1678446759620543e-07, "loss": 0.0, "num_input_tokens_seen": 220459392, "step": 102220 }, { "epoch": 18.760322995044962, "grad_norm": 0.001824927399866283, "learning_rate": 1.1661247326253011e-07, "loss": 0.0, "num_input_tokens_seen": 220469120, "step": 102225 }, { "epoch": 18.761240594604516, "grad_norm": 0.0007688597543165088, "learning_rate": 1.1644060418017689e-07, "loss": 0.0, "num_input_tokens_seen": 220479200, "step": 102230 }, { "epoch": 18.762158194164066, "grad_norm": 0.005966676864773035, "learning_rate": 1.1626886035355334e-07, "loss": 0.0, "num_input_tokens_seen": 220490624, "step": 102235 }, { "epoch": 18.76307579372362, "grad_norm": 0.0007500804495066404, "learning_rate": 1.1609724178706427e-07, "loss": 0.0, "num_input_tokens_seen": 220501440, "step": 102240 }, { "epoch": 18.763993393283172, "grad_norm": 0.005484138149768114, "learning_rate": 1.1592574848511118e-07, "loss": 0.0, "num_input_tokens_seen": 220513088, "step": 102245 }, { "epoch": 18.764910992842722, "grad_norm": 0.004077024757862091, "learning_rate": 1.1575438045209331e-07, "loss": 0.0, "num_input_tokens_seen": 220525504, "step": 102250 }, { "epoch": 18.765828592402276, "grad_norm": 31.206680297851562, "learning_rate": 1.1558313769240603e-07, "loss": 0.0087, "num_input_tokens_seen": 220533600, "step": 102255 }, { "epoch": 18.76674619196183, "grad_norm": 0.0006344602443277836, "learning_rate": 1.1541202021044029e-07, "loss": 0.0, "num_input_tokens_seen": 220544544, "step": 102260 }, { "epoch": 18.76766379152138, "grad_norm": 0.00020179286366328597, "learning_rate": 1.1524102801058646e-07, "loss": 0.0, "num_input_tokens_seen": 220554272, "step": 102265 }, { "epoch": 18.768581391080932, "grad_norm": 0.002554610837250948, "learning_rate": 1.150701610972299e-07, "loss": 0.0, "num_input_tokens_seen": 220564992, "step": 102270 }, { "epoch": 18.769498990640486, "grad_norm": 0.001052102423273027, "learning_rate": 1.1489941947475213e-07, "loss": 0.0944, "num_input_tokens_seen": 220576608, "step": 102275 }, { "epoch": 18.770416590200036, "grad_norm": 0.004047999158501625, "learning_rate": 1.1472880314753299e-07, "loss": 0.0, "num_input_tokens_seen": 220587008, "step": 102280 }, { "epoch": 18.77133418975959, "grad_norm": 0.0007454223232343793, "learning_rate": 1.1455831211994895e-07, "loss": 0.0, "num_input_tokens_seen": 220598784, "step": 102285 }, { "epoch": 18.772251789319142, "grad_norm": 0.0035640932619571686, "learning_rate": 1.1438794639637264e-07, "loss": 0.0, "num_input_tokens_seen": 220610176, "step": 102290 }, { "epoch": 18.773169388878692, "grad_norm": 0.0014185451436787844, "learning_rate": 1.1421770598117276e-07, "loss": 0.2288, "num_input_tokens_seen": 220621984, "step": 102295 }, { "epoch": 18.774086988438246, "grad_norm": 0.01957586035132408, "learning_rate": 1.1404759087871697e-07, "loss": 0.0, "num_input_tokens_seen": 220633312, "step": 102300 }, { "epoch": 18.7750045879978, "grad_norm": 0.009167634882032871, "learning_rate": 1.1387760109336788e-07, "loss": 0.0, "num_input_tokens_seen": 220642304, "step": 102305 }, { "epoch": 18.77592218755735, "grad_norm": 0.10052397847175598, "learning_rate": 1.1370773662948532e-07, "loss": 0.0097, "num_input_tokens_seen": 220653056, "step": 102310 }, { "epoch": 18.776839787116902, "grad_norm": 0.0009211681317538023, "learning_rate": 1.1353799749142636e-07, "loss": 0.0, "num_input_tokens_seen": 220664576, "step": 102315 }, { "epoch": 18.777757386676456, "grad_norm": 0.0006276571657508612, "learning_rate": 1.1336838368354419e-07, "loss": 0.0001, "num_input_tokens_seen": 220675264, "step": 102320 }, { "epoch": 18.778674986236005, "grad_norm": 0.002873073797672987, "learning_rate": 1.1319889521018978e-07, "loss": 0.0, "num_input_tokens_seen": 220685472, "step": 102325 }, { "epoch": 18.77959258579556, "grad_norm": 0.00268132658675313, "learning_rate": 1.1302953207570965e-07, "loss": 0.0119, "num_input_tokens_seen": 220697728, "step": 102330 }, { "epoch": 18.780510185355112, "grad_norm": 0.0006690039881505072, "learning_rate": 1.1286029428444812e-07, "loss": 0.0, "num_input_tokens_seen": 220708256, "step": 102335 }, { "epoch": 18.781427784914662, "grad_norm": 0.0027401172555983067, "learning_rate": 1.1269118184074556e-07, "loss": 0.0, "num_input_tokens_seen": 220718368, "step": 102340 }, { "epoch": 18.782345384474215, "grad_norm": 0.0011086371960118413, "learning_rate": 1.125221947489391e-07, "loss": 0.0, "num_input_tokens_seen": 220729120, "step": 102345 }, { "epoch": 18.78326298403377, "grad_norm": 0.0016168731963261962, "learning_rate": 1.1235333301336415e-07, "loss": 0.0, "num_input_tokens_seen": 220740288, "step": 102350 }, { "epoch": 18.78418058359332, "grad_norm": 0.05215935781598091, "learning_rate": 1.1218459663835058e-07, "loss": 0.0001, "num_input_tokens_seen": 220749344, "step": 102355 }, { "epoch": 18.785098183152872, "grad_norm": 0.13907912373542786, "learning_rate": 1.1201598562822713e-07, "loss": 0.0, "num_input_tokens_seen": 220759584, "step": 102360 }, { "epoch": 18.786015782712425, "grad_norm": 3.4006083011627197, "learning_rate": 1.1184749998731703e-07, "loss": 0.0002, "num_input_tokens_seen": 220771072, "step": 102365 }, { "epoch": 18.786933382271975, "grad_norm": 0.0004972645547240973, "learning_rate": 1.1167913971994348e-07, "loss": 0.002, "num_input_tokens_seen": 220780704, "step": 102370 }, { "epoch": 18.78785098183153, "grad_norm": 89.16556549072266, "learning_rate": 1.1151090483042359e-07, "loss": 0.0284, "num_input_tokens_seen": 220790624, "step": 102375 }, { "epoch": 18.788768581391082, "grad_norm": 1.185081124305725, "learning_rate": 1.1134279532307224e-07, "loss": 0.0003, "num_input_tokens_seen": 220801984, "step": 102380 }, { "epoch": 18.789686180950632, "grad_norm": 0.01638328842818737, "learning_rate": 1.1117481120220208e-07, "loss": 0.0001, "num_input_tokens_seen": 220811616, "step": 102385 }, { "epoch": 18.790603780510185, "grad_norm": 0.0013544567627832294, "learning_rate": 1.1100695247212079e-07, "loss": 0.0, "num_input_tokens_seen": 220822112, "step": 102390 }, { "epoch": 18.79152138006974, "grad_norm": 0.0018226626561954618, "learning_rate": 1.1083921913713325e-07, "loss": 0.1813, "num_input_tokens_seen": 220833888, "step": 102395 }, { "epoch": 18.79243897962929, "grad_norm": 0.012806735001504421, "learning_rate": 1.1067161120154268e-07, "loss": 0.0, "num_input_tokens_seen": 220845120, "step": 102400 }, { "epoch": 18.793356579188842, "grad_norm": 0.005163817200809717, "learning_rate": 1.1050412866964789e-07, "loss": 0.0, "num_input_tokens_seen": 220856800, "step": 102405 }, { "epoch": 18.794274178748395, "grad_norm": 0.003090139012783766, "learning_rate": 1.1033677154574373e-07, "loss": 0.0944, "num_input_tokens_seen": 220867968, "step": 102410 }, { "epoch": 18.795191778307945, "grad_norm": 0.000711343192961067, "learning_rate": 1.1016953983412349e-07, "loss": 0.0, "num_input_tokens_seen": 220880000, "step": 102415 }, { "epoch": 18.7961093778675, "grad_norm": 0.0012383931316435337, "learning_rate": 1.1000243353907536e-07, "loss": 0.0, "num_input_tokens_seen": 220890112, "step": 102420 }, { "epoch": 18.797026977427052, "grad_norm": 0.0017116123344749212, "learning_rate": 1.098354526648865e-07, "loss": 0.0, "num_input_tokens_seen": 220901824, "step": 102425 }, { "epoch": 18.797944576986602, "grad_norm": 0.0006425651954486966, "learning_rate": 1.09668597215839e-07, "loss": 0.0, "num_input_tokens_seen": 220912192, "step": 102430 }, { "epoch": 18.798862176546155, "grad_norm": 0.003924685530364513, "learning_rate": 1.095018671962128e-07, "loss": 0.0, "num_input_tokens_seen": 220921696, "step": 102435 }, { "epoch": 18.79977977610571, "grad_norm": 0.0019033802673220634, "learning_rate": 1.0933526261028449e-07, "loss": 0.0, "num_input_tokens_seen": 220934016, "step": 102440 }, { "epoch": 18.80069737566526, "grad_norm": 0.0005971162463538349, "learning_rate": 1.0916878346232618e-07, "loss": 0.0, "num_input_tokens_seen": 220943904, "step": 102445 }, { "epoch": 18.801614975224812, "grad_norm": 0.0006191600696183741, "learning_rate": 1.0900242975660835e-07, "loss": 0.0, "num_input_tokens_seen": 220954016, "step": 102450 }, { "epoch": 18.802532574784365, "grad_norm": 0.004927974659949541, "learning_rate": 1.0883620149739871e-07, "loss": 0.0, "num_input_tokens_seen": 220964640, "step": 102455 }, { "epoch": 18.803450174343915, "grad_norm": 0.001153131714090705, "learning_rate": 1.0867009868895939e-07, "loss": 0.0, "num_input_tokens_seen": 220974496, "step": 102460 }, { "epoch": 18.80436777390347, "grad_norm": 0.0009598474716767669, "learning_rate": 1.0850412133555088e-07, "loss": 0.0001, "num_input_tokens_seen": 220985472, "step": 102465 }, { "epoch": 18.805285373463022, "grad_norm": 0.004834710620343685, "learning_rate": 1.0833826944143089e-07, "loss": 0.0, "num_input_tokens_seen": 220995840, "step": 102470 }, { "epoch": 18.80620297302257, "grad_norm": 0.005793024320155382, "learning_rate": 1.0817254301085267e-07, "loss": 0.0, "num_input_tokens_seen": 221006656, "step": 102475 }, { "epoch": 18.807120572582125, "grad_norm": 0.019844043999910355, "learning_rate": 1.0800694204806672e-07, "loss": 0.0051, "num_input_tokens_seen": 221016480, "step": 102480 }, { "epoch": 18.80803817214168, "grad_norm": 0.010880285874009132, "learning_rate": 1.078414665573213e-07, "loss": 0.0, "num_input_tokens_seen": 221027488, "step": 102485 }, { "epoch": 18.80895577170123, "grad_norm": 0.0005889565800316632, "learning_rate": 1.0767611654286025e-07, "loss": 0.0, "num_input_tokens_seen": 221037824, "step": 102490 }, { "epoch": 18.80987337126078, "grad_norm": 0.0006677046185359359, "learning_rate": 1.0751089200892461e-07, "loss": 0.0, "num_input_tokens_seen": 221048736, "step": 102495 }, { "epoch": 18.810790970820335, "grad_norm": 0.0026287660002708435, "learning_rate": 1.0734579295975101e-07, "loss": 0.0, "num_input_tokens_seen": 221060608, "step": 102500 }, { "epoch": 18.811708570379885, "grad_norm": 0.0011254925047978759, "learning_rate": 1.0718081939957548e-07, "loss": 0.056, "num_input_tokens_seen": 221070976, "step": 102505 }, { "epoch": 18.81262616993944, "grad_norm": 0.003436281578615308, "learning_rate": 1.0701597133262908e-07, "loss": 0.0, "num_input_tokens_seen": 221082496, "step": 102510 }, { "epoch": 18.81354376949899, "grad_norm": 0.0011674368288367987, "learning_rate": 1.0685124876313901e-07, "loss": 0.0, "num_input_tokens_seen": 221093600, "step": 102515 }, { "epoch": 18.81446136905854, "grad_norm": 0.009602510370314121, "learning_rate": 1.0668665169533076e-07, "loss": 0.0001, "num_input_tokens_seen": 221105568, "step": 102520 }, { "epoch": 18.815378968618095, "grad_norm": 0.0005872242618352175, "learning_rate": 1.0652218013342596e-07, "loss": 0.0, "num_input_tokens_seen": 221113728, "step": 102525 }, { "epoch": 18.81629656817765, "grad_norm": 0.0015901824226602912, "learning_rate": 1.0635783408164291e-07, "loss": 0.0, "num_input_tokens_seen": 221124160, "step": 102530 }, { "epoch": 18.8172141677372, "grad_norm": 0.003941853065043688, "learning_rate": 1.0619361354419766e-07, "loss": 0.0, "num_input_tokens_seen": 221136256, "step": 102535 }, { "epoch": 18.81813176729675, "grad_norm": 0.0008364773239009082, "learning_rate": 1.0602951852530075e-07, "loss": 0.0, "num_input_tokens_seen": 221147680, "step": 102540 }, { "epoch": 18.819049366856305, "grad_norm": 0.010318825952708721, "learning_rate": 1.0586554902916214e-07, "loss": 0.0006, "num_input_tokens_seen": 221157856, "step": 102545 }, { "epoch": 18.819966966415855, "grad_norm": 0.00359726301394403, "learning_rate": 1.0570170505998679e-07, "loss": 0.0, "num_input_tokens_seen": 221167776, "step": 102550 }, { "epoch": 18.82088456597541, "grad_norm": 0.007563900202512741, "learning_rate": 1.0553798662197745e-07, "loss": 0.0, "num_input_tokens_seen": 221179232, "step": 102555 }, { "epoch": 18.82180216553496, "grad_norm": 0.006551203317940235, "learning_rate": 1.05374393719333e-07, "loss": 0.0, "num_input_tokens_seen": 221189408, "step": 102560 }, { "epoch": 18.82271976509451, "grad_norm": 0.003475039964541793, "learning_rate": 1.0521092635624897e-07, "loss": 0.0, "num_input_tokens_seen": 221200480, "step": 102565 }, { "epoch": 18.823637364654065, "grad_norm": 0.007240969687700272, "learning_rate": 1.0504758453691866e-07, "loss": 0.0, "num_input_tokens_seen": 221210048, "step": 102570 }, { "epoch": 18.82455496421362, "grad_norm": 0.018070293590426445, "learning_rate": 1.048843682655315e-07, "loss": 0.0, "num_input_tokens_seen": 221220736, "step": 102575 }, { "epoch": 18.825472563773168, "grad_norm": 0.0007563817780464888, "learning_rate": 1.0472127754627304e-07, "loss": 0.0, "num_input_tokens_seen": 221230880, "step": 102580 }, { "epoch": 18.82639016333272, "grad_norm": 0.0030380096286535263, "learning_rate": 1.045583123833277e-07, "loss": 0.0, "num_input_tokens_seen": 221242272, "step": 102585 }, { "epoch": 18.827307762892275, "grad_norm": 0.020163118839263916, "learning_rate": 1.043954727808738e-07, "loss": 0.0001, "num_input_tokens_seen": 221253312, "step": 102590 }, { "epoch": 18.828225362451825, "grad_norm": 0.0007673284271731973, "learning_rate": 1.0423275874308858e-07, "loss": 0.0, "num_input_tokens_seen": 221263744, "step": 102595 }, { "epoch": 18.829142962011378, "grad_norm": 0.003861251985654235, "learning_rate": 1.0407017027414535e-07, "loss": 0.0, "num_input_tokens_seen": 221274912, "step": 102600 }, { "epoch": 18.83006056157093, "grad_norm": 0.0018289471045136452, "learning_rate": 1.039077073782141e-07, "loss": 0.0001, "num_input_tokens_seen": 221285728, "step": 102605 }, { "epoch": 18.83097816113048, "grad_norm": 0.0008766860701143742, "learning_rate": 1.0374537005946261e-07, "loss": 0.0, "num_input_tokens_seen": 221296480, "step": 102610 }, { "epoch": 18.831895760690035, "grad_norm": 0.31442293524742126, "learning_rate": 1.0358315832205257e-07, "loss": 0.0001, "num_input_tokens_seen": 221306976, "step": 102615 }, { "epoch": 18.832813360249588, "grad_norm": 0.008400969207286835, "learning_rate": 1.0342107217014674e-07, "loss": 0.0, "num_input_tokens_seen": 221316864, "step": 102620 }, { "epoch": 18.833730959809138, "grad_norm": 0.009643291123211384, "learning_rate": 1.0325911160790126e-07, "loss": 0.0, "num_input_tokens_seen": 221326208, "step": 102625 }, { "epoch": 18.83464855936869, "grad_norm": 0.001917426474392414, "learning_rate": 1.0309727663947055e-07, "loss": 0.0, "num_input_tokens_seen": 221337632, "step": 102630 }, { "epoch": 18.835566158928245, "grad_norm": 0.0012307852739468217, "learning_rate": 1.0293556726900522e-07, "loss": 0.0, "num_input_tokens_seen": 221348448, "step": 102635 }, { "epoch": 18.836483758487795, "grad_norm": 0.0019356185803189874, "learning_rate": 1.0277398350065249e-07, "loss": 0.0, "num_input_tokens_seen": 221359328, "step": 102640 }, { "epoch": 18.837401358047348, "grad_norm": 0.0009978211019188166, "learning_rate": 1.0261252533855681e-07, "loss": 0.0, "num_input_tokens_seen": 221370528, "step": 102645 }, { "epoch": 18.8383189576069, "grad_norm": 0.14840571582317352, "learning_rate": 1.0245119278685989e-07, "loss": 0.0001, "num_input_tokens_seen": 221381696, "step": 102650 }, { "epoch": 18.83923655716645, "grad_norm": 0.032591186463832855, "learning_rate": 1.0228998584969951e-07, "loss": 0.0, "num_input_tokens_seen": 221392832, "step": 102655 }, { "epoch": 18.840154156726005, "grad_norm": 0.004988244269043207, "learning_rate": 1.0212890453121016e-07, "loss": 0.0, "num_input_tokens_seen": 221403168, "step": 102660 }, { "epoch": 18.841071756285558, "grad_norm": 0.0028836727142333984, "learning_rate": 1.0196794883552296e-07, "loss": 0.001, "num_input_tokens_seen": 221414336, "step": 102665 }, { "epoch": 18.841989355845108, "grad_norm": 0.014257114380598068, "learning_rate": 1.0180711876676686e-07, "loss": 0.0, "num_input_tokens_seen": 221426080, "step": 102670 }, { "epoch": 18.84290695540466, "grad_norm": 0.0005611596861854196, "learning_rate": 1.0164641432906686e-07, "loss": 0.0119, "num_input_tokens_seen": 221438080, "step": 102675 }, { "epoch": 18.843824554964215, "grad_norm": 0.0015643415972590446, "learning_rate": 1.0148583552654467e-07, "loss": 0.0, "num_input_tokens_seen": 221447488, "step": 102680 }, { "epoch": 18.844742154523765, "grad_norm": 0.0028858943842351437, "learning_rate": 1.0132538236331813e-07, "loss": 0.0, "num_input_tokens_seen": 221459008, "step": 102685 }, { "epoch": 18.845659754083318, "grad_norm": 0.0002807606942951679, "learning_rate": 1.0116505484350392e-07, "loss": 0.0, "num_input_tokens_seen": 221471104, "step": 102690 }, { "epoch": 18.84657735364287, "grad_norm": 0.005329339299350977, "learning_rate": 1.0100485297121321e-07, "loss": 0.0, "num_input_tokens_seen": 221481440, "step": 102695 }, { "epoch": 18.84749495320242, "grad_norm": 0.0006798660615459085, "learning_rate": 1.0084477675055548e-07, "loss": 0.0, "num_input_tokens_seen": 221492672, "step": 102700 }, { "epoch": 18.848412552761975, "grad_norm": 0.0007924375240691006, "learning_rate": 1.006848261856358e-07, "loss": 0.0, "num_input_tokens_seen": 221502720, "step": 102705 }, { "epoch": 18.849330152321528, "grad_norm": 0.0013023396022617817, "learning_rate": 1.0052500128055753e-07, "loss": 0.0032, "num_input_tokens_seen": 221514144, "step": 102710 }, { "epoch": 18.850247751881078, "grad_norm": 0.005233472678810358, "learning_rate": 1.0036530203941908e-07, "loss": 0.0, "num_input_tokens_seen": 221526080, "step": 102715 }, { "epoch": 18.85116535144063, "grad_norm": 0.0021010541822761297, "learning_rate": 1.0020572846631771e-07, "loss": 0.0207, "num_input_tokens_seen": 221536608, "step": 102720 }, { "epoch": 18.852082951000185, "grad_norm": 0.0014828827697783709, "learning_rate": 1.000462805653446e-07, "loss": 0.0, "num_input_tokens_seen": 221548064, "step": 102725 }, { "epoch": 18.853000550559734, "grad_norm": 0.0014104051515460014, "learning_rate": 9.988695834059092e-08, "loss": 0.0, "num_input_tokens_seen": 221558816, "step": 102730 }, { "epoch": 18.853918150119288, "grad_norm": 0.0014871264575049281, "learning_rate": 9.972776179614118e-08, "loss": 0.0, "num_input_tokens_seen": 221568928, "step": 102735 }, { "epoch": 18.85483574967884, "grad_norm": 0.000967176107224077, "learning_rate": 9.956869093608046e-08, "loss": 0.0, "num_input_tokens_seen": 221579936, "step": 102740 }, { "epoch": 18.85575334923839, "grad_norm": 0.006735539995133877, "learning_rate": 9.94097457644877e-08, "loss": 0.175, "num_input_tokens_seen": 221590944, "step": 102745 }, { "epoch": 18.856670948797944, "grad_norm": 0.0006518074660561979, "learning_rate": 9.925092628543908e-08, "loss": 0.0001, "num_input_tokens_seen": 221601120, "step": 102750 }, { "epoch": 18.857588548357498, "grad_norm": 0.1283121556043625, "learning_rate": 9.90922325030097e-08, "loss": 0.0001, "num_input_tokens_seen": 221611904, "step": 102755 }, { "epoch": 18.858506147917048, "grad_norm": 0.002371957991272211, "learning_rate": 9.89336644212685e-08, "loss": 0.0, "num_input_tokens_seen": 221622880, "step": 102760 }, { "epoch": 18.8594237474766, "grad_norm": 0.20628869533538818, "learning_rate": 9.877522204428225e-08, "loss": 0.0, "num_input_tokens_seen": 221633152, "step": 102765 }, { "epoch": 18.860341347036155, "grad_norm": 109.00927734375, "learning_rate": 9.861690537611601e-08, "loss": 0.0478, "num_input_tokens_seen": 221645184, "step": 102770 }, { "epoch": 18.861258946595704, "grad_norm": 0.012276819907128811, "learning_rate": 9.845871442082989e-08, "loss": 0.0, "num_input_tokens_seen": 221655488, "step": 102775 }, { "epoch": 18.862176546155258, "grad_norm": 0.01794968545436859, "learning_rate": 9.830064918248061e-08, "loss": 0.0, "num_input_tokens_seen": 221666400, "step": 102780 }, { "epoch": 18.86309414571481, "grad_norm": 0.007999351248145103, "learning_rate": 9.814270966512218e-08, "loss": 0.0, "num_input_tokens_seen": 221677504, "step": 102785 }, { "epoch": 18.86401174527436, "grad_norm": 0.04678093269467354, "learning_rate": 9.79848958728069e-08, "loss": 0.0, "num_input_tokens_seen": 221689088, "step": 102790 }, { "epoch": 18.864929344833914, "grad_norm": 0.0012234882451593876, "learning_rate": 9.782720780958155e-08, "loss": 0.0005, "num_input_tokens_seen": 221699744, "step": 102795 }, { "epoch": 18.865846944393468, "grad_norm": 0.0016701897839084268, "learning_rate": 9.766964547949009e-08, "loss": 0.0, "num_input_tokens_seen": 221709760, "step": 102800 }, { "epoch": 18.866764543953018, "grad_norm": 33.22223663330078, "learning_rate": 9.751220888657486e-08, "loss": 0.2006, "num_input_tokens_seen": 221720384, "step": 102805 }, { "epoch": 18.86768214351257, "grad_norm": 0.0005061405245214701, "learning_rate": 9.73548980348732e-08, "loss": 0.0, "num_input_tokens_seen": 221731200, "step": 102810 }, { "epoch": 18.868599743072124, "grad_norm": 0.003189506707713008, "learning_rate": 9.719771292842017e-08, "loss": 0.0032, "num_input_tokens_seen": 221740672, "step": 102815 }, { "epoch": 18.869517342631674, "grad_norm": 0.003363093826919794, "learning_rate": 9.704065357124648e-08, "loss": 0.0974, "num_input_tokens_seen": 221752160, "step": 102820 }, { "epoch": 18.870434942191228, "grad_norm": 0.0021845775190740824, "learning_rate": 9.688371996738166e-08, "loss": 0.0506, "num_input_tokens_seen": 221762784, "step": 102825 }, { "epoch": 18.87135254175078, "grad_norm": 0.005675874650478363, "learning_rate": 9.672691212085028e-08, "loss": 0.0, "num_input_tokens_seen": 221772576, "step": 102830 }, { "epoch": 18.87227014131033, "grad_norm": 0.001818542368710041, "learning_rate": 9.657023003567411e-08, "loss": 0.0001, "num_input_tokens_seen": 221783264, "step": 102835 }, { "epoch": 18.873187740869884, "grad_norm": 0.01945566199719906, "learning_rate": 9.641367371587163e-08, "loss": 0.0, "num_input_tokens_seen": 221794240, "step": 102840 }, { "epoch": 18.874105340429438, "grad_norm": 0.0017608817433938384, "learning_rate": 9.625724316545904e-08, "loss": 0.0, "num_input_tokens_seen": 221804704, "step": 102845 }, { "epoch": 18.875022939988988, "grad_norm": 0.012531629763543606, "learning_rate": 9.610093838844814e-08, "loss": 0.0, "num_input_tokens_seen": 221815936, "step": 102850 }, { "epoch": 18.87594053954854, "grad_norm": 0.00411049323156476, "learning_rate": 9.594475938884739e-08, "loss": 0.0, "num_input_tokens_seen": 221826464, "step": 102855 }, { "epoch": 18.876858139108094, "grad_norm": 0.0015282457461580634, "learning_rate": 9.57887061706636e-08, "loss": 0.0, "num_input_tokens_seen": 221835648, "step": 102860 }, { "epoch": 18.877775738667644, "grad_norm": 0.011431525461375713, "learning_rate": 9.563277873789745e-08, "loss": 0.0207, "num_input_tokens_seen": 221845952, "step": 102865 }, { "epoch": 18.878693338227198, "grad_norm": 0.0050770738162100315, "learning_rate": 9.547697709455073e-08, "loss": 0.0001, "num_input_tokens_seen": 221857184, "step": 102870 }, { "epoch": 18.87961093778675, "grad_norm": 0.003577908966690302, "learning_rate": 9.532130124461747e-08, "loss": 0.0, "num_input_tokens_seen": 221866688, "step": 102875 }, { "epoch": 18.8805285373463, "grad_norm": 0.0006248718127608299, "learning_rate": 9.516575119209171e-08, "loss": 0.0, "num_input_tokens_seen": 221878176, "step": 102880 }, { "epoch": 18.881446136905854, "grad_norm": 0.000975768081843853, "learning_rate": 9.50103269409619e-08, "loss": 0.0, "num_input_tokens_seen": 221889632, "step": 102885 }, { "epoch": 18.882363736465408, "grad_norm": 0.28720149397850037, "learning_rate": 9.485502849521599e-08, "loss": 0.0, "num_input_tokens_seen": 221901504, "step": 102890 }, { "epoch": 18.883281336024957, "grad_norm": 0.0005100092967040837, "learning_rate": 9.469985585883579e-08, "loss": 0.0, "num_input_tokens_seen": 221912736, "step": 102895 }, { "epoch": 18.88419893558451, "grad_norm": 0.0005575598333962262, "learning_rate": 9.454480903580143e-08, "loss": 0.0, "num_input_tokens_seen": 221923872, "step": 102900 }, { "epoch": 18.885116535144064, "grad_norm": 0.002023735549300909, "learning_rate": 9.438988803009086e-08, "loss": 0.0006, "num_input_tokens_seen": 221933184, "step": 102905 }, { "epoch": 18.886034134703614, "grad_norm": 0.16082419455051422, "learning_rate": 9.423509284567645e-08, "loss": 0.0, "num_input_tokens_seen": 221943104, "step": 102910 }, { "epoch": 18.886951734263167, "grad_norm": 0.000700916803907603, "learning_rate": 9.408042348652835e-08, "loss": 0.0, "num_input_tokens_seen": 221954016, "step": 102915 }, { "epoch": 18.88786933382272, "grad_norm": 0.003250515554100275, "learning_rate": 9.392587995661396e-08, "loss": 0.0, "num_input_tokens_seen": 221964736, "step": 102920 }, { "epoch": 18.88878693338227, "grad_norm": 0.036523763090372086, "learning_rate": 9.377146225989676e-08, "loss": 0.0001, "num_input_tokens_seen": 221975648, "step": 102925 }, { "epoch": 18.889704532941824, "grad_norm": 0.0016795576084405184, "learning_rate": 9.361717040033802e-08, "loss": 0.0, "num_input_tokens_seen": 221986368, "step": 102930 }, { "epoch": 18.890622132501377, "grad_norm": 0.0006216347683221102, "learning_rate": 9.34630043818946e-08, "loss": 0.0, "num_input_tokens_seen": 221995200, "step": 102935 }, { "epoch": 18.891539732060927, "grad_norm": 0.0029791889246553183, "learning_rate": 9.330896420852054e-08, "loss": 0.0, "num_input_tokens_seen": 222005632, "step": 102940 }, { "epoch": 18.89245733162048, "grad_norm": 0.00044993404299020767, "learning_rate": 9.315504988416713e-08, "loss": 0.0, "num_input_tokens_seen": 222015328, "step": 102945 }, { "epoch": 18.893374931180034, "grad_norm": 0.00033306903787888587, "learning_rate": 9.300126141278177e-08, "loss": 0.0001, "num_input_tokens_seen": 222026080, "step": 102950 }, { "epoch": 18.894292530739584, "grad_norm": 0.012740053236484528, "learning_rate": 9.284759879830796e-08, "loss": 0.0, "num_input_tokens_seen": 222035936, "step": 102955 }, { "epoch": 18.895210130299137, "grad_norm": 0.001743566244840622, "learning_rate": 9.269406204468867e-08, "loss": 0.0, "num_input_tokens_seen": 222046944, "step": 102960 }, { "epoch": 18.89612772985869, "grad_norm": 0.0010615031933411956, "learning_rate": 9.25406511558613e-08, "loss": 0.0002, "num_input_tokens_seen": 222057696, "step": 102965 }, { "epoch": 18.89704532941824, "grad_norm": 0.0012273520696908236, "learning_rate": 9.238736613575994e-08, "loss": 0.0, "num_input_tokens_seen": 222067680, "step": 102970 }, { "epoch": 18.897962928977794, "grad_norm": 0.0018597162561491132, "learning_rate": 9.223420698831642e-08, "loss": 0.0, "num_input_tokens_seen": 222078624, "step": 102975 }, { "epoch": 18.898880528537347, "grad_norm": 0.001160806161351502, "learning_rate": 9.208117371745928e-08, "loss": 0.0, "num_input_tokens_seen": 222089600, "step": 102980 }, { "epoch": 18.899798128096897, "grad_norm": 0.004864903632551432, "learning_rate": 9.192826632711315e-08, "loss": 0.0, "num_input_tokens_seen": 222101536, "step": 102985 }, { "epoch": 18.90071572765645, "grad_norm": 0.004231756553053856, "learning_rate": 9.177548482120102e-08, "loss": 0.0, "num_input_tokens_seen": 222111712, "step": 102990 }, { "epoch": 18.901633327216004, "grad_norm": 0.0016501812497153878, "learning_rate": 9.16228292036403e-08, "loss": 0.0, "num_input_tokens_seen": 222122528, "step": 102995 }, { "epoch": 18.902550926775554, "grad_norm": 0.0054405140690505505, "learning_rate": 9.147029947834618e-08, "loss": 0.0, "num_input_tokens_seen": 222133056, "step": 103000 }, { "epoch": 18.903468526335107, "grad_norm": 0.001887560705654323, "learning_rate": 9.131789564923166e-08, "loss": 0.0, "num_input_tokens_seen": 222143488, "step": 103005 }, { "epoch": 18.90438612589466, "grad_norm": 0.0031681968830525875, "learning_rate": 9.116561772020527e-08, "loss": 0.0, "num_input_tokens_seen": 222153664, "step": 103010 }, { "epoch": 18.90530372545421, "grad_norm": 0.03843459486961365, "learning_rate": 9.101346569517334e-08, "loss": 0.0, "num_input_tokens_seen": 222164896, "step": 103015 }, { "epoch": 18.906221325013764, "grad_norm": 0.0004877810715697706, "learning_rate": 9.086143957803717e-08, "loss": 0.0, "num_input_tokens_seen": 222175488, "step": 103020 }, { "epoch": 18.907138924573317, "grad_norm": 0.002358849858865142, "learning_rate": 9.070953937269645e-08, "loss": 0.0, "num_input_tokens_seen": 222185568, "step": 103025 }, { "epoch": 18.908056524132867, "grad_norm": 0.0022404666524380445, "learning_rate": 9.055776508304804e-08, "loss": 0.0, "num_input_tokens_seen": 222195648, "step": 103030 }, { "epoch": 18.90897412369242, "grad_norm": 0.0016949187265709043, "learning_rate": 9.040611671298327e-08, "loss": 0.0, "num_input_tokens_seen": 222206656, "step": 103035 }, { "epoch": 18.909891723251974, "grad_norm": 0.0010259365662932396, "learning_rate": 9.025459426639294e-08, "loss": 0.0, "num_input_tokens_seen": 222217184, "step": 103040 }, { "epoch": 18.910809322811524, "grad_norm": 0.0042606620118021965, "learning_rate": 9.010319774716281e-08, "loss": 0.0, "num_input_tokens_seen": 222227808, "step": 103045 }, { "epoch": 18.911726922371077, "grad_norm": 0.0009551006951369345, "learning_rate": 8.995192715917588e-08, "loss": 0.0, "num_input_tokens_seen": 222238624, "step": 103050 }, { "epoch": 18.91264452193063, "grad_norm": 0.0009978031739592552, "learning_rate": 8.980078250631241e-08, "loss": 0.0032, "num_input_tokens_seen": 222250560, "step": 103055 }, { "epoch": 18.91356212149018, "grad_norm": 0.0007083728560246527, "learning_rate": 8.964976379244816e-08, "loss": 0.0, "num_input_tokens_seen": 222261408, "step": 103060 }, { "epoch": 18.914479721049734, "grad_norm": 0.007314001210033894, "learning_rate": 8.949887102145783e-08, "loss": 0.0, "num_input_tokens_seen": 222271072, "step": 103065 }, { "epoch": 18.915397320609287, "grad_norm": 0.0004242356226313859, "learning_rate": 8.934810419721052e-08, "loss": 0.0, "num_input_tokens_seen": 222282112, "step": 103070 }, { "epoch": 18.916314920168837, "grad_norm": 0.1285458654165268, "learning_rate": 8.91974633235737e-08, "loss": 0.0001, "num_input_tokens_seen": 222291456, "step": 103075 }, { "epoch": 18.91723251972839, "grad_norm": 0.0022827854845672846, "learning_rate": 8.904694840441041e-08, "loss": 0.0001, "num_input_tokens_seen": 222301664, "step": 103080 }, { "epoch": 18.918150119287944, "grad_norm": 15.158575057983398, "learning_rate": 8.8896559443582e-08, "loss": 0.002, "num_input_tokens_seen": 222312448, "step": 103085 }, { "epoch": 18.919067718847494, "grad_norm": 0.004164727870374918, "learning_rate": 8.874629644494481e-08, "loss": 0.0097, "num_input_tokens_seen": 222323488, "step": 103090 }, { "epoch": 18.919985318407047, "grad_norm": 0.0011372852604836226, "learning_rate": 8.85961594123541e-08, "loss": 0.0, "num_input_tokens_seen": 222334240, "step": 103095 }, { "epoch": 18.9209029179666, "grad_norm": 0.010325336828827858, "learning_rate": 8.844614834965959e-08, "loss": 0.0, "num_input_tokens_seen": 222345600, "step": 103100 }, { "epoch": 18.92182051752615, "grad_norm": 0.0016888045938685536, "learning_rate": 8.829626326070872e-08, "loss": 0.0, "num_input_tokens_seen": 222357280, "step": 103105 }, { "epoch": 18.922738117085704, "grad_norm": 0.04082108661532402, "learning_rate": 8.814650414934677e-08, "loss": 0.0, "num_input_tokens_seen": 222367584, "step": 103110 }, { "epoch": 18.923655716645257, "grad_norm": 0.03537924960255623, "learning_rate": 8.799687101941456e-08, "loss": 0.0, "num_input_tokens_seen": 222378528, "step": 103115 }, { "epoch": 18.924573316204807, "grad_norm": 0.059586554765701294, "learning_rate": 8.784736387474902e-08, "loss": 0.0001, "num_input_tokens_seen": 222389536, "step": 103120 }, { "epoch": 18.92549091576436, "grad_norm": 0.006049197632819414, "learning_rate": 8.769798271918595e-08, "loss": 0.0001, "num_input_tokens_seen": 222400384, "step": 103125 }, { "epoch": 18.926408515323914, "grad_norm": 0.0005162028246559203, "learning_rate": 8.75487275565562e-08, "loss": 0.0, "num_input_tokens_seen": 222410592, "step": 103130 }, { "epoch": 18.927326114883464, "grad_norm": 0.07045195996761322, "learning_rate": 8.739959839068779e-08, "loss": 0.0, "num_input_tokens_seen": 222422240, "step": 103135 }, { "epoch": 18.928243714443017, "grad_norm": 0.002607955364510417, "learning_rate": 8.725059522540546e-08, "loss": 0.0, "num_input_tokens_seen": 222433248, "step": 103140 }, { "epoch": 18.92916131400257, "grad_norm": 0.030283644795417786, "learning_rate": 8.710171806453171e-08, "loss": 0.0, "num_input_tokens_seen": 222443264, "step": 103145 }, { "epoch": 18.93007891356212, "grad_norm": 0.007335347589105368, "learning_rate": 8.695296691188514e-08, "loss": 0.0, "num_input_tokens_seen": 222454240, "step": 103150 }, { "epoch": 18.930996513121674, "grad_norm": 0.0014823491219431162, "learning_rate": 8.680434177127938e-08, "loss": 0.0, "num_input_tokens_seen": 222465984, "step": 103155 }, { "epoch": 18.931914112681227, "grad_norm": 0.0017333008581772447, "learning_rate": 8.665584264652805e-08, "loss": 0.0, "num_input_tokens_seen": 222476032, "step": 103160 }, { "epoch": 18.932831712240777, "grad_norm": 0.0013963328674435616, "learning_rate": 8.650746954143919e-08, "loss": 0.0, "num_input_tokens_seen": 222486976, "step": 103165 }, { "epoch": 18.93374931180033, "grad_norm": 0.00044672959484159946, "learning_rate": 8.635922245981865e-08, "loss": 0.0, "num_input_tokens_seen": 222498016, "step": 103170 }, { "epoch": 18.934666911359884, "grad_norm": 0.10036129504442215, "learning_rate": 8.62111014054684e-08, "loss": 0.0002, "num_input_tokens_seen": 222508896, "step": 103175 }, { "epoch": 18.935584510919433, "grad_norm": 0.005440548527985811, "learning_rate": 8.606310638218818e-08, "loss": 0.0, "num_input_tokens_seen": 222520832, "step": 103180 }, { "epoch": 18.936502110478987, "grad_norm": 0.005436814855784178, "learning_rate": 8.591523739377328e-08, "loss": 0.0588, "num_input_tokens_seen": 222529504, "step": 103185 }, { "epoch": 18.93741971003854, "grad_norm": 0.01532912626862526, "learning_rate": 8.576749444401566e-08, "loss": 0.0, "num_input_tokens_seen": 222539104, "step": 103190 }, { "epoch": 18.93833730959809, "grad_norm": 0.002440112177282572, "learning_rate": 8.56198775367062e-08, "loss": 0.0144, "num_input_tokens_seen": 222549664, "step": 103195 }, { "epoch": 18.939254909157643, "grad_norm": 0.0004792924446519464, "learning_rate": 8.547238667563018e-08, "loss": 0.0, "num_input_tokens_seen": 222560832, "step": 103200 }, { "epoch": 18.940172508717197, "grad_norm": 0.002619327511638403, "learning_rate": 8.532502186457014e-08, "loss": 0.0, "num_input_tokens_seen": 222570304, "step": 103205 }, { "epoch": 18.941090108276747, "grad_norm": 0.059193409979343414, "learning_rate": 8.517778310730696e-08, "loss": 0.0001, "num_input_tokens_seen": 222580768, "step": 103210 }, { "epoch": 18.9420077078363, "grad_norm": 0.007147443946450949, "learning_rate": 8.503067040761593e-08, "loss": 0.0, "num_input_tokens_seen": 222590784, "step": 103215 }, { "epoch": 18.942925307395853, "grad_norm": 0.14238502085208893, "learning_rate": 8.48836837692707e-08, "loss": 0.0, "num_input_tokens_seen": 222600960, "step": 103220 }, { "epoch": 18.943842906955403, "grad_norm": 0.0005457086954265833, "learning_rate": 8.473682319604104e-08, "loss": 0.0063, "num_input_tokens_seen": 222611168, "step": 103225 }, { "epoch": 18.944760506514957, "grad_norm": 0.000978380092419684, "learning_rate": 8.45900886916945e-08, "loss": 0.0, "num_input_tokens_seen": 222621120, "step": 103230 }, { "epoch": 18.94567810607451, "grad_norm": 0.004315308295190334, "learning_rate": 8.44434802599936e-08, "loss": 0.0, "num_input_tokens_seen": 222632192, "step": 103235 }, { "epoch": 18.94659570563406, "grad_norm": 0.0006619992782361805, "learning_rate": 8.429699790469869e-08, "loss": 0.0, "num_input_tokens_seen": 222641920, "step": 103240 }, { "epoch": 18.947513305193613, "grad_norm": 0.0028499066829681396, "learning_rate": 8.415064162956787e-08, "loss": 0.0, "num_input_tokens_seen": 222653504, "step": 103245 }, { "epoch": 18.948430904753167, "grad_norm": 0.019797883927822113, "learning_rate": 8.40044114383537e-08, "loss": 0.0001, "num_input_tokens_seen": 222664832, "step": 103250 }, { "epoch": 18.949348504312717, "grad_norm": 0.001760038430802524, "learning_rate": 8.38583073348076e-08, "loss": 0.0, "num_input_tokens_seen": 222676320, "step": 103255 }, { "epoch": 18.95026610387227, "grad_norm": 0.0008417764911428094, "learning_rate": 8.371232932267603e-08, "loss": 0.0, "num_input_tokens_seen": 222688288, "step": 103260 }, { "epoch": 18.951183703431823, "grad_norm": 0.0038289236836135387, "learning_rate": 8.356647740570434e-08, "loss": 0.0, "num_input_tokens_seen": 222699488, "step": 103265 }, { "epoch": 18.952101302991373, "grad_norm": 0.28434085845947266, "learning_rate": 8.34207515876323e-08, "loss": 0.0001, "num_input_tokens_seen": 222710464, "step": 103270 }, { "epoch": 18.953018902550927, "grad_norm": 0.006748119834810495, "learning_rate": 8.327515187219859e-08, "loss": 0.0, "num_input_tokens_seen": 222721824, "step": 103275 }, { "epoch": 18.95393650211048, "grad_norm": 0.0007874315488152206, "learning_rate": 8.312967826313633e-08, "loss": 0.0, "num_input_tokens_seen": 222733088, "step": 103280 }, { "epoch": 18.95485410167003, "grad_norm": 0.0034830064978450537, "learning_rate": 8.298433076417755e-08, "loss": 0.0, "num_input_tokens_seen": 222744960, "step": 103285 }, { "epoch": 18.955771701229583, "grad_norm": 0.0031168977729976177, "learning_rate": 8.283910937904981e-08, "loss": 0.0, "num_input_tokens_seen": 222755744, "step": 103290 }, { "epoch": 18.956689300789137, "grad_norm": 0.0016042347997426987, "learning_rate": 8.269401411147848e-08, "loss": 0.0, "num_input_tokens_seen": 222768064, "step": 103295 }, { "epoch": 18.957606900348686, "grad_norm": 0.0009906644700095057, "learning_rate": 8.254904496518446e-08, "loss": 0.0, "num_input_tokens_seen": 222779328, "step": 103300 }, { "epoch": 18.95852449990824, "grad_norm": 0.012316780164837837, "learning_rate": 8.240420194388532e-08, "loss": 0.0008, "num_input_tokens_seen": 222789920, "step": 103305 }, { "epoch": 18.959442099467793, "grad_norm": 0.00256624654866755, "learning_rate": 8.225948505129755e-08, "loss": 0.0, "num_input_tokens_seen": 222800576, "step": 103310 }, { "epoch": 18.960359699027343, "grad_norm": 0.0022123248782008886, "learning_rate": 8.211489429113206e-08, "loss": 0.0, "num_input_tokens_seen": 222811808, "step": 103315 }, { "epoch": 18.961277298586896, "grad_norm": 0.013219616375863552, "learning_rate": 8.197042966709756e-08, "loss": 0.0, "num_input_tokens_seen": 222823680, "step": 103320 }, { "epoch": 18.96219489814645, "grad_norm": 0.0005521407001651824, "learning_rate": 8.182609118289886e-08, "loss": 0.0, "num_input_tokens_seen": 222834880, "step": 103325 }, { "epoch": 18.963112497706, "grad_norm": 0.005433215759694576, "learning_rate": 8.168187884223911e-08, "loss": 0.0002, "num_input_tokens_seen": 222846432, "step": 103330 }, { "epoch": 18.964030097265553, "grad_norm": 0.021787328645586967, "learning_rate": 8.15377926488159e-08, "loss": 0.0, "num_input_tokens_seen": 222858080, "step": 103335 }, { "epoch": 18.964947696825107, "grad_norm": 0.0005602732417173684, "learning_rate": 8.139383260632571e-08, "loss": 0.0, "num_input_tokens_seen": 222868672, "step": 103340 }, { "epoch": 18.965865296384656, "grad_norm": 0.00043638894567266107, "learning_rate": 8.124999871846062e-08, "loss": 0.0, "num_input_tokens_seen": 222880000, "step": 103345 }, { "epoch": 18.96678289594421, "grad_norm": 0.01223935466259718, "learning_rate": 8.110629098890932e-08, "loss": 0.0, "num_input_tokens_seen": 222890848, "step": 103350 }, { "epoch": 18.967700495503763, "grad_norm": 0.03514544665813446, "learning_rate": 8.096270942135776e-08, "loss": 0.0, "num_input_tokens_seen": 222901440, "step": 103355 }, { "epoch": 18.968618095063313, "grad_norm": 0.0007272672955878079, "learning_rate": 8.081925401948964e-08, "loss": 0.0, "num_input_tokens_seen": 222912576, "step": 103360 }, { "epoch": 18.969535694622866, "grad_norm": 0.029311493039131165, "learning_rate": 8.067592478698371e-08, "loss": 0.0001, "num_input_tokens_seen": 222923168, "step": 103365 }, { "epoch": 18.97045329418242, "grad_norm": 0.028545092791318893, "learning_rate": 8.053272172751591e-08, "loss": 0.0, "num_input_tokens_seen": 222933760, "step": 103370 }, { "epoch": 18.97137089374197, "grad_norm": 0.0009914078982546926, "learning_rate": 8.038964484475886e-08, "loss": 0.0097, "num_input_tokens_seen": 222943616, "step": 103375 }, { "epoch": 18.972288493301523, "grad_norm": 0.00244275969453156, "learning_rate": 8.024669414238295e-08, "loss": 0.0001, "num_input_tokens_seen": 222953472, "step": 103380 }, { "epoch": 18.973206092861076, "grad_norm": 0.000773998093791306, "learning_rate": 8.010386962405415e-08, "loss": 0.0, "num_input_tokens_seen": 222964512, "step": 103385 }, { "epoch": 18.974123692420626, "grad_norm": 0.0068494281731545925, "learning_rate": 7.996117129343616e-08, "loss": 0.0, "num_input_tokens_seen": 222974752, "step": 103390 }, { "epoch": 18.97504129198018, "grad_norm": 0.0005414281040430069, "learning_rate": 7.981859915418888e-08, "loss": 0.0, "num_input_tokens_seen": 222985504, "step": 103395 }, { "epoch": 18.975958891539733, "grad_norm": 0.001148388721048832, "learning_rate": 7.96761532099688e-08, "loss": 0.0002, "num_input_tokens_seen": 222995424, "step": 103400 }, { "epoch": 18.976876491099283, "grad_norm": 0.019390538334846497, "learning_rate": 7.953383346443022e-08, "loss": 0.0, "num_input_tokens_seen": 223007008, "step": 103405 }, { "epoch": 18.977794090658836, "grad_norm": 0.005043781362473965, "learning_rate": 7.939163992122189e-08, "loss": 0.0001, "num_input_tokens_seen": 223018592, "step": 103410 }, { "epoch": 18.97871169021839, "grad_norm": 0.0012058994034305215, "learning_rate": 7.924957258399202e-08, "loss": 0.0, "num_input_tokens_seen": 223028512, "step": 103415 }, { "epoch": 18.97962928977794, "grad_norm": 0.00915522687137127, "learning_rate": 7.910763145638434e-08, "loss": 0.0, "num_input_tokens_seen": 223039104, "step": 103420 }, { "epoch": 18.980546889337493, "grad_norm": 0.006950785405933857, "learning_rate": 7.896581654203872e-08, "loss": 0.0001, "num_input_tokens_seen": 223050112, "step": 103425 }, { "epoch": 18.981464488897046, "grad_norm": 0.00026852302835322917, "learning_rate": 7.882412784459336e-08, "loss": 0.0, "num_input_tokens_seen": 223060512, "step": 103430 }, { "epoch": 18.982382088456596, "grad_norm": 0.01974019780755043, "learning_rate": 7.868256536768203e-08, "loss": 0.0063, "num_input_tokens_seen": 223071840, "step": 103435 }, { "epoch": 18.98329968801615, "grad_norm": 0.001791885937564075, "learning_rate": 7.854112911493516e-08, "loss": 0.0, "num_input_tokens_seen": 223081952, "step": 103440 }, { "epoch": 18.984217287575703, "grad_norm": 0.013690562918782234, "learning_rate": 7.839981908998151e-08, "loss": 0.0, "num_input_tokens_seen": 223094624, "step": 103445 }, { "epoch": 18.985134887135253, "grad_norm": 0.004320212174206972, "learning_rate": 7.825863529644429e-08, "loss": 0.0001, "num_input_tokens_seen": 223103552, "step": 103450 }, { "epoch": 18.986052486694806, "grad_norm": 0.0040815724059939384, "learning_rate": 7.811757773794504e-08, "loss": 0.0, "num_input_tokens_seen": 223114912, "step": 103455 }, { "epoch": 18.98697008625436, "grad_norm": 0.0010410916293039918, "learning_rate": 7.797664641810143e-08, "loss": 0.0063, "num_input_tokens_seen": 223125376, "step": 103460 }, { "epoch": 18.98788768581391, "grad_norm": 0.0014934670180082321, "learning_rate": 7.783584134052891e-08, "loss": 0.0, "num_input_tokens_seen": 223136000, "step": 103465 }, { "epoch": 18.988805285373463, "grad_norm": 0.0015982058830559254, "learning_rate": 7.769516250883846e-08, "loss": 0.0, "num_input_tokens_seen": 223146272, "step": 103470 }, { "epoch": 18.989722884933016, "grad_norm": 0.002756887348368764, "learning_rate": 7.755460992663722e-08, "loss": 0.0002, "num_input_tokens_seen": 223156768, "step": 103475 }, { "epoch": 18.990640484492566, "grad_norm": 0.0005266012740321457, "learning_rate": 7.741418359753228e-08, "loss": 0.0, "num_input_tokens_seen": 223168096, "step": 103480 }, { "epoch": 18.99155808405212, "grad_norm": 0.0006823604926466942, "learning_rate": 7.727388352512355e-08, "loss": 0.0, "num_input_tokens_seen": 223178688, "step": 103485 }, { "epoch": 18.992475683611673, "grad_norm": 0.0013162510003894567, "learning_rate": 7.713370971301093e-08, "loss": 0.0, "num_input_tokens_seen": 223189312, "step": 103490 }, { "epoch": 18.993393283171223, "grad_norm": 2.8162739276885986, "learning_rate": 7.699366216478821e-08, "loss": 0.0003, "num_input_tokens_seen": 223200128, "step": 103495 }, { "epoch": 18.994310882730776, "grad_norm": 0.012268474325537682, "learning_rate": 7.685374088404807e-08, "loss": 0.0, "num_input_tokens_seen": 223209984, "step": 103500 }, { "epoch": 18.99522848229033, "grad_norm": 0.03536638990044594, "learning_rate": 7.671394587437931e-08, "loss": 0.0001, "num_input_tokens_seen": 223220288, "step": 103505 }, { "epoch": 18.99614608184988, "grad_norm": 0.00978761538863182, "learning_rate": 7.657427713936794e-08, "loss": 0.0, "num_input_tokens_seen": 223230496, "step": 103510 }, { "epoch": 18.997063681409433, "grad_norm": 0.0007899296469986439, "learning_rate": 7.643473468259554e-08, "loss": 0.0, "num_input_tokens_seen": 223240736, "step": 103515 }, { "epoch": 18.997981280968986, "grad_norm": 0.0013673697831109166, "learning_rate": 7.62953185076415e-08, "loss": 0.0, "num_input_tokens_seen": 223250208, "step": 103520 }, { "epoch": 18.998898880528536, "grad_norm": 0.26361414790153503, "learning_rate": 7.615602861808069e-08, "loss": 0.0001, "num_input_tokens_seen": 223262048, "step": 103525 }, { "epoch": 18.99981648008809, "grad_norm": 0.002637416124343872, "learning_rate": 7.601686501748695e-08, "loss": 0.0119, "num_input_tokens_seen": 223272960, "step": 103530 }, { "epoch": 19.000734079647643, "grad_norm": 0.023124534636735916, "learning_rate": 7.587782770942965e-08, "loss": 0.0, "num_input_tokens_seen": 223280992, "step": 103535 }, { "epoch": 19.001651679207193, "grad_norm": 0.010141655802726746, "learning_rate": 7.573891669747369e-08, "loss": 0.0, "num_input_tokens_seen": 223292256, "step": 103540 }, { "epoch": 19.002569278766746, "grad_norm": 0.0037017955910414457, "learning_rate": 7.56001319851829e-08, "loss": 0.0, "num_input_tokens_seen": 223303072, "step": 103545 }, { "epoch": 19.0034868783263, "grad_norm": 0.15660931169986725, "learning_rate": 7.546147357611666e-08, "loss": 0.0001, "num_input_tokens_seen": 223314400, "step": 103550 }, { "epoch": 19.00440447788585, "grad_norm": 0.0008127637556754053, "learning_rate": 7.532294147383101e-08, "loss": 0.0, "num_input_tokens_seen": 223325664, "step": 103555 }, { "epoch": 19.005322077445403, "grad_norm": 0.012145813554525375, "learning_rate": 7.518453568187922e-08, "loss": 0.0, "num_input_tokens_seen": 223335424, "step": 103560 }, { "epoch": 19.006239677004956, "grad_norm": 0.0033606919459998608, "learning_rate": 7.504625620381178e-08, "loss": 0.0, "num_input_tokens_seen": 223346208, "step": 103565 }, { "epoch": 19.007157276564506, "grad_norm": 0.002875271486118436, "learning_rate": 7.490810304317475e-08, "loss": 0.0, "num_input_tokens_seen": 223355712, "step": 103570 }, { "epoch": 19.00807487612406, "grad_norm": 0.018486103042960167, "learning_rate": 7.47700762035114e-08, "loss": 0.0, "num_input_tokens_seen": 223366624, "step": 103575 }, { "epoch": 19.008992475683613, "grad_norm": 0.0017355949385091662, "learning_rate": 7.463217568836222e-08, "loss": 0.0, "num_input_tokens_seen": 223377888, "step": 103580 }, { "epoch": 19.009910075243162, "grad_norm": 0.09278370440006256, "learning_rate": 7.449440150126441e-08, "loss": 0.0001, "num_input_tokens_seen": 223387712, "step": 103585 }, { "epoch": 19.010827674802716, "grad_norm": 0.019733155146241188, "learning_rate": 7.435675364575124e-08, "loss": 0.0, "num_input_tokens_seen": 223398272, "step": 103590 }, { "epoch": 19.01174527436227, "grad_norm": 0.0046897041611373425, "learning_rate": 7.42192321253532e-08, "loss": 0.0, "num_input_tokens_seen": 223409216, "step": 103595 }, { "epoch": 19.01266287392182, "grad_norm": 0.03642122074961662, "learning_rate": 7.40818369435975e-08, "loss": 0.0001, "num_input_tokens_seen": 223418688, "step": 103600 }, { "epoch": 19.013580473481372, "grad_norm": 0.02442491427063942, "learning_rate": 7.394456810400852e-08, "loss": 0.0, "num_input_tokens_seen": 223428736, "step": 103605 }, { "epoch": 19.014498073040926, "grad_norm": 0.022423988208174706, "learning_rate": 7.380742561010623e-08, "loss": 0.0, "num_input_tokens_seen": 223440544, "step": 103610 }, { "epoch": 19.015415672600476, "grad_norm": 0.008258714340627193, "learning_rate": 7.367040946540894e-08, "loss": 0.0, "num_input_tokens_seen": 223450816, "step": 103615 }, { "epoch": 19.01633327216003, "grad_norm": 0.0004169843450654298, "learning_rate": 7.353351967343048e-08, "loss": 0.0, "num_input_tokens_seen": 223460288, "step": 103620 }, { "epoch": 19.017250871719583, "grad_norm": 0.0026615115348249674, "learning_rate": 7.339675623768194e-08, "loss": 0.0, "num_input_tokens_seen": 223470848, "step": 103625 }, { "epoch": 19.018168471279132, "grad_norm": 0.0032291917596012354, "learning_rate": 7.326011916167108e-08, "loss": 0.0, "num_input_tokens_seen": 223481664, "step": 103630 }, { "epoch": 19.019086070838686, "grad_norm": 0.006177523639053106, "learning_rate": 7.312360844890232e-08, "loss": 0.0, "num_input_tokens_seen": 223492352, "step": 103635 }, { "epoch": 19.02000367039824, "grad_norm": 0.0007196684600785375, "learning_rate": 7.298722410287728e-08, "loss": 0.0, "num_input_tokens_seen": 223503328, "step": 103640 }, { "epoch": 19.02092126995779, "grad_norm": 0.005438887048512697, "learning_rate": 7.28509661270932e-08, "loss": 0.0, "num_input_tokens_seen": 223513920, "step": 103645 }, { "epoch": 19.021838869517342, "grad_norm": 0.00035487418062984943, "learning_rate": 7.271483452504557e-08, "loss": 0.0001, "num_input_tokens_seen": 223525248, "step": 103650 }, { "epoch": 19.022756469076896, "grad_norm": 0.0007911563152447343, "learning_rate": 7.257882930022608e-08, "loss": 0.0, "num_input_tokens_seen": 223535200, "step": 103655 }, { "epoch": 19.023674068636446, "grad_norm": 0.004527267534285784, "learning_rate": 7.244295045612249e-08, "loss": 0.0001, "num_input_tokens_seen": 223545984, "step": 103660 }, { "epoch": 19.024591668196, "grad_norm": 0.014317045919597149, "learning_rate": 7.230719799622087e-08, "loss": 0.0, "num_input_tokens_seen": 223556160, "step": 103665 }, { "epoch": 19.025509267755552, "grad_norm": 0.0012573496205732226, "learning_rate": 7.217157192400181e-08, "loss": 0.0, "num_input_tokens_seen": 223567296, "step": 103670 }, { "epoch": 19.026426867315102, "grad_norm": 0.05370374023914337, "learning_rate": 7.203607224294473e-08, "loss": 0.0, "num_input_tokens_seen": 223578208, "step": 103675 }, { "epoch": 19.027344466874656, "grad_norm": 0.0025169954169541597, "learning_rate": 7.190069895652463e-08, "loss": 0.0, "num_input_tokens_seen": 223588800, "step": 103680 }, { "epoch": 19.02826206643421, "grad_norm": 0.004581413697451353, "learning_rate": 7.176545206821373e-08, "loss": 0.0, "num_input_tokens_seen": 223600320, "step": 103685 }, { "epoch": 19.02917966599376, "grad_norm": 0.36833465099334717, "learning_rate": 7.163033158148147e-08, "loss": 0.0001, "num_input_tokens_seen": 223610912, "step": 103690 }, { "epoch": 19.030097265553312, "grad_norm": 0.00886133685708046, "learning_rate": 7.149533749979176e-08, "loss": 0.0, "num_input_tokens_seen": 223622080, "step": 103695 }, { "epoch": 19.031014865112866, "grad_norm": 0.002057802863419056, "learning_rate": 7.136046982660904e-08, "loss": 0.0, "num_input_tokens_seen": 223632352, "step": 103700 }, { "epoch": 19.031932464672416, "grad_norm": 0.009216728620231152, "learning_rate": 7.122572856539167e-08, "loss": 0.0, "num_input_tokens_seen": 223643424, "step": 103705 }, { "epoch": 19.03285006423197, "grad_norm": 0.0022450550459325314, "learning_rate": 7.109111371959521e-08, "loss": 0.0, "num_input_tokens_seen": 223653824, "step": 103710 }, { "epoch": 19.033767663791522, "grad_norm": 0.008423283696174622, "learning_rate": 7.095662529267244e-08, "loss": 0.0, "num_input_tokens_seen": 223663520, "step": 103715 }, { "epoch": 19.034685263351072, "grad_norm": 0.01730257272720337, "learning_rate": 7.082226328807285e-08, "loss": 0.0, "num_input_tokens_seen": 223674464, "step": 103720 }, { "epoch": 19.035602862910626, "grad_norm": 1.233099102973938, "learning_rate": 7.068802770924255e-08, "loss": 0.0005, "num_input_tokens_seen": 223685344, "step": 103725 }, { "epoch": 19.03652046247018, "grad_norm": 0.0049405754543840885, "learning_rate": 7.05539185596249e-08, "loss": 0.0, "num_input_tokens_seen": 223696384, "step": 103730 }, { "epoch": 19.03743806202973, "grad_norm": 0.02195146307349205, "learning_rate": 7.041993584265938e-08, "loss": 0.0, "num_input_tokens_seen": 223707040, "step": 103735 }, { "epoch": 19.038355661589282, "grad_norm": 0.011868074536323547, "learning_rate": 7.028607956178268e-08, "loss": 0.0, "num_input_tokens_seen": 223717600, "step": 103740 }, { "epoch": 19.039273261148836, "grad_norm": 0.0008922578999772668, "learning_rate": 7.015234972042651e-08, "loss": 0.0, "num_input_tokens_seen": 223728320, "step": 103745 }, { "epoch": 19.040190860708385, "grad_norm": 0.006153926718980074, "learning_rate": 7.00187463220231e-08, "loss": 0.0, "num_input_tokens_seen": 223738816, "step": 103750 }, { "epoch": 19.04110846026794, "grad_norm": 0.0009586084051989019, "learning_rate": 6.988526936999751e-08, "loss": 0.0, "num_input_tokens_seen": 223750528, "step": 103755 }, { "epoch": 19.042026059827492, "grad_norm": 0.0008559388224966824, "learning_rate": 6.975191886777366e-08, "loss": 0.0, "num_input_tokens_seen": 223760160, "step": 103760 }, { "epoch": 19.042943659387042, "grad_norm": 0.0006756525253877044, "learning_rate": 6.961869481877215e-08, "loss": 0.0, "num_input_tokens_seen": 223770720, "step": 103765 }, { "epoch": 19.043861258946595, "grad_norm": 0.003995849750936031, "learning_rate": 6.948559722641024e-08, "loss": 0.0, "num_input_tokens_seen": 223781120, "step": 103770 }, { "epoch": 19.04477885850615, "grad_norm": 0.0009544107015244663, "learning_rate": 6.935262609410076e-08, "loss": 0.0, "num_input_tokens_seen": 223790464, "step": 103775 }, { "epoch": 19.0456964580657, "grad_norm": 78.0702133178711, "learning_rate": 6.921978142525376e-08, "loss": 0.056, "num_input_tokens_seen": 223800608, "step": 103780 }, { "epoch": 19.046614057625252, "grad_norm": 0.0021022886503487825, "learning_rate": 6.908706322327818e-08, "loss": 0.0, "num_input_tokens_seen": 223811136, "step": 103785 }, { "epoch": 19.047531657184805, "grad_norm": 0.001687444862909615, "learning_rate": 6.895447149157741e-08, "loss": 0.0, "num_input_tokens_seen": 223821440, "step": 103790 }, { "epoch": 19.048449256744355, "grad_norm": 0.001130943768657744, "learning_rate": 6.882200623355151e-08, "loss": 0.0, "num_input_tokens_seen": 223832544, "step": 103795 }, { "epoch": 19.04936685630391, "grad_norm": 0.0038336182478815317, "learning_rate": 6.868966745259886e-08, "loss": 0.0, "num_input_tokens_seen": 223843104, "step": 103800 }, { "epoch": 19.050284455863462, "grad_norm": 0.0014150749193504453, "learning_rate": 6.855745515211343e-08, "loss": 0.0, "num_input_tokens_seen": 223854816, "step": 103805 }, { "epoch": 19.051202055423012, "grad_norm": 0.0009831138886511326, "learning_rate": 6.842536933548583e-08, "loss": 0.0, "num_input_tokens_seen": 223866464, "step": 103810 }, { "epoch": 19.052119654982565, "grad_norm": 0.003553619608283043, "learning_rate": 6.829341000610445e-08, "loss": 0.0001, "num_input_tokens_seen": 223877152, "step": 103815 }, { "epoch": 19.05303725454212, "grad_norm": 0.0011280018370598555, "learning_rate": 6.816157716735383e-08, "loss": 0.0478, "num_input_tokens_seen": 223888448, "step": 103820 }, { "epoch": 19.05395485410167, "grad_norm": 0.003550333669409156, "learning_rate": 6.802987082261514e-08, "loss": 0.0, "num_input_tokens_seen": 223898336, "step": 103825 }, { "epoch": 19.054872453661222, "grad_norm": 0.00225172215141356, "learning_rate": 6.789829097526569e-08, "loss": 0.0001, "num_input_tokens_seen": 223908000, "step": 103830 }, { "epoch": 19.055790053220775, "grad_norm": 0.0016955607570707798, "learning_rate": 6.776683762868164e-08, "loss": 0.0001, "num_input_tokens_seen": 223918528, "step": 103835 }, { "epoch": 19.056707652780325, "grad_norm": 0.01803571917116642, "learning_rate": 6.76355107862342e-08, "loss": 0.0, "num_input_tokens_seen": 223928352, "step": 103840 }, { "epoch": 19.05762525233988, "grad_norm": 0.0016410499811172485, "learning_rate": 6.750431045129069e-08, "loss": 0.0, "num_input_tokens_seen": 223939232, "step": 103845 }, { "epoch": 19.058542851899432, "grad_norm": 0.002610742812976241, "learning_rate": 6.737323662721728e-08, "loss": 0.0, "num_input_tokens_seen": 223950496, "step": 103850 }, { "epoch": 19.059460451458982, "grad_norm": 0.07375600934028625, "learning_rate": 6.724228931737576e-08, "loss": 0.0, "num_input_tokens_seen": 223961504, "step": 103855 }, { "epoch": 19.060378051018535, "grad_norm": 0.0006343521527014673, "learning_rate": 6.711146852512395e-08, "loss": 0.0, "num_input_tokens_seen": 223972192, "step": 103860 }, { "epoch": 19.06129565057809, "grad_norm": 0.0023631169460713863, "learning_rate": 6.69807742538181e-08, "loss": 0.0, "num_input_tokens_seen": 223983520, "step": 103865 }, { "epoch": 19.06221325013764, "grad_norm": 0.0032103697303682566, "learning_rate": 6.68502065068094e-08, "loss": 0.0, "num_input_tokens_seen": 223993184, "step": 103870 }, { "epoch": 19.063130849697192, "grad_norm": 0.005609691608697176, "learning_rate": 6.671976528744795e-08, "loss": 0.0, "num_input_tokens_seen": 224002880, "step": 103875 }, { "epoch": 19.064048449256745, "grad_norm": 0.0006028853240422904, "learning_rate": 6.658945059907773e-08, "loss": 0.0001, "num_input_tokens_seen": 224013664, "step": 103880 }, { "epoch": 19.064966048816295, "grad_norm": 0.003537253476679325, "learning_rate": 6.645926244504275e-08, "loss": 0.0001, "num_input_tokens_seen": 224023840, "step": 103885 }, { "epoch": 19.06588364837585, "grad_norm": 0.008937348611652851, "learning_rate": 6.632920082868144e-08, "loss": 0.0, "num_input_tokens_seen": 224034528, "step": 103890 }, { "epoch": 19.066801247935402, "grad_norm": 0.0041695344261825085, "learning_rate": 6.619926575332891e-08, "loss": 0.0, "num_input_tokens_seen": 224044608, "step": 103895 }, { "epoch": 19.06771884749495, "grad_norm": 0.016433250159025192, "learning_rate": 6.606945722231916e-08, "loss": 0.0, "num_input_tokens_seen": 224055904, "step": 103900 }, { "epoch": 19.068636447054505, "grad_norm": 0.0005090598133392632, "learning_rate": 6.593977523898066e-08, "loss": 0.0207, "num_input_tokens_seen": 224066624, "step": 103905 }, { "epoch": 19.06955404661406, "grad_norm": 0.00220467709004879, "learning_rate": 6.581021980664015e-08, "loss": 0.0, "num_input_tokens_seen": 224076672, "step": 103910 }, { "epoch": 19.07047164617361, "grad_norm": 0.0018008005572482944, "learning_rate": 6.568079092862e-08, "loss": 0.0, "num_input_tokens_seen": 224087392, "step": 103915 }, { "epoch": 19.07138924573316, "grad_norm": 0.002566060284152627, "learning_rate": 6.555148860823979e-08, "loss": 0.0, "num_input_tokens_seen": 224098432, "step": 103920 }, { "epoch": 19.072306845292715, "grad_norm": 0.0005001644021831453, "learning_rate": 6.542231284881628e-08, "loss": 0.0, "num_input_tokens_seen": 224108672, "step": 103925 }, { "epoch": 19.073224444852265, "grad_norm": 0.0010109212016686797, "learning_rate": 6.529326365366295e-08, "loss": 0.0, "num_input_tokens_seen": 224119168, "step": 103930 }, { "epoch": 19.07414204441182, "grad_norm": 0.0009889048524200916, "learning_rate": 6.516434102608882e-08, "loss": 0.0, "num_input_tokens_seen": 224130400, "step": 103935 }, { "epoch": 19.07505964397137, "grad_norm": 0.002976455492898822, "learning_rate": 6.503554496940123e-08, "loss": 0.0, "num_input_tokens_seen": 224142336, "step": 103940 }, { "epoch": 19.07597724353092, "grad_norm": 0.0006558040040545166, "learning_rate": 6.490687548690366e-08, "loss": 0.0, "num_input_tokens_seen": 224151808, "step": 103945 }, { "epoch": 19.076894843090475, "grad_norm": 0.0003635083558037877, "learning_rate": 6.47783325818957e-08, "loss": 0.0, "num_input_tokens_seen": 224162432, "step": 103950 }, { "epoch": 19.07781244265003, "grad_norm": 0.007069480139762163, "learning_rate": 6.464991625767469e-08, "loss": 0.0, "num_input_tokens_seen": 224173888, "step": 103955 }, { "epoch": 19.07873004220958, "grad_norm": 0.0018990833777934313, "learning_rate": 6.452162651753413e-08, "loss": 0.0, "num_input_tokens_seen": 224185184, "step": 103960 }, { "epoch": 19.07964764176913, "grad_norm": 0.017346709966659546, "learning_rate": 6.43934633647647e-08, "loss": 0.0, "num_input_tokens_seen": 224196352, "step": 103965 }, { "epoch": 19.080565241328685, "grad_norm": 0.004796234425157309, "learning_rate": 6.426542680265324e-08, "loss": 0.0, "num_input_tokens_seen": 224207200, "step": 103970 }, { "epoch": 19.081482840888235, "grad_norm": 0.001482302788645029, "learning_rate": 6.413751683448432e-08, "loss": 0.0, "num_input_tokens_seen": 224218592, "step": 103975 }, { "epoch": 19.08240044044779, "grad_norm": 0.0015272502787411213, "learning_rate": 6.400973346353756e-08, "loss": 0.004, "num_input_tokens_seen": 224226976, "step": 103980 }, { "epoch": 19.08331804000734, "grad_norm": 0.011195083148777485, "learning_rate": 6.388207669309143e-08, "loss": 0.0, "num_input_tokens_seen": 224238656, "step": 103985 }, { "epoch": 19.08423563956689, "grad_norm": 0.0016317953122779727, "learning_rate": 6.375454652641999e-08, "loss": 0.0, "num_input_tokens_seen": 224249184, "step": 103990 }, { "epoch": 19.085153239126445, "grad_norm": 0.003252765629440546, "learning_rate": 6.362714296679396e-08, "loss": 0.0, "num_input_tokens_seen": 224260352, "step": 103995 }, { "epoch": 19.086070838686, "grad_norm": 0.000602668384090066, "learning_rate": 6.349986601748015e-08, "loss": 0.0, "num_input_tokens_seen": 224270656, "step": 104000 }, { "epoch": 19.086988438245548, "grad_norm": 0.00528363324701786, "learning_rate": 6.337271568174485e-08, "loss": 0.0, "num_input_tokens_seen": 224281408, "step": 104005 }, { "epoch": 19.0879060378051, "grad_norm": 0.911925733089447, "learning_rate": 6.324569196284768e-08, "loss": 0.0002, "num_input_tokens_seen": 224292544, "step": 104010 }, { "epoch": 19.088823637364655, "grad_norm": 0.02168438956141472, "learning_rate": 6.31187948640477e-08, "loss": 0.0001, "num_input_tokens_seen": 224303488, "step": 104015 }, { "epoch": 19.089741236924205, "grad_norm": 0.022673698142170906, "learning_rate": 6.299202438859898e-08, "loss": 0.0, "num_input_tokens_seen": 224313568, "step": 104020 }, { "epoch": 19.090658836483758, "grad_norm": 0.001293674809858203, "learning_rate": 6.286538053975333e-08, "loss": 0.0, "num_input_tokens_seen": 224322816, "step": 104025 }, { "epoch": 19.09157643604331, "grad_norm": 0.0011265153298154473, "learning_rate": 6.273886332075818e-08, "loss": 0.0, "num_input_tokens_seen": 224333728, "step": 104030 }, { "epoch": 19.09249403560286, "grad_norm": 0.004649520386010408, "learning_rate": 6.261247273485981e-08, "loss": 0.0, "num_input_tokens_seen": 224344128, "step": 104035 }, { "epoch": 19.093411635162415, "grad_norm": 0.0008343522786162794, "learning_rate": 6.248620878529898e-08, "loss": 0.0, "num_input_tokens_seen": 224354272, "step": 104040 }, { "epoch": 19.094329234721968, "grad_norm": 0.009303326718509197, "learning_rate": 6.236007147531475e-08, "loss": 0.0, "num_input_tokens_seen": 224365600, "step": 104045 }, { "epoch": 19.095246834281518, "grad_norm": 0.002877464983612299, "learning_rate": 6.223406080814121e-08, "loss": 0.0, "num_input_tokens_seen": 224375648, "step": 104050 }, { "epoch": 19.09616443384107, "grad_norm": 0.00041068001883104444, "learning_rate": 6.210817678701187e-08, "loss": 0.0, "num_input_tokens_seen": 224386560, "step": 104055 }, { "epoch": 19.097082033400625, "grad_norm": 0.0016800464363768697, "learning_rate": 6.19824194151547e-08, "loss": 0.0, "num_input_tokens_seen": 224397312, "step": 104060 }, { "epoch": 19.097999632960175, "grad_norm": 0.015197518281638622, "learning_rate": 6.185678869579492e-08, "loss": 0.0, "num_input_tokens_seen": 224408992, "step": 104065 }, { "epoch": 19.098917232519728, "grad_norm": 0.001118175103329122, "learning_rate": 6.17312846321555e-08, "loss": 0.0, "num_input_tokens_seen": 224420128, "step": 104070 }, { "epoch": 19.09983483207928, "grad_norm": 0.019088687375187874, "learning_rate": 6.160590722745496e-08, "loss": 0.0, "num_input_tokens_seen": 224429376, "step": 104075 }, { "epoch": 19.10075243163883, "grad_norm": 0.0020118486136198044, "learning_rate": 6.148065648490852e-08, "loss": 0.0, "num_input_tokens_seen": 224441312, "step": 104080 }, { "epoch": 19.101670031198385, "grad_norm": 0.002250106306746602, "learning_rate": 6.135553240772973e-08, "loss": 0.0, "num_input_tokens_seen": 224450976, "step": 104085 }, { "epoch": 19.102587630757938, "grad_norm": 0.0007342486060224473, "learning_rate": 6.123053499912768e-08, "loss": 0.0, "num_input_tokens_seen": 224462208, "step": 104090 }, { "epoch": 19.103505230317488, "grad_norm": 0.004633067641407251, "learning_rate": 6.110566426230758e-08, "loss": 0.0, "num_input_tokens_seen": 224473760, "step": 104095 }, { "epoch": 19.10442282987704, "grad_norm": 0.008906546980142593, "learning_rate": 6.098092020047242e-08, "loss": 0.0, "num_input_tokens_seen": 224484928, "step": 104100 }, { "epoch": 19.105340429436595, "grad_norm": 0.0032062409445643425, "learning_rate": 6.085630281682187e-08, "loss": 0.0, "num_input_tokens_seen": 224497120, "step": 104105 }, { "epoch": 19.106258028996145, "grad_norm": 0.0003859669086523354, "learning_rate": 6.073181211455281e-08, "loss": 0.0, "num_input_tokens_seen": 224508000, "step": 104110 }, { "epoch": 19.107175628555698, "grad_norm": 0.0015190254198387265, "learning_rate": 6.06074480968566e-08, "loss": 0.0001, "num_input_tokens_seen": 224519744, "step": 104115 }, { "epoch": 19.10809322811525, "grad_norm": 0.05438538268208504, "learning_rate": 6.048321076692454e-08, "loss": 0.0001, "num_input_tokens_seen": 224530464, "step": 104120 }, { "epoch": 19.1090108276748, "grad_norm": 0.001583562232553959, "learning_rate": 6.035910012794299e-08, "loss": 0.0, "num_input_tokens_seen": 224541888, "step": 104125 }, { "epoch": 19.109928427234355, "grad_norm": 0.01517628412693739, "learning_rate": 6.023511618309441e-08, "loss": 0.0, "num_input_tokens_seen": 224551584, "step": 104130 }, { "epoch": 19.110846026793908, "grad_norm": 0.0010145135456696153, "learning_rate": 6.011125893555902e-08, "loss": 0.0, "num_input_tokens_seen": 224561696, "step": 104135 }, { "epoch": 19.111763626353458, "grad_norm": 0.014743208885192871, "learning_rate": 5.998752838851374e-08, "loss": 0.0, "num_input_tokens_seen": 224573312, "step": 104140 }, { "epoch": 19.11268122591301, "grad_norm": 0.0003174244484398514, "learning_rate": 5.986392454513213e-08, "loss": 0.0, "num_input_tokens_seen": 224584224, "step": 104145 }, { "epoch": 19.113598825472565, "grad_norm": 0.0008339930209331214, "learning_rate": 5.974044740858386e-08, "loss": 0.0, "num_input_tokens_seen": 224594432, "step": 104150 }, { "epoch": 19.114516425032114, "grad_norm": 0.008660328574478626, "learning_rate": 5.961709698203699e-08, "loss": 0.0005, "num_input_tokens_seen": 224604640, "step": 104155 }, { "epoch": 19.115434024591668, "grad_norm": 0.0021983489859849215, "learning_rate": 5.9493873268654524e-08, "loss": 0.0, "num_input_tokens_seen": 224615712, "step": 104160 }, { "epoch": 19.11635162415122, "grad_norm": 0.016912031918764114, "learning_rate": 5.937077627159726e-08, "loss": 0.0, "num_input_tokens_seen": 224626144, "step": 104165 }, { "epoch": 19.11726922371077, "grad_norm": 0.0027581346221268177, "learning_rate": 5.924780599402213e-08, "loss": 0.0, "num_input_tokens_seen": 224637600, "step": 104170 }, { "epoch": 19.118186823270324, "grad_norm": 0.0017589947674423456, "learning_rate": 5.9124962439083274e-08, "loss": 0.0, "num_input_tokens_seen": 224648288, "step": 104175 }, { "epoch": 19.119104422829878, "grad_norm": 0.001269046450033784, "learning_rate": 5.900224560993151e-08, "loss": 0.0, "num_input_tokens_seen": 224657568, "step": 104180 }, { "epoch": 19.120022022389428, "grad_norm": 0.0004111227754037827, "learning_rate": 5.8879655509714306e-08, "loss": 0.0, "num_input_tokens_seen": 224669280, "step": 104185 }, { "epoch": 19.12093962194898, "grad_norm": 0.0003666244156192988, "learning_rate": 5.875719214157582e-08, "loss": 0.0, "num_input_tokens_seen": 224679488, "step": 104190 }, { "epoch": 19.121857221508535, "grad_norm": 0.0013233176432549953, "learning_rate": 5.863485550865744e-08, "loss": 0.0, "num_input_tokens_seen": 224689248, "step": 104195 }, { "epoch": 19.122774821068084, "grad_norm": 0.000534911872819066, "learning_rate": 5.8512645614096086e-08, "loss": 0.0, "num_input_tokens_seen": 224701216, "step": 104200 }, { "epoch": 19.123692420627638, "grad_norm": 0.0025162396486848593, "learning_rate": 5.839056246102703e-08, "loss": 0.0, "num_input_tokens_seen": 224712640, "step": 104205 }, { "epoch": 19.12461002018719, "grad_norm": 0.010057331994175911, "learning_rate": 5.826860605258111e-08, "loss": 0.0, "num_input_tokens_seen": 224722816, "step": 104210 }, { "epoch": 19.12552761974674, "grad_norm": 0.0005394123145379126, "learning_rate": 5.814677639188637e-08, "loss": 0.0, "num_input_tokens_seen": 224733504, "step": 104215 }, { "epoch": 19.126445219306294, "grad_norm": 0.005467541050165892, "learning_rate": 5.8025073482068095e-08, "loss": 0.0, "num_input_tokens_seen": 224743648, "step": 104220 }, { "epoch": 19.127362818865848, "grad_norm": 0.010681642219424248, "learning_rate": 5.7903497326247116e-08, "loss": 0.0, "num_input_tokens_seen": 224755840, "step": 104225 }, { "epoch": 19.128280418425398, "grad_norm": 0.050518184900283813, "learning_rate": 5.778204792754205e-08, "loss": 0.0, "num_input_tokens_seen": 224767264, "step": 104230 }, { "epoch": 19.12919801798495, "grad_norm": 0.00522980373352766, "learning_rate": 5.766072528906708e-08, "loss": 0.0, "num_input_tokens_seen": 224777568, "step": 104235 }, { "epoch": 19.130115617544504, "grad_norm": 0.002196610439568758, "learning_rate": 5.753952941393526e-08, "loss": 0.0, "num_input_tokens_seen": 224787744, "step": 104240 }, { "epoch": 19.131033217104054, "grad_norm": 0.000497321889270097, "learning_rate": 5.741846030525411e-08, "loss": 0.0, "num_input_tokens_seen": 224799328, "step": 104245 }, { "epoch": 19.131950816663608, "grad_norm": 0.002268071984872222, "learning_rate": 5.7297517966128926e-08, "loss": 0.0, "num_input_tokens_seen": 224809856, "step": 104250 }, { "epoch": 19.13286841622316, "grad_norm": 0.0007935480098240077, "learning_rate": 5.717670239966222e-08, "loss": 0.0, "num_input_tokens_seen": 224820352, "step": 104255 }, { "epoch": 19.13378601578271, "grad_norm": 0.17206570506095886, "learning_rate": 5.705601360895263e-08, "loss": 0.0, "num_input_tokens_seen": 224832256, "step": 104260 }, { "epoch": 19.134703615342264, "grad_norm": 0.021786566823720932, "learning_rate": 5.693545159709491e-08, "loss": 0.0, "num_input_tokens_seen": 224842944, "step": 104265 }, { "epoch": 19.135621214901818, "grad_norm": 0.000662068254314363, "learning_rate": 5.6815016367181564e-08, "loss": 0.0, "num_input_tokens_seen": 224853280, "step": 104270 }, { "epoch": 19.136538814461368, "grad_norm": 0.0009348727180622518, "learning_rate": 5.669470792230236e-08, "loss": 0.0, "num_input_tokens_seen": 224863936, "step": 104275 }, { "epoch": 19.13745641402092, "grad_norm": 0.0007822876796126366, "learning_rate": 5.657452626554261e-08, "loss": 0.0, "num_input_tokens_seen": 224874336, "step": 104280 }, { "epoch": 19.138374013580474, "grad_norm": 0.00045111135113984346, "learning_rate": 5.6454471399984275e-08, "loss": 0.0, "num_input_tokens_seen": 224885376, "step": 104285 }, { "epoch": 19.139291613140024, "grad_norm": 0.002323399530723691, "learning_rate": 5.6334543328707134e-08, "loss": 0.0, "num_input_tokens_seen": 224896832, "step": 104290 }, { "epoch": 19.140209212699578, "grad_norm": 0.017601804807782173, "learning_rate": 5.621474205478705e-08, "loss": 0.0, "num_input_tokens_seen": 224907520, "step": 104295 }, { "epoch": 19.14112681225913, "grad_norm": 0.08362056314945221, "learning_rate": 5.609506758129601e-08, "loss": 0.0, "num_input_tokens_seen": 224918496, "step": 104300 }, { "epoch": 19.14204441181868, "grad_norm": 0.005263731349259615, "learning_rate": 5.59755199113049e-08, "loss": 0.0, "num_input_tokens_seen": 224929408, "step": 104305 }, { "epoch": 19.142962011378234, "grad_norm": 0.004244076553732157, "learning_rate": 5.585609904787903e-08, "loss": 0.0, "num_input_tokens_seen": 224939744, "step": 104310 }, { "epoch": 19.143879610937788, "grad_norm": 0.0015490050427615643, "learning_rate": 5.5736804994081515e-08, "loss": 0.0, "num_input_tokens_seen": 224950848, "step": 104315 }, { "epoch": 19.144797210497337, "grad_norm": 0.0007860395708121359, "learning_rate": 5.5617637752971575e-08, "loss": 0.0, "num_input_tokens_seen": 224961216, "step": 104320 }, { "epoch": 19.14571481005689, "grad_norm": 0.03735581040382385, "learning_rate": 5.549859732760676e-08, "loss": 0.0, "num_input_tokens_seen": 224972224, "step": 104325 }, { "epoch": 19.146632409616444, "grad_norm": 0.001681049820035696, "learning_rate": 5.537968372103908e-08, "loss": 0.0, "num_input_tokens_seen": 224982528, "step": 104330 }, { "epoch": 19.147550009175994, "grad_norm": 0.004497524816542864, "learning_rate": 5.526089693631942e-08, "loss": 0.0, "num_input_tokens_seen": 224992864, "step": 104335 }, { "epoch": 19.148467608735547, "grad_norm": 0.004260275512933731, "learning_rate": 5.514223697649368e-08, "loss": 0.1193, "num_input_tokens_seen": 225003264, "step": 104340 }, { "epoch": 19.1493852082951, "grad_norm": 0.0011739857727661729, "learning_rate": 5.502370384460609e-08, "loss": 0.0, "num_input_tokens_seen": 225014368, "step": 104345 }, { "epoch": 19.15030280785465, "grad_norm": 0.0035917768254876137, "learning_rate": 5.4905297543696446e-08, "loss": 0.0, "num_input_tokens_seen": 225024768, "step": 104350 }, { "epoch": 19.151220407414204, "grad_norm": 0.002216789871454239, "learning_rate": 5.478701807680176e-08, "loss": 0.0, "num_input_tokens_seen": 225035232, "step": 104355 }, { "epoch": 19.152138006973757, "grad_norm": 0.011383534409105778, "learning_rate": 5.466886544695571e-08, "loss": 0.0, "num_input_tokens_seen": 225045984, "step": 104360 }, { "epoch": 19.153055606533307, "grad_norm": 0.0055292025208473206, "learning_rate": 5.455083965718866e-08, "loss": 0.0, "num_input_tokens_seen": 225057088, "step": 104365 }, { "epoch": 19.15397320609286, "grad_norm": 0.03380262479186058, "learning_rate": 5.443294071052763e-08, "loss": 0.0, "num_input_tokens_seen": 225066240, "step": 104370 }, { "epoch": 19.154890805652414, "grad_norm": 0.0013676034286618233, "learning_rate": 5.431516860999686e-08, "loss": 0.0, "num_input_tokens_seen": 225076768, "step": 104375 }, { "epoch": 19.155808405211964, "grad_norm": 0.001004245481453836, "learning_rate": 5.4197523358617276e-08, "loss": 0.0, "num_input_tokens_seen": 225087424, "step": 104380 }, { "epoch": 19.156726004771517, "grad_norm": 0.0009036062983796, "learning_rate": 5.40800049594048e-08, "loss": 0.0, "num_input_tokens_seen": 225099456, "step": 104385 }, { "epoch": 19.15764360433107, "grad_norm": 0.005989587865769863, "learning_rate": 5.3962613415375895e-08, "loss": 0.0, "num_input_tokens_seen": 225110816, "step": 104390 }, { "epoch": 19.15856120389062, "grad_norm": 0.0003850277862511575, "learning_rate": 5.384534872953984e-08, "loss": 0.0158, "num_input_tokens_seen": 225121248, "step": 104395 }, { "epoch": 19.159478803450174, "grad_norm": 0.00494129303842783, "learning_rate": 5.372821090490421e-08, "loss": 0.0, "num_input_tokens_seen": 225132064, "step": 104400 }, { "epoch": 19.160396403009727, "grad_norm": 0.03887827321887016, "learning_rate": 5.3611199944474946e-08, "loss": 0.0, "num_input_tokens_seen": 225142944, "step": 104405 }, { "epoch": 19.161314002569277, "grad_norm": 0.006805164739489555, "learning_rate": 5.3494315851251866e-08, "loss": 0.0, "num_input_tokens_seen": 225153440, "step": 104410 }, { "epoch": 19.16223160212883, "grad_norm": 0.003070672508329153, "learning_rate": 5.337755862823313e-08, "loss": 0.0, "num_input_tokens_seen": 225164096, "step": 104415 }, { "epoch": 19.163149201688384, "grad_norm": 0.0011485940776765347, "learning_rate": 5.3260928278413006e-08, "loss": 0.0, "num_input_tokens_seen": 225174912, "step": 104420 }, { "epoch": 19.164066801247934, "grad_norm": 0.004252172540873289, "learning_rate": 5.3144424804783545e-08, "loss": 0.0, "num_input_tokens_seen": 225184928, "step": 104425 }, { "epoch": 19.164984400807487, "grad_norm": 0.013276269659399986, "learning_rate": 5.302804821033292e-08, "loss": 0.0, "num_input_tokens_seen": 225194752, "step": 104430 }, { "epoch": 19.16590200036704, "grad_norm": 0.011434049345552921, "learning_rate": 5.29117984980454e-08, "loss": 0.0, "num_input_tokens_seen": 225205056, "step": 104435 }, { "epoch": 19.16681959992659, "grad_norm": 0.0023094008211046457, "learning_rate": 5.2795675670903044e-08, "loss": 0.0, "num_input_tokens_seen": 225215136, "step": 104440 }, { "epoch": 19.167737199486144, "grad_norm": 0.0006141717894934118, "learning_rate": 5.2679679731884595e-08, "loss": 0.0002, "num_input_tokens_seen": 225225088, "step": 104445 }, { "epoch": 19.168654799045697, "grad_norm": 142.69833374023438, "learning_rate": 5.256381068396432e-08, "loss": 0.0041, "num_input_tokens_seen": 225236352, "step": 104450 }, { "epoch": 19.169572398605247, "grad_norm": 0.0015497553395107388, "learning_rate": 5.24480685301143e-08, "loss": 0.0, "num_input_tokens_seen": 225248576, "step": 104455 }, { "epoch": 19.1704899981648, "grad_norm": 0.14689406752586365, "learning_rate": 5.2332453273303827e-08, "loss": 0.0001, "num_input_tokens_seen": 225260256, "step": 104460 }, { "epoch": 19.171407597724354, "grad_norm": 0.00028741240384988487, "learning_rate": 5.221696491649775e-08, "loss": 0.0, "num_input_tokens_seen": 225271456, "step": 104465 }, { "epoch": 19.172325197283904, "grad_norm": 0.007303299847990274, "learning_rate": 5.2101603462657576e-08, "loss": 0.0, "num_input_tokens_seen": 225280448, "step": 104470 }, { "epoch": 19.173242796843457, "grad_norm": 0.0006175615126267076, "learning_rate": 5.198636891474262e-08, "loss": 0.0, "num_input_tokens_seen": 225291008, "step": 104475 }, { "epoch": 19.17416039640301, "grad_norm": 0.0012682608794420958, "learning_rate": 5.1871261275709405e-08, "loss": 0.0079, "num_input_tokens_seen": 225302976, "step": 104480 }, { "epoch": 19.17507799596256, "grad_norm": 0.002948339097201824, "learning_rate": 5.17562805485089e-08, "loss": 0.0, "num_input_tokens_seen": 225313440, "step": 104485 }, { "epoch": 19.175995595522114, "grad_norm": 0.001147883478552103, "learning_rate": 5.164142673609041e-08, "loss": 0.0, "num_input_tokens_seen": 225322944, "step": 104490 }, { "epoch": 19.176913195081667, "grad_norm": 0.0008969987975433469, "learning_rate": 5.152669984140102e-08, "loss": 0.0, "num_input_tokens_seen": 225334048, "step": 104495 }, { "epoch": 19.177830794641217, "grad_norm": 0.0013922801008448005, "learning_rate": 5.1412099867381715e-08, "loss": 0.0, "num_input_tokens_seen": 225345472, "step": 104500 }, { "epoch": 19.17874839420077, "grad_norm": 0.00039717741310596466, "learning_rate": 5.129762681697237e-08, "loss": 0.0, "num_input_tokens_seen": 225356672, "step": 104505 }, { "epoch": 19.179665993760324, "grad_norm": 0.0010158502263948321, "learning_rate": 5.118328069310896e-08, "loss": 0.0, "num_input_tokens_seen": 225367328, "step": 104510 }, { "epoch": 19.180583593319874, "grad_norm": 0.010935005731880665, "learning_rate": 5.10690614987247e-08, "loss": 0.0, "num_input_tokens_seen": 225379008, "step": 104515 }, { "epoch": 19.181501192879427, "grad_norm": 0.002511634724214673, "learning_rate": 5.095496923674892e-08, "loss": 0.0, "num_input_tokens_seen": 225389056, "step": 104520 }, { "epoch": 19.18241879243898, "grad_norm": 0.11008703708648682, "learning_rate": 5.08410039101076e-08, "loss": 0.0, "num_input_tokens_seen": 225400288, "step": 104525 }, { "epoch": 19.18333639199853, "grad_norm": 0.0004372221592348069, "learning_rate": 5.072716552172452e-08, "loss": 0.0, "num_input_tokens_seen": 225410112, "step": 104530 }, { "epoch": 19.184253991558084, "grad_norm": 0.01297786459326744, "learning_rate": 5.0613454074518455e-08, "loss": 0.0001, "num_input_tokens_seen": 225420896, "step": 104535 }, { "epoch": 19.185171591117637, "grad_norm": 0.006520366296172142, "learning_rate": 5.049986957140651e-08, "loss": 0.0, "num_input_tokens_seen": 225430624, "step": 104540 }, { "epoch": 19.186089190677187, "grad_norm": 0.012223719619214535, "learning_rate": 5.0386412015302475e-08, "loss": 0.0, "num_input_tokens_seen": 225440128, "step": 104545 }, { "epoch": 19.18700679023674, "grad_norm": 0.05694568157196045, "learning_rate": 5.027308140911513e-08, "loss": 0.0, "num_input_tokens_seen": 225452288, "step": 104550 }, { "epoch": 19.187924389796294, "grad_norm": 0.0006317977677099407, "learning_rate": 5.015987775575215e-08, "loss": 0.0, "num_input_tokens_seen": 225463392, "step": 104555 }, { "epoch": 19.188841989355844, "grad_norm": 0.005308985244482756, "learning_rate": 5.004680105811677e-08, "loss": 0.0, "num_input_tokens_seen": 225473920, "step": 104560 }, { "epoch": 19.189759588915397, "grad_norm": 0.027788551524281502, "learning_rate": 4.993385131910888e-08, "loss": 0.0, "num_input_tokens_seen": 225483712, "step": 104565 }, { "epoch": 19.19067718847495, "grad_norm": 0.001538264798000455, "learning_rate": 4.982102854162618e-08, "loss": 0.0, "num_input_tokens_seen": 225492960, "step": 104570 }, { "epoch": 19.1915947880345, "grad_norm": 0.0010389784583821893, "learning_rate": 4.97083327285619e-08, "loss": 0.0, "num_input_tokens_seen": 225505312, "step": 104575 }, { "epoch": 19.192512387594054, "grad_norm": 0.3442125916481018, "learning_rate": 4.9595763882806514e-08, "loss": 0.0003, "num_input_tokens_seen": 225516160, "step": 104580 }, { "epoch": 19.193429987153607, "grad_norm": 0.0036437464877963066, "learning_rate": 4.9483322007247145e-08, "loss": 0.0, "num_input_tokens_seen": 225526848, "step": 104585 }, { "epoch": 19.194347586713157, "grad_norm": 0.00032448326237499714, "learning_rate": 4.937100710476872e-08, "loss": 0.0, "num_input_tokens_seen": 225537792, "step": 104590 }, { "epoch": 19.19526518627271, "grad_norm": 0.0010330760851502419, "learning_rate": 4.9258819178250596e-08, "loss": 0.0, "num_input_tokens_seen": 225548288, "step": 104595 }, { "epoch": 19.196182785832264, "grad_norm": 0.0021128568332642317, "learning_rate": 4.914675823057102e-08, "loss": 0.0, "num_input_tokens_seen": 225559776, "step": 104600 }, { "epoch": 19.197100385391813, "grad_norm": 0.0011533984215930104, "learning_rate": 4.903482426460382e-08, "loss": 0.0, "num_input_tokens_seen": 225570592, "step": 104605 }, { "epoch": 19.198017984951367, "grad_norm": 0.0016174853080883622, "learning_rate": 4.892301728322002e-08, "loss": 0.0, "num_input_tokens_seen": 225581472, "step": 104610 }, { "epoch": 19.19893558451092, "grad_norm": 0.0010143662802875042, "learning_rate": 4.881133728928733e-08, "loss": 0.0, "num_input_tokens_seen": 225593760, "step": 104615 }, { "epoch": 19.19985318407047, "grad_norm": 0.0014737134333699942, "learning_rate": 4.869978428567012e-08, "loss": 0.0, "num_input_tokens_seen": 225605856, "step": 104620 }, { "epoch": 19.200770783630023, "grad_norm": 0.0019949544221162796, "learning_rate": 4.858835827523001e-08, "loss": 0.1376, "num_input_tokens_seen": 225617536, "step": 104625 }, { "epoch": 19.201688383189577, "grad_norm": 0.010620391927659512, "learning_rate": 4.8477059260824685e-08, "loss": 0.0, "num_input_tokens_seen": 225628096, "step": 104630 }, { "epoch": 19.202605982749127, "grad_norm": 0.00429608253762126, "learning_rate": 4.8365887245308e-08, "loss": 0.175, "num_input_tokens_seen": 225639072, "step": 104635 }, { "epoch": 19.20352358230868, "grad_norm": 0.0007997337961569428, "learning_rate": 4.8254842231532095e-08, "loss": 0.0, "num_input_tokens_seen": 225649664, "step": 104640 }, { "epoch": 19.204441181868233, "grad_norm": 0.003584455233067274, "learning_rate": 4.814392422234526e-08, "loss": 0.0, "num_input_tokens_seen": 225659616, "step": 104645 }, { "epoch": 19.205358781427783, "grad_norm": 0.003956693224608898, "learning_rate": 4.803313322059189e-08, "loss": 0.0, "num_input_tokens_seen": 225669856, "step": 104650 }, { "epoch": 19.206276380987337, "grad_norm": 0.0017179353162646294, "learning_rate": 4.792246922911359e-08, "loss": 0.0, "num_input_tokens_seen": 225680128, "step": 104655 }, { "epoch": 19.20719398054689, "grad_norm": 0.0009002010338008404, "learning_rate": 4.7811932250749205e-08, "loss": 0.0, "num_input_tokens_seen": 225691168, "step": 104660 }, { "epoch": 19.20811158010644, "grad_norm": 0.027511198073625565, "learning_rate": 4.7701522288333694e-08, "loss": 0.0, "num_input_tokens_seen": 225702656, "step": 104665 }, { "epoch": 19.209029179665993, "grad_norm": 0.0008972195209935308, "learning_rate": 4.759123934469867e-08, "loss": 0.0, "num_input_tokens_seen": 225714016, "step": 104670 }, { "epoch": 19.209946779225547, "grad_norm": 0.0009512725519016385, "learning_rate": 4.7481083422672435e-08, "loss": 0.0, "num_input_tokens_seen": 225724320, "step": 104675 }, { "epoch": 19.210864378785097, "grad_norm": 0.009812461212277412, "learning_rate": 4.737105452508106e-08, "loss": 0.0, "num_input_tokens_seen": 225734432, "step": 104680 }, { "epoch": 19.21178197834465, "grad_norm": 0.007833141833543777, "learning_rate": 4.726115265474673e-08, "loss": 0.0001, "num_input_tokens_seen": 225745824, "step": 104685 }, { "epoch": 19.212699577904203, "grad_norm": 0.0072474488988518715, "learning_rate": 4.715137781448664e-08, "loss": 0.0703, "num_input_tokens_seen": 225757248, "step": 104690 }, { "epoch": 19.213617177463753, "grad_norm": 0.05042479559779167, "learning_rate": 4.7041730007118536e-08, "loss": 0.0, "num_input_tokens_seen": 225768256, "step": 104695 }, { "epoch": 19.214534777023307, "grad_norm": 0.0013875234872102737, "learning_rate": 4.69322092354535e-08, "loss": 0.0, "num_input_tokens_seen": 225780256, "step": 104700 }, { "epoch": 19.21545237658286, "grad_norm": 0.007688978686928749, "learning_rate": 4.6822815502299834e-08, "loss": 0.0, "num_input_tokens_seen": 225790656, "step": 104705 }, { "epoch": 19.21636997614241, "grad_norm": 0.0036911393981426954, "learning_rate": 4.67135488104653e-08, "loss": 0.0, "num_input_tokens_seen": 225801408, "step": 104710 }, { "epoch": 19.217287575701963, "grad_norm": 0.0013997372006997466, "learning_rate": 4.6604409162750995e-08, "loss": 0.0, "num_input_tokens_seen": 225812128, "step": 104715 }, { "epoch": 19.218205175261517, "grad_norm": 0.02238621935248375, "learning_rate": 4.649539656195634e-08, "loss": 0.0, "num_input_tokens_seen": 225822272, "step": 104720 }, { "epoch": 19.219122774821066, "grad_norm": 0.002900676801800728, "learning_rate": 4.6386511010877435e-08, "loss": 0.0, "num_input_tokens_seen": 225832896, "step": 104725 }, { "epoch": 19.22004037438062, "grad_norm": 0.0024169848766177893, "learning_rate": 4.6277752512307595e-08, "loss": 0.0, "num_input_tokens_seen": 225844480, "step": 104730 }, { "epoch": 19.220957973940173, "grad_norm": 0.015560220927000046, "learning_rate": 4.6169121069035703e-08, "loss": 0.0, "num_input_tokens_seen": 225856160, "step": 104735 }, { "epoch": 19.221875573499723, "grad_norm": 0.0020140716806054115, "learning_rate": 4.606061668384787e-08, "loss": 0.0, "num_input_tokens_seen": 225867648, "step": 104740 }, { "epoch": 19.222793173059276, "grad_norm": 0.0013630359899252653, "learning_rate": 4.5952239359527416e-08, "loss": 0.0, "num_input_tokens_seen": 225880000, "step": 104745 }, { "epoch": 19.22371077261883, "grad_norm": 0.006043152417987585, "learning_rate": 4.584398909885379e-08, "loss": 0.0, "num_input_tokens_seen": 225891776, "step": 104750 }, { "epoch": 19.22462837217838, "grad_norm": 0.0023667789064347744, "learning_rate": 4.573586590460366e-08, "loss": 0.0, "num_input_tokens_seen": 225902016, "step": 104755 }, { "epoch": 19.225545971737933, "grad_norm": 0.002215562155470252, "learning_rate": 4.562786977955036e-08, "loss": 0.0, "num_input_tokens_seen": 225913728, "step": 104760 }, { "epoch": 19.226463571297487, "grad_norm": 0.004800573922693729, "learning_rate": 4.552000072646334e-08, "loss": 0.0, "num_input_tokens_seen": 225924256, "step": 104765 }, { "epoch": 19.227381170857036, "grad_norm": 0.0008979797130450606, "learning_rate": 4.541225874810984e-08, "loss": 0.0, "num_input_tokens_seen": 225935328, "step": 104770 }, { "epoch": 19.22829877041659, "grad_norm": 0.0025807516649365425, "learning_rate": 4.5304643847252636e-08, "loss": 0.0001, "num_input_tokens_seen": 225945280, "step": 104775 }, { "epoch": 19.229216369976143, "grad_norm": 0.23329554498195648, "learning_rate": 4.5197156026652866e-08, "loss": 0.0001, "num_input_tokens_seen": 225957248, "step": 104780 }, { "epoch": 19.230133969535693, "grad_norm": 0.00045129170757718384, "learning_rate": 4.508979528906609e-08, "loss": 0.0, "num_input_tokens_seen": 225968544, "step": 104785 }, { "epoch": 19.231051569095246, "grad_norm": 0.015135178342461586, "learning_rate": 4.4982561637247346e-08, "loss": 0.0, "num_input_tokens_seen": 225979648, "step": 104790 }, { "epoch": 19.2319691686548, "grad_norm": 0.10305668413639069, "learning_rate": 4.487545507394608e-08, "loss": 0.0001, "num_input_tokens_seen": 225990176, "step": 104795 }, { "epoch": 19.23288676821435, "grad_norm": 0.0006626737304031849, "learning_rate": 4.476847560190956e-08, "loss": 0.0, "num_input_tokens_seen": 226000672, "step": 104800 }, { "epoch": 19.233804367773903, "grad_norm": 0.0016754971584305167, "learning_rate": 4.466162322388112e-08, "loss": 0.0, "num_input_tokens_seen": 226012512, "step": 104805 }, { "epoch": 19.234721967333456, "grad_norm": 0.009588281624019146, "learning_rate": 4.4554897942603034e-08, "loss": 0.0, "num_input_tokens_seen": 226023552, "step": 104810 }, { "epoch": 19.235639566893006, "grad_norm": 0.001128033734858036, "learning_rate": 4.4448299760810884e-08, "loss": 0.0, "num_input_tokens_seen": 226034432, "step": 104815 }, { "epoch": 19.23655716645256, "grad_norm": 0.005104952957481146, "learning_rate": 4.434182868123971e-08, "loss": 0.0883, "num_input_tokens_seen": 226046336, "step": 104820 }, { "epoch": 19.237474766012113, "grad_norm": 0.0008124005398713052, "learning_rate": 4.4235484706619535e-08, "loss": 0.0, "num_input_tokens_seen": 226057696, "step": 104825 }, { "epoch": 19.238392365571663, "grad_norm": 0.0005311283748596907, "learning_rate": 4.4129267839679305e-08, "loss": 0.0, "num_input_tokens_seen": 226067424, "step": 104830 }, { "epoch": 19.239309965131216, "grad_norm": 0.0005555434036068618, "learning_rate": 4.402317808314183e-08, "loss": 0.0, "num_input_tokens_seen": 226077696, "step": 104835 }, { "epoch": 19.24022756469077, "grad_norm": 0.0012052537640556693, "learning_rate": 4.3917215439728824e-08, "loss": 0.0005, "num_input_tokens_seen": 226087712, "step": 104840 }, { "epoch": 19.24114516425032, "grad_norm": 0.0003151008568238467, "learning_rate": 4.3811379912158114e-08, "loss": 0.0, "num_input_tokens_seen": 226097536, "step": 104845 }, { "epoch": 19.242062763809873, "grad_norm": 0.0004907565889880061, "learning_rate": 4.37056715031442e-08, "loss": 0.0, "num_input_tokens_seen": 226107648, "step": 104850 }, { "epoch": 19.242980363369426, "grad_norm": 0.18626432120800018, "learning_rate": 4.360009021539768e-08, "loss": 0.0001, "num_input_tokens_seen": 226118400, "step": 104855 }, { "epoch": 19.243897962928976, "grad_norm": 0.002590940101072192, "learning_rate": 4.349463605162807e-08, "loss": 0.0, "num_input_tokens_seen": 226128576, "step": 104860 }, { "epoch": 19.24481556248853, "grad_norm": 0.002127087442204356, "learning_rate": 4.338930901453875e-08, "loss": 0.0, "num_input_tokens_seen": 226139840, "step": 104865 }, { "epoch": 19.245733162048083, "grad_norm": 0.0005768788978457451, "learning_rate": 4.328410910683145e-08, "loss": 0.0, "num_input_tokens_seen": 226150976, "step": 104870 }, { "epoch": 19.246650761607633, "grad_norm": 0.0009561731712892652, "learning_rate": 4.317903633120457e-08, "loss": 0.0, "num_input_tokens_seen": 226162784, "step": 104875 }, { "epoch": 19.247568361167186, "grad_norm": 0.0006272547761909664, "learning_rate": 4.3074090690353175e-08, "loss": 0.0, "num_input_tokens_seen": 226172640, "step": 104880 }, { "epoch": 19.24848596072674, "grad_norm": 0.005839822813868523, "learning_rate": 4.2969272186969004e-08, "loss": 0.0, "num_input_tokens_seen": 226183616, "step": 104885 }, { "epoch": 19.24940356028629, "grad_norm": 0.0004596459912136197, "learning_rate": 4.286458082373934e-08, "loss": 0.0, "num_input_tokens_seen": 226194144, "step": 104890 }, { "epoch": 19.250321159845843, "grad_norm": 0.002153946552425623, "learning_rate": 4.2760016603351493e-08, "loss": 0.0, "num_input_tokens_seen": 226204960, "step": 104895 }, { "epoch": 19.251238759405396, "grad_norm": 0.00021563359769061208, "learning_rate": 4.265557952848554e-08, "loss": 0.0, "num_input_tokens_seen": 226216096, "step": 104900 }, { "epoch": 19.252156358964946, "grad_norm": 0.008087920024991035, "learning_rate": 4.255126960182099e-08, "loss": 0.0, "num_input_tokens_seen": 226227936, "step": 104905 }, { "epoch": 19.2530739585245, "grad_norm": 0.0031920094043016434, "learning_rate": 4.244708682603293e-08, "loss": 0.0, "num_input_tokens_seen": 226237280, "step": 104910 }, { "epoch": 19.253991558084053, "grad_norm": 0.003241103608161211, "learning_rate": 4.234303120379368e-08, "loss": 0.0, "num_input_tokens_seen": 226248384, "step": 104915 }, { "epoch": 19.254909157643603, "grad_norm": 0.0003784722357522696, "learning_rate": 4.2239102737772207e-08, "loss": 0.0, "num_input_tokens_seen": 226258784, "step": 104920 }, { "epoch": 19.255826757203156, "grad_norm": 0.0006931436364538968, "learning_rate": 4.2135301430633046e-08, "loss": 0.0, "num_input_tokens_seen": 226270144, "step": 104925 }, { "epoch": 19.25674435676271, "grad_norm": 0.002096533076837659, "learning_rate": 4.203162728504018e-08, "loss": 0.0, "num_input_tokens_seen": 226281568, "step": 104930 }, { "epoch": 19.25766195632226, "grad_norm": 0.008252185769379139, "learning_rate": 4.192808030365203e-08, "loss": 0.0, "num_input_tokens_seen": 226291104, "step": 104935 }, { "epoch": 19.258579555881813, "grad_norm": 0.024456247687339783, "learning_rate": 4.1824660489123705e-08, "loss": 0.0002, "num_input_tokens_seen": 226304224, "step": 104940 }, { "epoch": 19.259497155441366, "grad_norm": 0.06452431529760361, "learning_rate": 4.172136784410918e-08, "loss": 0.0, "num_input_tokens_seen": 226314752, "step": 104945 }, { "epoch": 19.260414755000916, "grad_norm": 0.002838309621438384, "learning_rate": 4.1618202371256355e-08, "loss": 0.0, "num_input_tokens_seen": 226324320, "step": 104950 }, { "epoch": 19.26133235456047, "grad_norm": 0.0009236327023245394, "learning_rate": 4.1515164073211987e-08, "loss": 0.0, "num_input_tokens_seen": 226334272, "step": 104955 }, { "epoch": 19.262249954120023, "grad_norm": 0.0014810562133789062, "learning_rate": 4.141225295261842e-08, "loss": 0.0, "num_input_tokens_seen": 226344032, "step": 104960 }, { "epoch": 19.263167553679573, "grad_norm": 0.14853079617023468, "learning_rate": 4.1309469012115744e-08, "loss": 0.0001, "num_input_tokens_seen": 226354272, "step": 104965 }, { "epoch": 19.264085153239126, "grad_norm": 0.03379269689321518, "learning_rate": 4.1206812254340204e-08, "loss": 0.0001, "num_input_tokens_seen": 226365760, "step": 104970 }, { "epoch": 19.26500275279868, "grad_norm": 0.0014389120042324066, "learning_rate": 4.110428268192412e-08, "loss": 0.0, "num_input_tokens_seen": 226377536, "step": 104975 }, { "epoch": 19.26592035235823, "grad_norm": 0.02417238987982273, "learning_rate": 4.100188029749763e-08, "loss": 0.0, "num_input_tokens_seen": 226389280, "step": 104980 }, { "epoch": 19.266837951917783, "grad_norm": 0.0005084027070552111, "learning_rate": 4.0899605103686956e-08, "loss": 0.004, "num_input_tokens_seen": 226399168, "step": 104985 }, { "epoch": 19.267755551477336, "grad_norm": 0.0005214816774241626, "learning_rate": 4.079745710311611e-08, "loss": 0.0, "num_input_tokens_seen": 226408864, "step": 104990 }, { "epoch": 19.268673151036886, "grad_norm": 0.0030773740727454424, "learning_rate": 4.069543629840411e-08, "loss": 0.0, "num_input_tokens_seen": 226419968, "step": 104995 }, { "epoch": 19.26959075059644, "grad_norm": 0.055503129959106445, "learning_rate": 4.0593542692167755e-08, "loss": 0.0, "num_input_tokens_seen": 226431104, "step": 105000 }, { "epoch": 19.270508350155993, "grad_norm": 0.008361512795090675, "learning_rate": 4.04917762870205e-08, "loss": 0.0, "num_input_tokens_seen": 226441824, "step": 105005 }, { "epoch": 19.271425949715542, "grad_norm": 0.0020328399259597063, "learning_rate": 4.0390137085573046e-08, "loss": 0.0, "num_input_tokens_seen": 226452320, "step": 105010 }, { "epoch": 19.272343549275096, "grad_norm": 0.0009424887830391526, "learning_rate": 4.0288625090431634e-08, "loss": 0.0, "num_input_tokens_seen": 226463168, "step": 105015 }, { "epoch": 19.27326114883465, "grad_norm": 0.004174674861133099, "learning_rate": 4.0187240304200294e-08, "loss": 0.0, "num_input_tokens_seen": 226475104, "step": 105020 }, { "epoch": 19.2741787483942, "grad_norm": 0.001276395982131362, "learning_rate": 4.008598272947917e-08, "loss": 0.0, "num_input_tokens_seen": 226486656, "step": 105025 }, { "epoch": 19.275096347953752, "grad_norm": 0.0014259315794333816, "learning_rate": 3.9984852368865626e-08, "loss": 0.0, "num_input_tokens_seen": 226497696, "step": 105030 }, { "epoch": 19.276013947513306, "grad_norm": 0.006657302379608154, "learning_rate": 3.988384922495314e-08, "loss": 0.0, "num_input_tokens_seen": 226509344, "step": 105035 }, { "epoch": 19.276931547072856, "grad_norm": 0.0054649608209729195, "learning_rate": 3.978297330033187e-08, "loss": 0.0, "num_input_tokens_seen": 226520896, "step": 105040 }, { "epoch": 19.27784914663241, "grad_norm": 0.007826228626072407, "learning_rate": 3.9682224597590303e-08, "loss": 0.0001, "num_input_tokens_seen": 226531264, "step": 105045 }, { "epoch": 19.278766746191963, "grad_norm": 0.0017337341560050845, "learning_rate": 3.9581603119311915e-08, "loss": 0.0, "num_input_tokens_seen": 226542624, "step": 105050 }, { "epoch": 19.279684345751512, "grad_norm": 0.03959260135889053, "learning_rate": 3.948110886807743e-08, "loss": 0.0, "num_input_tokens_seen": 226554048, "step": 105055 }, { "epoch": 19.280601945311066, "grad_norm": 0.007349149789661169, "learning_rate": 3.938074184646423e-08, "loss": 0.0078, "num_input_tokens_seen": 226563776, "step": 105060 }, { "epoch": 19.28151954487062, "grad_norm": 0.0007198068196885288, "learning_rate": 3.928050205704692e-08, "loss": 0.0001, "num_input_tokens_seen": 226575552, "step": 105065 }, { "epoch": 19.28243714443017, "grad_norm": 0.0032601982820779085, "learning_rate": 3.9180389502396224e-08, "loss": 0.0, "num_input_tokens_seen": 226585408, "step": 105070 }, { "epoch": 19.283354743989722, "grad_norm": 0.0014261520700529218, "learning_rate": 3.9080404185079524e-08, "loss": 0.0, "num_input_tokens_seen": 226596192, "step": 105075 }, { "epoch": 19.284272343549276, "grad_norm": 0.008884843438863754, "learning_rate": 3.898054610766255e-08, "loss": 0.0, "num_input_tokens_seen": 226606176, "step": 105080 }, { "epoch": 19.285189943108826, "grad_norm": 0.11550518125295639, "learning_rate": 3.888081527270493e-08, "loss": 0.0001, "num_input_tokens_seen": 226617440, "step": 105085 }, { "epoch": 19.28610754266838, "grad_norm": 0.016599010676145554, "learning_rate": 3.8781211682765716e-08, "loss": 0.0, "num_input_tokens_seen": 226627968, "step": 105090 }, { "epoch": 19.287025142227932, "grad_norm": 0.01772504486143589, "learning_rate": 3.8681735340398984e-08, "loss": 0.0, "num_input_tokens_seen": 226639712, "step": 105095 }, { "epoch": 19.287942741787482, "grad_norm": 0.0005608012434095144, "learning_rate": 3.8582386248157133e-08, "loss": 0.0, "num_input_tokens_seen": 226652000, "step": 105100 }, { "epoch": 19.288860341347036, "grad_norm": 0.0009067541686818004, "learning_rate": 3.8483164408587016e-08, "loss": 0.0, "num_input_tokens_seen": 226662144, "step": 105105 }, { "epoch": 19.28977794090659, "grad_norm": 0.0010090116411447525, "learning_rate": 3.838406982423382e-08, "loss": 0.0, "num_input_tokens_seen": 226673664, "step": 105110 }, { "epoch": 19.29069554046614, "grad_norm": 0.0026398447807878256, "learning_rate": 3.828510249763995e-08, "loss": 0.0, "num_input_tokens_seen": 226683392, "step": 105115 }, { "epoch": 19.291613140025692, "grad_norm": 0.015216858126223087, "learning_rate": 3.8186262431342826e-08, "loss": 0.0, "num_input_tokens_seen": 226695008, "step": 105120 }, { "epoch": 19.292530739585246, "grad_norm": 0.0041138664819300175, "learning_rate": 3.8087549627878196e-08, "loss": 0.0, "num_input_tokens_seen": 226705888, "step": 105125 }, { "epoch": 19.293448339144796, "grad_norm": 0.010534973815083504, "learning_rate": 3.798896408977737e-08, "loss": 0.0, "num_input_tokens_seen": 226716352, "step": 105130 }, { "epoch": 19.29436593870435, "grad_norm": 0.014299578964710236, "learning_rate": 3.7890505819569435e-08, "loss": 0.0, "num_input_tokens_seen": 226725152, "step": 105135 }, { "epoch": 19.295283538263902, "grad_norm": 0.013040182180702686, "learning_rate": 3.779217481977959e-08, "loss": 0.0016, "num_input_tokens_seen": 226736000, "step": 105140 }, { "epoch": 19.296201137823452, "grad_norm": 0.03666457161307335, "learning_rate": 3.769397109292971e-08, "loss": 0.0, "num_input_tokens_seen": 226747488, "step": 105145 }, { "epoch": 19.297118737383006, "grad_norm": 0.0005614567780867219, "learning_rate": 3.759589464153834e-08, "loss": 0.0, "num_input_tokens_seen": 226759104, "step": 105150 }, { "epoch": 19.29803633694256, "grad_norm": 0.0035191327333450317, "learning_rate": 3.7497945468121244e-08, "loss": 0.0002, "num_input_tokens_seen": 226770176, "step": 105155 }, { "epoch": 19.29895393650211, "grad_norm": 0.002586695598438382, "learning_rate": 3.740012357519085e-08, "loss": 0.0003, "num_input_tokens_seen": 226782240, "step": 105160 }, { "epoch": 19.299871536061662, "grad_norm": 0.011729788966476917, "learning_rate": 3.7302428965256263e-08, "loss": 0.0, "num_input_tokens_seen": 226793152, "step": 105165 }, { "epoch": 19.300789135621216, "grad_norm": 0.0018145251087844372, "learning_rate": 3.7204861640822154e-08, "loss": 0.0, "num_input_tokens_seen": 226804992, "step": 105170 }, { "epoch": 19.301706735180765, "grad_norm": 0.002921454142779112, "learning_rate": 3.710742160439207e-08, "loss": 0.0, "num_input_tokens_seen": 226814848, "step": 105175 }, { "epoch": 19.30262433474032, "grad_norm": 0.001253634225577116, "learning_rate": 3.701010885846512e-08, "loss": 0.0, "num_input_tokens_seen": 226825056, "step": 105180 }, { "epoch": 19.303541934299872, "grad_norm": 0.02482270635664463, "learning_rate": 3.691292340553654e-08, "loss": 0.0, "num_input_tokens_seen": 226836864, "step": 105185 }, { "epoch": 19.304459533859422, "grad_norm": 0.002049713395535946, "learning_rate": 3.681586524809932e-08, "loss": 0.0, "num_input_tokens_seen": 226847360, "step": 105190 }, { "epoch": 19.305377133418975, "grad_norm": 0.004540317691862583, "learning_rate": 3.671893438864316e-08, "loss": 0.0, "num_input_tokens_seen": 226858688, "step": 105195 }, { "epoch": 19.30629473297853, "grad_norm": 0.03948626667261124, "learning_rate": 3.662213082965383e-08, "loss": 0.0, "num_input_tokens_seen": 226869952, "step": 105200 }, { "epoch": 19.30721233253808, "grad_norm": 0.0004040607309434563, "learning_rate": 3.652545457361489e-08, "loss": 0.0, "num_input_tokens_seen": 226881024, "step": 105205 }, { "epoch": 19.308129932097632, "grad_norm": 0.08661606162786484, "learning_rate": 3.642890562300439e-08, "loss": 0.0, "num_input_tokens_seen": 226891072, "step": 105210 }, { "epoch": 19.309047531657185, "grad_norm": 0.0035897139459848404, "learning_rate": 3.6332483980300316e-08, "loss": 0.0001, "num_input_tokens_seen": 226900864, "step": 105215 }, { "epoch": 19.309965131216735, "grad_norm": 0.014052368700504303, "learning_rate": 3.62361896479746e-08, "loss": 0.0, "num_input_tokens_seen": 226911680, "step": 105220 }, { "epoch": 19.31088273077629, "grad_norm": 0.004483082797378302, "learning_rate": 3.614002262849803e-08, "loss": 0.0, "num_input_tokens_seen": 226922848, "step": 105225 }, { "epoch": 19.311800330335842, "grad_norm": 0.000672282068990171, "learning_rate": 3.604398292433586e-08, "loss": 0.0, "num_input_tokens_seen": 226934752, "step": 105230 }, { "epoch": 19.312717929895392, "grad_norm": 0.0002677877782844007, "learning_rate": 3.5948070537952796e-08, "loss": 0.0, "num_input_tokens_seen": 226945760, "step": 105235 }, { "epoch": 19.313635529454945, "grad_norm": 0.015631528571248055, "learning_rate": 3.585228547180797e-08, "loss": 0.0, "num_input_tokens_seen": 226957984, "step": 105240 }, { "epoch": 19.3145531290145, "grad_norm": 0.0133213447406888, "learning_rate": 3.575662772835775e-08, "loss": 0.0, "num_input_tokens_seen": 226968128, "step": 105245 }, { "epoch": 19.31547072857405, "grad_norm": 0.00046788720646873116, "learning_rate": 3.5661097310056846e-08, "loss": 0.0, "num_input_tokens_seen": 226978592, "step": 105250 }, { "epoch": 19.316388328133602, "grad_norm": 0.002259301720187068, "learning_rate": 3.5565694219354406e-08, "loss": 0.0, "num_input_tokens_seen": 226989184, "step": 105255 }, { "epoch": 19.317305927693155, "grad_norm": 0.07837407290935516, "learning_rate": 3.5470418458697365e-08, "loss": 0.0, "num_input_tokens_seen": 227000000, "step": 105260 }, { "epoch": 19.318223527252705, "grad_norm": 0.005989916156977415, "learning_rate": 3.5375270030530427e-08, "loss": 0.0, "num_input_tokens_seen": 227009632, "step": 105265 }, { "epoch": 19.31914112681226, "grad_norm": 0.0008339235209859908, "learning_rate": 3.5280248937293316e-08, "loss": 0.0032, "num_input_tokens_seen": 227019936, "step": 105270 }, { "epoch": 19.320058726371812, "grad_norm": 0.010687616653740406, "learning_rate": 3.518535518142297e-08, "loss": 0.0, "num_input_tokens_seen": 227030112, "step": 105275 }, { "epoch": 19.320976325931362, "grad_norm": 0.0013381342869251966, "learning_rate": 3.509058876535354e-08, "loss": 0.0, "num_input_tokens_seen": 227039744, "step": 105280 }, { "epoch": 19.321893925490915, "grad_norm": 0.0422818697988987, "learning_rate": 3.499594969151532e-08, "loss": 0.0, "num_input_tokens_seen": 227051008, "step": 105285 }, { "epoch": 19.32281152505047, "grad_norm": 0.0025007787626236677, "learning_rate": 3.490143796233636e-08, "loss": 0.0, "num_input_tokens_seen": 227062976, "step": 105290 }, { "epoch": 19.32372912461002, "grad_norm": 10.400467872619629, "learning_rate": 3.4807053580239726e-08, "loss": 0.0032, "num_input_tokens_seen": 227073728, "step": 105295 }, { "epoch": 19.324646724169572, "grad_norm": 0.0066595482639968395, "learning_rate": 3.471279654764737e-08, "loss": 0.0, "num_input_tokens_seen": 227085440, "step": 105300 }, { "epoch": 19.325564323729125, "grad_norm": 0.009573706425726414, "learning_rate": 3.461866686697624e-08, "loss": 0.0, "num_input_tokens_seen": 227095072, "step": 105305 }, { "epoch": 19.326481923288675, "grad_norm": 0.0033817037474364042, "learning_rate": 3.4524664540640515e-08, "loss": 0.0, "num_input_tokens_seen": 227106720, "step": 105310 }, { "epoch": 19.32739952284823, "grad_norm": 0.00364812184125185, "learning_rate": 3.4430789571051615e-08, "loss": 0.0, "num_input_tokens_seen": 227117760, "step": 105315 }, { "epoch": 19.328317122407782, "grad_norm": 0.06451916694641113, "learning_rate": 3.4337041960616494e-08, "loss": 0.0, "num_input_tokens_seen": 227127552, "step": 105320 }, { "epoch": 19.32923472196733, "grad_norm": 0.0009401766583323479, "learning_rate": 3.4243421711740996e-08, "loss": 0.0, "num_input_tokens_seen": 227138752, "step": 105325 }, { "epoch": 19.330152321526885, "grad_norm": 0.007755069062113762, "learning_rate": 3.414992882682433e-08, "loss": 0.0, "num_input_tokens_seen": 227150720, "step": 105330 }, { "epoch": 19.33106992108644, "grad_norm": 0.005744833964854479, "learning_rate": 3.405656330826679e-08, "loss": 0.0, "num_input_tokens_seen": 227161952, "step": 105335 }, { "epoch": 19.33198752064599, "grad_norm": 0.004309490788727999, "learning_rate": 3.396332515846146e-08, "loss": 0.0, "num_input_tokens_seen": 227172832, "step": 105340 }, { "epoch": 19.33290512020554, "grad_norm": 0.0005300745833665133, "learning_rate": 3.387021437979976e-08, "loss": 0.0, "num_input_tokens_seen": 227183488, "step": 105345 }, { "epoch": 19.333822719765095, "grad_norm": 0.0008260673494078219, "learning_rate": 3.377723097467089e-08, "loss": 0.0001, "num_input_tokens_seen": 227195232, "step": 105350 }, { "epoch": 19.334740319324645, "grad_norm": 0.0010429065441712737, "learning_rate": 3.3684374945458506e-08, "loss": 0.0, "num_input_tokens_seen": 227205728, "step": 105355 }, { "epoch": 19.3356579188842, "grad_norm": 0.07826551049947739, "learning_rate": 3.3591646294545146e-08, "loss": 0.0, "num_input_tokens_seen": 227214848, "step": 105360 }, { "epoch": 19.33657551844375, "grad_norm": 0.006517494097352028, "learning_rate": 3.34990450243089e-08, "loss": 0.0, "num_input_tokens_seen": 227226464, "step": 105365 }, { "epoch": 19.3374931180033, "grad_norm": 0.001286052749492228, "learning_rate": 3.340657113712453e-08, "loss": 0.0, "num_input_tokens_seen": 227238400, "step": 105370 }, { "epoch": 19.338410717562855, "grad_norm": 0.0030022128485143185, "learning_rate": 3.3314224635364603e-08, "loss": 0.0001, "num_input_tokens_seen": 227248096, "step": 105375 }, { "epoch": 19.33932831712241, "grad_norm": 0.0022742520086467266, "learning_rate": 3.3222005521396097e-08, "loss": 0.0, "num_input_tokens_seen": 227257408, "step": 105380 }, { "epoch": 19.34024591668196, "grad_norm": 0.0009370839688926935, "learning_rate": 3.312991379758657e-08, "loss": 0.0, "num_input_tokens_seen": 227267904, "step": 105385 }, { "epoch": 19.34116351624151, "grad_norm": 0.00042452654452063143, "learning_rate": 3.303794946629635e-08, "loss": 0.0, "num_input_tokens_seen": 227278816, "step": 105390 }, { "epoch": 19.342081115801065, "grad_norm": 0.04517105966806412, "learning_rate": 3.294611252988411e-08, "loss": 0.0, "num_input_tokens_seen": 227289536, "step": 105395 }, { "epoch": 19.342998715360615, "grad_norm": 0.001397663843818009, "learning_rate": 3.2854402990706305e-08, "loss": 0.0, "num_input_tokens_seen": 227298752, "step": 105400 }, { "epoch": 19.34391631492017, "grad_norm": 0.11674831062555313, "learning_rate": 3.276282085111493e-08, "loss": 0.0, "num_input_tokens_seen": 227308608, "step": 105405 }, { "epoch": 19.34483391447972, "grad_norm": 0.000841735047288239, "learning_rate": 3.267136611345812e-08, "loss": 0.0, "num_input_tokens_seen": 227319616, "step": 105410 }, { "epoch": 19.34575151403927, "grad_norm": 0.0016826970968395472, "learning_rate": 3.2580038780082315e-08, "loss": 0.1563, "num_input_tokens_seen": 227328544, "step": 105415 }, { "epoch": 19.346669113598825, "grad_norm": 0.007699658162891865, "learning_rate": 3.2488838853330096e-08, "loss": 0.0, "num_input_tokens_seen": 227340480, "step": 105420 }, { "epoch": 19.34758671315838, "grad_norm": 0.004164164420217276, "learning_rate": 3.239776633553959e-08, "loss": 0.0, "num_input_tokens_seen": 227352192, "step": 105425 }, { "epoch": 19.348504312717928, "grad_norm": 0.007063626311719418, "learning_rate": 3.2306821229047826e-08, "loss": 0.0, "num_input_tokens_seen": 227362144, "step": 105430 }, { "epoch": 19.34942191227748, "grad_norm": 0.00046717008808627725, "learning_rate": 3.221600353618681e-08, "loss": 0.0, "num_input_tokens_seen": 227372640, "step": 105435 }, { "epoch": 19.350339511837035, "grad_norm": 0.056678906083106995, "learning_rate": 3.212531325928525e-08, "loss": 0.0001, "num_input_tokens_seen": 227383040, "step": 105440 }, { "epoch": 19.351257111396585, "grad_norm": 0.0015735067427158356, "learning_rate": 3.203475040067072e-08, "loss": 0.0012, "num_input_tokens_seen": 227395008, "step": 105445 }, { "epoch": 19.352174710956138, "grad_norm": 0.0012166659580543637, "learning_rate": 3.19443149626647e-08, "loss": 0.0, "num_input_tokens_seen": 227405952, "step": 105450 }, { "epoch": 19.35309231051569, "grad_norm": 0.001007203245535493, "learning_rate": 3.1854006947587554e-08, "loss": 0.0, "num_input_tokens_seen": 227416224, "step": 105455 }, { "epoch": 19.35400991007524, "grad_norm": 0.0072745163924992085, "learning_rate": 3.1763826357755214e-08, "loss": 0.0, "num_input_tokens_seen": 227427776, "step": 105460 }, { "epoch": 19.354927509634795, "grad_norm": 0.005248855333775282, "learning_rate": 3.167377319548026e-08, "loss": 0.119, "num_input_tokens_seen": 227438016, "step": 105465 }, { "epoch": 19.355845109194348, "grad_norm": 0.003611554391682148, "learning_rate": 3.158384746307308e-08, "loss": 0.0097, "num_input_tokens_seen": 227447456, "step": 105470 }, { "epoch": 19.356762708753898, "grad_norm": 0.0025965021923184395, "learning_rate": 3.1494049162840155e-08, "loss": 0.0, "num_input_tokens_seen": 227459520, "step": 105475 }, { "epoch": 19.35768030831345, "grad_norm": 0.07242874801158905, "learning_rate": 3.140437829708354e-08, "loss": 0.0, "num_input_tokens_seen": 227470688, "step": 105480 }, { "epoch": 19.358597907873005, "grad_norm": 0.0013397011207416654, "learning_rate": 3.131483486810472e-08, "loss": 0.0, "num_input_tokens_seen": 227481920, "step": 105485 }, { "epoch": 19.359515507432555, "grad_norm": 0.0754951536655426, "learning_rate": 3.1225418878199635e-08, "loss": 0.0001, "num_input_tokens_seen": 227492352, "step": 105490 }, { "epoch": 19.360433106992108, "grad_norm": 0.006322022061794996, "learning_rate": 3.1136130329661455e-08, "loss": 0.0, "num_input_tokens_seen": 227503648, "step": 105495 }, { "epoch": 19.36135070655166, "grad_norm": 0.002546392846852541, "learning_rate": 3.104696922478057e-08, "loss": 0.0, "num_input_tokens_seen": 227514400, "step": 105500 }, { "epoch": 19.36226830611121, "grad_norm": 0.0012110313400626183, "learning_rate": 3.095793556584348e-08, "loss": 0.0, "num_input_tokens_seen": 227524160, "step": 105505 }, { "epoch": 19.363185905670765, "grad_norm": 0.0016885744407773018, "learning_rate": 3.0869029355134474e-08, "loss": 0.0, "num_input_tokens_seen": 227535680, "step": 105510 }, { "epoch": 19.364103505230318, "grad_norm": 0.0009205179521813989, "learning_rate": 3.0780250594932836e-08, "loss": 0.0, "num_input_tokens_seen": 227546816, "step": 105515 }, { "epoch": 19.365021104789868, "grad_norm": 0.0003845971659757197, "learning_rate": 3.069159928751675e-08, "loss": 0.0, "num_input_tokens_seen": 227557536, "step": 105520 }, { "epoch": 19.36593870434942, "grad_norm": 0.006010969635099173, "learning_rate": 3.0603075435158836e-08, "loss": 0.0, "num_input_tokens_seen": 227568512, "step": 105525 }, { "epoch": 19.366856303908975, "grad_norm": 0.0006725462735630572, "learning_rate": 3.0514679040130614e-08, "loss": 0.0, "num_input_tokens_seen": 227580000, "step": 105530 }, { "epoch": 19.367773903468525, "grad_norm": 0.0007636211230419576, "learning_rate": 3.04264101046986e-08, "loss": 0.0, "num_input_tokens_seen": 227591136, "step": 105535 }, { "epoch": 19.368691503028078, "grad_norm": 0.0036946875043213367, "learning_rate": 3.03382686311271e-08, "loss": 0.0, "num_input_tokens_seen": 227601504, "step": 105540 }, { "epoch": 19.36960910258763, "grad_norm": 0.0033926067408174276, "learning_rate": 3.0250254621677077e-08, "loss": 0.0, "num_input_tokens_seen": 227612736, "step": 105545 }, { "epoch": 19.37052670214718, "grad_norm": 0.000933734467253089, "learning_rate": 3.016236807860506e-08, "loss": 0.0, "num_input_tokens_seen": 227624224, "step": 105550 }, { "epoch": 19.371444301706735, "grad_norm": 0.07178855687379837, "learning_rate": 3.007460900416592e-08, "loss": 0.0, "num_input_tokens_seen": 227635136, "step": 105555 }, { "epoch": 19.372361901266288, "grad_norm": 0.0016446139197796583, "learning_rate": 2.998697740061063e-08, "loss": 0.0, "num_input_tokens_seen": 227645088, "step": 105560 }, { "epoch": 19.373279500825838, "grad_norm": 0.018666287884116173, "learning_rate": 2.9899473270186276e-08, "loss": 0.0, "num_input_tokens_seen": 227655872, "step": 105565 }, { "epoch": 19.37419710038539, "grad_norm": 0.01042644027620554, "learning_rate": 2.981209661513773e-08, "loss": 0.0, "num_input_tokens_seen": 227667072, "step": 105570 }, { "epoch": 19.375114699944945, "grad_norm": 0.004880489315837622, "learning_rate": 2.9724847437705428e-08, "loss": 0.0, "num_input_tokens_seen": 227677088, "step": 105575 }, { "epoch": 19.376032299504494, "grad_norm": 0.004103366285562515, "learning_rate": 2.9637725740127578e-08, "loss": 0.0, "num_input_tokens_seen": 227686048, "step": 105580 }, { "epoch": 19.376949899064048, "grad_norm": 0.0016315690008923411, "learning_rate": 2.9550731524639053e-08, "loss": 0.0, "num_input_tokens_seen": 227697120, "step": 105585 }, { "epoch": 19.3778674986236, "grad_norm": 0.0005492214113473892, "learning_rate": 2.9463864793470853e-08, "loss": 0.0, "num_input_tokens_seen": 227707968, "step": 105590 }, { "epoch": 19.37878509818315, "grad_norm": 0.0013895380543544888, "learning_rate": 2.9377125548850638e-08, "loss": 0.0, "num_input_tokens_seen": 227718336, "step": 105595 }, { "epoch": 19.379702697742704, "grad_norm": 0.1597488522529602, "learning_rate": 2.9290513793003294e-08, "loss": 0.0001, "num_input_tokens_seen": 227729376, "step": 105600 }, { "epoch": 19.380620297302258, "grad_norm": 0.0005292928544804454, "learning_rate": 2.9204029528150378e-08, "loss": 0.0, "num_input_tokens_seen": 227740480, "step": 105605 }, { "epoch": 19.381537896861808, "grad_norm": 0.00046491980901919305, "learning_rate": 2.9117672756510673e-08, "loss": 0.0001, "num_input_tokens_seen": 227751936, "step": 105610 }, { "epoch": 19.38245549642136, "grad_norm": 0.0017319354228675365, "learning_rate": 2.9031443480297406e-08, "loss": 0.0, "num_input_tokens_seen": 227761504, "step": 105615 }, { "epoch": 19.383373095980915, "grad_norm": 0.0028088735416531563, "learning_rate": 2.894534170172436e-08, "loss": 0.0, "num_input_tokens_seen": 227772448, "step": 105620 }, { "epoch": 19.384290695540464, "grad_norm": 0.001018422655761242, "learning_rate": 2.8859367422998108e-08, "loss": 0.0, "num_input_tokens_seen": 227784128, "step": 105625 }, { "epoch": 19.385208295100018, "grad_norm": 0.0028043093625456095, "learning_rate": 2.8773520646325214e-08, "loss": 0.0, "num_input_tokens_seen": 227795008, "step": 105630 }, { "epoch": 19.38612589465957, "grad_norm": 0.0015952549874782562, "learning_rate": 2.8687801373906142e-08, "loss": 0.0, "num_input_tokens_seen": 227805920, "step": 105635 }, { "epoch": 19.38704349421912, "grad_norm": 0.0005964514566585422, "learning_rate": 2.8602209607940247e-08, "loss": 0.0, "num_input_tokens_seen": 227816768, "step": 105640 }, { "epoch": 19.387961093778674, "grad_norm": 0.048606373369693756, "learning_rate": 2.8516745350622987e-08, "loss": 0.0, "num_input_tokens_seen": 227827520, "step": 105645 }, { "epoch": 19.388878693338228, "grad_norm": 1.4457008838653564, "learning_rate": 2.8431408604145948e-08, "loss": 0.0003, "num_input_tokens_seen": 227837088, "step": 105650 }, { "epoch": 19.389796292897778, "grad_norm": 0.0020572806242853403, "learning_rate": 2.8346199370698492e-08, "loss": 0.0, "num_input_tokens_seen": 227848384, "step": 105655 }, { "epoch": 19.39071389245733, "grad_norm": 0.0014903669944033027, "learning_rate": 2.8261117652464974e-08, "loss": 0.0, "num_input_tokens_seen": 227859392, "step": 105660 }, { "epoch": 19.391631492016884, "grad_norm": 0.0015250144060701132, "learning_rate": 2.8176163451628656e-08, "loss": 0.0, "num_input_tokens_seen": 227870848, "step": 105665 }, { "epoch": 19.392549091576434, "grad_norm": 0.0015311038587242365, "learning_rate": 2.8091336770367794e-08, "loss": 0.0, "num_input_tokens_seen": 227882432, "step": 105670 }, { "epoch": 19.393466691135988, "grad_norm": 0.30036360025405884, "learning_rate": 2.8006637610858976e-08, "loss": 0.0, "num_input_tokens_seen": 227892480, "step": 105675 }, { "epoch": 19.39438429069554, "grad_norm": 0.0007767612114548683, "learning_rate": 2.7922065975273806e-08, "loss": 0.0, "num_input_tokens_seen": 227901408, "step": 105680 }, { "epoch": 19.39530189025509, "grad_norm": 0.0036699995398521423, "learning_rate": 2.7837621865781094e-08, "loss": 0.0, "num_input_tokens_seen": 227912800, "step": 105685 }, { "epoch": 19.396219489814644, "grad_norm": 0.013494445942342281, "learning_rate": 2.775330528454745e-08, "loss": 0.0, "num_input_tokens_seen": 227924160, "step": 105690 }, { "epoch": 19.397137089374198, "grad_norm": 0.02513834461569786, "learning_rate": 2.7669116233735584e-08, "loss": 0.0, "num_input_tokens_seen": 227934880, "step": 105695 }, { "epoch": 19.398054688933748, "grad_norm": 0.015163924545049667, "learning_rate": 2.7585054715504324e-08, "loss": 0.0, "num_input_tokens_seen": 227944416, "step": 105700 }, { "epoch": 19.3989722884933, "grad_norm": 0.001381765934638679, "learning_rate": 2.7501120732009722e-08, "loss": 0.0025, "num_input_tokens_seen": 227956096, "step": 105705 }, { "epoch": 19.399889888052854, "grad_norm": 0.046401314437389374, "learning_rate": 2.7417314285405062e-08, "loss": 0.0, "num_input_tokens_seen": 227967200, "step": 105710 }, { "epoch": 19.400807487612404, "grad_norm": 0.01938866823911667, "learning_rate": 2.733363537783862e-08, "loss": 0.0, "num_input_tokens_seen": 227978880, "step": 105715 }, { "epoch": 19.401725087171958, "grad_norm": 0.0014689200324937701, "learning_rate": 2.7250084011458122e-08, "loss": 0.0, "num_input_tokens_seen": 227990208, "step": 105720 }, { "epoch": 19.40264268673151, "grad_norm": 0.004311041906476021, "learning_rate": 2.71666601884063e-08, "loss": 0.0, "num_input_tokens_seen": 228001984, "step": 105725 }, { "epoch": 19.40356028629106, "grad_norm": 0.0017805729294195771, "learning_rate": 2.7083363910822004e-08, "loss": 0.0001, "num_input_tokens_seen": 228012192, "step": 105730 }, { "epoch": 19.404477885850614, "grad_norm": 0.004731812980026007, "learning_rate": 2.7000195180841848e-08, "loss": 0.0, "num_input_tokens_seen": 228023296, "step": 105735 }, { "epoch": 19.405395485410168, "grad_norm": 0.003370654070749879, "learning_rate": 2.6917154000599688e-08, "loss": 0.0, "num_input_tokens_seen": 228033792, "step": 105740 }, { "epoch": 19.40631308496972, "grad_norm": 0.00983745139092207, "learning_rate": 2.683424037222493e-08, "loss": 0.0, "num_input_tokens_seen": 228044128, "step": 105745 }, { "epoch": 19.40723068452927, "grad_norm": 0.06867703795433044, "learning_rate": 2.675145429784365e-08, "loss": 0.0, "num_input_tokens_seen": 228055200, "step": 105750 }, { "epoch": 19.408148284088824, "grad_norm": 0.000760735129006207, "learning_rate": 2.666879577958026e-08, "loss": 0.0032, "num_input_tokens_seen": 228066112, "step": 105755 }, { "epoch": 19.409065883648378, "grad_norm": 0.0009326526196673512, "learning_rate": 2.6586264819554175e-08, "loss": 0.0, "num_input_tokens_seen": 228076704, "step": 105760 }, { "epoch": 19.409983483207927, "grad_norm": 0.0013404269702732563, "learning_rate": 2.6503861419882036e-08, "loss": 0.0, "num_input_tokens_seen": 228086592, "step": 105765 }, { "epoch": 19.41090108276748, "grad_norm": 0.012108273804187775, "learning_rate": 2.6421585582678266e-08, "loss": 0.0, "num_input_tokens_seen": 228095904, "step": 105770 }, { "epoch": 19.411818682327034, "grad_norm": 0.0038804011419415474, "learning_rate": 2.633943731005173e-08, "loss": 0.0, "num_input_tokens_seen": 228107040, "step": 105775 }, { "epoch": 19.412736281886584, "grad_norm": 0.026934564113616943, "learning_rate": 2.6257416604110742e-08, "loss": 0.0, "num_input_tokens_seen": 228117568, "step": 105780 }, { "epoch": 19.413653881446137, "grad_norm": 0.005547977983951569, "learning_rate": 2.6175523466958063e-08, "loss": 0.0, "num_input_tokens_seen": 228128032, "step": 105785 }, { "epoch": 19.41457148100569, "grad_norm": 0.004203628748655319, "learning_rate": 2.6093757900694795e-08, "loss": 0.0, "num_input_tokens_seen": 228138848, "step": 105790 }, { "epoch": 19.41548908056524, "grad_norm": 0.3816620111465454, "learning_rate": 2.601211990741759e-08, "loss": 0.0001, "num_input_tokens_seen": 228149344, "step": 105795 }, { "epoch": 19.416406680124794, "grad_norm": 0.010533321648836136, "learning_rate": 2.5930609489220327e-08, "loss": 0.0, "num_input_tokens_seen": 228160800, "step": 105800 }, { "epoch": 19.417324279684347, "grad_norm": 0.0034520686604082584, "learning_rate": 2.5849226648194115e-08, "loss": 0.0, "num_input_tokens_seen": 228170432, "step": 105805 }, { "epoch": 19.418241879243897, "grad_norm": 0.005023266654461622, "learning_rate": 2.5767971386425616e-08, "loss": 0.0, "num_input_tokens_seen": 228181248, "step": 105810 }, { "epoch": 19.41915947880345, "grad_norm": 0.010322638787329197, "learning_rate": 2.568684370599983e-08, "loss": 0.0, "num_input_tokens_seen": 228192192, "step": 105815 }, { "epoch": 19.420077078363004, "grad_norm": 0.0006607187679037452, "learning_rate": 2.560584360899676e-08, "loss": 0.0, "num_input_tokens_seen": 228202880, "step": 105820 }, { "epoch": 19.420994677922554, "grad_norm": 0.0046673729084432125, "learning_rate": 2.552497109749419e-08, "loss": 0.0, "num_input_tokens_seen": 228213664, "step": 105825 }, { "epoch": 19.421912277482107, "grad_norm": 0.011785516515374184, "learning_rate": 2.5444226173566565e-08, "loss": 0.0, "num_input_tokens_seen": 228226688, "step": 105830 }, { "epoch": 19.42282987704166, "grad_norm": 0.0004135237541049719, "learning_rate": 2.5363608839283905e-08, "loss": 0.0, "num_input_tokens_seen": 228235392, "step": 105835 }, { "epoch": 19.42374747660121, "grad_norm": 0.0004671728238463402, "learning_rate": 2.5283119096715658e-08, "loss": 0.0, "num_input_tokens_seen": 228246080, "step": 105840 }, { "epoch": 19.424665076160764, "grad_norm": 0.00179744279012084, "learning_rate": 2.5202756947925178e-08, "loss": 0.0032, "num_input_tokens_seen": 228257216, "step": 105845 }, { "epoch": 19.425582675720317, "grad_norm": 0.0010064768139272928, "learning_rate": 2.5122522394973037e-08, "loss": 0.0, "num_input_tokens_seen": 228267776, "step": 105850 }, { "epoch": 19.426500275279867, "grad_norm": 0.0018251906149089336, "learning_rate": 2.5042415439918145e-08, "loss": 0.0, "num_input_tokens_seen": 228278240, "step": 105855 }, { "epoch": 19.42741787483942, "grad_norm": 0.004167846869677305, "learning_rate": 2.4962436084814966e-08, "loss": 0.0001, "num_input_tokens_seen": 228289824, "step": 105860 }, { "epoch": 19.428335474398974, "grad_norm": 0.0073422593995928764, "learning_rate": 2.48825843317152e-08, "loss": 0.0, "num_input_tokens_seen": 228300448, "step": 105865 }, { "epoch": 19.429253073958524, "grad_norm": 0.005380990915000439, "learning_rate": 2.4802860182665533e-08, "loss": 0.0, "num_input_tokens_seen": 228311744, "step": 105870 }, { "epoch": 19.430170673518077, "grad_norm": 0.017048142850399017, "learning_rate": 2.4723263639712114e-08, "loss": 0.0001, "num_input_tokens_seen": 228322080, "step": 105875 }, { "epoch": 19.43108827307763, "grad_norm": 0.2814156115055084, "learning_rate": 2.4643794704896083e-08, "loss": 0.0, "num_input_tokens_seen": 228332384, "step": 105880 }, { "epoch": 19.43200587263718, "grad_norm": 0.019215626642107964, "learning_rate": 2.456445338025526e-08, "loss": 0.0, "num_input_tokens_seen": 228343840, "step": 105885 }, { "epoch": 19.432923472196734, "grad_norm": 0.022374682128429413, "learning_rate": 2.4485239667825234e-08, "loss": 0.0, "num_input_tokens_seen": 228352448, "step": 105890 }, { "epoch": 19.433841071756287, "grad_norm": 0.002861165441572666, "learning_rate": 2.4406153569637157e-08, "loss": 0.0, "num_input_tokens_seen": 228362176, "step": 105895 }, { "epoch": 19.434758671315837, "grad_norm": 0.0008181409793905914, "learning_rate": 2.432719508771997e-08, "loss": 0.0, "num_input_tokens_seen": 228374560, "step": 105900 }, { "epoch": 19.43567627087539, "grad_norm": 0.00823670532554388, "learning_rate": 2.424836422409871e-08, "loss": 0.0, "num_input_tokens_seen": 228384288, "step": 105905 }, { "epoch": 19.436593870434944, "grad_norm": 0.030811408534646034, "learning_rate": 2.4169660980795095e-08, "loss": 0.0, "num_input_tokens_seen": 228395424, "step": 105910 }, { "epoch": 19.437511469994494, "grad_norm": 0.002106821397319436, "learning_rate": 2.409108535982807e-08, "loss": 0.0, "num_input_tokens_seen": 228405920, "step": 105915 }, { "epoch": 19.438429069554047, "grad_norm": 0.004788670688867569, "learning_rate": 2.4012637363212133e-08, "loss": 0.0, "num_input_tokens_seen": 228416608, "step": 105920 }, { "epoch": 19.4393466691136, "grad_norm": 0.0009483643225394189, "learning_rate": 2.3934316992960673e-08, "loss": 0.0, "num_input_tokens_seen": 228428128, "step": 105925 }, { "epoch": 19.44026426867315, "grad_norm": 0.0011071964399889112, "learning_rate": 2.3856124251081525e-08, "loss": 0.0, "num_input_tokens_seen": 228438752, "step": 105930 }, { "epoch": 19.441181868232704, "grad_norm": 0.0007944427197799087, "learning_rate": 2.377805913958031e-08, "loss": 0.0, "num_input_tokens_seen": 228449024, "step": 105935 }, { "epoch": 19.442099467792257, "grad_norm": 0.06670736521482468, "learning_rate": 2.370012166045932e-08, "loss": 0.0, "num_input_tokens_seen": 228459488, "step": 105940 }, { "epoch": 19.443017067351807, "grad_norm": 0.0011303597129881382, "learning_rate": 2.3622311815718058e-08, "loss": 0.0032, "num_input_tokens_seen": 228470912, "step": 105945 }, { "epoch": 19.44393466691136, "grad_norm": 0.0027530761435627937, "learning_rate": 2.35446296073516e-08, "loss": 0.0, "num_input_tokens_seen": 228482400, "step": 105950 }, { "epoch": 19.444852266470914, "grad_norm": 1.0030096769332886, "learning_rate": 2.3467075037352795e-08, "loss": 0.0003, "num_input_tokens_seen": 228492768, "step": 105955 }, { "epoch": 19.445769866030464, "grad_norm": 0.0006199387134984136, "learning_rate": 2.3389648107710605e-08, "loss": 0.0, "num_input_tokens_seen": 228503968, "step": 105960 }, { "epoch": 19.446687465590017, "grad_norm": 0.0009557906305417418, "learning_rate": 2.3312348820410668e-08, "loss": 0.0, "num_input_tokens_seen": 228513088, "step": 105965 }, { "epoch": 19.44760506514957, "grad_norm": 0.11115144938230515, "learning_rate": 2.3235177177435287e-08, "loss": 0.0, "num_input_tokens_seen": 228524448, "step": 105970 }, { "epoch": 19.44852266470912, "grad_norm": 0.0015228462871164083, "learning_rate": 2.3158133180765097e-08, "loss": 0.0, "num_input_tokens_seen": 228535328, "step": 105975 }, { "epoch": 19.449440264268674, "grad_norm": 0.009720610454678535, "learning_rate": 2.3081216832375187e-08, "loss": 0.0078, "num_input_tokens_seen": 228546016, "step": 105980 }, { "epoch": 19.450357863828227, "grad_norm": 0.0016074118902906775, "learning_rate": 2.3004428134238423e-08, "loss": 0.0, "num_input_tokens_seen": 228556064, "step": 105985 }, { "epoch": 19.451275463387777, "grad_norm": 0.11150489747524261, "learning_rate": 2.2927767088324338e-08, "loss": 0.0001, "num_input_tokens_seen": 228567808, "step": 105990 }, { "epoch": 19.45219306294733, "grad_norm": 0.026326023042201996, "learning_rate": 2.2851233696599696e-08, "loss": 0.0, "num_input_tokens_seen": 228579264, "step": 105995 }, { "epoch": 19.453110662506884, "grad_norm": 0.0006341977277770638, "learning_rate": 2.277482796102681e-08, "loss": 0.0, "num_input_tokens_seen": 228590784, "step": 106000 }, { "epoch": 19.454028262066434, "grad_norm": 0.006948588415980339, "learning_rate": 2.269854988356579e-08, "loss": 0.0, "num_input_tokens_seen": 228602080, "step": 106005 }, { "epoch": 19.454945861625987, "grad_norm": 0.0004010992415715009, "learning_rate": 2.2622399466172286e-08, "loss": 0.0, "num_input_tokens_seen": 228612864, "step": 106010 }, { "epoch": 19.45586346118554, "grad_norm": 0.021175144240260124, "learning_rate": 2.2546376710800287e-08, "loss": 0.0, "num_input_tokens_seen": 228623168, "step": 106015 }, { "epoch": 19.45678106074509, "grad_norm": 0.008841600269079208, "learning_rate": 2.2470481619399354e-08, "loss": 0.0, "num_input_tokens_seen": 228634272, "step": 106020 }, { "epoch": 19.457698660304644, "grad_norm": 0.0014875414781272411, "learning_rate": 2.2394714193916257e-08, "loss": 0.0, "num_input_tokens_seen": 228646144, "step": 106025 }, { "epoch": 19.458616259864197, "grad_norm": 0.0007329423096962273, "learning_rate": 2.2319074436294442e-08, "loss": 0.0, "num_input_tokens_seen": 228656704, "step": 106030 }, { "epoch": 19.459533859423747, "grad_norm": 0.001673577120527625, "learning_rate": 2.2243562348472915e-08, "loss": 0.0, "num_input_tokens_seen": 228668864, "step": 106035 }, { "epoch": 19.4604514589833, "grad_norm": 0.001573277055285871, "learning_rate": 2.2168177932389566e-08, "loss": 0.0, "num_input_tokens_seen": 228680416, "step": 106040 }, { "epoch": 19.461369058542854, "grad_norm": 0.014295529574155807, "learning_rate": 2.2092921189977856e-08, "loss": 0.0, "num_input_tokens_seen": 228691072, "step": 106045 }, { "epoch": 19.462286658102403, "grad_norm": 0.0020704329945147038, "learning_rate": 2.2017792123167348e-08, "loss": 0.0, "num_input_tokens_seen": 228701568, "step": 106050 }, { "epoch": 19.463204257661957, "grad_norm": 39.973541259765625, "learning_rate": 2.1942790733884833e-08, "loss": 0.0032, "num_input_tokens_seen": 228713056, "step": 106055 }, { "epoch": 19.46412185722151, "grad_norm": 0.0006013971287757158, "learning_rate": 2.1867917024054886e-08, "loss": 0.0, "num_input_tokens_seen": 228724224, "step": 106060 }, { "epoch": 19.46503945678106, "grad_norm": 0.012943155132234097, "learning_rate": 2.1793170995597636e-08, "loss": 0.0, "num_input_tokens_seen": 228735584, "step": 106065 }, { "epoch": 19.465957056340613, "grad_norm": 0.002004583366215229, "learning_rate": 2.171855265042988e-08, "loss": 0.0, "num_input_tokens_seen": 228746528, "step": 106070 }, { "epoch": 19.466874655900167, "grad_norm": 0.003478210885077715, "learning_rate": 2.1644061990465647e-08, "loss": 0.0, "num_input_tokens_seen": 228756416, "step": 106075 }, { "epoch": 19.467792255459717, "grad_norm": 0.0006672314484603703, "learning_rate": 2.1569699017615076e-08, "loss": 0.0001, "num_input_tokens_seen": 228765888, "step": 106080 }, { "epoch": 19.46870985501927, "grad_norm": 0.014234599657356739, "learning_rate": 2.1495463733786082e-08, "loss": 0.0, "num_input_tokens_seen": 228778272, "step": 106085 }, { "epoch": 19.469627454578823, "grad_norm": 0.011210164986550808, "learning_rate": 2.14213561408827e-08, "loss": 0.0, "num_input_tokens_seen": 228790880, "step": 106090 }, { "epoch": 19.470545054138373, "grad_norm": 0.0026712073013186455, "learning_rate": 2.1347376240805627e-08, "loss": 0.0, "num_input_tokens_seen": 228801696, "step": 106095 }, { "epoch": 19.471462653697927, "grad_norm": 0.002956860000267625, "learning_rate": 2.1273524035451687e-08, "loss": 0.0, "num_input_tokens_seen": 228811392, "step": 106100 }, { "epoch": 19.47238025325748, "grad_norm": 0.0004482935764826834, "learning_rate": 2.119979952671547e-08, "loss": 0.0, "num_input_tokens_seen": 228821664, "step": 106105 }, { "epoch": 19.47329785281703, "grad_norm": 0.004371039569377899, "learning_rate": 2.112620271648824e-08, "loss": 0.0003, "num_input_tokens_seen": 228832384, "step": 106110 }, { "epoch": 19.474215452376583, "grad_norm": 0.008184602484107018, "learning_rate": 2.1052733606657382e-08, "loss": 0.0, "num_input_tokens_seen": 228843584, "step": 106115 }, { "epoch": 19.475133051936137, "grad_norm": 0.0010331249795854092, "learning_rate": 2.097939219910694e-08, "loss": 0.0036, "num_input_tokens_seen": 228854112, "step": 106120 }, { "epoch": 19.476050651495687, "grad_norm": 0.13609278202056885, "learning_rate": 2.0906178495718187e-08, "loss": 0.0, "num_input_tokens_seen": 228865568, "step": 106125 }, { "epoch": 19.47696825105524, "grad_norm": 0.02984224632382393, "learning_rate": 2.0833092498369624e-08, "loss": 0.0, "num_input_tokens_seen": 228876032, "step": 106130 }, { "epoch": 19.477885850614793, "grad_norm": 0.008558832108974457, "learning_rate": 2.0760134208934747e-08, "loss": 0.0, "num_input_tokens_seen": 228888000, "step": 106135 }, { "epoch": 19.478803450174343, "grad_norm": 0.0016297845868393779, "learning_rate": 2.0687303629285393e-08, "loss": 0.0, "num_input_tokens_seen": 228898112, "step": 106140 }, { "epoch": 19.479721049733897, "grad_norm": 0.011034834198653698, "learning_rate": 2.061460076129007e-08, "loss": 0.0, "num_input_tokens_seen": 228909024, "step": 106145 }, { "epoch": 19.48063864929345, "grad_norm": 0.00088578398572281, "learning_rate": 2.0542025606812287e-08, "loss": 0.0, "num_input_tokens_seen": 228920480, "step": 106150 }, { "epoch": 19.481556248853, "grad_norm": 0.012572246603667736, "learning_rate": 2.046957816771389e-08, "loss": 0.002, "num_input_tokens_seen": 228931424, "step": 106155 }, { "epoch": 19.482473848412553, "grad_norm": 0.0008192576933652163, "learning_rate": 2.039725844585394e-08, "loss": 0.0, "num_input_tokens_seen": 228941312, "step": 106160 }, { "epoch": 19.483391447972107, "grad_norm": 0.033172860741615295, "learning_rate": 2.0325066443085962e-08, "loss": 0.0, "num_input_tokens_seen": 228951872, "step": 106165 }, { "epoch": 19.484309047531656, "grad_norm": 0.000480378745123744, "learning_rate": 2.0253002161262358e-08, "loss": 0.0002, "num_input_tokens_seen": 228963168, "step": 106170 }, { "epoch": 19.48522664709121, "grad_norm": 0.0061398898251354694, "learning_rate": 2.0181065602231652e-08, "loss": 0.0, "num_input_tokens_seen": 228973888, "step": 106175 }, { "epoch": 19.486144246650763, "grad_norm": 0.023075800389051437, "learning_rate": 2.010925676783848e-08, "loss": 0.0, "num_input_tokens_seen": 228983648, "step": 106180 }, { "epoch": 19.487061846210313, "grad_norm": 0.0007730167708359659, "learning_rate": 2.0037575659924703e-08, "loss": 0.0, "num_input_tokens_seen": 228994336, "step": 106185 }, { "epoch": 19.487979445769867, "grad_norm": 0.00024147146905306727, "learning_rate": 1.9966022280328845e-08, "loss": 0.0, "num_input_tokens_seen": 229005728, "step": 106190 }, { "epoch": 19.48889704532942, "grad_norm": 0.0007655287045054138, "learning_rate": 1.9894596630886108e-08, "loss": 0.0, "num_input_tokens_seen": 229017344, "step": 106195 }, { "epoch": 19.48981464488897, "grad_norm": 0.1841554194688797, "learning_rate": 1.9823298713428363e-08, "loss": 0.0001, "num_input_tokens_seen": 229027680, "step": 106200 }, { "epoch": 19.490732244448523, "grad_norm": 0.005716548766940832, "learning_rate": 1.9752128529784696e-08, "loss": 0.0, "num_input_tokens_seen": 229037824, "step": 106205 }, { "epoch": 19.491649844008077, "grad_norm": 0.027576465159654617, "learning_rate": 1.968108608178032e-08, "loss": 0.0001, "num_input_tokens_seen": 229048032, "step": 106210 }, { "epoch": 19.492567443567626, "grad_norm": 0.00576893100515008, "learning_rate": 1.9610171371237107e-08, "loss": 0.0, "num_input_tokens_seen": 229059008, "step": 106215 }, { "epoch": 19.49348504312718, "grad_norm": 0.0028033116832375526, "learning_rate": 1.953938439997416e-08, "loss": 0.0063, "num_input_tokens_seen": 229069184, "step": 106220 }, { "epoch": 19.494402642686733, "grad_norm": 0.0012415602104738355, "learning_rate": 1.946872516980669e-08, "loss": 0.0001, "num_input_tokens_seen": 229080192, "step": 106225 }, { "epoch": 19.495320242246283, "grad_norm": 0.000973030983004719, "learning_rate": 1.9398193682547693e-08, "loss": 0.0, "num_input_tokens_seen": 229091360, "step": 106230 }, { "epoch": 19.496237841805836, "grad_norm": 0.0006572525016963482, "learning_rate": 1.9327789940005727e-08, "loss": 0.0, "num_input_tokens_seen": 229101664, "step": 106235 }, { "epoch": 19.49715544136539, "grad_norm": 0.1349041908979416, "learning_rate": 1.925751394398656e-08, "loss": 0.0, "num_input_tokens_seen": 229111168, "step": 106240 }, { "epoch": 19.49807304092494, "grad_norm": 0.0014953742502257228, "learning_rate": 1.9187365696292647e-08, "loss": 0.0, "num_input_tokens_seen": 229122464, "step": 106245 }, { "epoch": 19.498990640484493, "grad_norm": 0.0003027236380148679, "learning_rate": 1.911734519872366e-08, "loss": 0.0, "num_input_tokens_seen": 229131360, "step": 106250 }, { "epoch": 19.499908240044046, "grad_norm": 0.015833262354135513, "learning_rate": 1.9047452453074268e-08, "loss": 0.0, "num_input_tokens_seen": 229141952, "step": 106255 }, { "epoch": 19.500825839603596, "grad_norm": 0.022746076807379723, "learning_rate": 1.8977687461138596e-08, "loss": 0.0, "num_input_tokens_seen": 229153728, "step": 106260 }, { "epoch": 19.50174343916315, "grad_norm": 0.0009149814140982926, "learning_rate": 1.890805022470521e-08, "loss": 0.0, "num_input_tokens_seen": 229164672, "step": 106265 }, { "epoch": 19.502661038722703, "grad_norm": 2.152815580368042, "learning_rate": 1.8838540745560465e-08, "loss": 0.0012, "num_input_tokens_seen": 229175136, "step": 106270 }, { "epoch": 19.503578638282253, "grad_norm": 0.0012753824703395367, "learning_rate": 1.8769159025486816e-08, "loss": 0.0, "num_input_tokens_seen": 229184928, "step": 106275 }, { "epoch": 19.504496237841806, "grad_norm": 0.0007891982677392662, "learning_rate": 1.8699905066263958e-08, "loss": 0.0, "num_input_tokens_seen": 229195392, "step": 106280 }, { "epoch": 19.50541383740136, "grad_norm": 0.000708626233972609, "learning_rate": 1.8630778869668244e-08, "loss": 0.0, "num_input_tokens_seen": 229205920, "step": 106285 }, { "epoch": 19.50633143696091, "grad_norm": 0.001972584519535303, "learning_rate": 1.8561780437473254e-08, "loss": 0.0, "num_input_tokens_seen": 229215712, "step": 106290 }, { "epoch": 19.507249036520463, "grad_norm": 0.005923538003116846, "learning_rate": 1.8492909771447575e-08, "loss": 0.0, "num_input_tokens_seen": 229226048, "step": 106295 }, { "epoch": 19.508166636080016, "grad_norm": 0.002162766642868519, "learning_rate": 1.8424166873357573e-08, "loss": 0.0, "num_input_tokens_seen": 229237088, "step": 106300 }, { "epoch": 19.509084235639566, "grad_norm": 0.0012278377544134855, "learning_rate": 1.8355551744967947e-08, "loss": 0.0001, "num_input_tokens_seen": 229248000, "step": 106305 }, { "epoch": 19.51000183519912, "grad_norm": 0.0016913962317630649, "learning_rate": 1.8287064388036736e-08, "loss": 0.0001, "num_input_tokens_seen": 229259744, "step": 106310 }, { "epoch": 19.510919434758673, "grad_norm": 0.001028447411954403, "learning_rate": 1.8218704804321973e-08, "loss": 0.0, "num_input_tokens_seen": 229270336, "step": 106315 }, { "epoch": 19.511837034318223, "grad_norm": 0.00849875807762146, "learning_rate": 1.815047299557615e-08, "loss": 0.0001, "num_input_tokens_seen": 229279936, "step": 106320 }, { "epoch": 19.512754633877776, "grad_norm": 0.0011667346116155386, "learning_rate": 1.8082368963549533e-08, "loss": 0.0, "num_input_tokens_seen": 229291232, "step": 106325 }, { "epoch": 19.51367223343733, "grad_norm": 0.02854488044977188, "learning_rate": 1.801439270998906e-08, "loss": 0.0, "num_input_tokens_seen": 229301600, "step": 106330 }, { "epoch": 19.51458983299688, "grad_norm": 0.0006725713610649109, "learning_rate": 1.7946544236637774e-08, "loss": 0.0, "num_input_tokens_seen": 229312032, "step": 106335 }, { "epoch": 19.515507432556433, "grad_norm": 0.0553295724093914, "learning_rate": 1.7878823545235956e-08, "loss": 0.0, "num_input_tokens_seen": 229322432, "step": 106340 }, { "epoch": 19.516425032115986, "grad_norm": 0.005135319661349058, "learning_rate": 1.7811230637521105e-08, "loss": 0.0, "num_input_tokens_seen": 229333184, "step": 106345 }, { "epoch": 19.517342631675536, "grad_norm": 0.02047174610197544, "learning_rate": 1.7743765515226274e-08, "loss": 0.0, "num_input_tokens_seen": 229344704, "step": 106350 }, { "epoch": 19.51826023123509, "grad_norm": 0.0016364587936550379, "learning_rate": 1.7676428180082305e-08, "loss": 0.0, "num_input_tokens_seen": 229355296, "step": 106355 }, { "epoch": 19.519177830794643, "grad_norm": 0.003637549467384815, "learning_rate": 1.7609218633815596e-08, "loss": 0.0, "num_input_tokens_seen": 229366688, "step": 106360 }, { "epoch": 19.520095430354193, "grad_norm": 0.0005655413842760026, "learning_rate": 1.7542136878150873e-08, "loss": 0.0004, "num_input_tokens_seen": 229376832, "step": 106365 }, { "epoch": 19.521013029913746, "grad_norm": 0.0037216104101389647, "learning_rate": 1.7475182914808432e-08, "loss": 0.0, "num_input_tokens_seen": 229388512, "step": 106370 }, { "epoch": 19.5219306294733, "grad_norm": 0.035919588059186935, "learning_rate": 1.7408356745504674e-08, "loss": 0.0, "num_input_tokens_seen": 229398688, "step": 106375 }, { "epoch": 19.52284822903285, "grad_norm": 0.0016011249972507358, "learning_rate": 1.7341658371954896e-08, "loss": 0.0001, "num_input_tokens_seen": 229409664, "step": 106380 }, { "epoch": 19.523765828592403, "grad_norm": 0.0005398770445026457, "learning_rate": 1.7275087795868837e-08, "loss": 0.0, "num_input_tokens_seen": 229420736, "step": 106385 }, { "epoch": 19.524683428151956, "grad_norm": 0.0015829536132514477, "learning_rate": 1.7208645018954028e-08, "loss": 0.0, "num_input_tokens_seen": 229431296, "step": 106390 }, { "epoch": 19.525601027711506, "grad_norm": 0.0020207108464092016, "learning_rate": 1.714233004291521e-08, "loss": 0.0001, "num_input_tokens_seen": 229442720, "step": 106395 }, { "epoch": 19.52651862727106, "grad_norm": 0.0010639415122568607, "learning_rate": 1.7076142869452694e-08, "loss": 0.0, "num_input_tokens_seen": 229454144, "step": 106400 }, { "epoch": 19.527436226830613, "grad_norm": 0.009951930493116379, "learning_rate": 1.7010083500264006e-08, "loss": 0.0, "num_input_tokens_seen": 229464480, "step": 106405 }, { "epoch": 19.528353826390163, "grad_norm": 0.00047231174539774656, "learning_rate": 1.6944151937044463e-08, "loss": 0.0, "num_input_tokens_seen": 229475648, "step": 106410 }, { "epoch": 19.529271425949716, "grad_norm": 0.005884765647351742, "learning_rate": 1.6878348181483816e-08, "loss": 0.0, "num_input_tokens_seen": 229486688, "step": 106415 }, { "epoch": 19.53018902550927, "grad_norm": 0.001980044413357973, "learning_rate": 1.681267223527072e-08, "loss": 0.0, "num_input_tokens_seen": 229498720, "step": 106420 }, { "epoch": 19.53110662506882, "grad_norm": 0.0006458560237661004, "learning_rate": 1.674712410008883e-08, "loss": 0.0, "num_input_tokens_seen": 229509376, "step": 106425 }, { "epoch": 19.532024224628373, "grad_norm": 0.001556802773848176, "learning_rate": 1.6681703777620683e-08, "loss": 0.0002, "num_input_tokens_seen": 229520480, "step": 106430 }, { "epoch": 19.532941824187926, "grad_norm": 0.2008640319108963, "learning_rate": 1.6616411269542722e-08, "loss": 0.0013, "num_input_tokens_seen": 229531360, "step": 106435 }, { "epoch": 19.533859423747476, "grad_norm": 0.0013857233570888638, "learning_rate": 1.6551246577530267e-08, "loss": 0.0, "num_input_tokens_seen": 229540832, "step": 106440 }, { "epoch": 19.53477702330703, "grad_norm": 0.0014455060008913279, "learning_rate": 1.6486209703255318e-08, "loss": 0.0, "num_input_tokens_seen": 229552192, "step": 106445 }, { "epoch": 19.535694622866583, "grad_norm": 0.05453001707792282, "learning_rate": 1.6421300648384876e-08, "loss": 0.0174, "num_input_tokens_seen": 229563136, "step": 106450 }, { "epoch": 19.536612222426132, "grad_norm": 0.004807825665920973, "learning_rate": 1.635651941458427e-08, "loss": 0.0, "num_input_tokens_seen": 229573888, "step": 106455 }, { "epoch": 19.537529821985686, "grad_norm": 0.0014320163754746318, "learning_rate": 1.6291866003514957e-08, "loss": 0.0, "num_input_tokens_seen": 229584512, "step": 106460 }, { "epoch": 19.53844742154524, "grad_norm": 0.0006188127445057034, "learning_rate": 1.6227340416835047e-08, "loss": 0.0, "num_input_tokens_seen": 229596032, "step": 106465 }, { "epoch": 19.53936502110479, "grad_norm": 0.002521927235648036, "learning_rate": 1.6162942656200443e-08, "loss": 0.0, "num_input_tokens_seen": 229607200, "step": 106470 }, { "epoch": 19.540282620664343, "grad_norm": 0.005330349784344435, "learning_rate": 1.6098672723261487e-08, "loss": 0.0173, "num_input_tokens_seen": 229617472, "step": 106475 }, { "epoch": 19.541200220223896, "grad_norm": 0.0025998142082244158, "learning_rate": 1.603453061966742e-08, "loss": 0.0, "num_input_tokens_seen": 229628832, "step": 106480 }, { "epoch": 19.542117819783446, "grad_norm": 0.08629672974348068, "learning_rate": 1.5970516347063037e-08, "loss": 0.0, "num_input_tokens_seen": 229640640, "step": 106485 }, { "epoch": 19.543035419343, "grad_norm": 0.031074298545718193, "learning_rate": 1.5906629907090355e-08, "loss": 0.0, "num_input_tokens_seen": 229651744, "step": 106490 }, { "epoch": 19.543953018902553, "grad_norm": 0.0027185394428670406, "learning_rate": 1.5842871301388064e-08, "loss": 0.0, "num_input_tokens_seen": 229663296, "step": 106495 }, { "epoch": 19.544870618462102, "grad_norm": 0.001741544809192419, "learning_rate": 1.577924053159152e-08, "loss": 0.0001, "num_input_tokens_seen": 229673952, "step": 106500 }, { "epoch": 19.545788218021656, "grad_norm": 0.00034520114422775805, "learning_rate": 1.5715737599332203e-08, "loss": 0.0, "num_input_tokens_seen": 229683808, "step": 106505 }, { "epoch": 19.54670581758121, "grad_norm": 0.0007774784462526441, "learning_rate": 1.5652362506239915e-08, "loss": 0.0, "num_input_tokens_seen": 229693536, "step": 106510 }, { "epoch": 19.54762341714076, "grad_norm": 0.0004178976232651621, "learning_rate": 1.5589115253938914e-08, "loss": 0.0, "num_input_tokens_seen": 229703264, "step": 106515 }, { "epoch": 19.548541016700312, "grad_norm": 0.0010630055330693722, "learning_rate": 1.5525995844052345e-08, "loss": 0.0, "num_input_tokens_seen": 229714240, "step": 106520 }, { "epoch": 19.549458616259866, "grad_norm": 0.0019171058665961027, "learning_rate": 1.546300427819891e-08, "loss": 0.0, "num_input_tokens_seen": 229724096, "step": 106525 }, { "epoch": 19.550376215819416, "grad_norm": 0.010384850203990936, "learning_rate": 1.5400140557993437e-08, "loss": 0.0, "num_input_tokens_seen": 229734176, "step": 106530 }, { "epoch": 19.55129381537897, "grad_norm": 0.0030914857052266598, "learning_rate": 1.533740468504963e-08, "loss": 0.0, "num_input_tokens_seen": 229745440, "step": 106535 }, { "epoch": 19.552211414938522, "grad_norm": 0.009381027892231941, "learning_rate": 1.527479666097509e-08, "loss": 0.0, "num_input_tokens_seen": 229755680, "step": 106540 }, { "epoch": 19.553129014498072, "grad_norm": 0.01582557149231434, "learning_rate": 1.5212316487376867e-08, "loss": 0.0, "num_input_tokens_seen": 229764736, "step": 106545 }, { "epoch": 19.554046614057626, "grad_norm": 0.0006379876285791397, "learning_rate": 1.514996416585701e-08, "loss": 0.0, "num_input_tokens_seen": 229775776, "step": 106550 }, { "epoch": 19.55496421361718, "grad_norm": 0.0017140714917331934, "learning_rate": 1.5087739698014804e-08, "loss": 0.0, "num_input_tokens_seen": 229787520, "step": 106555 }, { "epoch": 19.55588181317673, "grad_norm": 0.002661621430888772, "learning_rate": 1.5025643085446183e-08, "loss": 0.0, "num_input_tokens_seen": 229798656, "step": 106560 }, { "epoch": 19.556799412736282, "grad_norm": 0.0015731907915323973, "learning_rate": 1.4963674329743216e-08, "loss": 0.0, "num_input_tokens_seen": 229809056, "step": 106565 }, { "epoch": 19.557717012295836, "grad_norm": 0.0006018257117830217, "learning_rate": 1.490183343249685e-08, "loss": 0.0, "num_input_tokens_seen": 229821088, "step": 106570 }, { "epoch": 19.558634611855386, "grad_norm": 0.05190294235944748, "learning_rate": 1.4840120395291368e-08, "loss": 0.0, "num_input_tokens_seen": 229830944, "step": 106575 }, { "epoch": 19.55955221141494, "grad_norm": 0.0014598306734114885, "learning_rate": 1.4778535219711066e-08, "loss": 0.0, "num_input_tokens_seen": 229842208, "step": 106580 }, { "epoch": 19.560469810974492, "grad_norm": 0.0023363279178738594, "learning_rate": 1.4717077907334675e-08, "loss": 0.0001, "num_input_tokens_seen": 229852480, "step": 106585 }, { "epoch": 19.561387410534042, "grad_norm": 0.0023039968218654394, "learning_rate": 1.4655748459738717e-08, "loss": 0.0, "num_input_tokens_seen": 229864192, "step": 106590 }, { "epoch": 19.562305010093596, "grad_norm": 0.0017236960120499134, "learning_rate": 1.4594546878496373e-08, "loss": 0.0, "num_input_tokens_seen": 229875264, "step": 106595 }, { "epoch": 19.56322260965315, "grad_norm": 0.0042726402170956135, "learning_rate": 1.4533473165177503e-08, "loss": 0.0051, "num_input_tokens_seen": 229885088, "step": 106600 }, { "epoch": 19.5641402092127, "grad_norm": 0.003250496694818139, "learning_rate": 1.4472527321348074e-08, "loss": 0.0, "num_input_tokens_seen": 229895488, "step": 106605 }, { "epoch": 19.565057808772252, "grad_norm": 0.0026245529297739267, "learning_rate": 1.4411709348570724e-08, "loss": 0.0, "num_input_tokens_seen": 229907072, "step": 106610 }, { "epoch": 19.565975408331806, "grad_norm": 0.04176199436187744, "learning_rate": 1.4351019248406983e-08, "loss": 0.0001, "num_input_tokens_seen": 229918016, "step": 106615 }, { "epoch": 19.566893007891355, "grad_norm": 0.06472988426685333, "learning_rate": 1.4290457022412274e-08, "loss": 0.0001, "num_input_tokens_seen": 229929184, "step": 106620 }, { "epoch": 19.56781060745091, "grad_norm": 0.02231026440858841, "learning_rate": 1.4230022672139799e-08, "loss": 0.0, "num_input_tokens_seen": 229940416, "step": 106625 }, { "epoch": 19.568728207010462, "grad_norm": 0.004258262924849987, "learning_rate": 1.4169716199140537e-08, "loss": 0.0, "num_input_tokens_seen": 229951264, "step": 106630 }, { "epoch": 19.569645806570012, "grad_norm": 0.0008260679896920919, "learning_rate": 1.4109537604960477e-08, "loss": 0.0, "num_input_tokens_seen": 229962272, "step": 106635 }, { "epoch": 19.570563406129565, "grad_norm": 0.0011540896957740188, "learning_rate": 1.4049486891143938e-08, "loss": 0.0, "num_input_tokens_seen": 229972992, "step": 106640 }, { "epoch": 19.57148100568912, "grad_norm": 0.029809495434165, "learning_rate": 1.3989564059229687e-08, "loss": 0.0, "num_input_tokens_seen": 229984192, "step": 106645 }, { "epoch": 19.57239860524867, "grad_norm": 0.0008283504284918308, "learning_rate": 1.3929769110755943e-08, "loss": 0.0001, "num_input_tokens_seen": 229994944, "step": 106650 }, { "epoch": 19.573316204808222, "grad_norm": 0.0014814725145697594, "learning_rate": 1.3870102047255917e-08, "loss": 0.0, "num_input_tokens_seen": 230006560, "step": 106655 }, { "epoch": 19.574233804367775, "grad_norm": 0.0017456996720284224, "learning_rate": 1.3810562870259504e-08, "loss": 0.0, "num_input_tokens_seen": 230017792, "step": 106660 }, { "epoch": 19.575151403927325, "grad_norm": 0.0017601636936888099, "learning_rate": 1.3751151581294919e-08, "loss": 0.0, "num_input_tokens_seen": 230027456, "step": 106665 }, { "epoch": 19.57606900348688, "grad_norm": 0.001670504454523325, "learning_rate": 1.3691868181884838e-08, "loss": 0.0645, "num_input_tokens_seen": 230037024, "step": 106670 }, { "epoch": 19.576986603046432, "grad_norm": 0.0018101423047482967, "learning_rate": 1.3632712673550263e-08, "loss": 0.0, "num_input_tokens_seen": 230048544, "step": 106675 }, { "epoch": 19.577904202605982, "grad_norm": 0.002557069528847933, "learning_rate": 1.3573685057808872e-08, "loss": 0.0, "num_input_tokens_seen": 230059648, "step": 106680 }, { "epoch": 19.578821802165535, "grad_norm": 0.13046088814735413, "learning_rate": 1.3514785336173897e-08, "loss": 0.0001, "num_input_tokens_seen": 230070240, "step": 106685 }, { "epoch": 19.57973940172509, "grad_norm": 0.0009631781140342355, "learning_rate": 1.3456013510156351e-08, "loss": 0.0, "num_input_tokens_seen": 230079584, "step": 106690 }, { "epoch": 19.58065700128464, "grad_norm": 0.056146200746297836, "learning_rate": 1.3397369581263364e-08, "loss": 0.0, "num_input_tokens_seen": 230088800, "step": 106695 }, { "epoch": 19.581574600844192, "grad_norm": 0.0010221231495961547, "learning_rate": 1.3338853550999841e-08, "loss": 0.0, "num_input_tokens_seen": 230098592, "step": 106700 }, { "epoch": 19.582492200403745, "grad_norm": 0.0006101267645135522, "learning_rate": 1.3280465420865695e-08, "loss": 0.0, "num_input_tokens_seen": 230109824, "step": 106705 }, { "epoch": 19.583409799963295, "grad_norm": 0.01612999476492405, "learning_rate": 1.3222205192359172e-08, "loss": 0.0001, "num_input_tokens_seen": 230120992, "step": 106710 }, { "epoch": 19.58432739952285, "grad_norm": 0.0046190838329494, "learning_rate": 1.3164072866974076e-08, "loss": 0.0, "num_input_tokens_seen": 230132320, "step": 106715 }, { "epoch": 19.585244999082402, "grad_norm": 0.0011161426082253456, "learning_rate": 1.3106068446201991e-08, "loss": 0.0, "num_input_tokens_seen": 230142976, "step": 106720 }, { "epoch": 19.586162598641952, "grad_norm": 0.0012677725171670318, "learning_rate": 1.3048191931529508e-08, "loss": 0.0, "num_input_tokens_seen": 230154016, "step": 106725 }, { "epoch": 19.587080198201505, "grad_norm": 0.0010229302570223808, "learning_rate": 1.299044332444266e-08, "loss": 0.0, "num_input_tokens_seen": 230164384, "step": 106730 }, { "epoch": 19.58799779776106, "grad_norm": 0.2771228849887848, "learning_rate": 1.293282262642137e-08, "loss": 0.0001, "num_input_tokens_seen": 230176256, "step": 106735 }, { "epoch": 19.58891539732061, "grad_norm": 0.004449524451047182, "learning_rate": 1.2875329838943907e-08, "loss": 0.0001, "num_input_tokens_seen": 230186496, "step": 106740 }, { "epoch": 19.589832996880162, "grad_norm": 0.0006884497124701738, "learning_rate": 1.2817964963484641e-08, "loss": 0.0, "num_input_tokens_seen": 230197312, "step": 106745 }, { "epoch": 19.590750596439715, "grad_norm": 0.0021567277144640684, "learning_rate": 1.2760728001515733e-08, "loss": 0.0, "num_input_tokens_seen": 230208032, "step": 106750 }, { "epoch": 19.591668195999265, "grad_norm": 0.0009471431258134544, "learning_rate": 1.2703618954504337e-08, "loss": 0.0, "num_input_tokens_seen": 230218048, "step": 106755 }, { "epoch": 19.59258579555882, "grad_norm": 0.001220972859300673, "learning_rate": 1.2646637823915397e-08, "loss": 0.0, "num_input_tokens_seen": 230228384, "step": 106760 }, { "epoch": 19.593503395118372, "grad_norm": 0.005994230974465609, "learning_rate": 1.258978461121052e-08, "loss": 0.0, "num_input_tokens_seen": 230237728, "step": 106765 }, { "epoch": 19.59442099467792, "grad_norm": 0.0028120747301727533, "learning_rate": 1.2533059317847985e-08, "loss": 0.0, "num_input_tokens_seen": 230248864, "step": 106770 }, { "epoch": 19.595338594237475, "grad_norm": 0.0058951000683009624, "learning_rate": 1.2476461945282736e-08, "loss": 0.0, "num_input_tokens_seen": 230260032, "step": 106775 }, { "epoch": 19.59625619379703, "grad_norm": 0.029599769040942192, "learning_rate": 1.2419992494965837e-08, "loss": 0.0, "num_input_tokens_seen": 230271616, "step": 106780 }, { "epoch": 19.59717379335658, "grad_norm": 0.0010694172233343124, "learning_rate": 1.2363650968346685e-08, "loss": 0.001, "num_input_tokens_seen": 230282464, "step": 106785 }, { "epoch": 19.598091392916132, "grad_norm": 0.011927350424230099, "learning_rate": 1.2307437366869679e-08, "loss": 0.0002, "num_input_tokens_seen": 230293600, "step": 106790 }, { "epoch": 19.599008992475685, "grad_norm": 0.0006673328462056816, "learning_rate": 1.2251351691975888e-08, "loss": 0.0, "num_input_tokens_seen": 230305312, "step": 106795 }, { "epoch": 19.599926592035235, "grad_norm": 0.006783590652048588, "learning_rate": 1.2195393945105271e-08, "loss": 0.0, "num_input_tokens_seen": 230316928, "step": 106800 }, { "epoch": 19.60084419159479, "grad_norm": 0.0016314341919496655, "learning_rate": 1.2139564127692238e-08, "loss": 0.0, "num_input_tokens_seen": 230326816, "step": 106805 }, { "epoch": 19.601761791154342, "grad_norm": 0.0010041601490229368, "learning_rate": 1.2083862241168976e-08, "loss": 0.0, "num_input_tokens_seen": 230337184, "step": 106810 }, { "epoch": 19.60267939071389, "grad_norm": 0.005777599755674601, "learning_rate": 1.202828828696434e-08, "loss": 0.0, "num_input_tokens_seen": 230346944, "step": 106815 }, { "epoch": 19.603596990273445, "grad_norm": 0.0006922294851392508, "learning_rate": 1.1972842266503305e-08, "loss": 0.0, "num_input_tokens_seen": 230357824, "step": 106820 }, { "epoch": 19.604514589833, "grad_norm": 0.031035076826810837, "learning_rate": 1.1917524181208063e-08, "loss": 0.0, "num_input_tokens_seen": 230369184, "step": 106825 }, { "epoch": 19.60543218939255, "grad_norm": 0.0011707558296620846, "learning_rate": 1.1862334032496925e-08, "loss": 0.0001, "num_input_tokens_seen": 230378816, "step": 106830 }, { "epoch": 19.6063497889521, "grad_norm": 0.0019209972815588117, "learning_rate": 1.1807271821786536e-08, "loss": 0.0, "num_input_tokens_seen": 230389248, "step": 106835 }, { "epoch": 19.607267388511655, "grad_norm": 0.019907664507627487, "learning_rate": 1.1752337550489101e-08, "loss": 0.0001, "num_input_tokens_seen": 230401504, "step": 106840 }, { "epoch": 19.608184988071205, "grad_norm": 0.01533673144876957, "learning_rate": 1.1697531220012382e-08, "loss": 0.0, "num_input_tokens_seen": 230411936, "step": 106845 }, { "epoch": 19.60910258763076, "grad_norm": 0.0017608096823096275, "learning_rate": 1.1642852831763029e-08, "loss": 0.0, "num_input_tokens_seen": 230424064, "step": 106850 }, { "epoch": 19.61002018719031, "grad_norm": 0.0006584480870515108, "learning_rate": 1.1588302387143257e-08, "loss": 0.0, "num_input_tokens_seen": 230435456, "step": 106855 }, { "epoch": 19.61093778674986, "grad_norm": 0.001196868484839797, "learning_rate": 1.1533879887551947e-08, "loss": 0.0, "num_input_tokens_seen": 230446784, "step": 106860 }, { "epoch": 19.611855386309415, "grad_norm": 0.007205720525234938, "learning_rate": 1.1479585334385757e-08, "loss": 0.0, "num_input_tokens_seen": 230457312, "step": 106865 }, { "epoch": 19.61277298586897, "grad_norm": 0.011104851961135864, "learning_rate": 1.1425418729036353e-08, "loss": 0.0, "num_input_tokens_seen": 230467296, "step": 106870 }, { "epoch": 19.613690585428518, "grad_norm": 0.007203179877251387, "learning_rate": 1.1371380072893735e-08, "loss": 0.0, "num_input_tokens_seen": 230478112, "step": 106875 }, { "epoch": 19.61460818498807, "grad_norm": 0.005697120446711779, "learning_rate": 1.1317469367342903e-08, "loss": 0.0, "num_input_tokens_seen": 230488512, "step": 106880 }, { "epoch": 19.615525784547625, "grad_norm": 0.00793519988656044, "learning_rate": 1.1263686613767755e-08, "loss": 0.0, "num_input_tokens_seen": 230498816, "step": 106885 }, { "epoch": 19.616443384107175, "grad_norm": 0.2589750289916992, "learning_rate": 1.1210031813547185e-08, "loss": 0.0001, "num_input_tokens_seen": 230509984, "step": 106890 }, { "epoch": 19.617360983666728, "grad_norm": 0.0005385156837292016, "learning_rate": 1.1156504968056758e-08, "loss": 0.0005, "num_input_tokens_seen": 230520608, "step": 106895 }, { "epoch": 19.61827858322628, "grad_norm": 0.02788432314991951, "learning_rate": 1.1103106078670378e-08, "loss": 0.0, "num_input_tokens_seen": 230532000, "step": 106900 }, { "epoch": 19.61919618278583, "grad_norm": 0.003060795832425356, "learning_rate": 1.1049835146757504e-08, "loss": 0.0, "num_input_tokens_seen": 230543008, "step": 106905 }, { "epoch": 19.620113782345385, "grad_norm": 0.001171233132481575, "learning_rate": 1.0996692173684265e-08, "loss": 0.0, "num_input_tokens_seen": 230552992, "step": 106910 }, { "epoch": 19.62103138190494, "grad_norm": 0.0013667279854416847, "learning_rate": 1.0943677160812904e-08, "loss": 0.0001, "num_input_tokens_seen": 230564416, "step": 106915 }, { "epoch": 19.621948981464488, "grad_norm": 0.005398431792855263, "learning_rate": 1.0890790109504556e-08, "loss": 0.0, "num_input_tokens_seen": 230576480, "step": 106920 }, { "epoch": 19.62286658102404, "grad_norm": 0.010517374612390995, "learning_rate": 1.0838031021114803e-08, "loss": 0.0, "num_input_tokens_seen": 230587296, "step": 106925 }, { "epoch": 19.623784180583595, "grad_norm": 0.002699746983125806, "learning_rate": 1.0785399896997007e-08, "loss": 0.0, "num_input_tokens_seen": 230597408, "step": 106930 }, { "epoch": 19.624701780143145, "grad_norm": 0.011954691261053085, "learning_rate": 1.0732896738500643e-08, "loss": 0.0, "num_input_tokens_seen": 230608928, "step": 106935 }, { "epoch": 19.625619379702698, "grad_norm": 0.04383600503206253, "learning_rate": 1.0680521546973522e-08, "loss": 0.0, "num_input_tokens_seen": 230619264, "step": 106940 }, { "epoch": 19.62653697926225, "grad_norm": 0.0016901345225051045, "learning_rate": 1.0628274323757903e-08, "loss": 0.0328, "num_input_tokens_seen": 230629472, "step": 106945 }, { "epoch": 19.6274545788218, "grad_norm": 0.0008141898433677852, "learning_rate": 1.0576155070194939e-08, "loss": 0.0, "num_input_tokens_seen": 230639616, "step": 106950 }, { "epoch": 19.628372178381355, "grad_norm": 0.0006641182699240744, "learning_rate": 1.0524163787619669e-08, "loss": 0.0, "num_input_tokens_seen": 230649312, "step": 106955 }, { "epoch": 19.629289777940908, "grad_norm": 0.00040016372804529965, "learning_rate": 1.0472300477367137e-08, "loss": 0.0, "num_input_tokens_seen": 230659776, "step": 106960 }, { "epoch": 19.630207377500458, "grad_norm": 0.0018859989941120148, "learning_rate": 1.0420565140766837e-08, "loss": 0.0, "num_input_tokens_seen": 230672224, "step": 106965 }, { "epoch": 19.63112497706001, "grad_norm": 4.802713394165039, "learning_rate": 1.0368957779146039e-08, "loss": 0.0006, "num_input_tokens_seen": 230682944, "step": 106970 }, { "epoch": 19.632042576619565, "grad_norm": 0.0011089881882071495, "learning_rate": 1.0317478393828684e-08, "loss": 0.1719, "num_input_tokens_seen": 230694848, "step": 106975 }, { "epoch": 19.632960176179115, "grad_norm": 0.0031413529068231583, "learning_rate": 1.0266126986133718e-08, "loss": 0.0, "num_input_tokens_seen": 230707392, "step": 106980 }, { "epoch": 19.633877775738668, "grad_norm": 0.0016438178718090057, "learning_rate": 1.0214903557380085e-08, "loss": 0.0, "num_input_tokens_seen": 230718848, "step": 106985 }, { "epoch": 19.63479537529822, "grad_norm": 0.0021531814709305763, "learning_rate": 1.016380810888007e-08, "loss": 0.0, "num_input_tokens_seen": 230729568, "step": 106990 }, { "epoch": 19.63571297485777, "grad_norm": 0.0015763746341690421, "learning_rate": 1.0112840641945399e-08, "loss": 0.0, "num_input_tokens_seen": 230740992, "step": 106995 }, { "epoch": 19.636630574417325, "grad_norm": 0.06163099408149719, "learning_rate": 1.0062001157882251e-08, "loss": 0.0, "num_input_tokens_seen": 230751680, "step": 107000 }, { "epoch": 19.637548173976878, "grad_norm": 0.0045893434435129166, "learning_rate": 1.0011289657995693e-08, "loss": 0.0, "num_input_tokens_seen": 230762720, "step": 107005 }, { "epoch": 19.638465773536428, "grad_norm": 0.003708957228809595, "learning_rate": 9.960706143585241e-09, "loss": 0.0, "num_input_tokens_seen": 230774496, "step": 107010 }, { "epoch": 19.63938337309598, "grad_norm": 0.011268462985754013, "learning_rate": 9.910250615948747e-09, "loss": 0.0, "num_input_tokens_seen": 230784960, "step": 107015 }, { "epoch": 19.640300972655535, "grad_norm": 0.0034040259197354317, "learning_rate": 9.859923076380728e-09, "loss": 0.0, "num_input_tokens_seen": 230795616, "step": 107020 }, { "epoch": 19.641218572215084, "grad_norm": 0.004636241123080254, "learning_rate": 9.809723526171266e-09, "loss": 0.0, "num_input_tokens_seen": 230807136, "step": 107025 }, { "epoch": 19.642136171774638, "grad_norm": 0.003457270562648773, "learning_rate": 9.759651966608774e-09, "loss": 0.0, "num_input_tokens_seen": 230818240, "step": 107030 }, { "epoch": 19.64305377133419, "grad_norm": 0.0013497989857569337, "learning_rate": 9.709708398976669e-09, "loss": 0.0, "num_input_tokens_seen": 230828000, "step": 107035 }, { "epoch": 19.64397137089374, "grad_norm": 0.0026941935066133738, "learning_rate": 9.659892824556705e-09, "loss": 0.0, "num_input_tokens_seen": 230838784, "step": 107040 }, { "epoch": 19.644888970453295, "grad_norm": 0.042727626860141754, "learning_rate": 9.610205244625637e-09, "loss": 0.0, "num_input_tokens_seen": 230849120, "step": 107045 }, { "epoch": 19.645806570012848, "grad_norm": 0.1724850982427597, "learning_rate": 9.560645660458556e-09, "loss": 0.0001, "num_input_tokens_seen": 230860832, "step": 107050 }, { "epoch": 19.646724169572398, "grad_norm": 0.03475251793861389, "learning_rate": 9.511214073326668e-09, "loss": 0.0, "num_input_tokens_seen": 230872672, "step": 107055 }, { "epoch": 19.64764176913195, "grad_norm": 0.010923607274889946, "learning_rate": 9.461910484497294e-09, "loss": 0.0, "num_input_tokens_seen": 230883776, "step": 107060 }, { "epoch": 19.648559368691505, "grad_norm": 0.00028728501638397574, "learning_rate": 9.412734895235532e-09, "loss": 0.0001, "num_input_tokens_seen": 230895840, "step": 107065 }, { "epoch": 19.649476968251054, "grad_norm": 0.007166234776377678, "learning_rate": 9.363687306802594e-09, "loss": 0.0, "num_input_tokens_seen": 230905920, "step": 107070 }, { "epoch": 19.650394567810608, "grad_norm": 0.003884982317686081, "learning_rate": 9.314767720455809e-09, "loss": 0.0, "num_input_tokens_seen": 230916128, "step": 107075 }, { "epoch": 19.65131216737016, "grad_norm": 0.044619735330343246, "learning_rate": 9.265976137450284e-09, "loss": 0.0, "num_input_tokens_seen": 230927104, "step": 107080 }, { "epoch": 19.65222976692971, "grad_norm": 0.0023144108708947897, "learning_rate": 9.21731255903835e-09, "loss": 0.0, "num_input_tokens_seen": 230937824, "step": 107085 }, { "epoch": 19.653147366489264, "grad_norm": 0.0009046413470059633, "learning_rate": 9.168776986466787e-09, "loss": 0.0, "num_input_tokens_seen": 230949152, "step": 107090 }, { "epoch": 19.654064966048818, "grad_norm": 0.0007463919464498758, "learning_rate": 9.120369420980712e-09, "loss": 0.0001, "num_input_tokens_seen": 230960832, "step": 107095 }, { "epoch": 19.654982565608368, "grad_norm": 0.003341076662763953, "learning_rate": 9.072089863822464e-09, "loss": 0.0, "num_input_tokens_seen": 230972448, "step": 107100 }, { "epoch": 19.65590016516792, "grad_norm": 0.002634361619129777, "learning_rate": 9.023938316229941e-09, "loss": 0.0, "num_input_tokens_seen": 230983360, "step": 107105 }, { "epoch": 19.656817764727474, "grad_norm": 0.018979821354150772, "learning_rate": 8.97591477943771e-09, "loss": 0.0, "num_input_tokens_seen": 230995200, "step": 107110 }, { "epoch": 19.657735364287024, "grad_norm": 0.0008742081699892879, "learning_rate": 8.928019254678123e-09, "loss": 0.0, "num_input_tokens_seen": 231006880, "step": 107115 }, { "epoch": 19.658652963846578, "grad_norm": 59.85143280029297, "learning_rate": 8.880251743179081e-09, "loss": 0.1345, "num_input_tokens_seen": 231018240, "step": 107120 }, { "epoch": 19.65957056340613, "grad_norm": 0.01128696370869875, "learning_rate": 8.832612246166273e-09, "loss": 0.0, "num_input_tokens_seen": 231029120, "step": 107125 }, { "epoch": 19.66048816296568, "grad_norm": 0.006102674640715122, "learning_rate": 8.785100764861498e-09, "loss": 0.0, "num_input_tokens_seen": 231039840, "step": 107130 }, { "epoch": 19.661405762525234, "grad_norm": 0.00993603840470314, "learning_rate": 8.737717300483228e-09, "loss": 0.0, "num_input_tokens_seen": 231051360, "step": 107135 }, { "epoch": 19.662323362084788, "grad_norm": 0.0014865136472508311, "learning_rate": 8.690461854246601e-09, "loss": 0.0, "num_input_tokens_seen": 231062400, "step": 107140 }, { "epoch": 19.663240961644338, "grad_norm": 0.0009113129926845431, "learning_rate": 8.643334427363425e-09, "loss": 0.0, "num_input_tokens_seen": 231073696, "step": 107145 }, { "epoch": 19.66415856120389, "grad_norm": 0.0009531340911053121, "learning_rate": 8.59633502104329e-09, "loss": 0.0002, "num_input_tokens_seen": 231084832, "step": 107150 }, { "epoch": 19.665076160763444, "grad_norm": 0.0028492030687630177, "learning_rate": 8.549463636491339e-09, "loss": 0.0, "num_input_tokens_seen": 231095680, "step": 107155 }, { "epoch": 19.665993760322994, "grad_norm": 0.0025419166777282953, "learning_rate": 8.502720274909392e-09, "loss": 0.0, "num_input_tokens_seen": 231105792, "step": 107160 }, { "epoch": 19.666911359882548, "grad_norm": 0.0031745843589305878, "learning_rate": 8.45610493749649e-09, "loss": 0.0, "num_input_tokens_seen": 231116480, "step": 107165 }, { "epoch": 19.6678289594421, "grad_norm": 0.003292858600616455, "learning_rate": 8.409617625448342e-09, "loss": 0.0, "num_input_tokens_seen": 231127040, "step": 107170 }, { "epoch": 19.66874655900165, "grad_norm": 0.004010990262031555, "learning_rate": 8.36325833995677e-09, "loss": 0.0131, "num_input_tokens_seen": 231138176, "step": 107175 }, { "epoch": 19.669664158561204, "grad_norm": 0.0008621482411399484, "learning_rate": 8.317027082211937e-09, "loss": 0.0, "num_input_tokens_seen": 231149664, "step": 107180 }, { "epoch": 19.670581758120758, "grad_norm": 0.3377680778503418, "learning_rate": 8.270923853399004e-09, "loss": 0.0001, "num_input_tokens_seen": 231160448, "step": 107185 }, { "epoch": 19.671499357680307, "grad_norm": 0.04308478906750679, "learning_rate": 8.224948654699806e-09, "loss": 0.0, "num_input_tokens_seen": 231170432, "step": 107190 }, { "epoch": 19.67241695723986, "grad_norm": 0.0014244195772334933, "learning_rate": 8.179101487294505e-09, "loss": 0.0, "num_input_tokens_seen": 231181312, "step": 107195 }, { "epoch": 19.673334556799414, "grad_norm": 0.0023193461820483208, "learning_rate": 8.133382352358276e-09, "loss": 0.0, "num_input_tokens_seen": 231192256, "step": 107200 }, { "epoch": 19.674252156358964, "grad_norm": 0.0018214313313364983, "learning_rate": 8.087791251064624e-09, "loss": 0.0, "num_input_tokens_seen": 231205888, "step": 107205 }, { "epoch": 19.675169755918517, "grad_norm": 0.000763941090553999, "learning_rate": 8.04232818458206e-09, "loss": 0.0, "num_input_tokens_seen": 231215840, "step": 107210 }, { "epoch": 19.67608735547807, "grad_norm": 0.0028498172760009766, "learning_rate": 7.996993154076871e-09, "loss": 0.0, "num_input_tokens_seen": 231227008, "step": 107215 }, { "epoch": 19.67700495503762, "grad_norm": 0.002054100390523672, "learning_rate": 7.95178616071257e-09, "loss": 0.0, "num_input_tokens_seen": 231237472, "step": 107220 }, { "epoch": 19.677922554597174, "grad_norm": 0.0007199161918833852, "learning_rate": 7.906707205647124e-09, "loss": 0.0, "num_input_tokens_seen": 231246368, "step": 107225 }, { "epoch": 19.678840154156727, "grad_norm": 0.0015778042143210769, "learning_rate": 7.861756290037936e-09, "loss": 0.0, "num_input_tokens_seen": 231257792, "step": 107230 }, { "epoch": 19.679757753716277, "grad_norm": 0.0008385946275666356, "learning_rate": 7.81693341503742e-09, "loss": 0.0, "num_input_tokens_seen": 231269152, "step": 107235 }, { "epoch": 19.68067535327583, "grad_norm": 0.002112025860697031, "learning_rate": 7.772238581795766e-09, "loss": 0.0401, "num_input_tokens_seen": 231279296, "step": 107240 }, { "epoch": 19.681592952835384, "grad_norm": 0.005482916254550219, "learning_rate": 7.727671791458724e-09, "loss": 0.0, "num_input_tokens_seen": 231290688, "step": 107245 }, { "epoch": 19.682510552394934, "grad_norm": 0.0015299497172236443, "learning_rate": 7.683233045169825e-09, "loss": 0.0, "num_input_tokens_seen": 231302144, "step": 107250 }, { "epoch": 19.683428151954487, "grad_norm": 0.0002618471335154027, "learning_rate": 7.638922344068156e-09, "loss": 0.0, "num_input_tokens_seen": 231312288, "step": 107255 }, { "epoch": 19.68434575151404, "grad_norm": 0.062614306807518, "learning_rate": 7.594739689291142e-09, "loss": 0.0001, "num_input_tokens_seen": 231322944, "step": 107260 }, { "epoch": 19.68526335107359, "grad_norm": 0.021849513053894043, "learning_rate": 7.550685081970655e-09, "loss": 0.0, "num_input_tokens_seen": 231333376, "step": 107265 }, { "epoch": 19.686180950633144, "grad_norm": 0.0023821424692869186, "learning_rate": 7.506758523238567e-09, "loss": 0.0, "num_input_tokens_seen": 231344288, "step": 107270 }, { "epoch": 19.687098550192697, "grad_norm": 0.0043862429447472095, "learning_rate": 7.46296001422009e-09, "loss": 0.0, "num_input_tokens_seen": 231355584, "step": 107275 }, { "epoch": 19.688016149752247, "grad_norm": 0.0009103592019528151, "learning_rate": 7.4192895560387665e-09, "loss": 0.0, "num_input_tokens_seen": 231365696, "step": 107280 }, { "epoch": 19.6889337493118, "grad_norm": 0.0014726222725585103, "learning_rate": 7.3757471498148156e-09, "loss": 0.0, "num_input_tokens_seen": 231376160, "step": 107285 }, { "epoch": 19.689851348871354, "grad_norm": 0.0019736834801733494, "learning_rate": 7.3323327966651206e-09, "loss": 0.0, "num_input_tokens_seen": 231387136, "step": 107290 }, { "epoch": 19.690768948430904, "grad_norm": 0.006603557150810957, "learning_rate": 7.289046497703234e-09, "loss": 0.0, "num_input_tokens_seen": 231399232, "step": 107295 }, { "epoch": 19.691686547990457, "grad_norm": 0.011983356438577175, "learning_rate": 7.245888254039379e-09, "loss": 0.0, "num_input_tokens_seen": 231408448, "step": 107300 }, { "epoch": 19.69260414755001, "grad_norm": 0.0009562275954522192, "learning_rate": 7.202858066780449e-09, "loss": 0.0, "num_input_tokens_seen": 231418880, "step": 107305 }, { "epoch": 19.69352174710956, "grad_norm": 0.005193320102989674, "learning_rate": 7.159955937030005e-09, "loss": 0.0, "num_input_tokens_seen": 231429120, "step": 107310 }, { "epoch": 19.694439346669114, "grad_norm": 0.0014714582357555628, "learning_rate": 7.117181865888279e-09, "loss": 0.0, "num_input_tokens_seen": 231439392, "step": 107315 }, { "epoch": 19.695356946228667, "grad_norm": 0.0015243437374010682, "learning_rate": 7.0745358544527246e-09, "loss": 0.0, "num_input_tokens_seen": 231449568, "step": 107320 }, { "epoch": 19.696274545788217, "grad_norm": 0.0016959127970039845, "learning_rate": 7.032017903817467e-09, "loss": 0.0, "num_input_tokens_seen": 231459968, "step": 107325 }, { "epoch": 19.69719214534777, "grad_norm": 0.005794662982225418, "learning_rate": 6.989628015072192e-09, "loss": 0.0, "num_input_tokens_seen": 231471424, "step": 107330 }, { "epoch": 19.698109744907324, "grad_norm": 0.0006511868559755385, "learning_rate": 6.9473661893043605e-09, "loss": 0.0, "num_input_tokens_seen": 231482112, "step": 107335 }, { "epoch": 19.699027344466874, "grad_norm": 0.0017369382549077272, "learning_rate": 6.905232427598108e-09, "loss": 0.0001, "num_input_tokens_seen": 231493600, "step": 107340 }, { "epoch": 19.699944944026427, "grad_norm": 0.0006927827489562333, "learning_rate": 6.863226731034234e-09, "loss": 0.0, "num_input_tokens_seen": 231503392, "step": 107345 }, { "epoch": 19.70086254358598, "grad_norm": 0.010676892474293709, "learning_rate": 6.821349100689656e-09, "loss": 0.0, "num_input_tokens_seen": 231514848, "step": 107350 }, { "epoch": 19.70178014314553, "grad_norm": 0.01849202811717987, "learning_rate": 6.779599537638515e-09, "loss": 0.0532, "num_input_tokens_seen": 231525248, "step": 107355 }, { "epoch": 19.702697742705084, "grad_norm": 0.0020431596785783768, "learning_rate": 6.737978042952176e-09, "loss": 0.1626, "num_input_tokens_seen": 231536320, "step": 107360 }, { "epoch": 19.703615342264637, "grad_norm": 0.007799750193953514, "learning_rate": 6.696484617698118e-09, "loss": 0.0, "num_input_tokens_seen": 231547072, "step": 107365 }, { "epoch": 19.704532941824187, "grad_norm": 0.002216933760792017, "learning_rate": 6.655119262939935e-09, "loss": 0.0, "num_input_tokens_seen": 231558368, "step": 107370 }, { "epoch": 19.70545054138374, "grad_norm": 0.0006766080041415989, "learning_rate": 6.613881979739556e-09, "loss": 0.0, "num_input_tokens_seen": 231568832, "step": 107375 }, { "epoch": 19.706368140943294, "grad_norm": 0.0007504624663852155, "learning_rate": 6.572772769153357e-09, "loss": 0.0, "num_input_tokens_seen": 231579392, "step": 107380 }, { "epoch": 19.707285740502844, "grad_norm": 0.025174353271722794, "learning_rate": 6.531791632236606e-09, "loss": 0.001, "num_input_tokens_seen": 231589152, "step": 107385 }, { "epoch": 19.708203340062397, "grad_norm": 0.028604745864868164, "learning_rate": 6.490938570040128e-09, "loss": 0.0, "num_input_tokens_seen": 231600416, "step": 107390 }, { "epoch": 19.70912093962195, "grad_norm": 0.01258346438407898, "learning_rate": 6.450213583611975e-09, "loss": 0.0001, "num_input_tokens_seen": 231612000, "step": 107395 }, { "epoch": 19.7100385391815, "grad_norm": 0.0026692328974604607, "learning_rate": 6.4096166739968655e-09, "loss": 0.0, "num_input_tokens_seen": 231621728, "step": 107400 }, { "epoch": 19.710956138741054, "grad_norm": 0.012054097838699818, "learning_rate": 6.369147842235079e-09, "loss": 0.0, "num_input_tokens_seen": 231632512, "step": 107405 }, { "epoch": 19.711873738300607, "grad_norm": 0.019467029720544815, "learning_rate": 6.328807089365785e-09, "loss": 0.0, "num_input_tokens_seen": 231643808, "step": 107410 }, { "epoch": 19.712791337860157, "grad_norm": 0.0008651753887534142, "learning_rate": 6.288594416423155e-09, "loss": 0.0002, "num_input_tokens_seen": 231654208, "step": 107415 }, { "epoch": 19.71370893741971, "grad_norm": 0.08652710169553757, "learning_rate": 6.248509824438032e-09, "loss": 0.0001, "num_input_tokens_seen": 231665152, "step": 107420 }, { "epoch": 19.714626536979264, "grad_norm": 0.007865633815526962, "learning_rate": 6.208553314439037e-09, "loss": 0.0001, "num_input_tokens_seen": 231677280, "step": 107425 }, { "epoch": 19.715544136538814, "grad_norm": 0.001333882100880146, "learning_rate": 6.1687248874514605e-09, "loss": 0.0, "num_input_tokens_seen": 231688416, "step": 107430 }, { "epoch": 19.716461736098367, "grad_norm": 0.0016803158214315772, "learning_rate": 6.129024544496154e-09, "loss": 0.0, "num_input_tokens_seen": 231697888, "step": 107435 }, { "epoch": 19.71737933565792, "grad_norm": 0.0005648473161272705, "learning_rate": 6.089452286591191e-09, "loss": 0.0, "num_input_tokens_seen": 231707712, "step": 107440 }, { "epoch": 19.71829693521747, "grad_norm": 0.003928318154066801, "learning_rate": 6.050008114752426e-09, "loss": 0.0, "num_input_tokens_seen": 231717600, "step": 107445 }, { "epoch": 19.719214534777024, "grad_norm": 0.07155407220125198, "learning_rate": 6.010692029990717e-09, "loss": 0.0, "num_input_tokens_seen": 231729440, "step": 107450 }, { "epoch": 19.720132134336577, "grad_norm": 0.0013697915710508823, "learning_rate": 5.971504033314701e-09, "loss": 0.0, "num_input_tokens_seen": 231740000, "step": 107455 }, { "epoch": 19.721049733896127, "grad_norm": 0.00042241805931553245, "learning_rate": 5.932444125729686e-09, "loss": 0.0, "num_input_tokens_seen": 231751488, "step": 107460 }, { "epoch": 19.72196733345568, "grad_norm": 0.0049522388726472855, "learning_rate": 5.8935123082376475e-09, "loss": 0.0, "num_input_tokens_seen": 231762752, "step": 107465 }, { "epoch": 19.722884933015234, "grad_norm": 0.0015789977042004466, "learning_rate": 5.854708581836677e-09, "loss": 0.0, "num_input_tokens_seen": 231774432, "step": 107470 }, { "epoch": 19.723802532574783, "grad_norm": 0.011515854857861996, "learning_rate": 5.816032947522088e-09, "loss": 0.0, "num_input_tokens_seen": 231787392, "step": 107475 }, { "epoch": 19.724720132134337, "grad_norm": 0.003808779874816537, "learning_rate": 5.777485406285866e-09, "loss": 0.0, "num_input_tokens_seen": 231797312, "step": 107480 }, { "epoch": 19.72563773169389, "grad_norm": 0.00043749433825723827, "learning_rate": 5.739065959117218e-09, "loss": 0.0, "num_input_tokens_seen": 231807616, "step": 107485 }, { "epoch": 19.72655533125344, "grad_norm": 0.04919695854187012, "learning_rate": 5.700774607000914e-09, "loss": 0.0, "num_input_tokens_seen": 231817888, "step": 107490 }, { "epoch": 19.727472930812993, "grad_norm": 0.0016800869489088655, "learning_rate": 5.662611350918945e-09, "loss": 0.0, "num_input_tokens_seen": 231828320, "step": 107495 }, { "epoch": 19.728390530372547, "grad_norm": 0.0027711130678653717, "learning_rate": 5.624576191851084e-09, "loss": 0.0, "num_input_tokens_seen": 231838816, "step": 107500 }, { "epoch": 19.729308129932097, "grad_norm": 0.0005246716900728643, "learning_rate": 5.586669130771549e-09, "loss": 0.0, "num_input_tokens_seen": 231849664, "step": 107505 }, { "epoch": 19.73022572949165, "grad_norm": 0.000632012146525085, "learning_rate": 5.548890168654008e-09, "loss": 0.0008, "num_input_tokens_seen": 231860064, "step": 107510 }, { "epoch": 19.731143329051203, "grad_norm": 0.035120442509651184, "learning_rate": 5.511239306466576e-09, "loss": 0.0, "num_input_tokens_seen": 231871456, "step": 107515 }, { "epoch": 19.732060928610753, "grad_norm": 0.017146218568086624, "learning_rate": 5.47371654517459e-09, "loss": 0.0, "num_input_tokens_seen": 231881504, "step": 107520 }, { "epoch": 19.732978528170307, "grad_norm": 0.0009203223162330687, "learning_rate": 5.436321885741725e-09, "loss": 0.0, "num_input_tokens_seen": 231893568, "step": 107525 }, { "epoch": 19.73389612772986, "grad_norm": 0.0006314546335488558, "learning_rate": 5.399055329126102e-09, "loss": 0.0, "num_input_tokens_seen": 231904672, "step": 107530 }, { "epoch": 19.73481372728941, "grad_norm": 0.0015816318336874247, "learning_rate": 5.361916876283069e-09, "loss": 0.0, "num_input_tokens_seen": 231915840, "step": 107535 }, { "epoch": 19.735731326848963, "grad_norm": 0.09080976247787476, "learning_rate": 5.324906528166862e-09, "loss": 0.0001, "num_input_tokens_seen": 231927392, "step": 107540 }, { "epoch": 19.736648926408517, "grad_norm": 0.0012704981490969658, "learning_rate": 5.288024285725057e-09, "loss": 0.0, "num_input_tokens_seen": 231938272, "step": 107545 }, { "epoch": 19.737566525968067, "grad_norm": 0.0014040996320545673, "learning_rate": 5.251270149904675e-09, "loss": 0.0, "num_input_tokens_seen": 231948608, "step": 107550 }, { "epoch": 19.73848412552762, "grad_norm": 0.0024634026922285557, "learning_rate": 5.214644121648293e-09, "loss": 0.0, "num_input_tokens_seen": 231959456, "step": 107555 }, { "epoch": 19.739401725087173, "grad_norm": 0.0004424329672474414, "learning_rate": 5.178146201894607e-09, "loss": 0.0, "num_input_tokens_seen": 231970752, "step": 107560 }, { "epoch": 19.740319324646723, "grad_norm": 0.00034616366610862315, "learning_rate": 5.1417763915800885e-09, "loss": 0.0, "num_input_tokens_seen": 231980672, "step": 107565 }, { "epoch": 19.741236924206277, "grad_norm": 0.022883862257003784, "learning_rate": 5.105534691638437e-09, "loss": 0.0, "num_input_tokens_seen": 231990912, "step": 107570 }, { "epoch": 19.74215452376583, "grad_norm": 0.006529496517032385, "learning_rate": 5.069421102997796e-09, "loss": 0.0478, "num_input_tokens_seen": 232001984, "step": 107575 }, { "epoch": 19.74307212332538, "grad_norm": 0.0013076462782919407, "learning_rate": 5.033435626585204e-09, "loss": 0.001, "num_input_tokens_seen": 232012608, "step": 107580 }, { "epoch": 19.743989722884933, "grad_norm": 0.00327478745020926, "learning_rate": 4.997578263323255e-09, "loss": 0.0, "num_input_tokens_seen": 232022816, "step": 107585 }, { "epoch": 19.744907322444487, "grad_norm": 0.005401630885899067, "learning_rate": 4.961849014132325e-09, "loss": 0.0, "num_input_tokens_seen": 232033312, "step": 107590 }, { "epoch": 19.745824922004036, "grad_norm": 0.001521191792562604, "learning_rate": 4.926247879928348e-09, "loss": 0.0, "num_input_tokens_seen": 232043360, "step": 107595 }, { "epoch": 19.74674252156359, "grad_norm": 0.02553638257086277, "learning_rate": 4.890774861623926e-09, "loss": 0.0, "num_input_tokens_seen": 232054816, "step": 107600 }, { "epoch": 19.747660121123143, "grad_norm": 0.0012358232634142041, "learning_rate": 4.855429960129998e-09, "loss": 0.0, "num_input_tokens_seen": 232065152, "step": 107605 }, { "epoch": 19.748577720682693, "grad_norm": 0.004077283199876547, "learning_rate": 4.8202131763519515e-09, "loss": 0.0, "num_input_tokens_seen": 232076512, "step": 107610 }, { "epoch": 19.749495320242247, "grad_norm": 14.63976764678955, "learning_rate": 4.785124511194061e-09, "loss": 0.004, "num_input_tokens_seen": 232087872, "step": 107615 }, { "epoch": 19.7504129198018, "grad_norm": 0.00997256301343441, "learning_rate": 4.75016396555561e-09, "loss": 0.0883, "num_input_tokens_seen": 232098720, "step": 107620 }, { "epoch": 19.75133051936135, "grad_norm": 0.00724234851077199, "learning_rate": 4.715331540333656e-09, "loss": 0.0, "num_input_tokens_seen": 232108992, "step": 107625 }, { "epoch": 19.752248118920903, "grad_norm": 0.0007258935365825891, "learning_rate": 4.6806272364213754e-09, "loss": 0.0, "num_input_tokens_seen": 232119936, "step": 107630 }, { "epoch": 19.753165718480457, "grad_norm": 0.04874427616596222, "learning_rate": 4.646051054709166e-09, "loss": 0.0, "num_input_tokens_seen": 232130848, "step": 107635 }, { "epoch": 19.754083318040006, "grad_norm": 0.003424824448302388, "learning_rate": 4.6116029960835415e-09, "loss": 0.0, "num_input_tokens_seen": 232142720, "step": 107640 }, { "epoch": 19.75500091759956, "grad_norm": 0.002926921471953392, "learning_rate": 4.577283061428239e-09, "loss": 0.0, "num_input_tokens_seen": 232154048, "step": 107645 }, { "epoch": 19.755918517159113, "grad_norm": 0.0028364728204905987, "learning_rate": 4.543091251623111e-09, "loss": 0.0, "num_input_tokens_seen": 232164896, "step": 107650 }, { "epoch": 19.756836116718663, "grad_norm": 0.004981253296136856, "learning_rate": 4.5090275675452326e-09, "loss": 0.0, "num_input_tokens_seen": 232175968, "step": 107655 }, { "epoch": 19.757753716278216, "grad_norm": 0.003928236663341522, "learning_rate": 4.475092010068905e-09, "loss": 0.0, "num_input_tokens_seen": 232186944, "step": 107660 }, { "epoch": 19.75867131583777, "grad_norm": 0.008611575700342655, "learning_rate": 4.441284580064542e-09, "loss": 0.0, "num_input_tokens_seen": 232196032, "step": 107665 }, { "epoch": 19.75958891539732, "grad_norm": 0.00031510303961113095, "learning_rate": 4.407605278398119e-09, "loss": 0.0, "num_input_tokens_seen": 232205184, "step": 107670 }, { "epoch": 19.760506514956873, "grad_norm": 0.012012643739581108, "learning_rate": 4.3740541059345e-09, "loss": 0.0001, "num_input_tokens_seen": 232216704, "step": 107675 }, { "epoch": 19.761424114516426, "grad_norm": 0.018891915678977966, "learning_rate": 4.340631063533551e-09, "loss": 0.0, "num_input_tokens_seen": 232227712, "step": 107680 }, { "epoch": 19.762341714075976, "grad_norm": 0.05285225063562393, "learning_rate": 4.307336152052921e-09, "loss": 0.0, "num_input_tokens_seen": 232238112, "step": 107685 }, { "epoch": 19.76325931363553, "grad_norm": 0.002520009409636259, "learning_rate": 4.2741693723469255e-09, "loss": 0.0, "num_input_tokens_seen": 232250016, "step": 107690 }, { "epoch": 19.764176913195083, "grad_norm": 0.0004103739629499614, "learning_rate": 4.241130725265441e-09, "loss": 0.0, "num_input_tokens_seen": 232260992, "step": 107695 }, { "epoch": 19.765094512754633, "grad_norm": 0.0006506606587208807, "learning_rate": 4.208220211656122e-09, "loss": 0.0001, "num_input_tokens_seen": 232271808, "step": 107700 }, { "epoch": 19.766012112314186, "grad_norm": 0.007420744746923447, "learning_rate": 4.175437832363294e-09, "loss": 0.0, "num_input_tokens_seen": 232282656, "step": 107705 }, { "epoch": 19.76692971187374, "grad_norm": 0.013469942845404148, "learning_rate": 4.14278358822795e-09, "loss": 0.0, "num_input_tokens_seen": 232292192, "step": 107710 }, { "epoch": 19.76784731143329, "grad_norm": 0.001223885454237461, "learning_rate": 4.110257480086644e-09, "loss": 0.0, "num_input_tokens_seen": 232303808, "step": 107715 }, { "epoch": 19.768764910992843, "grad_norm": 0.0004970295703969896, "learning_rate": 4.077859508774817e-09, "loss": 0.0, "num_input_tokens_seen": 232314048, "step": 107720 }, { "epoch": 19.769682510552396, "grad_norm": 0.018182838335633278, "learning_rate": 4.045589675122919e-09, "loss": 0.0, "num_input_tokens_seen": 232324416, "step": 107725 }, { "epoch": 19.770600110111946, "grad_norm": 0.0021124985069036484, "learning_rate": 4.013447979958618e-09, "loss": 0.0, "num_input_tokens_seen": 232335136, "step": 107730 }, { "epoch": 19.7715177096715, "grad_norm": 0.005213144235312939, "learning_rate": 3.981434424106256e-09, "loss": 0.0, "num_input_tokens_seen": 232346336, "step": 107735 }, { "epoch": 19.772435309231053, "grad_norm": 0.0010303091257810593, "learning_rate": 3.949549008386844e-09, "loss": 0.0, "num_input_tokens_seen": 232357920, "step": 107740 }, { "epoch": 19.773352908790603, "grad_norm": 0.0031885607168078423, "learning_rate": 3.917791733618614e-09, "loss": 0.0007, "num_input_tokens_seen": 232369056, "step": 107745 }, { "epoch": 19.774270508350156, "grad_norm": 0.0010630765464156866, "learning_rate": 3.886162600615362e-09, "loss": 0.0, "num_input_tokens_seen": 232378976, "step": 107750 }, { "epoch": 19.77518810790971, "grad_norm": 0.002249652985483408, "learning_rate": 3.854661610189214e-09, "loss": 0.0, "num_input_tokens_seen": 232389280, "step": 107755 }, { "epoch": 19.77610570746926, "grad_norm": 0.0027258319314569235, "learning_rate": 3.823288763147304e-09, "loss": 0.0, "num_input_tokens_seen": 232399328, "step": 107760 }, { "epoch": 19.777023307028813, "grad_norm": 0.009474559687077999, "learning_rate": 3.792044060295097e-09, "loss": 0.0, "num_input_tokens_seen": 232408960, "step": 107765 }, { "epoch": 19.777940906588366, "grad_norm": 0.0013774537947028875, "learning_rate": 3.760927502433065e-09, "loss": 0.0, "num_input_tokens_seen": 232420416, "step": 107770 }, { "epoch": 19.778858506147916, "grad_norm": 0.008037170395255089, "learning_rate": 3.729939090360013e-09, "loss": 0.0, "num_input_tokens_seen": 232432032, "step": 107775 }, { "epoch": 19.77977610570747, "grad_norm": 0.001921505550853908, "learning_rate": 3.699078824870306e-09, "loss": 0.0, "num_input_tokens_seen": 232442976, "step": 107780 }, { "epoch": 19.780693705267023, "grad_norm": 0.004849535413086414, "learning_rate": 3.6683467067560872e-09, "loss": 0.0001, "num_input_tokens_seen": 232453472, "step": 107785 }, { "epoch": 19.781611304826573, "grad_norm": 0.00041706717456690967, "learning_rate": 3.637742736805061e-09, "loss": 0.0, "num_input_tokens_seen": 232464480, "step": 107790 }, { "epoch": 19.782528904386126, "grad_norm": 0.0007698909030295908, "learning_rate": 3.6072669158021544e-09, "loss": 0.0, "num_input_tokens_seen": 232475808, "step": 107795 }, { "epoch": 19.78344650394568, "grad_norm": 0.0035099012311547995, "learning_rate": 3.576919244528965e-09, "loss": 0.0, "num_input_tokens_seen": 232486624, "step": 107800 }, { "epoch": 19.78436410350523, "grad_norm": 0.0057026720605790615, "learning_rate": 3.546699723764313e-09, "loss": 0.0, "num_input_tokens_seen": 232497504, "step": 107805 }, { "epoch": 19.785281703064783, "grad_norm": 0.005742502398788929, "learning_rate": 3.51660835428258e-09, "loss": 0.0, "num_input_tokens_seen": 232508192, "step": 107810 }, { "epoch": 19.786199302624336, "grad_norm": 0.0003621700743678957, "learning_rate": 3.4866451368564812e-09, "loss": 0.0, "num_input_tokens_seen": 232517152, "step": 107815 }, { "epoch": 19.787116902183886, "grad_norm": 0.0021492396481335163, "learning_rate": 3.4568100722537358e-09, "loss": 0.0, "num_input_tokens_seen": 232528640, "step": 107820 }, { "epoch": 19.78803450174344, "grad_norm": 0.0005235497956164181, "learning_rate": 3.427103161240397e-09, "loss": 0.0016, "num_input_tokens_seen": 232539360, "step": 107825 }, { "epoch": 19.788952101302993, "grad_norm": 0.006788952741771936, "learning_rate": 3.3975244045775234e-09, "loss": 0.0, "num_input_tokens_seen": 232550208, "step": 107830 }, { "epoch": 19.789869700862543, "grad_norm": 0.016324514523148537, "learning_rate": 3.368073803023952e-09, "loss": 0.0, "num_input_tokens_seen": 232561824, "step": 107835 }, { "epoch": 19.790787300422096, "grad_norm": 0.00235184608027339, "learning_rate": 3.3387513573351902e-09, "loss": 0.0, "num_input_tokens_seen": 232573664, "step": 107840 }, { "epoch": 19.79170489998165, "grad_norm": 0.007060949690639973, "learning_rate": 3.3095570682634136e-09, "loss": 0.0, "num_input_tokens_seen": 232585376, "step": 107845 }, { "epoch": 19.7926224995412, "grad_norm": 0.00032555454527027905, "learning_rate": 3.2804909365574676e-09, "loss": 0.0001, "num_input_tokens_seen": 232595808, "step": 107850 }, { "epoch": 19.793540099100753, "grad_norm": 0.0016664211871102452, "learning_rate": 3.2515529629628674e-09, "loss": 0.0, "num_input_tokens_seen": 232607232, "step": 107855 }, { "epoch": 19.794457698660306, "grad_norm": 0.0005125190946273506, "learning_rate": 3.2227431482212413e-09, "loss": 0.0, "num_input_tokens_seen": 232617920, "step": 107860 }, { "epoch": 19.795375298219856, "grad_norm": 0.002199020003899932, "learning_rate": 3.194061493071998e-09, "loss": 0.0, "num_input_tokens_seen": 232628736, "step": 107865 }, { "epoch": 19.79629289777941, "grad_norm": 0.001345840748399496, "learning_rate": 3.165507998251216e-09, "loss": 0.0, "num_input_tokens_seen": 232640064, "step": 107870 }, { "epoch": 19.797210497338963, "grad_norm": 0.0955536812543869, "learning_rate": 3.1370826644899764e-09, "loss": 0.0, "num_input_tokens_seen": 232649216, "step": 107875 }, { "epoch": 19.798128096898512, "grad_norm": 0.00043022242607548833, "learning_rate": 3.1087854925188067e-09, "loss": 0.0, "num_input_tokens_seen": 232658912, "step": 107880 }, { "epoch": 19.799045696458066, "grad_norm": 0.0003196468169335276, "learning_rate": 3.080616483062682e-09, "loss": 0.0, "num_input_tokens_seen": 232670144, "step": 107885 }, { "epoch": 19.79996329601762, "grad_norm": 0.0003830971254501492, "learning_rate": 3.052575636843802e-09, "loss": 0.0, "num_input_tokens_seen": 232679904, "step": 107890 }, { "epoch": 19.80088089557717, "grad_norm": 0.0005107452743686736, "learning_rate": 3.024662954582147e-09, "loss": 0.0, "num_input_tokens_seen": 232692320, "step": 107895 }, { "epoch": 19.801798495136723, "grad_norm": 0.0008029825985431671, "learning_rate": 2.9968784369932557e-09, "loss": 0.0, "num_input_tokens_seen": 232702240, "step": 107900 }, { "epoch": 19.802716094696276, "grad_norm": 0.0008263415074907243, "learning_rate": 2.969222084789891e-09, "loss": 0.0, "num_input_tokens_seen": 232713152, "step": 107905 }, { "epoch": 19.803633694255826, "grad_norm": 0.018201034516096115, "learning_rate": 2.9416938986814857e-09, "loss": 0.0478, "num_input_tokens_seen": 232724352, "step": 107910 }, { "epoch": 19.80455129381538, "grad_norm": 0.0012237802147865295, "learning_rate": 2.9142938793735862e-09, "loss": 0.0, "num_input_tokens_seen": 232734400, "step": 107915 }, { "epoch": 19.805468893374933, "grad_norm": 0.0008220788440667093, "learning_rate": 2.887022027568964e-09, "loss": 0.0, "num_input_tokens_seen": 232746176, "step": 107920 }, { "epoch": 19.806386492934482, "grad_norm": 0.0008524673175998032, "learning_rate": 2.8598783439676147e-09, "loss": 0.0, "num_input_tokens_seen": 232757664, "step": 107925 }, { "epoch": 19.807304092494036, "grad_norm": 0.0010141425300389528, "learning_rate": 2.8328628292656477e-09, "loss": 0.0, "num_input_tokens_seen": 232768896, "step": 107930 }, { "epoch": 19.80822169205359, "grad_norm": 0.0008936739759519696, "learning_rate": 2.805975484155843e-09, "loss": 0.0, "num_input_tokens_seen": 232778336, "step": 107935 }, { "epoch": 19.80913929161314, "grad_norm": 0.017109693959355354, "learning_rate": 2.779216309327648e-09, "loss": 0.0, "num_input_tokens_seen": 232789536, "step": 107940 }, { "epoch": 19.810056891172692, "grad_norm": 0.0009312295005656779, "learning_rate": 2.7525853054677367e-09, "loss": 0.0, "num_input_tokens_seen": 232799648, "step": 107945 }, { "epoch": 19.810974490732246, "grad_norm": 0.020960751920938492, "learning_rate": 2.7260824732588954e-09, "loss": 0.0001, "num_input_tokens_seen": 232810112, "step": 107950 }, { "epoch": 19.811892090291796, "grad_norm": 0.17966070771217346, "learning_rate": 2.6997078133811363e-09, "loss": 0.0, "num_input_tokens_seen": 232821248, "step": 107955 }, { "epoch": 19.81280968985135, "grad_norm": 0.008146660402417183, "learning_rate": 2.673461326510585e-09, "loss": 0.197, "num_input_tokens_seen": 232832832, "step": 107960 }, { "epoch": 19.813727289410902, "grad_norm": 0.004243203438818455, "learning_rate": 2.6473430133205913e-09, "loss": 0.0, "num_input_tokens_seen": 232843936, "step": 107965 }, { "epoch": 19.814644888970452, "grad_norm": 0.012829361483454704, "learning_rate": 2.6213528744811757e-09, "loss": 0.0, "num_input_tokens_seen": 232854112, "step": 107970 }, { "epoch": 19.815562488530006, "grad_norm": 0.027835384011268616, "learning_rate": 2.5954909106590266e-09, "loss": 0.0001, "num_input_tokens_seen": 232864608, "step": 107975 }, { "epoch": 19.81648008808956, "grad_norm": 0.0006111306720413268, "learning_rate": 2.5697571225169473e-09, "loss": 0.0, "num_input_tokens_seen": 232874752, "step": 107980 }, { "epoch": 19.81739768764911, "grad_norm": 0.0009623572113923728, "learning_rate": 2.544151510714965e-09, "loss": 0.1066, "num_input_tokens_seen": 232884160, "step": 107985 }, { "epoch": 19.818315287208662, "grad_norm": 0.0006599484477192163, "learning_rate": 2.5186740759108876e-09, "loss": 0.0, "num_input_tokens_seen": 232894912, "step": 107990 }, { "epoch": 19.819232886768216, "grad_norm": 0.007677293848246336, "learning_rate": 2.4933248187569703e-09, "loss": 0.0, "num_input_tokens_seen": 232906304, "step": 107995 }, { "epoch": 19.820150486327766, "grad_norm": 0.09479068964719772, "learning_rate": 2.468103739903804e-09, "loss": 0.0, "num_input_tokens_seen": 232916000, "step": 108000 }, { "epoch": 19.82106808588732, "grad_norm": 0.003769895061850548, "learning_rate": 2.4430108399986495e-09, "loss": 0.0, "num_input_tokens_seen": 232927584, "step": 108005 }, { "epoch": 19.821985685446872, "grad_norm": 0.0016132111195474863, "learning_rate": 2.418046119684325e-09, "loss": 0.0, "num_input_tokens_seen": 232938560, "step": 108010 }, { "epoch": 19.822903285006422, "grad_norm": 0.031482893973588943, "learning_rate": 2.3932095796014297e-09, "loss": 0.0, "num_input_tokens_seen": 232950720, "step": 108015 }, { "epoch": 19.823820884565976, "grad_norm": 0.0015426450408995152, "learning_rate": 2.3685012203877867e-09, "loss": 0.0, "num_input_tokens_seen": 232959584, "step": 108020 }, { "epoch": 19.82473848412553, "grad_norm": 0.015619654208421707, "learning_rate": 2.3439210426762225e-09, "loss": 0.0, "num_input_tokens_seen": 232971136, "step": 108025 }, { "epoch": 19.82565608368508, "grad_norm": 0.07858435809612274, "learning_rate": 2.319469047097345e-09, "loss": 0.0, "num_input_tokens_seen": 232981472, "step": 108030 }, { "epoch": 19.826573683244632, "grad_norm": 0.0008219536393880844, "learning_rate": 2.2951452342784287e-09, "loss": 0.0, "num_input_tokens_seen": 232991936, "step": 108035 }, { "epoch": 19.827491282804186, "grad_norm": 0.014806277118623257, "learning_rate": 2.2709496048428647e-09, "loss": 0.0, "num_input_tokens_seen": 233003808, "step": 108040 }, { "epoch": 19.828408882363735, "grad_norm": 0.0714518204331398, "learning_rate": 2.246882159411823e-09, "loss": 0.0001, "num_input_tokens_seen": 233014272, "step": 108045 }, { "epoch": 19.82932648192329, "grad_norm": 0.0006244093528948724, "learning_rate": 2.222942898603142e-09, "loss": 0.0, "num_input_tokens_seen": 233025024, "step": 108050 }, { "epoch": 19.830244081482842, "grad_norm": 0.011521823704242706, "learning_rate": 2.1991318230296655e-09, "loss": 0.0, "num_input_tokens_seen": 233036480, "step": 108055 }, { "epoch": 19.831161681042392, "grad_norm": 0.00331090553663671, "learning_rate": 2.1754489333020156e-09, "loss": 0.0, "num_input_tokens_seen": 233047456, "step": 108060 }, { "epoch": 19.832079280601945, "grad_norm": 26.270526885986328, "learning_rate": 2.15189423002915e-09, "loss": 0.0097, "num_input_tokens_seen": 233058112, "step": 108065 }, { "epoch": 19.8329968801615, "grad_norm": 0.04327508807182312, "learning_rate": 2.1284677138133648e-09, "loss": 0.0005, "num_input_tokens_seen": 233068160, "step": 108070 }, { "epoch": 19.83391447972105, "grad_norm": 18.30227279663086, "learning_rate": 2.1051693852569555e-09, "loss": 0.0131, "num_input_tokens_seen": 233078624, "step": 108075 }, { "epoch": 19.834832079280602, "grad_norm": 0.3408490717411041, "learning_rate": 2.081999244956667e-09, "loss": 0.003, "num_input_tokens_seen": 233090624, "step": 108080 }, { "epoch": 19.835749678840155, "grad_norm": 0.0074330465868115425, "learning_rate": 2.0589572935070247e-09, "loss": 0.0051, "num_input_tokens_seen": 233102720, "step": 108085 }, { "epoch": 19.836667278399705, "grad_norm": 0.0005087040481157601, "learning_rate": 2.036043531499221e-09, "loss": 0.0, "num_input_tokens_seen": 233112704, "step": 108090 }, { "epoch": 19.83758487795926, "grad_norm": 0.0034572994336485863, "learning_rate": 2.0132579595205646e-09, "loss": 0.0, "num_input_tokens_seen": 233122784, "step": 108095 }, { "epoch": 19.838502477518812, "grad_norm": 0.018611565232276917, "learning_rate": 1.990600578155588e-09, "loss": 0.0207, "num_input_tokens_seen": 233134016, "step": 108100 }, { "epoch": 19.839420077078362, "grad_norm": 0.0013946355320513248, "learning_rate": 1.968071387986048e-09, "loss": 0.0, "num_input_tokens_seen": 233144608, "step": 108105 }, { "epoch": 19.840337676637915, "grad_norm": 0.020292144268751144, "learning_rate": 1.9456703895887054e-09, "loss": 0.0, "num_input_tokens_seen": 233155744, "step": 108110 }, { "epoch": 19.84125527619747, "grad_norm": 0.06681927293539047, "learning_rate": 1.9233975835386553e-09, "loss": 0.0, "num_input_tokens_seen": 233167104, "step": 108115 }, { "epoch": 19.84217287575702, "grad_norm": 0.0023511871695518494, "learning_rate": 1.901252970407108e-09, "loss": 0.0, "num_input_tokens_seen": 233177408, "step": 108120 }, { "epoch": 19.843090475316572, "grad_norm": 0.07703813910484314, "learning_rate": 1.8792365507624975e-09, "loss": 0.0, "num_input_tokens_seen": 233187936, "step": 108125 }, { "epoch": 19.844008074876125, "grad_norm": 0.005736993160098791, "learning_rate": 1.8573483251688173e-09, "loss": 0.0, "num_input_tokens_seen": 233198656, "step": 108130 }, { "epoch": 19.844925674435675, "grad_norm": 0.001777112833224237, "learning_rate": 1.835588294187285e-09, "loss": 0.0, "num_input_tokens_seen": 233210016, "step": 108135 }, { "epoch": 19.84584327399523, "grad_norm": 0.0052524409256875515, "learning_rate": 1.8139564583768977e-09, "loss": 0.0, "num_input_tokens_seen": 233221664, "step": 108140 }, { "epoch": 19.846760873554782, "grad_norm": 0.002471664221957326, "learning_rate": 1.792452818292212e-09, "loss": 0.0, "num_input_tokens_seen": 233232032, "step": 108145 }, { "epoch": 19.847678473114332, "grad_norm": 0.009192563593387604, "learning_rate": 1.7710773744844533e-09, "loss": 0.0131, "num_input_tokens_seen": 233242272, "step": 108150 }, { "epoch": 19.848596072673885, "grad_norm": 0.0008965297602117062, "learning_rate": 1.7498301275020724e-09, "loss": 0.0, "num_input_tokens_seen": 233253280, "step": 108155 }, { "epoch": 19.84951367223344, "grad_norm": 0.001643938710913062, "learning_rate": 1.7287110778896333e-09, "loss": 0.0, "num_input_tokens_seen": 233263648, "step": 108160 }, { "epoch": 19.85043127179299, "grad_norm": 0.0010372137185186148, "learning_rate": 1.7077202261894798e-09, "loss": 0.0, "num_input_tokens_seen": 233274368, "step": 108165 }, { "epoch": 19.851348871352542, "grad_norm": 0.0015881828730925918, "learning_rate": 1.6868575729395154e-09, "loss": 0.0051, "num_input_tokens_seen": 233284992, "step": 108170 }, { "epoch": 19.852266470912095, "grad_norm": 0.002222292125225067, "learning_rate": 1.6661231186748673e-09, "loss": 0.0, "num_input_tokens_seen": 233296000, "step": 108175 }, { "epoch": 19.853184070471645, "grad_norm": 0.001231667585670948, "learning_rate": 1.6455168639273322e-09, "loss": 0.0001, "num_input_tokens_seen": 233306048, "step": 108180 }, { "epoch": 19.8541016700312, "grad_norm": 0.00654609315097332, "learning_rate": 1.6250388092259317e-09, "loss": 0.0, "num_input_tokens_seen": 233315552, "step": 108185 }, { "epoch": 19.855019269590752, "grad_norm": 0.0014345420058816671, "learning_rate": 1.604688955095246e-09, "loss": 0.0001, "num_input_tokens_seen": 233326848, "step": 108190 }, { "epoch": 19.8559368691503, "grad_norm": 0.01095829252153635, "learning_rate": 1.5844673020576351e-09, "loss": 0.0, "num_input_tokens_seen": 233337824, "step": 108195 }, { "epoch": 19.856854468709855, "grad_norm": 0.0027540740557014942, "learning_rate": 1.5643738506315731e-09, "loss": 0.0, "num_input_tokens_seen": 233349024, "step": 108200 }, { "epoch": 19.85777206826941, "grad_norm": 0.0013631191104650497, "learning_rate": 1.5444086013327586e-09, "loss": 0.0, "num_input_tokens_seen": 233360768, "step": 108205 }, { "epoch": 19.85868966782896, "grad_norm": 0.005384943913668394, "learning_rate": 1.5245715546724493e-09, "loss": 0.119, "num_input_tokens_seen": 233371744, "step": 108210 }, { "epoch": 19.859607267388512, "grad_norm": 0.004317459184676409, "learning_rate": 1.5048627111602376e-09, "loss": 0.0, "num_input_tokens_seen": 233383168, "step": 108215 }, { "epoch": 19.860524866948065, "grad_norm": 0.001529487781226635, "learning_rate": 1.48528207130072e-09, "loss": 0.0, "num_input_tokens_seen": 233394144, "step": 108220 }, { "epoch": 19.861442466507615, "grad_norm": 0.001663117203861475, "learning_rate": 1.4658296355973822e-09, "loss": 0.1314, "num_input_tokens_seen": 233405056, "step": 108225 }, { "epoch": 19.86236006606717, "grad_norm": 0.04474674537777901, "learning_rate": 1.4465054045481597e-09, "loss": 0.0, "num_input_tokens_seen": 233416896, "step": 108230 }, { "epoch": 19.863277665626722, "grad_norm": 0.00043980689952149987, "learning_rate": 1.427309378649322e-09, "loss": 0.0, "num_input_tokens_seen": 233428352, "step": 108235 }, { "epoch": 19.86419526518627, "grad_norm": 0.0157940536737442, "learning_rate": 1.408241558392698e-09, "loss": 0.0, "num_input_tokens_seen": 233438080, "step": 108240 }, { "epoch": 19.865112864745825, "grad_norm": 0.007019153330475092, "learning_rate": 1.3893019442678958e-09, "loss": 0.0, "num_input_tokens_seen": 233448320, "step": 108245 }, { "epoch": 19.86603046430538, "grad_norm": 0.0007865226943977177, "learning_rate": 1.3704905367600829e-09, "loss": 0.0, "num_input_tokens_seen": 233459808, "step": 108250 }, { "epoch": 19.86694806386493, "grad_norm": 0.01446844357997179, "learning_rate": 1.3518073363516515e-09, "loss": 0.0001, "num_input_tokens_seen": 233471552, "step": 108255 }, { "epoch": 19.86786566342448, "grad_norm": 0.9210575819015503, "learning_rate": 1.3332523435227728e-09, "loss": 0.0002, "num_input_tokens_seen": 233482368, "step": 108260 }, { "epoch": 19.868783262984035, "grad_norm": 0.0034272365737706423, "learning_rate": 1.3148255587486225e-09, "loss": 0.0, "num_input_tokens_seen": 233492864, "step": 108265 }, { "epoch": 19.869700862543585, "grad_norm": 0.0007010149420239031, "learning_rate": 1.2965269825016002e-09, "loss": 0.0, "num_input_tokens_seen": 233504032, "step": 108270 }, { "epoch": 19.87061846210314, "grad_norm": 0.0004921133513562381, "learning_rate": 1.2783566152518856e-09, "loss": 0.0, "num_input_tokens_seen": 233514304, "step": 108275 }, { "epoch": 19.87153606166269, "grad_norm": 0.05751747265458107, "learning_rate": 1.260314457464662e-09, "loss": 0.0, "num_input_tokens_seen": 233524384, "step": 108280 }, { "epoch": 19.87245366122224, "grad_norm": 0.002492798725143075, "learning_rate": 1.2424005096028925e-09, "loss": 0.0, "num_input_tokens_seen": 233534112, "step": 108285 }, { "epoch": 19.873371260781795, "grad_norm": 0.0030177454464137554, "learning_rate": 1.2246147721262092e-09, "loss": 0.0001, "num_input_tokens_seen": 233544704, "step": 108290 }, { "epoch": 19.87428886034135, "grad_norm": 0.0034072929993271828, "learning_rate": 1.2069572454909139e-09, "loss": 0.0001, "num_input_tokens_seen": 233555968, "step": 108295 }, { "epoch": 19.875206459900898, "grad_norm": 0.0006664336542598903, "learning_rate": 1.1894279301499777e-09, "loss": 0.0, "num_input_tokens_seen": 233567648, "step": 108300 }, { "epoch": 19.87612405946045, "grad_norm": 0.002461283700540662, "learning_rate": 1.1720268265524858e-09, "loss": 0.0, "num_input_tokens_seen": 233578208, "step": 108305 }, { "epoch": 19.877041659020005, "grad_norm": 0.007770926225930452, "learning_rate": 1.154753935144748e-09, "loss": 0.0, "num_input_tokens_seen": 233589664, "step": 108310 }, { "epoch": 19.877959258579555, "grad_norm": 0.0006944012711755931, "learning_rate": 1.137609256370298e-09, "loss": 0.0, "num_input_tokens_seen": 233600768, "step": 108315 }, { "epoch": 19.878876858139108, "grad_norm": 0.009915314614772797, "learning_rate": 1.1205927906687842e-09, "loss": 0.0001, "num_input_tokens_seen": 233613312, "step": 108320 }, { "epoch": 19.87979445769866, "grad_norm": 0.0009269806323572993, "learning_rate": 1.1037045384765244e-09, "loss": 0.0, "num_input_tokens_seen": 233624576, "step": 108325 }, { "epoch": 19.88071205725821, "grad_norm": 0.01843450404703617, "learning_rate": 1.0869445002265056e-09, "loss": 0.0001, "num_input_tokens_seen": 233633664, "step": 108330 }, { "epoch": 19.881629656817765, "grad_norm": 0.0007570185116492212, "learning_rate": 1.070312676348939e-09, "loss": 0.0, "num_input_tokens_seen": 233644544, "step": 108335 }, { "epoch": 19.88254725637732, "grad_norm": 0.002545601222664118, "learning_rate": 1.0538090672701506e-09, "loss": 0.0, "num_input_tokens_seen": 233655200, "step": 108340 }, { "epoch": 19.883464855936868, "grad_norm": 0.004220031667500734, "learning_rate": 1.0374336734131352e-09, "loss": 0.0, "num_input_tokens_seen": 233666080, "step": 108345 }, { "epoch": 19.88438245549642, "grad_norm": 0.07346264272928238, "learning_rate": 1.0211864951986671e-09, "loss": 0.0, "num_input_tokens_seen": 233677632, "step": 108350 }, { "epoch": 19.885300055055975, "grad_norm": 0.0005290117114782333, "learning_rate": 1.0050675330430803e-09, "loss": 0.0, "num_input_tokens_seen": 233688672, "step": 108355 }, { "epoch": 19.886217654615525, "grad_norm": 0.0004078989732079208, "learning_rate": 9.890767873593777e-10, "loss": 0.0, "num_input_tokens_seen": 233698848, "step": 108360 }, { "epoch": 19.887135254175078, "grad_norm": 0.00037281939876265824, "learning_rate": 9.732142585583416e-10, "loss": 0.0, "num_input_tokens_seen": 233710912, "step": 108365 }, { "epoch": 19.88805285373463, "grad_norm": 0.027410203590989113, "learning_rate": 9.574799470463137e-10, "loss": 0.0, "num_input_tokens_seen": 233721632, "step": 108370 }, { "epoch": 19.88897045329418, "grad_norm": 0.0008735567098483443, "learning_rate": 9.418738532274151e-10, "loss": 0.0, "num_input_tokens_seen": 233731808, "step": 108375 }, { "epoch": 19.889888052853735, "grad_norm": 0.0003271278110332787, "learning_rate": 9.263959775018816e-10, "loss": 0.0004, "num_input_tokens_seen": 233744000, "step": 108380 }, { "epoch": 19.890805652413288, "grad_norm": 0.04024677351117134, "learning_rate": 9.110463202660625e-10, "loss": 0.0, "num_input_tokens_seen": 233753824, "step": 108385 }, { "epoch": 19.891723251972838, "grad_norm": 0.000681609904859215, "learning_rate": 8.958248819140869e-10, "loss": 0.0, "num_input_tokens_seen": 233765568, "step": 108390 }, { "epoch": 19.89264085153239, "grad_norm": 0.0012777005322277546, "learning_rate": 8.807316628361984e-10, "loss": 0.0, "num_input_tokens_seen": 233776032, "step": 108395 }, { "epoch": 19.893558451091945, "grad_norm": 0.000690782384481281, "learning_rate": 8.657666634193096e-10, "loss": 0.0, "num_input_tokens_seen": 233786944, "step": 108400 }, { "epoch": 19.894476050651495, "grad_norm": 0.049123357981443405, "learning_rate": 8.509298840481128e-10, "loss": 0.0, "num_input_tokens_seen": 233799296, "step": 108405 }, { "epoch": 19.895393650211048, "grad_norm": 0.0017497108783572912, "learning_rate": 8.362213251023044e-10, "loss": 0.0, "num_input_tokens_seen": 233810112, "step": 108410 }, { "epoch": 19.8963112497706, "grad_norm": 0.0012893794337287545, "learning_rate": 8.2164098695936e-10, "loss": 0.0, "num_input_tokens_seen": 233822080, "step": 108415 }, { "epoch": 19.89722884933015, "grad_norm": 0.020982112735509872, "learning_rate": 8.07188869993425e-10, "loss": 0.0, "num_input_tokens_seen": 233832704, "step": 108420 }, { "epoch": 19.898146448889705, "grad_norm": 0.4315890669822693, "learning_rate": 7.928649745753136e-10, "loss": 0.0001, "num_input_tokens_seen": 233843040, "step": 108425 }, { "epoch": 19.899064048449258, "grad_norm": 0.0009206263930536807, "learning_rate": 7.786693010719548e-10, "loss": 0.0, "num_input_tokens_seen": 233853728, "step": 108430 }, { "epoch": 19.899981648008808, "grad_norm": 0.0006597689352929592, "learning_rate": 7.646018498475016e-10, "loss": 0.0051, "num_input_tokens_seen": 233864864, "step": 108435 }, { "epoch": 19.90089924756836, "grad_norm": 0.0010384800843894482, "learning_rate": 7.506626212627766e-10, "loss": 0.0, "num_input_tokens_seen": 233876320, "step": 108440 }, { "epoch": 19.901816847127915, "grad_norm": 0.023587970063090324, "learning_rate": 7.368516156758266e-10, "loss": 0.0, "num_input_tokens_seen": 233885888, "step": 108445 }, { "epoch": 19.902734446687464, "grad_norm": 0.0031849571969360113, "learning_rate": 7.231688334402576e-10, "loss": 0.0, "num_input_tokens_seen": 233896448, "step": 108450 }, { "epoch": 19.903652046247018, "grad_norm": 0.015049353241920471, "learning_rate": 7.096142749074553e-10, "loss": 0.0, "num_input_tokens_seen": 233907424, "step": 108455 }, { "epoch": 19.90456964580657, "grad_norm": 0.010972456075251102, "learning_rate": 6.961879404243643e-10, "loss": 0.0, "num_input_tokens_seen": 233918560, "step": 108460 }, { "epoch": 19.90548724536612, "grad_norm": 0.0007858973694965243, "learning_rate": 6.828898303362641e-10, "loss": 0.0, "num_input_tokens_seen": 233929120, "step": 108465 }, { "epoch": 19.906404844925675, "grad_norm": 0.009147826582193375, "learning_rate": 6.69719944983438e-10, "loss": 0.0, "num_input_tokens_seen": 233939264, "step": 108470 }, { "epoch": 19.907322444485228, "grad_norm": 0.0008867073920555413, "learning_rate": 6.566782847045039e-10, "loss": 0.0, "num_input_tokens_seen": 233948416, "step": 108475 }, { "epoch": 19.908240044044778, "grad_norm": 0.004500182811170816, "learning_rate": 6.43764849833084e-10, "loss": 0.0, "num_input_tokens_seen": 233958144, "step": 108480 }, { "epoch": 19.90915764360433, "grad_norm": 0.08764570206403732, "learning_rate": 6.309796407005797e-10, "loss": 0.0001, "num_input_tokens_seen": 233968064, "step": 108485 }, { "epoch": 19.910075243163885, "grad_norm": 0.008634870871901512, "learning_rate": 6.183226576356172e-10, "loss": 0.0, "num_input_tokens_seen": 233977184, "step": 108490 }, { "epoch": 19.910992842723434, "grad_norm": 0.01018765289336443, "learning_rate": 6.057939009623815e-10, "loss": 0.0, "num_input_tokens_seen": 233987872, "step": 108495 }, { "epoch": 19.911910442282988, "grad_norm": 0.024203885346651077, "learning_rate": 5.93393371001727e-10, "loss": 0.0, "num_input_tokens_seen": 233998112, "step": 108500 }, { "epoch": 19.91282804184254, "grad_norm": 0.0046086846850812435, "learning_rate": 5.811210680728429e-10, "loss": 0.0, "num_input_tokens_seen": 234009728, "step": 108505 }, { "epoch": 19.91374564140209, "grad_norm": 0.012989860028028488, "learning_rate": 5.689769924893673e-10, "loss": 0.0, "num_input_tokens_seen": 234019872, "step": 108510 }, { "epoch": 19.914663240961644, "grad_norm": 0.002358380937948823, "learning_rate": 5.569611445632728e-10, "loss": 0.0002, "num_input_tokens_seen": 234029952, "step": 108515 }, { "epoch": 19.915580840521198, "grad_norm": 0.0007147837313823402, "learning_rate": 5.450735246026462e-10, "loss": 0.0, "num_input_tokens_seen": 234040608, "step": 108520 }, { "epoch": 19.916498440080748, "grad_norm": 0.012381358072161674, "learning_rate": 5.333141329122438e-10, "loss": 0.0, "num_input_tokens_seen": 234051936, "step": 108525 }, { "epoch": 19.9174160396403, "grad_norm": 0.0005851024179719388, "learning_rate": 5.216829697940462e-10, "loss": 0.0425, "num_input_tokens_seen": 234064192, "step": 108530 }, { "epoch": 19.918333639199854, "grad_norm": 0.0009867961052805185, "learning_rate": 5.101800355461483e-10, "loss": 0.0, "num_input_tokens_seen": 234075776, "step": 108535 }, { "epoch": 19.919251238759404, "grad_norm": 0.019063150510191917, "learning_rate": 4.988053304638696e-10, "loss": 0.0, "num_input_tokens_seen": 234085760, "step": 108540 }, { "epoch": 19.920168838318958, "grad_norm": 0.005979685112833977, "learning_rate": 4.875588548380883e-10, "loss": 0.0, "num_input_tokens_seen": 234096448, "step": 108545 }, { "epoch": 19.92108643787851, "grad_norm": 0.006593252532184124, "learning_rate": 4.764406089585727e-10, "loss": 0.0, "num_input_tokens_seen": 234107808, "step": 108550 }, { "epoch": 19.92200403743806, "grad_norm": 0.001464496599510312, "learning_rate": 4.6545059310953986e-10, "loss": 0.0, "num_input_tokens_seen": 234117760, "step": 108555 }, { "epoch": 19.922921636997614, "grad_norm": 44.012451171875, "learning_rate": 4.5458880757298653e-10, "loss": 0.0478, "num_input_tokens_seen": 234129120, "step": 108560 }, { "epoch": 19.923839236557168, "grad_norm": 0.004618776962161064, "learning_rate": 4.438552526281337e-10, "loss": 0.0, "num_input_tokens_seen": 234140032, "step": 108565 }, { "epoch": 19.924756836116718, "grad_norm": 0.0016418835148215294, "learning_rate": 4.3324992854920645e-10, "loss": 0.0001, "num_input_tokens_seen": 234152128, "step": 108570 }, { "epoch": 19.92567443567627, "grad_norm": 0.000538174994289875, "learning_rate": 4.227728356087646e-10, "loss": 0.0032, "num_input_tokens_seen": 234161920, "step": 108575 }, { "epoch": 19.926592035235824, "grad_norm": 0.0029002611991018057, "learning_rate": 4.1242397407603717e-10, "loss": 0.0, "num_input_tokens_seen": 234172032, "step": 108580 }, { "epoch": 19.927509634795374, "grad_norm": 0.01570172607898712, "learning_rate": 4.0220334421581244e-10, "loss": 0.0, "num_input_tokens_seen": 234182944, "step": 108585 }, { "epoch": 19.928427234354928, "grad_norm": 0.0003996374143753201, "learning_rate": 3.921109462901029e-10, "loss": 0.0, "num_input_tokens_seen": 234194560, "step": 108590 }, { "epoch": 19.92934483391448, "grad_norm": 0.006493418011814356, "learning_rate": 3.821467805581458e-10, "loss": 0.0, "num_input_tokens_seen": 234204128, "step": 108595 }, { "epoch": 19.93026243347403, "grad_norm": 0.0005355759640224278, "learning_rate": 3.723108472758474e-10, "loss": 0.0, "num_input_tokens_seen": 234214592, "step": 108600 }, { "epoch": 19.931180033033584, "grad_norm": 0.0036958665587008, "learning_rate": 3.626031466946733e-10, "loss": 0.0, "num_input_tokens_seen": 234224320, "step": 108605 }, { "epoch": 19.932097632593138, "grad_norm": 0.0008527761674486101, "learning_rate": 3.530236790638686e-10, "loss": 0.0, "num_input_tokens_seen": 234235392, "step": 108610 }, { "epoch": 19.933015232152687, "grad_norm": 0.005506488960236311, "learning_rate": 3.435724446299027e-10, "loss": 0.0, "num_input_tokens_seen": 234245600, "step": 108615 }, { "epoch": 19.93393283171224, "grad_norm": 0.0010386398062109947, "learning_rate": 3.3424944363369405e-10, "loss": 0.0, "num_input_tokens_seen": 234254848, "step": 108620 }, { "epoch": 19.934850431271794, "grad_norm": 0.012442834675312042, "learning_rate": 3.250546763156059e-10, "loss": 0.0, "num_input_tokens_seen": 234267168, "step": 108625 }, { "epoch": 19.935768030831344, "grad_norm": 0.007559075020253658, "learning_rate": 3.1598814291100563e-10, "loss": 0.0, "num_input_tokens_seen": 234279040, "step": 108630 }, { "epoch": 19.936685630390897, "grad_norm": 0.001997190061956644, "learning_rate": 3.0704984365304e-10, "loss": 0.0, "num_input_tokens_seen": 234288672, "step": 108635 }, { "epoch": 19.93760322995045, "grad_norm": 0.05061062425374985, "learning_rate": 2.982397787698599e-10, "loss": 0.0, "num_input_tokens_seen": 234297888, "step": 108640 }, { "epoch": 19.93852082951, "grad_norm": 0.005856309086084366, "learning_rate": 2.895579484879507e-10, "loss": 0.0, "num_input_tokens_seen": 234309600, "step": 108645 }, { "epoch": 19.939438429069554, "grad_norm": 0.0004415755101945251, "learning_rate": 2.8100435303046737e-10, "loss": 0.0, "num_input_tokens_seen": 234321184, "step": 108650 }, { "epoch": 19.940356028629107, "grad_norm": 0.003845469793304801, "learning_rate": 2.725789926155686e-10, "loss": 0.0, "num_input_tokens_seen": 234332640, "step": 108655 }, { "epoch": 19.941273628188657, "grad_norm": 0.0005365182878449559, "learning_rate": 2.6428186746085827e-10, "loss": 0.0, "num_input_tokens_seen": 234343968, "step": 108660 }, { "epoch": 19.94219122774821, "grad_norm": 0.0036857330705970526, "learning_rate": 2.5611297777838886e-10, "loss": 0.0, "num_input_tokens_seen": 234355840, "step": 108665 }, { "epoch": 19.943108827307764, "grad_norm": 0.0030322661623358727, "learning_rate": 2.480723237774374e-10, "loss": 0.0, "num_input_tokens_seen": 234367136, "step": 108670 }, { "epoch": 19.944026426867314, "grad_norm": 0.05297167971730232, "learning_rate": 2.4015990566450543e-10, "loss": 0.0064, "num_input_tokens_seen": 234378528, "step": 108675 }, { "epoch": 19.944944026426867, "grad_norm": 0.0014545518206432462, "learning_rate": 2.3237572364276374e-10, "loss": 0.0, "num_input_tokens_seen": 234389216, "step": 108680 }, { "epoch": 19.94586162598642, "grad_norm": 0.001619223621673882, "learning_rate": 2.2471977791149735e-10, "loss": 0.0, "num_input_tokens_seen": 234400928, "step": 108685 }, { "epoch": 19.94677922554597, "grad_norm": 0.06865013390779495, "learning_rate": 2.1719206866721575e-10, "loss": 0.0, "num_input_tokens_seen": 234412416, "step": 108690 }, { "epoch": 19.947696825105524, "grad_norm": 0.04227747395634651, "learning_rate": 2.0979259610309776e-10, "loss": 0.0, "num_input_tokens_seen": 234424640, "step": 108695 }, { "epoch": 19.948614424665077, "grad_norm": 0.010576223954558372, "learning_rate": 2.0252136040899152e-10, "loss": 0.0, "num_input_tokens_seen": 234434816, "step": 108700 }, { "epoch": 19.949532024224627, "grad_norm": 0.000629676622338593, "learning_rate": 1.9537836177085934e-10, "loss": 0.0, "num_input_tokens_seen": 234446560, "step": 108705 }, { "epoch": 19.95044962378418, "grad_norm": 0.0025707187596708536, "learning_rate": 1.8836360037244316e-10, "loss": 0.0, "num_input_tokens_seen": 234456832, "step": 108710 }, { "epoch": 19.951367223343734, "grad_norm": 0.010691306553781033, "learning_rate": 1.8147707639359913e-10, "loss": 0.0, "num_input_tokens_seen": 234467616, "step": 108715 }, { "epoch": 19.952284822903284, "grad_norm": 0.022642087191343307, "learning_rate": 1.747187900108527e-10, "loss": 0.0, "num_input_tokens_seen": 234478688, "step": 108720 }, { "epoch": 19.953202422462837, "grad_norm": 0.03590894490480423, "learning_rate": 1.680887413973986e-10, "loss": 0.0, "num_input_tokens_seen": 234489088, "step": 108725 }, { "epoch": 19.95412002202239, "grad_norm": 0.01338153425604105, "learning_rate": 1.6158693072310106e-10, "loss": 0.0, "num_input_tokens_seen": 234498848, "step": 108730 }, { "epoch": 19.95503762158194, "grad_norm": 0.03971128910779953, "learning_rate": 1.5521335815560367e-10, "loss": 0.0, "num_input_tokens_seen": 234510336, "step": 108735 }, { "epoch": 19.955955221141494, "grad_norm": 0.001318578259088099, "learning_rate": 1.4896802385755415e-10, "loss": 0.0, "num_input_tokens_seen": 234521536, "step": 108740 }, { "epoch": 19.956872820701047, "grad_norm": 0.02836054563522339, "learning_rate": 1.4285092798937972e-10, "loss": 0.0001, "num_input_tokens_seen": 234531936, "step": 108745 }, { "epoch": 19.957790420260597, "grad_norm": 0.0005304812220856547, "learning_rate": 1.3686207070817693e-10, "loss": 0.0, "num_input_tokens_seen": 234542496, "step": 108750 }, { "epoch": 19.95870801982015, "grad_norm": 0.002031095791608095, "learning_rate": 1.3100145216715653e-10, "loss": 0.0001, "num_input_tokens_seen": 234553952, "step": 108755 }, { "epoch": 19.959625619379704, "grad_norm": 0.0038582333363592625, "learning_rate": 1.252690725173089e-10, "loss": 0.0, "num_input_tokens_seen": 234564800, "step": 108760 }, { "epoch": 19.960543218939254, "grad_norm": 0.001654239371418953, "learning_rate": 1.1966493190462836e-10, "loss": 0.0, "num_input_tokens_seen": 234576000, "step": 108765 }, { "epoch": 19.961460818498807, "grad_norm": 0.0006293189944699407, "learning_rate": 1.1418903047399899e-10, "loss": 0.0, "num_input_tokens_seen": 234586144, "step": 108770 }, { "epoch": 19.96237841805836, "grad_norm": 0.0035150989424437284, "learning_rate": 1.0884136836475378e-10, "loss": 0.0, "num_input_tokens_seen": 234597760, "step": 108775 }, { "epoch": 19.96329601761791, "grad_norm": 0.004846660420298576, "learning_rate": 1.0362194571511552e-10, "loss": 0.0, "num_input_tokens_seen": 234608128, "step": 108780 }, { "epoch": 19.964213617177464, "grad_norm": 0.003769383067265153, "learning_rate": 9.853076265831096e-11, "loss": 0.0, "num_input_tokens_seen": 234619488, "step": 108785 }, { "epoch": 19.965131216737017, "grad_norm": 0.0038149578031152487, "learning_rate": 9.356781932479131e-11, "loss": 0.0, "num_input_tokens_seen": 234630112, "step": 108790 }, { "epoch": 19.966048816296567, "grad_norm": 0.0032590413466095924, "learning_rate": 8.87331158422322e-11, "loss": 0.0, "num_input_tokens_seen": 234641312, "step": 108795 }, { "epoch": 19.96696641585612, "grad_norm": 0.02377617359161377, "learning_rate": 8.402665233442352e-11, "loss": 0.0, "num_input_tokens_seen": 234650944, "step": 108800 }, { "epoch": 19.967884015415674, "grad_norm": 0.0012257909402251244, "learning_rate": 7.944842892237958e-11, "loss": 0.0, "num_input_tokens_seen": 234661632, "step": 108805 }, { "epoch": 19.968801614975224, "grad_norm": 0.0034892724361270666, "learning_rate": 7.49984457232289e-11, "loss": 0.0, "num_input_tokens_seen": 234672352, "step": 108810 }, { "epoch": 19.969719214534777, "grad_norm": 0.0012331375619396567, "learning_rate": 7.067670285076933e-11, "loss": 0.0, "num_input_tokens_seen": 234682816, "step": 108815 }, { "epoch": 19.97063681409433, "grad_norm": 0.0016374083934351802, "learning_rate": 6.64832004165783e-11, "loss": 0.0, "num_input_tokens_seen": 234692896, "step": 108820 }, { "epoch": 19.97155441365388, "grad_norm": 0.002442148281261325, "learning_rate": 6.241793852834743e-11, "loss": 0.0, "num_input_tokens_seen": 234703296, "step": 108825 }, { "epoch": 19.972472013213434, "grad_norm": 0.00198408798314631, "learning_rate": 5.848091728932748e-11, "loss": 0.0, "num_input_tokens_seen": 234715456, "step": 108830 }, { "epoch": 19.973389612772987, "grad_norm": 0.2822827696800232, "learning_rate": 5.4672136801103836e-11, "loss": 0.0001, "num_input_tokens_seen": 234727040, "step": 108835 }, { "epoch": 19.974307212332537, "grad_norm": 0.016770783811807632, "learning_rate": 5.099159716137614e-11, "loss": 0.0, "num_input_tokens_seen": 234737024, "step": 108840 }, { "epoch": 19.97522481189209, "grad_norm": 0.022219812497496605, "learning_rate": 4.7439298464513337e-11, "loss": 0.0, "num_input_tokens_seen": 234748672, "step": 108845 }, { "epoch": 19.976142411451644, "grad_norm": 0.0014579619746655226, "learning_rate": 4.401524080210884e-11, "loss": 0.0, "num_input_tokens_seen": 234759040, "step": 108850 }, { "epoch": 19.977060011011194, "grad_norm": 0.0005023411940783262, "learning_rate": 4.0719424260760035e-11, "loss": 0.0078, "num_input_tokens_seen": 234769856, "step": 108855 }, { "epoch": 19.977977610570747, "grad_norm": 0.03267607092857361, "learning_rate": 3.755184892595409e-11, "loss": 0.0, "num_input_tokens_seen": 234780864, "step": 108860 }, { "epoch": 19.9788952101303, "grad_norm": 53.936614990234375, "learning_rate": 3.45125148787373e-11, "loss": 0.0079, "num_input_tokens_seen": 234789952, "step": 108865 }, { "epoch": 19.97981280968985, "grad_norm": 0.17711977660655975, "learning_rate": 3.1601422197380383e-11, "loss": 0.0, "num_input_tokens_seen": 234802240, "step": 108870 }, { "epoch": 19.980730409249404, "grad_norm": 0.05039582401514053, "learning_rate": 2.8818570955713166e-11, "loss": 0.0, "num_input_tokens_seen": 234813856, "step": 108875 }, { "epoch": 19.981648008808957, "grad_norm": 0.002874805359169841, "learning_rate": 2.616396122590015e-11, "loss": 0.0, "num_input_tokens_seen": 234824960, "step": 108880 }, { "epoch": 19.982565608368507, "grad_norm": 0.0014431197196245193, "learning_rate": 2.3637593075664933e-11, "loss": 0.0, "num_input_tokens_seen": 234835424, "step": 108885 }, { "epoch": 19.98348320792806, "grad_norm": 0.0005933247157372534, "learning_rate": 2.1239466569400458e-11, "loss": 0.0, "num_input_tokens_seen": 234846848, "step": 108890 }, { "epoch": 19.984400807487614, "grad_norm": 0.0005390175501815975, "learning_rate": 1.8969581769834324e-11, "loss": 0.0, "num_input_tokens_seen": 234857216, "step": 108895 }, { "epoch": 19.985318407047163, "grad_norm": 0.3458422124385834, "learning_rate": 1.6827938733587902e-11, "loss": 0.0001, "num_input_tokens_seen": 234869472, "step": 108900 }, { "epoch": 19.986236006606717, "grad_norm": 0.010561254806816578, "learning_rate": 1.4814537517282566e-11, "loss": 0.0001, "num_input_tokens_seen": 234880352, "step": 108905 }, { "epoch": 19.98715360616627, "grad_norm": 0.0009281709208153188, "learning_rate": 1.2929378170878359e-11, "loss": 0.0, "num_input_tokens_seen": 234890816, "step": 108910 }, { "epoch": 19.98807120572582, "grad_norm": 0.0021756121423095465, "learning_rate": 1.1172460744335311e-11, "loss": 0.0, "num_input_tokens_seen": 234900224, "step": 108915 }, { "epoch": 19.988988805285373, "grad_norm": 0.0013672325294464827, "learning_rate": 9.543785281507235e-12, "loss": 0.0, "num_input_tokens_seen": 234911584, "step": 108920 }, { "epoch": 19.989906404844927, "grad_norm": 0.0038973328191787004, "learning_rate": 8.043351824582601e-12, "loss": 0.0, "num_input_tokens_seen": 234923200, "step": 108925 }, { "epoch": 19.990824004404477, "grad_norm": 0.00046462510363198817, "learning_rate": 6.67116041241922e-12, "loss": 0.0, "num_input_tokens_seen": 234934784, "step": 108930 }, { "epoch": 19.99174160396403, "grad_norm": 0.0006516082212328911, "learning_rate": 5.427211079434002e-12, "loss": 0.0, "num_input_tokens_seen": 234946080, "step": 108935 }, { "epoch": 19.992659203523583, "grad_norm": 0.0004097922064829618, "learning_rate": 4.31150385837853e-12, "loss": 0.0, "num_input_tokens_seen": 234957280, "step": 108940 }, { "epoch": 19.993576803083133, "grad_norm": 0.000993613270111382, "learning_rate": 3.324038777008376e-12, "loss": 0.0, "num_input_tokens_seen": 234968224, "step": 108945 }, { "epoch": 19.994494402642687, "grad_norm": 0.03339738771319389, "learning_rate": 2.464815861413783e-12, "loss": 0.0, "num_input_tokens_seen": 234978112, "step": 108950 }, { "epoch": 19.99541200220224, "grad_norm": 0.0014295941218733788, "learning_rate": 1.7338351332440994e-12, "loss": 0.0, "num_input_tokens_seen": 234989664, "step": 108955 }, { "epoch": 19.99632960176179, "grad_norm": 0.0007755639380775392, "learning_rate": 1.1310966108180055e-12, "loss": 0.0, "num_input_tokens_seen": 234999872, "step": 108960 }, { "epoch": 19.997247201321343, "grad_norm": 0.0005508993635885417, "learning_rate": 6.566003107888464e-13, "loss": 0.0, "num_input_tokens_seen": 235010208, "step": 108965 }, { "epoch": 19.998164800880897, "grad_norm": 0.00040750604239292443, "learning_rate": 3.103462442588523e-13, "loss": 0.0, "num_input_tokens_seen": 235022240, "step": 108970 }, { "epoch": 19.999082400440447, "grad_norm": 0.0023449689615517855, "learning_rate": 9.233442066491904e-14, "loss": 0.0, "num_input_tokens_seen": 235033984, "step": 108975 }, { "epoch": 20.0, "grad_norm": 0.003743827808648348, "learning_rate": 2.564845003050209e-15, "loss": 0.0, "num_input_tokens_seen": 235044016, "step": 108980 }, { "epoch": 20.0, "eval_loss": 1.4579435586929321, "eval_runtime": 179.1009, "eval_samples_per_second": 30.424, "eval_steps_per_second": 7.61, "num_input_tokens_seen": 235044016, "step": 108980 }, { "epoch": 20.0, "num_input_tokens_seen": 235044016, "step": 108980, "total_flos": 1.0583925808106177e+19, "train_loss": 0.20944354211633787, "train_runtime": 34676.4566, "train_samples_per_second": 12.57, "train_steps_per_second": 3.143 } ], "logging_steps": 5, "max_steps": 108980, "num_input_tokens_seen": 235044016, "num_train_epochs": 20, "save_steps": 10898, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0583925808106177e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }