diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,196384 @@ +{ + "best_global_step": 36780, + "best_metric": 0.11071006208658218, + "best_model_checkpoint": "saves_multiple/p-tuning/llama-3-8b-instruct/train_multirc_123_1764974431/checkpoint-36780", + "epoch": 20.0, + "eval_steps": 6130, + "global_step": 122600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008156606851549756, + "grad_norm": 251.15960693359375, + "learning_rate": 3.262642740619902e-07, + "loss": 11.2491, + "num_input_tokens_seen": 8928, + "step": 5 + }, + { + "epoch": 0.0016313213703099511, + "grad_norm": 111.67447662353516, + "learning_rate": 7.34094616639478e-07, + "loss": 11.1863, + "num_input_tokens_seen": 20448, + "step": 10 + }, + { + "epoch": 0.0024469820554649264, + "grad_norm": 142.75416564941406, + "learning_rate": 1.1419249592169658e-06, + "loss": 10.6092, + "num_input_tokens_seen": 31072, + "step": 15 + }, + { + "epoch": 0.0032626427406199023, + "grad_norm": 174.57147216796875, + "learning_rate": 1.5497553017944535e-06, + "loss": 9.6052, + "num_input_tokens_seen": 42528, + "step": 20 + }, + { + "epoch": 0.004078303425774877, + "grad_norm": 352.3755187988281, + "learning_rate": 1.957585644371941e-06, + "loss": 9.0336, + "num_input_tokens_seen": 52576, + "step": 25 + }, + { + "epoch": 0.004893964110929853, + "grad_norm": 124.07296752929688, + "learning_rate": 2.365415986949429e-06, + "loss": 7.8752, + "num_input_tokens_seen": 62944, + "step": 30 + }, + { + "epoch": 0.005709624796084829, + "grad_norm": 230.94424438476562, + "learning_rate": 2.7732463295269165e-06, + "loss": 6.6635, + "num_input_tokens_seen": 73504, + "step": 35 + }, + { + "epoch": 0.0065252854812398045, + "grad_norm": 75.48468017578125, + "learning_rate": 3.1810766721044044e-06, + "loss": 5.1473, + "num_input_tokens_seen": 84640, + "step": 40 + }, + { + "epoch": 0.00734094616639478, + "grad_norm": 135.2037353515625, + "learning_rate": 3.5889070146818927e-06, + "loss": 3.8638, + "num_input_tokens_seen": 96288, + "step": 45 + }, + { + "epoch": 0.008156606851549755, + "grad_norm": 101.66092681884766, + "learning_rate": 3.99673735725938e-06, + "loss": 3.1409, + "num_input_tokens_seen": 107360, + "step": 50 + }, + { + "epoch": 0.00897226753670473, + "grad_norm": 151.60845947265625, + "learning_rate": 4.404567699836868e-06, + "loss": 2.1487, + "num_input_tokens_seen": 118432, + "step": 55 + }, + { + "epoch": 0.009787928221859706, + "grad_norm": 128.37997436523438, + "learning_rate": 4.812398042414356e-06, + "loss": 1.5323, + "num_input_tokens_seen": 128416, + "step": 60 + }, + { + "epoch": 0.010603588907014683, + "grad_norm": 106.93790435791016, + "learning_rate": 5.2202283849918435e-06, + "loss": 1.1476, + "num_input_tokens_seen": 138432, + "step": 65 + }, + { + "epoch": 0.011419249592169658, + "grad_norm": 187.4431915283203, + "learning_rate": 5.628058727569331e-06, + "loss": 0.8388, + "num_input_tokens_seen": 149504, + "step": 70 + }, + { + "epoch": 0.012234910277324634, + "grad_norm": 46.7535285949707, + "learning_rate": 6.035889070146819e-06, + "loss": 0.6467, + "num_input_tokens_seen": 160192, + "step": 75 + }, + { + "epoch": 0.013050570962479609, + "grad_norm": 144.10482788085938, + "learning_rate": 6.443719412724307e-06, + "loss": 0.4727, + "num_input_tokens_seen": 170432, + "step": 80 + }, + { + "epoch": 0.013866231647634585, + "grad_norm": 52.891971588134766, + "learning_rate": 6.851549755301794e-06, + "loss": 0.5673, + "num_input_tokens_seen": 180672, + "step": 85 + }, + { + "epoch": 0.01468189233278956, + "grad_norm": 16.148460388183594, + "learning_rate": 7.2593800978792825e-06, + "loss": 0.4971, + "num_input_tokens_seen": 192352, + "step": 90 + }, + { + "epoch": 0.015497553017944535, + "grad_norm": 106.64603424072266, + "learning_rate": 7.66721044045677e-06, + "loss": 0.449, + "num_input_tokens_seen": 203040, + "step": 95 + }, + { + "epoch": 0.01631321370309951, + "grad_norm": 34.131343841552734, + "learning_rate": 8.075040783034257e-06, + "loss": 0.6507, + "num_input_tokens_seen": 214272, + "step": 100 + }, + { + "epoch": 0.017128874388254486, + "grad_norm": 160.18325805664062, + "learning_rate": 8.482871125611746e-06, + "loss": 0.6407, + "num_input_tokens_seen": 225664, + "step": 105 + }, + { + "epoch": 0.01794453507340946, + "grad_norm": 32.64995193481445, + "learning_rate": 8.890701468189234e-06, + "loss": 1.7699, + "num_input_tokens_seen": 237216, + "step": 110 + }, + { + "epoch": 0.018760195758564437, + "grad_norm": 41.17549514770508, + "learning_rate": 9.298531810766722e-06, + "loss": 0.472, + "num_input_tokens_seen": 246560, + "step": 115 + }, + { + "epoch": 0.01957585644371941, + "grad_norm": 933.0833740234375, + "learning_rate": 9.706362153344209e-06, + "loss": 1.9404, + "num_input_tokens_seen": 257184, + "step": 120 + }, + { + "epoch": 0.020391517128874388, + "grad_norm": 185.07606506347656, + "learning_rate": 1.0114192495921697e-05, + "loss": 0.7782, + "num_input_tokens_seen": 268384, + "step": 125 + }, + { + "epoch": 0.021207177814029365, + "grad_norm": 14.245857238769531, + "learning_rate": 1.0522022838499184e-05, + "loss": 0.5652, + "num_input_tokens_seen": 279680, + "step": 130 + }, + { + "epoch": 0.02202283849918434, + "grad_norm": 27.22962188720703, + "learning_rate": 1.0929853181076672e-05, + "loss": 0.4006, + "num_input_tokens_seen": 290688, + "step": 135 + }, + { + "epoch": 0.022838499184339316, + "grad_norm": 117.80506134033203, + "learning_rate": 1.1337683523654159e-05, + "loss": 0.5573, + "num_input_tokens_seen": 301408, + "step": 140 + }, + { + "epoch": 0.02365415986949429, + "grad_norm": 149.01083374023438, + "learning_rate": 1.1745513866231649e-05, + "loss": 0.6893, + "num_input_tokens_seen": 311968, + "step": 145 + }, + { + "epoch": 0.024469820554649267, + "grad_norm": 24.81130027770996, + "learning_rate": 1.2153344208809135e-05, + "loss": 0.3935, + "num_input_tokens_seen": 322080, + "step": 150 + }, + { + "epoch": 0.02528548123980424, + "grad_norm": 11.513254165649414, + "learning_rate": 1.2561174551386624e-05, + "loss": 0.4692, + "num_input_tokens_seen": 333824, + "step": 155 + }, + { + "epoch": 0.026101141924959218, + "grad_norm": 36.972740173339844, + "learning_rate": 1.296900489396411e-05, + "loss": 0.3643, + "num_input_tokens_seen": 344896, + "step": 160 + }, + { + "epoch": 0.026916802610114192, + "grad_norm": 33.10696792602539, + "learning_rate": 1.3376835236541599e-05, + "loss": 0.4509, + "num_input_tokens_seen": 356192, + "step": 165 + }, + { + "epoch": 0.02773246329526917, + "grad_norm": 1314.303466796875, + "learning_rate": 1.3784665579119085e-05, + "loss": 1.2467, + "num_input_tokens_seen": 367136, + "step": 170 + }, + { + "epoch": 0.028548123980424143, + "grad_norm": 25.855735778808594, + "learning_rate": 1.4192495921696575e-05, + "loss": 1.1451, + "num_input_tokens_seen": 376832, + "step": 175 + }, + { + "epoch": 0.02936378466557912, + "grad_norm": 123.07361602783203, + "learning_rate": 1.4600326264274062e-05, + "loss": 0.4904, + "num_input_tokens_seen": 387552, + "step": 180 + }, + { + "epoch": 0.030179445350734094, + "grad_norm": 39.203365325927734, + "learning_rate": 1.500815660685155e-05, + "loss": 0.4395, + "num_input_tokens_seen": 398528, + "step": 185 + }, + { + "epoch": 0.03099510603588907, + "grad_norm": 20.756071090698242, + "learning_rate": 1.5415986949429037e-05, + "loss": 0.4019, + "num_input_tokens_seen": 408896, + "step": 190 + }, + { + "epoch": 0.03181076672104405, + "grad_norm": 18.606752395629883, + "learning_rate": 1.5823817292006523e-05, + "loss": 0.328, + "num_input_tokens_seen": 420416, + "step": 195 + }, + { + "epoch": 0.03262642740619902, + "grad_norm": 26.72914695739746, + "learning_rate": 1.6231647634584013e-05, + "loss": 0.3512, + "num_input_tokens_seen": 430912, + "step": 200 + }, + { + "epoch": 0.033442088091353996, + "grad_norm": 12.489819526672363, + "learning_rate": 1.66394779771615e-05, + "loss": 0.4811, + "num_input_tokens_seen": 442176, + "step": 205 + }, + { + "epoch": 0.03425774877650897, + "grad_norm": 19.206298828125, + "learning_rate": 1.704730831973899e-05, + "loss": 0.4251, + "num_input_tokens_seen": 454496, + "step": 210 + }, + { + "epoch": 0.03507340946166395, + "grad_norm": 16.998523712158203, + "learning_rate": 1.7455138662316477e-05, + "loss": 0.3833, + "num_input_tokens_seen": 464896, + "step": 215 + }, + { + "epoch": 0.03588907014681892, + "grad_norm": 14.737927436828613, + "learning_rate": 1.7862969004893963e-05, + "loss": 0.473, + "num_input_tokens_seen": 477024, + "step": 220 + }, + { + "epoch": 0.0367047308319739, + "grad_norm": 13.0357027053833, + "learning_rate": 1.8270799347471453e-05, + "loss": 0.5315, + "num_input_tokens_seen": 487776, + "step": 225 + }, + { + "epoch": 0.037520391517128875, + "grad_norm": 12.749910354614258, + "learning_rate": 1.867862969004894e-05, + "loss": 0.2705, + "num_input_tokens_seen": 498528, + "step": 230 + }, + { + "epoch": 0.03833605220228385, + "grad_norm": 43.4488525390625, + "learning_rate": 1.908646003262643e-05, + "loss": 0.7499, + "num_input_tokens_seen": 509184, + "step": 235 + }, + { + "epoch": 0.03915171288743882, + "grad_norm": 8.351309776306152, + "learning_rate": 1.9494290375203913e-05, + "loss": 0.4334, + "num_input_tokens_seen": 520224, + "step": 240 + }, + { + "epoch": 0.0399673735725938, + "grad_norm": 12.115440368652344, + "learning_rate": 1.9902120717781403e-05, + "loss": 0.4105, + "num_input_tokens_seen": 530464, + "step": 245 + }, + { + "epoch": 0.040783034257748776, + "grad_norm": 52.79171371459961, + "learning_rate": 2.0309951060358893e-05, + "loss": 0.4232, + "num_input_tokens_seen": 541952, + "step": 250 + }, + { + "epoch": 0.041598694942903754, + "grad_norm": 18.117061614990234, + "learning_rate": 2.071778140293638e-05, + "loss": 0.3747, + "num_input_tokens_seen": 551424, + "step": 255 + }, + { + "epoch": 0.04241435562805873, + "grad_norm": 11.777599334716797, + "learning_rate": 2.1125611745513866e-05, + "loss": 0.329, + "num_input_tokens_seen": 561440, + "step": 260 + }, + { + "epoch": 0.0432300163132137, + "grad_norm": 5.754439353942871, + "learning_rate": 2.1533442088091353e-05, + "loss": 0.4069, + "num_input_tokens_seen": 573440, + "step": 265 + }, + { + "epoch": 0.04404567699836868, + "grad_norm": 7.56231689453125, + "learning_rate": 2.1941272430668843e-05, + "loss": 0.3663, + "num_input_tokens_seen": 584032, + "step": 270 + }, + { + "epoch": 0.044861337683523655, + "grad_norm": 11.69233512878418, + "learning_rate": 2.234910277324633e-05, + "loss": 0.3925, + "num_input_tokens_seen": 595328, + "step": 275 + }, + { + "epoch": 0.04567699836867863, + "grad_norm": 3.8412091732025146, + "learning_rate": 2.2756933115823816e-05, + "loss": 0.379, + "num_input_tokens_seen": 605760, + "step": 280 + }, + { + "epoch": 0.0464926590538336, + "grad_norm": 6.388465404510498, + "learning_rate": 2.3164763458401306e-05, + "loss": 0.324, + "num_input_tokens_seen": 616384, + "step": 285 + }, + { + "epoch": 0.04730831973898858, + "grad_norm": 20.283143997192383, + "learning_rate": 2.3572593800978793e-05, + "loss": 0.4937, + "num_input_tokens_seen": 626880, + "step": 290 + }, + { + "epoch": 0.04812398042414356, + "grad_norm": 18.39556884765625, + "learning_rate": 2.3980424143556283e-05, + "loss": 0.3861, + "num_input_tokens_seen": 637536, + "step": 295 + }, + { + "epoch": 0.048939641109298535, + "grad_norm": 9.842692375183105, + "learning_rate": 2.4388254486133766e-05, + "loss": 0.3908, + "num_input_tokens_seen": 648000, + "step": 300 + }, + { + "epoch": 0.049755301794453505, + "grad_norm": 11.340023040771484, + "learning_rate": 2.4796084828711256e-05, + "loss": 0.3919, + "num_input_tokens_seen": 659648, + "step": 305 + }, + { + "epoch": 0.05057096247960848, + "grad_norm": 9.721641540527344, + "learning_rate": 2.5203915171288743e-05, + "loss": 0.3985, + "num_input_tokens_seen": 669920, + "step": 310 + }, + { + "epoch": 0.05138662316476346, + "grad_norm": 4.567502498626709, + "learning_rate": 2.5611745513866233e-05, + "loss": 0.3636, + "num_input_tokens_seen": 681216, + "step": 315 + }, + { + "epoch": 0.052202283849918436, + "grad_norm": 2.6591572761535645, + "learning_rate": 2.6019575856443723e-05, + "loss": 0.3458, + "num_input_tokens_seen": 691872, + "step": 320 + }, + { + "epoch": 0.05301794453507341, + "grad_norm": 4.603590488433838, + "learning_rate": 2.6427406199021206e-05, + "loss": 0.3619, + "num_input_tokens_seen": 702208, + "step": 325 + }, + { + "epoch": 0.053833605220228384, + "grad_norm": 31.67989730834961, + "learning_rate": 2.6835236541598696e-05, + "loss": 0.4057, + "num_input_tokens_seen": 712288, + "step": 330 + }, + { + "epoch": 0.05464926590538336, + "grad_norm": 2.3346760272979736, + "learning_rate": 2.7243066884176183e-05, + "loss": 0.4857, + "num_input_tokens_seen": 722976, + "step": 335 + }, + { + "epoch": 0.05546492659053834, + "grad_norm": 5.371507167816162, + "learning_rate": 2.7650897226753673e-05, + "loss": 0.3786, + "num_input_tokens_seen": 733760, + "step": 340 + }, + { + "epoch": 0.05628058727569331, + "grad_norm": 2.8156282901763916, + "learning_rate": 2.805872756933116e-05, + "loss": 0.3661, + "num_input_tokens_seen": 743520, + "step": 345 + }, + { + "epoch": 0.057096247960848286, + "grad_norm": 4.485386848449707, + "learning_rate": 2.8466557911908646e-05, + "loss": 0.2988, + "num_input_tokens_seen": 753984, + "step": 350 + }, + { + "epoch": 0.05791190864600326, + "grad_norm": 1.7257930040359497, + "learning_rate": 2.8874388254486136e-05, + "loss": 0.3364, + "num_input_tokens_seen": 765376, + "step": 355 + }, + { + "epoch": 0.05872756933115824, + "grad_norm": 1.9125670194625854, + "learning_rate": 2.9282218597063623e-05, + "loss": 0.3801, + "num_input_tokens_seen": 775904, + "step": 360 + }, + { + "epoch": 0.05954323001631321, + "grad_norm": 6.248146057128906, + "learning_rate": 2.969004893964111e-05, + "loss": 0.5768, + "num_input_tokens_seen": 788000, + "step": 365 + }, + { + "epoch": 0.06035889070146819, + "grad_norm": 8.308306694030762, + "learning_rate": 3.0097879282218596e-05, + "loss": 0.4294, + "num_input_tokens_seen": 798816, + "step": 370 + }, + { + "epoch": 0.061174551386623165, + "grad_norm": 3.773549795150757, + "learning_rate": 3.0505709624796086e-05, + "loss": 0.3048, + "num_input_tokens_seen": 809856, + "step": 375 + }, + { + "epoch": 0.06199021207177814, + "grad_norm": 1.9782823324203491, + "learning_rate": 3.0913539967373576e-05, + "loss": 0.3589, + "num_input_tokens_seen": 820448, + "step": 380 + }, + { + "epoch": 0.06280587275693311, + "grad_norm": 2.147299289703369, + "learning_rate": 3.132137030995106e-05, + "loss": 0.3359, + "num_input_tokens_seen": 830624, + "step": 385 + }, + { + "epoch": 0.0636215334420881, + "grad_norm": 7.291934967041016, + "learning_rate": 3.172920065252855e-05, + "loss": 0.3737, + "num_input_tokens_seen": 840224, + "step": 390 + }, + { + "epoch": 0.06443719412724307, + "grad_norm": 3.42254638671875, + "learning_rate": 3.213703099510604e-05, + "loss": 0.365, + "num_input_tokens_seen": 850176, + "step": 395 + }, + { + "epoch": 0.06525285481239804, + "grad_norm": 3.277561664581299, + "learning_rate": 3.254486133768352e-05, + "loss": 0.3987, + "num_input_tokens_seen": 861056, + "step": 400 + }, + { + "epoch": 0.06606851549755302, + "grad_norm": 6.280937671661377, + "learning_rate": 3.295269168026101e-05, + "loss": 0.3561, + "num_input_tokens_seen": 871360, + "step": 405 + }, + { + "epoch": 0.06688417618270799, + "grad_norm": 3.0383718013763428, + "learning_rate": 3.33605220228385e-05, + "loss": 0.3441, + "num_input_tokens_seen": 882656, + "step": 410 + }, + { + "epoch": 0.06769983686786298, + "grad_norm": 3.4604156017303467, + "learning_rate": 3.3768352365415986e-05, + "loss": 0.3894, + "num_input_tokens_seen": 893952, + "step": 415 + }, + { + "epoch": 0.06851549755301795, + "grad_norm": 2.6101272106170654, + "learning_rate": 3.4176182707993476e-05, + "loss": 0.4019, + "num_input_tokens_seen": 905472, + "step": 420 + }, + { + "epoch": 0.06933115823817292, + "grad_norm": 1.728624939918518, + "learning_rate": 3.458401305057096e-05, + "loss": 0.3495, + "num_input_tokens_seen": 916000, + "step": 425 + }, + { + "epoch": 0.0701468189233279, + "grad_norm": 3.064460039138794, + "learning_rate": 3.4991843393148456e-05, + "loss": 0.3383, + "num_input_tokens_seen": 926944, + "step": 430 + }, + { + "epoch": 0.07096247960848287, + "grad_norm": 42.99164962768555, + "learning_rate": 3.539967373572594e-05, + "loss": 0.3808, + "num_input_tokens_seen": 938496, + "step": 435 + }, + { + "epoch": 0.07177814029363784, + "grad_norm": 3.0072669982910156, + "learning_rate": 3.580750407830342e-05, + "loss": 0.3772, + "num_input_tokens_seen": 949472, + "step": 440 + }, + { + "epoch": 0.07259380097879282, + "grad_norm": 8.388079643249512, + "learning_rate": 3.621533442088092e-05, + "loss": 0.3442, + "num_input_tokens_seen": 959712, + "step": 445 + }, + { + "epoch": 0.0734094616639478, + "grad_norm": 2.4625377655029297, + "learning_rate": 3.66231647634584e-05, + "loss": 0.3541, + "num_input_tokens_seen": 969888, + "step": 450 + }, + { + "epoch": 0.07422512234910278, + "grad_norm": 2.3994970321655273, + "learning_rate": 3.703099510603589e-05, + "loss": 0.3529, + "num_input_tokens_seen": 981024, + "step": 455 + }, + { + "epoch": 0.07504078303425775, + "grad_norm": 1.599547266960144, + "learning_rate": 3.7438825448613375e-05, + "loss": 0.4738, + "num_input_tokens_seen": 991680, + "step": 460 + }, + { + "epoch": 0.07585644371941272, + "grad_norm": 1.4611376523971558, + "learning_rate": 3.7846655791190865e-05, + "loss": 0.3719, + "num_input_tokens_seen": 1003008, + "step": 465 + }, + { + "epoch": 0.0766721044045677, + "grad_norm": 1.614911437034607, + "learning_rate": 3.8254486133768355e-05, + "loss": 0.3604, + "num_input_tokens_seen": 1014336, + "step": 470 + }, + { + "epoch": 0.07748776508972267, + "grad_norm": 5.341881275177002, + "learning_rate": 3.866231647634584e-05, + "loss": 0.3574, + "num_input_tokens_seen": 1024960, + "step": 475 + }, + { + "epoch": 0.07830342577487764, + "grad_norm": 2.061347246170044, + "learning_rate": 3.907014681892333e-05, + "loss": 0.3317, + "num_input_tokens_seen": 1034880, + "step": 480 + }, + { + "epoch": 0.07911908646003263, + "grad_norm": 3.665827751159668, + "learning_rate": 3.947797716150082e-05, + "loss": 0.3804, + "num_input_tokens_seen": 1046112, + "step": 485 + }, + { + "epoch": 0.0799347471451876, + "grad_norm": 12.822347640991211, + "learning_rate": 3.98858075040783e-05, + "loss": 0.4257, + "num_input_tokens_seen": 1056864, + "step": 490 + }, + { + "epoch": 0.08075040783034258, + "grad_norm": 7.721891403198242, + "learning_rate": 4.029363784665579e-05, + "loss": 0.3662, + "num_input_tokens_seen": 1067072, + "step": 495 + }, + { + "epoch": 0.08156606851549755, + "grad_norm": 2.3584465980529785, + "learning_rate": 4.070146818923328e-05, + "loss": 0.4476, + "num_input_tokens_seen": 1079136, + "step": 500 + }, + { + "epoch": 0.08238172920065252, + "grad_norm": 2.4510133266448975, + "learning_rate": 4.1109298531810765e-05, + "loss": 0.3606, + "num_input_tokens_seen": 1089408, + "step": 505 + }, + { + "epoch": 0.08319738988580751, + "grad_norm": 1.2990319728851318, + "learning_rate": 4.1517128874388255e-05, + "loss": 0.3348, + "num_input_tokens_seen": 1099488, + "step": 510 + }, + { + "epoch": 0.08401305057096248, + "grad_norm": 3.786846399307251, + "learning_rate": 4.1924959216965745e-05, + "loss": 0.3743, + "num_input_tokens_seen": 1110464, + "step": 515 + }, + { + "epoch": 0.08482871125611746, + "grad_norm": 1.5801750421524048, + "learning_rate": 4.233278955954323e-05, + "loss": 0.3635, + "num_input_tokens_seen": 1121216, + "step": 520 + }, + { + "epoch": 0.08564437194127243, + "grad_norm": 4.429471015930176, + "learning_rate": 4.274061990212072e-05, + "loss": 0.353, + "num_input_tokens_seen": 1132192, + "step": 525 + }, + { + "epoch": 0.0864600326264274, + "grad_norm": 2.704030990600586, + "learning_rate": 4.314845024469821e-05, + "loss": 0.3785, + "num_input_tokens_seen": 1142944, + "step": 530 + }, + { + "epoch": 0.08727569331158239, + "grad_norm": 3.279242753982544, + "learning_rate": 4.35562805872757e-05, + "loss": 0.355, + "num_input_tokens_seen": 1154080, + "step": 535 + }, + { + "epoch": 0.08809135399673736, + "grad_norm": 2.517383337020874, + "learning_rate": 4.396411092985318e-05, + "loss": 0.3505, + "num_input_tokens_seen": 1165120, + "step": 540 + }, + { + "epoch": 0.08890701468189233, + "grad_norm": 2.494194507598877, + "learning_rate": 4.4371941272430665e-05, + "loss": 0.3721, + "num_input_tokens_seen": 1176352, + "step": 545 + }, + { + "epoch": 0.08972267536704731, + "grad_norm": 1.5179803371429443, + "learning_rate": 4.477977161500816e-05, + "loss": 0.3556, + "num_input_tokens_seen": 1188288, + "step": 550 + }, + { + "epoch": 0.09053833605220228, + "grad_norm": 6.280685901641846, + "learning_rate": 4.5187601957585645e-05, + "loss": 0.3533, + "num_input_tokens_seen": 1200800, + "step": 555 + }, + { + "epoch": 0.09135399673735727, + "grad_norm": 1.5632981061935425, + "learning_rate": 4.559543230016313e-05, + "loss": 0.346, + "num_input_tokens_seen": 1211360, + "step": 560 + }, + { + "epoch": 0.09216965742251224, + "grad_norm": 2.2686402797698975, + "learning_rate": 4.6003262642740625e-05, + "loss": 0.336, + "num_input_tokens_seen": 1221184, + "step": 565 + }, + { + "epoch": 0.0929853181076672, + "grad_norm": 4.302751541137695, + "learning_rate": 4.641109298531811e-05, + "loss": 0.3667, + "num_input_tokens_seen": 1231808, + "step": 570 + }, + { + "epoch": 0.09380097879282219, + "grad_norm": 3.876094102859497, + "learning_rate": 4.68189233278956e-05, + "loss": 0.3685, + "num_input_tokens_seen": 1242560, + "step": 575 + }, + { + "epoch": 0.09461663947797716, + "grad_norm": 0.8888449668884277, + "learning_rate": 4.722675367047308e-05, + "loss": 0.3717, + "num_input_tokens_seen": 1253440, + "step": 580 + }, + { + "epoch": 0.09543230016313213, + "grad_norm": 0.6884282827377319, + "learning_rate": 4.763458401305057e-05, + "loss": 0.3487, + "num_input_tokens_seen": 1263616, + "step": 585 + }, + { + "epoch": 0.09624796084828711, + "grad_norm": 1.8834329843521118, + "learning_rate": 4.804241435562806e-05, + "loss": 0.3437, + "num_input_tokens_seen": 1275488, + "step": 590 + }, + { + "epoch": 0.09706362153344208, + "grad_norm": 2.1269099712371826, + "learning_rate": 4.8450244698205544e-05, + "loss": 0.3447, + "num_input_tokens_seen": 1287200, + "step": 595 + }, + { + "epoch": 0.09787928221859707, + "grad_norm": 1.8290114402770996, + "learning_rate": 4.885807504078304e-05, + "loss": 0.3846, + "num_input_tokens_seen": 1298336, + "step": 600 + }, + { + "epoch": 0.09869494290375204, + "grad_norm": 2.6021039485931396, + "learning_rate": 4.9265905383360524e-05, + "loss": 0.3702, + "num_input_tokens_seen": 1309056, + "step": 605 + }, + { + "epoch": 0.09951060358890701, + "grad_norm": 2.3401405811309814, + "learning_rate": 4.967373572593801e-05, + "loss": 0.3452, + "num_input_tokens_seen": 1320864, + "step": 610 + }, + { + "epoch": 0.100326264274062, + "grad_norm": 6.006040096282959, + "learning_rate": 5.00815660685155e-05, + "loss": 0.3778, + "num_input_tokens_seen": 1332448, + "step": 615 + }, + { + "epoch": 0.10114192495921696, + "grad_norm": 0.4541068375110626, + "learning_rate": 5.048939641109299e-05, + "loss": 0.3658, + "num_input_tokens_seen": 1342368, + "step": 620 + }, + { + "epoch": 0.10195758564437195, + "grad_norm": 2.084834337234497, + "learning_rate": 5.089722675367047e-05, + "loss": 0.3625, + "num_input_tokens_seen": 1353024, + "step": 625 + }, + { + "epoch": 0.10277324632952692, + "grad_norm": 1.032232403755188, + "learning_rate": 5.130505709624796e-05, + "loss": 0.3336, + "num_input_tokens_seen": 1364000, + "step": 630 + }, + { + "epoch": 0.10358890701468189, + "grad_norm": 3.1807632446289062, + "learning_rate": 5.171288743882545e-05, + "loss": 0.3577, + "num_input_tokens_seen": 1374784, + "step": 635 + }, + { + "epoch": 0.10440456769983687, + "grad_norm": 4.449918270111084, + "learning_rate": 5.212071778140294e-05, + "loss": 0.2796, + "num_input_tokens_seen": 1384192, + "step": 640 + }, + { + "epoch": 0.10522022838499184, + "grad_norm": 5.02224588394165, + "learning_rate": 5.2528548123980424e-05, + "loss": 0.4753, + "num_input_tokens_seen": 1395392, + "step": 645 + }, + { + "epoch": 0.10603588907014681, + "grad_norm": 2.447105884552002, + "learning_rate": 5.293637846655791e-05, + "loss": 0.3355, + "num_input_tokens_seen": 1406208, + "step": 650 + }, + { + "epoch": 0.1068515497553018, + "grad_norm": 4.51138162612915, + "learning_rate": 5.3344208809135404e-05, + "loss": 0.3551, + "num_input_tokens_seen": 1417024, + "step": 655 + }, + { + "epoch": 0.10766721044045677, + "grad_norm": 2.1552836894989014, + "learning_rate": 5.375203915171289e-05, + "loss": 0.3592, + "num_input_tokens_seen": 1427808, + "step": 660 + }, + { + "epoch": 0.10848287112561175, + "grad_norm": 3.184595823287964, + "learning_rate": 5.415986949429037e-05, + "loss": 0.3393, + "num_input_tokens_seen": 1437344, + "step": 665 + }, + { + "epoch": 0.10929853181076672, + "grad_norm": 0.8752344846725464, + "learning_rate": 5.456769983686787e-05, + "loss": 0.3558, + "num_input_tokens_seen": 1448352, + "step": 670 + }, + { + "epoch": 0.11011419249592169, + "grad_norm": 4.0038743019104, + "learning_rate": 5.497553017944535e-05, + "loss": 0.3583, + "num_input_tokens_seen": 1459520, + "step": 675 + }, + { + "epoch": 0.11092985318107668, + "grad_norm": 22.788026809692383, + "learning_rate": 5.538336052202284e-05, + "loss": 3.7669, + "num_input_tokens_seen": 1470432, + "step": 680 + }, + { + "epoch": 0.11174551386623165, + "grad_norm": 17.497343063354492, + "learning_rate": 5.579119086460033e-05, + "loss": 4.2486, + "num_input_tokens_seen": 1480544, + "step": 685 + }, + { + "epoch": 0.11256117455138662, + "grad_norm": 11.30042552947998, + "learning_rate": 5.6199021207177814e-05, + "loss": 1.5352, + "num_input_tokens_seen": 1490752, + "step": 690 + }, + { + "epoch": 0.1133768352365416, + "grad_norm": 18.100345611572266, + "learning_rate": 5.6606851549755304e-05, + "loss": 0.6047, + "num_input_tokens_seen": 1502240, + "step": 695 + }, + { + "epoch": 0.11419249592169657, + "grad_norm": 2.8180792331695557, + "learning_rate": 5.701468189233279e-05, + "loss": 0.5049, + "num_input_tokens_seen": 1512256, + "step": 700 + }, + { + "epoch": 0.11500815660685156, + "grad_norm": 15.969682693481445, + "learning_rate": 5.7422512234910284e-05, + "loss": 0.495, + "num_input_tokens_seen": 1523168, + "step": 705 + }, + { + "epoch": 0.11582381729200653, + "grad_norm": 19.627962112426758, + "learning_rate": 5.783034257748777e-05, + "loss": 0.4133, + "num_input_tokens_seen": 1534976, + "step": 710 + }, + { + "epoch": 0.1166394779771615, + "grad_norm": 6.401419639587402, + "learning_rate": 5.823817292006525e-05, + "loss": 0.6107, + "num_input_tokens_seen": 1545856, + "step": 715 + }, + { + "epoch": 0.11745513866231648, + "grad_norm": 1.514564037322998, + "learning_rate": 5.864600326264275e-05, + "loss": 0.3392, + "num_input_tokens_seen": 1555680, + "step": 720 + }, + { + "epoch": 0.11827079934747145, + "grad_norm": 2.0046682357788086, + "learning_rate": 5.905383360522023e-05, + "loss": 0.3058, + "num_input_tokens_seen": 1567328, + "step": 725 + }, + { + "epoch": 0.11908646003262642, + "grad_norm": 4.961424827575684, + "learning_rate": 5.9461663947797714e-05, + "loss": 0.6076, + "num_input_tokens_seen": 1577184, + "step": 730 + }, + { + "epoch": 0.1199021207177814, + "grad_norm": 10.483532905578613, + "learning_rate": 5.9869494290375204e-05, + "loss": 0.4824, + "num_input_tokens_seen": 1588544, + "step": 735 + }, + { + "epoch": 0.12071778140293637, + "grad_norm": 2.3520166873931885, + "learning_rate": 6.0277324632952694e-05, + "loss": 0.4605, + "num_input_tokens_seen": 1599648, + "step": 740 + }, + { + "epoch": 0.12153344208809136, + "grad_norm": 5.859687328338623, + "learning_rate": 6.0685154975530184e-05, + "loss": 0.4401, + "num_input_tokens_seen": 1612096, + "step": 745 + }, + { + "epoch": 0.12234910277324633, + "grad_norm": 3.8298563957214355, + "learning_rate": 6.109298531810767e-05, + "loss": 0.4431, + "num_input_tokens_seen": 1623584, + "step": 750 + }, + { + "epoch": 0.1231647634584013, + "grad_norm": 1.8206467628479004, + "learning_rate": 6.150081566068516e-05, + "loss": 0.3903, + "num_input_tokens_seen": 1634688, + "step": 755 + }, + { + "epoch": 0.12398042414355628, + "grad_norm": 3.001570224761963, + "learning_rate": 6.190864600326265e-05, + "loss": 0.3624, + "num_input_tokens_seen": 1645600, + "step": 760 + }, + { + "epoch": 0.12479608482871125, + "grad_norm": 1.0341628789901733, + "learning_rate": 6.231647634584014e-05, + "loss": 0.3552, + "num_input_tokens_seen": 1655968, + "step": 765 + }, + { + "epoch": 0.12561174551386622, + "grad_norm": 3.0903894901275635, + "learning_rate": 6.272430668841763e-05, + "loss": 0.4151, + "num_input_tokens_seen": 1666848, + "step": 770 + }, + { + "epoch": 0.1264274061990212, + "grad_norm": 1.0071330070495605, + "learning_rate": 6.31321370309951e-05, + "loss": 0.5799, + "num_input_tokens_seen": 1676512, + "step": 775 + }, + { + "epoch": 0.1272430668841762, + "grad_norm": 0.8469381332397461, + "learning_rate": 6.35399673735726e-05, + "loss": 0.3397, + "num_input_tokens_seen": 1687488, + "step": 780 + }, + { + "epoch": 0.12805872756933115, + "grad_norm": 2.6776137351989746, + "learning_rate": 6.394779771615008e-05, + "loss": 0.4768, + "num_input_tokens_seen": 1697312, + "step": 785 + }, + { + "epoch": 0.12887438825448613, + "grad_norm": 2.523658514022827, + "learning_rate": 6.435562805872756e-05, + "loss": 0.37, + "num_input_tokens_seen": 1708096, + "step": 790 + }, + { + "epoch": 0.12969004893964112, + "grad_norm": 2.922795534133911, + "learning_rate": 6.476345840130505e-05, + "loss": 0.4222, + "num_input_tokens_seen": 1717824, + "step": 795 + }, + { + "epoch": 0.13050570962479607, + "grad_norm": 2.616544008255005, + "learning_rate": 6.517128874388255e-05, + "loss": 0.3818, + "num_input_tokens_seen": 1728608, + "step": 800 + }, + { + "epoch": 0.13132137030995106, + "grad_norm": 1.1942052841186523, + "learning_rate": 6.557911908646004e-05, + "loss": 0.3884, + "num_input_tokens_seen": 1739456, + "step": 805 + }, + { + "epoch": 0.13213703099510604, + "grad_norm": 6.790151119232178, + "learning_rate": 6.598694942903752e-05, + "loss": 0.401, + "num_input_tokens_seen": 1750304, + "step": 810 + }, + { + "epoch": 0.132952691680261, + "grad_norm": 8.127601623535156, + "learning_rate": 6.639477977161501e-05, + "loss": 0.3575, + "num_input_tokens_seen": 1762496, + "step": 815 + }, + { + "epoch": 0.13376835236541598, + "grad_norm": 3.7575249671936035, + "learning_rate": 6.68026101141925e-05, + "loss": 0.4206, + "num_input_tokens_seen": 1772640, + "step": 820 + }, + { + "epoch": 0.13458401305057097, + "grad_norm": 3.6937663555145264, + "learning_rate": 6.721044045676998e-05, + "loss": 0.326, + "num_input_tokens_seen": 1782272, + "step": 825 + }, + { + "epoch": 0.13539967373572595, + "grad_norm": 7.329087257385254, + "learning_rate": 6.761827079934747e-05, + "loss": 0.4601, + "num_input_tokens_seen": 1793120, + "step": 830 + }, + { + "epoch": 0.1362153344208809, + "grad_norm": 3.4597127437591553, + "learning_rate": 6.802610114192497e-05, + "loss": 0.3631, + "num_input_tokens_seen": 1803264, + "step": 835 + }, + { + "epoch": 0.1370309951060359, + "grad_norm": 2.523526430130005, + "learning_rate": 6.843393148450245e-05, + "loss": 0.3378, + "num_input_tokens_seen": 1813088, + "step": 840 + }, + { + "epoch": 0.13784665579119088, + "grad_norm": 0.8580302596092224, + "learning_rate": 6.884176182707994e-05, + "loss": 0.3911, + "num_input_tokens_seen": 1824032, + "step": 845 + }, + { + "epoch": 0.13866231647634583, + "grad_norm": 0.7191941738128662, + "learning_rate": 6.924959216965743e-05, + "loss": 0.3979, + "num_input_tokens_seen": 1836512, + "step": 850 + }, + { + "epoch": 0.13947797716150082, + "grad_norm": 0.6474296450614929, + "learning_rate": 6.96574225122349e-05, + "loss": 0.404, + "num_input_tokens_seen": 1848256, + "step": 855 + }, + { + "epoch": 0.1402936378466558, + "grad_norm": 1.4502009153366089, + "learning_rate": 7.006525285481239e-05, + "loss": 0.377, + "num_input_tokens_seen": 1859040, + "step": 860 + }, + { + "epoch": 0.14110929853181076, + "grad_norm": 2.782411813735962, + "learning_rate": 7.047308319738988e-05, + "loss": 0.3289, + "num_input_tokens_seen": 1869248, + "step": 865 + }, + { + "epoch": 0.14192495921696574, + "grad_norm": 2.9253158569335938, + "learning_rate": 7.088091353996739e-05, + "loss": 0.4676, + "num_input_tokens_seen": 1881184, + "step": 870 + }, + { + "epoch": 0.14274061990212072, + "grad_norm": 5.021274566650391, + "learning_rate": 7.128874388254486e-05, + "loss": 0.4003, + "num_input_tokens_seen": 1893760, + "step": 875 + }, + { + "epoch": 0.14355628058727568, + "grad_norm": 0.8828313946723938, + "learning_rate": 7.169657422512235e-05, + "loss": 0.3662, + "num_input_tokens_seen": 1904800, + "step": 880 + }, + { + "epoch": 0.14437194127243066, + "grad_norm": 3.992981433868408, + "learning_rate": 7.210440456769984e-05, + "loss": 0.3451, + "num_input_tokens_seen": 1915296, + "step": 885 + }, + { + "epoch": 0.14518760195758565, + "grad_norm": 5.543689250946045, + "learning_rate": 7.251223491027732e-05, + "loss": 0.3863, + "num_input_tokens_seen": 1925152, + "step": 890 + }, + { + "epoch": 0.14600326264274063, + "grad_norm": 2.8486008644104004, + "learning_rate": 7.292006525285481e-05, + "loss": 0.4681, + "num_input_tokens_seen": 1935200, + "step": 895 + }, + { + "epoch": 0.1468189233278956, + "grad_norm": 1.5159333944320679, + "learning_rate": 7.332789559543231e-05, + "loss": 0.3525, + "num_input_tokens_seen": 1945632, + "step": 900 + }, + { + "epoch": 0.14763458401305057, + "grad_norm": 1.958126187324524, + "learning_rate": 7.373572593800979e-05, + "loss": 0.371, + "num_input_tokens_seen": 1956992, + "step": 905 + }, + { + "epoch": 0.14845024469820556, + "grad_norm": 1.0591323375701904, + "learning_rate": 7.414355628058728e-05, + "loss": 0.3765, + "num_input_tokens_seen": 1968800, + "step": 910 + }, + { + "epoch": 0.14926590538336051, + "grad_norm": 3.3293936252593994, + "learning_rate": 7.455138662316477e-05, + "loss": 0.3954, + "num_input_tokens_seen": 1979424, + "step": 915 + }, + { + "epoch": 0.1500815660685155, + "grad_norm": 1.6851379871368408, + "learning_rate": 7.495921696574225e-05, + "loss": 0.3444, + "num_input_tokens_seen": 1990976, + "step": 920 + }, + { + "epoch": 0.15089722675367048, + "grad_norm": 2.509611129760742, + "learning_rate": 7.536704730831974e-05, + "loss": 0.4096, + "num_input_tokens_seen": 2002432, + "step": 925 + }, + { + "epoch": 0.15171288743882544, + "grad_norm": 3.9215729236602783, + "learning_rate": 7.577487765089723e-05, + "loss": 0.3729, + "num_input_tokens_seen": 2013632, + "step": 930 + }, + { + "epoch": 0.15252854812398042, + "grad_norm": 3.0131313800811768, + "learning_rate": 7.618270799347473e-05, + "loss": 0.4069, + "num_input_tokens_seen": 2024960, + "step": 935 + }, + { + "epoch": 0.1533442088091354, + "grad_norm": 1.9746767282485962, + "learning_rate": 7.65905383360522e-05, + "loss": 0.3479, + "num_input_tokens_seen": 2035712, + "step": 940 + }, + { + "epoch": 0.15415986949429036, + "grad_norm": 1.245065689086914, + "learning_rate": 7.69983686786297e-05, + "loss": 0.3399, + "num_input_tokens_seen": 2046656, + "step": 945 + }, + { + "epoch": 0.15497553017944535, + "grad_norm": 1.6873618364334106, + "learning_rate": 7.740619902120719e-05, + "loss": 0.3936, + "num_input_tokens_seen": 2058112, + "step": 950 + }, + { + "epoch": 0.15579119086460033, + "grad_norm": 1.5522501468658447, + "learning_rate": 7.781402936378466e-05, + "loss": 0.399, + "num_input_tokens_seen": 2068768, + "step": 955 + }, + { + "epoch": 0.1566068515497553, + "grad_norm": 3.1248691082000732, + "learning_rate": 7.822185970636215e-05, + "loss": 0.4012, + "num_input_tokens_seen": 2080928, + "step": 960 + }, + { + "epoch": 0.15742251223491027, + "grad_norm": 0.7277675271034241, + "learning_rate": 7.862969004893964e-05, + "loss": 0.3949, + "num_input_tokens_seen": 2090784, + "step": 965 + }, + { + "epoch": 0.15823817292006526, + "grad_norm": 0.8373827934265137, + "learning_rate": 7.903752039151713e-05, + "loss": 0.3424, + "num_input_tokens_seen": 2100416, + "step": 970 + }, + { + "epoch": 0.15905383360522024, + "grad_norm": 1.8267219066619873, + "learning_rate": 7.944535073409462e-05, + "loss": 0.3533, + "num_input_tokens_seen": 2111424, + "step": 975 + }, + { + "epoch": 0.1598694942903752, + "grad_norm": 0.7513936758041382, + "learning_rate": 7.985318107667211e-05, + "loss": 0.3353, + "num_input_tokens_seen": 2122176, + "step": 980 + }, + { + "epoch": 0.16068515497553018, + "grad_norm": 1.645971655845642, + "learning_rate": 8.026101141924959e-05, + "loss": 0.3732, + "num_input_tokens_seen": 2132928, + "step": 985 + }, + { + "epoch": 0.16150081566068517, + "grad_norm": 0.8605174422264099, + "learning_rate": 8.066884176182708e-05, + "loss": 0.4186, + "num_input_tokens_seen": 2144480, + "step": 990 + }, + { + "epoch": 0.16231647634584012, + "grad_norm": 1.2997450828552246, + "learning_rate": 8.107667210440457e-05, + "loss": 0.3588, + "num_input_tokens_seen": 2154272, + "step": 995 + }, + { + "epoch": 0.1631321370309951, + "grad_norm": 4.329593181610107, + "learning_rate": 8.148450244698205e-05, + "loss": 0.4124, + "num_input_tokens_seen": 2163936, + "step": 1000 + }, + { + "epoch": 0.1639477977161501, + "grad_norm": 1.166007161140442, + "learning_rate": 8.189233278955955e-05, + "loss": 0.447, + "num_input_tokens_seen": 2175392, + "step": 1005 + }, + { + "epoch": 0.16476345840130505, + "grad_norm": 3.236189603805542, + "learning_rate": 8.230016313213704e-05, + "loss": 0.3681, + "num_input_tokens_seen": 2185120, + "step": 1010 + }, + { + "epoch": 0.16557911908646003, + "grad_norm": 0.9860192537307739, + "learning_rate": 8.270799347471453e-05, + "loss": 0.3226, + "num_input_tokens_seen": 2196864, + "step": 1015 + }, + { + "epoch": 0.16639477977161501, + "grad_norm": 1.963640809059143, + "learning_rate": 8.3115823817292e-05, + "loss": 0.3408, + "num_input_tokens_seen": 2207744, + "step": 1020 + }, + { + "epoch": 0.16721044045676997, + "grad_norm": 0.584351122379303, + "learning_rate": 8.35236541598695e-05, + "loss": 0.3445, + "num_input_tokens_seen": 2218528, + "step": 1025 + }, + { + "epoch": 0.16802610114192496, + "grad_norm": 1.9298738241195679, + "learning_rate": 8.393148450244699e-05, + "loss": 0.3675, + "num_input_tokens_seen": 2228320, + "step": 1030 + }, + { + "epoch": 0.16884176182707994, + "grad_norm": 0.6805135011672974, + "learning_rate": 8.433931484502446e-05, + "loss": 0.3418, + "num_input_tokens_seen": 2238464, + "step": 1035 + }, + { + "epoch": 0.16965742251223492, + "grad_norm": 1.4359157085418701, + "learning_rate": 8.474714518760197e-05, + "loss": 0.3876, + "num_input_tokens_seen": 2249440, + "step": 1040 + }, + { + "epoch": 0.17047308319738988, + "grad_norm": 1.4835699796676636, + "learning_rate": 8.515497553017946e-05, + "loss": 0.3285, + "num_input_tokens_seen": 2260576, + "step": 1045 + }, + { + "epoch": 0.17128874388254486, + "grad_norm": 0.8585436940193176, + "learning_rate": 8.556280587275693e-05, + "loss": 0.3726, + "num_input_tokens_seen": 2271904, + "step": 1050 + }, + { + "epoch": 0.17210440456769985, + "grad_norm": 2.352973699569702, + "learning_rate": 8.597063621533442e-05, + "loss": 0.337, + "num_input_tokens_seen": 2281568, + "step": 1055 + }, + { + "epoch": 0.1729200652528548, + "grad_norm": 1.128648281097412, + "learning_rate": 8.637846655791191e-05, + "loss": 0.3441, + "num_input_tokens_seen": 2292704, + "step": 1060 + }, + { + "epoch": 0.1737357259380098, + "grad_norm": 0.7506831288337708, + "learning_rate": 8.678629690048939e-05, + "loss": 0.2638, + "num_input_tokens_seen": 2303904, + "step": 1065 + }, + { + "epoch": 0.17455138662316477, + "grad_norm": 1.01543390750885, + "learning_rate": 8.719412724306688e-05, + "loss": 0.288, + "num_input_tokens_seen": 2314336, + "step": 1070 + }, + { + "epoch": 0.17536704730831973, + "grad_norm": 5.3944091796875, + "learning_rate": 8.760195758564438e-05, + "loss": 0.3511, + "num_input_tokens_seen": 2325024, + "step": 1075 + }, + { + "epoch": 0.1761827079934747, + "grad_norm": 1.6179203987121582, + "learning_rate": 8.800978792822187e-05, + "loss": 0.3916, + "num_input_tokens_seen": 2336224, + "step": 1080 + }, + { + "epoch": 0.1769983686786297, + "grad_norm": 2.521864175796509, + "learning_rate": 8.841761827079935e-05, + "loss": 0.4995, + "num_input_tokens_seen": 2346880, + "step": 1085 + }, + { + "epoch": 0.17781402936378465, + "grad_norm": 1.3786225318908691, + "learning_rate": 8.882544861337684e-05, + "loss": 0.392, + "num_input_tokens_seen": 2357632, + "step": 1090 + }, + { + "epoch": 0.17862969004893964, + "grad_norm": 1.3058786392211914, + "learning_rate": 8.923327895595433e-05, + "loss": 0.3614, + "num_input_tokens_seen": 2370592, + "step": 1095 + }, + { + "epoch": 0.17944535073409462, + "grad_norm": 0.9755756855010986, + "learning_rate": 8.96411092985318e-05, + "loss": 0.3802, + "num_input_tokens_seen": 2381024, + "step": 1100 + }, + { + "epoch": 0.1802610114192496, + "grad_norm": 1.755306601524353, + "learning_rate": 9.00489396411093e-05, + "loss": 0.3776, + "num_input_tokens_seen": 2392160, + "step": 1105 + }, + { + "epoch": 0.18107667210440456, + "grad_norm": 1.9835728406906128, + "learning_rate": 9.04567699836868e-05, + "loss": 0.3415, + "num_input_tokens_seen": 2403616, + "step": 1110 + }, + { + "epoch": 0.18189233278955955, + "grad_norm": 1.071263313293457, + "learning_rate": 9.086460032626427e-05, + "loss": 0.3701, + "num_input_tokens_seen": 2416032, + "step": 1115 + }, + { + "epoch": 0.18270799347471453, + "grad_norm": 1.9002200365066528, + "learning_rate": 9.127243066884176e-05, + "loss": 0.3392, + "num_input_tokens_seen": 2426112, + "step": 1120 + }, + { + "epoch": 0.1835236541598695, + "grad_norm": 1.5399904251098633, + "learning_rate": 9.168026101141925e-05, + "loss": 0.3392, + "num_input_tokens_seen": 2437344, + "step": 1125 + }, + { + "epoch": 0.18433931484502447, + "grad_norm": 0.6316581964492798, + "learning_rate": 9.208809135399673e-05, + "loss": 0.361, + "num_input_tokens_seen": 2448672, + "step": 1130 + }, + { + "epoch": 0.18515497553017946, + "grad_norm": 1.2252036333084106, + "learning_rate": 9.249592169657422e-05, + "loss": 0.3541, + "num_input_tokens_seen": 2459552, + "step": 1135 + }, + { + "epoch": 0.1859706362153344, + "grad_norm": 1.1536774635314941, + "learning_rate": 9.290375203915171e-05, + "loss": 0.3298, + "num_input_tokens_seen": 2469184, + "step": 1140 + }, + { + "epoch": 0.1867862969004894, + "grad_norm": 1.0686466693878174, + "learning_rate": 9.33115823817292e-05, + "loss": 0.3424, + "num_input_tokens_seen": 2481120, + "step": 1145 + }, + { + "epoch": 0.18760195758564438, + "grad_norm": 1.763959288597107, + "learning_rate": 9.371941272430669e-05, + "loss": 0.3269, + "num_input_tokens_seen": 2490976, + "step": 1150 + }, + { + "epoch": 0.18841761827079934, + "grad_norm": 1.11533784866333, + "learning_rate": 9.412724306688418e-05, + "loss": 0.3486, + "num_input_tokens_seen": 2501024, + "step": 1155 + }, + { + "epoch": 0.18923327895595432, + "grad_norm": 0.9042391777038574, + "learning_rate": 9.453507340946167e-05, + "loss": 0.2907, + "num_input_tokens_seen": 2511264, + "step": 1160 + }, + { + "epoch": 0.1900489396411093, + "grad_norm": 1.2744841575622559, + "learning_rate": 9.494290375203915e-05, + "loss": 0.356, + "num_input_tokens_seen": 2521920, + "step": 1165 + }, + { + "epoch": 0.19086460032626426, + "grad_norm": 2.1233761310577393, + "learning_rate": 9.535073409461664e-05, + "loss": 0.4065, + "num_input_tokens_seen": 2533056, + "step": 1170 + }, + { + "epoch": 0.19168026101141925, + "grad_norm": 1.9429396390914917, + "learning_rate": 9.575856443719413e-05, + "loss": 0.3787, + "num_input_tokens_seen": 2544320, + "step": 1175 + }, + { + "epoch": 0.19249592169657423, + "grad_norm": 1.9294630289077759, + "learning_rate": 9.616639477977162e-05, + "loss": 0.3739, + "num_input_tokens_seen": 2555168, + "step": 1180 + }, + { + "epoch": 0.1933115823817292, + "grad_norm": 0.6923383474349976, + "learning_rate": 9.657422512234911e-05, + "loss": 0.3391, + "num_input_tokens_seen": 2566304, + "step": 1185 + }, + { + "epoch": 0.19412724306688417, + "grad_norm": 0.8450667858123779, + "learning_rate": 9.69820554649266e-05, + "loss": 0.344, + "num_input_tokens_seen": 2575936, + "step": 1190 + }, + { + "epoch": 0.19494290375203915, + "grad_norm": 1.3003705739974976, + "learning_rate": 9.738988580750407e-05, + "loss": 0.2965, + "num_input_tokens_seen": 2586528, + "step": 1195 + }, + { + "epoch": 0.19575856443719414, + "grad_norm": 0.6091299653053284, + "learning_rate": 9.779771615008156e-05, + "loss": 0.412, + "num_input_tokens_seen": 2596064, + "step": 1200 + }, + { + "epoch": 0.1965742251223491, + "grad_norm": 2.3082096576690674, + "learning_rate": 9.820554649265905e-05, + "loss": 0.3257, + "num_input_tokens_seen": 2606464, + "step": 1205 + }, + { + "epoch": 0.19738988580750408, + "grad_norm": 0.7262064218521118, + "learning_rate": 9.861337683523653e-05, + "loss": 0.4696, + "num_input_tokens_seen": 2617568, + "step": 1210 + }, + { + "epoch": 0.19820554649265906, + "grad_norm": 0.4807187020778656, + "learning_rate": 9.902120717781403e-05, + "loss": 0.2929, + "num_input_tokens_seen": 2628928, + "step": 1215 + }, + { + "epoch": 0.19902120717781402, + "grad_norm": 2.5533368587493896, + "learning_rate": 9.942903752039152e-05, + "loss": 0.3284, + "num_input_tokens_seen": 2641344, + "step": 1220 + }, + { + "epoch": 0.199836867862969, + "grad_norm": 0.6028878092765808, + "learning_rate": 9.983686786296901e-05, + "loss": 0.3102, + "num_input_tokens_seen": 2651584, + "step": 1225 + }, + { + "epoch": 0.200652528548124, + "grad_norm": 0.9486817717552185, + "learning_rate": 0.00010024469820554649, + "loss": 0.4618, + "num_input_tokens_seen": 2663360, + "step": 1230 + }, + { + "epoch": 0.20146818923327894, + "grad_norm": 2.3076438903808594, + "learning_rate": 0.00010065252854812398, + "loss": 0.4763, + "num_input_tokens_seen": 2675168, + "step": 1235 + }, + { + "epoch": 0.20228384991843393, + "grad_norm": 1.1677532196044922, + "learning_rate": 0.00010106035889070147, + "loss": 0.3505, + "num_input_tokens_seen": 2685952, + "step": 1240 + }, + { + "epoch": 0.2030995106035889, + "grad_norm": 3.1230320930480957, + "learning_rate": 0.00010146818923327896, + "loss": 0.3932, + "num_input_tokens_seen": 2695488, + "step": 1245 + }, + { + "epoch": 0.2039151712887439, + "grad_norm": 1.663774847984314, + "learning_rate": 0.00010187601957585645, + "loss": 0.3438, + "num_input_tokens_seen": 2706880, + "step": 1250 + }, + { + "epoch": 0.20473083197389885, + "grad_norm": 3.202942132949829, + "learning_rate": 0.00010228384991843394, + "loss": 0.4604, + "num_input_tokens_seen": 2717184, + "step": 1255 + }, + { + "epoch": 0.20554649265905384, + "grad_norm": 0.9774695634841919, + "learning_rate": 0.00010269168026101142, + "loss": 0.3216, + "num_input_tokens_seen": 2729184, + "step": 1260 + }, + { + "epoch": 0.20636215334420882, + "grad_norm": 0.6506635546684265, + "learning_rate": 0.00010309951060358891, + "loss": 0.3916, + "num_input_tokens_seen": 2740768, + "step": 1265 + }, + { + "epoch": 0.20717781402936378, + "grad_norm": 0.5004635453224182, + "learning_rate": 0.0001035073409461664, + "loss": 0.3711, + "num_input_tokens_seen": 2751328, + "step": 1270 + }, + { + "epoch": 0.20799347471451876, + "grad_norm": 1.1510679721832275, + "learning_rate": 0.00010391517128874387, + "loss": 0.368, + "num_input_tokens_seen": 2762464, + "step": 1275 + }, + { + "epoch": 0.20880913539967375, + "grad_norm": 1.0645408630371094, + "learning_rate": 0.00010432300163132138, + "loss": 0.3437, + "num_input_tokens_seen": 2773824, + "step": 1280 + }, + { + "epoch": 0.2096247960848287, + "grad_norm": 1.1933183670043945, + "learning_rate": 0.00010473083197389887, + "loss": 0.3339, + "num_input_tokens_seen": 2784544, + "step": 1285 + }, + { + "epoch": 0.21044045676998369, + "grad_norm": 0.7531349062919617, + "learning_rate": 0.00010513866231647634, + "loss": 0.3151, + "num_input_tokens_seen": 2796032, + "step": 1290 + }, + { + "epoch": 0.21125611745513867, + "grad_norm": 2.9709150791168213, + "learning_rate": 0.00010554649265905383, + "loss": 0.3472, + "num_input_tokens_seen": 2808096, + "step": 1295 + }, + { + "epoch": 0.21207177814029363, + "grad_norm": 1.2990198135375977, + "learning_rate": 0.00010595432300163132, + "loss": 0.3321, + "num_input_tokens_seen": 2818080, + "step": 1300 + }, + { + "epoch": 0.2128874388254486, + "grad_norm": 2.682976245880127, + "learning_rate": 0.00010636215334420881, + "loss": 0.3846, + "num_input_tokens_seen": 2829248, + "step": 1305 + }, + { + "epoch": 0.2137030995106036, + "grad_norm": 0.9402604103088379, + "learning_rate": 0.00010676998368678629, + "loss": 0.3476, + "num_input_tokens_seen": 2840288, + "step": 1310 + }, + { + "epoch": 0.21451876019575855, + "grad_norm": 0.7319830060005188, + "learning_rate": 0.0001071778140293638, + "loss": 0.3443, + "num_input_tokens_seen": 2849440, + "step": 1315 + }, + { + "epoch": 0.21533442088091354, + "grad_norm": 0.9809147119522095, + "learning_rate": 0.00010758564437194128, + "loss": 0.3212, + "num_input_tokens_seen": 2861056, + "step": 1320 + }, + { + "epoch": 0.21615008156606852, + "grad_norm": 0.4285057485103607, + "learning_rate": 0.00010799347471451876, + "loss": 0.3353, + "num_input_tokens_seen": 2872832, + "step": 1325 + }, + { + "epoch": 0.2169657422512235, + "grad_norm": 0.8098944425582886, + "learning_rate": 0.00010840130505709625, + "loss": 0.3465, + "num_input_tokens_seen": 2883968, + "step": 1330 + }, + { + "epoch": 0.21778140293637846, + "grad_norm": 0.6743302345275879, + "learning_rate": 0.00010880913539967374, + "loss": 0.3804, + "num_input_tokens_seen": 2894880, + "step": 1335 + }, + { + "epoch": 0.21859706362153344, + "grad_norm": 2.499708652496338, + "learning_rate": 0.00010921696574225122, + "loss": 0.3756, + "num_input_tokens_seen": 2905216, + "step": 1340 + }, + { + "epoch": 0.21941272430668843, + "grad_norm": 0.5635789036750793, + "learning_rate": 0.0001096247960848287, + "loss": 0.3706, + "num_input_tokens_seen": 2917152, + "step": 1345 + }, + { + "epoch": 0.22022838499184338, + "grad_norm": 1.3032878637313843, + "learning_rate": 0.00011003262642740621, + "loss": 0.3542, + "num_input_tokens_seen": 2928448, + "step": 1350 + }, + { + "epoch": 0.22104404567699837, + "grad_norm": 0.7343788743019104, + "learning_rate": 0.00011044045676998369, + "loss": 0.3207, + "num_input_tokens_seen": 2938336, + "step": 1355 + }, + { + "epoch": 0.22185970636215335, + "grad_norm": 1.47647225856781, + "learning_rate": 0.00011084828711256118, + "loss": 0.3149, + "num_input_tokens_seen": 2950496, + "step": 1360 + }, + { + "epoch": 0.2226753670473083, + "grad_norm": 0.46358925104141235, + "learning_rate": 0.00011125611745513867, + "loss": 0.369, + "num_input_tokens_seen": 2961024, + "step": 1365 + }, + { + "epoch": 0.2234910277324633, + "grad_norm": 0.8246099352836609, + "learning_rate": 0.00011166394779771616, + "loss": 0.3343, + "num_input_tokens_seen": 2971744, + "step": 1370 + }, + { + "epoch": 0.22430668841761828, + "grad_norm": 0.46428874135017395, + "learning_rate": 0.00011207177814029363, + "loss": 0.3288, + "num_input_tokens_seen": 2984192, + "step": 1375 + }, + { + "epoch": 0.22512234910277323, + "grad_norm": 0.8306359052658081, + "learning_rate": 0.00011247960848287112, + "loss": 0.2791, + "num_input_tokens_seen": 2994816, + "step": 1380 + }, + { + "epoch": 0.22593800978792822, + "grad_norm": 0.33363911509513855, + "learning_rate": 0.00011288743882544863, + "loss": 0.3459, + "num_input_tokens_seen": 3005568, + "step": 1385 + }, + { + "epoch": 0.2267536704730832, + "grad_norm": 0.9000904560089111, + "learning_rate": 0.0001132952691680261, + "loss": 0.35, + "num_input_tokens_seen": 3015680, + "step": 1390 + }, + { + "epoch": 0.2275693311582382, + "grad_norm": 2.413815975189209, + "learning_rate": 0.00011370309951060359, + "loss": 0.321, + "num_input_tokens_seen": 3026752, + "step": 1395 + }, + { + "epoch": 0.22838499184339314, + "grad_norm": 1.3468011617660522, + "learning_rate": 0.00011411092985318108, + "loss": 0.3517, + "num_input_tokens_seen": 3036704, + "step": 1400 + }, + { + "epoch": 0.22920065252854813, + "grad_norm": 0.5703008770942688, + "learning_rate": 0.00011451876019575856, + "loss": 0.3861, + "num_input_tokens_seen": 3048960, + "step": 1405 + }, + { + "epoch": 0.2300163132137031, + "grad_norm": 1.0225015878677368, + "learning_rate": 0.00011492659053833605, + "loss": 0.3963, + "num_input_tokens_seen": 3059456, + "step": 1410 + }, + { + "epoch": 0.23083197389885807, + "grad_norm": 1.5991069078445435, + "learning_rate": 0.00011533442088091354, + "loss": 0.3273, + "num_input_tokens_seen": 3070112, + "step": 1415 + }, + { + "epoch": 0.23164763458401305, + "grad_norm": 0.643428385257721, + "learning_rate": 0.00011574225122349103, + "loss": 0.3746, + "num_input_tokens_seen": 3080224, + "step": 1420 + }, + { + "epoch": 0.23246329526916804, + "grad_norm": 1.5102077722549438, + "learning_rate": 0.00011615008156606852, + "loss": 0.3251, + "num_input_tokens_seen": 3090304, + "step": 1425 + }, + { + "epoch": 0.233278955954323, + "grad_norm": 1.0390809774398804, + "learning_rate": 0.00011655791190864601, + "loss": 0.3354, + "num_input_tokens_seen": 3099232, + "step": 1430 + }, + { + "epoch": 0.23409461663947798, + "grad_norm": 0.3219949007034302, + "learning_rate": 0.0001169657422512235, + "loss": 0.3497, + "num_input_tokens_seen": 3109696, + "step": 1435 + }, + { + "epoch": 0.23491027732463296, + "grad_norm": 0.9242702722549438, + "learning_rate": 0.00011737357259380098, + "loss": 0.289, + "num_input_tokens_seen": 3120160, + "step": 1440 + }, + { + "epoch": 0.23572593800978792, + "grad_norm": 1.805952787399292, + "learning_rate": 0.00011778140293637847, + "loss": 0.3682, + "num_input_tokens_seen": 3130784, + "step": 1445 + }, + { + "epoch": 0.2365415986949429, + "grad_norm": 0.8927225470542908, + "learning_rate": 0.00011818923327895596, + "loss": 0.3091, + "num_input_tokens_seen": 3141536, + "step": 1450 + }, + { + "epoch": 0.23735725938009788, + "grad_norm": 0.5092608332633972, + "learning_rate": 0.00011859706362153345, + "loss": 0.3718, + "num_input_tokens_seen": 3152160, + "step": 1455 + }, + { + "epoch": 0.23817292006525284, + "grad_norm": 0.6312627792358398, + "learning_rate": 0.00011900489396411094, + "loss": 0.2408, + "num_input_tokens_seen": 3162560, + "step": 1460 + }, + { + "epoch": 0.23898858075040783, + "grad_norm": 0.5869734287261963, + "learning_rate": 0.00011941272430668843, + "loss": 0.2982, + "num_input_tokens_seen": 3175360, + "step": 1465 + }, + { + "epoch": 0.2398042414355628, + "grad_norm": 1.082488775253296, + "learning_rate": 0.0001198205546492659, + "loss": 0.297, + "num_input_tokens_seen": 3187936, + "step": 1470 + }, + { + "epoch": 0.2406199021207178, + "grad_norm": 0.891363799571991, + "learning_rate": 0.00012022838499184339, + "loss": 0.3907, + "num_input_tokens_seen": 3199424, + "step": 1475 + }, + { + "epoch": 0.24143556280587275, + "grad_norm": 0.8379907608032227, + "learning_rate": 0.00012063621533442088, + "loss": 0.4234, + "num_input_tokens_seen": 3210272, + "step": 1480 + }, + { + "epoch": 0.24225122349102773, + "grad_norm": 1.020867109298706, + "learning_rate": 0.00012104404567699836, + "loss": 0.3592, + "num_input_tokens_seen": 3219680, + "step": 1485 + }, + { + "epoch": 0.24306688417618272, + "grad_norm": 2.042038679122925, + "learning_rate": 0.00012145187601957586, + "loss": 0.3904, + "num_input_tokens_seen": 3229376, + "step": 1490 + }, + { + "epoch": 0.24388254486133767, + "grad_norm": 1.5664820671081543, + "learning_rate": 0.00012185970636215335, + "loss": 0.3454, + "num_input_tokens_seen": 3240896, + "step": 1495 + }, + { + "epoch": 0.24469820554649266, + "grad_norm": 0.7190054655075073, + "learning_rate": 0.00012226753670473083, + "loss": 0.3488, + "num_input_tokens_seen": 3251808, + "step": 1500 + }, + { + "epoch": 0.24551386623164764, + "grad_norm": 0.9747446179389954, + "learning_rate": 0.00012267536704730833, + "loss": 0.2656, + "num_input_tokens_seen": 3262688, + "step": 1505 + }, + { + "epoch": 0.2463295269168026, + "grad_norm": 1.7554460763931274, + "learning_rate": 0.0001230831973898858, + "loss": 0.3375, + "num_input_tokens_seen": 3273344, + "step": 1510 + }, + { + "epoch": 0.24714518760195758, + "grad_norm": 0.8664163947105408, + "learning_rate": 0.0001234910277324633, + "loss": 0.3886, + "num_input_tokens_seen": 3283520, + "step": 1515 + }, + { + "epoch": 0.24796084828711257, + "grad_norm": 0.7226205468177795, + "learning_rate": 0.0001238988580750408, + "loss": 0.3208, + "num_input_tokens_seen": 3294816, + "step": 1520 + }, + { + "epoch": 0.24877650897226752, + "grad_norm": 1.1765764951705933, + "learning_rate": 0.00012430668841761827, + "loss": 0.321, + "num_input_tokens_seen": 3305152, + "step": 1525 + }, + { + "epoch": 0.2495921696574225, + "grad_norm": 1.0162301063537598, + "learning_rate": 0.00012471451876019577, + "loss": 0.3033, + "num_input_tokens_seen": 3315264, + "step": 1530 + }, + { + "epoch": 0.25040783034257746, + "grad_norm": 0.5437058210372925, + "learning_rate": 0.00012512234910277325, + "loss": 0.3426, + "num_input_tokens_seen": 3326208, + "step": 1535 + }, + { + "epoch": 0.25122349102773245, + "grad_norm": 0.5854163765907288, + "learning_rate": 0.00012553017944535072, + "loss": 0.2918, + "num_input_tokens_seen": 3337472, + "step": 1540 + }, + { + "epoch": 0.25203915171288743, + "grad_norm": 0.9946615695953369, + "learning_rate": 0.00012593800978792823, + "loss": 0.3851, + "num_input_tokens_seen": 3348576, + "step": 1545 + }, + { + "epoch": 0.2528548123980424, + "grad_norm": 0.5947259068489075, + "learning_rate": 0.0001263458401305057, + "loss": 0.3308, + "num_input_tokens_seen": 3359200, + "step": 1550 + }, + { + "epoch": 0.2536704730831974, + "grad_norm": 1.863559365272522, + "learning_rate": 0.0001267536704730832, + "loss": 0.324, + "num_input_tokens_seen": 3369696, + "step": 1555 + }, + { + "epoch": 0.2544861337683524, + "grad_norm": 1.6220893859863281, + "learning_rate": 0.00012716150081566068, + "loss": 0.269, + "num_input_tokens_seen": 3381088, + "step": 1560 + }, + { + "epoch": 0.2553017944535073, + "grad_norm": 2.2439920902252197, + "learning_rate": 0.00012756933115823819, + "loss": 0.416, + "num_input_tokens_seen": 3392704, + "step": 1565 + }, + { + "epoch": 0.2561174551386623, + "grad_norm": 0.666847825050354, + "learning_rate": 0.00012797716150081566, + "loss": 0.336, + "num_input_tokens_seen": 3403136, + "step": 1570 + }, + { + "epoch": 0.2569331158238173, + "grad_norm": 0.6700378656387329, + "learning_rate": 0.00012838499184339314, + "loss": 0.3087, + "num_input_tokens_seen": 3411968, + "step": 1575 + }, + { + "epoch": 0.25774877650897227, + "grad_norm": 0.611040472984314, + "learning_rate": 0.00012879282218597064, + "loss": 0.3808, + "num_input_tokens_seen": 3421216, + "step": 1580 + }, + { + "epoch": 0.25856443719412725, + "grad_norm": 2.4051663875579834, + "learning_rate": 0.00012920065252854812, + "loss": 0.3651, + "num_input_tokens_seen": 3433120, + "step": 1585 + }, + { + "epoch": 0.25938009787928223, + "grad_norm": 2.451159954071045, + "learning_rate": 0.00012960848287112562, + "loss": 0.3538, + "num_input_tokens_seen": 3444960, + "step": 1590 + }, + { + "epoch": 0.2601957585644372, + "grad_norm": 1.052446961402893, + "learning_rate": 0.0001300163132137031, + "loss": 0.3747, + "num_input_tokens_seen": 3456512, + "step": 1595 + }, + { + "epoch": 0.26101141924959215, + "grad_norm": 0.5838006138801575, + "learning_rate": 0.0001304241435562806, + "loss": 0.2898, + "num_input_tokens_seen": 3466880, + "step": 1600 + }, + { + "epoch": 0.26182707993474713, + "grad_norm": 1.1788747310638428, + "learning_rate": 0.00013083197389885805, + "loss": 0.2571, + "num_input_tokens_seen": 3478528, + "step": 1605 + }, + { + "epoch": 0.2626427406199021, + "grad_norm": 0.9669155478477478, + "learning_rate": 0.00013123980424143555, + "loss": 0.3644, + "num_input_tokens_seen": 3490240, + "step": 1610 + }, + { + "epoch": 0.2634584013050571, + "grad_norm": 0.6122996807098389, + "learning_rate": 0.00013164763458401306, + "loss": 0.2765, + "num_input_tokens_seen": 3501152, + "step": 1615 + }, + { + "epoch": 0.2642740619902121, + "grad_norm": 2.7994883060455322, + "learning_rate": 0.00013205546492659053, + "loss": 0.5243, + "num_input_tokens_seen": 3511360, + "step": 1620 + }, + { + "epoch": 0.26508972267536707, + "grad_norm": 0.5510287880897522, + "learning_rate": 0.00013246329526916804, + "loss": 0.3546, + "num_input_tokens_seen": 3522880, + "step": 1625 + }, + { + "epoch": 0.265905383360522, + "grad_norm": 0.9189668893814087, + "learning_rate": 0.00013287112561174552, + "loss": 0.2995, + "num_input_tokens_seen": 3534720, + "step": 1630 + }, + { + "epoch": 0.266721044045677, + "grad_norm": 1.0096598863601685, + "learning_rate": 0.00013327895595432302, + "loss": 0.3882, + "num_input_tokens_seen": 3544128, + "step": 1635 + }, + { + "epoch": 0.26753670473083196, + "grad_norm": 0.4427107274532318, + "learning_rate": 0.00013368678629690047, + "loss": 0.3373, + "num_input_tokens_seen": 3555552, + "step": 1640 + }, + { + "epoch": 0.26835236541598695, + "grad_norm": 0.8222571611404419, + "learning_rate": 0.00013409461663947797, + "loss": 0.3723, + "num_input_tokens_seen": 3565824, + "step": 1645 + }, + { + "epoch": 0.26916802610114193, + "grad_norm": 0.5624419450759888, + "learning_rate": 0.00013450244698205548, + "loss": 0.3169, + "num_input_tokens_seen": 3576864, + "step": 1650 + }, + { + "epoch": 0.2699836867862969, + "grad_norm": 0.7991923689842224, + "learning_rate": 0.00013491027732463295, + "loss": 0.3387, + "num_input_tokens_seen": 3588032, + "step": 1655 + }, + { + "epoch": 0.2707993474714519, + "grad_norm": 1.1604034900665283, + "learning_rate": 0.00013531810766721046, + "loss": 0.3308, + "num_input_tokens_seen": 3599488, + "step": 1660 + }, + { + "epoch": 0.27161500815660683, + "grad_norm": 1.401595115661621, + "learning_rate": 0.00013572593800978793, + "loss": 0.3647, + "num_input_tokens_seen": 3610368, + "step": 1665 + }, + { + "epoch": 0.2724306688417618, + "grad_norm": 1.5285650491714478, + "learning_rate": 0.0001361337683523654, + "loss": 0.3551, + "num_input_tokens_seen": 3620128, + "step": 1670 + }, + { + "epoch": 0.2732463295269168, + "grad_norm": 1.500177025794983, + "learning_rate": 0.00013654159869494288, + "loss": 0.3608, + "num_input_tokens_seen": 3631872, + "step": 1675 + }, + { + "epoch": 0.2740619902120718, + "grad_norm": 0.8659718036651611, + "learning_rate": 0.0001369494290375204, + "loss": 0.3412, + "num_input_tokens_seen": 3643296, + "step": 1680 + }, + { + "epoch": 0.27487765089722677, + "grad_norm": 0.9596500396728516, + "learning_rate": 0.0001373572593800979, + "loss": 0.3407, + "num_input_tokens_seen": 3654592, + "step": 1685 + }, + { + "epoch": 0.27569331158238175, + "grad_norm": 1.7839581966400146, + "learning_rate": 0.00013776508972267537, + "loss": 0.4241, + "num_input_tokens_seen": 3665312, + "step": 1690 + }, + { + "epoch": 0.2765089722675367, + "grad_norm": 0.6468521952629089, + "learning_rate": 0.00013817292006525287, + "loss": 0.3345, + "num_input_tokens_seen": 3677024, + "step": 1695 + }, + { + "epoch": 0.27732463295269166, + "grad_norm": 0.37736302614212036, + "learning_rate": 0.00013858075040783035, + "loss": 0.3275, + "num_input_tokens_seen": 3687904, + "step": 1700 + }, + { + "epoch": 0.27814029363784665, + "grad_norm": 0.38723012804985046, + "learning_rate": 0.00013898858075040782, + "loss": 0.2918, + "num_input_tokens_seen": 3698848, + "step": 1705 + }, + { + "epoch": 0.27895595432300163, + "grad_norm": 0.6731218695640564, + "learning_rate": 0.00013939641109298533, + "loss": 0.3511, + "num_input_tokens_seen": 3709856, + "step": 1710 + }, + { + "epoch": 0.2797716150081566, + "grad_norm": 0.43490883708000183, + "learning_rate": 0.0001398042414355628, + "loss": 0.3023, + "num_input_tokens_seen": 3720736, + "step": 1715 + }, + { + "epoch": 0.2805872756933116, + "grad_norm": 0.8174964785575867, + "learning_rate": 0.0001402120717781403, + "loss": 0.4184, + "num_input_tokens_seen": 3731264, + "step": 1720 + }, + { + "epoch": 0.2814029363784666, + "grad_norm": 1.0343927145004272, + "learning_rate": 0.00014061990212071778, + "loss": 0.3225, + "num_input_tokens_seen": 3743072, + "step": 1725 + }, + { + "epoch": 0.2822185970636215, + "grad_norm": 0.7688319087028503, + "learning_rate": 0.0001410277324632953, + "loss": 0.3334, + "num_input_tokens_seen": 3754656, + "step": 1730 + }, + { + "epoch": 0.2830342577487765, + "grad_norm": 1.115333080291748, + "learning_rate": 0.00014143556280587274, + "loss": 0.3274, + "num_input_tokens_seen": 3765248, + "step": 1735 + }, + { + "epoch": 0.2838499184339315, + "grad_norm": 1.0914790630340576, + "learning_rate": 0.00014184339314845024, + "loss": 0.2984, + "num_input_tokens_seen": 3774464, + "step": 1740 + }, + { + "epoch": 0.28466557911908646, + "grad_norm": 1.9679445028305054, + "learning_rate": 0.00014225122349102774, + "loss": 0.4068, + "num_input_tokens_seen": 3784608, + "step": 1745 + }, + { + "epoch": 0.28548123980424145, + "grad_norm": 1.1118956804275513, + "learning_rate": 0.00014265905383360522, + "loss": 0.3405, + "num_input_tokens_seen": 3796672, + "step": 1750 + }, + { + "epoch": 0.28629690048939643, + "grad_norm": 1.6323825120925903, + "learning_rate": 0.00014306688417618272, + "loss": 0.3236, + "num_input_tokens_seen": 3807712, + "step": 1755 + }, + { + "epoch": 0.28711256117455136, + "grad_norm": 0.5782093405723572, + "learning_rate": 0.0001434747145187602, + "loss": 0.4269, + "num_input_tokens_seen": 3817984, + "step": 1760 + }, + { + "epoch": 0.28792822185970635, + "grad_norm": 1.2646207809448242, + "learning_rate": 0.0001438825448613377, + "loss": 0.3611, + "num_input_tokens_seen": 3829152, + "step": 1765 + }, + { + "epoch": 0.28874388254486133, + "grad_norm": 0.49866920709609985, + "learning_rate": 0.00014429037520391515, + "loss": 0.3586, + "num_input_tokens_seen": 3840352, + "step": 1770 + }, + { + "epoch": 0.2895595432300163, + "grad_norm": 1.1308144330978394, + "learning_rate": 0.00014469820554649266, + "loss": 0.3653, + "num_input_tokens_seen": 3851456, + "step": 1775 + }, + { + "epoch": 0.2903752039151713, + "grad_norm": 1.0308849811553955, + "learning_rate": 0.00014510603588907016, + "loss": 0.3515, + "num_input_tokens_seen": 3863040, + "step": 1780 + }, + { + "epoch": 0.2911908646003263, + "grad_norm": 0.6121219992637634, + "learning_rate": 0.00014551386623164764, + "loss": 0.3026, + "num_input_tokens_seen": 3874176, + "step": 1785 + }, + { + "epoch": 0.29200652528548127, + "grad_norm": 1.246109962463379, + "learning_rate": 0.00014592169657422514, + "loss": 0.346, + "num_input_tokens_seen": 3884736, + "step": 1790 + }, + { + "epoch": 0.2928221859706362, + "grad_norm": 0.36836186051368713, + "learning_rate": 0.00014632952691680262, + "loss": 0.3528, + "num_input_tokens_seen": 3895584, + "step": 1795 + }, + { + "epoch": 0.2936378466557912, + "grad_norm": 1.420259952545166, + "learning_rate": 0.0001467373572593801, + "loss": 0.3205, + "num_input_tokens_seen": 3907584, + "step": 1800 + }, + { + "epoch": 0.29445350734094616, + "grad_norm": 0.3933400809764862, + "learning_rate": 0.00014714518760195757, + "loss": 0.3764, + "num_input_tokens_seen": 3919424, + "step": 1805 + }, + { + "epoch": 0.29526916802610115, + "grad_norm": 0.3296147584915161, + "learning_rate": 0.00014755301794453507, + "loss": 0.3389, + "num_input_tokens_seen": 3929152, + "step": 1810 + }, + { + "epoch": 0.29608482871125613, + "grad_norm": 0.7825654149055481, + "learning_rate": 0.00014796084828711258, + "loss": 0.3407, + "num_input_tokens_seen": 3940928, + "step": 1815 + }, + { + "epoch": 0.2969004893964111, + "grad_norm": 0.9432071447372437, + "learning_rate": 0.00014836867862969005, + "loss": 0.3202, + "num_input_tokens_seen": 3949568, + "step": 1820 + }, + { + "epoch": 0.29771615008156604, + "grad_norm": 1.7713226079940796, + "learning_rate": 0.00014877650897226756, + "loss": 0.3339, + "num_input_tokens_seen": 3961088, + "step": 1825 + }, + { + "epoch": 0.29853181076672103, + "grad_norm": 0.3637374937534332, + "learning_rate": 0.00014918433931484503, + "loss": 0.3567, + "num_input_tokens_seen": 3973088, + "step": 1830 + }, + { + "epoch": 0.299347471451876, + "grad_norm": 0.5709724426269531, + "learning_rate": 0.0001495921696574225, + "loss": 0.3035, + "num_input_tokens_seen": 3982976, + "step": 1835 + }, + { + "epoch": 0.300163132137031, + "grad_norm": 0.40148237347602844, + "learning_rate": 0.00015, + "loss": 0.3411, + "num_input_tokens_seen": 3993440, + "step": 1840 + }, + { + "epoch": 0.300978792822186, + "grad_norm": 0.5015760064125061, + "learning_rate": 0.0001504078303425775, + "loss": 0.2825, + "num_input_tokens_seen": 4002720, + "step": 1845 + }, + { + "epoch": 0.30179445350734097, + "grad_norm": 1.6022138595581055, + "learning_rate": 0.000150815660685155, + "loss": 0.3604, + "num_input_tokens_seen": 4012896, + "step": 1850 + }, + { + "epoch": 0.30261011419249595, + "grad_norm": 3.2500264644622803, + "learning_rate": 0.00015122349102773247, + "loss": 0.4179, + "num_input_tokens_seen": 4023680, + "step": 1855 + }, + { + "epoch": 0.3034257748776509, + "grad_norm": 1.5644983053207397, + "learning_rate": 0.00015163132137030997, + "loss": 0.3891, + "num_input_tokens_seen": 4033760, + "step": 1860 + }, + { + "epoch": 0.30424143556280586, + "grad_norm": 4.540710926055908, + "learning_rate": 0.00015203915171288742, + "loss": 0.404, + "num_input_tokens_seen": 4044192, + "step": 1865 + }, + { + "epoch": 0.30505709624796085, + "grad_norm": 0.46936094760894775, + "learning_rate": 0.00015244698205546493, + "loss": 0.3576, + "num_input_tokens_seen": 4055520, + "step": 1870 + }, + { + "epoch": 0.30587275693311583, + "grad_norm": 0.4898531138896942, + "learning_rate": 0.0001528548123980424, + "loss": 0.311, + "num_input_tokens_seen": 4067264, + "step": 1875 + }, + { + "epoch": 0.3066884176182708, + "grad_norm": 0.7477511763572693, + "learning_rate": 0.0001532626427406199, + "loss": 0.3362, + "num_input_tokens_seen": 4078016, + "step": 1880 + }, + { + "epoch": 0.3075040783034258, + "grad_norm": 0.3766342103481293, + "learning_rate": 0.0001536704730831974, + "loss": 0.3798, + "num_input_tokens_seen": 4088352, + "step": 1885 + }, + { + "epoch": 0.3083197389885807, + "grad_norm": 0.4264306128025055, + "learning_rate": 0.0001540783034257749, + "loss": 0.318, + "num_input_tokens_seen": 4100416, + "step": 1890 + }, + { + "epoch": 0.3091353996737357, + "grad_norm": 0.9823868274688721, + "learning_rate": 0.00015448613376835236, + "loss": 0.2882, + "num_input_tokens_seen": 4110080, + "step": 1895 + }, + { + "epoch": 0.3099510603588907, + "grad_norm": 0.32300037145614624, + "learning_rate": 0.00015489396411092984, + "loss": 0.4169, + "num_input_tokens_seen": 4120736, + "step": 1900 + }, + { + "epoch": 0.3107667210440457, + "grad_norm": 0.39407405257225037, + "learning_rate": 0.00015530179445350734, + "loss": 0.3501, + "num_input_tokens_seen": 4132736, + "step": 1905 + }, + { + "epoch": 0.31158238172920066, + "grad_norm": 0.6380731463432312, + "learning_rate": 0.00015570962479608482, + "loss": 0.3182, + "num_input_tokens_seen": 4143808, + "step": 1910 + }, + { + "epoch": 0.31239804241435565, + "grad_norm": 0.6447558403015137, + "learning_rate": 0.00015611745513866232, + "loss": 0.329, + "num_input_tokens_seen": 4154976, + "step": 1915 + }, + { + "epoch": 0.3132137030995106, + "grad_norm": 0.6399807929992676, + "learning_rate": 0.00015652528548123983, + "loss": 0.3842, + "num_input_tokens_seen": 4165504, + "step": 1920 + }, + { + "epoch": 0.31402936378466556, + "grad_norm": 0.7410326600074768, + "learning_rate": 0.0001569331158238173, + "loss": 0.354, + "num_input_tokens_seen": 4177184, + "step": 1925 + }, + { + "epoch": 0.31484502446982054, + "grad_norm": 0.9611150026321411, + "learning_rate": 0.00015734094616639478, + "loss": 0.3558, + "num_input_tokens_seen": 4189088, + "step": 1930 + }, + { + "epoch": 0.31566068515497553, + "grad_norm": 0.3854925334453583, + "learning_rate": 0.00015774877650897226, + "loss": 0.2777, + "num_input_tokens_seen": 4200064, + "step": 1935 + }, + { + "epoch": 0.3164763458401305, + "grad_norm": 1.1114511489868164, + "learning_rate": 0.00015815660685154976, + "loss": 0.3013, + "num_input_tokens_seen": 4211936, + "step": 1940 + }, + { + "epoch": 0.3172920065252855, + "grad_norm": 0.4343189299106598, + "learning_rate": 0.00015856443719412724, + "loss": 0.3283, + "num_input_tokens_seen": 4221952, + "step": 1945 + }, + { + "epoch": 0.3181076672104405, + "grad_norm": 2.0649616718292236, + "learning_rate": 0.00015897226753670474, + "loss": 0.3852, + "num_input_tokens_seen": 4233024, + "step": 1950 + }, + { + "epoch": 0.3189233278955954, + "grad_norm": 1.73568856716156, + "learning_rate": 0.00015938009787928224, + "loss": 0.3143, + "num_input_tokens_seen": 4244480, + "step": 1955 + }, + { + "epoch": 0.3197389885807504, + "grad_norm": 1.0688120126724243, + "learning_rate": 0.0001597879282218597, + "loss": 0.3751, + "num_input_tokens_seen": 4255904, + "step": 1960 + }, + { + "epoch": 0.3205546492659054, + "grad_norm": 1.2920629978179932, + "learning_rate": 0.0001601957585644372, + "loss": 0.3487, + "num_input_tokens_seen": 4265792, + "step": 1965 + }, + { + "epoch": 0.32137030995106036, + "grad_norm": 0.7379094362258911, + "learning_rate": 0.00016060358890701467, + "loss": 0.3297, + "num_input_tokens_seen": 4276160, + "step": 1970 + }, + { + "epoch": 0.32218597063621535, + "grad_norm": 1.2795677185058594, + "learning_rate": 0.00016101141924959218, + "loss": 0.3298, + "num_input_tokens_seen": 4286624, + "step": 1975 + }, + { + "epoch": 0.32300163132137033, + "grad_norm": 1.0733577013015747, + "learning_rate": 0.00016141924959216965, + "loss": 0.3275, + "num_input_tokens_seen": 4297600, + "step": 1980 + }, + { + "epoch": 0.32381729200652526, + "grad_norm": 0.5892490744590759, + "learning_rate": 0.00016182707993474716, + "loss": 0.2693, + "num_input_tokens_seen": 4308352, + "step": 1985 + }, + { + "epoch": 0.32463295269168024, + "grad_norm": 0.4131726324558258, + "learning_rate": 0.00016223491027732466, + "loss": 0.3961, + "num_input_tokens_seen": 4319360, + "step": 1990 + }, + { + "epoch": 0.3254486133768352, + "grad_norm": 0.33160898089408875, + "learning_rate": 0.0001626427406199021, + "loss": 0.3153, + "num_input_tokens_seen": 4331552, + "step": 1995 + }, + { + "epoch": 0.3262642740619902, + "grad_norm": 0.43107712268829346, + "learning_rate": 0.0001630505709624796, + "loss": 0.3754, + "num_input_tokens_seen": 4343744, + "step": 2000 + }, + { + "epoch": 0.3270799347471452, + "grad_norm": 0.8763712644577026, + "learning_rate": 0.0001634584013050571, + "loss": 0.342, + "num_input_tokens_seen": 4354304, + "step": 2005 + }, + { + "epoch": 0.3278955954323002, + "grad_norm": 0.27995434403419495, + "learning_rate": 0.0001638662316476346, + "loss": 0.3581, + "num_input_tokens_seen": 4365696, + "step": 2010 + }, + { + "epoch": 0.32871125611745516, + "grad_norm": 1.2065924406051636, + "learning_rate": 0.00016427406199021207, + "loss": 0.3749, + "num_input_tokens_seen": 4376736, + "step": 2015 + }, + { + "epoch": 0.3295269168026101, + "grad_norm": 0.4733133614063263, + "learning_rate": 0.00016468189233278957, + "loss": 0.3028, + "num_input_tokens_seen": 4388064, + "step": 2020 + }, + { + "epoch": 0.3303425774877651, + "grad_norm": 0.8918518424034119, + "learning_rate": 0.00016508972267536705, + "loss": 0.3164, + "num_input_tokens_seen": 4398912, + "step": 2025 + }, + { + "epoch": 0.33115823817292006, + "grad_norm": 0.5678725242614746, + "learning_rate": 0.00016549755301794453, + "loss": 0.379, + "num_input_tokens_seen": 4409856, + "step": 2030 + }, + { + "epoch": 0.33197389885807504, + "grad_norm": 0.2943176329135895, + "learning_rate": 0.00016590538336052203, + "loss": 0.3357, + "num_input_tokens_seen": 4419552, + "step": 2035 + }, + { + "epoch": 0.33278955954323003, + "grad_norm": 0.6602084040641785, + "learning_rate": 0.0001663132137030995, + "loss": 0.3177, + "num_input_tokens_seen": 4430080, + "step": 2040 + }, + { + "epoch": 0.333605220228385, + "grad_norm": 0.359231561422348, + "learning_rate": 0.000166721044045677, + "loss": 0.3206, + "num_input_tokens_seen": 4441568, + "step": 2045 + }, + { + "epoch": 0.33442088091353994, + "grad_norm": 0.9323316216468811, + "learning_rate": 0.00016712887438825449, + "loss": 0.3346, + "num_input_tokens_seen": 4451200, + "step": 2050 + }, + { + "epoch": 0.3352365415986949, + "grad_norm": 0.36109283566474915, + "learning_rate": 0.000167536704730832, + "loss": 0.3483, + "num_input_tokens_seen": 4461888, + "step": 2055 + }, + { + "epoch": 0.3360522022838499, + "grad_norm": 0.8677675724029541, + "learning_rate": 0.00016794453507340947, + "loss": 0.295, + "num_input_tokens_seen": 4471392, + "step": 2060 + }, + { + "epoch": 0.3368678629690049, + "grad_norm": 0.3884972929954529, + "learning_rate": 0.00016835236541598694, + "loss": 0.3513, + "num_input_tokens_seen": 4482656, + "step": 2065 + }, + { + "epoch": 0.3376835236541599, + "grad_norm": 0.8152845501899719, + "learning_rate": 0.00016876019575856445, + "loss": 0.2987, + "num_input_tokens_seen": 4494336, + "step": 2070 + }, + { + "epoch": 0.33849918433931486, + "grad_norm": 1.0055776834487915, + "learning_rate": 0.00016916802610114192, + "loss": 0.2865, + "num_input_tokens_seen": 4505760, + "step": 2075 + }, + { + "epoch": 0.33931484502446985, + "grad_norm": 2.453181028366089, + "learning_rate": 0.00016957585644371943, + "loss": 0.4349, + "num_input_tokens_seen": 4514976, + "step": 2080 + }, + { + "epoch": 0.3401305057096248, + "grad_norm": 0.9779709577560425, + "learning_rate": 0.0001699836867862969, + "loss": 0.4068, + "num_input_tokens_seen": 4524928, + "step": 2085 + }, + { + "epoch": 0.34094616639477976, + "grad_norm": 0.38216322660446167, + "learning_rate": 0.00017039151712887438, + "loss": 0.3499, + "num_input_tokens_seen": 4536192, + "step": 2090 + }, + { + "epoch": 0.34176182707993474, + "grad_norm": 0.3808996081352234, + "learning_rate": 0.00017079934747145188, + "loss": 0.2784, + "num_input_tokens_seen": 4548160, + "step": 2095 + }, + { + "epoch": 0.3425774877650897, + "grad_norm": 0.3375453054904938, + "learning_rate": 0.00017120717781402936, + "loss": 0.3938, + "num_input_tokens_seen": 4558560, + "step": 2100 + }, + { + "epoch": 0.3433931484502447, + "grad_norm": 0.5401726961135864, + "learning_rate": 0.00017161500815660686, + "loss": 0.3508, + "num_input_tokens_seen": 4570272, + "step": 2105 + }, + { + "epoch": 0.3442088091353997, + "grad_norm": 0.7671635150909424, + "learning_rate": 0.00017202283849918434, + "loss": 0.3716, + "num_input_tokens_seen": 4580608, + "step": 2110 + }, + { + "epoch": 0.3450244698205546, + "grad_norm": 0.34603986144065857, + "learning_rate": 0.00017243066884176184, + "loss": 0.3561, + "num_input_tokens_seen": 4591072, + "step": 2115 + }, + { + "epoch": 0.3458401305057096, + "grad_norm": 0.2979963719844818, + "learning_rate": 0.00017283849918433932, + "loss": 0.3299, + "num_input_tokens_seen": 4602368, + "step": 2120 + }, + { + "epoch": 0.3466557911908646, + "grad_norm": 0.37870991230010986, + "learning_rate": 0.0001732463295269168, + "loss": 0.3297, + "num_input_tokens_seen": 4613600, + "step": 2125 + }, + { + "epoch": 0.3474714518760196, + "grad_norm": 0.5697657465934753, + "learning_rate": 0.0001736541598694943, + "loss": 0.3283, + "num_input_tokens_seen": 4623872, + "step": 2130 + }, + { + "epoch": 0.34828711256117456, + "grad_norm": 0.8874559998512268, + "learning_rate": 0.00017406199021207178, + "loss": 0.4015, + "num_input_tokens_seen": 4635136, + "step": 2135 + }, + { + "epoch": 0.34910277324632955, + "grad_norm": 0.33925214409828186, + "learning_rate": 0.00017446982055464928, + "loss": 0.3658, + "num_input_tokens_seen": 4645152, + "step": 2140 + }, + { + "epoch": 0.34991843393148453, + "grad_norm": 0.6134099364280701, + "learning_rate": 0.00017487765089722676, + "loss": 0.3347, + "num_input_tokens_seen": 4654240, + "step": 2145 + }, + { + "epoch": 0.35073409461663946, + "grad_norm": 0.316588819026947, + "learning_rate": 0.00017528548123980426, + "loss": 0.3029, + "num_input_tokens_seen": 4666240, + "step": 2150 + }, + { + "epoch": 0.35154975530179444, + "grad_norm": 0.7088047862052917, + "learning_rate": 0.0001756933115823817, + "loss": 0.3461, + "num_input_tokens_seen": 4678528, + "step": 2155 + }, + { + "epoch": 0.3523654159869494, + "grad_norm": 0.9629406929016113, + "learning_rate": 0.0001761011419249592, + "loss": 0.4831, + "num_input_tokens_seen": 4688768, + "step": 2160 + }, + { + "epoch": 0.3531810766721044, + "grad_norm": 0.4813165068626404, + "learning_rate": 0.00017650897226753672, + "loss": 0.3442, + "num_input_tokens_seen": 4698496, + "step": 2165 + }, + { + "epoch": 0.3539967373572594, + "grad_norm": 0.3006298243999481, + "learning_rate": 0.0001769168026101142, + "loss": 0.3765, + "num_input_tokens_seen": 4710208, + "step": 2170 + }, + { + "epoch": 0.3548123980424144, + "grad_norm": 1.0809162855148315, + "learning_rate": 0.0001773246329526917, + "loss": 0.3515, + "num_input_tokens_seen": 4720480, + "step": 2175 + }, + { + "epoch": 0.3556280587275693, + "grad_norm": 0.2537889778614044, + "learning_rate": 0.00017773246329526917, + "loss": 0.3265, + "num_input_tokens_seen": 4730560, + "step": 2180 + }, + { + "epoch": 0.3564437194127243, + "grad_norm": 0.6803996562957764, + "learning_rate": 0.00017814029363784668, + "loss": 0.3468, + "num_input_tokens_seen": 4740768, + "step": 2185 + }, + { + "epoch": 0.3572593800978793, + "grad_norm": 0.25576868653297424, + "learning_rate": 0.00017854812398042412, + "loss": 0.3265, + "num_input_tokens_seen": 4751040, + "step": 2190 + }, + { + "epoch": 0.35807504078303426, + "grad_norm": 0.2311267852783203, + "learning_rate": 0.00017895595432300163, + "loss": 0.3205, + "num_input_tokens_seen": 4762016, + "step": 2195 + }, + { + "epoch": 0.35889070146818924, + "grad_norm": 0.669933021068573, + "learning_rate": 0.00017936378466557913, + "loss": 0.31, + "num_input_tokens_seen": 4771648, + "step": 2200 + }, + { + "epoch": 0.35970636215334423, + "grad_norm": 0.34055840969085693, + "learning_rate": 0.0001797716150081566, + "loss": 0.3378, + "num_input_tokens_seen": 4781440, + "step": 2205 + }, + { + "epoch": 0.3605220228384992, + "grad_norm": 0.37736746668815613, + "learning_rate": 0.0001801794453507341, + "loss": 0.3622, + "num_input_tokens_seen": 4793312, + "step": 2210 + }, + { + "epoch": 0.36133768352365414, + "grad_norm": 1.1001076698303223, + "learning_rate": 0.0001805872756933116, + "loss": 0.364, + "num_input_tokens_seen": 4804288, + "step": 2215 + }, + { + "epoch": 0.3621533442088091, + "grad_norm": 1.1262805461883545, + "learning_rate": 0.00018099510603588906, + "loss": 0.3724, + "num_input_tokens_seen": 4814784, + "step": 2220 + }, + { + "epoch": 0.3629690048939641, + "grad_norm": 0.6733670830726624, + "learning_rate": 0.00018140293637846654, + "loss": 0.3619, + "num_input_tokens_seen": 4825152, + "step": 2225 + }, + { + "epoch": 0.3637846655791191, + "grad_norm": 0.620262861251831, + "learning_rate": 0.00018181076672104404, + "loss": 0.3755, + "num_input_tokens_seen": 4835776, + "step": 2230 + }, + { + "epoch": 0.3646003262642741, + "grad_norm": 0.3375418484210968, + "learning_rate": 0.00018221859706362155, + "loss": 0.3478, + "num_input_tokens_seen": 4846848, + "step": 2235 + }, + { + "epoch": 0.36541598694942906, + "grad_norm": 0.7215891480445862, + "learning_rate": 0.00018262642740619902, + "loss": 0.3897, + "num_input_tokens_seen": 4857664, + "step": 2240 + }, + { + "epoch": 0.366231647634584, + "grad_norm": 0.6194847226142883, + "learning_rate": 0.00018303425774877653, + "loss": 0.3101, + "num_input_tokens_seen": 4868800, + "step": 2245 + }, + { + "epoch": 0.367047308319739, + "grad_norm": 0.2767339050769806, + "learning_rate": 0.00018344208809135398, + "loss": 0.3539, + "num_input_tokens_seen": 4879776, + "step": 2250 + }, + { + "epoch": 0.36786296900489396, + "grad_norm": 0.18931439518928528, + "learning_rate": 0.00018384991843393148, + "loss": 0.3427, + "num_input_tokens_seen": 4890816, + "step": 2255 + }, + { + "epoch": 0.36867862969004894, + "grad_norm": 0.24322864413261414, + "learning_rate": 0.00018425774877650896, + "loss": 0.3271, + "num_input_tokens_seen": 4899136, + "step": 2260 + }, + { + "epoch": 0.3694942903752039, + "grad_norm": 0.2841740548610687, + "learning_rate": 0.00018466557911908646, + "loss": 0.3448, + "num_input_tokens_seen": 4910112, + "step": 2265 + }, + { + "epoch": 0.3703099510603589, + "grad_norm": 0.5461204648017883, + "learning_rate": 0.00018507340946166396, + "loss": 0.3202, + "num_input_tokens_seen": 4920032, + "step": 2270 + }, + { + "epoch": 0.37112561174551384, + "grad_norm": 0.5967912673950195, + "learning_rate": 0.00018548123980424144, + "loss": 0.3357, + "num_input_tokens_seen": 4930752, + "step": 2275 + }, + { + "epoch": 0.3719412724306688, + "grad_norm": 0.3169904053211212, + "learning_rate": 0.00018588907014681894, + "loss": 0.3368, + "num_input_tokens_seen": 4940960, + "step": 2280 + }, + { + "epoch": 0.3727569331158238, + "grad_norm": 0.4621799886226654, + "learning_rate": 0.0001862969004893964, + "loss": 0.2798, + "num_input_tokens_seen": 4952128, + "step": 2285 + }, + { + "epoch": 0.3735725938009788, + "grad_norm": 0.2985716462135315, + "learning_rate": 0.0001867047308319739, + "loss": 0.3573, + "num_input_tokens_seen": 4962848, + "step": 2290 + }, + { + "epoch": 0.3743882544861338, + "grad_norm": 0.551990807056427, + "learning_rate": 0.0001871125611745514, + "loss": 0.2891, + "num_input_tokens_seen": 4972448, + "step": 2295 + }, + { + "epoch": 0.37520391517128876, + "grad_norm": 0.6259016394615173, + "learning_rate": 0.00018752039151712888, + "loss": 0.3484, + "num_input_tokens_seen": 4982432, + "step": 2300 + }, + { + "epoch": 0.37601957585644374, + "grad_norm": 0.5325135588645935, + "learning_rate": 0.00018792822185970638, + "loss": 0.3061, + "num_input_tokens_seen": 4992800, + "step": 2305 + }, + { + "epoch": 0.3768352365415987, + "grad_norm": 0.5709698796272278, + "learning_rate": 0.00018833605220228386, + "loss": 0.3248, + "num_input_tokens_seen": 5004352, + "step": 2310 + }, + { + "epoch": 0.37765089722675366, + "grad_norm": 0.3906923234462738, + "learning_rate": 0.00018874388254486133, + "loss": 0.3273, + "num_input_tokens_seen": 5015968, + "step": 2315 + }, + { + "epoch": 0.37846655791190864, + "grad_norm": 0.5311968922615051, + "learning_rate": 0.0001891517128874388, + "loss": 0.3876, + "num_input_tokens_seen": 5026624, + "step": 2320 + }, + { + "epoch": 0.3792822185970636, + "grad_norm": 0.41739797592163086, + "learning_rate": 0.00018955954323001631, + "loss": 0.3486, + "num_input_tokens_seen": 5037312, + "step": 2325 + }, + { + "epoch": 0.3800978792822186, + "grad_norm": 0.4734439253807068, + "learning_rate": 0.00018996737357259382, + "loss": 0.3274, + "num_input_tokens_seen": 5048544, + "step": 2330 + }, + { + "epoch": 0.3809135399673736, + "grad_norm": 0.3764461874961853, + "learning_rate": 0.0001903752039151713, + "loss": 0.3468, + "num_input_tokens_seen": 5059584, + "step": 2335 + }, + { + "epoch": 0.3817292006525285, + "grad_norm": 0.4716011881828308, + "learning_rate": 0.0001907830342577488, + "loss": 0.2744, + "num_input_tokens_seen": 5072736, + "step": 2340 + }, + { + "epoch": 0.3825448613376835, + "grad_norm": 1.4850523471832275, + "learning_rate": 0.00019119086460032627, + "loss": 0.3347, + "num_input_tokens_seen": 5083456, + "step": 2345 + }, + { + "epoch": 0.3833605220228385, + "grad_norm": 1.071189284324646, + "learning_rate": 0.00019159869494290375, + "loss": 0.2989, + "num_input_tokens_seen": 5094240, + "step": 2350 + }, + { + "epoch": 0.3841761827079935, + "grad_norm": 0.4793599545955658, + "learning_rate": 0.00019200652528548123, + "loss": 0.31, + "num_input_tokens_seen": 5103744, + "step": 2355 + }, + { + "epoch": 0.38499184339314846, + "grad_norm": 1.7425473928451538, + "learning_rate": 0.00019241435562805873, + "loss": 0.3179, + "num_input_tokens_seen": 5114048, + "step": 2360 + }, + { + "epoch": 0.38580750407830344, + "grad_norm": 0.34256845712661743, + "learning_rate": 0.00019282218597063623, + "loss": 0.3322, + "num_input_tokens_seen": 5123744, + "step": 2365 + }, + { + "epoch": 0.3866231647634584, + "grad_norm": 0.8014249801635742, + "learning_rate": 0.0001932300163132137, + "loss": 0.3061, + "num_input_tokens_seen": 5133888, + "step": 2370 + }, + { + "epoch": 0.38743882544861336, + "grad_norm": 0.5163637399673462, + "learning_rate": 0.00019363784665579121, + "loss": 0.2767, + "num_input_tokens_seen": 5145024, + "step": 2375 + }, + { + "epoch": 0.38825448613376834, + "grad_norm": 0.7992238402366638, + "learning_rate": 0.00019404567699836866, + "loss": 0.2409, + "num_input_tokens_seen": 5156544, + "step": 2380 + }, + { + "epoch": 0.3890701468189233, + "grad_norm": 0.801210343837738, + "learning_rate": 0.00019445350734094617, + "loss": 0.3662, + "num_input_tokens_seen": 5168192, + "step": 2385 + }, + { + "epoch": 0.3898858075040783, + "grad_norm": 0.7554042935371399, + "learning_rate": 0.00019486133768352364, + "loss": 0.3152, + "num_input_tokens_seen": 5178240, + "step": 2390 + }, + { + "epoch": 0.3907014681892333, + "grad_norm": 0.7473616600036621, + "learning_rate": 0.00019526916802610115, + "loss": 0.3067, + "num_input_tokens_seen": 5189248, + "step": 2395 + }, + { + "epoch": 0.3915171288743883, + "grad_norm": 0.3379824459552765, + "learning_rate": 0.00019567699836867865, + "loss": 0.3327, + "num_input_tokens_seen": 5199392, + "step": 2400 + }, + { + "epoch": 0.3923327895595432, + "grad_norm": 0.6337383985519409, + "learning_rate": 0.00019608482871125613, + "loss": 0.3792, + "num_input_tokens_seen": 5210240, + "step": 2405 + }, + { + "epoch": 0.3931484502446982, + "grad_norm": 1.1168708801269531, + "learning_rate": 0.00019649265905383363, + "loss": 0.2778, + "num_input_tokens_seen": 5220000, + "step": 2410 + }, + { + "epoch": 0.3939641109298532, + "grad_norm": 0.5852292776107788, + "learning_rate": 0.00019690048939641108, + "loss": 0.3148, + "num_input_tokens_seen": 5231520, + "step": 2415 + }, + { + "epoch": 0.39477977161500816, + "grad_norm": 0.31556594371795654, + "learning_rate": 0.00019730831973898858, + "loss": 0.3743, + "num_input_tokens_seen": 5243200, + "step": 2420 + }, + { + "epoch": 0.39559543230016314, + "grad_norm": 1.3660370111465454, + "learning_rate": 0.00019771615008156606, + "loss": 0.3706, + "num_input_tokens_seen": 5253632, + "step": 2425 + }, + { + "epoch": 0.3964110929853181, + "grad_norm": 0.4243089258670807, + "learning_rate": 0.00019812398042414356, + "loss": 0.4021, + "num_input_tokens_seen": 5264736, + "step": 2430 + }, + { + "epoch": 0.3972267536704731, + "grad_norm": 0.41518595814704895, + "learning_rate": 0.00019853181076672107, + "loss": 0.3128, + "num_input_tokens_seen": 5276192, + "step": 2435 + }, + { + "epoch": 0.39804241435562804, + "grad_norm": 0.35518136620521545, + "learning_rate": 0.00019893964110929854, + "loss": 0.4496, + "num_input_tokens_seen": 5287296, + "step": 2440 + }, + { + "epoch": 0.398858075040783, + "grad_norm": 0.4728354811668396, + "learning_rate": 0.00019934747145187602, + "loss": 0.3256, + "num_input_tokens_seen": 5298336, + "step": 2445 + }, + { + "epoch": 0.399673735725938, + "grad_norm": 1.3262606859207153, + "learning_rate": 0.0001997553017944535, + "loss": 0.3286, + "num_input_tokens_seen": 5309248, + "step": 2450 + }, + { + "epoch": 0.400489396411093, + "grad_norm": 0.37386152148246765, + "learning_rate": 0.000200163132137031, + "loss": 0.3628, + "num_input_tokens_seen": 5320384, + "step": 2455 + }, + { + "epoch": 0.401305057096248, + "grad_norm": 0.37013426423072815, + "learning_rate": 0.00020057096247960848, + "loss": 0.35, + "num_input_tokens_seen": 5331904, + "step": 2460 + }, + { + "epoch": 0.40212071778140296, + "grad_norm": 0.2538347840309143, + "learning_rate": 0.00020097879282218598, + "loss": 0.3279, + "num_input_tokens_seen": 5343904, + "step": 2465 + }, + { + "epoch": 0.4029363784665579, + "grad_norm": 1.1983567476272583, + "learning_rate": 0.00020138662316476348, + "loss": 0.3439, + "num_input_tokens_seen": 5354336, + "step": 2470 + }, + { + "epoch": 0.40375203915171287, + "grad_norm": 0.8135309815406799, + "learning_rate": 0.00020179445350734096, + "loss": 0.3306, + "num_input_tokens_seen": 5366144, + "step": 2475 + }, + { + "epoch": 0.40456769983686786, + "grad_norm": 0.5173861980438232, + "learning_rate": 0.00020220228384991844, + "loss": 0.3404, + "num_input_tokens_seen": 5377344, + "step": 2480 + }, + { + "epoch": 0.40538336052202284, + "grad_norm": 0.5329051613807678, + "learning_rate": 0.0002026101141924959, + "loss": 0.3111, + "num_input_tokens_seen": 5386752, + "step": 2485 + }, + { + "epoch": 0.4061990212071778, + "grad_norm": 0.29332637786865234, + "learning_rate": 0.00020301794453507342, + "loss": 0.3253, + "num_input_tokens_seen": 5396384, + "step": 2490 + }, + { + "epoch": 0.4070146818923328, + "grad_norm": 1.0537220239639282, + "learning_rate": 0.0002034257748776509, + "loss": 0.4057, + "num_input_tokens_seen": 5408256, + "step": 2495 + }, + { + "epoch": 0.4078303425774878, + "grad_norm": 0.48377951979637146, + "learning_rate": 0.0002038336052202284, + "loss": 0.298, + "num_input_tokens_seen": 5418752, + "step": 2500 + }, + { + "epoch": 0.4086460032626427, + "grad_norm": 0.3569847643375397, + "learning_rate": 0.0002042414355628059, + "loss": 0.3168, + "num_input_tokens_seen": 5429792, + "step": 2505 + }, + { + "epoch": 0.4094616639477977, + "grad_norm": 0.9022392630577087, + "learning_rate": 0.00020464926590538335, + "loss": 0.2927, + "num_input_tokens_seen": 5440608, + "step": 2510 + }, + { + "epoch": 0.4102773246329527, + "grad_norm": 0.3175264596939087, + "learning_rate": 0.00020505709624796085, + "loss": 0.4258, + "num_input_tokens_seen": 5450016, + "step": 2515 + }, + { + "epoch": 0.4110929853181077, + "grad_norm": 0.49386703968048096, + "learning_rate": 0.00020546492659053833, + "loss": 0.3105, + "num_input_tokens_seen": 5461120, + "step": 2520 + }, + { + "epoch": 0.41190864600326266, + "grad_norm": 0.8477874398231506, + "learning_rate": 0.00020587275693311583, + "loss": 0.3654, + "num_input_tokens_seen": 5471648, + "step": 2525 + }, + { + "epoch": 0.41272430668841764, + "grad_norm": 0.9865925312042236, + "learning_rate": 0.0002062805872756933, + "loss": 0.2982, + "num_input_tokens_seen": 5482656, + "step": 2530 + }, + { + "epoch": 0.41353996737357257, + "grad_norm": 0.3181234300136566, + "learning_rate": 0.0002066884176182708, + "loss": 0.3258, + "num_input_tokens_seen": 5491168, + "step": 2535 + }, + { + "epoch": 0.41435562805872755, + "grad_norm": 0.22360840439796448, + "learning_rate": 0.00020709624796084832, + "loss": 0.3014, + "num_input_tokens_seen": 5501792, + "step": 2540 + }, + { + "epoch": 0.41517128874388254, + "grad_norm": 0.49664536118507385, + "learning_rate": 0.00020750407830342577, + "loss": 0.3788, + "num_input_tokens_seen": 5513152, + "step": 2545 + }, + { + "epoch": 0.4159869494290375, + "grad_norm": 0.40055933594703674, + "learning_rate": 0.00020791190864600327, + "loss": 0.3046, + "num_input_tokens_seen": 5524480, + "step": 2550 + }, + { + "epoch": 0.4168026101141925, + "grad_norm": 0.33127811551094055, + "learning_rate": 0.00020831973898858075, + "loss": 0.386, + "num_input_tokens_seen": 5535008, + "step": 2555 + }, + { + "epoch": 0.4176182707993475, + "grad_norm": 0.5803729891777039, + "learning_rate": 0.00020872756933115825, + "loss": 0.3449, + "num_input_tokens_seen": 5546048, + "step": 2560 + }, + { + "epoch": 0.4184339314845024, + "grad_norm": 0.43146127462387085, + "learning_rate": 0.00020913539967373573, + "loss": 0.3754, + "num_input_tokens_seen": 5557760, + "step": 2565 + }, + { + "epoch": 0.4192495921696574, + "grad_norm": 0.11432364583015442, + "learning_rate": 0.00020954323001631323, + "loss": 0.3211, + "num_input_tokens_seen": 5567488, + "step": 2570 + }, + { + "epoch": 0.4200652528548124, + "grad_norm": 0.4708321690559387, + "learning_rate": 0.0002099510603588907, + "loss": 0.3923, + "num_input_tokens_seen": 5578592, + "step": 2575 + }, + { + "epoch": 0.42088091353996737, + "grad_norm": 0.6483908891677856, + "learning_rate": 0.00021035889070146818, + "loss": 0.3387, + "num_input_tokens_seen": 5588576, + "step": 2580 + }, + { + "epoch": 0.42169657422512236, + "grad_norm": 0.36748918890953064, + "learning_rate": 0.00021076672104404569, + "loss": 0.2996, + "num_input_tokens_seen": 5599264, + "step": 2585 + }, + { + "epoch": 0.42251223491027734, + "grad_norm": 0.3322773277759552, + "learning_rate": 0.00021117455138662316, + "loss": 0.364, + "num_input_tokens_seen": 5609504, + "step": 2590 + }, + { + "epoch": 0.4233278955954323, + "grad_norm": 0.5112482905387878, + "learning_rate": 0.00021158238172920067, + "loss": 0.317, + "num_input_tokens_seen": 5620256, + "step": 2595 + }, + { + "epoch": 0.42414355628058725, + "grad_norm": 0.3455706536769867, + "learning_rate": 0.00021199021207177814, + "loss": 0.3627, + "num_input_tokens_seen": 5630592, + "step": 2600 + }, + { + "epoch": 0.42495921696574224, + "grad_norm": 0.40241679549217224, + "learning_rate": 0.00021239804241435562, + "loss": 0.3738, + "num_input_tokens_seen": 5641536, + "step": 2605 + }, + { + "epoch": 0.4257748776508972, + "grad_norm": 0.3282737731933594, + "learning_rate": 0.00021280587275693312, + "loss": 0.3312, + "num_input_tokens_seen": 5652416, + "step": 2610 + }, + { + "epoch": 0.4265905383360522, + "grad_norm": 0.4676101505756378, + "learning_rate": 0.0002132137030995106, + "loss": 0.36, + "num_input_tokens_seen": 5662528, + "step": 2615 + }, + { + "epoch": 0.4274061990212072, + "grad_norm": 0.4622323215007782, + "learning_rate": 0.0002136215334420881, + "loss": 0.3199, + "num_input_tokens_seen": 5672288, + "step": 2620 + }, + { + "epoch": 0.4282218597063622, + "grad_norm": 0.4303980767726898, + "learning_rate": 0.00021402936378466558, + "loss": 0.3272, + "num_input_tokens_seen": 5682976, + "step": 2625 + }, + { + "epoch": 0.4290375203915171, + "grad_norm": 0.2869790196418762, + "learning_rate": 0.00021443719412724308, + "loss": 0.3369, + "num_input_tokens_seen": 5694240, + "step": 2630 + }, + { + "epoch": 0.4298531810766721, + "grad_norm": 0.9547647833824158, + "learning_rate": 0.00021484502446982056, + "loss": 0.2603, + "num_input_tokens_seen": 5704192, + "step": 2635 + }, + { + "epoch": 0.43066884176182707, + "grad_norm": 0.48299872875213623, + "learning_rate": 0.00021525285481239804, + "loss": 0.4003, + "num_input_tokens_seen": 5715968, + "step": 2640 + }, + { + "epoch": 0.43148450244698205, + "grad_norm": 0.4422161281108856, + "learning_rate": 0.00021566068515497554, + "loss": 0.28, + "num_input_tokens_seen": 5726784, + "step": 2645 + }, + { + "epoch": 0.43230016313213704, + "grad_norm": 0.2703772187232971, + "learning_rate": 0.00021606851549755302, + "loss": 0.3708, + "num_input_tokens_seen": 5737952, + "step": 2650 + }, + { + "epoch": 0.433115823817292, + "grad_norm": 0.294817715883255, + "learning_rate": 0.00021647634584013052, + "loss": 0.2767, + "num_input_tokens_seen": 5748512, + "step": 2655 + }, + { + "epoch": 0.433931484502447, + "grad_norm": 0.607304036617279, + "learning_rate": 0.000216884176182708, + "loss": 0.3808, + "num_input_tokens_seen": 5758944, + "step": 2660 + }, + { + "epoch": 0.43474714518760194, + "grad_norm": 0.31858983635902405, + "learning_rate": 0.0002172920065252855, + "loss": 0.2919, + "num_input_tokens_seen": 5769632, + "step": 2665 + }, + { + "epoch": 0.4355628058727569, + "grad_norm": 0.3608427345752716, + "learning_rate": 0.00021769983686786295, + "loss": 0.2857, + "num_input_tokens_seen": 5781248, + "step": 2670 + }, + { + "epoch": 0.4363784665579119, + "grad_norm": 0.4658503830432892, + "learning_rate": 0.00021810766721044045, + "loss": 0.3792, + "num_input_tokens_seen": 5792192, + "step": 2675 + }, + { + "epoch": 0.4371941272430669, + "grad_norm": 0.5746659636497498, + "learning_rate": 0.00021851549755301796, + "loss": 0.3173, + "num_input_tokens_seen": 5803264, + "step": 2680 + }, + { + "epoch": 0.43800978792822187, + "grad_norm": 0.22268733382225037, + "learning_rate": 0.00021892332789559543, + "loss": 0.3581, + "num_input_tokens_seen": 5815072, + "step": 2685 + }, + { + "epoch": 0.43882544861337686, + "grad_norm": 1.2028968334197998, + "learning_rate": 0.00021933115823817294, + "loss": 0.3917, + "num_input_tokens_seen": 5825248, + "step": 2690 + }, + { + "epoch": 0.4396411092985318, + "grad_norm": 0.17777279019355774, + "learning_rate": 0.0002197389885807504, + "loss": 0.3704, + "num_input_tokens_seen": 5835904, + "step": 2695 + }, + { + "epoch": 0.44045676998368677, + "grad_norm": 0.3075076937675476, + "learning_rate": 0.00022014681892332792, + "loss": 0.3284, + "num_input_tokens_seen": 5844736, + "step": 2700 + }, + { + "epoch": 0.44127243066884175, + "grad_norm": 0.5591664910316467, + "learning_rate": 0.00022055464926590536, + "loss": 0.3055, + "num_input_tokens_seen": 5856672, + "step": 2705 + }, + { + "epoch": 0.44208809135399674, + "grad_norm": 0.16215193271636963, + "learning_rate": 0.00022096247960848287, + "loss": 0.3818, + "num_input_tokens_seen": 5866848, + "step": 2710 + }, + { + "epoch": 0.4429037520391517, + "grad_norm": 0.27822771668434143, + "learning_rate": 0.00022137030995106037, + "loss": 0.3301, + "num_input_tokens_seen": 5877856, + "step": 2715 + }, + { + "epoch": 0.4437194127243067, + "grad_norm": 0.4213409423828125, + "learning_rate": 0.00022177814029363785, + "loss": 0.361, + "num_input_tokens_seen": 5888896, + "step": 2720 + }, + { + "epoch": 0.4445350734094617, + "grad_norm": 0.23394480347633362, + "learning_rate": 0.00022218597063621535, + "loss": 0.3152, + "num_input_tokens_seen": 5899872, + "step": 2725 + }, + { + "epoch": 0.4453507340946166, + "grad_norm": 0.1931873857975006, + "learning_rate": 0.00022259380097879283, + "loss": 0.3442, + "num_input_tokens_seen": 5910272, + "step": 2730 + }, + { + "epoch": 0.4461663947797716, + "grad_norm": 0.24352242052555084, + "learning_rate": 0.0002230016313213703, + "loss": 0.3434, + "num_input_tokens_seen": 5921312, + "step": 2735 + }, + { + "epoch": 0.4469820554649266, + "grad_norm": 0.18127134442329407, + "learning_rate": 0.00022340946166394778, + "loss": 0.3569, + "num_input_tokens_seen": 5932032, + "step": 2740 + }, + { + "epoch": 0.44779771615008157, + "grad_norm": 0.6061711311340332, + "learning_rate": 0.00022381729200652529, + "loss": 0.2695, + "num_input_tokens_seen": 5942176, + "step": 2745 + }, + { + "epoch": 0.44861337683523655, + "grad_norm": 0.970435619354248, + "learning_rate": 0.0002242251223491028, + "loss": 0.4079, + "num_input_tokens_seen": 5954336, + "step": 2750 + }, + { + "epoch": 0.44942903752039154, + "grad_norm": 0.5338501334190369, + "learning_rate": 0.00022463295269168027, + "loss": 0.2631, + "num_input_tokens_seen": 5965152, + "step": 2755 + }, + { + "epoch": 0.45024469820554647, + "grad_norm": 3.782496929168701, + "learning_rate": 0.00022504078303425777, + "loss": 0.5234, + "num_input_tokens_seen": 5977216, + "step": 2760 + }, + { + "epoch": 0.45106035889070145, + "grad_norm": 0.4664481282234192, + "learning_rate": 0.00022544861337683525, + "loss": 0.3275, + "num_input_tokens_seen": 5988000, + "step": 2765 + }, + { + "epoch": 0.45187601957585644, + "grad_norm": 0.6806289553642273, + "learning_rate": 0.00022585644371941272, + "loss": 0.4017, + "num_input_tokens_seen": 5998400, + "step": 2770 + }, + { + "epoch": 0.4526916802610114, + "grad_norm": 0.4981570243835449, + "learning_rate": 0.0002262642740619902, + "loss": 0.3521, + "num_input_tokens_seen": 6010176, + "step": 2775 + }, + { + "epoch": 0.4535073409461664, + "grad_norm": 0.38380923867225647, + "learning_rate": 0.0002266721044045677, + "loss": 0.3451, + "num_input_tokens_seen": 6021312, + "step": 2780 + }, + { + "epoch": 0.4543230016313214, + "grad_norm": 0.6756896376609802, + "learning_rate": 0.0002270799347471452, + "loss": 0.3396, + "num_input_tokens_seen": 6033024, + "step": 2785 + }, + { + "epoch": 0.4551386623164764, + "grad_norm": 0.507156491279602, + "learning_rate": 0.00022748776508972268, + "loss": 0.342, + "num_input_tokens_seen": 6042432, + "step": 2790 + }, + { + "epoch": 0.4559543230016313, + "grad_norm": 0.34993717074394226, + "learning_rate": 0.00022789559543230019, + "loss": 0.3274, + "num_input_tokens_seen": 6054304, + "step": 2795 + }, + { + "epoch": 0.4567699836867863, + "grad_norm": 0.7146971225738525, + "learning_rate": 0.00022830342577487763, + "loss": 0.3629, + "num_input_tokens_seen": 6065312, + "step": 2800 + }, + { + "epoch": 0.45758564437194127, + "grad_norm": 0.3397325873374939, + "learning_rate": 0.00022871125611745514, + "loss": 0.3313, + "num_input_tokens_seen": 6075072, + "step": 2805 + }, + { + "epoch": 0.45840130505709625, + "grad_norm": 0.7157807350158691, + "learning_rate": 0.00022911908646003261, + "loss": 0.3165, + "num_input_tokens_seen": 6084992, + "step": 2810 + }, + { + "epoch": 0.45921696574225124, + "grad_norm": 0.765362024307251, + "learning_rate": 0.00022952691680261012, + "loss": 0.373, + "num_input_tokens_seen": 6095424, + "step": 2815 + }, + { + "epoch": 0.4600326264274062, + "grad_norm": 0.27471351623535156, + "learning_rate": 0.00022993474714518762, + "loss": 0.3328, + "num_input_tokens_seen": 6107328, + "step": 2820 + }, + { + "epoch": 0.46084828711256115, + "grad_norm": 0.3250696659088135, + "learning_rate": 0.0002303425774877651, + "loss": 0.3523, + "num_input_tokens_seen": 6118048, + "step": 2825 + }, + { + "epoch": 0.46166394779771613, + "grad_norm": 0.1394474357366562, + "learning_rate": 0.0002307504078303426, + "loss": 0.3072, + "num_input_tokens_seen": 6128512, + "step": 2830 + }, + { + "epoch": 0.4624796084828711, + "grad_norm": 0.2740083932876587, + "learning_rate": 0.00023115823817292005, + "loss": 0.4081, + "num_input_tokens_seen": 6138720, + "step": 2835 + }, + { + "epoch": 0.4632952691680261, + "grad_norm": 0.2871670424938202, + "learning_rate": 0.00023156606851549755, + "loss": 0.3189, + "num_input_tokens_seen": 6149600, + "step": 2840 + }, + { + "epoch": 0.4641109298531811, + "grad_norm": 0.4568738043308258, + "learning_rate": 0.00023197389885807503, + "loss": 0.2934, + "num_input_tokens_seen": 6160608, + "step": 2845 + }, + { + "epoch": 0.46492659053833607, + "grad_norm": 0.1800023764371872, + "learning_rate": 0.00023238172920065253, + "loss": 0.3319, + "num_input_tokens_seen": 6171968, + "step": 2850 + }, + { + "epoch": 0.46574225122349105, + "grad_norm": 0.5981073975563049, + "learning_rate": 0.00023278955954323004, + "loss": 0.348, + "num_input_tokens_seen": 6183360, + "step": 2855 + }, + { + "epoch": 0.466557911908646, + "grad_norm": 0.28120627999305725, + "learning_rate": 0.00023319738988580751, + "loss": 0.331, + "num_input_tokens_seen": 6193664, + "step": 2860 + }, + { + "epoch": 0.46737357259380097, + "grad_norm": 0.1502273678779602, + "learning_rate": 0.000233605220228385, + "loss": 0.3349, + "num_input_tokens_seen": 6204320, + "step": 2865 + }, + { + "epoch": 0.46818923327895595, + "grad_norm": 0.3262067437171936, + "learning_rate": 0.00023401305057096247, + "loss": 0.3298, + "num_input_tokens_seen": 6214560, + "step": 2870 + }, + { + "epoch": 0.46900489396411094, + "grad_norm": 0.17942096292972565, + "learning_rate": 0.00023442088091353997, + "loss": 0.3213, + "num_input_tokens_seen": 6226176, + "step": 2875 + }, + { + "epoch": 0.4698205546492659, + "grad_norm": 0.15920691192150116, + "learning_rate": 0.00023482871125611747, + "loss": 0.3351, + "num_input_tokens_seen": 6236768, + "step": 2880 + }, + { + "epoch": 0.4706362153344209, + "grad_norm": 0.48741838335990906, + "learning_rate": 0.00023523654159869495, + "loss": 0.4369, + "num_input_tokens_seen": 6247424, + "step": 2885 + }, + { + "epoch": 0.47145187601957583, + "grad_norm": 0.27710264921188354, + "learning_rate": 0.00023564437194127245, + "loss": 0.3109, + "num_input_tokens_seen": 6257280, + "step": 2890 + }, + { + "epoch": 0.4722675367047308, + "grad_norm": 0.3456025719642639, + "learning_rate": 0.00023605220228384993, + "loss": 0.3441, + "num_input_tokens_seen": 6268160, + "step": 2895 + }, + { + "epoch": 0.4730831973898858, + "grad_norm": 0.16855378448963165, + "learning_rate": 0.0002364600326264274, + "loss": 0.3408, + "num_input_tokens_seen": 6278976, + "step": 2900 + }, + { + "epoch": 0.4738988580750408, + "grad_norm": 0.5824698805809021, + "learning_rate": 0.00023686786296900488, + "loss": 0.3412, + "num_input_tokens_seen": 6290144, + "step": 2905 + }, + { + "epoch": 0.47471451876019577, + "grad_norm": 0.36858123540878296, + "learning_rate": 0.0002372756933115824, + "loss": 0.3275, + "num_input_tokens_seen": 6301760, + "step": 2910 + }, + { + "epoch": 0.47553017944535075, + "grad_norm": 0.2003246694803238, + "learning_rate": 0.0002376835236541599, + "loss": 0.286, + "num_input_tokens_seen": 6312416, + "step": 2915 + }, + { + "epoch": 0.4763458401305057, + "grad_norm": 0.3464984595775604, + "learning_rate": 0.00023809135399673737, + "loss": 0.382, + "num_input_tokens_seen": 6323488, + "step": 2920 + }, + { + "epoch": 0.47716150081566067, + "grad_norm": 0.6668310761451721, + "learning_rate": 0.00023849918433931487, + "loss": 0.3876, + "num_input_tokens_seen": 6334496, + "step": 2925 + }, + { + "epoch": 0.47797716150081565, + "grad_norm": 0.9299334287643433, + "learning_rate": 0.00023890701468189232, + "loss": 0.3962, + "num_input_tokens_seen": 6344320, + "step": 2930 + }, + { + "epoch": 0.47879282218597063, + "grad_norm": 0.2379339188337326, + "learning_rate": 0.00023931484502446982, + "loss": 0.3629, + "num_input_tokens_seen": 6355904, + "step": 2935 + }, + { + "epoch": 0.4796084828711256, + "grad_norm": 0.2122390866279602, + "learning_rate": 0.0002397226753670473, + "loss": 0.3594, + "num_input_tokens_seen": 6366016, + "step": 2940 + }, + { + "epoch": 0.4804241435562806, + "grad_norm": 0.102106474339962, + "learning_rate": 0.0002401305057096248, + "loss": 0.3629, + "num_input_tokens_seen": 6375712, + "step": 2945 + }, + { + "epoch": 0.4812398042414356, + "grad_norm": 0.5380746722221375, + "learning_rate": 0.0002405383360522023, + "loss": 0.3461, + "num_input_tokens_seen": 6386720, + "step": 2950 + }, + { + "epoch": 0.4820554649265905, + "grad_norm": 0.5553815960884094, + "learning_rate": 0.00024094616639477978, + "loss": 0.3596, + "num_input_tokens_seen": 6397536, + "step": 2955 + }, + { + "epoch": 0.4828711256117455, + "grad_norm": 0.28765079379081726, + "learning_rate": 0.00024135399673735726, + "loss": 0.3407, + "num_input_tokens_seen": 6408384, + "step": 2960 + }, + { + "epoch": 0.4836867862969005, + "grad_norm": 0.19030137360095978, + "learning_rate": 0.00024176182707993474, + "loss": 0.314, + "num_input_tokens_seen": 6420640, + "step": 2965 + }, + { + "epoch": 0.48450244698205547, + "grad_norm": 0.09808509796857834, + "learning_rate": 0.00024216965742251224, + "loss": 0.3268, + "num_input_tokens_seen": 6431680, + "step": 2970 + }, + { + "epoch": 0.48531810766721045, + "grad_norm": 0.13716532289981842, + "learning_rate": 0.00024257748776508972, + "loss": 0.3589, + "num_input_tokens_seen": 6442368, + "step": 2975 + }, + { + "epoch": 0.48613376835236544, + "grad_norm": 0.22778721153736115, + "learning_rate": 0.00024298531810766722, + "loss": 0.3194, + "num_input_tokens_seen": 6453856, + "step": 2980 + }, + { + "epoch": 0.48694942903752036, + "grad_norm": 0.1628149002790451, + "learning_rate": 0.00024339314845024472, + "loss": 0.3427, + "num_input_tokens_seen": 6464640, + "step": 2985 + }, + { + "epoch": 0.48776508972267535, + "grad_norm": 0.2008904665708542, + "learning_rate": 0.0002438009787928222, + "loss": 0.3359, + "num_input_tokens_seen": 6475392, + "step": 2990 + }, + { + "epoch": 0.48858075040783033, + "grad_norm": 0.38410887122154236, + "learning_rate": 0.0002442088091353997, + "loss": 0.3462, + "num_input_tokens_seen": 6486880, + "step": 2995 + }, + { + "epoch": 0.4893964110929853, + "grad_norm": 0.3415551781654358, + "learning_rate": 0.00024461663947797715, + "loss": 0.3485, + "num_input_tokens_seen": 6497216, + "step": 3000 + }, + { + "epoch": 0.4902120717781403, + "grad_norm": 0.13694021105766296, + "learning_rate": 0.00024502446982055463, + "loss": 0.4278, + "num_input_tokens_seen": 6508320, + "step": 3005 + }, + { + "epoch": 0.4910277324632953, + "grad_norm": 0.37679731845855713, + "learning_rate": 0.00024543230016313216, + "loss": 0.3299, + "num_input_tokens_seen": 6519520, + "step": 3010 + }, + { + "epoch": 0.49184339314845027, + "grad_norm": 0.15331923961639404, + "learning_rate": 0.00024584013050570964, + "loss": 0.3111, + "num_input_tokens_seen": 6530240, + "step": 3015 + }, + { + "epoch": 0.4926590538336052, + "grad_norm": 0.34927237033843994, + "learning_rate": 0.0002462479608482871, + "loss": 0.3775, + "num_input_tokens_seen": 6540192, + "step": 3020 + }, + { + "epoch": 0.4934747145187602, + "grad_norm": 0.12338215857744217, + "learning_rate": 0.0002466557911908646, + "loss": 0.3411, + "num_input_tokens_seen": 6550016, + "step": 3025 + }, + { + "epoch": 0.49429037520391517, + "grad_norm": 0.13807620108127594, + "learning_rate": 0.00024706362153344207, + "loss": 0.3544, + "num_input_tokens_seen": 6560672, + "step": 3030 + }, + { + "epoch": 0.49510603588907015, + "grad_norm": 0.2901446223258972, + "learning_rate": 0.0002474714518760196, + "loss": 0.3501, + "num_input_tokens_seen": 6571808, + "step": 3035 + }, + { + "epoch": 0.49592169657422513, + "grad_norm": 0.3042585551738739, + "learning_rate": 0.0002478792822185971, + "loss": 0.3442, + "num_input_tokens_seen": 6582624, + "step": 3040 + }, + { + "epoch": 0.4967373572593801, + "grad_norm": 0.5703750848770142, + "learning_rate": 0.00024828711256117455, + "loss": 0.3541, + "num_input_tokens_seen": 6595008, + "step": 3045 + }, + { + "epoch": 0.49755301794453505, + "grad_norm": 2.2514257431030273, + "learning_rate": 0.000248694942903752, + "loss": 0.3453, + "num_input_tokens_seen": 6606976, + "step": 3050 + }, + { + "epoch": 0.49836867862969003, + "grad_norm": 0.24025072157382965, + "learning_rate": 0.00024910277324632956, + "loss": 0.3227, + "num_input_tokens_seen": 6617760, + "step": 3055 + }, + { + "epoch": 0.499184339314845, + "grad_norm": 0.5085632801055908, + "learning_rate": 0.00024951060358890703, + "loss": 0.3192, + "num_input_tokens_seen": 6628896, + "step": 3060 + }, + { + "epoch": 0.5, + "grad_norm": 0.5351697206497192, + "learning_rate": 0.0002499184339314845, + "loss": 0.314, + "num_input_tokens_seen": 6639424, + "step": 3065 + }, + { + "epoch": 0.5008156606851549, + "grad_norm": 0.23477743566036224, + "learning_rate": 0.00025032626427406204, + "loss": 0.404, + "num_input_tokens_seen": 6649280, + "step": 3070 + }, + { + "epoch": 0.50163132137031, + "grad_norm": 0.4015452563762665, + "learning_rate": 0.00025073409461663946, + "loss": 0.3214, + "num_input_tokens_seen": 6659008, + "step": 3075 + }, + { + "epoch": 0.5024469820554649, + "grad_norm": 0.1324753761291504, + "learning_rate": 0.00025114192495921694, + "loss": 0.3422, + "num_input_tokens_seen": 6669312, + "step": 3080 + }, + { + "epoch": 0.5032626427406199, + "grad_norm": 0.24739250540733337, + "learning_rate": 0.00025154975530179447, + "loss": 0.34, + "num_input_tokens_seen": 6679968, + "step": 3085 + }, + { + "epoch": 0.5040783034257749, + "grad_norm": 0.5270182490348816, + "learning_rate": 0.00025195758564437195, + "loss": 0.3104, + "num_input_tokens_seen": 6689728, + "step": 3090 + }, + { + "epoch": 0.5048939641109299, + "grad_norm": 0.9428045153617859, + "learning_rate": 0.0002523654159869495, + "loss": 0.372, + "num_input_tokens_seen": 6700448, + "step": 3095 + }, + { + "epoch": 0.5057096247960848, + "grad_norm": 0.3519197106361389, + "learning_rate": 0.0002527732463295269, + "loss": 0.38, + "num_input_tokens_seen": 6711424, + "step": 3100 + }, + { + "epoch": 0.5065252854812398, + "grad_norm": 0.36259037256240845, + "learning_rate": 0.0002531810766721044, + "loss": 0.3139, + "num_input_tokens_seen": 6722752, + "step": 3105 + }, + { + "epoch": 0.5073409461663948, + "grad_norm": 0.17484600841999054, + "learning_rate": 0.0002535889070146819, + "loss": 0.3149, + "num_input_tokens_seen": 6734080, + "step": 3110 + }, + { + "epoch": 0.5081566068515497, + "grad_norm": 0.16905194520950317, + "learning_rate": 0.0002539967373572594, + "loss": 0.3442, + "num_input_tokens_seen": 6744672, + "step": 3115 + }, + { + "epoch": 0.5089722675367048, + "grad_norm": 0.3450393080711365, + "learning_rate": 0.00025440456769983686, + "loss": 0.3294, + "num_input_tokens_seen": 6755104, + "step": 3120 + }, + { + "epoch": 0.5097879282218597, + "grad_norm": 0.21835975348949432, + "learning_rate": 0.00025481239804241434, + "loss": 0.3159, + "num_input_tokens_seen": 6767168, + "step": 3125 + }, + { + "epoch": 0.5106035889070146, + "grad_norm": 0.33362606167793274, + "learning_rate": 0.00025522022838499187, + "loss": 0.3369, + "num_input_tokens_seen": 6777472, + "step": 3130 + }, + { + "epoch": 0.5114192495921697, + "grad_norm": 0.35720738768577576, + "learning_rate": 0.00025562805872756934, + "loss": 0.2971, + "num_input_tokens_seen": 6787776, + "step": 3135 + }, + { + "epoch": 0.5122349102773246, + "grad_norm": 0.17389388382434845, + "learning_rate": 0.0002560358890701468, + "loss": 0.3461, + "num_input_tokens_seen": 6799008, + "step": 3140 + }, + { + "epoch": 0.5130505709624796, + "grad_norm": 0.18747566640377045, + "learning_rate": 0.0002564437194127243, + "loss": 0.3194, + "num_input_tokens_seen": 6809440, + "step": 3145 + }, + { + "epoch": 0.5138662316476346, + "grad_norm": 0.22149868309497833, + "learning_rate": 0.00025685154975530177, + "loss": 0.3344, + "num_input_tokens_seen": 6819456, + "step": 3150 + }, + { + "epoch": 0.5146818923327896, + "grad_norm": 0.19501285254955292, + "learning_rate": 0.0002572593800978793, + "loss": 0.325, + "num_input_tokens_seen": 6829312, + "step": 3155 + }, + { + "epoch": 0.5154975530179445, + "grad_norm": 0.20681631565093994, + "learning_rate": 0.0002576672104404568, + "loss": 0.3063, + "num_input_tokens_seen": 6839456, + "step": 3160 + }, + { + "epoch": 0.5163132137030995, + "grad_norm": 0.2203688621520996, + "learning_rate": 0.0002580750407830343, + "loss": 0.3547, + "num_input_tokens_seen": 6850752, + "step": 3165 + }, + { + "epoch": 0.5171288743882545, + "grad_norm": 0.26611343026161194, + "learning_rate": 0.00025848287112561173, + "loss": 0.3054, + "num_input_tokens_seen": 6862144, + "step": 3170 + }, + { + "epoch": 0.5179445350734094, + "grad_norm": 0.8188471794128418, + "learning_rate": 0.0002588907014681892, + "loss": 0.3631, + "num_input_tokens_seen": 6872704, + "step": 3175 + }, + { + "epoch": 0.5187601957585645, + "grad_norm": 0.3196231424808502, + "learning_rate": 0.00025929853181076674, + "loss": 0.359, + "num_input_tokens_seen": 6883232, + "step": 3180 + }, + { + "epoch": 0.5195758564437194, + "grad_norm": 0.34572383761405945, + "learning_rate": 0.0002597063621533442, + "loss": 0.3638, + "num_input_tokens_seen": 6894368, + "step": 3185 + }, + { + "epoch": 0.5203915171288744, + "grad_norm": 0.3288843333721161, + "learning_rate": 0.00026011419249592175, + "loss": 0.3348, + "num_input_tokens_seen": 6906336, + "step": 3190 + }, + { + "epoch": 0.5212071778140294, + "grad_norm": 0.3959663212299347, + "learning_rate": 0.00026052202283849917, + "loss": 0.3301, + "num_input_tokens_seen": 6917440, + "step": 3195 + }, + { + "epoch": 0.5220228384991843, + "grad_norm": 0.3188297748565674, + "learning_rate": 0.0002609298531810767, + "loss": 0.3442, + "num_input_tokens_seen": 6928736, + "step": 3200 + }, + { + "epoch": 0.5228384991843393, + "grad_norm": 0.35221388936042786, + "learning_rate": 0.0002613376835236542, + "loss": 0.3485, + "num_input_tokens_seen": 6939488, + "step": 3205 + }, + { + "epoch": 0.5236541598694943, + "grad_norm": 0.5500792264938354, + "learning_rate": 0.00026174551386623165, + "loss": 0.3342, + "num_input_tokens_seen": 6950016, + "step": 3210 + }, + { + "epoch": 0.5244698205546493, + "grad_norm": 0.41831010580062866, + "learning_rate": 0.00026215334420880913, + "loss": 0.3329, + "num_input_tokens_seen": 6961280, + "step": 3215 + }, + { + "epoch": 0.5252854812398042, + "grad_norm": 0.34372174739837646, + "learning_rate": 0.0002625611745513866, + "loss": 0.3033, + "num_input_tokens_seen": 6971360, + "step": 3220 + }, + { + "epoch": 0.5261011419249593, + "grad_norm": 0.260886013507843, + "learning_rate": 0.00026296900489396414, + "loss": 0.3655, + "num_input_tokens_seen": 6982528, + "step": 3225 + }, + { + "epoch": 0.5269168026101142, + "grad_norm": 0.2501831352710724, + "learning_rate": 0.0002633768352365416, + "loss": 0.3583, + "num_input_tokens_seen": 6992704, + "step": 3230 + }, + { + "epoch": 0.5277324632952691, + "grad_norm": 0.3974502384662628, + "learning_rate": 0.0002637846655791191, + "loss": 0.2711, + "num_input_tokens_seen": 7003072, + "step": 3235 + }, + { + "epoch": 0.5285481239804242, + "grad_norm": 0.4438716471195221, + "learning_rate": 0.00026419249592169657, + "loss": 0.351, + "num_input_tokens_seen": 7013728, + "step": 3240 + }, + { + "epoch": 0.5293637846655791, + "grad_norm": 0.32573044300079346, + "learning_rate": 0.00026460032626427404, + "loss": 0.2589, + "num_input_tokens_seen": 7024256, + "step": 3245 + }, + { + "epoch": 0.5301794453507341, + "grad_norm": 0.3844357430934906, + "learning_rate": 0.00026500815660685157, + "loss": 0.2638, + "num_input_tokens_seen": 7034688, + "step": 3250 + }, + { + "epoch": 0.5309951060358891, + "grad_norm": 0.4688999652862549, + "learning_rate": 0.00026541598694942905, + "loss": 0.4332, + "num_input_tokens_seen": 7046080, + "step": 3255 + }, + { + "epoch": 0.531810766721044, + "grad_norm": 0.5767448544502258, + "learning_rate": 0.0002658238172920066, + "loss": 0.3157, + "num_input_tokens_seen": 7056032, + "step": 3260 + }, + { + "epoch": 0.532626427406199, + "grad_norm": 2.0818305015563965, + "learning_rate": 0.000266231647634584, + "loss": 0.3434, + "num_input_tokens_seen": 7066880, + "step": 3265 + }, + { + "epoch": 0.533442088091354, + "grad_norm": 1.1674504280090332, + "learning_rate": 0.0002666394779771615, + "loss": 0.339, + "num_input_tokens_seen": 7077664, + "step": 3270 + }, + { + "epoch": 0.534257748776509, + "grad_norm": 1.1195868253707886, + "learning_rate": 0.000267047308319739, + "loss": 0.3439, + "num_input_tokens_seen": 7088928, + "step": 3275 + }, + { + "epoch": 0.5350734094616639, + "grad_norm": 0.4200175106525421, + "learning_rate": 0.0002674551386623165, + "loss": 0.35, + "num_input_tokens_seen": 7100800, + "step": 3280 + }, + { + "epoch": 0.535889070146819, + "grad_norm": 0.5413742065429688, + "learning_rate": 0.00026786296900489396, + "loss": 0.2677, + "num_input_tokens_seen": 7112160, + "step": 3285 + }, + { + "epoch": 0.5367047308319739, + "grad_norm": 0.6396359205245972, + "learning_rate": 0.00026827079934747144, + "loss": 0.2657, + "num_input_tokens_seen": 7122848, + "step": 3290 + }, + { + "epoch": 0.5375203915171288, + "grad_norm": 2.0694944858551025, + "learning_rate": 0.00026867862969004897, + "loss": 0.1975, + "num_input_tokens_seen": 7132800, + "step": 3295 + }, + { + "epoch": 0.5383360522022839, + "grad_norm": 1.4937360286712646, + "learning_rate": 0.00026908646003262645, + "loss": 0.1781, + "num_input_tokens_seen": 7144064, + "step": 3300 + }, + { + "epoch": 0.5391517128874388, + "grad_norm": 3.8154499530792236, + "learning_rate": 0.0002694942903752039, + "loss": 0.2212, + "num_input_tokens_seen": 7155744, + "step": 3305 + }, + { + "epoch": 0.5399673735725938, + "grad_norm": 3.7212789058685303, + "learning_rate": 0.0002699021207177814, + "loss": 0.2407, + "num_input_tokens_seen": 7166336, + "step": 3310 + }, + { + "epoch": 0.5407830342577488, + "grad_norm": 0.43533310294151306, + "learning_rate": 0.0002703099510603589, + "loss": 0.2136, + "num_input_tokens_seen": 7176832, + "step": 3315 + }, + { + "epoch": 0.5415986949429038, + "grad_norm": 4.021356582641602, + "learning_rate": 0.0002707177814029364, + "loss": 0.4333, + "num_input_tokens_seen": 7187616, + "step": 3320 + }, + { + "epoch": 0.5424143556280587, + "grad_norm": 1.149902105331421, + "learning_rate": 0.0002711256117455139, + "loss": 0.2748, + "num_input_tokens_seen": 7198144, + "step": 3325 + }, + { + "epoch": 0.5432300163132137, + "grad_norm": 4.663343906402588, + "learning_rate": 0.0002715334420880914, + "loss": 0.2526, + "num_input_tokens_seen": 7208960, + "step": 3330 + }, + { + "epoch": 0.5440456769983687, + "grad_norm": 4.060253620147705, + "learning_rate": 0.00027194127243066883, + "loss": 0.2445, + "num_input_tokens_seen": 7220064, + "step": 3335 + }, + { + "epoch": 0.5448613376835236, + "grad_norm": 11.504706382751465, + "learning_rate": 0.0002723491027732463, + "loss": 0.2842, + "num_input_tokens_seen": 7231008, + "step": 3340 + }, + { + "epoch": 0.5456769983686787, + "grad_norm": 5.282680511474609, + "learning_rate": 0.00027275693311582384, + "loss": 0.2295, + "num_input_tokens_seen": 7242144, + "step": 3345 + }, + { + "epoch": 0.5464926590538336, + "grad_norm": 1.881278157234192, + "learning_rate": 0.0002731647634584013, + "loss": 0.3371, + "num_input_tokens_seen": 7252448, + "step": 3350 + }, + { + "epoch": 0.5473083197389886, + "grad_norm": 2.955146074295044, + "learning_rate": 0.0002735725938009788, + "loss": 0.1651, + "num_input_tokens_seen": 7261856, + "step": 3355 + }, + { + "epoch": 0.5481239804241436, + "grad_norm": 0.45134466886520386, + "learning_rate": 0.00027398042414355627, + "loss": 0.1109, + "num_input_tokens_seen": 7272800, + "step": 3360 + }, + { + "epoch": 0.5489396411092985, + "grad_norm": 4.103794097900391, + "learning_rate": 0.00027438825448613375, + "loss": 0.3971, + "num_input_tokens_seen": 7283808, + "step": 3365 + }, + { + "epoch": 0.5497553017944535, + "grad_norm": 0.36970141530036926, + "learning_rate": 0.0002747960848287113, + "loss": 0.0893, + "num_input_tokens_seen": 7294976, + "step": 3370 + }, + { + "epoch": 0.5505709624796085, + "grad_norm": 3.103245258331299, + "learning_rate": 0.00027520391517128875, + "loss": 0.2665, + "num_input_tokens_seen": 7306560, + "step": 3375 + }, + { + "epoch": 0.5513866231647635, + "grad_norm": 7.966569900512695, + "learning_rate": 0.00027561174551386623, + "loss": 0.1223, + "num_input_tokens_seen": 7317600, + "step": 3380 + }, + { + "epoch": 0.5522022838499184, + "grad_norm": 11.583137512207031, + "learning_rate": 0.0002760195758564437, + "loss": 0.3326, + "num_input_tokens_seen": 7330144, + "step": 3385 + }, + { + "epoch": 0.5530179445350734, + "grad_norm": 15.787392616271973, + "learning_rate": 0.00027642740619902124, + "loss": 0.2025, + "num_input_tokens_seen": 7341248, + "step": 3390 + }, + { + "epoch": 0.5538336052202284, + "grad_norm": 5.767389297485352, + "learning_rate": 0.0002768352365415987, + "loss": 0.5156, + "num_input_tokens_seen": 7351200, + "step": 3395 + }, + { + "epoch": 0.5546492659053833, + "grad_norm": 0.38437649607658386, + "learning_rate": 0.0002772430668841762, + "loss": 0.0356, + "num_input_tokens_seen": 7361792, + "step": 3400 + }, + { + "epoch": 0.5554649265905384, + "grad_norm": 3.2605271339416504, + "learning_rate": 0.00027765089722675367, + "loss": 0.2842, + "num_input_tokens_seen": 7372160, + "step": 3405 + }, + { + "epoch": 0.5562805872756933, + "grad_norm": 0.7511138319969177, + "learning_rate": 0.00027805872756933114, + "loss": 0.135, + "num_input_tokens_seen": 7381408, + "step": 3410 + }, + { + "epoch": 0.5570962479608483, + "grad_norm": 2.64924693107605, + "learning_rate": 0.0002784665579119087, + "loss": 0.2896, + "num_input_tokens_seen": 7392512, + "step": 3415 + }, + { + "epoch": 0.5579119086460033, + "grad_norm": 2.335987091064453, + "learning_rate": 0.00027887438825448615, + "loss": 0.2444, + "num_input_tokens_seen": 7404192, + "step": 3420 + }, + { + "epoch": 0.5587275693311582, + "grad_norm": 0.07507435232400894, + "learning_rate": 0.00027928221859706363, + "loss": 0.1906, + "num_input_tokens_seen": 7415008, + "step": 3425 + }, + { + "epoch": 0.5595432300163132, + "grad_norm": 2.7252261638641357, + "learning_rate": 0.0002796900489396411, + "loss": 0.2069, + "num_input_tokens_seen": 7425088, + "step": 3430 + }, + { + "epoch": 0.5603588907014682, + "grad_norm": 1.43885338306427, + "learning_rate": 0.0002800978792822186, + "loss": 0.3257, + "num_input_tokens_seen": 7436608, + "step": 3435 + }, + { + "epoch": 0.5611745513866232, + "grad_norm": 0.6228769421577454, + "learning_rate": 0.0002805057096247961, + "loss": 0.204, + "num_input_tokens_seen": 7446784, + "step": 3440 + }, + { + "epoch": 0.5619902120717781, + "grad_norm": 0.7971484065055847, + "learning_rate": 0.0002809135399673736, + "loss": 0.2242, + "num_input_tokens_seen": 7456640, + "step": 3445 + }, + { + "epoch": 0.5628058727569332, + "grad_norm": 0.16205042600631714, + "learning_rate": 0.00028132137030995106, + "loss": 0.0723, + "num_input_tokens_seen": 7468096, + "step": 3450 + }, + { + "epoch": 0.5636215334420881, + "grad_norm": 0.6337445974349976, + "learning_rate": 0.00028172920065252854, + "loss": 0.2102, + "num_input_tokens_seen": 7480096, + "step": 3455 + }, + { + "epoch": 0.564437194127243, + "grad_norm": 0.3351195156574249, + "learning_rate": 0.000282137030995106, + "loss": 0.1359, + "num_input_tokens_seen": 7491136, + "step": 3460 + }, + { + "epoch": 0.5652528548123981, + "grad_norm": 0.2824116051197052, + "learning_rate": 0.00028254486133768355, + "loss": 0.0847, + "num_input_tokens_seen": 7502304, + "step": 3465 + }, + { + "epoch": 0.566068515497553, + "grad_norm": 0.9770231246948242, + "learning_rate": 0.000282952691680261, + "loss": 0.2422, + "num_input_tokens_seen": 7512928, + "step": 3470 + }, + { + "epoch": 0.566884176182708, + "grad_norm": 0.2426728755235672, + "learning_rate": 0.0002833605220228385, + "loss": 0.2351, + "num_input_tokens_seen": 7523904, + "step": 3475 + }, + { + "epoch": 0.567699836867863, + "grad_norm": 1.6302433013916016, + "learning_rate": 0.000283768352365416, + "loss": 0.2663, + "num_input_tokens_seen": 7534496, + "step": 3480 + }, + { + "epoch": 0.5685154975530179, + "grad_norm": 2.3486311435699463, + "learning_rate": 0.0002841761827079935, + "loss": 0.3039, + "num_input_tokens_seen": 7545120, + "step": 3485 + }, + { + "epoch": 0.5693311582381729, + "grad_norm": 1.3553709983825684, + "learning_rate": 0.000284584013050571, + "loss": 0.0968, + "num_input_tokens_seen": 7555712, + "step": 3490 + }, + { + "epoch": 0.5701468189233279, + "grad_norm": 0.3113894462585449, + "learning_rate": 0.0002849918433931484, + "loss": 0.1543, + "num_input_tokens_seen": 7567200, + "step": 3495 + }, + { + "epoch": 0.5709624796084829, + "grad_norm": 1.223634958267212, + "learning_rate": 0.00028539967373572594, + "loss": 0.2732, + "num_input_tokens_seen": 7578464, + "step": 3500 + }, + { + "epoch": 0.5717781402936378, + "grad_norm": 0.20850342512130737, + "learning_rate": 0.0002858075040783034, + "loss": 0.1921, + "num_input_tokens_seen": 7588256, + "step": 3505 + }, + { + "epoch": 0.5725938009787929, + "grad_norm": 2.081028699874878, + "learning_rate": 0.00028621533442088094, + "loss": 0.3551, + "num_input_tokens_seen": 7598848, + "step": 3510 + }, + { + "epoch": 0.5734094616639478, + "grad_norm": 1.5860928297042847, + "learning_rate": 0.0002866231647634584, + "loss": 0.1176, + "num_input_tokens_seen": 7609696, + "step": 3515 + }, + { + "epoch": 0.5742251223491027, + "grad_norm": 0.4478835165500641, + "learning_rate": 0.0002870309951060359, + "loss": 0.1014, + "num_input_tokens_seen": 7620864, + "step": 3520 + }, + { + "epoch": 0.5750407830342578, + "grad_norm": 0.5952057838439941, + "learning_rate": 0.0002874388254486134, + "loss": 0.0888, + "num_input_tokens_seen": 7631392, + "step": 3525 + }, + { + "epoch": 0.5758564437194127, + "grad_norm": 0.11655039340257645, + "learning_rate": 0.00028784665579119085, + "loss": 0.0938, + "num_input_tokens_seen": 7643232, + "step": 3530 + }, + { + "epoch": 0.5766721044045677, + "grad_norm": 0.08364559710025787, + "learning_rate": 0.0002882544861337684, + "loss": 0.1788, + "num_input_tokens_seen": 7654944, + "step": 3535 + }, + { + "epoch": 0.5774877650897227, + "grad_norm": 1.4451963901519775, + "learning_rate": 0.00028866231647634586, + "loss": 0.0771, + "num_input_tokens_seen": 7665760, + "step": 3540 + }, + { + "epoch": 0.5783034257748777, + "grad_norm": 0.045954909175634384, + "learning_rate": 0.00028907014681892333, + "loss": 0.1094, + "num_input_tokens_seen": 7676576, + "step": 3545 + }, + { + "epoch": 0.5791190864600326, + "grad_norm": 0.6190336346626282, + "learning_rate": 0.0002894779771615008, + "loss": 0.2924, + "num_input_tokens_seen": 7687104, + "step": 3550 + }, + { + "epoch": 0.5799347471451876, + "grad_norm": 0.5118790864944458, + "learning_rate": 0.00028988580750407834, + "loss": 0.1175, + "num_input_tokens_seen": 7698080, + "step": 3555 + }, + { + "epoch": 0.5807504078303426, + "grad_norm": 0.957623302936554, + "learning_rate": 0.0002902936378466558, + "loss": 0.2314, + "num_input_tokens_seen": 7709632, + "step": 3560 + }, + { + "epoch": 0.5815660685154975, + "grad_norm": 0.7817164063453674, + "learning_rate": 0.00029070146818923324, + "loss": 0.0697, + "num_input_tokens_seen": 7719936, + "step": 3565 + }, + { + "epoch": 0.5823817292006526, + "grad_norm": 0.12220557779073715, + "learning_rate": 0.00029110929853181077, + "loss": 0.1666, + "num_input_tokens_seen": 7730080, + "step": 3570 + }, + { + "epoch": 0.5831973898858075, + "grad_norm": 1.621195912361145, + "learning_rate": 0.00029151712887438825, + "loss": 0.2131, + "num_input_tokens_seen": 7740576, + "step": 3575 + }, + { + "epoch": 0.5840130505709625, + "grad_norm": 0.04325403645634651, + "learning_rate": 0.0002919249592169658, + "loss": 0.1593, + "num_input_tokens_seen": 7751040, + "step": 3580 + }, + { + "epoch": 0.5848287112561175, + "grad_norm": 0.017309635877609253, + "learning_rate": 0.00029233278955954325, + "loss": 0.0343, + "num_input_tokens_seen": 7760768, + "step": 3585 + }, + { + "epoch": 0.5856443719412724, + "grad_norm": 0.13765253126621246, + "learning_rate": 0.0002927406199021207, + "loss": 0.0984, + "num_input_tokens_seen": 7771680, + "step": 3590 + }, + { + "epoch": 0.5864600326264274, + "grad_norm": 0.6749603152275085, + "learning_rate": 0.0002931484502446982, + "loss": 0.1097, + "num_input_tokens_seen": 7783072, + "step": 3595 + }, + { + "epoch": 0.5872756933115824, + "grad_norm": 0.12430331856012344, + "learning_rate": 0.0002935562805872757, + "loss": 0.0998, + "num_input_tokens_seen": 7792608, + "step": 3600 + }, + { + "epoch": 0.5880913539967374, + "grad_norm": 0.19184599816799164, + "learning_rate": 0.0002939641109298532, + "loss": 0.299, + "num_input_tokens_seen": 7803744, + "step": 3605 + }, + { + "epoch": 0.5889070146818923, + "grad_norm": 0.5135388374328613, + "learning_rate": 0.0002943719412724307, + "loss": 0.1628, + "num_input_tokens_seen": 7815008, + "step": 3610 + }, + { + "epoch": 0.5897226753670473, + "grad_norm": 0.5754003524780273, + "learning_rate": 0.00029477977161500817, + "loss": 0.1835, + "num_input_tokens_seen": 7825536, + "step": 3615 + }, + { + "epoch": 0.5905383360522023, + "grad_norm": 0.5735435485839844, + "learning_rate": 0.00029518760195758564, + "loss": 0.2017, + "num_input_tokens_seen": 7836160, + "step": 3620 + }, + { + "epoch": 0.5913539967373572, + "grad_norm": 0.712598443031311, + "learning_rate": 0.0002955954323001631, + "loss": 0.0891, + "num_input_tokens_seen": 7848864, + "step": 3625 + }, + { + "epoch": 0.5921696574225123, + "grad_norm": 1.8197914361953735, + "learning_rate": 0.00029600326264274065, + "loss": 0.1948, + "num_input_tokens_seen": 7859488, + "step": 3630 + }, + { + "epoch": 0.5929853181076672, + "grad_norm": 2.1675949096679688, + "learning_rate": 0.00029641109298531807, + "loss": 0.1876, + "num_input_tokens_seen": 7869600, + "step": 3635 + }, + { + "epoch": 0.5938009787928222, + "grad_norm": 0.9781519174575806, + "learning_rate": 0.0002968189233278956, + "loss": 0.1315, + "num_input_tokens_seen": 7879840, + "step": 3640 + }, + { + "epoch": 0.5946166394779772, + "grad_norm": 0.18419213593006134, + "learning_rate": 0.0002972267536704731, + "loss": 0.0612, + "num_input_tokens_seen": 7890272, + "step": 3645 + }, + { + "epoch": 0.5954323001631321, + "grad_norm": 1.9968929290771484, + "learning_rate": 0.0002976345840130506, + "loss": 0.3824, + "num_input_tokens_seen": 7901120, + "step": 3650 + }, + { + "epoch": 0.5962479608482871, + "grad_norm": 1.1627930402755737, + "learning_rate": 0.0002980424143556281, + "loss": 0.1702, + "num_input_tokens_seen": 7912608, + "step": 3655 + }, + { + "epoch": 0.5970636215334421, + "grad_norm": 0.1972116231918335, + "learning_rate": 0.0002984502446982055, + "loss": 0.2167, + "num_input_tokens_seen": 7923936, + "step": 3660 + }, + { + "epoch": 0.5978792822185971, + "grad_norm": 0.49415111541748047, + "learning_rate": 0.00029885807504078304, + "loss": 0.2059, + "num_input_tokens_seen": 7934496, + "step": 3665 + }, + { + "epoch": 0.598694942903752, + "grad_norm": 0.25723397731781006, + "learning_rate": 0.0002992659053833605, + "loss": 0.1787, + "num_input_tokens_seen": 7945728, + "step": 3670 + }, + { + "epoch": 0.5995106035889071, + "grad_norm": 0.24495765566825867, + "learning_rate": 0.00029967373572593805, + "loss": 0.1547, + "num_input_tokens_seen": 7956960, + "step": 3675 + }, + { + "epoch": 0.600326264274062, + "grad_norm": 0.6844676733016968, + "learning_rate": 0.0003000815660685155, + "loss": 0.1315, + "num_input_tokens_seen": 7969440, + "step": 3680 + }, + { + "epoch": 0.6011419249592169, + "grad_norm": 0.3189769983291626, + "learning_rate": 0.000300489396411093, + "loss": 0.0872, + "num_input_tokens_seen": 7978560, + "step": 3685 + }, + { + "epoch": 0.601957585644372, + "grad_norm": 0.6261066794395447, + "learning_rate": 0.0003008972267536705, + "loss": 0.2567, + "num_input_tokens_seen": 7990400, + "step": 3690 + }, + { + "epoch": 0.6027732463295269, + "grad_norm": 0.8831694722175598, + "learning_rate": 0.00030130505709624795, + "loss": 0.1221, + "num_input_tokens_seen": 8000128, + "step": 3695 + }, + { + "epoch": 0.6035889070146819, + "grad_norm": 0.14275726675987244, + "learning_rate": 0.0003017128874388255, + "loss": 0.1726, + "num_input_tokens_seen": 8010592, + "step": 3700 + }, + { + "epoch": 0.6044045676998369, + "grad_norm": 0.41234052181243896, + "learning_rate": 0.0003021207177814029, + "loss": 0.2479, + "num_input_tokens_seen": 8022240, + "step": 3705 + }, + { + "epoch": 0.6052202283849919, + "grad_norm": 0.16699844598770142, + "learning_rate": 0.00030252854812398044, + "loss": 0.1367, + "num_input_tokens_seen": 8032096, + "step": 3710 + }, + { + "epoch": 0.6060358890701468, + "grad_norm": 1.191625952720642, + "learning_rate": 0.0003029363784665579, + "loss": 0.2151, + "num_input_tokens_seen": 8042400, + "step": 3715 + }, + { + "epoch": 0.6068515497553018, + "grad_norm": 0.10937491804361343, + "learning_rate": 0.0003033442088091354, + "loss": 0.0987, + "num_input_tokens_seen": 8052672, + "step": 3720 + }, + { + "epoch": 0.6076672104404568, + "grad_norm": 0.3499789535999298, + "learning_rate": 0.0003037520391517129, + "loss": 0.1599, + "num_input_tokens_seen": 8063712, + "step": 3725 + }, + { + "epoch": 0.6084828711256117, + "grad_norm": 0.21835818886756897, + "learning_rate": 0.00030415986949429034, + "loss": 0.1213, + "num_input_tokens_seen": 8073600, + "step": 3730 + }, + { + "epoch": 0.6092985318107668, + "grad_norm": 0.3154020607471466, + "learning_rate": 0.00030456769983686787, + "loss": 0.1756, + "num_input_tokens_seen": 8085664, + "step": 3735 + }, + { + "epoch": 0.6101141924959217, + "grad_norm": 0.5285472273826599, + "learning_rate": 0.00030497553017944535, + "loss": 0.1691, + "num_input_tokens_seen": 8097152, + "step": 3740 + }, + { + "epoch": 0.6109298531810766, + "grad_norm": 0.5110867023468018, + "learning_rate": 0.0003053833605220229, + "loss": 0.0891, + "num_input_tokens_seen": 8107648, + "step": 3745 + }, + { + "epoch": 0.6117455138662317, + "grad_norm": 0.7529709339141846, + "learning_rate": 0.00030579119086460036, + "loss": 0.2222, + "num_input_tokens_seen": 8118272, + "step": 3750 + }, + { + "epoch": 0.6125611745513866, + "grad_norm": 0.29078394174575806, + "learning_rate": 0.0003061990212071778, + "loss": 0.1238, + "num_input_tokens_seen": 8129120, + "step": 3755 + }, + { + "epoch": 0.6133768352365416, + "grad_norm": 0.18543705344200134, + "learning_rate": 0.0003066068515497553, + "loss": 0.1179, + "num_input_tokens_seen": 8139200, + "step": 3760 + }, + { + "epoch": 0.6141924959216966, + "grad_norm": 0.1336267739534378, + "learning_rate": 0.0003070146818923328, + "loss": 0.0697, + "num_input_tokens_seen": 8150272, + "step": 3765 + }, + { + "epoch": 0.6150081566068516, + "grad_norm": 0.9017431735992432, + "learning_rate": 0.0003074225122349103, + "loss": 0.1805, + "num_input_tokens_seen": 8162496, + "step": 3770 + }, + { + "epoch": 0.6158238172920065, + "grad_norm": 0.2880920469760895, + "learning_rate": 0.00030783034257748774, + "loss": 0.1551, + "num_input_tokens_seen": 8173664, + "step": 3775 + }, + { + "epoch": 0.6166394779771615, + "grad_norm": 0.11792991310358047, + "learning_rate": 0.00030823817292006527, + "loss": 0.1701, + "num_input_tokens_seen": 8184512, + "step": 3780 + }, + { + "epoch": 0.6174551386623165, + "grad_norm": 1.30263352394104, + "learning_rate": 0.00030864600326264275, + "loss": 0.2642, + "num_input_tokens_seen": 8194432, + "step": 3785 + }, + { + "epoch": 0.6182707993474714, + "grad_norm": 1.2924909591674805, + "learning_rate": 0.0003090538336052202, + "loss": 0.2478, + "num_input_tokens_seen": 8205152, + "step": 3790 + }, + { + "epoch": 0.6190864600326265, + "grad_norm": 0.40174412727355957, + "learning_rate": 0.00030946166394779775, + "loss": 0.2048, + "num_input_tokens_seen": 8215360, + "step": 3795 + }, + { + "epoch": 0.6199021207177814, + "grad_norm": 0.31183719635009766, + "learning_rate": 0.0003098694942903752, + "loss": 0.1391, + "num_input_tokens_seen": 8225216, + "step": 3800 + }, + { + "epoch": 0.6207177814029364, + "grad_norm": 0.7523826956748962, + "learning_rate": 0.0003102773246329527, + "loss": 0.1697, + "num_input_tokens_seen": 8235488, + "step": 3805 + }, + { + "epoch": 0.6215334420880914, + "grad_norm": 0.4145306944847107, + "learning_rate": 0.0003106851549755302, + "loss": 0.1359, + "num_input_tokens_seen": 8246208, + "step": 3810 + }, + { + "epoch": 0.6223491027732463, + "grad_norm": 0.41024190187454224, + "learning_rate": 0.00031109298531810766, + "loss": 0.1166, + "num_input_tokens_seen": 8258048, + "step": 3815 + }, + { + "epoch": 0.6231647634584013, + "grad_norm": 0.3654584586620331, + "learning_rate": 0.0003115008156606852, + "loss": 0.0574, + "num_input_tokens_seen": 8269472, + "step": 3820 + }, + { + "epoch": 0.6239804241435563, + "grad_norm": 0.9429664611816406, + "learning_rate": 0.0003119086460032626, + "loss": 0.3624, + "num_input_tokens_seen": 8280512, + "step": 3825 + }, + { + "epoch": 0.6247960848287113, + "grad_norm": 0.17094814777374268, + "learning_rate": 0.00031231647634584014, + "loss": 0.1941, + "num_input_tokens_seen": 8291360, + "step": 3830 + }, + { + "epoch": 0.6256117455138662, + "grad_norm": 0.056781914085149765, + "learning_rate": 0.0003127243066884176, + "loss": 0.0781, + "num_input_tokens_seen": 8301440, + "step": 3835 + }, + { + "epoch": 0.6264274061990212, + "grad_norm": 0.4901602268218994, + "learning_rate": 0.00031313213703099515, + "loss": 0.2091, + "num_input_tokens_seen": 8313472, + "step": 3840 + }, + { + "epoch": 0.6272430668841762, + "grad_norm": 0.10239008069038391, + "learning_rate": 0.0003135399673735726, + "loss": 0.0432, + "num_input_tokens_seen": 8323488, + "step": 3845 + }, + { + "epoch": 0.6280587275693311, + "grad_norm": 0.12009858340024948, + "learning_rate": 0.00031394779771615005, + "loss": 0.0599, + "num_input_tokens_seen": 8336096, + "step": 3850 + }, + { + "epoch": 0.6288743882544862, + "grad_norm": 0.9222142696380615, + "learning_rate": 0.0003143556280587276, + "loss": 0.1487, + "num_input_tokens_seen": 8344736, + "step": 3855 + }, + { + "epoch": 0.6296900489396411, + "grad_norm": 0.0708332359790802, + "learning_rate": 0.00031476345840130506, + "loss": 0.1018, + "num_input_tokens_seen": 8356416, + "step": 3860 + }, + { + "epoch": 0.6305057096247961, + "grad_norm": 0.38017889857292175, + "learning_rate": 0.0003151712887438826, + "loss": 0.1401, + "num_input_tokens_seen": 8367712, + "step": 3865 + }, + { + "epoch": 0.6313213703099511, + "grad_norm": 0.03265518695116043, + "learning_rate": 0.00031557911908646, + "loss": 0.2011, + "num_input_tokens_seen": 8378752, + "step": 3870 + }, + { + "epoch": 0.632137030995106, + "grad_norm": 0.5149789452552795, + "learning_rate": 0.00031598694942903754, + "loss": 0.2281, + "num_input_tokens_seen": 8389472, + "step": 3875 + }, + { + "epoch": 0.632952691680261, + "grad_norm": 0.25411802530288696, + "learning_rate": 0.000316394779771615, + "loss": 0.2246, + "num_input_tokens_seen": 8400064, + "step": 3880 + }, + { + "epoch": 0.633768352365416, + "grad_norm": 0.4327666759490967, + "learning_rate": 0.0003168026101141925, + "loss": 0.1371, + "num_input_tokens_seen": 8410176, + "step": 3885 + }, + { + "epoch": 0.634584013050571, + "grad_norm": 0.22683726251125336, + "learning_rate": 0.00031721044045677, + "loss": 0.3167, + "num_input_tokens_seen": 8420256, + "step": 3890 + }, + { + "epoch": 0.6353996737357259, + "grad_norm": 0.8412274718284607, + "learning_rate": 0.00031761827079934744, + "loss": 0.1702, + "num_input_tokens_seen": 8431808, + "step": 3895 + }, + { + "epoch": 0.636215334420881, + "grad_norm": 0.2465798258781433, + "learning_rate": 0.000318026101141925, + "loss": 0.1306, + "num_input_tokens_seen": 8444064, + "step": 3900 + }, + { + "epoch": 0.6370309951060359, + "grad_norm": 0.31027480959892273, + "learning_rate": 0.00031843393148450245, + "loss": 0.1557, + "num_input_tokens_seen": 8454944, + "step": 3905 + }, + { + "epoch": 0.6378466557911908, + "grad_norm": 0.06405351310968399, + "learning_rate": 0.00031884176182708, + "loss": 0.0342, + "num_input_tokens_seen": 8466208, + "step": 3910 + }, + { + "epoch": 0.6386623164763459, + "grad_norm": 0.20856167376041412, + "learning_rate": 0.00031924959216965746, + "loss": 0.0742, + "num_input_tokens_seen": 8476480, + "step": 3915 + }, + { + "epoch": 0.6394779771615008, + "grad_norm": 0.5141230821609497, + "learning_rate": 0.0003196574225122349, + "loss": 0.1296, + "num_input_tokens_seen": 8486688, + "step": 3920 + }, + { + "epoch": 0.6402936378466558, + "grad_norm": 0.09380485117435455, + "learning_rate": 0.0003200652528548124, + "loss": 0.1631, + "num_input_tokens_seen": 8499200, + "step": 3925 + }, + { + "epoch": 0.6411092985318108, + "grad_norm": 0.542812705039978, + "learning_rate": 0.0003204730831973899, + "loss": 0.1939, + "num_input_tokens_seen": 8509728, + "step": 3930 + }, + { + "epoch": 0.6419249592169658, + "grad_norm": 0.8569459319114685, + "learning_rate": 0.0003208809135399674, + "loss": 0.1429, + "num_input_tokens_seen": 8520928, + "step": 3935 + }, + { + "epoch": 0.6427406199021207, + "grad_norm": 1.3260815143585205, + "learning_rate": 0.00032128874388254484, + "loss": 0.2277, + "num_input_tokens_seen": 8532448, + "step": 3940 + }, + { + "epoch": 0.6435562805872757, + "grad_norm": 1.7172664403915405, + "learning_rate": 0.0003216965742251223, + "loss": 0.1428, + "num_input_tokens_seen": 8543360, + "step": 3945 + }, + { + "epoch": 0.6443719412724307, + "grad_norm": 0.8204522728919983, + "learning_rate": 0.00032210440456769985, + "loss": 0.2712, + "num_input_tokens_seen": 8554176, + "step": 3950 + }, + { + "epoch": 0.6451876019575856, + "grad_norm": 0.466779500246048, + "learning_rate": 0.0003225122349102773, + "loss": 0.1626, + "num_input_tokens_seen": 8565376, + "step": 3955 + }, + { + "epoch": 0.6460032626427407, + "grad_norm": 0.5056552886962891, + "learning_rate": 0.00032292006525285486, + "loss": 0.2301, + "num_input_tokens_seen": 8576480, + "step": 3960 + }, + { + "epoch": 0.6468189233278956, + "grad_norm": 0.6118406653404236, + "learning_rate": 0.0003233278955954323, + "loss": 0.1045, + "num_input_tokens_seen": 8586944, + "step": 3965 + }, + { + "epoch": 0.6476345840130505, + "grad_norm": 0.10663698613643646, + "learning_rate": 0.0003237357259380098, + "loss": 0.2513, + "num_input_tokens_seen": 8596640, + "step": 3970 + }, + { + "epoch": 0.6484502446982056, + "grad_norm": 0.1542896330356598, + "learning_rate": 0.0003241435562805873, + "loss": 0.091, + "num_input_tokens_seen": 8608256, + "step": 3975 + }, + { + "epoch": 0.6492659053833605, + "grad_norm": 0.6385184526443481, + "learning_rate": 0.00032455138662316476, + "loss": 0.152, + "num_input_tokens_seen": 8618752, + "step": 3980 + }, + { + "epoch": 0.6500815660685155, + "grad_norm": 0.0111524797976017, + "learning_rate": 0.0003249592169657423, + "loss": 0.1256, + "num_input_tokens_seen": 8629824, + "step": 3985 + }, + { + "epoch": 0.6508972267536705, + "grad_norm": 0.03276031091809273, + "learning_rate": 0.0003253670473083197, + "loss": 0.1968, + "num_input_tokens_seen": 8640832, + "step": 3990 + }, + { + "epoch": 0.6517128874388255, + "grad_norm": 0.07210643589496613, + "learning_rate": 0.00032577487765089724, + "loss": 0.158, + "num_input_tokens_seen": 8651104, + "step": 3995 + }, + { + "epoch": 0.6525285481239804, + "grad_norm": 0.9950265884399414, + "learning_rate": 0.0003261827079934747, + "loss": 0.2309, + "num_input_tokens_seen": 8662112, + "step": 4000 + }, + { + "epoch": 0.6533442088091354, + "grad_norm": 0.25081875920295715, + "learning_rate": 0.00032659053833605225, + "loss": 0.0864, + "num_input_tokens_seen": 8672960, + "step": 4005 + }, + { + "epoch": 0.6541598694942904, + "grad_norm": 0.04845547303557396, + "learning_rate": 0.0003269983686786297, + "loss": 0.053, + "num_input_tokens_seen": 8683968, + "step": 4010 + }, + { + "epoch": 0.6549755301794453, + "grad_norm": 0.6152459979057312, + "learning_rate": 0.00032740619902120715, + "loss": 0.1411, + "num_input_tokens_seen": 8695200, + "step": 4015 + }, + { + "epoch": 0.6557911908646004, + "grad_norm": 0.2643890380859375, + "learning_rate": 0.0003278140293637847, + "loss": 0.2036, + "num_input_tokens_seen": 8705408, + "step": 4020 + }, + { + "epoch": 0.6566068515497553, + "grad_norm": 0.8687467575073242, + "learning_rate": 0.00032822185970636216, + "loss": 0.2411, + "num_input_tokens_seen": 8716128, + "step": 4025 + }, + { + "epoch": 0.6574225122349103, + "grad_norm": 0.24564947187900543, + "learning_rate": 0.0003286296900489397, + "loss": 0.2853, + "num_input_tokens_seen": 8727744, + "step": 4030 + }, + { + "epoch": 0.6582381729200653, + "grad_norm": 0.45441335439682007, + "learning_rate": 0.0003290375203915171, + "loss": 0.2239, + "num_input_tokens_seen": 8740128, + "step": 4035 + }, + { + "epoch": 0.6590538336052202, + "grad_norm": 0.29755422472953796, + "learning_rate": 0.00032944535073409464, + "loss": 0.1493, + "num_input_tokens_seen": 8752320, + "step": 4040 + }, + { + "epoch": 0.6598694942903752, + "grad_norm": 0.3011384606361389, + "learning_rate": 0.0003298531810766721, + "loss": 0.1758, + "num_input_tokens_seen": 8762784, + "step": 4045 + }, + { + "epoch": 0.6606851549755302, + "grad_norm": 0.13070525228977203, + "learning_rate": 0.0003302610114192496, + "loss": 0.1104, + "num_input_tokens_seen": 8773312, + "step": 4050 + }, + { + "epoch": 0.6615008156606852, + "grad_norm": 0.37032073736190796, + "learning_rate": 0.0003306688417618271, + "loss": 0.1433, + "num_input_tokens_seen": 8783744, + "step": 4055 + }, + { + "epoch": 0.6623164763458401, + "grad_norm": 0.22848550975322723, + "learning_rate": 0.00033107667210440455, + "loss": 0.0828, + "num_input_tokens_seen": 8795104, + "step": 4060 + }, + { + "epoch": 0.6631321370309952, + "grad_norm": 0.23329204320907593, + "learning_rate": 0.0003314845024469821, + "loss": 0.1024, + "num_input_tokens_seen": 8805760, + "step": 4065 + }, + { + "epoch": 0.6639477977161501, + "grad_norm": 0.7741011381149292, + "learning_rate": 0.00033189233278955955, + "loss": 0.119, + "num_input_tokens_seen": 8817536, + "step": 4070 + }, + { + "epoch": 0.664763458401305, + "grad_norm": 0.29406437277793884, + "learning_rate": 0.00033230016313213703, + "loss": 0.0765, + "num_input_tokens_seen": 8829184, + "step": 4075 + }, + { + "epoch": 0.6655791190864601, + "grad_norm": 0.19787578284740448, + "learning_rate": 0.0003327079934747145, + "loss": 0.0728, + "num_input_tokens_seen": 8839328, + "step": 4080 + }, + { + "epoch": 0.666394779771615, + "grad_norm": 0.26881399750709534, + "learning_rate": 0.000333115823817292, + "loss": 0.2293, + "num_input_tokens_seen": 8850336, + "step": 4085 + }, + { + "epoch": 0.66721044045677, + "grad_norm": 0.05652927979826927, + "learning_rate": 0.0003335236541598695, + "loss": 0.1286, + "num_input_tokens_seen": 8859392, + "step": 4090 + }, + { + "epoch": 0.668026101141925, + "grad_norm": 0.01822480745613575, + "learning_rate": 0.000333931484502447, + "loss": 0.0623, + "num_input_tokens_seen": 8871008, + "step": 4095 + }, + { + "epoch": 0.6688417618270799, + "grad_norm": 0.025784168392419815, + "learning_rate": 0.0003343393148450245, + "loss": 0.1742, + "num_input_tokens_seen": 8881760, + "step": 4100 + }, + { + "epoch": 0.6696574225122349, + "grad_norm": 0.08115997165441513, + "learning_rate": 0.00033474714518760194, + "loss": 0.2347, + "num_input_tokens_seen": 8892544, + "step": 4105 + }, + { + "epoch": 0.6704730831973899, + "grad_norm": 0.3836643099784851, + "learning_rate": 0.0003351549755301794, + "loss": 0.2412, + "num_input_tokens_seen": 8903936, + "step": 4110 + }, + { + "epoch": 0.6712887438825449, + "grad_norm": 0.119254931807518, + "learning_rate": 0.00033556280587275695, + "loss": 0.1529, + "num_input_tokens_seen": 8913952, + "step": 4115 + }, + { + "epoch": 0.6721044045676998, + "grad_norm": 0.5897416472434998, + "learning_rate": 0.0003359706362153344, + "loss": 0.1757, + "num_input_tokens_seen": 8925120, + "step": 4120 + }, + { + "epoch": 0.6729200652528549, + "grad_norm": 0.30086612701416016, + "learning_rate": 0.00033637846655791196, + "loss": 0.1692, + "num_input_tokens_seen": 8935872, + "step": 4125 + }, + { + "epoch": 0.6737357259380098, + "grad_norm": 0.12169612944126129, + "learning_rate": 0.0003367862969004894, + "loss": 0.1185, + "num_input_tokens_seen": 8946624, + "step": 4130 + }, + { + "epoch": 0.6745513866231647, + "grad_norm": 0.06142524629831314, + "learning_rate": 0.0003371941272430669, + "loss": 0.1007, + "num_input_tokens_seen": 8957568, + "step": 4135 + }, + { + "epoch": 0.6753670473083198, + "grad_norm": 0.22979362308979034, + "learning_rate": 0.0003376019575856444, + "loss": 0.1771, + "num_input_tokens_seen": 8968160, + "step": 4140 + }, + { + "epoch": 0.6761827079934747, + "grad_norm": 0.14729270339012146, + "learning_rate": 0.00033800978792822186, + "loss": 0.1118, + "num_input_tokens_seen": 8978560, + "step": 4145 + }, + { + "epoch": 0.6769983686786297, + "grad_norm": 0.9021096229553223, + "learning_rate": 0.00033841761827079934, + "loss": 0.2374, + "num_input_tokens_seen": 8988896, + "step": 4150 + }, + { + "epoch": 0.6778140293637847, + "grad_norm": 0.7865363955497742, + "learning_rate": 0.0003388254486133768, + "loss": 0.2803, + "num_input_tokens_seen": 9000928, + "step": 4155 + }, + { + "epoch": 0.6786296900489397, + "grad_norm": 0.2096089869737625, + "learning_rate": 0.00033923327895595435, + "loss": 0.0894, + "num_input_tokens_seen": 9011712, + "step": 4160 + }, + { + "epoch": 0.6794453507340946, + "grad_norm": 0.2097005844116211, + "learning_rate": 0.0003396411092985318, + "loss": 0.1248, + "num_input_tokens_seen": 9022624, + "step": 4165 + }, + { + "epoch": 0.6802610114192496, + "grad_norm": 0.19112005829811096, + "learning_rate": 0.0003400489396411093, + "loss": 0.2156, + "num_input_tokens_seen": 9033088, + "step": 4170 + }, + { + "epoch": 0.6810766721044046, + "grad_norm": 0.7205477952957153, + "learning_rate": 0.0003404567699836868, + "loss": 0.2077, + "num_input_tokens_seen": 9043680, + "step": 4175 + }, + { + "epoch": 0.6818923327895595, + "grad_norm": 0.4396249055862427, + "learning_rate": 0.00034086460032626425, + "loss": 0.1547, + "num_input_tokens_seen": 9055424, + "step": 4180 + }, + { + "epoch": 0.6827079934747146, + "grad_norm": 0.6017875075340271, + "learning_rate": 0.0003412724306688418, + "loss": 0.0954, + "num_input_tokens_seen": 9066816, + "step": 4185 + }, + { + "epoch": 0.6835236541598695, + "grad_norm": 0.12390795350074768, + "learning_rate": 0.00034168026101141926, + "loss": 0.1381, + "num_input_tokens_seen": 9077792, + "step": 4190 + }, + { + "epoch": 0.6843393148450244, + "grad_norm": 0.29826679825782776, + "learning_rate": 0.0003420880913539968, + "loss": 0.2694, + "num_input_tokens_seen": 9088160, + "step": 4195 + }, + { + "epoch": 0.6851549755301795, + "grad_norm": 0.2392323613166809, + "learning_rate": 0.0003424959216965742, + "loss": 0.0504, + "num_input_tokens_seen": 9099264, + "step": 4200 + }, + { + "epoch": 0.6859706362153344, + "grad_norm": 0.4493273198604584, + "learning_rate": 0.0003429037520391517, + "loss": 0.1906, + "num_input_tokens_seen": 9110336, + "step": 4205 + }, + { + "epoch": 0.6867862969004894, + "grad_norm": 0.6997247934341431, + "learning_rate": 0.0003433115823817292, + "loss": 0.105, + "num_input_tokens_seen": 9121248, + "step": 4210 + }, + { + "epoch": 0.6876019575856444, + "grad_norm": 0.19306375086307526, + "learning_rate": 0.0003437194127243067, + "loss": 0.0508, + "num_input_tokens_seen": 9132896, + "step": 4215 + }, + { + "epoch": 0.6884176182707994, + "grad_norm": 0.04344617575407028, + "learning_rate": 0.00034412724306688417, + "loss": 0.0752, + "num_input_tokens_seen": 9143968, + "step": 4220 + }, + { + "epoch": 0.6892332789559543, + "grad_norm": 0.0478254109621048, + "learning_rate": 0.00034453507340946165, + "loss": 0.0733, + "num_input_tokens_seen": 9154848, + "step": 4225 + }, + { + "epoch": 0.6900489396411092, + "grad_norm": 0.08865730464458466, + "learning_rate": 0.0003449429037520392, + "loss": 0.1284, + "num_input_tokens_seen": 9167264, + "step": 4230 + }, + { + "epoch": 0.6908646003262643, + "grad_norm": 0.05257609114050865, + "learning_rate": 0.00034535073409461666, + "loss": 0.0823, + "num_input_tokens_seen": 9178016, + "step": 4235 + }, + { + "epoch": 0.6916802610114192, + "grad_norm": 0.09601225703954697, + "learning_rate": 0.00034575856443719413, + "loss": 0.2197, + "num_input_tokens_seen": 9188224, + "step": 4240 + }, + { + "epoch": 0.6924959216965743, + "grad_norm": 0.01799871399998665, + "learning_rate": 0.0003461663947797716, + "loss": 0.0188, + "num_input_tokens_seen": 9198208, + "step": 4245 + }, + { + "epoch": 0.6933115823817292, + "grad_norm": 0.14752286672592163, + "learning_rate": 0.0003465742251223491, + "loss": 0.0887, + "num_input_tokens_seen": 9208128, + "step": 4250 + }, + { + "epoch": 0.6941272430668842, + "grad_norm": 0.015657467767596245, + "learning_rate": 0.0003469820554649266, + "loss": 0.1092, + "num_input_tokens_seen": 9218080, + "step": 4255 + }, + { + "epoch": 0.6949429037520392, + "grad_norm": 0.12187521904706955, + "learning_rate": 0.0003473898858075041, + "loss": 0.0554, + "num_input_tokens_seen": 9228416, + "step": 4260 + }, + { + "epoch": 0.6957585644371941, + "grad_norm": 0.17781707644462585, + "learning_rate": 0.0003477977161500816, + "loss": 0.1639, + "num_input_tokens_seen": 9239296, + "step": 4265 + }, + { + "epoch": 0.6965742251223491, + "grad_norm": 0.24876636266708374, + "learning_rate": 0.00034820554649265905, + "loss": 0.276, + "num_input_tokens_seen": 9249472, + "step": 4270 + }, + { + "epoch": 0.697389885807504, + "grad_norm": 0.299495667219162, + "learning_rate": 0.0003486133768352365, + "loss": 0.1178, + "num_input_tokens_seen": 9259616, + "step": 4275 + }, + { + "epoch": 0.6982055464926591, + "grad_norm": 0.33542829751968384, + "learning_rate": 0.00034902120717781405, + "loss": 0.12, + "num_input_tokens_seen": 9271360, + "step": 4280 + }, + { + "epoch": 0.699021207177814, + "grad_norm": 0.21136386692523956, + "learning_rate": 0.00034942903752039153, + "loss": 0.1846, + "num_input_tokens_seen": 9281952, + "step": 4285 + }, + { + "epoch": 0.6998368678629691, + "grad_norm": 0.649427056312561, + "learning_rate": 0.000349836867862969, + "loss": 0.1849, + "num_input_tokens_seen": 9292480, + "step": 4290 + }, + { + "epoch": 0.700652528548124, + "grad_norm": 0.09823936223983765, + "learning_rate": 0.0003502446982055465, + "loss": 0.2742, + "num_input_tokens_seen": 9301696, + "step": 4295 + }, + { + "epoch": 0.7014681892332789, + "grad_norm": 0.3656994700431824, + "learning_rate": 0.00035065252854812396, + "loss": 0.1071, + "num_input_tokens_seen": 9311360, + "step": 4300 + }, + { + "epoch": 0.702283849918434, + "grad_norm": 0.3502924144268036, + "learning_rate": 0.0003510603588907015, + "loss": 0.1584, + "num_input_tokens_seen": 9321504, + "step": 4305 + }, + { + "epoch": 0.7030995106035889, + "grad_norm": 0.08680325001478195, + "learning_rate": 0.00035146818923327897, + "loss": 0.1115, + "num_input_tokens_seen": 9333024, + "step": 4310 + }, + { + "epoch": 0.7039151712887439, + "grad_norm": 0.4333436191082001, + "learning_rate": 0.00035187601957585644, + "loss": 0.1009, + "num_input_tokens_seen": 9343424, + "step": 4315 + }, + { + "epoch": 0.7047308319738989, + "grad_norm": 0.057815730571746826, + "learning_rate": 0.0003522838499184339, + "loss": 0.0739, + "num_input_tokens_seen": 9354016, + "step": 4320 + }, + { + "epoch": 0.7055464926590538, + "grad_norm": 0.17198070883750916, + "learning_rate": 0.00035269168026101145, + "loss": 0.1415, + "num_input_tokens_seen": 9365600, + "step": 4325 + }, + { + "epoch": 0.7063621533442088, + "grad_norm": 0.4147215187549591, + "learning_rate": 0.0003530995106035889, + "loss": 0.1644, + "num_input_tokens_seen": 9377216, + "step": 4330 + }, + { + "epoch": 0.7071778140293637, + "grad_norm": 0.3243969678878784, + "learning_rate": 0.0003535073409461664, + "loss": 0.102, + "num_input_tokens_seen": 9388000, + "step": 4335 + }, + { + "epoch": 0.7079934747145188, + "grad_norm": 0.567345380783081, + "learning_rate": 0.0003539151712887439, + "loss": 0.0943, + "num_input_tokens_seen": 9399616, + "step": 4340 + }, + { + "epoch": 0.7088091353996737, + "grad_norm": 0.21629932522773743, + "learning_rate": 0.00035432300163132136, + "loss": 0.1617, + "num_input_tokens_seen": 9411872, + "step": 4345 + }, + { + "epoch": 0.7096247960848288, + "grad_norm": 0.21952052414417267, + "learning_rate": 0.0003547308319738989, + "loss": 0.1893, + "num_input_tokens_seen": 9422912, + "step": 4350 + }, + { + "epoch": 0.7104404567699837, + "grad_norm": 0.11742942035198212, + "learning_rate": 0.00035513866231647636, + "loss": 0.0233, + "num_input_tokens_seen": 9434176, + "step": 4355 + }, + { + "epoch": 0.7112561174551386, + "grad_norm": 0.15607663989067078, + "learning_rate": 0.0003555464926590539, + "loss": 0.0886, + "num_input_tokens_seen": 9443936, + "step": 4360 + }, + { + "epoch": 0.7120717781402937, + "grad_norm": 0.08815481513738632, + "learning_rate": 0.0003559543230016313, + "loss": 0.0836, + "num_input_tokens_seen": 9453984, + "step": 4365 + }, + { + "epoch": 0.7128874388254486, + "grad_norm": 0.16504555940628052, + "learning_rate": 0.0003563621533442088, + "loss": 0.1904, + "num_input_tokens_seen": 9465408, + "step": 4370 + }, + { + "epoch": 0.7137030995106036, + "grad_norm": 0.24558444321155548, + "learning_rate": 0.0003567699836867863, + "loss": 0.1525, + "num_input_tokens_seen": 9475392, + "step": 4375 + }, + { + "epoch": 0.7145187601957586, + "grad_norm": 0.2139747142791748, + "learning_rate": 0.0003571778140293638, + "loss": 0.1616, + "num_input_tokens_seen": 9485280, + "step": 4380 + }, + { + "epoch": 0.7153344208809136, + "grad_norm": 0.28706589341163635, + "learning_rate": 0.0003575856443719413, + "loss": 0.0938, + "num_input_tokens_seen": 9496224, + "step": 4385 + }, + { + "epoch": 0.7161500815660685, + "grad_norm": 0.49027663469314575, + "learning_rate": 0.00035799347471451875, + "loss": 0.1596, + "num_input_tokens_seen": 9507456, + "step": 4390 + }, + { + "epoch": 0.7169657422512234, + "grad_norm": 0.5312650799751282, + "learning_rate": 0.0003584013050570963, + "loss": 0.2244, + "num_input_tokens_seen": 9517952, + "step": 4395 + }, + { + "epoch": 0.7177814029363785, + "grad_norm": 0.8909090757369995, + "learning_rate": 0.00035880913539967376, + "loss": 0.1399, + "num_input_tokens_seen": 9528672, + "step": 4400 + }, + { + "epoch": 0.7185970636215334, + "grad_norm": 0.22219215333461761, + "learning_rate": 0.00035921696574225124, + "loss": 0.0626, + "num_input_tokens_seen": 9539552, + "step": 4405 + }, + { + "epoch": 0.7194127243066885, + "grad_norm": 0.2684333920478821, + "learning_rate": 0.0003596247960848287, + "loss": 0.2299, + "num_input_tokens_seen": 9550112, + "step": 4410 + }, + { + "epoch": 0.7202283849918434, + "grad_norm": 0.17051482200622559, + "learning_rate": 0.0003600326264274062, + "loss": 0.1622, + "num_input_tokens_seen": 9560544, + "step": 4415 + }, + { + "epoch": 0.7210440456769984, + "grad_norm": 1.0428999662399292, + "learning_rate": 0.0003604404567699837, + "loss": 0.3066, + "num_input_tokens_seen": 9571232, + "step": 4420 + }, + { + "epoch": 0.7218597063621534, + "grad_norm": 0.28611576557159424, + "learning_rate": 0.0003608482871125612, + "loss": 0.083, + "num_input_tokens_seen": 9581920, + "step": 4425 + }, + { + "epoch": 0.7226753670473083, + "grad_norm": 0.33363673090934753, + "learning_rate": 0.0003612561174551386, + "loss": 0.1089, + "num_input_tokens_seen": 9592448, + "step": 4430 + }, + { + "epoch": 0.7234910277324633, + "grad_norm": 0.76138836145401, + "learning_rate": 0.00036166394779771615, + "loss": 0.1595, + "num_input_tokens_seen": 9603360, + "step": 4435 + }, + { + "epoch": 0.7243066884176182, + "grad_norm": 0.216191366314888, + "learning_rate": 0.0003620717781402936, + "loss": 0.1535, + "num_input_tokens_seen": 9614336, + "step": 4440 + }, + { + "epoch": 0.7251223491027733, + "grad_norm": 0.07313571870326996, + "learning_rate": 0.00036247960848287116, + "loss": 0.122, + "num_input_tokens_seen": 9624512, + "step": 4445 + }, + { + "epoch": 0.7259380097879282, + "grad_norm": 0.15714143216609955, + "learning_rate": 0.00036288743882544863, + "loss": 0.1517, + "num_input_tokens_seen": 9635168, + "step": 4450 + }, + { + "epoch": 0.7267536704730831, + "grad_norm": 0.6960117220878601, + "learning_rate": 0.0003632952691680261, + "loss": 0.2596, + "num_input_tokens_seen": 9645280, + "step": 4455 + }, + { + "epoch": 0.7275693311582382, + "grad_norm": 0.32101142406463623, + "learning_rate": 0.0003637030995106036, + "loss": 0.1436, + "num_input_tokens_seen": 9656032, + "step": 4460 + }, + { + "epoch": 0.7283849918433931, + "grad_norm": 0.28535813093185425, + "learning_rate": 0.00036411092985318106, + "loss": 0.2361, + "num_input_tokens_seen": 9667968, + "step": 4465 + }, + { + "epoch": 0.7292006525285482, + "grad_norm": 0.40190282464027405, + "learning_rate": 0.0003645187601957586, + "loss": 0.0877, + "num_input_tokens_seen": 9680032, + "step": 4470 + }, + { + "epoch": 0.7300163132137031, + "grad_norm": 2.4087488651275635, + "learning_rate": 0.00036492659053833607, + "loss": 0.1908, + "num_input_tokens_seen": 9691424, + "step": 4475 + }, + { + "epoch": 0.7308319738988581, + "grad_norm": 0.8622110486030579, + "learning_rate": 0.00036533442088091354, + "loss": 0.2741, + "num_input_tokens_seen": 9702400, + "step": 4480 + }, + { + "epoch": 0.731647634584013, + "grad_norm": 0.07771331071853638, + "learning_rate": 0.000365742251223491, + "loss": 0.061, + "num_input_tokens_seen": 9712928, + "step": 4485 + }, + { + "epoch": 0.732463295269168, + "grad_norm": 0.9341453909873962, + "learning_rate": 0.00036615008156606855, + "loss": 0.4303, + "num_input_tokens_seen": 9723360, + "step": 4490 + }, + { + "epoch": 0.733278955954323, + "grad_norm": 0.26300737261772156, + "learning_rate": 0.00036655791190864603, + "loss": 0.1159, + "num_input_tokens_seen": 9734496, + "step": 4495 + }, + { + "epoch": 0.734094616639478, + "grad_norm": 0.4297347068786621, + "learning_rate": 0.0003669657422512235, + "loss": 0.1867, + "num_input_tokens_seen": 9744288, + "step": 4500 + }, + { + "epoch": 0.734910277324633, + "grad_norm": 0.239863783121109, + "learning_rate": 0.000367373572593801, + "loss": 0.2036, + "num_input_tokens_seen": 9755072, + "step": 4505 + }, + { + "epoch": 0.7357259380097879, + "grad_norm": 0.4273749589920044, + "learning_rate": 0.00036778140293637846, + "loss": 0.1437, + "num_input_tokens_seen": 9764192, + "step": 4510 + }, + { + "epoch": 0.736541598694943, + "grad_norm": 0.5901488065719604, + "learning_rate": 0.000368189233278956, + "loss": 0.2585, + "num_input_tokens_seen": 9775392, + "step": 4515 + }, + { + "epoch": 0.7373572593800979, + "grad_norm": 0.15196426212787628, + "learning_rate": 0.00036859706362153346, + "loss": 0.0932, + "num_input_tokens_seen": 9784864, + "step": 4520 + }, + { + "epoch": 0.7381729200652528, + "grad_norm": 0.17676207423210144, + "learning_rate": 0.0003690048939641109, + "loss": 0.0609, + "num_input_tokens_seen": 9796512, + "step": 4525 + }, + { + "epoch": 0.7389885807504079, + "grad_norm": 0.1951192319393158, + "learning_rate": 0.0003694127243066884, + "loss": 0.1845, + "num_input_tokens_seen": 9808416, + "step": 4530 + }, + { + "epoch": 0.7398042414355628, + "grad_norm": 0.5130126476287842, + "learning_rate": 0.0003698205546492659, + "loss": 0.0996, + "num_input_tokens_seen": 9819552, + "step": 4535 + }, + { + "epoch": 0.7406199021207178, + "grad_norm": 0.29215341806411743, + "learning_rate": 0.0003702283849918434, + "loss": 0.1596, + "num_input_tokens_seen": 9831168, + "step": 4540 + }, + { + "epoch": 0.7414355628058727, + "grad_norm": 0.47994017601013184, + "learning_rate": 0.0003706362153344209, + "loss": 0.0691, + "num_input_tokens_seen": 9841280, + "step": 4545 + }, + { + "epoch": 0.7422512234910277, + "grad_norm": 0.14875724911689758, + "learning_rate": 0.0003710440456769984, + "loss": 0.1222, + "num_input_tokens_seen": 9852224, + "step": 4550 + }, + { + "epoch": 0.7430668841761827, + "grad_norm": 0.24020762741565704, + "learning_rate": 0.00037145187601957585, + "loss": 0.2732, + "num_input_tokens_seen": 9862656, + "step": 4555 + }, + { + "epoch": 0.7438825448613376, + "grad_norm": 0.23504795134067535, + "learning_rate": 0.00037185970636215333, + "loss": 0.0841, + "num_input_tokens_seen": 9874304, + "step": 4560 + }, + { + "epoch": 0.7446982055464927, + "grad_norm": 0.16769489645957947, + "learning_rate": 0.00037226753670473086, + "loss": 0.1028, + "num_input_tokens_seen": 9884128, + "step": 4565 + }, + { + "epoch": 0.7455138662316476, + "grad_norm": 0.5762054920196533, + "learning_rate": 0.00037267536704730834, + "loss": 0.2434, + "num_input_tokens_seen": 9894208, + "step": 4570 + }, + { + "epoch": 0.7463295269168027, + "grad_norm": 0.7083689570426941, + "learning_rate": 0.0003730831973898858, + "loss": 0.1754, + "num_input_tokens_seen": 9904864, + "step": 4575 + }, + { + "epoch": 0.7471451876019576, + "grad_norm": 0.2699931859970093, + "learning_rate": 0.0003734910277324633, + "loss": 0.1771, + "num_input_tokens_seen": 9915904, + "step": 4580 + }, + { + "epoch": 0.7479608482871125, + "grad_norm": 0.09955000877380371, + "learning_rate": 0.0003738988580750408, + "loss": 0.1662, + "num_input_tokens_seen": 9926048, + "step": 4585 + }, + { + "epoch": 0.7487765089722676, + "grad_norm": 0.1995488554239273, + "learning_rate": 0.0003743066884176183, + "loss": 0.0735, + "num_input_tokens_seen": 9936736, + "step": 4590 + }, + { + "epoch": 0.7495921696574225, + "grad_norm": 0.4435023069381714, + "learning_rate": 0.0003747145187601957, + "loss": 0.1639, + "num_input_tokens_seen": 9949408, + "step": 4595 + }, + { + "epoch": 0.7504078303425775, + "grad_norm": 0.22048331797122955, + "learning_rate": 0.00037512234910277325, + "loss": 0.1252, + "num_input_tokens_seen": 9960160, + "step": 4600 + }, + { + "epoch": 0.7512234910277324, + "grad_norm": 0.44227954745292664, + "learning_rate": 0.00037553017944535073, + "loss": 0.1111, + "num_input_tokens_seen": 9970624, + "step": 4605 + }, + { + "epoch": 0.7520391517128875, + "grad_norm": 0.1393173187971115, + "learning_rate": 0.00037593800978792826, + "loss": 0.0851, + "num_input_tokens_seen": 9982080, + "step": 4610 + }, + { + "epoch": 0.7528548123980424, + "grad_norm": 0.06849927455186844, + "learning_rate": 0.00037634584013050573, + "loss": 0.0126, + "num_input_tokens_seen": 9992480, + "step": 4615 + }, + { + "epoch": 0.7536704730831973, + "grad_norm": 0.04791555553674698, + "learning_rate": 0.0003767536704730832, + "loss": 0.1476, + "num_input_tokens_seen": 10002496, + "step": 4620 + }, + { + "epoch": 0.7544861337683524, + "grad_norm": 0.4886733591556549, + "learning_rate": 0.0003771615008156607, + "loss": 0.317, + "num_input_tokens_seen": 10013408, + "step": 4625 + }, + { + "epoch": 0.7553017944535073, + "grad_norm": 0.2724720537662506, + "learning_rate": 0.00037756933115823816, + "loss": 0.0849, + "num_input_tokens_seen": 10022880, + "step": 4630 + }, + { + "epoch": 0.7561174551386624, + "grad_norm": 0.653141975402832, + "learning_rate": 0.0003779771615008157, + "loss": 0.1455, + "num_input_tokens_seen": 10034208, + "step": 4635 + }, + { + "epoch": 0.7569331158238173, + "grad_norm": 0.2737433612346649, + "learning_rate": 0.00037838499184339317, + "loss": 0.1083, + "num_input_tokens_seen": 10044832, + "step": 4640 + }, + { + "epoch": 0.7577487765089723, + "grad_norm": 0.04722199961543083, + "learning_rate": 0.00037879282218597065, + "loss": 0.1308, + "num_input_tokens_seen": 10055712, + "step": 4645 + }, + { + "epoch": 0.7585644371941273, + "grad_norm": 0.3538724184036255, + "learning_rate": 0.0003792006525285481, + "loss": 0.1228, + "num_input_tokens_seen": 10066976, + "step": 4650 + }, + { + "epoch": 0.7593800978792822, + "grad_norm": 0.2836623191833496, + "learning_rate": 0.0003796084828711256, + "loss": 0.1622, + "num_input_tokens_seen": 10078304, + "step": 4655 + }, + { + "epoch": 0.7601957585644372, + "grad_norm": 0.8579111099243164, + "learning_rate": 0.00038001631321370313, + "loss": 0.3416, + "num_input_tokens_seen": 10089056, + "step": 4660 + }, + { + "epoch": 0.7610114192495921, + "grad_norm": 0.2942768633365631, + "learning_rate": 0.00038042414355628055, + "loss": 0.2009, + "num_input_tokens_seen": 10099904, + "step": 4665 + }, + { + "epoch": 0.7618270799347472, + "grad_norm": 0.4043952524662018, + "learning_rate": 0.0003808319738988581, + "loss": 0.1121, + "num_input_tokens_seen": 10111552, + "step": 4670 + }, + { + "epoch": 0.7626427406199021, + "grad_norm": 0.42761731147766113, + "learning_rate": 0.00038123980424143556, + "loss": 0.276, + "num_input_tokens_seen": 10121568, + "step": 4675 + }, + { + "epoch": 0.763458401305057, + "grad_norm": 0.2798854112625122, + "learning_rate": 0.0003816476345840131, + "loss": 0.1555, + "num_input_tokens_seen": 10132000, + "step": 4680 + }, + { + "epoch": 0.7642740619902121, + "grad_norm": 0.4713301658630371, + "learning_rate": 0.00038205546492659057, + "loss": 0.2528, + "num_input_tokens_seen": 10141952, + "step": 4685 + }, + { + "epoch": 0.765089722675367, + "grad_norm": 0.22451333701610565, + "learning_rate": 0.000382463295269168, + "loss": 0.1312, + "num_input_tokens_seen": 10151776, + "step": 4690 + }, + { + "epoch": 0.765905383360522, + "grad_norm": 0.2801218330860138, + "learning_rate": 0.0003828711256117455, + "loss": 0.1707, + "num_input_tokens_seen": 10162688, + "step": 4695 + }, + { + "epoch": 0.766721044045677, + "grad_norm": 0.12770937383174896, + "learning_rate": 0.000383278955954323, + "loss": 0.0353, + "num_input_tokens_seen": 10174624, + "step": 4700 + }, + { + "epoch": 0.767536704730832, + "grad_norm": 0.05235608294606209, + "learning_rate": 0.00038368678629690053, + "loss": 0.038, + "num_input_tokens_seen": 10185248, + "step": 4705 + }, + { + "epoch": 0.768352365415987, + "grad_norm": 0.06234338879585266, + "learning_rate": 0.000384094616639478, + "loss": 0.0492, + "num_input_tokens_seen": 10196768, + "step": 4710 + }, + { + "epoch": 0.7691680261011419, + "grad_norm": 0.08394385129213333, + "learning_rate": 0.0003845024469820555, + "loss": 0.0572, + "num_input_tokens_seen": 10206816, + "step": 4715 + }, + { + "epoch": 0.7699836867862969, + "grad_norm": 0.06011839583516121, + "learning_rate": 0.00038491027732463296, + "loss": 0.091, + "num_input_tokens_seen": 10217600, + "step": 4720 + }, + { + "epoch": 0.7707993474714518, + "grad_norm": 0.1412273496389389, + "learning_rate": 0.00038531810766721043, + "loss": 0.1894, + "num_input_tokens_seen": 10228544, + "step": 4725 + }, + { + "epoch": 0.7716150081566069, + "grad_norm": 0.0563993863761425, + "learning_rate": 0.00038572593800978796, + "loss": 0.0792, + "num_input_tokens_seen": 10239168, + "step": 4730 + }, + { + "epoch": 0.7724306688417618, + "grad_norm": 0.12660855054855347, + "learning_rate": 0.0003861337683523654, + "loss": 0.078, + "num_input_tokens_seen": 10250784, + "step": 4735 + }, + { + "epoch": 0.7732463295269169, + "grad_norm": 0.6028103232383728, + "learning_rate": 0.0003865415986949429, + "loss": 0.1877, + "num_input_tokens_seen": 10260480, + "step": 4740 + }, + { + "epoch": 0.7740619902120718, + "grad_norm": 0.8399020433425903, + "learning_rate": 0.0003869494290375204, + "loss": 0.0958, + "num_input_tokens_seen": 10272064, + "step": 4745 + }, + { + "epoch": 0.7748776508972267, + "grad_norm": 0.043075088411569595, + "learning_rate": 0.0003873572593800979, + "loss": 0.0527, + "num_input_tokens_seen": 10281472, + "step": 4750 + }, + { + "epoch": 0.7756933115823818, + "grad_norm": 0.36994802951812744, + "learning_rate": 0.0003877650897226754, + "loss": 0.1139, + "num_input_tokens_seen": 10293536, + "step": 4755 + }, + { + "epoch": 0.7765089722675367, + "grad_norm": 1.0076755285263062, + "learning_rate": 0.0003881729200652528, + "loss": 0.1012, + "num_input_tokens_seen": 10303968, + "step": 4760 + }, + { + "epoch": 0.7773246329526917, + "grad_norm": 0.02243875525891781, + "learning_rate": 0.00038858075040783035, + "loss": 0.0783, + "num_input_tokens_seen": 10316064, + "step": 4765 + }, + { + "epoch": 0.7781402936378466, + "grad_norm": 0.14303316175937653, + "learning_rate": 0.00038898858075040783, + "loss": 0.1356, + "num_input_tokens_seen": 10327328, + "step": 4770 + }, + { + "epoch": 0.7789559543230016, + "grad_norm": 1.1636042594909668, + "learning_rate": 0.00038939641109298536, + "loss": 0.2765, + "num_input_tokens_seen": 10337088, + "step": 4775 + }, + { + "epoch": 0.7797716150081566, + "grad_norm": 0.2676616311073303, + "learning_rate": 0.00038980424143556284, + "loss": 0.1834, + "num_input_tokens_seen": 10347488, + "step": 4780 + }, + { + "epoch": 0.7805872756933115, + "grad_norm": 0.46309512853622437, + "learning_rate": 0.00039021207177814026, + "loss": 0.161, + "num_input_tokens_seen": 10358656, + "step": 4785 + }, + { + "epoch": 0.7814029363784666, + "grad_norm": 0.44753897190093994, + "learning_rate": 0.0003906199021207178, + "loss": 0.206, + "num_input_tokens_seen": 10370816, + "step": 4790 + }, + { + "epoch": 0.7822185970636215, + "grad_norm": 0.3726544678211212, + "learning_rate": 0.00039102773246329527, + "loss": 0.1489, + "num_input_tokens_seen": 10381856, + "step": 4795 + }, + { + "epoch": 0.7830342577487766, + "grad_norm": 0.4448489546775818, + "learning_rate": 0.0003914355628058728, + "loss": 0.221, + "num_input_tokens_seen": 10392320, + "step": 4800 + }, + { + "epoch": 0.7838499184339315, + "grad_norm": 0.2513653635978699, + "learning_rate": 0.0003918433931484502, + "loss": 0.2256, + "num_input_tokens_seen": 10401568, + "step": 4805 + }, + { + "epoch": 0.7846655791190864, + "grad_norm": 0.1805187314748764, + "learning_rate": 0.00039225122349102775, + "loss": 0.2585, + "num_input_tokens_seen": 10413248, + "step": 4810 + }, + { + "epoch": 0.7854812398042414, + "grad_norm": 0.1903678923845291, + "learning_rate": 0.0003926590538336052, + "loss": 0.1002, + "num_input_tokens_seen": 10423168, + "step": 4815 + }, + { + "epoch": 0.7862969004893964, + "grad_norm": 0.10028063505887985, + "learning_rate": 0.0003930668841761827, + "loss": 0.0939, + "num_input_tokens_seen": 10433792, + "step": 4820 + }, + { + "epoch": 0.7871125611745514, + "grad_norm": 0.32151925563812256, + "learning_rate": 0.00039347471451876023, + "loss": 0.1202, + "num_input_tokens_seen": 10445120, + "step": 4825 + }, + { + "epoch": 0.7879282218597063, + "grad_norm": 0.20094020664691925, + "learning_rate": 0.00039388254486133766, + "loss": 0.1248, + "num_input_tokens_seen": 10455584, + "step": 4830 + }, + { + "epoch": 0.7887438825448614, + "grad_norm": 0.10265758633613586, + "learning_rate": 0.0003942903752039152, + "loss": 0.0595, + "num_input_tokens_seen": 10466752, + "step": 4835 + }, + { + "epoch": 0.7895595432300163, + "grad_norm": 0.8203637003898621, + "learning_rate": 0.00039469820554649266, + "loss": 0.2248, + "num_input_tokens_seen": 10477376, + "step": 4840 + }, + { + "epoch": 0.7903752039151712, + "grad_norm": 0.3852264881134033, + "learning_rate": 0.0003951060358890702, + "loss": 0.1295, + "num_input_tokens_seen": 10487392, + "step": 4845 + }, + { + "epoch": 0.7911908646003263, + "grad_norm": 0.2891012132167816, + "learning_rate": 0.00039551386623164767, + "loss": 0.2015, + "num_input_tokens_seen": 10498208, + "step": 4850 + }, + { + "epoch": 0.7920065252854812, + "grad_norm": 0.6204549074172974, + "learning_rate": 0.0003959216965742251, + "loss": 0.1452, + "num_input_tokens_seen": 10507200, + "step": 4855 + }, + { + "epoch": 0.7928221859706363, + "grad_norm": 0.14992201328277588, + "learning_rate": 0.0003963295269168026, + "loss": 0.2595, + "num_input_tokens_seen": 10516416, + "step": 4860 + }, + { + "epoch": 0.7936378466557912, + "grad_norm": 0.35039445757865906, + "learning_rate": 0.0003967373572593801, + "loss": 0.2197, + "num_input_tokens_seen": 10527648, + "step": 4865 + }, + { + "epoch": 0.7944535073409462, + "grad_norm": 0.3829597532749176, + "learning_rate": 0.00039714518760195763, + "loss": 0.1364, + "num_input_tokens_seen": 10537952, + "step": 4870 + }, + { + "epoch": 0.7952691680261011, + "grad_norm": 0.2897590398788452, + "learning_rate": 0.00039755301794453505, + "loss": 0.1406, + "num_input_tokens_seen": 10548320, + "step": 4875 + }, + { + "epoch": 0.7960848287112561, + "grad_norm": 0.09263116866350174, + "learning_rate": 0.00039796084828711253, + "loss": 0.1179, + "num_input_tokens_seen": 10557824, + "step": 4880 + }, + { + "epoch": 0.7969004893964111, + "grad_norm": 0.1687658429145813, + "learning_rate": 0.00039836867862969006, + "loss": 0.0556, + "num_input_tokens_seen": 10568896, + "step": 4885 + }, + { + "epoch": 0.797716150081566, + "grad_norm": 0.35730594396591187, + "learning_rate": 0.00039877650897226754, + "loss": 0.2819, + "num_input_tokens_seen": 10580064, + "step": 4890 + }, + { + "epoch": 0.7985318107667211, + "grad_norm": 0.035357508808374405, + "learning_rate": 0.00039918433931484507, + "loss": 0.1337, + "num_input_tokens_seen": 10590976, + "step": 4895 + }, + { + "epoch": 0.799347471451876, + "grad_norm": 0.5397438406944275, + "learning_rate": 0.0003995921696574225, + "loss": 0.137, + "num_input_tokens_seen": 10600448, + "step": 4900 + }, + { + "epoch": 0.8001631321370309, + "grad_norm": 0.22979234158992767, + "learning_rate": 0.0004, + "loss": 0.1354, + "num_input_tokens_seen": 10611904, + "step": 4905 + }, + { + "epoch": 0.800978792822186, + "grad_norm": 0.17993277311325073, + "learning_rate": 0.0004004078303425775, + "loss": 0.0789, + "num_input_tokens_seen": 10622240, + "step": 4910 + }, + { + "epoch": 0.8017944535073409, + "grad_norm": 0.2799410820007324, + "learning_rate": 0.00040081566068515497, + "loss": 0.2357, + "num_input_tokens_seen": 10632864, + "step": 4915 + }, + { + "epoch": 0.802610114192496, + "grad_norm": 0.20957742631435394, + "learning_rate": 0.0004012234910277325, + "loss": 0.0725, + "num_input_tokens_seen": 10642464, + "step": 4920 + }, + { + "epoch": 0.8034257748776509, + "grad_norm": 0.3005708158016205, + "learning_rate": 0.0004016313213703099, + "loss": 0.1929, + "num_input_tokens_seen": 10653024, + "step": 4925 + }, + { + "epoch": 0.8042414355628059, + "grad_norm": 0.2430286854505539, + "learning_rate": 0.00040203915171288746, + "loss": 0.0941, + "num_input_tokens_seen": 10664384, + "step": 4930 + }, + { + "epoch": 0.8050570962479608, + "grad_norm": 0.6753287315368652, + "learning_rate": 0.00040244698205546493, + "loss": 0.214, + "num_input_tokens_seen": 10675360, + "step": 4935 + }, + { + "epoch": 0.8058727569331158, + "grad_norm": 0.29395872354507446, + "learning_rate": 0.00040285481239804246, + "loss": 0.1493, + "num_input_tokens_seen": 10685824, + "step": 4940 + }, + { + "epoch": 0.8066884176182708, + "grad_norm": 0.5989903807640076, + "learning_rate": 0.0004032626427406199, + "loss": 0.2205, + "num_input_tokens_seen": 10697536, + "step": 4945 + }, + { + "epoch": 0.8075040783034257, + "grad_norm": 0.33640438318252563, + "learning_rate": 0.00040367047308319736, + "loss": 0.1552, + "num_input_tokens_seen": 10709888, + "step": 4950 + }, + { + "epoch": 0.8083197389885808, + "grad_norm": 0.1403791308403015, + "learning_rate": 0.0004040783034257749, + "loss": 0.0983, + "num_input_tokens_seen": 10721216, + "step": 4955 + }, + { + "epoch": 0.8091353996737357, + "grad_norm": 0.12875372171401978, + "learning_rate": 0.00040448613376835237, + "loss": 0.0811, + "num_input_tokens_seen": 10729952, + "step": 4960 + }, + { + "epoch": 0.8099510603588908, + "grad_norm": 0.22171169519424438, + "learning_rate": 0.0004048939641109299, + "loss": 0.1516, + "num_input_tokens_seen": 10741216, + "step": 4965 + }, + { + "epoch": 0.8107667210440457, + "grad_norm": 0.058178044855594635, + "learning_rate": 0.0004053017944535073, + "loss": 0.1084, + "num_input_tokens_seen": 10752544, + "step": 4970 + }, + { + "epoch": 0.8115823817292006, + "grad_norm": 0.24571748077869415, + "learning_rate": 0.00040570962479608485, + "loss": 0.1198, + "num_input_tokens_seen": 10764960, + "step": 4975 + }, + { + "epoch": 0.8123980424143556, + "grad_norm": 0.5159160494804382, + "learning_rate": 0.00040611745513866233, + "loss": 0.0919, + "num_input_tokens_seen": 10775296, + "step": 4980 + }, + { + "epoch": 0.8132137030995106, + "grad_norm": 0.40539848804473877, + "learning_rate": 0.0004065252854812398, + "loss": 0.1244, + "num_input_tokens_seen": 10786112, + "step": 4985 + }, + { + "epoch": 0.8140293637846656, + "grad_norm": 0.30894115567207336, + "learning_rate": 0.00040693311582381734, + "loss": 0.1681, + "num_input_tokens_seen": 10797152, + "step": 4990 + }, + { + "epoch": 0.8148450244698205, + "grad_norm": 0.18106062710285187, + "learning_rate": 0.00040734094616639476, + "loss": 0.0964, + "num_input_tokens_seen": 10808480, + "step": 4995 + }, + { + "epoch": 0.8156606851549756, + "grad_norm": 0.23332974314689636, + "learning_rate": 0.0004077487765089723, + "loss": 0.0495, + "num_input_tokens_seen": 10819424, + "step": 5000 + }, + { + "epoch": 0.8164763458401305, + "grad_norm": 0.03415902704000473, + "learning_rate": 0.00040815660685154977, + "loss": 0.0376, + "num_input_tokens_seen": 10830048, + "step": 5005 + }, + { + "epoch": 0.8172920065252854, + "grad_norm": 0.10625167936086655, + "learning_rate": 0.00040856443719412724, + "loss": 0.1349, + "num_input_tokens_seen": 10840768, + "step": 5010 + }, + { + "epoch": 0.8181076672104405, + "grad_norm": 0.05832715332508087, + "learning_rate": 0.00040897226753670477, + "loss": 0.1287, + "num_input_tokens_seen": 10849792, + "step": 5015 + }, + { + "epoch": 0.8189233278955954, + "grad_norm": 0.02284836396574974, + "learning_rate": 0.0004093800978792822, + "loss": 0.0506, + "num_input_tokens_seen": 10861472, + "step": 5020 + }, + { + "epoch": 0.8197389885807504, + "grad_norm": 0.1458047330379486, + "learning_rate": 0.0004097879282218597, + "loss": 0.0383, + "num_input_tokens_seen": 10871968, + "step": 5025 + }, + { + "epoch": 0.8205546492659054, + "grad_norm": 0.1562747359275818, + "learning_rate": 0.0004101957585644372, + "loss": 0.095, + "num_input_tokens_seen": 10882560, + "step": 5030 + }, + { + "epoch": 0.8213703099510603, + "grad_norm": 0.3208521008491516, + "learning_rate": 0.00041060358890701473, + "loss": 0.1989, + "num_input_tokens_seen": 10894048, + "step": 5035 + }, + { + "epoch": 0.8221859706362153, + "grad_norm": 0.48728033900260925, + "learning_rate": 0.00041101141924959215, + "loss": 0.2368, + "num_input_tokens_seen": 10905408, + "step": 5040 + }, + { + "epoch": 0.8230016313213703, + "grad_norm": 0.3826005160808563, + "learning_rate": 0.00041141924959216963, + "loss": 0.1397, + "num_input_tokens_seen": 10915648, + "step": 5045 + }, + { + "epoch": 0.8238172920065253, + "grad_norm": 0.3023868501186371, + "learning_rate": 0.00041182707993474716, + "loss": 0.1021, + "num_input_tokens_seen": 10926880, + "step": 5050 + }, + { + "epoch": 0.8246329526916802, + "grad_norm": 0.22447927296161652, + "learning_rate": 0.00041223491027732464, + "loss": 0.1367, + "num_input_tokens_seen": 10938432, + "step": 5055 + }, + { + "epoch": 0.8254486133768353, + "grad_norm": 0.3445993661880493, + "learning_rate": 0.00041264274061990217, + "loss": 0.2051, + "num_input_tokens_seen": 10948992, + "step": 5060 + }, + { + "epoch": 0.8262642740619902, + "grad_norm": 0.7806461453437805, + "learning_rate": 0.0004130505709624796, + "loss": 0.2293, + "num_input_tokens_seen": 10958272, + "step": 5065 + }, + { + "epoch": 0.8270799347471451, + "grad_norm": 0.2192508578300476, + "learning_rate": 0.0004134584013050571, + "loss": 0.1373, + "num_input_tokens_seen": 10968512, + "step": 5070 + }, + { + "epoch": 0.8278955954323002, + "grad_norm": 0.085968516767025, + "learning_rate": 0.0004138662316476346, + "loss": 0.0933, + "num_input_tokens_seen": 10979072, + "step": 5075 + }, + { + "epoch": 0.8287112561174551, + "grad_norm": 0.03377152606844902, + "learning_rate": 0.0004142740619902121, + "loss": 0.0616, + "num_input_tokens_seen": 10988736, + "step": 5080 + }, + { + "epoch": 0.8295269168026101, + "grad_norm": 0.45503151416778564, + "learning_rate": 0.0004146818923327896, + "loss": 0.1234, + "num_input_tokens_seen": 11000512, + "step": 5085 + }, + { + "epoch": 0.8303425774877651, + "grad_norm": 0.07442937046289444, + "learning_rate": 0.00041508972267536703, + "loss": 0.2238, + "num_input_tokens_seen": 11010240, + "step": 5090 + }, + { + "epoch": 0.8311582381729201, + "grad_norm": 0.07135679572820663, + "learning_rate": 0.00041549755301794456, + "loss": 0.2291, + "num_input_tokens_seen": 11020672, + "step": 5095 + }, + { + "epoch": 0.831973898858075, + "grad_norm": 0.1200980693101883, + "learning_rate": 0.00041590538336052203, + "loss": 0.1161, + "num_input_tokens_seen": 11031808, + "step": 5100 + }, + { + "epoch": 0.83278955954323, + "grad_norm": 0.6344635486602783, + "learning_rate": 0.00041631321370309957, + "loss": 0.1934, + "num_input_tokens_seen": 11042144, + "step": 5105 + }, + { + "epoch": 0.833605220228385, + "grad_norm": 0.12114536017179489, + "learning_rate": 0.000416721044045677, + "loss": 0.1146, + "num_input_tokens_seen": 11053728, + "step": 5110 + }, + { + "epoch": 0.8344208809135399, + "grad_norm": 0.17902326583862305, + "learning_rate": 0.00041712887438825446, + "loss": 0.1455, + "num_input_tokens_seen": 11064032, + "step": 5115 + }, + { + "epoch": 0.835236541598695, + "grad_norm": 0.3889411389827728, + "learning_rate": 0.000417536704730832, + "loss": 0.1479, + "num_input_tokens_seen": 11073792, + "step": 5120 + }, + { + "epoch": 0.8360522022838499, + "grad_norm": 0.2544262707233429, + "learning_rate": 0.00041794453507340947, + "loss": 0.1368, + "num_input_tokens_seen": 11085216, + "step": 5125 + }, + { + "epoch": 0.8368678629690048, + "grad_norm": 0.11887114495038986, + "learning_rate": 0.000418352365415987, + "loss": 0.2017, + "num_input_tokens_seen": 11096288, + "step": 5130 + }, + { + "epoch": 0.8376835236541599, + "grad_norm": 0.12224949896335602, + "learning_rate": 0.0004187601957585644, + "loss": 0.1165, + "num_input_tokens_seen": 11107584, + "step": 5135 + }, + { + "epoch": 0.8384991843393148, + "grad_norm": 0.06587617099285126, + "learning_rate": 0.0004191680261011419, + "loss": 0.0683, + "num_input_tokens_seen": 11118752, + "step": 5140 + }, + { + "epoch": 0.8393148450244698, + "grad_norm": 0.21378469467163086, + "learning_rate": 0.00041957585644371943, + "loss": 0.0734, + "num_input_tokens_seen": 11127840, + "step": 5145 + }, + { + "epoch": 0.8401305057096248, + "grad_norm": 0.03329100087285042, + "learning_rate": 0.0004199836867862969, + "loss": 0.1529, + "num_input_tokens_seen": 11137952, + "step": 5150 + }, + { + "epoch": 0.8409461663947798, + "grad_norm": 0.054880183190107346, + "learning_rate": 0.00042039151712887444, + "loss": 0.1675, + "num_input_tokens_seen": 11147008, + "step": 5155 + }, + { + "epoch": 0.8417618270799347, + "grad_norm": 0.3553010821342468, + "learning_rate": 0.00042079934747145186, + "loss": 0.0819, + "num_input_tokens_seen": 11158912, + "step": 5160 + }, + { + "epoch": 0.8425774877650897, + "grad_norm": 0.38440319895744324, + "learning_rate": 0.0004212071778140294, + "loss": 0.0951, + "num_input_tokens_seen": 11169440, + "step": 5165 + }, + { + "epoch": 0.8433931484502447, + "grad_norm": 0.16979166865348816, + "learning_rate": 0.00042161500815660687, + "loss": 0.1047, + "num_input_tokens_seen": 11179488, + "step": 5170 + }, + { + "epoch": 0.8442088091353996, + "grad_norm": 0.03982337936758995, + "learning_rate": 0.00042202283849918434, + "loss": 0.2587, + "num_input_tokens_seen": 11190912, + "step": 5175 + }, + { + "epoch": 0.8450244698205547, + "grad_norm": 0.20960843563079834, + "learning_rate": 0.0004224306688417618, + "loss": 0.2019, + "num_input_tokens_seen": 11202208, + "step": 5180 + }, + { + "epoch": 0.8458401305057096, + "grad_norm": 0.2301092892885208, + "learning_rate": 0.0004228384991843393, + "loss": 0.2067, + "num_input_tokens_seen": 11212448, + "step": 5185 + }, + { + "epoch": 0.8466557911908646, + "grad_norm": 0.15763430297374725, + "learning_rate": 0.00042324632952691683, + "loss": 0.1482, + "num_input_tokens_seen": 11223648, + "step": 5190 + }, + { + "epoch": 0.8474714518760196, + "grad_norm": 0.0923219621181488, + "learning_rate": 0.0004236541598694943, + "loss": 0.1393, + "num_input_tokens_seen": 11234880, + "step": 5195 + }, + { + "epoch": 0.8482871125611745, + "grad_norm": 0.14025697112083435, + "learning_rate": 0.00042406199021207183, + "loss": 0.1089, + "num_input_tokens_seen": 11245184, + "step": 5200 + }, + { + "epoch": 0.8491027732463295, + "grad_norm": 0.40715232491493225, + "learning_rate": 0.00042446982055464926, + "loss": 0.0975, + "num_input_tokens_seen": 11256288, + "step": 5205 + }, + { + "epoch": 0.8499184339314845, + "grad_norm": 0.15133698284626007, + "learning_rate": 0.00042487765089722673, + "loss": 0.1388, + "num_input_tokens_seen": 11265728, + "step": 5210 + }, + { + "epoch": 0.8507340946166395, + "grad_norm": 0.07266948372125626, + "learning_rate": 0.00042528548123980426, + "loss": 0.1098, + "num_input_tokens_seen": 11275712, + "step": 5215 + }, + { + "epoch": 0.8515497553017944, + "grad_norm": 0.6715647578239441, + "learning_rate": 0.00042569331158238174, + "loss": 0.2122, + "num_input_tokens_seen": 11287776, + "step": 5220 + }, + { + "epoch": 0.8523654159869495, + "grad_norm": 0.5559086799621582, + "learning_rate": 0.00042610114192495927, + "loss": 0.2131, + "num_input_tokens_seen": 11298528, + "step": 5225 + }, + { + "epoch": 0.8531810766721044, + "grad_norm": 0.22525621950626373, + "learning_rate": 0.0004265089722675367, + "loss": 0.1807, + "num_input_tokens_seen": 11309536, + "step": 5230 + }, + { + "epoch": 0.8539967373572593, + "grad_norm": 0.21484312415122986, + "learning_rate": 0.00042691680261011417, + "loss": 0.2228, + "num_input_tokens_seen": 11319200, + "step": 5235 + }, + { + "epoch": 0.8548123980424144, + "grad_norm": 0.09586647152900696, + "learning_rate": 0.0004273246329526917, + "loss": 0.0829, + "num_input_tokens_seen": 11329120, + "step": 5240 + }, + { + "epoch": 0.8556280587275693, + "grad_norm": 0.01951626129448414, + "learning_rate": 0.0004277324632952692, + "loss": 0.1154, + "num_input_tokens_seen": 11339360, + "step": 5245 + }, + { + "epoch": 0.8564437194127243, + "grad_norm": 0.579093873500824, + "learning_rate": 0.00042814029363784665, + "loss": 0.1639, + "num_input_tokens_seen": 11350848, + "step": 5250 + }, + { + "epoch": 0.8572593800978793, + "grad_norm": 0.06659485399723053, + "learning_rate": 0.00042854812398042413, + "loss": 0.1419, + "num_input_tokens_seen": 11360704, + "step": 5255 + }, + { + "epoch": 0.8580750407830342, + "grad_norm": 0.47835665941238403, + "learning_rate": 0.00042895595432300166, + "loss": 0.1247, + "num_input_tokens_seen": 11370816, + "step": 5260 + }, + { + "epoch": 0.8588907014681892, + "grad_norm": 0.5534810423851013, + "learning_rate": 0.00042936378466557914, + "loss": 0.0846, + "num_input_tokens_seen": 11382080, + "step": 5265 + }, + { + "epoch": 0.8597063621533442, + "grad_norm": 0.09427545964717865, + "learning_rate": 0.0004297716150081566, + "loss": 0.2277, + "num_input_tokens_seen": 11392736, + "step": 5270 + }, + { + "epoch": 0.8605220228384992, + "grad_norm": 0.03366581350564957, + "learning_rate": 0.0004301794453507341, + "loss": 0.1835, + "num_input_tokens_seen": 11402720, + "step": 5275 + }, + { + "epoch": 0.8613376835236541, + "grad_norm": 0.16627462208271027, + "learning_rate": 0.00043058727569331157, + "loss": 0.0732, + "num_input_tokens_seen": 11413984, + "step": 5280 + }, + { + "epoch": 0.8621533442088092, + "grad_norm": 0.49312978982925415, + "learning_rate": 0.0004309951060358891, + "loss": 0.1853, + "num_input_tokens_seen": 11424832, + "step": 5285 + }, + { + "epoch": 0.8629690048939641, + "grad_norm": 0.30036258697509766, + "learning_rate": 0.0004314029363784666, + "loss": 0.0606, + "num_input_tokens_seen": 11436128, + "step": 5290 + }, + { + "epoch": 0.863784665579119, + "grad_norm": 0.028072098270058632, + "learning_rate": 0.0004318107667210441, + "loss": 0.1608, + "num_input_tokens_seen": 11448448, + "step": 5295 + }, + { + "epoch": 0.8646003262642741, + "grad_norm": 0.14690212905406952, + "learning_rate": 0.0004322185970636215, + "loss": 0.1608, + "num_input_tokens_seen": 11458176, + "step": 5300 + }, + { + "epoch": 0.865415986949429, + "grad_norm": 0.18263575434684753, + "learning_rate": 0.000432626427406199, + "loss": 0.1955, + "num_input_tokens_seen": 11468480, + "step": 5305 + }, + { + "epoch": 0.866231647634584, + "grad_norm": 0.2216104120016098, + "learning_rate": 0.00043303425774877653, + "loss": 0.1234, + "num_input_tokens_seen": 11478880, + "step": 5310 + }, + { + "epoch": 0.867047308319739, + "grad_norm": 0.366914838552475, + "learning_rate": 0.000433442088091354, + "loss": 0.1729, + "num_input_tokens_seen": 11490176, + "step": 5315 + }, + { + "epoch": 0.867862969004894, + "grad_norm": 0.09106352180242538, + "learning_rate": 0.0004338499184339315, + "loss": 0.1709, + "num_input_tokens_seen": 11500992, + "step": 5320 + }, + { + "epoch": 0.8686786296900489, + "grad_norm": 0.49218812584877014, + "learning_rate": 0.00043425774877650896, + "loss": 0.0797, + "num_input_tokens_seen": 11512288, + "step": 5325 + }, + { + "epoch": 0.8694942903752039, + "grad_norm": 0.13443315029144287, + "learning_rate": 0.0004346655791190865, + "loss": 0.0834, + "num_input_tokens_seen": 11521664, + "step": 5330 + }, + { + "epoch": 0.8703099510603589, + "grad_norm": 0.19490578770637512, + "learning_rate": 0.00043507340946166397, + "loss": 0.056, + "num_input_tokens_seen": 11532608, + "step": 5335 + }, + { + "epoch": 0.8711256117455138, + "grad_norm": 0.27892568707466125, + "learning_rate": 0.00043548123980424145, + "loss": 0.1554, + "num_input_tokens_seen": 11543072, + "step": 5340 + }, + { + "epoch": 0.8719412724306689, + "grad_norm": 0.4815479815006256, + "learning_rate": 0.0004358890701468189, + "loss": 0.1447, + "num_input_tokens_seen": 11554144, + "step": 5345 + }, + { + "epoch": 0.8727569331158238, + "grad_norm": 0.017784638330340385, + "learning_rate": 0.0004362969004893964, + "loss": 0.1103, + "num_input_tokens_seen": 11565376, + "step": 5350 + }, + { + "epoch": 0.8735725938009788, + "grad_norm": 0.33473095297813416, + "learning_rate": 0.00043670473083197393, + "loss": 0.2967, + "num_input_tokens_seen": 11576128, + "step": 5355 + }, + { + "epoch": 0.8743882544861338, + "grad_norm": 0.026542387902736664, + "learning_rate": 0.0004371125611745514, + "loss": 0.047, + "num_input_tokens_seen": 11588000, + "step": 5360 + }, + { + "epoch": 0.8752039151712887, + "grad_norm": 0.10609222203493118, + "learning_rate": 0.0004375203915171289, + "loss": 0.0884, + "num_input_tokens_seen": 11599520, + "step": 5365 + }, + { + "epoch": 0.8760195758564437, + "grad_norm": 0.21308600902557373, + "learning_rate": 0.00043792822185970636, + "loss": 0.1057, + "num_input_tokens_seen": 11611168, + "step": 5370 + }, + { + "epoch": 0.8768352365415987, + "grad_norm": 0.05683080852031708, + "learning_rate": 0.00043833605220228384, + "loss": 0.082, + "num_input_tokens_seen": 11621632, + "step": 5375 + }, + { + "epoch": 0.8776508972267537, + "grad_norm": 0.5174321532249451, + "learning_rate": 0.00043874388254486137, + "loss": 0.2837, + "num_input_tokens_seen": 11632160, + "step": 5380 + }, + { + "epoch": 0.8784665579119086, + "grad_norm": 0.16221262514591217, + "learning_rate": 0.00043915171288743884, + "loss": 0.1263, + "num_input_tokens_seen": 11641344, + "step": 5385 + }, + { + "epoch": 0.8792822185970636, + "grad_norm": 0.13459455966949463, + "learning_rate": 0.0004395595432300163, + "loss": 0.1075, + "num_input_tokens_seen": 11652128, + "step": 5390 + }, + { + "epoch": 0.8800978792822186, + "grad_norm": 0.04580937698483467, + "learning_rate": 0.0004399673735725938, + "loss": 0.053, + "num_input_tokens_seen": 11662528, + "step": 5395 + }, + { + "epoch": 0.8809135399673735, + "grad_norm": 0.07564152777194977, + "learning_rate": 0.00044037520391517127, + "loss": 0.1061, + "num_input_tokens_seen": 11672544, + "step": 5400 + }, + { + "epoch": 0.8817292006525286, + "grad_norm": 0.42318272590637207, + "learning_rate": 0.0004407830342577488, + "loss": 0.1772, + "num_input_tokens_seen": 11683552, + "step": 5405 + }, + { + "epoch": 0.8825448613376835, + "grad_norm": 0.26819872856140137, + "learning_rate": 0.0004411908646003263, + "loss": 0.1336, + "num_input_tokens_seen": 11693408, + "step": 5410 + }, + { + "epoch": 0.8833605220228385, + "grad_norm": 0.1825403869152069, + "learning_rate": 0.00044159869494290376, + "loss": 0.1796, + "num_input_tokens_seen": 11704160, + "step": 5415 + }, + { + "epoch": 0.8841761827079935, + "grad_norm": 0.06430928409099579, + "learning_rate": 0.00044200652528548123, + "loss": 0.1323, + "num_input_tokens_seen": 11714752, + "step": 5420 + }, + { + "epoch": 0.8849918433931484, + "grad_norm": 0.15096871554851532, + "learning_rate": 0.00044241435562805876, + "loss": 0.0384, + "num_input_tokens_seen": 11725408, + "step": 5425 + }, + { + "epoch": 0.8858075040783034, + "grad_norm": 0.22706635296344757, + "learning_rate": 0.00044282218597063624, + "loss": 0.1147, + "num_input_tokens_seen": 11736640, + "step": 5430 + }, + { + "epoch": 0.8866231647634584, + "grad_norm": 0.05543182045221329, + "learning_rate": 0.0004432300163132137, + "loss": 0.1505, + "num_input_tokens_seen": 11747008, + "step": 5435 + }, + { + "epoch": 0.8874388254486134, + "grad_norm": 0.5526264905929565, + "learning_rate": 0.0004436378466557912, + "loss": 0.1798, + "num_input_tokens_seen": 11757728, + "step": 5440 + }, + { + "epoch": 0.8882544861337683, + "grad_norm": 0.015536564402282238, + "learning_rate": 0.00044404567699836867, + "loss": 0.1271, + "num_input_tokens_seen": 11768704, + "step": 5445 + }, + { + "epoch": 0.8890701468189234, + "grad_norm": 0.15706466138362885, + "learning_rate": 0.0004444535073409462, + "loss": 0.1342, + "num_input_tokens_seen": 11780288, + "step": 5450 + }, + { + "epoch": 0.8898858075040783, + "grad_norm": 0.6504952907562256, + "learning_rate": 0.0004448613376835237, + "loss": 0.3482, + "num_input_tokens_seen": 11790080, + "step": 5455 + }, + { + "epoch": 0.8907014681892332, + "grad_norm": 0.14098556339740753, + "learning_rate": 0.0004452691680261011, + "loss": 0.2623, + "num_input_tokens_seen": 11801952, + "step": 5460 + }, + { + "epoch": 0.8915171288743883, + "grad_norm": 0.0845152735710144, + "learning_rate": 0.00044567699836867863, + "loss": 0.1243, + "num_input_tokens_seen": 11811520, + "step": 5465 + }, + { + "epoch": 0.8923327895595432, + "grad_norm": 0.20279009640216827, + "learning_rate": 0.0004460848287112561, + "loss": 0.1442, + "num_input_tokens_seen": 11822240, + "step": 5470 + }, + { + "epoch": 0.8931484502446982, + "grad_norm": 0.14917077124118805, + "learning_rate": 0.00044649265905383364, + "loss": 0.1511, + "num_input_tokens_seen": 11833632, + "step": 5475 + }, + { + "epoch": 0.8939641109298532, + "grad_norm": 0.039948079735040665, + "learning_rate": 0.0004469004893964111, + "loss": 0.121, + "num_input_tokens_seen": 11844640, + "step": 5480 + }, + { + "epoch": 0.8947797716150081, + "grad_norm": 0.18594685196876526, + "learning_rate": 0.0004473083197389886, + "loss": 0.1085, + "num_input_tokens_seen": 11855776, + "step": 5485 + }, + { + "epoch": 0.8955954323001631, + "grad_norm": 0.17190532386302948, + "learning_rate": 0.00044771615008156607, + "loss": 0.3427, + "num_input_tokens_seen": 11865856, + "step": 5490 + }, + { + "epoch": 0.8964110929853181, + "grad_norm": 0.15031887590885162, + "learning_rate": 0.00044812398042414354, + "loss": 0.0881, + "num_input_tokens_seen": 11877504, + "step": 5495 + }, + { + "epoch": 0.8972267536704731, + "grad_norm": 0.04914926737546921, + "learning_rate": 0.00044853181076672107, + "loss": 0.1142, + "num_input_tokens_seen": 11888320, + "step": 5500 + }, + { + "epoch": 0.898042414355628, + "grad_norm": 0.1394360214471817, + "learning_rate": 0.00044893964110929855, + "loss": 0.1406, + "num_input_tokens_seen": 11899456, + "step": 5505 + }, + { + "epoch": 0.8988580750407831, + "grad_norm": 0.5909246802330017, + "learning_rate": 0.000449347471451876, + "loss": 0.1699, + "num_input_tokens_seen": 11910240, + "step": 5510 + }, + { + "epoch": 0.899673735725938, + "grad_norm": 0.04671233892440796, + "learning_rate": 0.0004497553017944535, + "loss": 0.0517, + "num_input_tokens_seen": 11920512, + "step": 5515 + }, + { + "epoch": 0.9004893964110929, + "grad_norm": 0.4719904661178589, + "learning_rate": 0.00045016313213703103, + "loss": 0.1502, + "num_input_tokens_seen": 11930592, + "step": 5520 + }, + { + "epoch": 0.901305057096248, + "grad_norm": 0.153912752866745, + "learning_rate": 0.0004505709624796085, + "loss": 0.3191, + "num_input_tokens_seen": 11941088, + "step": 5525 + }, + { + "epoch": 0.9021207177814029, + "grad_norm": 0.20019592344760895, + "learning_rate": 0.00045097879282218593, + "loss": 0.118, + "num_input_tokens_seen": 11952320, + "step": 5530 + }, + { + "epoch": 0.9029363784665579, + "grad_norm": 0.2389611005783081, + "learning_rate": 0.00045138662316476346, + "loss": 0.083, + "num_input_tokens_seen": 11962880, + "step": 5535 + }, + { + "epoch": 0.9037520391517129, + "grad_norm": 0.08271603286266327, + "learning_rate": 0.00045179445350734094, + "loss": 0.0364, + "num_input_tokens_seen": 11973472, + "step": 5540 + }, + { + "epoch": 0.9045676998368679, + "grad_norm": 0.5467060208320618, + "learning_rate": 0.00045220228384991847, + "loss": 0.4195, + "num_input_tokens_seen": 11984096, + "step": 5545 + }, + { + "epoch": 0.9053833605220228, + "grad_norm": 0.24795718491077423, + "learning_rate": 0.00045261011419249595, + "loss": 0.1156, + "num_input_tokens_seen": 11995488, + "step": 5550 + }, + { + "epoch": 0.9061990212071778, + "grad_norm": 0.33023780584335327, + "learning_rate": 0.0004530179445350734, + "loss": 0.1028, + "num_input_tokens_seen": 12007104, + "step": 5555 + }, + { + "epoch": 0.9070146818923328, + "grad_norm": 0.3138332664966583, + "learning_rate": 0.0004534257748776509, + "loss": 0.1619, + "num_input_tokens_seen": 12018688, + "step": 5560 + }, + { + "epoch": 0.9078303425774877, + "grad_norm": 0.11865737289190292, + "learning_rate": 0.0004538336052202284, + "loss": 0.1064, + "num_input_tokens_seen": 12028096, + "step": 5565 + }, + { + "epoch": 0.9086460032626428, + "grad_norm": 0.3372235894203186, + "learning_rate": 0.0004542414355628059, + "loss": 0.1468, + "num_input_tokens_seen": 12038976, + "step": 5570 + }, + { + "epoch": 0.9094616639477977, + "grad_norm": 0.19052010774612427, + "learning_rate": 0.0004546492659053834, + "loss": 0.1107, + "num_input_tokens_seen": 12050816, + "step": 5575 + }, + { + "epoch": 0.9102773246329527, + "grad_norm": 0.21978150308132172, + "learning_rate": 0.00045505709624796086, + "loss": 0.1906, + "num_input_tokens_seen": 12062336, + "step": 5580 + }, + { + "epoch": 0.9110929853181077, + "grad_norm": 0.9295349717140198, + "learning_rate": 0.00045546492659053833, + "loss": 0.1756, + "num_input_tokens_seen": 12072096, + "step": 5585 + }, + { + "epoch": 0.9119086460032626, + "grad_norm": 0.2849160432815552, + "learning_rate": 0.0004558727569331158, + "loss": 0.0891, + "num_input_tokens_seen": 12083136, + "step": 5590 + }, + { + "epoch": 0.9127243066884176, + "grad_norm": 0.12839558720588684, + "learning_rate": 0.00045628058727569334, + "loss": 0.1447, + "num_input_tokens_seen": 12094272, + "step": 5595 + }, + { + "epoch": 0.9135399673735726, + "grad_norm": 0.07938763499259949, + "learning_rate": 0.00045668841761827076, + "loss": 0.1533, + "num_input_tokens_seen": 12105280, + "step": 5600 + }, + { + "epoch": 0.9143556280587276, + "grad_norm": 0.11017264425754547, + "learning_rate": 0.0004570962479608483, + "loss": 0.1845, + "num_input_tokens_seen": 12115968, + "step": 5605 + }, + { + "epoch": 0.9151712887438825, + "grad_norm": 0.4598122537136078, + "learning_rate": 0.00045750407830342577, + "loss": 0.1534, + "num_input_tokens_seen": 12127136, + "step": 5610 + }, + { + "epoch": 0.9159869494290375, + "grad_norm": 0.23632459342479706, + "learning_rate": 0.0004579119086460033, + "loss": 0.0649, + "num_input_tokens_seen": 12136704, + "step": 5615 + }, + { + "epoch": 0.9168026101141925, + "grad_norm": 0.07224715501070023, + "learning_rate": 0.0004583197389885808, + "loss": 0.0862, + "num_input_tokens_seen": 12146784, + "step": 5620 + }, + { + "epoch": 0.9176182707993474, + "grad_norm": 0.2349843680858612, + "learning_rate": 0.0004587275693311582, + "loss": 0.0717, + "num_input_tokens_seen": 12157248, + "step": 5625 + }, + { + "epoch": 0.9184339314845025, + "grad_norm": 0.19105643033981323, + "learning_rate": 0.00045913539967373573, + "loss": 0.1553, + "num_input_tokens_seen": 12168256, + "step": 5630 + }, + { + "epoch": 0.9192495921696574, + "grad_norm": 0.1620476394891739, + "learning_rate": 0.0004595432300163132, + "loss": 0.0696, + "num_input_tokens_seen": 12177760, + "step": 5635 + }, + { + "epoch": 0.9200652528548124, + "grad_norm": 0.13576938211917877, + "learning_rate": 0.00045995106035889074, + "loss": 0.0583, + "num_input_tokens_seen": 12188192, + "step": 5640 + }, + { + "epoch": 0.9208809135399674, + "grad_norm": 0.46857231855392456, + "learning_rate": 0.0004603588907014682, + "loss": 0.2646, + "num_input_tokens_seen": 12198880, + "step": 5645 + }, + { + "epoch": 0.9216965742251223, + "grad_norm": 0.06446434557437897, + "learning_rate": 0.0004607667210440457, + "loss": 0.1958, + "num_input_tokens_seen": 12208800, + "step": 5650 + }, + { + "epoch": 0.9225122349102773, + "grad_norm": 0.052255790680646896, + "learning_rate": 0.00046117455138662317, + "loss": 0.1403, + "num_input_tokens_seen": 12219360, + "step": 5655 + }, + { + "epoch": 0.9233278955954323, + "grad_norm": 0.8032695055007935, + "learning_rate": 0.00046158238172920064, + "loss": 0.1231, + "num_input_tokens_seen": 12229376, + "step": 5660 + }, + { + "epoch": 0.9241435562805873, + "grad_norm": 0.2427329272031784, + "learning_rate": 0.0004619902120717782, + "loss": 0.1765, + "num_input_tokens_seen": 12241152, + "step": 5665 + }, + { + "epoch": 0.9249592169657422, + "grad_norm": 0.1363193839788437, + "learning_rate": 0.0004623980424143556, + "loss": 0.1933, + "num_input_tokens_seen": 12252192, + "step": 5670 + }, + { + "epoch": 0.9257748776508973, + "grad_norm": 0.09399381279945374, + "learning_rate": 0.00046280587275693313, + "loss": 0.1296, + "num_input_tokens_seen": 12262848, + "step": 5675 + }, + { + "epoch": 0.9265905383360522, + "grad_norm": 0.16970834136009216, + "learning_rate": 0.0004632137030995106, + "loss": 0.1107, + "num_input_tokens_seen": 12273728, + "step": 5680 + }, + { + "epoch": 0.9274061990212071, + "grad_norm": 0.11786321550607681, + "learning_rate": 0.00046362153344208813, + "loss": 0.0899, + "num_input_tokens_seen": 12284512, + "step": 5685 + }, + { + "epoch": 0.9282218597063622, + "grad_norm": 0.6774787902832031, + "learning_rate": 0.0004640293637846656, + "loss": 0.2493, + "num_input_tokens_seen": 12295968, + "step": 5690 + }, + { + "epoch": 0.9290375203915171, + "grad_norm": 0.0617743618786335, + "learning_rate": 0.00046443719412724303, + "loss": 0.0964, + "num_input_tokens_seen": 12307616, + "step": 5695 + }, + { + "epoch": 0.9298531810766721, + "grad_norm": 0.23958179354667664, + "learning_rate": 0.00046484502446982056, + "loss": 0.1443, + "num_input_tokens_seen": 12318400, + "step": 5700 + }, + { + "epoch": 0.9306688417618271, + "grad_norm": 0.09052935242652893, + "learning_rate": 0.00046525285481239804, + "loss": 0.1503, + "num_input_tokens_seen": 12329472, + "step": 5705 + }, + { + "epoch": 0.9314845024469821, + "grad_norm": 0.11406863480806351, + "learning_rate": 0.00046566068515497557, + "loss": 0.0592, + "num_input_tokens_seen": 12340032, + "step": 5710 + }, + { + "epoch": 0.932300163132137, + "grad_norm": 0.15726540982723236, + "learning_rate": 0.00046606851549755305, + "loss": 0.0964, + "num_input_tokens_seen": 12351584, + "step": 5715 + }, + { + "epoch": 0.933115823817292, + "grad_norm": 0.3423006236553192, + "learning_rate": 0.00046647634584013047, + "loss": 0.2585, + "num_input_tokens_seen": 12362752, + "step": 5720 + }, + { + "epoch": 0.933931484502447, + "grad_norm": 0.17774665355682373, + "learning_rate": 0.000466884176182708, + "loss": 0.1728, + "num_input_tokens_seen": 12374720, + "step": 5725 + }, + { + "epoch": 0.9347471451876019, + "grad_norm": 0.15029606223106384, + "learning_rate": 0.0004672920065252855, + "loss": 0.1353, + "num_input_tokens_seen": 12384896, + "step": 5730 + }, + { + "epoch": 0.935562805872757, + "grad_norm": 0.07887958735227585, + "learning_rate": 0.000467699836867863, + "loss": 0.1015, + "num_input_tokens_seen": 12395808, + "step": 5735 + }, + { + "epoch": 0.9363784665579119, + "grad_norm": 0.27178138494491577, + "learning_rate": 0.0004681076672104405, + "loss": 0.1417, + "num_input_tokens_seen": 12406880, + "step": 5740 + }, + { + "epoch": 0.9371941272430668, + "grad_norm": 0.3018706440925598, + "learning_rate": 0.00046851549755301796, + "loss": 0.2522, + "num_input_tokens_seen": 12418240, + "step": 5745 + }, + { + "epoch": 0.9380097879282219, + "grad_norm": 0.26374009251594543, + "learning_rate": 0.00046892332789559544, + "loss": 0.2575, + "num_input_tokens_seen": 12428896, + "step": 5750 + }, + { + "epoch": 0.9388254486133768, + "grad_norm": 0.10178331285715103, + "learning_rate": 0.0004693311582381729, + "loss": 0.1135, + "num_input_tokens_seen": 12439872, + "step": 5755 + }, + { + "epoch": 0.9396411092985318, + "grad_norm": 0.03823598846793175, + "learning_rate": 0.00046973898858075044, + "loss": 0.0308, + "num_input_tokens_seen": 12451840, + "step": 5760 + }, + { + "epoch": 0.9404567699836868, + "grad_norm": 0.07429031282663345, + "learning_rate": 0.00047014681892332787, + "loss": 0.0888, + "num_input_tokens_seen": 12463328, + "step": 5765 + }, + { + "epoch": 0.9412724306688418, + "grad_norm": 0.053411390632390976, + "learning_rate": 0.0004705546492659054, + "loss": 0.2314, + "num_input_tokens_seen": 12474080, + "step": 5770 + }, + { + "epoch": 0.9420880913539967, + "grad_norm": 0.10605911165475845, + "learning_rate": 0.0004709624796084829, + "loss": 0.2763, + "num_input_tokens_seen": 12484960, + "step": 5775 + }, + { + "epoch": 0.9429037520391517, + "grad_norm": 0.24866236746311188, + "learning_rate": 0.0004713703099510604, + "loss": 0.0808, + "num_input_tokens_seen": 12495584, + "step": 5780 + }, + { + "epoch": 0.9437194127243067, + "grad_norm": 0.09534251689910889, + "learning_rate": 0.0004717781402936379, + "loss": 0.1366, + "num_input_tokens_seen": 12505568, + "step": 5785 + }, + { + "epoch": 0.9445350734094616, + "grad_norm": 0.09393291175365448, + "learning_rate": 0.0004721859706362153, + "loss": 0.1921, + "num_input_tokens_seen": 12516480, + "step": 5790 + }, + { + "epoch": 0.9453507340946167, + "grad_norm": 0.18047569692134857, + "learning_rate": 0.00047259380097879283, + "loss": 0.1783, + "num_input_tokens_seen": 12527136, + "step": 5795 + }, + { + "epoch": 0.9461663947797716, + "grad_norm": 0.1642356663942337, + "learning_rate": 0.0004730016313213703, + "loss": 0.1689, + "num_input_tokens_seen": 12537568, + "step": 5800 + }, + { + "epoch": 0.9469820554649266, + "grad_norm": 0.13363811373710632, + "learning_rate": 0.00047340946166394784, + "loss": 0.1009, + "num_input_tokens_seen": 12548736, + "step": 5805 + }, + { + "epoch": 0.9477977161500816, + "grad_norm": 0.04642777889966965, + "learning_rate": 0.0004738172920065253, + "loss": 0.1228, + "num_input_tokens_seen": 12560192, + "step": 5810 + }, + { + "epoch": 0.9486133768352365, + "grad_norm": 0.04871002584695816, + "learning_rate": 0.00047422512234910274, + "loss": 0.0947, + "num_input_tokens_seen": 12570816, + "step": 5815 + }, + { + "epoch": 0.9494290375203915, + "grad_norm": 0.8384509086608887, + "learning_rate": 0.00047463295269168027, + "loss": 0.2352, + "num_input_tokens_seen": 12582272, + "step": 5820 + }, + { + "epoch": 0.9502446982055465, + "grad_norm": 0.06325852870941162, + "learning_rate": 0.00047504078303425775, + "loss": 0.124, + "num_input_tokens_seen": 12594432, + "step": 5825 + }, + { + "epoch": 0.9510603588907015, + "grad_norm": 0.09114784002304077, + "learning_rate": 0.0004754486133768353, + "loss": 0.0276, + "num_input_tokens_seen": 12606048, + "step": 5830 + }, + { + "epoch": 0.9518760195758564, + "grad_norm": 0.28845664858818054, + "learning_rate": 0.0004758564437194127, + "loss": 0.1905, + "num_input_tokens_seen": 12617088, + "step": 5835 + }, + { + "epoch": 0.9526916802610114, + "grad_norm": 0.2616293132305145, + "learning_rate": 0.00047626427406199023, + "loss": 0.1532, + "num_input_tokens_seen": 12627808, + "step": 5840 + }, + { + "epoch": 0.9535073409461664, + "grad_norm": 0.03393285721540451, + "learning_rate": 0.0004766721044045677, + "loss": 0.0426, + "num_input_tokens_seen": 12638720, + "step": 5845 + }, + { + "epoch": 0.9543230016313213, + "grad_norm": 0.2862362861633301, + "learning_rate": 0.0004770799347471452, + "loss": 0.1367, + "num_input_tokens_seen": 12648608, + "step": 5850 + }, + { + "epoch": 0.9551386623164764, + "grad_norm": 0.07333455979824066, + "learning_rate": 0.0004774877650897227, + "loss": 0.0314, + "num_input_tokens_seen": 12658944, + "step": 5855 + }, + { + "epoch": 0.9559543230016313, + "grad_norm": 0.568708598613739, + "learning_rate": 0.00047789559543230014, + "loss": 0.1792, + "num_input_tokens_seen": 12669856, + "step": 5860 + }, + { + "epoch": 0.9567699836867863, + "grad_norm": 0.3254932463169098, + "learning_rate": 0.00047830342577487767, + "loss": 0.1766, + "num_input_tokens_seen": 12680096, + "step": 5865 + }, + { + "epoch": 0.9575856443719413, + "grad_norm": 0.011022812686860561, + "learning_rate": 0.00047871125611745514, + "loss": 0.0274, + "num_input_tokens_seen": 12691040, + "step": 5870 + }, + { + "epoch": 0.9584013050570962, + "grad_norm": 0.021599851548671722, + "learning_rate": 0.0004791190864600327, + "loss": 0.1182, + "num_input_tokens_seen": 12701728, + "step": 5875 + }, + { + "epoch": 0.9592169657422512, + "grad_norm": 0.08620810508728027, + "learning_rate": 0.00047952691680261015, + "loss": 0.1513, + "num_input_tokens_seen": 12712224, + "step": 5880 + }, + { + "epoch": 0.9600326264274062, + "grad_norm": 0.18727059662342072, + "learning_rate": 0.00047993474714518757, + "loss": 0.1962, + "num_input_tokens_seen": 12723264, + "step": 5885 + }, + { + "epoch": 0.9608482871125612, + "grad_norm": 0.24827536940574646, + "learning_rate": 0.0004803425774877651, + "loss": 0.1598, + "num_input_tokens_seen": 12734400, + "step": 5890 + }, + { + "epoch": 0.9616639477977161, + "grad_norm": 0.015218405053019524, + "learning_rate": 0.0004807504078303426, + "loss": 0.1146, + "num_input_tokens_seen": 12745504, + "step": 5895 + }, + { + "epoch": 0.9624796084828712, + "grad_norm": 0.2958707809448242, + "learning_rate": 0.0004811582381729201, + "loss": 0.165, + "num_input_tokens_seen": 12755520, + "step": 5900 + }, + { + "epoch": 0.9632952691680261, + "grad_norm": 0.24511082470417023, + "learning_rate": 0.00048156606851549753, + "loss": 0.0566, + "num_input_tokens_seen": 12766336, + "step": 5905 + }, + { + "epoch": 0.964110929853181, + "grad_norm": 0.5579676628112793, + "learning_rate": 0.00048197389885807506, + "loss": 0.1712, + "num_input_tokens_seen": 12776704, + "step": 5910 + }, + { + "epoch": 0.9649265905383361, + "grad_norm": 0.12991249561309814, + "learning_rate": 0.00048238172920065254, + "loss": 0.2261, + "num_input_tokens_seen": 12787872, + "step": 5915 + }, + { + "epoch": 0.965742251223491, + "grad_norm": 0.17312704026699066, + "learning_rate": 0.00048278955954323, + "loss": 0.0663, + "num_input_tokens_seen": 12798880, + "step": 5920 + }, + { + "epoch": 0.966557911908646, + "grad_norm": 0.5402299761772156, + "learning_rate": 0.00048319738988580755, + "loss": 0.1603, + "num_input_tokens_seen": 12809696, + "step": 5925 + }, + { + "epoch": 0.967373572593801, + "grad_norm": 0.09169294685125351, + "learning_rate": 0.00048360522022838497, + "loss": 0.1146, + "num_input_tokens_seen": 12820192, + "step": 5930 + }, + { + "epoch": 0.968189233278956, + "grad_norm": 0.09496288001537323, + "learning_rate": 0.0004840130505709625, + "loss": 0.0673, + "num_input_tokens_seen": 12832384, + "step": 5935 + }, + { + "epoch": 0.9690048939641109, + "grad_norm": 0.2580087184906006, + "learning_rate": 0.00048442088091354, + "loss": 0.1096, + "num_input_tokens_seen": 12843936, + "step": 5940 + }, + { + "epoch": 0.9698205546492659, + "grad_norm": 0.14182347059249878, + "learning_rate": 0.00048482871125611745, + "loss": 0.1778, + "num_input_tokens_seen": 12854752, + "step": 5945 + }, + { + "epoch": 0.9706362153344209, + "grad_norm": 0.3405098617076874, + "learning_rate": 0.000485236541598695, + "loss": 0.0879, + "num_input_tokens_seen": 12865248, + "step": 5950 + }, + { + "epoch": 0.9714518760195758, + "grad_norm": 0.34928223490715027, + "learning_rate": 0.0004856443719412724, + "loss": 0.1449, + "num_input_tokens_seen": 12876128, + "step": 5955 + }, + { + "epoch": 0.9722675367047309, + "grad_norm": 0.4829985201358795, + "learning_rate": 0.00048605220228384994, + "loss": 0.1794, + "num_input_tokens_seen": 12887008, + "step": 5960 + }, + { + "epoch": 0.9730831973898858, + "grad_norm": 0.005287457723170519, + "learning_rate": 0.0004864600326264274, + "loss": 0.0562, + "num_input_tokens_seen": 12898624, + "step": 5965 + }, + { + "epoch": 0.9738988580750407, + "grad_norm": 0.05972887948155403, + "learning_rate": 0.00048686786296900494, + "loss": 0.064, + "num_input_tokens_seen": 12909888, + "step": 5970 + }, + { + "epoch": 0.9747145187601958, + "grad_norm": 0.8596710562705994, + "learning_rate": 0.00048727569331158237, + "loss": 0.3676, + "num_input_tokens_seen": 12920032, + "step": 5975 + }, + { + "epoch": 0.9755301794453507, + "grad_norm": 0.5911656022071838, + "learning_rate": 0.00048768352365415984, + "loss": 0.3441, + "num_input_tokens_seen": 12931392, + "step": 5980 + }, + { + "epoch": 0.9763458401305057, + "grad_norm": 0.1250295490026474, + "learning_rate": 0.00048809135399673737, + "loss": 0.1476, + "num_input_tokens_seen": 12941472, + "step": 5985 + }, + { + "epoch": 0.9771615008156607, + "grad_norm": 0.12026496976613998, + "learning_rate": 0.0004884991843393148, + "loss": 0.1644, + "num_input_tokens_seen": 12952192, + "step": 5990 + }, + { + "epoch": 0.9779771615008157, + "grad_norm": 0.4380301833152771, + "learning_rate": 0.0004889070146818923, + "loss": 0.2336, + "num_input_tokens_seen": 12963200, + "step": 5995 + }, + { + "epoch": 0.9787928221859706, + "grad_norm": 0.22438715398311615, + "learning_rate": 0.0004893148450244698, + "loss": 0.1858, + "num_input_tokens_seen": 12972768, + "step": 6000 + }, + { + "epoch": 0.9796084828711256, + "grad_norm": 0.19824226200580597, + "learning_rate": 0.0004897226753670474, + "loss": 0.099, + "num_input_tokens_seen": 12984064, + "step": 6005 + }, + { + "epoch": 0.9804241435562806, + "grad_norm": 0.11898528039455414, + "learning_rate": 0.0004901305057096248, + "loss": 0.1879, + "num_input_tokens_seen": 12994336, + "step": 6010 + }, + { + "epoch": 0.9812398042414355, + "grad_norm": 0.1742682009935379, + "learning_rate": 0.0004905383360522022, + "loss": 0.0591, + "num_input_tokens_seen": 13004096, + "step": 6015 + }, + { + "epoch": 0.9820554649265906, + "grad_norm": 0.06963789463043213, + "learning_rate": 0.0004909461663947798, + "loss": 0.0412, + "num_input_tokens_seen": 13015168, + "step": 6020 + }, + { + "epoch": 0.9828711256117455, + "grad_norm": 0.06431388854980469, + "learning_rate": 0.0004913539967373573, + "loss": 0.0707, + "num_input_tokens_seen": 13026080, + "step": 6025 + }, + { + "epoch": 0.9836867862969005, + "grad_norm": 1.0310012102127075, + "learning_rate": 0.0004917618270799348, + "loss": 0.2891, + "num_input_tokens_seen": 13037888, + "step": 6030 + }, + { + "epoch": 0.9845024469820555, + "grad_norm": 0.557636559009552, + "learning_rate": 0.0004921696574225122, + "loss": 0.3144, + "num_input_tokens_seen": 13049440, + "step": 6035 + }, + { + "epoch": 0.9853181076672104, + "grad_norm": 0.09070974588394165, + "learning_rate": 0.0004925774877650897, + "loss": 0.09, + "num_input_tokens_seen": 13060480, + "step": 6040 + }, + { + "epoch": 0.9861337683523654, + "grad_norm": 0.10034071654081345, + "learning_rate": 0.0004929853181076672, + "loss": 0.0741, + "num_input_tokens_seen": 13071584, + "step": 6045 + }, + { + "epoch": 0.9869494290375204, + "grad_norm": 0.45269960165023804, + "learning_rate": 0.0004933931484502447, + "loss": 0.2326, + "num_input_tokens_seen": 13081376, + "step": 6050 + }, + { + "epoch": 0.9877650897226754, + "grad_norm": 0.03084792196750641, + "learning_rate": 0.0004938009787928223, + "loss": 0.0724, + "num_input_tokens_seen": 13092768, + "step": 6055 + }, + { + "epoch": 0.9885807504078303, + "grad_norm": 0.09663219004869461, + "learning_rate": 0.0004942088091353996, + "loss": 0.181, + "num_input_tokens_seen": 13104096, + "step": 6060 + }, + { + "epoch": 0.9893964110929854, + "grad_norm": 0.0576571561396122, + "learning_rate": 0.0004946166394779772, + "loss": 0.1219, + "num_input_tokens_seen": 13115488, + "step": 6065 + }, + { + "epoch": 0.9902120717781403, + "grad_norm": 0.03444729745388031, + "learning_rate": 0.0004950244698205547, + "loss": 0.0611, + "num_input_tokens_seen": 13126752, + "step": 6070 + }, + { + "epoch": 0.9910277324632952, + "grad_norm": 0.13362960517406464, + "learning_rate": 0.0004954323001631322, + "loss": 0.1766, + "num_input_tokens_seen": 13137920, + "step": 6075 + }, + { + "epoch": 0.9918433931484503, + "grad_norm": 0.15596185624599457, + "learning_rate": 0.0004958401305057096, + "loss": 0.1385, + "num_input_tokens_seen": 13148416, + "step": 6080 + }, + { + "epoch": 0.9926590538336052, + "grad_norm": 0.5052196383476257, + "learning_rate": 0.0004962479608482871, + "loss": 0.1411, + "num_input_tokens_seen": 13157920, + "step": 6085 + }, + { + "epoch": 0.9934747145187602, + "grad_norm": 0.18146930634975433, + "learning_rate": 0.0004966557911908646, + "loss": 0.0864, + "num_input_tokens_seen": 13167968, + "step": 6090 + }, + { + "epoch": 0.9942903752039152, + "grad_norm": 0.4113079905509949, + "learning_rate": 0.0004970636215334421, + "loss": 0.1795, + "num_input_tokens_seen": 13179648, + "step": 6095 + }, + { + "epoch": 0.9951060358890701, + "grad_norm": 0.047881823033094406, + "learning_rate": 0.0004974714518760197, + "loss": 0.1678, + "num_input_tokens_seen": 13190656, + "step": 6100 + }, + { + "epoch": 0.9959216965742251, + "grad_norm": 0.06008487567305565, + "learning_rate": 0.000497879282218597, + "loss": 0.0872, + "num_input_tokens_seen": 13201120, + "step": 6105 + }, + { + "epoch": 0.9967373572593801, + "grad_norm": 0.19662658870220184, + "learning_rate": 0.0004982871125611745, + "loss": 0.1092, + "num_input_tokens_seen": 13211744, + "step": 6110 + }, + { + "epoch": 0.9975530179445351, + "grad_norm": 0.02735212817788124, + "learning_rate": 0.0004986949429037521, + "loss": 0.2012, + "num_input_tokens_seen": 13222752, + "step": 6115 + }, + { + "epoch": 0.99836867862969, + "grad_norm": 0.06319998949766159, + "learning_rate": 0.0004991027732463296, + "loss": 0.1773, + "num_input_tokens_seen": 13233696, + "step": 6120 + }, + { + "epoch": 0.9991843393148451, + "grad_norm": 0.23112884163856506, + "learning_rate": 0.000499510603588907, + "loss": 0.174, + "num_input_tokens_seen": 13245504, + "step": 6125 + }, + { + "epoch": 1.0, + "grad_norm": 0.052175283432006836, + "learning_rate": 0.0004999184339314845, + "loss": 0.1392, + "num_input_tokens_seen": 13255424, + "step": 6130 + }, + { + "epoch": 1.0, + "eval_loss": 0.13635002076625824, + "eval_runtime": 103.8719, + "eval_samples_per_second": 26.234, + "eval_steps_per_second": 6.566, + "num_input_tokens_seen": 13255424, + "step": 6130 + }, + { + "epoch": 1.000815660685155, + "grad_norm": 0.20552437007427216, + "learning_rate": 0.000500326264274062, + "loss": 0.0946, + "num_input_tokens_seen": 13265952, + "step": 6135 + }, + { + "epoch": 1.0016313213703099, + "grad_norm": 0.11177679896354675, + "learning_rate": 0.0005007340946166395, + "loss": 0.0879, + "num_input_tokens_seen": 13276768, + "step": 6140 + }, + { + "epoch": 1.002446982055465, + "grad_norm": 0.15691284835338593, + "learning_rate": 0.0005011419249592169, + "loss": 0.2027, + "num_input_tokens_seen": 13287776, + "step": 6145 + }, + { + "epoch": 1.00326264274062, + "grad_norm": 0.07786725461483002, + "learning_rate": 0.0005015497553017944, + "loss": 0.2037, + "num_input_tokens_seen": 13299488, + "step": 6150 + }, + { + "epoch": 1.004078303425775, + "grad_norm": 0.4646124243736267, + "learning_rate": 0.000501957585644372, + "loss": 0.1619, + "num_input_tokens_seen": 13310208, + "step": 6155 + }, + { + "epoch": 1.0048939641109298, + "grad_norm": 0.06262074410915375, + "learning_rate": 0.0005023654159869494, + "loss": 0.0411, + "num_input_tokens_seen": 13319904, + "step": 6160 + }, + { + "epoch": 1.0057096247960848, + "grad_norm": 0.08551330119371414, + "learning_rate": 0.000502773246329527, + "loss": 0.0911, + "num_input_tokens_seen": 13330528, + "step": 6165 + }, + { + "epoch": 1.0065252854812399, + "grad_norm": 0.04239974915981293, + "learning_rate": 0.0005031810766721044, + "loss": 0.0828, + "num_input_tokens_seen": 13341760, + "step": 6170 + }, + { + "epoch": 1.0073409461663947, + "grad_norm": 0.04535163193941116, + "learning_rate": 0.0005035889070146818, + "loss": 0.2349, + "num_input_tokens_seen": 13352448, + "step": 6175 + }, + { + "epoch": 1.0081566068515497, + "grad_norm": 0.3622733950614929, + "learning_rate": 0.0005039967373572594, + "loss": 0.2006, + "num_input_tokens_seen": 13363296, + "step": 6180 + }, + { + "epoch": 1.0089722675367048, + "grad_norm": 0.34349578619003296, + "learning_rate": 0.0005044045676998369, + "loss": 0.0696, + "num_input_tokens_seen": 13374368, + "step": 6185 + }, + { + "epoch": 1.0097879282218598, + "grad_norm": 0.09896185249090195, + "learning_rate": 0.0005048123980424144, + "loss": 0.0816, + "num_input_tokens_seen": 13385184, + "step": 6190 + }, + { + "epoch": 1.0106035889070146, + "grad_norm": 0.48803338408470154, + "learning_rate": 0.0005052202283849918, + "loss": 0.2371, + "num_input_tokens_seen": 13395104, + "step": 6195 + }, + { + "epoch": 1.0114192495921697, + "grad_norm": 0.2075025737285614, + "learning_rate": 0.0005056280587275693, + "loss": 0.06, + "num_input_tokens_seen": 13406080, + "step": 6200 + }, + { + "epoch": 1.0122349102773247, + "grad_norm": 0.3805916905403137, + "learning_rate": 0.0005060358890701469, + "loss": 0.1059, + "num_input_tokens_seen": 13417952, + "step": 6205 + }, + { + "epoch": 1.0130505709624795, + "grad_norm": 0.20125994086265564, + "learning_rate": 0.0005064437194127242, + "loss": 0.1306, + "num_input_tokens_seen": 13428512, + "step": 6210 + }, + { + "epoch": 1.0138662316476346, + "grad_norm": 0.1493004858493805, + "learning_rate": 0.0005068515497553018, + "loss": 0.0985, + "num_input_tokens_seen": 13439648, + "step": 6215 + }, + { + "epoch": 1.0146818923327896, + "grad_norm": 0.15149638056755066, + "learning_rate": 0.0005072593800978793, + "loss": 0.1974, + "num_input_tokens_seen": 13451520, + "step": 6220 + }, + { + "epoch": 1.0154975530179446, + "grad_norm": 0.5729100108146667, + "learning_rate": 0.0005076672104404568, + "loss": 0.1164, + "num_input_tokens_seen": 13460480, + "step": 6225 + }, + { + "epoch": 1.0163132137030995, + "grad_norm": 0.22920866310596466, + "learning_rate": 0.0005080750407830343, + "loss": 0.115, + "num_input_tokens_seen": 13471328, + "step": 6230 + }, + { + "epoch": 1.0171288743882545, + "grad_norm": 0.4724791347980499, + "learning_rate": 0.0005084828711256117, + "loss": 0.1769, + "num_input_tokens_seen": 13482048, + "step": 6235 + }, + { + "epoch": 1.0179445350734095, + "grad_norm": 0.32108229398727417, + "learning_rate": 0.0005088907014681893, + "loss": 0.1663, + "num_input_tokens_seen": 13492512, + "step": 6240 + }, + { + "epoch": 1.0187601957585644, + "grad_norm": 0.29344242811203003, + "learning_rate": 0.0005092985318107667, + "loss": 0.1347, + "num_input_tokens_seen": 13504000, + "step": 6245 + }, + { + "epoch": 1.0195758564437194, + "grad_norm": 0.05758266523480415, + "learning_rate": 0.0005097063621533442, + "loss": 0.1329, + "num_input_tokens_seen": 13514784, + "step": 6250 + }, + { + "epoch": 1.0203915171288744, + "grad_norm": 0.2905932664871216, + "learning_rate": 0.0005101141924959218, + "loss": 0.1403, + "num_input_tokens_seen": 13525376, + "step": 6255 + }, + { + "epoch": 1.0212071778140293, + "grad_norm": 0.10168883949518204, + "learning_rate": 0.0005105220228384992, + "loss": 0.1752, + "num_input_tokens_seen": 13535872, + "step": 6260 + }, + { + "epoch": 1.0220228384991843, + "grad_norm": 0.2596852779388428, + "learning_rate": 0.0005109298531810767, + "loss": 0.1772, + "num_input_tokens_seen": 13546464, + "step": 6265 + }, + { + "epoch": 1.0228384991843393, + "grad_norm": 0.128738671541214, + "learning_rate": 0.0005113376835236542, + "loss": 0.1972, + "num_input_tokens_seen": 13555936, + "step": 6270 + }, + { + "epoch": 1.0236541598694944, + "grad_norm": 0.13057231903076172, + "learning_rate": 0.0005117455138662317, + "loss": 0.0531, + "num_input_tokens_seen": 13566528, + "step": 6275 + }, + { + "epoch": 1.0244698205546492, + "grad_norm": 0.09578564763069153, + "learning_rate": 0.0005121533442088091, + "loss": 0.1299, + "num_input_tokens_seen": 13577152, + "step": 6280 + }, + { + "epoch": 1.0252854812398042, + "grad_norm": 0.44692936539649963, + "learning_rate": 0.0005125611745513866, + "loss": 0.1396, + "num_input_tokens_seen": 13588704, + "step": 6285 + }, + { + "epoch": 1.0261011419249593, + "grad_norm": 0.2416256219148636, + "learning_rate": 0.0005129690048939642, + "loss": 0.0655, + "num_input_tokens_seen": 13599552, + "step": 6290 + }, + { + "epoch": 1.026916802610114, + "grad_norm": 0.0603041872382164, + "learning_rate": 0.0005133768352365417, + "loss": 0.0421, + "num_input_tokens_seen": 13611264, + "step": 6295 + }, + { + "epoch": 1.0277324632952691, + "grad_norm": 0.3074585199356079, + "learning_rate": 0.000513784665579119, + "loss": 0.1716, + "num_input_tokens_seen": 13621760, + "step": 6300 + }, + { + "epoch": 1.0285481239804242, + "grad_norm": 0.6041052341461182, + "learning_rate": 0.0005141924959216966, + "loss": 0.2124, + "num_input_tokens_seen": 13632672, + "step": 6305 + }, + { + "epoch": 1.0293637846655792, + "grad_norm": 0.0694386288523674, + "learning_rate": 0.0005146003262642741, + "loss": 0.0206, + "num_input_tokens_seen": 13643584, + "step": 6310 + }, + { + "epoch": 1.030179445350734, + "grad_norm": 0.062036722898483276, + "learning_rate": 0.0005150081566068515, + "loss": 0.241, + "num_input_tokens_seen": 13655488, + "step": 6315 + }, + { + "epoch": 1.030995106035889, + "grad_norm": 0.1432199329137802, + "learning_rate": 0.000515415986949429, + "loss": 0.145, + "num_input_tokens_seen": 13666368, + "step": 6320 + }, + { + "epoch": 1.031810766721044, + "grad_norm": 0.25270965695381165, + "learning_rate": 0.0005158238172920065, + "loss": 0.1853, + "num_input_tokens_seen": 13678048, + "step": 6325 + }, + { + "epoch": 1.032626427406199, + "grad_norm": 0.31149885058403015, + "learning_rate": 0.0005162316476345841, + "loss": 0.0854, + "num_input_tokens_seen": 13689024, + "step": 6330 + }, + { + "epoch": 1.033442088091354, + "grad_norm": 0.02859092690050602, + "learning_rate": 0.0005166394779771615, + "loss": 0.1719, + "num_input_tokens_seen": 13699744, + "step": 6335 + }, + { + "epoch": 1.034257748776509, + "grad_norm": 0.0813438668847084, + "learning_rate": 0.000517047308319739, + "loss": 0.1306, + "num_input_tokens_seen": 13710016, + "step": 6340 + }, + { + "epoch": 1.035073409461664, + "grad_norm": 0.1725296825170517, + "learning_rate": 0.0005174551386623165, + "loss": 0.0873, + "num_input_tokens_seen": 13721664, + "step": 6345 + }, + { + "epoch": 1.0358890701468189, + "grad_norm": 0.40978899598121643, + "learning_rate": 0.0005178629690048939, + "loss": 0.0463, + "num_input_tokens_seen": 13734752, + "step": 6350 + }, + { + "epoch": 1.036704730831974, + "grad_norm": 0.05709673836827278, + "learning_rate": 0.0005182707993474715, + "loss": 0.0884, + "num_input_tokens_seen": 13745504, + "step": 6355 + }, + { + "epoch": 1.037520391517129, + "grad_norm": 0.444414883852005, + "learning_rate": 0.000518678629690049, + "loss": 0.1339, + "num_input_tokens_seen": 13755968, + "step": 6360 + }, + { + "epoch": 1.0383360522022838, + "grad_norm": 0.39112377166748047, + "learning_rate": 0.0005190864600326263, + "loss": 0.1134, + "num_input_tokens_seen": 13767296, + "step": 6365 + }, + { + "epoch": 1.0391517128874388, + "grad_norm": 0.14747588336467743, + "learning_rate": 0.0005194942903752039, + "loss": 0.0949, + "num_input_tokens_seen": 13778656, + "step": 6370 + }, + { + "epoch": 1.0399673735725938, + "grad_norm": 0.8481307625770569, + "learning_rate": 0.0005199021207177814, + "loss": 0.1337, + "num_input_tokens_seen": 13789216, + "step": 6375 + }, + { + "epoch": 1.0407830342577489, + "grad_norm": 0.2780773341655731, + "learning_rate": 0.000520309951060359, + "loss": 0.0894, + "num_input_tokens_seen": 13799680, + "step": 6380 + }, + { + "epoch": 1.0415986949429037, + "grad_norm": 0.016589326784014702, + "learning_rate": 0.0005207177814029364, + "loss": 0.118, + "num_input_tokens_seen": 13810720, + "step": 6385 + }, + { + "epoch": 1.0424143556280587, + "grad_norm": 0.10203398019075394, + "learning_rate": 0.0005211256117455138, + "loss": 0.0776, + "num_input_tokens_seen": 13821856, + "step": 6390 + }, + { + "epoch": 1.0432300163132138, + "grad_norm": 0.08254091441631317, + "learning_rate": 0.0005215334420880914, + "loss": 0.2058, + "num_input_tokens_seen": 13833984, + "step": 6395 + }, + { + "epoch": 1.0440456769983686, + "grad_norm": 0.2902688682079315, + "learning_rate": 0.0005219412724306688, + "loss": 0.2484, + "num_input_tokens_seen": 13844928, + "step": 6400 + }, + { + "epoch": 1.0448613376835236, + "grad_norm": 1.066491723060608, + "learning_rate": 0.0005223491027732464, + "loss": 0.1819, + "num_input_tokens_seen": 13856384, + "step": 6405 + }, + { + "epoch": 1.0456769983686787, + "grad_norm": 0.050130825489759445, + "learning_rate": 0.0005227569331158238, + "loss": 0.0902, + "num_input_tokens_seen": 13867456, + "step": 6410 + }, + { + "epoch": 1.0464926590538337, + "grad_norm": 0.15338997542858124, + "learning_rate": 0.0005231647634584013, + "loss": 0.0667, + "num_input_tokens_seen": 13877216, + "step": 6415 + }, + { + "epoch": 1.0473083197389885, + "grad_norm": 0.21062275767326355, + "learning_rate": 0.0005235725938009788, + "loss": 0.3659, + "num_input_tokens_seen": 13888192, + "step": 6420 + }, + { + "epoch": 1.0481239804241436, + "grad_norm": 0.07641365379095078, + "learning_rate": 0.0005239804241435563, + "loss": 0.0418, + "num_input_tokens_seen": 13899072, + "step": 6425 + }, + { + "epoch": 1.0489396411092986, + "grad_norm": 0.19561930000782013, + "learning_rate": 0.0005243882544861339, + "loss": 0.1995, + "num_input_tokens_seen": 13909792, + "step": 6430 + }, + { + "epoch": 1.0497553017944534, + "grad_norm": 0.18206077814102173, + "learning_rate": 0.0005247960848287112, + "loss": 0.1011, + "num_input_tokens_seen": 13920800, + "step": 6435 + }, + { + "epoch": 1.0505709624796085, + "grad_norm": 0.15215028822422028, + "learning_rate": 0.0005252039151712887, + "loss": 0.0923, + "num_input_tokens_seen": 13930976, + "step": 6440 + }, + { + "epoch": 1.0513866231647635, + "grad_norm": 0.06231206655502319, + "learning_rate": 0.0005256117455138663, + "loss": 0.057, + "num_input_tokens_seen": 13942080, + "step": 6445 + }, + { + "epoch": 1.0522022838499185, + "grad_norm": 0.24639299511909485, + "learning_rate": 0.0005260195758564438, + "loss": 0.0866, + "num_input_tokens_seen": 13952736, + "step": 6450 + }, + { + "epoch": 1.0530179445350734, + "grad_norm": 0.34501662850379944, + "learning_rate": 0.0005264274061990211, + "loss": 0.1083, + "num_input_tokens_seen": 13962400, + "step": 6455 + }, + { + "epoch": 1.0538336052202284, + "grad_norm": 0.008477751165628433, + "learning_rate": 0.0005268352365415987, + "loss": 0.0404, + "num_input_tokens_seen": 13974304, + "step": 6460 + }, + { + "epoch": 1.0546492659053834, + "grad_norm": 0.1356765329837799, + "learning_rate": 0.0005272430668841762, + "loss": 0.0898, + "num_input_tokens_seen": 13985248, + "step": 6465 + }, + { + "epoch": 1.0554649265905383, + "grad_norm": 0.4822375178337097, + "learning_rate": 0.0005276508972267537, + "loss": 0.1247, + "num_input_tokens_seen": 13995840, + "step": 6470 + }, + { + "epoch": 1.0562805872756933, + "grad_norm": 0.020600905641913414, + "learning_rate": 0.0005280587275693311, + "loss": 0.053, + "num_input_tokens_seen": 14007104, + "step": 6475 + }, + { + "epoch": 1.0570962479608483, + "grad_norm": 0.14444732666015625, + "learning_rate": 0.0005284665579119086, + "loss": 0.0486, + "num_input_tokens_seen": 14018144, + "step": 6480 + }, + { + "epoch": 1.0579119086460032, + "grad_norm": 0.22547604143619537, + "learning_rate": 0.0005288743882544862, + "loss": 0.0931, + "num_input_tokens_seen": 14029024, + "step": 6485 + }, + { + "epoch": 1.0587275693311582, + "grad_norm": 0.016369134187698364, + "learning_rate": 0.0005292822185970636, + "loss": 0.1448, + "num_input_tokens_seen": 14040480, + "step": 6490 + }, + { + "epoch": 1.0595432300163132, + "grad_norm": 0.025439415127038956, + "learning_rate": 0.0005296900489396412, + "loss": 0.0068, + "num_input_tokens_seen": 14050976, + "step": 6495 + }, + { + "epoch": 1.0603588907014683, + "grad_norm": 0.1567530333995819, + "learning_rate": 0.0005300978792822186, + "loss": 0.0226, + "num_input_tokens_seen": 14060352, + "step": 6500 + }, + { + "epoch": 1.061174551386623, + "grad_norm": 1.5363329648971558, + "learning_rate": 0.000530505709624796, + "loss": 0.1478, + "num_input_tokens_seen": 14071680, + "step": 6505 + }, + { + "epoch": 1.0619902120717781, + "grad_norm": 0.30160683393478394, + "learning_rate": 0.0005309135399673736, + "loss": 0.0936, + "num_input_tokens_seen": 14083296, + "step": 6510 + }, + { + "epoch": 1.0628058727569332, + "grad_norm": 0.33012139797210693, + "learning_rate": 0.0005313213703099511, + "loss": 0.0697, + "num_input_tokens_seen": 14094752, + "step": 6515 + }, + { + "epoch": 1.0636215334420882, + "grad_norm": 0.5668163299560547, + "learning_rate": 0.0005317292006525287, + "loss": 0.2822, + "num_input_tokens_seen": 14105088, + "step": 6520 + }, + { + "epoch": 1.064437194127243, + "grad_norm": 0.5016757845878601, + "learning_rate": 0.000532137030995106, + "loss": 0.1197, + "num_input_tokens_seen": 14114976, + "step": 6525 + }, + { + "epoch": 1.065252854812398, + "grad_norm": 0.26769590377807617, + "learning_rate": 0.0005325448613376835, + "loss": 0.1469, + "num_input_tokens_seen": 14126592, + "step": 6530 + }, + { + "epoch": 1.066068515497553, + "grad_norm": 0.28698790073394775, + "learning_rate": 0.0005329526916802611, + "loss": 0.1116, + "num_input_tokens_seen": 14138048, + "step": 6535 + }, + { + "epoch": 1.066884176182708, + "grad_norm": 0.14941178262233734, + "learning_rate": 0.0005333605220228385, + "loss": 0.0827, + "num_input_tokens_seen": 14148128, + "step": 6540 + }, + { + "epoch": 1.067699836867863, + "grad_norm": 0.22471418976783752, + "learning_rate": 0.000533768352365416, + "loss": 0.0885, + "num_input_tokens_seen": 14160096, + "step": 6545 + }, + { + "epoch": 1.068515497553018, + "grad_norm": 0.0772034153342247, + "learning_rate": 0.0005341761827079935, + "loss": 0.0842, + "num_input_tokens_seen": 14170752, + "step": 6550 + }, + { + "epoch": 1.0693311582381728, + "grad_norm": 0.054675713181495667, + "learning_rate": 0.000534584013050571, + "loss": 0.2781, + "num_input_tokens_seen": 14181952, + "step": 6555 + }, + { + "epoch": 1.0701468189233279, + "grad_norm": 0.06377039849758148, + "learning_rate": 0.0005349918433931485, + "loss": 0.1335, + "num_input_tokens_seen": 14192384, + "step": 6560 + }, + { + "epoch": 1.070962479608483, + "grad_norm": 0.011692183092236519, + "learning_rate": 0.0005353996737357259, + "loss": 0.0241, + "num_input_tokens_seen": 14204384, + "step": 6565 + }, + { + "epoch": 1.071778140293638, + "grad_norm": 0.02358190529048443, + "learning_rate": 0.0005358075040783035, + "loss": 0.1529, + "num_input_tokens_seen": 14215776, + "step": 6570 + }, + { + "epoch": 1.0725938009787928, + "grad_norm": 0.07083281129598618, + "learning_rate": 0.0005362153344208809, + "loss": 0.1342, + "num_input_tokens_seen": 14227520, + "step": 6575 + }, + { + "epoch": 1.0734094616639478, + "grad_norm": 0.179110586643219, + "learning_rate": 0.0005366231647634584, + "loss": 0.1763, + "num_input_tokens_seen": 14238976, + "step": 6580 + }, + { + "epoch": 1.0742251223491028, + "grad_norm": 0.06617755442857742, + "learning_rate": 0.000537030995106036, + "loss": 0.0765, + "num_input_tokens_seen": 14249344, + "step": 6585 + }, + { + "epoch": 1.0750407830342577, + "grad_norm": 0.034860655665397644, + "learning_rate": 0.0005374388254486133, + "loss": 0.0396, + "num_input_tokens_seen": 14259840, + "step": 6590 + }, + { + "epoch": 1.0758564437194127, + "grad_norm": 0.06494531035423279, + "learning_rate": 0.0005378466557911908, + "loss": 0.1837, + "num_input_tokens_seen": 14271456, + "step": 6595 + }, + { + "epoch": 1.0766721044045677, + "grad_norm": 0.20017178356647491, + "learning_rate": 0.0005382544861337684, + "loss": 0.0931, + "num_input_tokens_seen": 14282688, + "step": 6600 + }, + { + "epoch": 1.0774877650897228, + "grad_norm": 0.430498331785202, + "learning_rate": 0.0005386623164763459, + "loss": 0.1031, + "num_input_tokens_seen": 14293920, + "step": 6605 + }, + { + "epoch": 1.0783034257748776, + "grad_norm": 0.04969804361462593, + "learning_rate": 0.0005390701468189233, + "loss": 0.0297, + "num_input_tokens_seen": 14304512, + "step": 6610 + }, + { + "epoch": 1.0791190864600326, + "grad_norm": 0.18835468590259552, + "learning_rate": 0.0005394779771615008, + "loss": 0.0643, + "num_input_tokens_seen": 14315616, + "step": 6615 + }, + { + "epoch": 1.0799347471451877, + "grad_norm": 0.15687093138694763, + "learning_rate": 0.0005398858075040783, + "loss": 0.0548, + "num_input_tokens_seen": 14326400, + "step": 6620 + }, + { + "epoch": 1.0807504078303425, + "grad_norm": 0.019069327041506767, + "learning_rate": 0.0005402936378466558, + "loss": 0.1271, + "num_input_tokens_seen": 14336288, + "step": 6625 + }, + { + "epoch": 1.0815660685154975, + "grad_norm": 0.10939546674489975, + "learning_rate": 0.0005407014681892332, + "loss": 0.0533, + "num_input_tokens_seen": 14347616, + "step": 6630 + }, + { + "epoch": 1.0823817292006526, + "grad_norm": 0.06843362748622894, + "learning_rate": 0.0005411092985318108, + "loss": 0.0982, + "num_input_tokens_seen": 14357664, + "step": 6635 + }, + { + "epoch": 1.0831973898858076, + "grad_norm": 0.21762220561504364, + "learning_rate": 0.0005415171288743883, + "loss": 0.2263, + "num_input_tokens_seen": 14367232, + "step": 6640 + }, + { + "epoch": 1.0840130505709624, + "grad_norm": 0.13196387887001038, + "learning_rate": 0.0005419249592169657, + "loss": 0.0884, + "num_input_tokens_seen": 14378048, + "step": 6645 + }, + { + "epoch": 1.0848287112561175, + "grad_norm": 0.07705891877412796, + "learning_rate": 0.0005423327895595433, + "loss": 0.1639, + "num_input_tokens_seen": 14389248, + "step": 6650 + }, + { + "epoch": 1.0856443719412725, + "grad_norm": 0.08115267753601074, + "learning_rate": 0.0005427406199021207, + "loss": 0.0974, + "num_input_tokens_seen": 14399616, + "step": 6655 + }, + { + "epoch": 1.0864600326264273, + "grad_norm": 0.27088385820388794, + "learning_rate": 0.0005431484502446982, + "loss": 0.2975, + "num_input_tokens_seen": 14412000, + "step": 6660 + }, + { + "epoch": 1.0872756933115824, + "grad_norm": 0.11545766890048981, + "learning_rate": 0.0005435562805872757, + "loss": 0.1676, + "num_input_tokens_seen": 14423392, + "step": 6665 + }, + { + "epoch": 1.0880913539967374, + "grad_norm": 0.12074887752532959, + "learning_rate": 0.0005439641109298532, + "loss": 0.2093, + "num_input_tokens_seen": 14433792, + "step": 6670 + }, + { + "epoch": 1.0889070146818924, + "grad_norm": 0.06263269484043121, + "learning_rate": 0.0005443719412724307, + "loss": 0.0876, + "num_input_tokens_seen": 14445152, + "step": 6675 + }, + { + "epoch": 1.0897226753670473, + "grad_norm": 0.07423686236143112, + "learning_rate": 0.0005447797716150081, + "loss": 0.1269, + "num_input_tokens_seen": 14455968, + "step": 6680 + }, + { + "epoch": 1.0905383360522023, + "grad_norm": 0.04535862058401108, + "learning_rate": 0.0005451876019575857, + "loss": 0.0389, + "num_input_tokens_seen": 14465824, + "step": 6685 + }, + { + "epoch": 1.0913539967373573, + "grad_norm": 0.5719911456108093, + "learning_rate": 0.0005455954323001632, + "loss": 0.2001, + "num_input_tokens_seen": 14476480, + "step": 6690 + }, + { + "epoch": 1.0921696574225122, + "grad_norm": 0.06406523287296295, + "learning_rate": 0.0005460032626427405, + "loss": 0.1601, + "num_input_tokens_seen": 14486144, + "step": 6695 + }, + { + "epoch": 1.0929853181076672, + "grad_norm": 0.21749967336654663, + "learning_rate": 0.0005464110929853181, + "loss": 0.2104, + "num_input_tokens_seen": 14497664, + "step": 6700 + }, + { + "epoch": 1.0938009787928222, + "grad_norm": 0.5275992155075073, + "learning_rate": 0.0005468189233278956, + "loss": 0.1364, + "num_input_tokens_seen": 14509888, + "step": 6705 + }, + { + "epoch": 1.094616639477977, + "grad_norm": 0.11150766164064407, + "learning_rate": 0.0005472267536704732, + "loss": 0.1324, + "num_input_tokens_seen": 14521664, + "step": 6710 + }, + { + "epoch": 1.095432300163132, + "grad_norm": 0.22698082029819489, + "learning_rate": 0.0005476345840130506, + "loss": 0.1436, + "num_input_tokens_seen": 14532768, + "step": 6715 + }, + { + "epoch": 1.0962479608482871, + "grad_norm": 0.12279754877090454, + "learning_rate": 0.000548042414355628, + "loss": 0.0779, + "num_input_tokens_seen": 14544160, + "step": 6720 + }, + { + "epoch": 1.0970636215334422, + "grad_norm": 0.10162794589996338, + "learning_rate": 0.0005484502446982056, + "loss": 0.083, + "num_input_tokens_seen": 14554112, + "step": 6725 + }, + { + "epoch": 1.097879282218597, + "grad_norm": 0.21879129111766815, + "learning_rate": 0.000548858075040783, + "loss": 0.1007, + "num_input_tokens_seen": 14564096, + "step": 6730 + }, + { + "epoch": 1.098694942903752, + "grad_norm": 0.01607028767466545, + "learning_rate": 0.0005492659053833605, + "loss": 0.1184, + "num_input_tokens_seen": 14574592, + "step": 6735 + }, + { + "epoch": 1.099510603588907, + "grad_norm": 0.009699334390461445, + "learning_rate": 0.000549673735725938, + "loss": 0.0235, + "num_input_tokens_seen": 14585728, + "step": 6740 + }, + { + "epoch": 1.100326264274062, + "grad_norm": 0.28271132707595825, + "learning_rate": 0.0005500815660685155, + "loss": 0.1423, + "num_input_tokens_seen": 14594464, + "step": 6745 + }, + { + "epoch": 1.101141924959217, + "grad_norm": 0.1730027049779892, + "learning_rate": 0.000550489396411093, + "loss": 0.1257, + "num_input_tokens_seen": 14604480, + "step": 6750 + }, + { + "epoch": 1.101957585644372, + "grad_norm": 0.3124956488609314, + "learning_rate": 0.0005508972267536705, + "loss": 0.0818, + "num_input_tokens_seen": 14616128, + "step": 6755 + }, + { + "epoch": 1.102773246329527, + "grad_norm": 0.46795615553855896, + "learning_rate": 0.000551305057096248, + "loss": 0.0905, + "num_input_tokens_seen": 14627008, + "step": 6760 + }, + { + "epoch": 1.1035889070146818, + "grad_norm": 0.06501724570989609, + "learning_rate": 0.0005517128874388254, + "loss": 0.1682, + "num_input_tokens_seen": 14637280, + "step": 6765 + }, + { + "epoch": 1.1044045676998369, + "grad_norm": 0.6208559274673462, + "learning_rate": 0.0005521207177814029, + "loss": 0.1652, + "num_input_tokens_seen": 14646976, + "step": 6770 + }, + { + "epoch": 1.105220228384992, + "grad_norm": 0.23027293384075165, + "learning_rate": 0.0005525285481239805, + "loss": 0.0889, + "num_input_tokens_seen": 14657152, + "step": 6775 + }, + { + "epoch": 1.1060358890701467, + "grad_norm": 0.07337773591279984, + "learning_rate": 0.000552936378466558, + "loss": 0.126, + "num_input_tokens_seen": 14667488, + "step": 6780 + }, + { + "epoch": 1.1068515497553018, + "grad_norm": 0.2906239330768585, + "learning_rate": 0.0005533442088091353, + "loss": 0.3041, + "num_input_tokens_seen": 14678112, + "step": 6785 + }, + { + "epoch": 1.1076672104404568, + "grad_norm": 0.0229903943836689, + "learning_rate": 0.0005537520391517129, + "loss": 0.0735, + "num_input_tokens_seen": 14689312, + "step": 6790 + }, + { + "epoch": 1.1084828711256118, + "grad_norm": 0.054861538112163544, + "learning_rate": 0.0005541598694942904, + "loss": 0.1642, + "num_input_tokens_seen": 14700032, + "step": 6795 + }, + { + "epoch": 1.1092985318107667, + "grad_norm": 0.16190479695796967, + "learning_rate": 0.0005545676998368679, + "loss": 0.1552, + "num_input_tokens_seen": 14711008, + "step": 6800 + }, + { + "epoch": 1.1101141924959217, + "grad_norm": 0.028706805780529976, + "learning_rate": 0.0005549755301794454, + "loss": 0.0586, + "num_input_tokens_seen": 14720448, + "step": 6805 + }, + { + "epoch": 1.1109298531810767, + "grad_norm": 0.09901495277881622, + "learning_rate": 0.0005553833605220228, + "loss": 0.095, + "num_input_tokens_seen": 14732128, + "step": 6810 + }, + { + "epoch": 1.1117455138662315, + "grad_norm": 0.27511176466941833, + "learning_rate": 0.0005557911908646003, + "loss": 0.0633, + "num_input_tokens_seen": 14743200, + "step": 6815 + }, + { + "epoch": 1.1125611745513866, + "grad_norm": 0.6085115075111389, + "learning_rate": 0.0005561990212071778, + "loss": 0.1689, + "num_input_tokens_seen": 14754464, + "step": 6820 + }, + { + "epoch": 1.1133768352365416, + "grad_norm": 0.25932618975639343, + "learning_rate": 0.0005566068515497554, + "loss": 0.1432, + "num_input_tokens_seen": 14763840, + "step": 6825 + }, + { + "epoch": 1.1141924959216967, + "grad_norm": 0.06148757040500641, + "learning_rate": 0.0005570146818923328, + "loss": 0.0978, + "num_input_tokens_seen": 14774304, + "step": 6830 + }, + { + "epoch": 1.1150081566068515, + "grad_norm": 0.0395883172750473, + "learning_rate": 0.0005574225122349102, + "loss": 0.0489, + "num_input_tokens_seen": 14783456, + "step": 6835 + }, + { + "epoch": 1.1158238172920065, + "grad_norm": 0.030602682381868362, + "learning_rate": 0.0005578303425774878, + "loss": 0.1257, + "num_input_tokens_seen": 14793664, + "step": 6840 + }, + { + "epoch": 1.1166394779771616, + "grad_norm": 0.07073304802179337, + "learning_rate": 0.0005582381729200653, + "loss": 0.0522, + "num_input_tokens_seen": 14804256, + "step": 6845 + }, + { + "epoch": 1.1174551386623164, + "grad_norm": 0.21918964385986328, + "learning_rate": 0.0005586460032626428, + "loss": 0.1968, + "num_input_tokens_seen": 14815744, + "step": 6850 + }, + { + "epoch": 1.1182707993474714, + "grad_norm": 0.07887905091047287, + "learning_rate": 0.0005590538336052202, + "loss": 0.0636, + "num_input_tokens_seen": 14826304, + "step": 6855 + }, + { + "epoch": 1.1190864600326265, + "grad_norm": 0.1146250069141388, + "learning_rate": 0.0005594616639477977, + "loss": 0.2652, + "num_input_tokens_seen": 14838016, + "step": 6860 + }, + { + "epoch": 1.1199021207177815, + "grad_norm": 0.10534577071666718, + "learning_rate": 0.0005598694942903753, + "loss": 0.0815, + "num_input_tokens_seen": 14849344, + "step": 6865 + }, + { + "epoch": 1.1207177814029363, + "grad_norm": 0.21944677829742432, + "learning_rate": 0.0005602773246329527, + "loss": 0.186, + "num_input_tokens_seen": 14860608, + "step": 6870 + }, + { + "epoch": 1.1215334420880914, + "grad_norm": 0.1881939023733139, + "learning_rate": 0.0005606851549755301, + "loss": 0.0595, + "num_input_tokens_seen": 14870976, + "step": 6875 + }, + { + "epoch": 1.1223491027732464, + "grad_norm": 0.19812922179698944, + "learning_rate": 0.0005610929853181077, + "loss": 0.0881, + "num_input_tokens_seen": 14880960, + "step": 6880 + }, + { + "epoch": 1.1231647634584012, + "grad_norm": 0.032493770122528076, + "learning_rate": 0.0005615008156606851, + "loss": 0.1206, + "num_input_tokens_seen": 14892256, + "step": 6885 + }, + { + "epoch": 1.1239804241435563, + "grad_norm": 0.216365784406662, + "learning_rate": 0.0005619086460032627, + "loss": 0.3036, + "num_input_tokens_seen": 14902304, + "step": 6890 + }, + { + "epoch": 1.1247960848287113, + "grad_norm": 0.07021628320217133, + "learning_rate": 0.0005623164763458401, + "loss": 0.1249, + "num_input_tokens_seen": 14913536, + "step": 6895 + }, + { + "epoch": 1.1256117455138663, + "grad_norm": 0.22371597588062286, + "learning_rate": 0.0005627243066884176, + "loss": 0.1678, + "num_input_tokens_seen": 14924480, + "step": 6900 + }, + { + "epoch": 1.1264274061990212, + "grad_norm": 0.21907971799373627, + "learning_rate": 0.0005631321370309951, + "loss": 0.1681, + "num_input_tokens_seen": 14935264, + "step": 6905 + }, + { + "epoch": 1.1272430668841762, + "grad_norm": 0.23060083389282227, + "learning_rate": 0.0005635399673735726, + "loss": 0.0892, + "num_input_tokens_seen": 14947072, + "step": 6910 + }, + { + "epoch": 1.1280587275693312, + "grad_norm": 0.15607233345508575, + "learning_rate": 0.0005639477977161502, + "loss": 0.1384, + "num_input_tokens_seen": 14957888, + "step": 6915 + }, + { + "epoch": 1.128874388254486, + "grad_norm": 0.12730056047439575, + "learning_rate": 0.0005643556280587275, + "loss": 0.1477, + "num_input_tokens_seen": 14967424, + "step": 6920 + }, + { + "epoch": 1.129690048939641, + "grad_norm": 0.388334184885025, + "learning_rate": 0.000564763458401305, + "loss": 0.128, + "num_input_tokens_seen": 14977536, + "step": 6925 + }, + { + "epoch": 1.1305057096247961, + "grad_norm": 0.058998655527830124, + "learning_rate": 0.0005651712887438826, + "loss": 0.0698, + "num_input_tokens_seen": 14987008, + "step": 6930 + }, + { + "epoch": 1.131321370309951, + "grad_norm": 0.031149208545684814, + "learning_rate": 0.0005655791190864601, + "loss": 0.2709, + "num_input_tokens_seen": 14998272, + "step": 6935 + }, + { + "epoch": 1.132137030995106, + "grad_norm": 0.4222646653652191, + "learning_rate": 0.0005659869494290375, + "loss": 0.3011, + "num_input_tokens_seen": 15007392, + "step": 6940 + }, + { + "epoch": 1.132952691680261, + "grad_norm": 0.25837603211402893, + "learning_rate": 0.000566394779771615, + "loss": 0.1895, + "num_input_tokens_seen": 15016608, + "step": 6945 + }, + { + "epoch": 1.133768352365416, + "grad_norm": 0.16666200757026672, + "learning_rate": 0.0005668026101141925, + "loss": 0.1543, + "num_input_tokens_seen": 15027712, + "step": 6950 + }, + { + "epoch": 1.1345840130505709, + "grad_norm": 0.07614574581384659, + "learning_rate": 0.00056721044045677, + "loss": 0.1288, + "num_input_tokens_seen": 15038080, + "step": 6955 + }, + { + "epoch": 1.135399673735726, + "grad_norm": 0.1177593395113945, + "learning_rate": 0.0005676182707993474, + "loss": 0.1641, + "num_input_tokens_seen": 15050432, + "step": 6960 + }, + { + "epoch": 1.136215334420881, + "grad_norm": 0.0698639526963234, + "learning_rate": 0.000568026101141925, + "loss": 0.1051, + "num_input_tokens_seen": 15060992, + "step": 6965 + }, + { + "epoch": 1.137030995106036, + "grad_norm": 0.17899353802204132, + "learning_rate": 0.0005684339314845025, + "loss": 0.0759, + "num_input_tokens_seen": 15070816, + "step": 6970 + }, + { + "epoch": 1.1378466557911908, + "grad_norm": 0.11010434478521347, + "learning_rate": 0.0005688417618270799, + "loss": 0.138, + "num_input_tokens_seen": 15080128, + "step": 6975 + }, + { + "epoch": 1.1386623164763459, + "grad_norm": 0.06893116235733032, + "learning_rate": 0.0005692495921696575, + "loss": 0.1004, + "num_input_tokens_seen": 15090464, + "step": 6980 + }, + { + "epoch": 1.139477977161501, + "grad_norm": 0.05863642320036888, + "learning_rate": 0.0005696574225122349, + "loss": 0.1242, + "num_input_tokens_seen": 15101472, + "step": 6985 + }, + { + "epoch": 1.1402936378466557, + "grad_norm": 0.22673600912094116, + "learning_rate": 0.0005700652528548124, + "loss": 0.1268, + "num_input_tokens_seen": 15111616, + "step": 6990 + }, + { + "epoch": 1.1411092985318108, + "grad_norm": 0.36856165528297424, + "learning_rate": 0.0005704730831973899, + "loss": 0.2203, + "num_input_tokens_seen": 15122496, + "step": 6995 + }, + { + "epoch": 1.1419249592169658, + "grad_norm": 0.04950616881251335, + "learning_rate": 0.0005708809135399674, + "loss": 0.163, + "num_input_tokens_seen": 15134112, + "step": 7000 + }, + { + "epoch": 1.1427406199021206, + "grad_norm": 0.27419692277908325, + "learning_rate": 0.000571288743882545, + "loss": 0.1202, + "num_input_tokens_seen": 15144352, + "step": 7005 + }, + { + "epoch": 1.1435562805872757, + "grad_norm": 0.04713694751262665, + "learning_rate": 0.0005716965742251223, + "loss": 0.0994, + "num_input_tokens_seen": 15154304, + "step": 7010 + }, + { + "epoch": 1.1443719412724307, + "grad_norm": 0.17500461637973785, + "learning_rate": 0.0005721044045676999, + "loss": 0.0972, + "num_input_tokens_seen": 15165024, + "step": 7015 + }, + { + "epoch": 1.1451876019575857, + "grad_norm": 0.183096244931221, + "learning_rate": 0.0005725122349102774, + "loss": 0.1087, + "num_input_tokens_seen": 15176224, + "step": 7020 + }, + { + "epoch": 1.1460032626427405, + "grad_norm": 0.07248888164758682, + "learning_rate": 0.0005729200652528548, + "loss": 0.1045, + "num_input_tokens_seen": 15186464, + "step": 7025 + }, + { + "epoch": 1.1468189233278956, + "grad_norm": 0.26332277059555054, + "learning_rate": 0.0005733278955954323, + "loss": 0.2417, + "num_input_tokens_seen": 15197408, + "step": 7030 + }, + { + "epoch": 1.1476345840130506, + "grad_norm": 0.22281448543071747, + "learning_rate": 0.0005737357259380098, + "loss": 0.076, + "num_input_tokens_seen": 15209504, + "step": 7035 + }, + { + "epoch": 1.1484502446982057, + "grad_norm": 0.1054937094449997, + "learning_rate": 0.0005741435562805873, + "loss": 0.2588, + "num_input_tokens_seen": 15220576, + "step": 7040 + }, + { + "epoch": 1.1492659053833605, + "grad_norm": 0.0714089497923851, + "learning_rate": 0.0005745513866231648, + "loss": 0.09, + "num_input_tokens_seen": 15230752, + "step": 7045 + }, + { + "epoch": 1.1500815660685155, + "grad_norm": 0.27331599593162537, + "learning_rate": 0.0005749592169657422, + "loss": 0.0814, + "num_input_tokens_seen": 15241312, + "step": 7050 + }, + { + "epoch": 1.1508972267536706, + "grad_norm": 0.10048350691795349, + "learning_rate": 0.0005753670473083198, + "loss": 0.1027, + "num_input_tokens_seen": 15252512, + "step": 7055 + }, + { + "epoch": 1.1517128874388254, + "grad_norm": 0.09194018691778183, + "learning_rate": 0.0005757748776508972, + "loss": 0.1055, + "num_input_tokens_seen": 15262400, + "step": 7060 + }, + { + "epoch": 1.1525285481239804, + "grad_norm": 0.35544732213020325, + "learning_rate": 0.0005761827079934747, + "loss": 0.186, + "num_input_tokens_seen": 15274816, + "step": 7065 + }, + { + "epoch": 1.1533442088091355, + "grad_norm": 0.07152602821588516, + "learning_rate": 0.0005765905383360523, + "loss": 0.1217, + "num_input_tokens_seen": 15285248, + "step": 7070 + }, + { + "epoch": 1.1541598694942903, + "grad_norm": 0.23456107079982758, + "learning_rate": 0.0005769983686786296, + "loss": 0.0474, + "num_input_tokens_seen": 15295552, + "step": 7075 + }, + { + "epoch": 1.1549755301794453, + "grad_norm": 0.054550401866436005, + "learning_rate": 0.0005774061990212072, + "loss": 0.0647, + "num_input_tokens_seen": 15305248, + "step": 7080 + }, + { + "epoch": 1.1557911908646004, + "grad_norm": 0.01704372465610504, + "learning_rate": 0.0005778140293637847, + "loss": 0.0558, + "num_input_tokens_seen": 15316576, + "step": 7085 + }, + { + "epoch": 1.1566068515497552, + "grad_norm": 0.016035085543990135, + "learning_rate": 0.0005782218597063622, + "loss": 0.1513, + "num_input_tokens_seen": 15328256, + "step": 7090 + }, + { + "epoch": 1.1574225122349102, + "grad_norm": 0.2782580554485321, + "learning_rate": 0.0005786296900489396, + "loss": 0.1633, + "num_input_tokens_seen": 15338752, + "step": 7095 + }, + { + "epoch": 1.1582381729200653, + "grad_norm": 0.02512599341571331, + "learning_rate": 0.0005790375203915171, + "loss": 0.0487, + "num_input_tokens_seen": 15348224, + "step": 7100 + }, + { + "epoch": 1.1590538336052203, + "grad_norm": 0.058114953339099884, + "learning_rate": 0.0005794453507340947, + "loss": 0.1043, + "num_input_tokens_seen": 15359008, + "step": 7105 + }, + { + "epoch": 1.1598694942903751, + "grad_norm": 0.02511642687022686, + "learning_rate": 0.0005798531810766721, + "loss": 0.078, + "num_input_tokens_seen": 15369408, + "step": 7110 + }, + { + "epoch": 1.1606851549755302, + "grad_norm": 0.19992566108703613, + "learning_rate": 0.0005802610114192495, + "loss": 0.0287, + "num_input_tokens_seen": 15379392, + "step": 7115 + }, + { + "epoch": 1.1615008156606852, + "grad_norm": 0.06997409462928772, + "learning_rate": 0.0005806688417618271, + "loss": 0.0817, + "num_input_tokens_seen": 15389920, + "step": 7120 + }, + { + "epoch": 1.1623164763458402, + "grad_norm": 0.03639506548643112, + "learning_rate": 0.0005810766721044046, + "loss": 0.1004, + "num_input_tokens_seen": 15400896, + "step": 7125 + }, + { + "epoch": 1.163132137030995, + "grad_norm": 0.04965808987617493, + "learning_rate": 0.0005814845024469821, + "loss": 0.0616, + "num_input_tokens_seen": 15412320, + "step": 7130 + }, + { + "epoch": 1.16394779771615, + "grad_norm": 0.018771076574921608, + "learning_rate": 0.0005818923327895596, + "loss": 0.1234, + "num_input_tokens_seen": 15423680, + "step": 7135 + }, + { + "epoch": 1.1647634584013051, + "grad_norm": 0.037603430449962616, + "learning_rate": 0.000582300163132137, + "loss": 0.0967, + "num_input_tokens_seen": 15433952, + "step": 7140 + }, + { + "epoch": 1.16557911908646, + "grad_norm": 0.1583869308233261, + "learning_rate": 0.0005827079934747145, + "loss": 0.1022, + "num_input_tokens_seen": 15444896, + "step": 7145 + }, + { + "epoch": 1.166394779771615, + "grad_norm": 0.06455934792757034, + "learning_rate": 0.000583115823817292, + "loss": 0.0893, + "num_input_tokens_seen": 15456192, + "step": 7150 + }, + { + "epoch": 1.16721044045677, + "grad_norm": 0.5529431104660034, + "learning_rate": 0.0005835236541598696, + "loss": 0.2945, + "num_input_tokens_seen": 15465312, + "step": 7155 + }, + { + "epoch": 1.1680261011419248, + "grad_norm": 0.011243646964430809, + "learning_rate": 0.000583931484502447, + "loss": 0.1078, + "num_input_tokens_seen": 15474496, + "step": 7160 + }, + { + "epoch": 1.1688417618270799, + "grad_norm": 0.4211251437664032, + "learning_rate": 0.0005843393148450244, + "loss": 0.1601, + "num_input_tokens_seen": 15484992, + "step": 7165 + }, + { + "epoch": 1.169657422512235, + "grad_norm": 0.030863694846630096, + "learning_rate": 0.000584747145187602, + "loss": 0.0479, + "num_input_tokens_seen": 15495232, + "step": 7170 + }, + { + "epoch": 1.17047308319739, + "grad_norm": 0.05095810070633888, + "learning_rate": 0.0005851549755301795, + "loss": 0.1482, + "num_input_tokens_seen": 15506304, + "step": 7175 + }, + { + "epoch": 1.1712887438825448, + "grad_norm": 0.06538262963294983, + "learning_rate": 0.0005855628058727568, + "loss": 0.1386, + "num_input_tokens_seen": 15517248, + "step": 7180 + }, + { + "epoch": 1.1721044045676998, + "grad_norm": 0.016057291999459267, + "learning_rate": 0.0005859706362153344, + "loss": 0.066, + "num_input_tokens_seen": 15527040, + "step": 7185 + }, + { + "epoch": 1.1729200652528549, + "grad_norm": 0.24081219732761383, + "learning_rate": 0.0005863784665579119, + "loss": 0.2164, + "num_input_tokens_seen": 15538368, + "step": 7190 + }, + { + "epoch": 1.17373572593801, + "grad_norm": 0.09472464770078659, + "learning_rate": 0.0005867862969004895, + "loss": 0.0432, + "num_input_tokens_seen": 15548608, + "step": 7195 + }, + { + "epoch": 1.1745513866231647, + "grad_norm": 0.07453358173370361, + "learning_rate": 0.0005871941272430669, + "loss": 0.1506, + "num_input_tokens_seen": 15560032, + "step": 7200 + }, + { + "epoch": 1.1753670473083198, + "grad_norm": 0.18550845980644226, + "learning_rate": 0.0005876019575856443, + "loss": 0.1615, + "num_input_tokens_seen": 15571616, + "step": 7205 + }, + { + "epoch": 1.1761827079934748, + "grad_norm": 0.03769610449671745, + "learning_rate": 0.0005880097879282219, + "loss": 0.122, + "num_input_tokens_seen": 15583392, + "step": 7210 + }, + { + "epoch": 1.1769983686786296, + "grad_norm": 0.03199080750346184, + "learning_rate": 0.0005884176182707993, + "loss": 0.1366, + "num_input_tokens_seen": 15593504, + "step": 7215 + }, + { + "epoch": 1.1778140293637847, + "grad_norm": 0.22953973710536957, + "learning_rate": 0.0005888254486133769, + "loss": 0.1067, + "num_input_tokens_seen": 15604928, + "step": 7220 + }, + { + "epoch": 1.1786296900489397, + "grad_norm": 0.3515639305114746, + "learning_rate": 0.0005892332789559544, + "loss": 0.1873, + "num_input_tokens_seen": 15615200, + "step": 7225 + }, + { + "epoch": 1.1794453507340945, + "grad_norm": 0.2525303065776825, + "learning_rate": 0.0005896411092985318, + "loss": 0.0812, + "num_input_tokens_seen": 15626176, + "step": 7230 + }, + { + "epoch": 1.1802610114192496, + "grad_norm": 0.19057056307792664, + "learning_rate": 0.0005900489396411093, + "loss": 0.1706, + "num_input_tokens_seen": 15636768, + "step": 7235 + }, + { + "epoch": 1.1810766721044046, + "grad_norm": 0.10746180266141891, + "learning_rate": 0.0005904567699836868, + "loss": 0.1832, + "num_input_tokens_seen": 15648512, + "step": 7240 + }, + { + "epoch": 1.1818923327895596, + "grad_norm": 0.21963466703891754, + "learning_rate": 0.0005908646003262644, + "loss": 0.1024, + "num_input_tokens_seen": 15659328, + "step": 7245 + }, + { + "epoch": 1.1827079934747144, + "grad_norm": 0.10227061808109283, + "learning_rate": 0.0005912724306688417, + "loss": 0.0654, + "num_input_tokens_seen": 15669920, + "step": 7250 + }, + { + "epoch": 1.1835236541598695, + "grad_norm": 0.31351569294929504, + "learning_rate": 0.0005916802610114192, + "loss": 0.1336, + "num_input_tokens_seen": 15681152, + "step": 7255 + }, + { + "epoch": 1.1843393148450245, + "grad_norm": 0.25523555278778076, + "learning_rate": 0.0005920880913539968, + "loss": 0.1183, + "num_input_tokens_seen": 15690912, + "step": 7260 + }, + { + "epoch": 1.1851549755301796, + "grad_norm": 0.3154444694519043, + "learning_rate": 0.0005924959216965743, + "loss": 0.1102, + "num_input_tokens_seen": 15701216, + "step": 7265 + }, + { + "epoch": 1.1859706362153344, + "grad_norm": 0.15853968262672424, + "learning_rate": 0.0005929037520391517, + "loss": 0.1668, + "num_input_tokens_seen": 15712000, + "step": 7270 + }, + { + "epoch": 1.1867862969004894, + "grad_norm": 0.2364727407693863, + "learning_rate": 0.0005933115823817292, + "loss": 0.1852, + "num_input_tokens_seen": 15722848, + "step": 7275 + }, + { + "epoch": 1.1876019575856445, + "grad_norm": 0.10909298062324524, + "learning_rate": 0.0005937194127243067, + "loss": 0.2011, + "num_input_tokens_seen": 15734816, + "step": 7280 + }, + { + "epoch": 1.1884176182707993, + "grad_norm": 0.07953449338674545, + "learning_rate": 0.0005941272430668842, + "loss": 0.1326, + "num_input_tokens_seen": 15745664, + "step": 7285 + }, + { + "epoch": 1.1892332789559543, + "grad_norm": 0.4206564128398895, + "learning_rate": 0.0005945350734094617, + "loss": 0.1332, + "num_input_tokens_seen": 15756896, + "step": 7290 + }, + { + "epoch": 1.1900489396411094, + "grad_norm": 0.17394089698791504, + "learning_rate": 0.0005949429037520392, + "loss": 0.193, + "num_input_tokens_seen": 15766656, + "step": 7295 + }, + { + "epoch": 1.1908646003262642, + "grad_norm": 0.35734906792640686, + "learning_rate": 0.0005953507340946166, + "loss": 0.1472, + "num_input_tokens_seen": 15777792, + "step": 7300 + }, + { + "epoch": 1.1916802610114192, + "grad_norm": 0.267839640378952, + "learning_rate": 0.0005957585644371941, + "loss": 0.2118, + "num_input_tokens_seen": 15787968, + "step": 7305 + }, + { + "epoch": 1.1924959216965743, + "grad_norm": 0.23547734320163727, + "learning_rate": 0.0005961663947797717, + "loss": 0.2665, + "num_input_tokens_seen": 15797920, + "step": 7310 + }, + { + "epoch": 1.1933115823817293, + "grad_norm": 0.10362395644187927, + "learning_rate": 0.0005965742251223491, + "loss": 0.2258, + "num_input_tokens_seen": 15809376, + "step": 7315 + }, + { + "epoch": 1.1941272430668841, + "grad_norm": 0.3007364869117737, + "learning_rate": 0.0005969820554649265, + "loss": 0.1096, + "num_input_tokens_seen": 15821408, + "step": 7320 + }, + { + "epoch": 1.1949429037520392, + "grad_norm": 0.10045509040355682, + "learning_rate": 0.0005973898858075041, + "loss": 0.1064, + "num_input_tokens_seen": 15830816, + "step": 7325 + }, + { + "epoch": 1.1957585644371942, + "grad_norm": 0.05680025368928909, + "learning_rate": 0.0005977977161500816, + "loss": 0.0522, + "num_input_tokens_seen": 15841792, + "step": 7330 + }, + { + "epoch": 1.196574225122349, + "grad_norm": 0.05178492143750191, + "learning_rate": 0.000598205546492659, + "loss": 0.211, + "num_input_tokens_seen": 15852768, + "step": 7335 + }, + { + "epoch": 1.197389885807504, + "grad_norm": 0.014478541910648346, + "learning_rate": 0.0005986133768352365, + "loss": 0.0501, + "num_input_tokens_seen": 15863680, + "step": 7340 + }, + { + "epoch": 1.198205546492659, + "grad_norm": 0.37951260805130005, + "learning_rate": 0.000599021207177814, + "loss": 0.2018, + "num_input_tokens_seen": 15874976, + "step": 7345 + }, + { + "epoch": 1.1990212071778141, + "grad_norm": 0.008188205771148205, + "learning_rate": 0.0005994290375203916, + "loss": 0.0303, + "num_input_tokens_seen": 15885408, + "step": 7350 + }, + { + "epoch": 1.199836867862969, + "grad_norm": 0.31182661652565, + "learning_rate": 0.000599836867862969, + "loss": 0.2097, + "num_input_tokens_seen": 15895168, + "step": 7355 + }, + { + "epoch": 1.200652528548124, + "grad_norm": 0.21963611245155334, + "learning_rate": 0.0006002446982055465, + "loss": 0.1539, + "num_input_tokens_seen": 15906240, + "step": 7360 + }, + { + "epoch": 1.201468189233279, + "grad_norm": 0.22818638384342194, + "learning_rate": 0.000600652528548124, + "loss": 0.14, + "num_input_tokens_seen": 15917024, + "step": 7365 + }, + { + "epoch": 1.2022838499184338, + "grad_norm": 0.5327635407447815, + "learning_rate": 0.0006010603588907014, + "loss": 0.3148, + "num_input_tokens_seen": 15926496, + "step": 7370 + }, + { + "epoch": 1.2030995106035889, + "grad_norm": 0.10238040238618851, + "learning_rate": 0.000601468189233279, + "loss": 0.1251, + "num_input_tokens_seen": 15937600, + "step": 7375 + }, + { + "epoch": 1.203915171288744, + "grad_norm": 0.26709550619125366, + "learning_rate": 0.0006018760195758564, + "loss": 0.1398, + "num_input_tokens_seen": 15948064, + "step": 7380 + }, + { + "epoch": 1.2047308319738987, + "grad_norm": 0.05076899006962776, + "learning_rate": 0.000602283849918434, + "loss": 0.0907, + "num_input_tokens_seen": 15958752, + "step": 7385 + }, + { + "epoch": 1.2055464926590538, + "grad_norm": 0.31821298599243164, + "learning_rate": 0.0006026916802610114, + "loss": 0.1874, + "num_input_tokens_seen": 15970080, + "step": 7390 + }, + { + "epoch": 1.2063621533442088, + "grad_norm": 0.15782451629638672, + "learning_rate": 0.0006030995106035889, + "loss": 0.0796, + "num_input_tokens_seen": 15982080, + "step": 7395 + }, + { + "epoch": 1.2071778140293639, + "grad_norm": 0.1060723289847374, + "learning_rate": 0.0006035073409461665, + "loss": 0.0633, + "num_input_tokens_seen": 15993056, + "step": 7400 + }, + { + "epoch": 1.2079934747145187, + "grad_norm": 0.08098359405994415, + "learning_rate": 0.0006039151712887438, + "loss": 0.1073, + "num_input_tokens_seen": 16003584, + "step": 7405 + }, + { + "epoch": 1.2088091353996737, + "grad_norm": 0.22977346181869507, + "learning_rate": 0.0006043230016313214, + "loss": 0.1374, + "num_input_tokens_seen": 16014912, + "step": 7410 + }, + { + "epoch": 1.2096247960848288, + "grad_norm": 0.1667831838130951, + "learning_rate": 0.0006047308319738989, + "loss": 0.0992, + "num_input_tokens_seen": 16026304, + "step": 7415 + }, + { + "epoch": 1.2104404567699838, + "grad_norm": 0.045863885432481766, + "learning_rate": 0.0006051386623164764, + "loss": 0.089, + "num_input_tokens_seen": 16038464, + "step": 7420 + }, + { + "epoch": 1.2112561174551386, + "grad_norm": 0.477449506521225, + "learning_rate": 0.0006055464926590538, + "loss": 0.1467, + "num_input_tokens_seen": 16049728, + "step": 7425 + }, + { + "epoch": 1.2120717781402937, + "grad_norm": 0.43670424818992615, + "learning_rate": 0.0006059543230016313, + "loss": 0.1687, + "num_input_tokens_seen": 16060032, + "step": 7430 + }, + { + "epoch": 1.2128874388254487, + "grad_norm": 0.738351047039032, + "learning_rate": 0.0006063621533442089, + "loss": 0.1123, + "num_input_tokens_seen": 16072064, + "step": 7435 + }, + { + "epoch": 1.2137030995106035, + "grad_norm": 0.23298379778862, + "learning_rate": 0.0006067699836867863, + "loss": 0.0822, + "num_input_tokens_seen": 16082112, + "step": 7440 + }, + { + "epoch": 1.2145187601957586, + "grad_norm": 0.05664366856217384, + "learning_rate": 0.0006071778140293637, + "loss": 0.1621, + "num_input_tokens_seen": 16093280, + "step": 7445 + }, + { + "epoch": 1.2153344208809136, + "grad_norm": 0.3159855306148529, + "learning_rate": 0.0006075856443719413, + "loss": 0.1636, + "num_input_tokens_seen": 16104352, + "step": 7450 + }, + { + "epoch": 1.2161500815660684, + "grad_norm": 0.0692550390958786, + "learning_rate": 0.0006079934747145188, + "loss": 0.1286, + "num_input_tokens_seen": 16114784, + "step": 7455 + }, + { + "epoch": 1.2169657422512234, + "grad_norm": 0.14434915781021118, + "learning_rate": 0.0006084013050570962, + "loss": 0.0743, + "num_input_tokens_seen": 16126080, + "step": 7460 + }, + { + "epoch": 1.2177814029363785, + "grad_norm": 0.07492110133171082, + "learning_rate": 0.0006088091353996738, + "loss": 0.2133, + "num_input_tokens_seen": 16136480, + "step": 7465 + }, + { + "epoch": 1.2185970636215335, + "grad_norm": 0.23890624940395355, + "learning_rate": 0.0006092169657422512, + "loss": 0.2887, + "num_input_tokens_seen": 16147424, + "step": 7470 + }, + { + "epoch": 1.2194127243066883, + "grad_norm": 0.12416541576385498, + "learning_rate": 0.0006096247960848287, + "loss": 0.2805, + "num_input_tokens_seen": 16157280, + "step": 7475 + }, + { + "epoch": 1.2202283849918434, + "grad_norm": 0.17372988164424896, + "learning_rate": 0.0006100326264274062, + "loss": 0.1071, + "num_input_tokens_seen": 16168320, + "step": 7480 + }, + { + "epoch": 1.2210440456769984, + "grad_norm": 0.07400710880756378, + "learning_rate": 0.0006104404567699837, + "loss": 0.0821, + "num_input_tokens_seen": 16178976, + "step": 7485 + }, + { + "epoch": 1.2218597063621535, + "grad_norm": 0.07692970335483551, + "learning_rate": 0.0006108482871125613, + "loss": 0.1337, + "num_input_tokens_seen": 16190208, + "step": 7490 + }, + { + "epoch": 1.2226753670473083, + "grad_norm": 0.09909338504076004, + "learning_rate": 0.0006112561174551386, + "loss": 0.1749, + "num_input_tokens_seen": 16199488, + "step": 7495 + }, + { + "epoch": 1.2234910277324633, + "grad_norm": 0.032885245978832245, + "learning_rate": 0.0006116639477977162, + "loss": 0.1782, + "num_input_tokens_seen": 16210432, + "step": 7500 + }, + { + "epoch": 1.2243066884176184, + "grad_norm": 0.1396157145500183, + "learning_rate": 0.0006120717781402937, + "loss": 0.1037, + "num_input_tokens_seen": 16221664, + "step": 7505 + }, + { + "epoch": 1.2251223491027732, + "grad_norm": 0.048744648694992065, + "learning_rate": 0.000612479608482871, + "loss": 0.1597, + "num_input_tokens_seen": 16232064, + "step": 7510 + }, + { + "epoch": 1.2259380097879282, + "grad_norm": 0.023725448176264763, + "learning_rate": 0.0006128874388254486, + "loss": 0.116, + "num_input_tokens_seen": 16242976, + "step": 7515 + }, + { + "epoch": 1.2267536704730833, + "grad_norm": 0.01951914280653, + "learning_rate": 0.0006132952691680261, + "loss": 0.1875, + "num_input_tokens_seen": 16253440, + "step": 7520 + }, + { + "epoch": 1.227569331158238, + "grad_norm": 0.1583794206380844, + "learning_rate": 0.0006137030995106036, + "loss": 0.1235, + "num_input_tokens_seen": 16263680, + "step": 7525 + }, + { + "epoch": 1.2283849918433931, + "grad_norm": 0.0603773407638073, + "learning_rate": 0.0006141109298531811, + "loss": 0.1633, + "num_input_tokens_seen": 16274944, + "step": 7530 + }, + { + "epoch": 1.2292006525285482, + "grad_norm": 0.17134246230125427, + "learning_rate": 0.0006145187601957585, + "loss": 0.1221, + "num_input_tokens_seen": 16285376, + "step": 7535 + }, + { + "epoch": 1.2300163132137032, + "grad_norm": 0.19265292584896088, + "learning_rate": 0.0006149265905383361, + "loss": 0.0939, + "num_input_tokens_seen": 16296544, + "step": 7540 + }, + { + "epoch": 1.230831973898858, + "grad_norm": 0.07948993146419525, + "learning_rate": 0.0006153344208809135, + "loss": 0.1278, + "num_input_tokens_seen": 16306752, + "step": 7545 + }, + { + "epoch": 1.231647634584013, + "grad_norm": 0.5413017272949219, + "learning_rate": 0.0006157422512234911, + "loss": 0.1777, + "num_input_tokens_seen": 16317536, + "step": 7550 + }, + { + "epoch": 1.232463295269168, + "grad_norm": 0.5602454543113708, + "learning_rate": 0.0006161500815660686, + "loss": 0.1497, + "num_input_tokens_seen": 16328640, + "step": 7555 + }, + { + "epoch": 1.233278955954323, + "grad_norm": 0.24598528444766998, + "learning_rate": 0.0006165579119086459, + "loss": 0.2734, + "num_input_tokens_seen": 16340032, + "step": 7560 + }, + { + "epoch": 1.234094616639478, + "grad_norm": 0.13220785558223724, + "learning_rate": 0.0006169657422512235, + "loss": 0.1357, + "num_input_tokens_seen": 16351328, + "step": 7565 + }, + { + "epoch": 1.234910277324633, + "grad_norm": 0.1043863594532013, + "learning_rate": 0.000617373572593801, + "loss": 0.1883, + "num_input_tokens_seen": 16360832, + "step": 7570 + }, + { + "epoch": 1.235725938009788, + "grad_norm": 0.10327058285474777, + "learning_rate": 0.0006177814029363786, + "loss": 0.0639, + "num_input_tokens_seen": 16372640, + "step": 7575 + }, + { + "epoch": 1.2365415986949428, + "grad_norm": 0.051614295691251755, + "learning_rate": 0.0006181892332789559, + "loss": 0.0593, + "num_input_tokens_seen": 16383360, + "step": 7580 + }, + { + "epoch": 1.2373572593800979, + "grad_norm": 0.15088234841823578, + "learning_rate": 0.0006185970636215334, + "loss": 0.182, + "num_input_tokens_seen": 16393728, + "step": 7585 + }, + { + "epoch": 1.238172920065253, + "grad_norm": 0.36186710000038147, + "learning_rate": 0.000619004893964111, + "loss": 0.1953, + "num_input_tokens_seen": 16405440, + "step": 7590 + }, + { + "epoch": 1.2389885807504077, + "grad_norm": 0.16884486377239227, + "learning_rate": 0.0006194127243066884, + "loss": 0.2185, + "num_input_tokens_seen": 16416672, + "step": 7595 + }, + { + "epoch": 1.2398042414355628, + "grad_norm": 0.17424054443836212, + "learning_rate": 0.000619820554649266, + "loss": 0.1332, + "num_input_tokens_seen": 16427136, + "step": 7600 + }, + { + "epoch": 1.2406199021207178, + "grad_norm": 0.20878785848617554, + "learning_rate": 0.0006202283849918434, + "loss": 0.1257, + "num_input_tokens_seen": 16437984, + "step": 7605 + }, + { + "epoch": 1.2414355628058726, + "grad_norm": 0.095785953104496, + "learning_rate": 0.0006206362153344209, + "loss": 0.1579, + "num_input_tokens_seen": 16448640, + "step": 7610 + }, + { + "epoch": 1.2422512234910277, + "grad_norm": 0.3330758213996887, + "learning_rate": 0.0006210440456769984, + "loss": 0.2479, + "num_input_tokens_seen": 16457856, + "step": 7615 + }, + { + "epoch": 1.2430668841761827, + "grad_norm": 0.053061868995428085, + "learning_rate": 0.0006214518760195759, + "loss": 0.1237, + "num_input_tokens_seen": 16469056, + "step": 7620 + }, + { + "epoch": 1.2438825448613378, + "grad_norm": 0.07327862828969955, + "learning_rate": 0.0006218597063621533, + "loss": 0.1128, + "num_input_tokens_seen": 16479616, + "step": 7625 + }, + { + "epoch": 1.2446982055464926, + "grad_norm": 0.048818189650774, + "learning_rate": 0.0006222675367047308, + "loss": 0.0584, + "num_input_tokens_seen": 16489952, + "step": 7630 + }, + { + "epoch": 1.2455138662316476, + "grad_norm": 0.07355426996946335, + "learning_rate": 0.0006226753670473083, + "loss": 0.0629, + "num_input_tokens_seen": 16501152, + "step": 7635 + }, + { + "epoch": 1.2463295269168027, + "grad_norm": 0.06048765033483505, + "learning_rate": 0.0006230831973898859, + "loss": 0.0269, + "num_input_tokens_seen": 16512608, + "step": 7640 + }, + { + "epoch": 1.2471451876019577, + "grad_norm": 0.11570721864700317, + "learning_rate": 0.0006234910277324634, + "loss": 0.06, + "num_input_tokens_seen": 16524064, + "step": 7645 + }, + { + "epoch": 1.2479608482871125, + "grad_norm": 0.15921324491500854, + "learning_rate": 0.0006238988580750407, + "loss": 0.104, + "num_input_tokens_seen": 16534688, + "step": 7650 + }, + { + "epoch": 1.2487765089722676, + "grad_norm": 0.23092800378799438, + "learning_rate": 0.0006243066884176183, + "loss": 0.0591, + "num_input_tokens_seen": 16544928, + "step": 7655 + }, + { + "epoch": 1.2495921696574226, + "grad_norm": 0.6252056956291199, + "learning_rate": 0.0006247145187601958, + "loss": 0.3248, + "num_input_tokens_seen": 16555200, + "step": 7660 + }, + { + "epoch": 1.2504078303425774, + "grad_norm": 0.3867167532444, + "learning_rate": 0.0006251223491027733, + "loss": 0.3274, + "num_input_tokens_seen": 16566304, + "step": 7665 + }, + { + "epoch": 1.2512234910277324, + "grad_norm": 0.21520960330963135, + "learning_rate": 0.0006255301794453507, + "loss": 0.2226, + "num_input_tokens_seen": 16578560, + "step": 7670 + }, + { + "epoch": 1.2520391517128875, + "grad_norm": 0.08306780457496643, + "learning_rate": 0.0006259380097879282, + "loss": 0.1575, + "num_input_tokens_seen": 16588704, + "step": 7675 + }, + { + "epoch": 1.2528548123980423, + "grad_norm": 0.08445748686790466, + "learning_rate": 0.0006263458401305058, + "loss": 0.0928, + "num_input_tokens_seen": 16599776, + "step": 7680 + }, + { + "epoch": 1.2536704730831973, + "grad_norm": 0.028339169919490814, + "learning_rate": 0.0006267536704730832, + "loss": 0.0913, + "num_input_tokens_seen": 16609824, + "step": 7685 + }, + { + "epoch": 1.2544861337683524, + "grad_norm": 0.2406913787126541, + "learning_rate": 0.0006271615008156607, + "loss": 0.2018, + "num_input_tokens_seen": 16621280, + "step": 7690 + }, + { + "epoch": 1.2553017944535072, + "grad_norm": 0.13106633722782135, + "learning_rate": 0.0006275693311582382, + "loss": 0.0989, + "num_input_tokens_seen": 16631776, + "step": 7695 + }, + { + "epoch": 1.2561174551386622, + "grad_norm": 0.332002729177475, + "learning_rate": 0.0006279771615008156, + "loss": 0.1123, + "num_input_tokens_seen": 16643584, + "step": 7700 + }, + { + "epoch": 1.2569331158238173, + "grad_norm": 0.049703653901815414, + "learning_rate": 0.0006283849918433932, + "loss": 0.0968, + "num_input_tokens_seen": 16654432, + "step": 7705 + }, + { + "epoch": 1.2577487765089723, + "grad_norm": 0.031025424599647522, + "learning_rate": 0.0006287928221859707, + "loss": 0.128, + "num_input_tokens_seen": 16664064, + "step": 7710 + }, + { + "epoch": 1.2585644371941274, + "grad_norm": 0.12425588071346283, + "learning_rate": 0.0006292006525285482, + "loss": 0.1324, + "num_input_tokens_seen": 16674432, + "step": 7715 + }, + { + "epoch": 1.2593800978792822, + "grad_norm": 0.16951139271259308, + "learning_rate": 0.0006296084828711256, + "loss": 0.111, + "num_input_tokens_seen": 16685920, + "step": 7720 + }, + { + "epoch": 1.2601957585644372, + "grad_norm": 0.16042248904705048, + "learning_rate": 0.0006300163132137031, + "loss": 0.1113, + "num_input_tokens_seen": 16696320, + "step": 7725 + }, + { + "epoch": 1.2610114192495923, + "grad_norm": 0.03045455366373062, + "learning_rate": 0.0006304241435562807, + "loss": 0.0681, + "num_input_tokens_seen": 16707040, + "step": 7730 + }, + { + "epoch": 1.261827079934747, + "grad_norm": 0.26645559072494507, + "learning_rate": 0.000630831973898858, + "loss": 0.2319, + "num_input_tokens_seen": 16718208, + "step": 7735 + }, + { + "epoch": 1.2626427406199021, + "grad_norm": 0.20267876982688904, + "learning_rate": 0.0006312398042414356, + "loss": 0.0403, + "num_input_tokens_seen": 16728480, + "step": 7740 + }, + { + "epoch": 1.2634584013050572, + "grad_norm": 0.02899232506752014, + "learning_rate": 0.0006316476345840131, + "loss": 0.0844, + "num_input_tokens_seen": 16740384, + "step": 7745 + }, + { + "epoch": 1.264274061990212, + "grad_norm": 0.1729649305343628, + "learning_rate": 0.0006320554649265906, + "loss": 0.1533, + "num_input_tokens_seen": 16749888, + "step": 7750 + }, + { + "epoch": 1.265089722675367, + "grad_norm": 0.1135585680603981, + "learning_rate": 0.000632463295269168, + "loss": 0.1495, + "num_input_tokens_seen": 16761024, + "step": 7755 + }, + { + "epoch": 1.265905383360522, + "grad_norm": 0.04681342467665672, + "learning_rate": 0.0006328711256117455, + "loss": 0.0456, + "num_input_tokens_seen": 16771296, + "step": 7760 + }, + { + "epoch": 1.2667210440456769, + "grad_norm": 0.2058001309633255, + "learning_rate": 0.000633278955954323, + "loss": 0.0974, + "num_input_tokens_seen": 16781984, + "step": 7765 + }, + { + "epoch": 1.267536704730832, + "grad_norm": 0.3027314841747284, + "learning_rate": 0.0006336867862969005, + "loss": 0.0931, + "num_input_tokens_seen": 16793056, + "step": 7770 + }, + { + "epoch": 1.268352365415987, + "grad_norm": 0.06296977400779724, + "learning_rate": 0.000634094616639478, + "loss": 0.1434, + "num_input_tokens_seen": 16805568, + "step": 7775 + }, + { + "epoch": 1.269168026101142, + "grad_norm": 0.25227290391921997, + "learning_rate": 0.0006345024469820555, + "loss": 0.0815, + "num_input_tokens_seen": 16815808, + "step": 7780 + }, + { + "epoch": 1.269983686786297, + "grad_norm": 0.02726042829453945, + "learning_rate": 0.0006349102773246329, + "loss": 0.1643, + "num_input_tokens_seen": 16826272, + "step": 7785 + }, + { + "epoch": 1.2707993474714518, + "grad_norm": 0.20239824056625366, + "learning_rate": 0.0006353181076672104, + "loss": 0.1483, + "num_input_tokens_seen": 16836576, + "step": 7790 + }, + { + "epoch": 1.2716150081566069, + "grad_norm": 0.13095703721046448, + "learning_rate": 0.000635725938009788, + "loss": 0.07, + "num_input_tokens_seen": 16846720, + "step": 7795 + }, + { + "epoch": 1.272430668841762, + "grad_norm": 0.05034344643354416, + "learning_rate": 0.0006361337683523654, + "loss": 0.0495, + "num_input_tokens_seen": 16856960, + "step": 7800 + }, + { + "epoch": 1.2732463295269167, + "grad_norm": 0.3856923282146454, + "learning_rate": 0.0006365415986949429, + "loss": 0.1942, + "num_input_tokens_seen": 16868512, + "step": 7805 + }, + { + "epoch": 1.2740619902120718, + "grad_norm": 0.01495252177119255, + "learning_rate": 0.0006369494290375204, + "loss": 0.0313, + "num_input_tokens_seen": 16879744, + "step": 7810 + }, + { + "epoch": 1.2748776508972268, + "grad_norm": 0.021752089262008667, + "learning_rate": 0.0006373572593800979, + "loss": 0.0828, + "num_input_tokens_seen": 16889440, + "step": 7815 + }, + { + "epoch": 1.2756933115823816, + "grad_norm": 0.023451196029782295, + "learning_rate": 0.0006377650897226754, + "loss": 0.1314, + "num_input_tokens_seen": 16900768, + "step": 7820 + }, + { + "epoch": 1.2765089722675367, + "grad_norm": 0.22325833141803741, + "learning_rate": 0.0006381729200652528, + "loss": 0.153, + "num_input_tokens_seen": 16910240, + "step": 7825 + }, + { + "epoch": 1.2773246329526917, + "grad_norm": 0.09795359522104263, + "learning_rate": 0.0006385807504078304, + "loss": 0.115, + "num_input_tokens_seen": 16921600, + "step": 7830 + }, + { + "epoch": 1.2781402936378465, + "grad_norm": 0.38116297125816345, + "learning_rate": 0.0006389885807504079, + "loss": 0.1101, + "num_input_tokens_seen": 16933344, + "step": 7835 + }, + { + "epoch": 1.2789559543230016, + "grad_norm": 0.47757688164711, + "learning_rate": 0.0006393964110929853, + "loss": 0.3118, + "num_input_tokens_seen": 16944416, + "step": 7840 + }, + { + "epoch": 1.2797716150081566, + "grad_norm": 0.4107573926448822, + "learning_rate": 0.0006398042414355628, + "loss": 0.233, + "num_input_tokens_seen": 16954816, + "step": 7845 + }, + { + "epoch": 1.2805872756933117, + "grad_norm": 0.14131440222263336, + "learning_rate": 0.0006402120717781403, + "loss": 0.1174, + "num_input_tokens_seen": 16964992, + "step": 7850 + }, + { + "epoch": 1.2814029363784667, + "grad_norm": 0.04313009977340698, + "learning_rate": 0.0006406199021207178, + "loss": 0.061, + "num_input_tokens_seen": 16975200, + "step": 7855 + }, + { + "epoch": 1.2822185970636215, + "grad_norm": 0.13069091737270355, + "learning_rate": 0.0006410277324632953, + "loss": 0.0842, + "num_input_tokens_seen": 16985984, + "step": 7860 + }, + { + "epoch": 1.2830342577487766, + "grad_norm": 0.025872059166431427, + "learning_rate": 0.0006414355628058727, + "loss": 0.0571, + "num_input_tokens_seen": 16997504, + "step": 7865 + }, + { + "epoch": 1.2838499184339316, + "grad_norm": 0.0267726331949234, + "learning_rate": 0.0006418433931484503, + "loss": 0.049, + "num_input_tokens_seen": 17007296, + "step": 7870 + }, + { + "epoch": 1.2846655791190864, + "grad_norm": 0.10828512907028198, + "learning_rate": 0.0006422512234910277, + "loss": 0.1539, + "num_input_tokens_seen": 17017792, + "step": 7875 + }, + { + "epoch": 1.2854812398042414, + "grad_norm": 0.11446195095777512, + "learning_rate": 0.0006426590538336053, + "loss": 0.1212, + "num_input_tokens_seen": 17029472, + "step": 7880 + }, + { + "epoch": 1.2862969004893965, + "grad_norm": 0.017614372074604034, + "learning_rate": 0.0006430668841761828, + "loss": 0.0478, + "num_input_tokens_seen": 17039808, + "step": 7885 + }, + { + "epoch": 1.2871125611745513, + "grad_norm": 0.06402583420276642, + "learning_rate": 0.0006434747145187601, + "loss": 0.0624, + "num_input_tokens_seen": 17050720, + "step": 7890 + }, + { + "epoch": 1.2879282218597063, + "grad_norm": 0.13314387202262878, + "learning_rate": 0.0006438825448613377, + "loss": 0.1575, + "num_input_tokens_seen": 17061792, + "step": 7895 + }, + { + "epoch": 1.2887438825448614, + "grad_norm": 0.0664740651845932, + "learning_rate": 0.0006442903752039152, + "loss": 0.1593, + "num_input_tokens_seen": 17072512, + "step": 7900 + }, + { + "epoch": 1.2895595432300162, + "grad_norm": 0.07506439089775085, + "learning_rate": 0.0006446982055464927, + "loss": 0.0866, + "num_input_tokens_seen": 17084288, + "step": 7905 + }, + { + "epoch": 1.2903752039151712, + "grad_norm": 0.5275997519493103, + "learning_rate": 0.0006451060358890701, + "loss": 0.3537, + "num_input_tokens_seen": 17095648, + "step": 7910 + }, + { + "epoch": 1.2911908646003263, + "grad_norm": 0.10702264308929443, + "learning_rate": 0.0006455138662316476, + "loss": 0.1069, + "num_input_tokens_seen": 17106976, + "step": 7915 + }, + { + "epoch": 1.2920065252854813, + "grad_norm": 0.17453348636627197, + "learning_rate": 0.0006459216965742252, + "loss": 0.1044, + "num_input_tokens_seen": 17116640, + "step": 7920 + }, + { + "epoch": 1.2928221859706361, + "grad_norm": 0.04211637005209923, + "learning_rate": 0.0006463295269168026, + "loss": 0.1929, + "num_input_tokens_seen": 17127584, + "step": 7925 + }, + { + "epoch": 1.2936378466557912, + "grad_norm": 0.02777124010026455, + "learning_rate": 0.00064673735725938, + "loss": 0.0759, + "num_input_tokens_seen": 17137760, + "step": 7930 + }, + { + "epoch": 1.2944535073409462, + "grad_norm": 0.2638346552848816, + "learning_rate": 0.0006471451876019576, + "loss": 0.0961, + "num_input_tokens_seen": 17149056, + "step": 7935 + }, + { + "epoch": 1.2952691680261013, + "grad_norm": 0.06882566213607788, + "learning_rate": 0.0006475530179445351, + "loss": 0.1392, + "num_input_tokens_seen": 17160096, + "step": 7940 + }, + { + "epoch": 1.296084828711256, + "grad_norm": 0.235148623585701, + "learning_rate": 0.0006479608482871126, + "loss": 0.1379, + "num_input_tokens_seen": 17170592, + "step": 7945 + }, + { + "epoch": 1.2969004893964111, + "grad_norm": 0.41831865906715393, + "learning_rate": 0.0006483686786296901, + "loss": 0.269, + "num_input_tokens_seen": 17181760, + "step": 7950 + }, + { + "epoch": 1.2977161500815662, + "grad_norm": 0.3294907808303833, + "learning_rate": 0.0006487765089722675, + "loss": 0.2033, + "num_input_tokens_seen": 17192416, + "step": 7955 + }, + { + "epoch": 1.298531810766721, + "grad_norm": 0.09484521299600601, + "learning_rate": 0.000649184339314845, + "loss": 0.084, + "num_input_tokens_seen": 17204256, + "step": 7960 + }, + { + "epoch": 1.299347471451876, + "grad_norm": 0.20619317889213562, + "learning_rate": 0.0006495921696574225, + "loss": 0.1994, + "num_input_tokens_seen": 17215520, + "step": 7965 + }, + { + "epoch": 1.300163132137031, + "grad_norm": 0.32548433542251587, + "learning_rate": 0.0006500000000000001, + "loss": 0.2032, + "num_input_tokens_seen": 17226560, + "step": 7970 + }, + { + "epoch": 1.3009787928221859, + "grad_norm": 0.09451394528150558, + "learning_rate": 0.0006504078303425776, + "loss": 0.0827, + "num_input_tokens_seen": 17236864, + "step": 7975 + }, + { + "epoch": 1.301794453507341, + "grad_norm": 0.12397997826337814, + "learning_rate": 0.0006508156606851549, + "loss": 0.1195, + "num_input_tokens_seen": 17247424, + "step": 7980 + }, + { + "epoch": 1.302610114192496, + "grad_norm": 0.17262053489685059, + "learning_rate": 0.0006512234910277325, + "loss": 0.1068, + "num_input_tokens_seen": 17258208, + "step": 7985 + }, + { + "epoch": 1.3034257748776508, + "grad_norm": 0.19197200238704681, + "learning_rate": 0.00065163132137031, + "loss": 0.0635, + "num_input_tokens_seen": 17269280, + "step": 7990 + }, + { + "epoch": 1.3042414355628058, + "grad_norm": 0.06886317580938339, + "learning_rate": 0.0006520391517128875, + "loss": 0.0507, + "num_input_tokens_seen": 17278816, + "step": 7995 + }, + { + "epoch": 1.3050570962479608, + "grad_norm": 0.20275108516216278, + "learning_rate": 0.0006524469820554649, + "loss": 0.0759, + "num_input_tokens_seen": 17289824, + "step": 8000 + }, + { + "epoch": 1.3058727569331159, + "grad_norm": 0.10766874253749847, + "learning_rate": 0.0006528548123980424, + "loss": 0.0615, + "num_input_tokens_seen": 17300448, + "step": 8005 + }, + { + "epoch": 1.306688417618271, + "grad_norm": 0.34369486570358276, + "learning_rate": 0.0006532626427406199, + "loss": 0.2698, + "num_input_tokens_seen": 17310592, + "step": 8010 + }, + { + "epoch": 1.3075040783034257, + "grad_norm": 0.018222760409116745, + "learning_rate": 0.0006536704730831974, + "loss": 0.1493, + "num_input_tokens_seen": 17321632, + "step": 8015 + }, + { + "epoch": 1.3083197389885808, + "grad_norm": 0.07134456932544708, + "learning_rate": 0.000654078303425775, + "loss": 0.1612, + "num_input_tokens_seen": 17332960, + "step": 8020 + }, + { + "epoch": 1.3091353996737358, + "grad_norm": 0.4593985080718994, + "learning_rate": 0.0006544861337683524, + "loss": 0.2286, + "num_input_tokens_seen": 17343808, + "step": 8025 + }, + { + "epoch": 1.3099510603588906, + "grad_norm": 0.06018417328596115, + "learning_rate": 0.0006548939641109298, + "loss": 0.2168, + "num_input_tokens_seen": 17353696, + "step": 8030 + }, + { + "epoch": 1.3107667210440457, + "grad_norm": 0.14180895686149597, + "learning_rate": 0.0006553017944535074, + "loss": 0.1579, + "num_input_tokens_seen": 17364000, + "step": 8035 + }, + { + "epoch": 1.3115823817292007, + "grad_norm": 0.07524849474430084, + "learning_rate": 0.0006557096247960849, + "loss": 0.1322, + "num_input_tokens_seen": 17371936, + "step": 8040 + }, + { + "epoch": 1.3123980424143555, + "grad_norm": 0.11009208858013153, + "learning_rate": 0.0006561174551386622, + "loss": 0.1167, + "num_input_tokens_seen": 17382304, + "step": 8045 + }, + { + "epoch": 1.3132137030995106, + "grad_norm": 0.15841297805309296, + "learning_rate": 0.0006565252854812398, + "loss": 0.0841, + "num_input_tokens_seen": 17392000, + "step": 8050 + }, + { + "epoch": 1.3140293637846656, + "grad_norm": 0.056525927037000656, + "learning_rate": 0.0006569331158238173, + "loss": 0.1197, + "num_input_tokens_seen": 17402528, + "step": 8055 + }, + { + "epoch": 1.3148450244698204, + "grad_norm": 0.05757668614387512, + "learning_rate": 0.0006573409461663949, + "loss": 0.2046, + "num_input_tokens_seen": 17412480, + "step": 8060 + }, + { + "epoch": 1.3156606851549755, + "grad_norm": 0.15594319999217987, + "learning_rate": 0.0006577487765089722, + "loss": 0.0923, + "num_input_tokens_seen": 17423520, + "step": 8065 + }, + { + "epoch": 1.3164763458401305, + "grad_norm": 0.21413877606391907, + "learning_rate": 0.0006581566068515497, + "loss": 0.1012, + "num_input_tokens_seen": 17434176, + "step": 8070 + }, + { + "epoch": 1.3172920065252856, + "grad_norm": 0.3471675515174866, + "learning_rate": 0.0006585644371941273, + "loss": 0.132, + "num_input_tokens_seen": 17445504, + "step": 8075 + }, + { + "epoch": 1.3181076672104406, + "grad_norm": 0.10413924604654312, + "learning_rate": 0.0006589722675367047, + "loss": 0.1008, + "num_input_tokens_seen": 17456544, + "step": 8080 + }, + { + "epoch": 1.3189233278955954, + "grad_norm": 0.1117105782032013, + "learning_rate": 0.0006593800978792823, + "loss": 0.1105, + "num_input_tokens_seen": 17466432, + "step": 8085 + }, + { + "epoch": 1.3197389885807504, + "grad_norm": 0.2799171507358551, + "learning_rate": 0.0006597879282218597, + "loss": 0.091, + "num_input_tokens_seen": 17477472, + "step": 8090 + }, + { + "epoch": 1.3205546492659055, + "grad_norm": 0.03801706060767174, + "learning_rate": 0.0006601957585644372, + "loss": 0.0413, + "num_input_tokens_seen": 17487360, + "step": 8095 + }, + { + "epoch": 1.3213703099510603, + "grad_norm": 0.025429189205169678, + "learning_rate": 0.0006606035889070147, + "loss": 0.0459, + "num_input_tokens_seen": 17497568, + "step": 8100 + }, + { + "epoch": 1.3221859706362153, + "grad_norm": 0.1716073453426361, + "learning_rate": 0.0006610114192495922, + "loss": 0.0462, + "num_input_tokens_seen": 17508768, + "step": 8105 + }, + { + "epoch": 1.3230016313213704, + "grad_norm": 0.02504614181816578, + "learning_rate": 0.0006614192495921697, + "loss": 0.1017, + "num_input_tokens_seen": 17518368, + "step": 8110 + }, + { + "epoch": 1.3238172920065252, + "grad_norm": 0.43064630031585693, + "learning_rate": 0.0006618270799347471, + "loss": 0.3693, + "num_input_tokens_seen": 17528416, + "step": 8115 + }, + { + "epoch": 1.3246329526916802, + "grad_norm": 0.03348245471715927, + "learning_rate": 0.0006622349102773246, + "loss": 0.336, + "num_input_tokens_seen": 17537600, + "step": 8120 + }, + { + "epoch": 1.3254486133768353, + "grad_norm": 0.08706320822238922, + "learning_rate": 0.0006626427406199022, + "loss": 0.0658, + "num_input_tokens_seen": 17548736, + "step": 8125 + }, + { + "epoch": 1.32626427406199, + "grad_norm": 0.22052991390228271, + "learning_rate": 0.0006630505709624797, + "loss": 0.2187, + "num_input_tokens_seen": 17558336, + "step": 8130 + }, + { + "epoch": 1.3270799347471451, + "grad_norm": 0.2766116261482239, + "learning_rate": 0.0006634584013050571, + "loss": 0.0864, + "num_input_tokens_seen": 17570272, + "step": 8135 + }, + { + "epoch": 1.3278955954323002, + "grad_norm": 0.08737711608409882, + "learning_rate": 0.0006638662316476346, + "loss": 0.0667, + "num_input_tokens_seen": 17580960, + "step": 8140 + }, + { + "epoch": 1.3287112561174552, + "grad_norm": 0.05752582848072052, + "learning_rate": 0.0006642740619902121, + "loss": 0.0903, + "num_input_tokens_seen": 17590528, + "step": 8145 + }, + { + "epoch": 1.32952691680261, + "grad_norm": 0.14932742714881897, + "learning_rate": 0.0006646818923327896, + "loss": 0.0991, + "num_input_tokens_seen": 17600672, + "step": 8150 + }, + { + "epoch": 1.330342577487765, + "grad_norm": 0.37247616052627563, + "learning_rate": 0.000665089722675367, + "loss": 0.2768, + "num_input_tokens_seen": 17611648, + "step": 8155 + }, + { + "epoch": 1.3311582381729201, + "grad_norm": 0.31361883878707886, + "learning_rate": 0.0006654975530179446, + "loss": 0.1746, + "num_input_tokens_seen": 17622656, + "step": 8160 + }, + { + "epoch": 1.3319738988580752, + "grad_norm": 0.03875495120882988, + "learning_rate": 0.0006659053833605221, + "loss": 0.0893, + "num_input_tokens_seen": 17632864, + "step": 8165 + }, + { + "epoch": 1.33278955954323, + "grad_norm": 0.0479498952627182, + "learning_rate": 0.0006663132137030995, + "loss": 0.1497, + "num_input_tokens_seen": 17643968, + "step": 8170 + }, + { + "epoch": 1.333605220228385, + "grad_norm": 0.06282901018857956, + "learning_rate": 0.000666721044045677, + "loss": 0.1052, + "num_input_tokens_seen": 17654272, + "step": 8175 + }, + { + "epoch": 1.33442088091354, + "grad_norm": 0.06560949981212616, + "learning_rate": 0.0006671288743882545, + "loss": 0.117, + "num_input_tokens_seen": 17665728, + "step": 8180 + }, + { + "epoch": 1.3352365415986949, + "grad_norm": 0.06316741555929184, + "learning_rate": 0.0006675367047308319, + "loss": 0.1838, + "num_input_tokens_seen": 17677856, + "step": 8185 + }, + { + "epoch": 1.33605220228385, + "grad_norm": 0.051666028797626495, + "learning_rate": 0.0006679445350734095, + "loss": 0.1016, + "num_input_tokens_seen": 17688672, + "step": 8190 + }, + { + "epoch": 1.336867862969005, + "grad_norm": 0.08728346228599548, + "learning_rate": 0.000668352365415987, + "loss": 0.0849, + "num_input_tokens_seen": 17699456, + "step": 8195 + }, + { + "epoch": 1.3376835236541598, + "grad_norm": 0.27650997042655945, + "learning_rate": 0.0006687601957585645, + "loss": 0.114, + "num_input_tokens_seen": 17710976, + "step": 8200 + }, + { + "epoch": 1.3384991843393148, + "grad_norm": 0.24919366836547852, + "learning_rate": 0.0006691680261011419, + "loss": 0.119, + "num_input_tokens_seen": 17721152, + "step": 8205 + }, + { + "epoch": 1.3393148450244698, + "grad_norm": 0.0928003191947937, + "learning_rate": 0.0006695758564437194, + "loss": 0.1532, + "num_input_tokens_seen": 17731200, + "step": 8210 + }, + { + "epoch": 1.3401305057096247, + "grad_norm": 0.2068202942609787, + "learning_rate": 0.000669983686786297, + "loss": 0.1623, + "num_input_tokens_seen": 17741760, + "step": 8215 + }, + { + "epoch": 1.3409461663947797, + "grad_norm": 0.27806898951530457, + "learning_rate": 0.0006703915171288743, + "loss": 0.1147, + "num_input_tokens_seen": 17754368, + "step": 8220 + }, + { + "epoch": 1.3417618270799347, + "grad_norm": 0.07539620995521545, + "learning_rate": 0.0006707993474714519, + "loss": 0.2163, + "num_input_tokens_seen": 17764096, + "step": 8225 + }, + { + "epoch": 1.3425774877650898, + "grad_norm": 0.07630554586648941, + "learning_rate": 0.0006712071778140294, + "loss": 0.1017, + "num_input_tokens_seen": 17774784, + "step": 8230 + }, + { + "epoch": 1.3433931484502448, + "grad_norm": 0.01350832637399435, + "learning_rate": 0.0006716150081566068, + "loss": 0.0314, + "num_input_tokens_seen": 17786112, + "step": 8235 + }, + { + "epoch": 1.3442088091353996, + "grad_norm": 0.10318346321582794, + "learning_rate": 0.0006720228384991843, + "loss": 0.0679, + "num_input_tokens_seen": 17797120, + "step": 8240 + }, + { + "epoch": 1.3450244698205547, + "grad_norm": 0.24617379903793335, + "learning_rate": 0.0006724306688417618, + "loss": 0.1362, + "num_input_tokens_seen": 17808032, + "step": 8245 + }, + { + "epoch": 1.3458401305057097, + "grad_norm": 0.09801829606294632, + "learning_rate": 0.0006728384991843394, + "loss": 0.0415, + "num_input_tokens_seen": 17818144, + "step": 8250 + }, + { + "epoch": 1.3466557911908645, + "grad_norm": 0.10813087224960327, + "learning_rate": 0.0006732463295269168, + "loss": 0.0622, + "num_input_tokens_seen": 17828032, + "step": 8255 + }, + { + "epoch": 1.3474714518760196, + "grad_norm": 0.05490534380078316, + "learning_rate": 0.0006736541598694943, + "loss": 0.1019, + "num_input_tokens_seen": 17839232, + "step": 8260 + }, + { + "epoch": 1.3482871125611746, + "grad_norm": 0.15574784576892853, + "learning_rate": 0.0006740619902120718, + "loss": 0.0343, + "num_input_tokens_seen": 17849088, + "step": 8265 + }, + { + "epoch": 1.3491027732463294, + "grad_norm": 0.24546416103839874, + "learning_rate": 0.0006744698205546492, + "loss": 0.0812, + "num_input_tokens_seen": 17860512, + "step": 8270 + }, + { + "epoch": 1.3499184339314845, + "grad_norm": 0.017809003591537476, + "learning_rate": 0.0006748776508972268, + "loss": 0.1855, + "num_input_tokens_seen": 17872000, + "step": 8275 + }, + { + "epoch": 1.3507340946166395, + "grad_norm": 0.031105950474739075, + "learning_rate": 0.0006752854812398043, + "loss": 0.1227, + "num_input_tokens_seen": 17883712, + "step": 8280 + }, + { + "epoch": 1.3515497553017943, + "grad_norm": 0.3675673305988312, + "learning_rate": 0.0006756933115823817, + "loss": 0.1293, + "num_input_tokens_seen": 17894240, + "step": 8285 + }, + { + "epoch": 1.3523654159869494, + "grad_norm": 0.8100501894950867, + "learning_rate": 0.0006761011419249592, + "loss": 0.1854, + "num_input_tokens_seen": 17903904, + "step": 8290 + }, + { + "epoch": 1.3531810766721044, + "grad_norm": 0.13887664675712585, + "learning_rate": 0.0006765089722675367, + "loss": 0.1497, + "num_input_tokens_seen": 17914208, + "step": 8295 + }, + { + "epoch": 1.3539967373572595, + "grad_norm": 0.02626083977520466, + "learning_rate": 0.0006769168026101143, + "loss": 0.0406, + "num_input_tokens_seen": 17924640, + "step": 8300 + }, + { + "epoch": 1.3548123980424145, + "grad_norm": 0.03734460473060608, + "learning_rate": 0.0006773246329526917, + "loss": 0.0643, + "num_input_tokens_seen": 17935520, + "step": 8305 + }, + { + "epoch": 1.3556280587275693, + "grad_norm": 0.16976048052310944, + "learning_rate": 0.0006777324632952691, + "loss": 0.2144, + "num_input_tokens_seen": 17946848, + "step": 8310 + }, + { + "epoch": 1.3564437194127243, + "grad_norm": 0.10975205898284912, + "learning_rate": 0.0006781402936378467, + "loss": 0.0718, + "num_input_tokens_seen": 17957568, + "step": 8315 + }, + { + "epoch": 1.3572593800978794, + "grad_norm": 0.15514613687992096, + "learning_rate": 0.0006785481239804242, + "loss": 0.0433, + "num_input_tokens_seen": 17967776, + "step": 8320 + }, + { + "epoch": 1.3580750407830342, + "grad_norm": 0.27203500270843506, + "learning_rate": 0.0006789559543230017, + "loss": 0.0811, + "num_input_tokens_seen": 17978688, + "step": 8325 + }, + { + "epoch": 1.3588907014681892, + "grad_norm": 0.029379505664110184, + "learning_rate": 0.0006793637846655791, + "loss": 0.169, + "num_input_tokens_seen": 17990432, + "step": 8330 + }, + { + "epoch": 1.3597063621533443, + "grad_norm": 0.6003684401512146, + "learning_rate": 0.0006797716150081566, + "loss": 0.1984, + "num_input_tokens_seen": 18001888, + "step": 8335 + }, + { + "epoch": 1.360522022838499, + "grad_norm": 0.027604883536696434, + "learning_rate": 0.0006801794453507341, + "loss": 0.104, + "num_input_tokens_seen": 18013536, + "step": 8340 + }, + { + "epoch": 1.3613376835236541, + "grad_norm": 0.46177938580513, + "learning_rate": 0.0006805872756933116, + "loss": 0.3262, + "num_input_tokens_seen": 18024864, + "step": 8345 + }, + { + "epoch": 1.3621533442088092, + "grad_norm": 0.036756206303834915, + "learning_rate": 0.000680995106035889, + "loss": 0.0273, + "num_input_tokens_seen": 18036192, + "step": 8350 + }, + { + "epoch": 1.362969004893964, + "grad_norm": 0.2614979147911072, + "learning_rate": 0.0006814029363784666, + "loss": 0.179, + "num_input_tokens_seen": 18044864, + "step": 8355 + }, + { + "epoch": 1.363784665579119, + "grad_norm": 0.04288998246192932, + "learning_rate": 0.000681810766721044, + "loss": 0.1959, + "num_input_tokens_seen": 18056192, + "step": 8360 + }, + { + "epoch": 1.364600326264274, + "grad_norm": 0.04592842236161232, + "learning_rate": 0.0006822185970636216, + "loss": 0.1298, + "num_input_tokens_seen": 18066592, + "step": 8365 + }, + { + "epoch": 1.3654159869494291, + "grad_norm": 0.10178498178720474, + "learning_rate": 0.0006826264274061991, + "loss": 0.1448, + "num_input_tokens_seen": 18077408, + "step": 8370 + }, + { + "epoch": 1.366231647634584, + "grad_norm": 0.17799578607082367, + "learning_rate": 0.0006830342577487764, + "loss": 0.0783, + "num_input_tokens_seen": 18088544, + "step": 8375 + }, + { + "epoch": 1.367047308319739, + "grad_norm": 0.17037107050418854, + "learning_rate": 0.000683442088091354, + "loss": 0.1645, + "num_input_tokens_seen": 18100192, + "step": 8380 + }, + { + "epoch": 1.367862969004894, + "grad_norm": 0.12822948396205902, + "learning_rate": 0.0006838499184339315, + "loss": 0.2197, + "num_input_tokens_seen": 18110432, + "step": 8385 + }, + { + "epoch": 1.368678629690049, + "grad_norm": 0.036184217780828476, + "learning_rate": 0.0006842577487765091, + "loss": 0.1104, + "num_input_tokens_seen": 18120928, + "step": 8390 + }, + { + "epoch": 1.3694942903752039, + "grad_norm": 0.08512768894433975, + "learning_rate": 0.0006846655791190864, + "loss": 0.1433, + "num_input_tokens_seen": 18131104, + "step": 8395 + }, + { + "epoch": 1.370309951060359, + "grad_norm": 0.0482826866209507, + "learning_rate": 0.0006850734094616639, + "loss": 0.0752, + "num_input_tokens_seen": 18142720, + "step": 8400 + }, + { + "epoch": 1.371125611745514, + "grad_norm": 0.04397287964820862, + "learning_rate": 0.0006854812398042415, + "loss": 0.0944, + "num_input_tokens_seen": 18153664, + "step": 8405 + }, + { + "epoch": 1.3719412724306688, + "grad_norm": 0.2409050017595291, + "learning_rate": 0.0006858890701468189, + "loss": 0.4038, + "num_input_tokens_seen": 18164512, + "step": 8410 + }, + { + "epoch": 1.3727569331158238, + "grad_norm": 0.06901174038648605, + "learning_rate": 0.0006862969004893965, + "loss": 0.216, + "num_input_tokens_seen": 18174368, + "step": 8415 + }, + { + "epoch": 1.3735725938009788, + "grad_norm": 0.1169184073805809, + "learning_rate": 0.0006867047308319739, + "loss": 0.1188, + "num_input_tokens_seen": 18185216, + "step": 8420 + }, + { + "epoch": 1.3743882544861337, + "grad_norm": 0.09329386055469513, + "learning_rate": 0.0006871125611745514, + "loss": 0.1434, + "num_input_tokens_seen": 18196256, + "step": 8425 + }, + { + "epoch": 1.3752039151712887, + "grad_norm": 0.2341182976961136, + "learning_rate": 0.0006875203915171289, + "loss": 0.0968, + "num_input_tokens_seen": 18207264, + "step": 8430 + }, + { + "epoch": 1.3760195758564437, + "grad_norm": 0.4152822494506836, + "learning_rate": 0.0006879282218597064, + "loss": 0.2357, + "num_input_tokens_seen": 18217760, + "step": 8435 + }, + { + "epoch": 1.3768352365415986, + "grad_norm": 0.05489760637283325, + "learning_rate": 0.000688336052202284, + "loss": 0.0602, + "num_input_tokens_seen": 18228608, + "step": 8440 + }, + { + "epoch": 1.3776508972267536, + "grad_norm": 0.5383395552635193, + "learning_rate": 0.0006887438825448613, + "loss": 0.1899, + "num_input_tokens_seen": 18239200, + "step": 8445 + }, + { + "epoch": 1.3784665579119086, + "grad_norm": 0.4563646614551544, + "learning_rate": 0.0006891517128874388, + "loss": 0.155, + "num_input_tokens_seen": 18251200, + "step": 8450 + }, + { + "epoch": 1.3792822185970637, + "grad_norm": 0.10035424679517746, + "learning_rate": 0.0006895595432300164, + "loss": 0.0621, + "num_input_tokens_seen": 18261856, + "step": 8455 + }, + { + "epoch": 1.3800978792822187, + "grad_norm": 0.03919481858611107, + "learning_rate": 0.0006899673735725939, + "loss": 0.0615, + "num_input_tokens_seen": 18273152, + "step": 8460 + }, + { + "epoch": 1.3809135399673735, + "grad_norm": 0.10031604021787643, + "learning_rate": 0.0006903752039151713, + "loss": 0.0745, + "num_input_tokens_seen": 18284896, + "step": 8465 + }, + { + "epoch": 1.3817292006525286, + "grad_norm": 0.033023033291101456, + "learning_rate": 0.0006907830342577488, + "loss": 0.0907, + "num_input_tokens_seen": 18294912, + "step": 8470 + }, + { + "epoch": 1.3825448613376836, + "grad_norm": 0.028262877836823463, + "learning_rate": 0.0006911908646003263, + "loss": 0.0901, + "num_input_tokens_seen": 18304672, + "step": 8475 + }, + { + "epoch": 1.3833605220228384, + "grad_norm": 0.34882044792175293, + "learning_rate": 0.0006915986949429038, + "loss": 0.1843, + "num_input_tokens_seen": 18316256, + "step": 8480 + }, + { + "epoch": 1.3841761827079935, + "grad_norm": 0.025419259443879128, + "learning_rate": 0.0006920065252854812, + "loss": 0.1686, + "num_input_tokens_seen": 18327904, + "step": 8485 + }, + { + "epoch": 1.3849918433931485, + "grad_norm": 0.013332895003259182, + "learning_rate": 0.0006924143556280587, + "loss": 0.0396, + "num_input_tokens_seen": 18338752, + "step": 8490 + }, + { + "epoch": 1.3858075040783033, + "grad_norm": 0.030229412019252777, + "learning_rate": 0.0006928221859706362, + "loss": 0.0979, + "num_input_tokens_seen": 18350464, + "step": 8495 + }, + { + "epoch": 1.3866231647634584, + "grad_norm": 0.22581715881824493, + "learning_rate": 0.0006932300163132137, + "loss": 0.1898, + "num_input_tokens_seen": 18360832, + "step": 8500 + }, + { + "epoch": 1.3874388254486134, + "grad_norm": 0.419089674949646, + "learning_rate": 0.0006936378466557913, + "loss": 0.1684, + "num_input_tokens_seen": 18370976, + "step": 8505 + }, + { + "epoch": 1.3882544861337682, + "grad_norm": 0.04601133614778519, + "learning_rate": 0.0006940456769983687, + "loss": 0.1074, + "num_input_tokens_seen": 18381920, + "step": 8510 + }, + { + "epoch": 1.3890701468189233, + "grad_norm": 0.19682633876800537, + "learning_rate": 0.0006944535073409461, + "loss": 0.1794, + "num_input_tokens_seen": 18393152, + "step": 8515 + }, + { + "epoch": 1.3898858075040783, + "grad_norm": 0.08403383195400238, + "learning_rate": 0.0006948613376835237, + "loss": 0.0674, + "num_input_tokens_seen": 18403168, + "step": 8520 + }, + { + "epoch": 1.3907014681892333, + "grad_norm": 0.02600703202188015, + "learning_rate": 0.0006952691680261012, + "loss": 0.1077, + "num_input_tokens_seen": 18413792, + "step": 8525 + }, + { + "epoch": 1.3915171288743884, + "grad_norm": 0.6728271842002869, + "learning_rate": 0.0006956769983686786, + "loss": 0.1033, + "num_input_tokens_seen": 18424864, + "step": 8530 + }, + { + "epoch": 1.3923327895595432, + "grad_norm": 0.19836731255054474, + "learning_rate": 0.0006960848287112561, + "loss": 0.0448, + "num_input_tokens_seen": 18434144, + "step": 8535 + }, + { + "epoch": 1.3931484502446982, + "grad_norm": 0.17070646584033966, + "learning_rate": 0.0006964926590538336, + "loss": 0.1813, + "num_input_tokens_seen": 18444576, + "step": 8540 + }, + { + "epoch": 1.3939641109298533, + "grad_norm": 0.11258327215909958, + "learning_rate": 0.0006969004893964112, + "loss": 0.1716, + "num_input_tokens_seen": 18455808, + "step": 8545 + }, + { + "epoch": 1.394779771615008, + "grad_norm": 0.65053391456604, + "learning_rate": 0.0006973083197389885, + "loss": 0.0816, + "num_input_tokens_seen": 18466848, + "step": 8550 + }, + { + "epoch": 1.3955954323001631, + "grad_norm": 0.08368664234876633, + "learning_rate": 0.0006977161500815661, + "loss": 0.1156, + "num_input_tokens_seen": 18479040, + "step": 8555 + }, + { + "epoch": 1.3964110929853182, + "grad_norm": 0.3286716938018799, + "learning_rate": 0.0006981239804241436, + "loss": 0.2004, + "num_input_tokens_seen": 18490080, + "step": 8560 + }, + { + "epoch": 1.397226753670473, + "grad_norm": 1.1257439851760864, + "learning_rate": 0.000698531810766721, + "loss": 0.2256, + "num_input_tokens_seen": 18501632, + "step": 8565 + }, + { + "epoch": 1.398042414355628, + "grad_norm": 0.19313710927963257, + "learning_rate": 0.0006989396411092986, + "loss": 0.1922, + "num_input_tokens_seen": 18513056, + "step": 8570 + }, + { + "epoch": 1.398858075040783, + "grad_norm": 0.12139634788036346, + "learning_rate": 0.000699347471451876, + "loss": 0.0814, + "num_input_tokens_seen": 18523616, + "step": 8575 + }, + { + "epoch": 1.399673735725938, + "grad_norm": 0.17387810349464417, + "learning_rate": 0.0006997553017944536, + "loss": 0.1519, + "num_input_tokens_seen": 18534688, + "step": 8580 + }, + { + "epoch": 1.400489396411093, + "grad_norm": 0.18278487026691437, + "learning_rate": 0.000700163132137031, + "loss": 0.2493, + "num_input_tokens_seen": 18545120, + "step": 8585 + }, + { + "epoch": 1.401305057096248, + "grad_norm": 0.10564879328012466, + "learning_rate": 0.0007005709624796085, + "loss": 0.0782, + "num_input_tokens_seen": 18556224, + "step": 8590 + }, + { + "epoch": 1.402120717781403, + "grad_norm": 0.5196696519851685, + "learning_rate": 0.000700978792822186, + "loss": 0.2816, + "num_input_tokens_seen": 18566272, + "step": 8595 + }, + { + "epoch": 1.4029363784665578, + "grad_norm": 0.14458255469799042, + "learning_rate": 0.0007013866231647634, + "loss": 0.162, + "num_input_tokens_seen": 18577216, + "step": 8600 + }, + { + "epoch": 1.4037520391517129, + "grad_norm": 0.05249457061290741, + "learning_rate": 0.000701794453507341, + "loss": 0.337, + "num_input_tokens_seen": 18587072, + "step": 8605 + }, + { + "epoch": 1.404567699836868, + "grad_norm": 0.3068144619464874, + "learning_rate": 0.0007022022838499185, + "loss": 0.1498, + "num_input_tokens_seen": 18598112, + "step": 8610 + }, + { + "epoch": 1.405383360522023, + "grad_norm": 0.08390171080827713, + "learning_rate": 0.000702610114192496, + "loss": 0.1163, + "num_input_tokens_seen": 18608384, + "step": 8615 + }, + { + "epoch": 1.4061990212071778, + "grad_norm": 0.169025257229805, + "learning_rate": 0.0007030179445350734, + "loss": 0.1028, + "num_input_tokens_seen": 18618944, + "step": 8620 + }, + { + "epoch": 1.4070146818923328, + "grad_norm": 0.04982120916247368, + "learning_rate": 0.0007034257748776509, + "loss": 0.1588, + "num_input_tokens_seen": 18629248, + "step": 8625 + }, + { + "epoch": 1.4078303425774878, + "grad_norm": 0.04942760616540909, + "learning_rate": 0.0007038336052202285, + "loss": 0.0376, + "num_input_tokens_seen": 18641248, + "step": 8630 + }, + { + "epoch": 1.4086460032626427, + "grad_norm": 0.051234494894742966, + "learning_rate": 0.0007042414355628059, + "loss": 0.0283, + "num_input_tokens_seen": 18651520, + "step": 8635 + }, + { + "epoch": 1.4094616639477977, + "grad_norm": 0.20060387253761292, + "learning_rate": 0.0007046492659053833, + "loss": 0.1709, + "num_input_tokens_seen": 18662304, + "step": 8640 + }, + { + "epoch": 1.4102773246329527, + "grad_norm": 0.011496175080537796, + "learning_rate": 0.0007050570962479609, + "loss": 0.1497, + "num_input_tokens_seen": 18672416, + "step": 8645 + }, + { + "epoch": 1.4110929853181076, + "grad_norm": 0.0568263903260231, + "learning_rate": 0.0007054649265905384, + "loss": 0.1343, + "num_input_tokens_seen": 18683136, + "step": 8650 + }, + { + "epoch": 1.4119086460032626, + "grad_norm": 0.178445965051651, + "learning_rate": 0.0007058727569331158, + "loss": 0.2065, + "num_input_tokens_seen": 18693472, + "step": 8655 + }, + { + "epoch": 1.4127243066884176, + "grad_norm": 0.11439131200313568, + "learning_rate": 0.0007062805872756933, + "loss": 0.0716, + "num_input_tokens_seen": 18704128, + "step": 8660 + }, + { + "epoch": 1.4135399673735725, + "grad_norm": 0.13826791942119598, + "learning_rate": 0.0007066884176182708, + "loss": 0.0675, + "num_input_tokens_seen": 18715456, + "step": 8665 + }, + { + "epoch": 1.4143556280587275, + "grad_norm": 0.061010994017124176, + "learning_rate": 0.0007070962479608483, + "loss": 0.1816, + "num_input_tokens_seen": 18726240, + "step": 8670 + }, + { + "epoch": 1.4151712887438825, + "grad_norm": 0.3579946458339691, + "learning_rate": 0.0007075040783034258, + "loss": 0.0997, + "num_input_tokens_seen": 18736992, + "step": 8675 + }, + { + "epoch": 1.4159869494290376, + "grad_norm": 0.16792601346969604, + "learning_rate": 0.0007079119086460033, + "loss": 0.0593, + "num_input_tokens_seen": 18749216, + "step": 8680 + }, + { + "epoch": 1.4168026101141926, + "grad_norm": 0.026371244341135025, + "learning_rate": 0.0007083197389885808, + "loss": 0.0741, + "num_input_tokens_seen": 18760352, + "step": 8685 + }, + { + "epoch": 1.4176182707993474, + "grad_norm": 0.05590897798538208, + "learning_rate": 0.0007087275693311582, + "loss": 0.0373, + "num_input_tokens_seen": 18769792, + "step": 8690 + }, + { + "epoch": 1.4184339314845025, + "grad_norm": 0.018722031265497208, + "learning_rate": 0.0007091353996737358, + "loss": 0.1725, + "num_input_tokens_seen": 18781696, + "step": 8695 + }, + { + "epoch": 1.4192495921696575, + "grad_norm": 0.1691659688949585, + "learning_rate": 0.0007095432300163133, + "loss": 0.1654, + "num_input_tokens_seen": 18790912, + "step": 8700 + }, + { + "epoch": 1.4200652528548123, + "grad_norm": 0.3589876890182495, + "learning_rate": 0.0007099510603588906, + "loss": 0.1437, + "num_input_tokens_seen": 18800672, + "step": 8705 + }, + { + "epoch": 1.4208809135399674, + "grad_norm": 0.3833759129047394, + "learning_rate": 0.0007103588907014682, + "loss": 0.1872, + "num_input_tokens_seen": 18812032, + "step": 8710 + }, + { + "epoch": 1.4216965742251224, + "grad_norm": 0.5868796706199646, + "learning_rate": 0.0007107667210440457, + "loss": 0.189, + "num_input_tokens_seen": 18823328, + "step": 8715 + }, + { + "epoch": 1.4225122349102772, + "grad_norm": 0.1617187112569809, + "learning_rate": 0.0007111745513866232, + "loss": 0.1027, + "num_input_tokens_seen": 18834048, + "step": 8720 + }, + { + "epoch": 1.4233278955954323, + "grad_norm": 0.2903682291507721, + "learning_rate": 0.0007115823817292006, + "loss": 0.1478, + "num_input_tokens_seen": 18845216, + "step": 8725 + }, + { + "epoch": 1.4241435562805873, + "grad_norm": 0.06245076656341553, + "learning_rate": 0.0007119902120717781, + "loss": 0.1351, + "num_input_tokens_seen": 18854848, + "step": 8730 + }, + { + "epoch": 1.4249592169657421, + "grad_norm": 0.036956749856472015, + "learning_rate": 0.0007123980424143557, + "loss": 0.1469, + "num_input_tokens_seen": 18866368, + "step": 8735 + }, + { + "epoch": 1.4257748776508972, + "grad_norm": 0.21718493103981018, + "learning_rate": 0.0007128058727569331, + "loss": 0.1795, + "num_input_tokens_seen": 18878464, + "step": 8740 + }, + { + "epoch": 1.4265905383360522, + "grad_norm": 0.013804431073367596, + "learning_rate": 0.0007132137030995107, + "loss": 0.0576, + "num_input_tokens_seen": 18889856, + "step": 8745 + }, + { + "epoch": 1.4274061990212072, + "grad_norm": 0.08589949458837509, + "learning_rate": 0.0007136215334420881, + "loss": 0.1215, + "num_input_tokens_seen": 18900352, + "step": 8750 + }, + { + "epoch": 1.4282218597063623, + "grad_norm": 0.02286672219634056, + "learning_rate": 0.0007140293637846655, + "loss": 0.0843, + "num_input_tokens_seen": 18911936, + "step": 8755 + }, + { + "epoch": 1.429037520391517, + "grad_norm": 0.04689103737473488, + "learning_rate": 0.0007144371941272431, + "loss": 0.0534, + "num_input_tokens_seen": 18922048, + "step": 8760 + }, + { + "epoch": 1.4298531810766721, + "grad_norm": 0.26850247383117676, + "learning_rate": 0.0007148450244698206, + "loss": 0.0801, + "num_input_tokens_seen": 18932288, + "step": 8765 + }, + { + "epoch": 1.4306688417618272, + "grad_norm": 0.1247810497879982, + "learning_rate": 0.0007152528548123982, + "loss": 0.1454, + "num_input_tokens_seen": 18943008, + "step": 8770 + }, + { + "epoch": 1.431484502446982, + "grad_norm": 0.4752637445926666, + "learning_rate": 0.0007156606851549755, + "loss": 0.1226, + "num_input_tokens_seen": 18953376, + "step": 8775 + }, + { + "epoch": 1.432300163132137, + "grad_norm": 0.08867906033992767, + "learning_rate": 0.000716068515497553, + "loss": 0.0771, + "num_input_tokens_seen": 18965600, + "step": 8780 + }, + { + "epoch": 1.433115823817292, + "grad_norm": 0.01523397397249937, + "learning_rate": 0.0007164763458401306, + "loss": 0.0322, + "num_input_tokens_seen": 18975936, + "step": 8785 + }, + { + "epoch": 1.433931484502447, + "grad_norm": 0.010996107943356037, + "learning_rate": 0.000716884176182708, + "loss": 0.0654, + "num_input_tokens_seen": 18986048, + "step": 8790 + }, + { + "epoch": 1.434747145187602, + "grad_norm": 0.10460605472326279, + "learning_rate": 0.0007172920065252854, + "loss": 0.1291, + "num_input_tokens_seen": 18997056, + "step": 8795 + }, + { + "epoch": 1.435562805872757, + "grad_norm": 0.008371557109057903, + "learning_rate": 0.000717699836867863, + "loss": 0.034, + "num_input_tokens_seen": 19007744, + "step": 8800 + }, + { + "epoch": 1.4363784665579118, + "grad_norm": 0.2582967281341553, + "learning_rate": 0.0007181076672104405, + "loss": 0.0945, + "num_input_tokens_seen": 19018784, + "step": 8805 + }, + { + "epoch": 1.4371941272430668, + "grad_norm": 0.1181015744805336, + "learning_rate": 0.000718515497553018, + "loss": 0.0833, + "num_input_tokens_seen": 19030176, + "step": 8810 + }, + { + "epoch": 1.4380097879282219, + "grad_norm": 0.08758709579706192, + "learning_rate": 0.0007189233278955954, + "loss": 0.0734, + "num_input_tokens_seen": 19041088, + "step": 8815 + }, + { + "epoch": 1.438825448613377, + "grad_norm": 0.013876451179385185, + "learning_rate": 0.0007193311582381729, + "loss": 0.0158, + "num_input_tokens_seen": 19052352, + "step": 8820 + }, + { + "epoch": 1.4396411092985317, + "grad_norm": 0.028831837698817253, + "learning_rate": 0.0007197389885807504, + "loss": 0.0805, + "num_input_tokens_seen": 19062688, + "step": 8825 + }, + { + "epoch": 1.4404567699836868, + "grad_norm": 0.44155487418174744, + "learning_rate": 0.0007201468189233279, + "loss": 0.1303, + "num_input_tokens_seen": 19073504, + "step": 8830 + }, + { + "epoch": 1.4412724306688418, + "grad_norm": 0.17047381401062012, + "learning_rate": 0.0007205546492659055, + "loss": 0.1946, + "num_input_tokens_seen": 19085792, + "step": 8835 + }, + { + "epoch": 1.4420880913539968, + "grad_norm": 0.11155971139669418, + "learning_rate": 0.0007209624796084829, + "loss": 0.2309, + "num_input_tokens_seen": 19096512, + "step": 8840 + }, + { + "epoch": 1.4429037520391517, + "grad_norm": 0.2505459189414978, + "learning_rate": 0.0007213703099510603, + "loss": 0.1148, + "num_input_tokens_seen": 19106336, + "step": 8845 + }, + { + "epoch": 1.4437194127243067, + "grad_norm": 0.1573144793510437, + "learning_rate": 0.0007217781402936379, + "loss": 0.0722, + "num_input_tokens_seen": 19116800, + "step": 8850 + }, + { + "epoch": 1.4445350734094617, + "grad_norm": 0.15521514415740967, + "learning_rate": 0.0007221859706362154, + "loss": 0.1737, + "num_input_tokens_seen": 19127488, + "step": 8855 + }, + { + "epoch": 1.4453507340946166, + "grad_norm": 0.017011022195219994, + "learning_rate": 0.0007225938009787928, + "loss": 0.1335, + "num_input_tokens_seen": 19135264, + "step": 8860 + }, + { + "epoch": 1.4461663947797716, + "grad_norm": 0.18913504481315613, + "learning_rate": 0.0007230016313213703, + "loss": 0.2038, + "num_input_tokens_seen": 19146656, + "step": 8865 + }, + { + "epoch": 1.4469820554649266, + "grad_norm": 0.013087444938719273, + "learning_rate": 0.0007234094616639478, + "loss": 0.0715, + "num_input_tokens_seen": 19158880, + "step": 8870 + }, + { + "epoch": 1.4477977161500815, + "grad_norm": 0.1711316704750061, + "learning_rate": 0.0007238172920065254, + "loss": 0.0831, + "num_input_tokens_seen": 19170240, + "step": 8875 + }, + { + "epoch": 1.4486133768352365, + "grad_norm": 0.06917469948530197, + "learning_rate": 0.0007242251223491027, + "loss": 0.1215, + "num_input_tokens_seen": 19180960, + "step": 8880 + }, + { + "epoch": 1.4494290375203915, + "grad_norm": 0.28107255697250366, + "learning_rate": 0.0007246329526916803, + "loss": 0.3589, + "num_input_tokens_seen": 19192576, + "step": 8885 + }, + { + "epoch": 1.4502446982055464, + "grad_norm": 0.23159846663475037, + "learning_rate": 0.0007250407830342578, + "loss": 0.0737, + "num_input_tokens_seen": 19202752, + "step": 8890 + }, + { + "epoch": 1.4510603588907014, + "grad_norm": 0.2844611406326294, + "learning_rate": 0.0007254486133768352, + "loss": 0.096, + "num_input_tokens_seen": 19214048, + "step": 8895 + }, + { + "epoch": 1.4518760195758564, + "grad_norm": 0.08565635234117508, + "learning_rate": 0.0007258564437194128, + "loss": 0.0931, + "num_input_tokens_seen": 19225056, + "step": 8900 + }, + { + "epoch": 1.4526916802610115, + "grad_norm": 0.3863488435745239, + "learning_rate": 0.0007262642740619902, + "loss": 0.1521, + "num_input_tokens_seen": 19236480, + "step": 8905 + }, + { + "epoch": 1.4535073409461665, + "grad_norm": 0.1263965666294098, + "learning_rate": 0.0007266721044045678, + "loss": 0.1762, + "num_input_tokens_seen": 19248192, + "step": 8910 + }, + { + "epoch": 1.4543230016313213, + "grad_norm": 0.05847330763936043, + "learning_rate": 0.0007270799347471452, + "loss": 0.071, + "num_input_tokens_seen": 19258368, + "step": 8915 + }, + { + "epoch": 1.4551386623164764, + "grad_norm": 0.03335745260119438, + "learning_rate": 0.0007274877650897227, + "loss": 0.0452, + "num_input_tokens_seen": 19269088, + "step": 8920 + }, + { + "epoch": 1.4559543230016314, + "grad_norm": 0.19478656351566315, + "learning_rate": 0.0007278955954323002, + "loss": 0.0941, + "num_input_tokens_seen": 19280384, + "step": 8925 + }, + { + "epoch": 1.4567699836867862, + "grad_norm": 0.04548482596874237, + "learning_rate": 0.0007283034257748776, + "loss": 0.2333, + "num_input_tokens_seen": 19292192, + "step": 8930 + }, + { + "epoch": 1.4575856443719413, + "grad_norm": 0.3347051441669464, + "learning_rate": 0.0007287112561174551, + "loss": 0.1592, + "num_input_tokens_seen": 19303904, + "step": 8935 + }, + { + "epoch": 1.4584013050570963, + "grad_norm": 0.08720278739929199, + "learning_rate": 0.0007291190864600327, + "loss": 0.0659, + "num_input_tokens_seen": 19315712, + "step": 8940 + }, + { + "epoch": 1.4592169657422511, + "grad_norm": 0.04172803461551666, + "learning_rate": 0.00072952691680261, + "loss": 0.1854, + "num_input_tokens_seen": 19326848, + "step": 8945 + }, + { + "epoch": 1.4600326264274062, + "grad_norm": 0.5160301923751831, + "learning_rate": 0.0007299347471451876, + "loss": 0.2644, + "num_input_tokens_seen": 19338496, + "step": 8950 + }, + { + "epoch": 1.4608482871125612, + "grad_norm": 0.1625453382730484, + "learning_rate": 0.0007303425774877651, + "loss": 0.2256, + "num_input_tokens_seen": 19348992, + "step": 8955 + }, + { + "epoch": 1.461663947797716, + "grad_norm": 0.17559388279914856, + "learning_rate": 0.0007307504078303426, + "loss": 0.1537, + "num_input_tokens_seen": 19359680, + "step": 8960 + }, + { + "epoch": 1.462479608482871, + "grad_norm": 0.09148979932069778, + "learning_rate": 0.0007311582381729201, + "loss": 0.0996, + "num_input_tokens_seen": 19370848, + "step": 8965 + }, + { + "epoch": 1.463295269168026, + "grad_norm": 0.020864585414528847, + "learning_rate": 0.0007315660685154975, + "loss": 0.1251, + "num_input_tokens_seen": 19381888, + "step": 8970 + }, + { + "epoch": 1.4641109298531811, + "grad_norm": 0.008470486849546432, + "learning_rate": 0.0007319738988580751, + "loss": 0.058, + "num_input_tokens_seen": 19392768, + "step": 8975 + }, + { + "epoch": 1.4649265905383362, + "grad_norm": 0.09830211848020554, + "learning_rate": 0.0007323817292006525, + "loss": 0.0515, + "num_input_tokens_seen": 19403488, + "step": 8980 + }, + { + "epoch": 1.465742251223491, + "grad_norm": 0.061676375567913055, + "learning_rate": 0.00073278955954323, + "loss": 0.1704, + "num_input_tokens_seen": 19414656, + "step": 8985 + }, + { + "epoch": 1.466557911908646, + "grad_norm": 0.060905564576387405, + "learning_rate": 0.0007331973898858076, + "loss": 0.2525, + "num_input_tokens_seen": 19424960, + "step": 8990 + }, + { + "epoch": 1.467373572593801, + "grad_norm": 0.13291729986667633, + "learning_rate": 0.000733605220228385, + "loss": 0.1334, + "num_input_tokens_seen": 19436512, + "step": 8995 + }, + { + "epoch": 1.468189233278956, + "grad_norm": 0.4683821201324463, + "learning_rate": 0.0007340130505709625, + "loss": 0.1236, + "num_input_tokens_seen": 19447808, + "step": 9000 + }, + { + "epoch": 1.469004893964111, + "grad_norm": 0.07702501118183136, + "learning_rate": 0.00073442088091354, + "loss": 0.0601, + "num_input_tokens_seen": 19457984, + "step": 9005 + }, + { + "epoch": 1.469820554649266, + "grad_norm": 0.0318388007581234, + "learning_rate": 0.0007348287112561175, + "loss": 0.0518, + "num_input_tokens_seen": 19467680, + "step": 9010 + }, + { + "epoch": 1.4706362153344208, + "grad_norm": 0.11955982446670532, + "learning_rate": 0.0007352365415986949, + "loss": 0.1662, + "num_input_tokens_seen": 19475968, + "step": 9015 + }, + { + "epoch": 1.4714518760195758, + "grad_norm": 0.13984152674674988, + "learning_rate": 0.0007356443719412724, + "loss": 0.2114, + "num_input_tokens_seen": 19487328, + "step": 9020 + }, + { + "epoch": 1.4722675367047309, + "grad_norm": 0.1693083792924881, + "learning_rate": 0.00073605220228385, + "loss": 0.058, + "num_input_tokens_seen": 19497280, + "step": 9025 + }, + { + "epoch": 1.4730831973898857, + "grad_norm": 0.10998763889074326, + "learning_rate": 0.0007364600326264275, + "loss": 0.1837, + "num_input_tokens_seen": 19507264, + "step": 9030 + }, + { + "epoch": 1.4738988580750407, + "grad_norm": 0.08506206423044205, + "learning_rate": 0.0007368678629690048, + "loss": 0.1165, + "num_input_tokens_seen": 19518080, + "step": 9035 + }, + { + "epoch": 1.4747145187601958, + "grad_norm": 0.10371915251016617, + "learning_rate": 0.0007372756933115824, + "loss": 0.0609, + "num_input_tokens_seen": 19528288, + "step": 9040 + }, + { + "epoch": 1.4755301794453508, + "grad_norm": 0.48107215762138367, + "learning_rate": 0.0007376835236541599, + "loss": 0.1408, + "num_input_tokens_seen": 19539072, + "step": 9045 + }, + { + "epoch": 1.4763458401305056, + "grad_norm": 0.28694599866867065, + "learning_rate": 0.0007380913539967374, + "loss": 0.1991, + "num_input_tokens_seen": 19550496, + "step": 9050 + }, + { + "epoch": 1.4771615008156607, + "grad_norm": 0.021819839254021645, + "learning_rate": 0.0007384991843393149, + "loss": 0.0521, + "num_input_tokens_seen": 19562752, + "step": 9055 + }, + { + "epoch": 1.4779771615008157, + "grad_norm": 0.01266930066049099, + "learning_rate": 0.0007389070146818923, + "loss": 0.1476, + "num_input_tokens_seen": 19572512, + "step": 9060 + }, + { + "epoch": 1.4787928221859707, + "grad_norm": 0.059075579047203064, + "learning_rate": 0.0007393148450244699, + "loss": 0.1867, + "num_input_tokens_seen": 19582752, + "step": 9065 + }, + { + "epoch": 1.4796084828711256, + "grad_norm": 0.27781084179878235, + "learning_rate": 0.0007397226753670473, + "loss": 0.1437, + "num_input_tokens_seen": 19593184, + "step": 9070 + }, + { + "epoch": 1.4804241435562806, + "grad_norm": 0.06741946935653687, + "learning_rate": 0.0007401305057096248, + "loss": 0.0395, + "num_input_tokens_seen": 19604768, + "step": 9075 + }, + { + "epoch": 1.4812398042414356, + "grad_norm": 0.07379153370857239, + "learning_rate": 0.0007405383360522023, + "loss": 0.1027, + "num_input_tokens_seen": 19616576, + "step": 9080 + }, + { + "epoch": 1.4820554649265905, + "grad_norm": 0.04416964203119278, + "learning_rate": 0.0007409461663947797, + "loss": 0.0756, + "num_input_tokens_seen": 19628384, + "step": 9085 + }, + { + "epoch": 1.4828711256117455, + "grad_norm": 0.15102028846740723, + "learning_rate": 0.0007413539967373573, + "loss": 0.1792, + "num_input_tokens_seen": 19639136, + "step": 9090 + }, + { + "epoch": 1.4836867862969005, + "grad_norm": 0.3125499188899994, + "learning_rate": 0.0007417618270799348, + "loss": 0.1091, + "num_input_tokens_seen": 19649856, + "step": 9095 + }, + { + "epoch": 1.4845024469820554, + "grad_norm": 0.033201105892658234, + "learning_rate": 0.0007421696574225123, + "loss": 0.1491, + "num_input_tokens_seen": 19661120, + "step": 9100 + }, + { + "epoch": 1.4853181076672104, + "grad_norm": 0.17851990461349487, + "learning_rate": 0.0007425774877650897, + "loss": 0.0904, + "num_input_tokens_seen": 19671648, + "step": 9105 + }, + { + "epoch": 1.4861337683523654, + "grad_norm": 0.037107013165950775, + "learning_rate": 0.0007429853181076672, + "loss": 0.0693, + "num_input_tokens_seen": 19683584, + "step": 9110 + }, + { + "epoch": 1.4869494290375203, + "grad_norm": 0.1203000620007515, + "learning_rate": 0.0007433931484502448, + "loss": 0.1329, + "num_input_tokens_seen": 19694080, + "step": 9115 + }, + { + "epoch": 1.4877650897226753, + "grad_norm": 0.22686584293842316, + "learning_rate": 0.0007438009787928222, + "loss": 0.2536, + "num_input_tokens_seen": 19705824, + "step": 9120 + }, + { + "epoch": 1.4885807504078303, + "grad_norm": 0.16893191635608673, + "learning_rate": 0.0007442088091353996, + "loss": 0.2919, + "num_input_tokens_seen": 19717056, + "step": 9125 + }, + { + "epoch": 1.4893964110929854, + "grad_norm": 0.13513310253620148, + "learning_rate": 0.0007446166394779772, + "loss": 0.0545, + "num_input_tokens_seen": 19728512, + "step": 9130 + }, + { + "epoch": 1.4902120717781404, + "grad_norm": 0.14521175622940063, + "learning_rate": 0.0007450244698205547, + "loss": 0.0621, + "num_input_tokens_seen": 19740768, + "step": 9135 + }, + { + "epoch": 1.4910277324632952, + "grad_norm": 0.31101036071777344, + "learning_rate": 0.0007454323001631322, + "loss": 0.1769, + "num_input_tokens_seen": 19752576, + "step": 9140 + }, + { + "epoch": 1.4918433931484503, + "grad_norm": 0.011045384220778942, + "learning_rate": 0.0007458401305057096, + "loss": 0.0416, + "num_input_tokens_seen": 19764064, + "step": 9145 + }, + { + "epoch": 1.4926590538336053, + "grad_norm": 0.015448955819010735, + "learning_rate": 0.0007462479608482871, + "loss": 0.1013, + "num_input_tokens_seen": 19776608, + "step": 9150 + }, + { + "epoch": 1.4934747145187601, + "grad_norm": 0.01463619526475668, + "learning_rate": 0.0007466557911908646, + "loss": 0.2255, + "num_input_tokens_seen": 19787488, + "step": 9155 + }, + { + "epoch": 1.4942903752039152, + "grad_norm": 0.05092671141028404, + "learning_rate": 0.0007470636215334421, + "loss": 0.0728, + "num_input_tokens_seen": 19798240, + "step": 9160 + }, + { + "epoch": 1.4951060358890702, + "grad_norm": 0.030885811895132065, + "learning_rate": 0.0007474714518760197, + "loss": 0.2295, + "num_input_tokens_seen": 19808384, + "step": 9165 + }, + { + "epoch": 1.495921696574225, + "grad_norm": 0.11534195393323898, + "learning_rate": 0.0007478792822185971, + "loss": 0.2156, + "num_input_tokens_seen": 19817696, + "step": 9170 + }, + { + "epoch": 1.49673735725938, + "grad_norm": 0.04448262229561806, + "learning_rate": 0.0007482871125611745, + "loss": 0.1033, + "num_input_tokens_seen": 19828384, + "step": 9175 + }, + { + "epoch": 1.497553017944535, + "grad_norm": 0.16696128249168396, + "learning_rate": 0.0007486949429037521, + "loss": 0.1396, + "num_input_tokens_seen": 19840704, + "step": 9180 + }, + { + "epoch": 1.49836867862969, + "grad_norm": 0.08385413140058517, + "learning_rate": 0.0007491027732463296, + "loss": 0.1197, + "num_input_tokens_seen": 19851648, + "step": 9185 + }, + { + "epoch": 1.499184339314845, + "grad_norm": 0.10853728652000427, + "learning_rate": 0.000749510603588907, + "loss": 0.0869, + "num_input_tokens_seen": 19861568, + "step": 9190 + }, + { + "epoch": 1.5, + "grad_norm": 0.016677072271704674, + "learning_rate": 0.0007499184339314845, + "loss": 0.0606, + "num_input_tokens_seen": 19871232, + "step": 9195 + }, + { + "epoch": 1.5008156606851548, + "grad_norm": 0.056000567972660065, + "learning_rate": 0.000750326264274062, + "loss": 0.0325, + "num_input_tokens_seen": 19882400, + "step": 9200 + }, + { + "epoch": 1.50163132137031, + "grad_norm": 0.2266232669353485, + "learning_rate": 0.0007507340946166395, + "loss": 0.1308, + "num_input_tokens_seen": 19893824, + "step": 9205 + }, + { + "epoch": 1.502446982055465, + "grad_norm": 0.05299942195415497, + "learning_rate": 0.000751141924959217, + "loss": 0.0541, + "num_input_tokens_seen": 19905280, + "step": 9210 + }, + { + "epoch": 1.50326264274062, + "grad_norm": 0.027796033769845963, + "learning_rate": 0.0007515497553017944, + "loss": 0.085, + "num_input_tokens_seen": 19916032, + "step": 9215 + }, + { + "epoch": 1.504078303425775, + "grad_norm": 0.032457806169986725, + "learning_rate": 0.000751957585644372, + "loss": 0.1097, + "num_input_tokens_seen": 19927232, + "step": 9220 + }, + { + "epoch": 1.5048939641109298, + "grad_norm": 0.014959331601858139, + "learning_rate": 0.0007523654159869494, + "loss": 0.0088, + "num_input_tokens_seen": 19936992, + "step": 9225 + }, + { + "epoch": 1.5057096247960848, + "grad_norm": 0.14538100361824036, + "learning_rate": 0.000752773246329527, + "loss": 0.0934, + "num_input_tokens_seen": 19948256, + "step": 9230 + }, + { + "epoch": 1.5065252854812399, + "grad_norm": 0.03245909512042999, + "learning_rate": 0.0007531810766721044, + "loss": 0.2254, + "num_input_tokens_seen": 19960416, + "step": 9235 + }, + { + "epoch": 1.5073409461663947, + "grad_norm": 0.17514733970165253, + "learning_rate": 0.0007535889070146818, + "loss": 0.1052, + "num_input_tokens_seen": 19970656, + "step": 9240 + }, + { + "epoch": 1.5081566068515497, + "grad_norm": 0.01840328611433506, + "learning_rate": 0.0007539967373572594, + "loss": 0.1254, + "num_input_tokens_seen": 19982176, + "step": 9245 + }, + { + "epoch": 1.5089722675367048, + "grad_norm": 0.29731595516204834, + "learning_rate": 0.0007544045676998369, + "loss": 0.2605, + "num_input_tokens_seen": 19991968, + "step": 9250 + }, + { + "epoch": 1.5097879282218596, + "grad_norm": 0.3750736117362976, + "learning_rate": 0.0007548123980424145, + "loss": 0.1608, + "num_input_tokens_seen": 20003968, + "step": 9255 + }, + { + "epoch": 1.5106035889070146, + "grad_norm": 0.16095604002475739, + "learning_rate": 0.0007552202283849918, + "loss": 0.1174, + "num_input_tokens_seen": 20014784, + "step": 9260 + }, + { + "epoch": 1.5114192495921697, + "grad_norm": 0.18407396972179413, + "learning_rate": 0.0007556280587275693, + "loss": 0.1921, + "num_input_tokens_seen": 20024192, + "step": 9265 + }, + { + "epoch": 1.5122349102773245, + "grad_norm": 0.03471226990222931, + "learning_rate": 0.0007560358890701469, + "loss": 0.0808, + "num_input_tokens_seen": 20034368, + "step": 9270 + }, + { + "epoch": 1.5130505709624797, + "grad_norm": 0.39387789368629456, + "learning_rate": 0.0007564437194127243, + "loss": 0.1819, + "num_input_tokens_seen": 20045216, + "step": 9275 + }, + { + "epoch": 1.5138662316476346, + "grad_norm": 0.03568276762962341, + "learning_rate": 0.0007568515497553018, + "loss": 0.0563, + "num_input_tokens_seen": 20056928, + "step": 9280 + }, + { + "epoch": 1.5146818923327896, + "grad_norm": 0.23924300074577332, + "learning_rate": 0.0007572593800978793, + "loss": 0.1762, + "num_input_tokens_seen": 20067520, + "step": 9285 + }, + { + "epoch": 1.5154975530179446, + "grad_norm": 0.06556529551744461, + "learning_rate": 0.0007576672104404568, + "loss": 0.0933, + "num_input_tokens_seen": 20077568, + "step": 9290 + }, + { + "epoch": 1.5163132137030995, + "grad_norm": 0.21158908307552338, + "learning_rate": 0.0007580750407830343, + "loss": 0.356, + "num_input_tokens_seen": 20087584, + "step": 9295 + }, + { + "epoch": 1.5171288743882545, + "grad_norm": 0.061087146401405334, + "learning_rate": 0.0007584828711256117, + "loss": 0.0795, + "num_input_tokens_seen": 20098880, + "step": 9300 + }, + { + "epoch": 1.5179445350734095, + "grad_norm": 0.21947313845157623, + "learning_rate": 0.0007588907014681893, + "loss": 0.1673, + "num_input_tokens_seen": 20109568, + "step": 9305 + }, + { + "epoch": 1.5187601957585644, + "grad_norm": 0.039298392832279205, + "learning_rate": 0.0007592985318107667, + "loss": 0.091, + "num_input_tokens_seen": 20121184, + "step": 9310 + }, + { + "epoch": 1.5195758564437194, + "grad_norm": 0.031186887994408607, + "learning_rate": 0.0007597063621533442, + "loss": 0.1188, + "num_input_tokens_seen": 20132608, + "step": 9315 + }, + { + "epoch": 1.5203915171288744, + "grad_norm": 0.06751430779695511, + "learning_rate": 0.0007601141924959218, + "loss": 0.0381, + "num_input_tokens_seen": 20143200, + "step": 9320 + }, + { + "epoch": 1.5212071778140293, + "grad_norm": 0.01318424567580223, + "learning_rate": 0.0007605220228384992, + "loss": 0.0985, + "num_input_tokens_seen": 20153344, + "step": 9325 + }, + { + "epoch": 1.5220228384991843, + "grad_norm": 0.12425762414932251, + "learning_rate": 0.0007609298531810767, + "loss": 0.109, + "num_input_tokens_seen": 20164288, + "step": 9330 + }, + { + "epoch": 1.5228384991843393, + "grad_norm": 0.2752070128917694, + "learning_rate": 0.0007613376835236542, + "loss": 0.1237, + "num_input_tokens_seen": 20174880, + "step": 9335 + }, + { + "epoch": 1.5236541598694942, + "grad_norm": 0.05250950902700424, + "learning_rate": 0.0007617455138662317, + "loss": 0.1344, + "num_input_tokens_seen": 20184960, + "step": 9340 + }, + { + "epoch": 1.5244698205546494, + "grad_norm": 0.07619812339544296, + "learning_rate": 0.0007621533442088091, + "loss": 0.0666, + "num_input_tokens_seen": 20194464, + "step": 9345 + }, + { + "epoch": 1.5252854812398042, + "grad_norm": 0.07250954210758209, + "learning_rate": 0.0007625611745513866, + "loss": 0.3211, + "num_input_tokens_seen": 20204352, + "step": 9350 + }, + { + "epoch": 1.5261011419249593, + "grad_norm": 0.04100877419114113, + "learning_rate": 0.0007629690048939642, + "loss": 0.0936, + "num_input_tokens_seen": 20214816, + "step": 9355 + }, + { + "epoch": 1.5269168026101143, + "grad_norm": 0.2989422678947449, + "learning_rate": 0.0007633768352365417, + "loss": 0.2116, + "num_input_tokens_seen": 20225664, + "step": 9360 + }, + { + "epoch": 1.5277324632952691, + "grad_norm": 0.16885030269622803, + "learning_rate": 0.000763784665579119, + "loss": 0.089, + "num_input_tokens_seen": 20236672, + "step": 9365 + }, + { + "epoch": 1.5285481239804242, + "grad_norm": 0.24660871922969818, + "learning_rate": 0.0007641924959216966, + "loss": 0.1751, + "num_input_tokens_seen": 20246784, + "step": 9370 + }, + { + "epoch": 1.5293637846655792, + "grad_norm": 0.02454557456076145, + "learning_rate": 0.0007646003262642741, + "loss": 0.0601, + "num_input_tokens_seen": 20257088, + "step": 9375 + }, + { + "epoch": 1.530179445350734, + "grad_norm": 0.03438768535852432, + "learning_rate": 0.0007650081566068515, + "loss": 0.0455, + "num_input_tokens_seen": 20267520, + "step": 9380 + }, + { + "epoch": 1.530995106035889, + "grad_norm": 0.05252232775092125, + "learning_rate": 0.0007654159869494291, + "loss": 0.102, + "num_input_tokens_seen": 20277568, + "step": 9385 + }, + { + "epoch": 1.531810766721044, + "grad_norm": 0.12352188676595688, + "learning_rate": 0.0007658238172920065, + "loss": 0.111, + "num_input_tokens_seen": 20288768, + "step": 9390 + }, + { + "epoch": 1.532626427406199, + "grad_norm": 0.1519812047481537, + "learning_rate": 0.0007662316476345841, + "loss": 0.1015, + "num_input_tokens_seen": 20299968, + "step": 9395 + }, + { + "epoch": 1.533442088091354, + "grad_norm": 0.11303206533193588, + "learning_rate": 0.0007666394779771615, + "loss": 0.1775, + "num_input_tokens_seen": 20310592, + "step": 9400 + }, + { + "epoch": 1.534257748776509, + "grad_norm": 0.2930207848548889, + "learning_rate": 0.000767047308319739, + "loss": 0.288, + "num_input_tokens_seen": 20321088, + "step": 9405 + }, + { + "epoch": 1.5350734094616638, + "grad_norm": 0.14500972628593445, + "learning_rate": 0.0007674551386623165, + "loss": 0.0861, + "num_input_tokens_seen": 20331136, + "step": 9410 + }, + { + "epoch": 1.535889070146819, + "grad_norm": 0.18947528302669525, + "learning_rate": 0.0007678629690048939, + "loss": 0.0965, + "num_input_tokens_seen": 20343136, + "step": 9415 + }, + { + "epoch": 1.536704730831974, + "grad_norm": 0.045977018773555756, + "learning_rate": 0.0007682707993474715, + "loss": 0.1988, + "num_input_tokens_seen": 20354656, + "step": 9420 + }, + { + "epoch": 1.5375203915171287, + "grad_norm": 0.07507748901844025, + "learning_rate": 0.000768678629690049, + "loss": 0.0542, + "num_input_tokens_seen": 20365856, + "step": 9425 + }, + { + "epoch": 1.538336052202284, + "grad_norm": 0.07397603243589401, + "learning_rate": 0.0007690864600326263, + "loss": 0.1039, + "num_input_tokens_seen": 20376064, + "step": 9430 + }, + { + "epoch": 1.5391517128874388, + "grad_norm": 0.05261943116784096, + "learning_rate": 0.0007694942903752039, + "loss": 0.0729, + "num_input_tokens_seen": 20387104, + "step": 9435 + }, + { + "epoch": 1.5399673735725938, + "grad_norm": 0.11922776699066162, + "learning_rate": 0.0007699021207177814, + "loss": 0.0589, + "num_input_tokens_seen": 20398112, + "step": 9440 + }, + { + "epoch": 1.5407830342577489, + "grad_norm": 0.06288610398769379, + "learning_rate": 0.000770309951060359, + "loss": 0.0493, + "num_input_tokens_seen": 20409152, + "step": 9445 + }, + { + "epoch": 1.5415986949429037, + "grad_norm": 0.0780673623085022, + "learning_rate": 0.0007707177814029364, + "loss": 0.0884, + "num_input_tokens_seen": 20420704, + "step": 9450 + }, + { + "epoch": 1.5424143556280587, + "grad_norm": 0.07994091510772705, + "learning_rate": 0.0007711256117455138, + "loss": 0.1297, + "num_input_tokens_seen": 20431456, + "step": 9455 + }, + { + "epoch": 1.5432300163132138, + "grad_norm": 0.058916181325912476, + "learning_rate": 0.0007715334420880914, + "loss": 0.0969, + "num_input_tokens_seen": 20442880, + "step": 9460 + }, + { + "epoch": 1.5440456769983686, + "grad_norm": 0.0766553208231926, + "learning_rate": 0.0007719412724306688, + "loss": 0.0257, + "num_input_tokens_seen": 20454624, + "step": 9465 + }, + { + "epoch": 1.5448613376835236, + "grad_norm": 0.03179330378770828, + "learning_rate": 0.0007723491027732464, + "loss": 0.1407, + "num_input_tokens_seen": 20465824, + "step": 9470 + }, + { + "epoch": 1.5456769983686787, + "grad_norm": 0.043396756052970886, + "learning_rate": 0.0007727569331158239, + "loss": 0.2144, + "num_input_tokens_seen": 20477088, + "step": 9475 + }, + { + "epoch": 1.5464926590538335, + "grad_norm": 0.005034312605857849, + "learning_rate": 0.0007731647634584013, + "loss": 0.063, + "num_input_tokens_seen": 20487296, + "step": 9480 + }, + { + "epoch": 1.5473083197389887, + "grad_norm": 0.033181268721818924, + "learning_rate": 0.0007735725938009788, + "loss": 0.2054, + "num_input_tokens_seen": 20498080, + "step": 9485 + }, + { + "epoch": 1.5481239804241436, + "grad_norm": 0.46535250544548035, + "learning_rate": 0.0007739804241435563, + "loss": 0.2473, + "num_input_tokens_seen": 20509824, + "step": 9490 + }, + { + "epoch": 1.5489396411092984, + "grad_norm": 0.08377546817064285, + "learning_rate": 0.0007743882544861339, + "loss": 0.1666, + "num_input_tokens_seen": 20520928, + "step": 9495 + }, + { + "epoch": 1.5497553017944536, + "grad_norm": 0.13419397175312042, + "learning_rate": 0.0007747960848287112, + "loss": 0.1501, + "num_input_tokens_seen": 20532288, + "step": 9500 + }, + { + "epoch": 1.5505709624796085, + "grad_norm": 0.09725387394428253, + "learning_rate": 0.0007752039151712887, + "loss": 0.1634, + "num_input_tokens_seen": 20543456, + "step": 9505 + }, + { + "epoch": 1.5513866231647635, + "grad_norm": 0.13389593362808228, + "learning_rate": 0.0007756117455138663, + "loss": 0.0908, + "num_input_tokens_seen": 20554592, + "step": 9510 + }, + { + "epoch": 1.5522022838499185, + "grad_norm": 0.28901609778404236, + "learning_rate": 0.0007760195758564438, + "loss": 0.1475, + "num_input_tokens_seen": 20565504, + "step": 9515 + }, + { + "epoch": 1.5530179445350734, + "grad_norm": 0.33048662543296814, + "learning_rate": 0.0007764274061990211, + "loss": 0.2592, + "num_input_tokens_seen": 20576896, + "step": 9520 + }, + { + "epoch": 1.5538336052202284, + "grad_norm": 0.07838031649589539, + "learning_rate": 0.0007768352365415987, + "loss": 0.1271, + "num_input_tokens_seen": 20586720, + "step": 9525 + }, + { + "epoch": 1.5546492659053834, + "grad_norm": 0.21444471180438995, + "learning_rate": 0.0007772430668841762, + "loss": 0.1455, + "num_input_tokens_seen": 20597376, + "step": 9530 + }, + { + "epoch": 1.5554649265905383, + "grad_norm": 0.08738496154546738, + "learning_rate": 0.0007776508972267537, + "loss": 0.1057, + "num_input_tokens_seen": 20608608, + "step": 9535 + }, + { + "epoch": 1.5562805872756933, + "grad_norm": 0.11887580901384354, + "learning_rate": 0.0007780587275693312, + "loss": 0.0979, + "num_input_tokens_seen": 20620128, + "step": 9540 + }, + { + "epoch": 1.5570962479608483, + "grad_norm": 0.012339063920080662, + "learning_rate": 0.0007784665579119086, + "loss": 0.0412, + "num_input_tokens_seen": 20631200, + "step": 9545 + }, + { + "epoch": 1.5579119086460032, + "grad_norm": 0.006594108883291483, + "learning_rate": 0.0007788743882544862, + "loss": 0.0315, + "num_input_tokens_seen": 20640352, + "step": 9550 + }, + { + "epoch": 1.5587275693311582, + "grad_norm": 0.32848331332206726, + "learning_rate": 0.0007792822185970636, + "loss": 0.1736, + "num_input_tokens_seen": 20652832, + "step": 9555 + }, + { + "epoch": 1.5595432300163132, + "grad_norm": 0.010082591325044632, + "learning_rate": 0.0007796900489396412, + "loss": 0.0581, + "num_input_tokens_seen": 20663488, + "step": 9560 + }, + { + "epoch": 1.560358890701468, + "grad_norm": 0.04229852929711342, + "learning_rate": 0.0007800978792822186, + "loss": 0.0641, + "num_input_tokens_seen": 20674464, + "step": 9565 + }, + { + "epoch": 1.5611745513866233, + "grad_norm": 0.097086101770401, + "learning_rate": 0.000780505709624796, + "loss": 0.1025, + "num_input_tokens_seen": 20685760, + "step": 9570 + }, + { + "epoch": 1.5619902120717781, + "grad_norm": 0.44878190755844116, + "learning_rate": 0.0007809135399673736, + "loss": 0.2277, + "num_input_tokens_seen": 20697280, + "step": 9575 + }, + { + "epoch": 1.5628058727569332, + "grad_norm": 0.005180702079087496, + "learning_rate": 0.0007813213703099511, + "loss": 0.1879, + "num_input_tokens_seen": 20708000, + "step": 9580 + }, + { + "epoch": 1.5636215334420882, + "grad_norm": 0.03598257154226303, + "learning_rate": 0.0007817292006525287, + "loss": 0.103, + "num_input_tokens_seen": 20719424, + "step": 9585 + }, + { + "epoch": 1.564437194127243, + "grad_norm": 0.18096789717674255, + "learning_rate": 0.000782137030995106, + "loss": 0.1517, + "num_input_tokens_seen": 20728096, + "step": 9590 + }, + { + "epoch": 1.565252854812398, + "grad_norm": 0.12832759320735931, + "learning_rate": 0.0007825448613376835, + "loss": 0.1847, + "num_input_tokens_seen": 20738816, + "step": 9595 + }, + { + "epoch": 1.566068515497553, + "grad_norm": 0.03327531740069389, + "learning_rate": 0.0007829526916802611, + "loss": 0.1345, + "num_input_tokens_seen": 20749408, + "step": 9600 + }, + { + "epoch": 1.566884176182708, + "grad_norm": 0.044648993760347366, + "learning_rate": 0.0007833605220228385, + "loss": 0.0479, + "num_input_tokens_seen": 20759200, + "step": 9605 + }, + { + "epoch": 1.567699836867863, + "grad_norm": 0.019522270187735558, + "learning_rate": 0.000783768352365416, + "loss": 0.1455, + "num_input_tokens_seen": 20770560, + "step": 9610 + }, + { + "epoch": 1.568515497553018, + "grad_norm": 0.1261817365884781, + "learning_rate": 0.0007841761827079935, + "loss": 0.1079, + "num_input_tokens_seen": 20781056, + "step": 9615 + }, + { + "epoch": 1.5693311582381728, + "grad_norm": 0.13513056933879852, + "learning_rate": 0.000784584013050571, + "loss": 0.052, + "num_input_tokens_seen": 20791648, + "step": 9620 + }, + { + "epoch": 1.5701468189233279, + "grad_norm": 0.1296224445104599, + "learning_rate": 0.0007849918433931485, + "loss": 0.1006, + "num_input_tokens_seen": 20804128, + "step": 9625 + }, + { + "epoch": 1.570962479608483, + "grad_norm": 0.0451621375977993, + "learning_rate": 0.000785399673735726, + "loss": 0.0751, + "num_input_tokens_seen": 20814752, + "step": 9630 + }, + { + "epoch": 1.5717781402936377, + "grad_norm": 0.0406937450170517, + "learning_rate": 0.0007858075040783035, + "loss": 0.1367, + "num_input_tokens_seen": 20826720, + "step": 9635 + }, + { + "epoch": 1.572593800978793, + "grad_norm": 0.06996465474367142, + "learning_rate": 0.0007862153344208809, + "loss": 0.0698, + "num_input_tokens_seen": 20837536, + "step": 9640 + }, + { + "epoch": 1.5734094616639478, + "grad_norm": 0.2727460563182831, + "learning_rate": 0.0007866231647634584, + "loss": 0.1619, + "num_input_tokens_seen": 20847744, + "step": 9645 + }, + { + "epoch": 1.5742251223491026, + "grad_norm": 0.12174668908119202, + "learning_rate": 0.000787030995106036, + "loss": 0.1635, + "num_input_tokens_seen": 20859136, + "step": 9650 + }, + { + "epoch": 1.5750407830342579, + "grad_norm": 0.16419067978858948, + "learning_rate": 0.0007874388254486133, + "loss": 0.1022, + "num_input_tokens_seen": 20868832, + "step": 9655 + }, + { + "epoch": 1.5758564437194127, + "grad_norm": 0.13058078289031982, + "learning_rate": 0.0007878466557911908, + "loss": 0.1202, + "num_input_tokens_seen": 20878080, + "step": 9660 + }, + { + "epoch": 1.5766721044045677, + "grad_norm": 0.20850171148777008, + "learning_rate": 0.0007882544861337684, + "loss": 0.1652, + "num_input_tokens_seen": 20889600, + "step": 9665 + }, + { + "epoch": 1.5774877650897228, + "grad_norm": 0.36702680587768555, + "learning_rate": 0.0007886623164763459, + "loss": 0.288, + "num_input_tokens_seen": 20899392, + "step": 9670 + }, + { + "epoch": 1.5783034257748776, + "grad_norm": 0.23824627697467804, + "learning_rate": 0.0007890701468189233, + "loss": 0.2362, + "num_input_tokens_seen": 20909888, + "step": 9675 + }, + { + "epoch": 1.5791190864600326, + "grad_norm": 0.10039061307907104, + "learning_rate": 0.0007894779771615008, + "loss": 0.0615, + "num_input_tokens_seen": 20920032, + "step": 9680 + }, + { + "epoch": 1.5799347471451877, + "grad_norm": 0.01969858631491661, + "learning_rate": 0.0007898858075040783, + "loss": 0.1192, + "num_input_tokens_seen": 20931136, + "step": 9685 + }, + { + "epoch": 1.5807504078303425, + "grad_norm": 0.02699236571788788, + "learning_rate": 0.0007902936378466558, + "loss": 0.082, + "num_input_tokens_seen": 20941792, + "step": 9690 + }, + { + "epoch": 1.5815660685154975, + "grad_norm": 0.04595969617366791, + "learning_rate": 0.0007907014681892332, + "loss": 0.1182, + "num_input_tokens_seen": 20952672, + "step": 9695 + }, + { + "epoch": 1.5823817292006526, + "grad_norm": 0.12928828597068787, + "learning_rate": 0.0007911092985318108, + "loss": 0.0772, + "num_input_tokens_seen": 20964480, + "step": 9700 + }, + { + "epoch": 1.5831973898858074, + "grad_norm": 0.04820702597498894, + "learning_rate": 0.0007915171288743883, + "loss": 0.1038, + "num_input_tokens_seen": 20975392, + "step": 9705 + }, + { + "epoch": 1.5840130505709626, + "grad_norm": 0.03449046239256859, + "learning_rate": 0.0007919249592169657, + "loss": 0.2203, + "num_input_tokens_seen": 20985344, + "step": 9710 + }, + { + "epoch": 1.5848287112561175, + "grad_norm": 0.3834605813026428, + "learning_rate": 0.0007923327895595433, + "loss": 0.1747, + "num_input_tokens_seen": 20995456, + "step": 9715 + }, + { + "epoch": 1.5856443719412723, + "grad_norm": 0.05773041024804115, + "learning_rate": 0.0007927406199021207, + "loss": 0.1023, + "num_input_tokens_seen": 21006816, + "step": 9720 + }, + { + "epoch": 1.5864600326264275, + "grad_norm": 0.22168226540088654, + "learning_rate": 0.0007931484502446982, + "loss": 0.1424, + "num_input_tokens_seen": 21017344, + "step": 9725 + }, + { + "epoch": 1.5872756933115824, + "grad_norm": 0.11118586361408234, + "learning_rate": 0.0007935562805872757, + "loss": 0.1115, + "num_input_tokens_seen": 21027040, + "step": 9730 + }, + { + "epoch": 1.5880913539967374, + "grad_norm": 0.040485039353370667, + "learning_rate": 0.0007939641109298532, + "loss": 0.0629, + "num_input_tokens_seen": 21037472, + "step": 9735 + }, + { + "epoch": 1.5889070146818924, + "grad_norm": 0.4708271026611328, + "learning_rate": 0.0007943719412724308, + "loss": 0.1649, + "num_input_tokens_seen": 21048800, + "step": 9740 + }, + { + "epoch": 1.5897226753670473, + "grad_norm": 0.19345107674598694, + "learning_rate": 0.0007947797716150081, + "loss": 0.2276, + "num_input_tokens_seen": 21060672, + "step": 9745 + }, + { + "epoch": 1.5905383360522023, + "grad_norm": 0.05481063947081566, + "learning_rate": 0.0007951876019575857, + "loss": 0.154, + "num_input_tokens_seen": 21070880, + "step": 9750 + }, + { + "epoch": 1.5913539967373573, + "grad_norm": 0.11768931150436401, + "learning_rate": 0.0007955954323001632, + "loss": 0.2829, + "num_input_tokens_seen": 21081792, + "step": 9755 + }, + { + "epoch": 1.5921696574225122, + "grad_norm": 0.12601208686828613, + "learning_rate": 0.0007960032626427406, + "loss": 0.0859, + "num_input_tokens_seen": 21093280, + "step": 9760 + }, + { + "epoch": 1.5929853181076672, + "grad_norm": 0.1481441855430603, + "learning_rate": 0.0007964110929853181, + "loss": 0.0906, + "num_input_tokens_seen": 21103264, + "step": 9765 + }, + { + "epoch": 1.5938009787928222, + "grad_norm": 0.04904396831989288, + "learning_rate": 0.0007968189233278956, + "loss": 0.1373, + "num_input_tokens_seen": 21113632, + "step": 9770 + }, + { + "epoch": 1.594616639477977, + "grad_norm": 0.2448497712612152, + "learning_rate": 0.0007972267536704732, + "loss": 0.205, + "num_input_tokens_seen": 21124448, + "step": 9775 + }, + { + "epoch": 1.595432300163132, + "grad_norm": 0.28078874945640564, + "learning_rate": 0.0007976345840130506, + "loss": 0.2114, + "num_input_tokens_seen": 21135776, + "step": 9780 + }, + { + "epoch": 1.5962479608482871, + "grad_norm": 0.05803528055548668, + "learning_rate": 0.000798042414355628, + "loss": 0.1542, + "num_input_tokens_seen": 21147072, + "step": 9785 + }, + { + "epoch": 1.597063621533442, + "grad_norm": 0.12829574942588806, + "learning_rate": 0.0007984502446982056, + "loss": 0.1289, + "num_input_tokens_seen": 21158848, + "step": 9790 + }, + { + "epoch": 1.5978792822185972, + "grad_norm": 0.0741729810833931, + "learning_rate": 0.000798858075040783, + "loss": 0.0633, + "num_input_tokens_seen": 21168448, + "step": 9795 + }, + { + "epoch": 1.598694942903752, + "grad_norm": 0.2089935541152954, + "learning_rate": 0.0007992659053833605, + "loss": 0.1512, + "num_input_tokens_seen": 21179744, + "step": 9800 + }, + { + "epoch": 1.599510603588907, + "grad_norm": 0.1523425430059433, + "learning_rate": 0.0007996737357259381, + "loss": 0.1326, + "num_input_tokens_seen": 21191904, + "step": 9805 + }, + { + "epoch": 1.600326264274062, + "grad_norm": 0.020436199381947517, + "learning_rate": 0.0008000815660685155, + "loss": 0.1332, + "num_input_tokens_seen": 21202848, + "step": 9810 + }, + { + "epoch": 1.601141924959217, + "grad_norm": 0.041961271315813065, + "learning_rate": 0.000800489396411093, + "loss": 0.062, + "num_input_tokens_seen": 21214304, + "step": 9815 + }, + { + "epoch": 1.601957585644372, + "grad_norm": 0.12393607199192047, + "learning_rate": 0.0008008972267536705, + "loss": 0.0793, + "num_input_tokens_seen": 21224448, + "step": 9820 + }, + { + "epoch": 1.602773246329527, + "grad_norm": 0.29161787033081055, + "learning_rate": 0.000801305057096248, + "loss": 0.2252, + "num_input_tokens_seen": 21235392, + "step": 9825 + }, + { + "epoch": 1.6035889070146818, + "grad_norm": 0.09719390422105789, + "learning_rate": 0.0008017128874388254, + "loss": 0.1203, + "num_input_tokens_seen": 21246656, + "step": 9830 + }, + { + "epoch": 1.6044045676998369, + "grad_norm": 0.16663017868995667, + "learning_rate": 0.0008021207177814029, + "loss": 0.0721, + "num_input_tokens_seen": 21257376, + "step": 9835 + }, + { + "epoch": 1.605220228384992, + "grad_norm": 0.05958310514688492, + "learning_rate": 0.0008025285481239805, + "loss": 0.1304, + "num_input_tokens_seen": 21267456, + "step": 9840 + }, + { + "epoch": 1.6060358890701467, + "grad_norm": 0.18147332966327667, + "learning_rate": 0.000802936378466558, + "loss": 0.1667, + "num_input_tokens_seen": 21279104, + "step": 9845 + }, + { + "epoch": 1.6068515497553018, + "grad_norm": 0.022242316976189613, + "learning_rate": 0.0008033442088091353, + "loss": 0.229, + "num_input_tokens_seen": 21291328, + "step": 9850 + }, + { + "epoch": 1.6076672104404568, + "grad_norm": 0.1411367803812027, + "learning_rate": 0.0008037520391517129, + "loss": 0.0981, + "num_input_tokens_seen": 21302016, + "step": 9855 + }, + { + "epoch": 1.6084828711256116, + "grad_norm": 0.19621413946151733, + "learning_rate": 0.0008041598694942904, + "loss": 0.128, + "num_input_tokens_seen": 21312448, + "step": 9860 + }, + { + "epoch": 1.6092985318107669, + "grad_norm": 0.06629271060228348, + "learning_rate": 0.0008045676998368679, + "loss": 0.0462, + "num_input_tokens_seen": 21323136, + "step": 9865 + }, + { + "epoch": 1.6101141924959217, + "grad_norm": 0.21681833267211914, + "learning_rate": 0.0008049755301794454, + "loss": 0.0824, + "num_input_tokens_seen": 21334368, + "step": 9870 + }, + { + "epoch": 1.6109298531810765, + "grad_norm": 0.07766492664813995, + "learning_rate": 0.0008053833605220228, + "loss": 0.0689, + "num_input_tokens_seen": 21344992, + "step": 9875 + }, + { + "epoch": 1.6117455138662318, + "grad_norm": 0.029593780636787415, + "learning_rate": 0.0008057911908646003, + "loss": 0.0666, + "num_input_tokens_seen": 21354688, + "step": 9880 + }, + { + "epoch": 1.6125611745513866, + "grad_norm": 0.3840160667896271, + "learning_rate": 0.0008061990212071778, + "loss": 0.3735, + "num_input_tokens_seen": 21365696, + "step": 9885 + }, + { + "epoch": 1.6133768352365416, + "grad_norm": 0.2226332128047943, + "learning_rate": 0.0008066068515497554, + "loss": 0.2567, + "num_input_tokens_seen": 21378240, + "step": 9890 + }, + { + "epoch": 1.6141924959216967, + "grad_norm": 0.36604976654052734, + "learning_rate": 0.0008070146818923329, + "loss": 0.3924, + "num_input_tokens_seen": 21388992, + "step": 9895 + }, + { + "epoch": 1.6150081566068515, + "grad_norm": 0.04600756615400314, + "learning_rate": 0.0008074225122349102, + "loss": 0.1061, + "num_input_tokens_seen": 21399808, + "step": 9900 + }, + { + "epoch": 1.6158238172920065, + "grad_norm": 0.071851447224617, + "learning_rate": 0.0008078303425774878, + "loss": 0.0776, + "num_input_tokens_seen": 21409696, + "step": 9905 + }, + { + "epoch": 1.6166394779771616, + "grad_norm": 0.02987734228372574, + "learning_rate": 0.0008082381729200653, + "loss": 0.0843, + "num_input_tokens_seen": 21420416, + "step": 9910 + }, + { + "epoch": 1.6174551386623164, + "grad_norm": 0.3032320439815521, + "learning_rate": 0.0008086460032626428, + "loss": 0.2382, + "num_input_tokens_seen": 21431648, + "step": 9915 + }, + { + "epoch": 1.6182707993474714, + "grad_norm": 0.07586050778627396, + "learning_rate": 0.0008090538336052202, + "loss": 0.1852, + "num_input_tokens_seen": 21441856, + "step": 9920 + }, + { + "epoch": 1.6190864600326265, + "grad_norm": 0.08818615227937698, + "learning_rate": 0.0008094616639477977, + "loss": 0.2086, + "num_input_tokens_seen": 21452736, + "step": 9925 + }, + { + "epoch": 1.6199021207177813, + "grad_norm": 0.05056015029549599, + "learning_rate": 0.0008098694942903753, + "loss": 0.1165, + "num_input_tokens_seen": 21463552, + "step": 9930 + }, + { + "epoch": 1.6207177814029365, + "grad_norm": 0.10703907161951065, + "learning_rate": 0.0008102773246329527, + "loss": 0.1589, + "num_input_tokens_seen": 21475136, + "step": 9935 + }, + { + "epoch": 1.6215334420880914, + "grad_norm": 0.016163919121026993, + "learning_rate": 0.0008106851549755301, + "loss": 0.1409, + "num_input_tokens_seen": 21485536, + "step": 9940 + }, + { + "epoch": 1.6223491027732462, + "grad_norm": 0.0341041274368763, + "learning_rate": 0.0008110929853181077, + "loss": 0.1248, + "num_input_tokens_seen": 21496320, + "step": 9945 + }, + { + "epoch": 1.6231647634584014, + "grad_norm": 0.060666803270578384, + "learning_rate": 0.0008115008156606851, + "loss": 0.0667, + "num_input_tokens_seen": 21506688, + "step": 9950 + }, + { + "epoch": 1.6239804241435563, + "grad_norm": 0.2748398780822754, + "learning_rate": 0.0008119086460032627, + "loss": 0.2972, + "num_input_tokens_seen": 21517184, + "step": 9955 + }, + { + "epoch": 1.6247960848287113, + "grad_norm": 0.056389469653367996, + "learning_rate": 0.0008123164763458402, + "loss": 0.0547, + "num_input_tokens_seen": 21527104, + "step": 9960 + }, + { + "epoch": 1.6256117455138663, + "grad_norm": 0.29895633459091187, + "learning_rate": 0.0008127243066884176, + "loss": 0.2512, + "num_input_tokens_seen": 21537024, + "step": 9965 + }, + { + "epoch": 1.6264274061990212, + "grad_norm": 0.18905404210090637, + "learning_rate": 0.0008131321370309951, + "loss": 0.1181, + "num_input_tokens_seen": 21547648, + "step": 9970 + }, + { + "epoch": 1.6272430668841762, + "grad_norm": 0.09550945460796356, + "learning_rate": 0.0008135399673735726, + "loss": 0.0961, + "num_input_tokens_seen": 21558944, + "step": 9975 + }, + { + "epoch": 1.6280587275693312, + "grad_norm": 0.23355981707572937, + "learning_rate": 0.0008139477977161502, + "loss": 0.2585, + "num_input_tokens_seen": 21569408, + "step": 9980 + }, + { + "epoch": 1.628874388254486, + "grad_norm": 0.1826048642396927, + "learning_rate": 0.0008143556280587275, + "loss": 0.1497, + "num_input_tokens_seen": 21580384, + "step": 9985 + }, + { + "epoch": 1.629690048939641, + "grad_norm": 0.08309773355722427, + "learning_rate": 0.000814763458401305, + "loss": 0.1063, + "num_input_tokens_seen": 21591328, + "step": 9990 + }, + { + "epoch": 1.6305057096247961, + "grad_norm": 0.11571859568357468, + "learning_rate": 0.0008151712887438826, + "loss": 0.0889, + "num_input_tokens_seen": 21603456, + "step": 9995 + }, + { + "epoch": 1.631321370309951, + "grad_norm": 0.2297494262456894, + "learning_rate": 0.0008155791190864601, + "loss": 0.1695, + "num_input_tokens_seen": 21613248, + "step": 10000 + }, + { + "epoch": 1.632137030995106, + "grad_norm": 0.049684058874845505, + "learning_rate": 0.0008159869494290375, + "loss": 0.0506, + "num_input_tokens_seen": 21624256, + "step": 10005 + }, + { + "epoch": 1.632952691680261, + "grad_norm": 0.06639562547206879, + "learning_rate": 0.000816394779771615, + "loss": 0.0637, + "num_input_tokens_seen": 21636032, + "step": 10010 + }, + { + "epoch": 1.6337683523654158, + "grad_norm": 0.10734783113002777, + "learning_rate": 0.0008168026101141925, + "loss": 0.0935, + "num_input_tokens_seen": 21647264, + "step": 10015 + }, + { + "epoch": 1.634584013050571, + "grad_norm": 0.2397036999464035, + "learning_rate": 0.00081721044045677, + "loss": 0.0696, + "num_input_tokens_seen": 21658496, + "step": 10020 + }, + { + "epoch": 1.635399673735726, + "grad_norm": 0.10784047842025757, + "learning_rate": 0.0008176182707993475, + "loss": 0.1956, + "num_input_tokens_seen": 21669248, + "step": 10025 + }, + { + "epoch": 1.636215334420881, + "grad_norm": 0.1248219832777977, + "learning_rate": 0.000818026101141925, + "loss": 0.0431, + "num_input_tokens_seen": 21679840, + "step": 10030 + }, + { + "epoch": 1.637030995106036, + "grad_norm": 0.015480201691389084, + "learning_rate": 0.0008184339314845025, + "loss": 0.1137, + "num_input_tokens_seen": 21691904, + "step": 10035 + }, + { + "epoch": 1.6378466557911908, + "grad_norm": 0.01644779182970524, + "learning_rate": 0.0008188417618270799, + "loss": 0.0286, + "num_input_tokens_seen": 21701408, + "step": 10040 + }, + { + "epoch": 1.6386623164763459, + "grad_norm": 0.08081140369176865, + "learning_rate": 0.0008192495921696575, + "loss": 0.0458, + "num_input_tokens_seen": 21712256, + "step": 10045 + }, + { + "epoch": 1.639477977161501, + "grad_norm": 0.09865362197160721, + "learning_rate": 0.0008196574225122349, + "loss": 0.0516, + "num_input_tokens_seen": 21722304, + "step": 10050 + }, + { + "epoch": 1.6402936378466557, + "grad_norm": 0.10547200590372086, + "learning_rate": 0.0008200652528548124, + "loss": 0.0661, + "num_input_tokens_seen": 21733152, + "step": 10055 + }, + { + "epoch": 1.6411092985318108, + "grad_norm": 0.2603350877761841, + "learning_rate": 0.0008204730831973899, + "loss": 0.134, + "num_input_tokens_seen": 21744160, + "step": 10060 + }, + { + "epoch": 1.6419249592169658, + "grad_norm": 0.16272181272506714, + "learning_rate": 0.0008208809135399674, + "loss": 0.0737, + "num_input_tokens_seen": 21754432, + "step": 10065 + }, + { + "epoch": 1.6427406199021206, + "grad_norm": 0.07605135440826416, + "learning_rate": 0.000821288743882545, + "loss": 0.0646, + "num_input_tokens_seen": 21764928, + "step": 10070 + }, + { + "epoch": 1.6435562805872757, + "grad_norm": 0.28782913088798523, + "learning_rate": 0.0008216965742251223, + "loss": 0.1637, + "num_input_tokens_seen": 21775008, + "step": 10075 + }, + { + "epoch": 1.6443719412724307, + "grad_norm": 0.3939211666584015, + "learning_rate": 0.0008221044045676999, + "loss": 0.0873, + "num_input_tokens_seen": 21785792, + "step": 10080 + }, + { + "epoch": 1.6451876019575855, + "grad_norm": 0.06839653104543686, + "learning_rate": 0.0008225122349102774, + "loss": 0.1476, + "num_input_tokens_seen": 21796704, + "step": 10085 + }, + { + "epoch": 1.6460032626427408, + "grad_norm": 0.047159500420093536, + "learning_rate": 0.0008229200652528548, + "loss": 0.3164, + "num_input_tokens_seen": 21807840, + "step": 10090 + }, + { + "epoch": 1.6468189233278956, + "grad_norm": 0.12887166440486908, + "learning_rate": 0.0008233278955954323, + "loss": 0.1284, + "num_input_tokens_seen": 21818528, + "step": 10095 + }, + { + "epoch": 1.6476345840130504, + "grad_norm": 0.015008017420768738, + "learning_rate": 0.0008237357259380098, + "loss": 0.0651, + "num_input_tokens_seen": 21829472, + "step": 10100 + }, + { + "epoch": 1.6484502446982057, + "grad_norm": 0.1243605688214302, + "learning_rate": 0.0008241435562805873, + "loss": 0.2511, + "num_input_tokens_seen": 21841024, + "step": 10105 + }, + { + "epoch": 1.6492659053833605, + "grad_norm": 0.1753050982952118, + "learning_rate": 0.0008245513866231648, + "loss": 0.23, + "num_input_tokens_seen": 21852096, + "step": 10110 + }, + { + "epoch": 1.6500815660685155, + "grad_norm": 0.15876537561416626, + "learning_rate": 0.0008249592169657422, + "loss": 0.1388, + "num_input_tokens_seen": 21863904, + "step": 10115 + }, + { + "epoch": 1.6508972267536706, + "grad_norm": 0.1318834275007248, + "learning_rate": 0.0008253670473083198, + "loss": 0.1527, + "num_input_tokens_seen": 21874048, + "step": 10120 + }, + { + "epoch": 1.6517128874388254, + "grad_norm": 0.23546653985977173, + "learning_rate": 0.0008257748776508972, + "loss": 0.1547, + "num_input_tokens_seen": 21885632, + "step": 10125 + }, + { + "epoch": 1.6525285481239804, + "grad_norm": 0.051521025598049164, + "learning_rate": 0.0008261827079934747, + "loss": 0.2125, + "num_input_tokens_seen": 21896864, + "step": 10130 + }, + { + "epoch": 1.6533442088091355, + "grad_norm": 0.14373049139976501, + "learning_rate": 0.0008265905383360523, + "loss": 0.118, + "num_input_tokens_seen": 21907712, + "step": 10135 + }, + { + "epoch": 1.6541598694942903, + "grad_norm": 0.0739160105586052, + "learning_rate": 0.0008269983686786296, + "loss": 0.1091, + "num_input_tokens_seen": 21918464, + "step": 10140 + }, + { + "epoch": 1.6549755301794453, + "grad_norm": 0.04132320359349251, + "learning_rate": 0.0008274061990212072, + "loss": 0.1063, + "num_input_tokens_seen": 21929664, + "step": 10145 + }, + { + "epoch": 1.6557911908646004, + "grad_norm": 0.04538346081972122, + "learning_rate": 0.0008278140293637847, + "loss": 0.1459, + "num_input_tokens_seen": 21940288, + "step": 10150 + }, + { + "epoch": 1.6566068515497552, + "grad_norm": 0.09234213829040527, + "learning_rate": 0.0008282218597063622, + "loss": 0.0304, + "num_input_tokens_seen": 21950464, + "step": 10155 + }, + { + "epoch": 1.6574225122349104, + "grad_norm": 0.19551825523376465, + "learning_rate": 0.0008286296900489396, + "loss": 0.153, + "num_input_tokens_seen": 21961696, + "step": 10160 + }, + { + "epoch": 1.6582381729200653, + "grad_norm": 0.013777018524706364, + "learning_rate": 0.0008290375203915171, + "loss": 0.0799, + "num_input_tokens_seen": 21972192, + "step": 10165 + }, + { + "epoch": 1.65905383360522, + "grad_norm": 0.024932939559221268, + "learning_rate": 0.0008294453507340947, + "loss": 0.1541, + "num_input_tokens_seen": 21982112, + "step": 10170 + }, + { + "epoch": 1.6598694942903753, + "grad_norm": 0.0689426064491272, + "learning_rate": 0.0008298531810766721, + "loss": 0.1206, + "num_input_tokens_seen": 21993568, + "step": 10175 + }, + { + "epoch": 1.6606851549755302, + "grad_norm": 0.20779022574424744, + "learning_rate": 0.0008302610114192496, + "loss": 0.131, + "num_input_tokens_seen": 22003296, + "step": 10180 + }, + { + "epoch": 1.6615008156606852, + "grad_norm": 0.01464125420898199, + "learning_rate": 0.0008306688417618271, + "loss": 0.0764, + "num_input_tokens_seen": 22013344, + "step": 10185 + }, + { + "epoch": 1.6623164763458402, + "grad_norm": 0.041058529168367386, + "learning_rate": 0.0008310766721044046, + "loss": 0.1722, + "num_input_tokens_seen": 22022112, + "step": 10190 + }, + { + "epoch": 1.663132137030995, + "grad_norm": 0.018639344722032547, + "learning_rate": 0.0008314845024469821, + "loss": 0.1507, + "num_input_tokens_seen": 22032704, + "step": 10195 + }, + { + "epoch": 1.66394779771615, + "grad_norm": 0.017282113432884216, + "learning_rate": 0.0008318923327895596, + "loss": 0.1048, + "num_input_tokens_seen": 22044320, + "step": 10200 + }, + { + "epoch": 1.6647634584013051, + "grad_norm": 0.006951620802283287, + "learning_rate": 0.000832300163132137, + "loss": 0.0282, + "num_input_tokens_seen": 22055008, + "step": 10205 + }, + { + "epoch": 1.66557911908646, + "grad_norm": 0.12123756110668182, + "learning_rate": 0.0008327079934747145, + "loss": 0.155, + "num_input_tokens_seen": 22065920, + "step": 10210 + }, + { + "epoch": 1.666394779771615, + "grad_norm": 0.0201109666377306, + "learning_rate": 0.000833115823817292, + "loss": 0.0625, + "num_input_tokens_seen": 22077024, + "step": 10215 + }, + { + "epoch": 1.66721044045677, + "grad_norm": 0.36986878514289856, + "learning_rate": 0.0008335236541598696, + "loss": 0.3849, + "num_input_tokens_seen": 22088320, + "step": 10220 + }, + { + "epoch": 1.6680261011419248, + "grad_norm": 0.010629205033183098, + "learning_rate": 0.0008339314845024471, + "loss": 0.0607, + "num_input_tokens_seen": 22098528, + "step": 10225 + }, + { + "epoch": 1.6688417618270799, + "grad_norm": 0.2106454223394394, + "learning_rate": 0.0008343393148450244, + "loss": 0.2102, + "num_input_tokens_seen": 22109792, + "step": 10230 + }, + { + "epoch": 1.669657422512235, + "grad_norm": 0.029254142194986343, + "learning_rate": 0.000834747145187602, + "loss": 0.1715, + "num_input_tokens_seen": 22120064, + "step": 10235 + }, + { + "epoch": 1.6704730831973897, + "grad_norm": 0.09629207849502563, + "learning_rate": 0.0008351549755301795, + "loss": 0.1165, + "num_input_tokens_seen": 22131200, + "step": 10240 + }, + { + "epoch": 1.671288743882545, + "grad_norm": 0.06033516675233841, + "learning_rate": 0.0008355628058727569, + "loss": 0.0895, + "num_input_tokens_seen": 22142880, + "step": 10245 + }, + { + "epoch": 1.6721044045676998, + "grad_norm": 0.05895635485649109, + "learning_rate": 0.0008359706362153344, + "loss": 0.2684, + "num_input_tokens_seen": 22151744, + "step": 10250 + }, + { + "epoch": 1.6729200652528549, + "grad_norm": 0.10152393579483032, + "learning_rate": 0.0008363784665579119, + "loss": 0.1143, + "num_input_tokens_seen": 22162432, + "step": 10255 + }, + { + "epoch": 1.67373572593801, + "grad_norm": 0.08833316713571548, + "learning_rate": 0.0008367862969004895, + "loss": 0.0898, + "num_input_tokens_seen": 22174432, + "step": 10260 + }, + { + "epoch": 1.6745513866231647, + "grad_norm": 0.14907990396022797, + "learning_rate": 0.0008371941272430669, + "loss": 0.151, + "num_input_tokens_seen": 22185184, + "step": 10265 + }, + { + "epoch": 1.6753670473083198, + "grad_norm": 0.01790524460375309, + "learning_rate": 0.0008376019575856443, + "loss": 0.1193, + "num_input_tokens_seen": 22196480, + "step": 10270 + }, + { + "epoch": 1.6761827079934748, + "grad_norm": 0.02373746782541275, + "learning_rate": 0.0008380097879282219, + "loss": 0.1322, + "num_input_tokens_seen": 22207392, + "step": 10275 + }, + { + "epoch": 1.6769983686786296, + "grad_norm": 0.1984463483095169, + "learning_rate": 0.0008384176182707993, + "loss": 0.1432, + "num_input_tokens_seen": 22218880, + "step": 10280 + }, + { + "epoch": 1.6778140293637847, + "grad_norm": 0.020526498556137085, + "learning_rate": 0.0008388254486133769, + "loss": 0.1844, + "num_input_tokens_seen": 22229184, + "step": 10285 + }, + { + "epoch": 1.6786296900489397, + "grad_norm": 0.21198874711990356, + "learning_rate": 0.0008392332789559544, + "loss": 0.1638, + "num_input_tokens_seen": 22240096, + "step": 10290 + }, + { + "epoch": 1.6794453507340945, + "grad_norm": 0.019683420658111572, + "learning_rate": 0.0008396411092985318, + "loss": 0.2849, + "num_input_tokens_seen": 22251616, + "step": 10295 + }, + { + "epoch": 1.6802610114192496, + "grad_norm": 0.030743658542633057, + "learning_rate": 0.0008400489396411093, + "loss": 0.1036, + "num_input_tokens_seen": 22262464, + "step": 10300 + }, + { + "epoch": 1.6810766721044046, + "grad_norm": 0.11596253514289856, + "learning_rate": 0.0008404567699836868, + "loss": 0.1571, + "num_input_tokens_seen": 22272640, + "step": 10305 + }, + { + "epoch": 1.6818923327895594, + "grad_norm": 0.15217702090740204, + "learning_rate": 0.0008408646003262644, + "loss": 0.1719, + "num_input_tokens_seen": 22283264, + "step": 10310 + }, + { + "epoch": 1.6827079934747147, + "grad_norm": 0.03295318782329559, + "learning_rate": 0.0008412724306688417, + "loss": 0.0892, + "num_input_tokens_seen": 22294112, + "step": 10315 + }, + { + "epoch": 1.6835236541598695, + "grad_norm": 0.08529092371463776, + "learning_rate": 0.0008416802610114192, + "loss": 0.1603, + "num_input_tokens_seen": 22305184, + "step": 10320 + }, + { + "epoch": 1.6843393148450243, + "grad_norm": 0.11946918815374374, + "learning_rate": 0.0008420880913539968, + "loss": 0.1235, + "num_input_tokens_seen": 22315840, + "step": 10325 + }, + { + "epoch": 1.6851549755301796, + "grad_norm": 0.2306964099407196, + "learning_rate": 0.0008424959216965743, + "loss": 0.1512, + "num_input_tokens_seen": 22325728, + "step": 10330 + }, + { + "epoch": 1.6859706362153344, + "grad_norm": 0.11452781409025192, + "learning_rate": 0.0008429037520391518, + "loss": 0.1231, + "num_input_tokens_seen": 22335904, + "step": 10335 + }, + { + "epoch": 1.6867862969004894, + "grad_norm": 0.10477444529533386, + "learning_rate": 0.0008433115823817292, + "loss": 0.184, + "num_input_tokens_seen": 22345184, + "step": 10340 + }, + { + "epoch": 1.6876019575856445, + "grad_norm": 0.287737637758255, + "learning_rate": 0.0008437194127243067, + "loss": 0.2148, + "num_input_tokens_seen": 22356736, + "step": 10345 + }, + { + "epoch": 1.6884176182707993, + "grad_norm": 0.08812838047742844, + "learning_rate": 0.0008441272430668842, + "loss": 0.0983, + "num_input_tokens_seen": 22368704, + "step": 10350 + }, + { + "epoch": 1.6892332789559543, + "grad_norm": 0.030177898705005646, + "learning_rate": 0.0008445350734094617, + "loss": 0.2643, + "num_input_tokens_seen": 22379904, + "step": 10355 + }, + { + "epoch": 1.6900489396411094, + "grad_norm": 0.07883886992931366, + "learning_rate": 0.0008449429037520392, + "loss": 0.1075, + "num_input_tokens_seen": 22391776, + "step": 10360 + }, + { + "epoch": 1.6908646003262642, + "grad_norm": 0.16962724924087524, + "learning_rate": 0.0008453507340946166, + "loss": 0.1294, + "num_input_tokens_seen": 22401792, + "step": 10365 + }, + { + "epoch": 1.6916802610114192, + "grad_norm": 0.2183760106563568, + "learning_rate": 0.0008457585644371941, + "loss": 0.126, + "num_input_tokens_seen": 22411968, + "step": 10370 + }, + { + "epoch": 1.6924959216965743, + "grad_norm": 0.12575066089630127, + "learning_rate": 0.0008461663947797717, + "loss": 0.1403, + "num_input_tokens_seen": 22424000, + "step": 10375 + }, + { + "epoch": 1.693311582381729, + "grad_norm": 0.12444450706243515, + "learning_rate": 0.0008465742251223492, + "loss": 0.1251, + "num_input_tokens_seen": 22434048, + "step": 10380 + }, + { + "epoch": 1.6941272430668843, + "grad_norm": 0.014765706844627857, + "learning_rate": 0.0008469820554649265, + "loss": 0.1346, + "num_input_tokens_seen": 22446176, + "step": 10385 + }, + { + "epoch": 1.6949429037520392, + "grad_norm": 0.1841944307088852, + "learning_rate": 0.0008473898858075041, + "loss": 0.0711, + "num_input_tokens_seen": 22457600, + "step": 10390 + }, + { + "epoch": 1.695758564437194, + "grad_norm": 0.0306500606238842, + "learning_rate": 0.0008477977161500816, + "loss": 0.0725, + "num_input_tokens_seen": 22468768, + "step": 10395 + }, + { + "epoch": 1.6965742251223492, + "grad_norm": 0.17870499193668365, + "learning_rate": 0.0008482055464926591, + "loss": 0.1207, + "num_input_tokens_seen": 22480448, + "step": 10400 + }, + { + "epoch": 1.697389885807504, + "grad_norm": 0.16828739643096924, + "learning_rate": 0.0008486133768352365, + "loss": 0.0722, + "num_input_tokens_seen": 22490880, + "step": 10405 + }, + { + "epoch": 1.698205546492659, + "grad_norm": 0.0035635128151625395, + "learning_rate": 0.000849021207177814, + "loss": 0.2761, + "num_input_tokens_seen": 22500704, + "step": 10410 + }, + { + "epoch": 1.6990212071778141, + "grad_norm": 0.0133226178586483, + "learning_rate": 0.0008494290375203916, + "loss": 0.0865, + "num_input_tokens_seen": 22511040, + "step": 10415 + }, + { + "epoch": 1.699836867862969, + "grad_norm": 0.022621020674705505, + "learning_rate": 0.000849836867862969, + "loss": 0.0859, + "num_input_tokens_seen": 22521664, + "step": 10420 + }, + { + "epoch": 1.700652528548124, + "grad_norm": 0.13418453931808472, + "learning_rate": 0.0008502446982055465, + "loss": 0.0984, + "num_input_tokens_seen": 22532192, + "step": 10425 + }, + { + "epoch": 1.701468189233279, + "grad_norm": 0.03183869272470474, + "learning_rate": 0.000850652528548124, + "loss": 0.1715, + "num_input_tokens_seen": 22543712, + "step": 10430 + }, + { + "epoch": 1.7022838499184338, + "grad_norm": 0.07206690311431885, + "learning_rate": 0.0008510603588907014, + "loss": 0.1131, + "num_input_tokens_seen": 22555232, + "step": 10435 + }, + { + "epoch": 1.7030995106035889, + "grad_norm": 0.10996096581220627, + "learning_rate": 0.000851468189233279, + "loss": 0.0312, + "num_input_tokens_seen": 22566720, + "step": 10440 + }, + { + "epoch": 1.703915171288744, + "grad_norm": 0.07621190696954727, + "learning_rate": 0.0008518760195758565, + "loss": 0.0909, + "num_input_tokens_seen": 22577376, + "step": 10445 + }, + { + "epoch": 1.7047308319738987, + "grad_norm": 0.4012081027030945, + "learning_rate": 0.000852283849918434, + "loss": 0.1315, + "num_input_tokens_seen": 22589568, + "step": 10450 + }, + { + "epoch": 1.7055464926590538, + "grad_norm": 0.09326352924108505, + "learning_rate": 0.0008526916802610114, + "loss": 0.0868, + "num_input_tokens_seen": 22600672, + "step": 10455 + }, + { + "epoch": 1.7063621533442088, + "grad_norm": 0.03809165954589844, + "learning_rate": 0.0008530995106035889, + "loss": 0.2185, + "num_input_tokens_seen": 22611200, + "step": 10460 + }, + { + "epoch": 1.7071778140293636, + "grad_norm": 0.013529691845178604, + "learning_rate": 0.0008535073409461665, + "loss": 0.0671, + "num_input_tokens_seen": 22620608, + "step": 10465 + }, + { + "epoch": 1.707993474714519, + "grad_norm": 0.009886613115668297, + "learning_rate": 0.0008539151712887438, + "loss": 0.1265, + "num_input_tokens_seen": 22631168, + "step": 10470 + }, + { + "epoch": 1.7088091353996737, + "grad_norm": 0.06647709757089615, + "learning_rate": 0.0008543230016313214, + "loss": 0.1613, + "num_input_tokens_seen": 22640640, + "step": 10475 + }, + { + "epoch": 1.7096247960848288, + "grad_norm": 0.035106390714645386, + "learning_rate": 0.0008547308319738989, + "loss": 0.0858, + "num_input_tokens_seen": 22650976, + "step": 10480 + }, + { + "epoch": 1.7104404567699838, + "grad_norm": 0.22978582978248596, + "learning_rate": 0.0008551386623164764, + "loss": 0.3131, + "num_input_tokens_seen": 22661472, + "step": 10485 + }, + { + "epoch": 1.7112561174551386, + "grad_norm": 0.05932708457112312, + "learning_rate": 0.0008555464926590538, + "loss": 0.2018, + "num_input_tokens_seen": 22673024, + "step": 10490 + }, + { + "epoch": 1.7120717781402937, + "grad_norm": 0.04095899686217308, + "learning_rate": 0.0008559543230016313, + "loss": 0.1149, + "num_input_tokens_seen": 22684416, + "step": 10495 + }, + { + "epoch": 1.7128874388254487, + "grad_norm": 0.07021505385637283, + "learning_rate": 0.0008563621533442089, + "loss": 0.0951, + "num_input_tokens_seen": 22695136, + "step": 10500 + }, + { + "epoch": 1.7137030995106035, + "grad_norm": 0.046308018267154694, + "learning_rate": 0.0008567699836867863, + "loss": 0.1735, + "num_input_tokens_seen": 22705408, + "step": 10505 + }, + { + "epoch": 1.7145187601957586, + "grad_norm": 0.18671134114265442, + "learning_rate": 0.0008571778140293638, + "loss": 0.11, + "num_input_tokens_seen": 22716000, + "step": 10510 + }, + { + "epoch": 1.7153344208809136, + "grad_norm": 0.1588408499956131, + "learning_rate": 0.0008575856443719413, + "loss": 0.1847, + "num_input_tokens_seen": 22727936, + "step": 10515 + }, + { + "epoch": 1.7161500815660684, + "grad_norm": 0.1301068663597107, + "learning_rate": 0.0008579934747145188, + "loss": 0.2814, + "num_input_tokens_seen": 22737632, + "step": 10520 + }, + { + "epoch": 1.7169657422512234, + "grad_norm": 0.19701120257377625, + "learning_rate": 0.0008584013050570962, + "loss": 0.1735, + "num_input_tokens_seen": 22748288, + "step": 10525 + }, + { + "epoch": 1.7177814029363785, + "grad_norm": 0.18534375727176666, + "learning_rate": 0.0008588091353996738, + "loss": 0.206, + "num_input_tokens_seen": 22760288, + "step": 10530 + }, + { + "epoch": 1.7185970636215333, + "grad_norm": 0.05705312639474869, + "learning_rate": 0.0008592169657422512, + "loss": 0.1251, + "num_input_tokens_seen": 22772032, + "step": 10535 + }, + { + "epoch": 1.7194127243066886, + "grad_norm": 0.07957900315523148, + "learning_rate": 0.0008596247960848287, + "loss": 0.1957, + "num_input_tokens_seen": 22783392, + "step": 10540 + }, + { + "epoch": 1.7202283849918434, + "grad_norm": 0.019550856202840805, + "learning_rate": 0.0008600326264274062, + "loss": 0.1014, + "num_input_tokens_seen": 22795808, + "step": 10545 + }, + { + "epoch": 1.7210440456769984, + "grad_norm": 0.07609003037214279, + "learning_rate": 0.0008604404567699837, + "loss": 0.2708, + "num_input_tokens_seen": 22806400, + "step": 10550 + }, + { + "epoch": 1.7218597063621535, + "grad_norm": 0.08137402683496475, + "learning_rate": 0.0008608482871125613, + "loss": 0.1305, + "num_input_tokens_seen": 22816256, + "step": 10555 + }, + { + "epoch": 1.7226753670473083, + "grad_norm": 0.03238554298877716, + "learning_rate": 0.0008612561174551386, + "loss": 0.0469, + "num_input_tokens_seen": 22827200, + "step": 10560 + }, + { + "epoch": 1.7234910277324633, + "grad_norm": 0.06092572957277298, + "learning_rate": 0.0008616639477977162, + "loss": 0.0801, + "num_input_tokens_seen": 22837248, + "step": 10565 + }, + { + "epoch": 1.7243066884176184, + "grad_norm": 0.028193719685077667, + "learning_rate": 0.0008620717781402937, + "loss": 0.0746, + "num_input_tokens_seen": 22846048, + "step": 10570 + }, + { + "epoch": 1.7251223491027732, + "grad_norm": 0.2146746814250946, + "learning_rate": 0.0008624796084828711, + "loss": 0.1816, + "num_input_tokens_seen": 22856672, + "step": 10575 + }, + { + "epoch": 1.7259380097879282, + "grad_norm": 0.25911369919776917, + "learning_rate": 0.0008628874388254486, + "loss": 0.1885, + "num_input_tokens_seen": 22868224, + "step": 10580 + }, + { + "epoch": 1.7267536704730833, + "grad_norm": 0.18619155883789062, + "learning_rate": 0.0008632952691680261, + "loss": 0.2791, + "num_input_tokens_seen": 22879584, + "step": 10585 + }, + { + "epoch": 1.727569331158238, + "grad_norm": 0.03678226098418236, + "learning_rate": 0.0008637030995106036, + "loss": 0.0505, + "num_input_tokens_seen": 22889696, + "step": 10590 + }, + { + "epoch": 1.7283849918433931, + "grad_norm": 0.038924794644117355, + "learning_rate": 0.0008641109298531811, + "loss": 0.0877, + "num_input_tokens_seen": 22900224, + "step": 10595 + }, + { + "epoch": 1.7292006525285482, + "grad_norm": 0.10341635346412659, + "learning_rate": 0.0008645187601957585, + "loss": 0.2104, + "num_input_tokens_seen": 22911136, + "step": 10600 + }, + { + "epoch": 1.730016313213703, + "grad_norm": 0.07807479798793793, + "learning_rate": 0.0008649265905383361, + "loss": 0.2673, + "num_input_tokens_seen": 22922400, + "step": 10605 + }, + { + "epoch": 1.7308319738988582, + "grad_norm": 0.09826861321926117, + "learning_rate": 0.0008653344208809135, + "loss": 0.1244, + "num_input_tokens_seen": 22933664, + "step": 10610 + }, + { + "epoch": 1.731647634584013, + "grad_norm": 0.0757172629237175, + "learning_rate": 0.0008657422512234911, + "loss": 0.1333, + "num_input_tokens_seen": 22943104, + "step": 10615 + }, + { + "epoch": 1.7324632952691679, + "grad_norm": 0.09593085199594498, + "learning_rate": 0.0008661500815660686, + "loss": 0.1428, + "num_input_tokens_seen": 22953344, + "step": 10620 + }, + { + "epoch": 1.7332789559543231, + "grad_norm": 0.06658720225095749, + "learning_rate": 0.0008665579119086459, + "loss": 0.1233, + "num_input_tokens_seen": 22964928, + "step": 10625 + }, + { + "epoch": 1.734094616639478, + "grad_norm": 0.030424823984503746, + "learning_rate": 0.0008669657422512235, + "loss": 0.1548, + "num_input_tokens_seen": 22977152, + "step": 10630 + }, + { + "epoch": 1.734910277324633, + "grad_norm": 0.11356142163276672, + "learning_rate": 0.000867373572593801, + "loss": 0.0982, + "num_input_tokens_seen": 22988448, + "step": 10635 + }, + { + "epoch": 1.735725938009788, + "grad_norm": 0.016592618077993393, + "learning_rate": 0.0008677814029363786, + "loss": 0.1046, + "num_input_tokens_seen": 22998592, + "step": 10640 + }, + { + "epoch": 1.7365415986949428, + "grad_norm": 0.06403378397226334, + "learning_rate": 0.0008681892332789559, + "loss": 0.0613, + "num_input_tokens_seen": 23009344, + "step": 10645 + }, + { + "epoch": 1.7373572593800979, + "grad_norm": 0.2613638639450073, + "learning_rate": 0.0008685970636215334, + "loss": 0.2167, + "num_input_tokens_seen": 23019584, + "step": 10650 + }, + { + "epoch": 1.738172920065253, + "grad_norm": 0.08611439913511276, + "learning_rate": 0.000869004893964111, + "loss": 0.1633, + "num_input_tokens_seen": 23030336, + "step": 10655 + }, + { + "epoch": 1.7389885807504077, + "grad_norm": 0.08289582282304764, + "learning_rate": 0.0008694127243066884, + "loss": 0.0929, + "num_input_tokens_seen": 23041760, + "step": 10660 + }, + { + "epoch": 1.7398042414355628, + "grad_norm": 0.18202540278434753, + "learning_rate": 0.000869820554649266, + "loss": 0.1043, + "num_input_tokens_seen": 23051840, + "step": 10665 + }, + { + "epoch": 1.7406199021207178, + "grad_norm": 0.30654197931289673, + "learning_rate": 0.0008702283849918434, + "loss": 0.1155, + "num_input_tokens_seen": 23061632, + "step": 10670 + }, + { + "epoch": 1.7414355628058726, + "grad_norm": 0.14201697707176208, + "learning_rate": 0.0008706362153344209, + "loss": 0.1498, + "num_input_tokens_seen": 23071520, + "step": 10675 + }, + { + "epoch": 1.7422512234910277, + "grad_norm": 0.006509506143629551, + "learning_rate": 0.0008710440456769984, + "loss": 0.2371, + "num_input_tokens_seen": 23082304, + "step": 10680 + }, + { + "epoch": 1.7430668841761827, + "grad_norm": 0.2300765961408615, + "learning_rate": 0.0008714518760195759, + "loss": 0.2189, + "num_input_tokens_seen": 23092416, + "step": 10685 + }, + { + "epoch": 1.7438825448613375, + "grad_norm": 0.19121153652668, + "learning_rate": 0.0008718597063621533, + "loss": 0.165, + "num_input_tokens_seen": 23102720, + "step": 10690 + }, + { + "epoch": 1.7446982055464928, + "grad_norm": 0.11062421649694443, + "learning_rate": 0.0008722675367047308, + "loss": 0.0811, + "num_input_tokens_seen": 23114016, + "step": 10695 + }, + { + "epoch": 1.7455138662316476, + "grad_norm": 0.27690812945365906, + "learning_rate": 0.0008726753670473083, + "loss": 0.104, + "num_input_tokens_seen": 23125888, + "step": 10700 + }, + { + "epoch": 1.7463295269168027, + "grad_norm": 0.10156457871198654, + "learning_rate": 0.0008730831973898859, + "loss": 0.0668, + "num_input_tokens_seen": 23137248, + "step": 10705 + }, + { + "epoch": 1.7471451876019577, + "grad_norm": 0.04147868603467941, + "learning_rate": 0.0008734910277324634, + "loss": 0.1691, + "num_input_tokens_seen": 23148064, + "step": 10710 + }, + { + "epoch": 1.7479608482871125, + "grad_norm": 0.09371112287044525, + "learning_rate": 0.0008738988580750407, + "loss": 0.0318, + "num_input_tokens_seen": 23158208, + "step": 10715 + }, + { + "epoch": 1.7487765089722676, + "grad_norm": 0.06488977372646332, + "learning_rate": 0.0008743066884176183, + "loss": 0.1386, + "num_input_tokens_seen": 23170144, + "step": 10720 + }, + { + "epoch": 1.7495921696574226, + "grad_norm": 0.05057325214147568, + "learning_rate": 0.0008747145187601958, + "loss": 0.0869, + "num_input_tokens_seen": 23180768, + "step": 10725 + }, + { + "epoch": 1.7504078303425774, + "grad_norm": 0.23917360603809357, + "learning_rate": 0.0008751223491027733, + "loss": 0.1797, + "num_input_tokens_seen": 23191840, + "step": 10730 + }, + { + "epoch": 1.7512234910277324, + "grad_norm": 0.0471155010163784, + "learning_rate": 0.0008755301794453507, + "loss": 0.1502, + "num_input_tokens_seen": 23201760, + "step": 10735 + }, + { + "epoch": 1.7520391517128875, + "grad_norm": 0.03939206153154373, + "learning_rate": 0.0008759380097879282, + "loss": 0.071, + "num_input_tokens_seen": 23213024, + "step": 10740 + }, + { + "epoch": 1.7528548123980423, + "grad_norm": 0.019556110724806786, + "learning_rate": 0.0008763458401305058, + "loss": 0.0869, + "num_input_tokens_seen": 23224512, + "step": 10745 + }, + { + "epoch": 1.7536704730831973, + "grad_norm": 0.03442414477467537, + "learning_rate": 0.0008767536704730832, + "loss": 0.0555, + "num_input_tokens_seen": 23235040, + "step": 10750 + }, + { + "epoch": 1.7544861337683524, + "grad_norm": 0.1760786771774292, + "learning_rate": 0.0008771615008156608, + "loss": 0.1837, + "num_input_tokens_seen": 23245056, + "step": 10755 + }, + { + "epoch": 1.7553017944535072, + "grad_norm": 0.04427696019411087, + "learning_rate": 0.0008775693311582382, + "loss": 0.0827, + "num_input_tokens_seen": 23257824, + "step": 10760 + }, + { + "epoch": 1.7561174551386625, + "grad_norm": 0.09889457374811172, + "learning_rate": 0.0008779771615008156, + "loss": 0.0922, + "num_input_tokens_seen": 23269376, + "step": 10765 + }, + { + "epoch": 1.7569331158238173, + "grad_norm": 0.3897489607334137, + "learning_rate": 0.0008783849918433932, + "loss": 0.1227, + "num_input_tokens_seen": 23282560, + "step": 10770 + }, + { + "epoch": 1.7577487765089723, + "grad_norm": 0.13181956112384796, + "learning_rate": 0.0008787928221859707, + "loss": 0.0693, + "num_input_tokens_seen": 23292896, + "step": 10775 + }, + { + "epoch": 1.7585644371941274, + "grad_norm": 0.3234865665435791, + "learning_rate": 0.0008792006525285482, + "loss": 0.0555, + "num_input_tokens_seen": 23303168, + "step": 10780 + }, + { + "epoch": 1.7593800978792822, + "grad_norm": 0.043175894767045975, + "learning_rate": 0.0008796084828711256, + "loss": 0.1585, + "num_input_tokens_seen": 23314784, + "step": 10785 + }, + { + "epoch": 1.7601957585644372, + "grad_norm": 0.3319920301437378, + "learning_rate": 0.0008800163132137031, + "loss": 0.1369, + "num_input_tokens_seen": 23325696, + "step": 10790 + }, + { + "epoch": 1.7610114192495923, + "grad_norm": 0.3128829002380371, + "learning_rate": 0.0008804241435562807, + "loss": 0.326, + "num_input_tokens_seen": 23337120, + "step": 10795 + }, + { + "epoch": 1.761827079934747, + "grad_norm": 0.12764349579811096, + "learning_rate": 0.000880831973898858, + "loss": 0.0762, + "num_input_tokens_seen": 23347584, + "step": 10800 + }, + { + "epoch": 1.7626427406199021, + "grad_norm": 0.2954727113246918, + "learning_rate": 0.0008812398042414356, + "loss": 0.2981, + "num_input_tokens_seen": 23357504, + "step": 10805 + }, + { + "epoch": 1.7634584013050572, + "grad_norm": 0.34298840165138245, + "learning_rate": 0.0008816476345840131, + "loss": 0.1536, + "num_input_tokens_seen": 23367456, + "step": 10810 + }, + { + "epoch": 1.764274061990212, + "grad_norm": 0.040248893201351166, + "learning_rate": 0.0008820554649265906, + "loss": 0.1438, + "num_input_tokens_seen": 23377984, + "step": 10815 + }, + { + "epoch": 1.765089722675367, + "grad_norm": 0.07334783673286438, + "learning_rate": 0.000882463295269168, + "loss": 0.1992, + "num_input_tokens_seen": 23390272, + "step": 10820 + }, + { + "epoch": 1.765905383360522, + "grad_norm": 0.0807512179017067, + "learning_rate": 0.0008828711256117455, + "loss": 0.0548, + "num_input_tokens_seen": 23400416, + "step": 10825 + }, + { + "epoch": 1.7667210440456769, + "grad_norm": 0.0668400228023529, + "learning_rate": 0.000883278955954323, + "loss": 0.0817, + "num_input_tokens_seen": 23412704, + "step": 10830 + }, + { + "epoch": 1.7675367047308321, + "grad_norm": 0.04719111695885658, + "learning_rate": 0.0008836867862969005, + "loss": 0.1618, + "num_input_tokens_seen": 23421952, + "step": 10835 + }, + { + "epoch": 1.768352365415987, + "grad_norm": 0.025539319962263107, + "learning_rate": 0.000884094616639478, + "loss": 0.0666, + "num_input_tokens_seen": 23431136, + "step": 10840 + }, + { + "epoch": 1.7691680261011418, + "grad_norm": 0.03110872395336628, + "learning_rate": 0.0008845024469820555, + "loss": 0.2034, + "num_input_tokens_seen": 23442880, + "step": 10845 + }, + { + "epoch": 1.769983686786297, + "grad_norm": 0.1294354945421219, + "learning_rate": 0.0008849102773246329, + "loss": 0.1134, + "num_input_tokens_seen": 23453120, + "step": 10850 + }, + { + "epoch": 1.7707993474714518, + "grad_norm": 0.03611038252711296, + "learning_rate": 0.0008853181076672104, + "loss": 0.2415, + "num_input_tokens_seen": 23464192, + "step": 10855 + }, + { + "epoch": 1.7716150081566069, + "grad_norm": 0.03079996071755886, + "learning_rate": 0.000885725938009788, + "loss": 0.1517, + "num_input_tokens_seen": 23475264, + "step": 10860 + }, + { + "epoch": 1.772430668841762, + "grad_norm": 0.08433035016059875, + "learning_rate": 0.0008861337683523655, + "loss": 0.0781, + "num_input_tokens_seen": 23485440, + "step": 10865 + }, + { + "epoch": 1.7732463295269167, + "grad_norm": 0.06088045611977577, + "learning_rate": 0.0008865415986949429, + "loss": 0.1107, + "num_input_tokens_seen": 23496768, + "step": 10870 + }, + { + "epoch": 1.7740619902120718, + "grad_norm": 0.012889078818261623, + "learning_rate": 0.0008869494290375204, + "loss": 0.1393, + "num_input_tokens_seen": 23507328, + "step": 10875 + }, + { + "epoch": 1.7748776508972268, + "grad_norm": 0.02911030873656273, + "learning_rate": 0.0008873572593800979, + "loss": 0.0695, + "num_input_tokens_seen": 23518976, + "step": 10880 + }, + { + "epoch": 1.7756933115823816, + "grad_norm": 0.015479132533073425, + "learning_rate": 0.0008877650897226754, + "loss": 0.1162, + "num_input_tokens_seen": 23530720, + "step": 10885 + }, + { + "epoch": 1.7765089722675367, + "grad_norm": 0.0848546028137207, + "learning_rate": 0.0008881729200652528, + "loss": 0.0763, + "num_input_tokens_seen": 23541312, + "step": 10890 + }, + { + "epoch": 1.7773246329526917, + "grad_norm": 0.021081771701574326, + "learning_rate": 0.0008885807504078304, + "loss": 0.0796, + "num_input_tokens_seen": 23552064, + "step": 10895 + }, + { + "epoch": 1.7781402936378465, + "grad_norm": 0.026856867596507072, + "learning_rate": 0.0008889885807504079, + "loss": 0.0871, + "num_input_tokens_seen": 23563104, + "step": 10900 + }, + { + "epoch": 1.7789559543230016, + "grad_norm": 0.3725176155567169, + "learning_rate": 0.0008893964110929853, + "loss": 0.25, + "num_input_tokens_seen": 23572992, + "step": 10905 + }, + { + "epoch": 1.7797716150081566, + "grad_norm": 0.22330375015735626, + "learning_rate": 0.0008898042414355628, + "loss": 0.1289, + "num_input_tokens_seen": 23583328, + "step": 10910 + }, + { + "epoch": 1.7805872756933114, + "grad_norm": 0.030970728024840355, + "learning_rate": 0.0008902120717781403, + "loss": 0.0726, + "num_input_tokens_seen": 23593888, + "step": 10915 + }, + { + "epoch": 1.7814029363784667, + "grad_norm": 0.07348272949457169, + "learning_rate": 0.0008906199021207178, + "loss": 0.1213, + "num_input_tokens_seen": 23604544, + "step": 10920 + }, + { + "epoch": 1.7822185970636215, + "grad_norm": 0.03655874356627464, + "learning_rate": 0.0008910277324632953, + "loss": 0.2103, + "num_input_tokens_seen": 23615584, + "step": 10925 + }, + { + "epoch": 1.7830342577487766, + "grad_norm": 0.007576843723654747, + "learning_rate": 0.0008914355628058728, + "loss": 0.0384, + "num_input_tokens_seen": 23625568, + "step": 10930 + }, + { + "epoch": 1.7838499184339316, + "grad_norm": 0.015224359929561615, + "learning_rate": 0.0008918433931484503, + "loss": 0.0379, + "num_input_tokens_seen": 23636320, + "step": 10935 + }, + { + "epoch": 1.7846655791190864, + "grad_norm": 0.13596542179584503, + "learning_rate": 0.0008922512234910277, + "loss": 0.2034, + "num_input_tokens_seen": 23646912, + "step": 10940 + }, + { + "epoch": 1.7854812398042414, + "grad_norm": 0.03236247971653938, + "learning_rate": 0.0008926590538336053, + "loss": 0.082, + "num_input_tokens_seen": 23657120, + "step": 10945 + }, + { + "epoch": 1.7862969004893965, + "grad_norm": 0.03309471160173416, + "learning_rate": 0.0008930668841761828, + "loss": 0.1908, + "num_input_tokens_seen": 23667712, + "step": 10950 + }, + { + "epoch": 1.7871125611745513, + "grad_norm": 0.04807749763131142, + "learning_rate": 0.0008934747145187601, + "loss": 0.0609, + "num_input_tokens_seen": 23678720, + "step": 10955 + }, + { + "epoch": 1.7879282218597063, + "grad_norm": 0.25209635496139526, + "learning_rate": 0.0008938825448613377, + "loss": 0.0808, + "num_input_tokens_seen": 23688160, + "step": 10960 + }, + { + "epoch": 1.7887438825448614, + "grad_norm": 0.11106795817613602, + "learning_rate": 0.0008942903752039152, + "loss": 0.127, + "num_input_tokens_seen": 23699680, + "step": 10965 + }, + { + "epoch": 1.7895595432300162, + "grad_norm": 0.0785897746682167, + "learning_rate": 0.0008946982055464927, + "loss": 0.1104, + "num_input_tokens_seen": 23710880, + "step": 10970 + }, + { + "epoch": 1.7903752039151712, + "grad_norm": 0.03193995729088783, + "learning_rate": 0.0008951060358890701, + "loss": 0.0885, + "num_input_tokens_seen": 23722304, + "step": 10975 + }, + { + "epoch": 1.7911908646003263, + "grad_norm": 0.3477129638195038, + "learning_rate": 0.0008955138662316476, + "loss": 0.2199, + "num_input_tokens_seen": 23734048, + "step": 10980 + }, + { + "epoch": 1.792006525285481, + "grad_norm": 0.009047658182680607, + "learning_rate": 0.0008959216965742252, + "loss": 0.1155, + "num_input_tokens_seen": 23745600, + "step": 10985 + }, + { + "epoch": 1.7928221859706364, + "grad_norm": 0.02471218630671501, + "learning_rate": 0.0008963295269168026, + "loss": 0.0772, + "num_input_tokens_seen": 23756800, + "step": 10990 + }, + { + "epoch": 1.7936378466557912, + "grad_norm": 0.0520327165722847, + "learning_rate": 0.0008967373572593801, + "loss": 0.2318, + "num_input_tokens_seen": 23766880, + "step": 10995 + }, + { + "epoch": 1.7944535073409462, + "grad_norm": 0.0077630458399653435, + "learning_rate": 0.0008971451876019576, + "loss": 0.0614, + "num_input_tokens_seen": 23777472, + "step": 11000 + }, + { + "epoch": 1.7952691680261013, + "grad_norm": 0.07723210752010345, + "learning_rate": 0.0008975530179445351, + "loss": 0.146, + "num_input_tokens_seen": 23786336, + "step": 11005 + }, + { + "epoch": 1.796084828711256, + "grad_norm": 0.1038888618350029, + "learning_rate": 0.0008979608482871126, + "loss": 0.1772, + "num_input_tokens_seen": 23797184, + "step": 11010 + }, + { + "epoch": 1.7969004893964111, + "grad_norm": 0.056932754814624786, + "learning_rate": 0.0008983686786296901, + "loss": 0.1083, + "num_input_tokens_seen": 23808288, + "step": 11015 + }, + { + "epoch": 1.7977161500815662, + "grad_norm": 0.14512395858764648, + "learning_rate": 0.0008987765089722675, + "loss": 0.2331, + "num_input_tokens_seen": 23818176, + "step": 11020 + }, + { + "epoch": 1.798531810766721, + "grad_norm": 0.11331702768802643, + "learning_rate": 0.000899184339314845, + "loss": 0.1132, + "num_input_tokens_seen": 23829344, + "step": 11025 + }, + { + "epoch": 1.799347471451876, + "grad_norm": 0.032154519110918045, + "learning_rate": 0.0008995921696574225, + "loss": 0.0833, + "num_input_tokens_seen": 23839936, + "step": 11030 + }, + { + "epoch": 1.800163132137031, + "grad_norm": 0.15132077038288116, + "learning_rate": 0.0009000000000000001, + "loss": 0.0816, + "num_input_tokens_seen": 23852736, + "step": 11035 + }, + { + "epoch": 1.8009787928221859, + "grad_norm": 0.049387961626052856, + "learning_rate": 0.0009004078303425776, + "loss": 0.0694, + "num_input_tokens_seen": 23864736, + "step": 11040 + }, + { + "epoch": 1.801794453507341, + "grad_norm": 0.2985081076622009, + "learning_rate": 0.0009008156606851549, + "loss": 0.1508, + "num_input_tokens_seen": 23874432, + "step": 11045 + }, + { + "epoch": 1.802610114192496, + "grad_norm": 0.0945889875292778, + "learning_rate": 0.0009012234910277325, + "loss": 0.0375, + "num_input_tokens_seen": 23885056, + "step": 11050 + }, + { + "epoch": 1.8034257748776508, + "grad_norm": 0.13480478525161743, + "learning_rate": 0.00090163132137031, + "loss": 0.1878, + "num_input_tokens_seen": 23896096, + "step": 11055 + }, + { + "epoch": 1.804241435562806, + "grad_norm": 0.015052284114062786, + "learning_rate": 0.0009020391517128875, + "loss": 0.0886, + "num_input_tokens_seen": 23907200, + "step": 11060 + }, + { + "epoch": 1.8050570962479608, + "grad_norm": 0.06604567915201187, + "learning_rate": 0.0009024469820554649, + "loss": 0.1228, + "num_input_tokens_seen": 23916576, + "step": 11065 + }, + { + "epoch": 1.8058727569331157, + "grad_norm": 0.016169311478734016, + "learning_rate": 0.0009028548123980424, + "loss": 0.0455, + "num_input_tokens_seen": 23926784, + "step": 11070 + }, + { + "epoch": 1.806688417618271, + "grad_norm": 0.059709712862968445, + "learning_rate": 0.0009032626427406199, + "loss": 0.1538, + "num_input_tokens_seen": 23937632, + "step": 11075 + }, + { + "epoch": 1.8075040783034257, + "grad_norm": 0.07011251151561737, + "learning_rate": 0.0009036704730831974, + "loss": 0.0195, + "num_input_tokens_seen": 23949024, + "step": 11080 + }, + { + "epoch": 1.8083197389885808, + "grad_norm": 0.07370084524154663, + "learning_rate": 0.000904078303425775, + "loss": 0.0847, + "num_input_tokens_seen": 23960192, + "step": 11085 + }, + { + "epoch": 1.8091353996737358, + "grad_norm": 0.039578117430210114, + "learning_rate": 0.0009044861337683524, + "loss": 0.1702, + "num_input_tokens_seen": 23970304, + "step": 11090 + }, + { + "epoch": 1.8099510603588906, + "grad_norm": 0.019992655143141747, + "learning_rate": 0.0009048939641109298, + "loss": 0.1181, + "num_input_tokens_seen": 23980544, + "step": 11095 + }, + { + "epoch": 1.8107667210440457, + "grad_norm": 0.3185132145881653, + "learning_rate": 0.0009053017944535074, + "loss": 0.1191, + "num_input_tokens_seen": 23991680, + "step": 11100 + }, + { + "epoch": 1.8115823817292007, + "grad_norm": 0.042107485234737396, + "learning_rate": 0.0009057096247960849, + "loss": 0.2586, + "num_input_tokens_seen": 24000672, + "step": 11105 + }, + { + "epoch": 1.8123980424143555, + "grad_norm": 0.10557562857866287, + "learning_rate": 0.0009061174551386622, + "loss": 0.169, + "num_input_tokens_seen": 24010496, + "step": 11110 + }, + { + "epoch": 1.8132137030995106, + "grad_norm": 0.05226168781518936, + "learning_rate": 0.0009065252854812398, + "loss": 0.1118, + "num_input_tokens_seen": 24021056, + "step": 11115 + }, + { + "epoch": 1.8140293637846656, + "grad_norm": 0.05869543179869652, + "learning_rate": 0.0009069331158238173, + "loss": 0.0729, + "num_input_tokens_seen": 24031136, + "step": 11120 + }, + { + "epoch": 1.8148450244698204, + "grad_norm": 0.05642861872911453, + "learning_rate": 0.0009073409461663949, + "loss": 0.0324, + "num_input_tokens_seen": 24040864, + "step": 11125 + }, + { + "epoch": 1.8156606851549757, + "grad_norm": 0.020973458886146545, + "learning_rate": 0.0009077487765089722, + "loss": 0.1566, + "num_input_tokens_seen": 24051168, + "step": 11130 + }, + { + "epoch": 1.8164763458401305, + "grad_norm": 0.143061101436615, + "learning_rate": 0.0009081566068515497, + "loss": 0.1379, + "num_input_tokens_seen": 24060448, + "step": 11135 + }, + { + "epoch": 1.8172920065252853, + "grad_norm": 0.026111792773008347, + "learning_rate": 0.0009085644371941273, + "loss": 0.1691, + "num_input_tokens_seen": 24070208, + "step": 11140 + }, + { + "epoch": 1.8181076672104406, + "grad_norm": 0.021318498998880386, + "learning_rate": 0.0009089722675367047, + "loss": 0.0647, + "num_input_tokens_seen": 24080000, + "step": 11145 + }, + { + "epoch": 1.8189233278955954, + "grad_norm": 0.0548306442797184, + "learning_rate": 0.0009093800978792823, + "loss": 0.0697, + "num_input_tokens_seen": 24089120, + "step": 11150 + }, + { + "epoch": 1.8197389885807504, + "grad_norm": 0.057875704020261765, + "learning_rate": 0.0009097879282218597, + "loss": 0.0699, + "num_input_tokens_seen": 24099968, + "step": 11155 + }, + { + "epoch": 1.8205546492659055, + "grad_norm": 0.13523219525814056, + "learning_rate": 0.0009101957585644372, + "loss": 0.0695, + "num_input_tokens_seen": 24110560, + "step": 11160 + }, + { + "epoch": 1.8213703099510603, + "grad_norm": 0.02999040111899376, + "learning_rate": 0.0009106035889070147, + "loss": 0.0685, + "num_input_tokens_seen": 24121984, + "step": 11165 + }, + { + "epoch": 1.8221859706362153, + "grad_norm": 0.027061957865953445, + "learning_rate": 0.0009110114192495922, + "loss": 0.1093, + "num_input_tokens_seen": 24132544, + "step": 11170 + }, + { + "epoch": 1.8230016313213704, + "grad_norm": 0.23212389647960663, + "learning_rate": 0.0009114192495921697, + "loss": 0.1339, + "num_input_tokens_seen": 24143808, + "step": 11175 + }, + { + "epoch": 1.8238172920065252, + "grad_norm": 0.06036336347460747, + "learning_rate": 0.0009118270799347471, + "loss": 0.2633, + "num_input_tokens_seen": 24154496, + "step": 11180 + }, + { + "epoch": 1.8246329526916802, + "grad_norm": 0.16399706900119781, + "learning_rate": 0.0009122349102773246, + "loss": 0.1299, + "num_input_tokens_seen": 24165472, + "step": 11185 + }, + { + "epoch": 1.8254486133768353, + "grad_norm": 0.021526144817471504, + "learning_rate": 0.0009126427406199022, + "loss": 0.075, + "num_input_tokens_seen": 24177088, + "step": 11190 + }, + { + "epoch": 1.82626427406199, + "grad_norm": 0.0362713560461998, + "learning_rate": 0.0009130505709624797, + "loss": 0.1654, + "num_input_tokens_seen": 24188768, + "step": 11195 + }, + { + "epoch": 1.8270799347471451, + "grad_norm": 0.035849086940288544, + "learning_rate": 0.0009134584013050571, + "loss": 0.092, + "num_input_tokens_seen": 24199712, + "step": 11200 + }, + { + "epoch": 1.8278955954323002, + "grad_norm": 0.14648644626140594, + "learning_rate": 0.0009138662316476346, + "loss": 0.1627, + "num_input_tokens_seen": 24210912, + "step": 11205 + }, + { + "epoch": 1.828711256117455, + "grad_norm": 0.23167702555656433, + "learning_rate": 0.0009142740619902121, + "loss": 0.2089, + "num_input_tokens_seen": 24221824, + "step": 11210 + }, + { + "epoch": 1.8295269168026103, + "grad_norm": 0.15459120273590088, + "learning_rate": 0.0009146818923327896, + "loss": 0.1228, + "num_input_tokens_seen": 24231168, + "step": 11215 + }, + { + "epoch": 1.830342577487765, + "grad_norm": 0.018223119899630547, + "learning_rate": 0.000915089722675367, + "loss": 0.0812, + "num_input_tokens_seen": 24241408, + "step": 11220 + }, + { + "epoch": 1.8311582381729201, + "grad_norm": 0.10892859846353531, + "learning_rate": 0.0009154975530179446, + "loss": 0.1189, + "num_input_tokens_seen": 24251520, + "step": 11225 + }, + { + "epoch": 1.8319738988580752, + "grad_norm": 0.043409861624240875, + "learning_rate": 0.0009159053833605221, + "loss": 0.1598, + "num_input_tokens_seen": 24262144, + "step": 11230 + }, + { + "epoch": 1.83278955954323, + "grad_norm": 0.08669284731149673, + "learning_rate": 0.0009163132137030995, + "loss": 0.0946, + "num_input_tokens_seen": 24273440, + "step": 11235 + }, + { + "epoch": 1.833605220228385, + "grad_norm": 0.16202498972415924, + "learning_rate": 0.000916721044045677, + "loss": 0.0674, + "num_input_tokens_seen": 24284832, + "step": 11240 + }, + { + "epoch": 1.83442088091354, + "grad_norm": 0.031536467373371124, + "learning_rate": 0.0009171288743882545, + "loss": 0.1639, + "num_input_tokens_seen": 24295488, + "step": 11245 + }, + { + "epoch": 1.8352365415986949, + "grad_norm": 0.02305692434310913, + "learning_rate": 0.0009175367047308319, + "loss": 0.039, + "num_input_tokens_seen": 24306624, + "step": 11250 + }, + { + "epoch": 1.83605220228385, + "grad_norm": 0.025382673367857933, + "learning_rate": 0.0009179445350734095, + "loss": 0.0658, + "num_input_tokens_seen": 24318176, + "step": 11255 + }, + { + "epoch": 1.836867862969005, + "grad_norm": 0.018094176426529884, + "learning_rate": 0.000918352365415987, + "loss": 0.0591, + "num_input_tokens_seen": 24329024, + "step": 11260 + }, + { + "epoch": 1.8376835236541598, + "grad_norm": 0.007994367741048336, + "learning_rate": 0.0009187601957585645, + "loss": 0.0408, + "num_input_tokens_seen": 24339680, + "step": 11265 + }, + { + "epoch": 1.8384991843393148, + "grad_norm": 0.007514690048992634, + "learning_rate": 0.0009191680261011419, + "loss": 0.1953, + "num_input_tokens_seen": 24350368, + "step": 11270 + }, + { + "epoch": 1.8393148450244698, + "grad_norm": 0.06945843994617462, + "learning_rate": 0.0009195758564437194, + "loss": 0.1055, + "num_input_tokens_seen": 24361536, + "step": 11275 + }, + { + "epoch": 1.8401305057096247, + "grad_norm": 0.0076647233217954636, + "learning_rate": 0.000919983686786297, + "loss": 0.2817, + "num_input_tokens_seen": 24372768, + "step": 11280 + }, + { + "epoch": 1.84094616639478, + "grad_norm": 0.2833631932735443, + "learning_rate": 0.0009203915171288743, + "loss": 0.1755, + "num_input_tokens_seen": 24382592, + "step": 11285 + }, + { + "epoch": 1.8417618270799347, + "grad_norm": 0.1453360617160797, + "learning_rate": 0.0009207993474714519, + "loss": 0.1293, + "num_input_tokens_seen": 24393248, + "step": 11290 + }, + { + "epoch": 1.8425774877650896, + "grad_norm": 0.20776934921741486, + "learning_rate": 0.0009212071778140294, + "loss": 0.2186, + "num_input_tokens_seen": 24403232, + "step": 11295 + }, + { + "epoch": 1.8433931484502448, + "grad_norm": 0.057129938155412674, + "learning_rate": 0.0009216150081566068, + "loss": 0.0772, + "num_input_tokens_seen": 24413792, + "step": 11300 + }, + { + "epoch": 1.8442088091353996, + "grad_norm": 0.014874089509248734, + "learning_rate": 0.0009220228384991844, + "loss": 0.0793, + "num_input_tokens_seen": 24423328, + "step": 11305 + }, + { + "epoch": 1.8450244698205547, + "grad_norm": 0.1219726949930191, + "learning_rate": 0.0009224306688417618, + "loss": 0.2631, + "num_input_tokens_seen": 24434464, + "step": 11310 + }, + { + "epoch": 1.8458401305057097, + "grad_norm": 0.024110184982419014, + "learning_rate": 0.0009228384991843394, + "loss": 0.1495, + "num_input_tokens_seen": 24444224, + "step": 11315 + }, + { + "epoch": 1.8466557911908645, + "grad_norm": 0.02561296336352825, + "learning_rate": 0.0009232463295269168, + "loss": 0.0729, + "num_input_tokens_seen": 24454912, + "step": 11320 + }, + { + "epoch": 1.8474714518760196, + "grad_norm": 0.029777340590953827, + "learning_rate": 0.0009236541598694943, + "loss": 0.0571, + "num_input_tokens_seen": 24466304, + "step": 11325 + }, + { + "epoch": 1.8482871125611746, + "grad_norm": 0.11445748060941696, + "learning_rate": 0.0009240619902120718, + "loss": 0.1442, + "num_input_tokens_seen": 24476960, + "step": 11330 + }, + { + "epoch": 1.8491027732463294, + "grad_norm": 0.06909628212451935, + "learning_rate": 0.0009244698205546492, + "loss": 0.1147, + "num_input_tokens_seen": 24486784, + "step": 11335 + }, + { + "epoch": 1.8499184339314845, + "grad_norm": 0.03881843015551567, + "learning_rate": 0.0009248776508972268, + "loss": 0.0753, + "num_input_tokens_seen": 24497120, + "step": 11340 + }, + { + "epoch": 1.8507340946166395, + "grad_norm": 0.08346770703792572, + "learning_rate": 0.0009252854812398043, + "loss": 0.1134, + "num_input_tokens_seen": 24507200, + "step": 11345 + }, + { + "epoch": 1.8515497553017943, + "grad_norm": 0.1436188668012619, + "learning_rate": 0.0009256933115823818, + "loss": 0.2271, + "num_input_tokens_seen": 24517920, + "step": 11350 + }, + { + "epoch": 1.8523654159869496, + "grad_norm": 0.020359840244054794, + "learning_rate": 0.0009261011419249592, + "loss": 0.1281, + "num_input_tokens_seen": 24528640, + "step": 11355 + }, + { + "epoch": 1.8531810766721044, + "grad_norm": 0.04125358164310455, + "learning_rate": 0.0009265089722675367, + "loss": 0.1304, + "num_input_tokens_seen": 24540352, + "step": 11360 + }, + { + "epoch": 1.8539967373572592, + "grad_norm": 0.09571640938520432, + "learning_rate": 0.0009269168026101143, + "loss": 0.0467, + "num_input_tokens_seen": 24551584, + "step": 11365 + }, + { + "epoch": 1.8548123980424145, + "grad_norm": 0.03580986708402634, + "learning_rate": 0.0009273246329526917, + "loss": 0.1086, + "num_input_tokens_seen": 24562400, + "step": 11370 + }, + { + "epoch": 1.8556280587275693, + "grad_norm": 0.07027873396873474, + "learning_rate": 0.0009277324632952691, + "loss": 0.104, + "num_input_tokens_seen": 24572736, + "step": 11375 + }, + { + "epoch": 1.8564437194127243, + "grad_norm": 0.14368842542171478, + "learning_rate": 0.0009281402936378467, + "loss": 0.1561, + "num_input_tokens_seen": 24584416, + "step": 11380 + }, + { + "epoch": 1.8572593800978794, + "grad_norm": 0.2705132067203522, + "learning_rate": 0.0009285481239804242, + "loss": 0.1597, + "num_input_tokens_seen": 24593472, + "step": 11385 + }, + { + "epoch": 1.8580750407830342, + "grad_norm": 0.02602589875459671, + "learning_rate": 0.0009289559543230017, + "loss": 0.2818, + "num_input_tokens_seen": 24604544, + "step": 11390 + }, + { + "epoch": 1.8588907014681892, + "grad_norm": 0.04475562274456024, + "learning_rate": 0.0009293637846655791, + "loss": 0.0958, + "num_input_tokens_seen": 24614784, + "step": 11395 + }, + { + "epoch": 1.8597063621533443, + "grad_norm": 0.10915054380893707, + "learning_rate": 0.0009297716150081566, + "loss": 0.0644, + "num_input_tokens_seen": 24625504, + "step": 11400 + }, + { + "epoch": 1.860522022838499, + "grad_norm": 0.17165768146514893, + "learning_rate": 0.0009301794453507341, + "loss": 0.1655, + "num_input_tokens_seen": 24636192, + "step": 11405 + }, + { + "epoch": 1.8613376835236541, + "grad_norm": 0.42642858624458313, + "learning_rate": 0.0009305872756933116, + "loss": 0.1854, + "num_input_tokens_seen": 24647200, + "step": 11410 + }, + { + "epoch": 1.8621533442088092, + "grad_norm": 0.04805293679237366, + "learning_rate": 0.000930995106035889, + "loss": 0.0599, + "num_input_tokens_seen": 24656640, + "step": 11415 + }, + { + "epoch": 1.862969004893964, + "grad_norm": 0.09276475012302399, + "learning_rate": 0.0009314029363784666, + "loss": 0.0466, + "num_input_tokens_seen": 24665664, + "step": 11420 + }, + { + "epoch": 1.863784665579119, + "grad_norm": 0.03534523397684097, + "learning_rate": 0.000931810766721044, + "loss": 0.1497, + "num_input_tokens_seen": 24676896, + "step": 11425 + }, + { + "epoch": 1.864600326264274, + "grad_norm": 0.01687133125960827, + "learning_rate": 0.0009322185970636216, + "loss": 0.0401, + "num_input_tokens_seen": 24688960, + "step": 11430 + }, + { + "epoch": 1.865415986949429, + "grad_norm": 0.141363725066185, + "learning_rate": 0.0009326264274061991, + "loss": 0.1061, + "num_input_tokens_seen": 24699200, + "step": 11435 + }, + { + "epoch": 1.8662316476345842, + "grad_norm": 0.06316829472780228, + "learning_rate": 0.0009330342577487764, + "loss": 0.1041, + "num_input_tokens_seen": 24709600, + "step": 11440 + }, + { + "epoch": 1.867047308319739, + "grad_norm": 0.1642911285161972, + "learning_rate": 0.000933442088091354, + "loss": 0.1694, + "num_input_tokens_seen": 24719968, + "step": 11445 + }, + { + "epoch": 1.867862969004894, + "grad_norm": 0.02142133004963398, + "learning_rate": 0.0009338499184339315, + "loss": 0.0339, + "num_input_tokens_seen": 24731520, + "step": 11450 + }, + { + "epoch": 1.868678629690049, + "grad_norm": 0.05895593762397766, + "learning_rate": 0.0009342577487765091, + "loss": 0.0339, + "num_input_tokens_seen": 24742880, + "step": 11455 + }, + { + "epoch": 1.8694942903752039, + "grad_norm": 0.002248830161988735, + "learning_rate": 0.0009346655791190864, + "loss": 0.174, + "num_input_tokens_seen": 24753600, + "step": 11460 + }, + { + "epoch": 1.870309951060359, + "grad_norm": 0.010971946641802788, + "learning_rate": 0.0009350734094616639, + "loss": 0.1819, + "num_input_tokens_seen": 24764608, + "step": 11465 + }, + { + "epoch": 1.871125611745514, + "grad_norm": 0.2766876518726349, + "learning_rate": 0.0009354812398042415, + "loss": 0.2227, + "num_input_tokens_seen": 24775872, + "step": 11470 + }, + { + "epoch": 1.8719412724306688, + "grad_norm": 0.22458480298519135, + "learning_rate": 0.0009358890701468189, + "loss": 0.1244, + "num_input_tokens_seen": 24786208, + "step": 11475 + }, + { + "epoch": 1.8727569331158238, + "grad_norm": 0.18716177344322205, + "learning_rate": 0.0009362969004893965, + "loss": 0.2578, + "num_input_tokens_seen": 24795296, + "step": 11480 + }, + { + "epoch": 1.8735725938009788, + "grad_norm": 0.16263943910598755, + "learning_rate": 0.0009367047308319739, + "loss": 0.0866, + "num_input_tokens_seen": 24805632, + "step": 11485 + }, + { + "epoch": 1.8743882544861337, + "grad_norm": 0.14317260682582855, + "learning_rate": 0.0009371125611745514, + "loss": 0.1193, + "num_input_tokens_seen": 24815552, + "step": 11490 + }, + { + "epoch": 1.8752039151712887, + "grad_norm": 0.03302580863237381, + "learning_rate": 0.0009375203915171289, + "loss": 0.1467, + "num_input_tokens_seen": 24827264, + "step": 11495 + }, + { + "epoch": 1.8760195758564437, + "grad_norm": 0.037248365581035614, + "learning_rate": 0.0009379282218597064, + "loss": 0.1079, + "num_input_tokens_seen": 24839168, + "step": 11500 + }, + { + "epoch": 1.8768352365415986, + "grad_norm": 0.0626988336443901, + "learning_rate": 0.000938336052202284, + "loss": 0.1252, + "num_input_tokens_seen": 24850336, + "step": 11505 + }, + { + "epoch": 1.8776508972267538, + "grad_norm": 0.021842312067747116, + "learning_rate": 0.0009387438825448613, + "loss": 0.0813, + "num_input_tokens_seen": 24861984, + "step": 11510 + }, + { + "epoch": 1.8784665579119086, + "grad_norm": 0.0793977752327919, + "learning_rate": 0.0009391517128874388, + "loss": 0.119, + "num_input_tokens_seen": 24872160, + "step": 11515 + }, + { + "epoch": 1.8792822185970635, + "grad_norm": 0.11082105338573456, + "learning_rate": 0.0009395595432300164, + "loss": 0.1457, + "num_input_tokens_seen": 24882272, + "step": 11520 + }, + { + "epoch": 1.8800978792822187, + "grad_norm": 0.1430175006389618, + "learning_rate": 0.0009399673735725939, + "loss": 0.1913, + "num_input_tokens_seen": 24892448, + "step": 11525 + }, + { + "epoch": 1.8809135399673735, + "grad_norm": 0.08023831248283386, + "learning_rate": 0.0009403752039151713, + "loss": 0.0561, + "num_input_tokens_seen": 24902016, + "step": 11530 + }, + { + "epoch": 1.8817292006525286, + "grad_norm": 0.12794429063796997, + "learning_rate": 0.0009407830342577488, + "loss": 0.1204, + "num_input_tokens_seen": 24913344, + "step": 11535 + }, + { + "epoch": 1.8825448613376836, + "grad_norm": 0.05356927216053009, + "learning_rate": 0.0009411908646003263, + "loss": 0.2222, + "num_input_tokens_seen": 24923840, + "step": 11540 + }, + { + "epoch": 1.8833605220228384, + "grad_norm": 0.05542874336242676, + "learning_rate": 0.0009415986949429038, + "loss": 0.1353, + "num_input_tokens_seen": 24933568, + "step": 11545 + }, + { + "epoch": 1.8841761827079935, + "grad_norm": 0.03168076276779175, + "learning_rate": 0.0009420065252854812, + "loss": 0.1021, + "num_input_tokens_seen": 24943520, + "step": 11550 + }, + { + "epoch": 1.8849918433931485, + "grad_norm": 0.19358858466148376, + "learning_rate": 0.0009424143556280587, + "loss": 0.1402, + "num_input_tokens_seen": 24954976, + "step": 11555 + }, + { + "epoch": 1.8858075040783033, + "grad_norm": 0.1855020970106125, + "learning_rate": 0.0009428221859706362, + "loss": 0.0866, + "num_input_tokens_seen": 24964384, + "step": 11560 + }, + { + "epoch": 1.8866231647634584, + "grad_norm": 0.046280428767204285, + "learning_rate": 0.0009432300163132137, + "loss": 0.0773, + "num_input_tokens_seen": 24975584, + "step": 11565 + }, + { + "epoch": 1.8874388254486134, + "grad_norm": 0.2696060538291931, + "learning_rate": 0.0009436378466557913, + "loss": 0.1485, + "num_input_tokens_seen": 24987520, + "step": 11570 + }, + { + "epoch": 1.8882544861337682, + "grad_norm": 0.013958961702883244, + "learning_rate": 0.0009440456769983687, + "loss": 0.0381, + "num_input_tokens_seen": 24999424, + "step": 11575 + }, + { + "epoch": 1.8890701468189235, + "grad_norm": 0.30746808648109436, + "learning_rate": 0.0009444535073409461, + "loss": 0.2816, + "num_input_tokens_seen": 25010624, + "step": 11580 + }, + { + "epoch": 1.8898858075040783, + "grad_norm": 0.28807562589645386, + "learning_rate": 0.0009448613376835237, + "loss": 0.2736, + "num_input_tokens_seen": 25021696, + "step": 11585 + }, + { + "epoch": 1.8907014681892331, + "grad_norm": 0.11786199361085892, + "learning_rate": 0.0009452691680261012, + "loss": 0.0887, + "num_input_tokens_seen": 25032640, + "step": 11590 + }, + { + "epoch": 1.8915171288743884, + "grad_norm": 0.016197798773646355, + "learning_rate": 0.0009456769983686786, + "loss": 0.1775, + "num_input_tokens_seen": 25043744, + "step": 11595 + }, + { + "epoch": 1.8923327895595432, + "grad_norm": 0.13031327724456787, + "learning_rate": 0.0009460848287112561, + "loss": 0.2131, + "num_input_tokens_seen": 25055136, + "step": 11600 + }, + { + "epoch": 1.8931484502446982, + "grad_norm": 0.013974967412650585, + "learning_rate": 0.0009464926590538336, + "loss": 0.1036, + "num_input_tokens_seen": 25066528, + "step": 11605 + }, + { + "epoch": 1.8939641109298533, + "grad_norm": 0.03549167886376381, + "learning_rate": 0.0009469004893964112, + "loss": 0.0973, + "num_input_tokens_seen": 25077280, + "step": 11610 + }, + { + "epoch": 1.894779771615008, + "grad_norm": 0.21018864214420319, + "learning_rate": 0.0009473083197389885, + "loss": 0.1879, + "num_input_tokens_seen": 25088544, + "step": 11615 + }, + { + "epoch": 1.8955954323001631, + "grad_norm": 0.04777180030941963, + "learning_rate": 0.0009477161500815661, + "loss": 0.065, + "num_input_tokens_seen": 25097344, + "step": 11620 + }, + { + "epoch": 1.8964110929853182, + "grad_norm": 0.07198808342218399, + "learning_rate": 0.0009481239804241436, + "loss": 0.1397, + "num_input_tokens_seen": 25108736, + "step": 11625 + }, + { + "epoch": 1.897226753670473, + "grad_norm": 0.058210521936416626, + "learning_rate": 0.000948531810766721, + "loss": 0.1564, + "num_input_tokens_seen": 25119648, + "step": 11630 + }, + { + "epoch": 1.898042414355628, + "grad_norm": 0.042264074087142944, + "learning_rate": 0.0009489396411092986, + "loss": 0.0725, + "num_input_tokens_seen": 25130048, + "step": 11635 + }, + { + "epoch": 1.898858075040783, + "grad_norm": 0.10666552931070328, + "learning_rate": 0.000949347471451876, + "loss": 0.0792, + "num_input_tokens_seen": 25140032, + "step": 11640 + }, + { + "epoch": 1.899673735725938, + "grad_norm": 0.01564154587686062, + "learning_rate": 0.0009497553017944536, + "loss": 0.1096, + "num_input_tokens_seen": 25151136, + "step": 11645 + }, + { + "epoch": 1.900489396411093, + "grad_norm": 0.02543068118393421, + "learning_rate": 0.000950163132137031, + "loss": 0.1011, + "num_input_tokens_seen": 25161408, + "step": 11650 + }, + { + "epoch": 1.901305057096248, + "grad_norm": 0.2721196711063385, + "learning_rate": 0.0009505709624796085, + "loss": 0.1944, + "num_input_tokens_seen": 25172224, + "step": 11655 + }, + { + "epoch": 1.9021207177814028, + "grad_norm": 0.09692230820655823, + "learning_rate": 0.000950978792822186, + "loss": 0.2058, + "num_input_tokens_seen": 25183424, + "step": 11660 + }, + { + "epoch": 1.902936378466558, + "grad_norm": 0.11975245922803879, + "learning_rate": 0.0009513866231647634, + "loss": 0.0831, + "num_input_tokens_seen": 25194944, + "step": 11665 + }, + { + "epoch": 1.9037520391517129, + "grad_norm": 0.07688738405704498, + "learning_rate": 0.000951794453507341, + "loss": 0.081, + "num_input_tokens_seen": 25205728, + "step": 11670 + }, + { + "epoch": 1.904567699836868, + "grad_norm": 0.02023891732096672, + "learning_rate": 0.0009522022838499185, + "loss": 0.1036, + "num_input_tokens_seen": 25216576, + "step": 11675 + }, + { + "epoch": 1.905383360522023, + "grad_norm": 0.010016894899308681, + "learning_rate": 0.000952610114192496, + "loss": 0.018, + "num_input_tokens_seen": 25227616, + "step": 11680 + }, + { + "epoch": 1.9061990212071778, + "grad_norm": 0.0811651423573494, + "learning_rate": 0.0009530179445350734, + "loss": 0.1365, + "num_input_tokens_seen": 25237856, + "step": 11685 + }, + { + "epoch": 1.9070146818923328, + "grad_norm": 0.23438715934753418, + "learning_rate": 0.0009534257748776509, + "loss": 0.2387, + "num_input_tokens_seen": 25248256, + "step": 11690 + }, + { + "epoch": 1.9078303425774878, + "grad_norm": 0.1352328360080719, + "learning_rate": 0.0009538336052202285, + "loss": 0.0673, + "num_input_tokens_seen": 25258944, + "step": 11695 + }, + { + "epoch": 1.9086460032626427, + "grad_norm": 0.024276230484247208, + "learning_rate": 0.0009542414355628059, + "loss": 0.1028, + "num_input_tokens_seen": 25269120, + "step": 11700 + }, + { + "epoch": 1.9094616639477977, + "grad_norm": 0.146909698843956, + "learning_rate": 0.0009546492659053833, + "loss": 0.1185, + "num_input_tokens_seen": 25280160, + "step": 11705 + }, + { + "epoch": 1.9102773246329527, + "grad_norm": 0.1413983553647995, + "learning_rate": 0.0009550570962479609, + "loss": 0.2007, + "num_input_tokens_seen": 25290112, + "step": 11710 + }, + { + "epoch": 1.9110929853181076, + "grad_norm": 0.09862280637025833, + "learning_rate": 0.0009554649265905384, + "loss": 0.1075, + "num_input_tokens_seen": 25301664, + "step": 11715 + }, + { + "epoch": 1.9119086460032626, + "grad_norm": 0.028346039354801178, + "learning_rate": 0.0009558727569331158, + "loss": 0.1446, + "num_input_tokens_seen": 25311264, + "step": 11720 + }, + { + "epoch": 1.9127243066884176, + "grad_norm": 0.06314757466316223, + "learning_rate": 0.0009562805872756934, + "loss": 0.0831, + "num_input_tokens_seen": 25321984, + "step": 11725 + }, + { + "epoch": 1.9135399673735725, + "grad_norm": 0.08205103129148483, + "learning_rate": 0.0009566884176182708, + "loss": 0.1057, + "num_input_tokens_seen": 25333664, + "step": 11730 + }, + { + "epoch": 1.9143556280587277, + "grad_norm": 0.027284633368253708, + "learning_rate": 0.0009570962479608483, + "loss": 0.0995, + "num_input_tokens_seen": 25342944, + "step": 11735 + }, + { + "epoch": 1.9151712887438825, + "grad_norm": 0.1888815313577652, + "learning_rate": 0.0009575040783034258, + "loss": 0.2126, + "num_input_tokens_seen": 25353568, + "step": 11740 + }, + { + "epoch": 1.9159869494290374, + "grad_norm": 0.015189380384981632, + "learning_rate": 0.0009579119086460033, + "loss": 0.0878, + "num_input_tokens_seen": 25364736, + "step": 11745 + }, + { + "epoch": 1.9168026101141926, + "grad_norm": 0.004496204666793346, + "learning_rate": 0.0009583197389885808, + "loss": 0.0476, + "num_input_tokens_seen": 25374528, + "step": 11750 + }, + { + "epoch": 1.9176182707993474, + "grad_norm": 0.04603388532996178, + "learning_rate": 0.0009587275693311582, + "loss": 0.1368, + "num_input_tokens_seen": 25385440, + "step": 11755 + }, + { + "epoch": 1.9184339314845025, + "grad_norm": 0.18411973118782043, + "learning_rate": 0.0009591353996737358, + "loss": 0.2136, + "num_input_tokens_seen": 25397344, + "step": 11760 + }, + { + "epoch": 1.9192495921696575, + "grad_norm": 0.009517129510641098, + "learning_rate": 0.0009595432300163133, + "loss": 0.083, + "num_input_tokens_seen": 25408448, + "step": 11765 + }, + { + "epoch": 1.9200652528548123, + "grad_norm": 0.09775668382644653, + "learning_rate": 0.0009599510603588906, + "loss": 0.1621, + "num_input_tokens_seen": 25418336, + "step": 11770 + }, + { + "epoch": 1.9208809135399674, + "grad_norm": 0.0947251245379448, + "learning_rate": 0.0009603588907014682, + "loss": 0.1877, + "num_input_tokens_seen": 25429824, + "step": 11775 + }, + { + "epoch": 1.9216965742251224, + "grad_norm": 0.044867224991321564, + "learning_rate": 0.0009607667210440457, + "loss": 0.2092, + "num_input_tokens_seen": 25439584, + "step": 11780 + }, + { + "epoch": 1.9225122349102772, + "grad_norm": 0.09496048092842102, + "learning_rate": 0.0009611745513866232, + "loss": 0.1362, + "num_input_tokens_seen": 25451744, + "step": 11785 + }, + { + "epoch": 1.9233278955954323, + "grad_norm": 0.06410021334886551, + "learning_rate": 0.0009615823817292007, + "loss": 0.0422, + "num_input_tokens_seen": 25461856, + "step": 11790 + }, + { + "epoch": 1.9241435562805873, + "grad_norm": 0.2719918191432953, + "learning_rate": 0.0009619902120717781, + "loss": 0.2074, + "num_input_tokens_seen": 25473120, + "step": 11795 + }, + { + "epoch": 1.9249592169657421, + "grad_norm": 0.10191784799098969, + "learning_rate": 0.0009623980424143557, + "loss": 0.0649, + "num_input_tokens_seen": 25485632, + "step": 11800 + }, + { + "epoch": 1.9257748776508974, + "grad_norm": 0.15965360403060913, + "learning_rate": 0.0009628058727569331, + "loss": 0.0806, + "num_input_tokens_seen": 25496160, + "step": 11805 + }, + { + "epoch": 1.9265905383360522, + "grad_norm": 0.19438891112804413, + "learning_rate": 0.0009632137030995107, + "loss": 0.1046, + "num_input_tokens_seen": 25507712, + "step": 11810 + }, + { + "epoch": 1.927406199021207, + "grad_norm": 0.2586320638656616, + "learning_rate": 0.0009636215334420881, + "loss": 0.2197, + "num_input_tokens_seen": 25519328, + "step": 11815 + }, + { + "epoch": 1.9282218597063623, + "grad_norm": 0.03463272750377655, + "learning_rate": 0.0009640293637846655, + "loss": 0.0318, + "num_input_tokens_seen": 25528992, + "step": 11820 + }, + { + "epoch": 1.929037520391517, + "grad_norm": 0.061738938093185425, + "learning_rate": 0.0009644371941272431, + "loss": 0.1206, + "num_input_tokens_seen": 25540032, + "step": 11825 + }, + { + "epoch": 1.9298531810766721, + "grad_norm": 0.2623274028301239, + "learning_rate": 0.0009648450244698206, + "loss": 0.1026, + "num_input_tokens_seen": 25551392, + "step": 11830 + }, + { + "epoch": 1.9306688417618272, + "grad_norm": 0.08953751623630524, + "learning_rate": 0.0009652528548123982, + "loss": 0.0899, + "num_input_tokens_seen": 25561856, + "step": 11835 + }, + { + "epoch": 1.931484502446982, + "grad_norm": 0.08229319006204605, + "learning_rate": 0.0009656606851549755, + "loss": 0.2078, + "num_input_tokens_seen": 25571712, + "step": 11840 + }, + { + "epoch": 1.932300163132137, + "grad_norm": 0.1476791799068451, + "learning_rate": 0.000966068515497553, + "loss": 0.0821, + "num_input_tokens_seen": 25581088, + "step": 11845 + }, + { + "epoch": 1.933115823817292, + "grad_norm": 0.012489629909396172, + "learning_rate": 0.0009664763458401306, + "loss": 0.0216, + "num_input_tokens_seen": 25593376, + "step": 11850 + }, + { + "epoch": 1.933931484502447, + "grad_norm": 0.11344364285469055, + "learning_rate": 0.000966884176182708, + "loss": 0.1529, + "num_input_tokens_seen": 25603488, + "step": 11855 + }, + { + "epoch": 1.934747145187602, + "grad_norm": 0.13952037692070007, + "learning_rate": 0.0009672920065252854, + "loss": 0.1841, + "num_input_tokens_seen": 25613216, + "step": 11860 + }, + { + "epoch": 1.935562805872757, + "grad_norm": 0.14237654209136963, + "learning_rate": 0.000967699836867863, + "loss": 0.0656, + "num_input_tokens_seen": 25624096, + "step": 11865 + }, + { + "epoch": 1.9363784665579118, + "grad_norm": 0.018689442425966263, + "learning_rate": 0.0009681076672104405, + "loss": 0.1611, + "num_input_tokens_seen": 25633760, + "step": 11870 + }, + { + "epoch": 1.9371941272430668, + "grad_norm": 0.13344629108905792, + "learning_rate": 0.000968515497553018, + "loss": 0.1209, + "num_input_tokens_seen": 25645216, + "step": 11875 + }, + { + "epoch": 1.9380097879282219, + "grad_norm": 0.06747011840343475, + "learning_rate": 0.0009689233278955954, + "loss": 0.2747, + "num_input_tokens_seen": 25656288, + "step": 11880 + }, + { + "epoch": 1.9388254486133767, + "grad_norm": 0.05733760818839073, + "learning_rate": 0.0009693311582381729, + "loss": 0.2396, + "num_input_tokens_seen": 25667488, + "step": 11885 + }, + { + "epoch": 1.939641109298532, + "grad_norm": 0.14990225434303284, + "learning_rate": 0.0009697389885807504, + "loss": 0.1336, + "num_input_tokens_seen": 25678848, + "step": 11890 + }, + { + "epoch": 1.9404567699836868, + "grad_norm": 0.045559726655483246, + "learning_rate": 0.0009701468189233279, + "loss": 0.1424, + "num_input_tokens_seen": 25689280, + "step": 11895 + }, + { + "epoch": 1.9412724306688418, + "grad_norm": 0.13860392570495605, + "learning_rate": 0.0009705546492659055, + "loss": 0.1337, + "num_input_tokens_seen": 25699552, + "step": 11900 + }, + { + "epoch": 1.9420880913539968, + "grad_norm": 0.02404346503317356, + "learning_rate": 0.0009709624796084829, + "loss": 0.1727, + "num_input_tokens_seen": 25709920, + "step": 11905 + }, + { + "epoch": 1.9429037520391517, + "grad_norm": 0.21738503873348236, + "learning_rate": 0.0009713703099510603, + "loss": 0.2026, + "num_input_tokens_seen": 25722592, + "step": 11910 + }, + { + "epoch": 1.9437194127243067, + "grad_norm": 0.036390043795108795, + "learning_rate": 0.0009717781402936379, + "loss": 0.032, + "num_input_tokens_seen": 25733696, + "step": 11915 + }, + { + "epoch": 1.9445350734094617, + "grad_norm": 0.07713183015584946, + "learning_rate": 0.0009721859706362154, + "loss": 0.1689, + "num_input_tokens_seen": 25743712, + "step": 11920 + }, + { + "epoch": 1.9453507340946166, + "grad_norm": 0.034438278526067734, + "learning_rate": 0.0009725938009787928, + "loss": 0.1995, + "num_input_tokens_seen": 25753952, + "step": 11925 + }, + { + "epoch": 1.9461663947797716, + "grad_norm": 0.03439588099718094, + "learning_rate": 0.0009730016313213703, + "loss": 0.1411, + "num_input_tokens_seen": 25764896, + "step": 11930 + }, + { + "epoch": 1.9469820554649266, + "grad_norm": 0.08104047924280167, + "learning_rate": 0.0009734094616639478, + "loss": 0.1262, + "num_input_tokens_seen": 25774944, + "step": 11935 + }, + { + "epoch": 1.9477977161500815, + "grad_norm": 0.052425529807806015, + "learning_rate": 0.0009738172920065254, + "loss": 0.123, + "num_input_tokens_seen": 25785472, + "step": 11940 + }, + { + "epoch": 1.9486133768352365, + "grad_norm": 0.0337790921330452, + "learning_rate": 0.0009742251223491027, + "loss": 0.1456, + "num_input_tokens_seen": 25796448, + "step": 11945 + }, + { + "epoch": 1.9494290375203915, + "grad_norm": 0.06778880953788757, + "learning_rate": 0.0009746329526916803, + "loss": 0.1017, + "num_input_tokens_seen": 25807168, + "step": 11950 + }, + { + "epoch": 1.9502446982055464, + "grad_norm": 0.2675227224826813, + "learning_rate": 0.0009750407830342578, + "loss": 0.1215, + "num_input_tokens_seen": 25817792, + "step": 11955 + }, + { + "epoch": 1.9510603588907016, + "grad_norm": 0.06858845800161362, + "learning_rate": 0.0009754486133768352, + "loss": 0.1706, + "num_input_tokens_seen": 25827392, + "step": 11960 + }, + { + "epoch": 1.9518760195758564, + "grad_norm": 0.019781967625021935, + "learning_rate": 0.0009758564437194128, + "loss": 0.0393, + "num_input_tokens_seen": 25838720, + "step": 11965 + }, + { + "epoch": 1.9526916802610113, + "grad_norm": 0.15546073019504547, + "learning_rate": 0.0009762642740619902, + "loss": 0.0764, + "num_input_tokens_seen": 25850496, + "step": 11970 + }, + { + "epoch": 1.9535073409461665, + "grad_norm": 0.050317637622356415, + "learning_rate": 0.0009766721044045677, + "loss": 0.0929, + "num_input_tokens_seen": 25862240, + "step": 11975 + }, + { + "epoch": 1.9543230016313213, + "grad_norm": 0.05223330855369568, + "learning_rate": 0.0009770799347471452, + "loss": 0.1081, + "num_input_tokens_seen": 25873280, + "step": 11980 + }, + { + "epoch": 1.9551386623164764, + "grad_norm": 0.019760608673095703, + "learning_rate": 0.0009774877650897227, + "loss": 0.0687, + "num_input_tokens_seen": 25883872, + "step": 11985 + }, + { + "epoch": 1.9559543230016314, + "grad_norm": 0.020289970561861992, + "learning_rate": 0.0009778955954323001, + "loss": 0.0168, + "num_input_tokens_seen": 25895424, + "step": 11990 + }, + { + "epoch": 1.9567699836867862, + "grad_norm": 0.35939252376556396, + "learning_rate": 0.0009783034257748776, + "loss": 0.3265, + "num_input_tokens_seen": 25906944, + "step": 11995 + }, + { + "epoch": 1.9575856443719413, + "grad_norm": 0.08467115461826324, + "learning_rate": 0.000978711256117455, + "loss": 0.0506, + "num_input_tokens_seen": 25918720, + "step": 12000 + }, + { + "epoch": 1.9584013050570963, + "grad_norm": 0.054660551249980927, + "learning_rate": 0.0009791190864600326, + "loss": 0.028, + "num_input_tokens_seen": 25929248, + "step": 12005 + }, + { + "epoch": 1.9592169657422511, + "grad_norm": 0.10032329708337784, + "learning_rate": 0.00097952691680261, + "loss": 0.2214, + "num_input_tokens_seen": 25939040, + "step": 12010 + }, + { + "epoch": 1.9600326264274062, + "grad_norm": 0.018500017002224922, + "learning_rate": 0.0009799347471451875, + "loss": 0.1818, + "num_input_tokens_seen": 25949024, + "step": 12015 + }, + { + "epoch": 1.9608482871125612, + "grad_norm": 0.015873201191425323, + "learning_rate": 0.0009803425774877652, + "loss": 0.1543, + "num_input_tokens_seen": 25959264, + "step": 12020 + }, + { + "epoch": 1.961663947797716, + "grad_norm": 0.053824182599782944, + "learning_rate": 0.0009807504078303427, + "loss": 0.1094, + "num_input_tokens_seen": 25970432, + "step": 12025 + }, + { + "epoch": 1.9624796084828713, + "grad_norm": 0.17976006865501404, + "learning_rate": 0.00098115823817292, + "loss": 0.1779, + "num_input_tokens_seen": 25981856, + "step": 12030 + }, + { + "epoch": 1.963295269168026, + "grad_norm": 0.05679651349782944, + "learning_rate": 0.0009815660685154977, + "loss": 0.2429, + "num_input_tokens_seen": 25993696, + "step": 12035 + }, + { + "epoch": 1.964110929853181, + "grad_norm": 0.04254353418946266, + "learning_rate": 0.0009819738988580751, + "loss": 0.1423, + "num_input_tokens_seen": 26004736, + "step": 12040 + }, + { + "epoch": 1.9649265905383362, + "grad_norm": 0.02969471551477909, + "learning_rate": 0.0009823817292006526, + "loss": 0.0541, + "num_input_tokens_seen": 26016480, + "step": 12045 + }, + { + "epoch": 1.965742251223491, + "grad_norm": 0.0478089340031147, + "learning_rate": 0.00098278955954323, + "loss": 0.0313, + "num_input_tokens_seen": 26027072, + "step": 12050 + }, + { + "epoch": 1.966557911908646, + "grad_norm": 0.2654189169406891, + "learning_rate": 0.0009831973898858076, + "loss": 0.1784, + "num_input_tokens_seen": 26036096, + "step": 12055 + }, + { + "epoch": 1.967373572593801, + "grad_norm": 0.04217769578099251, + "learning_rate": 0.000983605220228385, + "loss": 0.1182, + "num_input_tokens_seen": 26047264, + "step": 12060 + }, + { + "epoch": 1.968189233278956, + "grad_norm": 0.06786169111728668, + "learning_rate": 0.0009840130505709625, + "loss": 0.0983, + "num_input_tokens_seen": 26058176, + "step": 12065 + }, + { + "epoch": 1.969004893964111, + "grad_norm": 0.03131534904241562, + "learning_rate": 0.00098442088091354, + "loss": 0.0572, + "num_input_tokens_seen": 26069664, + "step": 12070 + }, + { + "epoch": 1.969820554649266, + "grad_norm": 0.014411543495953083, + "learning_rate": 0.0009848287112561175, + "loss": 0.0639, + "num_input_tokens_seen": 26080640, + "step": 12075 + }, + { + "epoch": 1.9706362153344208, + "grad_norm": 0.08846641331911087, + "learning_rate": 0.000985236541598695, + "loss": 0.0418, + "num_input_tokens_seen": 26091360, + "step": 12080 + }, + { + "epoch": 1.9714518760195758, + "grad_norm": 0.035760972648859024, + "learning_rate": 0.0009856443719412724, + "loss": 0.0666, + "num_input_tokens_seen": 26102080, + "step": 12085 + }, + { + "epoch": 1.9722675367047309, + "grad_norm": 0.24266232550144196, + "learning_rate": 0.00098605220228385, + "loss": 0.1866, + "num_input_tokens_seen": 26112192, + "step": 12090 + }, + { + "epoch": 1.9730831973898857, + "grad_norm": 0.34169813990592957, + "learning_rate": 0.0009864600326264274, + "loss": 0.2238, + "num_input_tokens_seen": 26122496, + "step": 12095 + }, + { + "epoch": 1.9738988580750407, + "grad_norm": 0.12048333883285522, + "learning_rate": 0.0009868678629690048, + "loss": 0.0643, + "num_input_tokens_seen": 26134048, + "step": 12100 + }, + { + "epoch": 1.9747145187601958, + "grad_norm": 0.0553930439054966, + "learning_rate": 0.0009872756933115823, + "loss": 0.2014, + "num_input_tokens_seen": 26144192, + "step": 12105 + }, + { + "epoch": 1.9755301794453506, + "grad_norm": 0.023241301998496056, + "learning_rate": 0.00098768352365416, + "loss": 0.1545, + "num_input_tokens_seen": 26155328, + "step": 12110 + }, + { + "epoch": 1.9763458401305058, + "grad_norm": 0.01942472904920578, + "learning_rate": 0.0009880913539967373, + "loss": 0.1344, + "num_input_tokens_seen": 26166272, + "step": 12115 + }, + { + "epoch": 1.9771615008156607, + "grad_norm": 0.11509157717227936, + "learning_rate": 0.0009884991843393148, + "loss": 0.2316, + "num_input_tokens_seen": 26175520, + "step": 12120 + }, + { + "epoch": 1.9779771615008157, + "grad_norm": 0.21557745337486267, + "learning_rate": 0.0009889070146818924, + "loss": 0.3436, + "num_input_tokens_seen": 26187008, + "step": 12125 + }, + { + "epoch": 1.9787928221859707, + "grad_norm": 0.11575018614530563, + "learning_rate": 0.00098931484502447, + "loss": 0.1403, + "num_input_tokens_seen": 26198016, + "step": 12130 + }, + { + "epoch": 1.9796084828711256, + "grad_norm": 0.04281505569815636, + "learning_rate": 0.0009897226753670474, + "loss": 0.2141, + "num_input_tokens_seen": 26209440, + "step": 12135 + }, + { + "epoch": 1.9804241435562806, + "grad_norm": 0.04773539677262306, + "learning_rate": 0.0009901305057096249, + "loss": 0.1016, + "num_input_tokens_seen": 26220736, + "step": 12140 + }, + { + "epoch": 1.9812398042414356, + "grad_norm": 0.10108579695224762, + "learning_rate": 0.0009905383360522024, + "loss": 0.0694, + "num_input_tokens_seen": 26231456, + "step": 12145 + }, + { + "epoch": 1.9820554649265905, + "grad_norm": 0.03429146856069565, + "learning_rate": 0.0009909461663947798, + "loss": 0.0899, + "num_input_tokens_seen": 26241568, + "step": 12150 + }, + { + "epoch": 1.9828711256117455, + "grad_norm": 0.07473118603229523, + "learning_rate": 0.0009913539967373573, + "loss": 0.1422, + "num_input_tokens_seen": 26252480, + "step": 12155 + }, + { + "epoch": 1.9836867862969005, + "grad_norm": 0.026282135397195816, + "learning_rate": 0.0009917618270799348, + "loss": 0.0556, + "num_input_tokens_seen": 26263264, + "step": 12160 + }, + { + "epoch": 1.9845024469820554, + "grad_norm": 0.13597652316093445, + "learning_rate": 0.0009921696574225123, + "loss": 0.1316, + "num_input_tokens_seen": 26274496, + "step": 12165 + }, + { + "epoch": 1.9853181076672104, + "grad_norm": 0.06429655849933624, + "learning_rate": 0.0009925774877650897, + "loss": 0.1186, + "num_input_tokens_seen": 26285152, + "step": 12170 + }, + { + "epoch": 1.9861337683523654, + "grad_norm": 0.144106924533844, + "learning_rate": 0.0009929853181076672, + "loss": 0.23, + "num_input_tokens_seen": 26296064, + "step": 12175 + }, + { + "epoch": 1.9869494290375203, + "grad_norm": 0.15405425429344177, + "learning_rate": 0.0009933931484502447, + "loss": 0.1472, + "num_input_tokens_seen": 26305984, + "step": 12180 + }, + { + "epoch": 1.9877650897226755, + "grad_norm": 0.08156856149435043, + "learning_rate": 0.0009938009787928222, + "loss": 0.1344, + "num_input_tokens_seen": 26316896, + "step": 12185 + }, + { + "epoch": 1.9885807504078303, + "grad_norm": 0.034149665385484695, + "learning_rate": 0.0009942088091353996, + "loss": 0.0996, + "num_input_tokens_seen": 26326976, + "step": 12190 + }, + { + "epoch": 1.9893964110929854, + "grad_norm": 0.08690574020147324, + "learning_rate": 0.0009946166394779771, + "loss": 0.168, + "num_input_tokens_seen": 26337504, + "step": 12195 + }, + { + "epoch": 1.9902120717781404, + "grad_norm": 0.04139183089137077, + "learning_rate": 0.0009950244698205548, + "loss": 0.0588, + "num_input_tokens_seen": 26348608, + "step": 12200 + }, + { + "epoch": 1.9910277324632952, + "grad_norm": 0.26938915252685547, + "learning_rate": 0.000995432300163132, + "loss": 0.1612, + "num_input_tokens_seen": 26358240, + "step": 12205 + }, + { + "epoch": 1.9918433931484503, + "grad_norm": 0.20158658921718597, + "learning_rate": 0.0009958401305057095, + "loss": 0.0712, + "num_input_tokens_seen": 26369568, + "step": 12210 + }, + { + "epoch": 1.9926590538336053, + "grad_norm": 0.05390627309679985, + "learning_rate": 0.0009962479608482872, + "loss": 0.1147, + "num_input_tokens_seen": 26379936, + "step": 12215 + }, + { + "epoch": 1.9934747145187601, + "grad_norm": 0.06181644648313522, + "learning_rate": 0.0009966557911908645, + "loss": 0.1475, + "num_input_tokens_seen": 26390624, + "step": 12220 + }, + { + "epoch": 1.9942903752039152, + "grad_norm": 0.25694912672042847, + "learning_rate": 0.0009970636215334422, + "loss": 0.1115, + "num_input_tokens_seen": 26399616, + "step": 12225 + }, + { + "epoch": 1.9951060358890702, + "grad_norm": 0.05160185694694519, + "learning_rate": 0.0009974714518760197, + "loss": 0.0934, + "num_input_tokens_seen": 26410208, + "step": 12230 + }, + { + "epoch": 1.995921696574225, + "grad_norm": 0.06562802940607071, + "learning_rate": 0.0009978792822185971, + "loss": 0.1132, + "num_input_tokens_seen": 26420160, + "step": 12235 + }, + { + "epoch": 1.99673735725938, + "grad_norm": 0.03730420768260956, + "learning_rate": 0.0009982871125611746, + "loss": 0.0677, + "num_input_tokens_seen": 26430432, + "step": 12240 + }, + { + "epoch": 1.997553017944535, + "grad_norm": 0.04397095739841461, + "learning_rate": 0.000998694942903752, + "loss": 0.0814, + "num_input_tokens_seen": 26440128, + "step": 12245 + }, + { + "epoch": 1.99836867862969, + "grad_norm": 0.22800733149051666, + "learning_rate": 0.0009991027732463296, + "loss": 0.1041, + "num_input_tokens_seen": 26450496, + "step": 12250 + }, + { + "epoch": 1.9991843393148452, + "grad_norm": 0.21904075145721436, + "learning_rate": 0.000999510603588907, + "loss": 0.1414, + "num_input_tokens_seen": 26461824, + "step": 12255 + }, + { + "epoch": 2.0, + "grad_norm": 0.05233241617679596, + "learning_rate": 0.0009999184339314845, + "loss": 0.0776, + "num_input_tokens_seen": 26471216, + "step": 12260 + }, + { + "epoch": 2.0, + "eval_loss": 0.13062207400798798, + "eval_runtime": 103.6599, + "eval_samples_per_second": 26.288, + "eval_steps_per_second": 6.579, + "num_input_tokens_seen": 26471216, + "step": 12260 + }, + { + "epoch": 2.000815660685155, + "grad_norm": 0.20280566811561584, + "learning_rate": 0.000999999996757397, + "loss": 0.1198, + "num_input_tokens_seen": 26481904, + "step": 12265 + }, + { + "epoch": 2.00163132137031, + "grad_norm": 0.2520328760147095, + "learning_rate": 0.0009999999835843226, + "loss": 0.1793, + "num_input_tokens_seen": 26491344, + "step": 12270 + }, + { + "epoch": 2.002446982055465, + "grad_norm": 0.02056376077234745, + "learning_rate": 0.000999999960278114, + "loss": 0.0616, + "num_input_tokens_seen": 26502160, + "step": 12275 + }, + { + "epoch": 2.0032626427406197, + "grad_norm": 0.23525598645210266, + "learning_rate": 0.000999999926838772, + "loss": 0.0916, + "num_input_tokens_seen": 26512816, + "step": 12280 + }, + { + "epoch": 2.004078303425775, + "grad_norm": 0.007705080322921276, + "learning_rate": 0.0009999998832662972, + "loss": 0.0314, + "num_input_tokens_seen": 26522640, + "step": 12285 + }, + { + "epoch": 2.00489396411093, + "grad_norm": 0.01065191999077797, + "learning_rate": 0.0009999998295606907, + "loss": 0.2861, + "num_input_tokens_seen": 26533072, + "step": 12290 + }, + { + "epoch": 2.0057096247960846, + "grad_norm": 0.0217636376619339, + "learning_rate": 0.000999999765721953, + "loss": 0.0936, + "num_input_tokens_seen": 26543408, + "step": 12295 + }, + { + "epoch": 2.00652528548124, + "grad_norm": 0.03687581419944763, + "learning_rate": 0.000999999691750086, + "loss": 0.223, + "num_input_tokens_seen": 26554320, + "step": 12300 + }, + { + "epoch": 2.0073409461663947, + "grad_norm": 0.049486830830574036, + "learning_rate": 0.0009999996076450908, + "loss": 0.1541, + "num_input_tokens_seen": 26565008, + "step": 12305 + }, + { + "epoch": 2.00815660685155, + "grad_norm": 0.024196365848183632, + "learning_rate": 0.0009999995134069692, + "loss": 0.0537, + "num_input_tokens_seen": 26576912, + "step": 12310 + }, + { + "epoch": 2.0089722675367048, + "grad_norm": 0.09822910279035568, + "learning_rate": 0.0009999994090357234, + "loss": 0.3343, + "num_input_tokens_seen": 26586800, + "step": 12315 + }, + { + "epoch": 2.0097879282218596, + "grad_norm": 0.0848051980137825, + "learning_rate": 0.0009999992945313551, + "loss": 0.1014, + "num_input_tokens_seen": 26598160, + "step": 12320 + }, + { + "epoch": 2.010603588907015, + "grad_norm": 0.09586621820926666, + "learning_rate": 0.0009999991698938669, + "loss": 0.1177, + "num_input_tokens_seen": 26609136, + "step": 12325 + }, + { + "epoch": 2.0114192495921697, + "grad_norm": 0.023700157180428505, + "learning_rate": 0.000999999035123261, + "loss": 0.0686, + "num_input_tokens_seen": 26620752, + "step": 12330 + }, + { + "epoch": 2.0122349102773245, + "grad_norm": 0.303498238325119, + "learning_rate": 0.0009999988902195407, + "loss": 0.2223, + "num_input_tokens_seen": 26630256, + "step": 12335 + }, + { + "epoch": 2.0130505709624797, + "grad_norm": 0.25997933745384216, + "learning_rate": 0.0009999987351827085, + "loss": 0.2448, + "num_input_tokens_seen": 26641136, + "step": 12340 + }, + { + "epoch": 2.0138662316476346, + "grad_norm": 0.07568349689245224, + "learning_rate": 0.0009999985700127674, + "loss": 0.0864, + "num_input_tokens_seen": 26651824, + "step": 12345 + }, + { + "epoch": 2.0146818923327894, + "grad_norm": 0.16815774142742157, + "learning_rate": 0.0009999983947097213, + "loss": 0.2509, + "num_input_tokens_seen": 26662064, + "step": 12350 + }, + { + "epoch": 2.0154975530179446, + "grad_norm": 0.08486269414424896, + "learning_rate": 0.0009999982092735733, + "loss": 0.1091, + "num_input_tokens_seen": 26673104, + "step": 12355 + }, + { + "epoch": 2.0163132137030995, + "grad_norm": 0.1370411068201065, + "learning_rate": 0.0009999980137043274, + "loss": 0.1843, + "num_input_tokens_seen": 26682736, + "step": 12360 + }, + { + "epoch": 2.0171288743882543, + "grad_norm": 0.01647280715405941, + "learning_rate": 0.0009999978080019872, + "loss": 0.1562, + "num_input_tokens_seen": 26694224, + "step": 12365 + }, + { + "epoch": 2.0179445350734095, + "grad_norm": 0.07305126637220383, + "learning_rate": 0.0009999975921665574, + "loss": 0.1131, + "num_input_tokens_seen": 26704880, + "step": 12370 + }, + { + "epoch": 2.0187601957585644, + "grad_norm": 0.21917147934436798, + "learning_rate": 0.000999997366198042, + "loss": 0.134, + "num_input_tokens_seen": 26713648, + "step": 12375 + }, + { + "epoch": 2.0195758564437196, + "grad_norm": 0.01418970339000225, + "learning_rate": 0.0009999971300964456, + "loss": 0.0612, + "num_input_tokens_seen": 26724560, + "step": 12380 + }, + { + "epoch": 2.0203915171288744, + "grad_norm": 0.3252423405647278, + "learning_rate": 0.0009999968838617732, + "loss": 0.1394, + "num_input_tokens_seen": 26735344, + "step": 12385 + }, + { + "epoch": 2.0212071778140293, + "grad_norm": 0.22255448997020721, + "learning_rate": 0.0009999966274940296, + "loss": 0.1269, + "num_input_tokens_seen": 26746288, + "step": 12390 + }, + { + "epoch": 2.0220228384991845, + "grad_norm": 0.2203729748725891, + "learning_rate": 0.00099999636099322, + "loss": 0.1444, + "num_input_tokens_seen": 26757936, + "step": 12395 + }, + { + "epoch": 2.0228384991843393, + "grad_norm": 0.0524790957570076, + "learning_rate": 0.0009999960843593498, + "loss": 0.0958, + "num_input_tokens_seen": 26770256, + "step": 12400 + }, + { + "epoch": 2.023654159869494, + "grad_norm": 0.22992610931396484, + "learning_rate": 0.0009999957975924249, + "loss": 0.1361, + "num_input_tokens_seen": 26780240, + "step": 12405 + }, + { + "epoch": 2.0244698205546494, + "grad_norm": 0.014083504676818848, + "learning_rate": 0.0009999955006924507, + "loss": 0.108, + "num_input_tokens_seen": 26791824, + "step": 12410 + }, + { + "epoch": 2.0252854812398042, + "grad_norm": 0.020348355174064636, + "learning_rate": 0.0009999951936594334, + "loss": 0.1582, + "num_input_tokens_seen": 26802576, + "step": 12415 + }, + { + "epoch": 2.026101141924959, + "grad_norm": 0.03638402372598648, + "learning_rate": 0.0009999948764933793, + "loss": 0.0886, + "num_input_tokens_seen": 26814256, + "step": 12420 + }, + { + "epoch": 2.0269168026101143, + "grad_norm": 0.1523914486169815, + "learning_rate": 0.0009999945491942946, + "loss": 0.3271, + "num_input_tokens_seen": 26824528, + "step": 12425 + }, + { + "epoch": 2.027732463295269, + "grad_norm": 0.0656893253326416, + "learning_rate": 0.0009999942117621863, + "loss": 0.0548, + "num_input_tokens_seen": 26836816, + "step": 12430 + }, + { + "epoch": 2.028548123980424, + "grad_norm": 0.06798010319471359, + "learning_rate": 0.0009999938641970607, + "loss": 0.1255, + "num_input_tokens_seen": 26846800, + "step": 12435 + }, + { + "epoch": 2.029363784665579, + "grad_norm": 0.041597068309783936, + "learning_rate": 0.0009999935064989255, + "loss": 0.1114, + "num_input_tokens_seen": 26858288, + "step": 12440 + }, + { + "epoch": 2.030179445350734, + "grad_norm": 0.07969851791858673, + "learning_rate": 0.0009999931386677873, + "loss": 0.0694, + "num_input_tokens_seen": 26868624, + "step": 12445 + }, + { + "epoch": 2.0309951060358893, + "grad_norm": 0.020414838567376137, + "learning_rate": 0.000999992760703654, + "loss": 0.0782, + "num_input_tokens_seen": 26879536, + "step": 12450 + }, + { + "epoch": 2.031810766721044, + "grad_norm": 0.1460573375225067, + "learning_rate": 0.000999992372606533, + "loss": 0.3533, + "num_input_tokens_seen": 26890320, + "step": 12455 + }, + { + "epoch": 2.032626427406199, + "grad_norm": 0.03602667525410652, + "learning_rate": 0.0009999919743764324, + "loss": 0.1561, + "num_input_tokens_seen": 26900784, + "step": 12460 + }, + { + "epoch": 2.033442088091354, + "grad_norm": 0.022558797150850296, + "learning_rate": 0.00099999156601336, + "loss": 0.0954, + "num_input_tokens_seen": 26912944, + "step": 12465 + }, + { + "epoch": 2.034257748776509, + "grad_norm": 0.01439907867461443, + "learning_rate": 0.0009999911475173245, + "loss": 0.1414, + "num_input_tokens_seen": 26923792, + "step": 12470 + }, + { + "epoch": 2.035073409461664, + "grad_norm": 0.04700160771608353, + "learning_rate": 0.000999990718888334, + "loss": 0.0664, + "num_input_tokens_seen": 26934736, + "step": 12475 + }, + { + "epoch": 2.035889070146819, + "grad_norm": 0.01069821696728468, + "learning_rate": 0.0009999902801263974, + "loss": 0.1014, + "num_input_tokens_seen": 26945840, + "step": 12480 + }, + { + "epoch": 2.036704730831974, + "grad_norm": 0.020104454830288887, + "learning_rate": 0.0009999898312315232, + "loss": 0.0774, + "num_input_tokens_seen": 26956304, + "step": 12485 + }, + { + "epoch": 2.0375203915171287, + "grad_norm": 0.03403728827834129, + "learning_rate": 0.000999989372203721, + "loss": 0.0476, + "num_input_tokens_seen": 26967632, + "step": 12490 + }, + { + "epoch": 2.038336052202284, + "grad_norm": 0.07206019759178162, + "learning_rate": 0.0009999889030429998, + "loss": 0.0756, + "num_input_tokens_seen": 26978160, + "step": 12495 + }, + { + "epoch": 2.039151712887439, + "grad_norm": 0.09049206227064133, + "learning_rate": 0.0009999884237493692, + "loss": 0.23, + "num_input_tokens_seen": 26987568, + "step": 12500 + }, + { + "epoch": 2.0399673735725936, + "grad_norm": 0.06570205837488174, + "learning_rate": 0.000999987934322839, + "loss": 0.1467, + "num_input_tokens_seen": 26999760, + "step": 12505 + }, + { + "epoch": 2.040783034257749, + "grad_norm": 0.11072354018688202, + "learning_rate": 0.000999987434763419, + "loss": 0.0411, + "num_input_tokens_seen": 27011696, + "step": 12510 + }, + { + "epoch": 2.0415986949429037, + "grad_norm": 0.07574472576379776, + "learning_rate": 0.0009999869250711193, + "loss": 0.1063, + "num_input_tokens_seen": 27022608, + "step": 12515 + }, + { + "epoch": 2.0424143556280585, + "grad_norm": 0.5725246071815491, + "learning_rate": 0.0009999864052459503, + "loss": 0.1364, + "num_input_tokens_seen": 27033680, + "step": 12520 + }, + { + "epoch": 2.0432300163132138, + "grad_norm": 0.32237333059310913, + "learning_rate": 0.0009999858752879228, + "loss": 0.1521, + "num_input_tokens_seen": 27045136, + "step": 12525 + }, + { + "epoch": 2.0440456769983686, + "grad_norm": 0.03749745339155197, + "learning_rate": 0.0009999853351970469, + "loss": 0.1176, + "num_input_tokens_seen": 27055760, + "step": 12530 + }, + { + "epoch": 2.044861337683524, + "grad_norm": 0.017230404540896416, + "learning_rate": 0.000999984784973334, + "loss": 0.0596, + "num_input_tokens_seen": 27066352, + "step": 12535 + }, + { + "epoch": 2.0456769983686787, + "grad_norm": 0.07658790051937103, + "learning_rate": 0.0009999842246167952, + "loss": 0.1469, + "num_input_tokens_seen": 27077872, + "step": 12540 + }, + { + "epoch": 2.0464926590538335, + "grad_norm": 0.25818324089050293, + "learning_rate": 0.0009999836541274417, + "loss": 0.1251, + "num_input_tokens_seen": 27088912, + "step": 12545 + }, + { + "epoch": 2.0473083197389887, + "grad_norm": 0.021115077659487724, + "learning_rate": 0.0009999830735052853, + "loss": 0.0248, + "num_input_tokens_seen": 27099216, + "step": 12550 + }, + { + "epoch": 2.0481239804241436, + "grad_norm": 0.05359815061092377, + "learning_rate": 0.0009999824827503377, + "loss": 0.2465, + "num_input_tokens_seen": 27109712, + "step": 12555 + }, + { + "epoch": 2.0489396411092984, + "grad_norm": 0.035242266952991486, + "learning_rate": 0.0009999818818626105, + "loss": 0.0416, + "num_input_tokens_seen": 27120304, + "step": 12560 + }, + { + "epoch": 2.0497553017944536, + "grad_norm": 0.005031228065490723, + "learning_rate": 0.0009999812708421166, + "loss": 0.1355, + "num_input_tokens_seen": 27130992, + "step": 12565 + }, + { + "epoch": 2.0505709624796085, + "grad_norm": 0.1788245290517807, + "learning_rate": 0.0009999806496888677, + "loss": 0.1206, + "num_input_tokens_seen": 27141136, + "step": 12570 + }, + { + "epoch": 2.0513866231647633, + "grad_norm": 0.037924475967884064, + "learning_rate": 0.0009999800184028766, + "loss": 0.1023, + "num_input_tokens_seen": 27152208, + "step": 12575 + }, + { + "epoch": 2.0522022838499185, + "grad_norm": 0.01139590423554182, + "learning_rate": 0.0009999793769841564, + "loss": 0.1394, + "num_input_tokens_seen": 27164016, + "step": 12580 + }, + { + "epoch": 2.0530179445350734, + "grad_norm": 0.05709296837449074, + "learning_rate": 0.0009999787254327196, + "loss": 0.1523, + "num_input_tokens_seen": 27174800, + "step": 12585 + }, + { + "epoch": 2.053833605220228, + "grad_norm": 0.02837609313428402, + "learning_rate": 0.00099997806374858, + "loss": 0.0887, + "num_input_tokens_seen": 27184784, + "step": 12590 + }, + { + "epoch": 2.0546492659053834, + "grad_norm": 0.02988213486969471, + "learning_rate": 0.0009999773919317505, + "loss": 0.0835, + "num_input_tokens_seen": 27195600, + "step": 12595 + }, + { + "epoch": 2.0554649265905383, + "grad_norm": 0.023099282756447792, + "learning_rate": 0.000999976709982245, + "loss": 0.1611, + "num_input_tokens_seen": 27206416, + "step": 12600 + }, + { + "epoch": 2.0562805872756935, + "grad_norm": 0.08739599585533142, + "learning_rate": 0.000999976017900077, + "loss": 0.1105, + "num_input_tokens_seen": 27217360, + "step": 12605 + }, + { + "epoch": 2.0570962479608483, + "grad_norm": 0.1226113885641098, + "learning_rate": 0.0009999753156852609, + "loss": 0.2292, + "num_input_tokens_seen": 27228304, + "step": 12610 + }, + { + "epoch": 2.057911908646003, + "grad_norm": 0.07845673710107803, + "learning_rate": 0.0009999746033378105, + "loss": 0.1635, + "num_input_tokens_seen": 27240208, + "step": 12615 + }, + { + "epoch": 2.0587275693311584, + "grad_norm": 0.0835578665137291, + "learning_rate": 0.0009999738808577408, + "loss": 0.0588, + "num_input_tokens_seen": 27250256, + "step": 12620 + }, + { + "epoch": 2.0595432300163132, + "grad_norm": 0.2026551067829132, + "learning_rate": 0.000999973148245066, + "loss": 0.1708, + "num_input_tokens_seen": 27260080, + "step": 12625 + }, + { + "epoch": 2.060358890701468, + "grad_norm": 0.07049740850925446, + "learning_rate": 0.000999972405499801, + "loss": 0.0582, + "num_input_tokens_seen": 27272112, + "step": 12630 + }, + { + "epoch": 2.0611745513866233, + "grad_norm": 0.03523268550634384, + "learning_rate": 0.0009999716526219611, + "loss": 0.108, + "num_input_tokens_seen": 27282672, + "step": 12635 + }, + { + "epoch": 2.061990212071778, + "grad_norm": 0.07659529894590378, + "learning_rate": 0.0009999708896115613, + "loss": 0.0637, + "num_input_tokens_seen": 27294704, + "step": 12640 + }, + { + "epoch": 2.062805872756933, + "grad_norm": 0.03385327756404877, + "learning_rate": 0.0009999701164686173, + "loss": 0.0789, + "num_input_tokens_seen": 27306320, + "step": 12645 + }, + { + "epoch": 2.063621533442088, + "grad_norm": 0.040308158844709396, + "learning_rate": 0.0009999693331931446, + "loss": 0.0774, + "num_input_tokens_seen": 27316720, + "step": 12650 + }, + { + "epoch": 2.064437194127243, + "grad_norm": 0.012962274253368378, + "learning_rate": 0.000999968539785159, + "loss": 0.0142, + "num_input_tokens_seen": 27327408, + "step": 12655 + }, + { + "epoch": 2.065252854812398, + "grad_norm": 0.19636216759681702, + "learning_rate": 0.0009999677362446768, + "loss": 0.1811, + "num_input_tokens_seen": 27338672, + "step": 12660 + }, + { + "epoch": 2.066068515497553, + "grad_norm": 0.1305185854434967, + "learning_rate": 0.000999966922571714, + "loss": 0.2339, + "num_input_tokens_seen": 27350352, + "step": 12665 + }, + { + "epoch": 2.066884176182708, + "grad_norm": 0.018722908571362495, + "learning_rate": 0.0009999660987662876, + "loss": 0.0678, + "num_input_tokens_seen": 27360432, + "step": 12670 + }, + { + "epoch": 2.067699836867863, + "grad_norm": 0.049624983221292496, + "learning_rate": 0.0009999652648284136, + "loss": 0.145, + "num_input_tokens_seen": 27369840, + "step": 12675 + }, + { + "epoch": 2.068515497553018, + "grad_norm": 0.058470483869314194, + "learning_rate": 0.0009999644207581092, + "loss": 0.034, + "num_input_tokens_seen": 27380016, + "step": 12680 + }, + { + "epoch": 2.069331158238173, + "grad_norm": 0.15666672587394714, + "learning_rate": 0.000999963566555392, + "loss": 0.1268, + "num_input_tokens_seen": 27391504, + "step": 12685 + }, + { + "epoch": 2.070146818923328, + "grad_norm": 0.05239586904644966, + "learning_rate": 0.0009999627022202785, + "loss": 0.0625, + "num_input_tokens_seen": 27402064, + "step": 12690 + }, + { + "epoch": 2.070962479608483, + "grad_norm": 0.04302447289228439, + "learning_rate": 0.0009999618277527868, + "loss": 0.1068, + "num_input_tokens_seen": 27413808, + "step": 12695 + }, + { + "epoch": 2.0717781402936377, + "grad_norm": 0.027048500254750252, + "learning_rate": 0.0009999609431529345, + "loss": 0.1593, + "num_input_tokens_seen": 27425520, + "step": 12700 + }, + { + "epoch": 2.072593800978793, + "grad_norm": 0.21865993738174438, + "learning_rate": 0.0009999600484207392, + "loss": 0.2108, + "num_input_tokens_seen": 27436624, + "step": 12705 + }, + { + "epoch": 2.073409461663948, + "grad_norm": 0.03209158033132553, + "learning_rate": 0.0009999591435562193, + "loss": 0.1168, + "num_input_tokens_seen": 27448176, + "step": 12710 + }, + { + "epoch": 2.0742251223491026, + "grad_norm": 0.10187777876853943, + "learning_rate": 0.0009999582285593932, + "loss": 0.0966, + "num_input_tokens_seen": 27459568, + "step": 12715 + }, + { + "epoch": 2.075040783034258, + "grad_norm": 0.1484181135892868, + "learning_rate": 0.0009999573034302793, + "loss": 0.0918, + "num_input_tokens_seen": 27470800, + "step": 12720 + }, + { + "epoch": 2.0758564437194127, + "grad_norm": 0.07416558265686035, + "learning_rate": 0.0009999563681688964, + "loss": 0.0704, + "num_input_tokens_seen": 27481168, + "step": 12725 + }, + { + "epoch": 2.0766721044045675, + "grad_norm": 0.05581318587064743, + "learning_rate": 0.0009999554227752634, + "loss": 0.068, + "num_input_tokens_seen": 27492528, + "step": 12730 + }, + { + "epoch": 2.0774877650897228, + "grad_norm": 0.030746089294552803, + "learning_rate": 0.0009999544672493997, + "loss": 0.1336, + "num_input_tokens_seen": 27503152, + "step": 12735 + }, + { + "epoch": 2.0783034257748776, + "grad_norm": 0.06911895424127579, + "learning_rate": 0.0009999535015913243, + "loss": 0.0708, + "num_input_tokens_seen": 27513904, + "step": 12740 + }, + { + "epoch": 2.0791190864600324, + "grad_norm": 0.2280171811580658, + "learning_rate": 0.0009999525258010571, + "loss": 0.2823, + "num_input_tokens_seen": 27525648, + "step": 12745 + }, + { + "epoch": 2.0799347471451877, + "grad_norm": 0.029335683211684227, + "learning_rate": 0.0009999515398786177, + "loss": 0.0604, + "num_input_tokens_seen": 27537264, + "step": 12750 + }, + { + "epoch": 2.0807504078303425, + "grad_norm": 0.15743935108184814, + "learning_rate": 0.000999950543824026, + "loss": 0.0811, + "num_input_tokens_seen": 27548144, + "step": 12755 + }, + { + "epoch": 2.0815660685154977, + "grad_norm": 0.27116477489471436, + "learning_rate": 0.0009999495376373025, + "loss": 0.1132, + "num_input_tokens_seen": 27559568, + "step": 12760 + }, + { + "epoch": 2.0823817292006526, + "grad_norm": 0.3159608244895935, + "learning_rate": 0.0009999485213184672, + "loss": 0.145, + "num_input_tokens_seen": 27570544, + "step": 12765 + }, + { + "epoch": 2.0831973898858074, + "grad_norm": 0.08795563131570816, + "learning_rate": 0.000999947494867541, + "loss": 0.2114, + "num_input_tokens_seen": 27581616, + "step": 12770 + }, + { + "epoch": 2.0840130505709626, + "grad_norm": 0.012950689531862736, + "learning_rate": 0.0009999464582845445, + "loss": 0.1316, + "num_input_tokens_seen": 27592112, + "step": 12775 + }, + { + "epoch": 2.0848287112561175, + "grad_norm": 0.2127687633037567, + "learning_rate": 0.0009999454115694989, + "loss": 0.207, + "num_input_tokens_seen": 27603056, + "step": 12780 + }, + { + "epoch": 2.0856443719412723, + "grad_norm": 0.05413515120744705, + "learning_rate": 0.0009999443547224253, + "loss": 0.1051, + "num_input_tokens_seen": 27613616, + "step": 12785 + }, + { + "epoch": 2.0864600326264275, + "grad_norm": 0.03091531991958618, + "learning_rate": 0.0009999432877433449, + "loss": 0.1603, + "num_input_tokens_seen": 27624784, + "step": 12790 + }, + { + "epoch": 2.0872756933115824, + "grad_norm": 0.023545589298009872, + "learning_rate": 0.0009999422106322798, + "loss": 0.0819, + "num_input_tokens_seen": 27636048, + "step": 12795 + }, + { + "epoch": 2.088091353996737, + "grad_norm": 0.07782018184661865, + "learning_rate": 0.0009999411233892516, + "loss": 0.1292, + "num_input_tokens_seen": 27645328, + "step": 12800 + }, + { + "epoch": 2.0889070146818924, + "grad_norm": 0.04108646512031555, + "learning_rate": 0.000999940026014282, + "loss": 0.1473, + "num_input_tokens_seen": 27655024, + "step": 12805 + }, + { + "epoch": 2.0897226753670473, + "grad_norm": 0.21999132633209229, + "learning_rate": 0.000999938918507394, + "loss": 0.1712, + "num_input_tokens_seen": 27666224, + "step": 12810 + }, + { + "epoch": 2.090538336052202, + "grad_norm": 0.01640498638153076, + "learning_rate": 0.0009999378008686093, + "loss": 0.0231, + "num_input_tokens_seen": 27677904, + "step": 12815 + }, + { + "epoch": 2.0913539967373573, + "grad_norm": 0.1342468559741974, + "learning_rate": 0.000999936673097951, + "loss": 0.1203, + "num_input_tokens_seen": 27687088, + "step": 12820 + }, + { + "epoch": 2.092169657422512, + "grad_norm": 0.19053426384925842, + "learning_rate": 0.0009999355351954418, + "loss": 0.2077, + "num_input_tokens_seen": 27697008, + "step": 12825 + }, + { + "epoch": 2.0929853181076674, + "grad_norm": 0.03679375723004341, + "learning_rate": 0.0009999343871611045, + "loss": 0.0647, + "num_input_tokens_seen": 27708208, + "step": 12830 + }, + { + "epoch": 2.0938009787928222, + "grad_norm": 0.039942409843206406, + "learning_rate": 0.000999933228994963, + "loss": 0.1776, + "num_input_tokens_seen": 27718704, + "step": 12835 + }, + { + "epoch": 2.094616639477977, + "grad_norm": 0.03391319513320923, + "learning_rate": 0.00099993206069704, + "loss": 0.1189, + "num_input_tokens_seen": 27728816, + "step": 12840 + }, + { + "epoch": 2.0954323001631323, + "grad_norm": 0.16504710912704468, + "learning_rate": 0.0009999308822673599, + "loss": 0.1629, + "num_input_tokens_seen": 27738992, + "step": 12845 + }, + { + "epoch": 2.096247960848287, + "grad_norm": 0.08140378445386887, + "learning_rate": 0.000999929693705946, + "loss": 0.1359, + "num_input_tokens_seen": 27750384, + "step": 12850 + }, + { + "epoch": 2.097063621533442, + "grad_norm": 0.03582719340920448, + "learning_rate": 0.000999928495012823, + "loss": 0.0872, + "num_input_tokens_seen": 27760784, + "step": 12855 + }, + { + "epoch": 2.097879282218597, + "grad_norm": 0.03194251284003258, + "learning_rate": 0.0009999272861880148, + "loss": 0.076, + "num_input_tokens_seen": 27771952, + "step": 12860 + }, + { + "epoch": 2.098694942903752, + "grad_norm": 0.10031555593013763, + "learning_rate": 0.0009999260672315456, + "loss": 0.1371, + "num_input_tokens_seen": 27781744, + "step": 12865 + }, + { + "epoch": 2.099510603588907, + "grad_norm": 0.032720427960157394, + "learning_rate": 0.0009999248381434406, + "loss": 0.0264, + "num_input_tokens_seen": 27792944, + "step": 12870 + }, + { + "epoch": 2.100326264274062, + "grad_norm": 0.05713163688778877, + "learning_rate": 0.0009999235989237249, + "loss": 0.0523, + "num_input_tokens_seen": 27804048, + "step": 12875 + }, + { + "epoch": 2.101141924959217, + "grad_norm": 0.030589789152145386, + "learning_rate": 0.0009999223495724228, + "loss": 0.1469, + "num_input_tokens_seen": 27814960, + "step": 12880 + }, + { + "epoch": 2.1019575856443717, + "grad_norm": 0.007542574778199196, + "learning_rate": 0.0009999210900895603, + "loss": 0.0453, + "num_input_tokens_seen": 27827280, + "step": 12885 + }, + { + "epoch": 2.102773246329527, + "grad_norm": 0.20224641263484955, + "learning_rate": 0.0009999198204751628, + "loss": 0.1165, + "num_input_tokens_seen": 27837520, + "step": 12890 + }, + { + "epoch": 2.103588907014682, + "grad_norm": 0.15749670565128326, + "learning_rate": 0.0009999185407292557, + "loss": 0.1158, + "num_input_tokens_seen": 27848912, + "step": 12895 + }, + { + "epoch": 2.104404567699837, + "grad_norm": 0.030561348423361778, + "learning_rate": 0.0009999172508518654, + "loss": 0.0505, + "num_input_tokens_seen": 27859504, + "step": 12900 + }, + { + "epoch": 2.105220228384992, + "grad_norm": 0.03285450488328934, + "learning_rate": 0.0009999159508430177, + "loss": 0.0992, + "num_input_tokens_seen": 27870736, + "step": 12905 + }, + { + "epoch": 2.1060358890701467, + "grad_norm": 0.07831714302301407, + "learning_rate": 0.000999914640702739, + "loss": 0.1413, + "num_input_tokens_seen": 27880720, + "step": 12910 + }, + { + "epoch": 2.106851549755302, + "grad_norm": 0.20582278072834015, + "learning_rate": 0.000999913320431056, + "loss": 0.2452, + "num_input_tokens_seen": 27892048, + "step": 12915 + }, + { + "epoch": 2.107667210440457, + "grad_norm": 0.023099590092897415, + "learning_rate": 0.0009999119900279956, + "loss": 0.0393, + "num_input_tokens_seen": 27903760, + "step": 12920 + }, + { + "epoch": 2.1084828711256116, + "grad_norm": 0.041197262704372406, + "learning_rate": 0.0009999106494935843, + "loss": 0.1241, + "num_input_tokens_seen": 27914192, + "step": 12925 + }, + { + "epoch": 2.109298531810767, + "grad_norm": 0.021470213308930397, + "learning_rate": 0.0009999092988278496, + "loss": 0.0317, + "num_input_tokens_seen": 27925936, + "step": 12930 + }, + { + "epoch": 2.1101141924959217, + "grad_norm": 0.11283034086227417, + "learning_rate": 0.0009999079380308186, + "loss": 0.2112, + "num_input_tokens_seen": 27937392, + "step": 12935 + }, + { + "epoch": 2.1109298531810765, + "grad_norm": 0.08010166138410568, + "learning_rate": 0.000999906567102519, + "loss": 0.0796, + "num_input_tokens_seen": 27948304, + "step": 12940 + }, + { + "epoch": 2.1117455138662318, + "grad_norm": 0.09811649471521378, + "learning_rate": 0.0009999051860429791, + "loss": 0.0569, + "num_input_tokens_seen": 27959344, + "step": 12945 + }, + { + "epoch": 2.1125611745513866, + "grad_norm": 0.23027090728282928, + "learning_rate": 0.000999903794852226, + "loss": 0.3074, + "num_input_tokens_seen": 27970032, + "step": 12950 + }, + { + "epoch": 2.1133768352365414, + "grad_norm": 0.14236557483673096, + "learning_rate": 0.0009999023935302886, + "loss": 0.1646, + "num_input_tokens_seen": 27979984, + "step": 12955 + }, + { + "epoch": 2.1141924959216967, + "grad_norm": 0.023605871945619583, + "learning_rate": 0.000999900982077195, + "loss": 0.1201, + "num_input_tokens_seen": 27989200, + "step": 12960 + }, + { + "epoch": 2.1150081566068515, + "grad_norm": 0.05145183205604553, + "learning_rate": 0.0009998995604929735, + "loss": 0.0955, + "num_input_tokens_seen": 28000432, + "step": 12965 + }, + { + "epoch": 2.1158238172920063, + "grad_norm": 0.13342423737049103, + "learning_rate": 0.0009998981287776536, + "loss": 0.1178, + "num_input_tokens_seen": 28010064, + "step": 12970 + }, + { + "epoch": 2.1166394779771616, + "grad_norm": 0.21482190489768982, + "learning_rate": 0.0009998966869312637, + "loss": 0.187, + "num_input_tokens_seen": 28021776, + "step": 12975 + }, + { + "epoch": 2.1174551386623164, + "grad_norm": 0.038029927760362625, + "learning_rate": 0.0009998952349538335, + "loss": 0.1751, + "num_input_tokens_seen": 28032656, + "step": 12980 + }, + { + "epoch": 2.1182707993474716, + "grad_norm": 0.003811969654634595, + "learning_rate": 0.000999893772845392, + "loss": 0.0721, + "num_input_tokens_seen": 28043728, + "step": 12985 + }, + { + "epoch": 2.1190864600326265, + "grad_norm": 0.036346595734357834, + "learning_rate": 0.0009998923006059692, + "loss": 0.1764, + "num_input_tokens_seen": 28055120, + "step": 12990 + }, + { + "epoch": 2.1199021207177813, + "grad_norm": 0.21362502872943878, + "learning_rate": 0.0009998908182355948, + "loss": 0.1384, + "num_input_tokens_seen": 28065936, + "step": 12995 + }, + { + "epoch": 2.1207177814029365, + "grad_norm": 0.15023855865001678, + "learning_rate": 0.0009998893257342986, + "loss": 0.0848, + "num_input_tokens_seen": 28077040, + "step": 13000 + }, + { + "epoch": 2.1215334420880914, + "grad_norm": 0.1258479654788971, + "learning_rate": 0.000999887823102111, + "loss": 0.064, + "num_input_tokens_seen": 28088624, + "step": 13005 + }, + { + "epoch": 2.122349102773246, + "grad_norm": 0.13647812604904175, + "learning_rate": 0.0009998863103390628, + "loss": 0.219, + "num_input_tokens_seen": 28097520, + "step": 13010 + }, + { + "epoch": 2.1231647634584014, + "grad_norm": 0.0504603236913681, + "learning_rate": 0.0009998847874451843, + "loss": 0.2375, + "num_input_tokens_seen": 28107504, + "step": 13015 + }, + { + "epoch": 2.1239804241435563, + "grad_norm": 0.11593887209892273, + "learning_rate": 0.0009998832544205064, + "loss": 0.1569, + "num_input_tokens_seen": 28118352, + "step": 13020 + }, + { + "epoch": 2.124796084828711, + "grad_norm": 0.08521158248186111, + "learning_rate": 0.0009998817112650603, + "loss": 0.0633, + "num_input_tokens_seen": 28129936, + "step": 13025 + }, + { + "epoch": 2.1256117455138663, + "grad_norm": 0.15191933512687683, + "learning_rate": 0.000999880157978877, + "loss": 0.1817, + "num_input_tokens_seen": 28140496, + "step": 13030 + }, + { + "epoch": 2.126427406199021, + "grad_norm": 0.1695515215396881, + "learning_rate": 0.0009998785945619882, + "loss": 0.1332, + "num_input_tokens_seen": 28151440, + "step": 13035 + }, + { + "epoch": 2.1272430668841764, + "grad_norm": 0.09944857656955719, + "learning_rate": 0.0009998770210144256, + "loss": 0.1264, + "num_input_tokens_seen": 28161200, + "step": 13040 + }, + { + "epoch": 2.1280587275693312, + "grad_norm": 0.13683369755744934, + "learning_rate": 0.000999875437336221, + "loss": 0.0561, + "num_input_tokens_seen": 28171568, + "step": 13045 + }, + { + "epoch": 2.128874388254486, + "grad_norm": 0.06536974757909775, + "learning_rate": 0.0009998738435274064, + "loss": 0.1863, + "num_input_tokens_seen": 28183344, + "step": 13050 + }, + { + "epoch": 2.1296900489396413, + "grad_norm": 0.03261822462081909, + "learning_rate": 0.0009998722395880145, + "loss": 0.0808, + "num_input_tokens_seen": 28193552, + "step": 13055 + }, + { + "epoch": 2.130505709624796, + "grad_norm": 0.022606901824474335, + "learning_rate": 0.0009998706255180774, + "loss": 0.142, + "num_input_tokens_seen": 28203600, + "step": 13060 + }, + { + "epoch": 2.131321370309951, + "grad_norm": 0.052153971046209335, + "learning_rate": 0.0009998690013176279, + "loss": 0.326, + "num_input_tokens_seen": 28213584, + "step": 13065 + }, + { + "epoch": 2.132137030995106, + "grad_norm": 0.02137759141623974, + "learning_rate": 0.0009998673669866988, + "loss": 0.0758, + "num_input_tokens_seen": 28224016, + "step": 13070 + }, + { + "epoch": 2.132952691680261, + "grad_norm": 0.025548746809363365, + "learning_rate": 0.0009998657225253236, + "loss": 0.1375, + "num_input_tokens_seen": 28235760, + "step": 13075 + }, + { + "epoch": 2.133768352365416, + "grad_norm": 0.05538545548915863, + "learning_rate": 0.0009998640679335354, + "loss": 0.1466, + "num_input_tokens_seen": 28244656, + "step": 13080 + }, + { + "epoch": 2.134584013050571, + "grad_norm": 0.09617538750171661, + "learning_rate": 0.0009998624032113677, + "loss": 0.0928, + "num_input_tokens_seen": 28254096, + "step": 13085 + }, + { + "epoch": 2.135399673735726, + "grad_norm": 0.0704553872346878, + "learning_rate": 0.0009998607283588543, + "loss": 0.1518, + "num_input_tokens_seen": 28264880, + "step": 13090 + }, + { + "epoch": 2.1362153344208807, + "grad_norm": 0.08678105473518372, + "learning_rate": 0.000999859043376029, + "loss": 0.1116, + "num_input_tokens_seen": 28274448, + "step": 13095 + }, + { + "epoch": 2.137030995106036, + "grad_norm": 0.10627443343400955, + "learning_rate": 0.0009998573482629264, + "loss": 0.0721, + "num_input_tokens_seen": 28285776, + "step": 13100 + }, + { + "epoch": 2.137846655791191, + "grad_norm": 0.1952490359544754, + "learning_rate": 0.0009998556430195803, + "loss": 0.1298, + "num_input_tokens_seen": 28296016, + "step": 13105 + }, + { + "epoch": 2.1386623164763456, + "grad_norm": 0.1826934814453125, + "learning_rate": 0.0009998539276460255, + "loss": 0.1691, + "num_input_tokens_seen": 28307952, + "step": 13110 + }, + { + "epoch": 2.139477977161501, + "grad_norm": 0.03026331402361393, + "learning_rate": 0.0009998522021422967, + "loss": 0.0911, + "num_input_tokens_seen": 28318960, + "step": 13115 + }, + { + "epoch": 2.1402936378466557, + "grad_norm": 0.040153101086616516, + "learning_rate": 0.000999850466508429, + "loss": 0.0223, + "num_input_tokens_seen": 28329136, + "step": 13120 + }, + { + "epoch": 2.141109298531811, + "grad_norm": 0.02977476455271244, + "learning_rate": 0.0009998487207444574, + "loss": 0.0392, + "num_input_tokens_seen": 28339696, + "step": 13125 + }, + { + "epoch": 2.141924959216966, + "grad_norm": 0.18199479579925537, + "learning_rate": 0.0009998469648504174, + "loss": 0.1711, + "num_input_tokens_seen": 28351440, + "step": 13130 + }, + { + "epoch": 2.1427406199021206, + "grad_norm": 0.04141247272491455, + "learning_rate": 0.0009998451988263444, + "loss": 0.1299, + "num_input_tokens_seen": 28361232, + "step": 13135 + }, + { + "epoch": 2.143556280587276, + "grad_norm": 0.02136673964560032, + "learning_rate": 0.0009998434226722746, + "loss": 0.1407, + "num_input_tokens_seen": 28370864, + "step": 13140 + }, + { + "epoch": 2.1443719412724307, + "grad_norm": 0.15826228260993958, + "learning_rate": 0.0009998416363882438, + "loss": 0.0466, + "num_input_tokens_seen": 28381456, + "step": 13145 + }, + { + "epoch": 2.1451876019575855, + "grad_norm": 0.02003076858818531, + "learning_rate": 0.0009998398399742878, + "loss": 0.1206, + "num_input_tokens_seen": 28391600, + "step": 13150 + }, + { + "epoch": 2.1460032626427408, + "grad_norm": 0.14566271007061005, + "learning_rate": 0.0009998380334304436, + "loss": 0.1604, + "num_input_tokens_seen": 28402640, + "step": 13155 + }, + { + "epoch": 2.1468189233278956, + "grad_norm": 0.17364197969436646, + "learning_rate": 0.0009998362167567476, + "loss": 0.2252, + "num_input_tokens_seen": 28412272, + "step": 13160 + }, + { + "epoch": 2.1476345840130504, + "grad_norm": 0.09216035157442093, + "learning_rate": 0.0009998343899532364, + "loss": 0.1453, + "num_input_tokens_seen": 28423312, + "step": 13165 + }, + { + "epoch": 2.1484502446982057, + "grad_norm": 0.08613400161266327, + "learning_rate": 0.0009998325530199473, + "loss": 0.2749, + "num_input_tokens_seen": 28433424, + "step": 13170 + }, + { + "epoch": 2.1492659053833605, + "grad_norm": 0.22874774038791656, + "learning_rate": 0.0009998307059569174, + "loss": 0.1813, + "num_input_tokens_seen": 28443984, + "step": 13175 + }, + { + "epoch": 2.1500815660685153, + "grad_norm": 0.043442502617836, + "learning_rate": 0.0009998288487641843, + "loss": 0.1528, + "num_input_tokens_seen": 28454832, + "step": 13180 + }, + { + "epoch": 2.1508972267536706, + "grad_norm": 0.04164640232920647, + "learning_rate": 0.0009998269814417854, + "loss": 0.1319, + "num_input_tokens_seen": 28467280, + "step": 13185 + }, + { + "epoch": 2.1517128874388254, + "grad_norm": 0.20310817658901215, + "learning_rate": 0.0009998251039897586, + "loss": 0.2783, + "num_input_tokens_seen": 28478128, + "step": 13190 + }, + { + "epoch": 2.15252854812398, + "grad_norm": 0.11066415160894394, + "learning_rate": 0.000999823216408142, + "loss": 0.1351, + "num_input_tokens_seen": 28489200, + "step": 13195 + }, + { + "epoch": 2.1533442088091355, + "grad_norm": 0.026592491194605827, + "learning_rate": 0.0009998213186969739, + "loss": 0.0583, + "num_input_tokens_seen": 28500240, + "step": 13200 + }, + { + "epoch": 2.1541598694942903, + "grad_norm": 0.10467559844255447, + "learning_rate": 0.0009998194108562927, + "loss": 0.0912, + "num_input_tokens_seen": 28511824, + "step": 13205 + }, + { + "epoch": 2.1549755301794455, + "grad_norm": 0.0133062694221735, + "learning_rate": 0.000999817492886137, + "loss": 0.065, + "num_input_tokens_seen": 28521456, + "step": 13210 + }, + { + "epoch": 2.1557911908646004, + "grad_norm": 0.1335669904947281, + "learning_rate": 0.000999815564786546, + "loss": 0.1204, + "num_input_tokens_seen": 28531888, + "step": 13215 + }, + { + "epoch": 2.156606851549755, + "grad_norm": 0.008207677863538265, + "learning_rate": 0.0009998136265575582, + "loss": 0.0841, + "num_input_tokens_seen": 28541616, + "step": 13220 + }, + { + "epoch": 2.1574225122349104, + "grad_norm": 0.0902860090136528, + "learning_rate": 0.0009998116781992133, + "loss": 0.0976, + "num_input_tokens_seen": 28553776, + "step": 13225 + }, + { + "epoch": 2.1582381729200653, + "grad_norm": 0.14238712191581726, + "learning_rate": 0.0009998097197115507, + "loss": 0.0623, + "num_input_tokens_seen": 28565680, + "step": 13230 + }, + { + "epoch": 2.15905383360522, + "grad_norm": 0.12988384068012238, + "learning_rate": 0.00099980775109461, + "loss": 0.1411, + "num_input_tokens_seen": 28576560, + "step": 13235 + }, + { + "epoch": 2.1598694942903753, + "grad_norm": 0.018119171261787415, + "learning_rate": 0.0009998057723484312, + "loss": 0.3073, + "num_input_tokens_seen": 28586352, + "step": 13240 + }, + { + "epoch": 2.16068515497553, + "grad_norm": 0.14208227396011353, + "learning_rate": 0.0009998037834730545, + "loss": 0.1782, + "num_input_tokens_seen": 28598416, + "step": 13245 + }, + { + "epoch": 2.161500815660685, + "grad_norm": 0.01080414094030857, + "learning_rate": 0.0009998017844685201, + "loss": 0.1514, + "num_input_tokens_seen": 28609424, + "step": 13250 + }, + { + "epoch": 2.1623164763458402, + "grad_norm": 0.08940687030553818, + "learning_rate": 0.0009997997753348684, + "loss": 0.0888, + "num_input_tokens_seen": 28619184, + "step": 13255 + }, + { + "epoch": 2.163132137030995, + "grad_norm": 0.05870174244046211, + "learning_rate": 0.0009997977560721402, + "loss": 0.1142, + "num_input_tokens_seen": 28629232, + "step": 13260 + }, + { + "epoch": 2.1639477977161503, + "grad_norm": 0.06812714040279388, + "learning_rate": 0.0009997957266803766, + "loss": 0.0554, + "num_input_tokens_seen": 28639408, + "step": 13265 + }, + { + "epoch": 2.164763458401305, + "grad_norm": 0.031555160880088806, + "learning_rate": 0.0009997936871596182, + "loss": 0.1425, + "num_input_tokens_seen": 28650736, + "step": 13270 + }, + { + "epoch": 2.16557911908646, + "grad_norm": 0.1013028547167778, + "learning_rate": 0.000999791637509907, + "loss": 0.0724, + "num_input_tokens_seen": 28660784, + "step": 13275 + }, + { + "epoch": 2.166394779771615, + "grad_norm": 0.16207702457904816, + "learning_rate": 0.0009997895777312843, + "loss": 0.0981, + "num_input_tokens_seen": 28671248, + "step": 13280 + }, + { + "epoch": 2.16721044045677, + "grad_norm": 0.15483161807060242, + "learning_rate": 0.0009997875078237915, + "loss": 0.1272, + "num_input_tokens_seen": 28682000, + "step": 13285 + }, + { + "epoch": 2.168026101141925, + "grad_norm": 0.09992340952157974, + "learning_rate": 0.000999785427787471, + "loss": 0.1712, + "num_input_tokens_seen": 28692912, + "step": 13290 + }, + { + "epoch": 2.16884176182708, + "grad_norm": 0.239858478307724, + "learning_rate": 0.0009997833376223647, + "loss": 0.2184, + "num_input_tokens_seen": 28703504, + "step": 13295 + }, + { + "epoch": 2.169657422512235, + "grad_norm": 0.030496256425976753, + "learning_rate": 0.000999781237328515, + "loss": 0.0564, + "num_input_tokens_seen": 28714416, + "step": 13300 + }, + { + "epoch": 2.1704730831973897, + "grad_norm": 0.16649505496025085, + "learning_rate": 0.0009997791269059646, + "loss": 0.1128, + "num_input_tokens_seen": 28725200, + "step": 13305 + }, + { + "epoch": 2.171288743882545, + "grad_norm": 0.13046658039093018, + "learning_rate": 0.0009997770063547562, + "loss": 0.0726, + "num_input_tokens_seen": 28736176, + "step": 13310 + }, + { + "epoch": 2.1721044045677, + "grad_norm": 0.0701921209692955, + "learning_rate": 0.0009997748756749327, + "loss": 0.0467, + "num_input_tokens_seen": 28746928, + "step": 13315 + }, + { + "epoch": 2.1729200652528546, + "grad_norm": 0.13671933114528656, + "learning_rate": 0.0009997727348665373, + "loss": 0.1444, + "num_input_tokens_seen": 28757584, + "step": 13320 + }, + { + "epoch": 2.17373572593801, + "grad_norm": 0.12040413916110992, + "learning_rate": 0.0009997705839296135, + "loss": 0.0553, + "num_input_tokens_seen": 28766224, + "step": 13325 + }, + { + "epoch": 2.1745513866231647, + "grad_norm": 0.02872053161263466, + "learning_rate": 0.0009997684228642049, + "loss": 0.0494, + "num_input_tokens_seen": 28775984, + "step": 13330 + }, + { + "epoch": 2.1753670473083195, + "grad_norm": 0.005882376339286566, + "learning_rate": 0.0009997662516703552, + "loss": 0.1211, + "num_input_tokens_seen": 28787056, + "step": 13335 + }, + { + "epoch": 2.176182707993475, + "grad_norm": 0.1135839968919754, + "learning_rate": 0.0009997640703481082, + "loss": 0.1063, + "num_input_tokens_seen": 28797648, + "step": 13340 + }, + { + "epoch": 2.1769983686786296, + "grad_norm": 0.011162982322275639, + "learning_rate": 0.0009997618788975084, + "loss": 0.1024, + "num_input_tokens_seen": 28808656, + "step": 13345 + }, + { + "epoch": 2.177814029363785, + "grad_norm": 0.14825639128684998, + "learning_rate": 0.0009997596773186, + "loss": 0.0579, + "num_input_tokens_seen": 28818448, + "step": 13350 + }, + { + "epoch": 2.1786296900489397, + "grad_norm": 0.21489086747169495, + "learning_rate": 0.000999757465611428, + "loss": 0.127, + "num_input_tokens_seen": 28828752, + "step": 13355 + }, + { + "epoch": 2.1794453507340945, + "grad_norm": 0.19029727578163147, + "learning_rate": 0.000999755243776037, + "loss": 0.1167, + "num_input_tokens_seen": 28838896, + "step": 13360 + }, + { + "epoch": 2.1802610114192498, + "grad_norm": 0.013307043351233006, + "learning_rate": 0.000999753011812472, + "loss": 0.0844, + "num_input_tokens_seen": 28848880, + "step": 13365 + }, + { + "epoch": 2.1810766721044046, + "grad_norm": 0.17979095876216888, + "learning_rate": 0.000999750769720778, + "loss": 0.2494, + "num_input_tokens_seen": 28860592, + "step": 13370 + }, + { + "epoch": 2.1818923327895594, + "grad_norm": 0.029981788247823715, + "learning_rate": 0.0009997485175010008, + "loss": 0.1093, + "num_input_tokens_seen": 28872432, + "step": 13375 + }, + { + "epoch": 2.1827079934747147, + "grad_norm": 0.2070823758840561, + "learning_rate": 0.000999746255153186, + "loss": 0.0712, + "num_input_tokens_seen": 28883856, + "step": 13380 + }, + { + "epoch": 2.1835236541598695, + "grad_norm": 0.032613664865493774, + "learning_rate": 0.0009997439826773791, + "loss": 0.1579, + "num_input_tokens_seen": 28894384, + "step": 13385 + }, + { + "epoch": 2.1843393148450243, + "grad_norm": 0.29440703988075256, + "learning_rate": 0.0009997417000736266, + "loss": 0.0665, + "num_input_tokens_seen": 28902288, + "step": 13390 + }, + { + "epoch": 2.1851549755301796, + "grad_norm": 0.04788945987820625, + "learning_rate": 0.0009997394073419747, + "loss": 0.103, + "num_input_tokens_seen": 28913296, + "step": 13395 + }, + { + "epoch": 2.1859706362153344, + "grad_norm": 0.017567042261362076, + "learning_rate": 0.0009997371044824697, + "loss": 0.0758, + "num_input_tokens_seen": 28923888, + "step": 13400 + }, + { + "epoch": 2.186786296900489, + "grad_norm": 0.2214508205652237, + "learning_rate": 0.0009997347914951582, + "loss": 0.3458, + "num_input_tokens_seen": 28934640, + "step": 13405 + }, + { + "epoch": 2.1876019575856445, + "grad_norm": 0.009490398690104485, + "learning_rate": 0.0009997324683800872, + "loss": 0.1425, + "num_input_tokens_seen": 28945648, + "step": 13410 + }, + { + "epoch": 2.1884176182707993, + "grad_norm": 0.11446644365787506, + "learning_rate": 0.0009997301351373038, + "loss": 0.0965, + "num_input_tokens_seen": 28957392, + "step": 13415 + }, + { + "epoch": 2.189233278955954, + "grad_norm": 0.13229867815971375, + "learning_rate": 0.0009997277917668552, + "loss": 0.0841, + "num_input_tokens_seen": 28967056, + "step": 13420 + }, + { + "epoch": 2.1900489396411094, + "grad_norm": 0.02078085020184517, + "learning_rate": 0.000999725438268789, + "loss": 0.1588, + "num_input_tokens_seen": 28977808, + "step": 13425 + }, + { + "epoch": 2.190864600326264, + "grad_norm": 0.12957251071929932, + "learning_rate": 0.0009997230746431529, + "loss": 0.1236, + "num_input_tokens_seen": 28987440, + "step": 13430 + }, + { + "epoch": 2.1916802610114194, + "grad_norm": 0.08797205239534378, + "learning_rate": 0.0009997207008899946, + "loss": 0.066, + "num_input_tokens_seen": 28997392, + "step": 13435 + }, + { + "epoch": 2.1924959216965743, + "grad_norm": 0.2534710466861725, + "learning_rate": 0.0009997183170093625, + "loss": 0.2646, + "num_input_tokens_seen": 29007824, + "step": 13440 + }, + { + "epoch": 2.193311582381729, + "grad_norm": 0.12591876089572906, + "learning_rate": 0.000999715923001305, + "loss": 0.1257, + "num_input_tokens_seen": 29019024, + "step": 13445 + }, + { + "epoch": 2.1941272430668843, + "grad_norm": 0.10431301593780518, + "learning_rate": 0.00099971351886587, + "loss": 0.1406, + "num_input_tokens_seen": 29031600, + "step": 13450 + }, + { + "epoch": 2.194942903752039, + "grad_norm": 0.23118093609809875, + "learning_rate": 0.0009997111046031067, + "loss": 0.2524, + "num_input_tokens_seen": 29043056, + "step": 13455 + }, + { + "epoch": 2.195758564437194, + "grad_norm": 0.13233636319637299, + "learning_rate": 0.000999708680213064, + "loss": 0.1312, + "num_input_tokens_seen": 29053648, + "step": 13460 + }, + { + "epoch": 2.1965742251223492, + "grad_norm": 0.10303754359483719, + "learning_rate": 0.000999706245695791, + "loss": 0.1194, + "num_input_tokens_seen": 29064112, + "step": 13465 + }, + { + "epoch": 2.197389885807504, + "grad_norm": 0.031526438891887665, + "learning_rate": 0.0009997038010513368, + "loss": 0.0797, + "num_input_tokens_seen": 29075216, + "step": 13470 + }, + { + "epoch": 2.198205546492659, + "grad_norm": 0.2664559781551361, + "learning_rate": 0.0009997013462797514, + "loss": 0.1166, + "num_input_tokens_seen": 29084368, + "step": 13475 + }, + { + "epoch": 2.199021207177814, + "grad_norm": 0.022741097956895828, + "learning_rate": 0.000999698881381084, + "loss": 0.0414, + "num_input_tokens_seen": 29096016, + "step": 13480 + }, + { + "epoch": 2.199836867862969, + "grad_norm": 0.029821766540408134, + "learning_rate": 0.0009996964063553851, + "loss": 0.0422, + "num_input_tokens_seen": 29106768, + "step": 13485 + }, + { + "epoch": 2.200652528548124, + "grad_norm": 0.2790866792201996, + "learning_rate": 0.0009996939212027045, + "loss": 0.1781, + "num_input_tokens_seen": 29116560, + "step": 13490 + }, + { + "epoch": 2.201468189233279, + "grad_norm": 0.06957297027111053, + "learning_rate": 0.0009996914259230928, + "loss": 0.0812, + "num_input_tokens_seen": 29127376, + "step": 13495 + }, + { + "epoch": 2.202283849918434, + "grad_norm": 0.03924049437046051, + "learning_rate": 0.0009996889205166003, + "loss": 0.1376, + "num_input_tokens_seen": 29139088, + "step": 13500 + }, + { + "epoch": 2.203099510603589, + "grad_norm": 0.02585531957447529, + "learning_rate": 0.000999686404983278, + "loss": 0.0709, + "num_input_tokens_seen": 29149488, + "step": 13505 + }, + { + "epoch": 2.203915171288744, + "grad_norm": 0.02488371729850769, + "learning_rate": 0.0009996838793231771, + "loss": 0.0251, + "num_input_tokens_seen": 29159728, + "step": 13510 + }, + { + "epoch": 2.2047308319738987, + "grad_norm": 0.10791287571191788, + "learning_rate": 0.0009996813435363481, + "loss": 0.1225, + "num_input_tokens_seen": 29170928, + "step": 13515 + }, + { + "epoch": 2.205546492659054, + "grad_norm": 0.17194044589996338, + "learning_rate": 0.000999678797622843, + "loss": 0.1687, + "num_input_tokens_seen": 29180912, + "step": 13520 + }, + { + "epoch": 2.206362153344209, + "grad_norm": 0.019657181575894356, + "learning_rate": 0.000999676241582713, + "loss": 0.1058, + "num_input_tokens_seen": 29191216, + "step": 13525 + }, + { + "epoch": 2.2071778140293636, + "grad_norm": 0.14660616219043732, + "learning_rate": 0.0009996736754160102, + "loss": 0.0711, + "num_input_tokens_seen": 29201424, + "step": 13530 + }, + { + "epoch": 2.207993474714519, + "grad_norm": 0.01304369792342186, + "learning_rate": 0.0009996710991227865, + "loss": 0.1724, + "num_input_tokens_seen": 29211152, + "step": 13535 + }, + { + "epoch": 2.2088091353996737, + "grad_norm": 0.25268176198005676, + "learning_rate": 0.000999668512703094, + "loss": 0.1344, + "num_input_tokens_seen": 29221744, + "step": 13540 + }, + { + "epoch": 2.2096247960848285, + "grad_norm": 0.043931201100349426, + "learning_rate": 0.0009996659161569852, + "loss": 0.2054, + "num_input_tokens_seen": 29231568, + "step": 13545 + }, + { + "epoch": 2.210440456769984, + "grad_norm": 0.36950093507766724, + "learning_rate": 0.0009996633094845127, + "loss": 0.3584, + "num_input_tokens_seen": 29242128, + "step": 13550 + }, + { + "epoch": 2.2112561174551386, + "grad_norm": 0.0662706270813942, + "learning_rate": 0.0009996606926857296, + "loss": 0.113, + "num_input_tokens_seen": 29250992, + "step": 13555 + }, + { + "epoch": 2.2120717781402934, + "grad_norm": 0.10020389407873154, + "learning_rate": 0.0009996580657606886, + "loss": 0.0698, + "num_input_tokens_seen": 29261072, + "step": 13560 + }, + { + "epoch": 2.2128874388254487, + "grad_norm": 0.041853442788124084, + "learning_rate": 0.0009996554287094428, + "loss": 0.1319, + "num_input_tokens_seen": 29271760, + "step": 13565 + }, + { + "epoch": 2.2137030995106035, + "grad_norm": 0.13062892854213715, + "learning_rate": 0.0009996527815320463, + "loss": 0.2347, + "num_input_tokens_seen": 29282480, + "step": 13570 + }, + { + "epoch": 2.2145187601957588, + "grad_norm": 0.2043205052614212, + "learning_rate": 0.000999650124228552, + "loss": 0.1243, + "num_input_tokens_seen": 29292912, + "step": 13575 + }, + { + "epoch": 2.2153344208809136, + "grad_norm": 0.04886353760957718, + "learning_rate": 0.0009996474567990142, + "loss": 0.074, + "num_input_tokens_seen": 29302800, + "step": 13580 + }, + { + "epoch": 2.2161500815660684, + "grad_norm": 0.03988846391439438, + "learning_rate": 0.0009996447792434868, + "loss": 0.0565, + "num_input_tokens_seen": 29313424, + "step": 13585 + }, + { + "epoch": 2.2169657422512237, + "grad_norm": 0.18873798847198486, + "learning_rate": 0.000999642091562024, + "loss": 0.2115, + "num_input_tokens_seen": 29323856, + "step": 13590 + }, + { + "epoch": 2.2177814029363785, + "grad_norm": 0.010911782272160053, + "learning_rate": 0.0009996393937546806, + "loss": 0.0724, + "num_input_tokens_seen": 29334832, + "step": 13595 + }, + { + "epoch": 2.2185970636215333, + "grad_norm": 0.025439320132136345, + "learning_rate": 0.000999636685821511, + "loss": 0.0434, + "num_input_tokens_seen": 29345936, + "step": 13600 + }, + { + "epoch": 2.2194127243066886, + "grad_norm": 0.23711273074150085, + "learning_rate": 0.0009996339677625702, + "loss": 0.1873, + "num_input_tokens_seen": 29356560, + "step": 13605 + }, + { + "epoch": 2.2202283849918434, + "grad_norm": 0.01632430963218212, + "learning_rate": 0.000999631239577913, + "loss": 0.0582, + "num_input_tokens_seen": 29367024, + "step": 13610 + }, + { + "epoch": 2.221044045676998, + "grad_norm": 0.06748680770397186, + "learning_rate": 0.000999628501267595, + "loss": 0.0753, + "num_input_tokens_seen": 29376944, + "step": 13615 + }, + { + "epoch": 2.2218597063621535, + "grad_norm": 0.14331598579883575, + "learning_rate": 0.0009996257528316716, + "loss": 0.1977, + "num_input_tokens_seen": 29389552, + "step": 13620 + }, + { + "epoch": 2.2226753670473083, + "grad_norm": 0.01919965259730816, + "learning_rate": 0.0009996229942701984, + "loss": 0.0343, + "num_input_tokens_seen": 29399600, + "step": 13625 + }, + { + "epoch": 2.223491027732463, + "grad_norm": 0.047127995640039444, + "learning_rate": 0.0009996202255832317, + "loss": 0.0555, + "num_input_tokens_seen": 29410896, + "step": 13630 + }, + { + "epoch": 2.2243066884176184, + "grad_norm": 0.24656490981578827, + "learning_rate": 0.000999617446770827, + "loss": 0.0794, + "num_input_tokens_seen": 29421136, + "step": 13635 + }, + { + "epoch": 2.225122349102773, + "grad_norm": 0.12916496396064758, + "learning_rate": 0.0009996146578330409, + "loss": 0.0894, + "num_input_tokens_seen": 29431536, + "step": 13640 + }, + { + "epoch": 2.225938009787928, + "grad_norm": 0.005559421610087156, + "learning_rate": 0.0009996118587699302, + "loss": 0.1604, + "num_input_tokens_seen": 29442416, + "step": 13645 + }, + { + "epoch": 2.2267536704730833, + "grad_norm": 0.22010643780231476, + "learning_rate": 0.0009996090495815514, + "loss": 0.0799, + "num_input_tokens_seen": 29453232, + "step": 13650 + }, + { + "epoch": 2.227569331158238, + "grad_norm": 0.03835262730717659, + "learning_rate": 0.000999606230267961, + "loss": 0.0452, + "num_input_tokens_seen": 29464976, + "step": 13655 + }, + { + "epoch": 2.2283849918433933, + "grad_norm": 0.17862075567245483, + "learning_rate": 0.000999603400829217, + "loss": 0.1201, + "num_input_tokens_seen": 29475536, + "step": 13660 + }, + { + "epoch": 2.229200652528548, + "grad_norm": 0.00975730549544096, + "learning_rate": 0.0009996005612653762, + "loss": 0.0145, + "num_input_tokens_seen": 29486256, + "step": 13665 + }, + { + "epoch": 2.230016313213703, + "grad_norm": 0.08683815598487854, + "learning_rate": 0.000999597711576496, + "loss": 0.0832, + "num_input_tokens_seen": 29498512, + "step": 13670 + }, + { + "epoch": 2.2308319738988582, + "grad_norm": 0.07465891540050507, + "learning_rate": 0.0009995948517626347, + "loss": 0.1191, + "num_input_tokens_seen": 29509360, + "step": 13675 + }, + { + "epoch": 2.231647634584013, + "grad_norm": 0.006697999779134989, + "learning_rate": 0.0009995919818238496, + "loss": 0.0416, + "num_input_tokens_seen": 29520400, + "step": 13680 + }, + { + "epoch": 2.232463295269168, + "grad_norm": 0.07438579201698303, + "learning_rate": 0.0009995891017601996, + "loss": 0.1862, + "num_input_tokens_seen": 29532016, + "step": 13685 + }, + { + "epoch": 2.233278955954323, + "grad_norm": 0.0766034796833992, + "learning_rate": 0.0009995862115717426, + "loss": 0.0263, + "num_input_tokens_seen": 29543728, + "step": 13690 + }, + { + "epoch": 2.234094616639478, + "grad_norm": 0.03681030124425888, + "learning_rate": 0.000999583311258537, + "loss": 0.4458, + "num_input_tokens_seen": 29554352, + "step": 13695 + }, + { + "epoch": 2.2349102773246328, + "grad_norm": 0.02508831024169922, + "learning_rate": 0.000999580400820642, + "loss": 0.0785, + "num_input_tokens_seen": 29565168, + "step": 13700 + }, + { + "epoch": 2.235725938009788, + "grad_norm": 0.14671660959720612, + "learning_rate": 0.0009995774802581165, + "loss": 0.1354, + "num_input_tokens_seen": 29575792, + "step": 13705 + }, + { + "epoch": 2.236541598694943, + "grad_norm": 0.034362368285655975, + "learning_rate": 0.0009995745495710194, + "loss": 0.0562, + "num_input_tokens_seen": 29586384, + "step": 13710 + }, + { + "epoch": 2.237357259380098, + "grad_norm": 0.006821879185736179, + "learning_rate": 0.0009995716087594104, + "loss": 0.0406, + "num_input_tokens_seen": 29597232, + "step": 13715 + }, + { + "epoch": 2.238172920065253, + "grad_norm": 0.13872727751731873, + "learning_rate": 0.000999568657823349, + "loss": 0.0895, + "num_input_tokens_seen": 29606896, + "step": 13720 + }, + { + "epoch": 2.2389885807504077, + "grad_norm": 0.03490706905722618, + "learning_rate": 0.000999565696762895, + "loss": 0.0586, + "num_input_tokens_seen": 29617648, + "step": 13725 + }, + { + "epoch": 2.239804241435563, + "grad_norm": 0.2900860011577606, + "learning_rate": 0.0009995627255781083, + "loss": 0.2632, + "num_input_tokens_seen": 29627664, + "step": 13730 + }, + { + "epoch": 2.240619902120718, + "grad_norm": 0.09676387161016464, + "learning_rate": 0.0009995597442690493, + "loss": 0.0877, + "num_input_tokens_seen": 29639088, + "step": 13735 + }, + { + "epoch": 2.2414355628058726, + "grad_norm": 0.19148506224155426, + "learning_rate": 0.0009995567528357785, + "loss": 0.0886, + "num_input_tokens_seen": 29649648, + "step": 13740 + }, + { + "epoch": 2.242251223491028, + "grad_norm": 0.15304531157016754, + "learning_rate": 0.0009995537512783562, + "loss": 0.1626, + "num_input_tokens_seen": 29661360, + "step": 13745 + }, + { + "epoch": 2.2430668841761827, + "grad_norm": 0.022043250501155853, + "learning_rate": 0.0009995507395968435, + "loss": 0.0606, + "num_input_tokens_seen": 29672336, + "step": 13750 + }, + { + "epoch": 2.2438825448613375, + "grad_norm": 0.3151552081108093, + "learning_rate": 0.0009995477177913014, + "loss": 0.1789, + "num_input_tokens_seen": 29683472, + "step": 13755 + }, + { + "epoch": 2.244698205546493, + "grad_norm": 0.10174253582954407, + "learning_rate": 0.0009995446858617908, + "loss": 0.0325, + "num_input_tokens_seen": 29695472, + "step": 13760 + }, + { + "epoch": 2.2455138662316476, + "grad_norm": 0.04973383620381355, + "learning_rate": 0.0009995416438083736, + "loss": 0.1871, + "num_input_tokens_seen": 29705520, + "step": 13765 + }, + { + "epoch": 2.2463295269168024, + "grad_norm": 0.0545341931283474, + "learning_rate": 0.0009995385916311112, + "loss": 0.0607, + "num_input_tokens_seen": 29715824, + "step": 13770 + }, + { + "epoch": 2.2471451876019577, + "grad_norm": 0.05339788645505905, + "learning_rate": 0.0009995355293300656, + "loss": 0.0675, + "num_input_tokens_seen": 29726864, + "step": 13775 + }, + { + "epoch": 2.2479608482871125, + "grad_norm": 0.007548432797193527, + "learning_rate": 0.0009995324569052988, + "loss": 0.0376, + "num_input_tokens_seen": 29738128, + "step": 13780 + }, + { + "epoch": 2.2487765089722673, + "grad_norm": 0.0271480493247509, + "learning_rate": 0.000999529374356873, + "loss": 0.1495, + "num_input_tokens_seen": 29747152, + "step": 13785 + }, + { + "epoch": 2.2495921696574226, + "grad_norm": 0.035655371844768524, + "learning_rate": 0.0009995262816848507, + "loss": 0.0895, + "num_input_tokens_seen": 29756912, + "step": 13790 + }, + { + "epoch": 2.2504078303425774, + "grad_norm": 0.19201703369617462, + "learning_rate": 0.0009995231788892949, + "loss": 0.2336, + "num_input_tokens_seen": 29766768, + "step": 13795 + }, + { + "epoch": 2.2512234910277327, + "grad_norm": 0.19227369129657745, + "learning_rate": 0.000999520065970268, + "loss": 0.1072, + "num_input_tokens_seen": 29778032, + "step": 13800 + }, + { + "epoch": 2.2520391517128875, + "grad_norm": 0.1643262505531311, + "learning_rate": 0.000999516942927833, + "loss": 0.1662, + "num_input_tokens_seen": 29787408, + "step": 13805 + }, + { + "epoch": 2.2528548123980423, + "grad_norm": 0.06243119016289711, + "learning_rate": 0.0009995138097620537, + "loss": 0.0576, + "num_input_tokens_seen": 29797232, + "step": 13810 + }, + { + "epoch": 2.2536704730831976, + "grad_norm": 0.04362228885293007, + "learning_rate": 0.0009995106664729934, + "loss": 0.1572, + "num_input_tokens_seen": 29808016, + "step": 13815 + }, + { + "epoch": 2.2544861337683524, + "grad_norm": 0.14224180579185486, + "learning_rate": 0.0009995075130607158, + "loss": 0.0907, + "num_input_tokens_seen": 29818640, + "step": 13820 + }, + { + "epoch": 2.255301794453507, + "grad_norm": 0.02562633343040943, + "learning_rate": 0.0009995043495252848, + "loss": 0.1662, + "num_input_tokens_seen": 29829776, + "step": 13825 + }, + { + "epoch": 2.2561174551386625, + "grad_norm": 0.011271154507994652, + "learning_rate": 0.0009995011758667644, + "loss": 0.0453, + "num_input_tokens_seen": 29840304, + "step": 13830 + }, + { + "epoch": 2.2569331158238173, + "grad_norm": 0.1104244738817215, + "learning_rate": 0.000999497992085219, + "loss": 0.2668, + "num_input_tokens_seen": 29852048, + "step": 13835 + }, + { + "epoch": 2.257748776508972, + "grad_norm": 0.17477242648601532, + "learning_rate": 0.0009994947981807132, + "loss": 0.139, + "num_input_tokens_seen": 29861264, + "step": 13840 + }, + { + "epoch": 2.2585644371941274, + "grad_norm": 0.015874188393354416, + "learning_rate": 0.0009994915941533115, + "loss": 0.0708, + "num_input_tokens_seen": 29871984, + "step": 13845 + }, + { + "epoch": 2.259380097879282, + "grad_norm": 0.10188250243663788, + "learning_rate": 0.0009994883800030791, + "loss": 0.1619, + "num_input_tokens_seen": 29882256, + "step": 13850 + }, + { + "epoch": 2.2601957585644374, + "grad_norm": 0.22156667709350586, + "learning_rate": 0.0009994851557300812, + "loss": 0.1545, + "num_input_tokens_seen": 29891696, + "step": 13855 + }, + { + "epoch": 2.2610114192495923, + "grad_norm": 0.1048242449760437, + "learning_rate": 0.000999481921334383, + "loss": 0.0939, + "num_input_tokens_seen": 29903856, + "step": 13860 + }, + { + "epoch": 2.261827079934747, + "grad_norm": 0.164866104722023, + "learning_rate": 0.0009994786768160496, + "loss": 0.124, + "num_input_tokens_seen": 29913520, + "step": 13865 + }, + { + "epoch": 2.262642740619902, + "grad_norm": 0.035744041204452515, + "learning_rate": 0.0009994754221751474, + "loss": 0.0282, + "num_input_tokens_seen": 29924016, + "step": 13870 + }, + { + "epoch": 2.263458401305057, + "grad_norm": 0.13268551230430603, + "learning_rate": 0.0009994721574117422, + "loss": 0.1795, + "num_input_tokens_seen": 29935120, + "step": 13875 + }, + { + "epoch": 2.264274061990212, + "grad_norm": 0.1933874785900116, + "learning_rate": 0.0009994688825259001, + "loss": 0.1537, + "num_input_tokens_seen": 29946288, + "step": 13880 + }, + { + "epoch": 2.2650897226753672, + "grad_norm": 0.14510099589824677, + "learning_rate": 0.0009994655975176874, + "loss": 0.0928, + "num_input_tokens_seen": 29957008, + "step": 13885 + }, + { + "epoch": 2.265905383360522, + "grad_norm": 0.03160623088479042, + "learning_rate": 0.0009994623023871709, + "loss": 0.0662, + "num_input_tokens_seen": 29966896, + "step": 13890 + }, + { + "epoch": 2.266721044045677, + "grad_norm": 0.2612110376358032, + "learning_rate": 0.000999458997134417, + "loss": 0.2403, + "num_input_tokens_seen": 29976528, + "step": 13895 + }, + { + "epoch": 2.267536704730832, + "grad_norm": 0.0315590500831604, + "learning_rate": 0.000999455681759493, + "loss": 0.0906, + "num_input_tokens_seen": 29987120, + "step": 13900 + }, + { + "epoch": 2.268352365415987, + "grad_norm": 0.03478406369686127, + "learning_rate": 0.0009994523562624662, + "loss": 0.0667, + "num_input_tokens_seen": 29997680, + "step": 13905 + }, + { + "epoch": 2.2691680261011418, + "grad_norm": 0.02003956027328968, + "learning_rate": 0.0009994490206434038, + "loss": 0.2138, + "num_input_tokens_seen": 30008784, + "step": 13910 + }, + { + "epoch": 2.269983686786297, + "grad_norm": 0.021669182926416397, + "learning_rate": 0.000999445674902373, + "loss": 0.1753, + "num_input_tokens_seen": 30018416, + "step": 13915 + }, + { + "epoch": 2.270799347471452, + "grad_norm": 0.18153920769691467, + "learning_rate": 0.0009994423190394423, + "loss": 0.078, + "num_input_tokens_seen": 30030352, + "step": 13920 + }, + { + "epoch": 2.2716150081566067, + "grad_norm": 0.05024305358529091, + "learning_rate": 0.0009994389530546795, + "loss": 0.0816, + "num_input_tokens_seen": 30041392, + "step": 13925 + }, + { + "epoch": 2.272430668841762, + "grad_norm": 0.04041077941656113, + "learning_rate": 0.0009994355769481524, + "loss": 0.033, + "num_input_tokens_seen": 30052240, + "step": 13930 + }, + { + "epoch": 2.2732463295269167, + "grad_norm": 0.03617650270462036, + "learning_rate": 0.00099943219071993, + "loss": 0.1935, + "num_input_tokens_seen": 30062192, + "step": 13935 + }, + { + "epoch": 2.274061990212072, + "grad_norm": 0.04580409452319145, + "learning_rate": 0.0009994287943700807, + "loss": 0.0835, + "num_input_tokens_seen": 30073808, + "step": 13940 + }, + { + "epoch": 2.274877650897227, + "grad_norm": 0.11890444159507751, + "learning_rate": 0.0009994253878986732, + "loss": 0.1514, + "num_input_tokens_seen": 30084016, + "step": 13945 + }, + { + "epoch": 2.2756933115823816, + "grad_norm": 0.14737261831760406, + "learning_rate": 0.0009994219713057768, + "loss": 0.0772, + "num_input_tokens_seen": 30095600, + "step": 13950 + }, + { + "epoch": 2.2765089722675365, + "grad_norm": 0.15552979707717896, + "learning_rate": 0.0009994185445914604, + "loss": 0.1039, + "num_input_tokens_seen": 30106800, + "step": 13955 + }, + { + "epoch": 2.2773246329526917, + "grad_norm": 0.042308300733566284, + "learning_rate": 0.000999415107755794, + "loss": 0.0301, + "num_input_tokens_seen": 30118288, + "step": 13960 + }, + { + "epoch": 2.2781402936378465, + "grad_norm": 0.03570830076932907, + "learning_rate": 0.0009994116607988464, + "loss": 0.0905, + "num_input_tokens_seen": 30129552, + "step": 13965 + }, + { + "epoch": 2.278955954323002, + "grad_norm": 0.05580104514956474, + "learning_rate": 0.0009994082037206881, + "loss": 0.1372, + "num_input_tokens_seen": 30140720, + "step": 13970 + }, + { + "epoch": 2.2797716150081566, + "grad_norm": 0.05943964421749115, + "learning_rate": 0.0009994047365213892, + "loss": 0.0892, + "num_input_tokens_seen": 30150640, + "step": 13975 + }, + { + "epoch": 2.2805872756933114, + "grad_norm": 0.039422646164894104, + "learning_rate": 0.0009994012592010196, + "loss": 0.0526, + "num_input_tokens_seen": 30160976, + "step": 13980 + }, + { + "epoch": 2.2814029363784667, + "grad_norm": 0.23809316754341125, + "learning_rate": 0.00099939777175965, + "loss": 0.3018, + "num_input_tokens_seen": 30170576, + "step": 13985 + }, + { + "epoch": 2.2822185970636215, + "grad_norm": 0.1039339005947113, + "learning_rate": 0.000999394274197351, + "loss": 0.0765, + "num_input_tokens_seen": 30181904, + "step": 13990 + }, + { + "epoch": 2.2830342577487763, + "grad_norm": 0.13251617550849915, + "learning_rate": 0.0009993907665141934, + "loss": 0.0801, + "num_input_tokens_seen": 30193392, + "step": 13995 + }, + { + "epoch": 2.2838499184339316, + "grad_norm": 0.02194301225244999, + "learning_rate": 0.0009993872487102486, + "loss": 0.0374, + "num_input_tokens_seen": 30204528, + "step": 14000 + }, + { + "epoch": 2.2846655791190864, + "grad_norm": 0.1534743309020996, + "learning_rate": 0.0009993837207855876, + "loss": 0.0511, + "num_input_tokens_seen": 30215760, + "step": 14005 + }, + { + "epoch": 2.2854812398042412, + "grad_norm": 0.009152466431260109, + "learning_rate": 0.000999380182740282, + "loss": 0.0336, + "num_input_tokens_seen": 30226096, + "step": 14010 + }, + { + "epoch": 2.2862969004893965, + "grad_norm": 0.017874700948596, + "learning_rate": 0.0009993766345744036, + "loss": 0.0804, + "num_input_tokens_seen": 30236784, + "step": 14015 + }, + { + "epoch": 2.2871125611745513, + "grad_norm": 0.0666658878326416, + "learning_rate": 0.000999373076288024, + "loss": 0.1244, + "num_input_tokens_seen": 30248432, + "step": 14020 + }, + { + "epoch": 2.2879282218597066, + "grad_norm": 0.07837460935115814, + "learning_rate": 0.0009993695078812156, + "loss": 0.1674, + "num_input_tokens_seen": 30258448, + "step": 14025 + }, + { + "epoch": 2.2887438825448614, + "grad_norm": 0.03440437465906143, + "learning_rate": 0.0009993659293540506, + "loss": 0.2099, + "num_input_tokens_seen": 30269424, + "step": 14030 + }, + { + "epoch": 2.289559543230016, + "grad_norm": 0.028172293677926064, + "learning_rate": 0.0009993623407066016, + "loss": 0.0592, + "num_input_tokens_seen": 30280144, + "step": 14035 + }, + { + "epoch": 2.2903752039151715, + "grad_norm": 0.01224245224148035, + "learning_rate": 0.0009993587419389412, + "loss": 0.0323, + "num_input_tokens_seen": 30290064, + "step": 14040 + }, + { + "epoch": 2.2911908646003263, + "grad_norm": 0.01701202616095543, + "learning_rate": 0.0009993551330511423, + "loss": 0.1569, + "num_input_tokens_seen": 30299984, + "step": 14045 + }, + { + "epoch": 2.292006525285481, + "grad_norm": 0.05539962649345398, + "learning_rate": 0.0009993515140432783, + "loss": 0.0415, + "num_input_tokens_seen": 30311248, + "step": 14050 + }, + { + "epoch": 2.2928221859706364, + "grad_norm": 0.041384872049093246, + "learning_rate": 0.0009993478849154224, + "loss": 0.1209, + "num_input_tokens_seen": 30322160, + "step": 14055 + }, + { + "epoch": 2.293637846655791, + "grad_norm": 0.023523643612861633, + "learning_rate": 0.0009993442456676482, + "loss": 0.0699, + "num_input_tokens_seen": 30333616, + "step": 14060 + }, + { + "epoch": 2.294453507340946, + "grad_norm": 0.16825833916664124, + "learning_rate": 0.0009993405963000294, + "loss": 0.0504, + "num_input_tokens_seen": 30344112, + "step": 14065 + }, + { + "epoch": 2.2952691680261013, + "grad_norm": 0.06740614771842957, + "learning_rate": 0.00099933693681264, + "loss": 0.0879, + "num_input_tokens_seen": 30354192, + "step": 14070 + }, + { + "epoch": 2.296084828711256, + "grad_norm": 0.17952290177345276, + "learning_rate": 0.000999333267205554, + "loss": 0.069, + "num_input_tokens_seen": 30365168, + "step": 14075 + }, + { + "epoch": 2.2969004893964113, + "grad_norm": 0.1948961317539215, + "learning_rate": 0.000999329587478846, + "loss": 0.1123, + "num_input_tokens_seen": 30376464, + "step": 14080 + }, + { + "epoch": 2.297716150081566, + "grad_norm": 0.09298911690711975, + "learning_rate": 0.0009993258976325903, + "loss": 0.0626, + "num_input_tokens_seen": 30387472, + "step": 14085 + }, + { + "epoch": 2.298531810766721, + "grad_norm": 0.27564337849617004, + "learning_rate": 0.0009993221976668618, + "loss": 0.2202, + "num_input_tokens_seen": 30398224, + "step": 14090 + }, + { + "epoch": 2.299347471451876, + "grad_norm": 0.29000547528266907, + "learning_rate": 0.0009993184875817357, + "loss": 0.2597, + "num_input_tokens_seen": 30409872, + "step": 14095 + }, + { + "epoch": 2.300163132137031, + "grad_norm": 0.19742950797080994, + "learning_rate": 0.0009993147673772868, + "loss": 0.1598, + "num_input_tokens_seen": 30420144, + "step": 14100 + }, + { + "epoch": 2.300978792822186, + "grad_norm": 0.16948331892490387, + "learning_rate": 0.000999311037053591, + "loss": 0.0882, + "num_input_tokens_seen": 30430704, + "step": 14105 + }, + { + "epoch": 2.301794453507341, + "grad_norm": 0.04386034607887268, + "learning_rate": 0.0009993072966107235, + "loss": 0.1001, + "num_input_tokens_seen": 30441328, + "step": 14110 + }, + { + "epoch": 2.302610114192496, + "grad_norm": 0.023586563766002655, + "learning_rate": 0.0009993035460487602, + "loss": 0.1113, + "num_input_tokens_seen": 30452208, + "step": 14115 + }, + { + "epoch": 2.3034257748776508, + "grad_norm": 0.10792867839336395, + "learning_rate": 0.0009992997853677773, + "loss": 0.0743, + "num_input_tokens_seen": 30462960, + "step": 14120 + }, + { + "epoch": 2.304241435562806, + "grad_norm": 0.08810919523239136, + "learning_rate": 0.0009992960145678506, + "loss": 0.2205, + "num_input_tokens_seen": 30473168, + "step": 14125 + }, + { + "epoch": 2.305057096247961, + "grad_norm": 0.19869904220104218, + "learning_rate": 0.0009992922336490568, + "loss": 0.1445, + "num_input_tokens_seen": 30483824, + "step": 14130 + }, + { + "epoch": 2.3058727569331157, + "grad_norm": 0.0569668747484684, + "learning_rate": 0.0009992884426114725, + "loss": 0.1957, + "num_input_tokens_seen": 30495632, + "step": 14135 + }, + { + "epoch": 2.306688417618271, + "grad_norm": 0.0614323690533638, + "learning_rate": 0.0009992846414551746, + "loss": 0.0607, + "num_input_tokens_seen": 30505808, + "step": 14140 + }, + { + "epoch": 2.3075040783034257, + "grad_norm": 0.2005668580532074, + "learning_rate": 0.00099928083018024, + "loss": 0.0989, + "num_input_tokens_seen": 30516368, + "step": 14145 + }, + { + "epoch": 2.3083197389885806, + "grad_norm": 0.03438250720500946, + "learning_rate": 0.000999277008786746, + "loss": 0.0854, + "num_input_tokens_seen": 30527856, + "step": 14150 + }, + { + "epoch": 2.309135399673736, + "grad_norm": 0.18564473092556, + "learning_rate": 0.0009992731772747701, + "loss": 0.3052, + "num_input_tokens_seen": 30537936, + "step": 14155 + }, + { + "epoch": 2.3099510603588906, + "grad_norm": 0.0878414511680603, + "learning_rate": 0.0009992693356443898, + "loss": 0.1055, + "num_input_tokens_seen": 30548304, + "step": 14160 + }, + { + "epoch": 2.310766721044046, + "grad_norm": 0.013666670769453049, + "learning_rate": 0.0009992654838956831, + "loss": 0.0265, + "num_input_tokens_seen": 30559728, + "step": 14165 + }, + { + "epoch": 2.3115823817292007, + "grad_norm": 0.04274529218673706, + "learning_rate": 0.000999261622028728, + "loss": 0.0693, + "num_input_tokens_seen": 30569840, + "step": 14170 + }, + { + "epoch": 2.3123980424143555, + "grad_norm": 0.08407062292098999, + "learning_rate": 0.0009992577500436027, + "loss": 0.1592, + "num_input_tokens_seen": 30580400, + "step": 14175 + }, + { + "epoch": 2.3132137030995104, + "grad_norm": 0.3202744126319885, + "learning_rate": 0.0009992538679403857, + "loss": 0.1667, + "num_input_tokens_seen": 30594544, + "step": 14180 + }, + { + "epoch": 2.3140293637846656, + "grad_norm": 0.04484544321894646, + "learning_rate": 0.0009992499757191559, + "loss": 0.059, + "num_input_tokens_seen": 30604912, + "step": 14185 + }, + { + "epoch": 2.3148450244698204, + "grad_norm": 0.057070109993219376, + "learning_rate": 0.000999246073379992, + "loss": 0.0721, + "num_input_tokens_seen": 30616400, + "step": 14190 + }, + { + "epoch": 2.3156606851549757, + "grad_norm": 0.08638685196638107, + "learning_rate": 0.0009992421609229729, + "loss": 0.1036, + "num_input_tokens_seen": 30627824, + "step": 14195 + }, + { + "epoch": 2.3164763458401305, + "grad_norm": 0.0068628303706645966, + "learning_rate": 0.0009992382383481782, + "loss": 0.0753, + "num_input_tokens_seen": 30638320, + "step": 14200 + }, + { + "epoch": 2.3172920065252853, + "grad_norm": 0.028828194364905357, + "learning_rate": 0.0009992343056556873, + "loss": 0.1986, + "num_input_tokens_seen": 30649744, + "step": 14205 + }, + { + "epoch": 2.3181076672104406, + "grad_norm": 0.10007745027542114, + "learning_rate": 0.0009992303628455796, + "loss": 0.2755, + "num_input_tokens_seen": 30661136, + "step": 14210 + }, + { + "epoch": 2.3189233278955954, + "grad_norm": 0.03869021683931351, + "learning_rate": 0.0009992264099179355, + "loss": 0.0724, + "num_input_tokens_seen": 30672368, + "step": 14215 + }, + { + "epoch": 2.3197389885807502, + "grad_norm": 0.15511001646518707, + "learning_rate": 0.000999222446872835, + "loss": 0.1614, + "num_input_tokens_seen": 30684784, + "step": 14220 + }, + { + "epoch": 2.3205546492659055, + "grad_norm": 0.08031713217496872, + "learning_rate": 0.0009992184737103583, + "loss": 0.0795, + "num_input_tokens_seen": 30696432, + "step": 14225 + }, + { + "epoch": 2.3213703099510603, + "grad_norm": 0.05332498252391815, + "learning_rate": 0.0009992144904305857, + "loss": 0.0838, + "num_input_tokens_seen": 30706256, + "step": 14230 + }, + { + "epoch": 2.322185970636215, + "grad_norm": 0.1857920140028, + "learning_rate": 0.0009992104970335982, + "loss": 0.0992, + "num_input_tokens_seen": 30716976, + "step": 14235 + }, + { + "epoch": 2.3230016313213704, + "grad_norm": 0.16593743860721588, + "learning_rate": 0.0009992064935194767, + "loss": 0.1141, + "num_input_tokens_seen": 30726768, + "step": 14240 + }, + { + "epoch": 2.323817292006525, + "grad_norm": 0.024933604523539543, + "learning_rate": 0.0009992024798883025, + "loss": 0.1633, + "num_input_tokens_seen": 30737072, + "step": 14245 + }, + { + "epoch": 2.3246329526916805, + "grad_norm": 0.2120092362165451, + "learning_rate": 0.0009991984561401566, + "loss": 0.1211, + "num_input_tokens_seen": 30748400, + "step": 14250 + }, + { + "epoch": 2.3254486133768353, + "grad_norm": 0.09216459840536118, + "learning_rate": 0.0009991944222751208, + "loss": 0.0991, + "num_input_tokens_seen": 30760240, + "step": 14255 + }, + { + "epoch": 2.32626427406199, + "grad_norm": 0.028513865545392036, + "learning_rate": 0.0009991903782932765, + "loss": 0.101, + "num_input_tokens_seen": 30770928, + "step": 14260 + }, + { + "epoch": 2.3270799347471454, + "grad_norm": 0.13767805695533752, + "learning_rate": 0.0009991863241947062, + "loss": 0.1064, + "num_input_tokens_seen": 30780240, + "step": 14265 + }, + { + "epoch": 2.3278955954323, + "grad_norm": 0.011141417548060417, + "learning_rate": 0.0009991822599794916, + "loss": 0.0743, + "num_input_tokens_seen": 30792144, + "step": 14270 + }, + { + "epoch": 2.328711256117455, + "grad_norm": 0.13007600605487823, + "learning_rate": 0.0009991781856477156, + "loss": 0.1677, + "num_input_tokens_seen": 30802416, + "step": 14275 + }, + { + "epoch": 2.3295269168026103, + "grad_norm": 0.0643002986907959, + "learning_rate": 0.00099917410119946, + "loss": 0.1096, + "num_input_tokens_seen": 30813872, + "step": 14280 + }, + { + "epoch": 2.330342577487765, + "grad_norm": 0.013769935816526413, + "learning_rate": 0.0009991700066348081, + "loss": 0.0428, + "num_input_tokens_seen": 30825584, + "step": 14285 + }, + { + "epoch": 2.33115823817292, + "grad_norm": 0.16517998278141022, + "learning_rate": 0.000999165901953843, + "loss": 0.0747, + "num_input_tokens_seen": 30836688, + "step": 14290 + }, + { + "epoch": 2.331973898858075, + "grad_norm": 0.010283554904162884, + "learning_rate": 0.0009991617871566473, + "loss": 0.0654, + "num_input_tokens_seen": 30848816, + "step": 14295 + }, + { + "epoch": 2.33278955954323, + "grad_norm": 0.13715708255767822, + "learning_rate": 0.000999157662243305, + "loss": 0.0924, + "num_input_tokens_seen": 30858608, + "step": 14300 + }, + { + "epoch": 2.3336052202283852, + "grad_norm": 0.010564406402409077, + "learning_rate": 0.0009991535272138995, + "loss": 0.0859, + "num_input_tokens_seen": 30868560, + "step": 14305 + }, + { + "epoch": 2.33442088091354, + "grad_norm": 0.020471880212426186, + "learning_rate": 0.0009991493820685142, + "loss": 0.0476, + "num_input_tokens_seen": 30878192, + "step": 14310 + }, + { + "epoch": 2.335236541598695, + "grad_norm": 0.026267528533935547, + "learning_rate": 0.000999145226807234, + "loss": 0.0753, + "num_input_tokens_seen": 30888432, + "step": 14315 + }, + { + "epoch": 2.3360522022838497, + "grad_norm": 0.018368130549788475, + "learning_rate": 0.000999141061430142, + "loss": 0.1214, + "num_input_tokens_seen": 30898864, + "step": 14320 + }, + { + "epoch": 2.336867862969005, + "grad_norm": 0.12905187904834747, + "learning_rate": 0.0009991368859373236, + "loss": 0.166, + "num_input_tokens_seen": 30909424, + "step": 14325 + }, + { + "epoch": 2.3376835236541598, + "grad_norm": 0.17623016238212585, + "learning_rate": 0.0009991327003288626, + "loss": 0.1826, + "num_input_tokens_seen": 30919600, + "step": 14330 + }, + { + "epoch": 2.338499184339315, + "grad_norm": 0.009566441178321838, + "learning_rate": 0.0009991285046048446, + "loss": 0.0404, + "num_input_tokens_seen": 30930416, + "step": 14335 + }, + { + "epoch": 2.33931484502447, + "grad_norm": 0.0062688495963811874, + "learning_rate": 0.0009991242987653541, + "loss": 0.0338, + "num_input_tokens_seen": 30940560, + "step": 14340 + }, + { + "epoch": 2.3401305057096247, + "grad_norm": 0.05438132584095001, + "learning_rate": 0.0009991200828104766, + "loss": 0.1324, + "num_input_tokens_seen": 30951440, + "step": 14345 + }, + { + "epoch": 2.34094616639478, + "grad_norm": 0.0036873987410217524, + "learning_rate": 0.0009991158567402973, + "loss": 0.1556, + "num_input_tokens_seen": 30961936, + "step": 14350 + }, + { + "epoch": 2.3417618270799347, + "grad_norm": 0.14076970517635345, + "learning_rate": 0.0009991116205549022, + "loss": 0.1195, + "num_input_tokens_seen": 30972112, + "step": 14355 + }, + { + "epoch": 2.3425774877650896, + "grad_norm": 0.09590610861778259, + "learning_rate": 0.0009991073742543768, + "loss": 0.1444, + "num_input_tokens_seen": 30984272, + "step": 14360 + }, + { + "epoch": 2.343393148450245, + "grad_norm": 0.01931748539209366, + "learning_rate": 0.0009991031178388072, + "loss": 0.2474, + "num_input_tokens_seen": 30996368, + "step": 14365 + }, + { + "epoch": 2.3442088091353996, + "grad_norm": 0.08409635722637177, + "learning_rate": 0.0009990988513082799, + "loss": 0.1785, + "num_input_tokens_seen": 31007216, + "step": 14370 + }, + { + "epoch": 2.3450244698205545, + "grad_norm": 0.04656350612640381, + "learning_rate": 0.0009990945746628812, + "loss": 0.0604, + "num_input_tokens_seen": 31018416, + "step": 14375 + }, + { + "epoch": 2.3458401305057097, + "grad_norm": 0.008510172367095947, + "learning_rate": 0.0009990902879026978, + "loss": 0.0757, + "num_input_tokens_seen": 31030416, + "step": 14380 + }, + { + "epoch": 2.3466557911908645, + "grad_norm": 0.09007712453603745, + "learning_rate": 0.0009990859910278167, + "loss": 0.068, + "num_input_tokens_seen": 31040752, + "step": 14385 + }, + { + "epoch": 2.34747145187602, + "grad_norm": 0.03487147390842438, + "learning_rate": 0.0009990816840383247, + "loss": 0.0915, + "num_input_tokens_seen": 31052048, + "step": 14390 + }, + { + "epoch": 2.3482871125611746, + "grad_norm": 0.32675087451934814, + "learning_rate": 0.0009990773669343092, + "loss": 0.2063, + "num_input_tokens_seen": 31062320, + "step": 14395 + }, + { + "epoch": 2.3491027732463294, + "grad_norm": 0.03751207888126373, + "learning_rate": 0.0009990730397158578, + "loss": 0.1264, + "num_input_tokens_seen": 31071856, + "step": 14400 + }, + { + "epoch": 2.3499184339314847, + "grad_norm": 0.009249582886695862, + "learning_rate": 0.0009990687023830583, + "loss": 0.0337, + "num_input_tokens_seen": 31082096, + "step": 14405 + }, + { + "epoch": 2.3507340946166395, + "grad_norm": 0.10462295264005661, + "learning_rate": 0.0009990643549359982, + "loss": 0.1448, + "num_input_tokens_seen": 31092400, + "step": 14410 + }, + { + "epoch": 2.3515497553017943, + "grad_norm": 0.012947984971106052, + "learning_rate": 0.0009990599973747657, + "loss": 0.1204, + "num_input_tokens_seen": 31102768, + "step": 14415 + }, + { + "epoch": 2.3523654159869496, + "grad_norm": 0.16959279775619507, + "learning_rate": 0.0009990556296994497, + "loss": 0.0726, + "num_input_tokens_seen": 31113936, + "step": 14420 + }, + { + "epoch": 2.3531810766721044, + "grad_norm": 0.008147962391376495, + "learning_rate": 0.000999051251910138, + "loss": 0.1144, + "num_input_tokens_seen": 31123920, + "step": 14425 + }, + { + "epoch": 2.3539967373572592, + "grad_norm": 0.11115682870149612, + "learning_rate": 0.0009990468640069196, + "loss": 0.1458, + "num_input_tokens_seen": 31135472, + "step": 14430 + }, + { + "epoch": 2.3548123980424145, + "grad_norm": 0.007962728850543499, + "learning_rate": 0.0009990424659898833, + "loss": 0.1333, + "num_input_tokens_seen": 31145552, + "step": 14435 + }, + { + "epoch": 2.3556280587275693, + "grad_norm": 0.01810404099524021, + "learning_rate": 0.0009990380578591186, + "loss": 0.0339, + "num_input_tokens_seen": 31157680, + "step": 14440 + }, + { + "epoch": 2.356443719412724, + "grad_norm": 0.0555172823369503, + "learning_rate": 0.0009990336396147144, + "loss": 0.1041, + "num_input_tokens_seen": 31168720, + "step": 14445 + }, + { + "epoch": 2.3572593800978794, + "grad_norm": 0.23831014335155487, + "learning_rate": 0.0009990292112567606, + "loss": 0.1187, + "num_input_tokens_seen": 31178832, + "step": 14450 + }, + { + "epoch": 2.358075040783034, + "grad_norm": 0.2068205028772354, + "learning_rate": 0.0009990247727853466, + "loss": 0.1141, + "num_input_tokens_seen": 31190288, + "step": 14455 + }, + { + "epoch": 2.358890701468189, + "grad_norm": 0.13767342269420624, + "learning_rate": 0.0009990203242005626, + "loss": 0.0396, + "num_input_tokens_seen": 31202128, + "step": 14460 + }, + { + "epoch": 2.3597063621533443, + "grad_norm": 0.02237624116241932, + "learning_rate": 0.0009990158655024985, + "loss": 0.0711, + "num_input_tokens_seen": 31213232, + "step": 14465 + }, + { + "epoch": 2.360522022838499, + "grad_norm": 0.16623124480247498, + "learning_rate": 0.0009990113966912451, + "loss": 0.0789, + "num_input_tokens_seen": 31224272, + "step": 14470 + }, + { + "epoch": 2.3613376835236544, + "grad_norm": 0.0216312687844038, + "learning_rate": 0.0009990069177668926, + "loss": 0.1006, + "num_input_tokens_seen": 31234064, + "step": 14475 + }, + { + "epoch": 2.362153344208809, + "grad_norm": 0.0067304689437150955, + "learning_rate": 0.0009990024287295318, + "loss": 0.1823, + "num_input_tokens_seen": 31245776, + "step": 14480 + }, + { + "epoch": 2.362969004893964, + "grad_norm": 0.005319551564753056, + "learning_rate": 0.000998997929579254, + "loss": 0.0429, + "num_input_tokens_seen": 31257392, + "step": 14485 + }, + { + "epoch": 2.3637846655791193, + "grad_norm": 0.15284186601638794, + "learning_rate": 0.0009989934203161498, + "loss": 0.1055, + "num_input_tokens_seen": 31267408, + "step": 14490 + }, + { + "epoch": 2.364600326264274, + "grad_norm": 0.01890537329018116, + "learning_rate": 0.0009989889009403112, + "loss": 0.1914, + "num_input_tokens_seen": 31278960, + "step": 14495 + }, + { + "epoch": 2.365415986949429, + "grad_norm": 0.18082453310489655, + "learning_rate": 0.0009989843714518294, + "loss": 0.2481, + "num_input_tokens_seen": 31289264, + "step": 14500 + }, + { + "epoch": 2.366231647634584, + "grad_norm": 0.2374914586544037, + "learning_rate": 0.0009989798318507962, + "loss": 0.1355, + "num_input_tokens_seen": 31299152, + "step": 14505 + }, + { + "epoch": 2.367047308319739, + "grad_norm": 0.025896022096276283, + "learning_rate": 0.0009989752821373038, + "loss": 0.1099, + "num_input_tokens_seen": 31308944, + "step": 14510 + }, + { + "epoch": 2.367862969004894, + "grad_norm": 0.0031363663729280233, + "learning_rate": 0.0009989707223114444, + "loss": 0.0615, + "num_input_tokens_seen": 31319824, + "step": 14515 + }, + { + "epoch": 2.368678629690049, + "grad_norm": 0.07448268681764603, + "learning_rate": 0.0009989661523733102, + "loss": 0.1134, + "num_input_tokens_seen": 31332016, + "step": 14520 + }, + { + "epoch": 2.369494290375204, + "grad_norm": 0.14439260959625244, + "learning_rate": 0.000998961572322994, + "loss": 0.3225, + "num_input_tokens_seen": 31342512, + "step": 14525 + }, + { + "epoch": 2.370309951060359, + "grad_norm": 0.0310188177973032, + "learning_rate": 0.0009989569821605886, + "loss": 0.0399, + "num_input_tokens_seen": 31352528, + "step": 14530 + }, + { + "epoch": 2.371125611745514, + "grad_norm": 0.07678642868995667, + "learning_rate": 0.0009989523818861867, + "loss": 0.0586, + "num_input_tokens_seen": 31361520, + "step": 14535 + }, + { + "epoch": 2.3719412724306688, + "grad_norm": 0.04294511303305626, + "learning_rate": 0.0009989477714998822, + "loss": 0.099, + "num_input_tokens_seen": 31372304, + "step": 14540 + }, + { + "epoch": 2.3727569331158236, + "grad_norm": 0.08723995834589005, + "learning_rate": 0.000998943151001768, + "loss": 0.1508, + "num_input_tokens_seen": 31382960, + "step": 14545 + }, + { + "epoch": 2.373572593800979, + "grad_norm": 0.18996146321296692, + "learning_rate": 0.0009989385203919379, + "loss": 0.2638, + "num_input_tokens_seen": 31394128, + "step": 14550 + }, + { + "epoch": 2.3743882544861337, + "grad_norm": 0.12724678218364716, + "learning_rate": 0.0009989338796704856, + "loss": 0.0648, + "num_input_tokens_seen": 31405072, + "step": 14555 + }, + { + "epoch": 2.375203915171289, + "grad_norm": 0.0339246429502964, + "learning_rate": 0.0009989292288375053, + "loss": 0.0499, + "num_input_tokens_seen": 31415280, + "step": 14560 + }, + { + "epoch": 2.3760195758564437, + "grad_norm": 0.026740889996290207, + "learning_rate": 0.0009989245678930915, + "loss": 0.0407, + "num_input_tokens_seen": 31426384, + "step": 14565 + }, + { + "epoch": 2.3768352365415986, + "grad_norm": 0.02933860570192337, + "learning_rate": 0.0009989198968373381, + "loss": 0.0504, + "num_input_tokens_seen": 31436304, + "step": 14570 + }, + { + "epoch": 2.377650897226754, + "grad_norm": 0.015854576602578163, + "learning_rate": 0.0009989152156703403, + "loss": 0.1029, + "num_input_tokens_seen": 31448240, + "step": 14575 + }, + { + "epoch": 2.3784665579119086, + "grad_norm": 0.014037691988050938, + "learning_rate": 0.0009989105243921926, + "loss": 0.1483, + "num_input_tokens_seen": 31458928, + "step": 14580 + }, + { + "epoch": 2.3792822185970635, + "grad_norm": 0.09939184039831161, + "learning_rate": 0.0009989058230029904, + "loss": 0.0568, + "num_input_tokens_seen": 31470480, + "step": 14585 + }, + { + "epoch": 2.3800978792822187, + "grad_norm": 0.012454574927687645, + "learning_rate": 0.0009989011115028286, + "loss": 0.126, + "num_input_tokens_seen": 31481968, + "step": 14590 + }, + { + "epoch": 2.3809135399673735, + "grad_norm": 0.14813783764839172, + "learning_rate": 0.0009988963898918029, + "loss": 0.244, + "num_input_tokens_seen": 31493360, + "step": 14595 + }, + { + "epoch": 2.3817292006525284, + "grad_norm": 0.07208909839391708, + "learning_rate": 0.000998891658170009, + "loss": 0.1141, + "num_input_tokens_seen": 31505552, + "step": 14600 + }, + { + "epoch": 2.3825448613376836, + "grad_norm": 0.008604118600487709, + "learning_rate": 0.0009988869163375428, + "loss": 0.0831, + "num_input_tokens_seen": 31516336, + "step": 14605 + }, + { + "epoch": 2.3833605220228384, + "grad_norm": 0.00505094276741147, + "learning_rate": 0.0009988821643945002, + "loss": 0.0688, + "num_input_tokens_seen": 31527152, + "step": 14610 + }, + { + "epoch": 2.3841761827079937, + "grad_norm": 0.008136876858770847, + "learning_rate": 0.0009988774023409776, + "loss": 0.045, + "num_input_tokens_seen": 31536400, + "step": 14615 + }, + { + "epoch": 2.3849918433931485, + "grad_norm": 0.13164255023002625, + "learning_rate": 0.0009988726301770718, + "loss": 0.051, + "num_input_tokens_seen": 31547056, + "step": 14620 + }, + { + "epoch": 2.3858075040783033, + "grad_norm": 0.005653201602399349, + "learning_rate": 0.0009988678479028793, + "loss": 0.108, + "num_input_tokens_seen": 31557840, + "step": 14625 + }, + { + "epoch": 2.3866231647634586, + "grad_norm": 0.01573813520371914, + "learning_rate": 0.000998863055518497, + "loss": 0.0288, + "num_input_tokens_seen": 31569424, + "step": 14630 + }, + { + "epoch": 2.3874388254486134, + "grad_norm": 0.02346787601709366, + "learning_rate": 0.0009988582530240217, + "loss": 0.1866, + "num_input_tokens_seen": 31579504, + "step": 14635 + }, + { + "epoch": 2.3882544861337682, + "grad_norm": 0.05880860239267349, + "learning_rate": 0.0009988534404195516, + "loss": 0.21, + "num_input_tokens_seen": 31591120, + "step": 14640 + }, + { + "epoch": 2.3890701468189235, + "grad_norm": 0.031170688569545746, + "learning_rate": 0.000998848617705183, + "loss": 0.046, + "num_input_tokens_seen": 31601968, + "step": 14645 + }, + { + "epoch": 2.3898858075040783, + "grad_norm": 0.016977254301309586, + "learning_rate": 0.000998843784881015, + "loss": 0.1818, + "num_input_tokens_seen": 31612944, + "step": 14650 + }, + { + "epoch": 2.390701468189233, + "grad_norm": 0.25859296321868896, + "learning_rate": 0.0009988389419471446, + "loss": 0.0543, + "num_input_tokens_seen": 31623472, + "step": 14655 + }, + { + "epoch": 2.3915171288743884, + "grad_norm": 0.037718791514635086, + "learning_rate": 0.0009988340889036701, + "loss": 0.1973, + "num_input_tokens_seen": 31635248, + "step": 14660 + }, + { + "epoch": 2.392332789559543, + "grad_norm": 0.1221359372138977, + "learning_rate": 0.0009988292257506902, + "loss": 0.1237, + "num_input_tokens_seen": 31646320, + "step": 14665 + }, + { + "epoch": 2.393148450244698, + "grad_norm": 0.01673707738518715, + "learning_rate": 0.000998824352488303, + "loss": 0.0645, + "num_input_tokens_seen": 31658608, + "step": 14670 + }, + { + "epoch": 2.3939641109298533, + "grad_norm": 0.0323331318795681, + "learning_rate": 0.0009988194691166077, + "loss": 0.1024, + "num_input_tokens_seen": 31669680, + "step": 14675 + }, + { + "epoch": 2.394779771615008, + "grad_norm": 0.10637158900499344, + "learning_rate": 0.000998814575635703, + "loss": 0.252, + "num_input_tokens_seen": 31679696, + "step": 14680 + }, + { + "epoch": 2.395595432300163, + "grad_norm": 0.3138655126094818, + "learning_rate": 0.000998809672045688, + "loss": 0.1547, + "num_input_tokens_seen": 31691472, + "step": 14685 + }, + { + "epoch": 2.396411092985318, + "grad_norm": 0.05599002540111542, + "learning_rate": 0.0009988047583466622, + "loss": 0.1576, + "num_input_tokens_seen": 31701968, + "step": 14690 + }, + { + "epoch": 2.397226753670473, + "grad_norm": 0.02759976126253605, + "learning_rate": 0.0009987998345387255, + "loss": 0.1322, + "num_input_tokens_seen": 31713104, + "step": 14695 + }, + { + "epoch": 2.3980424143556283, + "grad_norm": 0.011101621203124523, + "learning_rate": 0.000998794900621977, + "loss": 0.1133, + "num_input_tokens_seen": 31724560, + "step": 14700 + }, + { + "epoch": 2.398858075040783, + "grad_norm": 0.017178161069750786, + "learning_rate": 0.0009987899565965172, + "loss": 0.16, + "num_input_tokens_seen": 31733776, + "step": 14705 + }, + { + "epoch": 2.399673735725938, + "grad_norm": 0.02037607878446579, + "learning_rate": 0.0009987850024624463, + "loss": 0.1189, + "num_input_tokens_seen": 31745680, + "step": 14710 + }, + { + "epoch": 2.400489396411093, + "grad_norm": 0.15602557361125946, + "learning_rate": 0.0009987800382198647, + "loss": 0.1416, + "num_input_tokens_seen": 31756240, + "step": 14715 + }, + { + "epoch": 2.401305057096248, + "grad_norm": 0.03485483676195145, + "learning_rate": 0.0009987750638688726, + "loss": 0.0579, + "num_input_tokens_seen": 31767056, + "step": 14720 + }, + { + "epoch": 2.402120717781403, + "grad_norm": 0.027752429246902466, + "learning_rate": 0.000998770079409571, + "loss": 0.0465, + "num_input_tokens_seen": 31777968, + "step": 14725 + }, + { + "epoch": 2.402936378466558, + "grad_norm": 0.012985512614250183, + "learning_rate": 0.0009987650848420613, + "loss": 0.1215, + "num_input_tokens_seen": 31788976, + "step": 14730 + }, + { + "epoch": 2.403752039151713, + "grad_norm": 0.03232298791408539, + "learning_rate": 0.0009987600801664442, + "loss": 0.1092, + "num_input_tokens_seen": 31799280, + "step": 14735 + }, + { + "epoch": 2.4045676998368677, + "grad_norm": 0.05259961634874344, + "learning_rate": 0.0009987550653828214, + "loss": 0.1126, + "num_input_tokens_seen": 31810288, + "step": 14740 + }, + { + "epoch": 2.405383360522023, + "grad_norm": 0.08376846462488174, + "learning_rate": 0.0009987500404912946, + "loss": 0.0746, + "num_input_tokens_seen": 31820880, + "step": 14745 + }, + { + "epoch": 2.4061990212071778, + "grad_norm": 0.01526689063757658, + "learning_rate": 0.0009987450054919655, + "loss": 0.0498, + "num_input_tokens_seen": 31830704, + "step": 14750 + }, + { + "epoch": 2.407014681892333, + "grad_norm": 0.030786165967583656, + "learning_rate": 0.000998739960384936, + "loss": 0.0864, + "num_input_tokens_seen": 31842608, + "step": 14755 + }, + { + "epoch": 2.407830342577488, + "grad_norm": 0.10097739100456238, + "learning_rate": 0.0009987349051703088, + "loss": 0.0804, + "num_input_tokens_seen": 31853456, + "step": 14760 + }, + { + "epoch": 2.4086460032626427, + "grad_norm": 1.4012517929077148, + "learning_rate": 0.0009987298398481859, + "loss": 0.3416, + "num_input_tokens_seen": 31864240, + "step": 14765 + }, + { + "epoch": 2.4094616639477975, + "grad_norm": 0.07046882808208466, + "learning_rate": 0.00099872476441867, + "loss": 0.1514, + "num_input_tokens_seen": 31874896, + "step": 14770 + }, + { + "epoch": 2.4102773246329527, + "grad_norm": 0.08250351250171661, + "learning_rate": 0.0009987196788818643, + "loss": 0.1104, + "num_input_tokens_seen": 31887120, + "step": 14775 + }, + { + "epoch": 2.4110929853181076, + "grad_norm": 0.19960038363933563, + "learning_rate": 0.0009987145832378713, + "loss": 0.1777, + "num_input_tokens_seen": 31899216, + "step": 14780 + }, + { + "epoch": 2.411908646003263, + "grad_norm": 0.16811136901378632, + "learning_rate": 0.0009987094774867949, + "loss": 0.0708, + "num_input_tokens_seen": 31911248, + "step": 14785 + }, + { + "epoch": 2.4127243066884176, + "grad_norm": 0.09051735699176788, + "learning_rate": 0.000998704361628738, + "loss": 0.0702, + "num_input_tokens_seen": 31921264, + "step": 14790 + }, + { + "epoch": 2.4135399673735725, + "grad_norm": 0.20876678824424744, + "learning_rate": 0.000998699235663805, + "loss": 0.087, + "num_input_tokens_seen": 31932208, + "step": 14795 + }, + { + "epoch": 2.4143556280587277, + "grad_norm": 0.17740465700626373, + "learning_rate": 0.000998694099592099, + "loss": 0.2159, + "num_input_tokens_seen": 31943792, + "step": 14800 + }, + { + "epoch": 2.4151712887438825, + "grad_norm": 0.008857152424752712, + "learning_rate": 0.0009986889534137245, + "loss": 0.245, + "num_input_tokens_seen": 31954128, + "step": 14805 + }, + { + "epoch": 2.4159869494290374, + "grad_norm": 0.0843997374176979, + "learning_rate": 0.0009986837971287857, + "loss": 0.0783, + "num_input_tokens_seen": 31964176, + "step": 14810 + }, + { + "epoch": 2.4168026101141926, + "grad_norm": 0.06013961136341095, + "learning_rate": 0.0009986786307373873, + "loss": 0.0844, + "num_input_tokens_seen": 31975312, + "step": 14815 + }, + { + "epoch": 2.4176182707993474, + "grad_norm": 0.05406338721513748, + "learning_rate": 0.0009986734542396336, + "loss": 0.0419, + "num_input_tokens_seen": 31985520, + "step": 14820 + }, + { + "epoch": 2.4184339314845023, + "grad_norm": 0.009938972070813179, + "learning_rate": 0.0009986682676356299, + "loss": 0.0178, + "num_input_tokens_seen": 31996656, + "step": 14825 + }, + { + "epoch": 2.4192495921696575, + "grad_norm": 0.13016776740550995, + "learning_rate": 0.000998663070925481, + "loss": 0.0885, + "num_input_tokens_seen": 32005680, + "step": 14830 + }, + { + "epoch": 2.4200652528548123, + "grad_norm": 0.011630987748503685, + "learning_rate": 0.0009986578641092924, + "loss": 0.1475, + "num_input_tokens_seen": 32017232, + "step": 14835 + }, + { + "epoch": 2.4208809135399676, + "grad_norm": 0.014288578182458878, + "learning_rate": 0.0009986526471871698, + "loss": 0.1037, + "num_input_tokens_seen": 32028112, + "step": 14840 + }, + { + "epoch": 2.4216965742251224, + "grad_norm": 0.12494445592164993, + "learning_rate": 0.0009986474201592187, + "loss": 0.1492, + "num_input_tokens_seen": 32038608, + "step": 14845 + }, + { + "epoch": 2.4225122349102772, + "grad_norm": 0.10034672170877457, + "learning_rate": 0.0009986421830255447, + "loss": 0.1448, + "num_input_tokens_seen": 32048880, + "step": 14850 + }, + { + "epoch": 2.4233278955954325, + "grad_norm": 0.15734760463237762, + "learning_rate": 0.0009986369357862545, + "loss": 0.1766, + "num_input_tokens_seen": 32058864, + "step": 14855 + }, + { + "epoch": 2.4241435562805873, + "grad_norm": 0.030377594754099846, + "learning_rate": 0.0009986316784414543, + "loss": 0.0779, + "num_input_tokens_seen": 32068816, + "step": 14860 + }, + { + "epoch": 2.424959216965742, + "grad_norm": 0.19264714419841766, + "learning_rate": 0.0009986264109912507, + "loss": 0.1322, + "num_input_tokens_seen": 32080848, + "step": 14865 + }, + { + "epoch": 2.4257748776508974, + "grad_norm": 0.05988854169845581, + "learning_rate": 0.00099862113343575, + "loss": 0.112, + "num_input_tokens_seen": 32091824, + "step": 14870 + }, + { + "epoch": 2.426590538336052, + "grad_norm": 0.18630032241344452, + "learning_rate": 0.0009986158457750596, + "loss": 0.1125, + "num_input_tokens_seen": 32102448, + "step": 14875 + }, + { + "epoch": 2.427406199021207, + "grad_norm": 0.10120212286710739, + "learning_rate": 0.0009986105480092866, + "loss": 0.0527, + "num_input_tokens_seen": 32114000, + "step": 14880 + }, + { + "epoch": 2.4282218597063623, + "grad_norm": 0.005008499603718519, + "learning_rate": 0.0009986052401385385, + "loss": 0.0462, + "num_input_tokens_seen": 32126192, + "step": 14885 + }, + { + "epoch": 2.429037520391517, + "grad_norm": 0.15468277037143707, + "learning_rate": 0.0009985999221629224, + "loss": 0.0391, + "num_input_tokens_seen": 32136880, + "step": 14890 + }, + { + "epoch": 2.429853181076672, + "grad_norm": 0.031266950070858, + "learning_rate": 0.0009985945940825464, + "loss": 0.1579, + "num_input_tokens_seen": 32147408, + "step": 14895 + }, + { + "epoch": 2.430668841761827, + "grad_norm": 0.09388963133096695, + "learning_rate": 0.0009985892558975185, + "loss": 0.3103, + "num_input_tokens_seen": 32158416, + "step": 14900 + }, + { + "epoch": 2.431484502446982, + "grad_norm": 0.11383213102817535, + "learning_rate": 0.0009985839076079469, + "loss": 0.2237, + "num_input_tokens_seen": 32169264, + "step": 14905 + }, + { + "epoch": 2.432300163132137, + "grad_norm": 0.022066829726099968, + "learning_rate": 0.0009985785492139397, + "loss": 0.0989, + "num_input_tokens_seen": 32181872, + "step": 14910 + }, + { + "epoch": 2.433115823817292, + "grad_norm": 0.0214407816529274, + "learning_rate": 0.0009985731807156057, + "loss": 0.2539, + "num_input_tokens_seen": 32192816, + "step": 14915 + }, + { + "epoch": 2.433931484502447, + "grad_norm": 0.044516611844301224, + "learning_rate": 0.0009985678021130538, + "loss": 0.0399, + "num_input_tokens_seen": 32203952, + "step": 14920 + }, + { + "epoch": 2.434747145187602, + "grad_norm": 0.033372409641742706, + "learning_rate": 0.000998562413406393, + "loss": 0.0897, + "num_input_tokens_seen": 32215088, + "step": 14925 + }, + { + "epoch": 2.435562805872757, + "grad_norm": 0.020450297743082047, + "learning_rate": 0.0009985570145957324, + "loss": 0.0515, + "num_input_tokens_seen": 32226256, + "step": 14930 + }, + { + "epoch": 2.436378466557912, + "grad_norm": 0.10483633726835251, + "learning_rate": 0.0009985516056811815, + "loss": 0.1071, + "num_input_tokens_seen": 32238096, + "step": 14935 + }, + { + "epoch": 2.437194127243067, + "grad_norm": 0.03492344915866852, + "learning_rate": 0.0009985461866628496, + "loss": 0.1094, + "num_input_tokens_seen": 32247440, + "step": 14940 + }, + { + "epoch": 2.438009787928222, + "grad_norm": 0.024806993082165718, + "learning_rate": 0.000998540757540847, + "loss": 0.0525, + "num_input_tokens_seen": 32257488, + "step": 14945 + }, + { + "epoch": 2.4388254486133767, + "grad_norm": 0.03686249628663063, + "learning_rate": 0.0009985353183152835, + "loss": 0.05, + "num_input_tokens_seen": 32268816, + "step": 14950 + }, + { + "epoch": 2.439641109298532, + "grad_norm": 0.11800311505794525, + "learning_rate": 0.0009985298689862692, + "loss": 0.1154, + "num_input_tokens_seen": 32280048, + "step": 14955 + }, + { + "epoch": 2.4404567699836868, + "grad_norm": 0.040155164897441864, + "learning_rate": 0.0009985244095539149, + "loss": 0.0583, + "num_input_tokens_seen": 32289680, + "step": 14960 + }, + { + "epoch": 2.4412724306688416, + "grad_norm": 0.036987077444791794, + "learning_rate": 0.0009985189400183306, + "loss": 0.0268, + "num_input_tokens_seen": 32300656, + "step": 14965 + }, + { + "epoch": 2.442088091353997, + "grad_norm": 0.2764950394630432, + "learning_rate": 0.0009985134603796278, + "loss": 0.1361, + "num_input_tokens_seen": 32311888, + "step": 14970 + }, + { + "epoch": 2.4429037520391517, + "grad_norm": 0.1540442705154419, + "learning_rate": 0.0009985079706379175, + "loss": 0.2034, + "num_input_tokens_seen": 32323248, + "step": 14975 + }, + { + "epoch": 2.443719412724307, + "grad_norm": 0.013691945001482964, + "learning_rate": 0.0009985024707933107, + "loss": 0.0606, + "num_input_tokens_seen": 32333808, + "step": 14980 + }, + { + "epoch": 2.4445350734094617, + "grad_norm": 0.24718362092971802, + "learning_rate": 0.0009984969608459186, + "loss": 0.2727, + "num_input_tokens_seen": 32345648, + "step": 14985 + }, + { + "epoch": 2.4453507340946166, + "grad_norm": 0.05974946543574333, + "learning_rate": 0.0009984914407958536, + "loss": 0.0773, + "num_input_tokens_seen": 32356272, + "step": 14990 + }, + { + "epoch": 2.4461663947797714, + "grad_norm": 0.027013089507818222, + "learning_rate": 0.000998485910643227, + "loss": 0.2367, + "num_input_tokens_seen": 32366800, + "step": 14995 + }, + { + "epoch": 2.4469820554649266, + "grad_norm": 0.03589235991239548, + "learning_rate": 0.000998480370388151, + "loss": 0.1641, + "num_input_tokens_seen": 32377072, + "step": 15000 + }, + { + "epoch": 2.4477977161500815, + "grad_norm": 0.08772020041942596, + "learning_rate": 0.000998474820030738, + "loss": 0.204, + "num_input_tokens_seen": 32386640, + "step": 15005 + }, + { + "epoch": 2.4486133768352367, + "grad_norm": 0.0963241383433342, + "learning_rate": 0.0009984692595711004, + "loss": 0.0789, + "num_input_tokens_seen": 32396496, + "step": 15010 + }, + { + "epoch": 2.4494290375203915, + "grad_norm": 0.0835120677947998, + "learning_rate": 0.0009984636890093509, + "loss": 0.104, + "num_input_tokens_seen": 32407824, + "step": 15015 + }, + { + "epoch": 2.4502446982055464, + "grad_norm": 0.03772015497088432, + "learning_rate": 0.0009984581083456023, + "loss": 0.1258, + "num_input_tokens_seen": 32417968, + "step": 15020 + }, + { + "epoch": 2.4510603588907016, + "grad_norm": 0.020211776718497276, + "learning_rate": 0.000998452517579968, + "loss": 0.1922, + "num_input_tokens_seen": 32428240, + "step": 15025 + }, + { + "epoch": 2.4518760195758564, + "grad_norm": 0.12109000980854034, + "learning_rate": 0.000998446916712561, + "loss": 0.0912, + "num_input_tokens_seen": 32438704, + "step": 15030 + }, + { + "epoch": 2.4526916802610113, + "grad_norm": 0.10055312514305115, + "learning_rate": 0.0009984413057434948, + "loss": 0.1498, + "num_input_tokens_seen": 32449776, + "step": 15035 + }, + { + "epoch": 2.4535073409461665, + "grad_norm": 0.06574121862649918, + "learning_rate": 0.0009984356846728835, + "loss": 0.0493, + "num_input_tokens_seen": 32460240, + "step": 15040 + }, + { + "epoch": 2.4543230016313213, + "grad_norm": 0.009783199056982994, + "learning_rate": 0.0009984300535008405, + "loss": 0.0443, + "num_input_tokens_seen": 32469808, + "step": 15045 + }, + { + "epoch": 2.455138662316476, + "grad_norm": 0.22190611064434052, + "learning_rate": 0.0009984244122274802, + "loss": 0.151, + "num_input_tokens_seen": 32481232, + "step": 15050 + }, + { + "epoch": 2.4559543230016314, + "grad_norm": 0.10256678611040115, + "learning_rate": 0.000998418760852917, + "loss": 0.1523, + "num_input_tokens_seen": 32492272, + "step": 15055 + }, + { + "epoch": 2.4567699836867862, + "grad_norm": 0.01308070681989193, + "learning_rate": 0.0009984130993772652, + "loss": 0.1173, + "num_input_tokens_seen": 32503312, + "step": 15060 + }, + { + "epoch": 2.4575856443719415, + "grad_norm": 0.13927842676639557, + "learning_rate": 0.0009984074278006397, + "loss": 0.097, + "num_input_tokens_seen": 32513424, + "step": 15065 + }, + { + "epoch": 2.4584013050570963, + "grad_norm": 0.17522437870502472, + "learning_rate": 0.0009984017461231553, + "loss": 0.1441, + "num_input_tokens_seen": 32525264, + "step": 15070 + }, + { + "epoch": 2.459216965742251, + "grad_norm": 0.09254247695207596, + "learning_rate": 0.0009983960543449276, + "loss": 0.1655, + "num_input_tokens_seen": 32536592, + "step": 15075 + }, + { + "epoch": 2.4600326264274064, + "grad_norm": 0.0227352287620306, + "learning_rate": 0.0009983903524660711, + "loss": 0.0696, + "num_input_tokens_seen": 32547344, + "step": 15080 + }, + { + "epoch": 2.460848287112561, + "grad_norm": 0.07951617985963821, + "learning_rate": 0.0009983846404867022, + "loss": 0.0536, + "num_input_tokens_seen": 32558800, + "step": 15085 + }, + { + "epoch": 2.461663947797716, + "grad_norm": 0.22675608098506927, + "learning_rate": 0.0009983789184069363, + "loss": 0.0955, + "num_input_tokens_seen": 32569392, + "step": 15090 + }, + { + "epoch": 2.4624796084828713, + "grad_norm": 0.011685811914503574, + "learning_rate": 0.0009983731862268893, + "loss": 0.0496, + "num_input_tokens_seen": 32579632, + "step": 15095 + }, + { + "epoch": 2.463295269168026, + "grad_norm": 0.03270738199353218, + "learning_rate": 0.0009983674439466774, + "loss": 0.1854, + "num_input_tokens_seen": 32590128, + "step": 15100 + }, + { + "epoch": 2.464110929853181, + "grad_norm": 0.04386333003640175, + "learning_rate": 0.000998361691566417, + "loss": 0.0405, + "num_input_tokens_seen": 32600656, + "step": 15105 + }, + { + "epoch": 2.464926590538336, + "grad_norm": 0.06154201179742813, + "learning_rate": 0.0009983559290862247, + "loss": 0.0468, + "num_input_tokens_seen": 32610928, + "step": 15110 + }, + { + "epoch": 2.465742251223491, + "grad_norm": 0.0069643487222492695, + "learning_rate": 0.0009983501565062173, + "loss": 0.0827, + "num_input_tokens_seen": 32621488, + "step": 15115 + }, + { + "epoch": 2.466557911908646, + "grad_norm": 0.021469606086611748, + "learning_rate": 0.000998344373826512, + "loss": 0.061, + "num_input_tokens_seen": 32633712, + "step": 15120 + }, + { + "epoch": 2.467373572593801, + "grad_norm": 0.0062162489630281925, + "learning_rate": 0.0009983385810472256, + "loss": 0.0726, + "num_input_tokens_seen": 32644560, + "step": 15125 + }, + { + "epoch": 2.468189233278956, + "grad_norm": 0.09205188602209091, + "learning_rate": 0.0009983327781684756, + "loss": 0.152, + "num_input_tokens_seen": 32655888, + "step": 15130 + }, + { + "epoch": 2.4690048939641107, + "grad_norm": 0.02069658599793911, + "learning_rate": 0.0009983269651903798, + "loss": 0.1446, + "num_input_tokens_seen": 32666832, + "step": 15135 + }, + { + "epoch": 2.469820554649266, + "grad_norm": 0.0121985524892807, + "learning_rate": 0.0009983211421130558, + "loss": 0.0343, + "num_input_tokens_seen": 32677616, + "step": 15140 + }, + { + "epoch": 2.470636215334421, + "grad_norm": 0.11200583726167679, + "learning_rate": 0.0009983153089366218, + "loss": 0.1029, + "num_input_tokens_seen": 32687696, + "step": 15145 + }, + { + "epoch": 2.471451876019576, + "grad_norm": 0.2712540626525879, + "learning_rate": 0.0009983094656611958, + "loss": 0.1921, + "num_input_tokens_seen": 32698096, + "step": 15150 + }, + { + "epoch": 2.472267536704731, + "grad_norm": 0.04694950953125954, + "learning_rate": 0.0009983036122868962, + "loss": 0.0917, + "num_input_tokens_seen": 32707920, + "step": 15155 + }, + { + "epoch": 2.4730831973898857, + "grad_norm": 0.20331218838691711, + "learning_rate": 0.000998297748813842, + "loss": 0.2123, + "num_input_tokens_seen": 32719472, + "step": 15160 + }, + { + "epoch": 2.473898858075041, + "grad_norm": 0.3649890422821045, + "learning_rate": 0.0009982918752421516, + "loss": 0.1766, + "num_input_tokens_seen": 32730064, + "step": 15165 + }, + { + "epoch": 2.4747145187601958, + "grad_norm": 0.03769911825656891, + "learning_rate": 0.0009982859915719444, + "loss": 0.1882, + "num_input_tokens_seen": 32740944, + "step": 15170 + }, + { + "epoch": 2.4755301794453506, + "grad_norm": 0.028579184785485268, + "learning_rate": 0.0009982800978033395, + "loss": 0.09, + "num_input_tokens_seen": 32751856, + "step": 15175 + }, + { + "epoch": 2.476345840130506, + "grad_norm": 0.08331523090600967, + "learning_rate": 0.000998274193936456, + "loss": 0.1942, + "num_input_tokens_seen": 32762928, + "step": 15180 + }, + { + "epoch": 2.4771615008156607, + "grad_norm": 0.12021384388208389, + "learning_rate": 0.000998268279971414, + "loss": 0.0961, + "num_input_tokens_seen": 32773840, + "step": 15185 + }, + { + "epoch": 2.4779771615008155, + "grad_norm": 0.1657324731349945, + "learning_rate": 0.0009982623559083332, + "loss": 0.201, + "num_input_tokens_seen": 32786064, + "step": 15190 + }, + { + "epoch": 2.4787928221859707, + "grad_norm": 0.09939758479595184, + "learning_rate": 0.0009982564217473338, + "loss": 0.1506, + "num_input_tokens_seen": 32796816, + "step": 15195 + }, + { + "epoch": 2.4796084828711256, + "grad_norm": 0.07574938982725143, + "learning_rate": 0.000998250477488536, + "loss": 0.1728, + "num_input_tokens_seen": 32807088, + "step": 15200 + }, + { + "epoch": 2.480424143556281, + "grad_norm": 0.23787730932235718, + "learning_rate": 0.0009982445231320597, + "loss": 0.2258, + "num_input_tokens_seen": 32817424, + "step": 15205 + }, + { + "epoch": 2.4812398042414356, + "grad_norm": 0.12397035956382751, + "learning_rate": 0.0009982385586780264, + "loss": 0.1624, + "num_input_tokens_seen": 32827984, + "step": 15210 + }, + { + "epoch": 2.4820554649265905, + "grad_norm": 0.04578749090433121, + "learning_rate": 0.0009982325841265567, + "loss": 0.1631, + "num_input_tokens_seen": 32839664, + "step": 15215 + }, + { + "epoch": 2.4828711256117453, + "grad_norm": 0.13960406184196472, + "learning_rate": 0.0009982265994777717, + "loss": 0.1748, + "num_input_tokens_seen": 32851280, + "step": 15220 + }, + { + "epoch": 2.4836867862969005, + "grad_norm": 0.02447548694908619, + "learning_rate": 0.0009982206047317926, + "loss": 0.1363, + "num_input_tokens_seen": 32862224, + "step": 15225 + }, + { + "epoch": 2.4845024469820554, + "grad_norm": 0.014738969504833221, + "learning_rate": 0.0009982145998887406, + "loss": 0.0353, + "num_input_tokens_seen": 32872816, + "step": 15230 + }, + { + "epoch": 2.4853181076672106, + "grad_norm": 0.06700402498245239, + "learning_rate": 0.000998208584948738, + "loss": 0.1377, + "num_input_tokens_seen": 32882896, + "step": 15235 + }, + { + "epoch": 2.4861337683523654, + "grad_norm": 0.08216087520122528, + "learning_rate": 0.0009982025599119062, + "loss": 0.1206, + "num_input_tokens_seen": 32894256, + "step": 15240 + }, + { + "epoch": 2.4869494290375203, + "grad_norm": 0.04997613653540611, + "learning_rate": 0.0009981965247783677, + "loss": 0.1127, + "num_input_tokens_seen": 32905392, + "step": 15245 + }, + { + "epoch": 2.4877650897226755, + "grad_norm": 0.02395535819232464, + "learning_rate": 0.0009981904795482446, + "loss": 0.1053, + "num_input_tokens_seen": 32916592, + "step": 15250 + }, + { + "epoch": 2.4885807504078303, + "grad_norm": 0.014164482243359089, + "learning_rate": 0.0009981844242216594, + "loss": 0.0371, + "num_input_tokens_seen": 32927216, + "step": 15255 + }, + { + "epoch": 2.489396411092985, + "grad_norm": 0.09248672425746918, + "learning_rate": 0.0009981783587987348, + "loss": 0.0736, + "num_input_tokens_seen": 32937648, + "step": 15260 + }, + { + "epoch": 2.4902120717781404, + "grad_norm": 0.031160561367869377, + "learning_rate": 0.0009981722832795937, + "loss": 0.1924, + "num_input_tokens_seen": 32948496, + "step": 15265 + }, + { + "epoch": 2.4910277324632952, + "grad_norm": 0.05899035558104515, + "learning_rate": 0.0009981661976643595, + "loss": 0.0832, + "num_input_tokens_seen": 32958832, + "step": 15270 + }, + { + "epoch": 2.49184339314845, + "grad_norm": 0.1237209290266037, + "learning_rate": 0.0009981601019531552, + "loss": 0.1017, + "num_input_tokens_seen": 32969456, + "step": 15275 + }, + { + "epoch": 2.4926590538336053, + "grad_norm": 0.011852693744003773, + "learning_rate": 0.0009981539961461045, + "loss": 0.0715, + "num_input_tokens_seen": 32980496, + "step": 15280 + }, + { + "epoch": 2.49347471451876, + "grad_norm": 0.041877441108226776, + "learning_rate": 0.000998147880243331, + "loss": 0.2096, + "num_input_tokens_seen": 32991888, + "step": 15285 + }, + { + "epoch": 2.4942903752039154, + "grad_norm": 0.16636720299720764, + "learning_rate": 0.000998141754244959, + "loss": 0.0662, + "num_input_tokens_seen": 33002672, + "step": 15290 + }, + { + "epoch": 2.49510603588907, + "grad_norm": 0.12959960103034973, + "learning_rate": 0.0009981356181511124, + "loss": 0.0905, + "num_input_tokens_seen": 33012720, + "step": 15295 + }, + { + "epoch": 2.495921696574225, + "grad_norm": 0.017951570451259613, + "learning_rate": 0.0009981294719619152, + "loss": 0.0758, + "num_input_tokens_seen": 33023760, + "step": 15300 + }, + { + "epoch": 2.4967373572593803, + "grad_norm": 0.22545279562473297, + "learning_rate": 0.0009981233156774927, + "loss": 0.1982, + "num_input_tokens_seen": 33034192, + "step": 15305 + }, + { + "epoch": 2.497553017944535, + "grad_norm": 0.3039394021034241, + "learning_rate": 0.0009981171492979691, + "loss": 0.2329, + "num_input_tokens_seen": 33044080, + "step": 15310 + }, + { + "epoch": 2.49836867862969, + "grad_norm": 0.06327145546674728, + "learning_rate": 0.0009981109728234698, + "loss": 0.1041, + "num_input_tokens_seen": 33055184, + "step": 15315 + }, + { + "epoch": 2.499184339314845, + "grad_norm": 0.16796927154064178, + "learning_rate": 0.0009981047862541194, + "loss": 0.1107, + "num_input_tokens_seen": 33065712, + "step": 15320 + }, + { + "epoch": 2.5, + "grad_norm": 0.14921368658542633, + "learning_rate": 0.0009980985895900439, + "loss": 0.0673, + "num_input_tokens_seen": 33075856, + "step": 15325 + }, + { + "epoch": 2.500815660685155, + "grad_norm": 0.014145435765385628, + "learning_rate": 0.0009980923828313685, + "loss": 0.0368, + "num_input_tokens_seen": 33087312, + "step": 15330 + }, + { + "epoch": 2.50163132137031, + "grad_norm": 0.009606493636965752, + "learning_rate": 0.000998086165978219, + "loss": 0.153, + "num_input_tokens_seen": 33098960, + "step": 15335 + }, + { + "epoch": 2.502446982055465, + "grad_norm": 0.014807668514549732, + "learning_rate": 0.0009980799390307215, + "loss": 0.1356, + "num_input_tokens_seen": 33109712, + "step": 15340 + }, + { + "epoch": 2.50326264274062, + "grad_norm": 0.17672359943389893, + "learning_rate": 0.0009980737019890024, + "loss": 0.1941, + "num_input_tokens_seen": 33121008, + "step": 15345 + }, + { + "epoch": 2.504078303425775, + "grad_norm": 0.017882846295833588, + "learning_rate": 0.0009980674548531877, + "loss": 0.1054, + "num_input_tokens_seen": 33133360, + "step": 15350 + }, + { + "epoch": 2.50489396411093, + "grad_norm": 0.1421702355146408, + "learning_rate": 0.0009980611976234041, + "loss": 0.1029, + "num_input_tokens_seen": 33144688, + "step": 15355 + }, + { + "epoch": 2.5057096247960846, + "grad_norm": 0.03871380537748337, + "learning_rate": 0.0009980549302997788, + "loss": 0.0777, + "num_input_tokens_seen": 33155024, + "step": 15360 + }, + { + "epoch": 2.50652528548124, + "grad_norm": 0.040098994970321655, + "learning_rate": 0.000998048652882438, + "loss": 0.1903, + "num_input_tokens_seen": 33166576, + "step": 15365 + }, + { + "epoch": 2.5073409461663947, + "grad_norm": 0.03982541710138321, + "learning_rate": 0.00099804236537151, + "loss": 0.1248, + "num_input_tokens_seen": 33176720, + "step": 15370 + }, + { + "epoch": 2.50815660685155, + "grad_norm": 0.14701324701309204, + "learning_rate": 0.0009980360677671214, + "loss": 0.1261, + "num_input_tokens_seen": 33187824, + "step": 15375 + }, + { + "epoch": 2.5089722675367048, + "grad_norm": 0.05496620759367943, + "learning_rate": 0.0009980297600694, + "loss": 0.0673, + "num_input_tokens_seen": 33198800, + "step": 15380 + }, + { + "epoch": 2.5097879282218596, + "grad_norm": 0.07654522359371185, + "learning_rate": 0.0009980234422784738, + "loss": 0.1344, + "num_input_tokens_seen": 33209232, + "step": 15385 + }, + { + "epoch": 2.5106035889070144, + "grad_norm": 0.14955522119998932, + "learning_rate": 0.0009980171143944708, + "loss": 0.2313, + "num_input_tokens_seen": 33219760, + "step": 15390 + }, + { + "epoch": 2.5114192495921697, + "grad_norm": 0.03198719769716263, + "learning_rate": 0.000998010776417519, + "loss": 0.0774, + "num_input_tokens_seen": 33229040, + "step": 15395 + }, + { + "epoch": 2.5122349102773245, + "grad_norm": 0.016548406332731247, + "learning_rate": 0.0009980044283477473, + "loss": 0.0671, + "num_input_tokens_seen": 33238960, + "step": 15400 + }, + { + "epoch": 2.5130505709624797, + "grad_norm": 0.13226677477359772, + "learning_rate": 0.000997998070185284, + "loss": 0.2025, + "num_input_tokens_seen": 33247920, + "step": 15405 + }, + { + "epoch": 2.5138662316476346, + "grad_norm": 0.0312221460044384, + "learning_rate": 0.000997991701930258, + "loss": 0.1193, + "num_input_tokens_seen": 33259504, + "step": 15410 + }, + { + "epoch": 2.5146818923327894, + "grad_norm": 0.018016701564192772, + "learning_rate": 0.0009979853235827984, + "loss": 0.0878, + "num_input_tokens_seen": 33269872, + "step": 15415 + }, + { + "epoch": 2.5154975530179446, + "grad_norm": 0.11532194167375565, + "learning_rate": 0.0009979789351430347, + "loss": 0.0902, + "num_input_tokens_seen": 33282192, + "step": 15420 + }, + { + "epoch": 2.5163132137030995, + "grad_norm": 0.06278378516435623, + "learning_rate": 0.0009979725366110958, + "loss": 0.0594, + "num_input_tokens_seen": 33293648, + "step": 15425 + }, + { + "epoch": 2.5171288743882547, + "grad_norm": 0.032258644700050354, + "learning_rate": 0.0009979661279871119, + "loss": 0.2658, + "num_input_tokens_seen": 33304752, + "step": 15430 + }, + { + "epoch": 2.5179445350734095, + "grad_norm": 0.15767447650432587, + "learning_rate": 0.0009979597092712128, + "loss": 0.1004, + "num_input_tokens_seen": 33315216, + "step": 15435 + }, + { + "epoch": 2.5187601957585644, + "grad_norm": 0.16893155872821808, + "learning_rate": 0.0009979532804635283, + "loss": 0.2091, + "num_input_tokens_seen": 33325392, + "step": 15440 + }, + { + "epoch": 2.519575856443719, + "grad_norm": 0.01622322015464306, + "learning_rate": 0.000997946841564189, + "loss": 0.1281, + "num_input_tokens_seen": 33336656, + "step": 15445 + }, + { + "epoch": 2.5203915171288744, + "grad_norm": 0.09399479627609253, + "learning_rate": 0.0009979403925733253, + "loss": 0.0991, + "num_input_tokens_seen": 33347856, + "step": 15450 + }, + { + "epoch": 2.5212071778140293, + "grad_norm": 0.056593138724565506, + "learning_rate": 0.0009979339334910678, + "loss": 0.1486, + "num_input_tokens_seen": 33358640, + "step": 15455 + }, + { + "epoch": 2.5220228384991845, + "grad_norm": 0.03446366637945175, + "learning_rate": 0.0009979274643175473, + "loss": 0.0843, + "num_input_tokens_seen": 33369200, + "step": 15460 + }, + { + "epoch": 2.5228384991843393, + "grad_norm": 0.04967934265732765, + "learning_rate": 0.0009979209850528954, + "loss": 0.1338, + "num_input_tokens_seen": 33378320, + "step": 15465 + }, + { + "epoch": 2.523654159869494, + "grad_norm": 0.013403919525444508, + "learning_rate": 0.0009979144956972427, + "loss": 0.1339, + "num_input_tokens_seen": 33388976, + "step": 15470 + }, + { + "epoch": 2.5244698205546494, + "grad_norm": 0.03906738758087158, + "learning_rate": 0.0009979079962507214, + "loss": 0.0999, + "num_input_tokens_seen": 33399664, + "step": 15475 + }, + { + "epoch": 2.5252854812398042, + "grad_norm": 0.10049603134393692, + "learning_rate": 0.0009979014867134628, + "loss": 0.1564, + "num_input_tokens_seen": 33409616, + "step": 15480 + }, + { + "epoch": 2.5261011419249595, + "grad_norm": 0.0630389004945755, + "learning_rate": 0.000997894967085599, + "loss": 0.0324, + "num_input_tokens_seen": 33420624, + "step": 15485 + }, + { + "epoch": 2.5269168026101143, + "grad_norm": 0.12915821373462677, + "learning_rate": 0.000997888437367262, + "loss": 0.0829, + "num_input_tokens_seen": 33430960, + "step": 15490 + }, + { + "epoch": 2.527732463295269, + "grad_norm": 0.036744870245456696, + "learning_rate": 0.0009978818975585843, + "loss": 0.0479, + "num_input_tokens_seen": 33443344, + "step": 15495 + }, + { + "epoch": 2.528548123980424, + "grad_norm": 0.035208433866500854, + "learning_rate": 0.0009978753476596982, + "loss": 0.0984, + "num_input_tokens_seen": 33454032, + "step": 15500 + }, + { + "epoch": 2.529363784665579, + "grad_norm": 0.17320133745670319, + "learning_rate": 0.0009978687876707366, + "loss": 0.1051, + "num_input_tokens_seen": 33465360, + "step": 15505 + }, + { + "epoch": 2.530179445350734, + "grad_norm": 0.042222894728183746, + "learning_rate": 0.0009978622175918323, + "loss": 0.0718, + "num_input_tokens_seen": 33476336, + "step": 15510 + }, + { + "epoch": 2.5309951060358893, + "grad_norm": 0.005529859568923712, + "learning_rate": 0.0009978556374231188, + "loss": 0.0206, + "num_input_tokens_seen": 33486736, + "step": 15515 + }, + { + "epoch": 2.531810766721044, + "grad_norm": 0.028392836451530457, + "learning_rate": 0.0009978490471647292, + "loss": 0.0347, + "num_input_tokens_seen": 33497680, + "step": 15520 + }, + { + "epoch": 2.532626427406199, + "grad_norm": 0.16974955797195435, + "learning_rate": 0.000997842446816797, + "loss": 0.0985, + "num_input_tokens_seen": 33509456, + "step": 15525 + }, + { + "epoch": 2.5334420880913537, + "grad_norm": 0.22244645655155182, + "learning_rate": 0.0009978358363794562, + "loss": 0.1196, + "num_input_tokens_seen": 33521040, + "step": 15530 + }, + { + "epoch": 2.534257748776509, + "grad_norm": 0.14352953433990479, + "learning_rate": 0.0009978292158528406, + "loss": 0.1087, + "num_input_tokens_seen": 33531472, + "step": 15535 + }, + { + "epoch": 2.535073409461664, + "grad_norm": 0.032876551151275635, + "learning_rate": 0.0009978225852370843, + "loss": 0.0805, + "num_input_tokens_seen": 33541552, + "step": 15540 + }, + { + "epoch": 2.535889070146819, + "grad_norm": 0.01684001460671425, + "learning_rate": 0.000997815944532322, + "loss": 0.0154, + "num_input_tokens_seen": 33551920, + "step": 15545 + }, + { + "epoch": 2.536704730831974, + "grad_norm": 0.05973837152123451, + "learning_rate": 0.0009978092937386878, + "loss": 0.2252, + "num_input_tokens_seen": 33562960, + "step": 15550 + }, + { + "epoch": 2.5375203915171287, + "grad_norm": 0.016618214547634125, + "learning_rate": 0.0009978026328563167, + "loss": 0.1091, + "num_input_tokens_seen": 33572848, + "step": 15555 + }, + { + "epoch": 2.538336052202284, + "grad_norm": 0.18274018168449402, + "learning_rate": 0.0009977959618853438, + "loss": 0.1991, + "num_input_tokens_seen": 33582224, + "step": 15560 + }, + { + "epoch": 2.539151712887439, + "grad_norm": 0.07621336728334427, + "learning_rate": 0.0009977892808259044, + "loss": 0.0303, + "num_input_tokens_seen": 33593680, + "step": 15565 + }, + { + "epoch": 2.539967373572594, + "grad_norm": 0.05442234128713608, + "learning_rate": 0.0009977825896781336, + "loss": 0.1567, + "num_input_tokens_seen": 33605616, + "step": 15570 + }, + { + "epoch": 2.540783034257749, + "grad_norm": 0.2087874561548233, + "learning_rate": 0.0009977758884421673, + "loss": 0.1778, + "num_input_tokens_seen": 33616752, + "step": 15575 + }, + { + "epoch": 2.5415986949429037, + "grad_norm": 0.0077378093264997005, + "learning_rate": 0.000997769177118141, + "loss": 0.1353, + "num_input_tokens_seen": 33628048, + "step": 15580 + }, + { + "epoch": 2.5424143556280585, + "grad_norm": 0.04020821675658226, + "learning_rate": 0.0009977624557061908, + "loss": 0.0298, + "num_input_tokens_seen": 33639280, + "step": 15585 + }, + { + "epoch": 2.5432300163132138, + "grad_norm": 0.034978773444890976, + "learning_rate": 0.000997755724206453, + "loss": 0.055, + "num_input_tokens_seen": 33649808, + "step": 15590 + }, + { + "epoch": 2.5440456769983686, + "grad_norm": 0.013452628627419472, + "learning_rate": 0.0009977489826190641, + "loss": 0.28, + "num_input_tokens_seen": 33658992, + "step": 15595 + }, + { + "epoch": 2.544861337683524, + "grad_norm": 0.009423590265214443, + "learning_rate": 0.0009977422309441605, + "loss": 0.0339, + "num_input_tokens_seen": 33669904, + "step": 15600 + }, + { + "epoch": 2.5456769983686787, + "grad_norm": 0.011391459964215755, + "learning_rate": 0.0009977354691818794, + "loss": 0.1147, + "num_input_tokens_seen": 33680080, + "step": 15605 + }, + { + "epoch": 2.5464926590538335, + "grad_norm": 0.008289741352200508, + "learning_rate": 0.0009977286973323575, + "loss": 0.1155, + "num_input_tokens_seen": 33692208, + "step": 15610 + }, + { + "epoch": 2.5473083197389887, + "grad_norm": 0.05895299091935158, + "learning_rate": 0.000997721915395732, + "loss": 0.2202, + "num_input_tokens_seen": 33703088, + "step": 15615 + }, + { + "epoch": 2.5481239804241436, + "grad_norm": 0.013220601715147495, + "learning_rate": 0.0009977151233721406, + "loss": 0.1662, + "num_input_tokens_seen": 33712944, + "step": 15620 + }, + { + "epoch": 2.5489396411092984, + "grad_norm": 0.009266866371035576, + "learning_rate": 0.0009977083212617207, + "loss": 0.1772, + "num_input_tokens_seen": 33723408, + "step": 15625 + }, + { + "epoch": 2.5497553017944536, + "grad_norm": 0.09896119683980942, + "learning_rate": 0.0009977015090646105, + "loss": 0.1255, + "num_input_tokens_seen": 33733488, + "step": 15630 + }, + { + "epoch": 2.5505709624796085, + "grad_norm": 0.022420763969421387, + "learning_rate": 0.0009976946867809476, + "loss": 0.1533, + "num_input_tokens_seen": 33742640, + "step": 15635 + }, + { + "epoch": 2.5513866231647633, + "grad_norm": 0.026887355372309685, + "learning_rate": 0.0009976878544108705, + "loss": 0.2023, + "num_input_tokens_seen": 33753584, + "step": 15640 + }, + { + "epoch": 2.5522022838499185, + "grad_norm": 0.03986315429210663, + "learning_rate": 0.000997681011954518, + "loss": 0.0527, + "num_input_tokens_seen": 33764656, + "step": 15645 + }, + { + "epoch": 2.5530179445350734, + "grad_norm": 0.04462524130940437, + "learning_rate": 0.0009976741594120281, + "loss": 0.1128, + "num_input_tokens_seen": 33775632, + "step": 15650 + }, + { + "epoch": 2.5538336052202286, + "grad_norm": 0.00447855144739151, + "learning_rate": 0.00099766729678354, + "loss": 0.0923, + "num_input_tokens_seen": 33787536, + "step": 15655 + }, + { + "epoch": 2.5546492659053834, + "grad_norm": 0.14613580703735352, + "learning_rate": 0.0009976604240691932, + "loss": 0.1505, + "num_input_tokens_seen": 33798736, + "step": 15660 + }, + { + "epoch": 2.5554649265905383, + "grad_norm": 0.01982048712670803, + "learning_rate": 0.0009976535412691261, + "loss": 0.0826, + "num_input_tokens_seen": 33810256, + "step": 15665 + }, + { + "epoch": 2.556280587275693, + "grad_norm": 0.08966480195522308, + "learning_rate": 0.0009976466483834789, + "loss": 0.1452, + "num_input_tokens_seen": 33820272, + "step": 15670 + }, + { + "epoch": 2.5570962479608483, + "grad_norm": 0.031216297298669815, + "learning_rate": 0.0009976397454123911, + "loss": 0.1995, + "num_input_tokens_seen": 33830448, + "step": 15675 + }, + { + "epoch": 2.557911908646003, + "grad_norm": 0.13245442509651184, + "learning_rate": 0.0009976328323560025, + "loss": 0.354, + "num_input_tokens_seen": 33841808, + "step": 15680 + }, + { + "epoch": 2.5587275693311584, + "grad_norm": 0.02866053767502308, + "learning_rate": 0.0009976259092144533, + "loss": 0.085, + "num_input_tokens_seen": 33851088, + "step": 15685 + }, + { + "epoch": 2.5595432300163132, + "grad_norm": 0.06127011030912399, + "learning_rate": 0.0009976189759878836, + "loss": 0.0584, + "num_input_tokens_seen": 33863600, + "step": 15690 + }, + { + "epoch": 2.560358890701468, + "grad_norm": 0.03415321558713913, + "learning_rate": 0.0009976120326764342, + "loss": 0.1155, + "num_input_tokens_seen": 33873712, + "step": 15695 + }, + { + "epoch": 2.5611745513866233, + "grad_norm": 0.057170283049345016, + "learning_rate": 0.0009976050792802457, + "loss": 0.1155, + "num_input_tokens_seen": 33884496, + "step": 15700 + }, + { + "epoch": 2.561990212071778, + "grad_norm": 0.18055884540081024, + "learning_rate": 0.000997598115799459, + "loss": 0.1895, + "num_input_tokens_seen": 33896176, + "step": 15705 + }, + { + "epoch": 2.5628058727569334, + "grad_norm": 0.011545859277248383, + "learning_rate": 0.0009975911422342152, + "loss": 0.053, + "num_input_tokens_seen": 33907760, + "step": 15710 + }, + { + "epoch": 2.563621533442088, + "grad_norm": 0.07192616909742355, + "learning_rate": 0.0009975841585846558, + "loss": 0.1192, + "num_input_tokens_seen": 33918864, + "step": 15715 + }, + { + "epoch": 2.564437194127243, + "grad_norm": 0.0953463464975357, + "learning_rate": 0.000997577164850922, + "loss": 0.1145, + "num_input_tokens_seen": 33929072, + "step": 15720 + }, + { + "epoch": 2.565252854812398, + "grad_norm": 0.02232409454882145, + "learning_rate": 0.000997570161033156, + "loss": 0.1204, + "num_input_tokens_seen": 33939440, + "step": 15725 + }, + { + "epoch": 2.566068515497553, + "grad_norm": 0.25486090779304504, + "learning_rate": 0.0009975631471314992, + "loss": 0.0646, + "num_input_tokens_seen": 33949168, + "step": 15730 + }, + { + "epoch": 2.566884176182708, + "grad_norm": 0.017930161207914352, + "learning_rate": 0.0009975561231460942, + "loss": 0.073, + "num_input_tokens_seen": 33958800, + "step": 15735 + }, + { + "epoch": 2.567699836867863, + "grad_norm": 0.01774352602660656, + "learning_rate": 0.000997549089077083, + "loss": 0.1514, + "num_input_tokens_seen": 33968144, + "step": 15740 + }, + { + "epoch": 2.568515497553018, + "grad_norm": 0.02600264362990856, + "learning_rate": 0.0009975420449246084, + "loss": 0.0731, + "num_input_tokens_seen": 33979344, + "step": 15745 + }, + { + "epoch": 2.569331158238173, + "grad_norm": 0.2240467518568039, + "learning_rate": 0.0009975349906888131, + "loss": 0.1108, + "num_input_tokens_seen": 33990160, + "step": 15750 + }, + { + "epoch": 2.5701468189233276, + "grad_norm": 0.1824868768453598, + "learning_rate": 0.00099752792636984, + "loss": 0.1037, + "num_input_tokens_seen": 34000336, + "step": 15755 + }, + { + "epoch": 2.570962479608483, + "grad_norm": 0.02142205461859703, + "learning_rate": 0.0009975208519678324, + "loss": 0.1392, + "num_input_tokens_seen": 34011696, + "step": 15760 + }, + { + "epoch": 2.5717781402936377, + "grad_norm": 0.148535817861557, + "learning_rate": 0.0009975137674829335, + "loss": 0.1566, + "num_input_tokens_seen": 34022256, + "step": 15765 + }, + { + "epoch": 2.572593800978793, + "grad_norm": 0.16442307829856873, + "learning_rate": 0.000997506672915287, + "loss": 0.1731, + "num_input_tokens_seen": 34032848, + "step": 15770 + }, + { + "epoch": 2.573409461663948, + "grad_norm": 0.11616021394729614, + "learning_rate": 0.0009974995682650368, + "loss": 0.1735, + "num_input_tokens_seen": 34043888, + "step": 15775 + }, + { + "epoch": 2.5742251223491026, + "grad_norm": 0.15792444348335266, + "learning_rate": 0.0009974924535323265, + "loss": 0.2241, + "num_input_tokens_seen": 34055760, + "step": 15780 + }, + { + "epoch": 2.575040783034258, + "grad_norm": 0.08082375675439835, + "learning_rate": 0.0009974853287173006, + "loss": 0.1539, + "num_input_tokens_seen": 34067536, + "step": 15785 + }, + { + "epoch": 2.5758564437194127, + "grad_norm": 0.05074841529130936, + "learning_rate": 0.0009974781938201034, + "loss": 0.0602, + "num_input_tokens_seen": 34078896, + "step": 15790 + }, + { + "epoch": 2.576672104404568, + "grad_norm": 0.02613057568669319, + "learning_rate": 0.0009974710488408795, + "loss": 0.0438, + "num_input_tokens_seen": 34089648, + "step": 15795 + }, + { + "epoch": 2.5774877650897228, + "grad_norm": 0.011358670890331268, + "learning_rate": 0.0009974638937797736, + "loss": 0.1266, + "num_input_tokens_seen": 34101392, + "step": 15800 + }, + { + "epoch": 2.5783034257748776, + "grad_norm": 0.02720820903778076, + "learning_rate": 0.000997456728636931, + "loss": 0.0718, + "num_input_tokens_seen": 34111184, + "step": 15805 + }, + { + "epoch": 2.5791190864600324, + "grad_norm": 0.2023083120584488, + "learning_rate": 0.0009974495534124967, + "loss": 0.2293, + "num_input_tokens_seen": 34121872, + "step": 15810 + }, + { + "epoch": 2.5799347471451877, + "grad_norm": 0.01462864875793457, + "learning_rate": 0.000997442368106616, + "loss": 0.114, + "num_input_tokens_seen": 34132912, + "step": 15815 + }, + { + "epoch": 2.5807504078303425, + "grad_norm": 0.017234236001968384, + "learning_rate": 0.0009974351727194347, + "loss": 0.0767, + "num_input_tokens_seen": 34143632, + "step": 15820 + }, + { + "epoch": 2.5815660685154977, + "grad_norm": 0.016773421317338943, + "learning_rate": 0.0009974279672510986, + "loss": 0.0245, + "num_input_tokens_seen": 34154928, + "step": 15825 + }, + { + "epoch": 2.5823817292006526, + "grad_norm": 0.018271803855895996, + "learning_rate": 0.0009974207517017537, + "loss": 0.0541, + "num_input_tokens_seen": 34165168, + "step": 15830 + }, + { + "epoch": 2.5831973898858074, + "grad_norm": 0.09012021124362946, + "learning_rate": 0.0009974135260715465, + "loss": 0.0866, + "num_input_tokens_seen": 34176304, + "step": 15835 + }, + { + "epoch": 2.5840130505709626, + "grad_norm": 0.03086942993104458, + "learning_rate": 0.0009974062903606229, + "loss": 0.058, + "num_input_tokens_seen": 34186928, + "step": 15840 + }, + { + "epoch": 2.5848287112561175, + "grad_norm": 0.3191803991794586, + "learning_rate": 0.0009973990445691298, + "loss": 0.1462, + "num_input_tokens_seen": 34197712, + "step": 15845 + }, + { + "epoch": 2.5856443719412723, + "grad_norm": 0.02641608938574791, + "learning_rate": 0.0009973917886972143, + "loss": 0.048, + "num_input_tokens_seen": 34208144, + "step": 15850 + }, + { + "epoch": 2.5864600326264275, + "grad_norm": 0.025220094248652458, + "learning_rate": 0.000997384522745023, + "loss": 0.127, + "num_input_tokens_seen": 34219504, + "step": 15855 + }, + { + "epoch": 2.5872756933115824, + "grad_norm": 0.004061461891978979, + "learning_rate": 0.0009973772467127035, + "loss": 0.1875, + "num_input_tokens_seen": 34230032, + "step": 15860 + }, + { + "epoch": 2.588091353996737, + "grad_norm": 0.11642669141292572, + "learning_rate": 0.000997369960600403, + "loss": 0.037, + "num_input_tokens_seen": 34240400, + "step": 15865 + }, + { + "epoch": 2.5889070146818924, + "grad_norm": 0.07248809933662415, + "learning_rate": 0.0009973626644082694, + "loss": 0.121, + "num_input_tokens_seen": 34252048, + "step": 15870 + }, + { + "epoch": 2.5897226753670473, + "grad_norm": 0.03970911353826523, + "learning_rate": 0.0009973553581364503, + "loss": 0.0151, + "num_input_tokens_seen": 34262832, + "step": 15875 + }, + { + "epoch": 2.5905383360522025, + "grad_norm": 0.006574940402060747, + "learning_rate": 0.0009973480417850942, + "loss": 0.0384, + "num_input_tokens_seen": 34274512, + "step": 15880 + }, + { + "epoch": 2.5913539967373573, + "grad_norm": 0.21048304438591003, + "learning_rate": 0.0009973407153543489, + "loss": 0.2012, + "num_input_tokens_seen": 34284240, + "step": 15885 + }, + { + "epoch": 2.592169657422512, + "grad_norm": 0.010378936305642128, + "learning_rate": 0.0009973333788443632, + "loss": 0.1344, + "num_input_tokens_seen": 34293744, + "step": 15890 + }, + { + "epoch": 2.592985318107667, + "grad_norm": 0.09485937654972076, + "learning_rate": 0.0009973260322552855, + "loss": 0.0879, + "num_input_tokens_seen": 34304144, + "step": 15895 + }, + { + "epoch": 2.5938009787928222, + "grad_norm": 0.0019799217116087675, + "learning_rate": 0.000997318675587265, + "loss": 0.1599, + "num_input_tokens_seen": 34314992, + "step": 15900 + }, + { + "epoch": 2.594616639477977, + "grad_norm": 0.1326042115688324, + "learning_rate": 0.0009973113088404507, + "loss": 0.2377, + "num_input_tokens_seen": 34326768, + "step": 15905 + }, + { + "epoch": 2.5954323001631323, + "grad_norm": 0.03656404837965965, + "learning_rate": 0.0009973039320149916, + "loss": 0.0829, + "num_input_tokens_seen": 34336080, + "step": 15910 + }, + { + "epoch": 2.596247960848287, + "grad_norm": 0.03923811763525009, + "learning_rate": 0.0009972965451110376, + "loss": 0.0728, + "num_input_tokens_seen": 34347568, + "step": 15915 + }, + { + "epoch": 2.597063621533442, + "grad_norm": 0.24911628663539886, + "learning_rate": 0.0009972891481287382, + "loss": 0.2545, + "num_input_tokens_seen": 34358864, + "step": 15920 + }, + { + "epoch": 2.597879282218597, + "grad_norm": 0.12007445096969604, + "learning_rate": 0.0009972817410682433, + "loss": 0.0954, + "num_input_tokens_seen": 34371024, + "step": 15925 + }, + { + "epoch": 2.598694942903752, + "grad_norm": 0.006647109519690275, + "learning_rate": 0.0009972743239297032, + "loss": 0.1845, + "num_input_tokens_seen": 34381520, + "step": 15930 + }, + { + "epoch": 2.5995106035889073, + "grad_norm": 0.05227941274642944, + "learning_rate": 0.000997266896713268, + "loss": 0.0222, + "num_input_tokens_seen": 34392880, + "step": 15935 + }, + { + "epoch": 2.600326264274062, + "grad_norm": 0.0952635332942009, + "learning_rate": 0.0009972594594190884, + "loss": 0.1416, + "num_input_tokens_seen": 34404816, + "step": 15940 + }, + { + "epoch": 2.601141924959217, + "grad_norm": 0.02615487016737461, + "learning_rate": 0.0009972520120473149, + "loss": 0.1262, + "num_input_tokens_seen": 34414896, + "step": 15945 + }, + { + "epoch": 2.6019575856443717, + "grad_norm": 0.19116218388080597, + "learning_rate": 0.0009972445545980988, + "loss": 0.1472, + "num_input_tokens_seen": 34426480, + "step": 15950 + }, + { + "epoch": 2.602773246329527, + "grad_norm": 0.027874654158949852, + "learning_rate": 0.0009972370870715908, + "loss": 0.0442, + "num_input_tokens_seen": 34437328, + "step": 15955 + }, + { + "epoch": 2.603588907014682, + "grad_norm": 0.05413804203271866, + "learning_rate": 0.0009972296094679426, + "loss": 0.0972, + "num_input_tokens_seen": 34449584, + "step": 15960 + }, + { + "epoch": 2.604404567699837, + "grad_norm": 0.04909636452794075, + "learning_rate": 0.0009972221217873054, + "loss": 0.1781, + "num_input_tokens_seen": 34460592, + "step": 15965 + }, + { + "epoch": 2.605220228384992, + "grad_norm": 0.06522244960069656, + "learning_rate": 0.0009972146240298312, + "loss": 0.0541, + "num_input_tokens_seen": 34472944, + "step": 15970 + }, + { + "epoch": 2.6060358890701467, + "grad_norm": 0.010133202187716961, + "learning_rate": 0.000997207116195672, + "loss": 0.034, + "num_input_tokens_seen": 34483536, + "step": 15975 + }, + { + "epoch": 2.6068515497553015, + "grad_norm": 0.008424052968621254, + "learning_rate": 0.0009971995982849795, + "loss": 0.239, + "num_input_tokens_seen": 34495248, + "step": 15980 + }, + { + "epoch": 2.607667210440457, + "grad_norm": 0.09996895492076874, + "learning_rate": 0.0009971920702979066, + "loss": 0.1024, + "num_input_tokens_seen": 34506832, + "step": 15985 + }, + { + "epoch": 2.6084828711256116, + "grad_norm": 0.0076310415752232075, + "learning_rate": 0.000997184532234606, + "loss": 0.0352, + "num_input_tokens_seen": 34517168, + "step": 15990 + }, + { + "epoch": 2.609298531810767, + "grad_norm": 0.04842585697770119, + "learning_rate": 0.0009971769840952296, + "loss": 0.1446, + "num_input_tokens_seen": 34528272, + "step": 15995 + }, + { + "epoch": 2.6101141924959217, + "grad_norm": 0.014273292385041714, + "learning_rate": 0.0009971694258799312, + "loss": 0.0249, + "num_input_tokens_seen": 34537904, + "step": 16000 + }, + { + "epoch": 2.6109298531810765, + "grad_norm": 0.2039312869310379, + "learning_rate": 0.0009971618575888637, + "loss": 0.2369, + "num_input_tokens_seen": 34548368, + "step": 16005 + }, + { + "epoch": 2.6117455138662318, + "grad_norm": 0.027443569153547287, + "learning_rate": 0.0009971542792221802, + "loss": 0.0325, + "num_input_tokens_seen": 34559024, + "step": 16010 + }, + { + "epoch": 2.6125611745513866, + "grad_norm": 0.07282551378011703, + "learning_rate": 0.000997146690780035, + "loss": 0.1998, + "num_input_tokens_seen": 34570224, + "step": 16015 + }, + { + "epoch": 2.613376835236542, + "grad_norm": 0.10752265900373459, + "learning_rate": 0.000997139092262581, + "loss": 0.069, + "num_input_tokens_seen": 34581968, + "step": 16020 + }, + { + "epoch": 2.6141924959216967, + "grad_norm": 0.045813702046871185, + "learning_rate": 0.0009971314836699728, + "loss": 0.1011, + "num_input_tokens_seen": 34593392, + "step": 16025 + }, + { + "epoch": 2.6150081566068515, + "grad_norm": 0.047865137457847595, + "learning_rate": 0.0009971238650023644, + "loss": 0.0413, + "num_input_tokens_seen": 34604176, + "step": 16030 + }, + { + "epoch": 2.6158238172920063, + "grad_norm": 0.08814916759729385, + "learning_rate": 0.0009971162362599102, + "loss": 0.0822, + "num_input_tokens_seen": 34615024, + "step": 16035 + }, + { + "epoch": 2.6166394779771616, + "grad_norm": 0.19699475169181824, + "learning_rate": 0.000997108597442765, + "loss": 0.2013, + "num_input_tokens_seen": 34626160, + "step": 16040 + }, + { + "epoch": 2.6174551386623164, + "grad_norm": 0.05102047324180603, + "learning_rate": 0.000997100948551083, + "loss": 0.0849, + "num_input_tokens_seen": 34635632, + "step": 16045 + }, + { + "epoch": 2.6182707993474716, + "grad_norm": 0.08056139945983887, + "learning_rate": 0.0009970932895850201, + "loss": 0.043, + "num_input_tokens_seen": 34646288, + "step": 16050 + }, + { + "epoch": 2.6190864600326265, + "grad_norm": 0.00316016492433846, + "learning_rate": 0.000997085620544731, + "loss": 0.2138, + "num_input_tokens_seen": 34657232, + "step": 16055 + }, + { + "epoch": 2.6199021207177813, + "grad_norm": 0.140364408493042, + "learning_rate": 0.0009970779414303712, + "loss": 0.1879, + "num_input_tokens_seen": 34667408, + "step": 16060 + }, + { + "epoch": 2.6207177814029365, + "grad_norm": 0.03655733913183212, + "learning_rate": 0.0009970702522420962, + "loss": 0.0818, + "num_input_tokens_seen": 34677616, + "step": 16065 + }, + { + "epoch": 2.6215334420880914, + "grad_norm": 0.04961032792925835, + "learning_rate": 0.000997062552980062, + "loss": 0.0728, + "num_input_tokens_seen": 34688688, + "step": 16070 + }, + { + "epoch": 2.622349102773246, + "grad_norm": 0.1233089417219162, + "learning_rate": 0.0009970548436444248, + "loss": 0.1021, + "num_input_tokens_seen": 34700272, + "step": 16075 + }, + { + "epoch": 2.6231647634584014, + "grad_norm": 0.0446881465613842, + "learning_rate": 0.0009970471242353406, + "loss": 0.2407, + "num_input_tokens_seen": 34710832, + "step": 16080 + }, + { + "epoch": 2.6239804241435563, + "grad_norm": 0.076204814016819, + "learning_rate": 0.0009970393947529657, + "loss": 0.1545, + "num_input_tokens_seen": 34722224, + "step": 16085 + }, + { + "epoch": 2.624796084828711, + "grad_norm": 0.024876916781067848, + "learning_rate": 0.0009970316551974568, + "loss": 0.1053, + "num_input_tokens_seen": 34733136, + "step": 16090 + }, + { + "epoch": 2.6256117455138663, + "grad_norm": 0.019722016528248787, + "learning_rate": 0.0009970239055689712, + "loss": 0.0684, + "num_input_tokens_seen": 34743408, + "step": 16095 + }, + { + "epoch": 2.626427406199021, + "grad_norm": 0.3286396265029907, + "learning_rate": 0.0009970161458676655, + "loss": 0.307, + "num_input_tokens_seen": 34755184, + "step": 16100 + }, + { + "epoch": 2.6272430668841764, + "grad_norm": 0.06816285848617554, + "learning_rate": 0.000997008376093697, + "loss": 0.0734, + "num_input_tokens_seen": 34765040, + "step": 16105 + }, + { + "epoch": 2.6280587275693312, + "grad_norm": 0.03844304010272026, + "learning_rate": 0.0009970005962472233, + "loss": 0.1003, + "num_input_tokens_seen": 34775312, + "step": 16110 + }, + { + "epoch": 2.628874388254486, + "grad_norm": 0.0412660613656044, + "learning_rate": 0.0009969928063284022, + "loss": 0.0566, + "num_input_tokens_seen": 34784400, + "step": 16115 + }, + { + "epoch": 2.629690048939641, + "grad_norm": 0.08752242475748062, + "learning_rate": 0.0009969850063373913, + "loss": 0.1033, + "num_input_tokens_seen": 34796080, + "step": 16120 + }, + { + "epoch": 2.630505709624796, + "grad_norm": 0.0770307257771492, + "learning_rate": 0.0009969771962743488, + "loss": 0.0398, + "num_input_tokens_seen": 34807280, + "step": 16125 + }, + { + "epoch": 2.631321370309951, + "grad_norm": 0.09530388563871384, + "learning_rate": 0.0009969693761394326, + "loss": 0.1281, + "num_input_tokens_seen": 34818896, + "step": 16130 + }, + { + "epoch": 2.632137030995106, + "grad_norm": 0.044809695333242416, + "learning_rate": 0.000996961545932802, + "loss": 0.0923, + "num_input_tokens_seen": 34830288, + "step": 16135 + }, + { + "epoch": 2.632952691680261, + "grad_norm": 0.05540524423122406, + "learning_rate": 0.0009969537056546151, + "loss": 0.0747, + "num_input_tokens_seen": 34840688, + "step": 16140 + }, + { + "epoch": 2.633768352365416, + "grad_norm": 0.14792972803115845, + "learning_rate": 0.000996945855305031, + "loss": 0.1074, + "num_input_tokens_seen": 34852848, + "step": 16145 + }, + { + "epoch": 2.634584013050571, + "grad_norm": 0.25191640853881836, + "learning_rate": 0.0009969379948842085, + "loss": 0.2799, + "num_input_tokens_seen": 34863728, + "step": 16150 + }, + { + "epoch": 2.635399673735726, + "grad_norm": 0.018112903460860252, + "learning_rate": 0.0009969301243923073, + "loss": 0.0946, + "num_input_tokens_seen": 34874672, + "step": 16155 + }, + { + "epoch": 2.636215334420881, + "grad_norm": 0.11276046931743622, + "learning_rate": 0.0009969222438294867, + "loss": 0.1038, + "num_input_tokens_seen": 34885904, + "step": 16160 + }, + { + "epoch": 2.637030995106036, + "grad_norm": 0.11514152586460114, + "learning_rate": 0.0009969143531959063, + "loss": 0.171, + "num_input_tokens_seen": 34896144, + "step": 16165 + }, + { + "epoch": 2.637846655791191, + "grad_norm": 0.22373029589653015, + "learning_rate": 0.0009969064524917265, + "loss": 0.1962, + "num_input_tokens_seen": 34906928, + "step": 16170 + }, + { + "epoch": 2.6386623164763456, + "grad_norm": 0.04494435340166092, + "learning_rate": 0.000996898541717107, + "loss": 0.1221, + "num_input_tokens_seen": 34917392, + "step": 16175 + }, + { + "epoch": 2.639477977161501, + "grad_norm": 0.023480776697397232, + "learning_rate": 0.0009968906208722077, + "loss": 0.0763, + "num_input_tokens_seen": 34928240, + "step": 16180 + }, + { + "epoch": 2.6402936378466557, + "grad_norm": 0.13084053993225098, + "learning_rate": 0.00099688268995719, + "loss": 0.0493, + "num_input_tokens_seen": 34939824, + "step": 16185 + }, + { + "epoch": 2.641109298531811, + "grad_norm": 0.058285802602767944, + "learning_rate": 0.0009968747489722141, + "loss": 0.0749, + "num_input_tokens_seen": 34950704, + "step": 16190 + }, + { + "epoch": 2.641924959216966, + "grad_norm": 0.015076878480613232, + "learning_rate": 0.0009968667979174412, + "loss": 0.1356, + "num_input_tokens_seen": 34962224, + "step": 16195 + }, + { + "epoch": 2.6427406199021206, + "grad_norm": 0.012966454960405827, + "learning_rate": 0.0009968588367930324, + "loss": 0.0985, + "num_input_tokens_seen": 34973936, + "step": 16200 + }, + { + "epoch": 2.6435562805872754, + "grad_norm": 0.23642700910568237, + "learning_rate": 0.0009968508655991489, + "loss": 0.0887, + "num_input_tokens_seen": 34985296, + "step": 16205 + }, + { + "epoch": 2.6443719412724307, + "grad_norm": 0.21331088244915009, + "learning_rate": 0.0009968428843359523, + "loss": 0.2171, + "num_input_tokens_seen": 34996528, + "step": 16210 + }, + { + "epoch": 2.6451876019575855, + "grad_norm": 0.16859817504882812, + "learning_rate": 0.0009968348930036043, + "loss": 0.1193, + "num_input_tokens_seen": 35007056, + "step": 16215 + }, + { + "epoch": 2.6460032626427408, + "grad_norm": 0.05117064714431763, + "learning_rate": 0.000996826891602267, + "loss": 0.0595, + "num_input_tokens_seen": 35017744, + "step": 16220 + }, + { + "epoch": 2.6468189233278956, + "grad_norm": 0.07311154901981354, + "learning_rate": 0.0009968188801321024, + "loss": 0.1327, + "num_input_tokens_seen": 35029104, + "step": 16225 + }, + { + "epoch": 2.6476345840130504, + "grad_norm": 0.04667254537343979, + "learning_rate": 0.000996810858593273, + "loss": 0.089, + "num_input_tokens_seen": 35039632, + "step": 16230 + }, + { + "epoch": 2.6484502446982057, + "grad_norm": 0.030987687408924103, + "learning_rate": 0.000996802826985941, + "loss": 0.1056, + "num_input_tokens_seen": 35050704, + "step": 16235 + }, + { + "epoch": 2.6492659053833605, + "grad_norm": 0.005027539096772671, + "learning_rate": 0.0009967947853102698, + "loss": 0.0661, + "num_input_tokens_seen": 35060688, + "step": 16240 + }, + { + "epoch": 2.6500815660685157, + "grad_norm": 0.03209603205323219, + "learning_rate": 0.000996786733566422, + "loss": 0.0317, + "num_input_tokens_seen": 35071920, + "step": 16245 + }, + { + "epoch": 2.6508972267536706, + "grad_norm": 0.007075433153659105, + "learning_rate": 0.0009967786717545609, + "loss": 0.0254, + "num_input_tokens_seen": 35083056, + "step": 16250 + }, + { + "epoch": 2.6517128874388254, + "grad_norm": 0.05450182780623436, + "learning_rate": 0.0009967705998748496, + "loss": 0.0516, + "num_input_tokens_seen": 35094320, + "step": 16255 + }, + { + "epoch": 2.65252854812398, + "grad_norm": 0.013659525662660599, + "learning_rate": 0.000996762517927452, + "loss": 0.1173, + "num_input_tokens_seen": 35104560, + "step": 16260 + }, + { + "epoch": 2.6533442088091355, + "grad_norm": 0.2563559412956238, + "learning_rate": 0.0009967544259125317, + "loss": 0.1933, + "num_input_tokens_seen": 35115376, + "step": 16265 + }, + { + "epoch": 2.6541598694942903, + "grad_norm": 0.06311127543449402, + "learning_rate": 0.000996746323830253, + "loss": 0.109, + "num_input_tokens_seen": 35126192, + "step": 16270 + }, + { + "epoch": 2.6549755301794455, + "grad_norm": 0.04986641928553581, + "learning_rate": 0.0009967382116807797, + "loss": 0.2096, + "num_input_tokens_seen": 35137936, + "step": 16275 + }, + { + "epoch": 2.6557911908646004, + "grad_norm": 0.050005145370960236, + "learning_rate": 0.0009967300894642764, + "loss": 0.1298, + "num_input_tokens_seen": 35148560, + "step": 16280 + }, + { + "epoch": 2.656606851549755, + "grad_norm": 0.2781466841697693, + "learning_rate": 0.0009967219571809076, + "loss": 0.0838, + "num_input_tokens_seen": 35159632, + "step": 16285 + }, + { + "epoch": 2.6574225122349104, + "grad_norm": 0.004133820533752441, + "learning_rate": 0.0009967138148308384, + "loss": 0.1553, + "num_input_tokens_seen": 35170000, + "step": 16290 + }, + { + "epoch": 2.6582381729200653, + "grad_norm": 0.010567733086645603, + "learning_rate": 0.0009967056624142336, + "loss": 0.0522, + "num_input_tokens_seen": 35181360, + "step": 16295 + }, + { + "epoch": 2.65905383360522, + "grad_norm": 0.03892872855067253, + "learning_rate": 0.0009966974999312584, + "loss": 0.1246, + "num_input_tokens_seen": 35191600, + "step": 16300 + }, + { + "epoch": 2.6598694942903753, + "grad_norm": 0.029648205265402794, + "learning_rate": 0.000996689327382078, + "loss": 0.035, + "num_input_tokens_seen": 35201136, + "step": 16305 + }, + { + "epoch": 2.66068515497553, + "grad_norm": 0.034195419400930405, + "learning_rate": 0.0009966811447668586, + "loss": 0.0587, + "num_input_tokens_seen": 35211600, + "step": 16310 + }, + { + "epoch": 2.661500815660685, + "grad_norm": 0.01270847488194704, + "learning_rate": 0.0009966729520857658, + "loss": 0.1418, + "num_input_tokens_seen": 35222416, + "step": 16315 + }, + { + "epoch": 2.6623164763458402, + "grad_norm": 0.04060065746307373, + "learning_rate": 0.0009966647493389654, + "loss": 0.0995, + "num_input_tokens_seen": 35233904, + "step": 16320 + }, + { + "epoch": 2.663132137030995, + "grad_norm": 0.027049917727708817, + "learning_rate": 0.0009966565365266238, + "loss": 0.1362, + "num_input_tokens_seen": 35243984, + "step": 16325 + }, + { + "epoch": 2.6639477977161503, + "grad_norm": 0.04302258789539337, + "learning_rate": 0.0009966483136489073, + "loss": 0.0693, + "num_input_tokens_seen": 35255216, + "step": 16330 + }, + { + "epoch": 2.664763458401305, + "grad_norm": 0.20379215478897095, + "learning_rate": 0.0009966400807059827, + "loss": 0.1275, + "num_input_tokens_seen": 35265136, + "step": 16335 + }, + { + "epoch": 2.66557911908646, + "grad_norm": 0.012177269905805588, + "learning_rate": 0.000996631837698017, + "loss": 0.0783, + "num_input_tokens_seen": 35275600, + "step": 16340 + }, + { + "epoch": 2.6663947797716148, + "grad_norm": 0.06536693871021271, + "learning_rate": 0.000996623584625177, + "loss": 0.1058, + "num_input_tokens_seen": 35286928, + "step": 16345 + }, + { + "epoch": 2.66721044045677, + "grad_norm": 0.08594885468482971, + "learning_rate": 0.00099661532148763, + "loss": 0.1441, + "num_input_tokens_seen": 35299024, + "step": 16350 + }, + { + "epoch": 2.668026101141925, + "grad_norm": 0.09080636501312256, + "learning_rate": 0.0009966070482855436, + "loss": 0.0704, + "num_input_tokens_seen": 35310320, + "step": 16355 + }, + { + "epoch": 2.66884176182708, + "grad_norm": 0.028082668781280518, + "learning_rate": 0.0009965987650190852, + "loss": 0.0351, + "num_input_tokens_seen": 35321104, + "step": 16360 + }, + { + "epoch": 2.669657422512235, + "grad_norm": 0.18926265835762024, + "learning_rate": 0.000996590471688423, + "loss": 0.2103, + "num_input_tokens_seen": 35332688, + "step": 16365 + }, + { + "epoch": 2.6704730831973897, + "grad_norm": 0.010513481684029102, + "learning_rate": 0.000996582168293725, + "loss": 0.1222, + "num_input_tokens_seen": 35342416, + "step": 16370 + }, + { + "epoch": 2.671288743882545, + "grad_norm": 0.04382769763469696, + "learning_rate": 0.0009965738548351592, + "loss": 0.094, + "num_input_tokens_seen": 35353616, + "step": 16375 + }, + { + "epoch": 2.6721044045677, + "grad_norm": 0.0995483547449112, + "learning_rate": 0.0009965655313128945, + "loss": 0.0962, + "num_input_tokens_seen": 35364304, + "step": 16380 + }, + { + "epoch": 2.672920065252855, + "grad_norm": 0.06052175536751747, + "learning_rate": 0.0009965571977270994, + "loss": 0.1651, + "num_input_tokens_seen": 35374992, + "step": 16385 + }, + { + "epoch": 2.67373572593801, + "grad_norm": 0.18613076210021973, + "learning_rate": 0.0009965488540779426, + "loss": 0.1645, + "num_input_tokens_seen": 35386544, + "step": 16390 + }, + { + "epoch": 2.6745513866231647, + "grad_norm": 0.011818121187388897, + "learning_rate": 0.0009965405003655933, + "loss": 0.1571, + "num_input_tokens_seen": 35397616, + "step": 16395 + }, + { + "epoch": 2.6753670473083195, + "grad_norm": 0.049891602247953415, + "learning_rate": 0.000996532136590221, + "loss": 0.1412, + "num_input_tokens_seen": 35407408, + "step": 16400 + }, + { + "epoch": 2.676182707993475, + "grad_norm": 0.04860110208392143, + "learning_rate": 0.000996523762751995, + "loss": 0.037, + "num_input_tokens_seen": 35418512, + "step": 16405 + }, + { + "epoch": 2.6769983686786296, + "grad_norm": 0.10363364964723587, + "learning_rate": 0.000996515378851085, + "loss": 0.2416, + "num_input_tokens_seen": 35429712, + "step": 16410 + }, + { + "epoch": 2.677814029363785, + "grad_norm": 0.04161006957292557, + "learning_rate": 0.0009965069848876609, + "loss": 0.0971, + "num_input_tokens_seen": 35440560, + "step": 16415 + }, + { + "epoch": 2.6786296900489397, + "grad_norm": 0.05684167891740799, + "learning_rate": 0.000996498580861893, + "loss": 0.2234, + "num_input_tokens_seen": 35451504, + "step": 16420 + }, + { + "epoch": 2.6794453507340945, + "grad_norm": 0.0680040717124939, + "learning_rate": 0.0009964901667739517, + "loss": 0.1753, + "num_input_tokens_seen": 35462640, + "step": 16425 + }, + { + "epoch": 2.6802610114192493, + "grad_norm": 0.05222951993346214, + "learning_rate": 0.000996481742624007, + "loss": 0.0828, + "num_input_tokens_seen": 35474160, + "step": 16430 + }, + { + "epoch": 2.6810766721044046, + "grad_norm": 0.03267939016222954, + "learning_rate": 0.00099647330841223, + "loss": 0.0963, + "num_input_tokens_seen": 35485200, + "step": 16435 + }, + { + "epoch": 2.6818923327895594, + "grad_norm": 0.06117767468094826, + "learning_rate": 0.0009964648641387918, + "loss": 0.363, + "num_input_tokens_seen": 35496176, + "step": 16440 + }, + { + "epoch": 2.6827079934747147, + "grad_norm": 0.018112177029252052, + "learning_rate": 0.000996456409803863, + "loss": 0.1143, + "num_input_tokens_seen": 35506960, + "step": 16445 + }, + { + "epoch": 2.6835236541598695, + "grad_norm": 0.19327247142791748, + "learning_rate": 0.0009964479454076156, + "loss": 0.1561, + "num_input_tokens_seen": 35517744, + "step": 16450 + }, + { + "epoch": 2.6843393148450243, + "grad_norm": 0.1426132321357727, + "learning_rate": 0.0009964394709502207, + "loss": 0.226, + "num_input_tokens_seen": 35529680, + "step": 16455 + }, + { + "epoch": 2.6851549755301796, + "grad_norm": 0.02000572346150875, + "learning_rate": 0.0009964309864318502, + "loss": 0.122, + "num_input_tokens_seen": 35540496, + "step": 16460 + }, + { + "epoch": 2.6859706362153344, + "grad_norm": 0.13784906268119812, + "learning_rate": 0.0009964224918526758, + "loss": 0.1307, + "num_input_tokens_seen": 35552176, + "step": 16465 + }, + { + "epoch": 2.6867862969004896, + "grad_norm": 0.025162290781736374, + "learning_rate": 0.0009964139872128699, + "loss": 0.0548, + "num_input_tokens_seen": 35563440, + "step": 16470 + }, + { + "epoch": 2.6876019575856445, + "grad_norm": 0.07458358258008957, + "learning_rate": 0.000996405472512605, + "loss": 0.0405, + "num_input_tokens_seen": 35574256, + "step": 16475 + }, + { + "epoch": 2.6884176182707993, + "grad_norm": 0.1809203326702118, + "learning_rate": 0.0009963969477520531, + "loss": 0.0864, + "num_input_tokens_seen": 35584368, + "step": 16480 + }, + { + "epoch": 2.689233278955954, + "grad_norm": 0.05000342056155205, + "learning_rate": 0.0009963884129313876, + "loss": 0.0676, + "num_input_tokens_seen": 35594896, + "step": 16485 + }, + { + "epoch": 2.6900489396411094, + "grad_norm": 0.05927790328860283, + "learning_rate": 0.0009963798680507811, + "loss": 0.1027, + "num_input_tokens_seen": 35605488, + "step": 16490 + }, + { + "epoch": 2.690864600326264, + "grad_norm": 0.07674163579940796, + "learning_rate": 0.0009963713131104068, + "loss": 0.0358, + "num_input_tokens_seen": 35617616, + "step": 16495 + }, + { + "epoch": 2.6916802610114194, + "grad_norm": 0.03120202012360096, + "learning_rate": 0.0009963627481104384, + "loss": 0.2406, + "num_input_tokens_seen": 35628304, + "step": 16500 + }, + { + "epoch": 2.6924959216965743, + "grad_norm": 0.04719226807355881, + "learning_rate": 0.000996354173051049, + "loss": 0.155, + "num_input_tokens_seen": 35640016, + "step": 16505 + }, + { + "epoch": 2.693311582381729, + "grad_norm": 0.13795311748981476, + "learning_rate": 0.0009963455879324129, + "loss": 0.1473, + "num_input_tokens_seen": 35651632, + "step": 16510 + }, + { + "epoch": 2.6941272430668843, + "grad_norm": 0.21742363274097443, + "learning_rate": 0.0009963369927547035, + "loss": 0.0717, + "num_input_tokens_seen": 35661200, + "step": 16515 + }, + { + "epoch": 2.694942903752039, + "grad_norm": 0.12605741620063782, + "learning_rate": 0.0009963283875180952, + "loss": 0.0931, + "num_input_tokens_seen": 35671472, + "step": 16520 + }, + { + "epoch": 2.695758564437194, + "grad_norm": 0.12286864966154099, + "learning_rate": 0.0009963197722227628, + "loss": 0.1201, + "num_input_tokens_seen": 35682480, + "step": 16525 + }, + { + "epoch": 2.6965742251223492, + "grad_norm": 0.09845346957445145, + "learning_rate": 0.0009963111468688805, + "loss": 0.1452, + "num_input_tokens_seen": 35693168, + "step": 16530 + }, + { + "epoch": 2.697389885807504, + "grad_norm": 0.03589067980647087, + "learning_rate": 0.000996302511456623, + "loss": 0.06, + "num_input_tokens_seen": 35704624, + "step": 16535 + }, + { + "epoch": 2.698205546492659, + "grad_norm": 0.1269928365945816, + "learning_rate": 0.0009962938659861657, + "loss": 0.059, + "num_input_tokens_seen": 35714768, + "step": 16540 + }, + { + "epoch": 2.699021207177814, + "grad_norm": 0.006672917399555445, + "learning_rate": 0.0009962852104576836, + "loss": 0.0791, + "num_input_tokens_seen": 35724240, + "step": 16545 + }, + { + "epoch": 2.699836867862969, + "grad_norm": 0.048434000462293625, + "learning_rate": 0.0009962765448713522, + "loss": 0.0802, + "num_input_tokens_seen": 35735472, + "step": 16550 + }, + { + "epoch": 2.700652528548124, + "grad_norm": 0.006401789374649525, + "learning_rate": 0.000996267869227347, + "loss": 0.0435, + "num_input_tokens_seen": 35747376, + "step": 16555 + }, + { + "epoch": 2.701468189233279, + "grad_norm": 0.0249084010720253, + "learning_rate": 0.0009962591835258436, + "loss": 0.0188, + "num_input_tokens_seen": 35758960, + "step": 16560 + }, + { + "epoch": 2.702283849918434, + "grad_norm": 0.15324918925762177, + "learning_rate": 0.0009962504877670186, + "loss": 0.1264, + "num_input_tokens_seen": 35770512, + "step": 16565 + }, + { + "epoch": 2.7030995106035887, + "grad_norm": 0.010256961919367313, + "learning_rate": 0.0009962417819510479, + "loss": 0.2247, + "num_input_tokens_seen": 35781424, + "step": 16570 + }, + { + "epoch": 2.703915171288744, + "grad_norm": 0.10292331129312515, + "learning_rate": 0.0009962330660781078, + "loss": 0.0999, + "num_input_tokens_seen": 35793200, + "step": 16575 + }, + { + "epoch": 2.7047308319738987, + "grad_norm": 0.04794760048389435, + "learning_rate": 0.0009962243401483752, + "loss": 0.065, + "num_input_tokens_seen": 35804880, + "step": 16580 + }, + { + "epoch": 2.705546492659054, + "grad_norm": 0.006814230233430862, + "learning_rate": 0.000996215604162027, + "loss": 0.1619, + "num_input_tokens_seen": 35815376, + "step": 16585 + }, + { + "epoch": 2.706362153344209, + "grad_norm": 0.07483354210853577, + "learning_rate": 0.0009962068581192399, + "loss": 0.0953, + "num_input_tokens_seen": 35825008, + "step": 16590 + }, + { + "epoch": 2.7071778140293636, + "grad_norm": 0.13249722123146057, + "learning_rate": 0.0009961981020201913, + "loss": 0.096, + "num_input_tokens_seen": 35834384, + "step": 16595 + }, + { + "epoch": 2.707993474714519, + "grad_norm": 0.26114413142204285, + "learning_rate": 0.0009961893358650586, + "loss": 0.1324, + "num_input_tokens_seen": 35844752, + "step": 16600 + }, + { + "epoch": 2.7088091353996737, + "grad_norm": 0.011786268092691898, + "learning_rate": 0.00099618055965402, + "loss": 0.1443, + "num_input_tokens_seen": 35855664, + "step": 16605 + }, + { + "epoch": 2.709624796084829, + "grad_norm": 0.017760034650564194, + "learning_rate": 0.0009961717733872524, + "loss": 0.028, + "num_input_tokens_seen": 35867472, + "step": 16610 + }, + { + "epoch": 2.710440456769984, + "grad_norm": 0.013623403385281563, + "learning_rate": 0.0009961629770649347, + "loss": 0.0688, + "num_input_tokens_seen": 35878544, + "step": 16615 + }, + { + "epoch": 2.7112561174551386, + "grad_norm": 0.08606085926294327, + "learning_rate": 0.0009961541706872447, + "loss": 0.0984, + "num_input_tokens_seen": 35890704, + "step": 16620 + }, + { + "epoch": 2.7120717781402934, + "grad_norm": 0.006418906152248383, + "learning_rate": 0.000996145354254361, + "loss": 0.0234, + "num_input_tokens_seen": 35901424, + "step": 16625 + }, + { + "epoch": 2.7128874388254487, + "grad_norm": 0.12155283242464066, + "learning_rate": 0.0009961365277664624, + "loss": 0.2166, + "num_input_tokens_seen": 35912688, + "step": 16630 + }, + { + "epoch": 2.7137030995106035, + "grad_norm": 0.17688867449760437, + "learning_rate": 0.0009961276912237276, + "loss": 0.1361, + "num_input_tokens_seen": 35924272, + "step": 16635 + }, + { + "epoch": 2.7145187601957588, + "grad_norm": 0.03098876029253006, + "learning_rate": 0.0009961188446263357, + "loss": 0.1887, + "num_input_tokens_seen": 35935984, + "step": 16640 + }, + { + "epoch": 2.7153344208809136, + "grad_norm": 0.10147203505039215, + "learning_rate": 0.0009961099879744661, + "loss": 0.2432, + "num_input_tokens_seen": 35945840, + "step": 16645 + }, + { + "epoch": 2.7161500815660684, + "grad_norm": 0.03277093917131424, + "learning_rate": 0.0009961011212682982, + "loss": 0.116, + "num_input_tokens_seen": 35956336, + "step": 16650 + }, + { + "epoch": 2.7169657422512232, + "grad_norm": 0.00960891880095005, + "learning_rate": 0.0009960922445080118, + "loss": 0.0635, + "num_input_tokens_seen": 35967664, + "step": 16655 + }, + { + "epoch": 2.7177814029363785, + "grad_norm": 0.10828480869531631, + "learning_rate": 0.0009960833576937867, + "loss": 0.2616, + "num_input_tokens_seen": 35978480, + "step": 16660 + }, + { + "epoch": 2.7185970636215333, + "grad_norm": 0.01858861744403839, + "learning_rate": 0.000996074460825803, + "loss": 0.1065, + "num_input_tokens_seen": 35989040, + "step": 16665 + }, + { + "epoch": 2.7194127243066886, + "grad_norm": 0.05515943467617035, + "learning_rate": 0.0009960655539042412, + "loss": 0.1553, + "num_input_tokens_seen": 35999792, + "step": 16670 + }, + { + "epoch": 2.7202283849918434, + "grad_norm": 0.019007014110684395, + "learning_rate": 0.0009960566369292814, + "loss": 0.0316, + "num_input_tokens_seen": 36009392, + "step": 16675 + }, + { + "epoch": 2.721044045676998, + "grad_norm": 0.034815505146980286, + "learning_rate": 0.0009960477099011048, + "loss": 0.1742, + "num_input_tokens_seen": 36019952, + "step": 16680 + }, + { + "epoch": 2.7218597063621535, + "grad_norm": 0.009619861841201782, + "learning_rate": 0.000996038772819892, + "loss": 0.0484, + "num_input_tokens_seen": 36031152, + "step": 16685 + }, + { + "epoch": 2.7226753670473083, + "grad_norm": 0.053180892020463943, + "learning_rate": 0.0009960298256858238, + "loss": 0.0819, + "num_input_tokens_seen": 36041808, + "step": 16690 + }, + { + "epoch": 2.7234910277324635, + "grad_norm": 0.1688222587108612, + "learning_rate": 0.0009960208684990824, + "loss": 0.1105, + "num_input_tokens_seen": 36051248, + "step": 16695 + }, + { + "epoch": 2.7243066884176184, + "grad_norm": 0.0084160715341568, + "learning_rate": 0.0009960119012598489, + "loss": 0.116, + "num_input_tokens_seen": 36063024, + "step": 16700 + }, + { + "epoch": 2.725122349102773, + "grad_norm": 0.026014313101768494, + "learning_rate": 0.0009960029239683046, + "loss": 0.198, + "num_input_tokens_seen": 36074800, + "step": 16705 + }, + { + "epoch": 2.725938009787928, + "grad_norm": 0.04177123308181763, + "learning_rate": 0.000995993936624632, + "loss": 0.0723, + "num_input_tokens_seen": 36084784, + "step": 16710 + }, + { + "epoch": 2.7267536704730833, + "grad_norm": 0.20435082912445068, + "learning_rate": 0.000995984939229013, + "loss": 0.0981, + "num_input_tokens_seen": 36095248, + "step": 16715 + }, + { + "epoch": 2.727569331158238, + "grad_norm": 0.12735266983509064, + "learning_rate": 0.0009959759317816302, + "loss": 0.2375, + "num_input_tokens_seen": 36106288, + "step": 16720 + }, + { + "epoch": 2.7283849918433933, + "grad_norm": 0.03834512084722519, + "learning_rate": 0.0009959669142826659, + "loss": 0.0679, + "num_input_tokens_seen": 36116976, + "step": 16725 + }, + { + "epoch": 2.729200652528548, + "grad_norm": 0.020699964836239815, + "learning_rate": 0.0009959578867323028, + "loss": 0.0633, + "num_input_tokens_seen": 36127504, + "step": 16730 + }, + { + "epoch": 2.730016313213703, + "grad_norm": 0.006737627554684877, + "learning_rate": 0.000995948849130724, + "loss": 0.0449, + "num_input_tokens_seen": 36137744, + "step": 16735 + }, + { + "epoch": 2.7308319738988582, + "grad_norm": 0.08040996640920639, + "learning_rate": 0.0009959398014781128, + "loss": 0.1631, + "num_input_tokens_seen": 36149328, + "step": 16740 + }, + { + "epoch": 2.731647634584013, + "grad_norm": 0.020759882405400276, + "learning_rate": 0.000995930743774652, + "loss": 0.0431, + "num_input_tokens_seen": 36160688, + "step": 16745 + }, + { + "epoch": 2.732463295269168, + "grad_norm": 0.08896606415510178, + "learning_rate": 0.0009959216760205257, + "loss": 0.0707, + "num_input_tokens_seen": 36171056, + "step": 16750 + }, + { + "epoch": 2.733278955954323, + "grad_norm": 0.006299588363617659, + "learning_rate": 0.0009959125982159176, + "loss": 0.1441, + "num_input_tokens_seen": 36181552, + "step": 16755 + }, + { + "epoch": 2.734094616639478, + "grad_norm": 0.04413047432899475, + "learning_rate": 0.0009959035103610115, + "loss": 0.0834, + "num_input_tokens_seen": 36191472, + "step": 16760 + }, + { + "epoch": 2.7349102773246328, + "grad_norm": 0.14438296854496002, + "learning_rate": 0.0009958944124559919, + "loss": 0.2134, + "num_input_tokens_seen": 36204272, + "step": 16765 + }, + { + "epoch": 2.735725938009788, + "grad_norm": 0.050696443766355515, + "learning_rate": 0.0009958853045010426, + "loss": 0.0557, + "num_input_tokens_seen": 36216144, + "step": 16770 + }, + { + "epoch": 2.736541598694943, + "grad_norm": 0.11860430985689163, + "learning_rate": 0.0009958761864963487, + "loss": 0.0754, + "num_input_tokens_seen": 36226672, + "step": 16775 + }, + { + "epoch": 2.737357259380098, + "grad_norm": 0.20829935371875763, + "learning_rate": 0.0009958670584420948, + "loss": 0.298, + "num_input_tokens_seen": 36238448, + "step": 16780 + }, + { + "epoch": 2.738172920065253, + "grad_norm": 0.022341042757034302, + "learning_rate": 0.000995857920338466, + "loss": 0.2273, + "num_input_tokens_seen": 36248400, + "step": 16785 + }, + { + "epoch": 2.7389885807504077, + "grad_norm": 0.0673818364739418, + "learning_rate": 0.0009958487721856474, + "loss": 0.0499, + "num_input_tokens_seen": 36259824, + "step": 16790 + }, + { + "epoch": 2.7398042414355626, + "grad_norm": 0.0898568406701088, + "learning_rate": 0.0009958396139838242, + "loss": 0.0776, + "num_input_tokens_seen": 36269168, + "step": 16795 + }, + { + "epoch": 2.740619902120718, + "grad_norm": 0.05799885466694832, + "learning_rate": 0.0009958304457331822, + "loss": 0.0654, + "num_input_tokens_seen": 36278288, + "step": 16800 + }, + { + "epoch": 2.7414355628058726, + "grad_norm": 0.07931547611951828, + "learning_rate": 0.0009958212674339075, + "loss": 0.0692, + "num_input_tokens_seen": 36289200, + "step": 16805 + }, + { + "epoch": 2.742251223491028, + "grad_norm": 0.07164021581411362, + "learning_rate": 0.0009958120790861855, + "loss": 0.1046, + "num_input_tokens_seen": 36300112, + "step": 16810 + }, + { + "epoch": 2.7430668841761827, + "grad_norm": 0.14807431399822235, + "learning_rate": 0.000995802880690203, + "loss": 0.199, + "num_input_tokens_seen": 36309872, + "step": 16815 + }, + { + "epoch": 2.7438825448613375, + "grad_norm": 0.008118130266666412, + "learning_rate": 0.000995793672246146, + "loss": 0.1201, + "num_input_tokens_seen": 36320080, + "step": 16820 + }, + { + "epoch": 2.744698205546493, + "grad_norm": 0.016023870557546616, + "learning_rate": 0.0009957844537542013, + "loss": 0.1853, + "num_input_tokens_seen": 36330448, + "step": 16825 + }, + { + "epoch": 2.7455138662316476, + "grad_norm": 0.18821802735328674, + "learning_rate": 0.0009957752252145557, + "loss": 0.1643, + "num_input_tokens_seen": 36341008, + "step": 16830 + }, + { + "epoch": 2.746329526916803, + "grad_norm": 0.025277676060795784, + "learning_rate": 0.0009957659866273963, + "loss": 0.1134, + "num_input_tokens_seen": 36351728, + "step": 16835 + }, + { + "epoch": 2.7471451876019577, + "grad_norm": 0.040302034467458725, + "learning_rate": 0.0009957567379929103, + "loss": 0.1228, + "num_input_tokens_seen": 36362832, + "step": 16840 + }, + { + "epoch": 2.7479608482871125, + "grad_norm": 0.20980079472064972, + "learning_rate": 0.0009957474793112848, + "loss": 0.1315, + "num_input_tokens_seen": 36373520, + "step": 16845 + }, + { + "epoch": 2.7487765089722673, + "grad_norm": 0.13790211081504822, + "learning_rate": 0.0009957382105827079, + "loss": 0.0805, + "num_input_tokens_seen": 36384912, + "step": 16850 + }, + { + "epoch": 2.7495921696574226, + "grad_norm": 0.017296679317951202, + "learning_rate": 0.0009957289318073674, + "loss": 0.0286, + "num_input_tokens_seen": 36395760, + "step": 16855 + }, + { + "epoch": 2.7504078303425774, + "grad_norm": 0.05207030847668648, + "learning_rate": 0.000995719642985451, + "loss": 0.0413, + "num_input_tokens_seen": 36406768, + "step": 16860 + }, + { + "epoch": 2.7512234910277327, + "grad_norm": 0.04654117673635483, + "learning_rate": 0.0009957103441171472, + "loss": 0.0275, + "num_input_tokens_seen": 36415728, + "step": 16865 + }, + { + "epoch": 2.7520391517128875, + "grad_norm": 0.01514064148068428, + "learning_rate": 0.0009957010352026447, + "loss": 0.2576, + "num_input_tokens_seen": 36427728, + "step": 16870 + }, + { + "epoch": 2.7528548123980423, + "grad_norm": 0.006023972760885954, + "learning_rate": 0.0009956917162421317, + "loss": 0.2038, + "num_input_tokens_seen": 36438640, + "step": 16875 + }, + { + "epoch": 2.753670473083197, + "grad_norm": 0.26965123414993286, + "learning_rate": 0.0009956823872357972, + "loss": 0.1544, + "num_input_tokens_seen": 36448976, + "step": 16880 + }, + { + "epoch": 2.7544861337683524, + "grad_norm": 0.01376594603061676, + "learning_rate": 0.0009956730481838303, + "loss": 0.2072, + "num_input_tokens_seen": 36458736, + "step": 16885 + }, + { + "epoch": 2.755301794453507, + "grad_norm": 0.24467112123966217, + "learning_rate": 0.0009956636990864202, + "loss": 0.2255, + "num_input_tokens_seen": 36468144, + "step": 16890 + }, + { + "epoch": 2.7561174551386625, + "grad_norm": 0.035813480615615845, + "learning_rate": 0.0009956543399437569, + "loss": 0.1184, + "num_input_tokens_seen": 36478512, + "step": 16895 + }, + { + "epoch": 2.7569331158238173, + "grad_norm": 0.10317237675189972, + "learning_rate": 0.0009956449707560291, + "loss": 0.0867, + "num_input_tokens_seen": 36490384, + "step": 16900 + }, + { + "epoch": 2.757748776508972, + "grad_norm": 0.14292746782302856, + "learning_rate": 0.0009956355915234274, + "loss": 0.0805, + "num_input_tokens_seen": 36500944, + "step": 16905 + }, + { + "epoch": 2.7585644371941274, + "grad_norm": 0.06484779715538025, + "learning_rate": 0.0009956262022461416, + "loss": 0.0792, + "num_input_tokens_seen": 36511056, + "step": 16910 + }, + { + "epoch": 2.759380097879282, + "grad_norm": 0.0332934632897377, + "learning_rate": 0.0009956168029243621, + "loss": 0.1565, + "num_input_tokens_seen": 36522448, + "step": 16915 + }, + { + "epoch": 2.7601957585644374, + "grad_norm": 0.0191267691552639, + "learning_rate": 0.0009956073935582794, + "loss": 0.1997, + "num_input_tokens_seen": 36533456, + "step": 16920 + }, + { + "epoch": 2.7610114192495923, + "grad_norm": 0.017247522249817848, + "learning_rate": 0.000995597974148084, + "loss": 0.1833, + "num_input_tokens_seen": 36544656, + "step": 16925 + }, + { + "epoch": 2.761827079934747, + "grad_norm": 0.06324011832475662, + "learning_rate": 0.0009955885446939672, + "loss": 0.08, + "num_input_tokens_seen": 36556144, + "step": 16930 + }, + { + "epoch": 2.762642740619902, + "grad_norm": 0.05617910996079445, + "learning_rate": 0.0009955791051961195, + "loss": 0.0969, + "num_input_tokens_seen": 36566064, + "step": 16935 + }, + { + "epoch": 2.763458401305057, + "grad_norm": 0.09463348984718323, + "learning_rate": 0.000995569655654733, + "loss": 0.0997, + "num_input_tokens_seen": 36576528, + "step": 16940 + }, + { + "epoch": 2.764274061990212, + "grad_norm": 0.022959791123867035, + "learning_rate": 0.0009955601960699983, + "loss": 0.1026, + "num_input_tokens_seen": 36587856, + "step": 16945 + }, + { + "epoch": 2.7650897226753672, + "grad_norm": 0.05635806545615196, + "learning_rate": 0.0009955507264421079, + "loss": 0.1328, + "num_input_tokens_seen": 36597936, + "step": 16950 + }, + { + "epoch": 2.765905383360522, + "grad_norm": 0.08979813754558563, + "learning_rate": 0.0009955412467712531, + "loss": 0.0814, + "num_input_tokens_seen": 36608688, + "step": 16955 + }, + { + "epoch": 2.766721044045677, + "grad_norm": 0.088919997215271, + "learning_rate": 0.0009955317570576265, + "loss": 0.2369, + "num_input_tokens_seen": 36619824, + "step": 16960 + }, + { + "epoch": 2.767536704730832, + "grad_norm": 0.2753000259399414, + "learning_rate": 0.0009955222573014202, + "loss": 0.1653, + "num_input_tokens_seen": 36630960, + "step": 16965 + }, + { + "epoch": 2.768352365415987, + "grad_norm": 0.0219334177672863, + "learning_rate": 0.0009955127475028266, + "loss": 0.1144, + "num_input_tokens_seen": 36640976, + "step": 16970 + }, + { + "epoch": 2.7691680261011418, + "grad_norm": 0.09595170617103577, + "learning_rate": 0.0009955032276620388, + "loss": 0.1972, + "num_input_tokens_seen": 36651792, + "step": 16975 + }, + { + "epoch": 2.769983686786297, + "grad_norm": 0.07782889157533646, + "learning_rate": 0.0009954936977792492, + "loss": 0.0547, + "num_input_tokens_seen": 36662896, + "step": 16980 + }, + { + "epoch": 2.770799347471452, + "grad_norm": 0.20237375795841217, + "learning_rate": 0.0009954841578546515, + "loss": 0.1596, + "num_input_tokens_seen": 36673040, + "step": 16985 + }, + { + "epoch": 2.7716150081566067, + "grad_norm": 0.16264452040195465, + "learning_rate": 0.0009954746078884387, + "loss": 0.2101, + "num_input_tokens_seen": 36684464, + "step": 16990 + }, + { + "epoch": 2.772430668841762, + "grad_norm": 0.09795284271240234, + "learning_rate": 0.0009954650478808042, + "loss": 0.1346, + "num_input_tokens_seen": 36696368, + "step": 16995 + }, + { + "epoch": 2.7732463295269167, + "grad_norm": 0.024558700621128082, + "learning_rate": 0.0009954554778319423, + "loss": 0.0879, + "num_input_tokens_seen": 36707600, + "step": 17000 + }, + { + "epoch": 2.774061990212072, + "grad_norm": 0.10869482159614563, + "learning_rate": 0.0009954458977420465, + "loss": 0.1708, + "num_input_tokens_seen": 36718992, + "step": 17005 + }, + { + "epoch": 2.774877650897227, + "grad_norm": 0.018605228513479233, + "learning_rate": 0.000995436307611311, + "loss": 0.0794, + "num_input_tokens_seen": 36729840, + "step": 17010 + }, + { + "epoch": 2.7756933115823816, + "grad_norm": 0.04528221860527992, + "learning_rate": 0.0009954267074399302, + "loss": 0.0904, + "num_input_tokens_seen": 36738672, + "step": 17015 + }, + { + "epoch": 2.7765089722675365, + "grad_norm": 0.0910518541932106, + "learning_rate": 0.0009954170972280988, + "loss": 0.0636, + "num_input_tokens_seen": 36749104, + "step": 17020 + }, + { + "epoch": 2.7773246329526917, + "grad_norm": 0.024681704118847847, + "learning_rate": 0.0009954074769760112, + "loss": 0.0755, + "num_input_tokens_seen": 36760560, + "step": 17025 + }, + { + "epoch": 2.7781402936378465, + "grad_norm": 0.0610785074532032, + "learning_rate": 0.0009953978466838629, + "loss": 0.1034, + "num_input_tokens_seen": 36771728, + "step": 17030 + }, + { + "epoch": 2.778955954323002, + "grad_norm": 0.3423584997653961, + "learning_rate": 0.0009953882063518486, + "loss": 0.1557, + "num_input_tokens_seen": 36783472, + "step": 17035 + }, + { + "epoch": 2.7797716150081566, + "grad_norm": 0.05371207743883133, + "learning_rate": 0.000995378555980164, + "loss": 0.21, + "num_input_tokens_seen": 36794672, + "step": 17040 + }, + { + "epoch": 2.7805872756933114, + "grad_norm": 0.010864299722015858, + "learning_rate": 0.0009953688955690045, + "loss": 0.1129, + "num_input_tokens_seen": 36805840, + "step": 17045 + }, + { + "epoch": 2.7814029363784667, + "grad_norm": 0.1616959273815155, + "learning_rate": 0.0009953592251185658, + "loss": 0.0768, + "num_input_tokens_seen": 36816368, + "step": 17050 + }, + { + "epoch": 2.7822185970636215, + "grad_norm": 0.17181627452373505, + "learning_rate": 0.000995349544629044, + "loss": 0.0974, + "num_input_tokens_seen": 36827920, + "step": 17055 + }, + { + "epoch": 2.7830342577487768, + "grad_norm": 0.2582882046699524, + "learning_rate": 0.0009953398541006353, + "loss": 0.2697, + "num_input_tokens_seen": 36839312, + "step": 17060 + }, + { + "epoch": 2.7838499184339316, + "grad_norm": 0.09182535111904144, + "learning_rate": 0.0009953301535335361, + "loss": 0.1056, + "num_input_tokens_seen": 36849264, + "step": 17065 + }, + { + "epoch": 2.7846655791190864, + "grad_norm": 0.07948463410139084, + "learning_rate": 0.000995320442927943, + "loss": 0.1619, + "num_input_tokens_seen": 36859664, + "step": 17070 + }, + { + "epoch": 2.7854812398042412, + "grad_norm": 0.039482396095991135, + "learning_rate": 0.0009953107222840528, + "loss": 0.0612, + "num_input_tokens_seen": 36869776, + "step": 17075 + }, + { + "epoch": 2.7862969004893965, + "grad_norm": 0.1092955619096756, + "learning_rate": 0.0009953009916020624, + "loss": 0.1102, + "num_input_tokens_seen": 36880912, + "step": 17080 + }, + { + "epoch": 2.7871125611745513, + "grad_norm": 0.141911119222641, + "learning_rate": 0.0009952912508821691, + "loss": 0.2338, + "num_input_tokens_seen": 36891056, + "step": 17085 + }, + { + "epoch": 2.7879282218597066, + "grad_norm": 0.03731909021735191, + "learning_rate": 0.0009952815001245702, + "loss": 0.1015, + "num_input_tokens_seen": 36901456, + "step": 17090 + }, + { + "epoch": 2.7887438825448614, + "grad_norm": 0.013447783887386322, + "learning_rate": 0.0009952717393294636, + "loss": 0.0812, + "num_input_tokens_seen": 36912336, + "step": 17095 + }, + { + "epoch": 2.789559543230016, + "grad_norm": 0.010318592190742493, + "learning_rate": 0.0009952619684970468, + "loss": 0.0608, + "num_input_tokens_seen": 36921456, + "step": 17100 + }, + { + "epoch": 2.790375203915171, + "grad_norm": 0.04862210899591446, + "learning_rate": 0.0009952521876275178, + "loss": 0.0812, + "num_input_tokens_seen": 36932272, + "step": 17105 + }, + { + "epoch": 2.7911908646003263, + "grad_norm": 0.03783220797777176, + "learning_rate": 0.0009952423967210752, + "loss": 0.1264, + "num_input_tokens_seen": 36942640, + "step": 17110 + }, + { + "epoch": 2.792006525285481, + "grad_norm": 0.1405632495880127, + "learning_rate": 0.0009952325957779168, + "loss": 0.1583, + "num_input_tokens_seen": 36954192, + "step": 17115 + }, + { + "epoch": 2.7928221859706364, + "grad_norm": 0.05524764582514763, + "learning_rate": 0.0009952227847982418, + "loss": 0.0944, + "num_input_tokens_seen": 36964880, + "step": 17120 + }, + { + "epoch": 2.793637846655791, + "grad_norm": 0.11905545741319656, + "learning_rate": 0.000995212963782249, + "loss": 0.1299, + "num_input_tokens_seen": 36976112, + "step": 17125 + }, + { + "epoch": 2.794453507340946, + "grad_norm": 0.009620288386940956, + "learning_rate": 0.000995203132730137, + "loss": 0.072, + "num_input_tokens_seen": 36985232, + "step": 17130 + }, + { + "epoch": 2.7952691680261013, + "grad_norm": 0.060959186404943466, + "learning_rate": 0.0009951932916421053, + "loss": 0.0438, + "num_input_tokens_seen": 36995376, + "step": 17135 + }, + { + "epoch": 2.796084828711256, + "grad_norm": 0.052504248917102814, + "learning_rate": 0.0009951834405183535, + "loss": 0.0797, + "num_input_tokens_seen": 37006576, + "step": 17140 + }, + { + "epoch": 2.7969004893964113, + "grad_norm": 0.16311654448509216, + "learning_rate": 0.0009951735793590811, + "loss": 0.1691, + "num_input_tokens_seen": 37017264, + "step": 17145 + }, + { + "epoch": 2.797716150081566, + "grad_norm": 0.026551628485322, + "learning_rate": 0.0009951637081644879, + "loss": 0.0194, + "num_input_tokens_seen": 37027472, + "step": 17150 + }, + { + "epoch": 2.798531810766721, + "grad_norm": 0.26946067810058594, + "learning_rate": 0.000995153826934774, + "loss": 0.0897, + "num_input_tokens_seen": 37038768, + "step": 17155 + }, + { + "epoch": 2.799347471451876, + "grad_norm": 0.04951508715748787, + "learning_rate": 0.0009951439356701394, + "loss": 0.0899, + "num_input_tokens_seen": 37049424, + "step": 17160 + }, + { + "epoch": 2.800163132137031, + "grad_norm": 0.518153727054596, + "learning_rate": 0.0009951340343707852, + "loss": 0.2822, + "num_input_tokens_seen": 37060912, + "step": 17165 + }, + { + "epoch": 2.800978792822186, + "grad_norm": 0.27946290373802185, + "learning_rate": 0.0009951241230369114, + "loss": 0.2128, + "num_input_tokens_seen": 37070032, + "step": 17170 + }, + { + "epoch": 2.801794453507341, + "grad_norm": 0.15536077320575714, + "learning_rate": 0.0009951142016687193, + "loss": 0.0987, + "num_input_tokens_seen": 37079792, + "step": 17175 + }, + { + "epoch": 2.802610114192496, + "grad_norm": 0.0241768229752779, + "learning_rate": 0.0009951042702664099, + "loss": 0.0465, + "num_input_tokens_seen": 37091344, + "step": 17180 + }, + { + "epoch": 2.8034257748776508, + "grad_norm": 0.11844358593225479, + "learning_rate": 0.0009950943288301842, + "loss": 0.095, + "num_input_tokens_seen": 37102800, + "step": 17185 + }, + { + "epoch": 2.804241435562806, + "grad_norm": 0.07570809870958328, + "learning_rate": 0.0009950843773602438, + "loss": 0.1378, + "num_input_tokens_seen": 37114512, + "step": 17190 + }, + { + "epoch": 2.805057096247961, + "grad_norm": 0.1600935459136963, + "learning_rate": 0.0009950744158567905, + "loss": 0.1875, + "num_input_tokens_seen": 37125232, + "step": 17195 + }, + { + "epoch": 2.8058727569331157, + "grad_norm": 0.13964888453483582, + "learning_rate": 0.0009950644443200262, + "loss": 0.1014, + "num_input_tokens_seen": 37136656, + "step": 17200 + }, + { + "epoch": 2.806688417618271, + "grad_norm": 0.027238652110099792, + "learning_rate": 0.0009950544627501529, + "loss": 0.1251, + "num_input_tokens_seen": 37148112, + "step": 17205 + }, + { + "epoch": 2.8075040783034257, + "grad_norm": 0.02390744350850582, + "learning_rate": 0.0009950444711473727, + "loss": 0.0724, + "num_input_tokens_seen": 37160208, + "step": 17210 + }, + { + "epoch": 2.8083197389885806, + "grad_norm": 0.023210374638438225, + "learning_rate": 0.0009950344695118885, + "loss": 0.0451, + "num_input_tokens_seen": 37170288, + "step": 17215 + }, + { + "epoch": 2.809135399673736, + "grad_norm": 0.1044159084558487, + "learning_rate": 0.0009950244578439027, + "loss": 0.0659, + "num_input_tokens_seen": 37179408, + "step": 17220 + }, + { + "epoch": 2.8099510603588906, + "grad_norm": 0.005428074859082699, + "learning_rate": 0.0009950144361436182, + "loss": 0.0717, + "num_input_tokens_seen": 37191472, + "step": 17225 + }, + { + "epoch": 2.810766721044046, + "grad_norm": 0.1078447699546814, + "learning_rate": 0.0009950044044112383, + "loss": 0.0678, + "num_input_tokens_seen": 37200528, + "step": 17230 + }, + { + "epoch": 2.8115823817292007, + "grad_norm": 0.03558633476495743, + "learning_rate": 0.000994994362646966, + "loss": 0.065, + "num_input_tokens_seen": 37210960, + "step": 17235 + }, + { + "epoch": 2.8123980424143555, + "grad_norm": 0.00993434339761734, + "learning_rate": 0.0009949843108510053, + "loss": 0.1409, + "num_input_tokens_seen": 37222192, + "step": 17240 + }, + { + "epoch": 2.8132137030995104, + "grad_norm": 0.05124455317854881, + "learning_rate": 0.0009949742490235594, + "loss": 0.1151, + "num_input_tokens_seen": 37234064, + "step": 17245 + }, + { + "epoch": 2.8140293637846656, + "grad_norm": 0.03307792916893959, + "learning_rate": 0.0009949641771648324, + "loss": 0.2606, + "num_input_tokens_seen": 37245968, + "step": 17250 + }, + { + "epoch": 2.8148450244698204, + "grad_norm": 0.17549562454223633, + "learning_rate": 0.0009949540952750285, + "loss": 0.222, + "num_input_tokens_seen": 37256432, + "step": 17255 + }, + { + "epoch": 2.8156606851549757, + "grad_norm": 0.2019302099943161, + "learning_rate": 0.000994944003354352, + "loss": 0.152, + "num_input_tokens_seen": 37267312, + "step": 17260 + }, + { + "epoch": 2.8164763458401305, + "grad_norm": 0.04240552708506584, + "learning_rate": 0.0009949339014030075, + "loss": 0.0822, + "num_input_tokens_seen": 37278256, + "step": 17265 + }, + { + "epoch": 2.8172920065252853, + "grad_norm": 0.04113017022609711, + "learning_rate": 0.0009949237894211994, + "loss": 0.107, + "num_input_tokens_seen": 37289968, + "step": 17270 + }, + { + "epoch": 2.8181076672104406, + "grad_norm": 0.024106428027153015, + "learning_rate": 0.000994913667409133, + "loss": 0.1159, + "num_input_tokens_seen": 37301072, + "step": 17275 + }, + { + "epoch": 2.8189233278955954, + "grad_norm": 0.29162514209747314, + "learning_rate": 0.0009949035353670132, + "loss": 0.211, + "num_input_tokens_seen": 37311664, + "step": 17280 + }, + { + "epoch": 2.8197389885807507, + "grad_norm": 0.2500517964363098, + "learning_rate": 0.0009948933932950456, + "loss": 0.124, + "num_input_tokens_seen": 37323120, + "step": 17285 + }, + { + "epoch": 2.8205546492659055, + "grad_norm": 0.02582480199635029, + "learning_rate": 0.0009948832411934352, + "loss": 0.0815, + "num_input_tokens_seen": 37334672, + "step": 17290 + }, + { + "epoch": 2.8213703099510603, + "grad_norm": 0.009961195290088654, + "learning_rate": 0.0009948730790623884, + "loss": 0.1014, + "num_input_tokens_seen": 37346384, + "step": 17295 + }, + { + "epoch": 2.822185970636215, + "grad_norm": 0.01918845996260643, + "learning_rate": 0.0009948629069021107, + "loss": 0.0282, + "num_input_tokens_seen": 37358256, + "step": 17300 + }, + { + "epoch": 2.8230016313213704, + "grad_norm": 0.26306575536727905, + "learning_rate": 0.0009948527247128085, + "loss": 0.1609, + "num_input_tokens_seen": 37369968, + "step": 17305 + }, + { + "epoch": 2.823817292006525, + "grad_norm": 0.03258789703249931, + "learning_rate": 0.0009948425324946882, + "loss": 0.148, + "num_input_tokens_seen": 37380432, + "step": 17310 + }, + { + "epoch": 2.8246329526916805, + "grad_norm": 0.16268616914749146, + "learning_rate": 0.0009948323302479561, + "loss": 0.2848, + "num_input_tokens_seen": 37390480, + "step": 17315 + }, + { + "epoch": 2.8254486133768353, + "grad_norm": 0.04954542964696884, + "learning_rate": 0.000994822117972819, + "loss": 0.098, + "num_input_tokens_seen": 37402608, + "step": 17320 + }, + { + "epoch": 2.82626427406199, + "grad_norm": 0.08560967445373535, + "learning_rate": 0.000994811895669484, + "loss": 0.0905, + "num_input_tokens_seen": 37413936, + "step": 17325 + }, + { + "epoch": 2.827079934747145, + "grad_norm": 0.012213559821248055, + "learning_rate": 0.0009948016633381583, + "loss": 0.0624, + "num_input_tokens_seen": 37426480, + "step": 17330 + }, + { + "epoch": 2.8278955954323, + "grad_norm": 0.10586144030094147, + "learning_rate": 0.0009947914209790492, + "loss": 0.1401, + "num_input_tokens_seen": 37438224, + "step": 17335 + }, + { + "epoch": 2.828711256117455, + "grad_norm": 0.024381063878536224, + "learning_rate": 0.0009947811685923642, + "loss": 0.0371, + "num_input_tokens_seen": 37448176, + "step": 17340 + }, + { + "epoch": 2.8295269168026103, + "grad_norm": 0.10891689360141754, + "learning_rate": 0.0009947709061783113, + "loss": 0.1128, + "num_input_tokens_seen": 37458672, + "step": 17345 + }, + { + "epoch": 2.830342577487765, + "grad_norm": 0.01587914675474167, + "learning_rate": 0.000994760633737098, + "loss": 0.2115, + "num_input_tokens_seen": 37469840, + "step": 17350 + }, + { + "epoch": 2.83115823817292, + "grad_norm": 0.08119525015354156, + "learning_rate": 0.0009947503512689332, + "loss": 0.1747, + "num_input_tokens_seen": 37479376, + "step": 17355 + }, + { + "epoch": 2.831973898858075, + "grad_norm": 0.1989976018667221, + "learning_rate": 0.0009947400587740245, + "loss": 0.2034, + "num_input_tokens_seen": 37489392, + "step": 17360 + }, + { + "epoch": 2.83278955954323, + "grad_norm": 0.033615998923778534, + "learning_rate": 0.0009947297562525811, + "loss": 0.1102, + "num_input_tokens_seen": 37499504, + "step": 17365 + }, + { + "epoch": 2.8336052202283852, + "grad_norm": 0.04159093275666237, + "learning_rate": 0.0009947194437048116, + "loss": 0.1535, + "num_input_tokens_seen": 37510512, + "step": 17370 + }, + { + "epoch": 2.83442088091354, + "grad_norm": 0.03205300495028496, + "learning_rate": 0.000994709121130925, + "loss": 0.1371, + "num_input_tokens_seen": 37521424, + "step": 17375 + }, + { + "epoch": 2.835236541598695, + "grad_norm": 0.12948866188526154, + "learning_rate": 0.0009946987885311304, + "loss": 0.2229, + "num_input_tokens_seen": 37532976, + "step": 17380 + }, + { + "epoch": 2.8360522022838497, + "grad_norm": 0.037492360919713974, + "learning_rate": 0.0009946884459056374, + "loss": 0.211, + "num_input_tokens_seen": 37543504, + "step": 17385 + }, + { + "epoch": 2.836867862969005, + "grad_norm": 0.023258114233613014, + "learning_rate": 0.0009946780932546552, + "loss": 0.1333, + "num_input_tokens_seen": 37554000, + "step": 17390 + }, + { + "epoch": 2.8376835236541598, + "grad_norm": 0.09578645974397659, + "learning_rate": 0.0009946677305783943, + "loss": 0.067, + "num_input_tokens_seen": 37565360, + "step": 17395 + }, + { + "epoch": 2.838499184339315, + "grad_norm": 0.015446759760379791, + "learning_rate": 0.000994657357877064, + "loss": 0.1953, + "num_input_tokens_seen": 37576368, + "step": 17400 + }, + { + "epoch": 2.83931484502447, + "grad_norm": 0.01721014827489853, + "learning_rate": 0.0009946469751508748, + "loss": 0.1332, + "num_input_tokens_seen": 37587664, + "step": 17405 + }, + { + "epoch": 2.8401305057096247, + "grad_norm": 0.07589091360569, + "learning_rate": 0.0009946365824000374, + "loss": 0.1953, + "num_input_tokens_seen": 37598256, + "step": 17410 + }, + { + "epoch": 2.84094616639478, + "grad_norm": 0.02403646521270275, + "learning_rate": 0.000994626179624762, + "loss": 0.1988, + "num_input_tokens_seen": 37608976, + "step": 17415 + }, + { + "epoch": 2.8417618270799347, + "grad_norm": 0.1484004706144333, + "learning_rate": 0.0009946157668252597, + "loss": 0.1761, + "num_input_tokens_seen": 37618352, + "step": 17420 + }, + { + "epoch": 2.8425774877650896, + "grad_norm": 0.058430951088666916, + "learning_rate": 0.0009946053440017413, + "loss": 0.1914, + "num_input_tokens_seen": 37628816, + "step": 17425 + }, + { + "epoch": 2.843393148450245, + "grad_norm": 0.05371273681521416, + "learning_rate": 0.000994594911154418, + "loss": 0.0749, + "num_input_tokens_seen": 37639824, + "step": 17430 + }, + { + "epoch": 2.8442088091353996, + "grad_norm": 0.06556174904108047, + "learning_rate": 0.0009945844682835018, + "loss": 0.0909, + "num_input_tokens_seen": 37651504, + "step": 17435 + }, + { + "epoch": 2.8450244698205545, + "grad_norm": 0.022147638723254204, + "learning_rate": 0.0009945740153892036, + "loss": 0.0776, + "num_input_tokens_seen": 37662992, + "step": 17440 + }, + { + "epoch": 2.8458401305057097, + "grad_norm": 0.034693729132413864, + "learning_rate": 0.0009945635524717359, + "loss": 0.0564, + "num_input_tokens_seen": 37673904, + "step": 17445 + }, + { + "epoch": 2.8466557911908645, + "grad_norm": 0.0689237043261528, + "learning_rate": 0.00099455307953131, + "loss": 0.0801, + "num_input_tokens_seen": 37684368, + "step": 17450 + }, + { + "epoch": 2.84747145187602, + "grad_norm": 0.06305966526269913, + "learning_rate": 0.0009945425965681388, + "loss": 0.0545, + "num_input_tokens_seen": 37694480, + "step": 17455 + }, + { + "epoch": 2.8482871125611746, + "grad_norm": 0.004915526602417231, + "learning_rate": 0.0009945321035824343, + "loss": 0.0284, + "num_input_tokens_seen": 37705040, + "step": 17460 + }, + { + "epoch": 2.8491027732463294, + "grad_norm": 0.13261525332927704, + "learning_rate": 0.0009945216005744096, + "loss": 0.2728, + "num_input_tokens_seen": 37716304, + "step": 17465 + }, + { + "epoch": 2.8499184339314843, + "grad_norm": 0.03296510502696037, + "learning_rate": 0.0009945110875442774, + "loss": 0.0562, + "num_input_tokens_seen": 37726352, + "step": 17470 + }, + { + "epoch": 2.8507340946166395, + "grad_norm": 0.0632663443684578, + "learning_rate": 0.0009945005644922504, + "loss": 0.0362, + "num_input_tokens_seen": 37736048, + "step": 17475 + }, + { + "epoch": 2.8515497553017943, + "grad_norm": 0.2271157056093216, + "learning_rate": 0.0009944900314185422, + "loss": 0.1657, + "num_input_tokens_seen": 37746704, + "step": 17480 + }, + { + "epoch": 2.8523654159869496, + "grad_norm": 0.018760349601507187, + "learning_rate": 0.0009944794883233663, + "loss": 0.0334, + "num_input_tokens_seen": 37758000, + "step": 17485 + }, + { + "epoch": 2.8531810766721044, + "grad_norm": 0.010596570558845997, + "learning_rate": 0.0009944689352069363, + "loss": 0.2656, + "num_input_tokens_seen": 37768240, + "step": 17490 + }, + { + "epoch": 2.8539967373572592, + "grad_norm": 0.03754739090800285, + "learning_rate": 0.000994458372069466, + "loss": 0.1724, + "num_input_tokens_seen": 37778576, + "step": 17495 + }, + { + "epoch": 2.8548123980424145, + "grad_norm": 0.04009263217449188, + "learning_rate": 0.0009944477989111695, + "loss": 0.0655, + "num_input_tokens_seen": 37788880, + "step": 17500 + }, + { + "epoch": 2.8556280587275693, + "grad_norm": 0.013271852396428585, + "learning_rate": 0.0009944372157322612, + "loss": 0.0771, + "num_input_tokens_seen": 37799856, + "step": 17505 + }, + { + "epoch": 2.8564437194127246, + "grad_norm": 0.05043952539563179, + "learning_rate": 0.0009944266225329552, + "loss": 0.152, + "num_input_tokens_seen": 37809808, + "step": 17510 + }, + { + "epoch": 2.8572593800978794, + "grad_norm": 0.18690840899944305, + "learning_rate": 0.0009944160193134668, + "loss": 0.2086, + "num_input_tokens_seen": 37819856, + "step": 17515 + }, + { + "epoch": 2.858075040783034, + "grad_norm": 0.0931902527809143, + "learning_rate": 0.0009944054060740104, + "loss": 0.1752, + "num_input_tokens_seen": 37831792, + "step": 17520 + }, + { + "epoch": 2.858890701468189, + "grad_norm": 0.1438274085521698, + "learning_rate": 0.0009943947828148013, + "loss": 0.0776, + "num_input_tokens_seen": 37843280, + "step": 17525 + }, + { + "epoch": 2.8597063621533443, + "grad_norm": 0.08573637157678604, + "learning_rate": 0.0009943841495360546, + "loss": 0.0937, + "num_input_tokens_seen": 37854928, + "step": 17530 + }, + { + "epoch": 2.860522022838499, + "grad_norm": 0.04021572321653366, + "learning_rate": 0.0009943735062379862, + "loss": 0.1281, + "num_input_tokens_seen": 37865424, + "step": 17535 + }, + { + "epoch": 2.8613376835236544, + "grad_norm": 0.1548701524734497, + "learning_rate": 0.0009943628529208114, + "loss": 0.1288, + "num_input_tokens_seen": 37876624, + "step": 17540 + }, + { + "epoch": 2.862153344208809, + "grad_norm": 0.11105731129646301, + "learning_rate": 0.0009943521895847461, + "loss": 0.1222, + "num_input_tokens_seen": 37886896, + "step": 17545 + }, + { + "epoch": 2.862969004893964, + "grad_norm": 0.05588413402438164, + "learning_rate": 0.0009943415162300066, + "loss": 0.2577, + "num_input_tokens_seen": 37897424, + "step": 17550 + }, + { + "epoch": 2.863784665579119, + "grad_norm": 0.01313700620085001, + "learning_rate": 0.0009943308328568094, + "loss": 0.1002, + "num_input_tokens_seen": 37908080, + "step": 17555 + }, + { + "epoch": 2.864600326264274, + "grad_norm": 0.04249060899019241, + "learning_rate": 0.0009943201394653706, + "loss": 0.059, + "num_input_tokens_seen": 37918768, + "step": 17560 + }, + { + "epoch": 2.865415986949429, + "grad_norm": 0.04770621284842491, + "learning_rate": 0.0009943094360559072, + "loss": 0.167, + "num_input_tokens_seen": 37929488, + "step": 17565 + }, + { + "epoch": 2.866231647634584, + "grad_norm": 0.01963675208389759, + "learning_rate": 0.0009942987226286358, + "loss": 0.1489, + "num_input_tokens_seen": 37940560, + "step": 17570 + }, + { + "epoch": 2.867047308319739, + "grad_norm": 0.0744885578751564, + "learning_rate": 0.0009942879991837739, + "loss": 0.1813, + "num_input_tokens_seen": 37951248, + "step": 17575 + }, + { + "epoch": 2.867862969004894, + "grad_norm": 0.00410545663908124, + "learning_rate": 0.0009942772657215385, + "loss": 0.0688, + "num_input_tokens_seen": 37961936, + "step": 17580 + }, + { + "epoch": 2.868678629690049, + "grad_norm": 0.15364985167980194, + "learning_rate": 0.0009942665222421475, + "loss": 0.1599, + "num_input_tokens_seen": 37972560, + "step": 17585 + }, + { + "epoch": 2.869494290375204, + "grad_norm": 0.02903786487877369, + "learning_rate": 0.0009942557687458182, + "loss": 0.099, + "num_input_tokens_seen": 37984016, + "step": 17590 + }, + { + "epoch": 2.870309951060359, + "grad_norm": 0.020804740488529205, + "learning_rate": 0.0009942450052327688, + "loss": 0.084, + "num_input_tokens_seen": 37994896, + "step": 17595 + }, + { + "epoch": 2.871125611745514, + "grad_norm": 0.06187284365296364, + "learning_rate": 0.0009942342317032172, + "loss": 0.0964, + "num_input_tokens_seen": 38006832, + "step": 17600 + }, + { + "epoch": 2.8719412724306688, + "grad_norm": 0.2258651852607727, + "learning_rate": 0.000994223448157382, + "loss": 0.1151, + "num_input_tokens_seen": 38018832, + "step": 17605 + }, + { + "epoch": 2.8727569331158236, + "grad_norm": 0.007890415377914906, + "learning_rate": 0.000994212654595482, + "loss": 0.1937, + "num_input_tokens_seen": 38029520, + "step": 17610 + }, + { + "epoch": 2.873572593800979, + "grad_norm": 0.031123114749789238, + "learning_rate": 0.0009942018510177351, + "loss": 0.0696, + "num_input_tokens_seen": 38039568, + "step": 17615 + }, + { + "epoch": 2.8743882544861337, + "grad_norm": 0.011710697785019875, + "learning_rate": 0.000994191037424361, + "loss": 0.1397, + "num_input_tokens_seen": 38051344, + "step": 17620 + }, + { + "epoch": 2.875203915171289, + "grad_norm": 0.08112957328557968, + "learning_rate": 0.0009941802138155786, + "loss": 0.0636, + "num_input_tokens_seen": 38062448, + "step": 17625 + }, + { + "epoch": 2.8760195758564437, + "grad_norm": 0.02962901070713997, + "learning_rate": 0.0009941693801916074, + "loss": 0.0348, + "num_input_tokens_seen": 38073520, + "step": 17630 + }, + { + "epoch": 2.8768352365415986, + "grad_norm": 0.1084759309887886, + "learning_rate": 0.0009941585365526666, + "loss": 0.1821, + "num_input_tokens_seen": 38084400, + "step": 17635 + }, + { + "epoch": 2.877650897226754, + "grad_norm": 0.0017527119489386678, + "learning_rate": 0.0009941476828989762, + "loss": 0.2174, + "num_input_tokens_seen": 38094288, + "step": 17640 + }, + { + "epoch": 2.8784665579119086, + "grad_norm": 0.02333504520356655, + "learning_rate": 0.0009941368192307562, + "loss": 0.0516, + "num_input_tokens_seen": 38104752, + "step": 17645 + }, + { + "epoch": 2.8792822185970635, + "grad_norm": 0.042213909327983856, + "learning_rate": 0.0009941259455482267, + "loss": 0.0674, + "num_input_tokens_seen": 38113456, + "step": 17650 + }, + { + "epoch": 2.8800978792822187, + "grad_norm": 0.07142660021781921, + "learning_rate": 0.0009941150618516079, + "loss": 0.1091, + "num_input_tokens_seen": 38123888, + "step": 17655 + }, + { + "epoch": 2.8809135399673735, + "grad_norm": 0.13434121012687683, + "learning_rate": 0.0009941041681411206, + "loss": 0.0775, + "num_input_tokens_seen": 38134448, + "step": 17660 + }, + { + "epoch": 2.8817292006525284, + "grad_norm": 0.28143176436424255, + "learning_rate": 0.0009940932644169858, + "loss": 0.3521, + "num_input_tokens_seen": 38145104, + "step": 17665 + }, + { + "epoch": 2.8825448613376836, + "grad_norm": 0.16651491820812225, + "learning_rate": 0.000994082350679424, + "loss": 0.3099, + "num_input_tokens_seen": 38155984, + "step": 17670 + }, + { + "epoch": 2.8833605220228384, + "grad_norm": 0.14934590458869934, + "learning_rate": 0.0009940714269286565, + "loss": 0.1476, + "num_input_tokens_seen": 38167408, + "step": 17675 + }, + { + "epoch": 2.8841761827079937, + "grad_norm": 0.10967616736888885, + "learning_rate": 0.000994060493164905, + "loss": 0.0802, + "num_input_tokens_seen": 38178096, + "step": 17680 + }, + { + "epoch": 2.8849918433931485, + "grad_norm": 0.019994784146547318, + "learning_rate": 0.0009940495493883906, + "loss": 0.0393, + "num_input_tokens_seen": 38188304, + "step": 17685 + }, + { + "epoch": 2.8858075040783033, + "grad_norm": 0.11474478989839554, + "learning_rate": 0.0009940385955993353, + "loss": 0.0671, + "num_input_tokens_seen": 38198512, + "step": 17690 + }, + { + "epoch": 2.886623164763458, + "grad_norm": 0.009458590298891068, + "learning_rate": 0.0009940276317979611, + "loss": 0.1143, + "num_input_tokens_seen": 38209840, + "step": 17695 + }, + { + "epoch": 2.8874388254486134, + "grad_norm": 0.05440307408571243, + "learning_rate": 0.0009940166579844906, + "loss": 0.1404, + "num_input_tokens_seen": 38219760, + "step": 17700 + }, + { + "epoch": 2.8882544861337682, + "grad_norm": 0.028910527005791664, + "learning_rate": 0.0009940056741591455, + "loss": 0.0457, + "num_input_tokens_seen": 38231376, + "step": 17705 + }, + { + "epoch": 2.8890701468189235, + "grad_norm": 0.15710993111133575, + "learning_rate": 0.0009939946803221487, + "loss": 0.1914, + "num_input_tokens_seen": 38242096, + "step": 17710 + }, + { + "epoch": 2.8898858075040783, + "grad_norm": 0.009441476315259933, + "learning_rate": 0.000993983676473723, + "loss": 0.0695, + "num_input_tokens_seen": 38252624, + "step": 17715 + }, + { + "epoch": 2.890701468189233, + "grad_norm": 0.05692329630255699, + "learning_rate": 0.0009939726626140917, + "loss": 0.0731, + "num_input_tokens_seen": 38262864, + "step": 17720 + }, + { + "epoch": 2.8915171288743884, + "grad_norm": 0.011534065939486027, + "learning_rate": 0.0009939616387434776, + "loss": 0.0265, + "num_input_tokens_seen": 38274640, + "step": 17725 + }, + { + "epoch": 2.892332789559543, + "grad_norm": 0.012839607894420624, + "learning_rate": 0.0009939506048621044, + "loss": 0.0242, + "num_input_tokens_seen": 38286512, + "step": 17730 + }, + { + "epoch": 2.8931484502446985, + "grad_norm": 0.005959564354270697, + "learning_rate": 0.0009939395609701953, + "loss": 0.0599, + "num_input_tokens_seen": 38297456, + "step": 17735 + }, + { + "epoch": 2.8939641109298533, + "grad_norm": 0.040610216557979584, + "learning_rate": 0.0009939285070679745, + "loss": 0.1036, + "num_input_tokens_seen": 38308304, + "step": 17740 + }, + { + "epoch": 2.894779771615008, + "grad_norm": 0.0968034490942955, + "learning_rate": 0.000993917443155666, + "loss": 0.2373, + "num_input_tokens_seen": 38318640, + "step": 17745 + }, + { + "epoch": 2.895595432300163, + "grad_norm": 0.024083608761429787, + "learning_rate": 0.0009939063692334937, + "loss": 0.0412, + "num_input_tokens_seen": 38330320, + "step": 17750 + }, + { + "epoch": 2.896411092985318, + "grad_norm": 0.05791725963354111, + "learning_rate": 0.0009938952853016825, + "loss": 0.0357, + "num_input_tokens_seen": 38340432, + "step": 17755 + }, + { + "epoch": 2.897226753670473, + "grad_norm": 0.06370843946933746, + "learning_rate": 0.0009938841913604568, + "loss": 0.1086, + "num_input_tokens_seen": 38351760, + "step": 17760 + }, + { + "epoch": 2.8980424143556283, + "grad_norm": 0.08623971045017242, + "learning_rate": 0.0009938730874100412, + "loss": 0.0788, + "num_input_tokens_seen": 38361136, + "step": 17765 + }, + { + "epoch": 2.898858075040783, + "grad_norm": 0.005400337744504213, + "learning_rate": 0.0009938619734506612, + "loss": 0.0297, + "num_input_tokens_seen": 38371408, + "step": 17770 + }, + { + "epoch": 2.899673735725938, + "grad_norm": 0.1572704315185547, + "learning_rate": 0.0009938508494825417, + "loss": 0.1891, + "num_input_tokens_seen": 38383056, + "step": 17775 + }, + { + "epoch": 2.9004893964110927, + "grad_norm": 0.01725652813911438, + "learning_rate": 0.0009938397155059083, + "loss": 0.0253, + "num_input_tokens_seen": 38393840, + "step": 17780 + }, + { + "epoch": 2.901305057096248, + "grad_norm": 0.0848039984703064, + "learning_rate": 0.0009938285715209866, + "loss": 0.1063, + "num_input_tokens_seen": 38406096, + "step": 17785 + }, + { + "epoch": 2.902120717781403, + "grad_norm": 0.02380322478711605, + "learning_rate": 0.0009938174175280023, + "loss": 0.1184, + "num_input_tokens_seen": 38416912, + "step": 17790 + }, + { + "epoch": 2.902936378466558, + "grad_norm": 0.18703535199165344, + "learning_rate": 0.0009938062535271817, + "loss": 0.1481, + "num_input_tokens_seen": 38428016, + "step": 17795 + }, + { + "epoch": 2.903752039151713, + "grad_norm": 0.06174721196293831, + "learning_rate": 0.0009937950795187508, + "loss": 0.1208, + "num_input_tokens_seen": 38438480, + "step": 17800 + }, + { + "epoch": 2.9045676998368677, + "grad_norm": 0.21998439729213715, + "learning_rate": 0.0009937838955029362, + "loss": 0.037, + "num_input_tokens_seen": 38448112, + "step": 17805 + }, + { + "epoch": 2.905383360522023, + "grad_norm": 0.10594562441110611, + "learning_rate": 0.0009937727014799646, + "loss": 0.1797, + "num_input_tokens_seen": 38457872, + "step": 17810 + }, + { + "epoch": 2.9061990212071778, + "grad_norm": 0.06243889406323433, + "learning_rate": 0.0009937614974500628, + "loss": 0.1982, + "num_input_tokens_seen": 38469040, + "step": 17815 + }, + { + "epoch": 2.907014681892333, + "grad_norm": 0.2063465565443039, + "learning_rate": 0.000993750283413458, + "loss": 0.2005, + "num_input_tokens_seen": 38479344, + "step": 17820 + }, + { + "epoch": 2.907830342577488, + "grad_norm": 0.16148215532302856, + "learning_rate": 0.0009937390593703773, + "loss": 0.1061, + "num_input_tokens_seen": 38489552, + "step": 17825 + }, + { + "epoch": 2.9086460032626427, + "grad_norm": 0.043657150119543076, + "learning_rate": 0.000993727825321048, + "loss": 0.0919, + "num_input_tokens_seen": 38500752, + "step": 17830 + }, + { + "epoch": 2.9094616639477975, + "grad_norm": 0.4393543303012848, + "learning_rate": 0.0009937165812656983, + "loss": 0.1806, + "num_input_tokens_seen": 38512432, + "step": 17835 + }, + { + "epoch": 2.9102773246329527, + "grad_norm": 0.007073238492012024, + "learning_rate": 0.0009937053272045554, + "loss": 0.1106, + "num_input_tokens_seen": 38522256, + "step": 17840 + }, + { + "epoch": 2.9110929853181076, + "grad_norm": 0.04384694620966911, + "learning_rate": 0.000993694063137848, + "loss": 0.1287, + "num_input_tokens_seen": 38532784, + "step": 17845 + }, + { + "epoch": 2.911908646003263, + "grad_norm": 0.15666642785072327, + "learning_rate": 0.000993682789065804, + "loss": 0.1848, + "num_input_tokens_seen": 38542640, + "step": 17850 + }, + { + "epoch": 2.9127243066884176, + "grad_norm": 0.09653788059949875, + "learning_rate": 0.0009936715049886522, + "loss": 0.1407, + "num_input_tokens_seen": 38553008, + "step": 17855 + }, + { + "epoch": 2.9135399673735725, + "grad_norm": 0.05634286627173424, + "learning_rate": 0.0009936602109066209, + "loss": 0.1972, + "num_input_tokens_seen": 38565200, + "step": 17860 + }, + { + "epoch": 2.9143556280587277, + "grad_norm": 0.05563356727361679, + "learning_rate": 0.0009936489068199392, + "loss": 0.1179, + "num_input_tokens_seen": 38574736, + "step": 17865 + }, + { + "epoch": 2.9151712887438825, + "grad_norm": 0.029675351455807686, + "learning_rate": 0.0009936375927288362, + "loss": 0.0821, + "num_input_tokens_seen": 38584496, + "step": 17870 + }, + { + "epoch": 2.9159869494290374, + "grad_norm": 0.006378476042300463, + "learning_rate": 0.000993626268633541, + "loss": 0.0885, + "num_input_tokens_seen": 38594448, + "step": 17875 + }, + { + "epoch": 2.9168026101141926, + "grad_norm": 0.050347164273262024, + "learning_rate": 0.0009936149345342834, + "loss": 0.0456, + "num_input_tokens_seen": 38604976, + "step": 17880 + }, + { + "epoch": 2.9176182707993474, + "grad_norm": 0.01886771246790886, + "learning_rate": 0.000993603590431293, + "loss": 0.1827, + "num_input_tokens_seen": 38615792, + "step": 17885 + }, + { + "epoch": 2.9184339314845023, + "grad_norm": 0.16464698314666748, + "learning_rate": 0.0009935922363247995, + "loss": 0.1886, + "num_input_tokens_seen": 38626320, + "step": 17890 + }, + { + "epoch": 2.9192495921696575, + "grad_norm": 0.08448215574026108, + "learning_rate": 0.0009935808722150333, + "loss": 0.0977, + "num_input_tokens_seen": 38637008, + "step": 17895 + }, + { + "epoch": 2.9200652528548123, + "grad_norm": 0.04829553887248039, + "learning_rate": 0.0009935694981022245, + "loss": 0.0437, + "num_input_tokens_seen": 38647056, + "step": 17900 + }, + { + "epoch": 2.9208809135399676, + "grad_norm": 0.21752440929412842, + "learning_rate": 0.0009935581139866039, + "loss": 0.3291, + "num_input_tokens_seen": 38658512, + "step": 17905 + }, + { + "epoch": 2.9216965742251224, + "grad_norm": 0.02581077627837658, + "learning_rate": 0.0009935467198684015, + "loss": 0.0563, + "num_input_tokens_seen": 38669328, + "step": 17910 + }, + { + "epoch": 2.9225122349102772, + "grad_norm": 0.040956396609544754, + "learning_rate": 0.0009935353157478493, + "loss": 0.1057, + "num_input_tokens_seen": 38680048, + "step": 17915 + }, + { + "epoch": 2.923327895595432, + "grad_norm": 0.030965333804488182, + "learning_rate": 0.0009935239016251776, + "loss": 0.2133, + "num_input_tokens_seen": 38691952, + "step": 17920 + }, + { + "epoch": 2.9241435562805873, + "grad_norm": 0.047703735530376434, + "learning_rate": 0.0009935124775006178, + "loss": 0.0652, + "num_input_tokens_seen": 38701872, + "step": 17925 + }, + { + "epoch": 2.924959216965742, + "grad_norm": 0.007446065079420805, + "learning_rate": 0.0009935010433744017, + "loss": 0.0437, + "num_input_tokens_seen": 38713104, + "step": 17930 + }, + { + "epoch": 2.9257748776508974, + "grad_norm": 0.018667729571461678, + "learning_rate": 0.000993489599246761, + "loss": 0.1571, + "num_input_tokens_seen": 38723952, + "step": 17935 + }, + { + "epoch": 2.926590538336052, + "grad_norm": 0.029752418398857117, + "learning_rate": 0.0009934781451179273, + "loss": 0.0558, + "num_input_tokens_seen": 38735216, + "step": 17940 + }, + { + "epoch": 2.927406199021207, + "grad_norm": 0.014645390212535858, + "learning_rate": 0.000993466680988133, + "loss": 0.1674, + "num_input_tokens_seen": 38746032, + "step": 17945 + }, + { + "epoch": 2.9282218597063623, + "grad_norm": 0.09012807905673981, + "learning_rate": 0.0009934552068576105, + "loss": 0.117, + "num_input_tokens_seen": 38756464, + "step": 17950 + }, + { + "epoch": 2.929037520391517, + "grad_norm": 0.08319991827011108, + "learning_rate": 0.0009934437227265924, + "loss": 0.2181, + "num_input_tokens_seen": 38767952, + "step": 17955 + }, + { + "epoch": 2.9298531810766724, + "grad_norm": 0.00931257102638483, + "learning_rate": 0.0009934322285953111, + "loss": 0.0565, + "num_input_tokens_seen": 38779312, + "step": 17960 + }, + { + "epoch": 2.930668841761827, + "grad_norm": 0.015849901363253593, + "learning_rate": 0.0009934207244639997, + "loss": 0.0683, + "num_input_tokens_seen": 38790576, + "step": 17965 + }, + { + "epoch": 2.931484502446982, + "grad_norm": 0.21831755340099335, + "learning_rate": 0.0009934092103328915, + "loss": 0.1036, + "num_input_tokens_seen": 38802032, + "step": 17970 + }, + { + "epoch": 2.932300163132137, + "grad_norm": 0.16292500495910645, + "learning_rate": 0.0009933976862022196, + "loss": 0.2263, + "num_input_tokens_seen": 38812080, + "step": 17975 + }, + { + "epoch": 2.933115823817292, + "grad_norm": 0.18336792290210724, + "learning_rate": 0.0009933861520722176, + "loss": 0.2011, + "num_input_tokens_seen": 38823664, + "step": 17980 + }, + { + "epoch": 2.933931484502447, + "grad_norm": 0.1631539911031723, + "learning_rate": 0.0009933746079431195, + "loss": 0.0883, + "num_input_tokens_seen": 38833840, + "step": 17985 + }, + { + "epoch": 2.934747145187602, + "grad_norm": 0.1639021933078766, + "learning_rate": 0.000993363053815159, + "loss": 0.196, + "num_input_tokens_seen": 38844400, + "step": 17990 + }, + { + "epoch": 2.935562805872757, + "grad_norm": 0.03159809857606888, + "learning_rate": 0.0009933514896885705, + "loss": 0.0729, + "num_input_tokens_seen": 38855248, + "step": 17995 + }, + { + "epoch": 2.936378466557912, + "grad_norm": 0.06429016590118408, + "learning_rate": 0.000993339915563588, + "loss": 0.091, + "num_input_tokens_seen": 38867472, + "step": 18000 + }, + { + "epoch": 2.9371941272430666, + "grad_norm": 0.029551563784480095, + "learning_rate": 0.0009933283314404462, + "loss": 0.0591, + "num_input_tokens_seen": 38878864, + "step": 18005 + }, + { + "epoch": 2.938009787928222, + "grad_norm": 0.16410605609416962, + "learning_rate": 0.0009933167373193802, + "loss": 0.1481, + "num_input_tokens_seen": 38890032, + "step": 18010 + }, + { + "epoch": 2.9388254486133767, + "grad_norm": 0.019476020708680153, + "learning_rate": 0.0009933051332006245, + "loss": 0.0389, + "num_input_tokens_seen": 38900624, + "step": 18015 + }, + { + "epoch": 2.939641109298532, + "grad_norm": 0.005929023027420044, + "learning_rate": 0.0009932935190844145, + "loss": 0.0416, + "num_input_tokens_seen": 38911984, + "step": 18020 + }, + { + "epoch": 2.9404567699836868, + "grad_norm": 0.11233475059270859, + "learning_rate": 0.0009932818949709855, + "loss": 0.0637, + "num_input_tokens_seen": 38922896, + "step": 18025 + }, + { + "epoch": 2.9412724306688416, + "grad_norm": 0.36294788122177124, + "learning_rate": 0.0009932702608605733, + "loss": 0.1825, + "num_input_tokens_seen": 38933872, + "step": 18030 + }, + { + "epoch": 2.942088091353997, + "grad_norm": 0.08428234606981277, + "learning_rate": 0.0009932586167534134, + "loss": 0.0424, + "num_input_tokens_seen": 38943088, + "step": 18035 + }, + { + "epoch": 2.9429037520391517, + "grad_norm": 0.1771748960018158, + "learning_rate": 0.0009932469626497418, + "loss": 0.2328, + "num_input_tokens_seen": 38954064, + "step": 18040 + }, + { + "epoch": 2.943719412724307, + "grad_norm": 0.015557125210762024, + "learning_rate": 0.000993235298549795, + "loss": 0.0994, + "num_input_tokens_seen": 38965904, + "step": 18045 + }, + { + "epoch": 2.9445350734094617, + "grad_norm": 0.04112093150615692, + "learning_rate": 0.0009932236244538089, + "loss": 0.1326, + "num_input_tokens_seen": 38976080, + "step": 18050 + }, + { + "epoch": 2.9453507340946166, + "grad_norm": 0.009412819519639015, + "learning_rate": 0.0009932119403620206, + "loss": 0.0523, + "num_input_tokens_seen": 38987344, + "step": 18055 + }, + { + "epoch": 2.9461663947797714, + "grad_norm": 0.09612077474594116, + "learning_rate": 0.0009932002462746665, + "loss": 0.0468, + "num_input_tokens_seen": 38999472, + "step": 18060 + }, + { + "epoch": 2.9469820554649266, + "grad_norm": 0.017046939581632614, + "learning_rate": 0.0009931885421919837, + "loss": 0.1083, + "num_input_tokens_seen": 39010352, + "step": 18065 + }, + { + "epoch": 2.9477977161500815, + "grad_norm": 0.009880765341222286, + "learning_rate": 0.0009931768281142095, + "loss": 0.1375, + "num_input_tokens_seen": 39020816, + "step": 18070 + }, + { + "epoch": 2.9486133768352367, + "grad_norm": 0.07250680774450302, + "learning_rate": 0.0009931651040415812, + "loss": 0.0962, + "num_input_tokens_seen": 39032112, + "step": 18075 + }, + { + "epoch": 2.9494290375203915, + "grad_norm": 0.005578977055847645, + "learning_rate": 0.0009931533699743364, + "loss": 0.0235, + "num_input_tokens_seen": 39043536, + "step": 18080 + }, + { + "epoch": 2.9502446982055464, + "grad_norm": 0.06634651869535446, + "learning_rate": 0.000993141625912713, + "loss": 0.1068, + "num_input_tokens_seen": 39053232, + "step": 18085 + }, + { + "epoch": 2.9510603588907016, + "grad_norm": 0.028256632387638092, + "learning_rate": 0.0009931298718569492, + "loss": 0.0152, + "num_input_tokens_seen": 39064368, + "step": 18090 + }, + { + "epoch": 2.9518760195758564, + "grad_norm": 0.05586544796824455, + "learning_rate": 0.0009931181078072827, + "loss": 0.4254, + "num_input_tokens_seen": 39072560, + "step": 18095 + }, + { + "epoch": 2.9526916802610113, + "grad_norm": 0.006039158441126347, + "learning_rate": 0.0009931063337639521, + "loss": 0.1029, + "num_input_tokens_seen": 39084080, + "step": 18100 + }, + { + "epoch": 2.9535073409461665, + "grad_norm": 0.05745425820350647, + "learning_rate": 0.0009930945497271964, + "loss": 0.073, + "num_input_tokens_seen": 39095056, + "step": 18105 + }, + { + "epoch": 2.9543230016313213, + "grad_norm": 0.025508873164653778, + "learning_rate": 0.0009930827556972539, + "loss": 0.1215, + "num_input_tokens_seen": 39106000, + "step": 18110 + }, + { + "epoch": 2.955138662316476, + "grad_norm": 0.05272262170910835, + "learning_rate": 0.0009930709516743639, + "loss": 0.1989, + "num_input_tokens_seen": 39115760, + "step": 18115 + }, + { + "epoch": 2.9559543230016314, + "grad_norm": 0.248722106218338, + "learning_rate": 0.0009930591376587654, + "loss": 0.1881, + "num_input_tokens_seen": 39126160, + "step": 18120 + }, + { + "epoch": 2.9567699836867862, + "grad_norm": 0.12603089213371277, + "learning_rate": 0.0009930473136506982, + "loss": 0.1281, + "num_input_tokens_seen": 39137264, + "step": 18125 + }, + { + "epoch": 2.9575856443719415, + "grad_norm": 0.15137000381946564, + "learning_rate": 0.0009930354796504018, + "loss": 0.1493, + "num_input_tokens_seen": 39147824, + "step": 18130 + }, + { + "epoch": 2.9584013050570963, + "grad_norm": 0.054597869515419006, + "learning_rate": 0.0009930236356581158, + "loss": 0.1799, + "num_input_tokens_seen": 39158768, + "step": 18135 + }, + { + "epoch": 2.959216965742251, + "grad_norm": 0.02385844849050045, + "learning_rate": 0.0009930117816740803, + "loss": 0.112, + "num_input_tokens_seen": 39168752, + "step": 18140 + }, + { + "epoch": 2.960032626427406, + "grad_norm": 0.051901597529649734, + "learning_rate": 0.0009929999176985355, + "loss": 0.0658, + "num_input_tokens_seen": 39180528, + "step": 18145 + }, + { + "epoch": 2.960848287112561, + "grad_norm": 0.12770681083202362, + "learning_rate": 0.0009929880437317222, + "loss": 0.2286, + "num_input_tokens_seen": 39192336, + "step": 18150 + }, + { + "epoch": 2.961663947797716, + "grad_norm": 0.05842861533164978, + "learning_rate": 0.0009929761597738808, + "loss": 0.1192, + "num_input_tokens_seen": 39203056, + "step": 18155 + }, + { + "epoch": 2.9624796084828713, + "grad_norm": 0.026668380945920944, + "learning_rate": 0.000992964265825252, + "loss": 0.097, + "num_input_tokens_seen": 39213104, + "step": 18160 + }, + { + "epoch": 2.963295269168026, + "grad_norm": 0.11329828947782516, + "learning_rate": 0.0009929523618860772, + "loss": 0.221, + "num_input_tokens_seen": 39223792, + "step": 18165 + }, + { + "epoch": 2.964110929853181, + "grad_norm": 0.1376471370458603, + "learning_rate": 0.000992940447956597, + "loss": 0.096, + "num_input_tokens_seen": 39235248, + "step": 18170 + }, + { + "epoch": 2.964926590538336, + "grad_norm": 0.011270344257354736, + "learning_rate": 0.000992928524037054, + "loss": 0.1048, + "num_input_tokens_seen": 39245424, + "step": 18175 + }, + { + "epoch": 2.965742251223491, + "grad_norm": 0.015286853536963463, + "learning_rate": 0.0009929165901276884, + "loss": 0.146, + "num_input_tokens_seen": 39255344, + "step": 18180 + }, + { + "epoch": 2.9665579119086463, + "grad_norm": 0.119348905980587, + "learning_rate": 0.000992904646228743, + "loss": 0.0673, + "num_input_tokens_seen": 39266192, + "step": 18185 + }, + { + "epoch": 2.967373572593801, + "grad_norm": 0.13732200860977173, + "learning_rate": 0.00099289269234046, + "loss": 0.1813, + "num_input_tokens_seen": 39277040, + "step": 18190 + }, + { + "epoch": 2.968189233278956, + "grad_norm": 0.12898394465446472, + "learning_rate": 0.000992880728463081, + "loss": 0.1576, + "num_input_tokens_seen": 39286160, + "step": 18195 + }, + { + "epoch": 2.9690048939641107, + "grad_norm": 0.028791218996047974, + "learning_rate": 0.0009928687545968486, + "loss": 0.0575, + "num_input_tokens_seen": 39296496, + "step": 18200 + }, + { + "epoch": 2.969820554649266, + "grad_norm": 0.07386782765388489, + "learning_rate": 0.0009928567707420059, + "loss": 0.0676, + "num_input_tokens_seen": 39306672, + "step": 18205 + }, + { + "epoch": 2.970636215334421, + "grad_norm": 0.016163716092705727, + "learning_rate": 0.0009928447768987956, + "loss": 0.0452, + "num_input_tokens_seen": 39316816, + "step": 18210 + }, + { + "epoch": 2.971451876019576, + "grad_norm": 0.025255532935261726, + "learning_rate": 0.0009928327730674604, + "loss": 0.1212, + "num_input_tokens_seen": 39327984, + "step": 18215 + }, + { + "epoch": 2.972267536704731, + "grad_norm": 0.04218817874789238, + "learning_rate": 0.000992820759248244, + "loss": 0.0878, + "num_input_tokens_seen": 39338832, + "step": 18220 + }, + { + "epoch": 2.9730831973898857, + "grad_norm": 0.029554584994912148, + "learning_rate": 0.00099280873544139, + "loss": 0.1389, + "num_input_tokens_seen": 39349104, + "step": 18225 + }, + { + "epoch": 2.9738988580750405, + "grad_norm": 0.16504336893558502, + "learning_rate": 0.0009927967016471414, + "loss": 0.1839, + "num_input_tokens_seen": 39358640, + "step": 18230 + }, + { + "epoch": 2.9747145187601958, + "grad_norm": 0.01982973702251911, + "learning_rate": 0.0009927846578657426, + "loss": 0.0495, + "num_input_tokens_seen": 39368688, + "step": 18235 + }, + { + "epoch": 2.9755301794453506, + "grad_norm": 0.062448952347040176, + "learning_rate": 0.0009927726040974377, + "loss": 0.1365, + "num_input_tokens_seen": 39378224, + "step": 18240 + }, + { + "epoch": 2.976345840130506, + "grad_norm": 0.03732555732131004, + "learning_rate": 0.0009927605403424707, + "loss": 0.0429, + "num_input_tokens_seen": 39388368, + "step": 18245 + }, + { + "epoch": 2.9771615008156607, + "grad_norm": 0.11052993685007095, + "learning_rate": 0.0009927484666010862, + "loss": 0.097, + "num_input_tokens_seen": 39399120, + "step": 18250 + }, + { + "epoch": 2.9779771615008155, + "grad_norm": 0.029810762032866478, + "learning_rate": 0.000992736382873529, + "loss": 0.1014, + "num_input_tokens_seen": 39408560, + "step": 18255 + }, + { + "epoch": 2.9787928221859707, + "grad_norm": 0.039250992238521576, + "learning_rate": 0.000992724289160044, + "loss": 0.0911, + "num_input_tokens_seen": 39419664, + "step": 18260 + }, + { + "epoch": 2.9796084828711256, + "grad_norm": 0.12138062715530396, + "learning_rate": 0.000992712185460876, + "loss": 0.0441, + "num_input_tokens_seen": 39430480, + "step": 18265 + }, + { + "epoch": 2.980424143556281, + "grad_norm": 0.039968572556972504, + "learning_rate": 0.0009927000717762707, + "loss": 0.0517, + "num_input_tokens_seen": 39441008, + "step": 18270 + }, + { + "epoch": 2.9812398042414356, + "grad_norm": 0.07548398524522781, + "learning_rate": 0.0009926879481064734, + "loss": 0.0461, + "num_input_tokens_seen": 39451984, + "step": 18275 + }, + { + "epoch": 2.9820554649265905, + "grad_norm": 0.19831591844558716, + "learning_rate": 0.0009926758144517297, + "loss": 0.2131, + "num_input_tokens_seen": 39461904, + "step": 18280 + }, + { + "epoch": 2.9828711256117453, + "grad_norm": 0.03678493574261665, + "learning_rate": 0.000992663670812286, + "loss": 0.0281, + "num_input_tokens_seen": 39472656, + "step": 18285 + }, + { + "epoch": 2.9836867862969005, + "grad_norm": 0.011070506647229195, + "learning_rate": 0.0009926515171883874, + "loss": 0.1975, + "num_input_tokens_seen": 39483376, + "step": 18290 + }, + { + "epoch": 2.9845024469820554, + "grad_norm": 0.016634484753012657, + "learning_rate": 0.0009926393535802812, + "loss": 0.0581, + "num_input_tokens_seen": 39496144, + "step": 18295 + }, + { + "epoch": 2.9853181076672106, + "grad_norm": 0.10536892712116241, + "learning_rate": 0.0009926271799882134, + "loss": 0.1513, + "num_input_tokens_seen": 39506128, + "step": 18300 + }, + { + "epoch": 2.9861337683523654, + "grad_norm": 0.11362192034721375, + "learning_rate": 0.000992614996412431, + "loss": 0.2074, + "num_input_tokens_seen": 39515440, + "step": 18305 + }, + { + "epoch": 2.9869494290375203, + "grad_norm": 0.016855238005518913, + "learning_rate": 0.0009926028028531808, + "loss": 0.043, + "num_input_tokens_seen": 39526512, + "step": 18310 + }, + { + "epoch": 2.9877650897226755, + "grad_norm": 0.04535501077771187, + "learning_rate": 0.0009925905993107098, + "loss": 0.0372, + "num_input_tokens_seen": 39537136, + "step": 18315 + }, + { + "epoch": 2.9885807504078303, + "grad_norm": 0.10710170865058899, + "learning_rate": 0.0009925783857852653, + "loss": 0.096, + "num_input_tokens_seen": 39545808, + "step": 18320 + }, + { + "epoch": 2.9893964110929856, + "grad_norm": 0.030050192028284073, + "learning_rate": 0.0009925661622770953, + "loss": 0.0686, + "num_input_tokens_seen": 39558000, + "step": 18325 + }, + { + "epoch": 2.9902120717781404, + "grad_norm": 0.024196673184633255, + "learning_rate": 0.0009925539287864466, + "loss": 0.2214, + "num_input_tokens_seen": 39570000, + "step": 18330 + }, + { + "epoch": 2.9910277324632952, + "grad_norm": 0.08378178626298904, + "learning_rate": 0.000992541685313568, + "loss": 0.189, + "num_input_tokens_seen": 39580560, + "step": 18335 + }, + { + "epoch": 2.99184339314845, + "grad_norm": 0.0670500099658966, + "learning_rate": 0.0009925294318587075, + "loss": 0.0603, + "num_input_tokens_seen": 39591408, + "step": 18340 + }, + { + "epoch": 2.9926590538336053, + "grad_norm": 0.20562945306301117, + "learning_rate": 0.000992517168422113, + "loss": 0.0918, + "num_input_tokens_seen": 39601872, + "step": 18345 + }, + { + "epoch": 2.99347471451876, + "grad_norm": 0.13966943323612213, + "learning_rate": 0.0009925048950040333, + "loss": 0.1183, + "num_input_tokens_seen": 39613008, + "step": 18350 + }, + { + "epoch": 2.9942903752039154, + "grad_norm": 0.168809175491333, + "learning_rate": 0.000992492611604717, + "loss": 0.1411, + "num_input_tokens_seen": 39624272, + "step": 18355 + }, + { + "epoch": 2.99510603588907, + "grad_norm": 0.018767395988106728, + "learning_rate": 0.0009924803182244134, + "loss": 0.0473, + "num_input_tokens_seen": 39635216, + "step": 18360 + }, + { + "epoch": 2.995921696574225, + "grad_norm": 0.03149434179067612, + "learning_rate": 0.0009924680148633714, + "loss": 0.064, + "num_input_tokens_seen": 39645808, + "step": 18365 + }, + { + "epoch": 2.99673735725938, + "grad_norm": 0.06067895516753197, + "learning_rate": 0.0009924557015218401, + "loss": 0.1169, + "num_input_tokens_seen": 39656560, + "step": 18370 + }, + { + "epoch": 2.997553017944535, + "grad_norm": 0.04650947079062462, + "learning_rate": 0.0009924433782000695, + "loss": 0.0839, + "num_input_tokens_seen": 39666704, + "step": 18375 + }, + { + "epoch": 2.99836867862969, + "grad_norm": 0.16989737749099731, + "learning_rate": 0.000992431044898309, + "loss": 0.0755, + "num_input_tokens_seen": 39677008, + "step": 18380 + }, + { + "epoch": 2.999184339314845, + "grad_norm": 0.06653452664613724, + "learning_rate": 0.0009924187016168086, + "loss": 0.0203, + "num_input_tokens_seen": 39686416, + "step": 18385 + }, + { + "epoch": 3.0, + "grad_norm": 0.003800722537562251, + "learning_rate": 0.0009924063483558187, + "loss": 0.0857, + "num_input_tokens_seen": 39694112, + "step": 18390 + }, + { + "epoch": 3.0, + "eval_loss": 0.1191425696015358, + "eval_runtime": 103.4865, + "eval_samples_per_second": 26.332, + "eval_steps_per_second": 6.59, + "num_input_tokens_seen": 39694112, + "step": 18390 + }, + { + "epoch": 3.000815660685155, + "grad_norm": 0.015187148936092854, + "learning_rate": 0.0009923939851155896, + "loss": 0.016, + "num_input_tokens_seen": 39703936, + "step": 18395 + }, + { + "epoch": 3.00163132137031, + "grad_norm": 0.0029656249098479748, + "learning_rate": 0.0009923816118963715, + "loss": 0.024, + "num_input_tokens_seen": 39715968, + "step": 18400 + }, + { + "epoch": 3.002446982055465, + "grad_norm": 0.11419258266687393, + "learning_rate": 0.0009923692286984156, + "loss": 0.0443, + "num_input_tokens_seen": 39726912, + "step": 18405 + }, + { + "epoch": 3.0032626427406197, + "grad_norm": 0.19660399854183197, + "learning_rate": 0.0009923568355219726, + "loss": 0.21, + "num_input_tokens_seen": 39737504, + "step": 18410 + }, + { + "epoch": 3.004078303425775, + "grad_norm": 0.17878009378910065, + "learning_rate": 0.0009923444323672937, + "loss": 0.3417, + "num_input_tokens_seen": 39749792, + "step": 18415 + }, + { + "epoch": 3.00489396411093, + "grad_norm": 0.22647486627101898, + "learning_rate": 0.0009923320192346302, + "loss": 0.0794, + "num_input_tokens_seen": 39760928, + "step": 18420 + }, + { + "epoch": 3.0057096247960846, + "grad_norm": 0.08126525580883026, + "learning_rate": 0.000992319596124234, + "loss": 0.0496, + "num_input_tokens_seen": 39772096, + "step": 18425 + }, + { + "epoch": 3.00652528548124, + "grad_norm": 0.16598422825336456, + "learning_rate": 0.0009923071630363563, + "loss": 0.0993, + "num_input_tokens_seen": 39782528, + "step": 18430 + }, + { + "epoch": 3.0073409461663947, + "grad_norm": 0.013870811089873314, + "learning_rate": 0.0009922947199712496, + "loss": 0.0159, + "num_input_tokens_seen": 39794336, + "step": 18435 + }, + { + "epoch": 3.00815660685155, + "grad_norm": 0.22880510985851288, + "learning_rate": 0.0009922822669291658, + "loss": 0.2149, + "num_input_tokens_seen": 39805376, + "step": 18440 + }, + { + "epoch": 3.0089722675367048, + "grad_norm": 0.16160979866981506, + "learning_rate": 0.0009922698039103574, + "loss": 0.1223, + "num_input_tokens_seen": 39816768, + "step": 18445 + }, + { + "epoch": 3.0097879282218596, + "grad_norm": 0.05318981036543846, + "learning_rate": 0.000992257330915077, + "loss": 0.1421, + "num_input_tokens_seen": 39827520, + "step": 18450 + }, + { + "epoch": 3.010603588907015, + "grad_norm": 0.06232697516679764, + "learning_rate": 0.0009922448479435773, + "loss": 0.1497, + "num_input_tokens_seen": 39839072, + "step": 18455 + }, + { + "epoch": 3.0114192495921697, + "grad_norm": 0.06278172880411148, + "learning_rate": 0.0009922323549961113, + "loss": 0.069, + "num_input_tokens_seen": 39850432, + "step": 18460 + }, + { + "epoch": 3.0122349102773245, + "grad_norm": 0.013571258634328842, + "learning_rate": 0.000992219852072932, + "loss": 0.078, + "num_input_tokens_seen": 39861600, + "step": 18465 + }, + { + "epoch": 3.0130505709624797, + "grad_norm": 0.09859306365251541, + "learning_rate": 0.0009922073391742932, + "loss": 0.1023, + "num_input_tokens_seen": 39872288, + "step": 18470 + }, + { + "epoch": 3.0138662316476346, + "grad_norm": 0.015152882784605026, + "learning_rate": 0.0009921948163004483, + "loss": 0.1117, + "num_input_tokens_seen": 39881984, + "step": 18475 + }, + { + "epoch": 3.0146818923327894, + "grad_norm": 0.0032743141055107117, + "learning_rate": 0.000992182283451651, + "loss": 0.0845, + "num_input_tokens_seen": 39893184, + "step": 18480 + }, + { + "epoch": 3.0154975530179446, + "grad_norm": 0.006576085928827524, + "learning_rate": 0.0009921697406281554, + "loss": 0.0673, + "num_input_tokens_seen": 39903200, + "step": 18485 + }, + { + "epoch": 3.0163132137030995, + "grad_norm": 0.027355672791600227, + "learning_rate": 0.0009921571878302154, + "loss": 0.0768, + "num_input_tokens_seen": 39913728, + "step": 18490 + }, + { + "epoch": 3.0171288743882543, + "grad_norm": 0.07741007953882217, + "learning_rate": 0.0009921446250580857, + "loss": 0.0773, + "num_input_tokens_seen": 39924576, + "step": 18495 + }, + { + "epoch": 3.0179445350734095, + "grad_norm": 0.11628734320402145, + "learning_rate": 0.000992132052312021, + "loss": 0.1216, + "num_input_tokens_seen": 39935808, + "step": 18500 + }, + { + "epoch": 3.0187601957585644, + "grad_norm": 0.023185666650533676, + "learning_rate": 0.000992119469592276, + "loss": 0.1127, + "num_input_tokens_seen": 39946592, + "step": 18505 + }, + { + "epoch": 3.0195758564437196, + "grad_norm": 0.028024334460496902, + "learning_rate": 0.0009921068768991056, + "loss": 0.0804, + "num_input_tokens_seen": 39958592, + "step": 18510 + }, + { + "epoch": 3.0203915171288744, + "grad_norm": 0.11683911085128784, + "learning_rate": 0.0009920942742327648, + "loss": 0.1547, + "num_input_tokens_seen": 39969760, + "step": 18515 + }, + { + "epoch": 3.0212071778140293, + "grad_norm": 0.026023484766483307, + "learning_rate": 0.0009920816615935095, + "loss": 0.084, + "num_input_tokens_seen": 39980064, + "step": 18520 + }, + { + "epoch": 3.0220228384991845, + "grad_norm": 0.12365030497312546, + "learning_rate": 0.000992069038981595, + "loss": 0.0734, + "num_input_tokens_seen": 39991840, + "step": 18525 + }, + { + "epoch": 3.0228384991843393, + "grad_norm": 0.009049572050571442, + "learning_rate": 0.0009920564063972772, + "loss": 0.0604, + "num_input_tokens_seen": 40002912, + "step": 18530 + }, + { + "epoch": 3.023654159869494, + "grad_norm": 0.008538025431334972, + "learning_rate": 0.0009920437638408122, + "loss": 0.1153, + "num_input_tokens_seen": 40014624, + "step": 18535 + }, + { + "epoch": 3.0244698205546494, + "grad_norm": 0.10501033067703247, + "learning_rate": 0.000992031111312456, + "loss": 0.2588, + "num_input_tokens_seen": 40025408, + "step": 18540 + }, + { + "epoch": 3.0252854812398042, + "grad_norm": 0.10174082964658737, + "learning_rate": 0.0009920184488124654, + "loss": 0.1097, + "num_input_tokens_seen": 40036352, + "step": 18545 + }, + { + "epoch": 3.026101141924959, + "grad_norm": 0.036076296120882034, + "learning_rate": 0.0009920057763410962, + "loss": 0.0573, + "num_input_tokens_seen": 40045888, + "step": 18550 + }, + { + "epoch": 3.0269168026101143, + "grad_norm": 0.0417712926864624, + "learning_rate": 0.0009919930938986064, + "loss": 0.1114, + "num_input_tokens_seen": 40056928, + "step": 18555 + }, + { + "epoch": 3.027732463295269, + "grad_norm": 0.020543865859508514, + "learning_rate": 0.000991980401485252, + "loss": 0.0341, + "num_input_tokens_seen": 40067456, + "step": 18560 + }, + { + "epoch": 3.028548123980424, + "grad_norm": 0.007914445362985134, + "learning_rate": 0.000991967699101291, + "loss": 0.1021, + "num_input_tokens_seen": 40077760, + "step": 18565 + }, + { + "epoch": 3.029363784665579, + "grad_norm": 0.033559802919626236, + "learning_rate": 0.00099195498674698, + "loss": 0.0581, + "num_input_tokens_seen": 40089376, + "step": 18570 + }, + { + "epoch": 3.030179445350734, + "grad_norm": 0.17421254515647888, + "learning_rate": 0.0009919422644225776, + "loss": 0.1538, + "num_input_tokens_seen": 40100544, + "step": 18575 + }, + { + "epoch": 3.0309951060358893, + "grad_norm": 0.012329780496656895, + "learning_rate": 0.0009919295321283409, + "loss": 0.0459, + "num_input_tokens_seen": 40111616, + "step": 18580 + }, + { + "epoch": 3.031810766721044, + "grad_norm": 0.031448185443878174, + "learning_rate": 0.0009919167898645282, + "loss": 0.1767, + "num_input_tokens_seen": 40123712, + "step": 18585 + }, + { + "epoch": 3.032626427406199, + "grad_norm": 0.04019459709525108, + "learning_rate": 0.0009919040376313976, + "loss": 0.0219, + "num_input_tokens_seen": 40134976, + "step": 18590 + }, + { + "epoch": 3.033442088091354, + "grad_norm": 0.23669855296611786, + "learning_rate": 0.0009918912754292078, + "loss": 0.1551, + "num_input_tokens_seen": 40144992, + "step": 18595 + }, + { + "epoch": 3.034257748776509, + "grad_norm": 0.002897634170949459, + "learning_rate": 0.0009918785032582173, + "loss": 0.0426, + "num_input_tokens_seen": 40156928, + "step": 18600 + }, + { + "epoch": 3.035073409461664, + "grad_norm": 0.01952413283288479, + "learning_rate": 0.000991865721118685, + "loss": 0.1009, + "num_input_tokens_seen": 40167264, + "step": 18605 + }, + { + "epoch": 3.035889070146819, + "grad_norm": 0.10718651115894318, + "learning_rate": 0.0009918529290108696, + "loss": 0.1648, + "num_input_tokens_seen": 40178944, + "step": 18610 + }, + { + "epoch": 3.036704730831974, + "grad_norm": 0.08355123549699783, + "learning_rate": 0.000991840126935031, + "loss": 0.1347, + "num_input_tokens_seen": 40190240, + "step": 18615 + }, + { + "epoch": 3.0375203915171287, + "grad_norm": 0.05343741551041603, + "learning_rate": 0.000991827314891428, + "loss": 0.1002, + "num_input_tokens_seen": 40201696, + "step": 18620 + }, + { + "epoch": 3.038336052202284, + "grad_norm": 0.012799674645066261, + "learning_rate": 0.0009918144928803205, + "loss": 0.1224, + "num_input_tokens_seen": 40211424, + "step": 18625 + }, + { + "epoch": 3.039151712887439, + "grad_norm": 0.12329939752817154, + "learning_rate": 0.0009918016609019686, + "loss": 0.1195, + "num_input_tokens_seen": 40221888, + "step": 18630 + }, + { + "epoch": 3.0399673735725936, + "grad_norm": 0.020006440579891205, + "learning_rate": 0.0009917888189566323, + "loss": 0.0251, + "num_input_tokens_seen": 40233120, + "step": 18635 + }, + { + "epoch": 3.040783034257749, + "grad_norm": 0.25153401494026184, + "learning_rate": 0.0009917759670445712, + "loss": 0.0736, + "num_input_tokens_seen": 40243104, + "step": 18640 + }, + { + "epoch": 3.0415986949429037, + "grad_norm": 0.014973518438637257, + "learning_rate": 0.0009917631051660468, + "loss": 0.1733, + "num_input_tokens_seen": 40253344, + "step": 18645 + }, + { + "epoch": 3.0424143556280585, + "grad_norm": 0.0644526481628418, + "learning_rate": 0.0009917502333213189, + "loss": 0.1261, + "num_input_tokens_seen": 40265280, + "step": 18650 + }, + { + "epoch": 3.0432300163132138, + "grad_norm": 0.08235220611095428, + "learning_rate": 0.0009917373515106486, + "loss": 0.099, + "num_input_tokens_seen": 40276224, + "step": 18655 + }, + { + "epoch": 3.0440456769983686, + "grad_norm": 0.03709649294614792, + "learning_rate": 0.0009917244597342973, + "loss": 0.2001, + "num_input_tokens_seen": 40287168, + "step": 18660 + }, + { + "epoch": 3.044861337683524, + "grad_norm": 0.01651575230062008, + "learning_rate": 0.000991711557992526, + "loss": 0.0425, + "num_input_tokens_seen": 40297184, + "step": 18665 + }, + { + "epoch": 3.0456769983686787, + "grad_norm": 0.12023568153381348, + "learning_rate": 0.000991698646285596, + "loss": 0.0778, + "num_input_tokens_seen": 40308128, + "step": 18670 + }, + { + "epoch": 3.0464926590538335, + "grad_norm": 0.11977537721395493, + "learning_rate": 0.0009916857246137693, + "loss": 0.0899, + "num_input_tokens_seen": 40320352, + "step": 18675 + }, + { + "epoch": 3.0473083197389887, + "grad_norm": 0.049315474927425385, + "learning_rate": 0.0009916727929773078, + "loss": 0.1461, + "num_input_tokens_seen": 40331424, + "step": 18680 + }, + { + "epoch": 3.0481239804241436, + "grad_norm": 0.02754947543144226, + "learning_rate": 0.0009916598513764732, + "loss": 0.1275, + "num_input_tokens_seen": 40343264, + "step": 18685 + }, + { + "epoch": 3.0489396411092984, + "grad_norm": 0.17863212525844574, + "learning_rate": 0.0009916468998115281, + "loss": 0.1114, + "num_input_tokens_seen": 40356608, + "step": 18690 + }, + { + "epoch": 3.0497553017944536, + "grad_norm": 0.16332320868968964, + "learning_rate": 0.000991633938282735, + "loss": 0.1713, + "num_input_tokens_seen": 40367456, + "step": 18695 + }, + { + "epoch": 3.0505709624796085, + "grad_norm": 0.13141590356826782, + "learning_rate": 0.0009916209667903562, + "loss": 0.085, + "num_input_tokens_seen": 40377728, + "step": 18700 + }, + { + "epoch": 3.0513866231647633, + "grad_norm": 0.11269141733646393, + "learning_rate": 0.0009916079853346548, + "loss": 0.0896, + "num_input_tokens_seen": 40389184, + "step": 18705 + }, + { + "epoch": 3.0522022838499185, + "grad_norm": 0.15092810988426208, + "learning_rate": 0.0009915949939158942, + "loss": 0.1009, + "num_input_tokens_seen": 40400512, + "step": 18710 + }, + { + "epoch": 3.0530179445350734, + "grad_norm": 0.10117918252944946, + "learning_rate": 0.0009915819925343373, + "loss": 0.091, + "num_input_tokens_seen": 40411008, + "step": 18715 + }, + { + "epoch": 3.053833605220228, + "grad_norm": 0.21219304203987122, + "learning_rate": 0.0009915689811902477, + "loss": 0.1013, + "num_input_tokens_seen": 40422528, + "step": 18720 + }, + { + "epoch": 3.0546492659053834, + "grad_norm": 0.004308608360588551, + "learning_rate": 0.000991555959883889, + "loss": 0.1098, + "num_input_tokens_seen": 40432384, + "step": 18725 + }, + { + "epoch": 3.0554649265905383, + "grad_norm": 0.025102291256189346, + "learning_rate": 0.0009915429286155254, + "loss": 0.1354, + "num_input_tokens_seen": 40443424, + "step": 18730 + }, + { + "epoch": 3.0562805872756935, + "grad_norm": 0.005208468064665794, + "learning_rate": 0.0009915298873854207, + "loss": 0.1072, + "num_input_tokens_seen": 40453760, + "step": 18735 + }, + { + "epoch": 3.0570962479608483, + "grad_norm": 0.09660587459802628, + "learning_rate": 0.0009915168361938392, + "loss": 0.061, + "num_input_tokens_seen": 40464832, + "step": 18740 + }, + { + "epoch": 3.057911908646003, + "grad_norm": 0.21853257715702057, + "learning_rate": 0.0009915037750410456, + "loss": 0.1874, + "num_input_tokens_seen": 40475136, + "step": 18745 + }, + { + "epoch": 3.0587275693311584, + "grad_norm": 0.11508253216743469, + "learning_rate": 0.0009914907039273045, + "loss": 0.2093, + "num_input_tokens_seen": 40485856, + "step": 18750 + }, + { + "epoch": 3.0595432300163132, + "grad_norm": 0.017460908740758896, + "learning_rate": 0.0009914776228528805, + "loss": 0.127, + "num_input_tokens_seen": 40497120, + "step": 18755 + }, + { + "epoch": 3.060358890701468, + "grad_norm": 0.23283438384532928, + "learning_rate": 0.0009914645318180393, + "loss": 0.1051, + "num_input_tokens_seen": 40508928, + "step": 18760 + }, + { + "epoch": 3.0611745513866233, + "grad_norm": 0.010226443409919739, + "learning_rate": 0.0009914514308230458, + "loss": 0.1476, + "num_input_tokens_seen": 40519904, + "step": 18765 + }, + { + "epoch": 3.061990212071778, + "grad_norm": 0.15015244483947754, + "learning_rate": 0.0009914383198681657, + "loss": 0.1727, + "num_input_tokens_seen": 40529568, + "step": 18770 + }, + { + "epoch": 3.062805872756933, + "grad_norm": 0.11146983504295349, + "learning_rate": 0.0009914251989536645, + "loss": 0.1303, + "num_input_tokens_seen": 40540512, + "step": 18775 + }, + { + "epoch": 3.063621533442088, + "grad_norm": 0.027400687336921692, + "learning_rate": 0.0009914120680798082, + "loss": 0.1303, + "num_input_tokens_seen": 40550592, + "step": 18780 + }, + { + "epoch": 3.064437194127243, + "grad_norm": 0.10246971249580383, + "learning_rate": 0.000991398927246863, + "loss": 0.075, + "num_input_tokens_seen": 40561824, + "step": 18785 + }, + { + "epoch": 3.065252854812398, + "grad_norm": 0.07750476151704788, + "learning_rate": 0.000991385776455095, + "loss": 0.05, + "num_input_tokens_seen": 40572416, + "step": 18790 + }, + { + "epoch": 3.066068515497553, + "grad_norm": 0.10629399865865707, + "learning_rate": 0.0009913726157047712, + "loss": 0.1672, + "num_input_tokens_seen": 40583712, + "step": 18795 + }, + { + "epoch": 3.066884176182708, + "grad_norm": 0.07451663911342621, + "learning_rate": 0.0009913594449961576, + "loss": 0.159, + "num_input_tokens_seen": 40594560, + "step": 18800 + }, + { + "epoch": 3.067699836867863, + "grad_norm": 0.08556533604860306, + "learning_rate": 0.0009913462643295217, + "loss": 0.0913, + "num_input_tokens_seen": 40605536, + "step": 18805 + }, + { + "epoch": 3.068515497553018, + "grad_norm": 0.05361387878656387, + "learning_rate": 0.0009913330737051304, + "loss": 0.0744, + "num_input_tokens_seen": 40615328, + "step": 18810 + }, + { + "epoch": 3.069331158238173, + "grad_norm": 0.1697191447019577, + "learning_rate": 0.0009913198731232513, + "loss": 0.0864, + "num_input_tokens_seen": 40625504, + "step": 18815 + }, + { + "epoch": 3.070146818923328, + "grad_norm": 0.16835199296474457, + "learning_rate": 0.0009913066625841513, + "loss": 0.1752, + "num_input_tokens_seen": 40636896, + "step": 18820 + }, + { + "epoch": 3.070962479608483, + "grad_norm": 0.21473479270935059, + "learning_rate": 0.0009912934420880988, + "loss": 0.186, + "num_input_tokens_seen": 40647456, + "step": 18825 + }, + { + "epoch": 3.0717781402936377, + "grad_norm": 0.12539741396903992, + "learning_rate": 0.0009912802116353613, + "loss": 0.0891, + "num_input_tokens_seen": 40656832, + "step": 18830 + }, + { + "epoch": 3.072593800978793, + "grad_norm": 0.17911890149116516, + "learning_rate": 0.0009912669712262073, + "loss": 0.0843, + "num_input_tokens_seen": 40666752, + "step": 18835 + }, + { + "epoch": 3.073409461663948, + "grad_norm": 0.07693876326084137, + "learning_rate": 0.0009912537208609047, + "loss": 0.0286, + "num_input_tokens_seen": 40678368, + "step": 18840 + }, + { + "epoch": 3.0742251223491026, + "grad_norm": 0.1722443699836731, + "learning_rate": 0.0009912404605397222, + "loss": 0.1616, + "num_input_tokens_seen": 40689632, + "step": 18845 + }, + { + "epoch": 3.075040783034258, + "grad_norm": 0.06141762435436249, + "learning_rate": 0.0009912271902629288, + "loss": 0.0608, + "num_input_tokens_seen": 40700576, + "step": 18850 + }, + { + "epoch": 3.0758564437194127, + "grad_norm": 0.058088019490242004, + "learning_rate": 0.000991213910030793, + "loss": 0.1616, + "num_input_tokens_seen": 40711488, + "step": 18855 + }, + { + "epoch": 3.0766721044045675, + "grad_norm": 0.017817892134189606, + "learning_rate": 0.0009912006198435843, + "loss": 0.1509, + "num_input_tokens_seen": 40720832, + "step": 18860 + }, + { + "epoch": 3.0774877650897228, + "grad_norm": 0.21528476476669312, + "learning_rate": 0.000991187319701572, + "loss": 0.128, + "num_input_tokens_seen": 40731200, + "step": 18865 + }, + { + "epoch": 3.0783034257748776, + "grad_norm": 0.04258257895708084, + "learning_rate": 0.0009911740096050252, + "loss": 0.0312, + "num_input_tokens_seen": 40742432, + "step": 18870 + }, + { + "epoch": 3.0791190864600324, + "grad_norm": 0.011882427148520947, + "learning_rate": 0.0009911606895542143, + "loss": 0.1068, + "num_input_tokens_seen": 40753856, + "step": 18875 + }, + { + "epoch": 3.0799347471451877, + "grad_norm": 0.06226782128214836, + "learning_rate": 0.0009911473595494089, + "loss": 0.0597, + "num_input_tokens_seen": 40762976, + "step": 18880 + }, + { + "epoch": 3.0807504078303425, + "grad_norm": 0.17603141069412231, + "learning_rate": 0.0009911340195908791, + "loss": 0.0953, + "num_input_tokens_seen": 40773504, + "step": 18885 + }, + { + "epoch": 3.0815660685154977, + "grad_norm": 0.00825310405343771, + "learning_rate": 0.0009911206696788955, + "loss": 0.0149, + "num_input_tokens_seen": 40784576, + "step": 18890 + }, + { + "epoch": 3.0823817292006526, + "grad_norm": 0.013906232081353664, + "learning_rate": 0.0009911073098137285, + "loss": 0.0404, + "num_input_tokens_seen": 40794240, + "step": 18895 + }, + { + "epoch": 3.0831973898858074, + "grad_norm": 0.010389653034508228, + "learning_rate": 0.0009910939399956488, + "loss": 0.0531, + "num_input_tokens_seen": 40805600, + "step": 18900 + }, + { + "epoch": 3.0840130505709626, + "grad_norm": 0.021234940737485886, + "learning_rate": 0.0009910805602249273, + "loss": 0.0305, + "num_input_tokens_seen": 40816352, + "step": 18905 + }, + { + "epoch": 3.0848287112561175, + "grad_norm": 0.025505250319838524, + "learning_rate": 0.0009910671705018353, + "loss": 0.0611, + "num_input_tokens_seen": 40826720, + "step": 18910 + }, + { + "epoch": 3.0856443719412723, + "grad_norm": 0.1341158151626587, + "learning_rate": 0.000991053770826644, + "loss": 0.1589, + "num_input_tokens_seen": 40837216, + "step": 18915 + }, + { + "epoch": 3.0864600326264275, + "grad_norm": 0.09766783565282822, + "learning_rate": 0.0009910403611996252, + "loss": 0.0298, + "num_input_tokens_seen": 40847904, + "step": 18920 + }, + { + "epoch": 3.0872756933115824, + "grad_norm": 0.021354546770453453, + "learning_rate": 0.0009910269416210508, + "loss": 0.1803, + "num_input_tokens_seen": 40858304, + "step": 18925 + }, + { + "epoch": 3.088091353996737, + "grad_norm": 0.056844551116228104, + "learning_rate": 0.0009910135120911924, + "loss": 0.0445, + "num_input_tokens_seen": 40869088, + "step": 18930 + }, + { + "epoch": 3.0889070146818924, + "grad_norm": 0.02281181514263153, + "learning_rate": 0.0009910000726103222, + "loss": 0.0207, + "num_input_tokens_seen": 40878976, + "step": 18935 + }, + { + "epoch": 3.0897226753670473, + "grad_norm": 0.21029332280158997, + "learning_rate": 0.0009909866231787125, + "loss": 0.2575, + "num_input_tokens_seen": 40890144, + "step": 18940 + }, + { + "epoch": 3.090538336052202, + "grad_norm": 0.11248669028282166, + "learning_rate": 0.0009909731637966362, + "loss": 0.1817, + "num_input_tokens_seen": 40900448, + "step": 18945 + }, + { + "epoch": 3.0913539967373573, + "grad_norm": 0.015390118584036827, + "learning_rate": 0.0009909596944643658, + "loss": 0.0239, + "num_input_tokens_seen": 40910880, + "step": 18950 + }, + { + "epoch": 3.092169657422512, + "grad_norm": 0.019929755479097366, + "learning_rate": 0.0009909462151821745, + "loss": 0.0507, + "num_input_tokens_seen": 40922752, + "step": 18955 + }, + { + "epoch": 3.0929853181076674, + "grad_norm": 0.2307569682598114, + "learning_rate": 0.0009909327259503351, + "loss": 0.067, + "num_input_tokens_seen": 40932640, + "step": 18960 + }, + { + "epoch": 3.0938009787928222, + "grad_norm": 0.007074298337101936, + "learning_rate": 0.0009909192267691215, + "loss": 0.0228, + "num_input_tokens_seen": 40943264, + "step": 18965 + }, + { + "epoch": 3.094616639477977, + "grad_norm": 0.025699250400066376, + "learning_rate": 0.000990905717638807, + "loss": 0.0328, + "num_input_tokens_seen": 40953664, + "step": 18970 + }, + { + "epoch": 3.0954323001631323, + "grad_norm": 0.02787945047020912, + "learning_rate": 0.000990892198559665, + "loss": 0.1898, + "num_input_tokens_seen": 40964544, + "step": 18975 + }, + { + "epoch": 3.096247960848287, + "grad_norm": 0.02038802206516266, + "learning_rate": 0.0009908786695319702, + "loss": 0.0831, + "num_input_tokens_seen": 40975520, + "step": 18980 + }, + { + "epoch": 3.097063621533442, + "grad_norm": 0.0447017066180706, + "learning_rate": 0.0009908651305559964, + "loss": 0.0802, + "num_input_tokens_seen": 40985728, + "step": 18985 + }, + { + "epoch": 3.097879282218597, + "grad_norm": 0.018079962581396103, + "learning_rate": 0.000990851581632018, + "loss": 0.1527, + "num_input_tokens_seen": 40996064, + "step": 18990 + }, + { + "epoch": 3.098694942903752, + "grad_norm": 0.005051951389759779, + "learning_rate": 0.0009908380227603094, + "loss": 0.1376, + "num_input_tokens_seen": 41007456, + "step": 18995 + }, + { + "epoch": 3.099510603588907, + "grad_norm": 0.4051731824874878, + "learning_rate": 0.000990824453941146, + "loss": 0.143, + "num_input_tokens_seen": 41017792, + "step": 19000 + }, + { + "epoch": 3.100326264274062, + "grad_norm": 0.00485280342400074, + "learning_rate": 0.000990810875174802, + "loss": 0.0875, + "num_input_tokens_seen": 41028512, + "step": 19005 + }, + { + "epoch": 3.101141924959217, + "grad_norm": 0.06485464423894882, + "learning_rate": 0.0009907972864615531, + "loss": 0.0758, + "num_input_tokens_seen": 41039520, + "step": 19010 + }, + { + "epoch": 3.1019575856443717, + "grad_norm": 0.04456976801156998, + "learning_rate": 0.0009907836878016746, + "loss": 0.0969, + "num_input_tokens_seen": 41050336, + "step": 19015 + }, + { + "epoch": 3.102773246329527, + "grad_norm": 0.0465349480509758, + "learning_rate": 0.000990770079195442, + "loss": 0.0163, + "num_input_tokens_seen": 41061088, + "step": 19020 + }, + { + "epoch": 3.103588907014682, + "grad_norm": 0.006169457454234362, + "learning_rate": 0.0009907564606431315, + "loss": 0.17, + "num_input_tokens_seen": 41071680, + "step": 19025 + }, + { + "epoch": 3.104404567699837, + "grad_norm": 0.0036021487321704626, + "learning_rate": 0.0009907428321450182, + "loss": 0.0633, + "num_input_tokens_seen": 41083744, + "step": 19030 + }, + { + "epoch": 3.105220228384992, + "grad_norm": 0.13639992475509644, + "learning_rate": 0.0009907291937013792, + "loss": 0.0905, + "num_input_tokens_seen": 41094688, + "step": 19035 + }, + { + "epoch": 3.1060358890701467, + "grad_norm": 0.007958930917084217, + "learning_rate": 0.0009907155453124906, + "loss": 0.1781, + "num_input_tokens_seen": 41106656, + "step": 19040 + }, + { + "epoch": 3.106851549755302, + "grad_norm": 0.08698417991399765, + "learning_rate": 0.0009907018869786289, + "loss": 0.1551, + "num_input_tokens_seen": 41117280, + "step": 19045 + }, + { + "epoch": 3.107667210440457, + "grad_norm": 0.014690346084535122, + "learning_rate": 0.0009906882187000708, + "loss": 0.0907, + "num_input_tokens_seen": 41128064, + "step": 19050 + }, + { + "epoch": 3.1084828711256116, + "grad_norm": 0.09760601073503494, + "learning_rate": 0.0009906745404770936, + "loss": 0.0937, + "num_input_tokens_seen": 41137440, + "step": 19055 + }, + { + "epoch": 3.109298531810767, + "grad_norm": 0.11052919924259186, + "learning_rate": 0.0009906608523099743, + "loss": 0.0809, + "num_input_tokens_seen": 41147648, + "step": 19060 + }, + { + "epoch": 3.1101141924959217, + "grad_norm": 0.12663492560386658, + "learning_rate": 0.0009906471541989905, + "loss": 0.0892, + "num_input_tokens_seen": 41158688, + "step": 19065 + }, + { + "epoch": 3.1109298531810765, + "grad_norm": 0.06582798063755035, + "learning_rate": 0.0009906334461444195, + "loss": 0.027, + "num_input_tokens_seen": 41168160, + "step": 19070 + }, + { + "epoch": 3.1117455138662318, + "grad_norm": 0.10944589227437973, + "learning_rate": 0.0009906197281465395, + "loss": 0.0631, + "num_input_tokens_seen": 41179936, + "step": 19075 + }, + { + "epoch": 3.1125611745513866, + "grad_norm": 0.009495393373072147, + "learning_rate": 0.0009906060002056283, + "loss": 0.1269, + "num_input_tokens_seen": 41190528, + "step": 19080 + }, + { + "epoch": 3.1133768352365414, + "grad_norm": 0.15334062278270721, + "learning_rate": 0.000990592262321964, + "loss": 0.044, + "num_input_tokens_seen": 41200128, + "step": 19085 + }, + { + "epoch": 3.1141924959216967, + "grad_norm": 0.21439135074615479, + "learning_rate": 0.0009905785144958253, + "loss": 0.0828, + "num_input_tokens_seen": 41211040, + "step": 19090 + }, + { + "epoch": 3.1150081566068515, + "grad_norm": 0.025667604058980942, + "learning_rate": 0.0009905647567274905, + "loss": 0.3109, + "num_input_tokens_seen": 41223392, + "step": 19095 + }, + { + "epoch": 3.1158238172920063, + "grad_norm": 0.24907664954662323, + "learning_rate": 0.0009905509890172385, + "loss": 0.0723, + "num_input_tokens_seen": 41233344, + "step": 19100 + }, + { + "epoch": 3.1166394779771616, + "grad_norm": 0.2514860928058624, + "learning_rate": 0.0009905372113653487, + "loss": 0.3056, + "num_input_tokens_seen": 41244512, + "step": 19105 + }, + { + "epoch": 3.1174551386623164, + "grad_norm": 0.0708160549402237, + "learning_rate": 0.0009905234237721, + "loss": 0.1646, + "num_input_tokens_seen": 41255360, + "step": 19110 + }, + { + "epoch": 3.1182707993474716, + "grad_norm": 0.13378243148326874, + "learning_rate": 0.0009905096262377716, + "loss": 0.0704, + "num_input_tokens_seen": 41265856, + "step": 19115 + }, + { + "epoch": 3.1190864600326265, + "grad_norm": 0.010708755813539028, + "learning_rate": 0.0009904958187626433, + "loss": 0.1844, + "num_input_tokens_seen": 41277216, + "step": 19120 + }, + { + "epoch": 3.1199021207177813, + "grad_norm": 0.2905430793762207, + "learning_rate": 0.0009904820013469952, + "loss": 0.2722, + "num_input_tokens_seen": 41288256, + "step": 19125 + }, + { + "epoch": 3.1207177814029365, + "grad_norm": 0.03431887552142143, + "learning_rate": 0.0009904681739911073, + "loss": 0.08, + "num_input_tokens_seen": 41298784, + "step": 19130 + }, + { + "epoch": 3.1215334420880914, + "grad_norm": 0.03253684192895889, + "learning_rate": 0.0009904543366952593, + "loss": 0.1166, + "num_input_tokens_seen": 41309792, + "step": 19135 + }, + { + "epoch": 3.122349102773246, + "grad_norm": 0.11648626625537872, + "learning_rate": 0.0009904404894597323, + "loss": 0.0929, + "num_input_tokens_seen": 41320320, + "step": 19140 + }, + { + "epoch": 3.1231647634584014, + "grad_norm": 0.155125230550766, + "learning_rate": 0.0009904266322848063, + "loss": 0.1012, + "num_input_tokens_seen": 41331008, + "step": 19145 + }, + { + "epoch": 3.1239804241435563, + "grad_norm": 0.01813601702451706, + "learning_rate": 0.0009904127651707627, + "loss": 0.1637, + "num_input_tokens_seen": 41341888, + "step": 19150 + }, + { + "epoch": 3.124796084828711, + "grad_norm": 0.025928236544132233, + "learning_rate": 0.000990398888117882, + "loss": 0.0892, + "num_input_tokens_seen": 41351840, + "step": 19155 + }, + { + "epoch": 3.1256117455138663, + "grad_norm": 0.03621959313750267, + "learning_rate": 0.0009903850011264458, + "loss": 0.0318, + "num_input_tokens_seen": 41362592, + "step": 19160 + }, + { + "epoch": 3.126427406199021, + "grad_norm": 0.04368871450424194, + "learning_rate": 0.0009903711041967357, + "loss": 0.0634, + "num_input_tokens_seen": 41372704, + "step": 19165 + }, + { + "epoch": 3.1272430668841764, + "grad_norm": 0.025095276534557343, + "learning_rate": 0.000990357197329033, + "loss": 0.1006, + "num_input_tokens_seen": 41383584, + "step": 19170 + }, + { + "epoch": 3.1280587275693312, + "grad_norm": 0.19453568756580353, + "learning_rate": 0.0009903432805236194, + "loss": 0.3416, + "num_input_tokens_seen": 41394976, + "step": 19175 + }, + { + "epoch": 3.128874388254486, + "grad_norm": 0.09131759405136108, + "learning_rate": 0.0009903293537807773, + "loss": 0.1016, + "num_input_tokens_seen": 41406016, + "step": 19180 + }, + { + "epoch": 3.1296900489396413, + "grad_norm": 0.012099217623472214, + "learning_rate": 0.0009903154171007889, + "loss": 0.0782, + "num_input_tokens_seen": 41417376, + "step": 19185 + }, + { + "epoch": 3.130505709624796, + "grad_norm": 0.16639100015163422, + "learning_rate": 0.0009903014704839366, + "loss": 0.1255, + "num_input_tokens_seen": 41428416, + "step": 19190 + }, + { + "epoch": 3.131321370309951, + "grad_norm": 0.1578473597764969, + "learning_rate": 0.000990287513930503, + "loss": 0.2123, + "num_input_tokens_seen": 41439552, + "step": 19195 + }, + { + "epoch": 3.132137030995106, + "grad_norm": 0.023801276460289955, + "learning_rate": 0.000990273547440771, + "loss": 0.0452, + "num_input_tokens_seen": 41448928, + "step": 19200 + }, + { + "epoch": 3.132952691680261, + "grad_norm": 0.07704932242631912, + "learning_rate": 0.0009902595710150233, + "loss": 0.0803, + "num_input_tokens_seen": 41460096, + "step": 19205 + }, + { + "epoch": 3.133768352365416, + "grad_norm": 0.1660885363817215, + "learning_rate": 0.0009902455846535437, + "loss": 0.137, + "num_input_tokens_seen": 41469696, + "step": 19210 + }, + { + "epoch": 3.134584013050571, + "grad_norm": 0.02600104734301567, + "learning_rate": 0.0009902315883566152, + "loss": 0.0516, + "num_input_tokens_seen": 41480224, + "step": 19215 + }, + { + "epoch": 3.135399673735726, + "grad_norm": 0.03606642782688141, + "learning_rate": 0.000990217582124522, + "loss": 0.0825, + "num_input_tokens_seen": 41490176, + "step": 19220 + }, + { + "epoch": 3.1362153344208807, + "grad_norm": 0.1487298309803009, + "learning_rate": 0.0009902035659575474, + "loss": 0.1034, + "num_input_tokens_seen": 41500928, + "step": 19225 + }, + { + "epoch": 3.137030995106036, + "grad_norm": 0.011286617256700993, + "learning_rate": 0.0009901895398559757, + "loss": 0.1285, + "num_input_tokens_seen": 41512608, + "step": 19230 + }, + { + "epoch": 3.137846655791191, + "grad_norm": 0.09289821982383728, + "learning_rate": 0.0009901755038200912, + "loss": 0.0387, + "num_input_tokens_seen": 41522880, + "step": 19235 + }, + { + "epoch": 3.1386623164763456, + "grad_norm": 0.07729063928127289, + "learning_rate": 0.0009901614578501782, + "loss": 0.1722, + "num_input_tokens_seen": 41533632, + "step": 19240 + }, + { + "epoch": 3.139477977161501, + "grad_norm": 0.1389188915491104, + "learning_rate": 0.0009901474019465215, + "loss": 0.0675, + "num_input_tokens_seen": 41546112, + "step": 19245 + }, + { + "epoch": 3.1402936378466557, + "grad_norm": 0.05707646161317825, + "learning_rate": 0.0009901333361094057, + "loss": 0.0295, + "num_input_tokens_seen": 41557376, + "step": 19250 + }, + { + "epoch": 3.141109298531811, + "grad_norm": 0.20002104341983795, + "learning_rate": 0.0009901192603391162, + "loss": 0.2213, + "num_input_tokens_seen": 41568160, + "step": 19255 + }, + { + "epoch": 3.141924959216966, + "grad_norm": 0.013452370651066303, + "learning_rate": 0.0009901051746359381, + "loss": 0.0788, + "num_input_tokens_seen": 41577696, + "step": 19260 + }, + { + "epoch": 3.1427406199021206, + "grad_norm": 0.033235128968954086, + "learning_rate": 0.0009900910790001571, + "loss": 0.0442, + "num_input_tokens_seen": 41589088, + "step": 19265 + }, + { + "epoch": 3.143556280587276, + "grad_norm": 0.16297116875648499, + "learning_rate": 0.0009900769734320586, + "loss": 0.1856, + "num_input_tokens_seen": 41599904, + "step": 19270 + }, + { + "epoch": 3.1443719412724307, + "grad_norm": 0.007626981474459171, + "learning_rate": 0.0009900628579319283, + "loss": 0.0208, + "num_input_tokens_seen": 41610784, + "step": 19275 + }, + { + "epoch": 3.1451876019575855, + "grad_norm": 0.009447101503610611, + "learning_rate": 0.0009900487325000527, + "loss": 0.0981, + "num_input_tokens_seen": 41620768, + "step": 19280 + }, + { + "epoch": 3.1460032626427408, + "grad_norm": 0.02414044179022312, + "learning_rate": 0.0009900345971367178, + "loss": 0.1145, + "num_input_tokens_seen": 41632384, + "step": 19285 + }, + { + "epoch": 3.1468189233278956, + "grad_norm": 0.16770002245903015, + "learning_rate": 0.00099002045184221, + "loss": 0.2019, + "num_input_tokens_seen": 41643680, + "step": 19290 + }, + { + "epoch": 3.1476345840130504, + "grad_norm": 0.21761904656887054, + "learning_rate": 0.0009900062966168163, + "loss": 0.1121, + "num_input_tokens_seen": 41655424, + "step": 19295 + }, + { + "epoch": 3.1484502446982057, + "grad_norm": 0.0558011569082737, + "learning_rate": 0.0009899921314608232, + "loss": 0.0725, + "num_input_tokens_seen": 41666144, + "step": 19300 + }, + { + "epoch": 3.1492659053833605, + "grad_norm": 0.12790575623512268, + "learning_rate": 0.0009899779563745182, + "loss": 0.089, + "num_input_tokens_seen": 41677312, + "step": 19305 + }, + { + "epoch": 3.1500815660685153, + "grad_norm": 0.28836047649383545, + "learning_rate": 0.0009899637713581882, + "loss": 0.099, + "num_input_tokens_seen": 41689344, + "step": 19310 + }, + { + "epoch": 3.1508972267536706, + "grad_norm": 0.01892138458788395, + "learning_rate": 0.0009899495764121207, + "loss": 0.026, + "num_input_tokens_seen": 41700032, + "step": 19315 + }, + { + "epoch": 3.1517128874388254, + "grad_norm": 0.010217717848718166, + "learning_rate": 0.0009899353715366037, + "loss": 0.0865, + "num_input_tokens_seen": 41711040, + "step": 19320 + }, + { + "epoch": 3.15252854812398, + "grad_norm": 0.018575187772512436, + "learning_rate": 0.0009899211567319247, + "loss": 0.0846, + "num_input_tokens_seen": 41721184, + "step": 19325 + }, + { + "epoch": 3.1533442088091355, + "grad_norm": 0.17691223323345184, + "learning_rate": 0.000989906931998372, + "loss": 0.198, + "num_input_tokens_seen": 41731616, + "step": 19330 + }, + { + "epoch": 3.1541598694942903, + "grad_norm": 0.05230746790766716, + "learning_rate": 0.000989892697336234, + "loss": 0.0709, + "num_input_tokens_seen": 41741568, + "step": 19335 + }, + { + "epoch": 3.1549755301794455, + "grad_norm": 0.1409013867378235, + "learning_rate": 0.0009898784527457988, + "loss": 0.1278, + "num_input_tokens_seen": 41752512, + "step": 19340 + }, + { + "epoch": 3.1557911908646004, + "grad_norm": 0.12194265425205231, + "learning_rate": 0.0009898641982273553, + "loss": 0.0713, + "num_input_tokens_seen": 41763040, + "step": 19345 + }, + { + "epoch": 3.156606851549755, + "grad_norm": 0.056760456413030624, + "learning_rate": 0.0009898499337811925, + "loss": 0.2415, + "num_input_tokens_seen": 41773472, + "step": 19350 + }, + { + "epoch": 3.1574225122349104, + "grad_norm": 0.09315556287765503, + "learning_rate": 0.0009898356594075992, + "loss": 0.0296, + "num_input_tokens_seen": 41784096, + "step": 19355 + }, + { + "epoch": 3.1582381729200653, + "grad_norm": 0.027661597356200218, + "learning_rate": 0.0009898213751068652, + "loss": 0.032, + "num_input_tokens_seen": 41795616, + "step": 19360 + }, + { + "epoch": 3.15905383360522, + "grad_norm": 0.034224506467580795, + "learning_rate": 0.0009898070808792795, + "loss": 0.0153, + "num_input_tokens_seen": 41806784, + "step": 19365 + }, + { + "epoch": 3.1598694942903753, + "grad_norm": 0.12046553939580917, + "learning_rate": 0.0009897927767251319, + "loss": 0.1427, + "num_input_tokens_seen": 41817920, + "step": 19370 + }, + { + "epoch": 3.16068515497553, + "grad_norm": 0.16331994533538818, + "learning_rate": 0.0009897784626447122, + "loss": 0.2217, + "num_input_tokens_seen": 41827520, + "step": 19375 + }, + { + "epoch": 3.161500815660685, + "grad_norm": 0.04203738272190094, + "learning_rate": 0.0009897641386383106, + "loss": 0.1376, + "num_input_tokens_seen": 41838496, + "step": 19380 + }, + { + "epoch": 3.1623164763458402, + "grad_norm": 0.061120469123125076, + "learning_rate": 0.0009897498047062177, + "loss": 0.0754, + "num_input_tokens_seen": 41848800, + "step": 19385 + }, + { + "epoch": 3.163132137030995, + "grad_norm": 0.0720907524228096, + "learning_rate": 0.0009897354608487234, + "loss": 0.2287, + "num_input_tokens_seen": 41859616, + "step": 19390 + }, + { + "epoch": 3.1639477977161503, + "grad_norm": 0.05037796497344971, + "learning_rate": 0.000989721107066119, + "loss": 0.0883, + "num_input_tokens_seen": 41870432, + "step": 19395 + }, + { + "epoch": 3.164763458401305, + "grad_norm": 0.05961848050355911, + "learning_rate": 0.000989706743358695, + "loss": 0.1516, + "num_input_tokens_seen": 41880960, + "step": 19400 + }, + { + "epoch": 3.16557911908646, + "grad_norm": 0.12163160741329193, + "learning_rate": 0.0009896923697267426, + "loss": 0.2262, + "num_input_tokens_seen": 41891456, + "step": 19405 + }, + { + "epoch": 3.166394779771615, + "grad_norm": 0.1431042104959488, + "learning_rate": 0.0009896779861705532, + "loss": 0.0984, + "num_input_tokens_seen": 41903040, + "step": 19410 + }, + { + "epoch": 3.16721044045677, + "grad_norm": 0.1294279545545578, + "learning_rate": 0.000989663592690418, + "loss": 0.1284, + "num_input_tokens_seen": 41914272, + "step": 19415 + }, + { + "epoch": 3.168026101141925, + "grad_norm": 0.1290607899427414, + "learning_rate": 0.0009896491892866291, + "loss": 0.1653, + "num_input_tokens_seen": 41925184, + "step": 19420 + }, + { + "epoch": 3.16884176182708, + "grad_norm": 0.22190162539482117, + "learning_rate": 0.0009896347759594782, + "loss": 0.1609, + "num_input_tokens_seen": 41936288, + "step": 19425 + }, + { + "epoch": 3.169657422512235, + "grad_norm": 0.12256866693496704, + "learning_rate": 0.0009896203527092573, + "loss": 0.1221, + "num_input_tokens_seen": 41945984, + "step": 19430 + }, + { + "epoch": 3.1704730831973897, + "grad_norm": 0.41951707005500793, + "learning_rate": 0.000989605919536259, + "loss": 0.3257, + "num_input_tokens_seen": 41956256, + "step": 19435 + }, + { + "epoch": 3.171288743882545, + "grad_norm": 0.14430862665176392, + "learning_rate": 0.0009895914764407755, + "loss": 0.1256, + "num_input_tokens_seen": 41967040, + "step": 19440 + }, + { + "epoch": 3.1721044045677, + "grad_norm": 0.019953053444623947, + "learning_rate": 0.0009895770234230996, + "loss": 0.0735, + "num_input_tokens_seen": 41977664, + "step": 19445 + }, + { + "epoch": 3.1729200652528546, + "grad_norm": 0.14844955503940582, + "learning_rate": 0.0009895625604835244, + "loss": 0.1368, + "num_input_tokens_seen": 41988384, + "step": 19450 + }, + { + "epoch": 3.17373572593801, + "grad_norm": 0.13878247141838074, + "learning_rate": 0.0009895480876223428, + "loss": 0.2278, + "num_input_tokens_seen": 42000576, + "step": 19455 + }, + { + "epoch": 3.1745513866231647, + "grad_norm": 0.08862339705228806, + "learning_rate": 0.000989533604839848, + "loss": 0.0831, + "num_input_tokens_seen": 42010176, + "step": 19460 + }, + { + "epoch": 3.1753670473083195, + "grad_norm": 0.017274098470807076, + "learning_rate": 0.0009895191121363338, + "loss": 0.0661, + "num_input_tokens_seen": 42021248, + "step": 19465 + }, + { + "epoch": 3.176182707993475, + "grad_norm": 0.015619040466845036, + "learning_rate": 0.0009895046095120938, + "loss": 0.0732, + "num_input_tokens_seen": 42031712, + "step": 19470 + }, + { + "epoch": 3.1769983686786296, + "grad_norm": 0.06580100953578949, + "learning_rate": 0.0009894900969674221, + "loss": 0.1229, + "num_input_tokens_seen": 42042848, + "step": 19475 + }, + { + "epoch": 3.177814029363785, + "grad_norm": 0.16553306579589844, + "learning_rate": 0.0009894755745026124, + "loss": 0.1658, + "num_input_tokens_seen": 42053856, + "step": 19480 + }, + { + "epoch": 3.1786296900489397, + "grad_norm": 0.15429700911045074, + "learning_rate": 0.0009894610421179594, + "loss": 0.1264, + "num_input_tokens_seen": 42064864, + "step": 19485 + }, + { + "epoch": 3.1794453507340945, + "grad_norm": 0.1423814743757248, + "learning_rate": 0.0009894464998137572, + "loss": 0.162, + "num_input_tokens_seen": 42076864, + "step": 19490 + }, + { + "epoch": 3.1802610114192498, + "grad_norm": 0.04404854029417038, + "learning_rate": 0.000989431947590301, + "loss": 0.0947, + "num_input_tokens_seen": 42086592, + "step": 19495 + }, + { + "epoch": 3.1810766721044046, + "grad_norm": 0.0164632685482502, + "learning_rate": 0.0009894173854478854, + "loss": 0.168, + "num_input_tokens_seen": 42096736, + "step": 19500 + }, + { + "epoch": 3.1818923327895594, + "grad_norm": 0.030744237825274467, + "learning_rate": 0.0009894028133868055, + "loss": 0.1258, + "num_input_tokens_seen": 42106592, + "step": 19505 + }, + { + "epoch": 3.1827079934747147, + "grad_norm": 0.04916336014866829, + "learning_rate": 0.000989388231407357, + "loss": 0.1065, + "num_input_tokens_seen": 42117024, + "step": 19510 + }, + { + "epoch": 3.1835236541598695, + "grad_norm": 0.10267696529626846, + "learning_rate": 0.000989373639509835, + "loss": 0.0829, + "num_input_tokens_seen": 42127776, + "step": 19515 + }, + { + "epoch": 3.1843393148450243, + "grad_norm": 0.035392045974731445, + "learning_rate": 0.0009893590376945354, + "loss": 0.0877, + "num_input_tokens_seen": 42138976, + "step": 19520 + }, + { + "epoch": 3.1851549755301796, + "grad_norm": 0.1751907914876938, + "learning_rate": 0.000989344425961754, + "loss": 0.0854, + "num_input_tokens_seen": 42150752, + "step": 19525 + }, + { + "epoch": 3.1859706362153344, + "grad_norm": 0.041772108525037766, + "learning_rate": 0.000989329804311787, + "loss": 0.0753, + "num_input_tokens_seen": 42161696, + "step": 19530 + }, + { + "epoch": 3.186786296900489, + "grad_norm": 0.11701906472444534, + "learning_rate": 0.000989315172744931, + "loss": 0.0907, + "num_input_tokens_seen": 42173312, + "step": 19535 + }, + { + "epoch": 3.1876019575856445, + "grad_norm": 0.08318718522787094, + "learning_rate": 0.0009893005312614823, + "loss": 0.1805, + "num_input_tokens_seen": 42185344, + "step": 19540 + }, + { + "epoch": 3.1884176182707993, + "grad_norm": 0.0393078476190567, + "learning_rate": 0.0009892858798617374, + "loss": 0.0757, + "num_input_tokens_seen": 42196480, + "step": 19545 + }, + { + "epoch": 3.189233278955954, + "grad_norm": 0.11877913028001785, + "learning_rate": 0.0009892712185459935, + "loss": 0.0679, + "num_input_tokens_seen": 42206624, + "step": 19550 + }, + { + "epoch": 3.1900489396411094, + "grad_norm": 0.22260494530200958, + "learning_rate": 0.0009892565473145476, + "loss": 0.1515, + "num_input_tokens_seen": 42218688, + "step": 19555 + }, + { + "epoch": 3.190864600326264, + "grad_norm": 0.007652849890291691, + "learning_rate": 0.0009892418661676973, + "loss": 0.028, + "num_input_tokens_seen": 42228896, + "step": 19560 + }, + { + "epoch": 3.1916802610114194, + "grad_norm": 0.02648564614355564, + "learning_rate": 0.0009892271751057399, + "loss": 0.2077, + "num_input_tokens_seen": 42239424, + "step": 19565 + }, + { + "epoch": 3.1924959216965743, + "grad_norm": 0.027710402384400368, + "learning_rate": 0.000989212474128973, + "loss": 0.1494, + "num_input_tokens_seen": 42250272, + "step": 19570 + }, + { + "epoch": 3.193311582381729, + "grad_norm": 0.05966367945075035, + "learning_rate": 0.0009891977632376949, + "loss": 0.0477, + "num_input_tokens_seen": 42261024, + "step": 19575 + }, + { + "epoch": 3.1941272430668843, + "grad_norm": 0.012587291188538074, + "learning_rate": 0.0009891830424322034, + "loss": 0.083, + "num_input_tokens_seen": 42271680, + "step": 19580 + }, + { + "epoch": 3.194942903752039, + "grad_norm": 0.02283092960715294, + "learning_rate": 0.000989168311712797, + "loss": 0.0766, + "num_input_tokens_seen": 42281792, + "step": 19585 + }, + { + "epoch": 3.195758564437194, + "grad_norm": 0.011813844554126263, + "learning_rate": 0.0009891535710797744, + "loss": 0.0343, + "num_input_tokens_seen": 42292128, + "step": 19590 + }, + { + "epoch": 3.1965742251223492, + "grad_norm": 0.08824115991592407, + "learning_rate": 0.0009891388205334338, + "loss": 0.0597, + "num_input_tokens_seen": 42303232, + "step": 19595 + }, + { + "epoch": 3.197389885807504, + "grad_norm": 0.013826590031385422, + "learning_rate": 0.0009891240600740747, + "loss": 0.0541, + "num_input_tokens_seen": 42313056, + "step": 19600 + }, + { + "epoch": 3.198205546492659, + "grad_norm": 0.01044532097876072, + "learning_rate": 0.000989109289701996, + "loss": 0.0476, + "num_input_tokens_seen": 42324320, + "step": 19605 + }, + { + "epoch": 3.199021207177814, + "grad_norm": 0.046715158969163895, + "learning_rate": 0.000989094509417497, + "loss": 0.056, + "num_input_tokens_seen": 42332800, + "step": 19610 + }, + { + "epoch": 3.199836867862969, + "grad_norm": 0.5766527652740479, + "learning_rate": 0.0009890797192208774, + "loss": 0.2323, + "num_input_tokens_seen": 42344448, + "step": 19615 + }, + { + "epoch": 3.200652528548124, + "grad_norm": 0.016124876216053963, + "learning_rate": 0.0009890649191124368, + "loss": 0.1193, + "num_input_tokens_seen": 42355520, + "step": 19620 + }, + { + "epoch": 3.201468189233279, + "grad_norm": 0.03409599885344505, + "learning_rate": 0.000989050109092475, + "loss": 0.0899, + "num_input_tokens_seen": 42366656, + "step": 19625 + }, + { + "epoch": 3.202283849918434, + "grad_norm": 0.03628487139940262, + "learning_rate": 0.0009890352891612927, + "loss": 0.0315, + "num_input_tokens_seen": 42377824, + "step": 19630 + }, + { + "epoch": 3.203099510603589, + "grad_norm": 0.08518194407224655, + "learning_rate": 0.0009890204593191896, + "loss": 0.048, + "num_input_tokens_seen": 42388064, + "step": 19635 + }, + { + "epoch": 3.203915171288744, + "grad_norm": 0.0055519938468933105, + "learning_rate": 0.0009890056195664668, + "loss": 0.1133, + "num_input_tokens_seen": 42399648, + "step": 19640 + }, + { + "epoch": 3.2047308319738987, + "grad_norm": 0.001922906725667417, + "learning_rate": 0.0009889907699034246, + "loss": 0.0847, + "num_input_tokens_seen": 42409024, + "step": 19645 + }, + { + "epoch": 3.205546492659054, + "grad_norm": 0.08530575037002563, + "learning_rate": 0.000988975910330364, + "loss": 0.0273, + "num_input_tokens_seen": 42419552, + "step": 19650 + }, + { + "epoch": 3.206362153344209, + "grad_norm": 0.15175947546958923, + "learning_rate": 0.0009889610408475864, + "loss": 0.2094, + "num_input_tokens_seen": 42428992, + "step": 19655 + }, + { + "epoch": 3.2071778140293636, + "grad_norm": 0.010942497290670872, + "learning_rate": 0.000988946161455393, + "loss": 0.0368, + "num_input_tokens_seen": 42440736, + "step": 19660 + }, + { + "epoch": 3.207993474714519, + "grad_norm": 0.03400535136461258, + "learning_rate": 0.0009889312721540855, + "loss": 0.075, + "num_input_tokens_seen": 42451168, + "step": 19665 + }, + { + "epoch": 3.2088091353996737, + "grad_norm": 0.1479405164718628, + "learning_rate": 0.0009889163729439653, + "loss": 0.0774, + "num_input_tokens_seen": 42462528, + "step": 19670 + }, + { + "epoch": 3.2096247960848285, + "grad_norm": 0.01050618290901184, + "learning_rate": 0.0009889014638253346, + "loss": 0.0415, + "num_input_tokens_seen": 42471648, + "step": 19675 + }, + { + "epoch": 3.210440456769984, + "grad_norm": 0.10927870124578476, + "learning_rate": 0.0009888865447984956, + "loss": 0.108, + "num_input_tokens_seen": 42482944, + "step": 19680 + }, + { + "epoch": 3.2112561174551386, + "grad_norm": 0.28259843587875366, + "learning_rate": 0.0009888716158637505, + "loss": 0.1553, + "num_input_tokens_seen": 42493152, + "step": 19685 + }, + { + "epoch": 3.2120717781402934, + "grad_norm": 0.14287403225898743, + "learning_rate": 0.000988856677021402, + "loss": 0.0635, + "num_input_tokens_seen": 42504544, + "step": 19690 + }, + { + "epoch": 3.2128874388254487, + "grad_norm": 0.09954876452684402, + "learning_rate": 0.0009888417282717529, + "loss": 0.0765, + "num_input_tokens_seen": 42514496, + "step": 19695 + }, + { + "epoch": 3.2137030995106035, + "grad_norm": 0.1207166388630867, + "learning_rate": 0.000988826769615106, + "loss": 0.1568, + "num_input_tokens_seen": 42524704, + "step": 19700 + }, + { + "epoch": 3.2145187601957588, + "grad_norm": 0.2107514590024948, + "learning_rate": 0.0009888118010517642, + "loss": 0.1623, + "num_input_tokens_seen": 42534688, + "step": 19705 + }, + { + "epoch": 3.2153344208809136, + "grad_norm": 0.014131457544863224, + "learning_rate": 0.0009887968225820315, + "loss": 0.2011, + "num_input_tokens_seen": 42546016, + "step": 19710 + }, + { + "epoch": 3.2161500815660684, + "grad_norm": 0.03957388922572136, + "learning_rate": 0.0009887818342062106, + "loss": 0.1014, + "num_input_tokens_seen": 42556672, + "step": 19715 + }, + { + "epoch": 3.2169657422512237, + "grad_norm": 0.01063316036015749, + "learning_rate": 0.0009887668359246063, + "loss": 0.1195, + "num_input_tokens_seen": 42567168, + "step": 19720 + }, + { + "epoch": 3.2177814029363785, + "grad_norm": 0.0050678858533501625, + "learning_rate": 0.0009887518277375217, + "loss": 0.0283, + "num_input_tokens_seen": 42577760, + "step": 19725 + }, + { + "epoch": 3.2185970636215333, + "grad_norm": 0.11844401061534882, + "learning_rate": 0.0009887368096452617, + "loss": 0.1656, + "num_input_tokens_seen": 42588672, + "step": 19730 + }, + { + "epoch": 3.2194127243066886, + "grad_norm": 0.03981683403253555, + "learning_rate": 0.0009887217816481298, + "loss": 0.0253, + "num_input_tokens_seen": 42600480, + "step": 19735 + }, + { + "epoch": 3.2202283849918434, + "grad_norm": 0.004193587694317102, + "learning_rate": 0.0009887067437464312, + "loss": 0.0888, + "num_input_tokens_seen": 42612992, + "step": 19740 + }, + { + "epoch": 3.221044045676998, + "grad_norm": 0.05297451093792915, + "learning_rate": 0.0009886916959404703, + "loss": 0.0781, + "num_input_tokens_seen": 42624576, + "step": 19745 + }, + { + "epoch": 3.2218597063621535, + "grad_norm": 0.040194373577833176, + "learning_rate": 0.0009886766382305526, + "loss": 0.1213, + "num_input_tokens_seen": 42636128, + "step": 19750 + }, + { + "epoch": 3.2226753670473083, + "grad_norm": 0.010000188834965229, + "learning_rate": 0.0009886615706169825, + "loss": 0.0883, + "num_input_tokens_seen": 42646304, + "step": 19755 + }, + { + "epoch": 3.223491027732463, + "grad_norm": 0.019517701119184494, + "learning_rate": 0.0009886464931000661, + "loss": 0.153, + "num_input_tokens_seen": 42656576, + "step": 19760 + }, + { + "epoch": 3.2243066884176184, + "grad_norm": 0.03761085122823715, + "learning_rate": 0.0009886314056801084, + "loss": 0.0191, + "num_input_tokens_seen": 42666624, + "step": 19765 + }, + { + "epoch": 3.225122349102773, + "grad_norm": 0.1993914246559143, + "learning_rate": 0.0009886163083574154, + "loss": 0.2123, + "num_input_tokens_seen": 42676096, + "step": 19770 + }, + { + "epoch": 3.225938009787928, + "grad_norm": 0.04169526696205139, + "learning_rate": 0.000988601201132293, + "loss": 0.0656, + "num_input_tokens_seen": 42686752, + "step": 19775 + }, + { + "epoch": 3.2267536704730833, + "grad_norm": 0.1619374006986618, + "learning_rate": 0.0009885860840050478, + "loss": 0.1514, + "num_input_tokens_seen": 42697792, + "step": 19780 + }, + { + "epoch": 3.227569331158238, + "grad_norm": 0.038715824484825134, + "learning_rate": 0.0009885709569759852, + "loss": 0.0604, + "num_input_tokens_seen": 42708384, + "step": 19785 + }, + { + "epoch": 3.2283849918433933, + "grad_norm": 0.21826331317424774, + "learning_rate": 0.0009885558200454128, + "loss": 0.1557, + "num_input_tokens_seen": 42719136, + "step": 19790 + }, + { + "epoch": 3.229200652528548, + "grad_norm": 0.006297794170677662, + "learning_rate": 0.0009885406732136367, + "loss": 0.0863, + "num_input_tokens_seen": 42728896, + "step": 19795 + }, + { + "epoch": 3.230016313213703, + "grad_norm": 0.1730261743068695, + "learning_rate": 0.0009885255164809644, + "loss": 0.1161, + "num_input_tokens_seen": 42740512, + "step": 19800 + }, + { + "epoch": 3.2308319738988582, + "grad_norm": 0.047330789268016815, + "learning_rate": 0.0009885103498477026, + "loss": 0.0437, + "num_input_tokens_seen": 42750240, + "step": 19805 + }, + { + "epoch": 3.231647634584013, + "grad_norm": 0.016037365421652794, + "learning_rate": 0.0009884951733141586, + "loss": 0.0127, + "num_input_tokens_seen": 42760672, + "step": 19810 + }, + { + "epoch": 3.232463295269168, + "grad_norm": 0.25029993057250977, + "learning_rate": 0.0009884799868806406, + "loss": 0.2666, + "num_input_tokens_seen": 42771680, + "step": 19815 + }, + { + "epoch": 3.233278955954323, + "grad_norm": 0.0780901089310646, + "learning_rate": 0.000988464790547456, + "loss": 0.1638, + "num_input_tokens_seen": 42782048, + "step": 19820 + }, + { + "epoch": 3.234094616639478, + "grad_norm": 0.24309828877449036, + "learning_rate": 0.0009884495843149124, + "loss": 0.155, + "num_input_tokens_seen": 42792864, + "step": 19825 + }, + { + "epoch": 3.2349102773246328, + "grad_norm": 0.012334475293755531, + "learning_rate": 0.0009884343681833185, + "loss": 0.0939, + "num_input_tokens_seen": 42803904, + "step": 19830 + }, + { + "epoch": 3.235725938009788, + "grad_norm": 0.05174738168716431, + "learning_rate": 0.0009884191421529825, + "loss": 0.0615, + "num_input_tokens_seen": 42814720, + "step": 19835 + }, + { + "epoch": 3.236541598694943, + "grad_norm": 0.03651052340865135, + "learning_rate": 0.000988403906224213, + "loss": 0.0534, + "num_input_tokens_seen": 42825856, + "step": 19840 + }, + { + "epoch": 3.237357259380098, + "grad_norm": 0.05609550327062607, + "learning_rate": 0.0009883886603973188, + "loss": 0.1694, + "num_input_tokens_seen": 42837408, + "step": 19845 + }, + { + "epoch": 3.238172920065253, + "grad_norm": 0.16002701222896576, + "learning_rate": 0.0009883734046726086, + "loss": 0.1204, + "num_input_tokens_seen": 42848960, + "step": 19850 + }, + { + "epoch": 3.2389885807504077, + "grad_norm": 0.007556559983640909, + "learning_rate": 0.0009883581390503922, + "loss": 0.2185, + "num_input_tokens_seen": 42859168, + "step": 19855 + }, + { + "epoch": 3.239804241435563, + "grad_norm": 0.03003288060426712, + "learning_rate": 0.0009883428635309784, + "loss": 0.0832, + "num_input_tokens_seen": 42867872, + "step": 19860 + }, + { + "epoch": 3.240619902120718, + "grad_norm": 0.1715545356273651, + "learning_rate": 0.0009883275781146768, + "loss": 0.2003, + "num_input_tokens_seen": 42879264, + "step": 19865 + }, + { + "epoch": 3.2414355628058726, + "grad_norm": 0.01096479594707489, + "learning_rate": 0.0009883122828017977, + "loss": 0.0333, + "num_input_tokens_seen": 42890816, + "step": 19870 + }, + { + "epoch": 3.242251223491028, + "grad_norm": 0.25855255126953125, + "learning_rate": 0.0009882969775926505, + "loss": 0.1685, + "num_input_tokens_seen": 42902400, + "step": 19875 + }, + { + "epoch": 3.2430668841761827, + "grad_norm": 0.09562316536903381, + "learning_rate": 0.0009882816624875454, + "loss": 0.1369, + "num_input_tokens_seen": 42913152, + "step": 19880 + }, + { + "epoch": 3.2438825448613375, + "grad_norm": 0.28010687232017517, + "learning_rate": 0.0009882663374867933, + "loss": 0.2001, + "num_input_tokens_seen": 42924672, + "step": 19885 + }, + { + "epoch": 3.244698205546493, + "grad_norm": 0.01753748208284378, + "learning_rate": 0.0009882510025907042, + "loss": 0.1653, + "num_input_tokens_seen": 42934880, + "step": 19890 + }, + { + "epoch": 3.2455138662316476, + "grad_norm": 0.009085813537240028, + "learning_rate": 0.0009882356577995894, + "loss": 0.0553, + "num_input_tokens_seen": 42946176, + "step": 19895 + }, + { + "epoch": 3.2463295269168024, + "grad_norm": 0.058416616171598434, + "learning_rate": 0.0009882203031137595, + "loss": 0.0896, + "num_input_tokens_seen": 42958016, + "step": 19900 + }, + { + "epoch": 3.2471451876019577, + "grad_norm": 0.0504441037774086, + "learning_rate": 0.000988204938533526, + "loss": 0.0639, + "num_input_tokens_seen": 42968320, + "step": 19905 + }, + { + "epoch": 3.2479608482871125, + "grad_norm": 0.16939640045166016, + "learning_rate": 0.0009881895640591997, + "loss": 0.141, + "num_input_tokens_seen": 42980096, + "step": 19910 + }, + { + "epoch": 3.2487765089722673, + "grad_norm": 0.1258719116449356, + "learning_rate": 0.0009881741796910928, + "loss": 0.1493, + "num_input_tokens_seen": 42990304, + "step": 19915 + }, + { + "epoch": 3.2495921696574226, + "grad_norm": 0.009623922407627106, + "learning_rate": 0.0009881587854295168, + "loss": 0.1725, + "num_input_tokens_seen": 43001280, + "step": 19920 + }, + { + "epoch": 3.2504078303425774, + "grad_norm": 0.08955050259828568, + "learning_rate": 0.0009881433812747838, + "loss": 0.0576, + "num_input_tokens_seen": 43011584, + "step": 19925 + }, + { + "epoch": 3.2512234910277327, + "grad_norm": 0.0080439243465662, + "learning_rate": 0.000988127967227206, + "loss": 0.0997, + "num_input_tokens_seen": 43022432, + "step": 19930 + }, + { + "epoch": 3.2520391517128875, + "grad_norm": 0.03764588385820389, + "learning_rate": 0.0009881125432870956, + "loss": 0.1969, + "num_input_tokens_seen": 43033792, + "step": 19935 + }, + { + "epoch": 3.2528548123980423, + "grad_norm": 0.02885637991130352, + "learning_rate": 0.0009880971094547652, + "loss": 0.2073, + "num_input_tokens_seen": 43045504, + "step": 19940 + }, + { + "epoch": 3.2536704730831976, + "grad_norm": 0.01383949164301157, + "learning_rate": 0.0009880816657305278, + "loss": 0.0947, + "num_input_tokens_seen": 43056544, + "step": 19945 + }, + { + "epoch": 3.2544861337683524, + "grad_norm": 0.17397625744342804, + "learning_rate": 0.0009880662121146964, + "loss": 0.1455, + "num_input_tokens_seen": 43065184, + "step": 19950 + }, + { + "epoch": 3.255301794453507, + "grad_norm": 0.022838570177555084, + "learning_rate": 0.0009880507486075838, + "loss": 0.0462, + "num_input_tokens_seen": 43074784, + "step": 19955 + }, + { + "epoch": 3.2561174551386625, + "grad_norm": 0.197303906083107, + "learning_rate": 0.0009880352752095038, + "loss": 0.0948, + "num_input_tokens_seen": 43084864, + "step": 19960 + }, + { + "epoch": 3.2569331158238173, + "grad_norm": 0.023669840767979622, + "learning_rate": 0.0009880197919207698, + "loss": 0.2483, + "num_input_tokens_seen": 43096576, + "step": 19965 + }, + { + "epoch": 3.257748776508972, + "grad_norm": 0.13685080409049988, + "learning_rate": 0.0009880042987416957, + "loss": 0.1159, + "num_input_tokens_seen": 43106816, + "step": 19970 + }, + { + "epoch": 3.2585644371941274, + "grad_norm": 0.07496922463178635, + "learning_rate": 0.0009879887956725953, + "loss": 0.0469, + "num_input_tokens_seen": 43118304, + "step": 19975 + }, + { + "epoch": 3.259380097879282, + "grad_norm": 0.04282359406352043, + "learning_rate": 0.0009879732827137828, + "loss": 0.2092, + "num_input_tokens_seen": 43128384, + "step": 19980 + }, + { + "epoch": 3.2601957585644374, + "grad_norm": 0.15203064680099487, + "learning_rate": 0.0009879577598655728, + "loss": 0.1565, + "num_input_tokens_seen": 43139136, + "step": 19985 + }, + { + "epoch": 3.2610114192495923, + "grad_norm": 0.04612713307142258, + "learning_rate": 0.0009879422271282798, + "loss": 0.0874, + "num_input_tokens_seen": 43150048, + "step": 19990 + }, + { + "epoch": 3.261827079934747, + "grad_norm": 0.20606836676597595, + "learning_rate": 0.0009879266845022187, + "loss": 0.1295, + "num_input_tokens_seen": 43159456, + "step": 19995 + }, + { + "epoch": 3.262642740619902, + "grad_norm": 0.1792658269405365, + "learning_rate": 0.0009879111319877041, + "loss": 0.1846, + "num_input_tokens_seen": 43170816, + "step": 20000 + }, + { + "epoch": 3.263458401305057, + "grad_norm": 0.17177966237068176, + "learning_rate": 0.0009878955695850516, + "loss": 0.1721, + "num_input_tokens_seen": 43182336, + "step": 20005 + }, + { + "epoch": 3.264274061990212, + "grad_norm": 0.169560045003891, + "learning_rate": 0.0009878799972945762, + "loss": 0.2424, + "num_input_tokens_seen": 43192384, + "step": 20010 + }, + { + "epoch": 3.2650897226753672, + "grad_norm": 0.09047980606555939, + "learning_rate": 0.000987864415116594, + "loss": 0.1982, + "num_input_tokens_seen": 43203744, + "step": 20015 + }, + { + "epoch": 3.265905383360522, + "grad_norm": 0.1685606986284256, + "learning_rate": 0.0009878488230514206, + "loss": 0.1224, + "num_input_tokens_seen": 43213376, + "step": 20020 + }, + { + "epoch": 3.266721044045677, + "grad_norm": 0.1426238715648651, + "learning_rate": 0.0009878332210993717, + "loss": 0.2, + "num_input_tokens_seen": 43223744, + "step": 20025 + }, + { + "epoch": 3.267536704730832, + "grad_norm": 0.1901000738143921, + "learning_rate": 0.0009878176092607638, + "loss": 0.1779, + "num_input_tokens_seen": 43234752, + "step": 20030 + }, + { + "epoch": 3.268352365415987, + "grad_norm": 0.1842803806066513, + "learning_rate": 0.0009878019875359132, + "loss": 0.1895, + "num_input_tokens_seen": 43245280, + "step": 20035 + }, + { + "epoch": 3.2691680261011418, + "grad_norm": 0.08340982347726822, + "learning_rate": 0.0009877863559251366, + "loss": 0.0951, + "num_input_tokens_seen": 43256672, + "step": 20040 + }, + { + "epoch": 3.269983686786297, + "grad_norm": 0.10031823068857193, + "learning_rate": 0.0009877707144287505, + "loss": 0.1966, + "num_input_tokens_seen": 43268288, + "step": 20045 + }, + { + "epoch": 3.270799347471452, + "grad_norm": 0.04976049065589905, + "learning_rate": 0.0009877550630470722, + "loss": 0.0387, + "num_input_tokens_seen": 43278080, + "step": 20050 + }, + { + "epoch": 3.2716150081566067, + "grad_norm": 0.01608528569340706, + "learning_rate": 0.000987739401780419, + "loss": 0.0503, + "num_input_tokens_seen": 43287392, + "step": 20055 + }, + { + "epoch": 3.272430668841762, + "grad_norm": 0.04355829209089279, + "learning_rate": 0.0009877237306291076, + "loss": 0.0554, + "num_input_tokens_seen": 43297824, + "step": 20060 + }, + { + "epoch": 3.2732463295269167, + "grad_norm": 0.1150590106844902, + "learning_rate": 0.0009877080495934564, + "loss": 0.1144, + "num_input_tokens_seen": 43308864, + "step": 20065 + }, + { + "epoch": 3.274061990212072, + "grad_norm": 0.17519082129001617, + "learning_rate": 0.0009876923586737828, + "loss": 0.1355, + "num_input_tokens_seen": 43318912, + "step": 20070 + }, + { + "epoch": 3.274877650897227, + "grad_norm": 0.01690365932881832, + "learning_rate": 0.000987676657870405, + "loss": 0.0957, + "num_input_tokens_seen": 43328896, + "step": 20075 + }, + { + "epoch": 3.2756933115823816, + "grad_norm": 0.02377474680542946, + "learning_rate": 0.0009876609471836408, + "loss": 0.0756, + "num_input_tokens_seen": 43340512, + "step": 20080 + }, + { + "epoch": 3.2765089722675365, + "grad_norm": 0.04190889745950699, + "learning_rate": 0.000987645226613809, + "loss": 0.0488, + "num_input_tokens_seen": 43351840, + "step": 20085 + }, + { + "epoch": 3.2773246329526917, + "grad_norm": 0.11968051642179489, + "learning_rate": 0.0009876294961612283, + "loss": 0.2708, + "num_input_tokens_seen": 43363328, + "step": 20090 + }, + { + "epoch": 3.2781402936378465, + "grad_norm": 0.059578005224466324, + "learning_rate": 0.0009876137558262168, + "loss": 0.1248, + "num_input_tokens_seen": 43374688, + "step": 20095 + }, + { + "epoch": 3.278955954323002, + "grad_norm": 0.034352321177721024, + "learning_rate": 0.0009875980056090943, + "loss": 0.1441, + "num_input_tokens_seen": 43384832, + "step": 20100 + }, + { + "epoch": 3.2797716150081566, + "grad_norm": 0.12010779976844788, + "learning_rate": 0.0009875822455101795, + "loss": 0.0987, + "num_input_tokens_seen": 43396352, + "step": 20105 + }, + { + "epoch": 3.2805872756933114, + "grad_norm": 0.31800228357315063, + "learning_rate": 0.000987566475529792, + "loss": 0.0535, + "num_input_tokens_seen": 43407104, + "step": 20110 + }, + { + "epoch": 3.2814029363784667, + "grad_norm": 0.05476715415716171, + "learning_rate": 0.0009875506956682513, + "loss": 0.0708, + "num_input_tokens_seen": 43418816, + "step": 20115 + }, + { + "epoch": 3.2822185970636215, + "grad_norm": 0.006512451451271772, + "learning_rate": 0.0009875349059258773, + "loss": 0.0835, + "num_input_tokens_seen": 43429088, + "step": 20120 + }, + { + "epoch": 3.2830342577487763, + "grad_norm": 0.0856441929936409, + "learning_rate": 0.00098751910630299, + "loss": 0.0643, + "num_input_tokens_seen": 43439744, + "step": 20125 + }, + { + "epoch": 3.2838499184339316, + "grad_norm": 0.049245286732912064, + "learning_rate": 0.0009875032967999096, + "loss": 0.0171, + "num_input_tokens_seen": 43451264, + "step": 20130 + }, + { + "epoch": 3.2846655791190864, + "grad_norm": 0.004814895335584879, + "learning_rate": 0.0009874874774169562, + "loss": 0.1204, + "num_input_tokens_seen": 43461696, + "step": 20135 + }, + { + "epoch": 3.2854812398042412, + "grad_norm": 0.10136290639638901, + "learning_rate": 0.0009874716481544509, + "loss": 0.0382, + "num_input_tokens_seen": 43471904, + "step": 20140 + }, + { + "epoch": 3.2862969004893965, + "grad_norm": 0.012933127582073212, + "learning_rate": 0.0009874558090127142, + "loss": 0.2296, + "num_input_tokens_seen": 43482400, + "step": 20145 + }, + { + "epoch": 3.2871125611745513, + "grad_norm": 0.022629836574196815, + "learning_rate": 0.0009874399599920669, + "loss": 0.12, + "num_input_tokens_seen": 43492160, + "step": 20150 + }, + { + "epoch": 3.2879282218597066, + "grad_norm": 0.08124856650829315, + "learning_rate": 0.0009874241010928307, + "loss": 0.081, + "num_input_tokens_seen": 43502688, + "step": 20155 + }, + { + "epoch": 3.2887438825448614, + "grad_norm": 0.09135818481445312, + "learning_rate": 0.0009874082323153266, + "loss": 0.1027, + "num_input_tokens_seen": 43513184, + "step": 20160 + }, + { + "epoch": 3.289559543230016, + "grad_norm": 0.21702003479003906, + "learning_rate": 0.0009873923536598765, + "loss": 0.2271, + "num_input_tokens_seen": 43522592, + "step": 20165 + }, + { + "epoch": 3.2903752039151715, + "grad_norm": 0.0939573347568512, + "learning_rate": 0.000987376465126802, + "loss": 0.0761, + "num_input_tokens_seen": 43533312, + "step": 20170 + }, + { + "epoch": 3.2911908646003263, + "grad_norm": 0.121924489736557, + "learning_rate": 0.0009873605667164252, + "loss": 0.1761, + "num_input_tokens_seen": 43543808, + "step": 20175 + }, + { + "epoch": 3.292006525285481, + "grad_norm": 0.19730672240257263, + "learning_rate": 0.0009873446584290682, + "loss": 0.2502, + "num_input_tokens_seen": 43554528, + "step": 20180 + }, + { + "epoch": 3.2928221859706364, + "grad_norm": 0.03863910958170891, + "learning_rate": 0.0009873287402650535, + "loss": 0.0603, + "num_input_tokens_seen": 43564960, + "step": 20185 + }, + { + "epoch": 3.293637846655791, + "grad_norm": 0.008187521249055862, + "learning_rate": 0.0009873128122247035, + "loss": 0.1319, + "num_input_tokens_seen": 43576576, + "step": 20190 + }, + { + "epoch": 3.294453507340946, + "grad_norm": 0.1582612842321396, + "learning_rate": 0.0009872968743083414, + "loss": 0.1499, + "num_input_tokens_seen": 43588576, + "step": 20195 + }, + { + "epoch": 3.2952691680261013, + "grad_norm": 0.03137233480811119, + "learning_rate": 0.0009872809265162898, + "loss": 0.2646, + "num_input_tokens_seen": 43598912, + "step": 20200 + }, + { + "epoch": 3.296084828711256, + "grad_norm": 0.03531002253293991, + "learning_rate": 0.000987264968848872, + "loss": 0.1021, + "num_input_tokens_seen": 43609568, + "step": 20205 + }, + { + "epoch": 3.2969004893964113, + "grad_norm": 0.02927856706082821, + "learning_rate": 0.0009872490013064117, + "loss": 0.0802, + "num_input_tokens_seen": 43619904, + "step": 20210 + }, + { + "epoch": 3.297716150081566, + "grad_norm": 0.027753494679927826, + "learning_rate": 0.000987233023889232, + "loss": 0.1015, + "num_input_tokens_seen": 43630976, + "step": 20215 + }, + { + "epoch": 3.298531810766721, + "grad_norm": 0.05915002152323723, + "learning_rate": 0.000987217036597657, + "loss": 0.037, + "num_input_tokens_seen": 43640384, + "step": 20220 + }, + { + "epoch": 3.299347471451876, + "grad_norm": 0.17940424382686615, + "learning_rate": 0.000987201039432011, + "loss": 0.205, + "num_input_tokens_seen": 43650912, + "step": 20225 + }, + { + "epoch": 3.300163132137031, + "grad_norm": 0.0787631943821907, + "learning_rate": 0.0009871850323926177, + "loss": 0.1568, + "num_input_tokens_seen": 43662368, + "step": 20230 + }, + { + "epoch": 3.300978792822186, + "grad_norm": 0.24962536990642548, + "learning_rate": 0.0009871690154798017, + "loss": 0.1478, + "num_input_tokens_seen": 43674048, + "step": 20235 + }, + { + "epoch": 3.301794453507341, + "grad_norm": 0.035794150084257126, + "learning_rate": 0.0009871529886938874, + "loss": 0.1075, + "num_input_tokens_seen": 43685120, + "step": 20240 + }, + { + "epoch": 3.302610114192496, + "grad_norm": 0.013689146377146244, + "learning_rate": 0.0009871369520352, + "loss": 0.0702, + "num_input_tokens_seen": 43696288, + "step": 20245 + }, + { + "epoch": 3.3034257748776508, + "grad_norm": 0.050041962414979935, + "learning_rate": 0.0009871209055040643, + "loss": 0.1481, + "num_input_tokens_seen": 43708000, + "step": 20250 + }, + { + "epoch": 3.304241435562806, + "grad_norm": 0.0076757390052080154, + "learning_rate": 0.0009871048491008052, + "loss": 0.0804, + "num_input_tokens_seen": 43720416, + "step": 20255 + }, + { + "epoch": 3.305057096247961, + "grad_norm": 0.017270803451538086, + "learning_rate": 0.0009870887828257486, + "loss": 0.2065, + "num_input_tokens_seen": 43730080, + "step": 20260 + }, + { + "epoch": 3.3058727569331157, + "grad_norm": 0.10030859708786011, + "learning_rate": 0.00098707270667922, + "loss": 0.0587, + "num_input_tokens_seen": 43740896, + "step": 20265 + }, + { + "epoch": 3.306688417618271, + "grad_norm": 0.22853267192840576, + "learning_rate": 0.000987056620661545, + "loss": 0.1822, + "num_input_tokens_seen": 43752128, + "step": 20270 + }, + { + "epoch": 3.3075040783034257, + "grad_norm": 0.019631093367934227, + "learning_rate": 0.0009870405247730497, + "loss": 0.0636, + "num_input_tokens_seen": 43763008, + "step": 20275 + }, + { + "epoch": 3.3083197389885806, + "grad_norm": 0.07752983272075653, + "learning_rate": 0.0009870244190140602, + "loss": 0.1288, + "num_input_tokens_seen": 43774144, + "step": 20280 + }, + { + "epoch": 3.309135399673736, + "grad_norm": 0.14323344826698303, + "learning_rate": 0.000987008303384903, + "loss": 0.1305, + "num_input_tokens_seen": 43785280, + "step": 20285 + }, + { + "epoch": 3.3099510603588906, + "grad_norm": 0.01318053063005209, + "learning_rate": 0.000986992177885905, + "loss": 0.0226, + "num_input_tokens_seen": 43796288, + "step": 20290 + }, + { + "epoch": 3.310766721044046, + "grad_norm": 0.08851774781942368, + "learning_rate": 0.0009869760425173927, + "loss": 0.0516, + "num_input_tokens_seen": 43807392, + "step": 20295 + }, + { + "epoch": 3.3115823817292007, + "grad_norm": 0.08077265322208405, + "learning_rate": 0.000986959897279693, + "loss": 0.1776, + "num_input_tokens_seen": 43819328, + "step": 20300 + }, + { + "epoch": 3.3123980424143555, + "grad_norm": 0.0336269810795784, + "learning_rate": 0.0009869437421731332, + "loss": 0.0523, + "num_input_tokens_seen": 43829632, + "step": 20305 + }, + { + "epoch": 3.3132137030995104, + "grad_norm": 0.08483406901359558, + "learning_rate": 0.0009869275771980405, + "loss": 0.1331, + "num_input_tokens_seen": 43839520, + "step": 20310 + }, + { + "epoch": 3.3140293637846656, + "grad_norm": 0.11510413885116577, + "learning_rate": 0.000986911402354743, + "loss": 0.0706, + "num_input_tokens_seen": 43850048, + "step": 20315 + }, + { + "epoch": 3.3148450244698204, + "grad_norm": 0.027269015088677406, + "learning_rate": 0.0009868952176435683, + "loss": 0.0215, + "num_input_tokens_seen": 43860480, + "step": 20320 + }, + { + "epoch": 3.3156606851549757, + "grad_norm": 0.03962091729044914, + "learning_rate": 0.0009868790230648443, + "loss": 0.0508, + "num_input_tokens_seen": 43871456, + "step": 20325 + }, + { + "epoch": 3.3164763458401305, + "grad_norm": 0.1779964119195938, + "learning_rate": 0.0009868628186188993, + "loss": 0.1266, + "num_input_tokens_seen": 43882208, + "step": 20330 + }, + { + "epoch": 3.3172920065252853, + "grad_norm": 0.0028698297683149576, + "learning_rate": 0.0009868466043060616, + "loss": 0.0132, + "num_input_tokens_seen": 43893568, + "step": 20335 + }, + { + "epoch": 3.3181076672104406, + "grad_norm": 0.0065576727502048016, + "learning_rate": 0.00098683038012666, + "loss": 0.1158, + "num_input_tokens_seen": 43904736, + "step": 20340 + }, + { + "epoch": 3.3189233278955954, + "grad_norm": 0.026138747110962868, + "learning_rate": 0.0009868141460810226, + "loss": 0.0734, + "num_input_tokens_seen": 43915232, + "step": 20345 + }, + { + "epoch": 3.3197389885807502, + "grad_norm": 0.007043990772217512, + "learning_rate": 0.0009867979021694795, + "loss": 0.2558, + "num_input_tokens_seen": 43925664, + "step": 20350 + }, + { + "epoch": 3.3205546492659055, + "grad_norm": 0.026781899854540825, + "learning_rate": 0.0009867816483923593, + "loss": 0.0822, + "num_input_tokens_seen": 43936736, + "step": 20355 + }, + { + "epoch": 3.3213703099510603, + "grad_norm": 0.19813309609889984, + "learning_rate": 0.0009867653847499913, + "loss": 0.2725, + "num_input_tokens_seen": 43948320, + "step": 20360 + }, + { + "epoch": 3.322185970636215, + "grad_norm": 0.03146130591630936, + "learning_rate": 0.0009867491112427055, + "loss": 0.0248, + "num_input_tokens_seen": 43958304, + "step": 20365 + }, + { + "epoch": 3.3230016313213704, + "grad_norm": 0.027321165427565575, + "learning_rate": 0.0009867328278708313, + "loss": 0.1294, + "num_input_tokens_seen": 43969120, + "step": 20370 + }, + { + "epoch": 3.323817292006525, + "grad_norm": 0.031976111233234406, + "learning_rate": 0.0009867165346346988, + "loss": 0.1004, + "num_input_tokens_seen": 43980768, + "step": 20375 + }, + { + "epoch": 3.3246329526916805, + "grad_norm": 0.10402572154998779, + "learning_rate": 0.0009867002315346383, + "loss": 0.1289, + "num_input_tokens_seen": 43991360, + "step": 20380 + }, + { + "epoch": 3.3254486133768353, + "grad_norm": 0.06584568321704865, + "learning_rate": 0.0009866839185709805, + "loss": 0.1333, + "num_input_tokens_seen": 44002464, + "step": 20385 + }, + { + "epoch": 3.32626427406199, + "grad_norm": 0.024981629103422165, + "learning_rate": 0.0009866675957440553, + "loss": 0.0691, + "num_input_tokens_seen": 44011264, + "step": 20390 + }, + { + "epoch": 3.3270799347471454, + "grad_norm": 0.08106168359518051, + "learning_rate": 0.0009866512630541942, + "loss": 0.0867, + "num_input_tokens_seen": 44021632, + "step": 20395 + }, + { + "epoch": 3.3278955954323, + "grad_norm": 0.014570559374988079, + "learning_rate": 0.0009866349205017277, + "loss": 0.0888, + "num_input_tokens_seen": 44031872, + "step": 20400 + }, + { + "epoch": 3.328711256117455, + "grad_norm": 0.04672045260667801, + "learning_rate": 0.0009866185680869873, + "loss": 0.1426, + "num_input_tokens_seen": 44044064, + "step": 20405 + }, + { + "epoch": 3.3295269168026103, + "grad_norm": 0.2176208347082138, + "learning_rate": 0.0009866022058103042, + "loss": 0.2371, + "num_input_tokens_seen": 44055488, + "step": 20410 + }, + { + "epoch": 3.330342577487765, + "grad_norm": 0.07017062604427338, + "learning_rate": 0.0009865858336720102, + "loss": 0.0618, + "num_input_tokens_seen": 44066496, + "step": 20415 + }, + { + "epoch": 3.33115823817292, + "grad_norm": 0.028713030740618706, + "learning_rate": 0.000986569451672437, + "loss": 0.1338, + "num_input_tokens_seen": 44077664, + "step": 20420 + }, + { + "epoch": 3.331973898858075, + "grad_norm": 0.024329572916030884, + "learning_rate": 0.0009865530598119163, + "loss": 0.1751, + "num_input_tokens_seen": 44089344, + "step": 20425 + }, + { + "epoch": 3.33278955954323, + "grad_norm": 0.02615816704928875, + "learning_rate": 0.000986536658090781, + "loss": 0.0218, + "num_input_tokens_seen": 44100288, + "step": 20430 + }, + { + "epoch": 3.3336052202283852, + "grad_norm": 0.0656154528260231, + "learning_rate": 0.0009865202465093631, + "loss": 0.0472, + "num_input_tokens_seen": 44111040, + "step": 20435 + }, + { + "epoch": 3.33442088091354, + "grad_norm": 0.0606275238096714, + "learning_rate": 0.000986503825067995, + "loss": 0.0719, + "num_input_tokens_seen": 44122144, + "step": 20440 + }, + { + "epoch": 3.335236541598695, + "grad_norm": 0.037974048405885696, + "learning_rate": 0.0009864873937670098, + "loss": 0.1324, + "num_input_tokens_seen": 44133344, + "step": 20445 + }, + { + "epoch": 3.3360522022838497, + "grad_norm": 0.10318107903003693, + "learning_rate": 0.0009864709526067404, + "loss": 0.0619, + "num_input_tokens_seen": 44143520, + "step": 20450 + }, + { + "epoch": 3.336867862969005, + "grad_norm": 0.008245442062616348, + "learning_rate": 0.0009864545015875199, + "loss": 0.0176, + "num_input_tokens_seen": 44154496, + "step": 20455 + }, + { + "epoch": 3.3376835236541598, + "grad_norm": 0.024953382089734077, + "learning_rate": 0.000986438040709682, + "loss": 0.026, + "num_input_tokens_seen": 44165504, + "step": 20460 + }, + { + "epoch": 3.338499184339315, + "grad_norm": 0.09225403517484665, + "learning_rate": 0.00098642156997356, + "loss": 0.1685, + "num_input_tokens_seen": 44177472, + "step": 20465 + }, + { + "epoch": 3.33931484502447, + "grad_norm": 0.05510839819908142, + "learning_rate": 0.0009864050893794878, + "loss": 0.1055, + "num_input_tokens_seen": 44187712, + "step": 20470 + }, + { + "epoch": 3.3401305057096247, + "grad_norm": 0.09082439541816711, + "learning_rate": 0.0009863885989277994, + "loss": 0.1813, + "num_input_tokens_seen": 44198816, + "step": 20475 + }, + { + "epoch": 3.34094616639478, + "grad_norm": 0.23258695006370544, + "learning_rate": 0.0009863720986188291, + "loss": 0.2204, + "num_input_tokens_seen": 44209600, + "step": 20480 + }, + { + "epoch": 3.3417618270799347, + "grad_norm": 0.1346280574798584, + "learning_rate": 0.0009863555884529114, + "loss": 0.106, + "num_input_tokens_seen": 44220768, + "step": 20485 + }, + { + "epoch": 3.3425774877650896, + "grad_norm": 0.011103808879852295, + "learning_rate": 0.0009863390684303804, + "loss": 0.1948, + "num_input_tokens_seen": 44231936, + "step": 20490 + }, + { + "epoch": 3.343393148450245, + "grad_norm": 0.01016100775450468, + "learning_rate": 0.0009863225385515714, + "loss": 0.0656, + "num_input_tokens_seen": 44242912, + "step": 20495 + }, + { + "epoch": 3.3442088091353996, + "grad_norm": 0.012500526383519173, + "learning_rate": 0.000986305998816819, + "loss": 0.1167, + "num_input_tokens_seen": 44254144, + "step": 20500 + }, + { + "epoch": 3.3450244698205545, + "grad_norm": 0.025834238156676292, + "learning_rate": 0.000986289449226459, + "loss": 0.0853, + "num_input_tokens_seen": 44264832, + "step": 20505 + }, + { + "epoch": 3.3458401305057097, + "grad_norm": 0.14657558500766754, + "learning_rate": 0.000986272889780826, + "loss": 0.095, + "num_input_tokens_seen": 44276800, + "step": 20510 + }, + { + "epoch": 3.3466557911908645, + "grad_norm": 0.08369407802820206, + "learning_rate": 0.000986256320480256, + "loss": 0.1363, + "num_input_tokens_seen": 44287584, + "step": 20515 + }, + { + "epoch": 3.34747145187602, + "grad_norm": 0.15682542324066162, + "learning_rate": 0.0009862397413250852, + "loss": 0.0864, + "num_input_tokens_seen": 44298688, + "step": 20520 + }, + { + "epoch": 3.3482871125611746, + "grad_norm": 0.028945323079824448, + "learning_rate": 0.0009862231523156489, + "loss": 0.0446, + "num_input_tokens_seen": 44309184, + "step": 20525 + }, + { + "epoch": 3.3491027732463294, + "grad_norm": 0.15475015342235565, + "learning_rate": 0.0009862065534522837, + "loss": 0.1609, + "num_input_tokens_seen": 44319872, + "step": 20530 + }, + { + "epoch": 3.3499184339314847, + "grad_norm": 0.1731501966714859, + "learning_rate": 0.000986189944735326, + "loss": 0.0872, + "num_input_tokens_seen": 44330432, + "step": 20535 + }, + { + "epoch": 3.3507340946166395, + "grad_norm": 0.03867650032043457, + "learning_rate": 0.000986173326165112, + "loss": 0.1984, + "num_input_tokens_seen": 44341920, + "step": 20540 + }, + { + "epoch": 3.3515497553017943, + "grad_norm": 0.058678966015577316, + "learning_rate": 0.000986156697741979, + "loss": 0.0477, + "num_input_tokens_seen": 44352096, + "step": 20545 + }, + { + "epoch": 3.3523654159869496, + "grad_norm": 0.0667642205953598, + "learning_rate": 0.0009861400594662637, + "loss": 0.0962, + "num_input_tokens_seen": 44363456, + "step": 20550 + }, + { + "epoch": 3.3531810766721044, + "grad_norm": 0.036497242748737335, + "learning_rate": 0.0009861234113383035, + "loss": 0.0504, + "num_input_tokens_seen": 44375168, + "step": 20555 + }, + { + "epoch": 3.3539967373572592, + "grad_norm": 0.010187935084104538, + "learning_rate": 0.0009861067533584356, + "loss": 0.0954, + "num_input_tokens_seen": 44385824, + "step": 20560 + }, + { + "epoch": 3.3548123980424145, + "grad_norm": 0.17760713398456573, + "learning_rate": 0.0009860900855269976, + "loss": 0.3319, + "num_input_tokens_seen": 44396736, + "step": 20565 + }, + { + "epoch": 3.3556280587275693, + "grad_norm": 0.023151518777012825, + "learning_rate": 0.0009860734078443276, + "loss": 0.1429, + "num_input_tokens_seen": 44407648, + "step": 20570 + }, + { + "epoch": 3.356443719412724, + "grad_norm": 0.013548966497182846, + "learning_rate": 0.0009860567203107632, + "loss": 0.1281, + "num_input_tokens_seen": 44418336, + "step": 20575 + }, + { + "epoch": 3.3572593800978794, + "grad_norm": 0.3307119607925415, + "learning_rate": 0.0009860400229266427, + "loss": 0.1049, + "num_input_tokens_seen": 44428960, + "step": 20580 + }, + { + "epoch": 3.358075040783034, + "grad_norm": 0.02051873691380024, + "learning_rate": 0.0009860233156923047, + "loss": 0.0455, + "num_input_tokens_seen": 44440608, + "step": 20585 + }, + { + "epoch": 3.358890701468189, + "grad_norm": 0.056116845458745956, + "learning_rate": 0.0009860065986080876, + "loss": 0.0814, + "num_input_tokens_seen": 44451040, + "step": 20590 + }, + { + "epoch": 3.3597063621533443, + "grad_norm": 0.032296888530254364, + "learning_rate": 0.00098598987167433, + "loss": 0.0752, + "num_input_tokens_seen": 44461792, + "step": 20595 + }, + { + "epoch": 3.360522022838499, + "grad_norm": 0.006687713786959648, + "learning_rate": 0.0009859731348913713, + "loss": 0.0169, + "num_input_tokens_seen": 44472000, + "step": 20600 + }, + { + "epoch": 3.3613376835236544, + "grad_norm": 0.03389527276158333, + "learning_rate": 0.0009859563882595507, + "loss": 0.0894, + "num_input_tokens_seen": 44482752, + "step": 20605 + }, + { + "epoch": 3.362153344208809, + "grad_norm": 0.03401738405227661, + "learning_rate": 0.0009859396317792074, + "loss": 0.1405, + "num_input_tokens_seen": 44494624, + "step": 20610 + }, + { + "epoch": 3.362969004893964, + "grad_norm": 0.1774500012397766, + "learning_rate": 0.0009859228654506807, + "loss": 0.1464, + "num_input_tokens_seen": 44504256, + "step": 20615 + }, + { + "epoch": 3.3637846655791193, + "grad_norm": 0.008117337711155415, + "learning_rate": 0.0009859060892743108, + "loss": 0.052, + "num_input_tokens_seen": 44515680, + "step": 20620 + }, + { + "epoch": 3.364600326264274, + "grad_norm": 0.05046914145350456, + "learning_rate": 0.0009858893032504378, + "loss": 0.0847, + "num_input_tokens_seen": 44527360, + "step": 20625 + }, + { + "epoch": 3.365415986949429, + "grad_norm": 0.05099128559231758, + "learning_rate": 0.0009858725073794016, + "loss": 0.1542, + "num_input_tokens_seen": 44537600, + "step": 20630 + }, + { + "epoch": 3.366231647634584, + "grad_norm": 0.19158139824867249, + "learning_rate": 0.0009858557016615423, + "loss": 0.049, + "num_input_tokens_seen": 44548704, + "step": 20635 + }, + { + "epoch": 3.367047308319739, + "grad_norm": 0.020610924810171127, + "learning_rate": 0.0009858388860972012, + "loss": 0.1083, + "num_input_tokens_seen": 44559584, + "step": 20640 + }, + { + "epoch": 3.367862969004894, + "grad_norm": 0.10929691791534424, + "learning_rate": 0.0009858220606867188, + "loss": 0.0314, + "num_input_tokens_seen": 44568576, + "step": 20645 + }, + { + "epoch": 3.368678629690049, + "grad_norm": 0.087400883436203, + "learning_rate": 0.000985805225430436, + "loss": 0.1626, + "num_input_tokens_seen": 44579968, + "step": 20650 + }, + { + "epoch": 3.369494290375204, + "grad_norm": 0.01995384879410267, + "learning_rate": 0.0009857883803286937, + "loss": 0.1444, + "num_input_tokens_seen": 44591552, + "step": 20655 + }, + { + "epoch": 3.370309951060359, + "grad_norm": 0.11651373654603958, + "learning_rate": 0.0009857715253818338, + "loss": 0.1157, + "num_input_tokens_seen": 44602016, + "step": 20660 + }, + { + "epoch": 3.371125611745514, + "grad_norm": 0.008793378248810768, + "learning_rate": 0.000985754660590198, + "loss": 0.0743, + "num_input_tokens_seen": 44611936, + "step": 20665 + }, + { + "epoch": 3.3719412724306688, + "grad_norm": 0.016294458881020546, + "learning_rate": 0.0009857377859541275, + "loss": 0.0874, + "num_input_tokens_seen": 44623232, + "step": 20670 + }, + { + "epoch": 3.3727569331158236, + "grad_norm": 0.007734575308859348, + "learning_rate": 0.0009857209014739645, + "loss": 0.0597, + "num_input_tokens_seen": 44634592, + "step": 20675 + }, + { + "epoch": 3.373572593800979, + "grad_norm": 0.00543740950524807, + "learning_rate": 0.0009857040071500512, + "loss": 0.0706, + "num_input_tokens_seen": 44645568, + "step": 20680 + }, + { + "epoch": 3.3743882544861337, + "grad_norm": 0.22849687933921814, + "learning_rate": 0.0009856871029827303, + "loss": 0.2608, + "num_input_tokens_seen": 44656992, + "step": 20685 + }, + { + "epoch": 3.375203915171289, + "grad_norm": 0.04630988836288452, + "learning_rate": 0.0009856701889723438, + "loss": 0.1067, + "num_input_tokens_seen": 44669248, + "step": 20690 + }, + { + "epoch": 3.3760195758564437, + "grad_norm": 0.03394695743918419, + "learning_rate": 0.0009856532651192351, + "loss": 0.0942, + "num_input_tokens_seen": 44680864, + "step": 20695 + }, + { + "epoch": 3.3768352365415986, + "grad_norm": 0.01367984525859356, + "learning_rate": 0.0009856363314237468, + "loss": 0.1016, + "num_input_tokens_seen": 44691456, + "step": 20700 + }, + { + "epoch": 3.377650897226754, + "grad_norm": 0.02007197216153145, + "learning_rate": 0.0009856193878862221, + "loss": 0.0473, + "num_input_tokens_seen": 44702336, + "step": 20705 + }, + { + "epoch": 3.3784665579119086, + "grad_norm": 0.13313066959381104, + "learning_rate": 0.0009856024345070045, + "loss": 0.1975, + "num_input_tokens_seen": 44713728, + "step": 20710 + }, + { + "epoch": 3.3792822185970635, + "grad_norm": 0.04491923004388809, + "learning_rate": 0.0009855854712864376, + "loss": 0.1329, + "num_input_tokens_seen": 44723968, + "step": 20715 + }, + { + "epoch": 3.3800978792822187, + "grad_norm": 0.014980019070208073, + "learning_rate": 0.000985568498224865, + "loss": 0.0525, + "num_input_tokens_seen": 44734304, + "step": 20720 + }, + { + "epoch": 3.3809135399673735, + "grad_norm": 0.0933455228805542, + "learning_rate": 0.0009855515153226308, + "loss": 0.1274, + "num_input_tokens_seen": 44745568, + "step": 20725 + }, + { + "epoch": 3.3817292006525284, + "grad_norm": 0.1336585283279419, + "learning_rate": 0.0009855345225800792, + "loss": 0.0742, + "num_input_tokens_seen": 44757216, + "step": 20730 + }, + { + "epoch": 3.3825448613376836, + "grad_norm": 0.009883769787847996, + "learning_rate": 0.0009855175199975546, + "loss": 0.0465, + "num_input_tokens_seen": 44767808, + "step": 20735 + }, + { + "epoch": 3.3833605220228384, + "grad_norm": 0.10384946316480637, + "learning_rate": 0.0009855005075754015, + "loss": 0.1414, + "num_input_tokens_seen": 44778208, + "step": 20740 + }, + { + "epoch": 3.3841761827079937, + "grad_norm": 0.010534235276281834, + "learning_rate": 0.0009854834853139647, + "loss": 0.1554, + "num_input_tokens_seen": 44788032, + "step": 20745 + }, + { + "epoch": 3.3849918433931485, + "grad_norm": 0.1143837496638298, + "learning_rate": 0.0009854664532135892, + "loss": 0.172, + "num_input_tokens_seen": 44797440, + "step": 20750 + }, + { + "epoch": 3.3858075040783033, + "grad_norm": 0.1774929165840149, + "learning_rate": 0.0009854494112746203, + "loss": 0.2246, + "num_input_tokens_seen": 44809248, + "step": 20755 + }, + { + "epoch": 3.3866231647634586, + "grad_norm": 0.03608318045735359, + "learning_rate": 0.000985432359497403, + "loss": 0.1435, + "num_input_tokens_seen": 44820768, + "step": 20760 + }, + { + "epoch": 3.3874388254486134, + "grad_norm": 0.04511318355798721, + "learning_rate": 0.0009854152978822834, + "loss": 0.0381, + "num_input_tokens_seen": 44832128, + "step": 20765 + }, + { + "epoch": 3.3882544861337682, + "grad_norm": 0.04830940067768097, + "learning_rate": 0.0009853982264296068, + "loss": 0.1278, + "num_input_tokens_seen": 44840992, + "step": 20770 + }, + { + "epoch": 3.3890701468189235, + "grad_norm": 0.02576763741672039, + "learning_rate": 0.0009853811451397195, + "loss": 0.0897, + "num_input_tokens_seen": 44851232, + "step": 20775 + }, + { + "epoch": 3.3898858075040783, + "grad_norm": 0.013392743654549122, + "learning_rate": 0.0009853640540129674, + "loss": 0.0782, + "num_input_tokens_seen": 44860672, + "step": 20780 + }, + { + "epoch": 3.390701468189233, + "grad_norm": 0.08622924983501434, + "learning_rate": 0.0009853469530496971, + "loss": 0.043, + "num_input_tokens_seen": 44870688, + "step": 20785 + }, + { + "epoch": 3.3915171288743884, + "grad_norm": 0.09218402951955795, + "learning_rate": 0.000985329842250255, + "loss": 0.066, + "num_input_tokens_seen": 44881760, + "step": 20790 + }, + { + "epoch": 3.392332789559543, + "grad_norm": 0.09677506238222122, + "learning_rate": 0.000985312721614988, + "loss": 0.0947, + "num_input_tokens_seen": 44892896, + "step": 20795 + }, + { + "epoch": 3.393148450244698, + "grad_norm": 0.09081647545099258, + "learning_rate": 0.0009852955911442431, + "loss": 0.0781, + "num_input_tokens_seen": 44903296, + "step": 20800 + }, + { + "epoch": 3.3939641109298533, + "grad_norm": 0.02185557596385479, + "learning_rate": 0.0009852784508383673, + "loss": 0.1244, + "num_input_tokens_seen": 44913664, + "step": 20805 + }, + { + "epoch": 3.394779771615008, + "grad_norm": 0.19217585027217865, + "learning_rate": 0.0009852613006977081, + "loss": 0.1403, + "num_input_tokens_seen": 44924864, + "step": 20810 + }, + { + "epoch": 3.395595432300163, + "grad_norm": 0.029419617727398872, + "learning_rate": 0.0009852441407226132, + "loss": 0.0977, + "num_input_tokens_seen": 44935008, + "step": 20815 + }, + { + "epoch": 3.396411092985318, + "grad_norm": 0.0033333494793623686, + "learning_rate": 0.00098522697091343, + "loss": 0.1632, + "num_input_tokens_seen": 44945280, + "step": 20820 + }, + { + "epoch": 3.397226753670473, + "grad_norm": 0.02038710191845894, + "learning_rate": 0.0009852097912705067, + "loss": 0.0398, + "num_input_tokens_seen": 44956064, + "step": 20825 + }, + { + "epoch": 3.3980424143556283, + "grad_norm": 0.03771764785051346, + "learning_rate": 0.0009851926017941917, + "loss": 0.0511, + "num_input_tokens_seen": 44967424, + "step": 20830 + }, + { + "epoch": 3.398858075040783, + "grad_norm": 0.015467549674212933, + "learning_rate": 0.0009851754024848328, + "loss": 0.0783, + "num_input_tokens_seen": 44977952, + "step": 20835 + }, + { + "epoch": 3.399673735725938, + "grad_norm": 0.0038697707932442427, + "learning_rate": 0.0009851581933427792, + "loss": 0.1132, + "num_input_tokens_seen": 44988384, + "step": 20840 + }, + { + "epoch": 3.400489396411093, + "grad_norm": 0.004471632651984692, + "learning_rate": 0.000985140974368379, + "loss": 0.0192, + "num_input_tokens_seen": 44998816, + "step": 20845 + }, + { + "epoch": 3.401305057096248, + "grad_norm": 0.12683843076229095, + "learning_rate": 0.0009851237455619818, + "loss": 0.12, + "num_input_tokens_seen": 45010112, + "step": 20850 + }, + { + "epoch": 3.402120717781403, + "grad_norm": 0.19443084299564362, + "learning_rate": 0.0009851065069239361, + "loss": 0.1018, + "num_input_tokens_seen": 45021184, + "step": 20855 + }, + { + "epoch": 3.402936378466558, + "grad_norm": 0.044982731342315674, + "learning_rate": 0.0009850892584545921, + "loss": 0.1605, + "num_input_tokens_seen": 45030976, + "step": 20860 + }, + { + "epoch": 3.403752039151713, + "grad_norm": 0.1446046531200409, + "learning_rate": 0.0009850720001542985, + "loss": 0.1045, + "num_input_tokens_seen": 45042400, + "step": 20865 + }, + { + "epoch": 3.4045676998368677, + "grad_norm": 0.006332816556096077, + "learning_rate": 0.0009850547320234058, + "loss": 0.1086, + "num_input_tokens_seen": 45052352, + "step": 20870 + }, + { + "epoch": 3.405383360522023, + "grad_norm": 0.10956069082021713, + "learning_rate": 0.0009850374540622633, + "loss": 0.0366, + "num_input_tokens_seen": 45063520, + "step": 20875 + }, + { + "epoch": 3.4061990212071778, + "grad_norm": 0.08039423823356628, + "learning_rate": 0.0009850201662712217, + "loss": 0.0791, + "num_input_tokens_seen": 45074144, + "step": 20880 + }, + { + "epoch": 3.407014681892333, + "grad_norm": 0.03137603774666786, + "learning_rate": 0.0009850028686506313, + "loss": 0.0615, + "num_input_tokens_seen": 45084704, + "step": 20885 + }, + { + "epoch": 3.407830342577488, + "grad_norm": 0.007954450324177742, + "learning_rate": 0.000984985561200842, + "loss": 0.175, + "num_input_tokens_seen": 45095744, + "step": 20890 + }, + { + "epoch": 3.4086460032626427, + "grad_norm": 0.015032630413770676, + "learning_rate": 0.0009849682439222055, + "loss": 0.0757, + "num_input_tokens_seen": 45105952, + "step": 20895 + }, + { + "epoch": 3.4094616639477975, + "grad_norm": 0.006763865239918232, + "learning_rate": 0.000984950916815072, + "loss": 0.0094, + "num_input_tokens_seen": 45118080, + "step": 20900 + }, + { + "epoch": 3.4102773246329527, + "grad_norm": 0.003127865493297577, + "learning_rate": 0.0009849335798797932, + "loss": 0.0211, + "num_input_tokens_seen": 45129024, + "step": 20905 + }, + { + "epoch": 3.4110929853181076, + "grad_norm": 0.03169337660074234, + "learning_rate": 0.0009849162331167201, + "loss": 0.0489, + "num_input_tokens_seen": 45139200, + "step": 20910 + }, + { + "epoch": 3.411908646003263, + "grad_norm": 0.00271624606102705, + "learning_rate": 0.0009848988765262044, + "loss": 0.0577, + "num_input_tokens_seen": 45150752, + "step": 20915 + }, + { + "epoch": 3.4127243066884176, + "grad_norm": 0.057396937161684036, + "learning_rate": 0.0009848815101085977, + "loss": 0.0346, + "num_input_tokens_seen": 45162016, + "step": 20920 + }, + { + "epoch": 3.4135399673735725, + "grad_norm": 0.035289935767650604, + "learning_rate": 0.0009848641338642524, + "loss": 0.076, + "num_input_tokens_seen": 45171904, + "step": 20925 + }, + { + "epoch": 3.4143556280587277, + "grad_norm": 0.29885706305503845, + "learning_rate": 0.00098484674779352, + "loss": 0.2374, + "num_input_tokens_seen": 45183712, + "step": 20930 + }, + { + "epoch": 3.4151712887438825, + "grad_norm": 0.15530546009540558, + "learning_rate": 0.0009848293518967533, + "loss": 0.07, + "num_input_tokens_seen": 45194656, + "step": 20935 + }, + { + "epoch": 3.4159869494290374, + "grad_norm": 0.00298583903349936, + "learning_rate": 0.0009848119461743049, + "loss": 0.1636, + "num_input_tokens_seen": 45204800, + "step": 20940 + }, + { + "epoch": 3.4168026101141926, + "grad_norm": 0.3138507306575775, + "learning_rate": 0.000984794530626527, + "loss": 0.1886, + "num_input_tokens_seen": 45215776, + "step": 20945 + }, + { + "epoch": 3.4176182707993474, + "grad_norm": 0.017290201038122177, + "learning_rate": 0.0009847771052537732, + "loss": 0.0538, + "num_input_tokens_seen": 45225952, + "step": 20950 + }, + { + "epoch": 3.4184339314845023, + "grad_norm": 0.10135749727487564, + "learning_rate": 0.0009847596700563966, + "loss": 0.1021, + "num_input_tokens_seen": 45237312, + "step": 20955 + }, + { + "epoch": 3.4192495921696575, + "grad_norm": 0.022728487849235535, + "learning_rate": 0.00098474222503475, + "loss": 0.052, + "num_input_tokens_seen": 45246912, + "step": 20960 + }, + { + "epoch": 3.4200652528548123, + "grad_norm": 0.02813965082168579, + "learning_rate": 0.0009847247701891874, + "loss": 0.0846, + "num_input_tokens_seen": 45257440, + "step": 20965 + }, + { + "epoch": 3.4208809135399676, + "grad_norm": 0.020502887666225433, + "learning_rate": 0.0009847073055200624, + "loss": 0.1488, + "num_input_tokens_seen": 45268416, + "step": 20970 + }, + { + "epoch": 3.4216965742251224, + "grad_norm": 0.021539224311709404, + "learning_rate": 0.0009846898310277288, + "loss": 0.0782, + "num_input_tokens_seen": 45278848, + "step": 20975 + }, + { + "epoch": 3.4225122349102772, + "grad_norm": 0.13268448412418365, + "learning_rate": 0.000984672346712541, + "loss": 0.0789, + "num_input_tokens_seen": 45291072, + "step": 20980 + }, + { + "epoch": 3.4233278955954325, + "grad_norm": 0.049775343388319016, + "learning_rate": 0.0009846548525748533, + "loss": 0.0855, + "num_input_tokens_seen": 45302336, + "step": 20985 + }, + { + "epoch": 3.4241435562805873, + "grad_norm": 0.062417641282081604, + "learning_rate": 0.0009846373486150201, + "loss": 0.077, + "num_input_tokens_seen": 45312864, + "step": 20990 + }, + { + "epoch": 3.424959216965742, + "grad_norm": 0.023129766806960106, + "learning_rate": 0.0009846198348333964, + "loss": 0.0657, + "num_input_tokens_seen": 45323744, + "step": 20995 + }, + { + "epoch": 3.4257748776508974, + "grad_norm": 0.12047817558050156, + "learning_rate": 0.0009846023112303369, + "loss": 0.1356, + "num_input_tokens_seen": 45333760, + "step": 21000 + }, + { + "epoch": 3.426590538336052, + "grad_norm": 0.22374491393566132, + "learning_rate": 0.0009845847778061968, + "loss": 0.2351, + "num_input_tokens_seen": 45344896, + "step": 21005 + }, + { + "epoch": 3.427406199021207, + "grad_norm": 0.0333823598921299, + "learning_rate": 0.0009845672345613313, + "loss": 0.0508, + "num_input_tokens_seen": 45354336, + "step": 21010 + }, + { + "epoch": 3.4282218597063623, + "grad_norm": 0.024579208344221115, + "learning_rate": 0.0009845496814960962, + "loss": 0.2681, + "num_input_tokens_seen": 45365376, + "step": 21015 + }, + { + "epoch": 3.429037520391517, + "grad_norm": 0.12661434710025787, + "learning_rate": 0.0009845321186108468, + "loss": 0.0448, + "num_input_tokens_seen": 45376416, + "step": 21020 + }, + { + "epoch": 3.429853181076672, + "grad_norm": 0.01600196212530136, + "learning_rate": 0.0009845145459059397, + "loss": 0.1697, + "num_input_tokens_seen": 45386272, + "step": 21025 + }, + { + "epoch": 3.430668841761827, + "grad_norm": 0.008204374462366104, + "learning_rate": 0.0009844969633817306, + "loss": 0.0464, + "num_input_tokens_seen": 45395936, + "step": 21030 + }, + { + "epoch": 3.431484502446982, + "grad_norm": 0.011502916924655437, + "learning_rate": 0.000984479371038576, + "loss": 0.0661, + "num_input_tokens_seen": 45407680, + "step": 21035 + }, + { + "epoch": 3.432300163132137, + "grad_norm": 0.06499218940734863, + "learning_rate": 0.0009844617688768323, + "loss": 0.05, + "num_input_tokens_seen": 45417088, + "step": 21040 + }, + { + "epoch": 3.433115823817292, + "grad_norm": 0.0662451907992363, + "learning_rate": 0.000984444156896856, + "loss": 0.0987, + "num_input_tokens_seen": 45429536, + "step": 21045 + }, + { + "epoch": 3.433931484502447, + "grad_norm": 0.01749119907617569, + "learning_rate": 0.0009844265350990047, + "loss": 0.0694, + "num_input_tokens_seen": 45440320, + "step": 21050 + }, + { + "epoch": 3.434747145187602, + "grad_norm": 0.04644760489463806, + "learning_rate": 0.000984408903483635, + "loss": 0.2048, + "num_input_tokens_seen": 45451008, + "step": 21055 + }, + { + "epoch": 3.435562805872757, + "grad_norm": 0.06190920248627663, + "learning_rate": 0.0009843912620511042, + "loss": 0.0614, + "num_input_tokens_seen": 45462432, + "step": 21060 + }, + { + "epoch": 3.436378466557912, + "grad_norm": 0.142157182097435, + "learning_rate": 0.00098437361080177, + "loss": 0.0786, + "num_input_tokens_seen": 45472544, + "step": 21065 + }, + { + "epoch": 3.437194127243067, + "grad_norm": 0.020878633484244347, + "learning_rate": 0.0009843559497359903, + "loss": 0.0491, + "num_input_tokens_seen": 45483168, + "step": 21070 + }, + { + "epoch": 3.438009787928222, + "grad_norm": 0.011706641875207424, + "learning_rate": 0.0009843382788541227, + "loss": 0.0296, + "num_input_tokens_seen": 45492832, + "step": 21075 + }, + { + "epoch": 3.4388254486133767, + "grad_norm": 0.17179611325263977, + "learning_rate": 0.0009843205981565253, + "loss": 0.2044, + "num_input_tokens_seen": 45504064, + "step": 21080 + }, + { + "epoch": 3.439641109298532, + "grad_norm": 0.11788588017225266, + "learning_rate": 0.0009843029076435567, + "loss": 0.1046, + "num_input_tokens_seen": 45514336, + "step": 21085 + }, + { + "epoch": 3.4404567699836868, + "grad_norm": 0.07058888673782349, + "learning_rate": 0.0009842852073155754, + "loss": 0.0286, + "num_input_tokens_seen": 45524480, + "step": 21090 + }, + { + "epoch": 3.4412724306688416, + "grad_norm": 0.26519283652305603, + "learning_rate": 0.00098426749717294, + "loss": 0.2242, + "num_input_tokens_seen": 45535264, + "step": 21095 + }, + { + "epoch": 3.442088091353997, + "grad_norm": 0.07345568388700485, + "learning_rate": 0.0009842497772160092, + "loss": 0.0345, + "num_input_tokens_seen": 45546560, + "step": 21100 + }, + { + "epoch": 3.4429037520391517, + "grad_norm": 0.02888215333223343, + "learning_rate": 0.0009842320474451427, + "loss": 0.0601, + "num_input_tokens_seen": 45558016, + "step": 21105 + }, + { + "epoch": 3.443719412724307, + "grad_norm": 0.0038877481129020452, + "learning_rate": 0.0009842143078606991, + "loss": 0.0536, + "num_input_tokens_seen": 45568512, + "step": 21110 + }, + { + "epoch": 3.4445350734094617, + "grad_norm": 0.014261267147958279, + "learning_rate": 0.0009841965584630385, + "loss": 0.0715, + "num_input_tokens_seen": 45579968, + "step": 21115 + }, + { + "epoch": 3.4453507340946166, + "grad_norm": 0.07963086664676666, + "learning_rate": 0.0009841787992525203, + "loss": 0.1047, + "num_input_tokens_seen": 45592064, + "step": 21120 + }, + { + "epoch": 3.4461663947797714, + "grad_norm": 0.013594537042081356, + "learning_rate": 0.0009841610302295048, + "loss": 0.108, + "num_input_tokens_seen": 45603040, + "step": 21125 + }, + { + "epoch": 3.4469820554649266, + "grad_norm": 0.0373501218855381, + "learning_rate": 0.0009841432513943516, + "loss": 0.0533, + "num_input_tokens_seen": 45613536, + "step": 21130 + }, + { + "epoch": 3.4477977161500815, + "grad_norm": 0.04946838691830635, + "learning_rate": 0.0009841254627474213, + "loss": 0.1302, + "num_input_tokens_seen": 45625120, + "step": 21135 + }, + { + "epoch": 3.4486133768352367, + "grad_norm": 0.16107678413391113, + "learning_rate": 0.000984107664289074, + "loss": 0.1456, + "num_input_tokens_seen": 45635744, + "step": 21140 + }, + { + "epoch": 3.4494290375203915, + "grad_norm": 0.04442241042852402, + "learning_rate": 0.0009840898560196712, + "loss": 0.1647, + "num_input_tokens_seen": 45646432, + "step": 21145 + }, + { + "epoch": 3.4502446982055464, + "grad_norm": 0.10360156744718552, + "learning_rate": 0.000984072037939573, + "loss": 0.085, + "num_input_tokens_seen": 45656576, + "step": 21150 + }, + { + "epoch": 3.4510603588907016, + "grad_norm": 0.09855679422616959, + "learning_rate": 0.000984054210049141, + "loss": 0.2321, + "num_input_tokens_seen": 45667904, + "step": 21155 + }, + { + "epoch": 3.4518760195758564, + "grad_norm": 0.003918938804417849, + "learning_rate": 0.0009840363723487365, + "loss": 0.0387, + "num_input_tokens_seen": 45678528, + "step": 21160 + }, + { + "epoch": 3.4526916802610113, + "grad_norm": 0.14749102294445038, + "learning_rate": 0.0009840185248387208, + "loss": 0.1458, + "num_input_tokens_seen": 45689312, + "step": 21165 + }, + { + "epoch": 3.4535073409461665, + "grad_norm": 0.05680210888385773, + "learning_rate": 0.0009840006675194558, + "loss": 0.0672, + "num_input_tokens_seen": 45700160, + "step": 21170 + }, + { + "epoch": 3.4543230016313213, + "grad_norm": 0.1646868735551834, + "learning_rate": 0.000983982800391303, + "loss": 0.2412, + "num_input_tokens_seen": 45710432, + "step": 21175 + }, + { + "epoch": 3.455138662316476, + "grad_norm": 0.31605181097984314, + "learning_rate": 0.0009839649234546248, + "loss": 0.0544, + "num_input_tokens_seen": 45721248, + "step": 21180 + }, + { + "epoch": 3.4559543230016314, + "grad_norm": 0.06806404143571854, + "learning_rate": 0.0009839470367097836, + "loss": 0.0789, + "num_input_tokens_seen": 45732768, + "step": 21185 + }, + { + "epoch": 3.4567699836867862, + "grad_norm": 0.012087629176676273, + "learning_rate": 0.0009839291401571417, + "loss": 0.0222, + "num_input_tokens_seen": 45743200, + "step": 21190 + }, + { + "epoch": 3.4575856443719415, + "grad_norm": 0.057843826711177826, + "learning_rate": 0.0009839112337970619, + "loss": 0.116, + "num_input_tokens_seen": 45753376, + "step": 21195 + }, + { + "epoch": 3.4584013050570963, + "grad_norm": 0.014727100729942322, + "learning_rate": 0.0009838933176299072, + "loss": 0.0399, + "num_input_tokens_seen": 45764064, + "step": 21200 + }, + { + "epoch": 3.459216965742251, + "grad_norm": 0.1727193146944046, + "learning_rate": 0.0009838753916560404, + "loss": 0.2123, + "num_input_tokens_seen": 45774944, + "step": 21205 + }, + { + "epoch": 3.4600326264274064, + "grad_norm": 0.07166225463151932, + "learning_rate": 0.000983857455875825, + "loss": 0.0726, + "num_input_tokens_seen": 45785856, + "step": 21210 + }, + { + "epoch": 3.460848287112561, + "grad_norm": 0.2608206272125244, + "learning_rate": 0.0009838395102896244, + "loss": 0.2526, + "num_input_tokens_seen": 45796352, + "step": 21215 + }, + { + "epoch": 3.461663947797716, + "grad_norm": 0.10049410164356232, + "learning_rate": 0.0009838215548978024, + "loss": 0.0797, + "num_input_tokens_seen": 45808160, + "step": 21220 + }, + { + "epoch": 3.4624796084828713, + "grad_norm": 0.01086132600903511, + "learning_rate": 0.0009838035897007226, + "loss": 0.0534, + "num_input_tokens_seen": 45819744, + "step": 21225 + }, + { + "epoch": 3.463295269168026, + "grad_norm": 0.005954584572464228, + "learning_rate": 0.0009837856146987496, + "loss": 0.1358, + "num_input_tokens_seen": 45830048, + "step": 21230 + }, + { + "epoch": 3.464110929853181, + "grad_norm": 0.11598404496908188, + "learning_rate": 0.0009837676298922473, + "loss": 0.0509, + "num_input_tokens_seen": 45841216, + "step": 21235 + }, + { + "epoch": 3.464926590538336, + "grad_norm": 0.02165006287395954, + "learning_rate": 0.0009837496352815803, + "loss": 0.1017, + "num_input_tokens_seen": 45850720, + "step": 21240 + }, + { + "epoch": 3.465742251223491, + "grad_norm": 0.18030884861946106, + "learning_rate": 0.000983731630867113, + "loss": 0.1657, + "num_input_tokens_seen": 45862272, + "step": 21245 + }, + { + "epoch": 3.466557911908646, + "grad_norm": 0.01824588142335415, + "learning_rate": 0.0009837136166492109, + "loss": 0.123, + "num_input_tokens_seen": 45872576, + "step": 21250 + }, + { + "epoch": 3.467373572593801, + "grad_norm": 0.014222935773432255, + "learning_rate": 0.0009836955926282385, + "loss": 0.1106, + "num_input_tokens_seen": 45882816, + "step": 21255 + }, + { + "epoch": 3.468189233278956, + "grad_norm": 0.01783698797225952, + "learning_rate": 0.0009836775588045613, + "loss": 0.157, + "num_input_tokens_seen": 45893856, + "step": 21260 + }, + { + "epoch": 3.4690048939641107, + "grad_norm": 0.15413255989551544, + "learning_rate": 0.0009836595151785448, + "loss": 0.1809, + "num_input_tokens_seen": 45904128, + "step": 21265 + }, + { + "epoch": 3.469820554649266, + "grad_norm": 0.08001356571912766, + "learning_rate": 0.0009836414617505548, + "loss": 0.1492, + "num_input_tokens_seen": 45914624, + "step": 21270 + }, + { + "epoch": 3.470636215334421, + "grad_norm": 0.037837933748960495, + "learning_rate": 0.000983623398520957, + "loss": 0.1506, + "num_input_tokens_seen": 45926432, + "step": 21275 + }, + { + "epoch": 3.471451876019576, + "grad_norm": 0.020428791642189026, + "learning_rate": 0.0009836053254901173, + "loss": 0.0662, + "num_input_tokens_seen": 45936608, + "step": 21280 + }, + { + "epoch": 3.472267536704731, + "grad_norm": 0.03009522706270218, + "learning_rate": 0.0009835872426584024, + "loss": 0.1297, + "num_input_tokens_seen": 45948768, + "step": 21285 + }, + { + "epoch": 3.4730831973898857, + "grad_norm": 0.00466992799192667, + "learning_rate": 0.0009835691500261784, + "loss": 0.0318, + "num_input_tokens_seen": 45958080, + "step": 21290 + }, + { + "epoch": 3.473898858075041, + "grad_norm": 0.2116037756204605, + "learning_rate": 0.0009835510475938124, + "loss": 0.1789, + "num_input_tokens_seen": 45968768, + "step": 21295 + }, + { + "epoch": 3.4747145187601958, + "grad_norm": 0.19801630079746246, + "learning_rate": 0.0009835329353616708, + "loss": 0.0569, + "num_input_tokens_seen": 45979360, + "step": 21300 + }, + { + "epoch": 3.4755301794453506, + "grad_norm": 0.08452171087265015, + "learning_rate": 0.000983514813330121, + "loss": 0.0603, + "num_input_tokens_seen": 45990880, + "step": 21305 + }, + { + "epoch": 3.476345840130506, + "grad_norm": 0.007304913364350796, + "learning_rate": 0.00098349668149953, + "loss": 0.1494, + "num_input_tokens_seen": 46001344, + "step": 21310 + }, + { + "epoch": 3.4771615008156607, + "grad_norm": 0.06574325263500214, + "learning_rate": 0.0009834785398702653, + "loss": 0.0802, + "num_input_tokens_seen": 46013152, + "step": 21315 + }, + { + "epoch": 3.4779771615008155, + "grad_norm": 0.005409311503171921, + "learning_rate": 0.0009834603884426947, + "loss": 0.0777, + "num_input_tokens_seen": 46023424, + "step": 21320 + }, + { + "epoch": 3.4787928221859707, + "grad_norm": 0.05082765221595764, + "learning_rate": 0.000983442227217186, + "loss": 0.0564, + "num_input_tokens_seen": 46033568, + "step": 21325 + }, + { + "epoch": 3.4796084828711256, + "grad_norm": 0.06047376990318298, + "learning_rate": 0.0009834240561941072, + "loss": 0.0417, + "num_input_tokens_seen": 46045088, + "step": 21330 + }, + { + "epoch": 3.480424143556281, + "grad_norm": 0.0103231742978096, + "learning_rate": 0.000983405875373827, + "loss": 0.0854, + "num_input_tokens_seen": 46056640, + "step": 21335 + }, + { + "epoch": 3.4812398042414356, + "grad_norm": 0.02116631343960762, + "learning_rate": 0.0009833876847567132, + "loss": 0.0858, + "num_input_tokens_seen": 46067008, + "step": 21340 + }, + { + "epoch": 3.4820554649265905, + "grad_norm": 0.011003903113305569, + "learning_rate": 0.0009833694843431346, + "loss": 0.0604, + "num_input_tokens_seen": 46078048, + "step": 21345 + }, + { + "epoch": 3.4828711256117453, + "grad_norm": 0.009066099300980568, + "learning_rate": 0.0009833512741334604, + "loss": 0.1327, + "num_input_tokens_seen": 46089568, + "step": 21350 + }, + { + "epoch": 3.4836867862969005, + "grad_norm": 0.06870605051517487, + "learning_rate": 0.0009833330541280595, + "loss": 0.075, + "num_input_tokens_seen": 46101024, + "step": 21355 + }, + { + "epoch": 3.4845024469820554, + "grad_norm": 0.183268740773201, + "learning_rate": 0.0009833148243273012, + "loss": 0.1381, + "num_input_tokens_seen": 46112064, + "step": 21360 + }, + { + "epoch": 3.4853181076672106, + "grad_norm": 0.17442356050014496, + "learning_rate": 0.0009832965847315547, + "loss": 0.1589, + "num_input_tokens_seen": 46122080, + "step": 21365 + }, + { + "epoch": 3.4861337683523654, + "grad_norm": 0.23719912767410278, + "learning_rate": 0.00098327833534119, + "loss": 0.1086, + "num_input_tokens_seen": 46132992, + "step": 21370 + }, + { + "epoch": 3.4869494290375203, + "grad_norm": 0.1742006540298462, + "learning_rate": 0.0009832600761565764, + "loss": 0.1555, + "num_input_tokens_seen": 46144576, + "step": 21375 + }, + { + "epoch": 3.4877650897226755, + "grad_norm": 0.018509503453969955, + "learning_rate": 0.0009832418071780845, + "loss": 0.2007, + "num_input_tokens_seen": 46155296, + "step": 21380 + }, + { + "epoch": 3.4885807504078303, + "grad_norm": 0.22304745018482208, + "learning_rate": 0.0009832235284060842, + "loss": 0.1386, + "num_input_tokens_seen": 46166528, + "step": 21385 + }, + { + "epoch": 3.489396411092985, + "grad_norm": 0.03720647096633911, + "learning_rate": 0.0009832052398409464, + "loss": 0.1913, + "num_input_tokens_seen": 46176672, + "step": 21390 + }, + { + "epoch": 3.4902120717781404, + "grad_norm": 0.07083740830421448, + "learning_rate": 0.000983186941483041, + "loss": 0.1097, + "num_input_tokens_seen": 46186880, + "step": 21395 + }, + { + "epoch": 3.4910277324632952, + "grad_norm": 0.12235971540212631, + "learning_rate": 0.0009831686333327397, + "loss": 0.0891, + "num_input_tokens_seen": 46196320, + "step": 21400 + }, + { + "epoch": 3.49184339314845, + "grad_norm": 0.021608423441648483, + "learning_rate": 0.0009831503153904127, + "loss": 0.0347, + "num_input_tokens_seen": 46207040, + "step": 21405 + }, + { + "epoch": 3.4926590538336053, + "grad_norm": 0.08407104760408401, + "learning_rate": 0.000983131987656432, + "loss": 0.0631, + "num_input_tokens_seen": 46217248, + "step": 21410 + }, + { + "epoch": 3.49347471451876, + "grad_norm": 0.02711568772792816, + "learning_rate": 0.0009831136501311684, + "loss": 0.1136, + "num_input_tokens_seen": 46226688, + "step": 21415 + }, + { + "epoch": 3.4942903752039154, + "grad_norm": 0.07743047177791595, + "learning_rate": 0.000983095302814994, + "loss": 0.1357, + "num_input_tokens_seen": 46236768, + "step": 21420 + }, + { + "epoch": 3.49510603588907, + "grad_norm": 0.014534658752381802, + "learning_rate": 0.0009830769457082804, + "loss": 0.1101, + "num_input_tokens_seen": 46247872, + "step": 21425 + }, + { + "epoch": 3.495921696574225, + "grad_norm": 0.15765748918056488, + "learning_rate": 0.0009830585788113994, + "loss": 0.1543, + "num_input_tokens_seen": 46258688, + "step": 21430 + }, + { + "epoch": 3.4967373572593803, + "grad_norm": 0.28878265619277954, + "learning_rate": 0.0009830402021247238, + "loss": 0.084, + "num_input_tokens_seen": 46269504, + "step": 21435 + }, + { + "epoch": 3.497553017944535, + "grad_norm": 0.09718022495508194, + "learning_rate": 0.0009830218156486256, + "loss": 0.0572, + "num_input_tokens_seen": 46280928, + "step": 21440 + }, + { + "epoch": 3.49836867862969, + "grad_norm": 0.01615605317056179, + "learning_rate": 0.0009830034193834777, + "loss": 0.0848, + "num_input_tokens_seen": 46291648, + "step": 21445 + }, + { + "epoch": 3.499184339314845, + "grad_norm": 0.17543765902519226, + "learning_rate": 0.0009829850133296527, + "loss": 0.0907, + "num_input_tokens_seen": 46302592, + "step": 21450 + }, + { + "epoch": 3.5, + "grad_norm": 0.07823494076728821, + "learning_rate": 0.0009829665974875237, + "loss": 0.0959, + "num_input_tokens_seen": 46313216, + "step": 21455 + }, + { + "epoch": 3.500815660685155, + "grad_norm": 0.006602311972528696, + "learning_rate": 0.0009829481718574638, + "loss": 0.1477, + "num_input_tokens_seen": 46322656, + "step": 21460 + }, + { + "epoch": 3.50163132137031, + "grad_norm": 0.041894447058439255, + "learning_rate": 0.0009829297364398466, + "loss": 0.0472, + "num_input_tokens_seen": 46332576, + "step": 21465 + }, + { + "epoch": 3.502446982055465, + "grad_norm": 0.0024145469069480896, + "learning_rate": 0.0009829112912350456, + "loss": 0.146, + "num_input_tokens_seen": 46343584, + "step": 21470 + }, + { + "epoch": 3.50326264274062, + "grad_norm": 0.03968200832605362, + "learning_rate": 0.000982892836243435, + "loss": 0.0715, + "num_input_tokens_seen": 46354240, + "step": 21475 + }, + { + "epoch": 3.504078303425775, + "grad_norm": 0.07537969946861267, + "learning_rate": 0.000982874371465388, + "loss": 0.1187, + "num_input_tokens_seen": 46364736, + "step": 21480 + }, + { + "epoch": 3.50489396411093, + "grad_norm": 0.02518860250711441, + "learning_rate": 0.0009828558969012795, + "loss": 0.1496, + "num_input_tokens_seen": 46374976, + "step": 21485 + }, + { + "epoch": 3.5057096247960846, + "grad_norm": 0.35393378138542175, + "learning_rate": 0.0009828374125514837, + "loss": 0.1134, + "num_input_tokens_seen": 46386080, + "step": 21490 + }, + { + "epoch": 3.50652528548124, + "grad_norm": 0.30863848328590393, + "learning_rate": 0.0009828189184163752, + "loss": 0.0391, + "num_input_tokens_seen": 46397152, + "step": 21495 + }, + { + "epoch": 3.5073409461663947, + "grad_norm": 0.026995185762643814, + "learning_rate": 0.0009828004144963288, + "loss": 0.1451, + "num_input_tokens_seen": 46408480, + "step": 21500 + }, + { + "epoch": 3.50815660685155, + "grad_norm": 0.014156169258058071, + "learning_rate": 0.0009827819007917195, + "loss": 0.134, + "num_input_tokens_seen": 46419712, + "step": 21505 + }, + { + "epoch": 3.5089722675367048, + "grad_norm": 0.20525045692920685, + "learning_rate": 0.0009827633773029228, + "loss": 0.3111, + "num_input_tokens_seen": 46431072, + "step": 21510 + }, + { + "epoch": 3.5097879282218596, + "grad_norm": 0.124051034450531, + "learning_rate": 0.0009827448440303135, + "loss": 0.0611, + "num_input_tokens_seen": 46441280, + "step": 21515 + }, + { + "epoch": 3.5106035889070144, + "grad_norm": 0.030473211780190468, + "learning_rate": 0.0009827263009742678, + "loss": 0.1719, + "num_input_tokens_seen": 46451808, + "step": 21520 + }, + { + "epoch": 3.5114192495921697, + "grad_norm": 0.17139644920825958, + "learning_rate": 0.000982707748135161, + "loss": 0.0759, + "num_input_tokens_seen": 46463136, + "step": 21525 + }, + { + "epoch": 3.5122349102773245, + "grad_norm": 0.12074781209230423, + "learning_rate": 0.0009826891855133693, + "loss": 0.1515, + "num_input_tokens_seen": 46473280, + "step": 21530 + }, + { + "epoch": 3.5130505709624797, + "grad_norm": 0.1016492024064064, + "learning_rate": 0.000982670613109269, + "loss": 0.0468, + "num_input_tokens_seen": 46484032, + "step": 21535 + }, + { + "epoch": 3.5138662316476346, + "grad_norm": 0.023893876001238823, + "learning_rate": 0.0009826520309232365, + "loss": 0.0474, + "num_input_tokens_seen": 46495488, + "step": 21540 + }, + { + "epoch": 3.5146818923327894, + "grad_norm": 0.08100047707557678, + "learning_rate": 0.0009826334389556482, + "loss": 0.1857, + "num_input_tokens_seen": 46506944, + "step": 21545 + }, + { + "epoch": 3.5154975530179446, + "grad_norm": 0.08882609754800797, + "learning_rate": 0.000982614837206881, + "loss": 0.0654, + "num_input_tokens_seen": 46517216, + "step": 21550 + }, + { + "epoch": 3.5163132137030995, + "grad_norm": 0.09666024893522263, + "learning_rate": 0.000982596225677312, + "loss": 0.2018, + "num_input_tokens_seen": 46528864, + "step": 21555 + }, + { + "epoch": 3.5171288743882547, + "grad_norm": 0.02566951885819435, + "learning_rate": 0.0009825776043673182, + "loss": 0.1513, + "num_input_tokens_seen": 46539072, + "step": 21560 + }, + { + "epoch": 3.5179445350734095, + "grad_norm": 0.04488889500498772, + "learning_rate": 0.000982558973277277, + "loss": 0.0546, + "num_input_tokens_seen": 46550016, + "step": 21565 + }, + { + "epoch": 3.5187601957585644, + "grad_norm": 0.07405301183462143, + "learning_rate": 0.0009825403324075662, + "loss": 0.095, + "num_input_tokens_seen": 46559968, + "step": 21570 + }, + { + "epoch": 3.519575856443719, + "grad_norm": 0.021635068580508232, + "learning_rate": 0.0009825216817585633, + "loss": 0.036, + "num_input_tokens_seen": 46571456, + "step": 21575 + }, + { + "epoch": 3.5203915171288744, + "grad_norm": 0.01687135361135006, + "learning_rate": 0.0009825030213306463, + "loss": 0.0633, + "num_input_tokens_seen": 46583104, + "step": 21580 + }, + { + "epoch": 3.5212071778140293, + "grad_norm": 0.038620784878730774, + "learning_rate": 0.0009824843511241936, + "loss": 0.1467, + "num_input_tokens_seen": 46592384, + "step": 21585 + }, + { + "epoch": 3.5220228384991845, + "grad_norm": 0.1370106190443039, + "learning_rate": 0.0009824656711395834, + "loss": 0.0697, + "num_input_tokens_seen": 46603488, + "step": 21590 + }, + { + "epoch": 3.5228384991843393, + "grad_norm": 0.01430982630699873, + "learning_rate": 0.0009824469813771945, + "loss": 0.2333, + "num_input_tokens_seen": 46614528, + "step": 21595 + }, + { + "epoch": 3.523654159869494, + "grad_norm": 0.14202818274497986, + "learning_rate": 0.0009824282818374052, + "loss": 0.073, + "num_input_tokens_seen": 46625920, + "step": 21600 + }, + { + "epoch": 3.5244698205546494, + "grad_norm": 0.0048104808665812016, + "learning_rate": 0.000982409572520595, + "loss": 0.0333, + "num_input_tokens_seen": 46636288, + "step": 21605 + }, + { + "epoch": 3.5252854812398042, + "grad_norm": 0.15958112478256226, + "learning_rate": 0.0009823908534271426, + "loss": 0.1468, + "num_input_tokens_seen": 46645536, + "step": 21610 + }, + { + "epoch": 3.5261011419249595, + "grad_norm": 0.011767551302909851, + "learning_rate": 0.0009823721245574278, + "loss": 0.0478, + "num_input_tokens_seen": 46656224, + "step": 21615 + }, + { + "epoch": 3.5269168026101143, + "grad_norm": 0.050557691603899, + "learning_rate": 0.0009823533859118299, + "loss": 0.0751, + "num_input_tokens_seen": 46665920, + "step": 21620 + }, + { + "epoch": 3.527732463295269, + "grad_norm": 0.05469447001814842, + "learning_rate": 0.0009823346374907287, + "loss": 0.0616, + "num_input_tokens_seen": 46676864, + "step": 21625 + }, + { + "epoch": 3.528548123980424, + "grad_norm": 0.016357893124222755, + "learning_rate": 0.000982315879294504, + "loss": 0.1983, + "num_input_tokens_seen": 46688192, + "step": 21630 + }, + { + "epoch": 3.529363784665579, + "grad_norm": 0.02337736450135708, + "learning_rate": 0.0009822971113235366, + "loss": 0.1288, + "num_input_tokens_seen": 46698848, + "step": 21635 + }, + { + "epoch": 3.530179445350734, + "grad_norm": 0.26037532091140747, + "learning_rate": 0.0009822783335782061, + "loss": 0.1597, + "num_input_tokens_seen": 46709920, + "step": 21640 + }, + { + "epoch": 3.5309951060358893, + "grad_norm": 0.11585263907909393, + "learning_rate": 0.0009822595460588935, + "loss": 0.105, + "num_input_tokens_seen": 46721152, + "step": 21645 + }, + { + "epoch": 3.531810766721044, + "grad_norm": 0.06194286793470383, + "learning_rate": 0.0009822407487659792, + "loss": 0.1093, + "num_input_tokens_seen": 46732320, + "step": 21650 + }, + { + "epoch": 3.532626427406199, + "grad_norm": 0.008391073904931545, + "learning_rate": 0.0009822219416998445, + "loss": 0.1435, + "num_input_tokens_seen": 46744032, + "step": 21655 + }, + { + "epoch": 3.5334420880913537, + "grad_norm": 0.04855462163686752, + "learning_rate": 0.0009822031248608704, + "loss": 0.0836, + "num_input_tokens_seen": 46755584, + "step": 21660 + }, + { + "epoch": 3.534257748776509, + "grad_norm": 0.15102599561214447, + "learning_rate": 0.0009821842982494383, + "loss": 0.0388, + "num_input_tokens_seen": 46766368, + "step": 21665 + }, + { + "epoch": 3.535073409461664, + "grad_norm": 0.030343789607286453, + "learning_rate": 0.0009821654618659297, + "loss": 0.0759, + "num_input_tokens_seen": 46777920, + "step": 21670 + }, + { + "epoch": 3.535889070146819, + "grad_norm": 0.00454878993332386, + "learning_rate": 0.0009821466157107263, + "loss": 0.1401, + "num_input_tokens_seen": 46789312, + "step": 21675 + }, + { + "epoch": 3.536704730831974, + "grad_norm": 0.014585529454052448, + "learning_rate": 0.0009821277597842101, + "loss": 0.0558, + "num_input_tokens_seen": 46800736, + "step": 21680 + }, + { + "epoch": 3.5375203915171287, + "grad_norm": 0.2144649773836136, + "learning_rate": 0.0009821088940867632, + "loss": 0.1237, + "num_input_tokens_seen": 46812096, + "step": 21685 + }, + { + "epoch": 3.538336052202284, + "grad_norm": 0.03157595917582512, + "learning_rate": 0.0009820900186187681, + "loss": 0.1427, + "num_input_tokens_seen": 46822624, + "step": 21690 + }, + { + "epoch": 3.539151712887439, + "grad_norm": 0.09309013187885284, + "learning_rate": 0.0009820711333806068, + "loss": 0.1691, + "num_input_tokens_seen": 46833760, + "step": 21695 + }, + { + "epoch": 3.539967373572594, + "grad_norm": 0.14409902691841125, + "learning_rate": 0.000982052238372663, + "loss": 0.1346, + "num_input_tokens_seen": 46845312, + "step": 21700 + }, + { + "epoch": 3.540783034257749, + "grad_norm": 0.7762025594711304, + "learning_rate": 0.0009820333335953187, + "loss": 0.1102, + "num_input_tokens_seen": 46855904, + "step": 21705 + }, + { + "epoch": 3.5415986949429037, + "grad_norm": 0.010160159319639206, + "learning_rate": 0.0009820144190489574, + "loss": 0.1755, + "num_input_tokens_seen": 46865760, + "step": 21710 + }, + { + "epoch": 3.5424143556280585, + "grad_norm": 0.03377804532647133, + "learning_rate": 0.0009819954947339624, + "loss": 0.0545, + "num_input_tokens_seen": 46877888, + "step": 21715 + }, + { + "epoch": 3.5432300163132138, + "grad_norm": 0.034942951053380966, + "learning_rate": 0.0009819765606507173, + "loss": 0.0606, + "num_input_tokens_seen": 46889312, + "step": 21720 + }, + { + "epoch": 3.5440456769983686, + "grad_norm": 0.19004157185554504, + "learning_rate": 0.0009819576167996058, + "loss": 0.123, + "num_input_tokens_seen": 46899392, + "step": 21725 + }, + { + "epoch": 3.544861337683524, + "grad_norm": 0.0632287859916687, + "learning_rate": 0.000981938663181012, + "loss": 0.0824, + "num_input_tokens_seen": 46910784, + "step": 21730 + }, + { + "epoch": 3.5456769983686787, + "grad_norm": 0.07041964679956436, + "learning_rate": 0.0009819196997953195, + "loss": 0.1932, + "num_input_tokens_seen": 46921920, + "step": 21735 + }, + { + "epoch": 3.5464926590538335, + "grad_norm": 0.04367127642035484, + "learning_rate": 0.000981900726642913, + "loss": 0.0891, + "num_input_tokens_seen": 46931776, + "step": 21740 + }, + { + "epoch": 3.5473083197389887, + "grad_norm": 0.07827101647853851, + "learning_rate": 0.0009818817437241768, + "loss": 0.2371, + "num_input_tokens_seen": 46942848, + "step": 21745 + }, + { + "epoch": 3.5481239804241436, + "grad_norm": 0.012732475064694881, + "learning_rate": 0.000981862751039496, + "loss": 0.0527, + "num_input_tokens_seen": 46953408, + "step": 21750 + }, + { + "epoch": 3.5489396411092984, + "grad_norm": 0.08355928212404251, + "learning_rate": 0.000981843748589255, + "loss": 0.0629, + "num_input_tokens_seen": 46964416, + "step": 21755 + }, + { + "epoch": 3.5497553017944536, + "grad_norm": 0.026946188881993294, + "learning_rate": 0.0009818247363738396, + "loss": 0.0377, + "num_input_tokens_seen": 46974304, + "step": 21760 + }, + { + "epoch": 3.5505709624796085, + "grad_norm": 0.01880880817770958, + "learning_rate": 0.0009818057143936344, + "loss": 0.061, + "num_input_tokens_seen": 46984256, + "step": 21765 + }, + { + "epoch": 3.5513866231647633, + "grad_norm": 0.05923675745725632, + "learning_rate": 0.000981786682649025, + "loss": 0.1068, + "num_input_tokens_seen": 46995424, + "step": 21770 + }, + { + "epoch": 3.5522022838499185, + "grad_norm": 0.013598200865089893, + "learning_rate": 0.0009817676411403976, + "loss": 0.2098, + "num_input_tokens_seen": 47006880, + "step": 21775 + }, + { + "epoch": 3.5530179445350734, + "grad_norm": 0.1357584297657013, + "learning_rate": 0.0009817485898681378, + "loss": 0.2028, + "num_input_tokens_seen": 47016864, + "step": 21780 + }, + { + "epoch": 3.5538336052202286, + "grad_norm": 0.11286557465791702, + "learning_rate": 0.0009817295288326315, + "loss": 0.1565, + "num_input_tokens_seen": 47029248, + "step": 21785 + }, + { + "epoch": 3.5546492659053834, + "grad_norm": 0.05836876109242439, + "learning_rate": 0.0009817104580342653, + "loss": 0.1046, + "num_input_tokens_seen": 47040480, + "step": 21790 + }, + { + "epoch": 3.5554649265905383, + "grad_norm": 0.02187487483024597, + "learning_rate": 0.0009816913774734254, + "loss": 0.0374, + "num_input_tokens_seen": 47051872, + "step": 21795 + }, + { + "epoch": 3.556280587275693, + "grad_norm": 0.007799254264682531, + "learning_rate": 0.0009816722871504987, + "loss": 0.0232, + "num_input_tokens_seen": 47063008, + "step": 21800 + }, + { + "epoch": 3.5570962479608483, + "grad_norm": 0.020848968997597694, + "learning_rate": 0.0009816531870658722, + "loss": 0.0549, + "num_input_tokens_seen": 47073792, + "step": 21805 + }, + { + "epoch": 3.557911908646003, + "grad_norm": 0.016179397702217102, + "learning_rate": 0.0009816340772199328, + "loss": 0.0892, + "num_input_tokens_seen": 47084832, + "step": 21810 + }, + { + "epoch": 3.5587275693311584, + "grad_norm": 0.02550504170358181, + "learning_rate": 0.0009816149576130678, + "loss": 0.0369, + "num_input_tokens_seen": 47096032, + "step": 21815 + }, + { + "epoch": 3.5595432300163132, + "grad_norm": 0.01226399652659893, + "learning_rate": 0.0009815958282456648, + "loss": 0.1247, + "num_input_tokens_seen": 47106720, + "step": 21820 + }, + { + "epoch": 3.560358890701468, + "grad_norm": 0.008796843700110912, + "learning_rate": 0.0009815766891181112, + "loss": 0.0277, + "num_input_tokens_seen": 47118592, + "step": 21825 + }, + { + "epoch": 3.5611745513866233, + "grad_norm": 0.27409377694129944, + "learning_rate": 0.0009815575402307953, + "loss": 0.0783, + "num_input_tokens_seen": 47129792, + "step": 21830 + }, + { + "epoch": 3.561990212071778, + "grad_norm": 0.02622702717781067, + "learning_rate": 0.0009815383815841047, + "loss": 0.0268, + "num_input_tokens_seen": 47140224, + "step": 21835 + }, + { + "epoch": 3.5628058727569334, + "grad_norm": 0.09698707610368729, + "learning_rate": 0.0009815192131784282, + "loss": 0.1152, + "num_input_tokens_seen": 47151904, + "step": 21840 + }, + { + "epoch": 3.563621533442088, + "grad_norm": 0.17272219061851501, + "learning_rate": 0.0009815000350141539, + "loss": 0.0791, + "num_input_tokens_seen": 47161920, + "step": 21845 + }, + { + "epoch": 3.564437194127243, + "grad_norm": 0.008663838729262352, + "learning_rate": 0.0009814808470916705, + "loss": 0.1662, + "num_input_tokens_seen": 47172128, + "step": 21850 + }, + { + "epoch": 3.565252854812398, + "grad_norm": 0.22659926116466522, + "learning_rate": 0.0009814616494113668, + "loss": 0.1022, + "num_input_tokens_seen": 47182784, + "step": 21855 + }, + { + "epoch": 3.566068515497553, + "grad_norm": 0.0031124281231313944, + "learning_rate": 0.0009814424419736323, + "loss": 0.0192, + "num_input_tokens_seen": 47194336, + "step": 21860 + }, + { + "epoch": 3.566884176182708, + "grad_norm": 0.129921555519104, + "learning_rate": 0.0009814232247788556, + "loss": 0.0803, + "num_input_tokens_seen": 47204832, + "step": 21865 + }, + { + "epoch": 3.567699836867863, + "grad_norm": 0.04423084482550621, + "learning_rate": 0.0009814039978274269, + "loss": 0.0915, + "num_input_tokens_seen": 47215648, + "step": 21870 + }, + { + "epoch": 3.568515497553018, + "grad_norm": 0.007059141527861357, + "learning_rate": 0.0009813847611197352, + "loss": 0.1123, + "num_input_tokens_seen": 47226080, + "step": 21875 + }, + { + "epoch": 3.569331158238173, + "grad_norm": 0.07656865566968918, + "learning_rate": 0.0009813655146561709, + "loss": 0.0258, + "num_input_tokens_seen": 47238112, + "step": 21880 + }, + { + "epoch": 3.5701468189233276, + "grad_norm": 0.2083427459001541, + "learning_rate": 0.0009813462584371236, + "loss": 0.0916, + "num_input_tokens_seen": 47249216, + "step": 21885 + }, + { + "epoch": 3.570962479608483, + "grad_norm": 0.004050250630825758, + "learning_rate": 0.0009813269924629838, + "loss": 0.0424, + "num_input_tokens_seen": 47259296, + "step": 21890 + }, + { + "epoch": 3.5717781402936377, + "grad_norm": 0.011138557456433773, + "learning_rate": 0.000981307716734142, + "loss": 0.1211, + "num_input_tokens_seen": 47270336, + "step": 21895 + }, + { + "epoch": 3.572593800978793, + "grad_norm": 0.05811057239770889, + "learning_rate": 0.0009812884312509883, + "loss": 0.1024, + "num_input_tokens_seen": 47280736, + "step": 21900 + }, + { + "epoch": 3.573409461663948, + "grad_norm": 0.17921335995197296, + "learning_rate": 0.0009812691360139144, + "loss": 0.1242, + "num_input_tokens_seen": 47291328, + "step": 21905 + }, + { + "epoch": 3.5742251223491026, + "grad_norm": 0.019869348034262657, + "learning_rate": 0.000981249831023311, + "loss": 0.1377, + "num_input_tokens_seen": 47302368, + "step": 21910 + }, + { + "epoch": 3.575040783034258, + "grad_norm": 0.11883337050676346, + "learning_rate": 0.000981230516279569, + "loss": 0.0432, + "num_input_tokens_seen": 47312768, + "step": 21915 + }, + { + "epoch": 3.5758564437194127, + "grad_norm": 0.00856821145862341, + "learning_rate": 0.0009812111917830801, + "loss": 0.027, + "num_input_tokens_seen": 47323520, + "step": 21920 + }, + { + "epoch": 3.576672104404568, + "grad_norm": 0.21909934282302856, + "learning_rate": 0.000981191857534236, + "loss": 0.1202, + "num_input_tokens_seen": 47334688, + "step": 21925 + }, + { + "epoch": 3.5774877650897228, + "grad_norm": 0.017508920282125473, + "learning_rate": 0.0009811725135334287, + "loss": 0.0398, + "num_input_tokens_seen": 47345184, + "step": 21930 + }, + { + "epoch": 3.5783034257748776, + "grad_norm": 0.13208003342151642, + "learning_rate": 0.0009811531597810497, + "loss": 0.0704, + "num_input_tokens_seen": 47356608, + "step": 21935 + }, + { + "epoch": 3.5791190864600324, + "grad_norm": 0.12823589146137238, + "learning_rate": 0.0009811337962774916, + "loss": 0.3105, + "num_input_tokens_seen": 47367520, + "step": 21940 + }, + { + "epoch": 3.5799347471451877, + "grad_norm": 0.05735274776816368, + "learning_rate": 0.0009811144230231468, + "loss": 0.1416, + "num_input_tokens_seen": 47376288, + "step": 21945 + }, + { + "epoch": 3.5807504078303425, + "grad_norm": 0.09610272943973541, + "learning_rate": 0.0009810950400184078, + "loss": 0.0735, + "num_input_tokens_seen": 47386592, + "step": 21950 + }, + { + "epoch": 3.5815660685154977, + "grad_norm": 0.17457140982151031, + "learning_rate": 0.0009810756472636677, + "loss": 0.2127, + "num_input_tokens_seen": 47397792, + "step": 21955 + }, + { + "epoch": 3.5823817292006526, + "grad_norm": 0.0038679027929902077, + "learning_rate": 0.000981056244759319, + "loss": 0.0285, + "num_input_tokens_seen": 47409280, + "step": 21960 + }, + { + "epoch": 3.5831973898858074, + "grad_norm": 0.004641172010451555, + "learning_rate": 0.0009810368325057555, + "loss": 0.0444, + "num_input_tokens_seen": 47419680, + "step": 21965 + }, + { + "epoch": 3.5840130505709626, + "grad_norm": 0.10527661442756653, + "learning_rate": 0.0009810174105033703, + "loss": 0.1007, + "num_input_tokens_seen": 47431296, + "step": 21970 + }, + { + "epoch": 3.5848287112561175, + "grad_norm": 0.03643881157040596, + "learning_rate": 0.000980997978752557, + "loss": 0.0592, + "num_input_tokens_seen": 47441824, + "step": 21975 + }, + { + "epoch": 3.5856443719412723, + "grad_norm": 0.13610932230949402, + "learning_rate": 0.0009809785372537094, + "loss": 0.0232, + "num_input_tokens_seen": 47452192, + "step": 21980 + }, + { + "epoch": 3.5864600326264275, + "grad_norm": 0.17315170168876648, + "learning_rate": 0.0009809590860072217, + "loss": 0.0563, + "num_input_tokens_seen": 47461696, + "step": 21985 + }, + { + "epoch": 3.5872756933115824, + "grad_norm": 0.032739173620939255, + "learning_rate": 0.0009809396250134881, + "loss": 0.1437, + "num_input_tokens_seen": 47472960, + "step": 21990 + }, + { + "epoch": 3.588091353996737, + "grad_norm": 0.01509760320186615, + "learning_rate": 0.0009809201542729028, + "loss": 0.1138, + "num_input_tokens_seen": 47483712, + "step": 21995 + }, + { + "epoch": 3.5889070146818924, + "grad_norm": 0.09615371376276016, + "learning_rate": 0.0009809006737858603, + "loss": 0.1942, + "num_input_tokens_seen": 47493728, + "step": 22000 + }, + { + "epoch": 3.5897226753670473, + "grad_norm": 0.023802801966667175, + "learning_rate": 0.0009808811835527557, + "loss": 0.0217, + "num_input_tokens_seen": 47504928, + "step": 22005 + }, + { + "epoch": 3.5905383360522025, + "grad_norm": 0.04752498120069504, + "learning_rate": 0.000980861683573984, + "loss": 0.0921, + "num_input_tokens_seen": 47514880, + "step": 22010 + }, + { + "epoch": 3.5913539967373573, + "grad_norm": 0.17955318093299866, + "learning_rate": 0.00098084217384994, + "loss": 0.0743, + "num_input_tokens_seen": 47526688, + "step": 22015 + }, + { + "epoch": 3.592169657422512, + "grad_norm": 0.02287198230624199, + "learning_rate": 0.0009808226543810198, + "loss": 0.0466, + "num_input_tokens_seen": 47536768, + "step": 22020 + }, + { + "epoch": 3.592985318107667, + "grad_norm": 0.007511013653129339, + "learning_rate": 0.0009808031251676182, + "loss": 0.2034, + "num_input_tokens_seen": 47548768, + "step": 22025 + }, + { + "epoch": 3.5938009787928222, + "grad_norm": 0.006780500058084726, + "learning_rate": 0.0009807835862101313, + "loss": 0.1427, + "num_input_tokens_seen": 47559968, + "step": 22030 + }, + { + "epoch": 3.594616639477977, + "grad_norm": 0.24111032485961914, + "learning_rate": 0.0009807640375089552, + "loss": 0.152, + "num_input_tokens_seen": 47571040, + "step": 22035 + }, + { + "epoch": 3.5954323001631323, + "grad_norm": 0.11046013981103897, + "learning_rate": 0.000980744479064486, + "loss": 0.109, + "num_input_tokens_seen": 47582816, + "step": 22040 + }, + { + "epoch": 3.596247960848287, + "grad_norm": 0.06996925175189972, + "learning_rate": 0.00098072491087712, + "loss": 0.0567, + "num_input_tokens_seen": 47594112, + "step": 22045 + }, + { + "epoch": 3.597063621533442, + "grad_norm": 0.04954519867897034, + "learning_rate": 0.0009807053329472539, + "loss": 0.1914, + "num_input_tokens_seen": 47604320, + "step": 22050 + }, + { + "epoch": 3.597879282218597, + "grad_norm": 0.005976733285933733, + "learning_rate": 0.0009806857452752844, + "loss": 0.1137, + "num_input_tokens_seen": 47613472, + "step": 22055 + }, + { + "epoch": 3.598694942903752, + "grad_norm": 0.049997031688690186, + "learning_rate": 0.0009806661478616084, + "loss": 0.1709, + "num_input_tokens_seen": 47624096, + "step": 22060 + }, + { + "epoch": 3.5995106035889073, + "grad_norm": 0.21383458375930786, + "learning_rate": 0.000980646540706623, + "loss": 0.141, + "num_input_tokens_seen": 47635456, + "step": 22065 + }, + { + "epoch": 3.600326264274062, + "grad_norm": 0.11436790227890015, + "learning_rate": 0.0009806269238107261, + "loss": 0.0634, + "num_input_tokens_seen": 47644448, + "step": 22070 + }, + { + "epoch": 3.601141924959217, + "grad_norm": 0.14876404404640198, + "learning_rate": 0.0009806072971743148, + "loss": 0.1468, + "num_input_tokens_seen": 47654144, + "step": 22075 + }, + { + "epoch": 3.6019575856443717, + "grad_norm": 0.023250600323081017, + "learning_rate": 0.000980587660797787, + "loss": 0.0282, + "num_input_tokens_seen": 47664672, + "step": 22080 + }, + { + "epoch": 3.602773246329527, + "grad_norm": 0.08113319426774979, + "learning_rate": 0.00098056801468154, + "loss": 0.1201, + "num_input_tokens_seen": 47676512, + "step": 22085 + }, + { + "epoch": 3.603588907014682, + "grad_norm": 0.0897209495306015, + "learning_rate": 0.0009805483588259732, + "loss": 0.1021, + "num_input_tokens_seen": 47687168, + "step": 22090 + }, + { + "epoch": 3.604404567699837, + "grad_norm": 0.10479800403118134, + "learning_rate": 0.000980528693231484, + "loss": 0.0882, + "num_input_tokens_seen": 47698208, + "step": 22095 + }, + { + "epoch": 3.605220228384992, + "grad_norm": 0.034853607416152954, + "learning_rate": 0.0009805090178984712, + "loss": 0.0384, + "num_input_tokens_seen": 47708448, + "step": 22100 + }, + { + "epoch": 3.6060358890701467, + "grad_norm": 0.03427622839808464, + "learning_rate": 0.0009804893328273336, + "loss": 0.0558, + "num_input_tokens_seen": 47718624, + "step": 22105 + }, + { + "epoch": 3.6068515497553015, + "grad_norm": 0.09817445278167725, + "learning_rate": 0.0009804696380184704, + "loss": 0.3093, + "num_input_tokens_seen": 47729184, + "step": 22110 + }, + { + "epoch": 3.607667210440457, + "grad_norm": 0.06453338265419006, + "learning_rate": 0.0009804499334722801, + "loss": 0.1165, + "num_input_tokens_seen": 47740992, + "step": 22115 + }, + { + "epoch": 3.6084828711256116, + "grad_norm": 0.020237017422914505, + "learning_rate": 0.0009804302191891625, + "loss": 0.0861, + "num_input_tokens_seen": 47751520, + "step": 22120 + }, + { + "epoch": 3.609298531810767, + "grad_norm": 0.08078811317682266, + "learning_rate": 0.0009804104951695173, + "loss": 0.1536, + "num_input_tokens_seen": 47761920, + "step": 22125 + }, + { + "epoch": 3.6101141924959217, + "grad_norm": 0.016782863065600395, + "learning_rate": 0.0009803907614137435, + "loss": 0.1406, + "num_input_tokens_seen": 47773376, + "step": 22130 + }, + { + "epoch": 3.6109298531810765, + "grad_norm": 0.06218579038977623, + "learning_rate": 0.0009803710179222419, + "loss": 0.0684, + "num_input_tokens_seen": 47784064, + "step": 22135 + }, + { + "epoch": 3.6117455138662318, + "grad_norm": 0.11012060195207596, + "learning_rate": 0.000980351264695412, + "loss": 0.0528, + "num_input_tokens_seen": 47794592, + "step": 22140 + }, + { + "epoch": 3.6125611745513866, + "grad_norm": 0.07170744240283966, + "learning_rate": 0.0009803315017336545, + "loss": 0.0843, + "num_input_tokens_seen": 47805504, + "step": 22145 + }, + { + "epoch": 3.613376835236542, + "grad_norm": 0.020582256838679314, + "learning_rate": 0.0009803117290373697, + "loss": 0.1173, + "num_input_tokens_seen": 47816128, + "step": 22150 + }, + { + "epoch": 3.6141924959216967, + "grad_norm": 0.11927594244480133, + "learning_rate": 0.0009802919466069585, + "loss": 0.0808, + "num_input_tokens_seen": 47827264, + "step": 22155 + }, + { + "epoch": 3.6150081566068515, + "grad_norm": 0.012182388454675674, + "learning_rate": 0.0009802721544428215, + "loss": 0.0531, + "num_input_tokens_seen": 47838208, + "step": 22160 + }, + { + "epoch": 3.6158238172920063, + "grad_norm": 0.16328498721122742, + "learning_rate": 0.0009802523525453601, + "loss": 0.3425, + "num_input_tokens_seen": 47848192, + "step": 22165 + }, + { + "epoch": 3.6166394779771616, + "grad_norm": 0.008396895602345467, + "learning_rate": 0.0009802325409149757, + "loss": 0.1378, + "num_input_tokens_seen": 47858688, + "step": 22170 + }, + { + "epoch": 3.6174551386623164, + "grad_norm": 0.05154656618833542, + "learning_rate": 0.0009802127195520697, + "loss": 0.0306, + "num_input_tokens_seen": 47870048, + "step": 22175 + }, + { + "epoch": 3.6182707993474716, + "grad_norm": 0.10817541927099228, + "learning_rate": 0.0009801928884570434, + "loss": 0.1192, + "num_input_tokens_seen": 47881984, + "step": 22180 + }, + { + "epoch": 3.6190864600326265, + "grad_norm": 0.055921148508787155, + "learning_rate": 0.0009801730476302992, + "loss": 0.0374, + "num_input_tokens_seen": 47893376, + "step": 22185 + }, + { + "epoch": 3.6199021207177813, + "grad_norm": 0.03370668739080429, + "learning_rate": 0.000980153197072239, + "loss": 0.1129, + "num_input_tokens_seen": 47903680, + "step": 22190 + }, + { + "epoch": 3.6207177814029365, + "grad_norm": 0.20434153079986572, + "learning_rate": 0.0009801333367832651, + "loss": 0.154, + "num_input_tokens_seen": 47914144, + "step": 22195 + }, + { + "epoch": 3.6215334420880914, + "grad_norm": 0.02956208772957325, + "learning_rate": 0.0009801134667637803, + "loss": 0.0243, + "num_input_tokens_seen": 47924224, + "step": 22200 + }, + { + "epoch": 3.622349102773246, + "grad_norm": 0.014472408220171928, + "learning_rate": 0.0009800935870141868, + "loss": 0.0334, + "num_input_tokens_seen": 47935264, + "step": 22205 + }, + { + "epoch": 3.6231647634584014, + "grad_norm": 0.14801977574825287, + "learning_rate": 0.0009800736975348878, + "loss": 0.0773, + "num_input_tokens_seen": 47946432, + "step": 22210 + }, + { + "epoch": 3.6239804241435563, + "grad_norm": 0.015665441751480103, + "learning_rate": 0.0009800537983262862, + "loss": 0.1415, + "num_input_tokens_seen": 47956608, + "step": 22215 + }, + { + "epoch": 3.624796084828711, + "grad_norm": 0.0496622733771801, + "learning_rate": 0.0009800338893887857, + "loss": 0.1674, + "num_input_tokens_seen": 47967552, + "step": 22220 + }, + { + "epoch": 3.6256117455138663, + "grad_norm": 0.16881395876407623, + "learning_rate": 0.000980013970722789, + "loss": 0.1994, + "num_input_tokens_seen": 47977888, + "step": 22225 + }, + { + "epoch": 3.626427406199021, + "grad_norm": 0.06680756062269211, + "learning_rate": 0.0009799940423287005, + "loss": 0.0488, + "num_input_tokens_seen": 47988576, + "step": 22230 + }, + { + "epoch": 3.6272430668841764, + "grad_norm": 0.10704405605792999, + "learning_rate": 0.000979974104206924, + "loss": 0.2733, + "num_input_tokens_seen": 47999776, + "step": 22235 + }, + { + "epoch": 3.6280587275693312, + "grad_norm": 0.022311555221676826, + "learning_rate": 0.0009799541563578632, + "loss": 0.135, + "num_input_tokens_seen": 48010688, + "step": 22240 + }, + { + "epoch": 3.628874388254486, + "grad_norm": 0.034336838871240616, + "learning_rate": 0.0009799341987819224, + "loss": 0.0305, + "num_input_tokens_seen": 48021152, + "step": 22245 + }, + { + "epoch": 3.629690048939641, + "grad_norm": 0.07198742032051086, + "learning_rate": 0.0009799142314795065, + "loss": 0.1507, + "num_input_tokens_seen": 48030176, + "step": 22250 + }, + { + "epoch": 3.630505709624796, + "grad_norm": 0.12438057363033295, + "learning_rate": 0.0009798942544510198, + "loss": 0.1497, + "num_input_tokens_seen": 48039232, + "step": 22255 + }, + { + "epoch": 3.631321370309951, + "grad_norm": 0.02349034883081913, + "learning_rate": 0.000979874267696867, + "loss": 0.1046, + "num_input_tokens_seen": 48049984, + "step": 22260 + }, + { + "epoch": 3.632137030995106, + "grad_norm": 0.027484968304634094, + "learning_rate": 0.0009798542712174537, + "loss": 0.08, + "num_input_tokens_seen": 48061088, + "step": 22265 + }, + { + "epoch": 3.632952691680261, + "grad_norm": 0.1283213347196579, + "learning_rate": 0.0009798342650131845, + "loss": 0.1242, + "num_input_tokens_seen": 48072704, + "step": 22270 + }, + { + "epoch": 3.633768352365416, + "grad_norm": 0.032759904861450195, + "learning_rate": 0.0009798142490844656, + "loss": 0.0713, + "num_input_tokens_seen": 48082848, + "step": 22275 + }, + { + "epoch": 3.634584013050571, + "grad_norm": 0.13074278831481934, + "learning_rate": 0.0009797942234317022, + "loss": 0.1636, + "num_input_tokens_seen": 48094464, + "step": 22280 + }, + { + "epoch": 3.635399673735726, + "grad_norm": 0.111960768699646, + "learning_rate": 0.0009797741880553, + "loss": 0.131, + "num_input_tokens_seen": 48105088, + "step": 22285 + }, + { + "epoch": 3.636215334420881, + "grad_norm": 0.1873292326927185, + "learning_rate": 0.0009797541429556653, + "loss": 0.1139, + "num_input_tokens_seen": 48114432, + "step": 22290 + }, + { + "epoch": 3.637030995106036, + "grad_norm": 0.21978068351745605, + "learning_rate": 0.0009797340881332044, + "loss": 0.0842, + "num_input_tokens_seen": 48124288, + "step": 22295 + }, + { + "epoch": 3.637846655791191, + "grad_norm": 0.05812196806073189, + "learning_rate": 0.0009797140235883236, + "loss": 0.1898, + "num_input_tokens_seen": 48134144, + "step": 22300 + }, + { + "epoch": 3.6386623164763456, + "grad_norm": 0.058976538479328156, + "learning_rate": 0.0009796939493214294, + "loss": 0.0751, + "num_input_tokens_seen": 48144416, + "step": 22305 + }, + { + "epoch": 3.639477977161501, + "grad_norm": 0.05278509110212326, + "learning_rate": 0.000979673865332929, + "loss": 0.0343, + "num_input_tokens_seen": 48155584, + "step": 22310 + }, + { + "epoch": 3.6402936378466557, + "grad_norm": 0.20454150438308716, + "learning_rate": 0.0009796537716232289, + "loss": 0.1888, + "num_input_tokens_seen": 48167232, + "step": 22315 + }, + { + "epoch": 3.641109298531811, + "grad_norm": 0.21192015707492828, + "learning_rate": 0.000979633668192737, + "loss": 0.1851, + "num_input_tokens_seen": 48178208, + "step": 22320 + }, + { + "epoch": 3.641924959216966, + "grad_norm": 0.013276055455207825, + "learning_rate": 0.0009796135550418602, + "loss": 0.0459, + "num_input_tokens_seen": 48188896, + "step": 22325 + }, + { + "epoch": 3.6427406199021206, + "grad_norm": 0.16026149690151215, + "learning_rate": 0.0009795934321710062, + "loss": 0.1719, + "num_input_tokens_seen": 48200000, + "step": 22330 + }, + { + "epoch": 3.6435562805872754, + "grad_norm": 0.19552117586135864, + "learning_rate": 0.0009795732995805829, + "loss": 0.1737, + "num_input_tokens_seen": 48211200, + "step": 22335 + }, + { + "epoch": 3.6443719412724307, + "grad_norm": 0.11249581724405289, + "learning_rate": 0.0009795531572709983, + "loss": 0.178, + "num_input_tokens_seen": 48221504, + "step": 22340 + }, + { + "epoch": 3.6451876019575855, + "grad_norm": 0.1763429343700409, + "learning_rate": 0.0009795330052426608, + "loss": 0.2026, + "num_input_tokens_seen": 48230976, + "step": 22345 + }, + { + "epoch": 3.6460032626427408, + "grad_norm": 0.09847237169742584, + "learning_rate": 0.0009795128434959785, + "loss": 0.0565, + "num_input_tokens_seen": 48242368, + "step": 22350 + }, + { + "epoch": 3.6468189233278956, + "grad_norm": 0.06162923201918602, + "learning_rate": 0.00097949267203136, + "loss": 0.1123, + "num_input_tokens_seen": 48253696, + "step": 22355 + }, + { + "epoch": 3.6476345840130504, + "grad_norm": 0.06795935332775116, + "learning_rate": 0.0009794724908492143, + "loss": 0.1455, + "num_input_tokens_seen": 48264768, + "step": 22360 + }, + { + "epoch": 3.6484502446982057, + "grad_norm": 0.02337501384317875, + "learning_rate": 0.0009794522999499503, + "loss": 0.1238, + "num_input_tokens_seen": 48275520, + "step": 22365 + }, + { + "epoch": 3.6492659053833605, + "grad_norm": 0.1153092309832573, + "learning_rate": 0.0009794320993339772, + "loss": 0.1104, + "num_input_tokens_seen": 48284960, + "step": 22370 + }, + { + "epoch": 3.6500815660685157, + "grad_norm": 0.032297346740961075, + "learning_rate": 0.0009794118890017046, + "loss": 0.0336, + "num_input_tokens_seen": 48295840, + "step": 22375 + }, + { + "epoch": 3.6508972267536706, + "grad_norm": 0.07139687985181808, + "learning_rate": 0.0009793916689535417, + "loss": 0.0617, + "num_input_tokens_seen": 48306880, + "step": 22380 + }, + { + "epoch": 3.6517128874388254, + "grad_norm": 0.04134083539247513, + "learning_rate": 0.0009793714391898984, + "loss": 0.0793, + "num_input_tokens_seen": 48318336, + "step": 22385 + }, + { + "epoch": 3.65252854812398, + "grad_norm": 0.1838148534297943, + "learning_rate": 0.000979351199711185, + "loss": 0.1565, + "num_input_tokens_seen": 48329152, + "step": 22390 + }, + { + "epoch": 3.6533442088091355, + "grad_norm": 0.02638830617070198, + "learning_rate": 0.0009793309505178112, + "loss": 0.0241, + "num_input_tokens_seen": 48339136, + "step": 22395 + }, + { + "epoch": 3.6541598694942903, + "grad_norm": 0.04090012237429619, + "learning_rate": 0.000979310691610188, + "loss": 0.0861, + "num_input_tokens_seen": 48350208, + "step": 22400 + }, + { + "epoch": 3.6549755301794455, + "grad_norm": 0.04064729064702988, + "learning_rate": 0.0009792904229887253, + "loss": 0.1429, + "num_input_tokens_seen": 48362016, + "step": 22405 + }, + { + "epoch": 3.6557911908646004, + "grad_norm": 0.03304686397314072, + "learning_rate": 0.0009792701446538342, + "loss": 0.1006, + "num_input_tokens_seen": 48371424, + "step": 22410 + }, + { + "epoch": 3.656606851549755, + "grad_norm": 0.030130738392472267, + "learning_rate": 0.0009792498566059255, + "loss": 0.1856, + "num_input_tokens_seen": 48383328, + "step": 22415 + }, + { + "epoch": 3.6574225122349104, + "grad_norm": 0.011232363991439342, + "learning_rate": 0.0009792295588454106, + "loss": 0.0247, + "num_input_tokens_seen": 48394304, + "step": 22420 + }, + { + "epoch": 3.6582381729200653, + "grad_norm": 0.12669359147548676, + "learning_rate": 0.0009792092513727006, + "loss": 0.0528, + "num_input_tokens_seen": 48405376, + "step": 22425 + }, + { + "epoch": 3.65905383360522, + "grad_norm": 0.05973414331674576, + "learning_rate": 0.0009791889341882075, + "loss": 0.034, + "num_input_tokens_seen": 48416352, + "step": 22430 + }, + { + "epoch": 3.6598694942903753, + "grad_norm": 0.1548718512058258, + "learning_rate": 0.0009791686072923424, + "loss": 0.1788, + "num_input_tokens_seen": 48427488, + "step": 22435 + }, + { + "epoch": 3.66068515497553, + "grad_norm": 0.0041184513829648495, + "learning_rate": 0.0009791482706855178, + "loss": 0.0424, + "num_input_tokens_seen": 48439520, + "step": 22440 + }, + { + "epoch": 3.661500815660685, + "grad_norm": 0.035848312079906464, + "learning_rate": 0.0009791279243681456, + "loss": 0.0338, + "num_input_tokens_seen": 48449984, + "step": 22445 + }, + { + "epoch": 3.6623164763458402, + "grad_norm": 0.14587965607643127, + "learning_rate": 0.0009791075683406383, + "loss": 0.22, + "num_input_tokens_seen": 48460672, + "step": 22450 + }, + { + "epoch": 3.663132137030995, + "grad_norm": 0.029001962393522263, + "learning_rate": 0.0009790872026034082, + "loss": 0.045, + "num_input_tokens_seen": 48471328, + "step": 22455 + }, + { + "epoch": 3.6639477977161503, + "grad_norm": 0.012982098385691643, + "learning_rate": 0.0009790668271568684, + "loss": 0.0304, + "num_input_tokens_seen": 48481664, + "step": 22460 + }, + { + "epoch": 3.664763458401305, + "grad_norm": 0.01756366901099682, + "learning_rate": 0.0009790464420014312, + "loss": 0.0975, + "num_input_tokens_seen": 48492896, + "step": 22465 + }, + { + "epoch": 3.66557911908646, + "grad_norm": 0.06174856424331665, + "learning_rate": 0.0009790260471375105, + "loss": 0.0412, + "num_input_tokens_seen": 48503648, + "step": 22470 + }, + { + "epoch": 3.6663947797716148, + "grad_norm": 0.08776724338531494, + "learning_rate": 0.0009790056425655193, + "loss": 0.0602, + "num_input_tokens_seen": 48514368, + "step": 22475 + }, + { + "epoch": 3.66721044045677, + "grad_norm": 0.011923730373382568, + "learning_rate": 0.0009789852282858708, + "loss": 0.1686, + "num_input_tokens_seen": 48525504, + "step": 22480 + }, + { + "epoch": 3.668026101141925, + "grad_norm": 0.09198827296495438, + "learning_rate": 0.0009789648042989793, + "loss": 0.0809, + "num_input_tokens_seen": 48534880, + "step": 22485 + }, + { + "epoch": 3.66884176182708, + "grad_norm": 0.14244717359542847, + "learning_rate": 0.0009789443706052583, + "loss": 0.194, + "num_input_tokens_seen": 48546656, + "step": 22490 + }, + { + "epoch": 3.669657422512235, + "grad_norm": 0.00833072792738676, + "learning_rate": 0.000978923927205122, + "loss": 0.0555, + "num_input_tokens_seen": 48557952, + "step": 22495 + }, + { + "epoch": 3.6704730831973897, + "grad_norm": 0.011483049020171165, + "learning_rate": 0.0009789034740989848, + "loss": 0.0691, + "num_input_tokens_seen": 48568832, + "step": 22500 + }, + { + "epoch": 3.671288743882545, + "grad_norm": 0.06822390854358673, + "learning_rate": 0.0009788830112872611, + "loss": 0.1972, + "num_input_tokens_seen": 48578784, + "step": 22505 + }, + { + "epoch": 3.6721044045677, + "grad_norm": 0.0038656939286738634, + "learning_rate": 0.0009788625387703658, + "loss": 0.0911, + "num_input_tokens_seen": 48590016, + "step": 22510 + }, + { + "epoch": 3.672920065252855, + "grad_norm": 0.033892277628183365, + "learning_rate": 0.0009788420565487136, + "loss": 0.0526, + "num_input_tokens_seen": 48600608, + "step": 22515 + }, + { + "epoch": 3.67373572593801, + "grad_norm": 0.005967119242995977, + "learning_rate": 0.0009788215646227196, + "loss": 0.0372, + "num_input_tokens_seen": 48610912, + "step": 22520 + }, + { + "epoch": 3.6745513866231647, + "grad_norm": 0.08318198472261429, + "learning_rate": 0.0009788010629927992, + "loss": 0.0957, + "num_input_tokens_seen": 48622272, + "step": 22525 + }, + { + "epoch": 3.6753670473083195, + "grad_norm": 0.013141914270818233, + "learning_rate": 0.000978780551659368, + "loss": 0.0982, + "num_input_tokens_seen": 48633376, + "step": 22530 + }, + { + "epoch": 3.676182707993475, + "grad_norm": 0.22537027299404144, + "learning_rate": 0.0009787600306228415, + "loss": 0.1497, + "num_input_tokens_seen": 48644608, + "step": 22535 + }, + { + "epoch": 3.6769983686786296, + "grad_norm": 0.07313469797372818, + "learning_rate": 0.0009787394998836355, + "loss": 0.0625, + "num_input_tokens_seen": 48656448, + "step": 22540 + }, + { + "epoch": 3.677814029363785, + "grad_norm": 0.036538027226924896, + "learning_rate": 0.0009787189594421663, + "loss": 0.1492, + "num_input_tokens_seen": 48667936, + "step": 22545 + }, + { + "epoch": 3.6786296900489397, + "grad_norm": 0.1372271627187729, + "learning_rate": 0.00097869840929885, + "loss": 0.241, + "num_input_tokens_seen": 48678240, + "step": 22550 + }, + { + "epoch": 3.6794453507340945, + "grad_norm": 0.10177024453878403, + "learning_rate": 0.0009786778494541033, + "loss": 0.0584, + "num_input_tokens_seen": 48687296, + "step": 22555 + }, + { + "epoch": 3.6802610114192493, + "grad_norm": 0.015342644415795803, + "learning_rate": 0.0009786572799083426, + "loss": 0.0319, + "num_input_tokens_seen": 48698720, + "step": 22560 + }, + { + "epoch": 3.6810766721044046, + "grad_norm": 0.1638292670249939, + "learning_rate": 0.000978636700661985, + "loss": 0.0807, + "num_input_tokens_seen": 48709696, + "step": 22565 + }, + { + "epoch": 3.6818923327895594, + "grad_norm": 0.1733890175819397, + "learning_rate": 0.0009786161117154475, + "loss": 0.0478, + "num_input_tokens_seen": 48720032, + "step": 22570 + }, + { + "epoch": 3.6827079934747147, + "grad_norm": 0.18457923829555511, + "learning_rate": 0.0009785955130691471, + "loss": 0.1239, + "num_input_tokens_seen": 48729664, + "step": 22575 + }, + { + "epoch": 3.6835236541598695, + "grad_norm": 0.026617489755153656, + "learning_rate": 0.0009785749047235017, + "loss": 0.0489, + "num_input_tokens_seen": 48742336, + "step": 22580 + }, + { + "epoch": 3.6843393148450243, + "grad_norm": 0.03972393646836281, + "learning_rate": 0.0009785542866789288, + "loss": 0.068, + "num_input_tokens_seen": 48754048, + "step": 22585 + }, + { + "epoch": 3.6851549755301796, + "grad_norm": 0.057122793048620224, + "learning_rate": 0.000978533658935846, + "loss": 0.0697, + "num_input_tokens_seen": 48765344, + "step": 22590 + }, + { + "epoch": 3.6859706362153344, + "grad_norm": 0.09369645267724991, + "learning_rate": 0.0009785130214946716, + "loss": 0.1098, + "num_input_tokens_seen": 48777152, + "step": 22595 + }, + { + "epoch": 3.6867862969004896, + "grad_norm": 0.05586928129196167, + "learning_rate": 0.0009784923743558238, + "loss": 0.081, + "num_input_tokens_seen": 48786880, + "step": 22600 + }, + { + "epoch": 3.6876019575856445, + "grad_norm": 0.010382387787103653, + "learning_rate": 0.000978471717519721, + "loss": 0.0131, + "num_input_tokens_seen": 48796256, + "step": 22605 + }, + { + "epoch": 3.6884176182707993, + "grad_norm": 0.018875733017921448, + "learning_rate": 0.0009784510509867818, + "loss": 0.1612, + "num_input_tokens_seen": 48808096, + "step": 22610 + }, + { + "epoch": 3.689233278955954, + "grad_norm": 0.0659688413143158, + "learning_rate": 0.0009784303747574254, + "loss": 0.1626, + "num_input_tokens_seen": 48818848, + "step": 22615 + }, + { + "epoch": 3.6900489396411094, + "grad_norm": 0.015230267308652401, + "learning_rate": 0.0009784096888320703, + "loss": 0.0803, + "num_input_tokens_seen": 48831296, + "step": 22620 + }, + { + "epoch": 3.690864600326264, + "grad_norm": 0.04165417701005936, + "learning_rate": 0.000978388993211136, + "loss": 0.1199, + "num_input_tokens_seen": 48841536, + "step": 22625 + }, + { + "epoch": 3.6916802610114194, + "grad_norm": 0.2265789657831192, + "learning_rate": 0.0009783682878950416, + "loss": 0.2312, + "num_input_tokens_seen": 48853152, + "step": 22630 + }, + { + "epoch": 3.6924959216965743, + "grad_norm": 0.01558384858071804, + "learning_rate": 0.0009783475728842074, + "loss": 0.1092, + "num_input_tokens_seen": 48862688, + "step": 22635 + }, + { + "epoch": 3.693311582381729, + "grad_norm": 0.3484792709350586, + "learning_rate": 0.0009783268481790527, + "loss": 0.1325, + "num_input_tokens_seen": 48874688, + "step": 22640 + }, + { + "epoch": 3.6941272430668843, + "grad_norm": 0.08699874579906464, + "learning_rate": 0.0009783061137799975, + "loss": 0.1039, + "num_input_tokens_seen": 48885856, + "step": 22645 + }, + { + "epoch": 3.694942903752039, + "grad_norm": 0.06225195899605751, + "learning_rate": 0.000978285369687462, + "loss": 0.1551, + "num_input_tokens_seen": 48896096, + "step": 22650 + }, + { + "epoch": 3.695758564437194, + "grad_norm": 0.12717466056346893, + "learning_rate": 0.000978264615901867, + "loss": 0.2005, + "num_input_tokens_seen": 48907328, + "step": 22655 + }, + { + "epoch": 3.6965742251223492, + "grad_norm": 0.11328820139169693, + "learning_rate": 0.0009782438524236327, + "loss": 0.1419, + "num_input_tokens_seen": 48918304, + "step": 22660 + }, + { + "epoch": 3.697389885807504, + "grad_norm": 0.030300073325634003, + "learning_rate": 0.00097822307925318, + "loss": 0.0657, + "num_input_tokens_seen": 48929440, + "step": 22665 + }, + { + "epoch": 3.698205546492659, + "grad_norm": 0.07645571231842041, + "learning_rate": 0.00097820229639093, + "loss": 0.0335, + "num_input_tokens_seen": 48940256, + "step": 22670 + }, + { + "epoch": 3.699021207177814, + "grad_norm": 0.018116958439350128, + "learning_rate": 0.0009781815038373042, + "loss": 0.0827, + "num_input_tokens_seen": 48949856, + "step": 22675 + }, + { + "epoch": 3.699836867862969, + "grad_norm": 0.1341787725687027, + "learning_rate": 0.000978160701592723, + "loss": 0.0671, + "num_input_tokens_seen": 48960768, + "step": 22680 + }, + { + "epoch": 3.700652528548124, + "grad_norm": 0.1685478836297989, + "learning_rate": 0.000978139889657609, + "loss": 0.1561, + "num_input_tokens_seen": 48971264, + "step": 22685 + }, + { + "epoch": 3.701468189233279, + "grad_norm": 0.01278993021696806, + "learning_rate": 0.0009781190680323833, + "loss": 0.1425, + "num_input_tokens_seen": 48981632, + "step": 22690 + }, + { + "epoch": 3.702283849918434, + "grad_norm": 0.12621980905532837, + "learning_rate": 0.0009780982367174683, + "loss": 0.2153, + "num_input_tokens_seen": 48993024, + "step": 22695 + }, + { + "epoch": 3.7030995106035887, + "grad_norm": 0.014589357189834118, + "learning_rate": 0.000978077395713286, + "loss": 0.1709, + "num_input_tokens_seen": 49004064, + "step": 22700 + }, + { + "epoch": 3.703915171288744, + "grad_norm": 0.11047063767910004, + "learning_rate": 0.0009780565450202587, + "loss": 0.1542, + "num_input_tokens_seen": 49016032, + "step": 22705 + }, + { + "epoch": 3.7047308319738987, + "grad_norm": 0.06583212316036224, + "learning_rate": 0.0009780356846388091, + "loss": 0.0665, + "num_input_tokens_seen": 49026336, + "step": 22710 + }, + { + "epoch": 3.705546492659054, + "grad_norm": 0.11619914323091507, + "learning_rate": 0.00097801481456936, + "loss": 0.1401, + "num_input_tokens_seen": 49037312, + "step": 22715 + }, + { + "epoch": 3.706362153344209, + "grad_norm": 0.04538528621196747, + "learning_rate": 0.0009779939348123342, + "loss": 0.1543, + "num_input_tokens_seen": 49048480, + "step": 22720 + }, + { + "epoch": 3.7071778140293636, + "grad_norm": 0.09305012226104736, + "learning_rate": 0.000977973045368155, + "loss": 0.2704, + "num_input_tokens_seen": 49059840, + "step": 22725 + }, + { + "epoch": 3.707993474714519, + "grad_norm": 0.03942722827196121, + "learning_rate": 0.0009779521462372457, + "loss": 0.0922, + "num_input_tokens_seen": 49070720, + "step": 22730 + }, + { + "epoch": 3.7088091353996737, + "grad_norm": 0.021622756496071815, + "learning_rate": 0.0009779312374200298, + "loss": 0.0607, + "num_input_tokens_seen": 49081408, + "step": 22735 + }, + { + "epoch": 3.709624796084829, + "grad_norm": 0.07174298167228699, + "learning_rate": 0.0009779103189169309, + "loss": 0.0617, + "num_input_tokens_seen": 49092416, + "step": 22740 + }, + { + "epoch": 3.710440456769984, + "grad_norm": 0.08502726256847382, + "learning_rate": 0.0009778893907283733, + "loss": 0.149, + "num_input_tokens_seen": 49102464, + "step": 22745 + }, + { + "epoch": 3.7112561174551386, + "grad_norm": 0.02458067424595356, + "learning_rate": 0.000977868452854781, + "loss": 0.0434, + "num_input_tokens_seen": 49113920, + "step": 22750 + }, + { + "epoch": 3.7120717781402934, + "grad_norm": 0.020260639488697052, + "learning_rate": 0.000977847505296578, + "loss": 0.0967, + "num_input_tokens_seen": 49124096, + "step": 22755 + }, + { + "epoch": 3.7128874388254487, + "grad_norm": 0.0071016158908605576, + "learning_rate": 0.0009778265480541895, + "loss": 0.0682, + "num_input_tokens_seen": 49134048, + "step": 22760 + }, + { + "epoch": 3.7137030995106035, + "grad_norm": 0.046654488891363144, + "learning_rate": 0.0009778055811280396, + "loss": 0.0308, + "num_input_tokens_seen": 49144800, + "step": 22765 + }, + { + "epoch": 3.7145187601957588, + "grad_norm": 0.18117330968379974, + "learning_rate": 0.0009777846045185535, + "loss": 0.1415, + "num_input_tokens_seen": 49156128, + "step": 22770 + }, + { + "epoch": 3.7153344208809136, + "grad_norm": 0.049614764750003815, + "learning_rate": 0.0009777636182261562, + "loss": 0.1221, + "num_input_tokens_seen": 49167168, + "step": 22775 + }, + { + "epoch": 3.7161500815660684, + "grad_norm": 0.010831250809133053, + "learning_rate": 0.0009777426222512733, + "loss": 0.044, + "num_input_tokens_seen": 49177248, + "step": 22780 + }, + { + "epoch": 3.7169657422512232, + "grad_norm": 0.029537372291088104, + "learning_rate": 0.0009777216165943298, + "loss": 0.0343, + "num_input_tokens_seen": 49189568, + "step": 22785 + }, + { + "epoch": 3.7177814029363785, + "grad_norm": 0.24401909112930298, + "learning_rate": 0.0009777006012557522, + "loss": 0.1414, + "num_input_tokens_seen": 49198912, + "step": 22790 + }, + { + "epoch": 3.7185970636215333, + "grad_norm": 0.03947121649980545, + "learning_rate": 0.0009776795762359654, + "loss": 0.1224, + "num_input_tokens_seen": 49210112, + "step": 22795 + }, + { + "epoch": 3.7194127243066886, + "grad_norm": 0.2472681701183319, + "learning_rate": 0.0009776585415353963, + "loss": 0.1295, + "num_input_tokens_seen": 49221344, + "step": 22800 + }, + { + "epoch": 3.7202283849918434, + "grad_norm": 0.0034598468337208033, + "learning_rate": 0.0009776374971544708, + "loss": 0.1138, + "num_input_tokens_seen": 49232064, + "step": 22805 + }, + { + "epoch": 3.721044045676998, + "grad_norm": 0.1858585774898529, + "learning_rate": 0.0009776164430936153, + "loss": 0.1055, + "num_input_tokens_seen": 49242144, + "step": 22810 + }, + { + "epoch": 3.7218597063621535, + "grad_norm": 0.1301809847354889, + "learning_rate": 0.000977595379353257, + "loss": 0.0699, + "num_input_tokens_seen": 49253248, + "step": 22815 + }, + { + "epoch": 3.7226753670473083, + "grad_norm": 0.11833587288856506, + "learning_rate": 0.0009775743059338223, + "loss": 0.0802, + "num_input_tokens_seen": 49264384, + "step": 22820 + }, + { + "epoch": 3.7234910277324635, + "grad_norm": 0.008296025916934013, + "learning_rate": 0.0009775532228357385, + "loss": 0.0254, + "num_input_tokens_seen": 49276416, + "step": 22825 + }, + { + "epoch": 3.7243066884176184, + "grad_norm": 0.08529414236545563, + "learning_rate": 0.0009775321300594328, + "loss": 0.0444, + "num_input_tokens_seen": 49288800, + "step": 22830 + }, + { + "epoch": 3.725122349102773, + "grad_norm": 0.18556520342826843, + "learning_rate": 0.0009775110276053327, + "loss": 0.1331, + "num_input_tokens_seen": 49299328, + "step": 22835 + }, + { + "epoch": 3.725938009787928, + "grad_norm": 0.16216923296451569, + "learning_rate": 0.000977489915473866, + "loss": 0.0934, + "num_input_tokens_seen": 49311360, + "step": 22840 + }, + { + "epoch": 3.7267536704730833, + "grad_norm": 0.030783111229538918, + "learning_rate": 0.0009774687936654602, + "loss": 0.22, + "num_input_tokens_seen": 49321632, + "step": 22845 + }, + { + "epoch": 3.727569331158238, + "grad_norm": 0.16580526530742645, + "learning_rate": 0.0009774476621805437, + "loss": 0.0757, + "num_input_tokens_seen": 49332256, + "step": 22850 + }, + { + "epoch": 3.7283849918433933, + "grad_norm": 0.019450953230261803, + "learning_rate": 0.0009774265210195446, + "loss": 0.0723, + "num_input_tokens_seen": 49343360, + "step": 22855 + }, + { + "epoch": 3.729200652528548, + "grad_norm": 0.0033402855042368174, + "learning_rate": 0.0009774053701828913, + "loss": 0.1133, + "num_input_tokens_seen": 49354112, + "step": 22860 + }, + { + "epoch": 3.730016313213703, + "grad_norm": 0.022833161056041718, + "learning_rate": 0.0009773842096710127, + "loss": 0.122, + "num_input_tokens_seen": 49365120, + "step": 22865 + }, + { + "epoch": 3.7308319738988582, + "grad_norm": 0.13699039816856384, + "learning_rate": 0.0009773630394843374, + "loss": 0.1885, + "num_input_tokens_seen": 49377600, + "step": 22870 + }, + { + "epoch": 3.731647634584013, + "grad_norm": 0.05008578300476074, + "learning_rate": 0.0009773418596232945, + "loss": 0.1185, + "num_input_tokens_seen": 49387648, + "step": 22875 + }, + { + "epoch": 3.732463295269168, + "grad_norm": 0.24140454828739166, + "learning_rate": 0.0009773206700883135, + "loss": 0.2542, + "num_input_tokens_seen": 49398368, + "step": 22880 + }, + { + "epoch": 3.733278955954323, + "grad_norm": 0.1162518858909607, + "learning_rate": 0.0009772994708798232, + "loss": 0.0974, + "num_input_tokens_seen": 49408416, + "step": 22885 + }, + { + "epoch": 3.734094616639478, + "grad_norm": 0.07058801501989365, + "learning_rate": 0.000977278261998254, + "loss": 0.2702, + "num_input_tokens_seen": 49417888, + "step": 22890 + }, + { + "epoch": 3.7349102773246328, + "grad_norm": 0.09200078248977661, + "learning_rate": 0.0009772570434440353, + "loss": 0.1439, + "num_input_tokens_seen": 49427296, + "step": 22895 + }, + { + "epoch": 3.735725938009788, + "grad_norm": 0.04666079208254814, + "learning_rate": 0.000977235815217597, + "loss": 0.1805, + "num_input_tokens_seen": 49437984, + "step": 22900 + }, + { + "epoch": 3.736541598694943, + "grad_norm": 0.09155704081058502, + "learning_rate": 0.0009772145773193695, + "loss": 0.1551, + "num_input_tokens_seen": 49448288, + "step": 22905 + }, + { + "epoch": 3.737357259380098, + "grad_norm": 0.02924705669283867, + "learning_rate": 0.0009771933297497831, + "loss": 0.1273, + "num_input_tokens_seen": 49458624, + "step": 22910 + }, + { + "epoch": 3.738172920065253, + "grad_norm": 0.07338499277830124, + "learning_rate": 0.0009771720725092687, + "loss": 0.0835, + "num_input_tokens_seen": 49468032, + "step": 22915 + }, + { + "epoch": 3.7389885807504077, + "grad_norm": 0.06460889428853989, + "learning_rate": 0.000977150805598257, + "loss": 0.0623, + "num_input_tokens_seen": 49479744, + "step": 22920 + }, + { + "epoch": 3.7398042414355626, + "grad_norm": 0.013302492909133434, + "learning_rate": 0.0009771295290171788, + "loss": 0.0592, + "num_input_tokens_seen": 49490688, + "step": 22925 + }, + { + "epoch": 3.740619902120718, + "grad_norm": 0.11618553102016449, + "learning_rate": 0.0009771082427664655, + "loss": 0.0532, + "num_input_tokens_seen": 49499936, + "step": 22930 + }, + { + "epoch": 3.7414355628058726, + "grad_norm": 0.022082814946770668, + "learning_rate": 0.0009770869468465483, + "loss": 0.0571, + "num_input_tokens_seen": 49510080, + "step": 22935 + }, + { + "epoch": 3.742251223491028, + "grad_norm": 0.059666723012924194, + "learning_rate": 0.000977065641257859, + "loss": 0.0918, + "num_input_tokens_seen": 49521056, + "step": 22940 + }, + { + "epoch": 3.7430668841761827, + "grad_norm": 0.027317995205521584, + "learning_rate": 0.000977044326000829, + "loss": 0.0544, + "num_input_tokens_seen": 49531712, + "step": 22945 + }, + { + "epoch": 3.7438825448613375, + "grad_norm": 0.11584898084402084, + "learning_rate": 0.0009770230010758907, + "loss": 0.0694, + "num_input_tokens_seen": 49542912, + "step": 22950 + }, + { + "epoch": 3.744698205546493, + "grad_norm": 0.06977637112140656, + "learning_rate": 0.0009770016664834762, + "loss": 0.0325, + "num_input_tokens_seen": 49553888, + "step": 22955 + }, + { + "epoch": 3.7455138662316476, + "grad_norm": 0.02564827725291252, + "learning_rate": 0.000976980322224018, + "loss": 0.1574, + "num_input_tokens_seen": 49565376, + "step": 22960 + }, + { + "epoch": 3.746329526916803, + "grad_norm": 0.04051194712519646, + "learning_rate": 0.0009769589682979481, + "loss": 0.0402, + "num_input_tokens_seen": 49577184, + "step": 22965 + }, + { + "epoch": 3.7471451876019577, + "grad_norm": 0.09753694385290146, + "learning_rate": 0.0009769376047056998, + "loss": 0.0497, + "num_input_tokens_seen": 49588608, + "step": 22970 + }, + { + "epoch": 3.7479608482871125, + "grad_norm": 0.03887641429901123, + "learning_rate": 0.0009769162314477058, + "loss": 0.0331, + "num_input_tokens_seen": 49599840, + "step": 22975 + }, + { + "epoch": 3.7487765089722673, + "grad_norm": 0.0328313373029232, + "learning_rate": 0.0009768948485243997, + "loss": 0.1735, + "num_input_tokens_seen": 49611040, + "step": 22980 + }, + { + "epoch": 3.7495921696574226, + "grad_norm": 0.01911292038857937, + "learning_rate": 0.0009768734559362142, + "loss": 0.0408, + "num_input_tokens_seen": 49621472, + "step": 22985 + }, + { + "epoch": 3.7504078303425774, + "grad_norm": 0.01694757491350174, + "learning_rate": 0.0009768520536835832, + "loss": 0.074, + "num_input_tokens_seen": 49632832, + "step": 22990 + }, + { + "epoch": 3.7512234910277327, + "grad_norm": 0.023697957396507263, + "learning_rate": 0.0009768306417669405, + "loss": 0.2178, + "num_input_tokens_seen": 49643136, + "step": 22995 + }, + { + "epoch": 3.7520391517128875, + "grad_norm": 0.015679119154810905, + "learning_rate": 0.00097680922018672, + "loss": 0.141, + "num_input_tokens_seen": 49653920, + "step": 23000 + }, + { + "epoch": 3.7528548123980423, + "grad_norm": 0.004297219682484865, + "learning_rate": 0.0009767877889433555, + "loss": 0.0427, + "num_input_tokens_seen": 49664992, + "step": 23005 + }, + { + "epoch": 3.753670473083197, + "grad_norm": 0.011121313087642193, + "learning_rate": 0.0009767663480372817, + "loss": 0.0194, + "num_input_tokens_seen": 49676000, + "step": 23010 + }, + { + "epoch": 3.7544861337683524, + "grad_norm": 0.02730000764131546, + "learning_rate": 0.0009767448974689332, + "loss": 0.0832, + "num_input_tokens_seen": 49687712, + "step": 23015 + }, + { + "epoch": 3.755301794453507, + "grad_norm": 0.08978021889925003, + "learning_rate": 0.0009767234372387444, + "loss": 0.0377, + "num_input_tokens_seen": 49698944, + "step": 23020 + }, + { + "epoch": 3.7561174551386625, + "grad_norm": 0.004906428512185812, + "learning_rate": 0.0009767019673471505, + "loss": 0.1056, + "num_input_tokens_seen": 49710048, + "step": 23025 + }, + { + "epoch": 3.7569331158238173, + "grad_norm": 0.07726229727268219, + "learning_rate": 0.0009766804877945864, + "loss": 0.1437, + "num_input_tokens_seen": 49720672, + "step": 23030 + }, + { + "epoch": 3.757748776508972, + "grad_norm": 0.016890767961740494, + "learning_rate": 0.0009766589985814875, + "loss": 0.0453, + "num_input_tokens_seen": 49731776, + "step": 23035 + }, + { + "epoch": 3.7585644371941274, + "grad_norm": 0.07190525531768799, + "learning_rate": 0.0009766374997082893, + "loss": 0.1601, + "num_input_tokens_seen": 49742336, + "step": 23040 + }, + { + "epoch": 3.759380097879282, + "grad_norm": 0.17989444732666016, + "learning_rate": 0.0009766159911754277, + "loss": 0.2412, + "num_input_tokens_seen": 49753408, + "step": 23045 + }, + { + "epoch": 3.7601957585644374, + "grad_norm": 0.024080758914351463, + "learning_rate": 0.0009765944729833382, + "loss": 0.0809, + "num_input_tokens_seen": 49764192, + "step": 23050 + }, + { + "epoch": 3.7610114192495923, + "grad_norm": 0.1472521722316742, + "learning_rate": 0.0009765729451324573, + "loss": 0.074, + "num_input_tokens_seen": 49775840, + "step": 23055 + }, + { + "epoch": 3.761827079934747, + "grad_norm": 0.010671873576939106, + "learning_rate": 0.000976551407623221, + "loss": 0.0968, + "num_input_tokens_seen": 49785408, + "step": 23060 + }, + { + "epoch": 3.762642740619902, + "grad_norm": 0.15884926915168762, + "learning_rate": 0.0009765298604560657, + "loss": 0.1618, + "num_input_tokens_seen": 49796192, + "step": 23065 + }, + { + "epoch": 3.763458401305057, + "grad_norm": 0.012583589181303978, + "learning_rate": 0.0009765083036314284, + "loss": 0.0376, + "num_input_tokens_seen": 49807296, + "step": 23070 + }, + { + "epoch": 3.764274061990212, + "grad_norm": 0.14793801307678223, + "learning_rate": 0.0009764867371497459, + "loss": 0.1269, + "num_input_tokens_seen": 49818368, + "step": 23075 + }, + { + "epoch": 3.7650897226753672, + "grad_norm": 0.0198773592710495, + "learning_rate": 0.000976465161011455, + "loss": 0.1723, + "num_input_tokens_seen": 49830368, + "step": 23080 + }, + { + "epoch": 3.765905383360522, + "grad_norm": 0.11149783432483673, + "learning_rate": 0.0009764435752169933, + "loss": 0.1744, + "num_input_tokens_seen": 49840960, + "step": 23085 + }, + { + "epoch": 3.766721044045677, + "grad_norm": 0.0754162073135376, + "learning_rate": 0.0009764219797667982, + "loss": 0.2412, + "num_input_tokens_seen": 49851776, + "step": 23090 + }, + { + "epoch": 3.767536704730832, + "grad_norm": 0.09039194136857986, + "learning_rate": 0.0009764003746613073, + "loss": 0.1566, + "num_input_tokens_seen": 49862336, + "step": 23095 + }, + { + "epoch": 3.768352365415987, + "grad_norm": 0.12063119560480118, + "learning_rate": 0.0009763787599009583, + "loss": 0.2348, + "num_input_tokens_seen": 49874208, + "step": 23100 + }, + { + "epoch": 3.7691680261011418, + "grad_norm": 0.0636802688241005, + "learning_rate": 0.0009763571354861895, + "loss": 0.103, + "num_input_tokens_seen": 49885440, + "step": 23105 + }, + { + "epoch": 3.769983686786297, + "grad_norm": 0.06430269032716751, + "learning_rate": 0.0009763355014174391, + "loss": 0.1101, + "num_input_tokens_seen": 49896832, + "step": 23110 + }, + { + "epoch": 3.770799347471452, + "grad_norm": 0.022313209250569344, + "learning_rate": 0.0009763138576951454, + "loss": 0.1585, + "num_input_tokens_seen": 49907008, + "step": 23115 + }, + { + "epoch": 3.7716150081566067, + "grad_norm": 0.021583598107099533, + "learning_rate": 0.0009762922043197471, + "loss": 0.0422, + "num_input_tokens_seen": 49917600, + "step": 23120 + }, + { + "epoch": 3.772430668841762, + "grad_norm": 0.09512555599212646, + "learning_rate": 0.0009762705412916831, + "loss": 0.0796, + "num_input_tokens_seen": 49929280, + "step": 23125 + }, + { + "epoch": 3.7732463295269167, + "grad_norm": 0.08746993541717529, + "learning_rate": 0.0009762488686113924, + "loss": 0.0694, + "num_input_tokens_seen": 49940384, + "step": 23130 + }, + { + "epoch": 3.774061990212072, + "grad_norm": 0.0075483075343072414, + "learning_rate": 0.0009762271862793143, + "loss": 0.144, + "num_input_tokens_seen": 49949696, + "step": 23135 + }, + { + "epoch": 3.774877650897227, + "grad_norm": 0.1539970487356186, + "learning_rate": 0.000976205494295888, + "loss": 0.1256, + "num_input_tokens_seen": 49959680, + "step": 23140 + }, + { + "epoch": 3.7756933115823816, + "grad_norm": 0.03422388434410095, + "learning_rate": 0.0009761837926615533, + "loss": 0.1488, + "num_input_tokens_seen": 49970592, + "step": 23145 + }, + { + "epoch": 3.7765089722675365, + "grad_norm": 0.18177658319473267, + "learning_rate": 0.00097616208137675, + "loss": 0.1856, + "num_input_tokens_seen": 49981440, + "step": 23150 + }, + { + "epoch": 3.7773246329526917, + "grad_norm": 0.0072610145434737206, + "learning_rate": 0.000976140360441918, + "loss": 0.1034, + "num_input_tokens_seen": 49993024, + "step": 23155 + }, + { + "epoch": 3.7781402936378465, + "grad_norm": 0.17495934665203094, + "learning_rate": 0.0009761186298574975, + "loss": 0.142, + "num_input_tokens_seen": 50004704, + "step": 23160 + }, + { + "epoch": 3.778955954323002, + "grad_norm": 0.0627005472779274, + "learning_rate": 0.0009760968896239291, + "loss": 0.0966, + "num_input_tokens_seen": 50015424, + "step": 23165 + }, + { + "epoch": 3.7797716150081566, + "grad_norm": 0.13304628431797028, + "learning_rate": 0.0009760751397416532, + "loss": 0.0694, + "num_input_tokens_seen": 50026368, + "step": 23170 + }, + { + "epoch": 3.7805872756933114, + "grad_norm": 0.11880537867546082, + "learning_rate": 0.0009760533802111107, + "loss": 0.1523, + "num_input_tokens_seen": 50036512, + "step": 23175 + }, + { + "epoch": 3.7814029363784667, + "grad_norm": 0.03182271867990494, + "learning_rate": 0.0009760316110327426, + "loss": 0.0702, + "num_input_tokens_seen": 50047776, + "step": 23180 + }, + { + "epoch": 3.7822185970636215, + "grad_norm": 0.23382218182086945, + "learning_rate": 0.00097600983220699, + "loss": 0.1679, + "num_input_tokens_seen": 50058048, + "step": 23185 + }, + { + "epoch": 3.7830342577487768, + "grad_norm": 0.08541269600391388, + "learning_rate": 0.0009759880437342941, + "loss": 0.1048, + "num_input_tokens_seen": 50068992, + "step": 23190 + }, + { + "epoch": 3.7838499184339316, + "grad_norm": 0.1421559751033783, + "learning_rate": 0.0009759662456150967, + "loss": 0.101, + "num_input_tokens_seen": 50079776, + "step": 23195 + }, + { + "epoch": 3.7846655791190864, + "grad_norm": 0.10415133088827133, + "learning_rate": 0.0009759444378498397, + "loss": 0.1034, + "num_input_tokens_seen": 50090592, + "step": 23200 + }, + { + "epoch": 3.7854812398042412, + "grad_norm": 0.05130248889327049, + "learning_rate": 0.0009759226204389646, + "loss": 0.1495, + "num_input_tokens_seen": 50102240, + "step": 23205 + }, + { + "epoch": 3.7862969004893965, + "grad_norm": 0.12014283984899521, + "learning_rate": 0.0009759007933829141, + "loss": 0.1028, + "num_input_tokens_seen": 50113472, + "step": 23210 + }, + { + "epoch": 3.7871125611745513, + "grad_norm": 0.03210382163524628, + "learning_rate": 0.0009758789566821302, + "loss": 0.0393, + "num_input_tokens_seen": 50124224, + "step": 23215 + }, + { + "epoch": 3.7879282218597066, + "grad_norm": 0.057430822402238846, + "learning_rate": 0.0009758571103370556, + "loss": 0.0582, + "num_input_tokens_seen": 50135264, + "step": 23220 + }, + { + "epoch": 3.7887438825448614, + "grad_norm": 0.1591319590806961, + "learning_rate": 0.000975835254348133, + "loss": 0.2025, + "num_input_tokens_seen": 50145824, + "step": 23225 + }, + { + "epoch": 3.789559543230016, + "grad_norm": 0.0040392447263002396, + "learning_rate": 0.0009758133887158053, + "loss": 0.1542, + "num_input_tokens_seen": 50156864, + "step": 23230 + }, + { + "epoch": 3.790375203915171, + "grad_norm": 0.08963494002819061, + "learning_rate": 0.0009757915134405155, + "loss": 0.0655, + "num_input_tokens_seen": 50167872, + "step": 23235 + }, + { + "epoch": 3.7911908646003263, + "grad_norm": 0.03895760700106621, + "learning_rate": 0.0009757696285227073, + "loss": 0.0858, + "num_input_tokens_seen": 50178880, + "step": 23240 + }, + { + "epoch": 3.792006525285481, + "grad_norm": 0.09825877100229263, + "learning_rate": 0.000975747733962824, + "loss": 0.2428, + "num_input_tokens_seen": 50189280, + "step": 23245 + }, + { + "epoch": 3.7928221859706364, + "grad_norm": 0.022679295390844345, + "learning_rate": 0.0009757258297613095, + "loss": 0.0443, + "num_input_tokens_seen": 50200736, + "step": 23250 + }, + { + "epoch": 3.793637846655791, + "grad_norm": 0.008132662624120712, + "learning_rate": 0.0009757039159186072, + "loss": 0.0439, + "num_input_tokens_seen": 50210208, + "step": 23255 + }, + { + "epoch": 3.794453507340946, + "grad_norm": 0.046514492481946945, + "learning_rate": 0.0009756819924351618, + "loss": 0.0434, + "num_input_tokens_seen": 50220960, + "step": 23260 + }, + { + "epoch": 3.7952691680261013, + "grad_norm": 0.029721027240157127, + "learning_rate": 0.0009756600593114174, + "loss": 0.0695, + "num_input_tokens_seen": 50232384, + "step": 23265 + }, + { + "epoch": 3.796084828711256, + "grad_norm": 0.14286768436431885, + "learning_rate": 0.0009756381165478183, + "loss": 0.0602, + "num_input_tokens_seen": 50243328, + "step": 23270 + }, + { + "epoch": 3.7969004893964113, + "grad_norm": 0.011310291476547718, + "learning_rate": 0.0009756161641448095, + "loss": 0.1331, + "num_input_tokens_seen": 50254080, + "step": 23275 + }, + { + "epoch": 3.797716150081566, + "grad_norm": 0.00703558512032032, + "learning_rate": 0.0009755942021028356, + "loss": 0.0466, + "num_input_tokens_seen": 50264384, + "step": 23280 + }, + { + "epoch": 3.798531810766721, + "grad_norm": 0.06367386877536774, + "learning_rate": 0.0009755722304223422, + "loss": 0.0505, + "num_input_tokens_seen": 50275232, + "step": 23285 + }, + { + "epoch": 3.799347471451876, + "grad_norm": 0.1289215087890625, + "learning_rate": 0.000975550249103774, + "loss": 0.2017, + "num_input_tokens_seen": 50286048, + "step": 23290 + }, + { + "epoch": 3.800163132137031, + "grad_norm": 0.019232898950576782, + "learning_rate": 0.0009755282581475768, + "loss": 0.0491, + "num_input_tokens_seen": 50296192, + "step": 23295 + }, + { + "epoch": 3.800978792822186, + "grad_norm": 0.1858430951833725, + "learning_rate": 0.0009755062575541962, + "loss": 0.1061, + "num_input_tokens_seen": 50305888, + "step": 23300 + }, + { + "epoch": 3.801794453507341, + "grad_norm": 0.042528945952653885, + "learning_rate": 0.000975484247324078, + "loss": 0.1086, + "num_input_tokens_seen": 50316640, + "step": 23305 + }, + { + "epoch": 3.802610114192496, + "grad_norm": 0.059037502855062485, + "learning_rate": 0.0009754622274576684, + "loss": 0.1424, + "num_input_tokens_seen": 50328832, + "step": 23310 + }, + { + "epoch": 3.8034257748776508, + "grad_norm": 0.04079289734363556, + "learning_rate": 0.0009754401979554136, + "loss": 0.1004, + "num_input_tokens_seen": 50339136, + "step": 23315 + }, + { + "epoch": 3.804241435562806, + "grad_norm": 0.0049128723330795765, + "learning_rate": 0.00097541815881776, + "loss": 0.1517, + "num_input_tokens_seen": 50350336, + "step": 23320 + }, + { + "epoch": 3.805057096247961, + "grad_norm": 0.16003960371017456, + "learning_rate": 0.0009753961100451544, + "loss": 0.1678, + "num_input_tokens_seen": 50361376, + "step": 23325 + }, + { + "epoch": 3.8058727569331157, + "grad_norm": 0.0016949091805145144, + "learning_rate": 0.0009753740516380433, + "loss": 0.091, + "num_input_tokens_seen": 50372064, + "step": 23330 + }, + { + "epoch": 3.806688417618271, + "grad_norm": 0.16116364300251007, + "learning_rate": 0.0009753519835968743, + "loss": 0.0894, + "num_input_tokens_seen": 50382304, + "step": 23335 + }, + { + "epoch": 3.8075040783034257, + "grad_norm": 0.04034513607621193, + "learning_rate": 0.0009753299059220941, + "loss": 0.1442, + "num_input_tokens_seen": 50392416, + "step": 23340 + }, + { + "epoch": 3.8083197389885806, + "grad_norm": 0.10937430709600449, + "learning_rate": 0.0009753078186141506, + "loss": 0.1816, + "num_input_tokens_seen": 50402816, + "step": 23345 + }, + { + "epoch": 3.809135399673736, + "grad_norm": 0.07109804451465607, + "learning_rate": 0.0009752857216734909, + "loss": 0.1073, + "num_input_tokens_seen": 50412224, + "step": 23350 + }, + { + "epoch": 3.8099510603588906, + "grad_norm": 0.11724358052015305, + "learning_rate": 0.0009752636151005633, + "loss": 0.2468, + "num_input_tokens_seen": 50423680, + "step": 23355 + }, + { + "epoch": 3.810766721044046, + "grad_norm": 0.04426531866192818, + "learning_rate": 0.0009752414988958156, + "loss": 0.0326, + "num_input_tokens_seen": 50435072, + "step": 23360 + }, + { + "epoch": 3.8115823817292007, + "grad_norm": 0.10522560030221939, + "learning_rate": 0.000975219373059696, + "loss": 0.224, + "num_input_tokens_seen": 50446336, + "step": 23365 + }, + { + "epoch": 3.8123980424143555, + "grad_norm": 0.09824762493371964, + "learning_rate": 0.000975197237592653, + "loss": 0.1173, + "num_input_tokens_seen": 50456928, + "step": 23370 + }, + { + "epoch": 3.8132137030995104, + "grad_norm": 0.01370153110474348, + "learning_rate": 0.000975175092495135, + "loss": 0.112, + "num_input_tokens_seen": 50467488, + "step": 23375 + }, + { + "epoch": 3.8140293637846656, + "grad_norm": 0.09012758731842041, + "learning_rate": 0.0009751529377675911, + "loss": 0.0699, + "num_input_tokens_seen": 50479168, + "step": 23380 + }, + { + "epoch": 3.8148450244698204, + "grad_norm": 0.014611635357141495, + "learning_rate": 0.00097513077341047, + "loss": 0.0527, + "num_input_tokens_seen": 50490048, + "step": 23385 + }, + { + "epoch": 3.8156606851549757, + "grad_norm": 0.14765992760658264, + "learning_rate": 0.0009751085994242212, + "loss": 0.1724, + "num_input_tokens_seen": 50500192, + "step": 23390 + }, + { + "epoch": 3.8164763458401305, + "grad_norm": 0.010333874262869358, + "learning_rate": 0.0009750864158092938, + "loss": 0.0675, + "num_input_tokens_seen": 50510944, + "step": 23395 + }, + { + "epoch": 3.8172920065252853, + "grad_norm": 0.012766147963702679, + "learning_rate": 0.0009750642225661375, + "loss": 0.065, + "num_input_tokens_seen": 50521344, + "step": 23400 + }, + { + "epoch": 3.8181076672104406, + "grad_norm": 0.012711434625089169, + "learning_rate": 0.0009750420196952021, + "loss": 0.0495, + "num_input_tokens_seen": 50532064, + "step": 23405 + }, + { + "epoch": 3.8189233278955954, + "grad_norm": 0.04289145767688751, + "learning_rate": 0.0009750198071969376, + "loss": 0.0792, + "num_input_tokens_seen": 50542816, + "step": 23410 + }, + { + "epoch": 3.8197389885807507, + "grad_norm": 0.003050195286050439, + "learning_rate": 0.0009749975850717941, + "loss": 0.0234, + "num_input_tokens_seen": 50553088, + "step": 23415 + }, + { + "epoch": 3.8205546492659055, + "grad_norm": 0.016573579981923103, + "learning_rate": 0.0009749753533202218, + "loss": 0.0273, + "num_input_tokens_seen": 50561792, + "step": 23420 + }, + { + "epoch": 3.8213703099510603, + "grad_norm": 0.09101023524999619, + "learning_rate": 0.0009749531119426716, + "loss": 0.0781, + "num_input_tokens_seen": 50573280, + "step": 23425 + }, + { + "epoch": 3.822185970636215, + "grad_norm": 0.0028289298061281443, + "learning_rate": 0.000974930860939594, + "loss": 0.016, + "num_input_tokens_seen": 50584384, + "step": 23430 + }, + { + "epoch": 3.8230016313213704, + "grad_norm": 0.0058859046548604965, + "learning_rate": 0.0009749086003114399, + "loss": 0.0685, + "num_input_tokens_seen": 50595904, + "step": 23435 + }, + { + "epoch": 3.823817292006525, + "grad_norm": 0.15686874091625214, + "learning_rate": 0.0009748863300586605, + "loss": 0.0357, + "num_input_tokens_seen": 50608224, + "step": 23440 + }, + { + "epoch": 3.8246329526916805, + "grad_norm": 0.2096952348947525, + "learning_rate": 0.0009748640501817074, + "loss": 0.3055, + "num_input_tokens_seen": 50618912, + "step": 23445 + }, + { + "epoch": 3.8254486133768353, + "grad_norm": 0.008424417115747929, + "learning_rate": 0.0009748417606810319, + "loss": 0.0553, + "num_input_tokens_seen": 50629728, + "step": 23450 + }, + { + "epoch": 3.82626427406199, + "grad_norm": 0.03288991376757622, + "learning_rate": 0.0009748194615570857, + "loss": 0.1007, + "num_input_tokens_seen": 50640096, + "step": 23455 + }, + { + "epoch": 3.827079934747145, + "grad_norm": 0.010939883068203926, + "learning_rate": 0.0009747971528103207, + "loss": 0.1303, + "num_input_tokens_seen": 50650176, + "step": 23460 + }, + { + "epoch": 3.8278955954323, + "grad_norm": 0.009515521116554737, + "learning_rate": 0.0009747748344411891, + "loss": 0.1435, + "num_input_tokens_seen": 50661504, + "step": 23465 + }, + { + "epoch": 3.828711256117455, + "grad_norm": 0.15671837329864502, + "learning_rate": 0.0009747525064501433, + "loss": 0.0931, + "num_input_tokens_seen": 50672384, + "step": 23470 + }, + { + "epoch": 3.8295269168026103, + "grad_norm": 0.00947185792028904, + "learning_rate": 0.0009747301688376355, + "loss": 0.1669, + "num_input_tokens_seen": 50683168, + "step": 23475 + }, + { + "epoch": 3.830342577487765, + "grad_norm": 0.24224896728992462, + "learning_rate": 0.0009747078216041187, + "loss": 0.1524, + "num_input_tokens_seen": 50692512, + "step": 23480 + }, + { + "epoch": 3.83115823817292, + "grad_norm": 0.03424467518925667, + "learning_rate": 0.0009746854647500457, + "loss": 0.0656, + "num_input_tokens_seen": 50703168, + "step": 23485 + }, + { + "epoch": 3.831973898858075, + "grad_norm": 0.034658078104257584, + "learning_rate": 0.0009746630982758695, + "loss": 0.1467, + "num_input_tokens_seen": 50714112, + "step": 23490 + }, + { + "epoch": 3.83278955954323, + "grad_norm": 0.1492968052625656, + "learning_rate": 0.0009746407221820435, + "loss": 0.1304, + "num_input_tokens_seen": 50724576, + "step": 23495 + }, + { + "epoch": 3.8336052202283852, + "grad_norm": 0.04148952662944794, + "learning_rate": 0.0009746183364690212, + "loss": 0.1066, + "num_input_tokens_seen": 50735392, + "step": 23500 + }, + { + "epoch": 3.83442088091354, + "grad_norm": 0.08744814991950989, + "learning_rate": 0.0009745959411372561, + "loss": 0.0911, + "num_input_tokens_seen": 50747488, + "step": 23505 + }, + { + "epoch": 3.835236541598695, + "grad_norm": 0.010208824649453163, + "learning_rate": 0.0009745735361872023, + "loss": 0.0728, + "num_input_tokens_seen": 50757120, + "step": 23510 + }, + { + "epoch": 3.8360522022838497, + "grad_norm": 0.032262254506349564, + "learning_rate": 0.0009745511216193137, + "loss": 0.0939, + "num_input_tokens_seen": 50767136, + "step": 23515 + }, + { + "epoch": 3.836867862969005, + "grad_norm": 0.029177214950323105, + "learning_rate": 0.0009745286974340445, + "loss": 0.0281, + "num_input_tokens_seen": 50777728, + "step": 23520 + }, + { + "epoch": 3.8376835236541598, + "grad_norm": 0.11168672144412994, + "learning_rate": 0.0009745062636318495, + "loss": 0.1423, + "num_input_tokens_seen": 50787488, + "step": 23525 + }, + { + "epoch": 3.838499184339315, + "grad_norm": 0.10580579191446304, + "learning_rate": 0.0009744838202131829, + "loss": 0.0479, + "num_input_tokens_seen": 50797760, + "step": 23530 + }, + { + "epoch": 3.83931484502447, + "grad_norm": 0.03305608779191971, + "learning_rate": 0.0009744613671784999, + "loss": 0.0315, + "num_input_tokens_seen": 50808832, + "step": 23535 + }, + { + "epoch": 3.8401305057096247, + "grad_norm": 0.009244843386113644, + "learning_rate": 0.0009744389045282554, + "loss": 0.0672, + "num_input_tokens_seen": 50819296, + "step": 23540 + }, + { + "epoch": 3.84094616639478, + "grad_norm": 0.07480739057064056, + "learning_rate": 0.0009744164322629046, + "loss": 0.072, + "num_input_tokens_seen": 50831424, + "step": 23545 + }, + { + "epoch": 3.8417618270799347, + "grad_norm": 0.011413346976041794, + "learning_rate": 0.0009743939503829027, + "loss": 0.1317, + "num_input_tokens_seen": 50841152, + "step": 23550 + }, + { + "epoch": 3.8425774877650896, + "grad_norm": 0.22705356776714325, + "learning_rate": 0.0009743714588887059, + "loss": 0.3121, + "num_input_tokens_seen": 50850752, + "step": 23555 + }, + { + "epoch": 3.843393148450245, + "grad_norm": 0.11036356538534164, + "learning_rate": 0.0009743489577807696, + "loss": 0.0931, + "num_input_tokens_seen": 50860896, + "step": 23560 + }, + { + "epoch": 3.8442088091353996, + "grad_norm": 0.14470776915550232, + "learning_rate": 0.0009743264470595499, + "loss": 0.1059, + "num_input_tokens_seen": 50871072, + "step": 23565 + }, + { + "epoch": 3.8450244698205545, + "grad_norm": 0.1082136258482933, + "learning_rate": 0.0009743039267255031, + "loss": 0.1008, + "num_input_tokens_seen": 50880960, + "step": 23570 + }, + { + "epoch": 3.8458401305057097, + "grad_norm": 0.06363649666309357, + "learning_rate": 0.0009742813967790855, + "loss": 0.052, + "num_input_tokens_seen": 50890080, + "step": 23575 + }, + { + "epoch": 3.8466557911908645, + "grad_norm": 0.030684353783726692, + "learning_rate": 0.0009742588572207538, + "loss": 0.022, + "num_input_tokens_seen": 50901216, + "step": 23580 + }, + { + "epoch": 3.84747145187602, + "grad_norm": 0.2840745747089386, + "learning_rate": 0.0009742363080509647, + "loss": 0.1625, + "num_input_tokens_seen": 50912416, + "step": 23585 + }, + { + "epoch": 3.8482871125611746, + "grad_norm": 0.01734989695250988, + "learning_rate": 0.000974213749270175, + "loss": 0.0307, + "num_input_tokens_seen": 50922752, + "step": 23590 + }, + { + "epoch": 3.8491027732463294, + "grad_norm": 0.016052482649683952, + "learning_rate": 0.0009741911808788422, + "loss": 0.0187, + "num_input_tokens_seen": 50935168, + "step": 23595 + }, + { + "epoch": 3.8499184339314843, + "grad_norm": 0.18628224730491638, + "learning_rate": 0.0009741686028774236, + "loss": 0.1692, + "num_input_tokens_seen": 50946304, + "step": 23600 + }, + { + "epoch": 3.8507340946166395, + "grad_norm": 0.06740216910839081, + "learning_rate": 0.0009741460152663768, + "loss": 0.0618, + "num_input_tokens_seen": 50955872, + "step": 23605 + }, + { + "epoch": 3.8515497553017943, + "grad_norm": 0.03751668706536293, + "learning_rate": 0.0009741234180461593, + "loss": 0.0802, + "num_input_tokens_seen": 50967488, + "step": 23610 + }, + { + "epoch": 3.8523654159869496, + "grad_norm": 0.04289042204618454, + "learning_rate": 0.0009741008112172293, + "loss": 0.0979, + "num_input_tokens_seen": 50977664, + "step": 23615 + }, + { + "epoch": 3.8531810766721044, + "grad_norm": 0.02130843698978424, + "learning_rate": 0.0009740781947800452, + "loss": 0.0414, + "num_input_tokens_seen": 50989920, + "step": 23620 + }, + { + "epoch": 3.8539967373572592, + "grad_norm": 0.19292832911014557, + "learning_rate": 0.0009740555687350648, + "loss": 0.1342, + "num_input_tokens_seen": 50999136, + "step": 23625 + }, + { + "epoch": 3.8548123980424145, + "grad_norm": 0.002067751483991742, + "learning_rate": 0.0009740329330827471, + "loss": 0.2148, + "num_input_tokens_seen": 51010336, + "step": 23630 + }, + { + "epoch": 3.8556280587275693, + "grad_norm": 0.2244749218225479, + "learning_rate": 0.0009740102878235505, + "loss": 0.1202, + "num_input_tokens_seen": 51021792, + "step": 23635 + }, + { + "epoch": 3.8564437194127246, + "grad_norm": 0.007005223073065281, + "learning_rate": 0.0009739876329579343, + "loss": 0.1492, + "num_input_tokens_seen": 51032416, + "step": 23640 + }, + { + "epoch": 3.8572593800978794, + "grad_norm": 0.12940333783626556, + "learning_rate": 0.0009739649684863572, + "loss": 0.1442, + "num_input_tokens_seen": 51042304, + "step": 23645 + }, + { + "epoch": 3.858075040783034, + "grad_norm": 0.03883710131049156, + "learning_rate": 0.0009739422944092789, + "loss": 0.0672, + "num_input_tokens_seen": 51053664, + "step": 23650 + }, + { + "epoch": 3.858890701468189, + "grad_norm": 0.012478201650083065, + "learning_rate": 0.0009739196107271586, + "loss": 0.0323, + "num_input_tokens_seen": 51064608, + "step": 23655 + }, + { + "epoch": 3.8597063621533443, + "grad_norm": 0.009476371109485626, + "learning_rate": 0.0009738969174404562, + "loss": 0.1317, + "num_input_tokens_seen": 51074208, + "step": 23660 + }, + { + "epoch": 3.860522022838499, + "grad_norm": 0.16200388967990875, + "learning_rate": 0.0009738742145496318, + "loss": 0.1164, + "num_input_tokens_seen": 51085600, + "step": 23665 + }, + { + "epoch": 3.8613376835236544, + "grad_norm": 0.008992874994874, + "learning_rate": 0.000973851502055145, + "loss": 0.025, + "num_input_tokens_seen": 51097344, + "step": 23670 + }, + { + "epoch": 3.862153344208809, + "grad_norm": 0.044610220938920975, + "learning_rate": 0.0009738287799574565, + "loss": 0.0231, + "num_input_tokens_seen": 51109184, + "step": 23675 + }, + { + "epoch": 3.862969004893964, + "grad_norm": 0.02259625867009163, + "learning_rate": 0.0009738060482570268, + "loss": 0.1607, + "num_input_tokens_seen": 51120800, + "step": 23680 + }, + { + "epoch": 3.863784665579119, + "grad_norm": 0.030473671853542328, + "learning_rate": 0.0009737833069543163, + "loss": 0.0531, + "num_input_tokens_seen": 51131008, + "step": 23685 + }, + { + "epoch": 3.864600326264274, + "grad_norm": 0.02612938918173313, + "learning_rate": 0.0009737605560497862, + "loss": 0.1212, + "num_input_tokens_seen": 51141600, + "step": 23690 + }, + { + "epoch": 3.865415986949429, + "grad_norm": 0.10965737700462341, + "learning_rate": 0.0009737377955438973, + "loss": 0.0852, + "num_input_tokens_seen": 51152128, + "step": 23695 + }, + { + "epoch": 3.866231647634584, + "grad_norm": 0.0074377721175551414, + "learning_rate": 0.000973715025437111, + "loss": 0.0445, + "num_input_tokens_seen": 51163840, + "step": 23700 + }, + { + "epoch": 3.867047308319739, + "grad_norm": 0.025100016966462135, + "learning_rate": 0.0009736922457298889, + "loss": 0.0705, + "num_input_tokens_seen": 51173984, + "step": 23705 + }, + { + "epoch": 3.867862969004894, + "grad_norm": 0.14168326556682587, + "learning_rate": 0.0009736694564226924, + "loss": 0.0964, + "num_input_tokens_seen": 51185120, + "step": 23710 + }, + { + "epoch": 3.868678629690049, + "grad_norm": 0.00493616284802556, + "learning_rate": 0.0009736466575159835, + "loss": 0.0781, + "num_input_tokens_seen": 51195968, + "step": 23715 + }, + { + "epoch": 3.869494290375204, + "grad_norm": 0.058191146701574326, + "learning_rate": 0.0009736238490102243, + "loss": 0.0693, + "num_input_tokens_seen": 51206368, + "step": 23720 + }, + { + "epoch": 3.870309951060359, + "grad_norm": 0.04914458841085434, + "learning_rate": 0.0009736010309058769, + "loss": 0.1477, + "num_input_tokens_seen": 51216928, + "step": 23725 + }, + { + "epoch": 3.871125611745514, + "grad_norm": 0.15564818680286407, + "learning_rate": 0.0009735782032034038, + "loss": 0.3329, + "num_input_tokens_seen": 51227936, + "step": 23730 + }, + { + "epoch": 3.8719412724306688, + "grad_norm": 0.04717203602194786, + "learning_rate": 0.0009735553659032674, + "loss": 0.1707, + "num_input_tokens_seen": 51238880, + "step": 23735 + }, + { + "epoch": 3.8727569331158236, + "grad_norm": 0.017344390973448753, + "learning_rate": 0.000973532519005931, + "loss": 0.0508, + "num_input_tokens_seen": 51250560, + "step": 23740 + }, + { + "epoch": 3.873572593800979, + "grad_norm": 0.031641170382499695, + "learning_rate": 0.0009735096625118574, + "loss": 0.0153, + "num_input_tokens_seen": 51261440, + "step": 23745 + }, + { + "epoch": 3.8743882544861337, + "grad_norm": 0.013699406757950783, + "learning_rate": 0.0009734867964215099, + "loss": 0.1292, + "num_input_tokens_seen": 51272640, + "step": 23750 + }, + { + "epoch": 3.875203915171289, + "grad_norm": 0.12096096575260162, + "learning_rate": 0.0009734639207353516, + "loss": 0.275, + "num_input_tokens_seen": 51284064, + "step": 23755 + }, + { + "epoch": 3.8760195758564437, + "grad_norm": 0.09324183315038681, + "learning_rate": 0.0009734410354538464, + "loss": 0.0402, + "num_input_tokens_seen": 51294368, + "step": 23760 + }, + { + "epoch": 3.8768352365415986, + "grad_norm": 0.028728319332003593, + "learning_rate": 0.0009734181405774581, + "loss": 0.0387, + "num_input_tokens_seen": 51304704, + "step": 23765 + }, + { + "epoch": 3.877650897226754, + "grad_norm": 0.12482116371393204, + "learning_rate": 0.0009733952361066505, + "loss": 0.0979, + "num_input_tokens_seen": 51316128, + "step": 23770 + }, + { + "epoch": 3.8784665579119086, + "grad_norm": 0.004078851547092199, + "learning_rate": 0.0009733723220418877, + "loss": 0.0587, + "num_input_tokens_seen": 51327712, + "step": 23775 + }, + { + "epoch": 3.8792822185970635, + "grad_norm": 0.21632879972457886, + "learning_rate": 0.0009733493983836345, + "loss": 0.1404, + "num_input_tokens_seen": 51338624, + "step": 23780 + }, + { + "epoch": 3.8800978792822187, + "grad_norm": 0.012721231207251549, + "learning_rate": 0.0009733264651323553, + "loss": 0.0197, + "num_input_tokens_seen": 51348928, + "step": 23785 + }, + { + "epoch": 3.8809135399673735, + "grad_norm": 0.06910673528909683, + "learning_rate": 0.0009733035222885149, + "loss": 0.0563, + "num_input_tokens_seen": 51359744, + "step": 23790 + }, + { + "epoch": 3.8817292006525284, + "grad_norm": 0.006232867948710918, + "learning_rate": 0.000973280569852578, + "loss": 0.0798, + "num_input_tokens_seen": 51371616, + "step": 23795 + }, + { + "epoch": 3.8825448613376836, + "grad_norm": 0.09287640452384949, + "learning_rate": 0.00097325760782501, + "loss": 0.1961, + "num_input_tokens_seen": 51382752, + "step": 23800 + }, + { + "epoch": 3.8833605220228384, + "grad_norm": 0.16094809770584106, + "learning_rate": 0.0009732346362062763, + "loss": 0.2954, + "num_input_tokens_seen": 51393440, + "step": 23805 + }, + { + "epoch": 3.8841761827079937, + "grad_norm": 0.01938532665371895, + "learning_rate": 0.0009732116549968421, + "loss": 0.0181, + "num_input_tokens_seen": 51401792, + "step": 23810 + }, + { + "epoch": 3.8849918433931485, + "grad_norm": 0.03875003382563591, + "learning_rate": 0.0009731886641971737, + "loss": 0.2991, + "num_input_tokens_seen": 51413216, + "step": 23815 + }, + { + "epoch": 3.8858075040783033, + "grad_norm": 0.013635357841849327, + "learning_rate": 0.0009731656638077367, + "loss": 0.1327, + "num_input_tokens_seen": 51423840, + "step": 23820 + }, + { + "epoch": 3.886623164763458, + "grad_norm": 0.010250317864120007, + "learning_rate": 0.0009731426538289971, + "loss": 0.0447, + "num_input_tokens_seen": 51434464, + "step": 23825 + }, + { + "epoch": 3.8874388254486134, + "grad_norm": 0.06521691381931305, + "learning_rate": 0.0009731196342614214, + "loss": 0.1411, + "num_input_tokens_seen": 51446112, + "step": 23830 + }, + { + "epoch": 3.8882544861337682, + "grad_norm": 0.12613479793071747, + "learning_rate": 0.0009730966051054763, + "loss": 0.2598, + "num_input_tokens_seen": 51456704, + "step": 23835 + }, + { + "epoch": 3.8890701468189235, + "grad_norm": 0.1435096263885498, + "learning_rate": 0.0009730735663616281, + "loss": 0.1497, + "num_input_tokens_seen": 51467424, + "step": 23840 + }, + { + "epoch": 3.8898858075040783, + "grad_norm": 0.07048121839761734, + "learning_rate": 0.0009730505180303441, + "loss": 0.0558, + "num_input_tokens_seen": 51477792, + "step": 23845 + }, + { + "epoch": 3.890701468189233, + "grad_norm": 0.0447540283203125, + "learning_rate": 0.0009730274601120913, + "loss": 0.1876, + "num_input_tokens_seen": 51486368, + "step": 23850 + }, + { + "epoch": 3.8915171288743884, + "grad_norm": 0.16115958988666534, + "learning_rate": 0.0009730043926073369, + "loss": 0.138, + "num_input_tokens_seen": 51496224, + "step": 23855 + }, + { + "epoch": 3.892332789559543, + "grad_norm": 0.007951674051582813, + "learning_rate": 0.0009729813155165484, + "loss": 0.1012, + "num_input_tokens_seen": 51507872, + "step": 23860 + }, + { + "epoch": 3.8931484502446985, + "grad_norm": 0.0698246881365776, + "learning_rate": 0.0009729582288401934, + "loss": 0.0505, + "num_input_tokens_seen": 51518112, + "step": 23865 + }, + { + "epoch": 3.8939641109298533, + "grad_norm": 0.1502692848443985, + "learning_rate": 0.0009729351325787402, + "loss": 0.0948, + "num_input_tokens_seen": 51529120, + "step": 23870 + }, + { + "epoch": 3.894779771615008, + "grad_norm": 0.04103147238492966, + "learning_rate": 0.0009729120267326564, + "loss": 0.1026, + "num_input_tokens_seen": 51540768, + "step": 23875 + }, + { + "epoch": 3.895595432300163, + "grad_norm": 0.0857701301574707, + "learning_rate": 0.0009728889113024103, + "loss": 0.1975, + "num_input_tokens_seen": 51550304, + "step": 23880 + }, + { + "epoch": 3.896411092985318, + "grad_norm": 0.06967068463563919, + "learning_rate": 0.0009728657862884707, + "loss": 0.1198, + "num_input_tokens_seen": 51561088, + "step": 23885 + }, + { + "epoch": 3.897226753670473, + "grad_norm": 0.06206932291388512, + "learning_rate": 0.0009728426516913061, + "loss": 0.1616, + "num_input_tokens_seen": 51571264, + "step": 23890 + }, + { + "epoch": 3.8980424143556283, + "grad_norm": 0.010891067795455456, + "learning_rate": 0.0009728195075113851, + "loss": 0.0668, + "num_input_tokens_seen": 51582912, + "step": 23895 + }, + { + "epoch": 3.898858075040783, + "grad_norm": 0.06700272113084793, + "learning_rate": 0.000972796353749177, + "loss": 0.0407, + "num_input_tokens_seen": 51594144, + "step": 23900 + }, + { + "epoch": 3.899673735725938, + "grad_norm": 0.015332629904150963, + "learning_rate": 0.0009727731904051513, + "loss": 0.0777, + "num_input_tokens_seen": 51604928, + "step": 23905 + }, + { + "epoch": 3.9004893964110927, + "grad_norm": 0.22788116335868835, + "learning_rate": 0.0009727500174797769, + "loss": 0.217, + "num_input_tokens_seen": 51614176, + "step": 23910 + }, + { + "epoch": 3.901305057096248, + "grad_norm": 0.06023408845067024, + "learning_rate": 0.0009727268349735237, + "loss": 0.0422, + "num_input_tokens_seen": 51624992, + "step": 23915 + }, + { + "epoch": 3.902120717781403, + "grad_norm": 0.18296866118907928, + "learning_rate": 0.0009727036428868616, + "loss": 0.0706, + "num_input_tokens_seen": 51635744, + "step": 23920 + }, + { + "epoch": 3.902936378466558, + "grad_norm": 0.017822273075580597, + "learning_rate": 0.0009726804412202604, + "loss": 0.0573, + "num_input_tokens_seen": 51645472, + "step": 23925 + }, + { + "epoch": 3.903752039151713, + "grad_norm": 0.05549195781350136, + "learning_rate": 0.0009726572299741904, + "loss": 0.0965, + "num_input_tokens_seen": 51656128, + "step": 23930 + }, + { + "epoch": 3.9045676998368677, + "grad_norm": 0.07593277841806412, + "learning_rate": 0.0009726340091491221, + "loss": 0.2222, + "num_input_tokens_seen": 51666368, + "step": 23935 + }, + { + "epoch": 3.905383360522023, + "grad_norm": 0.14018172025680542, + "learning_rate": 0.000972610778745526, + "loss": 0.0764, + "num_input_tokens_seen": 51677792, + "step": 23940 + }, + { + "epoch": 3.9061990212071778, + "grad_norm": 0.14810742437839508, + "learning_rate": 0.0009725875387638729, + "loss": 0.1934, + "num_input_tokens_seen": 51689280, + "step": 23945 + }, + { + "epoch": 3.907014681892333, + "grad_norm": 0.04261759668588638, + "learning_rate": 0.0009725642892046339, + "loss": 0.0863, + "num_input_tokens_seen": 51699712, + "step": 23950 + }, + { + "epoch": 3.907830342577488, + "grad_norm": 0.12599970400333405, + "learning_rate": 0.00097254103006828, + "loss": 0.185, + "num_input_tokens_seen": 51710624, + "step": 23955 + }, + { + "epoch": 3.9086460032626427, + "grad_norm": 0.20600534975528717, + "learning_rate": 0.0009725177613552827, + "loss": 0.1112, + "num_input_tokens_seen": 51721952, + "step": 23960 + }, + { + "epoch": 3.9094616639477975, + "grad_norm": 0.015485746785998344, + "learning_rate": 0.0009724944830661135, + "loss": 0.1071, + "num_input_tokens_seen": 51731584, + "step": 23965 + }, + { + "epoch": 3.9102773246329527, + "grad_norm": 0.01716160960495472, + "learning_rate": 0.0009724711952012442, + "loss": 0.1782, + "num_input_tokens_seen": 51743232, + "step": 23970 + }, + { + "epoch": 3.9110929853181076, + "grad_norm": 0.09199640899896622, + "learning_rate": 0.0009724478977611469, + "loss": 0.1372, + "num_input_tokens_seen": 51755712, + "step": 23975 + }, + { + "epoch": 3.911908646003263, + "grad_norm": 0.04298853874206543, + "learning_rate": 0.0009724245907462934, + "loss": 0.0342, + "num_input_tokens_seen": 51767776, + "step": 23980 + }, + { + "epoch": 3.9127243066884176, + "grad_norm": 0.18572263419628143, + "learning_rate": 0.0009724012741571563, + "loss": 0.1375, + "num_input_tokens_seen": 51778176, + "step": 23985 + }, + { + "epoch": 3.9135399673735725, + "grad_norm": 0.02563667483627796, + "learning_rate": 0.000972377947994208, + "loss": 0.0943, + "num_input_tokens_seen": 51789024, + "step": 23990 + }, + { + "epoch": 3.9143556280587277, + "grad_norm": 0.10421527177095413, + "learning_rate": 0.0009723546122579217, + "loss": 0.112, + "num_input_tokens_seen": 51798752, + "step": 23995 + }, + { + "epoch": 3.9151712887438825, + "grad_norm": 0.026869123801589012, + "learning_rate": 0.0009723312669487696, + "loss": 0.1003, + "num_input_tokens_seen": 51809856, + "step": 24000 + }, + { + "epoch": 3.9159869494290374, + "grad_norm": 0.06349535286426544, + "learning_rate": 0.0009723079120672254, + "loss": 0.0334, + "num_input_tokens_seen": 51819584, + "step": 24005 + }, + { + "epoch": 3.9168026101141926, + "grad_norm": 0.22903603315353394, + "learning_rate": 0.0009722845476137621, + "loss": 0.0753, + "num_input_tokens_seen": 51831168, + "step": 24010 + }, + { + "epoch": 3.9176182707993474, + "grad_norm": 0.027885988354682922, + "learning_rate": 0.0009722611735888532, + "loss": 0.117, + "num_input_tokens_seen": 51839776, + "step": 24015 + }, + { + "epoch": 3.9184339314845023, + "grad_norm": 0.009798881597816944, + "learning_rate": 0.0009722377899929727, + "loss": 0.1109, + "num_input_tokens_seen": 51850368, + "step": 24020 + }, + { + "epoch": 3.9192495921696575, + "grad_norm": 0.16364926099777222, + "learning_rate": 0.0009722143968265942, + "loss": 0.1496, + "num_input_tokens_seen": 51861248, + "step": 24025 + }, + { + "epoch": 3.9200652528548123, + "grad_norm": 0.19024907052516937, + "learning_rate": 0.0009721909940901918, + "loss": 0.2666, + "num_input_tokens_seen": 51872928, + "step": 24030 + }, + { + "epoch": 3.9208809135399676, + "grad_norm": 0.19984225928783417, + "learning_rate": 0.0009721675817842402, + "loss": 0.2255, + "num_input_tokens_seen": 51883680, + "step": 24035 + }, + { + "epoch": 3.9216965742251224, + "grad_norm": 0.1755438596010208, + "learning_rate": 0.0009721441599092133, + "loss": 0.2838, + "num_input_tokens_seen": 51895808, + "step": 24040 + }, + { + "epoch": 3.9225122349102772, + "grad_norm": 0.029797155410051346, + "learning_rate": 0.0009721207284655862, + "loss": 0.0627, + "num_input_tokens_seen": 51905760, + "step": 24045 + }, + { + "epoch": 3.923327895595432, + "grad_norm": 0.1139109656214714, + "learning_rate": 0.0009720972874538334, + "loss": 0.1818, + "num_input_tokens_seen": 51916288, + "step": 24050 + }, + { + "epoch": 3.9241435562805873, + "grad_norm": 0.03491639718413353, + "learning_rate": 0.0009720738368744304, + "loss": 0.0471, + "num_input_tokens_seen": 51927040, + "step": 24055 + }, + { + "epoch": 3.924959216965742, + "grad_norm": 0.015610925853252411, + "learning_rate": 0.0009720503767278522, + "loss": 0.0564, + "num_input_tokens_seen": 51937760, + "step": 24060 + }, + { + "epoch": 3.9257748776508974, + "grad_norm": 0.06630711257457733, + "learning_rate": 0.0009720269070145742, + "loss": 0.0514, + "num_input_tokens_seen": 51948384, + "step": 24065 + }, + { + "epoch": 3.926590538336052, + "grad_norm": 0.07051598280668259, + "learning_rate": 0.000972003427735072, + "loss": 0.1267, + "num_input_tokens_seen": 51958496, + "step": 24070 + }, + { + "epoch": 3.927406199021207, + "grad_norm": 0.004908180329948664, + "learning_rate": 0.0009719799388898219, + "loss": 0.0428, + "num_input_tokens_seen": 51969216, + "step": 24075 + }, + { + "epoch": 3.9282218597063623, + "grad_norm": 0.014811759814620018, + "learning_rate": 0.0009719564404792993, + "loss": 0.0812, + "num_input_tokens_seen": 51980768, + "step": 24080 + }, + { + "epoch": 3.929037520391517, + "grad_norm": 0.03435433655977249, + "learning_rate": 0.0009719329325039807, + "loss": 0.0536, + "num_input_tokens_seen": 51990976, + "step": 24085 + }, + { + "epoch": 3.9298531810766724, + "grad_norm": 0.12945859134197235, + "learning_rate": 0.0009719094149643426, + "loss": 0.0693, + "num_input_tokens_seen": 52002240, + "step": 24090 + }, + { + "epoch": 3.930668841761827, + "grad_norm": 0.0459202341735363, + "learning_rate": 0.0009718858878608617, + "loss": 0.0522, + "num_input_tokens_seen": 52013760, + "step": 24095 + }, + { + "epoch": 3.931484502446982, + "grad_norm": 0.05692682042717934, + "learning_rate": 0.0009718623511940145, + "loss": 0.0255, + "num_input_tokens_seen": 52024960, + "step": 24100 + }, + { + "epoch": 3.932300163132137, + "grad_norm": 0.0134621262550354, + "learning_rate": 0.0009718388049642781, + "loss": 0.1203, + "num_input_tokens_seen": 52035872, + "step": 24105 + }, + { + "epoch": 3.933115823817292, + "grad_norm": 0.009682439267635345, + "learning_rate": 0.00097181524917213, + "loss": 0.1521, + "num_input_tokens_seen": 52047136, + "step": 24110 + }, + { + "epoch": 3.933931484502447, + "grad_norm": 0.003689864184707403, + "learning_rate": 0.0009717916838180471, + "loss": 0.2649, + "num_input_tokens_seen": 52059040, + "step": 24115 + }, + { + "epoch": 3.934747145187602, + "grad_norm": 0.009880336001515388, + "learning_rate": 0.0009717681089025073, + "loss": 0.074, + "num_input_tokens_seen": 52070176, + "step": 24120 + }, + { + "epoch": 3.935562805872757, + "grad_norm": 0.03160657361149788, + "learning_rate": 0.0009717445244259882, + "loss": 0.0439, + "num_input_tokens_seen": 52080576, + "step": 24125 + }, + { + "epoch": 3.936378466557912, + "grad_norm": 0.052636656910181046, + "learning_rate": 0.0009717209303889679, + "loss": 0.0492, + "num_input_tokens_seen": 52091744, + "step": 24130 + }, + { + "epoch": 3.9371941272430666, + "grad_norm": 0.08156874775886536, + "learning_rate": 0.0009716973267919246, + "loss": 0.2644, + "num_input_tokens_seen": 52102432, + "step": 24135 + }, + { + "epoch": 3.938009787928222, + "grad_norm": 0.009791438467800617, + "learning_rate": 0.0009716737136353365, + "loss": 0.0759, + "num_input_tokens_seen": 52111968, + "step": 24140 + }, + { + "epoch": 3.9388254486133767, + "grad_norm": 0.021028507500886917, + "learning_rate": 0.0009716500909196824, + "loss": 0.1077, + "num_input_tokens_seen": 52123872, + "step": 24145 + }, + { + "epoch": 3.939641109298532, + "grad_norm": 0.007046426180750132, + "learning_rate": 0.0009716264586454406, + "loss": 0.0633, + "num_input_tokens_seen": 52134400, + "step": 24150 + }, + { + "epoch": 3.9404567699836868, + "grad_norm": 0.005420156288892031, + "learning_rate": 0.0009716028168130906, + "loss": 0.0263, + "num_input_tokens_seen": 52144448, + "step": 24155 + }, + { + "epoch": 3.9412724306688416, + "grad_norm": 0.11813469976186752, + "learning_rate": 0.000971579165423111, + "loss": 0.1137, + "num_input_tokens_seen": 52155840, + "step": 24160 + }, + { + "epoch": 3.942088091353997, + "grad_norm": 0.009505799040198326, + "learning_rate": 0.0009715555044759815, + "loss": 0.0891, + "num_input_tokens_seen": 52167968, + "step": 24165 + }, + { + "epoch": 3.9429037520391517, + "grad_norm": 0.07877328991889954, + "learning_rate": 0.0009715318339721814, + "loss": 0.0596, + "num_input_tokens_seen": 52177792, + "step": 24170 + }, + { + "epoch": 3.943719412724307, + "grad_norm": 0.20188076794147491, + "learning_rate": 0.0009715081539121908, + "loss": 0.1436, + "num_input_tokens_seen": 52188992, + "step": 24175 + }, + { + "epoch": 3.9445350734094617, + "grad_norm": 0.20412327349185944, + "learning_rate": 0.0009714844642964891, + "loss": 0.1147, + "num_input_tokens_seen": 52200192, + "step": 24180 + }, + { + "epoch": 3.9453507340946166, + "grad_norm": 0.11833662539720535, + "learning_rate": 0.0009714607651255565, + "loss": 0.0985, + "num_input_tokens_seen": 52210944, + "step": 24185 + }, + { + "epoch": 3.9461663947797714, + "grad_norm": 0.04366360977292061, + "learning_rate": 0.0009714370563998736, + "loss": 0.1389, + "num_input_tokens_seen": 52220960, + "step": 24190 + }, + { + "epoch": 3.9469820554649266, + "grad_norm": 0.13393987715244293, + "learning_rate": 0.0009714133381199205, + "loss": 0.1807, + "num_input_tokens_seen": 52231232, + "step": 24195 + }, + { + "epoch": 3.9477977161500815, + "grad_norm": 0.011434974148869514, + "learning_rate": 0.0009713896102861782, + "loss": 0.1046, + "num_input_tokens_seen": 52240032, + "step": 24200 + }, + { + "epoch": 3.9486133768352367, + "grad_norm": 0.007954268716275692, + "learning_rate": 0.0009713658728991274, + "loss": 0.0688, + "num_input_tokens_seen": 52251968, + "step": 24205 + }, + { + "epoch": 3.9494290375203915, + "grad_norm": 0.0034647146239876747, + "learning_rate": 0.0009713421259592493, + "loss": 0.0408, + "num_input_tokens_seen": 52263072, + "step": 24210 + }, + { + "epoch": 3.9502446982055464, + "grad_norm": 0.14254719018936157, + "learning_rate": 0.0009713183694670249, + "loss": 0.1208, + "num_input_tokens_seen": 52274336, + "step": 24215 + }, + { + "epoch": 3.9510603588907016, + "grad_norm": 0.03650829195976257, + "learning_rate": 0.000971294603422936, + "loss": 0.0869, + "num_input_tokens_seen": 52284992, + "step": 24220 + }, + { + "epoch": 3.9518760195758564, + "grad_norm": 0.0948442742228508, + "learning_rate": 0.000971270827827464, + "loss": 0.038, + "num_input_tokens_seen": 52296032, + "step": 24225 + }, + { + "epoch": 3.9526916802610113, + "grad_norm": 0.17962689697742462, + "learning_rate": 0.0009712470426810909, + "loss": 0.2692, + "num_input_tokens_seen": 52306432, + "step": 24230 + }, + { + "epoch": 3.9535073409461665, + "grad_norm": 0.023669281974434853, + "learning_rate": 0.0009712232479842986, + "loss": 0.0271, + "num_input_tokens_seen": 52316640, + "step": 24235 + }, + { + "epoch": 3.9543230016313213, + "grad_norm": 0.1327577829360962, + "learning_rate": 0.0009711994437375693, + "loss": 0.0992, + "num_input_tokens_seen": 52327104, + "step": 24240 + }, + { + "epoch": 3.955138662316476, + "grad_norm": 0.0070892455987632275, + "learning_rate": 0.0009711756299413856, + "loss": 0.1505, + "num_input_tokens_seen": 52338080, + "step": 24245 + }, + { + "epoch": 3.9559543230016314, + "grad_norm": 0.010778171010315418, + "learning_rate": 0.0009711518065962302, + "loss": 0.2094, + "num_input_tokens_seen": 52348576, + "step": 24250 + }, + { + "epoch": 3.9567699836867862, + "grad_norm": 0.021005019545555115, + "learning_rate": 0.0009711279737025856, + "loss": 0.1491, + "num_input_tokens_seen": 52358848, + "step": 24255 + }, + { + "epoch": 3.9575856443719415, + "grad_norm": 0.011824915185570717, + "learning_rate": 0.0009711041312609349, + "loss": 0.0767, + "num_input_tokens_seen": 52369248, + "step": 24260 + }, + { + "epoch": 3.9584013050570963, + "grad_norm": 0.04517203941941261, + "learning_rate": 0.0009710802792717613, + "loss": 0.1477, + "num_input_tokens_seen": 52380640, + "step": 24265 + }, + { + "epoch": 3.959216965742251, + "grad_norm": 0.03443364053964615, + "learning_rate": 0.0009710564177355483, + "loss": 0.0891, + "num_input_tokens_seen": 52391136, + "step": 24270 + }, + { + "epoch": 3.960032626427406, + "grad_norm": 0.035911448299884796, + "learning_rate": 0.0009710325466527794, + "loss": 0.0659, + "num_input_tokens_seen": 52401824, + "step": 24275 + }, + { + "epoch": 3.960848287112561, + "grad_norm": 0.21114785969257355, + "learning_rate": 0.0009710086660239386, + "loss": 0.4266, + "num_input_tokens_seen": 52412576, + "step": 24280 + }, + { + "epoch": 3.961663947797716, + "grad_norm": 0.1301741749048233, + "learning_rate": 0.0009709847758495094, + "loss": 0.1741, + "num_input_tokens_seen": 52422720, + "step": 24285 + }, + { + "epoch": 3.9624796084828713, + "grad_norm": 0.20639094710350037, + "learning_rate": 0.0009709608761299763, + "loss": 0.1208, + "num_input_tokens_seen": 52432800, + "step": 24290 + }, + { + "epoch": 3.963295269168026, + "grad_norm": 0.02470194734632969, + "learning_rate": 0.0009709369668658237, + "loss": 0.0496, + "num_input_tokens_seen": 52443680, + "step": 24295 + }, + { + "epoch": 3.964110929853181, + "grad_norm": 0.010821464471518993, + "learning_rate": 0.0009709130480575359, + "loss": 0.1857, + "num_input_tokens_seen": 52454976, + "step": 24300 + }, + { + "epoch": 3.964926590538336, + "grad_norm": 0.0643695667386055, + "learning_rate": 0.0009708891197055978, + "loss": 0.1537, + "num_input_tokens_seen": 52465376, + "step": 24305 + }, + { + "epoch": 3.965742251223491, + "grad_norm": 0.051256515085697174, + "learning_rate": 0.0009708651818104943, + "loss": 0.177, + "num_input_tokens_seen": 52477408, + "step": 24310 + }, + { + "epoch": 3.9665579119086463, + "grad_norm": 0.034413184970617294, + "learning_rate": 0.0009708412343727106, + "loss": 0.1051, + "num_input_tokens_seen": 52489472, + "step": 24315 + }, + { + "epoch": 3.967373572593801, + "grad_norm": 0.02862395904958248, + "learning_rate": 0.000970817277392732, + "loss": 0.0351, + "num_input_tokens_seen": 52499648, + "step": 24320 + }, + { + "epoch": 3.968189233278956, + "grad_norm": 0.04880475252866745, + "learning_rate": 0.000970793310871044, + "loss": 0.1834, + "num_input_tokens_seen": 52508096, + "step": 24325 + }, + { + "epoch": 3.9690048939641107, + "grad_norm": 0.04193776473402977, + "learning_rate": 0.0009707693348081323, + "loss": 0.0755, + "num_input_tokens_seen": 52520224, + "step": 24330 + }, + { + "epoch": 3.969820554649266, + "grad_norm": 0.020621774718165398, + "learning_rate": 0.0009707453492044829, + "loss": 0.11, + "num_input_tokens_seen": 52531520, + "step": 24335 + }, + { + "epoch": 3.970636215334421, + "grad_norm": 0.02446540631353855, + "learning_rate": 0.0009707213540605817, + "loss": 0.1342, + "num_input_tokens_seen": 52543040, + "step": 24340 + }, + { + "epoch": 3.971451876019576, + "grad_norm": 0.11845069378614426, + "learning_rate": 0.0009706973493769152, + "loss": 0.2788, + "num_input_tokens_seen": 52553760, + "step": 24345 + }, + { + "epoch": 3.972267536704731, + "grad_norm": 0.08139796555042267, + "learning_rate": 0.0009706733351539696, + "loss": 0.0903, + "num_input_tokens_seen": 52564416, + "step": 24350 + }, + { + "epoch": 3.9730831973898857, + "grad_norm": 0.13644473254680634, + "learning_rate": 0.0009706493113922318, + "loss": 0.0981, + "num_input_tokens_seen": 52574880, + "step": 24355 + }, + { + "epoch": 3.9738988580750405, + "grad_norm": 0.04093753173947334, + "learning_rate": 0.000970625278092189, + "loss": 0.171, + "num_input_tokens_seen": 52586464, + "step": 24360 + }, + { + "epoch": 3.9747145187601958, + "grad_norm": 0.015360267832875252, + "learning_rate": 0.0009706012352543276, + "loss": 0.0407, + "num_input_tokens_seen": 52596608, + "step": 24365 + }, + { + "epoch": 3.9755301794453506, + "grad_norm": 0.09974577277898788, + "learning_rate": 0.0009705771828791353, + "loss": 0.1886, + "num_input_tokens_seen": 52607680, + "step": 24370 + }, + { + "epoch": 3.976345840130506, + "grad_norm": 0.18576501309871674, + "learning_rate": 0.0009705531209670993, + "loss": 0.1829, + "num_input_tokens_seen": 52619648, + "step": 24375 + }, + { + "epoch": 3.9771615008156607, + "grad_norm": 0.02524561993777752, + "learning_rate": 0.0009705290495187073, + "loss": 0.0887, + "num_input_tokens_seen": 52630528, + "step": 24380 + }, + { + "epoch": 3.9779771615008155, + "grad_norm": 0.028243429958820343, + "learning_rate": 0.0009705049685344474, + "loss": 0.0178, + "num_input_tokens_seen": 52640608, + "step": 24385 + }, + { + "epoch": 3.9787928221859707, + "grad_norm": 0.11663435399532318, + "learning_rate": 0.0009704808780148074, + "loss": 0.1466, + "num_input_tokens_seen": 52651872, + "step": 24390 + }, + { + "epoch": 3.9796084828711256, + "grad_norm": 0.10319039970636368, + "learning_rate": 0.0009704567779602754, + "loss": 0.0964, + "num_input_tokens_seen": 52663904, + "step": 24395 + }, + { + "epoch": 3.980424143556281, + "grad_norm": 0.07420803606510162, + "learning_rate": 0.0009704326683713402, + "loss": 0.1791, + "num_input_tokens_seen": 52674848, + "step": 24400 + }, + { + "epoch": 3.9812398042414356, + "grad_norm": 0.21930944919586182, + "learning_rate": 0.00097040854924849, + "loss": 0.3198, + "num_input_tokens_seen": 52686656, + "step": 24405 + }, + { + "epoch": 3.9820554649265905, + "grad_norm": 0.020767202600836754, + "learning_rate": 0.0009703844205922139, + "loss": 0.0801, + "num_input_tokens_seen": 52696544, + "step": 24410 + }, + { + "epoch": 3.9828711256117453, + "grad_norm": 0.0045809028670191765, + "learning_rate": 0.0009703602824030007, + "loss": 0.0485, + "num_input_tokens_seen": 52706912, + "step": 24415 + }, + { + "epoch": 3.9836867862969005, + "grad_norm": 0.012361938133835793, + "learning_rate": 0.0009703361346813398, + "loss": 0.0851, + "num_input_tokens_seen": 52718016, + "step": 24420 + }, + { + "epoch": 3.9845024469820554, + "grad_norm": 0.20187696814537048, + "learning_rate": 0.0009703119774277205, + "loss": 0.3752, + "num_input_tokens_seen": 52728288, + "step": 24425 + }, + { + "epoch": 3.9853181076672106, + "grad_norm": 0.029316125437617302, + "learning_rate": 0.0009702878106426321, + "loss": 0.1012, + "num_input_tokens_seen": 52739104, + "step": 24430 + }, + { + "epoch": 3.9861337683523654, + "grad_norm": 0.11292201280593872, + "learning_rate": 0.0009702636343265649, + "loss": 0.054, + "num_input_tokens_seen": 52749536, + "step": 24435 + }, + { + "epoch": 3.9869494290375203, + "grad_norm": 0.1034129410982132, + "learning_rate": 0.0009702394484800084, + "loss": 0.1939, + "num_input_tokens_seen": 52760416, + "step": 24440 + }, + { + "epoch": 3.9877650897226755, + "grad_norm": 0.023671921342611313, + "learning_rate": 0.000970215253103453, + "loss": 0.088, + "num_input_tokens_seen": 52772160, + "step": 24445 + }, + { + "epoch": 3.9885807504078303, + "grad_norm": 0.056872595101594925, + "learning_rate": 0.0009701910481973889, + "loss": 0.049, + "num_input_tokens_seen": 52781984, + "step": 24450 + }, + { + "epoch": 3.9893964110929856, + "grad_norm": 0.13757658004760742, + "learning_rate": 0.0009701668337623069, + "loss": 0.0934, + "num_input_tokens_seen": 52791776, + "step": 24455 + }, + { + "epoch": 3.9902120717781404, + "grad_norm": 0.010149016976356506, + "learning_rate": 0.0009701426097986974, + "loss": 0.0974, + "num_input_tokens_seen": 52802528, + "step": 24460 + }, + { + "epoch": 3.9910277324632952, + "grad_norm": 0.056381430476903915, + "learning_rate": 0.0009701183763070516, + "loss": 0.1012, + "num_input_tokens_seen": 52812640, + "step": 24465 + }, + { + "epoch": 3.99184339314845, + "grad_norm": 0.04301521182060242, + "learning_rate": 0.0009700941332878605, + "loss": 0.0833, + "num_input_tokens_seen": 52823744, + "step": 24470 + }, + { + "epoch": 3.9926590538336053, + "grad_norm": 0.16426050662994385, + "learning_rate": 0.0009700698807416153, + "loss": 0.0696, + "num_input_tokens_seen": 52834016, + "step": 24475 + }, + { + "epoch": 3.99347471451876, + "grad_norm": 0.2111726552248001, + "learning_rate": 0.0009700456186688078, + "loss": 0.1414, + "num_input_tokens_seen": 52845248, + "step": 24480 + }, + { + "epoch": 3.9942903752039154, + "grad_norm": 0.04490203037858009, + "learning_rate": 0.0009700213470699295, + "loss": 0.0305, + "num_input_tokens_seen": 52855104, + "step": 24485 + }, + { + "epoch": 3.99510603588907, + "grad_norm": 0.006937049794942141, + "learning_rate": 0.0009699970659454723, + "loss": 0.0531, + "num_input_tokens_seen": 52866048, + "step": 24490 + }, + { + "epoch": 3.995921696574225, + "grad_norm": 0.0050145722925662994, + "learning_rate": 0.0009699727752959284, + "loss": 0.0579, + "num_input_tokens_seen": 52876768, + "step": 24495 + }, + { + "epoch": 3.99673735725938, + "grad_norm": 0.012230563908815384, + "learning_rate": 0.00096994847512179, + "loss": 0.0678, + "num_input_tokens_seen": 52887872, + "step": 24500 + }, + { + "epoch": 3.997553017944535, + "grad_norm": 0.04020436108112335, + "learning_rate": 0.0009699241654235495, + "loss": 0.0839, + "num_input_tokens_seen": 52898624, + "step": 24505 + }, + { + "epoch": 3.99836867862969, + "grad_norm": 0.03805246204137802, + "learning_rate": 0.0009698998462016997, + "loss": 0.0841, + "num_input_tokens_seen": 52908768, + "step": 24510 + }, + { + "epoch": 3.999184339314845, + "grad_norm": 0.004216975066810846, + "learning_rate": 0.0009698755174567333, + "loss": 0.0395, + "num_input_tokens_seen": 52920064, + "step": 24515 + }, + { + "epoch": 4.0, + "grad_norm": 0.0028089042752981186, + "learning_rate": 0.0009698511791891435, + "loss": 0.024, + "num_input_tokens_seen": 52929744, + "step": 24520 + }, + { + "epoch": 4.0, + "eval_loss": 0.11304692924022675, + "eval_runtime": 103.99, + "eval_samples_per_second": 26.204, + "eval_steps_per_second": 6.558, + "num_input_tokens_seen": 52929744, + "step": 24520 + }, + { + "epoch": 4.000815660685155, + "grad_norm": 0.2917846143245697, + "learning_rate": 0.0009698268313994236, + "loss": 0.0885, + "num_input_tokens_seen": 52941840, + "step": 24525 + }, + { + "epoch": 4.00163132137031, + "grad_norm": 0.005945888347923756, + "learning_rate": 0.0009698024740880668, + "loss": 0.0557, + "num_input_tokens_seen": 52952560, + "step": 24530 + }, + { + "epoch": 4.002446982055465, + "grad_norm": 0.01604945957660675, + "learning_rate": 0.0009697781072555672, + "loss": 0.0846, + "num_input_tokens_seen": 52961968, + "step": 24535 + }, + { + "epoch": 4.00326264274062, + "grad_norm": 0.15214551985263824, + "learning_rate": 0.0009697537309024181, + "loss": 0.1097, + "num_input_tokens_seen": 52971408, + "step": 24540 + }, + { + "epoch": 4.004078303425775, + "grad_norm": 0.006527643650770187, + "learning_rate": 0.0009697293450291136, + "loss": 0.1304, + "num_input_tokens_seen": 52982832, + "step": 24545 + }, + { + "epoch": 4.00489396411093, + "grad_norm": 0.1918172836303711, + "learning_rate": 0.0009697049496361481, + "loss": 0.0544, + "num_input_tokens_seen": 52993648, + "step": 24550 + }, + { + "epoch": 4.005709624796085, + "grad_norm": 0.012356461957097054, + "learning_rate": 0.000969680544724016, + "loss": 0.1118, + "num_input_tokens_seen": 53004272, + "step": 24555 + }, + { + "epoch": 4.006525285481239, + "grad_norm": 0.39064091444015503, + "learning_rate": 0.0009696561302932117, + "loss": 0.3015, + "num_input_tokens_seen": 53013872, + "step": 24560 + }, + { + "epoch": 4.007340946166395, + "grad_norm": 0.027265828102827072, + "learning_rate": 0.0009696317063442303, + "loss": 0.0279, + "num_input_tokens_seen": 53024912, + "step": 24565 + }, + { + "epoch": 4.00815660685155, + "grad_norm": 0.2182610034942627, + "learning_rate": 0.0009696072728775664, + "loss": 0.1508, + "num_input_tokens_seen": 53037360, + "step": 24570 + }, + { + "epoch": 4.008972267536705, + "grad_norm": 0.03107822872698307, + "learning_rate": 0.0009695828298937155, + "loss": 0.0585, + "num_input_tokens_seen": 53048688, + "step": 24575 + }, + { + "epoch": 4.00978792822186, + "grad_norm": 0.019608978182077408, + "learning_rate": 0.0009695583773931728, + "loss": 0.0434, + "num_input_tokens_seen": 53059568, + "step": 24580 + }, + { + "epoch": 4.010603588907014, + "grad_norm": 0.017205512151122093, + "learning_rate": 0.000969533915376434, + "loss": 0.0413, + "num_input_tokens_seen": 53071312, + "step": 24585 + }, + { + "epoch": 4.011419249592169, + "grad_norm": 0.14792314171791077, + "learning_rate": 0.0009695094438439947, + "loss": 0.1299, + "num_input_tokens_seen": 53082544, + "step": 24590 + }, + { + "epoch": 4.012234910277325, + "grad_norm": 0.04373430460691452, + "learning_rate": 0.000969484962796351, + "loss": 0.0256, + "num_input_tokens_seen": 53093168, + "step": 24595 + }, + { + "epoch": 4.01305057096248, + "grad_norm": 0.009424775838851929, + "learning_rate": 0.0009694604722339987, + "loss": 0.1116, + "num_input_tokens_seen": 53104688, + "step": 24600 + }, + { + "epoch": 4.013866231647635, + "grad_norm": 0.08269114047288895, + "learning_rate": 0.0009694359721574345, + "loss": 0.1305, + "num_input_tokens_seen": 53116528, + "step": 24605 + }, + { + "epoch": 4.014681892332789, + "grad_norm": 0.44035181403160095, + "learning_rate": 0.0009694114625671548, + "loss": 0.1024, + "num_input_tokens_seen": 53128016, + "step": 24610 + }, + { + "epoch": 4.015497553017944, + "grad_norm": 0.014272435568273067, + "learning_rate": 0.0009693869434636564, + "loss": 0.0592, + "num_input_tokens_seen": 53139120, + "step": 24615 + }, + { + "epoch": 4.0163132137031, + "grad_norm": 0.015641603618860245, + "learning_rate": 0.000969362414847436, + "loss": 0.0495, + "num_input_tokens_seen": 53149392, + "step": 24620 + }, + { + "epoch": 4.017128874388255, + "grad_norm": 0.23640552163124084, + "learning_rate": 0.0009693378767189909, + "loss": 0.0599, + "num_input_tokens_seen": 53160304, + "step": 24625 + }, + { + "epoch": 4.0179445350734095, + "grad_norm": 0.055725518614053726, + "learning_rate": 0.0009693133290788184, + "loss": 0.0446, + "num_input_tokens_seen": 53171248, + "step": 24630 + }, + { + "epoch": 4.018760195758564, + "grad_norm": 0.06288314610719681, + "learning_rate": 0.0009692887719274159, + "loss": 0.1541, + "num_input_tokens_seen": 53183312, + "step": 24635 + }, + { + "epoch": 4.019575856443719, + "grad_norm": 0.02160368673503399, + "learning_rate": 0.0009692642052652811, + "loss": 0.0837, + "num_input_tokens_seen": 53194192, + "step": 24640 + }, + { + "epoch": 4.020391517128874, + "grad_norm": 0.004235861822962761, + "learning_rate": 0.0009692396290929118, + "loss": 0.3035, + "num_input_tokens_seen": 53205296, + "step": 24645 + }, + { + "epoch": 4.02120717781403, + "grad_norm": 0.004079623147845268, + "learning_rate": 0.0009692150434108061, + "loss": 0.0758, + "num_input_tokens_seen": 53215632, + "step": 24650 + }, + { + "epoch": 4.0220228384991845, + "grad_norm": 0.019200026988983154, + "learning_rate": 0.0009691904482194625, + "loss": 0.1193, + "num_input_tokens_seen": 53226704, + "step": 24655 + }, + { + "epoch": 4.022838499184339, + "grad_norm": 0.00861707329750061, + "learning_rate": 0.000969165843519379, + "loss": 0.0187, + "num_input_tokens_seen": 53237328, + "step": 24660 + }, + { + "epoch": 4.023654159869494, + "grad_norm": 0.021664723753929138, + "learning_rate": 0.0009691412293110546, + "loss": 0.06, + "num_input_tokens_seen": 53248912, + "step": 24665 + }, + { + "epoch": 4.024469820554649, + "grad_norm": 0.005959618836641312, + "learning_rate": 0.0009691166055949881, + "loss": 0.0783, + "num_input_tokens_seen": 53260528, + "step": 24670 + }, + { + "epoch": 4.025285481239805, + "grad_norm": 0.029672566801309586, + "learning_rate": 0.0009690919723716785, + "loss": 0.069, + "num_input_tokens_seen": 53272016, + "step": 24675 + }, + { + "epoch": 4.0261011419249595, + "grad_norm": 0.05666002631187439, + "learning_rate": 0.000969067329641625, + "loss": 0.035, + "num_input_tokens_seen": 53282640, + "step": 24680 + }, + { + "epoch": 4.026916802610114, + "grad_norm": 0.0023556475061923265, + "learning_rate": 0.000969042677405327, + "loss": 0.0732, + "num_input_tokens_seen": 53293904, + "step": 24685 + }, + { + "epoch": 4.027732463295269, + "grad_norm": 0.002951778471469879, + "learning_rate": 0.0009690180156632839, + "loss": 0.0607, + "num_input_tokens_seen": 53305904, + "step": 24690 + }, + { + "epoch": 4.028548123980424, + "grad_norm": 0.00254252040758729, + "learning_rate": 0.000968993344415996, + "loss": 0.0148, + "num_input_tokens_seen": 53316528, + "step": 24695 + }, + { + "epoch": 4.029363784665579, + "grad_norm": 0.06488464027643204, + "learning_rate": 0.0009689686636639629, + "loss": 0.1381, + "num_input_tokens_seen": 53328464, + "step": 24700 + }, + { + "epoch": 4.0301794453507345, + "grad_norm": 0.21089515089988708, + "learning_rate": 0.000968943973407685, + "loss": 0.1308, + "num_input_tokens_seen": 53339440, + "step": 24705 + }, + { + "epoch": 4.030995106035889, + "grad_norm": 0.015781166031956673, + "learning_rate": 0.0009689192736476624, + "loss": 0.1113, + "num_input_tokens_seen": 53350000, + "step": 24710 + }, + { + "epoch": 4.031810766721044, + "grad_norm": 0.009945242665708065, + "learning_rate": 0.000968894564384396, + "loss": 0.0241, + "num_input_tokens_seen": 53360528, + "step": 24715 + }, + { + "epoch": 4.032626427406199, + "grad_norm": 0.006212721113115549, + "learning_rate": 0.0009688698456183863, + "loss": 0.0317, + "num_input_tokens_seen": 53371600, + "step": 24720 + }, + { + "epoch": 4.033442088091354, + "grad_norm": 0.06531178206205368, + "learning_rate": 0.0009688451173501345, + "loss": 0.1115, + "num_input_tokens_seen": 53382032, + "step": 24725 + }, + { + "epoch": 4.034257748776509, + "grad_norm": 0.058198247104883194, + "learning_rate": 0.0009688203795801415, + "loss": 0.13, + "num_input_tokens_seen": 53392816, + "step": 24730 + }, + { + "epoch": 4.035073409461664, + "grad_norm": 0.05636320263147354, + "learning_rate": 0.0009687956323089088, + "loss": 0.068, + "num_input_tokens_seen": 53404624, + "step": 24735 + }, + { + "epoch": 4.035889070146819, + "grad_norm": 0.13674426078796387, + "learning_rate": 0.000968770875536938, + "loss": 0.1391, + "num_input_tokens_seen": 53416048, + "step": 24740 + }, + { + "epoch": 4.036704730831974, + "grad_norm": 0.04053742066025734, + "learning_rate": 0.0009687461092647308, + "loss": 0.063, + "num_input_tokens_seen": 53426704, + "step": 24745 + }, + { + "epoch": 4.037520391517129, + "grad_norm": 0.1300506889820099, + "learning_rate": 0.0009687213334927888, + "loss": 0.0661, + "num_input_tokens_seen": 53439248, + "step": 24750 + }, + { + "epoch": 4.0383360522022835, + "grad_norm": 0.034887779504060745, + "learning_rate": 0.0009686965482216145, + "loss": 0.1907, + "num_input_tokens_seen": 53448400, + "step": 24755 + }, + { + "epoch": 4.039151712887439, + "grad_norm": 0.164821058511734, + "learning_rate": 0.00096867175345171, + "loss": 0.2202, + "num_input_tokens_seen": 53460336, + "step": 24760 + }, + { + "epoch": 4.039967373572594, + "grad_norm": 0.13601839542388916, + "learning_rate": 0.0009686469491835779, + "loss": 0.1431, + "num_input_tokens_seen": 53471568, + "step": 24765 + }, + { + "epoch": 4.040783034257749, + "grad_norm": 0.014577140100300312, + "learning_rate": 0.0009686221354177209, + "loss": 0.0342, + "num_input_tokens_seen": 53483568, + "step": 24770 + }, + { + "epoch": 4.041598694942904, + "grad_norm": 0.05224091559648514, + "learning_rate": 0.0009685973121546417, + "loss": 0.1029, + "num_input_tokens_seen": 53492848, + "step": 24775 + }, + { + "epoch": 4.0424143556280585, + "grad_norm": 0.23117469251155853, + "learning_rate": 0.0009685724793948436, + "loss": 0.1512, + "num_input_tokens_seen": 53502992, + "step": 24780 + }, + { + "epoch": 4.043230016313213, + "grad_norm": 0.04325005039572716, + "learning_rate": 0.0009685476371388298, + "loss": 0.163, + "num_input_tokens_seen": 53513264, + "step": 24785 + }, + { + "epoch": 4.044045676998369, + "grad_norm": 0.03501881659030914, + "learning_rate": 0.0009685227853871037, + "loss": 0.0526, + "num_input_tokens_seen": 53523984, + "step": 24790 + }, + { + "epoch": 4.044861337683524, + "grad_norm": 0.060579538345336914, + "learning_rate": 0.000968497924140169, + "loss": 0.1033, + "num_input_tokens_seen": 53534672, + "step": 24795 + }, + { + "epoch": 4.045676998368679, + "grad_norm": 0.0872158631682396, + "learning_rate": 0.0009684730533985296, + "loss": 0.0752, + "num_input_tokens_seen": 53546224, + "step": 24800 + }, + { + "epoch": 4.0464926590538335, + "grad_norm": 0.006320657674223185, + "learning_rate": 0.0009684481731626895, + "loss": 0.0126, + "num_input_tokens_seen": 53555952, + "step": 24805 + }, + { + "epoch": 4.047308319738988, + "grad_norm": 0.0056551722809672356, + "learning_rate": 0.0009684232834331528, + "loss": 0.0836, + "num_input_tokens_seen": 53567024, + "step": 24810 + }, + { + "epoch": 4.048123980424143, + "grad_norm": 0.02776307985186577, + "learning_rate": 0.000968398384210424, + "loss": 0.0992, + "num_input_tokens_seen": 53578032, + "step": 24815 + }, + { + "epoch": 4.048939641109299, + "grad_norm": 0.016916630789637566, + "learning_rate": 0.0009683734754950078, + "loss": 0.0591, + "num_input_tokens_seen": 53589968, + "step": 24820 + }, + { + "epoch": 4.049755301794454, + "grad_norm": 0.037797361612319946, + "learning_rate": 0.0009683485572874089, + "loss": 0.0317, + "num_input_tokens_seen": 53599472, + "step": 24825 + }, + { + "epoch": 4.0505709624796085, + "grad_norm": 0.0032569735776633024, + "learning_rate": 0.0009683236295881324, + "loss": 0.0718, + "num_input_tokens_seen": 53611376, + "step": 24830 + }, + { + "epoch": 4.051386623164763, + "grad_norm": 0.04766928777098656, + "learning_rate": 0.0009682986923976834, + "loss": 0.0575, + "num_input_tokens_seen": 53623760, + "step": 24835 + }, + { + "epoch": 4.052202283849918, + "grad_norm": 0.28422072529792786, + "learning_rate": 0.0009682737457165673, + "loss": 0.2351, + "num_input_tokens_seen": 53634416, + "step": 24840 + }, + { + "epoch": 4.053017944535074, + "grad_norm": 0.003699280321598053, + "learning_rate": 0.0009682487895452898, + "loss": 0.0511, + "num_input_tokens_seen": 53644528, + "step": 24845 + }, + { + "epoch": 4.053833605220229, + "grad_norm": 0.13272452354431152, + "learning_rate": 0.0009682238238843565, + "loss": 0.063, + "num_input_tokens_seen": 53654480, + "step": 24850 + }, + { + "epoch": 4.054649265905383, + "grad_norm": 0.012849804013967514, + "learning_rate": 0.0009681988487342735, + "loss": 0.175, + "num_input_tokens_seen": 53665136, + "step": 24855 + }, + { + "epoch": 4.055464926590538, + "grad_norm": 0.02238376811146736, + "learning_rate": 0.0009681738640955466, + "loss": 0.1095, + "num_input_tokens_seen": 53677104, + "step": 24860 + }, + { + "epoch": 4.056280587275693, + "grad_norm": 0.006406494881957769, + "learning_rate": 0.0009681488699686827, + "loss": 0.0446, + "num_input_tokens_seen": 53688016, + "step": 24865 + }, + { + "epoch": 4.057096247960848, + "grad_norm": 0.1803017109632492, + "learning_rate": 0.000968123866354188, + "loss": 0.1048, + "num_input_tokens_seen": 53698544, + "step": 24870 + }, + { + "epoch": 4.057911908646004, + "grad_norm": 0.42231833934783936, + "learning_rate": 0.0009680988532525693, + "loss": 0.1238, + "num_input_tokens_seen": 53710000, + "step": 24875 + }, + { + "epoch": 4.058727569331158, + "grad_norm": 0.00441970257088542, + "learning_rate": 0.0009680738306643335, + "loss": 0.1508, + "num_input_tokens_seen": 53720656, + "step": 24880 + }, + { + "epoch": 4.059543230016313, + "grad_norm": 0.013295281678438187, + "learning_rate": 0.0009680487985899878, + "loss": 0.099, + "num_input_tokens_seen": 53731280, + "step": 24885 + }, + { + "epoch": 4.060358890701468, + "grad_norm": 0.047380875796079636, + "learning_rate": 0.0009680237570300392, + "loss": 0.042, + "num_input_tokens_seen": 53741712, + "step": 24890 + }, + { + "epoch": 4.061174551386623, + "grad_norm": 0.06575069576501846, + "learning_rate": 0.0009679987059849956, + "loss": 0.064, + "num_input_tokens_seen": 53753360, + "step": 24895 + }, + { + "epoch": 4.061990212071779, + "grad_norm": 0.053161948919296265, + "learning_rate": 0.0009679736454553645, + "loss": 0.1244, + "num_input_tokens_seen": 53764112, + "step": 24900 + }, + { + "epoch": 4.062805872756933, + "grad_norm": 0.017140526324510574, + "learning_rate": 0.0009679485754416538, + "loss": 0.0877, + "num_input_tokens_seen": 53774320, + "step": 24905 + }, + { + "epoch": 4.063621533442088, + "grad_norm": 0.013312417082488537, + "learning_rate": 0.0009679234959443717, + "loss": 0.0215, + "num_input_tokens_seen": 53784848, + "step": 24910 + }, + { + "epoch": 4.064437194127243, + "grad_norm": 0.007571144960820675, + "learning_rate": 0.0009678984069640262, + "loss": 0.0085, + "num_input_tokens_seen": 53795632, + "step": 24915 + }, + { + "epoch": 4.065252854812398, + "grad_norm": 0.09294047951698303, + "learning_rate": 0.000967873308501126, + "loss": 0.1099, + "num_input_tokens_seen": 53806480, + "step": 24920 + }, + { + "epoch": 4.066068515497553, + "grad_norm": 0.2097572684288025, + "learning_rate": 0.0009678482005561795, + "loss": 0.062, + "num_input_tokens_seen": 53816816, + "step": 24925 + }, + { + "epoch": 4.066884176182708, + "grad_norm": 0.03203193470835686, + "learning_rate": 0.0009678230831296959, + "loss": 0.0389, + "num_input_tokens_seen": 53828720, + "step": 24930 + }, + { + "epoch": 4.067699836867863, + "grad_norm": 0.08545438945293427, + "learning_rate": 0.000967797956222184, + "loss": 0.0611, + "num_input_tokens_seen": 53840816, + "step": 24935 + }, + { + "epoch": 4.068515497553018, + "grad_norm": 0.02790004201233387, + "learning_rate": 0.000967772819834153, + "loss": 0.0587, + "num_input_tokens_seen": 53851120, + "step": 24940 + }, + { + "epoch": 4.069331158238173, + "grad_norm": 0.012067534029483795, + "learning_rate": 0.0009677476739661124, + "loss": 0.0844, + "num_input_tokens_seen": 53862288, + "step": 24945 + }, + { + "epoch": 4.070146818923328, + "grad_norm": 0.1628466248512268, + "learning_rate": 0.0009677225186185719, + "loss": 0.2125, + "num_input_tokens_seen": 53873584, + "step": 24950 + }, + { + "epoch": 4.0709624796084825, + "grad_norm": 0.03747514635324478, + "learning_rate": 0.0009676973537920411, + "loss": 0.2453, + "num_input_tokens_seen": 53885328, + "step": 24955 + }, + { + "epoch": 4.071778140293638, + "grad_norm": 0.025888165459036827, + "learning_rate": 0.0009676721794870302, + "loss": 0.0124, + "num_input_tokens_seen": 53895376, + "step": 24960 + }, + { + "epoch": 4.072593800978793, + "grad_norm": 0.024857787415385246, + "learning_rate": 0.0009676469957040492, + "loss": 0.0897, + "num_input_tokens_seen": 53906640, + "step": 24965 + }, + { + "epoch": 4.073409461663948, + "grad_norm": 0.16511480510234833, + "learning_rate": 0.0009676218024436087, + "loss": 0.1201, + "num_input_tokens_seen": 53917264, + "step": 24970 + }, + { + "epoch": 4.074225122349103, + "grad_norm": 0.034926723688840866, + "learning_rate": 0.0009675965997062192, + "loss": 0.1315, + "num_input_tokens_seen": 53927984, + "step": 24975 + }, + { + "epoch": 4.075040783034257, + "grad_norm": 0.0059353020042181015, + "learning_rate": 0.0009675713874923912, + "loss": 0.0492, + "num_input_tokens_seen": 53938672, + "step": 24980 + }, + { + "epoch": 4.075856443719413, + "grad_norm": 0.16642434895038605, + "learning_rate": 0.0009675461658026361, + "loss": 0.0739, + "num_input_tokens_seen": 53951120, + "step": 24985 + }, + { + "epoch": 4.076672104404568, + "grad_norm": 0.025159316137433052, + "learning_rate": 0.0009675209346374647, + "loss": 0.0392, + "num_input_tokens_seen": 53962192, + "step": 24990 + }, + { + "epoch": 4.077487765089723, + "grad_norm": 0.030797351151704788, + "learning_rate": 0.0009674956939973885, + "loss": 0.149, + "num_input_tokens_seen": 53973488, + "step": 24995 + }, + { + "epoch": 4.078303425774878, + "grad_norm": 0.11491513252258301, + "learning_rate": 0.0009674704438829189, + "loss": 0.1056, + "num_input_tokens_seen": 53984272, + "step": 25000 + }, + { + "epoch": 4.079119086460032, + "grad_norm": 0.17148253321647644, + "learning_rate": 0.0009674451842945679, + "loss": 0.1415, + "num_input_tokens_seen": 53994704, + "step": 25005 + }, + { + "epoch": 4.079934747145187, + "grad_norm": 0.030827978625893593, + "learning_rate": 0.0009674199152328472, + "loss": 0.0585, + "num_input_tokens_seen": 54005552, + "step": 25010 + }, + { + "epoch": 4.080750407830343, + "grad_norm": 0.03486708551645279, + "learning_rate": 0.0009673946366982689, + "loss": 0.0638, + "num_input_tokens_seen": 54016560, + "step": 25015 + }, + { + "epoch": 4.081566068515498, + "grad_norm": 0.18917278945446014, + "learning_rate": 0.0009673693486913453, + "loss": 0.0891, + "num_input_tokens_seen": 54026736, + "step": 25020 + }, + { + "epoch": 4.082381729200653, + "grad_norm": 0.09397385269403458, + "learning_rate": 0.000967344051212589, + "loss": 0.1734, + "num_input_tokens_seen": 54036592, + "step": 25025 + }, + { + "epoch": 4.083197389885807, + "grad_norm": 0.05814465880393982, + "learning_rate": 0.0009673187442625126, + "loss": 0.243, + "num_input_tokens_seen": 54046864, + "step": 25030 + }, + { + "epoch": 4.084013050570962, + "grad_norm": 0.030367521569132805, + "learning_rate": 0.0009672934278416292, + "loss": 0.059, + "num_input_tokens_seen": 54058448, + "step": 25035 + }, + { + "epoch": 4.084828711256117, + "grad_norm": 0.20593537390232086, + "learning_rate": 0.0009672681019504514, + "loss": 0.1712, + "num_input_tokens_seen": 54068848, + "step": 25040 + }, + { + "epoch": 4.085644371941273, + "grad_norm": 0.1625973880290985, + "learning_rate": 0.0009672427665894929, + "loss": 0.1946, + "num_input_tokens_seen": 54079504, + "step": 25045 + }, + { + "epoch": 4.0864600326264275, + "grad_norm": 0.05336277559399605, + "learning_rate": 0.0009672174217592671, + "loss": 0.0967, + "num_input_tokens_seen": 54089424, + "step": 25050 + }, + { + "epoch": 4.087275693311582, + "grad_norm": 0.15389741957187653, + "learning_rate": 0.0009671920674602874, + "loss": 0.062, + "num_input_tokens_seen": 54100848, + "step": 25055 + }, + { + "epoch": 4.088091353996737, + "grad_norm": 0.05887462571263313, + "learning_rate": 0.0009671667036930678, + "loss": 0.086, + "num_input_tokens_seen": 54111376, + "step": 25060 + }, + { + "epoch": 4.088907014681892, + "grad_norm": 0.02259242907166481, + "learning_rate": 0.0009671413304581224, + "loss": 0.0241, + "num_input_tokens_seen": 54123408, + "step": 25065 + }, + { + "epoch": 4.089722675367048, + "grad_norm": 0.01118428073823452, + "learning_rate": 0.0009671159477559652, + "loss": 0.0786, + "num_input_tokens_seen": 54133296, + "step": 25070 + }, + { + "epoch": 4.0905383360522025, + "grad_norm": 0.016618698835372925, + "learning_rate": 0.0009670905555871108, + "loss": 0.0657, + "num_input_tokens_seen": 54144240, + "step": 25075 + }, + { + "epoch": 4.091353996737357, + "grad_norm": 0.0037635231856256723, + "learning_rate": 0.0009670651539520737, + "loss": 0.0662, + "num_input_tokens_seen": 54157008, + "step": 25080 + }, + { + "epoch": 4.092169657422512, + "grad_norm": 0.006061885505914688, + "learning_rate": 0.0009670397428513688, + "loss": 0.0898, + "num_input_tokens_seen": 54167792, + "step": 25085 + }, + { + "epoch": 4.092985318107667, + "grad_norm": 0.02891123667359352, + "learning_rate": 0.000967014322285511, + "loss": 0.0906, + "num_input_tokens_seen": 54178672, + "step": 25090 + }, + { + "epoch": 4.093800978792822, + "grad_norm": 0.008946842513978481, + "learning_rate": 0.0009669888922550154, + "loss": 0.0759, + "num_input_tokens_seen": 54190288, + "step": 25095 + }, + { + "epoch": 4.0946166394779775, + "grad_norm": 0.07760658115148544, + "learning_rate": 0.0009669634527603977, + "loss": 0.0554, + "num_input_tokens_seen": 54199728, + "step": 25100 + }, + { + "epoch": 4.095432300163132, + "grad_norm": 0.01673254929482937, + "learning_rate": 0.000966938003802173, + "loss": 0.0239, + "num_input_tokens_seen": 54209840, + "step": 25105 + }, + { + "epoch": 4.096247960848287, + "grad_norm": 0.14422902464866638, + "learning_rate": 0.0009669125453808573, + "loss": 0.143, + "num_input_tokens_seen": 54219856, + "step": 25110 + }, + { + "epoch": 4.097063621533442, + "grad_norm": 0.005726001225411892, + "learning_rate": 0.0009668870774969668, + "loss": 0.0776, + "num_input_tokens_seen": 54230096, + "step": 25115 + }, + { + "epoch": 4.097879282218597, + "grad_norm": 0.009977491572499275, + "learning_rate": 0.0009668616001510173, + "loss": 0.0183, + "num_input_tokens_seen": 54241840, + "step": 25120 + }, + { + "epoch": 4.0986949429037525, + "grad_norm": 0.07573343068361282, + "learning_rate": 0.0009668361133435252, + "loss": 0.1028, + "num_input_tokens_seen": 54251920, + "step": 25125 + }, + { + "epoch": 4.099510603588907, + "grad_norm": 0.013866202905774117, + "learning_rate": 0.0009668106170750071, + "loss": 0.0925, + "num_input_tokens_seen": 54263344, + "step": 25130 + }, + { + "epoch": 4.100326264274062, + "grad_norm": 0.212263822555542, + "learning_rate": 0.0009667851113459795, + "loss": 0.1665, + "num_input_tokens_seen": 54274384, + "step": 25135 + }, + { + "epoch": 4.101141924959217, + "grad_norm": 0.021173465996980667, + "learning_rate": 0.0009667595961569595, + "loss": 0.0344, + "num_input_tokens_seen": 54285296, + "step": 25140 + }, + { + "epoch": 4.101957585644372, + "grad_norm": 0.08236562460660934, + "learning_rate": 0.0009667340715084641, + "loss": 0.1333, + "num_input_tokens_seen": 54295504, + "step": 25145 + }, + { + "epoch": 4.102773246329527, + "grad_norm": 0.1214727908372879, + "learning_rate": 0.0009667085374010107, + "loss": 0.0627, + "num_input_tokens_seen": 54306704, + "step": 25150 + }, + { + "epoch": 4.103588907014682, + "grad_norm": 0.0035130369942635298, + "learning_rate": 0.0009666829938351169, + "loss": 0.075, + "num_input_tokens_seen": 54316720, + "step": 25155 + }, + { + "epoch": 4.104404567699837, + "grad_norm": 0.07010453939437866, + "learning_rate": 0.0009666574408113, + "loss": 0.0906, + "num_input_tokens_seen": 54327408, + "step": 25160 + }, + { + "epoch": 4.105220228384992, + "grad_norm": 0.06455550342798233, + "learning_rate": 0.0009666318783300782, + "loss": 0.0483, + "num_input_tokens_seen": 54337104, + "step": 25165 + }, + { + "epoch": 4.106035889070147, + "grad_norm": 0.005117730237543583, + "learning_rate": 0.0009666063063919693, + "loss": 0.1078, + "num_input_tokens_seen": 54347408, + "step": 25170 + }, + { + "epoch": 4.1068515497553015, + "grad_norm": 0.15416225790977478, + "learning_rate": 0.0009665807249974917, + "loss": 0.1197, + "num_input_tokens_seen": 54357264, + "step": 25175 + }, + { + "epoch": 4.107667210440456, + "grad_norm": 0.17809714376926422, + "learning_rate": 0.0009665551341471639, + "loss": 0.1409, + "num_input_tokens_seen": 54368816, + "step": 25180 + }, + { + "epoch": 4.108482871125612, + "grad_norm": 0.10321211069822311, + "learning_rate": 0.0009665295338415044, + "loss": 0.1482, + "num_input_tokens_seen": 54380848, + "step": 25185 + }, + { + "epoch": 4.109298531810767, + "grad_norm": 0.028538476675748825, + "learning_rate": 0.0009665039240810319, + "loss": 0.0578, + "num_input_tokens_seen": 54391760, + "step": 25190 + }, + { + "epoch": 4.110114192495922, + "grad_norm": 0.033008161932229996, + "learning_rate": 0.0009664783048662658, + "loss": 0.2387, + "num_input_tokens_seen": 54402384, + "step": 25195 + }, + { + "epoch": 4.1109298531810765, + "grad_norm": 0.03307179734110832, + "learning_rate": 0.0009664526761977249, + "loss": 0.0322, + "num_input_tokens_seen": 54413680, + "step": 25200 + }, + { + "epoch": 4.111745513866231, + "grad_norm": 0.15691694617271423, + "learning_rate": 0.0009664270380759289, + "loss": 0.0822, + "num_input_tokens_seen": 54422640, + "step": 25205 + }, + { + "epoch": 4.112561174551387, + "grad_norm": 0.20907220244407654, + "learning_rate": 0.0009664013905013971, + "loss": 0.1574, + "num_input_tokens_seen": 54434576, + "step": 25210 + }, + { + "epoch": 4.113376835236542, + "grad_norm": 0.02874820865690708, + "learning_rate": 0.0009663757334746497, + "loss": 0.0239, + "num_input_tokens_seen": 54444272, + "step": 25215 + }, + { + "epoch": 4.114192495921697, + "grad_norm": 0.17071279883384705, + "learning_rate": 0.0009663500669962063, + "loss": 0.103, + "num_input_tokens_seen": 54456016, + "step": 25220 + }, + { + "epoch": 4.1150081566068515, + "grad_norm": 0.0020776446908712387, + "learning_rate": 0.0009663243910665872, + "loss": 0.012, + "num_input_tokens_seen": 54466448, + "step": 25225 + }, + { + "epoch": 4.115823817292006, + "grad_norm": 0.2548483908176422, + "learning_rate": 0.0009662987056863128, + "loss": 0.1196, + "num_input_tokens_seen": 54476592, + "step": 25230 + }, + { + "epoch": 4.116639477977161, + "grad_norm": 0.001670819940045476, + "learning_rate": 0.0009662730108559034, + "loss": 0.0322, + "num_input_tokens_seen": 54488048, + "step": 25235 + }, + { + "epoch": 4.117455138662317, + "grad_norm": 0.035325583070516586, + "learning_rate": 0.0009662473065758801, + "loss": 0.0297, + "num_input_tokens_seen": 54499120, + "step": 25240 + }, + { + "epoch": 4.118270799347472, + "grad_norm": 0.13067218661308289, + "learning_rate": 0.0009662215928467636, + "loss": 0.0845, + "num_input_tokens_seen": 54510576, + "step": 25245 + }, + { + "epoch": 4.1190864600326265, + "grad_norm": 0.0387139655649662, + "learning_rate": 0.000966195869669075, + "loss": 0.1303, + "num_input_tokens_seen": 54522160, + "step": 25250 + }, + { + "epoch": 4.119902120717781, + "grad_norm": 0.003784678177908063, + "learning_rate": 0.0009661701370433358, + "loss": 0.0129, + "num_input_tokens_seen": 54533872, + "step": 25255 + }, + { + "epoch": 4.120717781402936, + "grad_norm": 0.06491962820291519, + "learning_rate": 0.0009661443949700674, + "loss": 0.1499, + "num_input_tokens_seen": 54545200, + "step": 25260 + }, + { + "epoch": 4.121533442088092, + "grad_norm": 0.030042633414268494, + "learning_rate": 0.0009661186434497915, + "loss": 0.1568, + "num_input_tokens_seen": 54554960, + "step": 25265 + }, + { + "epoch": 4.122349102773247, + "grad_norm": 0.01743953675031662, + "learning_rate": 0.0009660928824830299, + "loss": 0.1183, + "num_input_tokens_seen": 54565456, + "step": 25270 + }, + { + "epoch": 4.123164763458401, + "grad_norm": 0.006780917756259441, + "learning_rate": 0.0009660671120703048, + "loss": 0.0752, + "num_input_tokens_seen": 54575536, + "step": 25275 + }, + { + "epoch": 4.123980424143556, + "grad_norm": 0.07270966470241547, + "learning_rate": 0.0009660413322121384, + "loss": 0.0212, + "num_input_tokens_seen": 54585296, + "step": 25280 + }, + { + "epoch": 4.124796084828711, + "grad_norm": 0.0239655040204525, + "learning_rate": 0.0009660155429090531, + "loss": 0.1033, + "num_input_tokens_seen": 54595600, + "step": 25285 + }, + { + "epoch": 4.125611745513866, + "grad_norm": 0.038283880800008774, + "learning_rate": 0.0009659897441615717, + "loss": 0.0869, + "num_input_tokens_seen": 54606160, + "step": 25290 + }, + { + "epoch": 4.126427406199022, + "grad_norm": 0.07180234789848328, + "learning_rate": 0.000965963935970217, + "loss": 0.0241, + "num_input_tokens_seen": 54617232, + "step": 25295 + }, + { + "epoch": 4.127243066884176, + "grad_norm": 0.0430586151778698, + "learning_rate": 0.0009659381183355121, + "loss": 0.0356, + "num_input_tokens_seen": 54628688, + "step": 25300 + }, + { + "epoch": 4.128058727569331, + "grad_norm": 0.03182794153690338, + "learning_rate": 0.0009659122912579801, + "loss": 0.0585, + "num_input_tokens_seen": 54640016, + "step": 25305 + }, + { + "epoch": 4.128874388254486, + "grad_norm": 0.019779110327363014, + "learning_rate": 0.0009658864547381445, + "loss": 0.0718, + "num_input_tokens_seen": 54648848, + "step": 25310 + }, + { + "epoch": 4.129690048939641, + "grad_norm": 0.11613103747367859, + "learning_rate": 0.0009658606087765288, + "loss": 0.1891, + "num_input_tokens_seen": 54659376, + "step": 25315 + }, + { + "epoch": 4.130505709624796, + "grad_norm": 0.0035958706866949797, + "learning_rate": 0.0009658347533736569, + "loss": 0.0405, + "num_input_tokens_seen": 54670928, + "step": 25320 + }, + { + "epoch": 4.131321370309951, + "grad_norm": 0.014619206078350544, + "learning_rate": 0.0009658088885300528, + "loss": 0.1163, + "num_input_tokens_seen": 54681584, + "step": 25325 + }, + { + "epoch": 4.132137030995106, + "grad_norm": 0.009095299988985062, + "learning_rate": 0.0009657830142462406, + "loss": 0.1118, + "num_input_tokens_seen": 54692432, + "step": 25330 + }, + { + "epoch": 4.132952691680261, + "grad_norm": 0.0790816992521286, + "learning_rate": 0.0009657571305227449, + "loss": 0.1457, + "num_input_tokens_seen": 54703152, + "step": 25335 + }, + { + "epoch": 4.133768352365416, + "grad_norm": 0.014859483577311039, + "learning_rate": 0.0009657312373600899, + "loss": 0.0229, + "num_input_tokens_seen": 54714032, + "step": 25340 + }, + { + "epoch": 4.134584013050571, + "grad_norm": 0.06535041332244873, + "learning_rate": 0.0009657053347588005, + "loss": 0.0814, + "num_input_tokens_seen": 54724272, + "step": 25345 + }, + { + "epoch": 4.135399673735726, + "grad_norm": 0.03271479159593582, + "learning_rate": 0.0009656794227194019, + "loss": 0.0142, + "num_input_tokens_seen": 54734864, + "step": 25350 + }, + { + "epoch": 4.136215334420881, + "grad_norm": 0.05432186648249626, + "learning_rate": 0.0009656535012424189, + "loss": 0.1013, + "num_input_tokens_seen": 54744464, + "step": 25355 + }, + { + "epoch": 4.137030995106036, + "grad_norm": 0.2506901025772095, + "learning_rate": 0.000965627570328377, + "loss": 0.2048, + "num_input_tokens_seen": 54754672, + "step": 25360 + }, + { + "epoch": 4.137846655791191, + "grad_norm": 0.11805327981710434, + "learning_rate": 0.0009656016299778017, + "loss": 0.1535, + "num_input_tokens_seen": 54766256, + "step": 25365 + }, + { + "epoch": 4.138662316476346, + "grad_norm": 0.0069347405806183815, + "learning_rate": 0.0009655756801912188, + "loss": 0.1053, + "num_input_tokens_seen": 54777296, + "step": 25370 + }, + { + "epoch": 4.1394779771615005, + "grad_norm": 0.17729564011096954, + "learning_rate": 0.000965549720969154, + "loss": 0.1813, + "num_input_tokens_seen": 54787728, + "step": 25375 + }, + { + "epoch": 4.140293637846656, + "grad_norm": 0.07092685252428055, + "learning_rate": 0.0009655237523121336, + "loss": 0.1027, + "num_input_tokens_seen": 54798864, + "step": 25380 + }, + { + "epoch": 4.141109298531811, + "grad_norm": 0.005467765964567661, + "learning_rate": 0.0009654977742206837, + "loss": 0.1025, + "num_input_tokens_seen": 54809872, + "step": 25385 + }, + { + "epoch": 4.141924959216966, + "grad_norm": 0.010642649605870247, + "learning_rate": 0.000965471786695331, + "loss": 0.1326, + "num_input_tokens_seen": 54819248, + "step": 25390 + }, + { + "epoch": 4.142740619902121, + "grad_norm": 0.01924213208258152, + "learning_rate": 0.0009654457897366021, + "loss": 0.0882, + "num_input_tokens_seen": 54830448, + "step": 25395 + }, + { + "epoch": 4.143556280587275, + "grad_norm": 0.019007250666618347, + "learning_rate": 0.0009654197833450235, + "loss": 0.1, + "num_input_tokens_seen": 54841776, + "step": 25400 + }, + { + "epoch": 4.14437194127243, + "grad_norm": 0.15038681030273438, + "learning_rate": 0.0009653937675211229, + "loss": 0.1339, + "num_input_tokens_seen": 54851888, + "step": 25405 + }, + { + "epoch": 4.145187601957586, + "grad_norm": 0.09251240640878677, + "learning_rate": 0.000965367742265427, + "loss": 0.1631, + "num_input_tokens_seen": 54861968, + "step": 25410 + }, + { + "epoch": 4.146003262642741, + "grad_norm": 0.03529278188943863, + "learning_rate": 0.0009653417075784635, + "loss": 0.1645, + "num_input_tokens_seen": 54873104, + "step": 25415 + }, + { + "epoch": 4.146818923327896, + "grad_norm": 0.0808977410197258, + "learning_rate": 0.0009653156634607601, + "loss": 0.0326, + "num_input_tokens_seen": 54883600, + "step": 25420 + }, + { + "epoch": 4.14763458401305, + "grad_norm": 0.010449567809700966, + "learning_rate": 0.0009652896099128443, + "loss": 0.0498, + "num_input_tokens_seen": 54895184, + "step": 25425 + }, + { + "epoch": 4.148450244698205, + "grad_norm": 0.05876653641462326, + "learning_rate": 0.0009652635469352443, + "loss": 0.0985, + "num_input_tokens_seen": 54906640, + "step": 25430 + }, + { + "epoch": 4.149265905383361, + "grad_norm": 0.042890846729278564, + "learning_rate": 0.0009652374745284884, + "loss": 0.1573, + "num_input_tokens_seen": 54918192, + "step": 25435 + }, + { + "epoch": 4.150081566068516, + "grad_norm": 0.01304632518440485, + "learning_rate": 0.0009652113926931048, + "loss": 0.0912, + "num_input_tokens_seen": 54927536, + "step": 25440 + }, + { + "epoch": 4.150897226753671, + "grad_norm": 0.039793629199266434, + "learning_rate": 0.0009651853014296223, + "loss": 0.112, + "num_input_tokens_seen": 54938224, + "step": 25445 + }, + { + "epoch": 4.151712887438825, + "grad_norm": 0.07937805354595184, + "learning_rate": 0.0009651592007385694, + "loss": 0.0462, + "num_input_tokens_seen": 54949104, + "step": 25450 + }, + { + "epoch": 4.15252854812398, + "grad_norm": 0.014087907038629055, + "learning_rate": 0.0009651330906204752, + "loss": 0.0458, + "num_input_tokens_seen": 54960848, + "step": 25455 + }, + { + "epoch": 4.153344208809135, + "grad_norm": 0.14813610911369324, + "learning_rate": 0.0009651069710758689, + "loss": 0.148, + "num_input_tokens_seen": 54971600, + "step": 25460 + }, + { + "epoch": 4.154159869494291, + "grad_norm": 0.04198925569653511, + "learning_rate": 0.0009650808421052798, + "loss": 0.0317, + "num_input_tokens_seen": 54981936, + "step": 25465 + }, + { + "epoch": 4.1549755301794455, + "grad_norm": 0.19615806639194489, + "learning_rate": 0.0009650547037092374, + "loss": 0.078, + "num_input_tokens_seen": 54993552, + "step": 25470 + }, + { + "epoch": 4.1557911908646, + "grad_norm": 0.014402852393686771, + "learning_rate": 0.0009650285558882715, + "loss": 0.0781, + "num_input_tokens_seen": 55005200, + "step": 25475 + }, + { + "epoch": 4.156606851549755, + "grad_norm": 0.008503149263560772, + "learning_rate": 0.0009650023986429119, + "loss": 0.0146, + "num_input_tokens_seen": 55016464, + "step": 25480 + }, + { + "epoch": 4.15742251223491, + "grad_norm": 0.03202563151717186, + "learning_rate": 0.000964976231973689, + "loss": 0.1166, + "num_input_tokens_seen": 55026576, + "step": 25485 + }, + { + "epoch": 4.158238172920065, + "grad_norm": 0.014031129889190197, + "learning_rate": 0.0009649500558811328, + "loss": 0.0663, + "num_input_tokens_seen": 55037904, + "step": 25490 + }, + { + "epoch": 4.1590538336052205, + "grad_norm": 0.03630248084664345, + "learning_rate": 0.0009649238703657739, + "loss": 0.0657, + "num_input_tokens_seen": 55047408, + "step": 25495 + }, + { + "epoch": 4.159869494290375, + "grad_norm": 0.035577546805143356, + "learning_rate": 0.0009648976754281429, + "loss": 0.1454, + "num_input_tokens_seen": 55058064, + "step": 25500 + }, + { + "epoch": 4.16068515497553, + "grad_norm": 0.10374343395233154, + "learning_rate": 0.0009648714710687708, + "loss": 0.0668, + "num_input_tokens_seen": 55069008, + "step": 25505 + }, + { + "epoch": 4.161500815660685, + "grad_norm": 0.169823557138443, + "learning_rate": 0.0009648452572881885, + "loss": 0.1572, + "num_input_tokens_seen": 55079792, + "step": 25510 + }, + { + "epoch": 4.16231647634584, + "grad_norm": 0.1929391324520111, + "learning_rate": 0.0009648190340869274, + "loss": 0.1885, + "num_input_tokens_seen": 55090864, + "step": 25515 + }, + { + "epoch": 4.1631321370309955, + "grad_norm": 0.015418852679431438, + "learning_rate": 0.000964792801465519, + "loss": 0.0341, + "num_input_tokens_seen": 55101904, + "step": 25520 + }, + { + "epoch": 4.16394779771615, + "grad_norm": 0.16468164324760437, + "learning_rate": 0.0009647665594244947, + "loss": 0.074, + "num_input_tokens_seen": 55112656, + "step": 25525 + }, + { + "epoch": 4.164763458401305, + "grad_norm": 0.003919845446944237, + "learning_rate": 0.0009647403079643866, + "loss": 0.2233, + "num_input_tokens_seen": 55123024, + "step": 25530 + }, + { + "epoch": 4.16557911908646, + "grad_norm": 0.014560865238308907, + "learning_rate": 0.0009647140470857267, + "loss": 0.1385, + "num_input_tokens_seen": 55133136, + "step": 25535 + }, + { + "epoch": 4.166394779771615, + "grad_norm": 0.03865538164973259, + "learning_rate": 0.0009646877767890469, + "loss": 0.0393, + "num_input_tokens_seen": 55142864, + "step": 25540 + }, + { + "epoch": 4.16721044045677, + "grad_norm": 0.02826162613928318, + "learning_rate": 0.00096466149707488, + "loss": 0.1017, + "num_input_tokens_seen": 55152848, + "step": 25545 + }, + { + "epoch": 4.168026101141925, + "grad_norm": 0.045532457530498505, + "learning_rate": 0.0009646352079437582, + "loss": 0.0326, + "num_input_tokens_seen": 55162032, + "step": 25550 + }, + { + "epoch": 4.16884176182708, + "grad_norm": 0.10127101093530655, + "learning_rate": 0.0009646089093962145, + "loss": 0.1107, + "num_input_tokens_seen": 55173616, + "step": 25555 + }, + { + "epoch": 4.169657422512235, + "grad_norm": 0.24649570882320404, + "learning_rate": 0.0009645826014327819, + "loss": 0.079, + "num_input_tokens_seen": 55185328, + "step": 25560 + }, + { + "epoch": 4.17047308319739, + "grad_norm": 0.06951960921287537, + "learning_rate": 0.0009645562840539935, + "loss": 0.1098, + "num_input_tokens_seen": 55197072, + "step": 25565 + }, + { + "epoch": 4.171288743882545, + "grad_norm": 0.06736738979816437, + "learning_rate": 0.0009645299572603827, + "loss": 0.0234, + "num_input_tokens_seen": 55205936, + "step": 25570 + }, + { + "epoch": 4.1721044045677, + "grad_norm": 0.1484171450138092, + "learning_rate": 0.000964503621052483, + "loss": 0.0633, + "num_input_tokens_seen": 55216560, + "step": 25575 + }, + { + "epoch": 4.172920065252855, + "grad_norm": 0.03549177199602127, + "learning_rate": 0.0009644772754308281, + "loss": 0.053, + "num_input_tokens_seen": 55227504, + "step": 25580 + }, + { + "epoch": 4.17373572593801, + "grad_norm": 0.178375706076622, + "learning_rate": 0.0009644509203959522, + "loss": 0.1642, + "num_input_tokens_seen": 55238032, + "step": 25585 + }, + { + "epoch": 4.174551386623165, + "grad_norm": 0.009189301170408726, + "learning_rate": 0.0009644245559483891, + "loss": 0.1453, + "num_input_tokens_seen": 55248080, + "step": 25590 + }, + { + "epoch": 4.1753670473083195, + "grad_norm": 0.21092422306537628, + "learning_rate": 0.0009643981820886731, + "loss": 0.0835, + "num_input_tokens_seen": 55259760, + "step": 25595 + }, + { + "epoch": 4.176182707993474, + "grad_norm": 0.04772479459643364, + "learning_rate": 0.0009643717988173389, + "loss": 0.0233, + "num_input_tokens_seen": 55270128, + "step": 25600 + }, + { + "epoch": 4.17699836867863, + "grad_norm": 0.03510193154215813, + "learning_rate": 0.0009643454061349211, + "loss": 0.1661, + "num_input_tokens_seen": 55280592, + "step": 25605 + }, + { + "epoch": 4.177814029363785, + "grad_norm": 0.02469966560602188, + "learning_rate": 0.0009643190040419545, + "loss": 0.027, + "num_input_tokens_seen": 55291344, + "step": 25610 + }, + { + "epoch": 4.17862969004894, + "grad_norm": 0.17844711244106293, + "learning_rate": 0.0009642925925389743, + "loss": 0.0932, + "num_input_tokens_seen": 55301360, + "step": 25615 + }, + { + "epoch": 4.1794453507340945, + "grad_norm": 0.12669458985328674, + "learning_rate": 0.0009642661716265156, + "loss": 0.0667, + "num_input_tokens_seen": 55311696, + "step": 25620 + }, + { + "epoch": 4.180261011419249, + "grad_norm": 0.009872755967080593, + "learning_rate": 0.0009642397413051142, + "loss": 0.1215, + "num_input_tokens_seen": 55322704, + "step": 25625 + }, + { + "epoch": 4.181076672104404, + "grad_norm": 0.17198066413402557, + "learning_rate": 0.0009642133015753054, + "loss": 0.0887, + "num_input_tokens_seen": 55334704, + "step": 25630 + }, + { + "epoch": 4.18189233278956, + "grad_norm": 0.049541257321834564, + "learning_rate": 0.0009641868524376252, + "loss": 0.1258, + "num_input_tokens_seen": 55344912, + "step": 25635 + }, + { + "epoch": 4.182707993474715, + "grad_norm": 0.055650126188993454, + "learning_rate": 0.0009641603938926093, + "loss": 0.1217, + "num_input_tokens_seen": 55354832, + "step": 25640 + }, + { + "epoch": 4.1835236541598695, + "grad_norm": 0.06552504003047943, + "learning_rate": 0.0009641339259407946, + "loss": 0.1553, + "num_input_tokens_seen": 55365776, + "step": 25645 + }, + { + "epoch": 4.184339314845024, + "grad_norm": 0.013210811652243137, + "learning_rate": 0.0009641074485827168, + "loss": 0.2264, + "num_input_tokens_seen": 55376208, + "step": 25650 + }, + { + "epoch": 4.185154975530179, + "grad_norm": 0.11731768399477005, + "learning_rate": 0.0009640809618189129, + "loss": 0.1378, + "num_input_tokens_seen": 55386512, + "step": 25655 + }, + { + "epoch": 4.185970636215335, + "grad_norm": 0.026899442076683044, + "learning_rate": 0.0009640544656499197, + "loss": 0.0592, + "num_input_tokens_seen": 55397168, + "step": 25660 + }, + { + "epoch": 4.18678629690049, + "grad_norm": 0.009896726347506046, + "learning_rate": 0.0009640279600762738, + "loss": 0.1889, + "num_input_tokens_seen": 55408912, + "step": 25665 + }, + { + "epoch": 4.1876019575856445, + "grad_norm": 0.08339975029230118, + "learning_rate": 0.0009640014450985129, + "loss": 0.119, + "num_input_tokens_seen": 55418672, + "step": 25670 + }, + { + "epoch": 4.188417618270799, + "grad_norm": 0.11446402221918106, + "learning_rate": 0.0009639749207171739, + "loss": 0.05, + "num_input_tokens_seen": 55430768, + "step": 25675 + }, + { + "epoch": 4.189233278955954, + "grad_norm": 0.08358004689216614, + "learning_rate": 0.0009639483869327946, + "loss": 0.0978, + "num_input_tokens_seen": 55441232, + "step": 25680 + }, + { + "epoch": 4.190048939641109, + "grad_norm": 0.2137380987405777, + "learning_rate": 0.0009639218437459125, + "loss": 0.1178, + "num_input_tokens_seen": 55453040, + "step": 25685 + }, + { + "epoch": 4.190864600326265, + "grad_norm": 0.02541278302669525, + "learning_rate": 0.000963895291157066, + "loss": 0.1194, + "num_input_tokens_seen": 55462768, + "step": 25690 + }, + { + "epoch": 4.191680261011419, + "grad_norm": 0.019467975944280624, + "learning_rate": 0.0009638687291667927, + "loss": 0.0506, + "num_input_tokens_seen": 55474416, + "step": 25695 + }, + { + "epoch": 4.192495921696574, + "grad_norm": 0.03018118627369404, + "learning_rate": 0.0009638421577756313, + "loss": 0.06, + "num_input_tokens_seen": 55484272, + "step": 25700 + }, + { + "epoch": 4.193311582381729, + "grad_norm": 0.044861964881420135, + "learning_rate": 0.0009638155769841201, + "loss": 0.0461, + "num_input_tokens_seen": 55496336, + "step": 25705 + }, + { + "epoch": 4.194127243066884, + "grad_norm": 0.05079563707113266, + "learning_rate": 0.0009637889867927978, + "loss": 0.0333, + "num_input_tokens_seen": 55508656, + "step": 25710 + }, + { + "epoch": 4.19494290375204, + "grad_norm": 0.1449674367904663, + "learning_rate": 0.0009637623872022034, + "loss": 0.2086, + "num_input_tokens_seen": 55519664, + "step": 25715 + }, + { + "epoch": 4.195758564437194, + "grad_norm": 0.10629749298095703, + "learning_rate": 0.0009637357782128758, + "loss": 0.1126, + "num_input_tokens_seen": 55529744, + "step": 25720 + }, + { + "epoch": 4.196574225122349, + "grad_norm": 0.04819105938076973, + "learning_rate": 0.0009637091598253544, + "loss": 0.0343, + "num_input_tokens_seen": 55540400, + "step": 25725 + }, + { + "epoch": 4.197389885807504, + "grad_norm": 0.22623306512832642, + "learning_rate": 0.0009636825320401787, + "loss": 0.2085, + "num_input_tokens_seen": 55551312, + "step": 25730 + }, + { + "epoch": 4.198205546492659, + "grad_norm": 0.015361045487225056, + "learning_rate": 0.0009636558948578882, + "loss": 0.0295, + "num_input_tokens_seen": 55562448, + "step": 25735 + }, + { + "epoch": 4.199021207177814, + "grad_norm": 0.013365724124014378, + "learning_rate": 0.0009636292482790229, + "loss": 0.0763, + "num_input_tokens_seen": 55572848, + "step": 25740 + }, + { + "epoch": 4.199836867862969, + "grad_norm": 0.11657389253377914, + "learning_rate": 0.0009636025923041227, + "loss": 0.0806, + "num_input_tokens_seen": 55582800, + "step": 25745 + }, + { + "epoch": 4.200652528548124, + "grad_norm": 0.1552727222442627, + "learning_rate": 0.0009635759269337276, + "loss": 0.1255, + "num_input_tokens_seen": 55594096, + "step": 25750 + }, + { + "epoch": 4.201468189233279, + "grad_norm": 0.08266094326972961, + "learning_rate": 0.0009635492521683785, + "loss": 0.0689, + "num_input_tokens_seen": 55605456, + "step": 25755 + }, + { + "epoch": 4.202283849918434, + "grad_norm": 0.0039903996512293816, + "learning_rate": 0.0009635225680086157, + "loss": 0.0245, + "num_input_tokens_seen": 55617072, + "step": 25760 + }, + { + "epoch": 4.203099510603589, + "grad_norm": 0.03781895712018013, + "learning_rate": 0.00096349587445498, + "loss": 0.1977, + "num_input_tokens_seen": 55628176, + "step": 25765 + }, + { + "epoch": 4.2039151712887435, + "grad_norm": 0.052535369992256165, + "learning_rate": 0.0009634691715080124, + "loss": 0.054, + "num_input_tokens_seen": 55637744, + "step": 25770 + }, + { + "epoch": 4.204730831973899, + "grad_norm": 0.011141735129058361, + "learning_rate": 0.0009634424591682542, + "loss": 0.0183, + "num_input_tokens_seen": 55649104, + "step": 25775 + }, + { + "epoch": 4.205546492659054, + "grad_norm": 0.13149775564670563, + "learning_rate": 0.0009634157374362466, + "loss": 0.0764, + "num_input_tokens_seen": 55660272, + "step": 25780 + }, + { + "epoch": 4.206362153344209, + "grad_norm": 0.028822118416428566, + "learning_rate": 0.0009633890063125313, + "loss": 0.1029, + "num_input_tokens_seen": 55670864, + "step": 25785 + }, + { + "epoch": 4.207177814029364, + "grad_norm": 0.03968701511621475, + "learning_rate": 0.0009633622657976498, + "loss": 0.0821, + "num_input_tokens_seen": 55681456, + "step": 25790 + }, + { + "epoch": 4.2079934747145185, + "grad_norm": 0.07468824833631516, + "learning_rate": 0.0009633355158921441, + "loss": 0.0392, + "num_input_tokens_seen": 55692464, + "step": 25795 + }, + { + "epoch": 4.208809135399674, + "grad_norm": 0.04002116248011589, + "learning_rate": 0.0009633087565965564, + "loss": 0.1212, + "num_input_tokens_seen": 55702704, + "step": 25800 + }, + { + "epoch": 4.209624796084829, + "grad_norm": 0.008264587260782719, + "learning_rate": 0.0009632819879114291, + "loss": 0.1135, + "num_input_tokens_seen": 55713040, + "step": 25805 + }, + { + "epoch": 4.210440456769984, + "grad_norm": 0.05958470329642296, + "learning_rate": 0.0009632552098373045, + "loss": 0.046, + "num_input_tokens_seen": 55723920, + "step": 25810 + }, + { + "epoch": 4.211256117455139, + "grad_norm": 0.12449523061513901, + "learning_rate": 0.0009632284223747255, + "loss": 0.095, + "num_input_tokens_seen": 55735440, + "step": 25815 + }, + { + "epoch": 4.212071778140293, + "grad_norm": 0.12355613708496094, + "learning_rate": 0.0009632016255242348, + "loss": 0.0823, + "num_input_tokens_seen": 55745680, + "step": 25820 + }, + { + "epoch": 4.212887438825448, + "grad_norm": 0.15919852256774902, + "learning_rate": 0.0009631748192863756, + "loss": 0.1134, + "num_input_tokens_seen": 55757680, + "step": 25825 + }, + { + "epoch": 4.213703099510604, + "grad_norm": 0.2913621962070465, + "learning_rate": 0.0009631480036616911, + "loss": 0.1223, + "num_input_tokens_seen": 55767536, + "step": 25830 + }, + { + "epoch": 4.214518760195759, + "grad_norm": 0.04399906471371651, + "learning_rate": 0.0009631211786507248, + "loss": 0.0935, + "num_input_tokens_seen": 55778032, + "step": 25835 + }, + { + "epoch": 4.215334420880914, + "grad_norm": 0.016776829957962036, + "learning_rate": 0.0009630943442540202, + "loss": 0.0893, + "num_input_tokens_seen": 55790000, + "step": 25840 + }, + { + "epoch": 4.216150081566068, + "grad_norm": 0.009180238470435143, + "learning_rate": 0.0009630675004721212, + "loss": 0.0404, + "num_input_tokens_seen": 55800688, + "step": 25845 + }, + { + "epoch": 4.216965742251223, + "grad_norm": 0.03179603070020676, + "learning_rate": 0.000963040647305572, + "loss": 0.0415, + "num_input_tokens_seen": 55812208, + "step": 25850 + }, + { + "epoch": 4.217781402936378, + "grad_norm": 0.1224488839507103, + "learning_rate": 0.0009630137847549166, + "loss": 0.0557, + "num_input_tokens_seen": 55823120, + "step": 25855 + }, + { + "epoch": 4.218597063621534, + "grad_norm": 0.003707037540152669, + "learning_rate": 0.0009629869128206997, + "loss": 0.0572, + "num_input_tokens_seen": 55834416, + "step": 25860 + }, + { + "epoch": 4.219412724306689, + "grad_norm": 0.009411961771547794, + "learning_rate": 0.0009629600315034652, + "loss": 0.059, + "num_input_tokens_seen": 55846480, + "step": 25865 + }, + { + "epoch": 4.220228384991843, + "grad_norm": 0.2734367847442627, + "learning_rate": 0.0009629331408037588, + "loss": 0.1476, + "num_input_tokens_seen": 55857200, + "step": 25870 + }, + { + "epoch": 4.221044045676998, + "grad_norm": 0.0076523711904883385, + "learning_rate": 0.0009629062407221248, + "loss": 0.0394, + "num_input_tokens_seen": 55868976, + "step": 25875 + }, + { + "epoch": 4.221859706362153, + "grad_norm": 0.16045591235160828, + "learning_rate": 0.0009628793312591086, + "loss": 0.1019, + "num_input_tokens_seen": 55881360, + "step": 25880 + }, + { + "epoch": 4.222675367047309, + "grad_norm": 0.07055556774139404, + "learning_rate": 0.0009628524124152555, + "loss": 0.1088, + "num_input_tokens_seen": 55892208, + "step": 25885 + }, + { + "epoch": 4.2234910277324635, + "grad_norm": 0.002273108344525099, + "learning_rate": 0.0009628254841911113, + "loss": 0.0582, + "num_input_tokens_seen": 55901936, + "step": 25890 + }, + { + "epoch": 4.224306688417618, + "grad_norm": 0.06637419015169144, + "learning_rate": 0.0009627985465872214, + "loss": 0.0172, + "num_input_tokens_seen": 55912784, + "step": 25895 + }, + { + "epoch": 4.225122349102773, + "grad_norm": 0.20928704738616943, + "learning_rate": 0.0009627715996041319, + "loss": 0.2263, + "num_input_tokens_seen": 55924432, + "step": 25900 + }, + { + "epoch": 4.225938009787928, + "grad_norm": 0.005396584514528513, + "learning_rate": 0.0009627446432423888, + "loss": 0.0896, + "num_input_tokens_seen": 55935216, + "step": 25905 + }, + { + "epoch": 4.226753670473083, + "grad_norm": 0.006933948490768671, + "learning_rate": 0.0009627176775025385, + "loss": 0.1255, + "num_input_tokens_seen": 55945680, + "step": 25910 + }, + { + "epoch": 4.2275693311582385, + "grad_norm": 0.12000980228185654, + "learning_rate": 0.0009626907023851275, + "loss": 0.1505, + "num_input_tokens_seen": 55956496, + "step": 25915 + }, + { + "epoch": 4.228384991843393, + "grad_norm": 0.003090959507972002, + "learning_rate": 0.0009626637178907024, + "loss": 0.0286, + "num_input_tokens_seen": 55967280, + "step": 25920 + }, + { + "epoch": 4.229200652528548, + "grad_norm": 0.01359731424599886, + "learning_rate": 0.0009626367240198101, + "loss": 0.1214, + "num_input_tokens_seen": 55978160, + "step": 25925 + }, + { + "epoch": 4.230016313213703, + "grad_norm": 0.011455285362899303, + "learning_rate": 0.0009626097207729978, + "loss": 0.0702, + "num_input_tokens_seen": 55989808, + "step": 25930 + }, + { + "epoch": 4.230831973898858, + "grad_norm": 0.006472648121416569, + "learning_rate": 0.0009625827081508125, + "loss": 0.0928, + "num_input_tokens_seen": 56000944, + "step": 25935 + }, + { + "epoch": 4.231647634584013, + "grad_norm": 0.013199994340538979, + "learning_rate": 0.000962555686153802, + "loss": 0.0541, + "num_input_tokens_seen": 56011408, + "step": 25940 + }, + { + "epoch": 4.232463295269168, + "grad_norm": 0.23151026666164398, + "learning_rate": 0.0009625286547825136, + "loss": 0.0971, + "num_input_tokens_seen": 56020944, + "step": 25945 + }, + { + "epoch": 4.233278955954323, + "grad_norm": 0.012370586395263672, + "learning_rate": 0.0009625016140374952, + "loss": 0.0172, + "num_input_tokens_seen": 56031792, + "step": 25950 + }, + { + "epoch": 4.234094616639478, + "grad_norm": 0.006817667279392481, + "learning_rate": 0.0009624745639192949, + "loss": 0.0229, + "num_input_tokens_seen": 56041936, + "step": 25955 + }, + { + "epoch": 4.234910277324633, + "grad_norm": 0.01638965681195259, + "learning_rate": 0.0009624475044284609, + "loss": 0.0536, + "num_input_tokens_seen": 56052272, + "step": 25960 + }, + { + "epoch": 4.235725938009788, + "grad_norm": 0.0028721585404127836, + "learning_rate": 0.0009624204355655416, + "loss": 0.0109, + "num_input_tokens_seen": 56061808, + "step": 25965 + }, + { + "epoch": 4.236541598694943, + "grad_norm": 0.20426343381404877, + "learning_rate": 0.0009623933573310855, + "loss": 0.1429, + "num_input_tokens_seen": 56071536, + "step": 25970 + }, + { + "epoch": 4.237357259380098, + "grad_norm": 0.030120905488729477, + "learning_rate": 0.0009623662697256414, + "loss": 0.0808, + "num_input_tokens_seen": 56083568, + "step": 25975 + }, + { + "epoch": 4.238172920065253, + "grad_norm": 0.008668581023812294, + "learning_rate": 0.0009623391727497584, + "loss": 0.0644, + "num_input_tokens_seen": 56094000, + "step": 25980 + }, + { + "epoch": 4.238988580750408, + "grad_norm": 0.20718252658843994, + "learning_rate": 0.0009623120664039855, + "loss": 0.237, + "num_input_tokens_seen": 56103952, + "step": 25985 + }, + { + "epoch": 4.239804241435563, + "grad_norm": 0.04377641901373863, + "learning_rate": 0.000962284950688872, + "loss": 0.0309, + "num_input_tokens_seen": 56114224, + "step": 25990 + }, + { + "epoch": 4.240619902120717, + "grad_norm": 0.0023236190900206566, + "learning_rate": 0.0009622578256049675, + "loss": 0.0355, + "num_input_tokens_seen": 56124464, + "step": 25995 + }, + { + "epoch": 4.241435562805873, + "grad_norm": 0.13870258629322052, + "learning_rate": 0.0009622306911528219, + "loss": 0.0558, + "num_input_tokens_seen": 56135280, + "step": 26000 + }, + { + "epoch": 4.242251223491028, + "grad_norm": 0.02029184065759182, + "learning_rate": 0.0009622035473329848, + "loss": 0.059, + "num_input_tokens_seen": 56144976, + "step": 26005 + }, + { + "epoch": 4.243066884176183, + "grad_norm": 0.02982390858232975, + "learning_rate": 0.0009621763941460067, + "loss": 0.2243, + "num_input_tokens_seen": 56156688, + "step": 26010 + }, + { + "epoch": 4.2438825448613375, + "grad_norm": 0.04767904430627823, + "learning_rate": 0.0009621492315924375, + "loss": 0.0265, + "num_input_tokens_seen": 56167856, + "step": 26015 + }, + { + "epoch": 4.244698205546492, + "grad_norm": 0.06755167990922928, + "learning_rate": 0.0009621220596728278, + "loss": 0.0483, + "num_input_tokens_seen": 56178928, + "step": 26020 + }, + { + "epoch": 4.245513866231648, + "grad_norm": 0.024697525426745415, + "learning_rate": 0.0009620948783877285, + "loss": 0.0883, + "num_input_tokens_seen": 56189584, + "step": 26025 + }, + { + "epoch": 4.246329526916803, + "grad_norm": 0.0070642102509737015, + "learning_rate": 0.0009620676877376902, + "loss": 0.0823, + "num_input_tokens_seen": 56200080, + "step": 26030 + }, + { + "epoch": 4.247145187601958, + "grad_norm": 0.00401959428563714, + "learning_rate": 0.000962040487723264, + "loss": 0.1877, + "num_input_tokens_seen": 56210288, + "step": 26035 + }, + { + "epoch": 4.2479608482871125, + "grad_norm": 0.05826200917363167, + "learning_rate": 0.0009620132783450011, + "loss": 0.0782, + "num_input_tokens_seen": 56220976, + "step": 26040 + }, + { + "epoch": 4.248776508972267, + "grad_norm": 0.06947620958089828, + "learning_rate": 0.0009619860596034531, + "loss": 0.0656, + "num_input_tokens_seen": 56232176, + "step": 26045 + }, + { + "epoch": 4.249592169657422, + "grad_norm": 0.1631036102771759, + "learning_rate": 0.0009619588314991716, + "loss": 0.2692, + "num_input_tokens_seen": 56244400, + "step": 26050 + }, + { + "epoch": 4.250407830342578, + "grad_norm": 0.10844310373067856, + "learning_rate": 0.0009619315940327082, + "loss": 0.0342, + "num_input_tokens_seen": 56255952, + "step": 26055 + }, + { + "epoch": 4.251223491027733, + "grad_norm": 0.020276907831430435, + "learning_rate": 0.0009619043472046151, + "loss": 0.0864, + "num_input_tokens_seen": 56267760, + "step": 26060 + }, + { + "epoch": 4.2520391517128875, + "grad_norm": 0.1164427250623703, + "learning_rate": 0.0009618770910154444, + "loss": 0.044, + "num_input_tokens_seen": 56278800, + "step": 26065 + }, + { + "epoch": 4.252854812398042, + "grad_norm": 0.01006534043699503, + "learning_rate": 0.0009618498254657486, + "loss": 0.049, + "num_input_tokens_seen": 56289552, + "step": 26070 + }, + { + "epoch": 4.253670473083197, + "grad_norm": 0.0091670211404562, + "learning_rate": 0.00096182255055608, + "loss": 0.153, + "num_input_tokens_seen": 56301808, + "step": 26075 + }, + { + "epoch": 4.254486133768353, + "grad_norm": 0.1327732801437378, + "learning_rate": 0.0009617952662869918, + "loss": 0.1062, + "num_input_tokens_seen": 56311536, + "step": 26080 + }, + { + "epoch": 4.255301794453508, + "grad_norm": 0.013581754639744759, + "learning_rate": 0.0009617679726590366, + "loss": 0.0273, + "num_input_tokens_seen": 56323344, + "step": 26085 + }, + { + "epoch": 4.2561174551386625, + "grad_norm": 0.11445581167936325, + "learning_rate": 0.0009617406696727676, + "loss": 0.1674, + "num_input_tokens_seen": 56333904, + "step": 26090 + }, + { + "epoch": 4.256933115823817, + "grad_norm": 0.13308361172676086, + "learning_rate": 0.0009617133573287382, + "loss": 0.0717, + "num_input_tokens_seen": 56343312, + "step": 26095 + }, + { + "epoch": 4.257748776508972, + "grad_norm": 0.03449687734246254, + "learning_rate": 0.0009616860356275019, + "loss": 0.0732, + "num_input_tokens_seen": 56354960, + "step": 26100 + }, + { + "epoch": 4.258564437194127, + "grad_norm": 0.02992508001625538, + "learning_rate": 0.0009616587045696124, + "loss": 0.0548, + "num_input_tokens_seen": 56366032, + "step": 26105 + }, + { + "epoch": 4.259380097879283, + "grad_norm": 0.15899598598480225, + "learning_rate": 0.0009616313641556235, + "loss": 0.0772, + "num_input_tokens_seen": 56376496, + "step": 26110 + }, + { + "epoch": 4.260195758564437, + "grad_norm": 0.007667019031941891, + "learning_rate": 0.0009616040143860896, + "loss": 0.1172, + "num_input_tokens_seen": 56387408, + "step": 26115 + }, + { + "epoch": 4.261011419249592, + "grad_norm": 0.06802377104759216, + "learning_rate": 0.0009615766552615645, + "loss": 0.126, + "num_input_tokens_seen": 56397328, + "step": 26120 + }, + { + "epoch": 4.261827079934747, + "grad_norm": 0.01866072416305542, + "learning_rate": 0.0009615492867826032, + "loss": 0.2666, + "num_input_tokens_seen": 56406640, + "step": 26125 + }, + { + "epoch": 4.262642740619902, + "grad_norm": 0.1499709039926529, + "learning_rate": 0.00096152190894976, + "loss": 0.0607, + "num_input_tokens_seen": 56416400, + "step": 26130 + }, + { + "epoch": 4.263458401305057, + "grad_norm": 0.22221073508262634, + "learning_rate": 0.0009614945217635897, + "loss": 0.1237, + "num_input_tokens_seen": 56426320, + "step": 26135 + }, + { + "epoch": 4.264274061990212, + "grad_norm": 0.2991475760936737, + "learning_rate": 0.0009614671252246476, + "loss": 0.2886, + "num_input_tokens_seen": 56436400, + "step": 26140 + }, + { + "epoch": 4.265089722675367, + "grad_norm": 0.18414202332496643, + "learning_rate": 0.0009614397193334887, + "loss": 0.1688, + "num_input_tokens_seen": 56445712, + "step": 26145 + }, + { + "epoch": 4.265905383360522, + "grad_norm": 0.018621614202857018, + "learning_rate": 0.0009614123040906686, + "loss": 0.1006, + "num_input_tokens_seen": 56456720, + "step": 26150 + }, + { + "epoch": 4.266721044045677, + "grad_norm": 0.013576061464846134, + "learning_rate": 0.0009613848794967428, + "loss": 0.048, + "num_input_tokens_seen": 56468464, + "step": 26155 + }, + { + "epoch": 4.267536704730832, + "grad_norm": 0.19881682097911835, + "learning_rate": 0.0009613574455522671, + "loss": 0.2215, + "num_input_tokens_seen": 56478096, + "step": 26160 + }, + { + "epoch": 4.268352365415987, + "grad_norm": 0.10004175454378128, + "learning_rate": 0.0009613300022577974, + "loss": 0.2241, + "num_input_tokens_seen": 56489776, + "step": 26165 + }, + { + "epoch": 4.269168026101142, + "grad_norm": 0.0246064905077219, + "learning_rate": 0.00096130254961389, + "loss": 0.0853, + "num_input_tokens_seen": 56501232, + "step": 26170 + }, + { + "epoch": 4.269983686786297, + "grad_norm": 0.048825837671756744, + "learning_rate": 0.0009612750876211014, + "loss": 0.0593, + "num_input_tokens_seen": 56511664, + "step": 26175 + }, + { + "epoch": 4.270799347471452, + "grad_norm": 0.12497945129871368, + "learning_rate": 0.0009612476162799878, + "loss": 0.1469, + "num_input_tokens_seen": 56522032, + "step": 26180 + }, + { + "epoch": 4.271615008156607, + "grad_norm": 0.038051679730415344, + "learning_rate": 0.0009612201355911061, + "loss": 0.1926, + "num_input_tokens_seen": 56533008, + "step": 26185 + }, + { + "epoch": 4.2724306688417615, + "grad_norm": 0.10020732134580612, + "learning_rate": 0.0009611926455550135, + "loss": 0.0713, + "num_input_tokens_seen": 56544176, + "step": 26190 + }, + { + "epoch": 4.273246329526917, + "grad_norm": 0.12207670509815216, + "learning_rate": 0.0009611651461722666, + "loss": 0.2296, + "num_input_tokens_seen": 56555376, + "step": 26195 + }, + { + "epoch": 4.274061990212072, + "grad_norm": 0.02697013132274151, + "learning_rate": 0.0009611376374434231, + "loss": 0.0873, + "num_input_tokens_seen": 56566256, + "step": 26200 + }, + { + "epoch": 4.274877650897227, + "grad_norm": 0.12644600868225098, + "learning_rate": 0.0009611101193690403, + "loss": 0.1205, + "num_input_tokens_seen": 56576976, + "step": 26205 + }, + { + "epoch": 4.275693311582382, + "grad_norm": 0.022577185183763504, + "learning_rate": 0.0009610825919496761, + "loss": 0.1095, + "num_input_tokens_seen": 56587504, + "step": 26210 + }, + { + "epoch": 4.2765089722675365, + "grad_norm": 0.15265852212905884, + "learning_rate": 0.0009610550551858881, + "loss": 0.1106, + "num_input_tokens_seen": 56598800, + "step": 26215 + }, + { + "epoch": 4.277324632952691, + "grad_norm": 0.023819593712687492, + "learning_rate": 0.0009610275090782347, + "loss": 0.0896, + "num_input_tokens_seen": 56609584, + "step": 26220 + }, + { + "epoch": 4.278140293637847, + "grad_norm": 0.00973698403686285, + "learning_rate": 0.0009609999536272738, + "loss": 0.0336, + "num_input_tokens_seen": 56619984, + "step": 26225 + }, + { + "epoch": 4.278955954323002, + "grad_norm": 0.030250197276473045, + "learning_rate": 0.0009609723888335641, + "loss": 0.1416, + "num_input_tokens_seen": 56630128, + "step": 26230 + }, + { + "epoch": 4.279771615008157, + "grad_norm": 0.04252338781952858, + "learning_rate": 0.0009609448146976642, + "loss": 0.11, + "num_input_tokens_seen": 56641264, + "step": 26235 + }, + { + "epoch": 4.280587275693311, + "grad_norm": 0.03177511692047119, + "learning_rate": 0.0009609172312201328, + "loss": 0.0224, + "num_input_tokens_seen": 56652688, + "step": 26240 + }, + { + "epoch": 4.281402936378466, + "grad_norm": 0.1196862980723381, + "learning_rate": 0.000960889638401529, + "loss": 0.0988, + "num_input_tokens_seen": 56663600, + "step": 26245 + }, + { + "epoch": 4.282218597063622, + "grad_norm": 0.006066491827368736, + "learning_rate": 0.0009608620362424121, + "loss": 0.0301, + "num_input_tokens_seen": 56674544, + "step": 26250 + }, + { + "epoch": 4.283034257748777, + "grad_norm": 0.009370140731334686, + "learning_rate": 0.0009608344247433412, + "loss": 0.0249, + "num_input_tokens_seen": 56685680, + "step": 26255 + }, + { + "epoch": 4.283849918433932, + "grad_norm": 0.24224993586540222, + "learning_rate": 0.0009608068039048763, + "loss": 0.1325, + "num_input_tokens_seen": 56696752, + "step": 26260 + }, + { + "epoch": 4.284665579119086, + "grad_norm": 0.026159415021538734, + "learning_rate": 0.0009607791737275769, + "loss": 0.123, + "num_input_tokens_seen": 56707600, + "step": 26265 + }, + { + "epoch": 4.285481239804241, + "grad_norm": 0.040241800248622894, + "learning_rate": 0.0009607515342120028, + "loss": 0.1736, + "num_input_tokens_seen": 56718608, + "step": 26270 + }, + { + "epoch": 4.286296900489396, + "grad_norm": 0.012425195425748825, + "learning_rate": 0.0009607238853587144, + "loss": 0.1658, + "num_input_tokens_seen": 56729840, + "step": 26275 + }, + { + "epoch": 4.287112561174552, + "grad_norm": 0.02036895416676998, + "learning_rate": 0.0009606962271682722, + "loss": 0.1062, + "num_input_tokens_seen": 56739984, + "step": 26280 + }, + { + "epoch": 4.287928221859707, + "grad_norm": 0.004493687767535448, + "learning_rate": 0.0009606685596412364, + "loss": 0.2515, + "num_input_tokens_seen": 56751248, + "step": 26285 + }, + { + "epoch": 4.288743882544861, + "grad_norm": 0.02566436678171158, + "learning_rate": 0.0009606408827781679, + "loss": 0.1468, + "num_input_tokens_seen": 56762480, + "step": 26290 + }, + { + "epoch": 4.289559543230016, + "grad_norm": 0.1594172567129135, + "learning_rate": 0.0009606131965796274, + "loss": 0.1905, + "num_input_tokens_seen": 56772016, + "step": 26295 + }, + { + "epoch": 4.290375203915171, + "grad_norm": 0.10375330597162247, + "learning_rate": 0.0009605855010461761, + "loss": 0.0581, + "num_input_tokens_seen": 56781168, + "step": 26300 + }, + { + "epoch": 4.291190864600326, + "grad_norm": 0.06583525985479355, + "learning_rate": 0.0009605577961783756, + "loss": 0.0705, + "num_input_tokens_seen": 56792720, + "step": 26305 + }, + { + "epoch": 4.2920065252854815, + "grad_norm": 0.012155563570559025, + "learning_rate": 0.0009605300819767869, + "loss": 0.0994, + "num_input_tokens_seen": 56803856, + "step": 26310 + }, + { + "epoch": 4.292822185970636, + "grad_norm": 0.021923506632447243, + "learning_rate": 0.000960502358441972, + "loss": 0.0476, + "num_input_tokens_seen": 56814480, + "step": 26315 + }, + { + "epoch": 4.293637846655791, + "grad_norm": 0.018885834142565727, + "learning_rate": 0.0009604746255744925, + "loss": 0.1641, + "num_input_tokens_seen": 56824816, + "step": 26320 + }, + { + "epoch": 4.294453507340946, + "grad_norm": 0.027049990370869637, + "learning_rate": 0.0009604468833749105, + "loss": 0.1407, + "num_input_tokens_seen": 56834928, + "step": 26325 + }, + { + "epoch": 4.295269168026101, + "grad_norm": 0.019557593390345573, + "learning_rate": 0.0009604191318437885, + "loss": 0.0385, + "num_input_tokens_seen": 56845680, + "step": 26330 + }, + { + "epoch": 4.2960848287112565, + "grad_norm": 0.004493829794228077, + "learning_rate": 0.0009603913709816886, + "loss": 0.0613, + "num_input_tokens_seen": 56856464, + "step": 26335 + }, + { + "epoch": 4.296900489396411, + "grad_norm": 0.020050350576639175, + "learning_rate": 0.0009603636007891735, + "loss": 0.1433, + "num_input_tokens_seen": 56867920, + "step": 26340 + }, + { + "epoch": 4.297716150081566, + "grad_norm": 0.02558988146483898, + "learning_rate": 0.0009603358212668061, + "loss": 0.122, + "num_input_tokens_seen": 56878672, + "step": 26345 + }, + { + "epoch": 4.298531810766721, + "grad_norm": 0.018920155242085457, + "learning_rate": 0.0009603080324151492, + "loss": 0.0339, + "num_input_tokens_seen": 56889616, + "step": 26350 + }, + { + "epoch": 4.299347471451876, + "grad_norm": 0.009520800784230232, + "learning_rate": 0.0009602802342347661, + "loss": 0.0561, + "num_input_tokens_seen": 56899184, + "step": 26355 + }, + { + "epoch": 4.300163132137031, + "grad_norm": 0.018508197739720345, + "learning_rate": 0.0009602524267262203, + "loss": 0.1446, + "num_input_tokens_seen": 56910480, + "step": 26360 + }, + { + "epoch": 4.300978792822186, + "grad_norm": 0.023860251531004906, + "learning_rate": 0.0009602246098900749, + "loss": 0.2832, + "num_input_tokens_seen": 56921040, + "step": 26365 + }, + { + "epoch": 4.301794453507341, + "grad_norm": 0.05338333174586296, + "learning_rate": 0.0009601967837268941, + "loss": 0.1142, + "num_input_tokens_seen": 56931920, + "step": 26370 + }, + { + "epoch": 4.302610114192496, + "grad_norm": 0.22567149996757507, + "learning_rate": 0.0009601689482372417, + "loss": 0.095, + "num_input_tokens_seen": 56942960, + "step": 26375 + }, + { + "epoch": 4.303425774877651, + "grad_norm": 0.05263345688581467, + "learning_rate": 0.0009601411034216818, + "loss": 0.0734, + "num_input_tokens_seen": 56955728, + "step": 26380 + }, + { + "epoch": 4.304241435562806, + "grad_norm": 0.2601128816604614, + "learning_rate": 0.0009601132492807787, + "loss": 0.122, + "num_input_tokens_seen": 56966224, + "step": 26385 + }, + { + "epoch": 4.30505709624796, + "grad_norm": 0.21180546283721924, + "learning_rate": 0.000960085385815097, + "loss": 0.2399, + "num_input_tokens_seen": 56975440, + "step": 26390 + }, + { + "epoch": 4.305872756933116, + "grad_norm": 0.16523747146129608, + "learning_rate": 0.0009600575130252012, + "loss": 0.0863, + "num_input_tokens_seen": 56986416, + "step": 26395 + }, + { + "epoch": 4.306688417618271, + "grad_norm": 0.06301485747098923, + "learning_rate": 0.0009600296309116563, + "loss": 0.0841, + "num_input_tokens_seen": 56997392, + "step": 26400 + }, + { + "epoch": 4.307504078303426, + "grad_norm": 0.08565749228000641, + "learning_rate": 0.0009600017394750274, + "loss": 0.1044, + "num_input_tokens_seen": 57008144, + "step": 26405 + }, + { + "epoch": 4.308319738988581, + "grad_norm": 0.019167514517903328, + "learning_rate": 0.0009599738387158794, + "loss": 0.0994, + "num_input_tokens_seen": 57019472, + "step": 26410 + }, + { + "epoch": 4.309135399673735, + "grad_norm": 0.01404428482055664, + "learning_rate": 0.0009599459286347783, + "loss": 0.1542, + "num_input_tokens_seen": 57029136, + "step": 26415 + }, + { + "epoch": 4.309951060358891, + "grad_norm": 0.11114760488271713, + "learning_rate": 0.0009599180092322894, + "loss": 0.0776, + "num_input_tokens_seen": 57039504, + "step": 26420 + }, + { + "epoch": 4.310766721044046, + "grad_norm": 0.11486387997865677, + "learning_rate": 0.0009598900805089786, + "loss": 0.1442, + "num_input_tokens_seen": 57050064, + "step": 26425 + }, + { + "epoch": 4.311582381729201, + "grad_norm": 0.2429983764886856, + "learning_rate": 0.0009598621424654119, + "loss": 0.0819, + "num_input_tokens_seen": 57061232, + "step": 26430 + }, + { + "epoch": 4.3123980424143555, + "grad_norm": 0.017433280125260353, + "learning_rate": 0.0009598341951021557, + "loss": 0.1261, + "num_input_tokens_seen": 57072016, + "step": 26435 + }, + { + "epoch": 4.31321370309951, + "grad_norm": 0.14125719666481018, + "learning_rate": 0.0009598062384197759, + "loss": 0.0848, + "num_input_tokens_seen": 57082576, + "step": 26440 + }, + { + "epoch": 4.314029363784665, + "grad_norm": 0.037804123014211655, + "learning_rate": 0.0009597782724188395, + "loss": 0.0622, + "num_input_tokens_seen": 57093712, + "step": 26445 + }, + { + "epoch": 4.314845024469821, + "grad_norm": 0.0939275398850441, + "learning_rate": 0.0009597502970999132, + "loss": 0.0991, + "num_input_tokens_seen": 57104432, + "step": 26450 + }, + { + "epoch": 4.315660685154976, + "grad_norm": 0.01788988895714283, + "learning_rate": 0.0009597223124635639, + "loss": 0.0349, + "num_input_tokens_seen": 57114576, + "step": 26455 + }, + { + "epoch": 4.3164763458401305, + "grad_norm": 0.23379552364349365, + "learning_rate": 0.0009596943185103586, + "loss": 0.1832, + "num_input_tokens_seen": 57125744, + "step": 26460 + }, + { + "epoch": 4.317292006525285, + "grad_norm": 0.018022971227765083, + "learning_rate": 0.0009596663152408648, + "loss": 0.1155, + "num_input_tokens_seen": 57136784, + "step": 26465 + }, + { + "epoch": 4.31810766721044, + "grad_norm": 0.052554283291101456, + "learning_rate": 0.0009596383026556501, + "loss": 0.0276, + "num_input_tokens_seen": 57147888, + "step": 26470 + }, + { + "epoch": 4.318923327895595, + "grad_norm": 0.004686639178544283, + "learning_rate": 0.000959610280755282, + "loss": 0.0615, + "num_input_tokens_seen": 57158704, + "step": 26475 + }, + { + "epoch": 4.319738988580751, + "grad_norm": 0.0030810674652457237, + "learning_rate": 0.0009595822495403286, + "loss": 0.0817, + "num_input_tokens_seen": 57170096, + "step": 26480 + }, + { + "epoch": 4.3205546492659055, + "grad_norm": 0.06349651515483856, + "learning_rate": 0.0009595542090113579, + "loss": 0.0825, + "num_input_tokens_seen": 57181008, + "step": 26485 + }, + { + "epoch": 4.32137030995106, + "grad_norm": 0.06017550453543663, + "learning_rate": 0.0009595261591689381, + "loss": 0.1398, + "num_input_tokens_seen": 57192752, + "step": 26490 + }, + { + "epoch": 4.322185970636215, + "grad_norm": 0.042055677622556686, + "learning_rate": 0.0009594981000136377, + "loss": 0.1526, + "num_input_tokens_seen": 57204976, + "step": 26495 + }, + { + "epoch": 4.32300163132137, + "grad_norm": 0.20365913212299347, + "learning_rate": 0.0009594700315460254, + "loss": 0.1739, + "num_input_tokens_seen": 57216592, + "step": 26500 + }, + { + "epoch": 4.323817292006526, + "grad_norm": 0.0897751972079277, + "learning_rate": 0.0009594419537666701, + "loss": 0.0807, + "num_input_tokens_seen": 57227568, + "step": 26505 + }, + { + "epoch": 4.3246329526916805, + "grad_norm": 0.20568089187145233, + "learning_rate": 0.0009594138666761407, + "loss": 0.0922, + "num_input_tokens_seen": 57238096, + "step": 26510 + }, + { + "epoch": 4.325448613376835, + "grad_norm": 0.01806853897869587, + "learning_rate": 0.0009593857702750065, + "loss": 0.055, + "num_input_tokens_seen": 57250416, + "step": 26515 + }, + { + "epoch": 4.32626427406199, + "grad_norm": 0.017707910388708115, + "learning_rate": 0.0009593576645638369, + "loss": 0.0468, + "num_input_tokens_seen": 57261008, + "step": 26520 + }, + { + "epoch": 4.327079934747145, + "grad_norm": 0.1735449880361557, + "learning_rate": 0.0009593295495432015, + "loss": 0.1473, + "num_input_tokens_seen": 57271344, + "step": 26525 + }, + { + "epoch": 4.327895595432301, + "grad_norm": 0.060285355895757675, + "learning_rate": 0.00095930142521367, + "loss": 0.0248, + "num_input_tokens_seen": 57280752, + "step": 26530 + }, + { + "epoch": 4.328711256117455, + "grad_norm": 0.07333345711231232, + "learning_rate": 0.0009592732915758127, + "loss": 0.1568, + "num_input_tokens_seen": 57290736, + "step": 26535 + }, + { + "epoch": 4.32952691680261, + "grad_norm": 0.06021733954548836, + "learning_rate": 0.0009592451486301991, + "loss": 0.0521, + "num_input_tokens_seen": 57301680, + "step": 26540 + }, + { + "epoch": 4.330342577487765, + "grad_norm": 0.011063139885663986, + "learning_rate": 0.0009592169963774004, + "loss": 0.0281, + "num_input_tokens_seen": 57313200, + "step": 26545 + }, + { + "epoch": 4.33115823817292, + "grad_norm": 0.08653924614191055, + "learning_rate": 0.0009591888348179865, + "loss": 0.1192, + "num_input_tokens_seen": 57323856, + "step": 26550 + }, + { + "epoch": 4.331973898858075, + "grad_norm": 0.032046619802713394, + "learning_rate": 0.0009591606639525283, + "loss": 0.0396, + "num_input_tokens_seen": 57334768, + "step": 26555 + }, + { + "epoch": 4.33278955954323, + "grad_norm": 0.018364960327744484, + "learning_rate": 0.0009591324837815969, + "loss": 0.0959, + "num_input_tokens_seen": 57344720, + "step": 26560 + }, + { + "epoch": 4.333605220228385, + "grad_norm": 0.04332400858402252, + "learning_rate": 0.0009591042943057631, + "loss": 0.1281, + "num_input_tokens_seen": 57356784, + "step": 26565 + }, + { + "epoch": 4.33442088091354, + "grad_norm": 0.06763613969087601, + "learning_rate": 0.0009590760955255985, + "loss": 0.0787, + "num_input_tokens_seen": 57367184, + "step": 26570 + }, + { + "epoch": 4.335236541598695, + "grad_norm": 0.08104817569255829, + "learning_rate": 0.0009590478874416744, + "loss": 0.2076, + "num_input_tokens_seen": 57377840, + "step": 26575 + }, + { + "epoch": 4.33605220228385, + "grad_norm": 0.00793997012078762, + "learning_rate": 0.0009590196700545626, + "loss": 0.0777, + "num_input_tokens_seen": 57389232, + "step": 26580 + }, + { + "epoch": 4.3368678629690045, + "grad_norm": 0.07168380916118622, + "learning_rate": 0.0009589914433648347, + "loss": 0.0439, + "num_input_tokens_seen": 57400176, + "step": 26585 + }, + { + "epoch": 4.33768352365416, + "grad_norm": 0.009159411303699017, + "learning_rate": 0.000958963207373063, + "loss": 0.1061, + "num_input_tokens_seen": 57410768, + "step": 26590 + }, + { + "epoch": 4.338499184339315, + "grad_norm": 0.18028071522712708, + "learning_rate": 0.0009589349620798197, + "loss": 0.145, + "num_input_tokens_seen": 57421680, + "step": 26595 + }, + { + "epoch": 4.33931484502447, + "grad_norm": 0.22142666578292847, + "learning_rate": 0.0009589067074856772, + "loss": 0.1123, + "num_input_tokens_seen": 57431600, + "step": 26600 + }, + { + "epoch": 4.340130505709625, + "grad_norm": 0.012987782247364521, + "learning_rate": 0.0009588784435912082, + "loss": 0.0263, + "num_input_tokens_seen": 57442640, + "step": 26605 + }, + { + "epoch": 4.3409461663947795, + "grad_norm": 0.0645018145442009, + "learning_rate": 0.0009588501703969852, + "loss": 0.0439, + "num_input_tokens_seen": 57453072, + "step": 26610 + }, + { + "epoch": 4.341761827079935, + "grad_norm": 0.025148050859570503, + "learning_rate": 0.0009588218879035815, + "loss": 0.0683, + "num_input_tokens_seen": 57463792, + "step": 26615 + }, + { + "epoch": 4.34257748776509, + "grad_norm": 0.07497743517160416, + "learning_rate": 0.0009587935961115701, + "loss": 0.0723, + "num_input_tokens_seen": 57474448, + "step": 26620 + }, + { + "epoch": 4.343393148450245, + "grad_norm": 0.1542753428220749, + "learning_rate": 0.0009587652950215247, + "loss": 0.0753, + "num_input_tokens_seen": 57485904, + "step": 26625 + }, + { + "epoch": 4.3442088091354, + "grad_norm": 0.08636511117219925, + "learning_rate": 0.0009587369846340184, + "loss": 0.1518, + "num_input_tokens_seen": 57497008, + "step": 26630 + }, + { + "epoch": 4.3450244698205545, + "grad_norm": 0.018439868465065956, + "learning_rate": 0.000958708664949625, + "loss": 0.0778, + "num_input_tokens_seen": 57507920, + "step": 26635 + }, + { + "epoch": 4.345840130505709, + "grad_norm": 0.07731824368238449, + "learning_rate": 0.0009586803359689189, + "loss": 0.1032, + "num_input_tokens_seen": 57518832, + "step": 26640 + }, + { + "epoch": 4.346655791190865, + "grad_norm": 0.00699506513774395, + "learning_rate": 0.0009586519976924739, + "loss": 0.0776, + "num_input_tokens_seen": 57529808, + "step": 26645 + }, + { + "epoch": 4.34747145187602, + "grad_norm": 0.18085676431655884, + "learning_rate": 0.0009586236501208642, + "loss": 0.0311, + "num_input_tokens_seen": 57540944, + "step": 26650 + }, + { + "epoch": 4.348287112561175, + "grad_norm": 0.06599736213684082, + "learning_rate": 0.0009585952932546644, + "loss": 0.1783, + "num_input_tokens_seen": 57552176, + "step": 26655 + }, + { + "epoch": 4.349102773246329, + "grad_norm": 0.029392439872026443, + "learning_rate": 0.0009585669270944493, + "loss": 0.0383, + "num_input_tokens_seen": 57563184, + "step": 26660 + }, + { + "epoch": 4.349918433931484, + "grad_norm": 0.07563918083906174, + "learning_rate": 0.0009585385516407936, + "loss": 0.2018, + "num_input_tokens_seen": 57574320, + "step": 26665 + }, + { + "epoch": 4.350734094616639, + "grad_norm": 0.03973830118775368, + "learning_rate": 0.0009585101668942726, + "loss": 0.055, + "num_input_tokens_seen": 57584944, + "step": 26670 + }, + { + "epoch": 4.351549755301795, + "grad_norm": 0.1499049812555313, + "learning_rate": 0.0009584817728554613, + "loss": 0.054, + "num_input_tokens_seen": 57595600, + "step": 26675 + }, + { + "epoch": 4.35236541598695, + "grad_norm": 0.3218221366405487, + "learning_rate": 0.0009584533695249353, + "loss": 0.24, + "num_input_tokens_seen": 57607472, + "step": 26680 + }, + { + "epoch": 4.353181076672104, + "grad_norm": 0.10885684192180634, + "learning_rate": 0.0009584249569032701, + "loss": 0.1108, + "num_input_tokens_seen": 57618800, + "step": 26685 + }, + { + "epoch": 4.353996737357259, + "grad_norm": 0.0633697360754013, + "learning_rate": 0.0009583965349910417, + "loss": 0.035, + "num_input_tokens_seen": 57629040, + "step": 26690 + }, + { + "epoch": 4.354812398042414, + "grad_norm": 0.01026102527976036, + "learning_rate": 0.0009583681037888259, + "loss": 0.1452, + "num_input_tokens_seen": 57639728, + "step": 26695 + }, + { + "epoch": 4.35562805872757, + "grad_norm": 0.1466255486011505, + "learning_rate": 0.0009583396632971991, + "loss": 0.1577, + "num_input_tokens_seen": 57650512, + "step": 26700 + }, + { + "epoch": 4.356443719412725, + "grad_norm": 0.16360533237457275, + "learning_rate": 0.0009583112135167376, + "loss": 0.2071, + "num_input_tokens_seen": 57662096, + "step": 26705 + }, + { + "epoch": 4.357259380097879, + "grad_norm": 0.04229990765452385, + "learning_rate": 0.0009582827544480177, + "loss": 0.1554, + "num_input_tokens_seen": 57672080, + "step": 26710 + }, + { + "epoch": 4.358075040783034, + "grad_norm": 0.07549943774938583, + "learning_rate": 0.0009582542860916166, + "loss": 0.2296, + "num_input_tokens_seen": 57682704, + "step": 26715 + }, + { + "epoch": 4.358890701468189, + "grad_norm": 0.16673016548156738, + "learning_rate": 0.000958225808448111, + "loss": 0.1441, + "num_input_tokens_seen": 57693520, + "step": 26720 + }, + { + "epoch": 4.359706362153344, + "grad_norm": 0.12198888510465622, + "learning_rate": 0.0009581973215180782, + "loss": 0.1257, + "num_input_tokens_seen": 57703312, + "step": 26725 + }, + { + "epoch": 4.3605220228384995, + "grad_norm": 0.10937940329313278, + "learning_rate": 0.0009581688253020952, + "loss": 0.0671, + "num_input_tokens_seen": 57714672, + "step": 26730 + }, + { + "epoch": 4.361337683523654, + "grad_norm": 0.009474528022110462, + "learning_rate": 0.00095814031980074, + "loss": 0.0498, + "num_input_tokens_seen": 57724464, + "step": 26735 + }, + { + "epoch": 4.362153344208809, + "grad_norm": 0.10020558536052704, + "learning_rate": 0.0009581118050145898, + "loss": 0.051, + "num_input_tokens_seen": 57735856, + "step": 26740 + }, + { + "epoch": 4.362969004893964, + "grad_norm": 0.009490148164331913, + "learning_rate": 0.0009580832809442228, + "loss": 0.1179, + "num_input_tokens_seen": 57747440, + "step": 26745 + }, + { + "epoch": 4.363784665579119, + "grad_norm": 0.025863057002425194, + "learning_rate": 0.000958054747590217, + "loss": 0.0491, + "num_input_tokens_seen": 57757168, + "step": 26750 + }, + { + "epoch": 4.364600326264274, + "grad_norm": 0.018802035599946976, + "learning_rate": 0.0009580262049531508, + "loss": 0.0377, + "num_input_tokens_seen": 57768112, + "step": 26755 + }, + { + "epoch": 4.365415986949429, + "grad_norm": 0.13142266869544983, + "learning_rate": 0.0009579976530336023, + "loss": 0.1316, + "num_input_tokens_seen": 57778480, + "step": 26760 + }, + { + "epoch": 4.366231647634584, + "grad_norm": 0.003970426507294178, + "learning_rate": 0.0009579690918321504, + "loss": 0.0406, + "num_input_tokens_seen": 57789328, + "step": 26765 + }, + { + "epoch": 4.367047308319739, + "grad_norm": 0.011634787544608116, + "learning_rate": 0.0009579405213493739, + "loss": 0.1208, + "num_input_tokens_seen": 57800112, + "step": 26770 + }, + { + "epoch": 4.367862969004894, + "grad_norm": 0.016473164781928062, + "learning_rate": 0.0009579119415858518, + "loss": 0.1869, + "num_input_tokens_seen": 57810064, + "step": 26775 + }, + { + "epoch": 4.368678629690049, + "grad_norm": 0.06714841723442078, + "learning_rate": 0.0009578833525421633, + "loss": 0.0443, + "num_input_tokens_seen": 57821680, + "step": 26780 + }, + { + "epoch": 4.369494290375204, + "grad_norm": 0.13360939919948578, + "learning_rate": 0.0009578547542188878, + "loss": 0.0881, + "num_input_tokens_seen": 57832496, + "step": 26785 + }, + { + "epoch": 4.370309951060359, + "grad_norm": 0.06591303646564484, + "learning_rate": 0.0009578261466166049, + "loss": 0.1292, + "num_input_tokens_seen": 57842768, + "step": 26790 + }, + { + "epoch": 4.371125611745514, + "grad_norm": 0.1026320680975914, + "learning_rate": 0.0009577975297358943, + "loss": 0.2409, + "num_input_tokens_seen": 57853008, + "step": 26795 + }, + { + "epoch": 4.371941272430669, + "grad_norm": 0.17219308018684387, + "learning_rate": 0.0009577689035773359, + "loss": 0.1595, + "num_input_tokens_seen": 57863312, + "step": 26800 + }, + { + "epoch": 4.372756933115824, + "grad_norm": 0.07794303447008133, + "learning_rate": 0.0009577402681415102, + "loss": 0.0586, + "num_input_tokens_seen": 57873552, + "step": 26805 + }, + { + "epoch": 4.373572593800978, + "grad_norm": 0.22757485508918762, + "learning_rate": 0.0009577116234289971, + "loss": 0.2511, + "num_input_tokens_seen": 57884336, + "step": 26810 + }, + { + "epoch": 4.374388254486134, + "grad_norm": 0.12817223370075226, + "learning_rate": 0.0009576829694403772, + "loss": 0.127, + "num_input_tokens_seen": 57894512, + "step": 26815 + }, + { + "epoch": 4.375203915171289, + "grad_norm": 0.004768801853060722, + "learning_rate": 0.0009576543061762315, + "loss": 0.12, + "num_input_tokens_seen": 57906576, + "step": 26820 + }, + { + "epoch": 4.376019575856444, + "grad_norm": 0.05570979788899422, + "learning_rate": 0.0009576256336371407, + "loss": 0.0796, + "num_input_tokens_seen": 57916816, + "step": 26825 + }, + { + "epoch": 4.376835236541599, + "grad_norm": 0.02097289450466633, + "learning_rate": 0.0009575969518236857, + "loss": 0.1375, + "num_input_tokens_seen": 57927088, + "step": 26830 + }, + { + "epoch": 4.377650897226753, + "grad_norm": 0.017572136595845222, + "learning_rate": 0.0009575682607364482, + "loss": 0.0571, + "num_input_tokens_seen": 57939120, + "step": 26835 + }, + { + "epoch": 4.378466557911908, + "grad_norm": 0.09586196392774582, + "learning_rate": 0.0009575395603760095, + "loss": 0.1178, + "num_input_tokens_seen": 57949264, + "step": 26840 + }, + { + "epoch": 4.379282218597064, + "grad_norm": 0.03536884859204292, + "learning_rate": 0.000957510850742951, + "loss": 0.1053, + "num_input_tokens_seen": 57961328, + "step": 26845 + }, + { + "epoch": 4.380097879282219, + "grad_norm": 0.05623923987150192, + "learning_rate": 0.0009574821318378547, + "loss": 0.1877, + "num_input_tokens_seen": 57972816, + "step": 26850 + }, + { + "epoch": 4.3809135399673735, + "grad_norm": 0.05989392474293709, + "learning_rate": 0.0009574534036613028, + "loss": 0.0625, + "num_input_tokens_seen": 57983440, + "step": 26855 + }, + { + "epoch": 4.381729200652528, + "grad_norm": 0.13224764168262482, + "learning_rate": 0.0009574246662138772, + "loss": 0.2092, + "num_input_tokens_seen": 57994384, + "step": 26860 + }, + { + "epoch": 4.382544861337683, + "grad_norm": 0.06683975458145142, + "learning_rate": 0.0009573959194961604, + "loss": 0.0388, + "num_input_tokens_seen": 58006192, + "step": 26865 + }, + { + "epoch": 4.383360522022839, + "grad_norm": 0.026770124211907387, + "learning_rate": 0.0009573671635087352, + "loss": 0.0297, + "num_input_tokens_seen": 58016784, + "step": 26870 + }, + { + "epoch": 4.384176182707994, + "grad_norm": 0.06765373051166534, + "learning_rate": 0.0009573383982521841, + "loss": 0.0558, + "num_input_tokens_seen": 58026736, + "step": 26875 + }, + { + "epoch": 4.3849918433931485, + "grad_norm": 0.006693508476018906, + "learning_rate": 0.0009573096237270903, + "loss": 0.1076, + "num_input_tokens_seen": 58037200, + "step": 26880 + }, + { + "epoch": 4.385807504078303, + "grad_norm": 0.018703769892454147, + "learning_rate": 0.0009572808399340368, + "loss": 0.0645, + "num_input_tokens_seen": 58049200, + "step": 26885 + }, + { + "epoch": 4.386623164763458, + "grad_norm": 0.04542679712176323, + "learning_rate": 0.000957252046873607, + "loss": 0.0428, + "num_input_tokens_seen": 58060144, + "step": 26890 + }, + { + "epoch": 4.387438825448613, + "grad_norm": 0.04463881626725197, + "learning_rate": 0.0009572232445463843, + "loss": 0.1766, + "num_input_tokens_seen": 58071600, + "step": 26895 + }, + { + "epoch": 4.388254486133769, + "grad_norm": 0.0192339438945055, + "learning_rate": 0.0009571944329529526, + "loss": 0.1873, + "num_input_tokens_seen": 58081616, + "step": 26900 + }, + { + "epoch": 4.3890701468189235, + "grad_norm": 0.027581289410591125, + "learning_rate": 0.0009571656120938956, + "loss": 0.0746, + "num_input_tokens_seen": 58092976, + "step": 26905 + }, + { + "epoch": 4.389885807504078, + "grad_norm": 0.024506481364369392, + "learning_rate": 0.0009571367819697978, + "loss": 0.075, + "num_input_tokens_seen": 58103536, + "step": 26910 + }, + { + "epoch": 4.390701468189233, + "grad_norm": 0.07000896334648132, + "learning_rate": 0.000957107942581243, + "loss": 0.0506, + "num_input_tokens_seen": 58115504, + "step": 26915 + }, + { + "epoch": 4.391517128874388, + "grad_norm": 0.017548924311995506, + "learning_rate": 0.0009570790939288159, + "loss": 0.0736, + "num_input_tokens_seen": 58125648, + "step": 26920 + }, + { + "epoch": 4.392332789559543, + "grad_norm": 0.30422908067703247, + "learning_rate": 0.0009570502360131011, + "loss": 0.1087, + "num_input_tokens_seen": 58136816, + "step": 26925 + }, + { + "epoch": 4.3931484502446985, + "grad_norm": 0.13867206871509552, + "learning_rate": 0.0009570213688346833, + "loss": 0.2371, + "num_input_tokens_seen": 58146832, + "step": 26930 + }, + { + "epoch": 4.393964110929853, + "grad_norm": 0.08178117871284485, + "learning_rate": 0.000956992492394148, + "loss": 0.2029, + "num_input_tokens_seen": 58156912, + "step": 26935 + }, + { + "epoch": 4.394779771615008, + "grad_norm": 0.03900282829999924, + "learning_rate": 0.00095696360669208, + "loss": 0.0644, + "num_input_tokens_seen": 58167856, + "step": 26940 + }, + { + "epoch": 4.395595432300163, + "grad_norm": 0.1432703286409378, + "learning_rate": 0.0009569347117290647, + "loss": 0.1842, + "num_input_tokens_seen": 58178928, + "step": 26945 + }, + { + "epoch": 4.396411092985318, + "grad_norm": 0.017272207885980606, + "learning_rate": 0.0009569058075056878, + "loss": 0.0509, + "num_input_tokens_seen": 58187888, + "step": 26950 + }, + { + "epoch": 4.397226753670473, + "grad_norm": 0.04967975988984108, + "learning_rate": 0.0009568768940225352, + "loss": 0.1031, + "num_input_tokens_seen": 58198160, + "step": 26955 + }, + { + "epoch": 4.398042414355628, + "grad_norm": 0.03481179103255272, + "learning_rate": 0.0009568479712801926, + "loss": 0.0438, + "num_input_tokens_seen": 58209808, + "step": 26960 + }, + { + "epoch": 4.398858075040783, + "grad_norm": 0.18396709859371185, + "learning_rate": 0.0009568190392792464, + "loss": 0.0846, + "num_input_tokens_seen": 58221104, + "step": 26965 + }, + { + "epoch": 4.399673735725938, + "grad_norm": 0.11623002588748932, + "learning_rate": 0.000956790098020283, + "loss": 0.1206, + "num_input_tokens_seen": 58230832, + "step": 26970 + }, + { + "epoch": 4.400489396411093, + "grad_norm": 0.1606121063232422, + "learning_rate": 0.0009567611475038886, + "loss": 0.1366, + "num_input_tokens_seen": 58241328, + "step": 26975 + }, + { + "epoch": 4.401305057096248, + "grad_norm": 0.011269223876297474, + "learning_rate": 0.0009567321877306501, + "loss": 0.0189, + "num_input_tokens_seen": 58252720, + "step": 26980 + }, + { + "epoch": 4.402120717781403, + "grad_norm": 0.1008668765425682, + "learning_rate": 0.0009567032187011546, + "loss": 0.0661, + "num_input_tokens_seen": 58263216, + "step": 26985 + }, + { + "epoch": 4.402936378466558, + "grad_norm": 0.021822618320584297, + "learning_rate": 0.0009566742404159887, + "loss": 0.0245, + "num_input_tokens_seen": 58274320, + "step": 26990 + }, + { + "epoch": 4.403752039151713, + "grad_norm": 0.023007918149232864, + "learning_rate": 0.0009566452528757402, + "loss": 0.2035, + "num_input_tokens_seen": 58285392, + "step": 26995 + }, + { + "epoch": 4.404567699836868, + "grad_norm": 0.14816312491893768, + "learning_rate": 0.0009566162560809963, + "loss": 0.1996, + "num_input_tokens_seen": 58295728, + "step": 27000 + }, + { + "epoch": 4.4053833605220225, + "grad_norm": 0.006734701804816723, + "learning_rate": 0.0009565872500323447, + "loss": 0.1109, + "num_input_tokens_seen": 58306704, + "step": 27005 + }, + { + "epoch": 4.406199021207178, + "grad_norm": 0.04996083304286003, + "learning_rate": 0.0009565582347303733, + "loss": 0.1033, + "num_input_tokens_seen": 58318288, + "step": 27010 + }, + { + "epoch": 4.407014681892333, + "grad_norm": 0.1626897156238556, + "learning_rate": 0.00095652921017567, + "loss": 0.1519, + "num_input_tokens_seen": 58330416, + "step": 27015 + }, + { + "epoch": 4.407830342577488, + "grad_norm": 0.1686783879995346, + "learning_rate": 0.0009565001763688233, + "loss": 0.098, + "num_input_tokens_seen": 58341456, + "step": 27020 + }, + { + "epoch": 4.408646003262643, + "grad_norm": 0.1627761423587799, + "learning_rate": 0.0009564711333104213, + "loss": 0.1787, + "num_input_tokens_seen": 58351568, + "step": 27025 + }, + { + "epoch": 4.4094616639477975, + "grad_norm": 0.14542949199676514, + "learning_rate": 0.0009564420810010526, + "loss": 0.1611, + "num_input_tokens_seen": 58361968, + "step": 27030 + }, + { + "epoch": 4.410277324632952, + "grad_norm": 0.058514758944511414, + "learning_rate": 0.0009564130194413061, + "loss": 0.1152, + "num_input_tokens_seen": 58373200, + "step": 27035 + }, + { + "epoch": 4.411092985318108, + "grad_norm": 0.049681421369314194, + "learning_rate": 0.0009563839486317709, + "loss": 0.1067, + "num_input_tokens_seen": 58383632, + "step": 27040 + }, + { + "epoch": 4.411908646003263, + "grad_norm": 0.14893564581871033, + "learning_rate": 0.000956354868573036, + "loss": 0.1778, + "num_input_tokens_seen": 58394768, + "step": 27045 + }, + { + "epoch": 4.412724306688418, + "grad_norm": 0.2130400836467743, + "learning_rate": 0.0009563257792656908, + "loss": 0.0809, + "num_input_tokens_seen": 58403856, + "step": 27050 + }, + { + "epoch": 4.4135399673735725, + "grad_norm": 0.18294550478458405, + "learning_rate": 0.0009562966807103246, + "loss": 0.1714, + "num_input_tokens_seen": 58415152, + "step": 27055 + }, + { + "epoch": 4.414355628058727, + "grad_norm": 0.10248050093650818, + "learning_rate": 0.0009562675729075274, + "loss": 0.0753, + "num_input_tokens_seen": 58425584, + "step": 27060 + }, + { + "epoch": 4.415171288743883, + "grad_norm": 0.020018069073557854, + "learning_rate": 0.0009562384558578891, + "loss": 0.0353, + "num_input_tokens_seen": 58435088, + "step": 27065 + }, + { + "epoch": 4.415986949429038, + "grad_norm": 0.04110151529312134, + "learning_rate": 0.0009562093295619996, + "loss": 0.0683, + "num_input_tokens_seen": 58447088, + "step": 27070 + }, + { + "epoch": 4.416802610114193, + "grad_norm": 0.22743521630764008, + "learning_rate": 0.0009561801940204493, + "loss": 0.1746, + "num_input_tokens_seen": 58458928, + "step": 27075 + }, + { + "epoch": 4.417618270799347, + "grad_norm": 0.05669097974896431, + "learning_rate": 0.0009561510492338287, + "loss": 0.1259, + "num_input_tokens_seen": 58470224, + "step": 27080 + }, + { + "epoch": 4.418433931484502, + "grad_norm": 0.052969805896282196, + "learning_rate": 0.0009561218952027286, + "loss": 0.0303, + "num_input_tokens_seen": 58480816, + "step": 27085 + }, + { + "epoch": 4.419249592169657, + "grad_norm": 0.23036614060401917, + "learning_rate": 0.0009560927319277395, + "loss": 0.2636, + "num_input_tokens_seen": 58492528, + "step": 27090 + }, + { + "epoch": 4.420065252854813, + "grad_norm": 0.030631111934781075, + "learning_rate": 0.0009560635594094524, + "loss": 0.0484, + "num_input_tokens_seen": 58502640, + "step": 27095 + }, + { + "epoch": 4.420880913539968, + "grad_norm": 0.035295210778713226, + "learning_rate": 0.000956034377648459, + "loss": 0.0569, + "num_input_tokens_seen": 58514064, + "step": 27100 + }, + { + "epoch": 4.421696574225122, + "grad_norm": 0.05358738824725151, + "learning_rate": 0.0009560051866453503, + "loss": 0.0349, + "num_input_tokens_seen": 58526032, + "step": 27105 + }, + { + "epoch": 4.422512234910277, + "grad_norm": 0.056481026113033295, + "learning_rate": 0.000955975986400718, + "loss": 0.0233, + "num_input_tokens_seen": 58537904, + "step": 27110 + }, + { + "epoch": 4.423327895595432, + "grad_norm": 0.11713006347417831, + "learning_rate": 0.000955946776915154, + "loss": 0.2203, + "num_input_tokens_seen": 58548208, + "step": 27115 + }, + { + "epoch": 4.424143556280587, + "grad_norm": 0.03838816657662392, + "learning_rate": 0.00095591755818925, + "loss": 0.1433, + "num_input_tokens_seen": 58559568, + "step": 27120 + }, + { + "epoch": 4.424959216965743, + "grad_norm": 0.020105112344026566, + "learning_rate": 0.0009558883302235984, + "loss": 0.0983, + "num_input_tokens_seen": 58570576, + "step": 27125 + }, + { + "epoch": 4.425774877650897, + "grad_norm": 0.060005463659763336, + "learning_rate": 0.0009558590930187913, + "loss": 0.0552, + "num_input_tokens_seen": 58581680, + "step": 27130 + }, + { + "epoch": 4.426590538336052, + "grad_norm": 0.02040231041610241, + "learning_rate": 0.0009558298465754216, + "loss": 0.0648, + "num_input_tokens_seen": 58593040, + "step": 27135 + }, + { + "epoch": 4.427406199021207, + "grad_norm": 0.00639024144038558, + "learning_rate": 0.0009558005908940816, + "loss": 0.0292, + "num_input_tokens_seen": 58603728, + "step": 27140 + }, + { + "epoch": 4.428221859706362, + "grad_norm": 0.02192430943250656, + "learning_rate": 0.0009557713259753647, + "loss": 0.0694, + "num_input_tokens_seen": 58614640, + "step": 27145 + }, + { + "epoch": 4.4290375203915175, + "grad_norm": 0.01600003056228161, + "learning_rate": 0.0009557420518198634, + "loss": 0.0518, + "num_input_tokens_seen": 58623536, + "step": 27150 + }, + { + "epoch": 4.429853181076672, + "grad_norm": 0.03760291635990143, + "learning_rate": 0.0009557127684281714, + "loss": 0.0833, + "num_input_tokens_seen": 58633648, + "step": 27155 + }, + { + "epoch": 4.430668841761827, + "grad_norm": 0.007039864081889391, + "learning_rate": 0.000955683475800882, + "loss": 0.0616, + "num_input_tokens_seen": 58644624, + "step": 27160 + }, + { + "epoch": 4.431484502446982, + "grad_norm": 0.0096416175365448, + "learning_rate": 0.0009556541739385889, + "loss": 0.0259, + "num_input_tokens_seen": 58655344, + "step": 27165 + }, + { + "epoch": 4.432300163132137, + "grad_norm": 0.08751514554023743, + "learning_rate": 0.000955624862841886, + "loss": 0.0343, + "num_input_tokens_seen": 58666448, + "step": 27170 + }, + { + "epoch": 4.433115823817292, + "grad_norm": 0.007941437885165215, + "learning_rate": 0.0009555955425113672, + "loss": 0.041, + "num_input_tokens_seen": 58678672, + "step": 27175 + }, + { + "epoch": 4.433931484502447, + "grad_norm": 0.03917887061834335, + "learning_rate": 0.0009555662129476266, + "loss": 0.0857, + "num_input_tokens_seen": 58690672, + "step": 27180 + }, + { + "epoch": 4.434747145187602, + "grad_norm": 0.21245108544826508, + "learning_rate": 0.0009555368741512589, + "loss": 0.1097, + "num_input_tokens_seen": 58702320, + "step": 27185 + }, + { + "epoch": 4.435562805872757, + "grad_norm": 0.008150649257004261, + "learning_rate": 0.0009555075261228586, + "loss": 0.1077, + "num_input_tokens_seen": 58714192, + "step": 27190 + }, + { + "epoch": 4.436378466557912, + "grad_norm": 0.006952659692615271, + "learning_rate": 0.0009554781688630204, + "loss": 0.082, + "num_input_tokens_seen": 58725488, + "step": 27195 + }, + { + "epoch": 4.437194127243067, + "grad_norm": 0.182399183511734, + "learning_rate": 0.0009554488023723394, + "loss": 0.1231, + "num_input_tokens_seen": 58736208, + "step": 27200 + }, + { + "epoch": 4.438009787928221, + "grad_norm": 0.013045022264122963, + "learning_rate": 0.0009554194266514105, + "loss": 0.0481, + "num_input_tokens_seen": 58746608, + "step": 27205 + }, + { + "epoch": 4.438825448613377, + "grad_norm": 0.06938508152961731, + "learning_rate": 0.0009553900417008292, + "loss": 0.1279, + "num_input_tokens_seen": 58756880, + "step": 27210 + }, + { + "epoch": 4.439641109298532, + "grad_norm": 0.016153326258063316, + "learning_rate": 0.000955360647521191, + "loss": 0.0158, + "num_input_tokens_seen": 58768080, + "step": 27215 + }, + { + "epoch": 4.440456769983687, + "grad_norm": 0.1726226955652237, + "learning_rate": 0.0009553312441130916, + "loss": 0.134, + "num_input_tokens_seen": 58778896, + "step": 27220 + }, + { + "epoch": 4.441272430668842, + "grad_norm": 0.029812565073370934, + "learning_rate": 0.0009553018314771269, + "loss": 0.2004, + "num_input_tokens_seen": 58790032, + "step": 27225 + }, + { + "epoch": 4.442088091353996, + "grad_norm": 0.08932312577962875, + "learning_rate": 0.0009552724096138931, + "loss": 0.0542, + "num_input_tokens_seen": 58800880, + "step": 27230 + }, + { + "epoch": 4.442903752039152, + "grad_norm": 0.004273497499525547, + "learning_rate": 0.0009552429785239863, + "loss": 0.0333, + "num_input_tokens_seen": 58811760, + "step": 27235 + }, + { + "epoch": 4.443719412724307, + "grad_norm": 0.07288570702075958, + "learning_rate": 0.0009552135382080029, + "loss": 0.1397, + "num_input_tokens_seen": 58821808, + "step": 27240 + }, + { + "epoch": 4.444535073409462, + "grad_norm": 0.06535542756319046, + "learning_rate": 0.0009551840886665398, + "loss": 0.1769, + "num_input_tokens_seen": 58832080, + "step": 27245 + }, + { + "epoch": 4.445350734094617, + "grad_norm": 0.023757988587021828, + "learning_rate": 0.0009551546299001938, + "loss": 0.0263, + "num_input_tokens_seen": 58842480, + "step": 27250 + }, + { + "epoch": 4.446166394779771, + "grad_norm": 0.10377778112888336, + "learning_rate": 0.0009551251619095616, + "loss": 0.2313, + "num_input_tokens_seen": 58852752, + "step": 27255 + }, + { + "epoch": 4.446982055464926, + "grad_norm": 0.013002262450754642, + "learning_rate": 0.0009550956846952408, + "loss": 0.0457, + "num_input_tokens_seen": 58863312, + "step": 27260 + }, + { + "epoch": 4.447797716150082, + "grad_norm": 0.05241988226771355, + "learning_rate": 0.0009550661982578286, + "loss": 0.0732, + "num_input_tokens_seen": 58873104, + "step": 27265 + }, + { + "epoch": 4.448613376835237, + "grad_norm": 0.16758869588375092, + "learning_rate": 0.0009550367025979225, + "loss": 0.0599, + "num_input_tokens_seen": 58884624, + "step": 27270 + }, + { + "epoch": 4.4494290375203915, + "grad_norm": 0.08910631388425827, + "learning_rate": 0.0009550071977161203, + "loss": 0.0456, + "num_input_tokens_seen": 58894864, + "step": 27275 + }, + { + "epoch": 4.450244698205546, + "grad_norm": 0.11660542339086533, + "learning_rate": 0.0009549776836130202, + "loss": 0.0836, + "num_input_tokens_seen": 58906512, + "step": 27280 + }, + { + "epoch": 4.451060358890701, + "grad_norm": 0.008070001378655434, + "learning_rate": 0.0009549481602892201, + "loss": 0.0724, + "num_input_tokens_seen": 58917904, + "step": 27285 + }, + { + "epoch": 4.451876019575856, + "grad_norm": 0.01978311501443386, + "learning_rate": 0.0009549186277453184, + "loss": 0.0503, + "num_input_tokens_seen": 58928304, + "step": 27290 + }, + { + "epoch": 4.452691680261012, + "grad_norm": 0.007964332588016987, + "learning_rate": 0.0009548890859819138, + "loss": 0.0271, + "num_input_tokens_seen": 58939056, + "step": 27295 + }, + { + "epoch": 4.4535073409461665, + "grad_norm": 0.021110136061906815, + "learning_rate": 0.0009548595349996045, + "loss": 0.1347, + "num_input_tokens_seen": 58950384, + "step": 27300 + }, + { + "epoch": 4.454323001631321, + "grad_norm": 0.0071739936247467995, + "learning_rate": 0.0009548299747989897, + "loss": 0.0489, + "num_input_tokens_seen": 58959952, + "step": 27305 + }, + { + "epoch": 4.455138662316476, + "grad_norm": 0.12284361571073532, + "learning_rate": 0.0009548004053806686, + "loss": 0.0734, + "num_input_tokens_seen": 58970288, + "step": 27310 + }, + { + "epoch": 4.455954323001631, + "grad_norm": 0.012956147082149982, + "learning_rate": 0.0009547708267452403, + "loss": 0.1106, + "num_input_tokens_seen": 58981936, + "step": 27315 + }, + { + "epoch": 4.456769983686787, + "grad_norm": 0.0019957309123128653, + "learning_rate": 0.0009547412388933042, + "loss": 0.0591, + "num_input_tokens_seen": 58992432, + "step": 27320 + }, + { + "epoch": 4.4575856443719415, + "grad_norm": 0.056910980492830276, + "learning_rate": 0.0009547116418254601, + "loss": 0.0218, + "num_input_tokens_seen": 59002832, + "step": 27325 + }, + { + "epoch": 4.458401305057096, + "grad_norm": 0.09180209785699844, + "learning_rate": 0.0009546820355423077, + "loss": 0.0371, + "num_input_tokens_seen": 59013936, + "step": 27330 + }, + { + "epoch": 4.459216965742251, + "grad_norm": 0.039425503462553024, + "learning_rate": 0.0009546524200444471, + "loss": 0.0331, + "num_input_tokens_seen": 59023440, + "step": 27335 + }, + { + "epoch": 4.460032626427406, + "grad_norm": 0.1096813827753067, + "learning_rate": 0.0009546227953324784, + "loss": 0.1264, + "num_input_tokens_seen": 59034224, + "step": 27340 + }, + { + "epoch": 4.460848287112561, + "grad_norm": 0.13769815862178802, + "learning_rate": 0.000954593161407002, + "loss": 0.113, + "num_input_tokens_seen": 59044016, + "step": 27345 + }, + { + "epoch": 4.4616639477977165, + "grad_norm": 0.01868092268705368, + "learning_rate": 0.0009545635182686185, + "loss": 0.0201, + "num_input_tokens_seen": 59053648, + "step": 27350 + }, + { + "epoch": 4.462479608482871, + "grad_norm": 0.08464818447828293, + "learning_rate": 0.0009545338659179286, + "loss": 0.0969, + "num_input_tokens_seen": 59064080, + "step": 27355 + }, + { + "epoch": 4.463295269168026, + "grad_norm": 0.13541734218597412, + "learning_rate": 0.0009545042043555334, + "loss": 0.2122, + "num_input_tokens_seen": 59074672, + "step": 27360 + }, + { + "epoch": 4.464110929853181, + "grad_norm": 0.06018571928143501, + "learning_rate": 0.000954474533582034, + "loss": 0.1358, + "num_input_tokens_seen": 59084784, + "step": 27365 + }, + { + "epoch": 4.464926590538336, + "grad_norm": 0.039212699979543686, + "learning_rate": 0.0009544448535980315, + "loss": 0.0195, + "num_input_tokens_seen": 59095504, + "step": 27370 + }, + { + "epoch": 4.465742251223491, + "grad_norm": 0.0034587730187922716, + "learning_rate": 0.0009544151644041275, + "loss": 0.017, + "num_input_tokens_seen": 59105616, + "step": 27375 + }, + { + "epoch": 4.466557911908646, + "grad_norm": 0.4014672338962555, + "learning_rate": 0.0009543854660009237, + "loss": 0.1806, + "num_input_tokens_seen": 59114576, + "step": 27380 + }, + { + "epoch": 4.467373572593801, + "grad_norm": 0.01155854668468237, + "learning_rate": 0.0009543557583890221, + "loss": 0.1539, + "num_input_tokens_seen": 59124976, + "step": 27385 + }, + { + "epoch": 4.468189233278956, + "grad_norm": 0.03605454042553902, + "learning_rate": 0.0009543260415690247, + "loss": 0.0797, + "num_input_tokens_seen": 59136336, + "step": 27390 + }, + { + "epoch": 4.469004893964111, + "grad_norm": 0.025336695834994316, + "learning_rate": 0.0009542963155415336, + "loss": 0.083, + "num_input_tokens_seen": 59146960, + "step": 27395 + }, + { + "epoch": 4.4698205546492655, + "grad_norm": 0.1162446066737175, + "learning_rate": 0.0009542665803071515, + "loss": 0.0272, + "num_input_tokens_seen": 59158096, + "step": 27400 + }, + { + "epoch": 4.470636215334421, + "grad_norm": 0.006321317981928587, + "learning_rate": 0.0009542368358664806, + "loss": 0.0402, + "num_input_tokens_seen": 59168912, + "step": 27405 + }, + { + "epoch": 4.471451876019576, + "grad_norm": 0.0048429640009999275, + "learning_rate": 0.0009542070822201244, + "loss": 0.0863, + "num_input_tokens_seen": 59179728, + "step": 27410 + }, + { + "epoch": 4.472267536704731, + "grad_norm": 0.18373055756092072, + "learning_rate": 0.0009541773193686851, + "loss": 0.1749, + "num_input_tokens_seen": 59191312, + "step": 27415 + }, + { + "epoch": 4.473083197389886, + "grad_norm": 0.016804339364171028, + "learning_rate": 0.0009541475473127664, + "loss": 0.0215, + "num_input_tokens_seen": 59202352, + "step": 27420 + }, + { + "epoch": 4.4738988580750405, + "grad_norm": 0.01077316328883171, + "learning_rate": 0.0009541177660529715, + "loss": 0.0318, + "num_input_tokens_seen": 59212560, + "step": 27425 + }, + { + "epoch": 4.474714518760196, + "grad_norm": 0.013437003828585148, + "learning_rate": 0.0009540879755899041, + "loss": 0.0902, + "num_input_tokens_seen": 59221968, + "step": 27430 + }, + { + "epoch": 4.475530179445351, + "grad_norm": 0.01018616184592247, + "learning_rate": 0.0009540581759241676, + "loss": 0.1503, + "num_input_tokens_seen": 59232848, + "step": 27435 + }, + { + "epoch": 4.476345840130506, + "grad_norm": 0.009301213547587395, + "learning_rate": 0.0009540283670563663, + "loss": 0.1013, + "num_input_tokens_seen": 59243984, + "step": 27440 + }, + { + "epoch": 4.477161500815661, + "grad_norm": 0.028417224064469337, + "learning_rate": 0.0009539985489871041, + "loss": 0.1697, + "num_input_tokens_seen": 59253840, + "step": 27445 + }, + { + "epoch": 4.4779771615008155, + "grad_norm": 0.007942571304738522, + "learning_rate": 0.0009539687217169855, + "loss": 0.0384, + "num_input_tokens_seen": 59264400, + "step": 27450 + }, + { + "epoch": 4.47879282218597, + "grad_norm": 0.17409901320934296, + "learning_rate": 0.0009539388852466146, + "loss": 0.2406, + "num_input_tokens_seen": 59275728, + "step": 27455 + }, + { + "epoch": 4.479608482871126, + "grad_norm": 0.013572280295193195, + "learning_rate": 0.0009539090395765966, + "loss": 0.0376, + "num_input_tokens_seen": 59286896, + "step": 27460 + }, + { + "epoch": 4.480424143556281, + "grad_norm": 0.013725821860134602, + "learning_rate": 0.000953879184707536, + "loss": 0.0582, + "num_input_tokens_seen": 59297808, + "step": 27465 + }, + { + "epoch": 4.481239804241436, + "grad_norm": 0.011211387813091278, + "learning_rate": 0.0009538493206400378, + "loss": 0.0377, + "num_input_tokens_seen": 59308752, + "step": 27470 + }, + { + "epoch": 4.4820554649265905, + "grad_norm": 0.10982939600944519, + "learning_rate": 0.0009538194473747077, + "loss": 0.0883, + "num_input_tokens_seen": 59318320, + "step": 27475 + }, + { + "epoch": 4.482871125611745, + "grad_norm": 0.06800361722707748, + "learning_rate": 0.0009537895649121504, + "loss": 0.1242, + "num_input_tokens_seen": 59328464, + "step": 27480 + }, + { + "epoch": 4.4836867862969, + "grad_norm": 0.04133123531937599, + "learning_rate": 0.0009537596732529721, + "loss": 0.0621, + "num_input_tokens_seen": 59338000, + "step": 27485 + }, + { + "epoch": 4.484502446982056, + "grad_norm": 0.13754095137119293, + "learning_rate": 0.0009537297723977784, + "loss": 0.1288, + "num_input_tokens_seen": 59347792, + "step": 27490 + }, + { + "epoch": 4.485318107667211, + "grad_norm": 0.030860092490911484, + "learning_rate": 0.0009536998623471752, + "loss": 0.1151, + "num_input_tokens_seen": 59357648, + "step": 27495 + }, + { + "epoch": 4.486133768352365, + "grad_norm": 0.004192314576357603, + "learning_rate": 0.0009536699431017688, + "loss": 0.062, + "num_input_tokens_seen": 59367760, + "step": 27500 + }, + { + "epoch": 4.48694942903752, + "grad_norm": 0.1000770702958107, + "learning_rate": 0.0009536400146621653, + "loss": 0.0607, + "num_input_tokens_seen": 59378416, + "step": 27505 + }, + { + "epoch": 4.487765089722675, + "grad_norm": 0.14327333867549896, + "learning_rate": 0.0009536100770289717, + "loss": 0.0628, + "num_input_tokens_seen": 59388944, + "step": 27510 + }, + { + "epoch": 4.488580750407831, + "grad_norm": 0.0850549265742302, + "learning_rate": 0.0009535801302027942, + "loss": 0.0903, + "num_input_tokens_seen": 59397776, + "step": 27515 + }, + { + "epoch": 4.489396411092986, + "grad_norm": 0.09549807012081146, + "learning_rate": 0.0009535501741842401, + "loss": 0.0191, + "num_input_tokens_seen": 59408464, + "step": 27520 + }, + { + "epoch": 4.49021207177814, + "grad_norm": 0.007356693036854267, + "learning_rate": 0.0009535202089739162, + "loss": 0.2035, + "num_input_tokens_seen": 59419888, + "step": 27525 + }, + { + "epoch": 4.491027732463295, + "grad_norm": 0.028307851403951645, + "learning_rate": 0.0009534902345724301, + "loss": 0.0966, + "num_input_tokens_seen": 59429712, + "step": 27530 + }, + { + "epoch": 4.49184339314845, + "grad_norm": 0.012953936122357845, + "learning_rate": 0.000953460250980389, + "loss": 0.0284, + "num_input_tokens_seen": 59440688, + "step": 27535 + }, + { + "epoch": 4.492659053833605, + "grad_norm": 0.0412299819290638, + "learning_rate": 0.0009534302581984007, + "loss": 0.0653, + "num_input_tokens_seen": 59451120, + "step": 27540 + }, + { + "epoch": 4.493474714518761, + "grad_norm": 0.011484194546937943, + "learning_rate": 0.000953400256227073, + "loss": 0.066, + "num_input_tokens_seen": 59461776, + "step": 27545 + }, + { + "epoch": 4.494290375203915, + "grad_norm": 0.030943267047405243, + "learning_rate": 0.0009533702450670138, + "loss": 0.2721, + "num_input_tokens_seen": 59472464, + "step": 27550 + }, + { + "epoch": 4.49510603588907, + "grad_norm": 0.11319664865732193, + "learning_rate": 0.0009533402247188317, + "loss": 0.2176, + "num_input_tokens_seen": 59482704, + "step": 27555 + }, + { + "epoch": 4.495921696574225, + "grad_norm": 0.09330317378044128, + "learning_rate": 0.0009533101951831347, + "loss": 0.1293, + "num_input_tokens_seen": 59492976, + "step": 27560 + }, + { + "epoch": 4.49673735725938, + "grad_norm": 0.16171112656593323, + "learning_rate": 0.0009532801564605315, + "loss": 0.0896, + "num_input_tokens_seen": 59503344, + "step": 27565 + }, + { + "epoch": 4.497553017944535, + "grad_norm": 0.05448443442583084, + "learning_rate": 0.000953250108551631, + "loss": 0.0195, + "num_input_tokens_seen": 59514896, + "step": 27570 + }, + { + "epoch": 4.49836867862969, + "grad_norm": 0.12921537458896637, + "learning_rate": 0.0009532200514570419, + "loss": 0.2943, + "num_input_tokens_seen": 59527408, + "step": 27575 + }, + { + "epoch": 4.499184339314845, + "grad_norm": 0.13408558070659637, + "learning_rate": 0.0009531899851773737, + "loss": 0.1589, + "num_input_tokens_seen": 59537520, + "step": 27580 + }, + { + "epoch": 4.5, + "grad_norm": 0.061856500804424286, + "learning_rate": 0.0009531599097132354, + "loss": 0.1906, + "num_input_tokens_seen": 59549072, + "step": 27585 + }, + { + "epoch": 4.500815660685155, + "grad_norm": 0.023835081607103348, + "learning_rate": 0.0009531298250652367, + "loss": 0.0306, + "num_input_tokens_seen": 59559824, + "step": 27590 + }, + { + "epoch": 4.50163132137031, + "grad_norm": 0.03424694016575813, + "learning_rate": 0.0009530997312339873, + "loss": 0.1089, + "num_input_tokens_seen": 59569776, + "step": 27595 + }, + { + "epoch": 4.502446982055465, + "grad_norm": 0.04408469423651695, + "learning_rate": 0.000953069628220097, + "loss": 0.0696, + "num_input_tokens_seen": 59579184, + "step": 27600 + }, + { + "epoch": 4.50326264274062, + "grad_norm": 0.015978412702679634, + "learning_rate": 0.0009530395160241759, + "loss": 0.0524, + "num_input_tokens_seen": 59590416, + "step": 27605 + }, + { + "epoch": 4.504078303425775, + "grad_norm": 0.00949131976813078, + "learning_rate": 0.0009530093946468343, + "loss": 0.091, + "num_input_tokens_seen": 59601136, + "step": 27610 + }, + { + "epoch": 4.50489396411093, + "grad_norm": 0.136776864528656, + "learning_rate": 0.0009529792640886827, + "loss": 0.1631, + "num_input_tokens_seen": 59612816, + "step": 27615 + }, + { + "epoch": 4.505709624796085, + "grad_norm": 0.060061562806367874, + "learning_rate": 0.0009529491243503316, + "loss": 0.0901, + "num_input_tokens_seen": 59624080, + "step": 27620 + }, + { + "epoch": 4.506525285481239, + "grad_norm": 0.1110927164554596, + "learning_rate": 0.000952918975432392, + "loss": 0.0698, + "num_input_tokens_seen": 59634608, + "step": 27625 + }, + { + "epoch": 4.507340946166395, + "grad_norm": 0.14070548117160797, + "learning_rate": 0.0009528888173354746, + "loss": 0.0745, + "num_input_tokens_seen": 59645200, + "step": 27630 + }, + { + "epoch": 4.50815660685155, + "grad_norm": 0.009473079815506935, + "learning_rate": 0.000952858650060191, + "loss": 0.0757, + "num_input_tokens_seen": 59654736, + "step": 27635 + }, + { + "epoch": 4.508972267536705, + "grad_norm": 0.0065461150370538235, + "learning_rate": 0.0009528284736071522, + "loss": 0.086, + "num_input_tokens_seen": 59664624, + "step": 27640 + }, + { + "epoch": 4.50978792822186, + "grad_norm": 0.007517965976148844, + "learning_rate": 0.00095279828797697, + "loss": 0.1293, + "num_input_tokens_seen": 59674480, + "step": 27645 + }, + { + "epoch": 4.510603588907014, + "grad_norm": 0.16662810742855072, + "learning_rate": 0.000952768093170256, + "loss": 0.0647, + "num_input_tokens_seen": 59685392, + "step": 27650 + }, + { + "epoch": 4.511419249592169, + "grad_norm": 0.01383263524621725, + "learning_rate": 0.0009527378891876223, + "loss": 0.0279, + "num_input_tokens_seen": 59695280, + "step": 27655 + }, + { + "epoch": 4.512234910277325, + "grad_norm": 0.019782204180955887, + "learning_rate": 0.0009527076760296809, + "loss": 0.1187, + "num_input_tokens_seen": 59705872, + "step": 27660 + }, + { + "epoch": 4.51305057096248, + "grad_norm": 0.04180212318897247, + "learning_rate": 0.0009526774536970442, + "loss": 0.0286, + "num_input_tokens_seen": 59717936, + "step": 27665 + }, + { + "epoch": 4.513866231647635, + "grad_norm": 0.1733533889055252, + "learning_rate": 0.0009526472221903247, + "loss": 0.1163, + "num_input_tokens_seen": 59728144, + "step": 27670 + }, + { + "epoch": 4.514681892332789, + "grad_norm": 0.05631665140390396, + "learning_rate": 0.0009526169815101349, + "loss": 0.0643, + "num_input_tokens_seen": 59739888, + "step": 27675 + }, + { + "epoch": 4.515497553017944, + "grad_norm": 0.014165625907480717, + "learning_rate": 0.0009525867316570877, + "loss": 0.0221, + "num_input_tokens_seen": 59750320, + "step": 27680 + }, + { + "epoch": 4.5163132137031, + "grad_norm": 0.09435293823480606, + "learning_rate": 0.0009525564726317963, + "loss": 0.0945, + "num_input_tokens_seen": 59761520, + "step": 27685 + }, + { + "epoch": 4.517128874388255, + "grad_norm": 0.050828587263822556, + "learning_rate": 0.000952526204434874, + "loss": 0.147, + "num_input_tokens_seen": 59771632, + "step": 27690 + }, + { + "epoch": 4.5179445350734095, + "grad_norm": 0.2414238154888153, + "learning_rate": 0.000952495927066934, + "loss": 0.1323, + "num_input_tokens_seen": 59781744, + "step": 27695 + }, + { + "epoch": 4.518760195758564, + "grad_norm": 0.195554718375206, + "learning_rate": 0.00095246564052859, + "loss": 0.1077, + "num_input_tokens_seen": 59791760, + "step": 27700 + }, + { + "epoch": 4.519575856443719, + "grad_norm": 0.06786315888166428, + "learning_rate": 0.0009524353448204558, + "loss": 0.0364, + "num_input_tokens_seen": 59801872, + "step": 27705 + }, + { + "epoch": 4.520391517128875, + "grad_norm": 0.006322198547422886, + "learning_rate": 0.0009524050399431454, + "loss": 0.2081, + "num_input_tokens_seen": 59811152, + "step": 27710 + }, + { + "epoch": 4.52120717781403, + "grad_norm": 0.015922456979751587, + "learning_rate": 0.0009523747258972729, + "loss": 0.079, + "num_input_tokens_seen": 59821264, + "step": 27715 + }, + { + "epoch": 4.5220228384991845, + "grad_norm": 0.1280314177274704, + "learning_rate": 0.0009523444026834528, + "loss": 0.1432, + "num_input_tokens_seen": 59832336, + "step": 27720 + }, + { + "epoch": 4.522838499184339, + "grad_norm": 0.014652982354164124, + "learning_rate": 0.0009523140703022995, + "loss": 0.0304, + "num_input_tokens_seen": 59842288, + "step": 27725 + }, + { + "epoch": 4.523654159869494, + "grad_norm": 0.20480993390083313, + "learning_rate": 0.0009522837287544277, + "loss": 0.0942, + "num_input_tokens_seen": 59852912, + "step": 27730 + }, + { + "epoch": 4.524469820554649, + "grad_norm": 0.012573088519275188, + "learning_rate": 0.0009522533780404526, + "loss": 0.0873, + "num_input_tokens_seen": 59863280, + "step": 27735 + }, + { + "epoch": 4.525285481239804, + "grad_norm": 0.015705108642578125, + "learning_rate": 0.0009522230181609888, + "loss": 0.0739, + "num_input_tokens_seen": 59874288, + "step": 27740 + }, + { + "epoch": 4.5261011419249595, + "grad_norm": 0.12113679945468903, + "learning_rate": 0.000952192649116652, + "loss": 0.0751, + "num_input_tokens_seen": 59886000, + "step": 27745 + }, + { + "epoch": 4.526916802610114, + "grad_norm": 0.05041411146521568, + "learning_rate": 0.0009521622709080574, + "loss": 0.0199, + "num_input_tokens_seen": 59896432, + "step": 27750 + }, + { + "epoch": 4.527732463295269, + "grad_norm": 0.11806817352771759, + "learning_rate": 0.0009521318835358208, + "loss": 0.2068, + "num_input_tokens_seen": 59906704, + "step": 27755 + }, + { + "epoch": 4.528548123980424, + "grad_norm": 0.05975692346692085, + "learning_rate": 0.000952101487000558, + "loss": 0.0303, + "num_input_tokens_seen": 59917392, + "step": 27760 + }, + { + "epoch": 4.529363784665579, + "grad_norm": 0.12676216661930084, + "learning_rate": 0.0009520710813028852, + "loss": 0.1522, + "num_input_tokens_seen": 59927984, + "step": 27765 + }, + { + "epoch": 4.5301794453507345, + "grad_norm": 0.026642469689249992, + "learning_rate": 0.0009520406664434183, + "loss": 0.101, + "num_input_tokens_seen": 59938160, + "step": 27770 + }, + { + "epoch": 4.530995106035889, + "grad_norm": 0.026598934084177017, + "learning_rate": 0.0009520102424227739, + "loss": 0.0454, + "num_input_tokens_seen": 59950000, + "step": 27775 + }, + { + "epoch": 4.531810766721044, + "grad_norm": 0.04175793379545212, + "learning_rate": 0.0009519798092415683, + "loss": 0.0416, + "num_input_tokens_seen": 59960720, + "step": 27780 + }, + { + "epoch": 4.532626427406199, + "grad_norm": 0.04849720746278763, + "learning_rate": 0.0009519493669004189, + "loss": 0.0161, + "num_input_tokens_seen": 59970864, + "step": 27785 + }, + { + "epoch": 4.533442088091354, + "grad_norm": 0.20847368240356445, + "learning_rate": 0.0009519189153999419, + "loss": 0.1509, + "num_input_tokens_seen": 59982256, + "step": 27790 + }, + { + "epoch": 4.5342577487765094, + "grad_norm": 0.005125290248543024, + "learning_rate": 0.0009518884547407549, + "loss": 0.1699, + "num_input_tokens_seen": 59993584, + "step": 27795 + }, + { + "epoch": 4.535073409461664, + "grad_norm": 0.014223860576748848, + "learning_rate": 0.0009518579849234752, + "loss": 0.013, + "num_input_tokens_seen": 60004720, + "step": 27800 + }, + { + "epoch": 4.535889070146819, + "grad_norm": 0.016635876148939133, + "learning_rate": 0.00095182750594872, + "loss": 0.1287, + "num_input_tokens_seen": 60015120, + "step": 27805 + }, + { + "epoch": 4.536704730831974, + "grad_norm": 0.007937697693705559, + "learning_rate": 0.0009517970178171074, + "loss": 0.0347, + "num_input_tokens_seen": 60024752, + "step": 27810 + }, + { + "epoch": 4.537520391517129, + "grad_norm": 0.03570374846458435, + "learning_rate": 0.000951766520529255, + "loss": 0.0773, + "num_input_tokens_seen": 60037072, + "step": 27815 + }, + { + "epoch": 4.5383360522022835, + "grad_norm": 0.007745860144495964, + "learning_rate": 0.0009517360140857809, + "loss": 0.0628, + "num_input_tokens_seen": 60048144, + "step": 27820 + }, + { + "epoch": 4.539151712887438, + "grad_norm": 0.026655148714780807, + "learning_rate": 0.0009517054984873035, + "loss": 0.1354, + "num_input_tokens_seen": 60058544, + "step": 27825 + }, + { + "epoch": 4.539967373572594, + "grad_norm": 0.13499534130096436, + "learning_rate": 0.0009516749737344412, + "loss": 0.0922, + "num_input_tokens_seen": 60068688, + "step": 27830 + }, + { + "epoch": 4.540783034257749, + "grad_norm": 0.1816767454147339, + "learning_rate": 0.0009516444398278125, + "loss": 0.0861, + "num_input_tokens_seen": 60079664, + "step": 27835 + }, + { + "epoch": 4.541598694942904, + "grad_norm": 0.0577971450984478, + "learning_rate": 0.0009516138967680363, + "loss": 0.1395, + "num_input_tokens_seen": 60089296, + "step": 27840 + }, + { + "epoch": 4.5424143556280585, + "grad_norm": 0.07940755039453506, + "learning_rate": 0.0009515833445557314, + "loss": 0.0355, + "num_input_tokens_seen": 60099568, + "step": 27845 + }, + { + "epoch": 4.543230016313213, + "grad_norm": 0.016803240403532982, + "learning_rate": 0.0009515527831915174, + "loss": 0.1002, + "num_input_tokens_seen": 60110160, + "step": 27850 + }, + { + "epoch": 4.544045676998369, + "grad_norm": 0.10514587163925171, + "learning_rate": 0.0009515222126760132, + "loss": 0.0413, + "num_input_tokens_seen": 60120880, + "step": 27855 + }, + { + "epoch": 4.544861337683524, + "grad_norm": 0.018270540982484818, + "learning_rate": 0.0009514916330098386, + "loss": 0.0566, + "num_input_tokens_seen": 60132336, + "step": 27860 + }, + { + "epoch": 4.545676998368679, + "grad_norm": 0.12346773594617844, + "learning_rate": 0.0009514610441936133, + "loss": 0.0609, + "num_input_tokens_seen": 60142608, + "step": 27865 + }, + { + "epoch": 4.5464926590538335, + "grad_norm": 0.20731449127197266, + "learning_rate": 0.0009514304462279574, + "loss": 0.052, + "num_input_tokens_seen": 60153744, + "step": 27870 + }, + { + "epoch": 4.547308319738988, + "grad_norm": 0.004202079959213734, + "learning_rate": 0.0009513998391134906, + "loss": 0.0636, + "num_input_tokens_seen": 60166288, + "step": 27875 + }, + { + "epoch": 4.548123980424144, + "grad_norm": 0.34414222836494446, + "learning_rate": 0.0009513692228508336, + "loss": 0.2312, + "num_input_tokens_seen": 60176208, + "step": 27880 + }, + { + "epoch": 4.548939641109299, + "grad_norm": 0.15242110192775726, + "learning_rate": 0.0009513385974406066, + "loss": 0.1096, + "num_input_tokens_seen": 60187088, + "step": 27885 + }, + { + "epoch": 4.549755301794454, + "grad_norm": 0.05139626935124397, + "learning_rate": 0.0009513079628834305, + "loss": 0.2106, + "num_input_tokens_seen": 60197488, + "step": 27890 + }, + { + "epoch": 4.5505709624796085, + "grad_norm": 0.15399006009101868, + "learning_rate": 0.0009512773191799258, + "loss": 0.0721, + "num_input_tokens_seen": 60208816, + "step": 27895 + }, + { + "epoch": 4.551386623164763, + "grad_norm": 0.15106838941574097, + "learning_rate": 0.0009512466663307138, + "loss": 0.0596, + "num_input_tokens_seen": 60219728, + "step": 27900 + }, + { + "epoch": 4.552202283849918, + "grad_norm": 0.010437490418553352, + "learning_rate": 0.0009512160043364157, + "loss": 0.0624, + "num_input_tokens_seen": 60230288, + "step": 27905 + }, + { + "epoch": 4.553017944535073, + "grad_norm": 0.01003468967974186, + "learning_rate": 0.0009511853331976527, + "loss": 0.0299, + "num_input_tokens_seen": 60242384, + "step": 27910 + }, + { + "epoch": 4.553833605220229, + "grad_norm": 0.006762669887393713, + "learning_rate": 0.0009511546529150467, + "loss": 0.1079, + "num_input_tokens_seen": 60253360, + "step": 27915 + }, + { + "epoch": 4.554649265905383, + "grad_norm": 0.00667745154350996, + "learning_rate": 0.0009511239634892195, + "loss": 0.1983, + "num_input_tokens_seen": 60263440, + "step": 27920 + }, + { + "epoch": 4.555464926590538, + "grad_norm": 0.1364312618970871, + "learning_rate": 0.0009510932649207926, + "loss": 0.0282, + "num_input_tokens_seen": 60273392, + "step": 27925 + }, + { + "epoch": 4.556280587275693, + "grad_norm": 0.03798520565032959, + "learning_rate": 0.0009510625572103886, + "loss": 0.1091, + "num_input_tokens_seen": 60285424, + "step": 27930 + }, + { + "epoch": 4.557096247960848, + "grad_norm": 0.21692712604999542, + "learning_rate": 0.0009510318403586297, + "loss": 0.1186, + "num_input_tokens_seen": 60295248, + "step": 27935 + }, + { + "epoch": 4.557911908646004, + "grad_norm": 0.025022750720381737, + "learning_rate": 0.0009510011143661382, + "loss": 0.1713, + "num_input_tokens_seen": 60306320, + "step": 27940 + }, + { + "epoch": 4.558727569331158, + "grad_norm": 0.028065409511327744, + "learning_rate": 0.0009509703792335371, + "loss": 0.1051, + "num_input_tokens_seen": 60315600, + "step": 27945 + }, + { + "epoch": 4.559543230016313, + "grad_norm": 0.007943467237055302, + "learning_rate": 0.0009509396349614492, + "loss": 0.1787, + "num_input_tokens_seen": 60326608, + "step": 27950 + }, + { + "epoch": 4.560358890701468, + "grad_norm": 0.015227630734443665, + "learning_rate": 0.0009509088815504975, + "loss": 0.1719, + "num_input_tokens_seen": 60336944, + "step": 27955 + }, + { + "epoch": 4.561174551386623, + "grad_norm": 0.1743406057357788, + "learning_rate": 0.0009508781190013053, + "loss": 0.1598, + "num_input_tokens_seen": 60348656, + "step": 27960 + }, + { + "epoch": 4.561990212071779, + "grad_norm": 0.06680949777364731, + "learning_rate": 0.0009508473473144961, + "loss": 0.0785, + "num_input_tokens_seen": 60359568, + "step": 27965 + }, + { + "epoch": 4.562805872756933, + "grad_norm": 0.12657639384269714, + "learning_rate": 0.0009508165664906933, + "loss": 0.078, + "num_input_tokens_seen": 60370192, + "step": 27970 + }, + { + "epoch": 4.563621533442088, + "grad_norm": 0.03476430103182793, + "learning_rate": 0.000950785776530521, + "loss": 0.0738, + "num_input_tokens_seen": 60381136, + "step": 27975 + }, + { + "epoch": 4.564437194127243, + "grad_norm": 0.10437197238206863, + "learning_rate": 0.0009507549774346029, + "loss": 0.0947, + "num_input_tokens_seen": 60392304, + "step": 27980 + }, + { + "epoch": 4.565252854812398, + "grad_norm": 0.009143018163740635, + "learning_rate": 0.0009507241692035635, + "loss": 0.1985, + "num_input_tokens_seen": 60402544, + "step": 27985 + }, + { + "epoch": 4.566068515497553, + "grad_norm": 0.04454313963651657, + "learning_rate": 0.0009506933518380272, + "loss": 0.0322, + "num_input_tokens_seen": 60413488, + "step": 27990 + }, + { + "epoch": 4.566884176182708, + "grad_norm": 0.06461065262556076, + "learning_rate": 0.0009506625253386181, + "loss": 0.0704, + "num_input_tokens_seen": 60423696, + "step": 27995 + }, + { + "epoch": 4.567699836867863, + "grad_norm": 0.12377383559942245, + "learning_rate": 0.0009506316897059614, + "loss": 0.114, + "num_input_tokens_seen": 60434576, + "step": 28000 + }, + { + "epoch": 4.568515497553018, + "grad_norm": 0.2436123937368393, + "learning_rate": 0.0009506008449406818, + "loss": 0.0815, + "num_input_tokens_seen": 60443856, + "step": 28005 + }, + { + "epoch": 4.569331158238173, + "grad_norm": 0.14003624022006989, + "learning_rate": 0.0009505699910434043, + "loss": 0.1366, + "num_input_tokens_seen": 60454864, + "step": 28010 + }, + { + "epoch": 4.570146818923328, + "grad_norm": 0.13798795640468597, + "learning_rate": 0.0009505391280147545, + "loss": 0.2037, + "num_input_tokens_seen": 60465456, + "step": 28015 + }, + { + "epoch": 4.5709624796084825, + "grad_norm": 0.10241395235061646, + "learning_rate": 0.0009505082558553577, + "loss": 0.0892, + "num_input_tokens_seen": 60476816, + "step": 28020 + }, + { + "epoch": 4.571778140293638, + "grad_norm": 0.08835651725530624, + "learning_rate": 0.0009504773745658395, + "loss": 0.0896, + "num_input_tokens_seen": 60488784, + "step": 28025 + }, + { + "epoch": 4.572593800978793, + "grad_norm": 0.03953687101602554, + "learning_rate": 0.0009504464841468259, + "loss": 0.0468, + "num_input_tokens_seen": 60500656, + "step": 28030 + }, + { + "epoch": 4.573409461663948, + "grad_norm": 0.11180218309164047, + "learning_rate": 0.000950415584598943, + "loss": 0.105, + "num_input_tokens_seen": 60511664, + "step": 28035 + }, + { + "epoch": 4.574225122349103, + "grad_norm": 0.18699076771736145, + "learning_rate": 0.0009503846759228167, + "loss": 0.1168, + "num_input_tokens_seen": 60522000, + "step": 28040 + }, + { + "epoch": 4.575040783034257, + "grad_norm": 0.08983857184648514, + "learning_rate": 0.0009503537581190736, + "loss": 0.1193, + "num_input_tokens_seen": 60532688, + "step": 28045 + }, + { + "epoch": 4.575856443719413, + "grad_norm": 0.01459483802318573, + "learning_rate": 0.0009503228311883402, + "loss": 0.0678, + "num_input_tokens_seen": 60543152, + "step": 28050 + }, + { + "epoch": 4.576672104404568, + "grad_norm": 0.05870693922042847, + "learning_rate": 0.0009502918951312436, + "loss": 0.071, + "num_input_tokens_seen": 60552272, + "step": 28055 + }, + { + "epoch": 4.577487765089723, + "grad_norm": 0.015833450481295586, + "learning_rate": 0.0009502609499484104, + "loss": 0.1947, + "num_input_tokens_seen": 60563888, + "step": 28060 + }, + { + "epoch": 4.578303425774878, + "grad_norm": 0.017761284485459328, + "learning_rate": 0.0009502299956404679, + "loss": 0.3092, + "num_input_tokens_seen": 60574576, + "step": 28065 + }, + { + "epoch": 4.579119086460032, + "grad_norm": 0.12296734750270844, + "learning_rate": 0.0009501990322080433, + "loss": 0.0668, + "num_input_tokens_seen": 60584848, + "step": 28070 + }, + { + "epoch": 4.579934747145187, + "grad_norm": 0.1512214094400406, + "learning_rate": 0.0009501680596517641, + "loss": 0.1509, + "num_input_tokens_seen": 60595312, + "step": 28075 + }, + { + "epoch": 4.580750407830343, + "grad_norm": 0.0685778483748436, + "learning_rate": 0.0009501370779722582, + "loss": 0.0563, + "num_input_tokens_seen": 60604848, + "step": 28080 + }, + { + "epoch": 4.581566068515498, + "grad_norm": 0.006333703640848398, + "learning_rate": 0.0009501060871701534, + "loss": 0.1506, + "num_input_tokens_seen": 60614640, + "step": 28085 + }, + { + "epoch": 4.582381729200653, + "grad_norm": 0.062499091029167175, + "learning_rate": 0.0009500750872460778, + "loss": 0.0644, + "num_input_tokens_seen": 60624912, + "step": 28090 + }, + { + "epoch": 4.583197389885807, + "grad_norm": 0.07755851000547409, + "learning_rate": 0.0009500440782006594, + "loss": 0.0533, + "num_input_tokens_seen": 60637968, + "step": 28095 + }, + { + "epoch": 4.584013050570962, + "grad_norm": 0.003198633436113596, + "learning_rate": 0.000950013060034527, + "loss": 0.128, + "num_input_tokens_seen": 60648816, + "step": 28100 + }, + { + "epoch": 4.584828711256117, + "grad_norm": 0.13238146901130676, + "learning_rate": 0.0009499820327483091, + "loss": 0.0999, + "num_input_tokens_seen": 60659888, + "step": 28105 + }, + { + "epoch": 4.585644371941273, + "grad_norm": 0.0198823269456625, + "learning_rate": 0.0009499509963426342, + "loss": 0.0929, + "num_input_tokens_seen": 60671472, + "step": 28110 + }, + { + "epoch": 4.5864600326264275, + "grad_norm": 0.2334928810596466, + "learning_rate": 0.0009499199508181318, + "loss": 0.1365, + "num_input_tokens_seen": 60683696, + "step": 28115 + }, + { + "epoch": 4.587275693311582, + "grad_norm": 0.06788095086812973, + "learning_rate": 0.0009498888961754308, + "loss": 0.0814, + "num_input_tokens_seen": 60694832, + "step": 28120 + }, + { + "epoch": 4.588091353996737, + "grad_norm": 0.02721845917403698, + "learning_rate": 0.0009498578324151606, + "loss": 0.0863, + "num_input_tokens_seen": 60706192, + "step": 28125 + }, + { + "epoch": 4.588907014681892, + "grad_norm": 0.014601021073758602, + "learning_rate": 0.0009498267595379506, + "loss": 0.0418, + "num_input_tokens_seen": 60716880, + "step": 28130 + }, + { + "epoch": 4.589722675367048, + "grad_norm": 0.20648455619812012, + "learning_rate": 0.0009497956775444307, + "loss": 0.2727, + "num_input_tokens_seen": 60727792, + "step": 28135 + }, + { + "epoch": 4.5905383360522025, + "grad_norm": 0.19037242233753204, + "learning_rate": 0.0009497645864352309, + "loss": 0.142, + "num_input_tokens_seen": 60738800, + "step": 28140 + }, + { + "epoch": 4.591353996737357, + "grad_norm": 0.056999899446964264, + "learning_rate": 0.0009497334862109812, + "loss": 0.1241, + "num_input_tokens_seen": 60748336, + "step": 28145 + }, + { + "epoch": 4.592169657422512, + "grad_norm": 0.006086436565965414, + "learning_rate": 0.0009497023768723119, + "loss": 0.1392, + "num_input_tokens_seen": 60759728, + "step": 28150 + }, + { + "epoch": 4.592985318107667, + "grad_norm": 0.08298686891794205, + "learning_rate": 0.0009496712584198532, + "loss": 0.0739, + "num_input_tokens_seen": 60771184, + "step": 28155 + }, + { + "epoch": 4.593800978792823, + "grad_norm": 0.03358568996191025, + "learning_rate": 0.0009496401308542363, + "loss": 0.0342, + "num_input_tokens_seen": 60782864, + "step": 28160 + }, + { + "epoch": 4.5946166394779775, + "grad_norm": 0.3085566759109497, + "learning_rate": 0.0009496089941760915, + "loss": 0.228, + "num_input_tokens_seen": 60794800, + "step": 28165 + }, + { + "epoch": 4.595432300163132, + "grad_norm": 0.005338909570127726, + "learning_rate": 0.0009495778483860502, + "loss": 0.0233, + "num_input_tokens_seen": 60806224, + "step": 28170 + }, + { + "epoch": 4.596247960848287, + "grad_norm": 0.003244395600631833, + "learning_rate": 0.0009495466934847434, + "loss": 0.1439, + "num_input_tokens_seen": 60817232, + "step": 28175 + }, + { + "epoch": 4.597063621533442, + "grad_norm": 0.042625606060028076, + "learning_rate": 0.0009495155294728026, + "loss": 0.1103, + "num_input_tokens_seen": 60828400, + "step": 28180 + }, + { + "epoch": 4.597879282218597, + "grad_norm": 0.017731165513396263, + "learning_rate": 0.0009494843563508594, + "loss": 0.1589, + "num_input_tokens_seen": 60840080, + "step": 28185 + }, + { + "epoch": 4.598694942903752, + "grad_norm": 0.15132448077201843, + "learning_rate": 0.0009494531741195454, + "loss": 0.0703, + "num_input_tokens_seen": 60851088, + "step": 28190 + }, + { + "epoch": 4.599510603588907, + "grad_norm": 0.02884131856262684, + "learning_rate": 0.0009494219827794928, + "loss": 0.103, + "num_input_tokens_seen": 60862096, + "step": 28195 + }, + { + "epoch": 4.600326264274062, + "grad_norm": 0.005228702444583178, + "learning_rate": 0.0009493907823313334, + "loss": 0.1718, + "num_input_tokens_seen": 60872848, + "step": 28200 + }, + { + "epoch": 4.601141924959217, + "grad_norm": 0.045995041728019714, + "learning_rate": 0.0009493595727756998, + "loss": 0.0254, + "num_input_tokens_seen": 60883760, + "step": 28205 + }, + { + "epoch": 4.601957585644372, + "grad_norm": 0.1486821174621582, + "learning_rate": 0.0009493283541132245, + "loss": 0.1201, + "num_input_tokens_seen": 60893456, + "step": 28210 + }, + { + "epoch": 4.602773246329527, + "grad_norm": 0.05317138880491257, + "learning_rate": 0.0009492971263445401, + "loss": 0.0859, + "num_input_tokens_seen": 60905136, + "step": 28215 + }, + { + "epoch": 4.603588907014682, + "grad_norm": 0.08010109513998032, + "learning_rate": 0.0009492658894702792, + "loss": 0.1545, + "num_input_tokens_seen": 60915088, + "step": 28220 + }, + { + "epoch": 4.604404567699837, + "grad_norm": 0.04018460214138031, + "learning_rate": 0.0009492346434910753, + "loss": 0.0918, + "num_input_tokens_seen": 60926704, + "step": 28225 + }, + { + "epoch": 4.605220228384992, + "grad_norm": 0.23530346155166626, + "learning_rate": 0.0009492033884075615, + "loss": 0.1394, + "num_input_tokens_seen": 60937808, + "step": 28230 + }, + { + "epoch": 4.606035889070147, + "grad_norm": 0.1638834923505783, + "learning_rate": 0.000949172124220371, + "loss": 0.0801, + "num_input_tokens_seen": 60948112, + "step": 28235 + }, + { + "epoch": 4.6068515497553015, + "grad_norm": 0.025158405303955078, + "learning_rate": 0.0009491408509301378, + "loss": 0.1864, + "num_input_tokens_seen": 60959312, + "step": 28240 + }, + { + "epoch": 4.607667210440457, + "grad_norm": 0.0906141847372055, + "learning_rate": 0.0009491095685374954, + "loss": 0.0565, + "num_input_tokens_seen": 60970256, + "step": 28245 + }, + { + "epoch": 4.608482871125612, + "grad_norm": 0.007031626999378204, + "learning_rate": 0.0009490782770430777, + "loss": 0.0372, + "num_input_tokens_seen": 60980752, + "step": 28250 + }, + { + "epoch": 4.609298531810767, + "grad_norm": 0.005469320807605982, + "learning_rate": 0.0009490469764475191, + "loss": 0.0522, + "num_input_tokens_seen": 60991344, + "step": 28255 + }, + { + "epoch": 4.610114192495922, + "grad_norm": 0.019031571224331856, + "learning_rate": 0.0009490156667514541, + "loss": 0.0367, + "num_input_tokens_seen": 61001264, + "step": 28260 + }, + { + "epoch": 4.6109298531810765, + "grad_norm": 0.00580698624253273, + "learning_rate": 0.0009489843479555167, + "loss": 0.0145, + "num_input_tokens_seen": 61011408, + "step": 28265 + }, + { + "epoch": 4.611745513866231, + "grad_norm": 0.12320695072412491, + "learning_rate": 0.000948953020060342, + "loss": 0.1337, + "num_input_tokens_seen": 61020944, + "step": 28270 + }, + { + "epoch": 4.612561174551386, + "grad_norm": 0.09463177621364594, + "learning_rate": 0.0009489216830665649, + "loss": 0.0296, + "num_input_tokens_seen": 61031248, + "step": 28275 + }, + { + "epoch": 4.613376835236542, + "grad_norm": 0.18280619382858276, + "learning_rate": 0.0009488903369748203, + "loss": 0.1975, + "num_input_tokens_seen": 61042320, + "step": 28280 + }, + { + "epoch": 4.614192495921697, + "grad_norm": 0.07874840497970581, + "learning_rate": 0.0009488589817857435, + "loss": 0.2118, + "num_input_tokens_seen": 61053520, + "step": 28285 + }, + { + "epoch": 4.6150081566068515, + "grad_norm": 0.04449717327952385, + "learning_rate": 0.0009488276174999702, + "loss": 0.0407, + "num_input_tokens_seen": 61062960, + "step": 28290 + }, + { + "epoch": 4.615823817292006, + "grad_norm": 0.00582935381680727, + "learning_rate": 0.0009487962441181357, + "loss": 0.0284, + "num_input_tokens_seen": 61073392, + "step": 28295 + }, + { + "epoch": 4.616639477977161, + "grad_norm": 0.005751934368163347, + "learning_rate": 0.0009487648616408762, + "loss": 0.0453, + "num_input_tokens_seen": 61083984, + "step": 28300 + }, + { + "epoch": 4.617455138662317, + "grad_norm": 0.03791828453540802, + "learning_rate": 0.0009487334700688273, + "loss": 0.0311, + "num_input_tokens_seen": 61095280, + "step": 28305 + }, + { + "epoch": 4.618270799347472, + "grad_norm": 0.027981318533420563, + "learning_rate": 0.0009487020694026254, + "loss": 0.0543, + "num_input_tokens_seen": 61107024, + "step": 28310 + }, + { + "epoch": 4.6190864600326265, + "grad_norm": 0.011553804390132427, + "learning_rate": 0.0009486706596429068, + "loss": 0.0733, + "num_input_tokens_seen": 61117552, + "step": 28315 + }, + { + "epoch": 4.619902120717781, + "grad_norm": 0.16075754165649414, + "learning_rate": 0.0009486392407903082, + "loss": 0.0891, + "num_input_tokens_seen": 61129040, + "step": 28320 + }, + { + "epoch": 4.620717781402936, + "grad_norm": 0.023777015507221222, + "learning_rate": 0.000948607812845466, + "loss": 0.093, + "num_input_tokens_seen": 61139664, + "step": 28325 + }, + { + "epoch": 4.621533442088092, + "grad_norm": 0.18443554639816284, + "learning_rate": 0.0009485763758090176, + "loss": 0.3451, + "num_input_tokens_seen": 61151728, + "step": 28330 + }, + { + "epoch": 4.622349102773247, + "grad_norm": 0.2226351499557495, + "learning_rate": 0.0009485449296815999, + "loss": 0.1539, + "num_input_tokens_seen": 61162448, + "step": 28335 + }, + { + "epoch": 4.623164763458401, + "grad_norm": 0.15153902769088745, + "learning_rate": 0.00094851347446385, + "loss": 0.1141, + "num_input_tokens_seen": 61173136, + "step": 28340 + }, + { + "epoch": 4.623980424143556, + "grad_norm": 0.12760953605175018, + "learning_rate": 0.0009484820101564058, + "loss": 0.0758, + "num_input_tokens_seen": 61183120, + "step": 28345 + }, + { + "epoch": 4.624796084828711, + "grad_norm": 0.0419689416885376, + "learning_rate": 0.0009484505367599045, + "loss": 0.0398, + "num_input_tokens_seen": 61194000, + "step": 28350 + }, + { + "epoch": 4.625611745513866, + "grad_norm": 0.08702402561903, + "learning_rate": 0.0009484190542749844, + "loss": 0.0657, + "num_input_tokens_seen": 61204688, + "step": 28355 + }, + { + "epoch": 4.626427406199021, + "grad_norm": 0.15682923793792725, + "learning_rate": 0.0009483875627022831, + "loss": 0.1455, + "num_input_tokens_seen": 61215376, + "step": 28360 + }, + { + "epoch": 4.627243066884176, + "grad_norm": 0.11016713082790375, + "learning_rate": 0.0009483560620424391, + "loss": 0.0431, + "num_input_tokens_seen": 61226256, + "step": 28365 + }, + { + "epoch": 4.628058727569331, + "grad_norm": 0.0033596267458051443, + "learning_rate": 0.0009483245522960909, + "loss": 0.0438, + "num_input_tokens_seen": 61238192, + "step": 28370 + }, + { + "epoch": 4.628874388254486, + "grad_norm": 0.0054781073704361916, + "learning_rate": 0.0009482930334638766, + "loss": 0.0617, + "num_input_tokens_seen": 61248944, + "step": 28375 + }, + { + "epoch": 4.629690048939641, + "grad_norm": 0.005737095605581999, + "learning_rate": 0.0009482615055464354, + "loss": 0.0501, + "num_input_tokens_seen": 61260752, + "step": 28380 + }, + { + "epoch": 4.630505709624796, + "grad_norm": 0.25772082805633545, + "learning_rate": 0.0009482299685444062, + "loss": 0.1784, + "num_input_tokens_seen": 61271984, + "step": 28385 + }, + { + "epoch": 4.631321370309951, + "grad_norm": 0.47116729617118835, + "learning_rate": 0.0009481984224584279, + "loss": 0.1347, + "num_input_tokens_seen": 61283248, + "step": 28390 + }, + { + "epoch": 4.632137030995106, + "grad_norm": 0.015150204300880432, + "learning_rate": 0.0009481668672891401, + "loss": 0.072, + "num_input_tokens_seen": 61294512, + "step": 28395 + }, + { + "epoch": 4.632952691680261, + "grad_norm": 0.1735508143901825, + "learning_rate": 0.0009481353030371822, + "loss": 0.1168, + "num_input_tokens_seen": 61304944, + "step": 28400 + }, + { + "epoch": 4.633768352365416, + "grad_norm": 0.02996252290904522, + "learning_rate": 0.0009481037297031939, + "loss": 0.033, + "num_input_tokens_seen": 61315408, + "step": 28405 + }, + { + "epoch": 4.634584013050571, + "grad_norm": 0.06427101790904999, + "learning_rate": 0.0009480721472878151, + "loss": 0.0901, + "num_input_tokens_seen": 61326000, + "step": 28410 + }, + { + "epoch": 4.635399673735726, + "grad_norm": 0.015492763370275497, + "learning_rate": 0.0009480405557916858, + "loss": 0.1217, + "num_input_tokens_seen": 61335984, + "step": 28415 + }, + { + "epoch": 4.636215334420881, + "grad_norm": 0.029575888067483902, + "learning_rate": 0.0009480089552154461, + "loss": 0.0464, + "num_input_tokens_seen": 61347344, + "step": 28420 + }, + { + "epoch": 4.637030995106036, + "grad_norm": 0.19137391448020935, + "learning_rate": 0.0009479773455597367, + "loss": 0.1147, + "num_input_tokens_seen": 61358352, + "step": 28425 + }, + { + "epoch": 4.637846655791191, + "grad_norm": 0.01463820319622755, + "learning_rate": 0.0009479457268251981, + "loss": 0.026, + "num_input_tokens_seen": 61369168, + "step": 28430 + }, + { + "epoch": 4.638662316476346, + "grad_norm": 0.1874268501996994, + "learning_rate": 0.0009479140990124711, + "loss": 0.1626, + "num_input_tokens_seen": 61379984, + "step": 28435 + }, + { + "epoch": 4.6394779771615005, + "grad_norm": 0.016300490126013756, + "learning_rate": 0.0009478824621221967, + "loss": 0.0522, + "num_input_tokens_seen": 61391760, + "step": 28440 + }, + { + "epoch": 4.640293637846656, + "grad_norm": 0.11277855932712555, + "learning_rate": 0.0009478508161550159, + "loss": 0.1948, + "num_input_tokens_seen": 61401680, + "step": 28445 + }, + { + "epoch": 4.641109298531811, + "grad_norm": 0.003096688771620393, + "learning_rate": 0.0009478191611115702, + "loss": 0.0481, + "num_input_tokens_seen": 61411696, + "step": 28450 + }, + { + "epoch": 4.641924959216966, + "grad_norm": 0.03411673381924629, + "learning_rate": 0.0009477874969925011, + "loss": 0.1257, + "num_input_tokens_seen": 61421968, + "step": 28455 + }, + { + "epoch": 4.642740619902121, + "grad_norm": 0.11193767189979553, + "learning_rate": 0.0009477558237984503, + "loss": 0.0533, + "num_input_tokens_seen": 61430960, + "step": 28460 + }, + { + "epoch": 4.643556280587275, + "grad_norm": 0.04996386170387268, + "learning_rate": 0.0009477241415300599, + "loss": 0.1936, + "num_input_tokens_seen": 61442032, + "step": 28465 + }, + { + "epoch": 4.64437194127243, + "grad_norm": 0.1237870305776596, + "learning_rate": 0.0009476924501879715, + "loss": 0.1108, + "num_input_tokens_seen": 61453136, + "step": 28470 + }, + { + "epoch": 4.645187601957586, + "grad_norm": 0.21713986992835999, + "learning_rate": 0.0009476607497728279, + "loss": 0.1869, + "num_input_tokens_seen": 61464048, + "step": 28475 + }, + { + "epoch": 4.646003262642741, + "grad_norm": 0.013031347654759884, + "learning_rate": 0.0009476290402852712, + "loss": 0.1125, + "num_input_tokens_seen": 61475344, + "step": 28480 + }, + { + "epoch": 4.646818923327896, + "grad_norm": 0.08353379368782043, + "learning_rate": 0.0009475973217259442, + "loss": 0.0964, + "num_input_tokens_seen": 61485168, + "step": 28485 + }, + { + "epoch": 4.64763458401305, + "grad_norm": 0.07830562442541122, + "learning_rate": 0.0009475655940954896, + "loss": 0.0925, + "num_input_tokens_seen": 61496304, + "step": 28490 + }, + { + "epoch": 4.648450244698205, + "grad_norm": 0.04835088923573494, + "learning_rate": 0.0009475338573945504, + "loss": 0.0517, + "num_input_tokens_seen": 61507088, + "step": 28495 + }, + { + "epoch": 4.649265905383361, + "grad_norm": 0.10488462448120117, + "learning_rate": 0.0009475021116237699, + "loss": 0.1426, + "num_input_tokens_seen": 61518032, + "step": 28500 + }, + { + "epoch": 4.650081566068516, + "grad_norm": 0.06608767807483673, + "learning_rate": 0.0009474703567837915, + "loss": 0.0695, + "num_input_tokens_seen": 61529424, + "step": 28505 + }, + { + "epoch": 4.650897226753671, + "grad_norm": 0.15668794512748718, + "learning_rate": 0.0009474385928752585, + "loss": 0.0381, + "num_input_tokens_seen": 61540976, + "step": 28510 + }, + { + "epoch": 4.651712887438825, + "grad_norm": 0.017441483214497566, + "learning_rate": 0.0009474068198988151, + "loss": 0.031, + "num_input_tokens_seen": 61552368, + "step": 28515 + }, + { + "epoch": 4.65252854812398, + "grad_norm": 0.02698986791074276, + "learning_rate": 0.0009473750378551046, + "loss": 0.0283, + "num_input_tokens_seen": 61563184, + "step": 28520 + }, + { + "epoch": 4.653344208809135, + "grad_norm": 0.20803463459014893, + "learning_rate": 0.0009473432467447715, + "loss": 0.1971, + "num_input_tokens_seen": 61572720, + "step": 28525 + }, + { + "epoch": 4.654159869494291, + "grad_norm": 0.011961153708398342, + "learning_rate": 0.00094731144656846, + "loss": 0.108, + "num_input_tokens_seen": 61584176, + "step": 28530 + }, + { + "epoch": 4.6549755301794455, + "grad_norm": 0.014614155516028404, + "learning_rate": 0.0009472796373268147, + "loss": 0.0276, + "num_input_tokens_seen": 61595152, + "step": 28535 + }, + { + "epoch": 4.6557911908646, + "grad_norm": 0.04883689433336258, + "learning_rate": 0.00094724781902048, + "loss": 0.0332, + "num_input_tokens_seen": 61606032, + "step": 28540 + }, + { + "epoch": 4.656606851549755, + "grad_norm": 0.012334324419498444, + "learning_rate": 0.0009472159916501011, + "loss": 0.0405, + "num_input_tokens_seen": 61617680, + "step": 28545 + }, + { + "epoch": 4.65742251223491, + "grad_norm": 0.013164161704480648, + "learning_rate": 0.0009471841552163225, + "loss": 0.0531, + "num_input_tokens_seen": 61628080, + "step": 28550 + }, + { + "epoch": 4.658238172920065, + "grad_norm": 0.0608336441218853, + "learning_rate": 0.0009471523097197898, + "loss": 0.1521, + "num_input_tokens_seen": 61639440, + "step": 28555 + }, + { + "epoch": 4.6590538336052205, + "grad_norm": 0.003759450977668166, + "learning_rate": 0.0009471204551611483, + "loss": 0.146, + "num_input_tokens_seen": 61650544, + "step": 28560 + }, + { + "epoch": 4.659869494290375, + "grad_norm": 0.026677144691348076, + "learning_rate": 0.0009470885915410437, + "loss": 0.1359, + "num_input_tokens_seen": 61661232, + "step": 28565 + }, + { + "epoch": 4.66068515497553, + "grad_norm": 0.018192993476986885, + "learning_rate": 0.0009470567188601214, + "loss": 0.0365, + "num_input_tokens_seen": 61671984, + "step": 28570 + }, + { + "epoch": 4.661500815660685, + "grad_norm": 0.038613077253103256, + "learning_rate": 0.0009470248371190277, + "loss": 0.0496, + "num_input_tokens_seen": 61682160, + "step": 28575 + }, + { + "epoch": 4.66231647634584, + "grad_norm": 0.0067091514356434345, + "learning_rate": 0.0009469929463184086, + "loss": 0.1338, + "num_input_tokens_seen": 61692112, + "step": 28580 + }, + { + "epoch": 4.6631321370309955, + "grad_norm": 0.006950204726308584, + "learning_rate": 0.0009469610464589104, + "loss": 0.1799, + "num_input_tokens_seen": 61703312, + "step": 28585 + }, + { + "epoch": 4.66394779771615, + "grad_norm": 0.04839944839477539, + "learning_rate": 0.0009469291375411795, + "loss": 0.018, + "num_input_tokens_seen": 61713616, + "step": 28590 + }, + { + "epoch": 4.664763458401305, + "grad_norm": 0.005708110984414816, + "learning_rate": 0.0009468972195658626, + "loss": 0.1448, + "num_input_tokens_seen": 61724080, + "step": 28595 + }, + { + "epoch": 4.66557911908646, + "grad_norm": 0.17327173054218292, + "learning_rate": 0.0009468652925336068, + "loss": 0.1357, + "num_input_tokens_seen": 61735216, + "step": 28600 + }, + { + "epoch": 4.666394779771615, + "grad_norm": 0.12230390310287476, + "learning_rate": 0.0009468333564450587, + "loss": 0.1126, + "num_input_tokens_seen": 61746736, + "step": 28605 + }, + { + "epoch": 4.6672104404567705, + "grad_norm": 0.16936272382736206, + "learning_rate": 0.000946801411300866, + "loss": 0.1464, + "num_input_tokens_seen": 61758032, + "step": 28610 + }, + { + "epoch": 4.668026101141925, + "grad_norm": 0.02461591362953186, + "learning_rate": 0.0009467694571016758, + "loss": 0.1004, + "num_input_tokens_seen": 61768624, + "step": 28615 + }, + { + "epoch": 4.66884176182708, + "grad_norm": 0.10801559686660767, + "learning_rate": 0.0009467374938481359, + "loss": 0.0728, + "num_input_tokens_seen": 61779408, + "step": 28620 + }, + { + "epoch": 4.669657422512235, + "grad_norm": 0.052083954215049744, + "learning_rate": 0.0009467055215408939, + "loss": 0.0395, + "num_input_tokens_seen": 61790320, + "step": 28625 + }, + { + "epoch": 4.67047308319739, + "grad_norm": 0.01797199249267578, + "learning_rate": 0.0009466735401805977, + "loss": 0.0304, + "num_input_tokens_seen": 61801968, + "step": 28630 + }, + { + "epoch": 4.671288743882545, + "grad_norm": 0.011699764989316463, + "learning_rate": 0.0009466415497678957, + "loss": 0.1429, + "num_input_tokens_seen": 61812944, + "step": 28635 + }, + { + "epoch": 4.672104404567699, + "grad_norm": 0.1461026817560196, + "learning_rate": 0.000946609550303436, + "loss": 0.0969, + "num_input_tokens_seen": 61824912, + "step": 28640 + }, + { + "epoch": 4.672920065252855, + "grad_norm": 0.012564334087073803, + "learning_rate": 0.0009465775417878673, + "loss": 0.0584, + "num_input_tokens_seen": 61835120, + "step": 28645 + }, + { + "epoch": 4.67373572593801, + "grad_norm": 0.005659809336066246, + "learning_rate": 0.0009465455242218382, + "loss": 0.0714, + "num_input_tokens_seen": 61845680, + "step": 28650 + }, + { + "epoch": 4.674551386623165, + "grad_norm": 0.14616887271404266, + "learning_rate": 0.0009465134976059975, + "loss": 0.13, + "num_input_tokens_seen": 61856368, + "step": 28655 + }, + { + "epoch": 4.6753670473083195, + "grad_norm": 0.06459316611289978, + "learning_rate": 0.0009464814619409942, + "loss": 0.0772, + "num_input_tokens_seen": 61866448, + "step": 28660 + }, + { + "epoch": 4.676182707993474, + "grad_norm": 0.029206709936261177, + "learning_rate": 0.0009464494172274778, + "loss": 0.0822, + "num_input_tokens_seen": 61877040, + "step": 28665 + }, + { + "epoch": 4.67699836867863, + "grad_norm": 0.11989596486091614, + "learning_rate": 0.0009464173634660978, + "loss": 0.1125, + "num_input_tokens_seen": 61887472, + "step": 28670 + }, + { + "epoch": 4.677814029363785, + "grad_norm": 0.08024808019399643, + "learning_rate": 0.0009463853006575032, + "loss": 0.0479, + "num_input_tokens_seen": 61898864, + "step": 28675 + }, + { + "epoch": 4.67862969004894, + "grad_norm": 0.07209980487823486, + "learning_rate": 0.0009463532288023444, + "loss": 0.0184, + "num_input_tokens_seen": 61910160, + "step": 28680 + }, + { + "epoch": 4.6794453507340945, + "grad_norm": 0.27977579832077026, + "learning_rate": 0.0009463211479012712, + "loss": 0.3223, + "num_input_tokens_seen": 61920176, + "step": 28685 + }, + { + "epoch": 4.680261011419249, + "grad_norm": 0.1652607023715973, + "learning_rate": 0.0009462890579549338, + "loss": 0.1184, + "num_input_tokens_seen": 61930992, + "step": 28690 + }, + { + "epoch": 4.681076672104405, + "grad_norm": 0.06058865413069725, + "learning_rate": 0.0009462569589639825, + "loss": 0.1225, + "num_input_tokens_seen": 61941744, + "step": 28695 + }, + { + "epoch": 4.68189233278956, + "grad_norm": 0.04110928252339363, + "learning_rate": 0.0009462248509290676, + "loss": 0.1501, + "num_input_tokens_seen": 61951760, + "step": 28700 + }, + { + "epoch": 4.682707993474715, + "grad_norm": 0.04822893068194389, + "learning_rate": 0.0009461927338508402, + "loss": 0.3095, + "num_input_tokens_seen": 61961040, + "step": 28705 + }, + { + "epoch": 4.6835236541598695, + "grad_norm": 0.008379818871617317, + "learning_rate": 0.0009461606077299509, + "loss": 0.0468, + "num_input_tokens_seen": 61971856, + "step": 28710 + }, + { + "epoch": 4.684339314845024, + "grad_norm": 0.014420662075281143, + "learning_rate": 0.000946128472567051, + "loss": 0.018, + "num_input_tokens_seen": 61983760, + "step": 28715 + }, + { + "epoch": 4.685154975530179, + "grad_norm": 0.017404265701770782, + "learning_rate": 0.0009460963283627917, + "loss": 0.0325, + "num_input_tokens_seen": 61994480, + "step": 28720 + }, + { + "epoch": 4.685970636215334, + "grad_norm": 0.12886545062065125, + "learning_rate": 0.0009460641751178243, + "loss": 0.1171, + "num_input_tokens_seen": 62005360, + "step": 28725 + }, + { + "epoch": 4.68678629690049, + "grad_norm": 0.023413924500346184, + "learning_rate": 0.0009460320128328003, + "loss": 0.086, + "num_input_tokens_seen": 62016976, + "step": 28730 + }, + { + "epoch": 4.6876019575856445, + "grad_norm": 0.0498243048787117, + "learning_rate": 0.0009459998415083721, + "loss": 0.0834, + "num_input_tokens_seen": 62028176, + "step": 28735 + }, + { + "epoch": 4.688417618270799, + "grad_norm": 0.014727417379617691, + "learning_rate": 0.000945967661145191, + "loss": 0.1026, + "num_input_tokens_seen": 62038672, + "step": 28740 + }, + { + "epoch": 4.689233278955954, + "grad_norm": 0.025079650804400444, + "learning_rate": 0.0009459354717439097, + "loss": 0.0157, + "num_input_tokens_seen": 62049424, + "step": 28745 + }, + { + "epoch": 4.690048939641109, + "grad_norm": 0.03569481521844864, + "learning_rate": 0.0009459032733051805, + "loss": 0.118, + "num_input_tokens_seen": 62060112, + "step": 28750 + }, + { + "epoch": 4.690864600326265, + "grad_norm": 0.012694702483713627, + "learning_rate": 0.0009458710658296555, + "loss": 0.0846, + "num_input_tokens_seen": 62071728, + "step": 28755 + }, + { + "epoch": 4.691680261011419, + "grad_norm": 0.01697978563606739, + "learning_rate": 0.000945838849317988, + "loss": 0.2106, + "num_input_tokens_seen": 62081680, + "step": 28760 + }, + { + "epoch": 4.692495921696574, + "grad_norm": 0.18275408446788788, + "learning_rate": 0.0009458066237708302, + "loss": 0.0666, + "num_input_tokens_seen": 62091984, + "step": 28765 + }, + { + "epoch": 4.693311582381729, + "grad_norm": 0.005989693105220795, + "learning_rate": 0.0009457743891888359, + "loss": 0.1213, + "num_input_tokens_seen": 62102864, + "step": 28770 + }, + { + "epoch": 4.694127243066884, + "grad_norm": 0.1643328219652176, + "learning_rate": 0.0009457421455726582, + "loss": 0.1618, + "num_input_tokens_seen": 62114256, + "step": 28775 + }, + { + "epoch": 4.69494290375204, + "grad_norm": 0.10635054111480713, + "learning_rate": 0.0009457098929229503, + "loss": 0.0757, + "num_input_tokens_seen": 62124816, + "step": 28780 + }, + { + "epoch": 4.695758564437194, + "grad_norm": 0.04666154459118843, + "learning_rate": 0.0009456776312403661, + "loss": 0.0569, + "num_input_tokens_seen": 62136368, + "step": 28785 + }, + { + "epoch": 4.696574225122349, + "grad_norm": 0.03000042773783207, + "learning_rate": 0.0009456453605255592, + "loss": 0.1282, + "num_input_tokens_seen": 62147024, + "step": 28790 + }, + { + "epoch": 4.697389885807504, + "grad_norm": 0.02815760299563408, + "learning_rate": 0.0009456130807791839, + "loss": 0.0259, + "num_input_tokens_seen": 62156848, + "step": 28795 + }, + { + "epoch": 4.698205546492659, + "grad_norm": 0.06074070557951927, + "learning_rate": 0.000945580792001894, + "loss": 0.1102, + "num_input_tokens_seen": 62168496, + "step": 28800 + }, + { + "epoch": 4.699021207177814, + "grad_norm": 0.03505658730864525, + "learning_rate": 0.0009455484941943442, + "loss": 0.1394, + "num_input_tokens_seen": 62178800, + "step": 28805 + }, + { + "epoch": 4.699836867862969, + "grad_norm": 0.01908975839614868, + "learning_rate": 0.0009455161873571889, + "loss": 0.0644, + "num_input_tokens_seen": 62190768, + "step": 28810 + }, + { + "epoch": 4.700652528548124, + "grad_norm": 0.014994807541370392, + "learning_rate": 0.000945483871491083, + "loss": 0.0146, + "num_input_tokens_seen": 62201648, + "step": 28815 + }, + { + "epoch": 4.701468189233279, + "grad_norm": 0.031434159725904465, + "learning_rate": 0.0009454515465966812, + "loss": 0.1145, + "num_input_tokens_seen": 62212816, + "step": 28820 + }, + { + "epoch": 4.702283849918434, + "grad_norm": 0.008396290242671967, + "learning_rate": 0.0009454192126746388, + "loss": 0.1337, + "num_input_tokens_seen": 62222992, + "step": 28825 + }, + { + "epoch": 4.703099510603589, + "grad_norm": 0.07450134307146072, + "learning_rate": 0.000945386869725611, + "loss": 0.1335, + "num_input_tokens_seen": 62234192, + "step": 28830 + }, + { + "epoch": 4.7039151712887435, + "grad_norm": 0.003438853658735752, + "learning_rate": 0.0009453545177502532, + "loss": 0.1188, + "num_input_tokens_seen": 62244848, + "step": 28835 + }, + { + "epoch": 4.704730831973899, + "grad_norm": 0.1286911964416504, + "learning_rate": 0.0009453221567492211, + "loss": 0.1499, + "num_input_tokens_seen": 62255376, + "step": 28840 + }, + { + "epoch": 4.705546492659054, + "grad_norm": 0.10739689320325851, + "learning_rate": 0.0009452897867231705, + "loss": 0.08, + "num_input_tokens_seen": 62266192, + "step": 28845 + }, + { + "epoch": 4.706362153344209, + "grad_norm": 0.014540545642375946, + "learning_rate": 0.0009452574076727576, + "loss": 0.0375, + "num_input_tokens_seen": 62277232, + "step": 28850 + }, + { + "epoch": 4.707177814029364, + "grad_norm": 0.03177092224359512, + "learning_rate": 0.0009452250195986385, + "loss": 0.0538, + "num_input_tokens_seen": 62288368, + "step": 28855 + }, + { + "epoch": 4.7079934747145185, + "grad_norm": 0.07683566212654114, + "learning_rate": 0.0009451926225014695, + "loss": 0.1984, + "num_input_tokens_seen": 62299024, + "step": 28860 + }, + { + "epoch": 4.708809135399674, + "grad_norm": 0.18355312943458557, + "learning_rate": 0.0009451602163819073, + "loss": 0.247, + "num_input_tokens_seen": 62309136, + "step": 28865 + }, + { + "epoch": 4.709624796084829, + "grad_norm": 0.13076087832450867, + "learning_rate": 0.0009451278012406086, + "loss": 0.057, + "num_input_tokens_seen": 62320656, + "step": 28870 + }, + { + "epoch": 4.710440456769984, + "grad_norm": 0.010717041790485382, + "learning_rate": 0.0009450953770782304, + "loss": 0.0193, + "num_input_tokens_seen": 62330896, + "step": 28875 + }, + { + "epoch": 4.711256117455139, + "grad_norm": 0.01622324250638485, + "learning_rate": 0.0009450629438954296, + "loss": 0.0922, + "num_input_tokens_seen": 62342032, + "step": 28880 + }, + { + "epoch": 4.712071778140293, + "grad_norm": 0.06714911013841629, + "learning_rate": 0.0009450305016928636, + "loss": 0.1026, + "num_input_tokens_seen": 62351984, + "step": 28885 + }, + { + "epoch": 4.712887438825448, + "grad_norm": 0.11107388883829117, + "learning_rate": 0.00094499805047119, + "loss": 0.0217, + "num_input_tokens_seen": 62362256, + "step": 28890 + }, + { + "epoch": 4.713703099510604, + "grad_norm": 0.1321137398481369, + "learning_rate": 0.0009449655902310665, + "loss": 0.1661, + "num_input_tokens_seen": 62373040, + "step": 28895 + }, + { + "epoch": 4.714518760195759, + "grad_norm": 0.004365905188024044, + "learning_rate": 0.0009449331209731507, + "loss": 0.0612, + "num_input_tokens_seen": 62384304, + "step": 28900 + }, + { + "epoch": 4.715334420880914, + "grad_norm": 0.08062531799077988, + "learning_rate": 0.0009449006426981007, + "loss": 0.0294, + "num_input_tokens_seen": 62394864, + "step": 28905 + }, + { + "epoch": 4.716150081566068, + "grad_norm": 0.032696615904569626, + "learning_rate": 0.0009448681554065749, + "loss": 0.0492, + "num_input_tokens_seen": 62405040, + "step": 28910 + }, + { + "epoch": 4.716965742251223, + "grad_norm": 0.0385369211435318, + "learning_rate": 0.0009448356590992316, + "loss": 0.0528, + "num_input_tokens_seen": 62417040, + "step": 28915 + }, + { + "epoch": 4.717781402936378, + "grad_norm": 0.22030751407146454, + "learning_rate": 0.0009448031537767292, + "loss": 0.2488, + "num_input_tokens_seen": 62427088, + "step": 28920 + }, + { + "epoch": 4.718597063621534, + "grad_norm": 0.08304321020841599, + "learning_rate": 0.0009447706394397266, + "loss": 0.14, + "num_input_tokens_seen": 62436816, + "step": 28925 + }, + { + "epoch": 4.719412724306689, + "grad_norm": 0.03714625537395477, + "learning_rate": 0.0009447381160888831, + "loss": 0.0428, + "num_input_tokens_seen": 62447248, + "step": 28930 + }, + { + "epoch": 4.720228384991843, + "grad_norm": 0.010251031257212162, + "learning_rate": 0.0009447055837248572, + "loss": 0.0843, + "num_input_tokens_seen": 62456368, + "step": 28935 + }, + { + "epoch": 4.721044045676998, + "grad_norm": 0.09434520453214645, + "learning_rate": 0.0009446730423483085, + "loss": 0.1236, + "num_input_tokens_seen": 62467088, + "step": 28940 + }, + { + "epoch": 4.721859706362153, + "grad_norm": 0.1920565366744995, + "learning_rate": 0.0009446404919598965, + "loss": 0.2143, + "num_input_tokens_seen": 62476752, + "step": 28945 + }, + { + "epoch": 4.722675367047309, + "grad_norm": 0.003190784715116024, + "learning_rate": 0.000944607932560281, + "loss": 0.0457, + "num_input_tokens_seen": 62487152, + "step": 28950 + }, + { + "epoch": 4.7234910277324635, + "grad_norm": 0.03627556934952736, + "learning_rate": 0.0009445753641501215, + "loss": 0.028, + "num_input_tokens_seen": 62498544, + "step": 28955 + }, + { + "epoch": 4.724306688417618, + "grad_norm": 0.004415176343172789, + "learning_rate": 0.0009445427867300785, + "loss": 0.0604, + "num_input_tokens_seen": 62509488, + "step": 28960 + }, + { + "epoch": 4.725122349102773, + "grad_norm": 0.06204039603471756, + "learning_rate": 0.0009445102003008119, + "loss": 0.0778, + "num_input_tokens_seen": 62520464, + "step": 28965 + }, + { + "epoch": 4.725938009787928, + "grad_norm": 0.0658777579665184, + "learning_rate": 0.0009444776048629822, + "loss": 0.25, + "num_input_tokens_seen": 62531536, + "step": 28970 + }, + { + "epoch": 4.726753670473083, + "grad_norm": 0.24704846739768982, + "learning_rate": 0.0009444450004172498, + "loss": 0.0949, + "num_input_tokens_seen": 62542320, + "step": 28975 + }, + { + "epoch": 4.7275693311582385, + "grad_norm": 0.15060031414031982, + "learning_rate": 0.0009444123869642758, + "loss": 0.1303, + "num_input_tokens_seen": 62551760, + "step": 28980 + }, + { + "epoch": 4.728384991843393, + "grad_norm": 0.06394050270318985, + "learning_rate": 0.000944379764504721, + "loss": 0.0962, + "num_input_tokens_seen": 62562512, + "step": 28985 + }, + { + "epoch": 4.729200652528548, + "grad_norm": 0.040367111563682556, + "learning_rate": 0.0009443471330392466, + "loss": 0.1217, + "num_input_tokens_seen": 62573648, + "step": 28990 + }, + { + "epoch": 4.730016313213703, + "grad_norm": 0.01803169585764408, + "learning_rate": 0.0009443144925685137, + "loss": 0.1981, + "num_input_tokens_seen": 62584592, + "step": 28995 + }, + { + "epoch": 4.730831973898858, + "grad_norm": 0.005480082705616951, + "learning_rate": 0.0009442818430931841, + "loss": 0.0341, + "num_input_tokens_seen": 62596496, + "step": 29000 + }, + { + "epoch": 4.731647634584013, + "grad_norm": 0.022640734910964966, + "learning_rate": 0.0009442491846139192, + "loss": 0.0422, + "num_input_tokens_seen": 62607312, + "step": 29005 + }, + { + "epoch": 4.732463295269168, + "grad_norm": 0.07619868963956833, + "learning_rate": 0.0009442165171313811, + "loss": 0.0424, + "num_input_tokens_seen": 62617680, + "step": 29010 + }, + { + "epoch": 4.733278955954323, + "grad_norm": 0.039414893835783005, + "learning_rate": 0.0009441838406462318, + "loss": 0.0592, + "num_input_tokens_seen": 62627664, + "step": 29015 + }, + { + "epoch": 4.734094616639478, + "grad_norm": 0.17120850086212158, + "learning_rate": 0.0009441511551591333, + "loss": 0.2195, + "num_input_tokens_seen": 62638352, + "step": 29020 + }, + { + "epoch": 4.734910277324633, + "grad_norm": 0.05387534573674202, + "learning_rate": 0.0009441184606707484, + "loss": 0.066, + "num_input_tokens_seen": 62649520, + "step": 29025 + }, + { + "epoch": 4.735725938009788, + "grad_norm": 0.019851237535476685, + "learning_rate": 0.0009440857571817394, + "loss": 0.0138, + "num_input_tokens_seen": 62660080, + "step": 29030 + }, + { + "epoch": 4.736541598694943, + "grad_norm": 0.024015527218580246, + "learning_rate": 0.000944053044692769, + "loss": 0.206, + "num_input_tokens_seen": 62669808, + "step": 29035 + }, + { + "epoch": 4.737357259380098, + "grad_norm": 0.054755594581365585, + "learning_rate": 0.0009440203232045005, + "loss": 0.0807, + "num_input_tokens_seen": 62681136, + "step": 29040 + }, + { + "epoch": 4.738172920065253, + "grad_norm": 0.016112105920910835, + "learning_rate": 0.000943987592717597, + "loss": 0.1306, + "num_input_tokens_seen": 62692144, + "step": 29045 + }, + { + "epoch": 4.738988580750408, + "grad_norm": 0.040498241782188416, + "learning_rate": 0.0009439548532327216, + "loss": 0.1346, + "num_input_tokens_seen": 62701360, + "step": 29050 + }, + { + "epoch": 4.739804241435563, + "grad_norm": 0.28053414821624756, + "learning_rate": 0.0009439221047505377, + "loss": 0.0838, + "num_input_tokens_seen": 62713488, + "step": 29055 + }, + { + "epoch": 4.740619902120718, + "grad_norm": 0.0031493771821260452, + "learning_rate": 0.0009438893472717094, + "loss": 0.1154, + "num_input_tokens_seen": 62725264, + "step": 29060 + }, + { + "epoch": 4.741435562805873, + "grad_norm": 0.12282595038414001, + "learning_rate": 0.0009438565807969005, + "loss": 0.0938, + "num_input_tokens_seen": 62735312, + "step": 29065 + }, + { + "epoch": 4.742251223491028, + "grad_norm": 0.02405609004199505, + "learning_rate": 0.0009438238053267746, + "loss": 0.1531, + "num_input_tokens_seen": 62745744, + "step": 29070 + }, + { + "epoch": 4.743066884176183, + "grad_norm": 0.14400707185268402, + "learning_rate": 0.0009437910208619964, + "loss": 0.0724, + "num_input_tokens_seen": 62754960, + "step": 29075 + }, + { + "epoch": 4.7438825448613375, + "grad_norm": 0.05895408242940903, + "learning_rate": 0.0009437582274032301, + "loss": 0.123, + "num_input_tokens_seen": 62765712, + "step": 29080 + }, + { + "epoch": 4.744698205546492, + "grad_norm": 0.23508873581886292, + "learning_rate": 0.0009437254249511404, + "loss": 0.0722, + "num_input_tokens_seen": 62776784, + "step": 29085 + }, + { + "epoch": 4.745513866231647, + "grad_norm": 0.015804868191480637, + "learning_rate": 0.0009436926135063922, + "loss": 0.2008, + "num_input_tokens_seen": 62787984, + "step": 29090 + }, + { + "epoch": 4.746329526916803, + "grad_norm": 0.021579764783382416, + "learning_rate": 0.0009436597930696502, + "loss": 0.0837, + "num_input_tokens_seen": 62797680, + "step": 29095 + }, + { + "epoch": 4.747145187601958, + "grad_norm": 0.019593367353081703, + "learning_rate": 0.0009436269636415798, + "loss": 0.0396, + "num_input_tokens_seen": 62808112, + "step": 29100 + }, + { + "epoch": 4.7479608482871125, + "grad_norm": 0.04316122829914093, + "learning_rate": 0.000943594125222846, + "loss": 0.2068, + "num_input_tokens_seen": 62818928, + "step": 29105 + }, + { + "epoch": 4.748776508972267, + "grad_norm": 0.05884399265050888, + "learning_rate": 0.0009435612778141146, + "loss": 0.1594, + "num_input_tokens_seen": 62829872, + "step": 29110 + }, + { + "epoch": 4.749592169657422, + "grad_norm": 0.039331063628196716, + "learning_rate": 0.0009435284214160513, + "loss": 0.1076, + "num_input_tokens_seen": 62840144, + "step": 29115 + }, + { + "epoch": 4.750407830342578, + "grad_norm": 0.012597598135471344, + "learning_rate": 0.0009434955560293217, + "loss": 0.129, + "num_input_tokens_seen": 62851856, + "step": 29120 + }, + { + "epoch": 4.751223491027733, + "grad_norm": 0.010564728640019894, + "learning_rate": 0.0009434626816545922, + "loss": 0.0617, + "num_input_tokens_seen": 62863056, + "step": 29125 + }, + { + "epoch": 4.7520391517128875, + "grad_norm": 0.11567310988903046, + "learning_rate": 0.0009434297982925288, + "loss": 0.0639, + "num_input_tokens_seen": 62872592, + "step": 29130 + }, + { + "epoch": 4.752854812398042, + "grad_norm": 0.2663860619068146, + "learning_rate": 0.000943396905943798, + "loss": 0.1388, + "num_input_tokens_seen": 62883600, + "step": 29135 + }, + { + "epoch": 4.753670473083197, + "grad_norm": 0.23019269108772278, + "learning_rate": 0.0009433640046090664, + "loss": 0.181, + "num_input_tokens_seen": 62894160, + "step": 29140 + }, + { + "epoch": 4.754486133768353, + "grad_norm": 0.1516154408454895, + "learning_rate": 0.0009433310942890009, + "loss": 0.0513, + "num_input_tokens_seen": 62905648, + "step": 29145 + }, + { + "epoch": 4.755301794453508, + "grad_norm": 0.007199846673756838, + "learning_rate": 0.0009432981749842683, + "loss": 0.0469, + "num_input_tokens_seen": 62918256, + "step": 29150 + }, + { + "epoch": 4.7561174551386625, + "grad_norm": 0.21071100234985352, + "learning_rate": 0.0009432652466955358, + "loss": 0.147, + "num_input_tokens_seen": 62929200, + "step": 29155 + }, + { + "epoch": 4.756933115823817, + "grad_norm": 0.031637098640203476, + "learning_rate": 0.0009432323094234708, + "loss": 0.0276, + "num_input_tokens_seen": 62940016, + "step": 29160 + }, + { + "epoch": 4.757748776508972, + "grad_norm": 0.1269330233335495, + "learning_rate": 0.0009431993631687408, + "loss": 0.0463, + "num_input_tokens_seen": 62949744, + "step": 29165 + }, + { + "epoch": 4.758564437194127, + "grad_norm": 0.013794936239719391, + "learning_rate": 0.0009431664079320134, + "loss": 0.0261, + "num_input_tokens_seen": 62960848, + "step": 29170 + }, + { + "epoch": 4.759380097879282, + "grad_norm": 0.08217586576938629, + "learning_rate": 0.0009431334437139565, + "loss": 0.244, + "num_input_tokens_seen": 62971984, + "step": 29175 + }, + { + "epoch": 4.760195758564437, + "grad_norm": 0.04564249515533447, + "learning_rate": 0.0009431004705152384, + "loss": 0.0976, + "num_input_tokens_seen": 62982224, + "step": 29180 + }, + { + "epoch": 4.761011419249592, + "grad_norm": 0.09166817367076874, + "learning_rate": 0.0009430674883365269, + "loss": 0.1201, + "num_input_tokens_seen": 62992464, + "step": 29185 + }, + { + "epoch": 4.761827079934747, + "grad_norm": 0.052575141191482544, + "learning_rate": 0.0009430344971784909, + "loss": 0.0629, + "num_input_tokens_seen": 63002960, + "step": 29190 + }, + { + "epoch": 4.762642740619902, + "grad_norm": 0.020644627511501312, + "learning_rate": 0.0009430014970417986, + "loss": 0.1761, + "num_input_tokens_seen": 63013520, + "step": 29195 + }, + { + "epoch": 4.763458401305057, + "grad_norm": 0.1522044837474823, + "learning_rate": 0.0009429684879271191, + "loss": 0.1569, + "num_input_tokens_seen": 63023760, + "step": 29200 + }, + { + "epoch": 4.764274061990212, + "grad_norm": 0.005635553039610386, + "learning_rate": 0.0009429354698351212, + "loss": 0.0646, + "num_input_tokens_seen": 63035248, + "step": 29205 + }, + { + "epoch": 4.765089722675367, + "grad_norm": 0.01611248403787613, + "learning_rate": 0.0009429024427664741, + "loss": 0.1356, + "num_input_tokens_seen": 63045904, + "step": 29210 + }, + { + "epoch": 4.765905383360522, + "grad_norm": 0.09721479564905167, + "learning_rate": 0.0009428694067218473, + "loss": 0.0662, + "num_input_tokens_seen": 63056336, + "step": 29215 + }, + { + "epoch": 4.766721044045677, + "grad_norm": 0.04713112488389015, + "learning_rate": 0.0009428363617019099, + "loss": 0.1572, + "num_input_tokens_seen": 63067376, + "step": 29220 + }, + { + "epoch": 4.767536704730832, + "grad_norm": 0.022350581362843513, + "learning_rate": 0.0009428033077073319, + "loss": 0.1603, + "num_input_tokens_seen": 63077456, + "step": 29225 + }, + { + "epoch": 4.768352365415987, + "grad_norm": 0.011944299563765526, + "learning_rate": 0.0009427702447387833, + "loss": 0.0131, + "num_input_tokens_seen": 63087024, + "step": 29230 + }, + { + "epoch": 4.769168026101142, + "grad_norm": 0.028478989377617836, + "learning_rate": 0.0009427371727969338, + "loss": 0.0673, + "num_input_tokens_seen": 63098608, + "step": 29235 + }, + { + "epoch": 4.769983686786297, + "grad_norm": 0.00210229167714715, + "learning_rate": 0.000942704091882454, + "loss": 0.0791, + "num_input_tokens_seen": 63108144, + "step": 29240 + }, + { + "epoch": 4.770799347471452, + "grad_norm": 0.21192315220832825, + "learning_rate": 0.0009426710019960141, + "loss": 0.2013, + "num_input_tokens_seen": 63118736, + "step": 29245 + }, + { + "epoch": 4.771615008156607, + "grad_norm": 0.03179307281970978, + "learning_rate": 0.0009426379031382848, + "loss": 0.0179, + "num_input_tokens_seen": 63128656, + "step": 29250 + }, + { + "epoch": 4.7724306688417615, + "grad_norm": 0.008113854564726353, + "learning_rate": 0.0009426047953099368, + "loss": 0.0678, + "num_input_tokens_seen": 63139632, + "step": 29255 + }, + { + "epoch": 4.773246329526917, + "grad_norm": 0.11698970943689346, + "learning_rate": 0.0009425716785116412, + "loss": 0.0802, + "num_input_tokens_seen": 63150128, + "step": 29260 + }, + { + "epoch": 4.774061990212072, + "grad_norm": 0.055682577192783356, + "learning_rate": 0.0009425385527440691, + "loss": 0.085, + "num_input_tokens_seen": 63160112, + "step": 29265 + }, + { + "epoch": 4.774877650897227, + "grad_norm": 0.03262883052229881, + "learning_rate": 0.0009425054180078917, + "loss": 0.128, + "num_input_tokens_seen": 63172592, + "step": 29270 + }, + { + "epoch": 4.775693311582382, + "grad_norm": 0.11100230365991592, + "learning_rate": 0.0009424722743037808, + "loss": 0.0913, + "num_input_tokens_seen": 63184304, + "step": 29275 + }, + { + "epoch": 4.7765089722675365, + "grad_norm": 0.004344983492046595, + "learning_rate": 0.0009424391216324078, + "loss": 0.0289, + "num_input_tokens_seen": 63195184, + "step": 29280 + }, + { + "epoch": 4.777324632952691, + "grad_norm": 0.013685915619134903, + "learning_rate": 0.0009424059599944449, + "loss": 0.0453, + "num_input_tokens_seen": 63206064, + "step": 29285 + }, + { + "epoch": 4.778140293637847, + "grad_norm": 0.016094228252768517, + "learning_rate": 0.0009423727893905638, + "loss": 0.032, + "num_input_tokens_seen": 63215536, + "step": 29290 + }, + { + "epoch": 4.778955954323002, + "grad_norm": 0.0919591560959816, + "learning_rate": 0.0009423396098214372, + "loss": 0.0767, + "num_input_tokens_seen": 63226480, + "step": 29295 + }, + { + "epoch": 4.779771615008157, + "grad_norm": 0.08174548298120499, + "learning_rate": 0.0009423064212877371, + "loss": 0.1099, + "num_input_tokens_seen": 63237200, + "step": 29300 + }, + { + "epoch": 4.780587275693311, + "grad_norm": 0.012983143329620361, + "learning_rate": 0.0009422732237901361, + "loss": 0.1226, + "num_input_tokens_seen": 63247632, + "step": 29305 + }, + { + "epoch": 4.781402936378466, + "grad_norm": 0.03150840476155281, + "learning_rate": 0.0009422400173293073, + "loss": 0.1579, + "num_input_tokens_seen": 63258864, + "step": 29310 + }, + { + "epoch": 4.782218597063622, + "grad_norm": 0.03159575164318085, + "learning_rate": 0.0009422068019059235, + "loss": 0.1358, + "num_input_tokens_seen": 63270928, + "step": 29315 + }, + { + "epoch": 4.783034257748777, + "grad_norm": 0.1498037725687027, + "learning_rate": 0.0009421735775206582, + "loss": 0.1385, + "num_input_tokens_seen": 63281200, + "step": 29320 + }, + { + "epoch": 4.783849918433932, + "grad_norm": 0.03605213388800621, + "learning_rate": 0.000942140344174184, + "loss": 0.11, + "num_input_tokens_seen": 63291184, + "step": 29325 + }, + { + "epoch": 4.784665579119086, + "grad_norm": 0.010444783605635166, + "learning_rate": 0.0009421071018671749, + "loss": 0.0673, + "num_input_tokens_seen": 63302544, + "step": 29330 + }, + { + "epoch": 4.785481239804241, + "grad_norm": 0.010891892947256565, + "learning_rate": 0.0009420738506003047, + "loss": 0.1027, + "num_input_tokens_seen": 63312464, + "step": 29335 + }, + { + "epoch": 4.786296900489396, + "grad_norm": 0.07446567714214325, + "learning_rate": 0.0009420405903742471, + "loss": 0.2665, + "num_input_tokens_seen": 63323856, + "step": 29340 + }, + { + "epoch": 4.787112561174552, + "grad_norm": 0.058459922671318054, + "learning_rate": 0.000942007321189676, + "loss": 0.0502, + "num_input_tokens_seen": 63335664, + "step": 29345 + }, + { + "epoch": 4.787928221859707, + "grad_norm": 0.055532775819301605, + "learning_rate": 0.0009419740430472659, + "loss": 0.0277, + "num_input_tokens_seen": 63347632, + "step": 29350 + }, + { + "epoch": 4.788743882544861, + "grad_norm": 0.10797680914402008, + "learning_rate": 0.0009419407559476911, + "loss": 0.1438, + "num_input_tokens_seen": 63359152, + "step": 29355 + }, + { + "epoch": 4.789559543230016, + "grad_norm": 0.0318618007004261, + "learning_rate": 0.0009419074598916262, + "loss": 0.0469, + "num_input_tokens_seen": 63370224, + "step": 29360 + }, + { + "epoch": 4.790375203915171, + "grad_norm": 0.027501096948981285, + "learning_rate": 0.0009418741548797462, + "loss": 0.0655, + "num_input_tokens_seen": 63380784, + "step": 29365 + }, + { + "epoch": 4.791190864600326, + "grad_norm": 0.0989944115281105, + "learning_rate": 0.0009418408409127257, + "loss": 0.2599, + "num_input_tokens_seen": 63392656, + "step": 29370 + }, + { + "epoch": 4.7920065252854815, + "grad_norm": 0.06135750189423561, + "learning_rate": 0.0009418075179912402, + "loss": 0.1274, + "num_input_tokens_seen": 63403440, + "step": 29375 + }, + { + "epoch": 4.792822185970636, + "grad_norm": 0.1572854220867157, + "learning_rate": 0.0009417741861159648, + "loss": 0.0846, + "num_input_tokens_seen": 63413968, + "step": 29380 + }, + { + "epoch": 4.793637846655791, + "grad_norm": 0.05631653591990471, + "learning_rate": 0.0009417408452875751, + "loss": 0.1282, + "num_input_tokens_seen": 63424752, + "step": 29385 + }, + { + "epoch": 4.794453507340946, + "grad_norm": 0.02439217083156109, + "learning_rate": 0.0009417074955067467, + "loss": 0.0477, + "num_input_tokens_seen": 63435792, + "step": 29390 + }, + { + "epoch": 4.795269168026101, + "grad_norm": 0.014964789152145386, + "learning_rate": 0.0009416741367741557, + "loss": 0.0294, + "num_input_tokens_seen": 63448016, + "step": 29395 + }, + { + "epoch": 4.7960848287112565, + "grad_norm": 0.007130731362849474, + "learning_rate": 0.0009416407690904778, + "loss": 0.0206, + "num_input_tokens_seen": 63456880, + "step": 29400 + }, + { + "epoch": 4.796900489396411, + "grad_norm": 0.2347111850976944, + "learning_rate": 0.0009416073924563897, + "loss": 0.1476, + "num_input_tokens_seen": 63467504, + "step": 29405 + }, + { + "epoch": 4.797716150081566, + "grad_norm": 0.06022638455033302, + "learning_rate": 0.0009415740068725674, + "loss": 0.0781, + "num_input_tokens_seen": 63478288, + "step": 29410 + }, + { + "epoch": 4.798531810766721, + "grad_norm": 0.0055657862685620785, + "learning_rate": 0.0009415406123396878, + "loss": 0.0582, + "num_input_tokens_seen": 63487888, + "step": 29415 + }, + { + "epoch": 4.799347471451876, + "grad_norm": 0.005403745919466019, + "learning_rate": 0.0009415072088584275, + "loss": 0.0431, + "num_input_tokens_seen": 63498576, + "step": 29420 + }, + { + "epoch": 4.800163132137031, + "grad_norm": 0.005292664747685194, + "learning_rate": 0.0009414737964294635, + "loss": 0.077, + "num_input_tokens_seen": 63509872, + "step": 29425 + }, + { + "epoch": 4.800978792822186, + "grad_norm": 0.034147851169109344, + "learning_rate": 0.0009414403750534731, + "loss": 0.0266, + "num_input_tokens_seen": 63521328, + "step": 29430 + }, + { + "epoch": 4.801794453507341, + "grad_norm": 0.10782653093338013, + "learning_rate": 0.0009414069447311333, + "loss": 0.1388, + "num_input_tokens_seen": 63531248, + "step": 29435 + }, + { + "epoch": 4.802610114192496, + "grad_norm": 0.19998647272586823, + "learning_rate": 0.0009413735054631218, + "loss": 0.2257, + "num_input_tokens_seen": 63541648, + "step": 29440 + }, + { + "epoch": 4.803425774877651, + "grad_norm": 0.19111989438533783, + "learning_rate": 0.0009413400572501164, + "loss": 0.0462, + "num_input_tokens_seen": 63551408, + "step": 29445 + }, + { + "epoch": 4.804241435562806, + "grad_norm": 0.02402065321803093, + "learning_rate": 0.0009413066000927948, + "loss": 0.0324, + "num_input_tokens_seen": 63562160, + "step": 29450 + }, + { + "epoch": 4.80505709624796, + "grad_norm": 0.004702876787632704, + "learning_rate": 0.0009412731339918353, + "loss": 0.0655, + "num_input_tokens_seen": 63573648, + "step": 29455 + }, + { + "epoch": 4.805872756933116, + "grad_norm": 0.006390291266143322, + "learning_rate": 0.0009412396589479157, + "loss": 0.1632, + "num_input_tokens_seen": 63584048, + "step": 29460 + }, + { + "epoch": 4.806688417618271, + "grad_norm": 0.0683237686753273, + "learning_rate": 0.0009412061749617147, + "loss": 0.0525, + "num_input_tokens_seen": 63595344, + "step": 29465 + }, + { + "epoch": 4.807504078303426, + "grad_norm": 0.1013072058558464, + "learning_rate": 0.0009411726820339109, + "loss": 0.1005, + "num_input_tokens_seen": 63606800, + "step": 29470 + }, + { + "epoch": 4.808319738988581, + "grad_norm": 0.2400810718536377, + "learning_rate": 0.000941139180165183, + "loss": 0.1394, + "num_input_tokens_seen": 63617456, + "step": 29475 + }, + { + "epoch": 4.809135399673735, + "grad_norm": 0.025798004120588303, + "learning_rate": 0.0009411056693562101, + "loss": 0.2191, + "num_input_tokens_seen": 63628336, + "step": 29480 + }, + { + "epoch": 4.809951060358891, + "grad_norm": 0.009161945432424545, + "learning_rate": 0.000941072149607671, + "loss": 0.0564, + "num_input_tokens_seen": 63640720, + "step": 29485 + }, + { + "epoch": 4.810766721044046, + "grad_norm": 0.11191725730895996, + "learning_rate": 0.0009410386209202455, + "loss": 0.0553, + "num_input_tokens_seen": 63650960, + "step": 29490 + }, + { + "epoch": 4.811582381729201, + "grad_norm": 0.2783498764038086, + "learning_rate": 0.0009410050832946127, + "loss": 0.0564, + "num_input_tokens_seen": 63662192, + "step": 29495 + }, + { + "epoch": 4.8123980424143555, + "grad_norm": 0.14226217567920685, + "learning_rate": 0.0009409715367314527, + "loss": 0.0992, + "num_input_tokens_seen": 63673232, + "step": 29500 + }, + { + "epoch": 4.81321370309951, + "grad_norm": 0.012991759926080704, + "learning_rate": 0.0009409379812314447, + "loss": 0.0395, + "num_input_tokens_seen": 63683728, + "step": 29505 + }, + { + "epoch": 4.814029363784666, + "grad_norm": 0.0882367342710495, + "learning_rate": 0.0009409044167952694, + "loss": 0.046, + "num_input_tokens_seen": 63694768, + "step": 29510 + }, + { + "epoch": 4.814845024469821, + "grad_norm": 0.11043034493923187, + "learning_rate": 0.0009408708434236066, + "loss": 0.1459, + "num_input_tokens_seen": 63705488, + "step": 29515 + }, + { + "epoch": 4.815660685154976, + "grad_norm": 0.13936880230903625, + "learning_rate": 0.000940837261117137, + "loss": 0.2203, + "num_input_tokens_seen": 63715600, + "step": 29520 + }, + { + "epoch": 4.8164763458401305, + "grad_norm": 0.21391724050045013, + "learning_rate": 0.000940803669876541, + "loss": 0.2094, + "num_input_tokens_seen": 63726000, + "step": 29525 + }, + { + "epoch": 4.817292006525285, + "grad_norm": 0.10898435115814209, + "learning_rate": 0.0009407700697024995, + "loss": 0.0801, + "num_input_tokens_seen": 63736400, + "step": 29530 + }, + { + "epoch": 4.81810766721044, + "grad_norm": 0.2113555371761322, + "learning_rate": 0.0009407364605956933, + "loss": 0.3635, + "num_input_tokens_seen": 63747600, + "step": 29535 + }, + { + "epoch": 4.818923327895595, + "grad_norm": 0.025326663628220558, + "learning_rate": 0.0009407028425568036, + "loss": 0.0355, + "num_input_tokens_seen": 63758416, + "step": 29540 + }, + { + "epoch": 4.819738988580751, + "grad_norm": 0.029131080955266953, + "learning_rate": 0.0009406692155865117, + "loss": 0.1544, + "num_input_tokens_seen": 63768976, + "step": 29545 + }, + { + "epoch": 4.8205546492659055, + "grad_norm": 0.03382393717765808, + "learning_rate": 0.0009406355796854993, + "loss": 0.0303, + "num_input_tokens_seen": 63778896, + "step": 29550 + }, + { + "epoch": 4.82137030995106, + "grad_norm": 0.19992893934249878, + "learning_rate": 0.0009406019348544478, + "loss": 0.0824, + "num_input_tokens_seen": 63790384, + "step": 29555 + }, + { + "epoch": 4.822185970636215, + "grad_norm": 0.03237656131386757, + "learning_rate": 0.000940568281094039, + "loss": 0.0436, + "num_input_tokens_seen": 63800688, + "step": 29560 + }, + { + "epoch": 4.82300163132137, + "grad_norm": 0.01662839762866497, + "learning_rate": 0.0009405346184049552, + "loss": 0.0376, + "num_input_tokens_seen": 63811856, + "step": 29565 + }, + { + "epoch": 4.823817292006526, + "grad_norm": 0.020806660875678062, + "learning_rate": 0.0009405009467878787, + "loss": 0.0868, + "num_input_tokens_seen": 63822576, + "step": 29570 + }, + { + "epoch": 4.8246329526916805, + "grad_norm": 0.04764997959136963, + "learning_rate": 0.0009404672662434914, + "loss": 0.0516, + "num_input_tokens_seen": 63833168, + "step": 29575 + }, + { + "epoch": 4.825448613376835, + "grad_norm": 0.17588429152965546, + "learning_rate": 0.0009404335767724763, + "loss": 0.0872, + "num_input_tokens_seen": 63843312, + "step": 29580 + }, + { + "epoch": 4.82626427406199, + "grad_norm": 0.18665458261966705, + "learning_rate": 0.000940399878375516, + "loss": 0.228, + "num_input_tokens_seen": 63853904, + "step": 29585 + }, + { + "epoch": 4.827079934747145, + "grad_norm": 0.35105305910110474, + "learning_rate": 0.0009403661710532936, + "loss": 0.3491, + "num_input_tokens_seen": 63865072, + "step": 29590 + }, + { + "epoch": 4.827895595432301, + "grad_norm": 0.067581407725811, + "learning_rate": 0.0009403324548064919, + "loss": 0.0885, + "num_input_tokens_seen": 63875760, + "step": 29595 + }, + { + "epoch": 4.828711256117455, + "grad_norm": 0.024092979729175568, + "learning_rate": 0.0009402987296357946, + "loss": 0.0467, + "num_input_tokens_seen": 63887216, + "step": 29600 + }, + { + "epoch": 4.82952691680261, + "grad_norm": 0.007830696180462837, + "learning_rate": 0.0009402649955418848, + "loss": 0.095, + "num_input_tokens_seen": 63897104, + "step": 29605 + }, + { + "epoch": 4.830342577487765, + "grad_norm": 0.044722072780132294, + "learning_rate": 0.0009402312525254464, + "loss": 0.0834, + "num_input_tokens_seen": 63907984, + "step": 29610 + }, + { + "epoch": 4.83115823817292, + "grad_norm": 0.04842796549201012, + "learning_rate": 0.0009401975005871632, + "loss": 0.1179, + "num_input_tokens_seen": 63918640, + "step": 29615 + }, + { + "epoch": 4.831973898858075, + "grad_norm": 0.23899756371974945, + "learning_rate": 0.0009401637397277193, + "loss": 0.1685, + "num_input_tokens_seen": 63929136, + "step": 29620 + }, + { + "epoch": 4.8327895595432295, + "grad_norm": 0.13567295670509338, + "learning_rate": 0.0009401299699477988, + "loss": 0.2646, + "num_input_tokens_seen": 63940464, + "step": 29625 + }, + { + "epoch": 4.833605220228385, + "grad_norm": 0.0698167234659195, + "learning_rate": 0.0009400961912480861, + "loss": 0.1187, + "num_input_tokens_seen": 63950512, + "step": 29630 + }, + { + "epoch": 4.83442088091354, + "grad_norm": 0.22257371246814728, + "learning_rate": 0.0009400624036292657, + "loss": 0.2397, + "num_input_tokens_seen": 63961840, + "step": 29635 + }, + { + "epoch": 4.835236541598695, + "grad_norm": 0.05028904601931572, + "learning_rate": 0.0009400286070920226, + "loss": 0.0766, + "num_input_tokens_seen": 63971440, + "step": 29640 + }, + { + "epoch": 4.83605220228385, + "grad_norm": 0.05889785662293434, + "learning_rate": 0.0009399948016370415, + "loss": 0.1115, + "num_input_tokens_seen": 63982608, + "step": 29645 + }, + { + "epoch": 4.8368678629690045, + "grad_norm": 0.012519282288849354, + "learning_rate": 0.0009399609872650075, + "loss": 0.2264, + "num_input_tokens_seen": 63994000, + "step": 29650 + }, + { + "epoch": 4.83768352365416, + "grad_norm": 0.10202843695878983, + "learning_rate": 0.000939927163976606, + "loss": 0.1416, + "num_input_tokens_seen": 64005008, + "step": 29655 + }, + { + "epoch": 4.838499184339315, + "grad_norm": 0.11825115233659744, + "learning_rate": 0.0009398933317725225, + "loss": 0.1085, + "num_input_tokens_seen": 64014448, + "step": 29660 + }, + { + "epoch": 4.83931484502447, + "grad_norm": 0.014535506255924702, + "learning_rate": 0.0009398594906534424, + "loss": 0.0774, + "num_input_tokens_seen": 64026672, + "step": 29665 + }, + { + "epoch": 4.840130505709625, + "grad_norm": 0.03534112498164177, + "learning_rate": 0.0009398256406200518, + "loss": 0.1853, + "num_input_tokens_seen": 64036752, + "step": 29670 + }, + { + "epoch": 4.8409461663947795, + "grad_norm": 0.01896858587861061, + "learning_rate": 0.0009397917816730368, + "loss": 0.1338, + "num_input_tokens_seen": 64047760, + "step": 29675 + }, + { + "epoch": 4.841761827079935, + "grad_norm": 0.009882763028144836, + "learning_rate": 0.0009397579138130832, + "loss": 0.1454, + "num_input_tokens_seen": 64059664, + "step": 29680 + }, + { + "epoch": 4.84257748776509, + "grad_norm": 0.035995859652757645, + "learning_rate": 0.0009397240370408777, + "loss": 0.0943, + "num_input_tokens_seen": 64071408, + "step": 29685 + }, + { + "epoch": 4.843393148450245, + "grad_norm": 0.05616047605872154, + "learning_rate": 0.0009396901513571068, + "loss": 0.1342, + "num_input_tokens_seen": 64082448, + "step": 29690 + }, + { + "epoch": 4.8442088091354, + "grad_norm": 0.0765305683016777, + "learning_rate": 0.0009396562567624572, + "loss": 0.0969, + "num_input_tokens_seen": 64094320, + "step": 29695 + }, + { + "epoch": 4.8450244698205545, + "grad_norm": 0.021148085594177246, + "learning_rate": 0.0009396223532576159, + "loss": 0.0905, + "num_input_tokens_seen": 64105392, + "step": 29700 + }, + { + "epoch": 4.845840130505709, + "grad_norm": 0.053946319967508316, + "learning_rate": 0.0009395884408432696, + "loss": 0.124, + "num_input_tokens_seen": 64117008, + "step": 29705 + }, + { + "epoch": 4.846655791190865, + "grad_norm": 0.03874645009636879, + "learning_rate": 0.0009395545195201062, + "loss": 0.3134, + "num_input_tokens_seen": 64128016, + "step": 29710 + }, + { + "epoch": 4.84747145187602, + "grad_norm": 0.13925381004810333, + "learning_rate": 0.0009395205892888126, + "loss": 0.0982, + "num_input_tokens_seen": 64138448, + "step": 29715 + }, + { + "epoch": 4.848287112561175, + "grad_norm": 0.06175979971885681, + "learning_rate": 0.0009394866501500769, + "loss": 0.0476, + "num_input_tokens_seen": 64148848, + "step": 29720 + }, + { + "epoch": 4.849102773246329, + "grad_norm": 0.1269979178905487, + "learning_rate": 0.0009394527021045866, + "loss": 0.1806, + "num_input_tokens_seen": 64159280, + "step": 29725 + }, + { + "epoch": 4.849918433931484, + "grad_norm": 0.01345459558069706, + "learning_rate": 0.0009394187451530298, + "loss": 0.127, + "num_input_tokens_seen": 64170384, + "step": 29730 + }, + { + "epoch": 4.850734094616639, + "grad_norm": 0.024148106575012207, + "learning_rate": 0.0009393847792960948, + "loss": 0.1227, + "num_input_tokens_seen": 64180112, + "step": 29735 + }, + { + "epoch": 4.851549755301795, + "grad_norm": 0.10312563180923462, + "learning_rate": 0.0009393508045344697, + "loss": 0.0827, + "num_input_tokens_seen": 64190032, + "step": 29740 + }, + { + "epoch": 4.85236541598695, + "grad_norm": 0.04453880339860916, + "learning_rate": 0.0009393168208688432, + "loss": 0.0923, + "num_input_tokens_seen": 64201936, + "step": 29745 + }, + { + "epoch": 4.853181076672104, + "grad_norm": 0.11050072312355042, + "learning_rate": 0.0009392828282999042, + "loss": 0.0579, + "num_input_tokens_seen": 64212784, + "step": 29750 + }, + { + "epoch": 4.853996737357259, + "grad_norm": 0.006029341835528612, + "learning_rate": 0.0009392488268283412, + "loss": 0.0768, + "num_input_tokens_seen": 64224208, + "step": 29755 + }, + { + "epoch": 4.854812398042414, + "grad_norm": 0.009381423704326153, + "learning_rate": 0.0009392148164548436, + "loss": 0.0688, + "num_input_tokens_seen": 64235792, + "step": 29760 + }, + { + "epoch": 4.85562805872757, + "grad_norm": 0.022448210045695305, + "learning_rate": 0.0009391807971801005, + "loss": 0.1316, + "num_input_tokens_seen": 64246512, + "step": 29765 + }, + { + "epoch": 4.856443719412725, + "grad_norm": 0.03308221697807312, + "learning_rate": 0.0009391467690048014, + "loss": 0.0812, + "num_input_tokens_seen": 64256624, + "step": 29770 + }, + { + "epoch": 4.857259380097879, + "grad_norm": 0.03677208349108696, + "learning_rate": 0.000939112731929636, + "loss": 0.1391, + "num_input_tokens_seen": 64266544, + "step": 29775 + }, + { + "epoch": 4.858075040783034, + "grad_norm": 0.18791401386260986, + "learning_rate": 0.000939078685955294, + "loss": 0.1671, + "num_input_tokens_seen": 64277904, + "step": 29780 + }, + { + "epoch": 4.858890701468189, + "grad_norm": 0.029058706015348434, + "learning_rate": 0.0009390446310824654, + "loss": 0.0641, + "num_input_tokens_seen": 64288464, + "step": 29785 + }, + { + "epoch": 4.859706362153344, + "grad_norm": 0.010044633410871029, + "learning_rate": 0.0009390105673118405, + "loss": 0.1431, + "num_input_tokens_seen": 64299408, + "step": 29790 + }, + { + "epoch": 4.8605220228384995, + "grad_norm": 0.10219626128673553, + "learning_rate": 0.0009389764946441094, + "loss": 0.0884, + "num_input_tokens_seen": 64310224, + "step": 29795 + }, + { + "epoch": 4.861337683523654, + "grad_norm": 0.021998152136802673, + "learning_rate": 0.0009389424130799628, + "loss": 0.0361, + "num_input_tokens_seen": 64321520, + "step": 29800 + }, + { + "epoch": 4.862153344208809, + "grad_norm": 0.28462687134742737, + "learning_rate": 0.0009389083226200914, + "loss": 0.2041, + "num_input_tokens_seen": 64332880, + "step": 29805 + }, + { + "epoch": 4.862969004893964, + "grad_norm": 0.028202053159475327, + "learning_rate": 0.0009388742232651859, + "loss": 0.117, + "num_input_tokens_seen": 64342576, + "step": 29810 + }, + { + "epoch": 4.863784665579119, + "grad_norm": 0.072103351354599, + "learning_rate": 0.0009388401150159377, + "loss": 0.0622, + "num_input_tokens_seen": 64353936, + "step": 29815 + }, + { + "epoch": 4.864600326264274, + "grad_norm": 0.0030460006091743708, + "learning_rate": 0.0009388059978730377, + "loss": 0.0418, + "num_input_tokens_seen": 64364592, + "step": 29820 + }, + { + "epoch": 4.865415986949429, + "grad_norm": 0.04686173424124718, + "learning_rate": 0.0009387718718371776, + "loss": 0.0941, + "num_input_tokens_seen": 64375696, + "step": 29825 + }, + { + "epoch": 4.866231647634584, + "grad_norm": 0.008065537549555302, + "learning_rate": 0.0009387377369090489, + "loss": 0.0562, + "num_input_tokens_seen": 64387440, + "step": 29830 + }, + { + "epoch": 4.867047308319739, + "grad_norm": 0.004290367942303419, + "learning_rate": 0.0009387035930893433, + "loss": 0.0515, + "num_input_tokens_seen": 64399088, + "step": 29835 + }, + { + "epoch": 4.867862969004894, + "grad_norm": 0.020208125934004784, + "learning_rate": 0.0009386694403787529, + "loss": 0.3256, + "num_input_tokens_seen": 64409712, + "step": 29840 + }, + { + "epoch": 4.868678629690049, + "grad_norm": 0.02642834186553955, + "learning_rate": 0.0009386352787779697, + "loss": 0.1905, + "num_input_tokens_seen": 64421200, + "step": 29845 + }, + { + "epoch": 4.869494290375204, + "grad_norm": 0.030870405957102776, + "learning_rate": 0.0009386011082876863, + "loss": 0.186, + "num_input_tokens_seen": 64431344, + "step": 29850 + }, + { + "epoch": 4.870309951060359, + "grad_norm": 0.028852149844169617, + "learning_rate": 0.000938566928908595, + "loss": 0.0825, + "num_input_tokens_seen": 64442928, + "step": 29855 + }, + { + "epoch": 4.871125611745514, + "grad_norm": 0.010144411586225033, + "learning_rate": 0.0009385327406413883, + "loss": 0.0911, + "num_input_tokens_seen": 64453968, + "step": 29860 + }, + { + "epoch": 4.871941272430669, + "grad_norm": 0.008977201767265797, + "learning_rate": 0.0009384985434867597, + "loss": 0.0549, + "num_input_tokens_seen": 64465008, + "step": 29865 + }, + { + "epoch": 4.872756933115824, + "grad_norm": 0.01899038814008236, + "learning_rate": 0.0009384643374454014, + "loss": 0.1008, + "num_input_tokens_seen": 64474832, + "step": 29870 + }, + { + "epoch": 4.873572593800979, + "grad_norm": 0.07091876119375229, + "learning_rate": 0.0009384301225180074, + "loss": 0.0708, + "num_input_tokens_seen": 64485776, + "step": 29875 + }, + { + "epoch": 4.874388254486134, + "grad_norm": 0.008357093669474125, + "learning_rate": 0.0009383958987052706, + "loss": 0.1755, + "num_input_tokens_seen": 64497392, + "step": 29880 + }, + { + "epoch": 4.875203915171289, + "grad_norm": 0.02689467743039131, + "learning_rate": 0.0009383616660078849, + "loss": 0.0426, + "num_input_tokens_seen": 64508176, + "step": 29885 + }, + { + "epoch": 4.876019575856444, + "grad_norm": 0.17094379663467407, + "learning_rate": 0.0009383274244265438, + "loss": 0.1914, + "num_input_tokens_seen": 64517808, + "step": 29890 + }, + { + "epoch": 4.876835236541599, + "grad_norm": 0.26990625262260437, + "learning_rate": 0.0009382931739619416, + "loss": 0.1533, + "num_input_tokens_seen": 64527696, + "step": 29895 + }, + { + "epoch": 4.877650897226753, + "grad_norm": 0.04622545465826988, + "learning_rate": 0.000938258914614772, + "loss": 0.0438, + "num_input_tokens_seen": 64538448, + "step": 29900 + }, + { + "epoch": 4.878466557911908, + "grad_norm": 0.05918402597308159, + "learning_rate": 0.0009382246463857295, + "loss": 0.0387, + "num_input_tokens_seen": 64548784, + "step": 29905 + }, + { + "epoch": 4.879282218597064, + "grad_norm": 0.007347288075834513, + "learning_rate": 0.0009381903692755087, + "loss": 0.0521, + "num_input_tokens_seen": 64559152, + "step": 29910 + }, + { + "epoch": 4.880097879282219, + "grad_norm": 0.07398077100515366, + "learning_rate": 0.0009381560832848043, + "loss": 0.0278, + "num_input_tokens_seen": 64570096, + "step": 29915 + }, + { + "epoch": 4.8809135399673735, + "grad_norm": 0.007107334211468697, + "learning_rate": 0.0009381217884143109, + "loss": 0.0971, + "num_input_tokens_seen": 64581744, + "step": 29920 + }, + { + "epoch": 4.881729200652528, + "grad_norm": 0.039815500378608704, + "learning_rate": 0.0009380874846647236, + "loss": 0.1407, + "num_input_tokens_seen": 64591856, + "step": 29925 + }, + { + "epoch": 4.882544861337683, + "grad_norm": 0.04699449986219406, + "learning_rate": 0.0009380531720367378, + "loss": 0.028, + "num_input_tokens_seen": 64602192, + "step": 29930 + }, + { + "epoch": 4.883360522022839, + "grad_norm": 0.013091751374304295, + "learning_rate": 0.0009380188505310488, + "loss": 0.0716, + "num_input_tokens_seen": 64612976, + "step": 29935 + }, + { + "epoch": 4.884176182707994, + "grad_norm": 0.06615221500396729, + "learning_rate": 0.0009379845201483519, + "loss": 0.0484, + "num_input_tokens_seen": 64623984, + "step": 29940 + }, + { + "epoch": 4.8849918433931485, + "grad_norm": 0.08346287906169891, + "learning_rate": 0.0009379501808893433, + "loss": 0.0517, + "num_input_tokens_seen": 64634992, + "step": 29945 + }, + { + "epoch": 4.885807504078303, + "grad_norm": 0.011664381250739098, + "learning_rate": 0.0009379158327547186, + "loss": 0.0763, + "num_input_tokens_seen": 64645840, + "step": 29950 + }, + { + "epoch": 4.886623164763458, + "grad_norm": 0.0075580887496471405, + "learning_rate": 0.000937881475745174, + "loss": 0.0778, + "num_input_tokens_seen": 64656016, + "step": 29955 + }, + { + "epoch": 4.887438825448614, + "grad_norm": 0.047186482697725296, + "learning_rate": 0.0009378471098614059, + "loss": 0.0492, + "num_input_tokens_seen": 64667760, + "step": 29960 + }, + { + "epoch": 4.888254486133769, + "grad_norm": 0.26316067576408386, + "learning_rate": 0.0009378127351041106, + "loss": 0.1654, + "num_input_tokens_seen": 64677872, + "step": 29965 + }, + { + "epoch": 4.8890701468189235, + "grad_norm": 0.009862682782113552, + "learning_rate": 0.0009377783514739848, + "loss": 0.0224, + "num_input_tokens_seen": 64688784, + "step": 29970 + }, + { + "epoch": 4.889885807504078, + "grad_norm": 0.035016439855098724, + "learning_rate": 0.0009377439589717254, + "loss": 0.1874, + "num_input_tokens_seen": 64698576, + "step": 29975 + }, + { + "epoch": 4.890701468189233, + "grad_norm": 0.03121376968920231, + "learning_rate": 0.0009377095575980293, + "loss": 0.024, + "num_input_tokens_seen": 64709520, + "step": 29980 + }, + { + "epoch": 4.891517128874388, + "grad_norm": 0.04900757595896721, + "learning_rate": 0.0009376751473535939, + "loss": 0.0519, + "num_input_tokens_seen": 64720592, + "step": 29985 + }, + { + "epoch": 4.892332789559543, + "grad_norm": 0.03388383984565735, + "learning_rate": 0.0009376407282391161, + "loss": 0.1533, + "num_input_tokens_seen": 64731568, + "step": 29990 + }, + { + "epoch": 4.8931484502446985, + "grad_norm": 0.337084025144577, + "learning_rate": 0.0009376063002552939, + "loss": 0.2133, + "num_input_tokens_seen": 64742256, + "step": 29995 + }, + { + "epoch": 4.893964110929853, + "grad_norm": 0.007556947413831949, + "learning_rate": 0.0009375718634028249, + "loss": 0.0209, + "num_input_tokens_seen": 64752208, + "step": 30000 + }, + { + "epoch": 4.894779771615008, + "grad_norm": 0.0036500575952231884, + "learning_rate": 0.0009375374176824071, + "loss": 0.0985, + "num_input_tokens_seen": 64761168, + "step": 30005 + }, + { + "epoch": 4.895595432300163, + "grad_norm": 0.016749773174524307, + "learning_rate": 0.0009375029630947384, + "loss": 0.0294, + "num_input_tokens_seen": 64773040, + "step": 30010 + }, + { + "epoch": 4.896411092985318, + "grad_norm": 0.034434981644153595, + "learning_rate": 0.000937468499640517, + "loss": 0.1449, + "num_input_tokens_seen": 64785584, + "step": 30015 + }, + { + "epoch": 4.897226753670473, + "grad_norm": 0.10326296836137772, + "learning_rate": 0.0009374340273204416, + "loss": 0.0892, + "num_input_tokens_seen": 64797648, + "step": 30020 + }, + { + "epoch": 4.898042414355628, + "grad_norm": 0.011707501485943794, + "learning_rate": 0.0009373995461352107, + "loss": 0.0347, + "num_input_tokens_seen": 64808912, + "step": 30025 + }, + { + "epoch": 4.898858075040783, + "grad_norm": 0.01278750505298376, + "learning_rate": 0.0009373650560855232, + "loss": 0.2603, + "num_input_tokens_seen": 64820336, + "step": 30030 + }, + { + "epoch": 4.899673735725938, + "grad_norm": 0.00990685261785984, + "learning_rate": 0.0009373305571720779, + "loss": 0.0723, + "num_input_tokens_seen": 64831600, + "step": 30035 + }, + { + "epoch": 4.900489396411093, + "grad_norm": 0.0068994141183793545, + "learning_rate": 0.0009372960493955741, + "loss": 0.0334, + "num_input_tokens_seen": 64842000, + "step": 30040 + }, + { + "epoch": 4.901305057096248, + "grad_norm": 0.0627712607383728, + "learning_rate": 0.0009372615327567111, + "loss": 0.0694, + "num_input_tokens_seen": 64853456, + "step": 30045 + }, + { + "epoch": 4.902120717781403, + "grad_norm": 0.009238829836249352, + "learning_rate": 0.0009372270072561885, + "loss": 0.0533, + "num_input_tokens_seen": 64863952, + "step": 30050 + }, + { + "epoch": 4.902936378466558, + "grad_norm": 0.05693186819553375, + "learning_rate": 0.0009371924728947059, + "loss": 0.1067, + "num_input_tokens_seen": 64874128, + "step": 30055 + }, + { + "epoch": 4.903752039151713, + "grad_norm": 0.0053004976361989975, + "learning_rate": 0.0009371579296729631, + "loss": 0.0728, + "num_input_tokens_seen": 64883856, + "step": 30060 + }, + { + "epoch": 4.904567699836868, + "grad_norm": 0.2245243936777115, + "learning_rate": 0.0009371233775916604, + "loss": 0.144, + "num_input_tokens_seen": 64894736, + "step": 30065 + }, + { + "epoch": 4.9053833605220225, + "grad_norm": 0.016815267503261566, + "learning_rate": 0.0009370888166514979, + "loss": 0.0464, + "num_input_tokens_seen": 64905808, + "step": 30070 + }, + { + "epoch": 4.906199021207177, + "grad_norm": 0.03701188415288925, + "learning_rate": 0.0009370542468531761, + "loss": 0.2063, + "num_input_tokens_seen": 64917104, + "step": 30075 + }, + { + "epoch": 4.907014681892333, + "grad_norm": 0.05308428779244423, + "learning_rate": 0.0009370196681973955, + "loss": 0.026, + "num_input_tokens_seen": 64926800, + "step": 30080 + }, + { + "epoch": 4.907830342577488, + "grad_norm": 0.010071882046759129, + "learning_rate": 0.0009369850806848569, + "loss": 0.092, + "num_input_tokens_seen": 64937904, + "step": 30085 + }, + { + "epoch": 4.908646003262643, + "grad_norm": 0.00285824341699481, + "learning_rate": 0.0009369504843162613, + "loss": 0.1552, + "num_input_tokens_seen": 64948592, + "step": 30090 + }, + { + "epoch": 4.9094616639477975, + "grad_norm": 0.06950188428163528, + "learning_rate": 0.0009369158790923098, + "loss": 0.0371, + "num_input_tokens_seen": 64959152, + "step": 30095 + }, + { + "epoch": 4.910277324632952, + "grad_norm": 0.01122577115893364, + "learning_rate": 0.0009368812650137038, + "loss": 0.2674, + "num_input_tokens_seen": 64970736, + "step": 30100 + }, + { + "epoch": 4.911092985318108, + "grad_norm": 0.060130488127470016, + "learning_rate": 0.0009368466420811446, + "loss": 0.1494, + "num_input_tokens_seen": 64982032, + "step": 30105 + }, + { + "epoch": 4.911908646003263, + "grad_norm": 0.15006208419799805, + "learning_rate": 0.0009368120102953341, + "loss": 0.0844, + "num_input_tokens_seen": 64994448, + "step": 30110 + }, + { + "epoch": 4.912724306688418, + "grad_norm": 0.013946138322353363, + "learning_rate": 0.0009367773696569742, + "loss": 0.1014, + "num_input_tokens_seen": 65005744, + "step": 30115 + }, + { + "epoch": 4.9135399673735725, + "grad_norm": 0.03129027783870697, + "learning_rate": 0.0009367427201667667, + "loss": 0.0191, + "num_input_tokens_seen": 65016848, + "step": 30120 + }, + { + "epoch": 4.914355628058727, + "grad_norm": 0.050090089440345764, + "learning_rate": 0.000936708061825414, + "loss": 0.0976, + "num_input_tokens_seen": 65027600, + "step": 30125 + }, + { + "epoch": 4.915171288743883, + "grad_norm": 0.026475775986909866, + "learning_rate": 0.0009366733946336184, + "loss": 0.1046, + "num_input_tokens_seen": 65039120, + "step": 30130 + }, + { + "epoch": 4.915986949429038, + "grad_norm": 0.16325153410434723, + "learning_rate": 0.0009366387185920824, + "loss": 0.0743, + "num_input_tokens_seen": 65050512, + "step": 30135 + }, + { + "epoch": 4.916802610114193, + "grad_norm": 0.1270059496164322, + "learning_rate": 0.0009366040337015089, + "loss": 0.1378, + "num_input_tokens_seen": 65059888, + "step": 30140 + }, + { + "epoch": 4.917618270799347, + "grad_norm": 0.023009872063994408, + "learning_rate": 0.0009365693399626009, + "loss": 0.0363, + "num_input_tokens_seen": 65071120, + "step": 30145 + }, + { + "epoch": 4.918433931484502, + "grad_norm": 0.020160719752311707, + "learning_rate": 0.0009365346373760613, + "loss": 0.1381, + "num_input_tokens_seen": 65081744, + "step": 30150 + }, + { + "epoch": 4.919249592169657, + "grad_norm": 0.021635323762893677, + "learning_rate": 0.0009364999259425935, + "loss": 0.0841, + "num_input_tokens_seen": 65093040, + "step": 30155 + }, + { + "epoch": 4.920065252854813, + "grad_norm": 0.006790952757000923, + "learning_rate": 0.0009364652056629008, + "loss": 0.1244, + "num_input_tokens_seen": 65103056, + "step": 30160 + }, + { + "epoch": 4.920880913539968, + "grad_norm": 0.10219030827283859, + "learning_rate": 0.0009364304765376872, + "loss": 0.0706, + "num_input_tokens_seen": 65113904, + "step": 30165 + }, + { + "epoch": 4.921696574225122, + "grad_norm": 0.17027582228183746, + "learning_rate": 0.0009363957385676563, + "loss": 0.1939, + "num_input_tokens_seen": 65124016, + "step": 30170 + }, + { + "epoch": 4.922512234910277, + "grad_norm": 0.17677275836467743, + "learning_rate": 0.0009363609917535122, + "loss": 0.0319, + "num_input_tokens_seen": 65135120, + "step": 30175 + }, + { + "epoch": 4.923327895595432, + "grad_norm": 0.02091541513800621, + "learning_rate": 0.000936326236095959, + "loss": 0.1336, + "num_input_tokens_seen": 65144784, + "step": 30180 + }, + { + "epoch": 4.924143556280587, + "grad_norm": 0.12887035310268402, + "learning_rate": 0.0009362914715957011, + "loss": 0.1299, + "num_input_tokens_seen": 65154928, + "step": 30185 + }, + { + "epoch": 4.924959216965743, + "grad_norm": 0.04844297096133232, + "learning_rate": 0.000936256698253443, + "loss": 0.0368, + "num_input_tokens_seen": 65165808, + "step": 30190 + }, + { + "epoch": 4.925774877650897, + "grad_norm": 0.08234269171953201, + "learning_rate": 0.0009362219160698895, + "loss": 0.1232, + "num_input_tokens_seen": 65175952, + "step": 30195 + }, + { + "epoch": 4.926590538336052, + "grad_norm": 0.02902955561876297, + "learning_rate": 0.0009361871250457457, + "loss": 0.069, + "num_input_tokens_seen": 65185904, + "step": 30200 + }, + { + "epoch": 4.927406199021207, + "grad_norm": 0.006376279518008232, + "learning_rate": 0.0009361523251817161, + "loss": 0.0863, + "num_input_tokens_seen": 65196848, + "step": 30205 + }, + { + "epoch": 4.928221859706362, + "grad_norm": 0.026694167405366898, + "learning_rate": 0.0009361175164785065, + "loss": 0.1628, + "num_input_tokens_seen": 65207440, + "step": 30210 + }, + { + "epoch": 4.9290375203915175, + "grad_norm": 0.01747024990618229, + "learning_rate": 0.0009360826989368223, + "loss": 0.0559, + "num_input_tokens_seen": 65217904, + "step": 30215 + }, + { + "epoch": 4.929853181076672, + "grad_norm": 0.03206579387187958, + "learning_rate": 0.0009360478725573689, + "loss": 0.0369, + "num_input_tokens_seen": 65228464, + "step": 30220 + }, + { + "epoch": 4.930668841761827, + "grad_norm": 0.23463496565818787, + "learning_rate": 0.0009360130373408522, + "loss": 0.278, + "num_input_tokens_seen": 65239024, + "step": 30225 + }, + { + "epoch": 4.931484502446982, + "grad_norm": 0.1615784466266632, + "learning_rate": 0.000935978193287978, + "loss": 0.1004, + "num_input_tokens_seen": 65249872, + "step": 30230 + }, + { + "epoch": 4.932300163132137, + "grad_norm": 0.06594087183475494, + "learning_rate": 0.0009359433403994529, + "loss": 0.1209, + "num_input_tokens_seen": 65260560, + "step": 30235 + }, + { + "epoch": 4.933115823817292, + "grad_norm": 0.04537193849682808, + "learning_rate": 0.0009359084786759828, + "loss": 0.0269, + "num_input_tokens_seen": 65272016, + "step": 30240 + }, + { + "epoch": 4.933931484502447, + "grad_norm": 0.02021726779639721, + "learning_rate": 0.0009358736081182746, + "loss": 0.0542, + "num_input_tokens_seen": 65281136, + "step": 30245 + }, + { + "epoch": 4.934747145187602, + "grad_norm": 0.1692354679107666, + "learning_rate": 0.0009358387287270346, + "loss": 0.1647, + "num_input_tokens_seen": 65290640, + "step": 30250 + }, + { + "epoch": 4.935562805872757, + "grad_norm": 0.22962255775928497, + "learning_rate": 0.0009358038405029699, + "loss": 0.2791, + "num_input_tokens_seen": 65302032, + "step": 30255 + }, + { + "epoch": 4.936378466557912, + "grad_norm": 0.11426787823438644, + "learning_rate": 0.0009357689434467875, + "loss": 0.2355, + "num_input_tokens_seen": 65313168, + "step": 30260 + }, + { + "epoch": 4.937194127243067, + "grad_norm": 0.023075003176927567, + "learning_rate": 0.0009357340375591947, + "loss": 0.0213, + "num_input_tokens_seen": 65324528, + "step": 30265 + }, + { + "epoch": 4.938009787928221, + "grad_norm": 0.15527451038360596, + "learning_rate": 0.0009356991228408988, + "loss": 0.0822, + "num_input_tokens_seen": 65335568, + "step": 30270 + }, + { + "epoch": 4.938825448613377, + "grad_norm": 0.08529616892337799, + "learning_rate": 0.0009356641992926075, + "loss": 0.2065, + "num_input_tokens_seen": 65346032, + "step": 30275 + }, + { + "epoch": 4.939641109298532, + "grad_norm": 0.015049923211336136, + "learning_rate": 0.0009356292669150286, + "loss": 0.1805, + "num_input_tokens_seen": 65357712, + "step": 30280 + }, + { + "epoch": 4.940456769983687, + "grad_norm": 0.005308869294822216, + "learning_rate": 0.0009355943257088698, + "loss": 0.0882, + "num_input_tokens_seen": 65369552, + "step": 30285 + }, + { + "epoch": 4.941272430668842, + "grad_norm": 0.03604280576109886, + "learning_rate": 0.0009355593756748395, + "loss": 0.0936, + "num_input_tokens_seen": 65380912, + "step": 30290 + }, + { + "epoch": 4.942088091353996, + "grad_norm": 0.008536944165825844, + "learning_rate": 0.0009355244168136459, + "loss": 0.2753, + "num_input_tokens_seen": 65392080, + "step": 30295 + }, + { + "epoch": 4.942903752039152, + "grad_norm": 0.016893096268177032, + "learning_rate": 0.0009354894491259975, + "loss": 0.0457, + "num_input_tokens_seen": 65402256, + "step": 30300 + }, + { + "epoch": 4.943719412724307, + "grad_norm": 0.06712765991687775, + "learning_rate": 0.0009354544726126029, + "loss": 0.1581, + "num_input_tokens_seen": 65413456, + "step": 30305 + }, + { + "epoch": 4.944535073409462, + "grad_norm": 0.02855013683438301, + "learning_rate": 0.000935419487274171, + "loss": 0.0881, + "num_input_tokens_seen": 65424912, + "step": 30310 + }, + { + "epoch": 4.945350734094617, + "grad_norm": 0.020487578585743904, + "learning_rate": 0.0009353844931114108, + "loss": 0.0284, + "num_input_tokens_seen": 65434896, + "step": 30315 + }, + { + "epoch": 4.946166394779771, + "grad_norm": 0.0737096443772316, + "learning_rate": 0.0009353494901250316, + "loss": 0.0506, + "num_input_tokens_seen": 65445424, + "step": 30320 + }, + { + "epoch": 4.946982055464927, + "grad_norm": 0.009225163608789444, + "learning_rate": 0.0009353144783157428, + "loss": 0.0633, + "num_input_tokens_seen": 65454672, + "step": 30325 + }, + { + "epoch": 4.947797716150082, + "grad_norm": 0.009523071348667145, + "learning_rate": 0.0009352794576842536, + "loss": 0.0622, + "num_input_tokens_seen": 65465776, + "step": 30330 + }, + { + "epoch": 4.948613376835237, + "grad_norm": 0.1992601603269577, + "learning_rate": 0.0009352444282312742, + "loss": 0.2305, + "num_input_tokens_seen": 65477168, + "step": 30335 + }, + { + "epoch": 4.9494290375203915, + "grad_norm": 0.02317291870713234, + "learning_rate": 0.0009352093899575143, + "loss": 0.0103, + "num_input_tokens_seen": 65487248, + "step": 30340 + }, + { + "epoch": 4.950244698205546, + "grad_norm": 0.03341202810406685, + "learning_rate": 0.0009351743428636838, + "loss": 0.0737, + "num_input_tokens_seen": 65497360, + "step": 30345 + }, + { + "epoch": 4.951060358890701, + "grad_norm": 0.04441145807504654, + "learning_rate": 0.0009351392869504934, + "loss": 0.1009, + "num_input_tokens_seen": 65507728, + "step": 30350 + }, + { + "epoch": 4.951876019575856, + "grad_norm": 0.03949672356247902, + "learning_rate": 0.0009351042222186533, + "loss": 0.0428, + "num_input_tokens_seen": 65517104, + "step": 30355 + }, + { + "epoch": 4.952691680261012, + "grad_norm": 0.11418640613555908, + "learning_rate": 0.0009350691486688743, + "loss": 0.163, + "num_input_tokens_seen": 65529168, + "step": 30360 + }, + { + "epoch": 4.9535073409461665, + "grad_norm": 0.081434465944767, + "learning_rate": 0.0009350340663018668, + "loss": 0.0518, + "num_input_tokens_seen": 65538960, + "step": 30365 + }, + { + "epoch": 4.954323001631321, + "grad_norm": 0.0019200535025447607, + "learning_rate": 0.0009349989751183422, + "loss": 0.0183, + "num_input_tokens_seen": 65549392, + "step": 30370 + }, + { + "epoch": 4.955138662316476, + "grad_norm": 0.11670206487178802, + "learning_rate": 0.0009349638751190115, + "loss": 0.0669, + "num_input_tokens_seen": 65559824, + "step": 30375 + }, + { + "epoch": 4.955954323001631, + "grad_norm": 0.22959190607070923, + "learning_rate": 0.0009349287663045862, + "loss": 0.1679, + "num_input_tokens_seen": 65570896, + "step": 30380 + }, + { + "epoch": 4.956769983686787, + "grad_norm": 0.01145635824650526, + "learning_rate": 0.0009348936486757775, + "loss": 0.0665, + "num_input_tokens_seen": 65582384, + "step": 30385 + }, + { + "epoch": 4.9575856443719415, + "grad_norm": 0.14921236038208008, + "learning_rate": 0.0009348585222332975, + "loss": 0.3085, + "num_input_tokens_seen": 65593264, + "step": 30390 + }, + { + "epoch": 4.958401305057096, + "grad_norm": 0.03834659978747368, + "learning_rate": 0.0009348233869778577, + "loss": 0.2899, + "num_input_tokens_seen": 65604016, + "step": 30395 + }, + { + "epoch": 4.959216965742251, + "grad_norm": 0.1616576462984085, + "learning_rate": 0.0009347882429101706, + "loss": 0.0717, + "num_input_tokens_seen": 65613616, + "step": 30400 + }, + { + "epoch": 4.960032626427406, + "grad_norm": 0.1412140429019928, + "learning_rate": 0.000934753090030948, + "loss": 0.206, + "num_input_tokens_seen": 65624816, + "step": 30405 + }, + { + "epoch": 4.960848287112562, + "grad_norm": 0.13235744833946228, + "learning_rate": 0.0009347179283409027, + "loss": 0.2584, + "num_input_tokens_seen": 65635792, + "step": 30410 + }, + { + "epoch": 4.9616639477977165, + "grad_norm": 0.05358226224780083, + "learning_rate": 0.0009346827578407468, + "loss": 0.1738, + "num_input_tokens_seen": 65646512, + "step": 30415 + }, + { + "epoch": 4.962479608482871, + "grad_norm": 0.05593210086226463, + "learning_rate": 0.0009346475785311936, + "loss": 0.104, + "num_input_tokens_seen": 65656976, + "step": 30420 + }, + { + "epoch": 4.963295269168026, + "grad_norm": 0.14764393866062164, + "learning_rate": 0.0009346123904129558, + "loss": 0.1473, + "num_input_tokens_seen": 65667792, + "step": 30425 + }, + { + "epoch": 4.964110929853181, + "grad_norm": 0.3284737467765808, + "learning_rate": 0.0009345771934867464, + "loss": 0.1525, + "num_input_tokens_seen": 65679088, + "step": 30430 + }, + { + "epoch": 4.964926590538336, + "grad_norm": 0.12388251721858978, + "learning_rate": 0.000934541987753279, + "loss": 0.0836, + "num_input_tokens_seen": 65689840, + "step": 30435 + }, + { + "epoch": 4.9657422512234906, + "grad_norm": 0.06869780272245407, + "learning_rate": 0.0009345067732132671, + "loss": 0.1036, + "num_input_tokens_seen": 65700528, + "step": 30440 + }, + { + "epoch": 4.966557911908646, + "grad_norm": 0.06497956812381744, + "learning_rate": 0.0009344715498674241, + "loss": 0.0683, + "num_input_tokens_seen": 65710128, + "step": 30445 + }, + { + "epoch": 4.967373572593801, + "grad_norm": 0.019313139840960503, + "learning_rate": 0.0009344363177164639, + "loss": 0.1687, + "num_input_tokens_seen": 65722064, + "step": 30450 + }, + { + "epoch": 4.968189233278956, + "grad_norm": 0.15426386892795563, + "learning_rate": 0.0009344010767611007, + "loss": 0.1447, + "num_input_tokens_seen": 65732336, + "step": 30455 + }, + { + "epoch": 4.969004893964111, + "grad_norm": 0.016518309712409973, + "learning_rate": 0.0009343658270020485, + "loss": 0.0486, + "num_input_tokens_seen": 65743504, + "step": 30460 + }, + { + "epoch": 4.9698205546492655, + "grad_norm": 0.017250383272767067, + "learning_rate": 0.000934330568440022, + "loss": 0.0786, + "num_input_tokens_seen": 65754608, + "step": 30465 + }, + { + "epoch": 4.970636215334421, + "grad_norm": 0.0339357890188694, + "learning_rate": 0.0009342953010757353, + "loss": 0.0841, + "num_input_tokens_seen": 65765584, + "step": 30470 + }, + { + "epoch": 4.971451876019576, + "grad_norm": 0.015469072386622429, + "learning_rate": 0.0009342600249099036, + "loss": 0.186, + "num_input_tokens_seen": 65776816, + "step": 30475 + }, + { + "epoch": 4.972267536704731, + "grad_norm": 0.16749094426631927, + "learning_rate": 0.0009342247399432414, + "loss": 0.1858, + "num_input_tokens_seen": 65788336, + "step": 30480 + }, + { + "epoch": 4.973083197389886, + "grad_norm": 0.0026593629736453295, + "learning_rate": 0.0009341894461764641, + "loss": 0.2986, + "num_input_tokens_seen": 65798704, + "step": 30485 + }, + { + "epoch": 4.9738988580750405, + "grad_norm": 0.09327977895736694, + "learning_rate": 0.0009341541436102868, + "loss": 0.0575, + "num_input_tokens_seen": 65808816, + "step": 30490 + }, + { + "epoch": 4.974714518760196, + "grad_norm": 0.07371420413255692, + "learning_rate": 0.0009341188322454251, + "loss": 0.0586, + "num_input_tokens_seen": 65819088, + "step": 30495 + }, + { + "epoch": 4.975530179445351, + "grad_norm": 0.16261614859104156, + "learning_rate": 0.0009340835120825946, + "loss": 0.2204, + "num_input_tokens_seen": 65830256, + "step": 30500 + }, + { + "epoch": 4.976345840130506, + "grad_norm": 0.015217346139252186, + "learning_rate": 0.0009340481831225109, + "loss": 0.1069, + "num_input_tokens_seen": 65839792, + "step": 30505 + }, + { + "epoch": 4.977161500815661, + "grad_norm": 0.029433144256472588, + "learning_rate": 0.0009340128453658902, + "loss": 0.0577, + "num_input_tokens_seen": 65850512, + "step": 30510 + }, + { + "epoch": 4.9779771615008155, + "grad_norm": 0.03517317771911621, + "learning_rate": 0.0009339774988134487, + "loss": 0.0816, + "num_input_tokens_seen": 65861584, + "step": 30515 + }, + { + "epoch": 4.97879282218597, + "grad_norm": 0.06163543090224266, + "learning_rate": 0.0009339421434659025, + "loss": 0.178, + "num_input_tokens_seen": 65872880, + "step": 30520 + }, + { + "epoch": 4.979608482871125, + "grad_norm": 0.03921574354171753, + "learning_rate": 0.0009339067793239682, + "loss": 0.091, + "num_input_tokens_seen": 65882544, + "step": 30525 + }, + { + "epoch": 4.980424143556281, + "grad_norm": 0.05431029573082924, + "learning_rate": 0.0009338714063883627, + "loss": 0.0602, + "num_input_tokens_seen": 65893456, + "step": 30530 + }, + { + "epoch": 4.981239804241436, + "grad_norm": 0.03704000636935234, + "learning_rate": 0.0009338360246598028, + "loss": 0.0391, + "num_input_tokens_seen": 65905264, + "step": 30535 + }, + { + "epoch": 4.9820554649265905, + "grad_norm": 0.12698346376419067, + "learning_rate": 0.0009338006341390053, + "loss": 0.2379, + "num_input_tokens_seen": 65916400, + "step": 30540 + }, + { + "epoch": 4.982871125611745, + "grad_norm": 0.03991067782044411, + "learning_rate": 0.0009337652348266879, + "loss": 0.1004, + "num_input_tokens_seen": 65928272, + "step": 30545 + }, + { + "epoch": 4.9836867862969, + "grad_norm": 0.08072703331708908, + "learning_rate": 0.0009337298267235675, + "loss": 0.0471, + "num_input_tokens_seen": 65939792, + "step": 30550 + }, + { + "epoch": 4.984502446982056, + "grad_norm": 0.019922945648431778, + "learning_rate": 0.0009336944098303621, + "loss": 0.068, + "num_input_tokens_seen": 65950576, + "step": 30555 + }, + { + "epoch": 4.985318107667211, + "grad_norm": 0.07543916255235672, + "learning_rate": 0.0009336589841477893, + "loss": 0.1996, + "num_input_tokens_seen": 65960240, + "step": 30560 + }, + { + "epoch": 4.986133768352365, + "grad_norm": 0.04474138095974922, + "learning_rate": 0.0009336235496765669, + "loss": 0.0501, + "num_input_tokens_seen": 65970544, + "step": 30565 + }, + { + "epoch": 4.98694942903752, + "grad_norm": 0.013572338037192822, + "learning_rate": 0.0009335881064174134, + "loss": 0.0228, + "num_input_tokens_seen": 65982768, + "step": 30570 + }, + { + "epoch": 4.987765089722675, + "grad_norm": 0.05894570052623749, + "learning_rate": 0.0009335526543710466, + "loss": 0.0459, + "num_input_tokens_seen": 65993680, + "step": 30575 + }, + { + "epoch": 4.988580750407831, + "grad_norm": 0.16921456158161163, + "learning_rate": 0.0009335171935381854, + "loss": 0.2059, + "num_input_tokens_seen": 66005136, + "step": 30580 + }, + { + "epoch": 4.989396411092986, + "grad_norm": 0.10699997842311859, + "learning_rate": 0.0009334817239195483, + "loss": 0.1509, + "num_input_tokens_seen": 66015376, + "step": 30585 + }, + { + "epoch": 4.99021207177814, + "grad_norm": 0.190328449010849, + "learning_rate": 0.0009334462455158543, + "loss": 0.0686, + "num_input_tokens_seen": 66025904, + "step": 30590 + }, + { + "epoch": 4.991027732463295, + "grad_norm": 0.11635138094425201, + "learning_rate": 0.0009334107583278222, + "loss": 0.0932, + "num_input_tokens_seen": 66036560, + "step": 30595 + }, + { + "epoch": 4.99184339314845, + "grad_norm": 0.0913156270980835, + "learning_rate": 0.0009333752623561711, + "loss": 0.11, + "num_input_tokens_seen": 66047856, + "step": 30600 + }, + { + "epoch": 4.992659053833605, + "grad_norm": 0.008136956952512264, + "learning_rate": 0.0009333397576016207, + "loss": 0.0888, + "num_input_tokens_seen": 66058736, + "step": 30605 + }, + { + "epoch": 4.993474714518761, + "grad_norm": 0.16477340459823608, + "learning_rate": 0.0009333042440648903, + "loss": 0.0998, + "num_input_tokens_seen": 66068848, + "step": 30610 + }, + { + "epoch": 4.994290375203915, + "grad_norm": 0.18428614735603333, + "learning_rate": 0.0009332687217466997, + "loss": 0.1477, + "num_input_tokens_seen": 66079760, + "step": 30615 + }, + { + "epoch": 4.99510603588907, + "grad_norm": 0.005767988972365856, + "learning_rate": 0.000933233190647769, + "loss": 0.0721, + "num_input_tokens_seen": 66090128, + "step": 30620 + }, + { + "epoch": 4.995921696574225, + "grad_norm": 0.23842494189739227, + "learning_rate": 0.0009331976507688178, + "loss": 0.1572, + "num_input_tokens_seen": 66100304, + "step": 30625 + }, + { + "epoch": 4.99673735725938, + "grad_norm": 0.13110394775867462, + "learning_rate": 0.0009331621021105668, + "loss": 0.2062, + "num_input_tokens_seen": 66111696, + "step": 30630 + }, + { + "epoch": 4.997553017944535, + "grad_norm": 0.20291487872600555, + "learning_rate": 0.0009331265446737364, + "loss": 0.1659, + "num_input_tokens_seen": 66122064, + "step": 30635 + }, + { + "epoch": 4.99836867862969, + "grad_norm": 0.04522571712732315, + "learning_rate": 0.0009330909784590469, + "loss": 0.0256, + "num_input_tokens_seen": 66131376, + "step": 30640 + }, + { + "epoch": 4.999184339314845, + "grad_norm": 0.011287901550531387, + "learning_rate": 0.0009330554034672194, + "loss": 0.0701, + "num_input_tokens_seen": 66142800, + "step": 30645 + }, + { + "epoch": 5.0, + "grad_norm": 0.026959935203194618, + "learning_rate": 0.0009330198196989749, + "loss": 0.0926, + "num_input_tokens_seen": 66152480, + "step": 30650 + }, + { + "epoch": 5.0, + "eval_loss": 0.11134487390518188, + "eval_runtime": 103.8647, + "eval_samples_per_second": 26.236, + "eval_steps_per_second": 6.566, + "num_input_tokens_seen": 66152480, + "step": 30650 + }, + { + "epoch": 5.000815660685155, + "grad_norm": 0.06716465950012207, + "learning_rate": 0.0009329842271550342, + "loss": 0.0348, + "num_input_tokens_seen": 66162912, + "step": 30655 + }, + { + "epoch": 5.00163132137031, + "grad_norm": 0.12330807745456696, + "learning_rate": 0.0009329486258361191, + "loss": 0.0775, + "num_input_tokens_seen": 66173312, + "step": 30660 + }, + { + "epoch": 5.002446982055465, + "grad_norm": 0.03965310752391815, + "learning_rate": 0.0009329130157429507, + "loss": 0.0883, + "num_input_tokens_seen": 66184832, + "step": 30665 + }, + { + "epoch": 5.00326264274062, + "grad_norm": 0.05287100747227669, + "learning_rate": 0.000932877396876251, + "loss": 0.0633, + "num_input_tokens_seen": 66196544, + "step": 30670 + }, + { + "epoch": 5.004078303425775, + "grad_norm": 0.006737517658621073, + "learning_rate": 0.0009328417692367415, + "loss": 0.0268, + "num_input_tokens_seen": 66207904, + "step": 30675 + }, + { + "epoch": 5.00489396411093, + "grad_norm": 0.004301621112972498, + "learning_rate": 0.0009328061328251445, + "loss": 0.0279, + "num_input_tokens_seen": 66218752, + "step": 30680 + }, + { + "epoch": 5.005709624796085, + "grad_norm": 0.021246733143925667, + "learning_rate": 0.0009327704876421824, + "loss": 0.0398, + "num_input_tokens_seen": 66230528, + "step": 30685 + }, + { + "epoch": 5.006525285481239, + "grad_norm": 0.13751445710659027, + "learning_rate": 0.000932734833688577, + "loss": 0.0955, + "num_input_tokens_seen": 66240768, + "step": 30690 + }, + { + "epoch": 5.007340946166395, + "grad_norm": 0.2667020261287689, + "learning_rate": 0.0009326991709650514, + "loss": 0.1222, + "num_input_tokens_seen": 66252672, + "step": 30695 + }, + { + "epoch": 5.00815660685155, + "grad_norm": 0.07113843411207199, + "learning_rate": 0.0009326634994723282, + "loss": 0.0652, + "num_input_tokens_seen": 66263520, + "step": 30700 + }, + { + "epoch": 5.008972267536705, + "grad_norm": 0.0024796391371637583, + "learning_rate": 0.0009326278192111304, + "loss": 0.0114, + "num_input_tokens_seen": 66274752, + "step": 30705 + }, + { + "epoch": 5.00978792822186, + "grad_norm": 0.026515653356909752, + "learning_rate": 0.0009325921301821809, + "loss": 0.4077, + "num_input_tokens_seen": 66285440, + "step": 30710 + }, + { + "epoch": 5.010603588907014, + "grad_norm": 0.028287431225180626, + "learning_rate": 0.000932556432386203, + "loss": 0.0222, + "num_input_tokens_seen": 66295040, + "step": 30715 + }, + { + "epoch": 5.011419249592169, + "grad_norm": 0.010041406378149986, + "learning_rate": 0.0009325207258239204, + "loss": 0.0262, + "num_input_tokens_seen": 66304384, + "step": 30720 + }, + { + "epoch": 5.012234910277325, + "grad_norm": 0.03483397141098976, + "learning_rate": 0.0009324850104960566, + "loss": 0.157, + "num_input_tokens_seen": 66315648, + "step": 30725 + }, + { + "epoch": 5.01305057096248, + "grad_norm": 0.005746485199779272, + "learning_rate": 0.0009324492864033354, + "loss": 0.067, + "num_input_tokens_seen": 66325664, + "step": 30730 + }, + { + "epoch": 5.013866231647635, + "grad_norm": 0.026695504784584045, + "learning_rate": 0.0009324135535464808, + "loss": 0.0784, + "num_input_tokens_seen": 66336544, + "step": 30735 + }, + { + "epoch": 5.014681892332789, + "grad_norm": 0.02534736692905426, + "learning_rate": 0.000932377811926217, + "loss": 0.0267, + "num_input_tokens_seen": 66347360, + "step": 30740 + }, + { + "epoch": 5.015497553017944, + "grad_norm": 0.0139123210683465, + "learning_rate": 0.0009323420615432683, + "loss": 0.0385, + "num_input_tokens_seen": 66356992, + "step": 30745 + }, + { + "epoch": 5.0163132137031, + "grad_norm": 0.03355789557099342, + "learning_rate": 0.0009323063023983593, + "loss": 0.0407, + "num_input_tokens_seen": 66368032, + "step": 30750 + }, + { + "epoch": 5.017128874388255, + "grad_norm": 0.06933846324682236, + "learning_rate": 0.0009322705344922146, + "loss": 0.0905, + "num_input_tokens_seen": 66379424, + "step": 30755 + }, + { + "epoch": 5.0179445350734095, + "grad_norm": 0.028216278180480003, + "learning_rate": 0.0009322347578255592, + "loss": 0.036, + "num_input_tokens_seen": 66390752, + "step": 30760 + }, + { + "epoch": 5.018760195758564, + "grad_norm": 0.06517303735017776, + "learning_rate": 0.0009321989723991181, + "loss": 0.0584, + "num_input_tokens_seen": 66403200, + "step": 30765 + }, + { + "epoch": 5.019575856443719, + "grad_norm": 0.018046068027615547, + "learning_rate": 0.0009321631782136166, + "loss": 0.0155, + "num_input_tokens_seen": 66412768, + "step": 30770 + }, + { + "epoch": 5.020391517128874, + "grad_norm": 0.1381438821554184, + "learning_rate": 0.0009321273752697798, + "loss": 0.0418, + "num_input_tokens_seen": 66424448, + "step": 30775 + }, + { + "epoch": 5.02120717781403, + "grad_norm": 0.019063521176576614, + "learning_rate": 0.0009320915635683338, + "loss": 0.0996, + "num_input_tokens_seen": 66434400, + "step": 30780 + }, + { + "epoch": 5.0220228384991845, + "grad_norm": 0.009055123664438725, + "learning_rate": 0.0009320557431100041, + "loss": 0.2989, + "num_input_tokens_seen": 66445440, + "step": 30785 + }, + { + "epoch": 5.022838499184339, + "grad_norm": 0.15745800733566284, + "learning_rate": 0.0009320199138955165, + "loss": 0.1453, + "num_input_tokens_seen": 66455776, + "step": 30790 + }, + { + "epoch": 5.023654159869494, + "grad_norm": 0.003833366557955742, + "learning_rate": 0.0009319840759255976, + "loss": 0.0094, + "num_input_tokens_seen": 66467616, + "step": 30795 + }, + { + "epoch": 5.024469820554649, + "grad_norm": 0.021754974499344826, + "learning_rate": 0.0009319482292009731, + "loss": 0.0623, + "num_input_tokens_seen": 66477056, + "step": 30800 + }, + { + "epoch": 5.025285481239805, + "grad_norm": 0.016967935487627983, + "learning_rate": 0.0009319123737223698, + "loss": 0.1068, + "num_input_tokens_seen": 66488352, + "step": 30805 + }, + { + "epoch": 5.0261011419249595, + "grad_norm": 0.022289754822850227, + "learning_rate": 0.0009318765094905144, + "loss": 0.0498, + "num_input_tokens_seen": 66497824, + "step": 30810 + }, + { + "epoch": 5.026916802610114, + "grad_norm": 0.010767526924610138, + "learning_rate": 0.0009318406365061336, + "loss": 0.0472, + "num_input_tokens_seen": 66509920, + "step": 30815 + }, + { + "epoch": 5.027732463295269, + "grad_norm": 0.08797360956668854, + "learning_rate": 0.0009318047547699546, + "loss": 0.0362, + "num_input_tokens_seen": 66519520, + "step": 30820 + }, + { + "epoch": 5.028548123980424, + "grad_norm": 0.02853584848344326, + "learning_rate": 0.0009317688642827044, + "loss": 0.0682, + "num_input_tokens_seen": 66529792, + "step": 30825 + }, + { + "epoch": 5.029363784665579, + "grad_norm": 0.19071421027183533, + "learning_rate": 0.0009317329650451103, + "loss": 0.1362, + "num_input_tokens_seen": 66540416, + "step": 30830 + }, + { + "epoch": 5.0301794453507345, + "grad_norm": 0.06761402636766434, + "learning_rate": 0.0009316970570579002, + "loss": 0.0265, + "num_input_tokens_seen": 66549280, + "step": 30835 + }, + { + "epoch": 5.030995106035889, + "grad_norm": 0.014102579094469547, + "learning_rate": 0.0009316611403218013, + "loss": 0.019, + "num_input_tokens_seen": 66560640, + "step": 30840 + }, + { + "epoch": 5.031810766721044, + "grad_norm": 0.06226198747754097, + "learning_rate": 0.000931625214837542, + "loss": 0.089, + "num_input_tokens_seen": 66571808, + "step": 30845 + }, + { + "epoch": 5.032626427406199, + "grad_norm": 0.35398319363594055, + "learning_rate": 0.0009315892806058501, + "loss": 0.2631, + "num_input_tokens_seen": 66583424, + "step": 30850 + }, + { + "epoch": 5.033442088091354, + "grad_norm": 0.1936490833759308, + "learning_rate": 0.0009315533376274541, + "loss": 0.224, + "num_input_tokens_seen": 66593984, + "step": 30855 + }, + { + "epoch": 5.034257748776509, + "grad_norm": 0.04159780964255333, + "learning_rate": 0.0009315173859030821, + "loss": 0.0488, + "num_input_tokens_seen": 66604704, + "step": 30860 + }, + { + "epoch": 5.035073409461664, + "grad_norm": 0.1702297031879425, + "learning_rate": 0.0009314814254334627, + "loss": 0.1724, + "num_input_tokens_seen": 66615808, + "step": 30865 + }, + { + "epoch": 5.035889070146819, + "grad_norm": 0.0423305407166481, + "learning_rate": 0.000931445456219325, + "loss": 0.0243, + "num_input_tokens_seen": 66627008, + "step": 30870 + }, + { + "epoch": 5.036704730831974, + "grad_norm": 0.01736651174724102, + "learning_rate": 0.0009314094782613977, + "loss": 0.0573, + "num_input_tokens_seen": 66638464, + "step": 30875 + }, + { + "epoch": 5.037520391517129, + "grad_norm": 0.12750132381916046, + "learning_rate": 0.0009313734915604103, + "loss": 0.0513, + "num_input_tokens_seen": 66648544, + "step": 30880 + }, + { + "epoch": 5.0383360522022835, + "grad_norm": 0.06630299240350723, + "learning_rate": 0.0009313374961170917, + "loss": 0.0888, + "num_input_tokens_seen": 66660384, + "step": 30885 + }, + { + "epoch": 5.039151712887439, + "grad_norm": 0.05274463817477226, + "learning_rate": 0.0009313014919321715, + "loss": 0.1432, + "num_input_tokens_seen": 66671136, + "step": 30890 + }, + { + "epoch": 5.039967373572594, + "grad_norm": 0.33410975337028503, + "learning_rate": 0.0009312654790063795, + "loss": 0.1383, + "num_input_tokens_seen": 66681664, + "step": 30895 + }, + { + "epoch": 5.040783034257749, + "grad_norm": 0.02053641900420189, + "learning_rate": 0.0009312294573404454, + "loss": 0.0594, + "num_input_tokens_seen": 66691968, + "step": 30900 + }, + { + "epoch": 5.041598694942904, + "grad_norm": 0.016985422000288963, + "learning_rate": 0.0009311934269350993, + "loss": 0.0628, + "num_input_tokens_seen": 66703488, + "step": 30905 + }, + { + "epoch": 5.0424143556280585, + "grad_norm": 0.004801221191883087, + "learning_rate": 0.0009311573877910716, + "loss": 0.075, + "num_input_tokens_seen": 66714304, + "step": 30910 + }, + { + "epoch": 5.043230016313213, + "grad_norm": 0.01780622825026512, + "learning_rate": 0.0009311213399090921, + "loss": 0.0683, + "num_input_tokens_seen": 66724864, + "step": 30915 + }, + { + "epoch": 5.044045676998369, + "grad_norm": 0.108795665204525, + "learning_rate": 0.000931085283289892, + "loss": 0.1411, + "num_input_tokens_seen": 66736736, + "step": 30920 + }, + { + "epoch": 5.044861337683524, + "grad_norm": 0.10420024394989014, + "learning_rate": 0.0009310492179342016, + "loss": 0.1291, + "num_input_tokens_seen": 66748128, + "step": 30925 + }, + { + "epoch": 5.045676998368679, + "grad_norm": 0.032937731593847275, + "learning_rate": 0.0009310131438427521, + "loss": 0.0332, + "num_input_tokens_seen": 66759200, + "step": 30930 + }, + { + "epoch": 5.0464926590538335, + "grad_norm": 0.011512907221913338, + "learning_rate": 0.0009309770610162744, + "loss": 0.17, + "num_input_tokens_seen": 66770720, + "step": 30935 + }, + { + "epoch": 5.047308319738988, + "grad_norm": 0.04573098197579384, + "learning_rate": 0.0009309409694555, + "loss": 0.1237, + "num_input_tokens_seen": 66781312, + "step": 30940 + }, + { + "epoch": 5.048123980424143, + "grad_norm": 0.03466716408729553, + "learning_rate": 0.0009309048691611599, + "loss": 0.035, + "num_input_tokens_seen": 66792864, + "step": 30945 + }, + { + "epoch": 5.048939641109299, + "grad_norm": 0.15388724207878113, + "learning_rate": 0.0009308687601339861, + "loss": 0.1296, + "num_input_tokens_seen": 66802976, + "step": 30950 + }, + { + "epoch": 5.049755301794454, + "grad_norm": 0.05649597942829132, + "learning_rate": 0.0009308326423747103, + "loss": 0.0324, + "num_input_tokens_seen": 66812768, + "step": 30955 + }, + { + "epoch": 5.0505709624796085, + "grad_norm": 0.017839884385466576, + "learning_rate": 0.0009307965158840644, + "loss": 0.1059, + "num_input_tokens_seen": 66823072, + "step": 30960 + }, + { + "epoch": 5.051386623164763, + "grad_norm": 0.050337255001068115, + "learning_rate": 0.0009307603806627807, + "loss": 0.0242, + "num_input_tokens_seen": 66835200, + "step": 30965 + }, + { + "epoch": 5.052202283849918, + "grad_norm": 0.007979363203048706, + "learning_rate": 0.0009307242367115914, + "loss": 0.0345, + "num_input_tokens_seen": 66846528, + "step": 30970 + }, + { + "epoch": 5.053017944535074, + "grad_norm": 0.02444876916706562, + "learning_rate": 0.000930688084031229, + "loss": 0.1061, + "num_input_tokens_seen": 66858080, + "step": 30975 + }, + { + "epoch": 5.053833605220229, + "grad_norm": 0.04180579259991646, + "learning_rate": 0.0009306519226224262, + "loss": 0.1218, + "num_input_tokens_seen": 66869888, + "step": 30980 + }, + { + "epoch": 5.054649265905383, + "grad_norm": 0.29583466053009033, + "learning_rate": 0.0009306157524859158, + "loss": 0.1678, + "num_input_tokens_seen": 66880256, + "step": 30985 + }, + { + "epoch": 5.055464926590538, + "grad_norm": 0.05580262467265129, + "learning_rate": 0.000930579573622431, + "loss": 0.0623, + "num_input_tokens_seen": 66890784, + "step": 30990 + }, + { + "epoch": 5.056280587275693, + "grad_norm": 0.02565973810851574, + "learning_rate": 0.0009305433860327049, + "loss": 0.0525, + "num_input_tokens_seen": 66901920, + "step": 30995 + }, + { + "epoch": 5.057096247960848, + "grad_norm": 0.1978929191827774, + "learning_rate": 0.0009305071897174708, + "loss": 0.1716, + "num_input_tokens_seen": 66912608, + "step": 31000 + }, + { + "epoch": 5.057911908646004, + "grad_norm": 0.02066556178033352, + "learning_rate": 0.0009304709846774625, + "loss": 0.1252, + "num_input_tokens_seen": 66922336, + "step": 31005 + }, + { + "epoch": 5.058727569331158, + "grad_norm": 0.06892592459917068, + "learning_rate": 0.0009304347709134136, + "loss": 0.1394, + "num_input_tokens_seen": 66932960, + "step": 31010 + }, + { + "epoch": 5.059543230016313, + "grad_norm": 0.012479208409786224, + "learning_rate": 0.000930398548426058, + "loss": 0.1827, + "num_input_tokens_seen": 66944448, + "step": 31015 + }, + { + "epoch": 5.060358890701468, + "grad_norm": 0.042949240654706955, + "learning_rate": 0.0009303623172161298, + "loss": 0.0888, + "num_input_tokens_seen": 66955264, + "step": 31020 + }, + { + "epoch": 5.061174551386623, + "grad_norm": 0.01852462813258171, + "learning_rate": 0.0009303260772843632, + "loss": 0.0219, + "num_input_tokens_seen": 66965024, + "step": 31025 + }, + { + "epoch": 5.061990212071779, + "grad_norm": 0.018405335023999214, + "learning_rate": 0.0009302898286314929, + "loss": 0.2151, + "num_input_tokens_seen": 66975072, + "step": 31030 + }, + { + "epoch": 5.062805872756933, + "grad_norm": 0.023245351389050484, + "learning_rate": 0.0009302535712582532, + "loss": 0.1129, + "num_input_tokens_seen": 66987232, + "step": 31035 + }, + { + "epoch": 5.063621533442088, + "grad_norm": 0.17474783957004547, + "learning_rate": 0.0009302173051653792, + "loss": 0.1371, + "num_input_tokens_seen": 66995968, + "step": 31040 + }, + { + "epoch": 5.064437194127243, + "grad_norm": 0.1812702864408493, + "learning_rate": 0.0009301810303536056, + "loss": 0.0818, + "num_input_tokens_seen": 67006912, + "step": 31045 + }, + { + "epoch": 5.065252854812398, + "grad_norm": 0.01352920476347208, + "learning_rate": 0.0009301447468236678, + "loss": 0.082, + "num_input_tokens_seen": 67017824, + "step": 31050 + }, + { + "epoch": 5.066068515497553, + "grad_norm": 0.09946420043706894, + "learning_rate": 0.000930108454576301, + "loss": 0.0382, + "num_input_tokens_seen": 67028736, + "step": 31055 + }, + { + "epoch": 5.066884176182708, + "grad_norm": 0.04256308078765869, + "learning_rate": 0.0009300721536122408, + "loss": 0.0363, + "num_input_tokens_seen": 67039936, + "step": 31060 + }, + { + "epoch": 5.067699836867863, + "grad_norm": 0.017450427636504173, + "learning_rate": 0.0009300358439322228, + "loss": 0.0702, + "num_input_tokens_seen": 67051616, + "step": 31065 + }, + { + "epoch": 5.068515497553018, + "grad_norm": 0.008960510604083538, + "learning_rate": 0.0009299995255369828, + "loss": 0.0467, + "num_input_tokens_seen": 67062240, + "step": 31070 + }, + { + "epoch": 5.069331158238173, + "grad_norm": 0.03234187141060829, + "learning_rate": 0.000929963198427257, + "loss": 0.0625, + "num_input_tokens_seen": 67073408, + "step": 31075 + }, + { + "epoch": 5.070146818923328, + "grad_norm": 0.08352899551391602, + "learning_rate": 0.0009299268626037815, + "loss": 0.0419, + "num_input_tokens_seen": 67084896, + "step": 31080 + }, + { + "epoch": 5.0709624796084825, + "grad_norm": 0.005004632752388716, + "learning_rate": 0.0009298905180672928, + "loss": 0.0243, + "num_input_tokens_seen": 67095040, + "step": 31085 + }, + { + "epoch": 5.071778140293638, + "grad_norm": 0.005839685909450054, + "learning_rate": 0.0009298541648185272, + "loss": 0.0244, + "num_input_tokens_seen": 67106336, + "step": 31090 + }, + { + "epoch": 5.072593800978793, + "grad_norm": 0.0693696066737175, + "learning_rate": 0.0009298178028582218, + "loss": 0.1508, + "num_input_tokens_seen": 67116832, + "step": 31095 + }, + { + "epoch": 5.073409461663948, + "grad_norm": 0.1871979832649231, + "learning_rate": 0.0009297814321871133, + "loss": 0.098, + "num_input_tokens_seen": 67128736, + "step": 31100 + }, + { + "epoch": 5.074225122349103, + "grad_norm": 0.1801316738128662, + "learning_rate": 0.0009297450528059389, + "loss": 0.1099, + "num_input_tokens_seen": 67140128, + "step": 31105 + }, + { + "epoch": 5.075040783034257, + "grad_norm": 0.02624104544520378, + "learning_rate": 0.0009297086647154358, + "loss": 0.1275, + "num_input_tokens_seen": 67151232, + "step": 31110 + }, + { + "epoch": 5.075856443719413, + "grad_norm": 0.008905615657567978, + "learning_rate": 0.0009296722679163417, + "loss": 0.072, + "num_input_tokens_seen": 67161984, + "step": 31115 + }, + { + "epoch": 5.076672104404568, + "grad_norm": 0.2574773132801056, + "learning_rate": 0.0009296358624093937, + "loss": 0.1473, + "num_input_tokens_seen": 67172224, + "step": 31120 + }, + { + "epoch": 5.077487765089723, + "grad_norm": 0.006214026361703873, + "learning_rate": 0.00092959944819533, + "loss": 0.0619, + "num_input_tokens_seen": 67182560, + "step": 31125 + }, + { + "epoch": 5.078303425774878, + "grad_norm": 0.12135818600654602, + "learning_rate": 0.0009295630252748885, + "loss": 0.0282, + "num_input_tokens_seen": 67193024, + "step": 31130 + }, + { + "epoch": 5.079119086460032, + "grad_norm": 0.03141031041741371, + "learning_rate": 0.0009295265936488076, + "loss": 0.1335, + "num_input_tokens_seen": 67204096, + "step": 31135 + }, + { + "epoch": 5.079934747145187, + "grad_norm": 0.010345901362597942, + "learning_rate": 0.0009294901533178251, + "loss": 0.0352, + "num_input_tokens_seen": 67214432, + "step": 31140 + }, + { + "epoch": 5.080750407830343, + "grad_norm": 0.14237365126609802, + "learning_rate": 0.0009294537042826798, + "loss": 0.0519, + "num_input_tokens_seen": 67225792, + "step": 31145 + }, + { + "epoch": 5.081566068515498, + "grad_norm": 0.15012675523757935, + "learning_rate": 0.0009294172465441104, + "loss": 0.1343, + "num_input_tokens_seen": 67236128, + "step": 31150 + }, + { + "epoch": 5.082381729200653, + "grad_norm": 0.044446300715208054, + "learning_rate": 0.0009293807801028558, + "loss": 0.0583, + "num_input_tokens_seen": 67247072, + "step": 31155 + }, + { + "epoch": 5.083197389885807, + "grad_norm": 0.16959770023822784, + "learning_rate": 0.0009293443049596551, + "loss": 0.146, + "num_input_tokens_seen": 67257344, + "step": 31160 + }, + { + "epoch": 5.084013050570962, + "grad_norm": 0.053820785135030746, + "learning_rate": 0.0009293078211152473, + "loss": 0.0495, + "num_input_tokens_seen": 67266848, + "step": 31165 + }, + { + "epoch": 5.084828711256117, + "grad_norm": 0.03816113620996475, + "learning_rate": 0.0009292713285703718, + "loss": 0.1102, + "num_input_tokens_seen": 67277056, + "step": 31170 + }, + { + "epoch": 5.085644371941273, + "grad_norm": 0.06841073930263519, + "learning_rate": 0.0009292348273257684, + "loss": 0.0745, + "num_input_tokens_seen": 67288672, + "step": 31175 + }, + { + "epoch": 5.0864600326264275, + "grad_norm": 0.0062705050222575665, + "learning_rate": 0.0009291983173821765, + "loss": 0.0661, + "num_input_tokens_seen": 67299424, + "step": 31180 + }, + { + "epoch": 5.087275693311582, + "grad_norm": 0.012423977255821228, + "learning_rate": 0.0009291617987403364, + "loss": 0.0851, + "num_input_tokens_seen": 67309056, + "step": 31185 + }, + { + "epoch": 5.088091353996737, + "grad_norm": 0.10146720707416534, + "learning_rate": 0.000929125271400988, + "loss": 0.0365, + "num_input_tokens_seen": 67320320, + "step": 31190 + }, + { + "epoch": 5.088907014681892, + "grad_norm": 0.031027428805828094, + "learning_rate": 0.0009290887353648716, + "loss": 0.0904, + "num_input_tokens_seen": 67330944, + "step": 31195 + }, + { + "epoch": 5.089722675367048, + "grad_norm": 0.012919691391289234, + "learning_rate": 0.0009290521906327276, + "loss": 0.0338, + "num_input_tokens_seen": 67342720, + "step": 31200 + }, + { + "epoch": 5.0905383360522025, + "grad_norm": 0.2202703207731247, + "learning_rate": 0.0009290156372052967, + "loss": 0.0697, + "num_input_tokens_seen": 67353856, + "step": 31205 + }, + { + "epoch": 5.091353996737357, + "grad_norm": 0.005477454047650099, + "learning_rate": 0.0009289790750833196, + "loss": 0.1977, + "num_input_tokens_seen": 67364384, + "step": 31210 + }, + { + "epoch": 5.092169657422512, + "grad_norm": 0.007130507845431566, + "learning_rate": 0.0009289425042675373, + "loss": 0.2089, + "num_input_tokens_seen": 67375328, + "step": 31215 + }, + { + "epoch": 5.092985318107667, + "grad_norm": 0.02244599163532257, + "learning_rate": 0.0009289059247586911, + "loss": 0.0448, + "num_input_tokens_seen": 67386240, + "step": 31220 + }, + { + "epoch": 5.093800978792822, + "grad_norm": 0.014082197099924088, + "learning_rate": 0.0009288693365575222, + "loss": 0.1394, + "num_input_tokens_seen": 67397760, + "step": 31225 + }, + { + "epoch": 5.0946166394779775, + "grad_norm": 0.013584544882178307, + "learning_rate": 0.0009288327396647722, + "loss": 0.0337, + "num_input_tokens_seen": 67408768, + "step": 31230 + }, + { + "epoch": 5.095432300163132, + "grad_norm": 0.13451693952083588, + "learning_rate": 0.0009287961340811826, + "loss": 0.0896, + "num_input_tokens_seen": 67419456, + "step": 31235 + }, + { + "epoch": 5.096247960848287, + "grad_norm": 0.15285320580005646, + "learning_rate": 0.0009287595198074955, + "loss": 0.1219, + "num_input_tokens_seen": 67430528, + "step": 31240 + }, + { + "epoch": 5.097063621533442, + "grad_norm": 0.16829638183116913, + "learning_rate": 0.0009287228968444527, + "loss": 0.0414, + "num_input_tokens_seen": 67441472, + "step": 31245 + }, + { + "epoch": 5.097879282218597, + "grad_norm": 0.11649461835622787, + "learning_rate": 0.0009286862651927966, + "loss": 0.1468, + "num_input_tokens_seen": 67451808, + "step": 31250 + }, + { + "epoch": 5.0986949429037525, + "grad_norm": 0.016306841745972633, + "learning_rate": 0.0009286496248532695, + "loss": 0.0564, + "num_input_tokens_seen": 67463104, + "step": 31255 + }, + { + "epoch": 5.099510603588907, + "grad_norm": 0.015999810770154, + "learning_rate": 0.000928612975826614, + "loss": 0.1111, + "num_input_tokens_seen": 67474144, + "step": 31260 + }, + { + "epoch": 5.100326264274062, + "grad_norm": 0.07067937403917313, + "learning_rate": 0.0009285763181135727, + "loss": 0.0353, + "num_input_tokens_seen": 67484096, + "step": 31265 + }, + { + "epoch": 5.101141924959217, + "grad_norm": 0.0029345466755330563, + "learning_rate": 0.0009285396517148888, + "loss": 0.1034, + "num_input_tokens_seen": 67493664, + "step": 31270 + }, + { + "epoch": 5.101957585644372, + "grad_norm": 0.18859192728996277, + "learning_rate": 0.000928502976631305, + "loss": 0.3743, + "num_input_tokens_seen": 67504768, + "step": 31275 + }, + { + "epoch": 5.102773246329527, + "grad_norm": 0.08005071431398392, + "learning_rate": 0.0009284662928635649, + "loss": 0.04, + "num_input_tokens_seen": 67514848, + "step": 31280 + }, + { + "epoch": 5.103588907014682, + "grad_norm": 0.012497244402766228, + "learning_rate": 0.0009284296004124118, + "loss": 0.0436, + "num_input_tokens_seen": 67525248, + "step": 31285 + }, + { + "epoch": 5.104404567699837, + "grad_norm": 0.1865687221288681, + "learning_rate": 0.0009283928992785894, + "loss": 0.0891, + "num_input_tokens_seen": 67534656, + "step": 31290 + }, + { + "epoch": 5.105220228384992, + "grad_norm": 0.23223555088043213, + "learning_rate": 0.0009283561894628414, + "loss": 0.1298, + "num_input_tokens_seen": 67545024, + "step": 31295 + }, + { + "epoch": 5.106035889070147, + "grad_norm": 0.2613407075405121, + "learning_rate": 0.0009283194709659117, + "loss": 0.2286, + "num_input_tokens_seen": 67556640, + "step": 31300 + }, + { + "epoch": 5.1068515497553015, + "grad_norm": 0.014296147041022778, + "learning_rate": 0.0009282827437885449, + "loss": 0.0378, + "num_input_tokens_seen": 67568032, + "step": 31305 + }, + { + "epoch": 5.107667210440456, + "grad_norm": 0.008059429936110973, + "learning_rate": 0.0009282460079314848, + "loss": 0.1099, + "num_input_tokens_seen": 67578688, + "step": 31310 + }, + { + "epoch": 5.108482871125612, + "grad_norm": 0.008646982721984386, + "learning_rate": 0.0009282092633954759, + "loss": 0.0239, + "num_input_tokens_seen": 67589664, + "step": 31315 + }, + { + "epoch": 5.109298531810767, + "grad_norm": 0.10392355918884277, + "learning_rate": 0.0009281725101812632, + "loss": 0.1031, + "num_input_tokens_seen": 67600736, + "step": 31320 + }, + { + "epoch": 5.110114192495922, + "grad_norm": 0.07619982957839966, + "learning_rate": 0.0009281357482895914, + "loss": 0.1451, + "num_input_tokens_seen": 67612288, + "step": 31325 + }, + { + "epoch": 5.1109298531810765, + "grad_norm": 0.14207406342029572, + "learning_rate": 0.0009280989777212055, + "loss": 0.219, + "num_input_tokens_seen": 67622592, + "step": 31330 + }, + { + "epoch": 5.111745513866231, + "grad_norm": 0.049955934286117554, + "learning_rate": 0.0009280621984768507, + "loss": 0.1072, + "num_input_tokens_seen": 67633568, + "step": 31335 + }, + { + "epoch": 5.112561174551387, + "grad_norm": 0.03495575115084648, + "learning_rate": 0.0009280254105572725, + "loss": 0.1387, + "num_input_tokens_seen": 67644000, + "step": 31340 + }, + { + "epoch": 5.113376835236542, + "grad_norm": 0.024168916046619415, + "learning_rate": 0.0009279886139632163, + "loss": 0.1437, + "num_input_tokens_seen": 67654816, + "step": 31345 + }, + { + "epoch": 5.114192495921697, + "grad_norm": 0.11026232689619064, + "learning_rate": 0.000927951808695428, + "loss": 0.1892, + "num_input_tokens_seen": 67666304, + "step": 31350 + }, + { + "epoch": 5.1150081566068515, + "grad_norm": 0.21152208745479584, + "learning_rate": 0.0009279149947546534, + "loss": 0.132, + "num_input_tokens_seen": 67677536, + "step": 31355 + }, + { + "epoch": 5.115823817292006, + "grad_norm": 0.034677598625421524, + "learning_rate": 0.0009278781721416385, + "loss": 0.1516, + "num_input_tokens_seen": 67688608, + "step": 31360 + }, + { + "epoch": 5.116639477977161, + "grad_norm": 0.19580842554569244, + "learning_rate": 0.0009278413408571295, + "loss": 0.1215, + "num_input_tokens_seen": 67699200, + "step": 31365 + }, + { + "epoch": 5.117455138662317, + "grad_norm": 0.02298540621995926, + "learning_rate": 0.0009278045009018733, + "loss": 0.093, + "num_input_tokens_seen": 67709568, + "step": 31370 + }, + { + "epoch": 5.118270799347472, + "grad_norm": 0.037134189158678055, + "learning_rate": 0.000927767652276616, + "loss": 0.0717, + "num_input_tokens_seen": 67719360, + "step": 31375 + }, + { + "epoch": 5.1190864600326265, + "grad_norm": 0.08715366572141647, + "learning_rate": 0.0009277307949821045, + "loss": 0.1062, + "num_input_tokens_seen": 67729472, + "step": 31380 + }, + { + "epoch": 5.119902120717781, + "grad_norm": 0.1264701634645462, + "learning_rate": 0.000927693929019086, + "loss": 0.1344, + "num_input_tokens_seen": 67739424, + "step": 31385 + }, + { + "epoch": 5.120717781402936, + "grad_norm": 0.11438030749559402, + "learning_rate": 0.0009276570543883074, + "loss": 0.0908, + "num_input_tokens_seen": 67750624, + "step": 31390 + }, + { + "epoch": 5.121533442088092, + "grad_norm": 0.019763559103012085, + "learning_rate": 0.000927620171090516, + "loss": 0.0514, + "num_input_tokens_seen": 67760960, + "step": 31395 + }, + { + "epoch": 5.122349102773247, + "grad_norm": 0.09581348299980164, + "learning_rate": 0.0009275832791264593, + "loss": 0.0762, + "num_input_tokens_seen": 67771232, + "step": 31400 + }, + { + "epoch": 5.123164763458401, + "grad_norm": 0.04690174385905266, + "learning_rate": 0.0009275463784968852, + "loss": 0.0331, + "num_input_tokens_seen": 67781408, + "step": 31405 + }, + { + "epoch": 5.123980424143556, + "grad_norm": 0.00548698753118515, + "learning_rate": 0.0009275094692025413, + "loss": 0.1511, + "num_input_tokens_seen": 67793472, + "step": 31410 + }, + { + "epoch": 5.124796084828711, + "grad_norm": 0.0398702397942543, + "learning_rate": 0.0009274725512441757, + "loss": 0.0211, + "num_input_tokens_seen": 67804704, + "step": 31415 + }, + { + "epoch": 5.125611745513866, + "grad_norm": 0.011445922777056694, + "learning_rate": 0.0009274356246225364, + "loss": 0.0995, + "num_input_tokens_seen": 67815840, + "step": 31420 + }, + { + "epoch": 5.126427406199022, + "grad_norm": 0.012335185892879963, + "learning_rate": 0.0009273986893383722, + "loss": 0.0119, + "num_input_tokens_seen": 67827008, + "step": 31425 + }, + { + "epoch": 5.127243066884176, + "grad_norm": 0.09346325695514679, + "learning_rate": 0.000927361745392431, + "loss": 0.1865, + "num_input_tokens_seen": 67837024, + "step": 31430 + }, + { + "epoch": 5.128058727569331, + "grad_norm": 0.20711888372898102, + "learning_rate": 0.0009273247927854622, + "loss": 0.0707, + "num_input_tokens_seen": 67847360, + "step": 31435 + }, + { + "epoch": 5.128874388254486, + "grad_norm": 0.005657021421939135, + "learning_rate": 0.0009272878315182141, + "loss": 0.1268, + "num_input_tokens_seen": 67857152, + "step": 31440 + }, + { + "epoch": 5.129690048939641, + "grad_norm": 0.19727858901023865, + "learning_rate": 0.0009272508615914363, + "loss": 0.1793, + "num_input_tokens_seen": 67867680, + "step": 31445 + }, + { + "epoch": 5.130505709624796, + "grad_norm": 0.019575193524360657, + "learning_rate": 0.0009272138830058776, + "loss": 0.0242, + "num_input_tokens_seen": 67877632, + "step": 31450 + }, + { + "epoch": 5.131321370309951, + "grad_norm": 0.011218850500881672, + "learning_rate": 0.0009271768957622877, + "loss": 0.1845, + "num_input_tokens_seen": 67886912, + "step": 31455 + }, + { + "epoch": 5.132137030995106, + "grad_norm": 0.00936492532491684, + "learning_rate": 0.0009271398998614162, + "loss": 0.034, + "num_input_tokens_seen": 67895040, + "step": 31460 + }, + { + "epoch": 5.132952691680261, + "grad_norm": 0.04290612414479256, + "learning_rate": 0.0009271028953040126, + "loss": 0.0522, + "num_input_tokens_seen": 67906336, + "step": 31465 + }, + { + "epoch": 5.133768352365416, + "grad_norm": 0.007297531235963106, + "learning_rate": 0.0009270658820908271, + "loss": 0.1078, + "num_input_tokens_seen": 67916992, + "step": 31470 + }, + { + "epoch": 5.134584013050571, + "grad_norm": 0.008339127525687218, + "learning_rate": 0.0009270288602226096, + "loss": 0.1354, + "num_input_tokens_seen": 67928352, + "step": 31475 + }, + { + "epoch": 5.135399673735726, + "grad_norm": 0.06557127088308334, + "learning_rate": 0.0009269918297001106, + "loss": 0.0998, + "num_input_tokens_seen": 67940160, + "step": 31480 + }, + { + "epoch": 5.136215334420881, + "grad_norm": 0.7484502792358398, + "learning_rate": 0.0009269547905240805, + "loss": 0.0373, + "num_input_tokens_seen": 67950304, + "step": 31485 + }, + { + "epoch": 5.137030995106036, + "grad_norm": 0.026651281863451004, + "learning_rate": 0.00092691774269527, + "loss": 0.0898, + "num_input_tokens_seen": 67960224, + "step": 31490 + }, + { + "epoch": 5.137846655791191, + "grad_norm": 0.13884080946445465, + "learning_rate": 0.0009268806862144298, + "loss": 0.2004, + "num_input_tokens_seen": 67971616, + "step": 31495 + }, + { + "epoch": 5.138662316476346, + "grad_norm": 0.03840750455856323, + "learning_rate": 0.0009268436210823109, + "loss": 0.2173, + "num_input_tokens_seen": 67982048, + "step": 31500 + }, + { + "epoch": 5.1394779771615005, + "grad_norm": 0.09224279969930649, + "learning_rate": 0.0009268065472996645, + "loss": 0.1048, + "num_input_tokens_seen": 67991584, + "step": 31505 + }, + { + "epoch": 5.140293637846656, + "grad_norm": 0.20728184282779694, + "learning_rate": 0.0009267694648672423, + "loss": 0.1023, + "num_input_tokens_seen": 68002208, + "step": 31510 + }, + { + "epoch": 5.141109298531811, + "grad_norm": 0.031618643552064896, + "learning_rate": 0.0009267323737857952, + "loss": 0.0246, + "num_input_tokens_seen": 68013344, + "step": 31515 + }, + { + "epoch": 5.141924959216966, + "grad_norm": 0.3493155837059021, + "learning_rate": 0.0009266952740560752, + "loss": 0.2666, + "num_input_tokens_seen": 68025056, + "step": 31520 + }, + { + "epoch": 5.142740619902121, + "grad_norm": 0.024285798892378807, + "learning_rate": 0.0009266581656788342, + "loss": 0.0351, + "num_input_tokens_seen": 68035296, + "step": 31525 + }, + { + "epoch": 5.143556280587275, + "grad_norm": 0.1686321496963501, + "learning_rate": 0.0009266210486548243, + "loss": 0.1307, + "num_input_tokens_seen": 68046080, + "step": 31530 + }, + { + "epoch": 5.14437194127243, + "grad_norm": 0.00723549397662282, + "learning_rate": 0.0009265839229847975, + "loss": 0.1687, + "num_input_tokens_seen": 68057344, + "step": 31535 + }, + { + "epoch": 5.145187601957586, + "grad_norm": 0.015990935266017914, + "learning_rate": 0.0009265467886695064, + "loss": 0.08, + "num_input_tokens_seen": 68067488, + "step": 31540 + }, + { + "epoch": 5.146003262642741, + "grad_norm": 0.23217158019542694, + "learning_rate": 0.0009265096457097035, + "loss": 0.1836, + "num_input_tokens_seen": 68079584, + "step": 31545 + }, + { + "epoch": 5.146818923327896, + "grad_norm": 0.02226824127137661, + "learning_rate": 0.0009264724941061418, + "loss": 0.2224, + "num_input_tokens_seen": 68090816, + "step": 31550 + }, + { + "epoch": 5.14763458401305, + "grad_norm": 0.023588890209794044, + "learning_rate": 0.0009264353338595736, + "loss": 0.0784, + "num_input_tokens_seen": 68102592, + "step": 31555 + }, + { + "epoch": 5.148450244698205, + "grad_norm": 0.10879484564065933, + "learning_rate": 0.0009263981649707527, + "loss": 0.0297, + "num_input_tokens_seen": 68113664, + "step": 31560 + }, + { + "epoch": 5.149265905383361, + "grad_norm": 0.022089151665568352, + "learning_rate": 0.0009263609874404319, + "loss": 0.0509, + "num_input_tokens_seen": 68124480, + "step": 31565 + }, + { + "epoch": 5.150081566068516, + "grad_norm": 0.017972229048609734, + "learning_rate": 0.0009263238012693649, + "loss": 0.1276, + "num_input_tokens_seen": 68135424, + "step": 31570 + }, + { + "epoch": 5.150897226753671, + "grad_norm": 0.02197830006480217, + "learning_rate": 0.0009262866064583051, + "loss": 0.0832, + "num_input_tokens_seen": 68146784, + "step": 31575 + }, + { + "epoch": 5.151712887438825, + "grad_norm": 0.0380043089389801, + "learning_rate": 0.0009262494030080066, + "loss": 0.1433, + "num_input_tokens_seen": 68155744, + "step": 31580 + }, + { + "epoch": 5.15252854812398, + "grad_norm": 0.022763336077332497, + "learning_rate": 0.0009262121909192232, + "loss": 0.0221, + "num_input_tokens_seen": 68167168, + "step": 31585 + }, + { + "epoch": 5.153344208809135, + "grad_norm": 0.18722109496593475, + "learning_rate": 0.0009261749701927089, + "loss": 0.1165, + "num_input_tokens_seen": 68178496, + "step": 31590 + }, + { + "epoch": 5.154159869494291, + "grad_norm": 0.0521707758307457, + "learning_rate": 0.0009261377408292183, + "loss": 0.181, + "num_input_tokens_seen": 68189568, + "step": 31595 + }, + { + "epoch": 5.1549755301794455, + "grad_norm": 0.12279439717531204, + "learning_rate": 0.0009261005028295058, + "loss": 0.1103, + "num_input_tokens_seen": 68198624, + "step": 31600 + }, + { + "epoch": 5.1557911908646, + "grad_norm": 0.12025293707847595, + "learning_rate": 0.000926063256194326, + "loss": 0.0691, + "num_input_tokens_seen": 68210528, + "step": 31605 + }, + { + "epoch": 5.156606851549755, + "grad_norm": 0.008572385646402836, + "learning_rate": 0.0009260260009244339, + "loss": 0.0465, + "num_input_tokens_seen": 68220384, + "step": 31610 + }, + { + "epoch": 5.15742251223491, + "grad_norm": 0.06144726648926735, + "learning_rate": 0.0009259887370205844, + "loss": 0.1801, + "num_input_tokens_seen": 68230464, + "step": 31615 + }, + { + "epoch": 5.158238172920065, + "grad_norm": 0.019451981410384178, + "learning_rate": 0.0009259514644835327, + "loss": 0.1053, + "num_input_tokens_seen": 68240384, + "step": 31620 + }, + { + "epoch": 5.1590538336052205, + "grad_norm": 0.018748527392745018, + "learning_rate": 0.0009259141833140343, + "loss": 0.0463, + "num_input_tokens_seen": 68251840, + "step": 31625 + }, + { + "epoch": 5.159869494290375, + "grad_norm": 0.06314598768949509, + "learning_rate": 0.0009258768935128445, + "loss": 0.1451, + "num_input_tokens_seen": 68263008, + "step": 31630 + }, + { + "epoch": 5.16068515497553, + "grad_norm": 0.019446654245257378, + "learning_rate": 0.0009258395950807194, + "loss": 0.1306, + "num_input_tokens_seen": 68273920, + "step": 31635 + }, + { + "epoch": 5.161500815660685, + "grad_norm": 0.14204244315624237, + "learning_rate": 0.0009258022880184145, + "loss": 0.1264, + "num_input_tokens_seen": 68284256, + "step": 31640 + }, + { + "epoch": 5.16231647634584, + "grad_norm": 0.013157133013010025, + "learning_rate": 0.0009257649723266863, + "loss": 0.0909, + "num_input_tokens_seen": 68296000, + "step": 31645 + }, + { + "epoch": 5.1631321370309955, + "grad_norm": 0.029864266514778137, + "learning_rate": 0.0009257276480062907, + "loss": 0.0457, + "num_input_tokens_seen": 68308224, + "step": 31650 + }, + { + "epoch": 5.16394779771615, + "grad_norm": 0.06908320635557175, + "learning_rate": 0.0009256903150579842, + "loss": 0.0613, + "num_input_tokens_seen": 68318560, + "step": 31655 + }, + { + "epoch": 5.164763458401305, + "grad_norm": 0.006732443813234568, + "learning_rate": 0.0009256529734825234, + "loss": 0.0708, + "num_input_tokens_seen": 68328256, + "step": 31660 + }, + { + "epoch": 5.16557911908646, + "grad_norm": 0.006487551145255566, + "learning_rate": 0.0009256156232806652, + "loss": 0.0842, + "num_input_tokens_seen": 68339648, + "step": 31665 + }, + { + "epoch": 5.166394779771615, + "grad_norm": 0.2353639155626297, + "learning_rate": 0.0009255782644531664, + "loss": 0.0908, + "num_input_tokens_seen": 68351072, + "step": 31670 + }, + { + "epoch": 5.16721044045677, + "grad_norm": 0.17837999761104584, + "learning_rate": 0.0009255408970007842, + "loss": 0.0845, + "num_input_tokens_seen": 68360832, + "step": 31675 + }, + { + "epoch": 5.168026101141925, + "grad_norm": 0.20934976637363434, + "learning_rate": 0.0009255035209242759, + "loss": 0.1196, + "num_input_tokens_seen": 68370016, + "step": 31680 + }, + { + "epoch": 5.16884176182708, + "grad_norm": 0.016385303810238838, + "learning_rate": 0.0009254661362243991, + "loss": 0.0476, + "num_input_tokens_seen": 68380768, + "step": 31685 + }, + { + "epoch": 5.169657422512235, + "grad_norm": 0.09056505560874939, + "learning_rate": 0.000925428742901911, + "loss": 0.029, + "num_input_tokens_seen": 68392448, + "step": 31690 + }, + { + "epoch": 5.17047308319739, + "grad_norm": 0.08092565834522247, + "learning_rate": 0.0009253913409575698, + "loss": 0.151, + "num_input_tokens_seen": 68402016, + "step": 31695 + }, + { + "epoch": 5.171288743882545, + "grad_norm": 0.19901001453399658, + "learning_rate": 0.0009253539303921336, + "loss": 0.142, + "num_input_tokens_seen": 68412704, + "step": 31700 + }, + { + "epoch": 5.1721044045677, + "grad_norm": 0.013578377664089203, + "learning_rate": 0.0009253165112063604, + "loss": 0.1365, + "num_input_tokens_seen": 68424448, + "step": 31705 + }, + { + "epoch": 5.172920065252855, + "grad_norm": 0.0558539479970932, + "learning_rate": 0.0009252790834010085, + "loss": 0.1627, + "num_input_tokens_seen": 68435168, + "step": 31710 + }, + { + "epoch": 5.17373572593801, + "grad_norm": 0.02435706928372383, + "learning_rate": 0.0009252416469768363, + "loss": 0.1344, + "num_input_tokens_seen": 68445696, + "step": 31715 + }, + { + "epoch": 5.174551386623165, + "grad_norm": 0.040784429758787155, + "learning_rate": 0.0009252042019346029, + "loss": 0.0651, + "num_input_tokens_seen": 68455936, + "step": 31720 + }, + { + "epoch": 5.1753670473083195, + "grad_norm": 0.08425629138946533, + "learning_rate": 0.0009251667482750669, + "loss": 0.047, + "num_input_tokens_seen": 68465344, + "step": 31725 + }, + { + "epoch": 5.176182707993474, + "grad_norm": 0.022526482120156288, + "learning_rate": 0.0009251292859989873, + "loss": 0.0327, + "num_input_tokens_seen": 68475936, + "step": 31730 + }, + { + "epoch": 5.17699836867863, + "grad_norm": 0.1948135942220688, + "learning_rate": 0.0009250918151071235, + "loss": 0.0963, + "num_input_tokens_seen": 68486496, + "step": 31735 + }, + { + "epoch": 5.177814029363785, + "grad_norm": 0.018556829541921616, + "learning_rate": 0.0009250543356002347, + "loss": 0.0367, + "num_input_tokens_seen": 68497408, + "step": 31740 + }, + { + "epoch": 5.17862969004894, + "grad_norm": 0.05540525168180466, + "learning_rate": 0.0009250168474790806, + "loss": 0.0498, + "num_input_tokens_seen": 68506720, + "step": 31745 + }, + { + "epoch": 5.1794453507340945, + "grad_norm": 0.19565525650978088, + "learning_rate": 0.0009249793507444208, + "loss": 0.0826, + "num_input_tokens_seen": 68516800, + "step": 31750 + }, + { + "epoch": 5.180261011419249, + "grad_norm": 0.22706341743469238, + "learning_rate": 0.0009249418453970155, + "loss": 0.0642, + "num_input_tokens_seen": 68525120, + "step": 31755 + }, + { + "epoch": 5.181076672104404, + "grad_norm": 0.046376798301935196, + "learning_rate": 0.0009249043314376247, + "loss": 0.0551, + "num_input_tokens_seen": 68535872, + "step": 31760 + }, + { + "epoch": 5.18189233278956, + "grad_norm": 0.11067657917737961, + "learning_rate": 0.0009248668088670084, + "loss": 0.0529, + "num_input_tokens_seen": 68545760, + "step": 31765 + }, + { + "epoch": 5.182707993474715, + "grad_norm": 0.09916000813245773, + "learning_rate": 0.0009248292776859273, + "loss": 0.0561, + "num_input_tokens_seen": 68555552, + "step": 31770 + }, + { + "epoch": 5.1835236541598695, + "grad_norm": 0.8009437322616577, + "learning_rate": 0.0009247917378951419, + "loss": 0.047, + "num_input_tokens_seen": 68565760, + "step": 31775 + }, + { + "epoch": 5.184339314845024, + "grad_norm": 0.033244628459215164, + "learning_rate": 0.0009247541894954132, + "loss": 0.0147, + "num_input_tokens_seen": 68576160, + "step": 31780 + }, + { + "epoch": 5.185154975530179, + "grad_norm": 0.17144986987113953, + "learning_rate": 0.0009247166324875018, + "loss": 0.1846, + "num_input_tokens_seen": 68587360, + "step": 31785 + }, + { + "epoch": 5.185970636215335, + "grad_norm": 0.08225300908088684, + "learning_rate": 0.0009246790668721692, + "loss": 0.027, + "num_input_tokens_seen": 68598336, + "step": 31790 + }, + { + "epoch": 5.18678629690049, + "grad_norm": 0.00510371895506978, + "learning_rate": 0.0009246414926501766, + "loss": 0.2069, + "num_input_tokens_seen": 68610560, + "step": 31795 + }, + { + "epoch": 5.1876019575856445, + "grad_norm": 0.011787333525717258, + "learning_rate": 0.0009246039098222854, + "loss": 0.0778, + "num_input_tokens_seen": 68621472, + "step": 31800 + }, + { + "epoch": 5.188417618270799, + "grad_norm": 0.17525683343410492, + "learning_rate": 0.0009245663183892572, + "loss": 0.1344, + "num_input_tokens_seen": 68632064, + "step": 31805 + }, + { + "epoch": 5.189233278955954, + "grad_norm": 0.027543537318706512, + "learning_rate": 0.0009245287183518541, + "loss": 0.0565, + "num_input_tokens_seen": 68642784, + "step": 31810 + }, + { + "epoch": 5.190048939641109, + "grad_norm": 0.200139120221138, + "learning_rate": 0.0009244911097108379, + "loss": 0.0831, + "num_input_tokens_seen": 68652448, + "step": 31815 + }, + { + "epoch": 5.190864600326265, + "grad_norm": 0.01962878555059433, + "learning_rate": 0.000924453492466971, + "loss": 0.0838, + "num_input_tokens_seen": 68662656, + "step": 31820 + }, + { + "epoch": 5.191680261011419, + "grad_norm": 0.17261886596679688, + "learning_rate": 0.0009244158666210154, + "loss": 0.0652, + "num_input_tokens_seen": 68673248, + "step": 31825 + }, + { + "epoch": 5.192495921696574, + "grad_norm": 0.060023050755262375, + "learning_rate": 0.0009243782321737339, + "loss": 0.0567, + "num_input_tokens_seen": 68683776, + "step": 31830 + }, + { + "epoch": 5.193311582381729, + "grad_norm": 0.1422446072101593, + "learning_rate": 0.0009243405891258894, + "loss": 0.1405, + "num_input_tokens_seen": 68694112, + "step": 31835 + }, + { + "epoch": 5.194127243066884, + "grad_norm": 0.1474105715751648, + "learning_rate": 0.0009243029374782443, + "loss": 0.1455, + "num_input_tokens_seen": 68704512, + "step": 31840 + }, + { + "epoch": 5.19494290375204, + "grad_norm": 0.011041504330933094, + "learning_rate": 0.0009242652772315621, + "loss": 0.1078, + "num_input_tokens_seen": 68715776, + "step": 31845 + }, + { + "epoch": 5.195758564437194, + "grad_norm": 0.022026097401976585, + "learning_rate": 0.0009242276083866056, + "loss": 0.0614, + "num_input_tokens_seen": 68725888, + "step": 31850 + }, + { + "epoch": 5.196574225122349, + "grad_norm": 0.010256092995405197, + "learning_rate": 0.0009241899309441386, + "loss": 0.0858, + "num_input_tokens_seen": 68736864, + "step": 31855 + }, + { + "epoch": 5.197389885807504, + "grad_norm": 0.01783805526793003, + "learning_rate": 0.0009241522449049245, + "loss": 0.0832, + "num_input_tokens_seen": 68747264, + "step": 31860 + }, + { + "epoch": 5.198205546492659, + "grad_norm": 0.09555702656507492, + "learning_rate": 0.000924114550269727, + "loss": 0.1372, + "num_input_tokens_seen": 68757792, + "step": 31865 + }, + { + "epoch": 5.199021207177814, + "grad_norm": 0.03810174763202667, + "learning_rate": 0.0009240768470393101, + "loss": 0.2068, + "num_input_tokens_seen": 68769440, + "step": 31870 + }, + { + "epoch": 5.199836867862969, + "grad_norm": 0.142070472240448, + "learning_rate": 0.0009240391352144382, + "loss": 0.0356, + "num_input_tokens_seen": 68780224, + "step": 31875 + }, + { + "epoch": 5.200652528548124, + "grad_norm": 0.3128035366535187, + "learning_rate": 0.0009240014147958751, + "loss": 0.0665, + "num_input_tokens_seen": 68791008, + "step": 31880 + }, + { + "epoch": 5.201468189233279, + "grad_norm": 0.010396911762654781, + "learning_rate": 0.0009239636857843854, + "loss": 0.0689, + "num_input_tokens_seen": 68801088, + "step": 31885 + }, + { + "epoch": 5.202283849918434, + "grad_norm": 0.09615227580070496, + "learning_rate": 0.0009239259481807338, + "loss": 0.0638, + "num_input_tokens_seen": 68812128, + "step": 31890 + }, + { + "epoch": 5.203099510603589, + "grad_norm": 0.20088519155979156, + "learning_rate": 0.0009238882019856851, + "loss": 0.0568, + "num_input_tokens_seen": 68822688, + "step": 31895 + }, + { + "epoch": 5.2039151712887435, + "grad_norm": 0.17087095975875854, + "learning_rate": 0.0009238504472000042, + "loss": 0.1316, + "num_input_tokens_seen": 68832896, + "step": 31900 + }, + { + "epoch": 5.204730831973899, + "grad_norm": 0.025262577459216118, + "learning_rate": 0.0009238126838244562, + "loss": 0.0181, + "num_input_tokens_seen": 68844224, + "step": 31905 + }, + { + "epoch": 5.205546492659054, + "grad_norm": 0.01020906027406454, + "learning_rate": 0.0009237749118598067, + "loss": 0.1124, + "num_input_tokens_seen": 68856512, + "step": 31910 + }, + { + "epoch": 5.206362153344209, + "grad_norm": 0.04413951560854912, + "learning_rate": 0.000923737131306821, + "loss": 0.0507, + "num_input_tokens_seen": 68867552, + "step": 31915 + }, + { + "epoch": 5.207177814029364, + "grad_norm": 0.29515722393989563, + "learning_rate": 0.0009236993421662648, + "loss": 0.0814, + "num_input_tokens_seen": 68877536, + "step": 31920 + }, + { + "epoch": 5.2079934747145185, + "grad_norm": 0.23658820986747742, + "learning_rate": 0.0009236615444389038, + "loss": 0.2535, + "num_input_tokens_seen": 68887360, + "step": 31925 + }, + { + "epoch": 5.208809135399674, + "grad_norm": 0.08254468441009521, + "learning_rate": 0.0009236237381255041, + "loss": 0.1741, + "num_input_tokens_seen": 68899328, + "step": 31930 + }, + { + "epoch": 5.209624796084829, + "grad_norm": 0.011472400277853012, + "learning_rate": 0.0009235859232268322, + "loss": 0.0611, + "num_input_tokens_seen": 68909888, + "step": 31935 + }, + { + "epoch": 5.210440456769984, + "grad_norm": 0.10463349521160126, + "learning_rate": 0.000923548099743654, + "loss": 0.0767, + "num_input_tokens_seen": 68920416, + "step": 31940 + }, + { + "epoch": 5.211256117455139, + "grad_norm": 0.05765919387340546, + "learning_rate": 0.0009235102676767364, + "loss": 0.0779, + "num_input_tokens_seen": 68931936, + "step": 31945 + }, + { + "epoch": 5.212071778140293, + "grad_norm": 0.15697579085826874, + "learning_rate": 0.0009234724270268459, + "loss": 0.1251, + "num_input_tokens_seen": 68942912, + "step": 31950 + }, + { + "epoch": 5.212887438825448, + "grad_norm": 0.01182611659169197, + "learning_rate": 0.0009234345777947493, + "loss": 0.2212, + "num_input_tokens_seen": 68954400, + "step": 31955 + }, + { + "epoch": 5.213703099510604, + "grad_norm": 0.12106959521770477, + "learning_rate": 0.0009233967199812141, + "loss": 0.0629, + "num_input_tokens_seen": 68964032, + "step": 31960 + }, + { + "epoch": 5.214518760195759, + "grad_norm": 0.014623315073549747, + "learning_rate": 0.000923358853587007, + "loss": 0.0448, + "num_input_tokens_seen": 68974944, + "step": 31965 + }, + { + "epoch": 5.215334420880914, + "grad_norm": 0.042599089443683624, + "learning_rate": 0.0009233209786128957, + "loss": 0.0362, + "num_input_tokens_seen": 68985856, + "step": 31970 + }, + { + "epoch": 5.216150081566068, + "grad_norm": 0.018903104588389397, + "learning_rate": 0.0009232830950596479, + "loss": 0.0268, + "num_input_tokens_seen": 68996768, + "step": 31975 + }, + { + "epoch": 5.216965742251223, + "grad_norm": 0.021693456918001175, + "learning_rate": 0.0009232452029280312, + "loss": 0.0949, + "num_input_tokens_seen": 69008064, + "step": 31980 + }, + { + "epoch": 5.217781402936378, + "grad_norm": 0.06211010366678238, + "learning_rate": 0.0009232073022188135, + "loss": 0.1586, + "num_input_tokens_seen": 69018240, + "step": 31985 + }, + { + "epoch": 5.218597063621534, + "grad_norm": 0.1631218045949936, + "learning_rate": 0.0009231693929327628, + "loss": 0.1659, + "num_input_tokens_seen": 69028128, + "step": 31990 + }, + { + "epoch": 5.219412724306689, + "grad_norm": 0.058711402118206024, + "learning_rate": 0.0009231314750706476, + "loss": 0.0724, + "num_input_tokens_seen": 69038720, + "step": 31995 + }, + { + "epoch": 5.220228384991843, + "grad_norm": 0.09651461243629456, + "learning_rate": 0.0009230935486332363, + "loss": 0.0172, + "num_input_tokens_seen": 69050368, + "step": 32000 + }, + { + "epoch": 5.221044045676998, + "grad_norm": 0.03539738804101944, + "learning_rate": 0.0009230556136212975, + "loss": 0.0175, + "num_input_tokens_seen": 69060672, + "step": 32005 + }, + { + "epoch": 5.221859706362153, + "grad_norm": 0.09196539968252182, + "learning_rate": 0.0009230176700356001, + "loss": 0.1276, + "num_input_tokens_seen": 69072480, + "step": 32010 + }, + { + "epoch": 5.222675367047309, + "grad_norm": 0.011287184432148933, + "learning_rate": 0.0009229797178769128, + "loss": 0.031, + "num_input_tokens_seen": 69083232, + "step": 32015 + }, + { + "epoch": 5.2234910277324635, + "grad_norm": 0.06850557774305344, + "learning_rate": 0.000922941757146005, + "loss": 0.0541, + "num_input_tokens_seen": 69094912, + "step": 32020 + }, + { + "epoch": 5.224306688417618, + "grad_norm": 0.01299577858299017, + "learning_rate": 0.000922903787843646, + "loss": 0.0435, + "num_input_tokens_seen": 69106432, + "step": 32025 + }, + { + "epoch": 5.225122349102773, + "grad_norm": 0.039221443235874176, + "learning_rate": 0.0009228658099706053, + "loss": 0.0552, + "num_input_tokens_seen": 69117184, + "step": 32030 + }, + { + "epoch": 5.225938009787928, + "grad_norm": 0.16468507051467896, + "learning_rate": 0.0009228278235276524, + "loss": 0.0669, + "num_input_tokens_seen": 69127520, + "step": 32035 + }, + { + "epoch": 5.226753670473083, + "grad_norm": 0.001850749715231359, + "learning_rate": 0.0009227898285155574, + "loss": 0.0399, + "num_input_tokens_seen": 69139104, + "step": 32040 + }, + { + "epoch": 5.2275693311582385, + "grad_norm": 0.2436332255601883, + "learning_rate": 0.00092275182493509, + "loss": 0.0494, + "num_input_tokens_seen": 69149312, + "step": 32045 + }, + { + "epoch": 5.228384991843393, + "grad_norm": 0.010133507661521435, + "learning_rate": 0.0009227138127870208, + "loss": 0.1396, + "num_input_tokens_seen": 69160576, + "step": 32050 + }, + { + "epoch": 5.229200652528548, + "grad_norm": 0.04245012253522873, + "learning_rate": 0.0009226757920721196, + "loss": 0.0182, + "num_input_tokens_seen": 69171360, + "step": 32055 + }, + { + "epoch": 5.230016313213703, + "grad_norm": 0.20488706231117249, + "learning_rate": 0.0009226377627911575, + "loss": 0.2897, + "num_input_tokens_seen": 69183360, + "step": 32060 + }, + { + "epoch": 5.230831973898858, + "grad_norm": 0.006259676534682512, + "learning_rate": 0.000922599724944905, + "loss": 0.0985, + "num_input_tokens_seen": 69195392, + "step": 32065 + }, + { + "epoch": 5.231647634584013, + "grad_norm": 0.021054506301879883, + "learning_rate": 0.0009225616785341329, + "loss": 0.0142, + "num_input_tokens_seen": 69206816, + "step": 32070 + }, + { + "epoch": 5.232463295269168, + "grad_norm": 0.03910845145583153, + "learning_rate": 0.0009225236235596123, + "loss": 0.0272, + "num_input_tokens_seen": 69217248, + "step": 32075 + }, + { + "epoch": 5.233278955954323, + "grad_norm": 0.15186454355716705, + "learning_rate": 0.0009224855600221145, + "loss": 0.0565, + "num_input_tokens_seen": 69228480, + "step": 32080 + }, + { + "epoch": 5.234094616639478, + "grad_norm": 0.11952879279851913, + "learning_rate": 0.0009224474879224109, + "loss": 0.1891, + "num_input_tokens_seen": 69240832, + "step": 32085 + }, + { + "epoch": 5.234910277324633, + "grad_norm": 0.083279550075531, + "learning_rate": 0.000922409407261273, + "loss": 0.0244, + "num_input_tokens_seen": 69250976, + "step": 32090 + }, + { + "epoch": 5.235725938009788, + "grad_norm": 0.004981351085007191, + "learning_rate": 0.0009223713180394726, + "loss": 0.0564, + "num_input_tokens_seen": 69261184, + "step": 32095 + }, + { + "epoch": 5.236541598694943, + "grad_norm": 0.047434739768505096, + "learning_rate": 0.0009223332202577815, + "loss": 0.1436, + "num_input_tokens_seen": 69273024, + "step": 32100 + }, + { + "epoch": 5.237357259380098, + "grad_norm": 0.014747762121260166, + "learning_rate": 0.0009222951139169722, + "loss": 0.0229, + "num_input_tokens_seen": 69285280, + "step": 32105 + }, + { + "epoch": 5.238172920065253, + "grad_norm": 0.009383789263665676, + "learning_rate": 0.0009222569990178165, + "loss": 0.0565, + "num_input_tokens_seen": 69295680, + "step": 32110 + }, + { + "epoch": 5.238988580750408, + "grad_norm": 0.12762802839279175, + "learning_rate": 0.0009222188755610871, + "loss": 0.1429, + "num_input_tokens_seen": 69305888, + "step": 32115 + }, + { + "epoch": 5.239804241435563, + "grad_norm": 0.018233921378850937, + "learning_rate": 0.0009221807435475564, + "loss": 0.036, + "num_input_tokens_seen": 69317856, + "step": 32120 + }, + { + "epoch": 5.240619902120717, + "grad_norm": 0.1920018196105957, + "learning_rate": 0.0009221426029779975, + "loss": 0.1876, + "num_input_tokens_seen": 69329088, + "step": 32125 + }, + { + "epoch": 5.241435562805873, + "grad_norm": 0.26141083240509033, + "learning_rate": 0.0009221044538531833, + "loss": 0.1444, + "num_input_tokens_seen": 69340544, + "step": 32130 + }, + { + "epoch": 5.242251223491028, + "grad_norm": 0.18719813227653503, + "learning_rate": 0.0009220662961738868, + "loss": 0.3131, + "num_input_tokens_seen": 69351104, + "step": 32135 + }, + { + "epoch": 5.243066884176183, + "grad_norm": 0.003745259251445532, + "learning_rate": 0.0009220281299408815, + "loss": 0.0492, + "num_input_tokens_seen": 69362848, + "step": 32140 + }, + { + "epoch": 5.2438825448613375, + "grad_norm": 0.24420826137065887, + "learning_rate": 0.0009219899551549405, + "loss": 0.0978, + "num_input_tokens_seen": 69373504, + "step": 32145 + }, + { + "epoch": 5.244698205546492, + "grad_norm": 0.009925668127834797, + "learning_rate": 0.0009219517718168379, + "loss": 0.043, + "num_input_tokens_seen": 69384256, + "step": 32150 + }, + { + "epoch": 5.245513866231648, + "grad_norm": 0.1673927754163742, + "learning_rate": 0.0009219135799273474, + "loss": 0.107, + "num_input_tokens_seen": 69394880, + "step": 32155 + }, + { + "epoch": 5.246329526916803, + "grad_norm": 0.06746475398540497, + "learning_rate": 0.0009218753794872429, + "loss": 0.0622, + "num_input_tokens_seen": 69405120, + "step": 32160 + }, + { + "epoch": 5.247145187601958, + "grad_norm": 0.004773485939949751, + "learning_rate": 0.0009218371704972987, + "loss": 0.0378, + "num_input_tokens_seen": 69416640, + "step": 32165 + }, + { + "epoch": 5.2479608482871125, + "grad_norm": 0.04266734421253204, + "learning_rate": 0.0009217989529582889, + "loss": 0.1414, + "num_input_tokens_seen": 69426912, + "step": 32170 + }, + { + "epoch": 5.248776508972267, + "grad_norm": 0.009177938103675842, + "learning_rate": 0.0009217607268709884, + "loss": 0.1679, + "num_input_tokens_seen": 69436512, + "step": 32175 + }, + { + "epoch": 5.249592169657422, + "grad_norm": 0.04884933680295944, + "learning_rate": 0.0009217224922361718, + "loss": 0.3425, + "num_input_tokens_seen": 69447424, + "step": 32180 + }, + { + "epoch": 5.250407830342578, + "grad_norm": 0.07854997366666794, + "learning_rate": 0.0009216842490546138, + "loss": 0.1564, + "num_input_tokens_seen": 69459136, + "step": 32185 + }, + { + "epoch": 5.251223491027733, + "grad_norm": 0.14532209932804108, + "learning_rate": 0.0009216459973270895, + "loss": 0.1643, + "num_input_tokens_seen": 69470048, + "step": 32190 + }, + { + "epoch": 5.2520391517128875, + "grad_norm": 0.15600469708442688, + "learning_rate": 0.0009216077370543743, + "loss": 0.1541, + "num_input_tokens_seen": 69480992, + "step": 32195 + }, + { + "epoch": 5.252854812398042, + "grad_norm": 0.11156395822763443, + "learning_rate": 0.0009215694682372433, + "loss": 0.0692, + "num_input_tokens_seen": 69491936, + "step": 32200 + }, + { + "epoch": 5.253670473083197, + "grad_norm": 0.17112265527248383, + "learning_rate": 0.0009215311908764724, + "loss": 0.2954, + "num_input_tokens_seen": 69500896, + "step": 32205 + }, + { + "epoch": 5.254486133768353, + "grad_norm": 0.05591527000069618, + "learning_rate": 0.000921492904972837, + "loss": 0.084, + "num_input_tokens_seen": 69509888, + "step": 32210 + }, + { + "epoch": 5.255301794453508, + "grad_norm": 0.044657088816165924, + "learning_rate": 0.0009214546105271133, + "loss": 0.0734, + "num_input_tokens_seen": 69519840, + "step": 32215 + }, + { + "epoch": 5.2561174551386625, + "grad_norm": 0.16159150004386902, + "learning_rate": 0.0009214163075400772, + "loss": 0.1098, + "num_input_tokens_seen": 69530400, + "step": 32220 + }, + { + "epoch": 5.256933115823817, + "grad_norm": 0.01479649543762207, + "learning_rate": 0.000921377996012505, + "loss": 0.0612, + "num_input_tokens_seen": 69540992, + "step": 32225 + }, + { + "epoch": 5.257748776508972, + "grad_norm": 0.05854517221450806, + "learning_rate": 0.0009213396759451732, + "loss": 0.1032, + "num_input_tokens_seen": 69552704, + "step": 32230 + }, + { + "epoch": 5.258564437194127, + "grad_norm": 0.17576725780963898, + "learning_rate": 0.0009213013473388584, + "loss": 0.0724, + "num_input_tokens_seen": 69563648, + "step": 32235 + }, + { + "epoch": 5.259380097879283, + "grad_norm": 0.22860734164714813, + "learning_rate": 0.0009212630101943373, + "loss": 0.0986, + "num_input_tokens_seen": 69574880, + "step": 32240 + }, + { + "epoch": 5.260195758564437, + "grad_norm": 0.07967595756053925, + "learning_rate": 0.000921224664512387, + "loss": 0.1636, + "num_input_tokens_seen": 69586496, + "step": 32245 + }, + { + "epoch": 5.261011419249592, + "grad_norm": 0.14609402418136597, + "learning_rate": 0.0009211863102937843, + "loss": 0.1566, + "num_input_tokens_seen": 69596704, + "step": 32250 + }, + { + "epoch": 5.261827079934747, + "grad_norm": 0.22860078513622284, + "learning_rate": 0.0009211479475393068, + "loss": 0.1196, + "num_input_tokens_seen": 69606912, + "step": 32255 + }, + { + "epoch": 5.262642740619902, + "grad_norm": 0.19442203640937805, + "learning_rate": 0.0009211095762497319, + "loss": 0.2183, + "num_input_tokens_seen": 69618880, + "step": 32260 + }, + { + "epoch": 5.263458401305057, + "grad_norm": 0.03838955983519554, + "learning_rate": 0.0009210711964258372, + "loss": 0.0296, + "num_input_tokens_seen": 69629440, + "step": 32265 + }, + { + "epoch": 5.264274061990212, + "grad_norm": 0.13338832557201385, + "learning_rate": 0.0009210328080684005, + "loss": 0.12, + "num_input_tokens_seen": 69640128, + "step": 32270 + }, + { + "epoch": 5.265089722675367, + "grad_norm": 0.1590396612882614, + "learning_rate": 0.0009209944111782, + "loss": 0.1322, + "num_input_tokens_seen": 69650816, + "step": 32275 + }, + { + "epoch": 5.265905383360522, + "grad_norm": 0.003430388867855072, + "learning_rate": 0.0009209560057560134, + "loss": 0.0885, + "num_input_tokens_seen": 69661376, + "step": 32280 + }, + { + "epoch": 5.266721044045677, + "grad_norm": 0.11426710337400436, + "learning_rate": 0.0009209175918026195, + "loss": 0.0379, + "num_input_tokens_seen": 69671200, + "step": 32285 + }, + { + "epoch": 5.267536704730832, + "grad_norm": 0.08991096168756485, + "learning_rate": 0.0009208791693187967, + "loss": 0.1283, + "num_input_tokens_seen": 69682048, + "step": 32290 + }, + { + "epoch": 5.268352365415987, + "grad_norm": 0.04066402465105057, + "learning_rate": 0.0009208407383053235, + "loss": 0.0706, + "num_input_tokens_seen": 69694208, + "step": 32295 + }, + { + "epoch": 5.269168026101142, + "grad_norm": 0.04276157543063164, + "learning_rate": 0.000920802298762979, + "loss": 0.0657, + "num_input_tokens_seen": 69702848, + "step": 32300 + }, + { + "epoch": 5.269983686786297, + "grad_norm": 0.038376402109861374, + "learning_rate": 0.0009207638506925419, + "loss": 0.0958, + "num_input_tokens_seen": 69714080, + "step": 32305 + }, + { + "epoch": 5.270799347471452, + "grad_norm": 0.009452950209379196, + "learning_rate": 0.0009207253940947916, + "loss": 0.0295, + "num_input_tokens_seen": 69725408, + "step": 32310 + }, + { + "epoch": 5.271615008156607, + "grad_norm": 0.04884372651576996, + "learning_rate": 0.0009206869289705075, + "loss": 0.0271, + "num_input_tokens_seen": 69736704, + "step": 32315 + }, + { + "epoch": 5.2724306688417615, + "grad_norm": 0.08049613982439041, + "learning_rate": 0.0009206484553204693, + "loss": 0.0497, + "num_input_tokens_seen": 69747264, + "step": 32320 + }, + { + "epoch": 5.273246329526917, + "grad_norm": 0.08676740527153015, + "learning_rate": 0.0009206099731454562, + "loss": 0.0689, + "num_input_tokens_seen": 69758592, + "step": 32325 + }, + { + "epoch": 5.274061990212072, + "grad_norm": 0.04374626651406288, + "learning_rate": 0.0009205714824462487, + "loss": 0.077, + "num_input_tokens_seen": 69769088, + "step": 32330 + }, + { + "epoch": 5.274877650897227, + "grad_norm": 0.01394625473767519, + "learning_rate": 0.0009205329832236265, + "loss": 0.0734, + "num_input_tokens_seen": 69780064, + "step": 32335 + }, + { + "epoch": 5.275693311582382, + "grad_norm": 0.029180048033595085, + "learning_rate": 0.0009204944754783698, + "loss": 0.1656, + "num_input_tokens_seen": 69792064, + "step": 32340 + }, + { + "epoch": 5.2765089722675365, + "grad_norm": 0.08653794974088669, + "learning_rate": 0.0009204559592112592, + "loss": 0.2095, + "num_input_tokens_seen": 69802304, + "step": 32345 + }, + { + "epoch": 5.277324632952691, + "grad_norm": 0.04603267461061478, + "learning_rate": 0.0009204174344230751, + "loss": 0.0944, + "num_input_tokens_seen": 69811104, + "step": 32350 + }, + { + "epoch": 5.278140293637847, + "grad_norm": 0.008521195501089096, + "learning_rate": 0.0009203789011145984, + "loss": 0.0677, + "num_input_tokens_seen": 69823072, + "step": 32355 + }, + { + "epoch": 5.278955954323002, + "grad_norm": 0.1991201490163803, + "learning_rate": 0.00092034035928661, + "loss": 0.2055, + "num_input_tokens_seen": 69835104, + "step": 32360 + }, + { + "epoch": 5.279771615008157, + "grad_norm": 0.1315097212791443, + "learning_rate": 0.000920301808939891, + "loss": 0.0903, + "num_input_tokens_seen": 69846784, + "step": 32365 + }, + { + "epoch": 5.280587275693311, + "grad_norm": 0.09303920716047287, + "learning_rate": 0.0009202632500752226, + "loss": 0.0636, + "num_input_tokens_seen": 69857024, + "step": 32370 + }, + { + "epoch": 5.281402936378466, + "grad_norm": 0.03423493728041649, + "learning_rate": 0.0009202246826933864, + "loss": 0.0232, + "num_input_tokens_seen": 69866624, + "step": 32375 + }, + { + "epoch": 5.282218597063622, + "grad_norm": 0.1135723888874054, + "learning_rate": 0.0009201861067951638, + "loss": 0.0606, + "num_input_tokens_seen": 69878816, + "step": 32380 + }, + { + "epoch": 5.283034257748777, + "grad_norm": 0.00647773640230298, + "learning_rate": 0.0009201475223813368, + "loss": 0.0939, + "num_input_tokens_seen": 69889184, + "step": 32385 + }, + { + "epoch": 5.283849918433932, + "grad_norm": 0.037884388118982315, + "learning_rate": 0.0009201089294526872, + "loss": 0.069, + "num_input_tokens_seen": 69900160, + "step": 32390 + }, + { + "epoch": 5.284665579119086, + "grad_norm": 0.010856589302420616, + "learning_rate": 0.0009200703280099971, + "loss": 0.097, + "num_input_tokens_seen": 69910848, + "step": 32395 + }, + { + "epoch": 5.285481239804241, + "grad_norm": 0.006763134151697159, + "learning_rate": 0.0009200317180540491, + "loss": 0.022, + "num_input_tokens_seen": 69921856, + "step": 32400 + }, + { + "epoch": 5.286296900489396, + "grad_norm": 0.006428079679608345, + "learning_rate": 0.0009199930995856254, + "loss": 0.1045, + "num_input_tokens_seen": 69932768, + "step": 32405 + }, + { + "epoch": 5.287112561174552, + "grad_norm": 0.21502438187599182, + "learning_rate": 0.0009199544726055087, + "loss": 0.121, + "num_input_tokens_seen": 69943488, + "step": 32410 + }, + { + "epoch": 5.287928221859707, + "grad_norm": 0.2343747317790985, + "learning_rate": 0.000919915837114482, + "loss": 0.0813, + "num_input_tokens_seen": 69953312, + "step": 32415 + }, + { + "epoch": 5.288743882544861, + "grad_norm": 0.13441915810108185, + "learning_rate": 0.0009198771931133281, + "loss": 0.056, + "num_input_tokens_seen": 69963072, + "step": 32420 + }, + { + "epoch": 5.289559543230016, + "grad_norm": 0.004303318448364735, + "learning_rate": 0.0009198385406028302, + "loss": 0.0554, + "num_input_tokens_seen": 69973600, + "step": 32425 + }, + { + "epoch": 5.290375203915171, + "grad_norm": 0.019534459337592125, + "learning_rate": 0.0009197998795837716, + "loss": 0.0292, + "num_input_tokens_seen": 69984384, + "step": 32430 + }, + { + "epoch": 5.291190864600326, + "grad_norm": 0.03160841763019562, + "learning_rate": 0.0009197612100569359, + "loss": 0.0466, + "num_input_tokens_seen": 69995616, + "step": 32435 + }, + { + "epoch": 5.2920065252854815, + "grad_norm": 0.12419536709785461, + "learning_rate": 0.0009197225320231069, + "loss": 0.0372, + "num_input_tokens_seen": 70006432, + "step": 32440 + }, + { + "epoch": 5.292822185970636, + "grad_norm": 0.19679783284664154, + "learning_rate": 0.0009196838454830682, + "loss": 0.2411, + "num_input_tokens_seen": 70017120, + "step": 32445 + }, + { + "epoch": 5.293637846655791, + "grad_norm": 0.05362845957279205, + "learning_rate": 0.000919645150437604, + "loss": 0.1129, + "num_input_tokens_seen": 70027744, + "step": 32450 + }, + { + "epoch": 5.294453507340946, + "grad_norm": 0.06048906221985817, + "learning_rate": 0.0009196064468874985, + "loss": 0.0389, + "num_input_tokens_seen": 70039360, + "step": 32455 + }, + { + "epoch": 5.295269168026101, + "grad_norm": 0.006705464329570532, + "learning_rate": 0.0009195677348335361, + "loss": 0.0734, + "num_input_tokens_seen": 70049952, + "step": 32460 + }, + { + "epoch": 5.2960848287112565, + "grad_norm": 0.014060980640351772, + "learning_rate": 0.0009195290142765012, + "loss": 0.2502, + "num_input_tokens_seen": 70059904, + "step": 32465 + }, + { + "epoch": 5.296900489396411, + "grad_norm": 0.02606513909995556, + "learning_rate": 0.0009194902852171787, + "loss": 0.1261, + "num_input_tokens_seen": 70070464, + "step": 32470 + }, + { + "epoch": 5.297716150081566, + "grad_norm": 0.10168711096048355, + "learning_rate": 0.0009194515476563533, + "loss": 0.0407, + "num_input_tokens_seen": 70081472, + "step": 32475 + }, + { + "epoch": 5.298531810766721, + "grad_norm": 0.033818844705820084, + "learning_rate": 0.0009194128015948103, + "loss": 0.0462, + "num_input_tokens_seen": 70091360, + "step": 32480 + }, + { + "epoch": 5.299347471451876, + "grad_norm": 0.17443619668483734, + "learning_rate": 0.0009193740470333347, + "loss": 0.0642, + "num_input_tokens_seen": 70101088, + "step": 32485 + }, + { + "epoch": 5.300163132137031, + "grad_norm": 0.11893410235643387, + "learning_rate": 0.0009193352839727121, + "loss": 0.0653, + "num_input_tokens_seen": 70112608, + "step": 32490 + }, + { + "epoch": 5.300978792822186, + "grad_norm": 0.0020797194447368383, + "learning_rate": 0.0009192965124137279, + "loss": 0.0458, + "num_input_tokens_seen": 70124128, + "step": 32495 + }, + { + "epoch": 5.301794453507341, + "grad_norm": 0.3257969617843628, + "learning_rate": 0.000919257732357168, + "loss": 0.2114, + "num_input_tokens_seen": 70136736, + "step": 32500 + }, + { + "epoch": 5.302610114192496, + "grad_norm": 0.13580219447612762, + "learning_rate": 0.0009192189438038183, + "loss": 0.1277, + "num_input_tokens_seen": 70148768, + "step": 32505 + }, + { + "epoch": 5.303425774877651, + "grad_norm": 0.18040874600410461, + "learning_rate": 0.0009191801467544649, + "loss": 0.1528, + "num_input_tokens_seen": 70158592, + "step": 32510 + }, + { + "epoch": 5.304241435562806, + "grad_norm": 0.012007476761937141, + "learning_rate": 0.0009191413412098942, + "loss": 0.0477, + "num_input_tokens_seen": 70169856, + "step": 32515 + }, + { + "epoch": 5.30505709624796, + "grad_norm": 0.07372158020734787, + "learning_rate": 0.0009191025271708923, + "loss": 0.023, + "num_input_tokens_seen": 70179680, + "step": 32520 + }, + { + "epoch": 5.305872756933116, + "grad_norm": 0.2031242698431015, + "learning_rate": 0.0009190637046382461, + "loss": 0.1372, + "num_input_tokens_seen": 70191104, + "step": 32525 + }, + { + "epoch": 5.306688417618271, + "grad_norm": 0.7364500761032104, + "learning_rate": 0.0009190248736127422, + "loss": 0.1483, + "num_input_tokens_seen": 70201728, + "step": 32530 + }, + { + "epoch": 5.307504078303426, + "grad_norm": 0.02838551253080368, + "learning_rate": 0.0009189860340951679, + "loss": 0.0187, + "num_input_tokens_seen": 70213088, + "step": 32535 + }, + { + "epoch": 5.308319738988581, + "grad_norm": 0.20185182988643646, + "learning_rate": 0.0009189471860863099, + "loss": 0.0628, + "num_input_tokens_seen": 70223648, + "step": 32540 + }, + { + "epoch": 5.309135399673735, + "grad_norm": 0.08813232183456421, + "learning_rate": 0.0009189083295869558, + "loss": 0.0458, + "num_input_tokens_seen": 70234272, + "step": 32545 + }, + { + "epoch": 5.309951060358891, + "grad_norm": 0.009246550500392914, + "learning_rate": 0.0009188694645978928, + "loss": 0.0786, + "num_input_tokens_seen": 70244800, + "step": 32550 + }, + { + "epoch": 5.310766721044046, + "grad_norm": 0.0045503536239266396, + "learning_rate": 0.0009188305911199088, + "loss": 0.101, + "num_input_tokens_seen": 70255360, + "step": 32555 + }, + { + "epoch": 5.311582381729201, + "grad_norm": 0.0028419364243745804, + "learning_rate": 0.0009187917091537918, + "loss": 0.0556, + "num_input_tokens_seen": 70265888, + "step": 32560 + }, + { + "epoch": 5.3123980424143555, + "grad_norm": 0.012658837251365185, + "learning_rate": 0.0009187528187003293, + "loss": 0.1486, + "num_input_tokens_seen": 70276544, + "step": 32565 + }, + { + "epoch": 5.31321370309951, + "grad_norm": 0.02412761189043522, + "learning_rate": 0.0009187139197603097, + "loss": 0.056, + "num_input_tokens_seen": 70287904, + "step": 32570 + }, + { + "epoch": 5.314029363784665, + "grad_norm": 0.025204742327332497, + "learning_rate": 0.0009186750123345214, + "loss": 0.0753, + "num_input_tokens_seen": 70299872, + "step": 32575 + }, + { + "epoch": 5.314845024469821, + "grad_norm": 0.009722416289150715, + "learning_rate": 0.0009186360964237528, + "loss": 0.1139, + "num_input_tokens_seen": 70310624, + "step": 32580 + }, + { + "epoch": 5.315660685154976, + "grad_norm": 0.016365546733140945, + "learning_rate": 0.0009185971720287926, + "loss": 0.0157, + "num_input_tokens_seen": 70321312, + "step": 32585 + }, + { + "epoch": 5.3164763458401305, + "grad_norm": 0.0059278481639921665, + "learning_rate": 0.0009185582391504299, + "loss": 0.0319, + "num_input_tokens_seen": 70332384, + "step": 32590 + }, + { + "epoch": 5.317292006525285, + "grad_norm": 0.14799389243125916, + "learning_rate": 0.0009185192977894533, + "loss": 0.1078, + "num_input_tokens_seen": 70343232, + "step": 32595 + }, + { + "epoch": 5.31810766721044, + "grad_norm": 0.20223501324653625, + "learning_rate": 0.0009184803479466521, + "loss": 0.2219, + "num_input_tokens_seen": 70355104, + "step": 32600 + }, + { + "epoch": 5.318923327895595, + "grad_norm": 0.19203418493270874, + "learning_rate": 0.0009184413896228161, + "loss": 0.1992, + "num_input_tokens_seen": 70366400, + "step": 32605 + }, + { + "epoch": 5.319738988580751, + "grad_norm": 0.008682274259626865, + "learning_rate": 0.0009184024228187343, + "loss": 0.0917, + "num_input_tokens_seen": 70378400, + "step": 32610 + }, + { + "epoch": 5.3205546492659055, + "grad_norm": 0.05555209890007973, + "learning_rate": 0.0009183634475351967, + "loss": 0.0607, + "num_input_tokens_seen": 70389952, + "step": 32615 + }, + { + "epoch": 5.32137030995106, + "grad_norm": 0.03131626546382904, + "learning_rate": 0.0009183244637729931, + "loss": 0.0876, + "num_input_tokens_seen": 70401408, + "step": 32620 + }, + { + "epoch": 5.322185970636215, + "grad_norm": 0.023293692618608475, + "learning_rate": 0.0009182854715329134, + "loss": 0.1572, + "num_input_tokens_seen": 70412640, + "step": 32625 + }, + { + "epoch": 5.32300163132137, + "grad_norm": 0.03549553081393242, + "learning_rate": 0.0009182464708157481, + "loss": 0.0636, + "num_input_tokens_seen": 70422592, + "step": 32630 + }, + { + "epoch": 5.323817292006526, + "grad_norm": 0.021040933206677437, + "learning_rate": 0.0009182074616222875, + "loss": 0.0867, + "num_input_tokens_seen": 70433856, + "step": 32635 + }, + { + "epoch": 5.3246329526916805, + "grad_norm": 0.19329990446567535, + "learning_rate": 0.0009181684439533223, + "loss": 0.1363, + "num_input_tokens_seen": 70445376, + "step": 32640 + }, + { + "epoch": 5.325448613376835, + "grad_norm": 0.19314946234226227, + "learning_rate": 0.0009181294178096427, + "loss": 0.1816, + "num_input_tokens_seen": 70456384, + "step": 32645 + }, + { + "epoch": 5.32626427406199, + "grad_norm": 0.014693030156195164, + "learning_rate": 0.0009180903831920404, + "loss": 0.0935, + "num_input_tokens_seen": 70467040, + "step": 32650 + }, + { + "epoch": 5.327079934747145, + "grad_norm": 0.10082631558179855, + "learning_rate": 0.0009180513401013059, + "loss": 0.1497, + "num_input_tokens_seen": 70477248, + "step": 32655 + }, + { + "epoch": 5.327895595432301, + "grad_norm": 0.002789553487673402, + "learning_rate": 0.0009180122885382307, + "loss": 0.1141, + "num_input_tokens_seen": 70487648, + "step": 32660 + }, + { + "epoch": 5.328711256117455, + "grad_norm": 0.03882679343223572, + "learning_rate": 0.0009179732285036062, + "loss": 0.1289, + "num_input_tokens_seen": 70499296, + "step": 32665 + }, + { + "epoch": 5.32952691680261, + "grad_norm": 0.08441449701786041, + "learning_rate": 0.0009179341599982239, + "loss": 0.0666, + "num_input_tokens_seen": 70509664, + "step": 32670 + }, + { + "epoch": 5.330342577487765, + "grad_norm": 0.009871457703411579, + "learning_rate": 0.0009178950830228759, + "loss": 0.0638, + "num_input_tokens_seen": 70520064, + "step": 32675 + }, + { + "epoch": 5.33115823817292, + "grad_norm": 0.13113850355148315, + "learning_rate": 0.0009178559975783536, + "loss": 0.1329, + "num_input_tokens_seen": 70531104, + "step": 32680 + }, + { + "epoch": 5.331973898858075, + "grad_norm": 0.0038017858751118183, + "learning_rate": 0.0009178169036654496, + "loss": 0.065, + "num_input_tokens_seen": 70542528, + "step": 32685 + }, + { + "epoch": 5.33278955954323, + "grad_norm": 0.15180036425590515, + "learning_rate": 0.0009177778012849561, + "loss": 0.0502, + "num_input_tokens_seen": 70553216, + "step": 32690 + }, + { + "epoch": 5.333605220228385, + "grad_norm": 0.17331519722938538, + "learning_rate": 0.0009177386904376652, + "loss": 0.146, + "num_input_tokens_seen": 70565504, + "step": 32695 + }, + { + "epoch": 5.33442088091354, + "grad_norm": 0.04667358100414276, + "learning_rate": 0.0009176995711243699, + "loss": 0.1109, + "num_input_tokens_seen": 70577376, + "step": 32700 + }, + { + "epoch": 5.335236541598695, + "grad_norm": 0.024390315636992455, + "learning_rate": 0.0009176604433458631, + "loss": 0.091, + "num_input_tokens_seen": 70588960, + "step": 32705 + }, + { + "epoch": 5.33605220228385, + "grad_norm": 0.133183553814888, + "learning_rate": 0.0009176213071029373, + "loss": 0.0654, + "num_input_tokens_seen": 70600192, + "step": 32710 + }, + { + "epoch": 5.3368678629690045, + "grad_norm": 0.015062015503644943, + "learning_rate": 0.0009175821623963861, + "loss": 0.0979, + "num_input_tokens_seen": 70610816, + "step": 32715 + }, + { + "epoch": 5.33768352365416, + "grad_norm": 0.19104783236980438, + "learning_rate": 0.0009175430092270026, + "loss": 0.1714, + "num_input_tokens_seen": 70621280, + "step": 32720 + }, + { + "epoch": 5.338499184339315, + "grad_norm": 0.05880535766482353, + "learning_rate": 0.0009175038475955804, + "loss": 0.06, + "num_input_tokens_seen": 70631968, + "step": 32725 + }, + { + "epoch": 5.33931484502447, + "grad_norm": 0.020863739773631096, + "learning_rate": 0.0009174646775029129, + "loss": 0.2037, + "num_input_tokens_seen": 70643296, + "step": 32730 + }, + { + "epoch": 5.340130505709625, + "grad_norm": 0.014811712317168713, + "learning_rate": 0.0009174254989497942, + "loss": 0.0625, + "num_input_tokens_seen": 70652576, + "step": 32735 + }, + { + "epoch": 5.3409461663947795, + "grad_norm": 0.028791295364499092, + "learning_rate": 0.0009173863119370183, + "loss": 0.1016, + "num_input_tokens_seen": 70663424, + "step": 32740 + }, + { + "epoch": 5.341761827079935, + "grad_norm": 0.09639950096607208, + "learning_rate": 0.0009173471164653791, + "loss": 0.0707, + "num_input_tokens_seen": 70673536, + "step": 32745 + }, + { + "epoch": 5.34257748776509, + "grad_norm": 0.12958593666553497, + "learning_rate": 0.0009173079125356714, + "loss": 0.0743, + "num_input_tokens_seen": 70683712, + "step": 32750 + }, + { + "epoch": 5.343393148450245, + "grad_norm": 0.172854483127594, + "learning_rate": 0.0009172687001486892, + "loss": 0.1397, + "num_input_tokens_seen": 70693856, + "step": 32755 + }, + { + "epoch": 5.3442088091354, + "grad_norm": 0.022963641211390495, + "learning_rate": 0.0009172294793052277, + "loss": 0.0486, + "num_input_tokens_seen": 70705024, + "step": 32760 + }, + { + "epoch": 5.3450244698205545, + "grad_norm": 0.02787846140563488, + "learning_rate": 0.0009171902500060814, + "loss": 0.0311, + "num_input_tokens_seen": 70715040, + "step": 32765 + }, + { + "epoch": 5.345840130505709, + "grad_norm": 0.17813503742218018, + "learning_rate": 0.0009171510122520455, + "loss": 0.0675, + "num_input_tokens_seen": 70726080, + "step": 32770 + }, + { + "epoch": 5.346655791190865, + "grad_norm": 0.02608194760978222, + "learning_rate": 0.000917111766043915, + "loss": 0.0875, + "num_input_tokens_seen": 70736800, + "step": 32775 + }, + { + "epoch": 5.34747145187602, + "grad_norm": 0.18031036853790283, + "learning_rate": 0.0009170725113824855, + "loss": 0.1318, + "num_input_tokens_seen": 70748928, + "step": 32780 + }, + { + "epoch": 5.348287112561175, + "grad_norm": 0.011285116896033287, + "learning_rate": 0.0009170332482685524, + "loss": 0.0815, + "num_input_tokens_seen": 70760736, + "step": 32785 + }, + { + "epoch": 5.349102773246329, + "grad_norm": 0.1985178291797638, + "learning_rate": 0.0009169939767029116, + "loss": 0.1269, + "num_input_tokens_seen": 70771552, + "step": 32790 + }, + { + "epoch": 5.349918433931484, + "grad_norm": 0.21643872559070587, + "learning_rate": 0.0009169546966863588, + "loss": 0.1315, + "num_input_tokens_seen": 70782656, + "step": 32795 + }, + { + "epoch": 5.350734094616639, + "grad_norm": 0.07655095309019089, + "learning_rate": 0.0009169154082196901, + "loss": 0.0703, + "num_input_tokens_seen": 70792736, + "step": 32800 + }, + { + "epoch": 5.351549755301795, + "grad_norm": 0.012563557364046574, + "learning_rate": 0.0009168761113037019, + "loss": 0.1031, + "num_input_tokens_seen": 70804032, + "step": 32805 + }, + { + "epoch": 5.35236541598695, + "grad_norm": 0.1358395516872406, + "learning_rate": 0.0009168368059391903, + "loss": 0.0656, + "num_input_tokens_seen": 70814208, + "step": 32810 + }, + { + "epoch": 5.353181076672104, + "grad_norm": 0.14327511191368103, + "learning_rate": 0.0009167974921269519, + "loss": 0.1379, + "num_input_tokens_seen": 70825312, + "step": 32815 + }, + { + "epoch": 5.353996737357259, + "grad_norm": 0.07677187770605087, + "learning_rate": 0.0009167581698677838, + "loss": 0.1485, + "num_input_tokens_seen": 70835680, + "step": 32820 + }, + { + "epoch": 5.354812398042414, + "grad_norm": 0.008992388844490051, + "learning_rate": 0.0009167188391624827, + "loss": 0.0555, + "num_input_tokens_seen": 70846336, + "step": 32825 + }, + { + "epoch": 5.35562805872757, + "grad_norm": 0.07988745719194412, + "learning_rate": 0.0009166795000118456, + "loss": 0.0488, + "num_input_tokens_seen": 70858016, + "step": 32830 + }, + { + "epoch": 5.356443719412725, + "grad_norm": 0.20251862704753876, + "learning_rate": 0.0009166401524166699, + "loss": 0.1367, + "num_input_tokens_seen": 70868704, + "step": 32835 + }, + { + "epoch": 5.357259380097879, + "grad_norm": 0.012772920541465282, + "learning_rate": 0.000916600796377753, + "loss": 0.0483, + "num_input_tokens_seen": 70879392, + "step": 32840 + }, + { + "epoch": 5.358075040783034, + "grad_norm": 0.17998351156711578, + "learning_rate": 0.0009165614318958924, + "loss": 0.1928, + "num_input_tokens_seen": 70890848, + "step": 32845 + }, + { + "epoch": 5.358890701468189, + "grad_norm": 0.06456758081912994, + "learning_rate": 0.0009165220589718859, + "loss": 0.0251, + "num_input_tokens_seen": 70901472, + "step": 32850 + }, + { + "epoch": 5.359706362153344, + "grad_norm": 0.044556617736816406, + "learning_rate": 0.0009164826776065316, + "loss": 0.1251, + "num_input_tokens_seen": 70911904, + "step": 32855 + }, + { + "epoch": 5.3605220228384995, + "grad_norm": 0.0192783921957016, + "learning_rate": 0.0009164432878006274, + "loss": 0.0375, + "num_input_tokens_seen": 70922880, + "step": 32860 + }, + { + "epoch": 5.361337683523654, + "grad_norm": 0.048101428896188736, + "learning_rate": 0.0009164038895549716, + "loss": 0.0501, + "num_input_tokens_seen": 70933472, + "step": 32865 + }, + { + "epoch": 5.362153344208809, + "grad_norm": 0.007880575954914093, + "learning_rate": 0.0009163644828703628, + "loss": 0.0717, + "num_input_tokens_seen": 70944864, + "step": 32870 + }, + { + "epoch": 5.362969004893964, + "grad_norm": 0.3458538055419922, + "learning_rate": 0.0009163250677475996, + "loss": 0.073, + "num_input_tokens_seen": 70956512, + "step": 32875 + }, + { + "epoch": 5.363784665579119, + "grad_norm": 0.016341062262654305, + "learning_rate": 0.0009162856441874807, + "loss": 0.0188, + "num_input_tokens_seen": 70966400, + "step": 32880 + }, + { + "epoch": 5.364600326264274, + "grad_norm": 0.09153753519058228, + "learning_rate": 0.0009162462121908052, + "loss": 0.0242, + "num_input_tokens_seen": 70977120, + "step": 32885 + }, + { + "epoch": 5.365415986949429, + "grad_norm": 0.03199958801269531, + "learning_rate": 0.0009162067717583722, + "loss": 0.0679, + "num_input_tokens_seen": 70987712, + "step": 32890 + }, + { + "epoch": 5.366231647634584, + "grad_norm": 0.016981739550828934, + "learning_rate": 0.0009161673228909808, + "loss": 0.0438, + "num_input_tokens_seen": 70998080, + "step": 32895 + }, + { + "epoch": 5.367047308319739, + "grad_norm": 0.09677354991436005, + "learning_rate": 0.0009161278655894307, + "loss": 0.1004, + "num_input_tokens_seen": 71009568, + "step": 32900 + }, + { + "epoch": 5.367862969004894, + "grad_norm": 0.19128036499023438, + "learning_rate": 0.0009160883998545216, + "loss": 0.2398, + "num_input_tokens_seen": 71021728, + "step": 32905 + }, + { + "epoch": 5.368678629690049, + "grad_norm": 0.04529458284378052, + "learning_rate": 0.0009160489256870532, + "loss": 0.0511, + "num_input_tokens_seen": 71032704, + "step": 32910 + }, + { + "epoch": 5.369494290375204, + "grad_norm": 0.023523258045315742, + "learning_rate": 0.0009160094430878255, + "loss": 0.0306, + "num_input_tokens_seen": 71043808, + "step": 32915 + }, + { + "epoch": 5.370309951060359, + "grad_norm": 0.013278118334710598, + "learning_rate": 0.0009159699520576388, + "loss": 0.1489, + "num_input_tokens_seen": 71054752, + "step": 32920 + }, + { + "epoch": 5.371125611745514, + "grad_norm": 0.07473546266555786, + "learning_rate": 0.0009159304525972931, + "loss": 0.1348, + "num_input_tokens_seen": 71065216, + "step": 32925 + }, + { + "epoch": 5.371941272430669, + "grad_norm": 0.024516548961400986, + "learning_rate": 0.0009158909447075894, + "loss": 0.185, + "num_input_tokens_seen": 71074752, + "step": 32930 + }, + { + "epoch": 5.372756933115824, + "grad_norm": 0.009506557136774063, + "learning_rate": 0.0009158514283893279, + "loss": 0.0788, + "num_input_tokens_seen": 71084864, + "step": 32935 + }, + { + "epoch": 5.373572593800978, + "grad_norm": 0.007520253770053387, + "learning_rate": 0.0009158119036433097, + "loss": 0.0999, + "num_input_tokens_seen": 71095392, + "step": 32940 + }, + { + "epoch": 5.374388254486134, + "grad_norm": 0.009985439479351044, + "learning_rate": 0.0009157723704703358, + "loss": 0.1633, + "num_input_tokens_seen": 71105472, + "step": 32945 + }, + { + "epoch": 5.375203915171289, + "grad_norm": 0.03562254086136818, + "learning_rate": 0.0009157328288712075, + "loss": 0.1278, + "num_input_tokens_seen": 71116448, + "step": 32950 + }, + { + "epoch": 5.376019575856444, + "grad_norm": 0.02401045337319374, + "learning_rate": 0.0009156932788467259, + "loss": 0.0315, + "num_input_tokens_seen": 71128032, + "step": 32955 + }, + { + "epoch": 5.376835236541599, + "grad_norm": 0.008072743192315102, + "learning_rate": 0.0009156537203976927, + "loss": 0.0713, + "num_input_tokens_seen": 71140448, + "step": 32960 + }, + { + "epoch": 5.377650897226753, + "grad_norm": 0.024254092946648598, + "learning_rate": 0.0009156141535249094, + "loss": 0.0164, + "num_input_tokens_seen": 71150720, + "step": 32965 + }, + { + "epoch": 5.378466557911908, + "grad_norm": 0.011184643022716045, + "learning_rate": 0.0009155745782291782, + "loss": 0.1193, + "num_input_tokens_seen": 71161920, + "step": 32970 + }, + { + "epoch": 5.379282218597064, + "grad_norm": 0.011535129509866238, + "learning_rate": 0.000915534994511301, + "loss": 0.0258, + "num_input_tokens_seen": 71173888, + "step": 32975 + }, + { + "epoch": 5.380097879282219, + "grad_norm": 0.013877046294510365, + "learning_rate": 0.0009154954023720799, + "loss": 0.0315, + "num_input_tokens_seen": 71185984, + "step": 32980 + }, + { + "epoch": 5.3809135399673735, + "grad_norm": 0.026869961991906166, + "learning_rate": 0.0009154558018123174, + "loss": 0.055, + "num_input_tokens_seen": 71196640, + "step": 32985 + }, + { + "epoch": 5.381729200652528, + "grad_norm": 0.020369326695799828, + "learning_rate": 0.000915416192832816, + "loss": 0.0133, + "num_input_tokens_seen": 71208000, + "step": 32990 + }, + { + "epoch": 5.382544861337683, + "grad_norm": 0.019414683803915977, + "learning_rate": 0.0009153765754343786, + "loss": 0.0324, + "num_input_tokens_seen": 71219616, + "step": 32995 + }, + { + "epoch": 5.383360522022839, + "grad_norm": 0.029827341437339783, + "learning_rate": 0.0009153369496178078, + "loss": 0.0402, + "num_input_tokens_seen": 71230880, + "step": 33000 + }, + { + "epoch": 5.384176182707994, + "grad_norm": 0.2428472340106964, + "learning_rate": 0.0009152973153839068, + "loss": 0.2281, + "num_input_tokens_seen": 71240800, + "step": 33005 + }, + { + "epoch": 5.3849918433931485, + "grad_norm": 0.009897132404148579, + "learning_rate": 0.000915257672733479, + "loss": 0.0575, + "num_input_tokens_seen": 71250784, + "step": 33010 + }, + { + "epoch": 5.385807504078303, + "grad_norm": 0.01277187466621399, + "learning_rate": 0.0009152180216673276, + "loss": 0.1614, + "num_input_tokens_seen": 71260576, + "step": 33015 + }, + { + "epoch": 5.386623164763458, + "grad_norm": 0.04543527960777283, + "learning_rate": 0.0009151783621862564, + "loss": 0.024, + "num_input_tokens_seen": 71270976, + "step": 33020 + }, + { + "epoch": 5.387438825448613, + "grad_norm": 0.026614440605044365, + "learning_rate": 0.0009151386942910688, + "loss": 0.1574, + "num_input_tokens_seen": 71281216, + "step": 33025 + }, + { + "epoch": 5.388254486133769, + "grad_norm": 0.012615106999874115, + "learning_rate": 0.0009150990179825689, + "loss": 0.0917, + "num_input_tokens_seen": 71292064, + "step": 33030 + }, + { + "epoch": 5.3890701468189235, + "grad_norm": 0.014470712281763554, + "learning_rate": 0.000915059333261561, + "loss": 0.1046, + "num_input_tokens_seen": 71303392, + "step": 33035 + }, + { + "epoch": 5.389885807504078, + "grad_norm": 0.0171805527061224, + "learning_rate": 0.0009150196401288491, + "loss": 0.0561, + "num_input_tokens_seen": 71314016, + "step": 33040 + }, + { + "epoch": 5.390701468189233, + "grad_norm": 0.024619970470666885, + "learning_rate": 0.0009149799385852375, + "loss": 0.0452, + "num_input_tokens_seen": 71326016, + "step": 33045 + }, + { + "epoch": 5.391517128874388, + "grad_norm": 0.020328696817159653, + "learning_rate": 0.0009149402286315314, + "loss": 0.1403, + "num_input_tokens_seen": 71335840, + "step": 33050 + }, + { + "epoch": 5.392332789559543, + "grad_norm": 0.010986017994582653, + "learning_rate": 0.0009149005102685348, + "loss": 0.0814, + "num_input_tokens_seen": 71347072, + "step": 33055 + }, + { + "epoch": 5.3931484502446985, + "grad_norm": 0.15942789614200592, + "learning_rate": 0.0009148607834970532, + "loss": 0.0815, + "num_input_tokens_seen": 71358464, + "step": 33060 + }, + { + "epoch": 5.393964110929853, + "grad_norm": 0.04686347022652626, + "learning_rate": 0.0009148210483178916, + "loss": 0.0889, + "num_input_tokens_seen": 71367776, + "step": 33065 + }, + { + "epoch": 5.394779771615008, + "grad_norm": 0.027309052646160126, + "learning_rate": 0.000914781304731855, + "loss": 0.0158, + "num_input_tokens_seen": 71378368, + "step": 33070 + }, + { + "epoch": 5.395595432300163, + "grad_norm": 0.21841654181480408, + "learning_rate": 0.0009147415527397492, + "loss": 0.185, + "num_input_tokens_seen": 71389632, + "step": 33075 + }, + { + "epoch": 5.396411092985318, + "grad_norm": 0.01469578966498375, + "learning_rate": 0.0009147017923423797, + "loss": 0.0208, + "num_input_tokens_seen": 71400064, + "step": 33080 + }, + { + "epoch": 5.397226753670473, + "grad_norm": 0.023963550105690956, + "learning_rate": 0.0009146620235405523, + "loss": 0.0629, + "num_input_tokens_seen": 71410784, + "step": 33085 + }, + { + "epoch": 5.398042414355628, + "grad_norm": 0.20159204304218292, + "learning_rate": 0.0009146222463350729, + "loss": 0.1535, + "num_input_tokens_seen": 71421632, + "step": 33090 + }, + { + "epoch": 5.398858075040783, + "grad_norm": 0.0029823933728039265, + "learning_rate": 0.0009145824607267478, + "loss": 0.0801, + "num_input_tokens_seen": 71432640, + "step": 33095 + }, + { + "epoch": 5.399673735725938, + "grad_norm": 0.05181365832686424, + "learning_rate": 0.0009145426667163832, + "loss": 0.023, + "num_input_tokens_seen": 71442816, + "step": 33100 + }, + { + "epoch": 5.400489396411093, + "grad_norm": 0.05993790924549103, + "learning_rate": 0.0009145028643047855, + "loss": 0.0569, + "num_input_tokens_seen": 71453568, + "step": 33105 + }, + { + "epoch": 5.401305057096248, + "grad_norm": 0.0076814512722194195, + "learning_rate": 0.0009144630534927613, + "loss": 0.0447, + "num_input_tokens_seen": 71464448, + "step": 33110 + }, + { + "epoch": 5.402120717781403, + "grad_norm": 0.06428392231464386, + "learning_rate": 0.0009144232342811179, + "loss": 0.1457, + "num_input_tokens_seen": 71475264, + "step": 33115 + }, + { + "epoch": 5.402936378466558, + "grad_norm": 0.018395787104964256, + "learning_rate": 0.0009143834066706615, + "loss": 0.0443, + "num_input_tokens_seen": 71484704, + "step": 33120 + }, + { + "epoch": 5.403752039151713, + "grad_norm": 0.15782392024993896, + "learning_rate": 0.0009143435706621999, + "loss": 0.0793, + "num_input_tokens_seen": 71494592, + "step": 33125 + }, + { + "epoch": 5.404567699836868, + "grad_norm": 0.21699565649032593, + "learning_rate": 0.0009143037262565401, + "loss": 0.2304, + "num_input_tokens_seen": 71505568, + "step": 33130 + }, + { + "epoch": 5.4053833605220225, + "grad_norm": 0.0035046711564064026, + "learning_rate": 0.00091426387345449, + "loss": 0.1477, + "num_input_tokens_seen": 71515872, + "step": 33135 + }, + { + "epoch": 5.406199021207178, + "grad_norm": 0.0063438680954277515, + "learning_rate": 0.0009142240122568566, + "loss": 0.0725, + "num_input_tokens_seen": 71526720, + "step": 33140 + }, + { + "epoch": 5.407014681892333, + "grad_norm": 0.008436648175120354, + "learning_rate": 0.0009141841426644482, + "loss": 0.0983, + "num_input_tokens_seen": 71537856, + "step": 33145 + }, + { + "epoch": 5.407830342577488, + "grad_norm": 0.0785364955663681, + "learning_rate": 0.0009141442646780728, + "loss": 0.0603, + "num_input_tokens_seen": 71546624, + "step": 33150 + }, + { + "epoch": 5.408646003262643, + "grad_norm": 0.014238372445106506, + "learning_rate": 0.0009141043782985385, + "loss": 0.1056, + "num_input_tokens_seen": 71556864, + "step": 33155 + }, + { + "epoch": 5.4094616639477975, + "grad_norm": 0.14260686933994293, + "learning_rate": 0.0009140644835266537, + "loss": 0.054, + "num_input_tokens_seen": 71566848, + "step": 33160 + }, + { + "epoch": 5.410277324632952, + "grad_norm": 0.14838922023773193, + "learning_rate": 0.0009140245803632268, + "loss": 0.0883, + "num_input_tokens_seen": 71576544, + "step": 33165 + }, + { + "epoch": 5.411092985318108, + "grad_norm": 0.03204767778515816, + "learning_rate": 0.0009139846688090665, + "loss": 0.0284, + "num_input_tokens_seen": 71586816, + "step": 33170 + }, + { + "epoch": 5.411908646003263, + "grad_norm": 0.08572663366794586, + "learning_rate": 0.0009139447488649818, + "loss": 0.0381, + "num_input_tokens_seen": 71596480, + "step": 33175 + }, + { + "epoch": 5.412724306688418, + "grad_norm": 0.007948421873152256, + "learning_rate": 0.0009139048205317817, + "loss": 0.0835, + "num_input_tokens_seen": 71607968, + "step": 33180 + }, + { + "epoch": 5.4135399673735725, + "grad_norm": 0.2346818447113037, + "learning_rate": 0.0009138648838102751, + "loss": 0.0673, + "num_input_tokens_seen": 71618560, + "step": 33185 + }, + { + "epoch": 5.414355628058727, + "grad_norm": 0.045726362615823746, + "learning_rate": 0.0009138249387012718, + "loss": 0.0242, + "num_input_tokens_seen": 71629952, + "step": 33190 + }, + { + "epoch": 5.415171288743883, + "grad_norm": 0.15147966146469116, + "learning_rate": 0.000913784985205581, + "loss": 0.0562, + "num_input_tokens_seen": 71639296, + "step": 33195 + }, + { + "epoch": 5.415986949429038, + "grad_norm": 0.01030566543340683, + "learning_rate": 0.0009137450233240127, + "loss": 0.1473, + "num_input_tokens_seen": 71649760, + "step": 33200 + }, + { + "epoch": 5.416802610114193, + "grad_norm": 0.05504048243165016, + "learning_rate": 0.0009137050530573765, + "loss": 0.0417, + "num_input_tokens_seen": 71660800, + "step": 33205 + }, + { + "epoch": 5.417618270799347, + "grad_norm": 0.011530941352248192, + "learning_rate": 0.0009136650744064827, + "loss": 0.0348, + "num_input_tokens_seen": 71671104, + "step": 33210 + }, + { + "epoch": 5.418433931484502, + "grad_norm": 0.16801400482654572, + "learning_rate": 0.0009136250873721413, + "loss": 0.1002, + "num_input_tokens_seen": 71681568, + "step": 33215 + }, + { + "epoch": 5.419249592169657, + "grad_norm": 0.05222921445965767, + "learning_rate": 0.0009135850919551628, + "loss": 0.1553, + "num_input_tokens_seen": 71691008, + "step": 33220 + }, + { + "epoch": 5.420065252854813, + "grad_norm": 0.1045856699347496, + "learning_rate": 0.0009135450881563578, + "loss": 0.0743, + "num_input_tokens_seen": 71702112, + "step": 33225 + }, + { + "epoch": 5.420880913539968, + "grad_norm": 0.20069441199302673, + "learning_rate": 0.0009135050759765369, + "loss": 0.1435, + "num_input_tokens_seen": 71713696, + "step": 33230 + }, + { + "epoch": 5.421696574225122, + "grad_norm": 0.012249850668013096, + "learning_rate": 0.0009134650554165111, + "loss": 0.2085, + "num_input_tokens_seen": 71724800, + "step": 33235 + }, + { + "epoch": 5.422512234910277, + "grad_norm": 0.004933356773108244, + "learning_rate": 0.0009134250264770914, + "loss": 0.0356, + "num_input_tokens_seen": 71736128, + "step": 33240 + }, + { + "epoch": 5.423327895595432, + "grad_norm": 0.013518745079636574, + "learning_rate": 0.0009133849891590891, + "loss": 0.0797, + "num_input_tokens_seen": 71746912, + "step": 33245 + }, + { + "epoch": 5.424143556280587, + "grad_norm": 0.15208515524864197, + "learning_rate": 0.0009133449434633157, + "loss": 0.1472, + "num_input_tokens_seen": 71757888, + "step": 33250 + }, + { + "epoch": 5.424959216965743, + "grad_norm": 0.00926217157393694, + "learning_rate": 0.0009133048893905824, + "loss": 0.0624, + "num_input_tokens_seen": 71769504, + "step": 33255 + }, + { + "epoch": 5.425774877650897, + "grad_norm": 0.19139079749584198, + "learning_rate": 0.0009132648269417014, + "loss": 0.1056, + "num_input_tokens_seen": 71780352, + "step": 33260 + }, + { + "epoch": 5.426590538336052, + "grad_norm": 0.18640805780887604, + "learning_rate": 0.0009132247561174843, + "loss": 0.1096, + "num_input_tokens_seen": 71791488, + "step": 33265 + }, + { + "epoch": 5.427406199021207, + "grad_norm": 0.07593125849962234, + "learning_rate": 0.0009131846769187434, + "loss": 0.0512, + "num_input_tokens_seen": 71801728, + "step": 33270 + }, + { + "epoch": 5.428221859706362, + "grad_norm": 0.3415531814098358, + "learning_rate": 0.0009131445893462908, + "loss": 0.1318, + "num_input_tokens_seen": 71812096, + "step": 33275 + }, + { + "epoch": 5.4290375203915175, + "grad_norm": 0.03183945640921593, + "learning_rate": 0.000913104493400939, + "loss": 0.0231, + "num_input_tokens_seen": 71822752, + "step": 33280 + }, + { + "epoch": 5.429853181076672, + "grad_norm": 0.0009118574671447277, + "learning_rate": 0.0009130643890835007, + "loss": 0.0798, + "num_input_tokens_seen": 71833056, + "step": 33285 + }, + { + "epoch": 5.430668841761827, + "grad_norm": 0.003372542094439268, + "learning_rate": 0.0009130242763947884, + "loss": 0.0691, + "num_input_tokens_seen": 71842528, + "step": 33290 + }, + { + "epoch": 5.431484502446982, + "grad_norm": 0.030809585005044937, + "learning_rate": 0.0009129841553356152, + "loss": 0.0319, + "num_input_tokens_seen": 71853440, + "step": 33295 + }, + { + "epoch": 5.432300163132137, + "grad_norm": 0.032825469970703125, + "learning_rate": 0.0009129440259067941, + "loss": 0.0331, + "num_input_tokens_seen": 71863040, + "step": 33300 + }, + { + "epoch": 5.433115823817292, + "grad_norm": 0.011249230243265629, + "learning_rate": 0.0009129038881091386, + "loss": 0.0594, + "num_input_tokens_seen": 71873792, + "step": 33305 + }, + { + "epoch": 5.433931484502447, + "grad_norm": 0.007202036213129759, + "learning_rate": 0.000912863741943462, + "loss": 0.029, + "num_input_tokens_seen": 71885856, + "step": 33310 + }, + { + "epoch": 5.434747145187602, + "grad_norm": 0.19997498393058777, + "learning_rate": 0.000912823587410578, + "loss": 0.0729, + "num_input_tokens_seen": 71896416, + "step": 33315 + }, + { + "epoch": 5.435562805872757, + "grad_norm": 0.023515524342656136, + "learning_rate": 0.0009127834245113, + "loss": 0.1104, + "num_input_tokens_seen": 71906976, + "step": 33320 + }, + { + "epoch": 5.436378466557912, + "grad_norm": 0.03044535033404827, + "learning_rate": 0.0009127432532464424, + "loss": 0.1074, + "num_input_tokens_seen": 71916864, + "step": 33325 + }, + { + "epoch": 5.437194127243067, + "grad_norm": 0.031669724732637405, + "learning_rate": 0.0009127030736168192, + "loss": 0.0342, + "num_input_tokens_seen": 71928288, + "step": 33330 + }, + { + "epoch": 5.438009787928221, + "grad_norm": 0.007505581248551607, + "learning_rate": 0.0009126628856232446, + "loss": 0.0795, + "num_input_tokens_seen": 71939104, + "step": 33335 + }, + { + "epoch": 5.438825448613377, + "grad_norm": 0.04485683888196945, + "learning_rate": 0.0009126226892665333, + "loss": 0.0363, + "num_input_tokens_seen": 71950208, + "step": 33340 + }, + { + "epoch": 5.439641109298532, + "grad_norm": 0.30599188804626465, + "learning_rate": 0.0009125824845474996, + "loss": 0.3294, + "num_input_tokens_seen": 71961312, + "step": 33345 + }, + { + "epoch": 5.440456769983687, + "grad_norm": 0.19477291405200958, + "learning_rate": 0.0009125422714669584, + "loss": 0.2279, + "num_input_tokens_seen": 71973248, + "step": 33350 + }, + { + "epoch": 5.441272430668842, + "grad_norm": 0.0038824514485895634, + "learning_rate": 0.0009125020500257248, + "loss": 0.017, + "num_input_tokens_seen": 71983648, + "step": 33355 + }, + { + "epoch": 5.442088091353996, + "grad_norm": 0.003111607162281871, + "learning_rate": 0.000912461820224614, + "loss": 0.0373, + "num_input_tokens_seen": 71995072, + "step": 33360 + }, + { + "epoch": 5.442903752039152, + "grad_norm": 0.0944511666893959, + "learning_rate": 0.000912421582064441, + "loss": 0.0704, + "num_input_tokens_seen": 72006112, + "step": 33365 + }, + { + "epoch": 5.443719412724307, + "grad_norm": 0.004999196622520685, + "learning_rate": 0.0009123813355460214, + "loss": 0.0093, + "num_input_tokens_seen": 72017184, + "step": 33370 + }, + { + "epoch": 5.444535073409462, + "grad_norm": 0.16159506142139435, + "learning_rate": 0.000912341080670171, + "loss": 0.1015, + "num_input_tokens_seen": 72028352, + "step": 33375 + }, + { + "epoch": 5.445350734094617, + "grad_norm": 0.01775406301021576, + "learning_rate": 0.0009123008174377054, + "loss": 0.0345, + "num_input_tokens_seen": 72037664, + "step": 33380 + }, + { + "epoch": 5.446166394779771, + "grad_norm": 0.009099340066313744, + "learning_rate": 0.0009122605458494409, + "loss": 0.0567, + "num_input_tokens_seen": 72049120, + "step": 33385 + }, + { + "epoch": 5.446982055464926, + "grad_norm": 0.29363304376602173, + "learning_rate": 0.0009122202659061934, + "loss": 0.1408, + "num_input_tokens_seen": 72059104, + "step": 33390 + }, + { + "epoch": 5.447797716150082, + "grad_norm": 0.19655779004096985, + "learning_rate": 0.0009121799776087791, + "loss": 0.0786, + "num_input_tokens_seen": 72070048, + "step": 33395 + }, + { + "epoch": 5.448613376835237, + "grad_norm": 0.013881326653063297, + "learning_rate": 0.0009121396809580147, + "loss": 0.2677, + "num_input_tokens_seen": 72080928, + "step": 33400 + }, + { + "epoch": 5.4494290375203915, + "grad_norm": 0.006764030084013939, + "learning_rate": 0.0009120993759547169, + "loss": 0.0596, + "num_input_tokens_seen": 72091488, + "step": 33405 + }, + { + "epoch": 5.450244698205546, + "grad_norm": 0.029064396396279335, + "learning_rate": 0.0009120590625997026, + "loss": 0.0325, + "num_input_tokens_seen": 72102528, + "step": 33410 + }, + { + "epoch": 5.451060358890701, + "grad_norm": 0.16706690192222595, + "learning_rate": 0.0009120187408937884, + "loss": 0.0457, + "num_input_tokens_seen": 72112992, + "step": 33415 + }, + { + "epoch": 5.451876019575856, + "grad_norm": 0.0666644498705864, + "learning_rate": 0.0009119784108377918, + "loss": 0.114, + "num_input_tokens_seen": 72124096, + "step": 33420 + }, + { + "epoch": 5.452691680261012, + "grad_norm": 0.050407107919454575, + "learning_rate": 0.0009119380724325302, + "loss": 0.0583, + "num_input_tokens_seen": 72134624, + "step": 33425 + }, + { + "epoch": 5.4535073409461665, + "grad_norm": 0.12013330310583115, + "learning_rate": 0.0009118977256788208, + "loss": 0.1689, + "num_input_tokens_seen": 72145344, + "step": 33430 + }, + { + "epoch": 5.454323001631321, + "grad_norm": 0.02103497087955475, + "learning_rate": 0.0009118573705774815, + "loss": 0.1046, + "num_input_tokens_seen": 72157312, + "step": 33435 + }, + { + "epoch": 5.455138662316476, + "grad_norm": 0.1760423183441162, + "learning_rate": 0.0009118170071293302, + "loss": 0.2196, + "num_input_tokens_seen": 72168032, + "step": 33440 + }, + { + "epoch": 5.455954323001631, + "grad_norm": 0.08027848601341248, + "learning_rate": 0.0009117766353351848, + "loss": 0.0449, + "num_input_tokens_seen": 72178944, + "step": 33445 + }, + { + "epoch": 5.456769983686787, + "grad_norm": 0.20456601679325104, + "learning_rate": 0.0009117362551958635, + "loss": 0.0674, + "num_input_tokens_seen": 72188640, + "step": 33450 + }, + { + "epoch": 5.4575856443719415, + "grad_norm": 0.06242414563894272, + "learning_rate": 0.0009116958667121847, + "loss": 0.0671, + "num_input_tokens_seen": 72198720, + "step": 33455 + }, + { + "epoch": 5.458401305057096, + "grad_norm": 0.0028042015619575977, + "learning_rate": 0.0009116554698849668, + "loss": 0.0995, + "num_input_tokens_seen": 72208704, + "step": 33460 + }, + { + "epoch": 5.459216965742251, + "grad_norm": 0.010481450706720352, + "learning_rate": 0.0009116150647150286, + "loss": 0.1103, + "num_input_tokens_seen": 72218688, + "step": 33465 + }, + { + "epoch": 5.460032626427406, + "grad_norm": 0.21536676585674286, + "learning_rate": 0.0009115746512031891, + "loss": 0.1001, + "num_input_tokens_seen": 72230336, + "step": 33470 + }, + { + "epoch": 5.460848287112561, + "grad_norm": 0.015356608666479588, + "learning_rate": 0.0009115342293502669, + "loss": 0.0148, + "num_input_tokens_seen": 72241024, + "step": 33475 + }, + { + "epoch": 5.4616639477977165, + "grad_norm": 0.17659713327884674, + "learning_rate": 0.0009114937991570817, + "loss": 0.0705, + "num_input_tokens_seen": 72252064, + "step": 33480 + }, + { + "epoch": 5.462479608482871, + "grad_norm": 0.160111665725708, + "learning_rate": 0.0009114533606244526, + "loss": 0.2032, + "num_input_tokens_seen": 72263616, + "step": 33485 + }, + { + "epoch": 5.463295269168026, + "grad_norm": 0.025515630841255188, + "learning_rate": 0.0009114129137531991, + "loss": 0.2338, + "num_input_tokens_seen": 72274784, + "step": 33490 + }, + { + "epoch": 5.464110929853181, + "grad_norm": 0.007209369447082281, + "learning_rate": 0.000911372458544141, + "loss": 0.0567, + "num_input_tokens_seen": 72284064, + "step": 33495 + }, + { + "epoch": 5.464926590538336, + "grad_norm": 0.20204782485961914, + "learning_rate": 0.0009113319949980983, + "loss": 0.0861, + "num_input_tokens_seen": 72295616, + "step": 33500 + }, + { + "epoch": 5.465742251223491, + "grad_norm": 0.013891778886318207, + "learning_rate": 0.0009112915231158907, + "loss": 0.1555, + "num_input_tokens_seen": 72305952, + "step": 33505 + }, + { + "epoch": 5.466557911908646, + "grad_norm": 0.07518818974494934, + "learning_rate": 0.0009112510428983387, + "loss": 0.1808, + "num_input_tokens_seen": 72316512, + "step": 33510 + }, + { + "epoch": 5.467373572593801, + "grad_norm": 0.020761430263519287, + "learning_rate": 0.0009112105543462628, + "loss": 0.1744, + "num_input_tokens_seen": 72327328, + "step": 33515 + }, + { + "epoch": 5.468189233278956, + "grad_norm": 0.09224016964435577, + "learning_rate": 0.0009111700574604831, + "loss": 0.0363, + "num_input_tokens_seen": 72338560, + "step": 33520 + }, + { + "epoch": 5.469004893964111, + "grad_norm": 0.06859661638736725, + "learning_rate": 0.0009111295522418207, + "loss": 0.0577, + "num_input_tokens_seen": 72349312, + "step": 33525 + }, + { + "epoch": 5.4698205546492655, + "grad_norm": 0.10898208618164062, + "learning_rate": 0.0009110890386910964, + "loss": 0.0468, + "num_input_tokens_seen": 72361024, + "step": 33530 + }, + { + "epoch": 5.470636215334421, + "grad_norm": 0.07327257096767426, + "learning_rate": 0.0009110485168091311, + "loss": 0.2308, + "num_input_tokens_seen": 72372352, + "step": 33535 + }, + { + "epoch": 5.471451876019576, + "grad_norm": 0.022085702046751976, + "learning_rate": 0.0009110079865967462, + "loss": 0.0797, + "num_input_tokens_seen": 72383136, + "step": 33540 + }, + { + "epoch": 5.472267536704731, + "grad_norm": 0.020697934553027153, + "learning_rate": 0.0009109674480547632, + "loss": 0.0628, + "num_input_tokens_seen": 72392224, + "step": 33545 + }, + { + "epoch": 5.473083197389886, + "grad_norm": 0.020640023052692413, + "learning_rate": 0.0009109269011840033, + "loss": 0.0395, + "num_input_tokens_seen": 72402176, + "step": 33550 + }, + { + "epoch": 5.4738988580750405, + "grad_norm": 0.15004222095012665, + "learning_rate": 0.0009108863459852886, + "loss": 0.1415, + "num_input_tokens_seen": 72413792, + "step": 33555 + }, + { + "epoch": 5.474714518760196, + "grad_norm": 0.1674167960882187, + "learning_rate": 0.0009108457824594407, + "loss": 0.1115, + "num_input_tokens_seen": 72425792, + "step": 33560 + }, + { + "epoch": 5.475530179445351, + "grad_norm": 0.006529804784804583, + "learning_rate": 0.0009108052106072819, + "loss": 0.0571, + "num_input_tokens_seen": 72436864, + "step": 33565 + }, + { + "epoch": 5.476345840130506, + "grad_norm": 0.015645606443285942, + "learning_rate": 0.0009107646304296344, + "loss": 0.0235, + "num_input_tokens_seen": 72448096, + "step": 33570 + }, + { + "epoch": 5.477161500815661, + "grad_norm": 0.062118276953697205, + "learning_rate": 0.0009107240419273206, + "loss": 0.0358, + "num_input_tokens_seen": 72459776, + "step": 33575 + }, + { + "epoch": 5.4779771615008155, + "grad_norm": 0.21067044138908386, + "learning_rate": 0.000910683445101163, + "loss": 0.1952, + "num_input_tokens_seen": 72470528, + "step": 33580 + }, + { + "epoch": 5.47879282218597, + "grad_norm": 0.2080061137676239, + "learning_rate": 0.0009106428399519844, + "loss": 0.2912, + "num_input_tokens_seen": 72482144, + "step": 33585 + }, + { + "epoch": 5.479608482871126, + "grad_norm": 0.05376192927360535, + "learning_rate": 0.0009106022264806078, + "loss": 0.1655, + "num_input_tokens_seen": 72493504, + "step": 33590 + }, + { + "epoch": 5.480424143556281, + "grad_norm": 0.009080876596271992, + "learning_rate": 0.000910561604687856, + "loss": 0.0776, + "num_input_tokens_seen": 72503904, + "step": 33595 + }, + { + "epoch": 5.481239804241436, + "grad_norm": 0.14712311327457428, + "learning_rate": 0.0009105209745745526, + "loss": 0.2219, + "num_input_tokens_seen": 72514560, + "step": 33600 + }, + { + "epoch": 5.4820554649265905, + "grad_norm": 0.036863747984170914, + "learning_rate": 0.0009104803361415208, + "loss": 0.0842, + "num_input_tokens_seen": 72526080, + "step": 33605 + }, + { + "epoch": 5.482871125611745, + "grad_norm": 0.020012181252241135, + "learning_rate": 0.0009104396893895843, + "loss": 0.1003, + "num_input_tokens_seen": 72536992, + "step": 33610 + }, + { + "epoch": 5.4836867862969, + "grad_norm": 0.13756419718265533, + "learning_rate": 0.0009103990343195667, + "loss": 0.1437, + "num_input_tokens_seen": 72548224, + "step": 33615 + }, + { + "epoch": 5.484502446982056, + "grad_norm": 0.034922316670417786, + "learning_rate": 0.0009103583709322923, + "loss": 0.0828, + "num_input_tokens_seen": 72559744, + "step": 33620 + }, + { + "epoch": 5.485318107667211, + "grad_norm": 0.02155611850321293, + "learning_rate": 0.0009103176992285847, + "loss": 0.033, + "num_input_tokens_seen": 72570528, + "step": 33625 + }, + { + "epoch": 5.486133768352365, + "grad_norm": 0.1050281673669815, + "learning_rate": 0.0009102770192092684, + "loss": 0.1108, + "num_input_tokens_seen": 72582112, + "step": 33630 + }, + { + "epoch": 5.48694942903752, + "grad_norm": 0.033963318914175034, + "learning_rate": 0.000910236330875168, + "loss": 0.0582, + "num_input_tokens_seen": 72592736, + "step": 33635 + }, + { + "epoch": 5.487765089722675, + "grad_norm": 0.006188414990901947, + "learning_rate": 0.0009101956342271078, + "loss": 0.0405, + "num_input_tokens_seen": 72604096, + "step": 33640 + }, + { + "epoch": 5.488580750407831, + "grad_norm": 0.09458814561367035, + "learning_rate": 0.0009101549292659128, + "loss": 0.0384, + "num_input_tokens_seen": 72614560, + "step": 33645 + }, + { + "epoch": 5.489396411092986, + "grad_norm": 0.21321018040180206, + "learning_rate": 0.0009101142159924077, + "loss": 0.1924, + "num_input_tokens_seen": 72625536, + "step": 33650 + }, + { + "epoch": 5.49021207177814, + "grad_norm": 0.08025496453046799, + "learning_rate": 0.0009100734944074179, + "loss": 0.1297, + "num_input_tokens_seen": 72636384, + "step": 33655 + }, + { + "epoch": 5.491027732463295, + "grad_norm": 0.17484013736248016, + "learning_rate": 0.0009100327645117684, + "loss": 0.1778, + "num_input_tokens_seen": 72646272, + "step": 33660 + }, + { + "epoch": 5.49184339314845, + "grad_norm": 0.0037156378384679556, + "learning_rate": 0.0009099920263062848, + "loss": 0.0843, + "num_input_tokens_seen": 72657184, + "step": 33665 + }, + { + "epoch": 5.492659053833605, + "grad_norm": 0.07498479634523392, + "learning_rate": 0.0009099512797917927, + "loss": 0.1228, + "num_input_tokens_seen": 72668544, + "step": 33670 + }, + { + "epoch": 5.493474714518761, + "grad_norm": 0.010089121758937836, + "learning_rate": 0.0009099105249691179, + "loss": 0.0512, + "num_input_tokens_seen": 72679104, + "step": 33675 + }, + { + "epoch": 5.494290375203915, + "grad_norm": 0.1748151034116745, + "learning_rate": 0.0009098697618390862, + "loss": 0.2071, + "num_input_tokens_seen": 72689856, + "step": 33680 + }, + { + "epoch": 5.49510603588907, + "grad_norm": 0.1186823919415474, + "learning_rate": 0.0009098289904025239, + "loss": 0.1252, + "num_input_tokens_seen": 72701312, + "step": 33685 + }, + { + "epoch": 5.495921696574225, + "grad_norm": 0.007353127468377352, + "learning_rate": 0.0009097882106602571, + "loss": 0.0123, + "num_input_tokens_seen": 72711808, + "step": 33690 + }, + { + "epoch": 5.49673735725938, + "grad_norm": 0.07604477554559708, + "learning_rate": 0.0009097474226131124, + "loss": 0.0619, + "num_input_tokens_seen": 72722528, + "step": 33695 + }, + { + "epoch": 5.497553017944535, + "grad_norm": 0.18470115959644318, + "learning_rate": 0.0009097066262619165, + "loss": 0.1263, + "num_input_tokens_seen": 72733408, + "step": 33700 + }, + { + "epoch": 5.49836867862969, + "grad_norm": 0.16074925661087036, + "learning_rate": 0.000909665821607496, + "loss": 0.0999, + "num_input_tokens_seen": 72744544, + "step": 33705 + }, + { + "epoch": 5.499184339314845, + "grad_norm": 0.07218444347381592, + "learning_rate": 0.0009096250086506779, + "loss": 0.061, + "num_input_tokens_seen": 72755712, + "step": 33710 + }, + { + "epoch": 5.5, + "grad_norm": 0.2168692946434021, + "learning_rate": 0.0009095841873922894, + "loss": 0.1428, + "num_input_tokens_seen": 72765696, + "step": 33715 + }, + { + "epoch": 5.500815660685155, + "grad_norm": 0.051020070910453796, + "learning_rate": 0.0009095433578331576, + "loss": 0.0412, + "num_input_tokens_seen": 72776576, + "step": 33720 + }, + { + "epoch": 5.50163132137031, + "grad_norm": 0.026439936831593513, + "learning_rate": 0.0009095025199741103, + "loss": 0.022, + "num_input_tokens_seen": 72786112, + "step": 33725 + }, + { + "epoch": 5.502446982055465, + "grad_norm": 0.0030693698208779097, + "learning_rate": 0.0009094616738159748, + "loss": 0.0511, + "num_input_tokens_seen": 72796384, + "step": 33730 + }, + { + "epoch": 5.50326264274062, + "grad_norm": 0.07120449841022491, + "learning_rate": 0.000909420819359579, + "loss": 0.0669, + "num_input_tokens_seen": 72806720, + "step": 33735 + }, + { + "epoch": 5.504078303425775, + "grad_norm": 0.13288180530071259, + "learning_rate": 0.000909379956605751, + "loss": 0.0626, + "num_input_tokens_seen": 72817408, + "step": 33740 + }, + { + "epoch": 5.50489396411093, + "grad_norm": 0.05211324989795685, + "learning_rate": 0.000909339085555319, + "loss": 0.103, + "num_input_tokens_seen": 72828096, + "step": 33745 + }, + { + "epoch": 5.505709624796085, + "grad_norm": 0.034254174679517746, + "learning_rate": 0.0009092982062091109, + "loss": 0.0161, + "num_input_tokens_seen": 72839040, + "step": 33750 + }, + { + "epoch": 5.506525285481239, + "grad_norm": 0.11185917258262634, + "learning_rate": 0.0009092573185679556, + "loss": 0.0968, + "num_input_tokens_seen": 72849728, + "step": 33755 + }, + { + "epoch": 5.507340946166395, + "grad_norm": 0.028096096590161324, + "learning_rate": 0.0009092164226326814, + "loss": 0.0329, + "num_input_tokens_seen": 72859872, + "step": 33760 + }, + { + "epoch": 5.50815660685155, + "grad_norm": 0.17730002105236053, + "learning_rate": 0.0009091755184041173, + "loss": 0.3321, + "num_input_tokens_seen": 72871648, + "step": 33765 + }, + { + "epoch": 5.508972267536705, + "grad_norm": 0.06412218511104584, + "learning_rate": 0.0009091346058830923, + "loss": 0.0401, + "num_input_tokens_seen": 72883136, + "step": 33770 + }, + { + "epoch": 5.50978792822186, + "grad_norm": 0.1483617126941681, + "learning_rate": 0.0009090936850704354, + "loss": 0.0578, + "num_input_tokens_seen": 72892896, + "step": 33775 + }, + { + "epoch": 5.510603588907014, + "grad_norm": 0.005130129400640726, + "learning_rate": 0.0009090527559669761, + "loss": 0.1218, + "num_input_tokens_seen": 72904576, + "step": 33780 + }, + { + "epoch": 5.511419249592169, + "grad_norm": 0.009559473022818565, + "learning_rate": 0.0009090118185735438, + "loss": 0.0928, + "num_input_tokens_seen": 72914016, + "step": 33785 + }, + { + "epoch": 5.512234910277325, + "grad_norm": 0.041558630764484406, + "learning_rate": 0.000908970872890968, + "loss": 0.0364, + "num_input_tokens_seen": 72925152, + "step": 33790 + }, + { + "epoch": 5.51305057096248, + "grad_norm": 0.18344418704509735, + "learning_rate": 0.0009089299189200789, + "loss": 0.0465, + "num_input_tokens_seen": 72936320, + "step": 33795 + }, + { + "epoch": 5.513866231647635, + "grad_norm": 0.06462042033672333, + "learning_rate": 0.000908888956661706, + "loss": 0.0178, + "num_input_tokens_seen": 72946848, + "step": 33800 + }, + { + "epoch": 5.514681892332789, + "grad_norm": 0.015973424538969994, + "learning_rate": 0.0009088479861166797, + "loss": 0.017, + "num_input_tokens_seen": 72958208, + "step": 33805 + }, + { + "epoch": 5.515497553017944, + "grad_norm": 0.1158784031867981, + "learning_rate": 0.0009088070072858303, + "loss": 0.0357, + "num_input_tokens_seen": 72969152, + "step": 33810 + }, + { + "epoch": 5.5163132137031, + "grad_norm": 0.012037696316838264, + "learning_rate": 0.0009087660201699884, + "loss": 0.1344, + "num_input_tokens_seen": 72981664, + "step": 33815 + }, + { + "epoch": 5.517128874388255, + "grad_norm": 0.06966777890920639, + "learning_rate": 0.0009087250247699846, + "loss": 0.0278, + "num_input_tokens_seen": 72992224, + "step": 33820 + }, + { + "epoch": 5.5179445350734095, + "grad_norm": 0.01880444772541523, + "learning_rate": 0.0009086840210866493, + "loss": 0.0804, + "num_input_tokens_seen": 73003712, + "step": 33825 + }, + { + "epoch": 5.518760195758564, + "grad_norm": 0.040937092155218124, + "learning_rate": 0.0009086430091208142, + "loss": 0.2372, + "num_input_tokens_seen": 73015328, + "step": 33830 + }, + { + "epoch": 5.519575856443719, + "grad_norm": 0.1825135201215744, + "learning_rate": 0.00090860198887331, + "loss": 0.1236, + "num_input_tokens_seen": 73026304, + "step": 33835 + }, + { + "epoch": 5.520391517128875, + "grad_norm": 0.12590163946151733, + "learning_rate": 0.0009085609603449683, + "loss": 0.149, + "num_input_tokens_seen": 73038688, + "step": 33840 + }, + { + "epoch": 5.52120717781403, + "grad_norm": 0.12962009012699127, + "learning_rate": 0.0009085199235366201, + "loss": 0.0802, + "num_input_tokens_seen": 73049824, + "step": 33845 + }, + { + "epoch": 5.5220228384991845, + "grad_norm": 0.02660832367837429, + "learning_rate": 0.0009084788784490977, + "loss": 0.135, + "num_input_tokens_seen": 73061760, + "step": 33850 + }, + { + "epoch": 5.522838499184339, + "grad_norm": 0.0157585721462965, + "learning_rate": 0.0009084378250832325, + "loss": 0.0995, + "num_input_tokens_seen": 73072704, + "step": 33855 + }, + { + "epoch": 5.523654159869494, + "grad_norm": 0.17335063219070435, + "learning_rate": 0.0009083967634398567, + "loss": 0.0988, + "num_input_tokens_seen": 73084032, + "step": 33860 + }, + { + "epoch": 5.524469820554649, + "grad_norm": 0.013981183990836143, + "learning_rate": 0.0009083556935198024, + "loss": 0.0954, + "num_input_tokens_seen": 73094592, + "step": 33865 + }, + { + "epoch": 5.525285481239804, + "grad_norm": 0.03212665766477585, + "learning_rate": 0.0009083146153239019, + "loss": 0.0789, + "num_input_tokens_seen": 73106304, + "step": 33870 + }, + { + "epoch": 5.5261011419249595, + "grad_norm": 0.09235245734453201, + "learning_rate": 0.0009082735288529878, + "loss": 0.0439, + "num_input_tokens_seen": 73117088, + "step": 33875 + }, + { + "epoch": 5.526916802610114, + "grad_norm": 0.07410083711147308, + "learning_rate": 0.0009082324341078927, + "loss": 0.0361, + "num_input_tokens_seen": 73127776, + "step": 33880 + }, + { + "epoch": 5.527732463295269, + "grad_norm": 0.18009814620018005, + "learning_rate": 0.0009081913310894494, + "loss": 0.0461, + "num_input_tokens_seen": 73139904, + "step": 33885 + }, + { + "epoch": 5.528548123980424, + "grad_norm": 0.013250273652374744, + "learning_rate": 0.000908150219798491, + "loss": 0.0195, + "num_input_tokens_seen": 73150688, + "step": 33890 + }, + { + "epoch": 5.529363784665579, + "grad_norm": 0.015279031358659267, + "learning_rate": 0.0009081091002358506, + "loss": 0.0667, + "num_input_tokens_seen": 73160576, + "step": 33895 + }, + { + "epoch": 5.5301794453507345, + "grad_norm": 0.39547258615493774, + "learning_rate": 0.0009080679724023615, + "loss": 0.154, + "num_input_tokens_seen": 73170272, + "step": 33900 + }, + { + "epoch": 5.530995106035889, + "grad_norm": 0.11409834027290344, + "learning_rate": 0.0009080268362988572, + "loss": 0.2141, + "num_input_tokens_seen": 73181760, + "step": 33905 + }, + { + "epoch": 5.531810766721044, + "grad_norm": 0.004522391594946384, + "learning_rate": 0.0009079856919261716, + "loss": 0.0627, + "num_input_tokens_seen": 73192096, + "step": 33910 + }, + { + "epoch": 5.532626427406199, + "grad_norm": 0.20073451101779938, + "learning_rate": 0.0009079445392851383, + "loss": 0.096, + "num_input_tokens_seen": 73201408, + "step": 33915 + }, + { + "epoch": 5.533442088091354, + "grad_norm": 0.029250508174300194, + "learning_rate": 0.0009079033783765914, + "loss": 0.0668, + "num_input_tokens_seen": 73211936, + "step": 33920 + }, + { + "epoch": 5.5342577487765094, + "grad_norm": 0.11122894287109375, + "learning_rate": 0.0009078622092013651, + "loss": 0.0394, + "num_input_tokens_seen": 73223392, + "step": 33925 + }, + { + "epoch": 5.535073409461664, + "grad_norm": 0.1036161258816719, + "learning_rate": 0.0009078210317602938, + "loss": 0.0858, + "num_input_tokens_seen": 73233920, + "step": 33930 + }, + { + "epoch": 5.535889070146819, + "grad_norm": 0.028571486473083496, + "learning_rate": 0.0009077798460542119, + "loss": 0.0382, + "num_input_tokens_seen": 73244288, + "step": 33935 + }, + { + "epoch": 5.536704730831974, + "grad_norm": 0.15863078832626343, + "learning_rate": 0.0009077386520839541, + "loss": 0.1472, + "num_input_tokens_seen": 73256256, + "step": 33940 + }, + { + "epoch": 5.537520391517129, + "grad_norm": 0.10916135460138321, + "learning_rate": 0.0009076974498503552, + "loss": 0.2502, + "num_input_tokens_seen": 73267264, + "step": 33945 + }, + { + "epoch": 5.5383360522022835, + "grad_norm": 0.058852870017290115, + "learning_rate": 0.0009076562393542502, + "loss": 0.1144, + "num_input_tokens_seen": 73279168, + "step": 33950 + }, + { + "epoch": 5.539151712887438, + "grad_norm": 0.3027729392051697, + "learning_rate": 0.0009076150205964746, + "loss": 0.0609, + "num_input_tokens_seen": 73288128, + "step": 33955 + }, + { + "epoch": 5.539967373572594, + "grad_norm": 0.11328154057264328, + "learning_rate": 0.0009075737935778634, + "loss": 0.0899, + "num_input_tokens_seen": 73297472, + "step": 33960 + }, + { + "epoch": 5.540783034257749, + "grad_norm": 0.021017983555793762, + "learning_rate": 0.0009075325582992522, + "loss": 0.0983, + "num_input_tokens_seen": 73309504, + "step": 33965 + }, + { + "epoch": 5.541598694942904, + "grad_norm": 0.01298400480300188, + "learning_rate": 0.0009074913147614767, + "loss": 0.0804, + "num_input_tokens_seen": 73320192, + "step": 33970 + }, + { + "epoch": 5.5424143556280585, + "grad_norm": 0.011828235350549221, + "learning_rate": 0.0009074500629653728, + "loss": 0.1318, + "num_input_tokens_seen": 73331168, + "step": 33975 + }, + { + "epoch": 5.543230016313213, + "grad_norm": 0.22442537546157837, + "learning_rate": 0.0009074088029117764, + "loss": 0.0476, + "num_input_tokens_seen": 73342944, + "step": 33980 + }, + { + "epoch": 5.544045676998369, + "grad_norm": 0.004960306454449892, + "learning_rate": 0.0009073675346015239, + "loss": 0.0201, + "num_input_tokens_seen": 73353920, + "step": 33985 + }, + { + "epoch": 5.544861337683524, + "grad_norm": 0.24870948493480682, + "learning_rate": 0.0009073262580354516, + "loss": 0.172, + "num_input_tokens_seen": 73366432, + "step": 33990 + }, + { + "epoch": 5.545676998368679, + "grad_norm": 0.005947879049926996, + "learning_rate": 0.0009072849732143957, + "loss": 0.0866, + "num_input_tokens_seen": 73377824, + "step": 33995 + }, + { + "epoch": 5.5464926590538335, + "grad_norm": 0.06429392844438553, + "learning_rate": 0.0009072436801391932, + "loss": 0.0489, + "num_input_tokens_seen": 73387840, + "step": 34000 + }, + { + "epoch": 5.547308319738988, + "grad_norm": 0.01256790291517973, + "learning_rate": 0.0009072023788106811, + "loss": 0.0408, + "num_input_tokens_seen": 73398304, + "step": 34005 + }, + { + "epoch": 5.548123980424144, + "grad_norm": 0.09283246099948883, + "learning_rate": 0.0009071610692296961, + "loss": 0.1101, + "num_input_tokens_seen": 73410016, + "step": 34010 + }, + { + "epoch": 5.548939641109299, + "grad_norm": 0.14394626021385193, + "learning_rate": 0.0009071197513970755, + "loss": 0.032, + "num_input_tokens_seen": 73420960, + "step": 34015 + }, + { + "epoch": 5.549755301794454, + "grad_norm": 0.06780331581830978, + "learning_rate": 0.0009070784253136565, + "loss": 0.0331, + "num_input_tokens_seen": 73431712, + "step": 34020 + }, + { + "epoch": 5.5505709624796085, + "grad_norm": 0.21991151571273804, + "learning_rate": 0.0009070370909802772, + "loss": 0.2048, + "num_input_tokens_seen": 73441376, + "step": 34025 + }, + { + "epoch": 5.551386623164763, + "grad_norm": 0.010057724080979824, + "learning_rate": 0.0009069957483977747, + "loss": 0.0222, + "num_input_tokens_seen": 73450752, + "step": 34030 + }, + { + "epoch": 5.552202283849918, + "grad_norm": 0.1989067643880844, + "learning_rate": 0.0009069543975669869, + "loss": 0.0842, + "num_input_tokens_seen": 73461952, + "step": 34035 + }, + { + "epoch": 5.553017944535073, + "grad_norm": 0.13402990996837616, + "learning_rate": 0.0009069130384887521, + "loss": 0.057, + "num_input_tokens_seen": 73472064, + "step": 34040 + }, + { + "epoch": 5.553833605220229, + "grad_norm": 0.045034706592559814, + "learning_rate": 0.0009068716711639084, + "loss": 0.0272, + "num_input_tokens_seen": 73483200, + "step": 34045 + }, + { + "epoch": 5.554649265905383, + "grad_norm": 0.017399318516254425, + "learning_rate": 0.0009068302955932939, + "loss": 0.0099, + "num_input_tokens_seen": 73494080, + "step": 34050 + }, + { + "epoch": 5.555464926590538, + "grad_norm": 0.16534112393856049, + "learning_rate": 0.0009067889117777477, + "loss": 0.1881, + "num_input_tokens_seen": 73503616, + "step": 34055 + }, + { + "epoch": 5.556280587275693, + "grad_norm": 0.10192167013883591, + "learning_rate": 0.000906747519718108, + "loss": 0.0549, + "num_input_tokens_seen": 73515328, + "step": 34060 + }, + { + "epoch": 5.557096247960848, + "grad_norm": 0.004879343323409557, + "learning_rate": 0.0009067061194152138, + "loss": 0.1444, + "num_input_tokens_seen": 73527520, + "step": 34065 + }, + { + "epoch": 5.557911908646004, + "grad_norm": 0.028041375800967216, + "learning_rate": 0.0009066647108699041, + "loss": 0.1622, + "num_input_tokens_seen": 73538688, + "step": 34070 + }, + { + "epoch": 5.558727569331158, + "grad_norm": 0.04879889264702797, + "learning_rate": 0.0009066232940830182, + "loss": 0.0878, + "num_input_tokens_seen": 73548544, + "step": 34075 + }, + { + "epoch": 5.559543230016313, + "grad_norm": 0.425594300031662, + "learning_rate": 0.0009065818690553955, + "loss": 0.0757, + "num_input_tokens_seen": 73559712, + "step": 34080 + }, + { + "epoch": 5.560358890701468, + "grad_norm": 0.05271350219845772, + "learning_rate": 0.0009065404357878752, + "loss": 0.0231, + "num_input_tokens_seen": 73570304, + "step": 34085 + }, + { + "epoch": 5.561174551386623, + "grad_norm": 0.17369021475315094, + "learning_rate": 0.0009064989942812974, + "loss": 0.1195, + "num_input_tokens_seen": 73581056, + "step": 34090 + }, + { + "epoch": 5.561990212071779, + "grad_norm": 0.011307965032756329, + "learning_rate": 0.0009064575445365019, + "loss": 0.0357, + "num_input_tokens_seen": 73592064, + "step": 34095 + }, + { + "epoch": 5.562805872756933, + "grad_norm": 0.12680430710315704, + "learning_rate": 0.0009064160865543285, + "loss": 0.037, + "num_input_tokens_seen": 73603296, + "step": 34100 + }, + { + "epoch": 5.563621533442088, + "grad_norm": 0.08956966549158096, + "learning_rate": 0.0009063746203356176, + "loss": 0.0351, + "num_input_tokens_seen": 73612704, + "step": 34105 + }, + { + "epoch": 5.564437194127243, + "grad_norm": 0.2503039836883545, + "learning_rate": 0.0009063331458812094, + "loss": 0.1425, + "num_input_tokens_seen": 73623168, + "step": 34110 + }, + { + "epoch": 5.565252854812398, + "grad_norm": 0.0630798488855362, + "learning_rate": 0.0009062916631919445, + "loss": 0.1681, + "num_input_tokens_seen": 73633088, + "step": 34115 + }, + { + "epoch": 5.566068515497553, + "grad_norm": 0.15281018614768982, + "learning_rate": 0.0009062501722686638, + "loss": 0.0557, + "num_input_tokens_seen": 73644512, + "step": 34120 + }, + { + "epoch": 5.566884176182708, + "grad_norm": 0.05337142199277878, + "learning_rate": 0.0009062086731122079, + "loss": 0.0337, + "num_input_tokens_seen": 73655008, + "step": 34125 + }, + { + "epoch": 5.567699836867863, + "grad_norm": 0.15881197154521942, + "learning_rate": 0.0009061671657234179, + "loss": 0.0821, + "num_input_tokens_seen": 73665216, + "step": 34130 + }, + { + "epoch": 5.568515497553018, + "grad_norm": 0.03177871182560921, + "learning_rate": 0.000906125650103135, + "loss": 0.0532, + "num_input_tokens_seen": 73676384, + "step": 34135 + }, + { + "epoch": 5.569331158238173, + "grad_norm": 0.014326388016343117, + "learning_rate": 0.0009060841262522006, + "loss": 0.1033, + "num_input_tokens_seen": 73687456, + "step": 34140 + }, + { + "epoch": 5.570146818923328, + "grad_norm": 0.03469221293926239, + "learning_rate": 0.0009060425941714563, + "loss": 0.0824, + "num_input_tokens_seen": 73699424, + "step": 34145 + }, + { + "epoch": 5.5709624796084825, + "grad_norm": 0.08553054928779602, + "learning_rate": 0.0009060010538617437, + "loss": 0.0503, + "num_input_tokens_seen": 73710400, + "step": 34150 + }, + { + "epoch": 5.571778140293638, + "grad_norm": 0.1510111540555954, + "learning_rate": 0.0009059595053239047, + "loss": 0.1764, + "num_input_tokens_seen": 73721184, + "step": 34155 + }, + { + "epoch": 5.572593800978793, + "grad_norm": 0.047522176057100296, + "learning_rate": 0.0009059179485587813, + "loss": 0.1306, + "num_input_tokens_seen": 73732032, + "step": 34160 + }, + { + "epoch": 5.573409461663948, + "grad_norm": 0.14115864038467407, + "learning_rate": 0.0009058763835672157, + "loss": 0.1624, + "num_input_tokens_seen": 73742944, + "step": 34165 + }, + { + "epoch": 5.574225122349103, + "grad_norm": 0.009205864742398262, + "learning_rate": 0.0009058348103500504, + "loss": 0.045, + "num_input_tokens_seen": 73753312, + "step": 34170 + }, + { + "epoch": 5.575040783034257, + "grad_norm": 0.05270310491323471, + "learning_rate": 0.0009057932289081278, + "loss": 0.0457, + "num_input_tokens_seen": 73764480, + "step": 34175 + }, + { + "epoch": 5.575856443719413, + "grad_norm": 0.044856734573841095, + "learning_rate": 0.0009057516392422906, + "loss": 0.1248, + "num_input_tokens_seen": 73774496, + "step": 34180 + }, + { + "epoch": 5.576672104404568, + "grad_norm": 0.07143118977546692, + "learning_rate": 0.0009057100413533817, + "loss": 0.0667, + "num_input_tokens_seen": 73786688, + "step": 34185 + }, + { + "epoch": 5.577487765089723, + "grad_norm": 0.17338433861732483, + "learning_rate": 0.0009056684352422441, + "loss": 0.0845, + "num_input_tokens_seen": 73798080, + "step": 34190 + }, + { + "epoch": 5.578303425774878, + "grad_norm": 0.06395836174488068, + "learning_rate": 0.0009056268209097211, + "loss": 0.0208, + "num_input_tokens_seen": 73808736, + "step": 34195 + }, + { + "epoch": 5.579119086460032, + "grad_norm": 0.018235396593809128, + "learning_rate": 0.000905585198356656, + "loss": 0.0689, + "num_input_tokens_seen": 73819232, + "step": 34200 + }, + { + "epoch": 5.579934747145187, + "grad_norm": 0.02261078916490078, + "learning_rate": 0.0009055435675838923, + "loss": 0.0334, + "num_input_tokens_seen": 73829312, + "step": 34205 + }, + { + "epoch": 5.580750407830343, + "grad_norm": 0.04820632189512253, + "learning_rate": 0.0009055019285922737, + "loss": 0.1043, + "num_input_tokens_seen": 73840320, + "step": 34210 + }, + { + "epoch": 5.581566068515498, + "grad_norm": 0.10396668314933777, + "learning_rate": 0.0009054602813826441, + "loss": 0.0576, + "num_input_tokens_seen": 73850464, + "step": 34215 + }, + { + "epoch": 5.582381729200653, + "grad_norm": 0.022507701069116592, + "learning_rate": 0.0009054186259558477, + "loss": 0.0417, + "num_input_tokens_seen": 73861152, + "step": 34220 + }, + { + "epoch": 5.583197389885807, + "grad_norm": 0.004437147174030542, + "learning_rate": 0.0009053769623127284, + "loss": 0.0215, + "num_input_tokens_seen": 73872288, + "step": 34225 + }, + { + "epoch": 5.584013050570962, + "grad_norm": 0.038862019777297974, + "learning_rate": 0.0009053352904541306, + "loss": 0.0627, + "num_input_tokens_seen": 73882720, + "step": 34230 + }, + { + "epoch": 5.584828711256117, + "grad_norm": 0.05441214144229889, + "learning_rate": 0.0009052936103808991, + "loss": 0.1771, + "num_input_tokens_seen": 73892448, + "step": 34235 + }, + { + "epoch": 5.585644371941273, + "grad_norm": 0.17609331011772156, + "learning_rate": 0.0009052519220938784, + "loss": 0.1127, + "num_input_tokens_seen": 73904224, + "step": 34240 + }, + { + "epoch": 5.5864600326264275, + "grad_norm": 0.20584510266780853, + "learning_rate": 0.0009052102255939134, + "loss": 0.1131, + "num_input_tokens_seen": 73915424, + "step": 34245 + }, + { + "epoch": 5.587275693311582, + "grad_norm": 0.0354522280395031, + "learning_rate": 0.000905168520881849, + "loss": 0.1589, + "num_input_tokens_seen": 73925344, + "step": 34250 + }, + { + "epoch": 5.588091353996737, + "grad_norm": 0.029173769056797028, + "learning_rate": 0.0009051268079585306, + "loss": 0.0359, + "num_input_tokens_seen": 73935904, + "step": 34255 + }, + { + "epoch": 5.588907014681892, + "grad_norm": 0.0053617144003510475, + "learning_rate": 0.0009050850868248037, + "loss": 0.0106, + "num_input_tokens_seen": 73947296, + "step": 34260 + }, + { + "epoch": 5.589722675367048, + "grad_norm": 0.07877898961305618, + "learning_rate": 0.0009050433574815134, + "loss": 0.0196, + "num_input_tokens_seen": 73955680, + "step": 34265 + }, + { + "epoch": 5.5905383360522025, + "grad_norm": 0.11759869009256363, + "learning_rate": 0.0009050016199295057, + "loss": 0.0716, + "num_input_tokens_seen": 73966304, + "step": 34270 + }, + { + "epoch": 5.591353996737357, + "grad_norm": 0.0020899248775094748, + "learning_rate": 0.0009049598741696263, + "loss": 0.0413, + "num_input_tokens_seen": 73977120, + "step": 34275 + }, + { + "epoch": 5.592169657422512, + "grad_norm": 0.043699197471141815, + "learning_rate": 0.0009049181202027215, + "loss": 0.0436, + "num_input_tokens_seen": 73987648, + "step": 34280 + }, + { + "epoch": 5.592985318107667, + "grad_norm": 0.15356187522411346, + "learning_rate": 0.0009048763580296373, + "loss": 0.0708, + "num_input_tokens_seen": 73997312, + "step": 34285 + }, + { + "epoch": 5.593800978792823, + "grad_norm": 0.023971954360604286, + "learning_rate": 0.00090483458765122, + "loss": 0.1443, + "num_input_tokens_seen": 74009056, + "step": 34290 + }, + { + "epoch": 5.5946166394779775, + "grad_norm": 0.203241765499115, + "learning_rate": 0.0009047928090683162, + "loss": 0.0788, + "num_input_tokens_seen": 74019488, + "step": 34295 + }, + { + "epoch": 5.595432300163132, + "grad_norm": 0.037422485649585724, + "learning_rate": 0.0009047510222817725, + "loss": 0.0901, + "num_input_tokens_seen": 74030880, + "step": 34300 + }, + { + "epoch": 5.596247960848287, + "grad_norm": 0.09347082674503326, + "learning_rate": 0.0009047092272924361, + "loss": 0.2244, + "num_input_tokens_seen": 74042080, + "step": 34305 + }, + { + "epoch": 5.597063621533442, + "grad_norm": 0.003443291410803795, + "learning_rate": 0.0009046674241011537, + "loss": 0.0287, + "num_input_tokens_seen": 74052448, + "step": 34310 + }, + { + "epoch": 5.597879282218597, + "grad_norm": 0.11611700057983398, + "learning_rate": 0.0009046256127087727, + "loss": 0.1702, + "num_input_tokens_seen": 74063392, + "step": 34315 + }, + { + "epoch": 5.598694942903752, + "grad_norm": 0.21308395266532898, + "learning_rate": 0.0009045837931161402, + "loss": 0.1896, + "num_input_tokens_seen": 74074720, + "step": 34320 + }, + { + "epoch": 5.599510603588907, + "grad_norm": 0.057574931532144547, + "learning_rate": 0.0009045419653241038, + "loss": 0.1646, + "num_input_tokens_seen": 74084512, + "step": 34325 + }, + { + "epoch": 5.600326264274062, + "grad_norm": 0.015414676629006863, + "learning_rate": 0.0009045001293335115, + "loss": 0.0508, + "num_input_tokens_seen": 74095520, + "step": 34330 + }, + { + "epoch": 5.601141924959217, + "grad_norm": 0.0826614499092102, + "learning_rate": 0.0009044582851452107, + "loss": 0.1069, + "num_input_tokens_seen": 74105792, + "step": 34335 + }, + { + "epoch": 5.601957585644372, + "grad_norm": 0.036117956042289734, + "learning_rate": 0.0009044164327600499, + "loss": 0.0385, + "num_input_tokens_seen": 74116224, + "step": 34340 + }, + { + "epoch": 5.602773246329527, + "grad_norm": 0.11280613392591476, + "learning_rate": 0.000904374572178877, + "loss": 0.0553, + "num_input_tokens_seen": 74127232, + "step": 34345 + }, + { + "epoch": 5.603588907014682, + "grad_norm": 0.14315198361873627, + "learning_rate": 0.0009043327034025404, + "loss": 0.0608, + "num_input_tokens_seen": 74137088, + "step": 34350 + }, + { + "epoch": 5.604404567699837, + "grad_norm": 0.004624533466994762, + "learning_rate": 0.0009042908264318885, + "loss": 0.0878, + "num_input_tokens_seen": 74148128, + "step": 34355 + }, + { + "epoch": 5.605220228384992, + "grad_norm": 0.02308063954114914, + "learning_rate": 0.0009042489412677702, + "loss": 0.0482, + "num_input_tokens_seen": 74158432, + "step": 34360 + }, + { + "epoch": 5.606035889070147, + "grad_norm": 0.20183399319648743, + "learning_rate": 0.0009042070479110343, + "loss": 0.1668, + "num_input_tokens_seen": 74169376, + "step": 34365 + }, + { + "epoch": 5.6068515497553015, + "grad_norm": 0.1965470016002655, + "learning_rate": 0.0009041651463625298, + "loss": 0.078, + "num_input_tokens_seen": 74180352, + "step": 34370 + }, + { + "epoch": 5.607667210440457, + "grad_norm": 0.009615951217710972, + "learning_rate": 0.0009041232366231059, + "loss": 0.0777, + "num_input_tokens_seen": 74190304, + "step": 34375 + }, + { + "epoch": 5.608482871125612, + "grad_norm": 0.22422800958156586, + "learning_rate": 0.0009040813186936119, + "loss": 0.1702, + "num_input_tokens_seen": 74201504, + "step": 34380 + }, + { + "epoch": 5.609298531810767, + "grad_norm": 0.12859049439430237, + "learning_rate": 0.0009040393925748973, + "loss": 0.1949, + "num_input_tokens_seen": 74211968, + "step": 34385 + }, + { + "epoch": 5.610114192495922, + "grad_norm": 0.005432716105133295, + "learning_rate": 0.0009039974582678121, + "loss": 0.1166, + "num_input_tokens_seen": 74223232, + "step": 34390 + }, + { + "epoch": 5.6109298531810765, + "grad_norm": 0.2040768712759018, + "learning_rate": 0.0009039555157732056, + "loss": 0.1045, + "num_input_tokens_seen": 74234272, + "step": 34395 + }, + { + "epoch": 5.611745513866231, + "grad_norm": 0.009281203150749207, + "learning_rate": 0.0009039135650919283, + "loss": 0.0595, + "num_input_tokens_seen": 74244608, + "step": 34400 + }, + { + "epoch": 5.612561174551386, + "grad_norm": 0.13613788783550262, + "learning_rate": 0.0009038716062248302, + "loss": 0.1104, + "num_input_tokens_seen": 74255104, + "step": 34405 + }, + { + "epoch": 5.613376835236542, + "grad_norm": 0.027781063690781593, + "learning_rate": 0.0009038296391727616, + "loss": 0.0651, + "num_input_tokens_seen": 74265792, + "step": 34410 + }, + { + "epoch": 5.614192495921697, + "grad_norm": 0.13448427617549896, + "learning_rate": 0.0009037876639365731, + "loss": 0.2309, + "num_input_tokens_seen": 74277120, + "step": 34415 + }, + { + "epoch": 5.6150081566068515, + "grad_norm": 0.02441992796957493, + "learning_rate": 0.0009037456805171154, + "loss": 0.0962, + "num_input_tokens_seen": 74286848, + "step": 34420 + }, + { + "epoch": 5.615823817292006, + "grad_norm": 0.10284209996461868, + "learning_rate": 0.0009037036889152391, + "loss": 0.0489, + "num_input_tokens_seen": 74296992, + "step": 34425 + }, + { + "epoch": 5.616639477977161, + "grad_norm": 0.014048759825527668, + "learning_rate": 0.0009036616891317956, + "loss": 0.0409, + "num_input_tokens_seen": 74307680, + "step": 34430 + }, + { + "epoch": 5.617455138662317, + "grad_norm": 0.17982347309589386, + "learning_rate": 0.0009036196811676358, + "loss": 0.125, + "num_input_tokens_seen": 74318368, + "step": 34435 + }, + { + "epoch": 5.618270799347472, + "grad_norm": 0.024998106062412262, + "learning_rate": 0.0009035776650236112, + "loss": 0.0833, + "num_input_tokens_seen": 74328544, + "step": 34440 + }, + { + "epoch": 5.6190864600326265, + "grad_norm": 0.05525532737374306, + "learning_rate": 0.0009035356407005732, + "loss": 0.0804, + "num_input_tokens_seen": 74339904, + "step": 34445 + }, + { + "epoch": 5.619902120717781, + "grad_norm": 0.06843063980340958, + "learning_rate": 0.0009034936081993736, + "loss": 0.0521, + "num_input_tokens_seen": 74350784, + "step": 34450 + }, + { + "epoch": 5.620717781402936, + "grad_norm": 0.006902523338794708, + "learning_rate": 0.0009034515675208641, + "loss": 0.1367, + "num_input_tokens_seen": 74361408, + "step": 34455 + }, + { + "epoch": 5.621533442088092, + "grad_norm": 0.027350980788469315, + "learning_rate": 0.0009034095186658966, + "loss": 0.1069, + "num_input_tokens_seen": 74371904, + "step": 34460 + }, + { + "epoch": 5.622349102773247, + "grad_norm": 0.02010924369096756, + "learning_rate": 0.0009033674616353236, + "loss": 0.2519, + "num_input_tokens_seen": 74383232, + "step": 34465 + }, + { + "epoch": 5.623164763458401, + "grad_norm": 0.10117822140455246, + "learning_rate": 0.0009033253964299972, + "loss": 0.1314, + "num_input_tokens_seen": 74394240, + "step": 34470 + }, + { + "epoch": 5.623980424143556, + "grad_norm": 0.007149694953113794, + "learning_rate": 0.0009032833230507702, + "loss": 0.0411, + "num_input_tokens_seen": 74404160, + "step": 34475 + }, + { + "epoch": 5.624796084828711, + "grad_norm": 0.1487242430448532, + "learning_rate": 0.000903241241498495, + "loss": 0.1833, + "num_input_tokens_seen": 74415904, + "step": 34480 + }, + { + "epoch": 5.625611745513866, + "grad_norm": 0.028028441593050957, + "learning_rate": 0.0009031991517740244, + "loss": 0.0718, + "num_input_tokens_seen": 74426336, + "step": 34485 + }, + { + "epoch": 5.626427406199021, + "grad_norm": 0.07804684340953827, + "learning_rate": 0.0009031570538782115, + "loss": 0.1273, + "num_input_tokens_seen": 74434592, + "step": 34490 + }, + { + "epoch": 5.627243066884176, + "grad_norm": 0.009090384468436241, + "learning_rate": 0.0009031149478119094, + "loss": 0.0628, + "num_input_tokens_seen": 74446464, + "step": 34495 + }, + { + "epoch": 5.628058727569331, + "grad_norm": 0.016845108941197395, + "learning_rate": 0.0009030728335759716, + "loss": 0.12, + "num_input_tokens_seen": 74457056, + "step": 34500 + }, + { + "epoch": 5.628874388254486, + "grad_norm": 0.12272095680236816, + "learning_rate": 0.0009030307111712514, + "loss": 0.0387, + "num_input_tokens_seen": 74467584, + "step": 34505 + }, + { + "epoch": 5.629690048939641, + "grad_norm": 0.017980115488171577, + "learning_rate": 0.0009029885805986027, + "loss": 0.0873, + "num_input_tokens_seen": 74477376, + "step": 34510 + }, + { + "epoch": 5.630505709624796, + "grad_norm": 0.06704236567020416, + "learning_rate": 0.0009029464418588791, + "loss": 0.1431, + "num_input_tokens_seen": 74487648, + "step": 34515 + }, + { + "epoch": 5.631321370309951, + "grad_norm": 0.16162727773189545, + "learning_rate": 0.0009029042949529347, + "loss": 0.0832, + "num_input_tokens_seen": 74498496, + "step": 34520 + }, + { + "epoch": 5.632137030995106, + "grad_norm": 0.16769364476203918, + "learning_rate": 0.0009028621398816236, + "loss": 0.1365, + "num_input_tokens_seen": 74509664, + "step": 34525 + }, + { + "epoch": 5.632952691680261, + "grad_norm": 0.0917636901140213, + "learning_rate": 0.0009028199766458002, + "loss": 0.1055, + "num_input_tokens_seen": 74519808, + "step": 34530 + }, + { + "epoch": 5.633768352365416, + "grad_norm": 0.05809737369418144, + "learning_rate": 0.000902777805246319, + "loss": 0.0803, + "num_input_tokens_seen": 74531840, + "step": 34535 + }, + { + "epoch": 5.634584013050571, + "grad_norm": 0.05872153490781784, + "learning_rate": 0.0009027356256840345, + "loss": 0.1061, + "num_input_tokens_seen": 74543168, + "step": 34540 + }, + { + "epoch": 5.635399673735726, + "grad_norm": 0.012934837490320206, + "learning_rate": 0.0009026934379598018, + "loss": 0.0199, + "num_input_tokens_seen": 74553792, + "step": 34545 + }, + { + "epoch": 5.636215334420881, + "grad_norm": 0.02263866364955902, + "learning_rate": 0.0009026512420744756, + "loss": 0.1242, + "num_input_tokens_seen": 74565312, + "step": 34550 + }, + { + "epoch": 5.637030995106036, + "grad_norm": 0.016767023131251335, + "learning_rate": 0.0009026090380289111, + "loss": 0.2414, + "num_input_tokens_seen": 74576256, + "step": 34555 + }, + { + "epoch": 5.637846655791191, + "grad_norm": 0.008216843008995056, + "learning_rate": 0.0009025668258239638, + "loss": 0.1096, + "num_input_tokens_seen": 74586560, + "step": 34560 + }, + { + "epoch": 5.638662316476346, + "grad_norm": 0.12629282474517822, + "learning_rate": 0.0009025246054604892, + "loss": 0.0408, + "num_input_tokens_seen": 74597792, + "step": 34565 + }, + { + "epoch": 5.6394779771615005, + "grad_norm": 0.25715264678001404, + "learning_rate": 0.0009024823769393427, + "loss": 0.2622, + "num_input_tokens_seen": 74609504, + "step": 34570 + }, + { + "epoch": 5.640293637846656, + "grad_norm": 0.00492464005947113, + "learning_rate": 0.0009024401402613803, + "loss": 0.142, + "num_input_tokens_seen": 74620160, + "step": 34575 + }, + { + "epoch": 5.641109298531811, + "grad_norm": 0.22137777507305145, + "learning_rate": 0.0009023978954274579, + "loss": 0.1374, + "num_input_tokens_seen": 74631072, + "step": 34580 + }, + { + "epoch": 5.641924959216966, + "grad_norm": 0.014612067490816116, + "learning_rate": 0.0009023556424384317, + "loss": 0.0322, + "num_input_tokens_seen": 74641088, + "step": 34585 + }, + { + "epoch": 5.642740619902121, + "grad_norm": 0.04097198694944382, + "learning_rate": 0.0009023133812951581, + "loss": 0.0796, + "num_input_tokens_seen": 74650560, + "step": 34590 + }, + { + "epoch": 5.643556280587275, + "grad_norm": 0.018777618184685707, + "learning_rate": 0.0009022711119984932, + "loss": 0.0652, + "num_input_tokens_seen": 74660608, + "step": 34595 + }, + { + "epoch": 5.64437194127243, + "grad_norm": 0.026519503444433212, + "learning_rate": 0.0009022288345492941, + "loss": 0.0597, + "num_input_tokens_seen": 74672064, + "step": 34600 + }, + { + "epoch": 5.645187601957586, + "grad_norm": 0.112155482172966, + "learning_rate": 0.0009021865489484173, + "loss": 0.1015, + "num_input_tokens_seen": 74683200, + "step": 34605 + }, + { + "epoch": 5.646003262642741, + "grad_norm": 0.0129517437890172, + "learning_rate": 0.0009021442551967198, + "loss": 0.0957, + "num_input_tokens_seen": 74694016, + "step": 34610 + }, + { + "epoch": 5.646818923327896, + "grad_norm": 0.12256155908107758, + "learning_rate": 0.000902101953295059, + "loss": 0.2878, + "num_input_tokens_seen": 74704096, + "step": 34615 + }, + { + "epoch": 5.64763458401305, + "grad_norm": 0.029381511732935905, + "learning_rate": 0.0009020596432442918, + "loss": 0.0791, + "num_input_tokens_seen": 74714528, + "step": 34620 + }, + { + "epoch": 5.648450244698205, + "grad_norm": 0.012893673963844776, + "learning_rate": 0.0009020173250452761, + "loss": 0.2665, + "num_input_tokens_seen": 74725888, + "step": 34625 + }, + { + "epoch": 5.649265905383361, + "grad_norm": 0.20652040839195251, + "learning_rate": 0.0009019749986988692, + "loss": 0.0604, + "num_input_tokens_seen": 74736384, + "step": 34630 + }, + { + "epoch": 5.650081566068516, + "grad_norm": 0.03892023488879204, + "learning_rate": 0.000901932664205929, + "loss": 0.0659, + "num_input_tokens_seen": 74748064, + "step": 34635 + }, + { + "epoch": 5.650897226753671, + "grad_norm": 0.006782356649637222, + "learning_rate": 0.0009018903215673135, + "loss": 0.0773, + "num_input_tokens_seen": 74758720, + "step": 34640 + }, + { + "epoch": 5.651712887438825, + "grad_norm": 0.03663446754217148, + "learning_rate": 0.0009018479707838808, + "loss": 0.0344, + "num_input_tokens_seen": 74770208, + "step": 34645 + }, + { + "epoch": 5.65252854812398, + "grad_norm": 0.029901130124926567, + "learning_rate": 0.0009018056118564893, + "loss": 0.0482, + "num_input_tokens_seen": 74781568, + "step": 34650 + }, + { + "epoch": 5.653344208809135, + "grad_norm": 0.007429866585880518, + "learning_rate": 0.0009017632447859971, + "loss": 0.1499, + "num_input_tokens_seen": 74792608, + "step": 34655 + }, + { + "epoch": 5.654159869494291, + "grad_norm": 0.06287893652915955, + "learning_rate": 0.0009017208695732633, + "loss": 0.0315, + "num_input_tokens_seen": 74801760, + "step": 34660 + }, + { + "epoch": 5.6549755301794455, + "grad_norm": 0.04932676628232002, + "learning_rate": 0.0009016784862191463, + "loss": 0.0653, + "num_input_tokens_seen": 74812256, + "step": 34665 + }, + { + "epoch": 5.6557911908646, + "grad_norm": 0.006923086941242218, + "learning_rate": 0.0009016360947245053, + "loss": 0.0283, + "num_input_tokens_seen": 74822848, + "step": 34670 + }, + { + "epoch": 5.656606851549755, + "grad_norm": 0.010492617264389992, + "learning_rate": 0.0009015936950901993, + "loss": 0.1065, + "num_input_tokens_seen": 74832992, + "step": 34675 + }, + { + "epoch": 5.65742251223491, + "grad_norm": 0.00584011385217309, + "learning_rate": 0.0009015512873170877, + "loss": 0.0198, + "num_input_tokens_seen": 74843136, + "step": 34680 + }, + { + "epoch": 5.658238172920065, + "grad_norm": 0.006381630897521973, + "learning_rate": 0.0009015088714060297, + "loss": 0.0361, + "num_input_tokens_seen": 74852800, + "step": 34685 + }, + { + "epoch": 5.6590538336052205, + "grad_norm": 0.1548745334148407, + "learning_rate": 0.0009014664473578851, + "loss": 0.0512, + "num_input_tokens_seen": 74864512, + "step": 34690 + }, + { + "epoch": 5.659869494290375, + "grad_norm": 0.01961491070687771, + "learning_rate": 0.0009014240151735138, + "loss": 0.0503, + "num_input_tokens_seen": 74875936, + "step": 34695 + }, + { + "epoch": 5.66068515497553, + "grad_norm": 0.008084258064627647, + "learning_rate": 0.0009013815748537755, + "loss": 0.0916, + "num_input_tokens_seen": 74885504, + "step": 34700 + }, + { + "epoch": 5.661500815660685, + "grad_norm": 0.3146737813949585, + "learning_rate": 0.0009013391263995303, + "loss": 0.0898, + "num_input_tokens_seen": 74897632, + "step": 34705 + }, + { + "epoch": 5.66231647634584, + "grad_norm": 0.30916258692741394, + "learning_rate": 0.0009012966698116387, + "loss": 0.0755, + "num_input_tokens_seen": 74908672, + "step": 34710 + }, + { + "epoch": 5.6631321370309955, + "grad_norm": 0.21931174397468567, + "learning_rate": 0.0009012542050909609, + "loss": 0.1567, + "num_input_tokens_seen": 74919104, + "step": 34715 + }, + { + "epoch": 5.66394779771615, + "grad_norm": 0.06572141498327255, + "learning_rate": 0.0009012117322383577, + "loss": 0.0299, + "num_input_tokens_seen": 74929952, + "step": 34720 + }, + { + "epoch": 5.664763458401305, + "grad_norm": 0.04195026680827141, + "learning_rate": 0.0009011692512546897, + "loss": 0.0539, + "num_input_tokens_seen": 74941760, + "step": 34725 + }, + { + "epoch": 5.66557911908646, + "grad_norm": 0.05966934189200401, + "learning_rate": 0.0009011267621408179, + "loss": 0.115, + "num_input_tokens_seen": 74952000, + "step": 34730 + }, + { + "epoch": 5.666394779771615, + "grad_norm": 0.039518121629953384, + "learning_rate": 0.0009010842648976034, + "loss": 0.0199, + "num_input_tokens_seen": 74963008, + "step": 34735 + }, + { + "epoch": 5.6672104404567705, + "grad_norm": 0.013944373466074467, + "learning_rate": 0.0009010417595259077, + "loss": 0.0142, + "num_input_tokens_seen": 74973408, + "step": 34740 + }, + { + "epoch": 5.668026101141925, + "grad_norm": 0.021885816007852554, + "learning_rate": 0.0009009992460265917, + "loss": 0.3203, + "num_input_tokens_seen": 74983744, + "step": 34745 + }, + { + "epoch": 5.66884176182708, + "grad_norm": 0.014975865371525288, + "learning_rate": 0.0009009567244005174, + "loss": 0.122, + "num_input_tokens_seen": 74993952, + "step": 34750 + }, + { + "epoch": 5.669657422512235, + "grad_norm": 0.016420740634202957, + "learning_rate": 0.0009009141946485464, + "loss": 0.061, + "num_input_tokens_seen": 75004128, + "step": 34755 + }, + { + "epoch": 5.67047308319739, + "grad_norm": 0.23973079025745392, + "learning_rate": 0.0009008716567715406, + "loss": 0.1029, + "num_input_tokens_seen": 75014496, + "step": 34760 + }, + { + "epoch": 5.671288743882545, + "grad_norm": 0.07952632755041122, + "learning_rate": 0.0009008291107703621, + "loss": 0.2125, + "num_input_tokens_seen": 75024704, + "step": 34765 + }, + { + "epoch": 5.672104404567699, + "grad_norm": 0.020377283915877342, + "learning_rate": 0.0009007865566458733, + "loss": 0.0618, + "num_input_tokens_seen": 75035488, + "step": 34770 + }, + { + "epoch": 5.672920065252855, + "grad_norm": 0.010059847496449947, + "learning_rate": 0.0009007439943989364, + "loss": 0.0597, + "num_input_tokens_seen": 75048192, + "step": 34775 + }, + { + "epoch": 5.67373572593801, + "grad_norm": 0.017479633912444115, + "learning_rate": 0.0009007014240304143, + "loss": 0.1204, + "num_input_tokens_seen": 75059680, + "step": 34780 + }, + { + "epoch": 5.674551386623165, + "grad_norm": 0.2790597975254059, + "learning_rate": 0.0009006588455411692, + "loss": 0.1056, + "num_input_tokens_seen": 75069696, + "step": 34785 + }, + { + "epoch": 5.6753670473083195, + "grad_norm": 0.06628762930631638, + "learning_rate": 0.0009006162589320645, + "loss": 0.2056, + "num_input_tokens_seen": 75080512, + "step": 34790 + }, + { + "epoch": 5.676182707993474, + "grad_norm": 0.07514029741287231, + "learning_rate": 0.000900573664203963, + "loss": 0.0856, + "num_input_tokens_seen": 75092000, + "step": 34795 + }, + { + "epoch": 5.67699836867863, + "grad_norm": 0.006626702845096588, + "learning_rate": 0.0009005310613577282, + "loss": 0.0974, + "num_input_tokens_seen": 75102208, + "step": 34800 + }, + { + "epoch": 5.677814029363785, + "grad_norm": 0.07639842480421066, + "learning_rate": 0.0009004884503942232, + "loss": 0.0975, + "num_input_tokens_seen": 75112960, + "step": 34805 + }, + { + "epoch": 5.67862969004894, + "grad_norm": 0.05011019483208656, + "learning_rate": 0.0009004458313143118, + "loss": 0.0747, + "num_input_tokens_seen": 75125824, + "step": 34810 + }, + { + "epoch": 5.6794453507340945, + "grad_norm": 0.0052652182057499886, + "learning_rate": 0.0009004032041188575, + "loss": 0.0667, + "num_input_tokens_seen": 75136896, + "step": 34815 + }, + { + "epoch": 5.680261011419249, + "grad_norm": 0.08199330419301987, + "learning_rate": 0.0009003605688087244, + "loss": 0.0464, + "num_input_tokens_seen": 75146528, + "step": 34820 + }, + { + "epoch": 5.681076672104405, + "grad_norm": 0.05263037234544754, + "learning_rate": 0.0009003179253847764, + "loss": 0.1138, + "num_input_tokens_seen": 75157184, + "step": 34825 + }, + { + "epoch": 5.68189233278956, + "grad_norm": 0.16348080337047577, + "learning_rate": 0.0009002752738478779, + "loss": 0.0686, + "num_input_tokens_seen": 75166016, + "step": 34830 + }, + { + "epoch": 5.682707993474715, + "grad_norm": 0.019095242023468018, + "learning_rate": 0.000900232614198893, + "loss": 0.1783, + "num_input_tokens_seen": 75177824, + "step": 34835 + }, + { + "epoch": 5.6835236541598695, + "grad_norm": 0.03749559074640274, + "learning_rate": 0.0009001899464386867, + "loss": 0.1641, + "num_input_tokens_seen": 75188128, + "step": 34840 + }, + { + "epoch": 5.684339314845024, + "grad_norm": 0.024258477613329887, + "learning_rate": 0.0009001472705681233, + "loss": 0.1604, + "num_input_tokens_seen": 75198016, + "step": 34845 + }, + { + "epoch": 5.685154975530179, + "grad_norm": 0.006701468490064144, + "learning_rate": 0.0009001045865880679, + "loss": 0.0419, + "num_input_tokens_seen": 75208768, + "step": 34850 + }, + { + "epoch": 5.685970636215334, + "grad_norm": 0.009347554296255112, + "learning_rate": 0.0009000618944993854, + "loss": 0.0636, + "num_input_tokens_seen": 75220064, + "step": 34855 + }, + { + "epoch": 5.68678629690049, + "grad_norm": 0.08968329429626465, + "learning_rate": 0.0009000191943029412, + "loss": 0.0551, + "num_input_tokens_seen": 75231552, + "step": 34860 + }, + { + "epoch": 5.6876019575856445, + "grad_norm": 0.019691836088895798, + "learning_rate": 0.0008999764859996005, + "loss": 0.0447, + "num_input_tokens_seen": 75242080, + "step": 34865 + }, + { + "epoch": 5.688417618270799, + "grad_norm": 0.00395465362817049, + "learning_rate": 0.000899933769590229, + "loss": 0.0683, + "num_input_tokens_seen": 75253312, + "step": 34870 + }, + { + "epoch": 5.689233278955954, + "grad_norm": 0.026480449363589287, + "learning_rate": 0.0008998910450756923, + "loss": 0.054, + "num_input_tokens_seen": 75265376, + "step": 34875 + }, + { + "epoch": 5.690048939641109, + "grad_norm": 0.14563219249248505, + "learning_rate": 0.0008998483124568561, + "loss": 0.2387, + "num_input_tokens_seen": 75275872, + "step": 34880 + }, + { + "epoch": 5.690864600326265, + "grad_norm": 0.020451823249459267, + "learning_rate": 0.0008998055717345868, + "loss": 0.0305, + "num_input_tokens_seen": 75288096, + "step": 34885 + }, + { + "epoch": 5.691680261011419, + "grad_norm": 0.13150890171527863, + "learning_rate": 0.0008997628229097503, + "loss": 0.086, + "num_input_tokens_seen": 75298112, + "step": 34890 + }, + { + "epoch": 5.692495921696574, + "grad_norm": 0.06536690890789032, + "learning_rate": 0.0008997200659832129, + "loss": 0.171, + "num_input_tokens_seen": 75308992, + "step": 34895 + }, + { + "epoch": 5.693311582381729, + "grad_norm": 0.03447509557008743, + "learning_rate": 0.0008996773009558416, + "loss": 0.1451, + "num_input_tokens_seen": 75320352, + "step": 34900 + }, + { + "epoch": 5.694127243066884, + "grad_norm": 0.008473111316561699, + "learning_rate": 0.0008996345278285027, + "loss": 0.0198, + "num_input_tokens_seen": 75333056, + "step": 34905 + }, + { + "epoch": 5.69494290375204, + "grad_norm": 0.043338000774383545, + "learning_rate": 0.000899591746602063, + "loss": 0.0885, + "num_input_tokens_seen": 75345248, + "step": 34910 + }, + { + "epoch": 5.695758564437194, + "grad_norm": 0.1663362830877304, + "learning_rate": 0.0008995489572773896, + "loss": 0.1132, + "num_input_tokens_seen": 75356576, + "step": 34915 + }, + { + "epoch": 5.696574225122349, + "grad_norm": 0.032339468598365784, + "learning_rate": 0.0008995061598553499, + "loss": 0.0265, + "num_input_tokens_seen": 75368000, + "step": 34920 + }, + { + "epoch": 5.697389885807504, + "grad_norm": 0.017191503196954727, + "learning_rate": 0.000899463354336811, + "loss": 0.0494, + "num_input_tokens_seen": 75378208, + "step": 34925 + }, + { + "epoch": 5.698205546492659, + "grad_norm": 0.06986400485038757, + "learning_rate": 0.0008994205407226403, + "loss": 0.0713, + "num_input_tokens_seen": 75390112, + "step": 34930 + }, + { + "epoch": 5.699021207177814, + "grad_norm": 0.016000935807824135, + "learning_rate": 0.0008993777190137058, + "loss": 0.0745, + "num_input_tokens_seen": 75401824, + "step": 34935 + }, + { + "epoch": 5.699836867862969, + "grad_norm": 0.1325785517692566, + "learning_rate": 0.0008993348892108753, + "loss": 0.1406, + "num_input_tokens_seen": 75412384, + "step": 34940 + }, + { + "epoch": 5.700652528548124, + "grad_norm": 0.08483979851007462, + "learning_rate": 0.0008992920513150165, + "loss": 0.018, + "num_input_tokens_seen": 75423648, + "step": 34945 + }, + { + "epoch": 5.701468189233279, + "grad_norm": 0.18795321881771088, + "learning_rate": 0.0008992492053269976, + "loss": 0.1735, + "num_input_tokens_seen": 75435232, + "step": 34950 + }, + { + "epoch": 5.702283849918434, + "grad_norm": 0.07199457287788391, + "learning_rate": 0.0008992063512476873, + "loss": 0.0492, + "num_input_tokens_seen": 75445952, + "step": 34955 + }, + { + "epoch": 5.703099510603589, + "grad_norm": 0.016024762764573097, + "learning_rate": 0.0008991634890779538, + "loss": 0.021, + "num_input_tokens_seen": 75457696, + "step": 34960 + }, + { + "epoch": 5.7039151712887435, + "grad_norm": 0.07800979167222977, + "learning_rate": 0.0008991206188186658, + "loss": 0.1049, + "num_input_tokens_seen": 75467424, + "step": 34965 + }, + { + "epoch": 5.704730831973899, + "grad_norm": 0.009901685640215874, + "learning_rate": 0.0008990777404706922, + "loss": 0.1267, + "num_input_tokens_seen": 75476928, + "step": 34970 + }, + { + "epoch": 5.705546492659054, + "grad_norm": 0.07788558304309845, + "learning_rate": 0.0008990348540349019, + "loss": 0.039, + "num_input_tokens_seen": 75486624, + "step": 34975 + }, + { + "epoch": 5.706362153344209, + "grad_norm": 0.0043368833139538765, + "learning_rate": 0.0008989919595121641, + "loss": 0.0632, + "num_input_tokens_seen": 75497888, + "step": 34980 + }, + { + "epoch": 5.707177814029364, + "grad_norm": 0.0093643544241786, + "learning_rate": 0.000898949056903348, + "loss": 0.0804, + "num_input_tokens_seen": 75508352, + "step": 34985 + }, + { + "epoch": 5.7079934747145185, + "grad_norm": 0.00530201755464077, + "learning_rate": 0.0008989061462093233, + "loss": 0.0611, + "num_input_tokens_seen": 75520416, + "step": 34990 + }, + { + "epoch": 5.708809135399674, + "grad_norm": 0.023103397339582443, + "learning_rate": 0.0008988632274309593, + "loss": 0.1554, + "num_input_tokens_seen": 75531168, + "step": 34995 + }, + { + "epoch": 5.709624796084829, + "grad_norm": 0.0034944461658596992, + "learning_rate": 0.0008988203005691262, + "loss": 0.0325, + "num_input_tokens_seen": 75541632, + "step": 35000 + }, + { + "epoch": 5.710440456769984, + "grad_norm": 0.05038722977042198, + "learning_rate": 0.0008987773656246936, + "loss": 0.0874, + "num_input_tokens_seen": 75552224, + "step": 35005 + }, + { + "epoch": 5.711256117455139, + "grad_norm": 0.010907181538641453, + "learning_rate": 0.0008987344225985319, + "loss": 0.0159, + "num_input_tokens_seen": 75562080, + "step": 35010 + }, + { + "epoch": 5.712071778140293, + "grad_norm": 0.054860420525074005, + "learning_rate": 0.0008986914714915112, + "loss": 0.1161, + "num_input_tokens_seen": 75571616, + "step": 35015 + }, + { + "epoch": 5.712887438825448, + "grad_norm": 0.00817075651139021, + "learning_rate": 0.000898648512304502, + "loss": 0.0379, + "num_input_tokens_seen": 75581952, + "step": 35020 + }, + { + "epoch": 5.713703099510604, + "grad_norm": 0.002832885831594467, + "learning_rate": 0.0008986055450383752, + "loss": 0.1039, + "num_input_tokens_seen": 75592448, + "step": 35025 + }, + { + "epoch": 5.714518760195759, + "grad_norm": 0.16031870245933533, + "learning_rate": 0.0008985625696940013, + "loss": 0.0719, + "num_input_tokens_seen": 75603872, + "step": 35030 + }, + { + "epoch": 5.715334420880914, + "grad_norm": 0.007768367417156696, + "learning_rate": 0.0008985195862722513, + "loss": 0.0553, + "num_input_tokens_seen": 75614752, + "step": 35035 + }, + { + "epoch": 5.716150081566068, + "grad_norm": 0.008205034770071507, + "learning_rate": 0.0008984765947739964, + "loss": 0.0188, + "num_input_tokens_seen": 75625600, + "step": 35040 + }, + { + "epoch": 5.716965742251223, + "grad_norm": 0.1952839493751526, + "learning_rate": 0.0008984335952001075, + "loss": 0.1975, + "num_input_tokens_seen": 75635648, + "step": 35045 + }, + { + "epoch": 5.717781402936378, + "grad_norm": 0.2406097799539566, + "learning_rate": 0.0008983905875514566, + "loss": 0.1734, + "num_input_tokens_seen": 75647680, + "step": 35050 + }, + { + "epoch": 5.718597063621534, + "grad_norm": 0.03485770896077156, + "learning_rate": 0.000898347571828915, + "loss": 0.0329, + "num_input_tokens_seen": 75658080, + "step": 35055 + }, + { + "epoch": 5.719412724306689, + "grad_norm": 0.01681782677769661, + "learning_rate": 0.0008983045480333545, + "loss": 0.1246, + "num_input_tokens_seen": 75669920, + "step": 35060 + }, + { + "epoch": 5.720228384991843, + "grad_norm": 0.002933309879153967, + "learning_rate": 0.0008982615161656471, + "loss": 0.0358, + "num_input_tokens_seen": 75681792, + "step": 35065 + }, + { + "epoch": 5.721044045676998, + "grad_norm": 0.004071448929607868, + "learning_rate": 0.0008982184762266648, + "loss": 0.0178, + "num_input_tokens_seen": 75692992, + "step": 35070 + }, + { + "epoch": 5.721859706362153, + "grad_norm": 0.03370146453380585, + "learning_rate": 0.00089817542821728, + "loss": 0.0338, + "num_input_tokens_seen": 75704608, + "step": 35075 + }, + { + "epoch": 5.722675367047309, + "grad_norm": 0.022621802985668182, + "learning_rate": 0.0008981323721383649, + "loss": 0.0479, + "num_input_tokens_seen": 75715328, + "step": 35080 + }, + { + "epoch": 5.7234910277324635, + "grad_norm": 0.21395248174667358, + "learning_rate": 0.0008980893079907922, + "loss": 0.1287, + "num_input_tokens_seen": 75725376, + "step": 35085 + }, + { + "epoch": 5.724306688417618, + "grad_norm": 0.0016723985318094492, + "learning_rate": 0.0008980462357754347, + "loss": 0.0796, + "num_input_tokens_seen": 75736512, + "step": 35090 + }, + { + "epoch": 5.725122349102773, + "grad_norm": 0.013339928351342678, + "learning_rate": 0.0008980031554931654, + "loss": 0.0178, + "num_input_tokens_seen": 75747552, + "step": 35095 + }, + { + "epoch": 5.725938009787928, + "grad_norm": 0.02852795645594597, + "learning_rate": 0.0008979600671448571, + "loss": 0.0399, + "num_input_tokens_seen": 75759136, + "step": 35100 + }, + { + "epoch": 5.726753670473083, + "grad_norm": 0.007831635884940624, + "learning_rate": 0.0008979169707313831, + "loss": 0.0653, + "num_input_tokens_seen": 75770208, + "step": 35105 + }, + { + "epoch": 5.7275693311582385, + "grad_norm": 0.0321851447224617, + "learning_rate": 0.000897873866253617, + "loss": 0.0134, + "num_input_tokens_seen": 75780800, + "step": 35110 + }, + { + "epoch": 5.728384991843393, + "grad_norm": 0.07553785294294357, + "learning_rate": 0.0008978307537124324, + "loss": 0.0161, + "num_input_tokens_seen": 75791328, + "step": 35115 + }, + { + "epoch": 5.729200652528548, + "grad_norm": 0.02070830762386322, + "learning_rate": 0.0008977876331087027, + "loss": 0.2251, + "num_input_tokens_seen": 75802688, + "step": 35120 + }, + { + "epoch": 5.730016313213703, + "grad_norm": 0.03978576138615608, + "learning_rate": 0.0008977445044433021, + "loss": 0.1166, + "num_input_tokens_seen": 75812640, + "step": 35125 + }, + { + "epoch": 5.730831973898858, + "grad_norm": 0.0451846607029438, + "learning_rate": 0.0008977013677171045, + "loss": 0.0438, + "num_input_tokens_seen": 75823680, + "step": 35130 + }, + { + "epoch": 5.731647634584013, + "grad_norm": 0.006894924212247133, + "learning_rate": 0.0008976582229309842, + "loss": 0.0278, + "num_input_tokens_seen": 75834208, + "step": 35135 + }, + { + "epoch": 5.732463295269168, + "grad_norm": 0.003591586370021105, + "learning_rate": 0.0008976150700858155, + "loss": 0.1044, + "num_input_tokens_seen": 75846432, + "step": 35140 + }, + { + "epoch": 5.733278955954323, + "grad_norm": 0.05078761652112007, + "learning_rate": 0.000897571909182473, + "loss": 0.2198, + "num_input_tokens_seen": 75856960, + "step": 35145 + }, + { + "epoch": 5.734094616639478, + "grad_norm": 0.06629732251167297, + "learning_rate": 0.0008975287402218314, + "loss": 0.0234, + "num_input_tokens_seen": 75868256, + "step": 35150 + }, + { + "epoch": 5.734910277324633, + "grad_norm": 0.006755585316568613, + "learning_rate": 0.0008974855632047657, + "loss": 0.0185, + "num_input_tokens_seen": 75879808, + "step": 35155 + }, + { + "epoch": 5.735725938009788, + "grad_norm": 0.2323426753282547, + "learning_rate": 0.0008974423781321506, + "loss": 0.2371, + "num_input_tokens_seen": 75891168, + "step": 35160 + }, + { + "epoch": 5.736541598694943, + "grad_norm": 0.008177408017218113, + "learning_rate": 0.0008973991850048616, + "loss": 0.1947, + "num_input_tokens_seen": 75902080, + "step": 35165 + }, + { + "epoch": 5.737357259380098, + "grad_norm": 0.08923593163490295, + "learning_rate": 0.0008973559838237739, + "loss": 0.0899, + "num_input_tokens_seen": 75913568, + "step": 35170 + }, + { + "epoch": 5.738172920065253, + "grad_norm": 0.021789556369185448, + "learning_rate": 0.0008973127745897634, + "loss": 0.0952, + "num_input_tokens_seen": 75923552, + "step": 35175 + }, + { + "epoch": 5.738988580750408, + "grad_norm": 0.17962133884429932, + "learning_rate": 0.0008972695573037052, + "loss": 0.1643, + "num_input_tokens_seen": 75933856, + "step": 35180 + }, + { + "epoch": 5.739804241435563, + "grad_norm": 0.056493062525987625, + "learning_rate": 0.0008972263319664756, + "loss": 0.1245, + "num_input_tokens_seen": 75944896, + "step": 35185 + }, + { + "epoch": 5.740619902120718, + "grad_norm": 0.1534327268600464, + "learning_rate": 0.0008971830985789504, + "loss": 0.126, + "num_input_tokens_seen": 75954880, + "step": 35190 + }, + { + "epoch": 5.741435562805873, + "grad_norm": 0.10149313509464264, + "learning_rate": 0.0008971398571420058, + "loss": 0.0757, + "num_input_tokens_seen": 75964800, + "step": 35195 + }, + { + "epoch": 5.742251223491028, + "grad_norm": 0.11876611411571503, + "learning_rate": 0.0008970966076565183, + "loss": 0.1227, + "num_input_tokens_seen": 75975264, + "step": 35200 + }, + { + "epoch": 5.743066884176183, + "grad_norm": 0.04899505525827408, + "learning_rate": 0.0008970533501233642, + "loss": 0.0359, + "num_input_tokens_seen": 75986976, + "step": 35205 + }, + { + "epoch": 5.7438825448613375, + "grad_norm": 0.023705052211880684, + "learning_rate": 0.0008970100845434204, + "loss": 0.0814, + "num_input_tokens_seen": 75998752, + "step": 35210 + }, + { + "epoch": 5.744698205546492, + "grad_norm": 0.092420294880867, + "learning_rate": 0.0008969668109175635, + "loss": 0.1339, + "num_input_tokens_seen": 76010688, + "step": 35215 + }, + { + "epoch": 5.745513866231647, + "grad_norm": 0.14717933535575867, + "learning_rate": 0.0008969235292466706, + "loss": 0.2898, + "num_input_tokens_seen": 76021408, + "step": 35220 + }, + { + "epoch": 5.746329526916803, + "grad_norm": 0.004998065996915102, + "learning_rate": 0.0008968802395316187, + "loss": 0.1534, + "num_input_tokens_seen": 76032480, + "step": 35225 + }, + { + "epoch": 5.747145187601958, + "grad_norm": 0.15294790267944336, + "learning_rate": 0.0008968369417732855, + "loss": 0.1748, + "num_input_tokens_seen": 76043072, + "step": 35230 + }, + { + "epoch": 5.7479608482871125, + "grad_norm": 0.015233257785439491, + "learning_rate": 0.0008967936359725482, + "loss": 0.0812, + "num_input_tokens_seen": 76054208, + "step": 35235 + }, + { + "epoch": 5.748776508972267, + "grad_norm": 0.017702879384160042, + "learning_rate": 0.0008967503221302844, + "loss": 0.0884, + "num_input_tokens_seen": 76066080, + "step": 35240 + }, + { + "epoch": 5.749592169657422, + "grad_norm": 0.10578637570142746, + "learning_rate": 0.0008967070002473721, + "loss": 0.185, + "num_input_tokens_seen": 76077088, + "step": 35245 + }, + { + "epoch": 5.750407830342578, + "grad_norm": 0.013672021217644215, + "learning_rate": 0.0008966636703246891, + "loss": 0.1049, + "num_input_tokens_seen": 76087104, + "step": 35250 + }, + { + "epoch": 5.751223491027733, + "grad_norm": 0.2054271399974823, + "learning_rate": 0.0008966203323631137, + "loss": 0.1165, + "num_input_tokens_seen": 76098560, + "step": 35255 + }, + { + "epoch": 5.7520391517128875, + "grad_norm": 0.049927644431591034, + "learning_rate": 0.000896576986363524, + "loss": 0.0635, + "num_input_tokens_seen": 76109472, + "step": 35260 + }, + { + "epoch": 5.752854812398042, + "grad_norm": 0.1269955188035965, + "learning_rate": 0.0008965336323267986, + "loss": 0.1808, + "num_input_tokens_seen": 76119744, + "step": 35265 + }, + { + "epoch": 5.753670473083197, + "grad_norm": 0.006356074009090662, + "learning_rate": 0.0008964902702538163, + "loss": 0.0154, + "num_input_tokens_seen": 76129664, + "step": 35270 + }, + { + "epoch": 5.754486133768353, + "grad_norm": 0.05316048115491867, + "learning_rate": 0.0008964469001454554, + "loss": 0.0746, + "num_input_tokens_seen": 76140384, + "step": 35275 + }, + { + "epoch": 5.755301794453508, + "grad_norm": 0.3020736873149872, + "learning_rate": 0.0008964035220025953, + "loss": 0.1024, + "num_input_tokens_seen": 76150624, + "step": 35280 + }, + { + "epoch": 5.7561174551386625, + "grad_norm": 0.006090898532420397, + "learning_rate": 0.000896360135826115, + "loss": 0.0683, + "num_input_tokens_seen": 76161952, + "step": 35285 + }, + { + "epoch": 5.756933115823817, + "grad_norm": 0.03798188641667366, + "learning_rate": 0.0008963167416168936, + "loss": 0.0816, + "num_input_tokens_seen": 76173600, + "step": 35290 + }, + { + "epoch": 5.757748776508972, + "grad_norm": 0.010448027402162552, + "learning_rate": 0.0008962733393758107, + "loss": 0.0177, + "num_input_tokens_seen": 76183808, + "step": 35295 + }, + { + "epoch": 5.758564437194127, + "grad_norm": 0.15920986235141754, + "learning_rate": 0.0008962299291037459, + "loss": 0.1435, + "num_input_tokens_seen": 76194752, + "step": 35300 + }, + { + "epoch": 5.759380097879282, + "grad_norm": 0.18694207072257996, + "learning_rate": 0.000896186510801579, + "loss": 0.1994, + "num_input_tokens_seen": 76205696, + "step": 35305 + }, + { + "epoch": 5.760195758564437, + "grad_norm": 0.01808817870914936, + "learning_rate": 0.0008961430844701899, + "loss": 0.044, + "num_input_tokens_seen": 76215072, + "step": 35310 + }, + { + "epoch": 5.761011419249592, + "grad_norm": 0.08426328748464584, + "learning_rate": 0.0008960996501104583, + "loss": 0.0794, + "num_input_tokens_seen": 76225248, + "step": 35315 + }, + { + "epoch": 5.761827079934747, + "grad_norm": 0.30586087703704834, + "learning_rate": 0.0008960562077232652, + "loss": 0.1164, + "num_input_tokens_seen": 76235552, + "step": 35320 + }, + { + "epoch": 5.762642740619902, + "grad_norm": 0.24793417751789093, + "learning_rate": 0.0008960127573094904, + "loss": 0.1518, + "num_input_tokens_seen": 76246624, + "step": 35325 + }, + { + "epoch": 5.763458401305057, + "grad_norm": 0.26466768980026245, + "learning_rate": 0.0008959692988700148, + "loss": 0.1087, + "num_input_tokens_seen": 76256352, + "step": 35330 + }, + { + "epoch": 5.764274061990212, + "grad_norm": 0.034233685582876205, + "learning_rate": 0.000895925832405719, + "loss": 0.1228, + "num_input_tokens_seen": 76266912, + "step": 35335 + }, + { + "epoch": 5.765089722675367, + "grad_norm": 0.019049430266022682, + "learning_rate": 0.0008958823579174839, + "loss": 0.1375, + "num_input_tokens_seen": 76277344, + "step": 35340 + }, + { + "epoch": 5.765905383360522, + "grad_norm": 0.16720733046531677, + "learning_rate": 0.0008958388754061907, + "loss": 0.1935, + "num_input_tokens_seen": 76288544, + "step": 35345 + }, + { + "epoch": 5.766721044045677, + "grad_norm": 0.09382802248001099, + "learning_rate": 0.0008957953848727205, + "loss": 0.1587, + "num_input_tokens_seen": 76299744, + "step": 35350 + }, + { + "epoch": 5.767536704730832, + "grad_norm": 0.2298130989074707, + "learning_rate": 0.0008957518863179545, + "loss": 0.2096, + "num_input_tokens_seen": 76309280, + "step": 35355 + }, + { + "epoch": 5.768352365415987, + "grad_norm": 0.09195778518915176, + "learning_rate": 0.0008957083797427747, + "loss": 0.114, + "num_input_tokens_seen": 76319072, + "step": 35360 + }, + { + "epoch": 5.769168026101142, + "grad_norm": 0.03482738882303238, + "learning_rate": 0.0008956648651480627, + "loss": 0.0501, + "num_input_tokens_seen": 76329696, + "step": 35365 + }, + { + "epoch": 5.769983686786297, + "grad_norm": 0.038161877542734146, + "learning_rate": 0.0008956213425347001, + "loss": 0.2631, + "num_input_tokens_seen": 76339904, + "step": 35370 + }, + { + "epoch": 5.770799347471452, + "grad_norm": 0.08660417050123215, + "learning_rate": 0.0008955778119035692, + "loss": 0.0777, + "num_input_tokens_seen": 76351776, + "step": 35375 + }, + { + "epoch": 5.771615008156607, + "grad_norm": 0.061839256435632706, + "learning_rate": 0.000895534273255552, + "loss": 0.1279, + "num_input_tokens_seen": 76362208, + "step": 35380 + }, + { + "epoch": 5.7724306688417615, + "grad_norm": 0.07852160930633545, + "learning_rate": 0.0008954907265915311, + "loss": 0.0815, + "num_input_tokens_seen": 76374624, + "step": 35385 + }, + { + "epoch": 5.773246329526917, + "grad_norm": 0.013021070510149002, + "learning_rate": 0.0008954471719123889, + "loss": 0.0977, + "num_input_tokens_seen": 76384064, + "step": 35390 + }, + { + "epoch": 5.774061990212072, + "grad_norm": 0.1448715180158615, + "learning_rate": 0.0008954036092190079, + "loss": 0.0843, + "num_input_tokens_seen": 76394976, + "step": 35395 + }, + { + "epoch": 5.774877650897227, + "grad_norm": 0.01635243371129036, + "learning_rate": 0.0008953600385122713, + "loss": 0.0927, + "num_input_tokens_seen": 76407136, + "step": 35400 + }, + { + "epoch": 5.775693311582382, + "grad_norm": 0.1664854884147644, + "learning_rate": 0.0008953164597930621, + "loss": 0.0961, + "num_input_tokens_seen": 76417536, + "step": 35405 + }, + { + "epoch": 5.7765089722675365, + "grad_norm": 0.008937612175941467, + "learning_rate": 0.0008952728730622632, + "loss": 0.0286, + "num_input_tokens_seen": 76428576, + "step": 35410 + }, + { + "epoch": 5.777324632952691, + "grad_norm": 0.2748555541038513, + "learning_rate": 0.000895229278320758, + "loss": 0.0581, + "num_input_tokens_seen": 76439040, + "step": 35415 + }, + { + "epoch": 5.778140293637847, + "grad_norm": 0.012435352429747581, + "learning_rate": 0.0008951856755694303, + "loss": 0.1852, + "num_input_tokens_seen": 76450624, + "step": 35420 + }, + { + "epoch": 5.778955954323002, + "grad_norm": 0.019059935584664345, + "learning_rate": 0.0008951420648091635, + "loss": 0.0603, + "num_input_tokens_seen": 76459840, + "step": 35425 + }, + { + "epoch": 5.779771615008157, + "grad_norm": 0.059258535504341125, + "learning_rate": 0.0008950984460408414, + "loss": 0.1179, + "num_input_tokens_seen": 76470688, + "step": 35430 + }, + { + "epoch": 5.780587275693311, + "grad_norm": 0.044109195470809937, + "learning_rate": 0.0008950548192653481, + "loss": 0.1961, + "num_input_tokens_seen": 76481728, + "step": 35435 + }, + { + "epoch": 5.781402936378466, + "grad_norm": 0.16723336279392242, + "learning_rate": 0.0008950111844835678, + "loss": 0.0931, + "num_input_tokens_seen": 76492032, + "step": 35440 + }, + { + "epoch": 5.782218597063622, + "grad_norm": 0.1225782185792923, + "learning_rate": 0.0008949675416963847, + "loss": 0.0714, + "num_input_tokens_seen": 76502784, + "step": 35445 + }, + { + "epoch": 5.783034257748777, + "grad_norm": 0.042741745710372925, + "learning_rate": 0.0008949238909046833, + "loss": 0.0396, + "num_input_tokens_seen": 76513856, + "step": 35450 + }, + { + "epoch": 5.783849918433932, + "grad_norm": 0.06107833608984947, + "learning_rate": 0.0008948802321093484, + "loss": 0.069, + "num_input_tokens_seen": 76525056, + "step": 35455 + }, + { + "epoch": 5.784665579119086, + "grad_norm": 0.2204284369945526, + "learning_rate": 0.0008948365653112645, + "loss": 0.1402, + "num_input_tokens_seen": 76535584, + "step": 35460 + }, + { + "epoch": 5.785481239804241, + "grad_norm": 0.1797419935464859, + "learning_rate": 0.0008947928905113166, + "loss": 0.1031, + "num_input_tokens_seen": 76547168, + "step": 35465 + }, + { + "epoch": 5.786296900489396, + "grad_norm": 0.011574863456189632, + "learning_rate": 0.00089474920771039, + "loss": 0.0544, + "num_input_tokens_seen": 76557888, + "step": 35470 + }, + { + "epoch": 5.787112561174552, + "grad_norm": 0.018701527267694473, + "learning_rate": 0.0008947055169093701, + "loss": 0.0469, + "num_input_tokens_seen": 76568864, + "step": 35475 + }, + { + "epoch": 5.787928221859707, + "grad_norm": 0.022701609879732132, + "learning_rate": 0.000894661818109142, + "loss": 0.0372, + "num_input_tokens_seen": 76579296, + "step": 35480 + }, + { + "epoch": 5.788743882544861, + "grad_norm": 0.005170588381588459, + "learning_rate": 0.0008946181113105915, + "loss": 0.0314, + "num_input_tokens_seen": 76589824, + "step": 35485 + }, + { + "epoch": 5.789559543230016, + "grad_norm": 0.008834611624479294, + "learning_rate": 0.0008945743965146044, + "loss": 0.1413, + "num_input_tokens_seen": 76599744, + "step": 35490 + }, + { + "epoch": 5.790375203915171, + "grad_norm": 0.025382647290825844, + "learning_rate": 0.0008945306737220669, + "loss": 0.1045, + "num_input_tokens_seen": 76612000, + "step": 35495 + }, + { + "epoch": 5.791190864600326, + "grad_norm": 0.031471461057662964, + "learning_rate": 0.0008944869429338645, + "loss": 0.0174, + "num_input_tokens_seen": 76623936, + "step": 35500 + }, + { + "epoch": 5.7920065252854815, + "grad_norm": 0.04242995008826256, + "learning_rate": 0.0008944432041508838, + "loss": 0.063, + "num_input_tokens_seen": 76634240, + "step": 35505 + }, + { + "epoch": 5.792822185970636, + "grad_norm": 0.06094391644001007, + "learning_rate": 0.0008943994573740111, + "loss": 0.1362, + "num_input_tokens_seen": 76645344, + "step": 35510 + }, + { + "epoch": 5.793637846655791, + "grad_norm": 0.012706688605248928, + "learning_rate": 0.0008943557026041331, + "loss": 0.0429, + "num_input_tokens_seen": 76656000, + "step": 35515 + }, + { + "epoch": 5.794453507340946, + "grad_norm": 0.24960213899612427, + "learning_rate": 0.0008943119398421367, + "loss": 0.1077, + "num_input_tokens_seen": 76666976, + "step": 35520 + }, + { + "epoch": 5.795269168026101, + "grad_norm": 0.2736920416355133, + "learning_rate": 0.0008942681690889084, + "loss": 0.1498, + "num_input_tokens_seen": 76676192, + "step": 35525 + }, + { + "epoch": 5.7960848287112565, + "grad_norm": 0.008750234730541706, + "learning_rate": 0.0008942243903453356, + "loss": 0.0704, + "num_input_tokens_seen": 76687104, + "step": 35530 + }, + { + "epoch": 5.796900489396411, + "grad_norm": 0.41555219888687134, + "learning_rate": 0.0008941806036123054, + "loss": 0.0924, + "num_input_tokens_seen": 76699104, + "step": 35535 + }, + { + "epoch": 5.797716150081566, + "grad_norm": 0.017580794170498848, + "learning_rate": 0.0008941368088907052, + "loss": 0.1478, + "num_input_tokens_seen": 76710464, + "step": 35540 + }, + { + "epoch": 5.798531810766721, + "grad_norm": 0.020478034391999245, + "learning_rate": 0.0008940930061814226, + "loss": 0.0289, + "num_input_tokens_seen": 76720864, + "step": 35545 + }, + { + "epoch": 5.799347471451876, + "grad_norm": 0.020645439624786377, + "learning_rate": 0.0008940491954853451, + "loss": 0.0699, + "num_input_tokens_seen": 76731520, + "step": 35550 + }, + { + "epoch": 5.800163132137031, + "grad_norm": 0.16809356212615967, + "learning_rate": 0.0008940053768033609, + "loss": 0.1172, + "num_input_tokens_seen": 76741536, + "step": 35555 + }, + { + "epoch": 5.800978792822186, + "grad_norm": 0.01885121874511242, + "learning_rate": 0.0008939615501363581, + "loss": 0.0427, + "num_input_tokens_seen": 76751904, + "step": 35560 + }, + { + "epoch": 5.801794453507341, + "grad_norm": 0.14384441077709198, + "learning_rate": 0.0008939177154852245, + "loss": 0.1677, + "num_input_tokens_seen": 76761504, + "step": 35565 + }, + { + "epoch": 5.802610114192496, + "grad_norm": 0.023430874571204185, + "learning_rate": 0.0008938738728508487, + "loss": 0.0219, + "num_input_tokens_seen": 76771968, + "step": 35570 + }, + { + "epoch": 5.803425774877651, + "grad_norm": 0.25992628931999207, + "learning_rate": 0.0008938300222341192, + "loss": 0.1391, + "num_input_tokens_seen": 76782432, + "step": 35575 + }, + { + "epoch": 5.804241435562806, + "grad_norm": 0.18583820760250092, + "learning_rate": 0.0008937861636359248, + "loss": 0.1185, + "num_input_tokens_seen": 76793152, + "step": 35580 + }, + { + "epoch": 5.80505709624796, + "grad_norm": 0.0860733613371849, + "learning_rate": 0.000893742297057154, + "loss": 0.0463, + "num_input_tokens_seen": 76804000, + "step": 35585 + }, + { + "epoch": 5.805872756933116, + "grad_norm": 0.0031222442630678415, + "learning_rate": 0.0008936984224986962, + "loss": 0.0175, + "num_input_tokens_seen": 76814432, + "step": 35590 + }, + { + "epoch": 5.806688417618271, + "grad_norm": 0.03556114807724953, + "learning_rate": 0.0008936545399614405, + "loss": 0.0829, + "num_input_tokens_seen": 76825472, + "step": 35595 + }, + { + "epoch": 5.807504078303426, + "grad_norm": 0.03534172475337982, + "learning_rate": 0.0008936106494462761, + "loss": 0.0229, + "num_input_tokens_seen": 76837568, + "step": 35600 + }, + { + "epoch": 5.808319738988581, + "grad_norm": 0.5079765915870667, + "learning_rate": 0.0008935667509540926, + "loss": 0.0487, + "num_input_tokens_seen": 76848768, + "step": 35605 + }, + { + "epoch": 5.809135399673735, + "grad_norm": 0.3210282325744629, + "learning_rate": 0.0008935228444857795, + "loss": 0.0825, + "num_input_tokens_seen": 76860000, + "step": 35610 + }, + { + "epoch": 5.809951060358891, + "grad_norm": 0.07551582157611847, + "learning_rate": 0.0008934789300422268, + "loss": 0.0302, + "num_input_tokens_seen": 76871904, + "step": 35615 + }, + { + "epoch": 5.810766721044046, + "grad_norm": 0.013979324139654636, + "learning_rate": 0.0008934350076243245, + "loss": 0.0355, + "num_input_tokens_seen": 76881504, + "step": 35620 + }, + { + "epoch": 5.811582381729201, + "grad_norm": 0.19665570557117462, + "learning_rate": 0.0008933910772329625, + "loss": 0.1097, + "num_input_tokens_seen": 76894080, + "step": 35625 + }, + { + "epoch": 5.8123980424143555, + "grad_norm": 0.013969731517136097, + "learning_rate": 0.0008933471388690314, + "loss": 0.0992, + "num_input_tokens_seen": 76904672, + "step": 35630 + }, + { + "epoch": 5.81321370309951, + "grad_norm": 0.13109974563121796, + "learning_rate": 0.0008933031925334214, + "loss": 0.0372, + "num_input_tokens_seen": 76914976, + "step": 35635 + }, + { + "epoch": 5.814029363784666, + "grad_norm": 0.08777325600385666, + "learning_rate": 0.0008932592382270235, + "loss": 0.0715, + "num_input_tokens_seen": 76925824, + "step": 35640 + }, + { + "epoch": 5.814845024469821, + "grad_norm": 0.009413676336407661, + "learning_rate": 0.0008932152759507279, + "loss": 0.0927, + "num_input_tokens_seen": 76936160, + "step": 35645 + }, + { + "epoch": 5.815660685154976, + "grad_norm": 0.26903340220451355, + "learning_rate": 0.0008931713057054263, + "loss": 0.1137, + "num_input_tokens_seen": 76946016, + "step": 35650 + }, + { + "epoch": 5.8164763458401305, + "grad_norm": 0.29795363545417786, + "learning_rate": 0.0008931273274920091, + "loss": 0.1369, + "num_input_tokens_seen": 76956704, + "step": 35655 + }, + { + "epoch": 5.817292006525285, + "grad_norm": 0.04012474790215492, + "learning_rate": 0.0008930833413113682, + "loss": 0.0426, + "num_input_tokens_seen": 76967360, + "step": 35660 + }, + { + "epoch": 5.81810766721044, + "grad_norm": 0.09159766137599945, + "learning_rate": 0.0008930393471643945, + "loss": 0.1252, + "num_input_tokens_seen": 76979008, + "step": 35665 + }, + { + "epoch": 5.818923327895595, + "grad_norm": 0.010702038183808327, + "learning_rate": 0.0008929953450519799, + "loss": 0.1414, + "num_input_tokens_seen": 76988800, + "step": 35670 + }, + { + "epoch": 5.819738988580751, + "grad_norm": 0.32852408289909363, + "learning_rate": 0.000892951334975016, + "loss": 0.222, + "num_input_tokens_seen": 76999616, + "step": 35675 + }, + { + "epoch": 5.8205546492659055, + "grad_norm": 0.02813861146569252, + "learning_rate": 0.0008929073169343948, + "loss": 0.0201, + "num_input_tokens_seen": 77010656, + "step": 35680 + }, + { + "epoch": 5.82137030995106, + "grad_norm": 0.007033924106508493, + "learning_rate": 0.0008928632909310084, + "loss": 0.0356, + "num_input_tokens_seen": 77021920, + "step": 35685 + }, + { + "epoch": 5.822185970636215, + "grad_norm": 0.057367030531167984, + "learning_rate": 0.000892819256965749, + "loss": 0.0496, + "num_input_tokens_seen": 77032416, + "step": 35690 + }, + { + "epoch": 5.82300163132137, + "grad_norm": 0.21670454740524292, + "learning_rate": 0.0008927752150395092, + "loss": 0.0669, + "num_input_tokens_seen": 77042944, + "step": 35695 + }, + { + "epoch": 5.823817292006526, + "grad_norm": 0.12671571969985962, + "learning_rate": 0.0008927311651531813, + "loss": 0.021, + "num_input_tokens_seen": 77051584, + "step": 35700 + }, + { + "epoch": 5.8246329526916805, + "grad_norm": 0.08893094211816788, + "learning_rate": 0.0008926871073076581, + "loss": 0.0918, + "num_input_tokens_seen": 77062048, + "step": 35705 + }, + { + "epoch": 5.825448613376835, + "grad_norm": 0.05736568570137024, + "learning_rate": 0.0008926430415038324, + "loss": 0.1872, + "num_input_tokens_seen": 77074080, + "step": 35710 + }, + { + "epoch": 5.82626427406199, + "grad_norm": 0.023801175877451897, + "learning_rate": 0.0008925989677425976, + "loss": 0.1545, + "num_input_tokens_seen": 77085632, + "step": 35715 + }, + { + "epoch": 5.827079934747145, + "grad_norm": 0.05330059304833412, + "learning_rate": 0.0008925548860248464, + "loss": 0.0732, + "num_input_tokens_seen": 77097664, + "step": 35720 + }, + { + "epoch": 5.827895595432301, + "grad_norm": 0.032428644597530365, + "learning_rate": 0.0008925107963514727, + "loss": 0.0313, + "num_input_tokens_seen": 77109568, + "step": 35725 + }, + { + "epoch": 5.828711256117455, + "grad_norm": 0.03098035790026188, + "learning_rate": 0.0008924666987233697, + "loss": 0.0813, + "num_input_tokens_seen": 77119648, + "step": 35730 + }, + { + "epoch": 5.82952691680261, + "grad_norm": 0.08395079523324966, + "learning_rate": 0.0008924225931414312, + "loss": 0.0296, + "num_input_tokens_seen": 77132064, + "step": 35735 + }, + { + "epoch": 5.830342577487765, + "grad_norm": 0.05584961175918579, + "learning_rate": 0.000892378479606551, + "loss": 0.062, + "num_input_tokens_seen": 77142880, + "step": 35740 + }, + { + "epoch": 5.83115823817292, + "grad_norm": 0.13756109774112701, + "learning_rate": 0.0008923343581196231, + "loss": 0.0455, + "num_input_tokens_seen": 77154272, + "step": 35745 + }, + { + "epoch": 5.831973898858075, + "grad_norm": 0.213007390499115, + "learning_rate": 0.0008922902286815417, + "loss": 0.1135, + "num_input_tokens_seen": 77165280, + "step": 35750 + }, + { + "epoch": 5.8327895595432295, + "grad_norm": 0.05882050469517708, + "learning_rate": 0.0008922460912932013, + "loss": 0.048, + "num_input_tokens_seen": 77176032, + "step": 35755 + }, + { + "epoch": 5.833605220228385, + "grad_norm": 0.008132684044539928, + "learning_rate": 0.0008922019459554961, + "loss": 0.0747, + "num_input_tokens_seen": 77186880, + "step": 35760 + }, + { + "epoch": 5.83442088091354, + "grad_norm": 0.11372732371091843, + "learning_rate": 0.000892157792669321, + "loss": 0.0945, + "num_input_tokens_seen": 77196704, + "step": 35765 + }, + { + "epoch": 5.835236541598695, + "grad_norm": 0.09561900794506073, + "learning_rate": 0.0008921136314355706, + "loss": 0.0354, + "num_input_tokens_seen": 77208064, + "step": 35770 + }, + { + "epoch": 5.83605220228385, + "grad_norm": 0.1992100477218628, + "learning_rate": 0.0008920694622551402, + "loss": 0.1815, + "num_input_tokens_seen": 77219328, + "step": 35775 + }, + { + "epoch": 5.8368678629690045, + "grad_norm": 0.05465446040034294, + "learning_rate": 0.0008920252851289248, + "loss": 0.0935, + "num_input_tokens_seen": 77230592, + "step": 35780 + }, + { + "epoch": 5.83768352365416, + "grad_norm": 0.05838768556714058, + "learning_rate": 0.0008919811000578195, + "loss": 0.0539, + "num_input_tokens_seen": 77240992, + "step": 35785 + }, + { + "epoch": 5.838499184339315, + "grad_norm": 0.062777079641819, + "learning_rate": 0.0008919369070427201, + "loss": 0.0671, + "num_input_tokens_seen": 77250144, + "step": 35790 + }, + { + "epoch": 5.83931484502447, + "grad_norm": 0.0046429941430687904, + "learning_rate": 0.000891892706084522, + "loss": 0.0953, + "num_input_tokens_seen": 77262464, + "step": 35795 + }, + { + "epoch": 5.840130505709625, + "grad_norm": 0.08209957927465439, + "learning_rate": 0.0008918484971841211, + "loss": 0.0673, + "num_input_tokens_seen": 77273248, + "step": 35800 + }, + { + "epoch": 5.8409461663947795, + "grad_norm": 0.26815515756607056, + "learning_rate": 0.0008918042803424133, + "loss": 0.1736, + "num_input_tokens_seen": 77284032, + "step": 35805 + }, + { + "epoch": 5.841761827079935, + "grad_norm": 0.0913134217262268, + "learning_rate": 0.0008917600555602947, + "loss": 0.0644, + "num_input_tokens_seen": 77295072, + "step": 35810 + }, + { + "epoch": 5.84257748776509, + "grad_norm": 0.3727843165397644, + "learning_rate": 0.0008917158228386616, + "loss": 0.2948, + "num_input_tokens_seen": 77306208, + "step": 35815 + }, + { + "epoch": 5.843393148450245, + "grad_norm": 0.20646388828754425, + "learning_rate": 0.0008916715821784105, + "loss": 0.074, + "num_input_tokens_seen": 77317024, + "step": 35820 + }, + { + "epoch": 5.8442088091354, + "grad_norm": 0.10968193411827087, + "learning_rate": 0.0008916273335804377, + "loss": 0.0643, + "num_input_tokens_seen": 77327648, + "step": 35825 + }, + { + "epoch": 5.8450244698205545, + "grad_norm": 0.024440661072731018, + "learning_rate": 0.0008915830770456403, + "loss": 0.0452, + "num_input_tokens_seen": 77338304, + "step": 35830 + }, + { + "epoch": 5.845840130505709, + "grad_norm": 0.038344550877809525, + "learning_rate": 0.0008915388125749152, + "loss": 0.0834, + "num_input_tokens_seen": 77348768, + "step": 35835 + }, + { + "epoch": 5.846655791190865, + "grad_norm": 0.032103490084409714, + "learning_rate": 0.0008914945401691592, + "loss": 0.0648, + "num_input_tokens_seen": 77358208, + "step": 35840 + }, + { + "epoch": 5.84747145187602, + "grad_norm": 0.011769642122089863, + "learning_rate": 0.0008914502598292698, + "loss": 0.0692, + "num_input_tokens_seen": 77368896, + "step": 35845 + }, + { + "epoch": 5.848287112561175, + "grad_norm": 0.03090481087565422, + "learning_rate": 0.0008914059715561442, + "loss": 0.0185, + "num_input_tokens_seen": 77379616, + "step": 35850 + }, + { + "epoch": 5.849102773246329, + "grad_norm": 0.181661456823349, + "learning_rate": 0.0008913616753506801, + "loss": 0.1266, + "num_input_tokens_seen": 77390400, + "step": 35855 + }, + { + "epoch": 5.849918433931484, + "grad_norm": 0.19730189442634583, + "learning_rate": 0.0008913173712137752, + "loss": 0.1115, + "num_input_tokens_seen": 77400544, + "step": 35860 + }, + { + "epoch": 5.850734094616639, + "grad_norm": 0.01578918658196926, + "learning_rate": 0.0008912730591463274, + "loss": 0.0149, + "num_input_tokens_seen": 77411136, + "step": 35865 + }, + { + "epoch": 5.851549755301795, + "grad_norm": 0.0598427951335907, + "learning_rate": 0.0008912287391492345, + "loss": 0.2076, + "num_input_tokens_seen": 77422240, + "step": 35870 + }, + { + "epoch": 5.85236541598695, + "grad_norm": 0.026703141629695892, + "learning_rate": 0.0008911844112233951, + "loss": 0.0492, + "num_input_tokens_seen": 77431648, + "step": 35875 + }, + { + "epoch": 5.853181076672104, + "grad_norm": 0.06085679307579994, + "learning_rate": 0.0008911400753697072, + "loss": 0.0402, + "num_input_tokens_seen": 77443680, + "step": 35880 + }, + { + "epoch": 5.853996737357259, + "grad_norm": 0.029114073142409325, + "learning_rate": 0.0008910957315890695, + "loss": 0.0482, + "num_input_tokens_seen": 77453824, + "step": 35885 + }, + { + "epoch": 5.854812398042414, + "grad_norm": 0.008561065420508385, + "learning_rate": 0.0008910513798823807, + "loss": 0.1967, + "num_input_tokens_seen": 77464512, + "step": 35890 + }, + { + "epoch": 5.85562805872757, + "grad_norm": 0.1490209549665451, + "learning_rate": 0.0008910070202505396, + "loss": 0.1837, + "num_input_tokens_seen": 77475584, + "step": 35895 + }, + { + "epoch": 5.856443719412725, + "grad_norm": 0.1263769567012787, + "learning_rate": 0.0008909626526944452, + "loss": 0.1013, + "num_input_tokens_seen": 77486464, + "step": 35900 + }, + { + "epoch": 5.857259380097879, + "grad_norm": 0.20194324851036072, + "learning_rate": 0.0008909182772149966, + "loss": 0.1723, + "num_input_tokens_seen": 77495168, + "step": 35905 + }, + { + "epoch": 5.858075040783034, + "grad_norm": 0.2770948112010956, + "learning_rate": 0.0008908738938130933, + "loss": 0.1802, + "num_input_tokens_seen": 77505632, + "step": 35910 + }, + { + "epoch": 5.858890701468189, + "grad_norm": 0.07713992148637772, + "learning_rate": 0.0008908295024896346, + "loss": 0.0847, + "num_input_tokens_seen": 77516512, + "step": 35915 + }, + { + "epoch": 5.859706362153344, + "grad_norm": 0.2031664252281189, + "learning_rate": 0.0008907851032455204, + "loss": 0.123, + "num_input_tokens_seen": 77528352, + "step": 35920 + }, + { + "epoch": 5.8605220228384995, + "grad_norm": 0.10846851766109467, + "learning_rate": 0.0008907406960816502, + "loss": 0.0887, + "num_input_tokens_seen": 77537728, + "step": 35925 + }, + { + "epoch": 5.861337683523654, + "grad_norm": 0.014804016798734665, + "learning_rate": 0.0008906962809989242, + "loss": 0.0727, + "num_input_tokens_seen": 77549216, + "step": 35930 + }, + { + "epoch": 5.862153344208809, + "grad_norm": 0.13727165758609772, + "learning_rate": 0.0008906518579982423, + "loss": 0.193, + "num_input_tokens_seen": 77560288, + "step": 35935 + }, + { + "epoch": 5.862969004893964, + "grad_norm": 0.06701788306236267, + "learning_rate": 0.000890607427080505, + "loss": 0.0999, + "num_input_tokens_seen": 77571968, + "step": 35940 + }, + { + "epoch": 5.863784665579119, + "grad_norm": 0.01595906913280487, + "learning_rate": 0.0008905629882466126, + "loss": 0.0535, + "num_input_tokens_seen": 77583872, + "step": 35945 + }, + { + "epoch": 5.864600326264274, + "grad_norm": 0.03020952083170414, + "learning_rate": 0.0008905185414974659, + "loss": 0.1131, + "num_input_tokens_seen": 77595232, + "step": 35950 + }, + { + "epoch": 5.865415986949429, + "grad_norm": 0.06419949233531952, + "learning_rate": 0.0008904740868339655, + "loss": 0.0284, + "num_input_tokens_seen": 77605696, + "step": 35955 + }, + { + "epoch": 5.866231647634584, + "grad_norm": 0.017439957708120346, + "learning_rate": 0.0008904296242570123, + "loss": 0.1098, + "num_input_tokens_seen": 77616192, + "step": 35960 + }, + { + "epoch": 5.867047308319739, + "grad_norm": 0.10805799812078476, + "learning_rate": 0.0008903851537675076, + "loss": 0.1262, + "num_input_tokens_seen": 77626688, + "step": 35965 + }, + { + "epoch": 5.867862969004894, + "grad_norm": 0.017202205955982208, + "learning_rate": 0.0008903406753663524, + "loss": 0.0515, + "num_input_tokens_seen": 77637152, + "step": 35970 + }, + { + "epoch": 5.868678629690049, + "grad_norm": 0.1181994378566742, + "learning_rate": 0.0008902961890544483, + "loss": 0.0776, + "num_input_tokens_seen": 77647328, + "step": 35975 + }, + { + "epoch": 5.869494290375204, + "grad_norm": 0.028154756873846054, + "learning_rate": 0.0008902516948326967, + "loss": 0.0395, + "num_input_tokens_seen": 77657824, + "step": 35980 + }, + { + "epoch": 5.870309951060359, + "grad_norm": 0.009859089739620686, + "learning_rate": 0.0008902071927019996, + "loss": 0.0418, + "num_input_tokens_seen": 77668768, + "step": 35985 + }, + { + "epoch": 5.871125611745514, + "grad_norm": 0.13221707940101624, + "learning_rate": 0.0008901626826632586, + "loss": 0.15, + "num_input_tokens_seen": 77679040, + "step": 35990 + }, + { + "epoch": 5.871941272430669, + "grad_norm": 0.005262443795800209, + "learning_rate": 0.000890118164717376, + "loss": 0.1334, + "num_input_tokens_seen": 77690304, + "step": 35995 + }, + { + "epoch": 5.872756933115824, + "grad_norm": 0.09470520913600922, + "learning_rate": 0.0008900736388652537, + "loss": 0.0297, + "num_input_tokens_seen": 77700480, + "step": 36000 + }, + { + "epoch": 5.873572593800979, + "grad_norm": 0.05729277804493904, + "learning_rate": 0.0008900291051077944, + "loss": 0.035, + "num_input_tokens_seen": 77711552, + "step": 36005 + }, + { + "epoch": 5.874388254486134, + "grad_norm": 0.06786457449197769, + "learning_rate": 0.0008899845634459005, + "loss": 0.1254, + "num_input_tokens_seen": 77722944, + "step": 36010 + }, + { + "epoch": 5.875203915171289, + "grad_norm": 0.004539950285106897, + "learning_rate": 0.0008899400138804748, + "loss": 0.0447, + "num_input_tokens_seen": 77735072, + "step": 36015 + }, + { + "epoch": 5.876019575856444, + "grad_norm": 0.0032588113099336624, + "learning_rate": 0.0008898954564124197, + "loss": 0.0199, + "num_input_tokens_seen": 77746848, + "step": 36020 + }, + { + "epoch": 5.876835236541599, + "grad_norm": 0.0229257233440876, + "learning_rate": 0.0008898508910426388, + "loss": 0.0836, + "num_input_tokens_seen": 77757664, + "step": 36025 + }, + { + "epoch": 5.877650897226753, + "grad_norm": 0.008827628567814827, + "learning_rate": 0.0008898063177720351, + "loss": 0.0338, + "num_input_tokens_seen": 77768288, + "step": 36030 + }, + { + "epoch": 5.878466557911908, + "grad_norm": 0.003499216167256236, + "learning_rate": 0.0008897617366015118, + "loss": 0.1694, + "num_input_tokens_seen": 77778080, + "step": 36035 + }, + { + "epoch": 5.879282218597064, + "grad_norm": 0.1507546603679657, + "learning_rate": 0.0008897171475319723, + "loss": 0.1587, + "num_input_tokens_seen": 77790784, + "step": 36040 + }, + { + "epoch": 5.880097879282219, + "grad_norm": 0.06880336999893188, + "learning_rate": 0.0008896725505643206, + "loss": 0.0755, + "num_input_tokens_seen": 77801344, + "step": 36045 + }, + { + "epoch": 5.8809135399673735, + "grad_norm": 0.035696517676115036, + "learning_rate": 0.0008896279456994603, + "loss": 0.0728, + "num_input_tokens_seen": 77811392, + "step": 36050 + }, + { + "epoch": 5.881729200652528, + "grad_norm": 0.014336380176246166, + "learning_rate": 0.0008895833329382954, + "loss": 0.0269, + "num_input_tokens_seen": 77821312, + "step": 36055 + }, + { + "epoch": 5.882544861337683, + "grad_norm": 0.02815372683107853, + "learning_rate": 0.00088953871228173, + "loss": 0.1937, + "num_input_tokens_seen": 77831872, + "step": 36060 + }, + { + "epoch": 5.883360522022839, + "grad_norm": 0.04767763614654541, + "learning_rate": 0.0008894940837306685, + "loss": 0.0334, + "num_input_tokens_seen": 77842752, + "step": 36065 + }, + { + "epoch": 5.884176182707994, + "grad_norm": 0.014469556510448456, + "learning_rate": 0.000889449447286015, + "loss": 0.0593, + "num_input_tokens_seen": 77853696, + "step": 36070 + }, + { + "epoch": 5.8849918433931485, + "grad_norm": 0.07536475360393524, + "learning_rate": 0.0008894048029486748, + "loss": 0.1205, + "num_input_tokens_seen": 77864288, + "step": 36075 + }, + { + "epoch": 5.885807504078303, + "grad_norm": 0.018441442400217056, + "learning_rate": 0.0008893601507195521, + "loss": 0.0187, + "num_input_tokens_seen": 77874880, + "step": 36080 + }, + { + "epoch": 5.886623164763458, + "grad_norm": 0.009847963228821754, + "learning_rate": 0.000889315490599552, + "loss": 0.0142, + "num_input_tokens_seen": 77886688, + "step": 36085 + }, + { + "epoch": 5.887438825448614, + "grad_norm": 0.06195588409900665, + "learning_rate": 0.0008892708225895796, + "loss": 0.1071, + "num_input_tokens_seen": 77897376, + "step": 36090 + }, + { + "epoch": 5.888254486133769, + "grad_norm": 0.0886634811758995, + "learning_rate": 0.0008892261466905402, + "loss": 0.0252, + "num_input_tokens_seen": 77908736, + "step": 36095 + }, + { + "epoch": 5.8890701468189235, + "grad_norm": 0.003234286792576313, + "learning_rate": 0.000889181462903339, + "loss": 0.1346, + "num_input_tokens_seen": 77920352, + "step": 36100 + }, + { + "epoch": 5.889885807504078, + "grad_norm": 0.006923974025994539, + "learning_rate": 0.0008891367712288819, + "loss": 0.127, + "num_input_tokens_seen": 77931360, + "step": 36105 + }, + { + "epoch": 5.890701468189233, + "grad_norm": 0.02402672730386257, + "learning_rate": 0.0008890920716680744, + "loss": 0.0434, + "num_input_tokens_seen": 77942208, + "step": 36110 + }, + { + "epoch": 5.891517128874388, + "grad_norm": 0.0024572850670665503, + "learning_rate": 0.0008890473642218226, + "loss": 0.0616, + "num_input_tokens_seen": 77953248, + "step": 36115 + }, + { + "epoch": 5.892332789559543, + "grad_norm": 0.07898968458175659, + "learning_rate": 0.0008890026488910323, + "loss": 0.0384, + "num_input_tokens_seen": 77963584, + "step": 36120 + }, + { + "epoch": 5.8931484502446985, + "grad_norm": 0.15272687375545502, + "learning_rate": 0.0008889579256766098, + "loss": 0.0753, + "num_input_tokens_seen": 77973920, + "step": 36125 + }, + { + "epoch": 5.893964110929853, + "grad_norm": 0.018967045471072197, + "learning_rate": 0.0008889131945794618, + "loss": 0.0666, + "num_input_tokens_seen": 77984736, + "step": 36130 + }, + { + "epoch": 5.894779771615008, + "grad_norm": 0.190431609749794, + "learning_rate": 0.0008888684556004942, + "loss": 0.1407, + "num_input_tokens_seen": 77996160, + "step": 36135 + }, + { + "epoch": 5.895595432300163, + "grad_norm": 0.24428105354309082, + "learning_rate": 0.0008888237087406141, + "loss": 0.1523, + "num_input_tokens_seen": 78006592, + "step": 36140 + }, + { + "epoch": 5.896411092985318, + "grad_norm": 0.005844801664352417, + "learning_rate": 0.0008887789540007285, + "loss": 0.0982, + "num_input_tokens_seen": 78015936, + "step": 36145 + }, + { + "epoch": 5.897226753670473, + "grad_norm": 0.2065504938364029, + "learning_rate": 0.000888734191381744, + "loss": 0.1948, + "num_input_tokens_seen": 78028160, + "step": 36150 + }, + { + "epoch": 5.898042414355628, + "grad_norm": 0.013972515240311623, + "learning_rate": 0.000888689420884568, + "loss": 0.0903, + "num_input_tokens_seen": 78039712, + "step": 36155 + }, + { + "epoch": 5.898858075040783, + "grad_norm": 0.0808735191822052, + "learning_rate": 0.0008886446425101078, + "loss": 0.0542, + "num_input_tokens_seen": 78051936, + "step": 36160 + }, + { + "epoch": 5.899673735725938, + "grad_norm": 0.016068795695900917, + "learning_rate": 0.0008885998562592709, + "loss": 0.1721, + "num_input_tokens_seen": 78062464, + "step": 36165 + }, + { + "epoch": 5.900489396411093, + "grad_norm": 0.02947733923792839, + "learning_rate": 0.0008885550621329649, + "loss": 0.0527, + "num_input_tokens_seen": 78072352, + "step": 36170 + }, + { + "epoch": 5.901305057096248, + "grad_norm": 0.011550496332347393, + "learning_rate": 0.0008885102601320976, + "loss": 0.1095, + "num_input_tokens_seen": 78082816, + "step": 36175 + }, + { + "epoch": 5.902120717781403, + "grad_norm": 0.21144059300422668, + "learning_rate": 0.0008884654502575771, + "loss": 0.1612, + "num_input_tokens_seen": 78092288, + "step": 36180 + }, + { + "epoch": 5.902936378466558, + "grad_norm": 0.0989311933517456, + "learning_rate": 0.0008884206325103115, + "loss": 0.1202, + "num_input_tokens_seen": 78104608, + "step": 36185 + }, + { + "epoch": 5.903752039151713, + "grad_norm": 0.0046556913293898106, + "learning_rate": 0.000888375806891209, + "loss": 0.0256, + "num_input_tokens_seen": 78116160, + "step": 36190 + }, + { + "epoch": 5.904567699836868, + "grad_norm": 0.0441984161734581, + "learning_rate": 0.0008883309734011779, + "loss": 0.2098, + "num_input_tokens_seen": 78127200, + "step": 36195 + }, + { + "epoch": 5.9053833605220225, + "grad_norm": 0.07077279686927795, + "learning_rate": 0.0008882861320411273, + "loss": 0.1032, + "num_input_tokens_seen": 78136928, + "step": 36200 + }, + { + "epoch": 5.906199021207177, + "grad_norm": 0.007242160849273205, + "learning_rate": 0.0008882412828119655, + "loss": 0.0374, + "num_input_tokens_seen": 78148704, + "step": 36205 + }, + { + "epoch": 5.907014681892333, + "grad_norm": 0.06220718100667, + "learning_rate": 0.0008881964257146015, + "loss": 0.0583, + "num_input_tokens_seen": 78158752, + "step": 36210 + }, + { + "epoch": 5.907830342577488, + "grad_norm": 0.2649748623371124, + "learning_rate": 0.0008881515607499446, + "loss": 0.3094, + "num_input_tokens_seen": 78167968, + "step": 36215 + }, + { + "epoch": 5.908646003262643, + "grad_norm": 0.20415550470352173, + "learning_rate": 0.000888106687918904, + "loss": 0.1351, + "num_input_tokens_seen": 78178688, + "step": 36220 + }, + { + "epoch": 5.9094616639477975, + "grad_norm": 0.18021751940250397, + "learning_rate": 0.000888061807222389, + "loss": 0.1618, + "num_input_tokens_seen": 78189376, + "step": 36225 + }, + { + "epoch": 5.910277324632952, + "grad_norm": 0.028843821957707405, + "learning_rate": 0.000888016918661309, + "loss": 0.0501, + "num_input_tokens_seen": 78200000, + "step": 36230 + }, + { + "epoch": 5.911092985318108, + "grad_norm": 0.10646991431713104, + "learning_rate": 0.0008879720222365739, + "loss": 0.0775, + "num_input_tokens_seen": 78210592, + "step": 36235 + }, + { + "epoch": 5.911908646003263, + "grad_norm": 0.21381233632564545, + "learning_rate": 0.0008879271179490938, + "loss": 0.0618, + "num_input_tokens_seen": 78220896, + "step": 36240 + }, + { + "epoch": 5.912724306688418, + "grad_norm": 0.028728807345032692, + "learning_rate": 0.0008878822057997784, + "loss": 0.0133, + "num_input_tokens_seen": 78233088, + "step": 36245 + }, + { + "epoch": 5.9135399673735725, + "grad_norm": 0.1276533603668213, + "learning_rate": 0.000887837285789538, + "loss": 0.2135, + "num_input_tokens_seen": 78243680, + "step": 36250 + }, + { + "epoch": 5.914355628058727, + "grad_norm": 0.004085680469870567, + "learning_rate": 0.0008877923579192831, + "loss": 0.0852, + "num_input_tokens_seen": 78253344, + "step": 36255 + }, + { + "epoch": 5.915171288743883, + "grad_norm": 0.009961583651602268, + "learning_rate": 0.0008877474221899241, + "loss": 0.0635, + "num_input_tokens_seen": 78264864, + "step": 36260 + }, + { + "epoch": 5.915986949429038, + "grad_norm": 0.03305363655090332, + "learning_rate": 0.0008877024786023718, + "loss": 0.0606, + "num_input_tokens_seen": 78276000, + "step": 36265 + }, + { + "epoch": 5.916802610114193, + "grad_norm": 0.05178165063261986, + "learning_rate": 0.0008876575271575366, + "loss": 0.124, + "num_input_tokens_seen": 78287360, + "step": 36270 + }, + { + "epoch": 5.917618270799347, + "grad_norm": 0.00517655024304986, + "learning_rate": 0.0008876125678563301, + "loss": 0.0364, + "num_input_tokens_seen": 78296608, + "step": 36275 + }, + { + "epoch": 5.918433931484502, + "grad_norm": 0.10580654442310333, + "learning_rate": 0.0008875676006996631, + "loss": 0.0341, + "num_input_tokens_seen": 78306848, + "step": 36280 + }, + { + "epoch": 5.919249592169657, + "grad_norm": 0.1294144243001938, + "learning_rate": 0.0008875226256884471, + "loss": 0.0383, + "num_input_tokens_seen": 78317504, + "step": 36285 + }, + { + "epoch": 5.920065252854813, + "grad_norm": 0.19769349694252014, + "learning_rate": 0.0008874776428235933, + "loss": 0.124, + "num_input_tokens_seen": 78327360, + "step": 36290 + }, + { + "epoch": 5.920880913539968, + "grad_norm": 0.00506366603076458, + "learning_rate": 0.0008874326521060138, + "loss": 0.1059, + "num_input_tokens_seen": 78338656, + "step": 36295 + }, + { + "epoch": 5.921696574225122, + "grad_norm": 0.09977197647094727, + "learning_rate": 0.0008873876535366199, + "loss": 0.1181, + "num_input_tokens_seen": 78350464, + "step": 36300 + }, + { + "epoch": 5.922512234910277, + "grad_norm": 0.37759852409362793, + "learning_rate": 0.0008873426471163238, + "loss": 0.0473, + "num_input_tokens_seen": 78360480, + "step": 36305 + }, + { + "epoch": 5.923327895595432, + "grad_norm": 0.12448231875896454, + "learning_rate": 0.0008872976328460376, + "loss": 0.0744, + "num_input_tokens_seen": 78371264, + "step": 36310 + }, + { + "epoch": 5.924143556280587, + "grad_norm": 0.18016120791435242, + "learning_rate": 0.0008872526107266736, + "loss": 0.1507, + "num_input_tokens_seen": 78382176, + "step": 36315 + }, + { + "epoch": 5.924959216965743, + "grad_norm": 0.06458032876253128, + "learning_rate": 0.0008872075807591442, + "loss": 0.2192, + "num_input_tokens_seen": 78392704, + "step": 36320 + }, + { + "epoch": 5.925774877650897, + "grad_norm": 0.11286918073892593, + "learning_rate": 0.0008871625429443617, + "loss": 0.3079, + "num_input_tokens_seen": 78404544, + "step": 36325 + }, + { + "epoch": 5.926590538336052, + "grad_norm": 0.032583173364400864, + "learning_rate": 0.0008871174972832394, + "loss": 0.0888, + "num_input_tokens_seen": 78414400, + "step": 36330 + }, + { + "epoch": 5.927406199021207, + "grad_norm": 0.1005791425704956, + "learning_rate": 0.0008870724437766898, + "loss": 0.1106, + "num_input_tokens_seen": 78425664, + "step": 36335 + }, + { + "epoch": 5.928221859706362, + "grad_norm": 0.13125361502170563, + "learning_rate": 0.0008870273824256261, + "loss": 0.0937, + "num_input_tokens_seen": 78437120, + "step": 36340 + }, + { + "epoch": 5.9290375203915175, + "grad_norm": 0.034235499799251556, + "learning_rate": 0.0008869823132309616, + "loss": 0.0286, + "num_input_tokens_seen": 78447456, + "step": 36345 + }, + { + "epoch": 5.929853181076672, + "grad_norm": 0.03851349651813507, + "learning_rate": 0.0008869372361936096, + "loss": 0.0885, + "num_input_tokens_seen": 78457760, + "step": 36350 + }, + { + "epoch": 5.930668841761827, + "grad_norm": 0.09169448912143707, + "learning_rate": 0.0008868921513144835, + "loss": 0.0299, + "num_input_tokens_seen": 78469248, + "step": 36355 + }, + { + "epoch": 5.931484502446982, + "grad_norm": 0.0052831438370049, + "learning_rate": 0.0008868470585944972, + "loss": 0.1102, + "num_input_tokens_seen": 78480864, + "step": 36360 + }, + { + "epoch": 5.932300163132137, + "grad_norm": 0.19256380200386047, + "learning_rate": 0.0008868019580345645, + "loss": 0.1831, + "num_input_tokens_seen": 78492224, + "step": 36365 + }, + { + "epoch": 5.933115823817292, + "grad_norm": 0.05677660554647446, + "learning_rate": 0.0008867568496355996, + "loss": 0.0355, + "num_input_tokens_seen": 78503072, + "step": 36370 + }, + { + "epoch": 5.933931484502447, + "grad_norm": 0.0740382969379425, + "learning_rate": 0.0008867117333985164, + "loss": 0.1976, + "num_input_tokens_seen": 78511744, + "step": 36375 + }, + { + "epoch": 5.934747145187602, + "grad_norm": 0.1961808055639267, + "learning_rate": 0.0008866666093242292, + "loss": 0.182, + "num_input_tokens_seen": 78522176, + "step": 36380 + }, + { + "epoch": 5.935562805872757, + "grad_norm": 0.009373529814183712, + "learning_rate": 0.0008866214774136528, + "loss": 0.0442, + "num_input_tokens_seen": 78532928, + "step": 36385 + }, + { + "epoch": 5.936378466557912, + "grad_norm": 0.026978205889463425, + "learning_rate": 0.0008865763376677017, + "loss": 0.1268, + "num_input_tokens_seen": 78543552, + "step": 36390 + }, + { + "epoch": 5.937194127243067, + "grad_norm": 0.07651624828577042, + "learning_rate": 0.0008865311900872905, + "loss": 0.0705, + "num_input_tokens_seen": 78553824, + "step": 36395 + }, + { + "epoch": 5.938009787928221, + "grad_norm": 0.16284069418907166, + "learning_rate": 0.0008864860346733346, + "loss": 0.0675, + "num_input_tokens_seen": 78565248, + "step": 36400 + }, + { + "epoch": 5.938825448613377, + "grad_norm": 0.10262231528759003, + "learning_rate": 0.0008864408714267489, + "loss": 0.065, + "num_input_tokens_seen": 78577440, + "step": 36405 + }, + { + "epoch": 5.939641109298532, + "grad_norm": 0.08757799863815308, + "learning_rate": 0.0008863957003484486, + "loss": 0.208, + "num_input_tokens_seen": 78588160, + "step": 36410 + }, + { + "epoch": 5.940456769983687, + "grad_norm": 0.13630235195159912, + "learning_rate": 0.0008863505214393494, + "loss": 0.099, + "num_input_tokens_seen": 78598976, + "step": 36415 + }, + { + "epoch": 5.941272430668842, + "grad_norm": 0.047017715871334076, + "learning_rate": 0.0008863053347003667, + "loss": 0.0855, + "num_input_tokens_seen": 78608768, + "step": 36420 + }, + { + "epoch": 5.942088091353996, + "grad_norm": 0.2511025071144104, + "learning_rate": 0.0008862601401324162, + "loss": 0.1175, + "num_input_tokens_seen": 78620736, + "step": 36425 + }, + { + "epoch": 5.942903752039152, + "grad_norm": 0.03130869194865227, + "learning_rate": 0.0008862149377364142, + "loss": 0.1553, + "num_input_tokens_seen": 78629216, + "step": 36430 + }, + { + "epoch": 5.943719412724307, + "grad_norm": 0.037159208208322525, + "learning_rate": 0.0008861697275132763, + "loss": 0.0772, + "num_input_tokens_seen": 78639040, + "step": 36435 + }, + { + "epoch": 5.944535073409462, + "grad_norm": 0.01796979084610939, + "learning_rate": 0.0008861245094639193, + "loss": 0.063, + "num_input_tokens_seen": 78650624, + "step": 36440 + }, + { + "epoch": 5.945350734094617, + "grad_norm": 0.09340013563632965, + "learning_rate": 0.000886079283589259, + "loss": 0.0796, + "num_input_tokens_seen": 78662368, + "step": 36445 + }, + { + "epoch": 5.946166394779771, + "grad_norm": 0.12010854482650757, + "learning_rate": 0.0008860340498902121, + "loss": 0.1331, + "num_input_tokens_seen": 78673984, + "step": 36450 + }, + { + "epoch": 5.946982055464927, + "grad_norm": 0.006237644702196121, + "learning_rate": 0.0008859888083676958, + "loss": 0.0505, + "num_input_tokens_seen": 78683328, + "step": 36455 + }, + { + "epoch": 5.947797716150082, + "grad_norm": 0.4262031614780426, + "learning_rate": 0.0008859435590226266, + "loss": 0.1165, + "num_input_tokens_seen": 78694496, + "step": 36460 + }, + { + "epoch": 5.948613376835237, + "grad_norm": 0.03708004951477051, + "learning_rate": 0.0008858983018559214, + "loss": 0.0336, + "num_input_tokens_seen": 78704832, + "step": 36465 + }, + { + "epoch": 5.9494290375203915, + "grad_norm": 0.2409103810787201, + "learning_rate": 0.0008858530368684977, + "loss": 0.1116, + "num_input_tokens_seen": 78715840, + "step": 36470 + }, + { + "epoch": 5.950244698205546, + "grad_norm": 0.019162973389029503, + "learning_rate": 0.0008858077640612727, + "loss": 0.0519, + "num_input_tokens_seen": 78726592, + "step": 36475 + }, + { + "epoch": 5.951060358890701, + "grad_norm": 0.0374794602394104, + "learning_rate": 0.0008857624834351639, + "loss": 0.0498, + "num_input_tokens_seen": 78738048, + "step": 36480 + }, + { + "epoch": 5.951876019575856, + "grad_norm": 0.12423013150691986, + "learning_rate": 0.000885717194991089, + "loss": 0.0845, + "num_input_tokens_seen": 78749312, + "step": 36485 + }, + { + "epoch": 5.952691680261012, + "grad_norm": 0.018314247950911522, + "learning_rate": 0.0008856718987299656, + "loss": 0.1233, + "num_input_tokens_seen": 78760512, + "step": 36490 + }, + { + "epoch": 5.9535073409461665, + "grad_norm": 0.17948150634765625, + "learning_rate": 0.0008856265946527122, + "loss": 0.0953, + "num_input_tokens_seen": 78771072, + "step": 36495 + }, + { + "epoch": 5.954323001631321, + "grad_norm": 0.0033492485526949167, + "learning_rate": 0.0008855812827602465, + "loss": 0.0367, + "num_input_tokens_seen": 78781376, + "step": 36500 + }, + { + "epoch": 5.955138662316476, + "grad_norm": 0.010910273529589176, + "learning_rate": 0.0008855359630534871, + "loss": 0.1562, + "num_input_tokens_seen": 78792640, + "step": 36505 + }, + { + "epoch": 5.955954323001631, + "grad_norm": 0.11907117068767548, + "learning_rate": 0.0008854906355333522, + "loss": 0.0905, + "num_input_tokens_seen": 78803776, + "step": 36510 + }, + { + "epoch": 5.956769983686787, + "grad_norm": 0.19721025228500366, + "learning_rate": 0.0008854453002007607, + "loss": 0.0603, + "num_input_tokens_seen": 78813440, + "step": 36515 + }, + { + "epoch": 5.9575856443719415, + "grad_norm": 0.005332444794476032, + "learning_rate": 0.0008853999570566311, + "loss": 0.1595, + "num_input_tokens_seen": 78824736, + "step": 36520 + }, + { + "epoch": 5.958401305057096, + "grad_norm": 0.012158348225057125, + "learning_rate": 0.0008853546061018825, + "loss": 0.1121, + "num_input_tokens_seen": 78833920, + "step": 36525 + }, + { + "epoch": 5.959216965742251, + "grad_norm": 0.008253490552306175, + "learning_rate": 0.000885309247337434, + "loss": 0.0635, + "num_input_tokens_seen": 78845376, + "step": 36530 + }, + { + "epoch": 5.960032626427406, + "grad_norm": 0.14148858189582825, + "learning_rate": 0.0008852638807642048, + "loss": 0.0563, + "num_input_tokens_seen": 78858016, + "step": 36535 + }, + { + "epoch": 5.960848287112562, + "grad_norm": 0.04641634225845337, + "learning_rate": 0.0008852185063831142, + "loss": 0.031, + "num_input_tokens_seen": 78869216, + "step": 36540 + }, + { + "epoch": 5.9616639477977165, + "grad_norm": 0.04847164452075958, + "learning_rate": 0.000885173124195082, + "loss": 0.0808, + "num_input_tokens_seen": 78880096, + "step": 36545 + }, + { + "epoch": 5.962479608482871, + "grad_norm": 0.03328290209174156, + "learning_rate": 0.0008851277342010278, + "loss": 0.0864, + "num_input_tokens_seen": 78889664, + "step": 36550 + }, + { + "epoch": 5.963295269168026, + "grad_norm": 0.0033334684558212757, + "learning_rate": 0.0008850823364018715, + "loss": 0.0351, + "num_input_tokens_seen": 78902240, + "step": 36555 + }, + { + "epoch": 5.964110929853181, + "grad_norm": 0.12017543613910675, + "learning_rate": 0.0008850369307985328, + "loss": 0.0602, + "num_input_tokens_seen": 78911872, + "step": 36560 + }, + { + "epoch": 5.964926590538336, + "grad_norm": 0.08754347264766693, + "learning_rate": 0.0008849915173919327, + "loss": 0.0215, + "num_input_tokens_seen": 78922304, + "step": 36565 + }, + { + "epoch": 5.9657422512234906, + "grad_norm": 0.010991050861775875, + "learning_rate": 0.0008849460961829909, + "loss": 0.0413, + "num_input_tokens_seen": 78933408, + "step": 36570 + }, + { + "epoch": 5.966557911908646, + "grad_norm": 0.017036227509379387, + "learning_rate": 0.0008849006671726281, + "loss": 0.0835, + "num_input_tokens_seen": 78944704, + "step": 36575 + }, + { + "epoch": 5.967373572593801, + "grad_norm": 0.005260332487523556, + "learning_rate": 0.0008848552303617651, + "loss": 0.0325, + "num_input_tokens_seen": 78955584, + "step": 36580 + }, + { + "epoch": 5.968189233278956, + "grad_norm": 0.3492783010005951, + "learning_rate": 0.0008848097857513227, + "loss": 0.0976, + "num_input_tokens_seen": 78966272, + "step": 36585 + }, + { + "epoch": 5.969004893964111, + "grad_norm": 0.03630805388092995, + "learning_rate": 0.0008847643333422216, + "loss": 0.222, + "num_input_tokens_seen": 78976608, + "step": 36590 + }, + { + "epoch": 5.9698205546492655, + "grad_norm": 0.04305567219853401, + "learning_rate": 0.0008847188731353833, + "loss": 0.0533, + "num_input_tokens_seen": 78987360, + "step": 36595 + }, + { + "epoch": 5.970636215334421, + "grad_norm": 0.04944748803973198, + "learning_rate": 0.0008846734051317289, + "loss": 0.0337, + "num_input_tokens_seen": 78999040, + "step": 36600 + }, + { + "epoch": 5.971451876019576, + "grad_norm": 0.03273981064558029, + "learning_rate": 0.0008846279293321801, + "loss": 0.0146, + "num_input_tokens_seen": 79010112, + "step": 36605 + }, + { + "epoch": 5.972267536704731, + "grad_norm": 0.09030260890722275, + "learning_rate": 0.0008845824457376583, + "loss": 0.057, + "num_input_tokens_seen": 79020416, + "step": 36610 + }, + { + "epoch": 5.973083197389886, + "grad_norm": 0.14346785843372345, + "learning_rate": 0.0008845369543490853, + "loss": 0.2294, + "num_input_tokens_seen": 79031648, + "step": 36615 + }, + { + "epoch": 5.9738988580750405, + "grad_norm": 0.013122135773301125, + "learning_rate": 0.0008844914551673832, + "loss": 0.0587, + "num_input_tokens_seen": 79042560, + "step": 36620 + }, + { + "epoch": 5.974714518760196, + "grad_norm": 0.22009605169296265, + "learning_rate": 0.000884445948193474, + "loss": 0.1151, + "num_input_tokens_seen": 79052672, + "step": 36625 + }, + { + "epoch": 5.975530179445351, + "grad_norm": 0.19965970516204834, + "learning_rate": 0.0008844004334282801, + "loss": 0.2201, + "num_input_tokens_seen": 79063744, + "step": 36630 + }, + { + "epoch": 5.976345840130506, + "grad_norm": 0.2095831036567688, + "learning_rate": 0.0008843549108727234, + "loss": 0.0597, + "num_input_tokens_seen": 79075424, + "step": 36635 + }, + { + "epoch": 5.977161500815661, + "grad_norm": 0.08232962340116501, + "learning_rate": 0.0008843093805277271, + "loss": 0.0401, + "num_input_tokens_seen": 79085792, + "step": 36640 + }, + { + "epoch": 5.9779771615008155, + "grad_norm": 0.2018812596797943, + "learning_rate": 0.0008842638423942136, + "loss": 0.1165, + "num_input_tokens_seen": 79096704, + "step": 36645 + }, + { + "epoch": 5.97879282218597, + "grad_norm": 0.14320223033428192, + "learning_rate": 0.0008842182964731058, + "loss": 0.1202, + "num_input_tokens_seen": 79107328, + "step": 36650 + }, + { + "epoch": 5.979608482871125, + "grad_norm": 0.2424679547548294, + "learning_rate": 0.0008841727427653269, + "loss": 0.0649, + "num_input_tokens_seen": 79118144, + "step": 36655 + }, + { + "epoch": 5.980424143556281, + "grad_norm": 0.00438620476052165, + "learning_rate": 0.0008841271812717999, + "loss": 0.1438, + "num_input_tokens_seen": 79129568, + "step": 36660 + }, + { + "epoch": 5.981239804241436, + "grad_norm": 0.027949940413236618, + "learning_rate": 0.0008840816119934485, + "loss": 0.0244, + "num_input_tokens_seen": 79139936, + "step": 36665 + }, + { + "epoch": 5.9820554649265905, + "grad_norm": 0.07955466210842133, + "learning_rate": 0.0008840360349311958, + "loss": 0.0734, + "num_input_tokens_seen": 79149664, + "step": 36670 + }, + { + "epoch": 5.982871125611745, + "grad_norm": 0.01684229075908661, + "learning_rate": 0.0008839904500859656, + "loss": 0.0944, + "num_input_tokens_seen": 79161280, + "step": 36675 + }, + { + "epoch": 5.9836867862969, + "grad_norm": 0.0033336025662720203, + "learning_rate": 0.0008839448574586821, + "loss": 0.0763, + "num_input_tokens_seen": 79171488, + "step": 36680 + }, + { + "epoch": 5.984502446982056, + "grad_norm": 0.14795121550559998, + "learning_rate": 0.0008838992570502687, + "loss": 0.0817, + "num_input_tokens_seen": 79182656, + "step": 36685 + }, + { + "epoch": 5.985318107667211, + "grad_norm": 0.009519455023109913, + "learning_rate": 0.0008838536488616499, + "loss": 0.142, + "num_input_tokens_seen": 79194016, + "step": 36690 + }, + { + "epoch": 5.986133768352365, + "grad_norm": 0.028965627774596214, + "learning_rate": 0.0008838080328937501, + "loss": 0.1551, + "num_input_tokens_seen": 79205472, + "step": 36695 + }, + { + "epoch": 5.98694942903752, + "grad_norm": 0.02132982388138771, + "learning_rate": 0.0008837624091474935, + "loss": 0.1005, + "num_input_tokens_seen": 79215904, + "step": 36700 + }, + { + "epoch": 5.987765089722675, + "grad_norm": 0.012585681863129139, + "learning_rate": 0.0008837167776238049, + "loss": 0.0071, + "num_input_tokens_seen": 79227776, + "step": 36705 + }, + { + "epoch": 5.988580750407831, + "grad_norm": 0.003547877538949251, + "learning_rate": 0.0008836711383236089, + "loss": 0.1087, + "num_input_tokens_seen": 79239488, + "step": 36710 + }, + { + "epoch": 5.989396411092986, + "grad_norm": 0.08918604254722595, + "learning_rate": 0.0008836254912478308, + "loss": 0.0757, + "num_input_tokens_seen": 79249600, + "step": 36715 + }, + { + "epoch": 5.99021207177814, + "grad_norm": 0.014293678104877472, + "learning_rate": 0.0008835798363973952, + "loss": 0.1583, + "num_input_tokens_seen": 79259840, + "step": 36720 + }, + { + "epoch": 5.991027732463295, + "grad_norm": 0.03733542561531067, + "learning_rate": 0.0008835341737732276, + "loss": 0.1115, + "num_input_tokens_seen": 79271488, + "step": 36725 + }, + { + "epoch": 5.99184339314845, + "grad_norm": 0.08054745197296143, + "learning_rate": 0.0008834885033762536, + "loss": 0.0598, + "num_input_tokens_seen": 79282752, + "step": 36730 + }, + { + "epoch": 5.992659053833605, + "grad_norm": 0.1246202364563942, + "learning_rate": 0.0008834428252073986, + "loss": 0.1415, + "num_input_tokens_seen": 79294336, + "step": 36735 + }, + { + "epoch": 5.993474714518761, + "grad_norm": 0.1338900327682495, + "learning_rate": 0.0008833971392675882, + "loss": 0.2988, + "num_input_tokens_seen": 79306272, + "step": 36740 + }, + { + "epoch": 5.994290375203915, + "grad_norm": 0.02472054399549961, + "learning_rate": 0.0008833514455577485, + "loss": 0.0344, + "num_input_tokens_seen": 79317120, + "step": 36745 + }, + { + "epoch": 5.99510603588907, + "grad_norm": 0.019241586327552795, + "learning_rate": 0.0008833057440788053, + "loss": 0.0212, + "num_input_tokens_seen": 79327200, + "step": 36750 + }, + { + "epoch": 5.995921696574225, + "grad_norm": 0.057987939566373825, + "learning_rate": 0.000883260034831685, + "loss": 0.0699, + "num_input_tokens_seen": 79337888, + "step": 36755 + }, + { + "epoch": 5.99673735725938, + "grad_norm": 0.07865146547555923, + "learning_rate": 0.000883214317817314, + "loss": 0.0817, + "num_input_tokens_seen": 79348480, + "step": 36760 + }, + { + "epoch": 5.997553017944535, + "grad_norm": 0.06520058959722519, + "learning_rate": 0.0008831685930366187, + "loss": 0.1397, + "num_input_tokens_seen": 79360480, + "step": 36765 + }, + { + "epoch": 5.99836867862969, + "grad_norm": 0.15988144278526306, + "learning_rate": 0.0008831228604905257, + "loss": 0.1051, + "num_input_tokens_seen": 79370912, + "step": 36770 + }, + { + "epoch": 5.999184339314845, + "grad_norm": 0.09811560064554214, + "learning_rate": 0.0008830771201799619, + "loss": 0.0899, + "num_input_tokens_seen": 79380288, + "step": 36775 + }, + { + "epoch": 6.0, + "grad_norm": 0.004885673988610506, + "learning_rate": 0.0008830313721058543, + "loss": 0.0768, + "num_input_tokens_seen": 79389648, + "step": 36780 + }, + { + "epoch": 6.0, + "eval_loss": 0.11071006208658218, + "eval_runtime": 103.8217, + "eval_samples_per_second": 26.247, + "eval_steps_per_second": 6.569, + "num_input_tokens_seen": 79389648, + "step": 36780 + }, + { + "epoch": 6.000815660685155, + "grad_norm": 0.03924135863780975, + "learning_rate": 0.00088298561626913, + "loss": 0.0529, + "num_input_tokens_seen": 79400656, + "step": 36785 + }, + { + "epoch": 6.00163132137031, + "grad_norm": 0.1462024301290512, + "learning_rate": 0.0008829398526707164, + "loss": 0.0565, + "num_input_tokens_seen": 79412656, + "step": 36790 + }, + { + "epoch": 6.002446982055465, + "grad_norm": 0.014498461969196796, + "learning_rate": 0.0008828940813115408, + "loss": 0.0461, + "num_input_tokens_seen": 79424048, + "step": 36795 + }, + { + "epoch": 6.00326264274062, + "grad_norm": 0.01983257569372654, + "learning_rate": 0.000882848302192531, + "loss": 0.0183, + "num_input_tokens_seen": 79434896, + "step": 36800 + }, + { + "epoch": 6.004078303425775, + "grad_norm": 0.00984436459839344, + "learning_rate": 0.0008828025153146147, + "loss": 0.027, + "num_input_tokens_seen": 79445680, + "step": 36805 + }, + { + "epoch": 6.00489396411093, + "grad_norm": 0.05865294113755226, + "learning_rate": 0.0008827567206787197, + "loss": 0.0505, + "num_input_tokens_seen": 79456592, + "step": 36810 + }, + { + "epoch": 6.005709624796085, + "grad_norm": 0.06138975918292999, + "learning_rate": 0.0008827109182857742, + "loss": 0.0732, + "num_input_tokens_seen": 79466992, + "step": 36815 + }, + { + "epoch": 6.006525285481239, + "grad_norm": 0.01654989831149578, + "learning_rate": 0.0008826651081367065, + "loss": 0.0108, + "num_input_tokens_seen": 79477168, + "step": 36820 + }, + { + "epoch": 6.007340946166395, + "grad_norm": 0.1481417417526245, + "learning_rate": 0.0008826192902324449, + "loss": 0.0671, + "num_input_tokens_seen": 79488368, + "step": 36825 + }, + { + "epoch": 6.00815660685155, + "grad_norm": 0.006521583069115877, + "learning_rate": 0.0008825734645739181, + "loss": 0.0932, + "num_input_tokens_seen": 79498736, + "step": 36830 + }, + { + "epoch": 6.008972267536705, + "grad_norm": 0.02719235047698021, + "learning_rate": 0.0008825276311620546, + "loss": 0.1368, + "num_input_tokens_seen": 79508656, + "step": 36835 + }, + { + "epoch": 6.00978792822186, + "grad_norm": 0.18721903860569, + "learning_rate": 0.0008824817899977834, + "loss": 0.1826, + "num_input_tokens_seen": 79519696, + "step": 36840 + }, + { + "epoch": 6.010603588907014, + "grad_norm": 0.009548144415020943, + "learning_rate": 0.0008824359410820335, + "loss": 0.1671, + "num_input_tokens_seen": 79529776, + "step": 36845 + }, + { + "epoch": 6.011419249592169, + "grad_norm": 0.1931152641773224, + "learning_rate": 0.0008823900844157342, + "loss": 0.1174, + "num_input_tokens_seen": 79541392, + "step": 36850 + }, + { + "epoch": 6.012234910277325, + "grad_norm": 0.11581052094697952, + "learning_rate": 0.0008823442199998147, + "loss": 0.0968, + "num_input_tokens_seen": 79552720, + "step": 36855 + }, + { + "epoch": 6.01305057096248, + "grad_norm": 0.1442585587501526, + "learning_rate": 0.0008822983478352044, + "loss": 0.116, + "num_input_tokens_seen": 79563888, + "step": 36860 + }, + { + "epoch": 6.013866231647635, + "grad_norm": 0.040123820304870605, + "learning_rate": 0.0008822524679228332, + "loss": 0.0938, + "num_input_tokens_seen": 79575760, + "step": 36865 + }, + { + "epoch": 6.014681892332789, + "grad_norm": 0.1661331057548523, + "learning_rate": 0.0008822065802636308, + "loss": 0.0455, + "num_input_tokens_seen": 79587408, + "step": 36870 + }, + { + "epoch": 6.015497553017944, + "grad_norm": 0.20198991894721985, + "learning_rate": 0.0008821606848585273, + "loss": 0.1257, + "num_input_tokens_seen": 79598352, + "step": 36875 + }, + { + "epoch": 6.0163132137031, + "grad_norm": 0.20137766003608704, + "learning_rate": 0.0008821147817084526, + "loss": 0.1454, + "num_input_tokens_seen": 79609520, + "step": 36880 + }, + { + "epoch": 6.017128874388255, + "grad_norm": 0.028597760945558548, + "learning_rate": 0.0008820688708143372, + "loss": 0.0285, + "num_input_tokens_seen": 79621712, + "step": 36885 + }, + { + "epoch": 6.0179445350734095, + "grad_norm": 0.09635327011346817, + "learning_rate": 0.0008820229521771112, + "loss": 0.0502, + "num_input_tokens_seen": 79630576, + "step": 36890 + }, + { + "epoch": 6.018760195758564, + "grad_norm": 0.018573066219687462, + "learning_rate": 0.0008819770257977058, + "loss": 0.1161, + "num_input_tokens_seen": 79641360, + "step": 36895 + }, + { + "epoch": 6.019575856443719, + "grad_norm": 0.009165086783468723, + "learning_rate": 0.0008819310916770511, + "loss": 0.0374, + "num_input_tokens_seen": 79651792, + "step": 36900 + }, + { + "epoch": 6.020391517128874, + "grad_norm": 0.006334943231195211, + "learning_rate": 0.0008818851498160785, + "loss": 0.0439, + "num_input_tokens_seen": 79661488, + "step": 36905 + }, + { + "epoch": 6.02120717781403, + "grad_norm": 0.08824716508388519, + "learning_rate": 0.0008818392002157188, + "loss": 0.1064, + "num_input_tokens_seen": 79671856, + "step": 36910 + }, + { + "epoch": 6.0220228384991845, + "grad_norm": 0.004747677128762007, + "learning_rate": 0.0008817932428769033, + "loss": 0.0405, + "num_input_tokens_seen": 79681232, + "step": 36915 + }, + { + "epoch": 6.022838499184339, + "grad_norm": 0.32237428426742554, + "learning_rate": 0.0008817472778005635, + "loss": 0.1397, + "num_input_tokens_seen": 79691728, + "step": 36920 + }, + { + "epoch": 6.023654159869494, + "grad_norm": 0.01097463071346283, + "learning_rate": 0.0008817013049876308, + "loss": 0.0551, + "num_input_tokens_seen": 79702672, + "step": 36925 + }, + { + "epoch": 6.024469820554649, + "grad_norm": 0.037884730845689774, + "learning_rate": 0.0008816553244390368, + "loss": 0.0338, + "num_input_tokens_seen": 79713552, + "step": 36930 + }, + { + "epoch": 6.025285481239805, + "grad_norm": 0.011626508086919785, + "learning_rate": 0.0008816093361557136, + "loss": 0.015, + "num_input_tokens_seen": 79725296, + "step": 36935 + }, + { + "epoch": 6.0261011419249595, + "grad_norm": 0.2154628187417984, + "learning_rate": 0.0008815633401385932, + "loss": 0.1012, + "num_input_tokens_seen": 79735792, + "step": 36940 + }, + { + "epoch": 6.026916802610114, + "grad_norm": 0.006960687227547169, + "learning_rate": 0.0008815173363886075, + "loss": 0.0168, + "num_input_tokens_seen": 79746576, + "step": 36945 + }, + { + "epoch": 6.027732463295269, + "grad_norm": 0.18785996735095978, + "learning_rate": 0.000881471324906689, + "loss": 0.1216, + "num_input_tokens_seen": 79756496, + "step": 36950 + }, + { + "epoch": 6.028548123980424, + "grad_norm": 0.03791455179452896, + "learning_rate": 0.0008814253056937702, + "loss": 0.0373, + "num_input_tokens_seen": 79767600, + "step": 36955 + }, + { + "epoch": 6.029363784665579, + "grad_norm": 0.0031759492121636868, + "learning_rate": 0.0008813792787507837, + "loss": 0.0642, + "num_input_tokens_seen": 79779056, + "step": 36960 + }, + { + "epoch": 6.0301794453507345, + "grad_norm": 0.07088899612426758, + "learning_rate": 0.0008813332440786623, + "loss": 0.0597, + "num_input_tokens_seen": 79790672, + "step": 36965 + }, + { + "epoch": 6.030995106035889, + "grad_norm": 0.3424316942691803, + "learning_rate": 0.0008812872016783389, + "loss": 0.1447, + "num_input_tokens_seen": 79800144, + "step": 36970 + }, + { + "epoch": 6.031810766721044, + "grad_norm": 0.27340343594551086, + "learning_rate": 0.0008812411515507468, + "loss": 0.1174, + "num_input_tokens_seen": 79811056, + "step": 36975 + }, + { + "epoch": 6.032626427406199, + "grad_norm": 0.17439158260822296, + "learning_rate": 0.000881195093696819, + "loss": 0.1308, + "num_input_tokens_seen": 79821616, + "step": 36980 + }, + { + "epoch": 6.033442088091354, + "grad_norm": 0.046110205352306366, + "learning_rate": 0.000881149028117489, + "loss": 0.0613, + "num_input_tokens_seen": 79833104, + "step": 36985 + }, + { + "epoch": 6.034257748776509, + "grad_norm": 0.037269227206707, + "learning_rate": 0.0008811029548136906, + "loss": 0.0424, + "num_input_tokens_seen": 79842832, + "step": 36990 + }, + { + "epoch": 6.035073409461664, + "grad_norm": 0.12348993122577667, + "learning_rate": 0.0008810568737863574, + "loss": 0.044, + "num_input_tokens_seen": 79853840, + "step": 36995 + }, + { + "epoch": 6.035889070146819, + "grad_norm": 0.1268976628780365, + "learning_rate": 0.000881010785036423, + "loss": 0.1123, + "num_input_tokens_seen": 79865488, + "step": 37000 + }, + { + "epoch": 6.036704730831974, + "grad_norm": 0.07820584625005722, + "learning_rate": 0.0008809646885648218, + "loss": 0.1353, + "num_input_tokens_seen": 79874640, + "step": 37005 + }, + { + "epoch": 6.037520391517129, + "grad_norm": 0.1036931574344635, + "learning_rate": 0.000880918584372488, + "loss": 0.0282, + "num_input_tokens_seen": 79885552, + "step": 37010 + }, + { + "epoch": 6.0383360522022835, + "grad_norm": 0.006788986269384623, + "learning_rate": 0.0008808724724603558, + "loss": 0.0262, + "num_input_tokens_seen": 79896336, + "step": 37015 + }, + { + "epoch": 6.039151712887439, + "grad_norm": 0.1779312789440155, + "learning_rate": 0.0008808263528293596, + "loss": 0.1106, + "num_input_tokens_seen": 79907184, + "step": 37020 + }, + { + "epoch": 6.039967373572594, + "grad_norm": 0.1813160479068756, + "learning_rate": 0.0008807802254804344, + "loss": 0.1048, + "num_input_tokens_seen": 79917936, + "step": 37025 + }, + { + "epoch": 6.040783034257749, + "grad_norm": 0.12429571896791458, + "learning_rate": 0.000880734090414515, + "loss": 0.1267, + "num_input_tokens_seen": 79930000, + "step": 37030 + }, + { + "epoch": 6.041598694942904, + "grad_norm": 0.05348210036754608, + "learning_rate": 0.000880687947632536, + "loss": 0.0387, + "num_input_tokens_seen": 79941456, + "step": 37035 + }, + { + "epoch": 6.0424143556280585, + "grad_norm": 0.006949778646230698, + "learning_rate": 0.000880641797135433, + "loss": 0.0312, + "num_input_tokens_seen": 79951440, + "step": 37040 + }, + { + "epoch": 6.043230016313213, + "grad_norm": 0.005517715122550726, + "learning_rate": 0.000880595638924141, + "loss": 0.0289, + "num_input_tokens_seen": 79962064, + "step": 37045 + }, + { + "epoch": 6.044045676998369, + "grad_norm": 0.0062828464433550835, + "learning_rate": 0.0008805494729995957, + "loss": 0.0314, + "num_input_tokens_seen": 79974160, + "step": 37050 + }, + { + "epoch": 6.044861337683524, + "grad_norm": 0.06380785256624222, + "learning_rate": 0.0008805032993627324, + "loss": 0.0218, + "num_input_tokens_seen": 79985072, + "step": 37055 + }, + { + "epoch": 6.045676998368679, + "grad_norm": 0.15414415299892426, + "learning_rate": 0.0008804571180144871, + "loss": 0.0811, + "num_input_tokens_seen": 79996784, + "step": 37060 + }, + { + "epoch": 6.0464926590538335, + "grad_norm": 0.0872289389371872, + "learning_rate": 0.0008804109289557956, + "loss": 0.0338, + "num_input_tokens_seen": 80008496, + "step": 37065 + }, + { + "epoch": 6.047308319738988, + "grad_norm": 0.20813466608524323, + "learning_rate": 0.0008803647321875942, + "loss": 0.0994, + "num_input_tokens_seen": 80019600, + "step": 37070 + }, + { + "epoch": 6.048123980424143, + "grad_norm": 0.013394215144217014, + "learning_rate": 0.0008803185277108188, + "loss": 0.0137, + "num_input_tokens_seen": 80030864, + "step": 37075 + }, + { + "epoch": 6.048939641109299, + "grad_norm": 0.19859327375888824, + "learning_rate": 0.0008802723155264061, + "loss": 0.0927, + "num_input_tokens_seen": 80041936, + "step": 37080 + }, + { + "epoch": 6.049755301794454, + "grad_norm": 0.23632246255874634, + "learning_rate": 0.0008802260956352924, + "loss": 0.0526, + "num_input_tokens_seen": 80053552, + "step": 37085 + }, + { + "epoch": 6.0505709624796085, + "grad_norm": 0.26411524415016174, + "learning_rate": 0.0008801798680384145, + "loss": 0.2545, + "num_input_tokens_seen": 80064688, + "step": 37090 + }, + { + "epoch": 6.051386623164763, + "grad_norm": 0.15187275409698486, + "learning_rate": 0.0008801336327367096, + "loss": 0.0329, + "num_input_tokens_seen": 80076176, + "step": 37095 + }, + { + "epoch": 6.052202283849918, + "grad_norm": 0.014832193031907082, + "learning_rate": 0.0008800873897311141, + "loss": 0.0512, + "num_input_tokens_seen": 80085968, + "step": 37100 + }, + { + "epoch": 6.053017944535074, + "grad_norm": 0.05173990875482559, + "learning_rate": 0.0008800411390225655, + "loss": 0.311, + "num_input_tokens_seen": 80096048, + "step": 37105 + }, + { + "epoch": 6.053833605220229, + "grad_norm": 0.0730687603354454, + "learning_rate": 0.000879994880612001, + "loss": 0.0481, + "num_input_tokens_seen": 80106608, + "step": 37110 + }, + { + "epoch": 6.054649265905383, + "grad_norm": 0.0034919430036097765, + "learning_rate": 0.0008799486145003583, + "loss": 0.0711, + "num_input_tokens_seen": 80116400, + "step": 37115 + }, + { + "epoch": 6.055464926590538, + "grad_norm": 0.2512289881706238, + "learning_rate": 0.0008799023406885751, + "loss": 0.1428, + "num_input_tokens_seen": 80127024, + "step": 37120 + }, + { + "epoch": 6.056280587275693, + "grad_norm": 0.005942865740507841, + "learning_rate": 0.0008798560591775889, + "loss": 0.0934, + "num_input_tokens_seen": 80137936, + "step": 37125 + }, + { + "epoch": 6.057096247960848, + "grad_norm": 0.15824447572231293, + "learning_rate": 0.0008798097699683376, + "loss": 0.0615, + "num_input_tokens_seen": 80148752, + "step": 37130 + }, + { + "epoch": 6.057911908646004, + "grad_norm": 0.07591063529253006, + "learning_rate": 0.0008797634730617598, + "loss": 0.1139, + "num_input_tokens_seen": 80159856, + "step": 37135 + }, + { + "epoch": 6.058727569331158, + "grad_norm": 0.3355007767677307, + "learning_rate": 0.0008797171684587933, + "loss": 0.083, + "num_input_tokens_seen": 80172656, + "step": 37140 + }, + { + "epoch": 6.059543230016313, + "grad_norm": 0.00912111159414053, + "learning_rate": 0.0008796708561603766, + "loss": 0.0257, + "num_input_tokens_seen": 80184240, + "step": 37145 + }, + { + "epoch": 6.060358890701468, + "grad_norm": 0.04706587642431259, + "learning_rate": 0.0008796245361674484, + "loss": 0.0252, + "num_input_tokens_seen": 80195184, + "step": 37150 + }, + { + "epoch": 6.061174551386623, + "grad_norm": 0.21621105074882507, + "learning_rate": 0.0008795782084809473, + "loss": 0.057, + "num_input_tokens_seen": 80205296, + "step": 37155 + }, + { + "epoch": 6.061990212071779, + "grad_norm": 0.0032947231084108353, + "learning_rate": 0.0008795318731018124, + "loss": 0.0443, + "num_input_tokens_seen": 80216496, + "step": 37160 + }, + { + "epoch": 6.062805872756933, + "grad_norm": 0.06971344351768494, + "learning_rate": 0.0008794855300309827, + "loss": 0.038, + "num_input_tokens_seen": 80227280, + "step": 37165 + }, + { + "epoch": 6.063621533442088, + "grad_norm": 0.0855175331234932, + "learning_rate": 0.0008794391792693973, + "loss": 0.1125, + "num_input_tokens_seen": 80238704, + "step": 37170 + }, + { + "epoch": 6.064437194127243, + "grad_norm": 0.3632354140281677, + "learning_rate": 0.0008793928208179955, + "loss": 0.1231, + "num_input_tokens_seen": 80248560, + "step": 37175 + }, + { + "epoch": 6.065252854812398, + "grad_norm": 0.0037957890890538692, + "learning_rate": 0.000879346454677717, + "loss": 0.0402, + "num_input_tokens_seen": 80258448, + "step": 37180 + }, + { + "epoch": 6.066068515497553, + "grad_norm": 0.0072946492582559586, + "learning_rate": 0.0008793000808495012, + "loss": 0.0153, + "num_input_tokens_seen": 80269168, + "step": 37185 + }, + { + "epoch": 6.066884176182708, + "grad_norm": 0.005088200327008963, + "learning_rate": 0.0008792536993342882, + "loss": 0.0198, + "num_input_tokens_seen": 80278672, + "step": 37190 + }, + { + "epoch": 6.067699836867863, + "grad_norm": 0.011431118473410606, + "learning_rate": 0.0008792073101330177, + "loss": 0.0131, + "num_input_tokens_seen": 80290128, + "step": 37195 + }, + { + "epoch": 6.068515497553018, + "grad_norm": 0.006760529242455959, + "learning_rate": 0.00087916091324663, + "loss": 0.1852, + "num_input_tokens_seen": 80300816, + "step": 37200 + }, + { + "epoch": 6.069331158238173, + "grad_norm": 0.0063934046775102615, + "learning_rate": 0.0008791145086760656, + "loss": 0.0307, + "num_input_tokens_seen": 80311120, + "step": 37205 + }, + { + "epoch": 6.070146818923328, + "grad_norm": 0.12538732588291168, + "learning_rate": 0.0008790680964222647, + "loss": 0.0831, + "num_input_tokens_seen": 80321968, + "step": 37210 + }, + { + "epoch": 6.0709624796084825, + "grad_norm": 0.02447766438126564, + "learning_rate": 0.000879021676486168, + "loss": 0.0947, + "num_input_tokens_seen": 80332176, + "step": 37215 + }, + { + "epoch": 6.071778140293638, + "grad_norm": 0.08471429347991943, + "learning_rate": 0.0008789752488687159, + "loss": 0.0326, + "num_input_tokens_seen": 80343376, + "step": 37220 + }, + { + "epoch": 6.072593800978793, + "grad_norm": 0.011862912215292454, + "learning_rate": 0.00087892881357085, + "loss": 0.0138, + "num_input_tokens_seen": 80354032, + "step": 37225 + }, + { + "epoch": 6.073409461663948, + "grad_norm": 0.007405825424939394, + "learning_rate": 0.0008788823705935107, + "loss": 0.0174, + "num_input_tokens_seen": 80365072, + "step": 37230 + }, + { + "epoch": 6.074225122349103, + "grad_norm": 0.017226114869117737, + "learning_rate": 0.0008788359199376396, + "loss": 0.0426, + "num_input_tokens_seen": 80374512, + "step": 37235 + }, + { + "epoch": 6.075040783034257, + "grad_norm": 0.020373638719320297, + "learning_rate": 0.0008787894616041781, + "loss": 0.0326, + "num_input_tokens_seen": 80384368, + "step": 37240 + }, + { + "epoch": 6.075856443719413, + "grad_norm": 0.02591075748205185, + "learning_rate": 0.0008787429955940675, + "loss": 0.0533, + "num_input_tokens_seen": 80395152, + "step": 37245 + }, + { + "epoch": 6.076672104404568, + "grad_norm": 0.13547790050506592, + "learning_rate": 0.0008786965219082497, + "loss": 0.1048, + "num_input_tokens_seen": 80406288, + "step": 37250 + }, + { + "epoch": 6.077487765089723, + "grad_norm": 0.004472584929317236, + "learning_rate": 0.0008786500405476664, + "loss": 0.0507, + "num_input_tokens_seen": 80417040, + "step": 37255 + }, + { + "epoch": 6.078303425774878, + "grad_norm": 0.07085185497999191, + "learning_rate": 0.0008786035515132598, + "loss": 0.1426, + "num_input_tokens_seen": 80428112, + "step": 37260 + }, + { + "epoch": 6.079119086460032, + "grad_norm": 0.0029244578909128904, + "learning_rate": 0.0008785570548059718, + "loss": 0.0481, + "num_input_tokens_seen": 80438288, + "step": 37265 + }, + { + "epoch": 6.079934747145187, + "grad_norm": 0.038664527237415314, + "learning_rate": 0.0008785105504267449, + "loss": 0.3225, + "num_input_tokens_seen": 80449360, + "step": 37270 + }, + { + "epoch": 6.080750407830343, + "grad_norm": 0.1353144645690918, + "learning_rate": 0.0008784640383765215, + "loss": 0.0428, + "num_input_tokens_seen": 80460752, + "step": 37275 + }, + { + "epoch": 6.081566068515498, + "grad_norm": 0.19793492555618286, + "learning_rate": 0.0008784175186562442, + "loss": 0.1059, + "num_input_tokens_seen": 80472048, + "step": 37280 + }, + { + "epoch": 6.082381729200653, + "grad_norm": 0.040153827518224716, + "learning_rate": 0.000878370991266856, + "loss": 0.049, + "num_input_tokens_seen": 80481840, + "step": 37285 + }, + { + "epoch": 6.083197389885807, + "grad_norm": 0.07553903758525848, + "learning_rate": 0.0008783244562092996, + "loss": 0.0167, + "num_input_tokens_seen": 80493648, + "step": 37290 + }, + { + "epoch": 6.084013050570962, + "grad_norm": 0.01873917505145073, + "learning_rate": 0.0008782779134845181, + "loss": 0.1029, + "num_input_tokens_seen": 80503024, + "step": 37295 + }, + { + "epoch": 6.084828711256117, + "grad_norm": 0.06174977123737335, + "learning_rate": 0.0008782313630934548, + "loss": 0.14, + "num_input_tokens_seen": 80513808, + "step": 37300 + }, + { + "epoch": 6.085644371941273, + "grad_norm": 0.008898982778191566, + "learning_rate": 0.0008781848050370531, + "loss": 0.2012, + "num_input_tokens_seen": 80524176, + "step": 37305 + }, + { + "epoch": 6.0864600326264275, + "grad_norm": 0.0153073500841856, + "learning_rate": 0.0008781382393162566, + "loss": 0.0901, + "num_input_tokens_seen": 80536176, + "step": 37310 + }, + { + "epoch": 6.087275693311582, + "grad_norm": 0.0049950238317251205, + "learning_rate": 0.0008780916659320091, + "loss": 0.0777, + "num_input_tokens_seen": 80546960, + "step": 37315 + }, + { + "epoch": 6.088091353996737, + "grad_norm": 0.17021247744560242, + "learning_rate": 0.0008780450848852541, + "loss": 0.1671, + "num_input_tokens_seen": 80557936, + "step": 37320 + }, + { + "epoch": 6.088907014681892, + "grad_norm": 0.19208839535713196, + "learning_rate": 0.0008779984961769361, + "loss": 0.0874, + "num_input_tokens_seen": 80567728, + "step": 37325 + }, + { + "epoch": 6.089722675367048, + "grad_norm": 0.0050652166828513145, + "learning_rate": 0.0008779518998079988, + "loss": 0.0496, + "num_input_tokens_seen": 80579184, + "step": 37330 + }, + { + "epoch": 6.0905383360522025, + "grad_norm": 0.10569914430379868, + "learning_rate": 0.000877905295779387, + "loss": 0.0591, + "num_input_tokens_seen": 80589872, + "step": 37335 + }, + { + "epoch": 6.091353996737357, + "grad_norm": 0.022624365985393524, + "learning_rate": 0.0008778586840920449, + "loss": 0.0374, + "num_input_tokens_seen": 80599888, + "step": 37340 + }, + { + "epoch": 6.092169657422512, + "grad_norm": 0.179793581366539, + "learning_rate": 0.0008778120647469172, + "loss": 0.0758, + "num_input_tokens_seen": 80610384, + "step": 37345 + }, + { + "epoch": 6.092985318107667, + "grad_norm": 0.025115404278039932, + "learning_rate": 0.0008777654377449487, + "loss": 0.1215, + "num_input_tokens_seen": 80621456, + "step": 37350 + }, + { + "epoch": 6.093800978792822, + "grad_norm": 0.004806799814105034, + "learning_rate": 0.0008777188030870845, + "loss": 0.0885, + "num_input_tokens_seen": 80632208, + "step": 37355 + }, + { + "epoch": 6.0946166394779775, + "grad_norm": 0.12998010218143463, + "learning_rate": 0.0008776721607742695, + "loss": 0.0586, + "num_input_tokens_seen": 80642736, + "step": 37360 + }, + { + "epoch": 6.095432300163132, + "grad_norm": 0.17278088629245758, + "learning_rate": 0.0008776255108074489, + "loss": 0.1218, + "num_input_tokens_seen": 80654064, + "step": 37365 + }, + { + "epoch": 6.096247960848287, + "grad_norm": 0.28417882323265076, + "learning_rate": 0.0008775788531875685, + "loss": 0.0943, + "num_input_tokens_seen": 80664880, + "step": 37370 + }, + { + "epoch": 6.097063621533442, + "grad_norm": 0.11595521867275238, + "learning_rate": 0.0008775321879155735, + "loss": 0.0377, + "num_input_tokens_seen": 80676720, + "step": 37375 + }, + { + "epoch": 6.097879282218597, + "grad_norm": 0.05582326278090477, + "learning_rate": 0.0008774855149924099, + "loss": 0.0228, + "num_input_tokens_seen": 80688240, + "step": 37380 + }, + { + "epoch": 6.0986949429037525, + "grad_norm": 0.026308411732316017, + "learning_rate": 0.0008774388344190234, + "loss": 0.0213, + "num_input_tokens_seen": 80699216, + "step": 37385 + }, + { + "epoch": 6.099510603588907, + "grad_norm": 0.06510169059038162, + "learning_rate": 0.0008773921461963601, + "loss": 0.1001, + "num_input_tokens_seen": 80710192, + "step": 37390 + }, + { + "epoch": 6.100326264274062, + "grad_norm": 0.14889170229434967, + "learning_rate": 0.0008773454503253662, + "loss": 0.0735, + "num_input_tokens_seen": 80720880, + "step": 37395 + }, + { + "epoch": 6.101141924959217, + "grad_norm": 0.037764452397823334, + "learning_rate": 0.0008772987468069881, + "loss": 0.0479, + "num_input_tokens_seen": 80731760, + "step": 37400 + }, + { + "epoch": 6.101957585644372, + "grad_norm": 0.00787284690886736, + "learning_rate": 0.0008772520356421723, + "loss": 0.0705, + "num_input_tokens_seen": 80741712, + "step": 37405 + }, + { + "epoch": 6.102773246329527, + "grad_norm": 0.07853878289461136, + "learning_rate": 0.0008772053168318653, + "loss": 0.1571, + "num_input_tokens_seen": 80753264, + "step": 37410 + }, + { + "epoch": 6.103588907014682, + "grad_norm": 0.18741370737552643, + "learning_rate": 0.000877158590377014, + "loss": 0.0679, + "num_input_tokens_seen": 80764816, + "step": 37415 + }, + { + "epoch": 6.104404567699837, + "grad_norm": 0.05173123627901077, + "learning_rate": 0.0008771118562785656, + "loss": 0.1482, + "num_input_tokens_seen": 80774320, + "step": 37420 + }, + { + "epoch": 6.105220228384992, + "grad_norm": 0.003064174670726061, + "learning_rate": 0.0008770651145374669, + "loss": 0.029, + "num_input_tokens_seen": 80784976, + "step": 37425 + }, + { + "epoch": 6.106035889070147, + "grad_norm": 0.020462844520807266, + "learning_rate": 0.0008770183651546653, + "loss": 0.1379, + "num_input_tokens_seen": 80794672, + "step": 37430 + }, + { + "epoch": 6.1068515497553015, + "grad_norm": 0.029417170211672783, + "learning_rate": 0.0008769716081311083, + "loss": 0.0177, + "num_input_tokens_seen": 80805424, + "step": 37435 + }, + { + "epoch": 6.107667210440456, + "grad_norm": 0.20759232342243195, + "learning_rate": 0.0008769248434677434, + "loss": 0.1041, + "num_input_tokens_seen": 80816240, + "step": 37440 + }, + { + "epoch": 6.108482871125612, + "grad_norm": 0.005841630510985851, + "learning_rate": 0.0008768780711655185, + "loss": 0.1039, + "num_input_tokens_seen": 80825744, + "step": 37445 + }, + { + "epoch": 6.109298531810767, + "grad_norm": 0.24544957280158997, + "learning_rate": 0.0008768312912253811, + "loss": 0.116, + "num_input_tokens_seen": 80835984, + "step": 37450 + }, + { + "epoch": 6.110114192495922, + "grad_norm": 0.1584145426750183, + "learning_rate": 0.0008767845036482798, + "loss": 0.0625, + "num_input_tokens_seen": 80847184, + "step": 37455 + }, + { + "epoch": 6.1109298531810765, + "grad_norm": 0.0656294971704483, + "learning_rate": 0.0008767377084351625, + "loss": 0.0701, + "num_input_tokens_seen": 80858608, + "step": 37460 + }, + { + "epoch": 6.111745513866231, + "grad_norm": 0.010715468786656857, + "learning_rate": 0.0008766909055869777, + "loss": 0.0977, + "num_input_tokens_seen": 80869584, + "step": 37465 + }, + { + "epoch": 6.112561174551387, + "grad_norm": 0.1879115253686905, + "learning_rate": 0.0008766440951046736, + "loss": 0.1036, + "num_input_tokens_seen": 80879408, + "step": 37470 + }, + { + "epoch": 6.113376835236542, + "grad_norm": 0.00635351799428463, + "learning_rate": 0.0008765972769891993, + "loss": 0.046, + "num_input_tokens_seen": 80890032, + "step": 37475 + }, + { + "epoch": 6.114192495921697, + "grad_norm": 0.011777734383940697, + "learning_rate": 0.0008765504512415033, + "loss": 0.1636, + "num_input_tokens_seen": 80902160, + "step": 37480 + }, + { + "epoch": 6.1150081566068515, + "grad_norm": 0.004878508858382702, + "learning_rate": 0.0008765036178625347, + "loss": 0.2295, + "num_input_tokens_seen": 80914032, + "step": 37485 + }, + { + "epoch": 6.115823817292006, + "grad_norm": 0.021546348929405212, + "learning_rate": 0.0008764567768532427, + "loss": 0.0351, + "num_input_tokens_seen": 80924912, + "step": 37490 + }, + { + "epoch": 6.116639477977161, + "grad_norm": 0.05526890233159065, + "learning_rate": 0.0008764099282145767, + "loss": 0.0984, + "num_input_tokens_seen": 80934544, + "step": 37495 + }, + { + "epoch": 6.117455138662317, + "grad_norm": 0.09278127551078796, + "learning_rate": 0.0008763630719474857, + "loss": 0.0937, + "num_input_tokens_seen": 80945456, + "step": 37500 + }, + { + "epoch": 6.118270799347472, + "grad_norm": 0.011443572118878365, + "learning_rate": 0.0008763162080529199, + "loss": 0.0428, + "num_input_tokens_seen": 80956368, + "step": 37505 + }, + { + "epoch": 6.1190864600326265, + "grad_norm": 0.0029998458921909332, + "learning_rate": 0.0008762693365318286, + "loss": 0.0208, + "num_input_tokens_seen": 80967280, + "step": 37510 + }, + { + "epoch": 6.119902120717781, + "grad_norm": 0.011095762252807617, + "learning_rate": 0.0008762224573851619, + "loss": 0.0588, + "num_input_tokens_seen": 80978320, + "step": 37515 + }, + { + "epoch": 6.120717781402936, + "grad_norm": 0.1943042278289795, + "learning_rate": 0.0008761755706138698, + "loss": 0.1323, + "num_input_tokens_seen": 80988624, + "step": 37520 + }, + { + "epoch": 6.121533442088092, + "grad_norm": 0.11927758902311325, + "learning_rate": 0.0008761286762189027, + "loss": 0.1835, + "num_input_tokens_seen": 81000560, + "step": 37525 + }, + { + "epoch": 6.122349102773247, + "grad_norm": 0.012342249043285847, + "learning_rate": 0.0008760817742012106, + "loss": 0.018, + "num_input_tokens_seen": 81011408, + "step": 37530 + }, + { + "epoch": 6.123164763458401, + "grad_norm": 0.024045079946517944, + "learning_rate": 0.0008760348645617444, + "loss": 0.1888, + "num_input_tokens_seen": 81023472, + "step": 37535 + }, + { + "epoch": 6.123980424143556, + "grad_norm": 0.2295476496219635, + "learning_rate": 0.0008759879473014545, + "loss": 0.2341, + "num_input_tokens_seen": 81033968, + "step": 37540 + }, + { + "epoch": 6.124796084828711, + "grad_norm": 0.06232830509543419, + "learning_rate": 0.000875941022421292, + "loss": 0.0227, + "num_input_tokens_seen": 81044304, + "step": 37545 + }, + { + "epoch": 6.125611745513866, + "grad_norm": 0.012648412026464939, + "learning_rate": 0.0008758940899222077, + "loss": 0.0267, + "num_input_tokens_seen": 81055056, + "step": 37550 + }, + { + "epoch": 6.126427406199022, + "grad_norm": 0.06942897289991379, + "learning_rate": 0.0008758471498051528, + "loss": 0.0527, + "num_input_tokens_seen": 81066544, + "step": 37555 + }, + { + "epoch": 6.127243066884176, + "grad_norm": 0.027399610728025436, + "learning_rate": 0.0008758002020710787, + "loss": 0.0342, + "num_input_tokens_seen": 81077360, + "step": 37560 + }, + { + "epoch": 6.128058727569331, + "grad_norm": 0.09616296738386154, + "learning_rate": 0.0008757532467209367, + "loss": 0.0296, + "num_input_tokens_seen": 81088816, + "step": 37565 + }, + { + "epoch": 6.128874388254486, + "grad_norm": 0.03668958693742752, + "learning_rate": 0.0008757062837556784, + "loss": 0.0725, + "num_input_tokens_seen": 81101488, + "step": 37570 + }, + { + "epoch": 6.129690048939641, + "grad_norm": 0.021958520635962486, + "learning_rate": 0.0008756593131762557, + "loss": 0.0623, + "num_input_tokens_seen": 81111568, + "step": 37575 + }, + { + "epoch": 6.130505709624796, + "grad_norm": 0.02801474742591381, + "learning_rate": 0.0008756123349836206, + "loss": 0.0324, + "num_input_tokens_seen": 81121840, + "step": 37580 + }, + { + "epoch": 6.131321370309951, + "grad_norm": 0.12095033377408981, + "learning_rate": 0.0008755653491787249, + "loss": 0.0366, + "num_input_tokens_seen": 81132720, + "step": 37585 + }, + { + "epoch": 6.132137030995106, + "grad_norm": 0.17683523893356323, + "learning_rate": 0.000875518355762521, + "loss": 0.108, + "num_input_tokens_seen": 81143536, + "step": 37590 + }, + { + "epoch": 6.132952691680261, + "grad_norm": 0.2020643949508667, + "learning_rate": 0.0008754713547359612, + "loss": 0.1706, + "num_input_tokens_seen": 81154096, + "step": 37595 + }, + { + "epoch": 6.133768352365416, + "grad_norm": 0.23885972797870636, + "learning_rate": 0.0008754243460999982, + "loss": 0.0585, + "num_input_tokens_seen": 81165104, + "step": 37600 + }, + { + "epoch": 6.134584013050571, + "grad_norm": 0.43120378255844116, + "learning_rate": 0.0008753773298555844, + "loss": 0.1102, + "num_input_tokens_seen": 81175952, + "step": 37605 + }, + { + "epoch": 6.135399673735726, + "grad_norm": 0.1402469128370285, + "learning_rate": 0.0008753303060036728, + "loss": 0.0851, + "num_input_tokens_seen": 81186608, + "step": 37610 + }, + { + "epoch": 6.136215334420881, + "grad_norm": 0.1493108570575714, + "learning_rate": 0.0008752832745452166, + "loss": 0.112, + "num_input_tokens_seen": 81198128, + "step": 37615 + }, + { + "epoch": 6.137030995106036, + "grad_norm": 0.06419505178928375, + "learning_rate": 0.0008752362354811686, + "loss": 0.1085, + "num_input_tokens_seen": 81207792, + "step": 37620 + }, + { + "epoch": 6.137846655791191, + "grad_norm": 0.1388806849718094, + "learning_rate": 0.0008751891888124823, + "loss": 0.0339, + "num_input_tokens_seen": 81219152, + "step": 37625 + }, + { + "epoch": 6.138662316476346, + "grad_norm": 0.06181103736162186, + "learning_rate": 0.0008751421345401111, + "loss": 0.026, + "num_input_tokens_seen": 81230800, + "step": 37630 + }, + { + "epoch": 6.1394779771615005, + "grad_norm": 0.06539842486381531, + "learning_rate": 0.0008750950726650089, + "loss": 0.0426, + "num_input_tokens_seen": 81243536, + "step": 37635 + }, + { + "epoch": 6.140293637846656, + "grad_norm": 0.0017602070001885295, + "learning_rate": 0.0008750480031881289, + "loss": 0.015, + "num_input_tokens_seen": 81255344, + "step": 37640 + }, + { + "epoch": 6.141109298531811, + "grad_norm": 0.005246597807854414, + "learning_rate": 0.0008750009261104255, + "loss": 0.013, + "num_input_tokens_seen": 81266544, + "step": 37645 + }, + { + "epoch": 6.141924959216966, + "grad_norm": 0.253149151802063, + "learning_rate": 0.0008749538414328525, + "loss": 0.1709, + "num_input_tokens_seen": 81276976, + "step": 37650 + }, + { + "epoch": 6.142740619902121, + "grad_norm": 0.003204663749784231, + "learning_rate": 0.0008749067491563643, + "loss": 0.0491, + "num_input_tokens_seen": 81288656, + "step": 37655 + }, + { + "epoch": 6.143556280587275, + "grad_norm": 0.22727282345294952, + "learning_rate": 0.0008748596492819152, + "loss": 0.1496, + "num_input_tokens_seen": 81298928, + "step": 37660 + }, + { + "epoch": 6.14437194127243, + "grad_norm": 0.004330330993980169, + "learning_rate": 0.0008748125418104598, + "loss": 0.1258, + "num_input_tokens_seen": 81310224, + "step": 37665 + }, + { + "epoch": 6.145187601957586, + "grad_norm": 0.01711408421397209, + "learning_rate": 0.0008747654267429526, + "loss": 0.0285, + "num_input_tokens_seen": 81321616, + "step": 37670 + }, + { + "epoch": 6.146003262642741, + "grad_norm": 0.021236281841993332, + "learning_rate": 0.0008747183040803488, + "loss": 0.0886, + "num_input_tokens_seen": 81332656, + "step": 37675 + }, + { + "epoch": 6.146818923327896, + "grad_norm": 0.11086190491914749, + "learning_rate": 0.000874671173823603, + "loss": 0.1121, + "num_input_tokens_seen": 81343440, + "step": 37680 + }, + { + "epoch": 6.14763458401305, + "grad_norm": 0.04109755530953407, + "learning_rate": 0.0008746240359736708, + "loss": 0.088, + "num_input_tokens_seen": 81354672, + "step": 37685 + }, + { + "epoch": 6.148450244698205, + "grad_norm": 0.21775461733341217, + "learning_rate": 0.0008745768905315072, + "loss": 0.0773, + "num_input_tokens_seen": 81365808, + "step": 37690 + }, + { + "epoch": 6.149265905383361, + "grad_norm": 0.10789090394973755, + "learning_rate": 0.0008745297374980676, + "loss": 0.0358, + "num_input_tokens_seen": 81375856, + "step": 37695 + }, + { + "epoch": 6.150081566068516, + "grad_norm": 0.25075653195381165, + "learning_rate": 0.0008744825768743079, + "loss": 0.036, + "num_input_tokens_seen": 81387184, + "step": 37700 + }, + { + "epoch": 6.150897226753671, + "grad_norm": 0.04999377951025963, + "learning_rate": 0.0008744354086611837, + "loss": 0.091, + "num_input_tokens_seen": 81397712, + "step": 37705 + }, + { + "epoch": 6.151712887438825, + "grad_norm": 0.005171324126422405, + "learning_rate": 0.0008743882328596509, + "loss": 0.0438, + "num_input_tokens_seen": 81408592, + "step": 37710 + }, + { + "epoch": 6.15252854812398, + "grad_norm": 0.28692787885665894, + "learning_rate": 0.0008743410494706655, + "loss": 0.1294, + "num_input_tokens_seen": 81420176, + "step": 37715 + }, + { + "epoch": 6.153344208809135, + "grad_norm": 0.02814704366028309, + "learning_rate": 0.0008742938584951841, + "loss": 0.1065, + "num_input_tokens_seen": 81431920, + "step": 37720 + }, + { + "epoch": 6.154159869494291, + "grad_norm": 0.09004232287406921, + "learning_rate": 0.0008742466599341625, + "loss": 0.2108, + "num_input_tokens_seen": 81443312, + "step": 37725 + }, + { + "epoch": 6.1549755301794455, + "grad_norm": 0.01479080505669117, + "learning_rate": 0.0008741994537885578, + "loss": 0.0549, + "num_input_tokens_seen": 81454736, + "step": 37730 + }, + { + "epoch": 6.1557911908646, + "grad_norm": 0.18181893229484558, + "learning_rate": 0.0008741522400593265, + "loss": 0.3745, + "num_input_tokens_seen": 81465968, + "step": 37735 + }, + { + "epoch": 6.156606851549755, + "grad_norm": 0.022431721910834312, + "learning_rate": 0.0008741050187474253, + "loss": 0.0585, + "num_input_tokens_seen": 81477264, + "step": 37740 + }, + { + "epoch": 6.15742251223491, + "grad_norm": 0.1915615350008011, + "learning_rate": 0.0008740577898538114, + "loss": 0.0816, + "num_input_tokens_seen": 81488080, + "step": 37745 + }, + { + "epoch": 6.158238172920065, + "grad_norm": 0.02398707903921604, + "learning_rate": 0.0008740105533794417, + "loss": 0.1117, + "num_input_tokens_seen": 81499536, + "step": 37750 + }, + { + "epoch": 6.1590538336052205, + "grad_norm": 0.01506173424422741, + "learning_rate": 0.0008739633093252738, + "loss": 0.0544, + "num_input_tokens_seen": 81511184, + "step": 37755 + }, + { + "epoch": 6.159869494290375, + "grad_norm": 0.05349522829055786, + "learning_rate": 0.0008739160576922649, + "loss": 0.1383, + "num_input_tokens_seen": 81521648, + "step": 37760 + }, + { + "epoch": 6.16068515497553, + "grad_norm": 0.005421073641628027, + "learning_rate": 0.0008738687984813729, + "loss": 0.0233, + "num_input_tokens_seen": 81532016, + "step": 37765 + }, + { + "epoch": 6.161500815660685, + "grad_norm": 0.1244489774107933, + "learning_rate": 0.0008738215316935554, + "loss": 0.0526, + "num_input_tokens_seen": 81543440, + "step": 37770 + }, + { + "epoch": 6.16231647634584, + "grad_norm": 0.13033387064933777, + "learning_rate": 0.0008737742573297702, + "loss": 0.0722, + "num_input_tokens_seen": 81553680, + "step": 37775 + }, + { + "epoch": 6.1631321370309955, + "grad_norm": 0.04336000606417656, + "learning_rate": 0.0008737269753909757, + "loss": 0.0173, + "num_input_tokens_seen": 81564944, + "step": 37780 + }, + { + "epoch": 6.16394779771615, + "grad_norm": 0.13138771057128906, + "learning_rate": 0.0008736796858781297, + "loss": 0.1151, + "num_input_tokens_seen": 81575376, + "step": 37785 + }, + { + "epoch": 6.164763458401305, + "grad_norm": 0.19995476305484772, + "learning_rate": 0.0008736323887921911, + "loss": 0.1483, + "num_input_tokens_seen": 81585840, + "step": 37790 + }, + { + "epoch": 6.16557911908646, + "grad_norm": 0.2771688401699066, + "learning_rate": 0.0008735850841341179, + "loss": 0.1165, + "num_input_tokens_seen": 81597072, + "step": 37795 + }, + { + "epoch": 6.166394779771615, + "grad_norm": 0.040150150656700134, + "learning_rate": 0.0008735377719048692, + "loss": 0.0842, + "num_input_tokens_seen": 81608336, + "step": 37800 + }, + { + "epoch": 6.16721044045677, + "grad_norm": 0.04477156326174736, + "learning_rate": 0.0008734904521054037, + "loss": 0.0838, + "num_input_tokens_seen": 81619440, + "step": 37805 + }, + { + "epoch": 6.168026101141925, + "grad_norm": 0.16481122374534607, + "learning_rate": 0.0008734431247366803, + "loss": 0.0963, + "num_input_tokens_seen": 81629648, + "step": 37810 + }, + { + "epoch": 6.16884176182708, + "grad_norm": 0.003133920254185796, + "learning_rate": 0.0008733957897996583, + "loss": 0.1759, + "num_input_tokens_seen": 81639408, + "step": 37815 + }, + { + "epoch": 6.169657422512235, + "grad_norm": 0.03938305005431175, + "learning_rate": 0.0008733484472952969, + "loss": 0.0562, + "num_input_tokens_seen": 81650704, + "step": 37820 + }, + { + "epoch": 6.17047308319739, + "grad_norm": 0.0179119985550642, + "learning_rate": 0.0008733010972245554, + "loss": 0.0896, + "num_input_tokens_seen": 81661328, + "step": 37825 + }, + { + "epoch": 6.171288743882545, + "grad_norm": 0.0900697335600853, + "learning_rate": 0.0008732537395883938, + "loss": 0.0404, + "num_input_tokens_seen": 81671728, + "step": 37830 + }, + { + "epoch": 6.1721044045677, + "grad_norm": 0.022646507248282433, + "learning_rate": 0.0008732063743877716, + "loss": 0.0345, + "num_input_tokens_seen": 81681936, + "step": 37835 + }, + { + "epoch": 6.172920065252855, + "grad_norm": 0.11102987080812454, + "learning_rate": 0.0008731590016236489, + "loss": 0.0822, + "num_input_tokens_seen": 81692592, + "step": 37840 + }, + { + "epoch": 6.17373572593801, + "grad_norm": 0.30092403292655945, + "learning_rate": 0.0008731116212969856, + "loss": 0.0667, + "num_input_tokens_seen": 81703696, + "step": 37845 + }, + { + "epoch": 6.174551386623165, + "grad_norm": 0.001498049939982593, + "learning_rate": 0.000873064233408742, + "loss": 0.0491, + "num_input_tokens_seen": 81714448, + "step": 37850 + }, + { + "epoch": 6.1753670473083195, + "grad_norm": 0.011218324303627014, + "learning_rate": 0.0008730168379598782, + "loss": 0.1801, + "num_input_tokens_seen": 81724560, + "step": 37855 + }, + { + "epoch": 6.176182707993474, + "grad_norm": 0.008367417380213737, + "learning_rate": 0.0008729694349513552, + "loss": 0.0093, + "num_input_tokens_seen": 81734384, + "step": 37860 + }, + { + "epoch": 6.17699836867863, + "grad_norm": 0.036906685680150986, + "learning_rate": 0.0008729220243841334, + "loss": 0.0291, + "num_input_tokens_seen": 81745552, + "step": 37865 + }, + { + "epoch": 6.177814029363785, + "grad_norm": 0.12137093394994736, + "learning_rate": 0.0008728746062591737, + "loss": 0.0342, + "num_input_tokens_seen": 81756144, + "step": 37870 + }, + { + "epoch": 6.17862969004894, + "grad_norm": 0.2372027337551117, + "learning_rate": 0.0008728271805774371, + "loss": 0.1163, + "num_input_tokens_seen": 81767024, + "step": 37875 + }, + { + "epoch": 6.1794453507340945, + "grad_norm": 0.32025131583213806, + "learning_rate": 0.0008727797473398846, + "loss": 0.1599, + "num_input_tokens_seen": 81777488, + "step": 37880 + }, + { + "epoch": 6.180261011419249, + "grad_norm": 0.009555737487971783, + "learning_rate": 0.0008727323065474778, + "loss": 0.2086, + "num_input_tokens_seen": 81788624, + "step": 37885 + }, + { + "epoch": 6.181076672104404, + "grad_norm": 0.009936968795955181, + "learning_rate": 0.000872684858201178, + "loss": 0.0363, + "num_input_tokens_seen": 81798224, + "step": 37890 + }, + { + "epoch": 6.18189233278956, + "grad_norm": 0.014876188710331917, + "learning_rate": 0.0008726374023019465, + "loss": 0.0402, + "num_input_tokens_seen": 81808976, + "step": 37895 + }, + { + "epoch": 6.182707993474715, + "grad_norm": 0.2267504632472992, + "learning_rate": 0.0008725899388507454, + "loss": 0.1565, + "num_input_tokens_seen": 81819760, + "step": 37900 + }, + { + "epoch": 6.1835236541598695, + "grad_norm": 0.011921958066523075, + "learning_rate": 0.0008725424678485366, + "loss": 0.1149, + "num_input_tokens_seen": 81830416, + "step": 37905 + }, + { + "epoch": 6.184339314845024, + "grad_norm": 0.05466487631201744, + "learning_rate": 0.0008724949892962821, + "loss": 0.1527, + "num_input_tokens_seen": 81842416, + "step": 37910 + }, + { + "epoch": 6.185154975530179, + "grad_norm": 0.012519368901848793, + "learning_rate": 0.0008724475031949441, + "loss": 0.0462, + "num_input_tokens_seen": 81852080, + "step": 37915 + }, + { + "epoch": 6.185970636215335, + "grad_norm": 0.030443713068962097, + "learning_rate": 0.0008724000095454849, + "loss": 0.1112, + "num_input_tokens_seen": 81862096, + "step": 37920 + }, + { + "epoch": 6.18678629690049, + "grad_norm": 0.18824563920497894, + "learning_rate": 0.0008723525083488671, + "loss": 0.0374, + "num_input_tokens_seen": 81872816, + "step": 37925 + }, + { + "epoch": 6.1876019575856445, + "grad_norm": 0.1912299394607544, + "learning_rate": 0.0008723049996060534, + "loss": 0.1121, + "num_input_tokens_seen": 81883120, + "step": 37930 + }, + { + "epoch": 6.188417618270799, + "grad_norm": 0.022196372970938683, + "learning_rate": 0.0008722574833180065, + "loss": 0.0202, + "num_input_tokens_seen": 81893904, + "step": 37935 + }, + { + "epoch": 6.189233278955954, + "grad_norm": 0.004304963164031506, + "learning_rate": 0.0008722099594856895, + "loss": 0.2117, + "num_input_tokens_seen": 81904368, + "step": 37940 + }, + { + "epoch": 6.190048939641109, + "grad_norm": 0.04400643706321716, + "learning_rate": 0.0008721624281100655, + "loss": 0.0555, + "num_input_tokens_seen": 81915664, + "step": 37945 + }, + { + "epoch": 6.190864600326265, + "grad_norm": 0.01684301719069481, + "learning_rate": 0.0008721148891920978, + "loss": 0.0195, + "num_input_tokens_seen": 81926736, + "step": 37950 + }, + { + "epoch": 6.191680261011419, + "grad_norm": 0.021238630637526512, + "learning_rate": 0.0008720673427327496, + "loss": 0.0132, + "num_input_tokens_seen": 81937552, + "step": 37955 + }, + { + "epoch": 6.192495921696574, + "grad_norm": 0.012611279264092445, + "learning_rate": 0.0008720197887329851, + "loss": 0.0439, + "num_input_tokens_seen": 81947120, + "step": 37960 + }, + { + "epoch": 6.193311582381729, + "grad_norm": 0.20121148228645325, + "learning_rate": 0.0008719722271937673, + "loss": 0.0907, + "num_input_tokens_seen": 81955216, + "step": 37965 + }, + { + "epoch": 6.194127243066884, + "grad_norm": 0.21681632101535797, + "learning_rate": 0.0008719246581160606, + "loss": 0.2727, + "num_input_tokens_seen": 81965360, + "step": 37970 + }, + { + "epoch": 6.19494290375204, + "grad_norm": 0.1289648562669754, + "learning_rate": 0.0008718770815008288, + "loss": 0.177, + "num_input_tokens_seen": 81976112, + "step": 37975 + }, + { + "epoch": 6.195758564437194, + "grad_norm": 0.050753261893987656, + "learning_rate": 0.0008718294973490362, + "loss": 0.0828, + "num_input_tokens_seen": 81987568, + "step": 37980 + }, + { + "epoch": 6.196574225122349, + "grad_norm": 0.1909029483795166, + "learning_rate": 0.0008717819056616472, + "loss": 0.1164, + "num_input_tokens_seen": 81998096, + "step": 37985 + }, + { + "epoch": 6.197389885807504, + "grad_norm": 0.006293233949691057, + "learning_rate": 0.0008717343064396262, + "loss": 0.0118, + "num_input_tokens_seen": 82009872, + "step": 37990 + }, + { + "epoch": 6.198205546492659, + "grad_norm": 0.011111578904092312, + "learning_rate": 0.0008716866996839378, + "loss": 0.0378, + "num_input_tokens_seen": 82020080, + "step": 37995 + }, + { + "epoch": 6.199021207177814, + "grad_norm": 0.005487168673425913, + "learning_rate": 0.0008716390853955472, + "loss": 0.0201, + "num_input_tokens_seen": 82030416, + "step": 38000 + }, + { + "epoch": 6.199836867862969, + "grad_norm": 0.03217056021094322, + "learning_rate": 0.0008715914635754187, + "loss": 0.0386, + "num_input_tokens_seen": 82041296, + "step": 38005 + }, + { + "epoch": 6.200652528548124, + "grad_norm": 0.014048619195818901, + "learning_rate": 0.0008715438342245181, + "loss": 0.0634, + "num_input_tokens_seen": 82051024, + "step": 38010 + }, + { + "epoch": 6.201468189233279, + "grad_norm": 0.22043456137180328, + "learning_rate": 0.0008714961973438103, + "loss": 0.1892, + "num_input_tokens_seen": 82062288, + "step": 38015 + }, + { + "epoch": 6.202283849918434, + "grad_norm": 0.005672913044691086, + "learning_rate": 0.0008714485529342606, + "loss": 0.0436, + "num_input_tokens_seen": 82073808, + "step": 38020 + }, + { + "epoch": 6.203099510603589, + "grad_norm": 0.02088235504925251, + "learning_rate": 0.0008714009009968349, + "loss": 0.1331, + "num_input_tokens_seen": 82083728, + "step": 38025 + }, + { + "epoch": 6.2039151712887435, + "grad_norm": 0.3234838843345642, + "learning_rate": 0.0008713532415324988, + "loss": 0.0865, + "num_input_tokens_seen": 82095600, + "step": 38030 + }, + { + "epoch": 6.204730831973899, + "grad_norm": 0.05386462062597275, + "learning_rate": 0.0008713055745422181, + "loss": 0.0559, + "num_input_tokens_seen": 82105136, + "step": 38035 + }, + { + "epoch": 6.205546492659054, + "grad_norm": 0.06031525507569313, + "learning_rate": 0.000871257900026959, + "loss": 0.026, + "num_input_tokens_seen": 82115504, + "step": 38040 + }, + { + "epoch": 6.206362153344209, + "grad_norm": 0.1599837839603424, + "learning_rate": 0.0008712102179876876, + "loss": 0.1875, + "num_input_tokens_seen": 82126864, + "step": 38045 + }, + { + "epoch": 6.207177814029364, + "grad_norm": 0.03390715643763542, + "learning_rate": 0.0008711625284253701, + "loss": 0.1232, + "num_input_tokens_seen": 82137520, + "step": 38050 + }, + { + "epoch": 6.2079934747145185, + "grad_norm": 0.00901317410171032, + "learning_rate": 0.0008711148313409731, + "loss": 0.0903, + "num_input_tokens_seen": 82148976, + "step": 38055 + }, + { + "epoch": 6.208809135399674, + "grad_norm": 0.24151644110679626, + "learning_rate": 0.0008710671267354633, + "loss": 0.0477, + "num_input_tokens_seen": 82160496, + "step": 38060 + }, + { + "epoch": 6.209624796084829, + "grad_norm": 0.1765967160463333, + "learning_rate": 0.0008710194146098074, + "loss": 0.0793, + "num_input_tokens_seen": 82171440, + "step": 38065 + }, + { + "epoch": 6.210440456769984, + "grad_norm": 0.019025731831789017, + "learning_rate": 0.0008709716949649724, + "loss": 0.1236, + "num_input_tokens_seen": 82182704, + "step": 38070 + }, + { + "epoch": 6.211256117455139, + "grad_norm": 0.03010888397693634, + "learning_rate": 0.0008709239678019255, + "loss": 0.1255, + "num_input_tokens_seen": 82193392, + "step": 38075 + }, + { + "epoch": 6.212071778140293, + "grad_norm": 0.2060699462890625, + "learning_rate": 0.0008708762331216338, + "loss": 0.1306, + "num_input_tokens_seen": 82204080, + "step": 38080 + }, + { + "epoch": 6.212887438825448, + "grad_norm": 0.0230045598000288, + "learning_rate": 0.0008708284909250646, + "loss": 0.0285, + "num_input_tokens_seen": 82215440, + "step": 38085 + }, + { + "epoch": 6.213703099510604, + "grad_norm": 0.19266964495182037, + "learning_rate": 0.0008707807412131858, + "loss": 0.039, + "num_input_tokens_seen": 82226832, + "step": 38090 + }, + { + "epoch": 6.214518760195759, + "grad_norm": 0.016989313066005707, + "learning_rate": 0.0008707329839869649, + "loss": 0.0239, + "num_input_tokens_seen": 82237296, + "step": 38095 + }, + { + "epoch": 6.215334420880914, + "grad_norm": 0.07004913687705994, + "learning_rate": 0.0008706852192473696, + "loss": 0.0492, + "num_input_tokens_seen": 82248400, + "step": 38100 + }, + { + "epoch": 6.216150081566068, + "grad_norm": 0.046207427978515625, + "learning_rate": 0.0008706374469953682, + "loss": 0.1345, + "num_input_tokens_seen": 82259088, + "step": 38105 + }, + { + "epoch": 6.216965742251223, + "grad_norm": 0.006206158548593521, + "learning_rate": 0.0008705896672319286, + "loss": 0.1571, + "num_input_tokens_seen": 82269424, + "step": 38110 + }, + { + "epoch": 6.217781402936378, + "grad_norm": 0.171188622713089, + "learning_rate": 0.0008705418799580196, + "loss": 0.2409, + "num_input_tokens_seen": 82280144, + "step": 38115 + }, + { + "epoch": 6.218597063621534, + "grad_norm": 0.05400080233812332, + "learning_rate": 0.000870494085174609, + "loss": 0.0256, + "num_input_tokens_seen": 82290672, + "step": 38120 + }, + { + "epoch": 6.219412724306689, + "grad_norm": 0.2376798689365387, + "learning_rate": 0.000870446282882666, + "loss": 0.0998, + "num_input_tokens_seen": 82301456, + "step": 38125 + }, + { + "epoch": 6.220228384991843, + "grad_norm": 0.01859196648001671, + "learning_rate": 0.0008703984730831589, + "loss": 0.112, + "num_input_tokens_seen": 82311696, + "step": 38130 + }, + { + "epoch": 6.221044045676998, + "grad_norm": 0.29077544808387756, + "learning_rate": 0.0008703506557770571, + "loss": 0.1478, + "num_input_tokens_seen": 82322320, + "step": 38135 + }, + { + "epoch": 6.221859706362153, + "grad_norm": 0.1722267121076584, + "learning_rate": 0.0008703028309653293, + "loss": 0.0894, + "num_input_tokens_seen": 82332976, + "step": 38140 + }, + { + "epoch": 6.222675367047309, + "grad_norm": 0.3272321820259094, + "learning_rate": 0.0008702549986489449, + "loss": 0.1374, + "num_input_tokens_seen": 82343920, + "step": 38145 + }, + { + "epoch": 6.2234910277324635, + "grad_norm": 0.10053566843271255, + "learning_rate": 0.0008702071588288731, + "loss": 0.0792, + "num_input_tokens_seen": 82354928, + "step": 38150 + }, + { + "epoch": 6.224306688417618, + "grad_norm": 0.020116161555051804, + "learning_rate": 0.0008701593115060837, + "loss": 0.0263, + "num_input_tokens_seen": 82366544, + "step": 38155 + }, + { + "epoch": 6.225122349102773, + "grad_norm": 0.16087542474269867, + "learning_rate": 0.0008701114566815464, + "loss": 0.0872, + "num_input_tokens_seen": 82377520, + "step": 38160 + }, + { + "epoch": 6.225938009787928, + "grad_norm": 0.0877327024936676, + "learning_rate": 0.0008700635943562308, + "loss": 0.0421, + "num_input_tokens_seen": 82388240, + "step": 38165 + }, + { + "epoch": 6.226753670473083, + "grad_norm": 0.032385800033807755, + "learning_rate": 0.0008700157245311071, + "loss": 0.1221, + "num_input_tokens_seen": 82399984, + "step": 38170 + }, + { + "epoch": 6.2275693311582385, + "grad_norm": 0.008664040826261044, + "learning_rate": 0.0008699678472071453, + "loss": 0.2037, + "num_input_tokens_seen": 82410992, + "step": 38175 + }, + { + "epoch": 6.228384991843393, + "grad_norm": 0.014576300047338009, + "learning_rate": 0.0008699199623853156, + "loss": 0.0225, + "num_input_tokens_seen": 82422224, + "step": 38180 + }, + { + "epoch": 6.229200652528548, + "grad_norm": 0.30175673961639404, + "learning_rate": 0.0008698720700665888, + "loss": 0.127, + "num_input_tokens_seen": 82432624, + "step": 38185 + }, + { + "epoch": 6.230016313213703, + "grad_norm": 0.04355394467711449, + "learning_rate": 0.0008698241702519351, + "loss": 0.1725, + "num_input_tokens_seen": 82443696, + "step": 38190 + }, + { + "epoch": 6.230831973898858, + "grad_norm": 0.05053601413965225, + "learning_rate": 0.0008697762629423254, + "loss": 0.0526, + "num_input_tokens_seen": 82454736, + "step": 38195 + }, + { + "epoch": 6.231647634584013, + "grad_norm": 0.19101157784461975, + "learning_rate": 0.0008697283481387308, + "loss": 0.2323, + "num_input_tokens_seen": 82465072, + "step": 38200 + }, + { + "epoch": 6.232463295269168, + "grad_norm": 0.007675885688513517, + "learning_rate": 0.000869680425842122, + "loss": 0.0624, + "num_input_tokens_seen": 82475760, + "step": 38205 + }, + { + "epoch": 6.233278955954323, + "grad_norm": 0.015640323981642723, + "learning_rate": 0.0008696324960534706, + "loss": 0.0925, + "num_input_tokens_seen": 82487056, + "step": 38210 + }, + { + "epoch": 6.234094616639478, + "grad_norm": 0.007141800131648779, + "learning_rate": 0.0008695845587737476, + "loss": 0.1163, + "num_input_tokens_seen": 82498544, + "step": 38215 + }, + { + "epoch": 6.234910277324633, + "grad_norm": 0.0063656168058514595, + "learning_rate": 0.0008695366140039248, + "loss": 0.1269, + "num_input_tokens_seen": 82510096, + "step": 38220 + }, + { + "epoch": 6.235725938009788, + "grad_norm": 0.04095277190208435, + "learning_rate": 0.0008694886617449738, + "loss": 0.0232, + "num_input_tokens_seen": 82520752, + "step": 38225 + }, + { + "epoch": 6.236541598694943, + "grad_norm": 0.2318253070116043, + "learning_rate": 0.0008694407019978661, + "loss": 0.1625, + "num_input_tokens_seen": 82531760, + "step": 38230 + }, + { + "epoch": 6.237357259380098, + "grad_norm": 0.01122771855443716, + "learning_rate": 0.0008693927347635741, + "loss": 0.0412, + "num_input_tokens_seen": 82542576, + "step": 38235 + }, + { + "epoch": 6.238172920065253, + "grad_norm": 0.14079686999320984, + "learning_rate": 0.0008693447600430695, + "loss": 0.1329, + "num_input_tokens_seen": 82553168, + "step": 38240 + }, + { + "epoch": 6.238988580750408, + "grad_norm": 0.0141145009547472, + "learning_rate": 0.000869296777837325, + "loss": 0.1334, + "num_input_tokens_seen": 82563280, + "step": 38245 + }, + { + "epoch": 6.239804241435563, + "grad_norm": 0.08225622028112411, + "learning_rate": 0.0008692487881473128, + "loss": 0.078, + "num_input_tokens_seen": 82575120, + "step": 38250 + }, + { + "epoch": 6.240619902120717, + "grad_norm": 0.2335963249206543, + "learning_rate": 0.0008692007909740054, + "loss": 0.1377, + "num_input_tokens_seen": 82586032, + "step": 38255 + }, + { + "epoch": 6.241435562805873, + "grad_norm": 0.20716847479343414, + "learning_rate": 0.0008691527863183755, + "loss": 0.1369, + "num_input_tokens_seen": 82597040, + "step": 38260 + }, + { + "epoch": 6.242251223491028, + "grad_norm": 0.04523799568414688, + "learning_rate": 0.0008691047741813963, + "loss": 0.0229, + "num_input_tokens_seen": 82608400, + "step": 38265 + }, + { + "epoch": 6.243066884176183, + "grad_norm": 0.05849827453494072, + "learning_rate": 0.0008690567545640406, + "loss": 0.0972, + "num_input_tokens_seen": 82618000, + "step": 38270 + }, + { + "epoch": 6.2438825448613375, + "grad_norm": 0.02852177619934082, + "learning_rate": 0.0008690087274672814, + "loss": 0.155, + "num_input_tokens_seen": 82628880, + "step": 38275 + }, + { + "epoch": 6.244698205546492, + "grad_norm": 0.010735807940363884, + "learning_rate": 0.0008689606928920923, + "loss": 0.0561, + "num_input_tokens_seen": 82639216, + "step": 38280 + }, + { + "epoch": 6.245513866231648, + "grad_norm": 0.01866280660033226, + "learning_rate": 0.0008689126508394467, + "loss": 0.0746, + "num_input_tokens_seen": 82649264, + "step": 38285 + }, + { + "epoch": 6.246329526916803, + "grad_norm": 0.01626741886138916, + "learning_rate": 0.0008688646013103183, + "loss": 0.1611, + "num_input_tokens_seen": 82660112, + "step": 38290 + }, + { + "epoch": 6.247145187601958, + "grad_norm": 0.05529869720339775, + "learning_rate": 0.0008688165443056808, + "loss": 0.0698, + "num_input_tokens_seen": 82671408, + "step": 38295 + }, + { + "epoch": 6.2479608482871125, + "grad_norm": 0.029434949159622192, + "learning_rate": 0.0008687684798265081, + "loss": 0.024, + "num_input_tokens_seen": 82682928, + "step": 38300 + }, + { + "epoch": 6.248776508972267, + "grad_norm": 0.0307852104306221, + "learning_rate": 0.0008687204078737744, + "loss": 0.0428, + "num_input_tokens_seen": 82694864, + "step": 38305 + }, + { + "epoch": 6.249592169657422, + "grad_norm": 0.0888407751917839, + "learning_rate": 0.0008686723284484538, + "loss": 0.0439, + "num_input_tokens_seen": 82705296, + "step": 38310 + }, + { + "epoch": 6.250407830342578, + "grad_norm": 0.11561901122331619, + "learning_rate": 0.0008686242415515209, + "loss": 0.0793, + "num_input_tokens_seen": 82716304, + "step": 38315 + }, + { + "epoch": 6.251223491027733, + "grad_norm": 0.01018332690000534, + "learning_rate": 0.00086857614718395, + "loss": 0.0682, + "num_input_tokens_seen": 82728272, + "step": 38320 + }, + { + "epoch": 6.2520391517128875, + "grad_norm": 0.26415741443634033, + "learning_rate": 0.0008685280453467159, + "loss": 0.0843, + "num_input_tokens_seen": 82738896, + "step": 38325 + }, + { + "epoch": 6.252854812398042, + "grad_norm": 0.02646016515791416, + "learning_rate": 0.0008684799360407935, + "loss": 0.0151, + "num_input_tokens_seen": 82749712, + "step": 38330 + }, + { + "epoch": 6.253670473083197, + "grad_norm": 0.01362162921577692, + "learning_rate": 0.0008684318192671576, + "loss": 0.0282, + "num_input_tokens_seen": 82758608, + "step": 38335 + }, + { + "epoch": 6.254486133768353, + "grad_norm": 0.06068713590502739, + "learning_rate": 0.0008683836950267838, + "loss": 0.0531, + "num_input_tokens_seen": 82770288, + "step": 38340 + }, + { + "epoch": 6.255301794453508, + "grad_norm": 0.3115657567977905, + "learning_rate": 0.0008683355633206469, + "loss": 0.159, + "num_input_tokens_seen": 82782448, + "step": 38345 + }, + { + "epoch": 6.2561174551386625, + "grad_norm": 0.08893144130706787, + "learning_rate": 0.0008682874241497225, + "loss": 0.0506, + "num_input_tokens_seen": 82793264, + "step": 38350 + }, + { + "epoch": 6.256933115823817, + "grad_norm": 0.19866524636745453, + "learning_rate": 0.0008682392775149863, + "loss": 0.0865, + "num_input_tokens_seen": 82804688, + "step": 38355 + }, + { + "epoch": 6.257748776508972, + "grad_norm": 0.0022271168418228626, + "learning_rate": 0.000868191123417414, + "loss": 0.009, + "num_input_tokens_seen": 82816336, + "step": 38360 + }, + { + "epoch": 6.258564437194127, + "grad_norm": 0.03225398436188698, + "learning_rate": 0.0008681429618579815, + "loss": 0.0175, + "num_input_tokens_seen": 82827408, + "step": 38365 + }, + { + "epoch": 6.259380097879283, + "grad_norm": 0.22780710458755493, + "learning_rate": 0.0008680947928376648, + "loss": 0.1379, + "num_input_tokens_seen": 82838128, + "step": 38370 + }, + { + "epoch": 6.260195758564437, + "grad_norm": 0.043727174401283264, + "learning_rate": 0.0008680466163574402, + "loss": 0.032, + "num_input_tokens_seen": 82849072, + "step": 38375 + }, + { + "epoch": 6.261011419249592, + "grad_norm": 0.008422130718827248, + "learning_rate": 0.000867998432418284, + "loss": 0.0407, + "num_input_tokens_seen": 82859088, + "step": 38380 + }, + { + "epoch": 6.261827079934747, + "grad_norm": 0.01911906711757183, + "learning_rate": 0.0008679502410211728, + "loss": 0.0347, + "num_input_tokens_seen": 82870480, + "step": 38385 + }, + { + "epoch": 6.262642740619902, + "grad_norm": 0.14099238812923431, + "learning_rate": 0.0008679020421670831, + "loss": 0.1161, + "num_input_tokens_seen": 82880720, + "step": 38390 + }, + { + "epoch": 6.263458401305057, + "grad_norm": 0.009687887504696846, + "learning_rate": 0.0008678538358569918, + "loss": 0.0365, + "num_input_tokens_seen": 82891536, + "step": 38395 + }, + { + "epoch": 6.264274061990212, + "grad_norm": 0.09386022388935089, + "learning_rate": 0.000867805622091876, + "loss": 0.1857, + "num_input_tokens_seen": 82903056, + "step": 38400 + }, + { + "epoch": 6.265089722675367, + "grad_norm": 0.005296653136610985, + "learning_rate": 0.0008677574008727126, + "loss": 0.0583, + "num_input_tokens_seen": 82912912, + "step": 38405 + }, + { + "epoch": 6.265905383360522, + "grad_norm": 0.058007512241601944, + "learning_rate": 0.0008677091722004788, + "loss": 0.0711, + "num_input_tokens_seen": 82923664, + "step": 38410 + }, + { + "epoch": 6.266721044045677, + "grad_norm": 0.02075113356113434, + "learning_rate": 0.0008676609360761524, + "loss": 0.0401, + "num_input_tokens_seen": 82934256, + "step": 38415 + }, + { + "epoch": 6.267536704730832, + "grad_norm": 0.1981419324874878, + "learning_rate": 0.0008676126925007107, + "loss": 0.2778, + "num_input_tokens_seen": 82945712, + "step": 38420 + }, + { + "epoch": 6.268352365415987, + "grad_norm": 0.1013275533914566, + "learning_rate": 0.0008675644414751311, + "loss": 0.0606, + "num_input_tokens_seen": 82956720, + "step": 38425 + }, + { + "epoch": 6.269168026101142, + "grad_norm": 0.033011481165885925, + "learning_rate": 0.0008675161830003921, + "loss": 0.1177, + "num_input_tokens_seen": 82967856, + "step": 38430 + }, + { + "epoch": 6.269983686786297, + "grad_norm": 0.19729508459568024, + "learning_rate": 0.0008674679170774713, + "loss": 0.1339, + "num_input_tokens_seen": 82979472, + "step": 38435 + }, + { + "epoch": 6.270799347471452, + "grad_norm": 0.023388270288705826, + "learning_rate": 0.0008674196437073472, + "loss": 0.0365, + "num_input_tokens_seen": 82989264, + "step": 38440 + }, + { + "epoch": 6.271615008156607, + "grad_norm": 0.13151520490646362, + "learning_rate": 0.0008673713628909978, + "loss": 0.0986, + "num_input_tokens_seen": 83000368, + "step": 38445 + }, + { + "epoch": 6.2724306688417615, + "grad_norm": 0.003240936668589711, + "learning_rate": 0.0008673230746294016, + "loss": 0.038, + "num_input_tokens_seen": 83011376, + "step": 38450 + }, + { + "epoch": 6.273246329526917, + "grad_norm": 0.013665203005075455, + "learning_rate": 0.0008672747789235373, + "loss": 0.0848, + "num_input_tokens_seen": 83021008, + "step": 38455 + }, + { + "epoch": 6.274061990212072, + "grad_norm": 0.025594400241971016, + "learning_rate": 0.0008672264757743838, + "loss": 0.1138, + "num_input_tokens_seen": 83031888, + "step": 38460 + }, + { + "epoch": 6.274877650897227, + "grad_norm": 0.07326933741569519, + "learning_rate": 0.0008671781651829198, + "loss": 0.1617, + "num_input_tokens_seen": 83042032, + "step": 38465 + }, + { + "epoch": 6.275693311582382, + "grad_norm": 0.16706255078315735, + "learning_rate": 0.0008671298471501246, + "loss": 0.2186, + "num_input_tokens_seen": 83051920, + "step": 38470 + }, + { + "epoch": 6.2765089722675365, + "grad_norm": 0.06096849963068962, + "learning_rate": 0.0008670815216769771, + "loss": 0.157, + "num_input_tokens_seen": 83063280, + "step": 38475 + }, + { + "epoch": 6.277324632952691, + "grad_norm": 0.17529694736003876, + "learning_rate": 0.0008670331887644571, + "loss": 0.1294, + "num_input_tokens_seen": 83072112, + "step": 38480 + }, + { + "epoch": 6.278140293637847, + "grad_norm": 0.03341106325387955, + "learning_rate": 0.0008669848484135439, + "loss": 0.0336, + "num_input_tokens_seen": 83083728, + "step": 38485 + }, + { + "epoch": 6.278955954323002, + "grad_norm": 0.22757704555988312, + "learning_rate": 0.0008669365006252172, + "loss": 0.1759, + "num_input_tokens_seen": 83094256, + "step": 38490 + }, + { + "epoch": 6.279771615008157, + "grad_norm": 0.017978308722376823, + "learning_rate": 0.0008668881454004567, + "loss": 0.0398, + "num_input_tokens_seen": 83105552, + "step": 38495 + }, + { + "epoch": 6.280587275693311, + "grad_norm": 0.020587541162967682, + "learning_rate": 0.0008668397827402425, + "loss": 0.0486, + "num_input_tokens_seen": 83117296, + "step": 38500 + }, + { + "epoch": 6.281402936378466, + "grad_norm": 0.02578771486878395, + "learning_rate": 0.000866791412645555, + "loss": 0.1476, + "num_input_tokens_seen": 83127824, + "step": 38505 + }, + { + "epoch": 6.282218597063622, + "grad_norm": 0.16262699663639069, + "learning_rate": 0.000866743035117374, + "loss": 0.1223, + "num_input_tokens_seen": 83138608, + "step": 38510 + }, + { + "epoch": 6.283034257748777, + "grad_norm": 0.09687996655702591, + "learning_rate": 0.0008666946501566801, + "loss": 0.0794, + "num_input_tokens_seen": 83149936, + "step": 38515 + }, + { + "epoch": 6.283849918433932, + "grad_norm": 0.17375290393829346, + "learning_rate": 0.000866646257764454, + "loss": 0.0875, + "num_input_tokens_seen": 83160528, + "step": 38520 + }, + { + "epoch": 6.284665579119086, + "grad_norm": 0.028647977858781815, + "learning_rate": 0.0008665978579416763, + "loss": 0.0254, + "num_input_tokens_seen": 83170640, + "step": 38525 + }, + { + "epoch": 6.285481239804241, + "grad_norm": 0.03833254426717758, + "learning_rate": 0.000866549450689328, + "loss": 0.1059, + "num_input_tokens_seen": 83182128, + "step": 38530 + }, + { + "epoch": 6.286296900489396, + "grad_norm": 0.06163567304611206, + "learning_rate": 0.0008665010360083902, + "loss": 0.2075, + "num_input_tokens_seen": 83192368, + "step": 38535 + }, + { + "epoch": 6.287112561174552, + "grad_norm": 0.13798999786376953, + "learning_rate": 0.0008664526138998438, + "loss": 0.1461, + "num_input_tokens_seen": 83203024, + "step": 38540 + }, + { + "epoch": 6.287928221859707, + "grad_norm": 0.24057428538799286, + "learning_rate": 0.0008664041843646704, + "loss": 0.2177, + "num_input_tokens_seen": 83214160, + "step": 38545 + }, + { + "epoch": 6.288743882544861, + "grad_norm": 0.008731750771403313, + "learning_rate": 0.0008663557474038512, + "loss": 0.0966, + "num_input_tokens_seen": 83224784, + "step": 38550 + }, + { + "epoch": 6.289559543230016, + "grad_norm": 0.1091703251004219, + "learning_rate": 0.0008663073030183683, + "loss": 0.0517, + "num_input_tokens_seen": 83235344, + "step": 38555 + }, + { + "epoch": 6.290375203915171, + "grad_norm": 0.09148517996072769, + "learning_rate": 0.000866258851209203, + "loss": 0.0922, + "num_input_tokens_seen": 83246384, + "step": 38560 + }, + { + "epoch": 6.291190864600326, + "grad_norm": 0.08089996129274368, + "learning_rate": 0.0008662103919773375, + "loss": 0.0909, + "num_input_tokens_seen": 83257520, + "step": 38565 + }, + { + "epoch": 6.2920065252854815, + "grad_norm": 0.006539955735206604, + "learning_rate": 0.0008661619253237538, + "loss": 0.0539, + "num_input_tokens_seen": 83267824, + "step": 38570 + }, + { + "epoch": 6.292822185970636, + "grad_norm": 0.009001573547720909, + "learning_rate": 0.0008661134512494343, + "loss": 0.0596, + "num_input_tokens_seen": 83277904, + "step": 38575 + }, + { + "epoch": 6.293637846655791, + "grad_norm": 0.08216945081949234, + "learning_rate": 0.0008660649697553612, + "loss": 0.0189, + "num_input_tokens_seen": 83287280, + "step": 38580 + }, + { + "epoch": 6.294453507340946, + "grad_norm": 0.14487679302692413, + "learning_rate": 0.000866016480842517, + "loss": 0.1609, + "num_input_tokens_seen": 83297520, + "step": 38585 + }, + { + "epoch": 6.295269168026101, + "grad_norm": 0.010907143354415894, + "learning_rate": 0.0008659679845118847, + "loss": 0.053, + "num_input_tokens_seen": 83308400, + "step": 38590 + }, + { + "epoch": 6.2960848287112565, + "grad_norm": 0.3504311442375183, + "learning_rate": 0.0008659194807644468, + "loss": 0.1758, + "num_input_tokens_seen": 83321008, + "step": 38595 + }, + { + "epoch": 6.296900489396411, + "grad_norm": 0.016045743599534035, + "learning_rate": 0.0008658709696011864, + "loss": 0.1099, + "num_input_tokens_seen": 83333264, + "step": 38600 + }, + { + "epoch": 6.297716150081566, + "grad_norm": 0.14296463131904602, + "learning_rate": 0.0008658224510230867, + "loss": 0.0532, + "num_input_tokens_seen": 83343856, + "step": 38605 + }, + { + "epoch": 6.298531810766721, + "grad_norm": 0.0192820206284523, + "learning_rate": 0.0008657739250311309, + "loss": 0.0218, + "num_input_tokens_seen": 83354832, + "step": 38610 + }, + { + "epoch": 6.299347471451876, + "grad_norm": 0.006586351431906223, + "learning_rate": 0.0008657253916263026, + "loss": 0.1192, + "num_input_tokens_seen": 83365488, + "step": 38615 + }, + { + "epoch": 6.300163132137031, + "grad_norm": 0.35689684748649597, + "learning_rate": 0.0008656768508095852, + "loss": 0.1958, + "num_input_tokens_seen": 83376688, + "step": 38620 + }, + { + "epoch": 6.300978792822186, + "grad_norm": 0.060436103492975235, + "learning_rate": 0.0008656283025819626, + "loss": 0.0333, + "num_input_tokens_seen": 83387056, + "step": 38625 + }, + { + "epoch": 6.301794453507341, + "grad_norm": 0.06238924711942673, + "learning_rate": 0.0008655797469444186, + "loss": 0.0508, + "num_input_tokens_seen": 83398224, + "step": 38630 + }, + { + "epoch": 6.302610114192496, + "grad_norm": 0.013827160932123661, + "learning_rate": 0.0008655311838979371, + "loss": 0.0318, + "num_input_tokens_seen": 83408880, + "step": 38635 + }, + { + "epoch": 6.303425774877651, + "grad_norm": 0.06771564483642578, + "learning_rate": 0.0008654826134435028, + "loss": 0.0664, + "num_input_tokens_seen": 83419504, + "step": 38640 + }, + { + "epoch": 6.304241435562806, + "grad_norm": 0.07089833915233612, + "learning_rate": 0.0008654340355820993, + "loss": 0.0568, + "num_input_tokens_seen": 83429296, + "step": 38645 + }, + { + "epoch": 6.30505709624796, + "grad_norm": 0.0033963036257773638, + "learning_rate": 0.0008653854503147117, + "loss": 0.0247, + "num_input_tokens_seen": 83440912, + "step": 38650 + }, + { + "epoch": 6.305872756933116, + "grad_norm": 0.01774754375219345, + "learning_rate": 0.0008653368576423244, + "loss": 0.0517, + "num_input_tokens_seen": 83451568, + "step": 38655 + }, + { + "epoch": 6.306688417618271, + "grad_norm": 0.02481848932802677, + "learning_rate": 0.0008652882575659222, + "loss": 0.0138, + "num_input_tokens_seen": 83461200, + "step": 38660 + }, + { + "epoch": 6.307504078303426, + "grad_norm": 0.0183070357888937, + "learning_rate": 0.00086523965008649, + "loss": 0.0437, + "num_input_tokens_seen": 83472016, + "step": 38665 + }, + { + "epoch": 6.308319738988581, + "grad_norm": 0.030995415523648262, + "learning_rate": 0.0008651910352050129, + "loss": 0.109, + "num_input_tokens_seen": 83482896, + "step": 38670 + }, + { + "epoch": 6.309135399673735, + "grad_norm": 0.25824081897735596, + "learning_rate": 0.0008651424129224764, + "loss": 0.208, + "num_input_tokens_seen": 83494384, + "step": 38675 + }, + { + "epoch": 6.309951060358891, + "grad_norm": 0.020894819870591164, + "learning_rate": 0.0008650937832398656, + "loss": 0.0125, + "num_input_tokens_seen": 83504848, + "step": 38680 + }, + { + "epoch": 6.310766721044046, + "grad_norm": 0.0038547967560589314, + "learning_rate": 0.0008650451461581661, + "loss": 0.0315, + "num_input_tokens_seen": 83516208, + "step": 38685 + }, + { + "epoch": 6.311582381729201, + "grad_norm": 0.7001302242279053, + "learning_rate": 0.0008649965016783636, + "loss": 0.125, + "num_input_tokens_seen": 83527056, + "step": 38690 + }, + { + "epoch": 6.3123980424143555, + "grad_norm": 0.16097642481327057, + "learning_rate": 0.0008649478498014441, + "loss": 0.0427, + "num_input_tokens_seen": 83537392, + "step": 38695 + }, + { + "epoch": 6.31321370309951, + "grad_norm": 0.020234184339642525, + "learning_rate": 0.0008648991905283931, + "loss": 0.035, + "num_input_tokens_seen": 83548944, + "step": 38700 + }, + { + "epoch": 6.314029363784665, + "grad_norm": 0.06637764722108841, + "learning_rate": 0.0008648505238601974, + "loss": 0.0369, + "num_input_tokens_seen": 83558832, + "step": 38705 + }, + { + "epoch": 6.314845024469821, + "grad_norm": 0.03275923430919647, + "learning_rate": 0.0008648018497978429, + "loss": 0.0908, + "num_input_tokens_seen": 83567696, + "step": 38710 + }, + { + "epoch": 6.315660685154976, + "grad_norm": 0.001126434770412743, + "learning_rate": 0.0008647531683423162, + "loss": 0.02, + "num_input_tokens_seen": 83578480, + "step": 38715 + }, + { + "epoch": 6.3164763458401305, + "grad_norm": 0.007279345765709877, + "learning_rate": 0.0008647044794946038, + "loss": 0.0849, + "num_input_tokens_seen": 83588496, + "step": 38720 + }, + { + "epoch": 6.317292006525285, + "grad_norm": 0.020582817494869232, + "learning_rate": 0.0008646557832556925, + "loss": 0.0344, + "num_input_tokens_seen": 83599376, + "step": 38725 + }, + { + "epoch": 6.31810766721044, + "grad_norm": 0.11946702748537064, + "learning_rate": 0.000864607079626569, + "loss": 0.0881, + "num_input_tokens_seen": 83610032, + "step": 38730 + }, + { + "epoch": 6.318923327895595, + "grad_norm": 0.1857098489999771, + "learning_rate": 0.0008645583686082206, + "loss": 0.0519, + "num_input_tokens_seen": 83621392, + "step": 38735 + }, + { + "epoch": 6.319738988580751, + "grad_norm": 0.019465666264295578, + "learning_rate": 0.0008645096502016346, + "loss": 0.0495, + "num_input_tokens_seen": 83632368, + "step": 38740 + }, + { + "epoch": 6.3205546492659055, + "grad_norm": 0.025004252791404724, + "learning_rate": 0.0008644609244077978, + "loss": 0.0125, + "num_input_tokens_seen": 83642800, + "step": 38745 + }, + { + "epoch": 6.32137030995106, + "grad_norm": 0.32236382365226746, + "learning_rate": 0.0008644121912276981, + "loss": 0.0963, + "num_input_tokens_seen": 83653552, + "step": 38750 + }, + { + "epoch": 6.322185970636215, + "grad_norm": 0.015966292470693588, + "learning_rate": 0.000864363450662323, + "loss": 0.036, + "num_input_tokens_seen": 83664624, + "step": 38755 + }, + { + "epoch": 6.32300163132137, + "grad_norm": 0.013188747689127922, + "learning_rate": 0.0008643147027126604, + "loss": 0.0332, + "num_input_tokens_seen": 83675536, + "step": 38760 + }, + { + "epoch": 6.323817292006526, + "grad_norm": 0.0052478197030723095, + "learning_rate": 0.0008642659473796984, + "loss": 0.0093, + "num_input_tokens_seen": 83685712, + "step": 38765 + }, + { + "epoch": 6.3246329526916805, + "grad_norm": 0.010467191226780415, + "learning_rate": 0.0008642171846644245, + "loss": 0.0636, + "num_input_tokens_seen": 83696112, + "step": 38770 + }, + { + "epoch": 6.325448613376835, + "grad_norm": 0.011628848500549793, + "learning_rate": 0.0008641684145678275, + "loss": 0.0464, + "num_input_tokens_seen": 83706608, + "step": 38775 + }, + { + "epoch": 6.32626427406199, + "grad_norm": 0.006005770061165094, + "learning_rate": 0.0008641196370908956, + "loss": 0.1901, + "num_input_tokens_seen": 83718288, + "step": 38780 + }, + { + "epoch": 6.327079934747145, + "grad_norm": 0.2343970537185669, + "learning_rate": 0.0008640708522346173, + "loss": 0.0571, + "num_input_tokens_seen": 83729328, + "step": 38785 + }, + { + "epoch": 6.327895595432301, + "grad_norm": 0.2291705310344696, + "learning_rate": 0.0008640220599999813, + "loss": 0.1801, + "num_input_tokens_seen": 83740016, + "step": 38790 + }, + { + "epoch": 6.328711256117455, + "grad_norm": 0.10851339995861053, + "learning_rate": 0.0008639732603879766, + "loss": 0.0684, + "num_input_tokens_seen": 83751120, + "step": 38795 + }, + { + "epoch": 6.32952691680261, + "grad_norm": 0.011274145916104317, + "learning_rate": 0.0008639244533995919, + "loss": 0.1575, + "num_input_tokens_seen": 83760880, + "step": 38800 + }, + { + "epoch": 6.330342577487765, + "grad_norm": 0.0665455088019371, + "learning_rate": 0.0008638756390358164, + "loss": 0.0575, + "num_input_tokens_seen": 83771440, + "step": 38805 + }, + { + "epoch": 6.33115823817292, + "grad_norm": 0.005120498593896627, + "learning_rate": 0.0008638268172976398, + "loss": 0.0974, + "num_input_tokens_seen": 83781424, + "step": 38810 + }, + { + "epoch": 6.331973898858075, + "grad_norm": 0.0027621889021247625, + "learning_rate": 0.0008637779881860509, + "loss": 0.0662, + "num_input_tokens_seen": 83792848, + "step": 38815 + }, + { + "epoch": 6.33278955954323, + "grad_norm": 0.07877042144536972, + "learning_rate": 0.0008637291517020397, + "loss": 0.1352, + "num_input_tokens_seen": 83803152, + "step": 38820 + }, + { + "epoch": 6.333605220228385, + "grad_norm": 0.01588800735771656, + "learning_rate": 0.0008636803078465958, + "loss": 0.021, + "num_input_tokens_seen": 83814672, + "step": 38825 + }, + { + "epoch": 6.33442088091354, + "grad_norm": 0.004627645015716553, + "learning_rate": 0.000863631456620709, + "loss": 0.1049, + "num_input_tokens_seen": 83825904, + "step": 38830 + }, + { + "epoch": 6.335236541598695, + "grad_norm": 0.001880511874333024, + "learning_rate": 0.0008635825980253696, + "loss": 0.0368, + "num_input_tokens_seen": 83837936, + "step": 38835 + }, + { + "epoch": 6.33605220228385, + "grad_norm": 0.11365389078855515, + "learning_rate": 0.0008635337320615675, + "loss": 0.0661, + "num_input_tokens_seen": 83848944, + "step": 38840 + }, + { + "epoch": 6.3368678629690045, + "grad_norm": 0.08720553666353226, + "learning_rate": 0.0008634848587302932, + "loss": 0.0948, + "num_input_tokens_seen": 83861136, + "step": 38845 + }, + { + "epoch": 6.33768352365416, + "grad_norm": 0.1704830676317215, + "learning_rate": 0.0008634359780325372, + "loss": 0.1001, + "num_input_tokens_seen": 83873136, + "step": 38850 + }, + { + "epoch": 6.338499184339315, + "grad_norm": 0.033878568559885025, + "learning_rate": 0.0008633870899692899, + "loss": 0.0649, + "num_input_tokens_seen": 83884496, + "step": 38855 + }, + { + "epoch": 6.33931484502447, + "grad_norm": 0.13720868527889252, + "learning_rate": 0.0008633381945415422, + "loss": 0.0229, + "num_input_tokens_seen": 83895664, + "step": 38860 + }, + { + "epoch": 6.340130505709625, + "grad_norm": 0.08526377379894257, + "learning_rate": 0.0008632892917502852, + "loss": 0.1083, + "num_input_tokens_seen": 83907664, + "step": 38865 + }, + { + "epoch": 6.3409461663947795, + "grad_norm": 0.005983958952128887, + "learning_rate": 0.0008632403815965099, + "loss": 0.0349, + "num_input_tokens_seen": 83918896, + "step": 38870 + }, + { + "epoch": 6.341761827079935, + "grad_norm": 0.36239567399024963, + "learning_rate": 0.0008631914640812073, + "loss": 0.0864, + "num_input_tokens_seen": 83929008, + "step": 38875 + }, + { + "epoch": 6.34257748776509, + "grad_norm": 0.005874789319932461, + "learning_rate": 0.000863142539205369, + "loss": 0.0418, + "num_input_tokens_seen": 83937776, + "step": 38880 + }, + { + "epoch": 6.343393148450245, + "grad_norm": 0.05490971729159355, + "learning_rate": 0.0008630936069699864, + "loss": 0.0178, + "num_input_tokens_seen": 83948816, + "step": 38885 + }, + { + "epoch": 6.3442088091354, + "grad_norm": 0.005061678122729063, + "learning_rate": 0.0008630446673760513, + "loss": 0.0197, + "num_input_tokens_seen": 83959440, + "step": 38890 + }, + { + "epoch": 6.3450244698205545, + "grad_norm": 0.00989916455000639, + "learning_rate": 0.0008629957204245555, + "loss": 0.0093, + "num_input_tokens_seen": 83971248, + "step": 38895 + }, + { + "epoch": 6.345840130505709, + "grad_norm": 0.03195362165570259, + "learning_rate": 0.000862946766116491, + "loss": 0.0155, + "num_input_tokens_seen": 83981584, + "step": 38900 + }, + { + "epoch": 6.346655791190865, + "grad_norm": 0.17813435196876526, + "learning_rate": 0.0008628978044528496, + "loss": 0.0464, + "num_input_tokens_seen": 83992112, + "step": 38905 + }, + { + "epoch": 6.34747145187602, + "grad_norm": 0.12388013303279877, + "learning_rate": 0.000862848835434624, + "loss": 0.0303, + "num_input_tokens_seen": 84003472, + "step": 38910 + }, + { + "epoch": 6.348287112561175, + "grad_norm": 0.019398987293243408, + "learning_rate": 0.0008627998590628065, + "loss": 0.1002, + "num_input_tokens_seen": 84013840, + "step": 38915 + }, + { + "epoch": 6.349102773246329, + "grad_norm": 0.0017847843701019883, + "learning_rate": 0.0008627508753383895, + "loss": 0.0324, + "num_input_tokens_seen": 84024912, + "step": 38920 + }, + { + "epoch": 6.349918433931484, + "grad_norm": 0.0012760453391820192, + "learning_rate": 0.0008627018842623657, + "loss": 0.1395, + "num_input_tokens_seen": 84035632, + "step": 38925 + }, + { + "epoch": 6.350734094616639, + "grad_norm": 0.19427572190761566, + "learning_rate": 0.0008626528858357283, + "loss": 0.068, + "num_input_tokens_seen": 84046448, + "step": 38930 + }, + { + "epoch": 6.351549755301795, + "grad_norm": 0.02943243272602558, + "learning_rate": 0.0008626038800594703, + "loss": 0.0339, + "num_input_tokens_seen": 84057648, + "step": 38935 + }, + { + "epoch": 6.35236541598695, + "grad_norm": 0.19797997176647186, + "learning_rate": 0.0008625548669345842, + "loss": 0.0918, + "num_input_tokens_seen": 84068880, + "step": 38940 + }, + { + "epoch": 6.353181076672104, + "grad_norm": 0.007800791412591934, + "learning_rate": 0.0008625058464620641, + "loss": 0.1267, + "num_input_tokens_seen": 84079696, + "step": 38945 + }, + { + "epoch": 6.353996737357259, + "grad_norm": 0.20053280889987946, + "learning_rate": 0.0008624568186429031, + "loss": 0.1412, + "num_input_tokens_seen": 84089456, + "step": 38950 + }, + { + "epoch": 6.354812398042414, + "grad_norm": 0.006726181134581566, + "learning_rate": 0.0008624077834780948, + "loss": 0.0121, + "num_input_tokens_seen": 84099440, + "step": 38955 + }, + { + "epoch": 6.35562805872757, + "grad_norm": 0.2376016080379486, + "learning_rate": 0.000862358740968633, + "loss": 0.224, + "num_input_tokens_seen": 84109232, + "step": 38960 + }, + { + "epoch": 6.356443719412725, + "grad_norm": 0.014485492371022701, + "learning_rate": 0.0008623096911155117, + "loss": 0.1066, + "num_input_tokens_seen": 84121712, + "step": 38965 + }, + { + "epoch": 6.357259380097879, + "grad_norm": 0.016468653455376625, + "learning_rate": 0.000862260633919725, + "loss": 0.0109, + "num_input_tokens_seen": 84132688, + "step": 38970 + }, + { + "epoch": 6.358075040783034, + "grad_norm": 0.16425646841526031, + "learning_rate": 0.0008622115693822668, + "loss": 0.065, + "num_input_tokens_seen": 84142544, + "step": 38975 + }, + { + "epoch": 6.358890701468189, + "grad_norm": 0.3058423101902008, + "learning_rate": 0.0008621624975041316, + "loss": 0.1592, + "num_input_tokens_seen": 84151312, + "step": 38980 + }, + { + "epoch": 6.359706362153344, + "grad_norm": 0.2886504828929901, + "learning_rate": 0.0008621134182863142, + "loss": 0.0944, + "num_input_tokens_seen": 84162512, + "step": 38985 + }, + { + "epoch": 6.3605220228384995, + "grad_norm": 0.09234227985143661, + "learning_rate": 0.0008620643317298088, + "loss": 0.0767, + "num_input_tokens_seen": 84174480, + "step": 38990 + }, + { + "epoch": 6.361337683523654, + "grad_norm": 0.034436311572790146, + "learning_rate": 0.0008620152378356105, + "loss": 0.0768, + "num_input_tokens_seen": 84185712, + "step": 38995 + }, + { + "epoch": 6.362153344208809, + "grad_norm": 0.004047236870974302, + "learning_rate": 0.0008619661366047141, + "loss": 0.119, + "num_input_tokens_seen": 84195824, + "step": 39000 + }, + { + "epoch": 6.362969004893964, + "grad_norm": 0.3343377113342285, + "learning_rate": 0.0008619170280381148, + "loss": 0.121, + "num_input_tokens_seen": 84207344, + "step": 39005 + }, + { + "epoch": 6.363784665579119, + "grad_norm": 0.008349894545972347, + "learning_rate": 0.0008618679121368078, + "loss": 0.0373, + "num_input_tokens_seen": 84218512, + "step": 39010 + }, + { + "epoch": 6.364600326264274, + "grad_norm": 0.02607981488108635, + "learning_rate": 0.0008618187889017886, + "loss": 0.0234, + "num_input_tokens_seen": 84229520, + "step": 39015 + }, + { + "epoch": 6.365415986949429, + "grad_norm": 0.0024944976903498173, + "learning_rate": 0.0008617696583340524, + "loss": 0.1428, + "num_input_tokens_seen": 84240752, + "step": 39020 + }, + { + "epoch": 6.366231647634584, + "grad_norm": 0.009041876532137394, + "learning_rate": 0.0008617205204345952, + "loss": 0.1183, + "num_input_tokens_seen": 84250224, + "step": 39025 + }, + { + "epoch": 6.367047308319739, + "grad_norm": 0.09398992359638214, + "learning_rate": 0.000861671375204413, + "loss": 0.0249, + "num_input_tokens_seen": 84260784, + "step": 39030 + }, + { + "epoch": 6.367862969004894, + "grad_norm": 0.032975707203149796, + "learning_rate": 0.0008616222226445014, + "loss": 0.0251, + "num_input_tokens_seen": 84272560, + "step": 39035 + }, + { + "epoch": 6.368678629690049, + "grad_norm": 0.03887435048818588, + "learning_rate": 0.0008615730627558566, + "loss": 0.017, + "num_input_tokens_seen": 84282640, + "step": 39040 + }, + { + "epoch": 6.369494290375204, + "grad_norm": 0.10948161780834198, + "learning_rate": 0.0008615238955394753, + "loss": 0.1848, + "num_input_tokens_seen": 84293936, + "step": 39045 + }, + { + "epoch": 6.370309951060359, + "grad_norm": 0.08079273998737335, + "learning_rate": 0.0008614747209963534, + "loss": 0.1038, + "num_input_tokens_seen": 84303376, + "step": 39050 + }, + { + "epoch": 6.371125611745514, + "grad_norm": 0.03941168636083603, + "learning_rate": 0.0008614255391274877, + "loss": 0.1291, + "num_input_tokens_seen": 84313968, + "step": 39055 + }, + { + "epoch": 6.371941272430669, + "grad_norm": 0.09666110575199127, + "learning_rate": 0.0008613763499338751, + "loss": 0.1316, + "num_input_tokens_seen": 84325520, + "step": 39060 + }, + { + "epoch": 6.372756933115824, + "grad_norm": 0.01037772186100483, + "learning_rate": 0.0008613271534165121, + "loss": 0.0596, + "num_input_tokens_seen": 84336496, + "step": 39065 + }, + { + "epoch": 6.373572593800978, + "grad_norm": 0.009522153064608574, + "learning_rate": 0.0008612779495763963, + "loss": 0.0367, + "num_input_tokens_seen": 84346928, + "step": 39070 + }, + { + "epoch": 6.374388254486134, + "grad_norm": 0.017617691308259964, + "learning_rate": 0.0008612287384145243, + "loss": 0.035, + "num_input_tokens_seen": 84358032, + "step": 39075 + }, + { + "epoch": 6.375203915171289, + "grad_norm": 0.0520947240293026, + "learning_rate": 0.0008611795199318937, + "loss": 0.0263, + "num_input_tokens_seen": 84368816, + "step": 39080 + }, + { + "epoch": 6.376019575856444, + "grad_norm": 0.006892753764986992, + "learning_rate": 0.000861130294129502, + "loss": 0.0752, + "num_input_tokens_seen": 84378800, + "step": 39085 + }, + { + "epoch": 6.376835236541599, + "grad_norm": 0.006792945321649313, + "learning_rate": 0.0008610810610083466, + "loss": 0.0214, + "num_input_tokens_seen": 84389584, + "step": 39090 + }, + { + "epoch": 6.377650897226753, + "grad_norm": 0.025548307225108147, + "learning_rate": 0.0008610318205694256, + "loss": 0.1759, + "num_input_tokens_seen": 84398768, + "step": 39095 + }, + { + "epoch": 6.378466557911908, + "grad_norm": 0.013903504237532616, + "learning_rate": 0.0008609825728137366, + "loss": 0.0242, + "num_input_tokens_seen": 84408464, + "step": 39100 + }, + { + "epoch": 6.379282218597064, + "grad_norm": 0.0026127162855118513, + "learning_rate": 0.000860933317742278, + "loss": 0.0191, + "num_input_tokens_seen": 84419600, + "step": 39105 + }, + { + "epoch": 6.380097879282219, + "grad_norm": 0.181004598736763, + "learning_rate": 0.0008608840553560478, + "loss": 0.0736, + "num_input_tokens_seen": 84428784, + "step": 39110 + }, + { + "epoch": 6.3809135399673735, + "grad_norm": 0.003353741252794862, + "learning_rate": 0.0008608347856560443, + "loss": 0.0197, + "num_input_tokens_seen": 84439696, + "step": 39115 + }, + { + "epoch": 6.381729200652528, + "grad_norm": 0.015108766034245491, + "learning_rate": 0.0008607855086432663, + "loss": 0.0175, + "num_input_tokens_seen": 84451280, + "step": 39120 + }, + { + "epoch": 6.382544861337683, + "grad_norm": 0.1941739022731781, + "learning_rate": 0.0008607362243187121, + "loss": 0.1254, + "num_input_tokens_seen": 84462512, + "step": 39125 + }, + { + "epoch": 6.383360522022839, + "grad_norm": 0.04804662615060806, + "learning_rate": 0.0008606869326833809, + "loss": 0.0177, + "num_input_tokens_seen": 84473616, + "step": 39130 + }, + { + "epoch": 6.384176182707994, + "grad_norm": 0.023006822913885117, + "learning_rate": 0.0008606376337382711, + "loss": 0.0981, + "num_input_tokens_seen": 84485776, + "step": 39135 + }, + { + "epoch": 6.3849918433931485, + "grad_norm": 0.04407806321978569, + "learning_rate": 0.0008605883274843824, + "loss": 0.0151, + "num_input_tokens_seen": 84498064, + "step": 39140 + }, + { + "epoch": 6.385807504078303, + "grad_norm": 0.005771650932729244, + "learning_rate": 0.0008605390139227137, + "loss": 0.1724, + "num_input_tokens_seen": 84509296, + "step": 39145 + }, + { + "epoch": 6.386623164763458, + "grad_norm": 0.17830881476402283, + "learning_rate": 0.0008604896930542645, + "loss": 0.0485, + "num_input_tokens_seen": 84519888, + "step": 39150 + }, + { + "epoch": 6.387438825448613, + "grad_norm": 0.006686089094728231, + "learning_rate": 0.0008604403648800346, + "loss": 0.1836, + "num_input_tokens_seen": 84530352, + "step": 39155 + }, + { + "epoch": 6.388254486133769, + "grad_norm": 0.013637065887451172, + "learning_rate": 0.0008603910294010231, + "loss": 0.1193, + "num_input_tokens_seen": 84540912, + "step": 39160 + }, + { + "epoch": 6.3890701468189235, + "grad_norm": 0.026765035465359688, + "learning_rate": 0.0008603416866182305, + "loss": 0.0151, + "num_input_tokens_seen": 84551760, + "step": 39165 + }, + { + "epoch": 6.389885807504078, + "grad_norm": 0.031480468809604645, + "learning_rate": 0.0008602923365326563, + "loss": 0.0439, + "num_input_tokens_seen": 84562960, + "step": 39170 + }, + { + "epoch": 6.390701468189233, + "grad_norm": 0.1783287078142166, + "learning_rate": 0.000860242979145301, + "loss": 0.1165, + "num_input_tokens_seen": 84573264, + "step": 39175 + }, + { + "epoch": 6.391517128874388, + "grad_norm": 0.020328661426901817, + "learning_rate": 0.0008601936144571646, + "loss": 0.0121, + "num_input_tokens_seen": 84583632, + "step": 39180 + }, + { + "epoch": 6.392332789559543, + "grad_norm": 0.14400437474250793, + "learning_rate": 0.0008601442424692476, + "loss": 0.2935, + "num_input_tokens_seen": 84594960, + "step": 39185 + }, + { + "epoch": 6.3931484502446985, + "grad_norm": 0.017621317878365517, + "learning_rate": 0.0008600948631825508, + "loss": 0.0373, + "num_input_tokens_seen": 84604912, + "step": 39190 + }, + { + "epoch": 6.393964110929853, + "grad_norm": 0.026853395625948906, + "learning_rate": 0.0008600454765980747, + "loss": 0.1521, + "num_input_tokens_seen": 84615344, + "step": 39195 + }, + { + "epoch": 6.394779771615008, + "grad_norm": 0.19105254113674164, + "learning_rate": 0.0008599960827168204, + "loss": 0.134, + "num_input_tokens_seen": 84625744, + "step": 39200 + }, + { + "epoch": 6.395595432300163, + "grad_norm": 0.17194747924804688, + "learning_rate": 0.0008599466815397886, + "loss": 0.0494, + "num_input_tokens_seen": 84636784, + "step": 39205 + }, + { + "epoch": 6.396411092985318, + "grad_norm": 0.12843531370162964, + "learning_rate": 0.0008598972730679809, + "loss": 0.1206, + "num_input_tokens_seen": 84648656, + "step": 39210 + }, + { + "epoch": 6.397226753670473, + "grad_norm": 0.020910130813717842, + "learning_rate": 0.0008598478573023982, + "loss": 0.0358, + "num_input_tokens_seen": 84659728, + "step": 39215 + }, + { + "epoch": 6.398042414355628, + "grad_norm": 0.023661211133003235, + "learning_rate": 0.0008597984342440421, + "loss": 0.0333, + "num_input_tokens_seen": 84670672, + "step": 39220 + }, + { + "epoch": 6.398858075040783, + "grad_norm": 0.20780543982982635, + "learning_rate": 0.0008597490038939145, + "loss": 0.1166, + "num_input_tokens_seen": 84679952, + "step": 39225 + }, + { + "epoch": 6.399673735725938, + "grad_norm": 0.013756867498159409, + "learning_rate": 0.0008596995662530169, + "loss": 0.0254, + "num_input_tokens_seen": 84691024, + "step": 39230 + }, + { + "epoch": 6.400489396411093, + "grad_norm": 0.019880151376128197, + "learning_rate": 0.0008596501213223514, + "loss": 0.055, + "num_input_tokens_seen": 84700976, + "step": 39235 + }, + { + "epoch": 6.401305057096248, + "grad_norm": 0.008477531373500824, + "learning_rate": 0.0008596006691029196, + "loss": 0.0237, + "num_input_tokens_seen": 84710896, + "step": 39240 + }, + { + "epoch": 6.402120717781403, + "grad_norm": 0.0896444320678711, + "learning_rate": 0.0008595512095957244, + "loss": 0.0701, + "num_input_tokens_seen": 84722448, + "step": 39245 + }, + { + "epoch": 6.402936378466558, + "grad_norm": 0.29478490352630615, + "learning_rate": 0.0008595017428017677, + "loss": 0.2549, + "num_input_tokens_seen": 84733168, + "step": 39250 + }, + { + "epoch": 6.403752039151713, + "grad_norm": 0.11170192807912827, + "learning_rate": 0.000859452268722052, + "loss": 0.0411, + "num_input_tokens_seen": 84744176, + "step": 39255 + }, + { + "epoch": 6.404567699836868, + "grad_norm": 0.18062400817871094, + "learning_rate": 0.0008594027873575803, + "loss": 0.0192, + "num_input_tokens_seen": 84754704, + "step": 39260 + }, + { + "epoch": 6.4053833605220225, + "grad_norm": 0.028174523264169693, + "learning_rate": 0.0008593532987093551, + "loss": 0.0964, + "num_input_tokens_seen": 84765584, + "step": 39265 + }, + { + "epoch": 6.406199021207178, + "grad_norm": 0.07922739535570145, + "learning_rate": 0.0008593038027783793, + "loss": 0.0332, + "num_input_tokens_seen": 84776144, + "step": 39270 + }, + { + "epoch": 6.407014681892333, + "grad_norm": 0.021971506997942924, + "learning_rate": 0.0008592542995656563, + "loss": 0.1141, + "num_input_tokens_seen": 84785488, + "step": 39275 + }, + { + "epoch": 6.407830342577488, + "grad_norm": 0.15919260680675507, + "learning_rate": 0.000859204789072189, + "loss": 0.1058, + "num_input_tokens_seen": 84796080, + "step": 39280 + }, + { + "epoch": 6.408646003262643, + "grad_norm": 0.027237551286816597, + "learning_rate": 0.0008591552712989812, + "loss": 0.1394, + "num_input_tokens_seen": 84805520, + "step": 39285 + }, + { + "epoch": 6.4094616639477975, + "grad_norm": 0.04617493227124214, + "learning_rate": 0.0008591057462470359, + "loss": 0.0504, + "num_input_tokens_seen": 84816464, + "step": 39290 + }, + { + "epoch": 6.410277324632952, + "grad_norm": 0.010178613476455212, + "learning_rate": 0.0008590562139173573, + "loss": 0.2221, + "num_input_tokens_seen": 84826640, + "step": 39295 + }, + { + "epoch": 6.411092985318108, + "grad_norm": 0.2511279284954071, + "learning_rate": 0.000859006674310949, + "loss": 0.1892, + "num_input_tokens_seen": 84836304, + "step": 39300 + }, + { + "epoch": 6.411908646003263, + "grad_norm": 0.014409887604415417, + "learning_rate": 0.000858957127428815, + "loss": 0.0242, + "num_input_tokens_seen": 84847280, + "step": 39305 + }, + { + "epoch": 6.412724306688418, + "grad_norm": 0.026141835376620293, + "learning_rate": 0.0008589075732719594, + "loss": 0.1097, + "num_input_tokens_seen": 84859696, + "step": 39310 + }, + { + "epoch": 6.4135399673735725, + "grad_norm": 0.029156722128391266, + "learning_rate": 0.0008588580118413867, + "loss": 0.0784, + "num_input_tokens_seen": 84870576, + "step": 39315 + }, + { + "epoch": 6.414355628058727, + "grad_norm": 0.005470012314617634, + "learning_rate": 0.0008588084431381009, + "loss": 0.0733, + "num_input_tokens_seen": 84882224, + "step": 39320 + }, + { + "epoch": 6.415171288743883, + "grad_norm": 0.1792827546596527, + "learning_rate": 0.000858758867163107, + "loss": 0.0949, + "num_input_tokens_seen": 84892496, + "step": 39325 + }, + { + "epoch": 6.415986949429038, + "grad_norm": 0.1913321167230606, + "learning_rate": 0.0008587092839174096, + "loss": 0.1978, + "num_input_tokens_seen": 84903440, + "step": 39330 + }, + { + "epoch": 6.416802610114193, + "grad_norm": 0.15446220338344574, + "learning_rate": 0.0008586596934020132, + "loss": 0.0519, + "num_input_tokens_seen": 84914608, + "step": 39335 + }, + { + "epoch": 6.417618270799347, + "grad_norm": 0.0651579350233078, + "learning_rate": 0.0008586100956179234, + "loss": 0.0243, + "num_input_tokens_seen": 84926768, + "step": 39340 + }, + { + "epoch": 6.418433931484502, + "grad_norm": 0.19779840111732483, + "learning_rate": 0.000858560490566145, + "loss": 0.0624, + "num_input_tokens_seen": 84937488, + "step": 39345 + }, + { + "epoch": 6.419249592169657, + "grad_norm": 0.002062238985672593, + "learning_rate": 0.0008585108782476834, + "loss": 0.0669, + "num_input_tokens_seen": 84949840, + "step": 39350 + }, + { + "epoch": 6.420065252854813, + "grad_norm": 0.08052044361829758, + "learning_rate": 0.000858461258663544, + "loss": 0.1014, + "num_input_tokens_seen": 84959088, + "step": 39355 + }, + { + "epoch": 6.420880913539968, + "grad_norm": 0.007013132330030203, + "learning_rate": 0.0008584116318147324, + "loss": 0.0275, + "num_input_tokens_seen": 84969232, + "step": 39360 + }, + { + "epoch": 6.421696574225122, + "grad_norm": 0.004674549214541912, + "learning_rate": 0.0008583619977022546, + "loss": 0.1164, + "num_input_tokens_seen": 84981488, + "step": 39365 + }, + { + "epoch": 6.422512234910277, + "grad_norm": 0.03125779330730438, + "learning_rate": 0.000858312356327116, + "loss": 0.149, + "num_input_tokens_seen": 84992688, + "step": 39370 + }, + { + "epoch": 6.423327895595432, + "grad_norm": 0.015671614557504654, + "learning_rate": 0.0008582627076903232, + "loss": 0.0178, + "num_input_tokens_seen": 85004144, + "step": 39375 + }, + { + "epoch": 6.424143556280587, + "grad_norm": 0.38116341829299927, + "learning_rate": 0.0008582130517928821, + "loss": 0.1197, + "num_input_tokens_seen": 85015408, + "step": 39380 + }, + { + "epoch": 6.424959216965743, + "grad_norm": 0.08985337615013123, + "learning_rate": 0.000858163388635799, + "loss": 0.055, + "num_input_tokens_seen": 85026064, + "step": 39385 + }, + { + "epoch": 6.425774877650897, + "grad_norm": 0.06867845356464386, + "learning_rate": 0.0008581137182200806, + "loss": 0.0886, + "num_input_tokens_seen": 85037232, + "step": 39390 + }, + { + "epoch": 6.426590538336052, + "grad_norm": 0.23772414028644562, + "learning_rate": 0.0008580640405467333, + "loss": 0.1328, + "num_input_tokens_seen": 85047888, + "step": 39395 + }, + { + "epoch": 6.427406199021207, + "grad_norm": 0.051451675593853, + "learning_rate": 0.0008580143556167638, + "loss": 0.1355, + "num_input_tokens_seen": 85058768, + "step": 39400 + }, + { + "epoch": 6.428221859706362, + "grad_norm": 0.011715248227119446, + "learning_rate": 0.0008579646634311795, + "loss": 0.0151, + "num_input_tokens_seen": 85069296, + "step": 39405 + }, + { + "epoch": 6.4290375203915175, + "grad_norm": 0.011284816078841686, + "learning_rate": 0.0008579149639909872, + "loss": 0.0821, + "num_input_tokens_seen": 85080528, + "step": 39410 + }, + { + "epoch": 6.429853181076672, + "grad_norm": 0.01225210726261139, + "learning_rate": 0.0008578652572971939, + "loss": 0.1147, + "num_input_tokens_seen": 85091024, + "step": 39415 + }, + { + "epoch": 6.430668841761827, + "grad_norm": 0.029842525720596313, + "learning_rate": 0.0008578155433508073, + "loss": 0.1005, + "num_input_tokens_seen": 85100656, + "step": 39420 + }, + { + "epoch": 6.431484502446982, + "grad_norm": 0.00458231708034873, + "learning_rate": 0.0008577658221528349, + "loss": 0.0187, + "num_input_tokens_seen": 85111856, + "step": 39425 + }, + { + "epoch": 6.432300163132137, + "grad_norm": 0.05453876778483391, + "learning_rate": 0.000857716093704284, + "loss": 0.1264, + "num_input_tokens_seen": 85120944, + "step": 39430 + }, + { + "epoch": 6.433115823817292, + "grad_norm": 0.16718176007270813, + "learning_rate": 0.0008576663580061628, + "loss": 0.0522, + "num_input_tokens_seen": 85131440, + "step": 39435 + }, + { + "epoch": 6.433931484502447, + "grad_norm": 0.028926821425557137, + "learning_rate": 0.0008576166150594792, + "loss": 0.0918, + "num_input_tokens_seen": 85140720, + "step": 39440 + }, + { + "epoch": 6.434747145187602, + "grad_norm": 0.05886692926287651, + "learning_rate": 0.0008575668648652411, + "loss": 0.1663, + "num_input_tokens_seen": 85151152, + "step": 39445 + }, + { + "epoch": 6.435562805872757, + "grad_norm": 0.23914267122745514, + "learning_rate": 0.0008575171074244568, + "loss": 0.0942, + "num_input_tokens_seen": 85161392, + "step": 39450 + }, + { + "epoch": 6.436378466557912, + "grad_norm": 0.02947208471596241, + "learning_rate": 0.000857467342738135, + "loss": 0.0513, + "num_input_tokens_seen": 85172656, + "step": 39455 + }, + { + "epoch": 6.437194127243067, + "grad_norm": 0.02373735047876835, + "learning_rate": 0.000857417570807284, + "loss": 0.0274, + "num_input_tokens_seen": 85184784, + "step": 39460 + }, + { + "epoch": 6.438009787928221, + "grad_norm": 0.10858530551195145, + "learning_rate": 0.0008573677916329124, + "loss": 0.0937, + "num_input_tokens_seen": 85195248, + "step": 39465 + }, + { + "epoch": 6.438825448613377, + "grad_norm": 0.021741973236203194, + "learning_rate": 0.0008573180052160291, + "loss": 0.0388, + "num_input_tokens_seen": 85205040, + "step": 39470 + }, + { + "epoch": 6.439641109298532, + "grad_norm": 0.02645989879965782, + "learning_rate": 0.0008572682115576433, + "loss": 0.0257, + "num_input_tokens_seen": 85215344, + "step": 39475 + }, + { + "epoch": 6.440456769983687, + "grad_norm": 0.1932060569524765, + "learning_rate": 0.0008572184106587638, + "loss": 0.0982, + "num_input_tokens_seen": 85226352, + "step": 39480 + }, + { + "epoch": 6.441272430668842, + "grad_norm": 0.006832319777458906, + "learning_rate": 0.0008571686025204002, + "loss": 0.0243, + "num_input_tokens_seen": 85235408, + "step": 39485 + }, + { + "epoch": 6.442088091353996, + "grad_norm": 0.008539782837033272, + "learning_rate": 0.0008571187871435616, + "loss": 0.0894, + "num_input_tokens_seen": 85246352, + "step": 39490 + }, + { + "epoch": 6.442903752039152, + "grad_norm": 0.11443430185317993, + "learning_rate": 0.0008570689645292579, + "loss": 0.1468, + "num_input_tokens_seen": 85256368, + "step": 39495 + }, + { + "epoch": 6.443719412724307, + "grad_norm": 0.040839746594429016, + "learning_rate": 0.0008570191346784986, + "loss": 0.1033, + "num_input_tokens_seen": 85267856, + "step": 39500 + }, + { + "epoch": 6.444535073409462, + "grad_norm": 0.037189751863479614, + "learning_rate": 0.0008569692975922935, + "loss": 0.0761, + "num_input_tokens_seen": 85280112, + "step": 39505 + }, + { + "epoch": 6.445350734094617, + "grad_norm": 0.009280839003622532, + "learning_rate": 0.0008569194532716529, + "loss": 0.0302, + "num_input_tokens_seen": 85288688, + "step": 39510 + }, + { + "epoch": 6.446166394779771, + "grad_norm": 0.05105043575167656, + "learning_rate": 0.0008568696017175868, + "loss": 0.0845, + "num_input_tokens_seen": 85299888, + "step": 39515 + }, + { + "epoch": 6.446982055464926, + "grad_norm": 0.014433260075747967, + "learning_rate": 0.0008568197429311054, + "loss": 0.0289, + "num_input_tokens_seen": 85310128, + "step": 39520 + }, + { + "epoch": 6.447797716150082, + "grad_norm": 0.01801239885389805, + "learning_rate": 0.0008567698769132193, + "loss": 0.0192, + "num_input_tokens_seen": 85321072, + "step": 39525 + }, + { + "epoch": 6.448613376835237, + "grad_norm": 0.22713717818260193, + "learning_rate": 0.0008567200036649391, + "loss": 0.062, + "num_input_tokens_seen": 85333008, + "step": 39530 + }, + { + "epoch": 6.4494290375203915, + "grad_norm": 0.028388522565364838, + "learning_rate": 0.0008566701231872753, + "loss": 0.0163, + "num_input_tokens_seen": 85344048, + "step": 39535 + }, + { + "epoch": 6.450244698205546, + "grad_norm": 0.02563036046922207, + "learning_rate": 0.0008566202354812392, + "loss": 0.1156, + "num_input_tokens_seen": 85353136, + "step": 39540 + }, + { + "epoch": 6.451060358890701, + "grad_norm": 0.2910009026527405, + "learning_rate": 0.0008565703405478415, + "loss": 0.0727, + "num_input_tokens_seen": 85364016, + "step": 39545 + }, + { + "epoch": 6.451876019575856, + "grad_norm": 0.21083374321460724, + "learning_rate": 0.0008565204383880937, + "loss": 0.1, + "num_input_tokens_seen": 85374928, + "step": 39550 + }, + { + "epoch": 6.452691680261012, + "grad_norm": 0.09616486728191376, + "learning_rate": 0.0008564705290030068, + "loss": 0.0662, + "num_input_tokens_seen": 85383888, + "step": 39555 + }, + { + "epoch": 6.4535073409461665, + "grad_norm": 0.22552363574504852, + "learning_rate": 0.0008564206123935924, + "loss": 0.2222, + "num_input_tokens_seen": 85394736, + "step": 39560 + }, + { + "epoch": 6.454323001631321, + "grad_norm": 0.006094069220125675, + "learning_rate": 0.0008563706885608622, + "loss": 0.1781, + "num_input_tokens_seen": 85405232, + "step": 39565 + }, + { + "epoch": 6.455138662316476, + "grad_norm": 0.03274981677532196, + "learning_rate": 0.0008563207575058279, + "loss": 0.0419, + "num_input_tokens_seen": 85416176, + "step": 39570 + }, + { + "epoch": 6.455954323001631, + "grad_norm": 0.14462244510650635, + "learning_rate": 0.0008562708192295012, + "loss": 0.0761, + "num_input_tokens_seen": 85426800, + "step": 39575 + }, + { + "epoch": 6.456769983686787, + "grad_norm": 0.03904232755303383, + "learning_rate": 0.0008562208737328947, + "loss": 0.0379, + "num_input_tokens_seen": 85437520, + "step": 39580 + }, + { + "epoch": 6.4575856443719415, + "grad_norm": 0.023389481008052826, + "learning_rate": 0.0008561709210170201, + "loss": 0.0586, + "num_input_tokens_seen": 85447888, + "step": 39585 + }, + { + "epoch": 6.458401305057096, + "grad_norm": 0.004195408895611763, + "learning_rate": 0.00085612096108289, + "loss": 0.0387, + "num_input_tokens_seen": 85459344, + "step": 39590 + }, + { + "epoch": 6.459216965742251, + "grad_norm": 0.24529923498630524, + "learning_rate": 0.0008560709939315169, + "loss": 0.1445, + "num_input_tokens_seen": 85469744, + "step": 39595 + }, + { + "epoch": 6.460032626427406, + "grad_norm": 0.09731040149927139, + "learning_rate": 0.0008560210195639133, + "loss": 0.0638, + "num_input_tokens_seen": 85481168, + "step": 39600 + }, + { + "epoch": 6.460848287112561, + "grad_norm": 0.030531220138072968, + "learning_rate": 0.0008559710379810922, + "loss": 0.0423, + "num_input_tokens_seen": 85492976, + "step": 39605 + }, + { + "epoch": 6.4616639477977165, + "grad_norm": 0.04872620850801468, + "learning_rate": 0.0008559210491840664, + "loss": 0.0904, + "num_input_tokens_seen": 85502896, + "step": 39610 + }, + { + "epoch": 6.462479608482871, + "grad_norm": 0.027873797342181206, + "learning_rate": 0.0008558710531738489, + "loss": 0.1038, + "num_input_tokens_seen": 85513680, + "step": 39615 + }, + { + "epoch": 6.463295269168026, + "grad_norm": 0.0021757674403488636, + "learning_rate": 0.0008558210499514532, + "loss": 0.1837, + "num_input_tokens_seen": 85524784, + "step": 39620 + }, + { + "epoch": 6.464110929853181, + "grad_norm": 0.16763927042484283, + "learning_rate": 0.0008557710395178926, + "loss": 0.1349, + "num_input_tokens_seen": 85536208, + "step": 39625 + }, + { + "epoch": 6.464926590538336, + "grad_norm": 0.03903459012508392, + "learning_rate": 0.0008557210218741805, + "loss": 0.0546, + "num_input_tokens_seen": 85545936, + "step": 39630 + }, + { + "epoch": 6.465742251223491, + "grad_norm": 0.005725593771785498, + "learning_rate": 0.0008556709970213305, + "loss": 0.0453, + "num_input_tokens_seen": 85556080, + "step": 39635 + }, + { + "epoch": 6.466557911908646, + "grad_norm": 0.4059804081916809, + "learning_rate": 0.0008556209649603566, + "loss": 0.2207, + "num_input_tokens_seen": 85565840, + "step": 39640 + }, + { + "epoch": 6.467373572593801, + "grad_norm": 0.01835477352142334, + "learning_rate": 0.0008555709256922728, + "loss": 0.1863, + "num_input_tokens_seen": 85576912, + "step": 39645 + }, + { + "epoch": 6.468189233278956, + "grad_norm": 0.00645184563472867, + "learning_rate": 0.0008555208792180931, + "loss": 0.0612, + "num_input_tokens_seen": 85587760, + "step": 39650 + }, + { + "epoch": 6.469004893964111, + "grad_norm": 0.003989835735410452, + "learning_rate": 0.0008554708255388317, + "loss": 0.0465, + "num_input_tokens_seen": 85597552, + "step": 39655 + }, + { + "epoch": 6.4698205546492655, + "grad_norm": 0.19041182100772858, + "learning_rate": 0.0008554207646555032, + "loss": 0.0764, + "num_input_tokens_seen": 85608368, + "step": 39660 + }, + { + "epoch": 6.470636215334421, + "grad_norm": 0.0029367648530751467, + "learning_rate": 0.0008553706965691218, + "loss": 0.044, + "num_input_tokens_seen": 85618096, + "step": 39665 + }, + { + "epoch": 6.471451876019576, + "grad_norm": 0.015094537287950516, + "learning_rate": 0.0008553206212807026, + "loss": 0.0212, + "num_input_tokens_seen": 85628656, + "step": 39670 + }, + { + "epoch": 6.472267536704731, + "grad_norm": 0.11712267249822617, + "learning_rate": 0.0008552705387912602, + "loss": 0.0392, + "num_input_tokens_seen": 85639632, + "step": 39675 + }, + { + "epoch": 6.473083197389886, + "grad_norm": 0.005496603436768055, + "learning_rate": 0.0008552204491018096, + "loss": 0.1031, + "num_input_tokens_seen": 85650832, + "step": 39680 + }, + { + "epoch": 6.4738988580750405, + "grad_norm": 0.04186651110649109, + "learning_rate": 0.000855170352213366, + "loss": 0.1051, + "num_input_tokens_seen": 85661296, + "step": 39685 + }, + { + "epoch": 6.474714518760196, + "grad_norm": 0.01675696298480034, + "learning_rate": 0.0008551202481269446, + "loss": 0.0122, + "num_input_tokens_seen": 85672368, + "step": 39690 + }, + { + "epoch": 6.475530179445351, + "grad_norm": 0.002059215446934104, + "learning_rate": 0.000855070136843561, + "loss": 0.0466, + "num_input_tokens_seen": 85682160, + "step": 39695 + }, + { + "epoch": 6.476345840130506, + "grad_norm": 0.253680557012558, + "learning_rate": 0.0008550200183642304, + "loss": 0.076, + "num_input_tokens_seen": 85692656, + "step": 39700 + }, + { + "epoch": 6.477161500815661, + "grad_norm": 0.21408464014530182, + "learning_rate": 0.000854969892689969, + "loss": 0.1869, + "num_input_tokens_seen": 85703600, + "step": 39705 + }, + { + "epoch": 6.4779771615008155, + "grad_norm": 0.28115952014923096, + "learning_rate": 0.0008549197598217923, + "loss": 0.0352, + "num_input_tokens_seen": 85714512, + "step": 39710 + }, + { + "epoch": 6.47879282218597, + "grad_norm": 0.13815860450267792, + "learning_rate": 0.0008548696197607165, + "loss": 0.2136, + "num_input_tokens_seen": 85723888, + "step": 39715 + }, + { + "epoch": 6.479608482871126, + "grad_norm": 0.7359482049942017, + "learning_rate": 0.0008548194725077576, + "loss": 0.0683, + "num_input_tokens_seen": 85734992, + "step": 39720 + }, + { + "epoch": 6.480424143556281, + "grad_norm": 0.02713259495794773, + "learning_rate": 0.000854769318063932, + "loss": 0.0155, + "num_input_tokens_seen": 85746768, + "step": 39725 + }, + { + "epoch": 6.481239804241436, + "grad_norm": 0.0951751172542572, + "learning_rate": 0.0008547191564302561, + "loss": 0.0337, + "num_input_tokens_seen": 85758352, + "step": 39730 + }, + { + "epoch": 6.4820554649265905, + "grad_norm": 0.007417720276862383, + "learning_rate": 0.0008546689876077464, + "loss": 0.0126, + "num_input_tokens_seen": 85769424, + "step": 39735 + }, + { + "epoch": 6.482871125611745, + "grad_norm": 0.4054908752441406, + "learning_rate": 0.0008546188115974198, + "loss": 0.1195, + "num_input_tokens_seen": 85780816, + "step": 39740 + }, + { + "epoch": 6.4836867862969, + "grad_norm": 0.0026738743763417006, + "learning_rate": 0.0008545686284002932, + "loss": 0.1557, + "num_input_tokens_seen": 85792528, + "step": 39745 + }, + { + "epoch": 6.484502446982056, + "grad_norm": 0.012411831878125668, + "learning_rate": 0.0008545184380173835, + "loss": 0.0268, + "num_input_tokens_seen": 85802448, + "step": 39750 + }, + { + "epoch": 6.485318107667211, + "grad_norm": 0.10611975938081741, + "learning_rate": 0.0008544682404497079, + "loss": 0.0301, + "num_input_tokens_seen": 85813104, + "step": 39755 + }, + { + "epoch": 6.486133768352365, + "grad_norm": 0.17993135750293732, + "learning_rate": 0.0008544180356982838, + "loss": 0.1821, + "num_input_tokens_seen": 85823248, + "step": 39760 + }, + { + "epoch": 6.48694942903752, + "grad_norm": 0.08196520805358887, + "learning_rate": 0.0008543678237641284, + "loss": 0.3076, + "num_input_tokens_seen": 85834384, + "step": 39765 + }, + { + "epoch": 6.487765089722675, + "grad_norm": 0.22592702507972717, + "learning_rate": 0.0008543176046482597, + "loss": 0.1097, + "num_input_tokens_seen": 85844848, + "step": 39770 + }, + { + "epoch": 6.488580750407831, + "grad_norm": 0.14438214898109436, + "learning_rate": 0.0008542673783516952, + "loss": 0.0633, + "num_input_tokens_seen": 85856528, + "step": 39775 + }, + { + "epoch": 6.489396411092986, + "grad_norm": 0.012656026519834995, + "learning_rate": 0.0008542171448754528, + "loss": 0.0117, + "num_input_tokens_seen": 85868112, + "step": 39780 + }, + { + "epoch": 6.49021207177814, + "grad_norm": 0.09305392950773239, + "learning_rate": 0.0008541669042205507, + "loss": 0.157, + "num_input_tokens_seen": 85878256, + "step": 39785 + }, + { + "epoch": 6.491027732463295, + "grad_norm": 0.009571591392159462, + "learning_rate": 0.0008541166563880069, + "loss": 0.0375, + "num_input_tokens_seen": 85887440, + "step": 39790 + }, + { + "epoch": 6.49184339314845, + "grad_norm": 0.125692680478096, + "learning_rate": 0.00085406640137884, + "loss": 0.059, + "num_input_tokens_seen": 85897872, + "step": 39795 + }, + { + "epoch": 6.492659053833605, + "grad_norm": 0.012961071915924549, + "learning_rate": 0.0008540161391940681, + "loss": 0.035, + "num_input_tokens_seen": 85909776, + "step": 39800 + }, + { + "epoch": 6.493474714518761, + "grad_norm": 0.17926137149333954, + "learning_rate": 0.0008539658698347102, + "loss": 0.14, + "num_input_tokens_seen": 85919984, + "step": 39805 + }, + { + "epoch": 6.494290375203915, + "grad_norm": 0.028648260980844498, + "learning_rate": 0.0008539155933017848, + "loss": 0.0485, + "num_input_tokens_seen": 85931632, + "step": 39810 + }, + { + "epoch": 6.49510603588907, + "grad_norm": 0.04053341597318649, + "learning_rate": 0.0008538653095963109, + "loss": 0.0208, + "num_input_tokens_seen": 85944176, + "step": 39815 + }, + { + "epoch": 6.495921696574225, + "grad_norm": 0.029405100271105766, + "learning_rate": 0.0008538150187193076, + "loss": 0.0183, + "num_input_tokens_seen": 85956016, + "step": 39820 + }, + { + "epoch": 6.49673735725938, + "grad_norm": 0.054688580334186554, + "learning_rate": 0.0008537647206717942, + "loss": 0.0233, + "num_input_tokens_seen": 85966704, + "step": 39825 + }, + { + "epoch": 6.497553017944535, + "grad_norm": 0.0028236224316060543, + "learning_rate": 0.00085371441545479, + "loss": 0.2684, + "num_input_tokens_seen": 85976048, + "step": 39830 + }, + { + "epoch": 6.49836867862969, + "grad_norm": 0.017597002908587456, + "learning_rate": 0.0008536641030693143, + "loss": 0.1292, + "num_input_tokens_seen": 85985424, + "step": 39835 + }, + { + "epoch": 6.499184339314845, + "grad_norm": 0.01017941627651453, + "learning_rate": 0.000853613783516387, + "loss": 0.0552, + "num_input_tokens_seen": 85996976, + "step": 39840 + }, + { + "epoch": 6.5, + "grad_norm": 0.003783297026529908, + "learning_rate": 0.0008535634567970277, + "loss": 0.0192, + "num_input_tokens_seen": 86008784, + "step": 39845 + }, + { + "epoch": 6.500815660685155, + "grad_norm": 0.1511714607477188, + "learning_rate": 0.0008535131229122565, + "loss": 0.0775, + "num_input_tokens_seen": 86019568, + "step": 39850 + }, + { + "epoch": 6.50163132137031, + "grad_norm": 0.22698621451854706, + "learning_rate": 0.0008534627818630933, + "loss": 0.0848, + "num_input_tokens_seen": 86030448, + "step": 39855 + }, + { + "epoch": 6.502446982055465, + "grad_norm": 0.21191661059856415, + "learning_rate": 0.0008534124336505585, + "loss": 0.1657, + "num_input_tokens_seen": 86041040, + "step": 39860 + }, + { + "epoch": 6.50326264274062, + "grad_norm": 0.016783330589532852, + "learning_rate": 0.0008533620782756724, + "loss": 0.0128, + "num_input_tokens_seen": 86052624, + "step": 39865 + }, + { + "epoch": 6.504078303425775, + "grad_norm": 0.02929145097732544, + "learning_rate": 0.0008533117157394556, + "loss": 0.1055, + "num_input_tokens_seen": 86063472, + "step": 39870 + }, + { + "epoch": 6.50489396411093, + "grad_norm": 0.0052117579616606236, + "learning_rate": 0.0008532613460429285, + "loss": 0.0429, + "num_input_tokens_seen": 86074320, + "step": 39875 + }, + { + "epoch": 6.505709624796085, + "grad_norm": 0.007702103350311518, + "learning_rate": 0.0008532109691871122, + "loss": 0.0185, + "num_input_tokens_seen": 86086320, + "step": 39880 + }, + { + "epoch": 6.506525285481239, + "grad_norm": 0.011785144917666912, + "learning_rate": 0.0008531605851730275, + "loss": 0.1633, + "num_input_tokens_seen": 86098000, + "step": 39885 + }, + { + "epoch": 6.507340946166395, + "grad_norm": 0.0039803520776331425, + "learning_rate": 0.0008531101940016954, + "loss": 0.0233, + "num_input_tokens_seen": 86107088, + "step": 39890 + }, + { + "epoch": 6.50815660685155, + "grad_norm": 0.0041448562406003475, + "learning_rate": 0.0008530597956741374, + "loss": 0.1335, + "num_input_tokens_seen": 86118096, + "step": 39895 + }, + { + "epoch": 6.508972267536705, + "grad_norm": 0.27568310499191284, + "learning_rate": 0.0008530093901913748, + "loss": 0.1179, + "num_input_tokens_seen": 86129136, + "step": 39900 + }, + { + "epoch": 6.50978792822186, + "grad_norm": 0.15066319704055786, + "learning_rate": 0.000852958977554429, + "loss": 0.2067, + "num_input_tokens_seen": 86140816, + "step": 39905 + }, + { + "epoch": 6.510603588907014, + "grad_norm": 0.1193651407957077, + "learning_rate": 0.0008529085577643217, + "loss": 0.0367, + "num_input_tokens_seen": 86151664, + "step": 39910 + }, + { + "epoch": 6.511419249592169, + "grad_norm": 0.031773634254932404, + "learning_rate": 0.0008528581308220748, + "loss": 0.0803, + "num_input_tokens_seen": 86161744, + "step": 39915 + }, + { + "epoch": 6.512234910277325, + "grad_norm": 0.0960845798254013, + "learning_rate": 0.0008528076967287103, + "loss": 0.038, + "num_input_tokens_seen": 86171312, + "step": 39920 + }, + { + "epoch": 6.51305057096248, + "grad_norm": 0.011688338592648506, + "learning_rate": 0.0008527572554852502, + "loss": 0.0446, + "num_input_tokens_seen": 86181392, + "step": 39925 + }, + { + "epoch": 6.513866231647635, + "grad_norm": 0.024687450379133224, + "learning_rate": 0.0008527068070927169, + "loss": 0.0406, + "num_input_tokens_seen": 86191728, + "step": 39930 + }, + { + "epoch": 6.514681892332789, + "grad_norm": 0.022828346118330956, + "learning_rate": 0.0008526563515521327, + "loss": 0.031, + "num_input_tokens_seen": 86201712, + "step": 39935 + }, + { + "epoch": 6.515497553017944, + "grad_norm": 0.021184995770454407, + "learning_rate": 0.0008526058888645202, + "loss": 0.0652, + "num_input_tokens_seen": 86212464, + "step": 39940 + }, + { + "epoch": 6.5163132137031, + "grad_norm": 0.1826930046081543, + "learning_rate": 0.000852555419030902, + "loss": 0.1289, + "num_input_tokens_seen": 86224208, + "step": 39945 + }, + { + "epoch": 6.517128874388255, + "grad_norm": 0.12634719908237457, + "learning_rate": 0.000852504942052301, + "loss": 0.0269, + "num_input_tokens_seen": 86234544, + "step": 39950 + }, + { + "epoch": 6.5179445350734095, + "grad_norm": 0.14329330623149872, + "learning_rate": 0.0008524544579297402, + "loss": 0.1391, + "num_input_tokens_seen": 86245776, + "step": 39955 + }, + { + "epoch": 6.518760195758564, + "grad_norm": 0.006235344335436821, + "learning_rate": 0.0008524039666642424, + "loss": 0.0678, + "num_input_tokens_seen": 86256976, + "step": 39960 + }, + { + "epoch": 6.519575856443719, + "grad_norm": 0.06040893495082855, + "learning_rate": 0.0008523534682568315, + "loss": 0.1262, + "num_input_tokens_seen": 86267568, + "step": 39965 + }, + { + "epoch": 6.520391517128875, + "grad_norm": 0.06819289177656174, + "learning_rate": 0.0008523029627085306, + "loss": 0.0323, + "num_input_tokens_seen": 86278768, + "step": 39970 + }, + { + "epoch": 6.52120717781403, + "grad_norm": 0.13222722709178925, + "learning_rate": 0.000852252450020363, + "loss": 0.034, + "num_input_tokens_seen": 86288080, + "step": 39975 + }, + { + "epoch": 6.5220228384991845, + "grad_norm": 0.012037876062095165, + "learning_rate": 0.0008522019301933528, + "loss": 0.0217, + "num_input_tokens_seen": 86299120, + "step": 39980 + }, + { + "epoch": 6.522838499184339, + "grad_norm": 0.006067375186830759, + "learning_rate": 0.0008521514032285236, + "loss": 0.0931, + "num_input_tokens_seen": 86311056, + "step": 39985 + }, + { + "epoch": 6.523654159869494, + "grad_norm": 0.18859724700450897, + "learning_rate": 0.0008521008691268994, + "loss": 0.1234, + "num_input_tokens_seen": 86321904, + "step": 39990 + }, + { + "epoch": 6.524469820554649, + "grad_norm": 0.0058411224745213985, + "learning_rate": 0.0008520503278895045, + "loss": 0.1715, + "num_input_tokens_seen": 86332880, + "step": 39995 + }, + { + "epoch": 6.525285481239804, + "grad_norm": 0.012634899467229843, + "learning_rate": 0.0008519997795173632, + "loss": 0.0134, + "num_input_tokens_seen": 86344080, + "step": 40000 + }, + { + "epoch": 6.5261011419249595, + "grad_norm": 0.1329302042722702, + "learning_rate": 0.0008519492240114996, + "loss": 0.0446, + "num_input_tokens_seen": 86356240, + "step": 40005 + }, + { + "epoch": 6.526916802610114, + "grad_norm": 0.18105556070804596, + "learning_rate": 0.0008518986613729387, + "loss": 0.0982, + "num_input_tokens_seen": 86367824, + "step": 40010 + }, + { + "epoch": 6.527732463295269, + "grad_norm": 0.0515417642891407, + "learning_rate": 0.0008518480916027049, + "loss": 0.2701, + "num_input_tokens_seen": 86379632, + "step": 40015 + }, + { + "epoch": 6.528548123980424, + "grad_norm": 0.008471941575407982, + "learning_rate": 0.0008517975147018233, + "loss": 0.0889, + "num_input_tokens_seen": 86391248, + "step": 40020 + }, + { + "epoch": 6.529363784665579, + "grad_norm": 0.005254838615655899, + "learning_rate": 0.0008517469306713187, + "loss": 0.0541, + "num_input_tokens_seen": 86401104, + "step": 40025 + }, + { + "epoch": 6.5301794453507345, + "grad_norm": 0.20089179277420044, + "learning_rate": 0.0008516963395122163, + "loss": 0.1796, + "num_input_tokens_seen": 86412528, + "step": 40030 + }, + { + "epoch": 6.530995106035889, + "grad_norm": 0.055668413639068604, + "learning_rate": 0.0008516457412255414, + "loss": 0.0718, + "num_input_tokens_seen": 86422800, + "step": 40035 + }, + { + "epoch": 6.531810766721044, + "grad_norm": 0.04026126116514206, + "learning_rate": 0.0008515951358123195, + "loss": 0.0392, + "num_input_tokens_seen": 86434448, + "step": 40040 + }, + { + "epoch": 6.532626427406199, + "grad_norm": 0.1625850647687912, + "learning_rate": 0.0008515445232735761, + "loss": 0.0543, + "num_input_tokens_seen": 86447312, + "step": 40045 + }, + { + "epoch": 6.533442088091354, + "grad_norm": 0.16222698986530304, + "learning_rate": 0.0008514939036103371, + "loss": 0.1154, + "num_input_tokens_seen": 86458800, + "step": 40050 + }, + { + "epoch": 6.5342577487765094, + "grad_norm": 0.21430686116218567, + "learning_rate": 0.0008514432768236282, + "loss": 0.0554, + "num_input_tokens_seen": 86470928, + "step": 40055 + }, + { + "epoch": 6.535073409461664, + "grad_norm": 0.017081890255212784, + "learning_rate": 0.0008513926429144754, + "loss": 0.1012, + "num_input_tokens_seen": 86481520, + "step": 40060 + }, + { + "epoch": 6.535889070146819, + "grad_norm": 0.04521578177809715, + "learning_rate": 0.0008513420018839049, + "loss": 0.0886, + "num_input_tokens_seen": 86492080, + "step": 40065 + }, + { + "epoch": 6.536704730831974, + "grad_norm": 0.004765080753713846, + "learning_rate": 0.0008512913537329431, + "loss": 0.0315, + "num_input_tokens_seen": 86503280, + "step": 40070 + }, + { + "epoch": 6.537520391517129, + "grad_norm": 0.36580854654312134, + "learning_rate": 0.0008512406984626162, + "loss": 0.2118, + "num_input_tokens_seen": 86513808, + "step": 40075 + }, + { + "epoch": 6.5383360522022835, + "grad_norm": 0.009209499694406986, + "learning_rate": 0.0008511900360739512, + "loss": 0.1276, + "num_input_tokens_seen": 86524944, + "step": 40080 + }, + { + "epoch": 6.539151712887438, + "grad_norm": 0.009207934141159058, + "learning_rate": 0.0008511393665679745, + "loss": 0.2437, + "num_input_tokens_seen": 86535824, + "step": 40085 + }, + { + "epoch": 6.539967373572594, + "grad_norm": 0.17956605553627014, + "learning_rate": 0.000851088689945713, + "loss": 0.0597, + "num_input_tokens_seen": 86546992, + "step": 40090 + }, + { + "epoch": 6.540783034257749, + "grad_norm": 0.014494534581899643, + "learning_rate": 0.0008510380062081939, + "loss": 0.0169, + "num_input_tokens_seen": 86558352, + "step": 40095 + }, + { + "epoch": 6.541598694942904, + "grad_norm": 0.011102148331701756, + "learning_rate": 0.0008509873153564443, + "loss": 0.0388, + "num_input_tokens_seen": 86569424, + "step": 40100 + }, + { + "epoch": 6.5424143556280585, + "grad_norm": 0.011324157007038593, + "learning_rate": 0.0008509366173914914, + "loss": 0.1428, + "num_input_tokens_seen": 86579024, + "step": 40105 + }, + { + "epoch": 6.543230016313213, + "grad_norm": 0.007719113025814295, + "learning_rate": 0.0008508859123143628, + "loss": 0.1263, + "num_input_tokens_seen": 86590832, + "step": 40110 + }, + { + "epoch": 6.544045676998369, + "grad_norm": 0.21098828315734863, + "learning_rate": 0.0008508352001260861, + "loss": 0.0769, + "num_input_tokens_seen": 86602736, + "step": 40115 + }, + { + "epoch": 6.544861337683524, + "grad_norm": 0.15837232768535614, + "learning_rate": 0.000850784480827689, + "loss": 0.0318, + "num_input_tokens_seen": 86613712, + "step": 40120 + }, + { + "epoch": 6.545676998368679, + "grad_norm": 0.010980527848005295, + "learning_rate": 0.0008507337544201994, + "loss": 0.119, + "num_input_tokens_seen": 86624112, + "step": 40125 + }, + { + "epoch": 6.5464926590538335, + "grad_norm": 0.004981511272490025, + "learning_rate": 0.0008506830209046453, + "loss": 0.0081, + "num_input_tokens_seen": 86634000, + "step": 40130 + }, + { + "epoch": 6.547308319738988, + "grad_norm": 0.21405445039272308, + "learning_rate": 0.000850632280282055, + "loss": 0.1733, + "num_input_tokens_seen": 86644816, + "step": 40135 + }, + { + "epoch": 6.548123980424144, + "grad_norm": 0.25402647256851196, + "learning_rate": 0.0008505815325534565, + "loss": 0.1435, + "num_input_tokens_seen": 86656496, + "step": 40140 + }, + { + "epoch": 6.548939641109299, + "grad_norm": 0.020466001704335213, + "learning_rate": 0.0008505307777198788, + "loss": 0.1006, + "num_input_tokens_seen": 86668400, + "step": 40145 + }, + { + "epoch": 6.549755301794454, + "grad_norm": 0.16190390288829803, + "learning_rate": 0.0008504800157823501, + "loss": 0.049, + "num_input_tokens_seen": 86678832, + "step": 40150 + }, + { + "epoch": 6.5505709624796085, + "grad_norm": 0.03894716873764992, + "learning_rate": 0.000850429246741899, + "loss": 0.0614, + "num_input_tokens_seen": 86689040, + "step": 40155 + }, + { + "epoch": 6.551386623164763, + "grad_norm": 0.008205335587263107, + "learning_rate": 0.0008503784705995549, + "loss": 0.0872, + "num_input_tokens_seen": 86699952, + "step": 40160 + }, + { + "epoch": 6.552202283849918, + "grad_norm": 0.2553068995475769, + "learning_rate": 0.0008503276873563465, + "loss": 0.1478, + "num_input_tokens_seen": 86710320, + "step": 40165 + }, + { + "epoch": 6.553017944535073, + "grad_norm": 0.04390598461031914, + "learning_rate": 0.0008502768970133032, + "loss": 0.085, + "num_input_tokens_seen": 86721648, + "step": 40170 + }, + { + "epoch": 6.553833605220229, + "grad_norm": 0.16973742842674255, + "learning_rate": 0.0008502260995714543, + "loss": 0.1225, + "num_input_tokens_seen": 86732560, + "step": 40175 + }, + { + "epoch": 6.554649265905383, + "grad_norm": 0.08911366760730743, + "learning_rate": 0.0008501752950318292, + "loss": 0.0585, + "num_input_tokens_seen": 86743408, + "step": 40180 + }, + { + "epoch": 6.555464926590538, + "grad_norm": 0.12076473981142044, + "learning_rate": 0.0008501244833954573, + "loss": 0.1184, + "num_input_tokens_seen": 86754384, + "step": 40185 + }, + { + "epoch": 6.556280587275693, + "grad_norm": 0.039213042706251144, + "learning_rate": 0.0008500736646633686, + "loss": 0.0156, + "num_input_tokens_seen": 86764592, + "step": 40190 + }, + { + "epoch": 6.557096247960848, + "grad_norm": 0.07977621257305145, + "learning_rate": 0.0008500228388365933, + "loss": 0.0913, + "num_input_tokens_seen": 86776176, + "step": 40195 + }, + { + "epoch": 6.557911908646004, + "grad_norm": 0.13130591809749603, + "learning_rate": 0.0008499720059161608, + "loss": 0.0313, + "num_input_tokens_seen": 86786320, + "step": 40200 + }, + { + "epoch": 6.558727569331158, + "grad_norm": 0.002292054705321789, + "learning_rate": 0.0008499211659031018, + "loss": 0.0252, + "num_input_tokens_seen": 86798288, + "step": 40205 + }, + { + "epoch": 6.559543230016313, + "grad_norm": 0.047816142439842224, + "learning_rate": 0.0008498703187984465, + "loss": 0.07, + "num_input_tokens_seen": 86809456, + "step": 40210 + }, + { + "epoch": 6.560358890701468, + "grad_norm": 0.04253561422228813, + "learning_rate": 0.0008498194646032253, + "loss": 0.1049, + "num_input_tokens_seen": 86819824, + "step": 40215 + }, + { + "epoch": 6.561174551386623, + "grad_norm": 0.4150613844394684, + "learning_rate": 0.0008497686033184687, + "loss": 0.1265, + "num_input_tokens_seen": 86829872, + "step": 40220 + }, + { + "epoch": 6.561990212071779, + "grad_norm": 0.00703182490542531, + "learning_rate": 0.0008497177349452077, + "loss": 0.0852, + "num_input_tokens_seen": 86839472, + "step": 40225 + }, + { + "epoch": 6.562805872756933, + "grad_norm": 0.18496310710906982, + "learning_rate": 0.0008496668594844733, + "loss": 0.0409, + "num_input_tokens_seen": 86851184, + "step": 40230 + }, + { + "epoch": 6.563621533442088, + "grad_norm": 0.005887219682335854, + "learning_rate": 0.0008496159769372964, + "loss": 0.0709, + "num_input_tokens_seen": 86861136, + "step": 40235 + }, + { + "epoch": 6.564437194127243, + "grad_norm": 0.009166869334876537, + "learning_rate": 0.0008495650873047081, + "loss": 0.0426, + "num_input_tokens_seen": 86871344, + "step": 40240 + }, + { + "epoch": 6.565252854812398, + "grad_norm": 0.2893689274787903, + "learning_rate": 0.0008495141905877398, + "loss": 0.0375, + "num_input_tokens_seen": 86881488, + "step": 40245 + }, + { + "epoch": 6.566068515497553, + "grad_norm": 0.02804822288453579, + "learning_rate": 0.0008494632867874232, + "loss": 0.1631, + "num_input_tokens_seen": 86893264, + "step": 40250 + }, + { + "epoch": 6.566884176182708, + "grad_norm": 0.006722630467265844, + "learning_rate": 0.0008494123759047897, + "loss": 0.0687, + "num_input_tokens_seen": 86904208, + "step": 40255 + }, + { + "epoch": 6.567699836867863, + "grad_norm": 0.1756766140460968, + "learning_rate": 0.0008493614579408712, + "loss": 0.0641, + "num_input_tokens_seen": 86916080, + "step": 40260 + }, + { + "epoch": 6.568515497553018, + "grad_norm": 0.004879720509052277, + "learning_rate": 0.0008493105328966995, + "loss": 0.0203, + "num_input_tokens_seen": 86926640, + "step": 40265 + }, + { + "epoch": 6.569331158238173, + "grad_norm": 0.004532058257609606, + "learning_rate": 0.0008492596007733066, + "loss": 0.0963, + "num_input_tokens_seen": 86936688, + "step": 40270 + }, + { + "epoch": 6.570146818923328, + "grad_norm": 0.007803574204444885, + "learning_rate": 0.0008492086615717251, + "loss": 0.0619, + "num_input_tokens_seen": 86948272, + "step": 40275 + }, + { + "epoch": 6.5709624796084825, + "grad_norm": 0.44439488649368286, + "learning_rate": 0.0008491577152929867, + "loss": 0.161, + "num_input_tokens_seen": 86959184, + "step": 40280 + }, + { + "epoch": 6.571778140293638, + "grad_norm": 0.048845577985048294, + "learning_rate": 0.0008491067619381247, + "loss": 0.0278, + "num_input_tokens_seen": 86970032, + "step": 40285 + }, + { + "epoch": 6.572593800978793, + "grad_norm": 0.005780165083706379, + "learning_rate": 0.0008490558015081711, + "loss": 0.0672, + "num_input_tokens_seen": 86980048, + "step": 40290 + }, + { + "epoch": 6.573409461663948, + "grad_norm": 0.3023688495159149, + "learning_rate": 0.0008490048340041587, + "loss": 0.0842, + "num_input_tokens_seen": 86991888, + "step": 40295 + }, + { + "epoch": 6.574225122349103, + "grad_norm": 0.011381558142602444, + "learning_rate": 0.0008489538594271209, + "loss": 0.0171, + "num_input_tokens_seen": 87003056, + "step": 40300 + }, + { + "epoch": 6.575040783034257, + "grad_norm": 0.2420693039894104, + "learning_rate": 0.0008489028777780901, + "loss": 0.0625, + "num_input_tokens_seen": 87014192, + "step": 40305 + }, + { + "epoch": 6.575856443719413, + "grad_norm": 0.023077018558979034, + "learning_rate": 0.0008488518890581002, + "loss": 0.1329, + "num_input_tokens_seen": 87024112, + "step": 40310 + }, + { + "epoch": 6.576672104404568, + "grad_norm": 0.01438105572015047, + "learning_rate": 0.0008488008932681841, + "loss": 0.0503, + "num_input_tokens_seen": 87034896, + "step": 40315 + }, + { + "epoch": 6.577487765089723, + "grad_norm": 0.024481069296598434, + "learning_rate": 0.0008487498904093753, + "loss": 0.0855, + "num_input_tokens_seen": 87045392, + "step": 40320 + }, + { + "epoch": 6.578303425774878, + "grad_norm": 0.15265655517578125, + "learning_rate": 0.0008486988804827077, + "loss": 0.0777, + "num_input_tokens_seen": 87056432, + "step": 40325 + }, + { + "epoch": 6.579119086460032, + "grad_norm": 0.07714859396219254, + "learning_rate": 0.0008486478634892149, + "loss": 0.1682, + "num_input_tokens_seen": 87067440, + "step": 40330 + }, + { + "epoch": 6.579934747145187, + "grad_norm": 0.005474518518894911, + "learning_rate": 0.0008485968394299308, + "loss": 0.0486, + "num_input_tokens_seen": 87078064, + "step": 40335 + }, + { + "epoch": 6.580750407830343, + "grad_norm": 0.004332924727350473, + "learning_rate": 0.0008485458083058896, + "loss": 0.0214, + "num_input_tokens_seen": 87087920, + "step": 40340 + }, + { + "epoch": 6.581566068515498, + "grad_norm": 0.062577024102211, + "learning_rate": 0.0008484947701181254, + "loss": 0.0507, + "num_input_tokens_seen": 87098032, + "step": 40345 + }, + { + "epoch": 6.582381729200653, + "grad_norm": 0.01286846399307251, + "learning_rate": 0.0008484437248676726, + "loss": 0.18, + "num_input_tokens_seen": 87108304, + "step": 40350 + }, + { + "epoch": 6.583197389885807, + "grad_norm": 0.08173384517431259, + "learning_rate": 0.0008483926725555655, + "loss": 0.0305, + "num_input_tokens_seen": 87119088, + "step": 40355 + }, + { + "epoch": 6.584013050570962, + "grad_norm": 0.02262444980442524, + "learning_rate": 0.0008483416131828392, + "loss": 0.1748, + "num_input_tokens_seen": 87130032, + "step": 40360 + }, + { + "epoch": 6.584828711256117, + "grad_norm": 0.025480419397354126, + "learning_rate": 0.000848290546750528, + "loss": 0.2143, + "num_input_tokens_seen": 87141104, + "step": 40365 + }, + { + "epoch": 6.585644371941273, + "grad_norm": 0.11885299533605576, + "learning_rate": 0.0008482394732596672, + "loss": 0.1055, + "num_input_tokens_seen": 87151472, + "step": 40370 + }, + { + "epoch": 6.5864600326264275, + "grad_norm": 0.15320080518722534, + "learning_rate": 0.0008481883927112917, + "loss": 0.0413, + "num_input_tokens_seen": 87161680, + "step": 40375 + }, + { + "epoch": 6.587275693311582, + "grad_norm": 0.02157760038971901, + "learning_rate": 0.0008481373051064365, + "loss": 0.025, + "num_input_tokens_seen": 87172080, + "step": 40380 + }, + { + "epoch": 6.588091353996737, + "grad_norm": 0.11880740523338318, + "learning_rate": 0.0008480862104461374, + "loss": 0.0302, + "num_input_tokens_seen": 87181168, + "step": 40385 + }, + { + "epoch": 6.588907014681892, + "grad_norm": 0.009291236288845539, + "learning_rate": 0.0008480351087314295, + "loss": 0.1375, + "num_input_tokens_seen": 87192144, + "step": 40390 + }, + { + "epoch": 6.589722675367048, + "grad_norm": 0.031172290444374084, + "learning_rate": 0.0008479839999633487, + "loss": 0.1623, + "num_input_tokens_seen": 87203344, + "step": 40395 + }, + { + "epoch": 6.5905383360522025, + "grad_norm": 0.032374609261751175, + "learning_rate": 0.0008479328841429306, + "loss": 0.0241, + "num_input_tokens_seen": 87214416, + "step": 40400 + }, + { + "epoch": 6.591353996737357, + "grad_norm": 0.05712704360485077, + "learning_rate": 0.0008478817612712113, + "loss": 0.0275, + "num_input_tokens_seen": 87225168, + "step": 40405 + }, + { + "epoch": 6.592169657422512, + "grad_norm": 0.02561403252184391, + "learning_rate": 0.0008478306313492267, + "loss": 0.0916, + "num_input_tokens_seen": 87237232, + "step": 40410 + }, + { + "epoch": 6.592985318107667, + "grad_norm": 0.33621060848236084, + "learning_rate": 0.0008477794943780132, + "loss": 0.1849, + "num_input_tokens_seen": 87247696, + "step": 40415 + }, + { + "epoch": 6.593800978792823, + "grad_norm": 0.22155427932739258, + "learning_rate": 0.0008477283503586072, + "loss": 0.1265, + "num_input_tokens_seen": 87258448, + "step": 40420 + }, + { + "epoch": 6.5946166394779775, + "grad_norm": 0.02212555892765522, + "learning_rate": 0.0008476771992920449, + "loss": 0.051, + "num_input_tokens_seen": 87268880, + "step": 40425 + }, + { + "epoch": 6.595432300163132, + "grad_norm": 0.0549081489443779, + "learning_rate": 0.0008476260411793631, + "loss": 0.1266, + "num_input_tokens_seen": 87279568, + "step": 40430 + }, + { + "epoch": 6.596247960848287, + "grad_norm": 0.00974389910697937, + "learning_rate": 0.0008475748760215984, + "loss": 0.0684, + "num_input_tokens_seen": 87289776, + "step": 40435 + }, + { + "epoch": 6.597063621533442, + "grad_norm": 0.006863224320113659, + "learning_rate": 0.0008475237038197882, + "loss": 0.0665, + "num_input_tokens_seen": 87301488, + "step": 40440 + }, + { + "epoch": 6.597879282218597, + "grad_norm": 0.014286517165601254, + "learning_rate": 0.0008474725245749691, + "loss": 0.0311, + "num_input_tokens_seen": 87312112, + "step": 40445 + }, + { + "epoch": 6.598694942903752, + "grad_norm": 0.1479281634092331, + "learning_rate": 0.0008474213382881786, + "loss": 0.1331, + "num_input_tokens_seen": 87322416, + "step": 40450 + }, + { + "epoch": 6.599510603588907, + "grad_norm": 0.011257442645728588, + "learning_rate": 0.0008473701449604539, + "loss": 0.0235, + "num_input_tokens_seen": 87333040, + "step": 40455 + }, + { + "epoch": 6.600326264274062, + "grad_norm": 0.013942491263151169, + "learning_rate": 0.0008473189445928325, + "loss": 0.0656, + "num_input_tokens_seen": 87344112, + "step": 40460 + }, + { + "epoch": 6.601141924959217, + "grad_norm": 0.026433805003762245, + "learning_rate": 0.0008472677371863521, + "loss": 0.0673, + "num_input_tokens_seen": 87353680, + "step": 40465 + }, + { + "epoch": 6.601957585644372, + "grad_norm": 0.016513798385858536, + "learning_rate": 0.0008472165227420505, + "loss": 0.0368, + "num_input_tokens_seen": 87363536, + "step": 40470 + }, + { + "epoch": 6.602773246329527, + "grad_norm": 0.00970372837036848, + "learning_rate": 0.0008471653012609655, + "loss": 0.1053, + "num_input_tokens_seen": 87374992, + "step": 40475 + }, + { + "epoch": 6.603588907014682, + "grad_norm": 0.0030176257714629173, + "learning_rate": 0.0008471140727441353, + "loss": 0.0894, + "num_input_tokens_seen": 87386448, + "step": 40480 + }, + { + "epoch": 6.604404567699837, + "grad_norm": 0.09641921520233154, + "learning_rate": 0.0008470628371925981, + "loss": 0.1413, + "num_input_tokens_seen": 87395760, + "step": 40485 + }, + { + "epoch": 6.605220228384992, + "grad_norm": 0.08832412958145142, + "learning_rate": 0.0008470115946073922, + "loss": 0.0262, + "num_input_tokens_seen": 87407088, + "step": 40490 + }, + { + "epoch": 6.606035889070147, + "grad_norm": 0.12247590720653534, + "learning_rate": 0.0008469603449895562, + "loss": 0.0467, + "num_input_tokens_seen": 87418256, + "step": 40495 + }, + { + "epoch": 6.6068515497553015, + "grad_norm": 0.008906899020075798, + "learning_rate": 0.0008469090883401286, + "loss": 0.0339, + "num_input_tokens_seen": 87427984, + "step": 40500 + }, + { + "epoch": 6.607667210440457, + "grad_norm": 0.009157797321677208, + "learning_rate": 0.0008468578246601482, + "loss": 0.0825, + "num_input_tokens_seen": 87437424, + "step": 40505 + }, + { + "epoch": 6.608482871125612, + "grad_norm": 0.10492284595966339, + "learning_rate": 0.000846806553950654, + "loss": 0.1346, + "num_input_tokens_seen": 87447504, + "step": 40510 + }, + { + "epoch": 6.609298531810767, + "grad_norm": 0.021229412406682968, + "learning_rate": 0.0008467552762126851, + "loss": 0.0233, + "num_input_tokens_seen": 87457456, + "step": 40515 + }, + { + "epoch": 6.610114192495922, + "grad_norm": 0.029088690876960754, + "learning_rate": 0.0008467039914472805, + "loss": 0.0222, + "num_input_tokens_seen": 87468464, + "step": 40520 + }, + { + "epoch": 6.6109298531810765, + "grad_norm": 0.01440515834838152, + "learning_rate": 0.0008466526996554797, + "loss": 0.0139, + "num_input_tokens_seen": 87479504, + "step": 40525 + }, + { + "epoch": 6.611745513866231, + "grad_norm": 0.1770254224538803, + "learning_rate": 0.0008466014008383224, + "loss": 0.0575, + "num_input_tokens_seen": 87489840, + "step": 40530 + }, + { + "epoch": 6.612561174551386, + "grad_norm": 0.2256847620010376, + "learning_rate": 0.0008465500949968479, + "loss": 0.0456, + "num_input_tokens_seen": 87501424, + "step": 40535 + }, + { + "epoch": 6.613376835236542, + "grad_norm": 0.002514507621526718, + "learning_rate": 0.000846498782132096, + "loss": 0.0086, + "num_input_tokens_seen": 87512688, + "step": 40540 + }, + { + "epoch": 6.614192495921697, + "grad_norm": 0.004171543288975954, + "learning_rate": 0.0008464474622451067, + "loss": 0.0185, + "num_input_tokens_seen": 87523056, + "step": 40545 + }, + { + "epoch": 6.6150081566068515, + "grad_norm": 0.07648947834968567, + "learning_rate": 0.0008463961353369202, + "loss": 0.0636, + "num_input_tokens_seen": 87532784, + "step": 40550 + }, + { + "epoch": 6.615823817292006, + "grad_norm": 0.0419023297727108, + "learning_rate": 0.0008463448014085765, + "loss": 0.0302, + "num_input_tokens_seen": 87542224, + "step": 40555 + }, + { + "epoch": 6.616639477977161, + "grad_norm": 0.2082689255475998, + "learning_rate": 0.000846293460461116, + "loss": 0.1091, + "num_input_tokens_seen": 87554224, + "step": 40560 + }, + { + "epoch": 6.617455138662317, + "grad_norm": 0.25201615691185, + "learning_rate": 0.0008462421124955792, + "loss": 0.0355, + "num_input_tokens_seen": 87564944, + "step": 40565 + }, + { + "epoch": 6.618270799347472, + "grad_norm": 0.024194026365876198, + "learning_rate": 0.0008461907575130069, + "loss": 0.0535, + "num_input_tokens_seen": 87575696, + "step": 40570 + }, + { + "epoch": 6.6190864600326265, + "grad_norm": 0.010772458277642727, + "learning_rate": 0.0008461393955144397, + "loss": 0.1124, + "num_input_tokens_seen": 87587216, + "step": 40575 + }, + { + "epoch": 6.619902120717781, + "grad_norm": 0.01024528220295906, + "learning_rate": 0.0008460880265009185, + "loss": 0.0714, + "num_input_tokens_seen": 87597872, + "step": 40580 + }, + { + "epoch": 6.620717781402936, + "grad_norm": 0.03568638488650322, + "learning_rate": 0.0008460366504734843, + "loss": 0.0637, + "num_input_tokens_seen": 87608816, + "step": 40585 + }, + { + "epoch": 6.621533442088092, + "grad_norm": 0.04990572854876518, + "learning_rate": 0.0008459852674331785, + "loss": 0.1056, + "num_input_tokens_seen": 87619600, + "step": 40590 + }, + { + "epoch": 6.622349102773247, + "grad_norm": 0.17502950131893158, + "learning_rate": 0.0008459338773810424, + "loss": 0.2325, + "num_input_tokens_seen": 87629168, + "step": 40595 + }, + { + "epoch": 6.623164763458401, + "grad_norm": 0.002538851462304592, + "learning_rate": 0.0008458824803181174, + "loss": 0.0329, + "num_input_tokens_seen": 87640144, + "step": 40600 + }, + { + "epoch": 6.623980424143556, + "grad_norm": 0.025381911545991898, + "learning_rate": 0.0008458310762454451, + "loss": 0.0242, + "num_input_tokens_seen": 87650128, + "step": 40605 + }, + { + "epoch": 6.624796084828711, + "grad_norm": 0.0137851033359766, + "learning_rate": 0.0008457796651640672, + "loss": 0.1057, + "num_input_tokens_seen": 87661264, + "step": 40610 + }, + { + "epoch": 6.625611745513866, + "grad_norm": 0.0044571650214493275, + "learning_rate": 0.0008457282470750259, + "loss": 0.0543, + "num_input_tokens_seen": 87672464, + "step": 40615 + }, + { + "epoch": 6.626427406199021, + "grad_norm": 0.024776339530944824, + "learning_rate": 0.0008456768219793631, + "loss": 0.09, + "num_input_tokens_seen": 87684400, + "step": 40620 + }, + { + "epoch": 6.627243066884176, + "grad_norm": 0.0896618515253067, + "learning_rate": 0.000845625389878121, + "loss": 0.1633, + "num_input_tokens_seen": 87695728, + "step": 40625 + }, + { + "epoch": 6.628058727569331, + "grad_norm": 0.004947674926370382, + "learning_rate": 0.0008455739507723418, + "loss": 0.0172, + "num_input_tokens_seen": 87705456, + "step": 40630 + }, + { + "epoch": 6.628874388254486, + "grad_norm": 0.029359731823205948, + "learning_rate": 0.0008455225046630681, + "loss": 0.0122, + "num_input_tokens_seen": 87716816, + "step": 40635 + }, + { + "epoch": 6.629690048939641, + "grad_norm": 0.020221753045916557, + "learning_rate": 0.0008454710515513426, + "loss": 0.0693, + "num_input_tokens_seen": 87727216, + "step": 40640 + }, + { + "epoch": 6.630505709624796, + "grad_norm": 0.006177797913551331, + "learning_rate": 0.0008454195914382079, + "loss": 0.0521, + "num_input_tokens_seen": 87737040, + "step": 40645 + }, + { + "epoch": 6.631321370309951, + "grad_norm": 0.17381806671619415, + "learning_rate": 0.0008453681243247071, + "loss": 0.0986, + "num_input_tokens_seen": 87746928, + "step": 40650 + }, + { + "epoch": 6.632137030995106, + "grad_norm": 0.01752290315926075, + "learning_rate": 0.000845316650211883, + "loss": 0.0998, + "num_input_tokens_seen": 87757424, + "step": 40655 + }, + { + "epoch": 6.632952691680261, + "grad_norm": 0.028803708031773567, + "learning_rate": 0.0008452651691007789, + "loss": 0.0637, + "num_input_tokens_seen": 87768816, + "step": 40660 + }, + { + "epoch": 6.633768352365416, + "grad_norm": 0.016665518283843994, + "learning_rate": 0.0008452136809924384, + "loss": 0.1119, + "num_input_tokens_seen": 87780912, + "step": 40665 + }, + { + "epoch": 6.634584013050571, + "grad_norm": 0.1120605319738388, + "learning_rate": 0.0008451621858879043, + "loss": 0.1197, + "num_input_tokens_seen": 87791920, + "step": 40670 + }, + { + "epoch": 6.635399673735726, + "grad_norm": 0.1365475058555603, + "learning_rate": 0.000845110683788221, + "loss": 0.0841, + "num_input_tokens_seen": 87802352, + "step": 40675 + }, + { + "epoch": 6.636215334420881, + "grad_norm": 0.009444211609661579, + "learning_rate": 0.0008450591746944319, + "loss": 0.0383, + "num_input_tokens_seen": 87813488, + "step": 40680 + }, + { + "epoch": 6.637030995106036, + "grad_norm": 0.18760187923908234, + "learning_rate": 0.0008450076586075805, + "loss": 0.1575, + "num_input_tokens_seen": 87824016, + "step": 40685 + }, + { + "epoch": 6.637846655791191, + "grad_norm": 0.00839977152645588, + "learning_rate": 0.0008449561355287116, + "loss": 0.1945, + "num_input_tokens_seen": 87835120, + "step": 40690 + }, + { + "epoch": 6.638662316476346, + "grad_norm": 0.007052344270050526, + "learning_rate": 0.000844904605458869, + "loss": 0.0457, + "num_input_tokens_seen": 87846480, + "step": 40695 + }, + { + "epoch": 6.6394779771615005, + "grad_norm": 0.07927761226892471, + "learning_rate": 0.0008448530683990968, + "loss": 0.1282, + "num_input_tokens_seen": 87856304, + "step": 40700 + }, + { + "epoch": 6.640293637846656, + "grad_norm": 0.2790197432041168, + "learning_rate": 0.0008448015243504398, + "loss": 0.1103, + "num_input_tokens_seen": 87868080, + "step": 40705 + }, + { + "epoch": 6.641109298531811, + "grad_norm": 0.016041293740272522, + "learning_rate": 0.0008447499733139426, + "loss": 0.0314, + "num_input_tokens_seen": 87878416, + "step": 40710 + }, + { + "epoch": 6.641924959216966, + "grad_norm": 0.02414015308022499, + "learning_rate": 0.0008446984152906496, + "loss": 0.2108, + "num_input_tokens_seen": 87888720, + "step": 40715 + }, + { + "epoch": 6.642740619902121, + "grad_norm": 0.08404606580734253, + "learning_rate": 0.0008446468502816061, + "loss": 0.0945, + "num_input_tokens_seen": 87900528, + "step": 40720 + }, + { + "epoch": 6.643556280587275, + "grad_norm": 0.21531686186790466, + "learning_rate": 0.000844595278287857, + "loss": 0.1879, + "num_input_tokens_seen": 87911984, + "step": 40725 + }, + { + "epoch": 6.64437194127243, + "grad_norm": 0.20615947246551514, + "learning_rate": 0.0008445436993104473, + "loss": 0.1859, + "num_input_tokens_seen": 87922608, + "step": 40730 + }, + { + "epoch": 6.645187601957586, + "grad_norm": 0.01594899781048298, + "learning_rate": 0.0008444921133504225, + "loss": 0.0763, + "num_input_tokens_seen": 87933936, + "step": 40735 + }, + { + "epoch": 6.646003262642741, + "grad_norm": 0.025143783539533615, + "learning_rate": 0.0008444405204088281, + "loss": 0.0783, + "num_input_tokens_seen": 87945488, + "step": 40740 + }, + { + "epoch": 6.646818923327896, + "grad_norm": 0.01654892787337303, + "learning_rate": 0.0008443889204867095, + "loss": 0.1103, + "num_input_tokens_seen": 87956112, + "step": 40745 + }, + { + "epoch": 6.64763458401305, + "grad_norm": 0.17606611549854279, + "learning_rate": 0.0008443373135851125, + "loss": 0.1897, + "num_input_tokens_seen": 87965584, + "step": 40750 + }, + { + "epoch": 6.648450244698205, + "grad_norm": 0.1590752750635147, + "learning_rate": 0.0008442856997050832, + "loss": 0.029, + "num_input_tokens_seen": 87975056, + "step": 40755 + }, + { + "epoch": 6.649265905383361, + "grad_norm": 0.006155477371066809, + "learning_rate": 0.0008442340788476672, + "loss": 0.0591, + "num_input_tokens_seen": 87985712, + "step": 40760 + }, + { + "epoch": 6.650081566068516, + "grad_norm": 0.053119126707315445, + "learning_rate": 0.0008441824510139111, + "loss": 0.0224, + "num_input_tokens_seen": 87996336, + "step": 40765 + }, + { + "epoch": 6.650897226753671, + "grad_norm": 0.06203689053654671, + "learning_rate": 0.0008441308162048609, + "loss": 0.051, + "num_input_tokens_seen": 88007184, + "step": 40770 + }, + { + "epoch": 6.651712887438825, + "grad_norm": 0.017574982717633247, + "learning_rate": 0.0008440791744215632, + "loss": 0.1688, + "num_input_tokens_seen": 88017008, + "step": 40775 + }, + { + "epoch": 6.65252854812398, + "grad_norm": 0.17922617495059967, + "learning_rate": 0.0008440275256650644, + "loss": 0.0614, + "num_input_tokens_seen": 88027760, + "step": 40780 + }, + { + "epoch": 6.653344208809135, + "grad_norm": 0.2157328873872757, + "learning_rate": 0.0008439758699364115, + "loss": 0.1217, + "num_input_tokens_seen": 88039696, + "step": 40785 + }, + { + "epoch": 6.654159869494291, + "grad_norm": 0.013915738090872765, + "learning_rate": 0.0008439242072366511, + "loss": 0.0701, + "num_input_tokens_seen": 88050544, + "step": 40790 + }, + { + "epoch": 6.6549755301794455, + "grad_norm": 0.0037155335303395987, + "learning_rate": 0.0008438725375668305, + "loss": 0.0174, + "num_input_tokens_seen": 88061840, + "step": 40795 + }, + { + "epoch": 6.6557911908646, + "grad_norm": 0.04841242730617523, + "learning_rate": 0.0008438208609279967, + "loss": 0.0664, + "num_input_tokens_seen": 88072816, + "step": 40800 + }, + { + "epoch": 6.656606851549755, + "grad_norm": 0.15743957459926605, + "learning_rate": 0.0008437691773211969, + "loss": 0.0552, + "num_input_tokens_seen": 88083536, + "step": 40805 + }, + { + "epoch": 6.65742251223491, + "grad_norm": 0.19538897275924683, + "learning_rate": 0.0008437174867474786, + "loss": 0.1666, + "num_input_tokens_seen": 88094704, + "step": 40810 + }, + { + "epoch": 6.658238172920065, + "grad_norm": 0.22462762892246246, + "learning_rate": 0.0008436657892078895, + "loss": 0.0695, + "num_input_tokens_seen": 88106288, + "step": 40815 + }, + { + "epoch": 6.6590538336052205, + "grad_norm": 0.0018882813164964318, + "learning_rate": 0.0008436140847034772, + "loss": 0.0151, + "num_input_tokens_seen": 88114960, + "step": 40820 + }, + { + "epoch": 6.659869494290375, + "grad_norm": 0.009555350057780743, + "learning_rate": 0.0008435623732352895, + "loss": 0.1576, + "num_input_tokens_seen": 88124944, + "step": 40825 + }, + { + "epoch": 6.66068515497553, + "grad_norm": 0.010500220581889153, + "learning_rate": 0.0008435106548043745, + "loss": 0.0127, + "num_input_tokens_seen": 88135056, + "step": 40830 + }, + { + "epoch": 6.661500815660685, + "grad_norm": 0.010248217731714249, + "learning_rate": 0.0008434589294117802, + "loss": 0.0272, + "num_input_tokens_seen": 88146768, + "step": 40835 + }, + { + "epoch": 6.66231647634584, + "grad_norm": 0.0256291925907135, + "learning_rate": 0.0008434071970585551, + "loss": 0.125, + "num_input_tokens_seen": 88157968, + "step": 40840 + }, + { + "epoch": 6.6631321370309955, + "grad_norm": 0.00675778416916728, + "learning_rate": 0.0008433554577457475, + "loss": 0.121, + "num_input_tokens_seen": 88168912, + "step": 40845 + }, + { + "epoch": 6.66394779771615, + "grad_norm": 0.12290561944246292, + "learning_rate": 0.000843303711474406, + "loss": 0.068, + "num_input_tokens_seen": 88179632, + "step": 40850 + }, + { + "epoch": 6.664763458401305, + "grad_norm": 0.06480270624160767, + "learning_rate": 0.0008432519582455792, + "loss": 0.0255, + "num_input_tokens_seen": 88191472, + "step": 40855 + }, + { + "epoch": 6.66557911908646, + "grad_norm": 0.004105613101273775, + "learning_rate": 0.0008432001980603161, + "loss": 0.0153, + "num_input_tokens_seen": 88202800, + "step": 40860 + }, + { + "epoch": 6.666394779771615, + "grad_norm": 0.18599529564380646, + "learning_rate": 0.0008431484309196656, + "loss": 0.1504, + "num_input_tokens_seen": 88212848, + "step": 40865 + }, + { + "epoch": 6.6672104404567705, + "grad_norm": 0.022124314680695534, + "learning_rate": 0.0008430966568246768, + "loss": 0.1512, + "num_input_tokens_seen": 88224112, + "step": 40870 + }, + { + "epoch": 6.668026101141925, + "grad_norm": 0.07740606367588043, + "learning_rate": 0.0008430448757763989, + "loss": 0.0764, + "num_input_tokens_seen": 88234704, + "step": 40875 + }, + { + "epoch": 6.66884176182708, + "grad_norm": 0.016131598502397537, + "learning_rate": 0.0008429930877758814, + "loss": 0.0456, + "num_input_tokens_seen": 88247216, + "step": 40880 + }, + { + "epoch": 6.669657422512235, + "grad_norm": 0.0850653201341629, + "learning_rate": 0.000842941292824174, + "loss": 0.1425, + "num_input_tokens_seen": 88259664, + "step": 40885 + }, + { + "epoch": 6.67047308319739, + "grad_norm": 0.04681131988763809, + "learning_rate": 0.0008428894909223261, + "loss": 0.0518, + "num_input_tokens_seen": 88270128, + "step": 40890 + }, + { + "epoch": 6.671288743882545, + "grad_norm": 0.21991361677646637, + "learning_rate": 0.0008428376820713879, + "loss": 0.0413, + "num_input_tokens_seen": 88280176, + "step": 40895 + }, + { + "epoch": 6.672104404567699, + "grad_norm": 0.2084745317697525, + "learning_rate": 0.000842785866272409, + "loss": 0.112, + "num_input_tokens_seen": 88290288, + "step": 40900 + }, + { + "epoch": 6.672920065252855, + "grad_norm": 0.008162369020283222, + "learning_rate": 0.0008427340435264397, + "loss": 0.0652, + "num_input_tokens_seen": 88301296, + "step": 40905 + }, + { + "epoch": 6.67373572593801, + "grad_norm": 0.18261808156967163, + "learning_rate": 0.0008426822138345302, + "loss": 0.0808, + "num_input_tokens_seen": 88312752, + "step": 40910 + }, + { + "epoch": 6.674551386623165, + "grad_norm": 0.014610488899052143, + "learning_rate": 0.0008426303771977311, + "loss": 0.0144, + "num_input_tokens_seen": 88324144, + "step": 40915 + }, + { + "epoch": 6.6753670473083195, + "grad_norm": 0.03971157595515251, + "learning_rate": 0.0008425785336170925, + "loss": 0.0467, + "num_input_tokens_seen": 88335088, + "step": 40920 + }, + { + "epoch": 6.676182707993474, + "grad_norm": 0.013696584850549698, + "learning_rate": 0.0008425266830936654, + "loss": 0.0383, + "num_input_tokens_seen": 88345360, + "step": 40925 + }, + { + "epoch": 6.67699836867863, + "grad_norm": 0.24036268889904022, + "learning_rate": 0.0008424748256285005, + "loss": 0.177, + "num_input_tokens_seen": 88355888, + "step": 40930 + }, + { + "epoch": 6.677814029363785, + "grad_norm": 0.004243504721671343, + "learning_rate": 0.0008424229612226488, + "loss": 0.0738, + "num_input_tokens_seen": 88366832, + "step": 40935 + }, + { + "epoch": 6.67862969004894, + "grad_norm": 0.17402209341526031, + "learning_rate": 0.0008423710898771614, + "loss": 0.1377, + "num_input_tokens_seen": 88378320, + "step": 40940 + }, + { + "epoch": 6.6794453507340945, + "grad_norm": 0.18166255950927734, + "learning_rate": 0.0008423192115930897, + "loss": 0.1051, + "num_input_tokens_seen": 88388656, + "step": 40945 + }, + { + "epoch": 6.680261011419249, + "grad_norm": 0.18320345878601074, + "learning_rate": 0.0008422673263714848, + "loss": 0.2413, + "num_input_tokens_seen": 88399632, + "step": 40950 + }, + { + "epoch": 6.681076672104405, + "grad_norm": 0.026363050565123558, + "learning_rate": 0.0008422154342133983, + "loss": 0.0127, + "num_input_tokens_seen": 88410032, + "step": 40955 + }, + { + "epoch": 6.68189233278956, + "grad_norm": 0.0672154575586319, + "learning_rate": 0.0008421635351198819, + "loss": 0.0465, + "num_input_tokens_seen": 88421008, + "step": 40960 + }, + { + "epoch": 6.682707993474715, + "grad_norm": 0.09843332320451736, + "learning_rate": 0.0008421116290919875, + "loss": 0.0471, + "num_input_tokens_seen": 88432144, + "step": 40965 + }, + { + "epoch": 6.6835236541598695, + "grad_norm": 0.03872723504900932, + "learning_rate": 0.0008420597161307668, + "loss": 0.1802, + "num_input_tokens_seen": 88442576, + "step": 40970 + }, + { + "epoch": 6.684339314845024, + "grad_norm": 0.09604460000991821, + "learning_rate": 0.0008420077962372721, + "loss": 0.1244, + "num_input_tokens_seen": 88452048, + "step": 40975 + }, + { + "epoch": 6.685154975530179, + "grad_norm": 0.09437929093837738, + "learning_rate": 0.0008419558694125555, + "loss": 0.2253, + "num_input_tokens_seen": 88463280, + "step": 40980 + }, + { + "epoch": 6.685970636215334, + "grad_norm": 0.07316838949918747, + "learning_rate": 0.0008419039356576695, + "loss": 0.0711, + "num_input_tokens_seen": 88473488, + "step": 40985 + }, + { + "epoch": 6.68678629690049, + "grad_norm": 0.011294277384877205, + "learning_rate": 0.0008418519949736664, + "loss": 0.0198, + "num_input_tokens_seen": 88484304, + "step": 40990 + }, + { + "epoch": 6.6876019575856445, + "grad_norm": 0.30193161964416504, + "learning_rate": 0.000841800047361599, + "loss": 0.1684, + "num_input_tokens_seen": 88495088, + "step": 40995 + }, + { + "epoch": 6.688417618270799, + "grad_norm": 0.003618774935603142, + "learning_rate": 0.00084174809282252, + "loss": 0.2993, + "num_input_tokens_seen": 88506512, + "step": 41000 + }, + { + "epoch": 6.689233278955954, + "grad_norm": 0.12098127603530884, + "learning_rate": 0.0008416961313574824, + "loss": 0.0563, + "num_input_tokens_seen": 88518192, + "step": 41005 + }, + { + "epoch": 6.690048939641109, + "grad_norm": 0.039779067039489746, + "learning_rate": 0.0008416441629675391, + "loss": 0.0304, + "num_input_tokens_seen": 88526928, + "step": 41010 + }, + { + "epoch": 6.690864600326265, + "grad_norm": 0.021030575037002563, + "learning_rate": 0.0008415921876537436, + "loss": 0.0595, + "num_input_tokens_seen": 88537616, + "step": 41015 + }, + { + "epoch": 6.691680261011419, + "grad_norm": 0.013087180443108082, + "learning_rate": 0.000841540205417149, + "loss": 0.1346, + "num_input_tokens_seen": 88547408, + "step": 41020 + }, + { + "epoch": 6.692495921696574, + "grad_norm": 0.0035095035564154387, + "learning_rate": 0.0008414882162588089, + "loss": 0.0417, + "num_input_tokens_seen": 88557232, + "step": 41025 + }, + { + "epoch": 6.693311582381729, + "grad_norm": 0.04334179684519768, + "learning_rate": 0.0008414362201797768, + "loss": 0.0826, + "num_input_tokens_seen": 88567632, + "step": 41030 + }, + { + "epoch": 6.694127243066884, + "grad_norm": 0.041898977011442184, + "learning_rate": 0.0008413842171811066, + "loss": 0.0515, + "num_input_tokens_seen": 88579312, + "step": 41035 + }, + { + "epoch": 6.69494290375204, + "grad_norm": 0.13928961753845215, + "learning_rate": 0.0008413322072638523, + "loss": 0.0632, + "num_input_tokens_seen": 88590608, + "step": 41040 + }, + { + "epoch": 6.695758564437194, + "grad_norm": 0.061256371438503265, + "learning_rate": 0.0008412801904290677, + "loss": 0.0427, + "num_input_tokens_seen": 88601904, + "step": 41045 + }, + { + "epoch": 6.696574225122349, + "grad_norm": 0.002626802772283554, + "learning_rate": 0.000841228166677807, + "loss": 0.012, + "num_input_tokens_seen": 88611088, + "step": 41050 + }, + { + "epoch": 6.697389885807504, + "grad_norm": 0.1937313675880432, + "learning_rate": 0.0008411761360111248, + "loss": 0.1072, + "num_input_tokens_seen": 88623120, + "step": 41055 + }, + { + "epoch": 6.698205546492659, + "grad_norm": 0.062182579189538956, + "learning_rate": 0.0008411240984300752, + "loss": 0.1743, + "num_input_tokens_seen": 88634064, + "step": 41060 + }, + { + "epoch": 6.699021207177814, + "grad_norm": 0.09133990854024887, + "learning_rate": 0.0008410720539357132, + "loss": 0.049, + "num_input_tokens_seen": 88645392, + "step": 41065 + }, + { + "epoch": 6.699836867862969, + "grad_norm": 0.007124387193471193, + "learning_rate": 0.0008410200025290933, + "loss": 0.0098, + "num_input_tokens_seen": 88657808, + "step": 41070 + }, + { + "epoch": 6.700652528548124, + "grad_norm": 0.07760413736104965, + "learning_rate": 0.0008409679442112703, + "loss": 0.0425, + "num_input_tokens_seen": 88668048, + "step": 41075 + }, + { + "epoch": 6.701468189233279, + "grad_norm": 0.11132733523845673, + "learning_rate": 0.0008409158789832994, + "loss": 0.09, + "num_input_tokens_seen": 88677456, + "step": 41080 + }, + { + "epoch": 6.702283849918434, + "grad_norm": 0.2809132933616638, + "learning_rate": 0.0008408638068462357, + "loss": 0.2881, + "num_input_tokens_seen": 88688592, + "step": 41085 + }, + { + "epoch": 6.703099510603589, + "grad_norm": 0.010132144205272198, + "learning_rate": 0.0008408117278011347, + "loss": 0.0509, + "num_input_tokens_seen": 88699152, + "step": 41090 + }, + { + "epoch": 6.7039151712887435, + "grad_norm": 0.01508582103997469, + "learning_rate": 0.0008407596418490515, + "loss": 0.028, + "num_input_tokens_seen": 88709776, + "step": 41095 + }, + { + "epoch": 6.704730831973899, + "grad_norm": 0.12753300368785858, + "learning_rate": 0.0008407075489910421, + "loss": 0.1339, + "num_input_tokens_seen": 88721328, + "step": 41100 + }, + { + "epoch": 6.705546492659054, + "grad_norm": 0.17767579853534698, + "learning_rate": 0.0008406554492281618, + "loss": 0.1465, + "num_input_tokens_seen": 88732432, + "step": 41105 + }, + { + "epoch": 6.706362153344209, + "grad_norm": 0.01995852403342724, + "learning_rate": 0.0008406033425614667, + "loss": 0.0434, + "num_input_tokens_seen": 88743312, + "step": 41110 + }, + { + "epoch": 6.707177814029364, + "grad_norm": 0.10104304552078247, + "learning_rate": 0.0008405512289920129, + "loss": 0.0828, + "num_input_tokens_seen": 88754096, + "step": 41115 + }, + { + "epoch": 6.7079934747145185, + "grad_norm": 0.03875511512160301, + "learning_rate": 0.0008404991085208562, + "loss": 0.058, + "num_input_tokens_seen": 88765200, + "step": 41120 + }, + { + "epoch": 6.708809135399674, + "grad_norm": 0.006703234743326902, + "learning_rate": 0.0008404469811490534, + "loss": 0.0789, + "num_input_tokens_seen": 88777136, + "step": 41125 + }, + { + "epoch": 6.709624796084829, + "grad_norm": 0.06017681583762169, + "learning_rate": 0.0008403948468776604, + "loss": 0.0987, + "num_input_tokens_seen": 88789040, + "step": 41130 + }, + { + "epoch": 6.710440456769984, + "grad_norm": 0.021386008709669113, + "learning_rate": 0.0008403427057077342, + "loss": 0.2412, + "num_input_tokens_seen": 88799792, + "step": 41135 + }, + { + "epoch": 6.711256117455139, + "grad_norm": 0.16998514533042908, + "learning_rate": 0.0008402905576403312, + "loss": 0.0857, + "num_input_tokens_seen": 88810224, + "step": 41140 + }, + { + "epoch": 6.712071778140293, + "grad_norm": 0.028178734704852104, + "learning_rate": 0.0008402384026765084, + "loss": 0.1628, + "num_input_tokens_seen": 88820112, + "step": 41145 + }, + { + "epoch": 6.712887438825448, + "grad_norm": 0.11772794276475906, + "learning_rate": 0.0008401862408173226, + "loss": 0.0721, + "num_input_tokens_seen": 88830864, + "step": 41150 + }, + { + "epoch": 6.713703099510604, + "grad_norm": 0.15482349693775177, + "learning_rate": 0.0008401340720638313, + "loss": 0.0592, + "num_input_tokens_seen": 88842096, + "step": 41155 + }, + { + "epoch": 6.714518760195759, + "grad_norm": 0.23721086978912354, + "learning_rate": 0.0008400818964170913, + "loss": 0.0798, + "num_input_tokens_seen": 88853232, + "step": 41160 + }, + { + "epoch": 6.715334420880914, + "grad_norm": 0.012258430011570454, + "learning_rate": 0.0008400297138781605, + "loss": 0.087, + "num_input_tokens_seen": 88863408, + "step": 41165 + }, + { + "epoch": 6.716150081566068, + "grad_norm": 0.16345863044261932, + "learning_rate": 0.0008399775244480961, + "loss": 0.1403, + "num_input_tokens_seen": 88873648, + "step": 41170 + }, + { + "epoch": 6.716965742251223, + "grad_norm": 0.004188027232885361, + "learning_rate": 0.0008399253281279557, + "loss": 0.0467, + "num_input_tokens_seen": 88884720, + "step": 41175 + }, + { + "epoch": 6.717781402936378, + "grad_norm": 0.07818343490362167, + "learning_rate": 0.0008398731249187975, + "loss": 0.055, + "num_input_tokens_seen": 88895632, + "step": 41180 + }, + { + "epoch": 6.718597063621534, + "grad_norm": 0.03466807305812836, + "learning_rate": 0.0008398209148216793, + "loss": 0.1596, + "num_input_tokens_seen": 88905456, + "step": 41185 + }, + { + "epoch": 6.719412724306689, + "grad_norm": 0.02091260440647602, + "learning_rate": 0.000839768697837659, + "loss": 0.1226, + "num_input_tokens_seen": 88916848, + "step": 41190 + }, + { + "epoch": 6.720228384991843, + "grad_norm": 0.07259862869977951, + "learning_rate": 0.0008397164739677951, + "loss": 0.0467, + "num_input_tokens_seen": 88928240, + "step": 41195 + }, + { + "epoch": 6.721044045676998, + "grad_norm": 0.013527154922485352, + "learning_rate": 0.0008396642432131459, + "loss": 0.0117, + "num_input_tokens_seen": 88940016, + "step": 41200 + }, + { + "epoch": 6.721859706362153, + "grad_norm": 0.26618337631225586, + "learning_rate": 0.0008396120055747698, + "loss": 0.1706, + "num_input_tokens_seen": 88950544, + "step": 41205 + }, + { + "epoch": 6.722675367047309, + "grad_norm": 0.1150444969534874, + "learning_rate": 0.0008395597610537257, + "loss": 0.1785, + "num_input_tokens_seen": 88961520, + "step": 41210 + }, + { + "epoch": 6.7234910277324635, + "grad_norm": 0.06275145709514618, + "learning_rate": 0.0008395075096510723, + "loss": 0.0691, + "num_input_tokens_seen": 88971952, + "step": 41215 + }, + { + "epoch": 6.724306688417618, + "grad_norm": 0.05700957402586937, + "learning_rate": 0.0008394552513678684, + "loss": 0.0859, + "num_input_tokens_seen": 88981840, + "step": 41220 + }, + { + "epoch": 6.725122349102773, + "grad_norm": 0.09610020369291306, + "learning_rate": 0.0008394029862051733, + "loss": 0.0871, + "num_input_tokens_seen": 88992656, + "step": 41225 + }, + { + "epoch": 6.725938009787928, + "grad_norm": 0.017957471311092377, + "learning_rate": 0.0008393507141640461, + "loss": 0.0811, + "num_input_tokens_seen": 89004432, + "step": 41230 + }, + { + "epoch": 6.726753670473083, + "grad_norm": 0.014881430193781853, + "learning_rate": 0.0008392984352455461, + "loss": 0.0393, + "num_input_tokens_seen": 89014736, + "step": 41235 + }, + { + "epoch": 6.7275693311582385, + "grad_norm": 0.00634511886164546, + "learning_rate": 0.0008392461494507331, + "loss": 0.1016, + "num_input_tokens_seen": 89024976, + "step": 41240 + }, + { + "epoch": 6.728384991843393, + "grad_norm": 0.02639460191130638, + "learning_rate": 0.0008391938567806663, + "loss": 0.0128, + "num_input_tokens_seen": 89035440, + "step": 41245 + }, + { + "epoch": 6.729200652528548, + "grad_norm": 0.22993804514408112, + "learning_rate": 0.0008391415572364058, + "loss": 0.0938, + "num_input_tokens_seen": 89046608, + "step": 41250 + }, + { + "epoch": 6.730016313213703, + "grad_norm": 0.057400986552238464, + "learning_rate": 0.0008390892508190113, + "loss": 0.089, + "num_input_tokens_seen": 89056912, + "step": 41255 + }, + { + "epoch": 6.730831973898858, + "grad_norm": 0.028799260035157204, + "learning_rate": 0.000839036937529543, + "loss": 0.0585, + "num_input_tokens_seen": 89068048, + "step": 41260 + }, + { + "epoch": 6.731647634584013, + "grad_norm": 0.007639918010681868, + "learning_rate": 0.0008389846173690611, + "loss": 0.1331, + "num_input_tokens_seen": 89078544, + "step": 41265 + }, + { + "epoch": 6.732463295269168, + "grad_norm": 0.16584822535514832, + "learning_rate": 0.0008389322903386261, + "loss": 0.1052, + "num_input_tokens_seen": 89090768, + "step": 41270 + }, + { + "epoch": 6.733278955954323, + "grad_norm": 0.302126944065094, + "learning_rate": 0.0008388799564392979, + "loss": 0.1384, + "num_input_tokens_seen": 89101040, + "step": 41275 + }, + { + "epoch": 6.734094616639478, + "grad_norm": 0.06703297793865204, + "learning_rate": 0.0008388276156721377, + "loss": 0.0452, + "num_input_tokens_seen": 89112112, + "step": 41280 + }, + { + "epoch": 6.734910277324633, + "grad_norm": 0.02489074319601059, + "learning_rate": 0.0008387752680382062, + "loss": 0.1492, + "num_input_tokens_seen": 89123056, + "step": 41285 + }, + { + "epoch": 6.735725938009788, + "grad_norm": 0.1354849636554718, + "learning_rate": 0.0008387229135385638, + "loss": 0.1099, + "num_input_tokens_seen": 89135120, + "step": 41290 + }, + { + "epoch": 6.736541598694943, + "grad_norm": 0.007585674058645964, + "learning_rate": 0.0008386705521742719, + "loss": 0.0899, + "num_input_tokens_seen": 89147184, + "step": 41295 + }, + { + "epoch": 6.737357259380098, + "grad_norm": 0.06748169660568237, + "learning_rate": 0.0008386181839463918, + "loss": 0.0596, + "num_input_tokens_seen": 89156912, + "step": 41300 + }, + { + "epoch": 6.738172920065253, + "grad_norm": 0.006284903734922409, + "learning_rate": 0.0008385658088559845, + "loss": 0.0239, + "num_input_tokens_seen": 89167184, + "step": 41305 + }, + { + "epoch": 6.738988580750408, + "grad_norm": 0.011478284373879433, + "learning_rate": 0.0008385134269041116, + "loss": 0.0143, + "num_input_tokens_seen": 89178448, + "step": 41310 + }, + { + "epoch": 6.739804241435563, + "grad_norm": 0.03838944435119629, + "learning_rate": 0.0008384610380918347, + "loss": 0.0997, + "num_input_tokens_seen": 89189072, + "step": 41315 + }, + { + "epoch": 6.740619902120718, + "grad_norm": 0.006093547213822603, + "learning_rate": 0.0008384086424202156, + "loss": 0.0402, + "num_input_tokens_seen": 89200496, + "step": 41320 + }, + { + "epoch": 6.741435562805873, + "grad_norm": 0.0246463380753994, + "learning_rate": 0.0008383562398903157, + "loss": 0.0427, + "num_input_tokens_seen": 89209840, + "step": 41325 + }, + { + "epoch": 6.742251223491028, + "grad_norm": 0.05461215600371361, + "learning_rate": 0.0008383038305031976, + "loss": 0.1413, + "num_input_tokens_seen": 89220240, + "step": 41330 + }, + { + "epoch": 6.743066884176183, + "grad_norm": 0.09736643731594086, + "learning_rate": 0.0008382514142599234, + "loss": 0.0537, + "num_input_tokens_seen": 89232400, + "step": 41335 + }, + { + "epoch": 6.7438825448613375, + "grad_norm": 0.08002153784036636, + "learning_rate": 0.0008381989911615548, + "loss": 0.1627, + "num_input_tokens_seen": 89242608, + "step": 41340 + }, + { + "epoch": 6.744698205546492, + "grad_norm": 0.042618632316589355, + "learning_rate": 0.0008381465612091549, + "loss": 0.0344, + "num_input_tokens_seen": 89253328, + "step": 41345 + }, + { + "epoch": 6.745513866231647, + "grad_norm": 0.08739539235830307, + "learning_rate": 0.0008380941244037858, + "loss": 0.1271, + "num_input_tokens_seen": 89263984, + "step": 41350 + }, + { + "epoch": 6.746329526916803, + "grad_norm": 0.033923663198947906, + "learning_rate": 0.0008380416807465106, + "loss": 0.0181, + "num_input_tokens_seen": 89275856, + "step": 41355 + }, + { + "epoch": 6.747145187601958, + "grad_norm": 0.02580278180539608, + "learning_rate": 0.0008379892302383916, + "loss": 0.0866, + "num_input_tokens_seen": 89287024, + "step": 41360 + }, + { + "epoch": 6.7479608482871125, + "grad_norm": 0.05051056668162346, + "learning_rate": 0.0008379367728804923, + "loss": 0.137, + "num_input_tokens_seen": 89298032, + "step": 41365 + }, + { + "epoch": 6.748776508972267, + "grad_norm": 0.1584167182445526, + "learning_rate": 0.0008378843086738755, + "loss": 0.0547, + "num_input_tokens_seen": 89309424, + "step": 41370 + }, + { + "epoch": 6.749592169657422, + "grad_norm": 0.28312966227531433, + "learning_rate": 0.0008378318376196046, + "loss": 0.0626, + "num_input_tokens_seen": 89320880, + "step": 41375 + }, + { + "epoch": 6.750407830342578, + "grad_norm": 0.05350763350725174, + "learning_rate": 0.0008377793597187428, + "loss": 0.1495, + "num_input_tokens_seen": 89332624, + "step": 41380 + }, + { + "epoch": 6.751223491027733, + "grad_norm": 0.0965968668460846, + "learning_rate": 0.000837726874972354, + "loss": 0.0603, + "num_input_tokens_seen": 89343344, + "step": 41385 + }, + { + "epoch": 6.7520391517128875, + "grad_norm": 0.12621326744556427, + "learning_rate": 0.0008376743833815015, + "loss": 0.1442, + "num_input_tokens_seen": 89355344, + "step": 41390 + }, + { + "epoch": 6.752854812398042, + "grad_norm": 0.18820859491825104, + "learning_rate": 0.0008376218849472493, + "loss": 0.081, + "num_input_tokens_seen": 89366160, + "step": 41395 + }, + { + "epoch": 6.753670473083197, + "grad_norm": 0.052911221981048584, + "learning_rate": 0.0008375693796706613, + "loss": 0.0858, + "num_input_tokens_seen": 89376976, + "step": 41400 + }, + { + "epoch": 6.754486133768353, + "grad_norm": 0.167776957154274, + "learning_rate": 0.0008375168675528016, + "loss": 0.1075, + "num_input_tokens_seen": 89388176, + "step": 41405 + }, + { + "epoch": 6.755301794453508, + "grad_norm": 0.17197886109352112, + "learning_rate": 0.0008374643485947342, + "loss": 0.0675, + "num_input_tokens_seen": 89399152, + "step": 41410 + }, + { + "epoch": 6.7561174551386625, + "grad_norm": 0.15825363993644714, + "learning_rate": 0.0008374118227975238, + "loss": 0.2705, + "num_input_tokens_seen": 89409968, + "step": 41415 + }, + { + "epoch": 6.756933115823817, + "grad_norm": 0.018170418217778206, + "learning_rate": 0.0008373592901622349, + "loss": 0.0662, + "num_input_tokens_seen": 89420560, + "step": 41420 + }, + { + "epoch": 6.757748776508972, + "grad_norm": 0.1717032790184021, + "learning_rate": 0.0008373067506899319, + "loss": 0.1088, + "num_input_tokens_seen": 89431952, + "step": 41425 + }, + { + "epoch": 6.758564437194127, + "grad_norm": 0.02405218780040741, + "learning_rate": 0.0008372542043816797, + "loss": 0.0324, + "num_input_tokens_seen": 89443152, + "step": 41430 + }, + { + "epoch": 6.759380097879282, + "grad_norm": 0.028346359729766846, + "learning_rate": 0.0008372016512385432, + "loss": 0.0361, + "num_input_tokens_seen": 89454576, + "step": 41435 + }, + { + "epoch": 6.760195758564437, + "grad_norm": 0.0835704356431961, + "learning_rate": 0.0008371490912615875, + "loss": 0.027, + "num_input_tokens_seen": 89465840, + "step": 41440 + }, + { + "epoch": 6.761011419249592, + "grad_norm": 0.004486468154937029, + "learning_rate": 0.0008370965244518778, + "loss": 0.0248, + "num_input_tokens_seen": 89476400, + "step": 41445 + }, + { + "epoch": 6.761827079934747, + "grad_norm": 0.20643800497055054, + "learning_rate": 0.0008370439508104794, + "loss": 0.2141, + "num_input_tokens_seen": 89487568, + "step": 41450 + }, + { + "epoch": 6.762642740619902, + "grad_norm": 0.108114093542099, + "learning_rate": 0.0008369913703384576, + "loss": 0.0932, + "num_input_tokens_seen": 89496400, + "step": 41455 + }, + { + "epoch": 6.763458401305057, + "grad_norm": 0.011129074729979038, + "learning_rate": 0.0008369387830368785, + "loss": 0.0455, + "num_input_tokens_seen": 89507440, + "step": 41460 + }, + { + "epoch": 6.764274061990212, + "grad_norm": 0.028652314096689224, + "learning_rate": 0.0008368861889068071, + "loss": 0.1431, + "num_input_tokens_seen": 89516624, + "step": 41465 + }, + { + "epoch": 6.765089722675367, + "grad_norm": 0.013793841935694218, + "learning_rate": 0.0008368335879493099, + "loss": 0.076, + "num_input_tokens_seen": 89526352, + "step": 41470 + }, + { + "epoch": 6.765905383360522, + "grad_norm": 0.0880393460392952, + "learning_rate": 0.0008367809801654529, + "loss": 0.1123, + "num_input_tokens_seen": 89538128, + "step": 41475 + }, + { + "epoch": 6.766721044045677, + "grad_norm": 0.13772685825824738, + "learning_rate": 0.0008367283655563018, + "loss": 0.0479, + "num_input_tokens_seen": 89549136, + "step": 41480 + }, + { + "epoch": 6.767536704730832, + "grad_norm": 0.008785467594861984, + "learning_rate": 0.0008366757441229235, + "loss": 0.0797, + "num_input_tokens_seen": 89559728, + "step": 41485 + }, + { + "epoch": 6.768352365415987, + "grad_norm": 0.01301692333072424, + "learning_rate": 0.000836623115866384, + "loss": 0.0552, + "num_input_tokens_seen": 89569904, + "step": 41490 + }, + { + "epoch": 6.769168026101142, + "grad_norm": 0.00553498649969697, + "learning_rate": 0.00083657048078775, + "loss": 0.0758, + "num_input_tokens_seen": 89581008, + "step": 41495 + }, + { + "epoch": 6.769983686786297, + "grad_norm": 0.059878524392843246, + "learning_rate": 0.0008365178388880883, + "loss": 0.2503, + "num_input_tokens_seen": 89591728, + "step": 41500 + }, + { + "epoch": 6.770799347471452, + "grad_norm": 0.00952261220663786, + "learning_rate": 0.0008364651901684657, + "loss": 0.0223, + "num_input_tokens_seen": 89601456, + "step": 41505 + }, + { + "epoch": 6.771615008156607, + "grad_norm": 0.024038473144173622, + "learning_rate": 0.0008364125346299492, + "loss": 0.0468, + "num_input_tokens_seen": 89612368, + "step": 41510 + }, + { + "epoch": 6.7724306688417615, + "grad_norm": 0.15927067399024963, + "learning_rate": 0.0008363598722736057, + "loss": 0.0748, + "num_input_tokens_seen": 89623312, + "step": 41515 + }, + { + "epoch": 6.773246329526917, + "grad_norm": 0.1774238497018814, + "learning_rate": 0.0008363072031005028, + "loss": 0.1727, + "num_input_tokens_seen": 89633936, + "step": 41520 + }, + { + "epoch": 6.774061990212072, + "grad_norm": 0.005390689708292484, + "learning_rate": 0.0008362545271117079, + "loss": 0.068, + "num_input_tokens_seen": 89643696, + "step": 41525 + }, + { + "epoch": 6.774877650897227, + "grad_norm": 0.014011479914188385, + "learning_rate": 0.0008362018443082884, + "loss": 0.024, + "num_input_tokens_seen": 89653936, + "step": 41530 + }, + { + "epoch": 6.775693311582382, + "grad_norm": 0.13323059678077698, + "learning_rate": 0.000836149154691312, + "loss": 0.029, + "num_input_tokens_seen": 89664752, + "step": 41535 + }, + { + "epoch": 6.7765089722675365, + "grad_norm": 0.0058629876002669334, + "learning_rate": 0.0008360964582618465, + "loss": 0.03, + "num_input_tokens_seen": 89675984, + "step": 41540 + }, + { + "epoch": 6.777324632952691, + "grad_norm": 0.01777850277721882, + "learning_rate": 0.0008360437550209599, + "loss": 0.0099, + "num_input_tokens_seen": 89686704, + "step": 41545 + }, + { + "epoch": 6.778140293637847, + "grad_norm": 0.23482449352741241, + "learning_rate": 0.0008359910449697203, + "loss": 0.1335, + "num_input_tokens_seen": 89697040, + "step": 41550 + }, + { + "epoch": 6.778955954323002, + "grad_norm": 0.008135825395584106, + "learning_rate": 0.0008359383281091961, + "loss": 0.0255, + "num_input_tokens_seen": 89708208, + "step": 41555 + }, + { + "epoch": 6.779771615008157, + "grad_norm": 0.009201602078974247, + "learning_rate": 0.0008358856044404553, + "loss": 0.0207, + "num_input_tokens_seen": 89719120, + "step": 41560 + }, + { + "epoch": 6.780587275693311, + "grad_norm": 0.010593346320092678, + "learning_rate": 0.0008358328739645668, + "loss": 0.0319, + "num_input_tokens_seen": 89730384, + "step": 41565 + }, + { + "epoch": 6.781402936378466, + "grad_norm": 0.018084678798913956, + "learning_rate": 0.000835780136682599, + "loss": 0.0217, + "num_input_tokens_seen": 89742128, + "step": 41570 + }, + { + "epoch": 6.782218597063622, + "grad_norm": 0.005250321235507727, + "learning_rate": 0.0008357273925956208, + "loss": 0.0948, + "num_input_tokens_seen": 89752944, + "step": 41575 + }, + { + "epoch": 6.783034257748777, + "grad_norm": 0.004432154353708029, + "learning_rate": 0.000835674641704701, + "loss": 0.1143, + "num_input_tokens_seen": 89762448, + "step": 41580 + }, + { + "epoch": 6.783849918433932, + "grad_norm": 0.006419645622372627, + "learning_rate": 0.0008356218840109089, + "loss": 0.1408, + "num_input_tokens_seen": 89773200, + "step": 41585 + }, + { + "epoch": 6.784665579119086, + "grad_norm": 0.008174563758075237, + "learning_rate": 0.0008355691195153134, + "loss": 0.0437, + "num_input_tokens_seen": 89783952, + "step": 41590 + }, + { + "epoch": 6.785481239804241, + "grad_norm": 0.010266945697367191, + "learning_rate": 0.000835516348218984, + "loss": 0.0469, + "num_input_tokens_seen": 89796528, + "step": 41595 + }, + { + "epoch": 6.786296900489396, + "grad_norm": 0.005594492424279451, + "learning_rate": 0.0008354635701229902, + "loss": 0.0198, + "num_input_tokens_seen": 89807664, + "step": 41600 + }, + { + "epoch": 6.787112561174552, + "grad_norm": 0.11710002273321152, + "learning_rate": 0.0008354107852284016, + "loss": 0.0932, + "num_input_tokens_seen": 89817552, + "step": 41605 + }, + { + "epoch": 6.787928221859707, + "grad_norm": 0.02294880710542202, + "learning_rate": 0.0008353579935362881, + "loss": 0.0583, + "num_input_tokens_seen": 89829008, + "step": 41610 + }, + { + "epoch": 6.788743882544861, + "grad_norm": 0.002552360063418746, + "learning_rate": 0.0008353051950477192, + "loss": 0.1488, + "num_input_tokens_seen": 89839856, + "step": 41615 + }, + { + "epoch": 6.789559543230016, + "grad_norm": 0.01202454511076212, + "learning_rate": 0.0008352523897637652, + "loss": 0.0819, + "num_input_tokens_seen": 89851312, + "step": 41620 + }, + { + "epoch": 6.790375203915171, + "grad_norm": 0.03201969340443611, + "learning_rate": 0.0008351995776854962, + "loss": 0.0539, + "num_input_tokens_seen": 89863248, + "step": 41625 + }, + { + "epoch": 6.791190864600326, + "grad_norm": 0.07802778482437134, + "learning_rate": 0.0008351467588139827, + "loss": 0.0866, + "num_input_tokens_seen": 89875344, + "step": 41630 + }, + { + "epoch": 6.7920065252854815, + "grad_norm": 0.006255864631384611, + "learning_rate": 0.0008350939331502949, + "loss": 0.0852, + "num_input_tokens_seen": 89887056, + "step": 41635 + }, + { + "epoch": 6.792822185970636, + "grad_norm": 0.004048587754368782, + "learning_rate": 0.0008350411006955033, + "loss": 0.0779, + "num_input_tokens_seen": 89898064, + "step": 41640 + }, + { + "epoch": 6.793637846655791, + "grad_norm": 0.022298306226730347, + "learning_rate": 0.0008349882614506789, + "loss": 0.071, + "num_input_tokens_seen": 89908816, + "step": 41645 + }, + { + "epoch": 6.794453507340946, + "grad_norm": 0.17016233503818512, + "learning_rate": 0.0008349354154168924, + "loss": 0.0776, + "num_input_tokens_seen": 89918896, + "step": 41650 + }, + { + "epoch": 6.795269168026101, + "grad_norm": 0.16810102760791779, + "learning_rate": 0.0008348825625952148, + "loss": 0.0622, + "num_input_tokens_seen": 89928880, + "step": 41655 + }, + { + "epoch": 6.7960848287112565, + "grad_norm": 0.026905179023742676, + "learning_rate": 0.0008348297029867172, + "loss": 0.0863, + "num_input_tokens_seen": 89939952, + "step": 41660 + }, + { + "epoch": 6.796900489396411, + "grad_norm": 0.1609235256910324, + "learning_rate": 0.0008347768365924709, + "loss": 0.2353, + "num_input_tokens_seen": 89949360, + "step": 41665 + }, + { + "epoch": 6.797716150081566, + "grad_norm": 0.06641438603401184, + "learning_rate": 0.0008347239634135474, + "loss": 0.126, + "num_input_tokens_seen": 89960144, + "step": 41670 + }, + { + "epoch": 6.798531810766721, + "grad_norm": 0.18190033733844757, + "learning_rate": 0.0008346710834510181, + "loss": 0.169, + "num_input_tokens_seen": 89971184, + "step": 41675 + }, + { + "epoch": 6.799347471451876, + "grad_norm": 0.0116404565051198, + "learning_rate": 0.0008346181967059548, + "loss": 0.0957, + "num_input_tokens_seen": 89982288, + "step": 41680 + }, + { + "epoch": 6.800163132137031, + "grad_norm": 0.015148242004215717, + "learning_rate": 0.0008345653031794292, + "loss": 0.0219, + "num_input_tokens_seen": 89993744, + "step": 41685 + }, + { + "epoch": 6.800978792822186, + "grad_norm": 0.14517849683761597, + "learning_rate": 0.0008345124028725133, + "loss": 0.0523, + "num_input_tokens_seen": 90005232, + "step": 41690 + }, + { + "epoch": 6.801794453507341, + "grad_norm": 0.015842413529753685, + "learning_rate": 0.0008344594957862792, + "loss": 0.0805, + "num_input_tokens_seen": 90016240, + "step": 41695 + }, + { + "epoch": 6.802610114192496, + "grad_norm": 0.007341781631112099, + "learning_rate": 0.000834406581921799, + "loss": 0.0106, + "num_input_tokens_seen": 90026736, + "step": 41700 + }, + { + "epoch": 6.803425774877651, + "grad_norm": 0.0991702526807785, + "learning_rate": 0.0008343536612801454, + "loss": 0.0503, + "num_input_tokens_seen": 90037296, + "step": 41705 + }, + { + "epoch": 6.804241435562806, + "grad_norm": 0.005417232867330313, + "learning_rate": 0.0008343007338623906, + "loss": 0.0575, + "num_input_tokens_seen": 90048176, + "step": 41710 + }, + { + "epoch": 6.80505709624796, + "grad_norm": 0.0063780527561903, + "learning_rate": 0.0008342477996696074, + "loss": 0.0132, + "num_input_tokens_seen": 90059056, + "step": 41715 + }, + { + "epoch": 6.805872756933116, + "grad_norm": 0.05414315313100815, + "learning_rate": 0.0008341948587028684, + "loss": 0.0197, + "num_input_tokens_seen": 90069456, + "step": 41720 + }, + { + "epoch": 6.806688417618271, + "grad_norm": 0.011467905715107918, + "learning_rate": 0.0008341419109632466, + "loss": 0.0412, + "num_input_tokens_seen": 90081840, + "step": 41725 + }, + { + "epoch": 6.807504078303426, + "grad_norm": 0.2744033932685852, + "learning_rate": 0.0008340889564518153, + "loss": 0.1273, + "num_input_tokens_seen": 90091600, + "step": 41730 + }, + { + "epoch": 6.808319738988581, + "grad_norm": 0.10340234637260437, + "learning_rate": 0.0008340359951696472, + "loss": 0.0462, + "num_input_tokens_seen": 90103344, + "step": 41735 + }, + { + "epoch": 6.809135399673735, + "grad_norm": 0.2096472531557083, + "learning_rate": 0.0008339830271178162, + "loss": 0.0383, + "num_input_tokens_seen": 90114352, + "step": 41740 + }, + { + "epoch": 6.809951060358891, + "grad_norm": 0.14978985488414764, + "learning_rate": 0.0008339300522973952, + "loss": 0.0273, + "num_input_tokens_seen": 90125584, + "step": 41745 + }, + { + "epoch": 6.810766721044046, + "grad_norm": 0.008478322997689247, + "learning_rate": 0.0008338770707094583, + "loss": 0.1333, + "num_input_tokens_seen": 90136304, + "step": 41750 + }, + { + "epoch": 6.811582381729201, + "grad_norm": 0.02924121916294098, + "learning_rate": 0.0008338240823550789, + "loss": 0.0418, + "num_input_tokens_seen": 90147088, + "step": 41755 + }, + { + "epoch": 6.8123980424143555, + "grad_norm": 0.003852690104395151, + "learning_rate": 0.000833771087235331, + "loss": 0.2877, + "num_input_tokens_seen": 90158512, + "step": 41760 + }, + { + "epoch": 6.81321370309951, + "grad_norm": 0.04444701224565506, + "learning_rate": 0.0008337180853512885, + "loss": 0.0546, + "num_input_tokens_seen": 90168880, + "step": 41765 + }, + { + "epoch": 6.814029363784666, + "grad_norm": 0.018941668793559074, + "learning_rate": 0.0008336650767040258, + "loss": 0.0142, + "num_input_tokens_seen": 90179696, + "step": 41770 + }, + { + "epoch": 6.814845024469821, + "grad_norm": 0.11209810525178909, + "learning_rate": 0.000833612061294617, + "loss": 0.0332, + "num_input_tokens_seen": 90190960, + "step": 41775 + }, + { + "epoch": 6.815660685154976, + "grad_norm": 0.10310380160808563, + "learning_rate": 0.0008335590391241365, + "loss": 0.036, + "num_input_tokens_seen": 90202288, + "step": 41780 + }, + { + "epoch": 6.8164763458401305, + "grad_norm": 0.021101856604218483, + "learning_rate": 0.000833506010193659, + "loss": 0.0167, + "num_input_tokens_seen": 90214288, + "step": 41785 + }, + { + "epoch": 6.817292006525285, + "grad_norm": 0.1528523713350296, + "learning_rate": 0.000833452974504259, + "loss": 0.14, + "num_input_tokens_seen": 90225168, + "step": 41790 + }, + { + "epoch": 6.81810766721044, + "grad_norm": 0.004218652844429016, + "learning_rate": 0.0008333999320570116, + "loss": 0.0249, + "num_input_tokens_seen": 90235760, + "step": 41795 + }, + { + "epoch": 6.818923327895595, + "grad_norm": 0.004681533668190241, + "learning_rate": 0.0008333468828529916, + "loss": 0.0718, + "num_input_tokens_seen": 90246864, + "step": 41800 + }, + { + "epoch": 6.819738988580751, + "grad_norm": 0.005578580778092146, + "learning_rate": 0.0008332938268932742, + "loss": 0.0637, + "num_input_tokens_seen": 90257680, + "step": 41805 + }, + { + "epoch": 6.8205546492659055, + "grad_norm": 0.04931236058473587, + "learning_rate": 0.0008332407641789344, + "loss": 0.1582, + "num_input_tokens_seen": 90268976, + "step": 41810 + }, + { + "epoch": 6.82137030995106, + "grad_norm": 0.03313962370157242, + "learning_rate": 0.0008331876947110478, + "loss": 0.0418, + "num_input_tokens_seen": 90279696, + "step": 41815 + }, + { + "epoch": 6.822185970636215, + "grad_norm": 0.003912081476300955, + "learning_rate": 0.00083313461849069, + "loss": 0.0511, + "num_input_tokens_seen": 90288624, + "step": 41820 + }, + { + "epoch": 6.82300163132137, + "grad_norm": 0.001711890916340053, + "learning_rate": 0.0008330815355189365, + "loss": 0.1588, + "num_input_tokens_seen": 90299984, + "step": 41825 + }, + { + "epoch": 6.823817292006526, + "grad_norm": 0.020905552431941032, + "learning_rate": 0.0008330284457968631, + "loss": 0.0119, + "num_input_tokens_seen": 90309264, + "step": 41830 + }, + { + "epoch": 6.8246329526916805, + "grad_norm": 0.018808018416166306, + "learning_rate": 0.0008329753493255458, + "loss": 0.0379, + "num_input_tokens_seen": 90320528, + "step": 41835 + }, + { + "epoch": 6.825448613376835, + "grad_norm": 0.007119704503566027, + "learning_rate": 0.0008329222461060606, + "loss": 0.0297, + "num_input_tokens_seen": 90331184, + "step": 41840 + }, + { + "epoch": 6.82626427406199, + "grad_norm": 0.0033158219885081053, + "learning_rate": 0.0008328691361394838, + "loss": 0.044, + "num_input_tokens_seen": 90341200, + "step": 41845 + }, + { + "epoch": 6.827079934747145, + "grad_norm": 0.018451089039444923, + "learning_rate": 0.0008328160194268916, + "loss": 0.1375, + "num_input_tokens_seen": 90350992, + "step": 41850 + }, + { + "epoch": 6.827895595432301, + "grad_norm": 0.29108482599258423, + "learning_rate": 0.0008327628959693606, + "loss": 0.1402, + "num_input_tokens_seen": 90361520, + "step": 41855 + }, + { + "epoch": 6.828711256117455, + "grad_norm": 0.020895570516586304, + "learning_rate": 0.0008327097657679674, + "loss": 0.0761, + "num_input_tokens_seen": 90372112, + "step": 41860 + }, + { + "epoch": 6.82952691680261, + "grad_norm": 0.06214666739106178, + "learning_rate": 0.0008326566288237887, + "loss": 0.2773, + "num_input_tokens_seen": 90383856, + "step": 41865 + }, + { + "epoch": 6.830342577487765, + "grad_norm": 0.028796516358852386, + "learning_rate": 0.0008326034851379014, + "loss": 0.0637, + "num_input_tokens_seen": 90393552, + "step": 41870 + }, + { + "epoch": 6.83115823817292, + "grad_norm": 0.34118348360061646, + "learning_rate": 0.0008325503347113826, + "loss": 0.1794, + "num_input_tokens_seen": 90403056, + "step": 41875 + }, + { + "epoch": 6.831973898858075, + "grad_norm": 0.14424841105937958, + "learning_rate": 0.0008324971775453094, + "loss": 0.1827, + "num_input_tokens_seen": 90414160, + "step": 41880 + }, + { + "epoch": 6.8327895595432295, + "grad_norm": 0.15164506435394287, + "learning_rate": 0.0008324440136407591, + "loss": 0.1192, + "num_input_tokens_seen": 90426160, + "step": 41885 + }, + { + "epoch": 6.833605220228385, + "grad_norm": 0.11004175245761871, + "learning_rate": 0.000832390842998809, + "loss": 0.2716, + "num_input_tokens_seen": 90436816, + "step": 41890 + }, + { + "epoch": 6.83442088091354, + "grad_norm": 0.14111927151679993, + "learning_rate": 0.0008323376656205369, + "loss": 0.0793, + "num_input_tokens_seen": 90448144, + "step": 41895 + }, + { + "epoch": 6.835236541598695, + "grad_norm": 0.09101493656635284, + "learning_rate": 0.0008322844815070204, + "loss": 0.0298, + "num_input_tokens_seen": 90458352, + "step": 41900 + }, + { + "epoch": 6.83605220228385, + "grad_norm": 0.26972270011901855, + "learning_rate": 0.0008322312906593373, + "loss": 0.1301, + "num_input_tokens_seen": 90469264, + "step": 41905 + }, + { + "epoch": 6.8368678629690045, + "grad_norm": 0.028450144454836845, + "learning_rate": 0.0008321780930785657, + "loss": 0.1119, + "num_input_tokens_seen": 90478832, + "step": 41910 + }, + { + "epoch": 6.83768352365416, + "grad_norm": 0.02067798748612404, + "learning_rate": 0.0008321248887657836, + "loss": 0.0658, + "num_input_tokens_seen": 90489840, + "step": 41915 + }, + { + "epoch": 6.838499184339315, + "grad_norm": 0.026986312121152878, + "learning_rate": 0.0008320716777220694, + "loss": 0.04, + "num_input_tokens_seen": 90501232, + "step": 41920 + }, + { + "epoch": 6.83931484502447, + "grad_norm": 0.05997032672166824, + "learning_rate": 0.0008320184599485012, + "loss": 0.0272, + "num_input_tokens_seen": 90512560, + "step": 41925 + }, + { + "epoch": 6.840130505709625, + "grad_norm": 0.010622574016451836, + "learning_rate": 0.0008319652354461577, + "loss": 0.0298, + "num_input_tokens_seen": 90523856, + "step": 41930 + }, + { + "epoch": 6.8409461663947795, + "grad_norm": 0.05540604889392853, + "learning_rate": 0.0008319120042161179, + "loss": 0.2076, + "num_input_tokens_seen": 90532720, + "step": 41935 + }, + { + "epoch": 6.841761827079935, + "grad_norm": 0.008242795243859291, + "learning_rate": 0.00083185876625946, + "loss": 0.0684, + "num_input_tokens_seen": 90543888, + "step": 41940 + }, + { + "epoch": 6.84257748776509, + "grad_norm": 0.2523602545261383, + "learning_rate": 0.0008318055215772633, + "loss": 0.2193, + "num_input_tokens_seen": 90553712, + "step": 41945 + }, + { + "epoch": 6.843393148450245, + "grad_norm": 0.2314700335264206, + "learning_rate": 0.0008317522701706066, + "loss": 0.0736, + "num_input_tokens_seen": 90563920, + "step": 41950 + }, + { + "epoch": 6.8442088091354, + "grad_norm": 0.008542205207049847, + "learning_rate": 0.0008316990120405695, + "loss": 0.0145, + "num_input_tokens_seen": 90574544, + "step": 41955 + }, + { + "epoch": 6.8450244698205545, + "grad_norm": 0.024296162649989128, + "learning_rate": 0.0008316457471882311, + "loss": 0.0694, + "num_input_tokens_seen": 90586384, + "step": 41960 + }, + { + "epoch": 6.845840130505709, + "grad_norm": 0.04843230918049812, + "learning_rate": 0.0008315924756146708, + "loss": 0.112, + "num_input_tokens_seen": 90597776, + "step": 41965 + }, + { + "epoch": 6.846655791190865, + "grad_norm": 0.013434921391308308, + "learning_rate": 0.0008315391973209685, + "loss": 0.0267, + "num_input_tokens_seen": 90610128, + "step": 41970 + }, + { + "epoch": 6.84747145187602, + "grad_norm": 0.00877129752188921, + "learning_rate": 0.0008314859123082037, + "loss": 0.1665, + "num_input_tokens_seen": 90621168, + "step": 41975 + }, + { + "epoch": 6.848287112561175, + "grad_norm": 0.04210628569126129, + "learning_rate": 0.0008314326205774563, + "loss": 0.0364, + "num_input_tokens_seen": 90631536, + "step": 41980 + }, + { + "epoch": 6.849102773246329, + "grad_norm": 0.20910513401031494, + "learning_rate": 0.0008313793221298065, + "loss": 0.1053, + "num_input_tokens_seen": 90641168, + "step": 41985 + }, + { + "epoch": 6.849918433931484, + "grad_norm": 0.0039442661218345165, + "learning_rate": 0.0008313260169663343, + "loss": 0.0384, + "num_input_tokens_seen": 90651632, + "step": 41990 + }, + { + "epoch": 6.850734094616639, + "grad_norm": 0.015237169340252876, + "learning_rate": 0.00083127270508812, + "loss": 0.0532, + "num_input_tokens_seen": 90662128, + "step": 41995 + }, + { + "epoch": 6.851549755301795, + "grad_norm": 0.04682604968547821, + "learning_rate": 0.0008312193864962442, + "loss": 0.0581, + "num_input_tokens_seen": 90673808, + "step": 42000 + }, + { + "epoch": 6.85236541598695, + "grad_norm": 0.1308463215827942, + "learning_rate": 0.0008311660611917873, + "loss": 0.044, + "num_input_tokens_seen": 90684752, + "step": 42005 + }, + { + "epoch": 6.853181076672104, + "grad_norm": 0.0018114261329174042, + "learning_rate": 0.00083111272917583, + "loss": 0.0493, + "num_input_tokens_seen": 90695312, + "step": 42010 + }, + { + "epoch": 6.853996737357259, + "grad_norm": 0.12212783098220825, + "learning_rate": 0.0008310593904494532, + "loss": 0.0648, + "num_input_tokens_seen": 90705520, + "step": 42015 + }, + { + "epoch": 6.854812398042414, + "grad_norm": 0.07093533128499985, + "learning_rate": 0.000831006045013738, + "loss": 0.0244, + "num_input_tokens_seen": 90717584, + "step": 42020 + }, + { + "epoch": 6.85562805872757, + "grad_norm": 0.029440863057971, + "learning_rate": 0.0008309526928697653, + "loss": 0.0456, + "num_input_tokens_seen": 90728496, + "step": 42025 + }, + { + "epoch": 6.856443719412725, + "grad_norm": 0.2156383991241455, + "learning_rate": 0.0008308993340186164, + "loss": 0.0886, + "num_input_tokens_seen": 90738960, + "step": 42030 + }, + { + "epoch": 6.857259380097879, + "grad_norm": 0.004571730270981789, + "learning_rate": 0.0008308459684613727, + "loss": 0.0334, + "num_input_tokens_seen": 90750352, + "step": 42035 + }, + { + "epoch": 6.858075040783034, + "grad_norm": 0.021215567365288734, + "learning_rate": 0.0008307925961991158, + "loss": 0.1275, + "num_input_tokens_seen": 90761040, + "step": 42040 + }, + { + "epoch": 6.858890701468189, + "grad_norm": 0.12855474650859833, + "learning_rate": 0.0008307392172329273, + "loss": 0.0672, + "num_input_tokens_seen": 90770992, + "step": 42045 + }, + { + "epoch": 6.859706362153344, + "grad_norm": 0.058318838477134705, + "learning_rate": 0.000830685831563889, + "loss": 0.0376, + "num_input_tokens_seen": 90779088, + "step": 42050 + }, + { + "epoch": 6.8605220228384995, + "grad_norm": 0.009229238145053387, + "learning_rate": 0.0008306324391930827, + "loss": 0.2508, + "num_input_tokens_seen": 90789616, + "step": 42055 + }, + { + "epoch": 6.861337683523654, + "grad_norm": 0.429904043674469, + "learning_rate": 0.0008305790401215906, + "loss": 0.0695, + "num_input_tokens_seen": 90800496, + "step": 42060 + }, + { + "epoch": 6.862153344208809, + "grad_norm": 0.19588007032871246, + "learning_rate": 0.000830525634350495, + "loss": 0.1402, + "num_input_tokens_seen": 90809808, + "step": 42065 + }, + { + "epoch": 6.862969004893964, + "grad_norm": 0.10626070946455002, + "learning_rate": 0.0008304722218808782, + "loss": 0.1107, + "num_input_tokens_seen": 90820880, + "step": 42070 + }, + { + "epoch": 6.863784665579119, + "grad_norm": 0.02412840537726879, + "learning_rate": 0.0008304188027138225, + "loss": 0.0251, + "num_input_tokens_seen": 90831696, + "step": 42075 + }, + { + "epoch": 6.864600326264274, + "grad_norm": 0.024603717029094696, + "learning_rate": 0.0008303653768504105, + "loss": 0.0132, + "num_input_tokens_seen": 90842512, + "step": 42080 + }, + { + "epoch": 6.865415986949429, + "grad_norm": 0.08951440453529358, + "learning_rate": 0.000830311944291725, + "loss": 0.0219, + "num_input_tokens_seen": 90852784, + "step": 42085 + }, + { + "epoch": 6.866231647634584, + "grad_norm": 0.17952275276184082, + "learning_rate": 0.0008302585050388491, + "loss": 0.2274, + "num_input_tokens_seen": 90863280, + "step": 42090 + }, + { + "epoch": 6.867047308319739, + "grad_norm": 0.0043809181079268456, + "learning_rate": 0.0008302050590928656, + "loss": 0.163, + "num_input_tokens_seen": 90874160, + "step": 42095 + }, + { + "epoch": 6.867862969004894, + "grad_norm": 0.05145062506198883, + "learning_rate": 0.0008301516064548577, + "loss": 0.2306, + "num_input_tokens_seen": 90884752, + "step": 42100 + }, + { + "epoch": 6.868678629690049, + "grad_norm": 0.07419506460428238, + "learning_rate": 0.0008300981471259086, + "loss": 0.0983, + "num_input_tokens_seen": 90894704, + "step": 42105 + }, + { + "epoch": 6.869494290375204, + "grad_norm": 0.009989812038838863, + "learning_rate": 0.0008300446811071018, + "loss": 0.0697, + "num_input_tokens_seen": 90904528, + "step": 42110 + }, + { + "epoch": 6.870309951060359, + "grad_norm": 0.11967950314283371, + "learning_rate": 0.0008299912083995208, + "loss": 0.0632, + "num_input_tokens_seen": 90915696, + "step": 42115 + }, + { + "epoch": 6.871125611745514, + "grad_norm": 0.0992414578795433, + "learning_rate": 0.0008299377290042493, + "loss": 0.1288, + "num_input_tokens_seen": 90925776, + "step": 42120 + }, + { + "epoch": 6.871941272430669, + "grad_norm": 0.16489127278327942, + "learning_rate": 0.0008298842429223714, + "loss": 0.0463, + "num_input_tokens_seen": 90936144, + "step": 42125 + }, + { + "epoch": 6.872756933115824, + "grad_norm": 0.0187029130756855, + "learning_rate": 0.0008298307501549706, + "loss": 0.1198, + "num_input_tokens_seen": 90946480, + "step": 42130 + }, + { + "epoch": 6.873572593800979, + "grad_norm": 0.1030842736363411, + "learning_rate": 0.0008297772507031314, + "loss": 0.0387, + "num_input_tokens_seen": 90956720, + "step": 42135 + }, + { + "epoch": 6.874388254486134, + "grad_norm": 0.0898529440164566, + "learning_rate": 0.0008297237445679378, + "loss": 0.0891, + "num_input_tokens_seen": 90967312, + "step": 42140 + }, + { + "epoch": 6.875203915171289, + "grad_norm": 0.033187348395586014, + "learning_rate": 0.0008296702317504741, + "loss": 0.0739, + "num_input_tokens_seen": 90978000, + "step": 42145 + }, + { + "epoch": 6.876019575856444, + "grad_norm": 0.24983662366867065, + "learning_rate": 0.0008296167122518252, + "loss": 0.1066, + "num_input_tokens_seen": 90988304, + "step": 42150 + }, + { + "epoch": 6.876835236541599, + "grad_norm": 0.1629522293806076, + "learning_rate": 0.0008295631860730752, + "loss": 0.1132, + "num_input_tokens_seen": 90998288, + "step": 42155 + }, + { + "epoch": 6.877650897226753, + "grad_norm": 0.015678847208619118, + "learning_rate": 0.0008295096532153093, + "loss": 0.0462, + "num_input_tokens_seen": 91009936, + "step": 42160 + }, + { + "epoch": 6.878466557911908, + "grad_norm": 0.18625904619693756, + "learning_rate": 0.0008294561136796122, + "loss": 0.0879, + "num_input_tokens_seen": 91021456, + "step": 42165 + }, + { + "epoch": 6.879282218597064, + "grad_norm": 0.1537686288356781, + "learning_rate": 0.000829402567467069, + "loss": 0.114, + "num_input_tokens_seen": 91032784, + "step": 42170 + }, + { + "epoch": 6.880097879282219, + "grad_norm": 0.12550821900367737, + "learning_rate": 0.000829349014578765, + "loss": 0.048, + "num_input_tokens_seen": 91043952, + "step": 42175 + }, + { + "epoch": 6.8809135399673735, + "grad_norm": 0.2548525333404541, + "learning_rate": 0.0008292954550157853, + "loss": 0.0793, + "num_input_tokens_seen": 91054704, + "step": 42180 + }, + { + "epoch": 6.881729200652528, + "grad_norm": 0.09930320829153061, + "learning_rate": 0.0008292418887792155, + "loss": 0.1623, + "num_input_tokens_seen": 91064176, + "step": 42185 + }, + { + "epoch": 6.882544861337683, + "grad_norm": 0.16448988020420074, + "learning_rate": 0.0008291883158701413, + "loss": 0.0954, + "num_input_tokens_seen": 91074768, + "step": 42190 + }, + { + "epoch": 6.883360522022839, + "grad_norm": 0.3170793354511261, + "learning_rate": 0.000829134736289648, + "loss": 0.1509, + "num_input_tokens_seen": 91085520, + "step": 42195 + }, + { + "epoch": 6.884176182707994, + "grad_norm": 0.2149861454963684, + "learning_rate": 0.0008290811500388219, + "loss": 0.319, + "num_input_tokens_seen": 91094800, + "step": 42200 + }, + { + "epoch": 6.8849918433931485, + "grad_norm": 0.01610528863966465, + "learning_rate": 0.0008290275571187488, + "loss": 0.053, + "num_input_tokens_seen": 91105840, + "step": 42205 + }, + { + "epoch": 6.885807504078303, + "grad_norm": 0.11854418367147446, + "learning_rate": 0.0008289739575305148, + "loss": 0.2037, + "num_input_tokens_seen": 91116624, + "step": 42210 + }, + { + "epoch": 6.886623164763458, + "grad_norm": 0.04946037009358406, + "learning_rate": 0.0008289203512752063, + "loss": 0.059, + "num_input_tokens_seen": 91128176, + "step": 42215 + }, + { + "epoch": 6.887438825448614, + "grad_norm": 0.13297899067401886, + "learning_rate": 0.0008288667383539097, + "loss": 0.1817, + "num_input_tokens_seen": 91138544, + "step": 42220 + }, + { + "epoch": 6.888254486133769, + "grad_norm": 0.029817230999469757, + "learning_rate": 0.0008288131187677112, + "loss": 0.1033, + "num_input_tokens_seen": 91148624, + "step": 42225 + }, + { + "epoch": 6.8890701468189235, + "grad_norm": 0.053531546145677567, + "learning_rate": 0.000828759492517698, + "loss": 0.1066, + "num_input_tokens_seen": 91158672, + "step": 42230 + }, + { + "epoch": 6.889885807504078, + "grad_norm": 0.37412360310554504, + "learning_rate": 0.0008287058596049563, + "loss": 0.0573, + "num_input_tokens_seen": 91169648, + "step": 42235 + }, + { + "epoch": 6.890701468189233, + "grad_norm": 0.01705174520611763, + "learning_rate": 0.0008286522200305738, + "loss": 0.189, + "num_input_tokens_seen": 91180496, + "step": 42240 + }, + { + "epoch": 6.891517128874388, + "grad_norm": 0.09822686016559601, + "learning_rate": 0.0008285985737956367, + "loss": 0.0714, + "num_input_tokens_seen": 91191440, + "step": 42245 + }, + { + "epoch": 6.892332789559543, + "grad_norm": 0.07510721683502197, + "learning_rate": 0.0008285449209012328, + "loss": 0.1112, + "num_input_tokens_seen": 91201584, + "step": 42250 + }, + { + "epoch": 6.8931484502446985, + "grad_norm": 0.00950673408806324, + "learning_rate": 0.0008284912613484493, + "loss": 0.0234, + "num_input_tokens_seen": 91213104, + "step": 42255 + }, + { + "epoch": 6.893964110929853, + "grad_norm": 0.027947278693318367, + "learning_rate": 0.0008284375951383738, + "loss": 0.07, + "num_input_tokens_seen": 91224560, + "step": 42260 + }, + { + "epoch": 6.894779771615008, + "grad_norm": 0.010591290891170502, + "learning_rate": 0.0008283839222720935, + "loss": 0.0923, + "num_input_tokens_seen": 91235824, + "step": 42265 + }, + { + "epoch": 6.895595432300163, + "grad_norm": 0.017493370920419693, + "learning_rate": 0.0008283302427506966, + "loss": 0.045, + "num_input_tokens_seen": 91247600, + "step": 42270 + }, + { + "epoch": 6.896411092985318, + "grad_norm": 0.023538336157798767, + "learning_rate": 0.0008282765565752708, + "loss": 0.0816, + "num_input_tokens_seen": 91259120, + "step": 42275 + }, + { + "epoch": 6.897226753670473, + "grad_norm": 0.07859956473112106, + "learning_rate": 0.0008282228637469042, + "loss": 0.0292, + "num_input_tokens_seen": 91269040, + "step": 42280 + }, + { + "epoch": 6.898042414355628, + "grad_norm": 0.18535509705543518, + "learning_rate": 0.0008281691642666848, + "loss": 0.0843, + "num_input_tokens_seen": 91280208, + "step": 42285 + }, + { + "epoch": 6.898858075040783, + "grad_norm": 0.00809511262923479, + "learning_rate": 0.000828115458135701, + "loss": 0.0768, + "num_input_tokens_seen": 91291088, + "step": 42290 + }, + { + "epoch": 6.899673735725938, + "grad_norm": 0.02836800180375576, + "learning_rate": 0.0008280617453550412, + "loss": 0.0766, + "num_input_tokens_seen": 91302032, + "step": 42295 + }, + { + "epoch": 6.900489396411093, + "grad_norm": 0.01807980239391327, + "learning_rate": 0.0008280080259257939, + "loss": 0.0318, + "num_input_tokens_seen": 91313552, + "step": 42300 + }, + { + "epoch": 6.901305057096248, + "grad_norm": 0.05334368720650673, + "learning_rate": 0.0008279542998490479, + "loss": 0.023, + "num_input_tokens_seen": 91323728, + "step": 42305 + }, + { + "epoch": 6.902120717781403, + "grad_norm": 0.02683844417333603, + "learning_rate": 0.000827900567125892, + "loss": 0.1843, + "num_input_tokens_seen": 91334960, + "step": 42310 + }, + { + "epoch": 6.902936378466558, + "grad_norm": 0.00377538800239563, + "learning_rate": 0.0008278468277574152, + "loss": 0.0189, + "num_input_tokens_seen": 91346896, + "step": 42315 + }, + { + "epoch": 6.903752039151713, + "grad_norm": 0.0027475962415337563, + "learning_rate": 0.0008277930817447063, + "loss": 0.0585, + "num_input_tokens_seen": 91357168, + "step": 42320 + }, + { + "epoch": 6.904567699836868, + "grad_norm": 0.03678572550415993, + "learning_rate": 0.000827739329088855, + "loss": 0.0723, + "num_input_tokens_seen": 91368688, + "step": 42325 + }, + { + "epoch": 6.9053833605220225, + "grad_norm": 0.01282608974725008, + "learning_rate": 0.0008276855697909502, + "loss": 0.035, + "num_input_tokens_seen": 91379600, + "step": 42330 + }, + { + "epoch": 6.906199021207177, + "grad_norm": 0.02552957832813263, + "learning_rate": 0.0008276318038520818, + "loss": 0.1047, + "num_input_tokens_seen": 91389872, + "step": 42335 + }, + { + "epoch": 6.907014681892333, + "grad_norm": 0.14766758680343628, + "learning_rate": 0.0008275780312733392, + "loss": 0.1004, + "num_input_tokens_seen": 91400944, + "step": 42340 + }, + { + "epoch": 6.907830342577488, + "grad_norm": 0.04660836607217789, + "learning_rate": 0.0008275242520558124, + "loss": 0.0492, + "num_input_tokens_seen": 91412080, + "step": 42345 + }, + { + "epoch": 6.908646003262643, + "grad_norm": 0.011048818938434124, + "learning_rate": 0.000827470466200591, + "loss": 0.0385, + "num_input_tokens_seen": 91423952, + "step": 42350 + }, + { + "epoch": 6.9094616639477975, + "grad_norm": 0.156296968460083, + "learning_rate": 0.0008274166737087652, + "loss": 0.1717, + "num_input_tokens_seen": 91434480, + "step": 42355 + }, + { + "epoch": 6.910277324632952, + "grad_norm": 0.1801958978176117, + "learning_rate": 0.000827362874581425, + "loss": 0.1016, + "num_input_tokens_seen": 91444464, + "step": 42360 + }, + { + "epoch": 6.911092985318108, + "grad_norm": 0.009582912549376488, + "learning_rate": 0.000827309068819661, + "loss": 0.072, + "num_input_tokens_seen": 91453104, + "step": 42365 + }, + { + "epoch": 6.911908646003263, + "grad_norm": 0.004077945835888386, + "learning_rate": 0.0008272552564245635, + "loss": 0.0338, + "num_input_tokens_seen": 91463120, + "step": 42370 + }, + { + "epoch": 6.912724306688418, + "grad_norm": 0.30642205476760864, + "learning_rate": 0.000827201437397223, + "loss": 0.0663, + "num_input_tokens_seen": 91473104, + "step": 42375 + }, + { + "epoch": 6.9135399673735725, + "grad_norm": 0.04695622995495796, + "learning_rate": 0.0008271476117387303, + "loss": 0.1013, + "num_input_tokens_seen": 91484304, + "step": 42380 + }, + { + "epoch": 6.914355628058727, + "grad_norm": 0.007017203606665134, + "learning_rate": 0.0008270937794501763, + "loss": 0.0411, + "num_input_tokens_seen": 91495536, + "step": 42385 + }, + { + "epoch": 6.915171288743883, + "grad_norm": 0.008955334313213825, + "learning_rate": 0.0008270399405326519, + "loss": 0.0049, + "num_input_tokens_seen": 91505392, + "step": 42390 + }, + { + "epoch": 6.915986949429038, + "grad_norm": 0.004118876997381449, + "learning_rate": 0.0008269860949872484, + "loss": 0.0412, + "num_input_tokens_seen": 91517200, + "step": 42395 + }, + { + "epoch": 6.916802610114193, + "grad_norm": 0.007053800858557224, + "learning_rate": 0.0008269322428150565, + "loss": 0.0881, + "num_input_tokens_seen": 91528592, + "step": 42400 + }, + { + "epoch": 6.917618270799347, + "grad_norm": 0.031920380890369415, + "learning_rate": 0.0008268783840171682, + "loss": 0.0278, + "num_input_tokens_seen": 91539216, + "step": 42405 + }, + { + "epoch": 6.918433931484502, + "grad_norm": 0.0018391908379271626, + "learning_rate": 0.0008268245185946748, + "loss": 0.1365, + "num_input_tokens_seen": 91549616, + "step": 42410 + }, + { + "epoch": 6.919249592169657, + "grad_norm": 0.01727990061044693, + "learning_rate": 0.0008267706465486677, + "loss": 0.1234, + "num_input_tokens_seen": 91560496, + "step": 42415 + }, + { + "epoch": 6.920065252854813, + "grad_norm": 0.020252011716365814, + "learning_rate": 0.000826716767880239, + "loss": 0.1724, + "num_input_tokens_seen": 91571408, + "step": 42420 + }, + { + "epoch": 6.920880913539968, + "grad_norm": 0.01203061267733574, + "learning_rate": 0.0008266628825904807, + "loss": 0.0135, + "num_input_tokens_seen": 91580496, + "step": 42425 + }, + { + "epoch": 6.921696574225122, + "grad_norm": 0.1385554075241089, + "learning_rate": 0.0008266089906804845, + "loss": 0.2366, + "num_input_tokens_seen": 91591984, + "step": 42430 + }, + { + "epoch": 6.922512234910277, + "grad_norm": 0.22573736310005188, + "learning_rate": 0.0008265550921513428, + "loss": 0.1033, + "num_input_tokens_seen": 91602672, + "step": 42435 + }, + { + "epoch": 6.923327895595432, + "grad_norm": 0.23567673563957214, + "learning_rate": 0.000826501187004148, + "loss": 0.0372, + "num_input_tokens_seen": 91612752, + "step": 42440 + }, + { + "epoch": 6.924143556280587, + "grad_norm": 0.004416801035404205, + "learning_rate": 0.0008264472752399923, + "loss": 0.0265, + "num_input_tokens_seen": 91623184, + "step": 42445 + }, + { + "epoch": 6.924959216965743, + "grad_norm": 0.008850377053022385, + "learning_rate": 0.0008263933568599687, + "loss": 0.0481, + "num_input_tokens_seen": 91633488, + "step": 42450 + }, + { + "epoch": 6.925774877650897, + "grad_norm": 0.12176383286714554, + "learning_rate": 0.0008263394318651693, + "loss": 0.2035, + "num_input_tokens_seen": 91645456, + "step": 42455 + }, + { + "epoch": 6.926590538336052, + "grad_norm": 0.009678368456661701, + "learning_rate": 0.0008262855002566876, + "loss": 0.1503, + "num_input_tokens_seen": 91656016, + "step": 42460 + }, + { + "epoch": 6.927406199021207, + "grad_norm": 0.06630418449640274, + "learning_rate": 0.0008262315620356163, + "loss": 0.098, + "num_input_tokens_seen": 91667728, + "step": 42465 + }, + { + "epoch": 6.928221859706362, + "grad_norm": 0.03242621570825577, + "learning_rate": 0.0008261776172030484, + "loss": 0.1019, + "num_input_tokens_seen": 91679280, + "step": 42470 + }, + { + "epoch": 6.9290375203915175, + "grad_norm": 0.014712878502905369, + "learning_rate": 0.0008261236657600773, + "loss": 0.0551, + "num_input_tokens_seen": 91690032, + "step": 42475 + }, + { + "epoch": 6.929853181076672, + "grad_norm": 0.03813314065337181, + "learning_rate": 0.0008260697077077964, + "loss": 0.0343, + "num_input_tokens_seen": 91700944, + "step": 42480 + }, + { + "epoch": 6.930668841761827, + "grad_norm": 0.2364635020494461, + "learning_rate": 0.0008260157430472992, + "loss": 0.0595, + "num_input_tokens_seen": 91710096, + "step": 42485 + }, + { + "epoch": 6.931484502446982, + "grad_norm": 0.06237725540995598, + "learning_rate": 0.0008259617717796795, + "loss": 0.1606, + "num_input_tokens_seen": 91719856, + "step": 42490 + }, + { + "epoch": 6.932300163132137, + "grad_norm": 0.2965821921825409, + "learning_rate": 0.0008259077939060309, + "loss": 0.2043, + "num_input_tokens_seen": 91730448, + "step": 42495 + }, + { + "epoch": 6.933115823817292, + "grad_norm": 0.212357297539711, + "learning_rate": 0.0008258538094274475, + "loss": 0.1497, + "num_input_tokens_seen": 91742256, + "step": 42500 + }, + { + "epoch": 6.933931484502447, + "grad_norm": 0.15861161053180695, + "learning_rate": 0.0008257998183450233, + "loss": 0.1473, + "num_input_tokens_seen": 91752848, + "step": 42505 + }, + { + "epoch": 6.934747145187602, + "grad_norm": 0.11676274985074997, + "learning_rate": 0.0008257458206598524, + "loss": 0.0954, + "num_input_tokens_seen": 91764240, + "step": 42510 + }, + { + "epoch": 6.935562805872757, + "grad_norm": 0.02400301769375801, + "learning_rate": 0.0008256918163730291, + "loss": 0.0588, + "num_input_tokens_seen": 91775696, + "step": 42515 + }, + { + "epoch": 6.936378466557912, + "grad_norm": 0.013892348855733871, + "learning_rate": 0.0008256378054856482, + "loss": 0.0358, + "num_input_tokens_seen": 91786576, + "step": 42520 + }, + { + "epoch": 6.937194127243067, + "grad_norm": 0.012792380526661873, + "learning_rate": 0.000825583787998804, + "loss": 0.0196, + "num_input_tokens_seen": 91796880, + "step": 42525 + }, + { + "epoch": 6.938009787928221, + "grad_norm": 0.015135562047362328, + "learning_rate": 0.0008255297639135912, + "loss": 0.1248, + "num_input_tokens_seen": 91807344, + "step": 42530 + }, + { + "epoch": 6.938825448613377, + "grad_norm": 0.019044624641537666, + "learning_rate": 0.000825475733231105, + "loss": 0.0451, + "num_input_tokens_seen": 91818640, + "step": 42535 + }, + { + "epoch": 6.939641109298532, + "grad_norm": 0.027838317677378654, + "learning_rate": 0.0008254216959524399, + "loss": 0.0396, + "num_input_tokens_seen": 91829616, + "step": 42540 + }, + { + "epoch": 6.940456769983687, + "grad_norm": 0.20418648421764374, + "learning_rate": 0.0008253676520786914, + "loss": 0.0493, + "num_input_tokens_seen": 91839376, + "step": 42545 + }, + { + "epoch": 6.941272430668842, + "grad_norm": 0.0036137220449745655, + "learning_rate": 0.0008253136016109547, + "loss": 0.0839, + "num_input_tokens_seen": 91849776, + "step": 42550 + }, + { + "epoch": 6.942088091353996, + "grad_norm": 0.23398908972740173, + "learning_rate": 0.0008252595445503253, + "loss": 0.2354, + "num_input_tokens_seen": 91861520, + "step": 42555 + }, + { + "epoch": 6.942903752039152, + "grad_norm": 0.01237261202186346, + "learning_rate": 0.0008252054808978984, + "loss": 0.0899, + "num_input_tokens_seen": 91872880, + "step": 42560 + }, + { + "epoch": 6.943719412724307, + "grad_norm": 0.043671976774930954, + "learning_rate": 0.0008251514106547698, + "loss": 0.1106, + "num_input_tokens_seen": 91884016, + "step": 42565 + }, + { + "epoch": 6.944535073409462, + "grad_norm": 0.2794142961502075, + "learning_rate": 0.0008250973338220356, + "loss": 0.1788, + "num_input_tokens_seen": 91895504, + "step": 42570 + }, + { + "epoch": 6.945350734094617, + "grad_norm": 0.01190102193504572, + "learning_rate": 0.0008250432504007914, + "loss": 0.0757, + "num_input_tokens_seen": 91906288, + "step": 42575 + }, + { + "epoch": 6.946166394779771, + "grad_norm": 0.19030845165252686, + "learning_rate": 0.0008249891603921334, + "loss": 0.0971, + "num_input_tokens_seen": 91916752, + "step": 42580 + }, + { + "epoch": 6.946982055464927, + "grad_norm": 0.1780133843421936, + "learning_rate": 0.0008249350637971577, + "loss": 0.1882, + "num_input_tokens_seen": 91926992, + "step": 42585 + }, + { + "epoch": 6.947797716150082, + "grad_norm": 0.008191144093871117, + "learning_rate": 0.0008248809606169609, + "loss": 0.0456, + "num_input_tokens_seen": 91935504, + "step": 42590 + }, + { + "epoch": 6.948613376835237, + "grad_norm": 0.13146579265594482, + "learning_rate": 0.0008248268508526393, + "loss": 0.1529, + "num_input_tokens_seen": 91946320, + "step": 42595 + }, + { + "epoch": 6.9494290375203915, + "grad_norm": 0.07623667269945145, + "learning_rate": 0.0008247727345052894, + "loss": 0.0614, + "num_input_tokens_seen": 91957008, + "step": 42600 + }, + { + "epoch": 6.950244698205546, + "grad_norm": 0.19863177835941315, + "learning_rate": 0.000824718611576008, + "loss": 0.0358, + "num_input_tokens_seen": 91967824, + "step": 42605 + }, + { + "epoch": 6.951060358890701, + "grad_norm": 0.20406146347522736, + "learning_rate": 0.0008246644820658922, + "loss": 0.0814, + "num_input_tokens_seen": 91979024, + "step": 42610 + }, + { + "epoch": 6.951876019575856, + "grad_norm": 0.00437986059114337, + "learning_rate": 0.0008246103459760385, + "loss": 0.0942, + "num_input_tokens_seen": 91990288, + "step": 42615 + }, + { + "epoch": 6.952691680261012, + "grad_norm": 0.08447308838367462, + "learning_rate": 0.0008245562033075446, + "loss": 0.0859, + "num_input_tokens_seen": 92000592, + "step": 42620 + }, + { + "epoch": 6.9535073409461665, + "grad_norm": 0.012689322233200073, + "learning_rate": 0.0008245020540615074, + "loss": 0.0785, + "num_input_tokens_seen": 92009520, + "step": 42625 + }, + { + "epoch": 6.954323001631321, + "grad_norm": 0.02655809372663498, + "learning_rate": 0.0008244478982390245, + "loss": 0.0394, + "num_input_tokens_seen": 92020848, + "step": 42630 + }, + { + "epoch": 6.955138662316476, + "grad_norm": 0.23065075278282166, + "learning_rate": 0.0008243937358411933, + "loss": 0.1242, + "num_input_tokens_seen": 92030384, + "step": 42635 + }, + { + "epoch": 6.955954323001631, + "grad_norm": 0.14189988374710083, + "learning_rate": 0.0008243395668691113, + "loss": 0.186, + "num_input_tokens_seen": 92040528, + "step": 42640 + }, + { + "epoch": 6.956769983686787, + "grad_norm": 0.019454549998044968, + "learning_rate": 0.0008242853913238769, + "loss": 0.0401, + "num_input_tokens_seen": 92052368, + "step": 42645 + }, + { + "epoch": 6.9575856443719415, + "grad_norm": 0.02865343913435936, + "learning_rate": 0.0008242312092065873, + "loss": 0.1045, + "num_input_tokens_seen": 92063152, + "step": 42650 + }, + { + "epoch": 6.958401305057096, + "grad_norm": 0.046370331197977066, + "learning_rate": 0.0008241770205183412, + "loss": 0.0249, + "num_input_tokens_seen": 92073296, + "step": 42655 + }, + { + "epoch": 6.959216965742251, + "grad_norm": 0.17401672899723053, + "learning_rate": 0.0008241228252602364, + "loss": 0.1316, + "num_input_tokens_seen": 92084176, + "step": 42660 + }, + { + "epoch": 6.960032626427406, + "grad_norm": 0.13900305330753326, + "learning_rate": 0.0008240686234333714, + "loss": 0.0631, + "num_input_tokens_seen": 92095344, + "step": 42665 + }, + { + "epoch": 6.960848287112562, + "grad_norm": 0.1870998591184616, + "learning_rate": 0.0008240144150388446, + "loss": 0.0595, + "num_input_tokens_seen": 92106384, + "step": 42670 + }, + { + "epoch": 6.9616639477977165, + "grad_norm": 0.11556775867938995, + "learning_rate": 0.0008239602000777548, + "loss": 0.0426, + "num_input_tokens_seen": 92117104, + "step": 42675 + }, + { + "epoch": 6.962479608482871, + "grad_norm": 0.09335777163505554, + "learning_rate": 0.0008239059785512005, + "loss": 0.074, + "num_input_tokens_seen": 92127600, + "step": 42680 + }, + { + "epoch": 6.963295269168026, + "grad_norm": 0.0664716362953186, + "learning_rate": 0.0008238517504602805, + "loss": 0.0649, + "num_input_tokens_seen": 92138448, + "step": 42685 + }, + { + "epoch": 6.964110929853181, + "grad_norm": 0.030236179009079933, + "learning_rate": 0.0008237975158060939, + "loss": 0.0168, + "num_input_tokens_seen": 92148144, + "step": 42690 + }, + { + "epoch": 6.964926590538336, + "grad_norm": 0.3220222592353821, + "learning_rate": 0.0008237432745897402, + "loss": 0.3278, + "num_input_tokens_seen": 92158640, + "step": 42695 + }, + { + "epoch": 6.9657422512234906, + "grad_norm": 0.12084414064884186, + "learning_rate": 0.000823689026812318, + "loss": 0.1605, + "num_input_tokens_seen": 92169008, + "step": 42700 + }, + { + "epoch": 6.966557911908646, + "grad_norm": 0.1535603404045105, + "learning_rate": 0.0008236347724749274, + "loss": 0.0645, + "num_input_tokens_seen": 92180432, + "step": 42705 + }, + { + "epoch": 6.967373572593801, + "grad_norm": 0.011310217902064323, + "learning_rate": 0.0008235805115786672, + "loss": 0.079, + "num_input_tokens_seen": 92191376, + "step": 42710 + }, + { + "epoch": 6.968189233278956, + "grad_norm": 0.006256133317947388, + "learning_rate": 0.0008235262441246376, + "loss": 0.0499, + "num_input_tokens_seen": 92202256, + "step": 42715 + }, + { + "epoch": 6.969004893964111, + "grad_norm": 0.1212262436747551, + "learning_rate": 0.0008234719701139384, + "loss": 0.0194, + "num_input_tokens_seen": 92212400, + "step": 42720 + }, + { + "epoch": 6.9698205546492655, + "grad_norm": 0.1868121474981308, + "learning_rate": 0.0008234176895476692, + "loss": 0.4019, + "num_input_tokens_seen": 92223216, + "step": 42725 + }, + { + "epoch": 6.970636215334421, + "grad_norm": 0.01925431191921234, + "learning_rate": 0.0008233634024269302, + "loss": 0.0504, + "num_input_tokens_seen": 92234928, + "step": 42730 + }, + { + "epoch": 6.971451876019576, + "grad_norm": 0.02216983027756214, + "learning_rate": 0.0008233091087528217, + "loss": 0.1395, + "num_input_tokens_seen": 92245392, + "step": 42735 + }, + { + "epoch": 6.972267536704731, + "grad_norm": 0.1445787101984024, + "learning_rate": 0.000823254808526444, + "loss": 0.098, + "num_input_tokens_seen": 92255824, + "step": 42740 + }, + { + "epoch": 6.973083197389886, + "grad_norm": 0.0640123039484024, + "learning_rate": 0.0008232005017488975, + "loss": 0.1, + "num_input_tokens_seen": 92265904, + "step": 42745 + }, + { + "epoch": 6.9738988580750405, + "grad_norm": 0.00967552699148655, + "learning_rate": 0.0008231461884212828, + "loss": 0.1252, + "num_input_tokens_seen": 92276176, + "step": 42750 + }, + { + "epoch": 6.974714518760196, + "grad_norm": 0.07092220336198807, + "learning_rate": 0.0008230918685447006, + "loss": 0.0913, + "num_input_tokens_seen": 92285744, + "step": 42755 + }, + { + "epoch": 6.975530179445351, + "grad_norm": 0.07268622517585754, + "learning_rate": 0.000823037542120252, + "loss": 0.0292, + "num_input_tokens_seen": 92298128, + "step": 42760 + }, + { + "epoch": 6.976345840130506, + "grad_norm": 0.009699149057269096, + "learning_rate": 0.0008229832091490377, + "loss": 0.023, + "num_input_tokens_seen": 92308752, + "step": 42765 + }, + { + "epoch": 6.977161500815661, + "grad_norm": 0.04308341071009636, + "learning_rate": 0.0008229288696321588, + "loss": 0.1923, + "num_input_tokens_seen": 92319632, + "step": 42770 + }, + { + "epoch": 6.9779771615008155, + "grad_norm": 0.036936596035957336, + "learning_rate": 0.0008228745235707169, + "loss": 0.0562, + "num_input_tokens_seen": 92330576, + "step": 42775 + }, + { + "epoch": 6.97879282218597, + "grad_norm": 0.19755522906780243, + "learning_rate": 0.000822820170965813, + "loss": 0.0824, + "num_input_tokens_seen": 92340304, + "step": 42780 + }, + { + "epoch": 6.979608482871125, + "grad_norm": 0.015825990587472916, + "learning_rate": 0.0008227658118185491, + "loss": 0.1667, + "num_input_tokens_seen": 92350576, + "step": 42785 + }, + { + "epoch": 6.980424143556281, + "grad_norm": 0.17621886730194092, + "learning_rate": 0.0008227114461300262, + "loss": 0.0634, + "num_input_tokens_seen": 92359664, + "step": 42790 + }, + { + "epoch": 6.981239804241436, + "grad_norm": 0.03504452481865883, + "learning_rate": 0.0008226570739013466, + "loss": 0.1335, + "num_input_tokens_seen": 92370032, + "step": 42795 + }, + { + "epoch": 6.9820554649265905, + "grad_norm": 0.031075354665517807, + "learning_rate": 0.0008226026951336121, + "loss": 0.14, + "num_input_tokens_seen": 92380976, + "step": 42800 + }, + { + "epoch": 6.982871125611745, + "grad_norm": 0.02287365309894085, + "learning_rate": 0.0008225483098279247, + "loss": 0.0955, + "num_input_tokens_seen": 92393136, + "step": 42805 + }, + { + "epoch": 6.9836867862969, + "grad_norm": 0.015109248459339142, + "learning_rate": 0.0008224939179853868, + "loss": 0.1101, + "num_input_tokens_seen": 92404208, + "step": 42810 + }, + { + "epoch": 6.984502446982056, + "grad_norm": 0.012240604497492313, + "learning_rate": 0.0008224395196071003, + "loss": 0.0932, + "num_input_tokens_seen": 92415152, + "step": 42815 + }, + { + "epoch": 6.985318107667211, + "grad_norm": 0.012044954113662243, + "learning_rate": 0.000822385114694168, + "loss": 0.0272, + "num_input_tokens_seen": 92426192, + "step": 42820 + }, + { + "epoch": 6.986133768352365, + "grad_norm": 0.013591110706329346, + "learning_rate": 0.0008223307032476923, + "loss": 0.0747, + "num_input_tokens_seen": 92438064, + "step": 42825 + }, + { + "epoch": 6.98694942903752, + "grad_norm": 0.23926964402198792, + "learning_rate": 0.0008222762852687762, + "loss": 0.0809, + "num_input_tokens_seen": 92447280, + "step": 42830 + }, + { + "epoch": 6.987765089722675, + "grad_norm": 0.016742514446377754, + "learning_rate": 0.0008222218607585221, + "loss": 0.039, + "num_input_tokens_seen": 92458416, + "step": 42835 + }, + { + "epoch": 6.988580750407831, + "grad_norm": 0.030596354976296425, + "learning_rate": 0.0008221674297180334, + "loss": 0.0714, + "num_input_tokens_seen": 92469680, + "step": 42840 + }, + { + "epoch": 6.989396411092986, + "grad_norm": 0.09713397175073624, + "learning_rate": 0.000822112992148413, + "loss": 0.0998, + "num_input_tokens_seen": 92480592, + "step": 42845 + }, + { + "epoch": 6.99021207177814, + "grad_norm": 0.10529834032058716, + "learning_rate": 0.000822058548050764, + "loss": 0.07, + "num_input_tokens_seen": 92492336, + "step": 42850 + }, + { + "epoch": 6.991027732463295, + "grad_norm": 0.016078690066933632, + "learning_rate": 0.0008220040974261901, + "loss": 0.0197, + "num_input_tokens_seen": 92502608, + "step": 42855 + }, + { + "epoch": 6.99184339314845, + "grad_norm": 0.03494907543063164, + "learning_rate": 0.0008219496402757948, + "loss": 0.0443, + "num_input_tokens_seen": 92513520, + "step": 42860 + }, + { + "epoch": 6.992659053833605, + "grad_norm": 0.09524475783109665, + "learning_rate": 0.0008218951766006815, + "loss": 0.0906, + "num_input_tokens_seen": 92524656, + "step": 42865 + }, + { + "epoch": 6.993474714518761, + "grad_norm": 0.04555177688598633, + "learning_rate": 0.0008218407064019541, + "loss": 0.0477, + "num_input_tokens_seen": 92535408, + "step": 42870 + }, + { + "epoch": 6.994290375203915, + "grad_norm": 0.14599859714508057, + "learning_rate": 0.0008217862296807165, + "loss": 0.0454, + "num_input_tokens_seen": 92545744, + "step": 42875 + }, + { + "epoch": 6.99510603588907, + "grad_norm": 0.2189893275499344, + "learning_rate": 0.0008217317464380727, + "loss": 0.1091, + "num_input_tokens_seen": 92556432, + "step": 42880 + }, + { + "epoch": 6.995921696574225, + "grad_norm": 0.19589288532733917, + "learning_rate": 0.0008216772566751269, + "loss": 0.1165, + "num_input_tokens_seen": 92568592, + "step": 42885 + }, + { + "epoch": 6.99673735725938, + "grad_norm": 0.015440725721418858, + "learning_rate": 0.0008216227603929835, + "loss": 0.041, + "num_input_tokens_seen": 92579632, + "step": 42890 + }, + { + "epoch": 6.997553017944535, + "grad_norm": 0.0045351614244282246, + "learning_rate": 0.0008215682575927468, + "loss": 0.0175, + "num_input_tokens_seen": 92591344, + "step": 42895 + }, + { + "epoch": 6.99836867862969, + "grad_norm": 0.009294147603213787, + "learning_rate": 0.0008215137482755215, + "loss": 0.1352, + "num_input_tokens_seen": 92601264, + "step": 42900 + }, + { + "epoch": 6.999184339314845, + "grad_norm": 0.015701280906796455, + "learning_rate": 0.0008214592324424122, + "loss": 0.1079, + "num_input_tokens_seen": 92612464, + "step": 42905 + }, + { + "epoch": 7.0, + "grad_norm": 0.03876131772994995, + "learning_rate": 0.0008214047100945236, + "loss": 0.0824, + "num_input_tokens_seen": 92621824, + "step": 42910 + }, + { + "epoch": 7.0, + "eval_loss": 0.1156083270907402, + "eval_runtime": 104.3264, + "eval_samples_per_second": 26.12, + "eval_steps_per_second": 6.537, + "num_input_tokens_seen": 92621824, + "step": 42910 + }, + { + "epoch": 7.000815660685155, + "grad_norm": 0.037131935358047485, + "learning_rate": 0.0008213501812329609, + "loss": 0.0602, + "num_input_tokens_seen": 92633024, + "step": 42915 + }, + { + "epoch": 7.00163132137031, + "grad_norm": 0.027400575578212738, + "learning_rate": 0.0008212956458588292, + "loss": 0.0628, + "num_input_tokens_seen": 92644768, + "step": 42920 + }, + { + "epoch": 7.002446982055465, + "grad_norm": 0.009423032402992249, + "learning_rate": 0.0008212411039732336, + "loss": 0.0215, + "num_input_tokens_seen": 92654240, + "step": 42925 + }, + { + "epoch": 7.00326264274062, + "grad_norm": 0.03070981428027153, + "learning_rate": 0.0008211865555772795, + "loss": 0.0181, + "num_input_tokens_seen": 92664288, + "step": 42930 + }, + { + "epoch": 7.004078303425775, + "grad_norm": 0.0034174807369709015, + "learning_rate": 0.0008211320006720723, + "loss": 0.0841, + "num_input_tokens_seen": 92675104, + "step": 42935 + }, + { + "epoch": 7.00489396411093, + "grad_norm": 0.10467658191919327, + "learning_rate": 0.000821077439258718, + "loss": 0.0222, + "num_input_tokens_seen": 92685056, + "step": 42940 + }, + { + "epoch": 7.005709624796085, + "grad_norm": 0.12603029608726501, + "learning_rate": 0.0008210228713383218, + "loss": 0.0624, + "num_input_tokens_seen": 92696000, + "step": 42945 + }, + { + "epoch": 7.006525285481239, + "grad_norm": 0.02047021873295307, + "learning_rate": 0.00082096829691199, + "loss": 0.0616, + "num_input_tokens_seen": 92706272, + "step": 42950 + }, + { + "epoch": 7.007340946166395, + "grad_norm": 0.004346213303506374, + "learning_rate": 0.0008209137159808284, + "loss": 0.0985, + "num_input_tokens_seen": 92717120, + "step": 42955 + }, + { + "epoch": 7.00815660685155, + "grad_norm": 0.03590996935963631, + "learning_rate": 0.0008208591285459434, + "loss": 0.0471, + "num_input_tokens_seen": 92726432, + "step": 42960 + }, + { + "epoch": 7.008972267536705, + "grad_norm": 0.14895787835121155, + "learning_rate": 0.0008208045346084409, + "loss": 0.1461, + "num_input_tokens_seen": 92736896, + "step": 42965 + }, + { + "epoch": 7.00978792822186, + "grad_norm": 0.04574083164334297, + "learning_rate": 0.0008207499341694278, + "loss": 0.0304, + "num_input_tokens_seen": 92747904, + "step": 42970 + }, + { + "epoch": 7.010603588907014, + "grad_norm": 0.01121122483164072, + "learning_rate": 0.0008206953272300102, + "loss": 0.0564, + "num_input_tokens_seen": 92757920, + "step": 42975 + }, + { + "epoch": 7.011419249592169, + "grad_norm": 0.003328252350911498, + "learning_rate": 0.000820640713791295, + "loss": 0.0304, + "num_input_tokens_seen": 92769664, + "step": 42980 + }, + { + "epoch": 7.012234910277325, + "grad_norm": 0.026780661195516586, + "learning_rate": 0.000820586093854389, + "loss": 0.0194, + "num_input_tokens_seen": 92780256, + "step": 42985 + }, + { + "epoch": 7.01305057096248, + "grad_norm": 0.14641427993774414, + "learning_rate": 0.0008205314674203989, + "loss": 0.0507, + "num_input_tokens_seen": 92791744, + "step": 42990 + }, + { + "epoch": 7.013866231647635, + "grad_norm": 0.2173553854227066, + "learning_rate": 0.0008204768344904323, + "loss": 0.0471, + "num_input_tokens_seen": 92800928, + "step": 42995 + }, + { + "epoch": 7.014681892332789, + "grad_norm": 0.08130283653736115, + "learning_rate": 0.0008204221950655959, + "loss": 0.0782, + "num_input_tokens_seen": 92812352, + "step": 43000 + }, + { + "epoch": 7.015497553017944, + "grad_norm": 0.005499284248799086, + "learning_rate": 0.0008203675491469973, + "loss": 0.0332, + "num_input_tokens_seen": 92822944, + "step": 43005 + }, + { + "epoch": 7.0163132137031, + "grad_norm": 0.04402667284011841, + "learning_rate": 0.0008203128967357438, + "loss": 0.0274, + "num_input_tokens_seen": 92832960, + "step": 43010 + }, + { + "epoch": 7.017128874388255, + "grad_norm": 0.009440748021006584, + "learning_rate": 0.0008202582378329433, + "loss": 0.0593, + "num_input_tokens_seen": 92843584, + "step": 43015 + }, + { + "epoch": 7.0179445350734095, + "grad_norm": 0.0612904392182827, + "learning_rate": 0.0008202035724397032, + "loss": 0.0343, + "num_input_tokens_seen": 92853440, + "step": 43020 + }, + { + "epoch": 7.018760195758564, + "grad_norm": 0.00972797628492117, + "learning_rate": 0.0008201489005571316, + "loss": 0.0276, + "num_input_tokens_seen": 92862112, + "step": 43025 + }, + { + "epoch": 7.019575856443719, + "grad_norm": 0.007808845490217209, + "learning_rate": 0.0008200942221863363, + "loss": 0.0423, + "num_input_tokens_seen": 92872000, + "step": 43030 + }, + { + "epoch": 7.020391517128874, + "grad_norm": 0.02122422121465206, + "learning_rate": 0.0008200395373284255, + "loss": 0.0141, + "num_input_tokens_seen": 92881856, + "step": 43035 + }, + { + "epoch": 7.02120717781403, + "grad_norm": 0.04375322908163071, + "learning_rate": 0.0008199848459845077, + "loss": 0.0436, + "num_input_tokens_seen": 92892768, + "step": 43040 + }, + { + "epoch": 7.0220228384991845, + "grad_norm": 0.04231950640678406, + "learning_rate": 0.0008199301481556907, + "loss": 0.1604, + "num_input_tokens_seen": 92904384, + "step": 43045 + }, + { + "epoch": 7.022838499184339, + "grad_norm": 0.034688569605350494, + "learning_rate": 0.0008198754438430836, + "loss": 0.0968, + "num_input_tokens_seen": 92916160, + "step": 43050 + }, + { + "epoch": 7.023654159869494, + "grad_norm": 0.23951658606529236, + "learning_rate": 0.000819820733047795, + "loss": 0.069, + "num_input_tokens_seen": 92924640, + "step": 43055 + }, + { + "epoch": 7.024469820554649, + "grad_norm": 0.00775554496794939, + "learning_rate": 0.0008197660157709333, + "loss": 0.0219, + "num_input_tokens_seen": 92935264, + "step": 43060 + }, + { + "epoch": 7.025285481239805, + "grad_norm": 0.21151119470596313, + "learning_rate": 0.0008197112920136076, + "loss": 0.0414, + "num_input_tokens_seen": 92947232, + "step": 43065 + }, + { + "epoch": 7.0261011419249595, + "grad_norm": 0.014842070639133453, + "learning_rate": 0.000819656561776927, + "loss": 0.0927, + "num_input_tokens_seen": 92958816, + "step": 43070 + }, + { + "epoch": 7.026916802610114, + "grad_norm": 0.5499007701873779, + "learning_rate": 0.0008196018250620008, + "loss": 0.2615, + "num_input_tokens_seen": 92969824, + "step": 43075 + }, + { + "epoch": 7.027732463295269, + "grad_norm": 0.07445286959409714, + "learning_rate": 0.0008195470818699381, + "loss": 0.0642, + "num_input_tokens_seen": 92980928, + "step": 43080 + }, + { + "epoch": 7.028548123980424, + "grad_norm": 0.13033291697502136, + "learning_rate": 0.0008194923322018484, + "loss": 0.0727, + "num_input_tokens_seen": 92992480, + "step": 43085 + }, + { + "epoch": 7.029363784665579, + "grad_norm": 0.011426898650825024, + "learning_rate": 0.0008194375760588413, + "loss": 0.0281, + "num_input_tokens_seen": 93003232, + "step": 43090 + }, + { + "epoch": 7.0301794453507345, + "grad_norm": 0.053231701254844666, + "learning_rate": 0.0008193828134420265, + "loss": 0.0528, + "num_input_tokens_seen": 93013952, + "step": 43095 + }, + { + "epoch": 7.030995106035889, + "grad_norm": 0.041947923600673676, + "learning_rate": 0.0008193280443525138, + "loss": 0.0927, + "num_input_tokens_seen": 93024064, + "step": 43100 + }, + { + "epoch": 7.031810766721044, + "grad_norm": 0.02890467830002308, + "learning_rate": 0.0008192732687914131, + "loss": 0.0438, + "num_input_tokens_seen": 93036128, + "step": 43105 + }, + { + "epoch": 7.032626427406199, + "grad_norm": 0.202922984957695, + "learning_rate": 0.0008192184867598347, + "loss": 0.0624, + "num_input_tokens_seen": 93047904, + "step": 43110 + }, + { + "epoch": 7.033442088091354, + "grad_norm": 0.0022696128580719233, + "learning_rate": 0.0008191636982588887, + "loss": 0.1998, + "num_input_tokens_seen": 93057568, + "step": 43115 + }, + { + "epoch": 7.034257748776509, + "grad_norm": 0.07663566619157791, + "learning_rate": 0.0008191089032896855, + "loss": 0.0135, + "num_input_tokens_seen": 93068192, + "step": 43120 + }, + { + "epoch": 7.035073409461664, + "grad_norm": 0.002557918429374695, + "learning_rate": 0.0008190541018533353, + "loss": 0.0074, + "num_input_tokens_seen": 93080032, + "step": 43125 + }, + { + "epoch": 7.035889070146819, + "grad_norm": 0.07567388564348221, + "learning_rate": 0.0008189992939509491, + "loss": 0.046, + "num_input_tokens_seen": 93090784, + "step": 43130 + }, + { + "epoch": 7.036704730831974, + "grad_norm": 0.07048489153385162, + "learning_rate": 0.0008189444795836377, + "loss": 0.0276, + "num_input_tokens_seen": 93102176, + "step": 43135 + }, + { + "epoch": 7.037520391517129, + "grad_norm": 0.007807167712599039, + "learning_rate": 0.0008188896587525118, + "loss": 0.0113, + "num_input_tokens_seen": 93112928, + "step": 43140 + }, + { + "epoch": 7.0383360522022835, + "grad_norm": 0.055167485028505325, + "learning_rate": 0.0008188348314586823, + "loss": 0.0116, + "num_input_tokens_seen": 93123872, + "step": 43145 + }, + { + "epoch": 7.039151712887439, + "grad_norm": 0.006673318799585104, + "learning_rate": 0.0008187799977032605, + "loss": 0.0637, + "num_input_tokens_seen": 93134368, + "step": 43150 + }, + { + "epoch": 7.039967373572594, + "grad_norm": 0.17948339879512787, + "learning_rate": 0.0008187251574873576, + "loss": 0.0451, + "num_input_tokens_seen": 93146720, + "step": 43155 + }, + { + "epoch": 7.040783034257749, + "grad_norm": 0.26381227374076843, + "learning_rate": 0.0008186703108120852, + "loss": 0.1097, + "num_input_tokens_seen": 93157632, + "step": 43160 + }, + { + "epoch": 7.041598694942904, + "grad_norm": 0.043247610330581665, + "learning_rate": 0.0008186154576785545, + "loss": 0.0294, + "num_input_tokens_seen": 93167264, + "step": 43165 + }, + { + "epoch": 7.0424143556280585, + "grad_norm": 0.1953558325767517, + "learning_rate": 0.0008185605980878775, + "loss": 0.0725, + "num_input_tokens_seen": 93179136, + "step": 43170 + }, + { + "epoch": 7.043230016313213, + "grad_norm": 0.024059362709522247, + "learning_rate": 0.0008185057320411658, + "loss": 0.1041, + "num_input_tokens_seen": 93189536, + "step": 43175 + }, + { + "epoch": 7.044045676998369, + "grad_norm": 0.011491659097373486, + "learning_rate": 0.0008184508595395314, + "loss": 0.0093, + "num_input_tokens_seen": 93201568, + "step": 43180 + }, + { + "epoch": 7.044861337683524, + "grad_norm": 0.013646122999489307, + "learning_rate": 0.0008183959805840863, + "loss": 0.0467, + "num_input_tokens_seen": 93212128, + "step": 43185 + }, + { + "epoch": 7.045676998368679, + "grad_norm": 0.2544953525066376, + "learning_rate": 0.0008183410951759429, + "loss": 0.2302, + "num_input_tokens_seen": 93222784, + "step": 43190 + }, + { + "epoch": 7.0464926590538335, + "grad_norm": 0.018381478264927864, + "learning_rate": 0.0008182862033162131, + "loss": 0.0189, + "num_input_tokens_seen": 93234016, + "step": 43195 + }, + { + "epoch": 7.047308319738988, + "grad_norm": 0.27290987968444824, + "learning_rate": 0.0008182313050060098, + "loss": 0.0575, + "num_input_tokens_seen": 93243200, + "step": 43200 + }, + { + "epoch": 7.048123980424143, + "grad_norm": 0.004748641978949308, + "learning_rate": 0.0008181764002464454, + "loss": 0.0039, + "num_input_tokens_seen": 93254432, + "step": 43205 + }, + { + "epoch": 7.048939641109299, + "grad_norm": 0.025404224172234535, + "learning_rate": 0.0008181214890386326, + "loss": 0.0107, + "num_input_tokens_seen": 93263360, + "step": 43210 + }, + { + "epoch": 7.049755301794454, + "grad_norm": 0.004372281488031149, + "learning_rate": 0.0008180665713836842, + "loss": 0.0699, + "num_input_tokens_seen": 93273376, + "step": 43215 + }, + { + "epoch": 7.0505709624796085, + "grad_norm": 0.11529897898435593, + "learning_rate": 0.0008180116472827133, + "loss": 0.1152, + "num_input_tokens_seen": 93284416, + "step": 43220 + }, + { + "epoch": 7.051386623164763, + "grad_norm": 0.03548279404640198, + "learning_rate": 0.000817956716736833, + "loss": 0.0521, + "num_input_tokens_seen": 93295776, + "step": 43225 + }, + { + "epoch": 7.052202283849918, + "grad_norm": 0.005636999849230051, + "learning_rate": 0.0008179017797471562, + "loss": 0.19, + "num_input_tokens_seen": 93305408, + "step": 43230 + }, + { + "epoch": 7.053017944535074, + "grad_norm": 0.3719158172607422, + "learning_rate": 0.0008178468363147968, + "loss": 0.2396, + "num_input_tokens_seen": 93316192, + "step": 43235 + }, + { + "epoch": 7.053833605220229, + "grad_norm": 0.036824941635131836, + "learning_rate": 0.000817791886440868, + "loss": 0.0292, + "num_input_tokens_seen": 93327072, + "step": 43240 + }, + { + "epoch": 7.054649265905383, + "grad_norm": 0.007996642962098122, + "learning_rate": 0.0008177369301264834, + "loss": 0.0219, + "num_input_tokens_seen": 93338368, + "step": 43245 + }, + { + "epoch": 7.055464926590538, + "grad_norm": 0.05967731028795242, + "learning_rate": 0.0008176819673727569, + "loss": 0.0589, + "num_input_tokens_seen": 93349952, + "step": 43250 + }, + { + "epoch": 7.056280587275693, + "grad_norm": 0.21131840348243713, + "learning_rate": 0.0008176269981808023, + "loss": 0.0651, + "num_input_tokens_seen": 93360480, + "step": 43255 + }, + { + "epoch": 7.057096247960848, + "grad_norm": 0.031012583523988724, + "learning_rate": 0.0008175720225517337, + "loss": 0.027, + "num_input_tokens_seen": 93371104, + "step": 43260 + }, + { + "epoch": 7.057911908646004, + "grad_norm": 0.13730266690254211, + "learning_rate": 0.0008175170404866652, + "loss": 0.128, + "num_input_tokens_seen": 93381792, + "step": 43265 + }, + { + "epoch": 7.058727569331158, + "grad_norm": 0.008105648681521416, + "learning_rate": 0.0008174620519867109, + "loss": 0.021, + "num_input_tokens_seen": 93393024, + "step": 43270 + }, + { + "epoch": 7.059543230016313, + "grad_norm": 0.007758202031254768, + "learning_rate": 0.0008174070570529854, + "loss": 0.0672, + "num_input_tokens_seen": 93403968, + "step": 43275 + }, + { + "epoch": 7.060358890701468, + "grad_norm": 0.021091684699058533, + "learning_rate": 0.0008173520556866035, + "loss": 0.0132, + "num_input_tokens_seen": 93414336, + "step": 43280 + }, + { + "epoch": 7.061174551386623, + "grad_norm": 0.0786118134856224, + "learning_rate": 0.0008172970478886794, + "loss": 0.0273, + "num_input_tokens_seen": 93425312, + "step": 43285 + }, + { + "epoch": 7.061990212071779, + "grad_norm": 0.16032278537750244, + "learning_rate": 0.0008172420336603281, + "loss": 0.1708, + "num_input_tokens_seen": 93434368, + "step": 43290 + }, + { + "epoch": 7.062805872756933, + "grad_norm": 0.11233207583427429, + "learning_rate": 0.0008171870130026646, + "loss": 0.0625, + "num_input_tokens_seen": 93445696, + "step": 43295 + }, + { + "epoch": 7.063621533442088, + "grad_norm": 0.14412659406661987, + "learning_rate": 0.000817131985916804, + "loss": 0.0254, + "num_input_tokens_seen": 93456768, + "step": 43300 + }, + { + "epoch": 7.064437194127243, + "grad_norm": 0.0023524111602455378, + "learning_rate": 0.0008170769524038613, + "loss": 0.2003, + "num_input_tokens_seen": 93468320, + "step": 43305 + }, + { + "epoch": 7.065252854812398, + "grad_norm": 0.010751700028777122, + "learning_rate": 0.0008170219124649518, + "loss": 0.02, + "num_input_tokens_seen": 93480832, + "step": 43310 + }, + { + "epoch": 7.066068515497553, + "grad_norm": 0.07297611981630325, + "learning_rate": 0.0008169668661011912, + "loss": 0.0266, + "num_input_tokens_seen": 93490880, + "step": 43315 + }, + { + "epoch": 7.066884176182708, + "grad_norm": 0.0020377058535814285, + "learning_rate": 0.0008169118133136951, + "loss": 0.0107, + "num_input_tokens_seen": 93501600, + "step": 43320 + }, + { + "epoch": 7.067699836867863, + "grad_norm": 0.2200804352760315, + "learning_rate": 0.0008168567541035788, + "loss": 0.091, + "num_input_tokens_seen": 93512832, + "step": 43325 + }, + { + "epoch": 7.068515497553018, + "grad_norm": 0.3517674207687378, + "learning_rate": 0.0008168016884719585, + "loss": 0.0335, + "num_input_tokens_seen": 93524416, + "step": 43330 + }, + { + "epoch": 7.069331158238173, + "grad_norm": 0.004308112896978855, + "learning_rate": 0.0008167466164199499, + "loss": 0.0116, + "num_input_tokens_seen": 93535744, + "step": 43335 + }, + { + "epoch": 7.070146818923328, + "grad_norm": 0.0076243216171860695, + "learning_rate": 0.0008166915379486697, + "loss": 0.0079, + "num_input_tokens_seen": 93546016, + "step": 43340 + }, + { + "epoch": 7.0709624796084825, + "grad_norm": 0.24400244653224945, + "learning_rate": 0.0008166364530592334, + "loss": 0.0524, + "num_input_tokens_seen": 93555040, + "step": 43345 + }, + { + "epoch": 7.071778140293638, + "grad_norm": 0.021054862067103386, + "learning_rate": 0.0008165813617527579, + "loss": 0.0078, + "num_input_tokens_seen": 93565184, + "step": 43350 + }, + { + "epoch": 7.072593800978793, + "grad_norm": 0.01333923451602459, + "learning_rate": 0.0008165262640303595, + "loss": 0.0889, + "num_input_tokens_seen": 93576352, + "step": 43355 + }, + { + "epoch": 7.073409461663948, + "grad_norm": 0.002517408225685358, + "learning_rate": 0.0008164711598931546, + "loss": 0.1283, + "num_input_tokens_seen": 93586880, + "step": 43360 + }, + { + "epoch": 7.074225122349103, + "grad_norm": 0.025187604129314423, + "learning_rate": 0.0008164160493422604, + "loss": 0.1121, + "num_input_tokens_seen": 93598560, + "step": 43365 + }, + { + "epoch": 7.075040783034257, + "grad_norm": 0.2560834288597107, + "learning_rate": 0.0008163609323787934, + "loss": 0.1505, + "num_input_tokens_seen": 93608928, + "step": 43370 + }, + { + "epoch": 7.075856443719413, + "grad_norm": 0.30149444937705994, + "learning_rate": 0.0008163058090038709, + "loss": 0.1954, + "num_input_tokens_seen": 93619552, + "step": 43375 + }, + { + "epoch": 7.076672104404568, + "grad_norm": 0.11861838400363922, + "learning_rate": 0.0008162506792186099, + "loss": 0.0318, + "num_input_tokens_seen": 93631264, + "step": 43380 + }, + { + "epoch": 7.077487765089723, + "grad_norm": 0.06109795719385147, + "learning_rate": 0.0008161955430241276, + "loss": 0.0902, + "num_input_tokens_seen": 93641792, + "step": 43385 + }, + { + "epoch": 7.078303425774878, + "grad_norm": 0.005356751848012209, + "learning_rate": 0.0008161404004215415, + "loss": 0.0505, + "num_input_tokens_seen": 93652992, + "step": 43390 + }, + { + "epoch": 7.079119086460032, + "grad_norm": 0.025907672941684723, + "learning_rate": 0.0008160852514119692, + "loss": 0.0466, + "num_input_tokens_seen": 93663552, + "step": 43395 + }, + { + "epoch": 7.079934747145187, + "grad_norm": 0.01540401577949524, + "learning_rate": 0.0008160300959965284, + "loss": 0.0491, + "num_input_tokens_seen": 93674272, + "step": 43400 + }, + { + "epoch": 7.080750407830343, + "grad_norm": 0.007649971172213554, + "learning_rate": 0.0008159749341763367, + "loss": 0.0246, + "num_input_tokens_seen": 93685536, + "step": 43405 + }, + { + "epoch": 7.081566068515498, + "grad_norm": 0.022227033972740173, + "learning_rate": 0.000815919765952512, + "loss": 0.0249, + "num_input_tokens_seen": 93696640, + "step": 43410 + }, + { + "epoch": 7.082381729200653, + "grad_norm": 0.08225790411233902, + "learning_rate": 0.0008158645913261726, + "loss": 0.0238, + "num_input_tokens_seen": 93707008, + "step": 43415 + }, + { + "epoch": 7.083197389885807, + "grad_norm": 0.09061209112405777, + "learning_rate": 0.0008158094102984366, + "loss": 0.0414, + "num_input_tokens_seen": 93717376, + "step": 43420 + }, + { + "epoch": 7.084013050570962, + "grad_norm": 0.06409075111150742, + "learning_rate": 0.0008157542228704221, + "loss": 0.0852, + "num_input_tokens_seen": 93728032, + "step": 43425 + }, + { + "epoch": 7.084828711256117, + "grad_norm": 0.008115890435874462, + "learning_rate": 0.0008156990290432478, + "loss": 0.1264, + "num_input_tokens_seen": 93737792, + "step": 43430 + }, + { + "epoch": 7.085644371941273, + "grad_norm": 0.005179638043045998, + "learning_rate": 0.0008156438288180321, + "loss": 0.0275, + "num_input_tokens_seen": 93749088, + "step": 43435 + }, + { + "epoch": 7.0864600326264275, + "grad_norm": 0.08255940675735474, + "learning_rate": 0.0008155886221958939, + "loss": 0.0209, + "num_input_tokens_seen": 93760512, + "step": 43440 + }, + { + "epoch": 7.087275693311582, + "grad_norm": 0.12074115127325058, + "learning_rate": 0.0008155334091779518, + "loss": 0.0436, + "num_input_tokens_seen": 93771712, + "step": 43445 + }, + { + "epoch": 7.088091353996737, + "grad_norm": 0.023160280659794807, + "learning_rate": 0.0008154781897653251, + "loss": 0.0419, + "num_input_tokens_seen": 93783008, + "step": 43450 + }, + { + "epoch": 7.088907014681892, + "grad_norm": 0.2038068324327469, + "learning_rate": 0.0008154229639591324, + "loss": 0.0877, + "num_input_tokens_seen": 93792928, + "step": 43455 + }, + { + "epoch": 7.089722675367048, + "grad_norm": 0.00666784169152379, + "learning_rate": 0.0008153677317604935, + "loss": 0.0795, + "num_input_tokens_seen": 93803872, + "step": 43460 + }, + { + "epoch": 7.0905383360522025, + "grad_norm": 0.13625626266002655, + "learning_rate": 0.0008153124931705271, + "loss": 0.0958, + "num_input_tokens_seen": 93815584, + "step": 43465 + }, + { + "epoch": 7.091353996737357, + "grad_norm": 0.03665043041110039, + "learning_rate": 0.0008152572481903533, + "loss": 0.056, + "num_input_tokens_seen": 93827040, + "step": 43470 + }, + { + "epoch": 7.092169657422512, + "grad_norm": 0.08530497550964355, + "learning_rate": 0.0008152019968210913, + "loss": 0.0549, + "num_input_tokens_seen": 93836800, + "step": 43475 + }, + { + "epoch": 7.092985318107667, + "grad_norm": 0.18086695671081543, + "learning_rate": 0.0008151467390638611, + "loss": 0.0303, + "num_input_tokens_seen": 93848032, + "step": 43480 + }, + { + "epoch": 7.093800978792822, + "grad_norm": 0.18106631934642792, + "learning_rate": 0.0008150914749197823, + "loss": 0.1936, + "num_input_tokens_seen": 93858720, + "step": 43485 + }, + { + "epoch": 7.0946166394779775, + "grad_norm": 0.0066991038620471954, + "learning_rate": 0.0008150362043899751, + "loss": 0.0301, + "num_input_tokens_seen": 93869696, + "step": 43490 + }, + { + "epoch": 7.095432300163132, + "grad_norm": 0.00845072977244854, + "learning_rate": 0.0008149809274755595, + "loss": 0.0237, + "num_input_tokens_seen": 93880672, + "step": 43495 + }, + { + "epoch": 7.096247960848287, + "grad_norm": 0.03990786895155907, + "learning_rate": 0.0008149256441776559, + "loss": 0.0296, + "num_input_tokens_seen": 93891840, + "step": 43500 + }, + { + "epoch": 7.097063621533442, + "grad_norm": 0.0037902649492025375, + "learning_rate": 0.0008148703544973846, + "loss": 0.0959, + "num_input_tokens_seen": 93900928, + "step": 43505 + }, + { + "epoch": 7.097879282218597, + "grad_norm": 0.21109671890735626, + "learning_rate": 0.000814815058435866, + "loss": 0.0846, + "num_input_tokens_seen": 93912480, + "step": 43510 + }, + { + "epoch": 7.0986949429037525, + "grad_norm": 0.007070023566484451, + "learning_rate": 0.0008147597559942211, + "loss": 0.0525, + "num_input_tokens_seen": 93924192, + "step": 43515 + }, + { + "epoch": 7.099510603588907, + "grad_norm": 0.1824546456336975, + "learning_rate": 0.0008147044471735703, + "loss": 0.0344, + "num_input_tokens_seen": 93934304, + "step": 43520 + }, + { + "epoch": 7.100326264274062, + "grad_norm": 0.013199805282056332, + "learning_rate": 0.0008146491319750346, + "loss": 0.0679, + "num_input_tokens_seen": 93944032, + "step": 43525 + }, + { + "epoch": 7.101141924959217, + "grad_norm": 0.28457075357437134, + "learning_rate": 0.0008145938103997352, + "loss": 0.0633, + "num_input_tokens_seen": 93954304, + "step": 43530 + }, + { + "epoch": 7.101957585644372, + "grad_norm": 0.0027539508882910013, + "learning_rate": 0.0008145384824487931, + "loss": 0.048, + "num_input_tokens_seen": 93964864, + "step": 43535 + }, + { + "epoch": 7.102773246329527, + "grad_norm": 0.3405183255672455, + "learning_rate": 0.0008144831481233296, + "loss": 0.1525, + "num_input_tokens_seen": 93976448, + "step": 43540 + }, + { + "epoch": 7.103588907014682, + "grad_norm": 0.014308121986687183, + "learning_rate": 0.0008144278074244662, + "loss": 0.0142, + "num_input_tokens_seen": 93986848, + "step": 43545 + }, + { + "epoch": 7.104404567699837, + "grad_norm": 0.021814975887537003, + "learning_rate": 0.0008143724603533243, + "loss": 0.0258, + "num_input_tokens_seen": 93997472, + "step": 43550 + }, + { + "epoch": 7.105220228384992, + "grad_norm": 0.004456061404198408, + "learning_rate": 0.0008143171069110258, + "loss": 0.019, + "num_input_tokens_seen": 94007744, + "step": 43555 + }, + { + "epoch": 7.106035889070147, + "grad_norm": 0.08419227600097656, + "learning_rate": 0.0008142617470986924, + "loss": 0.2272, + "num_input_tokens_seen": 94018880, + "step": 43560 + }, + { + "epoch": 7.1068515497553015, + "grad_norm": 0.002248758217319846, + "learning_rate": 0.000814206380917446, + "loss": 0.0221, + "num_input_tokens_seen": 94030112, + "step": 43565 + }, + { + "epoch": 7.107667210440456, + "grad_norm": 0.19043007493019104, + "learning_rate": 0.0008141510083684087, + "loss": 0.125, + "num_input_tokens_seen": 94041120, + "step": 43570 + }, + { + "epoch": 7.108482871125612, + "grad_norm": 0.1310606449842453, + "learning_rate": 0.0008140956294527026, + "loss": 0.0662, + "num_input_tokens_seen": 94052928, + "step": 43575 + }, + { + "epoch": 7.109298531810767, + "grad_norm": 0.014848673716187477, + "learning_rate": 0.00081404024417145, + "loss": 0.0478, + "num_input_tokens_seen": 94064480, + "step": 43580 + }, + { + "epoch": 7.110114192495922, + "grad_norm": 0.08837317675352097, + "learning_rate": 0.0008139848525257737, + "loss": 0.0691, + "num_input_tokens_seen": 94076064, + "step": 43585 + }, + { + "epoch": 7.1109298531810765, + "grad_norm": 0.24765542149543762, + "learning_rate": 0.000813929454516796, + "loss": 0.076, + "num_input_tokens_seen": 94085216, + "step": 43590 + }, + { + "epoch": 7.111745513866231, + "grad_norm": 0.4069119095802307, + "learning_rate": 0.0008138740501456396, + "loss": 0.2066, + "num_input_tokens_seen": 94095904, + "step": 43595 + }, + { + "epoch": 7.112561174551387, + "grad_norm": 0.03654762730002403, + "learning_rate": 0.0008138186394134275, + "loss": 0.0358, + "num_input_tokens_seen": 94108288, + "step": 43600 + }, + { + "epoch": 7.113376835236542, + "grad_norm": 0.005083922296762466, + "learning_rate": 0.0008137632223212824, + "loss": 0.0346, + "num_input_tokens_seen": 94117440, + "step": 43605 + }, + { + "epoch": 7.114192495921697, + "grad_norm": 0.0020810861606150866, + "learning_rate": 0.0008137077988703276, + "loss": 0.0651, + "num_input_tokens_seen": 94127488, + "step": 43610 + }, + { + "epoch": 7.1150081566068515, + "grad_norm": 0.15817776322364807, + "learning_rate": 0.0008136523690616864, + "loss": 0.0386, + "num_input_tokens_seen": 94137824, + "step": 43615 + }, + { + "epoch": 7.115823817292006, + "grad_norm": 0.25216636061668396, + "learning_rate": 0.000813596932896482, + "loss": 0.1614, + "num_input_tokens_seen": 94149632, + "step": 43620 + }, + { + "epoch": 7.116639477977161, + "grad_norm": 0.025949271395802498, + "learning_rate": 0.000813541490375838, + "loss": 0.0494, + "num_input_tokens_seen": 94161184, + "step": 43625 + }, + { + "epoch": 7.117455138662317, + "grad_norm": 0.011439338326454163, + "learning_rate": 0.0008134860415008778, + "loss": 0.0414, + "num_input_tokens_seen": 94170656, + "step": 43630 + }, + { + "epoch": 7.118270799347472, + "grad_norm": 0.012489515356719494, + "learning_rate": 0.0008134305862727253, + "loss": 0.0401, + "num_input_tokens_seen": 94180384, + "step": 43635 + }, + { + "epoch": 7.1190864600326265, + "grad_norm": 0.004568039905279875, + "learning_rate": 0.0008133751246925046, + "loss": 0.0188, + "num_input_tokens_seen": 94191456, + "step": 43640 + }, + { + "epoch": 7.119902120717781, + "grad_norm": 0.006068985443562269, + "learning_rate": 0.0008133196567613391, + "loss": 0.0853, + "num_input_tokens_seen": 94202112, + "step": 43645 + }, + { + "epoch": 7.120717781402936, + "grad_norm": 0.15539832413196564, + "learning_rate": 0.0008132641824803534, + "loss": 0.1121, + "num_input_tokens_seen": 94213312, + "step": 43650 + }, + { + "epoch": 7.121533442088092, + "grad_norm": 0.009866473264992237, + "learning_rate": 0.0008132087018506716, + "loss": 0.0821, + "num_input_tokens_seen": 94225248, + "step": 43655 + }, + { + "epoch": 7.122349102773247, + "grad_norm": 0.17353636026382446, + "learning_rate": 0.0008131532148734182, + "loss": 0.1229, + "num_input_tokens_seen": 94236416, + "step": 43660 + }, + { + "epoch": 7.123164763458401, + "grad_norm": 0.02224327251315117, + "learning_rate": 0.0008130977215497177, + "loss": 0.0176, + "num_input_tokens_seen": 94247040, + "step": 43665 + }, + { + "epoch": 7.123980424143556, + "grad_norm": 0.154402494430542, + "learning_rate": 0.0008130422218806945, + "loss": 0.0499, + "num_input_tokens_seen": 94257472, + "step": 43670 + }, + { + "epoch": 7.124796084828711, + "grad_norm": 0.2165873944759369, + "learning_rate": 0.0008129867158674737, + "loss": 0.1286, + "num_input_tokens_seen": 94267776, + "step": 43675 + }, + { + "epoch": 7.125611745513866, + "grad_norm": 0.29000595211982727, + "learning_rate": 0.00081293120351118, + "loss": 0.065, + "num_input_tokens_seen": 94279552, + "step": 43680 + }, + { + "epoch": 7.126427406199022, + "grad_norm": 0.010081104002892971, + "learning_rate": 0.0008128756848129386, + "loss": 0.0382, + "num_input_tokens_seen": 94291424, + "step": 43685 + }, + { + "epoch": 7.127243066884176, + "grad_norm": 0.08514951169490814, + "learning_rate": 0.0008128201597738744, + "loss": 0.0539, + "num_input_tokens_seen": 94302080, + "step": 43690 + }, + { + "epoch": 7.128058727569331, + "grad_norm": 0.012129147537052631, + "learning_rate": 0.0008127646283951129, + "loss": 0.0731, + "num_input_tokens_seen": 94313440, + "step": 43695 + }, + { + "epoch": 7.128874388254486, + "grad_norm": 0.022055018693208694, + "learning_rate": 0.0008127090906777793, + "loss": 0.0186, + "num_input_tokens_seen": 94322784, + "step": 43700 + }, + { + "epoch": 7.129690048939641, + "grad_norm": 0.21874502301216125, + "learning_rate": 0.0008126535466229993, + "loss": 0.0486, + "num_input_tokens_seen": 94333280, + "step": 43705 + }, + { + "epoch": 7.130505709624796, + "grad_norm": 0.26190274953842163, + "learning_rate": 0.0008125979962318987, + "loss": 0.0912, + "num_input_tokens_seen": 94344032, + "step": 43710 + }, + { + "epoch": 7.131321370309951, + "grad_norm": 0.008965296670794487, + "learning_rate": 0.000812542439505603, + "loss": 0.0545, + "num_input_tokens_seen": 94354816, + "step": 43715 + }, + { + "epoch": 7.132137030995106, + "grad_norm": 0.015662262216210365, + "learning_rate": 0.0008124868764452384, + "loss": 0.0837, + "num_input_tokens_seen": 94364896, + "step": 43720 + }, + { + "epoch": 7.132952691680261, + "grad_norm": 0.0291599091142416, + "learning_rate": 0.0008124313070519307, + "loss": 0.1204, + "num_input_tokens_seen": 94375648, + "step": 43725 + }, + { + "epoch": 7.133768352365416, + "grad_norm": 0.007128790020942688, + "learning_rate": 0.0008123757313268064, + "loss": 0.0928, + "num_input_tokens_seen": 94386816, + "step": 43730 + }, + { + "epoch": 7.134584013050571, + "grad_norm": 0.015012525022029877, + "learning_rate": 0.0008123201492709915, + "loss": 0.0657, + "num_input_tokens_seen": 94397856, + "step": 43735 + }, + { + "epoch": 7.135399673735726, + "grad_norm": 0.20593784749507904, + "learning_rate": 0.0008122645608856125, + "loss": 0.0829, + "num_input_tokens_seen": 94408128, + "step": 43740 + }, + { + "epoch": 7.136215334420881, + "grad_norm": 0.01139918901026249, + "learning_rate": 0.0008122089661717961, + "loss": 0.0775, + "num_input_tokens_seen": 94418912, + "step": 43745 + }, + { + "epoch": 7.137030995106036, + "grad_norm": 0.018016086891293526, + "learning_rate": 0.000812153365130669, + "loss": 0.0297, + "num_input_tokens_seen": 94428576, + "step": 43750 + }, + { + "epoch": 7.137846655791191, + "grad_norm": 0.000830206845421344, + "learning_rate": 0.0008120977577633578, + "loss": 0.1955, + "num_input_tokens_seen": 94439232, + "step": 43755 + }, + { + "epoch": 7.138662316476346, + "grad_norm": 0.08882143348455429, + "learning_rate": 0.0008120421440709897, + "loss": 0.049, + "num_input_tokens_seen": 94450144, + "step": 43760 + }, + { + "epoch": 7.1394779771615005, + "grad_norm": 0.011505679227411747, + "learning_rate": 0.0008119865240546918, + "loss": 0.0232, + "num_input_tokens_seen": 94461664, + "step": 43765 + }, + { + "epoch": 7.140293637846656, + "grad_norm": 0.013276083394885063, + "learning_rate": 0.000811930897715591, + "loss": 0.0109, + "num_input_tokens_seen": 94471392, + "step": 43770 + }, + { + "epoch": 7.141109298531811, + "grad_norm": 0.01674427092075348, + "learning_rate": 0.0008118752650548151, + "loss": 0.1275, + "num_input_tokens_seen": 94482848, + "step": 43775 + }, + { + "epoch": 7.141924959216966, + "grad_norm": 0.014342593960464, + "learning_rate": 0.0008118196260734911, + "loss": 0.0969, + "num_input_tokens_seen": 94493792, + "step": 43780 + }, + { + "epoch": 7.142740619902121, + "grad_norm": 0.08684085309505463, + "learning_rate": 0.000811763980772747, + "loss": 0.0292, + "num_input_tokens_seen": 94504448, + "step": 43785 + }, + { + "epoch": 7.143556280587275, + "grad_norm": 0.011089487932622433, + "learning_rate": 0.0008117083291537102, + "loss": 0.0674, + "num_input_tokens_seen": 94515296, + "step": 43790 + }, + { + "epoch": 7.14437194127243, + "grad_norm": 0.025529926642775536, + "learning_rate": 0.0008116526712175087, + "loss": 0.014, + "num_input_tokens_seen": 94525984, + "step": 43795 + }, + { + "epoch": 7.145187601957586, + "grad_norm": 0.03695162013173103, + "learning_rate": 0.0008115970069652705, + "loss": 0.1155, + "num_input_tokens_seen": 94536512, + "step": 43800 + }, + { + "epoch": 7.146003262642741, + "grad_norm": 0.01570282317698002, + "learning_rate": 0.0008115413363981237, + "loss": 0.0616, + "num_input_tokens_seen": 94546208, + "step": 43805 + }, + { + "epoch": 7.146818923327896, + "grad_norm": 0.04409771040081978, + "learning_rate": 0.0008114856595171963, + "loss": 0.0383, + "num_input_tokens_seen": 94558656, + "step": 43810 + }, + { + "epoch": 7.14763458401305, + "grad_norm": 0.02857673540711403, + "learning_rate": 0.000811429976323617, + "loss": 0.0153, + "num_input_tokens_seen": 94568832, + "step": 43815 + }, + { + "epoch": 7.148450244698205, + "grad_norm": 0.26509881019592285, + "learning_rate": 0.0008113742868185142, + "loss": 0.1537, + "num_input_tokens_seen": 94580512, + "step": 43820 + }, + { + "epoch": 7.149265905383361, + "grad_norm": 0.21938468515872955, + "learning_rate": 0.0008113185910030163, + "loss": 0.0914, + "num_input_tokens_seen": 94592608, + "step": 43825 + }, + { + "epoch": 7.150081566068516, + "grad_norm": 0.20130236446857452, + "learning_rate": 0.0008112628888782523, + "loss": 0.2288, + "num_input_tokens_seen": 94603200, + "step": 43830 + }, + { + "epoch": 7.150897226753671, + "grad_norm": 0.12405912578105927, + "learning_rate": 0.0008112071804453511, + "loss": 0.0458, + "num_input_tokens_seen": 94614656, + "step": 43835 + }, + { + "epoch": 7.151712887438825, + "grad_norm": 0.08040972054004669, + "learning_rate": 0.0008111514657054415, + "loss": 0.0952, + "num_input_tokens_seen": 94624512, + "step": 43840 + }, + { + "epoch": 7.15252854812398, + "grad_norm": 0.008264830335974693, + "learning_rate": 0.0008110957446596527, + "loss": 0.0954, + "num_input_tokens_seen": 94635360, + "step": 43845 + }, + { + "epoch": 7.153344208809135, + "grad_norm": 0.0061530740931630135, + "learning_rate": 0.0008110400173091142, + "loss": 0.0315, + "num_input_tokens_seen": 94645440, + "step": 43850 + }, + { + "epoch": 7.154159869494291, + "grad_norm": 0.005276225507259369, + "learning_rate": 0.0008109842836549549, + "loss": 0.1282, + "num_input_tokens_seen": 94656832, + "step": 43855 + }, + { + "epoch": 7.1549755301794455, + "grad_norm": 0.011168314144015312, + "learning_rate": 0.0008109285436983047, + "loss": 0.019, + "num_input_tokens_seen": 94666816, + "step": 43860 + }, + { + "epoch": 7.1557911908646, + "grad_norm": 0.03374440222978592, + "learning_rate": 0.000810872797440293, + "loss": 0.0505, + "num_input_tokens_seen": 94677888, + "step": 43865 + }, + { + "epoch": 7.156606851549755, + "grad_norm": 0.10254388302564621, + "learning_rate": 0.0008108170448820498, + "loss": 0.0735, + "num_input_tokens_seen": 94689088, + "step": 43870 + }, + { + "epoch": 7.15742251223491, + "grad_norm": 0.0074286083690822124, + "learning_rate": 0.0008107612860247049, + "loss": 0.0684, + "num_input_tokens_seen": 94700992, + "step": 43875 + }, + { + "epoch": 7.158238172920065, + "grad_norm": 0.042076606303453445, + "learning_rate": 0.0008107055208693882, + "loss": 0.0224, + "num_input_tokens_seen": 94710112, + "step": 43880 + }, + { + "epoch": 7.1590538336052205, + "grad_norm": 0.005659482441842556, + "learning_rate": 0.00081064974941723, + "loss": 0.1526, + "num_input_tokens_seen": 94720544, + "step": 43885 + }, + { + "epoch": 7.159869494290375, + "grad_norm": 0.009977896697819233, + "learning_rate": 0.0008105939716693606, + "loss": 0.0165, + "num_input_tokens_seen": 94730496, + "step": 43890 + }, + { + "epoch": 7.16068515497553, + "grad_norm": 0.007157988380640745, + "learning_rate": 0.0008105381876269104, + "loss": 0.1122, + "num_input_tokens_seen": 94741696, + "step": 43895 + }, + { + "epoch": 7.161500815660685, + "grad_norm": 0.0726219192147255, + "learning_rate": 0.0008104823972910098, + "loss": 0.0246, + "num_input_tokens_seen": 94752000, + "step": 43900 + }, + { + "epoch": 7.16231647634584, + "grad_norm": 0.08875808119773865, + "learning_rate": 0.0008104266006627895, + "loss": 0.0646, + "num_input_tokens_seen": 94763264, + "step": 43905 + }, + { + "epoch": 7.1631321370309955, + "grad_norm": 0.24911761283874512, + "learning_rate": 0.0008103707977433804, + "loss": 0.1179, + "num_input_tokens_seen": 94773504, + "step": 43910 + }, + { + "epoch": 7.16394779771615, + "grad_norm": 0.10114079713821411, + "learning_rate": 0.0008103149885339134, + "loss": 0.1813, + "num_input_tokens_seen": 94783328, + "step": 43915 + }, + { + "epoch": 7.164763458401305, + "grad_norm": 0.013637186028063297, + "learning_rate": 0.0008102591730355193, + "loss": 0.0179, + "num_input_tokens_seen": 94795584, + "step": 43920 + }, + { + "epoch": 7.16557911908646, + "grad_norm": 0.042124371975660324, + "learning_rate": 0.0008102033512493297, + "loss": 0.0716, + "num_input_tokens_seen": 94805408, + "step": 43925 + }, + { + "epoch": 7.166394779771615, + "grad_norm": 0.28791525959968567, + "learning_rate": 0.0008101475231764756, + "loss": 0.0644, + "num_input_tokens_seen": 94817024, + "step": 43930 + }, + { + "epoch": 7.16721044045677, + "grad_norm": 0.13619418442249298, + "learning_rate": 0.0008100916888180884, + "loss": 0.1604, + "num_input_tokens_seen": 94828480, + "step": 43935 + }, + { + "epoch": 7.168026101141925, + "grad_norm": 0.3316522538661957, + "learning_rate": 0.0008100358481752998, + "loss": 0.1234, + "num_input_tokens_seen": 94839552, + "step": 43940 + }, + { + "epoch": 7.16884176182708, + "grad_norm": 0.004593029152601957, + "learning_rate": 0.0008099800012492415, + "loss": 0.1164, + "num_input_tokens_seen": 94851200, + "step": 43945 + }, + { + "epoch": 7.169657422512235, + "grad_norm": 0.005391410551965237, + "learning_rate": 0.0008099241480410451, + "loss": 0.0525, + "num_input_tokens_seen": 94863744, + "step": 43950 + }, + { + "epoch": 7.17047308319739, + "grad_norm": 0.0871690884232521, + "learning_rate": 0.0008098682885518427, + "loss": 0.0718, + "num_input_tokens_seen": 94873504, + "step": 43955 + }, + { + "epoch": 7.171288743882545, + "grad_norm": 0.0019110729917883873, + "learning_rate": 0.0008098124227827663, + "loss": 0.0459, + "num_input_tokens_seen": 94883232, + "step": 43960 + }, + { + "epoch": 7.1721044045677, + "grad_norm": 0.02396559715270996, + "learning_rate": 0.0008097565507349482, + "loss": 0.1094, + "num_input_tokens_seen": 94893984, + "step": 43965 + }, + { + "epoch": 7.172920065252855, + "grad_norm": 0.014821198768913746, + "learning_rate": 0.0008097006724095208, + "loss": 0.1352, + "num_input_tokens_seen": 94904960, + "step": 43970 + }, + { + "epoch": 7.17373572593801, + "grad_norm": 0.2088932991027832, + "learning_rate": 0.0008096447878076161, + "loss": 0.0772, + "num_input_tokens_seen": 94914656, + "step": 43975 + }, + { + "epoch": 7.174551386623165, + "grad_norm": 0.12234581261873245, + "learning_rate": 0.0008095888969303672, + "loss": 0.0807, + "num_input_tokens_seen": 94925632, + "step": 43980 + }, + { + "epoch": 7.1753670473083195, + "grad_norm": 0.15001000463962555, + "learning_rate": 0.0008095329997789063, + "loss": 0.0462, + "num_input_tokens_seen": 94937376, + "step": 43985 + }, + { + "epoch": 7.176182707993474, + "grad_norm": 0.35589733719825745, + "learning_rate": 0.0008094770963543667, + "loss": 0.0695, + "num_input_tokens_seen": 94948608, + "step": 43990 + }, + { + "epoch": 7.17699836867863, + "grad_norm": 0.005793836433440447, + "learning_rate": 0.0008094211866578812, + "loss": 0.0177, + "num_input_tokens_seen": 94958368, + "step": 43995 + }, + { + "epoch": 7.177814029363785, + "grad_norm": 0.022857986390590668, + "learning_rate": 0.0008093652706905827, + "loss": 0.018, + "num_input_tokens_seen": 94968576, + "step": 44000 + }, + { + "epoch": 7.17862969004894, + "grad_norm": 0.4139455258846283, + "learning_rate": 0.0008093093484536045, + "loss": 0.1317, + "num_input_tokens_seen": 94979552, + "step": 44005 + }, + { + "epoch": 7.1794453507340945, + "grad_norm": 0.03317214548587799, + "learning_rate": 0.0008092534199480801, + "loss": 0.0904, + "num_input_tokens_seen": 94990656, + "step": 44010 + }, + { + "epoch": 7.180261011419249, + "grad_norm": 0.0371476374566555, + "learning_rate": 0.0008091974851751427, + "loss": 0.0875, + "num_input_tokens_seen": 95000480, + "step": 44015 + }, + { + "epoch": 7.181076672104404, + "grad_norm": 0.016350461170077324, + "learning_rate": 0.0008091415441359261, + "loss": 0.0153, + "num_input_tokens_seen": 95011584, + "step": 44020 + }, + { + "epoch": 7.18189233278956, + "grad_norm": 0.010699317790567875, + "learning_rate": 0.000809085596831564, + "loss": 0.1714, + "num_input_tokens_seen": 95021824, + "step": 44025 + }, + { + "epoch": 7.182707993474715, + "grad_norm": 0.015659386292099953, + "learning_rate": 0.0008090296432631901, + "loss": 0.0182, + "num_input_tokens_seen": 95033696, + "step": 44030 + }, + { + "epoch": 7.1835236541598695, + "grad_norm": 0.028240693733096123, + "learning_rate": 0.0008089736834319384, + "loss": 0.0468, + "num_input_tokens_seen": 95043936, + "step": 44035 + }, + { + "epoch": 7.184339314845024, + "grad_norm": 0.06662532687187195, + "learning_rate": 0.0008089177173389431, + "loss": 0.0189, + "num_input_tokens_seen": 95055040, + "step": 44040 + }, + { + "epoch": 7.185154975530179, + "grad_norm": 0.09909687936306, + "learning_rate": 0.0008088617449853382, + "loss": 0.1042, + "num_input_tokens_seen": 95066368, + "step": 44045 + }, + { + "epoch": 7.185970636215335, + "grad_norm": 0.2886401116847992, + "learning_rate": 0.0008088057663722583, + "loss": 0.0764, + "num_input_tokens_seen": 95077728, + "step": 44050 + }, + { + "epoch": 7.18678629690049, + "grad_norm": 0.10438928753137589, + "learning_rate": 0.000808749781500838, + "loss": 0.0624, + "num_input_tokens_seen": 95088160, + "step": 44055 + }, + { + "epoch": 7.1876019575856445, + "grad_norm": 0.2474057525396347, + "learning_rate": 0.0008086937903722114, + "loss": 0.1301, + "num_input_tokens_seen": 95099424, + "step": 44060 + }, + { + "epoch": 7.188417618270799, + "grad_norm": 0.02843836508691311, + "learning_rate": 0.0008086377929875137, + "loss": 0.0915, + "num_input_tokens_seen": 95109600, + "step": 44065 + }, + { + "epoch": 7.189233278955954, + "grad_norm": 0.006170382723212242, + "learning_rate": 0.0008085817893478797, + "loss": 0.1538, + "num_input_tokens_seen": 95119424, + "step": 44070 + }, + { + "epoch": 7.190048939641109, + "grad_norm": 0.28220275044441223, + "learning_rate": 0.0008085257794544441, + "loss": 0.1204, + "num_input_tokens_seen": 95130368, + "step": 44075 + }, + { + "epoch": 7.190864600326265, + "grad_norm": 0.0660393163561821, + "learning_rate": 0.0008084697633083422, + "loss": 0.02, + "num_input_tokens_seen": 95142400, + "step": 44080 + }, + { + "epoch": 7.191680261011419, + "grad_norm": 0.005627527832984924, + "learning_rate": 0.0008084137409107093, + "loss": 0.2249, + "num_input_tokens_seen": 95153472, + "step": 44085 + }, + { + "epoch": 7.192495921696574, + "grad_norm": 0.2373238205909729, + "learning_rate": 0.0008083577122626806, + "loss": 0.17, + "num_input_tokens_seen": 95163072, + "step": 44090 + }, + { + "epoch": 7.193311582381729, + "grad_norm": 0.08928152173757553, + "learning_rate": 0.0008083016773653917, + "loss": 0.0396, + "num_input_tokens_seen": 95174656, + "step": 44095 + }, + { + "epoch": 7.194127243066884, + "grad_norm": 0.041915565729141235, + "learning_rate": 0.0008082456362199783, + "loss": 0.081, + "num_input_tokens_seen": 95184960, + "step": 44100 + }, + { + "epoch": 7.19494290375204, + "grad_norm": 0.2657569944858551, + "learning_rate": 0.000808189588827576, + "loss": 0.1285, + "num_input_tokens_seen": 95194848, + "step": 44105 + }, + { + "epoch": 7.195758564437194, + "grad_norm": 0.2140921801328659, + "learning_rate": 0.0008081335351893206, + "loss": 0.0924, + "num_input_tokens_seen": 95207072, + "step": 44110 + }, + { + "epoch": 7.196574225122349, + "grad_norm": 0.006198524497449398, + "learning_rate": 0.0008080774753063485, + "loss": 0.0869, + "num_input_tokens_seen": 95217920, + "step": 44115 + }, + { + "epoch": 7.197389885807504, + "grad_norm": 0.04713178798556328, + "learning_rate": 0.0008080214091797953, + "loss": 0.0461, + "num_input_tokens_seen": 95228224, + "step": 44120 + }, + { + "epoch": 7.198205546492659, + "grad_norm": 0.0165233351290226, + "learning_rate": 0.0008079653368107975, + "loss": 0.09, + "num_input_tokens_seen": 95239008, + "step": 44125 + }, + { + "epoch": 7.199021207177814, + "grad_norm": 0.045287325978279114, + "learning_rate": 0.0008079092582004915, + "loss": 0.1659, + "num_input_tokens_seen": 95248800, + "step": 44130 + }, + { + "epoch": 7.199836867862969, + "grad_norm": 0.09527766704559326, + "learning_rate": 0.0008078531733500137, + "loss": 0.1735, + "num_input_tokens_seen": 95259840, + "step": 44135 + }, + { + "epoch": 7.200652528548124, + "grad_norm": 0.2100456804037094, + "learning_rate": 0.000807797082260501, + "loss": 0.1135, + "num_input_tokens_seen": 95270848, + "step": 44140 + }, + { + "epoch": 7.201468189233279, + "grad_norm": 0.008313166908919811, + "learning_rate": 0.0008077409849330898, + "loss": 0.0426, + "num_input_tokens_seen": 95281472, + "step": 44145 + }, + { + "epoch": 7.202283849918434, + "grad_norm": 0.03848846256732941, + "learning_rate": 0.0008076848813689171, + "loss": 0.1728, + "num_input_tokens_seen": 95292160, + "step": 44150 + }, + { + "epoch": 7.203099510603589, + "grad_norm": 0.14277447760105133, + "learning_rate": 0.0008076287715691201, + "loss": 0.0661, + "num_input_tokens_seen": 95302208, + "step": 44155 + }, + { + "epoch": 7.2039151712887435, + "grad_norm": 0.09677635878324509, + "learning_rate": 0.0008075726555348357, + "loss": 0.1719, + "num_input_tokens_seen": 95312032, + "step": 44160 + }, + { + "epoch": 7.204730831973899, + "grad_norm": 0.04431038349866867, + "learning_rate": 0.0008075165332672013, + "loss": 0.1151, + "num_input_tokens_seen": 95323744, + "step": 44165 + }, + { + "epoch": 7.205546492659054, + "grad_norm": 0.07031480967998505, + "learning_rate": 0.0008074604047673542, + "loss": 0.0514, + "num_input_tokens_seen": 95333856, + "step": 44170 + }, + { + "epoch": 7.206362153344209, + "grad_norm": 0.0030981923919171095, + "learning_rate": 0.000807404270036432, + "loss": 0.0442, + "num_input_tokens_seen": 95344864, + "step": 44175 + }, + { + "epoch": 7.207177814029364, + "grad_norm": 0.06514888256788254, + "learning_rate": 0.0008073481290755723, + "loss": 0.0334, + "num_input_tokens_seen": 95355520, + "step": 44180 + }, + { + "epoch": 7.2079934747145185, + "grad_norm": 0.019382953643798828, + "learning_rate": 0.0008072919818859128, + "loss": 0.0138, + "num_input_tokens_seen": 95366752, + "step": 44185 + }, + { + "epoch": 7.208809135399674, + "grad_norm": 0.011314399540424347, + "learning_rate": 0.0008072358284685915, + "loss": 0.0284, + "num_input_tokens_seen": 95378304, + "step": 44190 + }, + { + "epoch": 7.209624796084829, + "grad_norm": 0.37969690561294556, + "learning_rate": 0.0008071796688247463, + "loss": 0.1555, + "num_input_tokens_seen": 95389024, + "step": 44195 + }, + { + "epoch": 7.210440456769984, + "grad_norm": 0.012319131754338741, + "learning_rate": 0.0008071235029555155, + "loss": 0.0206, + "num_input_tokens_seen": 95398656, + "step": 44200 + }, + { + "epoch": 7.211256117455139, + "grad_norm": 0.1699908971786499, + "learning_rate": 0.0008070673308620373, + "loss": 0.0543, + "num_input_tokens_seen": 95408160, + "step": 44205 + }, + { + "epoch": 7.212071778140293, + "grad_norm": 0.0082081388682127, + "learning_rate": 0.0008070111525454501, + "loss": 0.0199, + "num_input_tokens_seen": 95419936, + "step": 44210 + }, + { + "epoch": 7.212887438825448, + "grad_norm": 0.08519245684146881, + "learning_rate": 0.0008069549680068923, + "loss": 0.1502, + "num_input_tokens_seen": 95429952, + "step": 44215 + }, + { + "epoch": 7.213703099510604, + "grad_norm": 0.009200910106301308, + "learning_rate": 0.0008068987772475029, + "loss": 0.0375, + "num_input_tokens_seen": 95441024, + "step": 44220 + }, + { + "epoch": 7.214518760195759, + "grad_norm": 0.007293750066310167, + "learning_rate": 0.0008068425802684204, + "loss": 0.0328, + "num_input_tokens_seen": 95452448, + "step": 44225 + }, + { + "epoch": 7.215334420880914, + "grad_norm": 0.07962770015001297, + "learning_rate": 0.0008067863770707838, + "loss": 0.0412, + "num_input_tokens_seen": 95462432, + "step": 44230 + }, + { + "epoch": 7.216150081566068, + "grad_norm": 0.36658623814582825, + "learning_rate": 0.0008067301676557319, + "loss": 0.1121, + "num_input_tokens_seen": 95473312, + "step": 44235 + }, + { + "epoch": 7.216965742251223, + "grad_norm": 0.28085070848464966, + "learning_rate": 0.0008066739520244042, + "loss": 0.1571, + "num_input_tokens_seen": 95482624, + "step": 44240 + }, + { + "epoch": 7.217781402936378, + "grad_norm": 0.009199964813888073, + "learning_rate": 0.0008066177301779396, + "loss": 0.008, + "num_input_tokens_seen": 95494304, + "step": 44245 + }, + { + "epoch": 7.218597063621534, + "grad_norm": 0.007793826516717672, + "learning_rate": 0.0008065615021174779, + "loss": 0.0721, + "num_input_tokens_seen": 95505024, + "step": 44250 + }, + { + "epoch": 7.219412724306689, + "grad_norm": 0.2228405922651291, + "learning_rate": 0.0008065052678441584, + "loss": 0.0542, + "num_input_tokens_seen": 95516224, + "step": 44255 + }, + { + "epoch": 7.220228384991843, + "grad_norm": 0.007188491988927126, + "learning_rate": 0.0008064490273591209, + "loss": 0.0178, + "num_input_tokens_seen": 95527168, + "step": 44260 + }, + { + "epoch": 7.221044045676998, + "grad_norm": 0.010537146590650082, + "learning_rate": 0.000806392780663505, + "loss": 0.0259, + "num_input_tokens_seen": 95537152, + "step": 44265 + }, + { + "epoch": 7.221859706362153, + "grad_norm": 0.10568735003471375, + "learning_rate": 0.0008063365277584508, + "loss": 0.032, + "num_input_tokens_seen": 95549248, + "step": 44270 + }, + { + "epoch": 7.222675367047309, + "grad_norm": 0.0827159583568573, + "learning_rate": 0.0008062802686450982, + "loss": 0.1131, + "num_input_tokens_seen": 95560704, + "step": 44275 + }, + { + "epoch": 7.2234910277324635, + "grad_norm": 0.0011052305344492197, + "learning_rate": 0.0008062240033245875, + "loss": 0.0693, + "num_input_tokens_seen": 95572864, + "step": 44280 + }, + { + "epoch": 7.224306688417618, + "grad_norm": 0.13459408283233643, + "learning_rate": 0.0008061677317980587, + "loss": 0.0893, + "num_input_tokens_seen": 95582944, + "step": 44285 + }, + { + "epoch": 7.225122349102773, + "grad_norm": 0.008045937865972519, + "learning_rate": 0.0008061114540666525, + "loss": 0.0495, + "num_input_tokens_seen": 95593472, + "step": 44290 + }, + { + "epoch": 7.225938009787928, + "grad_norm": 0.05994420498609543, + "learning_rate": 0.0008060551701315093, + "loss": 0.1217, + "num_input_tokens_seen": 95604064, + "step": 44295 + }, + { + "epoch": 7.226753670473083, + "grad_norm": 0.005377685651183128, + "learning_rate": 0.00080599887999377, + "loss": 0.0139, + "num_input_tokens_seen": 95616192, + "step": 44300 + }, + { + "epoch": 7.2275693311582385, + "grad_norm": 0.321201354265213, + "learning_rate": 0.0008059425836545751, + "loss": 0.1536, + "num_input_tokens_seen": 95626976, + "step": 44305 + }, + { + "epoch": 7.228384991843393, + "grad_norm": 0.01316730584949255, + "learning_rate": 0.0008058862811150657, + "loss": 0.0206, + "num_input_tokens_seen": 95635328, + "step": 44310 + }, + { + "epoch": 7.229200652528548, + "grad_norm": 0.07034162431955338, + "learning_rate": 0.0008058299723763826, + "loss": 0.076, + "num_input_tokens_seen": 95645248, + "step": 44315 + }, + { + "epoch": 7.230016313213703, + "grad_norm": 0.002507708966732025, + "learning_rate": 0.0008057736574396673, + "loss": 0.1009, + "num_input_tokens_seen": 95656928, + "step": 44320 + }, + { + "epoch": 7.230831973898858, + "grad_norm": 0.00748209236189723, + "learning_rate": 0.000805717336306061, + "loss": 0.0095, + "num_input_tokens_seen": 95667200, + "step": 44325 + }, + { + "epoch": 7.231647634584013, + "grad_norm": 0.06890705227851868, + "learning_rate": 0.000805661008976705, + "loss": 0.1148, + "num_input_tokens_seen": 95677248, + "step": 44330 + }, + { + "epoch": 7.232463295269168, + "grad_norm": 0.007699467241764069, + "learning_rate": 0.0008056046754527406, + "loss": 0.09, + "num_input_tokens_seen": 95687104, + "step": 44335 + }, + { + "epoch": 7.233278955954323, + "grad_norm": 0.03326559066772461, + "learning_rate": 0.00080554833573531, + "loss": 0.1028, + "num_input_tokens_seen": 95697792, + "step": 44340 + }, + { + "epoch": 7.234094616639478, + "grad_norm": 0.015088078565895557, + "learning_rate": 0.0008054919898255548, + "loss": 0.1197, + "num_input_tokens_seen": 95708800, + "step": 44345 + }, + { + "epoch": 7.234910277324633, + "grad_norm": 0.0038959085941314697, + "learning_rate": 0.0008054356377246168, + "loss": 0.0539, + "num_input_tokens_seen": 95719296, + "step": 44350 + }, + { + "epoch": 7.235725938009788, + "grad_norm": 0.017113251611590385, + "learning_rate": 0.0008053792794336381, + "loss": 0.0461, + "num_input_tokens_seen": 95730336, + "step": 44355 + }, + { + "epoch": 7.236541598694943, + "grad_norm": 0.03525650128722191, + "learning_rate": 0.0008053229149537611, + "loss": 0.1673, + "num_input_tokens_seen": 95741760, + "step": 44360 + }, + { + "epoch": 7.237357259380098, + "grad_norm": 0.029893476516008377, + "learning_rate": 0.0008052665442861278, + "loss": 0.1033, + "num_input_tokens_seen": 95752896, + "step": 44365 + }, + { + "epoch": 7.238172920065253, + "grad_norm": 0.18972687423229218, + "learning_rate": 0.0008052101674318805, + "loss": 0.2537, + "num_input_tokens_seen": 95763968, + "step": 44370 + }, + { + "epoch": 7.238988580750408, + "grad_norm": 0.007927324622869492, + "learning_rate": 0.0008051537843921623, + "loss": 0.0131, + "num_input_tokens_seen": 95774528, + "step": 44375 + }, + { + "epoch": 7.239804241435563, + "grad_norm": 0.03225468844175339, + "learning_rate": 0.0008050973951681153, + "loss": 0.0223, + "num_input_tokens_seen": 95785600, + "step": 44380 + }, + { + "epoch": 7.240619902120717, + "grad_norm": 0.08666492998600006, + "learning_rate": 0.0008050409997608827, + "loss": 0.2147, + "num_input_tokens_seen": 95795104, + "step": 44385 + }, + { + "epoch": 7.241435562805873, + "grad_norm": 0.011844728142023087, + "learning_rate": 0.0008049845981716072, + "loss": 0.0249, + "num_input_tokens_seen": 95807104, + "step": 44390 + }, + { + "epoch": 7.242251223491028, + "grad_norm": 0.011588122695684433, + "learning_rate": 0.0008049281904014318, + "loss": 0.0613, + "num_input_tokens_seen": 95816832, + "step": 44395 + }, + { + "epoch": 7.243066884176183, + "grad_norm": 0.13519887626171112, + "learning_rate": 0.0008048717764514999, + "loss": 0.0326, + "num_input_tokens_seen": 95826688, + "step": 44400 + }, + { + "epoch": 7.2438825448613375, + "grad_norm": 0.006396844517439604, + "learning_rate": 0.0008048153563229548, + "loss": 0.0172, + "num_input_tokens_seen": 95835904, + "step": 44405 + }, + { + "epoch": 7.244698205546492, + "grad_norm": 0.16635337471961975, + "learning_rate": 0.0008047589300169398, + "loss": 0.0923, + "num_input_tokens_seen": 95847136, + "step": 44410 + }, + { + "epoch": 7.245513866231648, + "grad_norm": 0.009575733914971352, + "learning_rate": 0.0008047024975345983, + "loss": 0.1289, + "num_input_tokens_seen": 95858368, + "step": 44415 + }, + { + "epoch": 7.246329526916803, + "grad_norm": 0.008785121142864227, + "learning_rate": 0.0008046460588770743, + "loss": 0.1947, + "num_input_tokens_seen": 95868544, + "step": 44420 + }, + { + "epoch": 7.247145187601958, + "grad_norm": 0.02464008890092373, + "learning_rate": 0.0008045896140455114, + "loss": 0.0367, + "num_input_tokens_seen": 95878944, + "step": 44425 + }, + { + "epoch": 7.2479608482871125, + "grad_norm": 0.41443708539009094, + "learning_rate": 0.0008045331630410535, + "loss": 0.0926, + "num_input_tokens_seen": 95889792, + "step": 44430 + }, + { + "epoch": 7.248776508972267, + "grad_norm": 0.009849157184362411, + "learning_rate": 0.0008044767058648448, + "loss": 0.0675, + "num_input_tokens_seen": 95900896, + "step": 44435 + }, + { + "epoch": 7.249592169657422, + "grad_norm": 0.03230138123035431, + "learning_rate": 0.0008044202425180293, + "loss": 0.0734, + "num_input_tokens_seen": 95911808, + "step": 44440 + }, + { + "epoch": 7.250407830342578, + "grad_norm": 0.03321171924471855, + "learning_rate": 0.0008043637730017515, + "loss": 0.0664, + "num_input_tokens_seen": 95921696, + "step": 44445 + }, + { + "epoch": 7.251223491027733, + "grad_norm": 0.04850888252258301, + "learning_rate": 0.0008043072973171557, + "loss": 0.0645, + "num_input_tokens_seen": 95931104, + "step": 44450 + }, + { + "epoch": 7.2520391517128875, + "grad_norm": 0.017876798287034035, + "learning_rate": 0.0008042508154653865, + "loss": 0.0931, + "num_input_tokens_seen": 95941376, + "step": 44455 + }, + { + "epoch": 7.252854812398042, + "grad_norm": 0.035348426550626755, + "learning_rate": 0.0008041943274475886, + "loss": 0.0721, + "num_input_tokens_seen": 95950496, + "step": 44460 + }, + { + "epoch": 7.253670473083197, + "grad_norm": 0.11679347604513168, + "learning_rate": 0.0008041378332649067, + "loss": 0.0412, + "num_input_tokens_seen": 95961824, + "step": 44465 + }, + { + "epoch": 7.254486133768353, + "grad_norm": 0.18483665585517883, + "learning_rate": 0.0008040813329184857, + "loss": 0.1062, + "num_input_tokens_seen": 95972768, + "step": 44470 + }, + { + "epoch": 7.255301794453508, + "grad_norm": 0.03931093588471413, + "learning_rate": 0.000804024826409471, + "loss": 0.0653, + "num_input_tokens_seen": 95985024, + "step": 44475 + }, + { + "epoch": 7.2561174551386625, + "grad_norm": 0.011977437883615494, + "learning_rate": 0.0008039683137390073, + "loss": 0.0056, + "num_input_tokens_seen": 95996768, + "step": 44480 + }, + { + "epoch": 7.256933115823817, + "grad_norm": 0.17915965616703033, + "learning_rate": 0.0008039117949082401, + "loss": 0.0771, + "num_input_tokens_seen": 96007328, + "step": 44485 + }, + { + "epoch": 7.257748776508972, + "grad_norm": 0.09553663432598114, + "learning_rate": 0.0008038552699183148, + "loss": 0.0787, + "num_input_tokens_seen": 96017472, + "step": 44490 + }, + { + "epoch": 7.258564437194127, + "grad_norm": 0.029736965894699097, + "learning_rate": 0.0008037987387703771, + "loss": 0.0244, + "num_input_tokens_seen": 96026048, + "step": 44495 + }, + { + "epoch": 7.259380097879283, + "grad_norm": 0.0023016296327114105, + "learning_rate": 0.0008037422014655725, + "loss": 0.0335, + "num_input_tokens_seen": 96037024, + "step": 44500 + }, + { + "epoch": 7.260195758564437, + "grad_norm": 0.03756634518504143, + "learning_rate": 0.0008036856580050469, + "loss": 0.0226, + "num_input_tokens_seen": 96048640, + "step": 44505 + }, + { + "epoch": 7.261011419249592, + "grad_norm": 0.06967487931251526, + "learning_rate": 0.000803629108389946, + "loss": 0.0222, + "num_input_tokens_seen": 96059968, + "step": 44510 + }, + { + "epoch": 7.261827079934747, + "grad_norm": 0.11164039373397827, + "learning_rate": 0.0008035725526214164, + "loss": 0.0708, + "num_input_tokens_seen": 96070592, + "step": 44515 + }, + { + "epoch": 7.262642740619902, + "grad_norm": 0.4093096852302551, + "learning_rate": 0.0008035159907006037, + "loss": 0.0625, + "num_input_tokens_seen": 96080768, + "step": 44520 + }, + { + "epoch": 7.263458401305057, + "grad_norm": 0.008445779792964458, + "learning_rate": 0.0008034594226286545, + "loss": 0.0935, + "num_input_tokens_seen": 96091584, + "step": 44525 + }, + { + "epoch": 7.264274061990212, + "grad_norm": 0.27699118852615356, + "learning_rate": 0.0008034028484067149, + "loss": 0.1375, + "num_input_tokens_seen": 96103264, + "step": 44530 + }, + { + "epoch": 7.265089722675367, + "grad_norm": 0.00355733186006546, + "learning_rate": 0.0008033462680359319, + "loss": 0.1964, + "num_input_tokens_seen": 96112992, + "step": 44535 + }, + { + "epoch": 7.265905383360522, + "grad_norm": 0.1967250555753708, + "learning_rate": 0.000803289681517452, + "loss": 0.0803, + "num_input_tokens_seen": 96124480, + "step": 44540 + }, + { + "epoch": 7.266721044045677, + "grad_norm": 0.14862824976444244, + "learning_rate": 0.0008032330888524217, + "loss": 0.0496, + "num_input_tokens_seen": 96134848, + "step": 44545 + }, + { + "epoch": 7.267536704730832, + "grad_norm": 0.0052166893146932125, + "learning_rate": 0.0008031764900419885, + "loss": 0.0315, + "num_input_tokens_seen": 96146176, + "step": 44550 + }, + { + "epoch": 7.268352365415987, + "grad_norm": 0.008145149797201157, + "learning_rate": 0.000803119885087299, + "loss": 0.0846, + "num_input_tokens_seen": 96157824, + "step": 44555 + }, + { + "epoch": 7.269168026101142, + "grad_norm": 0.002600356237962842, + "learning_rate": 0.0008030632739895004, + "loss": 0.0138, + "num_input_tokens_seen": 96168992, + "step": 44560 + }, + { + "epoch": 7.269983686786297, + "grad_norm": 0.03837938234210014, + "learning_rate": 0.0008030066567497401, + "loss": 0.0665, + "num_input_tokens_seen": 96179712, + "step": 44565 + }, + { + "epoch": 7.270799347471452, + "grad_norm": 0.10307849198579788, + "learning_rate": 0.0008029500333691656, + "loss": 0.082, + "num_input_tokens_seen": 96190656, + "step": 44570 + }, + { + "epoch": 7.271615008156607, + "grad_norm": 0.11772072315216064, + "learning_rate": 0.0008028934038489243, + "loss": 0.0436, + "num_input_tokens_seen": 96201120, + "step": 44575 + }, + { + "epoch": 7.2724306688417615, + "grad_norm": 0.021636299788951874, + "learning_rate": 0.000802836768190164, + "loss": 0.0135, + "num_input_tokens_seen": 96210304, + "step": 44580 + }, + { + "epoch": 7.273246329526917, + "grad_norm": 0.015999101102352142, + "learning_rate": 0.0008027801263940322, + "loss": 0.0861, + "num_input_tokens_seen": 96221472, + "step": 44585 + }, + { + "epoch": 7.274061990212072, + "grad_norm": 0.0023375414311885834, + "learning_rate": 0.0008027234784616773, + "loss": 0.0323, + "num_input_tokens_seen": 96232736, + "step": 44590 + }, + { + "epoch": 7.274877650897227, + "grad_norm": 0.2989862859249115, + "learning_rate": 0.0008026668243942469, + "loss": 0.1643, + "num_input_tokens_seen": 96244352, + "step": 44595 + }, + { + "epoch": 7.275693311582382, + "grad_norm": 0.0029028900898993015, + "learning_rate": 0.0008026101641928895, + "loss": 0.032, + "num_input_tokens_seen": 96254240, + "step": 44600 + }, + { + "epoch": 7.2765089722675365, + "grad_norm": 0.018947646021842957, + "learning_rate": 0.000802553497858753, + "loss": 0.0118, + "num_input_tokens_seen": 96265696, + "step": 44605 + }, + { + "epoch": 7.277324632952691, + "grad_norm": 0.014446857385337353, + "learning_rate": 0.0008024968253929861, + "loss": 0.0496, + "num_input_tokens_seen": 96277664, + "step": 44610 + }, + { + "epoch": 7.278140293637847, + "grad_norm": 0.07986364513635635, + "learning_rate": 0.0008024401467967375, + "loss": 0.0703, + "num_input_tokens_seen": 96288448, + "step": 44615 + }, + { + "epoch": 7.278955954323002, + "grad_norm": 0.05514919012784958, + "learning_rate": 0.0008023834620711555, + "loss": 0.0225, + "num_input_tokens_seen": 96298432, + "step": 44620 + }, + { + "epoch": 7.279771615008157, + "grad_norm": 0.00040360583807341754, + "learning_rate": 0.000802326771217389, + "loss": 0.0126, + "num_input_tokens_seen": 96309568, + "step": 44625 + }, + { + "epoch": 7.280587275693311, + "grad_norm": 0.04625226557254791, + "learning_rate": 0.0008022700742365871, + "loss": 0.0133, + "num_input_tokens_seen": 96319968, + "step": 44630 + }, + { + "epoch": 7.281402936378466, + "grad_norm": 0.00672645028680563, + "learning_rate": 0.0008022133711298987, + "loss": 0.065, + "num_input_tokens_seen": 96331232, + "step": 44635 + }, + { + "epoch": 7.282218597063622, + "grad_norm": 0.07001020014286041, + "learning_rate": 0.0008021566618984728, + "loss": 0.0834, + "num_input_tokens_seen": 96341696, + "step": 44640 + }, + { + "epoch": 7.283034257748777, + "grad_norm": 0.01747494377195835, + "learning_rate": 0.0008020999465434589, + "loss": 0.0821, + "num_input_tokens_seen": 96352160, + "step": 44645 + }, + { + "epoch": 7.283849918433932, + "grad_norm": 0.08647828549146652, + "learning_rate": 0.0008020432250660063, + "loss": 0.0837, + "num_input_tokens_seen": 96362592, + "step": 44650 + }, + { + "epoch": 7.284665579119086, + "grad_norm": 0.23880939185619354, + "learning_rate": 0.0008019864974672646, + "loss": 0.0545, + "num_input_tokens_seen": 96375424, + "step": 44655 + }, + { + "epoch": 7.285481239804241, + "grad_norm": 0.1673363894224167, + "learning_rate": 0.0008019297637483836, + "loss": 0.1628, + "num_input_tokens_seen": 96387296, + "step": 44660 + }, + { + "epoch": 7.286296900489396, + "grad_norm": 0.17855383455753326, + "learning_rate": 0.0008018730239105127, + "loss": 0.136, + "num_input_tokens_seen": 96397632, + "step": 44665 + }, + { + "epoch": 7.287112561174552, + "grad_norm": 0.003699185326695442, + "learning_rate": 0.000801816277954802, + "loss": 0.0103, + "num_input_tokens_seen": 96408096, + "step": 44670 + }, + { + "epoch": 7.287928221859707, + "grad_norm": 0.002879117848351598, + "learning_rate": 0.0008017595258824016, + "loss": 0.0071, + "num_input_tokens_seen": 96417408, + "step": 44675 + }, + { + "epoch": 7.288743882544861, + "grad_norm": 0.11479058116674423, + "learning_rate": 0.0008017027676944617, + "loss": 0.0919, + "num_input_tokens_seen": 96427328, + "step": 44680 + }, + { + "epoch": 7.289559543230016, + "grad_norm": 0.005524273030459881, + "learning_rate": 0.0008016460033921323, + "loss": 0.0095, + "num_input_tokens_seen": 96437920, + "step": 44685 + }, + { + "epoch": 7.290375203915171, + "grad_norm": 0.0830729529261589, + "learning_rate": 0.0008015892329765642, + "loss": 0.0552, + "num_input_tokens_seen": 96447872, + "step": 44690 + }, + { + "epoch": 7.291190864600326, + "grad_norm": 0.012838074006140232, + "learning_rate": 0.0008015324564489075, + "loss": 0.0078, + "num_input_tokens_seen": 96458592, + "step": 44695 + }, + { + "epoch": 7.2920065252854815, + "grad_norm": 0.006786803249269724, + "learning_rate": 0.0008014756738103132, + "loss": 0.1395, + "num_input_tokens_seen": 96469312, + "step": 44700 + }, + { + "epoch": 7.292822185970636, + "grad_norm": 0.012400000356137753, + "learning_rate": 0.0008014188850619318, + "loss": 0.0166, + "num_input_tokens_seen": 96478976, + "step": 44705 + }, + { + "epoch": 7.293637846655791, + "grad_norm": 0.011501033790409565, + "learning_rate": 0.0008013620902049143, + "loss": 0.057, + "num_input_tokens_seen": 96489952, + "step": 44710 + }, + { + "epoch": 7.294453507340946, + "grad_norm": 0.002165677258744836, + "learning_rate": 0.0008013052892404118, + "loss": 0.151, + "num_input_tokens_seen": 96500672, + "step": 44715 + }, + { + "epoch": 7.295269168026101, + "grad_norm": 0.2537006139755249, + "learning_rate": 0.0008012484821695754, + "loss": 0.14, + "num_input_tokens_seen": 96511488, + "step": 44720 + }, + { + "epoch": 7.2960848287112565, + "grad_norm": 0.0373079888522625, + "learning_rate": 0.0008011916689935563, + "loss": 0.0675, + "num_input_tokens_seen": 96522336, + "step": 44725 + }, + { + "epoch": 7.296900489396411, + "grad_norm": 0.010195508599281311, + "learning_rate": 0.000801134849713506, + "loss": 0.1843, + "num_input_tokens_seen": 96533088, + "step": 44730 + }, + { + "epoch": 7.297716150081566, + "grad_norm": 0.025146406143903732, + "learning_rate": 0.0008010780243305758, + "loss": 0.0129, + "num_input_tokens_seen": 96543552, + "step": 44735 + }, + { + "epoch": 7.298531810766721, + "grad_norm": 0.004011362791061401, + "learning_rate": 0.0008010211928459177, + "loss": 0.0889, + "num_input_tokens_seen": 96554432, + "step": 44740 + }, + { + "epoch": 7.299347471451876, + "grad_norm": 0.01152226235717535, + "learning_rate": 0.0008009643552606831, + "loss": 0.0194, + "num_input_tokens_seen": 96565536, + "step": 44745 + }, + { + "epoch": 7.300163132137031, + "grad_norm": 0.00699689332395792, + "learning_rate": 0.0008009075115760243, + "loss": 0.0161, + "num_input_tokens_seen": 96576576, + "step": 44750 + }, + { + "epoch": 7.300978792822186, + "grad_norm": 0.15883934497833252, + "learning_rate": 0.0008008506617930926, + "loss": 0.0351, + "num_input_tokens_seen": 96588640, + "step": 44755 + }, + { + "epoch": 7.301794453507341, + "grad_norm": 0.004342640750110149, + "learning_rate": 0.000800793805913041, + "loss": 0.0297, + "num_input_tokens_seen": 96600160, + "step": 44760 + }, + { + "epoch": 7.302610114192496, + "grad_norm": 0.010322721675038338, + "learning_rate": 0.0008007369439370211, + "loss": 0.1493, + "num_input_tokens_seen": 96610944, + "step": 44765 + }, + { + "epoch": 7.303425774877651, + "grad_norm": 0.131282240152359, + "learning_rate": 0.0008006800758661856, + "loss": 0.1049, + "num_input_tokens_seen": 96620960, + "step": 44770 + }, + { + "epoch": 7.304241435562806, + "grad_norm": 0.007805538829416037, + "learning_rate": 0.000800623201701687, + "loss": 0.116, + "num_input_tokens_seen": 96630592, + "step": 44775 + }, + { + "epoch": 7.30505709624796, + "grad_norm": 0.008898080326616764, + "learning_rate": 0.0008005663214446777, + "loss": 0.1102, + "num_input_tokens_seen": 96642368, + "step": 44780 + }, + { + "epoch": 7.305872756933116, + "grad_norm": 0.09902067482471466, + "learning_rate": 0.0008005094350963107, + "loss": 0.0284, + "num_input_tokens_seen": 96654464, + "step": 44785 + }, + { + "epoch": 7.306688417618271, + "grad_norm": 0.17569200694561005, + "learning_rate": 0.0008004525426577387, + "loss": 0.1584, + "num_input_tokens_seen": 96664992, + "step": 44790 + }, + { + "epoch": 7.307504078303426, + "grad_norm": 0.02207139879465103, + "learning_rate": 0.0008003956441301149, + "loss": 0.0245, + "num_input_tokens_seen": 96675168, + "step": 44795 + }, + { + "epoch": 7.308319738988581, + "grad_norm": 0.15170609951019287, + "learning_rate": 0.0008003387395145922, + "loss": 0.0552, + "num_input_tokens_seen": 96684992, + "step": 44800 + }, + { + "epoch": 7.309135399673735, + "grad_norm": 0.10844897478818893, + "learning_rate": 0.0008002818288123239, + "loss": 0.0258, + "num_input_tokens_seen": 96695808, + "step": 44805 + }, + { + "epoch": 7.309951060358891, + "grad_norm": 0.21508342027664185, + "learning_rate": 0.0008002249120244635, + "loss": 0.1812, + "num_input_tokens_seen": 96707840, + "step": 44810 + }, + { + "epoch": 7.310766721044046, + "grad_norm": 0.20279374718666077, + "learning_rate": 0.0008001679891521642, + "loss": 0.2129, + "num_input_tokens_seen": 96718784, + "step": 44815 + }, + { + "epoch": 7.311582381729201, + "grad_norm": 0.014630110003054142, + "learning_rate": 0.00080011106019658, + "loss": 0.0209, + "num_input_tokens_seen": 96729376, + "step": 44820 + }, + { + "epoch": 7.3123980424143555, + "grad_norm": 0.04041663557291031, + "learning_rate": 0.0008000541251588644, + "loss": 0.028, + "num_input_tokens_seen": 96740640, + "step": 44825 + }, + { + "epoch": 7.31321370309951, + "grad_norm": 0.01726936362683773, + "learning_rate": 0.0007999971840401714, + "loss": 0.0403, + "num_input_tokens_seen": 96751104, + "step": 44830 + }, + { + "epoch": 7.314029363784665, + "grad_norm": 0.027054470032453537, + "learning_rate": 0.0007999402368416548, + "loss": 0.1324, + "num_input_tokens_seen": 96760672, + "step": 44835 + }, + { + "epoch": 7.314845024469821, + "grad_norm": 0.09804193675518036, + "learning_rate": 0.0007998832835644687, + "loss": 0.0546, + "num_input_tokens_seen": 96771840, + "step": 44840 + }, + { + "epoch": 7.315660685154976, + "grad_norm": 0.007839972153306007, + "learning_rate": 0.0007998263242097675, + "loss": 0.1204, + "num_input_tokens_seen": 96782592, + "step": 44845 + }, + { + "epoch": 7.3164763458401305, + "grad_norm": 0.0018097873544320464, + "learning_rate": 0.0007997693587787056, + "loss": 0.1061, + "num_input_tokens_seen": 96794016, + "step": 44850 + }, + { + "epoch": 7.317292006525285, + "grad_norm": 0.011595363728702068, + "learning_rate": 0.0007997123872724373, + "loss": 0.1541, + "num_input_tokens_seen": 96803968, + "step": 44855 + }, + { + "epoch": 7.31810766721044, + "grad_norm": 0.09957374632358551, + "learning_rate": 0.0007996554096921172, + "loss": 0.1027, + "num_input_tokens_seen": 96814304, + "step": 44860 + }, + { + "epoch": 7.318923327895595, + "grad_norm": 0.006796934641897678, + "learning_rate": 0.0007995984260389001, + "loss": 0.0505, + "num_input_tokens_seen": 96824128, + "step": 44865 + }, + { + "epoch": 7.319738988580751, + "grad_norm": 0.047414228320121765, + "learning_rate": 0.0007995414363139408, + "loss": 0.1051, + "num_input_tokens_seen": 96835712, + "step": 44870 + }, + { + "epoch": 7.3205546492659055, + "grad_norm": 0.1455640196800232, + "learning_rate": 0.0007994844405183944, + "loss": 0.0366, + "num_input_tokens_seen": 96846272, + "step": 44875 + }, + { + "epoch": 7.32137030995106, + "grad_norm": 0.007186947390437126, + "learning_rate": 0.0007994274386534158, + "loss": 0.0548, + "num_input_tokens_seen": 96857408, + "step": 44880 + }, + { + "epoch": 7.322185970636215, + "grad_norm": 0.023099562153220177, + "learning_rate": 0.0007993704307201604, + "loss": 0.1111, + "num_input_tokens_seen": 96868032, + "step": 44885 + }, + { + "epoch": 7.32300163132137, + "grad_norm": 0.009358012117445469, + "learning_rate": 0.0007993134167197833, + "loss": 0.0485, + "num_input_tokens_seen": 96879136, + "step": 44890 + }, + { + "epoch": 7.323817292006526, + "grad_norm": 0.0956074595451355, + "learning_rate": 0.0007992563966534403, + "loss": 0.0503, + "num_input_tokens_seen": 96890016, + "step": 44895 + }, + { + "epoch": 7.3246329526916805, + "grad_norm": 0.03040735051035881, + "learning_rate": 0.0007991993705222867, + "loss": 0.0431, + "num_input_tokens_seen": 96900160, + "step": 44900 + }, + { + "epoch": 7.325448613376835, + "grad_norm": 0.16506527364253998, + "learning_rate": 0.0007991423383274782, + "loss": 0.1045, + "num_input_tokens_seen": 96909856, + "step": 44905 + }, + { + "epoch": 7.32626427406199, + "grad_norm": 0.18014702200889587, + "learning_rate": 0.0007990853000701708, + "loss": 0.0544, + "num_input_tokens_seen": 96921280, + "step": 44910 + }, + { + "epoch": 7.327079934747145, + "grad_norm": 0.04607102647423744, + "learning_rate": 0.0007990282557515204, + "loss": 0.0628, + "num_input_tokens_seen": 96931136, + "step": 44915 + }, + { + "epoch": 7.327895595432301, + "grad_norm": 0.0045592570677399635, + "learning_rate": 0.0007989712053726829, + "loss": 0.0123, + "num_input_tokens_seen": 96942112, + "step": 44920 + }, + { + "epoch": 7.328711256117455, + "grad_norm": 0.021046232432127, + "learning_rate": 0.0007989141489348149, + "loss": 0.0298, + "num_input_tokens_seen": 96953664, + "step": 44925 + }, + { + "epoch": 7.32952691680261, + "grad_norm": 0.005334815010428429, + "learning_rate": 0.0007988570864390723, + "loss": 0.0543, + "num_input_tokens_seen": 96964800, + "step": 44930 + }, + { + "epoch": 7.330342577487765, + "grad_norm": 0.0940176248550415, + "learning_rate": 0.0007988000178866117, + "loss": 0.1046, + "num_input_tokens_seen": 96975680, + "step": 44935 + }, + { + "epoch": 7.33115823817292, + "grad_norm": 0.013159208931028843, + "learning_rate": 0.0007987429432785897, + "loss": 0.0257, + "num_input_tokens_seen": 96986496, + "step": 44940 + }, + { + "epoch": 7.331973898858075, + "grad_norm": 0.28472015261650085, + "learning_rate": 0.000798685862616163, + "loss": 0.0458, + "num_input_tokens_seen": 96997184, + "step": 44945 + }, + { + "epoch": 7.33278955954323, + "grad_norm": 0.009768480435013771, + "learning_rate": 0.0007986287759004884, + "loss": 0.0091, + "num_input_tokens_seen": 97008416, + "step": 44950 + }, + { + "epoch": 7.333605220228385, + "grad_norm": 0.02832314930856228, + "learning_rate": 0.000798571683132723, + "loss": 0.0358, + "num_input_tokens_seen": 97020000, + "step": 44955 + }, + { + "epoch": 7.33442088091354, + "grad_norm": 0.008958880789577961, + "learning_rate": 0.0007985145843140233, + "loss": 0.0583, + "num_input_tokens_seen": 97030752, + "step": 44960 + }, + { + "epoch": 7.335236541598695, + "grad_norm": 0.021089205518364906, + "learning_rate": 0.0007984574794455472, + "loss": 0.2063, + "num_input_tokens_seen": 97040416, + "step": 44965 + }, + { + "epoch": 7.33605220228385, + "grad_norm": 0.046808261424303055, + "learning_rate": 0.0007984003685284516, + "loss": 0.2151, + "num_input_tokens_seen": 97051936, + "step": 44970 + }, + { + "epoch": 7.3368678629690045, + "grad_norm": 0.05121166259050369, + "learning_rate": 0.0007983432515638937, + "loss": 0.0501, + "num_input_tokens_seen": 97060864, + "step": 44975 + }, + { + "epoch": 7.33768352365416, + "grad_norm": 0.16538378596305847, + "learning_rate": 0.0007982861285530317, + "loss": 0.1071, + "num_input_tokens_seen": 97073376, + "step": 44980 + }, + { + "epoch": 7.338499184339315, + "grad_norm": 0.398477166891098, + "learning_rate": 0.0007982289994970227, + "loss": 0.0616, + "num_input_tokens_seen": 97085696, + "step": 44985 + }, + { + "epoch": 7.33931484502447, + "grad_norm": 0.004775048233568668, + "learning_rate": 0.0007981718643970246, + "loss": 0.0246, + "num_input_tokens_seen": 97097856, + "step": 44990 + }, + { + "epoch": 7.340130505709625, + "grad_norm": 0.018571704626083374, + "learning_rate": 0.0007981147232541956, + "loss": 0.0522, + "num_input_tokens_seen": 97109280, + "step": 44995 + }, + { + "epoch": 7.3409461663947795, + "grad_norm": 0.009491035714745522, + "learning_rate": 0.0007980575760696935, + "loss": 0.1543, + "num_input_tokens_seen": 97121728, + "step": 45000 + }, + { + "epoch": 7.341761827079935, + "grad_norm": 0.015966255217790604, + "learning_rate": 0.0007980004228446765, + "loss": 0.0248, + "num_input_tokens_seen": 97132480, + "step": 45005 + }, + { + "epoch": 7.34257748776509, + "grad_norm": 0.016801917925477028, + "learning_rate": 0.0007979432635803029, + "loss": 0.0543, + "num_input_tokens_seen": 97143904, + "step": 45010 + }, + { + "epoch": 7.343393148450245, + "grad_norm": 0.023630334064364433, + "learning_rate": 0.000797886098277731, + "loss": 0.0235, + "num_input_tokens_seen": 97154816, + "step": 45015 + }, + { + "epoch": 7.3442088091354, + "grad_norm": 0.24428388476371765, + "learning_rate": 0.0007978289269381196, + "loss": 0.0362, + "num_input_tokens_seen": 97165664, + "step": 45020 + }, + { + "epoch": 7.3450244698205545, + "grad_norm": 0.009656902402639389, + "learning_rate": 0.0007977717495626271, + "loss": 0.0348, + "num_input_tokens_seen": 97174976, + "step": 45025 + }, + { + "epoch": 7.345840130505709, + "grad_norm": 0.015367275103926659, + "learning_rate": 0.0007977145661524123, + "loss": 0.0199, + "num_input_tokens_seen": 97186720, + "step": 45030 + }, + { + "epoch": 7.346655791190865, + "grad_norm": 0.05532888323068619, + "learning_rate": 0.000797657376708634, + "loss": 0.0288, + "num_input_tokens_seen": 97196608, + "step": 45035 + }, + { + "epoch": 7.34747145187602, + "grad_norm": 0.14162828028202057, + "learning_rate": 0.0007976001812324516, + "loss": 0.0418, + "num_input_tokens_seen": 97208352, + "step": 45040 + }, + { + "epoch": 7.348287112561175, + "grad_norm": 0.0438138023018837, + "learning_rate": 0.0007975429797250239, + "loss": 0.0248, + "num_input_tokens_seen": 97218048, + "step": 45045 + }, + { + "epoch": 7.349102773246329, + "grad_norm": 0.0062089841812849045, + "learning_rate": 0.0007974857721875102, + "loss": 0.098, + "num_input_tokens_seen": 97229920, + "step": 45050 + }, + { + "epoch": 7.349918433931484, + "grad_norm": 0.006517962086945772, + "learning_rate": 0.0007974285586210701, + "loss": 0.0179, + "num_input_tokens_seen": 97242816, + "step": 45055 + }, + { + "epoch": 7.350734094616639, + "grad_norm": 0.05108761414885521, + "learning_rate": 0.0007973713390268629, + "loss": 0.0349, + "num_input_tokens_seen": 97254656, + "step": 45060 + }, + { + "epoch": 7.351549755301795, + "grad_norm": 0.21053609251976013, + "learning_rate": 0.0007973141134060483, + "loss": 0.1371, + "num_input_tokens_seen": 97266432, + "step": 45065 + }, + { + "epoch": 7.35236541598695, + "grad_norm": 0.030001185834407806, + "learning_rate": 0.0007972568817597857, + "loss": 0.0098, + "num_input_tokens_seen": 97278080, + "step": 45070 + }, + { + "epoch": 7.353181076672104, + "grad_norm": 0.1827940195798874, + "learning_rate": 0.0007971996440892356, + "loss": 0.1231, + "num_input_tokens_seen": 97288672, + "step": 45075 + }, + { + "epoch": 7.353996737357259, + "grad_norm": 0.002455601468682289, + "learning_rate": 0.0007971424003955577, + "loss": 0.0418, + "num_input_tokens_seen": 97300736, + "step": 45080 + }, + { + "epoch": 7.354812398042414, + "grad_norm": 0.006635445635765791, + "learning_rate": 0.0007970851506799119, + "loss": 0.0315, + "num_input_tokens_seen": 97311808, + "step": 45085 + }, + { + "epoch": 7.35562805872757, + "grad_norm": 0.18565182387828827, + "learning_rate": 0.0007970278949434588, + "loss": 0.0809, + "num_input_tokens_seen": 97321120, + "step": 45090 + }, + { + "epoch": 7.356443719412725, + "grad_norm": 0.08726845681667328, + "learning_rate": 0.0007969706331873586, + "loss": 0.1152, + "num_input_tokens_seen": 97331712, + "step": 45095 + }, + { + "epoch": 7.357259380097879, + "grad_norm": 0.007611857261508703, + "learning_rate": 0.0007969133654127718, + "loss": 0.0094, + "num_input_tokens_seen": 97342720, + "step": 45100 + }, + { + "epoch": 7.358075040783034, + "grad_norm": 0.015608113259077072, + "learning_rate": 0.0007968560916208589, + "loss": 0.0619, + "num_input_tokens_seen": 97353920, + "step": 45105 + }, + { + "epoch": 7.358890701468189, + "grad_norm": 0.206514373421669, + "learning_rate": 0.0007967988118127808, + "loss": 0.0788, + "num_input_tokens_seen": 97364320, + "step": 45110 + }, + { + "epoch": 7.359706362153344, + "grad_norm": 0.08558665215969086, + "learning_rate": 0.0007967415259896982, + "loss": 0.06, + "num_input_tokens_seen": 97374656, + "step": 45115 + }, + { + "epoch": 7.3605220228384995, + "grad_norm": 0.013548692688345909, + "learning_rate": 0.0007966842341527722, + "loss": 0.0181, + "num_input_tokens_seen": 97386272, + "step": 45120 + }, + { + "epoch": 7.361337683523654, + "grad_norm": 0.24696654081344604, + "learning_rate": 0.0007966269363031637, + "loss": 0.0499, + "num_input_tokens_seen": 97396992, + "step": 45125 + }, + { + "epoch": 7.362153344208809, + "grad_norm": 0.028792886063456535, + "learning_rate": 0.0007965696324420342, + "loss": 0.0853, + "num_input_tokens_seen": 97407296, + "step": 45130 + }, + { + "epoch": 7.362969004893964, + "grad_norm": 0.003730390453711152, + "learning_rate": 0.0007965123225705447, + "loss": 0.1978, + "num_input_tokens_seen": 97417824, + "step": 45135 + }, + { + "epoch": 7.363784665579119, + "grad_norm": 0.0015873888041824102, + "learning_rate": 0.000796455006689857, + "loss": 0.0067, + "num_input_tokens_seen": 97428096, + "step": 45140 + }, + { + "epoch": 7.364600326264274, + "grad_norm": 0.0625576376914978, + "learning_rate": 0.0007963976848011324, + "loss": 0.0164, + "num_input_tokens_seen": 97439680, + "step": 45145 + }, + { + "epoch": 7.365415986949429, + "grad_norm": 0.05944611877202988, + "learning_rate": 0.0007963403569055328, + "loss": 0.0966, + "num_input_tokens_seen": 97450880, + "step": 45150 + }, + { + "epoch": 7.366231647634584, + "grad_norm": 0.40613335371017456, + "learning_rate": 0.0007962830230042197, + "loss": 0.3875, + "num_input_tokens_seen": 97461568, + "step": 45155 + }, + { + "epoch": 7.367047308319739, + "grad_norm": 0.004573889076709747, + "learning_rate": 0.0007962256830983556, + "loss": 0.2342, + "num_input_tokens_seen": 97472416, + "step": 45160 + }, + { + "epoch": 7.367862969004894, + "grad_norm": 0.029481355100870132, + "learning_rate": 0.0007961683371891019, + "loss": 0.0777, + "num_input_tokens_seen": 97483744, + "step": 45165 + }, + { + "epoch": 7.368678629690049, + "grad_norm": 0.03885212540626526, + "learning_rate": 0.0007961109852776214, + "loss": 0.035, + "num_input_tokens_seen": 97494752, + "step": 45170 + }, + { + "epoch": 7.369494290375204, + "grad_norm": 0.007029502186924219, + "learning_rate": 0.0007960536273650761, + "loss": 0.015, + "num_input_tokens_seen": 97505056, + "step": 45175 + }, + { + "epoch": 7.370309951060359, + "grad_norm": 0.03160068392753601, + "learning_rate": 0.0007959962634526285, + "loss": 0.0798, + "num_input_tokens_seen": 97516352, + "step": 45180 + }, + { + "epoch": 7.371125611745514, + "grad_norm": 0.0029659410938620567, + "learning_rate": 0.0007959388935414411, + "loss": 0.131, + "num_input_tokens_seen": 97526784, + "step": 45185 + }, + { + "epoch": 7.371941272430669, + "grad_norm": 0.07732724398374557, + "learning_rate": 0.0007958815176326764, + "loss": 0.0453, + "num_input_tokens_seen": 97538368, + "step": 45190 + }, + { + "epoch": 7.372756933115824, + "grad_norm": 0.37898892164230347, + "learning_rate": 0.0007958241357274976, + "loss": 0.0772, + "num_input_tokens_seen": 97549440, + "step": 45195 + }, + { + "epoch": 7.373572593800978, + "grad_norm": 0.014226013794541359, + "learning_rate": 0.0007957667478270674, + "loss": 0.0587, + "num_input_tokens_seen": 97559264, + "step": 45200 + }, + { + "epoch": 7.374388254486134, + "grad_norm": 0.1606178879737854, + "learning_rate": 0.0007957093539325489, + "loss": 0.053, + "num_input_tokens_seen": 97569312, + "step": 45205 + }, + { + "epoch": 7.375203915171289, + "grad_norm": 0.042820170521736145, + "learning_rate": 0.000795651954045105, + "loss": 0.0447, + "num_input_tokens_seen": 97579712, + "step": 45210 + }, + { + "epoch": 7.376019575856444, + "grad_norm": 0.017695242539048195, + "learning_rate": 0.0007955945481658992, + "loss": 0.0341, + "num_input_tokens_seen": 97590912, + "step": 45215 + }, + { + "epoch": 7.376835236541599, + "grad_norm": 0.01615849696099758, + "learning_rate": 0.0007955371362960951, + "loss": 0.0666, + "num_input_tokens_seen": 97602176, + "step": 45220 + }, + { + "epoch": 7.377650897226753, + "grad_norm": 0.03625781461596489, + "learning_rate": 0.000795479718436856, + "loss": 0.1258, + "num_input_tokens_seen": 97612128, + "step": 45225 + }, + { + "epoch": 7.378466557911908, + "grad_norm": 0.12820467352867126, + "learning_rate": 0.0007954222945893455, + "loss": 0.0711, + "num_input_tokens_seen": 97622528, + "step": 45230 + }, + { + "epoch": 7.379282218597064, + "grad_norm": 0.006325066089630127, + "learning_rate": 0.0007953648647547274, + "loss": 0.034, + "num_input_tokens_seen": 97634304, + "step": 45235 + }, + { + "epoch": 7.380097879282219, + "grad_norm": 0.07013335824012756, + "learning_rate": 0.0007953074289341655, + "loss": 0.1229, + "num_input_tokens_seen": 97644864, + "step": 45240 + }, + { + "epoch": 7.3809135399673735, + "grad_norm": 0.01700296252965927, + "learning_rate": 0.0007952499871288241, + "loss": 0.0609, + "num_input_tokens_seen": 97656768, + "step": 45245 + }, + { + "epoch": 7.381729200652528, + "grad_norm": 0.11692510545253754, + "learning_rate": 0.0007951925393398672, + "loss": 0.154, + "num_input_tokens_seen": 97668416, + "step": 45250 + }, + { + "epoch": 7.382544861337683, + "grad_norm": 0.1630610227584839, + "learning_rate": 0.0007951350855684588, + "loss": 0.0311, + "num_input_tokens_seen": 97679392, + "step": 45255 + }, + { + "epoch": 7.383360522022839, + "grad_norm": 0.1557503342628479, + "learning_rate": 0.0007950776258157637, + "loss": 0.1459, + "num_input_tokens_seen": 97689312, + "step": 45260 + }, + { + "epoch": 7.384176182707994, + "grad_norm": 0.07793676853179932, + "learning_rate": 0.000795020160082946, + "loss": 0.0266, + "num_input_tokens_seen": 97700448, + "step": 45265 + }, + { + "epoch": 7.3849918433931485, + "grad_norm": 0.15000972151756287, + "learning_rate": 0.0007949626883711707, + "loss": 0.0383, + "num_input_tokens_seen": 97710528, + "step": 45270 + }, + { + "epoch": 7.385807504078303, + "grad_norm": 0.017676938325166702, + "learning_rate": 0.0007949052106816022, + "loss": 0.0336, + "num_input_tokens_seen": 97718720, + "step": 45275 + }, + { + "epoch": 7.386623164763458, + "grad_norm": 0.005435280501842499, + "learning_rate": 0.0007948477270154056, + "loss": 0.0503, + "num_input_tokens_seen": 97727744, + "step": 45280 + }, + { + "epoch": 7.387438825448613, + "grad_norm": 0.00518799526616931, + "learning_rate": 0.0007947902373737456, + "loss": 0.009, + "num_input_tokens_seen": 97738112, + "step": 45285 + }, + { + "epoch": 7.388254486133769, + "grad_norm": 0.003281576791778207, + "learning_rate": 0.0007947327417577875, + "loss": 0.0838, + "num_input_tokens_seen": 97748320, + "step": 45290 + }, + { + "epoch": 7.3890701468189235, + "grad_norm": 0.018928417935967445, + "learning_rate": 0.0007946752401686966, + "loss": 0.0505, + "num_input_tokens_seen": 97760480, + "step": 45295 + }, + { + "epoch": 7.389885807504078, + "grad_norm": 0.22991769015789032, + "learning_rate": 0.000794617732607638, + "loss": 0.1832, + "num_input_tokens_seen": 97770912, + "step": 45300 + }, + { + "epoch": 7.390701468189233, + "grad_norm": 0.2968190908432007, + "learning_rate": 0.0007945602190757775, + "loss": 0.1263, + "num_input_tokens_seen": 97781984, + "step": 45305 + }, + { + "epoch": 7.391517128874388, + "grad_norm": 0.27317890524864197, + "learning_rate": 0.0007945026995742803, + "loss": 0.1594, + "num_input_tokens_seen": 97793152, + "step": 45310 + }, + { + "epoch": 7.392332789559543, + "grad_norm": 0.010756957344710827, + "learning_rate": 0.0007944451741043124, + "loss": 0.0462, + "num_input_tokens_seen": 97804896, + "step": 45315 + }, + { + "epoch": 7.3931484502446985, + "grad_norm": 0.34141770005226135, + "learning_rate": 0.0007943876426670395, + "loss": 0.0904, + "num_input_tokens_seen": 97815008, + "step": 45320 + }, + { + "epoch": 7.393964110929853, + "grad_norm": 0.10904575139284134, + "learning_rate": 0.0007943301052636276, + "loss": 0.0323, + "num_input_tokens_seen": 97826400, + "step": 45325 + }, + { + "epoch": 7.394779771615008, + "grad_norm": 0.04549914225935936, + "learning_rate": 0.0007942725618952426, + "loss": 0.1009, + "num_input_tokens_seen": 97837024, + "step": 45330 + }, + { + "epoch": 7.395595432300163, + "grad_norm": 0.02724381536245346, + "learning_rate": 0.000794215012563051, + "loss": 0.0677, + "num_input_tokens_seen": 97847616, + "step": 45335 + }, + { + "epoch": 7.396411092985318, + "grad_norm": 0.005902724806219339, + "learning_rate": 0.0007941574572682187, + "loss": 0.0484, + "num_input_tokens_seen": 97858464, + "step": 45340 + }, + { + "epoch": 7.397226753670473, + "grad_norm": 0.2103486955165863, + "learning_rate": 0.0007940998960119126, + "loss": 0.0662, + "num_input_tokens_seen": 97868224, + "step": 45345 + }, + { + "epoch": 7.398042414355628, + "grad_norm": 0.14964665472507477, + "learning_rate": 0.0007940423287952989, + "loss": 0.0786, + "num_input_tokens_seen": 97879520, + "step": 45350 + }, + { + "epoch": 7.398858075040783, + "grad_norm": 0.3316873610019684, + "learning_rate": 0.0007939847556195443, + "loss": 0.1179, + "num_input_tokens_seen": 97889568, + "step": 45355 + }, + { + "epoch": 7.399673735725938, + "grad_norm": 0.11790391057729721, + "learning_rate": 0.0007939271764858158, + "loss": 0.1297, + "num_input_tokens_seen": 97899840, + "step": 45360 + }, + { + "epoch": 7.400489396411093, + "grad_norm": 0.196848064661026, + "learning_rate": 0.0007938695913952802, + "loss": 0.1106, + "num_input_tokens_seen": 97910112, + "step": 45365 + }, + { + "epoch": 7.401305057096248, + "grad_norm": 0.03056715801358223, + "learning_rate": 0.0007938120003491045, + "loss": 0.0291, + "num_input_tokens_seen": 97921664, + "step": 45370 + }, + { + "epoch": 7.402120717781403, + "grad_norm": 0.19020619988441467, + "learning_rate": 0.0007937544033484558, + "loss": 0.111, + "num_input_tokens_seen": 97934112, + "step": 45375 + }, + { + "epoch": 7.402936378466558, + "grad_norm": 0.102375328540802, + "learning_rate": 0.0007936968003945015, + "loss": 0.0707, + "num_input_tokens_seen": 97945760, + "step": 45380 + }, + { + "epoch": 7.403752039151713, + "grad_norm": 0.011488073505461216, + "learning_rate": 0.0007936391914884092, + "loss": 0.3252, + "num_input_tokens_seen": 97956320, + "step": 45385 + }, + { + "epoch": 7.404567699836868, + "grad_norm": 0.017468484118580818, + "learning_rate": 0.0007935815766313459, + "loss": 0.1704, + "num_input_tokens_seen": 97966368, + "step": 45390 + }, + { + "epoch": 7.4053833605220225, + "grad_norm": 0.03229326754808426, + "learning_rate": 0.0007935239558244795, + "loss": 0.0486, + "num_input_tokens_seen": 97976736, + "step": 45395 + }, + { + "epoch": 7.406199021207178, + "grad_norm": 0.09293685108423233, + "learning_rate": 0.000793466329068978, + "loss": 0.0346, + "num_input_tokens_seen": 97987616, + "step": 45400 + }, + { + "epoch": 7.407014681892333, + "grad_norm": 0.011953872628509998, + "learning_rate": 0.000793408696366009, + "loss": 0.0745, + "num_input_tokens_seen": 97999424, + "step": 45405 + }, + { + "epoch": 7.407830342577488, + "grad_norm": 0.08539988845586777, + "learning_rate": 0.0007933510577167404, + "loss": 0.1466, + "num_input_tokens_seen": 98010688, + "step": 45410 + }, + { + "epoch": 7.408646003262643, + "grad_norm": 0.11707698553800583, + "learning_rate": 0.0007932934131223406, + "loss": 0.0711, + "num_input_tokens_seen": 98022304, + "step": 45415 + }, + { + "epoch": 7.4094616639477975, + "grad_norm": 0.06049269437789917, + "learning_rate": 0.0007932357625839776, + "loss": 0.0689, + "num_input_tokens_seen": 98032448, + "step": 45420 + }, + { + "epoch": 7.410277324632952, + "grad_norm": 0.053298674523830414, + "learning_rate": 0.0007931781061028201, + "loss": 0.083, + "num_input_tokens_seen": 98043936, + "step": 45425 + }, + { + "epoch": 7.411092985318108, + "grad_norm": 0.12486092001199722, + "learning_rate": 0.0007931204436800361, + "loss": 0.1394, + "num_input_tokens_seen": 98054880, + "step": 45430 + }, + { + "epoch": 7.411908646003263, + "grad_norm": 0.04418341815471649, + "learning_rate": 0.0007930627753167945, + "loss": 0.0202, + "num_input_tokens_seen": 98065440, + "step": 45435 + }, + { + "epoch": 7.412724306688418, + "grad_norm": 0.0034797724802047014, + "learning_rate": 0.0007930051010142641, + "loss": 0.0119, + "num_input_tokens_seen": 98076256, + "step": 45440 + }, + { + "epoch": 7.4135399673735725, + "grad_norm": 0.24782520532608032, + "learning_rate": 0.0007929474207736136, + "loss": 0.0567, + "num_input_tokens_seen": 98087648, + "step": 45445 + }, + { + "epoch": 7.414355628058727, + "grad_norm": 0.012800190597772598, + "learning_rate": 0.000792889734596012, + "loss": 0.0171, + "num_input_tokens_seen": 98097952, + "step": 45450 + }, + { + "epoch": 7.415171288743883, + "grad_norm": 0.03905637934803963, + "learning_rate": 0.0007928320424826284, + "loss": 0.1051, + "num_input_tokens_seen": 98109440, + "step": 45455 + }, + { + "epoch": 7.415986949429038, + "grad_norm": 0.08256906270980835, + "learning_rate": 0.0007927743444346317, + "loss": 0.0266, + "num_input_tokens_seen": 98120160, + "step": 45460 + }, + { + "epoch": 7.416802610114193, + "grad_norm": 0.13369162380695343, + "learning_rate": 0.0007927166404531916, + "loss": 0.1026, + "num_input_tokens_seen": 98131488, + "step": 45465 + }, + { + "epoch": 7.417618270799347, + "grad_norm": 0.00662427581846714, + "learning_rate": 0.0007926589305394776, + "loss": 0.0151, + "num_input_tokens_seen": 98142528, + "step": 45470 + }, + { + "epoch": 7.418433931484502, + "grad_norm": 0.051119912415742874, + "learning_rate": 0.0007926012146946591, + "loss": 0.0692, + "num_input_tokens_seen": 98153472, + "step": 45475 + }, + { + "epoch": 7.419249592169657, + "grad_norm": 0.04073776304721832, + "learning_rate": 0.0007925434929199058, + "loss": 0.1137, + "num_input_tokens_seen": 98162784, + "step": 45480 + }, + { + "epoch": 7.420065252854813, + "grad_norm": 0.021243726834654808, + "learning_rate": 0.0007924857652163873, + "loss": 0.1876, + "num_input_tokens_seen": 98173376, + "step": 45485 + }, + { + "epoch": 7.420880913539968, + "grad_norm": 0.10374884307384491, + "learning_rate": 0.0007924280315852739, + "loss": 0.0183, + "num_input_tokens_seen": 98183872, + "step": 45490 + }, + { + "epoch": 7.421696574225122, + "grad_norm": 0.07742811739444733, + "learning_rate": 0.0007923702920277355, + "loss": 0.0457, + "num_input_tokens_seen": 98195200, + "step": 45495 + }, + { + "epoch": 7.422512234910277, + "grad_norm": 0.028647365048527718, + "learning_rate": 0.0007923125465449421, + "loss": 0.1043, + "num_input_tokens_seen": 98205344, + "step": 45500 + }, + { + "epoch": 7.423327895595432, + "grad_norm": 0.005188109818845987, + "learning_rate": 0.0007922547951380643, + "loss": 0.1016, + "num_input_tokens_seen": 98216896, + "step": 45505 + }, + { + "epoch": 7.424143556280587, + "grad_norm": 0.044221121817827225, + "learning_rate": 0.0007921970378082722, + "loss": 0.1025, + "num_input_tokens_seen": 98228896, + "step": 45510 + }, + { + "epoch": 7.424959216965743, + "grad_norm": 0.12920063734054565, + "learning_rate": 0.0007921392745567364, + "loss": 0.1118, + "num_input_tokens_seen": 98239488, + "step": 45515 + }, + { + "epoch": 7.425774877650897, + "grad_norm": 0.023605380207300186, + "learning_rate": 0.0007920815053846277, + "loss": 0.0706, + "num_input_tokens_seen": 98251232, + "step": 45520 + }, + { + "epoch": 7.426590538336052, + "grad_norm": 0.020623747259378433, + "learning_rate": 0.0007920237302931167, + "loss": 0.0318, + "num_input_tokens_seen": 98262432, + "step": 45525 + }, + { + "epoch": 7.427406199021207, + "grad_norm": 0.011629869230091572, + "learning_rate": 0.0007919659492833744, + "loss": 0.1307, + "num_input_tokens_seen": 98272320, + "step": 45530 + }, + { + "epoch": 7.428221859706362, + "grad_norm": 0.037228185683488846, + "learning_rate": 0.0007919081623565717, + "loss": 0.0624, + "num_input_tokens_seen": 98283904, + "step": 45535 + }, + { + "epoch": 7.4290375203915175, + "grad_norm": 0.17857886850833893, + "learning_rate": 0.0007918503695138799, + "loss": 0.1925, + "num_input_tokens_seen": 98295840, + "step": 45540 + }, + { + "epoch": 7.429853181076672, + "grad_norm": 0.005219275131821632, + "learning_rate": 0.0007917925707564699, + "loss": 0.0192, + "num_input_tokens_seen": 98307072, + "step": 45545 + }, + { + "epoch": 7.430668841761827, + "grad_norm": 0.16295675933361053, + "learning_rate": 0.0007917347660855134, + "loss": 0.0725, + "num_input_tokens_seen": 98318144, + "step": 45550 + }, + { + "epoch": 7.431484502446982, + "grad_norm": 0.08073597401380539, + "learning_rate": 0.0007916769555021819, + "loss": 0.0505, + "num_input_tokens_seen": 98329728, + "step": 45555 + }, + { + "epoch": 7.432300163132137, + "grad_norm": 0.02928158827126026, + "learning_rate": 0.0007916191390076468, + "loss": 0.0277, + "num_input_tokens_seen": 98340256, + "step": 45560 + }, + { + "epoch": 7.433115823817292, + "grad_norm": 0.001891314866952598, + "learning_rate": 0.0007915613166030799, + "loss": 0.0842, + "num_input_tokens_seen": 98350976, + "step": 45565 + }, + { + "epoch": 7.433931484502447, + "grad_norm": 0.04041924327611923, + "learning_rate": 0.0007915034882896528, + "loss": 0.0691, + "num_input_tokens_seen": 98361536, + "step": 45570 + }, + { + "epoch": 7.434747145187602, + "grad_norm": 0.027740156278014183, + "learning_rate": 0.0007914456540685379, + "loss": 0.0264, + "num_input_tokens_seen": 98372064, + "step": 45575 + }, + { + "epoch": 7.435562805872757, + "grad_norm": 0.4753979444503784, + "learning_rate": 0.0007913878139409072, + "loss": 0.2289, + "num_input_tokens_seen": 98382688, + "step": 45580 + }, + { + "epoch": 7.436378466557912, + "grad_norm": 0.14314419031143188, + "learning_rate": 0.0007913299679079326, + "loss": 0.0564, + "num_input_tokens_seen": 98393312, + "step": 45585 + }, + { + "epoch": 7.437194127243067, + "grad_norm": 0.0863778367638588, + "learning_rate": 0.000791272115970787, + "loss": 0.1797, + "num_input_tokens_seen": 98403136, + "step": 45590 + }, + { + "epoch": 7.438009787928221, + "grad_norm": 0.0023562051355838776, + "learning_rate": 0.0007912142581306421, + "loss": 0.0538, + "num_input_tokens_seen": 98414752, + "step": 45595 + }, + { + "epoch": 7.438825448613377, + "grad_norm": 0.015079490840435028, + "learning_rate": 0.0007911563943886709, + "loss": 0.0455, + "num_input_tokens_seen": 98426496, + "step": 45600 + }, + { + "epoch": 7.439641109298532, + "grad_norm": 0.007972031831741333, + "learning_rate": 0.000791098524746046, + "loss": 0.0615, + "num_input_tokens_seen": 98437696, + "step": 45605 + }, + { + "epoch": 7.440456769983687, + "grad_norm": 0.10708468407392502, + "learning_rate": 0.0007910406492039404, + "loss": 0.1248, + "num_input_tokens_seen": 98448640, + "step": 45610 + }, + { + "epoch": 7.441272430668842, + "grad_norm": 0.02202642522752285, + "learning_rate": 0.0007909827677635267, + "loss": 0.0578, + "num_input_tokens_seen": 98459360, + "step": 45615 + }, + { + "epoch": 7.442088091353996, + "grad_norm": 0.18345937132835388, + "learning_rate": 0.000790924880425978, + "loss": 0.0643, + "num_input_tokens_seen": 98470560, + "step": 45620 + }, + { + "epoch": 7.442903752039152, + "grad_norm": 0.02337002195417881, + "learning_rate": 0.0007908669871924676, + "loss": 0.0221, + "num_input_tokens_seen": 98482432, + "step": 45625 + }, + { + "epoch": 7.443719412724307, + "grad_norm": 0.2804628908634186, + "learning_rate": 0.0007908090880641688, + "loss": 0.0676, + "num_input_tokens_seen": 98493056, + "step": 45630 + }, + { + "epoch": 7.444535073409462, + "grad_norm": 0.005265125539153814, + "learning_rate": 0.0007907511830422547, + "loss": 0.0511, + "num_input_tokens_seen": 98503904, + "step": 45635 + }, + { + "epoch": 7.445350734094617, + "grad_norm": 0.006431952118873596, + "learning_rate": 0.0007906932721278992, + "loss": 0.0938, + "num_input_tokens_seen": 98514848, + "step": 45640 + }, + { + "epoch": 7.446166394779771, + "grad_norm": 0.010906088165938854, + "learning_rate": 0.0007906353553222757, + "loss": 0.0111, + "num_input_tokens_seen": 98525696, + "step": 45645 + }, + { + "epoch": 7.446982055464926, + "grad_norm": 0.0019036248559132218, + "learning_rate": 0.000790577432626558, + "loss": 0.0207, + "num_input_tokens_seen": 98536672, + "step": 45650 + }, + { + "epoch": 7.447797716150082, + "grad_norm": 0.11320781707763672, + "learning_rate": 0.0007905195040419202, + "loss": 0.0662, + "num_input_tokens_seen": 98546912, + "step": 45655 + }, + { + "epoch": 7.448613376835237, + "grad_norm": 0.26842591166496277, + "learning_rate": 0.0007904615695695359, + "loss": 0.2255, + "num_input_tokens_seen": 98558336, + "step": 45660 + }, + { + "epoch": 7.4494290375203915, + "grad_norm": 0.11304190754890442, + "learning_rate": 0.0007904036292105794, + "loss": 0.0333, + "num_input_tokens_seen": 98568096, + "step": 45665 + }, + { + "epoch": 7.450244698205546, + "grad_norm": 0.008788889274001122, + "learning_rate": 0.000790345682966225, + "loss": 0.1113, + "num_input_tokens_seen": 98579072, + "step": 45670 + }, + { + "epoch": 7.451060358890701, + "grad_norm": 0.2839869260787964, + "learning_rate": 0.000790287730837647, + "loss": 0.1458, + "num_input_tokens_seen": 98589600, + "step": 45675 + }, + { + "epoch": 7.451876019575856, + "grad_norm": 0.28748512268066406, + "learning_rate": 0.0007902297728260199, + "loss": 0.1335, + "num_input_tokens_seen": 98600864, + "step": 45680 + }, + { + "epoch": 7.452691680261012, + "grad_norm": 0.0048972731456160545, + "learning_rate": 0.0007901718089325183, + "loss": 0.04, + "num_input_tokens_seen": 98611968, + "step": 45685 + }, + { + "epoch": 7.4535073409461665, + "grad_norm": 0.06445912271738052, + "learning_rate": 0.0007901138391583169, + "loss": 0.0917, + "num_input_tokens_seen": 98622912, + "step": 45690 + }, + { + "epoch": 7.454323001631321, + "grad_norm": 0.05211620032787323, + "learning_rate": 0.0007900558635045904, + "loss": 0.28, + "num_input_tokens_seen": 98634592, + "step": 45695 + }, + { + "epoch": 7.455138662316476, + "grad_norm": 0.007363923825323582, + "learning_rate": 0.000789997881972514, + "loss": 0.1129, + "num_input_tokens_seen": 98644096, + "step": 45700 + }, + { + "epoch": 7.455954323001631, + "grad_norm": 0.11903098970651627, + "learning_rate": 0.0007899398945632626, + "loss": 0.0477, + "num_input_tokens_seen": 98654400, + "step": 45705 + }, + { + "epoch": 7.456769983686787, + "grad_norm": 0.021192517131567, + "learning_rate": 0.0007898819012780114, + "loss": 0.0211, + "num_input_tokens_seen": 98665568, + "step": 45710 + }, + { + "epoch": 7.4575856443719415, + "grad_norm": 0.11045344173908234, + "learning_rate": 0.0007898239021179356, + "loss": 0.069, + "num_input_tokens_seen": 98674624, + "step": 45715 + }, + { + "epoch": 7.458401305057096, + "grad_norm": 0.05654820427298546, + "learning_rate": 0.000789765897084211, + "loss": 0.1439, + "num_input_tokens_seen": 98685536, + "step": 45720 + }, + { + "epoch": 7.459216965742251, + "grad_norm": 0.3496699631214142, + "learning_rate": 0.0007897078861780127, + "loss": 0.1358, + "num_input_tokens_seen": 98696352, + "step": 45725 + }, + { + "epoch": 7.460032626427406, + "grad_norm": 0.09856715053319931, + "learning_rate": 0.0007896498694005168, + "loss": 0.0542, + "num_input_tokens_seen": 98707808, + "step": 45730 + }, + { + "epoch": 7.460848287112561, + "grad_norm": 0.02175956778228283, + "learning_rate": 0.0007895918467528987, + "loss": 0.0343, + "num_input_tokens_seen": 98718112, + "step": 45735 + }, + { + "epoch": 7.4616639477977165, + "grad_norm": 0.052862830460071564, + "learning_rate": 0.0007895338182363343, + "loss": 0.0753, + "num_input_tokens_seen": 98729312, + "step": 45740 + }, + { + "epoch": 7.462479608482871, + "grad_norm": 0.0069775888696312904, + "learning_rate": 0.0007894757838519999, + "loss": 0.0524, + "num_input_tokens_seen": 98739168, + "step": 45745 + }, + { + "epoch": 7.463295269168026, + "grad_norm": 0.5612104535102844, + "learning_rate": 0.0007894177436010716, + "loss": 0.2083, + "num_input_tokens_seen": 98749376, + "step": 45750 + }, + { + "epoch": 7.464110929853181, + "grad_norm": 0.01380117516964674, + "learning_rate": 0.0007893596974847255, + "loss": 0.0222, + "num_input_tokens_seen": 98760928, + "step": 45755 + }, + { + "epoch": 7.464926590538336, + "grad_norm": 0.01961393468081951, + "learning_rate": 0.000789301645504138, + "loss": 0.0829, + "num_input_tokens_seen": 98770560, + "step": 45760 + }, + { + "epoch": 7.465742251223491, + "grad_norm": 0.02134229801595211, + "learning_rate": 0.0007892435876604857, + "loss": 0.0863, + "num_input_tokens_seen": 98780512, + "step": 45765 + }, + { + "epoch": 7.466557911908646, + "grad_norm": 0.03447406738996506, + "learning_rate": 0.0007891855239549453, + "loss": 0.028, + "num_input_tokens_seen": 98792192, + "step": 45770 + }, + { + "epoch": 7.467373572593801, + "grad_norm": 0.39587467908859253, + "learning_rate": 0.0007891274543886933, + "loss": 0.1553, + "num_input_tokens_seen": 98802752, + "step": 45775 + }, + { + "epoch": 7.468189233278956, + "grad_norm": 0.02360408380627632, + "learning_rate": 0.0007890693789629064, + "loss": 0.0332, + "num_input_tokens_seen": 98813600, + "step": 45780 + }, + { + "epoch": 7.469004893964111, + "grad_norm": 0.008248478174209595, + "learning_rate": 0.0007890112976787621, + "loss": 0.0136, + "num_input_tokens_seen": 98825280, + "step": 45785 + }, + { + "epoch": 7.4698205546492655, + "grad_norm": 0.10533123463392258, + "learning_rate": 0.0007889532105374373, + "loss": 0.0487, + "num_input_tokens_seen": 98836480, + "step": 45790 + }, + { + "epoch": 7.470636215334421, + "grad_norm": 0.05389018356800079, + "learning_rate": 0.0007888951175401089, + "loss": 0.0809, + "num_input_tokens_seen": 98847424, + "step": 45795 + }, + { + "epoch": 7.471451876019576, + "grad_norm": 0.12185373157262802, + "learning_rate": 0.0007888370186879545, + "loss": 0.1099, + "num_input_tokens_seen": 98858016, + "step": 45800 + }, + { + "epoch": 7.472267536704731, + "grad_norm": 0.0032347210217267275, + "learning_rate": 0.0007887789139821516, + "loss": 0.0264, + "num_input_tokens_seen": 98868864, + "step": 45805 + }, + { + "epoch": 7.473083197389886, + "grad_norm": 0.24199385941028595, + "learning_rate": 0.0007887208034238777, + "loss": 0.1008, + "num_input_tokens_seen": 98879936, + "step": 45810 + }, + { + "epoch": 7.4738988580750405, + "grad_norm": 0.09054215997457504, + "learning_rate": 0.0007886626870143103, + "loss": 0.0444, + "num_input_tokens_seen": 98891328, + "step": 45815 + }, + { + "epoch": 7.474714518760196, + "grad_norm": 0.5284417271614075, + "learning_rate": 0.0007886045647546274, + "loss": 0.1209, + "num_input_tokens_seen": 98902592, + "step": 45820 + }, + { + "epoch": 7.475530179445351, + "grad_norm": 0.005147279240190983, + "learning_rate": 0.0007885464366460069, + "loss": 0.0461, + "num_input_tokens_seen": 98913440, + "step": 45825 + }, + { + "epoch": 7.476345840130506, + "grad_norm": 0.04313492029905319, + "learning_rate": 0.0007884883026896268, + "loss": 0.0415, + "num_input_tokens_seen": 98923808, + "step": 45830 + }, + { + "epoch": 7.477161500815661, + "grad_norm": 0.046300292015075684, + "learning_rate": 0.0007884301628866652, + "loss": 0.0513, + "num_input_tokens_seen": 98933856, + "step": 45835 + }, + { + "epoch": 7.4779771615008155, + "grad_norm": 0.1123092919588089, + "learning_rate": 0.0007883720172383007, + "loss": 0.021, + "num_input_tokens_seen": 98944960, + "step": 45840 + }, + { + "epoch": 7.47879282218597, + "grad_norm": 0.016281159594655037, + "learning_rate": 0.0007883138657457111, + "loss": 0.0247, + "num_input_tokens_seen": 98956512, + "step": 45845 + }, + { + "epoch": 7.479608482871126, + "grad_norm": 0.04821819067001343, + "learning_rate": 0.0007882557084100755, + "loss": 0.1532, + "num_input_tokens_seen": 98967520, + "step": 45850 + }, + { + "epoch": 7.480424143556281, + "grad_norm": 0.2784560024738312, + "learning_rate": 0.0007881975452325722, + "loss": 0.0871, + "num_input_tokens_seen": 98977920, + "step": 45855 + }, + { + "epoch": 7.481239804241436, + "grad_norm": 0.003940748982131481, + "learning_rate": 0.00078813937621438, + "loss": 0.0365, + "num_input_tokens_seen": 98987392, + "step": 45860 + }, + { + "epoch": 7.4820554649265905, + "grad_norm": 0.02854633517563343, + "learning_rate": 0.000788081201356678, + "loss": 0.0417, + "num_input_tokens_seen": 98998304, + "step": 45865 + }, + { + "epoch": 7.482871125611745, + "grad_norm": 0.018137488514184952, + "learning_rate": 0.0007880230206606449, + "loss": 0.0427, + "num_input_tokens_seen": 99008480, + "step": 45870 + }, + { + "epoch": 7.4836867862969, + "grad_norm": 0.038598205894231796, + "learning_rate": 0.0007879648341274599, + "loss": 0.0341, + "num_input_tokens_seen": 99019584, + "step": 45875 + }, + { + "epoch": 7.484502446982056, + "grad_norm": 0.2073914259672165, + "learning_rate": 0.0007879066417583021, + "loss": 0.1476, + "num_input_tokens_seen": 99029920, + "step": 45880 + }, + { + "epoch": 7.485318107667211, + "grad_norm": 0.2836424708366394, + "learning_rate": 0.0007878484435543511, + "loss": 0.2164, + "num_input_tokens_seen": 99040288, + "step": 45885 + }, + { + "epoch": 7.486133768352365, + "grad_norm": 0.008115977048873901, + "learning_rate": 0.0007877902395167862, + "loss": 0.0479, + "num_input_tokens_seen": 99053024, + "step": 45890 + }, + { + "epoch": 7.48694942903752, + "grad_norm": 0.002475854242220521, + "learning_rate": 0.000787732029646787, + "loss": 0.0296, + "num_input_tokens_seen": 99062720, + "step": 45895 + }, + { + "epoch": 7.487765089722675, + "grad_norm": 0.02101140096783638, + "learning_rate": 0.0007876738139455332, + "loss": 0.0125, + "num_input_tokens_seen": 99073728, + "step": 45900 + }, + { + "epoch": 7.488580750407831, + "grad_norm": 0.11315298080444336, + "learning_rate": 0.0007876155924142046, + "loss": 0.2321, + "num_input_tokens_seen": 99084704, + "step": 45905 + }, + { + "epoch": 7.489396411092986, + "grad_norm": 0.23733113706111908, + "learning_rate": 0.0007875573650539811, + "loss": 0.3605, + "num_input_tokens_seen": 99095744, + "step": 45910 + }, + { + "epoch": 7.49021207177814, + "grad_norm": 0.14126139879226685, + "learning_rate": 0.0007874991318660429, + "loss": 0.0625, + "num_input_tokens_seen": 99107168, + "step": 45915 + }, + { + "epoch": 7.491027732463295, + "grad_norm": 0.004500252660363913, + "learning_rate": 0.0007874408928515702, + "loss": 0.028, + "num_input_tokens_seen": 99118208, + "step": 45920 + }, + { + "epoch": 7.49184339314845, + "grad_norm": 0.01806046813726425, + "learning_rate": 0.000787382648011743, + "loss": 0.0412, + "num_input_tokens_seen": 99129152, + "step": 45925 + }, + { + "epoch": 7.492659053833605, + "grad_norm": 0.036013007164001465, + "learning_rate": 0.0007873243973477419, + "loss": 0.0222, + "num_input_tokens_seen": 99140192, + "step": 45930 + }, + { + "epoch": 7.493474714518761, + "grad_norm": 0.030963629484176636, + "learning_rate": 0.0007872661408607473, + "loss": 0.0715, + "num_input_tokens_seen": 99150368, + "step": 45935 + }, + { + "epoch": 7.494290375203915, + "grad_norm": 0.002720885444432497, + "learning_rate": 0.0007872078785519401, + "loss": 0.0987, + "num_input_tokens_seen": 99161728, + "step": 45940 + }, + { + "epoch": 7.49510603588907, + "grad_norm": 0.07131746411323547, + "learning_rate": 0.0007871496104225007, + "loss": 0.0758, + "num_input_tokens_seen": 99173024, + "step": 45945 + }, + { + "epoch": 7.495921696574225, + "grad_norm": 0.017207255586981773, + "learning_rate": 0.0007870913364736103, + "loss": 0.0699, + "num_input_tokens_seen": 99183008, + "step": 45950 + }, + { + "epoch": 7.49673735725938, + "grad_norm": 0.009386662393808365, + "learning_rate": 0.0007870330567064499, + "loss": 0.0284, + "num_input_tokens_seen": 99194976, + "step": 45955 + }, + { + "epoch": 7.497553017944535, + "grad_norm": 0.3741611838340759, + "learning_rate": 0.0007869747711222001, + "loss": 0.0786, + "num_input_tokens_seen": 99205760, + "step": 45960 + }, + { + "epoch": 7.49836867862969, + "grad_norm": 0.048032522201538086, + "learning_rate": 0.0007869164797220429, + "loss": 0.0435, + "num_input_tokens_seen": 99215808, + "step": 45965 + }, + { + "epoch": 7.499184339314845, + "grad_norm": 0.011592647060751915, + "learning_rate": 0.000786858182507159, + "loss": 0.1804, + "num_input_tokens_seen": 99226848, + "step": 45970 + }, + { + "epoch": 7.5, + "grad_norm": 0.01963452436029911, + "learning_rate": 0.0007867998794787303, + "loss": 0.0821, + "num_input_tokens_seen": 99237152, + "step": 45975 + }, + { + "epoch": 7.500815660685155, + "grad_norm": 0.1919567584991455, + "learning_rate": 0.0007867415706379381, + "loss": 0.1631, + "num_input_tokens_seen": 99247936, + "step": 45980 + }, + { + "epoch": 7.50163132137031, + "grad_norm": 0.2966495454311371, + "learning_rate": 0.0007866832559859642, + "loss": 0.0786, + "num_input_tokens_seen": 99259456, + "step": 45985 + }, + { + "epoch": 7.502446982055465, + "grad_norm": 0.019804665818810463, + "learning_rate": 0.0007866249355239905, + "loss": 0.0151, + "num_input_tokens_seen": 99269760, + "step": 45990 + }, + { + "epoch": 7.50326264274062, + "grad_norm": 0.11622557789087296, + "learning_rate": 0.0007865666092531989, + "loss": 0.0394, + "num_input_tokens_seen": 99280256, + "step": 45995 + }, + { + "epoch": 7.504078303425775, + "grad_norm": 0.025021374225616455, + "learning_rate": 0.0007865082771747713, + "loss": 0.0246, + "num_input_tokens_seen": 99289600, + "step": 46000 + }, + { + "epoch": 7.50489396411093, + "grad_norm": 0.030868900939822197, + "learning_rate": 0.00078644993928989, + "loss": 0.0404, + "num_input_tokens_seen": 99300128, + "step": 46005 + }, + { + "epoch": 7.505709624796085, + "grad_norm": 0.1959637999534607, + "learning_rate": 0.0007863915955997374, + "loss": 0.172, + "num_input_tokens_seen": 99311680, + "step": 46010 + }, + { + "epoch": 7.506525285481239, + "grad_norm": 0.1512918919324875, + "learning_rate": 0.0007863332461054957, + "loss": 0.1369, + "num_input_tokens_seen": 99322976, + "step": 46015 + }, + { + "epoch": 7.507340946166395, + "grad_norm": 0.004533303435891867, + "learning_rate": 0.0007862748908083477, + "loss": 0.1816, + "num_input_tokens_seen": 99335584, + "step": 46020 + }, + { + "epoch": 7.50815660685155, + "grad_norm": 0.2223842740058899, + "learning_rate": 0.0007862165297094758, + "loss": 0.0811, + "num_input_tokens_seen": 99346528, + "step": 46025 + }, + { + "epoch": 7.508972267536705, + "grad_norm": 0.01374430675059557, + "learning_rate": 0.0007861581628100628, + "loss": 0.0377, + "num_input_tokens_seen": 99357280, + "step": 46030 + }, + { + "epoch": 7.50978792822186, + "grad_norm": 0.04576978087425232, + "learning_rate": 0.0007860997901112917, + "loss": 0.0303, + "num_input_tokens_seen": 99368192, + "step": 46035 + }, + { + "epoch": 7.510603588907014, + "grad_norm": 0.07817045599222183, + "learning_rate": 0.0007860414116143453, + "loss": 0.0384, + "num_input_tokens_seen": 99379104, + "step": 46040 + }, + { + "epoch": 7.511419249592169, + "grad_norm": 0.1468290388584137, + "learning_rate": 0.0007859830273204069, + "loss": 0.0429, + "num_input_tokens_seen": 99391200, + "step": 46045 + }, + { + "epoch": 7.512234910277325, + "grad_norm": 0.015742124989628792, + "learning_rate": 0.0007859246372306595, + "loss": 0.0169, + "num_input_tokens_seen": 99402272, + "step": 46050 + }, + { + "epoch": 7.51305057096248, + "grad_norm": 0.1696634739637375, + "learning_rate": 0.0007858662413462867, + "loss": 0.0728, + "num_input_tokens_seen": 99414016, + "step": 46055 + }, + { + "epoch": 7.513866231647635, + "grad_norm": 0.08048196136951447, + "learning_rate": 0.000785807839668472, + "loss": 0.1126, + "num_input_tokens_seen": 99423424, + "step": 46060 + }, + { + "epoch": 7.514681892332789, + "grad_norm": 0.042760301381349564, + "learning_rate": 0.0007857494321983987, + "loss": 0.0136, + "num_input_tokens_seen": 99434592, + "step": 46065 + }, + { + "epoch": 7.515497553017944, + "grad_norm": 0.10188845545053482, + "learning_rate": 0.0007856910189372506, + "loss": 0.0196, + "num_input_tokens_seen": 99444096, + "step": 46070 + }, + { + "epoch": 7.5163132137031, + "grad_norm": 0.07203994691371918, + "learning_rate": 0.0007856325998862118, + "loss": 0.1228, + "num_input_tokens_seen": 99455328, + "step": 46075 + }, + { + "epoch": 7.517128874388255, + "grad_norm": 0.032810721546411514, + "learning_rate": 0.0007855741750464658, + "loss": 0.0471, + "num_input_tokens_seen": 99465152, + "step": 46080 + }, + { + "epoch": 7.5179445350734095, + "grad_norm": 0.0027981449384242296, + "learning_rate": 0.0007855157444191969, + "loss": 0.0194, + "num_input_tokens_seen": 99475072, + "step": 46085 + }, + { + "epoch": 7.518760195758564, + "grad_norm": 0.35480666160583496, + "learning_rate": 0.0007854573080055894, + "loss": 0.042, + "num_input_tokens_seen": 99485344, + "step": 46090 + }, + { + "epoch": 7.519575856443719, + "grad_norm": 0.01701878383755684, + "learning_rate": 0.0007853988658068274, + "loss": 0.0667, + "num_input_tokens_seen": 99496864, + "step": 46095 + }, + { + "epoch": 7.520391517128875, + "grad_norm": 0.019124653190374374, + "learning_rate": 0.000785340417824095, + "loss": 0.0291, + "num_input_tokens_seen": 99507520, + "step": 46100 + }, + { + "epoch": 7.52120717781403, + "grad_norm": 0.02421344630420208, + "learning_rate": 0.0007852819640585773, + "loss": 0.1235, + "num_input_tokens_seen": 99517664, + "step": 46105 + }, + { + "epoch": 7.5220228384991845, + "grad_norm": 0.10566727072000504, + "learning_rate": 0.0007852235045114588, + "loss": 0.0365, + "num_input_tokens_seen": 99527392, + "step": 46110 + }, + { + "epoch": 7.522838499184339, + "grad_norm": 0.010944555513560772, + "learning_rate": 0.000785165039183924, + "loss": 0.1446, + "num_input_tokens_seen": 99538784, + "step": 46115 + }, + { + "epoch": 7.523654159869494, + "grad_norm": 0.04998289793729782, + "learning_rate": 0.0007851065680771581, + "loss": 0.0639, + "num_input_tokens_seen": 99549024, + "step": 46120 + }, + { + "epoch": 7.524469820554649, + "grad_norm": 0.11685500293970108, + "learning_rate": 0.0007850480911923457, + "loss": 0.0489, + "num_input_tokens_seen": 99560480, + "step": 46125 + }, + { + "epoch": 7.525285481239804, + "grad_norm": 0.2706635594367981, + "learning_rate": 0.0007849896085306723, + "loss": 0.058, + "num_input_tokens_seen": 99571968, + "step": 46130 + }, + { + "epoch": 7.5261011419249595, + "grad_norm": 0.014723085798323154, + "learning_rate": 0.0007849311200933228, + "loss": 0.0768, + "num_input_tokens_seen": 99582112, + "step": 46135 + }, + { + "epoch": 7.526916802610114, + "grad_norm": 0.007247124798595905, + "learning_rate": 0.0007848726258814826, + "loss": 0.1112, + "num_input_tokens_seen": 99591360, + "step": 46140 + }, + { + "epoch": 7.527732463295269, + "grad_norm": 0.2555147707462311, + "learning_rate": 0.0007848141258963375, + "loss": 0.0892, + "num_input_tokens_seen": 99601696, + "step": 46145 + }, + { + "epoch": 7.528548123980424, + "grad_norm": 0.03253483399748802, + "learning_rate": 0.0007847556201390727, + "loss": 0.078, + "num_input_tokens_seen": 99612928, + "step": 46150 + }, + { + "epoch": 7.529363784665579, + "grad_norm": 0.10616229474544525, + "learning_rate": 0.0007846971086108741, + "loss": 0.0638, + "num_input_tokens_seen": 99622720, + "step": 46155 + }, + { + "epoch": 7.5301794453507345, + "grad_norm": 0.01619393192231655, + "learning_rate": 0.0007846385913129273, + "loss": 0.2007, + "num_input_tokens_seen": 99633888, + "step": 46160 + }, + { + "epoch": 7.530995106035889, + "grad_norm": 0.15025803446769714, + "learning_rate": 0.0007845800682464185, + "loss": 0.09, + "num_input_tokens_seen": 99644960, + "step": 46165 + }, + { + "epoch": 7.531810766721044, + "grad_norm": 0.01432060170918703, + "learning_rate": 0.0007845215394125336, + "loss": 0.0248, + "num_input_tokens_seen": 99656096, + "step": 46170 + }, + { + "epoch": 7.532626427406199, + "grad_norm": 0.2009487897157669, + "learning_rate": 0.0007844630048124586, + "loss": 0.0963, + "num_input_tokens_seen": 99666752, + "step": 46175 + }, + { + "epoch": 7.533442088091354, + "grad_norm": 0.08197979629039764, + "learning_rate": 0.00078440446444738, + "loss": 0.0349, + "num_input_tokens_seen": 99678016, + "step": 46180 + }, + { + "epoch": 7.5342577487765094, + "grad_norm": 0.009611856192350388, + "learning_rate": 0.0007843459183184843, + "loss": 0.0137, + "num_input_tokens_seen": 99687456, + "step": 46185 + }, + { + "epoch": 7.535073409461664, + "grad_norm": 0.23774437606334686, + "learning_rate": 0.0007842873664269576, + "loss": 0.0714, + "num_input_tokens_seen": 99699584, + "step": 46190 + }, + { + "epoch": 7.535889070146819, + "grad_norm": 0.0137681495398283, + "learning_rate": 0.0007842288087739868, + "loss": 0.0134, + "num_input_tokens_seen": 99710624, + "step": 46195 + }, + { + "epoch": 7.536704730831974, + "grad_norm": 0.009281824342906475, + "learning_rate": 0.0007841702453607589, + "loss": 0.0388, + "num_input_tokens_seen": 99720704, + "step": 46200 + }, + { + "epoch": 7.537520391517129, + "grad_norm": 0.001227008760906756, + "learning_rate": 0.0007841116761884601, + "loss": 0.0483, + "num_input_tokens_seen": 99729984, + "step": 46205 + }, + { + "epoch": 7.5383360522022835, + "grad_norm": 0.006545028183609247, + "learning_rate": 0.000784053101258278, + "loss": 0.1183, + "num_input_tokens_seen": 99741632, + "step": 46210 + }, + { + "epoch": 7.539151712887438, + "grad_norm": 0.004287282936275005, + "learning_rate": 0.0007839945205713995, + "loss": 0.0914, + "num_input_tokens_seen": 99750688, + "step": 46215 + }, + { + "epoch": 7.539967373572594, + "grad_norm": 0.03882336989045143, + "learning_rate": 0.0007839359341290116, + "loss": 0.0338, + "num_input_tokens_seen": 99761440, + "step": 46220 + }, + { + "epoch": 7.540783034257749, + "grad_norm": 0.26387444138526917, + "learning_rate": 0.0007838773419323019, + "loss": 0.0604, + "num_input_tokens_seen": 99770912, + "step": 46225 + }, + { + "epoch": 7.541598694942904, + "grad_norm": 0.025288641452789307, + "learning_rate": 0.0007838187439824577, + "loss": 0.0463, + "num_input_tokens_seen": 99782368, + "step": 46230 + }, + { + "epoch": 7.5424143556280585, + "grad_norm": 0.03209105134010315, + "learning_rate": 0.0007837601402806666, + "loss": 0.0154, + "num_input_tokens_seen": 99794560, + "step": 46235 + }, + { + "epoch": 7.543230016313213, + "grad_norm": 0.008957461453974247, + "learning_rate": 0.0007837015308281163, + "loss": 0.0219, + "num_input_tokens_seen": 99805184, + "step": 46240 + }, + { + "epoch": 7.544045676998369, + "grad_norm": 0.38704678416252136, + "learning_rate": 0.0007836429156259946, + "loss": 0.0446, + "num_input_tokens_seen": 99816160, + "step": 46245 + }, + { + "epoch": 7.544861337683524, + "grad_norm": 0.015264240093529224, + "learning_rate": 0.0007835842946754893, + "loss": 0.0256, + "num_input_tokens_seen": 99826784, + "step": 46250 + }, + { + "epoch": 7.545676998368679, + "grad_norm": 0.006368184927850962, + "learning_rate": 0.0007835256679777887, + "loss": 0.0164, + "num_input_tokens_seen": 99837920, + "step": 46255 + }, + { + "epoch": 7.5464926590538335, + "grad_norm": 0.18929897248744965, + "learning_rate": 0.0007834670355340805, + "loss": 0.1629, + "num_input_tokens_seen": 99848672, + "step": 46260 + }, + { + "epoch": 7.547308319738988, + "grad_norm": 0.03622492775321007, + "learning_rate": 0.0007834083973455535, + "loss": 0.1297, + "num_input_tokens_seen": 99861216, + "step": 46265 + }, + { + "epoch": 7.548123980424144, + "grad_norm": 0.1713002473115921, + "learning_rate": 0.0007833497534133955, + "loss": 0.0634, + "num_input_tokens_seen": 99870784, + "step": 46270 + }, + { + "epoch": 7.548939641109299, + "grad_norm": 0.10626640170812607, + "learning_rate": 0.0007832911037387955, + "loss": 0.0442, + "num_input_tokens_seen": 99882272, + "step": 46275 + }, + { + "epoch": 7.549755301794454, + "grad_norm": 0.014792431145906448, + "learning_rate": 0.000783232448322942, + "loss": 0.0778, + "num_input_tokens_seen": 99892992, + "step": 46280 + }, + { + "epoch": 7.5505709624796085, + "grad_norm": 0.006565001793205738, + "learning_rate": 0.0007831737871670235, + "loss": 0.0121, + "num_input_tokens_seen": 99903744, + "step": 46285 + }, + { + "epoch": 7.551386623164763, + "grad_norm": 0.012091482989490032, + "learning_rate": 0.0007831151202722288, + "loss": 0.0145, + "num_input_tokens_seen": 99914688, + "step": 46290 + }, + { + "epoch": 7.552202283849918, + "grad_norm": 0.0027943248860538006, + "learning_rate": 0.0007830564476397473, + "loss": 0.0239, + "num_input_tokens_seen": 99925632, + "step": 46295 + }, + { + "epoch": 7.553017944535073, + "grad_norm": 0.15013094246387482, + "learning_rate": 0.0007829977692707676, + "loss": 0.0655, + "num_input_tokens_seen": 99935552, + "step": 46300 + }, + { + "epoch": 7.553833605220229, + "grad_norm": 0.009538470767438412, + "learning_rate": 0.0007829390851664793, + "loss": 0.0059, + "num_input_tokens_seen": 99946880, + "step": 46305 + }, + { + "epoch": 7.554649265905383, + "grad_norm": 0.08074900507926941, + "learning_rate": 0.0007828803953280713, + "loss": 0.0297, + "num_input_tokens_seen": 99957792, + "step": 46310 + }, + { + "epoch": 7.555464926590538, + "grad_norm": 0.0160321407020092, + "learning_rate": 0.0007828216997567333, + "loss": 0.0117, + "num_input_tokens_seen": 99967168, + "step": 46315 + }, + { + "epoch": 7.556280587275693, + "grad_norm": 0.007955282926559448, + "learning_rate": 0.0007827629984536548, + "loss": 0.0332, + "num_input_tokens_seen": 99978784, + "step": 46320 + }, + { + "epoch": 7.557096247960848, + "grad_norm": 0.023182855919003487, + "learning_rate": 0.0007827042914200254, + "loss": 0.1429, + "num_input_tokens_seen": 99990144, + "step": 46325 + }, + { + "epoch": 7.557911908646004, + "grad_norm": 0.004746480844914913, + "learning_rate": 0.000782645578657035, + "loss": 0.0038, + "num_input_tokens_seen": 100001568, + "step": 46330 + }, + { + "epoch": 7.558727569331158, + "grad_norm": 0.225945383310318, + "learning_rate": 0.0007825868601658733, + "loss": 0.146, + "num_input_tokens_seen": 100012128, + "step": 46335 + }, + { + "epoch": 7.559543230016313, + "grad_norm": 0.0051491702906787395, + "learning_rate": 0.0007825281359477303, + "loss": 0.0555, + "num_input_tokens_seen": 100023936, + "step": 46340 + }, + { + "epoch": 7.560358890701468, + "grad_norm": 0.03131938353180885, + "learning_rate": 0.0007824694060037964, + "loss": 0.0112, + "num_input_tokens_seen": 100035552, + "step": 46345 + }, + { + "epoch": 7.561174551386623, + "grad_norm": 0.004873788449913263, + "learning_rate": 0.0007824106703352616, + "loss": 0.0044, + "num_input_tokens_seen": 100045408, + "step": 46350 + }, + { + "epoch": 7.561990212071779, + "grad_norm": 0.010518714785575867, + "learning_rate": 0.0007823519289433162, + "loss": 0.1466, + "num_input_tokens_seen": 100057184, + "step": 46355 + }, + { + "epoch": 7.562805872756933, + "grad_norm": 0.05088549107313156, + "learning_rate": 0.0007822931818291508, + "loss": 0.0121, + "num_input_tokens_seen": 100068384, + "step": 46360 + }, + { + "epoch": 7.563621533442088, + "grad_norm": 0.0537845753133297, + "learning_rate": 0.0007822344289939561, + "loss": 0.0117, + "num_input_tokens_seen": 100079488, + "step": 46365 + }, + { + "epoch": 7.564437194127243, + "grad_norm": 0.00526401586830616, + "learning_rate": 0.0007821756704389224, + "loss": 0.0304, + "num_input_tokens_seen": 100090720, + "step": 46370 + }, + { + "epoch": 7.565252854812398, + "grad_norm": 0.12497130781412125, + "learning_rate": 0.000782116906165241, + "loss": 0.0384, + "num_input_tokens_seen": 100101248, + "step": 46375 + }, + { + "epoch": 7.566068515497553, + "grad_norm": 0.055331166833639145, + "learning_rate": 0.0007820581361741025, + "loss": 0.039, + "num_input_tokens_seen": 100112224, + "step": 46380 + }, + { + "epoch": 7.566884176182708, + "grad_norm": 0.036670684814453125, + "learning_rate": 0.0007819993604666982, + "loss": 0.0668, + "num_input_tokens_seen": 100121632, + "step": 46385 + }, + { + "epoch": 7.567699836867863, + "grad_norm": 0.0038586510345339775, + "learning_rate": 0.0007819405790442189, + "loss": 0.1165, + "num_input_tokens_seen": 100133024, + "step": 46390 + }, + { + "epoch": 7.568515497553018, + "grad_norm": 0.21471700072288513, + "learning_rate": 0.0007818817919078562, + "loss": 0.1802, + "num_input_tokens_seen": 100143008, + "step": 46395 + }, + { + "epoch": 7.569331158238173, + "grad_norm": 0.02432515099644661, + "learning_rate": 0.0007818229990588013, + "loss": 0.2675, + "num_input_tokens_seen": 100153408, + "step": 46400 + }, + { + "epoch": 7.570146818923328, + "grad_norm": 0.5932703614234924, + "learning_rate": 0.000781764200498246, + "loss": 0.0833, + "num_input_tokens_seen": 100163744, + "step": 46405 + }, + { + "epoch": 7.5709624796084825, + "grad_norm": 0.009936857037246227, + "learning_rate": 0.0007817053962273817, + "loss": 0.0163, + "num_input_tokens_seen": 100172608, + "step": 46410 + }, + { + "epoch": 7.571778140293638, + "grad_norm": 0.005890344735234976, + "learning_rate": 0.0007816465862474, + "loss": 0.0123, + "num_input_tokens_seen": 100182400, + "step": 46415 + }, + { + "epoch": 7.572593800978793, + "grad_norm": 0.011194584891200066, + "learning_rate": 0.000781587770559493, + "loss": 0.0374, + "num_input_tokens_seen": 100191936, + "step": 46420 + }, + { + "epoch": 7.573409461663948, + "grad_norm": 0.004106007516384125, + "learning_rate": 0.0007815289491648527, + "loss": 0.0581, + "num_input_tokens_seen": 100202048, + "step": 46425 + }, + { + "epoch": 7.574225122349103, + "grad_norm": 0.06704815477132797, + "learning_rate": 0.000781470122064671, + "loss": 0.067, + "num_input_tokens_seen": 100213632, + "step": 46430 + }, + { + "epoch": 7.575040783034257, + "grad_norm": 0.13246703147888184, + "learning_rate": 0.0007814112892601403, + "loss": 0.2559, + "num_input_tokens_seen": 100224096, + "step": 46435 + }, + { + "epoch": 7.575856443719413, + "grad_norm": 0.015076338313519955, + "learning_rate": 0.0007813524507524527, + "loss": 0.0165, + "num_input_tokens_seen": 100234368, + "step": 46440 + }, + { + "epoch": 7.576672104404568, + "grad_norm": 0.0034018096048384905, + "learning_rate": 0.0007812936065428009, + "loss": 0.0593, + "num_input_tokens_seen": 100244256, + "step": 46445 + }, + { + "epoch": 7.577487765089723, + "grad_norm": 0.062234122306108475, + "learning_rate": 0.0007812347566323774, + "loss": 0.0168, + "num_input_tokens_seen": 100255360, + "step": 46450 + }, + { + "epoch": 7.578303425774878, + "grad_norm": 0.006689689587801695, + "learning_rate": 0.0007811759010223747, + "loss": 0.122, + "num_input_tokens_seen": 100265824, + "step": 46455 + }, + { + "epoch": 7.579119086460032, + "grad_norm": 0.014267882332205772, + "learning_rate": 0.0007811170397139855, + "loss": 0.2066, + "num_input_tokens_seen": 100276512, + "step": 46460 + }, + { + "epoch": 7.579934747145187, + "grad_norm": 0.024747714400291443, + "learning_rate": 0.000781058172708403, + "loss": 0.0684, + "num_input_tokens_seen": 100286560, + "step": 46465 + }, + { + "epoch": 7.580750407830343, + "grad_norm": 0.08009031414985657, + "learning_rate": 0.00078099930000682, + "loss": 0.0438, + "num_input_tokens_seen": 100297856, + "step": 46470 + }, + { + "epoch": 7.581566068515498, + "grad_norm": 0.015079363249242306, + "learning_rate": 0.0007809404216104299, + "loss": 0.0838, + "num_input_tokens_seen": 100308352, + "step": 46475 + }, + { + "epoch": 7.582381729200653, + "grad_norm": 0.17071382701396942, + "learning_rate": 0.0007808815375204257, + "loss": 0.0309, + "num_input_tokens_seen": 100317536, + "step": 46480 + }, + { + "epoch": 7.583197389885807, + "grad_norm": 0.018091991543769836, + "learning_rate": 0.0007808226477380007, + "loss": 0.0293, + "num_input_tokens_seen": 100327808, + "step": 46485 + }, + { + "epoch": 7.584013050570962, + "grad_norm": 0.006568972021341324, + "learning_rate": 0.0007807637522643484, + "loss": 0.0253, + "num_input_tokens_seen": 100339712, + "step": 46490 + }, + { + "epoch": 7.584828711256117, + "grad_norm": 0.10979203879833221, + "learning_rate": 0.0007807048511006628, + "loss": 0.1308, + "num_input_tokens_seen": 100349696, + "step": 46495 + }, + { + "epoch": 7.585644371941273, + "grad_norm": 0.007415767293423414, + "learning_rate": 0.0007806459442481372, + "loss": 0.1214, + "num_input_tokens_seen": 100360160, + "step": 46500 + }, + { + "epoch": 7.5864600326264275, + "grad_norm": 0.02958816848695278, + "learning_rate": 0.0007805870317079654, + "loss": 0.1157, + "num_input_tokens_seen": 100370848, + "step": 46505 + }, + { + "epoch": 7.587275693311582, + "grad_norm": 0.2562585771083832, + "learning_rate": 0.0007805281134813416, + "loss": 0.0754, + "num_input_tokens_seen": 100381248, + "step": 46510 + }, + { + "epoch": 7.588091353996737, + "grad_norm": 0.25494882464408875, + "learning_rate": 0.0007804691895694595, + "loss": 0.0803, + "num_input_tokens_seen": 100389888, + "step": 46515 + }, + { + "epoch": 7.588907014681892, + "grad_norm": 0.01317585352808237, + "learning_rate": 0.0007804102599735137, + "loss": 0.1282, + "num_input_tokens_seen": 100399840, + "step": 46520 + }, + { + "epoch": 7.589722675367048, + "grad_norm": 0.05510897561907768, + "learning_rate": 0.0007803513246946981, + "loss": 0.0466, + "num_input_tokens_seen": 100410848, + "step": 46525 + }, + { + "epoch": 7.5905383360522025, + "grad_norm": 0.2794935703277588, + "learning_rate": 0.0007802923837342072, + "loss": 0.2526, + "num_input_tokens_seen": 100422368, + "step": 46530 + }, + { + "epoch": 7.591353996737357, + "grad_norm": 0.03801713511347771, + "learning_rate": 0.0007802334370932357, + "loss": 0.0735, + "num_input_tokens_seen": 100433504, + "step": 46535 + }, + { + "epoch": 7.592169657422512, + "grad_norm": 0.017860641703009605, + "learning_rate": 0.0007801744847729781, + "loss": 0.0144, + "num_input_tokens_seen": 100443584, + "step": 46540 + }, + { + "epoch": 7.592985318107667, + "grad_norm": 0.16718651354312897, + "learning_rate": 0.0007801155267746291, + "loss": 0.2004, + "num_input_tokens_seen": 100453216, + "step": 46545 + }, + { + "epoch": 7.593800978792823, + "grad_norm": 0.026073988527059555, + "learning_rate": 0.0007800565630993834, + "loss": 0.0112, + "num_input_tokens_seen": 100463296, + "step": 46550 + }, + { + "epoch": 7.5946166394779775, + "grad_norm": 0.23177111148834229, + "learning_rate": 0.0007799975937484365, + "loss": 0.1949, + "num_input_tokens_seen": 100473728, + "step": 46555 + }, + { + "epoch": 7.595432300163132, + "grad_norm": 0.05889357998967171, + "learning_rate": 0.000779938618722983, + "loss": 0.0929, + "num_input_tokens_seen": 100485344, + "step": 46560 + }, + { + "epoch": 7.596247960848287, + "grad_norm": 0.24748052656650543, + "learning_rate": 0.0007798796380242183, + "loss": 0.2074, + "num_input_tokens_seen": 100495552, + "step": 46565 + }, + { + "epoch": 7.597063621533442, + "grad_norm": 0.027371667325496674, + "learning_rate": 0.0007798206516533377, + "loss": 0.1277, + "num_input_tokens_seen": 100506944, + "step": 46570 + }, + { + "epoch": 7.597879282218597, + "grad_norm": 0.1969236135482788, + "learning_rate": 0.0007797616596115365, + "loss": 0.1073, + "num_input_tokens_seen": 100517760, + "step": 46575 + }, + { + "epoch": 7.598694942903752, + "grad_norm": 0.08001730591058731, + "learning_rate": 0.0007797026619000105, + "loss": 0.0786, + "num_input_tokens_seen": 100528768, + "step": 46580 + }, + { + "epoch": 7.599510603588907, + "grad_norm": 0.12615911662578583, + "learning_rate": 0.0007796436585199553, + "loss": 0.1187, + "num_input_tokens_seen": 100539392, + "step": 46585 + }, + { + "epoch": 7.600326264274062, + "grad_norm": 0.029265137389302254, + "learning_rate": 0.0007795846494725665, + "loss": 0.0675, + "num_input_tokens_seen": 100550176, + "step": 46590 + }, + { + "epoch": 7.601141924959217, + "grad_norm": 0.19184695184230804, + "learning_rate": 0.00077952563475904, + "loss": 0.0813, + "num_input_tokens_seen": 100561216, + "step": 46595 + }, + { + "epoch": 7.601957585644372, + "grad_norm": 0.010343669913709164, + "learning_rate": 0.000779466614380572, + "loss": 0.0281, + "num_input_tokens_seen": 100571648, + "step": 46600 + }, + { + "epoch": 7.602773246329527, + "grad_norm": 0.1851484477519989, + "learning_rate": 0.0007794075883383586, + "loss": 0.1607, + "num_input_tokens_seen": 100583776, + "step": 46605 + }, + { + "epoch": 7.603588907014682, + "grad_norm": 0.016286205500364304, + "learning_rate": 0.0007793485566335958, + "loss": 0.0353, + "num_input_tokens_seen": 100592992, + "step": 46610 + }, + { + "epoch": 7.604404567699837, + "grad_norm": 0.009325031191110611, + "learning_rate": 0.0007792895192674802, + "loss": 0.056, + "num_input_tokens_seen": 100603680, + "step": 46615 + }, + { + "epoch": 7.605220228384992, + "grad_norm": 0.20486290752887726, + "learning_rate": 0.0007792304762412084, + "loss": 0.2021, + "num_input_tokens_seen": 100615808, + "step": 46620 + }, + { + "epoch": 7.606035889070147, + "grad_norm": 0.10156647861003876, + "learning_rate": 0.0007791714275559765, + "loss": 0.0588, + "num_input_tokens_seen": 100625696, + "step": 46625 + }, + { + "epoch": 7.6068515497553015, + "grad_norm": 0.10061921924352646, + "learning_rate": 0.0007791123732129815, + "loss": 0.0562, + "num_input_tokens_seen": 100635968, + "step": 46630 + }, + { + "epoch": 7.607667210440457, + "grad_norm": 0.017044829204678535, + "learning_rate": 0.0007790533132134201, + "loss": 0.0931, + "num_input_tokens_seen": 100647104, + "step": 46635 + }, + { + "epoch": 7.608482871125612, + "grad_norm": 0.03325619548559189, + "learning_rate": 0.0007789942475584894, + "loss": 0.0179, + "num_input_tokens_seen": 100656992, + "step": 46640 + }, + { + "epoch": 7.609298531810767, + "grad_norm": 0.17666223645210266, + "learning_rate": 0.0007789351762493865, + "loss": 0.0882, + "num_input_tokens_seen": 100668288, + "step": 46645 + }, + { + "epoch": 7.610114192495922, + "grad_norm": 0.024186953902244568, + "learning_rate": 0.0007788760992873083, + "loss": 0.1074, + "num_input_tokens_seen": 100680736, + "step": 46650 + }, + { + "epoch": 7.6109298531810765, + "grad_norm": 0.020467402413487434, + "learning_rate": 0.000778817016673452, + "loss": 0.0288, + "num_input_tokens_seen": 100692000, + "step": 46655 + }, + { + "epoch": 7.611745513866231, + "grad_norm": 0.19253192842006683, + "learning_rate": 0.0007787579284090154, + "loss": 0.1421, + "num_input_tokens_seen": 100701440, + "step": 46660 + }, + { + "epoch": 7.612561174551386, + "grad_norm": 0.07677418738603592, + "learning_rate": 0.0007786988344951956, + "loss": 0.0854, + "num_input_tokens_seen": 100712224, + "step": 46665 + }, + { + "epoch": 7.613376835236542, + "grad_norm": 0.03219592943787575, + "learning_rate": 0.0007786397349331904, + "loss": 0.0383, + "num_input_tokens_seen": 100722912, + "step": 46670 + }, + { + "epoch": 7.614192495921697, + "grad_norm": 0.07338977605104446, + "learning_rate": 0.0007785806297241976, + "loss": 0.0979, + "num_input_tokens_seen": 100732384, + "step": 46675 + }, + { + "epoch": 7.6150081566068515, + "grad_norm": 0.21995481848716736, + "learning_rate": 0.0007785215188694148, + "loss": 0.1471, + "num_input_tokens_seen": 100743328, + "step": 46680 + }, + { + "epoch": 7.615823817292006, + "grad_norm": 0.11532082408666611, + "learning_rate": 0.0007784624023700402, + "loss": 0.1009, + "num_input_tokens_seen": 100754944, + "step": 46685 + }, + { + "epoch": 7.616639477977161, + "grad_norm": 0.03261370211839676, + "learning_rate": 0.0007784032802272716, + "loss": 0.072, + "num_input_tokens_seen": 100767200, + "step": 46690 + }, + { + "epoch": 7.617455138662317, + "grad_norm": 0.08721105754375458, + "learning_rate": 0.0007783441524423074, + "loss": 0.0701, + "num_input_tokens_seen": 100777408, + "step": 46695 + }, + { + "epoch": 7.618270799347472, + "grad_norm": 0.042922139167785645, + "learning_rate": 0.0007782850190163459, + "loss": 0.1184, + "num_input_tokens_seen": 100787584, + "step": 46700 + }, + { + "epoch": 7.6190864600326265, + "grad_norm": 0.016226772218942642, + "learning_rate": 0.0007782258799505855, + "loss": 0.0828, + "num_input_tokens_seen": 100796800, + "step": 46705 + }, + { + "epoch": 7.619902120717781, + "grad_norm": 0.04629151150584221, + "learning_rate": 0.0007781667352462245, + "loss": 0.0392, + "num_input_tokens_seen": 100806528, + "step": 46710 + }, + { + "epoch": 7.620717781402936, + "grad_norm": 0.09846754372119904, + "learning_rate": 0.0007781075849044619, + "loss": 0.0497, + "num_input_tokens_seen": 100817120, + "step": 46715 + }, + { + "epoch": 7.621533442088092, + "grad_norm": 0.3087049126625061, + "learning_rate": 0.0007780484289264961, + "loss": 0.2842, + "num_input_tokens_seen": 100828000, + "step": 46720 + }, + { + "epoch": 7.622349102773247, + "grad_norm": 0.009019329212605953, + "learning_rate": 0.0007779892673135264, + "loss": 0.0355, + "num_input_tokens_seen": 100839424, + "step": 46725 + }, + { + "epoch": 7.623164763458401, + "grad_norm": 0.33577847480773926, + "learning_rate": 0.0007779301000667516, + "loss": 0.2343, + "num_input_tokens_seen": 100849472, + "step": 46730 + }, + { + "epoch": 7.623980424143556, + "grad_norm": 0.14628195762634277, + "learning_rate": 0.0007778709271873706, + "loss": 0.055, + "num_input_tokens_seen": 100859232, + "step": 46735 + }, + { + "epoch": 7.624796084828711, + "grad_norm": 0.22046029567718506, + "learning_rate": 0.0007778117486765825, + "loss": 0.0552, + "num_input_tokens_seen": 100869888, + "step": 46740 + }, + { + "epoch": 7.625611745513866, + "grad_norm": 0.10369850695133209, + "learning_rate": 0.0007777525645355872, + "loss": 0.1577, + "num_input_tokens_seen": 100881440, + "step": 46745 + }, + { + "epoch": 7.626427406199021, + "grad_norm": 0.08013732731342316, + "learning_rate": 0.0007776933747655838, + "loss": 0.0623, + "num_input_tokens_seen": 100892160, + "step": 46750 + }, + { + "epoch": 7.627243066884176, + "grad_norm": 0.028127849102020264, + "learning_rate": 0.0007776341793677719, + "loss": 0.0542, + "num_input_tokens_seen": 100901280, + "step": 46755 + }, + { + "epoch": 7.628058727569331, + "grad_norm": 0.11611472070217133, + "learning_rate": 0.000777574978343351, + "loss": 0.1066, + "num_input_tokens_seen": 100913440, + "step": 46760 + }, + { + "epoch": 7.628874388254486, + "grad_norm": 0.004973673261702061, + "learning_rate": 0.000777515771693521, + "loss": 0.0572, + "num_input_tokens_seen": 100923840, + "step": 46765 + }, + { + "epoch": 7.629690048939641, + "grad_norm": 0.057666193693876266, + "learning_rate": 0.0007774565594194821, + "loss": 0.0386, + "num_input_tokens_seen": 100933824, + "step": 46770 + }, + { + "epoch": 7.630505709624796, + "grad_norm": 0.043347232043743134, + "learning_rate": 0.0007773973415224339, + "loss": 0.068, + "num_input_tokens_seen": 100943424, + "step": 46775 + }, + { + "epoch": 7.631321370309951, + "grad_norm": 0.02474125660955906, + "learning_rate": 0.0007773381180035766, + "loss": 0.1134, + "num_input_tokens_seen": 100955040, + "step": 46780 + }, + { + "epoch": 7.632137030995106, + "grad_norm": 0.1982913613319397, + "learning_rate": 0.0007772788888641107, + "loss": 0.0828, + "num_input_tokens_seen": 100966848, + "step": 46785 + }, + { + "epoch": 7.632952691680261, + "grad_norm": 0.006726962048560381, + "learning_rate": 0.0007772196541052361, + "loss": 0.0162, + "num_input_tokens_seen": 100977888, + "step": 46790 + }, + { + "epoch": 7.633768352365416, + "grad_norm": 0.053251706063747406, + "learning_rate": 0.0007771604137281538, + "loss": 0.0294, + "num_input_tokens_seen": 100988736, + "step": 46795 + }, + { + "epoch": 7.634584013050571, + "grad_norm": 0.014402381144464016, + "learning_rate": 0.0007771011677340639, + "loss": 0.0884, + "num_input_tokens_seen": 100999360, + "step": 46800 + }, + { + "epoch": 7.635399673735726, + "grad_norm": 0.007497976999729872, + "learning_rate": 0.0007770419161241675, + "loss": 0.0168, + "num_input_tokens_seen": 101011104, + "step": 46805 + }, + { + "epoch": 7.636215334420881, + "grad_norm": 0.10259098559617996, + "learning_rate": 0.0007769826588996651, + "loss": 0.0449, + "num_input_tokens_seen": 101021664, + "step": 46810 + }, + { + "epoch": 7.637030995106036, + "grad_norm": 0.008259537629783154, + "learning_rate": 0.0007769233960617576, + "loss": 0.0309, + "num_input_tokens_seen": 101032576, + "step": 46815 + }, + { + "epoch": 7.637846655791191, + "grad_norm": 0.177310049533844, + "learning_rate": 0.0007768641276116465, + "loss": 0.0227, + "num_input_tokens_seen": 101043040, + "step": 46820 + }, + { + "epoch": 7.638662316476346, + "grad_norm": 0.13302385807037354, + "learning_rate": 0.0007768048535505324, + "loss": 0.219, + "num_input_tokens_seen": 101054400, + "step": 46825 + }, + { + "epoch": 7.6394779771615005, + "grad_norm": 0.019431116059422493, + "learning_rate": 0.0007767455738796169, + "loss": 0.0131, + "num_input_tokens_seen": 101064896, + "step": 46830 + }, + { + "epoch": 7.640293637846656, + "grad_norm": 0.24176053702831268, + "learning_rate": 0.0007766862886001011, + "loss": 0.0475, + "num_input_tokens_seen": 101074848, + "step": 46835 + }, + { + "epoch": 7.641109298531811, + "grad_norm": 0.0011875767959281802, + "learning_rate": 0.0007766269977131868, + "loss": 0.1033, + "num_input_tokens_seen": 101085312, + "step": 46840 + }, + { + "epoch": 7.641924959216966, + "grad_norm": 0.008996953256428242, + "learning_rate": 0.0007765677012200753, + "loss": 0.0046, + "num_input_tokens_seen": 101095584, + "step": 46845 + }, + { + "epoch": 7.642740619902121, + "grad_norm": 0.013733599334955215, + "learning_rate": 0.0007765083991219688, + "loss": 0.0158, + "num_input_tokens_seen": 101105760, + "step": 46850 + }, + { + "epoch": 7.643556280587275, + "grad_norm": 0.01829490065574646, + "learning_rate": 0.0007764490914200686, + "loss": 0.0192, + "num_input_tokens_seen": 101116576, + "step": 46855 + }, + { + "epoch": 7.64437194127243, + "grad_norm": 0.007279630284756422, + "learning_rate": 0.0007763897781155769, + "loss": 0.0379, + "num_input_tokens_seen": 101127840, + "step": 46860 + }, + { + "epoch": 7.645187601957586, + "grad_norm": 0.009403941221535206, + "learning_rate": 0.0007763304592096956, + "loss": 0.1334, + "num_input_tokens_seen": 101140192, + "step": 46865 + }, + { + "epoch": 7.646003262642741, + "grad_norm": 0.1523141711950302, + "learning_rate": 0.0007762711347036273, + "loss": 0.0457, + "num_input_tokens_seen": 101150336, + "step": 46870 + }, + { + "epoch": 7.646818923327896, + "grad_norm": 0.011988531798124313, + "learning_rate": 0.0007762118045985738, + "loss": 0.0222, + "num_input_tokens_seen": 101161088, + "step": 46875 + }, + { + "epoch": 7.64763458401305, + "grad_norm": 0.030068593099713326, + "learning_rate": 0.0007761524688957377, + "loss": 0.012, + "num_input_tokens_seen": 101171776, + "step": 46880 + }, + { + "epoch": 7.648450244698205, + "grad_norm": 0.01157914474606514, + "learning_rate": 0.0007760931275963215, + "loss": 0.0861, + "num_input_tokens_seen": 101180544, + "step": 46885 + }, + { + "epoch": 7.649265905383361, + "grad_norm": 0.28757941722869873, + "learning_rate": 0.0007760337807015276, + "loss": 0.0236, + "num_input_tokens_seen": 101192448, + "step": 46890 + }, + { + "epoch": 7.650081566068516, + "grad_norm": 0.015597200952470303, + "learning_rate": 0.0007759744282125593, + "loss": 0.1327, + "num_input_tokens_seen": 101203168, + "step": 46895 + }, + { + "epoch": 7.650897226753671, + "grad_norm": 0.4055164158344269, + "learning_rate": 0.000775915070130619, + "loss": 0.0899, + "num_input_tokens_seen": 101213216, + "step": 46900 + }, + { + "epoch": 7.651712887438825, + "grad_norm": 0.009663230739533901, + "learning_rate": 0.0007758557064569096, + "loss": 0.0583, + "num_input_tokens_seen": 101224960, + "step": 46905 + }, + { + "epoch": 7.65252854812398, + "grad_norm": 0.29801616072654724, + "learning_rate": 0.0007757963371926346, + "loss": 0.0992, + "num_input_tokens_seen": 101235296, + "step": 46910 + }, + { + "epoch": 7.653344208809135, + "grad_norm": 0.350406676530838, + "learning_rate": 0.000775736962338997, + "loss": 0.0315, + "num_input_tokens_seen": 101246784, + "step": 46915 + }, + { + "epoch": 7.654159869494291, + "grad_norm": 0.019528506323695183, + "learning_rate": 0.0007756775818971998, + "loss": 0.1, + "num_input_tokens_seen": 101257440, + "step": 46920 + }, + { + "epoch": 7.6549755301794455, + "grad_norm": 0.006236244924366474, + "learning_rate": 0.0007756181958684467, + "loss": 0.042, + "num_input_tokens_seen": 101268672, + "step": 46925 + }, + { + "epoch": 7.6557911908646, + "grad_norm": 0.00803590938448906, + "learning_rate": 0.0007755588042539414, + "loss": 0.025, + "num_input_tokens_seen": 101279168, + "step": 46930 + }, + { + "epoch": 7.656606851549755, + "grad_norm": 0.3264743983745575, + "learning_rate": 0.0007754994070548873, + "loss": 0.1989, + "num_input_tokens_seen": 101290336, + "step": 46935 + }, + { + "epoch": 7.65742251223491, + "grad_norm": 0.01258645486086607, + "learning_rate": 0.0007754400042724881, + "loss": 0.1332, + "num_input_tokens_seen": 101301344, + "step": 46940 + }, + { + "epoch": 7.658238172920065, + "grad_norm": 0.03560754656791687, + "learning_rate": 0.0007753805959079481, + "loss": 0.0943, + "num_input_tokens_seen": 101312160, + "step": 46945 + }, + { + "epoch": 7.6590538336052205, + "grad_norm": 0.04574985429644585, + "learning_rate": 0.0007753211819624706, + "loss": 0.0898, + "num_input_tokens_seen": 101322720, + "step": 46950 + }, + { + "epoch": 7.659869494290375, + "grad_norm": 0.062320858240127563, + "learning_rate": 0.0007752617624372602, + "loss": 0.0307, + "num_input_tokens_seen": 101334048, + "step": 46955 + }, + { + "epoch": 7.66068515497553, + "grad_norm": 0.2878982424736023, + "learning_rate": 0.000775202337333521, + "loss": 0.1865, + "num_input_tokens_seen": 101345760, + "step": 46960 + }, + { + "epoch": 7.661500815660685, + "grad_norm": 0.17490990459918976, + "learning_rate": 0.0007751429066524575, + "loss": 0.0835, + "num_input_tokens_seen": 101356416, + "step": 46965 + }, + { + "epoch": 7.66231647634584, + "grad_norm": 0.2936611473560333, + "learning_rate": 0.0007750834703952738, + "loss": 0.1743, + "num_input_tokens_seen": 101367360, + "step": 46970 + }, + { + "epoch": 7.6631321370309955, + "grad_norm": 0.18206502497196198, + "learning_rate": 0.0007750240285631745, + "loss": 0.0755, + "num_input_tokens_seen": 101378848, + "step": 46975 + }, + { + "epoch": 7.66394779771615, + "grad_norm": 0.008685864508152008, + "learning_rate": 0.0007749645811573646, + "loss": 0.025, + "num_input_tokens_seen": 101389184, + "step": 46980 + }, + { + "epoch": 7.664763458401305, + "grad_norm": 0.10776425898075104, + "learning_rate": 0.0007749051281790484, + "loss": 0.0811, + "num_input_tokens_seen": 101399840, + "step": 46985 + }, + { + "epoch": 7.66557911908646, + "grad_norm": 0.1290866881608963, + "learning_rate": 0.0007748456696294312, + "loss": 0.197, + "num_input_tokens_seen": 101410112, + "step": 46990 + }, + { + "epoch": 7.666394779771615, + "grad_norm": 0.16020077466964722, + "learning_rate": 0.0007747862055097179, + "loss": 0.0976, + "num_input_tokens_seen": 101420224, + "step": 46995 + }, + { + "epoch": 7.6672104404567705, + "grad_norm": 0.1934269815683365, + "learning_rate": 0.0007747267358211135, + "loss": 0.0454, + "num_input_tokens_seen": 101430080, + "step": 47000 + }, + { + "epoch": 7.668026101141925, + "grad_norm": 0.03580223023891449, + "learning_rate": 0.0007746672605648231, + "loss": 0.0185, + "num_input_tokens_seen": 101441856, + "step": 47005 + }, + { + "epoch": 7.66884176182708, + "grad_norm": 0.24266868829727173, + "learning_rate": 0.0007746077797420524, + "loss": 0.1602, + "num_input_tokens_seen": 101451232, + "step": 47010 + }, + { + "epoch": 7.669657422512235, + "grad_norm": 0.050518471747636795, + "learning_rate": 0.0007745482933540067, + "loss": 0.0967, + "num_input_tokens_seen": 101461792, + "step": 47015 + }, + { + "epoch": 7.67047308319739, + "grad_norm": 0.03776390478014946, + "learning_rate": 0.0007744888014018914, + "loss": 0.1361, + "num_input_tokens_seen": 101472096, + "step": 47020 + }, + { + "epoch": 7.671288743882545, + "grad_norm": 0.0736125186085701, + "learning_rate": 0.0007744293038869125, + "loss": 0.0331, + "num_input_tokens_seen": 101482784, + "step": 47025 + }, + { + "epoch": 7.672104404567699, + "grad_norm": 0.10928630083799362, + "learning_rate": 0.0007743698008102755, + "loss": 0.0642, + "num_input_tokens_seen": 101493248, + "step": 47030 + }, + { + "epoch": 7.672920065252855, + "grad_norm": 0.016964375972747803, + "learning_rate": 0.0007743102921731864, + "loss": 0.0337, + "num_input_tokens_seen": 101504128, + "step": 47035 + }, + { + "epoch": 7.67373572593801, + "grad_norm": 0.08498941361904144, + "learning_rate": 0.0007742507779768513, + "loss": 0.0536, + "num_input_tokens_seen": 101515808, + "step": 47040 + }, + { + "epoch": 7.674551386623165, + "grad_norm": 0.0076843444257974625, + "learning_rate": 0.0007741912582224764, + "loss": 0.0975, + "num_input_tokens_seen": 101526976, + "step": 47045 + }, + { + "epoch": 7.6753670473083195, + "grad_norm": 0.015889909118413925, + "learning_rate": 0.0007741317329112675, + "loss": 0.0182, + "num_input_tokens_seen": 101537440, + "step": 47050 + }, + { + "epoch": 7.676182707993474, + "grad_norm": 0.04796231538057327, + "learning_rate": 0.0007740722020444315, + "loss": 0.1469, + "num_input_tokens_seen": 101549280, + "step": 47055 + }, + { + "epoch": 7.67699836867863, + "grad_norm": 0.0197526216506958, + "learning_rate": 0.0007740126656231746, + "loss": 0.0308, + "num_input_tokens_seen": 101558720, + "step": 47060 + }, + { + "epoch": 7.677814029363785, + "grad_norm": 0.02961111254990101, + "learning_rate": 0.0007739531236487034, + "loss": 0.0871, + "num_input_tokens_seen": 101570400, + "step": 47065 + }, + { + "epoch": 7.67862969004894, + "grad_norm": 0.18880467116832733, + "learning_rate": 0.0007738935761222247, + "loss": 0.0509, + "num_input_tokens_seen": 101581472, + "step": 47070 + }, + { + "epoch": 7.6794453507340945, + "grad_norm": 0.0316985622048378, + "learning_rate": 0.0007738340230449451, + "loss": 0.0286, + "num_input_tokens_seen": 101592480, + "step": 47075 + }, + { + "epoch": 7.680261011419249, + "grad_norm": 0.27232664823532104, + "learning_rate": 0.0007737744644180718, + "loss": 0.0899, + "num_input_tokens_seen": 101601952, + "step": 47080 + }, + { + "epoch": 7.681076672104405, + "grad_norm": 0.11677951365709305, + "learning_rate": 0.0007737149002428114, + "loss": 0.0472, + "num_input_tokens_seen": 101612384, + "step": 47085 + }, + { + "epoch": 7.68189233278956, + "grad_norm": 0.11763057857751846, + "learning_rate": 0.0007736553305203715, + "loss": 0.0483, + "num_input_tokens_seen": 101623872, + "step": 47090 + }, + { + "epoch": 7.682707993474715, + "grad_norm": 0.026809118688106537, + "learning_rate": 0.0007735957552519592, + "loss": 0.1359, + "num_input_tokens_seen": 101635072, + "step": 47095 + }, + { + "epoch": 7.6835236541598695, + "grad_norm": 0.07621929049491882, + "learning_rate": 0.0007735361744387818, + "loss": 0.1277, + "num_input_tokens_seen": 101644896, + "step": 47100 + }, + { + "epoch": 7.684339314845024, + "grad_norm": 0.006795486435294151, + "learning_rate": 0.0007734765880820468, + "loss": 0.0333, + "num_input_tokens_seen": 101655584, + "step": 47105 + }, + { + "epoch": 7.685154975530179, + "grad_norm": 0.010941782966256142, + "learning_rate": 0.0007734169961829618, + "loss": 0.0533, + "num_input_tokens_seen": 101666912, + "step": 47110 + }, + { + "epoch": 7.685970636215334, + "grad_norm": 0.018587803468108177, + "learning_rate": 0.0007733573987427346, + "loss": 0.0224, + "num_input_tokens_seen": 101676768, + "step": 47115 + }, + { + "epoch": 7.68678629690049, + "grad_norm": 0.03765375167131424, + "learning_rate": 0.0007732977957625729, + "loss": 0.0558, + "num_input_tokens_seen": 101687936, + "step": 47120 + }, + { + "epoch": 7.6876019575856445, + "grad_norm": 0.1699625849723816, + "learning_rate": 0.0007732381872436846, + "loss": 0.1816, + "num_input_tokens_seen": 101698592, + "step": 47125 + }, + { + "epoch": 7.688417618270799, + "grad_norm": 0.15257039666175842, + "learning_rate": 0.0007731785731872778, + "loss": 0.1122, + "num_input_tokens_seen": 101709312, + "step": 47130 + }, + { + "epoch": 7.689233278955954, + "grad_norm": 0.040049903094768524, + "learning_rate": 0.0007731189535945609, + "loss": 0.1104, + "num_input_tokens_seen": 101720512, + "step": 47135 + }, + { + "epoch": 7.690048939641109, + "grad_norm": 0.003142146160826087, + "learning_rate": 0.0007730593284667416, + "loss": 0.1097, + "num_input_tokens_seen": 101732640, + "step": 47140 + }, + { + "epoch": 7.690864600326265, + "grad_norm": 0.01368060614913702, + "learning_rate": 0.0007729996978050287, + "loss": 0.0751, + "num_input_tokens_seen": 101742400, + "step": 47145 + }, + { + "epoch": 7.691680261011419, + "grad_norm": 0.019158734008669853, + "learning_rate": 0.0007729400616106308, + "loss": 0.2062, + "num_input_tokens_seen": 101751936, + "step": 47150 + }, + { + "epoch": 7.692495921696574, + "grad_norm": 0.14647071063518524, + "learning_rate": 0.0007728804198847561, + "loss": 0.0308, + "num_input_tokens_seen": 101763136, + "step": 47155 + }, + { + "epoch": 7.693311582381729, + "grad_norm": 0.13443398475646973, + "learning_rate": 0.0007728207726286136, + "loss": 0.0435, + "num_input_tokens_seen": 101773440, + "step": 47160 + }, + { + "epoch": 7.694127243066884, + "grad_norm": 0.33008673787117004, + "learning_rate": 0.000772761119843412, + "loss": 0.1381, + "num_input_tokens_seen": 101783264, + "step": 47165 + }, + { + "epoch": 7.69494290375204, + "grad_norm": 0.028178369626402855, + "learning_rate": 0.0007727014615303602, + "loss": 0.0793, + "num_input_tokens_seen": 101793856, + "step": 47170 + }, + { + "epoch": 7.695758564437194, + "grad_norm": 0.015002277679741383, + "learning_rate": 0.0007726417976906674, + "loss": 0.0892, + "num_input_tokens_seen": 101804832, + "step": 47175 + }, + { + "epoch": 7.696574225122349, + "grad_norm": 0.10893011093139648, + "learning_rate": 0.0007725821283255427, + "loss": 0.0884, + "num_input_tokens_seen": 101815808, + "step": 47180 + }, + { + "epoch": 7.697389885807504, + "grad_norm": 0.00839167833328247, + "learning_rate": 0.0007725224534361955, + "loss": 0.0983, + "num_input_tokens_seen": 101825120, + "step": 47185 + }, + { + "epoch": 7.698205546492659, + "grad_norm": 0.02416566200554371, + "learning_rate": 0.000772462773023835, + "loss": 0.0371, + "num_input_tokens_seen": 101836416, + "step": 47190 + }, + { + "epoch": 7.699021207177814, + "grad_norm": 0.02070494368672371, + "learning_rate": 0.0007724030870896707, + "loss": 0.0287, + "num_input_tokens_seen": 101846240, + "step": 47195 + }, + { + "epoch": 7.699836867862969, + "grad_norm": 0.040686871856451035, + "learning_rate": 0.0007723433956349123, + "loss": 0.0759, + "num_input_tokens_seen": 101857152, + "step": 47200 + }, + { + "epoch": 7.700652528548124, + "grad_norm": 0.02595238760113716, + "learning_rate": 0.0007722836986607696, + "loss": 0.0254, + "num_input_tokens_seen": 101867296, + "step": 47205 + }, + { + "epoch": 7.701468189233279, + "grad_norm": 0.00969753134995699, + "learning_rate": 0.000772223996168452, + "loss": 0.028, + "num_input_tokens_seen": 101876000, + "step": 47210 + }, + { + "epoch": 7.702283849918434, + "grad_norm": 0.18852238357067108, + "learning_rate": 0.0007721642881591701, + "loss": 0.0242, + "num_input_tokens_seen": 101886080, + "step": 47215 + }, + { + "epoch": 7.703099510603589, + "grad_norm": 0.027744583785533905, + "learning_rate": 0.0007721045746341335, + "loss": 0.1115, + "num_input_tokens_seen": 101896480, + "step": 47220 + }, + { + "epoch": 7.7039151712887435, + "grad_norm": 0.0012571450788527727, + "learning_rate": 0.0007720448555945527, + "loss": 0.0834, + "num_input_tokens_seen": 101907136, + "step": 47225 + }, + { + "epoch": 7.704730831973899, + "grad_norm": 0.0048811654560267925, + "learning_rate": 0.0007719851310416376, + "loss": 0.0351, + "num_input_tokens_seen": 101918624, + "step": 47230 + }, + { + "epoch": 7.705546492659054, + "grad_norm": 0.018812011927366257, + "learning_rate": 0.0007719254009765988, + "loss": 0.0659, + "num_input_tokens_seen": 101929408, + "step": 47235 + }, + { + "epoch": 7.706362153344209, + "grad_norm": 0.008872810751199722, + "learning_rate": 0.0007718656654006469, + "loss": 0.0675, + "num_input_tokens_seen": 101940736, + "step": 47240 + }, + { + "epoch": 7.707177814029364, + "grad_norm": 0.04965772479772568, + "learning_rate": 0.0007718059243149921, + "loss": 0.018, + "num_input_tokens_seen": 101950848, + "step": 47245 + }, + { + "epoch": 7.7079934747145185, + "grad_norm": 0.0060510290786623955, + "learning_rate": 0.0007717461777208458, + "loss": 0.0721, + "num_input_tokens_seen": 101961376, + "step": 47250 + }, + { + "epoch": 7.708809135399674, + "grad_norm": 0.012217323295772076, + "learning_rate": 0.0007716864256194182, + "loss": 0.0351, + "num_input_tokens_seen": 101972448, + "step": 47255 + }, + { + "epoch": 7.709624796084829, + "grad_norm": 0.0449352040886879, + "learning_rate": 0.0007716266680119207, + "loss": 0.0371, + "num_input_tokens_seen": 101983712, + "step": 47260 + }, + { + "epoch": 7.710440456769984, + "grad_norm": 0.018092213198542595, + "learning_rate": 0.0007715669048995641, + "loss": 0.0384, + "num_input_tokens_seen": 101993792, + "step": 47265 + }, + { + "epoch": 7.711256117455139, + "grad_norm": 0.01799091137945652, + "learning_rate": 0.0007715071362835597, + "loss": 0.0832, + "num_input_tokens_seen": 102004800, + "step": 47270 + }, + { + "epoch": 7.712071778140293, + "grad_norm": 0.06081044673919678, + "learning_rate": 0.0007714473621651188, + "loss": 0.1236, + "num_input_tokens_seen": 102015936, + "step": 47275 + }, + { + "epoch": 7.712887438825448, + "grad_norm": 0.006354760844260454, + "learning_rate": 0.0007713875825454526, + "loss": 0.1744, + "num_input_tokens_seen": 102026592, + "step": 47280 + }, + { + "epoch": 7.713703099510604, + "grad_norm": 0.15061309933662415, + "learning_rate": 0.0007713277974257729, + "loss": 0.0604, + "num_input_tokens_seen": 102036448, + "step": 47285 + }, + { + "epoch": 7.714518760195759, + "grad_norm": 0.23491689562797546, + "learning_rate": 0.0007712680068072911, + "loss": 0.2595, + "num_input_tokens_seen": 102048800, + "step": 47290 + }, + { + "epoch": 7.715334420880914, + "grad_norm": 0.009531409479677677, + "learning_rate": 0.000771208210691219, + "loss": 0.0288, + "num_input_tokens_seen": 102060000, + "step": 47295 + }, + { + "epoch": 7.716150081566068, + "grad_norm": 0.02414640039205551, + "learning_rate": 0.0007711484090787686, + "loss": 0.0358, + "num_input_tokens_seen": 102071104, + "step": 47300 + }, + { + "epoch": 7.716965742251223, + "grad_norm": 0.1658041775226593, + "learning_rate": 0.0007710886019711516, + "loss": 0.1099, + "num_input_tokens_seen": 102082880, + "step": 47305 + }, + { + "epoch": 7.717781402936378, + "grad_norm": 0.004195346962660551, + "learning_rate": 0.0007710287893695803, + "loss": 0.0219, + "num_input_tokens_seen": 102094816, + "step": 47310 + }, + { + "epoch": 7.718597063621534, + "grad_norm": 0.008565414696931839, + "learning_rate": 0.0007709689712752666, + "loss": 0.0846, + "num_input_tokens_seen": 102105632, + "step": 47315 + }, + { + "epoch": 7.719412724306689, + "grad_norm": 0.3287879526615143, + "learning_rate": 0.000770909147689423, + "loss": 0.1723, + "num_input_tokens_seen": 102115808, + "step": 47320 + }, + { + "epoch": 7.720228384991843, + "grad_norm": 0.13529710471630096, + "learning_rate": 0.000770849318613262, + "loss": 0.0834, + "num_input_tokens_seen": 102127392, + "step": 47325 + }, + { + "epoch": 7.721044045676998, + "grad_norm": 0.03702203184366226, + "learning_rate": 0.0007707894840479957, + "loss": 0.0424, + "num_input_tokens_seen": 102139136, + "step": 47330 + }, + { + "epoch": 7.721859706362153, + "grad_norm": 0.023298079147934914, + "learning_rate": 0.0007707296439948372, + "loss": 0.1257, + "num_input_tokens_seen": 102150688, + "step": 47335 + }, + { + "epoch": 7.722675367047309, + "grad_norm": 0.15577064454555511, + "learning_rate": 0.0007706697984549988, + "loss": 0.3274, + "num_input_tokens_seen": 102161568, + "step": 47340 + }, + { + "epoch": 7.7234910277324635, + "grad_norm": 0.16380055248737335, + "learning_rate": 0.0007706099474296938, + "loss": 0.0661, + "num_input_tokens_seen": 102173440, + "step": 47345 + }, + { + "epoch": 7.724306688417618, + "grad_norm": 0.009353193454444408, + "learning_rate": 0.0007705500909201349, + "loss": 0.0113, + "num_input_tokens_seen": 102185376, + "step": 47350 + }, + { + "epoch": 7.725122349102773, + "grad_norm": 0.018933897837996483, + "learning_rate": 0.0007704902289275351, + "loss": 0.0178, + "num_input_tokens_seen": 102195456, + "step": 47355 + }, + { + "epoch": 7.725938009787928, + "grad_norm": 0.17287416756153107, + "learning_rate": 0.0007704303614531076, + "loss": 0.0934, + "num_input_tokens_seen": 102206080, + "step": 47360 + }, + { + "epoch": 7.726753670473083, + "grad_norm": 0.2625787854194641, + "learning_rate": 0.0007703704884980659, + "loss": 0.1297, + "num_input_tokens_seen": 102216736, + "step": 47365 + }, + { + "epoch": 7.7275693311582385, + "grad_norm": 0.005955064669251442, + "learning_rate": 0.0007703106100636233, + "loss": 0.0242, + "num_input_tokens_seen": 102227904, + "step": 47370 + }, + { + "epoch": 7.728384991843393, + "grad_norm": 0.07390919327735901, + "learning_rate": 0.0007702507261509932, + "loss": 0.0238, + "num_input_tokens_seen": 102239200, + "step": 47375 + }, + { + "epoch": 7.729200652528548, + "grad_norm": 0.03744277358055115, + "learning_rate": 0.000770190836761389, + "loss": 0.0217, + "num_input_tokens_seen": 102251264, + "step": 47380 + }, + { + "epoch": 7.730016313213703, + "grad_norm": 0.01001934427767992, + "learning_rate": 0.0007701309418960252, + "loss": 0.0273, + "num_input_tokens_seen": 102262432, + "step": 47385 + }, + { + "epoch": 7.730831973898858, + "grad_norm": 0.06305497139692307, + "learning_rate": 0.000770071041556115, + "loss": 0.0663, + "num_input_tokens_seen": 102274912, + "step": 47390 + }, + { + "epoch": 7.731647634584013, + "grad_norm": 0.01765844225883484, + "learning_rate": 0.0007700111357428724, + "loss": 0.0933, + "num_input_tokens_seen": 102285696, + "step": 47395 + }, + { + "epoch": 7.732463295269168, + "grad_norm": 0.07994136214256287, + "learning_rate": 0.0007699512244575118, + "loss": 0.0217, + "num_input_tokens_seen": 102296160, + "step": 47400 + }, + { + "epoch": 7.733278955954323, + "grad_norm": 0.0871066227555275, + "learning_rate": 0.0007698913077012471, + "loss": 0.0779, + "num_input_tokens_seen": 102307712, + "step": 47405 + }, + { + "epoch": 7.734094616639478, + "grad_norm": 0.04768913611769676, + "learning_rate": 0.0007698313854752925, + "loss": 0.0606, + "num_input_tokens_seen": 102320032, + "step": 47410 + }, + { + "epoch": 7.734910277324633, + "grad_norm": 0.009936755523085594, + "learning_rate": 0.0007697714577808627, + "loss": 0.1833, + "num_input_tokens_seen": 102330656, + "step": 47415 + }, + { + "epoch": 7.735725938009788, + "grad_norm": 0.10741622745990753, + "learning_rate": 0.0007697115246191723, + "loss": 0.0962, + "num_input_tokens_seen": 102340704, + "step": 47420 + }, + { + "epoch": 7.736541598694943, + "grad_norm": 0.035204704850912094, + "learning_rate": 0.0007696515859914355, + "loss": 0.0529, + "num_input_tokens_seen": 102350752, + "step": 47425 + }, + { + "epoch": 7.737357259380098, + "grad_norm": 0.004631939344108105, + "learning_rate": 0.0007695916418988672, + "loss": 0.0825, + "num_input_tokens_seen": 102360800, + "step": 47430 + }, + { + "epoch": 7.738172920065253, + "grad_norm": 0.07401111721992493, + "learning_rate": 0.0007695316923426823, + "loss": 0.0473, + "num_input_tokens_seen": 102372096, + "step": 47435 + }, + { + "epoch": 7.738988580750408, + "grad_norm": 0.046925708651542664, + "learning_rate": 0.0007694717373240957, + "loss": 0.1792, + "num_input_tokens_seen": 102382112, + "step": 47440 + }, + { + "epoch": 7.739804241435563, + "grad_norm": 0.13744260370731354, + "learning_rate": 0.0007694117768443225, + "loss": 0.0549, + "num_input_tokens_seen": 102391392, + "step": 47445 + }, + { + "epoch": 7.740619902120718, + "grad_norm": 0.14652004837989807, + "learning_rate": 0.0007693518109045779, + "loss": 0.0377, + "num_input_tokens_seen": 102402720, + "step": 47450 + }, + { + "epoch": 7.741435562805873, + "grad_norm": 0.0048824031837284565, + "learning_rate": 0.0007692918395060772, + "loss": 0.2757, + "num_input_tokens_seen": 102412992, + "step": 47455 + }, + { + "epoch": 7.742251223491028, + "grad_norm": 0.09160833805799484, + "learning_rate": 0.0007692318626500357, + "loss": 0.2021, + "num_input_tokens_seen": 102424416, + "step": 47460 + }, + { + "epoch": 7.743066884176183, + "grad_norm": 0.057271476835012436, + "learning_rate": 0.000769171880337669, + "loss": 0.1258, + "num_input_tokens_seen": 102435296, + "step": 47465 + }, + { + "epoch": 7.7438825448613375, + "grad_norm": 0.11778069287538528, + "learning_rate": 0.0007691118925701927, + "loss": 0.1102, + "num_input_tokens_seen": 102445856, + "step": 47470 + }, + { + "epoch": 7.744698205546492, + "grad_norm": 0.01778312213718891, + "learning_rate": 0.0007690518993488225, + "loss": 0.0246, + "num_input_tokens_seen": 102457024, + "step": 47475 + }, + { + "epoch": 7.745513866231647, + "grad_norm": 0.16780374944210052, + "learning_rate": 0.0007689919006747741, + "loss": 0.0609, + "num_input_tokens_seen": 102467744, + "step": 47480 + }, + { + "epoch": 7.746329526916803, + "grad_norm": 0.09525378048419952, + "learning_rate": 0.0007689318965492637, + "loss": 0.0912, + "num_input_tokens_seen": 102477408, + "step": 47485 + }, + { + "epoch": 7.747145187601958, + "grad_norm": 0.058734066784381866, + "learning_rate": 0.0007688718869735072, + "loss": 0.0441, + "num_input_tokens_seen": 102487840, + "step": 47490 + }, + { + "epoch": 7.7479608482871125, + "grad_norm": 0.03938870504498482, + "learning_rate": 0.0007688118719487209, + "loss": 0.0465, + "num_input_tokens_seen": 102498624, + "step": 47495 + }, + { + "epoch": 7.748776508972267, + "grad_norm": 0.015664294362068176, + "learning_rate": 0.000768751851476121, + "loss": 0.0277, + "num_input_tokens_seen": 102509952, + "step": 47500 + }, + { + "epoch": 7.749592169657422, + "grad_norm": 0.00878231879323721, + "learning_rate": 0.0007686918255569238, + "loss": 0.0205, + "num_input_tokens_seen": 102521120, + "step": 47505 + }, + { + "epoch": 7.750407830342578, + "grad_norm": 0.01463279314339161, + "learning_rate": 0.000768631794192346, + "loss": 0.1627, + "num_input_tokens_seen": 102532832, + "step": 47510 + }, + { + "epoch": 7.751223491027733, + "grad_norm": 0.049709126353263855, + "learning_rate": 0.0007685717573836041, + "loss": 0.0394, + "num_input_tokens_seen": 102542752, + "step": 47515 + }, + { + "epoch": 7.7520391517128875, + "grad_norm": 0.018938735127449036, + "learning_rate": 0.0007685117151319148, + "loss": 0.0455, + "num_input_tokens_seen": 102553920, + "step": 47520 + }, + { + "epoch": 7.752854812398042, + "grad_norm": 0.07690119743347168, + "learning_rate": 0.000768451667438495, + "loss": 0.0407, + "num_input_tokens_seen": 102565088, + "step": 47525 + }, + { + "epoch": 7.753670473083197, + "grad_norm": 0.011308367364108562, + "learning_rate": 0.0007683916143045615, + "loss": 0.0331, + "num_input_tokens_seen": 102576704, + "step": 47530 + }, + { + "epoch": 7.754486133768353, + "grad_norm": 0.022574447095394135, + "learning_rate": 0.0007683315557313315, + "loss": 0.2233, + "num_input_tokens_seen": 102588000, + "step": 47535 + }, + { + "epoch": 7.755301794453508, + "grad_norm": 0.0635739266872406, + "learning_rate": 0.0007682714917200222, + "loss": 0.0603, + "num_input_tokens_seen": 102598144, + "step": 47540 + }, + { + "epoch": 7.7561174551386625, + "grad_norm": 0.005777337122708559, + "learning_rate": 0.0007682114222718507, + "loss": 0.0639, + "num_input_tokens_seen": 102608256, + "step": 47545 + }, + { + "epoch": 7.756933115823817, + "grad_norm": 0.023892994970083237, + "learning_rate": 0.0007681513473880345, + "loss": 0.042, + "num_input_tokens_seen": 102618528, + "step": 47550 + }, + { + "epoch": 7.757748776508972, + "grad_norm": 0.15535689890384674, + "learning_rate": 0.000768091267069791, + "loss": 0.1221, + "num_input_tokens_seen": 102629632, + "step": 47555 + }, + { + "epoch": 7.758564437194127, + "grad_norm": 0.03592200204730034, + "learning_rate": 0.000768031181318338, + "loss": 0.0225, + "num_input_tokens_seen": 102639712, + "step": 47560 + }, + { + "epoch": 7.759380097879282, + "grad_norm": 0.031805459409952164, + "learning_rate": 0.000767971090134893, + "loss": 0.0405, + "num_input_tokens_seen": 102650272, + "step": 47565 + }, + { + "epoch": 7.760195758564437, + "grad_norm": 0.0348435677587986, + "learning_rate": 0.0007679109935206741, + "loss": 0.036, + "num_input_tokens_seen": 102661216, + "step": 47570 + }, + { + "epoch": 7.761011419249592, + "grad_norm": 0.1940479874610901, + "learning_rate": 0.0007678508914768989, + "loss": 0.0523, + "num_input_tokens_seen": 102671200, + "step": 47575 + }, + { + "epoch": 7.761827079934747, + "grad_norm": 0.03355374559760094, + "learning_rate": 0.0007677907840047855, + "loss": 0.0413, + "num_input_tokens_seen": 102682144, + "step": 47580 + }, + { + "epoch": 7.762642740619902, + "grad_norm": 0.20242135226726532, + "learning_rate": 0.0007677306711055523, + "loss": 0.0864, + "num_input_tokens_seen": 102693376, + "step": 47585 + }, + { + "epoch": 7.763458401305057, + "grad_norm": 0.319576233625412, + "learning_rate": 0.0007676705527804173, + "loss": 0.0348, + "num_input_tokens_seen": 102704800, + "step": 47590 + }, + { + "epoch": 7.764274061990212, + "grad_norm": 0.22054317593574524, + "learning_rate": 0.000767610429030599, + "loss": 0.1637, + "num_input_tokens_seen": 102715776, + "step": 47595 + }, + { + "epoch": 7.765089722675367, + "grad_norm": 0.34517884254455566, + "learning_rate": 0.0007675502998573159, + "loss": 0.2325, + "num_input_tokens_seen": 102727232, + "step": 47600 + }, + { + "epoch": 7.765905383360522, + "grad_norm": 0.12015649676322937, + "learning_rate": 0.0007674901652617865, + "loss": 0.0942, + "num_input_tokens_seen": 102737568, + "step": 47605 + }, + { + "epoch": 7.766721044045677, + "grad_norm": 0.01218132022768259, + "learning_rate": 0.0007674300252452297, + "loss": 0.0811, + "num_input_tokens_seen": 102749728, + "step": 47610 + }, + { + "epoch": 7.767536704730832, + "grad_norm": 0.02160639502108097, + "learning_rate": 0.000767369879808864, + "loss": 0.0385, + "num_input_tokens_seen": 102760256, + "step": 47615 + }, + { + "epoch": 7.768352365415987, + "grad_norm": 0.009865098632872105, + "learning_rate": 0.0007673097289539086, + "loss": 0.0317, + "num_input_tokens_seen": 102771648, + "step": 47620 + }, + { + "epoch": 7.769168026101142, + "grad_norm": 0.02918362244963646, + "learning_rate": 0.0007672495726815825, + "loss": 0.1089, + "num_input_tokens_seen": 102782912, + "step": 47625 + }, + { + "epoch": 7.769983686786297, + "grad_norm": 0.022020578384399414, + "learning_rate": 0.0007671894109931048, + "loss": 0.0709, + "num_input_tokens_seen": 102795200, + "step": 47630 + }, + { + "epoch": 7.770799347471452, + "grad_norm": 0.008933918550610542, + "learning_rate": 0.0007671292438896946, + "loss": 0.1075, + "num_input_tokens_seen": 102805600, + "step": 47635 + }, + { + "epoch": 7.771615008156607, + "grad_norm": 0.16770917177200317, + "learning_rate": 0.0007670690713725715, + "loss": 0.045, + "num_input_tokens_seen": 102816800, + "step": 47640 + }, + { + "epoch": 7.7724306688417615, + "grad_norm": 0.008098525926470757, + "learning_rate": 0.0007670088934429548, + "loss": 0.0312, + "num_input_tokens_seen": 102827968, + "step": 47645 + }, + { + "epoch": 7.773246329526917, + "grad_norm": 0.003544506384059787, + "learning_rate": 0.0007669487101020642, + "loss": 0.0253, + "num_input_tokens_seen": 102838400, + "step": 47650 + }, + { + "epoch": 7.774061990212072, + "grad_norm": 0.14747589826583862, + "learning_rate": 0.0007668885213511193, + "loss": 0.0624, + "num_input_tokens_seen": 102848704, + "step": 47655 + }, + { + "epoch": 7.774877650897227, + "grad_norm": 0.10589897632598877, + "learning_rate": 0.0007668283271913399, + "loss": 0.0442, + "num_input_tokens_seen": 102859296, + "step": 47660 + }, + { + "epoch": 7.775693311582382, + "grad_norm": 0.030968595296144485, + "learning_rate": 0.000766768127623946, + "loss": 0.024, + "num_input_tokens_seen": 102870592, + "step": 47665 + }, + { + "epoch": 7.7765089722675365, + "grad_norm": 0.13514214754104614, + "learning_rate": 0.0007667079226501576, + "loss": 0.1071, + "num_input_tokens_seen": 102881568, + "step": 47670 + }, + { + "epoch": 7.777324632952691, + "grad_norm": 0.15891215205192566, + "learning_rate": 0.0007666477122711948, + "loss": 0.1197, + "num_input_tokens_seen": 102892320, + "step": 47675 + }, + { + "epoch": 7.778140293637847, + "grad_norm": 0.1431882679462433, + "learning_rate": 0.000766587496488278, + "loss": 0.2966, + "num_input_tokens_seen": 102903456, + "step": 47680 + }, + { + "epoch": 7.778955954323002, + "grad_norm": 0.019111640751361847, + "learning_rate": 0.0007665272753026271, + "loss": 0.0923, + "num_input_tokens_seen": 102914208, + "step": 47685 + }, + { + "epoch": 7.779771615008157, + "grad_norm": 0.3069877624511719, + "learning_rate": 0.000766467048715463, + "loss": 0.0781, + "num_input_tokens_seen": 102925152, + "step": 47690 + }, + { + "epoch": 7.780587275693311, + "grad_norm": 0.19489958882331848, + "learning_rate": 0.000766406816728006, + "loss": 0.1342, + "num_input_tokens_seen": 102936064, + "step": 47695 + }, + { + "epoch": 7.781402936378466, + "grad_norm": 0.053401701152324677, + "learning_rate": 0.000766346579341477, + "loss": 0.0616, + "num_input_tokens_seen": 102947200, + "step": 47700 + }, + { + "epoch": 7.782218597063622, + "grad_norm": 0.17794251441955566, + "learning_rate": 0.0007662863365570967, + "loss": 0.0764, + "num_input_tokens_seen": 102957216, + "step": 47705 + }, + { + "epoch": 7.783034257748777, + "grad_norm": 0.024267204105854034, + "learning_rate": 0.000766226088376086, + "loss": 0.0318, + "num_input_tokens_seen": 102968160, + "step": 47710 + }, + { + "epoch": 7.783849918433932, + "grad_norm": 0.02879796363413334, + "learning_rate": 0.0007661658347996659, + "loss": 0.0886, + "num_input_tokens_seen": 102978912, + "step": 47715 + }, + { + "epoch": 7.784665579119086, + "grad_norm": 0.052593715488910675, + "learning_rate": 0.0007661055758290574, + "loss": 0.3016, + "num_input_tokens_seen": 102989568, + "step": 47720 + }, + { + "epoch": 7.785481239804241, + "grad_norm": 0.013086318038403988, + "learning_rate": 0.0007660453114654819, + "loss": 0.0422, + "num_input_tokens_seen": 103000512, + "step": 47725 + }, + { + "epoch": 7.786296900489396, + "grad_norm": 0.06768391281366348, + "learning_rate": 0.0007659850417101606, + "loss": 0.0869, + "num_input_tokens_seen": 103012320, + "step": 47730 + }, + { + "epoch": 7.787112561174552, + "grad_norm": 0.015306583605706692, + "learning_rate": 0.0007659247665643151, + "loss": 0.0413, + "num_input_tokens_seen": 103023936, + "step": 47735 + }, + { + "epoch": 7.787928221859707, + "grad_norm": 0.23901499807834625, + "learning_rate": 0.0007658644860291668, + "loss": 0.0648, + "num_input_tokens_seen": 103033696, + "step": 47740 + }, + { + "epoch": 7.788743882544861, + "grad_norm": 0.3507566750049591, + "learning_rate": 0.0007658042001059373, + "loss": 0.0701, + "num_input_tokens_seen": 103045280, + "step": 47745 + }, + { + "epoch": 7.789559543230016, + "grad_norm": 0.00680531607940793, + "learning_rate": 0.0007657439087958486, + "loss": 0.0072, + "num_input_tokens_seen": 103055584, + "step": 47750 + }, + { + "epoch": 7.790375203915171, + "grad_norm": 0.18501955270767212, + "learning_rate": 0.0007656836121001225, + "loss": 0.0993, + "num_input_tokens_seen": 103066880, + "step": 47755 + }, + { + "epoch": 7.791190864600326, + "grad_norm": 0.008849621750414371, + "learning_rate": 0.0007656233100199809, + "loss": 0.0192, + "num_input_tokens_seen": 103077568, + "step": 47760 + }, + { + "epoch": 7.7920065252854815, + "grad_norm": 0.015436594374477863, + "learning_rate": 0.000765563002556646, + "loss": 0.0244, + "num_input_tokens_seen": 103088672, + "step": 47765 + }, + { + "epoch": 7.792822185970636, + "grad_norm": 0.04929293692111969, + "learning_rate": 0.00076550268971134, + "loss": 0.0206, + "num_input_tokens_seen": 103098496, + "step": 47770 + }, + { + "epoch": 7.793637846655791, + "grad_norm": 0.2269917130470276, + "learning_rate": 0.0007654423714852852, + "loss": 0.1171, + "num_input_tokens_seen": 103109856, + "step": 47775 + }, + { + "epoch": 7.794453507340946, + "grad_norm": 0.15163250267505646, + "learning_rate": 0.0007653820478797038, + "loss": 0.0397, + "num_input_tokens_seen": 103120032, + "step": 47780 + }, + { + "epoch": 7.795269168026101, + "grad_norm": 0.23630495369434357, + "learning_rate": 0.0007653217188958188, + "loss": 0.0543, + "num_input_tokens_seen": 103131072, + "step": 47785 + }, + { + "epoch": 7.7960848287112565, + "grad_norm": 0.2555839419364929, + "learning_rate": 0.0007652613845348524, + "loss": 0.1485, + "num_input_tokens_seen": 103142240, + "step": 47790 + }, + { + "epoch": 7.796900489396411, + "grad_norm": 0.2572099566459656, + "learning_rate": 0.0007652010447980276, + "loss": 0.132, + "num_input_tokens_seen": 103152960, + "step": 47795 + }, + { + "epoch": 7.797716150081566, + "grad_norm": 0.015411442145705223, + "learning_rate": 0.0007651406996865672, + "loss": 0.0695, + "num_input_tokens_seen": 103164352, + "step": 47800 + }, + { + "epoch": 7.798531810766721, + "grad_norm": 0.1217227578163147, + "learning_rate": 0.000765080349201694, + "loss": 0.085, + "num_input_tokens_seen": 103175680, + "step": 47805 + }, + { + "epoch": 7.799347471451876, + "grad_norm": 0.05562624707818031, + "learning_rate": 0.0007650199933446314, + "loss": 0.0658, + "num_input_tokens_seen": 103185952, + "step": 47810 + }, + { + "epoch": 7.800163132137031, + "grad_norm": 0.15175358951091766, + "learning_rate": 0.0007649596321166025, + "loss": 0.1246, + "num_input_tokens_seen": 103195840, + "step": 47815 + }, + { + "epoch": 7.800978792822186, + "grad_norm": 0.18305866420269012, + "learning_rate": 0.0007648992655188305, + "loss": 0.0519, + "num_input_tokens_seen": 103207104, + "step": 47820 + }, + { + "epoch": 7.801794453507341, + "grad_norm": 0.023939423263072968, + "learning_rate": 0.0007648388935525388, + "loss": 0.1008, + "num_input_tokens_seen": 103218624, + "step": 47825 + }, + { + "epoch": 7.802610114192496, + "grad_norm": 0.16830895841121674, + "learning_rate": 0.0007647785162189509, + "loss": 0.0659, + "num_input_tokens_seen": 103230592, + "step": 47830 + }, + { + "epoch": 7.803425774877651, + "grad_norm": 0.06158572435379028, + "learning_rate": 0.0007647181335192905, + "loss": 0.1829, + "num_input_tokens_seen": 103241984, + "step": 47835 + }, + { + "epoch": 7.804241435562806, + "grad_norm": 0.2995761036872864, + "learning_rate": 0.0007646577454547814, + "loss": 0.1113, + "num_input_tokens_seen": 103252800, + "step": 47840 + }, + { + "epoch": 7.80505709624796, + "grad_norm": 0.0030535554978996515, + "learning_rate": 0.0007645973520266472, + "loss": 0.0347, + "num_input_tokens_seen": 103265440, + "step": 47845 + }, + { + "epoch": 7.805872756933116, + "grad_norm": 0.05388334020972252, + "learning_rate": 0.000764536953236112, + "loss": 0.0648, + "num_input_tokens_seen": 103274976, + "step": 47850 + }, + { + "epoch": 7.806688417618271, + "grad_norm": 0.06490805000066757, + "learning_rate": 0.0007644765490844, + "loss": 0.0309, + "num_input_tokens_seen": 103285344, + "step": 47855 + }, + { + "epoch": 7.807504078303426, + "grad_norm": 0.01553364098072052, + "learning_rate": 0.0007644161395727352, + "loss": 0.0207, + "num_input_tokens_seen": 103296576, + "step": 47860 + }, + { + "epoch": 7.808319738988581, + "grad_norm": 0.01717471517622471, + "learning_rate": 0.0007643557247023418, + "loss": 0.1417, + "num_input_tokens_seen": 103306880, + "step": 47865 + }, + { + "epoch": 7.809135399673735, + "grad_norm": 0.08137975633144379, + "learning_rate": 0.0007642953044744443, + "loss": 0.0764, + "num_input_tokens_seen": 103317760, + "step": 47870 + }, + { + "epoch": 7.809951060358891, + "grad_norm": 0.030959924682974815, + "learning_rate": 0.0007642348788902672, + "loss": 0.0778, + "num_input_tokens_seen": 103328096, + "step": 47875 + }, + { + "epoch": 7.810766721044046, + "grad_norm": 0.14504072070121765, + "learning_rate": 0.000764174447951035, + "loss": 0.2175, + "num_input_tokens_seen": 103338560, + "step": 47880 + }, + { + "epoch": 7.811582381729201, + "grad_norm": 0.05535130947828293, + "learning_rate": 0.0007641140116579725, + "loss": 0.1647, + "num_input_tokens_seen": 103349824, + "step": 47885 + }, + { + "epoch": 7.8123980424143555, + "grad_norm": 0.021434113383293152, + "learning_rate": 0.0007640535700123047, + "loss": 0.0242, + "num_input_tokens_seen": 103361056, + "step": 47890 + }, + { + "epoch": 7.81321370309951, + "grad_norm": 0.008315632119774818, + "learning_rate": 0.000763993123015256, + "loss": 0.0585, + "num_input_tokens_seen": 103371520, + "step": 47895 + }, + { + "epoch": 7.814029363784666, + "grad_norm": 0.17355884611606598, + "learning_rate": 0.0007639326706680521, + "loss": 0.0705, + "num_input_tokens_seen": 103382784, + "step": 47900 + }, + { + "epoch": 7.814845024469821, + "grad_norm": 0.06749306619167328, + "learning_rate": 0.0007638722129719175, + "loss": 0.2469, + "num_input_tokens_seen": 103393376, + "step": 47905 + }, + { + "epoch": 7.815660685154976, + "grad_norm": 0.003625948214903474, + "learning_rate": 0.0007638117499280778, + "loss": 0.109, + "num_input_tokens_seen": 103404000, + "step": 47910 + }, + { + "epoch": 7.8164763458401305, + "grad_norm": 0.0020950506441295147, + "learning_rate": 0.0007637512815377585, + "loss": 0.0195, + "num_input_tokens_seen": 103414848, + "step": 47915 + }, + { + "epoch": 7.817292006525285, + "grad_norm": 0.003775825025513768, + "learning_rate": 0.0007636908078021848, + "loss": 0.1217, + "num_input_tokens_seen": 103426144, + "step": 47920 + }, + { + "epoch": 7.81810766721044, + "grad_norm": 0.011291631497442722, + "learning_rate": 0.0007636303287225823, + "loss": 0.088, + "num_input_tokens_seen": 103435584, + "step": 47925 + }, + { + "epoch": 7.818923327895595, + "grad_norm": 0.04091306030750275, + "learning_rate": 0.0007635698443001768, + "loss": 0.0297, + "num_input_tokens_seen": 103446464, + "step": 47930 + }, + { + "epoch": 7.819738988580751, + "grad_norm": 0.23727735877037048, + "learning_rate": 0.0007635093545361942, + "loss": 0.1059, + "num_input_tokens_seen": 103457248, + "step": 47935 + }, + { + "epoch": 7.8205546492659055, + "grad_norm": 0.2789122760295868, + "learning_rate": 0.00076344885943186, + "loss": 0.1151, + "num_input_tokens_seen": 103468128, + "step": 47940 + }, + { + "epoch": 7.82137030995106, + "grad_norm": 0.0022102808579802513, + "learning_rate": 0.0007633883589884007, + "loss": 0.0549, + "num_input_tokens_seen": 103478656, + "step": 47945 + }, + { + "epoch": 7.822185970636215, + "grad_norm": 0.09964501857757568, + "learning_rate": 0.000763327853207042, + "loss": 0.0516, + "num_input_tokens_seen": 103489472, + "step": 47950 + }, + { + "epoch": 7.82300163132137, + "grad_norm": 0.15040382742881775, + "learning_rate": 0.0007632673420890104, + "loss": 0.169, + "num_input_tokens_seen": 103500384, + "step": 47955 + }, + { + "epoch": 7.823817292006526, + "grad_norm": 0.17601391673088074, + "learning_rate": 0.000763206825635532, + "loss": 0.2146, + "num_input_tokens_seen": 103510432, + "step": 47960 + }, + { + "epoch": 7.8246329526916805, + "grad_norm": 0.021761463955044746, + "learning_rate": 0.0007631463038478334, + "loss": 0.1303, + "num_input_tokens_seen": 103520544, + "step": 47965 + }, + { + "epoch": 7.825448613376835, + "grad_norm": 0.1539314240217209, + "learning_rate": 0.0007630857767271413, + "loss": 0.1046, + "num_input_tokens_seen": 103531264, + "step": 47970 + }, + { + "epoch": 7.82626427406199, + "grad_norm": 0.060885198414325714, + "learning_rate": 0.000763025244274682, + "loss": 0.0423, + "num_input_tokens_seen": 103542176, + "step": 47975 + }, + { + "epoch": 7.827079934747145, + "grad_norm": 0.03094269521534443, + "learning_rate": 0.0007629647064916825, + "loss": 0.025, + "num_input_tokens_seen": 103552608, + "step": 47980 + }, + { + "epoch": 7.827895595432301, + "grad_norm": 0.01502175536006689, + "learning_rate": 0.0007629041633793696, + "loss": 0.1319, + "num_input_tokens_seen": 103563552, + "step": 47985 + }, + { + "epoch": 7.828711256117455, + "grad_norm": 0.01323753222823143, + "learning_rate": 0.0007628436149389703, + "loss": 0.0876, + "num_input_tokens_seen": 103575296, + "step": 47990 + }, + { + "epoch": 7.82952691680261, + "grad_norm": 0.06749841570854187, + "learning_rate": 0.000762783061171712, + "loss": 0.0456, + "num_input_tokens_seen": 103586304, + "step": 47995 + }, + { + "epoch": 7.830342577487765, + "grad_norm": 0.12266375124454498, + "learning_rate": 0.0007627225020788213, + "loss": 0.0986, + "num_input_tokens_seen": 103596768, + "step": 48000 + }, + { + "epoch": 7.83115823817292, + "grad_norm": 0.09488127380609512, + "learning_rate": 0.0007626619376615258, + "loss": 0.0974, + "num_input_tokens_seen": 103607808, + "step": 48005 + }, + { + "epoch": 7.831973898858075, + "grad_norm": 0.033919695764780045, + "learning_rate": 0.000762601367921053, + "loss": 0.0279, + "num_input_tokens_seen": 103618848, + "step": 48010 + }, + { + "epoch": 7.8327895595432295, + "grad_norm": 0.0929301306605339, + "learning_rate": 0.0007625407928586303, + "loss": 0.0458, + "num_input_tokens_seen": 103629600, + "step": 48015 + }, + { + "epoch": 7.833605220228385, + "grad_norm": 0.01206700038164854, + "learning_rate": 0.0007624802124754855, + "loss": 0.0401, + "num_input_tokens_seen": 103639744, + "step": 48020 + }, + { + "epoch": 7.83442088091354, + "grad_norm": 0.015365520492196083, + "learning_rate": 0.000762419626772846, + "loss": 0.0944, + "num_input_tokens_seen": 103649472, + "step": 48025 + }, + { + "epoch": 7.835236541598695, + "grad_norm": 0.22857356071472168, + "learning_rate": 0.0007623590357519401, + "loss": 0.1242, + "num_input_tokens_seen": 103659968, + "step": 48030 + }, + { + "epoch": 7.83605220228385, + "grad_norm": 0.17013679444789886, + "learning_rate": 0.0007622984394139953, + "loss": 0.0434, + "num_input_tokens_seen": 103671584, + "step": 48035 + }, + { + "epoch": 7.8368678629690045, + "grad_norm": 0.03199853375554085, + "learning_rate": 0.00076223783776024, + "loss": 0.1002, + "num_input_tokens_seen": 103681984, + "step": 48040 + }, + { + "epoch": 7.83768352365416, + "grad_norm": 0.06364502012729645, + "learning_rate": 0.0007621772307919022, + "loss": 0.1865, + "num_input_tokens_seen": 103692544, + "step": 48045 + }, + { + "epoch": 7.838499184339315, + "grad_norm": 0.006414428353309631, + "learning_rate": 0.0007621166185102104, + "loss": 0.0193, + "num_input_tokens_seen": 103703488, + "step": 48050 + }, + { + "epoch": 7.83931484502447, + "grad_norm": 0.0031967160757631063, + "learning_rate": 0.0007620560009163926, + "loss": 0.008, + "num_input_tokens_seen": 103714464, + "step": 48055 + }, + { + "epoch": 7.840130505709625, + "grad_norm": 0.018532024696469307, + "learning_rate": 0.0007619953780116775, + "loss": 0.019, + "num_input_tokens_seen": 103725184, + "step": 48060 + }, + { + "epoch": 7.8409461663947795, + "grad_norm": 0.27150070667266846, + "learning_rate": 0.0007619347497972937, + "loss": 0.0629, + "num_input_tokens_seen": 103735520, + "step": 48065 + }, + { + "epoch": 7.841761827079935, + "grad_norm": 0.2214997559785843, + "learning_rate": 0.00076187411627447, + "loss": 0.0906, + "num_input_tokens_seen": 103747072, + "step": 48070 + }, + { + "epoch": 7.84257748776509, + "grad_norm": 0.02284124679863453, + "learning_rate": 0.0007618134774444351, + "loss": 0.0093, + "num_input_tokens_seen": 103757632, + "step": 48075 + }, + { + "epoch": 7.843393148450245, + "grad_norm": 0.17375697195529938, + "learning_rate": 0.0007617528333084178, + "loss": 0.1061, + "num_input_tokens_seen": 103768576, + "step": 48080 + }, + { + "epoch": 7.8442088091354, + "grad_norm": 0.01205364428460598, + "learning_rate": 0.0007616921838676475, + "loss": 0.0623, + "num_input_tokens_seen": 103779840, + "step": 48085 + }, + { + "epoch": 7.8450244698205545, + "grad_norm": 0.00940660573542118, + "learning_rate": 0.0007616315291233531, + "loss": 0.1372, + "num_input_tokens_seen": 103789664, + "step": 48090 + }, + { + "epoch": 7.845840130505709, + "grad_norm": 0.0029068104922771454, + "learning_rate": 0.0007615708690767637, + "loss": 0.1193, + "num_input_tokens_seen": 103800832, + "step": 48095 + }, + { + "epoch": 7.846655791190865, + "grad_norm": 0.1633109599351883, + "learning_rate": 0.0007615102037291089, + "loss": 0.0378, + "num_input_tokens_seen": 103811616, + "step": 48100 + }, + { + "epoch": 7.84747145187602, + "grad_norm": 0.16224534809589386, + "learning_rate": 0.000761449533081618, + "loss": 0.2858, + "num_input_tokens_seen": 103822464, + "step": 48105 + }, + { + "epoch": 7.848287112561175, + "grad_norm": 0.16000252962112427, + "learning_rate": 0.0007613888571355208, + "loss": 0.081, + "num_input_tokens_seen": 103833984, + "step": 48110 + }, + { + "epoch": 7.849102773246329, + "grad_norm": 0.009912429377436638, + "learning_rate": 0.0007613281758920467, + "loss": 0.0187, + "num_input_tokens_seen": 103845664, + "step": 48115 + }, + { + "epoch": 7.849918433931484, + "grad_norm": 0.0428059883415699, + "learning_rate": 0.0007612674893524256, + "loss": 0.2497, + "num_input_tokens_seen": 103855936, + "step": 48120 + }, + { + "epoch": 7.850734094616639, + "grad_norm": 0.0610620379447937, + "learning_rate": 0.0007612067975178874, + "loss": 0.0163, + "num_input_tokens_seen": 103866304, + "step": 48125 + }, + { + "epoch": 7.851549755301795, + "grad_norm": 0.014108601026237011, + "learning_rate": 0.0007611461003896621, + "loss": 0.067, + "num_input_tokens_seen": 103877280, + "step": 48130 + }, + { + "epoch": 7.85236541598695, + "grad_norm": 0.04415108263492584, + "learning_rate": 0.0007610853979689797, + "loss": 0.0691, + "num_input_tokens_seen": 103888192, + "step": 48135 + }, + { + "epoch": 7.853181076672104, + "grad_norm": 0.05788084492087364, + "learning_rate": 0.0007610246902570706, + "loss": 0.0862, + "num_input_tokens_seen": 103897568, + "step": 48140 + }, + { + "epoch": 7.853996737357259, + "grad_norm": 0.15386447310447693, + "learning_rate": 0.000760963977255165, + "loss": 0.075, + "num_input_tokens_seen": 103908448, + "step": 48145 + }, + { + "epoch": 7.854812398042414, + "grad_norm": 0.09477708488702774, + "learning_rate": 0.0007609032589644934, + "loss": 0.0569, + "num_input_tokens_seen": 103917856, + "step": 48150 + }, + { + "epoch": 7.85562805872757, + "grad_norm": 0.004400565288960934, + "learning_rate": 0.0007608425353862863, + "loss": 0.0377, + "num_input_tokens_seen": 103929792, + "step": 48155 + }, + { + "epoch": 7.856443719412725, + "grad_norm": 0.32010260224342346, + "learning_rate": 0.000760781806521774, + "loss": 0.1827, + "num_input_tokens_seen": 103939488, + "step": 48160 + }, + { + "epoch": 7.857259380097879, + "grad_norm": 0.005035667680203915, + "learning_rate": 0.0007607210723721879, + "loss": 0.0298, + "num_input_tokens_seen": 103949600, + "step": 48165 + }, + { + "epoch": 7.858075040783034, + "grad_norm": 0.046824630349874496, + "learning_rate": 0.0007606603329387585, + "loss": 0.2197, + "num_input_tokens_seen": 103959936, + "step": 48170 + }, + { + "epoch": 7.858890701468189, + "grad_norm": 0.01573644019663334, + "learning_rate": 0.0007605995882227166, + "loss": 0.0297, + "num_input_tokens_seen": 103971136, + "step": 48175 + }, + { + "epoch": 7.859706362153344, + "grad_norm": 0.06723793596029282, + "learning_rate": 0.0007605388382252936, + "loss": 0.1221, + "num_input_tokens_seen": 103981760, + "step": 48180 + }, + { + "epoch": 7.8605220228384995, + "grad_norm": 0.06097792088985443, + "learning_rate": 0.0007604780829477205, + "loss": 0.0489, + "num_input_tokens_seen": 103992480, + "step": 48185 + }, + { + "epoch": 7.861337683523654, + "grad_norm": 0.10127381235361099, + "learning_rate": 0.0007604173223912285, + "loss": 0.0304, + "num_input_tokens_seen": 104002720, + "step": 48190 + }, + { + "epoch": 7.862153344208809, + "grad_norm": 0.14566576480865479, + "learning_rate": 0.0007603565565570493, + "loss": 0.078, + "num_input_tokens_seen": 104011968, + "step": 48195 + }, + { + "epoch": 7.862969004893964, + "grad_norm": 0.04649549722671509, + "learning_rate": 0.0007602957854464141, + "loss": 0.1464, + "num_input_tokens_seen": 104023328, + "step": 48200 + }, + { + "epoch": 7.863784665579119, + "grad_norm": 0.2787606418132782, + "learning_rate": 0.0007602350090605546, + "loss": 0.1261, + "num_input_tokens_seen": 104032896, + "step": 48205 + }, + { + "epoch": 7.864600326264274, + "grad_norm": 0.1980082392692566, + "learning_rate": 0.0007601742274007023, + "loss": 0.0664, + "num_input_tokens_seen": 104044128, + "step": 48210 + }, + { + "epoch": 7.865415986949429, + "grad_norm": 0.12827168405056, + "learning_rate": 0.0007601134404680894, + "loss": 0.0855, + "num_input_tokens_seen": 104055808, + "step": 48215 + }, + { + "epoch": 7.866231647634584, + "grad_norm": 0.005193778779357672, + "learning_rate": 0.0007600526482639477, + "loss": 0.1162, + "num_input_tokens_seen": 104066816, + "step": 48220 + }, + { + "epoch": 7.867047308319739, + "grad_norm": 0.12690505385398865, + "learning_rate": 0.0007599918507895092, + "loss": 0.0705, + "num_input_tokens_seen": 104078624, + "step": 48225 + }, + { + "epoch": 7.867862969004894, + "grad_norm": 0.003398684086278081, + "learning_rate": 0.000759931048046006, + "loss": 0.0208, + "num_input_tokens_seen": 104090592, + "step": 48230 + }, + { + "epoch": 7.868678629690049, + "grad_norm": 0.10064653307199478, + "learning_rate": 0.0007598702400346703, + "loss": 0.1684, + "num_input_tokens_seen": 104101376, + "step": 48235 + }, + { + "epoch": 7.869494290375204, + "grad_norm": 0.17004089057445526, + "learning_rate": 0.0007598094267567345, + "loss": 0.0957, + "num_input_tokens_seen": 104111968, + "step": 48240 + }, + { + "epoch": 7.870309951060359, + "grad_norm": 0.03696007654070854, + "learning_rate": 0.0007597486082134311, + "loss": 0.0257, + "num_input_tokens_seen": 104123616, + "step": 48245 + }, + { + "epoch": 7.871125611745514, + "grad_norm": 0.053766246885061264, + "learning_rate": 0.0007596877844059926, + "loss": 0.0465, + "num_input_tokens_seen": 104134176, + "step": 48250 + }, + { + "epoch": 7.871941272430669, + "grad_norm": 0.08471935987472534, + "learning_rate": 0.0007596269553356518, + "loss": 0.0326, + "num_input_tokens_seen": 104144704, + "step": 48255 + }, + { + "epoch": 7.872756933115824, + "grad_norm": 0.011327247135341167, + "learning_rate": 0.0007595661210036414, + "loss": 0.0127, + "num_input_tokens_seen": 104155648, + "step": 48260 + }, + { + "epoch": 7.873572593800979, + "grad_norm": 0.09601664543151855, + "learning_rate": 0.0007595052814111942, + "loss": 0.0504, + "num_input_tokens_seen": 104166080, + "step": 48265 + }, + { + "epoch": 7.874388254486134, + "grad_norm": 0.04323287680745125, + "learning_rate": 0.0007594444365595435, + "loss": 0.1202, + "num_input_tokens_seen": 104177152, + "step": 48270 + }, + { + "epoch": 7.875203915171289, + "grad_norm": 0.029387159273028374, + "learning_rate": 0.0007593835864499219, + "loss": 0.1539, + "num_input_tokens_seen": 104187104, + "step": 48275 + }, + { + "epoch": 7.876019575856444, + "grad_norm": 0.22904320061206818, + "learning_rate": 0.0007593227310835629, + "loss": 0.1936, + "num_input_tokens_seen": 104198176, + "step": 48280 + }, + { + "epoch": 7.876835236541599, + "grad_norm": 0.057843372225761414, + "learning_rate": 0.0007592618704616998, + "loss": 0.0871, + "num_input_tokens_seen": 104208896, + "step": 48285 + }, + { + "epoch": 7.877650897226753, + "grad_norm": 0.013300434686243534, + "learning_rate": 0.0007592010045855662, + "loss": 0.0352, + "num_input_tokens_seen": 104220512, + "step": 48290 + }, + { + "epoch": 7.878466557911908, + "grad_norm": 0.22537243366241455, + "learning_rate": 0.0007591401334563952, + "loss": 0.1926, + "num_input_tokens_seen": 104230816, + "step": 48295 + }, + { + "epoch": 7.879282218597064, + "grad_norm": 0.09585357457399368, + "learning_rate": 0.0007590792570754207, + "loss": 0.0322, + "num_input_tokens_seen": 104242240, + "step": 48300 + }, + { + "epoch": 7.880097879282219, + "grad_norm": 0.006919694133102894, + "learning_rate": 0.0007590183754438764, + "loss": 0.031, + "num_input_tokens_seen": 104253376, + "step": 48305 + }, + { + "epoch": 7.8809135399673735, + "grad_norm": 0.004928524140268564, + "learning_rate": 0.0007589574885629961, + "loss": 0.0246, + "num_input_tokens_seen": 104264864, + "step": 48310 + }, + { + "epoch": 7.881729200652528, + "grad_norm": 0.06503266841173172, + "learning_rate": 0.0007588965964340137, + "loss": 0.1113, + "num_input_tokens_seen": 104275712, + "step": 48315 + }, + { + "epoch": 7.882544861337683, + "grad_norm": 0.059820204973220825, + "learning_rate": 0.0007588356990581635, + "loss": 0.0282, + "num_input_tokens_seen": 104287328, + "step": 48320 + }, + { + "epoch": 7.883360522022839, + "grad_norm": 0.15940812230110168, + "learning_rate": 0.0007587747964366796, + "loss": 0.0179, + "num_input_tokens_seen": 104297344, + "step": 48325 + }, + { + "epoch": 7.884176182707994, + "grad_norm": 0.004403805825859308, + "learning_rate": 0.0007587138885707959, + "loss": 0.0121, + "num_input_tokens_seen": 104308992, + "step": 48330 + }, + { + "epoch": 7.8849918433931485, + "grad_norm": 0.01798327825963497, + "learning_rate": 0.000758652975461747, + "loss": 0.0997, + "num_input_tokens_seen": 104319616, + "step": 48335 + }, + { + "epoch": 7.885807504078303, + "grad_norm": 0.004846068099141121, + "learning_rate": 0.0007585920571107677, + "loss": 0.0375, + "num_input_tokens_seen": 104331520, + "step": 48340 + }, + { + "epoch": 7.886623164763458, + "grad_norm": 0.17932161688804626, + "learning_rate": 0.0007585311335190923, + "loss": 0.1288, + "num_input_tokens_seen": 104341664, + "step": 48345 + }, + { + "epoch": 7.887438825448614, + "grad_norm": 0.07531636953353882, + "learning_rate": 0.0007584702046879554, + "loss": 0.0898, + "num_input_tokens_seen": 104352640, + "step": 48350 + }, + { + "epoch": 7.888254486133769, + "grad_norm": 0.022938642650842667, + "learning_rate": 0.0007584092706185919, + "loss": 0.1144, + "num_input_tokens_seen": 104362496, + "step": 48355 + }, + { + "epoch": 7.8890701468189235, + "grad_norm": 0.008694291114807129, + "learning_rate": 0.0007583483313122368, + "loss": 0.1267, + "num_input_tokens_seen": 104372192, + "step": 48360 + }, + { + "epoch": 7.889885807504078, + "grad_norm": 0.003126397728919983, + "learning_rate": 0.000758287386770125, + "loss": 0.0433, + "num_input_tokens_seen": 104383168, + "step": 48365 + }, + { + "epoch": 7.890701468189233, + "grad_norm": 0.003002016805112362, + "learning_rate": 0.0007582264369934915, + "loss": 0.0493, + "num_input_tokens_seen": 104393632, + "step": 48370 + }, + { + "epoch": 7.891517128874388, + "grad_norm": 0.03168673440814018, + "learning_rate": 0.0007581654819835717, + "loss": 0.1316, + "num_input_tokens_seen": 104404096, + "step": 48375 + }, + { + "epoch": 7.892332789559543, + "grad_norm": 0.014192328788340092, + "learning_rate": 0.0007581045217416011, + "loss": 0.0587, + "num_input_tokens_seen": 104414592, + "step": 48380 + }, + { + "epoch": 7.8931484502446985, + "grad_norm": 0.13138961791992188, + "learning_rate": 0.0007580435562688148, + "loss": 0.1105, + "num_input_tokens_seen": 104424736, + "step": 48385 + }, + { + "epoch": 7.893964110929853, + "grad_norm": 0.12062573432922363, + "learning_rate": 0.0007579825855664486, + "loss": 0.0243, + "num_input_tokens_seen": 104434976, + "step": 48390 + }, + { + "epoch": 7.894779771615008, + "grad_norm": 0.0431981086730957, + "learning_rate": 0.0007579216096357378, + "loss": 0.1993, + "num_input_tokens_seen": 104445920, + "step": 48395 + }, + { + "epoch": 7.895595432300163, + "grad_norm": 0.03383784368634224, + "learning_rate": 0.0007578606284779185, + "loss": 0.1752, + "num_input_tokens_seen": 104455584, + "step": 48400 + }, + { + "epoch": 7.896411092985318, + "grad_norm": 0.08887404948472977, + "learning_rate": 0.0007577996420942266, + "loss": 0.0277, + "num_input_tokens_seen": 104466304, + "step": 48405 + }, + { + "epoch": 7.897226753670473, + "grad_norm": 0.017706193029880524, + "learning_rate": 0.0007577386504858978, + "loss": 0.0328, + "num_input_tokens_seen": 104476096, + "step": 48410 + }, + { + "epoch": 7.898042414355628, + "grad_norm": 0.017310122027993202, + "learning_rate": 0.0007576776536541682, + "loss": 0.0494, + "num_input_tokens_seen": 104486656, + "step": 48415 + }, + { + "epoch": 7.898858075040783, + "grad_norm": 0.037849392741918564, + "learning_rate": 0.0007576166516002741, + "loss": 0.0686, + "num_input_tokens_seen": 104497600, + "step": 48420 + }, + { + "epoch": 7.899673735725938, + "grad_norm": 0.3871559798717499, + "learning_rate": 0.0007575556443254518, + "loss": 0.1426, + "num_input_tokens_seen": 104508384, + "step": 48425 + }, + { + "epoch": 7.900489396411093, + "grad_norm": 0.02457922324538231, + "learning_rate": 0.0007574946318309376, + "loss": 0.0866, + "num_input_tokens_seen": 104519136, + "step": 48430 + }, + { + "epoch": 7.901305057096248, + "grad_norm": 0.051355913281440735, + "learning_rate": 0.000757433614117968, + "loss": 0.021, + "num_input_tokens_seen": 104529728, + "step": 48435 + }, + { + "epoch": 7.902120717781403, + "grad_norm": 0.008268642239272594, + "learning_rate": 0.0007573725911877797, + "loss": 0.1404, + "num_input_tokens_seen": 104541888, + "step": 48440 + }, + { + "epoch": 7.902936378466558, + "grad_norm": 0.010468416847288609, + "learning_rate": 0.0007573115630416092, + "loss": 0.0273, + "num_input_tokens_seen": 104552736, + "step": 48445 + }, + { + "epoch": 7.903752039151713, + "grad_norm": 0.17140138149261475, + "learning_rate": 0.0007572505296806935, + "loss": 0.1287, + "num_input_tokens_seen": 104562464, + "step": 48450 + }, + { + "epoch": 7.904567699836868, + "grad_norm": 0.027772270143032074, + "learning_rate": 0.0007571894911062696, + "loss": 0.0434, + "num_input_tokens_seen": 104573600, + "step": 48455 + }, + { + "epoch": 7.9053833605220225, + "grad_norm": 0.010094069875776768, + "learning_rate": 0.0007571284473195743, + "loss": 0.0364, + "num_input_tokens_seen": 104584800, + "step": 48460 + }, + { + "epoch": 7.906199021207177, + "grad_norm": 0.2739095687866211, + "learning_rate": 0.0007570673983218448, + "loss": 0.0995, + "num_input_tokens_seen": 104596192, + "step": 48465 + }, + { + "epoch": 7.907014681892333, + "grad_norm": 0.36304837465286255, + "learning_rate": 0.0007570063441143185, + "loss": 0.2079, + "num_input_tokens_seen": 104606976, + "step": 48470 + }, + { + "epoch": 7.907830342577488, + "grad_norm": 0.3222163915634155, + "learning_rate": 0.0007569452846982325, + "loss": 0.0844, + "num_input_tokens_seen": 104618304, + "step": 48475 + }, + { + "epoch": 7.908646003262643, + "grad_norm": 0.14590516686439514, + "learning_rate": 0.0007568842200748243, + "loss": 0.0342, + "num_input_tokens_seen": 104627840, + "step": 48480 + }, + { + "epoch": 7.9094616639477975, + "grad_norm": 0.14609917998313904, + "learning_rate": 0.0007568231502453317, + "loss": 0.0385, + "num_input_tokens_seen": 104639296, + "step": 48485 + }, + { + "epoch": 7.910277324632952, + "grad_norm": 0.086878702044487, + "learning_rate": 0.000756762075210992, + "loss": 0.0521, + "num_input_tokens_seen": 104648992, + "step": 48490 + }, + { + "epoch": 7.911092985318108, + "grad_norm": 0.014088406227529049, + "learning_rate": 0.0007567009949730431, + "loss": 0.0433, + "num_input_tokens_seen": 104659680, + "step": 48495 + }, + { + "epoch": 7.911908646003263, + "grad_norm": 0.003740956075489521, + "learning_rate": 0.000756639909532723, + "loss": 0.1165, + "num_input_tokens_seen": 104670688, + "step": 48500 + }, + { + "epoch": 7.912724306688418, + "grad_norm": 0.11189651489257812, + "learning_rate": 0.0007565788188912694, + "loss": 0.2164, + "num_input_tokens_seen": 104681600, + "step": 48505 + }, + { + "epoch": 7.9135399673735725, + "grad_norm": 0.06286628544330597, + "learning_rate": 0.0007565177230499206, + "loss": 0.1016, + "num_input_tokens_seen": 104692672, + "step": 48510 + }, + { + "epoch": 7.914355628058727, + "grad_norm": 0.007871391251683235, + "learning_rate": 0.0007564566220099147, + "loss": 0.0257, + "num_input_tokens_seen": 104703296, + "step": 48515 + }, + { + "epoch": 7.915171288743883, + "grad_norm": 0.025546282529830933, + "learning_rate": 0.00075639551577249, + "loss": 0.0203, + "num_input_tokens_seen": 104715648, + "step": 48520 + }, + { + "epoch": 7.915986949429038, + "grad_norm": 0.2044786661863327, + "learning_rate": 0.0007563344043388851, + "loss": 0.0973, + "num_input_tokens_seen": 104724864, + "step": 48525 + }, + { + "epoch": 7.916802610114193, + "grad_norm": 0.16749307513237, + "learning_rate": 0.0007562732877103382, + "loss": 0.0943, + "num_input_tokens_seen": 104734592, + "step": 48530 + }, + { + "epoch": 7.917618270799347, + "grad_norm": 0.22469349205493927, + "learning_rate": 0.000756212165888088, + "loss": 0.0599, + "num_input_tokens_seen": 104744992, + "step": 48535 + }, + { + "epoch": 7.918433931484502, + "grad_norm": 0.025696752592921257, + "learning_rate": 0.0007561510388733732, + "loss": 0.1073, + "num_input_tokens_seen": 104756512, + "step": 48540 + }, + { + "epoch": 7.919249592169657, + "grad_norm": 0.35504814982414246, + "learning_rate": 0.0007560899066674327, + "loss": 0.0894, + "num_input_tokens_seen": 104767840, + "step": 48545 + }, + { + "epoch": 7.920065252854813, + "grad_norm": 0.09544529020786285, + "learning_rate": 0.0007560287692715053, + "loss": 0.1228, + "num_input_tokens_seen": 104778944, + "step": 48550 + }, + { + "epoch": 7.920880913539968, + "grad_norm": 0.014338984154164791, + "learning_rate": 0.0007559676266868302, + "loss": 0.0426, + "num_input_tokens_seen": 104789088, + "step": 48555 + }, + { + "epoch": 7.921696574225122, + "grad_norm": 0.021478459239006042, + "learning_rate": 0.0007559064789146464, + "loss": 0.0381, + "num_input_tokens_seen": 104800032, + "step": 48560 + }, + { + "epoch": 7.922512234910277, + "grad_norm": 0.007439580280333757, + "learning_rate": 0.000755845325956193, + "loss": 0.0255, + "num_input_tokens_seen": 104810720, + "step": 48565 + }, + { + "epoch": 7.923327895595432, + "grad_norm": 0.009801163338124752, + "learning_rate": 0.0007557841678127097, + "loss": 0.0222, + "num_input_tokens_seen": 104820896, + "step": 48570 + }, + { + "epoch": 7.924143556280587, + "grad_norm": 0.005310403648763895, + "learning_rate": 0.0007557230044854357, + "loss": 0.1418, + "num_input_tokens_seen": 104831808, + "step": 48575 + }, + { + "epoch": 7.924959216965743, + "grad_norm": 0.004937141668051481, + "learning_rate": 0.0007556618359756107, + "loss": 0.0116, + "num_input_tokens_seen": 104841920, + "step": 48580 + }, + { + "epoch": 7.925774877650897, + "grad_norm": 0.007599037606269121, + "learning_rate": 0.0007556006622844742, + "loss": 0.013, + "num_input_tokens_seen": 104853408, + "step": 48585 + }, + { + "epoch": 7.926590538336052, + "grad_norm": 0.01367875374853611, + "learning_rate": 0.000755539483413266, + "loss": 0.1907, + "num_input_tokens_seen": 104862880, + "step": 48590 + }, + { + "epoch": 7.927406199021207, + "grad_norm": 0.029852217063307762, + "learning_rate": 0.0007554782993632259, + "loss": 0.1497, + "num_input_tokens_seen": 104873952, + "step": 48595 + }, + { + "epoch": 7.928221859706362, + "grad_norm": 0.006447079125791788, + "learning_rate": 0.0007554171101355941, + "loss": 0.0169, + "num_input_tokens_seen": 104884128, + "step": 48600 + }, + { + "epoch": 7.9290375203915175, + "grad_norm": 0.034852199256420135, + "learning_rate": 0.0007553559157316105, + "loss": 0.046, + "num_input_tokens_seen": 104895296, + "step": 48605 + }, + { + "epoch": 7.929853181076672, + "grad_norm": 0.014176536351442337, + "learning_rate": 0.0007552947161525153, + "loss": 0.0438, + "num_input_tokens_seen": 104905760, + "step": 48610 + }, + { + "epoch": 7.930668841761827, + "grad_norm": 0.2670537233352661, + "learning_rate": 0.0007552335113995489, + "loss": 0.0726, + "num_input_tokens_seen": 104916864, + "step": 48615 + }, + { + "epoch": 7.931484502446982, + "grad_norm": 0.04038407653570175, + "learning_rate": 0.0007551723014739515, + "loss": 0.0865, + "num_input_tokens_seen": 104927104, + "step": 48620 + }, + { + "epoch": 7.932300163132137, + "grad_norm": 0.3317245543003082, + "learning_rate": 0.0007551110863769638, + "loss": 0.0683, + "num_input_tokens_seen": 104937632, + "step": 48625 + }, + { + "epoch": 7.933115823817292, + "grad_norm": 0.28098195791244507, + "learning_rate": 0.0007550498661098263, + "loss": 0.2207, + "num_input_tokens_seen": 104948448, + "step": 48630 + }, + { + "epoch": 7.933931484502447, + "grad_norm": 0.01069544441998005, + "learning_rate": 0.0007549886406737796, + "loss": 0.0214, + "num_input_tokens_seen": 104958688, + "step": 48635 + }, + { + "epoch": 7.934747145187602, + "grad_norm": 0.017293743789196014, + "learning_rate": 0.0007549274100700647, + "loss": 0.016, + "num_input_tokens_seen": 104969408, + "step": 48640 + }, + { + "epoch": 7.935562805872757, + "grad_norm": 0.017817217856645584, + "learning_rate": 0.0007548661742999225, + "loss": 0.0243, + "num_input_tokens_seen": 104979968, + "step": 48645 + }, + { + "epoch": 7.936378466557912, + "grad_norm": 0.06672964990139008, + "learning_rate": 0.0007548049333645939, + "loss": 0.1294, + "num_input_tokens_seen": 104990688, + "step": 48650 + }, + { + "epoch": 7.937194127243067, + "grad_norm": 0.16509655117988586, + "learning_rate": 0.00075474368726532, + "loss": 0.0862, + "num_input_tokens_seen": 105002048, + "step": 48655 + }, + { + "epoch": 7.938009787928221, + "grad_norm": 0.01978885382413864, + "learning_rate": 0.0007546824360033421, + "loss": 0.0988, + "num_input_tokens_seen": 105013056, + "step": 48660 + }, + { + "epoch": 7.938825448613377, + "grad_norm": 0.21536606550216675, + "learning_rate": 0.0007546211795799016, + "loss": 0.2479, + "num_input_tokens_seen": 105021856, + "step": 48665 + }, + { + "epoch": 7.939641109298532, + "grad_norm": 0.263004332780838, + "learning_rate": 0.0007545599179962399, + "loss": 0.2333, + "num_input_tokens_seen": 105031936, + "step": 48670 + }, + { + "epoch": 7.940456769983687, + "grad_norm": 0.23876844346523285, + "learning_rate": 0.0007544986512535985, + "loss": 0.2671, + "num_input_tokens_seen": 105044000, + "step": 48675 + }, + { + "epoch": 7.941272430668842, + "grad_norm": 0.1761641502380371, + "learning_rate": 0.0007544373793532191, + "loss": 0.0556, + "num_input_tokens_seen": 105054272, + "step": 48680 + }, + { + "epoch": 7.942088091353996, + "grad_norm": 0.020206674933433533, + "learning_rate": 0.0007543761022963436, + "loss": 0.028, + "num_input_tokens_seen": 105065984, + "step": 48685 + }, + { + "epoch": 7.942903752039152, + "grad_norm": 0.22411933541297913, + "learning_rate": 0.0007543148200842134, + "loss": 0.113, + "num_input_tokens_seen": 105077440, + "step": 48690 + }, + { + "epoch": 7.943719412724307, + "grad_norm": 0.24834826588630676, + "learning_rate": 0.0007542535327180708, + "loss": 0.1736, + "num_input_tokens_seen": 105087104, + "step": 48695 + }, + { + "epoch": 7.944535073409462, + "grad_norm": 0.11571584641933441, + "learning_rate": 0.0007541922401991579, + "loss": 0.0567, + "num_input_tokens_seen": 105097824, + "step": 48700 + }, + { + "epoch": 7.945350734094617, + "grad_norm": 0.016416028141975403, + "learning_rate": 0.0007541309425287168, + "loss": 0.0166, + "num_input_tokens_seen": 105108064, + "step": 48705 + }, + { + "epoch": 7.946166394779771, + "grad_norm": 0.01847304217517376, + "learning_rate": 0.0007540696397079898, + "loss": 0.0251, + "num_input_tokens_seen": 105118784, + "step": 48710 + }, + { + "epoch": 7.946982055464927, + "grad_norm": 0.03549588471651077, + "learning_rate": 0.0007540083317382192, + "loss": 0.097, + "num_input_tokens_seen": 105128608, + "step": 48715 + }, + { + "epoch": 7.947797716150082, + "grad_norm": 0.03343822434544563, + "learning_rate": 0.0007539470186206474, + "loss": 0.0457, + "num_input_tokens_seen": 105138720, + "step": 48720 + }, + { + "epoch": 7.948613376835237, + "grad_norm": 0.16476163268089294, + "learning_rate": 0.0007538857003565174, + "loss": 0.0598, + "num_input_tokens_seen": 105150368, + "step": 48725 + }, + { + "epoch": 7.9494290375203915, + "grad_norm": 0.1172182708978653, + "learning_rate": 0.0007538243769470714, + "loss": 0.0663, + "num_input_tokens_seen": 105160544, + "step": 48730 + }, + { + "epoch": 7.950244698205546, + "grad_norm": 0.10923966765403748, + "learning_rate": 0.0007537630483935524, + "loss": 0.1099, + "num_input_tokens_seen": 105170784, + "step": 48735 + }, + { + "epoch": 7.951060358890701, + "grad_norm": 0.014825700782239437, + "learning_rate": 0.0007537017146972033, + "loss": 0.0836, + "num_input_tokens_seen": 105180960, + "step": 48740 + }, + { + "epoch": 7.951876019575856, + "grad_norm": 0.025320563465356827, + "learning_rate": 0.0007536403758592672, + "loss": 0.0467, + "num_input_tokens_seen": 105191200, + "step": 48745 + }, + { + "epoch": 7.952691680261012, + "grad_norm": 0.13513469696044922, + "learning_rate": 0.000753579031880987, + "loss": 0.0549, + "num_input_tokens_seen": 105202176, + "step": 48750 + }, + { + "epoch": 7.9535073409461665, + "grad_norm": 0.08608119934797287, + "learning_rate": 0.0007535176827636061, + "loss": 0.1557, + "num_input_tokens_seen": 105212352, + "step": 48755 + }, + { + "epoch": 7.954323001631321, + "grad_norm": 0.20529378950595856, + "learning_rate": 0.0007534563285083678, + "loss": 0.0765, + "num_input_tokens_seen": 105222464, + "step": 48760 + }, + { + "epoch": 7.955138662316476, + "grad_norm": 0.06052173301577568, + "learning_rate": 0.0007533949691165152, + "loss": 0.0365, + "num_input_tokens_seen": 105233984, + "step": 48765 + }, + { + "epoch": 7.955954323001631, + "grad_norm": 0.08458875864744186, + "learning_rate": 0.0007533336045892925, + "loss": 0.0444, + "num_input_tokens_seen": 105244768, + "step": 48770 + }, + { + "epoch": 7.956769983686787, + "grad_norm": 0.006116112694144249, + "learning_rate": 0.0007532722349279426, + "loss": 0.1336, + "num_input_tokens_seen": 105255680, + "step": 48775 + }, + { + "epoch": 7.9575856443719415, + "grad_norm": 0.002088747685775161, + "learning_rate": 0.0007532108601337097, + "loss": 0.01, + "num_input_tokens_seen": 105268608, + "step": 48780 + }, + { + "epoch": 7.958401305057096, + "grad_norm": 0.1638191193342209, + "learning_rate": 0.0007531494802078376, + "loss": 0.1134, + "num_input_tokens_seen": 105278464, + "step": 48785 + }, + { + "epoch": 7.959216965742251, + "grad_norm": 0.010092548094689846, + "learning_rate": 0.00075308809515157, + "loss": 0.0098, + "num_input_tokens_seen": 105289856, + "step": 48790 + }, + { + "epoch": 7.960032626427406, + "grad_norm": 0.004665705841034651, + "learning_rate": 0.0007530267049661511, + "loss": 0.0893, + "num_input_tokens_seen": 105300160, + "step": 48795 + }, + { + "epoch": 7.960848287112562, + "grad_norm": 0.013447677716612816, + "learning_rate": 0.000752965309652825, + "loss": 0.0649, + "num_input_tokens_seen": 105312352, + "step": 48800 + }, + { + "epoch": 7.9616639477977165, + "grad_norm": 0.03577107936143875, + "learning_rate": 0.0007529039092128361, + "loss": 0.0255, + "num_input_tokens_seen": 105324544, + "step": 48805 + }, + { + "epoch": 7.962479608482871, + "grad_norm": 0.015414412133395672, + "learning_rate": 0.0007528425036474287, + "loss": 0.0252, + "num_input_tokens_seen": 105335072, + "step": 48810 + }, + { + "epoch": 7.963295269168026, + "grad_norm": 0.0241608414798975, + "learning_rate": 0.000752781092957847, + "loss": 0.0196, + "num_input_tokens_seen": 105344960, + "step": 48815 + }, + { + "epoch": 7.964110929853181, + "grad_norm": 0.014076568186283112, + "learning_rate": 0.000752719677145336, + "loss": 0.0115, + "num_input_tokens_seen": 105355264, + "step": 48820 + }, + { + "epoch": 7.964926590538336, + "grad_norm": 0.0016049693804234266, + "learning_rate": 0.0007526582562111399, + "loss": 0.0318, + "num_input_tokens_seen": 105366624, + "step": 48825 + }, + { + "epoch": 7.9657422512234906, + "grad_norm": 0.019754130393266678, + "learning_rate": 0.0007525968301565038, + "loss": 0.156, + "num_input_tokens_seen": 105377888, + "step": 48830 + }, + { + "epoch": 7.966557911908646, + "grad_norm": 0.005438779480755329, + "learning_rate": 0.0007525353989826726, + "loss": 0.1001, + "num_input_tokens_seen": 105388192, + "step": 48835 + }, + { + "epoch": 7.967373572593801, + "grad_norm": 0.17653228342533112, + "learning_rate": 0.000752473962690891, + "loss": 0.2358, + "num_input_tokens_seen": 105400256, + "step": 48840 + }, + { + "epoch": 7.968189233278956, + "grad_norm": 0.12529172003269196, + "learning_rate": 0.0007524125212824044, + "loss": 0.1567, + "num_input_tokens_seen": 105411328, + "step": 48845 + }, + { + "epoch": 7.969004893964111, + "grad_norm": 0.028262002393603325, + "learning_rate": 0.0007523510747584578, + "loss": 0.0182, + "num_input_tokens_seen": 105421408, + "step": 48850 + }, + { + "epoch": 7.9698205546492655, + "grad_norm": 0.004097122233361006, + "learning_rate": 0.0007522896231202967, + "loss": 0.0216, + "num_input_tokens_seen": 105431872, + "step": 48855 + }, + { + "epoch": 7.970636215334421, + "grad_norm": 0.07609119266271591, + "learning_rate": 0.0007522281663691661, + "loss": 0.0231, + "num_input_tokens_seen": 105443776, + "step": 48860 + }, + { + "epoch": 7.971451876019576, + "grad_norm": 0.031410425901412964, + "learning_rate": 0.0007521667045063119, + "loss": 0.0249, + "num_input_tokens_seen": 105453088, + "step": 48865 + }, + { + "epoch": 7.972267536704731, + "grad_norm": 0.0062339045107364655, + "learning_rate": 0.0007521052375329793, + "loss": 0.0246, + "num_input_tokens_seen": 105463744, + "step": 48870 + }, + { + "epoch": 7.973083197389886, + "grad_norm": 0.004037452861666679, + "learning_rate": 0.0007520437654504144, + "loss": 0.0273, + "num_input_tokens_seen": 105474752, + "step": 48875 + }, + { + "epoch": 7.9738988580750405, + "grad_norm": 0.1217520534992218, + "learning_rate": 0.0007519822882598629, + "loss": 0.1039, + "num_input_tokens_seen": 105486016, + "step": 48880 + }, + { + "epoch": 7.974714518760196, + "grad_norm": 0.005209515802562237, + "learning_rate": 0.0007519208059625707, + "loss": 0.1017, + "num_input_tokens_seen": 105496800, + "step": 48885 + }, + { + "epoch": 7.975530179445351, + "grad_norm": 0.002773680491372943, + "learning_rate": 0.0007518593185597837, + "loss": 0.0549, + "num_input_tokens_seen": 105506528, + "step": 48890 + }, + { + "epoch": 7.976345840130506, + "grad_norm": 0.1834457367658615, + "learning_rate": 0.000751797826052748, + "loss": 0.0305, + "num_input_tokens_seen": 105517312, + "step": 48895 + }, + { + "epoch": 7.977161500815661, + "grad_norm": 0.00693803234025836, + "learning_rate": 0.0007517363284427101, + "loss": 0.078, + "num_input_tokens_seen": 105526976, + "step": 48900 + }, + { + "epoch": 7.9779771615008155, + "grad_norm": 0.013209372758865356, + "learning_rate": 0.0007516748257309162, + "loss": 0.0095, + "num_input_tokens_seen": 105537408, + "step": 48905 + }, + { + "epoch": 7.97879282218597, + "grad_norm": 0.08409403264522552, + "learning_rate": 0.0007516133179186125, + "loss": 0.0352, + "num_input_tokens_seen": 105547712, + "step": 48910 + }, + { + "epoch": 7.979608482871125, + "grad_norm": 0.06969171017408371, + "learning_rate": 0.0007515518050070458, + "loss": 0.1559, + "num_input_tokens_seen": 105557760, + "step": 48915 + }, + { + "epoch": 7.980424143556281, + "grad_norm": 0.09518082439899445, + "learning_rate": 0.0007514902869974627, + "loss": 0.138, + "num_input_tokens_seen": 105568800, + "step": 48920 + }, + { + "epoch": 7.981239804241436, + "grad_norm": 0.005551798734813929, + "learning_rate": 0.0007514287638911099, + "loss": 0.0191, + "num_input_tokens_seen": 105578944, + "step": 48925 + }, + { + "epoch": 7.9820554649265905, + "grad_norm": 0.20882639288902283, + "learning_rate": 0.0007513672356892342, + "loss": 0.0592, + "num_input_tokens_seen": 105589664, + "step": 48930 + }, + { + "epoch": 7.982871125611745, + "grad_norm": 0.026288477703928947, + "learning_rate": 0.0007513057023930825, + "loss": 0.013, + "num_input_tokens_seen": 105601056, + "step": 48935 + }, + { + "epoch": 7.9836867862969, + "grad_norm": 0.04064859822392464, + "learning_rate": 0.000751244164003902, + "loss": 0.0847, + "num_input_tokens_seen": 105611808, + "step": 48940 + }, + { + "epoch": 7.984502446982056, + "grad_norm": 0.005815352778881788, + "learning_rate": 0.00075118262052294, + "loss": 0.1313, + "num_input_tokens_seen": 105622944, + "step": 48945 + }, + { + "epoch": 7.985318107667211, + "grad_norm": 0.08015744388103485, + "learning_rate": 0.0007511210719514432, + "loss": 0.1731, + "num_input_tokens_seen": 105633920, + "step": 48950 + }, + { + "epoch": 7.986133768352365, + "grad_norm": 0.03654742240905762, + "learning_rate": 0.0007510595182906595, + "loss": 0.0938, + "num_input_tokens_seen": 105645280, + "step": 48955 + }, + { + "epoch": 7.98694942903752, + "grad_norm": 0.21273072063922882, + "learning_rate": 0.0007509979595418362, + "loss": 0.0515, + "num_input_tokens_seen": 105656032, + "step": 48960 + }, + { + "epoch": 7.987765089722675, + "grad_norm": 0.08512967079877853, + "learning_rate": 0.0007509363957062207, + "loss": 0.0853, + "num_input_tokens_seen": 105666208, + "step": 48965 + }, + { + "epoch": 7.988580750407831, + "grad_norm": 0.2499631941318512, + "learning_rate": 0.0007508748267850609, + "loss": 0.1445, + "num_input_tokens_seen": 105677600, + "step": 48970 + }, + { + "epoch": 7.989396411092986, + "grad_norm": 0.10974223911762238, + "learning_rate": 0.0007508132527796043, + "loss": 0.1033, + "num_input_tokens_seen": 105688256, + "step": 48975 + }, + { + "epoch": 7.99021207177814, + "grad_norm": 0.0054068658500909805, + "learning_rate": 0.0007507516736910992, + "loss": 0.0206, + "num_input_tokens_seen": 105698560, + "step": 48980 + }, + { + "epoch": 7.991027732463295, + "grad_norm": 0.037877704948186874, + "learning_rate": 0.0007506900895207932, + "loss": 0.1003, + "num_input_tokens_seen": 105709792, + "step": 48985 + }, + { + "epoch": 7.99184339314845, + "grad_norm": 0.021731335669755936, + "learning_rate": 0.0007506285002699346, + "loss": 0.0711, + "num_input_tokens_seen": 105720608, + "step": 48990 + }, + { + "epoch": 7.992659053833605, + "grad_norm": 0.005956695415079594, + "learning_rate": 0.0007505669059397715, + "loss": 0.0101, + "num_input_tokens_seen": 105732192, + "step": 48995 + }, + { + "epoch": 7.993474714518761, + "grad_norm": 0.03811812400817871, + "learning_rate": 0.0007505053065315521, + "loss": 0.165, + "num_input_tokens_seen": 105742688, + "step": 49000 + }, + { + "epoch": 7.994290375203915, + "grad_norm": 0.17578482627868652, + "learning_rate": 0.0007504437020465248, + "loss": 0.0597, + "num_input_tokens_seen": 105754208, + "step": 49005 + }, + { + "epoch": 7.99510603588907, + "grad_norm": 0.006330165546387434, + "learning_rate": 0.0007503820924859382, + "loss": 0.035, + "num_input_tokens_seen": 105765920, + "step": 49010 + }, + { + "epoch": 7.995921696574225, + "grad_norm": 0.027257638052105904, + "learning_rate": 0.000750320477851041, + "loss": 0.0989, + "num_input_tokens_seen": 105776992, + "step": 49015 + }, + { + "epoch": 7.99673735725938, + "grad_norm": 0.004199329763650894, + "learning_rate": 0.0007502588581430817, + "loss": 0.1884, + "num_input_tokens_seen": 105788576, + "step": 49020 + }, + { + "epoch": 7.997553017944535, + "grad_norm": 0.011431850492954254, + "learning_rate": 0.0007501972333633091, + "loss": 0.0357, + "num_input_tokens_seen": 105800384, + "step": 49025 + }, + { + "epoch": 7.99836867862969, + "grad_norm": 0.03893594816327095, + "learning_rate": 0.0007501356035129723, + "loss": 0.1101, + "num_input_tokens_seen": 105810528, + "step": 49030 + }, + { + "epoch": 7.999184339314845, + "grad_norm": 0.0013488149270415306, + "learning_rate": 0.0007500739685933201, + "loss": 0.0433, + "num_input_tokens_seen": 105821440, + "step": 49035 + }, + { + "epoch": 8.0, + "grad_norm": 0.00565652409568429, + "learning_rate": 0.0007500123286056018, + "loss": 0.0343, + "num_input_tokens_seen": 105830544, + "step": 49040 + }, + { + "epoch": 8.0, + "eval_loss": 0.11187697947025299, + "eval_runtime": 104.092, + "eval_samples_per_second": 26.179, + "eval_steps_per_second": 6.552, + "num_input_tokens_seen": 105830544, + "step": 49040 + }, + { + "epoch": 8.000815660685156, + "grad_norm": 0.03719082102179527, + "learning_rate": 0.0007499506835510663, + "loss": 0.0477, + "num_input_tokens_seen": 105843312, + "step": 49045 + }, + { + "epoch": 8.00163132137031, + "grad_norm": 0.03124874085187912, + "learning_rate": 0.0007498890334309633, + "loss": 0.0125, + "num_input_tokens_seen": 105851248, + "step": 49050 + }, + { + "epoch": 8.002446982055465, + "grad_norm": 0.2775762379169464, + "learning_rate": 0.000749827378246542, + "loss": 0.0761, + "num_input_tokens_seen": 105862352, + "step": 49055 + }, + { + "epoch": 8.00326264274062, + "grad_norm": 0.05207438766956329, + "learning_rate": 0.0007497657179990518, + "loss": 0.0275, + "num_input_tokens_seen": 105873264, + "step": 49060 + }, + { + "epoch": 8.004078303425775, + "grad_norm": 0.0031996567267924547, + "learning_rate": 0.0007497040526897426, + "loss": 0.015, + "num_input_tokens_seen": 105884272, + "step": 49065 + }, + { + "epoch": 8.00489396411093, + "grad_norm": 0.17413076758384705, + "learning_rate": 0.0007496423823198639, + "loss": 0.082, + "num_input_tokens_seen": 105894832, + "step": 49070 + }, + { + "epoch": 8.005709624796085, + "grad_norm": 0.01603052392601967, + "learning_rate": 0.0007495807068906657, + "loss": 0.0048, + "num_input_tokens_seen": 105905520, + "step": 49075 + }, + { + "epoch": 8.00652528548124, + "grad_norm": 0.022662393748760223, + "learning_rate": 0.0007495190264033978, + "loss": 0.1021, + "num_input_tokens_seen": 105917424, + "step": 49080 + }, + { + "epoch": 8.007340946166394, + "grad_norm": 0.12525016069412231, + "learning_rate": 0.0007494573408593103, + "loss": 0.0356, + "num_input_tokens_seen": 105928560, + "step": 49085 + }, + { + "epoch": 8.00815660685155, + "grad_norm": 0.08209449797868729, + "learning_rate": 0.0007493956502596533, + "loss": 0.0217, + "num_input_tokens_seen": 105939120, + "step": 49090 + }, + { + "epoch": 8.008972267536704, + "grad_norm": 0.11279091984033585, + "learning_rate": 0.0007493339546056772, + "loss": 0.0374, + "num_input_tokens_seen": 105949424, + "step": 49095 + }, + { + "epoch": 8.00978792822186, + "grad_norm": 0.27754855155944824, + "learning_rate": 0.0007492722538986321, + "loss": 0.0659, + "num_input_tokens_seen": 105959280, + "step": 49100 + }, + { + "epoch": 8.010603588907015, + "grad_norm": 0.0619928240776062, + "learning_rate": 0.0007492105481397686, + "loss": 0.0527, + "num_input_tokens_seen": 105968272, + "step": 49105 + }, + { + "epoch": 8.01141924959217, + "grad_norm": 0.1165754497051239, + "learning_rate": 0.0007491488373303373, + "loss": 0.0978, + "num_input_tokens_seen": 105978736, + "step": 49110 + }, + { + "epoch": 8.012234910277325, + "grad_norm": 0.06867875158786774, + "learning_rate": 0.0007490871214715885, + "loss": 0.0654, + "num_input_tokens_seen": 105989872, + "step": 49115 + }, + { + "epoch": 8.013050570962479, + "grad_norm": 0.015703529119491577, + "learning_rate": 0.0007490254005647735, + "loss": 0.0353, + "num_input_tokens_seen": 105999792, + "step": 49120 + }, + { + "epoch": 8.013866231647635, + "grad_norm": 0.0701662078499794, + "learning_rate": 0.0007489636746111426, + "loss": 0.0375, + "num_input_tokens_seen": 106010256, + "step": 49125 + }, + { + "epoch": 8.01468189233279, + "grad_norm": 0.20747560262680054, + "learning_rate": 0.0007489019436119471, + "loss": 0.1941, + "num_input_tokens_seen": 106020496, + "step": 49130 + }, + { + "epoch": 8.015497553017944, + "grad_norm": 0.0069278753362596035, + "learning_rate": 0.0007488402075684379, + "loss": 0.0197, + "num_input_tokens_seen": 106031248, + "step": 49135 + }, + { + "epoch": 8.0163132137031, + "grad_norm": 0.0024970360100269318, + "learning_rate": 0.0007487784664818662, + "loss": 0.0065, + "num_input_tokens_seen": 106041232, + "step": 49140 + }, + { + "epoch": 8.017128874388254, + "grad_norm": 0.16227933764457703, + "learning_rate": 0.0007487167203534834, + "loss": 0.0534, + "num_input_tokens_seen": 106051696, + "step": 49145 + }, + { + "epoch": 8.01794453507341, + "grad_norm": 0.08358407765626907, + "learning_rate": 0.0007486549691845405, + "loss": 0.0722, + "num_input_tokens_seen": 106062672, + "step": 49150 + }, + { + "epoch": 8.018760195758565, + "grad_norm": 0.04775271192193031, + "learning_rate": 0.0007485932129762895, + "loss": 0.017, + "num_input_tokens_seen": 106072784, + "step": 49155 + }, + { + "epoch": 8.01957585644372, + "grad_norm": 0.0023331495467573404, + "learning_rate": 0.0007485314517299815, + "loss": 0.0248, + "num_input_tokens_seen": 106083952, + "step": 49160 + }, + { + "epoch": 8.020391517128875, + "grad_norm": 0.0585792176425457, + "learning_rate": 0.0007484696854468684, + "loss": 0.069, + "num_input_tokens_seen": 106095632, + "step": 49165 + }, + { + "epoch": 8.021207177814029, + "grad_norm": 0.023990076035261154, + "learning_rate": 0.0007484079141282018, + "loss": 0.0089, + "num_input_tokens_seen": 106106160, + "step": 49170 + }, + { + "epoch": 8.022022838499185, + "grad_norm": 0.33813580870628357, + "learning_rate": 0.0007483461377752339, + "loss": 0.0603, + "num_input_tokens_seen": 106116784, + "step": 49175 + }, + { + "epoch": 8.022838499184338, + "grad_norm": 0.0071504730731248856, + "learning_rate": 0.0007482843563892164, + "loss": 0.0416, + "num_input_tokens_seen": 106127088, + "step": 49180 + }, + { + "epoch": 8.023654159869494, + "grad_norm": 0.049690067768096924, + "learning_rate": 0.0007482225699714014, + "loss": 0.0594, + "num_input_tokens_seen": 106136912, + "step": 49185 + }, + { + "epoch": 8.02446982055465, + "grad_norm": 0.019339650869369507, + "learning_rate": 0.0007481607785230411, + "loss": 0.0332, + "num_input_tokens_seen": 106148816, + "step": 49190 + }, + { + "epoch": 8.025285481239804, + "grad_norm": 0.0016177351353690028, + "learning_rate": 0.0007480989820453878, + "loss": 0.0072, + "num_input_tokens_seen": 106158672, + "step": 49195 + }, + { + "epoch": 8.02610114192496, + "grad_norm": 0.05868733301758766, + "learning_rate": 0.0007480371805396941, + "loss": 0.0203, + "num_input_tokens_seen": 106169744, + "step": 49200 + }, + { + "epoch": 8.026916802610113, + "grad_norm": 0.0030028580222278833, + "learning_rate": 0.0007479753740072121, + "loss": 0.1349, + "num_input_tokens_seen": 106179824, + "step": 49205 + }, + { + "epoch": 8.02773246329527, + "grad_norm": 0.00705280527472496, + "learning_rate": 0.0007479135624491946, + "loss": 0.0358, + "num_input_tokens_seen": 106190640, + "step": 49210 + }, + { + "epoch": 8.028548123980425, + "grad_norm": 0.032098107039928436, + "learning_rate": 0.0007478517458668943, + "loss": 0.0179, + "num_input_tokens_seen": 106200112, + "step": 49215 + }, + { + "epoch": 8.029363784665579, + "grad_norm": 0.21257726848125458, + "learning_rate": 0.0007477899242615639, + "loss": 0.0353, + "num_input_tokens_seen": 106211440, + "step": 49220 + }, + { + "epoch": 8.030179445350734, + "grad_norm": 0.0040211062878370285, + "learning_rate": 0.0007477280976344563, + "loss": 0.0096, + "num_input_tokens_seen": 106222224, + "step": 49225 + }, + { + "epoch": 8.030995106035888, + "grad_norm": 0.003391520818695426, + "learning_rate": 0.0007476662659868246, + "loss": 0.0489, + "num_input_tokens_seen": 106233968, + "step": 49230 + }, + { + "epoch": 8.031810766721044, + "grad_norm": 0.3110547959804535, + "learning_rate": 0.0007476044293199218, + "loss": 0.129, + "num_input_tokens_seen": 106244944, + "step": 49235 + }, + { + "epoch": 8.0326264274062, + "grad_norm": 0.3978988230228424, + "learning_rate": 0.0007475425876350011, + "loss": 0.0911, + "num_input_tokens_seen": 106255664, + "step": 49240 + }, + { + "epoch": 8.033442088091354, + "grad_norm": 0.20787635445594788, + "learning_rate": 0.000747480740933316, + "loss": 0.0659, + "num_input_tokens_seen": 106266672, + "step": 49245 + }, + { + "epoch": 8.03425774877651, + "grad_norm": 0.021922480314970016, + "learning_rate": 0.0007474188892161196, + "loss": 0.1473, + "num_input_tokens_seen": 106278288, + "step": 49250 + }, + { + "epoch": 8.035073409461663, + "grad_norm": 0.17151710391044617, + "learning_rate": 0.0007473570324846656, + "loss": 0.0236, + "num_input_tokens_seen": 106289744, + "step": 49255 + }, + { + "epoch": 8.035889070146819, + "grad_norm": 0.06157829239964485, + "learning_rate": 0.0007472951707402074, + "loss": 0.0306, + "num_input_tokens_seen": 106298480, + "step": 49260 + }, + { + "epoch": 8.036704730831975, + "grad_norm": 0.009118116460740566, + "learning_rate": 0.0007472333039839989, + "loss": 0.0128, + "num_input_tokens_seen": 106310800, + "step": 49265 + }, + { + "epoch": 8.037520391517129, + "grad_norm": 0.24590137600898743, + "learning_rate": 0.000747171432217294, + "loss": 0.1572, + "num_input_tokens_seen": 106322000, + "step": 49270 + }, + { + "epoch": 8.038336052202284, + "grad_norm": 0.0024115999694913626, + "learning_rate": 0.0007471095554413463, + "loss": 0.1053, + "num_input_tokens_seen": 106331632, + "step": 49275 + }, + { + "epoch": 8.039151712887438, + "grad_norm": 0.03993025794625282, + "learning_rate": 0.0007470476736574102, + "loss": 0.0233, + "num_input_tokens_seen": 106343344, + "step": 49280 + }, + { + "epoch": 8.039967373572594, + "grad_norm": 0.008030765689909458, + "learning_rate": 0.0007469857868667393, + "loss": 0.0822, + "num_input_tokens_seen": 106354288, + "step": 49285 + }, + { + "epoch": 8.040783034257748, + "grad_norm": 0.00874246098101139, + "learning_rate": 0.0007469238950705883, + "loss": 0.0178, + "num_input_tokens_seen": 106365168, + "step": 49290 + }, + { + "epoch": 8.041598694942904, + "grad_norm": 0.021646322682499886, + "learning_rate": 0.0007468619982702112, + "loss": 0.0347, + "num_input_tokens_seen": 106375440, + "step": 49295 + }, + { + "epoch": 8.04241435562806, + "grad_norm": 0.11247748136520386, + "learning_rate": 0.0007468000964668625, + "loss": 0.0218, + "num_input_tokens_seen": 106386032, + "step": 49300 + }, + { + "epoch": 8.043230016313213, + "grad_norm": 0.011812375858426094, + "learning_rate": 0.0007467381896617968, + "loss": 0.0464, + "num_input_tokens_seen": 106398160, + "step": 49305 + }, + { + "epoch": 8.044045676998369, + "grad_norm": 0.008256683126091957, + "learning_rate": 0.0007466762778562687, + "loss": 0.0353, + "num_input_tokens_seen": 106408432, + "step": 49310 + }, + { + "epoch": 8.044861337683523, + "grad_norm": 0.005382542032748461, + "learning_rate": 0.000746614361051533, + "loss": 0.1722, + "num_input_tokens_seen": 106419440, + "step": 49315 + }, + { + "epoch": 8.045676998368679, + "grad_norm": 0.04113069176673889, + "learning_rate": 0.0007465524392488443, + "loss": 0.0469, + "num_input_tokens_seen": 106430000, + "step": 49320 + }, + { + "epoch": 8.046492659053834, + "grad_norm": 0.01023928727954626, + "learning_rate": 0.0007464905124494578, + "loss": 0.0448, + "num_input_tokens_seen": 106441392, + "step": 49325 + }, + { + "epoch": 8.047308319738988, + "grad_norm": 0.03549192100763321, + "learning_rate": 0.0007464285806546283, + "loss": 0.0172, + "num_input_tokens_seen": 106452688, + "step": 49330 + }, + { + "epoch": 8.048123980424144, + "grad_norm": 0.06181855499744415, + "learning_rate": 0.0007463666438656109, + "loss": 0.0236, + "num_input_tokens_seen": 106462352, + "step": 49335 + }, + { + "epoch": 8.048939641109298, + "grad_norm": 0.04214494675397873, + "learning_rate": 0.000746304702083661, + "loss": 0.0907, + "num_input_tokens_seen": 106472208, + "step": 49340 + }, + { + "epoch": 8.049755301794454, + "grad_norm": 0.09523848444223404, + "learning_rate": 0.0007462427553100339, + "loss": 0.1835, + "num_input_tokens_seen": 106481840, + "step": 49345 + }, + { + "epoch": 8.05057096247961, + "grad_norm": 0.005696515552699566, + "learning_rate": 0.0007461808035459848, + "loss": 0.1597, + "num_input_tokens_seen": 106492912, + "step": 49350 + }, + { + "epoch": 8.051386623164763, + "grad_norm": 0.013449899852275848, + "learning_rate": 0.0007461188467927695, + "loss": 0.1428, + "num_input_tokens_seen": 106503440, + "step": 49355 + }, + { + "epoch": 8.052202283849919, + "grad_norm": 0.020131755620241165, + "learning_rate": 0.0007460568850516436, + "loss": 0.0178, + "num_input_tokens_seen": 106512880, + "step": 49360 + }, + { + "epoch": 8.053017944535073, + "grad_norm": 0.02667674794793129, + "learning_rate": 0.0007459949183238627, + "loss": 0.0219, + "num_input_tokens_seen": 106525296, + "step": 49365 + }, + { + "epoch": 8.053833605220229, + "grad_norm": 0.3335675299167633, + "learning_rate": 0.0007459329466106829, + "loss": 0.185, + "num_input_tokens_seen": 106535536, + "step": 49370 + }, + { + "epoch": 8.054649265905383, + "grad_norm": 0.023953191936016083, + "learning_rate": 0.0007458709699133597, + "loss": 0.0724, + "num_input_tokens_seen": 106546160, + "step": 49375 + }, + { + "epoch": 8.055464926590538, + "grad_norm": 0.011892963200807571, + "learning_rate": 0.0007458089882331495, + "loss": 0.0117, + "num_input_tokens_seen": 106558064, + "step": 49380 + }, + { + "epoch": 8.056280587275694, + "grad_norm": 0.017951400950551033, + "learning_rate": 0.0007457470015713085, + "loss": 0.0207, + "num_input_tokens_seen": 106568400, + "step": 49385 + }, + { + "epoch": 8.057096247960848, + "grad_norm": 0.283562570810318, + "learning_rate": 0.0007456850099290927, + "loss": 0.062, + "num_input_tokens_seen": 106578192, + "step": 49390 + }, + { + "epoch": 8.057911908646004, + "grad_norm": 0.004318437539041042, + "learning_rate": 0.0007456230133077583, + "loss": 0.0141, + "num_input_tokens_seen": 106587312, + "step": 49395 + }, + { + "epoch": 8.058727569331158, + "grad_norm": 0.009352088905870914, + "learning_rate": 0.0007455610117085618, + "loss": 0.0318, + "num_input_tokens_seen": 106598672, + "step": 49400 + }, + { + "epoch": 8.059543230016313, + "grad_norm": 0.2895674407482147, + "learning_rate": 0.0007454990051327602, + "loss": 0.0717, + "num_input_tokens_seen": 106608208, + "step": 49405 + }, + { + "epoch": 8.060358890701469, + "grad_norm": 0.04009447246789932, + "learning_rate": 0.0007454369935816098, + "loss": 0.0756, + "num_input_tokens_seen": 106619984, + "step": 49410 + }, + { + "epoch": 8.061174551386623, + "grad_norm": 0.1951700896024704, + "learning_rate": 0.0007453749770563673, + "loss": 0.1058, + "num_input_tokens_seen": 106631792, + "step": 49415 + }, + { + "epoch": 8.061990212071779, + "grad_norm": 0.0019079549238085747, + "learning_rate": 0.0007453129555582896, + "loss": 0.0248, + "num_input_tokens_seen": 106643600, + "step": 49420 + }, + { + "epoch": 8.062805872756933, + "grad_norm": 0.24292688071727753, + "learning_rate": 0.0007452509290886336, + "loss": 0.0717, + "num_input_tokens_seen": 106654672, + "step": 49425 + }, + { + "epoch": 8.063621533442088, + "grad_norm": 0.24535971879959106, + "learning_rate": 0.0007451888976486565, + "loss": 0.0636, + "num_input_tokens_seen": 106665456, + "step": 49430 + }, + { + "epoch": 8.064437194127244, + "grad_norm": 0.2217271625995636, + "learning_rate": 0.0007451268612396154, + "loss": 0.2257, + "num_input_tokens_seen": 106676880, + "step": 49435 + }, + { + "epoch": 8.065252854812398, + "grad_norm": 0.08346287906169891, + "learning_rate": 0.0007450648198627673, + "loss": 0.0864, + "num_input_tokens_seen": 106688528, + "step": 49440 + }, + { + "epoch": 8.066068515497554, + "grad_norm": 0.02116892673075199, + "learning_rate": 0.0007450027735193699, + "loss": 0.0223, + "num_input_tokens_seen": 106698992, + "step": 49445 + }, + { + "epoch": 8.066884176182707, + "grad_norm": 0.016278840601444244, + "learning_rate": 0.0007449407222106804, + "loss": 0.0593, + "num_input_tokens_seen": 106709168, + "step": 49450 + }, + { + "epoch": 8.067699836867863, + "grad_norm": 0.03761972114443779, + "learning_rate": 0.0007448786659379565, + "loss": 0.1784, + "num_input_tokens_seen": 106719920, + "step": 49455 + }, + { + "epoch": 8.068515497553017, + "grad_norm": 0.021950116381049156, + "learning_rate": 0.0007448166047024556, + "loss": 0.0171, + "num_input_tokens_seen": 106729872, + "step": 49460 + }, + { + "epoch": 8.069331158238173, + "grad_norm": 0.09063199907541275, + "learning_rate": 0.0007447545385054358, + "loss": 0.0677, + "num_input_tokens_seen": 106741168, + "step": 49465 + }, + { + "epoch": 8.070146818923329, + "grad_norm": 0.22505639493465424, + "learning_rate": 0.0007446924673481548, + "loss": 0.0655, + "num_input_tokens_seen": 106751376, + "step": 49470 + }, + { + "epoch": 8.070962479608482, + "grad_norm": 0.3453344702720642, + "learning_rate": 0.0007446303912318705, + "loss": 0.1202, + "num_input_tokens_seen": 106761904, + "step": 49475 + }, + { + "epoch": 8.071778140293638, + "grad_norm": 0.2702260911464691, + "learning_rate": 0.000744568310157841, + "loss": 0.1227, + "num_input_tokens_seen": 106772240, + "step": 49480 + }, + { + "epoch": 8.072593800978792, + "grad_norm": 0.01567579247057438, + "learning_rate": 0.0007445062241273244, + "loss": 0.0094, + "num_input_tokens_seen": 106783216, + "step": 49485 + }, + { + "epoch": 8.073409461663948, + "grad_norm": 0.08043105900287628, + "learning_rate": 0.000744444133141579, + "loss": 0.0891, + "num_input_tokens_seen": 106794896, + "step": 49490 + }, + { + "epoch": 8.074225122349104, + "grad_norm": 0.14575450122356415, + "learning_rate": 0.0007443820372018631, + "loss": 0.1475, + "num_input_tokens_seen": 106805360, + "step": 49495 + }, + { + "epoch": 8.075040783034257, + "grad_norm": 0.11461183428764343, + "learning_rate": 0.0007443199363094353, + "loss": 0.0343, + "num_input_tokens_seen": 106816144, + "step": 49500 + }, + { + "epoch": 8.075856443719413, + "grad_norm": 0.030665356665849686, + "learning_rate": 0.0007442578304655541, + "loss": 0.0187, + "num_input_tokens_seen": 106827088, + "step": 49505 + }, + { + "epoch": 8.076672104404567, + "grad_norm": 0.10686243325471878, + "learning_rate": 0.0007441957196714778, + "loss": 0.0496, + "num_input_tokens_seen": 106837488, + "step": 49510 + }, + { + "epoch": 8.077487765089723, + "grad_norm": 0.013757050968706608, + "learning_rate": 0.0007441336039284656, + "loss": 0.0325, + "num_input_tokens_seen": 106849520, + "step": 49515 + }, + { + "epoch": 8.078303425774878, + "grad_norm": 0.005685810465365648, + "learning_rate": 0.0007440714832377764, + "loss": 0.0385, + "num_input_tokens_seen": 106860528, + "step": 49520 + }, + { + "epoch": 8.079119086460032, + "grad_norm": 0.4398340582847595, + "learning_rate": 0.0007440093576006688, + "loss": 0.1198, + "num_input_tokens_seen": 106869936, + "step": 49525 + }, + { + "epoch": 8.079934747145188, + "grad_norm": 0.034693941473960876, + "learning_rate": 0.000743947227018402, + "loss": 0.0104, + "num_input_tokens_seen": 106881936, + "step": 49530 + }, + { + "epoch": 8.080750407830342, + "grad_norm": 0.025841880589723587, + "learning_rate": 0.0007438850914922352, + "loss": 0.0339, + "num_input_tokens_seen": 106891376, + "step": 49535 + }, + { + "epoch": 8.081566068515498, + "grad_norm": 0.021484823897480965, + "learning_rate": 0.0007438229510234278, + "loss": 0.0466, + "num_input_tokens_seen": 106901168, + "step": 49540 + }, + { + "epoch": 8.082381729200652, + "grad_norm": 0.03086649626493454, + "learning_rate": 0.0007437608056132388, + "loss": 0.1391, + "num_input_tokens_seen": 106912432, + "step": 49545 + }, + { + "epoch": 8.083197389885807, + "grad_norm": 0.12069348990917206, + "learning_rate": 0.0007436986552629279, + "loss": 0.0113, + "num_input_tokens_seen": 106924592, + "step": 49550 + }, + { + "epoch": 8.084013050570963, + "grad_norm": 0.0030465559102594852, + "learning_rate": 0.0007436364999737546, + "loss": 0.0254, + "num_input_tokens_seen": 106934704, + "step": 49555 + }, + { + "epoch": 8.084828711256117, + "grad_norm": 0.3401923179626465, + "learning_rate": 0.0007435743397469785, + "loss": 0.1097, + "num_input_tokens_seen": 106945936, + "step": 49560 + }, + { + "epoch": 8.085644371941273, + "grad_norm": 0.3312312662601471, + "learning_rate": 0.0007435121745838595, + "loss": 0.0709, + "num_input_tokens_seen": 106957264, + "step": 49565 + }, + { + "epoch": 8.086460032626427, + "grad_norm": 0.021351389586925507, + "learning_rate": 0.0007434500044856574, + "loss": 0.1681, + "num_input_tokens_seen": 106968816, + "step": 49570 + }, + { + "epoch": 8.087275693311582, + "grad_norm": 0.01188052911311388, + "learning_rate": 0.000743387829453632, + "loss": 0.0223, + "num_input_tokens_seen": 106981680, + "step": 49575 + }, + { + "epoch": 8.088091353996738, + "grad_norm": 0.015689412131905556, + "learning_rate": 0.0007433256494890435, + "loss": 0.0286, + "num_input_tokens_seen": 106992048, + "step": 49580 + }, + { + "epoch": 8.088907014681892, + "grad_norm": 0.02924872562289238, + "learning_rate": 0.000743263464593152, + "loss": 0.0093, + "num_input_tokens_seen": 107004016, + "step": 49585 + }, + { + "epoch": 8.089722675367048, + "grad_norm": 0.296697735786438, + "learning_rate": 0.0007432012747672179, + "loss": 0.0899, + "num_input_tokens_seen": 107015024, + "step": 49590 + }, + { + "epoch": 8.090538336052202, + "grad_norm": 0.045094795525074005, + "learning_rate": 0.0007431390800125013, + "loss": 0.0131, + "num_input_tokens_seen": 107025008, + "step": 49595 + }, + { + "epoch": 8.091353996737357, + "grad_norm": 0.09916126728057861, + "learning_rate": 0.0007430768803302629, + "loss": 0.0353, + "num_input_tokens_seen": 107035408, + "step": 49600 + }, + { + "epoch": 8.092169657422513, + "grad_norm": 0.009082852862775326, + "learning_rate": 0.0007430146757217631, + "loss": 0.0157, + "num_input_tokens_seen": 107046960, + "step": 49605 + }, + { + "epoch": 8.092985318107667, + "grad_norm": 0.08944400399923325, + "learning_rate": 0.0007429524661882626, + "loss": 0.1163, + "num_input_tokens_seen": 107058736, + "step": 49610 + }, + { + "epoch": 8.093800978792823, + "grad_norm": 0.018375713378190994, + "learning_rate": 0.0007428902517310222, + "loss": 0.1522, + "num_input_tokens_seen": 107069872, + "step": 49615 + }, + { + "epoch": 8.094616639477977, + "grad_norm": 0.0020468742586672306, + "learning_rate": 0.0007428280323513028, + "loss": 0.073, + "num_input_tokens_seen": 107080400, + "step": 49620 + }, + { + "epoch": 8.095432300163132, + "grad_norm": 0.005916260182857513, + "learning_rate": 0.0007427658080503652, + "loss": 0.0123, + "num_input_tokens_seen": 107091696, + "step": 49625 + }, + { + "epoch": 8.096247960848286, + "grad_norm": 0.14975595474243164, + "learning_rate": 0.0007427035788294704, + "loss": 0.1371, + "num_input_tokens_seen": 107101168, + "step": 49630 + }, + { + "epoch": 8.097063621533442, + "grad_norm": 0.04701889306306839, + "learning_rate": 0.0007426413446898799, + "loss": 0.0227, + "num_input_tokens_seen": 107112464, + "step": 49635 + }, + { + "epoch": 8.097879282218598, + "grad_norm": 0.16771496832370758, + "learning_rate": 0.0007425791056328546, + "loss": 0.0488, + "num_input_tokens_seen": 107123280, + "step": 49640 + }, + { + "epoch": 8.098694942903752, + "grad_norm": 0.19909067451953888, + "learning_rate": 0.0007425168616596561, + "loss": 0.0505, + "num_input_tokens_seen": 107134800, + "step": 49645 + }, + { + "epoch": 8.099510603588907, + "grad_norm": 0.007578084245324135, + "learning_rate": 0.0007424546127715456, + "loss": 0.0395, + "num_input_tokens_seen": 107145904, + "step": 49650 + }, + { + "epoch": 8.100326264274061, + "grad_norm": 0.20924271643161774, + "learning_rate": 0.0007423923589697849, + "loss": 0.1546, + "num_input_tokens_seen": 107156464, + "step": 49655 + }, + { + "epoch": 8.101141924959217, + "grad_norm": 0.030126303434371948, + "learning_rate": 0.0007423301002556355, + "loss": 0.0432, + "num_input_tokens_seen": 107167824, + "step": 49660 + }, + { + "epoch": 8.101957585644373, + "grad_norm": 0.26863422989845276, + "learning_rate": 0.0007422678366303592, + "loss": 0.0596, + "num_input_tokens_seen": 107178928, + "step": 49665 + }, + { + "epoch": 8.102773246329527, + "grad_norm": 0.00434926338493824, + "learning_rate": 0.000742205568095218, + "loss": 0.0698, + "num_input_tokens_seen": 107191120, + "step": 49670 + }, + { + "epoch": 8.103588907014682, + "grad_norm": 0.006093534640967846, + "learning_rate": 0.0007421432946514736, + "loss": 0.0095, + "num_input_tokens_seen": 107201872, + "step": 49675 + }, + { + "epoch": 8.104404567699836, + "grad_norm": 0.01669297181069851, + "learning_rate": 0.0007420810163003881, + "loss": 0.1504, + "num_input_tokens_seen": 107214192, + "step": 49680 + }, + { + "epoch": 8.105220228384992, + "grad_norm": 0.23923209309577942, + "learning_rate": 0.0007420187330432238, + "loss": 0.0715, + "num_input_tokens_seen": 107225616, + "step": 49685 + }, + { + "epoch": 8.106035889070148, + "grad_norm": 0.022906359285116196, + "learning_rate": 0.0007419564448812428, + "loss": 0.1543, + "num_input_tokens_seen": 107236208, + "step": 49690 + }, + { + "epoch": 8.106851549755302, + "grad_norm": 0.025257373228669167, + "learning_rate": 0.0007418941518157075, + "loss": 0.0094, + "num_input_tokens_seen": 107247632, + "step": 49695 + }, + { + "epoch": 8.107667210440457, + "grad_norm": 0.004960741847753525, + "learning_rate": 0.0007418318538478803, + "loss": 0.0082, + "num_input_tokens_seen": 107258288, + "step": 49700 + }, + { + "epoch": 8.108482871125611, + "grad_norm": 0.012015601620078087, + "learning_rate": 0.0007417695509790239, + "loss": 0.0085, + "num_input_tokens_seen": 107268944, + "step": 49705 + }, + { + "epoch": 8.109298531810767, + "grad_norm": 0.13145971298217773, + "learning_rate": 0.0007417072432104007, + "loss": 0.0163, + "num_input_tokens_seen": 107279472, + "step": 49710 + }, + { + "epoch": 8.11011419249592, + "grad_norm": 0.4388778805732727, + "learning_rate": 0.0007416449305432738, + "loss": 0.0624, + "num_input_tokens_seen": 107290064, + "step": 49715 + }, + { + "epoch": 8.110929853181077, + "grad_norm": 0.0035671512596309185, + "learning_rate": 0.0007415826129789057, + "loss": 0.0405, + "num_input_tokens_seen": 107300560, + "step": 49720 + }, + { + "epoch": 8.111745513866232, + "grad_norm": 0.003578565316274762, + "learning_rate": 0.0007415202905185594, + "loss": 0.0105, + "num_input_tokens_seen": 107310512, + "step": 49725 + }, + { + "epoch": 8.112561174551386, + "grad_norm": 0.2788443863391876, + "learning_rate": 0.0007414579631634981, + "loss": 0.0434, + "num_input_tokens_seen": 107321296, + "step": 49730 + }, + { + "epoch": 8.113376835236542, + "grad_norm": 0.012547682970762253, + "learning_rate": 0.0007413956309149848, + "loss": 0.0481, + "num_input_tokens_seen": 107331920, + "step": 49735 + }, + { + "epoch": 8.114192495921696, + "grad_norm": 0.028151346370577812, + "learning_rate": 0.000741333293774283, + "loss": 0.0163, + "num_input_tokens_seen": 107342768, + "step": 49740 + }, + { + "epoch": 8.115008156606851, + "grad_norm": 0.049519214779138565, + "learning_rate": 0.0007412709517426556, + "loss": 0.0351, + "num_input_tokens_seen": 107352304, + "step": 49745 + }, + { + "epoch": 8.115823817292007, + "grad_norm": 0.0009572605486027896, + "learning_rate": 0.0007412086048213665, + "loss": 0.007, + "num_input_tokens_seen": 107362448, + "step": 49750 + }, + { + "epoch": 8.116639477977161, + "grad_norm": 0.007476430386304855, + "learning_rate": 0.000741146253011679, + "loss": 0.0151, + "num_input_tokens_seen": 107372368, + "step": 49755 + }, + { + "epoch": 8.117455138662317, + "grad_norm": 0.026140011847019196, + "learning_rate": 0.0007410838963148568, + "loss": 0.0132, + "num_input_tokens_seen": 107383120, + "step": 49760 + }, + { + "epoch": 8.11827079934747, + "grad_norm": 0.011069286614656448, + "learning_rate": 0.0007410215347321634, + "loss": 0.0401, + "num_input_tokens_seen": 107394576, + "step": 49765 + }, + { + "epoch": 8.119086460032626, + "grad_norm": 0.005492599215358496, + "learning_rate": 0.000740959168264863, + "loss": 0.0688, + "num_input_tokens_seen": 107404752, + "step": 49770 + }, + { + "epoch": 8.119902120717782, + "grad_norm": 0.0033613936975598335, + "learning_rate": 0.0007408967969142193, + "loss": 0.1009, + "num_input_tokens_seen": 107414288, + "step": 49775 + }, + { + "epoch": 8.120717781402936, + "grad_norm": 0.04260189086198807, + "learning_rate": 0.0007408344206814965, + "loss": 0.072, + "num_input_tokens_seen": 107425648, + "step": 49780 + }, + { + "epoch": 8.121533442088092, + "grad_norm": 0.03864915668964386, + "learning_rate": 0.0007407720395679585, + "loss": 0.0373, + "num_input_tokens_seen": 107437936, + "step": 49785 + }, + { + "epoch": 8.122349102773246, + "grad_norm": 0.008719193749129772, + "learning_rate": 0.0007407096535748698, + "loss": 0.008, + "num_input_tokens_seen": 107448336, + "step": 49790 + }, + { + "epoch": 8.123164763458401, + "grad_norm": 0.01019352488219738, + "learning_rate": 0.0007406472627034946, + "loss": 0.1017, + "num_input_tokens_seen": 107458416, + "step": 49795 + }, + { + "epoch": 8.123980424143557, + "grad_norm": 0.012598958797752857, + "learning_rate": 0.0007405848669550973, + "loss": 0.0992, + "num_input_tokens_seen": 107469872, + "step": 49800 + }, + { + "epoch": 8.124796084828711, + "grad_norm": 0.025490177795290947, + "learning_rate": 0.0007405224663309425, + "loss": 0.0734, + "num_input_tokens_seen": 107479760, + "step": 49805 + }, + { + "epoch": 8.125611745513867, + "grad_norm": 0.12471040338277817, + "learning_rate": 0.0007404600608322948, + "loss": 0.0251, + "num_input_tokens_seen": 107490096, + "step": 49810 + }, + { + "epoch": 8.12642740619902, + "grad_norm": 0.016378337517380714, + "learning_rate": 0.0007403976504604189, + "loss": 0.0355, + "num_input_tokens_seen": 107500368, + "step": 49815 + }, + { + "epoch": 8.127243066884176, + "grad_norm": 0.009597363881766796, + "learning_rate": 0.0007403352352165797, + "loss": 0.0197, + "num_input_tokens_seen": 107511600, + "step": 49820 + }, + { + "epoch": 8.12805872756933, + "grad_norm": 0.04834035784006119, + "learning_rate": 0.0007402728151020419, + "loss": 0.0727, + "num_input_tokens_seen": 107522672, + "step": 49825 + }, + { + "epoch": 8.128874388254486, + "grad_norm": 0.08384225517511368, + "learning_rate": 0.0007402103901180708, + "loss": 0.0088, + "num_input_tokens_seen": 107534096, + "step": 49830 + }, + { + "epoch": 8.129690048939642, + "grad_norm": 0.23516012728214264, + "learning_rate": 0.0007401479602659315, + "loss": 0.1494, + "num_input_tokens_seen": 107543472, + "step": 49835 + }, + { + "epoch": 8.130505709624796, + "grad_norm": 0.28979215025901794, + "learning_rate": 0.000740085525546889, + "loss": 0.1357, + "num_input_tokens_seen": 107554224, + "step": 49840 + }, + { + "epoch": 8.131321370309951, + "grad_norm": 0.013044273480772972, + "learning_rate": 0.0007400230859622088, + "loss": 0.0084, + "num_input_tokens_seen": 107564240, + "step": 49845 + }, + { + "epoch": 8.132137030995105, + "grad_norm": 0.03242725878953934, + "learning_rate": 0.0007399606415131563, + "loss": 0.0156, + "num_input_tokens_seen": 107574800, + "step": 49850 + }, + { + "epoch": 8.132952691680261, + "grad_norm": 0.030560357496142387, + "learning_rate": 0.0007398981922009971, + "loss": 0.0964, + "num_input_tokens_seen": 107585520, + "step": 49855 + }, + { + "epoch": 8.133768352365417, + "grad_norm": 0.3216109573841095, + "learning_rate": 0.0007398357380269966, + "loss": 0.0646, + "num_input_tokens_seen": 107596112, + "step": 49860 + }, + { + "epoch": 8.13458401305057, + "grad_norm": 0.038035932928323746, + "learning_rate": 0.0007397732789924205, + "loss": 0.0272, + "num_input_tokens_seen": 107607120, + "step": 49865 + }, + { + "epoch": 8.135399673735726, + "grad_norm": 1.2136625051498413, + "learning_rate": 0.0007397108150985349, + "loss": 0.0475, + "num_input_tokens_seen": 107618000, + "step": 49870 + }, + { + "epoch": 8.13621533442088, + "grad_norm": 0.3429149091243744, + "learning_rate": 0.0007396483463466055, + "loss": 0.1118, + "num_input_tokens_seen": 107629232, + "step": 49875 + }, + { + "epoch": 8.137030995106036, + "grad_norm": 0.1950923502445221, + "learning_rate": 0.0007395858727378982, + "loss": 0.042, + "num_input_tokens_seen": 107640784, + "step": 49880 + }, + { + "epoch": 8.137846655791192, + "grad_norm": 0.22323675453662872, + "learning_rate": 0.0007395233942736794, + "loss": 0.0602, + "num_input_tokens_seen": 107650576, + "step": 49885 + }, + { + "epoch": 8.138662316476346, + "grad_norm": 0.00939212180674076, + "learning_rate": 0.0007394609109552152, + "loss": 0.0062, + "num_input_tokens_seen": 107660368, + "step": 49890 + }, + { + "epoch": 8.139477977161501, + "grad_norm": 0.028932545334100723, + "learning_rate": 0.0007393984227837718, + "loss": 0.0347, + "num_input_tokens_seen": 107670096, + "step": 49895 + }, + { + "epoch": 8.140293637846655, + "grad_norm": 0.2399331033229828, + "learning_rate": 0.0007393359297606155, + "loss": 0.0703, + "num_input_tokens_seen": 107680240, + "step": 49900 + }, + { + "epoch": 8.141109298531811, + "grad_norm": 0.003571945009753108, + "learning_rate": 0.0007392734318870133, + "loss": 0.0266, + "num_input_tokens_seen": 107691504, + "step": 49905 + }, + { + "epoch": 8.141924959216965, + "grad_norm": 0.3337574005126953, + "learning_rate": 0.0007392109291642311, + "loss": 0.1991, + "num_input_tokens_seen": 107701968, + "step": 49910 + }, + { + "epoch": 8.14274061990212, + "grad_norm": 0.26637575030326843, + "learning_rate": 0.0007391484215935363, + "loss": 0.2421, + "num_input_tokens_seen": 107713136, + "step": 49915 + }, + { + "epoch": 8.143556280587276, + "grad_norm": 0.03772047162055969, + "learning_rate": 0.000739085909176195, + "loss": 0.0493, + "num_input_tokens_seen": 107724080, + "step": 49920 + }, + { + "epoch": 8.14437194127243, + "grad_norm": 0.15571381151676178, + "learning_rate": 0.0007390233919134747, + "loss": 0.0255, + "num_input_tokens_seen": 107735792, + "step": 49925 + }, + { + "epoch": 8.145187601957586, + "grad_norm": 0.005958837457001209, + "learning_rate": 0.0007389608698066422, + "loss": 0.0933, + "num_input_tokens_seen": 107747280, + "step": 49930 + }, + { + "epoch": 8.14600326264274, + "grad_norm": 0.08870885521173477, + "learning_rate": 0.0007388983428569643, + "loss": 0.1145, + "num_input_tokens_seen": 107758448, + "step": 49935 + }, + { + "epoch": 8.146818923327896, + "grad_norm": 0.10818574577569962, + "learning_rate": 0.0007388358110657085, + "loss": 0.0388, + "num_input_tokens_seen": 107769712, + "step": 49940 + }, + { + "epoch": 8.147634584013051, + "grad_norm": 0.0035674839746207, + "learning_rate": 0.000738773274434142, + "loss": 0.0793, + "num_input_tokens_seen": 107780624, + "step": 49945 + }, + { + "epoch": 8.148450244698205, + "grad_norm": 0.004521622788161039, + "learning_rate": 0.0007387107329635322, + "loss": 0.0147, + "num_input_tokens_seen": 107790800, + "step": 49950 + }, + { + "epoch": 8.149265905383361, + "grad_norm": 0.057634882628917694, + "learning_rate": 0.0007386481866551466, + "loss": 0.1094, + "num_input_tokens_seen": 107802736, + "step": 49955 + }, + { + "epoch": 8.150081566068515, + "grad_norm": 0.036182701587677, + "learning_rate": 0.0007385856355102528, + "loss": 0.0186, + "num_input_tokens_seen": 107813552, + "step": 49960 + }, + { + "epoch": 8.15089722675367, + "grad_norm": 0.017950497567653656, + "learning_rate": 0.0007385230795301183, + "loss": 0.1447, + "num_input_tokens_seen": 107825264, + "step": 49965 + }, + { + "epoch": 8.151712887438826, + "grad_norm": 0.0023306964430958033, + "learning_rate": 0.000738460518716011, + "loss": 0.023, + "num_input_tokens_seen": 107836880, + "step": 49970 + }, + { + "epoch": 8.15252854812398, + "grad_norm": 0.005620780400931835, + "learning_rate": 0.0007383979530691989, + "loss": 0.029, + "num_input_tokens_seen": 107849392, + "step": 49975 + }, + { + "epoch": 8.153344208809136, + "grad_norm": 0.17551542818546295, + "learning_rate": 0.0007383353825909498, + "loss": 0.0508, + "num_input_tokens_seen": 107860144, + "step": 49980 + }, + { + "epoch": 8.15415986949429, + "grad_norm": 0.015075127594172955, + "learning_rate": 0.0007382728072825318, + "loss": 0.0393, + "num_input_tokens_seen": 107869520, + "step": 49985 + }, + { + "epoch": 8.154975530179446, + "grad_norm": 0.031038962304592133, + "learning_rate": 0.0007382102271452132, + "loss": 0.188, + "num_input_tokens_seen": 107879792, + "step": 49990 + }, + { + "epoch": 8.1557911908646, + "grad_norm": 0.017149314284324646, + "learning_rate": 0.0007381476421802621, + "loss": 0.0323, + "num_input_tokens_seen": 107890672, + "step": 49995 + }, + { + "epoch": 8.156606851549755, + "grad_norm": 0.037959400564432144, + "learning_rate": 0.0007380850523889469, + "loss": 0.1079, + "num_input_tokens_seen": 107900528, + "step": 50000 + }, + { + "epoch": 8.15742251223491, + "grad_norm": 0.08239606767892838, + "learning_rate": 0.0007380224577725361, + "loss": 0.0587, + "num_input_tokens_seen": 107911248, + "step": 50005 + }, + { + "epoch": 8.158238172920065, + "grad_norm": 0.005146282725036144, + "learning_rate": 0.0007379598583322982, + "loss": 0.011, + "num_input_tokens_seen": 107922800, + "step": 50010 + }, + { + "epoch": 8.15905383360522, + "grad_norm": 0.04264938831329346, + "learning_rate": 0.0007378972540695019, + "loss": 0.0238, + "num_input_tokens_seen": 107934032, + "step": 50015 + }, + { + "epoch": 8.159869494290374, + "grad_norm": 0.006742180325090885, + "learning_rate": 0.0007378346449854159, + "loss": 0.0439, + "num_input_tokens_seen": 107946128, + "step": 50020 + }, + { + "epoch": 8.16068515497553, + "grad_norm": 0.0027138078585267067, + "learning_rate": 0.0007377720310813092, + "loss": 0.0763, + "num_input_tokens_seen": 107956912, + "step": 50025 + }, + { + "epoch": 8.161500815660686, + "grad_norm": 0.22717513144016266, + "learning_rate": 0.0007377094123584507, + "loss": 0.0404, + "num_input_tokens_seen": 107968112, + "step": 50030 + }, + { + "epoch": 8.16231647634584, + "grad_norm": 0.005620263051241636, + "learning_rate": 0.0007376467888181094, + "loss": 0.0123, + "num_input_tokens_seen": 107978352, + "step": 50035 + }, + { + "epoch": 8.163132137030995, + "grad_norm": 0.11898821592330933, + "learning_rate": 0.0007375841604615542, + "loss": 0.0766, + "num_input_tokens_seen": 107989264, + "step": 50040 + }, + { + "epoch": 8.16394779771615, + "grad_norm": 0.1939682960510254, + "learning_rate": 0.0007375215272900548, + "loss": 0.097, + "num_input_tokens_seen": 107999888, + "step": 50045 + }, + { + "epoch": 8.164763458401305, + "grad_norm": 0.006276941858232021, + "learning_rate": 0.0007374588893048803, + "loss": 0.1679, + "num_input_tokens_seen": 108009680, + "step": 50050 + }, + { + "epoch": 8.16557911908646, + "grad_norm": 0.19604939222335815, + "learning_rate": 0.0007373962465073002, + "loss": 0.075, + "num_input_tokens_seen": 108020368, + "step": 50055 + }, + { + "epoch": 8.166394779771615, + "grad_norm": 0.012251893989741802, + "learning_rate": 0.0007373335988985839, + "loss": 0.0706, + "num_input_tokens_seen": 108030736, + "step": 50060 + }, + { + "epoch": 8.16721044045677, + "grad_norm": 0.0021199060138314962, + "learning_rate": 0.0007372709464800013, + "loss": 0.0204, + "num_input_tokens_seen": 108041232, + "step": 50065 + }, + { + "epoch": 8.168026101141924, + "grad_norm": 0.03952307999134064, + "learning_rate": 0.0007372082892528218, + "loss": 0.0694, + "num_input_tokens_seen": 108053264, + "step": 50070 + }, + { + "epoch": 8.16884176182708, + "grad_norm": 0.028476953506469727, + "learning_rate": 0.0007371456272183156, + "loss": 0.012, + "num_input_tokens_seen": 108063088, + "step": 50075 + }, + { + "epoch": 8.169657422512234, + "grad_norm": 0.0072764987125992775, + "learning_rate": 0.0007370829603777523, + "loss": 0.0079, + "num_input_tokens_seen": 108072688, + "step": 50080 + }, + { + "epoch": 8.17047308319739, + "grad_norm": 0.01800713501870632, + "learning_rate": 0.000737020288732402, + "loss": 0.0162, + "num_input_tokens_seen": 108085296, + "step": 50085 + }, + { + "epoch": 8.171288743882545, + "grad_norm": 0.08102546632289886, + "learning_rate": 0.0007369576122835349, + "loss": 0.0246, + "num_input_tokens_seen": 108096816, + "step": 50090 + }, + { + "epoch": 8.1721044045677, + "grad_norm": 0.0048878430388867855, + "learning_rate": 0.0007368949310324211, + "loss": 0.1114, + "num_input_tokens_seen": 108107280, + "step": 50095 + }, + { + "epoch": 8.172920065252855, + "grad_norm": 0.002329473849385977, + "learning_rate": 0.0007368322449803311, + "loss": 0.0232, + "num_input_tokens_seen": 108117552, + "step": 50100 + }, + { + "epoch": 8.173735725938009, + "grad_norm": 0.0033999725710600615, + "learning_rate": 0.0007367695541285353, + "loss": 0.0198, + "num_input_tokens_seen": 108129264, + "step": 50105 + }, + { + "epoch": 8.174551386623165, + "grad_norm": 0.30080005526542664, + "learning_rate": 0.0007367068584783041, + "loss": 0.0462, + "num_input_tokens_seen": 108139472, + "step": 50110 + }, + { + "epoch": 8.17536704730832, + "grad_norm": 0.2299824357032776, + "learning_rate": 0.000736644158030908, + "loss": 0.0776, + "num_input_tokens_seen": 108149712, + "step": 50115 + }, + { + "epoch": 8.176182707993474, + "grad_norm": 0.012273544445633888, + "learning_rate": 0.0007365814527876179, + "loss": 0.0177, + "num_input_tokens_seen": 108160464, + "step": 50120 + }, + { + "epoch": 8.17699836867863, + "grad_norm": 0.27951616048812866, + "learning_rate": 0.0007365187427497045, + "loss": 0.1404, + "num_input_tokens_seen": 108171312, + "step": 50125 + }, + { + "epoch": 8.177814029363784, + "grad_norm": 0.5882936716079712, + "learning_rate": 0.0007364560279184387, + "loss": 0.0416, + "num_input_tokens_seen": 108181616, + "step": 50130 + }, + { + "epoch": 8.17862969004894, + "grad_norm": 0.003504018299281597, + "learning_rate": 0.0007363933082950917, + "loss": 0.1142, + "num_input_tokens_seen": 108191792, + "step": 50135 + }, + { + "epoch": 8.179445350734095, + "grad_norm": 0.005385532975196838, + "learning_rate": 0.0007363305838809344, + "loss": 0.111, + "num_input_tokens_seen": 108203216, + "step": 50140 + }, + { + "epoch": 8.18026101141925, + "grad_norm": 0.017848767340183258, + "learning_rate": 0.0007362678546772379, + "loss": 0.0444, + "num_input_tokens_seen": 108214000, + "step": 50145 + }, + { + "epoch": 8.181076672104405, + "grad_norm": 0.29265037178993225, + "learning_rate": 0.0007362051206852736, + "loss": 0.0592, + "num_input_tokens_seen": 108224528, + "step": 50150 + }, + { + "epoch": 8.181892332789559, + "grad_norm": 0.02796623483300209, + "learning_rate": 0.0007361423819063128, + "loss": 0.0128, + "num_input_tokens_seen": 108235088, + "step": 50155 + }, + { + "epoch": 8.182707993474715, + "grad_norm": 0.021538639441132545, + "learning_rate": 0.0007360796383416273, + "loss": 0.0215, + "num_input_tokens_seen": 108245616, + "step": 50160 + }, + { + "epoch": 8.18352365415987, + "grad_norm": 0.21633203327655792, + "learning_rate": 0.0007360168899924883, + "loss": 0.0764, + "num_input_tokens_seen": 108256144, + "step": 50165 + }, + { + "epoch": 8.184339314845024, + "grad_norm": 0.00879256147891283, + "learning_rate": 0.0007359541368601675, + "loss": 0.059, + "num_input_tokens_seen": 108267248, + "step": 50170 + }, + { + "epoch": 8.18515497553018, + "grad_norm": 0.057316552847623825, + "learning_rate": 0.0007358913789459369, + "loss": 0.0592, + "num_input_tokens_seen": 108278128, + "step": 50175 + }, + { + "epoch": 8.185970636215334, + "grad_norm": 0.1072782501578331, + "learning_rate": 0.0007358286162510683, + "loss": 0.0814, + "num_input_tokens_seen": 108290352, + "step": 50180 + }, + { + "epoch": 8.18678629690049, + "grad_norm": 0.0840066447854042, + "learning_rate": 0.0007357658487768337, + "loss": 0.0154, + "num_input_tokens_seen": 108301776, + "step": 50185 + }, + { + "epoch": 8.187601957585644, + "grad_norm": 0.37570205330848694, + "learning_rate": 0.0007357030765245049, + "loss": 0.0582, + "num_input_tokens_seen": 108312208, + "step": 50190 + }, + { + "epoch": 8.1884176182708, + "grad_norm": 0.16873884201049805, + "learning_rate": 0.0007356402994953544, + "loss": 0.038, + "num_input_tokens_seen": 108322480, + "step": 50195 + }, + { + "epoch": 8.189233278955955, + "grad_norm": 0.009506826288998127, + "learning_rate": 0.0007355775176906543, + "loss": 0.0181, + "num_input_tokens_seen": 108333392, + "step": 50200 + }, + { + "epoch": 8.190048939641109, + "grad_norm": 0.00810005608946085, + "learning_rate": 0.0007355147311116768, + "loss": 0.018, + "num_input_tokens_seen": 108343856, + "step": 50205 + }, + { + "epoch": 8.190864600326265, + "grad_norm": 0.015775280073285103, + "learning_rate": 0.0007354519397596946, + "loss": 0.0816, + "num_input_tokens_seen": 108355728, + "step": 50210 + }, + { + "epoch": 8.191680261011419, + "grad_norm": 0.005609696730971336, + "learning_rate": 0.0007353891436359801, + "loss": 0.1723, + "num_input_tokens_seen": 108366736, + "step": 50215 + }, + { + "epoch": 8.192495921696574, + "grad_norm": 0.0420515201985836, + "learning_rate": 0.000735326342741806, + "loss": 0.0128, + "num_input_tokens_seen": 108376912, + "step": 50220 + }, + { + "epoch": 8.19331158238173, + "grad_norm": 0.010869069956243038, + "learning_rate": 0.0007352635370784451, + "loss": 0.0684, + "num_input_tokens_seen": 108387440, + "step": 50225 + }, + { + "epoch": 8.194127243066884, + "grad_norm": 0.040659014135599136, + "learning_rate": 0.00073520072664717, + "loss": 0.1584, + "num_input_tokens_seen": 108397232, + "step": 50230 + }, + { + "epoch": 8.19494290375204, + "grad_norm": 0.14344748854637146, + "learning_rate": 0.000735137911449254, + "loss": 0.0373, + "num_input_tokens_seen": 108408016, + "step": 50235 + }, + { + "epoch": 8.195758564437194, + "grad_norm": 0.2394501119852066, + "learning_rate": 0.0007350750914859698, + "loss": 0.1266, + "num_input_tokens_seen": 108419024, + "step": 50240 + }, + { + "epoch": 8.19657422512235, + "grad_norm": 0.03212181478738785, + "learning_rate": 0.0007350122667585908, + "loss": 0.0187, + "num_input_tokens_seen": 108430320, + "step": 50245 + }, + { + "epoch": 8.197389885807505, + "grad_norm": 0.0022966123651713133, + "learning_rate": 0.0007349494372683899, + "loss": 0.0139, + "num_input_tokens_seen": 108441104, + "step": 50250 + }, + { + "epoch": 8.198205546492659, + "grad_norm": 0.018729545176029205, + "learning_rate": 0.0007348866030166407, + "loss": 0.1584, + "num_input_tokens_seen": 108452656, + "step": 50255 + }, + { + "epoch": 8.199021207177815, + "grad_norm": 0.3668605387210846, + "learning_rate": 0.0007348237640046165, + "loss": 0.1341, + "num_input_tokens_seen": 108462992, + "step": 50260 + }, + { + "epoch": 8.199836867862969, + "grad_norm": 0.0057599726133048534, + "learning_rate": 0.0007347609202335907, + "loss": 0.0422, + "num_input_tokens_seen": 108473744, + "step": 50265 + }, + { + "epoch": 8.200652528548124, + "grad_norm": 0.06297006458044052, + "learning_rate": 0.0007346980717048373, + "loss": 0.0382, + "num_input_tokens_seen": 108485136, + "step": 50270 + }, + { + "epoch": 8.201468189233278, + "grad_norm": 0.14650148153305054, + "learning_rate": 0.0007346352184196296, + "loss": 0.0256, + "num_input_tokens_seen": 108496720, + "step": 50275 + }, + { + "epoch": 8.202283849918434, + "grad_norm": 0.5659634470939636, + "learning_rate": 0.0007345723603792415, + "loss": 0.0923, + "num_input_tokens_seen": 108508016, + "step": 50280 + }, + { + "epoch": 8.20309951060359, + "grad_norm": 0.27211794257164, + "learning_rate": 0.000734509497584947, + "loss": 0.0721, + "num_input_tokens_seen": 108517872, + "step": 50285 + }, + { + "epoch": 8.203915171288743, + "grad_norm": 0.34656408429145813, + "learning_rate": 0.0007344466300380201, + "loss": 0.0811, + "num_input_tokens_seen": 108527280, + "step": 50290 + }, + { + "epoch": 8.2047308319739, + "grad_norm": 0.0060843355022370815, + "learning_rate": 0.0007343837577397347, + "loss": 0.1149, + "num_input_tokens_seen": 108538192, + "step": 50295 + }, + { + "epoch": 8.205546492659053, + "grad_norm": 0.29166820645332336, + "learning_rate": 0.0007343208806913651, + "loss": 0.0923, + "num_input_tokens_seen": 108546960, + "step": 50300 + }, + { + "epoch": 8.206362153344209, + "grad_norm": 0.05073179677128792, + "learning_rate": 0.0007342579988941858, + "loss": 0.0215, + "num_input_tokens_seen": 108556912, + "step": 50305 + }, + { + "epoch": 8.207177814029365, + "grad_norm": 0.13538983464241028, + "learning_rate": 0.0007341951123494708, + "loss": 0.0257, + "num_input_tokens_seen": 108568144, + "step": 50310 + }, + { + "epoch": 8.207993474714518, + "grad_norm": 0.005452352110296488, + "learning_rate": 0.0007341322210584947, + "loss": 0.0055, + "num_input_tokens_seen": 108578160, + "step": 50315 + }, + { + "epoch": 8.208809135399674, + "grad_norm": 0.0064372471533715725, + "learning_rate": 0.0007340693250225322, + "loss": 0.0443, + "num_input_tokens_seen": 108589296, + "step": 50320 + }, + { + "epoch": 8.209624796084828, + "grad_norm": 0.026119455695152283, + "learning_rate": 0.0007340064242428579, + "loss": 0.0573, + "num_input_tokens_seen": 108600240, + "step": 50325 + }, + { + "epoch": 8.210440456769984, + "grad_norm": 0.18645314872264862, + "learning_rate": 0.0007339435187207466, + "loss": 0.0204, + "num_input_tokens_seen": 108611056, + "step": 50330 + }, + { + "epoch": 8.21125611745514, + "grad_norm": 0.004375093150883913, + "learning_rate": 0.0007338806084574731, + "loss": 0.1869, + "num_input_tokens_seen": 108620400, + "step": 50335 + }, + { + "epoch": 8.212071778140293, + "grad_norm": 0.024965085089206696, + "learning_rate": 0.0007338176934543124, + "loss": 0.0186, + "num_input_tokens_seen": 108632432, + "step": 50340 + }, + { + "epoch": 8.21288743882545, + "grad_norm": 0.07735445350408554, + "learning_rate": 0.0007337547737125394, + "loss": 0.2509, + "num_input_tokens_seen": 108642896, + "step": 50345 + }, + { + "epoch": 8.213703099510603, + "grad_norm": 0.006733644753694534, + "learning_rate": 0.0007336918492334294, + "loss": 0.0664, + "num_input_tokens_seen": 108652816, + "step": 50350 + }, + { + "epoch": 8.214518760195759, + "grad_norm": 0.004174524452537298, + "learning_rate": 0.0007336289200182576, + "loss": 0.0636, + "num_input_tokens_seen": 108663792, + "step": 50355 + }, + { + "epoch": 8.215334420880913, + "grad_norm": 0.015392466448247433, + "learning_rate": 0.0007335659860682994, + "loss": 0.0314, + "num_input_tokens_seen": 108674896, + "step": 50360 + }, + { + "epoch": 8.216150081566068, + "grad_norm": 0.026614753529429436, + "learning_rate": 0.0007335030473848302, + "loss": 0.0278, + "num_input_tokens_seen": 108685584, + "step": 50365 + }, + { + "epoch": 8.216965742251224, + "grad_norm": 0.16512061655521393, + "learning_rate": 0.0007334401039691255, + "loss": 0.0304, + "num_input_tokens_seen": 108696560, + "step": 50370 + }, + { + "epoch": 8.217781402936378, + "grad_norm": 0.012778915464878082, + "learning_rate": 0.000733377155822461, + "loss": 0.0113, + "num_input_tokens_seen": 108706512, + "step": 50375 + }, + { + "epoch": 8.218597063621534, + "grad_norm": 0.3787010908126831, + "learning_rate": 0.0007333142029461124, + "loss": 0.1446, + "num_input_tokens_seen": 108717808, + "step": 50380 + }, + { + "epoch": 8.219412724306688, + "grad_norm": 0.16041482985019684, + "learning_rate": 0.0007332512453413555, + "loss": 0.0387, + "num_input_tokens_seen": 108729040, + "step": 50385 + }, + { + "epoch": 8.220228384991843, + "grad_norm": 0.0862046629190445, + "learning_rate": 0.0007331882830094661, + "loss": 0.0587, + "num_input_tokens_seen": 108739536, + "step": 50390 + }, + { + "epoch": 8.221044045676999, + "grad_norm": 0.04885483533143997, + "learning_rate": 0.0007331253159517204, + "loss": 0.03, + "num_input_tokens_seen": 108750640, + "step": 50395 + }, + { + "epoch": 8.221859706362153, + "grad_norm": 0.012324989773333073, + "learning_rate": 0.0007330623441693944, + "loss": 0.0176, + "num_input_tokens_seen": 108760368, + "step": 50400 + }, + { + "epoch": 8.222675367047309, + "grad_norm": 0.16954010725021362, + "learning_rate": 0.0007329993676637643, + "loss": 0.0699, + "num_input_tokens_seen": 108772336, + "step": 50405 + }, + { + "epoch": 8.223491027732463, + "grad_norm": 0.24897179007530212, + "learning_rate": 0.0007329363864361065, + "loss": 0.0876, + "num_input_tokens_seen": 108782800, + "step": 50410 + }, + { + "epoch": 8.224306688417618, + "grad_norm": 0.0013731664512306452, + "learning_rate": 0.0007328734004876974, + "loss": 0.0164, + "num_input_tokens_seen": 108794000, + "step": 50415 + }, + { + "epoch": 8.225122349102774, + "grad_norm": 0.07233874499797821, + "learning_rate": 0.0007328104098198131, + "loss": 0.0081, + "num_input_tokens_seen": 108805392, + "step": 50420 + }, + { + "epoch": 8.225938009787928, + "grad_norm": 0.05255259573459625, + "learning_rate": 0.000732747414433731, + "loss": 0.0146, + "num_input_tokens_seen": 108817616, + "step": 50425 + }, + { + "epoch": 8.226753670473084, + "grad_norm": 0.037548601627349854, + "learning_rate": 0.000732684414330727, + "loss": 0.014, + "num_input_tokens_seen": 108829360, + "step": 50430 + }, + { + "epoch": 8.227569331158238, + "grad_norm": 0.1310933530330658, + "learning_rate": 0.0007326214095120781, + "loss": 0.1706, + "num_input_tokens_seen": 108839696, + "step": 50435 + }, + { + "epoch": 8.228384991843393, + "grad_norm": 0.0031148619018495083, + "learning_rate": 0.0007325583999790613, + "loss": 0.0142, + "num_input_tokens_seen": 108851184, + "step": 50440 + }, + { + "epoch": 8.229200652528547, + "grad_norm": 0.010511726140975952, + "learning_rate": 0.0007324953857329535, + "loss": 0.1164, + "num_input_tokens_seen": 108861744, + "step": 50445 + }, + { + "epoch": 8.230016313213703, + "grad_norm": 0.0679972916841507, + "learning_rate": 0.0007324323667750319, + "loss": 0.0244, + "num_input_tokens_seen": 108872816, + "step": 50450 + }, + { + "epoch": 8.230831973898859, + "grad_norm": 0.008308612741529942, + "learning_rate": 0.0007323693431065734, + "loss": 0.0676, + "num_input_tokens_seen": 108884688, + "step": 50455 + }, + { + "epoch": 8.231647634584013, + "grad_norm": 0.15962466597557068, + "learning_rate": 0.0007323063147288553, + "loss": 0.0392, + "num_input_tokens_seen": 108896208, + "step": 50460 + }, + { + "epoch": 8.232463295269168, + "grad_norm": 0.010848425328731537, + "learning_rate": 0.0007322432816431551, + "loss": 0.1119, + "num_input_tokens_seen": 108907536, + "step": 50465 + }, + { + "epoch": 8.233278955954322, + "grad_norm": 0.001676029758527875, + "learning_rate": 0.0007321802438507502, + "loss": 0.0301, + "num_input_tokens_seen": 108917360, + "step": 50470 + }, + { + "epoch": 8.234094616639478, + "grad_norm": 0.009221650660037994, + "learning_rate": 0.0007321172013529182, + "loss": 0.0853, + "num_input_tokens_seen": 108927952, + "step": 50475 + }, + { + "epoch": 8.234910277324634, + "grad_norm": 0.005944567266851664, + "learning_rate": 0.0007320541541509366, + "loss": 0.0075, + "num_input_tokens_seen": 108939248, + "step": 50480 + }, + { + "epoch": 8.235725938009788, + "grad_norm": 0.01497675385326147, + "learning_rate": 0.0007319911022460831, + "loss": 0.1476, + "num_input_tokens_seen": 108950832, + "step": 50485 + }, + { + "epoch": 8.236541598694943, + "grad_norm": 0.031807366758584976, + "learning_rate": 0.0007319280456396357, + "loss": 0.0927, + "num_input_tokens_seen": 108961488, + "step": 50490 + }, + { + "epoch": 8.237357259380097, + "grad_norm": 0.2277534306049347, + "learning_rate": 0.0007318649843328722, + "loss": 0.0527, + "num_input_tokens_seen": 108971888, + "step": 50495 + }, + { + "epoch": 8.238172920065253, + "grad_norm": 0.045359063893556595, + "learning_rate": 0.0007318019183270707, + "loss": 0.0215, + "num_input_tokens_seen": 108983856, + "step": 50500 + }, + { + "epoch": 8.238988580750409, + "grad_norm": 0.05261260271072388, + "learning_rate": 0.0007317388476235091, + "loss": 0.1815, + "num_input_tokens_seen": 108995408, + "step": 50505 + }, + { + "epoch": 8.239804241435563, + "grad_norm": 0.0012343940325081348, + "learning_rate": 0.0007316757722234659, + "loss": 0.135, + "num_input_tokens_seen": 109006736, + "step": 50510 + }, + { + "epoch": 8.240619902120718, + "grad_norm": 0.07976092398166656, + "learning_rate": 0.0007316126921282193, + "loss": 0.0162, + "num_input_tokens_seen": 109016528, + "step": 50515 + }, + { + "epoch": 8.241435562805872, + "grad_norm": 0.24189190566539764, + "learning_rate": 0.0007315496073390477, + "loss": 0.0947, + "num_input_tokens_seen": 109028688, + "step": 50520 + }, + { + "epoch": 8.242251223491028, + "grad_norm": 0.039111923426389694, + "learning_rate": 0.0007314865178572295, + "loss": 0.1213, + "num_input_tokens_seen": 109039792, + "step": 50525 + }, + { + "epoch": 8.243066884176184, + "grad_norm": 0.006544589996337891, + "learning_rate": 0.0007314234236840434, + "loss": 0.0194, + "num_input_tokens_seen": 109050416, + "step": 50530 + }, + { + "epoch": 8.243882544861338, + "grad_norm": 0.020613284781575203, + "learning_rate": 0.000731360324820768, + "loss": 0.1046, + "num_input_tokens_seen": 109060816, + "step": 50535 + }, + { + "epoch": 8.244698205546493, + "grad_norm": 0.009807380847632885, + "learning_rate": 0.000731297221268682, + "loss": 0.016, + "num_input_tokens_seen": 109071920, + "step": 50540 + }, + { + "epoch": 8.245513866231647, + "grad_norm": 0.02374882809817791, + "learning_rate": 0.0007312341130290645, + "loss": 0.0222, + "num_input_tokens_seen": 109082512, + "step": 50545 + }, + { + "epoch": 8.246329526916803, + "grad_norm": 0.03517920896410942, + "learning_rate": 0.0007311710001031943, + "loss": 0.1215, + "num_input_tokens_seen": 109093744, + "step": 50550 + }, + { + "epoch": 8.247145187601957, + "grad_norm": 0.06269794702529907, + "learning_rate": 0.0007311078824923506, + "loss": 0.0267, + "num_input_tokens_seen": 109104368, + "step": 50555 + }, + { + "epoch": 8.247960848287113, + "grad_norm": 0.17267194390296936, + "learning_rate": 0.0007310447601978125, + "loss": 0.0846, + "num_input_tokens_seen": 109115920, + "step": 50560 + }, + { + "epoch": 8.248776508972268, + "grad_norm": 0.13185855746269226, + "learning_rate": 0.0007309816332208592, + "loss": 0.0433, + "num_input_tokens_seen": 109127088, + "step": 50565 + }, + { + "epoch": 8.249592169657422, + "grad_norm": 0.0029770960099995136, + "learning_rate": 0.00073091850156277, + "loss": 0.0282, + "num_input_tokens_seen": 109138736, + "step": 50570 + }, + { + "epoch": 8.250407830342578, + "grad_norm": 0.020211854949593544, + "learning_rate": 0.0007308553652248244, + "loss": 0.0849, + "num_input_tokens_seen": 109149488, + "step": 50575 + }, + { + "epoch": 8.251223491027732, + "grad_norm": 0.04552924633026123, + "learning_rate": 0.0007307922242083022, + "loss": 0.0248, + "num_input_tokens_seen": 109160656, + "step": 50580 + }, + { + "epoch": 8.252039151712887, + "grad_norm": 0.10104500502347946, + "learning_rate": 0.0007307290785144826, + "loss": 0.0198, + "num_input_tokens_seen": 109171312, + "step": 50585 + }, + { + "epoch": 8.252854812398043, + "grad_norm": 0.3054099977016449, + "learning_rate": 0.0007306659281446456, + "loss": 0.0375, + "num_input_tokens_seen": 109182384, + "step": 50590 + }, + { + "epoch": 8.253670473083197, + "grad_norm": 0.0024543420877307653, + "learning_rate": 0.000730602773100071, + "loss": 0.0135, + "num_input_tokens_seen": 109191792, + "step": 50595 + }, + { + "epoch": 8.254486133768353, + "grad_norm": 0.11228777468204498, + "learning_rate": 0.0007305396133820385, + "loss": 0.0943, + "num_input_tokens_seen": 109202960, + "step": 50600 + }, + { + "epoch": 8.255301794453507, + "grad_norm": 0.012242484837770462, + "learning_rate": 0.0007304764489918284, + "loss": 0.1144, + "num_input_tokens_seen": 109214288, + "step": 50605 + }, + { + "epoch": 8.256117455138662, + "grad_norm": 0.0036556487902998924, + "learning_rate": 0.0007304132799307206, + "loss": 0.0157, + "num_input_tokens_seen": 109225328, + "step": 50610 + }, + { + "epoch": 8.256933115823816, + "grad_norm": 0.032981522381305695, + "learning_rate": 0.0007303501061999956, + "loss": 0.0795, + "num_input_tokens_seen": 109236016, + "step": 50615 + }, + { + "epoch": 8.257748776508972, + "grad_norm": 0.02444942481815815, + "learning_rate": 0.0007302869278009332, + "loss": 0.2139, + "num_input_tokens_seen": 109247344, + "step": 50620 + }, + { + "epoch": 8.258564437194128, + "grad_norm": 0.09682659804821014, + "learning_rate": 0.0007302237447348141, + "loss": 0.0432, + "num_input_tokens_seen": 109258864, + "step": 50625 + }, + { + "epoch": 8.259380097879282, + "grad_norm": 0.004521368537098169, + "learning_rate": 0.0007301605570029189, + "loss": 0.0464, + "num_input_tokens_seen": 109270192, + "step": 50630 + }, + { + "epoch": 8.260195758564437, + "grad_norm": 0.00717537896707654, + "learning_rate": 0.000730097364606528, + "loss": 0.0161, + "num_input_tokens_seen": 109280016, + "step": 50635 + }, + { + "epoch": 8.261011419249591, + "grad_norm": 0.01471832487732172, + "learning_rate": 0.000730034167546922, + "loss": 0.0451, + "num_input_tokens_seen": 109290256, + "step": 50640 + }, + { + "epoch": 8.261827079934747, + "grad_norm": 0.018939554691314697, + "learning_rate": 0.0007299709658253819, + "loss": 0.0493, + "num_input_tokens_seen": 109300208, + "step": 50645 + }, + { + "epoch": 8.262642740619903, + "grad_norm": 0.10142449289560318, + "learning_rate": 0.0007299077594431885, + "loss": 0.1141, + "num_input_tokens_seen": 109310960, + "step": 50650 + }, + { + "epoch": 8.263458401305057, + "grad_norm": 0.036198537796735764, + "learning_rate": 0.0007298445484016225, + "loss": 0.0451, + "num_input_tokens_seen": 109321040, + "step": 50655 + }, + { + "epoch": 8.264274061990212, + "grad_norm": 0.013958665542304516, + "learning_rate": 0.0007297813327019652, + "loss": 0.1055, + "num_input_tokens_seen": 109331056, + "step": 50660 + }, + { + "epoch": 8.265089722675366, + "grad_norm": 0.021503480151295662, + "learning_rate": 0.0007297181123454977, + "loss": 0.1026, + "num_input_tokens_seen": 109340816, + "step": 50665 + }, + { + "epoch": 8.265905383360522, + "grad_norm": 0.0584653802216053, + "learning_rate": 0.0007296548873335013, + "loss": 0.0659, + "num_input_tokens_seen": 109351984, + "step": 50670 + }, + { + "epoch": 8.266721044045678, + "grad_norm": 0.01251292135566473, + "learning_rate": 0.0007295916576672572, + "loss": 0.057, + "num_input_tokens_seen": 109361616, + "step": 50675 + }, + { + "epoch": 8.267536704730832, + "grad_norm": 0.2499684989452362, + "learning_rate": 0.0007295284233480468, + "loss": 0.0319, + "num_input_tokens_seen": 109372240, + "step": 50680 + }, + { + "epoch": 8.268352365415987, + "grad_norm": 0.04325515404343605, + "learning_rate": 0.0007294651843771519, + "loss": 0.0339, + "num_input_tokens_seen": 109383120, + "step": 50685 + }, + { + "epoch": 8.269168026101141, + "grad_norm": 0.008919685147702694, + "learning_rate": 0.0007294019407558538, + "loss": 0.0221, + "num_input_tokens_seen": 109391952, + "step": 50690 + }, + { + "epoch": 8.269983686786297, + "grad_norm": 0.019909190014004707, + "learning_rate": 0.0007293386924854346, + "loss": 0.0186, + "num_input_tokens_seen": 109402576, + "step": 50695 + }, + { + "epoch": 8.270799347471453, + "grad_norm": 0.40029022097587585, + "learning_rate": 0.0007292754395671757, + "loss": 0.0675, + "num_input_tokens_seen": 109412976, + "step": 50700 + }, + { + "epoch": 8.271615008156607, + "grad_norm": 0.06428990513086319, + "learning_rate": 0.0007292121820023592, + "loss": 0.041, + "num_input_tokens_seen": 109425040, + "step": 50705 + }, + { + "epoch": 8.272430668841762, + "grad_norm": 0.08304207026958466, + "learning_rate": 0.000729148919792267, + "loss": 0.0402, + "num_input_tokens_seen": 109436592, + "step": 50710 + }, + { + "epoch": 8.273246329526916, + "grad_norm": 0.21790191531181335, + "learning_rate": 0.000729085652938181, + "loss": 0.0715, + "num_input_tokens_seen": 109447184, + "step": 50715 + }, + { + "epoch": 8.274061990212072, + "grad_norm": 0.006662940140813589, + "learning_rate": 0.0007290223814413841, + "loss": 0.0403, + "num_input_tokens_seen": 109457616, + "step": 50720 + }, + { + "epoch": 8.274877650897226, + "grad_norm": 0.030080795288085938, + "learning_rate": 0.0007289591053031578, + "loss": 0.005, + "num_input_tokens_seen": 109466896, + "step": 50725 + }, + { + "epoch": 8.275693311582382, + "grad_norm": 0.01349678635597229, + "learning_rate": 0.000728895824524785, + "loss": 0.0118, + "num_input_tokens_seen": 109478800, + "step": 50730 + }, + { + "epoch": 8.276508972267537, + "grad_norm": 0.11009848117828369, + "learning_rate": 0.0007288325391075478, + "loss": 0.0619, + "num_input_tokens_seen": 109488272, + "step": 50735 + }, + { + "epoch": 8.277324632952691, + "grad_norm": 0.03973151370882988, + "learning_rate": 0.000728769249052729, + "loss": 0.014, + "num_input_tokens_seen": 109499248, + "step": 50740 + }, + { + "epoch": 8.278140293637847, + "grad_norm": 0.1538897603750229, + "learning_rate": 0.000728705954361611, + "loss": 0.1074, + "num_input_tokens_seen": 109509584, + "step": 50745 + }, + { + "epoch": 8.278955954323001, + "grad_norm": 0.009154248982667923, + "learning_rate": 0.0007286426550354768, + "loss": 0.1924, + "num_input_tokens_seen": 109520656, + "step": 50750 + }, + { + "epoch": 8.279771615008157, + "grad_norm": 0.009647359140217304, + "learning_rate": 0.000728579351075609, + "loss": 0.0258, + "num_input_tokens_seen": 109531600, + "step": 50755 + }, + { + "epoch": 8.280587275693312, + "grad_norm": 0.004562761168926954, + "learning_rate": 0.0007285160424832909, + "loss": 0.1146, + "num_input_tokens_seen": 109541936, + "step": 50760 + }, + { + "epoch": 8.281402936378466, + "grad_norm": 0.08278850466012955, + "learning_rate": 0.0007284527292598051, + "loss": 0.0207, + "num_input_tokens_seen": 109552336, + "step": 50765 + }, + { + "epoch": 8.282218597063622, + "grad_norm": 0.05430145934224129, + "learning_rate": 0.0007283894114064351, + "loss": 0.1574, + "num_input_tokens_seen": 109563376, + "step": 50770 + }, + { + "epoch": 8.283034257748776, + "grad_norm": 0.38736823201179504, + "learning_rate": 0.0007283260889244639, + "loss": 0.0474, + "num_input_tokens_seen": 109575216, + "step": 50775 + }, + { + "epoch": 8.283849918433932, + "grad_norm": 0.0542893223464489, + "learning_rate": 0.0007282627618151747, + "loss": 0.0325, + "num_input_tokens_seen": 109585904, + "step": 50780 + }, + { + "epoch": 8.284665579119087, + "grad_norm": 0.33248165249824524, + "learning_rate": 0.0007281994300798511, + "loss": 0.183, + "num_input_tokens_seen": 109597936, + "step": 50785 + }, + { + "epoch": 8.285481239804241, + "grad_norm": 0.05477141588926315, + "learning_rate": 0.0007281360937197767, + "loss": 0.0831, + "num_input_tokens_seen": 109609424, + "step": 50790 + }, + { + "epoch": 8.286296900489397, + "grad_norm": 0.018158728256821632, + "learning_rate": 0.0007280727527362349, + "loss": 0.0259, + "num_input_tokens_seen": 109621488, + "step": 50795 + }, + { + "epoch": 8.28711256117455, + "grad_norm": 0.006904778070747852, + "learning_rate": 0.0007280094071305095, + "loss": 0.0087, + "num_input_tokens_seen": 109631696, + "step": 50800 + }, + { + "epoch": 8.287928221859707, + "grad_norm": 0.1075032502412796, + "learning_rate": 0.0007279460569038841, + "loss": 0.1257, + "num_input_tokens_seen": 109642544, + "step": 50805 + }, + { + "epoch": 8.28874388254486, + "grad_norm": 0.06787388771772385, + "learning_rate": 0.0007278827020576427, + "loss": 0.0283, + "num_input_tokens_seen": 109652752, + "step": 50810 + }, + { + "epoch": 8.289559543230016, + "grad_norm": 0.2797449827194214, + "learning_rate": 0.0007278193425930692, + "loss": 0.1108, + "num_input_tokens_seen": 109664496, + "step": 50815 + }, + { + "epoch": 8.290375203915172, + "grad_norm": 0.027959732338786125, + "learning_rate": 0.0007277559785114478, + "loss": 0.0708, + "num_input_tokens_seen": 109676752, + "step": 50820 + }, + { + "epoch": 8.291190864600326, + "grad_norm": 0.02896394580602646, + "learning_rate": 0.0007276926098140626, + "loss": 0.0284, + "num_input_tokens_seen": 109687792, + "step": 50825 + }, + { + "epoch": 8.292006525285482, + "grad_norm": 0.011908689513802528, + "learning_rate": 0.0007276292365021979, + "loss": 0.0399, + "num_input_tokens_seen": 109698416, + "step": 50830 + }, + { + "epoch": 8.292822185970635, + "grad_norm": 0.008374841883778572, + "learning_rate": 0.0007275658585771378, + "loss": 0.1065, + "num_input_tokens_seen": 109709488, + "step": 50835 + }, + { + "epoch": 8.293637846655791, + "grad_norm": 0.2722913920879364, + "learning_rate": 0.0007275024760401668, + "loss": 0.0726, + "num_input_tokens_seen": 109720912, + "step": 50840 + }, + { + "epoch": 8.294453507340947, + "grad_norm": 0.007487046532332897, + "learning_rate": 0.0007274390888925697, + "loss": 0.0887, + "num_input_tokens_seen": 109731600, + "step": 50845 + }, + { + "epoch": 8.2952691680261, + "grad_norm": 0.01902494579553604, + "learning_rate": 0.0007273756971356308, + "loss": 0.1277, + "num_input_tokens_seen": 109742064, + "step": 50850 + }, + { + "epoch": 8.296084828711257, + "grad_norm": 0.05688869580626488, + "learning_rate": 0.000727312300770635, + "loss": 0.0649, + "num_input_tokens_seen": 109752848, + "step": 50855 + }, + { + "epoch": 8.29690048939641, + "grad_norm": 0.07428126782178879, + "learning_rate": 0.0007272488997988671, + "loss": 0.03, + "num_input_tokens_seen": 109763728, + "step": 50860 + }, + { + "epoch": 8.297716150081566, + "grad_norm": 0.020539697259664536, + "learning_rate": 0.000727185494221612, + "loss": 0.1739, + "num_input_tokens_seen": 109774864, + "step": 50865 + }, + { + "epoch": 8.298531810766722, + "grad_norm": 0.004688397515565157, + "learning_rate": 0.0007271220840401546, + "loss": 0.0839, + "num_input_tokens_seen": 109785296, + "step": 50870 + }, + { + "epoch": 8.299347471451876, + "grad_norm": 0.08649436384439468, + "learning_rate": 0.0007270586692557799, + "loss": 0.0743, + "num_input_tokens_seen": 109795856, + "step": 50875 + }, + { + "epoch": 8.300163132137031, + "grad_norm": 0.31723353266716003, + "learning_rate": 0.0007269952498697733, + "loss": 0.1236, + "num_input_tokens_seen": 109806480, + "step": 50880 + }, + { + "epoch": 8.300978792822185, + "grad_norm": 0.08268705755472183, + "learning_rate": 0.0007269318258834202, + "loss": 0.0121, + "num_input_tokens_seen": 109817680, + "step": 50885 + }, + { + "epoch": 8.301794453507341, + "grad_norm": 0.033189307898283005, + "learning_rate": 0.0007268683972980056, + "loss": 0.0169, + "num_input_tokens_seen": 109829616, + "step": 50890 + }, + { + "epoch": 8.302610114192497, + "grad_norm": 0.039219487458467484, + "learning_rate": 0.0007268049641148152, + "loss": 0.0633, + "num_input_tokens_seen": 109839632, + "step": 50895 + }, + { + "epoch": 8.30342577487765, + "grad_norm": 0.04654090851545334, + "learning_rate": 0.0007267415263351343, + "loss": 0.0224, + "num_input_tokens_seen": 109851344, + "step": 50900 + }, + { + "epoch": 8.304241435562806, + "grad_norm": 0.01601216197013855, + "learning_rate": 0.0007266780839602488, + "loss": 0.0328, + "num_input_tokens_seen": 109861072, + "step": 50905 + }, + { + "epoch": 8.30505709624796, + "grad_norm": 0.0998639166355133, + "learning_rate": 0.0007266146369914445, + "loss": 0.0547, + "num_input_tokens_seen": 109872144, + "step": 50910 + }, + { + "epoch": 8.305872756933116, + "grad_norm": 0.1830855756998062, + "learning_rate": 0.0007265511854300069, + "loss": 0.1511, + "num_input_tokens_seen": 109883152, + "step": 50915 + }, + { + "epoch": 8.30668841761827, + "grad_norm": 0.0042611039243638515, + "learning_rate": 0.0007264877292772223, + "loss": 0.0274, + "num_input_tokens_seen": 109894672, + "step": 50920 + }, + { + "epoch": 8.307504078303426, + "grad_norm": 0.042120836675167084, + "learning_rate": 0.0007264242685343765, + "loss": 0.0957, + "num_input_tokens_seen": 109905456, + "step": 50925 + }, + { + "epoch": 8.308319738988581, + "grad_norm": 0.1844879388809204, + "learning_rate": 0.0007263608032027557, + "loss": 0.2137, + "num_input_tokens_seen": 109915024, + "step": 50930 + }, + { + "epoch": 8.309135399673735, + "grad_norm": 0.055228784680366516, + "learning_rate": 0.000726297333283646, + "loss": 0.1009, + "num_input_tokens_seen": 109923728, + "step": 50935 + }, + { + "epoch": 8.309951060358891, + "grad_norm": 0.024709882214665413, + "learning_rate": 0.0007262338587783338, + "loss": 0.0647, + "num_input_tokens_seen": 109934128, + "step": 50940 + }, + { + "epoch": 8.310766721044045, + "grad_norm": 0.004321193788200617, + "learning_rate": 0.0007261703796881054, + "loss": 0.0169, + "num_input_tokens_seen": 109945072, + "step": 50945 + }, + { + "epoch": 8.3115823817292, + "grad_norm": 0.02428867295384407, + "learning_rate": 0.0007261068960142474, + "loss": 0.0447, + "num_input_tokens_seen": 109955184, + "step": 50950 + }, + { + "epoch": 8.312398042414356, + "grad_norm": 0.02695038914680481, + "learning_rate": 0.0007260434077580463, + "loss": 0.0609, + "num_input_tokens_seen": 109965232, + "step": 50955 + }, + { + "epoch": 8.31321370309951, + "grad_norm": 0.005688962526619434, + "learning_rate": 0.0007259799149207887, + "loss": 0.0105, + "num_input_tokens_seen": 109976944, + "step": 50960 + }, + { + "epoch": 8.314029363784666, + "grad_norm": 0.0051245237700641155, + "learning_rate": 0.0007259164175037616, + "loss": 0.0841, + "num_input_tokens_seen": 109987504, + "step": 50965 + }, + { + "epoch": 8.31484502446982, + "grad_norm": 0.012548436410725117, + "learning_rate": 0.0007258529155082516, + "loss": 0.0554, + "num_input_tokens_seen": 109998160, + "step": 50970 + }, + { + "epoch": 8.315660685154976, + "grad_norm": 0.041546598076820374, + "learning_rate": 0.0007257894089355458, + "loss": 0.0235, + "num_input_tokens_seen": 110008464, + "step": 50975 + }, + { + "epoch": 8.31647634584013, + "grad_norm": 0.1574370115995407, + "learning_rate": 0.0007257258977869313, + "loss": 0.1479, + "num_input_tokens_seen": 110020080, + "step": 50980 + }, + { + "epoch": 8.317292006525285, + "grad_norm": 0.004845879971981049, + "learning_rate": 0.000725662382063695, + "loss": 0.0207, + "num_input_tokens_seen": 110029104, + "step": 50985 + }, + { + "epoch": 8.318107667210441, + "grad_norm": 0.00915614329278469, + "learning_rate": 0.0007255988617671241, + "loss": 0.016, + "num_input_tokens_seen": 110040528, + "step": 50990 + }, + { + "epoch": 8.318923327895595, + "grad_norm": 0.005439273081719875, + "learning_rate": 0.0007255353368985063, + "loss": 0.0849, + "num_input_tokens_seen": 110049744, + "step": 50995 + }, + { + "epoch": 8.31973898858075, + "grad_norm": 0.03500760346651077, + "learning_rate": 0.0007254718074591285, + "loss": 0.0094, + "num_input_tokens_seen": 110061680, + "step": 51000 + }, + { + "epoch": 8.320554649265905, + "grad_norm": 0.020430684089660645, + "learning_rate": 0.0007254082734502788, + "loss": 0.1018, + "num_input_tokens_seen": 110072880, + "step": 51005 + }, + { + "epoch": 8.32137030995106, + "grad_norm": 0.005908131133764982, + "learning_rate": 0.0007253447348732443, + "loss": 0.0496, + "num_input_tokens_seen": 110084464, + "step": 51010 + }, + { + "epoch": 8.322185970636216, + "grad_norm": 0.03224621340632439, + "learning_rate": 0.000725281191729313, + "loss": 0.0968, + "num_input_tokens_seen": 110095536, + "step": 51015 + }, + { + "epoch": 8.32300163132137, + "grad_norm": 0.022098278626799583, + "learning_rate": 0.0007252176440197726, + "loss": 0.0837, + "num_input_tokens_seen": 110104496, + "step": 51020 + }, + { + "epoch": 8.323817292006526, + "grad_norm": 0.005999719724059105, + "learning_rate": 0.0007251540917459109, + "loss": 0.0548, + "num_input_tokens_seen": 110115888, + "step": 51025 + }, + { + "epoch": 8.32463295269168, + "grad_norm": 0.016046350821852684, + "learning_rate": 0.0007250905349090158, + "loss": 0.044, + "num_input_tokens_seen": 110125104, + "step": 51030 + }, + { + "epoch": 8.325448613376835, + "grad_norm": 0.018497195094823837, + "learning_rate": 0.0007250269735103754, + "loss": 0.023, + "num_input_tokens_seen": 110135728, + "step": 51035 + }, + { + "epoch": 8.326264274061991, + "grad_norm": 0.05013975873589516, + "learning_rate": 0.0007249634075512781, + "loss": 0.1003, + "num_input_tokens_seen": 110146832, + "step": 51040 + }, + { + "epoch": 8.327079934747145, + "grad_norm": 0.015607539564371109, + "learning_rate": 0.0007248998370330119, + "loss": 0.0469, + "num_input_tokens_seen": 110157520, + "step": 51045 + }, + { + "epoch": 8.3278955954323, + "grad_norm": 0.0025452955160290003, + "learning_rate": 0.0007248362619568651, + "loss": 0.0163, + "num_input_tokens_seen": 110168688, + "step": 51050 + }, + { + "epoch": 8.328711256117455, + "grad_norm": 0.010289053432643414, + "learning_rate": 0.0007247726823241264, + "loss": 0.0623, + "num_input_tokens_seen": 110179056, + "step": 51055 + }, + { + "epoch": 8.32952691680261, + "grad_norm": 0.016661109402775764, + "learning_rate": 0.0007247090981360841, + "loss": 0.0936, + "num_input_tokens_seen": 110190160, + "step": 51060 + }, + { + "epoch": 8.330342577487766, + "grad_norm": 0.018751347437500954, + "learning_rate": 0.0007246455093940268, + "loss": 0.0429, + "num_input_tokens_seen": 110201424, + "step": 51065 + }, + { + "epoch": 8.33115823817292, + "grad_norm": 0.0572807639837265, + "learning_rate": 0.0007245819160992434, + "loss": 0.0396, + "num_input_tokens_seen": 110212304, + "step": 51070 + }, + { + "epoch": 8.331973898858076, + "grad_norm": 0.21826277673244476, + "learning_rate": 0.0007245183182530224, + "loss": 0.0553, + "num_input_tokens_seen": 110223216, + "step": 51075 + }, + { + "epoch": 8.33278955954323, + "grad_norm": 0.133011132478714, + "learning_rate": 0.0007244547158566531, + "loss": 0.1981, + "num_input_tokens_seen": 110234160, + "step": 51080 + }, + { + "epoch": 8.333605220228385, + "grad_norm": 0.009748709388077259, + "learning_rate": 0.0007243911089114239, + "loss": 0.0413, + "num_input_tokens_seen": 110245264, + "step": 51085 + }, + { + "epoch": 8.33442088091354, + "grad_norm": 0.013448234647512436, + "learning_rate": 0.0007243274974186245, + "loss": 0.0467, + "num_input_tokens_seen": 110256560, + "step": 51090 + }, + { + "epoch": 8.335236541598695, + "grad_norm": 0.2558216452598572, + "learning_rate": 0.0007242638813795437, + "loss": 0.2283, + "num_input_tokens_seen": 110267920, + "step": 51095 + }, + { + "epoch": 8.33605220228385, + "grad_norm": 0.006753480061888695, + "learning_rate": 0.0007242002607954708, + "loss": 0.1366, + "num_input_tokens_seen": 110278032, + "step": 51100 + }, + { + "epoch": 8.336867862969005, + "grad_norm": 0.026097765192389488, + "learning_rate": 0.000724136635667695, + "loss": 0.0169, + "num_input_tokens_seen": 110289232, + "step": 51105 + }, + { + "epoch": 8.33768352365416, + "grad_norm": 0.2825978100299835, + "learning_rate": 0.0007240730059975063, + "loss": 0.1176, + "num_input_tokens_seen": 110299664, + "step": 51110 + }, + { + "epoch": 8.338499184339314, + "grad_norm": 0.014273480512201786, + "learning_rate": 0.0007240093717861937, + "loss": 0.0651, + "num_input_tokens_seen": 110310480, + "step": 51115 + }, + { + "epoch": 8.33931484502447, + "grad_norm": 0.04844564199447632, + "learning_rate": 0.000723945733035047, + "loss": 0.0113, + "num_input_tokens_seen": 110322640, + "step": 51120 + }, + { + "epoch": 8.340130505709626, + "grad_norm": 0.11391949653625488, + "learning_rate": 0.0007238820897453559, + "loss": 0.0205, + "num_input_tokens_seen": 110334992, + "step": 51125 + }, + { + "epoch": 8.34094616639478, + "grad_norm": 0.02098294347524643, + "learning_rate": 0.0007238184419184104, + "loss": 0.012, + "num_input_tokens_seen": 110345616, + "step": 51130 + }, + { + "epoch": 8.341761827079935, + "grad_norm": 0.05506865307688713, + "learning_rate": 0.0007237547895555001, + "loss": 0.0218, + "num_input_tokens_seen": 110356752, + "step": 51135 + }, + { + "epoch": 8.34257748776509, + "grad_norm": 0.02489614672958851, + "learning_rate": 0.0007236911326579152, + "loss": 0.0439, + "num_input_tokens_seen": 110367568, + "step": 51140 + }, + { + "epoch": 8.343393148450245, + "grad_norm": 0.018398351967334747, + "learning_rate": 0.0007236274712269457, + "loss": 0.0183, + "num_input_tokens_seen": 110378576, + "step": 51145 + }, + { + "epoch": 8.3442088091354, + "grad_norm": 0.22536776959896088, + "learning_rate": 0.0007235638052638819, + "loss": 0.1026, + "num_input_tokens_seen": 110389072, + "step": 51150 + }, + { + "epoch": 8.345024469820554, + "grad_norm": 0.1956091821193695, + "learning_rate": 0.0007235001347700139, + "loss": 0.2184, + "num_input_tokens_seen": 110400464, + "step": 51155 + }, + { + "epoch": 8.34584013050571, + "grad_norm": 0.008925708942115307, + "learning_rate": 0.0007234364597466321, + "loss": 0.0359, + "num_input_tokens_seen": 110411728, + "step": 51160 + }, + { + "epoch": 8.346655791190864, + "grad_norm": 0.004584786482155323, + "learning_rate": 0.000723372780195027, + "loss": 0.0137, + "num_input_tokens_seen": 110422480, + "step": 51165 + }, + { + "epoch": 8.34747145187602, + "grad_norm": 0.04825051873922348, + "learning_rate": 0.0007233090961164892, + "loss": 0.0377, + "num_input_tokens_seen": 110433680, + "step": 51170 + }, + { + "epoch": 8.348287112561174, + "grad_norm": 0.026193510740995407, + "learning_rate": 0.000723245407512309, + "loss": 0.0314, + "num_input_tokens_seen": 110442512, + "step": 51175 + }, + { + "epoch": 8.34910277324633, + "grad_norm": 0.04807530716061592, + "learning_rate": 0.0007231817143837778, + "loss": 0.0333, + "num_input_tokens_seen": 110454032, + "step": 51180 + }, + { + "epoch": 8.349918433931485, + "grad_norm": 0.17871999740600586, + "learning_rate": 0.0007231180167321858, + "loss": 0.0333, + "num_input_tokens_seen": 110464432, + "step": 51185 + }, + { + "epoch": 8.350734094616639, + "grad_norm": 0.007019665092229843, + "learning_rate": 0.0007230543145588242, + "loss": 0.1611, + "num_input_tokens_seen": 110476176, + "step": 51190 + }, + { + "epoch": 8.351549755301795, + "grad_norm": 0.14817704260349274, + "learning_rate": 0.000722990607864984, + "loss": 0.1364, + "num_input_tokens_seen": 110486800, + "step": 51195 + }, + { + "epoch": 8.352365415986949, + "grad_norm": 0.2666585147380829, + "learning_rate": 0.0007229268966519562, + "loss": 0.1673, + "num_input_tokens_seen": 110498448, + "step": 51200 + }, + { + "epoch": 8.353181076672104, + "grad_norm": 0.03059711866080761, + "learning_rate": 0.0007228631809210321, + "loss": 0.0083, + "num_input_tokens_seen": 110510064, + "step": 51205 + }, + { + "epoch": 8.35399673735726, + "grad_norm": 0.09947142750024796, + "learning_rate": 0.0007227994606735029, + "loss": 0.036, + "num_input_tokens_seen": 110522352, + "step": 51210 + }, + { + "epoch": 8.354812398042414, + "grad_norm": 0.017664039507508278, + "learning_rate": 0.0007227357359106598, + "loss": 0.011, + "num_input_tokens_seen": 110534640, + "step": 51215 + }, + { + "epoch": 8.35562805872757, + "grad_norm": 0.010136286728084087, + "learning_rate": 0.0007226720066337946, + "loss": 0.0557, + "num_input_tokens_seen": 110546608, + "step": 51220 + }, + { + "epoch": 8.356443719412724, + "grad_norm": 0.141001358628273, + "learning_rate": 0.0007226082728441989, + "loss": 0.0545, + "num_input_tokens_seen": 110556656, + "step": 51225 + }, + { + "epoch": 8.35725938009788, + "grad_norm": 0.08009041845798492, + "learning_rate": 0.0007225445345431638, + "loss": 0.0988, + "num_input_tokens_seen": 110567184, + "step": 51230 + }, + { + "epoch": 8.358075040783035, + "grad_norm": 0.2358008623123169, + "learning_rate": 0.0007224807917319817, + "loss": 0.0325, + "num_input_tokens_seen": 110576848, + "step": 51235 + }, + { + "epoch": 8.358890701468189, + "grad_norm": 0.023778993636369705, + "learning_rate": 0.000722417044411944, + "loss": 0.0328, + "num_input_tokens_seen": 110587920, + "step": 51240 + }, + { + "epoch": 8.359706362153345, + "grad_norm": 0.12822575867176056, + "learning_rate": 0.0007223532925843427, + "loss": 0.061, + "num_input_tokens_seen": 110599184, + "step": 51245 + }, + { + "epoch": 8.360522022838499, + "grad_norm": 0.16785474121570587, + "learning_rate": 0.0007222895362504698, + "loss": 0.0422, + "num_input_tokens_seen": 110610768, + "step": 51250 + }, + { + "epoch": 8.361337683523654, + "grad_norm": 0.010825047269463539, + "learning_rate": 0.0007222257754116176, + "loss": 0.0937, + "num_input_tokens_seen": 110621136, + "step": 51255 + }, + { + "epoch": 8.362153344208808, + "grad_norm": 0.010380970314145088, + "learning_rate": 0.000722162010069078, + "loss": 0.0567, + "num_input_tokens_seen": 110631088, + "step": 51260 + }, + { + "epoch": 8.362969004893964, + "grad_norm": 0.018530340865254402, + "learning_rate": 0.0007220982402241436, + "loss": 0.0171, + "num_input_tokens_seen": 110643184, + "step": 51265 + }, + { + "epoch": 8.36378466557912, + "grad_norm": 0.23015162348747253, + "learning_rate": 0.0007220344658781065, + "loss": 0.0877, + "num_input_tokens_seen": 110653712, + "step": 51270 + }, + { + "epoch": 8.364600326264274, + "grad_norm": 0.17841795086860657, + "learning_rate": 0.0007219706870322594, + "loss": 0.1355, + "num_input_tokens_seen": 110663600, + "step": 51275 + }, + { + "epoch": 8.36541598694943, + "grad_norm": 0.1662629097700119, + "learning_rate": 0.0007219069036878945, + "loss": 0.0541, + "num_input_tokens_seen": 110674192, + "step": 51280 + }, + { + "epoch": 8.366231647634583, + "grad_norm": 0.010045567527413368, + "learning_rate": 0.0007218431158463048, + "loss": 0.0899, + "num_input_tokens_seen": 110684880, + "step": 51285 + }, + { + "epoch": 8.367047308319739, + "grad_norm": 0.004146745894104242, + "learning_rate": 0.000721779323508783, + "loss": 0.046, + "num_input_tokens_seen": 110695120, + "step": 51290 + }, + { + "epoch": 8.367862969004895, + "grad_norm": 0.011898959055542946, + "learning_rate": 0.0007217155266766217, + "loss": 0.013, + "num_input_tokens_seen": 110704912, + "step": 51295 + }, + { + "epoch": 8.368678629690049, + "grad_norm": 0.22022446990013123, + "learning_rate": 0.0007216517253511143, + "loss": 0.0311, + "num_input_tokens_seen": 110715952, + "step": 51300 + }, + { + "epoch": 8.369494290375204, + "grad_norm": 0.020168880000710487, + "learning_rate": 0.0007215879195335531, + "loss": 0.0345, + "num_input_tokens_seen": 110727312, + "step": 51305 + }, + { + "epoch": 8.370309951060358, + "grad_norm": 0.0007196568185463548, + "learning_rate": 0.0007215241092252319, + "loss": 0.018, + "num_input_tokens_seen": 110738448, + "step": 51310 + }, + { + "epoch": 8.371125611745514, + "grad_norm": 0.26062363386154175, + "learning_rate": 0.0007214602944274435, + "loss": 0.1151, + "num_input_tokens_seen": 110750960, + "step": 51315 + }, + { + "epoch": 8.37194127243067, + "grad_norm": 0.0037940163165330887, + "learning_rate": 0.0007213964751414812, + "loss": 0.0035, + "num_input_tokens_seen": 110761584, + "step": 51320 + }, + { + "epoch": 8.372756933115824, + "grad_norm": 0.0035547183360904455, + "learning_rate": 0.0007213326513686386, + "loss": 0.0115, + "num_input_tokens_seen": 110771792, + "step": 51325 + }, + { + "epoch": 8.37357259380098, + "grad_norm": 0.0763133242726326, + "learning_rate": 0.0007212688231102091, + "loss": 0.0244, + "num_input_tokens_seen": 110783472, + "step": 51330 + }, + { + "epoch": 8.374388254486133, + "grad_norm": 0.3203875720500946, + "learning_rate": 0.000721204990367486, + "loss": 0.0652, + "num_input_tokens_seen": 110793584, + "step": 51335 + }, + { + "epoch": 8.375203915171289, + "grad_norm": 0.014163392595946789, + "learning_rate": 0.0007211411531417633, + "loss": 0.0201, + "num_input_tokens_seen": 110803440, + "step": 51340 + }, + { + "epoch": 8.376019575856443, + "grad_norm": 0.0051974887028336525, + "learning_rate": 0.0007210773114343345, + "loss": 0.0501, + "num_input_tokens_seen": 110813808, + "step": 51345 + }, + { + "epoch": 8.376835236541599, + "grad_norm": 0.02585349790751934, + "learning_rate": 0.0007210134652464935, + "loss": 0.0431, + "num_input_tokens_seen": 110824016, + "step": 51350 + }, + { + "epoch": 8.377650897226754, + "grad_norm": 0.03663434088230133, + "learning_rate": 0.0007209496145795343, + "loss": 0.0579, + "num_input_tokens_seen": 110834672, + "step": 51355 + }, + { + "epoch": 8.378466557911908, + "grad_norm": 0.014262940734624863, + "learning_rate": 0.000720885759434751, + "loss": 0.0054, + "num_input_tokens_seen": 110845456, + "step": 51360 + }, + { + "epoch": 8.379282218597064, + "grad_norm": 0.0309141892939806, + "learning_rate": 0.0007208218998134375, + "loss": 0.1833, + "num_input_tokens_seen": 110856080, + "step": 51365 + }, + { + "epoch": 8.380097879282218, + "grad_norm": 0.026822904124855995, + "learning_rate": 0.000720758035716888, + "loss": 0.0207, + "num_input_tokens_seen": 110867056, + "step": 51370 + }, + { + "epoch": 8.380913539967374, + "grad_norm": 0.19866830110549927, + "learning_rate": 0.0007206941671463969, + "loss": 0.0758, + "num_input_tokens_seen": 110878352, + "step": 51375 + }, + { + "epoch": 8.38172920065253, + "grad_norm": 0.11611539125442505, + "learning_rate": 0.0007206302941032586, + "loss": 0.0358, + "num_input_tokens_seen": 110889584, + "step": 51380 + }, + { + "epoch": 8.382544861337683, + "grad_norm": 0.10820174217224121, + "learning_rate": 0.0007205664165887673, + "loss": 0.0827, + "num_input_tokens_seen": 110900464, + "step": 51385 + }, + { + "epoch": 8.383360522022839, + "grad_norm": 0.17516988515853882, + "learning_rate": 0.000720502534604218, + "loss": 0.0891, + "num_input_tokens_seen": 110911568, + "step": 51390 + }, + { + "epoch": 8.384176182707993, + "grad_norm": 0.10223973542451859, + "learning_rate": 0.0007204386481509049, + "loss": 0.0411, + "num_input_tokens_seen": 110922640, + "step": 51395 + }, + { + "epoch": 8.384991843393149, + "grad_norm": 0.021960755810141563, + "learning_rate": 0.0007203747572301231, + "loss": 0.0859, + "num_input_tokens_seen": 110933872, + "step": 51400 + }, + { + "epoch": 8.385807504078304, + "grad_norm": 0.025504587218165398, + "learning_rate": 0.0007203108618431672, + "loss": 0.0504, + "num_input_tokens_seen": 110945648, + "step": 51405 + }, + { + "epoch": 8.386623164763458, + "grad_norm": 0.011567308567464352, + "learning_rate": 0.0007202469619913322, + "loss": 0.0243, + "num_input_tokens_seen": 110956528, + "step": 51410 + }, + { + "epoch": 8.387438825448614, + "grad_norm": 0.20543573796749115, + "learning_rate": 0.0007201830576759132, + "loss": 0.0845, + "num_input_tokens_seen": 110966224, + "step": 51415 + }, + { + "epoch": 8.388254486133768, + "grad_norm": 0.011920414865016937, + "learning_rate": 0.0007201191488982051, + "loss": 0.0483, + "num_input_tokens_seen": 110976912, + "step": 51420 + }, + { + "epoch": 8.389070146818923, + "grad_norm": 0.0048531657084822655, + "learning_rate": 0.0007200552356595031, + "loss": 0.0082, + "num_input_tokens_seen": 110988080, + "step": 51425 + }, + { + "epoch": 8.38988580750408, + "grad_norm": 0.035106051713228226, + "learning_rate": 0.0007199913179611029, + "loss": 0.0302, + "num_input_tokens_seen": 110999472, + "step": 51430 + }, + { + "epoch": 8.390701468189233, + "grad_norm": 0.04304172843694687, + "learning_rate": 0.0007199273958042994, + "loss": 0.0272, + "num_input_tokens_seen": 111011184, + "step": 51435 + }, + { + "epoch": 8.391517128874389, + "grad_norm": 0.03024912439286709, + "learning_rate": 0.0007198634691903882, + "loss": 0.0066, + "num_input_tokens_seen": 111022480, + "step": 51440 + }, + { + "epoch": 8.392332789559543, + "grad_norm": 0.031165925785899162, + "learning_rate": 0.0007197995381206649, + "loss": 0.0751, + "num_input_tokens_seen": 111032912, + "step": 51445 + }, + { + "epoch": 8.393148450244698, + "grad_norm": 0.2963460087776184, + "learning_rate": 0.0007197356025964252, + "loss": 0.0375, + "num_input_tokens_seen": 111044272, + "step": 51450 + }, + { + "epoch": 8.393964110929852, + "grad_norm": 0.07541589438915253, + "learning_rate": 0.0007196716626189646, + "loss": 0.0215, + "num_input_tokens_seen": 111054704, + "step": 51455 + }, + { + "epoch": 8.394779771615008, + "grad_norm": 0.019047759473323822, + "learning_rate": 0.0007196077181895792, + "loss": 0.0685, + "num_input_tokens_seen": 111065040, + "step": 51460 + }, + { + "epoch": 8.395595432300164, + "grad_norm": 0.11137405037879944, + "learning_rate": 0.0007195437693095647, + "loss": 0.023, + "num_input_tokens_seen": 111077200, + "step": 51465 + }, + { + "epoch": 8.396411092985318, + "grad_norm": 0.11799478530883789, + "learning_rate": 0.0007194798159802174, + "loss": 0.0795, + "num_input_tokens_seen": 111088016, + "step": 51470 + }, + { + "epoch": 8.397226753670473, + "grad_norm": 0.07974501699209213, + "learning_rate": 0.0007194158582028332, + "loss": 0.0199, + "num_input_tokens_seen": 111098320, + "step": 51475 + }, + { + "epoch": 8.398042414355627, + "grad_norm": 0.0079717468470335, + "learning_rate": 0.0007193518959787081, + "loss": 0.0468, + "num_input_tokens_seen": 111108528, + "step": 51480 + }, + { + "epoch": 8.398858075040783, + "grad_norm": 0.04966919869184494, + "learning_rate": 0.0007192879293091386, + "loss": 0.0615, + "num_input_tokens_seen": 111120080, + "step": 51485 + }, + { + "epoch": 8.399673735725939, + "grad_norm": 0.0031234254129230976, + "learning_rate": 0.000719223958195421, + "loss": 0.0322, + "num_input_tokens_seen": 111130512, + "step": 51490 + }, + { + "epoch": 8.400489396411093, + "grad_norm": 0.007806080859154463, + "learning_rate": 0.0007191599826388518, + "loss": 0.2016, + "num_input_tokens_seen": 111141520, + "step": 51495 + }, + { + "epoch": 8.401305057096248, + "grad_norm": 0.004260449204593897, + "learning_rate": 0.0007190960026407276, + "loss": 0.0341, + "num_input_tokens_seen": 111152112, + "step": 51500 + }, + { + "epoch": 8.402120717781402, + "grad_norm": 0.04556277394294739, + "learning_rate": 0.0007190320182023449, + "loss": 0.0478, + "num_input_tokens_seen": 111162896, + "step": 51505 + }, + { + "epoch": 8.402936378466558, + "grad_norm": 0.014514339156448841, + "learning_rate": 0.0007189680293250005, + "loss": 0.179, + "num_input_tokens_seen": 111173616, + "step": 51510 + }, + { + "epoch": 8.403752039151712, + "grad_norm": 0.13980957865715027, + "learning_rate": 0.0007189040360099913, + "loss": 0.0219, + "num_input_tokens_seen": 111183856, + "step": 51515 + }, + { + "epoch": 8.404567699836868, + "grad_norm": 0.07007280737161636, + "learning_rate": 0.000718840038258614, + "loss": 0.1197, + "num_input_tokens_seen": 111193936, + "step": 51520 + }, + { + "epoch": 8.405383360522023, + "grad_norm": 0.005643871147185564, + "learning_rate": 0.0007187760360721658, + "loss": 0.0269, + "num_input_tokens_seen": 111204912, + "step": 51525 + }, + { + "epoch": 8.406199021207177, + "grad_norm": 0.019541198387742043, + "learning_rate": 0.0007187120294519434, + "loss": 0.0216, + "num_input_tokens_seen": 111215408, + "step": 51530 + }, + { + "epoch": 8.407014681892333, + "grad_norm": 0.005972282961010933, + "learning_rate": 0.0007186480183992446, + "loss": 0.0211, + "num_input_tokens_seen": 111227056, + "step": 51535 + }, + { + "epoch": 8.407830342577487, + "grad_norm": 0.1496337503194809, + "learning_rate": 0.0007185840029153663, + "loss": 0.1539, + "num_input_tokens_seen": 111238512, + "step": 51540 + }, + { + "epoch": 8.408646003262643, + "grad_norm": 0.012793335132300854, + "learning_rate": 0.0007185199830016058, + "loss": 0.0397, + "num_input_tokens_seen": 111249392, + "step": 51545 + }, + { + "epoch": 8.409461663947798, + "grad_norm": 0.26890987157821655, + "learning_rate": 0.0007184559586592606, + "loss": 0.1402, + "num_input_tokens_seen": 111259600, + "step": 51550 + }, + { + "epoch": 8.410277324632952, + "grad_norm": 0.006930783856660128, + "learning_rate": 0.0007183919298896283, + "loss": 0.0973, + "num_input_tokens_seen": 111270032, + "step": 51555 + }, + { + "epoch": 8.411092985318108, + "grad_norm": 0.006691479589790106, + "learning_rate": 0.0007183278966940065, + "loss": 0.1157, + "num_input_tokens_seen": 111279504, + "step": 51560 + }, + { + "epoch": 8.411908646003262, + "grad_norm": 0.2901208996772766, + "learning_rate": 0.000718263859073693, + "loss": 0.1027, + "num_input_tokens_seen": 111291504, + "step": 51565 + }, + { + "epoch": 8.412724306688418, + "grad_norm": 0.06120949983596802, + "learning_rate": 0.0007181998170299854, + "loss": 0.1592, + "num_input_tokens_seen": 111303152, + "step": 51570 + }, + { + "epoch": 8.413539967373573, + "grad_norm": 0.130525603890419, + "learning_rate": 0.0007181357705641818, + "loss": 0.0394, + "num_input_tokens_seen": 111313232, + "step": 51575 + }, + { + "epoch": 8.414355628058727, + "grad_norm": 0.0014545980375260115, + "learning_rate": 0.0007180717196775799, + "loss": 0.0095, + "num_input_tokens_seen": 111324240, + "step": 51580 + }, + { + "epoch": 8.415171288743883, + "grad_norm": 0.007819964550435543, + "learning_rate": 0.0007180076643714781, + "loss": 0.1298, + "num_input_tokens_seen": 111335056, + "step": 51585 + }, + { + "epoch": 8.415986949429037, + "grad_norm": 0.3152365982532501, + "learning_rate": 0.0007179436046471743, + "loss": 0.0814, + "num_input_tokens_seen": 111345200, + "step": 51590 + }, + { + "epoch": 8.416802610114193, + "grad_norm": 0.18859584629535675, + "learning_rate": 0.0007178795405059671, + "loss": 0.0579, + "num_input_tokens_seen": 111354640, + "step": 51595 + }, + { + "epoch": 8.417618270799348, + "grad_norm": 0.040938299149274826, + "learning_rate": 0.0007178154719491545, + "loss": 0.0543, + "num_input_tokens_seen": 111365008, + "step": 51600 + }, + { + "epoch": 8.418433931484502, + "grad_norm": 0.009354202076792717, + "learning_rate": 0.0007177513989780349, + "loss": 0.1675, + "num_input_tokens_seen": 111376080, + "step": 51605 + }, + { + "epoch": 8.419249592169658, + "grad_norm": 0.05440622940659523, + "learning_rate": 0.0007176873215939072, + "loss": 0.0721, + "num_input_tokens_seen": 111387376, + "step": 51610 + }, + { + "epoch": 8.420065252854812, + "grad_norm": 0.04928793013095856, + "learning_rate": 0.0007176232397980696, + "loss": 0.026, + "num_input_tokens_seen": 111398224, + "step": 51615 + }, + { + "epoch": 8.420880913539968, + "grad_norm": 0.2831956744194031, + "learning_rate": 0.000717559153591821, + "loss": 0.1609, + "num_input_tokens_seen": 111408624, + "step": 51620 + }, + { + "epoch": 8.421696574225122, + "grad_norm": 0.01685427874326706, + "learning_rate": 0.0007174950629764602, + "loss": 0.0104, + "num_input_tokens_seen": 111419664, + "step": 51625 + }, + { + "epoch": 8.422512234910277, + "grad_norm": 0.25670167803764343, + "learning_rate": 0.0007174309679532859, + "loss": 0.0648, + "num_input_tokens_seen": 111430480, + "step": 51630 + }, + { + "epoch": 8.423327895595433, + "grad_norm": 0.0039424533024430275, + "learning_rate": 0.0007173668685235973, + "loss": 0.1131, + "num_input_tokens_seen": 111439984, + "step": 51635 + }, + { + "epoch": 8.424143556280587, + "grad_norm": 0.1545783132314682, + "learning_rate": 0.0007173027646886934, + "loss": 0.0553, + "num_input_tokens_seen": 111451472, + "step": 51640 + }, + { + "epoch": 8.424959216965743, + "grad_norm": 0.018799273297190666, + "learning_rate": 0.0007172386564498733, + "loss": 0.0741, + "num_input_tokens_seen": 111462544, + "step": 51645 + }, + { + "epoch": 8.425774877650896, + "grad_norm": 0.025868525728583336, + "learning_rate": 0.0007171745438084362, + "loss": 0.0125, + "num_input_tokens_seen": 111474800, + "step": 51650 + }, + { + "epoch": 8.426590538336052, + "grad_norm": 0.19350582361221313, + "learning_rate": 0.0007171104267656814, + "loss": 0.0341, + "num_input_tokens_seen": 111485776, + "step": 51655 + }, + { + "epoch": 8.427406199021208, + "grad_norm": 0.001786147360689938, + "learning_rate": 0.0007170463053229085, + "loss": 0.1778, + "num_input_tokens_seen": 111497616, + "step": 51660 + }, + { + "epoch": 8.428221859706362, + "grad_norm": 0.19943515956401825, + "learning_rate": 0.0007169821794814168, + "loss": 0.1605, + "num_input_tokens_seen": 111508848, + "step": 51665 + }, + { + "epoch": 8.429037520391518, + "grad_norm": 0.002455131383612752, + "learning_rate": 0.000716918049242506, + "loss": 0.121, + "num_input_tokens_seen": 111520816, + "step": 51670 + }, + { + "epoch": 8.429853181076671, + "grad_norm": 0.33039167523384094, + "learning_rate": 0.0007168539146074757, + "loss": 0.1669, + "num_input_tokens_seen": 111531984, + "step": 51675 + }, + { + "epoch": 8.430668841761827, + "grad_norm": 0.3032156825065613, + "learning_rate": 0.0007167897755776258, + "loss": 0.0957, + "num_input_tokens_seen": 111542800, + "step": 51680 + }, + { + "epoch": 8.431484502446983, + "grad_norm": 0.0743522197008133, + "learning_rate": 0.0007167256321542561, + "loss": 0.0317, + "num_input_tokens_seen": 111553040, + "step": 51685 + }, + { + "epoch": 8.432300163132137, + "grad_norm": 0.13208863139152527, + "learning_rate": 0.0007166614843386666, + "loss": 0.0225, + "num_input_tokens_seen": 111563120, + "step": 51690 + }, + { + "epoch": 8.433115823817293, + "grad_norm": 0.011141360737383366, + "learning_rate": 0.0007165973321321571, + "loss": 0.0366, + "num_input_tokens_seen": 111573776, + "step": 51695 + }, + { + "epoch": 8.433931484502446, + "grad_norm": 0.002313321689143777, + "learning_rate": 0.0007165331755360281, + "loss": 0.1222, + "num_input_tokens_seen": 111584080, + "step": 51700 + }, + { + "epoch": 8.434747145187602, + "grad_norm": 0.31276389956474304, + "learning_rate": 0.0007164690145515793, + "loss": 0.0844, + "num_input_tokens_seen": 111594480, + "step": 51705 + }, + { + "epoch": 8.435562805872756, + "grad_norm": 0.2628081440925598, + "learning_rate": 0.0007164048491801116, + "loss": 0.1382, + "num_input_tokens_seen": 111604656, + "step": 51710 + }, + { + "epoch": 8.436378466557912, + "grad_norm": 0.022221244871616364, + "learning_rate": 0.0007163406794229249, + "loss": 0.0313, + "num_input_tokens_seen": 111616304, + "step": 51715 + }, + { + "epoch": 8.437194127243067, + "grad_norm": 0.07572013884782791, + "learning_rate": 0.0007162765052813199, + "loss": 0.0252, + "num_input_tokens_seen": 111627920, + "step": 51720 + }, + { + "epoch": 8.438009787928221, + "grad_norm": 0.044566810131073, + "learning_rate": 0.0007162123267565972, + "loss": 0.0434, + "num_input_tokens_seen": 111638864, + "step": 51725 + }, + { + "epoch": 8.438825448613377, + "grad_norm": 0.49775147438049316, + "learning_rate": 0.0007161481438500574, + "loss": 0.0437, + "num_input_tokens_seen": 111649296, + "step": 51730 + }, + { + "epoch": 8.439641109298531, + "grad_norm": 0.15197955071926117, + "learning_rate": 0.0007160839565630014, + "loss": 0.0488, + "num_input_tokens_seen": 111659152, + "step": 51735 + }, + { + "epoch": 8.440456769983687, + "grad_norm": 0.10689079761505127, + "learning_rate": 0.0007160197648967298, + "loss": 0.0388, + "num_input_tokens_seen": 111669744, + "step": 51740 + }, + { + "epoch": 8.441272430668842, + "grad_norm": 0.004313585348427296, + "learning_rate": 0.0007159555688525434, + "loss": 0.0261, + "num_input_tokens_seen": 111680304, + "step": 51745 + }, + { + "epoch": 8.442088091353996, + "grad_norm": 0.06742192804813385, + "learning_rate": 0.0007158913684317437, + "loss": 0.1307, + "num_input_tokens_seen": 111692400, + "step": 51750 + }, + { + "epoch": 8.442903752039152, + "grad_norm": 0.006459634285420179, + "learning_rate": 0.0007158271636356315, + "loss": 0.0429, + "num_input_tokens_seen": 111702160, + "step": 51755 + }, + { + "epoch": 8.443719412724306, + "grad_norm": 0.020626895129680634, + "learning_rate": 0.000715762954465508, + "loss": 0.0292, + "num_input_tokens_seen": 111713712, + "step": 51760 + }, + { + "epoch": 8.444535073409462, + "grad_norm": 0.002480248687788844, + "learning_rate": 0.0007156987409226745, + "loss": 0.0549, + "num_input_tokens_seen": 111724112, + "step": 51765 + }, + { + "epoch": 8.445350734094617, + "grad_norm": 0.010978852398693562, + "learning_rate": 0.0007156345230084325, + "loss": 0.0321, + "num_input_tokens_seen": 111735504, + "step": 51770 + }, + { + "epoch": 8.446166394779771, + "grad_norm": 0.0012566217919811606, + "learning_rate": 0.0007155703007240832, + "loss": 0.0191, + "num_input_tokens_seen": 111744912, + "step": 51775 + }, + { + "epoch": 8.446982055464927, + "grad_norm": 0.04769023507833481, + "learning_rate": 0.0007155060740709284, + "loss": 0.0216, + "num_input_tokens_seen": 111756080, + "step": 51780 + }, + { + "epoch": 8.447797716150081, + "grad_norm": 0.031197376549243927, + "learning_rate": 0.0007154418430502696, + "loss": 0.0061, + "num_input_tokens_seen": 111766928, + "step": 51785 + }, + { + "epoch": 8.448613376835237, + "grad_norm": 0.011030906811356544, + "learning_rate": 0.0007153776076634084, + "loss": 0.0235, + "num_input_tokens_seen": 111775760, + "step": 51790 + }, + { + "epoch": 8.449429037520392, + "grad_norm": 0.004557689651846886, + "learning_rate": 0.0007153133679116469, + "loss": 0.1679, + "num_input_tokens_seen": 111787184, + "step": 51795 + }, + { + "epoch": 8.450244698205546, + "grad_norm": 0.02023228257894516, + "learning_rate": 0.0007152491237962867, + "loss": 0.0652, + "num_input_tokens_seen": 111798064, + "step": 51800 + }, + { + "epoch": 8.451060358890702, + "grad_norm": 0.027366291731595993, + "learning_rate": 0.0007151848753186301, + "loss": 0.0877, + "num_input_tokens_seen": 111808016, + "step": 51805 + }, + { + "epoch": 8.451876019575856, + "grad_norm": 0.019394002854824066, + "learning_rate": 0.000715120622479979, + "loss": 0.011, + "num_input_tokens_seen": 111819568, + "step": 51810 + }, + { + "epoch": 8.452691680261012, + "grad_norm": 0.01409931480884552, + "learning_rate": 0.0007150563652816355, + "loss": 0.0088, + "num_input_tokens_seen": 111829840, + "step": 51815 + }, + { + "epoch": 8.453507340946166, + "grad_norm": 0.004109088331460953, + "learning_rate": 0.0007149921037249021, + "loss": 0.0513, + "num_input_tokens_seen": 111841136, + "step": 51820 + }, + { + "epoch": 8.454323001631321, + "grad_norm": 0.0027548891957849264, + "learning_rate": 0.0007149278378110808, + "loss": 0.1236, + "num_input_tokens_seen": 111852208, + "step": 51825 + }, + { + "epoch": 8.455138662316477, + "grad_norm": 0.009796842001378536, + "learning_rate": 0.0007148635675414743, + "loss": 0.0075, + "num_input_tokens_seen": 111862608, + "step": 51830 + }, + { + "epoch": 8.455954323001631, + "grad_norm": 0.0068572526797652245, + "learning_rate": 0.000714799292917385, + "loss": 0.0562, + "num_input_tokens_seen": 111873424, + "step": 51835 + }, + { + "epoch": 8.456769983686787, + "grad_norm": 0.1852836310863495, + "learning_rate": 0.0007147350139401156, + "loss": 0.1259, + "num_input_tokens_seen": 111884016, + "step": 51840 + }, + { + "epoch": 8.45758564437194, + "grad_norm": 0.09862899035215378, + "learning_rate": 0.0007146707306109687, + "loss": 0.0292, + "num_input_tokens_seen": 111895440, + "step": 51845 + }, + { + "epoch": 8.458401305057096, + "grad_norm": 0.22003084421157837, + "learning_rate": 0.000714606442931247, + "loss": 0.0203, + "num_input_tokens_seen": 111907120, + "step": 51850 + }, + { + "epoch": 8.459216965742252, + "grad_norm": 0.02025659941136837, + "learning_rate": 0.0007145421509022536, + "loss": 0.1376, + "num_input_tokens_seen": 111916464, + "step": 51855 + }, + { + "epoch": 8.460032626427406, + "grad_norm": 0.13997063040733337, + "learning_rate": 0.0007144778545252914, + "loss": 0.0299, + "num_input_tokens_seen": 111927376, + "step": 51860 + }, + { + "epoch": 8.460848287112562, + "grad_norm": 0.3448311686515808, + "learning_rate": 0.0007144135538016633, + "loss": 0.0877, + "num_input_tokens_seen": 111937392, + "step": 51865 + }, + { + "epoch": 8.461663947797716, + "grad_norm": 0.08352023363113403, + "learning_rate": 0.0007143492487326726, + "loss": 0.0946, + "num_input_tokens_seen": 111948848, + "step": 51870 + }, + { + "epoch": 8.462479608482871, + "grad_norm": 0.03707576170563698, + "learning_rate": 0.0007142849393196223, + "loss": 0.0096, + "num_input_tokens_seen": 111959664, + "step": 51875 + }, + { + "epoch": 8.463295269168025, + "grad_norm": 0.22436010837554932, + "learning_rate": 0.000714220625563816, + "loss": 0.1542, + "num_input_tokens_seen": 111969712, + "step": 51880 + }, + { + "epoch": 8.464110929853181, + "grad_norm": 0.016553662717342377, + "learning_rate": 0.0007141563074665571, + "loss": 0.1511, + "num_input_tokens_seen": 111981168, + "step": 51885 + }, + { + "epoch": 8.464926590538337, + "grad_norm": 0.3455713391304016, + "learning_rate": 0.0007140919850291488, + "loss": 0.1706, + "num_input_tokens_seen": 111991952, + "step": 51890 + }, + { + "epoch": 8.46574225122349, + "grad_norm": 0.048013024032115936, + "learning_rate": 0.0007140276582528947, + "loss": 0.0244, + "num_input_tokens_seen": 112003024, + "step": 51895 + }, + { + "epoch": 8.466557911908646, + "grad_norm": 0.4979349970817566, + "learning_rate": 0.0007139633271390988, + "loss": 0.1285, + "num_input_tokens_seen": 112013360, + "step": 51900 + }, + { + "epoch": 8.4673735725938, + "grad_norm": 0.09776439517736435, + "learning_rate": 0.0007138989916890644, + "loss": 0.1021, + "num_input_tokens_seen": 112023664, + "step": 51905 + }, + { + "epoch": 8.468189233278956, + "grad_norm": 0.039543576538562775, + "learning_rate": 0.0007138346519040959, + "loss": 0.1098, + "num_input_tokens_seen": 112034928, + "step": 51910 + }, + { + "epoch": 8.469004893964112, + "grad_norm": 0.1539844423532486, + "learning_rate": 0.0007137703077854967, + "loss": 0.1038, + "num_input_tokens_seen": 112046128, + "step": 51915 + }, + { + "epoch": 8.469820554649266, + "grad_norm": 0.0055220467038452625, + "learning_rate": 0.0007137059593345711, + "loss": 0.0075, + "num_input_tokens_seen": 112056944, + "step": 51920 + }, + { + "epoch": 8.470636215334421, + "grad_norm": 0.007494974881410599, + "learning_rate": 0.0007136416065526231, + "loss": 0.1566, + "num_input_tokens_seen": 112067472, + "step": 51925 + }, + { + "epoch": 8.471451876019575, + "grad_norm": 0.02552417851984501, + "learning_rate": 0.0007135772494409569, + "loss": 0.0118, + "num_input_tokens_seen": 112076400, + "step": 51930 + }, + { + "epoch": 8.47226753670473, + "grad_norm": 0.15605245530605316, + "learning_rate": 0.0007135128880008768, + "loss": 0.1198, + "num_input_tokens_seen": 112087312, + "step": 51935 + }, + { + "epoch": 8.473083197389887, + "grad_norm": 0.055323775857686996, + "learning_rate": 0.0007134485222336873, + "loss": 0.1413, + "num_input_tokens_seen": 112098544, + "step": 51940 + }, + { + "epoch": 8.47389885807504, + "grad_norm": 0.10875693708658218, + "learning_rate": 0.0007133841521406925, + "loss": 0.1353, + "num_input_tokens_seen": 112108304, + "step": 51945 + }, + { + "epoch": 8.474714518760196, + "grad_norm": 0.1120438426733017, + "learning_rate": 0.0007133197777231973, + "loss": 0.019, + "num_input_tokens_seen": 112119600, + "step": 51950 + }, + { + "epoch": 8.47553017944535, + "grad_norm": 0.02269870415329933, + "learning_rate": 0.0007132553989825061, + "loss": 0.0138, + "num_input_tokens_seen": 112130192, + "step": 51955 + }, + { + "epoch": 8.476345840130506, + "grad_norm": 0.09473484754562378, + "learning_rate": 0.0007131910159199238, + "loss": 0.2043, + "num_input_tokens_seen": 112142288, + "step": 51960 + }, + { + "epoch": 8.477161500815662, + "grad_norm": 0.23337393999099731, + "learning_rate": 0.000713126628536755, + "loss": 0.0752, + "num_input_tokens_seen": 112154032, + "step": 51965 + }, + { + "epoch": 8.477977161500815, + "grad_norm": 0.0935952216386795, + "learning_rate": 0.0007130622368343048, + "loss": 0.0172, + "num_input_tokens_seen": 112165520, + "step": 51970 + }, + { + "epoch": 8.478792822185971, + "grad_norm": 0.20787476003170013, + "learning_rate": 0.000712997840813878, + "loss": 0.1151, + "num_input_tokens_seen": 112176560, + "step": 51975 + }, + { + "epoch": 8.479608482871125, + "grad_norm": 0.13415087759494781, + "learning_rate": 0.0007129334404767797, + "loss": 0.0182, + "num_input_tokens_seen": 112187248, + "step": 51980 + }, + { + "epoch": 8.48042414355628, + "grad_norm": 0.02902115136384964, + "learning_rate": 0.0007128690358243153, + "loss": 0.013, + "num_input_tokens_seen": 112198128, + "step": 51985 + }, + { + "epoch": 8.481239804241435, + "grad_norm": 0.28016719222068787, + "learning_rate": 0.0007128046268577898, + "loss": 0.0828, + "num_input_tokens_seen": 112209680, + "step": 51990 + }, + { + "epoch": 8.48205546492659, + "grad_norm": 0.03677164018154144, + "learning_rate": 0.0007127402135785086, + "loss": 0.1055, + "num_input_tokens_seen": 112220688, + "step": 51995 + }, + { + "epoch": 8.482871125611746, + "grad_norm": 0.5363232493400574, + "learning_rate": 0.000712675795987777, + "loss": 0.1035, + "num_input_tokens_seen": 112231632, + "step": 52000 + }, + { + "epoch": 8.4836867862969, + "grad_norm": 0.017000947147607803, + "learning_rate": 0.0007126113740869006, + "loss": 0.123, + "num_input_tokens_seen": 112240592, + "step": 52005 + }, + { + "epoch": 8.484502446982056, + "grad_norm": 0.01858418434858322, + "learning_rate": 0.000712546947877185, + "loss": 0.02, + "num_input_tokens_seen": 112251440, + "step": 52010 + }, + { + "epoch": 8.48531810766721, + "grad_norm": 0.11790456622838974, + "learning_rate": 0.0007124825173599359, + "loss": 0.0793, + "num_input_tokens_seen": 112262192, + "step": 52015 + }, + { + "epoch": 8.486133768352365, + "grad_norm": 0.0401974581182003, + "learning_rate": 0.000712418082536459, + "loss": 0.0129, + "num_input_tokens_seen": 112273808, + "step": 52020 + }, + { + "epoch": 8.486949429037521, + "grad_norm": 0.2520396411418915, + "learning_rate": 0.0007123536434080602, + "loss": 0.0472, + "num_input_tokens_seen": 112284304, + "step": 52025 + }, + { + "epoch": 8.487765089722675, + "grad_norm": 0.22187262773513794, + "learning_rate": 0.0007122891999760454, + "loss": 0.0723, + "num_input_tokens_seen": 112295152, + "step": 52030 + }, + { + "epoch": 8.48858075040783, + "grad_norm": 0.017292635515332222, + "learning_rate": 0.0007122247522417206, + "loss": 0.0956, + "num_input_tokens_seen": 112305648, + "step": 52035 + }, + { + "epoch": 8.489396411092985, + "grad_norm": 0.07060546427965164, + "learning_rate": 0.0007121603002063921, + "loss": 0.0466, + "num_input_tokens_seen": 112316144, + "step": 52040 + }, + { + "epoch": 8.49021207177814, + "grad_norm": 0.021480968222022057, + "learning_rate": 0.000712095843871366, + "loss": 0.0303, + "num_input_tokens_seen": 112328560, + "step": 52045 + }, + { + "epoch": 8.491027732463296, + "grad_norm": 0.07707304507493973, + "learning_rate": 0.0007120313832379483, + "loss": 0.0452, + "num_input_tokens_seen": 112338640, + "step": 52050 + }, + { + "epoch": 8.49184339314845, + "grad_norm": 0.029224997386336327, + "learning_rate": 0.000711966918307446, + "loss": 0.0766, + "num_input_tokens_seen": 112349232, + "step": 52055 + }, + { + "epoch": 8.492659053833606, + "grad_norm": 0.060784850269556046, + "learning_rate": 0.000711902449081165, + "loss": 0.0377, + "num_input_tokens_seen": 112360400, + "step": 52060 + }, + { + "epoch": 8.49347471451876, + "grad_norm": 0.056990232318639755, + "learning_rate": 0.000711837975560412, + "loss": 0.064, + "num_input_tokens_seen": 112371408, + "step": 52065 + }, + { + "epoch": 8.494290375203915, + "grad_norm": 0.014333458617329597, + "learning_rate": 0.0007117734977464937, + "loss": 0.019, + "num_input_tokens_seen": 112382544, + "step": 52070 + }, + { + "epoch": 8.49510603588907, + "grad_norm": 0.09614971280097961, + "learning_rate": 0.0007117090156407168, + "loss": 0.091, + "num_input_tokens_seen": 112392688, + "step": 52075 + }, + { + "epoch": 8.495921696574225, + "grad_norm": 0.04041562229394913, + "learning_rate": 0.0007116445292443883, + "loss": 0.1091, + "num_input_tokens_seen": 112404240, + "step": 52080 + }, + { + "epoch": 8.49673735725938, + "grad_norm": 0.0139201320707798, + "learning_rate": 0.0007115800385588148, + "loss": 0.0793, + "num_input_tokens_seen": 112414768, + "step": 52085 + }, + { + "epoch": 8.497553017944535, + "grad_norm": 0.058195810765028, + "learning_rate": 0.0007115155435853034, + "loss": 0.0909, + "num_input_tokens_seen": 112425424, + "step": 52090 + }, + { + "epoch": 8.49836867862969, + "grad_norm": 0.03535224124789238, + "learning_rate": 0.0007114510443251613, + "loss": 0.0282, + "num_input_tokens_seen": 112436720, + "step": 52095 + }, + { + "epoch": 8.499184339314844, + "grad_norm": 0.023233380168676376, + "learning_rate": 0.0007113865407796955, + "loss": 0.1284, + "num_input_tokens_seen": 112446896, + "step": 52100 + }, + { + "epoch": 8.5, + "grad_norm": 0.00572694418951869, + "learning_rate": 0.0007113220329502131, + "loss": 0.099, + "num_input_tokens_seen": 112458064, + "step": 52105 + }, + { + "epoch": 8.500815660685156, + "grad_norm": 0.08019023388624191, + "learning_rate": 0.0007112575208380219, + "loss": 0.0655, + "num_input_tokens_seen": 112469200, + "step": 52110 + }, + { + "epoch": 8.50163132137031, + "grad_norm": 0.46045657992362976, + "learning_rate": 0.0007111930044444288, + "loss": 0.2202, + "num_input_tokens_seen": 112479888, + "step": 52115 + }, + { + "epoch": 8.502446982055465, + "grad_norm": 0.01028076559305191, + "learning_rate": 0.0007111284837707416, + "loss": 0.0947, + "num_input_tokens_seen": 112490896, + "step": 52120 + }, + { + "epoch": 8.50326264274062, + "grad_norm": 0.06554724276065826, + "learning_rate": 0.0007110639588182679, + "loss": 0.0318, + "num_input_tokens_seen": 112503152, + "step": 52125 + }, + { + "epoch": 8.504078303425775, + "grad_norm": 0.016513722017407417, + "learning_rate": 0.0007109994295883154, + "loss": 0.1838, + "num_input_tokens_seen": 112513936, + "step": 52130 + }, + { + "epoch": 8.50489396411093, + "grad_norm": 0.14783404767513275, + "learning_rate": 0.0007109348960821916, + "loss": 0.0403, + "num_input_tokens_seen": 112523536, + "step": 52135 + }, + { + "epoch": 8.505709624796085, + "grad_norm": 0.02176591381430626, + "learning_rate": 0.0007108703583012047, + "loss": 0.0857, + "num_input_tokens_seen": 112534512, + "step": 52140 + }, + { + "epoch": 8.50652528548124, + "grad_norm": 0.11707507818937302, + "learning_rate": 0.0007108058162466624, + "loss": 0.2024, + "num_input_tokens_seen": 112546032, + "step": 52145 + }, + { + "epoch": 8.507340946166394, + "grad_norm": 0.31073057651519775, + "learning_rate": 0.0007107412699198729, + "loss": 0.0781, + "num_input_tokens_seen": 112556016, + "step": 52150 + }, + { + "epoch": 8.50815660685155, + "grad_norm": 0.11862120032310486, + "learning_rate": 0.0007106767193221442, + "loss": 0.0774, + "num_input_tokens_seen": 112565520, + "step": 52155 + }, + { + "epoch": 8.508972267536706, + "grad_norm": 0.4077545404434204, + "learning_rate": 0.0007106121644547844, + "loss": 0.0795, + "num_input_tokens_seen": 112575824, + "step": 52160 + }, + { + "epoch": 8.50978792822186, + "grad_norm": 0.1618225872516632, + "learning_rate": 0.000710547605319102, + "loss": 0.0665, + "num_input_tokens_seen": 112587056, + "step": 52165 + }, + { + "epoch": 8.510603588907015, + "grad_norm": 0.07212002575397491, + "learning_rate": 0.0007104830419164052, + "loss": 0.0246, + "num_input_tokens_seen": 112598192, + "step": 52170 + }, + { + "epoch": 8.51141924959217, + "grad_norm": 0.1664574295282364, + "learning_rate": 0.0007104184742480025, + "loss": 0.0194, + "num_input_tokens_seen": 112609584, + "step": 52175 + }, + { + "epoch": 8.512234910277325, + "grad_norm": 0.047069571912288666, + "learning_rate": 0.0007103539023152025, + "loss": 0.0488, + "num_input_tokens_seen": 112620720, + "step": 52180 + }, + { + "epoch": 8.513050570962479, + "grad_norm": 0.04685068875551224, + "learning_rate": 0.0007102893261193141, + "loss": 0.0521, + "num_input_tokens_seen": 112631696, + "step": 52185 + }, + { + "epoch": 8.513866231647635, + "grad_norm": 0.0098946588113904, + "learning_rate": 0.0007102247456616456, + "loss": 0.0635, + "num_input_tokens_seen": 112641872, + "step": 52190 + }, + { + "epoch": 8.51468189233279, + "grad_norm": 0.006688955705612898, + "learning_rate": 0.0007101601609435057, + "loss": 0.0133, + "num_input_tokens_seen": 112652240, + "step": 52195 + }, + { + "epoch": 8.515497553017944, + "grad_norm": 0.012163212522864342, + "learning_rate": 0.0007100955719662038, + "loss": 0.0182, + "num_input_tokens_seen": 112663824, + "step": 52200 + }, + { + "epoch": 8.5163132137031, + "grad_norm": 0.030284985899925232, + "learning_rate": 0.0007100309787310485, + "loss": 0.0165, + "num_input_tokens_seen": 112675216, + "step": 52205 + }, + { + "epoch": 8.517128874388254, + "grad_norm": 0.043319232761859894, + "learning_rate": 0.0007099663812393489, + "loss": 0.0111, + "num_input_tokens_seen": 112684464, + "step": 52210 + }, + { + "epoch": 8.51794453507341, + "grad_norm": 0.035232290625572205, + "learning_rate": 0.0007099017794924144, + "loss": 0.0152, + "num_input_tokens_seen": 112695120, + "step": 52215 + }, + { + "epoch": 8.518760195758565, + "grad_norm": 0.007696705870330334, + "learning_rate": 0.000709837173491554, + "loss": 0.0429, + "num_input_tokens_seen": 112705776, + "step": 52220 + }, + { + "epoch": 8.51957585644372, + "grad_norm": 0.027577213943004608, + "learning_rate": 0.0007097725632380771, + "loss": 0.0399, + "num_input_tokens_seen": 112716368, + "step": 52225 + }, + { + "epoch": 8.520391517128875, + "grad_norm": 0.010155752301216125, + "learning_rate": 0.0007097079487332931, + "loss": 0.0117, + "num_input_tokens_seen": 112727440, + "step": 52230 + }, + { + "epoch": 8.521207177814029, + "grad_norm": 0.23838578164577484, + "learning_rate": 0.0007096433299785113, + "loss": 0.2089, + "num_input_tokens_seen": 112736336, + "step": 52235 + }, + { + "epoch": 8.522022838499185, + "grad_norm": 0.0322929322719574, + "learning_rate": 0.0007095787069750416, + "loss": 0.0337, + "num_input_tokens_seen": 112747312, + "step": 52240 + }, + { + "epoch": 8.522838499184338, + "grad_norm": 0.011756251566112041, + "learning_rate": 0.0007095140797241936, + "loss": 0.1305, + "num_input_tokens_seen": 112758160, + "step": 52245 + }, + { + "epoch": 8.523654159869494, + "grad_norm": 0.02623768523335457, + "learning_rate": 0.0007094494482272768, + "loss": 0.0167, + "num_input_tokens_seen": 112768080, + "step": 52250 + }, + { + "epoch": 8.52446982055465, + "grad_norm": 0.038000885397195816, + "learning_rate": 0.0007093848124856014, + "loss": 0.1232, + "num_input_tokens_seen": 112778416, + "step": 52255 + }, + { + "epoch": 8.525285481239804, + "grad_norm": 0.34021851420402527, + "learning_rate": 0.000709320172500477, + "loss": 0.0533, + "num_input_tokens_seen": 112788944, + "step": 52260 + }, + { + "epoch": 8.52610114192496, + "grad_norm": 0.004646057263016701, + "learning_rate": 0.0007092555282732139, + "loss": 0.0087, + "num_input_tokens_seen": 112799664, + "step": 52265 + }, + { + "epoch": 8.526916802610113, + "grad_norm": 0.018455583602190018, + "learning_rate": 0.000709190879805122, + "loss": 0.0903, + "num_input_tokens_seen": 112809840, + "step": 52270 + }, + { + "epoch": 8.52773246329527, + "grad_norm": 0.2697279453277588, + "learning_rate": 0.0007091262270975116, + "loss": 0.1009, + "num_input_tokens_seen": 112820240, + "step": 52275 + }, + { + "epoch": 8.528548123980425, + "grad_norm": 0.013409428298473358, + "learning_rate": 0.0007090615701516929, + "loss": 0.055, + "num_input_tokens_seen": 112830704, + "step": 52280 + }, + { + "epoch": 8.529363784665579, + "grad_norm": 0.22376123070716858, + "learning_rate": 0.0007089969089689761, + "loss": 0.0816, + "num_input_tokens_seen": 112842352, + "step": 52285 + }, + { + "epoch": 8.530179445350734, + "grad_norm": 0.34014448523521423, + "learning_rate": 0.0007089322435506719, + "loss": 0.0597, + "num_input_tokens_seen": 112853904, + "step": 52290 + }, + { + "epoch": 8.530995106035888, + "grad_norm": 0.13514401018619537, + "learning_rate": 0.0007088675738980909, + "loss": 0.0254, + "num_input_tokens_seen": 112863504, + "step": 52295 + }, + { + "epoch": 8.531810766721044, + "grad_norm": 0.12043710052967072, + "learning_rate": 0.0007088029000125435, + "loss": 0.0176, + "num_input_tokens_seen": 112873808, + "step": 52300 + }, + { + "epoch": 8.5326264274062, + "grad_norm": 0.5051378607749939, + "learning_rate": 0.0007087382218953403, + "loss": 0.0664, + "num_input_tokens_seen": 112883696, + "step": 52305 + }, + { + "epoch": 8.533442088091354, + "grad_norm": 0.01088625006377697, + "learning_rate": 0.0007086735395477923, + "loss": 0.0134, + "num_input_tokens_seen": 112893424, + "step": 52310 + }, + { + "epoch": 8.53425774877651, + "grad_norm": 0.0050513120368123055, + "learning_rate": 0.0007086088529712103, + "loss": 0.0085, + "num_input_tokens_seen": 112904016, + "step": 52315 + }, + { + "epoch": 8.535073409461663, + "grad_norm": 0.5004691481590271, + "learning_rate": 0.0007085441621669053, + "loss": 0.0802, + "num_input_tokens_seen": 112915376, + "step": 52320 + }, + { + "epoch": 8.535889070146819, + "grad_norm": 0.002639029873535037, + "learning_rate": 0.0007084794671361883, + "loss": 0.0178, + "num_input_tokens_seen": 112925424, + "step": 52325 + }, + { + "epoch": 8.536704730831975, + "grad_norm": 0.0072048138827085495, + "learning_rate": 0.0007084147678803703, + "loss": 0.0395, + "num_input_tokens_seen": 112936656, + "step": 52330 + }, + { + "epoch": 8.537520391517129, + "grad_norm": 0.02907838672399521, + "learning_rate": 0.0007083500644007628, + "loss": 0.0279, + "num_input_tokens_seen": 112947600, + "step": 52335 + }, + { + "epoch": 8.538336052202284, + "grad_norm": 0.0023188372142612934, + "learning_rate": 0.0007082853566986769, + "loss": 0.0135, + "num_input_tokens_seen": 112957840, + "step": 52340 + }, + { + "epoch": 8.539151712887438, + "grad_norm": 0.4080224633216858, + "learning_rate": 0.0007082206447754239, + "loss": 0.0783, + "num_input_tokens_seen": 112969488, + "step": 52345 + }, + { + "epoch": 8.539967373572594, + "grad_norm": 0.012631695717573166, + "learning_rate": 0.0007081559286323155, + "loss": 0.0117, + "num_input_tokens_seen": 112979216, + "step": 52350 + }, + { + "epoch": 8.540783034257748, + "grad_norm": 0.018039526417851448, + "learning_rate": 0.0007080912082706631, + "loss": 0.0101, + "num_input_tokens_seen": 112990736, + "step": 52355 + }, + { + "epoch": 8.541598694942904, + "grad_norm": 0.28995388746261597, + "learning_rate": 0.0007080264836917783, + "loss": 0.1258, + "num_input_tokens_seen": 113001296, + "step": 52360 + }, + { + "epoch": 8.54241435562806, + "grad_norm": 0.002150471555069089, + "learning_rate": 0.000707961754896973, + "loss": 0.0095, + "num_input_tokens_seen": 113011312, + "step": 52365 + }, + { + "epoch": 8.543230016313213, + "grad_norm": 0.23483797907829285, + "learning_rate": 0.0007078970218875589, + "loss": 0.0461, + "num_input_tokens_seen": 113020688, + "step": 52370 + }, + { + "epoch": 8.544045676998369, + "grad_norm": 0.043991025537252426, + "learning_rate": 0.0007078322846648479, + "loss": 0.0427, + "num_input_tokens_seen": 113031888, + "step": 52375 + }, + { + "epoch": 8.544861337683523, + "grad_norm": 0.10694552212953568, + "learning_rate": 0.0007077675432301521, + "loss": 0.1368, + "num_input_tokens_seen": 113042800, + "step": 52380 + }, + { + "epoch": 8.545676998368679, + "grad_norm": 0.05213603749871254, + "learning_rate": 0.0007077027975847833, + "loss": 0.0182, + "num_input_tokens_seen": 113052176, + "step": 52385 + }, + { + "epoch": 8.546492659053834, + "grad_norm": 0.015906430780887604, + "learning_rate": 0.0007076380477300539, + "loss": 0.0274, + "num_input_tokens_seen": 113062608, + "step": 52390 + }, + { + "epoch": 8.547308319738988, + "grad_norm": 0.02583555318415165, + "learning_rate": 0.0007075732936672761, + "loss": 0.035, + "num_input_tokens_seen": 113075248, + "step": 52395 + }, + { + "epoch": 8.548123980424144, + "grad_norm": 0.09450580179691315, + "learning_rate": 0.0007075085353977622, + "loss": 0.0767, + "num_input_tokens_seen": 113086256, + "step": 52400 + }, + { + "epoch": 8.548939641109298, + "grad_norm": 0.08770249783992767, + "learning_rate": 0.0007074437729228245, + "loss": 0.0209, + "num_input_tokens_seen": 113096752, + "step": 52405 + }, + { + "epoch": 8.549755301794454, + "grad_norm": 0.001393382903188467, + "learning_rate": 0.0007073790062437755, + "loss": 0.0469, + "num_input_tokens_seen": 113106480, + "step": 52410 + }, + { + "epoch": 8.550570962479608, + "grad_norm": 0.022632304579019547, + "learning_rate": 0.000707314235361928, + "loss": 0.1394, + "num_input_tokens_seen": 113117424, + "step": 52415 + }, + { + "epoch": 8.551386623164763, + "grad_norm": 0.2660631239414215, + "learning_rate": 0.0007072494602785945, + "loss": 0.1188, + "num_input_tokens_seen": 113127728, + "step": 52420 + }, + { + "epoch": 8.552202283849919, + "grad_norm": 0.028608199208974838, + "learning_rate": 0.0007071846809950878, + "loss": 0.0076, + "num_input_tokens_seen": 113138000, + "step": 52425 + }, + { + "epoch": 8.553017944535073, + "grad_norm": 0.00979638658463955, + "learning_rate": 0.0007071198975127206, + "loss": 0.0286, + "num_input_tokens_seen": 113148752, + "step": 52430 + }, + { + "epoch": 8.553833605220229, + "grad_norm": 0.009747752919793129, + "learning_rate": 0.000707055109832806, + "loss": 0.0511, + "num_input_tokens_seen": 113159600, + "step": 52435 + }, + { + "epoch": 8.554649265905383, + "grad_norm": 0.05585815757513046, + "learning_rate": 0.0007069903179566569, + "loss": 0.04, + "num_input_tokens_seen": 113170416, + "step": 52440 + }, + { + "epoch": 8.555464926590538, + "grad_norm": 0.32199397683143616, + "learning_rate": 0.0007069255218855865, + "loss": 0.056, + "num_input_tokens_seen": 113179984, + "step": 52445 + }, + { + "epoch": 8.556280587275694, + "grad_norm": 0.008434474468231201, + "learning_rate": 0.0007068607216209078, + "loss": 0.0687, + "num_input_tokens_seen": 113189680, + "step": 52450 + }, + { + "epoch": 8.557096247960848, + "grad_norm": 0.008195164613425732, + "learning_rate": 0.0007067959171639342, + "loss": 0.0134, + "num_input_tokens_seen": 113200112, + "step": 52455 + }, + { + "epoch": 8.557911908646004, + "grad_norm": 0.002764615463092923, + "learning_rate": 0.000706731108515979, + "loss": 0.025, + "num_input_tokens_seen": 113210416, + "step": 52460 + }, + { + "epoch": 8.558727569331158, + "grad_norm": 0.0018756149802356958, + "learning_rate": 0.0007066662956783556, + "loss": 0.003, + "num_input_tokens_seen": 113221392, + "step": 52465 + }, + { + "epoch": 8.559543230016313, + "grad_norm": 0.010950838215649128, + "learning_rate": 0.0007066014786523776, + "loss": 0.135, + "num_input_tokens_seen": 113232528, + "step": 52470 + }, + { + "epoch": 8.560358890701469, + "grad_norm": 0.006663813255727291, + "learning_rate": 0.0007065366574393585, + "loss": 0.0833, + "num_input_tokens_seen": 113243056, + "step": 52475 + }, + { + "epoch": 8.561174551386623, + "grad_norm": 0.27490487694740295, + "learning_rate": 0.000706471832040612, + "loss": 0.1055, + "num_input_tokens_seen": 113254576, + "step": 52480 + }, + { + "epoch": 8.561990212071779, + "grad_norm": 0.04118796065449715, + "learning_rate": 0.000706407002457452, + "loss": 0.0878, + "num_input_tokens_seen": 113263472, + "step": 52485 + }, + { + "epoch": 8.562805872756933, + "grad_norm": 0.024681242182850838, + "learning_rate": 0.0007063421686911921, + "loss": 0.0153, + "num_input_tokens_seen": 113276112, + "step": 52490 + }, + { + "epoch": 8.563621533442088, + "grad_norm": 0.0034718490205705166, + "learning_rate": 0.0007062773307431465, + "loss": 0.0397, + "num_input_tokens_seen": 113285520, + "step": 52495 + }, + { + "epoch": 8.564437194127244, + "grad_norm": 0.10972175002098083, + "learning_rate": 0.000706212488614629, + "loss": 0.0382, + "num_input_tokens_seen": 113296304, + "step": 52500 + }, + { + "epoch": 8.565252854812398, + "grad_norm": 0.0008186041959561408, + "learning_rate": 0.0007061476423069539, + "loss": 0.011, + "num_input_tokens_seen": 113307632, + "step": 52505 + }, + { + "epoch": 8.566068515497554, + "grad_norm": 0.42714810371398926, + "learning_rate": 0.0007060827918214353, + "loss": 0.0972, + "num_input_tokens_seen": 113318352, + "step": 52510 + }, + { + "epoch": 8.566884176182707, + "grad_norm": 0.006502964999526739, + "learning_rate": 0.0007060179371593876, + "loss": 0.2211, + "num_input_tokens_seen": 113329552, + "step": 52515 + }, + { + "epoch": 8.567699836867863, + "grad_norm": 0.06003216281533241, + "learning_rate": 0.0007059530783221249, + "loss": 0.0372, + "num_input_tokens_seen": 113339216, + "step": 52520 + }, + { + "epoch": 8.568515497553017, + "grad_norm": 0.1339084357023239, + "learning_rate": 0.0007058882153109618, + "loss": 0.1495, + "num_input_tokens_seen": 113349904, + "step": 52525 + }, + { + "epoch": 8.569331158238173, + "grad_norm": 0.005239514168351889, + "learning_rate": 0.000705823348127213, + "loss": 0.0416, + "num_input_tokens_seen": 113358832, + "step": 52530 + }, + { + "epoch": 8.570146818923329, + "grad_norm": 0.011649632826447487, + "learning_rate": 0.0007057584767721927, + "loss": 0.1761, + "num_input_tokens_seen": 113370480, + "step": 52535 + }, + { + "epoch": 8.570962479608482, + "grad_norm": 0.003607572987675667, + "learning_rate": 0.000705693601247216, + "loss": 0.0195, + "num_input_tokens_seen": 113380592, + "step": 52540 + }, + { + "epoch": 8.571778140293638, + "grad_norm": 0.24889199435710907, + "learning_rate": 0.0007056287215535976, + "loss": 0.2102, + "num_input_tokens_seen": 113392112, + "step": 52545 + }, + { + "epoch": 8.572593800978792, + "grad_norm": 0.010209997184574604, + "learning_rate": 0.0007055638376926522, + "loss": 0.0478, + "num_input_tokens_seen": 113403152, + "step": 52550 + }, + { + "epoch": 8.573409461663948, + "grad_norm": 0.19611375033855438, + "learning_rate": 0.0007054989496656949, + "loss": 0.0175, + "num_input_tokens_seen": 113413424, + "step": 52555 + }, + { + "epoch": 8.574225122349104, + "grad_norm": 0.12275520712137222, + "learning_rate": 0.0007054340574740405, + "loss": 0.1246, + "num_input_tokens_seen": 113424848, + "step": 52560 + }, + { + "epoch": 8.575040783034257, + "grad_norm": 0.5146405696868896, + "learning_rate": 0.0007053691611190045, + "loss": 0.4495, + "num_input_tokens_seen": 113434608, + "step": 52565 + }, + { + "epoch": 8.575856443719413, + "grad_norm": 0.01602541282773018, + "learning_rate": 0.0007053042606019017, + "loss": 0.0415, + "num_input_tokens_seen": 113446224, + "step": 52570 + }, + { + "epoch": 8.576672104404567, + "grad_norm": 0.014384308829903603, + "learning_rate": 0.0007052393559240479, + "loss": 0.0859, + "num_input_tokens_seen": 113457296, + "step": 52575 + }, + { + "epoch": 8.577487765089723, + "grad_norm": 0.002115190727636218, + "learning_rate": 0.0007051744470867581, + "loss": 0.0223, + "num_input_tokens_seen": 113468720, + "step": 52580 + }, + { + "epoch": 8.578303425774878, + "grad_norm": 0.36821067333221436, + "learning_rate": 0.0007051095340913478, + "loss": 0.1036, + "num_input_tokens_seen": 113479152, + "step": 52585 + }, + { + "epoch": 8.579119086460032, + "grad_norm": 0.07215467840433121, + "learning_rate": 0.0007050446169391326, + "loss": 0.0313, + "num_input_tokens_seen": 113490608, + "step": 52590 + }, + { + "epoch": 8.579934747145188, + "grad_norm": 0.009305719286203384, + "learning_rate": 0.0007049796956314281, + "loss": 0.0513, + "num_input_tokens_seen": 113501968, + "step": 52595 + }, + { + "epoch": 8.580750407830342, + "grad_norm": 0.008127324283123016, + "learning_rate": 0.00070491477016955, + "loss": 0.0097, + "num_input_tokens_seen": 113512944, + "step": 52600 + }, + { + "epoch": 8.581566068515498, + "grad_norm": 0.02540350705385208, + "learning_rate": 0.0007048498405548142, + "loss": 0.008, + "num_input_tokens_seen": 113524304, + "step": 52605 + }, + { + "epoch": 8.582381729200652, + "grad_norm": 0.23907071352005005, + "learning_rate": 0.0007047849067885366, + "loss": 0.103, + "num_input_tokens_seen": 113536400, + "step": 52610 + }, + { + "epoch": 8.583197389885807, + "grad_norm": 0.2494339942932129, + "learning_rate": 0.000704719968872033, + "loss": 0.0868, + "num_input_tokens_seen": 113546992, + "step": 52615 + }, + { + "epoch": 8.584013050570963, + "grad_norm": 0.019342707470059395, + "learning_rate": 0.0007046550268066194, + "loss": 0.0123, + "num_input_tokens_seen": 113557808, + "step": 52620 + }, + { + "epoch": 8.584828711256117, + "grad_norm": 0.16084115207195282, + "learning_rate": 0.0007045900805936122, + "loss": 0.0389, + "num_input_tokens_seen": 113568336, + "step": 52625 + }, + { + "epoch": 8.585644371941273, + "grad_norm": 0.31939661502838135, + "learning_rate": 0.0007045251302343276, + "loss": 0.0422, + "num_input_tokens_seen": 113579280, + "step": 52630 + }, + { + "epoch": 8.586460032626427, + "grad_norm": 0.03331773728132248, + "learning_rate": 0.0007044601757300815, + "loss": 0.0334, + "num_input_tokens_seen": 113591024, + "step": 52635 + }, + { + "epoch": 8.587275693311582, + "grad_norm": 0.397600382566452, + "learning_rate": 0.0007043952170821907, + "loss": 0.0717, + "num_input_tokens_seen": 113602032, + "step": 52640 + }, + { + "epoch": 8.588091353996738, + "grad_norm": 0.032990712672472, + "learning_rate": 0.0007043302542919715, + "loss": 0.0133, + "num_input_tokens_seen": 113612688, + "step": 52645 + }, + { + "epoch": 8.588907014681892, + "grad_norm": 0.028321128338575363, + "learning_rate": 0.0007042652873607405, + "loss": 0.0271, + "num_input_tokens_seen": 113622224, + "step": 52650 + }, + { + "epoch": 8.589722675367048, + "grad_norm": 0.3022190034389496, + "learning_rate": 0.0007042003162898143, + "loss": 0.1564, + "num_input_tokens_seen": 113631920, + "step": 52655 + }, + { + "epoch": 8.590538336052202, + "grad_norm": 0.01059572584927082, + "learning_rate": 0.0007041353410805097, + "loss": 0.0253, + "num_input_tokens_seen": 113642672, + "step": 52660 + }, + { + "epoch": 8.591353996737357, + "grad_norm": 0.0031689044553786516, + "learning_rate": 0.0007040703617341434, + "loss": 0.1118, + "num_input_tokens_seen": 113653456, + "step": 52665 + }, + { + "epoch": 8.592169657422513, + "grad_norm": 0.37777236104011536, + "learning_rate": 0.0007040053782520324, + "loss": 0.0859, + "num_input_tokens_seen": 113664304, + "step": 52670 + }, + { + "epoch": 8.592985318107667, + "grad_norm": 0.40668317675590515, + "learning_rate": 0.0007039403906354936, + "loss": 0.0993, + "num_input_tokens_seen": 113675024, + "step": 52675 + }, + { + "epoch": 8.593800978792823, + "grad_norm": 0.005418053362518549, + "learning_rate": 0.0007038753988858439, + "loss": 0.0478, + "num_input_tokens_seen": 113685200, + "step": 52680 + }, + { + "epoch": 8.594616639477977, + "grad_norm": 0.018186653032898903, + "learning_rate": 0.0007038104030044008, + "loss": 0.0144, + "num_input_tokens_seen": 113696528, + "step": 52685 + }, + { + "epoch": 8.595432300163132, + "grad_norm": 0.0025320840068161488, + "learning_rate": 0.0007037454029924814, + "loss": 0.0187, + "num_input_tokens_seen": 113707568, + "step": 52690 + }, + { + "epoch": 8.596247960848288, + "grad_norm": 0.009221343323588371, + "learning_rate": 0.0007036803988514028, + "loss": 0.0332, + "num_input_tokens_seen": 113719280, + "step": 52695 + }, + { + "epoch": 8.597063621533442, + "grad_norm": 0.264361709356308, + "learning_rate": 0.0007036153905824825, + "loss": 0.1396, + "num_input_tokens_seen": 113729072, + "step": 52700 + }, + { + "epoch": 8.597879282218598, + "grad_norm": 0.014137408696115017, + "learning_rate": 0.0007035503781870379, + "loss": 0.022, + "num_input_tokens_seen": 113740784, + "step": 52705 + }, + { + "epoch": 8.598694942903752, + "grad_norm": 0.008555461652576923, + "learning_rate": 0.0007034853616663868, + "loss": 0.0116, + "num_input_tokens_seen": 113751088, + "step": 52710 + }, + { + "epoch": 8.599510603588907, + "grad_norm": 0.3061891496181488, + "learning_rate": 0.0007034203410218467, + "loss": 0.2085, + "num_input_tokens_seen": 113762672, + "step": 52715 + }, + { + "epoch": 8.600326264274061, + "grad_norm": 0.015425005927681923, + "learning_rate": 0.0007033553162547355, + "loss": 0.0773, + "num_input_tokens_seen": 113773072, + "step": 52720 + }, + { + "epoch": 8.601141924959217, + "grad_norm": 0.007922991178929806, + "learning_rate": 0.0007032902873663707, + "loss": 0.0508, + "num_input_tokens_seen": 113784080, + "step": 52725 + }, + { + "epoch": 8.601957585644373, + "grad_norm": 0.17800869047641754, + "learning_rate": 0.0007032252543580702, + "loss": 0.0241, + "num_input_tokens_seen": 113794704, + "step": 52730 + }, + { + "epoch": 8.602773246329527, + "grad_norm": 0.1993740051984787, + "learning_rate": 0.0007031602172311523, + "loss": 0.0953, + "num_input_tokens_seen": 113806512, + "step": 52735 + }, + { + "epoch": 8.603588907014682, + "grad_norm": 0.01682325452566147, + "learning_rate": 0.0007030951759869347, + "loss": 0.0447, + "num_input_tokens_seen": 113817872, + "step": 52740 + }, + { + "epoch": 8.604404567699836, + "grad_norm": 0.1043018028140068, + "learning_rate": 0.0007030301306267358, + "loss": 0.1125, + "num_input_tokens_seen": 113829808, + "step": 52745 + }, + { + "epoch": 8.605220228384992, + "grad_norm": 0.0029123290441930294, + "learning_rate": 0.0007029650811518737, + "loss": 0.0147, + "num_input_tokens_seen": 113840656, + "step": 52750 + }, + { + "epoch": 8.606035889070148, + "grad_norm": 0.041133325546979904, + "learning_rate": 0.0007029000275636669, + "loss": 0.0106, + "num_input_tokens_seen": 113851664, + "step": 52755 + }, + { + "epoch": 8.606851549755302, + "grad_norm": 0.15576297044754028, + "learning_rate": 0.0007028349698634335, + "loss": 0.0293, + "num_input_tokens_seen": 113862288, + "step": 52760 + }, + { + "epoch": 8.607667210440457, + "grad_norm": 0.006814592983573675, + "learning_rate": 0.0007027699080524923, + "loss": 0.0374, + "num_input_tokens_seen": 113873456, + "step": 52765 + }, + { + "epoch": 8.608482871125611, + "grad_norm": 0.0027489762287586927, + "learning_rate": 0.0007027048421321616, + "loss": 0.0575, + "num_input_tokens_seen": 113883536, + "step": 52770 + }, + { + "epoch": 8.609298531810767, + "grad_norm": 0.016227509826421738, + "learning_rate": 0.0007026397721037601, + "loss": 0.0092, + "num_input_tokens_seen": 113893520, + "step": 52775 + }, + { + "epoch": 8.61011419249592, + "grad_norm": 0.007164679002016783, + "learning_rate": 0.0007025746979686065, + "loss": 0.1149, + "num_input_tokens_seen": 113904592, + "step": 52780 + }, + { + "epoch": 8.610929853181077, + "grad_norm": 0.029857538640499115, + "learning_rate": 0.0007025096197280196, + "loss": 0.0474, + "num_input_tokens_seen": 113916272, + "step": 52785 + }, + { + "epoch": 8.611745513866232, + "grad_norm": 0.0027718052733689547, + "learning_rate": 0.0007024445373833185, + "loss": 0.0333, + "num_input_tokens_seen": 113928336, + "step": 52790 + }, + { + "epoch": 8.612561174551386, + "grad_norm": 0.009753159247338772, + "learning_rate": 0.000702379450935822, + "loss": 0.0907, + "num_input_tokens_seen": 113939920, + "step": 52795 + }, + { + "epoch": 8.613376835236542, + "grad_norm": 0.12790068984031677, + "learning_rate": 0.0007023143603868492, + "loss": 0.0381, + "num_input_tokens_seen": 113949616, + "step": 52800 + }, + { + "epoch": 8.614192495921696, + "grad_norm": 0.009857307188212872, + "learning_rate": 0.0007022492657377192, + "loss": 0.0755, + "num_input_tokens_seen": 113959408, + "step": 52805 + }, + { + "epoch": 8.615008156606851, + "grad_norm": 0.023038728162646294, + "learning_rate": 0.0007021841669897511, + "loss": 0.1311, + "num_input_tokens_seen": 113970736, + "step": 52810 + }, + { + "epoch": 8.615823817292007, + "grad_norm": 0.004647457040846348, + "learning_rate": 0.0007021190641442645, + "loss": 0.0237, + "num_input_tokens_seen": 113981008, + "step": 52815 + }, + { + "epoch": 8.616639477977161, + "grad_norm": 0.00987796951085329, + "learning_rate": 0.0007020539572025788, + "loss": 0.0833, + "num_input_tokens_seen": 113992816, + "step": 52820 + }, + { + "epoch": 8.617455138662317, + "grad_norm": 0.28402289748191833, + "learning_rate": 0.0007019888461660132, + "loss": 0.0416, + "num_input_tokens_seen": 114005392, + "step": 52825 + }, + { + "epoch": 8.61827079934747, + "grad_norm": 0.05014542490243912, + "learning_rate": 0.0007019237310358874, + "loss": 0.044, + "num_input_tokens_seen": 114015600, + "step": 52830 + }, + { + "epoch": 8.619086460032626, + "grad_norm": 0.04443823918700218, + "learning_rate": 0.000701858611813521, + "loss": 0.127, + "num_input_tokens_seen": 114026448, + "step": 52835 + }, + { + "epoch": 8.619902120717782, + "grad_norm": 0.2592226564884186, + "learning_rate": 0.0007017934885002339, + "loss": 0.1222, + "num_input_tokens_seen": 114037552, + "step": 52840 + }, + { + "epoch": 8.620717781402936, + "grad_norm": 0.0097408602014184, + "learning_rate": 0.0007017283610973456, + "loss": 0.1346, + "num_input_tokens_seen": 114049072, + "step": 52845 + }, + { + "epoch": 8.621533442088092, + "grad_norm": 0.01587936282157898, + "learning_rate": 0.0007016632296061762, + "loss": 0.0659, + "num_input_tokens_seen": 114059824, + "step": 52850 + }, + { + "epoch": 8.622349102773246, + "grad_norm": 0.005757322069257498, + "learning_rate": 0.0007015980940280458, + "loss": 0.0824, + "num_input_tokens_seen": 114070960, + "step": 52855 + }, + { + "epoch": 8.623164763458401, + "grad_norm": 0.022166816517710686, + "learning_rate": 0.0007015329543642741, + "loss": 0.0214, + "num_input_tokens_seen": 114082224, + "step": 52860 + }, + { + "epoch": 8.623980424143557, + "grad_norm": 0.015218913555145264, + "learning_rate": 0.0007014678106161814, + "loss": 0.0063, + "num_input_tokens_seen": 114091760, + "step": 52865 + }, + { + "epoch": 8.624796084828711, + "grad_norm": 0.17290104925632477, + "learning_rate": 0.000701402662785088, + "loss": 0.0913, + "num_input_tokens_seen": 114100912, + "step": 52870 + }, + { + "epoch": 8.625611745513867, + "grad_norm": 0.21617266535758972, + "learning_rate": 0.0007013375108723141, + "loss": 0.0567, + "num_input_tokens_seen": 114111856, + "step": 52875 + }, + { + "epoch": 8.62642740619902, + "grad_norm": 0.023880112916231155, + "learning_rate": 0.0007012723548791802, + "loss": 0.0754, + "num_input_tokens_seen": 114121872, + "step": 52880 + }, + { + "epoch": 8.627243066884176, + "grad_norm": 0.03792709857225418, + "learning_rate": 0.0007012071948070065, + "loss": 0.0878, + "num_input_tokens_seen": 114132304, + "step": 52885 + }, + { + "epoch": 8.62805872756933, + "grad_norm": 0.03756793215870857, + "learning_rate": 0.0007011420306571139, + "loss": 0.0899, + "num_input_tokens_seen": 114142160, + "step": 52890 + }, + { + "epoch": 8.628874388254486, + "grad_norm": 0.01017955131828785, + "learning_rate": 0.0007010768624308228, + "loss": 0.114, + "num_input_tokens_seen": 114153392, + "step": 52895 + }, + { + "epoch": 8.629690048939642, + "grad_norm": 0.045971065759658813, + "learning_rate": 0.0007010116901294541, + "loss": 0.0277, + "num_input_tokens_seen": 114165648, + "step": 52900 + }, + { + "epoch": 8.630505709624796, + "grad_norm": 0.05028890073299408, + "learning_rate": 0.0007009465137543285, + "loss": 0.0192, + "num_input_tokens_seen": 114175248, + "step": 52905 + }, + { + "epoch": 8.631321370309951, + "grad_norm": 0.807207465171814, + "learning_rate": 0.0007008813333067668, + "loss": 0.1668, + "num_input_tokens_seen": 114186256, + "step": 52910 + }, + { + "epoch": 8.632137030995105, + "grad_norm": 0.08155766874551773, + "learning_rate": 0.00070081614878809, + "loss": 0.0207, + "num_input_tokens_seen": 114197264, + "step": 52915 + }, + { + "epoch": 8.632952691680261, + "grad_norm": 0.19397865235805511, + "learning_rate": 0.0007007509601996193, + "loss": 0.018, + "num_input_tokens_seen": 114208944, + "step": 52920 + }, + { + "epoch": 8.633768352365417, + "grad_norm": 0.006410667207092047, + "learning_rate": 0.0007006857675426757, + "loss": 0.0178, + "num_input_tokens_seen": 114219504, + "step": 52925 + }, + { + "epoch": 8.63458401305057, + "grad_norm": 0.03020365722477436, + "learning_rate": 0.0007006205708185804, + "loss": 0.0248, + "num_input_tokens_seen": 114229072, + "step": 52930 + }, + { + "epoch": 8.635399673735726, + "grad_norm": 0.07201595604419708, + "learning_rate": 0.0007005553700286549, + "loss": 0.0154, + "num_input_tokens_seen": 114238768, + "step": 52935 + }, + { + "epoch": 8.63621533442088, + "grad_norm": 0.008957172743976116, + "learning_rate": 0.0007004901651742201, + "loss": 0.005, + "num_input_tokens_seen": 114250256, + "step": 52940 + }, + { + "epoch": 8.637030995106036, + "grad_norm": 0.19040963053703308, + "learning_rate": 0.000700424956256598, + "loss": 0.0969, + "num_input_tokens_seen": 114259696, + "step": 52945 + }, + { + "epoch": 8.63784665579119, + "grad_norm": 0.003955368418246508, + "learning_rate": 0.0007003597432771098, + "loss": 0.079, + "num_input_tokens_seen": 114270288, + "step": 52950 + }, + { + "epoch": 8.638662316476346, + "grad_norm": 0.006187653634697199, + "learning_rate": 0.0007002945262370773, + "loss": 0.0078, + "num_input_tokens_seen": 114280784, + "step": 52955 + }, + { + "epoch": 8.639477977161501, + "grad_norm": 0.05450412631034851, + "learning_rate": 0.0007002293051378221, + "loss": 0.0525, + "num_input_tokens_seen": 114290832, + "step": 52960 + }, + { + "epoch": 8.640293637846655, + "grad_norm": 0.2739194631576538, + "learning_rate": 0.0007001640799806662, + "loss": 0.0453, + "num_input_tokens_seen": 114301648, + "step": 52965 + }, + { + "epoch": 8.641109298531811, + "grad_norm": 0.0019135025795549154, + "learning_rate": 0.000700098850766931, + "loss": 0.0683, + "num_input_tokens_seen": 114311728, + "step": 52970 + }, + { + "epoch": 8.641924959216965, + "grad_norm": 0.05849083513021469, + "learning_rate": 0.0007000336174979389, + "loss": 0.0333, + "num_input_tokens_seen": 114322512, + "step": 52975 + }, + { + "epoch": 8.64274061990212, + "grad_norm": 0.015832483768463135, + "learning_rate": 0.0006999683801750116, + "loss": 0.0915, + "num_input_tokens_seen": 114332592, + "step": 52980 + }, + { + "epoch": 8.643556280587276, + "grad_norm": 0.2695087194442749, + "learning_rate": 0.0006999031387994717, + "loss": 0.1839, + "num_input_tokens_seen": 114343696, + "step": 52985 + }, + { + "epoch": 8.64437194127243, + "grad_norm": 0.49911022186279297, + "learning_rate": 0.0006998378933726408, + "loss": 0.0876, + "num_input_tokens_seen": 114354928, + "step": 52990 + }, + { + "epoch": 8.645187601957586, + "grad_norm": 0.22719460725784302, + "learning_rate": 0.0006997726438958417, + "loss": 0.085, + "num_input_tokens_seen": 114365392, + "step": 52995 + }, + { + "epoch": 8.64600326264274, + "grad_norm": 0.014932883903384209, + "learning_rate": 0.0006997073903703964, + "loss": 0.0095, + "num_input_tokens_seen": 114374992, + "step": 53000 + }, + { + "epoch": 8.646818923327896, + "grad_norm": 0.0018947853241115808, + "learning_rate": 0.0006996421327976276, + "loss": 0.0318, + "num_input_tokens_seen": 114385808, + "step": 53005 + }, + { + "epoch": 8.647634584013051, + "grad_norm": 0.018106942996382713, + "learning_rate": 0.0006995768711788577, + "loss": 0.0365, + "num_input_tokens_seen": 114397072, + "step": 53010 + }, + { + "epoch": 8.648450244698205, + "grad_norm": 0.16452781856060028, + "learning_rate": 0.0006995116055154093, + "loss": 0.0438, + "num_input_tokens_seen": 114408624, + "step": 53015 + }, + { + "epoch": 8.649265905383361, + "grad_norm": 0.006706966552883387, + "learning_rate": 0.000699446335808605, + "loss": 0.1464, + "num_input_tokens_seen": 114418512, + "step": 53020 + }, + { + "epoch": 8.650081566068515, + "grad_norm": 0.08490052819252014, + "learning_rate": 0.0006993810620597677, + "loss": 0.0191, + "num_input_tokens_seen": 114427792, + "step": 53025 + }, + { + "epoch": 8.65089722675367, + "grad_norm": 0.010832375846803188, + "learning_rate": 0.0006993157842702203, + "loss": 0.1713, + "num_input_tokens_seen": 114439888, + "step": 53030 + }, + { + "epoch": 8.651712887438826, + "grad_norm": 0.016182266175746918, + "learning_rate": 0.0006992505024412858, + "loss": 0.095, + "num_input_tokens_seen": 114451280, + "step": 53035 + }, + { + "epoch": 8.65252854812398, + "grad_norm": 0.0029650728683918715, + "learning_rate": 0.000699185216574287, + "loss": 0.0478, + "num_input_tokens_seen": 114463376, + "step": 53040 + }, + { + "epoch": 8.653344208809136, + "grad_norm": 0.004497068468481302, + "learning_rate": 0.0006991199266705472, + "loss": 0.0132, + "num_input_tokens_seen": 114474896, + "step": 53045 + }, + { + "epoch": 8.65415986949429, + "grad_norm": 0.011121721938252449, + "learning_rate": 0.0006990546327313894, + "loss": 0.1278, + "num_input_tokens_seen": 114486000, + "step": 53050 + }, + { + "epoch": 8.654975530179446, + "grad_norm": 0.006438797805458307, + "learning_rate": 0.0006989893347581368, + "loss": 0.0378, + "num_input_tokens_seen": 114497456, + "step": 53055 + }, + { + "epoch": 8.655791190864601, + "grad_norm": 0.017818845808506012, + "learning_rate": 0.000698924032752113, + "loss": 0.0092, + "num_input_tokens_seen": 114508368, + "step": 53060 + }, + { + "epoch": 8.656606851549755, + "grad_norm": 0.20172560214996338, + "learning_rate": 0.0006988587267146414, + "loss": 0.2195, + "num_input_tokens_seen": 114519888, + "step": 53065 + }, + { + "epoch": 8.65742251223491, + "grad_norm": 0.028519172221422195, + "learning_rate": 0.0006987934166470454, + "loss": 0.0379, + "num_input_tokens_seen": 114529712, + "step": 53070 + }, + { + "epoch": 8.658238172920065, + "grad_norm": 0.029973309487104416, + "learning_rate": 0.0006987281025506487, + "loss": 0.0382, + "num_input_tokens_seen": 114540528, + "step": 53075 + }, + { + "epoch": 8.65905383360522, + "grad_norm": 0.1124824658036232, + "learning_rate": 0.0006986627844267748, + "loss": 0.1209, + "num_input_tokens_seen": 114552016, + "step": 53080 + }, + { + "epoch": 8.659869494290374, + "grad_norm": 0.2679826617240906, + "learning_rate": 0.0006985974622767475, + "loss": 0.0382, + "num_input_tokens_seen": 114562672, + "step": 53085 + }, + { + "epoch": 8.66068515497553, + "grad_norm": 0.14035120606422424, + "learning_rate": 0.0006985321361018908, + "loss": 0.0366, + "num_input_tokens_seen": 114574320, + "step": 53090 + }, + { + "epoch": 8.661500815660686, + "grad_norm": 0.007088634185492992, + "learning_rate": 0.0006984668059035284, + "loss": 0.1166, + "num_input_tokens_seen": 114585456, + "step": 53095 + }, + { + "epoch": 8.66231647634584, + "grad_norm": 0.0017905826680362225, + "learning_rate": 0.0006984014716829845, + "loss": 0.0701, + "num_input_tokens_seen": 114596400, + "step": 53100 + }, + { + "epoch": 8.663132137030995, + "grad_norm": 0.13059978187084198, + "learning_rate": 0.0006983361334415831, + "loss": 0.0299, + "num_input_tokens_seen": 114607120, + "step": 53105 + }, + { + "epoch": 8.66394779771615, + "grad_norm": 0.18111494183540344, + "learning_rate": 0.0006982707911806483, + "loss": 0.0373, + "num_input_tokens_seen": 114617808, + "step": 53110 + }, + { + "epoch": 8.664763458401305, + "grad_norm": 0.042073171585798264, + "learning_rate": 0.0006982054449015044, + "loss": 0.0176, + "num_input_tokens_seen": 114627664, + "step": 53115 + }, + { + "epoch": 8.66557911908646, + "grad_norm": 0.00422188313677907, + "learning_rate": 0.0006981400946054758, + "loss": 0.0336, + "num_input_tokens_seen": 114638064, + "step": 53120 + }, + { + "epoch": 8.666394779771615, + "grad_norm": 0.003058836329728365, + "learning_rate": 0.0006980747402938868, + "loss": 0.1213, + "num_input_tokens_seen": 114647760, + "step": 53125 + }, + { + "epoch": 8.66721044045677, + "grad_norm": 0.20592093467712402, + "learning_rate": 0.0006980093819680616, + "loss": 0.1139, + "num_input_tokens_seen": 114658192, + "step": 53130 + }, + { + "epoch": 8.668026101141924, + "grad_norm": 0.06082770600914955, + "learning_rate": 0.0006979440196293254, + "loss": 0.0297, + "num_input_tokens_seen": 114669392, + "step": 53135 + }, + { + "epoch": 8.66884176182708, + "grad_norm": 0.2859734892845154, + "learning_rate": 0.0006978786532790025, + "loss": 0.1236, + "num_input_tokens_seen": 114681168, + "step": 53140 + }, + { + "epoch": 8.669657422512234, + "grad_norm": 0.12199273705482483, + "learning_rate": 0.0006978132829184176, + "loss": 0.0473, + "num_input_tokens_seen": 114692688, + "step": 53145 + }, + { + "epoch": 8.67047308319739, + "grad_norm": 0.07427959889173508, + "learning_rate": 0.0006977479085488956, + "loss": 0.0103, + "num_input_tokens_seen": 114703472, + "step": 53150 + }, + { + "epoch": 8.671288743882545, + "grad_norm": 0.007702711503952742, + "learning_rate": 0.0006976825301717615, + "loss": 0.1669, + "num_input_tokens_seen": 114714704, + "step": 53155 + }, + { + "epoch": 8.6721044045677, + "grad_norm": 0.02255903370678425, + "learning_rate": 0.0006976171477883399, + "loss": 0.0124, + "num_input_tokens_seen": 114725200, + "step": 53160 + }, + { + "epoch": 8.672920065252855, + "grad_norm": 0.03193705156445503, + "learning_rate": 0.0006975517613999562, + "loss": 0.0121, + "num_input_tokens_seen": 114735568, + "step": 53165 + }, + { + "epoch": 8.673735725938009, + "grad_norm": 0.0054568140767514706, + "learning_rate": 0.0006974863710079355, + "loss": 0.0156, + "num_input_tokens_seen": 114746448, + "step": 53170 + }, + { + "epoch": 8.674551386623165, + "grad_norm": 0.043369051069021225, + "learning_rate": 0.0006974209766136031, + "loss": 0.0202, + "num_input_tokens_seen": 114758704, + "step": 53175 + }, + { + "epoch": 8.67536704730832, + "grad_norm": 0.07627914845943451, + "learning_rate": 0.0006973555782182839, + "loss": 0.0414, + "num_input_tokens_seen": 114769744, + "step": 53180 + }, + { + "epoch": 8.676182707993474, + "grad_norm": 0.111325204372406, + "learning_rate": 0.0006972901758233037, + "loss": 0.024, + "num_input_tokens_seen": 114780112, + "step": 53185 + }, + { + "epoch": 8.67699836867863, + "grad_norm": 0.05463525652885437, + "learning_rate": 0.0006972247694299877, + "loss": 0.0424, + "num_input_tokens_seen": 114790384, + "step": 53190 + }, + { + "epoch": 8.677814029363784, + "grad_norm": 0.0541340671479702, + "learning_rate": 0.0006971593590396616, + "loss": 0.0105, + "num_input_tokens_seen": 114801168, + "step": 53195 + }, + { + "epoch": 8.67862969004894, + "grad_norm": 0.3505849838256836, + "learning_rate": 0.000697093944653651, + "loss": 0.1286, + "num_input_tokens_seen": 114812560, + "step": 53200 + }, + { + "epoch": 8.679445350734095, + "grad_norm": 0.023387737572193146, + "learning_rate": 0.0006970285262732815, + "loss": 0.0454, + "num_input_tokens_seen": 114823664, + "step": 53205 + }, + { + "epoch": 8.68026101141925, + "grad_norm": 0.22071020305156708, + "learning_rate": 0.000696963103899879, + "loss": 0.0313, + "num_input_tokens_seen": 114835056, + "step": 53210 + }, + { + "epoch": 8.681076672104405, + "grad_norm": 0.00506369024515152, + "learning_rate": 0.0006968976775347694, + "loss": 0.1215, + "num_input_tokens_seen": 114845744, + "step": 53215 + }, + { + "epoch": 8.681892332789559, + "grad_norm": 0.11679688096046448, + "learning_rate": 0.0006968322471792785, + "loss": 0.0656, + "num_input_tokens_seen": 114857072, + "step": 53220 + }, + { + "epoch": 8.682707993474715, + "grad_norm": 0.33241090178489685, + "learning_rate": 0.0006967668128347324, + "loss": 0.0891, + "num_input_tokens_seen": 114868304, + "step": 53225 + }, + { + "epoch": 8.68352365415987, + "grad_norm": 0.033708229660987854, + "learning_rate": 0.0006967013745024573, + "loss": 0.0223, + "num_input_tokens_seen": 114877360, + "step": 53230 + }, + { + "epoch": 8.684339314845024, + "grad_norm": 0.010344784706830978, + "learning_rate": 0.0006966359321837792, + "loss": 0.0348, + "num_input_tokens_seen": 114888496, + "step": 53235 + }, + { + "epoch": 8.68515497553018, + "grad_norm": 0.5648834705352783, + "learning_rate": 0.0006965704858800246, + "loss": 0.0782, + "num_input_tokens_seen": 114900656, + "step": 53240 + }, + { + "epoch": 8.685970636215334, + "grad_norm": 0.013573498465120792, + "learning_rate": 0.0006965050355925197, + "loss": 0.0194, + "num_input_tokens_seen": 114912208, + "step": 53245 + }, + { + "epoch": 8.68678629690049, + "grad_norm": 0.2690918445587158, + "learning_rate": 0.000696439581322591, + "loss": 0.0651, + "num_input_tokens_seen": 114924144, + "step": 53250 + }, + { + "epoch": 8.687601957585644, + "grad_norm": 0.012531330808997154, + "learning_rate": 0.000696374123071565, + "loss": 0.116, + "num_input_tokens_seen": 114936496, + "step": 53255 + }, + { + "epoch": 8.6884176182708, + "grad_norm": 0.03884517401456833, + "learning_rate": 0.0006963086608407683, + "loss": 0.2182, + "num_input_tokens_seen": 114946928, + "step": 53260 + }, + { + "epoch": 8.689233278955955, + "grad_norm": 0.023427551612257957, + "learning_rate": 0.0006962431946315274, + "loss": 0.0712, + "num_input_tokens_seen": 114957392, + "step": 53265 + }, + { + "epoch": 8.690048939641109, + "grad_norm": 0.07985246181488037, + "learning_rate": 0.0006961777244451694, + "loss": 0.058, + "num_input_tokens_seen": 114968816, + "step": 53270 + }, + { + "epoch": 8.690864600326265, + "grad_norm": 0.3203592002391815, + "learning_rate": 0.0006961122502830208, + "loss": 0.1109, + "num_input_tokens_seen": 114979408, + "step": 53275 + }, + { + "epoch": 8.691680261011419, + "grad_norm": 0.20674173533916473, + "learning_rate": 0.0006960467721464086, + "loss": 0.0999, + "num_input_tokens_seen": 114989776, + "step": 53280 + }, + { + "epoch": 8.692495921696574, + "grad_norm": 0.10323002189397812, + "learning_rate": 0.00069598129003666, + "loss": 0.1159, + "num_input_tokens_seen": 115000976, + "step": 53285 + }, + { + "epoch": 8.69331158238173, + "grad_norm": 0.05111541226506233, + "learning_rate": 0.0006959158039551019, + "loss": 0.0755, + "num_input_tokens_seen": 115010608, + "step": 53290 + }, + { + "epoch": 8.694127243066884, + "grad_norm": 0.2260194718837738, + "learning_rate": 0.0006958503139030616, + "loss": 0.0861, + "num_input_tokens_seen": 115021904, + "step": 53295 + }, + { + "epoch": 8.69494290375204, + "grad_norm": 0.033067554235458374, + "learning_rate": 0.0006957848198818661, + "loss": 0.0181, + "num_input_tokens_seen": 115032240, + "step": 53300 + }, + { + "epoch": 8.695758564437194, + "grad_norm": 0.02037506364285946, + "learning_rate": 0.0006957193218928429, + "loss": 0.0437, + "num_input_tokens_seen": 115043024, + "step": 53305 + }, + { + "epoch": 8.69657422512235, + "grad_norm": 0.2375122308731079, + "learning_rate": 0.0006956538199373194, + "loss": 0.1758, + "num_input_tokens_seen": 115053584, + "step": 53310 + }, + { + "epoch": 8.697389885807503, + "grad_norm": 0.010943911038339138, + "learning_rate": 0.000695588314016623, + "loss": 0.0837, + "num_input_tokens_seen": 115064368, + "step": 53315 + }, + { + "epoch": 8.698205546492659, + "grad_norm": 0.026623595505952835, + "learning_rate": 0.0006955228041320811, + "loss": 0.0172, + "num_input_tokens_seen": 115073712, + "step": 53320 + }, + { + "epoch": 8.699021207177815, + "grad_norm": 0.010865946300327778, + "learning_rate": 0.0006954572902850218, + "loss": 0.0278, + "num_input_tokens_seen": 115085616, + "step": 53325 + }, + { + "epoch": 8.699836867862969, + "grad_norm": 0.23678839206695557, + "learning_rate": 0.0006953917724767724, + "loss": 0.0548, + "num_input_tokens_seen": 115096080, + "step": 53330 + }, + { + "epoch": 8.700652528548124, + "grad_norm": 0.012293090112507343, + "learning_rate": 0.0006953262507086611, + "loss": 0.0467, + "num_input_tokens_seen": 115107344, + "step": 53335 + }, + { + "epoch": 8.701468189233278, + "grad_norm": 0.06910710781812668, + "learning_rate": 0.0006952607249820153, + "loss": 0.0743, + "num_input_tokens_seen": 115117712, + "step": 53340 + }, + { + "epoch": 8.702283849918434, + "grad_norm": 0.31352344155311584, + "learning_rate": 0.0006951951952981631, + "loss": 0.0816, + "num_input_tokens_seen": 115128048, + "step": 53345 + }, + { + "epoch": 8.70309951060359, + "grad_norm": 0.006178428418934345, + "learning_rate": 0.0006951296616584329, + "loss": 0.0391, + "num_input_tokens_seen": 115139152, + "step": 53350 + }, + { + "epoch": 8.703915171288743, + "grad_norm": 0.024337073788046837, + "learning_rate": 0.0006950641240641524, + "loss": 0.061, + "num_input_tokens_seen": 115150032, + "step": 53355 + }, + { + "epoch": 8.7047308319739, + "grad_norm": 0.0059797996655106544, + "learning_rate": 0.0006949985825166501, + "loss": 0.0137, + "num_input_tokens_seen": 115159536, + "step": 53360 + }, + { + "epoch": 8.705546492659053, + "grad_norm": 0.015264063142240047, + "learning_rate": 0.0006949330370172541, + "loss": 0.0555, + "num_input_tokens_seen": 115170576, + "step": 53365 + }, + { + "epoch": 8.706362153344209, + "grad_norm": 0.006317665800452232, + "learning_rate": 0.0006948674875672927, + "loss": 0.1425, + "num_input_tokens_seen": 115182160, + "step": 53370 + }, + { + "epoch": 8.707177814029365, + "grad_norm": 0.09621299058198929, + "learning_rate": 0.0006948019341680945, + "loss": 0.0291, + "num_input_tokens_seen": 115190960, + "step": 53375 + }, + { + "epoch": 8.707993474714518, + "grad_norm": 0.00856974720954895, + "learning_rate": 0.0006947363768209882, + "loss": 0.0144, + "num_input_tokens_seen": 115201680, + "step": 53380 + }, + { + "epoch": 8.708809135399674, + "grad_norm": 0.20764607191085815, + "learning_rate": 0.000694670815527302, + "loss": 0.0728, + "num_input_tokens_seen": 115212496, + "step": 53385 + }, + { + "epoch": 8.709624796084828, + "grad_norm": 0.2051364928483963, + "learning_rate": 0.0006946052502883648, + "loss": 0.1328, + "num_input_tokens_seen": 115223888, + "step": 53390 + }, + { + "epoch": 8.710440456769984, + "grad_norm": 0.002003439236432314, + "learning_rate": 0.0006945396811055053, + "loss": 0.0981, + "num_input_tokens_seen": 115233616, + "step": 53395 + }, + { + "epoch": 8.71125611745514, + "grad_norm": 0.06496845185756683, + "learning_rate": 0.0006944741079800525, + "loss": 0.0777, + "num_input_tokens_seen": 115244496, + "step": 53400 + }, + { + "epoch": 8.712071778140293, + "grad_norm": 0.3451456129550934, + "learning_rate": 0.000694408530913335, + "loss": 0.121, + "num_input_tokens_seen": 115255216, + "step": 53405 + }, + { + "epoch": 8.71288743882545, + "grad_norm": 0.08347432315349579, + "learning_rate": 0.0006943429499066821, + "loss": 0.0203, + "num_input_tokens_seen": 115264720, + "step": 53410 + }, + { + "epoch": 8.713703099510603, + "grad_norm": 0.004459151532500982, + "learning_rate": 0.0006942773649614228, + "loss": 0.1241, + "num_input_tokens_seen": 115276080, + "step": 53415 + }, + { + "epoch": 8.714518760195759, + "grad_norm": 0.024634376168251038, + "learning_rate": 0.0006942117760788862, + "loss": 0.0692, + "num_input_tokens_seen": 115286576, + "step": 53420 + }, + { + "epoch": 8.715334420880914, + "grad_norm": 0.19588033854961395, + "learning_rate": 0.0006941461832604017, + "loss": 0.1609, + "num_input_tokens_seen": 115297200, + "step": 53425 + }, + { + "epoch": 8.716150081566068, + "grad_norm": 0.7318480610847473, + "learning_rate": 0.0006940805865072984, + "loss": 0.0382, + "num_input_tokens_seen": 115308240, + "step": 53430 + }, + { + "epoch": 8.716965742251224, + "grad_norm": 0.0015319057274609804, + "learning_rate": 0.0006940149858209058, + "loss": 0.029, + "num_input_tokens_seen": 115319600, + "step": 53435 + }, + { + "epoch": 8.717781402936378, + "grad_norm": 0.005445023998618126, + "learning_rate": 0.0006939493812025534, + "loss": 0.05, + "num_input_tokens_seen": 115330736, + "step": 53440 + }, + { + "epoch": 8.718597063621534, + "grad_norm": 0.0018443164881318808, + "learning_rate": 0.0006938837726535707, + "loss": 0.2383, + "num_input_tokens_seen": 115342672, + "step": 53445 + }, + { + "epoch": 8.719412724306688, + "grad_norm": 0.0046096001751720905, + "learning_rate": 0.0006938181601752873, + "loss": 0.1871, + "num_input_tokens_seen": 115354320, + "step": 53450 + }, + { + "epoch": 8.720228384991843, + "grad_norm": 0.2677288055419922, + "learning_rate": 0.0006937525437690332, + "loss": 0.265, + "num_input_tokens_seen": 115364880, + "step": 53455 + }, + { + "epoch": 8.721044045676999, + "grad_norm": 0.03454139456152916, + "learning_rate": 0.0006936869234361379, + "loss": 0.0592, + "num_input_tokens_seen": 115375824, + "step": 53460 + }, + { + "epoch": 8.721859706362153, + "grad_norm": 0.010327072814106941, + "learning_rate": 0.0006936212991779314, + "loss": 0.0821, + "num_input_tokens_seen": 115386992, + "step": 53465 + }, + { + "epoch": 8.722675367047309, + "grad_norm": 0.034920867532491684, + "learning_rate": 0.0006935556709957437, + "loss": 0.0392, + "num_input_tokens_seen": 115397008, + "step": 53470 + }, + { + "epoch": 8.723491027732463, + "grad_norm": 0.013524229638278484, + "learning_rate": 0.0006934900388909048, + "loss": 0.1645, + "num_input_tokens_seen": 115407888, + "step": 53475 + }, + { + "epoch": 8.724306688417618, + "grad_norm": 0.161749005317688, + "learning_rate": 0.0006934244028647447, + "loss": 0.0375, + "num_input_tokens_seen": 115417936, + "step": 53480 + }, + { + "epoch": 8.725122349102774, + "grad_norm": 0.1066933274269104, + "learning_rate": 0.0006933587629185938, + "loss": 0.0574, + "num_input_tokens_seen": 115429200, + "step": 53485 + }, + { + "epoch": 8.725938009787928, + "grad_norm": 0.008239652030169964, + "learning_rate": 0.0006932931190537822, + "loss": 0.1087, + "num_input_tokens_seen": 115439920, + "step": 53490 + }, + { + "epoch": 8.726753670473084, + "grad_norm": 0.10506141185760498, + "learning_rate": 0.0006932274712716405, + "loss": 0.0753, + "num_input_tokens_seen": 115450608, + "step": 53495 + }, + { + "epoch": 8.727569331158238, + "grad_norm": 0.7032706141471863, + "learning_rate": 0.0006931618195734988, + "loss": 0.0911, + "num_input_tokens_seen": 115461680, + "step": 53500 + }, + { + "epoch": 8.728384991843393, + "grad_norm": 0.013541514053940773, + "learning_rate": 0.0006930961639606878, + "loss": 0.0596, + "num_input_tokens_seen": 115472752, + "step": 53505 + }, + { + "epoch": 8.729200652528547, + "grad_norm": 0.05383770540356636, + "learning_rate": 0.0006930305044345381, + "loss": 0.1193, + "num_input_tokens_seen": 115483536, + "step": 53510 + }, + { + "epoch": 8.730016313213703, + "grad_norm": 0.056143928319215775, + "learning_rate": 0.0006929648409963802, + "loss": 0.0427, + "num_input_tokens_seen": 115494736, + "step": 53515 + }, + { + "epoch": 8.730831973898859, + "grad_norm": 0.006205033045262098, + "learning_rate": 0.0006928991736475452, + "loss": 0.072, + "num_input_tokens_seen": 115505552, + "step": 53520 + }, + { + "epoch": 8.731647634584013, + "grad_norm": 0.02357732132077217, + "learning_rate": 0.0006928335023893637, + "loss": 0.1277, + "num_input_tokens_seen": 115515920, + "step": 53525 + }, + { + "epoch": 8.732463295269168, + "grad_norm": 0.06865264475345612, + "learning_rate": 0.0006927678272231667, + "loss": 0.1311, + "num_input_tokens_seen": 115526896, + "step": 53530 + }, + { + "epoch": 8.733278955954322, + "grad_norm": 0.181700199842453, + "learning_rate": 0.0006927021481502851, + "loss": 0.0876, + "num_input_tokens_seen": 115536912, + "step": 53535 + }, + { + "epoch": 8.734094616639478, + "grad_norm": 0.12676461040973663, + "learning_rate": 0.0006926364651720499, + "loss": 0.0712, + "num_input_tokens_seen": 115547728, + "step": 53540 + }, + { + "epoch": 8.734910277324634, + "grad_norm": 0.11688786745071411, + "learning_rate": 0.0006925707782897925, + "loss": 0.045, + "num_input_tokens_seen": 115559184, + "step": 53545 + }, + { + "epoch": 8.735725938009788, + "grad_norm": 0.014065644703805447, + "learning_rate": 0.000692505087504844, + "loss": 0.0896, + "num_input_tokens_seen": 115569552, + "step": 53550 + }, + { + "epoch": 8.736541598694943, + "grad_norm": 0.0854867696762085, + "learning_rate": 0.0006924393928185354, + "loss": 0.0898, + "num_input_tokens_seen": 115578928, + "step": 53555 + }, + { + "epoch": 8.737357259380097, + "grad_norm": 0.07008549571037292, + "learning_rate": 0.0006923736942321987, + "loss": 0.0413, + "num_input_tokens_seen": 115589904, + "step": 53560 + }, + { + "epoch": 8.738172920065253, + "grad_norm": 0.42308706045150757, + "learning_rate": 0.0006923079917471648, + "loss": 0.0534, + "num_input_tokens_seen": 115599440, + "step": 53565 + }, + { + "epoch": 8.738988580750409, + "grad_norm": 0.1544969230890274, + "learning_rate": 0.0006922422853647656, + "loss": 0.072, + "num_input_tokens_seen": 115610064, + "step": 53570 + }, + { + "epoch": 8.739804241435563, + "grad_norm": 0.16088442504405975, + "learning_rate": 0.0006921765750863327, + "loss": 0.1449, + "num_input_tokens_seen": 115621360, + "step": 53575 + }, + { + "epoch": 8.740619902120718, + "grad_norm": 0.07043210417032242, + "learning_rate": 0.0006921108609131976, + "loss": 0.0439, + "num_input_tokens_seen": 115632272, + "step": 53580 + }, + { + "epoch": 8.741435562805872, + "grad_norm": 0.024943260475993156, + "learning_rate": 0.0006920451428466923, + "loss": 0.0153, + "num_input_tokens_seen": 115643280, + "step": 53585 + }, + { + "epoch": 8.742251223491028, + "grad_norm": 0.003549138316884637, + "learning_rate": 0.0006919794208881486, + "loss": 0.0291, + "num_input_tokens_seen": 115654128, + "step": 53590 + }, + { + "epoch": 8.743066884176184, + "grad_norm": 0.009637904353439808, + "learning_rate": 0.0006919136950388982, + "loss": 0.0411, + "num_input_tokens_seen": 115664528, + "step": 53595 + }, + { + "epoch": 8.743882544861338, + "grad_norm": 0.0881049782037735, + "learning_rate": 0.0006918479653002734, + "loss": 0.036, + "num_input_tokens_seen": 115675024, + "step": 53600 + }, + { + "epoch": 8.744698205546493, + "grad_norm": 0.1293942779302597, + "learning_rate": 0.0006917822316736062, + "loss": 0.0438, + "num_input_tokens_seen": 115687056, + "step": 53605 + }, + { + "epoch": 8.745513866231647, + "grad_norm": 0.5021342039108276, + "learning_rate": 0.0006917164941602289, + "loss": 0.137, + "num_input_tokens_seen": 115697840, + "step": 53610 + }, + { + "epoch": 8.746329526916803, + "grad_norm": 0.11944330483675003, + "learning_rate": 0.0006916507527614735, + "loss": 0.0227, + "num_input_tokens_seen": 115708848, + "step": 53615 + }, + { + "epoch": 8.747145187601957, + "grad_norm": 0.07474060356616974, + "learning_rate": 0.0006915850074786725, + "loss": 0.3337, + "num_input_tokens_seen": 115719568, + "step": 53620 + }, + { + "epoch": 8.747960848287113, + "grad_norm": 0.06463427096605301, + "learning_rate": 0.0006915192583131582, + "loss": 0.0782, + "num_input_tokens_seen": 115730768, + "step": 53625 + }, + { + "epoch": 8.748776508972268, + "grad_norm": 0.10298333317041397, + "learning_rate": 0.0006914535052662633, + "loss": 0.0817, + "num_input_tokens_seen": 115741840, + "step": 53630 + }, + { + "epoch": 8.749592169657422, + "grad_norm": 0.012856368906795979, + "learning_rate": 0.0006913877483393202, + "loss": 0.023, + "num_input_tokens_seen": 115752208, + "step": 53635 + }, + { + "epoch": 8.750407830342578, + "grad_norm": 0.0912599042057991, + "learning_rate": 0.0006913219875336616, + "loss": 0.0199, + "num_input_tokens_seen": 115762992, + "step": 53640 + }, + { + "epoch": 8.751223491027732, + "grad_norm": 0.007884868420660496, + "learning_rate": 0.0006912562228506201, + "loss": 0.0309, + "num_input_tokens_seen": 115774224, + "step": 53645 + }, + { + "epoch": 8.752039151712887, + "grad_norm": 0.21219564974308014, + "learning_rate": 0.0006911904542915288, + "loss": 0.0242, + "num_input_tokens_seen": 115785776, + "step": 53650 + }, + { + "epoch": 8.752854812398043, + "grad_norm": 0.36498531699180603, + "learning_rate": 0.0006911246818577201, + "loss": 0.1357, + "num_input_tokens_seen": 115796656, + "step": 53655 + }, + { + "epoch": 8.753670473083197, + "grad_norm": 0.18402035534381866, + "learning_rate": 0.0006910589055505275, + "loss": 0.0238, + "num_input_tokens_seen": 115808272, + "step": 53660 + }, + { + "epoch": 8.754486133768353, + "grad_norm": 0.3463498651981354, + "learning_rate": 0.0006909931253712838, + "loss": 0.1204, + "num_input_tokens_seen": 115820144, + "step": 53665 + }, + { + "epoch": 8.755301794453507, + "grad_norm": 0.02510599046945572, + "learning_rate": 0.0006909273413213222, + "loss": 0.1435, + "num_input_tokens_seen": 115831888, + "step": 53670 + }, + { + "epoch": 8.756117455138662, + "grad_norm": 0.16178558766841888, + "learning_rate": 0.0006908615534019757, + "loss": 0.0546, + "num_input_tokens_seen": 115843024, + "step": 53675 + }, + { + "epoch": 8.756933115823816, + "grad_norm": 0.16752606630325317, + "learning_rate": 0.0006907957616145777, + "loss": 0.094, + "num_input_tokens_seen": 115854288, + "step": 53680 + }, + { + "epoch": 8.757748776508972, + "grad_norm": 0.08321017026901245, + "learning_rate": 0.0006907299659604613, + "loss": 0.0866, + "num_input_tokens_seen": 115865296, + "step": 53685 + }, + { + "epoch": 8.758564437194128, + "grad_norm": 0.1414279043674469, + "learning_rate": 0.0006906641664409605, + "loss": 0.0871, + "num_input_tokens_seen": 115876304, + "step": 53690 + }, + { + "epoch": 8.759380097879282, + "grad_norm": 0.29733142256736755, + "learning_rate": 0.0006905983630574084, + "loss": 0.1191, + "num_input_tokens_seen": 115887632, + "step": 53695 + }, + { + "epoch": 8.760195758564437, + "grad_norm": 0.15447205305099487, + "learning_rate": 0.0006905325558111389, + "loss": 0.0563, + "num_input_tokens_seen": 115898640, + "step": 53700 + }, + { + "epoch": 8.761011419249591, + "grad_norm": 0.019435331225395203, + "learning_rate": 0.0006904667447034851, + "loss": 0.0522, + "num_input_tokens_seen": 115909392, + "step": 53705 + }, + { + "epoch": 8.761827079934747, + "grad_norm": 0.005656312219798565, + "learning_rate": 0.0006904009297357814, + "loss": 0.0131, + "num_input_tokens_seen": 115920080, + "step": 53710 + }, + { + "epoch": 8.762642740619903, + "grad_norm": 0.045238934457302094, + "learning_rate": 0.000690335110909361, + "loss": 0.0419, + "num_input_tokens_seen": 115930640, + "step": 53715 + }, + { + "epoch": 8.763458401305057, + "grad_norm": 0.014850836247205734, + "learning_rate": 0.0006902692882255583, + "loss": 0.0381, + "num_input_tokens_seen": 115941392, + "step": 53720 + }, + { + "epoch": 8.764274061990212, + "grad_norm": 0.045302022248506546, + "learning_rate": 0.0006902034616857073, + "loss": 0.0799, + "num_input_tokens_seen": 115952176, + "step": 53725 + }, + { + "epoch": 8.765089722675366, + "grad_norm": 0.13803859055042267, + "learning_rate": 0.0006901376312911416, + "loss": 0.0313, + "num_input_tokens_seen": 115962896, + "step": 53730 + }, + { + "epoch": 8.765905383360522, + "grad_norm": 0.22126765549182892, + "learning_rate": 0.0006900717970431956, + "loss": 0.1219, + "num_input_tokens_seen": 115972944, + "step": 53735 + }, + { + "epoch": 8.766721044045678, + "grad_norm": 0.02448454312980175, + "learning_rate": 0.0006900059589432036, + "loss": 0.0507, + "num_input_tokens_seen": 115983600, + "step": 53740 + }, + { + "epoch": 8.767536704730832, + "grad_norm": 0.01081529725342989, + "learning_rate": 0.0006899401169924997, + "loss": 0.1322, + "num_input_tokens_seen": 115994832, + "step": 53745 + }, + { + "epoch": 8.768352365415987, + "grad_norm": 0.004617114085704088, + "learning_rate": 0.0006898742711924185, + "loss": 0.0204, + "num_input_tokens_seen": 116004848, + "step": 53750 + }, + { + "epoch": 8.769168026101141, + "grad_norm": 0.1322912722826004, + "learning_rate": 0.0006898084215442942, + "loss": 0.0549, + "num_input_tokens_seen": 116016016, + "step": 53755 + }, + { + "epoch": 8.769983686786297, + "grad_norm": 0.06243688985705376, + "learning_rate": 0.0006897425680494616, + "loss": 0.0625, + "num_input_tokens_seen": 116027088, + "step": 53760 + }, + { + "epoch": 8.770799347471453, + "grad_norm": 0.029443373903632164, + "learning_rate": 0.000689676710709255, + "loss": 0.1282, + "num_input_tokens_seen": 116038192, + "step": 53765 + }, + { + "epoch": 8.771615008156607, + "grad_norm": 0.0528830848634243, + "learning_rate": 0.0006896108495250092, + "loss": 0.1482, + "num_input_tokens_seen": 116048272, + "step": 53770 + }, + { + "epoch": 8.772430668841762, + "grad_norm": 0.006356436293572187, + "learning_rate": 0.0006895449844980592, + "loss": 0.0187, + "num_input_tokens_seen": 116060112, + "step": 53775 + }, + { + "epoch": 8.773246329526916, + "grad_norm": 0.02085634507238865, + "learning_rate": 0.0006894791156297394, + "loss": 0.079, + "num_input_tokens_seen": 116071216, + "step": 53780 + }, + { + "epoch": 8.774061990212072, + "grad_norm": 0.16036434471607208, + "learning_rate": 0.0006894132429213851, + "loss": 0.0362, + "num_input_tokens_seen": 116081360, + "step": 53785 + }, + { + "epoch": 8.774877650897226, + "grad_norm": 0.07567689567804337, + "learning_rate": 0.0006893473663743311, + "loss": 0.0634, + "num_input_tokens_seen": 116091216, + "step": 53790 + }, + { + "epoch": 8.775693311582382, + "grad_norm": 0.007545057218521833, + "learning_rate": 0.0006892814859899126, + "loss": 0.0141, + "num_input_tokens_seen": 116101168, + "step": 53795 + }, + { + "epoch": 8.776508972267537, + "grad_norm": 0.04817535728216171, + "learning_rate": 0.0006892156017694646, + "loss": 0.0143, + "num_input_tokens_seen": 116112272, + "step": 53800 + }, + { + "epoch": 8.777324632952691, + "grad_norm": 0.18284785747528076, + "learning_rate": 0.0006891497137143224, + "loss": 0.1719, + "num_input_tokens_seen": 116123184, + "step": 53805 + }, + { + "epoch": 8.778140293637847, + "grad_norm": 0.19335748255252838, + "learning_rate": 0.0006890838218258213, + "loss": 0.0255, + "num_input_tokens_seen": 116133776, + "step": 53810 + }, + { + "epoch": 8.778955954323001, + "grad_norm": 0.2204427272081375, + "learning_rate": 0.0006890179261052967, + "loss": 0.0491, + "num_input_tokens_seen": 116144688, + "step": 53815 + }, + { + "epoch": 8.779771615008157, + "grad_norm": 0.007126415614038706, + "learning_rate": 0.000688952026554084, + "loss": 0.034, + "num_input_tokens_seen": 116154288, + "step": 53820 + }, + { + "epoch": 8.780587275693312, + "grad_norm": 0.060012370347976685, + "learning_rate": 0.0006888861231735186, + "loss": 0.0148, + "num_input_tokens_seen": 116165584, + "step": 53825 + }, + { + "epoch": 8.781402936378466, + "grad_norm": 0.011833728291094303, + "learning_rate": 0.0006888202159649366, + "loss": 0.0254, + "num_input_tokens_seen": 116176848, + "step": 53830 + }, + { + "epoch": 8.782218597063622, + "grad_norm": 0.017359845340251923, + "learning_rate": 0.0006887543049296733, + "loss": 0.0911, + "num_input_tokens_seen": 116186800, + "step": 53835 + }, + { + "epoch": 8.783034257748776, + "grad_norm": 0.04716571047902107, + "learning_rate": 0.0006886883900690645, + "loss": 0.0267, + "num_input_tokens_seen": 116196656, + "step": 53840 + }, + { + "epoch": 8.783849918433932, + "grad_norm": 0.2636497914791107, + "learning_rate": 0.0006886224713844461, + "loss": 0.0503, + "num_input_tokens_seen": 116207760, + "step": 53845 + }, + { + "epoch": 8.784665579119086, + "grad_norm": 0.25649169087409973, + "learning_rate": 0.0006885565488771541, + "loss": 0.0595, + "num_input_tokens_seen": 116216752, + "step": 53850 + }, + { + "epoch": 8.785481239804241, + "grad_norm": 0.39016231894493103, + "learning_rate": 0.0006884906225485245, + "loss": 0.2034, + "num_input_tokens_seen": 116227568, + "step": 53855 + }, + { + "epoch": 8.786296900489397, + "grad_norm": 0.18375276029109955, + "learning_rate": 0.0006884246923998932, + "loss": 0.0274, + "num_input_tokens_seen": 116238736, + "step": 53860 + }, + { + "epoch": 8.78711256117455, + "grad_norm": 0.3292846381664276, + "learning_rate": 0.0006883587584325965, + "loss": 0.2538, + "num_input_tokens_seen": 116249584, + "step": 53865 + }, + { + "epoch": 8.787928221859707, + "grad_norm": 0.0047867149114608765, + "learning_rate": 0.0006882928206479707, + "loss": 0.0708, + "num_input_tokens_seen": 116259664, + "step": 53870 + }, + { + "epoch": 8.78874388254486, + "grad_norm": 0.09740550071001053, + "learning_rate": 0.0006882268790473517, + "loss": 0.0239, + "num_input_tokens_seen": 116270352, + "step": 53875 + }, + { + "epoch": 8.789559543230016, + "grad_norm": 0.1317739337682724, + "learning_rate": 0.0006881609336320764, + "loss": 0.0372, + "num_input_tokens_seen": 116281232, + "step": 53880 + }, + { + "epoch": 8.790375203915172, + "grad_norm": 0.020547520369291306, + "learning_rate": 0.0006880949844034811, + "loss": 0.0145, + "num_input_tokens_seen": 116291728, + "step": 53885 + }, + { + "epoch": 8.791190864600326, + "grad_norm": 0.036667659878730774, + "learning_rate": 0.0006880290313629026, + "loss": 0.1127, + "num_input_tokens_seen": 116302480, + "step": 53890 + }, + { + "epoch": 8.792006525285482, + "grad_norm": 0.1895463913679123, + "learning_rate": 0.0006879630745116769, + "loss": 0.1302, + "num_input_tokens_seen": 116314192, + "step": 53895 + }, + { + "epoch": 8.792822185970635, + "grad_norm": 0.0019925639498978853, + "learning_rate": 0.0006878971138511412, + "loss": 0.0484, + "num_input_tokens_seen": 116325616, + "step": 53900 + }, + { + "epoch": 8.793637846655791, + "grad_norm": 0.003933362662792206, + "learning_rate": 0.000687831149382632, + "loss": 0.0057, + "num_input_tokens_seen": 116335344, + "step": 53905 + }, + { + "epoch": 8.794453507340947, + "grad_norm": 0.029450388625264168, + "learning_rate": 0.0006877651811074863, + "loss": 0.0537, + "num_input_tokens_seen": 116346480, + "step": 53910 + }, + { + "epoch": 8.7952691680261, + "grad_norm": 0.0502365417778492, + "learning_rate": 0.0006876992090270411, + "loss": 0.1416, + "num_input_tokens_seen": 116358576, + "step": 53915 + }, + { + "epoch": 8.796084828711257, + "grad_norm": 0.05891922861337662, + "learning_rate": 0.0006876332331426332, + "loss": 0.069, + "num_input_tokens_seen": 116369872, + "step": 53920 + }, + { + "epoch": 8.79690048939641, + "grad_norm": 0.0057267383672297, + "learning_rate": 0.0006875672534556, + "loss": 0.0324, + "num_input_tokens_seen": 116380432, + "step": 53925 + }, + { + "epoch": 8.797716150081566, + "grad_norm": 0.08551560342311859, + "learning_rate": 0.0006875012699672783, + "loss": 0.043, + "num_input_tokens_seen": 116390544, + "step": 53930 + }, + { + "epoch": 8.798531810766722, + "grad_norm": 0.05100501328706741, + "learning_rate": 0.0006874352826790055, + "loss": 0.0604, + "num_input_tokens_seen": 116402512, + "step": 53935 + }, + { + "epoch": 8.799347471451876, + "grad_norm": 0.02536788396537304, + "learning_rate": 0.000687369291592119, + "loss": 0.0947, + "num_input_tokens_seen": 116413488, + "step": 53940 + }, + { + "epoch": 8.800163132137031, + "grad_norm": 0.03524312749505043, + "learning_rate": 0.0006873032967079561, + "loss": 0.0223, + "num_input_tokens_seen": 116424976, + "step": 53945 + }, + { + "epoch": 8.800978792822185, + "grad_norm": 0.031146781519055367, + "learning_rate": 0.0006872372980278543, + "loss": 0.1485, + "num_input_tokens_seen": 116435856, + "step": 53950 + }, + { + "epoch": 8.801794453507341, + "grad_norm": 0.08017542958259583, + "learning_rate": 0.0006871712955531511, + "loss": 0.168, + "num_input_tokens_seen": 116447472, + "step": 53955 + }, + { + "epoch": 8.802610114192497, + "grad_norm": 0.012428002431988716, + "learning_rate": 0.0006871052892851842, + "loss": 0.0902, + "num_input_tokens_seen": 116458064, + "step": 53960 + }, + { + "epoch": 8.80342577487765, + "grad_norm": 0.05366304889321327, + "learning_rate": 0.0006870392792252911, + "loss": 0.0215, + "num_input_tokens_seen": 116468464, + "step": 53965 + }, + { + "epoch": 8.804241435562806, + "grad_norm": 0.0027092797681689262, + "learning_rate": 0.0006869732653748096, + "loss": 0.1173, + "num_input_tokens_seen": 116477488, + "step": 53970 + }, + { + "epoch": 8.80505709624796, + "grad_norm": 0.017272505909204483, + "learning_rate": 0.000686907247735078, + "loss": 0.1061, + "num_input_tokens_seen": 116488368, + "step": 53975 + }, + { + "epoch": 8.805872756933116, + "grad_norm": 0.10456523299217224, + "learning_rate": 0.0006868412263074337, + "loss": 0.0452, + "num_input_tokens_seen": 116498704, + "step": 53980 + }, + { + "epoch": 8.80668841761827, + "grad_norm": 0.058107808232307434, + "learning_rate": 0.0006867752010932151, + "loss": 0.056, + "num_input_tokens_seen": 116510000, + "step": 53985 + }, + { + "epoch": 8.807504078303426, + "grad_norm": 0.0069722444750368595, + "learning_rate": 0.00068670917209376, + "loss": 0.0537, + "num_input_tokens_seen": 116521552, + "step": 53990 + }, + { + "epoch": 8.808319738988581, + "grad_norm": 0.06479636579751968, + "learning_rate": 0.0006866431393104067, + "loss": 0.0375, + "num_input_tokens_seen": 116532208, + "step": 53995 + }, + { + "epoch": 8.809135399673735, + "grad_norm": 0.007867210544645786, + "learning_rate": 0.0006865771027444933, + "loss": 0.0222, + "num_input_tokens_seen": 116542608, + "step": 54000 + }, + { + "epoch": 8.809951060358891, + "grad_norm": 0.019281016662716866, + "learning_rate": 0.0006865110623973585, + "loss": 0.1964, + "num_input_tokens_seen": 116552656, + "step": 54005 + }, + { + "epoch": 8.810766721044045, + "grad_norm": 0.006513939704746008, + "learning_rate": 0.0006864450182703403, + "loss": 0.0303, + "num_input_tokens_seen": 116565008, + "step": 54010 + }, + { + "epoch": 8.8115823817292, + "grad_norm": 0.01459853257983923, + "learning_rate": 0.0006863789703647771, + "loss": 0.049, + "num_input_tokens_seen": 116574576, + "step": 54015 + }, + { + "epoch": 8.812398042414356, + "grad_norm": 0.004883582703769207, + "learning_rate": 0.0006863129186820079, + "loss": 0.0147, + "num_input_tokens_seen": 116584944, + "step": 54020 + }, + { + "epoch": 8.81321370309951, + "grad_norm": 0.008797681890428066, + "learning_rate": 0.0006862468632233709, + "loss": 0.0255, + "num_input_tokens_seen": 116595920, + "step": 54025 + }, + { + "epoch": 8.814029363784666, + "grad_norm": 0.01578511670231819, + "learning_rate": 0.000686180803990205, + "loss": 0.016, + "num_input_tokens_seen": 116606704, + "step": 54030 + }, + { + "epoch": 8.81484502446982, + "grad_norm": 0.005732331424951553, + "learning_rate": 0.0006861147409838489, + "loss": 0.0347, + "num_input_tokens_seen": 116618032, + "step": 54035 + }, + { + "epoch": 8.815660685154976, + "grad_norm": 0.06826536357402802, + "learning_rate": 0.0006860486742056415, + "loss": 0.0337, + "num_input_tokens_seen": 116628464, + "step": 54040 + }, + { + "epoch": 8.81647634584013, + "grad_norm": 0.01054135337471962, + "learning_rate": 0.0006859826036569216, + "loss": 0.0447, + "num_input_tokens_seen": 116639408, + "step": 54045 + }, + { + "epoch": 8.817292006525285, + "grad_norm": 0.006543059833347797, + "learning_rate": 0.0006859165293390284, + "loss": 0.1498, + "num_input_tokens_seen": 116649808, + "step": 54050 + }, + { + "epoch": 8.818107667210441, + "grad_norm": 0.046262387186288834, + "learning_rate": 0.0006858504512533008, + "loss": 0.0603, + "num_input_tokens_seen": 116661584, + "step": 54055 + }, + { + "epoch": 8.818923327895595, + "grad_norm": 0.010653486475348473, + "learning_rate": 0.000685784369401078, + "loss": 0.0134, + "num_input_tokens_seen": 116671664, + "step": 54060 + }, + { + "epoch": 8.81973898858075, + "grad_norm": 0.12814287841320038, + "learning_rate": 0.0006857182837836994, + "loss": 0.0284, + "num_input_tokens_seen": 116683248, + "step": 54065 + }, + { + "epoch": 8.820554649265905, + "grad_norm": 0.216208815574646, + "learning_rate": 0.0006856521944025041, + "loss": 0.0727, + "num_input_tokens_seen": 116693936, + "step": 54070 + }, + { + "epoch": 8.82137030995106, + "grad_norm": 0.002256500069051981, + "learning_rate": 0.0006855861012588316, + "loss": 0.0265, + "num_input_tokens_seen": 116703856, + "step": 54075 + }, + { + "epoch": 8.822185970636216, + "grad_norm": 0.0023665237240493298, + "learning_rate": 0.0006855200043540213, + "loss": 0.0091, + "num_input_tokens_seen": 116713136, + "step": 54080 + }, + { + "epoch": 8.82300163132137, + "grad_norm": 0.0030063046142458916, + "learning_rate": 0.0006854539036894128, + "loss": 0.006, + "num_input_tokens_seen": 116723696, + "step": 54085 + }, + { + "epoch": 8.823817292006526, + "grad_norm": 0.06086798757314682, + "learning_rate": 0.0006853877992663456, + "loss": 0.0248, + "num_input_tokens_seen": 116735280, + "step": 54090 + }, + { + "epoch": 8.82463295269168, + "grad_norm": 0.2554158568382263, + "learning_rate": 0.0006853216910861595, + "loss": 0.1712, + "num_input_tokens_seen": 116746416, + "step": 54095 + }, + { + "epoch": 8.825448613376835, + "grad_norm": 0.013422626070678234, + "learning_rate": 0.0006852555791501942, + "loss": 0.1193, + "num_input_tokens_seen": 116757232, + "step": 54100 + }, + { + "epoch": 8.826264274061991, + "grad_norm": 0.04847464710474014, + "learning_rate": 0.0006851894634597898, + "loss": 0.0591, + "num_input_tokens_seen": 116768304, + "step": 54105 + }, + { + "epoch": 8.827079934747145, + "grad_norm": 0.20485931634902954, + "learning_rate": 0.0006851233440162858, + "loss": 0.0625, + "num_input_tokens_seen": 116779216, + "step": 54110 + }, + { + "epoch": 8.8278955954323, + "grad_norm": 0.056253526359796524, + "learning_rate": 0.0006850572208210223, + "loss": 0.0189, + "num_input_tokens_seen": 116790160, + "step": 54115 + }, + { + "epoch": 8.828711256117455, + "grad_norm": 0.04299944266676903, + "learning_rate": 0.0006849910938753396, + "loss": 0.0267, + "num_input_tokens_seen": 116801936, + "step": 54120 + }, + { + "epoch": 8.82952691680261, + "grad_norm": 0.01340442430227995, + "learning_rate": 0.0006849249631805777, + "loss": 0.0748, + "num_input_tokens_seen": 116812816, + "step": 54125 + }, + { + "epoch": 8.830342577487766, + "grad_norm": 0.014919687062501907, + "learning_rate": 0.0006848588287380769, + "loss": 0.0476, + "num_input_tokens_seen": 116821840, + "step": 54130 + }, + { + "epoch": 8.83115823817292, + "grad_norm": 0.02519080974161625, + "learning_rate": 0.0006847926905491771, + "loss": 0.0256, + "num_input_tokens_seen": 116832912, + "step": 54135 + }, + { + "epoch": 8.831973898858076, + "grad_norm": 0.04732099920511246, + "learning_rate": 0.0006847265486152192, + "loss": 0.0106, + "num_input_tokens_seen": 116842896, + "step": 54140 + }, + { + "epoch": 8.83278955954323, + "grad_norm": 0.11252713203430176, + "learning_rate": 0.0006846604029375435, + "loss": 0.0675, + "num_input_tokens_seen": 116853744, + "step": 54145 + }, + { + "epoch": 8.833605220228385, + "grad_norm": 0.04709382355213165, + "learning_rate": 0.0006845942535174905, + "loss": 0.0285, + "num_input_tokens_seen": 116863824, + "step": 54150 + }, + { + "epoch": 8.83442088091354, + "grad_norm": 0.0011847659479826689, + "learning_rate": 0.0006845281003564007, + "loss": 0.0595, + "num_input_tokens_seen": 116875056, + "step": 54155 + }, + { + "epoch": 8.835236541598695, + "grad_norm": 0.003138895845040679, + "learning_rate": 0.0006844619434556149, + "loss": 0.0617, + "num_input_tokens_seen": 116886768, + "step": 54160 + }, + { + "epoch": 8.83605220228385, + "grad_norm": 0.21426472067832947, + "learning_rate": 0.0006843957828164737, + "loss": 0.1219, + "num_input_tokens_seen": 116898032, + "step": 54165 + }, + { + "epoch": 8.836867862969005, + "grad_norm": 0.01748785935342312, + "learning_rate": 0.0006843296184403182, + "loss": 0.0271, + "num_input_tokens_seen": 116909008, + "step": 54170 + }, + { + "epoch": 8.83768352365416, + "grad_norm": 0.10299021750688553, + "learning_rate": 0.0006842634503284891, + "loss": 0.0383, + "num_input_tokens_seen": 116920464, + "step": 54175 + }, + { + "epoch": 8.838499184339314, + "grad_norm": 0.0038741808384656906, + "learning_rate": 0.0006841972784823274, + "loss": 0.0708, + "num_input_tokens_seen": 116931568, + "step": 54180 + }, + { + "epoch": 8.83931484502447, + "grad_norm": 0.012992334552109241, + "learning_rate": 0.0006841311029031742, + "loss": 0.0128, + "num_input_tokens_seen": 116943312, + "step": 54185 + }, + { + "epoch": 8.840130505709626, + "grad_norm": 0.022194506600499153, + "learning_rate": 0.0006840649235923706, + "loss": 0.0069, + "num_input_tokens_seen": 116954480, + "step": 54190 + }, + { + "epoch": 8.84094616639478, + "grad_norm": 0.007937074638903141, + "learning_rate": 0.0006839987405512577, + "loss": 0.0641, + "num_input_tokens_seen": 116964880, + "step": 54195 + }, + { + "epoch": 8.841761827079935, + "grad_norm": 0.03616004437208176, + "learning_rate": 0.000683932553781177, + "loss": 0.0107, + "num_input_tokens_seen": 116974896, + "step": 54200 + }, + { + "epoch": 8.84257748776509, + "grad_norm": 0.036622874438762665, + "learning_rate": 0.0006838663632834697, + "loss": 0.0088, + "num_input_tokens_seen": 116985072, + "step": 54205 + }, + { + "epoch": 8.843393148450245, + "grad_norm": 0.0875597670674324, + "learning_rate": 0.0006838001690594775, + "loss": 0.0214, + "num_input_tokens_seen": 116995856, + "step": 54210 + }, + { + "epoch": 8.844208809135399, + "grad_norm": 0.19458378851413727, + "learning_rate": 0.0006837339711105414, + "loss": 0.1517, + "num_input_tokens_seen": 117007152, + "step": 54215 + }, + { + "epoch": 8.845024469820554, + "grad_norm": 0.28629010915756226, + "learning_rate": 0.0006836677694380035, + "loss": 0.1583, + "num_input_tokens_seen": 117017040, + "step": 54220 + }, + { + "epoch": 8.84584013050571, + "grad_norm": 0.1492338627576828, + "learning_rate": 0.0006836015640432054, + "loss": 0.1978, + "num_input_tokens_seen": 117028304, + "step": 54225 + }, + { + "epoch": 8.846655791190864, + "grad_norm": 0.0066625820472836494, + "learning_rate": 0.0006835353549274885, + "loss": 0.1546, + "num_input_tokens_seen": 117039120, + "step": 54230 + }, + { + "epoch": 8.84747145187602, + "grad_norm": 0.007757519371807575, + "learning_rate": 0.0006834691420921948, + "loss": 0.0372, + "num_input_tokens_seen": 117050032, + "step": 54235 + }, + { + "epoch": 8.848287112561174, + "grad_norm": 0.048568617552518845, + "learning_rate": 0.0006834029255386663, + "loss": 0.1079, + "num_input_tokens_seen": 117058448, + "step": 54240 + }, + { + "epoch": 8.84910277324633, + "grad_norm": 0.024753503501415253, + "learning_rate": 0.0006833367052682446, + "loss": 0.0334, + "num_input_tokens_seen": 117068880, + "step": 54245 + }, + { + "epoch": 8.849918433931485, + "grad_norm": 0.2349242866039276, + "learning_rate": 0.0006832704812822722, + "loss": 0.1827, + "num_input_tokens_seen": 117079312, + "step": 54250 + }, + { + "epoch": 8.850734094616639, + "grad_norm": 0.12527932226657867, + "learning_rate": 0.0006832042535820911, + "loss": 0.2015, + "num_input_tokens_seen": 117089776, + "step": 54255 + }, + { + "epoch": 8.851549755301795, + "grad_norm": 0.009287774562835693, + "learning_rate": 0.0006831380221690431, + "loss": 0.2213, + "num_input_tokens_seen": 117100304, + "step": 54260 + }, + { + "epoch": 8.852365415986949, + "grad_norm": 0.00552772544324398, + "learning_rate": 0.0006830717870444709, + "loss": 0.0662, + "num_input_tokens_seen": 117110064, + "step": 54265 + }, + { + "epoch": 8.853181076672104, + "grad_norm": 0.29646697640419006, + "learning_rate": 0.0006830055482097168, + "loss": 0.1517, + "num_input_tokens_seen": 117121872, + "step": 54270 + }, + { + "epoch": 8.85399673735726, + "grad_norm": 0.01229477021843195, + "learning_rate": 0.000682939305666123, + "loss": 0.0138, + "num_input_tokens_seen": 117133040, + "step": 54275 + }, + { + "epoch": 8.854812398042414, + "grad_norm": 0.026781367138028145, + "learning_rate": 0.000682873059415032, + "loss": 0.0779, + "num_input_tokens_seen": 117145488, + "step": 54280 + }, + { + "epoch": 8.85562805872757, + "grad_norm": 0.004449956584721804, + "learning_rate": 0.0006828068094577864, + "loss": 0.1433, + "num_input_tokens_seen": 117156464, + "step": 54285 + }, + { + "epoch": 8.856443719412724, + "grad_norm": 0.29995131492614746, + "learning_rate": 0.0006827405557957291, + "loss": 0.1019, + "num_input_tokens_seen": 117166128, + "step": 54290 + }, + { + "epoch": 8.85725938009788, + "grad_norm": 0.21017254889011383, + "learning_rate": 0.0006826742984302026, + "loss": 0.0553, + "num_input_tokens_seen": 117176272, + "step": 54295 + }, + { + "epoch": 8.858075040783035, + "grad_norm": 0.02461509220302105, + "learning_rate": 0.0006826080373625496, + "loss": 0.2118, + "num_input_tokens_seen": 117187440, + "step": 54300 + }, + { + "epoch": 8.858890701468189, + "grad_norm": 0.015149961225688457, + "learning_rate": 0.0006825417725941132, + "loss": 0.0218, + "num_input_tokens_seen": 117198416, + "step": 54305 + }, + { + "epoch": 8.859706362153345, + "grad_norm": 0.0019865825306624174, + "learning_rate": 0.0006824755041262361, + "loss": 0.1013, + "num_input_tokens_seen": 117209104, + "step": 54310 + }, + { + "epoch": 8.860522022838499, + "grad_norm": 0.02434413507580757, + "learning_rate": 0.0006824092319602614, + "loss": 0.0387, + "num_input_tokens_seen": 117220592, + "step": 54315 + }, + { + "epoch": 8.861337683523654, + "grad_norm": 0.04532919451594353, + "learning_rate": 0.0006823429560975323, + "loss": 0.0229, + "num_input_tokens_seen": 117231280, + "step": 54320 + }, + { + "epoch": 8.86215334420881, + "grad_norm": 0.22949114441871643, + "learning_rate": 0.0006822766765393919, + "loss": 0.1121, + "num_input_tokens_seen": 117243216, + "step": 54325 + }, + { + "epoch": 8.862969004893964, + "grad_norm": 0.014909877441823483, + "learning_rate": 0.0006822103932871832, + "loss": 0.0155, + "num_input_tokens_seen": 117252976, + "step": 54330 + }, + { + "epoch": 8.86378466557912, + "grad_norm": 0.023958783596754074, + "learning_rate": 0.00068214410634225, + "loss": 0.0756, + "num_input_tokens_seen": 117264016, + "step": 54335 + }, + { + "epoch": 8.864600326264274, + "grad_norm": 0.20216239988803864, + "learning_rate": 0.0006820778157059353, + "loss": 0.1319, + "num_input_tokens_seen": 117274864, + "step": 54340 + }, + { + "epoch": 8.86541598694943, + "grad_norm": 0.31600505113601685, + "learning_rate": 0.0006820115213795827, + "loss": 0.0798, + "num_input_tokens_seen": 117285552, + "step": 54345 + }, + { + "epoch": 8.866231647634583, + "grad_norm": 0.10832073539495468, + "learning_rate": 0.0006819452233645357, + "loss": 0.0162, + "num_input_tokens_seen": 117295728, + "step": 54350 + }, + { + "epoch": 8.867047308319739, + "grad_norm": 0.14743418991565704, + "learning_rate": 0.0006818789216621379, + "loss": 0.1863, + "num_input_tokens_seen": 117307408, + "step": 54355 + }, + { + "epoch": 8.867862969004895, + "grad_norm": 0.19545665383338928, + "learning_rate": 0.0006818126162737332, + "loss": 0.1609, + "num_input_tokens_seen": 117318832, + "step": 54360 + }, + { + "epoch": 8.868678629690049, + "grad_norm": 0.018065668642520905, + "learning_rate": 0.000681746307200665, + "loss": 0.1177, + "num_input_tokens_seen": 117329712, + "step": 54365 + }, + { + "epoch": 8.869494290375204, + "grad_norm": 0.03201736882328987, + "learning_rate": 0.0006816799944442774, + "loss": 0.0424, + "num_input_tokens_seen": 117339728, + "step": 54370 + }, + { + "epoch": 8.870309951060358, + "grad_norm": 0.01682724431157112, + "learning_rate": 0.0006816136780059142, + "loss": 0.0295, + "num_input_tokens_seen": 117351408, + "step": 54375 + }, + { + "epoch": 8.871125611745514, + "grad_norm": 0.3079787790775299, + "learning_rate": 0.0006815473578869194, + "loss": 0.1885, + "num_input_tokens_seen": 117361904, + "step": 54380 + }, + { + "epoch": 8.87194127243067, + "grad_norm": 0.00952866766601801, + "learning_rate": 0.0006814810340886372, + "loss": 0.0211, + "num_input_tokens_seen": 117371536, + "step": 54385 + }, + { + "epoch": 8.872756933115824, + "grad_norm": 0.011413199827075005, + "learning_rate": 0.0006814147066124116, + "loss": 0.1188, + "num_input_tokens_seen": 117379984, + "step": 54390 + }, + { + "epoch": 8.87357259380098, + "grad_norm": 0.01727825216948986, + "learning_rate": 0.0006813483754595867, + "loss": 0.048, + "num_input_tokens_seen": 117390416, + "step": 54395 + }, + { + "epoch": 8.874388254486133, + "grad_norm": 0.06836117058992386, + "learning_rate": 0.000681282040631507, + "loss": 0.0589, + "num_input_tokens_seen": 117401136, + "step": 54400 + }, + { + "epoch": 8.875203915171289, + "grad_norm": 0.3057677149772644, + "learning_rate": 0.0006812157021295167, + "loss": 0.1466, + "num_input_tokens_seen": 117411344, + "step": 54405 + }, + { + "epoch": 8.876019575856443, + "grad_norm": 0.06018754094839096, + "learning_rate": 0.0006811493599549603, + "loss": 0.0326, + "num_input_tokens_seen": 117421680, + "step": 54410 + }, + { + "epoch": 8.876835236541599, + "grad_norm": 0.009182003326714039, + "learning_rate": 0.0006810830141091825, + "loss": 0.0353, + "num_input_tokens_seen": 117432048, + "step": 54415 + }, + { + "epoch": 8.877650897226754, + "grad_norm": 0.01096038892865181, + "learning_rate": 0.0006810166645935276, + "loss": 0.0802, + "num_input_tokens_seen": 117444080, + "step": 54420 + }, + { + "epoch": 8.878466557911908, + "grad_norm": 0.045354168862104416, + "learning_rate": 0.0006809503114093403, + "loss": 0.027, + "num_input_tokens_seen": 117454864, + "step": 54425 + }, + { + "epoch": 8.879282218597064, + "grad_norm": 0.006962680723518133, + "learning_rate": 0.0006808839545579655, + "loss": 0.0739, + "num_input_tokens_seen": 117464944, + "step": 54430 + }, + { + "epoch": 8.880097879282218, + "grad_norm": 0.025805363431572914, + "learning_rate": 0.0006808175940407477, + "loss": 0.015, + "num_input_tokens_seen": 117475344, + "step": 54435 + }, + { + "epoch": 8.880913539967374, + "grad_norm": 0.21085435152053833, + "learning_rate": 0.0006807512298590321, + "loss": 0.1422, + "num_input_tokens_seen": 117486288, + "step": 54440 + }, + { + "epoch": 8.88172920065253, + "grad_norm": 0.08830022811889648, + "learning_rate": 0.0006806848620141636, + "loss": 0.0614, + "num_input_tokens_seen": 117496432, + "step": 54445 + }, + { + "epoch": 8.882544861337683, + "grad_norm": 0.017433490604162216, + "learning_rate": 0.0006806184905074871, + "loss": 0.0398, + "num_input_tokens_seen": 117506960, + "step": 54450 + }, + { + "epoch": 8.883360522022839, + "grad_norm": 0.08004128932952881, + "learning_rate": 0.0006805521153403476, + "loss": 0.0854, + "num_input_tokens_seen": 117517360, + "step": 54455 + }, + { + "epoch": 8.884176182707993, + "grad_norm": 0.020499244332313538, + "learning_rate": 0.0006804857365140906, + "loss": 0.1042, + "num_input_tokens_seen": 117527376, + "step": 54460 + }, + { + "epoch": 8.884991843393149, + "grad_norm": 0.11806554347276688, + "learning_rate": 0.0006804193540300612, + "loss": 0.0414, + "num_input_tokens_seen": 117537104, + "step": 54465 + }, + { + "epoch": 8.885807504078304, + "grad_norm": 0.26409220695495605, + "learning_rate": 0.0006803529678896047, + "loss": 0.0928, + "num_input_tokens_seen": 117546896, + "step": 54470 + }, + { + "epoch": 8.886623164763458, + "grad_norm": 0.18930546939373016, + "learning_rate": 0.0006802865780940663, + "loss": 0.0617, + "num_input_tokens_seen": 117557328, + "step": 54475 + }, + { + "epoch": 8.887438825448614, + "grad_norm": 0.050380732864141464, + "learning_rate": 0.000680220184644792, + "loss": 0.026, + "num_input_tokens_seen": 117567568, + "step": 54480 + }, + { + "epoch": 8.888254486133768, + "grad_norm": 0.14145444333553314, + "learning_rate": 0.0006801537875431269, + "loss": 0.032, + "num_input_tokens_seen": 117578032, + "step": 54485 + }, + { + "epoch": 8.889070146818923, + "grad_norm": 0.0036057010293006897, + "learning_rate": 0.0006800873867904167, + "loss": 0.1567, + "num_input_tokens_seen": 117589456, + "step": 54490 + }, + { + "epoch": 8.88988580750408, + "grad_norm": 0.20838099718093872, + "learning_rate": 0.0006800209823880072, + "loss": 0.0603, + "num_input_tokens_seen": 117599952, + "step": 54495 + }, + { + "epoch": 8.890701468189233, + "grad_norm": 0.08609696477651596, + "learning_rate": 0.0006799545743372442, + "loss": 0.1362, + "num_input_tokens_seen": 117610288, + "step": 54500 + }, + { + "epoch": 8.891517128874389, + "grad_norm": 0.09716901928186417, + "learning_rate": 0.0006798881626394734, + "loss": 0.0166, + "num_input_tokens_seen": 117620528, + "step": 54505 + }, + { + "epoch": 8.892332789559543, + "grad_norm": 0.005563515704125166, + "learning_rate": 0.0006798217472960407, + "loss": 0.047, + "num_input_tokens_seen": 117631984, + "step": 54510 + }, + { + "epoch": 8.893148450244698, + "grad_norm": 0.010449321009218693, + "learning_rate": 0.0006797553283082922, + "loss": 0.02, + "num_input_tokens_seen": 117642800, + "step": 54515 + }, + { + "epoch": 8.893964110929852, + "grad_norm": 0.07682334631681442, + "learning_rate": 0.000679688905677574, + "loss": 0.0257, + "num_input_tokens_seen": 117654544, + "step": 54520 + }, + { + "epoch": 8.894779771615008, + "grad_norm": 0.006281704176217318, + "learning_rate": 0.0006796224794052322, + "loss": 0.0082, + "num_input_tokens_seen": 117664688, + "step": 54525 + }, + { + "epoch": 8.895595432300164, + "grad_norm": 0.019011011347174644, + "learning_rate": 0.0006795560494926129, + "loss": 0.0359, + "num_input_tokens_seen": 117675664, + "step": 54530 + }, + { + "epoch": 8.896411092985318, + "grad_norm": 0.002435162663459778, + "learning_rate": 0.0006794896159410625, + "loss": 0.0152, + "num_input_tokens_seen": 117685744, + "step": 54535 + }, + { + "epoch": 8.897226753670473, + "grad_norm": 0.00668777571991086, + "learning_rate": 0.0006794231787519274, + "loss": 0.0561, + "num_input_tokens_seen": 117697104, + "step": 54540 + }, + { + "epoch": 8.898042414355627, + "grad_norm": 0.20317891240119934, + "learning_rate": 0.000679356737926554, + "loss": 0.1328, + "num_input_tokens_seen": 117708272, + "step": 54545 + }, + { + "epoch": 8.898858075040783, + "grad_norm": 0.25155773758888245, + "learning_rate": 0.0006792902934662885, + "loss": 0.0193, + "num_input_tokens_seen": 117718832, + "step": 54550 + }, + { + "epoch": 8.899673735725939, + "grad_norm": 0.007560465019196272, + "learning_rate": 0.000679223845372478, + "loss": 0.0254, + "num_input_tokens_seen": 117729744, + "step": 54555 + }, + { + "epoch": 8.900489396411093, + "grad_norm": 0.03247198835015297, + "learning_rate": 0.0006791573936464689, + "loss": 0.0242, + "num_input_tokens_seen": 117741104, + "step": 54560 + }, + { + "epoch": 8.901305057096248, + "grad_norm": 0.007805227302014828, + "learning_rate": 0.0006790909382896079, + "loss": 0.0265, + "num_input_tokens_seen": 117751280, + "step": 54565 + }, + { + "epoch": 8.902120717781402, + "grad_norm": 0.004938093479722738, + "learning_rate": 0.0006790244793032418, + "loss": 0.0492, + "num_input_tokens_seen": 117761200, + "step": 54570 + }, + { + "epoch": 8.902936378466558, + "grad_norm": 0.08606547862291336, + "learning_rate": 0.0006789580166887176, + "loss": 0.0579, + "num_input_tokens_seen": 117771536, + "step": 54575 + }, + { + "epoch": 8.903752039151712, + "grad_norm": 0.06622520089149475, + "learning_rate": 0.0006788915504473822, + "loss": 0.0166, + "num_input_tokens_seen": 117781808, + "step": 54580 + }, + { + "epoch": 8.904567699836868, + "grad_norm": 0.002919929102063179, + "learning_rate": 0.0006788250805805824, + "loss": 0.1282, + "num_input_tokens_seen": 117793296, + "step": 54585 + }, + { + "epoch": 8.905383360522023, + "grad_norm": 0.05999757722020149, + "learning_rate": 0.0006787586070896657, + "loss": 0.1054, + "num_input_tokens_seen": 117803568, + "step": 54590 + }, + { + "epoch": 8.906199021207177, + "grad_norm": 0.002380428370088339, + "learning_rate": 0.0006786921299759789, + "loss": 0.066, + "num_input_tokens_seen": 117813648, + "step": 54595 + }, + { + "epoch": 8.907014681892333, + "grad_norm": 0.015115426853299141, + "learning_rate": 0.0006786256492408694, + "loss": 0.0205, + "num_input_tokens_seen": 117824816, + "step": 54600 + }, + { + "epoch": 8.907830342577487, + "grad_norm": 0.0031798644922673702, + "learning_rate": 0.0006785591648856846, + "loss": 0.0454, + "num_input_tokens_seen": 117835312, + "step": 54605 + }, + { + "epoch": 8.908646003262643, + "grad_norm": 0.0035450763534754515, + "learning_rate": 0.0006784926769117717, + "loss": 0.0861, + "num_input_tokens_seen": 117845616, + "step": 54610 + }, + { + "epoch": 8.909461663947798, + "grad_norm": 0.02123982273042202, + "learning_rate": 0.0006784261853204783, + "loss": 0.0221, + "num_input_tokens_seen": 117856656, + "step": 54615 + }, + { + "epoch": 8.910277324632952, + "grad_norm": 0.35546594858169556, + "learning_rate": 0.0006783596901131521, + "loss": 0.397, + "num_input_tokens_seen": 117867056, + "step": 54620 + }, + { + "epoch": 8.911092985318108, + "grad_norm": 0.1029837429523468, + "learning_rate": 0.0006782931912911402, + "loss": 0.1212, + "num_input_tokens_seen": 117877552, + "step": 54625 + }, + { + "epoch": 8.911908646003262, + "grad_norm": 0.009889107197523117, + "learning_rate": 0.0006782266888557909, + "loss": 0.036, + "num_input_tokens_seen": 117888624, + "step": 54630 + }, + { + "epoch": 8.912724306688418, + "grad_norm": 0.4738927185535431, + "learning_rate": 0.0006781601828084513, + "loss": 0.1809, + "num_input_tokens_seen": 117897520, + "step": 54635 + }, + { + "epoch": 8.913539967373573, + "grad_norm": 0.028210006654262543, + "learning_rate": 0.0006780936731504699, + "loss": 0.0455, + "num_input_tokens_seen": 117907920, + "step": 54640 + }, + { + "epoch": 8.914355628058727, + "grad_norm": 0.22908316552639008, + "learning_rate": 0.0006780271598831942, + "loss": 0.0857, + "num_input_tokens_seen": 117919088, + "step": 54645 + }, + { + "epoch": 8.915171288743883, + "grad_norm": 0.004710191860795021, + "learning_rate": 0.0006779606430079723, + "loss": 0.0138, + "num_input_tokens_seen": 117928848, + "step": 54650 + }, + { + "epoch": 8.915986949429037, + "grad_norm": 0.008551832288503647, + "learning_rate": 0.0006778941225261522, + "loss": 0.0878, + "num_input_tokens_seen": 117939984, + "step": 54655 + }, + { + "epoch": 8.916802610114193, + "grad_norm": 0.014631007798016071, + "learning_rate": 0.0006778275984390819, + "loss": 0.0273, + "num_input_tokens_seen": 117952784, + "step": 54660 + }, + { + "epoch": 8.917618270799348, + "grad_norm": 0.013700642623007298, + "learning_rate": 0.0006777610707481099, + "loss": 0.0476, + "num_input_tokens_seen": 117963504, + "step": 54665 + }, + { + "epoch": 8.918433931484502, + "grad_norm": 0.006767612881958485, + "learning_rate": 0.0006776945394545841, + "loss": 0.0291, + "num_input_tokens_seen": 117973808, + "step": 54670 + }, + { + "epoch": 8.919249592169658, + "grad_norm": 0.012531627900898457, + "learning_rate": 0.0006776280045598533, + "loss": 0.008, + "num_input_tokens_seen": 117984464, + "step": 54675 + }, + { + "epoch": 8.920065252854812, + "grad_norm": 0.022910302504897118, + "learning_rate": 0.0006775614660652655, + "loss": 0.012, + "num_input_tokens_seen": 117994928, + "step": 54680 + }, + { + "epoch": 8.920880913539968, + "grad_norm": 0.011093943379819393, + "learning_rate": 0.0006774949239721692, + "loss": 0.0493, + "num_input_tokens_seen": 118004304, + "step": 54685 + }, + { + "epoch": 8.921696574225122, + "grad_norm": 0.01702740043401718, + "learning_rate": 0.0006774283782819133, + "loss": 0.0275, + "num_input_tokens_seen": 118015152, + "step": 54690 + }, + { + "epoch": 8.922512234910277, + "grad_norm": 0.06016070768237114, + "learning_rate": 0.0006773618289958462, + "loss": 0.0362, + "num_input_tokens_seen": 118027344, + "step": 54695 + }, + { + "epoch": 8.923327895595433, + "grad_norm": 0.19754047691822052, + "learning_rate": 0.0006772952761153167, + "loss": 0.1527, + "num_input_tokens_seen": 118038672, + "step": 54700 + }, + { + "epoch": 8.924143556280587, + "grad_norm": 0.180416077375412, + "learning_rate": 0.0006772287196416733, + "loss": 0.0907, + "num_input_tokens_seen": 118050576, + "step": 54705 + }, + { + "epoch": 8.924959216965743, + "grad_norm": 0.0073585957288742065, + "learning_rate": 0.0006771621595762652, + "loss": 0.1822, + "num_input_tokens_seen": 118061520, + "step": 54710 + }, + { + "epoch": 8.925774877650896, + "grad_norm": 0.002674032235518098, + "learning_rate": 0.0006770955959204412, + "loss": 0.1206, + "num_input_tokens_seen": 118072080, + "step": 54715 + }, + { + "epoch": 8.926590538336052, + "grad_norm": 0.032474979758262634, + "learning_rate": 0.0006770290286755503, + "loss": 0.0185, + "num_input_tokens_seen": 118082416, + "step": 54720 + }, + { + "epoch": 8.927406199021208, + "grad_norm": 0.016622772440314293, + "learning_rate": 0.0006769624578429414, + "loss": 0.1074, + "num_input_tokens_seen": 118092880, + "step": 54725 + }, + { + "epoch": 8.928221859706362, + "grad_norm": 0.07715722173452377, + "learning_rate": 0.0006768958834239639, + "loss": 0.0239, + "num_input_tokens_seen": 118105136, + "step": 54730 + }, + { + "epoch": 8.929037520391518, + "grad_norm": 0.012955647893249989, + "learning_rate": 0.0006768293054199669, + "loss": 0.0679, + "num_input_tokens_seen": 118114736, + "step": 54735 + }, + { + "epoch": 8.929853181076671, + "grad_norm": 0.028105130419135094, + "learning_rate": 0.0006767627238322998, + "loss": 0.0122, + "num_input_tokens_seen": 118126064, + "step": 54740 + }, + { + "epoch": 8.930668841761827, + "grad_norm": 0.058006029576063156, + "learning_rate": 0.0006766961386623118, + "loss": 0.0163, + "num_input_tokens_seen": 118137456, + "step": 54745 + }, + { + "epoch": 8.931484502446983, + "grad_norm": 0.057216476649045944, + "learning_rate": 0.0006766295499113524, + "loss": 0.0264, + "num_input_tokens_seen": 118147376, + "step": 54750 + }, + { + "epoch": 8.932300163132137, + "grad_norm": 0.00669086305424571, + "learning_rate": 0.000676562957580771, + "loss": 0.0131, + "num_input_tokens_seen": 118158096, + "step": 54755 + }, + { + "epoch": 8.933115823817293, + "grad_norm": 0.0040925247594714165, + "learning_rate": 0.0006764963616719174, + "loss": 0.1171, + "num_input_tokens_seen": 118168688, + "step": 54760 + }, + { + "epoch": 8.933931484502446, + "grad_norm": 0.024594241753220558, + "learning_rate": 0.000676429762186141, + "loss": 0.2253, + "num_input_tokens_seen": 118179792, + "step": 54765 + }, + { + "epoch": 8.934747145187602, + "grad_norm": 0.05240340158343315, + "learning_rate": 0.0006763631591247917, + "loss": 0.0206, + "num_input_tokens_seen": 118190736, + "step": 54770 + }, + { + "epoch": 8.935562805872756, + "grad_norm": 0.028133444488048553, + "learning_rate": 0.0006762965524892194, + "loss": 0.1461, + "num_input_tokens_seen": 118201584, + "step": 54775 + }, + { + "epoch": 8.936378466557912, + "grad_norm": 0.03105170652270317, + "learning_rate": 0.0006762299422807737, + "loss": 0.072, + "num_input_tokens_seen": 118212624, + "step": 54780 + }, + { + "epoch": 8.937194127243067, + "grad_norm": 0.012723773717880249, + "learning_rate": 0.0006761633285008046, + "loss": 0.0356, + "num_input_tokens_seen": 118222800, + "step": 54785 + }, + { + "epoch": 8.938009787928221, + "grad_norm": 0.03822094574570656, + "learning_rate": 0.0006760967111506623, + "loss": 0.0758, + "num_input_tokens_seen": 118232592, + "step": 54790 + }, + { + "epoch": 8.938825448613377, + "grad_norm": 0.031024565920233727, + "learning_rate": 0.0006760300902316967, + "loss": 0.0144, + "num_input_tokens_seen": 118244464, + "step": 54795 + }, + { + "epoch": 8.939641109298531, + "grad_norm": 0.0680294930934906, + "learning_rate": 0.000675963465745258, + "loss": 0.019, + "num_input_tokens_seen": 118253552, + "step": 54800 + }, + { + "epoch": 8.940456769983687, + "grad_norm": 0.011198487132787704, + "learning_rate": 0.0006758968376926965, + "loss": 0.0744, + "num_input_tokens_seen": 118263824, + "step": 54805 + }, + { + "epoch": 8.941272430668842, + "grad_norm": 0.3901715576648712, + "learning_rate": 0.0006758302060753624, + "loss": 0.1445, + "num_input_tokens_seen": 118275216, + "step": 54810 + }, + { + "epoch": 8.942088091353996, + "grad_norm": 0.15061931312084198, + "learning_rate": 0.000675763570894606, + "loss": 0.0994, + "num_input_tokens_seen": 118286224, + "step": 54815 + }, + { + "epoch": 8.942903752039152, + "grad_norm": 0.07025843113660812, + "learning_rate": 0.0006756969321517781, + "loss": 0.1183, + "num_input_tokens_seen": 118296240, + "step": 54820 + }, + { + "epoch": 8.943719412724306, + "grad_norm": 0.003148243995383382, + "learning_rate": 0.0006756302898482288, + "loss": 0.0485, + "num_input_tokens_seen": 118305680, + "step": 54825 + }, + { + "epoch": 8.944535073409462, + "grad_norm": 0.004231143742799759, + "learning_rate": 0.0006755636439853089, + "loss": 0.0892, + "num_input_tokens_seen": 118316880, + "step": 54830 + }, + { + "epoch": 8.945350734094617, + "grad_norm": 0.03749319165945053, + "learning_rate": 0.0006754969945643689, + "loss": 0.0388, + "num_input_tokens_seen": 118328304, + "step": 54835 + }, + { + "epoch": 8.946166394779771, + "grad_norm": 0.025920093059539795, + "learning_rate": 0.0006754303415867599, + "loss": 0.0796, + "num_input_tokens_seen": 118339728, + "step": 54840 + }, + { + "epoch": 8.946982055464927, + "grad_norm": 0.09248571842908859, + "learning_rate": 0.0006753636850538325, + "loss": 0.1171, + "num_input_tokens_seen": 118349872, + "step": 54845 + }, + { + "epoch": 8.947797716150081, + "grad_norm": 0.011808681301772594, + "learning_rate": 0.0006752970249669374, + "loss": 0.1839, + "num_input_tokens_seen": 118359888, + "step": 54850 + }, + { + "epoch": 8.948613376835237, + "grad_norm": 0.027860432863235474, + "learning_rate": 0.0006752303613274257, + "loss": 0.0326, + "num_input_tokens_seen": 118370544, + "step": 54855 + }, + { + "epoch": 8.949429037520392, + "grad_norm": 0.12592335045337677, + "learning_rate": 0.0006751636941366486, + "loss": 0.0754, + "num_input_tokens_seen": 118381584, + "step": 54860 + }, + { + "epoch": 8.950244698205546, + "grad_norm": 0.007054011803120375, + "learning_rate": 0.000675097023395957, + "loss": 0.0425, + "num_input_tokens_seen": 118392624, + "step": 54865 + }, + { + "epoch": 8.951060358890702, + "grad_norm": 0.007219217251986265, + "learning_rate": 0.0006750303491067021, + "loss": 0.0483, + "num_input_tokens_seen": 118402768, + "step": 54870 + }, + { + "epoch": 8.951876019575856, + "grad_norm": 0.04315441474318504, + "learning_rate": 0.0006749636712702349, + "loss": 0.0123, + "num_input_tokens_seen": 118413584, + "step": 54875 + }, + { + "epoch": 8.952691680261012, + "grad_norm": 0.052446115761995316, + "learning_rate": 0.0006748969898879071, + "loss": 0.0503, + "num_input_tokens_seen": 118424400, + "step": 54880 + }, + { + "epoch": 8.953507340946166, + "grad_norm": 0.009185166098177433, + "learning_rate": 0.00067483030496107, + "loss": 0.0286, + "num_input_tokens_seen": 118435696, + "step": 54885 + }, + { + "epoch": 8.954323001631321, + "grad_norm": 0.023122737184166908, + "learning_rate": 0.000674763616491075, + "loss": 0.1976, + "num_input_tokens_seen": 118446544, + "step": 54890 + }, + { + "epoch": 8.955138662316477, + "grad_norm": 0.19965215027332306, + "learning_rate": 0.0006746969244792734, + "loss": 0.125, + "num_input_tokens_seen": 118457616, + "step": 54895 + }, + { + "epoch": 8.955954323001631, + "grad_norm": 0.12136277556419373, + "learning_rate": 0.0006746302289270172, + "loss": 0.0431, + "num_input_tokens_seen": 118468816, + "step": 54900 + }, + { + "epoch": 8.956769983686787, + "grad_norm": 0.001943388837389648, + "learning_rate": 0.0006745635298356579, + "loss": 0.0421, + "num_input_tokens_seen": 118480176, + "step": 54905 + }, + { + "epoch": 8.95758564437194, + "grad_norm": 0.009855564683675766, + "learning_rate": 0.0006744968272065469, + "loss": 0.009, + "num_input_tokens_seen": 118491024, + "step": 54910 + }, + { + "epoch": 8.958401305057096, + "grad_norm": 0.07133545726537704, + "learning_rate": 0.0006744301210410366, + "loss": 0.0613, + "num_input_tokens_seen": 118502096, + "step": 54915 + }, + { + "epoch": 8.959216965742252, + "grad_norm": 0.09341428428888321, + "learning_rate": 0.0006743634113404786, + "loss": 0.0427, + "num_input_tokens_seen": 118512464, + "step": 54920 + }, + { + "epoch": 8.960032626427406, + "grad_norm": 0.05852467194199562, + "learning_rate": 0.0006742966981062249, + "loss": 0.0162, + "num_input_tokens_seen": 118523728, + "step": 54925 + }, + { + "epoch": 8.960848287112562, + "grad_norm": 0.020417101681232452, + "learning_rate": 0.0006742299813396274, + "loss": 0.1279, + "num_input_tokens_seen": 118533680, + "step": 54930 + }, + { + "epoch": 8.961663947797716, + "grad_norm": 0.015344120562076569, + "learning_rate": 0.0006741632610420384, + "loss": 0.095, + "num_input_tokens_seen": 118544176, + "step": 54935 + }, + { + "epoch": 8.962479608482871, + "grad_norm": 0.021016424521803856, + "learning_rate": 0.0006740965372148098, + "loss": 0.0414, + "num_input_tokens_seen": 118554384, + "step": 54940 + }, + { + "epoch": 8.963295269168025, + "grad_norm": 0.1998928189277649, + "learning_rate": 0.0006740298098592941, + "loss": 0.1465, + "num_input_tokens_seen": 118566064, + "step": 54945 + }, + { + "epoch": 8.964110929853181, + "grad_norm": 0.02774915285408497, + "learning_rate": 0.0006739630789768436, + "loss": 0.1059, + "num_input_tokens_seen": 118575952, + "step": 54950 + }, + { + "epoch": 8.964926590538337, + "grad_norm": 0.006330416072160006, + "learning_rate": 0.0006738963445688107, + "loss": 0.0876, + "num_input_tokens_seen": 118587408, + "step": 54955 + }, + { + "epoch": 8.96574225122349, + "grad_norm": 0.0027074057143181562, + "learning_rate": 0.0006738296066365476, + "loss": 0.1131, + "num_input_tokens_seen": 118598320, + "step": 54960 + }, + { + "epoch": 8.966557911908646, + "grad_norm": 0.21353696286678314, + "learning_rate": 0.000673762865181407, + "loss": 0.0554, + "num_input_tokens_seen": 118609488, + "step": 54965 + }, + { + "epoch": 8.9673735725938, + "grad_norm": 0.0059530180878937244, + "learning_rate": 0.0006736961202047417, + "loss": 0.0929, + "num_input_tokens_seen": 118621040, + "step": 54970 + }, + { + "epoch": 8.968189233278956, + "grad_norm": 0.032039012759923935, + "learning_rate": 0.0006736293717079041, + "loss": 0.0349, + "num_input_tokens_seen": 118632720, + "step": 54975 + }, + { + "epoch": 8.969004893964112, + "grad_norm": 0.003605822566896677, + "learning_rate": 0.0006735626196922469, + "loss": 0.0289, + "num_input_tokens_seen": 118643024, + "step": 54980 + }, + { + "epoch": 8.969820554649266, + "grad_norm": 0.21763961017131805, + "learning_rate": 0.0006734958641591231, + "loss": 0.2634, + "num_input_tokens_seen": 118653712, + "step": 54985 + }, + { + "epoch": 8.970636215334421, + "grad_norm": 0.2710396647453308, + "learning_rate": 0.0006734291051098856, + "loss": 0.1277, + "num_input_tokens_seen": 118664400, + "step": 54990 + }, + { + "epoch": 8.971451876019575, + "grad_norm": 0.060999076813459396, + "learning_rate": 0.0006733623425458871, + "loss": 0.0487, + "num_input_tokens_seen": 118675504, + "step": 54995 + }, + { + "epoch": 8.97226753670473, + "grad_norm": 0.00511001655831933, + "learning_rate": 0.000673295576468481, + "loss": 0.0591, + "num_input_tokens_seen": 118686192, + "step": 55000 + }, + { + "epoch": 8.973083197389887, + "grad_norm": 0.012306534685194492, + "learning_rate": 0.00067322880687902, + "loss": 0.0622, + "num_input_tokens_seen": 118697776, + "step": 55005 + }, + { + "epoch": 8.97389885807504, + "grad_norm": 0.004581958521157503, + "learning_rate": 0.0006731620337788576, + "loss": 0.0182, + "num_input_tokens_seen": 118708720, + "step": 55010 + }, + { + "epoch": 8.974714518760196, + "grad_norm": 0.06021929159760475, + "learning_rate": 0.0006730952571693469, + "loss": 0.0284, + "num_input_tokens_seen": 118720112, + "step": 55015 + }, + { + "epoch": 8.97553017944535, + "grad_norm": 0.17483726143836975, + "learning_rate": 0.0006730284770518412, + "loss": 0.1815, + "num_input_tokens_seen": 118731184, + "step": 55020 + }, + { + "epoch": 8.976345840130506, + "grad_norm": 0.1597505360841751, + "learning_rate": 0.0006729616934276939, + "loss": 0.135, + "num_input_tokens_seen": 118741392, + "step": 55025 + }, + { + "epoch": 8.977161500815662, + "grad_norm": 0.26488834619522095, + "learning_rate": 0.0006728949062982585, + "loss": 0.1242, + "num_input_tokens_seen": 118751984, + "step": 55030 + }, + { + "epoch": 8.977977161500815, + "grad_norm": 0.04788762703537941, + "learning_rate": 0.0006728281156648885, + "loss": 0.0361, + "num_input_tokens_seen": 118763600, + "step": 55035 + }, + { + "epoch": 8.978792822185971, + "grad_norm": 0.014035162515938282, + "learning_rate": 0.0006727613215289374, + "loss": 0.0332, + "num_input_tokens_seen": 118774512, + "step": 55040 + }, + { + "epoch": 8.979608482871125, + "grad_norm": 0.006271898280829191, + "learning_rate": 0.0006726945238917589, + "loss": 0.0165, + "num_input_tokens_seen": 118785072, + "step": 55045 + }, + { + "epoch": 8.98042414355628, + "grad_norm": 0.008285443298518658, + "learning_rate": 0.000672627722754707, + "loss": 0.0246, + "num_input_tokens_seen": 118795984, + "step": 55050 + }, + { + "epoch": 8.981239804241435, + "grad_norm": 0.056490156799554825, + "learning_rate": 0.0006725609181191352, + "loss": 0.0494, + "num_input_tokens_seen": 118804656, + "step": 55055 + }, + { + "epoch": 8.98205546492659, + "grad_norm": 0.1611725389957428, + "learning_rate": 0.0006724941099863975, + "loss": 0.0138, + "num_input_tokens_seen": 118814224, + "step": 55060 + }, + { + "epoch": 8.982871125611746, + "grad_norm": 0.0049819364212453365, + "learning_rate": 0.0006724272983578478, + "loss": 0.0134, + "num_input_tokens_seen": 118825552, + "step": 55065 + }, + { + "epoch": 8.9836867862969, + "grad_norm": 0.02708779089152813, + "learning_rate": 0.0006723604832348403, + "loss": 0.1129, + "num_input_tokens_seen": 118836752, + "step": 55070 + }, + { + "epoch": 8.984502446982056, + "grad_norm": 0.007590142544358969, + "learning_rate": 0.0006722936646187288, + "loss": 0.0273, + "num_input_tokens_seen": 118846160, + "step": 55075 + }, + { + "epoch": 8.98531810766721, + "grad_norm": 0.027876242995262146, + "learning_rate": 0.0006722268425108675, + "loss": 0.0071, + "num_input_tokens_seen": 118857168, + "step": 55080 + }, + { + "epoch": 8.986133768352365, + "grad_norm": 0.002578563755378127, + "learning_rate": 0.000672160016912611, + "loss": 0.0123, + "num_input_tokens_seen": 118868432, + "step": 55085 + }, + { + "epoch": 8.986949429037521, + "grad_norm": 0.16570094227790833, + "learning_rate": 0.0006720931878253133, + "loss": 0.074, + "num_input_tokens_seen": 118879120, + "step": 55090 + }, + { + "epoch": 8.987765089722675, + "grad_norm": 0.07209951430559158, + "learning_rate": 0.0006720263552503288, + "loss": 0.0231, + "num_input_tokens_seen": 118889040, + "step": 55095 + }, + { + "epoch": 8.98858075040783, + "grad_norm": 0.0016582689713686705, + "learning_rate": 0.000671959519189012, + "loss": 0.0303, + "num_input_tokens_seen": 118898608, + "step": 55100 + }, + { + "epoch": 8.989396411092985, + "grad_norm": 0.04168137535452843, + "learning_rate": 0.0006718926796427174, + "loss": 0.0085, + "num_input_tokens_seen": 118909264, + "step": 55105 + }, + { + "epoch": 8.99021207177814, + "grad_norm": 0.08970534056425095, + "learning_rate": 0.0006718258366127995, + "loss": 0.0565, + "num_input_tokens_seen": 118920048, + "step": 55110 + }, + { + "epoch": 8.991027732463294, + "grad_norm": 0.3565184772014618, + "learning_rate": 0.0006717589901006131, + "loss": 0.0317, + "num_input_tokens_seen": 118931120, + "step": 55115 + }, + { + "epoch": 8.99184339314845, + "grad_norm": 0.024913907051086426, + "learning_rate": 0.0006716921401075129, + "loss": 0.0589, + "num_input_tokens_seen": 118940464, + "step": 55120 + }, + { + "epoch": 8.992659053833606, + "grad_norm": 0.1407296061515808, + "learning_rate": 0.0006716252866348537, + "loss": 0.2472, + "num_input_tokens_seen": 118951120, + "step": 55125 + }, + { + "epoch": 8.99347471451876, + "grad_norm": 0.039190925657749176, + "learning_rate": 0.0006715584296839903, + "loss": 0.069, + "num_input_tokens_seen": 118961616, + "step": 55130 + }, + { + "epoch": 8.994290375203915, + "grad_norm": 0.047454044222831726, + "learning_rate": 0.0006714915692562777, + "loss": 0.0664, + "num_input_tokens_seen": 118973296, + "step": 55135 + }, + { + "epoch": 8.99510603588907, + "grad_norm": 0.19372011721134186, + "learning_rate": 0.0006714247053530709, + "loss": 0.0249, + "num_input_tokens_seen": 118983344, + "step": 55140 + }, + { + "epoch": 8.995921696574225, + "grad_norm": 0.0034268659073859453, + "learning_rate": 0.0006713578379757251, + "loss": 0.0148, + "num_input_tokens_seen": 118994544, + "step": 55145 + }, + { + "epoch": 8.99673735725938, + "grad_norm": 0.10639503598213196, + "learning_rate": 0.0006712909671255952, + "loss": 0.0536, + "num_input_tokens_seen": 119005264, + "step": 55150 + }, + { + "epoch": 8.997553017944535, + "grad_norm": 0.2622402310371399, + "learning_rate": 0.0006712240928040363, + "loss": 0.1971, + "num_input_tokens_seen": 119016688, + "step": 55155 + }, + { + "epoch": 8.99836867862969, + "grad_norm": 0.18849623203277588, + "learning_rate": 0.0006711572150124043, + "loss": 0.1044, + "num_input_tokens_seen": 119027856, + "step": 55160 + }, + { + "epoch": 8.999184339314844, + "grad_norm": 0.30848976969718933, + "learning_rate": 0.0006710903337520539, + "loss": 0.1997, + "num_input_tokens_seen": 119038000, + "step": 55165 + }, + { + "epoch": 9.0, + "grad_norm": 0.0008787537808530033, + "learning_rate": 0.0006710234490243412, + "loss": 0.0227, + "num_input_tokens_seen": 119047920, + "step": 55170 + }, + { + "epoch": 9.0, + "eval_loss": 0.12271988391876221, + "eval_runtime": 104.4228, + "eval_samples_per_second": 26.096, + "eval_steps_per_second": 6.531, + "num_input_tokens_seen": 119047920, + "step": 55170 + }, + { + "epoch": 9.000815660685156, + "grad_norm": 0.10889197140932083, + "learning_rate": 0.0006709565608306212, + "loss": 0.0521, + "num_input_tokens_seen": 119058096, + "step": 55175 + }, + { + "epoch": 9.00163132137031, + "grad_norm": 0.24275128543376923, + "learning_rate": 0.0006708896691722495, + "loss": 0.094, + "num_input_tokens_seen": 119068080, + "step": 55180 + }, + { + "epoch": 9.002446982055465, + "grad_norm": 0.1689465492963791, + "learning_rate": 0.0006708227740505822, + "loss": 0.0805, + "num_input_tokens_seen": 119078576, + "step": 55185 + }, + { + "epoch": 9.00326264274062, + "grad_norm": 0.06446880847215652, + "learning_rate": 0.0006707558754669744, + "loss": 0.1234, + "num_input_tokens_seen": 119089808, + "step": 55190 + }, + { + "epoch": 9.004078303425775, + "grad_norm": 0.3902437388896942, + "learning_rate": 0.0006706889734227823, + "loss": 0.1088, + "num_input_tokens_seen": 119100208, + "step": 55195 + }, + { + "epoch": 9.00489396411093, + "grad_norm": 0.007276099640876055, + "learning_rate": 0.0006706220679193614, + "loss": 0.0903, + "num_input_tokens_seen": 119111632, + "step": 55200 + }, + { + "epoch": 9.005709624796085, + "grad_norm": 0.012900949455797672, + "learning_rate": 0.000670555158958068, + "loss": 0.0281, + "num_input_tokens_seen": 119122608, + "step": 55205 + }, + { + "epoch": 9.00652528548124, + "grad_norm": 0.013819210231304169, + "learning_rate": 0.0006704882465402579, + "loss": 0.0162, + "num_input_tokens_seen": 119132976, + "step": 55210 + }, + { + "epoch": 9.007340946166394, + "grad_norm": 0.05992760509252548, + "learning_rate": 0.0006704213306672873, + "loss": 0.0185, + "num_input_tokens_seen": 119143984, + "step": 55215 + }, + { + "epoch": 9.00815660685155, + "grad_norm": 0.003830722998827696, + "learning_rate": 0.0006703544113405122, + "loss": 0.0548, + "num_input_tokens_seen": 119154256, + "step": 55220 + }, + { + "epoch": 9.008972267536704, + "grad_norm": 0.03900685906410217, + "learning_rate": 0.0006702874885612887, + "loss": 0.1419, + "num_input_tokens_seen": 119165712, + "step": 55225 + }, + { + "epoch": 9.00978792822186, + "grad_norm": 0.02168833278119564, + "learning_rate": 0.0006702205623309734, + "loss": 0.0448, + "num_input_tokens_seen": 119176496, + "step": 55230 + }, + { + "epoch": 9.010603588907015, + "grad_norm": 0.0781978890299797, + "learning_rate": 0.0006701536326509224, + "loss": 0.0462, + "num_input_tokens_seen": 119186640, + "step": 55235 + }, + { + "epoch": 9.01141924959217, + "grad_norm": 0.13815197348594666, + "learning_rate": 0.0006700866995224921, + "loss": 0.1292, + "num_input_tokens_seen": 119197680, + "step": 55240 + }, + { + "epoch": 9.012234910277325, + "grad_norm": 0.032873354852199554, + "learning_rate": 0.0006700197629470393, + "loss": 0.051, + "num_input_tokens_seen": 119208080, + "step": 55245 + }, + { + "epoch": 9.013050570962479, + "grad_norm": 0.1965385228395462, + "learning_rate": 0.00066995282292592, + "loss": 0.0546, + "num_input_tokens_seen": 119218896, + "step": 55250 + }, + { + "epoch": 9.013866231647635, + "grad_norm": 0.18614423274993896, + "learning_rate": 0.0006698858794604914, + "loss": 0.1994, + "num_input_tokens_seen": 119230032, + "step": 55255 + }, + { + "epoch": 9.01468189233279, + "grad_norm": 0.050490181893110275, + "learning_rate": 0.0006698189325521097, + "loss": 0.0182, + "num_input_tokens_seen": 119240112, + "step": 55260 + }, + { + "epoch": 9.015497553017944, + "grad_norm": 0.01280243881046772, + "learning_rate": 0.000669751982202132, + "loss": 0.1549, + "num_input_tokens_seen": 119251184, + "step": 55265 + }, + { + "epoch": 9.0163132137031, + "grad_norm": 0.2053099274635315, + "learning_rate": 0.0006696850284119151, + "loss": 0.0289, + "num_input_tokens_seen": 119263216, + "step": 55270 + }, + { + "epoch": 9.017128874388254, + "grad_norm": 0.004310148768126965, + "learning_rate": 0.0006696180711828159, + "loss": 0.0195, + "num_input_tokens_seen": 119272112, + "step": 55275 + }, + { + "epoch": 9.01794453507341, + "grad_norm": 0.11303390562534332, + "learning_rate": 0.0006695511105161913, + "loss": 0.0753, + "num_input_tokens_seen": 119282352, + "step": 55280 + }, + { + "epoch": 9.018760195758565, + "grad_norm": 0.1404087245464325, + "learning_rate": 0.0006694841464133981, + "loss": 0.0981, + "num_input_tokens_seen": 119293488, + "step": 55285 + }, + { + "epoch": 9.01957585644372, + "grad_norm": 0.2206343114376068, + "learning_rate": 0.0006694171788757939, + "loss": 0.0588, + "num_input_tokens_seen": 119304592, + "step": 55290 + }, + { + "epoch": 9.020391517128875, + "grad_norm": 0.10807430744171143, + "learning_rate": 0.0006693502079047356, + "loss": 0.0855, + "num_input_tokens_seen": 119315088, + "step": 55295 + }, + { + "epoch": 9.021207177814029, + "grad_norm": 0.2664336860179901, + "learning_rate": 0.0006692832335015806, + "loss": 0.1048, + "num_input_tokens_seen": 119326416, + "step": 55300 + }, + { + "epoch": 9.022022838499185, + "grad_norm": 0.043990276753902435, + "learning_rate": 0.000669216255667686, + "loss": 0.0516, + "num_input_tokens_seen": 119337296, + "step": 55305 + }, + { + "epoch": 9.022838499184338, + "grad_norm": 0.1280667930841446, + "learning_rate": 0.0006691492744044093, + "loss": 0.0691, + "num_input_tokens_seen": 119347824, + "step": 55310 + }, + { + "epoch": 9.023654159869494, + "grad_norm": 0.007074454799294472, + "learning_rate": 0.000669082289713108, + "loss": 0.0194, + "num_input_tokens_seen": 119359216, + "step": 55315 + }, + { + "epoch": 9.02446982055465, + "grad_norm": 0.017109306529164314, + "learning_rate": 0.0006690153015951397, + "loss": 0.1009, + "num_input_tokens_seen": 119370800, + "step": 55320 + }, + { + "epoch": 9.025285481239804, + "grad_norm": 0.05777256563305855, + "learning_rate": 0.0006689483100518617, + "loss": 0.1082, + "num_input_tokens_seen": 119380944, + "step": 55325 + }, + { + "epoch": 9.02610114192496, + "grad_norm": 0.0038797534070909023, + "learning_rate": 0.000668881315084632, + "loss": 0.027, + "num_input_tokens_seen": 119390608, + "step": 55330 + }, + { + "epoch": 9.026916802610113, + "grad_norm": 0.01901327073574066, + "learning_rate": 0.0006688143166948082, + "loss": 0.0412, + "num_input_tokens_seen": 119401328, + "step": 55335 + }, + { + "epoch": 9.02773246329527, + "grad_norm": 0.009108876809477806, + "learning_rate": 0.0006687473148837482, + "loss": 0.0253, + "num_input_tokens_seen": 119411472, + "step": 55340 + }, + { + "epoch": 9.028548123980425, + "grad_norm": 0.010530885308980942, + "learning_rate": 0.0006686803096528096, + "loss": 0.0203, + "num_input_tokens_seen": 119423152, + "step": 55345 + }, + { + "epoch": 9.029363784665579, + "grad_norm": 0.013790972530841827, + "learning_rate": 0.0006686133010033507, + "loss": 0.057, + "num_input_tokens_seen": 119434992, + "step": 55350 + }, + { + "epoch": 9.030179445350734, + "grad_norm": 0.15513859689235687, + "learning_rate": 0.0006685462889367293, + "loss": 0.1513, + "num_input_tokens_seen": 119445520, + "step": 55355 + }, + { + "epoch": 9.030995106035888, + "grad_norm": 0.1698751300573349, + "learning_rate": 0.0006684792734543036, + "loss": 0.0448, + "num_input_tokens_seen": 119456080, + "step": 55360 + }, + { + "epoch": 9.031810766721044, + "grad_norm": 0.0450487919151783, + "learning_rate": 0.0006684122545574315, + "loss": 0.0343, + "num_input_tokens_seen": 119467504, + "step": 55365 + }, + { + "epoch": 9.0326264274062, + "grad_norm": 0.2076319456100464, + "learning_rate": 0.0006683452322474715, + "loss": 0.1025, + "num_input_tokens_seen": 119478896, + "step": 55370 + }, + { + "epoch": 9.033442088091354, + "grad_norm": 0.0038757864385843277, + "learning_rate": 0.0006682782065257818, + "loss": 0.0283, + "num_input_tokens_seen": 119489840, + "step": 55375 + }, + { + "epoch": 9.03425774877651, + "grad_norm": 0.011814058758318424, + "learning_rate": 0.000668211177393721, + "loss": 0.0268, + "num_input_tokens_seen": 119499472, + "step": 55380 + }, + { + "epoch": 9.035073409461663, + "grad_norm": 0.22143588960170746, + "learning_rate": 0.0006681441448526471, + "loss": 0.0877, + "num_input_tokens_seen": 119510768, + "step": 55385 + }, + { + "epoch": 9.035889070146819, + "grad_norm": 0.005610577762126923, + "learning_rate": 0.0006680771089039188, + "loss": 0.0301, + "num_input_tokens_seen": 119521968, + "step": 55390 + }, + { + "epoch": 9.036704730831975, + "grad_norm": 0.001646176096983254, + "learning_rate": 0.0006680100695488946, + "loss": 0.016, + "num_input_tokens_seen": 119532912, + "step": 55395 + }, + { + "epoch": 9.037520391517129, + "grad_norm": 0.01902046427130699, + "learning_rate": 0.0006679430267889332, + "loss": 0.0105, + "num_input_tokens_seen": 119544208, + "step": 55400 + }, + { + "epoch": 9.038336052202284, + "grad_norm": 0.03161191940307617, + "learning_rate": 0.0006678759806253933, + "loss": 0.028, + "num_input_tokens_seen": 119554960, + "step": 55405 + }, + { + "epoch": 9.039151712887438, + "grad_norm": 0.008146477863192558, + "learning_rate": 0.0006678089310596339, + "loss": 0.0598, + "num_input_tokens_seen": 119566224, + "step": 55410 + }, + { + "epoch": 9.039967373572594, + "grad_norm": 0.00622534891590476, + "learning_rate": 0.0006677418780930136, + "loss": 0.0655, + "num_input_tokens_seen": 119577424, + "step": 55415 + }, + { + "epoch": 9.040783034257748, + "grad_norm": 0.22823213040828705, + "learning_rate": 0.0006676748217268912, + "loss": 0.0245, + "num_input_tokens_seen": 119588752, + "step": 55420 + }, + { + "epoch": 9.041598694942904, + "grad_norm": 0.008287857286632061, + "learning_rate": 0.0006676077619626259, + "loss": 0.0102, + "num_input_tokens_seen": 119600048, + "step": 55425 + }, + { + "epoch": 9.04241435562806, + "grad_norm": 0.23186719417572021, + "learning_rate": 0.0006675406988015766, + "loss": 0.0593, + "num_input_tokens_seen": 119608944, + "step": 55430 + }, + { + "epoch": 9.043230016313213, + "grad_norm": 0.00419975770637393, + "learning_rate": 0.0006674736322451027, + "loss": 0.0593, + "num_input_tokens_seen": 119618512, + "step": 55435 + }, + { + "epoch": 9.044045676998369, + "grad_norm": 0.007882971316576004, + "learning_rate": 0.000667406562294563, + "loss": 0.0113, + "num_input_tokens_seen": 119629040, + "step": 55440 + }, + { + "epoch": 9.044861337683523, + "grad_norm": 0.003936780616641045, + "learning_rate": 0.0006673394889513169, + "loss": 0.0262, + "num_input_tokens_seen": 119639248, + "step": 55445 + }, + { + "epoch": 9.045676998368679, + "grad_norm": 0.0037879920564591885, + "learning_rate": 0.000667272412216724, + "loss": 0.0301, + "num_input_tokens_seen": 119648848, + "step": 55450 + }, + { + "epoch": 9.046492659053834, + "grad_norm": 0.04175178334116936, + "learning_rate": 0.0006672053320921433, + "loss": 0.0077, + "num_input_tokens_seen": 119660624, + "step": 55455 + }, + { + "epoch": 9.047308319738988, + "grad_norm": 0.018747244030237198, + "learning_rate": 0.0006671382485789344, + "loss": 0.0161, + "num_input_tokens_seen": 119671120, + "step": 55460 + }, + { + "epoch": 9.048123980424144, + "grad_norm": 0.0038657153490930796, + "learning_rate": 0.0006670711616784571, + "loss": 0.1691, + "num_input_tokens_seen": 119680720, + "step": 55465 + }, + { + "epoch": 9.048939641109298, + "grad_norm": 0.01648629829287529, + "learning_rate": 0.0006670040713920704, + "loss": 0.0218, + "num_input_tokens_seen": 119690896, + "step": 55470 + }, + { + "epoch": 9.049755301794454, + "grad_norm": 0.360784649848938, + "learning_rate": 0.0006669369777211344, + "loss": 0.0481, + "num_input_tokens_seen": 119699824, + "step": 55475 + }, + { + "epoch": 9.05057096247961, + "grad_norm": 0.15032315254211426, + "learning_rate": 0.000666869880667009, + "loss": 0.1581, + "num_input_tokens_seen": 119709552, + "step": 55480 + }, + { + "epoch": 9.051386623164763, + "grad_norm": 0.0013978988863527775, + "learning_rate": 0.0006668027802310537, + "loss": 0.006, + "num_input_tokens_seen": 119721136, + "step": 55485 + }, + { + "epoch": 9.052202283849919, + "grad_norm": 0.08294779807329178, + "learning_rate": 0.0006667356764146284, + "loss": 0.0174, + "num_input_tokens_seen": 119731408, + "step": 55490 + }, + { + "epoch": 9.053017944535073, + "grad_norm": 0.0013823151821270585, + "learning_rate": 0.0006666685692190931, + "loss": 0.1114, + "num_input_tokens_seen": 119743728, + "step": 55495 + }, + { + "epoch": 9.053833605220229, + "grad_norm": 0.053627289831638336, + "learning_rate": 0.0006666014586458079, + "loss": 0.1328, + "num_input_tokens_seen": 119754992, + "step": 55500 + }, + { + "epoch": 9.054649265905383, + "grad_norm": 0.014996238984167576, + "learning_rate": 0.0006665343446961327, + "loss": 0.0796, + "num_input_tokens_seen": 119765936, + "step": 55505 + }, + { + "epoch": 9.055464926590538, + "grad_norm": 0.0019899660255759954, + "learning_rate": 0.0006664672273714278, + "loss": 0.0016, + "num_input_tokens_seen": 119776688, + "step": 55510 + }, + { + "epoch": 9.056280587275694, + "grad_norm": 0.29615291953086853, + "learning_rate": 0.0006664001066730532, + "loss": 0.0282, + "num_input_tokens_seen": 119787088, + "step": 55515 + }, + { + "epoch": 9.057096247960848, + "grad_norm": 0.003651589620858431, + "learning_rate": 0.0006663329826023696, + "loss": 0.0428, + "num_input_tokens_seen": 119798576, + "step": 55520 + }, + { + "epoch": 9.057911908646004, + "grad_norm": 0.4569724500179291, + "learning_rate": 0.000666265855160737, + "loss": 0.0995, + "num_input_tokens_seen": 119809424, + "step": 55525 + }, + { + "epoch": 9.058727569331158, + "grad_norm": 0.606597363948822, + "learning_rate": 0.0006661987243495159, + "loss": 0.017, + "num_input_tokens_seen": 119821232, + "step": 55530 + }, + { + "epoch": 9.059543230016313, + "grad_norm": 0.0005566801992245018, + "learning_rate": 0.0006661315901700668, + "loss": 0.0496, + "num_input_tokens_seen": 119832688, + "step": 55535 + }, + { + "epoch": 9.060358890701469, + "grad_norm": 0.026531517505645752, + "learning_rate": 0.0006660644526237502, + "loss": 0.1826, + "num_input_tokens_seen": 119842736, + "step": 55540 + }, + { + "epoch": 9.061174551386623, + "grad_norm": 0.0044412086717784405, + "learning_rate": 0.0006659973117119269, + "loss": 0.0042, + "num_input_tokens_seen": 119853808, + "step": 55545 + }, + { + "epoch": 9.061990212071779, + "grad_norm": 0.024934332817792892, + "learning_rate": 0.0006659301674359575, + "loss": 0.1533, + "num_input_tokens_seen": 119864976, + "step": 55550 + }, + { + "epoch": 9.062805872756933, + "grad_norm": 0.0013933856971561909, + "learning_rate": 0.0006658630197972027, + "loss": 0.0136, + "num_input_tokens_seen": 119875312, + "step": 55555 + }, + { + "epoch": 9.063621533442088, + "grad_norm": 0.018193677067756653, + "learning_rate": 0.0006657958687970233, + "loss": 0.0261, + "num_input_tokens_seen": 119886704, + "step": 55560 + }, + { + "epoch": 9.064437194127244, + "grad_norm": 0.8735257983207703, + "learning_rate": 0.0006657287144367805, + "loss": 0.0377, + "num_input_tokens_seen": 119897424, + "step": 55565 + }, + { + "epoch": 9.065252854812398, + "grad_norm": 0.5183881521224976, + "learning_rate": 0.000665661556717835, + "loss": 0.0888, + "num_input_tokens_seen": 119909520, + "step": 55570 + }, + { + "epoch": 9.066068515497554, + "grad_norm": 0.0076085166074335575, + "learning_rate": 0.0006655943956415479, + "loss": 0.0999, + "num_input_tokens_seen": 119919056, + "step": 55575 + }, + { + "epoch": 9.066884176182707, + "grad_norm": 0.0768093466758728, + "learning_rate": 0.0006655272312092802, + "loss": 0.0258, + "num_input_tokens_seen": 119930448, + "step": 55580 + }, + { + "epoch": 9.067699836867863, + "grad_norm": 0.007505636662244797, + "learning_rate": 0.0006654600634223933, + "loss": 0.0091, + "num_input_tokens_seen": 119939824, + "step": 55585 + }, + { + "epoch": 9.068515497553017, + "grad_norm": 0.012156573124229908, + "learning_rate": 0.0006653928922822482, + "loss": 0.0157, + "num_input_tokens_seen": 119951152, + "step": 55590 + }, + { + "epoch": 9.069331158238173, + "grad_norm": 0.25301066040992737, + "learning_rate": 0.0006653257177902063, + "loss": 0.0941, + "num_input_tokens_seen": 119961104, + "step": 55595 + }, + { + "epoch": 9.070146818923329, + "grad_norm": 0.12305103987455368, + "learning_rate": 0.0006652585399476292, + "loss": 0.0688, + "num_input_tokens_seen": 119971920, + "step": 55600 + }, + { + "epoch": 9.070962479608482, + "grad_norm": 0.0029796783346682787, + "learning_rate": 0.000665191358755878, + "loss": 0.0208, + "num_input_tokens_seen": 119980944, + "step": 55605 + }, + { + "epoch": 9.071778140293638, + "grad_norm": 0.028895268216729164, + "learning_rate": 0.0006651241742163143, + "loss": 0.0885, + "num_input_tokens_seen": 119991792, + "step": 55610 + }, + { + "epoch": 9.072593800978792, + "grad_norm": 0.24953079223632812, + "learning_rate": 0.0006650569863302999, + "loss": 0.0387, + "num_input_tokens_seen": 120002064, + "step": 55615 + }, + { + "epoch": 9.073409461663948, + "grad_norm": 0.12838436663150787, + "learning_rate": 0.0006649897950991962, + "loss": 0.1012, + "num_input_tokens_seen": 120012560, + "step": 55620 + }, + { + "epoch": 9.074225122349104, + "grad_norm": 0.005463998299092054, + "learning_rate": 0.000664922600524365, + "loss": 0.0476, + "num_input_tokens_seen": 120023568, + "step": 55625 + }, + { + "epoch": 9.075040783034257, + "grad_norm": 0.0008604776812717319, + "learning_rate": 0.000664855402607168, + "loss": 0.2043, + "num_input_tokens_seen": 120034448, + "step": 55630 + }, + { + "epoch": 9.075856443719413, + "grad_norm": 0.10134922713041306, + "learning_rate": 0.0006647882013489674, + "loss": 0.0346, + "num_input_tokens_seen": 120046096, + "step": 55635 + }, + { + "epoch": 9.076672104404567, + "grad_norm": 0.0031818451825529337, + "learning_rate": 0.0006647209967511245, + "loss": 0.0081, + "num_input_tokens_seen": 120055344, + "step": 55640 + }, + { + "epoch": 9.077487765089723, + "grad_norm": 0.04806315898895264, + "learning_rate": 0.0006646537888150019, + "loss": 0.028, + "num_input_tokens_seen": 120065808, + "step": 55645 + }, + { + "epoch": 9.078303425774878, + "grad_norm": 0.008393410593271255, + "learning_rate": 0.0006645865775419613, + "loss": 0.0568, + "num_input_tokens_seen": 120075472, + "step": 55650 + }, + { + "epoch": 9.079119086460032, + "grad_norm": 0.013816767372190952, + "learning_rate": 0.0006645193629333649, + "loss": 0.0323, + "num_input_tokens_seen": 120085072, + "step": 55655 + }, + { + "epoch": 9.079934747145188, + "grad_norm": 0.010044675320386887, + "learning_rate": 0.0006644521449905749, + "loss": 0.1001, + "num_input_tokens_seen": 120096304, + "step": 55660 + }, + { + "epoch": 9.080750407830342, + "grad_norm": 0.007965527474880219, + "learning_rate": 0.0006643849237149536, + "loss": 0.007, + "num_input_tokens_seen": 120105968, + "step": 55665 + }, + { + "epoch": 9.081566068515498, + "grad_norm": 0.006031613796949387, + "learning_rate": 0.0006643176991078632, + "loss": 0.005, + "num_input_tokens_seen": 120117360, + "step": 55670 + }, + { + "epoch": 9.082381729200652, + "grad_norm": 0.00902215950191021, + "learning_rate": 0.0006642504711706663, + "loss": 0.0146, + "num_input_tokens_seen": 120128464, + "step": 55675 + }, + { + "epoch": 9.083197389885807, + "grad_norm": 0.27233919501304626, + "learning_rate": 0.000664183239904725, + "loss": 0.0413, + "num_input_tokens_seen": 120139440, + "step": 55680 + }, + { + "epoch": 9.084013050570963, + "grad_norm": 0.02728142961859703, + "learning_rate": 0.0006641160053114021, + "loss": 0.023, + "num_input_tokens_seen": 120149392, + "step": 55685 + }, + { + "epoch": 9.084828711256117, + "grad_norm": 0.20649556815624237, + "learning_rate": 0.0006640487673920605, + "loss": 0.1093, + "num_input_tokens_seen": 120160848, + "step": 55690 + }, + { + "epoch": 9.085644371941273, + "grad_norm": 0.007081093732267618, + "learning_rate": 0.0006639815261480622, + "loss": 0.0152, + "num_input_tokens_seen": 120172368, + "step": 55695 + }, + { + "epoch": 9.086460032626427, + "grad_norm": 0.06534375250339508, + "learning_rate": 0.0006639142815807704, + "loss": 0.0355, + "num_input_tokens_seen": 120183184, + "step": 55700 + }, + { + "epoch": 9.087275693311582, + "grad_norm": 0.16578392684459686, + "learning_rate": 0.0006638470336915477, + "loss": 0.0724, + "num_input_tokens_seen": 120194480, + "step": 55705 + }, + { + "epoch": 9.088091353996738, + "grad_norm": 0.004848463926464319, + "learning_rate": 0.0006637797824817569, + "loss": 0.083, + "num_input_tokens_seen": 120205616, + "step": 55710 + }, + { + "epoch": 9.088907014681892, + "grad_norm": 0.03432942554354668, + "learning_rate": 0.000663712527952761, + "loss": 0.015, + "num_input_tokens_seen": 120216496, + "step": 55715 + }, + { + "epoch": 9.089722675367048, + "grad_norm": 0.03223681449890137, + "learning_rate": 0.0006636452701059232, + "loss": 0.1111, + "num_input_tokens_seen": 120226896, + "step": 55720 + }, + { + "epoch": 9.090538336052202, + "grad_norm": 0.010801179334521294, + "learning_rate": 0.0006635780089426065, + "loss": 0.0059, + "num_input_tokens_seen": 120236400, + "step": 55725 + }, + { + "epoch": 9.091353996737357, + "grad_norm": 0.06588435173034668, + "learning_rate": 0.0006635107444641737, + "loss": 0.0102, + "num_input_tokens_seen": 120247920, + "step": 55730 + }, + { + "epoch": 9.092169657422513, + "grad_norm": 0.0023316836450248957, + "learning_rate": 0.0006634434766719883, + "loss": 0.016, + "num_input_tokens_seen": 120258512, + "step": 55735 + }, + { + "epoch": 9.092985318107667, + "grad_norm": 0.36640992760658264, + "learning_rate": 0.0006633762055674136, + "loss": 0.0401, + "num_input_tokens_seen": 120270032, + "step": 55740 + }, + { + "epoch": 9.093800978792823, + "grad_norm": 0.0031470723915845156, + "learning_rate": 0.0006633089311518128, + "loss": 0.0106, + "num_input_tokens_seen": 120280816, + "step": 55745 + }, + { + "epoch": 9.094616639477977, + "grad_norm": 0.11641258001327515, + "learning_rate": 0.0006632416534265493, + "loss": 0.0145, + "num_input_tokens_seen": 120291664, + "step": 55750 + }, + { + "epoch": 9.095432300163132, + "grad_norm": 0.07135124504566193, + "learning_rate": 0.0006631743723929867, + "loss": 0.0947, + "num_input_tokens_seen": 120302320, + "step": 55755 + }, + { + "epoch": 9.096247960848286, + "grad_norm": 0.005159761756658554, + "learning_rate": 0.0006631070880524883, + "loss": 0.0126, + "num_input_tokens_seen": 120312688, + "step": 55760 + }, + { + "epoch": 9.097063621533442, + "grad_norm": 0.025245938450098038, + "learning_rate": 0.0006630398004064179, + "loss": 0.0117, + "num_input_tokens_seen": 120322928, + "step": 55765 + }, + { + "epoch": 9.097879282218598, + "grad_norm": 0.016673002392053604, + "learning_rate": 0.0006629725094561392, + "loss": 0.0147, + "num_input_tokens_seen": 120332080, + "step": 55770 + }, + { + "epoch": 9.098694942903752, + "grad_norm": 0.04818960279226303, + "learning_rate": 0.0006629052152030158, + "loss": 0.1546, + "num_input_tokens_seen": 120344400, + "step": 55775 + }, + { + "epoch": 9.099510603588907, + "grad_norm": 0.07804453372955322, + "learning_rate": 0.0006628379176484115, + "loss": 0.0211, + "num_input_tokens_seen": 120355344, + "step": 55780 + }, + { + "epoch": 9.100326264274061, + "grad_norm": 0.0631583109498024, + "learning_rate": 0.0006627706167936903, + "loss": 0.0136, + "num_input_tokens_seen": 120366640, + "step": 55785 + }, + { + "epoch": 9.101141924959217, + "grad_norm": 0.006075733341276646, + "learning_rate": 0.0006627033126402159, + "loss": 0.0285, + "num_input_tokens_seen": 120377584, + "step": 55790 + }, + { + "epoch": 9.101957585644373, + "grad_norm": 0.0008804178214631975, + "learning_rate": 0.0006626360051893526, + "loss": 0.0054, + "num_input_tokens_seen": 120388368, + "step": 55795 + }, + { + "epoch": 9.102773246329527, + "grad_norm": 0.007555731106549501, + "learning_rate": 0.0006625686944424642, + "loss": 0.0085, + "num_input_tokens_seen": 120399312, + "step": 55800 + }, + { + "epoch": 9.103588907014682, + "grad_norm": 0.027117138728499413, + "learning_rate": 0.0006625013804009152, + "loss": 0.0166, + "num_input_tokens_seen": 120409616, + "step": 55805 + }, + { + "epoch": 9.104404567699836, + "grad_norm": 0.2197260707616806, + "learning_rate": 0.0006624340630660695, + "loss": 0.1294, + "num_input_tokens_seen": 120421424, + "step": 55810 + }, + { + "epoch": 9.105220228384992, + "grad_norm": 0.02026260830461979, + "learning_rate": 0.0006623667424392914, + "loss": 0.0081, + "num_input_tokens_seen": 120431088, + "step": 55815 + }, + { + "epoch": 9.106035889070148, + "grad_norm": 0.028628699481487274, + "learning_rate": 0.0006622994185219453, + "loss": 0.0126, + "num_input_tokens_seen": 120442320, + "step": 55820 + }, + { + "epoch": 9.106851549755302, + "grad_norm": 0.45490407943725586, + "learning_rate": 0.0006622320913153957, + "loss": 0.0632, + "num_input_tokens_seen": 120453104, + "step": 55825 + }, + { + "epoch": 9.107667210440457, + "grad_norm": 0.00560240400955081, + "learning_rate": 0.0006621647608210068, + "loss": 0.0248, + "num_input_tokens_seen": 120463536, + "step": 55830 + }, + { + "epoch": 9.108482871125611, + "grad_norm": 0.009768038056790829, + "learning_rate": 0.0006620974270401434, + "loss": 0.0443, + "num_input_tokens_seen": 120474672, + "step": 55835 + }, + { + "epoch": 9.109298531810767, + "grad_norm": 0.010903925634920597, + "learning_rate": 0.00066203008997417, + "loss": 0.0099, + "num_input_tokens_seen": 120484208, + "step": 55840 + }, + { + "epoch": 9.11011419249592, + "grad_norm": 0.2930833101272583, + "learning_rate": 0.0006619627496244513, + "loss": 0.1382, + "num_input_tokens_seen": 120494896, + "step": 55845 + }, + { + "epoch": 9.110929853181077, + "grad_norm": 0.04355636239051819, + "learning_rate": 0.0006618954059923517, + "loss": 0.0079, + "num_input_tokens_seen": 120505936, + "step": 55850 + }, + { + "epoch": 9.111745513866232, + "grad_norm": 0.04374802112579346, + "learning_rate": 0.0006618280590792367, + "loss": 0.0069, + "num_input_tokens_seen": 120516944, + "step": 55855 + }, + { + "epoch": 9.112561174551386, + "grad_norm": 0.323953241109848, + "learning_rate": 0.0006617607088864706, + "loss": 0.0295, + "num_input_tokens_seen": 120527344, + "step": 55860 + }, + { + "epoch": 9.113376835236542, + "grad_norm": 0.006870036944746971, + "learning_rate": 0.0006616933554154186, + "loss": 0.009, + "num_input_tokens_seen": 120537520, + "step": 55865 + }, + { + "epoch": 9.114192495921696, + "grad_norm": 0.2186199277639389, + "learning_rate": 0.0006616259986674456, + "loss": 0.1271, + "num_input_tokens_seen": 120549072, + "step": 55870 + }, + { + "epoch": 9.115008156606851, + "grad_norm": 0.45908308029174805, + "learning_rate": 0.0006615586386439169, + "loss": 0.0507, + "num_input_tokens_seen": 120559888, + "step": 55875 + }, + { + "epoch": 9.115823817292007, + "grad_norm": 0.07021364569664001, + "learning_rate": 0.0006614912753461973, + "loss": 0.023, + "num_input_tokens_seen": 120571504, + "step": 55880 + }, + { + "epoch": 9.116639477977161, + "grad_norm": 0.2328910529613495, + "learning_rate": 0.0006614239087756519, + "loss": 0.0506, + "num_input_tokens_seen": 120581456, + "step": 55885 + }, + { + "epoch": 9.117455138662317, + "grad_norm": 0.01687910407781601, + "learning_rate": 0.0006613565389336465, + "loss": 0.0522, + "num_input_tokens_seen": 120592080, + "step": 55890 + }, + { + "epoch": 9.11827079934747, + "grad_norm": 0.14432257413864136, + "learning_rate": 0.0006612891658215461, + "loss": 0.0412, + "num_input_tokens_seen": 120602128, + "step": 55895 + }, + { + "epoch": 9.119086460032626, + "grad_norm": 0.047510210424661636, + "learning_rate": 0.000661221789440716, + "loss": 0.0051, + "num_input_tokens_seen": 120614352, + "step": 55900 + }, + { + "epoch": 9.119902120717782, + "grad_norm": 0.09579040855169296, + "learning_rate": 0.0006611544097925219, + "loss": 0.012, + "num_input_tokens_seen": 120625424, + "step": 55905 + }, + { + "epoch": 9.120717781402936, + "grad_norm": 0.07016277313232422, + "learning_rate": 0.0006610870268783292, + "loss": 0.0125, + "num_input_tokens_seen": 120635536, + "step": 55910 + }, + { + "epoch": 9.121533442088092, + "grad_norm": 0.005467739421874285, + "learning_rate": 0.0006610196406995038, + "loss": 0.1681, + "num_input_tokens_seen": 120646064, + "step": 55915 + }, + { + "epoch": 9.122349102773246, + "grad_norm": 0.009251498617231846, + "learning_rate": 0.0006609522512574107, + "loss": 0.0019, + "num_input_tokens_seen": 120657072, + "step": 55920 + }, + { + "epoch": 9.123164763458401, + "grad_norm": 0.09190008789300919, + "learning_rate": 0.0006608848585534164, + "loss": 0.0479, + "num_input_tokens_seen": 120668816, + "step": 55925 + }, + { + "epoch": 9.123980424143557, + "grad_norm": 0.002916208701208234, + "learning_rate": 0.0006608174625888862, + "loss": 0.0796, + "num_input_tokens_seen": 120679248, + "step": 55930 + }, + { + "epoch": 9.124796084828711, + "grad_norm": 0.019650358706712723, + "learning_rate": 0.000660750063365186, + "loss": 0.0079, + "num_input_tokens_seen": 120690096, + "step": 55935 + }, + { + "epoch": 9.125611745513867, + "grad_norm": 0.18206822872161865, + "learning_rate": 0.000660682660883682, + "loss": 0.0568, + "num_input_tokens_seen": 120702352, + "step": 55940 + }, + { + "epoch": 9.12642740619902, + "grad_norm": 0.005770983174443245, + "learning_rate": 0.0006606152551457401, + "loss": 0.0584, + "num_input_tokens_seen": 120713072, + "step": 55945 + }, + { + "epoch": 9.127243066884176, + "grad_norm": 0.001713667530566454, + "learning_rate": 0.0006605478461527262, + "loss": 0.0404, + "num_input_tokens_seen": 120725200, + "step": 55950 + }, + { + "epoch": 9.12805872756933, + "grad_norm": 0.3524821400642395, + "learning_rate": 0.0006604804339060065, + "loss": 0.0367, + "num_input_tokens_seen": 120735696, + "step": 55955 + }, + { + "epoch": 9.128874388254486, + "grad_norm": 0.3251489996910095, + "learning_rate": 0.0006604130184069472, + "loss": 0.2022, + "num_input_tokens_seen": 120747344, + "step": 55960 + }, + { + "epoch": 9.129690048939642, + "grad_norm": 0.21691279113292694, + "learning_rate": 0.0006603455996569146, + "loss": 0.148, + "num_input_tokens_seen": 120758416, + "step": 55965 + }, + { + "epoch": 9.130505709624796, + "grad_norm": 0.32668566703796387, + "learning_rate": 0.0006602781776572752, + "loss": 0.1424, + "num_input_tokens_seen": 120769552, + "step": 55970 + }, + { + "epoch": 9.131321370309951, + "grad_norm": 0.016717370599508286, + "learning_rate": 0.000660210752409395, + "loss": 0.1095, + "num_input_tokens_seen": 120780656, + "step": 55975 + }, + { + "epoch": 9.132137030995105, + "grad_norm": 0.012981271371245384, + "learning_rate": 0.0006601433239146407, + "loss": 0.0363, + "num_input_tokens_seen": 120792048, + "step": 55980 + }, + { + "epoch": 9.132952691680261, + "grad_norm": 0.003394556464627385, + "learning_rate": 0.0006600758921743788, + "loss": 0.0743, + "num_input_tokens_seen": 120802224, + "step": 55985 + }, + { + "epoch": 9.133768352365417, + "grad_norm": 0.01148404460400343, + "learning_rate": 0.0006600084571899758, + "loss": 0.0347, + "num_input_tokens_seen": 120812720, + "step": 55990 + }, + { + "epoch": 9.13458401305057, + "grad_norm": 0.21401982009410858, + "learning_rate": 0.0006599410189627985, + "loss": 0.0407, + "num_input_tokens_seen": 120824880, + "step": 55995 + }, + { + "epoch": 9.135399673735726, + "grad_norm": 0.014696495607495308, + "learning_rate": 0.0006598735774942135, + "loss": 0.0137, + "num_input_tokens_seen": 120835632, + "step": 56000 + }, + { + "epoch": 9.13621533442088, + "grad_norm": 0.04026520252227783, + "learning_rate": 0.0006598061327855876, + "loss": 0.0237, + "num_input_tokens_seen": 120847984, + "step": 56005 + }, + { + "epoch": 9.137030995106036, + "grad_norm": 0.37597689032554626, + "learning_rate": 0.0006597386848382878, + "loss": 0.1283, + "num_input_tokens_seen": 120858832, + "step": 56010 + }, + { + "epoch": 9.137846655791192, + "grad_norm": 0.008638111874461174, + "learning_rate": 0.000659671233653681, + "loss": 0.0354, + "num_input_tokens_seen": 120869872, + "step": 56015 + }, + { + "epoch": 9.138662316476346, + "grad_norm": 0.012316303327679634, + "learning_rate": 0.0006596037792331338, + "loss": 0.0335, + "num_input_tokens_seen": 120879920, + "step": 56020 + }, + { + "epoch": 9.139477977161501, + "grad_norm": 0.001177173457108438, + "learning_rate": 0.0006595363215780137, + "loss": 0.06, + "num_input_tokens_seen": 120891088, + "step": 56025 + }, + { + "epoch": 9.140293637846655, + "grad_norm": 0.04190114885568619, + "learning_rate": 0.0006594688606896877, + "loss": 0.0098, + "num_input_tokens_seen": 120901968, + "step": 56030 + }, + { + "epoch": 9.141109298531811, + "grad_norm": 0.15887074172496796, + "learning_rate": 0.0006594013965695229, + "loss": 0.0265, + "num_input_tokens_seen": 120912176, + "step": 56035 + }, + { + "epoch": 9.141924959216965, + "grad_norm": 0.08688819408416748, + "learning_rate": 0.0006593339292188865, + "loss": 0.011, + "num_input_tokens_seen": 120923216, + "step": 56040 + }, + { + "epoch": 9.14274061990212, + "grad_norm": 0.1954217553138733, + "learning_rate": 0.0006592664586391461, + "loss": 0.129, + "num_input_tokens_seen": 120934448, + "step": 56045 + }, + { + "epoch": 9.143556280587276, + "grad_norm": 0.0045484998263418674, + "learning_rate": 0.0006591989848316687, + "loss": 0.016, + "num_input_tokens_seen": 120944880, + "step": 56050 + }, + { + "epoch": 9.14437194127243, + "grad_norm": 0.33619779348373413, + "learning_rate": 0.0006591315077978221, + "loss": 0.111, + "num_input_tokens_seen": 120956304, + "step": 56055 + }, + { + "epoch": 9.145187601957586, + "grad_norm": 0.05652850866317749, + "learning_rate": 0.0006590640275389734, + "loss": 0.0164, + "num_input_tokens_seen": 120966320, + "step": 56060 + }, + { + "epoch": 9.14600326264274, + "grad_norm": 0.042844172567129135, + "learning_rate": 0.0006589965440564905, + "loss": 0.0252, + "num_input_tokens_seen": 120977648, + "step": 56065 + }, + { + "epoch": 9.146818923327896, + "grad_norm": 0.004321992862969637, + "learning_rate": 0.000658929057351741, + "loss": 0.0359, + "num_input_tokens_seen": 120987728, + "step": 56070 + }, + { + "epoch": 9.147634584013051, + "grad_norm": 0.03357914090156555, + "learning_rate": 0.0006588615674260925, + "loss": 0.0453, + "num_input_tokens_seen": 120998640, + "step": 56075 + }, + { + "epoch": 9.148450244698205, + "grad_norm": 0.004383898340165615, + "learning_rate": 0.0006587940742809127, + "loss": 0.077, + "num_input_tokens_seen": 121008912, + "step": 56080 + }, + { + "epoch": 9.149265905383361, + "grad_norm": 0.05919858068227768, + "learning_rate": 0.0006587265779175696, + "loss": 0.0313, + "num_input_tokens_seen": 121019184, + "step": 56085 + }, + { + "epoch": 9.150081566068515, + "grad_norm": 0.24879035353660583, + "learning_rate": 0.0006586590783374311, + "loss": 0.0416, + "num_input_tokens_seen": 121031184, + "step": 56090 + }, + { + "epoch": 9.15089722675367, + "grad_norm": 0.3355010449886322, + "learning_rate": 0.000658591575541865, + "loss": 0.1476, + "num_input_tokens_seen": 121042672, + "step": 56095 + }, + { + "epoch": 9.151712887438826, + "grad_norm": 0.04353257641196251, + "learning_rate": 0.0006585240695322395, + "loss": 0.1174, + "num_input_tokens_seen": 121053520, + "step": 56100 + }, + { + "epoch": 9.15252854812398, + "grad_norm": 0.009970235638320446, + "learning_rate": 0.0006584565603099227, + "loss": 0.044, + "num_input_tokens_seen": 121064688, + "step": 56105 + }, + { + "epoch": 9.153344208809136, + "grad_norm": 0.06392436474561691, + "learning_rate": 0.0006583890478762824, + "loss": 0.1302, + "num_input_tokens_seen": 121075088, + "step": 56110 + }, + { + "epoch": 9.15415986949429, + "grad_norm": 0.013123875483870506, + "learning_rate": 0.0006583215322326874, + "loss": 0.1087, + "num_input_tokens_seen": 121085424, + "step": 56115 + }, + { + "epoch": 9.154975530179446, + "grad_norm": 0.11153491586446762, + "learning_rate": 0.0006582540133805056, + "loss": 0.0442, + "num_input_tokens_seen": 121097072, + "step": 56120 + }, + { + "epoch": 9.1557911908646, + "grad_norm": 0.19672666490077972, + "learning_rate": 0.0006581864913211055, + "loss": 0.0692, + "num_input_tokens_seen": 121108880, + "step": 56125 + }, + { + "epoch": 9.156606851549755, + "grad_norm": 0.061419516801834106, + "learning_rate": 0.0006581189660558554, + "loss": 0.0442, + "num_input_tokens_seen": 121120432, + "step": 56130 + }, + { + "epoch": 9.15742251223491, + "grad_norm": 0.11039623618125916, + "learning_rate": 0.000658051437586124, + "loss": 0.2184, + "num_input_tokens_seen": 121131952, + "step": 56135 + }, + { + "epoch": 9.158238172920065, + "grad_norm": 0.037578415125608444, + "learning_rate": 0.0006579839059132796, + "loss": 0.0355, + "num_input_tokens_seen": 121140304, + "step": 56140 + }, + { + "epoch": 9.15905383360522, + "grad_norm": 0.009670006111264229, + "learning_rate": 0.000657916371038691, + "loss": 0.0663, + "num_input_tokens_seen": 121150608, + "step": 56145 + }, + { + "epoch": 9.159869494290374, + "grad_norm": 0.010151606053113937, + "learning_rate": 0.0006578488329637268, + "loss": 0.0779, + "num_input_tokens_seen": 121161296, + "step": 56150 + }, + { + "epoch": 9.16068515497553, + "grad_norm": 0.02386097051203251, + "learning_rate": 0.0006577812916897558, + "loss": 0.0238, + "num_input_tokens_seen": 121172976, + "step": 56155 + }, + { + "epoch": 9.161500815660686, + "grad_norm": 0.004823227412998676, + "learning_rate": 0.0006577137472181466, + "loss": 0.0047, + "num_input_tokens_seen": 121183024, + "step": 56160 + }, + { + "epoch": 9.16231647634584, + "grad_norm": 0.27640271186828613, + "learning_rate": 0.0006576461995502682, + "loss": 0.1043, + "num_input_tokens_seen": 121194032, + "step": 56165 + }, + { + "epoch": 9.163132137030995, + "grad_norm": 0.02659352496266365, + "learning_rate": 0.0006575786486874897, + "loss": 0.0094, + "num_input_tokens_seen": 121205296, + "step": 56170 + }, + { + "epoch": 9.16394779771615, + "grad_norm": 0.01825234480202198, + "learning_rate": 0.0006575110946311801, + "loss": 0.0089, + "num_input_tokens_seen": 121217168, + "step": 56175 + }, + { + "epoch": 9.164763458401305, + "grad_norm": 0.33334025740623474, + "learning_rate": 0.0006574435373827083, + "loss": 0.289, + "num_input_tokens_seen": 121228176, + "step": 56180 + }, + { + "epoch": 9.16557911908646, + "grad_norm": 0.002409928711131215, + "learning_rate": 0.0006573759769434433, + "loss": 0.0241, + "num_input_tokens_seen": 121237968, + "step": 56185 + }, + { + "epoch": 9.166394779771615, + "grad_norm": 0.0017675644485279918, + "learning_rate": 0.0006573084133147547, + "loss": 0.0117, + "num_input_tokens_seen": 121248048, + "step": 56190 + }, + { + "epoch": 9.16721044045677, + "grad_norm": 0.004522443283349276, + "learning_rate": 0.0006572408464980115, + "loss": 0.054, + "num_input_tokens_seen": 121259088, + "step": 56195 + }, + { + "epoch": 9.168026101141924, + "grad_norm": 0.01234686840325594, + "learning_rate": 0.000657173276494583, + "loss": 0.1557, + "num_input_tokens_seen": 121270416, + "step": 56200 + }, + { + "epoch": 9.16884176182708, + "grad_norm": 0.007663615047931671, + "learning_rate": 0.0006571057033058386, + "loss": 0.0369, + "num_input_tokens_seen": 121281424, + "step": 56205 + }, + { + "epoch": 9.169657422512234, + "grad_norm": 0.24240346252918243, + "learning_rate": 0.000657038126933148, + "loss": 0.053, + "num_input_tokens_seen": 121292496, + "step": 56210 + }, + { + "epoch": 9.17047308319739, + "grad_norm": 0.029079364612698555, + "learning_rate": 0.0006569705473778804, + "loss": 0.0114, + "num_input_tokens_seen": 121303280, + "step": 56215 + }, + { + "epoch": 9.171288743882545, + "grad_norm": 0.12774725258350372, + "learning_rate": 0.0006569029646414055, + "loss": 0.0386, + "num_input_tokens_seen": 121315248, + "step": 56220 + }, + { + "epoch": 9.1721044045677, + "grad_norm": 0.3047543466091156, + "learning_rate": 0.0006568353787250931, + "loss": 0.0831, + "num_input_tokens_seen": 121326256, + "step": 56225 + }, + { + "epoch": 9.172920065252855, + "grad_norm": 0.008882143534719944, + "learning_rate": 0.0006567677896303127, + "loss": 0.0221, + "num_input_tokens_seen": 121337264, + "step": 56230 + }, + { + "epoch": 9.173735725938009, + "grad_norm": 0.05905358865857124, + "learning_rate": 0.0006567001973584343, + "loss": 0.0119, + "num_input_tokens_seen": 121347600, + "step": 56235 + }, + { + "epoch": 9.174551386623165, + "grad_norm": 0.0041689821518957615, + "learning_rate": 0.0006566326019108275, + "loss": 0.0828, + "num_input_tokens_seen": 121358800, + "step": 56240 + }, + { + "epoch": 9.17536704730832, + "grad_norm": 0.021673645824193954, + "learning_rate": 0.0006565650032888624, + "loss": 0.0234, + "num_input_tokens_seen": 121369712, + "step": 56245 + }, + { + "epoch": 9.176182707993474, + "grad_norm": 0.019392104819417, + "learning_rate": 0.0006564974014939088, + "loss": 0.0598, + "num_input_tokens_seen": 121380784, + "step": 56250 + }, + { + "epoch": 9.17699836867863, + "grad_norm": 0.008574801497161388, + "learning_rate": 0.0006564297965273369, + "loss": 0.0308, + "num_input_tokens_seen": 121392752, + "step": 56255 + }, + { + "epoch": 9.177814029363784, + "grad_norm": 0.01258694939315319, + "learning_rate": 0.0006563621883905167, + "loss": 0.0884, + "num_input_tokens_seen": 121402928, + "step": 56260 + }, + { + "epoch": 9.17862969004894, + "grad_norm": 0.004097871016710997, + "learning_rate": 0.0006562945770848183, + "loss": 0.0144, + "num_input_tokens_seen": 121413008, + "step": 56265 + }, + { + "epoch": 9.179445350734095, + "grad_norm": 0.0065926723182201385, + "learning_rate": 0.0006562269626116122, + "loss": 0.0095, + "num_input_tokens_seen": 121422896, + "step": 56270 + }, + { + "epoch": 9.18026101141925, + "grad_norm": 0.2966139316558838, + "learning_rate": 0.0006561593449722683, + "loss": 0.1212, + "num_input_tokens_seen": 121434512, + "step": 56275 + }, + { + "epoch": 9.181076672104405, + "grad_norm": 0.09313588589429855, + "learning_rate": 0.0006560917241681573, + "loss": 0.1063, + "num_input_tokens_seen": 121445840, + "step": 56280 + }, + { + "epoch": 9.181892332789559, + "grad_norm": 0.010000567883253098, + "learning_rate": 0.0006560241002006495, + "loss": 0.0268, + "num_input_tokens_seen": 121457264, + "step": 56285 + }, + { + "epoch": 9.182707993474715, + "grad_norm": 0.03887508437037468, + "learning_rate": 0.0006559564730711153, + "loss": 0.0147, + "num_input_tokens_seen": 121467728, + "step": 56290 + }, + { + "epoch": 9.18352365415987, + "grad_norm": 0.006694698240607977, + "learning_rate": 0.0006558888427809255, + "loss": 0.0212, + "num_input_tokens_seen": 121478608, + "step": 56295 + }, + { + "epoch": 9.184339314845024, + "grad_norm": 0.11072932928800583, + "learning_rate": 0.0006558212093314504, + "loss": 0.0131, + "num_input_tokens_seen": 121489488, + "step": 56300 + }, + { + "epoch": 9.18515497553018, + "grad_norm": 0.002785993507131934, + "learning_rate": 0.0006557535727240609, + "loss": 0.1641, + "num_input_tokens_seen": 121500400, + "step": 56305 + }, + { + "epoch": 9.185970636215334, + "grad_norm": 0.017555417492985725, + "learning_rate": 0.0006556859329601275, + "loss": 0.0206, + "num_input_tokens_seen": 121511088, + "step": 56310 + }, + { + "epoch": 9.18678629690049, + "grad_norm": 0.1419685035943985, + "learning_rate": 0.0006556182900410213, + "loss": 0.0903, + "num_input_tokens_seen": 121523088, + "step": 56315 + }, + { + "epoch": 9.187601957585644, + "grad_norm": 0.011668774299323559, + "learning_rate": 0.0006555506439681131, + "loss": 0.0597, + "num_input_tokens_seen": 121533808, + "step": 56320 + }, + { + "epoch": 9.1884176182708, + "grad_norm": 0.023576082661747932, + "learning_rate": 0.0006554829947427736, + "loss": 0.0365, + "num_input_tokens_seen": 121544240, + "step": 56325 + }, + { + "epoch": 9.189233278955955, + "grad_norm": 0.10271507501602173, + "learning_rate": 0.0006554153423663741, + "loss": 0.1661, + "num_input_tokens_seen": 121554992, + "step": 56330 + }, + { + "epoch": 9.190048939641109, + "grad_norm": 0.013610288500785828, + "learning_rate": 0.0006553476868402854, + "loss": 0.1257, + "num_input_tokens_seen": 121565424, + "step": 56335 + }, + { + "epoch": 9.190864600326265, + "grad_norm": 0.092350535094738, + "learning_rate": 0.0006552800281658789, + "loss": 0.0214, + "num_input_tokens_seen": 121575504, + "step": 56340 + }, + { + "epoch": 9.191680261011419, + "grad_norm": 0.032153088599443436, + "learning_rate": 0.0006552123663445255, + "loss": 0.0137, + "num_input_tokens_seen": 121586320, + "step": 56345 + }, + { + "epoch": 9.192495921696574, + "grad_norm": 0.046822525560855865, + "learning_rate": 0.0006551447013775967, + "loss": 0.0537, + "num_input_tokens_seen": 121596976, + "step": 56350 + }, + { + "epoch": 9.19331158238173, + "grad_norm": 0.05437963828444481, + "learning_rate": 0.0006550770332664637, + "loss": 0.0184, + "num_input_tokens_seen": 121607344, + "step": 56355 + }, + { + "epoch": 9.194127243066884, + "grad_norm": 0.018126392737030983, + "learning_rate": 0.0006550093620124979, + "loss": 0.0081, + "num_input_tokens_seen": 121617424, + "step": 56360 + }, + { + "epoch": 9.19494290375204, + "grad_norm": 0.23985566198825836, + "learning_rate": 0.0006549416876170707, + "loss": 0.0773, + "num_input_tokens_seen": 121629744, + "step": 56365 + }, + { + "epoch": 9.195758564437194, + "grad_norm": 0.2545243203639984, + "learning_rate": 0.0006548740100815537, + "loss": 0.1508, + "num_input_tokens_seen": 121641008, + "step": 56370 + }, + { + "epoch": 9.19657422512235, + "grad_norm": 0.04930659383535385, + "learning_rate": 0.0006548063294073183, + "loss": 0.0185, + "num_input_tokens_seen": 121652528, + "step": 56375 + }, + { + "epoch": 9.197389885807505, + "grad_norm": 0.053622763603925705, + "learning_rate": 0.0006547386455957364, + "loss": 0.0292, + "num_input_tokens_seen": 121663856, + "step": 56380 + }, + { + "epoch": 9.198205546492659, + "grad_norm": 0.28881970047950745, + "learning_rate": 0.0006546709586481794, + "loss": 0.0293, + "num_input_tokens_seen": 121676112, + "step": 56385 + }, + { + "epoch": 9.199021207177815, + "grad_norm": 0.11270332336425781, + "learning_rate": 0.0006546032685660193, + "loss": 0.0287, + "num_input_tokens_seen": 121688400, + "step": 56390 + }, + { + "epoch": 9.199836867862969, + "grad_norm": 0.34019017219543457, + "learning_rate": 0.000654535575350628, + "loss": 0.114, + "num_input_tokens_seen": 121698928, + "step": 56395 + }, + { + "epoch": 9.200652528548124, + "grad_norm": 0.1331515908241272, + "learning_rate": 0.0006544678790033769, + "loss": 0.1185, + "num_input_tokens_seen": 121708496, + "step": 56400 + }, + { + "epoch": 9.201468189233278, + "grad_norm": 0.0021028302144259214, + "learning_rate": 0.0006544001795256385, + "loss": 0.0132, + "num_input_tokens_seen": 121720240, + "step": 56405 + }, + { + "epoch": 9.202283849918434, + "grad_norm": 0.006427753251045942, + "learning_rate": 0.0006543324769187844, + "loss": 0.0123, + "num_input_tokens_seen": 121732080, + "step": 56410 + }, + { + "epoch": 9.20309951060359, + "grad_norm": 0.1206018403172493, + "learning_rate": 0.0006542647711841869, + "loss": 0.0162, + "num_input_tokens_seen": 121742096, + "step": 56415 + }, + { + "epoch": 9.203915171288743, + "grad_norm": 0.007657969836145639, + "learning_rate": 0.0006541970623232183, + "loss": 0.0251, + "num_input_tokens_seen": 121752496, + "step": 56420 + }, + { + "epoch": 9.2047308319739, + "grad_norm": 0.017615782096982002, + "learning_rate": 0.0006541293503372506, + "loss": 0.0288, + "num_input_tokens_seen": 121763024, + "step": 56425 + }, + { + "epoch": 9.205546492659053, + "grad_norm": 0.006258980371057987, + "learning_rate": 0.0006540616352276558, + "loss": 0.0672, + "num_input_tokens_seen": 121774352, + "step": 56430 + }, + { + "epoch": 9.206362153344209, + "grad_norm": 0.00507335877045989, + "learning_rate": 0.0006539939169958067, + "loss": 0.0144, + "num_input_tokens_seen": 121785296, + "step": 56435 + }, + { + "epoch": 9.207177814029365, + "grad_norm": 0.13527275621891022, + "learning_rate": 0.0006539261956430755, + "loss": 0.1812, + "num_input_tokens_seen": 121796752, + "step": 56440 + }, + { + "epoch": 9.207993474714518, + "grad_norm": 0.14147591590881348, + "learning_rate": 0.0006538584711708348, + "loss": 0.071, + "num_input_tokens_seen": 121807440, + "step": 56445 + }, + { + "epoch": 9.208809135399674, + "grad_norm": 0.00313786743208766, + "learning_rate": 0.0006537907435804569, + "loss": 0.037, + "num_input_tokens_seen": 121818640, + "step": 56450 + }, + { + "epoch": 9.209624796084828, + "grad_norm": 0.0037348840851336718, + "learning_rate": 0.0006537230128733144, + "loss": 0.0838, + "num_input_tokens_seen": 121828560, + "step": 56455 + }, + { + "epoch": 9.210440456769984, + "grad_norm": 0.009460524655878544, + "learning_rate": 0.0006536552790507802, + "loss": 0.0769, + "num_input_tokens_seen": 121839088, + "step": 56460 + }, + { + "epoch": 9.21125611745514, + "grad_norm": 0.011753401719033718, + "learning_rate": 0.0006535875421142267, + "loss": 0.0064, + "num_input_tokens_seen": 121849744, + "step": 56465 + }, + { + "epoch": 9.212071778140293, + "grad_norm": 0.0032667152117937803, + "learning_rate": 0.0006535198020650269, + "loss": 0.1675, + "num_input_tokens_seen": 121859696, + "step": 56470 + }, + { + "epoch": 9.21288743882545, + "grad_norm": 0.05018065869808197, + "learning_rate": 0.0006534520589045537, + "loss": 0.0144, + "num_input_tokens_seen": 121869840, + "step": 56475 + }, + { + "epoch": 9.213703099510603, + "grad_norm": 0.2122277319431305, + "learning_rate": 0.0006533843126341795, + "loss": 0.0314, + "num_input_tokens_seen": 121880432, + "step": 56480 + }, + { + "epoch": 9.214518760195759, + "grad_norm": 0.01074997428804636, + "learning_rate": 0.0006533165632552777, + "loss": 0.0695, + "num_input_tokens_seen": 121889264, + "step": 56485 + }, + { + "epoch": 9.215334420880913, + "grad_norm": 0.15171043574810028, + "learning_rate": 0.0006532488107692214, + "loss": 0.0185, + "num_input_tokens_seen": 121900016, + "step": 56490 + }, + { + "epoch": 9.216150081566068, + "grad_norm": 0.0063713593408465385, + "learning_rate": 0.0006531810551773836, + "loss": 0.0097, + "num_input_tokens_seen": 121911312, + "step": 56495 + }, + { + "epoch": 9.216965742251224, + "grad_norm": 0.014660344459116459, + "learning_rate": 0.0006531132964811374, + "loss": 0.0262, + "num_input_tokens_seen": 121923088, + "step": 56500 + }, + { + "epoch": 9.217781402936378, + "grad_norm": 0.08053086698055267, + "learning_rate": 0.0006530455346818559, + "loss": 0.0248, + "num_input_tokens_seen": 121932720, + "step": 56505 + }, + { + "epoch": 9.218597063621534, + "grad_norm": 0.06091779097914696, + "learning_rate": 0.0006529777697809125, + "loss": 0.0206, + "num_input_tokens_seen": 121943248, + "step": 56510 + }, + { + "epoch": 9.219412724306688, + "grad_norm": 0.3361516296863556, + "learning_rate": 0.0006529100017796805, + "loss": 0.1169, + "num_input_tokens_seen": 121953712, + "step": 56515 + }, + { + "epoch": 9.220228384991843, + "grad_norm": 0.0914456695318222, + "learning_rate": 0.0006528422306795334, + "loss": 0.0173, + "num_input_tokens_seen": 121965456, + "step": 56520 + }, + { + "epoch": 9.221044045676999, + "grad_norm": 0.011650405824184418, + "learning_rate": 0.0006527744564818446, + "loss": 0.011, + "num_input_tokens_seen": 121975120, + "step": 56525 + }, + { + "epoch": 9.221859706362153, + "grad_norm": 0.008130255155265331, + "learning_rate": 0.0006527066791879875, + "loss": 0.038, + "num_input_tokens_seen": 121987344, + "step": 56530 + }, + { + "epoch": 9.222675367047309, + "grad_norm": 0.014703897759318352, + "learning_rate": 0.000652638898799336, + "loss": 0.0169, + "num_input_tokens_seen": 121998448, + "step": 56535 + }, + { + "epoch": 9.223491027732463, + "grad_norm": 0.0030251971911638975, + "learning_rate": 0.0006525711153172635, + "loss": 0.1526, + "num_input_tokens_seen": 122009104, + "step": 56540 + }, + { + "epoch": 9.224306688417618, + "grad_norm": 0.5417229533195496, + "learning_rate": 0.0006525033287431436, + "loss": 0.0961, + "num_input_tokens_seen": 122019952, + "step": 56545 + }, + { + "epoch": 9.225122349102774, + "grad_norm": 0.26874399185180664, + "learning_rate": 0.0006524355390783506, + "loss": 0.1206, + "num_input_tokens_seen": 122031280, + "step": 56550 + }, + { + "epoch": 9.225938009787928, + "grad_norm": 0.047699008136987686, + "learning_rate": 0.0006523677463242579, + "loss": 0.0132, + "num_input_tokens_seen": 122041584, + "step": 56555 + }, + { + "epoch": 9.226753670473084, + "grad_norm": 0.09653710573911667, + "learning_rate": 0.0006522999504822395, + "loss": 0.0163, + "num_input_tokens_seen": 122051312, + "step": 56560 + }, + { + "epoch": 9.227569331158238, + "grad_norm": 0.5021445155143738, + "learning_rate": 0.0006522321515536694, + "loss": 0.1169, + "num_input_tokens_seen": 122061552, + "step": 56565 + }, + { + "epoch": 9.228384991843393, + "grad_norm": 0.007776590529829264, + "learning_rate": 0.0006521643495399217, + "loss": 0.1124, + "num_input_tokens_seen": 122071184, + "step": 56570 + }, + { + "epoch": 9.229200652528547, + "grad_norm": 0.04108830541372299, + "learning_rate": 0.0006520965444423704, + "loss": 0.0212, + "num_input_tokens_seen": 122083344, + "step": 56575 + }, + { + "epoch": 9.230016313213703, + "grad_norm": 0.0037007180508226156, + "learning_rate": 0.0006520287362623896, + "loss": 0.0103, + "num_input_tokens_seen": 122094832, + "step": 56580 + }, + { + "epoch": 9.230831973898859, + "grad_norm": 0.007904917933046818, + "learning_rate": 0.0006519609250013538, + "loss": 0.0222, + "num_input_tokens_seen": 122106032, + "step": 56585 + }, + { + "epoch": 9.231647634584013, + "grad_norm": 0.38398659229278564, + "learning_rate": 0.000651893110660637, + "loss": 0.0814, + "num_input_tokens_seen": 122116304, + "step": 56590 + }, + { + "epoch": 9.232463295269168, + "grad_norm": 0.05712326988577843, + "learning_rate": 0.0006518252932416135, + "loss": 0.0615, + "num_input_tokens_seen": 122128176, + "step": 56595 + }, + { + "epoch": 9.233278955954322, + "grad_norm": 0.012841320596635342, + "learning_rate": 0.0006517574727456579, + "loss": 0.0929, + "num_input_tokens_seen": 122139696, + "step": 56600 + }, + { + "epoch": 9.234094616639478, + "grad_norm": 0.08611790090799332, + "learning_rate": 0.0006516896491741446, + "loss": 0.1528, + "num_input_tokens_seen": 122151024, + "step": 56605 + }, + { + "epoch": 9.234910277324634, + "grad_norm": 0.01779067888855934, + "learning_rate": 0.000651621822528448, + "loss": 0.0186, + "num_input_tokens_seen": 122161648, + "step": 56610 + }, + { + "epoch": 9.235725938009788, + "grad_norm": 0.04727822542190552, + "learning_rate": 0.000651553992809943, + "loss": 0.1477, + "num_input_tokens_seen": 122172656, + "step": 56615 + }, + { + "epoch": 9.236541598694943, + "grad_norm": 0.005019808653742075, + "learning_rate": 0.0006514861600200039, + "loss": 0.0936, + "num_input_tokens_seen": 122183056, + "step": 56620 + }, + { + "epoch": 9.237357259380097, + "grad_norm": 0.1085418164730072, + "learning_rate": 0.0006514183241600057, + "loss": 0.0273, + "num_input_tokens_seen": 122194224, + "step": 56625 + }, + { + "epoch": 9.238172920065253, + "grad_norm": 0.014106079936027527, + "learning_rate": 0.000651350485231323, + "loss": 0.0764, + "num_input_tokens_seen": 122204784, + "step": 56630 + }, + { + "epoch": 9.238988580750409, + "grad_norm": 0.01294687669724226, + "learning_rate": 0.0006512826432353308, + "loss": 0.0071, + "num_input_tokens_seen": 122215280, + "step": 56635 + }, + { + "epoch": 9.239804241435563, + "grad_norm": 0.2017132192850113, + "learning_rate": 0.000651214798173404, + "loss": 0.0812, + "num_input_tokens_seen": 122226128, + "step": 56640 + }, + { + "epoch": 9.240619902120718, + "grad_norm": 0.21524934470653534, + "learning_rate": 0.0006511469500469173, + "loss": 0.0731, + "num_input_tokens_seen": 122237552, + "step": 56645 + }, + { + "epoch": 9.241435562805872, + "grad_norm": 0.009551075287163258, + "learning_rate": 0.0006510790988572459, + "loss": 0.1725, + "num_input_tokens_seen": 122248880, + "step": 56650 + }, + { + "epoch": 9.242251223491028, + "grad_norm": 0.03400469571352005, + "learning_rate": 0.0006510112446057651, + "loss": 0.0922, + "num_input_tokens_seen": 122259760, + "step": 56655 + }, + { + "epoch": 9.243066884176184, + "grad_norm": 0.3036065697669983, + "learning_rate": 0.0006509433872938497, + "loss": 0.0899, + "num_input_tokens_seen": 122271120, + "step": 56660 + }, + { + "epoch": 9.243882544861338, + "grad_norm": 0.4190451502799988, + "learning_rate": 0.0006508755269228752, + "loss": 0.1734, + "num_input_tokens_seen": 122281424, + "step": 56665 + }, + { + "epoch": 9.244698205546493, + "grad_norm": 0.04617614299058914, + "learning_rate": 0.0006508076634942167, + "loss": 0.0101, + "num_input_tokens_seen": 122292784, + "step": 56670 + }, + { + "epoch": 9.245513866231647, + "grad_norm": 0.014614823274314404, + "learning_rate": 0.0006507397970092496, + "loss": 0.0183, + "num_input_tokens_seen": 122302992, + "step": 56675 + }, + { + "epoch": 9.246329526916803, + "grad_norm": 0.1897294819355011, + "learning_rate": 0.0006506719274693492, + "loss": 0.0241, + "num_input_tokens_seen": 122314320, + "step": 56680 + }, + { + "epoch": 9.247145187601957, + "grad_norm": 0.006125426385551691, + "learning_rate": 0.0006506040548758911, + "loss": 0.0093, + "num_input_tokens_seen": 122324848, + "step": 56685 + }, + { + "epoch": 9.247960848287113, + "grad_norm": 0.008783041499555111, + "learning_rate": 0.0006505361792302509, + "loss": 0.0189, + "num_input_tokens_seen": 122335760, + "step": 56690 + }, + { + "epoch": 9.248776508972268, + "grad_norm": 0.02181299962103367, + "learning_rate": 0.0006504683005338039, + "loss": 0.0924, + "num_input_tokens_seen": 122346416, + "step": 56695 + }, + { + "epoch": 9.249592169657422, + "grad_norm": 0.06377148628234863, + "learning_rate": 0.0006504004187879259, + "loss": 0.0333, + "num_input_tokens_seen": 122358608, + "step": 56700 + }, + { + "epoch": 9.250407830342578, + "grad_norm": 0.0099883321672678, + "learning_rate": 0.0006503325339939927, + "loss": 0.0177, + "num_input_tokens_seen": 122369264, + "step": 56705 + }, + { + "epoch": 9.251223491027732, + "grad_norm": 0.0382743664085865, + "learning_rate": 0.0006502646461533798, + "loss": 0.1807, + "num_input_tokens_seen": 122380528, + "step": 56710 + }, + { + "epoch": 9.252039151712887, + "grad_norm": 0.011702095158398151, + "learning_rate": 0.0006501967552674635, + "loss": 0.0145, + "num_input_tokens_seen": 122391856, + "step": 56715 + }, + { + "epoch": 9.252854812398043, + "grad_norm": 0.05152780935168266, + "learning_rate": 0.0006501288613376193, + "loss": 0.0301, + "num_input_tokens_seen": 122402672, + "step": 56720 + }, + { + "epoch": 9.253670473083197, + "grad_norm": 0.3676697313785553, + "learning_rate": 0.0006500609643652234, + "loss": 0.1128, + "num_input_tokens_seen": 122412368, + "step": 56725 + }, + { + "epoch": 9.254486133768353, + "grad_norm": 0.3944735825061798, + "learning_rate": 0.0006499930643516514, + "loss": 0.0582, + "num_input_tokens_seen": 122423248, + "step": 56730 + }, + { + "epoch": 9.255301794453507, + "grad_norm": 0.01712197996675968, + "learning_rate": 0.0006499251612982798, + "loss": 0.0235, + "num_input_tokens_seen": 122433040, + "step": 56735 + }, + { + "epoch": 9.256117455138662, + "grad_norm": 0.06112810969352722, + "learning_rate": 0.0006498572552064847, + "loss": 0.0129, + "num_input_tokens_seen": 122442928, + "step": 56740 + }, + { + "epoch": 9.256933115823816, + "grad_norm": 0.027684105560183525, + "learning_rate": 0.0006497893460776421, + "loss": 0.0341, + "num_input_tokens_seen": 122453488, + "step": 56745 + }, + { + "epoch": 9.257748776508972, + "grad_norm": 0.013579802587628365, + "learning_rate": 0.0006497214339131284, + "loss": 0.0208, + "num_input_tokens_seen": 122463632, + "step": 56750 + }, + { + "epoch": 9.258564437194128, + "grad_norm": 0.0063019609078764915, + "learning_rate": 0.00064965351871432, + "loss": 0.014, + "num_input_tokens_seen": 122474736, + "step": 56755 + }, + { + "epoch": 9.259380097879282, + "grad_norm": 0.007831118069589138, + "learning_rate": 0.0006495856004825931, + "loss": 0.0121, + "num_input_tokens_seen": 122485776, + "step": 56760 + }, + { + "epoch": 9.260195758564437, + "grad_norm": 0.0047360979951918125, + "learning_rate": 0.0006495176792193243, + "loss": 0.0165, + "num_input_tokens_seen": 122496176, + "step": 56765 + }, + { + "epoch": 9.261011419249591, + "grad_norm": 0.00438972283154726, + "learning_rate": 0.00064944975492589, + "loss": 0.02, + "num_input_tokens_seen": 122507760, + "step": 56770 + }, + { + "epoch": 9.261827079934747, + "grad_norm": 0.04396902024745941, + "learning_rate": 0.0006493818276036669, + "loss": 0.0527, + "num_input_tokens_seen": 122518768, + "step": 56775 + }, + { + "epoch": 9.262642740619903, + "grad_norm": 0.42400607466697693, + "learning_rate": 0.0006493138972540316, + "loss": 0.0772, + "num_input_tokens_seen": 122531216, + "step": 56780 + }, + { + "epoch": 9.263458401305057, + "grad_norm": 0.18625307083129883, + "learning_rate": 0.0006492459638783606, + "loss": 0.0416, + "num_input_tokens_seen": 122541552, + "step": 56785 + }, + { + "epoch": 9.264274061990212, + "grad_norm": 0.18994276225566864, + "learning_rate": 0.0006491780274780308, + "loss": 0.1134, + "num_input_tokens_seen": 122553712, + "step": 56790 + }, + { + "epoch": 9.265089722675366, + "grad_norm": 0.008505444973707199, + "learning_rate": 0.0006491100880544191, + "loss": 0.0439, + "num_input_tokens_seen": 122564976, + "step": 56795 + }, + { + "epoch": 9.265905383360522, + "grad_norm": 0.14765994250774384, + "learning_rate": 0.0006490421456089023, + "loss": 0.12, + "num_input_tokens_seen": 122576272, + "step": 56800 + }, + { + "epoch": 9.266721044045678, + "grad_norm": 0.169557124376297, + "learning_rate": 0.0006489742001428573, + "loss": 0.088, + "num_input_tokens_seen": 122586576, + "step": 56805 + }, + { + "epoch": 9.267536704730832, + "grad_norm": 0.05664984881877899, + "learning_rate": 0.0006489062516576613, + "loss": 0.0077, + "num_input_tokens_seen": 122598000, + "step": 56810 + }, + { + "epoch": 9.268352365415987, + "grad_norm": 0.03039197064936161, + "learning_rate": 0.0006488383001546911, + "loss": 0.017, + "num_input_tokens_seen": 122606896, + "step": 56815 + }, + { + "epoch": 9.269168026101141, + "grad_norm": 0.0743633359670639, + "learning_rate": 0.000648770345635324, + "loss": 0.0165, + "num_input_tokens_seen": 122617200, + "step": 56820 + }, + { + "epoch": 9.269983686786297, + "grad_norm": 0.011844078078866005, + "learning_rate": 0.000648702388100937, + "loss": 0.1673, + "num_input_tokens_seen": 122626512, + "step": 56825 + }, + { + "epoch": 9.270799347471453, + "grad_norm": 0.009855990298092365, + "learning_rate": 0.0006486344275529076, + "loss": 0.0407, + "num_input_tokens_seen": 122636848, + "step": 56830 + }, + { + "epoch": 9.271615008156607, + "grad_norm": 0.2603641748428345, + "learning_rate": 0.0006485664639926128, + "loss": 0.0821, + "num_input_tokens_seen": 122646928, + "step": 56835 + }, + { + "epoch": 9.272430668841762, + "grad_norm": 0.36930206418037415, + "learning_rate": 0.0006484984974214303, + "loss": 0.0731, + "num_input_tokens_seen": 122656752, + "step": 56840 + }, + { + "epoch": 9.273246329526916, + "grad_norm": 0.01867293380200863, + "learning_rate": 0.0006484305278407373, + "loss": 0.0249, + "num_input_tokens_seen": 122668656, + "step": 56845 + }, + { + "epoch": 9.274061990212072, + "grad_norm": 0.01591694913804531, + "learning_rate": 0.0006483625552519114, + "loss": 0.0473, + "num_input_tokens_seen": 122679728, + "step": 56850 + }, + { + "epoch": 9.274877650897226, + "grad_norm": 0.044656701385974884, + "learning_rate": 0.00064829457965633, + "loss": 0.0649, + "num_input_tokens_seen": 122691536, + "step": 56855 + }, + { + "epoch": 9.275693311582382, + "grad_norm": 0.05628037825226784, + "learning_rate": 0.0006482266010553707, + "loss": 0.0337, + "num_input_tokens_seen": 122701488, + "step": 56860 + }, + { + "epoch": 9.276508972267537, + "grad_norm": 0.16904956102371216, + "learning_rate": 0.0006481586194504117, + "loss": 0.0183, + "num_input_tokens_seen": 122712784, + "step": 56865 + }, + { + "epoch": 9.277324632952691, + "grad_norm": 0.18702173233032227, + "learning_rate": 0.00064809063484283, + "loss": 0.1077, + "num_input_tokens_seen": 122723184, + "step": 56870 + }, + { + "epoch": 9.278140293637847, + "grad_norm": 0.032570093870162964, + "learning_rate": 0.0006480226472340039, + "loss": 0.1111, + "num_input_tokens_seen": 122733808, + "step": 56875 + }, + { + "epoch": 9.278955954323001, + "grad_norm": 0.3888135254383087, + "learning_rate": 0.0006479546566253109, + "loss": 0.0576, + "num_input_tokens_seen": 122745552, + "step": 56880 + }, + { + "epoch": 9.279771615008157, + "grad_norm": 0.08713637292385101, + "learning_rate": 0.0006478866630181293, + "loss": 0.0263, + "num_input_tokens_seen": 122755632, + "step": 56885 + }, + { + "epoch": 9.280587275693312, + "grad_norm": 0.019874267280101776, + "learning_rate": 0.0006478186664138366, + "loss": 0.0782, + "num_input_tokens_seen": 122767696, + "step": 56890 + }, + { + "epoch": 9.281402936378466, + "grad_norm": 0.010859251953661442, + "learning_rate": 0.0006477506668138113, + "loss": 0.0063, + "num_input_tokens_seen": 122779376, + "step": 56895 + }, + { + "epoch": 9.282218597063622, + "grad_norm": 0.2600773870944977, + "learning_rate": 0.0006476826642194313, + "loss": 0.0323, + "num_input_tokens_seen": 122790832, + "step": 56900 + }, + { + "epoch": 9.283034257748776, + "grad_norm": 0.007055244408547878, + "learning_rate": 0.0006476146586320747, + "loss": 0.0409, + "num_input_tokens_seen": 122801456, + "step": 56905 + }, + { + "epoch": 9.283849918433932, + "grad_norm": 0.028339680284261703, + "learning_rate": 0.0006475466500531198, + "loss": 0.0114, + "num_input_tokens_seen": 122813520, + "step": 56910 + }, + { + "epoch": 9.284665579119087, + "grad_norm": 0.2776114046573639, + "learning_rate": 0.0006474786384839448, + "loss": 0.0303, + "num_input_tokens_seen": 122823344, + "step": 56915 + }, + { + "epoch": 9.285481239804241, + "grad_norm": 0.011378887109458447, + "learning_rate": 0.0006474106239259282, + "loss": 0.0789, + "num_input_tokens_seen": 122834128, + "step": 56920 + }, + { + "epoch": 9.286296900489397, + "grad_norm": 0.0008269099052995443, + "learning_rate": 0.0006473426063804483, + "loss": 0.0197, + "num_input_tokens_seen": 122845456, + "step": 56925 + }, + { + "epoch": 9.28711256117455, + "grad_norm": 0.0039013263303786516, + "learning_rate": 0.0006472745858488835, + "loss": 0.0383, + "num_input_tokens_seen": 122856272, + "step": 56930 + }, + { + "epoch": 9.287928221859707, + "grad_norm": 0.0005150201031938195, + "learning_rate": 0.0006472065623326123, + "loss": 0.0946, + "num_input_tokens_seen": 122866544, + "step": 56935 + }, + { + "epoch": 9.28874388254486, + "grad_norm": 0.26297903060913086, + "learning_rate": 0.0006471385358330135, + "loss": 0.0308, + "num_input_tokens_seen": 122876432, + "step": 56940 + }, + { + "epoch": 9.289559543230016, + "grad_norm": 0.06994161754846573, + "learning_rate": 0.0006470705063514656, + "loss": 0.0706, + "num_input_tokens_seen": 122888432, + "step": 56945 + }, + { + "epoch": 9.290375203915172, + "grad_norm": 0.01473821047693491, + "learning_rate": 0.0006470024738893473, + "loss": 0.0164, + "num_input_tokens_seen": 122900016, + "step": 56950 + }, + { + "epoch": 9.291190864600326, + "grad_norm": 0.06947450339794159, + "learning_rate": 0.0006469344384480374, + "loss": 0.0651, + "num_input_tokens_seen": 122910096, + "step": 56955 + }, + { + "epoch": 9.292006525285482, + "grad_norm": 0.008020227774977684, + "learning_rate": 0.0006468664000289147, + "loss": 0.0074, + "num_input_tokens_seen": 122920336, + "step": 56960 + }, + { + "epoch": 9.292822185970635, + "grad_norm": 0.01640552468597889, + "learning_rate": 0.000646798358633358, + "loss": 0.1292, + "num_input_tokens_seen": 122931632, + "step": 56965 + }, + { + "epoch": 9.293637846655791, + "grad_norm": 0.004048272036015987, + "learning_rate": 0.0006467303142627465, + "loss": 0.0412, + "num_input_tokens_seen": 122943728, + "step": 56970 + }, + { + "epoch": 9.294453507340947, + "grad_norm": 0.07681961357593536, + "learning_rate": 0.0006466622669184589, + "loss": 0.0208, + "num_input_tokens_seen": 122953264, + "step": 56975 + }, + { + "epoch": 9.2952691680261, + "grad_norm": 0.017706673592329025, + "learning_rate": 0.0006465942166018745, + "loss": 0.0614, + "num_input_tokens_seen": 122965296, + "step": 56980 + }, + { + "epoch": 9.296084828711257, + "grad_norm": 0.14281128346920013, + "learning_rate": 0.0006465261633143722, + "loss": 0.0583, + "num_input_tokens_seen": 122974384, + "step": 56985 + }, + { + "epoch": 9.29690048939641, + "grad_norm": 0.0729246661067009, + "learning_rate": 0.0006464581070573315, + "loss": 0.0239, + "num_input_tokens_seen": 122985296, + "step": 56990 + }, + { + "epoch": 9.297716150081566, + "grad_norm": 0.01002801489084959, + "learning_rate": 0.0006463900478321314, + "loss": 0.0071, + "num_input_tokens_seen": 122996272, + "step": 56995 + }, + { + "epoch": 9.298531810766722, + "grad_norm": 0.0040133134461939335, + "learning_rate": 0.0006463219856401513, + "loss": 0.0353, + "num_input_tokens_seen": 123007440, + "step": 57000 + }, + { + "epoch": 9.299347471451876, + "grad_norm": 0.00965412799268961, + "learning_rate": 0.0006462539204827705, + "loss": 0.1191, + "num_input_tokens_seen": 123018096, + "step": 57005 + }, + { + "epoch": 9.300163132137031, + "grad_norm": 0.011123540811240673, + "learning_rate": 0.0006461858523613684, + "loss": 0.1959, + "num_input_tokens_seen": 123029072, + "step": 57010 + }, + { + "epoch": 9.300978792822185, + "grad_norm": 0.24288173019886017, + "learning_rate": 0.0006461177812773246, + "loss": 0.1011, + "num_input_tokens_seen": 123040816, + "step": 57015 + }, + { + "epoch": 9.301794453507341, + "grad_norm": 0.004511216189712286, + "learning_rate": 0.0006460497072320186, + "loss": 0.0787, + "num_input_tokens_seen": 123051376, + "step": 57020 + }, + { + "epoch": 9.302610114192497, + "grad_norm": 0.41373777389526367, + "learning_rate": 0.00064598163022683, + "loss": 0.0635, + "num_input_tokens_seen": 123061232, + "step": 57025 + }, + { + "epoch": 9.30342577487765, + "grad_norm": 0.008684992790222168, + "learning_rate": 0.0006459135502631386, + "loss": 0.2492, + "num_input_tokens_seen": 123071632, + "step": 57030 + }, + { + "epoch": 9.304241435562806, + "grad_norm": 0.19645847380161285, + "learning_rate": 0.0006458454673423238, + "loss": 0.0393, + "num_input_tokens_seen": 123083600, + "step": 57035 + }, + { + "epoch": 9.30505709624796, + "grad_norm": 0.007247294299304485, + "learning_rate": 0.0006457773814657657, + "loss": 0.0347, + "num_input_tokens_seen": 123093392, + "step": 57040 + }, + { + "epoch": 9.305872756933116, + "grad_norm": 0.0049250805750489235, + "learning_rate": 0.000645709292634844, + "loss": 0.0221, + "num_input_tokens_seen": 123104784, + "step": 57045 + }, + { + "epoch": 9.30668841761827, + "grad_norm": 0.19282647967338562, + "learning_rate": 0.0006456412008509387, + "loss": 0.0493, + "num_input_tokens_seen": 123115344, + "step": 57050 + }, + { + "epoch": 9.307504078303426, + "grad_norm": 0.029192756861448288, + "learning_rate": 0.0006455731061154297, + "loss": 0.0313, + "num_input_tokens_seen": 123125520, + "step": 57055 + }, + { + "epoch": 9.308319738988581, + "grad_norm": 0.030040396377444267, + "learning_rate": 0.0006455050084296969, + "loss": 0.0102, + "num_input_tokens_seen": 123136016, + "step": 57060 + }, + { + "epoch": 9.309135399673735, + "grad_norm": 0.03220193460583687, + "learning_rate": 0.0006454369077951206, + "loss": 0.062, + "num_input_tokens_seen": 123147536, + "step": 57065 + }, + { + "epoch": 9.309951060358891, + "grad_norm": 0.01736394688487053, + "learning_rate": 0.0006453688042130808, + "loss": 0.0077, + "num_input_tokens_seen": 123157904, + "step": 57070 + }, + { + "epoch": 9.310766721044045, + "grad_norm": 0.006239277310669422, + "learning_rate": 0.0006453006976849578, + "loss": 0.022, + "num_input_tokens_seen": 123168720, + "step": 57075 + }, + { + "epoch": 9.3115823817292, + "grad_norm": 0.05099215358495712, + "learning_rate": 0.0006452325882121319, + "loss": 0.0155, + "num_input_tokens_seen": 123180144, + "step": 57080 + }, + { + "epoch": 9.312398042414356, + "grad_norm": 0.10600738972425461, + "learning_rate": 0.0006451644757959834, + "loss": 0.0257, + "num_input_tokens_seen": 123190896, + "step": 57085 + }, + { + "epoch": 9.31321370309951, + "grad_norm": 0.01326788030564785, + "learning_rate": 0.0006450963604378926, + "loss": 0.0136, + "num_input_tokens_seen": 123201424, + "step": 57090 + }, + { + "epoch": 9.314029363784666, + "grad_norm": 0.10123365372419357, + "learning_rate": 0.0006450282421392399, + "loss": 0.0608, + "num_input_tokens_seen": 123211536, + "step": 57095 + }, + { + "epoch": 9.31484502446982, + "grad_norm": 0.018629053607583046, + "learning_rate": 0.0006449601209014059, + "loss": 0.0513, + "num_input_tokens_seen": 123222896, + "step": 57100 + }, + { + "epoch": 9.315660685154976, + "grad_norm": 0.44318827986717224, + "learning_rate": 0.0006448919967257711, + "loss": 0.149, + "num_input_tokens_seen": 123235280, + "step": 57105 + }, + { + "epoch": 9.31647634584013, + "grad_norm": 0.01705995202064514, + "learning_rate": 0.0006448238696137163, + "loss": 0.0139, + "num_input_tokens_seen": 123246416, + "step": 57110 + }, + { + "epoch": 9.317292006525285, + "grad_norm": 0.45521414279937744, + "learning_rate": 0.0006447557395666221, + "loss": 0.0304, + "num_input_tokens_seen": 123257200, + "step": 57115 + }, + { + "epoch": 9.318107667210441, + "grad_norm": 0.0018344040727242827, + "learning_rate": 0.0006446876065858691, + "loss": 0.1617, + "num_input_tokens_seen": 123268816, + "step": 57120 + }, + { + "epoch": 9.318923327895595, + "grad_norm": 0.010229192674160004, + "learning_rate": 0.0006446194706728383, + "loss": 0.0255, + "num_input_tokens_seen": 123279408, + "step": 57125 + }, + { + "epoch": 9.31973898858075, + "grad_norm": 0.04657386988401413, + "learning_rate": 0.0006445513318289104, + "loss": 0.166, + "num_input_tokens_seen": 123289232, + "step": 57130 + }, + { + "epoch": 9.320554649265905, + "grad_norm": 0.25407856702804565, + "learning_rate": 0.0006444831900554664, + "loss": 0.0415, + "num_input_tokens_seen": 123299984, + "step": 57135 + }, + { + "epoch": 9.32137030995106, + "grad_norm": 0.024958781898021698, + "learning_rate": 0.0006444150453538873, + "loss": 0.0593, + "num_input_tokens_seen": 123310768, + "step": 57140 + }, + { + "epoch": 9.322185970636216, + "grad_norm": 0.2563214600086212, + "learning_rate": 0.000644346897725554, + "loss": 0.0265, + "num_input_tokens_seen": 123321232, + "step": 57145 + }, + { + "epoch": 9.32300163132137, + "grad_norm": 0.030786845833063126, + "learning_rate": 0.0006442787471718479, + "loss": 0.0164, + "num_input_tokens_seen": 123332432, + "step": 57150 + }, + { + "epoch": 9.323817292006526, + "grad_norm": 0.04613823443651199, + "learning_rate": 0.0006442105936941498, + "loss": 0.0776, + "num_input_tokens_seen": 123344240, + "step": 57155 + }, + { + "epoch": 9.32463295269168, + "grad_norm": 0.09371013194322586, + "learning_rate": 0.000644142437293841, + "loss": 0.1297, + "num_input_tokens_seen": 123354672, + "step": 57160 + }, + { + "epoch": 9.325448613376835, + "grad_norm": 0.011428690515458584, + "learning_rate": 0.000644074277972303, + "loss": 0.1201, + "num_input_tokens_seen": 123365168, + "step": 57165 + }, + { + "epoch": 9.326264274061991, + "grad_norm": 0.005811300128698349, + "learning_rate": 0.000644006115730917, + "loss": 0.0102, + "num_input_tokens_seen": 123375696, + "step": 57170 + }, + { + "epoch": 9.327079934747145, + "grad_norm": 0.005426878575235605, + "learning_rate": 0.000643937950571064, + "loss": 0.2588, + "num_input_tokens_seen": 123387344, + "step": 57175 + }, + { + "epoch": 9.3278955954323, + "grad_norm": 0.011633199639618397, + "learning_rate": 0.0006438697824941263, + "loss": 0.0329, + "num_input_tokens_seen": 123398992, + "step": 57180 + }, + { + "epoch": 9.328711256117455, + "grad_norm": 0.2705024182796478, + "learning_rate": 0.0006438016115014848, + "loss": 0.2244, + "num_input_tokens_seen": 123409168, + "step": 57185 + }, + { + "epoch": 9.32952691680261, + "grad_norm": 0.00839664600789547, + "learning_rate": 0.0006437334375945212, + "loss": 0.0121, + "num_input_tokens_seen": 123421488, + "step": 57190 + }, + { + "epoch": 9.330342577487766, + "grad_norm": 0.07276766747236252, + "learning_rate": 0.0006436652607746171, + "loss": 0.0182, + "num_input_tokens_seen": 123432816, + "step": 57195 + }, + { + "epoch": 9.33115823817292, + "grad_norm": 0.012110588140785694, + "learning_rate": 0.0006435970810431544, + "loss": 0.0605, + "num_input_tokens_seen": 123444048, + "step": 57200 + }, + { + "epoch": 9.331973898858076, + "grad_norm": 0.004249767400324345, + "learning_rate": 0.0006435288984015146, + "loss": 0.0606, + "num_input_tokens_seen": 123454320, + "step": 57205 + }, + { + "epoch": 9.33278955954323, + "grad_norm": 0.006950191222131252, + "learning_rate": 0.0006434607128510796, + "loss": 0.0096, + "num_input_tokens_seen": 123465424, + "step": 57210 + }, + { + "epoch": 9.333605220228385, + "grad_norm": 0.1303885281085968, + "learning_rate": 0.0006433925243932312, + "loss": 0.0697, + "num_input_tokens_seen": 123475856, + "step": 57215 + }, + { + "epoch": 9.33442088091354, + "grad_norm": 0.024100303649902344, + "learning_rate": 0.0006433243330293514, + "loss": 0.1341, + "num_input_tokens_seen": 123484816, + "step": 57220 + }, + { + "epoch": 9.335236541598695, + "grad_norm": 0.002702070865780115, + "learning_rate": 0.0006432561387608222, + "loss": 0.0326, + "num_input_tokens_seen": 123495920, + "step": 57225 + }, + { + "epoch": 9.33605220228385, + "grad_norm": 0.05319380760192871, + "learning_rate": 0.0006431879415890256, + "loss": 0.1429, + "num_input_tokens_seen": 123507344, + "step": 57230 + }, + { + "epoch": 9.336867862969005, + "grad_norm": 0.02146446704864502, + "learning_rate": 0.0006431197415153437, + "loss": 0.0111, + "num_input_tokens_seen": 123517808, + "step": 57235 + }, + { + "epoch": 9.33768352365416, + "grad_norm": 0.0278292428702116, + "learning_rate": 0.0006430515385411588, + "loss": 0.1188, + "num_input_tokens_seen": 123529456, + "step": 57240 + }, + { + "epoch": 9.338499184339314, + "grad_norm": 0.009830236434936523, + "learning_rate": 0.0006429833326678529, + "loss": 0.0176, + "num_input_tokens_seen": 123538960, + "step": 57245 + }, + { + "epoch": 9.33931484502447, + "grad_norm": 0.23519551753997803, + "learning_rate": 0.0006429151238968083, + "loss": 0.1601, + "num_input_tokens_seen": 123549200, + "step": 57250 + }, + { + "epoch": 9.340130505709626, + "grad_norm": 0.03795861825346947, + "learning_rate": 0.0006428469122294075, + "loss": 0.1372, + "num_input_tokens_seen": 123560176, + "step": 57255 + }, + { + "epoch": 9.34094616639478, + "grad_norm": 0.005263105500489473, + "learning_rate": 0.0006427786976670328, + "loss": 0.0504, + "num_input_tokens_seen": 123571280, + "step": 57260 + }, + { + "epoch": 9.341761827079935, + "grad_norm": 0.15145625174045563, + "learning_rate": 0.0006427104802110667, + "loss": 0.0726, + "num_input_tokens_seen": 123581296, + "step": 57265 + }, + { + "epoch": 9.34257748776509, + "grad_norm": 0.06726136058568954, + "learning_rate": 0.0006426422598628916, + "loss": 0.1903, + "num_input_tokens_seen": 123592016, + "step": 57270 + }, + { + "epoch": 9.343393148450245, + "grad_norm": 0.0278042983263731, + "learning_rate": 0.0006425740366238903, + "loss": 0.0976, + "num_input_tokens_seen": 123602992, + "step": 57275 + }, + { + "epoch": 9.3442088091354, + "grad_norm": 0.10031641274690628, + "learning_rate": 0.0006425058104954451, + "loss": 0.0235, + "num_input_tokens_seen": 123614384, + "step": 57280 + }, + { + "epoch": 9.345024469820554, + "grad_norm": 0.02572438307106495, + "learning_rate": 0.0006424375814789388, + "loss": 0.0288, + "num_input_tokens_seen": 123625744, + "step": 57285 + }, + { + "epoch": 9.34584013050571, + "grad_norm": 0.26742199063301086, + "learning_rate": 0.0006423693495757545, + "loss": 0.21, + "num_input_tokens_seen": 123636816, + "step": 57290 + }, + { + "epoch": 9.346655791190864, + "grad_norm": 0.2872799038887024, + "learning_rate": 0.0006423011147872745, + "loss": 0.1292, + "num_input_tokens_seen": 123648656, + "step": 57295 + }, + { + "epoch": 9.34747145187602, + "grad_norm": 0.1485021710395813, + "learning_rate": 0.000642232877114882, + "loss": 0.0189, + "num_input_tokens_seen": 123658672, + "step": 57300 + }, + { + "epoch": 9.348287112561174, + "grad_norm": 0.05631787329912186, + "learning_rate": 0.0006421646365599597, + "loss": 0.0479, + "num_input_tokens_seen": 123669968, + "step": 57305 + }, + { + "epoch": 9.34910277324633, + "grad_norm": 0.06113633140921593, + "learning_rate": 0.0006420963931238907, + "loss": 0.0211, + "num_input_tokens_seen": 123680880, + "step": 57310 + }, + { + "epoch": 9.349918433931485, + "grad_norm": 0.16676005721092224, + "learning_rate": 0.0006420281468080582, + "loss": 0.0261, + "num_input_tokens_seen": 123691536, + "step": 57315 + }, + { + "epoch": 9.350734094616639, + "grad_norm": 0.03685262054204941, + "learning_rate": 0.0006419598976138451, + "loss": 0.0844, + "num_input_tokens_seen": 123701904, + "step": 57320 + }, + { + "epoch": 9.351549755301795, + "grad_norm": 0.04345966875553131, + "learning_rate": 0.0006418916455426344, + "loss": 0.0266, + "num_input_tokens_seen": 123712528, + "step": 57325 + }, + { + "epoch": 9.352365415986949, + "grad_norm": 0.020734788849949837, + "learning_rate": 0.0006418233905958097, + "loss": 0.0379, + "num_input_tokens_seen": 123723184, + "step": 57330 + }, + { + "epoch": 9.353181076672104, + "grad_norm": 0.006714714225381613, + "learning_rate": 0.000641755132774754, + "loss": 0.0252, + "num_input_tokens_seen": 123733968, + "step": 57335 + }, + { + "epoch": 9.35399673735726, + "grad_norm": 0.01905689761042595, + "learning_rate": 0.0006416868720808507, + "loss": 0.0089, + "num_input_tokens_seen": 123743120, + "step": 57340 + }, + { + "epoch": 9.354812398042414, + "grad_norm": 0.018711669370532036, + "learning_rate": 0.0006416186085154833, + "loss": 0.0423, + "num_input_tokens_seen": 123754992, + "step": 57345 + }, + { + "epoch": 9.35562805872757, + "grad_norm": 0.0033977623097598553, + "learning_rate": 0.0006415503420800349, + "loss": 0.0303, + "num_input_tokens_seen": 123765488, + "step": 57350 + }, + { + "epoch": 9.356443719412724, + "grad_norm": 0.08677202463150024, + "learning_rate": 0.0006414820727758894, + "loss": 0.0415, + "num_input_tokens_seen": 123776944, + "step": 57355 + }, + { + "epoch": 9.35725938009788, + "grad_norm": 0.08101651072502136, + "learning_rate": 0.0006414138006044303, + "loss": 0.1195, + "num_input_tokens_seen": 123789168, + "step": 57360 + }, + { + "epoch": 9.358075040783035, + "grad_norm": 0.014632252976298332, + "learning_rate": 0.0006413455255670409, + "loss": 0.0289, + "num_input_tokens_seen": 123799536, + "step": 57365 + }, + { + "epoch": 9.358890701468189, + "grad_norm": 0.3448006510734558, + "learning_rate": 0.0006412772476651053, + "loss": 0.0704, + "num_input_tokens_seen": 123811536, + "step": 57370 + }, + { + "epoch": 9.359706362153345, + "grad_norm": 0.039816878736019135, + "learning_rate": 0.0006412089669000071, + "loss": 0.0951, + "num_input_tokens_seen": 123820848, + "step": 57375 + }, + { + "epoch": 9.360522022838499, + "grad_norm": 0.0023578498512506485, + "learning_rate": 0.0006411406832731299, + "loss": 0.0064, + "num_input_tokens_seen": 123832112, + "step": 57380 + }, + { + "epoch": 9.361337683523654, + "grad_norm": 0.0023468718864023685, + "learning_rate": 0.0006410723967858577, + "loss": 0.0939, + "num_input_tokens_seen": 123842544, + "step": 57385 + }, + { + "epoch": 9.362153344208808, + "grad_norm": 0.301557332277298, + "learning_rate": 0.0006410041074395744, + "loss": 0.1822, + "num_input_tokens_seen": 123852816, + "step": 57390 + }, + { + "epoch": 9.362969004893964, + "grad_norm": 0.013610106892883778, + "learning_rate": 0.0006409358152356642, + "loss": 0.1641, + "num_input_tokens_seen": 123863312, + "step": 57395 + }, + { + "epoch": 9.36378466557912, + "grad_norm": 0.022769981995224953, + "learning_rate": 0.0006408675201755107, + "loss": 0.0355, + "num_input_tokens_seen": 123873936, + "step": 57400 + }, + { + "epoch": 9.364600326264274, + "grad_norm": 0.35783958435058594, + "learning_rate": 0.0006407992222604983, + "loss": 0.107, + "num_input_tokens_seen": 123885648, + "step": 57405 + }, + { + "epoch": 9.36541598694943, + "grad_norm": 0.15480926632881165, + "learning_rate": 0.000640730921492011, + "loss": 0.0325, + "num_input_tokens_seen": 123896528, + "step": 57410 + }, + { + "epoch": 9.366231647634583, + "grad_norm": 0.011995040811598301, + "learning_rate": 0.000640662617871433, + "loss": 0.0599, + "num_input_tokens_seen": 123907728, + "step": 57415 + }, + { + "epoch": 9.367047308319739, + "grad_norm": 0.0771670788526535, + "learning_rate": 0.0006405943114001486, + "loss": 0.0153, + "num_input_tokens_seen": 123918128, + "step": 57420 + }, + { + "epoch": 9.367862969004895, + "grad_norm": 0.3685389459133148, + "learning_rate": 0.0006405260020795421, + "loss": 0.0852, + "num_input_tokens_seen": 123929392, + "step": 57425 + }, + { + "epoch": 9.368678629690049, + "grad_norm": 0.01290836650878191, + "learning_rate": 0.0006404576899109981, + "loss": 0.13, + "num_input_tokens_seen": 123940688, + "step": 57430 + }, + { + "epoch": 9.369494290375204, + "grad_norm": 0.0028189504519104958, + "learning_rate": 0.0006403893748959007, + "loss": 0.0645, + "num_input_tokens_seen": 123950704, + "step": 57435 + }, + { + "epoch": 9.370309951060358, + "grad_norm": 0.10218276083469391, + "learning_rate": 0.0006403210570356346, + "loss": 0.0343, + "num_input_tokens_seen": 123961552, + "step": 57440 + }, + { + "epoch": 9.371125611745514, + "grad_norm": 0.0061128707602620125, + "learning_rate": 0.0006402527363315843, + "loss": 0.023, + "num_input_tokens_seen": 123971408, + "step": 57445 + }, + { + "epoch": 9.37194127243067, + "grad_norm": 0.005515861790627241, + "learning_rate": 0.0006401844127851342, + "loss": 0.0077, + "num_input_tokens_seen": 123981264, + "step": 57450 + }, + { + "epoch": 9.372756933115824, + "grad_norm": 0.010669322684407234, + "learning_rate": 0.0006401160863976691, + "loss": 0.14, + "num_input_tokens_seen": 123992944, + "step": 57455 + }, + { + "epoch": 9.37357259380098, + "grad_norm": 0.034745775163173676, + "learning_rate": 0.000640047757170574, + "loss": 0.0619, + "num_input_tokens_seen": 124004464, + "step": 57460 + }, + { + "epoch": 9.374388254486133, + "grad_norm": 0.06482822448015213, + "learning_rate": 0.0006399794251052333, + "loss": 0.0191, + "num_input_tokens_seen": 124017072, + "step": 57465 + }, + { + "epoch": 9.375203915171289, + "grad_norm": 0.014620895497500896, + "learning_rate": 0.000639911090203032, + "loss": 0.043, + "num_input_tokens_seen": 124027792, + "step": 57470 + }, + { + "epoch": 9.376019575856443, + "grad_norm": 0.02033432386815548, + "learning_rate": 0.000639842752465355, + "loss": 0.0095, + "num_input_tokens_seen": 124039440, + "step": 57475 + }, + { + "epoch": 9.376835236541599, + "grad_norm": 0.03174997493624687, + "learning_rate": 0.0006397744118935871, + "loss": 0.0586, + "num_input_tokens_seen": 124050608, + "step": 57480 + }, + { + "epoch": 9.377650897226754, + "grad_norm": 0.12518279254436493, + "learning_rate": 0.0006397060684891136, + "loss": 0.0368, + "num_input_tokens_seen": 124061296, + "step": 57485 + }, + { + "epoch": 9.378466557911908, + "grad_norm": 0.30891090631484985, + "learning_rate": 0.0006396377222533192, + "loss": 0.0958, + "num_input_tokens_seen": 124071920, + "step": 57490 + }, + { + "epoch": 9.379282218597064, + "grad_norm": 0.0817989706993103, + "learning_rate": 0.0006395693731875892, + "loss": 0.117, + "num_input_tokens_seen": 124082608, + "step": 57495 + }, + { + "epoch": 9.380097879282218, + "grad_norm": 0.15451738238334656, + "learning_rate": 0.000639501021293309, + "loss": 0.0546, + "num_input_tokens_seen": 124094416, + "step": 57500 + }, + { + "epoch": 9.380913539967374, + "grad_norm": 0.20144473016262054, + "learning_rate": 0.0006394326665718635, + "loss": 0.0294, + "num_input_tokens_seen": 124105168, + "step": 57505 + }, + { + "epoch": 9.38172920065253, + "grad_norm": 0.0020069621969014406, + "learning_rate": 0.0006393643090246381, + "loss": 0.0089, + "num_input_tokens_seen": 124114992, + "step": 57510 + }, + { + "epoch": 9.382544861337683, + "grad_norm": 0.04750031977891922, + "learning_rate": 0.0006392959486530183, + "loss": 0.0154, + "num_input_tokens_seen": 124125936, + "step": 57515 + }, + { + "epoch": 9.383360522022839, + "grad_norm": 0.043811552226543427, + "learning_rate": 0.0006392275854583894, + "loss": 0.1225, + "num_input_tokens_seen": 124137616, + "step": 57520 + }, + { + "epoch": 9.384176182707993, + "grad_norm": 0.13797511160373688, + "learning_rate": 0.0006391592194421367, + "loss": 0.0241, + "num_input_tokens_seen": 124147856, + "step": 57525 + }, + { + "epoch": 9.384991843393149, + "grad_norm": 0.08605752140283585, + "learning_rate": 0.0006390908506056461, + "loss": 0.0669, + "num_input_tokens_seen": 124159824, + "step": 57530 + }, + { + "epoch": 9.385807504078304, + "grad_norm": 0.006014845799654722, + "learning_rate": 0.0006390224789503028, + "loss": 0.0129, + "num_input_tokens_seen": 124169168, + "step": 57535 + }, + { + "epoch": 9.386623164763458, + "grad_norm": 0.0807030200958252, + "learning_rate": 0.0006389541044774927, + "loss": 0.0335, + "num_input_tokens_seen": 124179952, + "step": 57540 + }, + { + "epoch": 9.387438825448614, + "grad_norm": 0.039617739617824554, + "learning_rate": 0.0006388857271886013, + "loss": 0.0759, + "num_input_tokens_seen": 124189648, + "step": 57545 + }, + { + "epoch": 9.388254486133768, + "grad_norm": 0.04973667487502098, + "learning_rate": 0.0006388173470850144, + "loss": 0.1077, + "num_input_tokens_seen": 124200336, + "step": 57550 + }, + { + "epoch": 9.389070146818923, + "grad_norm": 0.433889240026474, + "learning_rate": 0.0006387489641681181, + "loss": 0.13, + "num_input_tokens_seen": 124209264, + "step": 57555 + }, + { + "epoch": 9.38988580750408, + "grad_norm": 0.0018835675437003374, + "learning_rate": 0.0006386805784392978, + "loss": 0.0163, + "num_input_tokens_seen": 124219184, + "step": 57560 + }, + { + "epoch": 9.390701468189233, + "grad_norm": 0.033386994153261185, + "learning_rate": 0.0006386121898999397, + "loss": 0.0601, + "num_input_tokens_seen": 124230800, + "step": 57565 + }, + { + "epoch": 9.391517128874389, + "grad_norm": 0.008684366010129452, + "learning_rate": 0.0006385437985514297, + "loss": 0.0034, + "num_input_tokens_seen": 124242224, + "step": 57570 + }, + { + "epoch": 9.392332789559543, + "grad_norm": 0.2793315351009369, + "learning_rate": 0.000638475404395154, + "loss": 0.0532, + "num_input_tokens_seen": 124252592, + "step": 57575 + }, + { + "epoch": 9.393148450244698, + "grad_norm": 0.022077377885580063, + "learning_rate": 0.0006384070074324984, + "loss": 0.0543, + "num_input_tokens_seen": 124263760, + "step": 57580 + }, + { + "epoch": 9.393964110929852, + "grad_norm": 0.2517971992492676, + "learning_rate": 0.0006383386076648494, + "loss": 0.0256, + "num_input_tokens_seen": 124274704, + "step": 57585 + }, + { + "epoch": 9.394779771615008, + "grad_norm": 0.018244469538331032, + "learning_rate": 0.0006382702050935929, + "loss": 0.0118, + "num_input_tokens_seen": 124284784, + "step": 57590 + }, + { + "epoch": 9.395595432300164, + "grad_norm": 0.013883229345083237, + "learning_rate": 0.0006382017997201152, + "loss": 0.0952, + "num_input_tokens_seen": 124294512, + "step": 57595 + }, + { + "epoch": 9.396411092985318, + "grad_norm": 0.023109138011932373, + "learning_rate": 0.000638133391545803, + "loss": 0.0337, + "num_input_tokens_seen": 124306096, + "step": 57600 + }, + { + "epoch": 9.397226753670473, + "grad_norm": 0.17280927300453186, + "learning_rate": 0.000638064980572042, + "loss": 0.0277, + "num_input_tokens_seen": 124317104, + "step": 57605 + }, + { + "epoch": 9.398042414355627, + "grad_norm": 0.08831552416086197, + "learning_rate": 0.0006379965668002192, + "loss": 0.0215, + "num_input_tokens_seen": 124327792, + "step": 57610 + }, + { + "epoch": 9.398858075040783, + "grad_norm": 0.024735253304243088, + "learning_rate": 0.0006379281502317209, + "loss": 0.1469, + "num_input_tokens_seen": 124340112, + "step": 57615 + }, + { + "epoch": 9.399673735725939, + "grad_norm": 0.0857301875948906, + "learning_rate": 0.0006378597308679338, + "loss": 0.0703, + "num_input_tokens_seen": 124351184, + "step": 57620 + }, + { + "epoch": 9.400489396411093, + "grad_norm": 0.0017573339864611626, + "learning_rate": 0.0006377913087102443, + "loss": 0.0086, + "num_input_tokens_seen": 124361456, + "step": 57625 + }, + { + "epoch": 9.401305057096248, + "grad_norm": 0.3841659426689148, + "learning_rate": 0.0006377228837600391, + "loss": 0.069, + "num_input_tokens_seen": 124372848, + "step": 57630 + }, + { + "epoch": 9.402120717781402, + "grad_norm": 0.02109239622950554, + "learning_rate": 0.0006376544560187049, + "loss": 0.015, + "num_input_tokens_seen": 124383664, + "step": 57635 + }, + { + "epoch": 9.402936378466558, + "grad_norm": 0.019802123308181763, + "learning_rate": 0.0006375860254876286, + "loss": 0.0517, + "num_input_tokens_seen": 124394160, + "step": 57640 + }, + { + "epoch": 9.403752039151712, + "grad_norm": 0.022655053064227104, + "learning_rate": 0.0006375175921681968, + "loss": 0.0352, + "num_input_tokens_seen": 124405584, + "step": 57645 + }, + { + "epoch": 9.404567699836868, + "grad_norm": 0.011922765523195267, + "learning_rate": 0.0006374491560617967, + "loss": 0.0219, + "num_input_tokens_seen": 124416560, + "step": 57650 + }, + { + "epoch": 9.405383360522023, + "grad_norm": 0.28386935591697693, + "learning_rate": 0.0006373807171698151, + "loss": 0.0294, + "num_input_tokens_seen": 124426864, + "step": 57655 + }, + { + "epoch": 9.406199021207177, + "grad_norm": 0.22931598126888275, + "learning_rate": 0.0006373122754936389, + "loss": 0.0348, + "num_input_tokens_seen": 124438384, + "step": 57660 + }, + { + "epoch": 9.407014681892333, + "grad_norm": 0.1192903146147728, + "learning_rate": 0.0006372438310346553, + "loss": 0.0418, + "num_input_tokens_seen": 124449936, + "step": 57665 + }, + { + "epoch": 9.407830342577487, + "grad_norm": 0.0018423540750518441, + "learning_rate": 0.0006371753837942513, + "loss": 0.0705, + "num_input_tokens_seen": 124461040, + "step": 57670 + }, + { + "epoch": 9.408646003262643, + "grad_norm": 0.011341131292283535, + "learning_rate": 0.0006371069337738142, + "loss": 0.0409, + "num_input_tokens_seen": 124472144, + "step": 57675 + }, + { + "epoch": 9.409461663947798, + "grad_norm": 0.0038776511792093515, + "learning_rate": 0.000637038480974731, + "loss": 0.0456, + "num_input_tokens_seen": 124483792, + "step": 57680 + }, + { + "epoch": 9.410277324632952, + "grad_norm": 0.00547692459076643, + "learning_rate": 0.0006369700253983893, + "loss": 0.0963, + "num_input_tokens_seen": 124495152, + "step": 57685 + }, + { + "epoch": 9.411092985318108, + "grad_norm": 0.07136140018701553, + "learning_rate": 0.0006369015670461762, + "loss": 0.0121, + "num_input_tokens_seen": 124505360, + "step": 57690 + }, + { + "epoch": 9.411908646003262, + "grad_norm": 0.024769995361566544, + "learning_rate": 0.0006368331059194792, + "loss": 0.1264, + "num_input_tokens_seen": 124515184, + "step": 57695 + }, + { + "epoch": 9.412724306688418, + "grad_norm": 0.011492529883980751, + "learning_rate": 0.0006367646420196857, + "loss": 0.0985, + "num_input_tokens_seen": 124526416, + "step": 57700 + }, + { + "epoch": 9.413539967373573, + "grad_norm": 0.0042085349559783936, + "learning_rate": 0.0006366961753481832, + "loss": 0.0232, + "num_input_tokens_seen": 124537648, + "step": 57705 + }, + { + "epoch": 9.414355628058727, + "grad_norm": 0.0714123472571373, + "learning_rate": 0.0006366277059063594, + "loss": 0.2345, + "num_input_tokens_seen": 124547920, + "step": 57710 + }, + { + "epoch": 9.415171288743883, + "grad_norm": 0.44819140434265137, + "learning_rate": 0.0006365592336956017, + "loss": 0.0358, + "num_input_tokens_seen": 124556624, + "step": 57715 + }, + { + "epoch": 9.415986949429037, + "grad_norm": 0.5078820586204529, + "learning_rate": 0.0006364907587172978, + "loss": 0.1791, + "num_input_tokens_seen": 124566448, + "step": 57720 + }, + { + "epoch": 9.416802610114193, + "grad_norm": 0.048922035843133926, + "learning_rate": 0.0006364222809728358, + "loss": 0.0528, + "num_input_tokens_seen": 124576624, + "step": 57725 + }, + { + "epoch": 9.417618270799348, + "grad_norm": 0.021715881302952766, + "learning_rate": 0.0006363538004636032, + "loss": 0.0109, + "num_input_tokens_seen": 124587184, + "step": 57730 + }, + { + "epoch": 9.418433931484502, + "grad_norm": 0.25073543190956116, + "learning_rate": 0.0006362853171909876, + "loss": 0.2224, + "num_input_tokens_seen": 124598448, + "step": 57735 + }, + { + "epoch": 9.419249592169658, + "grad_norm": 0.024772431701421738, + "learning_rate": 0.0006362168311563773, + "loss": 0.0292, + "num_input_tokens_seen": 124610576, + "step": 57740 + }, + { + "epoch": 9.420065252854812, + "grad_norm": 0.04119279608130455, + "learning_rate": 0.00063614834236116, + "loss": 0.0469, + "num_input_tokens_seen": 124620656, + "step": 57745 + }, + { + "epoch": 9.420880913539968, + "grad_norm": 0.007710843347012997, + "learning_rate": 0.000636079850806724, + "loss": 0.0354, + "num_input_tokens_seen": 124631344, + "step": 57750 + }, + { + "epoch": 9.421696574225122, + "grad_norm": 0.04185958206653595, + "learning_rate": 0.0006360113564944571, + "loss": 0.0108, + "num_input_tokens_seen": 124641872, + "step": 57755 + }, + { + "epoch": 9.422512234910277, + "grad_norm": 0.012534644454717636, + "learning_rate": 0.0006359428594257476, + "loss": 0.1139, + "num_input_tokens_seen": 124652464, + "step": 57760 + }, + { + "epoch": 9.423327895595433, + "grad_norm": 0.03412327170372009, + "learning_rate": 0.0006358743596019836, + "loss": 0.0153, + "num_input_tokens_seen": 124662896, + "step": 57765 + }, + { + "epoch": 9.424143556280587, + "grad_norm": 0.027609655633568764, + "learning_rate": 0.0006358058570245532, + "loss": 0.0098, + "num_input_tokens_seen": 124674480, + "step": 57770 + }, + { + "epoch": 9.424959216965743, + "grad_norm": 0.002544450806453824, + "learning_rate": 0.0006357373516948451, + "loss": 0.0076, + "num_input_tokens_seen": 124684240, + "step": 57775 + }, + { + "epoch": 9.425774877650896, + "grad_norm": 0.01565762795507908, + "learning_rate": 0.0006356688436142471, + "loss": 0.023, + "num_input_tokens_seen": 124696336, + "step": 57780 + }, + { + "epoch": 9.426590538336052, + "grad_norm": 0.38442736864089966, + "learning_rate": 0.000635600332784148, + "loss": 0.114, + "num_input_tokens_seen": 124708432, + "step": 57785 + }, + { + "epoch": 9.427406199021208, + "grad_norm": 0.02443229779601097, + "learning_rate": 0.0006355318192059361, + "loss": 0.0142, + "num_input_tokens_seen": 124721072, + "step": 57790 + }, + { + "epoch": 9.428221859706362, + "grad_norm": 0.02470846101641655, + "learning_rate": 0.0006354633028809999, + "loss": 0.0979, + "num_input_tokens_seen": 124731696, + "step": 57795 + }, + { + "epoch": 9.429037520391518, + "grad_norm": 0.10680975019931793, + "learning_rate": 0.000635394783810728, + "loss": 0.0758, + "num_input_tokens_seen": 124742416, + "step": 57800 + }, + { + "epoch": 9.429853181076671, + "grad_norm": 0.004452058579772711, + "learning_rate": 0.0006353262619965091, + "loss": 0.0625, + "num_input_tokens_seen": 124752784, + "step": 57805 + }, + { + "epoch": 9.430668841761827, + "grad_norm": 0.1256168633699417, + "learning_rate": 0.000635257737439732, + "loss": 0.1422, + "num_input_tokens_seen": 124763248, + "step": 57810 + }, + { + "epoch": 9.431484502446983, + "grad_norm": 0.022801337763667107, + "learning_rate": 0.0006351892101417849, + "loss": 0.1936, + "num_input_tokens_seen": 124774576, + "step": 57815 + }, + { + "epoch": 9.432300163132137, + "grad_norm": 0.3502349853515625, + "learning_rate": 0.0006351206801040571, + "loss": 0.0966, + "num_input_tokens_seen": 124784368, + "step": 57820 + }, + { + "epoch": 9.433115823817293, + "grad_norm": 0.002230318496003747, + "learning_rate": 0.0006350521473279374, + "loss": 0.1156, + "num_input_tokens_seen": 124793904, + "step": 57825 + }, + { + "epoch": 9.433931484502446, + "grad_norm": 0.029137205332517624, + "learning_rate": 0.0006349836118148146, + "loss": 0.0109, + "num_input_tokens_seen": 124804848, + "step": 57830 + }, + { + "epoch": 9.434747145187602, + "grad_norm": 0.04999954253435135, + "learning_rate": 0.0006349150735660776, + "loss": 0.1103, + "num_input_tokens_seen": 124817840, + "step": 57835 + }, + { + "epoch": 9.435562805872756, + "grad_norm": 0.04000772908329964, + "learning_rate": 0.0006348465325831155, + "loss": 0.0433, + "num_input_tokens_seen": 124828080, + "step": 57840 + }, + { + "epoch": 9.436378466557912, + "grad_norm": 0.010199465788900852, + "learning_rate": 0.0006347779888673175, + "loss": 0.0275, + "num_input_tokens_seen": 124839632, + "step": 57845 + }, + { + "epoch": 9.437194127243067, + "grad_norm": 0.00747048482298851, + "learning_rate": 0.0006347094424200724, + "loss": 0.0895, + "num_input_tokens_seen": 124852176, + "step": 57850 + }, + { + "epoch": 9.438009787928221, + "grad_norm": 0.20637278258800507, + "learning_rate": 0.0006346408932427696, + "loss": 0.1639, + "num_input_tokens_seen": 124862768, + "step": 57855 + }, + { + "epoch": 9.438825448613377, + "grad_norm": 0.34570983052253723, + "learning_rate": 0.0006345723413367983, + "loss": 0.0975, + "num_input_tokens_seen": 124873936, + "step": 57860 + }, + { + "epoch": 9.439641109298531, + "grad_norm": 0.00856519490480423, + "learning_rate": 0.0006345037867035478, + "loss": 0.0661, + "num_input_tokens_seen": 124884912, + "step": 57865 + }, + { + "epoch": 9.440456769983687, + "grad_norm": 0.015306884422898293, + "learning_rate": 0.0006344352293444073, + "loss": 0.0167, + "num_input_tokens_seen": 124895632, + "step": 57870 + }, + { + "epoch": 9.441272430668842, + "grad_norm": 0.16496384143829346, + "learning_rate": 0.0006343666692607665, + "loss": 0.0253, + "num_input_tokens_seen": 124906640, + "step": 57875 + }, + { + "epoch": 9.442088091353996, + "grad_norm": 0.004522433038800955, + "learning_rate": 0.0006342981064540145, + "loss": 0.0084, + "num_input_tokens_seen": 124917392, + "step": 57880 + }, + { + "epoch": 9.442903752039152, + "grad_norm": 0.017204251140356064, + "learning_rate": 0.0006342295409255412, + "loss": 0.0109, + "num_input_tokens_seen": 124927408, + "step": 57885 + }, + { + "epoch": 9.443719412724306, + "grad_norm": 0.16244906187057495, + "learning_rate": 0.000634160972676736, + "loss": 0.0246, + "num_input_tokens_seen": 124938160, + "step": 57890 + }, + { + "epoch": 9.444535073409462, + "grad_norm": 0.0059016696177423, + "learning_rate": 0.0006340924017089884, + "loss": 0.0193, + "num_input_tokens_seen": 124947376, + "step": 57895 + }, + { + "epoch": 9.445350734094617, + "grad_norm": 0.04046238586306572, + "learning_rate": 0.0006340238280236882, + "loss": 0.0349, + "num_input_tokens_seen": 124958576, + "step": 57900 + }, + { + "epoch": 9.446166394779771, + "grad_norm": 0.051697492599487305, + "learning_rate": 0.0006339552516222251, + "loss": 0.1307, + "num_input_tokens_seen": 124968944, + "step": 57905 + }, + { + "epoch": 9.446982055464927, + "grad_norm": 0.06456241011619568, + "learning_rate": 0.0006338866725059889, + "loss": 0.0567, + "num_input_tokens_seen": 124980496, + "step": 57910 + }, + { + "epoch": 9.447797716150081, + "grad_norm": 0.04366364702582359, + "learning_rate": 0.0006338180906763693, + "loss": 0.0168, + "num_input_tokens_seen": 124990256, + "step": 57915 + }, + { + "epoch": 9.448613376835237, + "grad_norm": 0.0031717917881906033, + "learning_rate": 0.0006337495061347565, + "loss": 0.0593, + "num_input_tokens_seen": 125001840, + "step": 57920 + }, + { + "epoch": 9.449429037520392, + "grad_norm": 0.0047446065582334995, + "learning_rate": 0.0006336809188825401, + "loss": 0.0674, + "num_input_tokens_seen": 125012272, + "step": 57925 + }, + { + "epoch": 9.450244698205546, + "grad_norm": 0.13495153188705444, + "learning_rate": 0.0006336123289211104, + "loss": 0.0858, + "num_input_tokens_seen": 125023024, + "step": 57930 + }, + { + "epoch": 9.451060358890702, + "grad_norm": 0.020329171791672707, + "learning_rate": 0.0006335437362518574, + "loss": 0.0443, + "num_input_tokens_seen": 125035184, + "step": 57935 + }, + { + "epoch": 9.451876019575856, + "grad_norm": 0.3629210591316223, + "learning_rate": 0.0006334751408761712, + "loss": 0.0608, + "num_input_tokens_seen": 125045424, + "step": 57940 + }, + { + "epoch": 9.452691680261012, + "grad_norm": 0.2488725483417511, + "learning_rate": 0.0006334065427954418, + "loss": 0.1646, + "num_input_tokens_seen": 125055504, + "step": 57945 + }, + { + "epoch": 9.453507340946166, + "grad_norm": 0.010843664407730103, + "learning_rate": 0.0006333379420110597, + "loss": 0.1261, + "num_input_tokens_seen": 125065744, + "step": 57950 + }, + { + "epoch": 9.454323001631321, + "grad_norm": 0.023506227880716324, + "learning_rate": 0.000633269338524415, + "loss": 0.1013, + "num_input_tokens_seen": 125076720, + "step": 57955 + }, + { + "epoch": 9.455138662316477, + "grad_norm": 0.1148872897028923, + "learning_rate": 0.0006332007323368983, + "loss": 0.1838, + "num_input_tokens_seen": 125087504, + "step": 57960 + }, + { + "epoch": 9.455954323001631, + "grad_norm": 0.04582231864333153, + "learning_rate": 0.0006331321234498995, + "loss": 0.0145, + "num_input_tokens_seen": 125099408, + "step": 57965 + }, + { + "epoch": 9.456769983686787, + "grad_norm": 0.03102817013859749, + "learning_rate": 0.0006330635118648093, + "loss": 0.0116, + "num_input_tokens_seen": 125109904, + "step": 57970 + }, + { + "epoch": 9.45758564437194, + "grad_norm": 0.012057851068675518, + "learning_rate": 0.0006329948975830184, + "loss": 0.0349, + "num_input_tokens_seen": 125120144, + "step": 57975 + }, + { + "epoch": 9.458401305057096, + "grad_norm": 0.30607807636260986, + "learning_rate": 0.0006329262806059173, + "loss": 0.1323, + "num_input_tokens_seen": 125131056, + "step": 57980 + }, + { + "epoch": 9.459216965742252, + "grad_norm": 0.09705314040184021, + "learning_rate": 0.0006328576609348962, + "loss": 0.0258, + "num_input_tokens_seen": 125141072, + "step": 57985 + }, + { + "epoch": 9.460032626427406, + "grad_norm": 0.006125032436102629, + "learning_rate": 0.0006327890385713462, + "loss": 0.0235, + "num_input_tokens_seen": 125152656, + "step": 57990 + }, + { + "epoch": 9.460848287112562, + "grad_norm": 0.0036675489973276854, + "learning_rate": 0.000632720413516658, + "loss": 0.0217, + "num_input_tokens_seen": 125164880, + "step": 57995 + }, + { + "epoch": 9.461663947797716, + "grad_norm": 0.0433930978178978, + "learning_rate": 0.000632651785772222, + "loss": 0.0589, + "num_input_tokens_seen": 125175792, + "step": 58000 + }, + { + "epoch": 9.462479608482871, + "grad_norm": 0.18293575942516327, + "learning_rate": 0.0006325831553394294, + "loss": 0.0372, + "num_input_tokens_seen": 125187152, + "step": 58005 + }, + { + "epoch": 9.463295269168025, + "grad_norm": 0.08676878362894058, + "learning_rate": 0.000632514522219671, + "loss": 0.0316, + "num_input_tokens_seen": 125197072, + "step": 58010 + }, + { + "epoch": 9.464110929853181, + "grad_norm": 0.32110798358917236, + "learning_rate": 0.0006324458864143377, + "loss": 0.1071, + "num_input_tokens_seen": 125208144, + "step": 58015 + }, + { + "epoch": 9.464926590538337, + "grad_norm": 0.10868323594331741, + "learning_rate": 0.0006323772479248204, + "loss": 0.0343, + "num_input_tokens_seen": 125218448, + "step": 58020 + }, + { + "epoch": 9.46574225122349, + "grad_norm": 0.31736016273498535, + "learning_rate": 0.0006323086067525103, + "loss": 0.135, + "num_input_tokens_seen": 125229168, + "step": 58025 + }, + { + "epoch": 9.466557911908646, + "grad_norm": 0.14239910244941711, + "learning_rate": 0.0006322399628987984, + "loss": 0.0935, + "num_input_tokens_seen": 125240496, + "step": 58030 + }, + { + "epoch": 9.4673735725938, + "grad_norm": 0.006784508470445871, + "learning_rate": 0.000632171316365076, + "loss": 0.0459, + "num_input_tokens_seen": 125251408, + "step": 58035 + }, + { + "epoch": 9.468189233278956, + "grad_norm": 0.0020543576683849096, + "learning_rate": 0.000632102667152734, + "loss": 0.0692, + "num_input_tokens_seen": 125262128, + "step": 58040 + }, + { + "epoch": 9.469004893964112, + "grad_norm": 0.03998474031686783, + "learning_rate": 0.000632034015263164, + "loss": 0.0125, + "num_input_tokens_seen": 125272816, + "step": 58045 + }, + { + "epoch": 9.469820554649266, + "grad_norm": 0.008310440927743912, + "learning_rate": 0.0006319653606977571, + "loss": 0.1279, + "num_input_tokens_seen": 125283792, + "step": 58050 + }, + { + "epoch": 9.470636215334421, + "grad_norm": 0.16441604495048523, + "learning_rate": 0.0006318967034579048, + "loss": 0.0288, + "num_input_tokens_seen": 125294960, + "step": 58055 + }, + { + "epoch": 9.471451876019575, + "grad_norm": 0.0020604440942406654, + "learning_rate": 0.0006318280435449985, + "loss": 0.0082, + "num_input_tokens_seen": 125304816, + "step": 58060 + }, + { + "epoch": 9.47226753670473, + "grad_norm": 0.17364826798439026, + "learning_rate": 0.0006317593809604298, + "loss": 0.0192, + "num_input_tokens_seen": 125314448, + "step": 58065 + }, + { + "epoch": 9.473083197389887, + "grad_norm": 0.06007537618279457, + "learning_rate": 0.00063169071570559, + "loss": 0.0235, + "num_input_tokens_seen": 125325584, + "step": 58070 + }, + { + "epoch": 9.47389885807504, + "grad_norm": 0.06449867784976959, + "learning_rate": 0.0006316220477818707, + "loss": 0.0079, + "num_input_tokens_seen": 125337328, + "step": 58075 + }, + { + "epoch": 9.474714518760196, + "grad_norm": 0.14373984932899475, + "learning_rate": 0.0006315533771906638, + "loss": 0.0483, + "num_input_tokens_seen": 125348336, + "step": 58080 + }, + { + "epoch": 9.47553017944535, + "grad_norm": 0.012641966342926025, + "learning_rate": 0.0006314847039333607, + "loss": 0.1253, + "num_input_tokens_seen": 125359152, + "step": 58085 + }, + { + "epoch": 9.476345840130506, + "grad_norm": 0.009027996100485325, + "learning_rate": 0.0006314160280113532, + "loss": 0.0078, + "num_input_tokens_seen": 125371792, + "step": 58090 + }, + { + "epoch": 9.477161500815662, + "grad_norm": 0.02237726002931595, + "learning_rate": 0.0006313473494260333, + "loss": 0.0745, + "num_input_tokens_seen": 125382960, + "step": 58095 + }, + { + "epoch": 9.477977161500815, + "grad_norm": 0.046053677797317505, + "learning_rate": 0.0006312786681787928, + "loss": 0.1707, + "num_input_tokens_seen": 125392752, + "step": 58100 + }, + { + "epoch": 9.478792822185971, + "grad_norm": 0.1953411102294922, + "learning_rate": 0.0006312099842710234, + "loss": 0.0952, + "num_input_tokens_seen": 125404080, + "step": 58105 + }, + { + "epoch": 9.479608482871125, + "grad_norm": 0.26276978850364685, + "learning_rate": 0.0006311412977041172, + "loss": 0.1154, + "num_input_tokens_seen": 125415568, + "step": 58110 + }, + { + "epoch": 9.48042414355628, + "grad_norm": 0.13335752487182617, + "learning_rate": 0.0006310726084794663, + "loss": 0.0291, + "num_input_tokens_seen": 125426832, + "step": 58115 + }, + { + "epoch": 9.481239804241435, + "grad_norm": 0.13730202615261078, + "learning_rate": 0.0006310039165984628, + "loss": 0.0377, + "num_input_tokens_seen": 125436656, + "step": 58120 + }, + { + "epoch": 9.48205546492659, + "grad_norm": 0.37742406129837036, + "learning_rate": 0.0006309352220624986, + "loss": 0.1188, + "num_input_tokens_seen": 125447440, + "step": 58125 + }, + { + "epoch": 9.482871125611746, + "grad_norm": 0.3751169443130493, + "learning_rate": 0.0006308665248729662, + "loss": 0.1023, + "num_input_tokens_seen": 125458928, + "step": 58130 + }, + { + "epoch": 9.4836867862969, + "grad_norm": 0.1729574203491211, + "learning_rate": 0.0006307978250312574, + "loss": 0.035, + "num_input_tokens_seen": 125469520, + "step": 58135 + }, + { + "epoch": 9.484502446982056, + "grad_norm": 0.044819269329309464, + "learning_rate": 0.0006307291225387648, + "loss": 0.0695, + "num_input_tokens_seen": 125480016, + "step": 58140 + }, + { + "epoch": 9.48531810766721, + "grad_norm": 0.005842439364641905, + "learning_rate": 0.0006306604173968808, + "loss": 0.0206, + "num_input_tokens_seen": 125490608, + "step": 58145 + }, + { + "epoch": 9.486133768352365, + "grad_norm": 0.006682158913463354, + "learning_rate": 0.0006305917096069977, + "loss": 0.0908, + "num_input_tokens_seen": 125501392, + "step": 58150 + }, + { + "epoch": 9.486949429037521, + "grad_norm": 0.2749008536338806, + "learning_rate": 0.000630522999170508, + "loss": 0.0584, + "num_input_tokens_seen": 125510448, + "step": 58155 + }, + { + "epoch": 9.487765089722675, + "grad_norm": 0.006672396324574947, + "learning_rate": 0.0006304542860888039, + "loss": 0.0368, + "num_input_tokens_seen": 125521968, + "step": 58160 + }, + { + "epoch": 9.48858075040783, + "grad_norm": 0.045397814363241196, + "learning_rate": 0.0006303855703632783, + "loss": 0.1796, + "num_input_tokens_seen": 125533360, + "step": 58165 + }, + { + "epoch": 9.489396411092985, + "grad_norm": 0.010614805854856968, + "learning_rate": 0.0006303168519953238, + "loss": 0.0236, + "num_input_tokens_seen": 125544400, + "step": 58170 + }, + { + "epoch": 9.49021207177814, + "grad_norm": 0.008353580720722675, + "learning_rate": 0.0006302481309863329, + "loss": 0.008, + "num_input_tokens_seen": 125555568, + "step": 58175 + }, + { + "epoch": 9.491027732463296, + "grad_norm": 0.047933198511600494, + "learning_rate": 0.0006301794073376985, + "loss": 0.1015, + "num_input_tokens_seen": 125566224, + "step": 58180 + }, + { + "epoch": 9.49184339314845, + "grad_norm": 0.05826834216713905, + "learning_rate": 0.0006301106810508131, + "loss": 0.0324, + "num_input_tokens_seen": 125576976, + "step": 58185 + }, + { + "epoch": 9.492659053833606, + "grad_norm": 0.027781715616583824, + "learning_rate": 0.0006300419521270697, + "loss": 0.053, + "num_input_tokens_seen": 125587856, + "step": 58190 + }, + { + "epoch": 9.49347471451876, + "grad_norm": 0.6338221430778503, + "learning_rate": 0.0006299732205678613, + "loss": 0.1394, + "num_input_tokens_seen": 125598832, + "step": 58195 + }, + { + "epoch": 9.494290375203915, + "grad_norm": 0.02350202575325966, + "learning_rate": 0.0006299044863745806, + "loss": 0.0227, + "num_input_tokens_seen": 125610832, + "step": 58200 + }, + { + "epoch": 9.49510603588907, + "grad_norm": 0.058891214430332184, + "learning_rate": 0.0006298357495486208, + "loss": 0.0432, + "num_input_tokens_seen": 125621104, + "step": 58205 + }, + { + "epoch": 9.495921696574225, + "grad_norm": 0.0025677080266177654, + "learning_rate": 0.0006297670100913748, + "loss": 0.0101, + "num_input_tokens_seen": 125632752, + "step": 58210 + }, + { + "epoch": 9.49673735725938, + "grad_norm": 0.07513861358165741, + "learning_rate": 0.0006296982680042357, + "loss": 0.0207, + "num_input_tokens_seen": 125642832, + "step": 58215 + }, + { + "epoch": 9.497553017944535, + "grad_norm": 0.012808836996555328, + "learning_rate": 0.0006296295232885966, + "loss": 0.0089, + "num_input_tokens_seen": 125653840, + "step": 58220 + }, + { + "epoch": 9.49836867862969, + "grad_norm": 0.028728481382131577, + "learning_rate": 0.0006295607759458508, + "loss": 0.02, + "num_input_tokens_seen": 125665104, + "step": 58225 + }, + { + "epoch": 9.499184339314844, + "grad_norm": 0.0017551783239468932, + "learning_rate": 0.0006294920259773915, + "loss": 0.0175, + "num_input_tokens_seen": 125675824, + "step": 58230 + }, + { + "epoch": 9.5, + "grad_norm": 0.16666272282600403, + "learning_rate": 0.0006294232733846121, + "loss": 0.0338, + "num_input_tokens_seen": 125686064, + "step": 58235 + }, + { + "epoch": 9.500815660685156, + "grad_norm": 0.6735655665397644, + "learning_rate": 0.0006293545181689057, + "loss": 0.2334, + "num_input_tokens_seen": 125697296, + "step": 58240 + }, + { + "epoch": 9.50163132137031, + "grad_norm": 0.1587417721748352, + "learning_rate": 0.000629285760331666, + "loss": 0.0363, + "num_input_tokens_seen": 125708688, + "step": 58245 + }, + { + "epoch": 9.502446982055465, + "grad_norm": 0.009426862932741642, + "learning_rate": 0.0006292169998742865, + "loss": 0.0188, + "num_input_tokens_seen": 125719792, + "step": 58250 + }, + { + "epoch": 9.50326264274062, + "grad_norm": 0.028172407299280167, + "learning_rate": 0.0006291482367981605, + "loss": 0.0248, + "num_input_tokens_seen": 125729680, + "step": 58255 + }, + { + "epoch": 9.504078303425775, + "grad_norm": 0.010715372860431671, + "learning_rate": 0.0006290794711046816, + "loss": 0.21, + "num_input_tokens_seen": 125739472, + "step": 58260 + }, + { + "epoch": 9.50489396411093, + "grad_norm": 0.03996831178665161, + "learning_rate": 0.0006290107027952434, + "loss": 0.0103, + "num_input_tokens_seen": 125751952, + "step": 58265 + }, + { + "epoch": 9.505709624796085, + "grad_norm": 0.08126851171255112, + "learning_rate": 0.0006289419318712397, + "loss": 0.0298, + "num_input_tokens_seen": 125762992, + "step": 58270 + }, + { + "epoch": 9.50652528548124, + "grad_norm": 0.01057389285415411, + "learning_rate": 0.0006288731583340642, + "loss": 0.1771, + "num_input_tokens_seen": 125774480, + "step": 58275 + }, + { + "epoch": 9.507340946166394, + "grad_norm": 0.2808247208595276, + "learning_rate": 0.0006288043821851107, + "loss": 0.047, + "num_input_tokens_seen": 125784944, + "step": 58280 + }, + { + "epoch": 9.50815660685155, + "grad_norm": 0.49371376633644104, + "learning_rate": 0.000628735603425773, + "loss": 0.077, + "num_input_tokens_seen": 125795120, + "step": 58285 + }, + { + "epoch": 9.508972267536706, + "grad_norm": 0.26567205786705017, + "learning_rate": 0.0006286668220574448, + "loss": 0.1719, + "num_input_tokens_seen": 125806640, + "step": 58290 + }, + { + "epoch": 9.50978792822186, + "grad_norm": 0.14232409000396729, + "learning_rate": 0.0006285980380815204, + "loss": 0.0232, + "num_input_tokens_seen": 125817712, + "step": 58295 + }, + { + "epoch": 9.510603588907015, + "grad_norm": 0.1384144127368927, + "learning_rate": 0.0006285292514993936, + "loss": 0.1262, + "num_input_tokens_seen": 125829008, + "step": 58300 + }, + { + "epoch": 9.51141924959217, + "grad_norm": 0.26174396276474, + "learning_rate": 0.0006284604623124585, + "loss": 0.0489, + "num_input_tokens_seen": 125840016, + "step": 58305 + }, + { + "epoch": 9.512234910277325, + "grad_norm": 0.010654661804437637, + "learning_rate": 0.0006283916705221091, + "loss": 0.0556, + "num_input_tokens_seen": 125850608, + "step": 58310 + }, + { + "epoch": 9.513050570962479, + "grad_norm": 0.03549783676862717, + "learning_rate": 0.0006283228761297396, + "loss": 0.0209, + "num_input_tokens_seen": 125863312, + "step": 58315 + }, + { + "epoch": 9.513866231647635, + "grad_norm": 0.018784906715154648, + "learning_rate": 0.0006282540791367442, + "loss": 0.0222, + "num_input_tokens_seen": 125874960, + "step": 58320 + }, + { + "epoch": 9.51468189233279, + "grad_norm": 0.05858919024467468, + "learning_rate": 0.0006281852795445173, + "loss": 0.0412, + "num_input_tokens_seen": 125885264, + "step": 58325 + }, + { + "epoch": 9.515497553017944, + "grad_norm": 0.021804898977279663, + "learning_rate": 0.000628116477354453, + "loss": 0.0779, + "num_input_tokens_seen": 125896624, + "step": 58330 + }, + { + "epoch": 9.5163132137031, + "grad_norm": 0.26823222637176514, + "learning_rate": 0.0006280476725679457, + "loss": 0.0628, + "num_input_tokens_seen": 125907056, + "step": 58335 + }, + { + "epoch": 9.517128874388254, + "grad_norm": 0.039276231080293655, + "learning_rate": 0.00062797886518639, + "loss": 0.2067, + "num_input_tokens_seen": 125917616, + "step": 58340 + }, + { + "epoch": 9.51794453507341, + "grad_norm": 0.019215881824493408, + "learning_rate": 0.0006279100552111803, + "loss": 0.0206, + "num_input_tokens_seen": 125927984, + "step": 58345 + }, + { + "epoch": 9.518760195758565, + "grad_norm": 0.05691875144839287, + "learning_rate": 0.0006278412426437109, + "loss": 0.0278, + "num_input_tokens_seen": 125938736, + "step": 58350 + }, + { + "epoch": 9.51957585644372, + "grad_norm": 0.3272572457790375, + "learning_rate": 0.0006277724274853767, + "loss": 0.1703, + "num_input_tokens_seen": 125948816, + "step": 58355 + }, + { + "epoch": 9.520391517128875, + "grad_norm": 0.008914544247090816, + "learning_rate": 0.0006277036097375719, + "loss": 0.0398, + "num_input_tokens_seen": 125959504, + "step": 58360 + }, + { + "epoch": 9.521207177814029, + "grad_norm": 0.04219216853380203, + "learning_rate": 0.0006276347894016917, + "loss": 0.0651, + "num_input_tokens_seen": 125971504, + "step": 58365 + }, + { + "epoch": 9.522022838499185, + "grad_norm": 0.012456799857318401, + "learning_rate": 0.0006275659664791304, + "loss": 0.0536, + "num_input_tokens_seen": 125981840, + "step": 58370 + }, + { + "epoch": 9.522838499184338, + "grad_norm": 0.002332450356334448, + "learning_rate": 0.0006274971409712831, + "loss": 0.0749, + "num_input_tokens_seen": 125992432, + "step": 58375 + }, + { + "epoch": 9.523654159869494, + "grad_norm": 0.14431047439575195, + "learning_rate": 0.0006274283128795445, + "loss": 0.0813, + "num_input_tokens_seen": 126002576, + "step": 58380 + }, + { + "epoch": 9.52446982055465, + "grad_norm": 0.018734421581029892, + "learning_rate": 0.0006273594822053095, + "loss": 0.0367, + "num_input_tokens_seen": 126011920, + "step": 58385 + }, + { + "epoch": 9.525285481239804, + "grad_norm": 0.0065150149166584015, + "learning_rate": 0.000627290648949973, + "loss": 0.0642, + "num_input_tokens_seen": 126022864, + "step": 58390 + }, + { + "epoch": 9.52610114192496, + "grad_norm": 0.08785134553909302, + "learning_rate": 0.00062722181311493, + "loss": 0.0159, + "num_input_tokens_seen": 126034960, + "step": 58395 + }, + { + "epoch": 9.526916802610113, + "grad_norm": 0.16364598274230957, + "learning_rate": 0.0006271529747015755, + "loss": 0.0266, + "num_input_tokens_seen": 126046672, + "step": 58400 + }, + { + "epoch": 9.52773246329527, + "grad_norm": 0.03116810880601406, + "learning_rate": 0.0006270841337113047, + "loss": 0.025, + "num_input_tokens_seen": 126058000, + "step": 58405 + }, + { + "epoch": 9.528548123980425, + "grad_norm": 0.022778917104005814, + "learning_rate": 0.0006270152901455128, + "loss": 0.0316, + "num_input_tokens_seen": 126068496, + "step": 58410 + }, + { + "epoch": 9.529363784665579, + "grad_norm": 0.029924221336841583, + "learning_rate": 0.0006269464440055948, + "loss": 0.109, + "num_input_tokens_seen": 126078544, + "step": 58415 + }, + { + "epoch": 9.530179445350734, + "grad_norm": 0.2362830638885498, + "learning_rate": 0.0006268775952929462, + "loss": 0.2665, + "num_input_tokens_seen": 126089712, + "step": 58420 + }, + { + "epoch": 9.530995106035888, + "grad_norm": 0.008322135545313358, + "learning_rate": 0.000626808744008962, + "loss": 0.0829, + "num_input_tokens_seen": 126100656, + "step": 58425 + }, + { + "epoch": 9.531810766721044, + "grad_norm": 0.016326576471328735, + "learning_rate": 0.0006267398901550379, + "loss": 0.1158, + "num_input_tokens_seen": 126111376, + "step": 58430 + }, + { + "epoch": 9.5326264274062, + "grad_norm": 0.04521000385284424, + "learning_rate": 0.000626671033732569, + "loss": 0.0352, + "num_input_tokens_seen": 126122640, + "step": 58435 + }, + { + "epoch": 9.533442088091354, + "grad_norm": 0.022560684010386467, + "learning_rate": 0.0006266021747429511, + "loss": 0.0181, + "num_input_tokens_seen": 126132432, + "step": 58440 + }, + { + "epoch": 9.53425774877651, + "grad_norm": 0.09816017001867294, + "learning_rate": 0.0006265333131875794, + "loss": 0.0183, + "num_input_tokens_seen": 126142576, + "step": 58445 + }, + { + "epoch": 9.535073409461663, + "grad_norm": 0.012793656438589096, + "learning_rate": 0.0006264644490678496, + "loss": 0.0112, + "num_input_tokens_seen": 126153552, + "step": 58450 + }, + { + "epoch": 9.535889070146819, + "grad_norm": 0.013022114522755146, + "learning_rate": 0.0006263955823851571, + "loss": 0.0957, + "num_input_tokens_seen": 126165072, + "step": 58455 + }, + { + "epoch": 9.536704730831975, + "grad_norm": 0.007401151116937399, + "learning_rate": 0.0006263267131408981, + "loss": 0.0425, + "num_input_tokens_seen": 126175696, + "step": 58460 + }, + { + "epoch": 9.537520391517129, + "grad_norm": 0.004931100644171238, + "learning_rate": 0.0006262578413364679, + "loss": 0.0351, + "num_input_tokens_seen": 126187088, + "step": 58465 + }, + { + "epoch": 9.538336052202284, + "grad_norm": 0.5705454349517822, + "learning_rate": 0.0006261889669732624, + "loss": 0.1458, + "num_input_tokens_seen": 126198480, + "step": 58470 + }, + { + "epoch": 9.539151712887438, + "grad_norm": 0.11045108735561371, + "learning_rate": 0.0006261200900526773, + "loss": 0.0115, + "num_input_tokens_seen": 126209904, + "step": 58475 + }, + { + "epoch": 9.539967373572594, + "grad_norm": 0.004627849441021681, + "learning_rate": 0.0006260512105761086, + "loss": 0.0116, + "num_input_tokens_seen": 126219792, + "step": 58480 + }, + { + "epoch": 9.540783034257748, + "grad_norm": 0.0040384456515312195, + "learning_rate": 0.0006259823285449523, + "loss": 0.0818, + "num_input_tokens_seen": 126230256, + "step": 58485 + }, + { + "epoch": 9.541598694942904, + "grad_norm": 0.006887562572956085, + "learning_rate": 0.0006259134439606043, + "loss": 0.1213, + "num_input_tokens_seen": 126241136, + "step": 58490 + }, + { + "epoch": 9.54241435562806, + "grad_norm": 0.14346547424793243, + "learning_rate": 0.0006258445568244605, + "loss": 0.0289, + "num_input_tokens_seen": 126252048, + "step": 58495 + }, + { + "epoch": 9.543230016313213, + "grad_norm": 0.035686343908309937, + "learning_rate": 0.0006257756671379172, + "loss": 0.0138, + "num_input_tokens_seen": 126262320, + "step": 58500 + }, + { + "epoch": 9.544045676998369, + "grad_norm": 0.31903213262557983, + "learning_rate": 0.0006257067749023704, + "loss": 0.1391, + "num_input_tokens_seen": 126272464, + "step": 58505 + }, + { + "epoch": 9.544861337683523, + "grad_norm": 0.009612503461539745, + "learning_rate": 0.0006256378801192163, + "loss": 0.0345, + "num_input_tokens_seen": 126283984, + "step": 58510 + }, + { + "epoch": 9.545676998368679, + "grad_norm": 0.08942940086126328, + "learning_rate": 0.0006255689827898512, + "loss": 0.038, + "num_input_tokens_seen": 126295024, + "step": 58515 + }, + { + "epoch": 9.546492659053834, + "grad_norm": 0.005535644944757223, + "learning_rate": 0.0006255000829156714, + "loss": 0.0503, + "num_input_tokens_seen": 126305296, + "step": 58520 + }, + { + "epoch": 9.547308319738988, + "grad_norm": 0.005356675013899803, + "learning_rate": 0.0006254311804980733, + "loss": 0.0177, + "num_input_tokens_seen": 126317264, + "step": 58525 + }, + { + "epoch": 9.548123980424144, + "grad_norm": 0.020595025271177292, + "learning_rate": 0.0006253622755384531, + "loss": 0.0597, + "num_input_tokens_seen": 126326640, + "step": 58530 + }, + { + "epoch": 9.548939641109298, + "grad_norm": 0.01699174754321575, + "learning_rate": 0.0006252933680382074, + "loss": 0.0119, + "num_input_tokens_seen": 126337840, + "step": 58535 + }, + { + "epoch": 9.549755301794454, + "grad_norm": 0.011003238148987293, + "learning_rate": 0.0006252244579987327, + "loss": 0.0677, + "num_input_tokens_seen": 126347792, + "step": 58540 + }, + { + "epoch": 9.550570962479608, + "grad_norm": 0.062294624745845795, + "learning_rate": 0.0006251555454214254, + "loss": 0.0534, + "num_input_tokens_seen": 126359344, + "step": 58545 + }, + { + "epoch": 9.551386623164763, + "grad_norm": 0.1440536379814148, + "learning_rate": 0.0006250866303076822, + "loss": 0.0915, + "num_input_tokens_seen": 126370064, + "step": 58550 + }, + { + "epoch": 9.552202283849919, + "grad_norm": 0.026247220113873482, + "learning_rate": 0.0006250177126588998, + "loss": 0.0267, + "num_input_tokens_seen": 126380080, + "step": 58555 + }, + { + "epoch": 9.553017944535073, + "grad_norm": 0.09349619597196579, + "learning_rate": 0.0006249487924764747, + "loss": 0.0114, + "num_input_tokens_seen": 126390864, + "step": 58560 + }, + { + "epoch": 9.553833605220229, + "grad_norm": 0.006312210112810135, + "learning_rate": 0.000624879869761804, + "loss": 0.0647, + "num_input_tokens_seen": 126402224, + "step": 58565 + }, + { + "epoch": 9.554649265905383, + "grad_norm": 0.0071671647019684315, + "learning_rate": 0.0006248109445162843, + "loss": 0.0136, + "num_input_tokens_seen": 126412464, + "step": 58570 + }, + { + "epoch": 9.555464926590538, + "grad_norm": 0.09314778447151184, + "learning_rate": 0.0006247420167413124, + "loss": 0.1706, + "num_input_tokens_seen": 126423600, + "step": 58575 + }, + { + "epoch": 9.556280587275694, + "grad_norm": 0.18284116685390472, + "learning_rate": 0.0006246730864382853, + "loss": 0.0312, + "num_input_tokens_seen": 126434928, + "step": 58580 + }, + { + "epoch": 9.557096247960848, + "grad_norm": 0.009490122087299824, + "learning_rate": 0.0006246041536086, + "loss": 0.05, + "num_input_tokens_seen": 126444944, + "step": 58585 + }, + { + "epoch": 9.557911908646004, + "grad_norm": 0.021935174241662025, + "learning_rate": 0.0006245352182536535, + "loss": 0.053, + "num_input_tokens_seen": 126456208, + "step": 58590 + }, + { + "epoch": 9.558727569331158, + "grad_norm": 0.051720015704631805, + "learning_rate": 0.0006244662803748427, + "loss": 0.0466, + "num_input_tokens_seen": 126467312, + "step": 58595 + }, + { + "epoch": 9.559543230016313, + "grad_norm": 0.03883758559823036, + "learning_rate": 0.0006243973399735649, + "loss": 0.0436, + "num_input_tokens_seen": 126476656, + "step": 58600 + }, + { + "epoch": 9.560358890701469, + "grad_norm": 0.04474123939871788, + "learning_rate": 0.0006243283970512172, + "loss": 0.0616, + "num_input_tokens_seen": 126487920, + "step": 58605 + }, + { + "epoch": 9.561174551386623, + "grad_norm": 0.04152955859899521, + "learning_rate": 0.0006242594516091967, + "loss": 0.0225, + "num_input_tokens_seen": 126499216, + "step": 58610 + }, + { + "epoch": 9.561990212071779, + "grad_norm": 0.03126515448093414, + "learning_rate": 0.000624190503648901, + "loss": 0.0153, + "num_input_tokens_seen": 126509936, + "step": 58615 + }, + { + "epoch": 9.562805872756933, + "grad_norm": 0.31543293595314026, + "learning_rate": 0.000624121553171727, + "loss": 0.1393, + "num_input_tokens_seen": 126521776, + "step": 58620 + }, + { + "epoch": 9.563621533442088, + "grad_norm": 0.12055902183055878, + "learning_rate": 0.0006240526001790723, + "loss": 0.0124, + "num_input_tokens_seen": 126532464, + "step": 58625 + }, + { + "epoch": 9.564437194127244, + "grad_norm": 0.023954760283231735, + "learning_rate": 0.0006239836446723343, + "loss": 0.0272, + "num_input_tokens_seen": 126542992, + "step": 58630 + }, + { + "epoch": 9.565252854812398, + "grad_norm": 0.261066198348999, + "learning_rate": 0.0006239146866529105, + "loss": 0.1152, + "num_input_tokens_seen": 126553680, + "step": 58635 + }, + { + "epoch": 9.566068515497554, + "grad_norm": 0.02144385129213333, + "learning_rate": 0.0006238457261221983, + "loss": 0.0094, + "num_input_tokens_seen": 126563440, + "step": 58640 + }, + { + "epoch": 9.566884176182707, + "grad_norm": 0.09852153062820435, + "learning_rate": 0.0006237767630815955, + "loss": 0.1545, + "num_input_tokens_seen": 126573904, + "step": 58645 + }, + { + "epoch": 9.567699836867863, + "grad_norm": 0.34472477436065674, + "learning_rate": 0.0006237077975324994, + "loss": 0.2388, + "num_input_tokens_seen": 126584560, + "step": 58650 + }, + { + "epoch": 9.568515497553017, + "grad_norm": 0.016418633982539177, + "learning_rate": 0.0006236388294763079, + "loss": 0.0091, + "num_input_tokens_seen": 126595632, + "step": 58655 + }, + { + "epoch": 9.569331158238173, + "grad_norm": 0.009331640787422657, + "learning_rate": 0.0006235698589144188, + "loss": 0.0193, + "num_input_tokens_seen": 126606800, + "step": 58660 + }, + { + "epoch": 9.570146818923329, + "grad_norm": 0.3153429329395294, + "learning_rate": 0.0006235008858482295, + "loss": 0.0477, + "num_input_tokens_seen": 126617808, + "step": 58665 + }, + { + "epoch": 9.570962479608482, + "grad_norm": 0.028084006160497665, + "learning_rate": 0.0006234319102791382, + "loss": 0.1567, + "num_input_tokens_seen": 126628720, + "step": 58670 + }, + { + "epoch": 9.571778140293638, + "grad_norm": 0.005167034454643726, + "learning_rate": 0.0006233629322085427, + "loss": 0.0279, + "num_input_tokens_seen": 126639664, + "step": 58675 + }, + { + "epoch": 9.572593800978792, + "grad_norm": 0.015054023824632168, + "learning_rate": 0.0006232939516378408, + "loss": 0.0317, + "num_input_tokens_seen": 126650736, + "step": 58680 + }, + { + "epoch": 9.573409461663948, + "grad_norm": 0.40828806161880493, + "learning_rate": 0.0006232249685684306, + "loss": 0.12, + "num_input_tokens_seen": 126662224, + "step": 58685 + }, + { + "epoch": 9.574225122349104, + "grad_norm": 0.21481026709079742, + "learning_rate": 0.0006231559830017102, + "loss": 0.0334, + "num_input_tokens_seen": 126673040, + "step": 58690 + }, + { + "epoch": 9.575040783034257, + "grad_norm": 0.0038407742977142334, + "learning_rate": 0.0006230869949390774, + "loss": 0.0321, + "num_input_tokens_seen": 126683824, + "step": 58695 + }, + { + "epoch": 9.575856443719413, + "grad_norm": 0.03867165744304657, + "learning_rate": 0.0006230180043819306, + "loss": 0.0455, + "num_input_tokens_seen": 126693488, + "step": 58700 + }, + { + "epoch": 9.576672104404567, + "grad_norm": 0.30149421095848083, + "learning_rate": 0.0006229490113316678, + "loss": 0.0701, + "num_input_tokens_seen": 126703888, + "step": 58705 + }, + { + "epoch": 9.577487765089723, + "grad_norm": 0.0037768434267491102, + "learning_rate": 0.0006228800157896874, + "loss": 0.139, + "num_input_tokens_seen": 126715440, + "step": 58710 + }, + { + "epoch": 9.578303425774878, + "grad_norm": 0.037405047565698624, + "learning_rate": 0.0006228110177573876, + "loss": 0.0382, + "num_input_tokens_seen": 126725680, + "step": 58715 + }, + { + "epoch": 9.579119086460032, + "grad_norm": 0.010584558360278606, + "learning_rate": 0.0006227420172361667, + "loss": 0.0112, + "num_input_tokens_seen": 126736144, + "step": 58720 + }, + { + "epoch": 9.579934747145188, + "grad_norm": 0.3380958139896393, + "learning_rate": 0.0006226730142274232, + "loss": 0.0634, + "num_input_tokens_seen": 126747504, + "step": 58725 + }, + { + "epoch": 9.580750407830342, + "grad_norm": 0.00872585829347372, + "learning_rate": 0.0006226040087325553, + "loss": 0.095, + "num_input_tokens_seen": 126757904, + "step": 58730 + }, + { + "epoch": 9.581566068515498, + "grad_norm": 0.14456132054328918, + "learning_rate": 0.0006225350007529616, + "loss": 0.0235, + "num_input_tokens_seen": 126768400, + "step": 58735 + }, + { + "epoch": 9.582381729200652, + "grad_norm": 0.3002058267593384, + "learning_rate": 0.0006224659902900408, + "loss": 0.1644, + "num_input_tokens_seen": 126779440, + "step": 58740 + }, + { + "epoch": 9.583197389885807, + "grad_norm": 0.35804423689842224, + "learning_rate": 0.0006223969773451913, + "loss": 0.0834, + "num_input_tokens_seen": 126790672, + "step": 58745 + }, + { + "epoch": 9.584013050570963, + "grad_norm": 0.051532503217458725, + "learning_rate": 0.0006223279619198118, + "loss": 0.0328, + "num_input_tokens_seen": 126799824, + "step": 58750 + }, + { + "epoch": 9.584828711256117, + "grad_norm": 0.19145216047763824, + "learning_rate": 0.000622258944015301, + "loss": 0.0381, + "num_input_tokens_seen": 126811248, + "step": 58755 + }, + { + "epoch": 9.585644371941273, + "grad_norm": 0.24259068071842194, + "learning_rate": 0.0006221899236330575, + "loss": 0.1544, + "num_input_tokens_seen": 126822256, + "step": 58760 + }, + { + "epoch": 9.586460032626427, + "grad_norm": 0.00626565283164382, + "learning_rate": 0.0006221209007744803, + "loss": 0.0611, + "num_input_tokens_seen": 126833840, + "step": 58765 + }, + { + "epoch": 9.587275693311582, + "grad_norm": 0.015487391501665115, + "learning_rate": 0.0006220518754409681, + "loss": 0.0198, + "num_input_tokens_seen": 126846160, + "step": 58770 + }, + { + "epoch": 9.588091353996738, + "grad_norm": 0.30165696144104004, + "learning_rate": 0.0006219828476339195, + "loss": 0.027, + "num_input_tokens_seen": 126857904, + "step": 58775 + }, + { + "epoch": 9.588907014681892, + "grad_norm": 0.04762418568134308, + "learning_rate": 0.0006219138173547341, + "loss": 0.0315, + "num_input_tokens_seen": 126869232, + "step": 58780 + }, + { + "epoch": 9.589722675367048, + "grad_norm": 0.0023494369816035032, + "learning_rate": 0.0006218447846048106, + "loss": 0.0201, + "num_input_tokens_seen": 126880784, + "step": 58785 + }, + { + "epoch": 9.590538336052202, + "grad_norm": 0.5345596075057983, + "learning_rate": 0.0006217757493855477, + "loss": 0.0936, + "num_input_tokens_seen": 126891600, + "step": 58790 + }, + { + "epoch": 9.591353996737357, + "grad_norm": 0.10450033843517303, + "learning_rate": 0.0006217067116983449, + "loss": 0.0627, + "num_input_tokens_seen": 126903056, + "step": 58795 + }, + { + "epoch": 9.592169657422513, + "grad_norm": 0.23620909452438354, + "learning_rate": 0.0006216376715446011, + "loss": 0.0562, + "num_input_tokens_seen": 126914032, + "step": 58800 + }, + { + "epoch": 9.592985318107667, + "grad_norm": 0.006754655856639147, + "learning_rate": 0.0006215686289257156, + "loss": 0.0849, + "num_input_tokens_seen": 126923920, + "step": 58805 + }, + { + "epoch": 9.593800978792823, + "grad_norm": 0.022846341133117676, + "learning_rate": 0.0006214995838430878, + "loss": 0.0145, + "num_input_tokens_seen": 126935216, + "step": 58810 + }, + { + "epoch": 9.594616639477977, + "grad_norm": 0.019748380407691002, + "learning_rate": 0.0006214305362981167, + "loss": 0.0274, + "num_input_tokens_seen": 126944752, + "step": 58815 + }, + { + "epoch": 9.595432300163132, + "grad_norm": 0.15802407264709473, + "learning_rate": 0.0006213614862922015, + "loss": 0.1038, + "num_input_tokens_seen": 126955856, + "step": 58820 + }, + { + "epoch": 9.596247960848288, + "grad_norm": 0.11355206370353699, + "learning_rate": 0.0006212924338267421, + "loss": 0.1414, + "num_input_tokens_seen": 126966064, + "step": 58825 + }, + { + "epoch": 9.597063621533442, + "grad_norm": 0.003097974229604006, + "learning_rate": 0.0006212233789031376, + "loss": 0.0316, + "num_input_tokens_seen": 126977424, + "step": 58830 + }, + { + "epoch": 9.597879282218598, + "grad_norm": 0.22115854918956757, + "learning_rate": 0.0006211543215227874, + "loss": 0.0806, + "num_input_tokens_seen": 126987728, + "step": 58835 + }, + { + "epoch": 9.598694942903752, + "grad_norm": 0.012565471231937408, + "learning_rate": 0.0006210852616870913, + "loss": 0.0873, + "num_input_tokens_seen": 126999536, + "step": 58840 + }, + { + "epoch": 9.599510603588907, + "grad_norm": 0.032392483204603195, + "learning_rate": 0.0006210161993974488, + "loss": 0.127, + "num_input_tokens_seen": 127010384, + "step": 58845 + }, + { + "epoch": 9.600326264274061, + "grad_norm": 0.03774901479482651, + "learning_rate": 0.0006209471346552594, + "loss": 0.0106, + "num_input_tokens_seen": 127020176, + "step": 58850 + }, + { + "epoch": 9.601141924959217, + "grad_norm": 0.061348482966423035, + "learning_rate": 0.000620878067461923, + "loss": 0.0506, + "num_input_tokens_seen": 127030192, + "step": 58855 + }, + { + "epoch": 9.601957585644373, + "grad_norm": 0.04690258949995041, + "learning_rate": 0.0006208089978188392, + "loss": 0.0194, + "num_input_tokens_seen": 127041808, + "step": 58860 + }, + { + "epoch": 9.602773246329527, + "grad_norm": 0.004770675208419561, + "learning_rate": 0.0006207399257274077, + "loss": 0.0731, + "num_input_tokens_seen": 127052400, + "step": 58865 + }, + { + "epoch": 9.603588907014682, + "grad_norm": 0.005034871865063906, + "learning_rate": 0.0006206708511890286, + "loss": 0.1226, + "num_input_tokens_seen": 127063056, + "step": 58870 + }, + { + "epoch": 9.604404567699836, + "grad_norm": 0.02005901001393795, + "learning_rate": 0.0006206017742051014, + "loss": 0.0618, + "num_input_tokens_seen": 127073840, + "step": 58875 + }, + { + "epoch": 9.605220228384992, + "grad_norm": 0.0009396728710271418, + "learning_rate": 0.0006205326947770263, + "loss": 0.0197, + "num_input_tokens_seen": 127084656, + "step": 58880 + }, + { + "epoch": 9.606035889070148, + "grad_norm": 0.005184083245694637, + "learning_rate": 0.0006204636129062034, + "loss": 0.0156, + "num_input_tokens_seen": 127095984, + "step": 58885 + }, + { + "epoch": 9.606851549755302, + "grad_norm": 0.0037674717605113983, + "learning_rate": 0.0006203945285940325, + "loss": 0.046, + "num_input_tokens_seen": 127105776, + "step": 58890 + }, + { + "epoch": 9.607667210440457, + "grad_norm": 0.02215653844177723, + "learning_rate": 0.0006203254418419137, + "loss": 0.0676, + "num_input_tokens_seen": 127117424, + "step": 58895 + }, + { + "epoch": 9.608482871125611, + "grad_norm": 0.17537197470664978, + "learning_rate": 0.0006202563526512471, + "loss": 0.1701, + "num_input_tokens_seen": 127128240, + "step": 58900 + }, + { + "epoch": 9.609298531810767, + "grad_norm": 0.04860261455178261, + "learning_rate": 0.0006201872610234331, + "loss": 0.0384, + "num_input_tokens_seen": 127140176, + "step": 58905 + }, + { + "epoch": 9.61011419249592, + "grad_norm": 0.30008167028427124, + "learning_rate": 0.0006201181669598717, + "loss": 0.0499, + "num_input_tokens_seen": 127149392, + "step": 58910 + }, + { + "epoch": 9.610929853181077, + "grad_norm": 0.02218320220708847, + "learning_rate": 0.0006200490704619633, + "loss": 0.0169, + "num_input_tokens_seen": 127160496, + "step": 58915 + }, + { + "epoch": 9.611745513866232, + "grad_norm": 0.1904144138097763, + "learning_rate": 0.0006199799715311083, + "loss": 0.0165, + "num_input_tokens_seen": 127170544, + "step": 58920 + }, + { + "epoch": 9.612561174551386, + "grad_norm": 0.0004894585581496358, + "learning_rate": 0.0006199108701687068, + "loss": 0.0378, + "num_input_tokens_seen": 127182992, + "step": 58925 + }, + { + "epoch": 9.613376835236542, + "grad_norm": 0.003415751503780484, + "learning_rate": 0.0006198417663761596, + "loss": 0.0067, + "num_input_tokens_seen": 127194320, + "step": 58930 + }, + { + "epoch": 9.614192495921696, + "grad_norm": 0.02836139313876629, + "learning_rate": 0.0006197726601548667, + "loss": 0.0389, + "num_input_tokens_seen": 127205776, + "step": 58935 + }, + { + "epoch": 9.615008156606851, + "grad_norm": 0.34116217494010925, + "learning_rate": 0.0006197035515062291, + "loss": 0.0576, + "num_input_tokens_seen": 127217264, + "step": 58940 + }, + { + "epoch": 9.615823817292007, + "grad_norm": 0.19525346159934998, + "learning_rate": 0.0006196344404316472, + "loss": 0.0988, + "num_input_tokens_seen": 127229584, + "step": 58945 + }, + { + "epoch": 9.616639477977161, + "grad_norm": 0.19460611045360565, + "learning_rate": 0.0006195653269325214, + "loss": 0.0113, + "num_input_tokens_seen": 127239824, + "step": 58950 + }, + { + "epoch": 9.617455138662317, + "grad_norm": 0.02354166842997074, + "learning_rate": 0.0006194962110102528, + "loss": 0.0508, + "num_input_tokens_seen": 127250672, + "step": 58955 + }, + { + "epoch": 9.61827079934747, + "grad_norm": 0.040908332914114, + "learning_rate": 0.0006194270926662416, + "loss": 0.1344, + "num_input_tokens_seen": 127260496, + "step": 58960 + }, + { + "epoch": 9.619086460032626, + "grad_norm": 0.005967097822576761, + "learning_rate": 0.000619357971901889, + "loss": 0.1519, + "num_input_tokens_seen": 127272336, + "step": 58965 + }, + { + "epoch": 9.619902120717782, + "grad_norm": 0.020089037716388702, + "learning_rate": 0.0006192888487185958, + "loss": 0.0073, + "num_input_tokens_seen": 127284368, + "step": 58970 + }, + { + "epoch": 9.620717781402936, + "grad_norm": 0.010542241856455803, + "learning_rate": 0.0006192197231177627, + "loss": 0.0307, + "num_input_tokens_seen": 127294032, + "step": 58975 + }, + { + "epoch": 9.621533442088092, + "grad_norm": 0.004966467618942261, + "learning_rate": 0.0006191505951007906, + "loss": 0.0416, + "num_input_tokens_seen": 127304080, + "step": 58980 + }, + { + "epoch": 9.622349102773246, + "grad_norm": 0.22209875285625458, + "learning_rate": 0.0006190814646690805, + "loss": 0.0957, + "num_input_tokens_seen": 127315888, + "step": 58985 + }, + { + "epoch": 9.623164763458401, + "grad_norm": 0.14290091395378113, + "learning_rate": 0.0006190123318240335, + "loss": 0.0729, + "num_input_tokens_seen": 127326864, + "step": 58990 + }, + { + "epoch": 9.623980424143557, + "grad_norm": 0.019327791407704353, + "learning_rate": 0.0006189431965670507, + "loss": 0.0105, + "num_input_tokens_seen": 127336368, + "step": 58995 + }, + { + "epoch": 9.624796084828711, + "grad_norm": 0.01445768028497696, + "learning_rate": 0.0006188740588995331, + "loss": 0.0126, + "num_input_tokens_seen": 127347824, + "step": 59000 + }, + { + "epoch": 9.625611745513867, + "grad_norm": 0.2910337448120117, + "learning_rate": 0.000618804918822882, + "loss": 0.0889, + "num_input_tokens_seen": 127356656, + "step": 59005 + }, + { + "epoch": 9.62642740619902, + "grad_norm": 0.02562132477760315, + "learning_rate": 0.0006187357763384982, + "loss": 0.2167, + "num_input_tokens_seen": 127367120, + "step": 59010 + }, + { + "epoch": 9.627243066884176, + "grad_norm": 0.11825750023126602, + "learning_rate": 0.0006186666314477835, + "loss": 0.0962, + "num_input_tokens_seen": 127378800, + "step": 59015 + }, + { + "epoch": 9.62805872756933, + "grad_norm": 0.2626383304595947, + "learning_rate": 0.0006185974841521389, + "loss": 0.1408, + "num_input_tokens_seen": 127390000, + "step": 59020 + }, + { + "epoch": 9.628874388254486, + "grad_norm": 0.034753233194351196, + "learning_rate": 0.0006185283344529659, + "loss": 0.0286, + "num_input_tokens_seen": 127400144, + "step": 59025 + }, + { + "epoch": 9.629690048939642, + "grad_norm": 0.007227802649140358, + "learning_rate": 0.0006184591823516658, + "loss": 0.1032, + "num_input_tokens_seen": 127411568, + "step": 59030 + }, + { + "epoch": 9.630505709624796, + "grad_norm": 0.005829382222145796, + "learning_rate": 0.00061839002784964, + "loss": 0.041, + "num_input_tokens_seen": 127421520, + "step": 59035 + }, + { + "epoch": 9.631321370309951, + "grad_norm": 0.035855915397405624, + "learning_rate": 0.0006183208709482903, + "loss": 0.057, + "num_input_tokens_seen": 127433424, + "step": 59040 + }, + { + "epoch": 9.632137030995105, + "grad_norm": 0.3284017741680145, + "learning_rate": 0.0006182517116490179, + "loss": 0.0193, + "num_input_tokens_seen": 127443408, + "step": 59045 + }, + { + "epoch": 9.632952691680261, + "grad_norm": 0.02845144085586071, + "learning_rate": 0.0006181825499532247, + "loss": 0.0179, + "num_input_tokens_seen": 127454256, + "step": 59050 + }, + { + "epoch": 9.633768352365417, + "grad_norm": 0.009389417245984077, + "learning_rate": 0.000618113385862312, + "loss": 0.1183, + "num_input_tokens_seen": 127464624, + "step": 59055 + }, + { + "epoch": 9.63458401305057, + "grad_norm": 0.022526390850543976, + "learning_rate": 0.0006180442193776818, + "loss": 0.0209, + "num_input_tokens_seen": 127474192, + "step": 59060 + }, + { + "epoch": 9.635399673735726, + "grad_norm": 0.31376397609710693, + "learning_rate": 0.0006179750505007357, + "loss": 0.0573, + "num_input_tokens_seen": 127483056, + "step": 59065 + }, + { + "epoch": 9.63621533442088, + "grad_norm": 0.24588175117969513, + "learning_rate": 0.0006179058792328756, + "loss": 0.093, + "num_input_tokens_seen": 127493360, + "step": 59070 + }, + { + "epoch": 9.637030995106036, + "grad_norm": 0.18079529702663422, + "learning_rate": 0.0006178367055755032, + "loss": 0.1421, + "num_input_tokens_seen": 127503728, + "step": 59075 + }, + { + "epoch": 9.63784665579119, + "grad_norm": 0.013411983847618103, + "learning_rate": 0.0006177675295300206, + "loss": 0.0291, + "num_input_tokens_seen": 127514672, + "step": 59080 + }, + { + "epoch": 9.638662316476346, + "grad_norm": 0.003872817615047097, + "learning_rate": 0.0006176983510978296, + "loss": 0.0175, + "num_input_tokens_seen": 127525200, + "step": 59085 + }, + { + "epoch": 9.639477977161501, + "grad_norm": 0.06654699146747589, + "learning_rate": 0.000617629170280332, + "loss": 0.0996, + "num_input_tokens_seen": 127535824, + "step": 59090 + }, + { + "epoch": 9.640293637846655, + "grad_norm": 0.017611471936106682, + "learning_rate": 0.0006175599870789301, + "loss": 0.02, + "num_input_tokens_seen": 127546960, + "step": 59095 + }, + { + "epoch": 9.641109298531811, + "grad_norm": 0.20919202268123627, + "learning_rate": 0.000617490801495026, + "loss": 0.104, + "num_input_tokens_seen": 127557680, + "step": 59100 + }, + { + "epoch": 9.641924959216965, + "grad_norm": 0.026205115020275116, + "learning_rate": 0.0006174216135300219, + "loss": 0.0424, + "num_input_tokens_seen": 127567696, + "step": 59105 + }, + { + "epoch": 9.64274061990212, + "grad_norm": 0.01650651916861534, + "learning_rate": 0.0006173524231853197, + "loss": 0.0295, + "num_input_tokens_seen": 127578736, + "step": 59110 + }, + { + "epoch": 9.643556280587276, + "grad_norm": 0.03077739104628563, + "learning_rate": 0.0006172832304623217, + "loss": 0.018, + "num_input_tokens_seen": 127590224, + "step": 59115 + }, + { + "epoch": 9.64437194127243, + "grad_norm": 0.028544628992676735, + "learning_rate": 0.0006172140353624304, + "loss": 0.0124, + "num_input_tokens_seen": 127601328, + "step": 59120 + }, + { + "epoch": 9.645187601957586, + "grad_norm": 0.6344919204711914, + "learning_rate": 0.0006171448378870479, + "loss": 0.0947, + "num_input_tokens_seen": 127612464, + "step": 59125 + }, + { + "epoch": 9.64600326264274, + "grad_norm": 0.09339115023612976, + "learning_rate": 0.0006170756380375766, + "loss": 0.055, + "num_input_tokens_seen": 127622864, + "step": 59130 + }, + { + "epoch": 9.646818923327896, + "grad_norm": 0.08153016865253448, + "learning_rate": 0.000617006435815419, + "loss": 0.0092, + "num_input_tokens_seen": 127634000, + "step": 59135 + }, + { + "epoch": 9.647634584013051, + "grad_norm": 0.043956682085990906, + "learning_rate": 0.0006169372312219777, + "loss": 0.1149, + "num_input_tokens_seen": 127645264, + "step": 59140 + }, + { + "epoch": 9.648450244698205, + "grad_norm": 0.043870192021131516, + "learning_rate": 0.0006168680242586549, + "loss": 0.0086, + "num_input_tokens_seen": 127655376, + "step": 59145 + }, + { + "epoch": 9.649265905383361, + "grad_norm": 0.043046001344919205, + "learning_rate": 0.0006167988149268533, + "loss": 0.09, + "num_input_tokens_seen": 127665584, + "step": 59150 + }, + { + "epoch": 9.650081566068515, + "grad_norm": 0.027375293895602226, + "learning_rate": 0.0006167296032279757, + "loss": 0.0349, + "num_input_tokens_seen": 127675920, + "step": 59155 + }, + { + "epoch": 9.65089722675367, + "grad_norm": 0.14567482471466064, + "learning_rate": 0.0006166603891634245, + "loss": 0.0312, + "num_input_tokens_seen": 127686928, + "step": 59160 + }, + { + "epoch": 9.651712887438826, + "grad_norm": 0.009442988783121109, + "learning_rate": 0.0006165911727346025, + "loss": 0.0073, + "num_input_tokens_seen": 127697296, + "step": 59165 + }, + { + "epoch": 9.65252854812398, + "grad_norm": 0.024301307275891304, + "learning_rate": 0.0006165219539429126, + "loss": 0.0108, + "num_input_tokens_seen": 127707312, + "step": 59170 + }, + { + "epoch": 9.653344208809136, + "grad_norm": 0.005902509205043316, + "learning_rate": 0.0006164527327897574, + "loss": 0.0425, + "num_input_tokens_seen": 127719056, + "step": 59175 + }, + { + "epoch": 9.65415986949429, + "grad_norm": 0.04859452694654465, + "learning_rate": 0.0006163835092765399, + "loss": 0.0136, + "num_input_tokens_seen": 127730864, + "step": 59180 + }, + { + "epoch": 9.654975530179446, + "grad_norm": 0.19478990137577057, + "learning_rate": 0.0006163142834046629, + "loss": 0.0442, + "num_input_tokens_seen": 127740944, + "step": 59185 + }, + { + "epoch": 9.655791190864601, + "grad_norm": 0.003453123616054654, + "learning_rate": 0.0006162450551755295, + "loss": 0.0062, + "num_input_tokens_seen": 127752016, + "step": 59190 + }, + { + "epoch": 9.656606851549755, + "grad_norm": 0.002414074493572116, + "learning_rate": 0.0006161758245905423, + "loss": 0.0044, + "num_input_tokens_seen": 127762480, + "step": 59195 + }, + { + "epoch": 9.65742251223491, + "grad_norm": 0.007363767828792334, + "learning_rate": 0.0006161065916511047, + "loss": 0.0544, + "num_input_tokens_seen": 127773232, + "step": 59200 + }, + { + "epoch": 9.658238172920065, + "grad_norm": 0.039637234061956406, + "learning_rate": 0.0006160373563586199, + "loss": 0.078, + "num_input_tokens_seen": 127784112, + "step": 59205 + }, + { + "epoch": 9.65905383360522, + "grad_norm": 0.11190462857484818, + "learning_rate": 0.0006159681187144909, + "loss": 0.0326, + "num_input_tokens_seen": 127795600, + "step": 59210 + }, + { + "epoch": 9.659869494290374, + "grad_norm": 0.005917913746088743, + "learning_rate": 0.0006158988787201208, + "loss": 0.0227, + "num_input_tokens_seen": 127805648, + "step": 59215 + }, + { + "epoch": 9.66068515497553, + "grad_norm": 0.18789978325366974, + "learning_rate": 0.0006158296363769128, + "loss": 0.063, + "num_input_tokens_seen": 127816560, + "step": 59220 + }, + { + "epoch": 9.661500815660686, + "grad_norm": 0.13209493458271027, + "learning_rate": 0.0006157603916862703, + "loss": 0.072, + "num_input_tokens_seen": 127828784, + "step": 59225 + }, + { + "epoch": 9.66231647634584, + "grad_norm": 0.0633811429142952, + "learning_rate": 0.0006156911446495967, + "loss": 0.0129, + "num_input_tokens_seen": 127839088, + "step": 59230 + }, + { + "epoch": 9.663132137030995, + "grad_norm": 0.05683831870555878, + "learning_rate": 0.0006156218952682953, + "loss": 0.0562, + "num_input_tokens_seen": 127848016, + "step": 59235 + }, + { + "epoch": 9.66394779771615, + "grad_norm": 0.006049333605915308, + "learning_rate": 0.0006155526435437694, + "loss": 0.009, + "num_input_tokens_seen": 127858320, + "step": 59240 + }, + { + "epoch": 9.664763458401305, + "grad_norm": 0.014514550566673279, + "learning_rate": 0.0006154833894774226, + "loss": 0.1719, + "num_input_tokens_seen": 127869936, + "step": 59245 + }, + { + "epoch": 9.66557911908646, + "grad_norm": 0.004821360111236572, + "learning_rate": 0.0006154141330706586, + "loss": 0.0108, + "num_input_tokens_seen": 127881040, + "step": 59250 + }, + { + "epoch": 9.666394779771615, + "grad_norm": 0.07728572189807892, + "learning_rate": 0.0006153448743248805, + "loss": 0.1021, + "num_input_tokens_seen": 127891792, + "step": 59255 + }, + { + "epoch": 9.66721044045677, + "grad_norm": 0.21957984566688538, + "learning_rate": 0.0006152756132414924, + "loss": 0.1639, + "num_input_tokens_seen": 127903376, + "step": 59260 + }, + { + "epoch": 9.668026101141924, + "grad_norm": 0.08288101851940155, + "learning_rate": 0.0006152063498218977, + "loss": 0.0375, + "num_input_tokens_seen": 127914576, + "step": 59265 + }, + { + "epoch": 9.66884176182708, + "grad_norm": 0.008778671734035015, + "learning_rate": 0.0006151370840675001, + "loss": 0.0085, + "num_input_tokens_seen": 127924784, + "step": 59270 + }, + { + "epoch": 9.669657422512234, + "grad_norm": 0.003736205631867051, + "learning_rate": 0.0006150678159797034, + "loss": 0.0059, + "num_input_tokens_seen": 127934800, + "step": 59275 + }, + { + "epoch": 9.67047308319739, + "grad_norm": 0.25486451387405396, + "learning_rate": 0.0006149985455599115, + "loss": 0.1603, + "num_input_tokens_seen": 127945584, + "step": 59280 + }, + { + "epoch": 9.671288743882545, + "grad_norm": 0.018975313752889633, + "learning_rate": 0.0006149292728095283, + "loss": 0.0076, + "num_input_tokens_seen": 127956272, + "step": 59285 + }, + { + "epoch": 9.6721044045677, + "grad_norm": 0.1916586458683014, + "learning_rate": 0.0006148599977299575, + "loss": 0.0547, + "num_input_tokens_seen": 127966672, + "step": 59290 + }, + { + "epoch": 9.672920065252855, + "grad_norm": 0.4145491421222687, + "learning_rate": 0.0006147907203226031, + "loss": 0.1489, + "num_input_tokens_seen": 127978032, + "step": 59295 + }, + { + "epoch": 9.673735725938009, + "grad_norm": 0.04974278062582016, + "learning_rate": 0.0006147214405888692, + "loss": 0.0182, + "num_input_tokens_seen": 127989072, + "step": 59300 + }, + { + "epoch": 9.674551386623165, + "grad_norm": 0.005701778922230005, + "learning_rate": 0.0006146521585301596, + "loss": 0.0064, + "num_input_tokens_seen": 127999568, + "step": 59305 + }, + { + "epoch": 9.67536704730832, + "grad_norm": 0.01001247949898243, + "learning_rate": 0.0006145828741478788, + "loss": 0.0128, + "num_input_tokens_seen": 128010256, + "step": 59310 + }, + { + "epoch": 9.676182707993474, + "grad_norm": 0.006962750572711229, + "learning_rate": 0.0006145135874434305, + "loss": 0.0132, + "num_input_tokens_seen": 128020176, + "step": 59315 + }, + { + "epoch": 9.67699836867863, + "grad_norm": 0.0029077152721583843, + "learning_rate": 0.0006144442984182193, + "loss": 0.0462, + "num_input_tokens_seen": 128032496, + "step": 59320 + }, + { + "epoch": 9.677814029363784, + "grad_norm": 0.002774407621473074, + "learning_rate": 0.0006143750070736491, + "loss": 0.0062, + "num_input_tokens_seen": 128043632, + "step": 59325 + }, + { + "epoch": 9.67862969004894, + "grad_norm": 0.005361605901271105, + "learning_rate": 0.0006143057134111243, + "loss": 0.1307, + "num_input_tokens_seen": 128054608, + "step": 59330 + }, + { + "epoch": 9.679445350734095, + "grad_norm": 0.016183340921998024, + "learning_rate": 0.0006142364174320492, + "loss": 0.0147, + "num_input_tokens_seen": 128065360, + "step": 59335 + }, + { + "epoch": 9.68026101141925, + "grad_norm": 0.005683739669620991, + "learning_rate": 0.0006141671191378281, + "loss": 0.0572, + "num_input_tokens_seen": 128076464, + "step": 59340 + }, + { + "epoch": 9.681076672104405, + "grad_norm": 0.07049499452114105, + "learning_rate": 0.0006140978185298656, + "loss": 0.0244, + "num_input_tokens_seen": 128086320, + "step": 59345 + }, + { + "epoch": 9.681892332789559, + "grad_norm": 0.016755200922489166, + "learning_rate": 0.0006140285156095661, + "loss": 0.0091, + "num_input_tokens_seen": 128096208, + "step": 59350 + }, + { + "epoch": 9.682707993474715, + "grad_norm": 0.07895629853010178, + "learning_rate": 0.0006139592103783339, + "loss": 0.0662, + "num_input_tokens_seen": 128107376, + "step": 59355 + }, + { + "epoch": 9.68352365415987, + "grad_norm": 0.10353441536426544, + "learning_rate": 0.000613889902837574, + "loss": 0.0229, + "num_input_tokens_seen": 128118800, + "step": 59360 + }, + { + "epoch": 9.684339314845024, + "grad_norm": 0.08477051556110382, + "learning_rate": 0.0006138205929886905, + "loss": 0.1583, + "num_input_tokens_seen": 128131152, + "step": 59365 + }, + { + "epoch": 9.68515497553018, + "grad_norm": 0.06514527648687363, + "learning_rate": 0.0006137512808330884, + "loss": 0.0375, + "num_input_tokens_seen": 128142480, + "step": 59370 + }, + { + "epoch": 9.685970636215334, + "grad_norm": 0.07561139017343521, + "learning_rate": 0.0006136819663721722, + "loss": 0.0104, + "num_input_tokens_seen": 128152816, + "step": 59375 + }, + { + "epoch": 9.68678629690049, + "grad_norm": 0.015533429570496082, + "learning_rate": 0.0006136126496073469, + "loss": 0.0633, + "num_input_tokens_seen": 128163632, + "step": 59380 + }, + { + "epoch": 9.687601957585644, + "grad_norm": 0.008105958811938763, + "learning_rate": 0.0006135433305400169, + "loss": 0.0047, + "num_input_tokens_seen": 128174640, + "step": 59385 + }, + { + "epoch": 9.6884176182708, + "grad_norm": 0.3064955472946167, + "learning_rate": 0.0006134740091715875, + "loss": 0.1008, + "num_input_tokens_seen": 128184144, + "step": 59390 + }, + { + "epoch": 9.689233278955955, + "grad_norm": 0.01269688829779625, + "learning_rate": 0.0006134046855034631, + "loss": 0.0134, + "num_input_tokens_seen": 128192976, + "step": 59395 + }, + { + "epoch": 9.690048939641109, + "grad_norm": 0.0018515820847824216, + "learning_rate": 0.0006133353595370491, + "loss": 0.016, + "num_input_tokens_seen": 128204144, + "step": 59400 + }, + { + "epoch": 9.690864600326265, + "grad_norm": 0.0071369558572769165, + "learning_rate": 0.0006132660312737502, + "loss": 0.0104, + "num_input_tokens_seen": 128215600, + "step": 59405 + }, + { + "epoch": 9.691680261011419, + "grad_norm": 0.09405604004859924, + "learning_rate": 0.0006131967007149716, + "loss": 0.0682, + "num_input_tokens_seen": 128226672, + "step": 59410 + }, + { + "epoch": 9.692495921696574, + "grad_norm": 0.23855988681316376, + "learning_rate": 0.000613127367862118, + "loss": 0.1143, + "num_input_tokens_seen": 128237936, + "step": 59415 + }, + { + "epoch": 9.69331158238173, + "grad_norm": 0.027761278674006462, + "learning_rate": 0.0006130580327165949, + "loss": 0.0075, + "num_input_tokens_seen": 128248688, + "step": 59420 + }, + { + "epoch": 9.694127243066884, + "grad_norm": 0.1607711762189865, + "learning_rate": 0.0006129886952798074, + "loss": 0.0407, + "num_input_tokens_seen": 128259440, + "step": 59425 + }, + { + "epoch": 9.69494290375204, + "grad_norm": 0.04297984018921852, + "learning_rate": 0.0006129193555531606, + "loss": 0.0903, + "num_input_tokens_seen": 128268880, + "step": 59430 + }, + { + "epoch": 9.695758564437194, + "grad_norm": 0.04358898848295212, + "learning_rate": 0.0006128500135380598, + "loss": 0.0133, + "num_input_tokens_seen": 128277744, + "step": 59435 + }, + { + "epoch": 9.69657422512235, + "grad_norm": 0.021122734993696213, + "learning_rate": 0.0006127806692359103, + "loss": 0.0231, + "num_input_tokens_seen": 128288176, + "step": 59440 + }, + { + "epoch": 9.697389885807503, + "grad_norm": 0.1159692034125328, + "learning_rate": 0.0006127113226481175, + "loss": 0.0168, + "num_input_tokens_seen": 128298672, + "step": 59445 + }, + { + "epoch": 9.698205546492659, + "grad_norm": 0.003028104081749916, + "learning_rate": 0.0006126419737760868, + "loss": 0.0038, + "num_input_tokens_seen": 128309264, + "step": 59450 + }, + { + "epoch": 9.699021207177815, + "grad_norm": 0.0046808598563075066, + "learning_rate": 0.0006125726226212236, + "loss": 0.1239, + "num_input_tokens_seen": 128320656, + "step": 59455 + }, + { + "epoch": 9.699836867862969, + "grad_norm": 0.03163069114089012, + "learning_rate": 0.0006125032691849333, + "loss": 0.0478, + "num_input_tokens_seen": 128332016, + "step": 59460 + }, + { + "epoch": 9.700652528548124, + "grad_norm": 0.37062719464302063, + "learning_rate": 0.0006124339134686216, + "loss": 0.0366, + "num_input_tokens_seen": 128342448, + "step": 59465 + }, + { + "epoch": 9.701468189233278, + "grad_norm": 0.020252849906682968, + "learning_rate": 0.0006123645554736941, + "loss": 0.0274, + "num_input_tokens_seen": 128352368, + "step": 59470 + }, + { + "epoch": 9.702283849918434, + "grad_norm": 0.014505825936794281, + "learning_rate": 0.0006122951952015562, + "loss": 0.0482, + "num_input_tokens_seen": 128361488, + "step": 59475 + }, + { + "epoch": 9.70309951060359, + "grad_norm": 0.061234936118125916, + "learning_rate": 0.0006122258326536138, + "loss": 0.1314, + "num_input_tokens_seen": 128371632, + "step": 59480 + }, + { + "epoch": 9.703915171288743, + "grad_norm": 0.0282104704529047, + "learning_rate": 0.0006121564678312724, + "loss": 0.2553, + "num_input_tokens_seen": 128380688, + "step": 59485 + }, + { + "epoch": 9.7047308319739, + "grad_norm": 0.06193218752741814, + "learning_rate": 0.0006120871007359381, + "loss": 0.1717, + "num_input_tokens_seen": 128390480, + "step": 59490 + }, + { + "epoch": 9.705546492659053, + "grad_norm": 0.03727657347917557, + "learning_rate": 0.0006120177313690164, + "loss": 0.0886, + "num_input_tokens_seen": 128401712, + "step": 59495 + }, + { + "epoch": 9.706362153344209, + "grad_norm": 0.0025384698528796434, + "learning_rate": 0.0006119483597319132, + "loss": 0.0188, + "num_input_tokens_seen": 128413040, + "step": 59500 + }, + { + "epoch": 9.707177814029365, + "grad_norm": 0.008450605906546116, + "learning_rate": 0.0006118789858260347, + "loss": 0.0342, + "num_input_tokens_seen": 128421744, + "step": 59505 + }, + { + "epoch": 9.707993474714518, + "grad_norm": 0.23573344945907593, + "learning_rate": 0.0006118096096527863, + "loss": 0.1022, + "num_input_tokens_seen": 128432528, + "step": 59510 + }, + { + "epoch": 9.708809135399674, + "grad_norm": 0.003673044266179204, + "learning_rate": 0.0006117402312135746, + "loss": 0.0251, + "num_input_tokens_seen": 128443664, + "step": 59515 + }, + { + "epoch": 9.709624796084828, + "grad_norm": 0.001616779831238091, + "learning_rate": 0.0006116708505098051, + "loss": 0.0062, + "num_input_tokens_seen": 128454256, + "step": 59520 + }, + { + "epoch": 9.710440456769984, + "grad_norm": 0.08030480146408081, + "learning_rate": 0.0006116014675428842, + "loss": 0.0294, + "num_input_tokens_seen": 128466768, + "step": 59525 + }, + { + "epoch": 9.71125611745514, + "grad_norm": 0.010359351523220539, + "learning_rate": 0.0006115320823142182, + "loss": 0.0684, + "num_input_tokens_seen": 128477264, + "step": 59530 + }, + { + "epoch": 9.712071778140293, + "grad_norm": 0.07781039923429489, + "learning_rate": 0.000611462694825213, + "loss": 0.0674, + "num_input_tokens_seen": 128489040, + "step": 59535 + }, + { + "epoch": 9.71288743882545, + "grad_norm": 0.30486807227134705, + "learning_rate": 0.0006113933050772749, + "loss": 0.0245, + "num_input_tokens_seen": 128500176, + "step": 59540 + }, + { + "epoch": 9.713703099510603, + "grad_norm": 0.006596557330340147, + "learning_rate": 0.00061132391307181, + "loss": 0.0625, + "num_input_tokens_seen": 128510096, + "step": 59545 + }, + { + "epoch": 9.714518760195759, + "grad_norm": 0.004321752116084099, + "learning_rate": 0.0006112545188102249, + "loss": 0.035, + "num_input_tokens_seen": 128520976, + "step": 59550 + }, + { + "epoch": 9.715334420880914, + "grad_norm": 0.008625321090221405, + "learning_rate": 0.0006111851222939257, + "loss": 0.0594, + "num_input_tokens_seen": 128532080, + "step": 59555 + }, + { + "epoch": 9.716150081566068, + "grad_norm": 0.05959155410528183, + "learning_rate": 0.0006111157235243192, + "loss": 0.1803, + "num_input_tokens_seen": 128544176, + "step": 59560 + }, + { + "epoch": 9.716965742251224, + "grad_norm": 0.006775570102035999, + "learning_rate": 0.0006110463225028114, + "loss": 0.103, + "num_input_tokens_seen": 128555472, + "step": 59565 + }, + { + "epoch": 9.717781402936378, + "grad_norm": 0.008153521455824375, + "learning_rate": 0.0006109769192308091, + "loss": 0.0149, + "num_input_tokens_seen": 128565968, + "step": 59570 + }, + { + "epoch": 9.718597063621534, + "grad_norm": 0.23950019478797913, + "learning_rate": 0.0006109075137097188, + "loss": 0.0795, + "num_input_tokens_seen": 128576528, + "step": 59575 + }, + { + "epoch": 9.719412724306688, + "grad_norm": 0.06137224659323692, + "learning_rate": 0.0006108381059409469, + "loss": 0.0439, + "num_input_tokens_seen": 128586928, + "step": 59580 + }, + { + "epoch": 9.720228384991843, + "grad_norm": 0.501958966255188, + "learning_rate": 0.0006107686959259003, + "loss": 0.1304, + "num_input_tokens_seen": 128599280, + "step": 59585 + }, + { + "epoch": 9.721044045676999, + "grad_norm": 0.008308853022754192, + "learning_rate": 0.0006106992836659853, + "loss": 0.0212, + "num_input_tokens_seen": 128609424, + "step": 59590 + }, + { + "epoch": 9.721859706362153, + "grad_norm": 0.008345154114067554, + "learning_rate": 0.0006106298691626091, + "loss": 0.0083, + "num_input_tokens_seen": 128620368, + "step": 59595 + }, + { + "epoch": 9.722675367047309, + "grad_norm": 0.31957361102104187, + "learning_rate": 0.0006105604524171782, + "loss": 0.0464, + "num_input_tokens_seen": 128631792, + "step": 59600 + }, + { + "epoch": 9.723491027732463, + "grad_norm": 0.22444653511047363, + "learning_rate": 0.0006104910334310996, + "loss": 0.028, + "num_input_tokens_seen": 128641584, + "step": 59605 + }, + { + "epoch": 9.724306688417618, + "grad_norm": 0.0019040057668462396, + "learning_rate": 0.0006104216122057799, + "loss": 0.1388, + "num_input_tokens_seen": 128651824, + "step": 59610 + }, + { + "epoch": 9.725122349102774, + "grad_norm": 0.02651318721473217, + "learning_rate": 0.0006103521887426262, + "loss": 0.146, + "num_input_tokens_seen": 128661616, + "step": 59615 + }, + { + "epoch": 9.725938009787928, + "grad_norm": 0.3366755247116089, + "learning_rate": 0.0006102827630430454, + "loss": 0.0452, + "num_input_tokens_seen": 128671952, + "step": 59620 + }, + { + "epoch": 9.726753670473084, + "grad_norm": 0.05788411945104599, + "learning_rate": 0.0006102133351084443, + "loss": 0.0271, + "num_input_tokens_seen": 128682928, + "step": 59625 + }, + { + "epoch": 9.727569331158238, + "grad_norm": 0.28643128275871277, + "learning_rate": 0.0006101439049402304, + "loss": 0.1437, + "num_input_tokens_seen": 128692592, + "step": 59630 + }, + { + "epoch": 9.728384991843393, + "grad_norm": 0.04104488343000412, + "learning_rate": 0.0006100744725398105, + "loss": 0.0094, + "num_input_tokens_seen": 128703280, + "step": 59635 + }, + { + "epoch": 9.729200652528547, + "grad_norm": 0.07834839820861816, + "learning_rate": 0.0006100050379085918, + "loss": 0.0107, + "num_input_tokens_seen": 128713904, + "step": 59640 + }, + { + "epoch": 9.730016313213703, + "grad_norm": 0.017662381753325462, + "learning_rate": 0.0006099356010479814, + "loss": 0.0066, + "num_input_tokens_seen": 128723696, + "step": 59645 + }, + { + "epoch": 9.730831973898859, + "grad_norm": 0.09404371678829193, + "learning_rate": 0.0006098661619593866, + "loss": 0.0309, + "num_input_tokens_seen": 128735120, + "step": 59650 + }, + { + "epoch": 9.731647634584013, + "grad_norm": 0.0023310815449804068, + "learning_rate": 0.0006097967206442147, + "loss": 0.1024, + "num_input_tokens_seen": 128745232, + "step": 59655 + }, + { + "epoch": 9.732463295269168, + "grad_norm": 0.10683703422546387, + "learning_rate": 0.0006097272771038728, + "loss": 0.019, + "num_input_tokens_seen": 128755856, + "step": 59660 + }, + { + "epoch": 9.733278955954322, + "grad_norm": 0.17117604613304138, + "learning_rate": 0.0006096578313397687, + "loss": 0.0205, + "num_input_tokens_seen": 128766768, + "step": 59665 + }, + { + "epoch": 9.734094616639478, + "grad_norm": 0.054592691361904144, + "learning_rate": 0.0006095883833533094, + "loss": 0.0247, + "num_input_tokens_seen": 128778448, + "step": 59670 + }, + { + "epoch": 9.734910277324634, + "grad_norm": 0.012505830265581608, + "learning_rate": 0.0006095189331459024, + "loss": 0.0879, + "num_input_tokens_seen": 128788176, + "step": 59675 + }, + { + "epoch": 9.735725938009788, + "grad_norm": 0.010917918756604195, + "learning_rate": 0.0006094494807189555, + "loss": 0.0116, + "num_input_tokens_seen": 128799280, + "step": 59680 + }, + { + "epoch": 9.736541598694943, + "grad_norm": 0.037680454552173615, + "learning_rate": 0.0006093800260738758, + "loss": 0.0141, + "num_input_tokens_seen": 128808752, + "step": 59685 + }, + { + "epoch": 9.737357259380097, + "grad_norm": 0.020594626665115356, + "learning_rate": 0.0006093105692120712, + "loss": 0.0052, + "num_input_tokens_seen": 128819728, + "step": 59690 + }, + { + "epoch": 9.738172920065253, + "grad_norm": 0.6141136288642883, + "learning_rate": 0.0006092411101349492, + "loss": 0.0704, + "num_input_tokens_seen": 128831440, + "step": 59695 + }, + { + "epoch": 9.738988580750409, + "grad_norm": 0.25520747900009155, + "learning_rate": 0.0006091716488439177, + "loss": 0.0212, + "num_input_tokens_seen": 128841200, + "step": 59700 + }, + { + "epoch": 9.739804241435563, + "grad_norm": 0.002609417075291276, + "learning_rate": 0.0006091021853403841, + "loss": 0.0334, + "num_input_tokens_seen": 128852112, + "step": 59705 + }, + { + "epoch": 9.740619902120718, + "grad_norm": 0.02499447949230671, + "learning_rate": 0.0006090327196257562, + "loss": 0.0188, + "num_input_tokens_seen": 128864592, + "step": 59710 + }, + { + "epoch": 9.741435562805872, + "grad_norm": 0.002854494843631983, + "learning_rate": 0.000608963251701442, + "loss": 0.0082, + "num_input_tokens_seen": 128876272, + "step": 59715 + }, + { + "epoch": 9.742251223491028, + "grad_norm": 0.2157122790813446, + "learning_rate": 0.0006088937815688495, + "loss": 0.0229, + "num_input_tokens_seen": 128887056, + "step": 59720 + }, + { + "epoch": 9.743066884176184, + "grad_norm": 0.2257642149925232, + "learning_rate": 0.0006088243092293861, + "loss": 0.0884, + "num_input_tokens_seen": 128898128, + "step": 59725 + }, + { + "epoch": 9.743882544861338, + "grad_norm": 0.15581852197647095, + "learning_rate": 0.0006087548346844601, + "loss": 0.0456, + "num_input_tokens_seen": 128909040, + "step": 59730 + }, + { + "epoch": 9.744698205546493, + "grad_norm": 0.0007019134354777634, + "learning_rate": 0.0006086853579354793, + "loss": 0.0129, + "num_input_tokens_seen": 128918288, + "step": 59735 + }, + { + "epoch": 9.745513866231647, + "grad_norm": 0.5110894441604614, + "learning_rate": 0.0006086158789838519, + "loss": 0.0574, + "num_input_tokens_seen": 128929584, + "step": 59740 + }, + { + "epoch": 9.746329526916803, + "grad_norm": 0.20720095932483673, + "learning_rate": 0.0006085463978309861, + "loss": 0.0472, + "num_input_tokens_seen": 128940208, + "step": 59745 + }, + { + "epoch": 9.747145187601957, + "grad_norm": 0.03007080778479576, + "learning_rate": 0.0006084769144782897, + "loss": 0.0699, + "num_input_tokens_seen": 128950224, + "step": 59750 + }, + { + "epoch": 9.747960848287113, + "grad_norm": 0.06690139323472977, + "learning_rate": 0.0006084074289271711, + "loss": 0.0227, + "num_input_tokens_seen": 128961072, + "step": 59755 + }, + { + "epoch": 9.748776508972268, + "grad_norm": 0.01135540846735239, + "learning_rate": 0.0006083379411790383, + "loss": 0.0112, + "num_input_tokens_seen": 128972528, + "step": 59760 + }, + { + "epoch": 9.749592169657422, + "grad_norm": 0.022065188735723495, + "learning_rate": 0.0006082684512352997, + "loss": 0.1275, + "num_input_tokens_seen": 128982480, + "step": 59765 + }, + { + "epoch": 9.750407830342578, + "grad_norm": 0.47740688920021057, + "learning_rate": 0.0006081989590973637, + "loss": 0.1524, + "num_input_tokens_seen": 128992304, + "step": 59770 + }, + { + "epoch": 9.751223491027732, + "grad_norm": 0.4422495663166046, + "learning_rate": 0.0006081294647666385, + "loss": 0.0372, + "num_input_tokens_seen": 129003344, + "step": 59775 + }, + { + "epoch": 9.752039151712887, + "grad_norm": 0.26727715134620667, + "learning_rate": 0.0006080599682445325, + "loss": 0.0314, + "num_input_tokens_seen": 129014224, + "step": 59780 + }, + { + "epoch": 9.752854812398043, + "grad_norm": 0.007251810748130083, + "learning_rate": 0.000607990469532454, + "loss": 0.0891, + "num_input_tokens_seen": 129024016, + "step": 59785 + }, + { + "epoch": 9.753670473083197, + "grad_norm": 0.007238328456878662, + "learning_rate": 0.0006079209686318119, + "loss": 0.0632, + "num_input_tokens_seen": 129033840, + "step": 59790 + }, + { + "epoch": 9.754486133768353, + "grad_norm": 0.1290450096130371, + "learning_rate": 0.0006078514655440144, + "loss": 0.0285, + "num_input_tokens_seen": 129044848, + "step": 59795 + }, + { + "epoch": 9.755301794453507, + "grad_norm": 0.011095109395682812, + "learning_rate": 0.0006077819602704702, + "loss": 0.0454, + "num_input_tokens_seen": 129055664, + "step": 59800 + }, + { + "epoch": 9.756117455138662, + "grad_norm": 0.013445612043142319, + "learning_rate": 0.0006077124528125877, + "loss": 0.1113, + "num_input_tokens_seen": 129066288, + "step": 59805 + }, + { + "epoch": 9.756933115823816, + "grad_norm": 0.05516577884554863, + "learning_rate": 0.0006076429431717757, + "loss": 0.0399, + "num_input_tokens_seen": 129076016, + "step": 59810 + }, + { + "epoch": 9.757748776508972, + "grad_norm": 0.010761004872620106, + "learning_rate": 0.000607573431349443, + "loss": 0.043, + "num_input_tokens_seen": 129086704, + "step": 59815 + }, + { + "epoch": 9.758564437194128, + "grad_norm": 0.04795242100954056, + "learning_rate": 0.0006075039173469982, + "loss": 0.0256, + "num_input_tokens_seen": 129096560, + "step": 59820 + }, + { + "epoch": 9.759380097879282, + "grad_norm": 0.5357078909873962, + "learning_rate": 0.0006074344011658501, + "loss": 0.1018, + "num_input_tokens_seen": 129107984, + "step": 59825 + }, + { + "epoch": 9.760195758564437, + "grad_norm": 0.006175927817821503, + "learning_rate": 0.0006073648828074077, + "loss": 0.04, + "num_input_tokens_seen": 129118352, + "step": 59830 + }, + { + "epoch": 9.761011419249591, + "grad_norm": 0.13918474316596985, + "learning_rate": 0.0006072953622730796, + "loss": 0.0729, + "num_input_tokens_seen": 129129072, + "step": 59835 + }, + { + "epoch": 9.761827079934747, + "grad_norm": 0.0017590216593816876, + "learning_rate": 0.0006072258395642748, + "loss": 0.0178, + "num_input_tokens_seen": 129140496, + "step": 59840 + }, + { + "epoch": 9.762642740619903, + "grad_norm": 0.003757696831598878, + "learning_rate": 0.0006071563146824024, + "loss": 0.0064, + "num_input_tokens_seen": 129151952, + "step": 59845 + }, + { + "epoch": 9.763458401305057, + "grad_norm": 0.16617871820926666, + "learning_rate": 0.0006070867876288715, + "loss": 0.1283, + "num_input_tokens_seen": 129165008, + "step": 59850 + }, + { + "epoch": 9.764274061990212, + "grad_norm": 0.23448771238327026, + "learning_rate": 0.0006070172584050908, + "loss": 0.1519, + "num_input_tokens_seen": 129176272, + "step": 59855 + }, + { + "epoch": 9.765089722675366, + "grad_norm": 0.01784597337245941, + "learning_rate": 0.0006069477270124697, + "loss": 0.0097, + "num_input_tokens_seen": 129187504, + "step": 59860 + }, + { + "epoch": 9.765905383360522, + "grad_norm": 0.004336201120167971, + "learning_rate": 0.0006068781934524172, + "loss": 0.025, + "num_input_tokens_seen": 129198928, + "step": 59865 + }, + { + "epoch": 9.766721044045678, + "grad_norm": 0.01495341770350933, + "learning_rate": 0.0006068086577263426, + "loss": 0.0108, + "num_input_tokens_seen": 129211024, + "step": 59870 + }, + { + "epoch": 9.767536704730832, + "grad_norm": 0.11997067928314209, + "learning_rate": 0.0006067391198356551, + "loss": 0.028, + "num_input_tokens_seen": 129222096, + "step": 59875 + }, + { + "epoch": 9.768352365415987, + "grad_norm": 0.24933722615242004, + "learning_rate": 0.0006066695797817638, + "loss": 0.1451, + "num_input_tokens_seen": 129232880, + "step": 59880 + }, + { + "epoch": 9.769168026101141, + "grad_norm": 0.41914379596710205, + "learning_rate": 0.0006066000375660782, + "loss": 0.0493, + "num_input_tokens_seen": 129244560, + "step": 59885 + }, + { + "epoch": 9.769983686786297, + "grad_norm": 0.3530498743057251, + "learning_rate": 0.0006065304931900076, + "loss": 0.1326, + "num_input_tokens_seen": 129255824, + "step": 59890 + }, + { + "epoch": 9.770799347471453, + "grad_norm": 0.16420510411262512, + "learning_rate": 0.0006064609466549614, + "loss": 0.1134, + "num_input_tokens_seen": 129267248, + "step": 59895 + }, + { + "epoch": 9.771615008156607, + "grad_norm": 0.04936949536204338, + "learning_rate": 0.0006063913979623491, + "loss": 0.0181, + "num_input_tokens_seen": 129277616, + "step": 59900 + }, + { + "epoch": 9.772430668841762, + "grad_norm": 0.0723864734172821, + "learning_rate": 0.0006063218471135801, + "loss": 0.018, + "num_input_tokens_seen": 129288112, + "step": 59905 + }, + { + "epoch": 9.773246329526916, + "grad_norm": 0.035946767777204514, + "learning_rate": 0.0006062522941100639, + "loss": 0.079, + "num_input_tokens_seen": 129298064, + "step": 59910 + }, + { + "epoch": 9.774061990212072, + "grad_norm": 0.013810939155519009, + "learning_rate": 0.0006061827389532103, + "loss": 0.006, + "num_input_tokens_seen": 129307408, + "step": 59915 + }, + { + "epoch": 9.774877650897226, + "grad_norm": 0.011505937203764915, + "learning_rate": 0.0006061131816444287, + "loss": 0.0337, + "num_input_tokens_seen": 129318128, + "step": 59920 + }, + { + "epoch": 9.775693311582382, + "grad_norm": 0.008924717083573341, + "learning_rate": 0.000606043622185129, + "loss": 0.0053, + "num_input_tokens_seen": 129328080, + "step": 59925 + }, + { + "epoch": 9.776508972267537, + "grad_norm": 0.033055588603019714, + "learning_rate": 0.0006059740605767207, + "loss": 0.0288, + "num_input_tokens_seen": 129339120, + "step": 59930 + }, + { + "epoch": 9.777324632952691, + "grad_norm": 0.3742935061454773, + "learning_rate": 0.0006059044968206136, + "loss": 0.1242, + "num_input_tokens_seen": 129349648, + "step": 59935 + }, + { + "epoch": 9.778140293637847, + "grad_norm": 0.07690007984638214, + "learning_rate": 0.0006058349309182176, + "loss": 0.0291, + "num_input_tokens_seen": 129360400, + "step": 59940 + }, + { + "epoch": 9.778955954323001, + "grad_norm": 0.2877650260925293, + "learning_rate": 0.0006057653628709424, + "loss": 0.1757, + "num_input_tokens_seen": 129373168, + "step": 59945 + }, + { + "epoch": 9.779771615008157, + "grad_norm": 0.012758604250848293, + "learning_rate": 0.0006056957926801979, + "loss": 0.0082, + "num_input_tokens_seen": 129382736, + "step": 59950 + }, + { + "epoch": 9.780587275693312, + "grad_norm": 0.022527890279889107, + "learning_rate": 0.0006056262203473941, + "loss": 0.1199, + "num_input_tokens_seen": 129394160, + "step": 59955 + }, + { + "epoch": 9.781402936378466, + "grad_norm": 0.020311087369918823, + "learning_rate": 0.000605556645873941, + "loss": 0.0592, + "num_input_tokens_seen": 129405744, + "step": 59960 + }, + { + "epoch": 9.782218597063622, + "grad_norm": 0.16453313827514648, + "learning_rate": 0.0006054870692612487, + "loss": 0.0221, + "num_input_tokens_seen": 129416368, + "step": 59965 + }, + { + "epoch": 9.783034257748776, + "grad_norm": 0.024318648502230644, + "learning_rate": 0.0006054174905107269, + "loss": 0.0479, + "num_input_tokens_seen": 129426928, + "step": 59970 + }, + { + "epoch": 9.783849918433932, + "grad_norm": 0.23672936856746674, + "learning_rate": 0.0006053479096237859, + "loss": 0.0309, + "num_input_tokens_seen": 129438896, + "step": 59975 + }, + { + "epoch": 9.784665579119086, + "grad_norm": 0.0013030687114223838, + "learning_rate": 0.000605278326601836, + "loss": 0.0091, + "num_input_tokens_seen": 129449808, + "step": 59980 + }, + { + "epoch": 9.785481239804241, + "grad_norm": 0.08723476529121399, + "learning_rate": 0.0006052087414462873, + "loss": 0.0293, + "num_input_tokens_seen": 129461232, + "step": 59985 + }, + { + "epoch": 9.786296900489397, + "grad_norm": 0.322837769985199, + "learning_rate": 0.00060513915415855, + "loss": 0.0767, + "num_input_tokens_seen": 129471984, + "step": 59990 + }, + { + "epoch": 9.78711256117455, + "grad_norm": 0.3453318476676941, + "learning_rate": 0.0006050695647400342, + "loss": 0.1067, + "num_input_tokens_seen": 129483088, + "step": 59995 + }, + { + "epoch": 9.787928221859707, + "grad_norm": 0.0707947239279747, + "learning_rate": 0.0006049999731921504, + "loss": 0.0631, + "num_input_tokens_seen": 129492144, + "step": 60000 + }, + { + "epoch": 9.78874388254486, + "grad_norm": 0.0010050504934042692, + "learning_rate": 0.0006049303795163091, + "loss": 0.042, + "num_input_tokens_seen": 129502128, + "step": 60005 + }, + { + "epoch": 9.789559543230016, + "grad_norm": 0.0322040393948555, + "learning_rate": 0.0006048607837139204, + "loss": 0.0731, + "num_input_tokens_seen": 129512848, + "step": 60010 + }, + { + "epoch": 9.790375203915172, + "grad_norm": 0.06374206393957138, + "learning_rate": 0.0006047911857863949, + "loss": 0.0395, + "num_input_tokens_seen": 129522992, + "step": 60015 + }, + { + "epoch": 9.791190864600326, + "grad_norm": 0.3759308159351349, + "learning_rate": 0.0006047215857351431, + "loss": 0.105, + "num_input_tokens_seen": 129533328, + "step": 60020 + }, + { + "epoch": 9.792006525285482, + "grad_norm": 0.05543672665953636, + "learning_rate": 0.0006046519835615756, + "loss": 0.0138, + "num_input_tokens_seen": 129542768, + "step": 60025 + }, + { + "epoch": 9.792822185970635, + "grad_norm": 0.007901391945779324, + "learning_rate": 0.0006045823792671029, + "loss": 0.146, + "num_input_tokens_seen": 129554448, + "step": 60030 + }, + { + "epoch": 9.793637846655791, + "grad_norm": 0.10499479621648788, + "learning_rate": 0.0006045127728531354, + "loss": 0.0231, + "num_input_tokens_seen": 129566736, + "step": 60035 + }, + { + "epoch": 9.794453507340947, + "grad_norm": 0.009346401318907738, + "learning_rate": 0.0006044431643210842, + "loss": 0.0456, + "num_input_tokens_seen": 129577392, + "step": 60040 + }, + { + "epoch": 9.7952691680261, + "grad_norm": 0.04194474220275879, + "learning_rate": 0.0006043735536723595, + "loss": 0.0143, + "num_input_tokens_seen": 129588080, + "step": 60045 + }, + { + "epoch": 9.796084828711257, + "grad_norm": 0.022022457793354988, + "learning_rate": 0.0006043039409083726, + "loss": 0.0169, + "num_input_tokens_seen": 129599312, + "step": 60050 + }, + { + "epoch": 9.79690048939641, + "grad_norm": 0.2469419538974762, + "learning_rate": 0.0006042343260305339, + "loss": 0.1176, + "num_input_tokens_seen": 129609488, + "step": 60055 + }, + { + "epoch": 9.797716150081566, + "grad_norm": 0.4165681302547455, + "learning_rate": 0.0006041647090402544, + "loss": 0.0984, + "num_input_tokens_seen": 129620944, + "step": 60060 + }, + { + "epoch": 9.798531810766722, + "grad_norm": 0.04143780469894409, + "learning_rate": 0.0006040950899389449, + "loss": 0.0682, + "num_input_tokens_seen": 129631920, + "step": 60065 + }, + { + "epoch": 9.799347471451876, + "grad_norm": 0.001754431868903339, + "learning_rate": 0.0006040254687280163, + "loss": 0.0252, + "num_input_tokens_seen": 129643696, + "step": 60070 + }, + { + "epoch": 9.800163132137031, + "grad_norm": 0.044457513839006424, + "learning_rate": 0.0006039558454088796, + "loss": 0.0372, + "num_input_tokens_seen": 129655184, + "step": 60075 + }, + { + "epoch": 9.800978792822185, + "grad_norm": 0.08150798827409744, + "learning_rate": 0.0006038862199829459, + "loss": 0.019, + "num_input_tokens_seen": 129665008, + "step": 60080 + }, + { + "epoch": 9.801794453507341, + "grad_norm": 0.0077219209633767605, + "learning_rate": 0.0006038165924516262, + "loss": 0.0133, + "num_input_tokens_seen": 129675344, + "step": 60085 + }, + { + "epoch": 9.802610114192497, + "grad_norm": 0.040708933025598526, + "learning_rate": 0.0006037469628163315, + "loss": 0.0758, + "num_input_tokens_seen": 129687120, + "step": 60090 + }, + { + "epoch": 9.80342577487765, + "grad_norm": 0.008336449041962624, + "learning_rate": 0.000603677331078473, + "loss": 0.0141, + "num_input_tokens_seen": 129697936, + "step": 60095 + }, + { + "epoch": 9.804241435562806, + "grad_norm": 0.31973516941070557, + "learning_rate": 0.0006036076972394618, + "loss": 0.0286, + "num_input_tokens_seen": 129707312, + "step": 60100 + }, + { + "epoch": 9.80505709624796, + "grad_norm": 0.014146130532026291, + "learning_rate": 0.0006035380613007093, + "loss": 0.0317, + "num_input_tokens_seen": 129718256, + "step": 60105 + }, + { + "epoch": 9.805872756933116, + "grad_norm": 0.056790124624967575, + "learning_rate": 0.0006034684232636266, + "loss": 0.0087, + "num_input_tokens_seen": 129728336, + "step": 60110 + }, + { + "epoch": 9.80668841761827, + "grad_norm": 0.42859262228012085, + "learning_rate": 0.0006033987831296251, + "loss": 0.1397, + "num_input_tokens_seen": 129738960, + "step": 60115 + }, + { + "epoch": 9.807504078303426, + "grad_norm": 0.26962050795555115, + "learning_rate": 0.0006033291409001159, + "loss": 0.227, + "num_input_tokens_seen": 129748752, + "step": 60120 + }, + { + "epoch": 9.808319738988581, + "grad_norm": 0.07892422378063202, + "learning_rate": 0.0006032594965765107, + "loss": 0.0248, + "num_input_tokens_seen": 129758864, + "step": 60125 + }, + { + "epoch": 9.809135399673735, + "grad_norm": 0.0507841594517231, + "learning_rate": 0.0006031898501602207, + "loss": 0.0466, + "num_input_tokens_seen": 129768112, + "step": 60130 + }, + { + "epoch": 9.809951060358891, + "grad_norm": 0.004107207991182804, + "learning_rate": 0.0006031202016526576, + "loss": 0.0413, + "num_input_tokens_seen": 129779440, + "step": 60135 + }, + { + "epoch": 9.810766721044045, + "grad_norm": 0.06342919170856476, + "learning_rate": 0.0006030505510552329, + "loss": 0.0127, + "num_input_tokens_seen": 129790320, + "step": 60140 + }, + { + "epoch": 9.8115823817292, + "grad_norm": 0.021396607160568237, + "learning_rate": 0.0006029808983693579, + "loss": 0.0114, + "num_input_tokens_seen": 129799792, + "step": 60145 + }, + { + "epoch": 9.812398042414356, + "grad_norm": 0.008777686394751072, + "learning_rate": 0.0006029112435964444, + "loss": 0.1202, + "num_input_tokens_seen": 129811472, + "step": 60150 + }, + { + "epoch": 9.81321370309951, + "grad_norm": 0.04003319889307022, + "learning_rate": 0.0006028415867379039, + "loss": 0.056, + "num_input_tokens_seen": 129822512, + "step": 60155 + }, + { + "epoch": 9.814029363784666, + "grad_norm": 0.0838267132639885, + "learning_rate": 0.0006027719277951482, + "loss": 0.0292, + "num_input_tokens_seen": 129834608, + "step": 60160 + }, + { + "epoch": 9.81484502446982, + "grad_norm": 0.013710466213524342, + "learning_rate": 0.000602702266769589, + "loss": 0.0048, + "num_input_tokens_seen": 129845008, + "step": 60165 + }, + { + "epoch": 9.815660685154976, + "grad_norm": 0.15006117522716522, + "learning_rate": 0.0006026326036626382, + "loss": 0.0223, + "num_input_tokens_seen": 129854928, + "step": 60170 + }, + { + "epoch": 9.81647634584013, + "grad_norm": 0.08420473337173462, + "learning_rate": 0.0006025629384757075, + "loss": 0.0548, + "num_input_tokens_seen": 129865904, + "step": 60175 + }, + { + "epoch": 9.817292006525285, + "grad_norm": 0.016414064913988113, + "learning_rate": 0.0006024932712102085, + "loss": 0.1455, + "num_input_tokens_seen": 129878288, + "step": 60180 + }, + { + "epoch": 9.818107667210441, + "grad_norm": 0.0027326825074851513, + "learning_rate": 0.0006024236018675537, + "loss": 0.0149, + "num_input_tokens_seen": 129890512, + "step": 60185 + }, + { + "epoch": 9.818923327895595, + "grad_norm": 0.026732657104730606, + "learning_rate": 0.0006023539304491544, + "loss": 0.1043, + "num_input_tokens_seen": 129900560, + "step": 60190 + }, + { + "epoch": 9.81973898858075, + "grad_norm": 0.01066410169005394, + "learning_rate": 0.000602284256956423, + "loss": 0.036, + "num_input_tokens_seen": 129912272, + "step": 60195 + }, + { + "epoch": 9.820554649265905, + "grad_norm": 0.001908585662022233, + "learning_rate": 0.0006022145813907713, + "loss": 0.1116, + "num_input_tokens_seen": 129923120, + "step": 60200 + }, + { + "epoch": 9.82137030995106, + "grad_norm": 0.0045760078355669975, + "learning_rate": 0.0006021449037536114, + "loss": 0.0051, + "num_input_tokens_seen": 129935280, + "step": 60205 + }, + { + "epoch": 9.822185970636216, + "grad_norm": 0.001607739133760333, + "learning_rate": 0.0006020752240463555, + "loss": 0.014, + "num_input_tokens_seen": 129946032, + "step": 60210 + }, + { + "epoch": 9.82300163132137, + "grad_norm": 0.055917371064424515, + "learning_rate": 0.0006020055422704156, + "loss": 0.0303, + "num_input_tokens_seen": 129956784, + "step": 60215 + }, + { + "epoch": 9.823817292006526, + "grad_norm": 0.07184579223394394, + "learning_rate": 0.0006019358584272042, + "loss": 0.0174, + "num_input_tokens_seen": 129967504, + "step": 60220 + }, + { + "epoch": 9.82463295269168, + "grad_norm": 0.09243524074554443, + "learning_rate": 0.0006018661725181332, + "loss": 0.1386, + "num_input_tokens_seen": 129977840, + "step": 60225 + }, + { + "epoch": 9.825448613376835, + "grad_norm": 0.002829383360221982, + "learning_rate": 0.0006017964845446149, + "loss": 0.0272, + "num_input_tokens_seen": 129988080, + "step": 60230 + }, + { + "epoch": 9.826264274061991, + "grad_norm": 0.055344466120004654, + "learning_rate": 0.0006017267945080618, + "loss": 0.0255, + "num_input_tokens_seen": 129999728, + "step": 60235 + }, + { + "epoch": 9.827079934747145, + "grad_norm": 0.13127902150154114, + "learning_rate": 0.000601657102409886, + "loss": 0.054, + "num_input_tokens_seen": 130010352, + "step": 60240 + }, + { + "epoch": 9.8278955954323, + "grad_norm": 0.24492299556732178, + "learning_rate": 0.0006015874082515003, + "loss": 0.0546, + "num_input_tokens_seen": 130021776, + "step": 60245 + }, + { + "epoch": 9.828711256117455, + "grad_norm": 0.009963960386812687, + "learning_rate": 0.0006015177120343168, + "loss": 0.0481, + "num_input_tokens_seen": 130032304, + "step": 60250 + }, + { + "epoch": 9.82952691680261, + "grad_norm": 0.056077104061841965, + "learning_rate": 0.000601448013759748, + "loss": 0.0425, + "num_input_tokens_seen": 130041840, + "step": 60255 + }, + { + "epoch": 9.830342577487766, + "grad_norm": 0.047805432230234146, + "learning_rate": 0.0006013783134292067, + "loss": 0.0104, + "num_input_tokens_seen": 130051216, + "step": 60260 + }, + { + "epoch": 9.83115823817292, + "grad_norm": 0.3439437747001648, + "learning_rate": 0.0006013086110441049, + "loss": 0.0924, + "num_input_tokens_seen": 130061872, + "step": 60265 + }, + { + "epoch": 9.831973898858076, + "grad_norm": 0.02341892383992672, + "learning_rate": 0.0006012389066058559, + "loss": 0.009, + "num_input_tokens_seen": 130072944, + "step": 60270 + }, + { + "epoch": 9.83278955954323, + "grad_norm": 0.04148042947053909, + "learning_rate": 0.0006011692001158719, + "loss": 0.0184, + "num_input_tokens_seen": 130082704, + "step": 60275 + }, + { + "epoch": 9.833605220228385, + "grad_norm": 0.04239051416516304, + "learning_rate": 0.0006010994915755659, + "loss": 0.0151, + "num_input_tokens_seen": 130093936, + "step": 60280 + }, + { + "epoch": 9.83442088091354, + "grad_norm": 0.03339095041155815, + "learning_rate": 0.0006010297809863503, + "loss": 0.0576, + "num_input_tokens_seen": 130103888, + "step": 60285 + }, + { + "epoch": 9.835236541598695, + "grad_norm": 0.0384807325899601, + "learning_rate": 0.000600960068349638, + "loss": 0.0166, + "num_input_tokens_seen": 130114864, + "step": 60290 + }, + { + "epoch": 9.83605220228385, + "grad_norm": 0.01620279997587204, + "learning_rate": 0.000600890353666842, + "loss": 0.0126, + "num_input_tokens_seen": 130125136, + "step": 60295 + }, + { + "epoch": 9.836867862969005, + "grad_norm": 0.018959032371640205, + "learning_rate": 0.0006008206369393748, + "loss": 0.0235, + "num_input_tokens_seen": 130136240, + "step": 60300 + }, + { + "epoch": 9.83768352365416, + "grad_norm": 0.037905216217041016, + "learning_rate": 0.0006007509181686496, + "loss": 0.0331, + "num_input_tokens_seen": 130146864, + "step": 60305 + }, + { + "epoch": 9.838499184339314, + "grad_norm": 0.001421467517502606, + "learning_rate": 0.0006006811973560792, + "loss": 0.0132, + "num_input_tokens_seen": 130156560, + "step": 60310 + }, + { + "epoch": 9.83931484502447, + "grad_norm": 0.09836296737194061, + "learning_rate": 0.0006006114745030766, + "loss": 0.1212, + "num_input_tokens_seen": 130167664, + "step": 60315 + }, + { + "epoch": 9.840130505709626, + "grad_norm": 0.005882080644369125, + "learning_rate": 0.0006005417496110549, + "loss": 0.101, + "num_input_tokens_seen": 130178992, + "step": 60320 + }, + { + "epoch": 9.84094616639478, + "grad_norm": 0.0539504773914814, + "learning_rate": 0.0006004720226814271, + "loss": 0.0775, + "num_input_tokens_seen": 130188944, + "step": 60325 + }, + { + "epoch": 9.841761827079935, + "grad_norm": 0.0037634889595210552, + "learning_rate": 0.0006004022937156062, + "loss": 0.1043, + "num_input_tokens_seen": 130199408, + "step": 60330 + }, + { + "epoch": 9.84257748776509, + "grad_norm": 0.05407777801156044, + "learning_rate": 0.0006003325627150054, + "loss": 0.076, + "num_input_tokens_seen": 130211152, + "step": 60335 + }, + { + "epoch": 9.843393148450245, + "grad_norm": 0.008126193657517433, + "learning_rate": 0.0006002628296810381, + "loss": 0.0142, + "num_input_tokens_seen": 130222192, + "step": 60340 + }, + { + "epoch": 9.844208809135399, + "grad_norm": 0.04272608086466789, + "learning_rate": 0.0006001930946151172, + "loss": 0.0339, + "num_input_tokens_seen": 130232912, + "step": 60345 + }, + { + "epoch": 9.845024469820554, + "grad_norm": 0.009695055894553661, + "learning_rate": 0.0006001233575186563, + "loss": 0.0053, + "num_input_tokens_seen": 130242704, + "step": 60350 + }, + { + "epoch": 9.84584013050571, + "grad_norm": 0.021923761814832687, + "learning_rate": 0.0006000536183930684, + "loss": 0.005, + "num_input_tokens_seen": 130254224, + "step": 60355 + }, + { + "epoch": 9.846655791190864, + "grad_norm": 0.35703516006469727, + "learning_rate": 0.000599983877239767, + "loss": 0.0975, + "num_input_tokens_seen": 130265392, + "step": 60360 + }, + { + "epoch": 9.84747145187602, + "grad_norm": 0.011605478823184967, + "learning_rate": 0.0005999141340601657, + "loss": 0.0078, + "num_input_tokens_seen": 130276048, + "step": 60365 + }, + { + "epoch": 9.848287112561174, + "grad_norm": 0.0041684480383992195, + "learning_rate": 0.0005998443888556776, + "loss": 0.0266, + "num_input_tokens_seen": 130287472, + "step": 60370 + }, + { + "epoch": 9.84910277324633, + "grad_norm": 0.11872498691082001, + "learning_rate": 0.0005997746416277162, + "loss": 0.0981, + "num_input_tokens_seen": 130298832, + "step": 60375 + }, + { + "epoch": 9.849918433931485, + "grad_norm": 0.005355318542569876, + "learning_rate": 0.0005997048923776953, + "loss": 0.0128, + "num_input_tokens_seen": 130308432, + "step": 60380 + }, + { + "epoch": 9.850734094616639, + "grad_norm": 0.057005710899829865, + "learning_rate": 0.000599635141107028, + "loss": 0.0245, + "num_input_tokens_seen": 130318544, + "step": 60385 + }, + { + "epoch": 9.851549755301795, + "grad_norm": 0.06865450739860535, + "learning_rate": 0.0005995653878171283, + "loss": 0.1704, + "num_input_tokens_seen": 130328176, + "step": 60390 + }, + { + "epoch": 9.852365415986949, + "grad_norm": 0.07154335081577301, + "learning_rate": 0.0005994956325094099, + "loss": 0.0578, + "num_input_tokens_seen": 130338896, + "step": 60395 + }, + { + "epoch": 9.853181076672104, + "grad_norm": 0.002622546162456274, + "learning_rate": 0.000599425875185286, + "loss": 0.0963, + "num_input_tokens_seen": 130350160, + "step": 60400 + }, + { + "epoch": 9.85399673735726, + "grad_norm": 0.008650357834994793, + "learning_rate": 0.0005993561158461708, + "loss": 0.0114, + "num_input_tokens_seen": 130360976, + "step": 60405 + }, + { + "epoch": 9.854812398042414, + "grad_norm": 0.19668813049793243, + "learning_rate": 0.0005992863544934777, + "loss": 0.0199, + "num_input_tokens_seen": 130371184, + "step": 60410 + }, + { + "epoch": 9.85562805872757, + "grad_norm": 0.03569589927792549, + "learning_rate": 0.000599216591128621, + "loss": 0.0179, + "num_input_tokens_seen": 130381168, + "step": 60415 + }, + { + "epoch": 9.856443719412724, + "grad_norm": 0.42427003383636475, + "learning_rate": 0.000599146825753014, + "loss": 0.0376, + "num_input_tokens_seen": 130390544, + "step": 60420 + }, + { + "epoch": 9.85725938009788, + "grad_norm": 0.0046119713224470615, + "learning_rate": 0.0005990770583680707, + "loss": 0.0062, + "num_input_tokens_seen": 130401232, + "step": 60425 + }, + { + "epoch": 9.858075040783035, + "grad_norm": 0.006291973404586315, + "learning_rate": 0.0005990072889752052, + "loss": 0.0033, + "num_input_tokens_seen": 130412560, + "step": 60430 + }, + { + "epoch": 9.858890701468189, + "grad_norm": 0.05732189118862152, + "learning_rate": 0.0005989375175758315, + "loss": 0.2051, + "num_input_tokens_seen": 130423472, + "step": 60435 + }, + { + "epoch": 9.859706362153345, + "grad_norm": 0.2691935896873474, + "learning_rate": 0.0005988677441713633, + "loss": 0.0483, + "num_input_tokens_seen": 130433008, + "step": 60440 + }, + { + "epoch": 9.860522022838499, + "grad_norm": 0.07633983343839645, + "learning_rate": 0.000598797968763215, + "loss": 0.0326, + "num_input_tokens_seen": 130445776, + "step": 60445 + }, + { + "epoch": 9.861337683523654, + "grad_norm": 0.09656926244497299, + "learning_rate": 0.0005987281913528006, + "loss": 0.0151, + "num_input_tokens_seen": 130456720, + "step": 60450 + }, + { + "epoch": 9.86215334420881, + "grad_norm": 0.02181723341345787, + "learning_rate": 0.0005986584119415339, + "loss": 0.0151, + "num_input_tokens_seen": 130468176, + "step": 60455 + }, + { + "epoch": 9.862969004893964, + "grad_norm": 0.0011484201531857252, + "learning_rate": 0.0005985886305308295, + "loss": 0.0696, + "num_input_tokens_seen": 130479344, + "step": 60460 + }, + { + "epoch": 9.86378466557912, + "grad_norm": 0.013567357324063778, + "learning_rate": 0.0005985188471221014, + "loss": 0.0084, + "num_input_tokens_seen": 130490672, + "step": 60465 + }, + { + "epoch": 9.864600326264274, + "grad_norm": 0.006005418952554464, + "learning_rate": 0.0005984490617167639, + "loss": 0.0254, + "num_input_tokens_seen": 130502160, + "step": 60470 + }, + { + "epoch": 9.86541598694943, + "grad_norm": 0.0018919827416539192, + "learning_rate": 0.0005983792743162313, + "loss": 0.0093, + "num_input_tokens_seen": 130514160, + "step": 60475 + }, + { + "epoch": 9.866231647634583, + "grad_norm": 0.0071715605445206165, + "learning_rate": 0.0005983094849219177, + "loss": 0.0279, + "num_input_tokens_seen": 130524336, + "step": 60480 + }, + { + "epoch": 9.867047308319739, + "grad_norm": 0.004263455048203468, + "learning_rate": 0.0005982396935352379, + "loss": 0.0963, + "num_input_tokens_seen": 130535440, + "step": 60485 + }, + { + "epoch": 9.867862969004895, + "grad_norm": 0.007322392426431179, + "learning_rate": 0.000598169900157606, + "loss": 0.0533, + "num_input_tokens_seen": 130546832, + "step": 60490 + }, + { + "epoch": 9.868678629690049, + "grad_norm": 0.02374938502907753, + "learning_rate": 0.0005981001047904365, + "loss": 0.012, + "num_input_tokens_seen": 130557168, + "step": 60495 + }, + { + "epoch": 9.869494290375204, + "grad_norm": 0.03611423447728157, + "learning_rate": 0.000598030307435144, + "loss": 0.0118, + "num_input_tokens_seen": 130567664, + "step": 60500 + }, + { + "epoch": 9.870309951060358, + "grad_norm": 0.37349772453308105, + "learning_rate": 0.000597960508093143, + "loss": 0.0635, + "num_input_tokens_seen": 130577520, + "step": 60505 + }, + { + "epoch": 9.871125611745514, + "grad_norm": 0.020337767899036407, + "learning_rate": 0.0005978907067658479, + "loss": 0.0163, + "num_input_tokens_seen": 130589328, + "step": 60510 + }, + { + "epoch": 9.87194127243067, + "grad_norm": 0.023232286795973778, + "learning_rate": 0.0005978209034546736, + "loss": 0.0703, + "num_input_tokens_seen": 130599760, + "step": 60515 + }, + { + "epoch": 9.872756933115824, + "grad_norm": 0.2161463499069214, + "learning_rate": 0.0005977510981610344, + "loss": 0.0198, + "num_input_tokens_seen": 130610576, + "step": 60520 + }, + { + "epoch": 9.87357259380098, + "grad_norm": 0.005365234799683094, + "learning_rate": 0.0005976812908863454, + "loss": 0.0172, + "num_input_tokens_seen": 130621168, + "step": 60525 + }, + { + "epoch": 9.874388254486133, + "grad_norm": 0.004941691644489765, + "learning_rate": 0.0005976114816320208, + "loss": 0.0085, + "num_input_tokens_seen": 130630736, + "step": 60530 + }, + { + "epoch": 9.875203915171289, + "grad_norm": 0.22290107607841492, + "learning_rate": 0.000597541670399476, + "loss": 0.0342, + "num_input_tokens_seen": 130641840, + "step": 60535 + }, + { + "epoch": 9.876019575856443, + "grad_norm": 0.0028947857208549976, + "learning_rate": 0.0005974718571901254, + "loss": 0.0157, + "num_input_tokens_seen": 130652816, + "step": 60540 + }, + { + "epoch": 9.876835236541599, + "grad_norm": 0.020026614889502525, + "learning_rate": 0.0005974020420053841, + "loss": 0.0058, + "num_input_tokens_seen": 130663248, + "step": 60545 + }, + { + "epoch": 9.877650897226754, + "grad_norm": 0.006422468926757574, + "learning_rate": 0.0005973322248466666, + "loss": 0.0133, + "num_input_tokens_seen": 130675184, + "step": 60550 + }, + { + "epoch": 9.878466557911908, + "grad_norm": 0.32624539732933044, + "learning_rate": 0.0005972624057153882, + "loss": 0.0483, + "num_input_tokens_seen": 130686576, + "step": 60555 + }, + { + "epoch": 9.879282218597064, + "grad_norm": 0.0541231743991375, + "learning_rate": 0.0005971925846129639, + "loss": 0.0376, + "num_input_tokens_seen": 130696240, + "step": 60560 + }, + { + "epoch": 9.880097879282218, + "grad_norm": 0.26999378204345703, + "learning_rate": 0.0005971227615408084, + "loss": 0.0578, + "num_input_tokens_seen": 130706064, + "step": 60565 + }, + { + "epoch": 9.880913539967374, + "grad_norm": 0.036546021699905396, + "learning_rate": 0.0005970529365003371, + "loss": 0.008, + "num_input_tokens_seen": 130716720, + "step": 60570 + }, + { + "epoch": 9.88172920065253, + "grad_norm": 0.03857633098959923, + "learning_rate": 0.0005969831094929648, + "loss": 0.1628, + "num_input_tokens_seen": 130726288, + "step": 60575 + }, + { + "epoch": 9.882544861337683, + "grad_norm": 0.015046238899230957, + "learning_rate": 0.0005969132805201067, + "loss": 0.0098, + "num_input_tokens_seen": 130736368, + "step": 60580 + }, + { + "epoch": 9.883360522022839, + "grad_norm": 0.010672475211322308, + "learning_rate": 0.0005968434495831781, + "loss": 0.0885, + "num_input_tokens_seen": 130746160, + "step": 60585 + }, + { + "epoch": 9.884176182707993, + "grad_norm": 0.01950669102370739, + "learning_rate": 0.000596773616683594, + "loss": 0.0428, + "num_input_tokens_seen": 130755728, + "step": 60590 + }, + { + "epoch": 9.884991843393149, + "grad_norm": 0.02543611451983452, + "learning_rate": 0.0005967037818227701, + "loss": 0.0145, + "num_input_tokens_seen": 130767088, + "step": 60595 + }, + { + "epoch": 9.885807504078304, + "grad_norm": 0.010294155217707157, + "learning_rate": 0.0005966339450021212, + "loss": 0.0458, + "num_input_tokens_seen": 130776944, + "step": 60600 + }, + { + "epoch": 9.886623164763458, + "grad_norm": 0.028084808960556984, + "learning_rate": 0.0005965641062230627, + "loss": 0.0034, + "num_input_tokens_seen": 130786480, + "step": 60605 + }, + { + "epoch": 9.887438825448614, + "grad_norm": 0.0028355824761092663, + "learning_rate": 0.0005964942654870103, + "loss": 0.0299, + "num_input_tokens_seen": 130797968, + "step": 60610 + }, + { + "epoch": 9.888254486133768, + "grad_norm": 0.41090652346611023, + "learning_rate": 0.0005964244227953791, + "loss": 0.0706, + "num_input_tokens_seen": 130808272, + "step": 60615 + }, + { + "epoch": 9.889070146818923, + "grad_norm": 0.013776799663901329, + "learning_rate": 0.0005963545781495847, + "loss": 0.0642, + "num_input_tokens_seen": 130817488, + "step": 60620 + }, + { + "epoch": 9.88988580750408, + "grad_norm": 0.014620115049183369, + "learning_rate": 0.0005962847315510426, + "loss": 0.0039, + "num_input_tokens_seen": 130828432, + "step": 60625 + }, + { + "epoch": 9.890701468189233, + "grad_norm": 0.011676360853016376, + "learning_rate": 0.0005962148830011681, + "loss": 0.0034, + "num_input_tokens_seen": 130838416, + "step": 60630 + }, + { + "epoch": 9.891517128874389, + "grad_norm": 0.10804423689842224, + "learning_rate": 0.0005961450325013771, + "loss": 0.0083, + "num_input_tokens_seen": 130847920, + "step": 60635 + }, + { + "epoch": 9.892332789559543, + "grad_norm": 0.2533286511898041, + "learning_rate": 0.0005960751800530849, + "loss": 0.1369, + "num_input_tokens_seen": 130859184, + "step": 60640 + }, + { + "epoch": 9.893148450244698, + "grad_norm": 0.015121645294129848, + "learning_rate": 0.0005960053256577073, + "loss": 0.0333, + "num_input_tokens_seen": 130869808, + "step": 60645 + }, + { + "epoch": 9.893964110929852, + "grad_norm": 0.0031398022547364235, + "learning_rate": 0.0005959354693166601, + "loss": 0.005, + "num_input_tokens_seen": 130880912, + "step": 60650 + }, + { + "epoch": 9.894779771615008, + "grad_norm": 0.005537780933082104, + "learning_rate": 0.0005958656110313589, + "loss": 0.0076, + "num_input_tokens_seen": 130892336, + "step": 60655 + }, + { + "epoch": 9.895595432300164, + "grad_norm": 0.009504719637334347, + "learning_rate": 0.0005957957508032194, + "loss": 0.0116, + "num_input_tokens_seen": 130900752, + "step": 60660 + }, + { + "epoch": 9.896411092985318, + "grad_norm": 0.0020218915306031704, + "learning_rate": 0.0005957258886336575, + "loss": 0.11, + "num_input_tokens_seen": 130911600, + "step": 60665 + }, + { + "epoch": 9.897226753670473, + "grad_norm": 0.02705889381468296, + "learning_rate": 0.0005956560245240891, + "loss": 0.016, + "num_input_tokens_seen": 130922640, + "step": 60670 + }, + { + "epoch": 9.898042414355627, + "grad_norm": 0.09529057145118713, + "learning_rate": 0.0005955861584759298, + "loss": 0.051, + "num_input_tokens_seen": 130933136, + "step": 60675 + }, + { + "epoch": 9.898858075040783, + "grad_norm": 0.0911308228969574, + "learning_rate": 0.0005955162904905959, + "loss": 0.0979, + "num_input_tokens_seen": 130944336, + "step": 60680 + }, + { + "epoch": 9.899673735725939, + "grad_norm": 0.13382776081562042, + "learning_rate": 0.0005954464205695033, + "loss": 0.0913, + "num_input_tokens_seen": 130952848, + "step": 60685 + }, + { + "epoch": 9.900489396411093, + "grad_norm": 0.014274620451033115, + "learning_rate": 0.0005953765487140678, + "loss": 0.0175, + "num_input_tokens_seen": 130963536, + "step": 60690 + }, + { + "epoch": 9.901305057096248, + "grad_norm": 0.012472741305828094, + "learning_rate": 0.0005953066749257055, + "loss": 0.2966, + "num_input_tokens_seen": 130973840, + "step": 60695 + }, + { + "epoch": 9.902120717781402, + "grad_norm": 0.007896821945905685, + "learning_rate": 0.0005952367992058326, + "loss": 0.0136, + "num_input_tokens_seen": 130984304, + "step": 60700 + }, + { + "epoch": 9.902936378466558, + "grad_norm": 0.3129275441169739, + "learning_rate": 0.0005951669215558651, + "loss": 0.0356, + "num_input_tokens_seen": 130995504, + "step": 60705 + }, + { + "epoch": 9.903752039151712, + "grad_norm": 0.026942551136016846, + "learning_rate": 0.0005950970419772192, + "loss": 0.0322, + "num_input_tokens_seen": 131005968, + "step": 60710 + }, + { + "epoch": 9.904567699836868, + "grad_norm": 0.003663647221401334, + "learning_rate": 0.0005950271604713111, + "loss": 0.0478, + "num_input_tokens_seen": 131018160, + "step": 60715 + }, + { + "epoch": 9.905383360522023, + "grad_norm": 0.013435225002467632, + "learning_rate": 0.000594957277039557, + "loss": 0.084, + "num_input_tokens_seen": 131029008, + "step": 60720 + }, + { + "epoch": 9.906199021207177, + "grad_norm": 0.10777303576469421, + "learning_rate": 0.0005948873916833733, + "loss": 0.2034, + "num_input_tokens_seen": 131038960, + "step": 60725 + }, + { + "epoch": 9.907014681892333, + "grad_norm": 0.010886602103710175, + "learning_rate": 0.0005948175044041764, + "loss": 0.0056, + "num_input_tokens_seen": 131050160, + "step": 60730 + }, + { + "epoch": 9.907830342577487, + "grad_norm": 0.6343721747398376, + "learning_rate": 0.0005947476152033822, + "loss": 0.0989, + "num_input_tokens_seen": 131060048, + "step": 60735 + }, + { + "epoch": 9.908646003262643, + "grad_norm": 0.41824567317962646, + "learning_rate": 0.0005946777240824076, + "loss": 0.0552, + "num_input_tokens_seen": 131069360, + "step": 60740 + }, + { + "epoch": 9.909461663947798, + "grad_norm": 0.30215784907341003, + "learning_rate": 0.0005946078310426687, + "loss": 0.2595, + "num_input_tokens_seen": 131079888, + "step": 60745 + }, + { + "epoch": 9.910277324632952, + "grad_norm": 0.10839759558439255, + "learning_rate": 0.000594537936085582, + "loss": 0.021, + "num_input_tokens_seen": 131090512, + "step": 60750 + }, + { + "epoch": 9.911092985318108, + "grad_norm": 0.004459444899111986, + "learning_rate": 0.0005944680392125643, + "loss": 0.072, + "num_input_tokens_seen": 131101680, + "step": 60755 + }, + { + "epoch": 9.911908646003262, + "grad_norm": 0.057104773819446564, + "learning_rate": 0.0005943981404250318, + "loss": 0.0561, + "num_input_tokens_seen": 131112336, + "step": 60760 + }, + { + "epoch": 9.912724306688418, + "grad_norm": 0.07025784999132156, + "learning_rate": 0.0005943282397244013, + "loss": 0.0214, + "num_input_tokens_seen": 131123088, + "step": 60765 + }, + { + "epoch": 9.913539967373573, + "grad_norm": 0.1623830497264862, + "learning_rate": 0.0005942583371120893, + "loss": 0.0347, + "num_input_tokens_seen": 131132368, + "step": 60770 + }, + { + "epoch": 9.914355628058727, + "grad_norm": 0.20694822072982788, + "learning_rate": 0.0005941884325895127, + "loss": 0.045, + "num_input_tokens_seen": 131143600, + "step": 60775 + }, + { + "epoch": 9.915171288743883, + "grad_norm": 0.08146418631076813, + "learning_rate": 0.0005941185261580878, + "loss": 0.0159, + "num_input_tokens_seen": 131153872, + "step": 60780 + }, + { + "epoch": 9.915986949429037, + "grad_norm": 0.024160081520676613, + "learning_rate": 0.0005940486178192317, + "loss": 0.0083, + "num_input_tokens_seen": 131164848, + "step": 60785 + }, + { + "epoch": 9.916802610114193, + "grad_norm": 0.0390489287674427, + "learning_rate": 0.000593978707574361, + "loss": 0.0096, + "num_input_tokens_seen": 131175280, + "step": 60790 + }, + { + "epoch": 9.917618270799348, + "grad_norm": 0.14191563427448273, + "learning_rate": 0.0005939087954248926, + "loss": 0.0126, + "num_input_tokens_seen": 131185776, + "step": 60795 + }, + { + "epoch": 9.918433931484502, + "grad_norm": 0.07650037854909897, + "learning_rate": 0.0005938388813722432, + "loss": 0.0383, + "num_input_tokens_seen": 131196944, + "step": 60800 + }, + { + "epoch": 9.919249592169658, + "grad_norm": 0.0030891899950802326, + "learning_rate": 0.0005937689654178298, + "loss": 0.0174, + "num_input_tokens_seen": 131208752, + "step": 60805 + }, + { + "epoch": 9.920065252854812, + "grad_norm": 0.03860265389084816, + "learning_rate": 0.0005936990475630696, + "loss": 0.0332, + "num_input_tokens_seen": 131217360, + "step": 60810 + }, + { + "epoch": 9.920880913539968, + "grad_norm": 0.005586592014878988, + "learning_rate": 0.0005936291278093793, + "loss": 0.0219, + "num_input_tokens_seen": 131228304, + "step": 60815 + }, + { + "epoch": 9.921696574225122, + "grad_norm": 0.010130854323506355, + "learning_rate": 0.0005935592061581758, + "loss": 0.0208, + "num_input_tokens_seen": 131239568, + "step": 60820 + }, + { + "epoch": 9.922512234910277, + "grad_norm": 0.2597930431365967, + "learning_rate": 0.0005934892826108764, + "loss": 0.1355, + "num_input_tokens_seen": 131250576, + "step": 60825 + }, + { + "epoch": 9.923327895595433, + "grad_norm": 0.02462649717926979, + "learning_rate": 0.0005934193571688981, + "loss": 0.1645, + "num_input_tokens_seen": 131261616, + "step": 60830 + }, + { + "epoch": 9.924143556280587, + "grad_norm": 0.002622033702209592, + "learning_rate": 0.0005933494298336579, + "loss": 0.1318, + "num_input_tokens_seen": 131272400, + "step": 60835 + }, + { + "epoch": 9.924959216965743, + "grad_norm": 0.006757265888154507, + "learning_rate": 0.0005932795006065732, + "loss": 0.0942, + "num_input_tokens_seen": 131283280, + "step": 60840 + }, + { + "epoch": 9.925774877650896, + "grad_norm": 0.0022342128213495016, + "learning_rate": 0.000593209569489061, + "loss": 0.0087, + "num_input_tokens_seen": 131293808, + "step": 60845 + }, + { + "epoch": 9.926590538336052, + "grad_norm": 0.00587815884500742, + "learning_rate": 0.0005931396364825387, + "loss": 0.0098, + "num_input_tokens_seen": 131304016, + "step": 60850 + }, + { + "epoch": 9.927406199021208, + "grad_norm": 0.038470152765512466, + "learning_rate": 0.0005930697015884234, + "loss": 0.1041, + "num_input_tokens_seen": 131314032, + "step": 60855 + }, + { + "epoch": 9.928221859706362, + "grad_norm": 0.0381760410964489, + "learning_rate": 0.0005929997648081327, + "loss": 0.1348, + "num_input_tokens_seen": 131324112, + "step": 60860 + }, + { + "epoch": 9.929037520391518, + "grad_norm": 0.02548396587371826, + "learning_rate": 0.0005929298261430837, + "loss": 0.0898, + "num_input_tokens_seen": 131334352, + "step": 60865 + }, + { + "epoch": 9.929853181076671, + "grad_norm": 0.026239583268761635, + "learning_rate": 0.0005928598855946939, + "loss": 0.2274, + "num_input_tokens_seen": 131344656, + "step": 60870 + }, + { + "epoch": 9.930668841761827, + "grad_norm": 0.0259616207331419, + "learning_rate": 0.0005927899431643807, + "loss": 0.0485, + "num_input_tokens_seen": 131355984, + "step": 60875 + }, + { + "epoch": 9.931484502446983, + "grad_norm": 0.15575720369815826, + "learning_rate": 0.0005927199988535616, + "loss": 0.0256, + "num_input_tokens_seen": 131367056, + "step": 60880 + }, + { + "epoch": 9.932300163132137, + "grad_norm": 0.2814697027206421, + "learning_rate": 0.0005926500526636542, + "loss": 0.0673, + "num_input_tokens_seen": 131378192, + "step": 60885 + }, + { + "epoch": 9.933115823817293, + "grad_norm": 0.029924139380455017, + "learning_rate": 0.0005925801045960757, + "loss": 0.0774, + "num_input_tokens_seen": 131389040, + "step": 60890 + }, + { + "epoch": 9.933931484502446, + "grad_norm": 0.021249929443001747, + "learning_rate": 0.0005925101546522441, + "loss": 0.0367, + "num_input_tokens_seen": 131399664, + "step": 60895 + }, + { + "epoch": 9.934747145187602, + "grad_norm": 0.0889945700764656, + "learning_rate": 0.0005924402028335769, + "loss": 0.0126, + "num_input_tokens_seen": 131409680, + "step": 60900 + }, + { + "epoch": 9.935562805872756, + "grad_norm": 0.03813540190458298, + "learning_rate": 0.0005923702491414916, + "loss": 0.0193, + "num_input_tokens_seen": 131420240, + "step": 60905 + }, + { + "epoch": 9.936378466557912, + "grad_norm": 0.12350038439035416, + "learning_rate": 0.000592300293577406, + "loss": 0.0226, + "num_input_tokens_seen": 131429488, + "step": 60910 + }, + { + "epoch": 9.937194127243067, + "grad_norm": 0.004099275451153517, + "learning_rate": 0.0005922303361427379, + "loss": 0.125, + "num_input_tokens_seen": 131439792, + "step": 60915 + }, + { + "epoch": 9.938009787928221, + "grad_norm": 0.007772999815642834, + "learning_rate": 0.0005921603768389051, + "loss": 0.0099, + "num_input_tokens_seen": 131448976, + "step": 60920 + }, + { + "epoch": 9.938825448613377, + "grad_norm": 0.018107945099473, + "learning_rate": 0.0005920904156673254, + "loss": 0.0466, + "num_input_tokens_seen": 131459088, + "step": 60925 + }, + { + "epoch": 9.939641109298531, + "grad_norm": 0.012274482287466526, + "learning_rate": 0.0005920204526294165, + "loss": 0.0233, + "num_input_tokens_seen": 131470096, + "step": 60930 + }, + { + "epoch": 9.940456769983687, + "grad_norm": 0.29348087310791016, + "learning_rate": 0.0005919504877265965, + "loss": 0.1133, + "num_input_tokens_seen": 131479376, + "step": 60935 + }, + { + "epoch": 9.941272430668842, + "grad_norm": 0.16367141902446747, + "learning_rate": 0.000591880520960283, + "loss": 0.0299, + "num_input_tokens_seen": 131491792, + "step": 60940 + }, + { + "epoch": 9.942088091353996, + "grad_norm": 0.005660237744450569, + "learning_rate": 0.0005918105523318944, + "loss": 0.0052, + "num_input_tokens_seen": 131502512, + "step": 60945 + }, + { + "epoch": 9.942903752039152, + "grad_norm": 0.0700116902589798, + "learning_rate": 0.0005917405818428484, + "loss": 0.1308, + "num_input_tokens_seen": 131514224, + "step": 60950 + }, + { + "epoch": 9.943719412724306, + "grad_norm": 0.10043734312057495, + "learning_rate": 0.0005916706094945631, + "loss": 0.0499, + "num_input_tokens_seen": 131525840, + "step": 60955 + }, + { + "epoch": 9.944535073409462, + "grad_norm": 0.0025967916008085012, + "learning_rate": 0.0005916006352884567, + "loss": 0.0559, + "num_input_tokens_seen": 131536208, + "step": 60960 + }, + { + "epoch": 9.945350734094617, + "grad_norm": 0.017657585442066193, + "learning_rate": 0.0005915306592259471, + "loss": 0.1007, + "num_input_tokens_seen": 131547344, + "step": 60965 + }, + { + "epoch": 9.946166394779771, + "grad_norm": 0.10215990990400314, + "learning_rate": 0.0005914606813084526, + "loss": 0.0153, + "num_input_tokens_seen": 131558544, + "step": 60970 + }, + { + "epoch": 9.946982055464927, + "grad_norm": 0.009787973947823048, + "learning_rate": 0.0005913907015373915, + "loss": 0.0196, + "num_input_tokens_seen": 131569136, + "step": 60975 + }, + { + "epoch": 9.947797716150081, + "grad_norm": 0.03457729145884514, + "learning_rate": 0.0005913207199141818, + "loss": 0.0381, + "num_input_tokens_seen": 131580848, + "step": 60980 + }, + { + "epoch": 9.948613376835237, + "grad_norm": 0.007378737907856703, + "learning_rate": 0.0005912507364402419, + "loss": 0.0057, + "num_input_tokens_seen": 131591664, + "step": 60985 + }, + { + "epoch": 9.949429037520392, + "grad_norm": 0.003623401280492544, + "learning_rate": 0.0005911807511169899, + "loss": 0.0126, + "num_input_tokens_seen": 131602032, + "step": 60990 + }, + { + "epoch": 9.950244698205546, + "grad_norm": 0.2980554401874542, + "learning_rate": 0.0005911107639458444, + "loss": 0.126, + "num_input_tokens_seen": 131613520, + "step": 60995 + }, + { + "epoch": 9.951060358890702, + "grad_norm": 0.016155565157532692, + "learning_rate": 0.0005910407749282237, + "loss": 0.0683, + "num_input_tokens_seen": 131624624, + "step": 61000 + }, + { + "epoch": 9.951876019575856, + "grad_norm": 0.007753738667815924, + "learning_rate": 0.0005909707840655462, + "loss": 0.0129, + "num_input_tokens_seen": 131634416, + "step": 61005 + }, + { + "epoch": 9.952691680261012, + "grad_norm": 0.16092662513256073, + "learning_rate": 0.0005909007913592304, + "loss": 0.0131, + "num_input_tokens_seen": 131645232, + "step": 61010 + }, + { + "epoch": 9.953507340946166, + "grad_norm": 0.02057763561606407, + "learning_rate": 0.0005908307968106948, + "loss": 0.1237, + "num_input_tokens_seen": 131655824, + "step": 61015 + }, + { + "epoch": 9.954323001631321, + "grad_norm": 0.035560328513383865, + "learning_rate": 0.0005907608004213577, + "loss": 0.0103, + "num_input_tokens_seen": 131666640, + "step": 61020 + }, + { + "epoch": 9.955138662316477, + "grad_norm": 0.046665262430906296, + "learning_rate": 0.0005906908021926379, + "loss": 0.006, + "num_input_tokens_seen": 131675120, + "step": 61025 + }, + { + "epoch": 9.955954323001631, + "grad_norm": 0.2884460687637329, + "learning_rate": 0.000590620802125954, + "loss": 0.0336, + "num_input_tokens_seen": 131686096, + "step": 61030 + }, + { + "epoch": 9.956769983686787, + "grad_norm": 0.012169270776212215, + "learning_rate": 0.0005905508002227247, + "loss": 0.0218, + "num_input_tokens_seen": 131697968, + "step": 61035 + }, + { + "epoch": 9.95758564437194, + "grad_norm": 0.023828787729144096, + "learning_rate": 0.0005904807964843684, + "loss": 0.0142, + "num_input_tokens_seen": 131709168, + "step": 61040 + }, + { + "epoch": 9.958401305057096, + "grad_norm": 0.16877472400665283, + "learning_rate": 0.0005904107909123039, + "loss": 0.183, + "num_input_tokens_seen": 131720656, + "step": 61045 + }, + { + "epoch": 9.959216965742252, + "grad_norm": 0.002537018619477749, + "learning_rate": 0.0005903407835079502, + "loss": 0.0064, + "num_input_tokens_seen": 131731536, + "step": 61050 + }, + { + "epoch": 9.960032626427406, + "grad_norm": 0.01614038273692131, + "learning_rate": 0.000590270774272726, + "loss": 0.0135, + "num_input_tokens_seen": 131742928, + "step": 61055 + }, + { + "epoch": 9.960848287112562, + "grad_norm": 0.021022778004407883, + "learning_rate": 0.0005902007632080499, + "loss": 0.0686, + "num_input_tokens_seen": 131752496, + "step": 61060 + }, + { + "epoch": 9.961663947797716, + "grad_norm": 0.0023374310694634914, + "learning_rate": 0.0005901307503153408, + "loss": 0.062, + "num_input_tokens_seen": 131764048, + "step": 61065 + }, + { + "epoch": 9.962479608482871, + "grad_norm": 0.3484862446784973, + "learning_rate": 0.0005900607355960178, + "loss": 0.0526, + "num_input_tokens_seen": 131774352, + "step": 61070 + }, + { + "epoch": 9.963295269168025, + "grad_norm": 0.14217711985111237, + "learning_rate": 0.0005899907190514999, + "loss": 0.0481, + "num_input_tokens_seen": 131783696, + "step": 61075 + }, + { + "epoch": 9.964110929853181, + "grad_norm": 0.0034184118267148733, + "learning_rate": 0.0005899207006832056, + "loss": 0.0075, + "num_input_tokens_seen": 131795696, + "step": 61080 + }, + { + "epoch": 9.964926590538337, + "grad_norm": 0.01680334098637104, + "learning_rate": 0.0005898506804925545, + "loss": 0.0069, + "num_input_tokens_seen": 131805616, + "step": 61085 + }, + { + "epoch": 9.96574225122349, + "grad_norm": 0.007420746143907309, + "learning_rate": 0.0005897806584809653, + "loss": 0.0935, + "num_input_tokens_seen": 131815056, + "step": 61090 + }, + { + "epoch": 9.966557911908646, + "grad_norm": 0.06177932769060135, + "learning_rate": 0.0005897106346498571, + "loss": 0.011, + "num_input_tokens_seen": 131825392, + "step": 61095 + }, + { + "epoch": 9.9673735725938, + "grad_norm": 0.19370174407958984, + "learning_rate": 0.0005896406090006491, + "loss": 0.0656, + "num_input_tokens_seen": 131836144, + "step": 61100 + }, + { + "epoch": 9.968189233278956, + "grad_norm": 0.008686351589858532, + "learning_rate": 0.0005895705815347605, + "loss": 0.0096, + "num_input_tokens_seen": 131846672, + "step": 61105 + }, + { + "epoch": 9.969004893964112, + "grad_norm": 0.31661173701286316, + "learning_rate": 0.0005895005522536104, + "loss": 0.1495, + "num_input_tokens_seen": 131858544, + "step": 61110 + }, + { + "epoch": 9.969820554649266, + "grad_norm": 0.08961602300405502, + "learning_rate": 0.000589430521158618, + "loss": 0.0136, + "num_input_tokens_seen": 131869296, + "step": 61115 + }, + { + "epoch": 9.970636215334421, + "grad_norm": 0.0035654862876981497, + "learning_rate": 0.0005893604882512027, + "loss": 0.0621, + "num_input_tokens_seen": 131879376, + "step": 61120 + }, + { + "epoch": 9.971451876019575, + "grad_norm": 0.027900006622076035, + "learning_rate": 0.0005892904535327837, + "loss": 0.013, + "num_input_tokens_seen": 131890768, + "step": 61125 + }, + { + "epoch": 9.97226753670473, + "grad_norm": 0.040725186467170715, + "learning_rate": 0.0005892204170047804, + "loss": 0.0233, + "num_input_tokens_seen": 131901104, + "step": 61130 + }, + { + "epoch": 9.973083197389887, + "grad_norm": 0.0007331048254854977, + "learning_rate": 0.0005891503786686123, + "loss": 0.0655, + "num_input_tokens_seen": 131912208, + "step": 61135 + }, + { + "epoch": 9.97389885807504, + "grad_norm": 0.21877209842205048, + "learning_rate": 0.0005890803385256985, + "loss": 0.1172, + "num_input_tokens_seen": 131923472, + "step": 61140 + }, + { + "epoch": 9.974714518760196, + "grad_norm": 0.025718929246068, + "learning_rate": 0.0005890102965774587, + "loss": 0.1168, + "num_input_tokens_seen": 131934384, + "step": 61145 + }, + { + "epoch": 9.97553017944535, + "grad_norm": 0.059255536645650864, + "learning_rate": 0.0005889402528253124, + "loss": 0.0742, + "num_input_tokens_seen": 131944784, + "step": 61150 + }, + { + "epoch": 9.976345840130506, + "grad_norm": 0.3241737186908722, + "learning_rate": 0.0005888702072706788, + "loss": 0.1238, + "num_input_tokens_seen": 131955088, + "step": 61155 + }, + { + "epoch": 9.977161500815662, + "grad_norm": 0.0046053980477154255, + "learning_rate": 0.0005888001599149781, + "loss": 0.1139, + "num_input_tokens_seen": 131966384, + "step": 61160 + }, + { + "epoch": 9.977977161500815, + "grad_norm": 0.008531535975635052, + "learning_rate": 0.0005887301107596292, + "loss": 0.0578, + "num_input_tokens_seen": 131977200, + "step": 61165 + }, + { + "epoch": 9.978792822185971, + "grad_norm": 0.007304763421416283, + "learning_rate": 0.0005886600598060522, + "loss": 0.0266, + "num_input_tokens_seen": 131988848, + "step": 61170 + }, + { + "epoch": 9.979608482871125, + "grad_norm": 0.13695082068443298, + "learning_rate": 0.0005885900070556665, + "loss": 0.0323, + "num_input_tokens_seen": 132000752, + "step": 61175 + }, + { + "epoch": 9.98042414355628, + "grad_norm": 0.036845337599515915, + "learning_rate": 0.0005885199525098919, + "loss": 0.009, + "num_input_tokens_seen": 132010544, + "step": 61180 + }, + { + "epoch": 9.981239804241435, + "grad_norm": 0.05225883424282074, + "learning_rate": 0.0005884498961701483, + "loss": 0.1092, + "num_input_tokens_seen": 132021552, + "step": 61185 + }, + { + "epoch": 9.98205546492659, + "grad_norm": 0.0058896103873848915, + "learning_rate": 0.0005883798380378554, + "loss": 0.2197, + "num_input_tokens_seen": 132032752, + "step": 61190 + }, + { + "epoch": 9.982871125611746, + "grad_norm": 0.006621936336159706, + "learning_rate": 0.0005883097781144329, + "loss": 0.0122, + "num_input_tokens_seen": 132043536, + "step": 61195 + }, + { + "epoch": 9.9836867862969, + "grad_norm": 0.011585002765059471, + "learning_rate": 0.0005882397164013005, + "loss": 0.0112, + "num_input_tokens_seen": 132053840, + "step": 61200 + }, + { + "epoch": 9.984502446982056, + "grad_norm": 0.003418646054342389, + "learning_rate": 0.0005881696528998785, + "loss": 0.0701, + "num_input_tokens_seen": 132064400, + "step": 61205 + }, + { + "epoch": 9.98531810766721, + "grad_norm": 0.005485466681420803, + "learning_rate": 0.0005880995876115868, + "loss": 0.0461, + "num_input_tokens_seen": 132073552, + "step": 61210 + }, + { + "epoch": 9.986133768352365, + "grad_norm": 0.24053853750228882, + "learning_rate": 0.0005880295205378449, + "loss": 0.0469, + "num_input_tokens_seen": 132084400, + "step": 61215 + }, + { + "epoch": 9.986949429037521, + "grad_norm": 0.07899510115385056, + "learning_rate": 0.0005879594516800732, + "loss": 0.0421, + "num_input_tokens_seen": 132095856, + "step": 61220 + }, + { + "epoch": 9.987765089722675, + "grad_norm": 0.00496051087975502, + "learning_rate": 0.0005878893810396916, + "loss": 0.0109, + "num_input_tokens_seen": 132108368, + "step": 61225 + }, + { + "epoch": 9.98858075040783, + "grad_norm": 0.007489520125091076, + "learning_rate": 0.0005878193086181203, + "loss": 0.1431, + "num_input_tokens_seen": 132120432, + "step": 61230 + }, + { + "epoch": 9.989396411092985, + "grad_norm": 0.005763236433267593, + "learning_rate": 0.0005877492344167792, + "loss": 0.0361, + "num_input_tokens_seen": 132131568, + "step": 61235 + }, + { + "epoch": 9.99021207177814, + "grad_norm": 0.005866055842489004, + "learning_rate": 0.0005876791584370886, + "loss": 0.0186, + "num_input_tokens_seen": 132142992, + "step": 61240 + }, + { + "epoch": 9.991027732463294, + "grad_norm": 0.0061873625963926315, + "learning_rate": 0.0005876090806804686, + "loss": 0.0217, + "num_input_tokens_seen": 132153520, + "step": 61245 + }, + { + "epoch": 9.99184339314845, + "grad_norm": 0.011407083831727505, + "learning_rate": 0.0005875390011483394, + "loss": 0.0239, + "num_input_tokens_seen": 132164368, + "step": 61250 + }, + { + "epoch": 9.992659053833606, + "grad_norm": 0.009952979162335396, + "learning_rate": 0.0005874689198421214, + "loss": 0.0062, + "num_input_tokens_seen": 132174992, + "step": 61255 + }, + { + "epoch": 9.99347471451876, + "grad_norm": 0.012712210416793823, + "learning_rate": 0.0005873988367632347, + "loss": 0.0151, + "num_input_tokens_seen": 132186320, + "step": 61260 + }, + { + "epoch": 9.994290375203915, + "grad_norm": 0.03069804050028324, + "learning_rate": 0.0005873287519130997, + "loss": 0.0366, + "num_input_tokens_seen": 132198000, + "step": 61265 + }, + { + "epoch": 9.99510603588907, + "grad_norm": 0.0009589301771484315, + "learning_rate": 0.0005872586652931368, + "loss": 0.0655, + "num_input_tokens_seen": 132209840, + "step": 61270 + }, + { + "epoch": 9.995921696574225, + "grad_norm": 0.02009737119078636, + "learning_rate": 0.0005871885769047664, + "loss": 0.0183, + "num_input_tokens_seen": 132220848, + "step": 61275 + }, + { + "epoch": 9.99673735725938, + "grad_norm": 0.2635076344013214, + "learning_rate": 0.0005871184867494088, + "loss": 0.16, + "num_input_tokens_seen": 132230384, + "step": 61280 + }, + { + "epoch": 9.997553017944535, + "grad_norm": 0.007213435135781765, + "learning_rate": 0.0005870483948284845, + "loss": 0.0129, + "num_input_tokens_seen": 132241200, + "step": 61285 + }, + { + "epoch": 9.99836867862969, + "grad_norm": 0.3387175500392914, + "learning_rate": 0.0005869783011434141, + "loss": 0.1799, + "num_input_tokens_seen": 132251664, + "step": 61290 + }, + { + "epoch": 9.999184339314844, + "grad_norm": 0.0036496040411293507, + "learning_rate": 0.0005869082056956181, + "loss": 0.0356, + "num_input_tokens_seen": 132262576, + "step": 61295 + }, + { + "epoch": 10.0, + "grad_norm": 0.026446862146258354, + "learning_rate": 0.000586838108486517, + "loss": 0.0474, + "num_input_tokens_seen": 132272272, + "step": 61300 + }, + { + "epoch": 10.0, + "eval_loss": 0.12678515911102295, + "eval_runtime": 104.4077, + "eval_samples_per_second": 26.1, + "eval_steps_per_second": 6.532, + "num_input_tokens_seen": 132272272, + "step": 61300 + }, + { + "epoch": 10.000815660685156, + "grad_norm": 0.0049971952103078365, + "learning_rate": 0.0005867680095175315, + "loss": 0.0688, + "num_input_tokens_seen": 132282928, + "step": 61305 + }, + { + "epoch": 10.00163132137031, + "grad_norm": 0.0028669931925833225, + "learning_rate": 0.0005866979087900822, + "loss": 0.0185, + "num_input_tokens_seen": 132294672, + "step": 61310 + }, + { + "epoch": 10.002446982055465, + "grad_norm": 0.0542973168194294, + "learning_rate": 0.0005866278063055898, + "loss": 0.0089, + "num_input_tokens_seen": 132305232, + "step": 61315 + }, + { + "epoch": 10.00326264274062, + "grad_norm": 0.010110327042639256, + "learning_rate": 0.0005865577020654751, + "loss": 0.0141, + "num_input_tokens_seen": 132317168, + "step": 61320 + }, + { + "epoch": 10.004078303425775, + "grad_norm": 0.02987646497786045, + "learning_rate": 0.0005864875960711588, + "loss": 0.027, + "num_input_tokens_seen": 132328688, + "step": 61325 + }, + { + "epoch": 10.00489396411093, + "grad_norm": 0.007350550964474678, + "learning_rate": 0.0005864174883240614, + "loss": 0.008, + "num_input_tokens_seen": 132339472, + "step": 61330 + }, + { + "epoch": 10.005709624796085, + "grad_norm": 0.001487818779423833, + "learning_rate": 0.0005863473788256042, + "loss": 0.017, + "num_input_tokens_seen": 132348624, + "step": 61335 + }, + { + "epoch": 10.00652528548124, + "grad_norm": 0.021875599399209023, + "learning_rate": 0.0005862772675772076, + "loss": 0.0162, + "num_input_tokens_seen": 132357968, + "step": 61340 + }, + { + "epoch": 10.007340946166394, + "grad_norm": 0.0026479815132915974, + "learning_rate": 0.000586207154580293, + "loss": 0.0506, + "num_input_tokens_seen": 132369040, + "step": 61345 + }, + { + "epoch": 10.00815660685155, + "grad_norm": 0.018447181209921837, + "learning_rate": 0.0005861370398362809, + "loss": 0.0922, + "num_input_tokens_seen": 132380304, + "step": 61350 + }, + { + "epoch": 10.008972267536704, + "grad_norm": 0.10940034687519073, + "learning_rate": 0.0005860669233465925, + "loss": 0.0153, + "num_input_tokens_seen": 132391280, + "step": 61355 + }, + { + "epoch": 10.00978792822186, + "grad_norm": 0.0184964407235384, + "learning_rate": 0.0005859968051126486, + "loss": 0.1231, + "num_input_tokens_seen": 132402544, + "step": 61360 + }, + { + "epoch": 10.010603588907015, + "grad_norm": 0.2401627004146576, + "learning_rate": 0.0005859266851358704, + "loss": 0.0537, + "num_input_tokens_seen": 132414064, + "step": 61365 + }, + { + "epoch": 10.01141924959217, + "grad_norm": 0.011693798936903477, + "learning_rate": 0.0005858565634176789, + "loss": 0.0639, + "num_input_tokens_seen": 132425712, + "step": 61370 + }, + { + "epoch": 10.012234910277325, + "grad_norm": 0.03067117929458618, + "learning_rate": 0.0005857864399594953, + "loss": 0.0225, + "num_input_tokens_seen": 132436656, + "step": 61375 + }, + { + "epoch": 10.013050570962479, + "grad_norm": 0.006082721054553986, + "learning_rate": 0.0005857163147627406, + "loss": 0.0158, + "num_input_tokens_seen": 132447440, + "step": 61380 + }, + { + "epoch": 10.013866231647635, + "grad_norm": 0.003431787947192788, + "learning_rate": 0.000585646187828836, + "loss": 0.1353, + "num_input_tokens_seen": 132459376, + "step": 61385 + }, + { + "epoch": 10.01468189233279, + "grad_norm": 0.020110931247472763, + "learning_rate": 0.000585576059159203, + "loss": 0.0082, + "num_input_tokens_seen": 132471440, + "step": 61390 + }, + { + "epoch": 10.015497553017944, + "grad_norm": 0.06702979654073715, + "learning_rate": 0.0005855059287552623, + "loss": 0.0127, + "num_input_tokens_seen": 132482192, + "step": 61395 + }, + { + "epoch": 10.0163132137031, + "grad_norm": 0.01968558505177498, + "learning_rate": 0.0005854357966184356, + "loss": 0.0093, + "num_input_tokens_seen": 132492528, + "step": 61400 + }, + { + "epoch": 10.017128874388254, + "grad_norm": 0.3056025207042694, + "learning_rate": 0.0005853656627501442, + "loss": 0.1355, + "num_input_tokens_seen": 132503440, + "step": 61405 + }, + { + "epoch": 10.01794453507341, + "grad_norm": 0.25775644183158875, + "learning_rate": 0.0005852955271518092, + "loss": 0.1528, + "num_input_tokens_seen": 132513616, + "step": 61410 + }, + { + "epoch": 10.018760195758565, + "grad_norm": 0.13199271261692047, + "learning_rate": 0.0005852253898248522, + "loss": 0.0639, + "num_input_tokens_seen": 132524560, + "step": 61415 + }, + { + "epoch": 10.01957585644372, + "grad_norm": 0.007556598167866468, + "learning_rate": 0.0005851552507706945, + "loss": 0.0055, + "num_input_tokens_seen": 132534192, + "step": 61420 + }, + { + "epoch": 10.020391517128875, + "grad_norm": 0.18122105300426483, + "learning_rate": 0.0005850851099907577, + "loss": 0.2519, + "num_input_tokens_seen": 132545104, + "step": 61425 + }, + { + "epoch": 10.021207177814029, + "grad_norm": 0.00917928759008646, + "learning_rate": 0.0005850149674864631, + "loss": 0.0276, + "num_input_tokens_seen": 132556816, + "step": 61430 + }, + { + "epoch": 10.022022838499185, + "grad_norm": 0.07371887564659119, + "learning_rate": 0.0005849448232592324, + "loss": 0.0092, + "num_input_tokens_seen": 132567888, + "step": 61435 + }, + { + "epoch": 10.022838499184338, + "grad_norm": 0.29154112935066223, + "learning_rate": 0.0005848746773104871, + "loss": 0.054, + "num_input_tokens_seen": 132580912, + "step": 61440 + }, + { + "epoch": 10.023654159869494, + "grad_norm": 0.03154376149177551, + "learning_rate": 0.0005848045296416488, + "loss": 0.1162, + "num_input_tokens_seen": 132593520, + "step": 61445 + }, + { + "epoch": 10.02446982055465, + "grad_norm": 0.006617080420255661, + "learning_rate": 0.0005847343802541391, + "loss": 0.0031, + "num_input_tokens_seen": 132603504, + "step": 61450 + }, + { + "epoch": 10.025285481239804, + "grad_norm": 0.015079149045050144, + "learning_rate": 0.0005846642291493796, + "loss": 0.0111, + "num_input_tokens_seen": 132614576, + "step": 61455 + }, + { + "epoch": 10.02610114192496, + "grad_norm": 0.002986414125189185, + "learning_rate": 0.0005845940763287923, + "loss": 0.0266, + "num_input_tokens_seen": 132624944, + "step": 61460 + }, + { + "epoch": 10.026916802610113, + "grad_norm": 0.0029899596702307463, + "learning_rate": 0.0005845239217937986, + "loss": 0.0988, + "num_input_tokens_seen": 132635248, + "step": 61465 + }, + { + "epoch": 10.02773246329527, + "grad_norm": 0.22422057390213013, + "learning_rate": 0.0005844537655458203, + "loss": 0.0208, + "num_input_tokens_seen": 132646288, + "step": 61470 + }, + { + "epoch": 10.028548123980425, + "grad_norm": 0.028829464688897133, + "learning_rate": 0.0005843836075862794, + "loss": 0.0052, + "num_input_tokens_seen": 132655888, + "step": 61475 + }, + { + "epoch": 10.029363784665579, + "grad_norm": 0.31386232376098633, + "learning_rate": 0.0005843134479165977, + "loss": 0.032, + "num_input_tokens_seen": 132666256, + "step": 61480 + }, + { + "epoch": 10.030179445350734, + "grad_norm": 0.00522128539159894, + "learning_rate": 0.0005842432865381971, + "loss": 0.1209, + "num_input_tokens_seen": 132677104, + "step": 61485 + }, + { + "epoch": 10.030995106035888, + "grad_norm": 0.07230376452207565, + "learning_rate": 0.0005841731234524993, + "loss": 0.0558, + "num_input_tokens_seen": 132688656, + "step": 61490 + }, + { + "epoch": 10.031810766721044, + "grad_norm": 0.0078009325079619884, + "learning_rate": 0.0005841029586609263, + "loss": 0.0079, + "num_input_tokens_seen": 132698896, + "step": 61495 + }, + { + "epoch": 10.0326264274062, + "grad_norm": 0.1573605090379715, + "learning_rate": 0.0005840327921649003, + "loss": 0.0699, + "num_input_tokens_seen": 132709392, + "step": 61500 + }, + { + "epoch": 10.033442088091354, + "grad_norm": 0.05050228536128998, + "learning_rate": 0.0005839626239658431, + "loss": 0.0075, + "num_input_tokens_seen": 132719728, + "step": 61505 + }, + { + "epoch": 10.03425774877651, + "grad_norm": 0.2857561707496643, + "learning_rate": 0.0005838924540651769, + "loss": 0.0496, + "num_input_tokens_seen": 132730192, + "step": 61510 + }, + { + "epoch": 10.035073409461663, + "grad_norm": 0.018673496320843697, + "learning_rate": 0.0005838222824643235, + "loss": 0.0441, + "num_input_tokens_seen": 132740208, + "step": 61515 + }, + { + "epoch": 10.035889070146819, + "grad_norm": 0.0022126613184809685, + "learning_rate": 0.0005837521091647054, + "loss": 0.0312, + "num_input_tokens_seen": 132751184, + "step": 61520 + }, + { + "epoch": 10.036704730831975, + "grad_norm": 0.12979358434677124, + "learning_rate": 0.0005836819341677444, + "loss": 0.0226, + "num_input_tokens_seen": 132762672, + "step": 61525 + }, + { + "epoch": 10.037520391517129, + "grad_norm": 0.030350472778081894, + "learning_rate": 0.0005836117574748629, + "loss": 0.033, + "num_input_tokens_seen": 132774448, + "step": 61530 + }, + { + "epoch": 10.038336052202284, + "grad_norm": 0.017586493864655495, + "learning_rate": 0.0005835415790874832, + "loss": 0.0077, + "num_input_tokens_seen": 132785040, + "step": 61535 + }, + { + "epoch": 10.039151712887438, + "grad_norm": 0.0041281841695308685, + "learning_rate": 0.0005834713990070273, + "loss": 0.0087, + "num_input_tokens_seen": 132795856, + "step": 61540 + }, + { + "epoch": 10.039967373572594, + "grad_norm": 0.1649474799633026, + "learning_rate": 0.0005834012172349174, + "loss": 0.043, + "num_input_tokens_seen": 132808080, + "step": 61545 + }, + { + "epoch": 10.040783034257748, + "grad_norm": 0.0070051159709692, + "learning_rate": 0.0005833310337725764, + "loss": 0.0637, + "num_input_tokens_seen": 132818768, + "step": 61550 + }, + { + "epoch": 10.041598694942904, + "grad_norm": 0.009777032770216465, + "learning_rate": 0.0005832608486214261, + "loss": 0.0192, + "num_input_tokens_seen": 132830928, + "step": 61555 + }, + { + "epoch": 10.04241435562806, + "grad_norm": 0.002175942063331604, + "learning_rate": 0.0005831906617828892, + "loss": 0.0407, + "num_input_tokens_seen": 132842416, + "step": 61560 + }, + { + "epoch": 10.043230016313213, + "grad_norm": 0.011430644430220127, + "learning_rate": 0.0005831204732583879, + "loss": 0.0256, + "num_input_tokens_seen": 132853904, + "step": 61565 + }, + { + "epoch": 10.044045676998369, + "grad_norm": 0.41094970703125, + "learning_rate": 0.0005830502830493447, + "loss": 0.1469, + "num_input_tokens_seen": 132863664, + "step": 61570 + }, + { + "epoch": 10.044861337683523, + "grad_norm": 0.23442670702934265, + "learning_rate": 0.0005829800911571824, + "loss": 0.0276, + "num_input_tokens_seen": 132873456, + "step": 61575 + }, + { + "epoch": 10.045676998368679, + "grad_norm": 0.013725553639233112, + "learning_rate": 0.000582909897583323, + "loss": 0.0083, + "num_input_tokens_seen": 132884560, + "step": 61580 + }, + { + "epoch": 10.046492659053834, + "grad_norm": 0.008123509585857391, + "learning_rate": 0.0005828397023291895, + "loss": 0.0085, + "num_input_tokens_seen": 132895664, + "step": 61585 + }, + { + "epoch": 10.047308319738988, + "grad_norm": 0.06841395050287247, + "learning_rate": 0.0005827695053962043, + "loss": 0.0199, + "num_input_tokens_seen": 132907824, + "step": 61590 + }, + { + "epoch": 10.048123980424144, + "grad_norm": 0.2624976933002472, + "learning_rate": 0.0005826993067857901, + "loss": 0.0419, + "num_input_tokens_seen": 132919376, + "step": 61595 + }, + { + "epoch": 10.048939641109298, + "grad_norm": 0.058688197284936905, + "learning_rate": 0.0005826291064993695, + "loss": 0.0117, + "num_input_tokens_seen": 132930992, + "step": 61600 + }, + { + "epoch": 10.049755301794454, + "grad_norm": 0.04489204287528992, + "learning_rate": 0.0005825589045383654, + "loss": 0.033, + "num_input_tokens_seen": 132941040, + "step": 61605 + }, + { + "epoch": 10.05057096247961, + "grad_norm": 0.0037497221492230892, + "learning_rate": 0.0005824887009042002, + "loss": 0.0069, + "num_input_tokens_seen": 132952272, + "step": 61610 + }, + { + "epoch": 10.051386623164763, + "grad_norm": 0.0069594066590070724, + "learning_rate": 0.0005824184955982967, + "loss": 0.0081, + "num_input_tokens_seen": 132963024, + "step": 61615 + }, + { + "epoch": 10.052202283849919, + "grad_norm": 0.004311118740588427, + "learning_rate": 0.000582348288622078, + "loss": 0.0189, + "num_input_tokens_seen": 132974512, + "step": 61620 + }, + { + "epoch": 10.053017944535073, + "grad_norm": 0.0173836387693882, + "learning_rate": 0.0005822780799769667, + "loss": 0.0301, + "num_input_tokens_seen": 132985392, + "step": 61625 + }, + { + "epoch": 10.053833605220229, + "grad_norm": 0.013632199726998806, + "learning_rate": 0.0005822078696643859, + "loss": 0.0556, + "num_input_tokens_seen": 132996144, + "step": 61630 + }, + { + "epoch": 10.054649265905383, + "grad_norm": 0.0012218935880810022, + "learning_rate": 0.0005821376576857582, + "loss": 0.0629, + "num_input_tokens_seen": 133006576, + "step": 61635 + }, + { + "epoch": 10.055464926590538, + "grad_norm": 0.025944620370864868, + "learning_rate": 0.0005820674440425067, + "loss": 0.0055, + "num_input_tokens_seen": 133017296, + "step": 61640 + }, + { + "epoch": 10.056280587275694, + "grad_norm": 0.26036545634269714, + "learning_rate": 0.0005819972287360543, + "loss": 0.1004, + "num_input_tokens_seen": 133027280, + "step": 61645 + }, + { + "epoch": 10.057096247960848, + "grad_norm": 0.020724255591630936, + "learning_rate": 0.0005819270117678239, + "loss": 0.0082, + "num_input_tokens_seen": 133038832, + "step": 61650 + }, + { + "epoch": 10.057911908646004, + "grad_norm": 0.0026531433686614037, + "learning_rate": 0.0005818567931392389, + "loss": 0.1943, + "num_input_tokens_seen": 133049296, + "step": 61655 + }, + { + "epoch": 10.058727569331158, + "grad_norm": 0.011688055470585823, + "learning_rate": 0.000581786572851722, + "loss": 0.0321, + "num_input_tokens_seen": 133060304, + "step": 61660 + }, + { + "epoch": 10.059543230016313, + "grad_norm": 0.06265543401241302, + "learning_rate": 0.0005817163509066966, + "loss": 0.0086, + "num_input_tokens_seen": 133071216, + "step": 61665 + }, + { + "epoch": 10.060358890701469, + "grad_norm": 0.003058596048504114, + "learning_rate": 0.0005816461273055857, + "loss": 0.0585, + "num_input_tokens_seen": 133083056, + "step": 61670 + }, + { + "epoch": 10.061174551386623, + "grad_norm": 0.10146154463291168, + "learning_rate": 0.0005815759020498122, + "loss": 0.1186, + "num_input_tokens_seen": 133093872, + "step": 61675 + }, + { + "epoch": 10.061990212071779, + "grad_norm": 0.010096694342792034, + "learning_rate": 0.0005815056751407999, + "loss": 0.112, + "num_input_tokens_seen": 133104784, + "step": 61680 + }, + { + "epoch": 10.062805872756933, + "grad_norm": 0.0013990678125992417, + "learning_rate": 0.0005814354465799715, + "loss": 0.0793, + "num_input_tokens_seen": 133114896, + "step": 61685 + }, + { + "epoch": 10.063621533442088, + "grad_norm": 0.05207381397485733, + "learning_rate": 0.0005813652163687504, + "loss": 0.0265, + "num_input_tokens_seen": 133126256, + "step": 61690 + }, + { + "epoch": 10.064437194127244, + "grad_norm": 0.002854426158592105, + "learning_rate": 0.0005812949845085601, + "loss": 0.0354, + "num_input_tokens_seen": 133137200, + "step": 61695 + }, + { + "epoch": 10.065252854812398, + "grad_norm": 0.24887309968471527, + "learning_rate": 0.0005812247510008238, + "loss": 0.0662, + "num_input_tokens_seen": 133148016, + "step": 61700 + }, + { + "epoch": 10.066068515497554, + "grad_norm": 0.009434686042368412, + "learning_rate": 0.0005811545158469649, + "loss": 0.1089, + "num_input_tokens_seen": 133159344, + "step": 61705 + }, + { + "epoch": 10.066884176182707, + "grad_norm": 0.17490193247795105, + "learning_rate": 0.0005810842790484066, + "loss": 0.0291, + "num_input_tokens_seen": 133168560, + "step": 61710 + }, + { + "epoch": 10.067699836867863, + "grad_norm": 0.004364182241261005, + "learning_rate": 0.0005810140406065727, + "loss": 0.0716, + "num_input_tokens_seen": 133178800, + "step": 61715 + }, + { + "epoch": 10.068515497553017, + "grad_norm": 0.038533031940460205, + "learning_rate": 0.0005809438005228866, + "loss": 0.0896, + "num_input_tokens_seen": 133190032, + "step": 61720 + }, + { + "epoch": 10.069331158238173, + "grad_norm": 0.03599059209227562, + "learning_rate": 0.0005808735587987714, + "loss": 0.0337, + "num_input_tokens_seen": 133200624, + "step": 61725 + }, + { + "epoch": 10.070146818923329, + "grad_norm": 0.12202820181846619, + "learning_rate": 0.0005808033154356511, + "loss": 0.0483, + "num_input_tokens_seen": 133212400, + "step": 61730 + }, + { + "epoch": 10.070962479608482, + "grad_norm": 0.0024122411850839853, + "learning_rate": 0.0005807330704349492, + "loss": 0.007, + "num_input_tokens_seen": 133223536, + "step": 61735 + }, + { + "epoch": 10.071778140293638, + "grad_norm": 0.021759632974863052, + "learning_rate": 0.0005806628237980891, + "loss": 0.1048, + "num_input_tokens_seen": 133234640, + "step": 61740 + }, + { + "epoch": 10.072593800978792, + "grad_norm": 0.004035233519971371, + "learning_rate": 0.0005805925755264945, + "loss": 0.048, + "num_input_tokens_seen": 133244720, + "step": 61745 + }, + { + "epoch": 10.073409461663948, + "grad_norm": 0.004185998346656561, + "learning_rate": 0.0005805223256215891, + "loss": 0.0971, + "num_input_tokens_seen": 133255856, + "step": 61750 + }, + { + "epoch": 10.074225122349104, + "grad_norm": 0.026021679863333702, + "learning_rate": 0.0005804520740847966, + "loss": 0.0545, + "num_input_tokens_seen": 133266704, + "step": 61755 + }, + { + "epoch": 10.075040783034257, + "grad_norm": 0.00480511924251914, + "learning_rate": 0.0005803818209175409, + "loss": 0.0454, + "num_input_tokens_seen": 133277488, + "step": 61760 + }, + { + "epoch": 10.075856443719413, + "grad_norm": 0.2196371704339981, + "learning_rate": 0.0005803115661212456, + "loss": 0.089, + "num_input_tokens_seen": 133288272, + "step": 61765 + }, + { + "epoch": 10.076672104404567, + "grad_norm": 0.01837882585823536, + "learning_rate": 0.0005802413096973345, + "loss": 0.0466, + "num_input_tokens_seen": 133299632, + "step": 61770 + }, + { + "epoch": 10.077487765089723, + "grad_norm": 0.020451877266168594, + "learning_rate": 0.0005801710516472315, + "loss": 0.0066, + "num_input_tokens_seen": 133309296, + "step": 61775 + }, + { + "epoch": 10.078303425774878, + "grad_norm": 0.019125672057271004, + "learning_rate": 0.0005801007919723605, + "loss": 0.0072, + "num_input_tokens_seen": 133319760, + "step": 61780 + }, + { + "epoch": 10.079119086460032, + "grad_norm": 0.001961350906640291, + "learning_rate": 0.000580030530674145, + "loss": 0.0575, + "num_input_tokens_seen": 133330960, + "step": 61785 + }, + { + "epoch": 10.079934747145188, + "grad_norm": 0.06800465285778046, + "learning_rate": 0.0005799602677540095, + "loss": 0.0552, + "num_input_tokens_seen": 133342256, + "step": 61790 + }, + { + "epoch": 10.080750407830342, + "grad_norm": 0.104884572327137, + "learning_rate": 0.0005798900032133778, + "loss": 0.0351, + "num_input_tokens_seen": 133352496, + "step": 61795 + }, + { + "epoch": 10.081566068515498, + "grad_norm": 0.014615744352340698, + "learning_rate": 0.0005798197370536737, + "loss": 0.0221, + "num_input_tokens_seen": 133362096, + "step": 61800 + }, + { + "epoch": 10.082381729200652, + "grad_norm": 0.6837376356124878, + "learning_rate": 0.0005797494692763215, + "loss": 0.0437, + "num_input_tokens_seen": 133373200, + "step": 61805 + }, + { + "epoch": 10.083197389885807, + "grad_norm": 0.018247120082378387, + "learning_rate": 0.0005796791998827451, + "loss": 0.0308, + "num_input_tokens_seen": 133384944, + "step": 61810 + }, + { + "epoch": 10.084013050570963, + "grad_norm": 0.0018951846286654472, + "learning_rate": 0.0005796089288743687, + "loss": 0.0105, + "num_input_tokens_seen": 133396144, + "step": 61815 + }, + { + "epoch": 10.084828711256117, + "grad_norm": 0.07727274298667908, + "learning_rate": 0.0005795386562526163, + "loss": 0.0117, + "num_input_tokens_seen": 133407312, + "step": 61820 + }, + { + "epoch": 10.085644371941273, + "grad_norm": 0.0046775490045547485, + "learning_rate": 0.000579468382018912, + "loss": 0.0029, + "num_input_tokens_seen": 133418640, + "step": 61825 + }, + { + "epoch": 10.086460032626427, + "grad_norm": 0.22308705747127533, + "learning_rate": 0.0005793981061746802, + "loss": 0.1058, + "num_input_tokens_seen": 133430160, + "step": 61830 + }, + { + "epoch": 10.087275693311582, + "grad_norm": 0.01267511397600174, + "learning_rate": 0.0005793278287213453, + "loss": 0.0134, + "num_input_tokens_seen": 133441424, + "step": 61835 + }, + { + "epoch": 10.088091353996738, + "grad_norm": 0.09423317015171051, + "learning_rate": 0.000579257549660331, + "loss": 0.0157, + "num_input_tokens_seen": 133452560, + "step": 61840 + }, + { + "epoch": 10.088907014681892, + "grad_norm": 0.019913768395781517, + "learning_rate": 0.0005791872689930621, + "loss": 0.121, + "num_input_tokens_seen": 133464432, + "step": 61845 + }, + { + "epoch": 10.089722675367048, + "grad_norm": 0.1102357804775238, + "learning_rate": 0.0005791169867209626, + "loss": 0.0132, + "num_input_tokens_seen": 133475920, + "step": 61850 + }, + { + "epoch": 10.090538336052202, + "grad_norm": 0.06185174360871315, + "learning_rate": 0.0005790467028454571, + "loss": 0.0566, + "num_input_tokens_seen": 133486992, + "step": 61855 + }, + { + "epoch": 10.091353996737357, + "grad_norm": 0.03568820282816887, + "learning_rate": 0.0005789764173679698, + "loss": 0.0127, + "num_input_tokens_seen": 133498768, + "step": 61860 + }, + { + "epoch": 10.092169657422513, + "grad_norm": 0.024414775893092155, + "learning_rate": 0.0005789061302899252, + "loss": 0.0431, + "num_input_tokens_seen": 133509872, + "step": 61865 + }, + { + "epoch": 10.092985318107667, + "grad_norm": 0.011188477277755737, + "learning_rate": 0.0005788358416127478, + "loss": 0.0104, + "num_input_tokens_seen": 133521264, + "step": 61870 + }, + { + "epoch": 10.093800978792823, + "grad_norm": 0.004692681133747101, + "learning_rate": 0.0005787655513378622, + "loss": 0.0058, + "num_input_tokens_seen": 133531824, + "step": 61875 + }, + { + "epoch": 10.094616639477977, + "grad_norm": 0.01405363716185093, + "learning_rate": 0.0005786952594666925, + "loss": 0.017, + "num_input_tokens_seen": 133543504, + "step": 61880 + }, + { + "epoch": 10.095432300163132, + "grad_norm": 0.02782539092004299, + "learning_rate": 0.0005786249660006638, + "loss": 0.0288, + "num_input_tokens_seen": 133553968, + "step": 61885 + }, + { + "epoch": 10.096247960848286, + "grad_norm": 0.0020635018590837717, + "learning_rate": 0.0005785546709412004, + "loss": 0.0154, + "num_input_tokens_seen": 133563184, + "step": 61890 + }, + { + "epoch": 10.097063621533442, + "grad_norm": 0.2968139946460724, + "learning_rate": 0.0005784843742897268, + "loss": 0.0355, + "num_input_tokens_seen": 133573552, + "step": 61895 + }, + { + "epoch": 10.097879282218598, + "grad_norm": 0.015226349234580994, + "learning_rate": 0.0005784140760476679, + "loss": 0.0037, + "num_input_tokens_seen": 133584432, + "step": 61900 + }, + { + "epoch": 10.098694942903752, + "grad_norm": 0.07021880149841309, + "learning_rate": 0.0005783437762164483, + "loss": 0.0551, + "num_input_tokens_seen": 133595344, + "step": 61905 + }, + { + "epoch": 10.099510603588907, + "grad_norm": 0.004048360977321863, + "learning_rate": 0.0005782734747974926, + "loss": 0.0207, + "num_input_tokens_seen": 133606640, + "step": 61910 + }, + { + "epoch": 10.100326264274061, + "grad_norm": 0.029553735628724098, + "learning_rate": 0.0005782031717922256, + "loss": 0.0322, + "num_input_tokens_seen": 133617264, + "step": 61915 + }, + { + "epoch": 10.101141924959217, + "grad_norm": 0.4037034213542938, + "learning_rate": 0.0005781328672020723, + "loss": 0.2044, + "num_input_tokens_seen": 133628912, + "step": 61920 + }, + { + "epoch": 10.101957585644373, + "grad_norm": 0.2710871696472168, + "learning_rate": 0.0005780625610284572, + "loss": 0.0958, + "num_input_tokens_seen": 133638512, + "step": 61925 + }, + { + "epoch": 10.102773246329527, + "grad_norm": 0.19376686215400696, + "learning_rate": 0.000577992253272805, + "loss": 0.024, + "num_input_tokens_seen": 133648880, + "step": 61930 + }, + { + "epoch": 10.103588907014682, + "grad_norm": 0.44689705967903137, + "learning_rate": 0.0005779219439365411, + "loss": 0.0482, + "num_input_tokens_seen": 133658480, + "step": 61935 + }, + { + "epoch": 10.104404567699836, + "grad_norm": 0.025761045515537262, + "learning_rate": 0.0005778516330210902, + "loss": 0.009, + "num_input_tokens_seen": 133669552, + "step": 61940 + }, + { + "epoch": 10.105220228384992, + "grad_norm": 0.0018303771503269672, + "learning_rate": 0.0005777813205278772, + "loss": 0.0225, + "num_input_tokens_seen": 133680240, + "step": 61945 + }, + { + "epoch": 10.106035889070148, + "grad_norm": 0.01580619439482689, + "learning_rate": 0.0005777110064583271, + "loss": 0.0147, + "num_input_tokens_seen": 133690480, + "step": 61950 + }, + { + "epoch": 10.106851549755302, + "grad_norm": 0.002764845732599497, + "learning_rate": 0.0005776406908138648, + "loss": 0.0892, + "num_input_tokens_seen": 133701104, + "step": 61955 + }, + { + "epoch": 10.107667210440457, + "grad_norm": 0.4119803011417389, + "learning_rate": 0.0005775703735959155, + "loss": 0.1421, + "num_input_tokens_seen": 133712016, + "step": 61960 + }, + { + "epoch": 10.108482871125611, + "grad_norm": 0.0009009005152620375, + "learning_rate": 0.000577500054805904, + "loss": 0.0028, + "num_input_tokens_seen": 133723152, + "step": 61965 + }, + { + "epoch": 10.109298531810767, + "grad_norm": 0.006416691467165947, + "learning_rate": 0.0005774297344452556, + "loss": 0.0121, + "num_input_tokens_seen": 133735088, + "step": 61970 + }, + { + "epoch": 10.11011419249592, + "grad_norm": 0.0011904212879016995, + "learning_rate": 0.0005773594125153955, + "loss": 0.0364, + "num_input_tokens_seen": 133744624, + "step": 61975 + }, + { + "epoch": 10.110929853181077, + "grad_norm": 0.34913864731788635, + "learning_rate": 0.0005772890890177487, + "loss": 0.0201, + "num_input_tokens_seen": 133755344, + "step": 61980 + }, + { + "epoch": 10.111745513866232, + "grad_norm": 0.002706618746742606, + "learning_rate": 0.0005772187639537405, + "loss": 0.0083, + "num_input_tokens_seen": 133766640, + "step": 61985 + }, + { + "epoch": 10.112561174551386, + "grad_norm": 0.01733635924756527, + "learning_rate": 0.000577148437324796, + "loss": 0.0085, + "num_input_tokens_seen": 133776368, + "step": 61990 + }, + { + "epoch": 10.113376835236542, + "grad_norm": 0.01870352402329445, + "learning_rate": 0.0005770781091323407, + "loss": 0.008, + "num_input_tokens_seen": 133787120, + "step": 61995 + }, + { + "epoch": 10.114192495921696, + "grad_norm": 0.3460165560245514, + "learning_rate": 0.0005770077793777996, + "loss": 0.0453, + "num_input_tokens_seen": 133798448, + "step": 62000 + }, + { + "epoch": 10.115008156606851, + "grad_norm": 0.016396356746554375, + "learning_rate": 0.0005769374480625983, + "loss": 0.0452, + "num_input_tokens_seen": 133808112, + "step": 62005 + }, + { + "epoch": 10.115823817292007, + "grad_norm": 0.0031818056013435125, + "learning_rate": 0.000576867115188162, + "loss": 0.0032, + "num_input_tokens_seen": 133819856, + "step": 62010 + }, + { + "epoch": 10.116639477977161, + "grad_norm": 0.014918707311153412, + "learning_rate": 0.000576796780755916, + "loss": 0.0826, + "num_input_tokens_seen": 133831792, + "step": 62015 + }, + { + "epoch": 10.117455138662317, + "grad_norm": 0.14264251291751862, + "learning_rate": 0.0005767264447672859, + "loss": 0.1458, + "num_input_tokens_seen": 133841552, + "step": 62020 + }, + { + "epoch": 10.11827079934747, + "grad_norm": 0.013111090287566185, + "learning_rate": 0.000576656107223697, + "loss": 0.009, + "num_input_tokens_seen": 133852688, + "step": 62025 + }, + { + "epoch": 10.119086460032626, + "grad_norm": 0.005116211250424385, + "learning_rate": 0.0005765857681265749, + "loss": 0.0036, + "num_input_tokens_seen": 133863696, + "step": 62030 + }, + { + "epoch": 10.119902120717782, + "grad_norm": 0.009506931528449059, + "learning_rate": 0.000576515427477345, + "loss": 0.0214, + "num_input_tokens_seen": 133875536, + "step": 62035 + }, + { + "epoch": 10.120717781402936, + "grad_norm": 0.044899359345436096, + "learning_rate": 0.0005764450852774329, + "loss": 0.0158, + "num_input_tokens_seen": 133885872, + "step": 62040 + }, + { + "epoch": 10.121533442088092, + "grad_norm": 0.0009844235610216856, + "learning_rate": 0.0005763747415282642, + "loss": 0.0329, + "num_input_tokens_seen": 133897776, + "step": 62045 + }, + { + "epoch": 10.122349102773246, + "grad_norm": 0.4768241047859192, + "learning_rate": 0.0005763043962312644, + "loss": 0.0421, + "num_input_tokens_seen": 133909040, + "step": 62050 + }, + { + "epoch": 10.123164763458401, + "grad_norm": 0.01316496729850769, + "learning_rate": 0.0005762340493878593, + "loss": 0.0083, + "num_input_tokens_seen": 133919536, + "step": 62055 + }, + { + "epoch": 10.123980424143557, + "grad_norm": 0.006132745184004307, + "learning_rate": 0.0005761637009994745, + "loss": 0.0094, + "num_input_tokens_seen": 133928912, + "step": 62060 + }, + { + "epoch": 10.124796084828711, + "grad_norm": 0.02580087073147297, + "learning_rate": 0.0005760933510675356, + "loss": 0.0034, + "num_input_tokens_seen": 133940784, + "step": 62065 + }, + { + "epoch": 10.125611745513867, + "grad_norm": 0.0038549809250980616, + "learning_rate": 0.0005760229995934684, + "loss": 0.0157, + "num_input_tokens_seen": 133950000, + "step": 62070 + }, + { + "epoch": 10.12642740619902, + "grad_norm": 0.002713927999138832, + "learning_rate": 0.0005759526465786986, + "loss": 0.016, + "num_input_tokens_seen": 133960464, + "step": 62075 + }, + { + "epoch": 10.127243066884176, + "grad_norm": 0.01580626703798771, + "learning_rate": 0.0005758822920246523, + "loss": 0.0307, + "num_input_tokens_seen": 133971728, + "step": 62080 + }, + { + "epoch": 10.12805872756933, + "grad_norm": 0.00869547389447689, + "learning_rate": 0.000575811935932755, + "loss": 0.0755, + "num_input_tokens_seen": 133983696, + "step": 62085 + }, + { + "epoch": 10.128874388254486, + "grad_norm": 0.006843876093626022, + "learning_rate": 0.0005757415783044325, + "loss": 0.0569, + "num_input_tokens_seen": 133994224, + "step": 62090 + }, + { + "epoch": 10.129690048939642, + "grad_norm": 0.014547888189554214, + "learning_rate": 0.0005756712191411109, + "loss": 0.139, + "num_input_tokens_seen": 134005264, + "step": 62095 + }, + { + "epoch": 10.130505709624796, + "grad_norm": 0.1061469316482544, + "learning_rate": 0.0005756008584442161, + "loss": 0.164, + "num_input_tokens_seen": 134017136, + "step": 62100 + }, + { + "epoch": 10.131321370309951, + "grad_norm": 0.1204056590795517, + "learning_rate": 0.0005755304962151739, + "loss": 0.0128, + "num_input_tokens_seen": 134027920, + "step": 62105 + }, + { + "epoch": 10.132137030995105, + "grad_norm": 0.214090496301651, + "learning_rate": 0.0005754601324554104, + "loss": 0.0194, + "num_input_tokens_seen": 134039152, + "step": 62110 + }, + { + "epoch": 10.132952691680261, + "grad_norm": 0.037570152431726456, + "learning_rate": 0.0005753897671663518, + "loss": 0.1069, + "num_input_tokens_seen": 134050192, + "step": 62115 + }, + { + "epoch": 10.133768352365417, + "grad_norm": 0.05548904836177826, + "learning_rate": 0.0005753194003494237, + "loss": 0.0718, + "num_input_tokens_seen": 134061712, + "step": 62120 + }, + { + "epoch": 10.13458401305057, + "grad_norm": 0.19013844430446625, + "learning_rate": 0.0005752490320060524, + "loss": 0.0302, + "num_input_tokens_seen": 134072208, + "step": 62125 + }, + { + "epoch": 10.135399673735726, + "grad_norm": 0.0029874593019485474, + "learning_rate": 0.0005751786621376641, + "loss": 0.0196, + "num_input_tokens_seen": 134083504, + "step": 62130 + }, + { + "epoch": 10.13621533442088, + "grad_norm": 0.47766968607902527, + "learning_rate": 0.0005751082907456849, + "loss": 0.0338, + "num_input_tokens_seen": 134093936, + "step": 62135 + }, + { + "epoch": 10.137030995106036, + "grad_norm": 0.05216453596949577, + "learning_rate": 0.0005750379178315408, + "loss": 0.009, + "num_input_tokens_seen": 134104240, + "step": 62140 + }, + { + "epoch": 10.137846655791192, + "grad_norm": 0.006784575991332531, + "learning_rate": 0.0005749675433966581, + "loss": 0.1089, + "num_input_tokens_seen": 134116176, + "step": 62145 + }, + { + "epoch": 10.138662316476346, + "grad_norm": 0.38610172271728516, + "learning_rate": 0.0005748971674424631, + "loss": 0.1927, + "num_input_tokens_seen": 134127440, + "step": 62150 + }, + { + "epoch": 10.139477977161501, + "grad_norm": 0.028888992965221405, + "learning_rate": 0.0005748267899703819, + "loss": 0.0072, + "num_input_tokens_seen": 134139120, + "step": 62155 + }, + { + "epoch": 10.140293637846655, + "grad_norm": 0.0041194274090230465, + "learning_rate": 0.000574756410981841, + "loss": 0.0051, + "num_input_tokens_seen": 134150192, + "step": 62160 + }, + { + "epoch": 10.141109298531811, + "grad_norm": 0.02953651361167431, + "learning_rate": 0.0005746860304782665, + "loss": 0.033, + "num_input_tokens_seen": 134161712, + "step": 62165 + }, + { + "epoch": 10.141924959216965, + "grad_norm": 0.02464982680976391, + "learning_rate": 0.0005746156484610849, + "loss": 0.0095, + "num_input_tokens_seen": 134171728, + "step": 62170 + }, + { + "epoch": 10.14274061990212, + "grad_norm": 0.015621982514858246, + "learning_rate": 0.0005745452649317225, + "loss": 0.125, + "num_input_tokens_seen": 134181328, + "step": 62175 + }, + { + "epoch": 10.143556280587276, + "grad_norm": 0.1050763726234436, + "learning_rate": 0.0005744748798916057, + "loss": 0.0315, + "num_input_tokens_seen": 134192976, + "step": 62180 + }, + { + "epoch": 10.14437194127243, + "grad_norm": 0.050962530076503754, + "learning_rate": 0.0005744044933421609, + "loss": 0.0122, + "num_input_tokens_seen": 134205168, + "step": 62185 + }, + { + "epoch": 10.145187601957586, + "grad_norm": 0.0037109428085386753, + "learning_rate": 0.0005743341052848147, + "loss": 0.0488, + "num_input_tokens_seen": 134216144, + "step": 62190 + }, + { + "epoch": 10.14600326264274, + "grad_norm": 0.021224696189165115, + "learning_rate": 0.0005742637157209936, + "loss": 0.0059, + "num_input_tokens_seen": 134226384, + "step": 62195 + }, + { + "epoch": 10.146818923327896, + "grad_norm": 0.06981684267520905, + "learning_rate": 0.0005741933246521243, + "loss": 0.0693, + "num_input_tokens_seen": 134236432, + "step": 62200 + }, + { + "epoch": 10.147634584013051, + "grad_norm": 0.3684276342391968, + "learning_rate": 0.0005741229320796329, + "loss": 0.111, + "num_input_tokens_seen": 134248080, + "step": 62205 + }, + { + "epoch": 10.148450244698205, + "grad_norm": 0.0344458632171154, + "learning_rate": 0.0005740525380049464, + "loss": 0.0189, + "num_input_tokens_seen": 134259184, + "step": 62210 + }, + { + "epoch": 10.149265905383361, + "grad_norm": 0.01579313352704048, + "learning_rate": 0.0005739821424294911, + "loss": 0.033, + "num_input_tokens_seen": 134270384, + "step": 62215 + }, + { + "epoch": 10.150081566068515, + "grad_norm": 0.06844834983348846, + "learning_rate": 0.000573911745354694, + "loss": 0.0122, + "num_input_tokens_seen": 134280336, + "step": 62220 + }, + { + "epoch": 10.15089722675367, + "grad_norm": 0.008326658979058266, + "learning_rate": 0.0005738413467819816, + "loss": 0.0592, + "num_input_tokens_seen": 134290800, + "step": 62225 + }, + { + "epoch": 10.151712887438826, + "grad_norm": 0.16074970364570618, + "learning_rate": 0.0005737709467127805, + "loss": 0.0234, + "num_input_tokens_seen": 134301040, + "step": 62230 + }, + { + "epoch": 10.15252854812398, + "grad_norm": 0.3016565144062042, + "learning_rate": 0.0005737005451485177, + "loss": 0.1172, + "num_input_tokens_seen": 134312240, + "step": 62235 + }, + { + "epoch": 10.153344208809136, + "grad_norm": 0.25958481431007385, + "learning_rate": 0.0005736301420906196, + "loss": 0.1123, + "num_input_tokens_seen": 134323312, + "step": 62240 + }, + { + "epoch": 10.15415986949429, + "grad_norm": 0.03370942920446396, + "learning_rate": 0.0005735597375405135, + "loss": 0.0127, + "num_input_tokens_seen": 134334672, + "step": 62245 + }, + { + "epoch": 10.154975530179446, + "grad_norm": 0.08638148009777069, + "learning_rate": 0.000573489331499626, + "loss": 0.0306, + "num_input_tokens_seen": 134344912, + "step": 62250 + }, + { + "epoch": 10.1557911908646, + "grad_norm": 0.03941585496068001, + "learning_rate": 0.000573418923969384, + "loss": 0.0756, + "num_input_tokens_seen": 134355728, + "step": 62255 + }, + { + "epoch": 10.156606851549755, + "grad_norm": 0.27743005752563477, + "learning_rate": 0.0005733485149512143, + "loss": 0.0192, + "num_input_tokens_seen": 134366736, + "step": 62260 + }, + { + "epoch": 10.15742251223491, + "grad_norm": 0.013946570456027985, + "learning_rate": 0.000573278104446544, + "loss": 0.0152, + "num_input_tokens_seen": 134378608, + "step": 62265 + }, + { + "epoch": 10.158238172920065, + "grad_norm": 0.03612165525555611, + "learning_rate": 0.0005732076924567999, + "loss": 0.1033, + "num_input_tokens_seen": 134388624, + "step": 62270 + }, + { + "epoch": 10.15905383360522, + "grad_norm": 0.030385838821530342, + "learning_rate": 0.0005731372789834089, + "loss": 0.0258, + "num_input_tokens_seen": 134399376, + "step": 62275 + }, + { + "epoch": 10.159869494290374, + "grad_norm": 0.0023012408055365086, + "learning_rate": 0.0005730668640277983, + "loss": 0.0047, + "num_input_tokens_seen": 134410512, + "step": 62280 + }, + { + "epoch": 10.16068515497553, + "grad_norm": 0.010217229835689068, + "learning_rate": 0.0005729964475913949, + "loss": 0.0213, + "num_input_tokens_seen": 134419984, + "step": 62285 + }, + { + "epoch": 10.161500815660686, + "grad_norm": 0.036612626165151596, + "learning_rate": 0.0005729260296756259, + "loss": 0.1706, + "num_input_tokens_seen": 134432432, + "step": 62290 + }, + { + "epoch": 10.16231647634584, + "grad_norm": 0.00766077172011137, + "learning_rate": 0.0005728556102819185, + "loss": 0.0075, + "num_input_tokens_seen": 134442992, + "step": 62295 + }, + { + "epoch": 10.163132137030995, + "grad_norm": 0.17222407460212708, + "learning_rate": 0.0005727851894116997, + "loss": 0.0368, + "num_input_tokens_seen": 134453488, + "step": 62300 + }, + { + "epoch": 10.16394779771615, + "grad_norm": 0.07828135788440704, + "learning_rate": 0.0005727147670663967, + "loss": 0.0139, + "num_input_tokens_seen": 134463312, + "step": 62305 + }, + { + "epoch": 10.164763458401305, + "grad_norm": 0.0070399208925664425, + "learning_rate": 0.0005726443432474366, + "loss": 0.0639, + "num_input_tokens_seen": 134474064, + "step": 62310 + }, + { + "epoch": 10.16557911908646, + "grad_norm": 0.03359615057706833, + "learning_rate": 0.0005725739179562469, + "loss": 0.0132, + "num_input_tokens_seen": 134486512, + "step": 62315 + }, + { + "epoch": 10.166394779771615, + "grad_norm": 0.010693912394344807, + "learning_rate": 0.0005725034911942546, + "loss": 0.0188, + "num_input_tokens_seen": 134497776, + "step": 62320 + }, + { + "epoch": 10.16721044045677, + "grad_norm": 0.005402860231697559, + "learning_rate": 0.0005724330629628871, + "loss": 0.1141, + "num_input_tokens_seen": 134508528, + "step": 62325 + }, + { + "epoch": 10.168026101141924, + "grad_norm": 0.1135433092713356, + "learning_rate": 0.0005723626332635717, + "loss": 0.0248, + "num_input_tokens_seen": 134519920, + "step": 62330 + }, + { + "epoch": 10.16884176182708, + "grad_norm": 0.27660107612609863, + "learning_rate": 0.0005722922020977356, + "loss": 0.1071, + "num_input_tokens_seen": 134531152, + "step": 62335 + }, + { + "epoch": 10.169657422512234, + "grad_norm": 0.0226137712597847, + "learning_rate": 0.0005722217694668065, + "loss": 0.0129, + "num_input_tokens_seen": 134542288, + "step": 62340 + }, + { + "epoch": 10.17047308319739, + "grad_norm": 0.0013736312976107001, + "learning_rate": 0.0005721513353722116, + "loss": 0.1963, + "num_input_tokens_seen": 134552848, + "step": 62345 + }, + { + "epoch": 10.171288743882545, + "grad_norm": 0.008334333077073097, + "learning_rate": 0.0005720808998153782, + "loss": 0.0802, + "num_input_tokens_seen": 134563824, + "step": 62350 + }, + { + "epoch": 10.1721044045677, + "grad_norm": 0.01661566272377968, + "learning_rate": 0.000572010462797734, + "loss": 0.0105, + "num_input_tokens_seen": 134573744, + "step": 62355 + }, + { + "epoch": 10.172920065252855, + "grad_norm": 0.11666593700647354, + "learning_rate": 0.0005719400243207065, + "loss": 0.072, + "num_input_tokens_seen": 134584752, + "step": 62360 + }, + { + "epoch": 10.173735725938009, + "grad_norm": 0.009865287691354752, + "learning_rate": 0.0005718695843857231, + "loss": 0.1699, + "num_input_tokens_seen": 134597168, + "step": 62365 + }, + { + "epoch": 10.174551386623165, + "grad_norm": 0.11709529161453247, + "learning_rate": 0.0005717991429942114, + "loss": 0.0452, + "num_input_tokens_seen": 134607408, + "step": 62370 + }, + { + "epoch": 10.17536704730832, + "grad_norm": 0.039313752204179764, + "learning_rate": 0.000571728700147599, + "loss": 0.0057, + "num_input_tokens_seen": 134617456, + "step": 62375 + }, + { + "epoch": 10.176182707993474, + "grad_norm": 0.006439921446144581, + "learning_rate": 0.0005716582558473136, + "loss": 0.1077, + "num_input_tokens_seen": 134628112, + "step": 62380 + }, + { + "epoch": 10.17699836867863, + "grad_norm": 0.1854417771100998, + "learning_rate": 0.0005715878100947824, + "loss": 0.1619, + "num_input_tokens_seen": 134639888, + "step": 62385 + }, + { + "epoch": 10.177814029363784, + "grad_norm": 0.12076258659362793, + "learning_rate": 0.0005715173628914336, + "loss": 0.022, + "num_input_tokens_seen": 134650704, + "step": 62390 + }, + { + "epoch": 10.17862969004894, + "grad_norm": 0.018918918445706367, + "learning_rate": 0.0005714469142386948, + "loss": 0.0168, + "num_input_tokens_seen": 134660752, + "step": 62395 + }, + { + "epoch": 10.179445350734095, + "grad_norm": 0.013586286455392838, + "learning_rate": 0.0005713764641379936, + "loss": 0.0116, + "num_input_tokens_seen": 134670224, + "step": 62400 + }, + { + "epoch": 10.18026101141925, + "grad_norm": 0.004500287119299173, + "learning_rate": 0.0005713060125907578, + "loss": 0.0918, + "num_input_tokens_seen": 134681072, + "step": 62405 + }, + { + "epoch": 10.181076672104405, + "grad_norm": 0.5688797831535339, + "learning_rate": 0.0005712355595984151, + "loss": 0.1061, + "num_input_tokens_seen": 134693104, + "step": 62410 + }, + { + "epoch": 10.181892332789559, + "grad_norm": 0.00568495225161314, + "learning_rate": 0.0005711651051623935, + "loss": 0.0076, + "num_input_tokens_seen": 134705104, + "step": 62415 + }, + { + "epoch": 10.182707993474715, + "grad_norm": 0.0752301886677742, + "learning_rate": 0.0005710946492841208, + "loss": 0.072, + "num_input_tokens_seen": 134715952, + "step": 62420 + }, + { + "epoch": 10.18352365415987, + "grad_norm": 0.04762760177254677, + "learning_rate": 0.0005710241919650248, + "loss": 0.0241, + "num_input_tokens_seen": 134725744, + "step": 62425 + }, + { + "epoch": 10.184339314845024, + "grad_norm": 0.006124766077846289, + "learning_rate": 0.0005709537332065335, + "loss": 0.0055, + "num_input_tokens_seen": 134736944, + "step": 62430 + }, + { + "epoch": 10.18515497553018, + "grad_norm": 0.38362017273902893, + "learning_rate": 0.0005708832730100747, + "loss": 0.104, + "num_input_tokens_seen": 134748240, + "step": 62435 + }, + { + "epoch": 10.185970636215334, + "grad_norm": 0.037003353238105774, + "learning_rate": 0.0005708128113770765, + "loss": 0.0419, + "num_input_tokens_seen": 134759888, + "step": 62440 + }, + { + "epoch": 10.18678629690049, + "grad_norm": 0.3588872253894806, + "learning_rate": 0.0005707423483089669, + "loss": 0.0541, + "num_input_tokens_seen": 134769808, + "step": 62445 + }, + { + "epoch": 10.187601957585644, + "grad_norm": 0.20742616057395935, + "learning_rate": 0.0005706718838071738, + "loss": 0.0374, + "num_input_tokens_seen": 134779888, + "step": 62450 + }, + { + "epoch": 10.1884176182708, + "grad_norm": 0.0142738688737154, + "learning_rate": 0.0005706014178731253, + "loss": 0.0126, + "num_input_tokens_seen": 134791120, + "step": 62455 + }, + { + "epoch": 10.189233278955955, + "grad_norm": 0.04501636326313019, + "learning_rate": 0.0005705309505082496, + "loss": 0.0296, + "num_input_tokens_seen": 134802448, + "step": 62460 + }, + { + "epoch": 10.190048939641109, + "grad_norm": 0.034516364336013794, + "learning_rate": 0.0005704604817139747, + "loss": 0.1133, + "num_input_tokens_seen": 134814000, + "step": 62465 + }, + { + "epoch": 10.190864600326265, + "grad_norm": 0.00314263254404068, + "learning_rate": 0.0005703900114917286, + "loss": 0.019, + "num_input_tokens_seen": 134824080, + "step": 62470 + }, + { + "epoch": 10.191680261011419, + "grad_norm": 0.26889798045158386, + "learning_rate": 0.0005703195398429397, + "loss": 0.0761, + "num_input_tokens_seen": 134834480, + "step": 62475 + }, + { + "epoch": 10.192495921696574, + "grad_norm": 0.24001306295394897, + "learning_rate": 0.0005702490667690363, + "loss": 0.0519, + "num_input_tokens_seen": 134844496, + "step": 62480 + }, + { + "epoch": 10.19331158238173, + "grad_norm": 0.01197401899844408, + "learning_rate": 0.0005701785922714461, + "loss": 0.0105, + "num_input_tokens_seen": 134854448, + "step": 62485 + }, + { + "epoch": 10.194127243066884, + "grad_norm": 0.20064476132392883, + "learning_rate": 0.000570108116351598, + "loss": 0.11, + "num_input_tokens_seen": 134865360, + "step": 62490 + }, + { + "epoch": 10.19494290375204, + "grad_norm": 0.02952468767762184, + "learning_rate": 0.0005700376390109198, + "loss": 0.1505, + "num_input_tokens_seen": 134876112, + "step": 62495 + }, + { + "epoch": 10.195758564437194, + "grad_norm": 0.01146555133163929, + "learning_rate": 0.00056996716025084, + "loss": 0.0092, + "num_input_tokens_seen": 134887504, + "step": 62500 + }, + { + "epoch": 10.19657422512235, + "grad_norm": 0.21673166751861572, + "learning_rate": 0.000569896680072787, + "loss": 0.0846, + "num_input_tokens_seen": 134897936, + "step": 62505 + }, + { + "epoch": 10.197389885807505, + "grad_norm": 0.046677183359861374, + "learning_rate": 0.0005698261984781891, + "loss": 0.0077, + "num_input_tokens_seen": 134909520, + "step": 62510 + }, + { + "epoch": 10.198205546492659, + "grad_norm": 0.010100886225700378, + "learning_rate": 0.0005697557154684749, + "loss": 0.111, + "num_input_tokens_seen": 134919888, + "step": 62515 + }, + { + "epoch": 10.199021207177815, + "grad_norm": 0.0316040925681591, + "learning_rate": 0.0005696852310450723, + "loss": 0.012, + "num_input_tokens_seen": 134930896, + "step": 62520 + }, + { + "epoch": 10.199836867862969, + "grad_norm": 0.07303192466497421, + "learning_rate": 0.0005696147452094102, + "loss": 0.0348, + "num_input_tokens_seen": 134941456, + "step": 62525 + }, + { + "epoch": 10.200652528548124, + "grad_norm": 0.0028564471285790205, + "learning_rate": 0.000569544257962917, + "loss": 0.0104, + "num_input_tokens_seen": 134950448, + "step": 62530 + }, + { + "epoch": 10.201468189233278, + "grad_norm": 0.004689156077802181, + "learning_rate": 0.0005694737693070213, + "loss": 0.0103, + "num_input_tokens_seen": 134960816, + "step": 62535 + }, + { + "epoch": 10.202283849918434, + "grad_norm": 0.2989457845687866, + "learning_rate": 0.0005694032792431515, + "loss": 0.0364, + "num_input_tokens_seen": 134972240, + "step": 62540 + }, + { + "epoch": 10.20309951060359, + "grad_norm": 0.02045276388525963, + "learning_rate": 0.0005693327877727361, + "loss": 0.01, + "num_input_tokens_seen": 134982992, + "step": 62545 + }, + { + "epoch": 10.203915171288743, + "grad_norm": 0.03922825679183006, + "learning_rate": 0.0005692622948972039, + "loss": 0.009, + "num_input_tokens_seen": 134993072, + "step": 62550 + }, + { + "epoch": 10.2047308319739, + "grad_norm": 0.012804524973034859, + "learning_rate": 0.0005691918006179833, + "loss": 0.0421, + "num_input_tokens_seen": 135003824, + "step": 62555 + }, + { + "epoch": 10.205546492659053, + "grad_norm": 0.008104806765913963, + "learning_rate": 0.0005691213049365031, + "loss": 0.063, + "num_input_tokens_seen": 135013424, + "step": 62560 + }, + { + "epoch": 10.206362153344209, + "grad_norm": 0.0039408402517437935, + "learning_rate": 0.000569050807854192, + "loss": 0.0037, + "num_input_tokens_seen": 135024432, + "step": 62565 + }, + { + "epoch": 10.207177814029365, + "grad_norm": 0.027074113488197327, + "learning_rate": 0.0005689803093724788, + "loss": 0.1001, + "num_input_tokens_seen": 135035440, + "step": 62570 + }, + { + "epoch": 10.207993474714518, + "grad_norm": 0.00425309082493186, + "learning_rate": 0.0005689098094927921, + "loss": 0.0475, + "num_input_tokens_seen": 135047056, + "step": 62575 + }, + { + "epoch": 10.208809135399674, + "grad_norm": 0.000645303342025727, + "learning_rate": 0.0005688393082165605, + "loss": 0.1481, + "num_input_tokens_seen": 135058000, + "step": 62580 + }, + { + "epoch": 10.209624796084828, + "grad_norm": 0.03928713500499725, + "learning_rate": 0.0005687688055452132, + "loss": 0.0122, + "num_input_tokens_seen": 135068944, + "step": 62585 + }, + { + "epoch": 10.210440456769984, + "grad_norm": 0.028946872800588608, + "learning_rate": 0.0005686983014801787, + "loss": 0.0312, + "num_input_tokens_seen": 135078064, + "step": 62590 + }, + { + "epoch": 10.21125611745514, + "grad_norm": 0.01564590260386467, + "learning_rate": 0.000568627796022886, + "loss": 0.0133, + "num_input_tokens_seen": 135089168, + "step": 62595 + }, + { + "epoch": 10.212071778140293, + "grad_norm": 0.002126560779288411, + "learning_rate": 0.0005685572891747639, + "loss": 0.0131, + "num_input_tokens_seen": 135100112, + "step": 62600 + }, + { + "epoch": 10.21288743882545, + "grad_norm": 0.05529544875025749, + "learning_rate": 0.0005684867809372415, + "loss": 0.0294, + "num_input_tokens_seen": 135111088, + "step": 62605 + }, + { + "epoch": 10.213703099510603, + "grad_norm": 0.040940333157777786, + "learning_rate": 0.0005684162713117473, + "loss": 0.0528, + "num_input_tokens_seen": 135121040, + "step": 62610 + }, + { + "epoch": 10.214518760195759, + "grad_norm": 0.011304167099297047, + "learning_rate": 0.0005683457602997108, + "loss": 0.0738, + "num_input_tokens_seen": 135132528, + "step": 62615 + }, + { + "epoch": 10.215334420880913, + "grad_norm": 0.00530855031684041, + "learning_rate": 0.0005682752479025608, + "loss": 0.0096, + "num_input_tokens_seen": 135142960, + "step": 62620 + }, + { + "epoch": 10.216150081566068, + "grad_norm": 0.1356511116027832, + "learning_rate": 0.0005682047341217262, + "loss": 0.0337, + "num_input_tokens_seen": 135153328, + "step": 62625 + }, + { + "epoch": 10.216965742251224, + "grad_norm": 0.09058766812086105, + "learning_rate": 0.0005681342189586362, + "loss": 0.0171, + "num_input_tokens_seen": 135163504, + "step": 62630 + }, + { + "epoch": 10.217781402936378, + "grad_norm": 0.0728636160492897, + "learning_rate": 0.0005680637024147199, + "loss": 0.1012, + "num_input_tokens_seen": 135174672, + "step": 62635 + }, + { + "epoch": 10.218597063621534, + "grad_norm": 0.0041594551876187325, + "learning_rate": 0.0005679931844914061, + "loss": 0.0244, + "num_input_tokens_seen": 135186512, + "step": 62640 + }, + { + "epoch": 10.219412724306688, + "grad_norm": 0.14362168312072754, + "learning_rate": 0.0005679226651901243, + "loss": 0.0965, + "num_input_tokens_seen": 135195952, + "step": 62645 + }, + { + "epoch": 10.220228384991843, + "grad_norm": 0.02301289513707161, + "learning_rate": 0.0005678521445123036, + "loss": 0.0587, + "num_input_tokens_seen": 135206928, + "step": 62650 + }, + { + "epoch": 10.221044045676999, + "grad_norm": 0.003895870642736554, + "learning_rate": 0.0005677816224593731, + "loss": 0.1091, + "num_input_tokens_seen": 135217936, + "step": 62655 + }, + { + "epoch": 10.221859706362153, + "grad_norm": 0.22609420120716095, + "learning_rate": 0.0005677110990327618, + "loss": 0.0222, + "num_input_tokens_seen": 135228656, + "step": 62660 + }, + { + "epoch": 10.222675367047309, + "grad_norm": 0.07050885260105133, + "learning_rate": 0.0005676405742338995, + "loss": 0.0154, + "num_input_tokens_seen": 135239024, + "step": 62665 + }, + { + "epoch": 10.223491027732463, + "grad_norm": 0.015503239817917347, + "learning_rate": 0.0005675700480642149, + "loss": 0.0248, + "num_input_tokens_seen": 135250864, + "step": 62670 + }, + { + "epoch": 10.224306688417618, + "grad_norm": 0.00602386612445116, + "learning_rate": 0.0005674995205251376, + "loss": 0.0045, + "num_input_tokens_seen": 135261840, + "step": 62675 + }, + { + "epoch": 10.225122349102774, + "grad_norm": 0.011385610327124596, + "learning_rate": 0.000567428991618097, + "loss": 0.0083, + "num_input_tokens_seen": 135272912, + "step": 62680 + }, + { + "epoch": 10.225938009787928, + "grad_norm": 0.028345797210931778, + "learning_rate": 0.0005673584613445223, + "loss": 0.0065, + "num_input_tokens_seen": 135283120, + "step": 62685 + }, + { + "epoch": 10.226753670473084, + "grad_norm": 0.26997220516204834, + "learning_rate": 0.000567287929705843, + "loss": 0.1093, + "num_input_tokens_seen": 135294032, + "step": 62690 + }, + { + "epoch": 10.227569331158238, + "grad_norm": 0.027153953909873962, + "learning_rate": 0.0005672173967034883, + "loss": 0.0087, + "num_input_tokens_seen": 135304528, + "step": 62695 + }, + { + "epoch": 10.228384991843393, + "grad_norm": 0.013517527841031551, + "learning_rate": 0.0005671468623388878, + "loss": 0.0051, + "num_input_tokens_seen": 135315408, + "step": 62700 + }, + { + "epoch": 10.229200652528547, + "grad_norm": 0.33822906017303467, + "learning_rate": 0.000567076326613471, + "loss": 0.0942, + "num_input_tokens_seen": 135326992, + "step": 62705 + }, + { + "epoch": 10.230016313213703, + "grad_norm": 0.10920916497707367, + "learning_rate": 0.0005670057895286674, + "loss": 0.0319, + "num_input_tokens_seen": 135337296, + "step": 62710 + }, + { + "epoch": 10.230831973898859, + "grad_norm": 0.04552744701504707, + "learning_rate": 0.0005669352510859063, + "loss": 0.0285, + "num_input_tokens_seen": 135348400, + "step": 62715 + }, + { + "epoch": 10.231647634584013, + "grad_norm": 0.24385327100753784, + "learning_rate": 0.0005668647112866175, + "loss": 0.0267, + "num_input_tokens_seen": 135359248, + "step": 62720 + }, + { + "epoch": 10.232463295269168, + "grad_norm": 0.2467089593410492, + "learning_rate": 0.0005667941701322305, + "loss": 0.0207, + "num_input_tokens_seen": 135370352, + "step": 62725 + }, + { + "epoch": 10.233278955954322, + "grad_norm": 0.35685521364212036, + "learning_rate": 0.000566723627624175, + "loss": 0.0342, + "num_input_tokens_seen": 135381392, + "step": 62730 + }, + { + "epoch": 10.234094616639478, + "grad_norm": 0.00995756033807993, + "learning_rate": 0.0005666530837638805, + "loss": 0.0328, + "num_input_tokens_seen": 135392016, + "step": 62735 + }, + { + "epoch": 10.234910277324634, + "grad_norm": 0.08818637579679489, + "learning_rate": 0.0005665825385527766, + "loss": 0.0312, + "num_input_tokens_seen": 135404688, + "step": 62740 + }, + { + "epoch": 10.235725938009788, + "grad_norm": 0.033171508461236954, + "learning_rate": 0.0005665119919922932, + "loss": 0.1323, + "num_input_tokens_seen": 135416240, + "step": 62745 + }, + { + "epoch": 10.236541598694943, + "grad_norm": 0.04459144175052643, + "learning_rate": 0.0005664414440838598, + "loss": 0.0082, + "num_input_tokens_seen": 135427824, + "step": 62750 + }, + { + "epoch": 10.237357259380097, + "grad_norm": 0.0854158103466034, + "learning_rate": 0.0005663708948289065, + "loss": 0.0162, + "num_input_tokens_seen": 135438960, + "step": 62755 + }, + { + "epoch": 10.238172920065253, + "grad_norm": 0.010781729593873024, + "learning_rate": 0.0005663003442288626, + "loss": 0.0739, + "num_input_tokens_seen": 135449360, + "step": 62760 + }, + { + "epoch": 10.238988580750409, + "grad_norm": 0.04206779971718788, + "learning_rate": 0.0005662297922851583, + "loss": 0.0964, + "num_input_tokens_seen": 135461136, + "step": 62765 + }, + { + "epoch": 10.239804241435563, + "grad_norm": 0.01621088571846485, + "learning_rate": 0.0005661592389992231, + "loss": 0.0688, + "num_input_tokens_seen": 135472752, + "step": 62770 + }, + { + "epoch": 10.240619902120718, + "grad_norm": 0.05034844949841499, + "learning_rate": 0.0005660886843724869, + "loss": 0.0167, + "num_input_tokens_seen": 135482704, + "step": 62775 + }, + { + "epoch": 10.241435562805872, + "grad_norm": 0.0698406919836998, + "learning_rate": 0.0005660181284063798, + "loss": 0.0112, + "num_input_tokens_seen": 135493904, + "step": 62780 + }, + { + "epoch": 10.242251223491028, + "grad_norm": 0.0021123981568962336, + "learning_rate": 0.0005659475711023317, + "loss": 0.0048, + "num_input_tokens_seen": 135504496, + "step": 62785 + }, + { + "epoch": 10.243066884176184, + "grad_norm": 0.0033412009943276644, + "learning_rate": 0.0005658770124617722, + "loss": 0.0252, + "num_input_tokens_seen": 135516144, + "step": 62790 + }, + { + "epoch": 10.243882544861338, + "grad_norm": 0.006256371736526489, + "learning_rate": 0.0005658064524861315, + "loss": 0.0566, + "num_input_tokens_seen": 135526800, + "step": 62795 + }, + { + "epoch": 10.244698205546493, + "grad_norm": 0.003300925949588418, + "learning_rate": 0.0005657358911768395, + "loss": 0.0055, + "num_input_tokens_seen": 135538320, + "step": 62800 + }, + { + "epoch": 10.245513866231647, + "grad_norm": 0.020401332527399063, + "learning_rate": 0.0005656653285353265, + "loss": 0.0442, + "num_input_tokens_seen": 135549552, + "step": 62805 + }, + { + "epoch": 10.246329526916803, + "grad_norm": 0.002573966281488538, + "learning_rate": 0.0005655947645630222, + "loss": 0.0272, + "num_input_tokens_seen": 135559504, + "step": 62810 + }, + { + "epoch": 10.247145187601957, + "grad_norm": 0.05568597838282585, + "learning_rate": 0.0005655241992613566, + "loss": 0.0073, + "num_input_tokens_seen": 135570096, + "step": 62815 + }, + { + "epoch": 10.247960848287113, + "grad_norm": 0.32294729351997375, + "learning_rate": 0.0005654536326317602, + "loss": 0.1154, + "num_input_tokens_seen": 135581360, + "step": 62820 + }, + { + "epoch": 10.248776508972268, + "grad_norm": 0.0008304172079078853, + "learning_rate": 0.0005653830646756629, + "loss": 0.0201, + "num_input_tokens_seen": 135592656, + "step": 62825 + }, + { + "epoch": 10.249592169657422, + "grad_norm": 0.06679890304803848, + "learning_rate": 0.0005653124953944947, + "loss": 0.0833, + "num_input_tokens_seen": 135603312, + "step": 62830 + }, + { + "epoch": 10.250407830342578, + "grad_norm": 0.008305130526423454, + "learning_rate": 0.0005652419247896861, + "loss": 0.0037, + "num_input_tokens_seen": 135614416, + "step": 62835 + }, + { + "epoch": 10.251223491027732, + "grad_norm": 0.28826412558555603, + "learning_rate": 0.000565171352862667, + "loss": 0.0591, + "num_input_tokens_seen": 135626384, + "step": 62840 + }, + { + "epoch": 10.252039151712887, + "grad_norm": 0.6463675498962402, + "learning_rate": 0.0005651007796148678, + "loss": 0.0588, + "num_input_tokens_seen": 135636848, + "step": 62845 + }, + { + "epoch": 10.252854812398043, + "grad_norm": 0.0025871365796774626, + "learning_rate": 0.0005650302050477187, + "loss": 0.002, + "num_input_tokens_seen": 135646320, + "step": 62850 + }, + { + "epoch": 10.253670473083197, + "grad_norm": 0.06975041329860687, + "learning_rate": 0.0005649596291626501, + "loss": 0.1158, + "num_input_tokens_seen": 135659056, + "step": 62855 + }, + { + "epoch": 10.254486133768353, + "grad_norm": 0.013552771881222725, + "learning_rate": 0.0005648890519610921, + "loss": 0.1229, + "num_input_tokens_seen": 135670416, + "step": 62860 + }, + { + "epoch": 10.255301794453507, + "grad_norm": 0.0022445095237344503, + "learning_rate": 0.0005648184734444753, + "loss": 0.0046, + "num_input_tokens_seen": 135680592, + "step": 62865 + }, + { + "epoch": 10.256117455138662, + "grad_norm": 0.6660820245742798, + "learning_rate": 0.0005647478936142296, + "loss": 0.0474, + "num_input_tokens_seen": 135691408, + "step": 62870 + }, + { + "epoch": 10.256933115823816, + "grad_norm": 0.0032186959870159626, + "learning_rate": 0.0005646773124717858, + "loss": 0.0647, + "num_input_tokens_seen": 135703600, + "step": 62875 + }, + { + "epoch": 10.257748776508972, + "grad_norm": 0.12357281148433685, + "learning_rate": 0.0005646067300185744, + "loss": 0.0613, + "num_input_tokens_seen": 135713168, + "step": 62880 + }, + { + "epoch": 10.258564437194128, + "grad_norm": 0.0059761242009699345, + "learning_rate": 0.0005645361462560256, + "loss": 0.0659, + "num_input_tokens_seen": 135724688, + "step": 62885 + }, + { + "epoch": 10.259380097879282, + "grad_norm": 0.011934679001569748, + "learning_rate": 0.0005644655611855698, + "loss": 0.057, + "num_input_tokens_seen": 135734736, + "step": 62890 + }, + { + "epoch": 10.260195758564437, + "grad_norm": 0.010906904004514217, + "learning_rate": 0.0005643949748086377, + "loss": 0.0059, + "num_input_tokens_seen": 135745328, + "step": 62895 + }, + { + "epoch": 10.261011419249591, + "grad_norm": 0.012250243686139584, + "learning_rate": 0.0005643243871266598, + "loss": 0.0389, + "num_input_tokens_seen": 135756560, + "step": 62900 + }, + { + "epoch": 10.261827079934747, + "grad_norm": 0.04538460075855255, + "learning_rate": 0.0005642537981410665, + "loss": 0.0493, + "num_input_tokens_seen": 135767216, + "step": 62905 + }, + { + "epoch": 10.262642740619903, + "grad_norm": 0.000650823290925473, + "learning_rate": 0.0005641832078532886, + "loss": 0.0628, + "num_input_tokens_seen": 135777744, + "step": 62910 + }, + { + "epoch": 10.263458401305057, + "grad_norm": 0.09684202075004578, + "learning_rate": 0.0005641126162647564, + "loss": 0.0089, + "num_input_tokens_seen": 135788592, + "step": 62915 + }, + { + "epoch": 10.264274061990212, + "grad_norm": 0.006821627728641033, + "learning_rate": 0.0005640420233769008, + "loss": 0.0223, + "num_input_tokens_seen": 135800272, + "step": 62920 + }, + { + "epoch": 10.265089722675366, + "grad_norm": 0.013574290089309216, + "learning_rate": 0.0005639714291911524, + "loss": 0.0258, + "num_input_tokens_seen": 135811472, + "step": 62925 + }, + { + "epoch": 10.265905383360522, + "grad_norm": 0.04171886667609215, + "learning_rate": 0.0005639008337089416, + "loss": 0.0992, + "num_input_tokens_seen": 135821424, + "step": 62930 + }, + { + "epoch": 10.266721044045678, + "grad_norm": 0.032780274748802185, + "learning_rate": 0.0005638302369316995, + "loss": 0.0658, + "num_input_tokens_seen": 135830768, + "step": 62935 + }, + { + "epoch": 10.267536704730832, + "grad_norm": 0.27403193712234497, + "learning_rate": 0.0005637596388608567, + "loss": 0.074, + "num_input_tokens_seen": 135841008, + "step": 62940 + }, + { + "epoch": 10.268352365415987, + "grad_norm": 0.014591069892048836, + "learning_rate": 0.0005636890394978439, + "loss": 0.0132, + "num_input_tokens_seen": 135851024, + "step": 62945 + }, + { + "epoch": 10.269168026101141, + "grad_norm": 0.019346341490745544, + "learning_rate": 0.0005636184388440919, + "loss": 0.0057, + "num_input_tokens_seen": 135862096, + "step": 62950 + }, + { + "epoch": 10.269983686786297, + "grad_norm": 0.016496028751134872, + "learning_rate": 0.0005635478369010316, + "loss": 0.0786, + "num_input_tokens_seen": 135873360, + "step": 62955 + }, + { + "epoch": 10.270799347471453, + "grad_norm": 0.018708713352680206, + "learning_rate": 0.0005634772336700937, + "loss": 0.0301, + "num_input_tokens_seen": 135884112, + "step": 62960 + }, + { + "epoch": 10.271615008156607, + "grad_norm": 0.001943049719557166, + "learning_rate": 0.0005634066291527092, + "loss": 0.0279, + "num_input_tokens_seen": 135895152, + "step": 62965 + }, + { + "epoch": 10.272430668841762, + "grad_norm": 0.02392006479203701, + "learning_rate": 0.000563336023350309, + "loss": 0.0105, + "num_input_tokens_seen": 135906832, + "step": 62970 + }, + { + "epoch": 10.273246329526916, + "grad_norm": 0.04209519922733307, + "learning_rate": 0.0005632654162643239, + "loss": 0.018, + "num_input_tokens_seen": 135917488, + "step": 62975 + }, + { + "epoch": 10.274061990212072, + "grad_norm": 0.010596969164907932, + "learning_rate": 0.0005631948078961847, + "loss": 0.0197, + "num_input_tokens_seen": 135928080, + "step": 62980 + }, + { + "epoch": 10.274877650897226, + "grad_norm": 0.0158772561699152, + "learning_rate": 0.0005631241982473227, + "loss": 0.0315, + "num_input_tokens_seen": 135937936, + "step": 62985 + }, + { + "epoch": 10.275693311582382, + "grad_norm": 0.2704010605812073, + "learning_rate": 0.0005630535873191687, + "loss": 0.094, + "num_input_tokens_seen": 135948272, + "step": 62990 + }, + { + "epoch": 10.276508972267537, + "grad_norm": 0.005989837925881147, + "learning_rate": 0.0005629829751131538, + "loss": 0.0147, + "num_input_tokens_seen": 135958384, + "step": 62995 + }, + { + "epoch": 10.277324632952691, + "grad_norm": 0.151280015707016, + "learning_rate": 0.0005629123616307089, + "loss": 0.023, + "num_input_tokens_seen": 135968464, + "step": 63000 + }, + { + "epoch": 10.278140293637847, + "grad_norm": 0.0871962234377861, + "learning_rate": 0.0005628417468732653, + "loss": 0.0173, + "num_input_tokens_seen": 135979216, + "step": 63005 + }, + { + "epoch": 10.278955954323001, + "grad_norm": 0.017043069005012512, + "learning_rate": 0.0005627711308422539, + "loss": 0.0091, + "num_input_tokens_seen": 135989584, + "step": 63010 + }, + { + "epoch": 10.279771615008157, + "grad_norm": 0.06069403141736984, + "learning_rate": 0.000562700513539106, + "loss": 0.1332, + "num_input_tokens_seen": 136002384, + "step": 63015 + }, + { + "epoch": 10.280587275693312, + "grad_norm": 0.01025825459510088, + "learning_rate": 0.0005626298949652524, + "loss": 0.0057, + "num_input_tokens_seen": 136012912, + "step": 63020 + }, + { + "epoch": 10.281402936378466, + "grad_norm": 0.023113388568162918, + "learning_rate": 0.0005625592751221248, + "loss": 0.0053, + "num_input_tokens_seen": 136023120, + "step": 63025 + }, + { + "epoch": 10.282218597063622, + "grad_norm": 0.004566239658743143, + "learning_rate": 0.000562488654011154, + "loss": 0.0028, + "num_input_tokens_seen": 136034384, + "step": 63030 + }, + { + "epoch": 10.283034257748776, + "grad_norm": 0.003322204342111945, + "learning_rate": 0.0005624180316337715, + "loss": 0.0068, + "num_input_tokens_seen": 136045136, + "step": 63035 + }, + { + "epoch": 10.283849918433932, + "grad_norm": 0.013915945775806904, + "learning_rate": 0.0005623474079914082, + "loss": 0.0884, + "num_input_tokens_seen": 136057744, + "step": 63040 + }, + { + "epoch": 10.284665579119087, + "grad_norm": 0.053325071930885315, + "learning_rate": 0.0005622767830854957, + "loss": 0.0138, + "num_input_tokens_seen": 136068624, + "step": 63045 + }, + { + "epoch": 10.285481239804241, + "grad_norm": 0.047515708953142166, + "learning_rate": 0.0005622061569174651, + "loss": 0.1424, + "num_input_tokens_seen": 136080048, + "step": 63050 + }, + { + "epoch": 10.286296900489397, + "grad_norm": 0.5769004821777344, + "learning_rate": 0.0005621355294887479, + "loss": 0.1011, + "num_input_tokens_seen": 136090928, + "step": 63055 + }, + { + "epoch": 10.28711256117455, + "grad_norm": 0.046155206859111786, + "learning_rate": 0.0005620649008007755, + "loss": 0.0085, + "num_input_tokens_seen": 136101168, + "step": 63060 + }, + { + "epoch": 10.287928221859707, + "grad_norm": 0.3597506880760193, + "learning_rate": 0.0005619942708549789, + "loss": 0.0815, + "num_input_tokens_seen": 136111824, + "step": 63065 + }, + { + "epoch": 10.28874388254486, + "grad_norm": 0.4630865454673767, + "learning_rate": 0.0005619236396527899, + "loss": 0.099, + "num_input_tokens_seen": 136122000, + "step": 63070 + }, + { + "epoch": 10.289559543230016, + "grad_norm": 0.004796282388269901, + "learning_rate": 0.0005618530071956397, + "loss": 0.0439, + "num_input_tokens_seen": 136132016, + "step": 63075 + }, + { + "epoch": 10.290375203915172, + "grad_norm": 0.06138777732849121, + "learning_rate": 0.00056178237348496, + "loss": 0.0094, + "num_input_tokens_seen": 136142832, + "step": 63080 + }, + { + "epoch": 10.291190864600326, + "grad_norm": 0.036521364003419876, + "learning_rate": 0.0005617117385221819, + "loss": 0.0282, + "num_input_tokens_seen": 136152752, + "step": 63085 + }, + { + "epoch": 10.292006525285482, + "grad_norm": 0.026105321943759918, + "learning_rate": 0.0005616411023087373, + "loss": 0.0111, + "num_input_tokens_seen": 136163248, + "step": 63090 + }, + { + "epoch": 10.292822185970635, + "grad_norm": 0.022812357172369957, + "learning_rate": 0.0005615704648460575, + "loss": 0.119, + "num_input_tokens_seen": 136173744, + "step": 63095 + }, + { + "epoch": 10.293637846655791, + "grad_norm": 0.021446868777275085, + "learning_rate": 0.0005614998261355741, + "loss": 0.1743, + "num_input_tokens_seen": 136183440, + "step": 63100 + }, + { + "epoch": 10.294453507340947, + "grad_norm": 0.007239363621920347, + "learning_rate": 0.0005614291861787188, + "loss": 0.0081, + "num_input_tokens_seen": 136195056, + "step": 63105 + }, + { + "epoch": 10.2952691680261, + "grad_norm": 0.13986659049987793, + "learning_rate": 0.0005613585449769232, + "loss": 0.0115, + "num_input_tokens_seen": 136204880, + "step": 63110 + }, + { + "epoch": 10.296084828711257, + "grad_norm": 0.010859480127692223, + "learning_rate": 0.0005612879025316186, + "loss": 0.0118, + "num_input_tokens_seen": 136215568, + "step": 63115 + }, + { + "epoch": 10.29690048939641, + "grad_norm": 0.04848559945821762, + "learning_rate": 0.000561217258844237, + "loss": 0.0042, + "num_input_tokens_seen": 136226896, + "step": 63120 + }, + { + "epoch": 10.297716150081566, + "grad_norm": 0.006989500019699335, + "learning_rate": 0.0005611466139162101, + "loss": 0.0054, + "num_input_tokens_seen": 136239024, + "step": 63125 + }, + { + "epoch": 10.298531810766722, + "grad_norm": 0.043698739260435104, + "learning_rate": 0.0005610759677489694, + "loss": 0.0128, + "num_input_tokens_seen": 136249520, + "step": 63130 + }, + { + "epoch": 10.299347471451876, + "grad_norm": 0.024610096588730812, + "learning_rate": 0.0005610053203439467, + "loss": 0.0211, + "num_input_tokens_seen": 136260688, + "step": 63135 + }, + { + "epoch": 10.300163132137031, + "grad_norm": 0.12361978739500046, + "learning_rate": 0.0005609346717025737, + "loss": 0.0078, + "num_input_tokens_seen": 136270608, + "step": 63140 + }, + { + "epoch": 10.300978792822185, + "grad_norm": 0.012895677238702774, + "learning_rate": 0.0005608640218262825, + "loss": 0.0168, + "num_input_tokens_seen": 136280656, + "step": 63145 + }, + { + "epoch": 10.301794453507341, + "grad_norm": 0.06461931765079498, + "learning_rate": 0.0005607933707165046, + "loss": 0.0228, + "num_input_tokens_seen": 136289904, + "step": 63150 + }, + { + "epoch": 10.302610114192497, + "grad_norm": 0.009492479264736176, + "learning_rate": 0.000560722718374672, + "loss": 0.0101, + "num_input_tokens_seen": 136300752, + "step": 63155 + }, + { + "epoch": 10.30342577487765, + "grad_norm": 0.0037337199319154024, + "learning_rate": 0.0005606520648022164, + "loss": 0.0254, + "num_input_tokens_seen": 136311600, + "step": 63160 + }, + { + "epoch": 10.304241435562806, + "grad_norm": 0.007616623304784298, + "learning_rate": 0.0005605814100005696, + "loss": 0.0071, + "num_input_tokens_seen": 136323280, + "step": 63165 + }, + { + "epoch": 10.30505709624796, + "grad_norm": 0.007590592373162508, + "learning_rate": 0.0005605107539711639, + "loss": 0.0031, + "num_input_tokens_seen": 136335056, + "step": 63170 + }, + { + "epoch": 10.305872756933116, + "grad_norm": 0.037226613610982895, + "learning_rate": 0.000560440096715431, + "loss": 0.0058, + "num_input_tokens_seen": 136346256, + "step": 63175 + }, + { + "epoch": 10.30668841761827, + "grad_norm": 0.004320470150560141, + "learning_rate": 0.0005603694382348027, + "loss": 0.1272, + "num_input_tokens_seen": 136357424, + "step": 63180 + }, + { + "epoch": 10.307504078303426, + "grad_norm": 0.0010183845879510045, + "learning_rate": 0.0005602987785307112, + "loss": 0.0098, + "num_input_tokens_seen": 136367664, + "step": 63185 + }, + { + "epoch": 10.308319738988581, + "grad_norm": 0.48818346858024597, + "learning_rate": 0.0005602281176045885, + "loss": 0.1192, + "num_input_tokens_seen": 136378352, + "step": 63190 + }, + { + "epoch": 10.309135399673735, + "grad_norm": 0.08746645599603653, + "learning_rate": 0.0005601574554578666, + "loss": 0.1669, + "num_input_tokens_seen": 136389200, + "step": 63195 + }, + { + "epoch": 10.309951060358891, + "grad_norm": 0.0017758028116077185, + "learning_rate": 0.0005600867920919775, + "loss": 0.0429, + "num_input_tokens_seen": 136399504, + "step": 63200 + }, + { + "epoch": 10.310766721044045, + "grad_norm": 0.41978445649147034, + "learning_rate": 0.0005600161275083535, + "loss": 0.0352, + "num_input_tokens_seen": 136411536, + "step": 63205 + }, + { + "epoch": 10.3115823817292, + "grad_norm": 0.024018622934818268, + "learning_rate": 0.0005599454617084264, + "loss": 0.04, + "num_input_tokens_seen": 136422224, + "step": 63210 + }, + { + "epoch": 10.312398042414356, + "grad_norm": 0.0054513453505933285, + "learning_rate": 0.0005598747946936285, + "loss": 0.026, + "num_input_tokens_seen": 136434000, + "step": 63215 + }, + { + "epoch": 10.31321370309951, + "grad_norm": 0.003131336299702525, + "learning_rate": 0.0005598041264653919, + "loss": 0.0076, + "num_input_tokens_seen": 136445264, + "step": 63220 + }, + { + "epoch": 10.314029363784666, + "grad_norm": 0.0009469883516430855, + "learning_rate": 0.0005597334570251489, + "loss": 0.0427, + "num_input_tokens_seen": 136455824, + "step": 63225 + }, + { + "epoch": 10.31484502446982, + "grad_norm": 0.059438154101371765, + "learning_rate": 0.0005596627863743316, + "loss": 0.0245, + "num_input_tokens_seen": 136466384, + "step": 63230 + }, + { + "epoch": 10.315660685154976, + "grad_norm": 0.044389501214027405, + "learning_rate": 0.0005595921145143722, + "loss": 0.011, + "num_input_tokens_seen": 136477104, + "step": 63235 + }, + { + "epoch": 10.31647634584013, + "grad_norm": 0.025000670924782753, + "learning_rate": 0.0005595214414467029, + "loss": 0.0684, + "num_input_tokens_seen": 136486768, + "step": 63240 + }, + { + "epoch": 10.317292006525285, + "grad_norm": 0.0011151476064696908, + "learning_rate": 0.0005594507671727563, + "loss": 0.1115, + "num_input_tokens_seen": 136496816, + "step": 63245 + }, + { + "epoch": 10.318107667210441, + "grad_norm": 0.19021913409233093, + "learning_rate": 0.0005593800916939642, + "loss": 0.0203, + "num_input_tokens_seen": 136505904, + "step": 63250 + }, + { + "epoch": 10.318923327895595, + "grad_norm": 0.00923263467848301, + "learning_rate": 0.0005593094150117595, + "loss": 0.0088, + "num_input_tokens_seen": 136515696, + "step": 63255 + }, + { + "epoch": 10.31973898858075, + "grad_norm": 0.22367063164710999, + "learning_rate": 0.0005592387371275741, + "loss": 0.0143, + "num_input_tokens_seen": 136526256, + "step": 63260 + }, + { + "epoch": 10.320554649265905, + "grad_norm": 0.0028420949820429087, + "learning_rate": 0.0005591680580428406, + "loss": 0.034, + "num_input_tokens_seen": 136538352, + "step": 63265 + }, + { + "epoch": 10.32137030995106, + "grad_norm": 0.018892286345362663, + "learning_rate": 0.0005590973777589912, + "loss": 0.0063, + "num_input_tokens_seen": 136549616, + "step": 63270 + }, + { + "epoch": 10.322185970636216, + "grad_norm": 0.00259226281195879, + "learning_rate": 0.0005590266962774588, + "loss": 0.0585, + "num_input_tokens_seen": 136560272, + "step": 63275 + }, + { + "epoch": 10.32300163132137, + "grad_norm": 0.005845293402671814, + "learning_rate": 0.0005589560135996752, + "loss": 0.0971, + "num_input_tokens_seen": 136571248, + "step": 63280 + }, + { + "epoch": 10.323817292006526, + "grad_norm": 0.42380350828170776, + "learning_rate": 0.0005588853297270734, + "loss": 0.2365, + "num_input_tokens_seen": 136580624, + "step": 63285 + }, + { + "epoch": 10.32463295269168, + "grad_norm": 0.0016279505798593163, + "learning_rate": 0.0005588146446610855, + "loss": 0.1344, + "num_input_tokens_seen": 136591696, + "step": 63290 + }, + { + "epoch": 10.325448613376835, + "grad_norm": 0.013709763064980507, + "learning_rate": 0.0005587439584031444, + "loss": 0.01, + "num_input_tokens_seen": 136602800, + "step": 63295 + }, + { + "epoch": 10.326264274061991, + "grad_norm": 0.1373027116060257, + "learning_rate": 0.0005586732709546824, + "loss": 0.2172, + "num_input_tokens_seen": 136613808, + "step": 63300 + }, + { + "epoch": 10.327079934747145, + "grad_norm": 0.47342002391815186, + "learning_rate": 0.0005586025823171321, + "loss": 0.0689, + "num_input_tokens_seen": 136623600, + "step": 63305 + }, + { + "epoch": 10.3278955954323, + "grad_norm": 0.0009562668856233358, + "learning_rate": 0.0005585318924919262, + "loss": 0.1204, + "num_input_tokens_seen": 136635088, + "step": 63310 + }, + { + "epoch": 10.328711256117455, + "grad_norm": 0.010204900056123734, + "learning_rate": 0.0005584612014804972, + "loss": 0.0151, + "num_input_tokens_seen": 136644656, + "step": 63315 + }, + { + "epoch": 10.32952691680261, + "grad_norm": 0.027546579018235207, + "learning_rate": 0.0005583905092842777, + "loss": 0.0358, + "num_input_tokens_seen": 136656336, + "step": 63320 + }, + { + "epoch": 10.330342577487766, + "grad_norm": 0.02416393905878067, + "learning_rate": 0.0005583198159047005, + "loss": 0.0085, + "num_input_tokens_seen": 136667536, + "step": 63325 + }, + { + "epoch": 10.33115823817292, + "grad_norm": 0.002468561287969351, + "learning_rate": 0.0005582491213431983, + "loss": 0.0393, + "num_input_tokens_seen": 136678384, + "step": 63330 + }, + { + "epoch": 10.331973898858076, + "grad_norm": 0.0362926721572876, + "learning_rate": 0.0005581784256012037, + "loss": 0.0418, + "num_input_tokens_seen": 136687920, + "step": 63335 + }, + { + "epoch": 10.33278955954323, + "grad_norm": 0.020159313455224037, + "learning_rate": 0.0005581077286801495, + "loss": 0.0648, + "num_input_tokens_seen": 136698224, + "step": 63340 + }, + { + "epoch": 10.333605220228385, + "grad_norm": 0.006536018569022417, + "learning_rate": 0.0005580370305814686, + "loss": 0.0195, + "num_input_tokens_seen": 136707984, + "step": 63345 + }, + { + "epoch": 10.33442088091354, + "grad_norm": 0.002190654631704092, + "learning_rate": 0.0005579663313065935, + "loss": 0.0399, + "num_input_tokens_seen": 136719984, + "step": 63350 + }, + { + "epoch": 10.335236541598695, + "grad_norm": 0.01185048371553421, + "learning_rate": 0.0005578956308569572, + "loss": 0.1635, + "num_input_tokens_seen": 136730928, + "step": 63355 + }, + { + "epoch": 10.33605220228385, + "grad_norm": 0.0053945318795740604, + "learning_rate": 0.0005578249292339924, + "loss": 0.0546, + "num_input_tokens_seen": 136743504, + "step": 63360 + }, + { + "epoch": 10.336867862969005, + "grad_norm": 0.006652312818914652, + "learning_rate": 0.0005577542264391322, + "loss": 0.0236, + "num_input_tokens_seen": 136754096, + "step": 63365 + }, + { + "epoch": 10.33768352365416, + "grad_norm": 0.012087957933545113, + "learning_rate": 0.0005576835224738092, + "loss": 0.0121, + "num_input_tokens_seen": 136764304, + "step": 63370 + }, + { + "epoch": 10.338499184339314, + "grad_norm": 0.046494390815496445, + "learning_rate": 0.0005576128173394567, + "loss": 0.0903, + "num_input_tokens_seen": 136776368, + "step": 63375 + }, + { + "epoch": 10.33931484502447, + "grad_norm": 0.4883650243282318, + "learning_rate": 0.0005575421110375072, + "loss": 0.1447, + "num_input_tokens_seen": 136786896, + "step": 63380 + }, + { + "epoch": 10.340130505709626, + "grad_norm": 0.007342170923948288, + "learning_rate": 0.0005574714035693938, + "loss": 0.0092, + "num_input_tokens_seen": 136797136, + "step": 63385 + }, + { + "epoch": 10.34094616639478, + "grad_norm": 0.04582366347312927, + "learning_rate": 0.0005574006949365496, + "loss": 0.0244, + "num_input_tokens_seen": 136806320, + "step": 63390 + }, + { + "epoch": 10.341761827079935, + "grad_norm": 0.048869796097278595, + "learning_rate": 0.0005573299851404074, + "loss": 0.014, + "num_input_tokens_seen": 136815536, + "step": 63395 + }, + { + "epoch": 10.34257748776509, + "grad_norm": 0.08590178936719894, + "learning_rate": 0.0005572592741824003, + "loss": 0.0167, + "num_input_tokens_seen": 136826000, + "step": 63400 + }, + { + "epoch": 10.343393148450245, + "grad_norm": 0.012921919114887714, + "learning_rate": 0.0005571885620639614, + "loss": 0.0568, + "num_input_tokens_seen": 136837360, + "step": 63405 + }, + { + "epoch": 10.3442088091354, + "grad_norm": 0.1376103013753891, + "learning_rate": 0.0005571178487865238, + "loss": 0.0408, + "num_input_tokens_seen": 136847152, + "step": 63410 + }, + { + "epoch": 10.345024469820554, + "grad_norm": 0.009369419887661934, + "learning_rate": 0.0005570471343515205, + "loss": 0.0116, + "num_input_tokens_seen": 136857040, + "step": 63415 + }, + { + "epoch": 10.34584013050571, + "grad_norm": 0.28062954545021057, + "learning_rate": 0.0005569764187603846, + "loss": 0.0617, + "num_input_tokens_seen": 136868272, + "step": 63420 + }, + { + "epoch": 10.346655791190864, + "grad_norm": 0.008118157275021076, + "learning_rate": 0.0005569057020145494, + "loss": 0.0128, + "num_input_tokens_seen": 136879920, + "step": 63425 + }, + { + "epoch": 10.34747145187602, + "grad_norm": 0.04553205147385597, + "learning_rate": 0.0005568349841154479, + "loss": 0.0082, + "num_input_tokens_seen": 136890576, + "step": 63430 + }, + { + "epoch": 10.348287112561174, + "grad_norm": 0.05068276450037956, + "learning_rate": 0.0005567642650645134, + "loss": 0.0288, + "num_input_tokens_seen": 136900656, + "step": 63435 + }, + { + "epoch": 10.34910277324633, + "grad_norm": 0.020725570619106293, + "learning_rate": 0.000556693544863179, + "loss": 0.0101, + "num_input_tokens_seen": 136910928, + "step": 63440 + }, + { + "epoch": 10.349918433931485, + "grad_norm": 0.004965957719832659, + "learning_rate": 0.000556622823512878, + "loss": 0.0021, + "num_input_tokens_seen": 136921648, + "step": 63445 + }, + { + "epoch": 10.350734094616639, + "grad_norm": 0.07689174264669418, + "learning_rate": 0.0005565521010150436, + "loss": 0.0121, + "num_input_tokens_seen": 136932592, + "step": 63450 + }, + { + "epoch": 10.351549755301795, + "grad_norm": 0.04052141308784485, + "learning_rate": 0.0005564813773711092, + "loss": 0.0313, + "num_input_tokens_seen": 136942576, + "step": 63455 + }, + { + "epoch": 10.352365415986949, + "grad_norm": 0.002285655355080962, + "learning_rate": 0.0005564106525825079, + "loss": 0.1168, + "num_input_tokens_seen": 136954032, + "step": 63460 + }, + { + "epoch": 10.353181076672104, + "grad_norm": 0.017103614285588264, + "learning_rate": 0.0005563399266506734, + "loss": 0.1302, + "num_input_tokens_seen": 136965616, + "step": 63465 + }, + { + "epoch": 10.35399673735726, + "grad_norm": 0.012267673388123512, + "learning_rate": 0.0005562691995770386, + "loss": 0.0731, + "num_input_tokens_seen": 136975088, + "step": 63470 + }, + { + "epoch": 10.354812398042414, + "grad_norm": 0.04006676748394966, + "learning_rate": 0.0005561984713630373, + "loss": 0.0044, + "num_input_tokens_seen": 136984592, + "step": 63475 + }, + { + "epoch": 10.35562805872757, + "grad_norm": 0.007620746269822121, + "learning_rate": 0.0005561277420101026, + "loss": 0.0384, + "num_input_tokens_seen": 136995344, + "step": 63480 + }, + { + "epoch": 10.356443719412724, + "grad_norm": 0.03373353183269501, + "learning_rate": 0.0005560570115196679, + "loss": 0.0165, + "num_input_tokens_seen": 137005264, + "step": 63485 + }, + { + "epoch": 10.35725938009788, + "grad_norm": 0.0023493815679103136, + "learning_rate": 0.0005559862798931668, + "loss": 0.0318, + "num_input_tokens_seen": 137015376, + "step": 63490 + }, + { + "epoch": 10.358075040783035, + "grad_norm": 0.10797081142663956, + "learning_rate": 0.0005559155471320326, + "loss": 0.0472, + "num_input_tokens_seen": 137025744, + "step": 63495 + }, + { + "epoch": 10.358890701468189, + "grad_norm": 0.004092478659003973, + "learning_rate": 0.0005558448132376991, + "loss": 0.0297, + "num_input_tokens_seen": 137037616, + "step": 63500 + }, + { + "epoch": 10.359706362153345, + "grad_norm": 0.004008755087852478, + "learning_rate": 0.0005557740782115995, + "loss": 0.1109, + "num_input_tokens_seen": 137046608, + "step": 63505 + }, + { + "epoch": 10.360522022838499, + "grad_norm": 0.003608748549595475, + "learning_rate": 0.0005557033420551676, + "loss": 0.0138, + "num_input_tokens_seen": 137057712, + "step": 63510 + }, + { + "epoch": 10.361337683523654, + "grad_norm": 0.4292721152305603, + "learning_rate": 0.0005556326047698367, + "loss": 0.0922, + "num_input_tokens_seen": 137069232, + "step": 63515 + }, + { + "epoch": 10.362153344208808, + "grad_norm": 0.3824755549430847, + "learning_rate": 0.0005555618663570405, + "loss": 0.1735, + "num_input_tokens_seen": 137080240, + "step": 63520 + }, + { + "epoch": 10.362969004893964, + "grad_norm": 0.21231862902641296, + "learning_rate": 0.0005554911268182126, + "loss": 0.0282, + "num_input_tokens_seen": 137090224, + "step": 63525 + }, + { + "epoch": 10.36378466557912, + "grad_norm": 0.0036346344277262688, + "learning_rate": 0.0005554203861547866, + "loss": 0.0672, + "num_input_tokens_seen": 137101840, + "step": 63530 + }, + { + "epoch": 10.364600326264274, + "grad_norm": 0.5454628467559814, + "learning_rate": 0.0005553496443681961, + "loss": 0.0473, + "num_input_tokens_seen": 137112048, + "step": 63535 + }, + { + "epoch": 10.36541598694943, + "grad_norm": 0.0017473497427999973, + "learning_rate": 0.000555278901459875, + "loss": 0.0167, + "num_input_tokens_seen": 137122864, + "step": 63540 + }, + { + "epoch": 10.366231647634583, + "grad_norm": 0.2607009708881378, + "learning_rate": 0.0005552081574312568, + "loss": 0.0206, + "num_input_tokens_seen": 137133424, + "step": 63545 + }, + { + "epoch": 10.367047308319739, + "grad_norm": 0.32519999146461487, + "learning_rate": 0.0005551374122837752, + "loss": 0.0372, + "num_input_tokens_seen": 137144848, + "step": 63550 + }, + { + "epoch": 10.367862969004895, + "grad_norm": 0.05015113577246666, + "learning_rate": 0.000555066666018864, + "loss": 0.0819, + "num_input_tokens_seen": 137154416, + "step": 63555 + }, + { + "epoch": 10.368678629690049, + "grad_norm": 0.018475867807865143, + "learning_rate": 0.0005549959186379569, + "loss": 0.0548, + "num_input_tokens_seen": 137164400, + "step": 63560 + }, + { + "epoch": 10.369494290375204, + "grad_norm": 0.06442388147115707, + "learning_rate": 0.0005549251701424878, + "loss": 0.0092, + "num_input_tokens_seen": 137174192, + "step": 63565 + }, + { + "epoch": 10.370309951060358, + "grad_norm": 0.012496596202254295, + "learning_rate": 0.0005548544205338905, + "loss": 0.0539, + "num_input_tokens_seen": 137184944, + "step": 63570 + }, + { + "epoch": 10.371125611745514, + "grad_norm": 0.007139415945857763, + "learning_rate": 0.0005547836698135987, + "loss": 0.047, + "num_input_tokens_seen": 137194640, + "step": 63575 + }, + { + "epoch": 10.37194127243067, + "grad_norm": 0.0024393643252551556, + "learning_rate": 0.0005547129179830463, + "loss": 0.0061, + "num_input_tokens_seen": 137205552, + "step": 63580 + }, + { + "epoch": 10.372756933115824, + "grad_norm": 0.3117803633213043, + "learning_rate": 0.0005546421650436674, + "loss": 0.1375, + "num_input_tokens_seen": 137217424, + "step": 63585 + }, + { + "epoch": 10.37357259380098, + "grad_norm": 0.001661142217926681, + "learning_rate": 0.0005545714109968956, + "loss": 0.0076, + "num_input_tokens_seen": 137228528, + "step": 63590 + }, + { + "epoch": 10.374388254486133, + "grad_norm": 0.01959504932165146, + "learning_rate": 0.0005545006558441649, + "loss": 0.0494, + "num_input_tokens_seen": 137238640, + "step": 63595 + }, + { + "epoch": 10.375203915171289, + "grad_norm": 0.046812623739242554, + "learning_rate": 0.0005544298995869093, + "loss": 0.0973, + "num_input_tokens_seen": 137248624, + "step": 63600 + }, + { + "epoch": 10.376019575856443, + "grad_norm": 0.012140447273850441, + "learning_rate": 0.0005543591422265627, + "loss": 0.0065, + "num_input_tokens_seen": 137258960, + "step": 63605 + }, + { + "epoch": 10.376835236541599, + "grad_norm": 0.17128652334213257, + "learning_rate": 0.0005542883837645592, + "loss": 0.0432, + "num_input_tokens_seen": 137269776, + "step": 63610 + }, + { + "epoch": 10.377650897226754, + "grad_norm": 0.002737376606091857, + "learning_rate": 0.0005542176242023326, + "loss": 0.0391, + "num_input_tokens_seen": 137280720, + "step": 63615 + }, + { + "epoch": 10.378466557911908, + "grad_norm": 0.016854213550686836, + "learning_rate": 0.0005541468635413172, + "loss": 0.0071, + "num_input_tokens_seen": 137291536, + "step": 63620 + }, + { + "epoch": 10.379282218597064, + "grad_norm": 0.02417229488492012, + "learning_rate": 0.0005540761017829468, + "loss": 0.0062, + "num_input_tokens_seen": 137302288, + "step": 63625 + }, + { + "epoch": 10.380097879282218, + "grad_norm": 0.21351303160190582, + "learning_rate": 0.0005540053389286556, + "loss": 0.0495, + "num_input_tokens_seen": 137312688, + "step": 63630 + }, + { + "epoch": 10.380913539967374, + "grad_norm": 0.38720688223838806, + "learning_rate": 0.0005539345749798778, + "loss": 0.1016, + "num_input_tokens_seen": 137323120, + "step": 63635 + }, + { + "epoch": 10.38172920065253, + "grad_norm": 0.15643472969532013, + "learning_rate": 0.0005538638099380473, + "loss": 0.0951, + "num_input_tokens_seen": 137332912, + "step": 63640 + }, + { + "epoch": 10.382544861337683, + "grad_norm": 0.004149084910750389, + "learning_rate": 0.0005537930438045984, + "loss": 0.0914, + "num_input_tokens_seen": 137343888, + "step": 63645 + }, + { + "epoch": 10.383360522022839, + "grad_norm": 0.012640498578548431, + "learning_rate": 0.0005537222765809653, + "loss": 0.0861, + "num_input_tokens_seen": 137355824, + "step": 63650 + }, + { + "epoch": 10.384176182707993, + "grad_norm": 0.0069519830867648125, + "learning_rate": 0.000553651508268582, + "loss": 0.0155, + "num_input_tokens_seen": 137365008, + "step": 63655 + }, + { + "epoch": 10.384991843393149, + "grad_norm": 0.017142213881015778, + "learning_rate": 0.000553580738868883, + "loss": 0.0153, + "num_input_tokens_seen": 137376080, + "step": 63660 + }, + { + "epoch": 10.385807504078304, + "grad_norm": 0.0009230608702637255, + "learning_rate": 0.0005535099683833021, + "loss": 0.0283, + "num_input_tokens_seen": 137386608, + "step": 63665 + }, + { + "epoch": 10.386623164763458, + "grad_norm": 0.015557908453047276, + "learning_rate": 0.0005534391968132741, + "loss": 0.1275, + "num_input_tokens_seen": 137398640, + "step": 63670 + }, + { + "epoch": 10.387438825448614, + "grad_norm": 0.002696776296943426, + "learning_rate": 0.0005533684241602327, + "loss": 0.0959, + "num_input_tokens_seen": 137408016, + "step": 63675 + }, + { + "epoch": 10.388254486133768, + "grad_norm": 0.0040484583005309105, + "learning_rate": 0.0005532976504256127, + "loss": 0.0198, + "num_input_tokens_seen": 137419696, + "step": 63680 + }, + { + "epoch": 10.389070146818923, + "grad_norm": 0.001835741219110787, + "learning_rate": 0.000553226875610848, + "loss": 0.2212, + "num_input_tokens_seen": 137431056, + "step": 63685 + }, + { + "epoch": 10.38988580750408, + "grad_norm": 0.008505942299962044, + "learning_rate": 0.0005531560997173733, + "loss": 0.0199, + "num_input_tokens_seen": 137442096, + "step": 63690 + }, + { + "epoch": 10.390701468189233, + "grad_norm": 0.005393583793193102, + "learning_rate": 0.0005530853227466229, + "loss": 0.0152, + "num_input_tokens_seen": 137453616, + "step": 63695 + }, + { + "epoch": 10.391517128874389, + "grad_norm": 0.014967923983931541, + "learning_rate": 0.0005530145447000308, + "loss": 0.0232, + "num_input_tokens_seen": 137463952, + "step": 63700 + }, + { + "epoch": 10.392332789559543, + "grad_norm": 0.011635849252343178, + "learning_rate": 0.0005529437655790319, + "loss": 0.0152, + "num_input_tokens_seen": 137475056, + "step": 63705 + }, + { + "epoch": 10.393148450244698, + "grad_norm": 0.0013373836409300566, + "learning_rate": 0.0005528729853850604, + "loss": 0.0896, + "num_input_tokens_seen": 137484272, + "step": 63710 + }, + { + "epoch": 10.393964110929852, + "grad_norm": 0.008863737806677818, + "learning_rate": 0.0005528022041195507, + "loss": 0.0314, + "num_input_tokens_seen": 137494224, + "step": 63715 + }, + { + "epoch": 10.394779771615008, + "grad_norm": 0.22035373747348785, + "learning_rate": 0.0005527314217839375, + "loss": 0.0224, + "num_input_tokens_seen": 137505296, + "step": 63720 + }, + { + "epoch": 10.395595432300164, + "grad_norm": 0.3056116998195648, + "learning_rate": 0.0005526606383796551, + "loss": 0.025, + "num_input_tokens_seen": 137515760, + "step": 63725 + }, + { + "epoch": 10.396411092985318, + "grad_norm": 0.015530501492321491, + "learning_rate": 0.000552589853908138, + "loss": 0.0684, + "num_input_tokens_seen": 137527280, + "step": 63730 + }, + { + "epoch": 10.397226753670473, + "grad_norm": 0.0076279290951788425, + "learning_rate": 0.0005525190683708207, + "loss": 0.0225, + "num_input_tokens_seen": 137537520, + "step": 63735 + }, + { + "epoch": 10.398042414355627, + "grad_norm": 0.012412765994668007, + "learning_rate": 0.0005524482817691381, + "loss": 0.0245, + "num_input_tokens_seen": 137548432, + "step": 63740 + }, + { + "epoch": 10.398858075040783, + "grad_norm": 0.016592828556895256, + "learning_rate": 0.0005523774941045244, + "loss": 0.0062, + "num_input_tokens_seen": 137558736, + "step": 63745 + }, + { + "epoch": 10.399673735725939, + "grad_norm": 0.11969045549631119, + "learning_rate": 0.0005523067053784143, + "loss": 0.0188, + "num_input_tokens_seen": 137569712, + "step": 63750 + }, + { + "epoch": 10.400489396411093, + "grad_norm": 0.0039054385852068663, + "learning_rate": 0.0005522359155922425, + "loss": 0.0338, + "num_input_tokens_seen": 137580432, + "step": 63755 + }, + { + "epoch": 10.401305057096248, + "grad_norm": 0.005145737901329994, + "learning_rate": 0.0005521651247474436, + "loss": 0.0097, + "num_input_tokens_seen": 137591248, + "step": 63760 + }, + { + "epoch": 10.402120717781402, + "grad_norm": 0.004702454898506403, + "learning_rate": 0.0005520943328454523, + "loss": 0.013, + "num_input_tokens_seen": 137602800, + "step": 63765 + }, + { + "epoch": 10.402936378466558, + "grad_norm": 0.5832039713859558, + "learning_rate": 0.0005520235398877032, + "loss": 0.0972, + "num_input_tokens_seen": 137614512, + "step": 63770 + }, + { + "epoch": 10.403752039151712, + "grad_norm": 0.011609661392867565, + "learning_rate": 0.0005519527458756312, + "loss": 0.0217, + "num_input_tokens_seen": 137625008, + "step": 63775 + }, + { + "epoch": 10.404567699836868, + "grad_norm": 0.3519980311393738, + "learning_rate": 0.0005518819508106706, + "loss": 0.0989, + "num_input_tokens_seen": 137635792, + "step": 63780 + }, + { + "epoch": 10.405383360522023, + "grad_norm": 0.3530219495296478, + "learning_rate": 0.0005518111546942567, + "loss": 0.0207, + "num_input_tokens_seen": 137647120, + "step": 63785 + }, + { + "epoch": 10.406199021207177, + "grad_norm": 0.002966291271150112, + "learning_rate": 0.000551740357527824, + "loss": 0.0727, + "num_input_tokens_seen": 137658416, + "step": 63790 + }, + { + "epoch": 10.407014681892333, + "grad_norm": 0.0037598100025206804, + "learning_rate": 0.0005516695593128073, + "loss": 0.0761, + "num_input_tokens_seen": 137669232, + "step": 63795 + }, + { + "epoch": 10.407830342577487, + "grad_norm": 0.02477104589343071, + "learning_rate": 0.0005515987600506414, + "loss": 0.0283, + "num_input_tokens_seen": 137681296, + "step": 63800 + }, + { + "epoch": 10.408646003262643, + "grad_norm": 0.3461161255836487, + "learning_rate": 0.0005515279597427612, + "loss": 0.0338, + "num_input_tokens_seen": 137690864, + "step": 63805 + }, + { + "epoch": 10.409461663947798, + "grad_norm": 0.00900832936167717, + "learning_rate": 0.0005514571583906014, + "loss": 0.0914, + "num_input_tokens_seen": 137701712, + "step": 63810 + }, + { + "epoch": 10.410277324632952, + "grad_norm": 0.07133755832910538, + "learning_rate": 0.0005513863559955971, + "loss": 0.0345, + "num_input_tokens_seen": 137713072, + "step": 63815 + }, + { + "epoch": 10.411092985318108, + "grad_norm": 0.16226263344287872, + "learning_rate": 0.0005513155525591831, + "loss": 0.0287, + "num_input_tokens_seen": 137724368, + "step": 63820 + }, + { + "epoch": 10.411908646003262, + "grad_norm": 0.008057237602770329, + "learning_rate": 0.0005512447480827945, + "loss": 0.0112, + "num_input_tokens_seen": 137734864, + "step": 63825 + }, + { + "epoch": 10.412724306688418, + "grad_norm": 0.012757726944983006, + "learning_rate": 0.0005511739425678658, + "loss": 0.0074, + "num_input_tokens_seen": 137747312, + "step": 63830 + }, + { + "epoch": 10.413539967373573, + "grad_norm": 0.014100191183388233, + "learning_rate": 0.0005511031360158324, + "loss": 0.0034, + "num_input_tokens_seen": 137758192, + "step": 63835 + }, + { + "epoch": 10.414355628058727, + "grad_norm": 0.009351453743875027, + "learning_rate": 0.0005510323284281291, + "loss": 0.006, + "num_input_tokens_seen": 137768880, + "step": 63840 + }, + { + "epoch": 10.415171288743883, + "grad_norm": 0.10731302201747894, + "learning_rate": 0.0005509615198061909, + "loss": 0.0114, + "num_input_tokens_seen": 137779728, + "step": 63845 + }, + { + "epoch": 10.415986949429037, + "grad_norm": 0.020762013271450996, + "learning_rate": 0.0005508907101514529, + "loss": 0.0085, + "num_input_tokens_seen": 137790800, + "step": 63850 + }, + { + "epoch": 10.416802610114193, + "grad_norm": 0.41157421469688416, + "learning_rate": 0.0005508198994653501, + "loss": 0.1139, + "num_input_tokens_seen": 137802064, + "step": 63855 + }, + { + "epoch": 10.417618270799348, + "grad_norm": 0.023060429841279984, + "learning_rate": 0.0005507490877493176, + "loss": 0.0125, + "num_input_tokens_seen": 137811632, + "step": 63860 + }, + { + "epoch": 10.418433931484502, + "grad_norm": 0.037393826991319656, + "learning_rate": 0.0005506782750047903, + "loss": 0.008, + "num_input_tokens_seen": 137822896, + "step": 63865 + }, + { + "epoch": 10.419249592169658, + "grad_norm": 0.007594020571559668, + "learning_rate": 0.0005506074612332035, + "loss": 0.0076, + "num_input_tokens_seen": 137834032, + "step": 63870 + }, + { + "epoch": 10.420065252854812, + "grad_norm": 0.0772496685385704, + "learning_rate": 0.0005505366464359924, + "loss": 0.0179, + "num_input_tokens_seen": 137844784, + "step": 63875 + }, + { + "epoch": 10.420880913539968, + "grad_norm": 0.1671362966299057, + "learning_rate": 0.000550465830614592, + "loss": 0.0135, + "num_input_tokens_seen": 137854768, + "step": 63880 + }, + { + "epoch": 10.421696574225122, + "grad_norm": 0.007003472652286291, + "learning_rate": 0.0005503950137704374, + "loss": 0.006, + "num_input_tokens_seen": 137864496, + "step": 63885 + }, + { + "epoch": 10.422512234910277, + "grad_norm": 0.0735108032822609, + "learning_rate": 0.0005503241959049641, + "loss": 0.113, + "num_input_tokens_seen": 137874352, + "step": 63890 + }, + { + "epoch": 10.423327895595433, + "grad_norm": 0.0938108041882515, + "learning_rate": 0.000550253377019607, + "loss": 0.0184, + "num_input_tokens_seen": 137885296, + "step": 63895 + }, + { + "epoch": 10.424143556280587, + "grad_norm": 0.007233478594571352, + "learning_rate": 0.0005501825571158016, + "loss": 0.0027, + "num_input_tokens_seen": 137896848, + "step": 63900 + }, + { + "epoch": 10.424959216965743, + "grad_norm": 0.07163023948669434, + "learning_rate": 0.000550111736194983, + "loss": 0.0252, + "num_input_tokens_seen": 137907664, + "step": 63905 + }, + { + "epoch": 10.425774877650896, + "grad_norm": 0.0063054244965314865, + "learning_rate": 0.0005500409142585864, + "loss": 0.1049, + "num_input_tokens_seen": 137918800, + "step": 63910 + }, + { + "epoch": 10.426590538336052, + "grad_norm": 0.003660548711195588, + "learning_rate": 0.0005499700913080472, + "loss": 0.1419, + "num_input_tokens_seen": 137929424, + "step": 63915 + }, + { + "epoch": 10.427406199021208, + "grad_norm": 0.8780221343040466, + "learning_rate": 0.0005498992673448008, + "loss": 0.1147, + "num_input_tokens_seen": 137941232, + "step": 63920 + }, + { + "epoch": 10.428221859706362, + "grad_norm": 0.0027427575550973415, + "learning_rate": 0.0005498284423702824, + "loss": 0.154, + "num_input_tokens_seen": 137951696, + "step": 63925 + }, + { + "epoch": 10.429037520391518, + "grad_norm": 0.5122124552726746, + "learning_rate": 0.0005497576163859273, + "loss": 0.0176, + "num_input_tokens_seen": 137962768, + "step": 63930 + }, + { + "epoch": 10.429853181076671, + "grad_norm": 0.022497553378343582, + "learning_rate": 0.0005496867893931711, + "loss": 0.1105, + "num_input_tokens_seen": 137974480, + "step": 63935 + }, + { + "epoch": 10.430668841761827, + "grad_norm": 0.04115482419729233, + "learning_rate": 0.0005496159613934492, + "loss": 0.0265, + "num_input_tokens_seen": 137985680, + "step": 63940 + }, + { + "epoch": 10.431484502446983, + "grad_norm": 0.0875755250453949, + "learning_rate": 0.0005495451323881967, + "loss": 0.0381, + "num_input_tokens_seen": 137997264, + "step": 63945 + }, + { + "epoch": 10.432300163132137, + "grad_norm": 0.021797379478812218, + "learning_rate": 0.0005494743023788493, + "loss": 0.1187, + "num_input_tokens_seen": 138007696, + "step": 63950 + }, + { + "epoch": 10.433115823817293, + "grad_norm": 0.01686260476708412, + "learning_rate": 0.0005494034713668423, + "loss": 0.0126, + "num_input_tokens_seen": 138018384, + "step": 63955 + }, + { + "epoch": 10.433931484502446, + "grad_norm": 0.0010668630711734295, + "learning_rate": 0.0005493326393536113, + "loss": 0.0044, + "num_input_tokens_seen": 138029968, + "step": 63960 + }, + { + "epoch": 10.434747145187602, + "grad_norm": 0.007378291338682175, + "learning_rate": 0.000549261806340592, + "loss": 0.0489, + "num_input_tokens_seen": 138039664, + "step": 63965 + }, + { + "epoch": 10.435562805872756, + "grad_norm": 0.003996559884399176, + "learning_rate": 0.0005491909723292196, + "loss": 0.0149, + "num_input_tokens_seen": 138051024, + "step": 63970 + }, + { + "epoch": 10.436378466557912, + "grad_norm": 0.27420538663864136, + "learning_rate": 0.0005491201373209295, + "loss": 0.0142, + "num_input_tokens_seen": 138061648, + "step": 63975 + }, + { + "epoch": 10.437194127243067, + "grad_norm": 0.00741687323898077, + "learning_rate": 0.0005490493013171578, + "loss": 0.0132, + "num_input_tokens_seen": 138071536, + "step": 63980 + }, + { + "epoch": 10.438009787928221, + "grad_norm": 0.0401686429977417, + "learning_rate": 0.0005489784643193397, + "loss": 0.0167, + "num_input_tokens_seen": 138082160, + "step": 63985 + }, + { + "epoch": 10.438825448613377, + "grad_norm": 0.004495096392929554, + "learning_rate": 0.0005489076263289109, + "loss": 0.0479, + "num_input_tokens_seen": 138092048, + "step": 63990 + }, + { + "epoch": 10.439641109298531, + "grad_norm": 0.0341220498085022, + "learning_rate": 0.000548836787347307, + "loss": 0.113, + "num_input_tokens_seen": 138103792, + "step": 63995 + }, + { + "epoch": 10.440456769983687, + "grad_norm": 0.014451473020017147, + "learning_rate": 0.0005487659473759635, + "loss": 0.033, + "num_input_tokens_seen": 138114352, + "step": 64000 + }, + { + "epoch": 10.441272430668842, + "grad_norm": 0.004849119111895561, + "learning_rate": 0.0005486951064163164, + "loss": 0.0068, + "num_input_tokens_seen": 138124400, + "step": 64005 + }, + { + "epoch": 10.442088091353996, + "grad_norm": 0.008503446355462074, + "learning_rate": 0.0005486242644698011, + "loss": 0.012, + "num_input_tokens_seen": 138134800, + "step": 64010 + }, + { + "epoch": 10.442903752039152, + "grad_norm": 0.5239019393920898, + "learning_rate": 0.0005485534215378535, + "loss": 0.032, + "num_input_tokens_seen": 138145552, + "step": 64015 + }, + { + "epoch": 10.443719412724306, + "grad_norm": 0.046765729784965515, + "learning_rate": 0.0005484825776219092, + "loss": 0.1139, + "num_input_tokens_seen": 138156464, + "step": 64020 + }, + { + "epoch": 10.444535073409462, + "grad_norm": 0.17480318248271942, + "learning_rate": 0.0005484117327234038, + "loss": 0.0495, + "num_input_tokens_seen": 138167984, + "step": 64025 + }, + { + "epoch": 10.445350734094617, + "grad_norm": 0.0022052365820854902, + "learning_rate": 0.0005483408868437734, + "loss": 0.0313, + "num_input_tokens_seen": 138177840, + "step": 64030 + }, + { + "epoch": 10.446166394779771, + "grad_norm": 0.0038957062643021345, + "learning_rate": 0.0005482700399844536, + "loss": 0.0145, + "num_input_tokens_seen": 138188208, + "step": 64035 + }, + { + "epoch": 10.446982055464927, + "grad_norm": 0.3542209267616272, + "learning_rate": 0.0005481991921468801, + "loss": 0.0672, + "num_input_tokens_seen": 138198544, + "step": 64040 + }, + { + "epoch": 10.447797716150081, + "grad_norm": 0.012097034603357315, + "learning_rate": 0.0005481283433324888, + "loss": 0.0116, + "num_input_tokens_seen": 138210288, + "step": 64045 + }, + { + "epoch": 10.448613376835237, + "grad_norm": 0.030181903392076492, + "learning_rate": 0.0005480574935427157, + "loss": 0.0086, + "num_input_tokens_seen": 138221488, + "step": 64050 + }, + { + "epoch": 10.449429037520392, + "grad_norm": 0.02346578799188137, + "learning_rate": 0.0005479866427789965, + "loss": 0.0418, + "num_input_tokens_seen": 138232272, + "step": 64055 + }, + { + "epoch": 10.450244698205546, + "grad_norm": 0.010822178795933723, + "learning_rate": 0.0005479157910427672, + "loss": 0.0054, + "num_input_tokens_seen": 138243184, + "step": 64060 + }, + { + "epoch": 10.451060358890702, + "grad_norm": 0.014392509125173092, + "learning_rate": 0.0005478449383354634, + "loss": 0.1061, + "num_input_tokens_seen": 138254960, + "step": 64065 + }, + { + "epoch": 10.451876019575856, + "grad_norm": 0.014599153771996498, + "learning_rate": 0.0005477740846585213, + "loss": 0.0107, + "num_input_tokens_seen": 138265520, + "step": 64070 + }, + { + "epoch": 10.452691680261012, + "grad_norm": 0.0026850702706724405, + "learning_rate": 0.0005477032300133768, + "loss": 0.1128, + "num_input_tokens_seen": 138276112, + "step": 64075 + }, + { + "epoch": 10.453507340946166, + "grad_norm": 0.004758888855576515, + "learning_rate": 0.0005476323744014658, + "loss": 0.0023, + "num_input_tokens_seen": 138286128, + "step": 64080 + }, + { + "epoch": 10.454323001631321, + "grad_norm": 0.04943656921386719, + "learning_rate": 0.0005475615178242244, + "loss": 0.0396, + "num_input_tokens_seen": 138296848, + "step": 64085 + }, + { + "epoch": 10.455138662316477, + "grad_norm": 0.08671029657125473, + "learning_rate": 0.0005474906602830884, + "loss": 0.0333, + "num_input_tokens_seen": 138307440, + "step": 64090 + }, + { + "epoch": 10.455954323001631, + "grad_norm": 0.3726256787776947, + "learning_rate": 0.0005474198017794939, + "loss": 0.2082, + "num_input_tokens_seen": 138318128, + "step": 64095 + }, + { + "epoch": 10.456769983686787, + "grad_norm": 0.0593709871172905, + "learning_rate": 0.000547348942314877, + "loss": 0.023, + "num_input_tokens_seen": 138328176, + "step": 64100 + }, + { + "epoch": 10.45758564437194, + "grad_norm": 0.0022720927372574806, + "learning_rate": 0.0005472780818906736, + "loss": 0.0056, + "num_input_tokens_seen": 138337840, + "step": 64105 + }, + { + "epoch": 10.458401305057096, + "grad_norm": 0.018708527088165283, + "learning_rate": 0.00054720722050832, + "loss": 0.0045, + "num_input_tokens_seen": 138348912, + "step": 64110 + }, + { + "epoch": 10.459216965742252, + "grad_norm": 0.0033393516205251217, + "learning_rate": 0.0005471363581692523, + "loss": 0.0356, + "num_input_tokens_seen": 138359440, + "step": 64115 + }, + { + "epoch": 10.460032626427406, + "grad_norm": 0.03131455183029175, + "learning_rate": 0.0005470654948749065, + "loss": 0.0139, + "num_input_tokens_seen": 138368944, + "step": 64120 + }, + { + "epoch": 10.460848287112562, + "grad_norm": 0.006178342271596193, + "learning_rate": 0.0005469946306267185, + "loss": 0.0054, + "num_input_tokens_seen": 138379664, + "step": 64125 + }, + { + "epoch": 10.461663947797716, + "grad_norm": 0.07633642107248306, + "learning_rate": 0.0005469237654261249, + "loss": 0.0278, + "num_input_tokens_seen": 138392496, + "step": 64130 + }, + { + "epoch": 10.462479608482871, + "grad_norm": 0.752743661403656, + "learning_rate": 0.0005468528992745615, + "loss": 0.0884, + "num_input_tokens_seen": 138402800, + "step": 64135 + }, + { + "epoch": 10.463295269168025, + "grad_norm": 0.04524966701865196, + "learning_rate": 0.0005467820321734647, + "loss": 0.0102, + "num_input_tokens_seen": 138412912, + "step": 64140 + }, + { + "epoch": 10.464110929853181, + "grad_norm": 0.056157130748033524, + "learning_rate": 0.0005467111641242709, + "loss": 0.2445, + "num_input_tokens_seen": 138424688, + "step": 64145 + }, + { + "epoch": 10.464926590538337, + "grad_norm": 0.0007423535571433604, + "learning_rate": 0.000546640295128416, + "loss": 0.0807, + "num_input_tokens_seen": 138435120, + "step": 64150 + }, + { + "epoch": 10.46574225122349, + "grad_norm": 0.004523728974163532, + "learning_rate": 0.0005465694251873362, + "loss": 0.0046, + "num_input_tokens_seen": 138446736, + "step": 64155 + }, + { + "epoch": 10.466557911908646, + "grad_norm": 0.006408552173525095, + "learning_rate": 0.000546498554302468, + "loss": 0.011, + "num_input_tokens_seen": 138456464, + "step": 64160 + }, + { + "epoch": 10.4673735725938, + "grad_norm": 0.008871420286595821, + "learning_rate": 0.0005464276824752477, + "loss": 0.2154, + "num_input_tokens_seen": 138466160, + "step": 64165 + }, + { + "epoch": 10.468189233278956, + "grad_norm": 0.007980672642588615, + "learning_rate": 0.0005463568097071115, + "loss": 0.0313, + "num_input_tokens_seen": 138476016, + "step": 64170 + }, + { + "epoch": 10.469004893964112, + "grad_norm": 0.004341542720794678, + "learning_rate": 0.0005462859359994957, + "loss": 0.0123, + "num_input_tokens_seen": 138487184, + "step": 64175 + }, + { + "epoch": 10.469820554649266, + "grad_norm": 0.03515727445483208, + "learning_rate": 0.0005462150613538366, + "loss": 0.0034, + "num_input_tokens_seen": 138497648, + "step": 64180 + }, + { + "epoch": 10.470636215334421, + "grad_norm": 0.40586772561073303, + "learning_rate": 0.0005461441857715708, + "loss": 0.1583, + "num_input_tokens_seen": 138508720, + "step": 64185 + }, + { + "epoch": 10.471451876019575, + "grad_norm": 0.002542255213484168, + "learning_rate": 0.0005460733092541345, + "loss": 0.0324, + "num_input_tokens_seen": 138518224, + "step": 64190 + }, + { + "epoch": 10.47226753670473, + "grad_norm": 0.018054431304335594, + "learning_rate": 0.000546002431802964, + "loss": 0.0117, + "num_input_tokens_seen": 138530064, + "step": 64195 + }, + { + "epoch": 10.473083197389887, + "grad_norm": 0.01659454219043255, + "learning_rate": 0.0005459315534194959, + "loss": 0.0104, + "num_input_tokens_seen": 138540560, + "step": 64200 + }, + { + "epoch": 10.47389885807504, + "grad_norm": 0.009241634048521519, + "learning_rate": 0.0005458606741051667, + "loss": 0.0121, + "num_input_tokens_seen": 138551824, + "step": 64205 + }, + { + "epoch": 10.474714518760196, + "grad_norm": 0.20156247913837433, + "learning_rate": 0.0005457897938614127, + "loss": 0.2924, + "num_input_tokens_seen": 138561072, + "step": 64210 + }, + { + "epoch": 10.47553017944535, + "grad_norm": 0.0007992640021257102, + "learning_rate": 0.0005457189126896704, + "loss": 0.0274, + "num_input_tokens_seen": 138571760, + "step": 64215 + }, + { + "epoch": 10.476345840130506, + "grad_norm": 0.28928518295288086, + "learning_rate": 0.0005456480305913765, + "loss": 0.0352, + "num_input_tokens_seen": 138582352, + "step": 64220 + }, + { + "epoch": 10.477161500815662, + "grad_norm": 0.19051100313663483, + "learning_rate": 0.0005455771475679673, + "loss": 0.0197, + "num_input_tokens_seen": 138593488, + "step": 64225 + }, + { + "epoch": 10.477977161500815, + "grad_norm": 0.005448159761726856, + "learning_rate": 0.0005455062636208793, + "loss": 0.1114, + "num_input_tokens_seen": 138604784, + "step": 64230 + }, + { + "epoch": 10.478792822185971, + "grad_norm": 0.4894399046897888, + "learning_rate": 0.0005454353787515493, + "loss": 0.0478, + "num_input_tokens_seen": 138616144, + "step": 64235 + }, + { + "epoch": 10.479608482871125, + "grad_norm": 0.032557934522628784, + "learning_rate": 0.0005453644929614136, + "loss": 0.0214, + "num_input_tokens_seen": 138626992, + "step": 64240 + }, + { + "epoch": 10.48042414355628, + "grad_norm": 0.3118022084236145, + "learning_rate": 0.0005452936062519088, + "loss": 0.0394, + "num_input_tokens_seen": 138638032, + "step": 64245 + }, + { + "epoch": 10.481239804241435, + "grad_norm": 0.0009059447911567986, + "learning_rate": 0.0005452227186244717, + "loss": 0.0058, + "num_input_tokens_seen": 138648272, + "step": 64250 + }, + { + "epoch": 10.48205546492659, + "grad_norm": 0.014689632691442966, + "learning_rate": 0.0005451518300805389, + "loss": 0.015, + "num_input_tokens_seen": 138658288, + "step": 64255 + }, + { + "epoch": 10.482871125611746, + "grad_norm": 0.17797890305519104, + "learning_rate": 0.0005450809406215469, + "loss": 0.0187, + "num_input_tokens_seen": 138669840, + "step": 64260 + }, + { + "epoch": 10.4836867862969, + "grad_norm": 0.1168985664844513, + "learning_rate": 0.0005450100502489324, + "loss": 0.0219, + "num_input_tokens_seen": 138680816, + "step": 64265 + }, + { + "epoch": 10.484502446982056, + "grad_norm": 0.019894301891326904, + "learning_rate": 0.0005449391589641321, + "loss": 0.1174, + "num_input_tokens_seen": 138691280, + "step": 64270 + }, + { + "epoch": 10.48531810766721, + "grad_norm": 0.0015611272538080812, + "learning_rate": 0.0005448682667685829, + "loss": 0.0086, + "num_input_tokens_seen": 138701840, + "step": 64275 + }, + { + "epoch": 10.486133768352365, + "grad_norm": 0.026562070474028587, + "learning_rate": 0.0005447973736637214, + "loss": 0.0071, + "num_input_tokens_seen": 138712624, + "step": 64280 + }, + { + "epoch": 10.486949429037521, + "grad_norm": 0.004168341401964426, + "learning_rate": 0.0005447264796509841, + "loss": 0.1307, + "num_input_tokens_seen": 138723696, + "step": 64285 + }, + { + "epoch": 10.487765089722675, + "grad_norm": 0.030232056975364685, + "learning_rate": 0.0005446555847318081, + "loss": 0.0118, + "num_input_tokens_seen": 138734000, + "step": 64290 + }, + { + "epoch": 10.48858075040783, + "grad_norm": 0.013917588628828526, + "learning_rate": 0.00054458468890763, + "loss": 0.0646, + "num_input_tokens_seen": 138744304, + "step": 64295 + }, + { + "epoch": 10.489396411092985, + "grad_norm": 0.6741026639938354, + "learning_rate": 0.0005445137921798866, + "loss": 0.1078, + "num_input_tokens_seen": 138755056, + "step": 64300 + }, + { + "epoch": 10.49021207177814, + "grad_norm": 0.04618797078728676, + "learning_rate": 0.0005444428945500147, + "loss": 0.0223, + "num_input_tokens_seen": 138766992, + "step": 64305 + }, + { + "epoch": 10.491027732463296, + "grad_norm": 0.35881149768829346, + "learning_rate": 0.0005443719960194513, + "loss": 0.039, + "num_input_tokens_seen": 138777712, + "step": 64310 + }, + { + "epoch": 10.49184339314845, + "grad_norm": 0.02100551873445511, + "learning_rate": 0.0005443010965896327, + "loss": 0.0581, + "num_input_tokens_seen": 138788912, + "step": 64315 + }, + { + "epoch": 10.492659053833606, + "grad_norm": 0.14398764073848724, + "learning_rate": 0.0005442301962619965, + "loss": 0.014, + "num_input_tokens_seen": 138799376, + "step": 64320 + }, + { + "epoch": 10.49347471451876, + "grad_norm": 0.10722988843917847, + "learning_rate": 0.0005441592950379792, + "loss": 0.1164, + "num_input_tokens_seen": 138809968, + "step": 64325 + }, + { + "epoch": 10.494290375203915, + "grad_norm": 0.14917492866516113, + "learning_rate": 0.0005440883929190179, + "loss": 0.0495, + "num_input_tokens_seen": 138820016, + "step": 64330 + }, + { + "epoch": 10.49510603588907, + "grad_norm": 0.3162502348423004, + "learning_rate": 0.0005440174899065493, + "loss": 0.0623, + "num_input_tokens_seen": 138831056, + "step": 64335 + }, + { + "epoch": 10.495921696574225, + "grad_norm": 0.002860084641724825, + "learning_rate": 0.0005439465860020104, + "loss": 0.1363, + "num_input_tokens_seen": 138842608, + "step": 64340 + }, + { + "epoch": 10.49673735725938, + "grad_norm": 0.15443918108940125, + "learning_rate": 0.0005438756812068382, + "loss": 0.0431, + "num_input_tokens_seen": 138853584, + "step": 64345 + }, + { + "epoch": 10.497553017944535, + "grad_norm": 0.0697326734662056, + "learning_rate": 0.0005438047755224696, + "loss": 0.0083, + "num_input_tokens_seen": 138863792, + "step": 64350 + }, + { + "epoch": 10.49836867862969, + "grad_norm": 0.019864896312355995, + "learning_rate": 0.0005437338689503417, + "loss": 0.0181, + "num_input_tokens_seen": 138875056, + "step": 64355 + }, + { + "epoch": 10.499184339314844, + "grad_norm": 0.4138883948326111, + "learning_rate": 0.0005436629614918915, + "loss": 0.0285, + "num_input_tokens_seen": 138885424, + "step": 64360 + }, + { + "epoch": 10.5, + "grad_norm": 0.01935281977057457, + "learning_rate": 0.0005435920531485559, + "loss": 0.0345, + "num_input_tokens_seen": 138896272, + "step": 64365 + }, + { + "epoch": 10.500815660685156, + "grad_norm": 0.3142714500427246, + "learning_rate": 0.0005435211439217722, + "loss": 0.0294, + "num_input_tokens_seen": 138908080, + "step": 64370 + }, + { + "epoch": 10.50163132137031, + "grad_norm": 0.006073612719774246, + "learning_rate": 0.0005434502338129773, + "loss": 0.0109, + "num_input_tokens_seen": 138918576, + "step": 64375 + }, + { + "epoch": 10.502446982055465, + "grad_norm": 0.006367042660713196, + "learning_rate": 0.0005433793228236081, + "loss": 0.0036, + "num_input_tokens_seen": 138929552, + "step": 64380 + }, + { + "epoch": 10.50326264274062, + "grad_norm": 0.05169467628002167, + "learning_rate": 0.000543308410955102, + "loss": 0.0072, + "num_input_tokens_seen": 138939504, + "step": 64385 + }, + { + "epoch": 10.504078303425775, + "grad_norm": 0.04866122454404831, + "learning_rate": 0.0005432374982088961, + "loss": 0.0342, + "num_input_tokens_seen": 138951408, + "step": 64390 + }, + { + "epoch": 10.50489396411093, + "grad_norm": 0.03663979843258858, + "learning_rate": 0.0005431665845864274, + "loss": 0.0065, + "num_input_tokens_seen": 138962224, + "step": 64395 + }, + { + "epoch": 10.505709624796085, + "grad_norm": 0.14822718501091003, + "learning_rate": 0.0005430956700891331, + "loss": 0.1689, + "num_input_tokens_seen": 138973680, + "step": 64400 + }, + { + "epoch": 10.50652528548124, + "grad_norm": 0.2093113511800766, + "learning_rate": 0.0005430247547184504, + "loss": 0.0363, + "num_input_tokens_seen": 138985072, + "step": 64405 + }, + { + "epoch": 10.507340946166394, + "grad_norm": 0.3468906283378601, + "learning_rate": 0.0005429538384758162, + "loss": 0.0801, + "num_input_tokens_seen": 138995440, + "step": 64410 + }, + { + "epoch": 10.50815660685155, + "grad_norm": 0.4352373480796814, + "learning_rate": 0.0005428829213626683, + "loss": 0.0492, + "num_input_tokens_seen": 139006960, + "step": 64415 + }, + { + "epoch": 10.508972267536706, + "grad_norm": 0.020769799128174782, + "learning_rate": 0.0005428120033804433, + "loss": 0.0072, + "num_input_tokens_seen": 139017424, + "step": 64420 + }, + { + "epoch": 10.50978792822186, + "grad_norm": 0.005844152066856623, + "learning_rate": 0.0005427410845305791, + "loss": 0.0626, + "num_input_tokens_seen": 139028144, + "step": 64425 + }, + { + "epoch": 10.510603588907015, + "grad_norm": 0.1511726677417755, + "learning_rate": 0.0005426701648145124, + "loss": 0.0521, + "num_input_tokens_seen": 139038128, + "step": 64430 + }, + { + "epoch": 10.51141924959217, + "grad_norm": 0.19312071800231934, + "learning_rate": 0.0005425992442336805, + "loss": 0.1477, + "num_input_tokens_seen": 139049616, + "step": 64435 + }, + { + "epoch": 10.512234910277325, + "grad_norm": 0.3681754171848297, + "learning_rate": 0.0005425283227895212, + "loss": 0.083, + "num_input_tokens_seen": 139061136, + "step": 64440 + }, + { + "epoch": 10.513050570962479, + "grad_norm": 0.1490596979856491, + "learning_rate": 0.0005424574004834712, + "loss": 0.0155, + "num_input_tokens_seen": 139071312, + "step": 64445 + }, + { + "epoch": 10.513866231647635, + "grad_norm": 0.007141440641134977, + "learning_rate": 0.0005423864773169683, + "loss": 0.023, + "num_input_tokens_seen": 139082672, + "step": 64450 + }, + { + "epoch": 10.51468189233279, + "grad_norm": 0.4793950915336609, + "learning_rate": 0.0005423155532914497, + "loss": 0.0233, + "num_input_tokens_seen": 139093776, + "step": 64455 + }, + { + "epoch": 10.515497553017944, + "grad_norm": 0.006522107869386673, + "learning_rate": 0.0005422446284083527, + "loss": 0.0606, + "num_input_tokens_seen": 139104112, + "step": 64460 + }, + { + "epoch": 10.5163132137031, + "grad_norm": 0.001900400617159903, + "learning_rate": 0.0005421737026691147, + "loss": 0.0982, + "num_input_tokens_seen": 139114384, + "step": 64465 + }, + { + "epoch": 10.517128874388254, + "grad_norm": 0.20763549208641052, + "learning_rate": 0.0005421027760751731, + "loss": 0.0158, + "num_input_tokens_seen": 139125232, + "step": 64470 + }, + { + "epoch": 10.51794453507341, + "grad_norm": 0.015119647607207298, + "learning_rate": 0.0005420318486279653, + "loss": 0.1123, + "num_input_tokens_seen": 139135472, + "step": 64475 + }, + { + "epoch": 10.518760195758565, + "grad_norm": 0.0019424607744440436, + "learning_rate": 0.0005419609203289288, + "loss": 0.0064, + "num_input_tokens_seen": 139147024, + "step": 64480 + }, + { + "epoch": 10.51957585644372, + "grad_norm": 0.10813461989164352, + "learning_rate": 0.0005418899911795011, + "loss": 0.042, + "num_input_tokens_seen": 139157680, + "step": 64485 + }, + { + "epoch": 10.520391517128875, + "grad_norm": 0.6470907926559448, + "learning_rate": 0.0005418190611811194, + "loss": 0.0705, + "num_input_tokens_seen": 139168272, + "step": 64490 + }, + { + "epoch": 10.521207177814029, + "grad_norm": 0.00607797596603632, + "learning_rate": 0.0005417481303352216, + "loss": 0.0069, + "num_input_tokens_seen": 139179344, + "step": 64495 + }, + { + "epoch": 10.522022838499185, + "grad_norm": 0.041040077805519104, + "learning_rate": 0.0005416771986432448, + "loss": 0.0157, + "num_input_tokens_seen": 139191888, + "step": 64500 + }, + { + "epoch": 10.522838499184338, + "grad_norm": 0.0770135298371315, + "learning_rate": 0.0005416062661066268, + "loss": 0.0223, + "num_input_tokens_seen": 139200592, + "step": 64505 + }, + { + "epoch": 10.523654159869494, + "grad_norm": 0.014629957266151905, + "learning_rate": 0.000541535332726805, + "loss": 0.0042, + "num_input_tokens_seen": 139212080, + "step": 64510 + }, + { + "epoch": 10.52446982055465, + "grad_norm": 0.003678290406242013, + "learning_rate": 0.000541464398505217, + "loss": 0.0209, + "num_input_tokens_seen": 139223088, + "step": 64515 + }, + { + "epoch": 10.525285481239804, + "grad_norm": 0.27014148235321045, + "learning_rate": 0.0005413934634433003, + "loss": 0.0278, + "num_input_tokens_seen": 139234448, + "step": 64520 + }, + { + "epoch": 10.52610114192496, + "grad_norm": 0.015500704757869244, + "learning_rate": 0.0005413225275424926, + "loss": 0.0125, + "num_input_tokens_seen": 139245008, + "step": 64525 + }, + { + "epoch": 10.526916802610113, + "grad_norm": 0.0018587103113532066, + "learning_rate": 0.0005412515908042314, + "loss": 0.2054, + "num_input_tokens_seen": 139255024, + "step": 64530 + }, + { + "epoch": 10.52773246329527, + "grad_norm": 0.023476464673876762, + "learning_rate": 0.0005411806532299544, + "loss": 0.0374, + "num_input_tokens_seen": 139265680, + "step": 64535 + }, + { + "epoch": 10.528548123980425, + "grad_norm": 0.048934902995824814, + "learning_rate": 0.0005411097148210992, + "loss": 0.0441, + "num_input_tokens_seen": 139276240, + "step": 64540 + }, + { + "epoch": 10.529363784665579, + "grad_norm": 0.011443077586591244, + "learning_rate": 0.0005410387755791036, + "loss": 0.0097, + "num_input_tokens_seen": 139285680, + "step": 64545 + }, + { + "epoch": 10.530179445350734, + "grad_norm": 0.29503610730171204, + "learning_rate": 0.0005409678355054051, + "loss": 0.0439, + "num_input_tokens_seen": 139296656, + "step": 64550 + }, + { + "epoch": 10.530995106035888, + "grad_norm": 0.002718844683840871, + "learning_rate": 0.0005408968946014416, + "loss": 0.0126, + "num_input_tokens_seen": 139306672, + "step": 64555 + }, + { + "epoch": 10.531810766721044, + "grad_norm": 0.005052946507930756, + "learning_rate": 0.0005408259528686503, + "loss": 0.0536, + "num_input_tokens_seen": 139316624, + "step": 64560 + }, + { + "epoch": 10.5326264274062, + "grad_norm": 0.0012100170133635402, + "learning_rate": 0.0005407550103084695, + "loss": 0.0062, + "num_input_tokens_seen": 139327472, + "step": 64565 + }, + { + "epoch": 10.533442088091354, + "grad_norm": 0.2487291395664215, + "learning_rate": 0.0005406840669223367, + "loss": 0.0418, + "num_input_tokens_seen": 139338480, + "step": 64570 + }, + { + "epoch": 10.53425774877651, + "grad_norm": 0.302805632352829, + "learning_rate": 0.0005406131227116896, + "loss": 0.0849, + "num_input_tokens_seen": 139350032, + "step": 64575 + }, + { + "epoch": 10.535073409461663, + "grad_norm": 0.0024716814514249563, + "learning_rate": 0.000540542177677966, + "loss": 0.0097, + "num_input_tokens_seen": 139361104, + "step": 64580 + }, + { + "epoch": 10.535889070146819, + "grad_norm": 0.03535851091146469, + "learning_rate": 0.0005404712318226038, + "loss": 0.0519, + "num_input_tokens_seen": 139371056, + "step": 64585 + }, + { + "epoch": 10.536704730831975, + "grad_norm": 0.01143036037683487, + "learning_rate": 0.0005404002851470409, + "loss": 0.012, + "num_input_tokens_seen": 139382544, + "step": 64590 + }, + { + "epoch": 10.537520391517129, + "grad_norm": 0.06018058955669403, + "learning_rate": 0.0005403293376527148, + "loss": 0.0484, + "num_input_tokens_seen": 139391888, + "step": 64595 + }, + { + "epoch": 10.538336052202284, + "grad_norm": 0.24475276470184326, + "learning_rate": 0.0005402583893410636, + "loss": 0.0208, + "num_input_tokens_seen": 139402832, + "step": 64600 + }, + { + "epoch": 10.539151712887438, + "grad_norm": 0.0017478041118010879, + "learning_rate": 0.0005401874402135249, + "loss": 0.0228, + "num_input_tokens_seen": 139413648, + "step": 64605 + }, + { + "epoch": 10.539967373572594, + "grad_norm": 0.013831570744514465, + "learning_rate": 0.000540116490271537, + "loss": 0.0052, + "num_input_tokens_seen": 139425584, + "step": 64610 + }, + { + "epoch": 10.540783034257748, + "grad_norm": 0.015895986929535866, + "learning_rate": 0.0005400455395165373, + "loss": 0.0054, + "num_input_tokens_seen": 139436336, + "step": 64615 + }, + { + "epoch": 10.541598694942904, + "grad_norm": 0.2083173394203186, + "learning_rate": 0.0005399745879499641, + "loss": 0.0132, + "num_input_tokens_seen": 139447984, + "step": 64620 + }, + { + "epoch": 10.54241435562806, + "grad_norm": 0.06844881176948547, + "learning_rate": 0.0005399036355732552, + "loss": 0.0952, + "num_input_tokens_seen": 139459344, + "step": 64625 + }, + { + "epoch": 10.543230016313213, + "grad_norm": 0.005444588605314493, + "learning_rate": 0.0005398326823878482, + "loss": 0.0212, + "num_input_tokens_seen": 139470128, + "step": 64630 + }, + { + "epoch": 10.544045676998369, + "grad_norm": 0.02676192857325077, + "learning_rate": 0.0005397617283951816, + "loss": 0.0408, + "num_input_tokens_seen": 139480784, + "step": 64635 + }, + { + "epoch": 10.544861337683523, + "grad_norm": 0.002877091057598591, + "learning_rate": 0.000539690773596693, + "loss": 0.0255, + "num_input_tokens_seen": 139490608, + "step": 64640 + }, + { + "epoch": 10.545676998368679, + "grad_norm": 0.006319655571132898, + "learning_rate": 0.0005396198179938208, + "loss": 0.0825, + "num_input_tokens_seen": 139501360, + "step": 64645 + }, + { + "epoch": 10.546492659053834, + "grad_norm": 0.43092861771583557, + "learning_rate": 0.0005395488615880024, + "loss": 0.0826, + "num_input_tokens_seen": 139513040, + "step": 64650 + }, + { + "epoch": 10.547308319738988, + "grad_norm": 0.0025406531058251858, + "learning_rate": 0.0005394779043806764, + "loss": 0.0151, + "num_input_tokens_seen": 139523632, + "step": 64655 + }, + { + "epoch": 10.548123980424144, + "grad_norm": 0.0735795870423317, + "learning_rate": 0.0005394069463732805, + "loss": 0.0272, + "num_input_tokens_seen": 139534928, + "step": 64660 + }, + { + "epoch": 10.548939641109298, + "grad_norm": 0.14013820886611938, + "learning_rate": 0.0005393359875672527, + "loss": 0.0889, + "num_input_tokens_seen": 139546256, + "step": 64665 + }, + { + "epoch": 10.549755301794454, + "grad_norm": 0.07900779694318771, + "learning_rate": 0.0005392650279640314, + "loss": 0.0244, + "num_input_tokens_seen": 139557712, + "step": 64670 + }, + { + "epoch": 10.550570962479608, + "grad_norm": 0.481896311044693, + "learning_rate": 0.0005391940675650545, + "loss": 0.1276, + "num_input_tokens_seen": 139567984, + "step": 64675 + }, + { + "epoch": 10.551386623164763, + "grad_norm": 0.039957672357559204, + "learning_rate": 0.00053912310637176, + "loss": 0.0469, + "num_input_tokens_seen": 139579248, + "step": 64680 + }, + { + "epoch": 10.552202283849919, + "grad_norm": 0.006705658044666052, + "learning_rate": 0.0005390521443855861, + "loss": 0.1991, + "num_input_tokens_seen": 139590288, + "step": 64685 + }, + { + "epoch": 10.553017944535073, + "grad_norm": 0.26802974939346313, + "learning_rate": 0.0005389811816079711, + "loss": 0.1842, + "num_input_tokens_seen": 139601712, + "step": 64690 + }, + { + "epoch": 10.553833605220229, + "grad_norm": 0.0012546599609777331, + "learning_rate": 0.0005389102180403529, + "loss": 0.0148, + "num_input_tokens_seen": 139613424, + "step": 64695 + }, + { + "epoch": 10.554649265905383, + "grad_norm": 0.028730299323797226, + "learning_rate": 0.0005388392536841697, + "loss": 0.0163, + "num_input_tokens_seen": 139623696, + "step": 64700 + }, + { + "epoch": 10.555464926590538, + "grad_norm": 0.0012155961012467742, + "learning_rate": 0.00053876828854086, + "loss": 0.0773, + "num_input_tokens_seen": 139634800, + "step": 64705 + }, + { + "epoch": 10.556280587275694, + "grad_norm": 0.021575501188635826, + "learning_rate": 0.0005386973226118615, + "loss": 0.0095, + "num_input_tokens_seen": 139645392, + "step": 64710 + }, + { + "epoch": 10.557096247960848, + "grad_norm": 0.025280790403485298, + "learning_rate": 0.0005386263558986127, + "loss": 0.0084, + "num_input_tokens_seen": 139656144, + "step": 64715 + }, + { + "epoch": 10.557911908646004, + "grad_norm": 0.002455284586176276, + "learning_rate": 0.0005385553884025519, + "loss": 0.0026, + "num_input_tokens_seen": 139667632, + "step": 64720 + }, + { + "epoch": 10.558727569331158, + "grad_norm": 0.024287423118948936, + "learning_rate": 0.000538484420125117, + "loss": 0.0375, + "num_input_tokens_seen": 139679216, + "step": 64725 + }, + { + "epoch": 10.559543230016313, + "grad_norm": 0.013987001031637192, + "learning_rate": 0.0005384134510677468, + "loss": 0.0465, + "num_input_tokens_seen": 139690416, + "step": 64730 + }, + { + "epoch": 10.560358890701469, + "grad_norm": 0.008295894600450993, + "learning_rate": 0.0005383424812318791, + "loss": 0.0107, + "num_input_tokens_seen": 139702000, + "step": 64735 + }, + { + "epoch": 10.561174551386623, + "grad_norm": 0.0026020165532827377, + "learning_rate": 0.0005382715106189525, + "loss": 0.0847, + "num_input_tokens_seen": 139713168, + "step": 64740 + }, + { + "epoch": 10.561990212071779, + "grad_norm": 0.0010350581724196672, + "learning_rate": 0.0005382005392304051, + "loss": 0.0771, + "num_input_tokens_seen": 139722800, + "step": 64745 + }, + { + "epoch": 10.562805872756933, + "grad_norm": 0.0048143211752176285, + "learning_rate": 0.0005381295670676752, + "loss": 0.0032, + "num_input_tokens_seen": 139734000, + "step": 64750 + }, + { + "epoch": 10.563621533442088, + "grad_norm": 0.011510387994349003, + "learning_rate": 0.0005380585941322014, + "loss": 0.0072, + "num_input_tokens_seen": 139744784, + "step": 64755 + }, + { + "epoch": 10.564437194127244, + "grad_norm": 0.30869176983833313, + "learning_rate": 0.000537987620425422, + "loss": 0.1607, + "num_input_tokens_seen": 139757264, + "step": 64760 + }, + { + "epoch": 10.565252854812398, + "grad_norm": 0.01616785116493702, + "learning_rate": 0.0005379166459487752, + "loss": 0.072, + "num_input_tokens_seen": 139768912, + "step": 64765 + }, + { + "epoch": 10.566068515497554, + "grad_norm": 0.024335574358701706, + "learning_rate": 0.0005378456707036995, + "loss": 0.0313, + "num_input_tokens_seen": 139778800, + "step": 64770 + }, + { + "epoch": 10.566884176182707, + "grad_norm": 0.2745490074157715, + "learning_rate": 0.0005377746946916332, + "loss": 0.0674, + "num_input_tokens_seen": 139789392, + "step": 64775 + }, + { + "epoch": 10.567699836867863, + "grad_norm": 0.14419758319854736, + "learning_rate": 0.0005377037179140149, + "loss": 0.0887, + "num_input_tokens_seen": 139801136, + "step": 64780 + }, + { + "epoch": 10.568515497553017, + "grad_norm": 0.004434714559465647, + "learning_rate": 0.0005376327403722828, + "loss": 0.0103, + "num_input_tokens_seen": 139812560, + "step": 64785 + }, + { + "epoch": 10.569331158238173, + "grad_norm": 0.007009184919297695, + "learning_rate": 0.0005375617620678756, + "loss": 0.0841, + "num_input_tokens_seen": 139822832, + "step": 64790 + }, + { + "epoch": 10.570146818923329, + "grad_norm": 0.06770122796297073, + "learning_rate": 0.0005374907830022316, + "loss": 0.0565, + "num_input_tokens_seen": 139833488, + "step": 64795 + }, + { + "epoch": 10.570962479608482, + "grad_norm": 0.02894982509315014, + "learning_rate": 0.0005374198031767892, + "loss": 0.0072, + "num_input_tokens_seen": 139845296, + "step": 64800 + }, + { + "epoch": 10.571778140293638, + "grad_norm": 0.00904886331409216, + "learning_rate": 0.0005373488225929871, + "loss": 0.0082, + "num_input_tokens_seen": 139856112, + "step": 64805 + }, + { + "epoch": 10.572593800978792, + "grad_norm": 0.008890406228601933, + "learning_rate": 0.0005372778412522638, + "loss": 0.0038, + "num_input_tokens_seen": 139868560, + "step": 64810 + }, + { + "epoch": 10.573409461663948, + "grad_norm": 0.014970967546105385, + "learning_rate": 0.0005372068591560577, + "loss": 0.0074, + "num_input_tokens_seen": 139880592, + "step": 64815 + }, + { + "epoch": 10.574225122349104, + "grad_norm": 0.5227879881858826, + "learning_rate": 0.0005371358763058074, + "loss": 0.1011, + "num_input_tokens_seen": 139891440, + "step": 64820 + }, + { + "epoch": 10.575040783034257, + "grad_norm": 0.003705012146383524, + "learning_rate": 0.0005370648927029515, + "loss": 0.0741, + "num_input_tokens_seen": 139901904, + "step": 64825 + }, + { + "epoch": 10.575856443719413, + "grad_norm": 0.034133512526750565, + "learning_rate": 0.0005369939083489283, + "loss": 0.0881, + "num_input_tokens_seen": 139913936, + "step": 64830 + }, + { + "epoch": 10.576672104404567, + "grad_norm": 0.08127888292074203, + "learning_rate": 0.0005369229232451769, + "loss": 0.0431, + "num_input_tokens_seen": 139926224, + "step": 64835 + }, + { + "epoch": 10.577487765089723, + "grad_norm": 0.013978679664433002, + "learning_rate": 0.0005368519373931355, + "loss": 0.0785, + "num_input_tokens_seen": 139936976, + "step": 64840 + }, + { + "epoch": 10.578303425774878, + "grad_norm": 0.03245940059423447, + "learning_rate": 0.0005367809507942429, + "loss": 0.127, + "num_input_tokens_seen": 139947056, + "step": 64845 + }, + { + "epoch": 10.579119086460032, + "grad_norm": 0.008662707172334194, + "learning_rate": 0.0005367099634499375, + "loss": 0.0569, + "num_input_tokens_seen": 139957456, + "step": 64850 + }, + { + "epoch": 10.579934747145188, + "grad_norm": 0.017585407942533493, + "learning_rate": 0.0005366389753616583, + "loss": 0.0156, + "num_input_tokens_seen": 139967952, + "step": 64855 + }, + { + "epoch": 10.580750407830342, + "grad_norm": 0.005092840641736984, + "learning_rate": 0.0005365679865308437, + "loss": 0.02, + "num_input_tokens_seen": 139977776, + "step": 64860 + }, + { + "epoch": 10.581566068515498, + "grad_norm": 0.005094693973660469, + "learning_rate": 0.0005364969969589325, + "loss": 0.0242, + "num_input_tokens_seen": 139989136, + "step": 64865 + }, + { + "epoch": 10.582381729200652, + "grad_norm": 0.013519085012376308, + "learning_rate": 0.0005364260066473634, + "loss": 0.1222, + "num_input_tokens_seen": 139999504, + "step": 64870 + }, + { + "epoch": 10.583197389885807, + "grad_norm": 0.009106375277042389, + "learning_rate": 0.000536355015597575, + "loss": 0.0646, + "num_input_tokens_seen": 140011024, + "step": 64875 + }, + { + "epoch": 10.584013050570963, + "grad_norm": 0.009612289257347584, + "learning_rate": 0.0005362840238110061, + "loss": 0.0303, + "num_input_tokens_seen": 140021328, + "step": 64880 + }, + { + "epoch": 10.584828711256117, + "grad_norm": 0.05717439204454422, + "learning_rate": 0.0005362130312890955, + "loss": 0.1152, + "num_input_tokens_seen": 140033008, + "step": 64885 + }, + { + "epoch": 10.585644371941273, + "grad_norm": 0.17543534934520721, + "learning_rate": 0.0005361420380332818, + "loss": 0.0463, + "num_input_tokens_seen": 140042832, + "step": 64890 + }, + { + "epoch": 10.586460032626427, + "grad_norm": 0.21685631573200226, + "learning_rate": 0.0005360710440450037, + "loss": 0.0234, + "num_input_tokens_seen": 140053296, + "step": 64895 + }, + { + "epoch": 10.587275693311582, + "grad_norm": 0.008545550517737865, + "learning_rate": 0.0005360000493257003, + "loss": 0.0079, + "num_input_tokens_seen": 140064528, + "step": 64900 + }, + { + "epoch": 10.588091353996738, + "grad_norm": 0.10189548134803772, + "learning_rate": 0.0005359290538768102, + "loss": 0.1007, + "num_input_tokens_seen": 140074576, + "step": 64905 + }, + { + "epoch": 10.588907014681892, + "grad_norm": 0.05172334611415863, + "learning_rate": 0.0005358580576997723, + "loss": 0.1306, + "num_input_tokens_seen": 140084368, + "step": 64910 + }, + { + "epoch": 10.589722675367048, + "grad_norm": 0.013548692688345909, + "learning_rate": 0.0005357870607960255, + "loss": 0.0253, + "num_input_tokens_seen": 140094480, + "step": 64915 + }, + { + "epoch": 10.590538336052202, + "grad_norm": 0.875661313533783, + "learning_rate": 0.0005357160631670083, + "loss": 0.1559, + "num_input_tokens_seen": 140105776, + "step": 64920 + }, + { + "epoch": 10.591353996737357, + "grad_norm": 0.007097321562469006, + "learning_rate": 0.0005356450648141599, + "loss": 0.022, + "num_input_tokens_seen": 140116400, + "step": 64925 + }, + { + "epoch": 10.592169657422513, + "grad_norm": 0.20355239510536194, + "learning_rate": 0.0005355740657389189, + "loss": 0.0528, + "num_input_tokens_seen": 140127632, + "step": 64930 + }, + { + "epoch": 10.592985318107667, + "grad_norm": 0.007212704513221979, + "learning_rate": 0.0005355030659427245, + "loss": 0.064, + "num_input_tokens_seen": 140137072, + "step": 64935 + }, + { + "epoch": 10.593800978792823, + "grad_norm": 0.019816026091575623, + "learning_rate": 0.0005354320654270153, + "loss": 0.0743, + "num_input_tokens_seen": 140147088, + "step": 64940 + }, + { + "epoch": 10.594616639477977, + "grad_norm": 0.015937440097332, + "learning_rate": 0.0005353610641932304, + "loss": 0.0987, + "num_input_tokens_seen": 140158064, + "step": 64945 + }, + { + "epoch": 10.595432300163132, + "grad_norm": 0.007632109336555004, + "learning_rate": 0.0005352900622428086, + "loss": 0.0083, + "num_input_tokens_seen": 140168688, + "step": 64950 + }, + { + "epoch": 10.596247960848288, + "grad_norm": 0.04458140581846237, + "learning_rate": 0.0005352190595771889, + "loss": 0.0212, + "num_input_tokens_seen": 140179024, + "step": 64955 + }, + { + "epoch": 10.597063621533442, + "grad_norm": 0.06280036270618439, + "learning_rate": 0.0005351480561978103, + "loss": 0.0336, + "num_input_tokens_seen": 140190064, + "step": 64960 + }, + { + "epoch": 10.597879282218598, + "grad_norm": 0.3518803119659424, + "learning_rate": 0.0005350770521061118, + "loss": 0.0646, + "num_input_tokens_seen": 140202160, + "step": 64965 + }, + { + "epoch": 10.598694942903752, + "grad_norm": 0.006473650690168142, + "learning_rate": 0.0005350060473035324, + "loss": 0.0592, + "num_input_tokens_seen": 140212304, + "step": 64970 + }, + { + "epoch": 10.599510603588907, + "grad_norm": 0.05138443782925606, + "learning_rate": 0.000534935041791511, + "loss": 0.0199, + "num_input_tokens_seen": 140223152, + "step": 64975 + }, + { + "epoch": 10.600326264274061, + "grad_norm": 0.00609197374433279, + "learning_rate": 0.0005348640355714866, + "loss": 0.1194, + "num_input_tokens_seen": 140233872, + "step": 64980 + }, + { + "epoch": 10.601141924959217, + "grad_norm": 0.25754228234291077, + "learning_rate": 0.0005347930286448984, + "loss": 0.0443, + "num_input_tokens_seen": 140244464, + "step": 64985 + }, + { + "epoch": 10.601957585644373, + "grad_norm": 0.14046022295951843, + "learning_rate": 0.0005347220210131853, + "loss": 0.0228, + "num_input_tokens_seen": 140253552, + "step": 64990 + }, + { + "epoch": 10.602773246329527, + "grad_norm": 0.25373032689094543, + "learning_rate": 0.0005346510126777864, + "loss": 0.041, + "num_input_tokens_seen": 140264144, + "step": 64995 + }, + { + "epoch": 10.603588907014682, + "grad_norm": 0.017920494079589844, + "learning_rate": 0.0005345800036401407, + "loss": 0.0202, + "num_input_tokens_seen": 140272784, + "step": 65000 + }, + { + "epoch": 10.604404567699836, + "grad_norm": 0.27044835686683655, + "learning_rate": 0.0005345089939016874, + "loss": 0.0776, + "num_input_tokens_seen": 140282768, + "step": 65005 + }, + { + "epoch": 10.605220228384992, + "grad_norm": 0.003131135832518339, + "learning_rate": 0.0005344379834638656, + "loss": 0.0264, + "num_input_tokens_seen": 140291952, + "step": 65010 + }, + { + "epoch": 10.606035889070148, + "grad_norm": 0.003275089431554079, + "learning_rate": 0.0005343669723281144, + "loss": 0.0907, + "num_input_tokens_seen": 140303568, + "step": 65015 + }, + { + "epoch": 10.606851549755302, + "grad_norm": 0.02005787007510662, + "learning_rate": 0.0005342959604958728, + "loss": 0.037, + "num_input_tokens_seen": 140314960, + "step": 65020 + }, + { + "epoch": 10.607667210440457, + "grad_norm": 0.2630772888660431, + "learning_rate": 0.0005342249479685801, + "loss": 0.0911, + "num_input_tokens_seen": 140325936, + "step": 65025 + }, + { + "epoch": 10.608482871125611, + "grad_norm": 0.010468282736837864, + "learning_rate": 0.0005341539347476754, + "loss": 0.0108, + "num_input_tokens_seen": 140337392, + "step": 65030 + }, + { + "epoch": 10.609298531810767, + "grad_norm": 0.3438645005226135, + "learning_rate": 0.0005340829208345979, + "loss": 0.0821, + "num_input_tokens_seen": 140349488, + "step": 65035 + }, + { + "epoch": 10.61011419249592, + "grad_norm": 0.3162006735801697, + "learning_rate": 0.0005340119062307866, + "loss": 0.0532, + "num_input_tokens_seen": 140362160, + "step": 65040 + }, + { + "epoch": 10.610929853181077, + "grad_norm": 0.03874239698052406, + "learning_rate": 0.0005339408909376812, + "loss": 0.2191, + "num_input_tokens_seen": 140373904, + "step": 65045 + }, + { + "epoch": 10.611745513866232, + "grad_norm": 0.03771042078733444, + "learning_rate": 0.0005338698749567203, + "loss": 0.0352, + "num_input_tokens_seen": 140384240, + "step": 65050 + }, + { + "epoch": 10.612561174551386, + "grad_norm": 0.02035333774983883, + "learning_rate": 0.0005337988582893436, + "loss": 0.1032, + "num_input_tokens_seen": 140396144, + "step": 65055 + }, + { + "epoch": 10.613376835236542, + "grad_norm": 0.02691297046840191, + "learning_rate": 0.0005337278409369901, + "loss": 0.0118, + "num_input_tokens_seen": 140407536, + "step": 65060 + }, + { + "epoch": 10.614192495921696, + "grad_norm": 0.011075315997004509, + "learning_rate": 0.0005336568229010991, + "loss": 0.0086, + "num_input_tokens_seen": 140417616, + "step": 65065 + }, + { + "epoch": 10.615008156606851, + "grad_norm": 0.008334855549037457, + "learning_rate": 0.0005335858041831099, + "loss": 0.0113, + "num_input_tokens_seen": 140429072, + "step": 65070 + }, + { + "epoch": 10.615823817292007, + "grad_norm": 0.0823870450258255, + "learning_rate": 0.0005335147847844618, + "loss": 0.0257, + "num_input_tokens_seen": 140440016, + "step": 65075 + }, + { + "epoch": 10.616639477977161, + "grad_norm": 0.03910454735159874, + "learning_rate": 0.000533443764706594, + "loss": 0.0695, + "num_input_tokens_seen": 140450672, + "step": 65080 + }, + { + "epoch": 10.617455138662317, + "grad_norm": 0.04419783130288124, + "learning_rate": 0.0005333727439509459, + "loss": 0.0234, + "num_input_tokens_seen": 140460944, + "step": 65085 + }, + { + "epoch": 10.61827079934747, + "grad_norm": 0.01412450522184372, + "learning_rate": 0.0005333017225189569, + "loss": 0.0127, + "num_input_tokens_seen": 140471280, + "step": 65090 + }, + { + "epoch": 10.619086460032626, + "grad_norm": 0.015574944205582142, + "learning_rate": 0.0005332307004120662, + "loss": 0.0341, + "num_input_tokens_seen": 140481584, + "step": 65095 + }, + { + "epoch": 10.619902120717782, + "grad_norm": 0.36479663848876953, + "learning_rate": 0.0005331596776317133, + "loss": 0.0491, + "num_input_tokens_seen": 140493104, + "step": 65100 + }, + { + "epoch": 10.620717781402936, + "grad_norm": 0.0037067474331706762, + "learning_rate": 0.0005330886541793372, + "loss": 0.0227, + "num_input_tokens_seen": 140503952, + "step": 65105 + }, + { + "epoch": 10.621533442088092, + "grad_norm": 0.0289887934923172, + "learning_rate": 0.0005330176300563778, + "loss": 0.0106, + "num_input_tokens_seen": 140514256, + "step": 65110 + }, + { + "epoch": 10.622349102773246, + "grad_norm": 0.19536904990673065, + "learning_rate": 0.0005329466052642741, + "loss": 0.0595, + "num_input_tokens_seen": 140523984, + "step": 65115 + }, + { + "epoch": 10.623164763458401, + "grad_norm": 0.00966187659651041, + "learning_rate": 0.0005328755798044658, + "loss": 0.0044, + "num_input_tokens_seen": 140535504, + "step": 65120 + }, + { + "epoch": 10.623980424143557, + "grad_norm": 0.3116471469402313, + "learning_rate": 0.000532804553678392, + "loss": 0.0971, + "num_input_tokens_seen": 140547120, + "step": 65125 + }, + { + "epoch": 10.624796084828711, + "grad_norm": 0.012109462171792984, + "learning_rate": 0.0005327335268874924, + "loss": 0.0135, + "num_input_tokens_seen": 140557200, + "step": 65130 + }, + { + "epoch": 10.625611745513867, + "grad_norm": 0.003591384505853057, + "learning_rate": 0.0005326624994332063, + "loss": 0.0088, + "num_input_tokens_seen": 140567696, + "step": 65135 + }, + { + "epoch": 10.62642740619902, + "grad_norm": 0.0023129871115088463, + "learning_rate": 0.0005325914713169733, + "loss": 0.0112, + "num_input_tokens_seen": 140578224, + "step": 65140 + }, + { + "epoch": 10.627243066884176, + "grad_norm": 0.5553701519966125, + "learning_rate": 0.0005325204425402327, + "loss": 0.0827, + "num_input_tokens_seen": 140588304, + "step": 65145 + }, + { + "epoch": 10.62805872756933, + "grad_norm": 0.3009636700153351, + "learning_rate": 0.0005324494131044241, + "loss": 0.0864, + "num_input_tokens_seen": 140598288, + "step": 65150 + }, + { + "epoch": 10.628874388254486, + "grad_norm": 0.31743693351745605, + "learning_rate": 0.000532378383010987, + "loss": 0.1665, + "num_input_tokens_seen": 140608080, + "step": 65155 + }, + { + "epoch": 10.629690048939642, + "grad_norm": 0.02988928183913231, + "learning_rate": 0.0005323073522613608, + "loss": 0.0232, + "num_input_tokens_seen": 140618512, + "step": 65160 + }, + { + "epoch": 10.630505709624796, + "grad_norm": 0.037130363285541534, + "learning_rate": 0.0005322363208569851, + "loss": 0.0192, + "num_input_tokens_seen": 140630096, + "step": 65165 + }, + { + "epoch": 10.631321370309951, + "grad_norm": 0.02131594344973564, + "learning_rate": 0.0005321652887992996, + "loss": 0.0644, + "num_input_tokens_seen": 140640368, + "step": 65170 + }, + { + "epoch": 10.632137030995105, + "grad_norm": 0.0175090953707695, + "learning_rate": 0.0005320942560897436, + "loss": 0.0252, + "num_input_tokens_seen": 140651408, + "step": 65175 + }, + { + "epoch": 10.632952691680261, + "grad_norm": 0.13203127682209015, + "learning_rate": 0.0005320232227297569, + "loss": 0.0209, + "num_input_tokens_seen": 140661296, + "step": 65180 + }, + { + "epoch": 10.633768352365417, + "grad_norm": 0.005023539531975985, + "learning_rate": 0.0005319521887207789, + "loss": 0.0667, + "num_input_tokens_seen": 140671088, + "step": 65185 + }, + { + "epoch": 10.63458401305057, + "grad_norm": 0.004876819904893637, + "learning_rate": 0.0005318811540642493, + "loss": 0.0431, + "num_input_tokens_seen": 140682288, + "step": 65190 + }, + { + "epoch": 10.635399673735726, + "grad_norm": 0.2825918197631836, + "learning_rate": 0.0005318101187616077, + "loss": 0.1894, + "num_input_tokens_seen": 140692080, + "step": 65195 + }, + { + "epoch": 10.63621533442088, + "grad_norm": 0.0028557758778333664, + "learning_rate": 0.0005317390828142937, + "loss": 0.0272, + "num_input_tokens_seen": 140702640, + "step": 65200 + }, + { + "epoch": 10.637030995106036, + "grad_norm": 0.134634330868721, + "learning_rate": 0.0005316680462237468, + "loss": 0.1702, + "num_input_tokens_seen": 140712880, + "step": 65205 + }, + { + "epoch": 10.63784665579119, + "grad_norm": 0.09145954996347427, + "learning_rate": 0.0005315970089914068, + "loss": 0.0206, + "num_input_tokens_seen": 140724496, + "step": 65210 + }, + { + "epoch": 10.638662316476346, + "grad_norm": 0.018421538174152374, + "learning_rate": 0.0005315259711187134, + "loss": 0.038, + "num_input_tokens_seen": 140734096, + "step": 65215 + }, + { + "epoch": 10.639477977161501, + "grad_norm": 0.016156505793333054, + "learning_rate": 0.0005314549326071061, + "loss": 0.0445, + "num_input_tokens_seen": 140745136, + "step": 65220 + }, + { + "epoch": 10.640293637846655, + "grad_norm": 0.002987145446240902, + "learning_rate": 0.0005313838934580248, + "loss": 0.0065, + "num_input_tokens_seen": 140755088, + "step": 65225 + }, + { + "epoch": 10.641109298531811, + "grad_norm": 0.20771150290966034, + "learning_rate": 0.0005313128536729091, + "loss": 0.0163, + "num_input_tokens_seen": 140766416, + "step": 65230 + }, + { + "epoch": 10.641924959216965, + "grad_norm": 0.10230425745248795, + "learning_rate": 0.0005312418132531985, + "loss": 0.0297, + "num_input_tokens_seen": 140777520, + "step": 65235 + }, + { + "epoch": 10.64274061990212, + "grad_norm": 0.2810080051422119, + "learning_rate": 0.0005311707722003332, + "loss": 0.0531, + "num_input_tokens_seen": 140787632, + "step": 65240 + }, + { + "epoch": 10.643556280587276, + "grad_norm": 0.6550468802452087, + "learning_rate": 0.0005310997305157524, + "loss": 0.0908, + "num_input_tokens_seen": 140798896, + "step": 65245 + }, + { + "epoch": 10.64437194127243, + "grad_norm": 0.4172266125679016, + "learning_rate": 0.0005310286882008962, + "loss": 0.1904, + "num_input_tokens_seen": 140809232, + "step": 65250 + }, + { + "epoch": 10.645187601957586, + "grad_norm": 0.2950693964958191, + "learning_rate": 0.0005309576452572043, + "loss": 0.1105, + "num_input_tokens_seen": 140821776, + "step": 65255 + }, + { + "epoch": 10.64600326264274, + "grad_norm": 0.023870423436164856, + "learning_rate": 0.0005308866016861166, + "loss": 0.0157, + "num_input_tokens_seen": 140831856, + "step": 65260 + }, + { + "epoch": 10.646818923327896, + "grad_norm": 0.15567582845687866, + "learning_rate": 0.0005308155574890725, + "loss": 0.0284, + "num_input_tokens_seen": 140842896, + "step": 65265 + }, + { + "epoch": 10.647634584013051, + "grad_norm": 0.033467598259449005, + "learning_rate": 0.000530744512667512, + "loss": 0.0068, + "num_input_tokens_seen": 140852336, + "step": 65270 + }, + { + "epoch": 10.648450244698205, + "grad_norm": 0.44852450489997864, + "learning_rate": 0.0005306734672228751, + "loss": 0.0973, + "num_input_tokens_seen": 140864816, + "step": 65275 + }, + { + "epoch": 10.649265905383361, + "grad_norm": 0.0037650573067367077, + "learning_rate": 0.0005306024211566014, + "loss": 0.0598, + "num_input_tokens_seen": 140875088, + "step": 65280 + }, + { + "epoch": 10.650081566068515, + "grad_norm": 0.2712796628475189, + "learning_rate": 0.0005305313744701309, + "loss": 0.1506, + "num_input_tokens_seen": 140886544, + "step": 65285 + }, + { + "epoch": 10.65089722675367, + "grad_norm": 0.07346566021442413, + "learning_rate": 0.0005304603271649033, + "loss": 0.0193, + "num_input_tokens_seen": 140897008, + "step": 65290 + }, + { + "epoch": 10.651712887438826, + "grad_norm": 0.3076639771461487, + "learning_rate": 0.0005303892792423585, + "loss": 0.0356, + "num_input_tokens_seen": 140906960, + "step": 65295 + }, + { + "epoch": 10.65252854812398, + "grad_norm": 0.34749335050582886, + "learning_rate": 0.0005303182307039364, + "loss": 0.1172, + "num_input_tokens_seen": 140917808, + "step": 65300 + }, + { + "epoch": 10.653344208809136, + "grad_norm": 0.20521672070026398, + "learning_rate": 0.0005302471815510771, + "loss": 0.052, + "num_input_tokens_seen": 140929168, + "step": 65305 + }, + { + "epoch": 10.65415986949429, + "grad_norm": 0.003810715628787875, + "learning_rate": 0.00053017613178522, + "loss": 0.008, + "num_input_tokens_seen": 140938832, + "step": 65310 + }, + { + "epoch": 10.654975530179446, + "grad_norm": 0.00932942796498537, + "learning_rate": 0.0005301050814078055, + "loss": 0.0218, + "num_input_tokens_seen": 140949328, + "step": 65315 + }, + { + "epoch": 10.655791190864601, + "grad_norm": 0.007939960807561874, + "learning_rate": 0.0005300340304202734, + "loss": 0.0159, + "num_input_tokens_seen": 140958256, + "step": 65320 + }, + { + "epoch": 10.656606851549755, + "grad_norm": 0.005125361494719982, + "learning_rate": 0.0005299629788240634, + "loss": 0.0103, + "num_input_tokens_seen": 140968848, + "step": 65325 + }, + { + "epoch": 10.65742251223491, + "grad_norm": 0.15519355237483978, + "learning_rate": 0.0005298919266206157, + "loss": 0.0169, + "num_input_tokens_seen": 140979760, + "step": 65330 + }, + { + "epoch": 10.658238172920065, + "grad_norm": 0.00500165531411767, + "learning_rate": 0.0005298208738113701, + "loss": 0.0207, + "num_input_tokens_seen": 140990960, + "step": 65335 + }, + { + "epoch": 10.65905383360522, + "grad_norm": 0.004022952634841204, + "learning_rate": 0.0005297498203977668, + "loss": 0.0101, + "num_input_tokens_seen": 141002960, + "step": 65340 + }, + { + "epoch": 10.659869494290374, + "grad_norm": 0.005010330583900213, + "learning_rate": 0.0005296787663812456, + "loss": 0.026, + "num_input_tokens_seen": 141013360, + "step": 65345 + }, + { + "epoch": 10.66068515497553, + "grad_norm": 0.004449171479791403, + "learning_rate": 0.0005296077117632464, + "loss": 0.0334, + "num_input_tokens_seen": 141025264, + "step": 65350 + }, + { + "epoch": 10.661500815660686, + "grad_norm": 0.004114639945328236, + "learning_rate": 0.0005295366565452094, + "loss": 0.0052, + "num_input_tokens_seen": 141036048, + "step": 65355 + }, + { + "epoch": 10.66231647634584, + "grad_norm": 0.039192333817481995, + "learning_rate": 0.0005294656007285748, + "loss": 0.0205, + "num_input_tokens_seen": 141047120, + "step": 65360 + }, + { + "epoch": 10.663132137030995, + "grad_norm": 0.3510650098323822, + "learning_rate": 0.0005293945443147821, + "loss": 0.1974, + "num_input_tokens_seen": 141056464, + "step": 65365 + }, + { + "epoch": 10.66394779771615, + "grad_norm": 0.017688889056444168, + "learning_rate": 0.000529323487305272, + "loss": 0.0236, + "num_input_tokens_seen": 141066992, + "step": 65370 + }, + { + "epoch": 10.664763458401305, + "grad_norm": 0.32009556889533997, + "learning_rate": 0.0005292524297014842, + "loss": 0.0202, + "num_input_tokens_seen": 141077008, + "step": 65375 + }, + { + "epoch": 10.66557911908646, + "grad_norm": 0.714776873588562, + "learning_rate": 0.0005291813715048584, + "loss": 0.1132, + "num_input_tokens_seen": 141086160, + "step": 65380 + }, + { + "epoch": 10.666394779771615, + "grad_norm": 0.005010640248656273, + "learning_rate": 0.0005291103127168355, + "loss": 0.1048, + "num_input_tokens_seen": 141096560, + "step": 65385 + }, + { + "epoch": 10.66721044045677, + "grad_norm": 0.030881229788064957, + "learning_rate": 0.000529039253338855, + "loss": 0.0066, + "num_input_tokens_seen": 141108304, + "step": 65390 + }, + { + "epoch": 10.668026101141924, + "grad_norm": 0.0019811608362942934, + "learning_rate": 0.0005289681933723573, + "loss": 0.0027, + "num_input_tokens_seen": 141118256, + "step": 65395 + }, + { + "epoch": 10.66884176182708, + "grad_norm": 0.0787934735417366, + "learning_rate": 0.0005288971328187824, + "loss": 0.012, + "num_input_tokens_seen": 141129456, + "step": 65400 + }, + { + "epoch": 10.669657422512234, + "grad_norm": 0.0394597128033638, + "learning_rate": 0.0005288260716795704, + "loss": 0.0261, + "num_input_tokens_seen": 141140208, + "step": 65405 + }, + { + "epoch": 10.67047308319739, + "grad_norm": 0.01991061493754387, + "learning_rate": 0.0005287550099561614, + "loss": 0.0211, + "num_input_tokens_seen": 141152144, + "step": 65410 + }, + { + "epoch": 10.671288743882545, + "grad_norm": 0.4816165268421173, + "learning_rate": 0.0005286839476499959, + "loss": 0.0311, + "num_input_tokens_seen": 141163696, + "step": 65415 + }, + { + "epoch": 10.6721044045677, + "grad_norm": 0.3137000501155853, + "learning_rate": 0.0005286128847625136, + "loss": 0.0445, + "num_input_tokens_seen": 141173488, + "step": 65420 + }, + { + "epoch": 10.672920065252855, + "grad_norm": 0.19126470386981964, + "learning_rate": 0.0005285418212951549, + "loss": 0.0524, + "num_input_tokens_seen": 141182320, + "step": 65425 + }, + { + "epoch": 10.673735725938009, + "grad_norm": 0.06783124059438705, + "learning_rate": 0.0005284707572493601, + "loss": 0.0271, + "num_input_tokens_seen": 141191824, + "step": 65430 + }, + { + "epoch": 10.674551386623165, + "grad_norm": 0.12583132088184357, + "learning_rate": 0.0005283996926265692, + "loss": 0.0353, + "num_input_tokens_seen": 141202864, + "step": 65435 + }, + { + "epoch": 10.67536704730832, + "grad_norm": 0.02210039459168911, + "learning_rate": 0.0005283286274282226, + "loss": 0.0219, + "num_input_tokens_seen": 141213968, + "step": 65440 + }, + { + "epoch": 10.676182707993474, + "grad_norm": 0.0048767877742648125, + "learning_rate": 0.0005282575616557603, + "loss": 0.0202, + "num_input_tokens_seen": 141224624, + "step": 65445 + }, + { + "epoch": 10.67699836867863, + "grad_norm": 0.0013604401610791683, + "learning_rate": 0.0005281864953106226, + "loss": 0.0027, + "num_input_tokens_seen": 141236336, + "step": 65450 + }, + { + "epoch": 10.677814029363784, + "grad_norm": 0.0004915795871056616, + "learning_rate": 0.0005281154283942501, + "loss": 0.1381, + "num_input_tokens_seen": 141247408, + "step": 65455 + }, + { + "epoch": 10.67862969004894, + "grad_norm": 0.5550827980041504, + "learning_rate": 0.0005280443609080826, + "loss": 0.1238, + "num_input_tokens_seen": 141259760, + "step": 65460 + }, + { + "epoch": 10.679445350734095, + "grad_norm": 0.0446750670671463, + "learning_rate": 0.0005279732928535606, + "loss": 0.0167, + "num_input_tokens_seen": 141270960, + "step": 65465 + }, + { + "epoch": 10.68026101141925, + "grad_norm": 0.03571578115224838, + "learning_rate": 0.0005279022242321242, + "loss": 0.0138, + "num_input_tokens_seen": 141280976, + "step": 65470 + }, + { + "epoch": 10.681076672104405, + "grad_norm": 0.04581819102168083, + "learning_rate": 0.000527831155045214, + "loss": 0.0063, + "num_input_tokens_seen": 141290320, + "step": 65475 + }, + { + "epoch": 10.681892332789559, + "grad_norm": 0.017313268035650253, + "learning_rate": 0.00052776008529427, + "loss": 0.0072, + "num_input_tokens_seen": 141301488, + "step": 65480 + }, + { + "epoch": 10.682707993474715, + "grad_norm": 0.008808505721390247, + "learning_rate": 0.0005276890149807326, + "loss": 0.0195, + "num_input_tokens_seen": 141312368, + "step": 65485 + }, + { + "epoch": 10.68352365415987, + "grad_norm": 0.05875592306256294, + "learning_rate": 0.0005276179441060423, + "loss": 0.0166, + "num_input_tokens_seen": 141323760, + "step": 65490 + }, + { + "epoch": 10.684339314845024, + "grad_norm": 0.011892681010067463, + "learning_rate": 0.0005275468726716393, + "loss": 0.0343, + "num_input_tokens_seen": 141335760, + "step": 65495 + }, + { + "epoch": 10.68515497553018, + "grad_norm": 0.011522164568305016, + "learning_rate": 0.000527475800678964, + "loss": 0.01, + "num_input_tokens_seen": 141346704, + "step": 65500 + }, + { + "epoch": 10.685970636215334, + "grad_norm": 0.000775989901740104, + "learning_rate": 0.0005274047281294569, + "loss": 0.0237, + "num_input_tokens_seen": 141357840, + "step": 65505 + }, + { + "epoch": 10.68678629690049, + "grad_norm": 0.13805875182151794, + "learning_rate": 0.000527333655024558, + "loss": 0.0121, + "num_input_tokens_seen": 141368144, + "step": 65510 + }, + { + "epoch": 10.687601957585644, + "grad_norm": 0.4413118362426758, + "learning_rate": 0.0005272625813657079, + "loss": 0.0568, + "num_input_tokens_seen": 141379248, + "step": 65515 + }, + { + "epoch": 10.6884176182708, + "grad_norm": 0.0014964413130655885, + "learning_rate": 0.000527191507154347, + "loss": 0.0597, + "num_input_tokens_seen": 141390544, + "step": 65520 + }, + { + "epoch": 10.689233278955955, + "grad_norm": 0.3602645695209503, + "learning_rate": 0.0005271204323919158, + "loss": 0.3272, + "num_input_tokens_seen": 141400976, + "step": 65525 + }, + { + "epoch": 10.690048939641109, + "grad_norm": 0.0027618997264653444, + "learning_rate": 0.0005270493570798546, + "loss": 0.1172, + "num_input_tokens_seen": 141410608, + "step": 65530 + }, + { + "epoch": 10.690864600326265, + "grad_norm": 0.0023711903486400843, + "learning_rate": 0.000526978281219604, + "loss": 0.0117, + "num_input_tokens_seen": 141420816, + "step": 65535 + }, + { + "epoch": 10.691680261011419, + "grad_norm": 0.006863425485789776, + "learning_rate": 0.0005269072048126041, + "loss": 0.0102, + "num_input_tokens_seen": 141431920, + "step": 65540 + }, + { + "epoch": 10.692495921696574, + "grad_norm": 0.0489024855196476, + "learning_rate": 0.0005268361278602957, + "loss": 0.0121, + "num_input_tokens_seen": 141441136, + "step": 65545 + }, + { + "epoch": 10.69331158238173, + "grad_norm": 0.010894126258790493, + "learning_rate": 0.0005267650503641191, + "loss": 0.0398, + "num_input_tokens_seen": 141451760, + "step": 65550 + }, + { + "epoch": 10.694127243066884, + "grad_norm": 0.3488101065158844, + "learning_rate": 0.0005266939723255148, + "loss": 0.0555, + "num_input_tokens_seen": 141462384, + "step": 65555 + }, + { + "epoch": 10.69494290375204, + "grad_norm": 0.033751048147678375, + "learning_rate": 0.0005266228937459233, + "loss": 0.0134, + "num_input_tokens_seen": 141471984, + "step": 65560 + }, + { + "epoch": 10.695758564437194, + "grad_norm": 0.02117815800011158, + "learning_rate": 0.0005265518146267851, + "loss": 0.0729, + "num_input_tokens_seen": 141482384, + "step": 65565 + }, + { + "epoch": 10.69657422512235, + "grad_norm": 0.008152129128575325, + "learning_rate": 0.0005264807349695406, + "loss": 0.0239, + "num_input_tokens_seen": 141492848, + "step": 65570 + }, + { + "epoch": 10.697389885807503, + "grad_norm": 0.11187044531106949, + "learning_rate": 0.0005264096547756305, + "loss": 0.0586, + "num_input_tokens_seen": 141502864, + "step": 65575 + }, + { + "epoch": 10.698205546492659, + "grad_norm": 0.0010355414124205709, + "learning_rate": 0.0005263385740464951, + "loss": 0.0027, + "num_input_tokens_seen": 141513296, + "step": 65580 + }, + { + "epoch": 10.699021207177815, + "grad_norm": 0.10060343891382217, + "learning_rate": 0.0005262674927835752, + "loss": 0.0703, + "num_input_tokens_seen": 141524496, + "step": 65585 + }, + { + "epoch": 10.699836867862969, + "grad_norm": 0.2619067132472992, + "learning_rate": 0.0005261964109883111, + "loss": 0.0687, + "num_input_tokens_seen": 141534576, + "step": 65590 + }, + { + "epoch": 10.700652528548124, + "grad_norm": 0.0013153153704479337, + "learning_rate": 0.0005261253286621437, + "loss": 0.0111, + "num_input_tokens_seen": 141545104, + "step": 65595 + }, + { + "epoch": 10.701468189233278, + "grad_norm": 0.0005263492930680513, + "learning_rate": 0.0005260542458065132, + "loss": 0.0072, + "num_input_tokens_seen": 141556528, + "step": 65600 + }, + { + "epoch": 10.702283849918434, + "grad_norm": 0.1792118400335312, + "learning_rate": 0.0005259831624228605, + "loss": 0.0478, + "num_input_tokens_seen": 141567600, + "step": 65605 + }, + { + "epoch": 10.70309951060359, + "grad_norm": 0.16205546259880066, + "learning_rate": 0.000525912078512626, + "loss": 0.0225, + "num_input_tokens_seen": 141579088, + "step": 65610 + }, + { + "epoch": 10.703915171288743, + "grad_norm": 0.06031881272792816, + "learning_rate": 0.0005258409940772504, + "loss": 0.0355, + "num_input_tokens_seen": 141589808, + "step": 65615 + }, + { + "epoch": 10.7047308319739, + "grad_norm": 0.03302892670035362, + "learning_rate": 0.0005257699091181742, + "loss": 0.0157, + "num_input_tokens_seen": 141600592, + "step": 65620 + }, + { + "epoch": 10.705546492659053, + "grad_norm": 0.003944904077798128, + "learning_rate": 0.0005256988236368382, + "loss": 0.0133, + "num_input_tokens_seen": 141610928, + "step": 65625 + }, + { + "epoch": 10.706362153344209, + "grad_norm": 0.0008393623866140842, + "learning_rate": 0.0005256277376346829, + "loss": 0.0294, + "num_input_tokens_seen": 141622384, + "step": 65630 + }, + { + "epoch": 10.707177814029365, + "grad_norm": 0.006369807291775942, + "learning_rate": 0.0005255566511131489, + "loss": 0.0028, + "num_input_tokens_seen": 141632144, + "step": 65635 + }, + { + "epoch": 10.707993474714518, + "grad_norm": 0.006090414244681597, + "learning_rate": 0.000525485564073677, + "loss": 0.0242, + "num_input_tokens_seen": 141642608, + "step": 65640 + }, + { + "epoch": 10.708809135399674, + "grad_norm": 0.0005646261852234602, + "learning_rate": 0.0005254144765177078, + "loss": 0.0163, + "num_input_tokens_seen": 141654064, + "step": 65645 + }, + { + "epoch": 10.709624796084828, + "grad_norm": 0.31497055292129517, + "learning_rate": 0.0005253433884466821, + "loss": 0.1425, + "num_input_tokens_seen": 141663984, + "step": 65650 + }, + { + "epoch": 10.710440456769984, + "grad_norm": 0.1252189576625824, + "learning_rate": 0.0005252722998620403, + "loss": 0.0159, + "num_input_tokens_seen": 141673104, + "step": 65655 + }, + { + "epoch": 10.71125611745514, + "grad_norm": 0.024731872603297234, + "learning_rate": 0.0005252012107652234, + "loss": 0.1377, + "num_input_tokens_seen": 141684784, + "step": 65660 + }, + { + "epoch": 10.712071778140293, + "grad_norm": 0.0019005038775503635, + "learning_rate": 0.0005251301211576718, + "loss": 0.0025, + "num_input_tokens_seen": 141696560, + "step": 65665 + }, + { + "epoch": 10.71288743882545, + "grad_norm": 0.2789454758167267, + "learning_rate": 0.0005250590310408266, + "loss": 0.046, + "num_input_tokens_seen": 141707824, + "step": 65670 + }, + { + "epoch": 10.713703099510603, + "grad_norm": 0.0026120473630726337, + "learning_rate": 0.0005249879404161284, + "loss": 0.0258, + "num_input_tokens_seen": 141719088, + "step": 65675 + }, + { + "epoch": 10.714518760195759, + "grad_norm": 0.03605186939239502, + "learning_rate": 0.0005249168492850178, + "loss": 0.0109, + "num_input_tokens_seen": 141729840, + "step": 65680 + }, + { + "epoch": 10.715334420880914, + "grad_norm": 0.5007907152175903, + "learning_rate": 0.0005248457576489356, + "loss": 0.1001, + "num_input_tokens_seen": 141740720, + "step": 65685 + }, + { + "epoch": 10.716150081566068, + "grad_norm": 0.013331322930753231, + "learning_rate": 0.0005247746655093228, + "loss": 0.0605, + "num_input_tokens_seen": 141752752, + "step": 65690 + }, + { + "epoch": 10.716965742251224, + "grad_norm": 0.15283232927322388, + "learning_rate": 0.0005247035728676196, + "loss": 0.0177, + "num_input_tokens_seen": 141762896, + "step": 65695 + }, + { + "epoch": 10.717781402936378, + "grad_norm": 0.004067111294716597, + "learning_rate": 0.0005246324797252674, + "loss": 0.0205, + "num_input_tokens_seen": 141774224, + "step": 65700 + }, + { + "epoch": 10.718597063621534, + "grad_norm": 0.01266746036708355, + "learning_rate": 0.0005245613860837068, + "loss": 0.0357, + "num_input_tokens_seen": 141784688, + "step": 65705 + }, + { + "epoch": 10.719412724306688, + "grad_norm": 0.002115273382514715, + "learning_rate": 0.0005244902919443785, + "loss": 0.0121, + "num_input_tokens_seen": 141794768, + "step": 65710 + }, + { + "epoch": 10.720228384991843, + "grad_norm": 0.380673348903656, + "learning_rate": 0.0005244191973087233, + "loss": 0.0308, + "num_input_tokens_seen": 141806480, + "step": 65715 + }, + { + "epoch": 10.721044045676999, + "grad_norm": 0.02467508241534233, + "learning_rate": 0.0005243481021781821, + "loss": 0.1673, + "num_input_tokens_seen": 141817936, + "step": 65720 + }, + { + "epoch": 10.721859706362153, + "grad_norm": 0.08406726270914078, + "learning_rate": 0.0005242770065541958, + "loss": 0.0086, + "num_input_tokens_seen": 141828336, + "step": 65725 + }, + { + "epoch": 10.722675367047309, + "grad_norm": 0.031744927167892456, + "learning_rate": 0.0005242059104382052, + "loss": 0.0056, + "num_input_tokens_seen": 141839792, + "step": 65730 + }, + { + "epoch": 10.723491027732463, + "grad_norm": 0.0007440338958986104, + "learning_rate": 0.000524134813831651, + "loss": 0.014, + "num_input_tokens_seen": 141851088, + "step": 65735 + }, + { + "epoch": 10.724306688417618, + "grad_norm": 0.004869487602263689, + "learning_rate": 0.0005240637167359743, + "loss": 0.0397, + "num_input_tokens_seen": 141861680, + "step": 65740 + }, + { + "epoch": 10.725122349102774, + "grad_norm": 0.08880333602428436, + "learning_rate": 0.0005239926191526157, + "loss": 0.1076, + "num_input_tokens_seen": 141873008, + "step": 65745 + }, + { + "epoch": 10.725938009787928, + "grad_norm": 0.07206324487924576, + "learning_rate": 0.0005239215210830164, + "loss": 0.0194, + "num_input_tokens_seen": 141884816, + "step": 65750 + }, + { + "epoch": 10.726753670473084, + "grad_norm": 0.012749255634844303, + "learning_rate": 0.000523850422528617, + "loss": 0.1071, + "num_input_tokens_seen": 141895696, + "step": 65755 + }, + { + "epoch": 10.727569331158238, + "grad_norm": 0.020351486280560493, + "learning_rate": 0.0005237793234908586, + "loss": 0.0077, + "num_input_tokens_seen": 141906512, + "step": 65760 + }, + { + "epoch": 10.728384991843393, + "grad_norm": 0.02548368275165558, + "learning_rate": 0.000523708223971182, + "loss": 0.0465, + "num_input_tokens_seen": 141917424, + "step": 65765 + }, + { + "epoch": 10.729200652528547, + "grad_norm": 0.26738470792770386, + "learning_rate": 0.0005236371239710283, + "loss": 0.0651, + "num_input_tokens_seen": 141927088, + "step": 65770 + }, + { + "epoch": 10.730016313213703, + "grad_norm": 0.003081423928961158, + "learning_rate": 0.0005235660234918381, + "loss": 0.0091, + "num_input_tokens_seen": 141938256, + "step": 65775 + }, + { + "epoch": 10.730831973898859, + "grad_norm": 0.006351626943796873, + "learning_rate": 0.0005234949225350526, + "loss": 0.0824, + "num_input_tokens_seen": 141947952, + "step": 65780 + }, + { + "epoch": 10.731647634584013, + "grad_norm": 0.012640059925615788, + "learning_rate": 0.0005234238211021127, + "loss": 0.0042, + "num_input_tokens_seen": 141958800, + "step": 65785 + }, + { + "epoch": 10.732463295269168, + "grad_norm": 0.0797172486782074, + "learning_rate": 0.0005233527191944593, + "loss": 0.0492, + "num_input_tokens_seen": 141969296, + "step": 65790 + }, + { + "epoch": 10.733278955954322, + "grad_norm": 0.16686546802520752, + "learning_rate": 0.0005232816168135336, + "loss": 0.0229, + "num_input_tokens_seen": 141980176, + "step": 65795 + }, + { + "epoch": 10.734094616639478, + "grad_norm": 0.021895145997405052, + "learning_rate": 0.0005232105139607763, + "loss": 0.0921, + "num_input_tokens_seen": 141991472, + "step": 65800 + }, + { + "epoch": 10.734910277324634, + "grad_norm": 0.005430617835372686, + "learning_rate": 0.0005231394106376283, + "loss": 0.0601, + "num_input_tokens_seen": 142001744, + "step": 65805 + }, + { + "epoch": 10.735725938009788, + "grad_norm": 0.006489858962595463, + "learning_rate": 0.000523068306845531, + "loss": 0.0411, + "num_input_tokens_seen": 142011504, + "step": 65810 + }, + { + "epoch": 10.736541598694943, + "grad_norm": 0.4535573720932007, + "learning_rate": 0.0005229972025859252, + "loss": 0.0606, + "num_input_tokens_seen": 142020784, + "step": 65815 + }, + { + "epoch": 10.737357259380097, + "grad_norm": 0.001667237957008183, + "learning_rate": 0.0005229260978602519, + "loss": 0.003, + "num_input_tokens_seen": 142031312, + "step": 65820 + }, + { + "epoch": 10.738172920065253, + "grad_norm": 0.0012186936801299453, + "learning_rate": 0.0005228549926699521, + "loss": 0.0679, + "num_input_tokens_seen": 142042096, + "step": 65825 + }, + { + "epoch": 10.738988580750409, + "grad_norm": 0.0283738411962986, + "learning_rate": 0.0005227838870164669, + "loss": 0.036, + "num_input_tokens_seen": 142052784, + "step": 65830 + }, + { + "epoch": 10.739804241435563, + "grad_norm": 0.006058056373149157, + "learning_rate": 0.0005227127809012372, + "loss": 0.0563, + "num_input_tokens_seen": 142064432, + "step": 65835 + }, + { + "epoch": 10.740619902120718, + "grad_norm": 0.00039323573582805693, + "learning_rate": 0.0005226416743257043, + "loss": 0.0281, + "num_input_tokens_seen": 142076048, + "step": 65840 + }, + { + "epoch": 10.741435562805872, + "grad_norm": 0.01851779595017433, + "learning_rate": 0.0005225705672913092, + "loss": 0.0074, + "num_input_tokens_seen": 142086992, + "step": 65845 + }, + { + "epoch": 10.742251223491028, + "grad_norm": 0.5491237640380859, + "learning_rate": 0.0005224994597994929, + "loss": 0.0324, + "num_input_tokens_seen": 142097616, + "step": 65850 + }, + { + "epoch": 10.743066884176184, + "grad_norm": 0.021171612665057182, + "learning_rate": 0.0005224283518516965, + "loss": 0.0091, + "num_input_tokens_seen": 142109008, + "step": 65855 + }, + { + "epoch": 10.743882544861338, + "grad_norm": 0.0034083030186593533, + "learning_rate": 0.000522357243449361, + "loss": 0.01, + "num_input_tokens_seen": 142118448, + "step": 65860 + }, + { + "epoch": 10.744698205546493, + "grad_norm": 0.3479892611503601, + "learning_rate": 0.0005222861345939278, + "loss": 0.168, + "num_input_tokens_seen": 142130480, + "step": 65865 + }, + { + "epoch": 10.745513866231647, + "grad_norm": 0.007497397717088461, + "learning_rate": 0.0005222150252868375, + "loss": 0.0376, + "num_input_tokens_seen": 142141712, + "step": 65870 + }, + { + "epoch": 10.746329526916803, + "grad_norm": 0.040476780384778976, + "learning_rate": 0.0005221439155295318, + "loss": 0.0062, + "num_input_tokens_seen": 142153232, + "step": 65875 + }, + { + "epoch": 10.747145187601957, + "grad_norm": 0.0016957190819084644, + "learning_rate": 0.0005220728053234514, + "loss": 0.0374, + "num_input_tokens_seen": 142163664, + "step": 65880 + }, + { + "epoch": 10.747960848287113, + "grad_norm": 0.1452600359916687, + "learning_rate": 0.0005220016946700378, + "loss": 0.0099, + "num_input_tokens_seen": 142174800, + "step": 65885 + }, + { + "epoch": 10.748776508972268, + "grad_norm": 0.0057751815766096115, + "learning_rate": 0.0005219305835707318, + "loss": 0.0279, + "num_input_tokens_seen": 142185776, + "step": 65890 + }, + { + "epoch": 10.749592169657422, + "grad_norm": 0.019320117309689522, + "learning_rate": 0.0005218594720269748, + "loss": 0.0549, + "num_input_tokens_seen": 142195856, + "step": 65895 + }, + { + "epoch": 10.750407830342578, + "grad_norm": 0.031581539660692215, + "learning_rate": 0.0005217883600402076, + "loss": 0.0382, + "num_input_tokens_seen": 142206160, + "step": 65900 + }, + { + "epoch": 10.751223491027732, + "grad_norm": 0.04237253963947296, + "learning_rate": 0.0005217172476118719, + "loss": 0.1138, + "num_input_tokens_seen": 142217040, + "step": 65905 + }, + { + "epoch": 10.752039151712887, + "grad_norm": 0.008355346508324146, + "learning_rate": 0.0005216461347434084, + "loss": 0.0314, + "num_input_tokens_seen": 142226384, + "step": 65910 + }, + { + "epoch": 10.752854812398043, + "grad_norm": 0.01419571228325367, + "learning_rate": 0.0005215750214362588, + "loss": 0.0947, + "num_input_tokens_seen": 142238288, + "step": 65915 + }, + { + "epoch": 10.753670473083197, + "grad_norm": 0.024219896644353867, + "learning_rate": 0.0005215039076918638, + "loss": 0.0161, + "num_input_tokens_seen": 142249040, + "step": 65920 + }, + { + "epoch": 10.754486133768353, + "grad_norm": 0.5250378251075745, + "learning_rate": 0.0005214327935116651, + "loss": 0.0752, + "num_input_tokens_seen": 142260208, + "step": 65925 + }, + { + "epoch": 10.755301794453507, + "grad_norm": 0.011480088345706463, + "learning_rate": 0.0005213616788971034, + "loss": 0.0319, + "num_input_tokens_seen": 142270320, + "step": 65930 + }, + { + "epoch": 10.756117455138662, + "grad_norm": 0.19423358142375946, + "learning_rate": 0.0005212905638496203, + "loss": 0.111, + "num_input_tokens_seen": 142280560, + "step": 65935 + }, + { + "epoch": 10.756933115823816, + "grad_norm": 0.30237877368927, + "learning_rate": 0.0005212194483706569, + "loss": 0.0192, + "num_input_tokens_seen": 142290608, + "step": 65940 + }, + { + "epoch": 10.757748776508972, + "grad_norm": 0.014463284984230995, + "learning_rate": 0.0005211483324616544, + "loss": 0.1208, + "num_input_tokens_seen": 142302128, + "step": 65945 + }, + { + "epoch": 10.758564437194128, + "grad_norm": 0.07568585127592087, + "learning_rate": 0.0005210772161240541, + "loss": 0.0125, + "num_input_tokens_seen": 142313232, + "step": 65950 + }, + { + "epoch": 10.759380097879282, + "grad_norm": 0.038497135043144226, + "learning_rate": 0.0005210060993592973, + "loss": 0.1271, + "num_input_tokens_seen": 142323920, + "step": 65955 + }, + { + "epoch": 10.760195758564437, + "grad_norm": 0.13206791877746582, + "learning_rate": 0.0005209349821688254, + "loss": 0.058, + "num_input_tokens_seen": 142333424, + "step": 65960 + }, + { + "epoch": 10.761011419249591, + "grad_norm": 0.015888281166553497, + "learning_rate": 0.0005208638645540795, + "loss": 0.0269, + "num_input_tokens_seen": 142344272, + "step": 65965 + }, + { + "epoch": 10.761827079934747, + "grad_norm": 0.006686575710773468, + "learning_rate": 0.0005207927465165007, + "loss": 0.0916, + "num_input_tokens_seen": 142355664, + "step": 65970 + }, + { + "epoch": 10.762642740619903, + "grad_norm": 0.007468751166015863, + "learning_rate": 0.0005207216280575306, + "loss": 0.0073, + "num_input_tokens_seen": 142366704, + "step": 65975 + }, + { + "epoch": 10.763458401305057, + "grad_norm": 0.03421982750296593, + "learning_rate": 0.0005206505091786103, + "loss": 0.0074, + "num_input_tokens_seen": 142378704, + "step": 65980 + }, + { + "epoch": 10.764274061990212, + "grad_norm": 0.004381625447422266, + "learning_rate": 0.0005205793898811814, + "loss": 0.0098, + "num_input_tokens_seen": 142388432, + "step": 65985 + }, + { + "epoch": 10.765089722675366, + "grad_norm": 0.012737615965306759, + "learning_rate": 0.0005205082701666851, + "loss": 0.011, + "num_input_tokens_seen": 142399152, + "step": 65990 + }, + { + "epoch": 10.765905383360522, + "grad_norm": 0.02391059882938862, + "learning_rate": 0.0005204371500365627, + "loss": 0.0519, + "num_input_tokens_seen": 142410448, + "step": 65995 + }, + { + "epoch": 10.766721044045678, + "grad_norm": 0.05659320205450058, + "learning_rate": 0.0005203660294922554, + "loss": 0.017, + "num_input_tokens_seen": 142421424, + "step": 66000 + }, + { + "epoch": 10.767536704730832, + "grad_norm": 0.13078485429286957, + "learning_rate": 0.0005202949085352048, + "loss": 0.0134, + "num_input_tokens_seen": 142431568, + "step": 66005 + }, + { + "epoch": 10.768352365415987, + "grad_norm": 0.18037192523479462, + "learning_rate": 0.000520223787166852, + "loss": 0.0365, + "num_input_tokens_seen": 142443152, + "step": 66010 + }, + { + "epoch": 10.769168026101141, + "grad_norm": 0.022000489756464958, + "learning_rate": 0.0005201526653886385, + "loss": 0.009, + "num_input_tokens_seen": 142453808, + "step": 66015 + }, + { + "epoch": 10.769983686786297, + "grad_norm": 0.28489425778388977, + "learning_rate": 0.0005200815432020058, + "loss": 0.0298, + "num_input_tokens_seen": 142463760, + "step": 66020 + }, + { + "epoch": 10.770799347471453, + "grad_norm": 0.008916635997593403, + "learning_rate": 0.0005200104206083951, + "loss": 0.0148, + "num_input_tokens_seen": 142474032, + "step": 66025 + }, + { + "epoch": 10.771615008156607, + "grad_norm": 0.14338873326778412, + "learning_rate": 0.0005199392976092479, + "loss": 0.0171, + "num_input_tokens_seen": 142484368, + "step": 66030 + }, + { + "epoch": 10.772430668841762, + "grad_norm": 0.004213014151901007, + "learning_rate": 0.0005198681742060055, + "loss": 0.131, + "num_input_tokens_seen": 142495344, + "step": 66035 + }, + { + "epoch": 10.773246329526916, + "grad_norm": 0.007799748796969652, + "learning_rate": 0.0005197970504001091, + "loss": 0.0208, + "num_input_tokens_seen": 142505392, + "step": 66040 + }, + { + "epoch": 10.774061990212072, + "grad_norm": 0.10302001982927322, + "learning_rate": 0.0005197259261930007, + "loss": 0.0223, + "num_input_tokens_seen": 142515760, + "step": 66045 + }, + { + "epoch": 10.774877650897226, + "grad_norm": 0.37575796246528625, + "learning_rate": 0.0005196548015861212, + "loss": 0.0478, + "num_input_tokens_seen": 142525552, + "step": 66050 + }, + { + "epoch": 10.775693311582382, + "grad_norm": 0.29865017533302307, + "learning_rate": 0.0005195836765809123, + "loss": 0.0253, + "num_input_tokens_seen": 142535600, + "step": 66055 + }, + { + "epoch": 10.776508972267537, + "grad_norm": 0.33141353726387024, + "learning_rate": 0.0005195125511788153, + "loss": 0.2256, + "num_input_tokens_seen": 142546928, + "step": 66060 + }, + { + "epoch": 10.777324632952691, + "grad_norm": 0.01182505115866661, + "learning_rate": 0.0005194414253812718, + "loss": 0.004, + "num_input_tokens_seen": 142557936, + "step": 66065 + }, + { + "epoch": 10.778140293637847, + "grad_norm": 0.3566802144050598, + "learning_rate": 0.000519370299189723, + "loss": 0.0983, + "num_input_tokens_seen": 142567856, + "step": 66070 + }, + { + "epoch": 10.778955954323001, + "grad_norm": 0.21780045330524445, + "learning_rate": 0.0005192991726056107, + "loss": 0.0196, + "num_input_tokens_seen": 142578544, + "step": 66075 + }, + { + "epoch": 10.779771615008157, + "grad_norm": 0.0644204318523407, + "learning_rate": 0.0005192280456303759, + "loss": 0.0405, + "num_input_tokens_seen": 142589968, + "step": 66080 + }, + { + "epoch": 10.780587275693312, + "grad_norm": 0.5692721605300903, + "learning_rate": 0.0005191569182654606, + "loss": 0.1653, + "num_input_tokens_seen": 142601488, + "step": 66085 + }, + { + "epoch": 10.781402936378466, + "grad_norm": 0.03694921359419823, + "learning_rate": 0.000519085790512306, + "loss": 0.0725, + "num_input_tokens_seen": 142611280, + "step": 66090 + }, + { + "epoch": 10.782218597063622, + "grad_norm": 0.009856709279119968, + "learning_rate": 0.0005190146623723536, + "loss": 0.0462, + "num_input_tokens_seen": 142621904, + "step": 66095 + }, + { + "epoch": 10.783034257748776, + "grad_norm": 0.1836153268814087, + "learning_rate": 0.000518943533847045, + "loss": 0.0331, + "num_input_tokens_seen": 142632976, + "step": 66100 + }, + { + "epoch": 10.783849918433932, + "grad_norm": 0.009600237011909485, + "learning_rate": 0.0005188724049378216, + "loss": 0.0061, + "num_input_tokens_seen": 142643696, + "step": 66105 + }, + { + "epoch": 10.784665579119086, + "grad_norm": 0.06723660975694656, + "learning_rate": 0.0005188012756461251, + "loss": 0.0102, + "num_input_tokens_seen": 142654448, + "step": 66110 + }, + { + "epoch": 10.785481239804241, + "grad_norm": 0.023283055052161217, + "learning_rate": 0.0005187301459733967, + "loss": 0.0041, + "num_input_tokens_seen": 142665488, + "step": 66115 + }, + { + "epoch": 10.786296900489397, + "grad_norm": 0.0637165904045105, + "learning_rate": 0.0005186590159210783, + "loss": 0.1444, + "num_input_tokens_seen": 142676496, + "step": 66120 + }, + { + "epoch": 10.78711256117455, + "grad_norm": 0.008102311752736568, + "learning_rate": 0.0005185878854906111, + "loss": 0.1394, + "num_input_tokens_seen": 142686960, + "step": 66125 + }, + { + "epoch": 10.787928221859707, + "grad_norm": 0.6604865193367004, + "learning_rate": 0.0005185167546834368, + "loss": 0.0482, + "num_input_tokens_seen": 142697776, + "step": 66130 + }, + { + "epoch": 10.78874388254486, + "grad_norm": 0.015413220040500164, + "learning_rate": 0.0005184456235009972, + "loss": 0.1283, + "num_input_tokens_seen": 142709072, + "step": 66135 + }, + { + "epoch": 10.789559543230016, + "grad_norm": 0.23359888792037964, + "learning_rate": 0.0005183744919447335, + "loss": 0.0262, + "num_input_tokens_seen": 142720144, + "step": 66140 + }, + { + "epoch": 10.790375203915172, + "grad_norm": 0.01299193687736988, + "learning_rate": 0.0005183033600160875, + "loss": 0.0122, + "num_input_tokens_seen": 142730512, + "step": 66145 + }, + { + "epoch": 10.791190864600326, + "grad_norm": 0.0540449284017086, + "learning_rate": 0.0005182322277165005, + "loss": 0.0239, + "num_input_tokens_seen": 142741488, + "step": 66150 + }, + { + "epoch": 10.792006525285482, + "grad_norm": 0.03036809340119362, + "learning_rate": 0.0005181610950474143, + "loss": 0.0069, + "num_input_tokens_seen": 142751888, + "step": 66155 + }, + { + "epoch": 10.792822185970635, + "grad_norm": 0.01635170541703701, + "learning_rate": 0.0005180899620102707, + "loss": 0.0217, + "num_input_tokens_seen": 142763312, + "step": 66160 + }, + { + "epoch": 10.793637846655791, + "grad_norm": 0.33601224422454834, + "learning_rate": 0.000518018828606511, + "loss": 0.1101, + "num_input_tokens_seen": 142774928, + "step": 66165 + }, + { + "epoch": 10.794453507340947, + "grad_norm": 0.018067535012960434, + "learning_rate": 0.0005179476948375767, + "loss": 0.0404, + "num_input_tokens_seen": 142786704, + "step": 66170 + }, + { + "epoch": 10.7952691680261, + "grad_norm": 0.2539023756980896, + "learning_rate": 0.0005178765607049098, + "loss": 0.1157, + "num_input_tokens_seen": 142797936, + "step": 66175 + }, + { + "epoch": 10.796084828711257, + "grad_norm": 0.0030338955111801624, + "learning_rate": 0.0005178054262099516, + "loss": 0.0078, + "num_input_tokens_seen": 142808784, + "step": 66180 + }, + { + "epoch": 10.79690048939641, + "grad_norm": 0.01922004297375679, + "learning_rate": 0.000517734291354144, + "loss": 0.0674, + "num_input_tokens_seen": 142820944, + "step": 66185 + }, + { + "epoch": 10.797716150081566, + "grad_norm": 0.06895926594734192, + "learning_rate": 0.0005176631561389283, + "loss": 0.009, + "num_input_tokens_seen": 142832624, + "step": 66190 + }, + { + "epoch": 10.798531810766722, + "grad_norm": 0.013722254894673824, + "learning_rate": 0.0005175920205657465, + "loss": 0.088, + "num_input_tokens_seen": 142842224, + "step": 66195 + }, + { + "epoch": 10.799347471451876, + "grad_norm": 0.0026969611644744873, + "learning_rate": 0.0005175208846360399, + "loss": 0.0142, + "num_input_tokens_seen": 142852912, + "step": 66200 + }, + { + "epoch": 10.800163132137031, + "grad_norm": 0.018714522942900658, + "learning_rate": 0.0005174497483512506, + "loss": 0.0177, + "num_input_tokens_seen": 142862064, + "step": 66205 + }, + { + "epoch": 10.800978792822185, + "grad_norm": 0.007700165268033743, + "learning_rate": 0.0005173786117128198, + "loss": 0.0084, + "num_input_tokens_seen": 142873712, + "step": 66210 + }, + { + "epoch": 10.801794453507341, + "grad_norm": 0.0065385992638766766, + "learning_rate": 0.0005173074747221895, + "loss": 0.0046, + "num_input_tokens_seen": 142884304, + "step": 66215 + }, + { + "epoch": 10.802610114192497, + "grad_norm": 0.009233236312866211, + "learning_rate": 0.0005172363373808013, + "loss": 0.0106, + "num_input_tokens_seen": 142895632, + "step": 66220 + }, + { + "epoch": 10.80342577487765, + "grad_norm": 0.03193371370434761, + "learning_rate": 0.0005171651996900967, + "loss": 0.0089, + "num_input_tokens_seen": 142906736, + "step": 66225 + }, + { + "epoch": 10.804241435562806, + "grad_norm": 0.030128976330161095, + "learning_rate": 0.0005170940616515175, + "loss": 0.1802, + "num_input_tokens_seen": 142918384, + "step": 66230 + }, + { + "epoch": 10.80505709624796, + "grad_norm": 0.03232830390334129, + "learning_rate": 0.0005170229232665056, + "loss": 0.0923, + "num_input_tokens_seen": 142929424, + "step": 66235 + }, + { + "epoch": 10.805872756933116, + "grad_norm": 0.009060553275048733, + "learning_rate": 0.0005169517845365025, + "loss": 0.0079, + "num_input_tokens_seen": 142940464, + "step": 66240 + }, + { + "epoch": 10.80668841761827, + "grad_norm": 0.005074178799986839, + "learning_rate": 0.0005168806454629501, + "loss": 0.0177, + "num_input_tokens_seen": 142951184, + "step": 66245 + }, + { + "epoch": 10.807504078303426, + "grad_norm": 0.05741208791732788, + "learning_rate": 0.0005168095060472899, + "loss": 0.0066, + "num_input_tokens_seen": 142962096, + "step": 66250 + }, + { + "epoch": 10.808319738988581, + "grad_norm": 0.002940193749964237, + "learning_rate": 0.0005167383662909638, + "loss": 0.0084, + "num_input_tokens_seen": 142972368, + "step": 66255 + }, + { + "epoch": 10.809135399673735, + "grad_norm": 0.040540099143981934, + "learning_rate": 0.0005166672261954134, + "loss": 0.0136, + "num_input_tokens_seen": 142982160, + "step": 66260 + }, + { + "epoch": 10.809951060358891, + "grad_norm": 0.04795058071613312, + "learning_rate": 0.0005165960857620806, + "loss": 0.0531, + "num_input_tokens_seen": 142992656, + "step": 66265 + }, + { + "epoch": 10.810766721044045, + "grad_norm": 0.2362014800310135, + "learning_rate": 0.000516524944992407, + "loss": 0.1161, + "num_input_tokens_seen": 143001936, + "step": 66270 + }, + { + "epoch": 10.8115823817292, + "grad_norm": 0.3614776134490967, + "learning_rate": 0.0005164538038878345, + "loss": 0.0315, + "num_input_tokens_seen": 143013040, + "step": 66275 + }, + { + "epoch": 10.812398042414356, + "grad_norm": 0.020475786179304123, + "learning_rate": 0.0005163826624498047, + "loss": 0.1018, + "num_input_tokens_seen": 143024688, + "step": 66280 + }, + { + "epoch": 10.81321370309951, + "grad_norm": 0.434549480676651, + "learning_rate": 0.0005163115206797596, + "loss": 0.0785, + "num_input_tokens_seen": 143035120, + "step": 66285 + }, + { + "epoch": 10.814029363784666, + "grad_norm": 0.022317729890346527, + "learning_rate": 0.0005162403785791408, + "loss": 0.0178, + "num_input_tokens_seen": 143045264, + "step": 66290 + }, + { + "epoch": 10.81484502446982, + "grad_norm": 0.04018959403038025, + "learning_rate": 0.0005161692361493899, + "loss": 0.1979, + "num_input_tokens_seen": 143056016, + "step": 66295 + }, + { + "epoch": 10.815660685154976, + "grad_norm": 0.10746350884437561, + "learning_rate": 0.0005160980933919491, + "loss": 0.0265, + "num_input_tokens_seen": 143066800, + "step": 66300 + }, + { + "epoch": 10.81647634584013, + "grad_norm": 0.27934181690216064, + "learning_rate": 0.00051602695030826, + "loss": 0.1178, + "num_input_tokens_seen": 143077456, + "step": 66305 + }, + { + "epoch": 10.817292006525285, + "grad_norm": 0.3828637897968292, + "learning_rate": 0.0005159558068997644, + "loss": 0.0854, + "num_input_tokens_seen": 143087856, + "step": 66310 + }, + { + "epoch": 10.818107667210441, + "grad_norm": 0.005358435679227114, + "learning_rate": 0.0005158846631679041, + "loss": 0.0329, + "num_input_tokens_seen": 143098640, + "step": 66315 + }, + { + "epoch": 10.818923327895595, + "grad_norm": 0.27422744035720825, + "learning_rate": 0.0005158135191141211, + "loss": 0.0503, + "num_input_tokens_seen": 143109456, + "step": 66320 + }, + { + "epoch": 10.81973898858075, + "grad_norm": 0.027966933324933052, + "learning_rate": 0.000515742374739857, + "loss": 0.0061, + "num_input_tokens_seen": 143120784, + "step": 66325 + }, + { + "epoch": 10.820554649265905, + "grad_norm": 0.3247361481189728, + "learning_rate": 0.0005156712300465537, + "loss": 0.0626, + "num_input_tokens_seen": 143131568, + "step": 66330 + }, + { + "epoch": 10.82137030995106, + "grad_norm": 0.05779033899307251, + "learning_rate": 0.000515600085035653, + "loss": 0.0374, + "num_input_tokens_seen": 143142960, + "step": 66335 + }, + { + "epoch": 10.822185970636216, + "grad_norm": 0.012751691043376923, + "learning_rate": 0.0005155289397085968, + "loss": 0.0597, + "num_input_tokens_seen": 143152624, + "step": 66340 + }, + { + "epoch": 10.82300163132137, + "grad_norm": 0.13210001587867737, + "learning_rate": 0.0005154577940668269, + "loss": 0.0635, + "num_input_tokens_seen": 143163920, + "step": 66345 + }, + { + "epoch": 10.823817292006526, + "grad_norm": 0.03565424680709839, + "learning_rate": 0.0005153866481117852, + "loss": 0.0374, + "num_input_tokens_seen": 143174032, + "step": 66350 + }, + { + "epoch": 10.82463295269168, + "grad_norm": 0.007200122810900211, + "learning_rate": 0.0005153155018449137, + "loss": 0.0304, + "num_input_tokens_seen": 143182704, + "step": 66355 + }, + { + "epoch": 10.825448613376835, + "grad_norm": 0.004404145758599043, + "learning_rate": 0.000515244355267654, + "loss": 0.0264, + "num_input_tokens_seen": 143193744, + "step": 66360 + }, + { + "epoch": 10.826264274061991, + "grad_norm": 0.18004338443279266, + "learning_rate": 0.0005151732083814481, + "loss": 0.0765, + "num_input_tokens_seen": 143205712, + "step": 66365 + }, + { + "epoch": 10.827079934747145, + "grad_norm": 0.18771035969257355, + "learning_rate": 0.000515102061187738, + "loss": 0.0266, + "num_input_tokens_seen": 143215824, + "step": 66370 + }, + { + "epoch": 10.8278955954323, + "grad_norm": 0.0018929381622001529, + "learning_rate": 0.0005150309136879654, + "loss": 0.0526, + "num_input_tokens_seen": 143226192, + "step": 66375 + }, + { + "epoch": 10.828711256117455, + "grad_norm": 0.01867181807756424, + "learning_rate": 0.0005149597658835722, + "loss": 0.0939, + "num_input_tokens_seen": 143236848, + "step": 66380 + }, + { + "epoch": 10.82952691680261, + "grad_norm": 0.015930958092212677, + "learning_rate": 0.0005148886177760005, + "loss": 0.0422, + "num_input_tokens_seen": 143248176, + "step": 66385 + }, + { + "epoch": 10.830342577487766, + "grad_norm": 0.0012241904623806477, + "learning_rate": 0.000514817469366692, + "loss": 0.009, + "num_input_tokens_seen": 143259248, + "step": 66390 + }, + { + "epoch": 10.83115823817292, + "grad_norm": 0.008852743543684483, + "learning_rate": 0.0005147463206570886, + "loss": 0.0073, + "num_input_tokens_seen": 143270192, + "step": 66395 + }, + { + "epoch": 10.831973898858076, + "grad_norm": 0.04803108423948288, + "learning_rate": 0.0005146751716486324, + "loss": 0.0676, + "num_input_tokens_seen": 143280336, + "step": 66400 + }, + { + "epoch": 10.83278955954323, + "grad_norm": 0.028177138417959213, + "learning_rate": 0.0005146040223427652, + "loss": 0.0108, + "num_input_tokens_seen": 143291184, + "step": 66405 + }, + { + "epoch": 10.833605220228385, + "grad_norm": 0.020188767462968826, + "learning_rate": 0.0005145328727409291, + "loss": 0.0286, + "num_input_tokens_seen": 143301712, + "step": 66410 + }, + { + "epoch": 10.83442088091354, + "grad_norm": 0.0036604523193091154, + "learning_rate": 0.0005144617228445657, + "loss": 0.0144, + "num_input_tokens_seen": 143311728, + "step": 66415 + }, + { + "epoch": 10.835236541598695, + "grad_norm": 0.9981608390808105, + "learning_rate": 0.0005143905726551172, + "loss": 0.0603, + "num_input_tokens_seen": 143322960, + "step": 66420 + }, + { + "epoch": 10.83605220228385, + "grad_norm": 0.10083454847335815, + "learning_rate": 0.0005143194221740255, + "loss": 0.0782, + "num_input_tokens_seen": 143332496, + "step": 66425 + }, + { + "epoch": 10.836867862969005, + "grad_norm": 0.03011765517294407, + "learning_rate": 0.0005142482714027326, + "loss": 0.0216, + "num_input_tokens_seen": 143343760, + "step": 66430 + }, + { + "epoch": 10.83768352365416, + "grad_norm": 0.03511294722557068, + "learning_rate": 0.0005141771203426803, + "loss": 0.0191, + "num_input_tokens_seen": 143354800, + "step": 66435 + }, + { + "epoch": 10.838499184339314, + "grad_norm": 1.1387252807617188, + "learning_rate": 0.0005141059689953107, + "loss": 0.0549, + "num_input_tokens_seen": 143365168, + "step": 66440 + }, + { + "epoch": 10.83931484502447, + "grad_norm": 0.004347668960690498, + "learning_rate": 0.0005140348173620657, + "loss": 0.0056, + "num_input_tokens_seen": 143375600, + "step": 66445 + }, + { + "epoch": 10.840130505709626, + "grad_norm": 0.006302103400230408, + "learning_rate": 0.0005139636654443874, + "loss": 0.0025, + "num_input_tokens_seen": 143385264, + "step": 66450 + }, + { + "epoch": 10.84094616639478, + "grad_norm": 0.25027114152908325, + "learning_rate": 0.0005138925132437178, + "loss": 0.1175, + "num_input_tokens_seen": 143395056, + "step": 66455 + }, + { + "epoch": 10.841761827079935, + "grad_norm": 0.005223659798502922, + "learning_rate": 0.0005138213607614985, + "loss": 0.0399, + "num_input_tokens_seen": 143406832, + "step": 66460 + }, + { + "epoch": 10.84257748776509, + "grad_norm": 0.0013196651125326753, + "learning_rate": 0.000513750207999172, + "loss": 0.003, + "num_input_tokens_seen": 143418544, + "step": 66465 + }, + { + "epoch": 10.843393148450245, + "grad_norm": 0.10590115189552307, + "learning_rate": 0.0005136790549581801, + "loss": 0.1408, + "num_input_tokens_seen": 143428368, + "step": 66470 + }, + { + "epoch": 10.844208809135399, + "grad_norm": 1.1202924251556396, + "learning_rate": 0.0005136079016399647, + "loss": 0.0345, + "num_input_tokens_seen": 143439472, + "step": 66475 + }, + { + "epoch": 10.845024469820554, + "grad_norm": 0.008881057612597942, + "learning_rate": 0.000513536748045968, + "loss": 0.0306, + "num_input_tokens_seen": 143451280, + "step": 66480 + }, + { + "epoch": 10.84584013050571, + "grad_norm": 0.07496795803308487, + "learning_rate": 0.000513465594177632, + "loss": 0.1083, + "num_input_tokens_seen": 143461136, + "step": 66485 + }, + { + "epoch": 10.846655791190864, + "grad_norm": 0.017714444547891617, + "learning_rate": 0.0005133944400363986, + "loss": 0.0062, + "num_input_tokens_seen": 143471952, + "step": 66490 + }, + { + "epoch": 10.84747145187602, + "grad_norm": 0.4075281620025635, + "learning_rate": 0.0005133232856237098, + "loss": 0.1041, + "num_input_tokens_seen": 143483696, + "step": 66495 + }, + { + "epoch": 10.848287112561174, + "grad_norm": 0.009333852678537369, + "learning_rate": 0.0005132521309410078, + "loss": 0.0099, + "num_input_tokens_seen": 143494768, + "step": 66500 + }, + { + "epoch": 10.84910277324633, + "grad_norm": 0.011294697411358356, + "learning_rate": 0.0005131809759897345, + "loss": 0.1519, + "num_input_tokens_seen": 143506608, + "step": 66505 + }, + { + "epoch": 10.849918433931485, + "grad_norm": 0.004104896914213896, + "learning_rate": 0.000513109820771332, + "loss": 0.0704, + "num_input_tokens_seen": 143517168, + "step": 66510 + }, + { + "epoch": 10.850734094616639, + "grad_norm": 0.008419648744165897, + "learning_rate": 0.0005130386652872423, + "loss": 0.0095, + "num_input_tokens_seen": 143528144, + "step": 66515 + }, + { + "epoch": 10.851549755301795, + "grad_norm": 0.015874633565545082, + "learning_rate": 0.0005129675095389076, + "loss": 0.0383, + "num_input_tokens_seen": 143538224, + "step": 66520 + }, + { + "epoch": 10.852365415986949, + "grad_norm": 0.0031939512118697166, + "learning_rate": 0.0005128963535277699, + "loss": 0.0091, + "num_input_tokens_seen": 143548464, + "step": 66525 + }, + { + "epoch": 10.853181076672104, + "grad_norm": 0.3200739026069641, + "learning_rate": 0.0005128251972552711, + "loss": 0.1371, + "num_input_tokens_seen": 143559824, + "step": 66530 + }, + { + "epoch": 10.85399673735726, + "grad_norm": 0.013349249958992004, + "learning_rate": 0.0005127540407228535, + "loss": 0.1225, + "num_input_tokens_seen": 143570864, + "step": 66535 + }, + { + "epoch": 10.854812398042414, + "grad_norm": 0.4610392451286316, + "learning_rate": 0.0005126828839319591, + "loss": 0.1467, + "num_input_tokens_seen": 143582224, + "step": 66540 + }, + { + "epoch": 10.85562805872757, + "grad_norm": 0.26505231857299805, + "learning_rate": 0.0005126117268840299, + "loss": 0.0512, + "num_input_tokens_seen": 143593904, + "step": 66545 + }, + { + "epoch": 10.856443719412724, + "grad_norm": 0.006210309453308582, + "learning_rate": 0.000512540569580508, + "loss": 0.0104, + "num_input_tokens_seen": 143603952, + "step": 66550 + }, + { + "epoch": 10.85725938009788, + "grad_norm": 0.0014477159129455686, + "learning_rate": 0.0005124694120228357, + "loss": 0.0277, + "num_input_tokens_seen": 143614512, + "step": 66555 + }, + { + "epoch": 10.858075040783035, + "grad_norm": 0.0031649244483560324, + "learning_rate": 0.0005123982542124549, + "loss": 0.0068, + "num_input_tokens_seen": 143625808, + "step": 66560 + }, + { + "epoch": 10.858890701468189, + "grad_norm": 0.2617475092411041, + "learning_rate": 0.0005123270961508077, + "loss": 0.0652, + "num_input_tokens_seen": 143634544, + "step": 66565 + }, + { + "epoch": 10.859706362153345, + "grad_norm": 0.039467550814151764, + "learning_rate": 0.0005122559378393363, + "loss": 0.136, + "num_input_tokens_seen": 143643824, + "step": 66570 + }, + { + "epoch": 10.860522022838499, + "grad_norm": 0.021970337256789207, + "learning_rate": 0.0005121847792794828, + "loss": 0.0215, + "num_input_tokens_seen": 143654672, + "step": 66575 + }, + { + "epoch": 10.861337683523654, + "grad_norm": 0.31125566363334656, + "learning_rate": 0.0005121136204726893, + "loss": 0.0163, + "num_input_tokens_seen": 143663792, + "step": 66580 + }, + { + "epoch": 10.86215334420881, + "grad_norm": 0.006358995568007231, + "learning_rate": 0.0005120424614203978, + "loss": 0.0494, + "num_input_tokens_seen": 143673872, + "step": 66585 + }, + { + "epoch": 10.862969004893964, + "grad_norm": 0.044914308935403824, + "learning_rate": 0.0005119713021240507, + "loss": 0.0302, + "num_input_tokens_seen": 143685456, + "step": 66590 + }, + { + "epoch": 10.86378466557912, + "grad_norm": 0.01754479669034481, + "learning_rate": 0.0005119001425850899, + "loss": 0.0212, + "num_input_tokens_seen": 143695536, + "step": 66595 + }, + { + "epoch": 10.864600326264274, + "grad_norm": 0.007191614247858524, + "learning_rate": 0.0005118289828049575, + "loss": 0.1607, + "num_input_tokens_seen": 143706672, + "step": 66600 + }, + { + "epoch": 10.86541598694943, + "grad_norm": 0.15472695231437683, + "learning_rate": 0.0005117578227850958, + "loss": 0.0265, + "num_input_tokens_seen": 143716976, + "step": 66605 + }, + { + "epoch": 10.866231647634583, + "grad_norm": 1.016304612159729, + "learning_rate": 0.000511686662526947, + "loss": 0.0999, + "num_input_tokens_seen": 143726352, + "step": 66610 + }, + { + "epoch": 10.867047308319739, + "grad_norm": 0.38180217146873474, + "learning_rate": 0.0005116155020319531, + "loss": 0.1055, + "num_input_tokens_seen": 143737808, + "step": 66615 + }, + { + "epoch": 10.867862969004895, + "grad_norm": 0.001972678815945983, + "learning_rate": 0.0005115443413015563, + "loss": 0.0179, + "num_input_tokens_seen": 143749680, + "step": 66620 + }, + { + "epoch": 10.868678629690049, + "grad_norm": 0.33859193325042725, + "learning_rate": 0.0005114731803371988, + "loss": 0.1153, + "num_input_tokens_seen": 143759056, + "step": 66625 + }, + { + "epoch": 10.869494290375204, + "grad_norm": 0.05384746193885803, + "learning_rate": 0.0005114020191403228, + "loss": 0.1376, + "num_input_tokens_seen": 143769744, + "step": 66630 + }, + { + "epoch": 10.870309951060358, + "grad_norm": 0.12118818610906601, + "learning_rate": 0.0005113308577123705, + "loss": 0.0323, + "num_input_tokens_seen": 143779504, + "step": 66635 + }, + { + "epoch": 10.871125611745514, + "grad_norm": 0.009494357742369175, + "learning_rate": 0.0005112596960547838, + "loss": 0.1376, + "num_input_tokens_seen": 143790096, + "step": 66640 + }, + { + "epoch": 10.87194127243067, + "grad_norm": 0.0076266746036708355, + "learning_rate": 0.0005111885341690051, + "loss": 0.1534, + "num_input_tokens_seen": 143801008, + "step": 66645 + }, + { + "epoch": 10.872756933115824, + "grad_norm": 0.003127814969047904, + "learning_rate": 0.0005111173720564767, + "loss": 0.0203, + "num_input_tokens_seen": 143811664, + "step": 66650 + }, + { + "epoch": 10.87357259380098, + "grad_norm": 0.12338805943727493, + "learning_rate": 0.0005110462097186405, + "loss": 0.0512, + "num_input_tokens_seen": 143822288, + "step": 66655 + }, + { + "epoch": 10.874388254486133, + "grad_norm": 0.02662845514714718, + "learning_rate": 0.0005109750471569388, + "loss": 0.0761, + "num_input_tokens_seen": 143833104, + "step": 66660 + }, + { + "epoch": 10.875203915171289, + "grad_norm": 0.04466772824525833, + "learning_rate": 0.000510903884372814, + "loss": 0.0159, + "num_input_tokens_seen": 143844336, + "step": 66665 + }, + { + "epoch": 10.876019575856443, + "grad_norm": 0.025245433673262596, + "learning_rate": 0.0005108327213677081, + "loss": 0.0137, + "num_input_tokens_seen": 143855216, + "step": 66670 + }, + { + "epoch": 10.876835236541599, + "grad_norm": 0.028527915477752686, + "learning_rate": 0.0005107615581430633, + "loss": 0.006, + "num_input_tokens_seen": 143865008, + "step": 66675 + }, + { + "epoch": 10.877650897226754, + "grad_norm": 0.2587251365184784, + "learning_rate": 0.0005106903947003221, + "loss": 0.1138, + "num_input_tokens_seen": 143875568, + "step": 66680 + }, + { + "epoch": 10.878466557911908, + "grad_norm": 0.3328368067741394, + "learning_rate": 0.0005106192310409263, + "loss": 0.0773, + "num_input_tokens_seen": 143886192, + "step": 66685 + }, + { + "epoch": 10.879282218597064, + "grad_norm": 0.03119209036231041, + "learning_rate": 0.0005105480671663183, + "loss": 0.1205, + "num_input_tokens_seen": 143896656, + "step": 66690 + }, + { + "epoch": 10.880097879282218, + "grad_norm": 0.41986408829689026, + "learning_rate": 0.0005104769030779404, + "loss": 0.0269, + "num_input_tokens_seen": 143906352, + "step": 66695 + }, + { + "epoch": 10.880913539967374, + "grad_norm": 0.05183674767613411, + "learning_rate": 0.0005104057387772347, + "loss": 0.0164, + "num_input_tokens_seen": 143917296, + "step": 66700 + }, + { + "epoch": 10.88172920065253, + "grad_norm": 0.022945918142795563, + "learning_rate": 0.0005103345742656437, + "loss": 0.058, + "num_input_tokens_seen": 143928080, + "step": 66705 + }, + { + "epoch": 10.882544861337683, + "grad_norm": 0.32585594058036804, + "learning_rate": 0.0005102634095446092, + "loss": 0.0583, + "num_input_tokens_seen": 143938992, + "step": 66710 + }, + { + "epoch": 10.883360522022839, + "grad_norm": 0.013700253330171108, + "learning_rate": 0.0005101922446155738, + "loss": 0.0075, + "num_input_tokens_seen": 143949840, + "step": 66715 + }, + { + "epoch": 10.884176182707993, + "grad_norm": 0.0030772751197218895, + "learning_rate": 0.0005101210794799797, + "loss": 0.0525, + "num_input_tokens_seen": 143961008, + "step": 66720 + }, + { + "epoch": 10.884991843393149, + "grad_norm": 0.0030525512993335724, + "learning_rate": 0.0005100499141392689, + "loss": 0.0389, + "num_input_tokens_seen": 143971888, + "step": 66725 + }, + { + "epoch": 10.885807504078304, + "grad_norm": 0.34103313088417053, + "learning_rate": 0.0005099787485948839, + "loss": 0.0517, + "num_input_tokens_seen": 143983312, + "step": 66730 + }, + { + "epoch": 10.886623164763458, + "grad_norm": 0.026073753833770752, + "learning_rate": 0.000509907582848267, + "loss": 0.0495, + "num_input_tokens_seen": 143992816, + "step": 66735 + }, + { + "epoch": 10.887438825448614, + "grad_norm": 0.07405105233192444, + "learning_rate": 0.0005098364169008604, + "loss": 0.0455, + "num_input_tokens_seen": 144005104, + "step": 66740 + }, + { + "epoch": 10.888254486133768, + "grad_norm": 0.05806545168161392, + "learning_rate": 0.0005097652507541062, + "loss": 0.0174, + "num_input_tokens_seen": 144015088, + "step": 66745 + }, + { + "epoch": 10.889070146818923, + "grad_norm": 0.04766848683357239, + "learning_rate": 0.0005096940844094467, + "loss": 0.0398, + "num_input_tokens_seen": 144026416, + "step": 66750 + }, + { + "epoch": 10.88988580750408, + "grad_norm": 0.003255144227296114, + "learning_rate": 0.0005096229178683244, + "loss": 0.0043, + "num_input_tokens_seen": 144035440, + "step": 66755 + }, + { + "epoch": 10.890701468189233, + "grad_norm": 0.47637036442756653, + "learning_rate": 0.0005095517511321815, + "loss": 0.0774, + "num_input_tokens_seen": 144046544, + "step": 66760 + }, + { + "epoch": 10.891517128874389, + "grad_norm": 0.019214171916246414, + "learning_rate": 0.0005094805842024603, + "loss": 0.177, + "num_input_tokens_seen": 144055984, + "step": 66765 + }, + { + "epoch": 10.892332789559543, + "grad_norm": 0.09027364104986191, + "learning_rate": 0.000509409417080603, + "loss": 0.117, + "num_input_tokens_seen": 144065744, + "step": 66770 + }, + { + "epoch": 10.893148450244698, + "grad_norm": 0.07732967287302017, + "learning_rate": 0.0005093382497680516, + "loss": 0.0342, + "num_input_tokens_seen": 144076112, + "step": 66775 + }, + { + "epoch": 10.893964110929852, + "grad_norm": 0.45761722326278687, + "learning_rate": 0.000509267082266249, + "loss": 0.0448, + "num_input_tokens_seen": 144087824, + "step": 66780 + }, + { + "epoch": 10.894779771615008, + "grad_norm": 0.40398043394088745, + "learning_rate": 0.0005091959145766373, + "loss": 0.0214, + "num_input_tokens_seen": 144098448, + "step": 66785 + }, + { + "epoch": 10.895595432300164, + "grad_norm": 0.14081457257270813, + "learning_rate": 0.0005091247467006588, + "loss": 0.1449, + "num_input_tokens_seen": 144108752, + "step": 66790 + }, + { + "epoch": 10.896411092985318, + "grad_norm": 0.0716458261013031, + "learning_rate": 0.0005090535786397556, + "loss": 0.0385, + "num_input_tokens_seen": 144118800, + "step": 66795 + }, + { + "epoch": 10.897226753670473, + "grad_norm": 0.0237293504178524, + "learning_rate": 0.0005089824103953701, + "loss": 0.0179, + "num_input_tokens_seen": 144129488, + "step": 66800 + }, + { + "epoch": 10.898042414355627, + "grad_norm": 0.025089818984270096, + "learning_rate": 0.0005089112419689447, + "loss": 0.0187, + "num_input_tokens_seen": 144140176, + "step": 66805 + }, + { + "epoch": 10.898858075040783, + "grad_norm": 0.11180520057678223, + "learning_rate": 0.0005088400733619217, + "loss": 0.0274, + "num_input_tokens_seen": 144150320, + "step": 66810 + }, + { + "epoch": 10.899673735725939, + "grad_norm": 0.10480131208896637, + "learning_rate": 0.0005087689045757433, + "loss": 0.0281, + "num_input_tokens_seen": 144162032, + "step": 66815 + }, + { + "epoch": 10.900489396411093, + "grad_norm": 0.00209427229128778, + "learning_rate": 0.000508697735611852, + "loss": 0.0141, + "num_input_tokens_seen": 144173136, + "step": 66820 + }, + { + "epoch": 10.901305057096248, + "grad_norm": 0.4861558675765991, + "learning_rate": 0.0005086265664716901, + "loss": 0.0379, + "num_input_tokens_seen": 144183760, + "step": 66825 + }, + { + "epoch": 10.902120717781402, + "grad_norm": 0.428994357585907, + "learning_rate": 0.0005085553971566998, + "loss": 0.0802, + "num_input_tokens_seen": 144194480, + "step": 66830 + }, + { + "epoch": 10.902936378466558, + "grad_norm": 0.31806638836860657, + "learning_rate": 0.0005084842276683236, + "loss": 0.044, + "num_input_tokens_seen": 144205680, + "step": 66835 + }, + { + "epoch": 10.903752039151712, + "grad_norm": 0.01797155663371086, + "learning_rate": 0.0005084130580080038, + "loss": 0.0317, + "num_input_tokens_seen": 144216560, + "step": 66840 + }, + { + "epoch": 10.904567699836868, + "grad_norm": 0.02310601994395256, + "learning_rate": 0.0005083418881771826, + "loss": 0.0214, + "num_input_tokens_seen": 144227472, + "step": 66845 + }, + { + "epoch": 10.905383360522023, + "grad_norm": 0.009685616008937359, + "learning_rate": 0.0005082707181773025, + "loss": 0.0323, + "num_input_tokens_seen": 144237552, + "step": 66850 + }, + { + "epoch": 10.906199021207177, + "grad_norm": 0.0057588424533605576, + "learning_rate": 0.0005081995480098057, + "loss": 0.0773, + "num_input_tokens_seen": 144248400, + "step": 66855 + }, + { + "epoch": 10.907014681892333, + "grad_norm": 0.026762094348669052, + "learning_rate": 0.0005081283776761348, + "loss": 0.0617, + "num_input_tokens_seen": 144259344, + "step": 66860 + }, + { + "epoch": 10.907830342577487, + "grad_norm": 0.6186009645462036, + "learning_rate": 0.0005080572071777319, + "loss": 0.1584, + "num_input_tokens_seen": 144269936, + "step": 66865 + }, + { + "epoch": 10.908646003262643, + "grad_norm": 0.07359057664871216, + "learning_rate": 0.0005079860365160395, + "loss": 0.1293, + "num_input_tokens_seen": 144279568, + "step": 66870 + }, + { + "epoch": 10.909461663947798, + "grad_norm": 0.004954596050083637, + "learning_rate": 0.0005079148656924999, + "loss": 0.02, + "num_input_tokens_seen": 144289168, + "step": 66875 + }, + { + "epoch": 10.910277324632952, + "grad_norm": 0.14001625776290894, + "learning_rate": 0.0005078436947085557, + "loss": 0.0176, + "num_input_tokens_seen": 144301104, + "step": 66880 + }, + { + "epoch": 10.911092985318108, + "grad_norm": 0.005923207849264145, + "learning_rate": 0.0005077725235656488, + "loss": 0.0192, + "num_input_tokens_seen": 144311664, + "step": 66885 + }, + { + "epoch": 10.911908646003262, + "grad_norm": 0.012273733504116535, + "learning_rate": 0.000507701352265222, + "loss": 0.0049, + "num_input_tokens_seen": 144322288, + "step": 66890 + }, + { + "epoch": 10.912724306688418, + "grad_norm": 0.0034822854213416576, + "learning_rate": 0.0005076301808087176, + "loss": 0.0124, + "num_input_tokens_seen": 144334448, + "step": 66895 + }, + { + "epoch": 10.913539967373573, + "grad_norm": 0.01847565360367298, + "learning_rate": 0.0005075590091975779, + "loss": 0.0082, + "num_input_tokens_seen": 144344208, + "step": 66900 + }, + { + "epoch": 10.914355628058727, + "grad_norm": 0.08559974282979965, + "learning_rate": 0.0005074878374332452, + "loss": 0.0655, + "num_input_tokens_seen": 144355120, + "step": 66905 + }, + { + "epoch": 10.915171288743883, + "grad_norm": 0.008468263782560825, + "learning_rate": 0.000507416665517162, + "loss": 0.0443, + "num_input_tokens_seen": 144366864, + "step": 66910 + }, + { + "epoch": 10.915986949429037, + "grad_norm": 0.13310398161411285, + "learning_rate": 0.0005073454934507708, + "loss": 0.023, + "num_input_tokens_seen": 144377552, + "step": 66915 + }, + { + "epoch": 10.916802610114193, + "grad_norm": 0.05763646587729454, + "learning_rate": 0.0005072743212355135, + "loss": 0.067, + "num_input_tokens_seen": 144388208, + "step": 66920 + }, + { + "epoch": 10.917618270799348, + "grad_norm": 0.039822161197662354, + "learning_rate": 0.0005072031488728331, + "loss": 0.0057, + "num_input_tokens_seen": 144398192, + "step": 66925 + }, + { + "epoch": 10.918433931484502, + "grad_norm": 0.006425980478525162, + "learning_rate": 0.0005071319763641718, + "loss": 0.0788, + "num_input_tokens_seen": 144408304, + "step": 66930 + }, + { + "epoch": 10.919249592169658, + "grad_norm": 0.021236395463347435, + "learning_rate": 0.0005070608037109718, + "loss": 0.122, + "num_input_tokens_seen": 144418160, + "step": 66935 + }, + { + "epoch": 10.920065252854812, + "grad_norm": 0.0026218383572995663, + "learning_rate": 0.0005069896309146758, + "loss": 0.1728, + "num_input_tokens_seen": 144428304, + "step": 66940 + }, + { + "epoch": 10.920880913539968, + "grad_norm": 0.0402076430618763, + "learning_rate": 0.000506918457976726, + "loss": 0.0108, + "num_input_tokens_seen": 144438864, + "step": 66945 + }, + { + "epoch": 10.921696574225122, + "grad_norm": 0.06451573222875595, + "learning_rate": 0.0005068472848985647, + "loss": 0.0198, + "num_input_tokens_seen": 144448720, + "step": 66950 + }, + { + "epoch": 10.922512234910277, + "grad_norm": 0.002615903038531542, + "learning_rate": 0.0005067761116816348, + "loss": 0.1013, + "num_input_tokens_seen": 144459248, + "step": 66955 + }, + { + "epoch": 10.923327895595433, + "grad_norm": 0.01171022653579712, + "learning_rate": 0.0005067049383273783, + "loss": 0.0217, + "num_input_tokens_seen": 144468912, + "step": 66960 + }, + { + "epoch": 10.924143556280587, + "grad_norm": 0.0021881558932363987, + "learning_rate": 0.0005066337648372376, + "loss": 0.0617, + "num_input_tokens_seen": 144479920, + "step": 66965 + }, + { + "epoch": 10.924959216965743, + "grad_norm": 0.6424474120140076, + "learning_rate": 0.0005065625912126553, + "loss": 0.0706, + "num_input_tokens_seen": 144490320, + "step": 66970 + }, + { + "epoch": 10.925774877650896, + "grad_norm": 0.46644723415374756, + "learning_rate": 0.0005064914174550737, + "loss": 0.117, + "num_input_tokens_seen": 144500240, + "step": 66975 + }, + { + "epoch": 10.926590538336052, + "grad_norm": 0.043599795550107956, + "learning_rate": 0.0005064202435659354, + "loss": 0.099, + "num_input_tokens_seen": 144511472, + "step": 66980 + }, + { + "epoch": 10.927406199021208, + "grad_norm": 0.005844366271048784, + "learning_rate": 0.0005063490695466827, + "loss": 0.0054, + "num_input_tokens_seen": 144522000, + "step": 66985 + }, + { + "epoch": 10.928221859706362, + "grad_norm": 0.41834548115730286, + "learning_rate": 0.000506277895398758, + "loss": 0.0868, + "num_input_tokens_seen": 144533552, + "step": 66990 + }, + { + "epoch": 10.929037520391518, + "grad_norm": 0.46168261766433716, + "learning_rate": 0.0005062067211236039, + "loss": 0.0915, + "num_input_tokens_seen": 144545648, + "step": 66995 + }, + { + "epoch": 10.929853181076671, + "grad_norm": 0.23938803374767303, + "learning_rate": 0.0005061355467226626, + "loss": 0.0569, + "num_input_tokens_seen": 144556784, + "step": 67000 + }, + { + "epoch": 10.930668841761827, + "grad_norm": 0.00585373817011714, + "learning_rate": 0.0005060643721973766, + "loss": 0.0466, + "num_input_tokens_seen": 144567504, + "step": 67005 + }, + { + "epoch": 10.931484502446983, + "grad_norm": 0.034302983433008194, + "learning_rate": 0.0005059931975491886, + "loss": 0.0407, + "num_input_tokens_seen": 144579120, + "step": 67010 + }, + { + "epoch": 10.932300163132137, + "grad_norm": 0.014970509335398674, + "learning_rate": 0.0005059220227795409, + "loss": 0.0196, + "num_input_tokens_seen": 144590000, + "step": 67015 + }, + { + "epoch": 10.933115823817293, + "grad_norm": 0.029071809723973274, + "learning_rate": 0.0005058508478898757, + "loss": 0.0977, + "num_input_tokens_seen": 144600176, + "step": 67020 + }, + { + "epoch": 10.933931484502446, + "grad_norm": 0.002314510755240917, + "learning_rate": 0.0005057796728816358, + "loss": 0.0096, + "num_input_tokens_seen": 144611888, + "step": 67025 + }, + { + "epoch": 10.934747145187602, + "grad_norm": 0.025747835636138916, + "learning_rate": 0.0005057084977562633, + "loss": 0.0216, + "num_input_tokens_seen": 144622544, + "step": 67030 + }, + { + "epoch": 10.935562805872756, + "grad_norm": 0.32044142484664917, + "learning_rate": 0.0005056373225152009, + "loss": 0.1329, + "num_input_tokens_seen": 144633392, + "step": 67035 + }, + { + "epoch": 10.936378466557912, + "grad_norm": 0.07185223698616028, + "learning_rate": 0.0005055661471598911, + "loss": 0.0121, + "num_input_tokens_seen": 144644112, + "step": 67040 + }, + { + "epoch": 10.937194127243067, + "grad_norm": 0.15952840447425842, + "learning_rate": 0.0005054949716917763, + "loss": 0.0621, + "num_input_tokens_seen": 144654768, + "step": 67045 + }, + { + "epoch": 10.938009787928221, + "grad_norm": 0.19442293047904968, + "learning_rate": 0.0005054237961122989, + "loss": 0.0667, + "num_input_tokens_seen": 144665264, + "step": 67050 + }, + { + "epoch": 10.938825448613377, + "grad_norm": 0.009865481406450272, + "learning_rate": 0.0005053526204229012, + "loss": 0.0298, + "num_input_tokens_seen": 144676720, + "step": 67055 + }, + { + "epoch": 10.939641109298531, + "grad_norm": 0.015966929495334625, + "learning_rate": 0.000505281444625026, + "loss": 0.1278, + "num_input_tokens_seen": 144688880, + "step": 67060 + }, + { + "epoch": 10.940456769983687, + "grad_norm": 0.3035365641117096, + "learning_rate": 0.0005052102687201156, + "loss": 0.0573, + "num_input_tokens_seen": 144699824, + "step": 67065 + }, + { + "epoch": 10.941272430668842, + "grad_norm": 0.23122626543045044, + "learning_rate": 0.0005051390927096125, + "loss": 0.0413, + "num_input_tokens_seen": 144710256, + "step": 67070 + }, + { + "epoch": 10.942088091353996, + "grad_norm": 0.011138672940433025, + "learning_rate": 0.0005050679165949592, + "loss": 0.0322, + "num_input_tokens_seen": 144722000, + "step": 67075 + }, + { + "epoch": 10.942903752039152, + "grad_norm": 0.009787128306925297, + "learning_rate": 0.0005049967403775982, + "loss": 0.0054, + "num_input_tokens_seen": 144732528, + "step": 67080 + }, + { + "epoch": 10.943719412724306, + "grad_norm": 0.039076391607522964, + "learning_rate": 0.0005049255640589718, + "loss": 0.0071, + "num_input_tokens_seen": 144744176, + "step": 67085 + }, + { + "epoch": 10.944535073409462, + "grad_norm": 0.0067952899262309074, + "learning_rate": 0.0005048543876405225, + "loss": 0.0164, + "num_input_tokens_seen": 144754832, + "step": 67090 + }, + { + "epoch": 10.945350734094617, + "grad_norm": 0.019310537725687027, + "learning_rate": 0.000504783211123693, + "loss": 0.0135, + "num_input_tokens_seen": 144765808, + "step": 67095 + }, + { + "epoch": 10.946166394779771, + "grad_norm": 0.008801273070275784, + "learning_rate": 0.0005047120345099258, + "loss": 0.0452, + "num_input_tokens_seen": 144776912, + "step": 67100 + }, + { + "epoch": 10.946982055464927, + "grad_norm": 0.15680472552776337, + "learning_rate": 0.0005046408578006631, + "loss": 0.0572, + "num_input_tokens_seen": 144788240, + "step": 67105 + }, + { + "epoch": 10.947797716150081, + "grad_norm": 0.06560293585062027, + "learning_rate": 0.0005045696809973474, + "loss": 0.0931, + "num_input_tokens_seen": 144796816, + "step": 67110 + }, + { + "epoch": 10.948613376835237, + "grad_norm": 0.02397521212697029, + "learning_rate": 0.0005044985041014217, + "loss": 0.0124, + "num_input_tokens_seen": 144807440, + "step": 67115 + }, + { + "epoch": 10.949429037520392, + "grad_norm": 0.008901095017790794, + "learning_rate": 0.0005044273271143277, + "loss": 0.0172, + "num_input_tokens_seen": 144819056, + "step": 67120 + }, + { + "epoch": 10.950244698205546, + "grad_norm": 0.014831587672233582, + "learning_rate": 0.0005043561500375085, + "loss": 0.0129, + "num_input_tokens_seen": 144829872, + "step": 67125 + }, + { + "epoch": 10.951060358890702, + "grad_norm": 0.07329477369785309, + "learning_rate": 0.0005042849728724064, + "loss": 0.0196, + "num_input_tokens_seen": 144839984, + "step": 67130 + }, + { + "epoch": 10.951876019575856, + "grad_norm": 0.006357451435178518, + "learning_rate": 0.0005042137956204639, + "loss": 0.0083, + "num_input_tokens_seen": 144850640, + "step": 67135 + }, + { + "epoch": 10.952691680261012, + "grad_norm": 0.0013050257693976164, + "learning_rate": 0.0005041426182831233, + "loss": 0.0087, + "num_input_tokens_seen": 144861456, + "step": 67140 + }, + { + "epoch": 10.953507340946166, + "grad_norm": 0.01807921938598156, + "learning_rate": 0.0005040714408618275, + "loss": 0.0611, + "num_input_tokens_seen": 144872176, + "step": 67145 + }, + { + "epoch": 10.954323001631321, + "grad_norm": 0.08510637283325195, + "learning_rate": 0.0005040002633580188, + "loss": 0.0118, + "num_input_tokens_seen": 144882736, + "step": 67150 + }, + { + "epoch": 10.955138662316477, + "grad_norm": 0.010647742077708244, + "learning_rate": 0.0005039290857731395, + "loss": 0.0206, + "num_input_tokens_seen": 144893328, + "step": 67155 + }, + { + "epoch": 10.955954323001631, + "grad_norm": 1.3817169666290283, + "learning_rate": 0.0005038579081086324, + "loss": 0.0521, + "num_input_tokens_seen": 144903728, + "step": 67160 + }, + { + "epoch": 10.956769983686787, + "grad_norm": 0.09943846613168716, + "learning_rate": 0.0005037867303659399, + "loss": 0.0617, + "num_input_tokens_seen": 144914512, + "step": 67165 + }, + { + "epoch": 10.95758564437194, + "grad_norm": 0.11593145877122879, + "learning_rate": 0.0005037155525465046, + "loss": 0.0142, + "num_input_tokens_seen": 144925904, + "step": 67170 + }, + { + "epoch": 10.958401305057096, + "grad_norm": 0.0029992684721946716, + "learning_rate": 0.0005036443746517688, + "loss": 0.0439, + "num_input_tokens_seen": 144938352, + "step": 67175 + }, + { + "epoch": 10.959216965742252, + "grad_norm": 0.07130617648363113, + "learning_rate": 0.0005035731966831752, + "loss": 0.018, + "num_input_tokens_seen": 144949072, + "step": 67180 + }, + { + "epoch": 10.960032626427406, + "grad_norm": 0.1574365347623825, + "learning_rate": 0.0005035020186421661, + "loss": 0.0124, + "num_input_tokens_seen": 144959248, + "step": 67185 + }, + { + "epoch": 10.960848287112562, + "grad_norm": 0.001897152280434966, + "learning_rate": 0.0005034308405301842, + "loss": 0.0094, + "num_input_tokens_seen": 144969168, + "step": 67190 + }, + { + "epoch": 10.961663947797716, + "grad_norm": 0.022572826594114304, + "learning_rate": 0.0005033596623486719, + "loss": 0.0056, + "num_input_tokens_seen": 144979984, + "step": 67195 + }, + { + "epoch": 10.962479608482871, + "grad_norm": 0.0050385380163788795, + "learning_rate": 0.0005032884840990719, + "loss": 0.1133, + "num_input_tokens_seen": 144990672, + "step": 67200 + }, + { + "epoch": 10.963295269168025, + "grad_norm": 0.00584006542339921, + "learning_rate": 0.0005032173057828265, + "loss": 0.0115, + "num_input_tokens_seen": 145001616, + "step": 67205 + }, + { + "epoch": 10.964110929853181, + "grad_norm": 0.0033857645466923714, + "learning_rate": 0.0005031461274013784, + "loss": 0.0156, + "num_input_tokens_seen": 145012400, + "step": 67210 + }, + { + "epoch": 10.964926590538337, + "grad_norm": 0.18775081634521484, + "learning_rate": 0.0005030749489561701, + "loss": 0.0248, + "num_input_tokens_seen": 145023408, + "step": 67215 + }, + { + "epoch": 10.96574225122349, + "grad_norm": 0.01748412474989891, + "learning_rate": 0.000503003770448644, + "loss": 0.0066, + "num_input_tokens_seen": 145034576, + "step": 67220 + }, + { + "epoch": 10.966557911908646, + "grad_norm": 0.017225490882992744, + "learning_rate": 0.0005029325918802426, + "loss": 0.0176, + "num_input_tokens_seen": 145045616, + "step": 67225 + }, + { + "epoch": 10.9673735725938, + "grad_norm": 0.005488485097885132, + "learning_rate": 0.0005028614132524085, + "loss": 0.0093, + "num_input_tokens_seen": 145055696, + "step": 67230 + }, + { + "epoch": 10.968189233278956, + "grad_norm": 0.003235048381611705, + "learning_rate": 0.0005027902345665843, + "loss": 0.0369, + "num_input_tokens_seen": 145066896, + "step": 67235 + }, + { + "epoch": 10.969004893964112, + "grad_norm": 0.0007825758657418191, + "learning_rate": 0.0005027190558242124, + "loss": 0.0121, + "num_input_tokens_seen": 145077744, + "step": 67240 + }, + { + "epoch": 10.969820554649266, + "grad_norm": 0.15452887117862701, + "learning_rate": 0.0005026478770267355, + "loss": 0.0265, + "num_input_tokens_seen": 145087824, + "step": 67245 + }, + { + "epoch": 10.970636215334421, + "grad_norm": 0.6206972599029541, + "learning_rate": 0.0005025766981755959, + "loss": 0.112, + "num_input_tokens_seen": 145099824, + "step": 67250 + }, + { + "epoch": 10.971451876019575, + "grad_norm": 0.41689279675483704, + "learning_rate": 0.0005025055192722363, + "loss": 0.1636, + "num_input_tokens_seen": 145110512, + "step": 67255 + }, + { + "epoch": 10.97226753670473, + "grad_norm": 0.02389301359653473, + "learning_rate": 0.0005024343403180992, + "loss": 0.0344, + "num_input_tokens_seen": 145121488, + "step": 67260 + }, + { + "epoch": 10.973083197389887, + "grad_norm": 0.29642441868782043, + "learning_rate": 0.0005023631613146272, + "loss": 0.025, + "num_input_tokens_seen": 145132112, + "step": 67265 + }, + { + "epoch": 10.97389885807504, + "grad_norm": 0.571780264377594, + "learning_rate": 0.0005022919822632625, + "loss": 0.0617, + "num_input_tokens_seen": 145142736, + "step": 67270 + }, + { + "epoch": 10.974714518760196, + "grad_norm": 0.07872825860977173, + "learning_rate": 0.0005022208031654479, + "loss": 0.0951, + "num_input_tokens_seen": 145152528, + "step": 67275 + }, + { + "epoch": 10.97553017944535, + "grad_norm": 0.004459878895431757, + "learning_rate": 0.0005021496240226261, + "loss": 0.0031, + "num_input_tokens_seen": 145163280, + "step": 67280 + }, + { + "epoch": 10.976345840130506, + "grad_norm": 0.0033729930873960257, + "learning_rate": 0.0005020784448362393, + "loss": 0.0241, + "num_input_tokens_seen": 145173840, + "step": 67285 + }, + { + "epoch": 10.977161500815662, + "grad_norm": 0.002414536429569125, + "learning_rate": 0.0005020072656077302, + "loss": 0.0589, + "num_input_tokens_seen": 145183088, + "step": 67290 + }, + { + "epoch": 10.977977161500815, + "grad_norm": 0.4573877155780792, + "learning_rate": 0.0005019360863385413, + "loss": 0.0252, + "num_input_tokens_seen": 145193776, + "step": 67295 + }, + { + "epoch": 10.978792822185971, + "grad_norm": 0.002899870742112398, + "learning_rate": 0.0005018649070301152, + "loss": 0.0305, + "num_input_tokens_seen": 145205552, + "step": 67300 + }, + { + "epoch": 10.979608482871125, + "grad_norm": 0.024911347776651382, + "learning_rate": 0.0005017937276838943, + "loss": 0.0065, + "num_input_tokens_seen": 145215504, + "step": 67305 + }, + { + "epoch": 10.98042414355628, + "grad_norm": 0.5657548308372498, + "learning_rate": 0.0005017225483013212, + "loss": 0.0275, + "num_input_tokens_seen": 145226064, + "step": 67310 + }, + { + "epoch": 10.981239804241435, + "grad_norm": 0.2869049608707428, + "learning_rate": 0.0005016513688838387, + "loss": 0.152, + "num_input_tokens_seen": 145236944, + "step": 67315 + }, + { + "epoch": 10.98205546492659, + "grad_norm": 0.013517703860998154, + "learning_rate": 0.0005015801894328889, + "loss": 0.0213, + "num_input_tokens_seen": 145247856, + "step": 67320 + }, + { + "epoch": 10.982871125611746, + "grad_norm": 0.013307986781001091, + "learning_rate": 0.0005015090099499147, + "loss": 0.0651, + "num_input_tokens_seen": 145259344, + "step": 67325 + }, + { + "epoch": 10.9836867862969, + "grad_norm": 0.0020879863295704126, + "learning_rate": 0.0005014378304363584, + "loss": 0.0806, + "num_input_tokens_seen": 145269648, + "step": 67330 + }, + { + "epoch": 10.984502446982056, + "grad_norm": 0.021404843777418137, + "learning_rate": 0.0005013666508936627, + "loss": 0.0068, + "num_input_tokens_seen": 145280304, + "step": 67335 + }, + { + "epoch": 10.98531810766721, + "grad_norm": 0.042149920016527176, + "learning_rate": 0.0005012954713232701, + "loss": 0.0621, + "num_input_tokens_seen": 145292208, + "step": 67340 + }, + { + "epoch": 10.986133768352365, + "grad_norm": 0.008459716103971004, + "learning_rate": 0.0005012242917266232, + "loss": 0.0049, + "num_input_tokens_seen": 145303024, + "step": 67345 + }, + { + "epoch": 10.986949429037521, + "grad_norm": 0.01330799050629139, + "learning_rate": 0.0005011531121051643, + "loss": 0.0354, + "num_input_tokens_seen": 145313296, + "step": 67350 + }, + { + "epoch": 10.987765089722675, + "grad_norm": 0.005794075783342123, + "learning_rate": 0.0005010819324603363, + "loss": 0.0088, + "num_input_tokens_seen": 145324208, + "step": 67355 + }, + { + "epoch": 10.98858075040783, + "grad_norm": 0.021925779059529305, + "learning_rate": 0.0005010107527935815, + "loss": 0.0079, + "num_input_tokens_seen": 145334000, + "step": 67360 + }, + { + "epoch": 10.989396411092985, + "grad_norm": 0.03230549767613411, + "learning_rate": 0.0005009395731063424, + "loss": 0.0121, + "num_input_tokens_seen": 145346128, + "step": 67365 + }, + { + "epoch": 10.99021207177814, + "grad_norm": 0.10584338009357452, + "learning_rate": 0.0005008683934000618, + "loss": 0.01, + "num_input_tokens_seen": 145357264, + "step": 67370 + }, + { + "epoch": 10.991027732463294, + "grad_norm": 0.01694609224796295, + "learning_rate": 0.000500797213676182, + "loss": 0.0236, + "num_input_tokens_seen": 145368592, + "step": 67375 + }, + { + "epoch": 10.99184339314845, + "grad_norm": 0.006252037361264229, + "learning_rate": 0.0005007260339361456, + "loss": 0.0511, + "num_input_tokens_seen": 145378576, + "step": 67380 + }, + { + "epoch": 10.992659053833606, + "grad_norm": 0.3086402416229248, + "learning_rate": 0.0005006548541813953, + "loss": 0.0347, + "num_input_tokens_seen": 145389296, + "step": 67385 + }, + { + "epoch": 10.99347471451876, + "grad_norm": 0.006644485052675009, + "learning_rate": 0.0005005836744133736, + "loss": 0.0592, + "num_input_tokens_seen": 145400624, + "step": 67390 + }, + { + "epoch": 10.994290375203915, + "grad_norm": 0.006041113752871752, + "learning_rate": 0.0005005124946335229, + "loss": 0.007, + "num_input_tokens_seen": 145411760, + "step": 67395 + }, + { + "epoch": 10.99510603588907, + "grad_norm": 0.03510593995451927, + "learning_rate": 0.0005004413148432859, + "loss": 0.0992, + "num_input_tokens_seen": 145422544, + "step": 67400 + }, + { + "epoch": 10.995921696574225, + "grad_norm": 0.015384171158075333, + "learning_rate": 0.000500370135044105, + "loss": 0.0125, + "num_input_tokens_seen": 145433840, + "step": 67405 + }, + { + "epoch": 10.99673735725938, + "grad_norm": 0.2766803503036499, + "learning_rate": 0.000500298955237423, + "loss": 0.0472, + "num_input_tokens_seen": 145444560, + "step": 67410 + }, + { + "epoch": 10.997553017944535, + "grad_norm": 0.03336824104189873, + "learning_rate": 0.0005002277754246822, + "loss": 0.1064, + "num_input_tokens_seen": 145455504, + "step": 67415 + }, + { + "epoch": 10.99836867862969, + "grad_norm": 0.002002457622438669, + "learning_rate": 0.0005001565956073252, + "loss": 0.0417, + "num_input_tokens_seen": 145466928, + "step": 67420 + }, + { + "epoch": 10.999184339314844, + "grad_norm": 0.023544834926724434, + "learning_rate": 0.0005000854157867947, + "loss": 0.0078, + "num_input_tokens_seen": 145476880, + "step": 67425 + }, + { + "epoch": 11.0, + "grad_norm": 0.7011199593544006, + "learning_rate": 0.0005000142359645331, + "loss": 0.0968, + "num_input_tokens_seen": 145487264, + "step": 67430 + }, + { + "epoch": 11.0, + "eval_loss": 0.1396159529685974, + "eval_runtime": 104.3774, + "eval_samples_per_second": 26.107, + "eval_steps_per_second": 6.534, + "num_input_tokens_seen": 145487264, + "step": 67430 + }, + { + "epoch": 11.000815660685156, + "grad_norm": 0.017383918166160583, + "learning_rate": 0.0004999430561419831, + "loss": 0.0217, + "num_input_tokens_seen": 145498432, + "step": 67435 + }, + { + "epoch": 11.00163132137031, + "grad_norm": 0.015165042132139206, + "learning_rate": 0.000499871876320587, + "loss": 0.0147, + "num_input_tokens_seen": 145508448, + "step": 67440 + }, + { + "epoch": 11.002446982055465, + "grad_norm": 0.09373845905065536, + "learning_rate": 0.0004998006965017876, + "loss": 0.0232, + "num_input_tokens_seen": 145518944, + "step": 67445 + }, + { + "epoch": 11.00326264274062, + "grad_norm": 0.10055181384086609, + "learning_rate": 0.0004997295166870271, + "loss": 0.0298, + "num_input_tokens_seen": 145529664, + "step": 67450 + }, + { + "epoch": 11.004078303425775, + "grad_norm": 0.0021532673854380846, + "learning_rate": 0.0004996583368777484, + "loss": 0.0529, + "num_input_tokens_seen": 145542240, + "step": 67455 + }, + { + "epoch": 11.00489396411093, + "grad_norm": 0.015353074297308922, + "learning_rate": 0.000499587157075394, + "loss": 0.0067, + "num_input_tokens_seen": 145551520, + "step": 67460 + }, + { + "epoch": 11.005709624796085, + "grad_norm": 0.002881969790905714, + "learning_rate": 0.0004995159772814063, + "loss": 0.0278, + "num_input_tokens_seen": 145562112, + "step": 67465 + }, + { + "epoch": 11.00652528548124, + "grad_norm": 0.019344991073012352, + "learning_rate": 0.0004994447974972281, + "loss": 0.0095, + "num_input_tokens_seen": 145574208, + "step": 67470 + }, + { + "epoch": 11.007340946166394, + "grad_norm": 0.012570159509778023, + "learning_rate": 0.0004993736177243016, + "loss": 0.0796, + "num_input_tokens_seen": 145585184, + "step": 67475 + }, + { + "epoch": 11.00815660685155, + "grad_norm": 0.008184601552784443, + "learning_rate": 0.0004993024379640697, + "loss": 0.0415, + "num_input_tokens_seen": 145596192, + "step": 67480 + }, + { + "epoch": 11.008972267536704, + "grad_norm": 0.3090513050556183, + "learning_rate": 0.0004992312582179746, + "loss": 0.0455, + "num_input_tokens_seen": 145606304, + "step": 67485 + }, + { + "epoch": 11.00978792822186, + "grad_norm": 0.004438013304024935, + "learning_rate": 0.0004991600784874593, + "loss": 0.0214, + "num_input_tokens_seen": 145616800, + "step": 67490 + }, + { + "epoch": 11.010603588907015, + "grad_norm": 0.04514636844396591, + "learning_rate": 0.0004990888987739657, + "loss": 0.0117, + "num_input_tokens_seen": 145626784, + "step": 67495 + }, + { + "epoch": 11.01141924959217, + "grad_norm": 0.06899908185005188, + "learning_rate": 0.0004990177190789371, + "loss": 0.0375, + "num_input_tokens_seen": 145638048, + "step": 67500 + }, + { + "epoch": 11.012234910277325, + "grad_norm": 0.0006322893314063549, + "learning_rate": 0.0004989465394038153, + "loss": 0.0148, + "num_input_tokens_seen": 145648512, + "step": 67505 + }, + { + "epoch": 11.013050570962479, + "grad_norm": 0.004567243158817291, + "learning_rate": 0.0004988753597500435, + "loss": 0.005, + "num_input_tokens_seen": 145659008, + "step": 67510 + }, + { + "epoch": 11.013866231647635, + "grad_norm": 0.00813292246311903, + "learning_rate": 0.0004988041801190638, + "loss": 0.0238, + "num_input_tokens_seen": 145670240, + "step": 67515 + }, + { + "epoch": 11.01468189233279, + "grad_norm": 0.0021527372300624847, + "learning_rate": 0.000498733000512319, + "loss": 0.1107, + "num_input_tokens_seen": 145680832, + "step": 67520 + }, + { + "epoch": 11.015497553017944, + "grad_norm": 0.39226627349853516, + "learning_rate": 0.0004986618209312515, + "loss": 0.0341, + "num_input_tokens_seen": 145691360, + "step": 67525 + }, + { + "epoch": 11.0163132137031, + "grad_norm": 0.02548142895102501, + "learning_rate": 0.000498590641377304, + "loss": 0.0149, + "num_input_tokens_seen": 145702304, + "step": 67530 + }, + { + "epoch": 11.017128874388254, + "grad_norm": 0.005212225951254368, + "learning_rate": 0.0004985194618519188, + "loss": 0.0055, + "num_input_tokens_seen": 145714720, + "step": 67535 + }, + { + "epoch": 11.01794453507341, + "grad_norm": 0.001658056047745049, + "learning_rate": 0.0004984482823565386, + "loss": 0.0583, + "num_input_tokens_seen": 145726048, + "step": 67540 + }, + { + "epoch": 11.018760195758565, + "grad_norm": 0.45634788274765015, + "learning_rate": 0.0004983771028926059, + "loss": 0.1214, + "num_input_tokens_seen": 145736000, + "step": 67545 + }, + { + "epoch": 11.01957585644372, + "grad_norm": 0.0032540587708353996, + "learning_rate": 0.0004983059234615635, + "loss": 0.0585, + "num_input_tokens_seen": 145746112, + "step": 67550 + }, + { + "epoch": 11.020391517128875, + "grad_norm": 0.13170619308948517, + "learning_rate": 0.0004982347440648534, + "loss": 0.0134, + "num_input_tokens_seen": 145757664, + "step": 67555 + }, + { + "epoch": 11.021207177814029, + "grad_norm": 0.00873930100351572, + "learning_rate": 0.0004981635647039186, + "loss": 0.0159, + "num_input_tokens_seen": 145768224, + "step": 67560 + }, + { + "epoch": 11.022022838499185, + "grad_norm": 0.015919003635644913, + "learning_rate": 0.0004980923853802015, + "loss": 0.0088, + "num_input_tokens_seen": 145780160, + "step": 67565 + }, + { + "epoch": 11.022838499184338, + "grad_norm": 0.005639873910695314, + "learning_rate": 0.0004980212060951447, + "loss": 0.0127, + "num_input_tokens_seen": 145790848, + "step": 67570 + }, + { + "epoch": 11.023654159869494, + "grad_norm": 0.00510849105194211, + "learning_rate": 0.0004979500268501905, + "loss": 0.029, + "num_input_tokens_seen": 145801344, + "step": 67575 + }, + { + "epoch": 11.02446982055465, + "grad_norm": 0.001798538607545197, + "learning_rate": 0.0004978788476467816, + "loss": 0.0043, + "num_input_tokens_seen": 145811680, + "step": 67580 + }, + { + "epoch": 11.025285481239804, + "grad_norm": 0.004775770474225283, + "learning_rate": 0.0004978076684863607, + "loss": 0.0531, + "num_input_tokens_seen": 145822784, + "step": 67585 + }, + { + "epoch": 11.02610114192496, + "grad_norm": 0.0004272933001630008, + "learning_rate": 0.0004977364893703701, + "loss": 0.1395, + "num_input_tokens_seen": 145833792, + "step": 67590 + }, + { + "epoch": 11.026916802610113, + "grad_norm": 0.004037778824567795, + "learning_rate": 0.0004976653103002526, + "loss": 0.0222, + "num_input_tokens_seen": 145844384, + "step": 67595 + }, + { + "epoch": 11.02773246329527, + "grad_norm": 0.008160573430359364, + "learning_rate": 0.0004975941312774502, + "loss": 0.0264, + "num_input_tokens_seen": 145855648, + "step": 67600 + }, + { + "epoch": 11.028548123980425, + "grad_norm": 0.0008483852725476027, + "learning_rate": 0.0004975229523034061, + "loss": 0.1407, + "num_input_tokens_seen": 145865792, + "step": 67605 + }, + { + "epoch": 11.029363784665579, + "grad_norm": 0.153411403298378, + "learning_rate": 0.0004974517733795623, + "loss": 0.0142, + "num_input_tokens_seen": 145875200, + "step": 67610 + }, + { + "epoch": 11.030179445350734, + "grad_norm": 0.0029859000351279974, + "learning_rate": 0.0004973805945073617, + "loss": 0.0114, + "num_input_tokens_seen": 145886912, + "step": 67615 + }, + { + "epoch": 11.030995106035888, + "grad_norm": 0.012036247178912163, + "learning_rate": 0.0004973094156882466, + "loss": 0.0049, + "num_input_tokens_seen": 145898176, + "step": 67620 + }, + { + "epoch": 11.031810766721044, + "grad_norm": 0.021320773288607597, + "learning_rate": 0.0004972382369236596, + "loss": 0.0122, + "num_input_tokens_seen": 145908128, + "step": 67625 + }, + { + "epoch": 11.0326264274062, + "grad_norm": 0.28655901551246643, + "learning_rate": 0.0004971670582150431, + "loss": 0.0181, + "num_input_tokens_seen": 145920544, + "step": 67630 + }, + { + "epoch": 11.033442088091354, + "grad_norm": 0.6762530207633972, + "learning_rate": 0.0004970958795638401, + "loss": 0.0246, + "num_input_tokens_seen": 145930208, + "step": 67635 + }, + { + "epoch": 11.03425774877651, + "grad_norm": 0.021230272948741913, + "learning_rate": 0.0004970247009714924, + "loss": 0.1085, + "num_input_tokens_seen": 145941536, + "step": 67640 + }, + { + "epoch": 11.035073409461663, + "grad_norm": 0.29068219661712646, + "learning_rate": 0.0004969535224394432, + "loss": 0.0453, + "num_input_tokens_seen": 145953312, + "step": 67645 + }, + { + "epoch": 11.035889070146819, + "grad_norm": 0.13652829825878143, + "learning_rate": 0.0004968823439691346, + "loss": 0.0136, + "num_input_tokens_seen": 145963456, + "step": 67650 + }, + { + "epoch": 11.036704730831975, + "grad_norm": 0.004439199808984995, + "learning_rate": 0.0004968111655620093, + "loss": 0.0352, + "num_input_tokens_seen": 145975008, + "step": 67655 + }, + { + "epoch": 11.037520391517129, + "grad_norm": 0.01866345852613449, + "learning_rate": 0.0004967399872195096, + "loss": 0.1574, + "num_input_tokens_seen": 145984928, + "step": 67660 + }, + { + "epoch": 11.038336052202284, + "grad_norm": 0.490433007478714, + "learning_rate": 0.0004966688089430785, + "loss": 0.0389, + "num_input_tokens_seen": 145995584, + "step": 67665 + }, + { + "epoch": 11.039151712887438, + "grad_norm": 0.014313768595457077, + "learning_rate": 0.000496597630734158, + "loss": 0.0439, + "num_input_tokens_seen": 146006368, + "step": 67670 + }, + { + "epoch": 11.039967373572594, + "grad_norm": 0.22988677024841309, + "learning_rate": 0.0004965264525941908, + "loss": 0.0818, + "num_input_tokens_seen": 146017184, + "step": 67675 + }, + { + "epoch": 11.040783034257748, + "grad_norm": 0.0042504072189331055, + "learning_rate": 0.0004964552745246196, + "loss": 0.0068, + "num_input_tokens_seen": 146027392, + "step": 67680 + }, + { + "epoch": 11.041598694942904, + "grad_norm": 0.009536768309772015, + "learning_rate": 0.0004963840965268866, + "loss": 0.0136, + "num_input_tokens_seen": 146037280, + "step": 67685 + }, + { + "epoch": 11.04241435562806, + "grad_norm": 0.08847548067569733, + "learning_rate": 0.0004963129186024346, + "loss": 0.0315, + "num_input_tokens_seen": 146048640, + "step": 67690 + }, + { + "epoch": 11.043230016313213, + "grad_norm": 0.012821653857827187, + "learning_rate": 0.0004962417407527059, + "loss": 0.1586, + "num_input_tokens_seen": 146059296, + "step": 67695 + }, + { + "epoch": 11.044045676998369, + "grad_norm": 0.1483294665813446, + "learning_rate": 0.0004961705629791431, + "loss": 0.0218, + "num_input_tokens_seen": 146070240, + "step": 67700 + }, + { + "epoch": 11.044861337683523, + "grad_norm": 0.012307694181799889, + "learning_rate": 0.0004960993852831888, + "loss": 0.0053, + "num_input_tokens_seen": 146080736, + "step": 67705 + }, + { + "epoch": 11.045676998368679, + "grad_norm": 0.024590613320469856, + "learning_rate": 0.0004960282076662853, + "loss": 0.0519, + "num_input_tokens_seen": 146091488, + "step": 67710 + }, + { + "epoch": 11.046492659053834, + "grad_norm": 0.0075014918111264706, + "learning_rate": 0.0004959570301298752, + "loss": 0.0123, + "num_input_tokens_seen": 146101664, + "step": 67715 + }, + { + "epoch": 11.047308319738988, + "grad_norm": 0.5562306642532349, + "learning_rate": 0.0004958858526754012, + "loss": 0.0396, + "num_input_tokens_seen": 146112448, + "step": 67720 + }, + { + "epoch": 11.048123980424144, + "grad_norm": 0.0026332528796046972, + "learning_rate": 0.0004958146753043053, + "loss": 0.0628, + "num_input_tokens_seen": 146123648, + "step": 67725 + }, + { + "epoch": 11.048939641109298, + "grad_norm": 0.002659478457644582, + "learning_rate": 0.0004957434980180307, + "loss": 0.0535, + "num_input_tokens_seen": 146134368, + "step": 67730 + }, + { + "epoch": 11.049755301794454, + "grad_norm": 0.00478120194748044, + "learning_rate": 0.0004956723208180191, + "loss": 0.0115, + "num_input_tokens_seen": 146145152, + "step": 67735 + }, + { + "epoch": 11.05057096247961, + "grad_norm": 0.004077858291566372, + "learning_rate": 0.0004956011437057138, + "loss": 0.0447, + "num_input_tokens_seen": 146154752, + "step": 67740 + }, + { + "epoch": 11.051386623164763, + "grad_norm": 0.005469323135912418, + "learning_rate": 0.0004955299666825566, + "loss": 0.0177, + "num_input_tokens_seen": 146165248, + "step": 67745 + }, + { + "epoch": 11.052202283849919, + "grad_norm": 0.009411460720002651, + "learning_rate": 0.0004954587897499905, + "loss": 0.0028, + "num_input_tokens_seen": 146176512, + "step": 67750 + }, + { + "epoch": 11.053017944535073, + "grad_norm": 0.24873118102550507, + "learning_rate": 0.0004953876129094576, + "loss": 0.0218, + "num_input_tokens_seen": 146188608, + "step": 67755 + }, + { + "epoch": 11.053833605220229, + "grad_norm": 0.0075334953144192696, + "learning_rate": 0.0004953164361624008, + "loss": 0.0096, + "num_input_tokens_seen": 146199424, + "step": 67760 + }, + { + "epoch": 11.054649265905383, + "grad_norm": 0.0006966172368265688, + "learning_rate": 0.0004952452595102621, + "loss": 0.0106, + "num_input_tokens_seen": 146210688, + "step": 67765 + }, + { + "epoch": 11.055464926590538, + "grad_norm": 0.15889693796634674, + "learning_rate": 0.0004951740829544846, + "loss": 0.0188, + "num_input_tokens_seen": 146222176, + "step": 67770 + }, + { + "epoch": 11.056280587275694, + "grad_norm": 0.001479036291129887, + "learning_rate": 0.00049510290649651, + "loss": 0.0137, + "num_input_tokens_seen": 146233632, + "step": 67775 + }, + { + "epoch": 11.057096247960848, + "grad_norm": 0.06772103905677795, + "learning_rate": 0.0004950317301377813, + "loss": 0.0094, + "num_input_tokens_seen": 146244512, + "step": 67780 + }, + { + "epoch": 11.057911908646004, + "grad_norm": 0.003938071429729462, + "learning_rate": 0.0004949605538797412, + "loss": 0.0705, + "num_input_tokens_seen": 146255520, + "step": 67785 + }, + { + "epoch": 11.058727569331158, + "grad_norm": 0.014110666699707508, + "learning_rate": 0.0004948893777238316, + "loss": 0.0124, + "num_input_tokens_seen": 146266592, + "step": 67790 + }, + { + "epoch": 11.059543230016313, + "grad_norm": 0.04756239056587219, + "learning_rate": 0.0004948182016714954, + "loss": 0.0182, + "num_input_tokens_seen": 146277472, + "step": 67795 + }, + { + "epoch": 11.060358890701469, + "grad_norm": 0.0560716949403286, + "learning_rate": 0.0004947470257241748, + "loss": 0.0072, + "num_input_tokens_seen": 146287872, + "step": 67800 + }, + { + "epoch": 11.061174551386623, + "grad_norm": 0.002097406191751361, + "learning_rate": 0.0004946758498833125, + "loss": 0.0558, + "num_input_tokens_seen": 146298848, + "step": 67805 + }, + { + "epoch": 11.061990212071779, + "grad_norm": 0.02309236116707325, + "learning_rate": 0.0004946046741503507, + "loss": 0.02, + "num_input_tokens_seen": 146309184, + "step": 67810 + }, + { + "epoch": 11.062805872756933, + "grad_norm": 0.010053198784589767, + "learning_rate": 0.0004945334985267323, + "loss": 0.0317, + "num_input_tokens_seen": 146319200, + "step": 67815 + }, + { + "epoch": 11.063621533442088, + "grad_norm": 0.013767745345830917, + "learning_rate": 0.0004944623230138991, + "loss": 0.0238, + "num_input_tokens_seen": 146330688, + "step": 67820 + }, + { + "epoch": 11.064437194127244, + "grad_norm": 0.0714239776134491, + "learning_rate": 0.0004943911476132943, + "loss": 0.0238, + "num_input_tokens_seen": 146340384, + "step": 67825 + }, + { + "epoch": 11.065252854812398, + "grad_norm": 0.3735978305339813, + "learning_rate": 0.0004943199723263597, + "loss": 0.022, + "num_input_tokens_seen": 146349888, + "step": 67830 + }, + { + "epoch": 11.066068515497554, + "grad_norm": 0.01456344872713089, + "learning_rate": 0.0004942487971545383, + "loss": 0.0499, + "num_input_tokens_seen": 146359648, + "step": 67835 + }, + { + "epoch": 11.066884176182707, + "grad_norm": 0.007185864262282848, + "learning_rate": 0.0004941776220992722, + "loss": 0.002, + "num_input_tokens_seen": 146370656, + "step": 67840 + }, + { + "epoch": 11.067699836867863, + "grad_norm": 0.010569276288151741, + "learning_rate": 0.0004941064471620041, + "loss": 0.0155, + "num_input_tokens_seen": 146381216, + "step": 67845 + }, + { + "epoch": 11.068515497553017, + "grad_norm": 0.009927977807819843, + "learning_rate": 0.0004940352723441763, + "loss": 0.0202, + "num_input_tokens_seen": 146391520, + "step": 67850 + }, + { + "epoch": 11.069331158238173, + "grad_norm": 0.07832959294319153, + "learning_rate": 0.0004939640976472311, + "loss": 0.011, + "num_input_tokens_seen": 146402016, + "step": 67855 + }, + { + "epoch": 11.070146818923329, + "grad_norm": 0.015295101329684258, + "learning_rate": 0.0004938929230726111, + "loss": 0.0082, + "num_input_tokens_seen": 146412832, + "step": 67860 + }, + { + "epoch": 11.070962479608482, + "grad_norm": 0.005062935408204794, + "learning_rate": 0.0004938217486217591, + "loss": 0.0055, + "num_input_tokens_seen": 146424128, + "step": 67865 + }, + { + "epoch": 11.071778140293638, + "grad_norm": 0.01769307628273964, + "learning_rate": 0.0004937505742961169, + "loss": 0.0036, + "num_input_tokens_seen": 146433632, + "step": 67870 + }, + { + "epoch": 11.072593800978792, + "grad_norm": 0.006159200798720121, + "learning_rate": 0.0004936794000971274, + "loss": 0.0065, + "num_input_tokens_seen": 146444512, + "step": 67875 + }, + { + "epoch": 11.073409461663948, + "grad_norm": 0.026288103312253952, + "learning_rate": 0.0004936082260262328, + "loss": 0.002, + "num_input_tokens_seen": 146456192, + "step": 67880 + }, + { + "epoch": 11.074225122349104, + "grad_norm": 0.0031699268147349358, + "learning_rate": 0.0004935370520848755, + "loss": 0.0184, + "num_input_tokens_seen": 146466560, + "step": 67885 + }, + { + "epoch": 11.075040783034257, + "grad_norm": 0.10439901798963547, + "learning_rate": 0.0004934658782744983, + "loss": 0.141, + "num_input_tokens_seen": 146478080, + "step": 67890 + }, + { + "epoch": 11.075856443719413, + "grad_norm": 0.011052818968892097, + "learning_rate": 0.0004933947045965431, + "loss": 0.0045, + "num_input_tokens_seen": 146488608, + "step": 67895 + }, + { + "epoch": 11.076672104404567, + "grad_norm": 0.006132977548986673, + "learning_rate": 0.0004933235310524528, + "loss": 0.129, + "num_input_tokens_seen": 146498720, + "step": 67900 + }, + { + "epoch": 11.077487765089723, + "grad_norm": 0.06421475857496262, + "learning_rate": 0.0004932523576436695, + "loss": 0.0066, + "num_input_tokens_seen": 146509856, + "step": 67905 + }, + { + "epoch": 11.078303425774878, + "grad_norm": 0.06756133586168289, + "learning_rate": 0.0004931811843716358, + "loss": 0.1077, + "num_input_tokens_seen": 146520032, + "step": 67910 + }, + { + "epoch": 11.079119086460032, + "grad_norm": 0.003659902373328805, + "learning_rate": 0.000493110011237794, + "loss": 0.0322, + "num_input_tokens_seen": 146532096, + "step": 67915 + }, + { + "epoch": 11.079934747145188, + "grad_norm": 0.4008598327636719, + "learning_rate": 0.0004930388382435866, + "loss": 0.0673, + "num_input_tokens_seen": 146541856, + "step": 67920 + }, + { + "epoch": 11.080750407830342, + "grad_norm": 0.0006906094495207071, + "learning_rate": 0.0004929676653904558, + "loss": 0.0057, + "num_input_tokens_seen": 146552032, + "step": 67925 + }, + { + "epoch": 11.081566068515498, + "grad_norm": 0.005287721287459135, + "learning_rate": 0.0004928964926798445, + "loss": 0.0089, + "num_input_tokens_seen": 146564032, + "step": 67930 + }, + { + "epoch": 11.082381729200652, + "grad_norm": 0.004154246300458908, + "learning_rate": 0.0004928253201131945, + "loss": 0.0137, + "num_input_tokens_seen": 146574688, + "step": 67935 + }, + { + "epoch": 11.083197389885807, + "grad_norm": 0.0018318190705031157, + "learning_rate": 0.0004927541476919487, + "loss": 0.0026, + "num_input_tokens_seen": 146586464, + "step": 67940 + }, + { + "epoch": 11.084013050570963, + "grad_norm": 0.27625352144241333, + "learning_rate": 0.0004926829754175492, + "loss": 0.0534, + "num_input_tokens_seen": 146599008, + "step": 67945 + }, + { + "epoch": 11.084828711256117, + "grad_norm": 0.005931806284934282, + "learning_rate": 0.0004926118032914385, + "loss": 0.0086, + "num_input_tokens_seen": 146610336, + "step": 67950 + }, + { + "epoch": 11.085644371941273, + "grad_norm": 0.019753606989979744, + "learning_rate": 0.0004925406313150589, + "loss": 0.1137, + "num_input_tokens_seen": 146619680, + "step": 67955 + }, + { + "epoch": 11.086460032626427, + "grad_norm": 0.06862779706716537, + "learning_rate": 0.000492469459489853, + "loss": 0.0962, + "num_input_tokens_seen": 146629792, + "step": 67960 + }, + { + "epoch": 11.087275693311582, + "grad_norm": 0.03028160147368908, + "learning_rate": 0.0004923982878172629, + "loss": 0.0133, + "num_input_tokens_seen": 146640192, + "step": 67965 + }, + { + "epoch": 11.088091353996738, + "grad_norm": 0.04311292991042137, + "learning_rate": 0.0004923271162987314, + "loss": 0.0307, + "num_input_tokens_seen": 146651904, + "step": 67970 + }, + { + "epoch": 11.088907014681892, + "grad_norm": 0.0008038950036279857, + "learning_rate": 0.0004922559449357003, + "loss": 0.008, + "num_input_tokens_seen": 146662080, + "step": 67975 + }, + { + "epoch": 11.089722675367048, + "grad_norm": 0.01174769178032875, + "learning_rate": 0.0004921847737296125, + "loss": 0.0098, + "num_input_tokens_seen": 146672352, + "step": 67980 + }, + { + "epoch": 11.090538336052202, + "grad_norm": 0.0028296001255512238, + "learning_rate": 0.0004921136026819101, + "loss": 0.0207, + "num_input_tokens_seen": 146683040, + "step": 67985 + }, + { + "epoch": 11.091353996737357, + "grad_norm": 0.004219654947519302, + "learning_rate": 0.0004920424317940355, + "loss": 0.0058, + "num_input_tokens_seen": 146694720, + "step": 67990 + }, + { + "epoch": 11.092169657422513, + "grad_norm": 0.2297498732805252, + "learning_rate": 0.0004919712610674312, + "loss": 0.0224, + "num_input_tokens_seen": 146703776, + "step": 67995 + }, + { + "epoch": 11.092985318107667, + "grad_norm": 0.0036190850660204887, + "learning_rate": 0.0004919000905035394, + "loss": 0.0038, + "num_input_tokens_seen": 146714720, + "step": 68000 + }, + { + "epoch": 11.093800978792823, + "grad_norm": 0.019035477191209793, + "learning_rate": 0.0004918289201038026, + "loss": 0.0027, + "num_input_tokens_seen": 146724736, + "step": 68005 + }, + { + "epoch": 11.094616639477977, + "grad_norm": 0.0006407679757103324, + "learning_rate": 0.0004917577498696631, + "loss": 0.0075, + "num_input_tokens_seen": 146736544, + "step": 68010 + }, + { + "epoch": 11.095432300163132, + "grad_norm": 0.02990468591451645, + "learning_rate": 0.0004916865798025634, + "loss": 0.0127, + "num_input_tokens_seen": 146748192, + "step": 68015 + }, + { + "epoch": 11.096247960848286, + "grad_norm": 0.0017942434642463923, + "learning_rate": 0.0004916154099039455, + "loss": 0.008, + "num_input_tokens_seen": 146760384, + "step": 68020 + }, + { + "epoch": 11.097063621533442, + "grad_norm": 0.016557930037379265, + "learning_rate": 0.000491544240175252, + "loss": 0.0211, + "num_input_tokens_seen": 146771904, + "step": 68025 + }, + { + "epoch": 11.097879282218598, + "grad_norm": 0.00307639897800982, + "learning_rate": 0.0004914730706179251, + "loss": 0.038, + "num_input_tokens_seen": 146783296, + "step": 68030 + }, + { + "epoch": 11.098694942903752, + "grad_norm": 0.004154440481215715, + "learning_rate": 0.0004914019012334075, + "loss": 0.0224, + "num_input_tokens_seen": 146794176, + "step": 68035 + }, + { + "epoch": 11.099510603588907, + "grad_norm": 0.0012199623743072152, + "learning_rate": 0.000491330732023141, + "loss": 0.0503, + "num_input_tokens_seen": 146805504, + "step": 68040 + }, + { + "epoch": 11.100326264274061, + "grad_norm": 0.009019901975989342, + "learning_rate": 0.0004912595629885685, + "loss": 0.0156, + "num_input_tokens_seen": 146816096, + "step": 68045 + }, + { + "epoch": 11.101141924959217, + "grad_norm": 0.0013762437738478184, + "learning_rate": 0.0004911883941311319, + "loss": 0.0889, + "num_input_tokens_seen": 146825888, + "step": 68050 + }, + { + "epoch": 11.101957585644373, + "grad_norm": 0.057197436690330505, + "learning_rate": 0.0004911172254522737, + "loss": 0.0852, + "num_input_tokens_seen": 146837344, + "step": 68055 + }, + { + "epoch": 11.102773246329527, + "grad_norm": 0.04634103551506996, + "learning_rate": 0.0004910460569534361, + "loss": 0.0443, + "num_input_tokens_seen": 146848128, + "step": 68060 + }, + { + "epoch": 11.103588907014682, + "grad_norm": 0.01836278848350048, + "learning_rate": 0.0004909748886360617, + "loss": 0.003, + "num_input_tokens_seen": 146857632, + "step": 68065 + }, + { + "epoch": 11.104404567699836, + "grad_norm": 0.0007350450614467263, + "learning_rate": 0.0004909037205015924, + "loss": 0.0355, + "num_input_tokens_seen": 146867488, + "step": 68070 + }, + { + "epoch": 11.105220228384992, + "grad_norm": 0.23414427042007446, + "learning_rate": 0.000490832552551471, + "loss": 0.0425, + "num_input_tokens_seen": 146878432, + "step": 68075 + }, + { + "epoch": 11.106035889070148, + "grad_norm": 0.001468871021643281, + "learning_rate": 0.0004907613847871393, + "loss": 0.0021, + "num_input_tokens_seen": 146888736, + "step": 68080 + }, + { + "epoch": 11.106851549755302, + "grad_norm": 0.030377553775906563, + "learning_rate": 0.00049069021721004, + "loss": 0.0253, + "num_input_tokens_seen": 146896352, + "step": 68085 + }, + { + "epoch": 11.107667210440457, + "grad_norm": 0.0390375517308712, + "learning_rate": 0.0004906190498216151, + "loss": 0.0099, + "num_input_tokens_seen": 146905792, + "step": 68090 + }, + { + "epoch": 11.108482871125611, + "grad_norm": 0.5373247265815735, + "learning_rate": 0.0004905478826233072, + "loss": 0.2064, + "num_input_tokens_seen": 146918272, + "step": 68095 + }, + { + "epoch": 11.109298531810767, + "grad_norm": 0.0025782021693885326, + "learning_rate": 0.0004904767156165585, + "loss": 0.2128, + "num_input_tokens_seen": 146928832, + "step": 68100 + }, + { + "epoch": 11.11011419249592, + "grad_norm": 0.011253153905272484, + "learning_rate": 0.000490405548802811, + "loss": 0.0035, + "num_input_tokens_seen": 146939776, + "step": 68105 + }, + { + "epoch": 11.110929853181077, + "grad_norm": 0.002415325725451112, + "learning_rate": 0.0004903343821835075, + "loss": 0.0156, + "num_input_tokens_seen": 146949728, + "step": 68110 + }, + { + "epoch": 11.111745513866232, + "grad_norm": 0.003278841497376561, + "learning_rate": 0.0004902632157600898, + "loss": 0.0559, + "num_input_tokens_seen": 146960480, + "step": 68115 + }, + { + "epoch": 11.112561174551386, + "grad_norm": 0.14115285873413086, + "learning_rate": 0.0004901920495340007, + "loss": 0.0163, + "num_input_tokens_seen": 146971136, + "step": 68120 + }, + { + "epoch": 11.113376835236542, + "grad_norm": 0.01716459169983864, + "learning_rate": 0.0004901208835066818, + "loss": 0.0083, + "num_input_tokens_seen": 146981536, + "step": 68125 + }, + { + "epoch": 11.114192495921696, + "grad_norm": 0.08351199328899384, + "learning_rate": 0.0004900497176795759, + "loss": 0.1163, + "num_input_tokens_seen": 146991840, + "step": 68130 + }, + { + "epoch": 11.115008156606851, + "grad_norm": 0.012174772098660469, + "learning_rate": 0.000489978552054125, + "loss": 0.0159, + "num_input_tokens_seen": 147001632, + "step": 68135 + }, + { + "epoch": 11.115823817292007, + "grad_norm": 0.0066432165913283825, + "learning_rate": 0.0004899073866317717, + "loss": 0.0114, + "num_input_tokens_seen": 147012448, + "step": 68140 + }, + { + "epoch": 11.116639477977161, + "grad_norm": 0.007520776242017746, + "learning_rate": 0.0004898362214139577, + "loss": 0.0552, + "num_input_tokens_seen": 147022336, + "step": 68145 + }, + { + "epoch": 11.117455138662317, + "grad_norm": 0.019644413143396378, + "learning_rate": 0.0004897650564021257, + "loss": 0.2462, + "num_input_tokens_seen": 147033344, + "step": 68150 + }, + { + "epoch": 11.11827079934747, + "grad_norm": 0.0015336110955104232, + "learning_rate": 0.0004896938915977178, + "loss": 0.0099, + "num_input_tokens_seen": 147043360, + "step": 68155 + }, + { + "epoch": 11.119086460032626, + "grad_norm": 0.011734778061509132, + "learning_rate": 0.0004896227270021763, + "loss": 0.0181, + "num_input_tokens_seen": 147053696, + "step": 68160 + }, + { + "epoch": 11.119902120717782, + "grad_norm": 0.00047718005953356624, + "learning_rate": 0.0004895515626169433, + "loss": 0.0079, + "num_input_tokens_seen": 147065248, + "step": 68165 + }, + { + "epoch": 11.120717781402936, + "grad_norm": 0.00804673321545124, + "learning_rate": 0.0004894803984434613, + "loss": 0.0899, + "num_input_tokens_seen": 147075392, + "step": 68170 + }, + { + "epoch": 11.121533442088092, + "grad_norm": 0.3514236509799957, + "learning_rate": 0.0004894092344831722, + "loss": 0.0268, + "num_input_tokens_seen": 147086944, + "step": 68175 + }, + { + "epoch": 11.122349102773246, + "grad_norm": 0.1320733278989792, + "learning_rate": 0.0004893380707375186, + "loss": 0.0467, + "num_input_tokens_seen": 147096640, + "step": 68180 + }, + { + "epoch": 11.123164763458401, + "grad_norm": 0.01817741058766842, + "learning_rate": 0.0004892669072079423, + "loss": 0.0791, + "num_input_tokens_seen": 147107136, + "step": 68185 + }, + { + "epoch": 11.123980424143557, + "grad_norm": 0.021027591079473495, + "learning_rate": 0.000489195743895886, + "loss": 0.0136, + "num_input_tokens_seen": 147118144, + "step": 68190 + }, + { + "epoch": 11.124796084828711, + "grad_norm": 0.015405897051095963, + "learning_rate": 0.0004891245808027913, + "loss": 0.0072, + "num_input_tokens_seen": 147129504, + "step": 68195 + }, + { + "epoch": 11.125611745513867, + "grad_norm": 0.02694462426006794, + "learning_rate": 0.0004890534179301009, + "loss": 0.008, + "num_input_tokens_seen": 147140928, + "step": 68200 + }, + { + "epoch": 11.12642740619902, + "grad_norm": 0.002079995349049568, + "learning_rate": 0.0004889822552792572, + "loss": 0.1038, + "num_input_tokens_seen": 147152064, + "step": 68205 + }, + { + "epoch": 11.127243066884176, + "grad_norm": 0.002286914037540555, + "learning_rate": 0.0004889110928517016, + "loss": 0.0186, + "num_input_tokens_seen": 147162528, + "step": 68210 + }, + { + "epoch": 11.12805872756933, + "grad_norm": 0.004048719070851803, + "learning_rate": 0.0004888399306488771, + "loss": 0.0057, + "num_input_tokens_seen": 147173056, + "step": 68215 + }, + { + "epoch": 11.128874388254486, + "grad_norm": 0.037929385900497437, + "learning_rate": 0.0004887687686722254, + "loss": 0.0187, + "num_input_tokens_seen": 147184416, + "step": 68220 + }, + { + "epoch": 11.129690048939642, + "grad_norm": 0.38494575023651123, + "learning_rate": 0.000488697606923189, + "loss": 0.2433, + "num_input_tokens_seen": 147196224, + "step": 68225 + }, + { + "epoch": 11.130505709624796, + "grad_norm": 0.2656155526638031, + "learning_rate": 0.0004886264454032097, + "loss": 0.0427, + "num_input_tokens_seen": 147206336, + "step": 68230 + }, + { + "epoch": 11.131321370309951, + "grad_norm": 0.0744706392288208, + "learning_rate": 0.0004885552841137302, + "loss": 0.0155, + "num_input_tokens_seen": 147216288, + "step": 68235 + }, + { + "epoch": 11.132137030995105, + "grad_norm": 0.004853496793657541, + "learning_rate": 0.0004884841230561922, + "loss": 0.1297, + "num_input_tokens_seen": 147227744, + "step": 68240 + }, + { + "epoch": 11.132952691680261, + "grad_norm": 0.019723733887076378, + "learning_rate": 0.0004884129622320381, + "loss": 0.0074, + "num_input_tokens_seen": 147238496, + "step": 68245 + }, + { + "epoch": 11.133768352365417, + "grad_norm": 0.0018767437431961298, + "learning_rate": 0.0004883418016427099, + "loss": 0.0103, + "num_input_tokens_seen": 147250944, + "step": 68250 + }, + { + "epoch": 11.13458401305057, + "grad_norm": 0.0192906241863966, + "learning_rate": 0.00048827064128965014, + "loss": 0.0047, + "num_input_tokens_seen": 147261760, + "step": 68255 + }, + { + "epoch": 11.135399673735726, + "grad_norm": 0.1291676014661789, + "learning_rate": 0.00048819948117430047, + "loss": 0.159, + "num_input_tokens_seen": 147272000, + "step": 68260 + }, + { + "epoch": 11.13621533442088, + "grad_norm": 0.04326915740966797, + "learning_rate": 0.00048812832129810347, + "loss": 0.0093, + "num_input_tokens_seen": 147283296, + "step": 68265 + }, + { + "epoch": 11.137030995106036, + "grad_norm": 0.019670160487294197, + "learning_rate": 0.0004880571616625009, + "loss": 0.006, + "num_input_tokens_seen": 147293664, + "step": 68270 + }, + { + "epoch": 11.137846655791192, + "grad_norm": 0.0061174663715064526, + "learning_rate": 0.00048798600226893535, + "loss": 0.0483, + "num_input_tokens_seen": 147304032, + "step": 68275 + }, + { + "epoch": 11.138662316476346, + "grad_norm": 0.0959351435303688, + "learning_rate": 0.00048791484311884844, + "loss": 0.0734, + "num_input_tokens_seen": 147313536, + "step": 68280 + }, + { + "epoch": 11.139477977161501, + "grad_norm": 0.2713870704174042, + "learning_rate": 0.0004878436842136828, + "loss": 0.0562, + "num_input_tokens_seen": 147324704, + "step": 68285 + }, + { + "epoch": 11.140293637846655, + "grad_norm": 0.13622009754180908, + "learning_rate": 0.0004877725255548801, + "loss": 0.0187, + "num_input_tokens_seen": 147333952, + "step": 68290 + }, + { + "epoch": 11.141109298531811, + "grad_norm": 0.003888690611347556, + "learning_rate": 0.0004877013671438828, + "loss": 0.0114, + "num_input_tokens_seen": 147345120, + "step": 68295 + }, + { + "epoch": 11.141924959216965, + "grad_norm": 0.015081457793712616, + "learning_rate": 0.0004876302089821329, + "loss": 0.0119, + "num_input_tokens_seen": 147356864, + "step": 68300 + }, + { + "epoch": 11.14274061990212, + "grad_norm": 0.10644953697919846, + "learning_rate": 0.0004875590510710724, + "loss": 0.055, + "num_input_tokens_seen": 147367264, + "step": 68305 + }, + { + "epoch": 11.143556280587276, + "grad_norm": 0.021891675889492035, + "learning_rate": 0.00048748789341214373, + "loss": 0.0153, + "num_input_tokens_seen": 147378208, + "step": 68310 + }, + { + "epoch": 11.14437194127243, + "grad_norm": 0.04040338471531868, + "learning_rate": 0.00048741673600678857, + "loss": 0.044, + "num_input_tokens_seen": 147389568, + "step": 68315 + }, + { + "epoch": 11.145187601957586, + "grad_norm": 0.0038093479815870523, + "learning_rate": 0.00048734557885644924, + "loss": 0.0116, + "num_input_tokens_seen": 147400768, + "step": 68320 + }, + { + "epoch": 11.14600326264274, + "grad_norm": 0.007398206740617752, + "learning_rate": 0.00048727442196256786, + "loss": 0.0063, + "num_input_tokens_seen": 147411840, + "step": 68325 + }, + { + "epoch": 11.146818923327896, + "grad_norm": 0.018326515331864357, + "learning_rate": 0.0004872032653265865, + "loss": 0.0108, + "num_input_tokens_seen": 147421472, + "step": 68330 + }, + { + "epoch": 11.147634584013051, + "grad_norm": 0.032525684684515, + "learning_rate": 0.0004871321089499472, + "loss": 0.0144, + "num_input_tokens_seen": 147431744, + "step": 68335 + }, + { + "epoch": 11.148450244698205, + "grad_norm": 0.01166996918618679, + "learning_rate": 0.00048706095283409194, + "loss": 0.0082, + "num_input_tokens_seen": 147443232, + "step": 68340 + }, + { + "epoch": 11.149265905383361, + "grad_norm": 0.004951538983732462, + "learning_rate": 0.00048698979698046286, + "loss": 0.0045, + "num_input_tokens_seen": 147453248, + "step": 68345 + }, + { + "epoch": 11.150081566068515, + "grad_norm": 0.04320555925369263, + "learning_rate": 0.0004869186413905023, + "loss": 0.0894, + "num_input_tokens_seen": 147464704, + "step": 68350 + }, + { + "epoch": 11.15089722675367, + "grad_norm": 0.10657025873661041, + "learning_rate": 0.00048684748606565175, + "loss": 0.0141, + "num_input_tokens_seen": 147474976, + "step": 68355 + }, + { + "epoch": 11.151712887438826, + "grad_norm": 0.016399584710597992, + "learning_rate": 0.00048677633100735387, + "loss": 0.0107, + "num_input_tokens_seen": 147485536, + "step": 68360 + }, + { + "epoch": 11.15252854812398, + "grad_norm": 0.021805986762046814, + "learning_rate": 0.00048670517621705016, + "loss": 0.0081, + "num_input_tokens_seen": 147497408, + "step": 68365 + }, + { + "epoch": 11.153344208809136, + "grad_norm": 0.007740562781691551, + "learning_rate": 0.0004866340216961832, + "loss": 0.0556, + "num_input_tokens_seen": 147507360, + "step": 68370 + }, + { + "epoch": 11.15415986949429, + "grad_norm": 0.10723040252923965, + "learning_rate": 0.00048656286744619447, + "loss": 0.0117, + "num_input_tokens_seen": 147517088, + "step": 68375 + }, + { + "epoch": 11.154975530179446, + "grad_norm": 0.13677655160427094, + "learning_rate": 0.0004864917134685265, + "loss": 0.0407, + "num_input_tokens_seen": 147526880, + "step": 68380 + }, + { + "epoch": 11.1557911908646, + "grad_norm": 0.0014111484633758664, + "learning_rate": 0.0004864205597646209, + "loss": 0.0036, + "num_input_tokens_seen": 147538816, + "step": 68385 + }, + { + "epoch": 11.156606851549755, + "grad_norm": 0.1390146017074585, + "learning_rate": 0.00048634940633592006, + "loss": 0.0216, + "num_input_tokens_seen": 147550752, + "step": 68390 + }, + { + "epoch": 11.15742251223491, + "grad_norm": 0.008402802050113678, + "learning_rate": 0.00048627825318386567, + "loss": 0.0073, + "num_input_tokens_seen": 147562080, + "step": 68395 + }, + { + "epoch": 11.158238172920065, + "grad_norm": 0.41068145632743835, + "learning_rate": 0.00048620710030990004, + "loss": 0.059, + "num_input_tokens_seen": 147573472, + "step": 68400 + }, + { + "epoch": 11.15905383360522, + "grad_norm": 0.36436721682548523, + "learning_rate": 0.0004861359477154648, + "loss": 0.0476, + "num_input_tokens_seen": 147583648, + "step": 68405 + }, + { + "epoch": 11.159869494290374, + "grad_norm": 0.012808558531105518, + "learning_rate": 0.00048606479540200243, + "loss": 0.0567, + "num_input_tokens_seen": 147593728, + "step": 68410 + }, + { + "epoch": 11.16068515497553, + "grad_norm": 0.020424550399184227, + "learning_rate": 0.00048599364337095443, + "loss": 0.006, + "num_input_tokens_seen": 147604352, + "step": 68415 + }, + { + "epoch": 11.161500815660686, + "grad_norm": 0.00442297151312232, + "learning_rate": 0.000485922491623763, + "loss": 0.0917, + "num_input_tokens_seen": 147616320, + "step": 68420 + }, + { + "epoch": 11.16231647634584, + "grad_norm": 0.0010854145511984825, + "learning_rate": 0.0004858513401618704, + "loss": 0.0033, + "num_input_tokens_seen": 147627616, + "step": 68425 + }, + { + "epoch": 11.163132137030995, + "grad_norm": 0.023376798257231712, + "learning_rate": 0.00048578018898671804, + "loss": 0.0153, + "num_input_tokens_seen": 147638080, + "step": 68430 + }, + { + "epoch": 11.16394779771615, + "grad_norm": 0.5531041622161865, + "learning_rate": 0.0004857090380997484, + "loss": 0.0536, + "num_input_tokens_seen": 147648832, + "step": 68435 + }, + { + "epoch": 11.164763458401305, + "grad_norm": 0.01766670122742653, + "learning_rate": 0.00048563788750240314, + "loss": 0.0094, + "num_input_tokens_seen": 147659168, + "step": 68440 + }, + { + "epoch": 11.16557911908646, + "grad_norm": 0.25605708360671997, + "learning_rate": 0.00048556673719612445, + "loss": 0.1191, + "num_input_tokens_seen": 147669920, + "step": 68445 + }, + { + "epoch": 11.166394779771615, + "grad_norm": 0.08758559823036194, + "learning_rate": 0.00048549558718235386, + "loss": 0.0917, + "num_input_tokens_seen": 147680672, + "step": 68450 + }, + { + "epoch": 11.16721044045677, + "grad_norm": 0.0011085444130003452, + "learning_rate": 0.0004854244374625339, + "loss": 0.0104, + "num_input_tokens_seen": 147691712, + "step": 68455 + }, + { + "epoch": 11.168026101141924, + "grad_norm": 0.0013005820801481605, + "learning_rate": 0.00048535328803810595, + "loss": 0.0139, + "num_input_tokens_seen": 147701280, + "step": 68460 + }, + { + "epoch": 11.16884176182708, + "grad_norm": 0.052951630204916, + "learning_rate": 0.0004852821389105123, + "loss": 0.0415, + "num_input_tokens_seen": 147712320, + "step": 68465 + }, + { + "epoch": 11.169657422512234, + "grad_norm": 0.005234756972640753, + "learning_rate": 0.00048521099008119484, + "loss": 0.045, + "num_input_tokens_seen": 147723008, + "step": 68470 + }, + { + "epoch": 11.17047308319739, + "grad_norm": 0.01605057902634144, + "learning_rate": 0.0004851398415515954, + "loss": 0.006, + "num_input_tokens_seen": 147732736, + "step": 68475 + }, + { + "epoch": 11.171288743882545, + "grad_norm": 0.042753685265779495, + "learning_rate": 0.0004850686933231559, + "loss": 0.0112, + "num_input_tokens_seen": 147741792, + "step": 68480 + }, + { + "epoch": 11.1721044045677, + "grad_norm": 0.11829470098018646, + "learning_rate": 0.00048499754539731827, + "loss": 0.0093, + "num_input_tokens_seen": 147752672, + "step": 68485 + }, + { + "epoch": 11.172920065252855, + "grad_norm": 0.0021860431879758835, + "learning_rate": 0.0004849263977755243, + "loss": 0.0051, + "num_input_tokens_seen": 147763296, + "step": 68490 + }, + { + "epoch": 11.173735725938009, + "grad_norm": 0.013171934522688389, + "learning_rate": 0.00048485525045921627, + "loss": 0.0033, + "num_input_tokens_seen": 147775584, + "step": 68495 + }, + { + "epoch": 11.174551386623165, + "grad_norm": 0.007527179550379515, + "learning_rate": 0.00048478410344983554, + "loss": 0.1126, + "num_input_tokens_seen": 147785568, + "step": 68500 + }, + { + "epoch": 11.17536704730832, + "grad_norm": 0.016970383003354073, + "learning_rate": 0.00048471295674882447, + "loss": 0.008, + "num_input_tokens_seen": 147796416, + "step": 68505 + }, + { + "epoch": 11.176182707993474, + "grad_norm": 0.036271579563617706, + "learning_rate": 0.0004846418103576245, + "loss": 0.0052, + "num_input_tokens_seen": 147806848, + "step": 68510 + }, + { + "epoch": 11.17699836867863, + "grad_norm": 0.0063886684365570545, + "learning_rate": 0.000484570664277678, + "loss": 0.0067, + "num_input_tokens_seen": 147816992, + "step": 68515 + }, + { + "epoch": 11.177814029363784, + "grad_norm": 0.03593786433339119, + "learning_rate": 0.00048449951851042627, + "loss": 0.0073, + "num_input_tokens_seen": 147828384, + "step": 68520 + }, + { + "epoch": 11.17862969004894, + "grad_norm": 0.1617739200592041, + "learning_rate": 0.0004844283730573115, + "loss": 0.0215, + "num_input_tokens_seen": 147838528, + "step": 68525 + }, + { + "epoch": 11.179445350734095, + "grad_norm": 0.009610825218260288, + "learning_rate": 0.0004843572279197757, + "loss": 0.015, + "num_input_tokens_seen": 147850688, + "step": 68530 + }, + { + "epoch": 11.18026101141925, + "grad_norm": 0.03277764841914177, + "learning_rate": 0.0004842860830992604, + "loss": 0.0081, + "num_input_tokens_seen": 147861504, + "step": 68535 + }, + { + "epoch": 11.181076672104405, + "grad_norm": 0.2084341198205948, + "learning_rate": 0.00048421493859720767, + "loss": 0.0271, + "num_input_tokens_seen": 147872032, + "step": 68540 + }, + { + "epoch": 11.181892332789559, + "grad_norm": 0.0070254020392894745, + "learning_rate": 0.000484143794415059, + "loss": 0.0177, + "num_input_tokens_seen": 147881664, + "step": 68545 + }, + { + "epoch": 11.182707993474715, + "grad_norm": 0.006792774423956871, + "learning_rate": 0.00048407265055425673, + "loss": 0.0777, + "num_input_tokens_seen": 147891936, + "step": 68550 + }, + { + "epoch": 11.18352365415987, + "grad_norm": 0.11536426842212677, + "learning_rate": 0.00048400150701624216, + "loss": 0.0184, + "num_input_tokens_seen": 147903264, + "step": 68555 + }, + { + "epoch": 11.184339314845024, + "grad_norm": 0.009474091231822968, + "learning_rate": 0.0004839303638024576, + "loss": 0.0076, + "num_input_tokens_seen": 147913984, + "step": 68560 + }, + { + "epoch": 11.18515497553018, + "grad_norm": 0.09582109749317169, + "learning_rate": 0.0004838592209143444, + "loss": 0.0319, + "num_input_tokens_seen": 147924704, + "step": 68565 + }, + { + "epoch": 11.185970636215334, + "grad_norm": 0.001250010565854609, + "learning_rate": 0.0004837880783533447, + "loss": 0.0235, + "num_input_tokens_seen": 147935872, + "step": 68570 + }, + { + "epoch": 11.18678629690049, + "grad_norm": 0.08254896849393845, + "learning_rate": 0.00048371693612089996, + "loss": 0.0172, + "num_input_tokens_seen": 147946720, + "step": 68575 + }, + { + "epoch": 11.187601957585644, + "grad_norm": 0.0035739855375140905, + "learning_rate": 0.00048364579421845245, + "loss": 0.0034, + "num_input_tokens_seen": 147957920, + "step": 68580 + }, + { + "epoch": 11.1884176182708, + "grad_norm": 0.40617141127586365, + "learning_rate": 0.0004835746526474434, + "loss": 0.0319, + "num_input_tokens_seen": 147968768, + "step": 68585 + }, + { + "epoch": 11.189233278955955, + "grad_norm": 0.005074291955679655, + "learning_rate": 0.00048350351140931505, + "loss": 0.0042, + "num_input_tokens_seen": 147979808, + "step": 68590 + }, + { + "epoch": 11.190048939641109, + "grad_norm": 0.29348987340927124, + "learning_rate": 0.00048343237050550876, + "loss": 0.0734, + "num_input_tokens_seen": 147990080, + "step": 68595 + }, + { + "epoch": 11.190864600326265, + "grad_norm": 0.23754937946796417, + "learning_rate": 0.0004833612299374667, + "loss": 0.0106, + "num_input_tokens_seen": 148001408, + "step": 68600 + }, + { + "epoch": 11.191680261011419, + "grad_norm": 0.02339480072259903, + "learning_rate": 0.0004832900897066303, + "loss": 0.0276, + "num_input_tokens_seen": 148011936, + "step": 68605 + }, + { + "epoch": 11.192495921696574, + "grad_norm": 0.3079979419708252, + "learning_rate": 0.0004832189498144415, + "loss": 0.1921, + "num_input_tokens_seen": 148022752, + "step": 68610 + }, + { + "epoch": 11.19331158238173, + "grad_norm": 0.2807910144329071, + "learning_rate": 0.0004831478102623419, + "loss": 0.0149, + "num_input_tokens_seen": 148033536, + "step": 68615 + }, + { + "epoch": 11.194127243066884, + "grad_norm": 0.002254726132377982, + "learning_rate": 0.0004830766710517733, + "loss": 0.0042, + "num_input_tokens_seen": 148045216, + "step": 68620 + }, + { + "epoch": 11.19494290375204, + "grad_norm": 0.002774834167212248, + "learning_rate": 0.00048300553218417753, + "loss": 0.0105, + "num_input_tokens_seen": 148055904, + "step": 68625 + }, + { + "epoch": 11.195758564437194, + "grad_norm": 0.003052889835089445, + "learning_rate": 0.0004829343936609961, + "loss": 0.0237, + "num_input_tokens_seen": 148066080, + "step": 68630 + }, + { + "epoch": 11.19657422512235, + "grad_norm": 0.0019634098280221224, + "learning_rate": 0.00048286325548367083, + "loss": 0.1311, + "num_input_tokens_seen": 148076992, + "step": 68635 + }, + { + "epoch": 11.197389885807505, + "grad_norm": 0.010378489270806313, + "learning_rate": 0.0004827921176536435, + "loss": 0.0099, + "num_input_tokens_seen": 148086624, + "step": 68640 + }, + { + "epoch": 11.198205546492659, + "grad_norm": 0.04491036757826805, + "learning_rate": 0.00048272098017235573, + "loss": 0.0085, + "num_input_tokens_seen": 148096672, + "step": 68645 + }, + { + "epoch": 11.199021207177815, + "grad_norm": 0.00126813561655581, + "learning_rate": 0.0004826498430412492, + "loss": 0.0016, + "num_input_tokens_seen": 148106496, + "step": 68650 + }, + { + "epoch": 11.199836867862969, + "grad_norm": 0.19535493850708008, + "learning_rate": 0.00048257870626176565, + "loss": 0.0465, + "num_input_tokens_seen": 148117472, + "step": 68655 + }, + { + "epoch": 11.200652528548124, + "grad_norm": 0.07665639370679855, + "learning_rate": 0.00048250756983534657, + "loss": 0.006, + "num_input_tokens_seen": 148126592, + "step": 68660 + }, + { + "epoch": 11.201468189233278, + "grad_norm": 0.212269127368927, + "learning_rate": 0.000482436433763434, + "loss": 0.021, + "num_input_tokens_seen": 148138272, + "step": 68665 + }, + { + "epoch": 11.202283849918434, + "grad_norm": 0.0254303477704525, + "learning_rate": 0.00048236529804746915, + "loss": 0.0203, + "num_input_tokens_seen": 148148864, + "step": 68670 + }, + { + "epoch": 11.20309951060359, + "grad_norm": 0.06364794075489044, + "learning_rate": 0.0004822941626888941, + "loss": 0.0227, + "num_input_tokens_seen": 148159296, + "step": 68675 + }, + { + "epoch": 11.203915171288743, + "grad_norm": 0.02049858681857586, + "learning_rate": 0.0004822230276891502, + "loss": 0.0276, + "num_input_tokens_seen": 148169088, + "step": 68680 + }, + { + "epoch": 11.2047308319739, + "grad_norm": 0.005524788983166218, + "learning_rate": 0.00048215189304967934, + "loss": 0.0425, + "num_input_tokens_seen": 148180384, + "step": 68685 + }, + { + "epoch": 11.205546492659053, + "grad_norm": 0.02312077395617962, + "learning_rate": 0.00048208075877192275, + "loss": 0.0052, + "num_input_tokens_seen": 148190528, + "step": 68690 + }, + { + "epoch": 11.206362153344209, + "grad_norm": 0.055775344371795654, + "learning_rate": 0.0004820096248573226, + "loss": 0.0062, + "num_input_tokens_seen": 148201216, + "step": 68695 + }, + { + "epoch": 11.207177814029365, + "grad_norm": 0.03741117939352989, + "learning_rate": 0.00048193849130732, + "loss": 0.0096, + "num_input_tokens_seen": 148212160, + "step": 68700 + }, + { + "epoch": 11.207993474714518, + "grad_norm": 0.6265466213226318, + "learning_rate": 0.00048186735812335695, + "loss": 0.036, + "num_input_tokens_seen": 148223712, + "step": 68705 + }, + { + "epoch": 11.208809135399674, + "grad_norm": 0.005353438202291727, + "learning_rate": 0.0004817962253068747, + "loss": 0.0045, + "num_input_tokens_seen": 148234592, + "step": 68710 + }, + { + "epoch": 11.209624796084828, + "grad_norm": 0.45418357849121094, + "learning_rate": 0.0004817250928593153, + "loss": 0.0473, + "num_input_tokens_seen": 148245440, + "step": 68715 + }, + { + "epoch": 11.210440456769984, + "grad_norm": 0.009325501509010792, + "learning_rate": 0.0004816539607821198, + "loss": 0.1387, + "num_input_tokens_seen": 148255904, + "step": 68720 + }, + { + "epoch": 11.21125611745514, + "grad_norm": 0.015116691589355469, + "learning_rate": 0.0004815828290767303, + "loss": 0.0596, + "num_input_tokens_seen": 148266880, + "step": 68725 + }, + { + "epoch": 11.212071778140293, + "grad_norm": 0.0015993528068065643, + "learning_rate": 0.00048151169774458797, + "loss": 0.0829, + "num_input_tokens_seen": 148277760, + "step": 68730 + }, + { + "epoch": 11.21288743882545, + "grad_norm": 0.010977262631058693, + "learning_rate": 0.00048144056678713445, + "loss": 0.0033, + "num_input_tokens_seen": 148288160, + "step": 68735 + }, + { + "epoch": 11.213703099510603, + "grad_norm": 0.47383585572242737, + "learning_rate": 0.00048136943620581164, + "loss": 0.1119, + "num_input_tokens_seen": 148298336, + "step": 68740 + }, + { + "epoch": 11.214518760195759, + "grad_norm": 0.14143721759319305, + "learning_rate": 0.00048129830600206067, + "loss": 0.0607, + "num_input_tokens_seen": 148309248, + "step": 68745 + }, + { + "epoch": 11.215334420880913, + "grad_norm": 0.6546066999435425, + "learning_rate": 0.0004812271761773234, + "loss": 0.0397, + "num_input_tokens_seen": 148319904, + "step": 68750 + }, + { + "epoch": 11.216150081566068, + "grad_norm": 0.03191588073968887, + "learning_rate": 0.00048115604673304105, + "loss": 0.0073, + "num_input_tokens_seen": 148331264, + "step": 68755 + }, + { + "epoch": 11.216965742251224, + "grad_norm": 0.006129416637122631, + "learning_rate": 0.0004810849176706555, + "loss": 0.0255, + "num_input_tokens_seen": 148342624, + "step": 68760 + }, + { + "epoch": 11.217781402936378, + "grad_norm": 0.0005666811484843493, + "learning_rate": 0.00048101378899160786, + "loss": 0.0046, + "num_input_tokens_seen": 148353632, + "step": 68765 + }, + { + "epoch": 11.218597063621534, + "grad_norm": 0.02964751236140728, + "learning_rate": 0.0004809426606973401, + "loss": 0.0327, + "num_input_tokens_seen": 148362816, + "step": 68770 + }, + { + "epoch": 11.219412724306688, + "grad_norm": 0.12862356007099152, + "learning_rate": 0.00048087153278929327, + "loss": 0.0117, + "num_input_tokens_seen": 148373792, + "step": 68775 + }, + { + "epoch": 11.220228384991843, + "grad_norm": 0.006749541498720646, + "learning_rate": 0.0004808004052689093, + "loss": 0.0143, + "num_input_tokens_seen": 148384640, + "step": 68780 + }, + { + "epoch": 11.221044045676999, + "grad_norm": 0.4781631827354431, + "learning_rate": 0.0004807292781376294, + "loss": 0.0609, + "num_input_tokens_seen": 148395424, + "step": 68785 + }, + { + "epoch": 11.221859706362153, + "grad_norm": 0.309983491897583, + "learning_rate": 0.0004806581513968951, + "loss": 0.0668, + "num_input_tokens_seen": 148406304, + "step": 68790 + }, + { + "epoch": 11.222675367047309, + "grad_norm": 0.009176922030746937, + "learning_rate": 0.00048058702504814795, + "loss": 0.0621, + "num_input_tokens_seen": 148416928, + "step": 68795 + }, + { + "epoch": 11.223491027732463, + "grad_norm": 0.019404655322432518, + "learning_rate": 0.0004805158990928293, + "loss": 0.0236, + "num_input_tokens_seen": 148428768, + "step": 68800 + }, + { + "epoch": 11.224306688417618, + "grad_norm": 0.01851831190288067, + "learning_rate": 0.0004804447735323806, + "loss": 0.0081, + "num_input_tokens_seen": 148440256, + "step": 68805 + }, + { + "epoch": 11.225122349102774, + "grad_norm": 0.004205439705401659, + "learning_rate": 0.0004803736483682436, + "loss": 0.0066, + "num_input_tokens_seen": 148450016, + "step": 68810 + }, + { + "epoch": 11.225938009787928, + "grad_norm": 0.026498988270759583, + "learning_rate": 0.0004803025236018593, + "loss": 0.0131, + "num_input_tokens_seen": 148462336, + "step": 68815 + }, + { + "epoch": 11.226753670473084, + "grad_norm": 0.589423656463623, + "learning_rate": 0.00048023139923466954, + "loss": 0.119, + "num_input_tokens_seen": 148473344, + "step": 68820 + }, + { + "epoch": 11.227569331158238, + "grad_norm": 0.5795223116874695, + "learning_rate": 0.00048016027526811536, + "loss": 0.0358, + "num_input_tokens_seen": 148483936, + "step": 68825 + }, + { + "epoch": 11.228384991843393, + "grad_norm": 0.0032945279963314533, + "learning_rate": 0.00048008915170363853, + "loss": 0.0065, + "num_input_tokens_seen": 148493440, + "step": 68830 + }, + { + "epoch": 11.229200652528547, + "grad_norm": 0.00642517302185297, + "learning_rate": 0.0004800180285426802, + "loss": 0.0024, + "num_input_tokens_seen": 148504192, + "step": 68835 + }, + { + "epoch": 11.230016313213703, + "grad_norm": 0.035746075212955475, + "learning_rate": 0.00047994690578668175, + "loss": 0.0267, + "num_input_tokens_seen": 148515520, + "step": 68840 + }, + { + "epoch": 11.230831973898859, + "grad_norm": 0.012348620221018791, + "learning_rate": 0.000479875783437085, + "loss": 0.0508, + "num_input_tokens_seen": 148526464, + "step": 68845 + }, + { + "epoch": 11.231647634584013, + "grad_norm": 0.012041166424751282, + "learning_rate": 0.00047980466149533075, + "loss": 0.0408, + "num_input_tokens_seen": 148537248, + "step": 68850 + }, + { + "epoch": 11.232463295269168, + "grad_norm": 0.2354668378829956, + "learning_rate": 0.0004797335399628609, + "loss": 0.1274, + "num_input_tokens_seen": 148547904, + "step": 68855 + }, + { + "epoch": 11.233278955954322, + "grad_norm": 0.00984420906752348, + "learning_rate": 0.0004796624188411163, + "loss": 0.0088, + "num_input_tokens_seen": 148559712, + "step": 68860 + }, + { + "epoch": 11.234094616639478, + "grad_norm": 0.007017176598310471, + "learning_rate": 0.00047959129813153885, + "loss": 0.0028, + "num_input_tokens_seen": 148571040, + "step": 68865 + }, + { + "epoch": 11.234910277324634, + "grad_norm": 0.20557329058647156, + "learning_rate": 0.00047952017783556945, + "loss": 0.0076, + "num_input_tokens_seen": 148581728, + "step": 68870 + }, + { + "epoch": 11.235725938009788, + "grad_norm": 0.17598062753677368, + "learning_rate": 0.00047944905795464977, + "loss": 0.0139, + "num_input_tokens_seen": 148593056, + "step": 68875 + }, + { + "epoch": 11.236541598694943, + "grad_norm": 0.08315448462963104, + "learning_rate": 0.0004793779384902208, + "loss": 0.0108, + "num_input_tokens_seen": 148603136, + "step": 68880 + }, + { + "epoch": 11.237357259380097, + "grad_norm": 0.365128755569458, + "learning_rate": 0.00047930681944372434, + "loss": 0.0958, + "num_input_tokens_seen": 148613472, + "step": 68885 + }, + { + "epoch": 11.238172920065253, + "grad_norm": 0.0631132423877716, + "learning_rate": 0.00047923570081660115, + "loss": 0.0086, + "num_input_tokens_seen": 148624000, + "step": 68890 + }, + { + "epoch": 11.238988580750409, + "grad_norm": 0.0025263717398047447, + "learning_rate": 0.0004791645826102931, + "loss": 0.0028, + "num_input_tokens_seen": 148636000, + "step": 68895 + }, + { + "epoch": 11.239804241435563, + "grad_norm": 0.003140080953016877, + "learning_rate": 0.000479093464826241, + "loss": 0.0274, + "num_input_tokens_seen": 148646720, + "step": 68900 + }, + { + "epoch": 11.240619902120718, + "grad_norm": 0.010016946122050285, + "learning_rate": 0.00047902234746588653, + "loss": 0.004, + "num_input_tokens_seen": 148656288, + "step": 68905 + }, + { + "epoch": 11.241435562805872, + "grad_norm": 0.1743178814649582, + "learning_rate": 0.0004789512305306706, + "loss": 0.0159, + "num_input_tokens_seen": 148666752, + "step": 68910 + }, + { + "epoch": 11.242251223491028, + "grad_norm": 0.04441777244210243, + "learning_rate": 0.0004788801140220349, + "loss": 0.0053, + "num_input_tokens_seen": 148677248, + "step": 68915 + }, + { + "epoch": 11.243066884176184, + "grad_norm": 0.0007409164099954069, + "learning_rate": 0.00047880899794142026, + "loss": 0.0025, + "num_input_tokens_seen": 148686624, + "step": 68920 + }, + { + "epoch": 11.243882544861338, + "grad_norm": 0.0016849327366799116, + "learning_rate": 0.00047873788229026826, + "loss": 0.0071, + "num_input_tokens_seen": 148696864, + "step": 68925 + }, + { + "epoch": 11.244698205546493, + "grad_norm": 0.041731882840394974, + "learning_rate": 0.0004786667670700201, + "loss": 0.0693, + "num_input_tokens_seen": 148707264, + "step": 68930 + }, + { + "epoch": 11.245513866231647, + "grad_norm": 0.0040927305817604065, + "learning_rate": 0.00047859565228211695, + "loss": 0.0032, + "num_input_tokens_seen": 148717856, + "step": 68935 + }, + { + "epoch": 11.246329526916803, + "grad_norm": 0.5775235295295715, + "learning_rate": 0.00047852453792799997, + "loss": 0.145, + "num_input_tokens_seen": 148728416, + "step": 68940 + }, + { + "epoch": 11.247145187601957, + "grad_norm": 0.030834056437015533, + "learning_rate": 0.0004784534240091105, + "loss": 0.0069, + "num_input_tokens_seen": 148739904, + "step": 68945 + }, + { + "epoch": 11.247960848287113, + "grad_norm": 0.012964191846549511, + "learning_rate": 0.00047838231052688975, + "loss": 0.0147, + "num_input_tokens_seen": 148749984, + "step": 68950 + }, + { + "epoch": 11.248776508972268, + "grad_norm": 0.0015898104757070541, + "learning_rate": 0.0004783111974827789, + "loss": 0.0024, + "num_input_tokens_seen": 148761216, + "step": 68955 + }, + { + "epoch": 11.249592169657422, + "grad_norm": 0.006432735826820135, + "learning_rate": 0.0004782400848782192, + "loss": 0.0633, + "num_input_tokens_seen": 148772288, + "step": 68960 + }, + { + "epoch": 11.250407830342578, + "grad_norm": 0.09992986917495728, + "learning_rate": 0.0004781689727146517, + "loss": 0.0159, + "num_input_tokens_seen": 148784640, + "step": 68965 + }, + { + "epoch": 11.251223491027732, + "grad_norm": 0.003400498302653432, + "learning_rate": 0.0004780978609935178, + "loss": 0.005, + "num_input_tokens_seen": 148796224, + "step": 68970 + }, + { + "epoch": 11.252039151712887, + "grad_norm": 0.028594650328159332, + "learning_rate": 0.00047802674971625825, + "loss": 0.0249, + "num_input_tokens_seen": 148807360, + "step": 68975 + }, + { + "epoch": 11.252854812398043, + "grad_norm": 0.002770372899249196, + "learning_rate": 0.0004779556388843148, + "loss": 0.0134, + "num_input_tokens_seen": 148818592, + "step": 68980 + }, + { + "epoch": 11.253670473083197, + "grad_norm": 0.009856651537120342, + "learning_rate": 0.0004778845284991281, + "loss": 0.0113, + "num_input_tokens_seen": 148828448, + "step": 68985 + }, + { + "epoch": 11.254486133768353, + "grad_norm": 0.010934053920209408, + "learning_rate": 0.00047781341856213965, + "loss": 0.006, + "num_input_tokens_seen": 148839456, + "step": 68990 + }, + { + "epoch": 11.255301794453507, + "grad_norm": 0.010733716189861298, + "learning_rate": 0.00047774230907479025, + "loss": 0.129, + "num_input_tokens_seen": 148849056, + "step": 68995 + }, + { + "epoch": 11.256117455138662, + "grad_norm": 0.28270435333251953, + "learning_rate": 0.0004776712000385214, + "loss": 0.022, + "num_input_tokens_seen": 148858368, + "step": 69000 + }, + { + "epoch": 11.256933115823816, + "grad_norm": 0.02010207064449787, + "learning_rate": 0.0004776000914547738, + "loss": 0.006, + "num_input_tokens_seen": 148869792, + "step": 69005 + }, + { + "epoch": 11.257748776508972, + "grad_norm": 0.013458766974508762, + "learning_rate": 0.00047752898332498894, + "loss": 0.0093, + "num_input_tokens_seen": 148880800, + "step": 69010 + }, + { + "epoch": 11.258564437194128, + "grad_norm": 0.13359004259109497, + "learning_rate": 0.00047745787565060756, + "loss": 0.0084, + "num_input_tokens_seen": 148891392, + "step": 69015 + }, + { + "epoch": 11.259380097879282, + "grad_norm": 0.14816893637180328, + "learning_rate": 0.0004773867684330711, + "loss": 0.0091, + "num_input_tokens_seen": 148903200, + "step": 69020 + }, + { + "epoch": 11.260195758564437, + "grad_norm": 0.06728312373161316, + "learning_rate": 0.0004773156616738203, + "loss": 0.0232, + "num_input_tokens_seen": 148913856, + "step": 69025 + }, + { + "epoch": 11.261011419249591, + "grad_norm": 0.00261763297021389, + "learning_rate": 0.00047724455537429656, + "loss": 0.0106, + "num_input_tokens_seen": 148924192, + "step": 69030 + }, + { + "epoch": 11.261827079934747, + "grad_norm": 0.8279803991317749, + "learning_rate": 0.00047717344953594054, + "loss": 0.1055, + "num_input_tokens_seen": 148934240, + "step": 69035 + }, + { + "epoch": 11.262642740619903, + "grad_norm": 0.09934436529874802, + "learning_rate": 0.0004771023441601938, + "loss": 0.1276, + "num_input_tokens_seen": 148946304, + "step": 69040 + }, + { + "epoch": 11.263458401305057, + "grad_norm": 0.002370397327467799, + "learning_rate": 0.0004770312392484968, + "loss": 0.0132, + "num_input_tokens_seen": 148955584, + "step": 69045 + }, + { + "epoch": 11.264274061990212, + "grad_norm": 0.0007140504894778132, + "learning_rate": 0.000476960134802291, + "loss": 0.0091, + "num_input_tokens_seen": 148965760, + "step": 69050 + }, + { + "epoch": 11.265089722675366, + "grad_norm": 0.00588075490668416, + "learning_rate": 0.00047688903082301746, + "loss": 0.0199, + "num_input_tokens_seen": 148976544, + "step": 69055 + }, + { + "epoch": 11.265905383360522, + "grad_norm": 0.0013728328049182892, + "learning_rate": 0.00047681792731211684, + "loss": 0.0034, + "num_input_tokens_seen": 148987872, + "step": 69060 + }, + { + "epoch": 11.266721044045678, + "grad_norm": 0.009754364378750324, + "learning_rate": 0.00047674682427103045, + "loss": 0.0074, + "num_input_tokens_seen": 148999296, + "step": 69065 + }, + { + "epoch": 11.267536704730832, + "grad_norm": 0.7326864004135132, + "learning_rate": 0.00047667572170119905, + "loss": 0.0297, + "num_input_tokens_seen": 149010464, + "step": 69070 + }, + { + "epoch": 11.268352365415987, + "grad_norm": 0.020309919491410255, + "learning_rate": 0.00047660461960406385, + "loss": 0.0045, + "num_input_tokens_seen": 149022368, + "step": 69075 + }, + { + "epoch": 11.269168026101141, + "grad_norm": 0.11338143795728683, + "learning_rate": 0.0004765335179810656, + "loss": 0.0137, + "num_input_tokens_seen": 149033152, + "step": 69080 + }, + { + "epoch": 11.269983686786297, + "grad_norm": 0.1864491105079651, + "learning_rate": 0.00047646241683364554, + "loss": 0.126, + "num_input_tokens_seen": 149044768, + "step": 69085 + }, + { + "epoch": 11.270799347471453, + "grad_norm": 0.0044025843963027, + "learning_rate": 0.0004763913161632443, + "loss": 0.013, + "num_input_tokens_seen": 149055168, + "step": 69090 + }, + { + "epoch": 11.271615008156607, + "grad_norm": 0.05166783183813095, + "learning_rate": 0.00047632021597130304, + "loss": 0.0075, + "num_input_tokens_seen": 149066720, + "step": 69095 + }, + { + "epoch": 11.272430668841762, + "grad_norm": 0.04078592732548714, + "learning_rate": 0.0004762491162592627, + "loss": 0.0186, + "num_input_tokens_seen": 149076768, + "step": 69100 + }, + { + "epoch": 11.273246329526916, + "grad_norm": 0.01061907596886158, + "learning_rate": 0.00047617801702856406, + "loss": 0.024, + "num_input_tokens_seen": 149087488, + "step": 69105 + }, + { + "epoch": 11.274061990212072, + "grad_norm": 0.06693772226572037, + "learning_rate": 0.00047610691828064815, + "loss": 0.1343, + "num_input_tokens_seen": 149099296, + "step": 69110 + }, + { + "epoch": 11.274877650897226, + "grad_norm": 0.10493671149015427, + "learning_rate": 0.0004760358200169559, + "loss": 0.0248, + "num_input_tokens_seen": 149109888, + "step": 69115 + }, + { + "epoch": 11.275693311582382, + "grad_norm": 0.02473524957895279, + "learning_rate": 0.000475964722238928, + "loss": 0.0088, + "num_input_tokens_seen": 149120992, + "step": 69120 + }, + { + "epoch": 11.276508972267537, + "grad_norm": 0.004808297846466303, + "learning_rate": 0.00047589362494800574, + "loss": 0.0417, + "num_input_tokens_seen": 149131232, + "step": 69125 + }, + { + "epoch": 11.277324632952691, + "grad_norm": 0.03694973513484001, + "learning_rate": 0.00047582252814562954, + "loss": 0.0052, + "num_input_tokens_seen": 149143168, + "step": 69130 + }, + { + "epoch": 11.278140293637847, + "grad_norm": 0.012975016608834267, + "learning_rate": 0.0004757514318332407, + "loss": 0.0038, + "num_input_tokens_seen": 149153024, + "step": 69135 + }, + { + "epoch": 11.278955954323001, + "grad_norm": 0.10611530393362045, + "learning_rate": 0.0004756803360122796, + "loss": 0.0301, + "num_input_tokens_seen": 149164000, + "step": 69140 + }, + { + "epoch": 11.279771615008157, + "grad_norm": 0.0009083959157578647, + "learning_rate": 0.00047560924068418763, + "loss": 0.0342, + "num_input_tokens_seen": 149175520, + "step": 69145 + }, + { + "epoch": 11.280587275693312, + "grad_norm": 0.01197612751275301, + "learning_rate": 0.00047553814585040506, + "loss": 0.0075, + "num_input_tokens_seen": 149186432, + "step": 69150 + }, + { + "epoch": 11.281402936378466, + "grad_norm": 0.3422130048274994, + "learning_rate": 0.00047546705151237323, + "loss": 0.0899, + "num_input_tokens_seen": 149197216, + "step": 69155 + }, + { + "epoch": 11.282218597063622, + "grad_norm": 0.3588181436061859, + "learning_rate": 0.00047539595767153255, + "loss": 0.1696, + "num_input_tokens_seen": 149208000, + "step": 69160 + }, + { + "epoch": 11.283034257748776, + "grad_norm": 0.22462858259677887, + "learning_rate": 0.00047532486432932394, + "loss": 0.0192, + "num_input_tokens_seen": 149218592, + "step": 69165 + }, + { + "epoch": 11.283849918433932, + "grad_norm": 0.02500959485769272, + "learning_rate": 0.00047525377148718845, + "loss": 0.005, + "num_input_tokens_seen": 149229856, + "step": 69170 + }, + { + "epoch": 11.284665579119087, + "grad_norm": 0.02116668038070202, + "learning_rate": 0.00047518267914656656, + "loss": 0.0101, + "num_input_tokens_seen": 149241472, + "step": 69175 + }, + { + "epoch": 11.285481239804241, + "grad_norm": 0.008388462476432323, + "learning_rate": 0.0004751115873088992, + "loss": 0.0273, + "num_input_tokens_seen": 149252256, + "step": 69180 + }, + { + "epoch": 11.286296900489397, + "grad_norm": 0.05486051365733147, + "learning_rate": 0.0004750404959756271, + "loss": 0.0205, + "num_input_tokens_seen": 149262752, + "step": 69185 + }, + { + "epoch": 11.28711256117455, + "grad_norm": 0.0405304878950119, + "learning_rate": 0.0004749694051481911, + "loss": 0.0065, + "num_input_tokens_seen": 149273408, + "step": 69190 + }, + { + "epoch": 11.287928221859707, + "grad_norm": 0.09597187489271164, + "learning_rate": 0.00047489831482803167, + "loss": 0.0235, + "num_input_tokens_seen": 149284384, + "step": 69195 + }, + { + "epoch": 11.28874388254486, + "grad_norm": 0.4298941195011139, + "learning_rate": 0.00047482722501658993, + "loss": 0.0158, + "num_input_tokens_seen": 149295808, + "step": 69200 + }, + { + "epoch": 11.289559543230016, + "grad_norm": 0.03213965892791748, + "learning_rate": 0.00047475613571530624, + "loss": 0.0079, + "num_input_tokens_seen": 149306560, + "step": 69205 + }, + { + "epoch": 11.290375203915172, + "grad_norm": 0.23173700273036957, + "learning_rate": 0.0004746850469256216, + "loss": 0.1561, + "num_input_tokens_seen": 149318272, + "step": 69210 + }, + { + "epoch": 11.291190864600326, + "grad_norm": 0.21506041288375854, + "learning_rate": 0.0004746139586489765, + "loss": 0.0417, + "num_input_tokens_seen": 149329344, + "step": 69215 + }, + { + "epoch": 11.292006525285482, + "grad_norm": 0.010741113685071468, + "learning_rate": 0.00047454287088681194, + "loss": 0.0271, + "num_input_tokens_seen": 149340032, + "step": 69220 + }, + { + "epoch": 11.292822185970635, + "grad_norm": 0.44709673523902893, + "learning_rate": 0.0004744717836405681, + "loss": 0.2083, + "num_input_tokens_seen": 149351072, + "step": 69225 + }, + { + "epoch": 11.293637846655791, + "grad_norm": 0.028850000351667404, + "learning_rate": 0.00047440069691168617, + "loss": 0.0087, + "num_input_tokens_seen": 149362176, + "step": 69230 + }, + { + "epoch": 11.294453507340947, + "grad_norm": 0.002180980984121561, + "learning_rate": 0.0004743296107016065, + "loss": 0.0026, + "num_input_tokens_seen": 149373056, + "step": 69235 + }, + { + "epoch": 11.2952691680261, + "grad_norm": 0.006334132514894009, + "learning_rate": 0.0004742585250117698, + "loss": 0.0022, + "num_input_tokens_seen": 149383008, + "step": 69240 + }, + { + "epoch": 11.296084828711257, + "grad_norm": 0.00515410304069519, + "learning_rate": 0.00047418743984361676, + "loss": 0.0039, + "num_input_tokens_seen": 149393280, + "step": 69245 + }, + { + "epoch": 11.29690048939641, + "grad_norm": 0.002374490024521947, + "learning_rate": 0.0004741163551985881, + "loss": 0.0042, + "num_input_tokens_seen": 149404352, + "step": 69250 + }, + { + "epoch": 11.297716150081566, + "grad_norm": 0.04758382961153984, + "learning_rate": 0.00047404527107812423, + "loss": 0.005, + "num_input_tokens_seen": 149415040, + "step": 69255 + }, + { + "epoch": 11.298531810766722, + "grad_norm": 0.017822077497839928, + "learning_rate": 0.00047397418748366596, + "loss": 0.0181, + "num_input_tokens_seen": 149426528, + "step": 69260 + }, + { + "epoch": 11.299347471451876, + "grad_norm": 0.003392835846170783, + "learning_rate": 0.0004739031044166536, + "loss": 0.0097, + "num_input_tokens_seen": 149437056, + "step": 69265 + }, + { + "epoch": 11.300163132137031, + "grad_norm": 0.000765681266784668, + "learning_rate": 0.0004738320218785281, + "loss": 0.1092, + "num_input_tokens_seen": 149448320, + "step": 69270 + }, + { + "epoch": 11.300978792822185, + "grad_norm": 0.03259848430752754, + "learning_rate": 0.00047376093987072985, + "loss": 0.0121, + "num_input_tokens_seen": 149458496, + "step": 69275 + }, + { + "epoch": 11.301794453507341, + "grad_norm": 0.03993254154920578, + "learning_rate": 0.00047368985839469946, + "loss": 0.0756, + "num_input_tokens_seen": 149469792, + "step": 69280 + }, + { + "epoch": 11.302610114192497, + "grad_norm": 0.004871142562478781, + "learning_rate": 0.00047361877745187743, + "loss": 0.0027, + "num_input_tokens_seen": 149481440, + "step": 69285 + }, + { + "epoch": 11.30342577487765, + "grad_norm": 0.0011485691647976637, + "learning_rate": 0.0004735476970437043, + "loss": 0.021, + "num_input_tokens_seen": 149493216, + "step": 69290 + }, + { + "epoch": 11.304241435562806, + "grad_norm": 0.0032224832102656364, + "learning_rate": 0.0004734766171716208, + "loss": 0.1398, + "num_input_tokens_seen": 149503680, + "step": 69295 + }, + { + "epoch": 11.30505709624796, + "grad_norm": 0.0025530222337692976, + "learning_rate": 0.0004734055378370671, + "loss": 0.1441, + "num_input_tokens_seen": 149514464, + "step": 69300 + }, + { + "epoch": 11.305872756933116, + "grad_norm": 0.005508598405867815, + "learning_rate": 0.00047333445904148414, + "loss": 0.0164, + "num_input_tokens_seen": 149524896, + "step": 69305 + }, + { + "epoch": 11.30668841761827, + "grad_norm": 0.017201591283082962, + "learning_rate": 0.0004732633807863119, + "loss": 0.0132, + "num_input_tokens_seen": 149536832, + "step": 69310 + }, + { + "epoch": 11.307504078303426, + "grad_norm": 0.3183722198009491, + "learning_rate": 0.0004731923030729915, + "loss": 0.0554, + "num_input_tokens_seen": 149548352, + "step": 69315 + }, + { + "epoch": 11.308319738988581, + "grad_norm": 0.02199809066951275, + "learning_rate": 0.0004731212259029628, + "loss": 0.013, + "num_input_tokens_seen": 149558848, + "step": 69320 + }, + { + "epoch": 11.309135399673735, + "grad_norm": 0.01002193707972765, + "learning_rate": 0.0004730501492776668, + "loss": 0.0185, + "num_input_tokens_seen": 149570080, + "step": 69325 + }, + { + "epoch": 11.309951060358891, + "grad_norm": 0.0013371697859838605, + "learning_rate": 0.00047297907319854347, + "loss": 0.0131, + "num_input_tokens_seen": 149581920, + "step": 69330 + }, + { + "epoch": 11.310766721044045, + "grad_norm": 0.036095406860113144, + "learning_rate": 0.0004729079976670338, + "loss": 0.0175, + "num_input_tokens_seen": 149593408, + "step": 69335 + }, + { + "epoch": 11.3115823817292, + "grad_norm": 0.003752897959202528, + "learning_rate": 0.00047283692268457764, + "loss": 0.1671, + "num_input_tokens_seen": 149604640, + "step": 69340 + }, + { + "epoch": 11.312398042414356, + "grad_norm": 0.005528479348868132, + "learning_rate": 0.0004727658482526159, + "loss": 0.0055, + "num_input_tokens_seen": 149614880, + "step": 69345 + }, + { + "epoch": 11.31321370309951, + "grad_norm": 0.007007166743278503, + "learning_rate": 0.00047269477437258863, + "loss": 0.0144, + "num_input_tokens_seen": 149625824, + "step": 69350 + }, + { + "epoch": 11.314029363784666, + "grad_norm": 0.04010507091879845, + "learning_rate": 0.0004726237010459366, + "loss": 0.1363, + "num_input_tokens_seen": 149637376, + "step": 69355 + }, + { + "epoch": 11.31484502446982, + "grad_norm": 0.010353878140449524, + "learning_rate": 0.00047255262827409974, + "loss": 0.0057, + "num_input_tokens_seen": 149648480, + "step": 69360 + }, + { + "epoch": 11.315660685154976, + "grad_norm": 0.023701857775449753, + "learning_rate": 0.00047248155605851896, + "loss": 0.0095, + "num_input_tokens_seen": 149659584, + "step": 69365 + }, + { + "epoch": 11.31647634584013, + "grad_norm": 0.6545511484146118, + "learning_rate": 0.0004724104844006341, + "loss": 0.0246, + "num_input_tokens_seen": 149670112, + "step": 69370 + }, + { + "epoch": 11.317292006525285, + "grad_norm": 0.027166441082954407, + "learning_rate": 0.0004723394133018858, + "loss": 0.083, + "num_input_tokens_seen": 149680608, + "step": 69375 + }, + { + "epoch": 11.318107667210441, + "grad_norm": 0.19956211745738983, + "learning_rate": 0.00047226834276371457, + "loss": 0.1215, + "num_input_tokens_seen": 149692128, + "step": 69380 + }, + { + "epoch": 11.318923327895595, + "grad_norm": 0.021688925102353096, + "learning_rate": 0.00047219727278756033, + "loss": 0.0107, + "num_input_tokens_seen": 149702816, + "step": 69385 + }, + { + "epoch": 11.31973898858075, + "grad_norm": 0.037454113364219666, + "learning_rate": 0.0004721262033748639, + "loss": 0.0167, + "num_input_tokens_seen": 149714016, + "step": 69390 + }, + { + "epoch": 11.320554649265905, + "grad_norm": 0.0012636196333914995, + "learning_rate": 0.00047205513452706503, + "loss": 0.0374, + "num_input_tokens_seen": 149725376, + "step": 69395 + }, + { + "epoch": 11.32137030995106, + "grad_norm": 0.7473606467247009, + "learning_rate": 0.0004719840662456046, + "loss": 0.0568, + "num_input_tokens_seen": 149735584, + "step": 69400 + }, + { + "epoch": 11.322185970636216, + "grad_norm": 0.00351674179546535, + "learning_rate": 0.0004719129985319223, + "loss": 0.014, + "num_input_tokens_seen": 149746432, + "step": 69405 + }, + { + "epoch": 11.32300163132137, + "grad_norm": 0.006801532581448555, + "learning_rate": 0.0004718419313874589, + "loss": 0.0185, + "num_input_tokens_seen": 149756800, + "step": 69410 + }, + { + "epoch": 11.323817292006526, + "grad_norm": 0.09331957995891571, + "learning_rate": 0.00047177086481365444, + "loss": 0.0124, + "num_input_tokens_seen": 149767040, + "step": 69415 + }, + { + "epoch": 11.32463295269168, + "grad_norm": 0.03144987300038338, + "learning_rate": 0.00047169979881194927, + "loss": 0.0129, + "num_input_tokens_seen": 149777312, + "step": 69420 + }, + { + "epoch": 11.325448613376835, + "grad_norm": 0.004917765036225319, + "learning_rate": 0.00047162873338378353, + "loss": 0.0618, + "num_input_tokens_seen": 149788000, + "step": 69425 + }, + { + "epoch": 11.326264274061991, + "grad_norm": 0.014339166693389416, + "learning_rate": 0.0004715576685305975, + "loss": 0.0054, + "num_input_tokens_seen": 149798880, + "step": 69430 + }, + { + "epoch": 11.327079934747145, + "grad_norm": 0.23084481060504913, + "learning_rate": 0.0004714866042538313, + "loss": 0.0438, + "num_input_tokens_seen": 149808864, + "step": 69435 + }, + { + "epoch": 11.3278955954323, + "grad_norm": 0.07707815617322922, + "learning_rate": 0.00047141554055492546, + "loss": 0.015, + "num_input_tokens_seen": 149820192, + "step": 69440 + }, + { + "epoch": 11.328711256117455, + "grad_norm": 0.06775520741939545, + "learning_rate": 0.0004713444774353197, + "loss": 0.0309, + "num_input_tokens_seen": 149831296, + "step": 69445 + }, + { + "epoch": 11.32952691680261, + "grad_norm": 0.04490366950631142, + "learning_rate": 0.0004712734148964547, + "loss": 0.01, + "num_input_tokens_seen": 149842688, + "step": 69450 + }, + { + "epoch": 11.330342577487766, + "grad_norm": 0.0018784068524837494, + "learning_rate": 0.00047120235293977023, + "loss": 0.0205, + "num_input_tokens_seen": 149853344, + "step": 69455 + }, + { + "epoch": 11.33115823817292, + "grad_norm": 0.017038045451045036, + "learning_rate": 0.00047113129156670677, + "loss": 0.0116, + "num_input_tokens_seen": 149865568, + "step": 69460 + }, + { + "epoch": 11.331973898858076, + "grad_norm": 0.9298956990242004, + "learning_rate": 0.00047106023077870407, + "loss": 0.1427, + "num_input_tokens_seen": 149877504, + "step": 69465 + }, + { + "epoch": 11.33278955954323, + "grad_norm": 0.3216319978237152, + "learning_rate": 0.00047098917057720275, + "loss": 0.034, + "num_input_tokens_seen": 149888256, + "step": 69470 + }, + { + "epoch": 11.333605220228385, + "grad_norm": 0.0025566897820681334, + "learning_rate": 0.00047091811096364243, + "loss": 0.0482, + "num_input_tokens_seen": 149899424, + "step": 69475 + }, + { + "epoch": 11.33442088091354, + "grad_norm": 0.020846620202064514, + "learning_rate": 0.00047084705193946357, + "loss": 0.0093, + "num_input_tokens_seen": 149911424, + "step": 69480 + }, + { + "epoch": 11.335236541598695, + "grad_norm": 0.22392310202121735, + "learning_rate": 0.0004707759935061063, + "loss": 0.0412, + "num_input_tokens_seen": 149920544, + "step": 69485 + }, + { + "epoch": 11.33605220228385, + "grad_norm": 0.31497204303741455, + "learning_rate": 0.0004707049356650105, + "loss": 0.1324, + "num_input_tokens_seen": 149932032, + "step": 69490 + }, + { + "epoch": 11.336867862969005, + "grad_norm": 0.002394807757809758, + "learning_rate": 0.0004706338784176165, + "loss": 0.0162, + "num_input_tokens_seen": 149942624, + "step": 69495 + }, + { + "epoch": 11.33768352365416, + "grad_norm": 0.10862261056900024, + "learning_rate": 0.000470562821765364, + "loss": 0.0108, + "num_input_tokens_seen": 149954336, + "step": 69500 + }, + { + "epoch": 11.338499184339314, + "grad_norm": 0.07686685770750046, + "learning_rate": 0.0004704917657096934, + "loss": 0.0089, + "num_input_tokens_seen": 149964544, + "step": 69505 + }, + { + "epoch": 11.33931484502447, + "grad_norm": 0.001927865669131279, + "learning_rate": 0.00047042071025204445, + "loss": 0.0078, + "num_input_tokens_seen": 149975744, + "step": 69510 + }, + { + "epoch": 11.340130505709626, + "grad_norm": 0.10945727676153183, + "learning_rate": 0.0004703496553938576, + "loss": 0.0903, + "num_input_tokens_seen": 149986944, + "step": 69515 + }, + { + "epoch": 11.34094616639478, + "grad_norm": 0.46757346391677856, + "learning_rate": 0.00047027860113657235, + "loss": 0.0337, + "num_input_tokens_seen": 149997664, + "step": 69520 + }, + { + "epoch": 11.341761827079935, + "grad_norm": 0.0011434765765443444, + "learning_rate": 0.00047020754748162914, + "loss": 0.0322, + "num_input_tokens_seen": 150008800, + "step": 69525 + }, + { + "epoch": 11.34257748776509, + "grad_norm": 0.036828648298978806, + "learning_rate": 0.0004701364944304675, + "loss": 0.1434, + "num_input_tokens_seen": 150019872, + "step": 69530 + }, + { + "epoch": 11.343393148450245, + "grad_norm": 0.4839276671409607, + "learning_rate": 0.000470065441984528, + "loss": 0.0452, + "num_input_tokens_seen": 150031456, + "step": 69535 + }, + { + "epoch": 11.3442088091354, + "grad_norm": 0.005866473540663719, + "learning_rate": 0.00046999439014525004, + "loss": 0.004, + "num_input_tokens_seen": 150041600, + "step": 69540 + }, + { + "epoch": 11.345024469820554, + "grad_norm": 0.00668323552235961, + "learning_rate": 0.00046992333891407396, + "loss": 0.0194, + "num_input_tokens_seen": 150052160, + "step": 69545 + }, + { + "epoch": 11.34584013050571, + "grad_norm": 0.036016084253787994, + "learning_rate": 0.00046985228829243955, + "loss": 0.0068, + "num_input_tokens_seen": 150062880, + "step": 69550 + }, + { + "epoch": 11.346655791190864, + "grad_norm": 0.036763593554496765, + "learning_rate": 0.0004697812382817868, + "loss": 0.0236, + "num_input_tokens_seen": 150072480, + "step": 69555 + }, + { + "epoch": 11.34747145187602, + "grad_norm": 0.03251663222908974, + "learning_rate": 0.0004697101888835555, + "loss": 0.0061, + "num_input_tokens_seen": 150082656, + "step": 69560 + }, + { + "epoch": 11.348287112561174, + "grad_norm": 0.016623644158244133, + "learning_rate": 0.0004696391400991857, + "loss": 0.0094, + "num_input_tokens_seen": 150093600, + "step": 69565 + }, + { + "epoch": 11.34910277324633, + "grad_norm": 0.20063531398773193, + "learning_rate": 0.0004695680919301173, + "loss": 0.0128, + "num_input_tokens_seen": 150105312, + "step": 69570 + }, + { + "epoch": 11.349918433931485, + "grad_norm": 0.0027389242313802242, + "learning_rate": 0.00046949704437779005, + "loss": 0.1233, + "num_input_tokens_seen": 150116416, + "step": 69575 + }, + { + "epoch": 11.350734094616639, + "grad_norm": 0.041335154324769974, + "learning_rate": 0.0004694259974436438, + "loss": 0.0077, + "num_input_tokens_seen": 150127360, + "step": 69580 + }, + { + "epoch": 11.351549755301795, + "grad_norm": 0.039992302656173706, + "learning_rate": 0.00046935495112911856, + "loss": 0.0108, + "num_input_tokens_seen": 150138688, + "step": 69585 + }, + { + "epoch": 11.352365415986949, + "grad_norm": 0.041558753699064255, + "learning_rate": 0.0004692839054356542, + "loss": 0.02, + "num_input_tokens_seen": 150149088, + "step": 69590 + }, + { + "epoch": 11.353181076672104, + "grad_norm": 0.004218420013785362, + "learning_rate": 0.0004692128603646904, + "loss": 0.0981, + "num_input_tokens_seen": 150159488, + "step": 69595 + }, + { + "epoch": 11.35399673735726, + "grad_norm": 0.00892791897058487, + "learning_rate": 0.0004691418159176671, + "loss": 0.0107, + "num_input_tokens_seen": 150170560, + "step": 69600 + }, + { + "epoch": 11.354812398042414, + "grad_norm": 0.4613715708255768, + "learning_rate": 0.00046907077209602387, + "loss": 0.1462, + "num_input_tokens_seen": 150181760, + "step": 69605 + }, + { + "epoch": 11.35562805872757, + "grad_norm": 0.3809371888637543, + "learning_rate": 0.0004689997289012009, + "loss": 0.1061, + "num_input_tokens_seen": 150192928, + "step": 69610 + }, + { + "epoch": 11.356443719412724, + "grad_norm": 0.03783030062913895, + "learning_rate": 0.0004689286863346376, + "loss": 0.0251, + "num_input_tokens_seen": 150203040, + "step": 69615 + }, + { + "epoch": 11.35725938009788, + "grad_norm": 0.021683964878320694, + "learning_rate": 0.00046885764439777406, + "loss": 0.0783, + "num_input_tokens_seen": 150214464, + "step": 69620 + }, + { + "epoch": 11.358075040783035, + "grad_norm": 0.00647715013474226, + "learning_rate": 0.0004687866030920496, + "loss": 0.078, + "num_input_tokens_seen": 150224864, + "step": 69625 + }, + { + "epoch": 11.358890701468189, + "grad_norm": 0.011663581244647503, + "learning_rate": 0.00046871556241890455, + "loss": 0.0302, + "num_input_tokens_seen": 150235072, + "step": 69630 + }, + { + "epoch": 11.359706362153345, + "grad_norm": 0.02514946274459362, + "learning_rate": 0.000468644522379778, + "loss": 0.0117, + "num_input_tokens_seen": 150246816, + "step": 69635 + }, + { + "epoch": 11.360522022838499, + "grad_norm": 0.0009114954737015069, + "learning_rate": 0.00046857348297611024, + "loss": 0.0063, + "num_input_tokens_seen": 150258624, + "step": 69640 + }, + { + "epoch": 11.361337683523654, + "grad_norm": 0.018556734547019005, + "learning_rate": 0.0004685024442093405, + "loss": 0.0307, + "num_input_tokens_seen": 150268896, + "step": 69645 + }, + { + "epoch": 11.362153344208808, + "grad_norm": 0.09002438932657242, + "learning_rate": 0.00046843140608090897, + "loss": 0.0091, + "num_input_tokens_seen": 150279552, + "step": 69650 + }, + { + "epoch": 11.362969004893964, + "grad_norm": 0.10402621328830719, + "learning_rate": 0.0004683603685922547, + "loss": 0.0191, + "num_input_tokens_seen": 150290304, + "step": 69655 + }, + { + "epoch": 11.36378466557912, + "grad_norm": 0.17773236334323883, + "learning_rate": 0.00046828933174481797, + "loss": 0.0154, + "num_input_tokens_seen": 150301632, + "step": 69660 + }, + { + "epoch": 11.364600326264274, + "grad_norm": 0.04328451305627823, + "learning_rate": 0.000468218295540038, + "loss": 0.0292, + "num_input_tokens_seen": 150312096, + "step": 69665 + }, + { + "epoch": 11.36541598694943, + "grad_norm": 0.0038381677586585283, + "learning_rate": 0.0004681472599793547, + "loss": 0.0206, + "num_input_tokens_seen": 150323840, + "step": 69670 + }, + { + "epoch": 11.366231647634583, + "grad_norm": 0.001222445978783071, + "learning_rate": 0.00046807622506420745, + "loss": 0.0024, + "num_input_tokens_seen": 150335104, + "step": 69675 + }, + { + "epoch": 11.367047308319739, + "grad_norm": 0.0040596467442810535, + "learning_rate": 0.00046800519079603616, + "loss": 0.0865, + "num_input_tokens_seen": 150346016, + "step": 69680 + }, + { + "epoch": 11.367862969004895, + "grad_norm": 0.004096492659300566, + "learning_rate": 0.00046793415717628006, + "loss": 0.0168, + "num_input_tokens_seen": 150358144, + "step": 69685 + }, + { + "epoch": 11.368678629690049, + "grad_norm": 0.024011576548218727, + "learning_rate": 0.000467863124206379, + "loss": 0.0701, + "num_input_tokens_seen": 150368672, + "step": 69690 + }, + { + "epoch": 11.369494290375204, + "grad_norm": 0.0005396094056777656, + "learning_rate": 0.0004677920918877726, + "loss": 0.0079, + "num_input_tokens_seen": 150379520, + "step": 69695 + }, + { + "epoch": 11.370309951060358, + "grad_norm": 0.021570729091763496, + "learning_rate": 0.0004677210602219002, + "loss": 0.1471, + "num_input_tokens_seen": 150390400, + "step": 69700 + }, + { + "epoch": 11.371125611745514, + "grad_norm": 0.08252798765897751, + "learning_rate": 0.00046765002921020165, + "loss": 0.1564, + "num_input_tokens_seen": 150399808, + "step": 69705 + }, + { + "epoch": 11.37194127243067, + "grad_norm": 0.09565167129039764, + "learning_rate": 0.0004675789988541161, + "loss": 0.0185, + "num_input_tokens_seen": 150409728, + "step": 69710 + }, + { + "epoch": 11.372756933115824, + "grad_norm": 0.016505800187587738, + "learning_rate": 0.0004675079691550833, + "loss": 0.0041, + "num_input_tokens_seen": 150418624, + "step": 69715 + }, + { + "epoch": 11.37357259380098, + "grad_norm": 0.10614673793315887, + "learning_rate": 0.0004674369401145428, + "loss": 0.0203, + "num_input_tokens_seen": 150429056, + "step": 69720 + }, + { + "epoch": 11.374388254486133, + "grad_norm": 0.1371740847826004, + "learning_rate": 0.000467365911733934, + "loss": 0.036, + "num_input_tokens_seen": 150440192, + "step": 69725 + }, + { + "epoch": 11.375203915171289, + "grad_norm": 0.022861870005726814, + "learning_rate": 0.0004672948840146964, + "loss": 0.0041, + "num_input_tokens_seen": 150450944, + "step": 69730 + }, + { + "epoch": 11.376019575856443, + "grad_norm": 0.055315207690000534, + "learning_rate": 0.0004672238569582695, + "loss": 0.0335, + "num_input_tokens_seen": 150461632, + "step": 69735 + }, + { + "epoch": 11.376835236541599, + "grad_norm": 0.003046634839847684, + "learning_rate": 0.00046715283056609255, + "loss": 0.0477, + "num_input_tokens_seen": 150473056, + "step": 69740 + }, + { + "epoch": 11.377650897226754, + "grad_norm": 0.03886079788208008, + "learning_rate": 0.0004670818048396054, + "loss": 0.0115, + "num_input_tokens_seen": 150484512, + "step": 69745 + }, + { + "epoch": 11.378466557911908, + "grad_norm": 0.004731168504804373, + "learning_rate": 0.00046701077978024695, + "loss": 0.0199, + "num_input_tokens_seen": 150493760, + "step": 69750 + }, + { + "epoch": 11.379282218597064, + "grad_norm": 0.003077416680753231, + "learning_rate": 0.0004669397553894572, + "loss": 0.0071, + "num_input_tokens_seen": 150503424, + "step": 69755 + }, + { + "epoch": 11.380097879282218, + "grad_norm": 0.05844417214393616, + "learning_rate": 0.00046686873166867503, + "loss": 0.0576, + "num_input_tokens_seen": 150513984, + "step": 69760 + }, + { + "epoch": 11.380913539967374, + "grad_norm": 0.008587583899497986, + "learning_rate": 0.00046679770861934026, + "loss": 0.0302, + "num_input_tokens_seen": 150525024, + "step": 69765 + }, + { + "epoch": 11.38172920065253, + "grad_norm": 0.01990717649459839, + "learning_rate": 0.00046672668624289177, + "loss": 0.0059, + "num_input_tokens_seen": 150534720, + "step": 69770 + }, + { + "epoch": 11.382544861337683, + "grad_norm": 0.02939121052622795, + "learning_rate": 0.0004666556645407695, + "loss": 0.0214, + "num_input_tokens_seen": 150545856, + "step": 69775 + }, + { + "epoch": 11.383360522022839, + "grad_norm": 0.01129090879112482, + "learning_rate": 0.00046658464351441214, + "loss": 0.0091, + "num_input_tokens_seen": 150556672, + "step": 69780 + }, + { + "epoch": 11.384176182707993, + "grad_norm": 0.019934136420488358, + "learning_rate": 0.0004665136231652597, + "loss": 0.0084, + "num_input_tokens_seen": 150568608, + "step": 69785 + }, + { + "epoch": 11.384991843393149, + "grad_norm": 0.39556777477264404, + "learning_rate": 0.0004664426034947509, + "loss": 0.0068, + "num_input_tokens_seen": 150578432, + "step": 69790 + }, + { + "epoch": 11.385807504078304, + "grad_norm": 0.0016444023931398988, + "learning_rate": 0.00046637158450432557, + "loss": 0.0097, + "num_input_tokens_seen": 150589504, + "step": 69795 + }, + { + "epoch": 11.386623164763458, + "grad_norm": 0.004846824333071709, + "learning_rate": 0.0004663005661954225, + "loss": 0.0125, + "num_input_tokens_seen": 150599616, + "step": 69800 + }, + { + "epoch": 11.387438825448614, + "grad_norm": 0.1290767788887024, + "learning_rate": 0.0004662295485694812, + "loss": 0.0547, + "num_input_tokens_seen": 150610816, + "step": 69805 + }, + { + "epoch": 11.388254486133768, + "grad_norm": 0.05519620701670647, + "learning_rate": 0.00046615853162794115, + "loss": 0.0441, + "num_input_tokens_seen": 150622592, + "step": 69810 + }, + { + "epoch": 11.389070146818923, + "grad_norm": 0.008343004621565342, + "learning_rate": 0.00046608751537224115, + "loss": 0.1445, + "num_input_tokens_seen": 150633568, + "step": 69815 + }, + { + "epoch": 11.38988580750408, + "grad_norm": 0.005876525770872831, + "learning_rate": 0.0004660164998038209, + "loss": 0.0057, + "num_input_tokens_seen": 150644384, + "step": 69820 + }, + { + "epoch": 11.390701468189233, + "grad_norm": 0.0015857660910114646, + "learning_rate": 0.0004659454849241192, + "loss": 0.0416, + "num_input_tokens_seen": 150655232, + "step": 69825 + }, + { + "epoch": 11.391517128874389, + "grad_norm": 0.08757929503917694, + "learning_rate": 0.0004658744707345757, + "loss": 0.0081, + "num_input_tokens_seen": 150665760, + "step": 69830 + }, + { + "epoch": 11.392332789559543, + "grad_norm": 0.3490835428237915, + "learning_rate": 0.000465803457236629, + "loss": 0.0595, + "num_input_tokens_seen": 150675648, + "step": 69835 + }, + { + "epoch": 11.393148450244698, + "grad_norm": 0.0013626095606014132, + "learning_rate": 0.00046573244443171897, + "loss": 0.0051, + "num_input_tokens_seen": 150686784, + "step": 69840 + }, + { + "epoch": 11.393964110929852, + "grad_norm": 0.00834793969988823, + "learning_rate": 0.00046566143232128416, + "loss": 0.0034, + "num_input_tokens_seen": 150697952, + "step": 69845 + }, + { + "epoch": 11.394779771615008, + "grad_norm": 0.00966212060302496, + "learning_rate": 0.0004655904209067642, + "loss": 0.0686, + "num_input_tokens_seen": 150709696, + "step": 69850 + }, + { + "epoch": 11.395595432300164, + "grad_norm": 0.0006316542276181281, + "learning_rate": 0.0004655194101895978, + "loss": 0.0167, + "num_input_tokens_seen": 150719648, + "step": 69855 + }, + { + "epoch": 11.396411092985318, + "grad_norm": 0.00798444077372551, + "learning_rate": 0.00046544840017122437, + "loss": 0.0051, + "num_input_tokens_seen": 150730656, + "step": 69860 + }, + { + "epoch": 11.397226753670473, + "grad_norm": 0.1566763073205948, + "learning_rate": 0.000465377390853083, + "loss": 0.0486, + "num_input_tokens_seen": 150740896, + "step": 69865 + }, + { + "epoch": 11.398042414355627, + "grad_norm": 0.07881169766187668, + "learning_rate": 0.0004653063822366127, + "loss": 0.0072, + "num_input_tokens_seen": 150751776, + "step": 69870 + }, + { + "epoch": 11.398858075040783, + "grad_norm": 0.06643541157245636, + "learning_rate": 0.00046523537432325256, + "loss": 0.007, + "num_input_tokens_seen": 150762240, + "step": 69875 + }, + { + "epoch": 11.399673735725939, + "grad_norm": 0.43710964918136597, + "learning_rate": 0.00046516436711444166, + "loss": 0.2458, + "num_input_tokens_seen": 150773664, + "step": 69880 + }, + { + "epoch": 11.400489396411093, + "grad_norm": 0.01632930524647236, + "learning_rate": 0.000465093360611619, + "loss": 0.011, + "num_input_tokens_seen": 150784608, + "step": 69885 + }, + { + "epoch": 11.401305057096248, + "grad_norm": 0.142897829413414, + "learning_rate": 0.00046502235481622387, + "loss": 0.0427, + "num_input_tokens_seen": 150795584, + "step": 69890 + }, + { + "epoch": 11.402120717781402, + "grad_norm": 0.14096812903881073, + "learning_rate": 0.00046495134972969476, + "loss": 0.124, + "num_input_tokens_seen": 150806080, + "step": 69895 + }, + { + "epoch": 11.402936378466558, + "grad_norm": 0.005512685514986515, + "learning_rate": 0.00046488034535347133, + "loss": 0.024, + "num_input_tokens_seen": 150818016, + "step": 69900 + }, + { + "epoch": 11.403752039151712, + "grad_norm": 0.2379670888185501, + "learning_rate": 0.00046480934168899204, + "loss": 0.0256, + "num_input_tokens_seen": 150828352, + "step": 69905 + }, + { + "epoch": 11.404567699836868, + "grad_norm": 0.317105770111084, + "learning_rate": 0.0004647383387376961, + "loss": 0.147, + "num_input_tokens_seen": 150838816, + "step": 69910 + }, + { + "epoch": 11.405383360522023, + "grad_norm": 0.02887031063437462, + "learning_rate": 0.0004646673365010226, + "loss": 0.1135, + "num_input_tokens_seen": 150849632, + "step": 69915 + }, + { + "epoch": 11.406199021207177, + "grad_norm": 0.4235639274120331, + "learning_rate": 0.0004645963349804102, + "loss": 0.0555, + "num_input_tokens_seen": 150861408, + "step": 69920 + }, + { + "epoch": 11.407014681892333, + "grad_norm": 0.008289646357297897, + "learning_rate": 0.0004645253341772982, + "loss": 0.0121, + "num_input_tokens_seen": 150871456, + "step": 69925 + }, + { + "epoch": 11.407830342577487, + "grad_norm": 0.009953885339200497, + "learning_rate": 0.00046445433409312507, + "loss": 0.012, + "num_input_tokens_seen": 150882112, + "step": 69930 + }, + { + "epoch": 11.408646003262643, + "grad_norm": 0.00826044101268053, + "learning_rate": 0.00046438333472933015, + "loss": 0.067, + "num_input_tokens_seen": 150892576, + "step": 69935 + }, + { + "epoch": 11.409461663947798, + "grad_norm": 0.0064981719478964806, + "learning_rate": 0.0004643123360873519, + "loss": 0.0071, + "num_input_tokens_seen": 150901984, + "step": 69940 + }, + { + "epoch": 11.410277324632952, + "grad_norm": 0.004001571796834469, + "learning_rate": 0.00046424133816862966, + "loss": 0.0043, + "num_input_tokens_seen": 150912736, + "step": 69945 + }, + { + "epoch": 11.411092985318108, + "grad_norm": 0.008818493224680424, + "learning_rate": 0.00046417034097460193, + "loss": 0.0086, + "num_input_tokens_seen": 150922848, + "step": 69950 + }, + { + "epoch": 11.411908646003262, + "grad_norm": 0.023645471781492233, + "learning_rate": 0.0004640993445067078, + "loss": 0.0098, + "num_input_tokens_seen": 150933824, + "step": 69955 + }, + { + "epoch": 11.412724306688418, + "grad_norm": 0.10355032980442047, + "learning_rate": 0.00046402834876638584, + "loss": 0.0421, + "num_input_tokens_seen": 150945344, + "step": 69960 + }, + { + "epoch": 11.413539967373573, + "grad_norm": 0.011725643649697304, + "learning_rate": 0.00046395735375507523, + "loss": 0.0072, + "num_input_tokens_seen": 150956352, + "step": 69965 + }, + { + "epoch": 11.414355628058727, + "grad_norm": 0.23020459711551666, + "learning_rate": 0.0004638863594742144, + "loss": 0.0164, + "num_input_tokens_seen": 150968160, + "step": 69970 + }, + { + "epoch": 11.415171288743883, + "grad_norm": 0.24219931662082672, + "learning_rate": 0.00046381536592524244, + "loss": 0.0211, + "num_input_tokens_seen": 150979008, + "step": 69975 + }, + { + "epoch": 11.415986949429037, + "grad_norm": 0.011142288334667683, + "learning_rate": 0.00046374437310959783, + "loss": 0.0549, + "num_input_tokens_seen": 150989536, + "step": 69980 + }, + { + "epoch": 11.416802610114193, + "grad_norm": 0.06548699736595154, + "learning_rate": 0.0004636733810287197, + "loss": 0.0074, + "num_input_tokens_seen": 151000544, + "step": 69985 + }, + { + "epoch": 11.417618270799348, + "grad_norm": 0.0036305435933172703, + "learning_rate": 0.00046360238968404634, + "loss": 0.0455, + "num_input_tokens_seen": 151010944, + "step": 69990 + }, + { + "epoch": 11.418433931484502, + "grad_norm": 0.07094302028417587, + "learning_rate": 0.000463531399077017, + "loss": 0.0095, + "num_input_tokens_seen": 151020448, + "step": 69995 + }, + { + "epoch": 11.419249592169658, + "grad_norm": 0.01924165152013302, + "learning_rate": 0.00046346040920906985, + "loss": 0.0064, + "num_input_tokens_seen": 151031904, + "step": 70000 + }, + { + "epoch": 11.420065252854812, + "grad_norm": 0.14442592859268188, + "learning_rate": 0.000463389420081644, + "loss": 0.0793, + "num_input_tokens_seen": 151043904, + "step": 70005 + }, + { + "epoch": 11.420880913539968, + "grad_norm": 0.00031384656904265285, + "learning_rate": 0.000463318431696178, + "loss": 0.0118, + "num_input_tokens_seen": 151053888, + "step": 70010 + }, + { + "epoch": 11.421696574225122, + "grad_norm": 0.001238382188603282, + "learning_rate": 0.00046324744405411034, + "loss": 0.017, + "num_input_tokens_seen": 151064576, + "step": 70015 + }, + { + "epoch": 11.422512234910277, + "grad_norm": 0.11373946070671082, + "learning_rate": 0.00046317645715688015, + "loss": 0.0156, + "num_input_tokens_seen": 151075520, + "step": 70020 + }, + { + "epoch": 11.423327895595433, + "grad_norm": 0.5776710510253906, + "learning_rate": 0.00046310547100592557, + "loss": 0.0316, + "num_input_tokens_seen": 151085632, + "step": 70025 + }, + { + "epoch": 11.424143556280587, + "grad_norm": 0.45083749294281006, + "learning_rate": 0.0004630344856026855, + "loss": 0.0531, + "num_input_tokens_seen": 151096032, + "step": 70030 + }, + { + "epoch": 11.424959216965743, + "grad_norm": 0.008504417724907398, + "learning_rate": 0.0004629635009485984, + "loss": 0.0916, + "num_input_tokens_seen": 151106752, + "step": 70035 + }, + { + "epoch": 11.425774877650896, + "grad_norm": 0.002676644828170538, + "learning_rate": 0.000462892517045103, + "loss": 0.0752, + "num_input_tokens_seen": 151117408, + "step": 70040 + }, + { + "epoch": 11.426590538336052, + "grad_norm": 0.005246634595096111, + "learning_rate": 0.0004628215338936378, + "loss": 0.0037, + "num_input_tokens_seen": 151128704, + "step": 70045 + }, + { + "epoch": 11.427406199021208, + "grad_norm": 0.21213151514530182, + "learning_rate": 0.0004627505514956414, + "loss": 0.0124, + "num_input_tokens_seen": 151139392, + "step": 70050 + }, + { + "epoch": 11.428221859706362, + "grad_norm": 0.02078043669462204, + "learning_rate": 0.0004626795698525522, + "loss": 0.0157, + "num_input_tokens_seen": 151150400, + "step": 70055 + }, + { + "epoch": 11.429037520391518, + "grad_norm": 0.11079221963882446, + "learning_rate": 0.00046260858896580916, + "loss": 0.0149, + "num_input_tokens_seen": 151161504, + "step": 70060 + }, + { + "epoch": 11.429853181076671, + "grad_norm": 0.09051915258169174, + "learning_rate": 0.0004625376088368502, + "loss": 0.0122, + "num_input_tokens_seen": 151171872, + "step": 70065 + }, + { + "epoch": 11.430668841761827, + "grad_norm": 0.22650668025016785, + "learning_rate": 0.0004624666294671143, + "loss": 0.0094, + "num_input_tokens_seen": 151182880, + "step": 70070 + }, + { + "epoch": 11.431484502446983, + "grad_norm": 0.003754671197384596, + "learning_rate": 0.00046239565085803966, + "loss": 0.0244, + "num_input_tokens_seen": 151193696, + "step": 70075 + }, + { + "epoch": 11.432300163132137, + "grad_norm": 1.0134994983673096, + "learning_rate": 0.000462324673011065, + "loss": 0.0503, + "num_input_tokens_seen": 151203872, + "step": 70080 + }, + { + "epoch": 11.433115823817293, + "grad_norm": 0.0014540833653882146, + "learning_rate": 0.00046225369592762844, + "loss": 0.0035, + "num_input_tokens_seen": 151214720, + "step": 70085 + }, + { + "epoch": 11.433931484502446, + "grad_norm": 0.011376366019248962, + "learning_rate": 0.00046218271960916886, + "loss": 0.0017, + "num_input_tokens_seen": 151225664, + "step": 70090 + }, + { + "epoch": 11.434747145187602, + "grad_norm": 0.008373523131012917, + "learning_rate": 0.0004621117440571242, + "loss": 0.0134, + "num_input_tokens_seen": 151235744, + "step": 70095 + }, + { + "epoch": 11.435562805872756, + "grad_norm": 0.00508197071030736, + "learning_rate": 0.0004620407692729333, + "loss": 0.0029, + "num_input_tokens_seen": 151244544, + "step": 70100 + }, + { + "epoch": 11.436378466557912, + "grad_norm": 0.005126267671585083, + "learning_rate": 0.0004619697952580342, + "loss": 0.0064, + "num_input_tokens_seen": 151256000, + "step": 70105 + }, + { + "epoch": 11.437194127243067, + "grad_norm": 0.00634891539812088, + "learning_rate": 0.00046189882201386564, + "loss": 0.0057, + "num_input_tokens_seen": 151267616, + "step": 70110 + }, + { + "epoch": 11.438009787928221, + "grad_norm": 0.05916362628340721, + "learning_rate": 0.0004618278495418655, + "loss": 0.004, + "num_input_tokens_seen": 151277664, + "step": 70115 + }, + { + "epoch": 11.438825448613377, + "grad_norm": 0.004598218481987715, + "learning_rate": 0.0004617568778434725, + "loss": 0.0019, + "num_input_tokens_seen": 151287392, + "step": 70120 + }, + { + "epoch": 11.439641109298531, + "grad_norm": 0.006532749626785517, + "learning_rate": 0.0004616859069201251, + "loss": 0.003, + "num_input_tokens_seen": 151297696, + "step": 70125 + }, + { + "epoch": 11.440456769983687, + "grad_norm": 0.008686413988471031, + "learning_rate": 0.0004616149367732612, + "loss": 0.0109, + "num_input_tokens_seen": 151308992, + "step": 70130 + }, + { + "epoch": 11.441272430668842, + "grad_norm": 0.045013487339019775, + "learning_rate": 0.0004615439674043195, + "loss": 0.0061, + "num_input_tokens_seen": 151320608, + "step": 70135 + }, + { + "epoch": 11.442088091353996, + "grad_norm": 0.07258511334657669, + "learning_rate": 0.00046147299881473783, + "loss": 0.0132, + "num_input_tokens_seen": 151330336, + "step": 70140 + }, + { + "epoch": 11.442903752039152, + "grad_norm": 0.049100205302238464, + "learning_rate": 0.0004614020310059549, + "loss": 0.0078, + "num_input_tokens_seen": 151342848, + "step": 70145 + }, + { + "epoch": 11.443719412724306, + "grad_norm": 0.009062208235263824, + "learning_rate": 0.0004613310639794086, + "loss": 0.0078, + "num_input_tokens_seen": 151354272, + "step": 70150 + }, + { + "epoch": 11.444535073409462, + "grad_norm": 0.10276536643505096, + "learning_rate": 0.0004612600977365376, + "loss": 0.0088, + "num_input_tokens_seen": 151366080, + "step": 70155 + }, + { + "epoch": 11.445350734094617, + "grad_norm": 0.004438189789652824, + "learning_rate": 0.0004611891322787796, + "loss": 0.0041, + "num_input_tokens_seen": 151376032, + "step": 70160 + }, + { + "epoch": 11.446166394779771, + "grad_norm": 0.13078641891479492, + "learning_rate": 0.0004611181676075734, + "loss": 0.0188, + "num_input_tokens_seen": 151387648, + "step": 70165 + }, + { + "epoch": 11.446982055464927, + "grad_norm": 0.0005241918843239546, + "learning_rate": 0.00046104720372435647, + "loss": 0.0021, + "num_input_tokens_seen": 151397568, + "step": 70170 + }, + { + "epoch": 11.447797716150081, + "grad_norm": 0.5048108100891113, + "learning_rate": 0.0004609762406305676, + "loss": 0.0318, + "num_input_tokens_seen": 151407968, + "step": 70175 + }, + { + "epoch": 11.448613376835237, + "grad_norm": 0.00792924128472805, + "learning_rate": 0.0004609052783276447, + "loss": 0.0396, + "num_input_tokens_seen": 151418784, + "step": 70180 + }, + { + "epoch": 11.449429037520392, + "grad_norm": 0.3632190525531769, + "learning_rate": 0.0004608343168170259, + "loss": 0.0666, + "num_input_tokens_seen": 151428704, + "step": 70185 + }, + { + "epoch": 11.450244698205546, + "grad_norm": 0.0018684181850403547, + "learning_rate": 0.0004607633561001493, + "loss": 0.0012, + "num_input_tokens_seen": 151439168, + "step": 70190 + }, + { + "epoch": 11.451060358890702, + "grad_norm": 0.0004594696220010519, + "learning_rate": 0.0004606923961784532, + "loss": 0.0024, + "num_input_tokens_seen": 151450720, + "step": 70195 + }, + { + "epoch": 11.451876019575856, + "grad_norm": 0.005671604536473751, + "learning_rate": 0.00046062143705337535, + "loss": 0.013, + "num_input_tokens_seen": 151462208, + "step": 70200 + }, + { + "epoch": 11.452691680261012, + "grad_norm": 0.21807987987995148, + "learning_rate": 0.00046055047872635424, + "loss": 0.0859, + "num_input_tokens_seen": 151472800, + "step": 70205 + }, + { + "epoch": 11.453507340946166, + "grad_norm": 0.04038955643773079, + "learning_rate": 0.0004604795211988275, + "loss": 0.0032, + "num_input_tokens_seen": 151483968, + "step": 70210 + }, + { + "epoch": 11.454323001631321, + "grad_norm": 0.0005768550909124315, + "learning_rate": 0.00046040856447223375, + "loss": 0.0165, + "num_input_tokens_seen": 151494752, + "step": 70215 + }, + { + "epoch": 11.455138662316477, + "grad_norm": 0.01783035881817341, + "learning_rate": 0.00046033760854801033, + "loss": 0.0373, + "num_input_tokens_seen": 151505024, + "step": 70220 + }, + { + "epoch": 11.455954323001631, + "grad_norm": 0.00035412312718108296, + "learning_rate": 0.0004602666534275956, + "loss": 0.0188, + "num_input_tokens_seen": 151515968, + "step": 70225 + }, + { + "epoch": 11.456769983686787, + "grad_norm": 0.01331327948719263, + "learning_rate": 0.0004601956991124278, + "loss": 0.0071, + "num_input_tokens_seen": 151525472, + "step": 70230 + }, + { + "epoch": 11.45758564437194, + "grad_norm": 0.03204686567187309, + "learning_rate": 0.00046012474560394443, + "loss": 0.0404, + "num_input_tokens_seen": 151536288, + "step": 70235 + }, + { + "epoch": 11.458401305057096, + "grad_norm": 0.0021473001688718796, + "learning_rate": 0.00046005379290358386, + "loss": 0.0058, + "num_input_tokens_seen": 151545536, + "step": 70240 + }, + { + "epoch": 11.459216965742252, + "grad_norm": 0.0009514609701000154, + "learning_rate": 0.00045998284101278367, + "loss": 0.0036, + "num_input_tokens_seen": 151556640, + "step": 70245 + }, + { + "epoch": 11.460032626427406, + "grad_norm": 0.7574834823608398, + "learning_rate": 0.0004599118899329821, + "loss": 0.1295, + "num_input_tokens_seen": 151566464, + "step": 70250 + }, + { + "epoch": 11.460848287112562, + "grad_norm": 0.07028722018003464, + "learning_rate": 0.0004598409396656168, + "loss": 0.0341, + "num_input_tokens_seen": 151576384, + "step": 70255 + }, + { + "epoch": 11.461663947797716, + "grad_norm": 0.07774762809276581, + "learning_rate": 0.000459769990212126, + "loss": 0.0263, + "num_input_tokens_seen": 151587936, + "step": 70260 + }, + { + "epoch": 11.462479608482871, + "grad_norm": 0.21779689192771912, + "learning_rate": 0.0004596990415739472, + "loss": 0.0261, + "num_input_tokens_seen": 151599584, + "step": 70265 + }, + { + "epoch": 11.463295269168025, + "grad_norm": 0.1503334939479828, + "learning_rate": 0.0004596280937525186, + "loss": 0.0309, + "num_input_tokens_seen": 151609280, + "step": 70270 + }, + { + "epoch": 11.464110929853181, + "grad_norm": 0.002451465465128422, + "learning_rate": 0.00045955714674927775, + "loss": 0.0013, + "num_input_tokens_seen": 151618528, + "step": 70275 + }, + { + "epoch": 11.464926590538337, + "grad_norm": 0.06902746856212616, + "learning_rate": 0.0004594862005656628, + "loss": 0.0042, + "num_input_tokens_seen": 151628928, + "step": 70280 + }, + { + "epoch": 11.46574225122349, + "grad_norm": 0.4175633192062378, + "learning_rate": 0.00045941525520311116, + "loss": 0.0813, + "num_input_tokens_seen": 151639808, + "step": 70285 + }, + { + "epoch": 11.466557911908646, + "grad_norm": 0.0011378043564036489, + "learning_rate": 0.0004593443106630611, + "loss": 0.0046, + "num_input_tokens_seen": 151649952, + "step": 70290 + }, + { + "epoch": 11.4673735725938, + "grad_norm": 0.41905221343040466, + "learning_rate": 0.00045927336694695, + "loss": 0.0199, + "num_input_tokens_seen": 151660576, + "step": 70295 + }, + { + "epoch": 11.468189233278956, + "grad_norm": 0.0013913254952058196, + "learning_rate": 0.00045920242405621595, + "loss": 0.0128, + "num_input_tokens_seen": 151670272, + "step": 70300 + }, + { + "epoch": 11.469004893964112, + "grad_norm": 0.015381799079477787, + "learning_rate": 0.0004591314819922963, + "loss": 0.0025, + "num_input_tokens_seen": 151681088, + "step": 70305 + }, + { + "epoch": 11.469820554649266, + "grad_norm": 0.3198601305484772, + "learning_rate": 0.0004590605407566292, + "loss": 0.1744, + "num_input_tokens_seen": 151693312, + "step": 70310 + }, + { + "epoch": 11.470636215334421, + "grad_norm": 0.15981487929821014, + "learning_rate": 0.00045898960035065204, + "loss": 0.03, + "num_input_tokens_seen": 151704928, + "step": 70315 + }, + { + "epoch": 11.471451876019575, + "grad_norm": 0.1750972419977188, + "learning_rate": 0.00045891866077580267, + "loss": 0.0077, + "num_input_tokens_seen": 151716096, + "step": 70320 + }, + { + "epoch": 11.47226753670473, + "grad_norm": 0.14325596392154694, + "learning_rate": 0.0004588477220335188, + "loss": 0.0031, + "num_input_tokens_seen": 151725440, + "step": 70325 + }, + { + "epoch": 11.473083197389887, + "grad_norm": 0.056832488626241684, + "learning_rate": 0.000458776784125238, + "loss": 0.0117, + "num_input_tokens_seen": 151735360, + "step": 70330 + }, + { + "epoch": 11.47389885807504, + "grad_norm": 0.002802671631798148, + "learning_rate": 0.0004587058470523981, + "loss": 0.0853, + "num_input_tokens_seen": 151746368, + "step": 70335 + }, + { + "epoch": 11.474714518760196, + "grad_norm": 0.022697867825627327, + "learning_rate": 0.00045863491081643646, + "loss": 0.0056, + "num_input_tokens_seen": 151756928, + "step": 70340 + }, + { + "epoch": 11.47553017944535, + "grad_norm": 0.0034275418147444725, + "learning_rate": 0.00045856397541879087, + "loss": 0.0285, + "num_input_tokens_seen": 151767136, + "step": 70345 + }, + { + "epoch": 11.476345840130506, + "grad_norm": 0.04506643861532211, + "learning_rate": 0.0004584930408608989, + "loss": 0.0243, + "num_input_tokens_seen": 151777664, + "step": 70350 + }, + { + "epoch": 11.477161500815662, + "grad_norm": 0.0002641983737703413, + "learning_rate": 0.0004584221071441981, + "loss": 0.1012, + "num_input_tokens_seen": 151789376, + "step": 70355 + }, + { + "epoch": 11.477977161500815, + "grad_norm": 0.00804605707526207, + "learning_rate": 0.000458351174270126, + "loss": 0.1024, + "num_input_tokens_seen": 151800800, + "step": 70360 + }, + { + "epoch": 11.478792822185971, + "grad_norm": 0.19787639379501343, + "learning_rate": 0.00045828024224012025, + "loss": 0.0089, + "num_input_tokens_seen": 151811648, + "step": 70365 + }, + { + "epoch": 11.479608482871125, + "grad_norm": 0.09667205810546875, + "learning_rate": 0.00045820931105561817, + "loss": 0.0188, + "num_input_tokens_seen": 151822688, + "step": 70370 + }, + { + "epoch": 11.48042414355628, + "grad_norm": 0.0127464160323143, + "learning_rate": 0.0004581383807180577, + "loss": 0.0107, + "num_input_tokens_seen": 151832224, + "step": 70375 + }, + { + "epoch": 11.481239804241435, + "grad_norm": 0.0029441851656883955, + "learning_rate": 0.0004580674512288758, + "loss": 0.0639, + "num_input_tokens_seen": 151842304, + "step": 70380 + }, + { + "epoch": 11.48205546492659, + "grad_norm": 0.10395713150501251, + "learning_rate": 0.0004579965225895104, + "loss": 0.0075, + "num_input_tokens_seen": 151852576, + "step": 70385 + }, + { + "epoch": 11.482871125611746, + "grad_norm": 0.000521772657521069, + "learning_rate": 0.00045792559480139854, + "loss": 0.0542, + "num_input_tokens_seen": 151863328, + "step": 70390 + }, + { + "epoch": 11.4836867862969, + "grad_norm": 0.0492253415286541, + "learning_rate": 0.0004578546678659781, + "loss": 0.005, + "num_input_tokens_seen": 151873920, + "step": 70395 + }, + { + "epoch": 11.484502446982056, + "grad_norm": 0.0035921907983720303, + "learning_rate": 0.00045778374178468605, + "loss": 0.0119, + "num_input_tokens_seen": 151883552, + "step": 70400 + }, + { + "epoch": 11.48531810766721, + "grad_norm": 0.4873466193675995, + "learning_rate": 0.0004577128165589603, + "loss": 0.1165, + "num_input_tokens_seen": 151895136, + "step": 70405 + }, + { + "epoch": 11.486133768352365, + "grad_norm": 0.0797366127371788, + "learning_rate": 0.0004576418921902377, + "loss": 0.0082, + "num_input_tokens_seen": 151907296, + "step": 70410 + }, + { + "epoch": 11.486949429037521, + "grad_norm": 0.0030229513067752123, + "learning_rate": 0.0004575709686799561, + "loss": 0.0297, + "num_input_tokens_seen": 151918944, + "step": 70415 + }, + { + "epoch": 11.487765089722675, + "grad_norm": 0.012577119283378124, + "learning_rate": 0.00045750004602955246, + "loss": 0.0182, + "num_input_tokens_seen": 151929536, + "step": 70420 + }, + { + "epoch": 11.48858075040783, + "grad_norm": 0.1603432595729828, + "learning_rate": 0.0004574291242404645, + "loss": 0.0174, + "num_input_tokens_seen": 151940608, + "step": 70425 + }, + { + "epoch": 11.489396411092985, + "grad_norm": 0.010199863463640213, + "learning_rate": 0.00045735820331412914, + "loss": 0.2806, + "num_input_tokens_seen": 151951584, + "step": 70430 + }, + { + "epoch": 11.49021207177814, + "grad_norm": 0.0625927671790123, + "learning_rate": 0.0004572872832519839, + "loss": 0.0056, + "num_input_tokens_seen": 151962720, + "step": 70435 + }, + { + "epoch": 11.491027732463296, + "grad_norm": 0.02368241548538208, + "learning_rate": 0.0004572163640554662, + "loss": 0.0113, + "num_input_tokens_seen": 151972800, + "step": 70440 + }, + { + "epoch": 11.49184339314845, + "grad_norm": 0.048747409135103226, + "learning_rate": 0.00045714544572601296, + "loss": 0.0117, + "num_input_tokens_seen": 151983520, + "step": 70445 + }, + { + "epoch": 11.492659053833606, + "grad_norm": 0.07149891555309296, + "learning_rate": 0.0004570745282650619, + "loss": 0.0426, + "num_input_tokens_seen": 151994624, + "step": 70450 + }, + { + "epoch": 11.49347471451876, + "grad_norm": 0.11639310419559479, + "learning_rate": 0.00045700361167404967, + "loss": 0.2125, + "num_input_tokens_seen": 152004512, + "step": 70455 + }, + { + "epoch": 11.494290375203915, + "grad_norm": 0.021203190088272095, + "learning_rate": 0.0004569326959544141, + "loss": 0.0122, + "num_input_tokens_seen": 152014368, + "step": 70460 + }, + { + "epoch": 11.49510603588907, + "grad_norm": 0.013591976836323738, + "learning_rate": 0.00045686178110759183, + "loss": 0.0081, + "num_input_tokens_seen": 152025536, + "step": 70465 + }, + { + "epoch": 11.495921696574225, + "grad_norm": 0.00267104827798903, + "learning_rate": 0.0004567908671350206, + "loss": 0.0033, + "num_input_tokens_seen": 152036640, + "step": 70470 + }, + { + "epoch": 11.49673735725938, + "grad_norm": 0.12724779546260834, + "learning_rate": 0.00045671995403813686, + "loss": 0.0148, + "num_input_tokens_seen": 152048192, + "step": 70475 + }, + { + "epoch": 11.497553017944535, + "grad_norm": 0.008935310877859592, + "learning_rate": 0.0004566490418183785, + "loss": 0.015, + "num_input_tokens_seen": 152059808, + "step": 70480 + }, + { + "epoch": 11.49836867862969, + "grad_norm": 0.14394010603427887, + "learning_rate": 0.00045657813047718203, + "loss": 0.0082, + "num_input_tokens_seen": 152070816, + "step": 70485 + }, + { + "epoch": 11.499184339314844, + "grad_norm": 0.3587186932563782, + "learning_rate": 0.000456507220015985, + "loss": 0.0547, + "num_input_tokens_seen": 152082240, + "step": 70490 + }, + { + "epoch": 11.5, + "grad_norm": 0.00459715910255909, + "learning_rate": 0.00045643631043622426, + "loss": 0.0211, + "num_input_tokens_seen": 152093312, + "step": 70495 + }, + { + "epoch": 11.500815660685156, + "grad_norm": 0.2548139691352844, + "learning_rate": 0.00045636540173933697, + "loss": 0.0438, + "num_input_tokens_seen": 152103616, + "step": 70500 + }, + { + "epoch": 11.50163132137031, + "grad_norm": 0.0018603273201733828, + "learning_rate": 0.0004562944939267602, + "loss": 0.1384, + "num_input_tokens_seen": 152113440, + "step": 70505 + }, + { + "epoch": 11.502446982055465, + "grad_norm": 0.3552423119544983, + "learning_rate": 0.00045622358699993093, + "loss": 0.0805, + "num_input_tokens_seen": 152124736, + "step": 70510 + }, + { + "epoch": 11.50326264274062, + "grad_norm": 0.076400026679039, + "learning_rate": 0.00045615268096028613, + "loss": 0.0879, + "num_input_tokens_seen": 152136192, + "step": 70515 + }, + { + "epoch": 11.504078303425775, + "grad_norm": 0.20905813574790955, + "learning_rate": 0.0004560817758092631, + "loss": 0.0152, + "num_input_tokens_seen": 152147200, + "step": 70520 + }, + { + "epoch": 11.50489396411093, + "grad_norm": 0.004925478715449572, + "learning_rate": 0.00045601087154829834, + "loss": 0.0032, + "num_input_tokens_seen": 152158368, + "step": 70525 + }, + { + "epoch": 11.505709624796085, + "grad_norm": 0.0013580898521468043, + "learning_rate": 0.00045593996817882925, + "loss": 0.0057, + "num_input_tokens_seen": 152168352, + "step": 70530 + }, + { + "epoch": 11.50652528548124, + "grad_norm": 0.31950071454048157, + "learning_rate": 0.0004558690657022925, + "loss": 0.0262, + "num_input_tokens_seen": 152178016, + "step": 70535 + }, + { + "epoch": 11.507340946166394, + "grad_norm": 0.028924476355314255, + "learning_rate": 0.0004557981641201252, + "loss": 0.008, + "num_input_tokens_seen": 152188768, + "step": 70540 + }, + { + "epoch": 11.50815660685155, + "grad_norm": 0.2623055577278137, + "learning_rate": 0.000455727263433764, + "loss": 0.034, + "num_input_tokens_seen": 152199296, + "step": 70545 + }, + { + "epoch": 11.508972267536706, + "grad_norm": 0.13284637033939362, + "learning_rate": 0.000455656363644646, + "loss": 0.0184, + "num_input_tokens_seen": 152210624, + "step": 70550 + }, + { + "epoch": 11.50978792822186, + "grad_norm": 0.000787390279583633, + "learning_rate": 0.0004555854647542083, + "loss": 0.0138, + "num_input_tokens_seen": 152222208, + "step": 70555 + }, + { + "epoch": 11.510603588907015, + "grad_norm": 0.09359927475452423, + "learning_rate": 0.00045551456676388725, + "loss": 0.0355, + "num_input_tokens_seen": 152234080, + "step": 70560 + }, + { + "epoch": 11.51141924959217, + "grad_norm": 0.03847875818610191, + "learning_rate": 0.00045544366967512014, + "loss": 0.0173, + "num_input_tokens_seen": 152243840, + "step": 70565 + }, + { + "epoch": 11.512234910277325, + "grad_norm": 0.022007031366229057, + "learning_rate": 0.0004553727734893434, + "loss": 0.0063, + "num_input_tokens_seen": 152254752, + "step": 70570 + }, + { + "epoch": 11.513050570962479, + "grad_norm": 0.001327080768533051, + "learning_rate": 0.0004553018782079942, + "loss": 0.0076, + "num_input_tokens_seen": 152265856, + "step": 70575 + }, + { + "epoch": 11.513866231647635, + "grad_norm": 0.01626906543970108, + "learning_rate": 0.00045523098383250894, + "loss": 0.0033, + "num_input_tokens_seen": 152277408, + "step": 70580 + }, + { + "epoch": 11.51468189233279, + "grad_norm": 0.017068669199943542, + "learning_rate": 0.0004551600903643248, + "loss": 0.0982, + "num_input_tokens_seen": 152288416, + "step": 70585 + }, + { + "epoch": 11.515497553017944, + "grad_norm": 0.004806024022400379, + "learning_rate": 0.00045508919780487805, + "loss": 0.0036, + "num_input_tokens_seen": 152297312, + "step": 70590 + }, + { + "epoch": 11.5163132137031, + "grad_norm": 0.14878104627132416, + "learning_rate": 0.000455018306155606, + "loss": 0.0344, + "num_input_tokens_seen": 152308480, + "step": 70595 + }, + { + "epoch": 11.517128874388254, + "grad_norm": 0.004192311782389879, + "learning_rate": 0.0004549474154179447, + "loss": 0.0046, + "num_input_tokens_seen": 152319648, + "step": 70600 + }, + { + "epoch": 11.51794453507341, + "grad_norm": 0.020315716043114662, + "learning_rate": 0.0004548765255933315, + "loss": 0.0266, + "num_input_tokens_seen": 152329984, + "step": 70605 + }, + { + "epoch": 11.518760195758565, + "grad_norm": 0.35469716787338257, + "learning_rate": 0.00045480563668320244, + "loss": 0.1589, + "num_input_tokens_seen": 152340800, + "step": 70610 + }, + { + "epoch": 11.51957585644372, + "grad_norm": 0.001612176070921123, + "learning_rate": 0.0004547347486889948, + "loss": 0.0251, + "num_input_tokens_seen": 152351424, + "step": 70615 + }, + { + "epoch": 11.520391517128875, + "grad_norm": 0.33990755677223206, + "learning_rate": 0.00045466386161214465, + "loss": 0.0191, + "num_input_tokens_seen": 152360832, + "step": 70620 + }, + { + "epoch": 11.521207177814029, + "grad_norm": 0.004103005863726139, + "learning_rate": 0.00045459297545408906, + "loss": 0.0061, + "num_input_tokens_seen": 152370464, + "step": 70625 + }, + { + "epoch": 11.522022838499185, + "grad_norm": 0.005593008827418089, + "learning_rate": 0.0004545220902162642, + "loss": 0.0335, + "num_input_tokens_seen": 152381696, + "step": 70630 + }, + { + "epoch": 11.522838499184338, + "grad_norm": 0.0006372933858074248, + "learning_rate": 0.000454451205900107, + "loss": 0.0017, + "num_input_tokens_seen": 152391552, + "step": 70635 + }, + { + "epoch": 11.523654159869494, + "grad_norm": 0.015318629331886768, + "learning_rate": 0.00045438032250705394, + "loss": 0.0197, + "num_input_tokens_seen": 152403104, + "step": 70640 + }, + { + "epoch": 11.52446982055465, + "grad_norm": 0.043724432587623596, + "learning_rate": 0.00045430944003854143, + "loss": 0.0123, + "num_input_tokens_seen": 152414208, + "step": 70645 + }, + { + "epoch": 11.525285481239804, + "grad_norm": 0.02152017317712307, + "learning_rate": 0.00045423855849600615, + "loss": 0.0064, + "num_input_tokens_seen": 152425504, + "step": 70650 + }, + { + "epoch": 11.52610114192496, + "grad_norm": 0.14629553258419037, + "learning_rate": 0.00045416767788088435, + "loss": 0.047, + "num_input_tokens_seen": 152436128, + "step": 70655 + }, + { + "epoch": 11.526916802610113, + "grad_norm": 0.00357233127579093, + "learning_rate": 0.00045409679819461286, + "loss": 0.0021, + "num_input_tokens_seen": 152447872, + "step": 70660 + }, + { + "epoch": 11.52773246329527, + "grad_norm": 0.001525213709101081, + "learning_rate": 0.000454025919438628, + "loss": 0.0026, + "num_input_tokens_seen": 152458048, + "step": 70665 + }, + { + "epoch": 11.528548123980425, + "grad_norm": 0.014770734123885632, + "learning_rate": 0.00045395504161436617, + "loss": 0.0232, + "num_input_tokens_seen": 152469728, + "step": 70670 + }, + { + "epoch": 11.529363784665579, + "grad_norm": 0.16989704966545105, + "learning_rate": 0.0004538841647232639, + "loss": 0.0537, + "num_input_tokens_seen": 152480384, + "step": 70675 + }, + { + "epoch": 11.530179445350734, + "grad_norm": 0.005632862448692322, + "learning_rate": 0.0004538132887667574, + "loss": 0.1032, + "num_input_tokens_seen": 152490528, + "step": 70680 + }, + { + "epoch": 11.530995106035888, + "grad_norm": 0.004560346249490976, + "learning_rate": 0.0004537424137462832, + "loss": 0.0087, + "num_input_tokens_seen": 152501888, + "step": 70685 + }, + { + "epoch": 11.531810766721044, + "grad_norm": 0.02286740392446518, + "learning_rate": 0.0004536715396632779, + "loss": 0.0173, + "num_input_tokens_seen": 152513056, + "step": 70690 + }, + { + "epoch": 11.5326264274062, + "grad_norm": 0.3785374164581299, + "learning_rate": 0.00045360066651917733, + "loss": 0.0663, + "num_input_tokens_seen": 152523296, + "step": 70695 + }, + { + "epoch": 11.533442088091354, + "grad_norm": 0.007073297165334225, + "learning_rate": 0.00045352979431541833, + "loss": 0.0189, + "num_input_tokens_seen": 152534368, + "step": 70700 + }, + { + "epoch": 11.53425774877651, + "grad_norm": 0.11865311861038208, + "learning_rate": 0.0004534589230534368, + "loss": 0.0265, + "num_input_tokens_seen": 152546336, + "step": 70705 + }, + { + "epoch": 11.535073409461663, + "grad_norm": 0.020100371912121773, + "learning_rate": 0.00045338805273466954, + "loss": 0.0633, + "num_input_tokens_seen": 152557504, + "step": 70710 + }, + { + "epoch": 11.535889070146819, + "grad_norm": 0.0004467536346055567, + "learning_rate": 0.00045331718336055223, + "loss": 0.0054, + "num_input_tokens_seen": 152568480, + "step": 70715 + }, + { + "epoch": 11.536704730831975, + "grad_norm": 0.004148539155721664, + "learning_rate": 0.0004532463149325216, + "loss": 0.0121, + "num_input_tokens_seen": 152579392, + "step": 70720 + }, + { + "epoch": 11.537520391517129, + "grad_norm": 0.019890323281288147, + "learning_rate": 0.00045317544745201354, + "loss": 0.1715, + "num_input_tokens_seen": 152589856, + "step": 70725 + }, + { + "epoch": 11.538336052202284, + "grad_norm": 0.03215445205569267, + "learning_rate": 0.00045310458092046464, + "loss": 0.0131, + "num_input_tokens_seen": 152601280, + "step": 70730 + }, + { + "epoch": 11.539151712887438, + "grad_norm": 0.2837308645248413, + "learning_rate": 0.0004530337153393107, + "loss": 0.0259, + "num_input_tokens_seen": 152612320, + "step": 70735 + }, + { + "epoch": 11.539967373572594, + "grad_norm": 0.08689332753419876, + "learning_rate": 0.00045296285070998835, + "loss": 0.0899, + "num_input_tokens_seen": 152622592, + "step": 70740 + }, + { + "epoch": 11.540783034257748, + "grad_norm": 0.0049114651046693325, + "learning_rate": 0.0004528919870339332, + "loss": 0.0015, + "num_input_tokens_seen": 152632800, + "step": 70745 + }, + { + "epoch": 11.541598694942904, + "grad_norm": 0.005766826681792736, + "learning_rate": 0.00045282112431258194, + "loss": 0.0103, + "num_input_tokens_seen": 152644256, + "step": 70750 + }, + { + "epoch": 11.54241435562806, + "grad_norm": 0.0004343034524936229, + "learning_rate": 0.00045275026254737027, + "loss": 0.0013, + "num_input_tokens_seen": 152655552, + "step": 70755 + }, + { + "epoch": 11.543230016313213, + "grad_norm": 0.37126705050468445, + "learning_rate": 0.0004526794017397344, + "loss": 0.183, + "num_input_tokens_seen": 152666656, + "step": 70760 + }, + { + "epoch": 11.544045676998369, + "grad_norm": 0.010961020365357399, + "learning_rate": 0.0004526085418911108, + "loss": 0.0063, + "num_input_tokens_seen": 152678944, + "step": 70765 + }, + { + "epoch": 11.544861337683523, + "grad_norm": 0.02804548852145672, + "learning_rate": 0.0004525376830029349, + "loss": 0.021, + "num_input_tokens_seen": 152690336, + "step": 70770 + }, + { + "epoch": 11.545676998368679, + "grad_norm": 0.039194002747535706, + "learning_rate": 0.00045246682507664335, + "loss": 0.0063, + "num_input_tokens_seen": 152700736, + "step": 70775 + }, + { + "epoch": 11.546492659053834, + "grad_norm": 0.06108115613460541, + "learning_rate": 0.0004523959681136716, + "loss": 0.194, + "num_input_tokens_seen": 152711968, + "step": 70780 + }, + { + "epoch": 11.547308319738988, + "grad_norm": 0.7899608612060547, + "learning_rate": 0.00045232511211545625, + "loss": 0.0661, + "num_input_tokens_seen": 152722336, + "step": 70785 + }, + { + "epoch": 11.548123980424144, + "grad_norm": 0.002196177374571562, + "learning_rate": 0.0004522542570834327, + "loss": 0.0216, + "num_input_tokens_seen": 152733344, + "step": 70790 + }, + { + "epoch": 11.548939641109298, + "grad_norm": 0.002225445583462715, + "learning_rate": 0.0004521834030190375, + "loss": 0.0521, + "num_input_tokens_seen": 152743424, + "step": 70795 + }, + { + "epoch": 11.549755301794454, + "grad_norm": 0.14226034283638, + "learning_rate": 0.000452112549923706, + "loss": 0.0345, + "num_input_tokens_seen": 152754272, + "step": 70800 + }, + { + "epoch": 11.550570962479608, + "grad_norm": 0.009430866688489914, + "learning_rate": 0.00045204169779887454, + "loss": 0.0087, + "num_input_tokens_seen": 152765408, + "step": 70805 + }, + { + "epoch": 11.551386623164763, + "grad_norm": 0.001295320107601583, + "learning_rate": 0.0004519708466459789, + "loss": 0.0724, + "num_input_tokens_seen": 152776064, + "step": 70810 + }, + { + "epoch": 11.552202283849919, + "grad_norm": 0.0598505362868309, + "learning_rate": 0.0004518999964664551, + "loss": 0.0159, + "num_input_tokens_seen": 152787104, + "step": 70815 + }, + { + "epoch": 11.553017944535073, + "grad_norm": 0.3598770499229431, + "learning_rate": 0.0004518291472617387, + "loss": 0.1378, + "num_input_tokens_seen": 152798112, + "step": 70820 + }, + { + "epoch": 11.553833605220229, + "grad_norm": 0.06062731519341469, + "learning_rate": 0.00045175829903326594, + "loss": 0.0166, + "num_input_tokens_seen": 152809664, + "step": 70825 + }, + { + "epoch": 11.554649265905383, + "grad_norm": 0.13306595385074615, + "learning_rate": 0.0004516874517824722, + "loss": 0.0782, + "num_input_tokens_seen": 152820960, + "step": 70830 + }, + { + "epoch": 11.555464926590538, + "grad_norm": 0.04939431697130203, + "learning_rate": 0.0004516166055107938, + "loss": 0.1303, + "num_input_tokens_seen": 152832064, + "step": 70835 + }, + { + "epoch": 11.556280587275694, + "grad_norm": 0.0018639718182384968, + "learning_rate": 0.00045154576021966605, + "loss": 0.0649, + "num_input_tokens_seen": 152842496, + "step": 70840 + }, + { + "epoch": 11.557096247960848, + "grad_norm": 0.00571925425902009, + "learning_rate": 0.00045147491591052515, + "loss": 0.0092, + "num_input_tokens_seen": 152852608, + "step": 70845 + }, + { + "epoch": 11.557911908646004, + "grad_norm": 0.3984866142272949, + "learning_rate": 0.0004514040725848064, + "loss": 0.0632, + "num_input_tokens_seen": 152862560, + "step": 70850 + }, + { + "epoch": 11.558727569331158, + "grad_norm": 0.07977241277694702, + "learning_rate": 0.0004513332302439461, + "loss": 0.0437, + "num_input_tokens_seen": 152873216, + "step": 70855 + }, + { + "epoch": 11.559543230016313, + "grad_norm": 0.017012428492307663, + "learning_rate": 0.00045126238888937927, + "loss": 0.0093, + "num_input_tokens_seen": 152884096, + "step": 70860 + }, + { + "epoch": 11.560358890701469, + "grad_norm": 0.00410555861890316, + "learning_rate": 0.00045119154852254204, + "loss": 0.0467, + "num_input_tokens_seen": 152895168, + "step": 70865 + }, + { + "epoch": 11.561174551386623, + "grad_norm": 0.009494518861174583, + "learning_rate": 0.0004511207091448701, + "loss": 0.0397, + "num_input_tokens_seen": 152906528, + "step": 70870 + }, + { + "epoch": 11.561990212071779, + "grad_norm": 0.0072036078199744225, + "learning_rate": 0.0004510498707577989, + "loss": 0.0193, + "num_input_tokens_seen": 152917728, + "step": 70875 + }, + { + "epoch": 11.562805872756933, + "grad_norm": 0.41441667079925537, + "learning_rate": 0.0004509790333627644, + "loss": 0.241, + "num_input_tokens_seen": 152927392, + "step": 70880 + }, + { + "epoch": 11.563621533442088, + "grad_norm": 0.0007598976953886449, + "learning_rate": 0.00045090819696120166, + "loss": 0.0061, + "num_input_tokens_seen": 152938624, + "step": 70885 + }, + { + "epoch": 11.564437194127244, + "grad_norm": 0.0987086221575737, + "learning_rate": 0.0004508373615545469, + "loss": 0.1721, + "num_input_tokens_seen": 152949888, + "step": 70890 + }, + { + "epoch": 11.565252854812398, + "grad_norm": 0.13046546280384064, + "learning_rate": 0.00045076652714423507, + "loss": 0.0951, + "num_input_tokens_seen": 152961376, + "step": 70895 + }, + { + "epoch": 11.566068515497554, + "grad_norm": 0.0207928828895092, + "learning_rate": 0.00045069569373170227, + "loss": 0.0164, + "num_input_tokens_seen": 152972256, + "step": 70900 + }, + { + "epoch": 11.566884176182707, + "grad_norm": 0.11554178595542908, + "learning_rate": 0.0004506248613183836, + "loss": 0.0988, + "num_input_tokens_seen": 152982880, + "step": 70905 + }, + { + "epoch": 11.567699836867863, + "grad_norm": 0.0019060475751757622, + "learning_rate": 0.00045055402990571493, + "loss": 0.1248, + "num_input_tokens_seen": 152995296, + "step": 70910 + }, + { + "epoch": 11.568515497553017, + "grad_norm": 0.4915919303894043, + "learning_rate": 0.00045048319949513136, + "loss": 0.0477, + "num_input_tokens_seen": 153005728, + "step": 70915 + }, + { + "epoch": 11.569331158238173, + "grad_norm": 0.36298349499702454, + "learning_rate": 0.0004504123700880688, + "loss": 0.0694, + "num_input_tokens_seen": 153017312, + "step": 70920 + }, + { + "epoch": 11.570146818923329, + "grad_norm": 0.05356518179178238, + "learning_rate": 0.00045034154168596224, + "loss": 0.0051, + "num_input_tokens_seen": 153029472, + "step": 70925 + }, + { + "epoch": 11.570962479608482, + "grad_norm": 0.006642530672252178, + "learning_rate": 0.00045027071429024757, + "loss": 0.0114, + "num_input_tokens_seen": 153041344, + "step": 70930 + }, + { + "epoch": 11.571778140293638, + "grad_norm": 0.02406645007431507, + "learning_rate": 0.00045019988790235974, + "loss": 0.1075, + "num_input_tokens_seen": 153052352, + "step": 70935 + }, + { + "epoch": 11.572593800978792, + "grad_norm": 0.08967375010251999, + "learning_rate": 0.0004501290625237345, + "loss": 0.0224, + "num_input_tokens_seen": 153062592, + "step": 70940 + }, + { + "epoch": 11.573409461663948, + "grad_norm": 0.017464177682995796, + "learning_rate": 0.00045005823815580696, + "loss": 0.0163, + "num_input_tokens_seen": 153073408, + "step": 70945 + }, + { + "epoch": 11.574225122349104, + "grad_norm": 0.00919350329786539, + "learning_rate": 0.00044998741480001264, + "loss": 0.0833, + "num_input_tokens_seen": 153084416, + "step": 70950 + }, + { + "epoch": 11.575040783034257, + "grad_norm": 0.09908278286457062, + "learning_rate": 0.00044991659245778684, + "loss": 0.0145, + "num_input_tokens_seen": 153095680, + "step": 70955 + }, + { + "epoch": 11.575856443719413, + "grad_norm": 0.0023745258804410696, + "learning_rate": 0.00044984577113056477, + "loss": 0.0062, + "num_input_tokens_seen": 153106944, + "step": 70960 + }, + { + "epoch": 11.576672104404567, + "grad_norm": 0.6820746660232544, + "learning_rate": 0.0004497749508197818, + "loss": 0.1336, + "num_input_tokens_seen": 153117984, + "step": 70965 + }, + { + "epoch": 11.577487765089723, + "grad_norm": 0.0022637268994003534, + "learning_rate": 0.00044970413152687304, + "loss": 0.0108, + "num_input_tokens_seen": 153129440, + "step": 70970 + }, + { + "epoch": 11.578303425774878, + "grad_norm": 0.03817975893616676, + "learning_rate": 0.000449633313253274, + "loss": 0.0179, + "num_input_tokens_seen": 153140288, + "step": 70975 + }, + { + "epoch": 11.579119086460032, + "grad_norm": 0.005806601606309414, + "learning_rate": 0.00044956249600041975, + "loss": 0.0539, + "num_input_tokens_seen": 153150848, + "step": 70980 + }, + { + "epoch": 11.579934747145188, + "grad_norm": 0.02192111127078533, + "learning_rate": 0.00044949167976974553, + "loss": 0.0263, + "num_input_tokens_seen": 153160320, + "step": 70985 + }, + { + "epoch": 11.580750407830342, + "grad_norm": 0.04389500245451927, + "learning_rate": 0.00044942086456268643, + "loss": 0.0349, + "num_input_tokens_seen": 153171104, + "step": 70990 + }, + { + "epoch": 11.581566068515498, + "grad_norm": 0.007001729682087898, + "learning_rate": 0.0004493500503806777, + "loss": 0.0112, + "num_input_tokens_seen": 153180640, + "step": 70995 + }, + { + "epoch": 11.582381729200652, + "grad_norm": 0.29647624492645264, + "learning_rate": 0.0004492792372251544, + "loss": 0.0166, + "num_input_tokens_seen": 153190624, + "step": 71000 + }, + { + "epoch": 11.583197389885807, + "grad_norm": 0.4464181959629059, + "learning_rate": 0.00044920842509755187, + "loss": 0.089, + "num_input_tokens_seen": 153200864, + "step": 71005 + }, + { + "epoch": 11.584013050570963, + "grad_norm": 0.019816353917121887, + "learning_rate": 0.0004491376139993048, + "loss": 0.2698, + "num_input_tokens_seen": 153210688, + "step": 71010 + }, + { + "epoch": 11.584828711256117, + "grad_norm": 0.006263958755880594, + "learning_rate": 0.0004490668039318488, + "loss": 0.0202, + "num_input_tokens_seen": 153222816, + "step": 71015 + }, + { + "epoch": 11.585644371941273, + "grad_norm": 0.09810878336429596, + "learning_rate": 0.00044899599489661837, + "loss": 0.0105, + "num_input_tokens_seen": 153232832, + "step": 71020 + }, + { + "epoch": 11.586460032626427, + "grad_norm": 0.0021480226423591375, + "learning_rate": 0.000448925186895049, + "loss": 0.0037, + "num_input_tokens_seen": 153244160, + "step": 71025 + }, + { + "epoch": 11.587275693311582, + "grad_norm": 0.20656906068325043, + "learning_rate": 0.0004488543799285753, + "loss": 0.0513, + "num_input_tokens_seen": 153254816, + "step": 71030 + }, + { + "epoch": 11.588091353996738, + "grad_norm": 0.003440725849941373, + "learning_rate": 0.00044878357399863266, + "loss": 0.0546, + "num_input_tokens_seen": 153266688, + "step": 71035 + }, + { + "epoch": 11.588907014681892, + "grad_norm": 0.10117941349744797, + "learning_rate": 0.0004487127691066558, + "loss": 0.0358, + "num_input_tokens_seen": 153279040, + "step": 71040 + }, + { + "epoch": 11.589722675367048, + "grad_norm": 0.010887154377996922, + "learning_rate": 0.0004486419652540798, + "loss": 0.0048, + "num_input_tokens_seen": 153290240, + "step": 71045 + }, + { + "epoch": 11.590538336052202, + "grad_norm": 0.009515766985714436, + "learning_rate": 0.0004485711624423393, + "loss": 0.0753, + "num_input_tokens_seen": 153302080, + "step": 71050 + }, + { + "epoch": 11.591353996737357, + "grad_norm": 0.09484504163265228, + "learning_rate": 0.0004485003606728698, + "loss": 0.097, + "num_input_tokens_seen": 153313632, + "step": 71055 + }, + { + "epoch": 11.592169657422513, + "grad_norm": 0.007839200086891651, + "learning_rate": 0.0004484295599471054, + "loss": 0.0074, + "num_input_tokens_seen": 153324960, + "step": 71060 + }, + { + "epoch": 11.592985318107667, + "grad_norm": 0.0022512234281748533, + "learning_rate": 0.00044835876026648176, + "loss": 0.0477, + "num_input_tokens_seen": 153335552, + "step": 71065 + }, + { + "epoch": 11.593800978792823, + "grad_norm": 0.012531466782093048, + "learning_rate": 0.00044828796163243315, + "loss": 0.0237, + "num_input_tokens_seen": 153346304, + "step": 71070 + }, + { + "epoch": 11.594616639477977, + "grad_norm": 0.011579425074160099, + "learning_rate": 0.0004482171640463945, + "loss": 0.0038, + "num_input_tokens_seen": 153357472, + "step": 71075 + }, + { + "epoch": 11.595432300163132, + "grad_norm": 0.0036812543403357267, + "learning_rate": 0.000448146367509801, + "loss": 0.0553, + "num_input_tokens_seen": 153368800, + "step": 71080 + }, + { + "epoch": 11.596247960848288, + "grad_norm": 0.33469879627227783, + "learning_rate": 0.0004480755720240869, + "loss": 0.0543, + "num_input_tokens_seen": 153380800, + "step": 71085 + }, + { + "epoch": 11.597063621533442, + "grad_norm": 0.061560001224279404, + "learning_rate": 0.0004480047775906874, + "loss": 0.0221, + "num_input_tokens_seen": 153391968, + "step": 71090 + }, + { + "epoch": 11.597879282218598, + "grad_norm": 0.26723065972328186, + "learning_rate": 0.0004479339842110368, + "loss": 0.0343, + "num_input_tokens_seen": 153401920, + "step": 71095 + }, + { + "epoch": 11.598694942903752, + "grad_norm": 0.019828828051686287, + "learning_rate": 0.0004478631918865704, + "loss": 0.0449, + "num_input_tokens_seen": 153413408, + "step": 71100 + }, + { + "epoch": 11.599510603588907, + "grad_norm": 0.04922334849834442, + "learning_rate": 0.00044779240061872225, + "loss": 0.0118, + "num_input_tokens_seen": 153424224, + "step": 71105 + }, + { + "epoch": 11.600326264274061, + "grad_norm": 0.049233805388212204, + "learning_rate": 0.00044772161040892755, + "loss": 0.0111, + "num_input_tokens_seen": 153435392, + "step": 71110 + }, + { + "epoch": 11.601141924959217, + "grad_norm": 0.27412280440330505, + "learning_rate": 0.00044765082125862053, + "loss": 0.1369, + "num_input_tokens_seen": 153445792, + "step": 71115 + }, + { + "epoch": 11.601957585644373, + "grad_norm": 0.003101582173258066, + "learning_rate": 0.0004475800331692361, + "loss": 0.0256, + "num_input_tokens_seen": 153455936, + "step": 71120 + }, + { + "epoch": 11.602773246329527, + "grad_norm": 0.005423955153673887, + "learning_rate": 0.0004475092461422089, + "loss": 0.009, + "num_input_tokens_seen": 153466752, + "step": 71125 + }, + { + "epoch": 11.603588907014682, + "grad_norm": 0.0698070228099823, + "learning_rate": 0.0004474384601789733, + "loss": 0.0192, + "num_input_tokens_seen": 153477888, + "step": 71130 + }, + { + "epoch": 11.604404567699836, + "grad_norm": 0.006714033428579569, + "learning_rate": 0.00044736767528096407, + "loss": 0.0043, + "num_input_tokens_seen": 153489696, + "step": 71135 + }, + { + "epoch": 11.605220228384992, + "grad_norm": 0.29046157002449036, + "learning_rate": 0.0004472968914496156, + "loss": 0.0179, + "num_input_tokens_seen": 153500576, + "step": 71140 + }, + { + "epoch": 11.606035889070148, + "grad_norm": 0.3671259880065918, + "learning_rate": 0.00044722610868636243, + "loss": 0.0801, + "num_input_tokens_seen": 153511104, + "step": 71145 + }, + { + "epoch": 11.606851549755302, + "grad_norm": 0.15085823833942413, + "learning_rate": 0.00044715532699263926, + "loss": 0.0321, + "num_input_tokens_seen": 153521376, + "step": 71150 + }, + { + "epoch": 11.607667210440457, + "grad_norm": 0.1269478052854538, + "learning_rate": 0.00044708454636988026, + "loss": 0.1359, + "num_input_tokens_seen": 153531808, + "step": 71155 + }, + { + "epoch": 11.608482871125611, + "grad_norm": 0.017769847065210342, + "learning_rate": 0.00044701376681952033, + "loss": 0.0329, + "num_input_tokens_seen": 153542400, + "step": 71160 + }, + { + "epoch": 11.609298531810767, + "grad_norm": 0.03897063434123993, + "learning_rate": 0.00044694298834299336, + "loss": 0.0138, + "num_input_tokens_seen": 153553856, + "step": 71165 + }, + { + "epoch": 11.61011419249592, + "grad_norm": 0.049788784235715866, + "learning_rate": 0.00044687221094173425, + "loss": 0.0142, + "num_input_tokens_seen": 153564416, + "step": 71170 + }, + { + "epoch": 11.610929853181077, + "grad_norm": 0.0022448066156357527, + "learning_rate": 0.0004468014346171769, + "loss": 0.0046, + "num_input_tokens_seen": 153576352, + "step": 71175 + }, + { + "epoch": 11.611745513866232, + "grad_norm": 0.048350222408771515, + "learning_rate": 0.0004467306593707563, + "loss": 0.0094, + "num_input_tokens_seen": 153587232, + "step": 71180 + }, + { + "epoch": 11.612561174551386, + "grad_norm": 0.02660156786441803, + "learning_rate": 0.00044665988520390624, + "loss": 0.0069, + "num_input_tokens_seen": 153598304, + "step": 71185 + }, + { + "epoch": 11.613376835236542, + "grad_norm": 0.050273191183805466, + "learning_rate": 0.0004465891121180612, + "loss": 0.0089, + "num_input_tokens_seen": 153610304, + "step": 71190 + }, + { + "epoch": 11.614192495921696, + "grad_norm": 0.018890485167503357, + "learning_rate": 0.0004465183401146558, + "loss": 0.008, + "num_input_tokens_seen": 153621056, + "step": 71195 + }, + { + "epoch": 11.615008156606851, + "grad_norm": 0.2468203455209732, + "learning_rate": 0.00044644756919512386, + "loss": 0.123, + "num_input_tokens_seen": 153630592, + "step": 71200 + }, + { + "epoch": 11.615823817292007, + "grad_norm": 0.09083366394042969, + "learning_rate": 0.00044637679936090013, + "loss": 0.0797, + "num_input_tokens_seen": 153640768, + "step": 71205 + }, + { + "epoch": 11.616639477977161, + "grad_norm": 0.3844025433063507, + "learning_rate": 0.00044630603061341837, + "loss": 0.0432, + "num_input_tokens_seen": 153651808, + "step": 71210 + }, + { + "epoch": 11.617455138662317, + "grad_norm": 0.14164689183235168, + "learning_rate": 0.00044623526295411314, + "loss": 0.0108, + "num_input_tokens_seen": 153661856, + "step": 71215 + }, + { + "epoch": 11.61827079934747, + "grad_norm": 0.03903840482234955, + "learning_rate": 0.00044616449638441836, + "loss": 0.0204, + "num_input_tokens_seen": 153672896, + "step": 71220 + }, + { + "epoch": 11.619086460032626, + "grad_norm": 0.19465045630931854, + "learning_rate": 0.0004460937309057686, + "loss": 0.0117, + "num_input_tokens_seen": 153682112, + "step": 71225 + }, + { + "epoch": 11.619902120717782, + "grad_norm": 0.2865956127643585, + "learning_rate": 0.0004460229665195975, + "loss": 0.0115, + "num_input_tokens_seen": 153691776, + "step": 71230 + }, + { + "epoch": 11.620717781402936, + "grad_norm": 0.017243146896362305, + "learning_rate": 0.0004459522032273397, + "loss": 0.0239, + "num_input_tokens_seen": 153702048, + "step": 71235 + }, + { + "epoch": 11.621533442088092, + "grad_norm": 0.002111430512741208, + "learning_rate": 0.00044588144103042883, + "loss": 0.0601, + "num_input_tokens_seen": 153713184, + "step": 71240 + }, + { + "epoch": 11.622349102773246, + "grad_norm": 0.015014237724244595, + "learning_rate": 0.00044581067993029944, + "loss": 0.0213, + "num_input_tokens_seen": 153725088, + "step": 71245 + }, + { + "epoch": 11.623164763458401, + "grad_norm": 0.6647807955741882, + "learning_rate": 0.0004457399199283852, + "loss": 0.0346, + "num_input_tokens_seen": 153736608, + "step": 71250 + }, + { + "epoch": 11.623980424143557, + "grad_norm": 0.01447947695851326, + "learning_rate": 0.00044566916102612043, + "loss": 0.0065, + "num_input_tokens_seen": 153747616, + "step": 71255 + }, + { + "epoch": 11.624796084828711, + "grad_norm": 0.06897450238466263, + "learning_rate": 0.0004455984032249389, + "loss": 0.0555, + "num_input_tokens_seen": 153758464, + "step": 71260 + }, + { + "epoch": 11.625611745513867, + "grad_norm": 0.015616199001669884, + "learning_rate": 0.0004455276465262748, + "loss": 0.0426, + "num_input_tokens_seen": 153770176, + "step": 71265 + }, + { + "epoch": 11.62642740619902, + "grad_norm": 0.40125226974487305, + "learning_rate": 0.0004454568909315621, + "loss": 0.0892, + "num_input_tokens_seen": 153781152, + "step": 71270 + }, + { + "epoch": 11.627243066884176, + "grad_norm": 0.3075340986251831, + "learning_rate": 0.0004453861364422347, + "loss": 0.0673, + "num_input_tokens_seen": 153791264, + "step": 71275 + }, + { + "epoch": 11.62805872756933, + "grad_norm": 0.3425213098526001, + "learning_rate": 0.00044531538305972646, + "loss": 0.0309, + "num_input_tokens_seen": 153802784, + "step": 71280 + }, + { + "epoch": 11.628874388254486, + "grad_norm": 0.030073171481490135, + "learning_rate": 0.0004452446307854714, + "loss": 0.0043, + "num_input_tokens_seen": 153813088, + "step": 71285 + }, + { + "epoch": 11.629690048939642, + "grad_norm": 0.08780338615179062, + "learning_rate": 0.00044517387962090323, + "loss": 0.1244, + "num_input_tokens_seen": 153823744, + "step": 71290 + }, + { + "epoch": 11.630505709624796, + "grad_norm": 0.0017820200882852077, + "learning_rate": 0.00044510312956745607, + "loss": 0.0014, + "num_input_tokens_seen": 153834752, + "step": 71295 + }, + { + "epoch": 11.631321370309951, + "grad_norm": 0.03283948078751564, + "learning_rate": 0.00044503238062656357, + "loss": 0.0094, + "num_input_tokens_seen": 153845632, + "step": 71300 + }, + { + "epoch": 11.632137030995105, + "grad_norm": 0.006110351532697678, + "learning_rate": 0.0004449616327996597, + "loss": 0.0024, + "num_input_tokens_seen": 153856704, + "step": 71305 + }, + { + "epoch": 11.632952691680261, + "grad_norm": 0.8985926508903503, + "learning_rate": 0.0004448908860881781, + "loss": 0.0475, + "num_input_tokens_seen": 153867584, + "step": 71310 + }, + { + "epoch": 11.633768352365417, + "grad_norm": 0.008293491788208485, + "learning_rate": 0.0004448201404935525, + "loss": 0.0042, + "num_input_tokens_seen": 153878976, + "step": 71315 + }, + { + "epoch": 11.63458401305057, + "grad_norm": 0.007783666718751192, + "learning_rate": 0.00044474939601721705, + "loss": 0.0163, + "num_input_tokens_seen": 153890880, + "step": 71320 + }, + { + "epoch": 11.635399673735726, + "grad_norm": 0.0166110061109066, + "learning_rate": 0.00044467865266060487, + "loss": 0.0999, + "num_input_tokens_seen": 153902048, + "step": 71325 + }, + { + "epoch": 11.63621533442088, + "grad_norm": 0.053523894399404526, + "learning_rate": 0.0004446079104251503, + "loss": 0.0657, + "num_input_tokens_seen": 153913792, + "step": 71330 + }, + { + "epoch": 11.637030995106036, + "grad_norm": 0.022940553724765778, + "learning_rate": 0.0004445371693122863, + "loss": 0.029, + "num_input_tokens_seen": 153923744, + "step": 71335 + }, + { + "epoch": 11.63784665579119, + "grad_norm": 0.004669408779591322, + "learning_rate": 0.00044446642932344726, + "loss": 0.1102, + "num_input_tokens_seen": 153935552, + "step": 71340 + }, + { + "epoch": 11.638662316476346, + "grad_norm": 0.23680941760540009, + "learning_rate": 0.0004443956904600663, + "loss": 0.0439, + "num_input_tokens_seen": 153946272, + "step": 71345 + }, + { + "epoch": 11.639477977161501, + "grad_norm": 0.6156103014945984, + "learning_rate": 0.00044432495272357734, + "loss": 0.1083, + "num_input_tokens_seen": 153957056, + "step": 71350 + }, + { + "epoch": 11.640293637846655, + "grad_norm": 0.02600765787065029, + "learning_rate": 0.00044425421611541364, + "loss": 0.0343, + "num_input_tokens_seen": 153966400, + "step": 71355 + }, + { + "epoch": 11.641109298531811, + "grad_norm": 0.006610512267798185, + "learning_rate": 0.0004441834806370092, + "loss": 0.0065, + "num_input_tokens_seen": 153976928, + "step": 71360 + }, + { + "epoch": 11.641924959216965, + "grad_norm": 0.011130956932902336, + "learning_rate": 0.00044411274628979714, + "loss": 0.0557, + "num_input_tokens_seen": 153987840, + "step": 71365 + }, + { + "epoch": 11.64274061990212, + "grad_norm": 0.29936906695365906, + "learning_rate": 0.00044404201307521134, + "loss": 0.064, + "num_input_tokens_seen": 153998368, + "step": 71370 + }, + { + "epoch": 11.643556280587276, + "grad_norm": 0.024964090436697006, + "learning_rate": 0.00044397128099468497, + "loss": 0.0133, + "num_input_tokens_seen": 154008864, + "step": 71375 + }, + { + "epoch": 11.64437194127243, + "grad_norm": 0.011086949147284031, + "learning_rate": 0.0004439005500496519, + "loss": 0.0312, + "num_input_tokens_seen": 154019168, + "step": 71380 + }, + { + "epoch": 11.645187601957586, + "grad_norm": 0.011448938399553299, + "learning_rate": 0.00044382982024154506, + "loss": 0.1459, + "num_input_tokens_seen": 154031040, + "step": 71385 + }, + { + "epoch": 11.64600326264274, + "grad_norm": 0.3198551535606384, + "learning_rate": 0.0004437590915717984, + "loss": 0.0388, + "num_input_tokens_seen": 154041504, + "step": 71390 + }, + { + "epoch": 11.646818923327896, + "grad_norm": 0.234247088432312, + "learning_rate": 0.0004436883640418449, + "loss": 0.0963, + "num_input_tokens_seen": 154052448, + "step": 71395 + }, + { + "epoch": 11.647634584013051, + "grad_norm": 0.06153815612196922, + "learning_rate": 0.0004436176376531181, + "loss": 0.0093, + "num_input_tokens_seen": 154063840, + "step": 71400 + }, + { + "epoch": 11.648450244698205, + "grad_norm": 0.02396884188055992, + "learning_rate": 0.00044354691240705167, + "loss": 0.0176, + "num_input_tokens_seen": 154075360, + "step": 71405 + }, + { + "epoch": 11.649265905383361, + "grad_norm": 0.534089982509613, + "learning_rate": 0.00044347618830507845, + "loss": 0.0212, + "num_input_tokens_seen": 154086560, + "step": 71410 + }, + { + "epoch": 11.650081566068515, + "grad_norm": 0.008758745156228542, + "learning_rate": 0.00044340546534863226, + "loss": 0.0126, + "num_input_tokens_seen": 154097344, + "step": 71415 + }, + { + "epoch": 11.65089722675367, + "grad_norm": 0.01250399649143219, + "learning_rate": 0.00044333474353914576, + "loss": 0.0206, + "num_input_tokens_seen": 154108128, + "step": 71420 + }, + { + "epoch": 11.651712887438826, + "grad_norm": 0.19430381059646606, + "learning_rate": 0.0004432640228780529, + "loss": 0.0163, + "num_input_tokens_seen": 154119264, + "step": 71425 + }, + { + "epoch": 11.65252854812398, + "grad_norm": 0.502029299736023, + "learning_rate": 0.0004431933033667863, + "loss": 0.0322, + "num_input_tokens_seen": 154130304, + "step": 71430 + }, + { + "epoch": 11.653344208809136, + "grad_norm": 0.011702809482812881, + "learning_rate": 0.0004431225850067796, + "loss": 0.0155, + "num_input_tokens_seen": 154141664, + "step": 71435 + }, + { + "epoch": 11.65415986949429, + "grad_norm": 0.02398710325360298, + "learning_rate": 0.0004430518677994659, + "loss": 0.0522, + "num_input_tokens_seen": 154153536, + "step": 71440 + }, + { + "epoch": 11.654975530179446, + "grad_norm": 0.010546736419200897, + "learning_rate": 0.0004429811517462783, + "loss": 0.0906, + "num_input_tokens_seen": 154163936, + "step": 71445 + }, + { + "epoch": 11.655791190864601, + "grad_norm": 0.01737050525844097, + "learning_rate": 0.00044291043684865, + "loss": 0.0284, + "num_input_tokens_seen": 154175520, + "step": 71450 + }, + { + "epoch": 11.656606851549755, + "grad_norm": 0.019813016057014465, + "learning_rate": 0.0004428397231080141, + "loss": 0.0066, + "num_input_tokens_seen": 154186976, + "step": 71455 + }, + { + "epoch": 11.65742251223491, + "grad_norm": 0.02232261933386326, + "learning_rate": 0.0004427690105258037, + "loss": 0.0534, + "num_input_tokens_seen": 154198528, + "step": 71460 + }, + { + "epoch": 11.658238172920065, + "grad_norm": 0.011686442419886589, + "learning_rate": 0.00044269829910345207, + "loss": 0.0433, + "num_input_tokens_seen": 154209504, + "step": 71465 + }, + { + "epoch": 11.65905383360522, + "grad_norm": 0.0016976443585008383, + "learning_rate": 0.00044262758884239185, + "loss": 0.0249, + "num_input_tokens_seen": 154219232, + "step": 71470 + }, + { + "epoch": 11.659869494290374, + "grad_norm": 0.0070905680768191814, + "learning_rate": 0.00044255687974405656, + "loss": 0.0072, + "num_input_tokens_seen": 154230528, + "step": 71475 + }, + { + "epoch": 11.66068515497553, + "grad_norm": 0.004003368783742189, + "learning_rate": 0.0004424861718098788, + "loss": 0.0255, + "num_input_tokens_seen": 154241568, + "step": 71480 + }, + { + "epoch": 11.661500815660686, + "grad_norm": 0.1997532993555069, + "learning_rate": 0.00044241546504129186, + "loss": 0.0324, + "num_input_tokens_seen": 154249472, + "step": 71485 + }, + { + "epoch": 11.66231647634584, + "grad_norm": 0.0012733637122437358, + "learning_rate": 0.0004423447594397284, + "loss": 0.1198, + "num_input_tokens_seen": 154259872, + "step": 71490 + }, + { + "epoch": 11.663132137030995, + "grad_norm": 0.048443034291267395, + "learning_rate": 0.00044227405500662175, + "loss": 0.0075, + "num_input_tokens_seen": 154269792, + "step": 71495 + }, + { + "epoch": 11.66394779771615, + "grad_norm": 0.023019418120384216, + "learning_rate": 0.00044220335174340443, + "loss": 0.0495, + "num_input_tokens_seen": 154278336, + "step": 71500 + }, + { + "epoch": 11.664763458401305, + "grad_norm": 0.06869065016508102, + "learning_rate": 0.00044213264965150943, + "loss": 0.1111, + "num_input_tokens_seen": 154289248, + "step": 71505 + }, + { + "epoch": 11.66557911908646, + "grad_norm": 0.11201704293489456, + "learning_rate": 0.00044206194873237, + "loss": 0.0139, + "num_input_tokens_seen": 154300608, + "step": 71510 + }, + { + "epoch": 11.666394779771615, + "grad_norm": 0.006646198220551014, + "learning_rate": 0.00044199124898741844, + "loss": 0.0351, + "num_input_tokens_seen": 154310976, + "step": 71515 + }, + { + "epoch": 11.66721044045677, + "grad_norm": 0.038329172879457474, + "learning_rate": 0.000441920550418088, + "loss": 0.0224, + "num_input_tokens_seen": 154321760, + "step": 71520 + }, + { + "epoch": 11.668026101141924, + "grad_norm": 0.007524513173848391, + "learning_rate": 0.00044184985302581103, + "loss": 0.0197, + "num_input_tokens_seen": 154333216, + "step": 71525 + }, + { + "epoch": 11.66884176182708, + "grad_norm": 0.009614118374884129, + "learning_rate": 0.00044177915681202083, + "loss": 0.006, + "num_input_tokens_seen": 154344352, + "step": 71530 + }, + { + "epoch": 11.669657422512234, + "grad_norm": 0.009648015722632408, + "learning_rate": 0.00044170846177814965, + "loss": 0.0041, + "num_input_tokens_seen": 154353984, + "step": 71535 + }, + { + "epoch": 11.67047308319739, + "grad_norm": 0.005916877184063196, + "learning_rate": 0.0004416377679256307, + "loss": 0.0675, + "num_input_tokens_seen": 154364512, + "step": 71540 + }, + { + "epoch": 11.671288743882545, + "grad_norm": 0.0076830750331282616, + "learning_rate": 0.0004415670752558961, + "loss": 0.0235, + "num_input_tokens_seen": 154374368, + "step": 71545 + }, + { + "epoch": 11.6721044045677, + "grad_norm": 0.3812910318374634, + "learning_rate": 0.0004414963837703791, + "loss": 0.1109, + "num_input_tokens_seen": 154385728, + "step": 71550 + }, + { + "epoch": 11.672920065252855, + "grad_norm": 0.02752128429710865, + "learning_rate": 0.0004414256934705119, + "loss": 0.0076, + "num_input_tokens_seen": 154397504, + "step": 71555 + }, + { + "epoch": 11.673735725938009, + "grad_norm": 0.002272646641358733, + "learning_rate": 0.00044135500435772755, + "loss": 0.017, + "num_input_tokens_seen": 154408576, + "step": 71560 + }, + { + "epoch": 11.674551386623165, + "grad_norm": 0.009183168411254883, + "learning_rate": 0.0004412843164334582, + "loss": 0.0094, + "num_input_tokens_seen": 154419904, + "step": 71565 + }, + { + "epoch": 11.67536704730832, + "grad_norm": 0.6354069709777832, + "learning_rate": 0.00044121362969913683, + "loss": 0.0277, + "num_input_tokens_seen": 154431776, + "step": 71570 + }, + { + "epoch": 11.676182707993474, + "grad_norm": 0.005621030926704407, + "learning_rate": 0.00044114294415619577, + "loss": 0.0261, + "num_input_tokens_seen": 154443040, + "step": 71575 + }, + { + "epoch": 11.67699836867863, + "grad_norm": 0.07681886106729507, + "learning_rate": 0.00044107225980606765, + "loss": 0.0262, + "num_input_tokens_seen": 154453920, + "step": 71580 + }, + { + "epoch": 11.677814029363784, + "grad_norm": 0.23034776747226715, + "learning_rate": 0.0004410015766501849, + "loss": 0.0843, + "num_input_tokens_seen": 154465504, + "step": 71585 + }, + { + "epoch": 11.67862969004894, + "grad_norm": 0.0009162899805232882, + "learning_rate": 0.00044093089468998006, + "loss": 0.0009, + "num_input_tokens_seen": 154476096, + "step": 71590 + }, + { + "epoch": 11.679445350734095, + "grad_norm": 0.0009175725281238556, + "learning_rate": 0.0004408602139268856, + "loss": 0.0684, + "num_input_tokens_seen": 154487200, + "step": 71595 + }, + { + "epoch": 11.68026101141925, + "grad_norm": 0.001563364639878273, + "learning_rate": 0.00044078953436233387, + "loss": 0.0228, + "num_input_tokens_seen": 154498784, + "step": 71600 + }, + { + "epoch": 11.681076672104405, + "grad_norm": 0.13785617053508759, + "learning_rate": 0.0004407188559977573, + "loss": 0.0089, + "num_input_tokens_seen": 154508608, + "step": 71605 + }, + { + "epoch": 11.681892332789559, + "grad_norm": 0.0018103166949003935, + "learning_rate": 0.00044064817883458833, + "loss": 0.0119, + "num_input_tokens_seen": 154519488, + "step": 71610 + }, + { + "epoch": 11.682707993474715, + "grad_norm": 0.009680917486548424, + "learning_rate": 0.0004405775028742594, + "loss": 0.0072, + "num_input_tokens_seen": 154530272, + "step": 71615 + }, + { + "epoch": 11.68352365415987, + "grad_norm": 0.13455532491207123, + "learning_rate": 0.00044050682811820277, + "loss": 0.02, + "num_input_tokens_seen": 154541344, + "step": 71620 + }, + { + "epoch": 11.684339314845024, + "grad_norm": 0.00945951510220766, + "learning_rate": 0.00044043615456785065, + "loss": 0.0098, + "num_input_tokens_seen": 154552000, + "step": 71625 + }, + { + "epoch": 11.68515497553018, + "grad_norm": 0.350904643535614, + "learning_rate": 0.00044036548222463535, + "loss": 0.1015, + "num_input_tokens_seen": 154562880, + "step": 71630 + }, + { + "epoch": 11.685970636215334, + "grad_norm": 0.007846559397876263, + "learning_rate": 0.0004402948110899894, + "loss": 0.0026, + "num_input_tokens_seen": 154573760, + "step": 71635 + }, + { + "epoch": 11.68678629690049, + "grad_norm": 0.026957768946886063, + "learning_rate": 0.0004402241411653447, + "loss": 0.0047, + "num_input_tokens_seen": 154584832, + "step": 71640 + }, + { + "epoch": 11.687601957585644, + "grad_norm": 0.011540766805410385, + "learning_rate": 0.00044015347245213377, + "loss": 0.0231, + "num_input_tokens_seen": 154594816, + "step": 71645 + }, + { + "epoch": 11.6884176182708, + "grad_norm": 0.09205904603004456, + "learning_rate": 0.00044008280495178844, + "loss": 0.0128, + "num_input_tokens_seen": 154605216, + "step": 71650 + }, + { + "epoch": 11.689233278955955, + "grad_norm": 0.026317743584513664, + "learning_rate": 0.0004400121386657413, + "loss": 0.006, + "num_input_tokens_seen": 154616736, + "step": 71655 + }, + { + "epoch": 11.690048939641109, + "grad_norm": 0.04552512243390083, + "learning_rate": 0.000439941473595424, + "loss": 0.0107, + "num_input_tokens_seen": 154627744, + "step": 71660 + }, + { + "epoch": 11.690864600326265, + "grad_norm": 0.1411546915769577, + "learning_rate": 0.00043987080974226925, + "loss": 0.0983, + "num_input_tokens_seen": 154638752, + "step": 71665 + }, + { + "epoch": 11.691680261011419, + "grad_norm": 0.015095498412847519, + "learning_rate": 0.00043980014710770857, + "loss": 0.0094, + "num_input_tokens_seen": 154649504, + "step": 71670 + }, + { + "epoch": 11.692495921696574, + "grad_norm": 0.0038119046948850155, + "learning_rate": 0.00043972948569317446, + "loss": 0.007, + "num_input_tokens_seen": 154659808, + "step": 71675 + }, + { + "epoch": 11.69331158238173, + "grad_norm": 0.13447749614715576, + "learning_rate": 0.00043965882550009856, + "loss": 0.0108, + "num_input_tokens_seen": 154670336, + "step": 71680 + }, + { + "epoch": 11.694127243066884, + "grad_norm": 0.004058754537254572, + "learning_rate": 0.0004395881665299134, + "loss": 0.0646, + "num_input_tokens_seen": 154680576, + "step": 71685 + }, + { + "epoch": 11.69494290375204, + "grad_norm": 0.001501035294495523, + "learning_rate": 0.0004395175087840503, + "loss": 0.0423, + "num_input_tokens_seen": 154692128, + "step": 71690 + }, + { + "epoch": 11.695758564437194, + "grad_norm": 0.22174081206321716, + "learning_rate": 0.000439446852263942, + "loss": 0.1038, + "num_input_tokens_seen": 154703424, + "step": 71695 + }, + { + "epoch": 11.69657422512235, + "grad_norm": 0.003377341665327549, + "learning_rate": 0.00043937619697101974, + "loss": 0.0105, + "num_input_tokens_seen": 154713056, + "step": 71700 + }, + { + "epoch": 11.697389885807503, + "grad_norm": 0.051962170749902725, + "learning_rate": 0.00043930554290671597, + "loss": 0.0127, + "num_input_tokens_seen": 154723872, + "step": 71705 + }, + { + "epoch": 11.698205546492659, + "grad_norm": 0.000500374473631382, + "learning_rate": 0.0004392348900724622, + "loss": 0.0195, + "num_input_tokens_seen": 154734304, + "step": 71710 + }, + { + "epoch": 11.699021207177815, + "grad_norm": 0.00925895944237709, + "learning_rate": 0.00043916423846969047, + "loss": 0.0035, + "num_input_tokens_seen": 154745888, + "step": 71715 + }, + { + "epoch": 11.699836867862969, + "grad_norm": 0.3275412917137146, + "learning_rate": 0.0004390935880998329, + "loss": 0.0372, + "num_input_tokens_seen": 154756544, + "step": 71720 + }, + { + "epoch": 11.700652528548124, + "grad_norm": 0.015386094339191914, + "learning_rate": 0.00043902293896432064, + "loss": 0.0052, + "num_input_tokens_seen": 154768096, + "step": 71725 + }, + { + "epoch": 11.701468189233278, + "grad_norm": 0.0067831650376319885, + "learning_rate": 0.0004389522910645862, + "loss": 0.008, + "num_input_tokens_seen": 154779296, + "step": 71730 + }, + { + "epoch": 11.702283849918434, + "grad_norm": 0.0016234016511589289, + "learning_rate": 0.00043888164440206086, + "loss": 0.0038, + "num_input_tokens_seen": 154789632, + "step": 71735 + }, + { + "epoch": 11.70309951060359, + "grad_norm": 0.1683165729045868, + "learning_rate": 0.0004388109989781766, + "loss": 0.0171, + "num_input_tokens_seen": 154800576, + "step": 71740 + }, + { + "epoch": 11.703915171288743, + "grad_norm": 0.004572753794491291, + "learning_rate": 0.000438740354794365, + "loss": 0.0884, + "num_input_tokens_seen": 154811328, + "step": 71745 + }, + { + "epoch": 11.7047308319739, + "grad_norm": 0.431792289018631, + "learning_rate": 0.0004386697118520579, + "loss": 0.0756, + "num_input_tokens_seen": 154822208, + "step": 71750 + }, + { + "epoch": 11.705546492659053, + "grad_norm": 0.39914411306381226, + "learning_rate": 0.00043859907015268685, + "loss": 0.04, + "num_input_tokens_seen": 154833088, + "step": 71755 + }, + { + "epoch": 11.706362153344209, + "grad_norm": 0.4269408881664276, + "learning_rate": 0.00043852842969768356, + "loss": 0.0949, + "num_input_tokens_seen": 154844288, + "step": 71760 + }, + { + "epoch": 11.707177814029365, + "grad_norm": 0.001893799751996994, + "learning_rate": 0.0004384577904884795, + "loss": 0.0079, + "num_input_tokens_seen": 154855552, + "step": 71765 + }, + { + "epoch": 11.707993474714518, + "grad_norm": 0.0007316174451261759, + "learning_rate": 0.0004383871525265066, + "loss": 0.0038, + "num_input_tokens_seen": 154866656, + "step": 71770 + }, + { + "epoch": 11.708809135399674, + "grad_norm": 0.0035406064707785845, + "learning_rate": 0.00043831651581319604, + "loss": 0.0905, + "num_input_tokens_seen": 154877600, + "step": 71775 + }, + { + "epoch": 11.709624796084828, + "grad_norm": 0.0010385302593931556, + "learning_rate": 0.00043824588034997974, + "loss": 0.1633, + "num_input_tokens_seen": 154889312, + "step": 71780 + }, + { + "epoch": 11.710440456769984, + "grad_norm": 0.4396411180496216, + "learning_rate": 0.0004381752461382888, + "loss": 0.0977, + "num_input_tokens_seen": 154900736, + "step": 71785 + }, + { + "epoch": 11.71125611745514, + "grad_norm": 0.002041950821876526, + "learning_rate": 0.0004381046131795551, + "loss": 0.036, + "num_input_tokens_seen": 154911552, + "step": 71790 + }, + { + "epoch": 11.712071778140293, + "grad_norm": 0.34035012125968933, + "learning_rate": 0.0004380339814752098, + "loss": 0.0368, + "num_input_tokens_seen": 154922496, + "step": 71795 + }, + { + "epoch": 11.71288743882545, + "grad_norm": 0.006053532939404249, + "learning_rate": 0.0004379633510266846, + "loss": 0.0467, + "num_input_tokens_seen": 154933504, + "step": 71800 + }, + { + "epoch": 11.713703099510603, + "grad_norm": 0.006358067970722914, + "learning_rate": 0.0004378927218354106, + "loss": 0.0029, + "num_input_tokens_seen": 154944256, + "step": 71805 + }, + { + "epoch": 11.714518760195759, + "grad_norm": 0.03646931052207947, + "learning_rate": 0.00043782209390281964, + "loss": 0.0131, + "num_input_tokens_seen": 154953824, + "step": 71810 + }, + { + "epoch": 11.715334420880914, + "grad_norm": 0.008104242384433746, + "learning_rate": 0.00043775146723034253, + "loss": 0.0048, + "num_input_tokens_seen": 154964192, + "step": 71815 + }, + { + "epoch": 11.716150081566068, + "grad_norm": 0.1621163934469223, + "learning_rate": 0.00043768084181941097, + "loss": 0.0146, + "num_input_tokens_seen": 154975456, + "step": 71820 + }, + { + "epoch": 11.716965742251224, + "grad_norm": 0.350459486246109, + "learning_rate": 0.00043761021767145644, + "loss": 0.1368, + "num_input_tokens_seen": 154986304, + "step": 71825 + }, + { + "epoch": 11.717781402936378, + "grad_norm": 0.07811852544546127, + "learning_rate": 0.0004375395947879097, + "loss": 0.0087, + "num_input_tokens_seen": 154997024, + "step": 71830 + }, + { + "epoch": 11.718597063621534, + "grad_norm": 0.055032651871442795, + "learning_rate": 0.0004374689731702026, + "loss": 0.0052, + "num_input_tokens_seen": 155009312, + "step": 71835 + }, + { + "epoch": 11.719412724306688, + "grad_norm": 0.007189947180449963, + "learning_rate": 0.0004373983528197659, + "loss": 0.0929, + "num_input_tokens_seen": 155019008, + "step": 71840 + }, + { + "epoch": 11.720228384991843, + "grad_norm": 0.0014529898762702942, + "learning_rate": 0.0004373277337380311, + "loss": 0.005, + "num_input_tokens_seen": 155029824, + "step": 71845 + }, + { + "epoch": 11.721044045676999, + "grad_norm": 0.055281832814216614, + "learning_rate": 0.00043725711592642913, + "loss": 0.0412, + "num_input_tokens_seen": 155039872, + "step": 71850 + }, + { + "epoch": 11.721859706362153, + "grad_norm": 0.02894619107246399, + "learning_rate": 0.0004371864993863915, + "loss": 0.0131, + "num_input_tokens_seen": 155050208, + "step": 71855 + }, + { + "epoch": 11.722675367047309, + "grad_norm": 0.0021701159421354532, + "learning_rate": 0.00043711588411934893, + "loss": 0.0271, + "num_input_tokens_seen": 155060288, + "step": 71860 + }, + { + "epoch": 11.723491027732463, + "grad_norm": 0.048918455839157104, + "learning_rate": 0.00043704527012673294, + "loss": 0.0096, + "num_input_tokens_seen": 155071552, + "step": 71865 + }, + { + "epoch": 11.724306688417618, + "grad_norm": 0.4136924147605896, + "learning_rate": 0.00043697465740997424, + "loss": 0.1261, + "num_input_tokens_seen": 155083104, + "step": 71870 + }, + { + "epoch": 11.725122349102774, + "grad_norm": 0.43723994493484497, + "learning_rate": 0.00043690404597050426, + "loss": 0.0687, + "num_input_tokens_seen": 155093824, + "step": 71875 + }, + { + "epoch": 11.725938009787928, + "grad_norm": 0.5039966702461243, + "learning_rate": 0.0004368334358097536, + "loss": 0.0329, + "num_input_tokens_seen": 155105728, + "step": 71880 + }, + { + "epoch": 11.726753670473084, + "grad_norm": 0.09117481112480164, + "learning_rate": 0.00043676282692915367, + "loss": 0.0444, + "num_input_tokens_seen": 155116352, + "step": 71885 + }, + { + "epoch": 11.727569331158238, + "grad_norm": 1.2652981281280518, + "learning_rate": 0.0004366922193301352, + "loss": 0.0348, + "num_input_tokens_seen": 155126528, + "step": 71890 + }, + { + "epoch": 11.728384991843393, + "grad_norm": 0.010301213711500168, + "learning_rate": 0.00043662161301412925, + "loss": 0.079, + "num_input_tokens_seen": 155137664, + "step": 71895 + }, + { + "epoch": 11.729200652528547, + "grad_norm": 0.06438102573156357, + "learning_rate": 0.0004365510079825667, + "loss": 0.0581, + "num_input_tokens_seen": 155148128, + "step": 71900 + }, + { + "epoch": 11.730016313213703, + "grad_norm": 0.5005514025688171, + "learning_rate": 0.00043648040423687845, + "loss": 0.0598, + "num_input_tokens_seen": 155159648, + "step": 71905 + }, + { + "epoch": 11.730831973898859, + "grad_norm": 0.04584396630525589, + "learning_rate": 0.00043640980177849534, + "loss": 0.0192, + "num_input_tokens_seen": 155171872, + "step": 71910 + }, + { + "epoch": 11.731647634584013, + "grad_norm": 0.017204057425260544, + "learning_rate": 0.00043633920060884843, + "loss": 0.0107, + "num_input_tokens_seen": 155183104, + "step": 71915 + }, + { + "epoch": 11.732463295269168, + "grad_norm": 0.011246215552091599, + "learning_rate": 0.0004362686007293681, + "loss": 0.0083, + "num_input_tokens_seen": 155194752, + "step": 71920 + }, + { + "epoch": 11.733278955954322, + "grad_norm": 0.009087474085390568, + "learning_rate": 0.0004361980021414858, + "loss": 0.0428, + "num_input_tokens_seen": 155206336, + "step": 71925 + }, + { + "epoch": 11.734094616639478, + "grad_norm": 0.08767760545015335, + "learning_rate": 0.00043612740484663155, + "loss": 0.018, + "num_input_tokens_seen": 155217440, + "step": 71930 + }, + { + "epoch": 11.734910277324634, + "grad_norm": 0.01461027842015028, + "learning_rate": 0.00043605680884623656, + "loss": 0.0272, + "num_input_tokens_seen": 155229056, + "step": 71935 + }, + { + "epoch": 11.735725938009788, + "grad_norm": 0.03229047358036041, + "learning_rate": 0.00043598621414173166, + "loss": 0.0092, + "num_input_tokens_seen": 155240672, + "step": 71940 + }, + { + "epoch": 11.736541598694943, + "grad_norm": 0.042526725679636, + "learning_rate": 0.0004359156207345471, + "loss": 0.2215, + "num_input_tokens_seen": 155250464, + "step": 71945 + }, + { + "epoch": 11.737357259380097, + "grad_norm": 0.0021354216150939465, + "learning_rate": 0.00043584502862611404, + "loss": 0.084, + "num_input_tokens_seen": 155260384, + "step": 71950 + }, + { + "epoch": 11.738172920065253, + "grad_norm": 0.067484550178051, + "learning_rate": 0.00043577443781786263, + "loss": 0.0784, + "num_input_tokens_seen": 155270336, + "step": 71955 + }, + { + "epoch": 11.738988580750409, + "grad_norm": 0.8661009073257446, + "learning_rate": 0.0004357038483112239, + "loss": 0.0512, + "num_input_tokens_seen": 155281312, + "step": 71960 + }, + { + "epoch": 11.739804241435563, + "grad_norm": 0.011847744695842266, + "learning_rate": 0.00043563326010762803, + "loss": 0.0406, + "num_input_tokens_seen": 155293312, + "step": 71965 + }, + { + "epoch": 11.740619902120718, + "grad_norm": 0.004840127192437649, + "learning_rate": 0.00043556267320850605, + "loss": 0.064, + "num_input_tokens_seen": 155305184, + "step": 71970 + }, + { + "epoch": 11.741435562805872, + "grad_norm": 0.01581459678709507, + "learning_rate": 0.000435492087615288, + "loss": 0.0042, + "num_input_tokens_seen": 155314560, + "step": 71975 + }, + { + "epoch": 11.742251223491028, + "grad_norm": 0.0029817908070981503, + "learning_rate": 0.00043542150332940487, + "loss": 0.0054, + "num_input_tokens_seen": 155324832, + "step": 71980 + }, + { + "epoch": 11.743066884176184, + "grad_norm": 0.010415696538984776, + "learning_rate": 0.00043535092035228666, + "loss": 0.0026, + "num_input_tokens_seen": 155334720, + "step": 71985 + }, + { + "epoch": 11.743882544861338, + "grad_norm": 0.039635252207517624, + "learning_rate": 0.00043528033868536433, + "loss": 0.0172, + "num_input_tokens_seen": 155345280, + "step": 71990 + }, + { + "epoch": 11.744698205546493, + "grad_norm": 0.024941336363554, + "learning_rate": 0.0004352097583300678, + "loss": 0.0252, + "num_input_tokens_seen": 155357184, + "step": 71995 + }, + { + "epoch": 11.745513866231647, + "grad_norm": 0.0050639682449400425, + "learning_rate": 0.0004351391792878279, + "loss": 0.1375, + "num_input_tokens_seen": 155368352, + "step": 72000 + }, + { + "epoch": 11.746329526916803, + "grad_norm": 0.3198803663253784, + "learning_rate": 0.00043506860156007453, + "loss": 0.1385, + "num_input_tokens_seen": 155378528, + "step": 72005 + }, + { + "epoch": 11.747145187601957, + "grad_norm": 0.013552944175899029, + "learning_rate": 0.00043499802514823866, + "loss": 0.0083, + "num_input_tokens_seen": 155389408, + "step": 72010 + }, + { + "epoch": 11.747960848287113, + "grad_norm": 0.11822723597288132, + "learning_rate": 0.00043492745005375, + "loss": 0.0768, + "num_input_tokens_seen": 155400864, + "step": 72015 + }, + { + "epoch": 11.748776508972268, + "grad_norm": 0.11993558704853058, + "learning_rate": 0.00043485687627803935, + "loss": 0.0094, + "num_input_tokens_seen": 155410720, + "step": 72020 + }, + { + "epoch": 11.749592169657422, + "grad_norm": 0.0391787625849247, + "learning_rate": 0.00043478630382253646, + "loss": 0.0059, + "num_input_tokens_seen": 155421536, + "step": 72025 + }, + { + "epoch": 11.750407830342578, + "grad_norm": 0.04633534699678421, + "learning_rate": 0.00043471573268867206, + "loss": 0.0077, + "num_input_tokens_seen": 155431744, + "step": 72030 + }, + { + "epoch": 11.751223491027732, + "grad_norm": 0.40625518560409546, + "learning_rate": 0.00043464516287787617, + "loss": 0.0978, + "num_input_tokens_seen": 155442464, + "step": 72035 + }, + { + "epoch": 11.752039151712887, + "grad_norm": 0.02742503210902214, + "learning_rate": 0.0004345745943915788, + "loss": 0.01, + "num_input_tokens_seen": 155454016, + "step": 72040 + }, + { + "epoch": 11.752854812398043, + "grad_norm": 0.01242095697671175, + "learning_rate": 0.0004345040272312104, + "loss": 0.0334, + "num_input_tokens_seen": 155465088, + "step": 72045 + }, + { + "epoch": 11.753670473083197, + "grad_norm": 0.026455940678715706, + "learning_rate": 0.00043443346139820086, + "loss": 0.0046, + "num_input_tokens_seen": 155474624, + "step": 72050 + }, + { + "epoch": 11.754486133768353, + "grad_norm": 0.5758869647979736, + "learning_rate": 0.0004343628968939805, + "loss": 0.0606, + "num_input_tokens_seen": 155484960, + "step": 72055 + }, + { + "epoch": 11.755301794453507, + "grad_norm": 0.0382433719933033, + "learning_rate": 0.0004342923337199793, + "loss": 0.0205, + "num_input_tokens_seen": 155495264, + "step": 72060 + }, + { + "epoch": 11.756117455138662, + "grad_norm": 0.0016977923223748803, + "learning_rate": 0.0004342217718776273, + "loss": 0.0037, + "num_input_tokens_seen": 155506656, + "step": 72065 + }, + { + "epoch": 11.756933115823816, + "grad_norm": 0.022454719990491867, + "learning_rate": 0.00043415121136835454, + "loss": 0.0079, + "num_input_tokens_seen": 155517664, + "step": 72070 + }, + { + "epoch": 11.757748776508972, + "grad_norm": 0.2604980766773224, + "learning_rate": 0.00043408065219359106, + "loss": 0.0284, + "num_input_tokens_seen": 155528480, + "step": 72075 + }, + { + "epoch": 11.758564437194128, + "grad_norm": 0.010173599235713482, + "learning_rate": 0.00043401009435476665, + "loss": 0.1226, + "num_input_tokens_seen": 155539072, + "step": 72080 + }, + { + "epoch": 11.759380097879282, + "grad_norm": 0.0010750076035037637, + "learning_rate": 0.0004339395378533116, + "loss": 0.0041, + "num_input_tokens_seen": 155548768, + "step": 72085 + }, + { + "epoch": 11.760195758564437, + "grad_norm": 0.014476287178695202, + "learning_rate": 0.00043386898269065537, + "loss": 0.0144, + "num_input_tokens_seen": 155560320, + "step": 72090 + }, + { + "epoch": 11.761011419249591, + "grad_norm": 0.36666467785835266, + "learning_rate": 0.00043379842886822836, + "loss": 0.1589, + "num_input_tokens_seen": 155570368, + "step": 72095 + }, + { + "epoch": 11.761827079934747, + "grad_norm": 0.436190128326416, + "learning_rate": 0.0004337278763874599, + "loss": 0.1096, + "num_input_tokens_seen": 155579808, + "step": 72100 + }, + { + "epoch": 11.762642740619903, + "grad_norm": 0.0614510215818882, + "learning_rate": 0.0004336573252497804, + "loss": 0.0071, + "num_input_tokens_seen": 155590304, + "step": 72105 + }, + { + "epoch": 11.763458401305057, + "grad_norm": 0.3529820144176483, + "learning_rate": 0.00043358677545661913, + "loss": 0.1449, + "num_input_tokens_seen": 155602336, + "step": 72110 + }, + { + "epoch": 11.764274061990212, + "grad_norm": 0.16326484084129333, + "learning_rate": 0.0004335162270094063, + "loss": 0.0406, + "num_input_tokens_seen": 155613376, + "step": 72115 + }, + { + "epoch": 11.765089722675366, + "grad_norm": 0.11344678699970245, + "learning_rate": 0.0004334456799095712, + "loss": 0.0898, + "num_input_tokens_seen": 155624992, + "step": 72120 + }, + { + "epoch": 11.765905383360522, + "grad_norm": 0.06390348076820374, + "learning_rate": 0.00043337513415854414, + "loss": 0.0122, + "num_input_tokens_seen": 155635904, + "step": 72125 + }, + { + "epoch": 11.766721044045678, + "grad_norm": 0.019454030320048332, + "learning_rate": 0.0004333045897577542, + "loss": 0.0259, + "num_input_tokens_seen": 155646624, + "step": 72130 + }, + { + "epoch": 11.767536704730832, + "grad_norm": 0.09129560738801956, + "learning_rate": 0.00043323404670863165, + "loss": 0.0168, + "num_input_tokens_seen": 155656384, + "step": 72135 + }, + { + "epoch": 11.768352365415987, + "grad_norm": 0.0006940299645066261, + "learning_rate": 0.0004331635050126056, + "loss": 0.0169, + "num_input_tokens_seen": 155667264, + "step": 72140 + }, + { + "epoch": 11.769168026101141, + "grad_norm": 0.021187156438827515, + "learning_rate": 0.0004330929646711059, + "loss": 0.0378, + "num_input_tokens_seen": 155677024, + "step": 72145 + }, + { + "epoch": 11.769983686786297, + "grad_norm": 0.0029175153467804193, + "learning_rate": 0.0004330224256855624, + "loss": 0.0038, + "num_input_tokens_seen": 155688640, + "step": 72150 + }, + { + "epoch": 11.770799347471453, + "grad_norm": 0.0036662856582552195, + "learning_rate": 0.00043295188805740414, + "loss": 0.0311, + "num_input_tokens_seen": 155698592, + "step": 72155 + }, + { + "epoch": 11.771615008156607, + "grad_norm": 0.4789508283138275, + "learning_rate": 0.0004328813517880612, + "loss": 0.0722, + "num_input_tokens_seen": 155710144, + "step": 72160 + }, + { + "epoch": 11.772430668841762, + "grad_norm": 0.0538821779191494, + "learning_rate": 0.00043281081687896253, + "loss": 0.0146, + "num_input_tokens_seen": 155720192, + "step": 72165 + }, + { + "epoch": 11.773246329526916, + "grad_norm": 0.11486588418483734, + "learning_rate": 0.0004327402833315381, + "loss": 0.0191, + "num_input_tokens_seen": 155731040, + "step": 72170 + }, + { + "epoch": 11.774061990212072, + "grad_norm": 0.0020896433852612972, + "learning_rate": 0.000432669751147217, + "loss": 0.0513, + "num_input_tokens_seen": 155740736, + "step": 72175 + }, + { + "epoch": 11.774877650897226, + "grad_norm": 0.3756389319896698, + "learning_rate": 0.000432599220327429, + "loss": 0.0425, + "num_input_tokens_seen": 155751840, + "step": 72180 + }, + { + "epoch": 11.775693311582382, + "grad_norm": 0.04037978872656822, + "learning_rate": 0.0004325286908736031, + "loss": 0.0088, + "num_input_tokens_seen": 155762912, + "step": 72185 + }, + { + "epoch": 11.776508972267537, + "grad_norm": 0.21421808004379272, + "learning_rate": 0.0004324581627871691, + "loss": 0.0377, + "num_input_tokens_seen": 155774688, + "step": 72190 + }, + { + "epoch": 11.777324632952691, + "grad_norm": 0.04104871302843094, + "learning_rate": 0.00043238763606955586, + "loss": 0.0062, + "num_input_tokens_seen": 155785760, + "step": 72195 + }, + { + "epoch": 11.778140293637847, + "grad_norm": 0.0044779665768146515, + "learning_rate": 0.00043231711072219307, + "loss": 0.006, + "num_input_tokens_seen": 155796256, + "step": 72200 + }, + { + "epoch": 11.778955954323001, + "grad_norm": 0.005733925383538008, + "learning_rate": 0.0004322465867465099, + "loss": 0.0071, + "num_input_tokens_seen": 155806080, + "step": 72205 + }, + { + "epoch": 11.779771615008157, + "grad_norm": 0.05142854526638985, + "learning_rate": 0.0004321760641439356, + "loss": 0.0056, + "num_input_tokens_seen": 155816288, + "step": 72210 + }, + { + "epoch": 11.780587275693312, + "grad_norm": 0.000851978431455791, + "learning_rate": 0.00043210554291589937, + "loss": 0.0144, + "num_input_tokens_seen": 155827488, + "step": 72215 + }, + { + "epoch": 11.781402936378466, + "grad_norm": 0.02198336087167263, + "learning_rate": 0.00043203502306383046, + "loss": 0.0515, + "num_input_tokens_seen": 155838016, + "step": 72220 + }, + { + "epoch": 11.782218597063622, + "grad_norm": 0.005845979321748018, + "learning_rate": 0.0004319645045891579, + "loss": 0.0827, + "num_input_tokens_seen": 155848928, + "step": 72225 + }, + { + "epoch": 11.783034257748776, + "grad_norm": 0.005779265891760588, + "learning_rate": 0.0004318939874933113, + "loss": 0.0071, + "num_input_tokens_seen": 155860352, + "step": 72230 + }, + { + "epoch": 11.783849918433932, + "grad_norm": 0.0015582084888592362, + "learning_rate": 0.00043182347177771907, + "loss": 0.084, + "num_input_tokens_seen": 155871040, + "step": 72235 + }, + { + "epoch": 11.784665579119086, + "grad_norm": 0.38620254397392273, + "learning_rate": 0.000431752957443811, + "loss": 0.1274, + "num_input_tokens_seen": 155882048, + "step": 72240 + }, + { + "epoch": 11.785481239804241, + "grad_norm": 0.016361694782972336, + "learning_rate": 0.00043168244449301555, + "loss": 0.0267, + "num_input_tokens_seen": 155893600, + "step": 72245 + }, + { + "epoch": 11.786296900489397, + "grad_norm": 0.033202167600393295, + "learning_rate": 0.00043161193292676203, + "loss": 0.0055, + "num_input_tokens_seen": 155904832, + "step": 72250 + }, + { + "epoch": 11.78711256117455, + "grad_norm": 0.01852128468453884, + "learning_rate": 0.00043154142274647966, + "loss": 0.0055, + "num_input_tokens_seen": 155915968, + "step": 72255 + }, + { + "epoch": 11.787928221859707, + "grad_norm": 0.027508525177836418, + "learning_rate": 0.000431470913953597, + "loss": 0.0197, + "num_input_tokens_seen": 155927072, + "step": 72260 + }, + { + "epoch": 11.78874388254486, + "grad_norm": 0.0015920454170554876, + "learning_rate": 0.00043140040654954346, + "loss": 0.012, + "num_input_tokens_seen": 155936384, + "step": 72265 + }, + { + "epoch": 11.789559543230016, + "grad_norm": 0.05147100239992142, + "learning_rate": 0.00043132990053574747, + "loss": 0.0139, + "num_input_tokens_seen": 155947744, + "step": 72270 + }, + { + "epoch": 11.790375203915172, + "grad_norm": 0.13846653699874878, + "learning_rate": 0.0004312593959136383, + "loss": 0.0281, + "num_input_tokens_seen": 155957536, + "step": 72275 + }, + { + "epoch": 11.791190864600326, + "grad_norm": 0.18716596066951752, + "learning_rate": 0.0004311888926846445, + "loss": 0.0214, + "num_input_tokens_seen": 155969376, + "step": 72280 + }, + { + "epoch": 11.792006525285482, + "grad_norm": 0.006289002485573292, + "learning_rate": 0.00043111839085019534, + "loss": 0.009, + "num_input_tokens_seen": 155980032, + "step": 72285 + }, + { + "epoch": 11.792822185970635, + "grad_norm": 0.0041765449568629265, + "learning_rate": 0.0004310478904117191, + "loss": 0.0065, + "num_input_tokens_seen": 155990560, + "step": 72290 + }, + { + "epoch": 11.793637846655791, + "grad_norm": 0.00943774450570345, + "learning_rate": 0.0004309773913706451, + "loss": 0.0099, + "num_input_tokens_seen": 156000320, + "step": 72295 + }, + { + "epoch": 11.794453507340947, + "grad_norm": 0.5228952765464783, + "learning_rate": 0.00043090689372840156, + "loss": 0.0781, + "num_input_tokens_seen": 156011488, + "step": 72300 + }, + { + "epoch": 11.7952691680261, + "grad_norm": 0.0036159532610327005, + "learning_rate": 0.0004308363974864178, + "loss": 0.0124, + "num_input_tokens_seen": 156022816, + "step": 72305 + }, + { + "epoch": 11.796084828711257, + "grad_norm": 0.45897358655929565, + "learning_rate": 0.0004307659026461218, + "loss": 0.0426, + "num_input_tokens_seen": 156033824, + "step": 72310 + }, + { + "epoch": 11.79690048939641, + "grad_norm": 0.001046838820911944, + "learning_rate": 0.00043069540920894297, + "loss": 0.0577, + "num_input_tokens_seen": 156045088, + "step": 72315 + }, + { + "epoch": 11.797716150081566, + "grad_norm": 0.011327157728374004, + "learning_rate": 0.0004306249171763093, + "loss": 0.0063, + "num_input_tokens_seen": 156055616, + "step": 72320 + }, + { + "epoch": 11.798531810766722, + "grad_norm": 0.022500766441226006, + "learning_rate": 0.0004305544265496499, + "loss": 0.0735, + "num_input_tokens_seen": 156066528, + "step": 72325 + }, + { + "epoch": 11.799347471451876, + "grad_norm": 0.0011193618411198258, + "learning_rate": 0.000430483937330393, + "loss": 0.0029, + "num_input_tokens_seen": 156077984, + "step": 72330 + }, + { + "epoch": 11.800163132137031, + "grad_norm": 0.04918704181909561, + "learning_rate": 0.0004304134495199674, + "loss": 0.0231, + "num_input_tokens_seen": 156088032, + "step": 72335 + }, + { + "epoch": 11.800978792822185, + "grad_norm": 0.002988564083352685, + "learning_rate": 0.0004303429631198014, + "loss": 0.1342, + "num_input_tokens_seen": 156097728, + "step": 72340 + }, + { + "epoch": 11.801794453507341, + "grad_norm": 0.15276111662387848, + "learning_rate": 0.0004302724781313237, + "loss": 0.1313, + "num_input_tokens_seen": 156107712, + "step": 72345 + }, + { + "epoch": 11.802610114192497, + "grad_norm": 0.001921085873618722, + "learning_rate": 0.0004302019945559627, + "loss": 0.0097, + "num_input_tokens_seen": 156118912, + "step": 72350 + }, + { + "epoch": 11.80342577487765, + "grad_norm": 0.03741778805851936, + "learning_rate": 0.0004301315123951467, + "loss": 0.04, + "num_input_tokens_seen": 156130048, + "step": 72355 + }, + { + "epoch": 11.804241435562806, + "grad_norm": 0.0417608916759491, + "learning_rate": 0.0004300610316503045, + "loss": 0.0168, + "num_input_tokens_seen": 156139840, + "step": 72360 + }, + { + "epoch": 11.80505709624796, + "grad_norm": 0.007151048164814711, + "learning_rate": 0.00042999055232286387, + "loss": 0.0113, + "num_input_tokens_seen": 156150816, + "step": 72365 + }, + { + "epoch": 11.805872756933116, + "grad_norm": 0.010008826851844788, + "learning_rate": 0.00042992007441425376, + "loss": 0.1743, + "num_input_tokens_seen": 156164160, + "step": 72370 + }, + { + "epoch": 11.80668841761827, + "grad_norm": 0.012443705461919308, + "learning_rate": 0.00042984959792590215, + "loss": 0.1265, + "num_input_tokens_seen": 156174464, + "step": 72375 + }, + { + "epoch": 11.807504078303426, + "grad_norm": 0.030072130262851715, + "learning_rate": 0.00042977912285923747, + "loss": 0.0034, + "num_input_tokens_seen": 156185760, + "step": 72380 + }, + { + "epoch": 11.808319738988581, + "grad_norm": 0.058631282299757004, + "learning_rate": 0.000429708649215688, + "loss": 0.1289, + "num_input_tokens_seen": 156196128, + "step": 72385 + }, + { + "epoch": 11.809135399673735, + "grad_norm": 0.07309330999851227, + "learning_rate": 0.00042963817699668183, + "loss": 0.0897, + "num_input_tokens_seen": 156205920, + "step": 72390 + }, + { + "epoch": 11.809951060358891, + "grad_norm": 0.0622737742960453, + "learning_rate": 0.0004295677062036472, + "loss": 0.0071, + "num_input_tokens_seen": 156217184, + "step": 72395 + }, + { + "epoch": 11.810766721044045, + "grad_norm": 0.08554977178573608, + "learning_rate": 0.00042949723683801256, + "loss": 0.0224, + "num_input_tokens_seen": 156227968, + "step": 72400 + }, + { + "epoch": 11.8115823817292, + "grad_norm": 0.09381914883852005, + "learning_rate": 0.0004294267689012057, + "loss": 0.0204, + "num_input_tokens_seen": 156239360, + "step": 72405 + }, + { + "epoch": 11.812398042414356, + "grad_norm": 0.5968732237815857, + "learning_rate": 0.000429356302394655, + "loss": 0.046, + "num_input_tokens_seen": 156248928, + "step": 72410 + }, + { + "epoch": 11.81321370309951, + "grad_norm": 0.42291563749313354, + "learning_rate": 0.00042928583731978833, + "loss": 0.134, + "num_input_tokens_seen": 156259648, + "step": 72415 + }, + { + "epoch": 11.814029363784666, + "grad_norm": 0.42290055751800537, + "learning_rate": 0.00042921537367803403, + "loss": 0.0564, + "num_input_tokens_seen": 156269664, + "step": 72420 + }, + { + "epoch": 11.81484502446982, + "grad_norm": 0.016286712139844894, + "learning_rate": 0.0004291449114708198, + "loss": 0.0216, + "num_input_tokens_seen": 156279552, + "step": 72425 + }, + { + "epoch": 11.815660685154976, + "grad_norm": 0.044731978327035904, + "learning_rate": 0.000429074450699574, + "loss": 0.028, + "num_input_tokens_seen": 156290944, + "step": 72430 + }, + { + "epoch": 11.81647634584013, + "grad_norm": 0.02294662967324257, + "learning_rate": 0.0004290039913657243, + "loss": 0.0068, + "num_input_tokens_seen": 156301856, + "step": 72435 + }, + { + "epoch": 11.817292006525285, + "grad_norm": 0.011808139272034168, + "learning_rate": 0.00042893353347069887, + "loss": 0.1006, + "num_input_tokens_seen": 156313056, + "step": 72440 + }, + { + "epoch": 11.818107667210441, + "grad_norm": 0.002664636354893446, + "learning_rate": 0.0004288630770159254, + "loss": 0.008, + "num_input_tokens_seen": 156324224, + "step": 72445 + }, + { + "epoch": 11.818923327895595, + "grad_norm": 0.009831523522734642, + "learning_rate": 0.00042879262200283216, + "loss": 0.0522, + "num_input_tokens_seen": 156334944, + "step": 72450 + }, + { + "epoch": 11.81973898858075, + "grad_norm": 0.022667329758405685, + "learning_rate": 0.0004287221684328465, + "loss": 0.0055, + "num_input_tokens_seen": 156345664, + "step": 72455 + }, + { + "epoch": 11.820554649265905, + "grad_norm": 0.09040852636098862, + "learning_rate": 0.00042865171630739654, + "loss": 0.007, + "num_input_tokens_seen": 156357504, + "step": 72460 + }, + { + "epoch": 11.82137030995106, + "grad_norm": 0.0035189385525882244, + "learning_rate": 0.0004285812656279102, + "loss": 0.0042, + "num_input_tokens_seen": 156366208, + "step": 72465 + }, + { + "epoch": 11.822185970636216, + "grad_norm": 0.0047759427689015865, + "learning_rate": 0.000428510816395815, + "loss": 0.0044, + "num_input_tokens_seen": 156377376, + "step": 72470 + }, + { + "epoch": 11.82300163132137, + "grad_norm": 0.014816605485975742, + "learning_rate": 0.00042844036861253897, + "loss": 0.0071, + "num_input_tokens_seen": 156387616, + "step": 72475 + }, + { + "epoch": 11.823817292006526, + "grad_norm": 0.14894811809062958, + "learning_rate": 0.00042836992227950944, + "loss": 0.0813, + "num_input_tokens_seen": 156396960, + "step": 72480 + }, + { + "epoch": 11.82463295269168, + "grad_norm": 0.005907776765525341, + "learning_rate": 0.0004282994773981546, + "loss": 0.0102, + "num_input_tokens_seen": 156407104, + "step": 72485 + }, + { + "epoch": 11.825448613376835, + "grad_norm": 0.29801076650619507, + "learning_rate": 0.00042822903396990146, + "loss": 0.053, + "num_input_tokens_seen": 156418784, + "step": 72490 + }, + { + "epoch": 11.826264274061991, + "grad_norm": 0.15158681571483612, + "learning_rate": 0.0004281585919961783, + "loss": 0.0256, + "num_input_tokens_seen": 156428768, + "step": 72495 + }, + { + "epoch": 11.827079934747145, + "grad_norm": 0.40904170274734497, + "learning_rate": 0.00042808815147841214, + "loss": 0.0909, + "num_input_tokens_seen": 156440448, + "step": 72500 + }, + { + "epoch": 11.8278955954323, + "grad_norm": 0.07840380072593689, + "learning_rate": 0.0004280177124180311, + "loss": 0.0078, + "num_input_tokens_seen": 156452544, + "step": 72505 + }, + { + "epoch": 11.828711256117455, + "grad_norm": 0.002008330076932907, + "learning_rate": 0.0004279472748164621, + "loss": 0.0103, + "num_input_tokens_seen": 156462624, + "step": 72510 + }, + { + "epoch": 11.82952691680261, + "grad_norm": 0.003202697029337287, + "learning_rate": 0.0004278768386751332, + "loss": 0.1017, + "num_input_tokens_seen": 156473088, + "step": 72515 + }, + { + "epoch": 11.830342577487766, + "grad_norm": 0.02322370558977127, + "learning_rate": 0.0004278064039954716, + "loss": 0.0038, + "num_input_tokens_seen": 156483936, + "step": 72520 + }, + { + "epoch": 11.83115823817292, + "grad_norm": 0.12745387852191925, + "learning_rate": 0.00042773597077890485, + "loss": 0.0101, + "num_input_tokens_seen": 156495136, + "step": 72525 + }, + { + "epoch": 11.831973898858076, + "grad_norm": 0.0022960996720939875, + "learning_rate": 0.0004276655390268603, + "loss": 0.0092, + "num_input_tokens_seen": 156505984, + "step": 72530 + }, + { + "epoch": 11.83278955954323, + "grad_norm": 0.05009673163294792, + "learning_rate": 0.0004275951087407653, + "loss": 0.0214, + "num_input_tokens_seen": 156516352, + "step": 72535 + }, + { + "epoch": 11.833605220228385, + "grad_norm": 0.008180424571037292, + "learning_rate": 0.0004275246799220473, + "loss": 0.0116, + "num_input_tokens_seen": 156527040, + "step": 72540 + }, + { + "epoch": 11.83442088091354, + "grad_norm": 0.24411429464817047, + "learning_rate": 0.0004274542525721338, + "loss": 0.091, + "num_input_tokens_seen": 156538656, + "step": 72545 + }, + { + "epoch": 11.835236541598695, + "grad_norm": 0.008177938871085644, + "learning_rate": 0.00042738382669245157, + "loss": 0.0078, + "num_input_tokens_seen": 156548000, + "step": 72550 + }, + { + "epoch": 11.83605220228385, + "grad_norm": 0.006559658795595169, + "learning_rate": 0.0004273134022844285, + "loss": 0.0243, + "num_input_tokens_seen": 156559200, + "step": 72555 + }, + { + "epoch": 11.836867862969005, + "grad_norm": 0.005587293300777674, + "learning_rate": 0.00042724297934949136, + "loss": 0.0045, + "num_input_tokens_seen": 156571232, + "step": 72560 + }, + { + "epoch": 11.83768352365416, + "grad_norm": 0.0016860413597896695, + "learning_rate": 0.0004271725578890675, + "loss": 0.0108, + "num_input_tokens_seen": 156582016, + "step": 72565 + }, + { + "epoch": 11.838499184339314, + "grad_norm": 0.027797425165772438, + "learning_rate": 0.00042710213790458435, + "loss": 0.0586, + "num_input_tokens_seen": 156591136, + "step": 72570 + }, + { + "epoch": 11.83931484502447, + "grad_norm": 0.02478928677737713, + "learning_rate": 0.00042703171939746865, + "loss": 0.024, + "num_input_tokens_seen": 156602176, + "step": 72575 + }, + { + "epoch": 11.840130505709626, + "grad_norm": 0.09154313057661057, + "learning_rate": 0.00042696130236914796, + "loss": 0.0181, + "num_input_tokens_seen": 156613760, + "step": 72580 + }, + { + "epoch": 11.84094616639478, + "grad_norm": 0.04857185110449791, + "learning_rate": 0.00042689088682104886, + "loss": 0.0112, + "num_input_tokens_seen": 156624192, + "step": 72585 + }, + { + "epoch": 11.841761827079935, + "grad_norm": 0.002952470676973462, + "learning_rate": 0.00042682047275459893, + "loss": 0.0278, + "num_input_tokens_seen": 156635232, + "step": 72590 + }, + { + "epoch": 11.84257748776509, + "grad_norm": 0.01911775767803192, + "learning_rate": 0.00042675006017122477, + "loss": 0.0041, + "num_input_tokens_seen": 156646144, + "step": 72595 + }, + { + "epoch": 11.843393148450245, + "grad_norm": 0.03756110742688179, + "learning_rate": 0.0004266796490723538, + "loss": 0.1, + "num_input_tokens_seen": 156657056, + "step": 72600 + }, + { + "epoch": 11.844208809135399, + "grad_norm": 0.25630879402160645, + "learning_rate": 0.0004266092394594124, + "loss": 0.043, + "num_input_tokens_seen": 156667616, + "step": 72605 + }, + { + "epoch": 11.845024469820554, + "grad_norm": 0.022624224424362183, + "learning_rate": 0.00042653883133382824, + "loss": 0.0238, + "num_input_tokens_seen": 156679488, + "step": 72610 + }, + { + "epoch": 11.84584013050571, + "grad_norm": 0.24670207500457764, + "learning_rate": 0.00042646842469702754, + "loss": 0.0199, + "num_input_tokens_seen": 156691136, + "step": 72615 + }, + { + "epoch": 11.846655791190864, + "grad_norm": 0.009172812104225159, + "learning_rate": 0.0004263980195504378, + "loss": 0.0046, + "num_input_tokens_seen": 156703008, + "step": 72620 + }, + { + "epoch": 11.84747145187602, + "grad_norm": 0.2725246846675873, + "learning_rate": 0.0004263276158954853, + "loss": 0.118, + "num_input_tokens_seen": 156713696, + "step": 72625 + }, + { + "epoch": 11.848287112561174, + "grad_norm": 0.01504495833069086, + "learning_rate": 0.0004262572137335973, + "loss": 0.0054, + "num_input_tokens_seen": 156725248, + "step": 72630 + }, + { + "epoch": 11.84910277324633, + "grad_norm": 0.008457683026790619, + "learning_rate": 0.00042618681306620025, + "loss": 0.0046, + "num_input_tokens_seen": 156736608, + "step": 72635 + }, + { + "epoch": 11.849918433931485, + "grad_norm": 0.11225425451993942, + "learning_rate": 0.00042611641389472127, + "loss": 0.0183, + "num_input_tokens_seen": 156747040, + "step": 72640 + }, + { + "epoch": 11.850734094616639, + "grad_norm": 0.0021378749515861273, + "learning_rate": 0.0004260460162205867, + "loss": 0.0054, + "num_input_tokens_seen": 156758592, + "step": 72645 + }, + { + "epoch": 11.851549755301795, + "grad_norm": 0.004968815948814154, + "learning_rate": 0.0004259756200452236, + "loss": 0.0696, + "num_input_tokens_seen": 156768928, + "step": 72650 + }, + { + "epoch": 11.852365415986949, + "grad_norm": 0.8242314457893372, + "learning_rate": 0.00042590522537005825, + "loss": 0.0452, + "num_input_tokens_seen": 156779840, + "step": 72655 + }, + { + "epoch": 11.853181076672104, + "grad_norm": 0.3122008442878723, + "learning_rate": 0.00042583483219651763, + "loss": 0.0833, + "num_input_tokens_seen": 156791616, + "step": 72660 + }, + { + "epoch": 11.85399673735726, + "grad_norm": 0.10448703914880753, + "learning_rate": 0.0004257644405260282, + "loss": 0.0171, + "num_input_tokens_seen": 156801536, + "step": 72665 + }, + { + "epoch": 11.854812398042414, + "grad_norm": 0.02397385984659195, + "learning_rate": 0.0004256940503600166, + "loss": 0.0087, + "num_input_tokens_seen": 156812032, + "step": 72670 + }, + { + "epoch": 11.85562805872757, + "grad_norm": 0.0009809827897697687, + "learning_rate": 0.00042562366169990936, + "loss": 0.0064, + "num_input_tokens_seen": 156821920, + "step": 72675 + }, + { + "epoch": 11.856443719412724, + "grad_norm": 0.30117154121398926, + "learning_rate": 0.00042555327454713276, + "loss": 0.134, + "num_input_tokens_seen": 156833056, + "step": 72680 + }, + { + "epoch": 11.85725938009788, + "grad_norm": 0.015220411121845245, + "learning_rate": 0.0004254828889031137, + "loss": 0.023, + "num_input_tokens_seen": 156844192, + "step": 72685 + }, + { + "epoch": 11.858075040783035, + "grad_norm": 0.06113926321268082, + "learning_rate": 0.0004254125047692784, + "loss": 0.0632, + "num_input_tokens_seen": 156855488, + "step": 72690 + }, + { + "epoch": 11.858890701468189, + "grad_norm": 0.05479900538921356, + "learning_rate": 0.00042534212214705326, + "loss": 0.0441, + "num_input_tokens_seen": 156866336, + "step": 72695 + }, + { + "epoch": 11.859706362153345, + "grad_norm": 0.024053962901234627, + "learning_rate": 0.0004252717410378648, + "loss": 0.0031, + "num_input_tokens_seen": 156876928, + "step": 72700 + }, + { + "epoch": 11.860522022838499, + "grad_norm": 0.018071835860610008, + "learning_rate": 0.00042520136144313925, + "loss": 0.2915, + "num_input_tokens_seen": 156886656, + "step": 72705 + }, + { + "epoch": 11.861337683523654, + "grad_norm": 0.13032515347003937, + "learning_rate": 0.0004251309833643029, + "loss": 0.0394, + "num_input_tokens_seen": 156897120, + "step": 72710 + }, + { + "epoch": 11.86215334420881, + "grad_norm": 0.18610690534114838, + "learning_rate": 0.00042506060680278234, + "loss": 0.0721, + "num_input_tokens_seen": 156907584, + "step": 72715 + }, + { + "epoch": 11.862969004893964, + "grad_norm": 0.43544426560401917, + "learning_rate": 0.00042499023176000353, + "loss": 0.0861, + "num_input_tokens_seen": 156918432, + "step": 72720 + }, + { + "epoch": 11.86378466557912, + "grad_norm": 0.0852821096777916, + "learning_rate": 0.000424919858237393, + "loss": 0.0108, + "num_input_tokens_seen": 156928640, + "step": 72725 + }, + { + "epoch": 11.864600326264274, + "grad_norm": 0.07228749990463257, + "learning_rate": 0.00042484948623637656, + "loss": 0.0097, + "num_input_tokens_seen": 156938528, + "step": 72730 + }, + { + "epoch": 11.86541598694943, + "grad_norm": 0.01618649996817112, + "learning_rate": 0.0004247791157583808, + "loss": 0.0052, + "num_input_tokens_seen": 156948704, + "step": 72735 + }, + { + "epoch": 11.866231647634583, + "grad_norm": 0.050602275878190994, + "learning_rate": 0.0004247087468048315, + "loss": 0.0149, + "num_input_tokens_seen": 156959072, + "step": 72740 + }, + { + "epoch": 11.867047308319739, + "grad_norm": 0.2452055960893631, + "learning_rate": 0.00042463837937715515, + "loss": 0.0131, + "num_input_tokens_seen": 156969184, + "step": 72745 + }, + { + "epoch": 11.867862969004895, + "grad_norm": 0.001906348392367363, + "learning_rate": 0.0004245680134767775, + "loss": 0.0099, + "num_input_tokens_seen": 156980256, + "step": 72750 + }, + { + "epoch": 11.868678629690049, + "grad_norm": 0.046270083636045456, + "learning_rate": 0.0004244976491051249, + "loss": 0.0506, + "num_input_tokens_seen": 156991872, + "step": 72755 + }, + { + "epoch": 11.869494290375204, + "grad_norm": 0.14000466465950012, + "learning_rate": 0.00042442728626362306, + "loss": 0.0233, + "num_input_tokens_seen": 157002336, + "step": 72760 + }, + { + "epoch": 11.870309951060358, + "grad_norm": 0.1977013796567917, + "learning_rate": 0.00042435692495369824, + "loss": 0.0121, + "num_input_tokens_seen": 157014592, + "step": 72765 + }, + { + "epoch": 11.871125611745514, + "grad_norm": 0.02360336296260357, + "learning_rate": 0.0004242865651767762, + "loss": 0.0073, + "num_input_tokens_seen": 157025152, + "step": 72770 + }, + { + "epoch": 11.87194127243067, + "grad_norm": 0.03719685226678848, + "learning_rate": 0.0004242162069342831, + "loss": 0.0041, + "num_input_tokens_seen": 157036608, + "step": 72775 + }, + { + "epoch": 11.872756933115824, + "grad_norm": 0.000949940993450582, + "learning_rate": 0.0004241458502276446, + "loss": 0.0016, + "num_input_tokens_seen": 157049568, + "step": 72780 + }, + { + "epoch": 11.87357259380098, + "grad_norm": 0.0015351512702181935, + "learning_rate": 0.00042407549505828657, + "loss": 0.0137, + "num_input_tokens_seen": 157059424, + "step": 72785 + }, + { + "epoch": 11.874388254486133, + "grad_norm": 0.02397902123630047, + "learning_rate": 0.0004240051414276352, + "loss": 0.0333, + "num_input_tokens_seen": 157070432, + "step": 72790 + }, + { + "epoch": 11.875203915171289, + "grad_norm": 0.004845744464546442, + "learning_rate": 0.00042393478933711585, + "loss": 0.0983, + "num_input_tokens_seen": 157081664, + "step": 72795 + }, + { + "epoch": 11.876019575856443, + "grad_norm": 0.09591659903526306, + "learning_rate": 0.0004238644387881546, + "loss": 0.0081, + "num_input_tokens_seen": 157092352, + "step": 72800 + }, + { + "epoch": 11.876835236541599, + "grad_norm": 0.4144083857536316, + "learning_rate": 0.000423794089782177, + "loss": 0.0455, + "num_input_tokens_seen": 157103328, + "step": 72805 + }, + { + "epoch": 11.877650897226754, + "grad_norm": 0.6511532068252563, + "learning_rate": 0.000423723742320609, + "loss": 0.1075, + "num_input_tokens_seen": 157114144, + "step": 72810 + }, + { + "epoch": 11.878466557911908, + "grad_norm": 0.12312391400337219, + "learning_rate": 0.00042365339640487596, + "loss": 0.0133, + "num_input_tokens_seen": 157125216, + "step": 72815 + }, + { + "epoch": 11.879282218597064, + "grad_norm": 0.0015507943462580442, + "learning_rate": 0.0004235830520364038, + "loss": 0.0109, + "num_input_tokens_seen": 157136608, + "step": 72820 + }, + { + "epoch": 11.880097879282218, + "grad_norm": 0.645636260509491, + "learning_rate": 0.0004235127092166179, + "loss": 0.0469, + "num_input_tokens_seen": 157147552, + "step": 72825 + }, + { + "epoch": 11.880913539967374, + "grad_norm": 0.0017213401151821017, + "learning_rate": 0.0004234423679469441, + "loss": 0.0574, + "num_input_tokens_seen": 157159552, + "step": 72830 + }, + { + "epoch": 11.88172920065253, + "grad_norm": 0.5996347069740295, + "learning_rate": 0.0004233720282288078, + "loss": 0.1236, + "num_input_tokens_seen": 157170176, + "step": 72835 + }, + { + "epoch": 11.882544861337683, + "grad_norm": 0.021771810948848724, + "learning_rate": 0.00042330169006363455, + "loss": 0.1232, + "num_input_tokens_seen": 157181952, + "step": 72840 + }, + { + "epoch": 11.883360522022839, + "grad_norm": 0.0015852865763008595, + "learning_rate": 0.0004232313534528499, + "loss": 0.042, + "num_input_tokens_seen": 157194176, + "step": 72845 + }, + { + "epoch": 11.884176182707993, + "grad_norm": 0.007834825664758682, + "learning_rate": 0.00042316101839787916, + "loss": 0.0079, + "num_input_tokens_seen": 157204992, + "step": 72850 + }, + { + "epoch": 11.884991843393149, + "grad_norm": 0.0020042695105075836, + "learning_rate": 0.00042309068490014787, + "loss": 0.0134, + "num_input_tokens_seen": 157214496, + "step": 72855 + }, + { + "epoch": 11.885807504078304, + "grad_norm": 0.07186789810657501, + "learning_rate": 0.00042302035296108156, + "loss": 0.01, + "num_input_tokens_seen": 157225248, + "step": 72860 + }, + { + "epoch": 11.886623164763458, + "grad_norm": 0.32167547941207886, + "learning_rate": 0.00042295002258210525, + "loss": 0.1886, + "num_input_tokens_seen": 157236416, + "step": 72865 + }, + { + "epoch": 11.887438825448614, + "grad_norm": 0.0016747190384194255, + "learning_rate": 0.00042287969376464466, + "loss": 0.0354, + "num_input_tokens_seen": 157246304, + "step": 72870 + }, + { + "epoch": 11.888254486133768, + "grad_norm": 0.07311717420816422, + "learning_rate": 0.0004228093665101247, + "loss": 0.0108, + "num_input_tokens_seen": 157256352, + "step": 72875 + }, + { + "epoch": 11.889070146818923, + "grad_norm": 0.005355560686439276, + "learning_rate": 0.00042273904081997115, + "loss": 0.0582, + "num_input_tokens_seen": 157267136, + "step": 72880 + }, + { + "epoch": 11.88988580750408, + "grad_norm": 0.05295560508966446, + "learning_rate": 0.0004226687166956087, + "loss": 0.0236, + "num_input_tokens_seen": 157277952, + "step": 72885 + }, + { + "epoch": 11.890701468189233, + "grad_norm": 0.3252590000629425, + "learning_rate": 0.00042259839413846275, + "loss": 0.0364, + "num_input_tokens_seen": 157288704, + "step": 72890 + }, + { + "epoch": 11.891517128874389, + "grad_norm": 0.0053372192196547985, + "learning_rate": 0.0004225280731499588, + "loss": 0.011, + "num_input_tokens_seen": 157300928, + "step": 72895 + }, + { + "epoch": 11.892332789559543, + "grad_norm": 0.0022523601073771715, + "learning_rate": 0.00042245775373152153, + "loss": 0.0355, + "num_input_tokens_seen": 157312000, + "step": 72900 + }, + { + "epoch": 11.893148450244698, + "grad_norm": 0.0009241543593816459, + "learning_rate": 0.0004223874358845764, + "loss": 0.0077, + "num_input_tokens_seen": 157322848, + "step": 72905 + }, + { + "epoch": 11.893964110929852, + "grad_norm": 0.27037522196769714, + "learning_rate": 0.0004223171196105482, + "loss": 0.0433, + "num_input_tokens_seen": 157334528, + "step": 72910 + }, + { + "epoch": 11.894779771615008, + "grad_norm": 0.0036983652971684933, + "learning_rate": 0.0004222468049108623, + "loss": 0.0471, + "num_input_tokens_seen": 157343392, + "step": 72915 + }, + { + "epoch": 11.895595432300164, + "grad_norm": 0.002075768541544676, + "learning_rate": 0.00042217649178694327, + "loss": 0.0022, + "num_input_tokens_seen": 157353248, + "step": 72920 + }, + { + "epoch": 11.896411092985318, + "grad_norm": 0.016349466517567635, + "learning_rate": 0.00042210618024021663, + "loss": 0.0832, + "num_input_tokens_seen": 157364736, + "step": 72925 + }, + { + "epoch": 11.897226753670473, + "grad_norm": 0.019292861223220825, + "learning_rate": 0.00042203587027210684, + "loss": 0.0662, + "num_input_tokens_seen": 157373728, + "step": 72930 + }, + { + "epoch": 11.898042414355627, + "grad_norm": 0.02241452969610691, + "learning_rate": 0.00042196556188403924, + "loss": 0.0397, + "num_input_tokens_seen": 157384800, + "step": 72935 + }, + { + "epoch": 11.898858075040783, + "grad_norm": 0.3213445246219635, + "learning_rate": 0.0004218952550774383, + "loss": 0.0665, + "num_input_tokens_seen": 157393696, + "step": 72940 + }, + { + "epoch": 11.899673735725939, + "grad_norm": 0.05883891135454178, + "learning_rate": 0.00042182494985372937, + "loss": 0.0371, + "num_input_tokens_seen": 157404608, + "step": 72945 + }, + { + "epoch": 11.900489396411093, + "grad_norm": 0.3271454870700836, + "learning_rate": 0.0004217546462143368, + "loss": 0.0235, + "num_input_tokens_seen": 157416288, + "step": 72950 + }, + { + "epoch": 11.901305057096248, + "grad_norm": 0.009723849594593048, + "learning_rate": 0.0004216843441606857, + "loss": 0.0153, + "num_input_tokens_seen": 157427200, + "step": 72955 + }, + { + "epoch": 11.902120717781402, + "grad_norm": 0.006236048880964518, + "learning_rate": 0.0004216140436942006, + "loss": 0.0077, + "num_input_tokens_seen": 157437344, + "step": 72960 + }, + { + "epoch": 11.902936378466558, + "grad_norm": 0.005791103933006525, + "learning_rate": 0.0004215437448163065, + "loss": 0.0105, + "num_input_tokens_seen": 157448960, + "step": 72965 + }, + { + "epoch": 11.903752039151712, + "grad_norm": 0.015321547165513039, + "learning_rate": 0.00042147344752842774, + "loss": 0.0148, + "num_input_tokens_seen": 157461088, + "step": 72970 + }, + { + "epoch": 11.904567699836868, + "grad_norm": 0.054951075464487076, + "learning_rate": 0.0004214031518319893, + "loss": 0.0236, + "num_input_tokens_seen": 157470720, + "step": 72975 + }, + { + "epoch": 11.905383360522023, + "grad_norm": 0.011692700907588005, + "learning_rate": 0.0004213328577284157, + "loss": 0.0488, + "num_input_tokens_seen": 157481792, + "step": 72980 + }, + { + "epoch": 11.906199021207177, + "grad_norm": 0.02238883264362812, + "learning_rate": 0.0004212625652191315, + "loss": 0.02, + "num_input_tokens_seen": 157492544, + "step": 72985 + }, + { + "epoch": 11.907014681892333, + "grad_norm": 0.0006403630250133574, + "learning_rate": 0.00042119227430556137, + "loss": 0.1743, + "num_input_tokens_seen": 157503808, + "step": 72990 + }, + { + "epoch": 11.907830342577487, + "grad_norm": 0.14390014111995697, + "learning_rate": 0.0004211219849891296, + "loss": 0.0349, + "num_input_tokens_seen": 157513760, + "step": 72995 + }, + { + "epoch": 11.908646003262643, + "grad_norm": 0.4305567741394043, + "learning_rate": 0.00042105169727126094, + "loss": 0.0894, + "num_input_tokens_seen": 157523648, + "step": 73000 + }, + { + "epoch": 11.909461663947798, + "grad_norm": 0.014185127802193165, + "learning_rate": 0.00042098141115337986, + "loss": 0.0109, + "num_input_tokens_seen": 157534496, + "step": 73005 + }, + { + "epoch": 11.910277324632952, + "grad_norm": 1.5293669700622559, + "learning_rate": 0.0004209111266369107, + "loss": 0.0635, + "num_input_tokens_seen": 157544544, + "step": 73010 + }, + { + "epoch": 11.911092985318108, + "grad_norm": 0.05437318608164787, + "learning_rate": 0.0004208408437232779, + "loss": 0.0678, + "num_input_tokens_seen": 157555488, + "step": 73015 + }, + { + "epoch": 11.911908646003262, + "grad_norm": 0.10591735690832138, + "learning_rate": 0.00042077056241390586, + "loss": 0.0271, + "num_input_tokens_seen": 157567232, + "step": 73020 + }, + { + "epoch": 11.912724306688418, + "grad_norm": 0.0045133670791983604, + "learning_rate": 0.00042070028271021877, + "loss": 0.0292, + "num_input_tokens_seen": 157578080, + "step": 73025 + }, + { + "epoch": 11.913539967373573, + "grad_norm": 0.007219410035759211, + "learning_rate": 0.0004206300046136412, + "loss": 0.0046, + "num_input_tokens_seen": 157588352, + "step": 73030 + }, + { + "epoch": 11.914355628058727, + "grad_norm": 0.010021929629147053, + "learning_rate": 0.00042055972812559707, + "loss": 0.1058, + "num_input_tokens_seen": 157599584, + "step": 73035 + }, + { + "epoch": 11.915171288743883, + "grad_norm": 0.06710880994796753, + "learning_rate": 0.0004204894532475111, + "loss": 0.0131, + "num_input_tokens_seen": 157610208, + "step": 73040 + }, + { + "epoch": 11.915986949429037, + "grad_norm": 0.022323941811919212, + "learning_rate": 0.00042041917998080695, + "loss": 0.1032, + "num_input_tokens_seen": 157620896, + "step": 73045 + }, + { + "epoch": 11.916802610114193, + "grad_norm": 0.001536445808596909, + "learning_rate": 0.0004203489083269093, + "loss": 0.0051, + "num_input_tokens_seen": 157631520, + "step": 73050 + }, + { + "epoch": 11.917618270799348, + "grad_norm": 0.05237545818090439, + "learning_rate": 0.0004202786382872419, + "loss": 0.0387, + "num_input_tokens_seen": 157642848, + "step": 73055 + }, + { + "epoch": 11.918433931484502, + "grad_norm": 0.30910801887512207, + "learning_rate": 0.00042020836986322917, + "loss": 0.0305, + "num_input_tokens_seen": 157654624, + "step": 73060 + }, + { + "epoch": 11.919249592169658, + "grad_norm": 0.0016805122140794992, + "learning_rate": 0.0004201381030562949, + "loss": 0.0425, + "num_input_tokens_seen": 157664864, + "step": 73065 + }, + { + "epoch": 11.920065252854812, + "grad_norm": 0.018283285200595856, + "learning_rate": 0.00042006783786786346, + "loss": 0.0071, + "num_input_tokens_seen": 157675808, + "step": 73070 + }, + { + "epoch": 11.920880913539968, + "grad_norm": 0.010449071414768696, + "learning_rate": 0.0004199975742993585, + "loss": 0.0134, + "num_input_tokens_seen": 157687392, + "step": 73075 + }, + { + "epoch": 11.921696574225122, + "grad_norm": 0.08574571460485458, + "learning_rate": 0.0004199273123522044, + "loss": 0.0162, + "num_input_tokens_seen": 157699040, + "step": 73080 + }, + { + "epoch": 11.922512234910277, + "grad_norm": 0.005719189066439867, + "learning_rate": 0.00041985705202782464, + "loss": 0.0161, + "num_input_tokens_seen": 157710048, + "step": 73085 + }, + { + "epoch": 11.923327895595433, + "grad_norm": 0.019270513206720352, + "learning_rate": 0.00041978679332764366, + "loss": 0.0305, + "num_input_tokens_seen": 157722144, + "step": 73090 + }, + { + "epoch": 11.924143556280587, + "grad_norm": 0.0397915355861187, + "learning_rate": 0.0004197165362530848, + "loss": 0.0226, + "num_input_tokens_seen": 157734112, + "step": 73095 + }, + { + "epoch": 11.924959216965743, + "grad_norm": 0.025323286652565002, + "learning_rate": 0.00041964628080557224, + "loss": 0.0189, + "num_input_tokens_seen": 157743776, + "step": 73100 + }, + { + "epoch": 11.925774877650896, + "grad_norm": 0.0066994898952543736, + "learning_rate": 0.0004195760269865299, + "loss": 0.0507, + "num_input_tokens_seen": 157754496, + "step": 73105 + }, + { + "epoch": 11.926590538336052, + "grad_norm": 0.004962735343724489, + "learning_rate": 0.0004195057747973812, + "loss": 0.0047, + "num_input_tokens_seen": 157764928, + "step": 73110 + }, + { + "epoch": 11.927406199021208, + "grad_norm": 0.13575312495231628, + "learning_rate": 0.0004194355242395503, + "loss": 0.0114, + "num_input_tokens_seen": 157775104, + "step": 73115 + }, + { + "epoch": 11.928221859706362, + "grad_norm": 0.403140127658844, + "learning_rate": 0.00041936527531446046, + "loss": 0.1214, + "num_input_tokens_seen": 157785984, + "step": 73120 + }, + { + "epoch": 11.929037520391518, + "grad_norm": 0.0031369165517389774, + "learning_rate": 0.0004192950280235359, + "loss": 0.0134, + "num_input_tokens_seen": 157796704, + "step": 73125 + }, + { + "epoch": 11.929853181076671, + "grad_norm": 0.034159477800130844, + "learning_rate": 0.0004192247823681997, + "loss": 0.006, + "num_input_tokens_seen": 157808288, + "step": 73130 + }, + { + "epoch": 11.930668841761827, + "grad_norm": 0.0014229965163394809, + "learning_rate": 0.00041915453834987594, + "loss": 0.0054, + "num_input_tokens_seen": 157818848, + "step": 73135 + }, + { + "epoch": 11.931484502446983, + "grad_norm": 0.05739206820726395, + "learning_rate": 0.0004190842959699879, + "loss": 0.0113, + "num_input_tokens_seen": 157830240, + "step": 73140 + }, + { + "epoch": 11.932300163132137, + "grad_norm": 0.007996790111064911, + "learning_rate": 0.0004190140552299593, + "loss": 0.0123, + "num_input_tokens_seen": 157841632, + "step": 73145 + }, + { + "epoch": 11.933115823817293, + "grad_norm": 0.06965608149766922, + "learning_rate": 0.0004189438161312136, + "loss": 0.0903, + "num_input_tokens_seen": 157854048, + "step": 73150 + }, + { + "epoch": 11.933931484502446, + "grad_norm": 0.0012895826948806643, + "learning_rate": 0.00041887357867517435, + "loss": 0.0051, + "num_input_tokens_seen": 157866112, + "step": 73155 + }, + { + "epoch": 11.934747145187602, + "grad_norm": 0.029853580519557, + "learning_rate": 0.0004188033428632649, + "loss": 0.008, + "num_input_tokens_seen": 157877984, + "step": 73160 + }, + { + "epoch": 11.935562805872756, + "grad_norm": 0.0022051497362554073, + "learning_rate": 0.00041873310869690875, + "loss": 0.0112, + "num_input_tokens_seen": 157889536, + "step": 73165 + }, + { + "epoch": 11.936378466557912, + "grad_norm": 0.004599535372108221, + "learning_rate": 0.00041866287617752906, + "loss": 0.0205, + "num_input_tokens_seen": 157899936, + "step": 73170 + }, + { + "epoch": 11.937194127243067, + "grad_norm": 0.003711460391059518, + "learning_rate": 0.0004185926453065496, + "loss": 0.0035, + "num_input_tokens_seen": 157909728, + "step": 73175 + }, + { + "epoch": 11.938009787928221, + "grad_norm": 0.34563347697257996, + "learning_rate": 0.0004185224160853933, + "loss": 0.0624, + "num_input_tokens_seen": 157920768, + "step": 73180 + }, + { + "epoch": 11.938825448613377, + "grad_norm": 0.006612342316657305, + "learning_rate": 0.00041845218851548375, + "loss": 0.0774, + "num_input_tokens_seen": 157931104, + "step": 73185 + }, + { + "epoch": 11.939641109298531, + "grad_norm": 0.003344194032251835, + "learning_rate": 0.0004183819625982439, + "loss": 0.0205, + "num_input_tokens_seen": 157941984, + "step": 73190 + }, + { + "epoch": 11.940456769983687, + "grad_norm": 0.017757335677742958, + "learning_rate": 0.0004183117383350973, + "loss": 0.0045, + "num_input_tokens_seen": 157952512, + "step": 73195 + }, + { + "epoch": 11.941272430668842, + "grad_norm": 0.03641846776008606, + "learning_rate": 0.0004182415157274668, + "loss": 0.0062, + "num_input_tokens_seen": 157963584, + "step": 73200 + }, + { + "epoch": 11.942088091353996, + "grad_norm": 0.04135163500905037, + "learning_rate": 0.00041817129477677564, + "loss": 0.0396, + "num_input_tokens_seen": 157975200, + "step": 73205 + }, + { + "epoch": 11.942903752039152, + "grad_norm": 0.06931928545236588, + "learning_rate": 0.0004181010754844472, + "loss": 0.018, + "num_input_tokens_seen": 157985696, + "step": 73210 + }, + { + "epoch": 11.943719412724306, + "grad_norm": 0.4212145507335663, + "learning_rate": 0.00041803085785190416, + "loss": 0.0642, + "num_input_tokens_seen": 157996896, + "step": 73215 + }, + { + "epoch": 11.944535073409462, + "grad_norm": 0.1402944177389145, + "learning_rate": 0.00041796064188057, + "loss": 0.022, + "num_input_tokens_seen": 158009216, + "step": 73220 + }, + { + "epoch": 11.945350734094617, + "grad_norm": 0.0017296988517045975, + "learning_rate": 0.00041789042757186726, + "loss": 0.0083, + "num_input_tokens_seen": 158020512, + "step": 73225 + }, + { + "epoch": 11.946166394779771, + "grad_norm": 0.0005481366533786058, + "learning_rate": 0.00041782021492721937, + "loss": 0.008, + "num_input_tokens_seen": 158030432, + "step": 73230 + }, + { + "epoch": 11.946982055464927, + "grad_norm": 1.5912744998931885, + "learning_rate": 0.00041775000394804896, + "loss": 0.0387, + "num_input_tokens_seen": 158040576, + "step": 73235 + }, + { + "epoch": 11.947797716150081, + "grad_norm": 0.0009132567211054265, + "learning_rate": 0.0004176797946357792, + "loss": 0.0586, + "num_input_tokens_seen": 158052032, + "step": 73240 + }, + { + "epoch": 11.948613376835237, + "grad_norm": 0.6362619400024414, + "learning_rate": 0.00041760958699183263, + "loss": 0.0834, + "num_input_tokens_seen": 158062016, + "step": 73245 + }, + { + "epoch": 11.949429037520392, + "grad_norm": 0.00650503346696496, + "learning_rate": 0.0004175393810176325, + "loss": 0.0073, + "num_input_tokens_seen": 158072384, + "step": 73250 + }, + { + "epoch": 11.950244698205546, + "grad_norm": 0.003973199520260096, + "learning_rate": 0.00041746917671460124, + "loss": 0.0144, + "num_input_tokens_seen": 158083712, + "step": 73255 + }, + { + "epoch": 11.951060358890702, + "grad_norm": 0.07973134517669678, + "learning_rate": 0.000417398974084162, + "loss": 0.1344, + "num_input_tokens_seen": 158093664, + "step": 73260 + }, + { + "epoch": 11.951876019575856, + "grad_norm": 0.003538121236488223, + "learning_rate": 0.0004173287731277371, + "loss": 0.0355, + "num_input_tokens_seen": 158104352, + "step": 73265 + }, + { + "epoch": 11.952691680261012, + "grad_norm": 0.0013759384164586663, + "learning_rate": 0.00041725857384674974, + "loss": 0.231, + "num_input_tokens_seen": 158114240, + "step": 73270 + }, + { + "epoch": 11.953507340946166, + "grad_norm": 0.008020886220037937, + "learning_rate": 0.0004171883762426221, + "loss": 0.0097, + "num_input_tokens_seen": 158124704, + "step": 73275 + }, + { + "epoch": 11.954323001631321, + "grad_norm": 0.14164352416992188, + "learning_rate": 0.00041711818031677737, + "loss": 0.0186, + "num_input_tokens_seen": 158135136, + "step": 73280 + }, + { + "epoch": 11.955138662316477, + "grad_norm": 0.02433096058666706, + "learning_rate": 0.00041704798607063756, + "loss": 0.0862, + "num_input_tokens_seen": 158145024, + "step": 73285 + }, + { + "epoch": 11.955954323001631, + "grad_norm": 0.02682577632367611, + "learning_rate": 0.0004169777935056257, + "loss": 0.0112, + "num_input_tokens_seen": 158155040, + "step": 73290 + }, + { + "epoch": 11.956769983686787, + "grad_norm": 0.006712156813591719, + "learning_rate": 0.00041690760262316415, + "loss": 0.0916, + "num_input_tokens_seen": 158166208, + "step": 73295 + }, + { + "epoch": 11.95758564437194, + "grad_norm": 0.0030801750253885984, + "learning_rate": 0.0004168374134246754, + "loss": 0.0581, + "num_input_tokens_seen": 158176608, + "step": 73300 + }, + { + "epoch": 11.958401305057096, + "grad_norm": 0.01697118952870369, + "learning_rate": 0.000416767225911582, + "loss": 0.0118, + "num_input_tokens_seen": 158187552, + "step": 73305 + }, + { + "epoch": 11.959216965742252, + "grad_norm": 0.12180114537477493, + "learning_rate": 0.0004166970400853064, + "loss": 0.0104, + "num_input_tokens_seen": 158199136, + "step": 73310 + }, + { + "epoch": 11.960032626427406, + "grad_norm": 0.005208475515246391, + "learning_rate": 0.00041662685594727076, + "loss": 0.0242, + "num_input_tokens_seen": 158209600, + "step": 73315 + }, + { + "epoch": 11.960848287112562, + "grad_norm": 0.6188380718231201, + "learning_rate": 0.0004165566734988979, + "loss": 0.0977, + "num_input_tokens_seen": 158220544, + "step": 73320 + }, + { + "epoch": 11.961663947797716, + "grad_norm": 0.027940770611166954, + "learning_rate": 0.00041648649274160976, + "loss": 0.0038, + "num_input_tokens_seen": 158231456, + "step": 73325 + }, + { + "epoch": 11.962479608482871, + "grad_norm": 0.012904273346066475, + "learning_rate": 0.0004164163136768289, + "loss": 0.0064, + "num_input_tokens_seen": 158241920, + "step": 73330 + }, + { + "epoch": 11.963295269168025, + "grad_norm": 0.0866921991109848, + "learning_rate": 0.0004163461363059774, + "loss": 0.0385, + "num_input_tokens_seen": 158251360, + "step": 73335 + }, + { + "epoch": 11.964110929853181, + "grad_norm": 0.00654540304094553, + "learning_rate": 0.00041627596063047753, + "loss": 0.0041, + "num_input_tokens_seen": 158262528, + "step": 73340 + }, + { + "epoch": 11.964926590538337, + "grad_norm": 0.5953215956687927, + "learning_rate": 0.00041620578665175166, + "loss": 0.1128, + "num_input_tokens_seen": 158272160, + "step": 73345 + }, + { + "epoch": 11.96574225122349, + "grad_norm": 0.004743333440274, + "learning_rate": 0.00041613561437122163, + "loss": 0.1217, + "num_input_tokens_seen": 158282176, + "step": 73350 + }, + { + "epoch": 11.966557911908646, + "grad_norm": 0.07933826744556427, + "learning_rate": 0.0004160654437903101, + "loss": 0.0072, + "num_input_tokens_seen": 158291136, + "step": 73355 + }, + { + "epoch": 11.9673735725938, + "grad_norm": 0.03977036476135254, + "learning_rate": 0.0004159952749104385, + "loss": 0.0062, + "num_input_tokens_seen": 158301184, + "step": 73360 + }, + { + "epoch": 11.968189233278956, + "grad_norm": 0.2224341779947281, + "learning_rate": 0.00041592510773302946, + "loss": 0.0337, + "num_input_tokens_seen": 158311936, + "step": 73365 + }, + { + "epoch": 11.969004893964112, + "grad_norm": 0.03574329987168312, + "learning_rate": 0.0004158549422595045, + "loss": 0.1161, + "num_input_tokens_seen": 158323456, + "step": 73370 + }, + { + "epoch": 11.969820554649266, + "grad_norm": 0.05659911409020424, + "learning_rate": 0.0004157847784912861, + "loss": 0.0086, + "num_input_tokens_seen": 158333568, + "step": 73375 + }, + { + "epoch": 11.970636215334421, + "grad_norm": 0.004377374425530434, + "learning_rate": 0.0004157146164297959, + "loss": 0.0025, + "num_input_tokens_seen": 158343232, + "step": 73380 + }, + { + "epoch": 11.971451876019575, + "grad_norm": 0.0030553217511624098, + "learning_rate": 0.00041564445607645607, + "loss": 0.0497, + "num_input_tokens_seen": 158354912, + "step": 73385 + }, + { + "epoch": 11.97226753670473, + "grad_norm": 0.013683933764696121, + "learning_rate": 0.0004155742974326881, + "loss": 0.0456, + "num_input_tokens_seen": 158364992, + "step": 73390 + }, + { + "epoch": 11.973083197389887, + "grad_norm": 0.3621150255203247, + "learning_rate": 0.00041550414049991435, + "loss": 0.0657, + "num_input_tokens_seen": 158375904, + "step": 73395 + }, + { + "epoch": 11.97389885807504, + "grad_norm": 0.028863178566098213, + "learning_rate": 0.0004154339852795562, + "loss": 0.0054, + "num_input_tokens_seen": 158386752, + "step": 73400 + }, + { + "epoch": 11.974714518760196, + "grad_norm": 0.0013639982789754868, + "learning_rate": 0.0004153638317730358, + "loss": 0.0344, + "num_input_tokens_seen": 158397984, + "step": 73405 + }, + { + "epoch": 11.97553017944535, + "grad_norm": 0.022834187373518944, + "learning_rate": 0.00041529367998177446, + "loss": 0.0818, + "num_input_tokens_seen": 158409440, + "step": 73410 + }, + { + "epoch": 11.976345840130506, + "grad_norm": 0.002542684553191066, + "learning_rate": 0.00041522352990719434, + "loss": 0.0014, + "num_input_tokens_seen": 158421152, + "step": 73415 + }, + { + "epoch": 11.977161500815662, + "grad_norm": 0.18985708057880402, + "learning_rate": 0.0004151533815507168, + "loss": 0.0345, + "num_input_tokens_seen": 158430848, + "step": 73420 + }, + { + "epoch": 11.977977161500815, + "grad_norm": 0.19147111475467682, + "learning_rate": 0.00041508323491376364, + "loss": 0.0966, + "num_input_tokens_seen": 158441728, + "step": 73425 + }, + { + "epoch": 11.978792822185971, + "grad_norm": 0.8241161704063416, + "learning_rate": 0.00041501308999775664, + "loss": 0.0896, + "num_input_tokens_seen": 158452928, + "step": 73430 + }, + { + "epoch": 11.979608482871125, + "grad_norm": 0.01721026562154293, + "learning_rate": 0.00041494294680411695, + "loss": 0.0344, + "num_input_tokens_seen": 158465152, + "step": 73435 + }, + { + "epoch": 11.98042414355628, + "grad_norm": 0.2673587203025818, + "learning_rate": 0.0004148728053342665, + "loss": 0.0308, + "num_input_tokens_seen": 158476160, + "step": 73440 + }, + { + "epoch": 11.981239804241435, + "grad_norm": 0.009005645290017128, + "learning_rate": 0.0004148026655896265, + "loss": 0.0309, + "num_input_tokens_seen": 158486816, + "step": 73445 + }, + { + "epoch": 11.98205546492659, + "grad_norm": 0.010860797949135303, + "learning_rate": 0.0004147325275716188, + "loss": 0.0213, + "num_input_tokens_seen": 158497376, + "step": 73450 + }, + { + "epoch": 11.982871125611746, + "grad_norm": 0.002967157866805792, + "learning_rate": 0.00041466239128166435, + "loss": 0.0606, + "num_input_tokens_seen": 158509312, + "step": 73455 + }, + { + "epoch": 11.9836867862969, + "grad_norm": 0.01825196109712124, + "learning_rate": 0.00041459225672118487, + "loss": 0.1441, + "num_input_tokens_seen": 158519424, + "step": 73460 + }, + { + "epoch": 11.984502446982056, + "grad_norm": 0.0034720024559646845, + "learning_rate": 0.0004145221238916017, + "loss": 0.0446, + "num_input_tokens_seen": 158529536, + "step": 73465 + }, + { + "epoch": 11.98531810766721, + "grad_norm": 0.036982521414756775, + "learning_rate": 0.0004144519927943361, + "loss": 0.1041, + "num_input_tokens_seen": 158540096, + "step": 73470 + }, + { + "epoch": 11.986133768352365, + "grad_norm": 0.009317529387772083, + "learning_rate": 0.0004143818634308094, + "loss": 0.0092, + "num_input_tokens_seen": 158551328, + "step": 73475 + }, + { + "epoch": 11.986949429037521, + "grad_norm": 0.04151898995041847, + "learning_rate": 0.00041431173580244284, + "loss": 0.0111, + "num_input_tokens_seen": 158562720, + "step": 73480 + }, + { + "epoch": 11.987765089722675, + "grad_norm": 0.0044968039728701115, + "learning_rate": 0.0004142416099106576, + "loss": 0.2475, + "num_input_tokens_seen": 158574208, + "step": 73485 + }, + { + "epoch": 11.98858075040783, + "grad_norm": 0.07000982761383057, + "learning_rate": 0.0004141714857568751, + "loss": 0.0746, + "num_input_tokens_seen": 158586752, + "step": 73490 + }, + { + "epoch": 11.989396411092985, + "grad_norm": 0.0032821393106132746, + "learning_rate": 0.0004141013633425161, + "loss": 0.0042, + "num_input_tokens_seen": 158597088, + "step": 73495 + }, + { + "epoch": 11.99021207177814, + "grad_norm": 0.003983738832175732, + "learning_rate": 0.0004140312426690022, + "loss": 0.0119, + "num_input_tokens_seen": 158608320, + "step": 73500 + }, + { + "epoch": 11.991027732463294, + "grad_norm": 0.432071715593338, + "learning_rate": 0.000413961123737754, + "loss": 0.0376, + "num_input_tokens_seen": 158618976, + "step": 73505 + }, + { + "epoch": 11.99184339314845, + "grad_norm": 0.0038938208017498255, + "learning_rate": 0.00041389100655019295, + "loss": 0.0077, + "num_input_tokens_seen": 158630208, + "step": 73510 + }, + { + "epoch": 11.992659053833606, + "grad_norm": 0.0700301080942154, + "learning_rate": 0.00041382089110773975, + "loss": 0.022, + "num_input_tokens_seen": 158641600, + "step": 73515 + }, + { + "epoch": 11.99347471451876, + "grad_norm": 0.07871358841657639, + "learning_rate": 0.00041375077741181564, + "loss": 0.0105, + "num_input_tokens_seen": 158651840, + "step": 73520 + }, + { + "epoch": 11.994290375203915, + "grad_norm": 0.2408740371465683, + "learning_rate": 0.0004136806654638413, + "loss": 0.0349, + "num_input_tokens_seen": 158662912, + "step": 73525 + }, + { + "epoch": 11.99510603588907, + "grad_norm": 0.0064306301064789295, + "learning_rate": 0.0004136105552652377, + "loss": 0.0159, + "num_input_tokens_seen": 158673216, + "step": 73530 + }, + { + "epoch": 11.995921696574225, + "grad_norm": 0.09198891371488571, + "learning_rate": 0.0004135404468174261, + "loss": 0.0394, + "num_input_tokens_seen": 158684320, + "step": 73535 + }, + { + "epoch": 11.99673735725938, + "grad_norm": 0.4025056064128876, + "learning_rate": 0.0004134703401218268, + "loss": 0.2562, + "num_input_tokens_seen": 158695424, + "step": 73540 + }, + { + "epoch": 11.997553017944535, + "grad_norm": 0.047091420739889145, + "learning_rate": 0.00041340023517986096, + "loss": 0.0616, + "num_input_tokens_seen": 158706240, + "step": 73545 + }, + { + "epoch": 11.99836867862969, + "grad_norm": 0.013050161302089691, + "learning_rate": 0.00041333013199294907, + "loss": 0.0258, + "num_input_tokens_seen": 158716864, + "step": 73550 + }, + { + "epoch": 11.999184339314844, + "grad_norm": 0.0068448130041360855, + "learning_rate": 0.0004132600305625122, + "loss": 0.0269, + "num_input_tokens_seen": 158727616, + "step": 73555 + }, + { + "epoch": 12.0, + "grad_norm": 0.009456095285713673, + "learning_rate": 0.0004131899308899706, + "loss": 0.1417, + "num_input_tokens_seen": 158737232, + "step": 73560 + }, + { + "epoch": 12.0, + "eval_loss": 0.14715120196342468, + "eval_runtime": 104.4884, + "eval_samples_per_second": 26.079, + "eval_steps_per_second": 6.527, + "num_input_tokens_seen": 158737232, + "step": 73560 + }, + { + "epoch": 12.000815660685156, + "grad_norm": 0.009924051351845264, + "learning_rate": 0.00041311983297674545, + "loss": 0.0121, + "num_input_tokens_seen": 158748016, + "step": 73565 + }, + { + "epoch": 12.00163132137031, + "grad_norm": 0.007379444316029549, + "learning_rate": 0.00041304973682425685, + "loss": 0.0123, + "num_input_tokens_seen": 158758480, + "step": 73570 + }, + { + "epoch": 12.002446982055465, + "grad_norm": 0.013056386262178421, + "learning_rate": 0.00041297964243392583, + "loss": 0.0204, + "num_input_tokens_seen": 158768688, + "step": 73575 + }, + { + "epoch": 12.00326264274062, + "grad_norm": 0.014037344604730606, + "learning_rate": 0.0004129095498071726, + "loss": 0.0077, + "num_input_tokens_seen": 158780016, + "step": 73580 + }, + { + "epoch": 12.004078303425775, + "grad_norm": 0.012845704331994057, + "learning_rate": 0.000412839458945418, + "loss": 0.1494, + "num_input_tokens_seen": 158790832, + "step": 73585 + }, + { + "epoch": 12.00489396411093, + "grad_norm": 0.12016411870718002, + "learning_rate": 0.0004127693698500821, + "loss": 0.0133, + "num_input_tokens_seen": 158801680, + "step": 73590 + }, + { + "epoch": 12.005709624796085, + "grad_norm": 0.15560196340084076, + "learning_rate": 0.0004126992825225858, + "loss": 0.0168, + "num_input_tokens_seen": 158811760, + "step": 73595 + }, + { + "epoch": 12.00652528548124, + "grad_norm": 0.00793515145778656, + "learning_rate": 0.00041262919696434915, + "loss": 0.0182, + "num_input_tokens_seen": 158821392, + "step": 73600 + }, + { + "epoch": 12.007340946166394, + "grad_norm": 0.14866739511489868, + "learning_rate": 0.0004125591131767927, + "loss": 0.0227, + "num_input_tokens_seen": 158832176, + "step": 73605 + }, + { + "epoch": 12.00815660685155, + "grad_norm": 0.009864246472716331, + "learning_rate": 0.00041248903116133674, + "loss": 0.023, + "num_input_tokens_seen": 158842192, + "step": 73610 + }, + { + "epoch": 12.008972267536704, + "grad_norm": 0.030815215781331062, + "learning_rate": 0.0004124189509194016, + "loss": 0.0105, + "num_input_tokens_seen": 158853456, + "step": 73615 + }, + { + "epoch": 12.00978792822186, + "grad_norm": 0.11526378989219666, + "learning_rate": 0.00041234887245240756, + "loss": 0.0129, + "num_input_tokens_seen": 158863952, + "step": 73620 + }, + { + "epoch": 12.010603588907015, + "grad_norm": 0.023073462769389153, + "learning_rate": 0.00041227879576177475, + "loss": 0.0096, + "num_input_tokens_seen": 158875024, + "step": 73625 + }, + { + "epoch": 12.01141924959217, + "grad_norm": 0.036314450204372406, + "learning_rate": 0.00041220872084892337, + "loss": 0.0085, + "num_input_tokens_seen": 158884624, + "step": 73630 + }, + { + "epoch": 12.012234910277325, + "grad_norm": 0.036202993243932724, + "learning_rate": 0.00041213864771527366, + "loss": 0.0084, + "num_input_tokens_seen": 158895664, + "step": 73635 + }, + { + "epoch": 12.013050570962479, + "grad_norm": 0.02561865746974945, + "learning_rate": 0.0004120685763622458, + "loss": 0.1032, + "num_input_tokens_seen": 158906576, + "step": 73640 + }, + { + "epoch": 12.013866231647635, + "grad_norm": 0.12004603445529938, + "learning_rate": 0.00041199850679125974, + "loss": 0.0228, + "num_input_tokens_seen": 158917648, + "step": 73645 + }, + { + "epoch": 12.01468189233279, + "grad_norm": 0.32851067185401917, + "learning_rate": 0.0004119284390037356, + "loss": 0.028, + "num_input_tokens_seen": 158927664, + "step": 73650 + }, + { + "epoch": 12.015497553017944, + "grad_norm": 0.015434769913554192, + "learning_rate": 0.00041185837300109326, + "loss": 0.1845, + "num_input_tokens_seen": 158939280, + "step": 73655 + }, + { + "epoch": 12.0163132137031, + "grad_norm": 0.004952901974320412, + "learning_rate": 0.00041178830878475304, + "loss": 0.0037, + "num_input_tokens_seen": 158949456, + "step": 73660 + }, + { + "epoch": 12.017128874388254, + "grad_norm": 0.054199304431676865, + "learning_rate": 0.00041171824635613443, + "loss": 0.0214, + "num_input_tokens_seen": 158960944, + "step": 73665 + }, + { + "epoch": 12.01794453507341, + "grad_norm": 0.13922037184238434, + "learning_rate": 0.00041164818571665774, + "loss": 0.0144, + "num_input_tokens_seen": 158971056, + "step": 73670 + }, + { + "epoch": 12.018760195758565, + "grad_norm": 0.00358504056930542, + "learning_rate": 0.00041157812686774245, + "loss": 0.013, + "num_input_tokens_seen": 158982544, + "step": 73675 + }, + { + "epoch": 12.01957585644372, + "grad_norm": 0.0007545426487922668, + "learning_rate": 0.0004115080698108088, + "loss": 0.0038, + "num_input_tokens_seen": 158993552, + "step": 73680 + }, + { + "epoch": 12.020391517128875, + "grad_norm": 0.0022474394645541906, + "learning_rate": 0.0004114380145472761, + "loss": 0.0469, + "num_input_tokens_seen": 159003824, + "step": 73685 + }, + { + "epoch": 12.021207177814029, + "grad_norm": 0.0064369384199380875, + "learning_rate": 0.00041136796107856465, + "loss": 0.0278, + "num_input_tokens_seen": 159014736, + "step": 73690 + }, + { + "epoch": 12.022022838499185, + "grad_norm": 0.056286245584487915, + "learning_rate": 0.00041129790940609375, + "loss": 0.0501, + "num_input_tokens_seen": 159026448, + "step": 73695 + }, + { + "epoch": 12.022838499184338, + "grad_norm": 0.0029463996179401875, + "learning_rate": 0.0004112278595312834, + "loss": 0.0339, + "num_input_tokens_seen": 159037392, + "step": 73700 + }, + { + "epoch": 12.023654159869494, + "grad_norm": 0.0050112600438296795, + "learning_rate": 0.00041115781145555286, + "loss": 0.0778, + "num_input_tokens_seen": 159046064, + "step": 73705 + }, + { + "epoch": 12.02446982055465, + "grad_norm": 0.0011639490257948637, + "learning_rate": 0.0004110877651803222, + "loss": 0.0069, + "num_input_tokens_seen": 159057360, + "step": 73710 + }, + { + "epoch": 12.025285481239804, + "grad_norm": 0.15059702098369598, + "learning_rate": 0.0004110177207070106, + "loss": 0.0142, + "num_input_tokens_seen": 159067216, + "step": 73715 + }, + { + "epoch": 12.02610114192496, + "grad_norm": 0.005269485060125589, + "learning_rate": 0.0004109476780370379, + "loss": 0.0275, + "num_input_tokens_seen": 159077680, + "step": 73720 + }, + { + "epoch": 12.026916802610113, + "grad_norm": 0.15546227991580963, + "learning_rate": 0.00041087763717182336, + "loss": 0.0132, + "num_input_tokens_seen": 159087856, + "step": 73725 + }, + { + "epoch": 12.02773246329527, + "grad_norm": 0.032911963760852814, + "learning_rate": 0.00041080759811278674, + "loss": 0.0138, + "num_input_tokens_seen": 159099376, + "step": 73730 + }, + { + "epoch": 12.028548123980425, + "grad_norm": 0.011458617635071278, + "learning_rate": 0.00041073756086134705, + "loss": 0.0209, + "num_input_tokens_seen": 159111184, + "step": 73735 + }, + { + "epoch": 12.029363784665579, + "grad_norm": 0.06191623955965042, + "learning_rate": 0.00041066752541892395, + "loss": 0.0091, + "num_input_tokens_seen": 159122608, + "step": 73740 + }, + { + "epoch": 12.030179445350734, + "grad_norm": 0.008254610002040863, + "learning_rate": 0.000410597491786937, + "loss": 0.0221, + "num_input_tokens_seen": 159134160, + "step": 73745 + }, + { + "epoch": 12.030995106035888, + "grad_norm": 0.002851321129128337, + "learning_rate": 0.0004105274599668051, + "loss": 0.0027, + "num_input_tokens_seen": 159144912, + "step": 73750 + }, + { + "epoch": 12.031810766721044, + "grad_norm": 0.23893363773822784, + "learning_rate": 0.00041045742995994783, + "loss": 0.0183, + "num_input_tokens_seen": 159155984, + "step": 73755 + }, + { + "epoch": 12.0326264274062, + "grad_norm": 0.01134881004691124, + "learning_rate": 0.0004103874017677842, + "loss": 0.0049, + "num_input_tokens_seen": 159167088, + "step": 73760 + }, + { + "epoch": 12.033442088091354, + "grad_norm": 0.13744163513183594, + "learning_rate": 0.0004103173753917337, + "loss": 0.0181, + "num_input_tokens_seen": 159177840, + "step": 73765 + }, + { + "epoch": 12.03425774877651, + "grad_norm": 0.020513705909252167, + "learning_rate": 0.0004102473508332153, + "loss": 0.0072, + "num_input_tokens_seen": 159189360, + "step": 73770 + }, + { + "epoch": 12.035073409461663, + "grad_norm": 0.002225430915132165, + "learning_rate": 0.00041017732809364824, + "loss": 0.0165, + "num_input_tokens_seen": 159199600, + "step": 73775 + }, + { + "epoch": 12.035889070146819, + "grad_norm": 0.044950198382139206, + "learning_rate": 0.00041010730717445156, + "loss": 0.0452, + "num_input_tokens_seen": 159210832, + "step": 73780 + }, + { + "epoch": 12.036704730831975, + "grad_norm": 0.03426850587129593, + "learning_rate": 0.00041003728807704435, + "loss": 0.0092, + "num_input_tokens_seen": 159219984, + "step": 73785 + }, + { + "epoch": 12.037520391517129, + "grad_norm": 0.4478464722633362, + "learning_rate": 0.00040996727080284555, + "loss": 0.1558, + "num_input_tokens_seen": 159230608, + "step": 73790 + }, + { + "epoch": 12.038336052202284, + "grad_norm": 0.019000953063368797, + "learning_rate": 0.0004098972553532743, + "loss": 0.0029, + "num_input_tokens_seen": 159240784, + "step": 73795 + }, + { + "epoch": 12.039151712887438, + "grad_norm": 0.06745372712612152, + "learning_rate": 0.00040982724172974926, + "loss": 0.0306, + "num_input_tokens_seen": 159251312, + "step": 73800 + }, + { + "epoch": 12.039967373572594, + "grad_norm": 0.014296756125986576, + "learning_rate": 0.0004097572299336899, + "loss": 0.1243, + "num_input_tokens_seen": 159262384, + "step": 73805 + }, + { + "epoch": 12.040783034257748, + "grad_norm": 0.09382767975330353, + "learning_rate": 0.00040968721996651445, + "loss": 0.0093, + "num_input_tokens_seen": 159273552, + "step": 73810 + }, + { + "epoch": 12.041598694942904, + "grad_norm": 0.08110155910253525, + "learning_rate": 0.00040961721182964235, + "loss": 0.031, + "num_input_tokens_seen": 159284080, + "step": 73815 + }, + { + "epoch": 12.04241435562806, + "grad_norm": 0.00540222879499197, + "learning_rate": 0.00040954720552449186, + "loss": 0.0051, + "num_input_tokens_seen": 159294256, + "step": 73820 + }, + { + "epoch": 12.043230016313213, + "grad_norm": 0.016168948262929916, + "learning_rate": 0.0004094772010524822, + "loss": 0.0229, + "num_input_tokens_seen": 159303568, + "step": 73825 + }, + { + "epoch": 12.044045676998369, + "grad_norm": 0.013034249655902386, + "learning_rate": 0.0004094071984150317, + "loss": 0.0229, + "num_input_tokens_seen": 159314576, + "step": 73830 + }, + { + "epoch": 12.044861337683523, + "grad_norm": 0.003740129992365837, + "learning_rate": 0.0004093371976135595, + "loss": 0.0027, + "num_input_tokens_seen": 159326480, + "step": 73835 + }, + { + "epoch": 12.045676998368679, + "grad_norm": 0.00436998438090086, + "learning_rate": 0.0004092671986494837, + "loss": 0.0075, + "num_input_tokens_seen": 159336592, + "step": 73840 + }, + { + "epoch": 12.046492659053834, + "grad_norm": 0.043665919452905655, + "learning_rate": 0.00040919720152422323, + "loss": 0.0045, + "num_input_tokens_seen": 159347536, + "step": 73845 + }, + { + "epoch": 12.047308319738988, + "grad_norm": 0.021763140335679054, + "learning_rate": 0.00040912720623919696, + "loss": 0.0257, + "num_input_tokens_seen": 159359184, + "step": 73850 + }, + { + "epoch": 12.048123980424144, + "grad_norm": 0.0062382942996919155, + "learning_rate": 0.00040905721279582284, + "loss": 0.0647, + "num_input_tokens_seen": 159371920, + "step": 73855 + }, + { + "epoch": 12.048939641109298, + "grad_norm": 0.15809552371501923, + "learning_rate": 0.00040898722119551994, + "loss": 0.0226, + "num_input_tokens_seen": 159382480, + "step": 73860 + }, + { + "epoch": 12.049755301794454, + "grad_norm": 0.05534229800105095, + "learning_rate": 0.0004089172314397063, + "loss": 0.0196, + "num_input_tokens_seen": 159393392, + "step": 73865 + }, + { + "epoch": 12.05057096247961, + "grad_norm": 0.058783091604709625, + "learning_rate": 0.00040884724352980065, + "loss": 0.0135, + "num_input_tokens_seen": 159404656, + "step": 73870 + }, + { + "epoch": 12.051386623164763, + "grad_norm": 0.01652185432612896, + "learning_rate": 0.00040877725746722097, + "loss": 0.0131, + "num_input_tokens_seen": 159415056, + "step": 73875 + }, + { + "epoch": 12.052202283849919, + "grad_norm": 0.0007877106545493007, + "learning_rate": 0.0004087072732533862, + "loss": 0.0126, + "num_input_tokens_seen": 159426960, + "step": 73880 + }, + { + "epoch": 12.053017944535073, + "grad_norm": 0.0052886544726789, + "learning_rate": 0.0004086372908897141, + "loss": 0.0041, + "num_input_tokens_seen": 159438384, + "step": 73885 + }, + { + "epoch": 12.053833605220229, + "grad_norm": 0.17754797637462616, + "learning_rate": 0.0004085673103776234, + "loss": 0.0171, + "num_input_tokens_seen": 159448016, + "step": 73890 + }, + { + "epoch": 12.054649265905383, + "grad_norm": 0.027082880958914757, + "learning_rate": 0.000408497331718532, + "loss": 0.0058, + "num_input_tokens_seen": 159459568, + "step": 73895 + }, + { + "epoch": 12.055464926590538, + "grad_norm": 0.018996762111783028, + "learning_rate": 0.0004084273549138584, + "loss": 0.0054, + "num_input_tokens_seen": 159469680, + "step": 73900 + }, + { + "epoch": 12.056280587275694, + "grad_norm": 0.17834335565567017, + "learning_rate": 0.0004083573799650204, + "loss": 0.0091, + "num_input_tokens_seen": 159480464, + "step": 73905 + }, + { + "epoch": 12.057096247960848, + "grad_norm": 0.09368149936199188, + "learning_rate": 0.00040828740687343654, + "loss": 0.0122, + "num_input_tokens_seen": 159489840, + "step": 73910 + }, + { + "epoch": 12.057911908646004, + "grad_norm": 0.5318507552146912, + "learning_rate": 0.0004082174356405247, + "loss": 0.0271, + "num_input_tokens_seen": 159501168, + "step": 73915 + }, + { + "epoch": 12.058727569331158, + "grad_norm": 0.008883870206773281, + "learning_rate": 0.00040814746626770287, + "loss": 0.0051, + "num_input_tokens_seen": 159512528, + "step": 73920 + }, + { + "epoch": 12.059543230016313, + "grad_norm": 0.0011166409822180867, + "learning_rate": 0.0004080774987563893, + "loss": 0.0937, + "num_input_tokens_seen": 159523376, + "step": 73925 + }, + { + "epoch": 12.060358890701469, + "grad_norm": 0.0011545346351340413, + "learning_rate": 0.0004080075331080017, + "loss": 0.0179, + "num_input_tokens_seen": 159534256, + "step": 73930 + }, + { + "epoch": 12.061174551386623, + "grad_norm": 0.00327668315730989, + "learning_rate": 0.0004079375693239581, + "loss": 0.1064, + "num_input_tokens_seen": 159545072, + "step": 73935 + }, + { + "epoch": 12.061990212071779, + "grad_norm": 0.7087402939796448, + "learning_rate": 0.0004078676074056766, + "loss": 0.144, + "num_input_tokens_seen": 159555024, + "step": 73940 + }, + { + "epoch": 12.062805872756933, + "grad_norm": 0.057137493044137955, + "learning_rate": 0.0004077976473545748, + "loss": 0.0522, + "num_input_tokens_seen": 159566288, + "step": 73945 + }, + { + "epoch": 12.063621533442088, + "grad_norm": 0.004803013987839222, + "learning_rate": 0.0004077276891720707, + "loss": 0.0086, + "num_input_tokens_seen": 159576528, + "step": 73950 + }, + { + "epoch": 12.064437194127244, + "grad_norm": 0.011663583107292652, + "learning_rate": 0.000407657732859582, + "loss": 0.0169, + "num_input_tokens_seen": 159587088, + "step": 73955 + }, + { + "epoch": 12.065252854812398, + "grad_norm": 0.034351348876953125, + "learning_rate": 0.00040758777841852647, + "loss": 0.0184, + "num_input_tokens_seen": 159597648, + "step": 73960 + }, + { + "epoch": 12.066068515497554, + "grad_norm": 0.103532575070858, + "learning_rate": 0.000407517825850322, + "loss": 0.0093, + "num_input_tokens_seen": 159607728, + "step": 73965 + }, + { + "epoch": 12.066884176182707, + "grad_norm": 0.014525679871439934, + "learning_rate": 0.00040744787515638585, + "loss": 0.0041, + "num_input_tokens_seen": 159618224, + "step": 73970 + }, + { + "epoch": 12.067699836867863, + "grad_norm": 0.010570208542048931, + "learning_rate": 0.00040737792633813624, + "loss": 0.0055, + "num_input_tokens_seen": 159629136, + "step": 73975 + }, + { + "epoch": 12.068515497553017, + "grad_norm": 0.03971162438392639, + "learning_rate": 0.00040730797939699014, + "loss": 0.0119, + "num_input_tokens_seen": 159640016, + "step": 73980 + }, + { + "epoch": 12.069331158238173, + "grad_norm": 0.0020236638374626637, + "learning_rate": 0.00040723803433436573, + "loss": 0.0292, + "num_input_tokens_seen": 159651696, + "step": 73985 + }, + { + "epoch": 12.070146818923329, + "grad_norm": 0.0011021002428606153, + "learning_rate": 0.00040716809115167997, + "loss": 0.0119, + "num_input_tokens_seen": 159663536, + "step": 73990 + }, + { + "epoch": 12.070962479608482, + "grad_norm": 0.0029736061114817858, + "learning_rate": 0.0004070981498503508, + "loss": 0.0242, + "num_input_tokens_seen": 159674032, + "step": 73995 + }, + { + "epoch": 12.071778140293638, + "grad_norm": 0.0783427283167839, + "learning_rate": 0.0004070282104317953, + "loss": 0.129, + "num_input_tokens_seen": 159684208, + "step": 74000 + }, + { + "epoch": 12.072593800978792, + "grad_norm": 0.01684621535241604, + "learning_rate": 0.0004069582728974313, + "loss": 0.0744, + "num_input_tokens_seen": 159695408, + "step": 74005 + }, + { + "epoch": 12.073409461663948, + "grad_norm": 0.012730328366160393, + "learning_rate": 0.00040688833724867565, + "loss": 0.0047, + "num_input_tokens_seen": 159706064, + "step": 74010 + }, + { + "epoch": 12.074225122349104, + "grad_norm": 0.33291903138160706, + "learning_rate": 0.0004068184034869462, + "loss": 0.1124, + "num_input_tokens_seen": 159716048, + "step": 74015 + }, + { + "epoch": 12.075040783034257, + "grad_norm": 0.0004434710426721722, + "learning_rate": 0.0004067484716136598, + "loss": 0.0261, + "num_input_tokens_seen": 159726352, + "step": 74020 + }, + { + "epoch": 12.075856443719413, + "grad_norm": 0.2657715678215027, + "learning_rate": 0.00040667854163023415, + "loss": 0.0236, + "num_input_tokens_seen": 159737648, + "step": 74025 + }, + { + "epoch": 12.076672104404567, + "grad_norm": 0.31515979766845703, + "learning_rate": 0.000406608613538086, + "loss": 0.0387, + "num_input_tokens_seen": 159748400, + "step": 74030 + }, + { + "epoch": 12.077487765089723, + "grad_norm": 0.33899086713790894, + "learning_rate": 0.000406538687338633, + "loss": 0.025, + "num_input_tokens_seen": 159760144, + "step": 74035 + }, + { + "epoch": 12.078303425774878, + "grad_norm": 0.12505650520324707, + "learning_rate": 0.0004064687630332919, + "loss": 0.0067, + "num_input_tokens_seen": 159772048, + "step": 74040 + }, + { + "epoch": 12.079119086460032, + "grad_norm": 0.0026974440552294254, + "learning_rate": 0.0004063988406234801, + "loss": 0.0093, + "num_input_tokens_seen": 159782128, + "step": 74045 + }, + { + "epoch": 12.079934747145188, + "grad_norm": 0.03316914662718773, + "learning_rate": 0.0004063289201106144, + "loss": 0.0072, + "num_input_tokens_seen": 159793840, + "step": 74050 + }, + { + "epoch": 12.080750407830342, + "grad_norm": 0.41719675064086914, + "learning_rate": 0.000406259001496112, + "loss": 0.027, + "num_input_tokens_seen": 159805488, + "step": 74055 + }, + { + "epoch": 12.081566068515498, + "grad_norm": 0.02446441724896431, + "learning_rate": 0.00040618908478138986, + "loss": 0.0034, + "num_input_tokens_seen": 159815056, + "step": 74060 + }, + { + "epoch": 12.082381729200652, + "grad_norm": 0.0027038990519940853, + "learning_rate": 0.0004061191699678649, + "loss": 0.0016, + "num_input_tokens_seen": 159826064, + "step": 74065 + }, + { + "epoch": 12.083197389885807, + "grad_norm": 0.0059897019527852535, + "learning_rate": 0.0004060492570569542, + "loss": 0.0028, + "num_input_tokens_seen": 159836944, + "step": 74070 + }, + { + "epoch": 12.084013050570963, + "grad_norm": 0.28468647599220276, + "learning_rate": 0.0004059793460500742, + "loss": 0.0259, + "num_input_tokens_seen": 159847056, + "step": 74075 + }, + { + "epoch": 12.084828711256117, + "grad_norm": 0.04453691095113754, + "learning_rate": 0.0004059094369486423, + "loss": 0.0462, + "num_input_tokens_seen": 159856848, + "step": 74080 + }, + { + "epoch": 12.085644371941273, + "grad_norm": 0.0036157306749373674, + "learning_rate": 0.00040583952975407493, + "loss": 0.0052, + "num_input_tokens_seen": 159869040, + "step": 74085 + }, + { + "epoch": 12.086460032626427, + "grad_norm": 0.001472970237955451, + "learning_rate": 0.000405769624467789, + "loss": 0.006, + "num_input_tokens_seen": 159880368, + "step": 74090 + }, + { + "epoch": 12.087275693311582, + "grad_norm": 0.0065697357058525085, + "learning_rate": 0.0004056997210912011, + "loss": 0.0088, + "num_input_tokens_seen": 159891600, + "step": 74095 + }, + { + "epoch": 12.088091353996738, + "grad_norm": 0.006414678413420916, + "learning_rate": 0.00040562981962572803, + "loss": 0.0134, + "num_input_tokens_seen": 159903344, + "step": 74100 + }, + { + "epoch": 12.088907014681892, + "grad_norm": 0.01593366637825966, + "learning_rate": 0.00040555992007278624, + "loss": 0.0074, + "num_input_tokens_seen": 159913904, + "step": 74105 + }, + { + "epoch": 12.089722675367048, + "grad_norm": 0.051589321345090866, + "learning_rate": 0.00040549002243379267, + "loss": 0.003, + "num_input_tokens_seen": 159923952, + "step": 74110 + }, + { + "epoch": 12.090538336052202, + "grad_norm": 0.002812242368236184, + "learning_rate": 0.00040542012671016355, + "loss": 0.0024, + "num_input_tokens_seen": 159935280, + "step": 74115 + }, + { + "epoch": 12.091353996737357, + "grad_norm": 0.041157614439725876, + "learning_rate": 0.00040535023290331573, + "loss": 0.0054, + "num_input_tokens_seen": 159947312, + "step": 74120 + }, + { + "epoch": 12.092169657422513, + "grad_norm": 0.01795736886560917, + "learning_rate": 0.0004052803410146653, + "loss": 0.0033, + "num_input_tokens_seen": 159957936, + "step": 74125 + }, + { + "epoch": 12.092985318107667, + "grad_norm": 0.05933769419789314, + "learning_rate": 0.0004052104510456291, + "loss": 0.0111, + "num_input_tokens_seen": 159969776, + "step": 74130 + }, + { + "epoch": 12.093800978792823, + "grad_norm": 0.00879963580518961, + "learning_rate": 0.00040514056299762314, + "loss": 0.0393, + "num_input_tokens_seen": 159981232, + "step": 74135 + }, + { + "epoch": 12.094616639477977, + "grad_norm": 0.0013825988862663507, + "learning_rate": 0.0004050706768720642, + "loss": 0.0483, + "num_input_tokens_seen": 159990864, + "step": 74140 + }, + { + "epoch": 12.095432300163132, + "grad_norm": 0.07081010937690735, + "learning_rate": 0.00040500079267036834, + "loss": 0.0057, + "num_input_tokens_seen": 160002352, + "step": 74145 + }, + { + "epoch": 12.096247960848286, + "grad_norm": 0.00036167478538118303, + "learning_rate": 0.000404930910393952, + "loss": 0.0022, + "num_input_tokens_seen": 160013072, + "step": 74150 + }, + { + "epoch": 12.097063621533442, + "grad_norm": 0.0036873186472803354, + "learning_rate": 0.0004048610300442313, + "loss": 0.0108, + "num_input_tokens_seen": 160024080, + "step": 74155 + }, + { + "epoch": 12.097879282218598, + "grad_norm": 0.009987253695726395, + "learning_rate": 0.0004047911516226226, + "loss": 0.0037, + "num_input_tokens_seen": 160034480, + "step": 74160 + }, + { + "epoch": 12.098694942903752, + "grad_norm": 0.002340947510674596, + "learning_rate": 0.0004047212751305418, + "loss": 0.0098, + "num_input_tokens_seen": 160044400, + "step": 74165 + }, + { + "epoch": 12.099510603588907, + "grad_norm": 0.00957136508077383, + "learning_rate": 0.00040465140056940524, + "loss": 0.0013, + "num_input_tokens_seen": 160055024, + "step": 74170 + }, + { + "epoch": 12.100326264274061, + "grad_norm": 0.0026557764504104853, + "learning_rate": 0.00040458152794062925, + "loss": 0.0021, + "num_input_tokens_seen": 160065808, + "step": 74175 + }, + { + "epoch": 12.101141924959217, + "grad_norm": 0.0014571084175258875, + "learning_rate": 0.00040451165724562937, + "loss": 0.002, + "num_input_tokens_seen": 160076592, + "step": 74180 + }, + { + "epoch": 12.101957585644373, + "grad_norm": 0.013624192215502262, + "learning_rate": 0.0004044417884858221, + "loss": 0.0042, + "num_input_tokens_seen": 160087760, + "step": 74185 + }, + { + "epoch": 12.102773246329527, + "grad_norm": 0.003992415964603424, + "learning_rate": 0.0004043719216626231, + "loss": 0.0035, + "num_input_tokens_seen": 160098256, + "step": 74190 + }, + { + "epoch": 12.103588907014682, + "grad_norm": 0.047435540705919266, + "learning_rate": 0.00040430205677744857, + "loss": 0.0109, + "num_input_tokens_seen": 160107792, + "step": 74195 + }, + { + "epoch": 12.104404567699836, + "grad_norm": 0.0148826465010643, + "learning_rate": 0.00040423219383171405, + "loss": 0.1185, + "num_input_tokens_seen": 160118544, + "step": 74200 + }, + { + "epoch": 12.105220228384992, + "grad_norm": 0.0013521467335522175, + "learning_rate": 0.0004041623328268358, + "loss": 0.083, + "num_input_tokens_seen": 160128528, + "step": 74205 + }, + { + "epoch": 12.106035889070148, + "grad_norm": 0.4822540581226349, + "learning_rate": 0.0004040924737642293, + "loss": 0.1113, + "num_input_tokens_seen": 160139024, + "step": 74210 + }, + { + "epoch": 12.106851549755302, + "grad_norm": 0.0016364285256713629, + "learning_rate": 0.0004040226166453107, + "loss": 0.0237, + "num_input_tokens_seen": 160150192, + "step": 74215 + }, + { + "epoch": 12.107667210440457, + "grad_norm": 0.0042993612587451935, + "learning_rate": 0.00040395276147149524, + "loss": 0.003, + "num_input_tokens_seen": 160162064, + "step": 74220 + }, + { + "epoch": 12.108482871125611, + "grad_norm": 0.0038951882161200047, + "learning_rate": 0.000403882908244199, + "loss": 0.0039, + "num_input_tokens_seen": 160172880, + "step": 74225 + }, + { + "epoch": 12.109298531810767, + "grad_norm": 0.007243596948683262, + "learning_rate": 0.00040381305696483773, + "loss": 0.0029, + "num_input_tokens_seen": 160183600, + "step": 74230 + }, + { + "epoch": 12.11011419249592, + "grad_norm": 0.02022252231836319, + "learning_rate": 0.00040374320763482673, + "loss": 0.0195, + "num_input_tokens_seen": 160194000, + "step": 74235 + }, + { + "epoch": 12.110929853181077, + "grad_norm": 0.335334450006485, + "learning_rate": 0.0004036733602555818, + "loss": 0.1579, + "num_input_tokens_seen": 160204176, + "step": 74240 + }, + { + "epoch": 12.111745513866232, + "grad_norm": 0.007324226666241884, + "learning_rate": 0.0004036035148285184, + "loss": 0.0404, + "num_input_tokens_seen": 160214896, + "step": 74245 + }, + { + "epoch": 12.112561174551386, + "grad_norm": 0.2241317182779312, + "learning_rate": 0.00040353367135505193, + "loss": 0.0391, + "num_input_tokens_seen": 160225456, + "step": 74250 + }, + { + "epoch": 12.113376835236542, + "grad_norm": 0.04874589666724205, + "learning_rate": 0.00040346382983659826, + "loss": 0.0417, + "num_input_tokens_seen": 160236464, + "step": 74255 + }, + { + "epoch": 12.114192495921696, + "grad_norm": 0.0018662532093003392, + "learning_rate": 0.0004033939902745723, + "loss": 0.0064, + "num_input_tokens_seen": 160248624, + "step": 74260 + }, + { + "epoch": 12.115008156606851, + "grad_norm": 0.01796708256006241, + "learning_rate": 0.0004033241526703899, + "loss": 0.0103, + "num_input_tokens_seen": 160258672, + "step": 74265 + }, + { + "epoch": 12.115823817292007, + "grad_norm": 0.06866498291492462, + "learning_rate": 0.00040325431702546596, + "loss": 0.0049, + "num_input_tokens_seen": 160268400, + "step": 74270 + }, + { + "epoch": 12.116639477977161, + "grad_norm": 0.029581483453512192, + "learning_rate": 0.000403184483341216, + "loss": 0.0079, + "num_input_tokens_seen": 160279344, + "step": 74275 + }, + { + "epoch": 12.117455138662317, + "grad_norm": 0.024731801822781563, + "learning_rate": 0.0004031146516190556, + "loss": 0.0026, + "num_input_tokens_seen": 160289808, + "step": 74280 + }, + { + "epoch": 12.11827079934747, + "grad_norm": 0.02738928608596325, + "learning_rate": 0.00040304482186039937, + "loss": 0.018, + "num_input_tokens_seen": 160300240, + "step": 74285 + }, + { + "epoch": 12.119086460032626, + "grad_norm": 0.0010728674242272973, + "learning_rate": 0.0004029749940666631, + "loss": 0.0033, + "num_input_tokens_seen": 160311856, + "step": 74290 + }, + { + "epoch": 12.119902120717782, + "grad_norm": 0.030675755813717842, + "learning_rate": 0.00040290516823926145, + "loss": 0.0029, + "num_input_tokens_seen": 160323024, + "step": 74295 + }, + { + "epoch": 12.120717781402936, + "grad_norm": 0.01565350778400898, + "learning_rate": 0.0004028353443796099, + "loss": 0.0065, + "num_input_tokens_seen": 160334576, + "step": 74300 + }, + { + "epoch": 12.121533442088092, + "grad_norm": 0.14077459275722504, + "learning_rate": 0.00040276552248912317, + "loss": 0.0408, + "num_input_tokens_seen": 160344464, + "step": 74305 + }, + { + "epoch": 12.122349102773246, + "grad_norm": 0.017737606540322304, + "learning_rate": 0.00040269570256921673, + "loss": 0.0237, + "num_input_tokens_seen": 160355856, + "step": 74310 + }, + { + "epoch": 12.123164763458401, + "grad_norm": 0.69034343957901, + "learning_rate": 0.00040262588462130507, + "loss": 0.0994, + "num_input_tokens_seen": 160367216, + "step": 74315 + }, + { + "epoch": 12.123980424143557, + "grad_norm": 0.022243104875087738, + "learning_rate": 0.0004025560686468036, + "loss": 0.0432, + "num_input_tokens_seen": 160378288, + "step": 74320 + }, + { + "epoch": 12.124796084828711, + "grad_norm": 0.14463235437870026, + "learning_rate": 0.0004024862546471268, + "loss": 0.011, + "num_input_tokens_seen": 160388432, + "step": 74325 + }, + { + "epoch": 12.125611745513867, + "grad_norm": 0.003319142386317253, + "learning_rate": 0.00040241644262368993, + "loss": 0.1058, + "num_input_tokens_seen": 160399664, + "step": 74330 + }, + { + "epoch": 12.12642740619902, + "grad_norm": 0.2892591655254364, + "learning_rate": 0.00040234663257790747, + "loss": 0.1195, + "num_input_tokens_seen": 160410064, + "step": 74335 + }, + { + "epoch": 12.127243066884176, + "grad_norm": 0.05028560385107994, + "learning_rate": 0.00040227682451119464, + "loss": 0.0043, + "num_input_tokens_seen": 160421488, + "step": 74340 + }, + { + "epoch": 12.12805872756933, + "grad_norm": 0.04615833982825279, + "learning_rate": 0.0004022070184249657, + "loss": 0.0252, + "num_input_tokens_seen": 160431088, + "step": 74345 + }, + { + "epoch": 12.128874388254486, + "grad_norm": 0.000754565407987684, + "learning_rate": 0.0004021372143206358, + "loss": 0.0238, + "num_input_tokens_seen": 160441808, + "step": 74350 + }, + { + "epoch": 12.129690048939642, + "grad_norm": 0.013530561700463295, + "learning_rate": 0.0004020674121996191, + "loss": 0.0014, + "num_input_tokens_seen": 160452080, + "step": 74355 + }, + { + "epoch": 12.130505709624796, + "grad_norm": 0.007507260423153639, + "learning_rate": 0.0004019976120633308, + "loss": 0.0082, + "num_input_tokens_seen": 160463824, + "step": 74360 + }, + { + "epoch": 12.131321370309951, + "grad_norm": 0.07265827059745789, + "learning_rate": 0.000401927813913185, + "loss": 0.0167, + "num_input_tokens_seen": 160475120, + "step": 74365 + }, + { + "epoch": 12.132137030995105, + "grad_norm": 0.0017656140262261033, + "learning_rate": 0.0004018580177505966, + "loss": 0.0138, + "num_input_tokens_seen": 160486512, + "step": 74370 + }, + { + "epoch": 12.132952691680261, + "grad_norm": 0.004244921263307333, + "learning_rate": 0.00040178822357698, + "loss": 0.0068, + "num_input_tokens_seen": 160498128, + "step": 74375 + }, + { + "epoch": 12.133768352365417, + "grad_norm": 0.17322002351284027, + "learning_rate": 0.0004017184313937494, + "loss": 0.0092, + "num_input_tokens_seen": 160509072, + "step": 74380 + }, + { + "epoch": 12.13458401305057, + "grad_norm": 0.3574456572532654, + "learning_rate": 0.0004016486412023198, + "loss": 0.0393, + "num_input_tokens_seen": 160519056, + "step": 74385 + }, + { + "epoch": 12.135399673735726, + "grad_norm": 0.3219027519226074, + "learning_rate": 0.000401578853004105, + "loss": 0.0545, + "num_input_tokens_seen": 160529936, + "step": 74390 + }, + { + "epoch": 12.13621533442088, + "grad_norm": 0.09311213344335556, + "learning_rate": 0.00040150906680051974, + "loss": 0.0072, + "num_input_tokens_seen": 160541296, + "step": 74395 + }, + { + "epoch": 12.137030995106036, + "grad_norm": 0.001389046898111701, + "learning_rate": 0.00040143928259297817, + "loss": 0.0094, + "num_input_tokens_seen": 160552016, + "step": 74400 + }, + { + "epoch": 12.137846655791192, + "grad_norm": 0.0009591339039616287, + "learning_rate": 0.00040136950038289457, + "loss": 0.005, + "num_input_tokens_seen": 160562480, + "step": 74405 + }, + { + "epoch": 12.138662316476346, + "grad_norm": 0.002469372935593128, + "learning_rate": 0.0004012997201716831, + "loss": 0.0109, + "num_input_tokens_seen": 160572368, + "step": 74410 + }, + { + "epoch": 12.139477977161501, + "grad_norm": 0.28964662551879883, + "learning_rate": 0.0004012299419607581, + "loss": 0.1454, + "num_input_tokens_seen": 160583088, + "step": 74415 + }, + { + "epoch": 12.140293637846655, + "grad_norm": 0.13551048934459686, + "learning_rate": 0.00040116016575153344, + "loss": 0.0074, + "num_input_tokens_seen": 160594576, + "step": 74420 + }, + { + "epoch": 12.141109298531811, + "grad_norm": 0.001497600576840341, + "learning_rate": 0.0004010903915454237, + "loss": 0.0046, + "num_input_tokens_seen": 160605264, + "step": 74425 + }, + { + "epoch": 12.141924959216965, + "grad_norm": 0.003375427331775427, + "learning_rate": 0.0004010206193438424, + "loss": 0.0243, + "num_input_tokens_seen": 160617232, + "step": 74430 + }, + { + "epoch": 12.14274061990212, + "grad_norm": 0.0005571177462115884, + "learning_rate": 0.0004009508491482041, + "loss": 0.0028, + "num_input_tokens_seen": 160630224, + "step": 74435 + }, + { + "epoch": 12.143556280587276, + "grad_norm": 0.05119101703166962, + "learning_rate": 0.00040088108095992216, + "loss": 0.0074, + "num_input_tokens_seen": 160641712, + "step": 74440 + }, + { + "epoch": 12.14437194127243, + "grad_norm": 0.028585800901055336, + "learning_rate": 0.00040081131478041115, + "loss": 0.0146, + "num_input_tokens_seen": 160654256, + "step": 74445 + }, + { + "epoch": 12.145187601957586, + "grad_norm": 0.0009304980630986392, + "learning_rate": 0.00040074155061108443, + "loss": 0.0055, + "num_input_tokens_seen": 160664464, + "step": 74450 + }, + { + "epoch": 12.14600326264274, + "grad_norm": 0.8274291157722473, + "learning_rate": 0.00040067178845335633, + "loss": 0.0927, + "num_input_tokens_seen": 160675248, + "step": 74455 + }, + { + "epoch": 12.146818923327896, + "grad_norm": 0.04113573580980301, + "learning_rate": 0.0004006020283086402, + "loss": 0.0054, + "num_input_tokens_seen": 160685872, + "step": 74460 + }, + { + "epoch": 12.147634584013051, + "grad_norm": 0.0005714551662094891, + "learning_rate": 0.00040053227017835033, + "loss": 0.024, + "num_input_tokens_seen": 160696752, + "step": 74465 + }, + { + "epoch": 12.148450244698205, + "grad_norm": 0.0011877024080604315, + "learning_rate": 0.00040046251406389993, + "loss": 0.0018, + "num_input_tokens_seen": 160707824, + "step": 74470 + }, + { + "epoch": 12.149265905383361, + "grad_norm": 0.0005101168062537909, + "learning_rate": 0.0004003927599667032, + "loss": 0.0009, + "num_input_tokens_seen": 160719152, + "step": 74475 + }, + { + "epoch": 12.150081566068515, + "grad_norm": 0.024727245792746544, + "learning_rate": 0.0004003230078881733, + "loss": 0.0057, + "num_input_tokens_seen": 160729616, + "step": 74480 + }, + { + "epoch": 12.15089722675367, + "grad_norm": 0.22565460205078125, + "learning_rate": 0.0004002532578297241, + "loss": 0.0134, + "num_input_tokens_seen": 160740816, + "step": 74485 + }, + { + "epoch": 12.151712887438826, + "grad_norm": 0.005717316176742315, + "learning_rate": 0.0004001835097927694, + "loss": 0.0203, + "num_input_tokens_seen": 160751600, + "step": 74490 + }, + { + "epoch": 12.15252854812398, + "grad_norm": 0.013629831373691559, + "learning_rate": 0.00040011376377872235, + "loss": 0.0325, + "num_input_tokens_seen": 160761936, + "step": 74495 + }, + { + "epoch": 12.153344208809136, + "grad_norm": 0.00954221561551094, + "learning_rate": 0.0004000440197889967, + "loss": 0.0478, + "num_input_tokens_seen": 160772272, + "step": 74500 + }, + { + "epoch": 12.15415986949429, + "grad_norm": 0.02622557058930397, + "learning_rate": 0.0003999742778250056, + "loss": 0.0054, + "num_input_tokens_seen": 160783568, + "step": 74505 + }, + { + "epoch": 12.154975530179446, + "grad_norm": 0.0024565798230469227, + "learning_rate": 0.0003999045378881629, + "loss": 0.0961, + "num_input_tokens_seen": 160794480, + "step": 74510 + }, + { + "epoch": 12.1557911908646, + "grad_norm": 0.00190056674182415, + "learning_rate": 0.0003998347999798815, + "loss": 0.0047, + "num_input_tokens_seen": 160806512, + "step": 74515 + }, + { + "epoch": 12.156606851549755, + "grad_norm": 0.020707450807094574, + "learning_rate": 0.00039976506410157513, + "loss": 0.0104, + "num_input_tokens_seen": 160818448, + "step": 74520 + }, + { + "epoch": 12.15742251223491, + "grad_norm": 0.0028192694298923016, + "learning_rate": 0.0003996953302546567, + "loss": 0.1106, + "num_input_tokens_seen": 160828976, + "step": 74525 + }, + { + "epoch": 12.158238172920065, + "grad_norm": 0.0074721937999129295, + "learning_rate": 0.0003996255984405399, + "loss": 0.0024, + "num_input_tokens_seen": 160839728, + "step": 74530 + }, + { + "epoch": 12.15905383360522, + "grad_norm": 0.5288242697715759, + "learning_rate": 0.00039955586866063735, + "loss": 0.0397, + "num_input_tokens_seen": 160849040, + "step": 74535 + }, + { + "epoch": 12.159869494290374, + "grad_norm": 0.02655063383281231, + "learning_rate": 0.0003994861409163628, + "loss": 0.031, + "num_input_tokens_seen": 160858960, + "step": 74540 + }, + { + "epoch": 12.16068515497553, + "grad_norm": 0.007626372389495373, + "learning_rate": 0.000399416415209129, + "loss": 0.0041, + "num_input_tokens_seen": 160869200, + "step": 74545 + }, + { + "epoch": 12.161500815660686, + "grad_norm": 0.008776957169175148, + "learning_rate": 0.0003993466915403492, + "loss": 0.0426, + "num_input_tokens_seen": 160880080, + "step": 74550 + }, + { + "epoch": 12.16231647634584, + "grad_norm": 0.03911079838871956, + "learning_rate": 0.0003992769699114364, + "loss": 0.0319, + "num_input_tokens_seen": 160891888, + "step": 74555 + }, + { + "epoch": 12.163132137030995, + "grad_norm": 0.0018075419357046485, + "learning_rate": 0.0003992072503238035, + "loss": 0.0021, + "num_input_tokens_seen": 160902288, + "step": 74560 + }, + { + "epoch": 12.16394779771615, + "grad_norm": 0.02013249136507511, + "learning_rate": 0.0003991375327788635, + "loss": 0.0358, + "num_input_tokens_seen": 160912464, + "step": 74565 + }, + { + "epoch": 12.164763458401305, + "grad_norm": 0.0026555154472589493, + "learning_rate": 0.00039906781727802956, + "loss": 0.0023, + "num_input_tokens_seen": 160924624, + "step": 74570 + }, + { + "epoch": 12.16557911908646, + "grad_norm": 0.058747798204422, + "learning_rate": 0.0003989981038227141, + "loss": 0.0622, + "num_input_tokens_seen": 160936144, + "step": 74575 + }, + { + "epoch": 12.166394779771615, + "grad_norm": 0.007467230781912804, + "learning_rate": 0.0003989283924143304, + "loss": 0.0022, + "num_input_tokens_seen": 160946928, + "step": 74580 + }, + { + "epoch": 12.16721044045677, + "grad_norm": 0.0009509428055025637, + "learning_rate": 0.0003988586830542909, + "loss": 0.0009, + "num_input_tokens_seen": 160957808, + "step": 74585 + }, + { + "epoch": 12.168026101141924, + "grad_norm": 0.005281785503029823, + "learning_rate": 0.00039878897574400845, + "loss": 0.012, + "num_input_tokens_seen": 160968400, + "step": 74590 + }, + { + "epoch": 12.16884176182708, + "grad_norm": 0.005842768121510744, + "learning_rate": 0.00039871927048489605, + "loss": 0.0009, + "num_input_tokens_seen": 160979472, + "step": 74595 + }, + { + "epoch": 12.169657422512234, + "grad_norm": 0.0013604004634544253, + "learning_rate": 0.0003986495672783659, + "loss": 0.0024, + "num_input_tokens_seen": 160990512, + "step": 74600 + }, + { + "epoch": 12.17047308319739, + "grad_norm": 0.05610961094498634, + "learning_rate": 0.000398579866125831, + "loss": 0.003, + "num_input_tokens_seen": 161002000, + "step": 74605 + }, + { + "epoch": 12.171288743882545, + "grad_norm": 0.05169655755162239, + "learning_rate": 0.00039851016702870356, + "loss": 0.0032, + "num_input_tokens_seen": 161013488, + "step": 74610 + }, + { + "epoch": 12.1721044045677, + "grad_norm": 0.0005351427826099098, + "learning_rate": 0.0003984404699883966, + "loss": 0.0487, + "num_input_tokens_seen": 161024144, + "step": 74615 + }, + { + "epoch": 12.172920065252855, + "grad_norm": 0.013677247799932957, + "learning_rate": 0.00039837077500632213, + "loss": 0.0501, + "num_input_tokens_seen": 161034736, + "step": 74620 + }, + { + "epoch": 12.173735725938009, + "grad_norm": 0.03766406700015068, + "learning_rate": 0.00039830108208389306, + "loss": 0.1555, + "num_input_tokens_seen": 161045904, + "step": 74625 + }, + { + "epoch": 12.174551386623165, + "grad_norm": 0.0013615316711366177, + "learning_rate": 0.00039823139122252126, + "loss": 0.0048, + "num_input_tokens_seen": 161056272, + "step": 74630 + }, + { + "epoch": 12.17536704730832, + "grad_norm": 0.2723039388656616, + "learning_rate": 0.0003981617024236197, + "loss": 0.0245, + "num_input_tokens_seen": 161064784, + "step": 74635 + }, + { + "epoch": 12.176182707993474, + "grad_norm": 0.05823734775185585, + "learning_rate": 0.0003980920156886003, + "loss": 0.0088, + "num_input_tokens_seen": 161075568, + "step": 74640 + }, + { + "epoch": 12.17699836867863, + "grad_norm": 0.004167646635323763, + "learning_rate": 0.0003980223310188756, + "loss": 0.0019, + "num_input_tokens_seen": 161086288, + "step": 74645 + }, + { + "epoch": 12.177814029363784, + "grad_norm": 0.0021060630679130554, + "learning_rate": 0.00039795264841585755, + "loss": 0.0043, + "num_input_tokens_seen": 161095184, + "step": 74650 + }, + { + "epoch": 12.17862969004894, + "grad_norm": 0.11532260477542877, + "learning_rate": 0.00039788296788095866, + "loss": 0.0726, + "num_input_tokens_seen": 161106640, + "step": 74655 + }, + { + "epoch": 12.179445350734095, + "grad_norm": 0.025890162214636803, + "learning_rate": 0.00039781328941559084, + "loss": 0.1263, + "num_input_tokens_seen": 161118832, + "step": 74660 + }, + { + "epoch": 12.18026101141925, + "grad_norm": 0.0014036158099770546, + "learning_rate": 0.0003977436130211666, + "loss": 0.0159, + "num_input_tokens_seen": 161129136, + "step": 74665 + }, + { + "epoch": 12.181076672104405, + "grad_norm": 0.17675647139549255, + "learning_rate": 0.0003976739386990975, + "loss": 0.0169, + "num_input_tokens_seen": 161140848, + "step": 74670 + }, + { + "epoch": 12.181892332789559, + "grad_norm": 0.004996247589588165, + "learning_rate": 0.0003976042664507961, + "loss": 0.0051, + "num_input_tokens_seen": 161151344, + "step": 74675 + }, + { + "epoch": 12.182707993474715, + "grad_norm": 0.0025636698119342327, + "learning_rate": 0.0003975345962776738, + "loss": 0.0052, + "num_input_tokens_seen": 161162896, + "step": 74680 + }, + { + "epoch": 12.18352365415987, + "grad_norm": 0.001583628705702722, + "learning_rate": 0.0003974649281811431, + "loss": 0.013, + "num_input_tokens_seen": 161174416, + "step": 74685 + }, + { + "epoch": 12.184339314845024, + "grad_norm": 0.02006850391626358, + "learning_rate": 0.00039739526216261566, + "loss": 0.0032, + "num_input_tokens_seen": 161185520, + "step": 74690 + }, + { + "epoch": 12.18515497553018, + "grad_norm": 0.01595499739050865, + "learning_rate": 0.00039732559822350336, + "loss": 0.0196, + "num_input_tokens_seen": 161196528, + "step": 74695 + }, + { + "epoch": 12.185970636215334, + "grad_norm": 0.004150539170950651, + "learning_rate": 0.00039725593636521817, + "loss": 0.0027, + "num_input_tokens_seen": 161206032, + "step": 74700 + }, + { + "epoch": 12.18678629690049, + "grad_norm": 0.000986686209216714, + "learning_rate": 0.0003971862765891716, + "loss": 0.0032, + "num_input_tokens_seen": 161216944, + "step": 74705 + }, + { + "epoch": 12.187601957585644, + "grad_norm": 0.0029097532387822866, + "learning_rate": 0.00039711661889677577, + "loss": 0.0138, + "num_input_tokens_seen": 161228176, + "step": 74710 + }, + { + "epoch": 12.1884176182708, + "grad_norm": 0.00785776600241661, + "learning_rate": 0.00039704696328944205, + "loss": 0.0025, + "num_input_tokens_seen": 161239376, + "step": 74715 + }, + { + "epoch": 12.189233278955955, + "grad_norm": 0.013095185160636902, + "learning_rate": 0.0003969773097685823, + "loss": 0.0929, + "num_input_tokens_seen": 161250352, + "step": 74720 + }, + { + "epoch": 12.190048939641109, + "grad_norm": 0.06309574097394943, + "learning_rate": 0.000396907658335608, + "loss": 0.0048, + "num_input_tokens_seen": 161261392, + "step": 74725 + }, + { + "epoch": 12.190864600326265, + "grad_norm": 0.037989541888237, + "learning_rate": 0.0003968380089919308, + "loss": 0.0132, + "num_input_tokens_seen": 161273360, + "step": 74730 + }, + { + "epoch": 12.191680261011419, + "grad_norm": 0.0008451101602986455, + "learning_rate": 0.0003967683617389621, + "loss": 0.0806, + "num_input_tokens_seen": 161285232, + "step": 74735 + }, + { + "epoch": 12.192495921696574, + "grad_norm": 0.030361147597432137, + "learning_rate": 0.0003966987165781138, + "loss": 0.0363, + "num_input_tokens_seen": 161296208, + "step": 74740 + }, + { + "epoch": 12.19331158238173, + "grad_norm": 0.006845317780971527, + "learning_rate": 0.00039662907351079675, + "loss": 0.0018, + "num_input_tokens_seen": 161307248, + "step": 74745 + }, + { + "epoch": 12.194127243066884, + "grad_norm": 0.0246586911380291, + "learning_rate": 0.00039655943253842293, + "loss": 0.0367, + "num_input_tokens_seen": 161317488, + "step": 74750 + }, + { + "epoch": 12.19494290375204, + "grad_norm": 0.0016998144565150142, + "learning_rate": 0.00039648979366240325, + "loss": 0.006, + "num_input_tokens_seen": 161327280, + "step": 74755 + }, + { + "epoch": 12.195758564437194, + "grad_norm": 0.011350972577929497, + "learning_rate": 0.00039642015688414936, + "loss": 0.0052, + "num_input_tokens_seen": 161338800, + "step": 74760 + }, + { + "epoch": 12.19657422512235, + "grad_norm": 0.0013628799933940172, + "learning_rate": 0.00039635052220507216, + "loss": 0.0018, + "num_input_tokens_seen": 161350960, + "step": 74765 + }, + { + "epoch": 12.197389885807505, + "grad_norm": 0.006588470656424761, + "learning_rate": 0.0003962808896265834, + "loss": 0.0754, + "num_input_tokens_seen": 161361360, + "step": 74770 + }, + { + "epoch": 12.198205546492659, + "grad_norm": 0.02832656353712082, + "learning_rate": 0.0003962112591500937, + "loss": 0.157, + "num_input_tokens_seen": 161372400, + "step": 74775 + }, + { + "epoch": 12.199021207177815, + "grad_norm": 0.10071592032909393, + "learning_rate": 0.00039614163077701474, + "loss": 0.0148, + "num_input_tokens_seen": 161383728, + "step": 74780 + }, + { + "epoch": 12.199836867862969, + "grad_norm": 0.06600882858037949, + "learning_rate": 0.00039607200450875716, + "loss": 0.0155, + "num_input_tokens_seen": 161394416, + "step": 74785 + }, + { + "epoch": 12.200652528548124, + "grad_norm": 1.450798749923706, + "learning_rate": 0.0003960023803467325, + "loss": 0.0227, + "num_input_tokens_seen": 161404944, + "step": 74790 + }, + { + "epoch": 12.201468189233278, + "grad_norm": 0.00263647991232574, + "learning_rate": 0.0003959327582923513, + "loss": 0.0032, + "num_input_tokens_seen": 161415760, + "step": 74795 + }, + { + "epoch": 12.202283849918434, + "grad_norm": 0.06124863028526306, + "learning_rate": 0.000395863138347025, + "loss": 0.0048, + "num_input_tokens_seen": 161425872, + "step": 74800 + }, + { + "epoch": 12.20309951060359, + "grad_norm": 0.0019437120063230395, + "learning_rate": 0.0003957935205121641, + "loss": 0.0027, + "num_input_tokens_seen": 161436688, + "step": 74805 + }, + { + "epoch": 12.203915171288743, + "grad_norm": 0.03643661364912987, + "learning_rate": 0.00039572390478917973, + "loss": 0.0098, + "num_input_tokens_seen": 161447376, + "step": 74810 + }, + { + "epoch": 12.2047308319739, + "grad_norm": 0.0024006329476833344, + "learning_rate": 0.00039565429117948287, + "loss": 0.0024, + "num_input_tokens_seen": 161457744, + "step": 74815 + }, + { + "epoch": 12.205546492659053, + "grad_norm": 0.006184297148138285, + "learning_rate": 0.000395584679684484, + "loss": 0.0116, + "num_input_tokens_seen": 161468912, + "step": 74820 + }, + { + "epoch": 12.206362153344209, + "grad_norm": 0.03142210468649864, + "learning_rate": 0.00039551507030559423, + "loss": 0.0018, + "num_input_tokens_seen": 161479920, + "step": 74825 + }, + { + "epoch": 12.207177814029365, + "grad_norm": 0.01277390867471695, + "learning_rate": 0.0003954454630442239, + "loss": 0.0364, + "num_input_tokens_seen": 161491088, + "step": 74830 + }, + { + "epoch": 12.207993474714518, + "grad_norm": 0.22222667932510376, + "learning_rate": 0.0003953758579017842, + "loss": 0.0117, + "num_input_tokens_seen": 161500912, + "step": 74835 + }, + { + "epoch": 12.208809135399674, + "grad_norm": 0.0012464099563658237, + "learning_rate": 0.00039530625487968507, + "loss": 0.0029, + "num_input_tokens_seen": 161511376, + "step": 74840 + }, + { + "epoch": 12.209624796084828, + "grad_norm": 0.0010708067566156387, + "learning_rate": 0.00039523665397933784, + "loss": 0.0047, + "num_input_tokens_seen": 161521936, + "step": 74845 + }, + { + "epoch": 12.210440456769984, + "grad_norm": 0.09673620015382767, + "learning_rate": 0.0003951670552021525, + "loss": 0.0081, + "num_input_tokens_seen": 161531984, + "step": 74850 + }, + { + "epoch": 12.21125611745514, + "grad_norm": 0.0011108857579529285, + "learning_rate": 0.0003950974585495399, + "loss": 0.0032, + "num_input_tokens_seen": 161542288, + "step": 74855 + }, + { + "epoch": 12.212071778140293, + "grad_norm": 0.07270468026399612, + "learning_rate": 0.0003950278640229103, + "loss": 0.019, + "num_input_tokens_seen": 161552016, + "step": 74860 + }, + { + "epoch": 12.21288743882545, + "grad_norm": 0.021703898906707764, + "learning_rate": 0.0003949582716236743, + "loss": 0.0037, + "num_input_tokens_seen": 161562352, + "step": 74865 + }, + { + "epoch": 12.213703099510603, + "grad_norm": 0.008199620060622692, + "learning_rate": 0.0003948886813532421, + "loss": 0.0111, + "num_input_tokens_seen": 161572400, + "step": 74870 + }, + { + "epoch": 12.214518760195759, + "grad_norm": 0.007011028937995434, + "learning_rate": 0.00039481909321302413, + "loss": 0.0036, + "num_input_tokens_seen": 161582992, + "step": 74875 + }, + { + "epoch": 12.215334420880913, + "grad_norm": 0.011071201413869858, + "learning_rate": 0.0003947495072044306, + "loss": 0.0679, + "num_input_tokens_seen": 161593808, + "step": 74880 + }, + { + "epoch": 12.216150081566068, + "grad_norm": 0.018101373687386513, + "learning_rate": 0.00039467992332887196, + "loss": 0.0093, + "num_input_tokens_seen": 161603952, + "step": 74885 + }, + { + "epoch": 12.216965742251224, + "grad_norm": 0.009042915888130665, + "learning_rate": 0.0003946103415877582, + "loss": 0.0076, + "num_input_tokens_seen": 161615600, + "step": 74890 + }, + { + "epoch": 12.217781402936378, + "grad_norm": 0.023832127451896667, + "learning_rate": 0.00039454076198249964, + "loss": 0.0093, + "num_input_tokens_seen": 161627088, + "step": 74895 + }, + { + "epoch": 12.218597063621534, + "grad_norm": 0.45474666357040405, + "learning_rate": 0.00039447118451450613, + "loss": 0.0449, + "num_input_tokens_seen": 161638288, + "step": 74900 + }, + { + "epoch": 12.219412724306688, + "grad_norm": 0.0025936223100870848, + "learning_rate": 0.00039440160918518825, + "loss": 0.0363, + "num_input_tokens_seen": 161648048, + "step": 74905 + }, + { + "epoch": 12.220228384991843, + "grad_norm": 0.0224508885294199, + "learning_rate": 0.00039433203599595546, + "loss": 0.0031, + "num_input_tokens_seen": 161659056, + "step": 74910 + }, + { + "epoch": 12.221044045676999, + "grad_norm": 0.10197361558675766, + "learning_rate": 0.00039426246494821793, + "loss": 0.0065, + "num_input_tokens_seen": 161670864, + "step": 74915 + }, + { + "epoch": 12.221859706362153, + "grad_norm": 0.006221417337656021, + "learning_rate": 0.000394192896043386, + "loss": 0.0066, + "num_input_tokens_seen": 161681328, + "step": 74920 + }, + { + "epoch": 12.222675367047309, + "grad_norm": 0.0027924743480980396, + "learning_rate": 0.000394123329282869, + "loss": 0.0033, + "num_input_tokens_seen": 161691312, + "step": 74925 + }, + { + "epoch": 12.223491027732463, + "grad_norm": 0.001059170812368393, + "learning_rate": 0.0003940537646680773, + "loss": 0.0132, + "num_input_tokens_seen": 161703120, + "step": 74930 + }, + { + "epoch": 12.224306688417618, + "grad_norm": 0.08574067801237106, + "learning_rate": 0.0003939842022004202, + "loss": 0.0227, + "num_input_tokens_seen": 161714608, + "step": 74935 + }, + { + "epoch": 12.225122349102774, + "grad_norm": 0.03081444464623928, + "learning_rate": 0.00039391464188130796, + "loss": 0.003, + "num_input_tokens_seen": 161725552, + "step": 74940 + }, + { + "epoch": 12.225938009787928, + "grad_norm": 0.0018798905657604337, + "learning_rate": 0.0003938450837121499, + "loss": 0.1095, + "num_input_tokens_seen": 161736816, + "step": 74945 + }, + { + "epoch": 12.226753670473084, + "grad_norm": 0.009571176022291183, + "learning_rate": 0.00039377552769435606, + "loss": 0.0447, + "num_input_tokens_seen": 161747152, + "step": 74950 + }, + { + "epoch": 12.227569331158238, + "grad_norm": 0.0017535814549773932, + "learning_rate": 0.0003937059738293357, + "loss": 0.0048, + "num_input_tokens_seen": 161757808, + "step": 74955 + }, + { + "epoch": 12.228384991843393, + "grad_norm": 1.451026439666748, + "learning_rate": 0.0003936364221184988, + "loss": 0.0454, + "num_input_tokens_seen": 161768880, + "step": 74960 + }, + { + "epoch": 12.229200652528547, + "grad_norm": 0.012817859649658203, + "learning_rate": 0.00039356687256325465, + "loss": 0.0693, + "num_input_tokens_seen": 161780816, + "step": 74965 + }, + { + "epoch": 12.230016313213703, + "grad_norm": 0.012609720230102539, + "learning_rate": 0.0003934973251650129, + "loss": 0.002, + "num_input_tokens_seen": 161790864, + "step": 74970 + }, + { + "epoch": 12.230831973898859, + "grad_norm": 0.007869264110922813, + "learning_rate": 0.0003934277799251829, + "loss": 0.0204, + "num_input_tokens_seen": 161802352, + "step": 74975 + }, + { + "epoch": 12.231647634584013, + "grad_norm": 0.2318950891494751, + "learning_rate": 0.00039335823684517423, + "loss": 0.0941, + "num_input_tokens_seen": 161813584, + "step": 74980 + }, + { + "epoch": 12.232463295269168, + "grad_norm": 0.039717864245176315, + "learning_rate": 0.00039328869592639604, + "loss": 0.0107, + "num_input_tokens_seen": 161824208, + "step": 74985 + }, + { + "epoch": 12.233278955954322, + "grad_norm": 0.000530784425791353, + "learning_rate": 0.00039321915717025797, + "loss": 0.0108, + "num_input_tokens_seen": 161835248, + "step": 74990 + }, + { + "epoch": 12.234094616639478, + "grad_norm": 0.012814320623874664, + "learning_rate": 0.00039314962057816896, + "loss": 0.0498, + "num_input_tokens_seen": 161845392, + "step": 74995 + }, + { + "epoch": 12.234910277324634, + "grad_norm": 0.26087823510169983, + "learning_rate": 0.0003930800861515385, + "loss": 0.013, + "num_input_tokens_seen": 161855984, + "step": 75000 + }, + { + "epoch": 12.235725938009788, + "grad_norm": 0.0018974668346345425, + "learning_rate": 0.00039301055389177577, + "loss": 0.1379, + "num_input_tokens_seen": 161866800, + "step": 75005 + }, + { + "epoch": 12.236541598694943, + "grad_norm": 0.005383786279708147, + "learning_rate": 0.00039294102380028987, + "loss": 0.028, + "num_input_tokens_seen": 161877648, + "step": 75010 + }, + { + "epoch": 12.237357259380097, + "grad_norm": 0.12277581542730331, + "learning_rate": 0.0003928714958784899, + "loss": 0.0133, + "num_input_tokens_seen": 161888400, + "step": 75015 + }, + { + "epoch": 12.238172920065253, + "grad_norm": 0.06266216188669205, + "learning_rate": 0.00039280197012778493, + "loss": 0.0079, + "num_input_tokens_seen": 161898672, + "step": 75020 + }, + { + "epoch": 12.238988580750409, + "grad_norm": 0.0058609070256352425, + "learning_rate": 0.0003927324465495841, + "loss": 0.0664, + "num_input_tokens_seen": 161908464, + "step": 75025 + }, + { + "epoch": 12.239804241435563, + "grad_norm": 0.0008976479875855148, + "learning_rate": 0.0003926629251452963, + "loss": 0.0046, + "num_input_tokens_seen": 161919120, + "step": 75030 + }, + { + "epoch": 12.240619902120718, + "grad_norm": 0.47124531865119934, + "learning_rate": 0.0003925934059163306, + "loss": 0.0932, + "num_input_tokens_seen": 161929232, + "step": 75035 + }, + { + "epoch": 12.241435562805872, + "grad_norm": 0.013031085953116417, + "learning_rate": 0.0003925238888640957, + "loss": 0.0439, + "num_input_tokens_seen": 161939792, + "step": 75040 + }, + { + "epoch": 12.242251223491028, + "grad_norm": 0.003828482236713171, + "learning_rate": 0.0003924543739900005, + "loss": 0.0078, + "num_input_tokens_seen": 161951184, + "step": 75045 + }, + { + "epoch": 12.243066884176184, + "grad_norm": 0.6344477534294128, + "learning_rate": 0.00039238486129545376, + "loss": 0.0145, + "num_input_tokens_seen": 161963184, + "step": 75050 + }, + { + "epoch": 12.243882544861338, + "grad_norm": 0.006866890471428633, + "learning_rate": 0.0003923153507818645, + "loss": 0.0127, + "num_input_tokens_seen": 161973808, + "step": 75055 + }, + { + "epoch": 12.244698205546493, + "grad_norm": 0.01028711348772049, + "learning_rate": 0.00039224584245064114, + "loss": 0.0031, + "num_input_tokens_seen": 161985040, + "step": 75060 + }, + { + "epoch": 12.245513866231647, + "grad_norm": 0.0015036846743896604, + "learning_rate": 0.00039217633630319264, + "loss": 0.0027, + "num_input_tokens_seen": 161996624, + "step": 75065 + }, + { + "epoch": 12.246329526916803, + "grad_norm": 0.021964767947793007, + "learning_rate": 0.00039210683234092733, + "loss": 0.0046, + "num_input_tokens_seen": 162007248, + "step": 75070 + }, + { + "epoch": 12.247145187601957, + "grad_norm": 0.15771441161632538, + "learning_rate": 0.000392037330565254, + "loss": 0.0694, + "num_input_tokens_seen": 162018320, + "step": 75075 + }, + { + "epoch": 12.247960848287113, + "grad_norm": 0.20369605720043182, + "learning_rate": 0.000391967830977581, + "loss": 0.0083, + "num_input_tokens_seen": 162029424, + "step": 75080 + }, + { + "epoch": 12.248776508972268, + "grad_norm": 0.02846320904791355, + "learning_rate": 0.0003918983335793173, + "loss": 0.0283, + "num_input_tokens_seen": 162041424, + "step": 75085 + }, + { + "epoch": 12.249592169657422, + "grad_norm": 0.009391102008521557, + "learning_rate": 0.00039182883837187056, + "loss": 0.0057, + "num_input_tokens_seen": 162052240, + "step": 75090 + }, + { + "epoch": 12.250407830342578, + "grad_norm": 0.017038507387042046, + "learning_rate": 0.00039175934535665, + "loss": 0.0093, + "num_input_tokens_seen": 162061904, + "step": 75095 + }, + { + "epoch": 12.251223491027732, + "grad_norm": 0.0008531765779480338, + "learning_rate": 0.00039168985453506334, + "loss": 0.0068, + "num_input_tokens_seen": 162072272, + "step": 75100 + }, + { + "epoch": 12.252039151712887, + "grad_norm": 0.008246184326708317, + "learning_rate": 0.0003916203659085194, + "loss": 0.0777, + "num_input_tokens_seen": 162084016, + "step": 75105 + }, + { + "epoch": 12.252854812398043, + "grad_norm": 0.016593193635344505, + "learning_rate": 0.00039155087947842607, + "loss": 0.0047, + "num_input_tokens_seen": 162094448, + "step": 75110 + }, + { + "epoch": 12.253670473083197, + "grad_norm": 0.012623871676623821, + "learning_rate": 0.00039148139524619184, + "loss": 0.0253, + "num_input_tokens_seen": 162105712, + "step": 75115 + }, + { + "epoch": 12.254486133768353, + "grad_norm": 0.0075597031973302364, + "learning_rate": 0.00039141191321322464, + "loss": 0.0015, + "num_input_tokens_seen": 162117552, + "step": 75120 + }, + { + "epoch": 12.255301794453507, + "grad_norm": 1.087733507156372, + "learning_rate": 0.00039134243338093285, + "loss": 0.0295, + "num_input_tokens_seen": 162129264, + "step": 75125 + }, + { + "epoch": 12.256117455138662, + "grad_norm": 0.01773880422115326, + "learning_rate": 0.0003912729557507246, + "loss": 0.0059, + "num_input_tokens_seen": 162139504, + "step": 75130 + }, + { + "epoch": 12.256933115823816, + "grad_norm": 0.0020350187551230192, + "learning_rate": 0.0003912034803240077, + "loss": 0.0022, + "num_input_tokens_seen": 162149584, + "step": 75135 + }, + { + "epoch": 12.257748776508972, + "grad_norm": 0.007242864929139614, + "learning_rate": 0.0003911340071021905, + "loss": 0.0086, + "num_input_tokens_seen": 162160624, + "step": 75140 + }, + { + "epoch": 12.258564437194128, + "grad_norm": 0.131486713886261, + "learning_rate": 0.00039106453608668047, + "loss": 0.0319, + "num_input_tokens_seen": 162172016, + "step": 75145 + }, + { + "epoch": 12.259380097879282, + "grad_norm": 0.03611040860414505, + "learning_rate": 0.0003909950672788861, + "loss": 0.0115, + "num_input_tokens_seen": 162182000, + "step": 75150 + }, + { + "epoch": 12.260195758564437, + "grad_norm": 0.0012033432722091675, + "learning_rate": 0.0003909256006802147, + "loss": 0.0035, + "num_input_tokens_seen": 162192080, + "step": 75155 + }, + { + "epoch": 12.261011419249591, + "grad_norm": 1.1938694715499878, + "learning_rate": 0.0003908561362920746, + "loss": 0.0432, + "num_input_tokens_seen": 162203600, + "step": 75160 + }, + { + "epoch": 12.261827079934747, + "grad_norm": 0.12175756692886353, + "learning_rate": 0.00039078667411587316, + "loss": 0.0081, + "num_input_tokens_seen": 162214096, + "step": 75165 + }, + { + "epoch": 12.262642740619903, + "grad_norm": 0.042177435010671616, + "learning_rate": 0.0003907172141530184, + "loss": 0.0051, + "num_input_tokens_seen": 162223952, + "step": 75170 + }, + { + "epoch": 12.263458401305057, + "grad_norm": 0.0028282259590923786, + "learning_rate": 0.00039064775640491796, + "loss": 0.0262, + "num_input_tokens_seen": 162235312, + "step": 75175 + }, + { + "epoch": 12.264274061990212, + "grad_norm": 0.16635634005069733, + "learning_rate": 0.00039057830087297946, + "loss": 0.009, + "num_input_tokens_seen": 162245456, + "step": 75180 + }, + { + "epoch": 12.265089722675366, + "grad_norm": 0.0032050833106040955, + "learning_rate": 0.0003905088475586105, + "loss": 0.0046, + "num_input_tokens_seen": 162256080, + "step": 75185 + }, + { + "epoch": 12.265905383360522, + "grad_norm": 0.0017825954128056765, + "learning_rate": 0.0003904393964632186, + "loss": 0.0631, + "num_input_tokens_seen": 162266256, + "step": 75190 + }, + { + "epoch": 12.266721044045678, + "grad_norm": 0.0007485284586437047, + "learning_rate": 0.00039036994758821124, + "loss": 0.031, + "num_input_tokens_seen": 162277456, + "step": 75195 + }, + { + "epoch": 12.267536704730832, + "grad_norm": 0.049372635781764984, + "learning_rate": 0.00039030050093499623, + "loss": 0.0241, + "num_input_tokens_seen": 162288432, + "step": 75200 + }, + { + "epoch": 12.268352365415987, + "grad_norm": 0.7122042775154114, + "learning_rate": 0.0003902310565049805, + "loss": 0.1598, + "num_input_tokens_seen": 162297968, + "step": 75205 + }, + { + "epoch": 12.269168026101141, + "grad_norm": 0.7093799710273743, + "learning_rate": 0.0003901616142995718, + "loss": 0.0577, + "num_input_tokens_seen": 162309520, + "step": 75210 + }, + { + "epoch": 12.269983686786297, + "grad_norm": 0.007328869309276342, + "learning_rate": 0.0003900921743201772, + "loss": 0.1242, + "num_input_tokens_seen": 162319440, + "step": 75215 + }, + { + "epoch": 12.270799347471453, + "grad_norm": 0.03256691247224808, + "learning_rate": 0.00039002273656820423, + "loss": 0.003, + "num_input_tokens_seen": 162331248, + "step": 75220 + }, + { + "epoch": 12.271615008156607, + "grad_norm": 0.14773625135421753, + "learning_rate": 0.0003899533010450599, + "loss": 0.0071, + "num_input_tokens_seen": 162341872, + "step": 75225 + }, + { + "epoch": 12.272430668841762, + "grad_norm": 0.0065134065225720406, + "learning_rate": 0.0003898838677521515, + "loss": 0.0285, + "num_input_tokens_seen": 162352880, + "step": 75230 + }, + { + "epoch": 12.273246329526916, + "grad_norm": 0.4818114638328552, + "learning_rate": 0.00038981443669088646, + "loss": 0.1387, + "num_input_tokens_seen": 162364144, + "step": 75235 + }, + { + "epoch": 12.274061990212072, + "grad_norm": 0.010879253968596458, + "learning_rate": 0.0003897450078626714, + "loss": 0.0089, + "num_input_tokens_seen": 162373616, + "step": 75240 + }, + { + "epoch": 12.274877650897226, + "grad_norm": 0.12211883068084717, + "learning_rate": 0.0003896755812689138, + "loss": 0.0101, + "num_input_tokens_seen": 162383536, + "step": 75245 + }, + { + "epoch": 12.275693311582382, + "grad_norm": 0.008950205519795418, + "learning_rate": 0.0003896061569110203, + "loss": 0.0246, + "num_input_tokens_seen": 162394128, + "step": 75250 + }, + { + "epoch": 12.276508972267537, + "grad_norm": 0.23400241136550903, + "learning_rate": 0.0003895367347903983, + "loss": 0.0103, + "num_input_tokens_seen": 162404112, + "step": 75255 + }, + { + "epoch": 12.277324632952691, + "grad_norm": 0.001761406660079956, + "learning_rate": 0.0003894673149084543, + "loss": 0.0048, + "num_input_tokens_seen": 162414384, + "step": 75260 + }, + { + "epoch": 12.278140293637847, + "grad_norm": 0.0035153161734342575, + "learning_rate": 0.0003893978972665956, + "loss": 0.0025, + "num_input_tokens_seen": 162425200, + "step": 75265 + }, + { + "epoch": 12.278955954323001, + "grad_norm": 0.04039424657821655, + "learning_rate": 0.0003893284818662286, + "loss": 0.1213, + "num_input_tokens_seen": 162435312, + "step": 75270 + }, + { + "epoch": 12.279771615008157, + "grad_norm": 0.018177783116698265, + "learning_rate": 0.0003892590687087605, + "loss": 0.0878, + "num_input_tokens_seen": 162444816, + "step": 75275 + }, + { + "epoch": 12.280587275693312, + "grad_norm": 0.0016015070723369718, + "learning_rate": 0.0003891896577955977, + "loss": 0.0114, + "num_input_tokens_seen": 162456592, + "step": 75280 + }, + { + "epoch": 12.281402936378466, + "grad_norm": 0.005266851279884577, + "learning_rate": 0.0003891202491281472, + "loss": 0.0114, + "num_input_tokens_seen": 162468432, + "step": 75285 + }, + { + "epoch": 12.282218597063622, + "grad_norm": 0.0033932779915630817, + "learning_rate": 0.0003890508427078153, + "loss": 0.0136, + "num_input_tokens_seen": 162478192, + "step": 75290 + }, + { + "epoch": 12.283034257748776, + "grad_norm": 0.009979058057069778, + "learning_rate": 0.0003889814385360091, + "loss": 0.017, + "num_input_tokens_seen": 162489168, + "step": 75295 + }, + { + "epoch": 12.283849918433932, + "grad_norm": 0.0033886334858834743, + "learning_rate": 0.0003889120366141347, + "loss": 0.0026, + "num_input_tokens_seen": 162500688, + "step": 75300 + }, + { + "epoch": 12.284665579119087, + "grad_norm": 0.11189183592796326, + "learning_rate": 0.0003888426369435989, + "loss": 0.0075, + "num_input_tokens_seen": 162510384, + "step": 75305 + }, + { + "epoch": 12.285481239804241, + "grad_norm": 0.006715911440551281, + "learning_rate": 0.0003887732395258079, + "loss": 0.0099, + "num_input_tokens_seen": 162520848, + "step": 75310 + }, + { + "epoch": 12.286296900489397, + "grad_norm": 0.005836441647261381, + "learning_rate": 0.0003887038443621684, + "loss": 0.0097, + "num_input_tokens_seen": 162531824, + "step": 75315 + }, + { + "epoch": 12.28711256117455, + "grad_norm": 0.012489204294979572, + "learning_rate": 0.0003886344514540868, + "loss": 0.0037, + "num_input_tokens_seen": 162542736, + "step": 75320 + }, + { + "epoch": 12.287928221859707, + "grad_norm": 0.1270523965358734, + "learning_rate": 0.0003885650608029692, + "loss": 0.0262, + "num_input_tokens_seen": 162553968, + "step": 75325 + }, + { + "epoch": 12.28874388254486, + "grad_norm": 0.0016210231697186828, + "learning_rate": 0.00038849567241022205, + "loss": 0.0048, + "num_input_tokens_seen": 162564432, + "step": 75330 + }, + { + "epoch": 12.289559543230016, + "grad_norm": 0.000679267686791718, + "learning_rate": 0.0003884262862772514, + "loss": 0.0021, + "num_input_tokens_seen": 162574992, + "step": 75335 + }, + { + "epoch": 12.290375203915172, + "grad_norm": 0.0013281983556225896, + "learning_rate": 0.0003883569024054638, + "loss": 0.0019, + "num_input_tokens_seen": 162585936, + "step": 75340 + }, + { + "epoch": 12.291190864600326, + "grad_norm": 0.0010293243685737252, + "learning_rate": 0.0003882875207962651, + "loss": 0.1104, + "num_input_tokens_seen": 162597520, + "step": 75345 + }, + { + "epoch": 12.292006525285482, + "grad_norm": 0.09767144918441772, + "learning_rate": 0.0003882181414510616, + "loss": 0.0065, + "num_input_tokens_seen": 162607664, + "step": 75350 + }, + { + "epoch": 12.292822185970635, + "grad_norm": 0.015356496907770634, + "learning_rate": 0.00038814876437125916, + "loss": 0.003, + "num_input_tokens_seen": 162618160, + "step": 75355 + }, + { + "epoch": 12.293637846655791, + "grad_norm": 0.015999896451830864, + "learning_rate": 0.000388079389558264, + "loss": 0.0049, + "num_input_tokens_seen": 162628592, + "step": 75360 + }, + { + "epoch": 12.294453507340947, + "grad_norm": 0.0889120027422905, + "learning_rate": 0.0003880100170134818, + "loss": 0.0045, + "num_input_tokens_seen": 162639056, + "step": 75365 + }, + { + "epoch": 12.2952691680261, + "grad_norm": 0.019953537732362747, + "learning_rate": 0.00038794064673831896, + "loss": 0.003, + "num_input_tokens_seen": 162650384, + "step": 75370 + }, + { + "epoch": 12.296084828711257, + "grad_norm": 0.0006927844951860607, + "learning_rate": 0.0003878712787341809, + "loss": 0.032, + "num_input_tokens_seen": 162661712, + "step": 75375 + }, + { + "epoch": 12.29690048939641, + "grad_norm": 0.04060707613825798, + "learning_rate": 0.0003878019130024737, + "loss": 0.0105, + "num_input_tokens_seen": 162671344, + "step": 75380 + }, + { + "epoch": 12.297716150081566, + "grad_norm": 0.008816522546112537, + "learning_rate": 0.000387732549544603, + "loss": 0.0024, + "num_input_tokens_seen": 162682096, + "step": 75385 + }, + { + "epoch": 12.298531810766722, + "grad_norm": 0.027802985161542892, + "learning_rate": 0.0003876631883619747, + "loss": 0.0076, + "num_input_tokens_seen": 162692432, + "step": 75390 + }, + { + "epoch": 12.299347471451876, + "grad_norm": 0.5121649503707886, + "learning_rate": 0.0003875938294559942, + "loss": 0.0439, + "num_input_tokens_seen": 162703984, + "step": 75395 + }, + { + "epoch": 12.300163132137031, + "grad_norm": 0.9209456443786621, + "learning_rate": 0.0003875244728280676, + "loss": 0.0278, + "num_input_tokens_seen": 162714608, + "step": 75400 + }, + { + "epoch": 12.300978792822185, + "grad_norm": 0.007306493353098631, + "learning_rate": 0.00038745511847960003, + "loss": 0.0133, + "num_input_tokens_seen": 162726928, + "step": 75405 + }, + { + "epoch": 12.301794453507341, + "grad_norm": 0.00247589242644608, + "learning_rate": 0.0003873857664119974, + "loss": 0.0744, + "num_input_tokens_seen": 162738736, + "step": 75410 + }, + { + "epoch": 12.302610114192497, + "grad_norm": 0.001267099636606872, + "learning_rate": 0.00038731641662666493, + "loss": 0.0252, + "num_input_tokens_seen": 162749360, + "step": 75415 + }, + { + "epoch": 12.30342577487765, + "grad_norm": 0.0035249129869043827, + "learning_rate": 0.00038724706912500847, + "loss": 0.0048, + "num_input_tokens_seen": 162759632, + "step": 75420 + }, + { + "epoch": 12.304241435562806, + "grad_norm": 0.026698678731918335, + "learning_rate": 0.0003871777239084329, + "loss": 0.0025, + "num_input_tokens_seen": 162770864, + "step": 75425 + }, + { + "epoch": 12.30505709624796, + "grad_norm": 0.006217367947101593, + "learning_rate": 0.00038710838097834414, + "loss": 0.0042, + "num_input_tokens_seen": 162781456, + "step": 75430 + }, + { + "epoch": 12.305872756933116, + "grad_norm": 0.14995378255844116, + "learning_rate": 0.000387039040336147, + "loss": 0.0068, + "num_input_tokens_seen": 162792528, + "step": 75435 + }, + { + "epoch": 12.30668841761827, + "grad_norm": 0.007958639413118362, + "learning_rate": 0.0003869697019832473, + "loss": 0.002, + "num_input_tokens_seen": 162803408, + "step": 75440 + }, + { + "epoch": 12.307504078303426, + "grad_norm": 0.013846036046743393, + "learning_rate": 0.0003869003659210497, + "loss": 0.0816, + "num_input_tokens_seen": 162814704, + "step": 75445 + }, + { + "epoch": 12.308319738988581, + "grad_norm": 0.022546209394931793, + "learning_rate": 0.00038683103215095965, + "loss": 0.0398, + "num_input_tokens_seen": 162825360, + "step": 75450 + }, + { + "epoch": 12.309135399673735, + "grad_norm": 0.004021415952593088, + "learning_rate": 0.00038676170067438256, + "loss": 0.0261, + "num_input_tokens_seen": 162836272, + "step": 75455 + }, + { + "epoch": 12.309951060358891, + "grad_norm": 0.0011568388435989618, + "learning_rate": 0.00038669237149272303, + "loss": 0.0082, + "num_input_tokens_seen": 162848240, + "step": 75460 + }, + { + "epoch": 12.310766721044045, + "grad_norm": 0.015037529170513153, + "learning_rate": 0.0003866230446073865, + "loss": 0.0026, + "num_input_tokens_seen": 162859472, + "step": 75465 + }, + { + "epoch": 12.3115823817292, + "grad_norm": 0.0018703470705077052, + "learning_rate": 0.0003865537200197776, + "loss": 0.0115, + "num_input_tokens_seen": 162870864, + "step": 75470 + }, + { + "epoch": 12.312398042414356, + "grad_norm": 0.0009740410023368895, + "learning_rate": 0.0003864843977313017, + "loss": 0.0027, + "num_input_tokens_seen": 162880528, + "step": 75475 + }, + { + "epoch": 12.31321370309951, + "grad_norm": 0.03229080140590668, + "learning_rate": 0.0003864150777433634, + "loss": 0.048, + "num_input_tokens_seen": 162891536, + "step": 75480 + }, + { + "epoch": 12.314029363784666, + "grad_norm": 0.021375438198447227, + "learning_rate": 0.0003863457600573676, + "loss": 0.0022, + "num_input_tokens_seen": 162901360, + "step": 75485 + }, + { + "epoch": 12.31484502446982, + "grad_norm": 0.00267455680295825, + "learning_rate": 0.00038627644467471915, + "loss": 0.0094, + "num_input_tokens_seen": 162912624, + "step": 75490 + }, + { + "epoch": 12.315660685154976, + "grad_norm": 1.2979854345321655, + "learning_rate": 0.00038620713159682286, + "loss": 0.0391, + "num_input_tokens_seen": 162922032, + "step": 75495 + }, + { + "epoch": 12.31647634584013, + "grad_norm": 0.010136134922504425, + "learning_rate": 0.0003861378208250834, + "loss": 0.007, + "num_input_tokens_seen": 162933488, + "step": 75500 + }, + { + "epoch": 12.317292006525285, + "grad_norm": 0.0064440625719726086, + "learning_rate": 0.00038606851236090543, + "loss": 0.0367, + "num_input_tokens_seen": 162944976, + "step": 75505 + }, + { + "epoch": 12.318107667210441, + "grad_norm": 0.0013581293169409037, + "learning_rate": 0.00038599920620569357, + "loss": 0.0177, + "num_input_tokens_seen": 162955664, + "step": 75510 + }, + { + "epoch": 12.318923327895595, + "grad_norm": 0.008059518411755562, + "learning_rate": 0.00038592990236085257, + "loss": 0.0025, + "num_input_tokens_seen": 162966768, + "step": 75515 + }, + { + "epoch": 12.31973898858075, + "grad_norm": 0.07571598887443542, + "learning_rate": 0.0003858606008277866, + "loss": 0.0047, + "num_input_tokens_seen": 162978192, + "step": 75520 + }, + { + "epoch": 12.320554649265905, + "grad_norm": 0.03406760096549988, + "learning_rate": 0.0003857913016079005, + "loss": 0.0025, + "num_input_tokens_seen": 162990160, + "step": 75525 + }, + { + "epoch": 12.32137030995106, + "grad_norm": 0.07794225960969925, + "learning_rate": 0.0003857220047025984, + "loss": 0.0961, + "num_input_tokens_seen": 163000848, + "step": 75530 + }, + { + "epoch": 12.322185970636216, + "grad_norm": 0.002615495352074504, + "learning_rate": 0.00038565271011328507, + "loss": 0.0969, + "num_input_tokens_seen": 163012048, + "step": 75535 + }, + { + "epoch": 12.32300163132137, + "grad_norm": 0.32327479124069214, + "learning_rate": 0.00038558341784136437, + "loss": 0.0097, + "num_input_tokens_seen": 163021712, + "step": 75540 + }, + { + "epoch": 12.323817292006526, + "grad_norm": 0.007242204621434212, + "learning_rate": 0.00038551412788824106, + "loss": 0.0018, + "num_input_tokens_seen": 163032496, + "step": 75545 + }, + { + "epoch": 12.32463295269168, + "grad_norm": 0.003345710225403309, + "learning_rate": 0.0003854448402553191, + "loss": 0.0013, + "num_input_tokens_seen": 163043504, + "step": 75550 + }, + { + "epoch": 12.325448613376835, + "grad_norm": 0.2878173589706421, + "learning_rate": 0.0003853755549440026, + "loss": 0.0177, + "num_input_tokens_seen": 163053520, + "step": 75555 + }, + { + "epoch": 12.326264274061991, + "grad_norm": 0.686016857624054, + "learning_rate": 0.0003853062719556962, + "loss": 0.0332, + "num_input_tokens_seen": 163065552, + "step": 75560 + }, + { + "epoch": 12.327079934747145, + "grad_norm": 0.002000858774408698, + "learning_rate": 0.0003852369912918035, + "loss": 0.0567, + "num_input_tokens_seen": 163076560, + "step": 75565 + }, + { + "epoch": 12.3278955954323, + "grad_norm": 0.005756770726293325, + "learning_rate": 0.00038516771295372894, + "loss": 0.0171, + "num_input_tokens_seen": 163087472, + "step": 75570 + }, + { + "epoch": 12.328711256117455, + "grad_norm": 0.0013972214655950665, + "learning_rate": 0.00038509843694287615, + "loss": 0.1517, + "num_input_tokens_seen": 163099120, + "step": 75575 + }, + { + "epoch": 12.32952691680261, + "grad_norm": 0.011590634472668171, + "learning_rate": 0.0003850291632606495, + "loss": 0.0045, + "num_input_tokens_seen": 163109456, + "step": 75580 + }, + { + "epoch": 12.330342577487766, + "grad_norm": 0.5593293309211731, + "learning_rate": 0.00038495989190845246, + "loss": 0.0916, + "num_input_tokens_seen": 163120816, + "step": 75585 + }, + { + "epoch": 12.33115823817292, + "grad_norm": 0.09237557649612427, + "learning_rate": 0.00038489062288768944, + "loss": 0.0163, + "num_input_tokens_seen": 163129456, + "step": 75590 + }, + { + "epoch": 12.331973898858076, + "grad_norm": 0.0006429158383980393, + "learning_rate": 0.00038482135619976373, + "loss": 0.055, + "num_input_tokens_seen": 163140336, + "step": 75595 + }, + { + "epoch": 12.33278955954323, + "grad_norm": 0.022485945373773575, + "learning_rate": 0.0003847520918460795, + "loss": 0.0062, + "num_input_tokens_seen": 163151504, + "step": 75600 + }, + { + "epoch": 12.333605220228385, + "grad_norm": 0.9043557643890381, + "learning_rate": 0.00038468282982804023, + "loss": 0.0473, + "num_input_tokens_seen": 163161648, + "step": 75605 + }, + { + "epoch": 12.33442088091354, + "grad_norm": 0.4864259958267212, + "learning_rate": 0.00038461357014704986, + "loss": 0.0294, + "num_input_tokens_seen": 163172464, + "step": 75610 + }, + { + "epoch": 12.335236541598695, + "grad_norm": 0.03415726497769356, + "learning_rate": 0.00038454431280451163, + "loss": 0.0132, + "num_input_tokens_seen": 163182800, + "step": 75615 + }, + { + "epoch": 12.33605220228385, + "grad_norm": 0.25134116411209106, + "learning_rate": 0.00038447505780182963, + "loss": 0.0079, + "num_input_tokens_seen": 163193008, + "step": 75620 + }, + { + "epoch": 12.336867862969005, + "grad_norm": 0.14140041172504425, + "learning_rate": 0.0003844058051404069, + "loss": 0.0366, + "num_input_tokens_seen": 163202128, + "step": 75625 + }, + { + "epoch": 12.33768352365416, + "grad_norm": 0.05168415233492851, + "learning_rate": 0.00038433655482164727, + "loss": 0.006, + "num_input_tokens_seen": 163213424, + "step": 75630 + }, + { + "epoch": 12.338499184339314, + "grad_norm": 0.3946409821510315, + "learning_rate": 0.0003842673068469541, + "loss": 0.1503, + "num_input_tokens_seen": 163222096, + "step": 75635 + }, + { + "epoch": 12.33931484502447, + "grad_norm": 0.008403047919273376, + "learning_rate": 0.0003841980612177308, + "loss": 0.0661, + "num_input_tokens_seen": 163232336, + "step": 75640 + }, + { + "epoch": 12.340130505709626, + "grad_norm": 0.0053142705000936985, + "learning_rate": 0.00038412881793538063, + "loss": 0.0055, + "num_input_tokens_seen": 163243024, + "step": 75645 + }, + { + "epoch": 12.34094616639478, + "grad_norm": 0.0010392653057351708, + "learning_rate": 0.000384059577001307, + "loss": 0.0221, + "num_input_tokens_seen": 163253264, + "step": 75650 + }, + { + "epoch": 12.341761827079935, + "grad_norm": 0.26996925473213196, + "learning_rate": 0.000383990338416913, + "loss": 0.0169, + "num_input_tokens_seen": 163262768, + "step": 75655 + }, + { + "epoch": 12.34257748776509, + "grad_norm": 0.07632311433553696, + "learning_rate": 0.00038392110218360203, + "loss": 0.0172, + "num_input_tokens_seen": 163274992, + "step": 75660 + }, + { + "epoch": 12.343393148450245, + "grad_norm": 0.0031375945545732975, + "learning_rate": 0.0003838518683027772, + "loss": 0.0167, + "num_input_tokens_seen": 163285584, + "step": 75665 + }, + { + "epoch": 12.3442088091354, + "grad_norm": 0.01576031744480133, + "learning_rate": 0.0003837826367758417, + "loss": 0.1511, + "num_input_tokens_seen": 163296528, + "step": 75670 + }, + { + "epoch": 12.345024469820554, + "grad_norm": 0.0014721389161422849, + "learning_rate": 0.0003837134076041984, + "loss": 0.0046, + "num_input_tokens_seen": 163306768, + "step": 75675 + }, + { + "epoch": 12.34584013050571, + "grad_norm": 0.04493863508105278, + "learning_rate": 0.00038364418078925037, + "loss": 0.0104, + "num_input_tokens_seen": 163317744, + "step": 75680 + }, + { + "epoch": 12.346655791190864, + "grad_norm": 0.0019607397262007, + "learning_rate": 0.0003835749563324008, + "loss": 0.0268, + "num_input_tokens_seen": 163329584, + "step": 75685 + }, + { + "epoch": 12.34747145187602, + "grad_norm": 0.015190725214779377, + "learning_rate": 0.0003835057342350522, + "loss": 0.0088, + "num_input_tokens_seen": 163339344, + "step": 75690 + }, + { + "epoch": 12.348287112561174, + "grad_norm": 0.014669899828732014, + "learning_rate": 0.0003834365144986079, + "loss": 0.1518, + "num_input_tokens_seen": 163350096, + "step": 75695 + }, + { + "epoch": 12.34910277324633, + "grad_norm": 0.0038739086594432592, + "learning_rate": 0.00038336729712447034, + "loss": 0.0906, + "num_input_tokens_seen": 163359952, + "step": 75700 + }, + { + "epoch": 12.349918433931485, + "grad_norm": 0.07513081282377243, + "learning_rate": 0.0003832980821140426, + "loss": 0.022, + "num_input_tokens_seen": 163369776, + "step": 75705 + }, + { + "epoch": 12.350734094616639, + "grad_norm": 0.00690740579739213, + "learning_rate": 0.00038322886946872716, + "loss": 0.0194, + "num_input_tokens_seen": 163380688, + "step": 75710 + }, + { + "epoch": 12.351549755301795, + "grad_norm": 0.0011113430373370647, + "learning_rate": 0.000383159659189927, + "loss": 0.0033, + "num_input_tokens_seen": 163391920, + "step": 75715 + }, + { + "epoch": 12.352365415986949, + "grad_norm": 0.15539173781871796, + "learning_rate": 0.0003830904512790443, + "loss": 0.0566, + "num_input_tokens_seen": 163403728, + "step": 75720 + }, + { + "epoch": 12.353181076672104, + "grad_norm": 0.007026641163975, + "learning_rate": 0.0003830212457374821, + "loss": 0.0108, + "num_input_tokens_seen": 163414416, + "step": 75725 + }, + { + "epoch": 12.35399673735726, + "grad_norm": 0.0020779957994818687, + "learning_rate": 0.00038295204256664264, + "loss": 0.1047, + "num_input_tokens_seen": 163425968, + "step": 75730 + }, + { + "epoch": 12.354812398042414, + "grad_norm": 0.6445653438568115, + "learning_rate": 0.00038288284176792866, + "loss": 0.079, + "num_input_tokens_seen": 163434960, + "step": 75735 + }, + { + "epoch": 12.35562805872757, + "grad_norm": 0.022767413407564163, + "learning_rate": 0.0003828136433427423, + "loss": 0.0063, + "num_input_tokens_seen": 163446352, + "step": 75740 + }, + { + "epoch": 12.356443719412724, + "grad_norm": 0.0182089451700449, + "learning_rate": 0.00038274444729248633, + "loss": 0.0331, + "num_input_tokens_seen": 163457104, + "step": 75745 + }, + { + "epoch": 12.35725938009788, + "grad_norm": 0.014521656557917595, + "learning_rate": 0.00038267525361856264, + "loss": 0.0104, + "num_input_tokens_seen": 163468304, + "step": 75750 + }, + { + "epoch": 12.358075040783035, + "grad_norm": 0.11738842725753784, + "learning_rate": 0.000382606062322374, + "loss": 0.048, + "num_input_tokens_seen": 163480368, + "step": 75755 + }, + { + "epoch": 12.358890701468189, + "grad_norm": 0.008169502019882202, + "learning_rate": 0.00038253687340532224, + "loss": 0.004, + "num_input_tokens_seen": 163490448, + "step": 75760 + }, + { + "epoch": 12.359706362153345, + "grad_norm": 0.0011876021744683385, + "learning_rate": 0.0003824676868688097, + "loss": 0.003, + "num_input_tokens_seen": 163501072, + "step": 75765 + }, + { + "epoch": 12.360522022838499, + "grad_norm": 0.0051433932967484, + "learning_rate": 0.0003823985027142389, + "loss": 0.0078, + "num_input_tokens_seen": 163511120, + "step": 75770 + }, + { + "epoch": 12.361337683523654, + "grad_norm": 0.0320395864546299, + "learning_rate": 0.0003823293209430113, + "loss": 0.0084, + "num_input_tokens_seen": 163521008, + "step": 75775 + }, + { + "epoch": 12.362153344208808, + "grad_norm": 0.0240180641412735, + "learning_rate": 0.00038226014155652956, + "loss": 0.0124, + "num_input_tokens_seen": 163531856, + "step": 75780 + }, + { + "epoch": 12.362969004893964, + "grad_norm": 0.03330185264348984, + "learning_rate": 0.0003821909645561952, + "loss": 0.0066, + "num_input_tokens_seen": 163542960, + "step": 75785 + }, + { + "epoch": 12.36378466557912, + "grad_norm": 0.004346244502812624, + "learning_rate": 0.0003821217899434106, + "loss": 0.0046, + "num_input_tokens_seen": 163552848, + "step": 75790 + }, + { + "epoch": 12.364600326264274, + "grad_norm": 0.008247468620538712, + "learning_rate": 0.0003820526177195772, + "loss": 0.0039, + "num_input_tokens_seen": 163563952, + "step": 75795 + }, + { + "epoch": 12.36541598694943, + "grad_norm": 0.003264525206759572, + "learning_rate": 0.00038198344788609737, + "loss": 0.0052, + "num_input_tokens_seen": 163573904, + "step": 75800 + }, + { + "epoch": 12.366231647634583, + "grad_norm": 0.005881859455257654, + "learning_rate": 0.0003819142804443726, + "loss": 0.0032, + "num_input_tokens_seen": 163584624, + "step": 75805 + }, + { + "epoch": 12.367047308319739, + "grad_norm": 0.6988226175308228, + "learning_rate": 0.0003818451153958047, + "loss": 0.0612, + "num_input_tokens_seen": 163596016, + "step": 75810 + }, + { + "epoch": 12.367862969004895, + "grad_norm": 0.0869535431265831, + "learning_rate": 0.0003817759527417955, + "loss": 0.034, + "num_input_tokens_seen": 163607344, + "step": 75815 + }, + { + "epoch": 12.368678629690049, + "grad_norm": 0.18890467286109924, + "learning_rate": 0.00038170679248374653, + "loss": 0.1653, + "num_input_tokens_seen": 163617936, + "step": 75820 + }, + { + "epoch": 12.369494290375204, + "grad_norm": 0.05017193406820297, + "learning_rate": 0.00038163763462305944, + "loss": 0.0047, + "num_input_tokens_seen": 163628400, + "step": 75825 + }, + { + "epoch": 12.370309951060358, + "grad_norm": 0.0693901851773262, + "learning_rate": 0.000381568479161136, + "loss": 0.0089, + "num_input_tokens_seen": 163638704, + "step": 75830 + }, + { + "epoch": 12.371125611745514, + "grad_norm": 0.0050386288203299046, + "learning_rate": 0.00038149932609937736, + "loss": 0.0183, + "num_input_tokens_seen": 163650128, + "step": 75835 + }, + { + "epoch": 12.37194127243067, + "grad_norm": 0.001376612694002688, + "learning_rate": 0.00038143017543918546, + "loss": 0.0092, + "num_input_tokens_seen": 163660592, + "step": 75840 + }, + { + "epoch": 12.372756933115824, + "grad_norm": 0.11152004450559616, + "learning_rate": 0.0003813610271819612, + "loss": 0.0149, + "num_input_tokens_seen": 163670000, + "step": 75845 + }, + { + "epoch": 12.37357259380098, + "grad_norm": 0.007272131275385618, + "learning_rate": 0.00038129188132910645, + "loss": 0.0042, + "num_input_tokens_seen": 163680336, + "step": 75850 + }, + { + "epoch": 12.374388254486133, + "grad_norm": 0.006919977255165577, + "learning_rate": 0.00038122273788202216, + "loss": 0.1306, + "num_input_tokens_seen": 163690224, + "step": 75855 + }, + { + "epoch": 12.375203915171289, + "grad_norm": 0.02216629683971405, + "learning_rate": 0.00038115359684210993, + "loss": 0.004, + "num_input_tokens_seen": 163700528, + "step": 75860 + }, + { + "epoch": 12.376019575856443, + "grad_norm": 0.017407752573490143, + "learning_rate": 0.00038108445821077066, + "loss": 0.0046, + "num_input_tokens_seen": 163711696, + "step": 75865 + }, + { + "epoch": 12.376835236541599, + "grad_norm": 0.6947182416915894, + "learning_rate": 0.00038101532198940563, + "loss": 0.0509, + "num_input_tokens_seen": 163722640, + "step": 75870 + }, + { + "epoch": 12.377650897226754, + "grad_norm": 0.004892442375421524, + "learning_rate": 0.0003809461881794163, + "loss": 0.0041, + "num_input_tokens_seen": 163732400, + "step": 75875 + }, + { + "epoch": 12.378466557911908, + "grad_norm": 0.3961011469364166, + "learning_rate": 0.0003808770567822033, + "loss": 0.09, + "num_input_tokens_seen": 163742128, + "step": 75880 + }, + { + "epoch": 12.379282218597064, + "grad_norm": 0.634158730506897, + "learning_rate": 0.000380807927799168, + "loss": 0.1006, + "num_input_tokens_seen": 163752144, + "step": 75885 + }, + { + "epoch": 12.380097879282218, + "grad_norm": 0.018228819593787193, + "learning_rate": 0.0003807388012317111, + "loss": 0.0124, + "num_input_tokens_seen": 163762224, + "step": 75890 + }, + { + "epoch": 12.380913539967374, + "grad_norm": 0.03738921135663986, + "learning_rate": 0.0003806696770812339, + "loss": 0.0219, + "num_input_tokens_seen": 163773584, + "step": 75895 + }, + { + "epoch": 12.38172920065253, + "grad_norm": 0.00782018806785345, + "learning_rate": 0.00038060055534913683, + "loss": 0.0557, + "num_input_tokens_seen": 163785264, + "step": 75900 + }, + { + "epoch": 12.382544861337683, + "grad_norm": 0.006975534372031689, + "learning_rate": 0.0003805314360368212, + "loss": 0.0105, + "num_input_tokens_seen": 163795312, + "step": 75905 + }, + { + "epoch": 12.383360522022839, + "grad_norm": 0.007874327711760998, + "learning_rate": 0.0003804623191456874, + "loss": 0.0111, + "num_input_tokens_seen": 163806064, + "step": 75910 + }, + { + "epoch": 12.384176182707993, + "grad_norm": 0.38060954213142395, + "learning_rate": 0.00038039320467713654, + "loss": 0.0692, + "num_input_tokens_seen": 163816016, + "step": 75915 + }, + { + "epoch": 12.384991843393149, + "grad_norm": 0.0023655106779187918, + "learning_rate": 0.0003803240926325689, + "loss": 0.0102, + "num_input_tokens_seen": 163826704, + "step": 75920 + }, + { + "epoch": 12.385807504078304, + "grad_norm": 0.026538284495472908, + "learning_rate": 0.00038025498301338554, + "loss": 0.0253, + "num_input_tokens_seen": 163838064, + "step": 75925 + }, + { + "epoch": 12.386623164763458, + "grad_norm": 0.008674153126776218, + "learning_rate": 0.00038018587582098665, + "loss": 0.0275, + "num_input_tokens_seen": 163848592, + "step": 75930 + }, + { + "epoch": 12.387438825448614, + "grad_norm": 0.07710882276296616, + "learning_rate": 0.0003801167710567731, + "loss": 0.0482, + "num_input_tokens_seen": 163858704, + "step": 75935 + }, + { + "epoch": 12.388254486133768, + "grad_norm": 0.025644633919000626, + "learning_rate": 0.00038004766872214526, + "loss": 0.0059, + "num_input_tokens_seen": 163868816, + "step": 75940 + }, + { + "epoch": 12.389070146818923, + "grad_norm": 0.012728697620332241, + "learning_rate": 0.0003799785688185036, + "loss": 0.0122, + "num_input_tokens_seen": 163878960, + "step": 75945 + }, + { + "epoch": 12.38988580750408, + "grad_norm": 0.0014373852172866464, + "learning_rate": 0.00037990947134724845, + "loss": 0.0189, + "num_input_tokens_seen": 163889424, + "step": 75950 + }, + { + "epoch": 12.390701468189233, + "grad_norm": 0.005364995915442705, + "learning_rate": 0.00037984037630978026, + "loss": 0.0063, + "num_input_tokens_seen": 163899920, + "step": 75955 + }, + { + "epoch": 12.391517128874389, + "grad_norm": 0.0037990352138876915, + "learning_rate": 0.00037977128370749916, + "loss": 0.0036, + "num_input_tokens_seen": 163910704, + "step": 75960 + }, + { + "epoch": 12.392332789559543, + "grad_norm": 0.0033042628783732653, + "learning_rate": 0.00037970219354180573, + "loss": 0.0122, + "num_input_tokens_seen": 163923312, + "step": 75965 + }, + { + "epoch": 12.393148450244698, + "grad_norm": 0.05126928165555, + "learning_rate": 0.0003796331058140997, + "loss": 0.0035, + "num_input_tokens_seen": 163933776, + "step": 75970 + }, + { + "epoch": 12.393964110929852, + "grad_norm": 0.0015858504921197891, + "learning_rate": 0.00037956402052578164, + "loss": 0.092, + "num_input_tokens_seen": 163944432, + "step": 75975 + }, + { + "epoch": 12.394779771615008, + "grad_norm": 0.3597656786441803, + "learning_rate": 0.0003794949376782515, + "loss": 0.032, + "num_input_tokens_seen": 163954736, + "step": 75980 + }, + { + "epoch": 12.395595432300164, + "grad_norm": 0.003377850167453289, + "learning_rate": 0.00037942585727290926, + "loss": 0.111, + "num_input_tokens_seen": 163965104, + "step": 75985 + }, + { + "epoch": 12.396411092985318, + "grad_norm": 0.0036825151182711124, + "learning_rate": 0.000379356779311155, + "loss": 0.0374, + "num_input_tokens_seen": 163975728, + "step": 75990 + }, + { + "epoch": 12.397226753670473, + "grad_norm": 0.01512024737894535, + "learning_rate": 0.0003792877037943886, + "loss": 0.0064, + "num_input_tokens_seen": 163987600, + "step": 75995 + }, + { + "epoch": 12.398042414355627, + "grad_norm": 0.001682682428508997, + "learning_rate": 0.0003792186307240102, + "loss": 0.0185, + "num_input_tokens_seen": 163999248, + "step": 76000 + }, + { + "epoch": 12.398858075040783, + "grad_norm": 0.016918683424592018, + "learning_rate": 0.0003791495601014192, + "loss": 0.0265, + "num_input_tokens_seen": 164008848, + "step": 76005 + }, + { + "epoch": 12.399673735725939, + "grad_norm": 0.005173893645405769, + "learning_rate": 0.00037908049192801596, + "loss": 0.0024, + "num_input_tokens_seen": 164019920, + "step": 76010 + }, + { + "epoch": 12.400489396411093, + "grad_norm": 0.21864071488380432, + "learning_rate": 0.00037901142620519967, + "loss": 0.0198, + "num_input_tokens_seen": 164031632, + "step": 76015 + }, + { + "epoch": 12.401305057096248, + "grad_norm": 0.014110148884356022, + "learning_rate": 0.00037894236293437055, + "loss": 0.0076, + "num_input_tokens_seen": 164042544, + "step": 76020 + }, + { + "epoch": 12.402120717781402, + "grad_norm": 0.005355931352823973, + "learning_rate": 0.00037887330211692783, + "loss": 0.0027, + "num_input_tokens_seen": 164054384, + "step": 76025 + }, + { + "epoch": 12.402936378466558, + "grad_norm": 0.025463759899139404, + "learning_rate": 0.00037880424375427154, + "loss": 0.0117, + "num_input_tokens_seen": 164064208, + "step": 76030 + }, + { + "epoch": 12.403752039151712, + "grad_norm": 0.12928223609924316, + "learning_rate": 0.00037873518784780074, + "loss": 0.0161, + "num_input_tokens_seen": 164074576, + "step": 76035 + }, + { + "epoch": 12.404567699836868, + "grad_norm": 0.0026917222421616316, + "learning_rate": 0.0003786661343989154, + "loss": 0.0156, + "num_input_tokens_seen": 164084720, + "step": 76040 + }, + { + "epoch": 12.405383360522023, + "grad_norm": 0.009686323814094067, + "learning_rate": 0.00037859708340901455, + "loss": 0.1265, + "num_input_tokens_seen": 164095664, + "step": 76045 + }, + { + "epoch": 12.406199021207177, + "grad_norm": 0.008457069285213947, + "learning_rate": 0.00037852803487949804, + "loss": 0.0199, + "num_input_tokens_seen": 164106864, + "step": 76050 + }, + { + "epoch": 12.407014681892333, + "grad_norm": 0.00475624855607748, + "learning_rate": 0.0003784589888117648, + "loss": 0.0296, + "num_input_tokens_seen": 164118096, + "step": 76055 + }, + { + "epoch": 12.407830342577487, + "grad_norm": 0.22005529701709747, + "learning_rate": 0.0003783899452072146, + "loss": 0.0201, + "num_input_tokens_seen": 164128912, + "step": 76060 + }, + { + "epoch": 12.408646003262643, + "grad_norm": 0.11581244319677353, + "learning_rate": 0.00037832090406724617, + "loss": 0.0054, + "num_input_tokens_seen": 164139856, + "step": 76065 + }, + { + "epoch": 12.409461663947798, + "grad_norm": 0.021922726184129715, + "learning_rate": 0.0003782518653932592, + "loss": 0.0434, + "num_input_tokens_seen": 164152016, + "step": 76070 + }, + { + "epoch": 12.410277324632952, + "grad_norm": 0.06680570542812347, + "learning_rate": 0.00037818282918665236, + "loss": 0.0188, + "num_input_tokens_seen": 164162512, + "step": 76075 + }, + { + "epoch": 12.411092985318108, + "grad_norm": 0.006358086597174406, + "learning_rate": 0.0003781137954488251, + "loss": 0.0021, + "num_input_tokens_seen": 164173232, + "step": 76080 + }, + { + "epoch": 12.411908646003262, + "grad_norm": 0.005959421396255493, + "learning_rate": 0.0003780447641811766, + "loss": 0.0054, + "num_input_tokens_seen": 164183408, + "step": 76085 + }, + { + "epoch": 12.412724306688418, + "grad_norm": 0.14587253332138062, + "learning_rate": 0.0003779757353851054, + "loss": 0.0152, + "num_input_tokens_seen": 164195120, + "step": 76090 + }, + { + "epoch": 12.413539967373573, + "grad_norm": 0.22377270460128784, + "learning_rate": 0.000377906709062011, + "loss": 0.1256, + "num_input_tokens_seen": 164206288, + "step": 76095 + }, + { + "epoch": 12.414355628058727, + "grad_norm": 0.17617537081241608, + "learning_rate": 0.00037783768521329177, + "loss": 0.0132, + "num_input_tokens_seen": 164216752, + "step": 76100 + }, + { + "epoch": 12.415171288743883, + "grad_norm": 0.018752476200461388, + "learning_rate": 0.0003777686638403469, + "loss": 0.0437, + "num_input_tokens_seen": 164228432, + "step": 76105 + }, + { + "epoch": 12.415986949429037, + "grad_norm": 0.0029605648014694452, + "learning_rate": 0.0003776996449445752, + "loss": 0.0205, + "num_input_tokens_seen": 164239216, + "step": 76110 + }, + { + "epoch": 12.416802610114193, + "grad_norm": 0.12881970405578613, + "learning_rate": 0.0003776306285273753, + "loss": 0.0157, + "num_input_tokens_seen": 164249424, + "step": 76115 + }, + { + "epoch": 12.417618270799348, + "grad_norm": 0.17530831694602966, + "learning_rate": 0.0003775616145901459, + "loss": 0.0188, + "num_input_tokens_seen": 164260944, + "step": 76120 + }, + { + "epoch": 12.418433931484502, + "grad_norm": 0.011943010613322258, + "learning_rate": 0.0003774926031342858, + "loss": 0.0036, + "num_input_tokens_seen": 164271408, + "step": 76125 + }, + { + "epoch": 12.419249592169658, + "grad_norm": 0.003212936455383897, + "learning_rate": 0.0003774235941611934, + "loss": 0.0078, + "num_input_tokens_seen": 164282064, + "step": 76130 + }, + { + "epoch": 12.420065252854812, + "grad_norm": 0.11782728880643845, + "learning_rate": 0.0003773545876722675, + "loss": 0.0076, + "num_input_tokens_seen": 164293520, + "step": 76135 + }, + { + "epoch": 12.420880913539968, + "grad_norm": 0.006480608135461807, + "learning_rate": 0.00037728558366890633, + "loss": 0.004, + "num_input_tokens_seen": 164304560, + "step": 76140 + }, + { + "epoch": 12.421696574225122, + "grad_norm": 0.0745435506105423, + "learning_rate": 0.00037721658215250864, + "loss": 0.0458, + "num_input_tokens_seen": 164314288, + "step": 76145 + }, + { + "epoch": 12.422512234910277, + "grad_norm": 0.02613998018205166, + "learning_rate": 0.00037714758312447247, + "loss": 0.0063, + "num_input_tokens_seen": 164324688, + "step": 76150 + }, + { + "epoch": 12.423327895595433, + "grad_norm": 0.3094393014907837, + "learning_rate": 0.0003770785865861966, + "loss": 0.0269, + "num_input_tokens_seen": 164336304, + "step": 76155 + }, + { + "epoch": 12.424143556280587, + "grad_norm": 0.0864882618188858, + "learning_rate": 0.0003770095925390789, + "loss": 0.0066, + "num_input_tokens_seen": 164347248, + "step": 76160 + }, + { + "epoch": 12.424959216965743, + "grad_norm": 0.06511864066123962, + "learning_rate": 0.000376940600984518, + "loss": 0.025, + "num_input_tokens_seen": 164357136, + "step": 76165 + }, + { + "epoch": 12.425774877650896, + "grad_norm": 0.000815055042039603, + "learning_rate": 0.0003768716119239118, + "loss": 0.0014, + "num_input_tokens_seen": 164368112, + "step": 76170 + }, + { + "epoch": 12.426590538336052, + "grad_norm": 0.15183547139167786, + "learning_rate": 0.0003768026253586587, + "loss": 0.0085, + "num_input_tokens_seen": 164379376, + "step": 76175 + }, + { + "epoch": 12.427406199021208, + "grad_norm": 0.0025211479514837265, + "learning_rate": 0.00037673364129015653, + "loss": 0.0418, + "num_input_tokens_seen": 164390896, + "step": 76180 + }, + { + "epoch": 12.428221859706362, + "grad_norm": 0.0057022240944206715, + "learning_rate": 0.0003766646597198037, + "loss": 0.1301, + "num_input_tokens_seen": 164402480, + "step": 76185 + }, + { + "epoch": 12.429037520391518, + "grad_norm": 0.02116597630083561, + "learning_rate": 0.0003765956806489978, + "loss": 0.0159, + "num_input_tokens_seen": 164414224, + "step": 76190 + }, + { + "epoch": 12.429853181076671, + "grad_norm": 0.02898281253874302, + "learning_rate": 0.00037652670407913697, + "loss": 0.0077, + "num_input_tokens_seen": 164425072, + "step": 76195 + }, + { + "epoch": 12.430668841761827, + "grad_norm": 0.001888060593046248, + "learning_rate": 0.00037645773001161937, + "loss": 0.0054, + "num_input_tokens_seen": 164436400, + "step": 76200 + }, + { + "epoch": 12.431484502446983, + "grad_norm": 0.006914603523910046, + "learning_rate": 0.0003763887584478423, + "loss": 0.005, + "num_input_tokens_seen": 164446544, + "step": 76205 + }, + { + "epoch": 12.432300163132137, + "grad_norm": 0.08744195103645325, + "learning_rate": 0.00037631978938920414, + "loss": 0.048, + "num_input_tokens_seen": 164457680, + "step": 76210 + }, + { + "epoch": 12.433115823817293, + "grad_norm": 0.0022321208380162716, + "learning_rate": 0.0003762508228371021, + "loss": 0.0079, + "num_input_tokens_seen": 164467824, + "step": 76215 + }, + { + "epoch": 12.433931484502446, + "grad_norm": 0.004156464710831642, + "learning_rate": 0.0003761818587929344, + "loss": 0.0075, + "num_input_tokens_seen": 164478224, + "step": 76220 + }, + { + "epoch": 12.434747145187602, + "grad_norm": 0.008057629689574242, + "learning_rate": 0.0003761128972580981, + "loss": 0.0206, + "num_input_tokens_seen": 164489616, + "step": 76225 + }, + { + "epoch": 12.435562805872756, + "grad_norm": 0.00927797332406044, + "learning_rate": 0.00037604393823399137, + "loss": 0.0821, + "num_input_tokens_seen": 164500144, + "step": 76230 + }, + { + "epoch": 12.436378466557912, + "grad_norm": 0.06778733432292938, + "learning_rate": 0.00037597498172201125, + "loss": 0.0074, + "num_input_tokens_seen": 164510800, + "step": 76235 + }, + { + "epoch": 12.437194127243067, + "grad_norm": 0.42901062965393066, + "learning_rate": 0.0003759060277235556, + "loss": 0.037, + "num_input_tokens_seen": 164523120, + "step": 76240 + }, + { + "epoch": 12.438009787928221, + "grad_norm": 0.014346163719892502, + "learning_rate": 0.00037583707624002163, + "loss": 0.121, + "num_input_tokens_seen": 164534288, + "step": 76245 + }, + { + "epoch": 12.438825448613377, + "grad_norm": 0.002474126871675253, + "learning_rate": 0.00037576812727280683, + "loss": 0.0227, + "num_input_tokens_seen": 164546480, + "step": 76250 + }, + { + "epoch": 12.439641109298531, + "grad_norm": 0.0008122525177896023, + "learning_rate": 0.0003756991808233086, + "loss": 0.0047, + "num_input_tokens_seen": 164557264, + "step": 76255 + }, + { + "epoch": 12.440456769983687, + "grad_norm": 0.005596327129751444, + "learning_rate": 0.0003756302368929241, + "loss": 0.0094, + "num_input_tokens_seen": 164568848, + "step": 76260 + }, + { + "epoch": 12.441272430668842, + "grad_norm": 0.08841634541749954, + "learning_rate": 0.00037556129548305074, + "loss": 0.0774, + "num_input_tokens_seen": 164579920, + "step": 76265 + }, + { + "epoch": 12.442088091353996, + "grad_norm": 0.0028384809847921133, + "learning_rate": 0.0003754923565950855, + "loss": 0.0224, + "num_input_tokens_seen": 164590416, + "step": 76270 + }, + { + "epoch": 12.442903752039152, + "grad_norm": 0.004437841475009918, + "learning_rate": 0.0003754234202304255, + "loss": 0.047, + "num_input_tokens_seen": 164599312, + "step": 76275 + }, + { + "epoch": 12.443719412724306, + "grad_norm": 0.027691485360264778, + "learning_rate": 0.00037535448639046816, + "loss": 0.0068, + "num_input_tokens_seen": 164610448, + "step": 76280 + }, + { + "epoch": 12.444535073409462, + "grad_norm": 0.003333278466016054, + "learning_rate": 0.00037528555507661, + "loss": 0.0024, + "num_input_tokens_seen": 164622448, + "step": 76285 + }, + { + "epoch": 12.445350734094617, + "grad_norm": 0.17494985461235046, + "learning_rate": 0.00037521662629024855, + "loss": 0.0201, + "num_input_tokens_seen": 164632432, + "step": 76290 + }, + { + "epoch": 12.446166394779771, + "grad_norm": 0.0583336316049099, + "learning_rate": 0.00037514770003278027, + "loss": 0.0685, + "num_input_tokens_seen": 164641744, + "step": 76295 + }, + { + "epoch": 12.446982055464927, + "grad_norm": 0.004313532263040543, + "learning_rate": 0.00037507877630560215, + "loss": 0.0041, + "num_input_tokens_seen": 164652816, + "step": 76300 + }, + { + "epoch": 12.447797716150081, + "grad_norm": 0.001048357575200498, + "learning_rate": 0.00037500985511011145, + "loss": 0.079, + "num_input_tokens_seen": 164662992, + "step": 76305 + }, + { + "epoch": 12.448613376835237, + "grad_norm": 0.08319035917520523, + "learning_rate": 0.00037494093644770425, + "loss": 0.0119, + "num_input_tokens_seen": 164674288, + "step": 76310 + }, + { + "epoch": 12.449429037520392, + "grad_norm": 0.010450982488691807, + "learning_rate": 0.000374872020319778, + "loss": 0.005, + "num_input_tokens_seen": 164686064, + "step": 76315 + }, + { + "epoch": 12.450244698205546, + "grad_norm": 0.2618831396102905, + "learning_rate": 0.0003748031067277286, + "loss": 0.0165, + "num_input_tokens_seen": 164695472, + "step": 76320 + }, + { + "epoch": 12.451060358890702, + "grad_norm": 0.10925282537937164, + "learning_rate": 0.00037473419567295337, + "loss": 0.0132, + "num_input_tokens_seen": 164706704, + "step": 76325 + }, + { + "epoch": 12.451876019575856, + "grad_norm": 0.23101277649402618, + "learning_rate": 0.0003746652871568483, + "loss": 0.0175, + "num_input_tokens_seen": 164718288, + "step": 76330 + }, + { + "epoch": 12.452691680261012, + "grad_norm": 0.003719218308106065, + "learning_rate": 0.0003745963811808105, + "loss": 0.004, + "num_input_tokens_seen": 164728496, + "step": 76335 + }, + { + "epoch": 12.453507340946166, + "grad_norm": 0.03532201051712036, + "learning_rate": 0.00037452747774623584, + "loss": 0.0211, + "num_input_tokens_seen": 164738768, + "step": 76340 + }, + { + "epoch": 12.454323001631321, + "grad_norm": 0.007808144204318523, + "learning_rate": 0.0003744585768545212, + "loss": 0.0034, + "num_input_tokens_seen": 164750224, + "step": 76345 + }, + { + "epoch": 12.455138662316477, + "grad_norm": 0.01583998277783394, + "learning_rate": 0.00037438967850706264, + "loss": 0.0627, + "num_input_tokens_seen": 164762096, + "step": 76350 + }, + { + "epoch": 12.455954323001631, + "grad_norm": 0.0034101479686796665, + "learning_rate": 0.0003743207827052567, + "loss": 0.0051, + "num_input_tokens_seen": 164773424, + "step": 76355 + }, + { + "epoch": 12.456769983686787, + "grad_norm": 0.03809216246008873, + "learning_rate": 0.0003742518894504994, + "loss": 0.012, + "num_input_tokens_seen": 164784496, + "step": 76360 + }, + { + "epoch": 12.45758564437194, + "grad_norm": 0.0053803520277142525, + "learning_rate": 0.00037418299874418726, + "loss": 0.0063, + "num_input_tokens_seen": 164796752, + "step": 76365 + }, + { + "epoch": 12.458401305057096, + "grad_norm": 0.0026612922083586454, + "learning_rate": 0.00037411411058771606, + "loss": 0.0037, + "num_input_tokens_seen": 164807696, + "step": 76370 + }, + { + "epoch": 12.459216965742252, + "grad_norm": 0.00023077457444742322, + "learning_rate": 0.00037404522498248234, + "loss": 0.0327, + "num_input_tokens_seen": 164818064, + "step": 76375 + }, + { + "epoch": 12.460032626427406, + "grad_norm": 0.008437009528279305, + "learning_rate": 0.0003739763419298817, + "loss": 0.0076, + "num_input_tokens_seen": 164827504, + "step": 76380 + }, + { + "epoch": 12.460848287112562, + "grad_norm": 0.02583560161292553, + "learning_rate": 0.0003739074614313105, + "loss": 0.0039, + "num_input_tokens_seen": 164838992, + "step": 76385 + }, + { + "epoch": 12.461663947797716, + "grad_norm": 0.22198903560638428, + "learning_rate": 0.00037383858348816445, + "loss": 0.0204, + "num_input_tokens_seen": 164849136, + "step": 76390 + }, + { + "epoch": 12.462479608482871, + "grad_norm": 0.004512605257332325, + "learning_rate": 0.0003737697081018396, + "loss": 0.0064, + "num_input_tokens_seen": 164860048, + "step": 76395 + }, + { + "epoch": 12.463295269168025, + "grad_norm": 0.0038456832990050316, + "learning_rate": 0.0003737008352737318, + "loss": 0.0911, + "num_input_tokens_seen": 164871536, + "step": 76400 + }, + { + "epoch": 12.464110929853181, + "grad_norm": 0.28788241744041443, + "learning_rate": 0.0003736319650052366, + "loss": 0.0335, + "num_input_tokens_seen": 164881808, + "step": 76405 + }, + { + "epoch": 12.464926590538337, + "grad_norm": 0.05041753873229027, + "learning_rate": 0.0003735630972977502, + "loss": 0.059, + "num_input_tokens_seen": 164892432, + "step": 76410 + }, + { + "epoch": 12.46574225122349, + "grad_norm": 0.0046590780839324, + "learning_rate": 0.00037349423215266784, + "loss": 0.0172, + "num_input_tokens_seen": 164903024, + "step": 76415 + }, + { + "epoch": 12.466557911908646, + "grad_norm": 0.45932042598724365, + "learning_rate": 0.0003734253695713854, + "loss": 0.1086, + "num_input_tokens_seen": 164913680, + "step": 76420 + }, + { + "epoch": 12.4673735725938, + "grad_norm": 0.0033545014448463917, + "learning_rate": 0.0003733565095552985, + "loss": 0.0119, + "num_input_tokens_seen": 164923632, + "step": 76425 + }, + { + "epoch": 12.468189233278956, + "grad_norm": 0.0032544296700507402, + "learning_rate": 0.0003732876521058025, + "loss": 0.0014, + "num_input_tokens_seen": 164934544, + "step": 76430 + }, + { + "epoch": 12.469004893964112, + "grad_norm": 0.07587868720293045, + "learning_rate": 0.000373218797224293, + "loss": 0.0181, + "num_input_tokens_seen": 164945072, + "step": 76435 + }, + { + "epoch": 12.469820554649266, + "grad_norm": 0.0007766516064293683, + "learning_rate": 0.00037314994491216547, + "loss": 0.0088, + "num_input_tokens_seen": 164955504, + "step": 76440 + }, + { + "epoch": 12.470636215334421, + "grad_norm": 0.14731749892234802, + "learning_rate": 0.00037308109517081506, + "loss": 0.112, + "num_input_tokens_seen": 164966160, + "step": 76445 + }, + { + "epoch": 12.471451876019575, + "grad_norm": 0.13628606498241425, + "learning_rate": 0.0003730122480016375, + "loss": 0.0059, + "num_input_tokens_seen": 164976976, + "step": 76450 + }, + { + "epoch": 12.47226753670473, + "grad_norm": 0.003251641755923629, + "learning_rate": 0.00037294340340602764, + "loss": 0.0051, + "num_input_tokens_seen": 164987248, + "step": 76455 + }, + { + "epoch": 12.473083197389887, + "grad_norm": 0.1408027708530426, + "learning_rate": 0.0003728745613853811, + "loss": 0.0246, + "num_input_tokens_seen": 164998032, + "step": 76460 + }, + { + "epoch": 12.47389885807504, + "grad_norm": 0.39427927136421204, + "learning_rate": 0.00037280572194109255, + "loss": 0.0356, + "num_input_tokens_seen": 165007856, + "step": 76465 + }, + { + "epoch": 12.474714518760196, + "grad_norm": 0.011660768650472164, + "learning_rate": 0.00037273688507455773, + "loss": 0.0214, + "num_input_tokens_seen": 165018352, + "step": 76470 + }, + { + "epoch": 12.47553017944535, + "grad_norm": 0.006724333856254816, + "learning_rate": 0.00037266805078717106, + "loss": 0.0057, + "num_input_tokens_seen": 165029040, + "step": 76475 + }, + { + "epoch": 12.476345840130506, + "grad_norm": 0.0011521700071170926, + "learning_rate": 0.00037259921908032814, + "loss": 0.002, + "num_input_tokens_seen": 165040080, + "step": 76480 + }, + { + "epoch": 12.477161500815662, + "grad_norm": 0.006178973708301783, + "learning_rate": 0.0003725303899554234, + "loss": 0.0092, + "num_input_tokens_seen": 165051120, + "step": 76485 + }, + { + "epoch": 12.477977161500815, + "grad_norm": 0.014878334477543831, + "learning_rate": 0.00037246156341385234, + "loss": 0.0161, + "num_input_tokens_seen": 165061488, + "step": 76490 + }, + { + "epoch": 12.478792822185971, + "grad_norm": 0.07757678627967834, + "learning_rate": 0.0003723927394570092, + "loss": 0.0037, + "num_input_tokens_seen": 165071888, + "step": 76495 + }, + { + "epoch": 12.479608482871125, + "grad_norm": 0.029129143804311752, + "learning_rate": 0.0003723239180862893, + "loss": 0.0049, + "num_input_tokens_seen": 165083056, + "step": 76500 + }, + { + "epoch": 12.48042414355628, + "grad_norm": 0.005489358212798834, + "learning_rate": 0.00037225509930308696, + "loss": 0.0013, + "num_input_tokens_seen": 165093072, + "step": 76505 + }, + { + "epoch": 12.481239804241435, + "grad_norm": 0.04697535187005997, + "learning_rate": 0.0003721862831087971, + "loss": 0.0989, + "num_input_tokens_seen": 165102768, + "step": 76510 + }, + { + "epoch": 12.48205546492659, + "grad_norm": 0.014262707903981209, + "learning_rate": 0.0003721174695048145, + "loss": 0.0063, + "num_input_tokens_seen": 165112976, + "step": 76515 + }, + { + "epoch": 12.482871125611746, + "grad_norm": 0.0009488939540460706, + "learning_rate": 0.0003720486584925335, + "loss": 0.0026, + "num_input_tokens_seen": 165123568, + "step": 76520 + }, + { + "epoch": 12.4836867862969, + "grad_norm": 0.5135535001754761, + "learning_rate": 0.0003719798500733489, + "loss": 0.1969, + "num_input_tokens_seen": 165135472, + "step": 76525 + }, + { + "epoch": 12.484502446982056, + "grad_norm": 1.4278788566589355, + "learning_rate": 0.00037191104424865487, + "loss": 0.0246, + "num_input_tokens_seen": 165147184, + "step": 76530 + }, + { + "epoch": 12.48531810766721, + "grad_norm": 0.003912392538040876, + "learning_rate": 0.0003718422410198462, + "loss": 0.002, + "num_input_tokens_seen": 165157904, + "step": 76535 + }, + { + "epoch": 12.486133768352365, + "grad_norm": 0.02357400208711624, + "learning_rate": 0.0003717734403883169, + "loss": 0.0178, + "num_input_tokens_seen": 165169008, + "step": 76540 + }, + { + "epoch": 12.486949429037521, + "grad_norm": 0.04132775589823723, + "learning_rate": 0.0003717046423554617, + "loss": 0.0127, + "num_input_tokens_seen": 165178672, + "step": 76545 + }, + { + "epoch": 12.487765089722675, + "grad_norm": 0.02117694914340973, + "learning_rate": 0.0003716358469226745, + "loss": 0.0049, + "num_input_tokens_seen": 165189392, + "step": 76550 + }, + { + "epoch": 12.48858075040783, + "grad_norm": 0.008794351480901241, + "learning_rate": 0.0003715670540913499, + "loss": 0.0029, + "num_input_tokens_seen": 165200208, + "step": 76555 + }, + { + "epoch": 12.489396411092985, + "grad_norm": 0.004870031028985977, + "learning_rate": 0.0003714982638628817, + "loss": 0.0116, + "num_input_tokens_seen": 165210448, + "step": 76560 + }, + { + "epoch": 12.49021207177814, + "grad_norm": 0.011544113047420979, + "learning_rate": 0.00037142947623866417, + "loss": 0.0164, + "num_input_tokens_seen": 165220688, + "step": 76565 + }, + { + "epoch": 12.491027732463296, + "grad_norm": 0.024180198088288307, + "learning_rate": 0.0003713606912200915, + "loss": 0.0038, + "num_input_tokens_seen": 165231536, + "step": 76570 + }, + { + "epoch": 12.49184339314845, + "grad_norm": 0.0063278707675635815, + "learning_rate": 0.00037129190880855764, + "loss": 0.0059, + "num_input_tokens_seen": 165242000, + "step": 76575 + }, + { + "epoch": 12.492659053833606, + "grad_norm": 0.20272912085056305, + "learning_rate": 0.00037122312900545644, + "loss": 0.0105, + "num_input_tokens_seen": 165253712, + "step": 76580 + }, + { + "epoch": 12.49347471451876, + "grad_norm": 0.004526246339082718, + "learning_rate": 0.000371154351812182, + "loss": 0.0997, + "num_input_tokens_seen": 165264240, + "step": 76585 + }, + { + "epoch": 12.494290375203915, + "grad_norm": 0.0763450339436531, + "learning_rate": 0.0003710855772301279, + "loss": 0.0496, + "num_input_tokens_seen": 165275376, + "step": 76590 + }, + { + "epoch": 12.49510603588907, + "grad_norm": 0.12527777254581451, + "learning_rate": 0.00037101680526068837, + "loss": 0.0126, + "num_input_tokens_seen": 165286544, + "step": 76595 + }, + { + "epoch": 12.495921696574225, + "grad_norm": 0.03390096500515938, + "learning_rate": 0.0003709480359052566, + "loss": 0.0063, + "num_input_tokens_seen": 165296016, + "step": 76600 + }, + { + "epoch": 12.49673735725938, + "grad_norm": 0.01609675958752632, + "learning_rate": 0.0003708792691652269, + "loss": 0.0019, + "num_input_tokens_seen": 165306800, + "step": 76605 + }, + { + "epoch": 12.497553017944535, + "grad_norm": 0.011400408111512661, + "learning_rate": 0.00037081050504199245, + "loss": 0.0228, + "num_input_tokens_seen": 165316816, + "step": 76610 + }, + { + "epoch": 12.49836867862969, + "grad_norm": 0.5087020993232727, + "learning_rate": 0.0003707417435369469, + "loss": 0.0266, + "num_input_tokens_seen": 165326544, + "step": 76615 + }, + { + "epoch": 12.499184339314844, + "grad_norm": 0.0009916014969348907, + "learning_rate": 0.00037067298465148416, + "loss": 0.0063, + "num_input_tokens_seen": 165337520, + "step": 76620 + }, + { + "epoch": 12.5, + "grad_norm": 0.47859013080596924, + "learning_rate": 0.00037060422838699716, + "loss": 0.0676, + "num_input_tokens_seen": 165348432, + "step": 76625 + }, + { + "epoch": 12.500815660685156, + "grad_norm": 0.5108631253242493, + "learning_rate": 0.0003705354747448799, + "loss": 0.1385, + "num_input_tokens_seen": 165358640, + "step": 76630 + }, + { + "epoch": 12.50163132137031, + "grad_norm": 0.005186820402741432, + "learning_rate": 0.00037046672372652523, + "loss": 0.062, + "num_input_tokens_seen": 165367920, + "step": 76635 + }, + { + "epoch": 12.502446982055465, + "grad_norm": 0.0016154140466824174, + "learning_rate": 0.00037039797533332697, + "loss": 0.1003, + "num_input_tokens_seen": 165377904, + "step": 76640 + }, + { + "epoch": 12.50326264274062, + "grad_norm": 0.0026025883853435516, + "learning_rate": 0.000370329229566678, + "loss": 0.0216, + "num_input_tokens_seen": 165389520, + "step": 76645 + }, + { + "epoch": 12.504078303425775, + "grad_norm": 0.0011148974299430847, + "learning_rate": 0.0003702604864279718, + "loss": 0.0016, + "num_input_tokens_seen": 165400432, + "step": 76650 + }, + { + "epoch": 12.50489396411093, + "grad_norm": 0.030641701072454453, + "learning_rate": 0.00037019174591860127, + "loss": 0.1094, + "num_input_tokens_seen": 165410448, + "step": 76655 + }, + { + "epoch": 12.505709624796085, + "grad_norm": 0.00726750073954463, + "learning_rate": 0.0003701230080399599, + "loss": 0.0113, + "num_input_tokens_seen": 165420720, + "step": 76660 + }, + { + "epoch": 12.50652528548124, + "grad_norm": 0.044399891048669815, + "learning_rate": 0.00037005427279344027, + "loss": 0.0059, + "num_input_tokens_seen": 165430960, + "step": 76665 + }, + { + "epoch": 12.507340946166394, + "grad_norm": 0.010520456358790398, + "learning_rate": 0.0003699855401804359, + "loss": 0.0017, + "num_input_tokens_seen": 165440976, + "step": 76670 + }, + { + "epoch": 12.50815660685155, + "grad_norm": 0.15288378298282623, + "learning_rate": 0.0003699168102023393, + "loss": 0.0108, + "num_input_tokens_seen": 165452848, + "step": 76675 + }, + { + "epoch": 12.508972267536706, + "grad_norm": 0.02567547746002674, + "learning_rate": 0.0003698480828605437, + "loss": 0.0059, + "num_input_tokens_seen": 165463984, + "step": 76680 + }, + { + "epoch": 12.50978792822186, + "grad_norm": 0.012184906750917435, + "learning_rate": 0.0003697793581564417, + "loss": 0.0156, + "num_input_tokens_seen": 165474320, + "step": 76685 + }, + { + "epoch": 12.510603588907015, + "grad_norm": 0.6739414930343628, + "learning_rate": 0.00036971063609142637, + "loss": 0.1015, + "num_input_tokens_seen": 165485232, + "step": 76690 + }, + { + "epoch": 12.51141924959217, + "grad_norm": 0.014522730372846127, + "learning_rate": 0.00036964191666689005, + "loss": 0.0182, + "num_input_tokens_seen": 165496528, + "step": 76695 + }, + { + "epoch": 12.512234910277325, + "grad_norm": 0.0005946329911239445, + "learning_rate": 0.00036957319988422586, + "loss": 0.0026, + "num_input_tokens_seen": 165506864, + "step": 76700 + }, + { + "epoch": 12.513050570962479, + "grad_norm": 0.02310916595160961, + "learning_rate": 0.0003695044857448261, + "loss": 0.09, + "num_input_tokens_seen": 165517232, + "step": 76705 + }, + { + "epoch": 12.513866231647635, + "grad_norm": 0.007429556921124458, + "learning_rate": 0.0003694357742500835, + "loss": 0.0029, + "num_input_tokens_seen": 165528112, + "step": 76710 + }, + { + "epoch": 12.51468189233279, + "grad_norm": 0.034885190427303314, + "learning_rate": 0.00036936706540139063, + "loss": 0.1329, + "num_input_tokens_seen": 165538640, + "step": 76715 + }, + { + "epoch": 12.515497553017944, + "grad_norm": 0.00142071268055588, + "learning_rate": 0.0003692983592001398, + "loss": 0.0488, + "num_input_tokens_seen": 165549360, + "step": 76720 + }, + { + "epoch": 12.5163132137031, + "grad_norm": 0.006618925370275974, + "learning_rate": 0.0003692296556477237, + "loss": 0.0434, + "num_input_tokens_seen": 165561040, + "step": 76725 + }, + { + "epoch": 12.517128874388254, + "grad_norm": 0.08607296645641327, + "learning_rate": 0.0003691609547455343, + "loss": 0.0057, + "num_input_tokens_seen": 165572080, + "step": 76730 + }, + { + "epoch": 12.51794453507341, + "grad_norm": 0.00076168222585693, + "learning_rate": 0.0003690922564949643, + "loss": 0.0096, + "num_input_tokens_seen": 165583792, + "step": 76735 + }, + { + "epoch": 12.518760195758565, + "grad_norm": 0.0014156397664919496, + "learning_rate": 0.0003690235608974057, + "loss": 0.0031, + "num_input_tokens_seen": 165594096, + "step": 76740 + }, + { + "epoch": 12.51957585644372, + "grad_norm": 0.009406045079231262, + "learning_rate": 0.0003689548679542508, + "loss": 0.0026, + "num_input_tokens_seen": 165603472, + "step": 76745 + }, + { + "epoch": 12.520391517128875, + "grad_norm": 0.0010682566789910197, + "learning_rate": 0.0003688861776668918, + "loss": 0.0096, + "num_input_tokens_seen": 165614960, + "step": 76750 + }, + { + "epoch": 12.521207177814029, + "grad_norm": 0.027011990547180176, + "learning_rate": 0.0003688174900367207, + "loss": 0.0248, + "num_input_tokens_seen": 165625488, + "step": 76755 + }, + { + "epoch": 12.522022838499185, + "grad_norm": 0.04152964800596237, + "learning_rate": 0.00036874880506512954, + "loss": 0.0123, + "num_input_tokens_seen": 165636368, + "step": 76760 + }, + { + "epoch": 12.522838499184338, + "grad_norm": 0.7009368538856506, + "learning_rate": 0.0003686801227535105, + "loss": 0.0432, + "num_input_tokens_seen": 165646832, + "step": 76765 + }, + { + "epoch": 12.523654159869494, + "grad_norm": 0.5715734362602234, + "learning_rate": 0.00036861144310325523, + "loss": 0.1536, + "num_input_tokens_seen": 165657744, + "step": 76770 + }, + { + "epoch": 12.52446982055465, + "grad_norm": 0.12434447556734085, + "learning_rate": 0.0003685427661157559, + "loss": 0.0158, + "num_input_tokens_seen": 165668944, + "step": 76775 + }, + { + "epoch": 12.525285481239804, + "grad_norm": 0.13573065400123596, + "learning_rate": 0.00036847409179240396, + "loss": 0.0525, + "num_input_tokens_seen": 165679312, + "step": 76780 + }, + { + "epoch": 12.52610114192496, + "grad_norm": 0.060312170535326004, + "learning_rate": 0.00036840542013459154, + "loss": 0.012, + "num_input_tokens_seen": 165690704, + "step": 76785 + }, + { + "epoch": 12.526916802610113, + "grad_norm": 0.004763939417898655, + "learning_rate": 0.00036833675114371014, + "loss": 0.0043, + "num_input_tokens_seen": 165702000, + "step": 76790 + }, + { + "epoch": 12.52773246329527, + "grad_norm": 0.003991794306784868, + "learning_rate": 0.00036826808482115167, + "loss": 0.007, + "num_input_tokens_seen": 165713360, + "step": 76795 + }, + { + "epoch": 12.528548123980425, + "grad_norm": 0.03726784139871597, + "learning_rate": 0.00036819942116830736, + "loss": 0.0228, + "num_input_tokens_seen": 165724400, + "step": 76800 + }, + { + "epoch": 12.529363784665579, + "grad_norm": 0.08485420048236847, + "learning_rate": 0.0003681307601865692, + "loss": 0.0072, + "num_input_tokens_seen": 165735088, + "step": 76805 + }, + { + "epoch": 12.530179445350734, + "grad_norm": 0.010444369167089462, + "learning_rate": 0.00036806210187732824, + "loss": 0.0065, + "num_input_tokens_seen": 165746064, + "step": 76810 + }, + { + "epoch": 12.530995106035888, + "grad_norm": 0.03774203360080719, + "learning_rate": 0.00036799344624197637, + "loss": 0.0112, + "num_input_tokens_seen": 165755824, + "step": 76815 + }, + { + "epoch": 12.531810766721044, + "grad_norm": 0.8317771553993225, + "learning_rate": 0.00036792479328190457, + "loss": 0.0578, + "num_input_tokens_seen": 165766896, + "step": 76820 + }, + { + "epoch": 12.5326264274062, + "grad_norm": 0.005820379126816988, + "learning_rate": 0.0003678561429985044, + "loss": 0.0112, + "num_input_tokens_seen": 165777776, + "step": 76825 + }, + { + "epoch": 12.533442088091354, + "grad_norm": 0.043100252747535706, + "learning_rate": 0.00036778749539316736, + "loss": 0.2087, + "num_input_tokens_seen": 165789168, + "step": 76830 + }, + { + "epoch": 12.53425774877651, + "grad_norm": 0.04981034994125366, + "learning_rate": 0.00036771885046728417, + "loss": 0.0229, + "num_input_tokens_seen": 165799024, + "step": 76835 + }, + { + "epoch": 12.535073409461663, + "grad_norm": 0.13894149661064148, + "learning_rate": 0.00036765020822224654, + "loss": 0.0105, + "num_input_tokens_seen": 165810256, + "step": 76840 + }, + { + "epoch": 12.535889070146819, + "grad_norm": 0.06311865150928497, + "learning_rate": 0.0003675815686594451, + "loss": 0.1245, + "num_input_tokens_seen": 165822288, + "step": 76845 + }, + { + "epoch": 12.536704730831975, + "grad_norm": 0.008842695504426956, + "learning_rate": 0.00036751293178027144, + "loss": 0.0035, + "num_input_tokens_seen": 165833136, + "step": 76850 + }, + { + "epoch": 12.537520391517129, + "grad_norm": 0.006130650173872709, + "learning_rate": 0.000367444297586116, + "loss": 0.0149, + "num_input_tokens_seen": 165842608, + "step": 76855 + }, + { + "epoch": 12.538336052202284, + "grad_norm": 0.007810582872480154, + "learning_rate": 0.0003673756660783703, + "loss": 0.0993, + "num_input_tokens_seen": 165853680, + "step": 76860 + }, + { + "epoch": 12.539151712887438, + "grad_norm": 0.08374088257551193, + "learning_rate": 0.00036730703725842474, + "loss": 0.1038, + "num_input_tokens_seen": 165864176, + "step": 76865 + }, + { + "epoch": 12.539967373572594, + "grad_norm": 0.02385077439248562, + "learning_rate": 0.0003672384111276705, + "loss": 0.03, + "num_input_tokens_seen": 165874864, + "step": 76870 + }, + { + "epoch": 12.540783034257748, + "grad_norm": 0.004634529817849398, + "learning_rate": 0.0003671697876874982, + "loss": 0.0354, + "num_input_tokens_seen": 165886032, + "step": 76875 + }, + { + "epoch": 12.541598694942904, + "grad_norm": 0.004864581394940615, + "learning_rate": 0.00036710116693929875, + "loss": 0.0209, + "num_input_tokens_seen": 165897392, + "step": 76880 + }, + { + "epoch": 12.54241435562806, + "grad_norm": 0.011376219801604748, + "learning_rate": 0.0003670325488844627, + "loss": 0.1089, + "num_input_tokens_seen": 165909008, + "step": 76885 + }, + { + "epoch": 12.543230016313213, + "grad_norm": 0.0011367045808583498, + "learning_rate": 0.00036696393352438083, + "loss": 0.059, + "num_input_tokens_seen": 165919888, + "step": 76890 + }, + { + "epoch": 12.544045676998369, + "grad_norm": 0.003012144472450018, + "learning_rate": 0.0003668953208604435, + "loss": 0.0135, + "num_input_tokens_seen": 165930896, + "step": 76895 + }, + { + "epoch": 12.544861337683523, + "grad_norm": 0.011744393967092037, + "learning_rate": 0.0003668267108940414, + "loss": 0.0256, + "num_input_tokens_seen": 165942256, + "step": 76900 + }, + { + "epoch": 12.545676998368679, + "grad_norm": 0.0006758011295460165, + "learning_rate": 0.00036675810362656486, + "loss": 0.0505, + "num_input_tokens_seen": 165953328, + "step": 76905 + }, + { + "epoch": 12.546492659053834, + "grad_norm": 0.006964650005102158, + "learning_rate": 0.00036668949905940455, + "loss": 0.0033, + "num_input_tokens_seen": 165964592, + "step": 76910 + }, + { + "epoch": 12.547308319738988, + "grad_norm": 0.3860965669155121, + "learning_rate": 0.0003666208971939505, + "loss": 0.0172, + "num_input_tokens_seen": 165975344, + "step": 76915 + }, + { + "epoch": 12.548123980424144, + "grad_norm": 0.0030011977069079876, + "learning_rate": 0.0003665522980315933, + "loss": 0.0343, + "num_input_tokens_seen": 165985936, + "step": 76920 + }, + { + "epoch": 12.548939641109298, + "grad_norm": 0.024513008072972298, + "learning_rate": 0.0003664837015737229, + "loss": 0.0069, + "num_input_tokens_seen": 165995952, + "step": 76925 + }, + { + "epoch": 12.549755301794454, + "grad_norm": 0.008004706352949142, + "learning_rate": 0.00036641510782172993, + "loss": 0.0087, + "num_input_tokens_seen": 166005936, + "step": 76930 + }, + { + "epoch": 12.550570962479608, + "grad_norm": 0.012239702977240086, + "learning_rate": 0.0003663465167770039, + "loss": 0.1094, + "num_input_tokens_seen": 166016400, + "step": 76935 + }, + { + "epoch": 12.551386623164763, + "grad_norm": 0.011011580005288124, + "learning_rate": 0.00036627792844093544, + "loss": 0.0094, + "num_input_tokens_seen": 166026288, + "step": 76940 + }, + { + "epoch": 12.552202283849919, + "grad_norm": 0.02000279910862446, + "learning_rate": 0.0003662093428149145, + "loss": 0.0092, + "num_input_tokens_seen": 166037328, + "step": 76945 + }, + { + "epoch": 12.553017944535073, + "grad_norm": 0.15778093039989471, + "learning_rate": 0.0003661407599003308, + "loss": 0.0434, + "num_input_tokens_seen": 166047600, + "step": 76950 + }, + { + "epoch": 12.553833605220229, + "grad_norm": 0.4581848084926605, + "learning_rate": 0.0003660721796985746, + "loss": 0.1635, + "num_input_tokens_seen": 166057904, + "step": 76955 + }, + { + "epoch": 12.554649265905383, + "grad_norm": 0.09858501702547073, + "learning_rate": 0.0003660036022110353, + "loss": 0.0874, + "num_input_tokens_seen": 166069424, + "step": 76960 + }, + { + "epoch": 12.555464926590538, + "grad_norm": 0.016478603705763817, + "learning_rate": 0.00036593502743910336, + "loss": 0.0134, + "num_input_tokens_seen": 166081392, + "step": 76965 + }, + { + "epoch": 12.556280587275694, + "grad_norm": 0.000864124798681587, + "learning_rate": 0.00036586645538416783, + "loss": 0.0046, + "num_input_tokens_seen": 166091344, + "step": 76970 + }, + { + "epoch": 12.557096247960848, + "grad_norm": 0.027731962502002716, + "learning_rate": 0.00036579788604761896, + "loss": 0.0337, + "num_input_tokens_seen": 166102096, + "step": 76975 + }, + { + "epoch": 12.557911908646004, + "grad_norm": 0.11173520237207413, + "learning_rate": 0.000365729319430846, + "loss": 0.0052, + "num_input_tokens_seen": 166112112, + "step": 76980 + }, + { + "epoch": 12.558727569331158, + "grad_norm": 0.0016164095140993595, + "learning_rate": 0.00036566075553523894, + "loss": 0.0051, + "num_input_tokens_seen": 166122672, + "step": 76985 + }, + { + "epoch": 12.559543230016313, + "grad_norm": 0.016156800091266632, + "learning_rate": 0.0003655921943621868, + "loss": 0.0236, + "num_input_tokens_seen": 166132912, + "step": 76990 + }, + { + "epoch": 12.560358890701469, + "grad_norm": 0.013461234048008919, + "learning_rate": 0.0003655236359130796, + "loss": 0.0155, + "num_input_tokens_seen": 166143792, + "step": 76995 + }, + { + "epoch": 12.561174551386623, + "grad_norm": 0.00307975010946393, + "learning_rate": 0.0003654550801893063, + "loss": 0.0293, + "num_input_tokens_seen": 166155312, + "step": 77000 + }, + { + "epoch": 12.561990212071779, + "grad_norm": 0.05063984915614128, + "learning_rate": 0.00036538652719225674, + "loss": 0.065, + "num_input_tokens_seen": 166166672, + "step": 77005 + }, + { + "epoch": 12.562805872756933, + "grad_norm": 0.7182468771934509, + "learning_rate": 0.0003653179769233197, + "loss": 0.0819, + "num_input_tokens_seen": 166176624, + "step": 77010 + }, + { + "epoch": 12.563621533442088, + "grad_norm": 0.004681479185819626, + "learning_rate": 0.00036524942938388495, + "loss": 0.1873, + "num_input_tokens_seen": 166187216, + "step": 77015 + }, + { + "epoch": 12.564437194127244, + "grad_norm": 0.0052529447712004185, + "learning_rate": 0.00036518088457534125, + "loss": 0.0235, + "num_input_tokens_seen": 166197872, + "step": 77020 + }, + { + "epoch": 12.565252854812398, + "grad_norm": 0.0038440870121121407, + "learning_rate": 0.0003651123424990781, + "loss": 0.0186, + "num_input_tokens_seen": 166210288, + "step": 77025 + }, + { + "epoch": 12.566068515497554, + "grad_norm": 0.09148385375738144, + "learning_rate": 0.00036504380315648447, + "loss": 0.0133, + "num_input_tokens_seen": 166221968, + "step": 77030 + }, + { + "epoch": 12.566884176182707, + "grad_norm": 0.6636860370635986, + "learning_rate": 0.0003649752665489492, + "loss": 0.1202, + "num_input_tokens_seen": 166233040, + "step": 77035 + }, + { + "epoch": 12.567699836867863, + "grad_norm": 0.034601736813783646, + "learning_rate": 0.00036490673267786154, + "loss": 0.0063, + "num_input_tokens_seen": 166243760, + "step": 77040 + }, + { + "epoch": 12.568515497553017, + "grad_norm": 0.12983335554599762, + "learning_rate": 0.0003648382015446103, + "loss": 0.1137, + "num_input_tokens_seen": 166255024, + "step": 77045 + }, + { + "epoch": 12.569331158238173, + "grad_norm": 0.0013313490198925138, + "learning_rate": 0.0003647696731505844, + "loss": 0.0096, + "num_input_tokens_seen": 166264784, + "step": 77050 + }, + { + "epoch": 12.570146818923329, + "grad_norm": 0.004809685982763767, + "learning_rate": 0.00036470114749717267, + "loss": 0.057, + "num_input_tokens_seen": 166276016, + "step": 77055 + }, + { + "epoch": 12.570962479608482, + "grad_norm": 0.23200955986976624, + "learning_rate": 0.00036463262458576374, + "loss": 0.0135, + "num_input_tokens_seen": 166286800, + "step": 77060 + }, + { + "epoch": 12.571778140293638, + "grad_norm": 0.12686890363693237, + "learning_rate": 0.0003645641044177465, + "loss": 0.0095, + "num_input_tokens_seen": 166297872, + "step": 77065 + }, + { + "epoch": 12.572593800978792, + "grad_norm": 0.11056645959615707, + "learning_rate": 0.00036449558699450937, + "loss": 0.0155, + "num_input_tokens_seen": 166309328, + "step": 77070 + }, + { + "epoch": 12.573409461663948, + "grad_norm": 0.19649755954742432, + "learning_rate": 0.0003644270723174411, + "loss": 0.0118, + "num_input_tokens_seen": 166320208, + "step": 77075 + }, + { + "epoch": 12.574225122349104, + "grad_norm": 0.007531277369707823, + "learning_rate": 0.0003643585603879303, + "loss": 0.0064, + "num_input_tokens_seen": 166331920, + "step": 77080 + }, + { + "epoch": 12.575040783034257, + "grad_norm": 0.011777924373745918, + "learning_rate": 0.0003642900512073652, + "loss": 0.0073, + "num_input_tokens_seen": 166343728, + "step": 77085 + }, + { + "epoch": 12.575856443719413, + "grad_norm": 0.003730174619704485, + "learning_rate": 0.00036422154477713456, + "loss": 0.0067, + "num_input_tokens_seen": 166355152, + "step": 77090 + }, + { + "epoch": 12.576672104404567, + "grad_norm": 0.11103439331054688, + "learning_rate": 0.00036415304109862633, + "loss": 0.0076, + "num_input_tokens_seen": 166365296, + "step": 77095 + }, + { + "epoch": 12.577487765089723, + "grad_norm": 0.014810984954237938, + "learning_rate": 0.0003640845401732293, + "loss": 0.0505, + "num_input_tokens_seen": 166376016, + "step": 77100 + }, + { + "epoch": 12.578303425774878, + "grad_norm": 0.002150257583707571, + "learning_rate": 0.0003640160420023313, + "loss": 0.022, + "num_input_tokens_seen": 166386896, + "step": 77105 + }, + { + "epoch": 12.579119086460032, + "grad_norm": 0.0013216190272942185, + "learning_rate": 0.00036394754658732086, + "loss": 0.0059, + "num_input_tokens_seen": 166397392, + "step": 77110 + }, + { + "epoch": 12.579934747145188, + "grad_norm": 0.002403589431196451, + "learning_rate": 0.00036387905392958574, + "loss": 0.0482, + "num_input_tokens_seen": 166408944, + "step": 77115 + }, + { + "epoch": 12.580750407830342, + "grad_norm": 0.00277106836438179, + "learning_rate": 0.0003638105640305146, + "loss": 0.0197, + "num_input_tokens_seen": 166420240, + "step": 77120 + }, + { + "epoch": 12.581566068515498, + "grad_norm": 0.00955519825220108, + "learning_rate": 0.00036374207689149487, + "loss": 0.0083, + "num_input_tokens_seen": 166430320, + "step": 77125 + }, + { + "epoch": 12.582381729200652, + "grad_norm": 0.004968844819813967, + "learning_rate": 0.00036367359251391506, + "loss": 0.0105, + "num_input_tokens_seen": 166439824, + "step": 77130 + }, + { + "epoch": 12.583197389885807, + "grad_norm": 0.044087350368499756, + "learning_rate": 0.0003636051108991626, + "loss": 0.0063, + "num_input_tokens_seen": 166451184, + "step": 77135 + }, + { + "epoch": 12.584013050570963, + "grad_norm": 0.0075911665335297585, + "learning_rate": 0.0003635366320486258, + "loss": 0.0232, + "num_input_tokens_seen": 166462448, + "step": 77140 + }, + { + "epoch": 12.584828711256117, + "grad_norm": 0.029772218316793442, + "learning_rate": 0.0003634681559636921, + "loss": 0.0213, + "num_input_tokens_seen": 166473296, + "step": 77145 + }, + { + "epoch": 12.585644371941273, + "grad_norm": 0.0021433790680021048, + "learning_rate": 0.0003633996826457494, + "loss": 0.0068, + "num_input_tokens_seen": 166485136, + "step": 77150 + }, + { + "epoch": 12.586460032626427, + "grad_norm": 0.0959862768650055, + "learning_rate": 0.0003633312120961856, + "loss": 0.0759, + "num_input_tokens_seen": 166495504, + "step": 77155 + }, + { + "epoch": 12.587275693311582, + "grad_norm": 0.09669545292854309, + "learning_rate": 0.000363262744316388, + "loss": 0.0106, + "num_input_tokens_seen": 166506288, + "step": 77160 + }, + { + "epoch": 12.588091353996738, + "grad_norm": 0.017910035327076912, + "learning_rate": 0.00036319427930774453, + "loss": 0.0033, + "num_input_tokens_seen": 166516400, + "step": 77165 + }, + { + "epoch": 12.588907014681892, + "grad_norm": 0.015269089490175247, + "learning_rate": 0.0003631258170716423, + "loss": 0.0016, + "num_input_tokens_seen": 166528304, + "step": 77170 + }, + { + "epoch": 12.589722675367048, + "grad_norm": 0.023825401440262794, + "learning_rate": 0.0003630573576094693, + "loss": 0.0039, + "num_input_tokens_seen": 166537616, + "step": 77175 + }, + { + "epoch": 12.590538336052202, + "grad_norm": 0.2393476665019989, + "learning_rate": 0.0003629889009226124, + "loss": 0.1913, + "num_input_tokens_seen": 166549072, + "step": 77180 + }, + { + "epoch": 12.591353996737357, + "grad_norm": 0.2703637480735779, + "learning_rate": 0.0003629204470124595, + "loss": 0.0221, + "num_input_tokens_seen": 166559280, + "step": 77185 + }, + { + "epoch": 12.592169657422513, + "grad_norm": 0.02257862687110901, + "learning_rate": 0.00036285199588039743, + "loss": 0.0531, + "num_input_tokens_seen": 166571216, + "step": 77190 + }, + { + "epoch": 12.592985318107667, + "grad_norm": 0.3357855975627899, + "learning_rate": 0.0003627835475278137, + "loss": 0.0655, + "num_input_tokens_seen": 166583408, + "step": 77195 + }, + { + "epoch": 12.593800978792823, + "grad_norm": 0.013959350995719433, + "learning_rate": 0.0003627151019560955, + "loss": 0.0072, + "num_input_tokens_seen": 166593328, + "step": 77200 + }, + { + "epoch": 12.594616639477977, + "grad_norm": 0.040643539279699326, + "learning_rate": 0.00036264665916662986, + "loss": 0.0077, + "num_input_tokens_seen": 166604976, + "step": 77205 + }, + { + "epoch": 12.595432300163132, + "grad_norm": 0.08936650305986404, + "learning_rate": 0.000362578219160804, + "loss": 0.0078, + "num_input_tokens_seen": 166615760, + "step": 77210 + }, + { + "epoch": 12.596247960848288, + "grad_norm": 0.3049409091472626, + "learning_rate": 0.0003625097819400048, + "loss": 0.009, + "num_input_tokens_seen": 166627408, + "step": 77215 + }, + { + "epoch": 12.597063621533442, + "grad_norm": 0.008690894581377506, + "learning_rate": 0.0003624413475056192, + "loss": 0.0124, + "num_input_tokens_seen": 166637168, + "step": 77220 + }, + { + "epoch": 12.597879282218598, + "grad_norm": 1.3215547800064087, + "learning_rate": 0.00036237291585903436, + "loss": 0.042, + "num_input_tokens_seen": 166648080, + "step": 77225 + }, + { + "epoch": 12.598694942903752, + "grad_norm": 0.8992112874984741, + "learning_rate": 0.0003623044870016368, + "loss": 0.0293, + "num_input_tokens_seen": 166658160, + "step": 77230 + }, + { + "epoch": 12.599510603588907, + "grad_norm": 0.3619556725025177, + "learning_rate": 0.0003622360609348138, + "loss": 0.0128, + "num_input_tokens_seen": 166669552, + "step": 77235 + }, + { + "epoch": 12.600326264274061, + "grad_norm": 0.0013363189063966274, + "learning_rate": 0.0003621676376599514, + "loss": 0.0017, + "num_input_tokens_seen": 166681232, + "step": 77240 + }, + { + "epoch": 12.601141924959217, + "grad_norm": 0.4246501624584198, + "learning_rate": 0.00036209921717843697, + "loss": 0.0224, + "num_input_tokens_seen": 166691824, + "step": 77245 + }, + { + "epoch": 12.601957585644373, + "grad_norm": 0.09219825267791748, + "learning_rate": 0.00036203079949165664, + "loss": 0.0884, + "num_input_tokens_seen": 166702800, + "step": 77250 + }, + { + "epoch": 12.602773246329527, + "grad_norm": 0.24224811792373657, + "learning_rate": 0.00036196238460099717, + "loss": 0.2001, + "num_input_tokens_seen": 166713200, + "step": 77255 + }, + { + "epoch": 12.603588907014682, + "grad_norm": 0.16774626076221466, + "learning_rate": 0.0003618939725078453, + "loss": 0.0986, + "num_input_tokens_seen": 166723472, + "step": 77260 + }, + { + "epoch": 12.604404567699836, + "grad_norm": 0.027674391865730286, + "learning_rate": 0.0003618255632135871, + "loss": 0.0088, + "num_input_tokens_seen": 166734960, + "step": 77265 + }, + { + "epoch": 12.605220228384992, + "grad_norm": 0.009763489477336407, + "learning_rate": 0.00036175715671960934, + "loss": 0.0068, + "num_input_tokens_seen": 166745968, + "step": 77270 + }, + { + "epoch": 12.606035889070148, + "grad_norm": 0.0009390097693540156, + "learning_rate": 0.000361688753027298, + "loss": 0.095, + "num_input_tokens_seen": 166756336, + "step": 77275 + }, + { + "epoch": 12.606851549755302, + "grad_norm": 0.09471603482961655, + "learning_rate": 0.0003616203521380397, + "loss": 0.0094, + "num_input_tokens_seen": 166765776, + "step": 77280 + }, + { + "epoch": 12.607667210440457, + "grad_norm": 0.10565578192472458, + "learning_rate": 0.00036155195405322026, + "loss": 0.0469, + "num_input_tokens_seen": 166777040, + "step": 77285 + }, + { + "epoch": 12.608482871125611, + "grad_norm": 0.12482444941997528, + "learning_rate": 0.0003614835587742264, + "loss": 0.044, + "num_input_tokens_seen": 166788336, + "step": 77290 + }, + { + "epoch": 12.609298531810767, + "grad_norm": 0.0530070923268795, + "learning_rate": 0.0003614151663024436, + "loss": 0.0026, + "num_input_tokens_seen": 166797712, + "step": 77295 + }, + { + "epoch": 12.61011419249592, + "grad_norm": 0.0028806356713175774, + "learning_rate": 0.0003613467766392586, + "loss": 0.0116, + "num_input_tokens_seen": 166808880, + "step": 77300 + }, + { + "epoch": 12.610929853181077, + "grad_norm": 0.010128357447683811, + "learning_rate": 0.00036127838978605687, + "loss": 0.0014, + "num_input_tokens_seen": 166819952, + "step": 77305 + }, + { + "epoch": 12.611745513866232, + "grad_norm": 0.02484278753399849, + "learning_rate": 0.0003612100057442247, + "loss": 0.0049, + "num_input_tokens_seen": 166830608, + "step": 77310 + }, + { + "epoch": 12.612561174551386, + "grad_norm": 0.09119195491075516, + "learning_rate": 0.00036114162451514765, + "loss": 0.0403, + "num_input_tokens_seen": 166840816, + "step": 77315 + }, + { + "epoch": 12.613376835236542, + "grad_norm": 0.0030363532714545727, + "learning_rate": 0.000361073246100212, + "loss": 0.0162, + "num_input_tokens_seen": 166852016, + "step": 77320 + }, + { + "epoch": 12.614192495921696, + "grad_norm": 0.04425378888845444, + "learning_rate": 0.0003610048705008029, + "loss": 0.0057, + "num_input_tokens_seen": 166862896, + "step": 77325 + }, + { + "epoch": 12.615008156606851, + "grad_norm": 0.007783989887684584, + "learning_rate": 0.00036093649771830674, + "loss": 0.0035, + "num_input_tokens_seen": 166873616, + "step": 77330 + }, + { + "epoch": 12.615823817292007, + "grad_norm": 0.05231314152479172, + "learning_rate": 0.0003608681277541086, + "loss": 0.0096, + "num_input_tokens_seen": 166885104, + "step": 77335 + }, + { + "epoch": 12.616639477977161, + "grad_norm": 0.3613528609275818, + "learning_rate": 0.00036079976060959454, + "loss": 0.0458, + "num_input_tokens_seen": 166895952, + "step": 77340 + }, + { + "epoch": 12.617455138662317, + "grad_norm": 0.02609623596072197, + "learning_rate": 0.0003607313962861499, + "loss": 0.0811, + "num_input_tokens_seen": 166905488, + "step": 77345 + }, + { + "epoch": 12.61827079934747, + "grad_norm": 0.13939078152179718, + "learning_rate": 0.00036066303478516016, + "loss": 0.0033, + "num_input_tokens_seen": 166915728, + "step": 77350 + }, + { + "epoch": 12.619086460032626, + "grad_norm": 0.012315727770328522, + "learning_rate": 0.0003605946761080108, + "loss": 0.0135, + "num_input_tokens_seen": 166925872, + "step": 77355 + }, + { + "epoch": 12.619902120717782, + "grad_norm": 0.4648289978504181, + "learning_rate": 0.000360526320256087, + "loss": 0.0104, + "num_input_tokens_seen": 166937520, + "step": 77360 + }, + { + "epoch": 12.620717781402936, + "grad_norm": 0.003273796522989869, + "learning_rate": 0.0003604579672307744, + "loss": 0.1351, + "num_input_tokens_seen": 166949360, + "step": 77365 + }, + { + "epoch": 12.621533442088092, + "grad_norm": 0.006904638838022947, + "learning_rate": 0.00036038961703345815, + "loss": 0.0046, + "num_input_tokens_seen": 166960944, + "step": 77370 + }, + { + "epoch": 12.622349102773246, + "grad_norm": 0.0068300142884254456, + "learning_rate": 0.00036032126966552335, + "loss": 0.0047, + "num_input_tokens_seen": 166971312, + "step": 77375 + }, + { + "epoch": 12.623164763458401, + "grad_norm": 0.01035599410533905, + "learning_rate": 0.0003602529251283553, + "loss": 0.0126, + "num_input_tokens_seen": 166981456, + "step": 77380 + }, + { + "epoch": 12.623980424143557, + "grad_norm": 0.08199837803840637, + "learning_rate": 0.000360184583423339, + "loss": 0.0094, + "num_input_tokens_seen": 166991984, + "step": 77385 + }, + { + "epoch": 12.624796084828711, + "grad_norm": 0.0021704344544559717, + "learning_rate": 0.0003601162445518593, + "loss": 0.004, + "num_input_tokens_seen": 167002800, + "step": 77390 + }, + { + "epoch": 12.625611745513867, + "grad_norm": 0.031105991452932358, + "learning_rate": 0.0003600479085153017, + "loss": 0.0059, + "num_input_tokens_seen": 167013168, + "step": 77395 + }, + { + "epoch": 12.62642740619902, + "grad_norm": 0.002468442777171731, + "learning_rate": 0.00035997957531505045, + "loss": 0.0205, + "num_input_tokens_seen": 167023344, + "step": 77400 + }, + { + "epoch": 12.627243066884176, + "grad_norm": 0.0039521269500255585, + "learning_rate": 0.00035991124495249094, + "loss": 0.0034, + "num_input_tokens_seen": 167034800, + "step": 77405 + }, + { + "epoch": 12.62805872756933, + "grad_norm": 0.0030596458818763494, + "learning_rate": 0.0003598429174290076, + "loss": 0.0132, + "num_input_tokens_seen": 167045584, + "step": 77410 + }, + { + "epoch": 12.628874388254486, + "grad_norm": 0.010100574232637882, + "learning_rate": 0.0003597745927459856, + "loss": 0.0058, + "num_input_tokens_seen": 167056880, + "step": 77415 + }, + { + "epoch": 12.629690048939642, + "grad_norm": 0.02214542031288147, + "learning_rate": 0.00035970627090480906, + "loss": 0.1047, + "num_input_tokens_seen": 167068176, + "step": 77420 + }, + { + "epoch": 12.630505709624796, + "grad_norm": 0.0011605405015870929, + "learning_rate": 0.0003596379519068632, + "loss": 0.1111, + "num_input_tokens_seen": 167078736, + "step": 77425 + }, + { + "epoch": 12.631321370309951, + "grad_norm": 0.08922293037176132, + "learning_rate": 0.000359569635753532, + "loss": 0.0082, + "num_input_tokens_seen": 167089840, + "step": 77430 + }, + { + "epoch": 12.632137030995105, + "grad_norm": 0.005232447758316994, + "learning_rate": 0.00035950132244620057, + "loss": 0.0091, + "num_input_tokens_seen": 167100816, + "step": 77435 + }, + { + "epoch": 12.632952691680261, + "grad_norm": 0.0013578938087448478, + "learning_rate": 0.0003594330119862529, + "loss": 0.0741, + "num_input_tokens_seen": 167112240, + "step": 77440 + }, + { + "epoch": 12.633768352365417, + "grad_norm": 0.02110360749065876, + "learning_rate": 0.00035936470437507366, + "loss": 0.0066, + "num_input_tokens_seen": 167124784, + "step": 77445 + }, + { + "epoch": 12.63458401305057, + "grad_norm": 0.0032505823764950037, + "learning_rate": 0.000359296399614047, + "loss": 0.0732, + "num_input_tokens_seen": 167136144, + "step": 77450 + }, + { + "epoch": 12.635399673735726, + "grad_norm": 0.01193551067262888, + "learning_rate": 0.00035922809770455745, + "loss": 0.0029, + "num_input_tokens_seen": 167144688, + "step": 77455 + }, + { + "epoch": 12.63621533442088, + "grad_norm": 0.06375737488269806, + "learning_rate": 0.00035915979864798884, + "loss": 0.0089, + "num_input_tokens_seen": 167155824, + "step": 77460 + }, + { + "epoch": 12.637030995106036, + "grad_norm": 0.07871931046247482, + "learning_rate": 0.0003590915024457256, + "loss": 0.0138, + "num_input_tokens_seen": 167167536, + "step": 77465 + }, + { + "epoch": 12.63784665579119, + "grad_norm": 0.19778048992156982, + "learning_rate": 0.0003590232090991521, + "loss": 0.0527, + "num_input_tokens_seen": 167178576, + "step": 77470 + }, + { + "epoch": 12.638662316476346, + "grad_norm": 0.011884092353284359, + "learning_rate": 0.0003589549186096518, + "loss": 0.0228, + "num_input_tokens_seen": 167189232, + "step": 77475 + }, + { + "epoch": 12.639477977161501, + "grad_norm": 0.0015790852485224605, + "learning_rate": 0.0003588866309786093, + "loss": 0.0595, + "num_input_tokens_seen": 167200976, + "step": 77480 + }, + { + "epoch": 12.640293637846655, + "grad_norm": 0.0070068747736513615, + "learning_rate": 0.00035881834620740796, + "loss": 0.0429, + "num_input_tokens_seen": 167212464, + "step": 77485 + }, + { + "epoch": 12.641109298531811, + "grad_norm": 0.010748101398348808, + "learning_rate": 0.0003587500642974322, + "loss": 0.0132, + "num_input_tokens_seen": 167221360, + "step": 77490 + }, + { + "epoch": 12.641924959216965, + "grad_norm": 0.0011903272243216634, + "learning_rate": 0.0003586817852500653, + "loss": 0.0229, + "num_input_tokens_seen": 167232688, + "step": 77495 + }, + { + "epoch": 12.64274061990212, + "grad_norm": 0.008443756960332394, + "learning_rate": 0.00035861350906669156, + "loss": 0.0061, + "num_input_tokens_seen": 167243088, + "step": 77500 + }, + { + "epoch": 12.643556280587276, + "grad_norm": 0.004054565913975239, + "learning_rate": 0.00035854523574869416, + "loss": 0.0127, + "num_input_tokens_seen": 167254384, + "step": 77505 + }, + { + "epoch": 12.64437194127243, + "grad_norm": 0.00791581254452467, + "learning_rate": 0.00035847696529745714, + "loss": 0.0012, + "num_input_tokens_seen": 167265136, + "step": 77510 + }, + { + "epoch": 12.645187601957586, + "grad_norm": 0.009051788598299026, + "learning_rate": 0.000358408697714364, + "loss": 0.0158, + "num_input_tokens_seen": 167276464, + "step": 77515 + }, + { + "epoch": 12.64600326264274, + "grad_norm": 0.002693832153454423, + "learning_rate": 0.0003583404330007981, + "loss": 0.041, + "num_input_tokens_seen": 167286640, + "step": 77520 + }, + { + "epoch": 12.646818923327896, + "grad_norm": 0.01549264881759882, + "learning_rate": 0.00035827217115814313, + "loss": 0.0088, + "num_input_tokens_seen": 167297744, + "step": 77525 + }, + { + "epoch": 12.647634584013051, + "grad_norm": 0.012756925076246262, + "learning_rate": 0.0003582039121877824, + "loss": 0.0025, + "num_input_tokens_seen": 167307600, + "step": 77530 + }, + { + "epoch": 12.648450244698205, + "grad_norm": 0.8364657759666443, + "learning_rate": 0.0003581356560910992, + "loss": 0.0663, + "num_input_tokens_seen": 167317296, + "step": 77535 + }, + { + "epoch": 12.649265905383361, + "grad_norm": 0.0019239794928580523, + "learning_rate": 0.00035806740286947704, + "loss": 0.0038, + "num_input_tokens_seen": 167328080, + "step": 77540 + }, + { + "epoch": 12.650081566068515, + "grad_norm": 0.009139740839600563, + "learning_rate": 0.0003579991525242988, + "loss": 0.0227, + "num_input_tokens_seen": 167338480, + "step": 77545 + }, + { + "epoch": 12.65089722675367, + "grad_norm": 0.016973547637462616, + "learning_rate": 0.0003579309050569481, + "loss": 0.0073, + "num_input_tokens_seen": 167348624, + "step": 77550 + }, + { + "epoch": 12.651712887438826, + "grad_norm": 0.00848863273859024, + "learning_rate": 0.00035786266046880765, + "loss": 0.0009, + "num_input_tokens_seen": 167359312, + "step": 77555 + }, + { + "epoch": 12.65252854812398, + "grad_norm": 0.0013879007892683148, + "learning_rate": 0.0003577944187612609, + "loss": 0.0077, + "num_input_tokens_seen": 167370416, + "step": 77560 + }, + { + "epoch": 12.653344208809136, + "grad_norm": 0.18590618669986725, + "learning_rate": 0.0003577261799356905, + "loss": 0.0204, + "num_input_tokens_seen": 167382096, + "step": 77565 + }, + { + "epoch": 12.65415986949429, + "grad_norm": 0.0060991826467216015, + "learning_rate": 0.0003576579439934796, + "loss": 0.0154, + "num_input_tokens_seen": 167391920, + "step": 77570 + }, + { + "epoch": 12.654975530179446, + "grad_norm": 0.014741920866072178, + "learning_rate": 0.000357589710936011, + "loss": 0.0583, + "num_input_tokens_seen": 167403664, + "step": 77575 + }, + { + "epoch": 12.655791190864601, + "grad_norm": 0.016506381332874298, + "learning_rate": 0.0003575214807646675, + "loss": 0.0035, + "num_input_tokens_seen": 167413968, + "step": 77580 + }, + { + "epoch": 12.656606851549755, + "grad_norm": 0.05445334315299988, + "learning_rate": 0.0003574532534808321, + "loss": 0.0063, + "num_input_tokens_seen": 167425136, + "step": 77585 + }, + { + "epoch": 12.65742251223491, + "grad_norm": 0.0003214953176211566, + "learning_rate": 0.00035738502908588723, + "loss": 0.014, + "num_input_tokens_seen": 167436400, + "step": 77590 + }, + { + "epoch": 12.658238172920065, + "grad_norm": 0.039395369589328766, + "learning_rate": 0.0003573168075812158, + "loss": 0.0142, + "num_input_tokens_seen": 167447728, + "step": 77595 + }, + { + "epoch": 12.65905383360522, + "grad_norm": 0.004109233617782593, + "learning_rate": 0.0003572485889682001, + "loss": 0.0089, + "num_input_tokens_seen": 167458160, + "step": 77600 + }, + { + "epoch": 12.659869494290374, + "grad_norm": 0.32355889678001404, + "learning_rate": 0.00035718037324822304, + "loss": 0.0224, + "num_input_tokens_seen": 167468496, + "step": 77605 + }, + { + "epoch": 12.66068515497553, + "grad_norm": 0.01820485293865204, + "learning_rate": 0.0003571121604226667, + "loss": 0.0301, + "num_input_tokens_seen": 167478864, + "step": 77610 + }, + { + "epoch": 12.661500815660686, + "grad_norm": 0.05411219596862793, + "learning_rate": 0.0003570439504929139, + "loss": 0.0034, + "num_input_tokens_seen": 167490768, + "step": 77615 + }, + { + "epoch": 12.66231647634584, + "grad_norm": 0.002297184197232127, + "learning_rate": 0.00035697574346034655, + "loss": 0.0129, + "num_input_tokens_seen": 167499920, + "step": 77620 + }, + { + "epoch": 12.663132137030995, + "grad_norm": 0.001499962992966175, + "learning_rate": 0.0003569075393263475, + "loss": 0.0007, + "num_input_tokens_seen": 167510096, + "step": 77625 + }, + { + "epoch": 12.66394779771615, + "grad_norm": 0.0021283631213009357, + "learning_rate": 0.0003568393380922984, + "loss": 0.011, + "num_input_tokens_seen": 167521712, + "step": 77630 + }, + { + "epoch": 12.664763458401305, + "grad_norm": 0.018177811056375504, + "learning_rate": 0.0003567711397595819, + "loss": 0.0064, + "num_input_tokens_seen": 167532080, + "step": 77635 + }, + { + "epoch": 12.66557911908646, + "grad_norm": 0.00201037828810513, + "learning_rate": 0.00035670294432957984, + "loss": 0.0021, + "num_input_tokens_seen": 167542320, + "step": 77640 + }, + { + "epoch": 12.666394779771615, + "grad_norm": 0.03254613280296326, + "learning_rate": 0.00035663475180367453, + "loss": 0.0094, + "num_input_tokens_seen": 167551952, + "step": 77645 + }, + { + "epoch": 12.66721044045677, + "grad_norm": 0.00047527250717394054, + "learning_rate": 0.00035656656218324765, + "loss": 0.0086, + "num_input_tokens_seen": 167563824, + "step": 77650 + }, + { + "epoch": 12.668026101141924, + "grad_norm": 0.19187355041503906, + "learning_rate": 0.0003564983754696815, + "loss": 0.0076, + "num_input_tokens_seen": 167574576, + "step": 77655 + }, + { + "epoch": 12.66884176182708, + "grad_norm": 0.0023396743927150965, + "learning_rate": 0.00035643019166435775, + "loss": 0.0021, + "num_input_tokens_seen": 167585616, + "step": 77660 + }, + { + "epoch": 12.669657422512234, + "grad_norm": 0.002921799197793007, + "learning_rate": 0.00035636201076865836, + "loss": 0.0036, + "num_input_tokens_seen": 167595856, + "step": 77665 + }, + { + "epoch": 12.67047308319739, + "grad_norm": 0.008512123487889767, + "learning_rate": 0.000356293832783965, + "loss": 0.0091, + "num_input_tokens_seen": 167607472, + "step": 77670 + }, + { + "epoch": 12.671288743882545, + "grad_norm": 0.009771689772605896, + "learning_rate": 0.0003562256577116595, + "loss": 0.0026, + "num_input_tokens_seen": 167618032, + "step": 77675 + }, + { + "epoch": 12.6721044045677, + "grad_norm": 0.003648345824331045, + "learning_rate": 0.0003561574855531232, + "loss": 0.0047, + "num_input_tokens_seen": 167628528, + "step": 77680 + }, + { + "epoch": 12.672920065252855, + "grad_norm": 0.03722764924168587, + "learning_rate": 0.00035608931630973814, + "loss": 0.0086, + "num_input_tokens_seen": 167639440, + "step": 77685 + }, + { + "epoch": 12.673735725938009, + "grad_norm": 0.013182447291910648, + "learning_rate": 0.0003560211499828856, + "loss": 0.0714, + "num_input_tokens_seen": 167650608, + "step": 77690 + }, + { + "epoch": 12.674551386623165, + "grad_norm": 0.004251719918102026, + "learning_rate": 0.00035595298657394714, + "loss": 0.1046, + "num_input_tokens_seen": 167661296, + "step": 77695 + }, + { + "epoch": 12.67536704730832, + "grad_norm": 0.0034987444523721933, + "learning_rate": 0.0003558848260843041, + "loss": 0.146, + "num_input_tokens_seen": 167671472, + "step": 77700 + }, + { + "epoch": 12.676182707993474, + "grad_norm": 0.08211586624383926, + "learning_rate": 0.00035581666851533777, + "loss": 0.0075, + "num_input_tokens_seen": 167682256, + "step": 77705 + }, + { + "epoch": 12.67699836867863, + "grad_norm": 0.47355416417121887, + "learning_rate": 0.0003557485138684299, + "loss": 0.0938, + "num_input_tokens_seen": 167694096, + "step": 77710 + }, + { + "epoch": 12.677814029363784, + "grad_norm": 0.000719729287084192, + "learning_rate": 0.00035568036214496103, + "loss": 0.0008, + "num_input_tokens_seen": 167705584, + "step": 77715 + }, + { + "epoch": 12.67862969004894, + "grad_norm": 0.0044694566167891026, + "learning_rate": 0.000355612213346313, + "loss": 0.045, + "num_input_tokens_seen": 167716624, + "step": 77720 + }, + { + "epoch": 12.679445350734095, + "grad_norm": 0.017641765996813774, + "learning_rate": 0.00035554406747386635, + "loss": 0.0052, + "num_input_tokens_seen": 167727312, + "step": 77725 + }, + { + "epoch": 12.68026101141925, + "grad_norm": 0.0005881767137907445, + "learning_rate": 0.0003554759245290027, + "loss": 0.0178, + "num_input_tokens_seen": 167738160, + "step": 77730 + }, + { + "epoch": 12.681076672104405, + "grad_norm": 0.3846758008003235, + "learning_rate": 0.0003554077845131025, + "loss": 0.0213, + "num_input_tokens_seen": 167749072, + "step": 77735 + }, + { + "epoch": 12.681892332789559, + "grad_norm": 0.10418780148029327, + "learning_rate": 0.0003553396474275473, + "loss": 0.0506, + "num_input_tokens_seen": 167760464, + "step": 77740 + }, + { + "epoch": 12.682707993474715, + "grad_norm": 0.3565927743911743, + "learning_rate": 0.00035527151327371736, + "loss": 0.1783, + "num_input_tokens_seen": 167771696, + "step": 77745 + }, + { + "epoch": 12.68352365415987, + "grad_norm": 0.20708514750003815, + "learning_rate": 0.00035520338205299407, + "loss": 0.0138, + "num_input_tokens_seen": 167782736, + "step": 77750 + }, + { + "epoch": 12.684339314845024, + "grad_norm": 0.002222479321062565, + "learning_rate": 0.0003551352537667577, + "loss": 0.001, + "num_input_tokens_seen": 167792656, + "step": 77755 + }, + { + "epoch": 12.68515497553018, + "grad_norm": 0.02177123725414276, + "learning_rate": 0.0003550671284163894, + "loss": 0.0062, + "num_input_tokens_seen": 167803568, + "step": 77760 + }, + { + "epoch": 12.685970636215334, + "grad_norm": 0.45163848996162415, + "learning_rate": 0.00035499900600326933, + "loss": 0.0409, + "num_input_tokens_seen": 167814736, + "step": 77765 + }, + { + "epoch": 12.68678629690049, + "grad_norm": 0.006811882369220257, + "learning_rate": 0.00035493088652877866, + "loss": 0.0095, + "num_input_tokens_seen": 167826192, + "step": 77770 + }, + { + "epoch": 12.687601957585644, + "grad_norm": 0.02047066017985344, + "learning_rate": 0.00035486276999429733, + "loss": 0.0233, + "num_input_tokens_seen": 167838736, + "step": 77775 + }, + { + "epoch": 12.6884176182708, + "grad_norm": 0.024450426921248436, + "learning_rate": 0.00035479465640120636, + "loss": 0.2165, + "num_input_tokens_seen": 167850416, + "step": 77780 + }, + { + "epoch": 12.689233278955955, + "grad_norm": 0.7064217925071716, + "learning_rate": 0.0003547265457508856, + "loss": 0.0634, + "num_input_tokens_seen": 167861840, + "step": 77785 + }, + { + "epoch": 12.690048939641109, + "grad_norm": 0.005116404965519905, + "learning_rate": 0.0003546584380447157, + "loss": 0.0111, + "num_input_tokens_seen": 167872240, + "step": 77790 + }, + { + "epoch": 12.690864600326265, + "grad_norm": 0.02620469219982624, + "learning_rate": 0.0003545903332840772, + "loss": 0.0083, + "num_input_tokens_seen": 167883024, + "step": 77795 + }, + { + "epoch": 12.691680261011419, + "grad_norm": 0.016209479421377182, + "learning_rate": 0.0003545222314703498, + "loss": 0.0241, + "num_input_tokens_seen": 167894480, + "step": 77800 + }, + { + "epoch": 12.692495921696574, + "grad_norm": 0.00676144240424037, + "learning_rate": 0.0003544541326049141, + "loss": 0.1362, + "num_input_tokens_seen": 167904528, + "step": 77805 + }, + { + "epoch": 12.69331158238173, + "grad_norm": 0.008267571218311787, + "learning_rate": 0.0003543860366891499, + "loss": 0.0032, + "num_input_tokens_seen": 167915376, + "step": 77810 + }, + { + "epoch": 12.694127243066884, + "grad_norm": 0.09453053027391434, + "learning_rate": 0.0003543179437244376, + "loss": 0.0269, + "num_input_tokens_seen": 167926672, + "step": 77815 + }, + { + "epoch": 12.69494290375204, + "grad_norm": 0.0014216596027836204, + "learning_rate": 0.0003542498537121567, + "loss": 0.0026, + "num_input_tokens_seen": 167937360, + "step": 77820 + }, + { + "epoch": 12.695758564437194, + "grad_norm": 0.003844271646812558, + "learning_rate": 0.0003541817666536876, + "loss": 0.031, + "num_input_tokens_seen": 167948016, + "step": 77825 + }, + { + "epoch": 12.69657422512235, + "grad_norm": 0.4929651618003845, + "learning_rate": 0.00035411368255040994, + "loss": 0.1013, + "num_input_tokens_seen": 167959152, + "step": 77830 + }, + { + "epoch": 12.697389885807503, + "grad_norm": 0.013050700537860394, + "learning_rate": 0.0003540456014037036, + "loss": 0.0035, + "num_input_tokens_seen": 167969456, + "step": 77835 + }, + { + "epoch": 12.698205546492659, + "grad_norm": 0.04627647250890732, + "learning_rate": 0.00035397752321494826, + "loss": 0.0059, + "num_input_tokens_seen": 167979728, + "step": 77840 + }, + { + "epoch": 12.699021207177815, + "grad_norm": 0.00341666373424232, + "learning_rate": 0.0003539094479855237, + "loss": 0.0097, + "num_input_tokens_seen": 167989936, + "step": 77845 + }, + { + "epoch": 12.699836867862969, + "grad_norm": 0.0008452749461866915, + "learning_rate": 0.00035384137571680936, + "loss": 0.0468, + "num_input_tokens_seen": 168000592, + "step": 77850 + }, + { + "epoch": 12.700652528548124, + "grad_norm": 0.014394373632967472, + "learning_rate": 0.0003537733064101852, + "loss": 0.0048, + "num_input_tokens_seen": 168011856, + "step": 77855 + }, + { + "epoch": 12.701468189233278, + "grad_norm": 0.12896941602230072, + "learning_rate": 0.0003537052400670303, + "loss": 0.0178, + "num_input_tokens_seen": 168022896, + "step": 77860 + }, + { + "epoch": 12.702283849918434, + "grad_norm": 0.003145918482914567, + "learning_rate": 0.00035363717668872443, + "loss": 0.1505, + "num_input_tokens_seen": 168033776, + "step": 77865 + }, + { + "epoch": 12.70309951060359, + "grad_norm": 0.0127762071788311, + "learning_rate": 0.00035356911627664665, + "loss": 0.0087, + "num_input_tokens_seen": 168043152, + "step": 77870 + }, + { + "epoch": 12.703915171288743, + "grad_norm": 0.0024632152635604143, + "learning_rate": 0.00035350105883217675, + "loss": 0.0044, + "num_input_tokens_seen": 168054640, + "step": 77875 + }, + { + "epoch": 12.7047308319739, + "grad_norm": 0.06170526146888733, + "learning_rate": 0.00035343300435669356, + "loss": 0.0049, + "num_input_tokens_seen": 168065648, + "step": 77880 + }, + { + "epoch": 12.705546492659053, + "grad_norm": 0.041089512407779694, + "learning_rate": 0.0003533649528515766, + "loss": 0.1294, + "num_input_tokens_seen": 168076176, + "step": 77885 + }, + { + "epoch": 12.706362153344209, + "grad_norm": 0.008773344568908215, + "learning_rate": 0.0003532969043182047, + "loss": 0.0022, + "num_input_tokens_seen": 168087856, + "step": 77890 + }, + { + "epoch": 12.707177814029365, + "grad_norm": 0.6113072633743286, + "learning_rate": 0.0003532288587579572, + "loss": 0.2116, + "num_input_tokens_seen": 168098576, + "step": 77895 + }, + { + "epoch": 12.707993474714518, + "grad_norm": 0.12028361111879349, + "learning_rate": 0.0003531608161722132, + "loss": 0.0087, + "num_input_tokens_seen": 168109264, + "step": 77900 + }, + { + "epoch": 12.708809135399674, + "grad_norm": 0.01094974298030138, + "learning_rate": 0.00035309277656235137, + "loss": 0.006, + "num_input_tokens_seen": 168121104, + "step": 77905 + }, + { + "epoch": 12.709624796084828, + "grad_norm": 0.20771579444408417, + "learning_rate": 0.000353024739929751, + "loss": 0.0218, + "num_input_tokens_seen": 168132464, + "step": 77910 + }, + { + "epoch": 12.710440456769984, + "grad_norm": 0.002628637244924903, + "learning_rate": 0.0003529567062757905, + "loss": 0.0059, + "num_input_tokens_seen": 168143952, + "step": 77915 + }, + { + "epoch": 12.71125611745514, + "grad_norm": 0.39467740058898926, + "learning_rate": 0.0003528886756018491, + "loss": 0.0418, + "num_input_tokens_seen": 168154672, + "step": 77920 + }, + { + "epoch": 12.712071778140293, + "grad_norm": 0.009513895027339458, + "learning_rate": 0.0003528206479093051, + "loss": 0.0261, + "num_input_tokens_seen": 168166896, + "step": 77925 + }, + { + "epoch": 12.71288743882545, + "grad_norm": 0.22106213867664337, + "learning_rate": 0.0003527526231995376, + "loss": 0.0299, + "num_input_tokens_seen": 168178448, + "step": 77930 + }, + { + "epoch": 12.713703099510603, + "grad_norm": 0.282661497592926, + "learning_rate": 0.0003526846014739248, + "loss": 0.05, + "num_input_tokens_seen": 168189904, + "step": 77935 + }, + { + "epoch": 12.714518760195759, + "grad_norm": 0.07393468916416168, + "learning_rate": 0.00035261658273384554, + "loss": 0.0051, + "num_input_tokens_seen": 168200208, + "step": 77940 + }, + { + "epoch": 12.715334420880914, + "grad_norm": 0.0017498735105618834, + "learning_rate": 0.00035254856698067806, + "loss": 0.0104, + "num_input_tokens_seen": 168210960, + "step": 77945 + }, + { + "epoch": 12.716150081566068, + "grad_norm": 0.21453110873699188, + "learning_rate": 0.00035248055421580114, + "loss": 0.05, + "num_input_tokens_seen": 168221456, + "step": 77950 + }, + { + "epoch": 12.716965742251224, + "grad_norm": 0.05668376758694649, + "learning_rate": 0.0003524125444405928, + "loss": 0.0356, + "num_input_tokens_seen": 168232944, + "step": 77955 + }, + { + "epoch": 12.717781402936378, + "grad_norm": 0.7716231346130371, + "learning_rate": 0.00035234453765643146, + "loss": 0.1039, + "num_input_tokens_seen": 168243984, + "step": 77960 + }, + { + "epoch": 12.718597063621534, + "grad_norm": 0.0027923895977437496, + "learning_rate": 0.0003522765338646954, + "loss": 0.0026, + "num_input_tokens_seen": 168254736, + "step": 77965 + }, + { + "epoch": 12.719412724306688, + "grad_norm": 0.0027151750400662422, + "learning_rate": 0.00035220853306676284, + "loss": 0.0199, + "num_input_tokens_seen": 168265456, + "step": 77970 + }, + { + "epoch": 12.720228384991843, + "grad_norm": 0.023490922525525093, + "learning_rate": 0.0003521405352640118, + "loss": 0.0117, + "num_input_tokens_seen": 168275632, + "step": 77975 + }, + { + "epoch": 12.721044045676999, + "grad_norm": 0.06657205522060394, + "learning_rate": 0.00035207254045782036, + "loss": 0.0084, + "num_input_tokens_seen": 168286032, + "step": 77980 + }, + { + "epoch": 12.721859706362153, + "grad_norm": 0.005308172199875116, + "learning_rate": 0.00035200454864956653, + "loss": 0.003, + "num_input_tokens_seen": 168297968, + "step": 77985 + }, + { + "epoch": 12.722675367047309, + "grad_norm": 0.012860084883868694, + "learning_rate": 0.00035193655984062835, + "loss": 0.0053, + "num_input_tokens_seen": 168307888, + "step": 77990 + }, + { + "epoch": 12.723491027732463, + "grad_norm": 0.050960473716259, + "learning_rate": 0.0003518685740323835, + "loss": 0.059, + "num_input_tokens_seen": 168318928, + "step": 77995 + }, + { + "epoch": 12.724306688417618, + "grad_norm": 0.01834881864488125, + "learning_rate": 0.00035180059122621, + "loss": 0.0045, + "num_input_tokens_seen": 168328592, + "step": 78000 + }, + { + "epoch": 12.725122349102774, + "grad_norm": 0.036114711314439774, + "learning_rate": 0.0003517326114234855, + "loss": 0.0166, + "num_input_tokens_seen": 168339056, + "step": 78005 + }, + { + "epoch": 12.725938009787928, + "grad_norm": 0.011531124822795391, + "learning_rate": 0.0003516646346255877, + "loss": 0.0206, + "num_input_tokens_seen": 168350640, + "step": 78010 + }, + { + "epoch": 12.726753670473084, + "grad_norm": 0.14776423573493958, + "learning_rate": 0.00035159666083389436, + "loss": 0.0367, + "num_input_tokens_seen": 168361424, + "step": 78015 + }, + { + "epoch": 12.727569331158238, + "grad_norm": 0.051788005977869034, + "learning_rate": 0.00035152869004978276, + "loss": 0.0056, + "num_input_tokens_seen": 168372112, + "step": 78020 + }, + { + "epoch": 12.728384991843393, + "grad_norm": 0.46521246433258057, + "learning_rate": 0.0003514607222746309, + "loss": 0.0308, + "num_input_tokens_seen": 168384464, + "step": 78025 + }, + { + "epoch": 12.729200652528547, + "grad_norm": 0.006872142665088177, + "learning_rate": 0.0003513927575098156, + "loss": 0.0117, + "num_input_tokens_seen": 168395024, + "step": 78030 + }, + { + "epoch": 12.730016313213703, + "grad_norm": 0.1131977066397667, + "learning_rate": 0.0003513247957567149, + "loss": 0.0113, + "num_input_tokens_seen": 168406800, + "step": 78035 + }, + { + "epoch": 12.730831973898859, + "grad_norm": 0.0014392051380127668, + "learning_rate": 0.0003512568370167055, + "loss": 0.0311, + "num_input_tokens_seen": 168418800, + "step": 78040 + }, + { + "epoch": 12.731647634584013, + "grad_norm": 0.00255782762542367, + "learning_rate": 0.0003511888812911653, + "loss": 0.0036, + "num_input_tokens_seen": 168429392, + "step": 78045 + }, + { + "epoch": 12.732463295269168, + "grad_norm": 0.002064454136416316, + "learning_rate": 0.00035112092858147106, + "loss": 0.0467, + "num_input_tokens_seen": 168441008, + "step": 78050 + }, + { + "epoch": 12.733278955954322, + "grad_norm": 0.008236652240157127, + "learning_rate": 0.0003510529788890001, + "loss": 0.0032, + "num_input_tokens_seen": 168452496, + "step": 78055 + }, + { + "epoch": 12.734094616639478, + "grad_norm": 0.005584695376455784, + "learning_rate": 0.0003509850322151294, + "loss": 0.0091, + "num_input_tokens_seen": 168462224, + "step": 78060 + }, + { + "epoch": 12.734910277324634, + "grad_norm": 0.0034054818097501993, + "learning_rate": 0.0003509170885612362, + "loss": 0.0169, + "num_input_tokens_seen": 168472368, + "step": 78065 + }, + { + "epoch": 12.735725938009788, + "grad_norm": 0.005306158680468798, + "learning_rate": 0.00035084914792869715, + "loss": 0.0224, + "num_input_tokens_seen": 168482480, + "step": 78070 + }, + { + "epoch": 12.736541598694943, + "grad_norm": 0.009345026686787605, + "learning_rate": 0.0003507812103188895, + "loss": 0.0031, + "num_input_tokens_seen": 168492368, + "step": 78075 + }, + { + "epoch": 12.737357259380097, + "grad_norm": 0.012340357527136803, + "learning_rate": 0.0003507132757331898, + "loss": 0.0079, + "num_input_tokens_seen": 168503472, + "step": 78080 + }, + { + "epoch": 12.738172920065253, + "grad_norm": 0.424250990152359, + "learning_rate": 0.00035064534417297513, + "loss": 0.1375, + "num_input_tokens_seen": 168513200, + "step": 78085 + }, + { + "epoch": 12.738988580750409, + "grad_norm": 0.039979513734579086, + "learning_rate": 0.00035057741563962176, + "loss": 0.0252, + "num_input_tokens_seen": 168523120, + "step": 78090 + }, + { + "epoch": 12.739804241435563, + "grad_norm": 0.08580904453992844, + "learning_rate": 0.00035050949013450686, + "loss": 0.0053, + "num_input_tokens_seen": 168533072, + "step": 78095 + }, + { + "epoch": 12.740619902120718, + "grad_norm": 0.007983151823282242, + "learning_rate": 0.0003504415676590066, + "loss": 0.005, + "num_input_tokens_seen": 168544464, + "step": 78100 + }, + { + "epoch": 12.741435562805872, + "grad_norm": 0.03728825971484184, + "learning_rate": 0.00035037364821449766, + "loss": 0.003, + "num_input_tokens_seen": 168555312, + "step": 78105 + }, + { + "epoch": 12.742251223491028, + "grad_norm": 0.0057478249073028564, + "learning_rate": 0.0003503057318023568, + "loss": 0.0017, + "num_input_tokens_seen": 168564624, + "step": 78110 + }, + { + "epoch": 12.743066884176184, + "grad_norm": 0.0008709166431799531, + "learning_rate": 0.00035023781842395994, + "loss": 0.0047, + "num_input_tokens_seen": 168576496, + "step": 78115 + }, + { + "epoch": 12.743882544861338, + "grad_norm": 0.010295792482793331, + "learning_rate": 0.0003501699080806839, + "loss": 0.04, + "num_input_tokens_seen": 168587792, + "step": 78120 + }, + { + "epoch": 12.744698205546493, + "grad_norm": 0.10193426162004471, + "learning_rate": 0.0003501020007739045, + "loss": 0.0045, + "num_input_tokens_seen": 168598256, + "step": 78125 + }, + { + "epoch": 12.745513866231647, + "grad_norm": 0.02217688411474228, + "learning_rate": 0.0003500340965049984, + "loss": 0.0025, + "num_input_tokens_seen": 168608880, + "step": 78130 + }, + { + "epoch": 12.746329526916803, + "grad_norm": 0.009889517910778522, + "learning_rate": 0.00034996619527534153, + "loss": 0.0023, + "num_input_tokens_seen": 168620400, + "step": 78135 + }, + { + "epoch": 12.747145187601957, + "grad_norm": 0.35381829738616943, + "learning_rate": 0.00034989829708631005, + "loss": 0.017, + "num_input_tokens_seen": 168631376, + "step": 78140 + }, + { + "epoch": 12.747960848287113, + "grad_norm": 0.01684013567864895, + "learning_rate": 0.00034983040193927996, + "loss": 0.0108, + "num_input_tokens_seen": 168641776, + "step": 78145 + }, + { + "epoch": 12.748776508972268, + "grad_norm": 0.020151149481534958, + "learning_rate": 0.0003497625098356273, + "loss": 0.0338, + "num_input_tokens_seen": 168652464, + "step": 78150 + }, + { + "epoch": 12.749592169657422, + "grad_norm": 0.007396516855806112, + "learning_rate": 0.00034969462077672793, + "loss": 0.0023, + "num_input_tokens_seen": 168662928, + "step": 78155 + }, + { + "epoch": 12.750407830342578, + "grad_norm": 0.003994908183813095, + "learning_rate": 0.0003496267347639579, + "loss": 0.0185, + "num_input_tokens_seen": 168674448, + "step": 78160 + }, + { + "epoch": 12.751223491027732, + "grad_norm": 0.465512752532959, + "learning_rate": 0.00034955885179869265, + "loss": 0.0292, + "num_input_tokens_seen": 168683792, + "step": 78165 + }, + { + "epoch": 12.752039151712887, + "grad_norm": 0.0006666943663731217, + "learning_rate": 0.0003494909718823083, + "loss": 0.0067, + "num_input_tokens_seen": 168695248, + "step": 78170 + }, + { + "epoch": 12.752854812398043, + "grad_norm": 0.0015538654988631606, + "learning_rate": 0.00034942309501618016, + "loss": 0.0067, + "num_input_tokens_seen": 168706992, + "step": 78175 + }, + { + "epoch": 12.753670473083197, + "grad_norm": 0.10399758070707321, + "learning_rate": 0.00034935522120168417, + "loss": 0.0184, + "num_input_tokens_seen": 168718800, + "step": 78180 + }, + { + "epoch": 12.754486133768353, + "grad_norm": 0.4177756905555725, + "learning_rate": 0.0003492873504401956, + "loss": 0.2015, + "num_input_tokens_seen": 168729552, + "step": 78185 + }, + { + "epoch": 12.755301794453507, + "grad_norm": 0.041986290365457535, + "learning_rate": 0.0003492194827330902, + "loss": 0.0028, + "num_input_tokens_seen": 168740304, + "step": 78190 + }, + { + "epoch": 12.756117455138662, + "grad_norm": 0.6714656352996826, + "learning_rate": 0.00034915161808174314, + "loss": 0.0276, + "num_input_tokens_seen": 168751536, + "step": 78195 + }, + { + "epoch": 12.756933115823816, + "grad_norm": 0.12389890104532242, + "learning_rate": 0.0003490837564875301, + "loss": 0.013, + "num_input_tokens_seen": 168762288, + "step": 78200 + }, + { + "epoch": 12.757748776508972, + "grad_norm": 0.21259838342666626, + "learning_rate": 0.0003490158979518259, + "loss": 0.0096, + "num_input_tokens_seen": 168772912, + "step": 78205 + }, + { + "epoch": 12.758564437194128, + "grad_norm": 0.025044182315468788, + "learning_rate": 0.00034894804247600613, + "loss": 0.0039, + "num_input_tokens_seen": 168781808, + "step": 78210 + }, + { + "epoch": 12.759380097879282, + "grad_norm": 0.0016236485680565238, + "learning_rate": 0.0003488801900614461, + "loss": 0.0016, + "num_input_tokens_seen": 168793104, + "step": 78215 + }, + { + "epoch": 12.760195758564437, + "grad_norm": 0.21260297298431396, + "learning_rate": 0.0003488123407095205, + "loss": 0.0168, + "num_input_tokens_seen": 168803312, + "step": 78220 + }, + { + "epoch": 12.761011419249591, + "grad_norm": 0.015420181676745415, + "learning_rate": 0.00034874449442160485, + "loss": 0.0713, + "num_input_tokens_seen": 168813776, + "step": 78225 + }, + { + "epoch": 12.761827079934747, + "grad_norm": 0.007980900816619396, + "learning_rate": 0.00034867665119907363, + "loss": 0.0063, + "num_input_tokens_seen": 168824496, + "step": 78230 + }, + { + "epoch": 12.762642740619903, + "grad_norm": 0.055971719324588776, + "learning_rate": 0.0003486088110433023, + "loss": 0.081, + "num_input_tokens_seen": 168835440, + "step": 78235 + }, + { + "epoch": 12.763458401305057, + "grad_norm": 0.005327877588570118, + "learning_rate": 0.0003485409739556653, + "loss": 0.0067, + "num_input_tokens_seen": 168845488, + "step": 78240 + }, + { + "epoch": 12.764274061990212, + "grad_norm": 0.13882924616336823, + "learning_rate": 0.0003484731399375377, + "loss": 0.0147, + "num_input_tokens_seen": 168857424, + "step": 78245 + }, + { + "epoch": 12.765089722675366, + "grad_norm": 0.003365674987435341, + "learning_rate": 0.00034840530899029405, + "loss": 0.1023, + "num_input_tokens_seen": 168868528, + "step": 78250 + }, + { + "epoch": 12.765905383360522, + "grad_norm": 0.028432480990886688, + "learning_rate": 0.00034833748111530926, + "loss": 0.0026, + "num_input_tokens_seen": 168878864, + "step": 78255 + }, + { + "epoch": 12.766721044045678, + "grad_norm": 0.7656921744346619, + "learning_rate": 0.00034826965631395767, + "loss": 0.0199, + "num_input_tokens_seen": 168890128, + "step": 78260 + }, + { + "epoch": 12.767536704730832, + "grad_norm": 0.3204622268676758, + "learning_rate": 0.0003482018345876141, + "loss": 0.014, + "num_input_tokens_seen": 168901392, + "step": 78265 + }, + { + "epoch": 12.768352365415987, + "grad_norm": 0.045421164482831955, + "learning_rate": 0.0003481340159376528, + "loss": 0.0216, + "num_input_tokens_seen": 168912464, + "step": 78270 + }, + { + "epoch": 12.769168026101141, + "grad_norm": 0.0036789400037378073, + "learning_rate": 0.0003480662003654483, + "loss": 0.0035, + "num_input_tokens_seen": 168923952, + "step": 78275 + }, + { + "epoch": 12.769983686786297, + "grad_norm": 0.007080696523189545, + "learning_rate": 0.00034799838787237514, + "loss": 0.0041, + "num_input_tokens_seen": 168933232, + "step": 78280 + }, + { + "epoch": 12.770799347471453, + "grad_norm": 0.06549062579870224, + "learning_rate": 0.00034793057845980744, + "loss": 0.0075, + "num_input_tokens_seen": 168944496, + "step": 78285 + }, + { + "epoch": 12.771615008156607, + "grad_norm": 0.004228665493428707, + "learning_rate": 0.00034786277212911943, + "loss": 0.0038, + "num_input_tokens_seen": 168955952, + "step": 78290 + }, + { + "epoch": 12.772430668841762, + "grad_norm": 0.4310317039489746, + "learning_rate": 0.0003477949688816854, + "loss": 0.018, + "num_input_tokens_seen": 168965456, + "step": 78295 + }, + { + "epoch": 12.773246329526916, + "grad_norm": 0.003545227227732539, + "learning_rate": 0.00034772716871887924, + "loss": 0.001, + "num_input_tokens_seen": 168975536, + "step": 78300 + }, + { + "epoch": 12.774061990212072, + "grad_norm": 0.0120839923620224, + "learning_rate": 0.0003476593716420754, + "loss": 0.0044, + "num_input_tokens_seen": 168985648, + "step": 78305 + }, + { + "epoch": 12.774877650897226, + "grad_norm": 0.002790386090055108, + "learning_rate": 0.00034759157765264746, + "loss": 0.0015, + "num_input_tokens_seen": 168997136, + "step": 78310 + }, + { + "epoch": 12.775693311582382, + "grad_norm": 0.0049034785479307175, + "learning_rate": 0.00034752378675196975, + "loss": 0.0071, + "num_input_tokens_seen": 169008336, + "step": 78315 + }, + { + "epoch": 12.776508972267537, + "grad_norm": 0.005099172703921795, + "learning_rate": 0.0003474559989414158, + "loss": 0.0055, + "num_input_tokens_seen": 169019600, + "step": 78320 + }, + { + "epoch": 12.777324632952691, + "grad_norm": 0.0036217174492776394, + "learning_rate": 0.00034738821422235943, + "loss": 0.0049, + "num_input_tokens_seen": 169030992, + "step": 78325 + }, + { + "epoch": 12.778140293637847, + "grad_norm": 0.05576243996620178, + "learning_rate": 0.00034732043259617473, + "loss": 0.0074, + "num_input_tokens_seen": 169042032, + "step": 78330 + }, + { + "epoch": 12.778955954323001, + "grad_norm": 0.0020361198112368584, + "learning_rate": 0.000347252654064235, + "loss": 0.1048, + "num_input_tokens_seen": 169053200, + "step": 78335 + }, + { + "epoch": 12.779771615008157, + "grad_norm": 0.007970694452524185, + "learning_rate": 0.00034718487862791413, + "loss": 0.0058, + "num_input_tokens_seen": 169064560, + "step": 78340 + }, + { + "epoch": 12.780587275693312, + "grad_norm": 0.008741318248212337, + "learning_rate": 0.0003471171062885854, + "loss": 0.0137, + "num_input_tokens_seen": 169075664, + "step": 78345 + }, + { + "epoch": 12.781402936378466, + "grad_norm": 0.00251978961750865, + "learning_rate": 0.00034704933704762266, + "loss": 0.0135, + "num_input_tokens_seen": 169086320, + "step": 78350 + }, + { + "epoch": 12.782218597063622, + "grad_norm": 0.05413142964243889, + "learning_rate": 0.00034698157090639893, + "loss": 0.0095, + "num_input_tokens_seen": 169096944, + "step": 78355 + }, + { + "epoch": 12.783034257748776, + "grad_norm": 0.13590726256370544, + "learning_rate": 0.000346913807866288, + "loss": 0.0167, + "num_input_tokens_seen": 169107664, + "step": 78360 + }, + { + "epoch": 12.783849918433932, + "grad_norm": 0.0008552385843358934, + "learning_rate": 0.00034684604792866277, + "loss": 0.0144, + "num_input_tokens_seen": 169118480, + "step": 78365 + }, + { + "epoch": 12.784665579119086, + "grad_norm": 0.002570715267211199, + "learning_rate": 0.00034677829109489684, + "loss": 0.0181, + "num_input_tokens_seen": 169129104, + "step": 78370 + }, + { + "epoch": 12.785481239804241, + "grad_norm": 0.0011560607235878706, + "learning_rate": 0.00034671053736636307, + "loss": 0.0777, + "num_input_tokens_seen": 169140080, + "step": 78375 + }, + { + "epoch": 12.786296900489397, + "grad_norm": 0.18246564269065857, + "learning_rate": 0.0003466427867444348, + "loss": 0.0097, + "num_input_tokens_seen": 169151760, + "step": 78380 + }, + { + "epoch": 12.78711256117455, + "grad_norm": 0.08689558506011963, + "learning_rate": 0.00034657503923048497, + "loss": 0.004, + "num_input_tokens_seen": 169162416, + "step": 78385 + }, + { + "epoch": 12.787928221859707, + "grad_norm": 0.0031060264445841312, + "learning_rate": 0.00034650729482588665, + "loss": 0.0015, + "num_input_tokens_seen": 169173072, + "step": 78390 + }, + { + "epoch": 12.78874388254486, + "grad_norm": 0.0003656708577182144, + "learning_rate": 0.0003464395535320126, + "loss": 0.0043, + "num_input_tokens_seen": 169183504, + "step": 78395 + }, + { + "epoch": 12.789559543230016, + "grad_norm": 0.02777000330388546, + "learning_rate": 0.000346371815350236, + "loss": 0.1868, + "num_input_tokens_seen": 169193648, + "step": 78400 + }, + { + "epoch": 12.790375203915172, + "grad_norm": 0.4332772195339203, + "learning_rate": 0.0003463040802819292, + "loss": 0.055, + "num_input_tokens_seen": 169204944, + "step": 78405 + }, + { + "epoch": 12.791190864600326, + "grad_norm": 0.0056649185717105865, + "learning_rate": 0.0003462363483284654, + "loss": 0.0023, + "num_input_tokens_seen": 169215728, + "step": 78410 + }, + { + "epoch": 12.792006525285482, + "grad_norm": 0.0021229402627795935, + "learning_rate": 0.0003461686194912169, + "loss": 0.0244, + "num_input_tokens_seen": 169226640, + "step": 78415 + }, + { + "epoch": 12.792822185970635, + "grad_norm": 0.04336128756403923, + "learning_rate": 0.00034610089377155656, + "loss": 0.004, + "num_input_tokens_seen": 169237584, + "step": 78420 + }, + { + "epoch": 12.793637846655791, + "grad_norm": 0.003834417322650552, + "learning_rate": 0.0003460331711708569, + "loss": 0.1088, + "num_input_tokens_seen": 169249392, + "step": 78425 + }, + { + "epoch": 12.794453507340947, + "grad_norm": 0.024947822093963623, + "learning_rate": 0.00034596545169049013, + "loss": 0.1095, + "num_input_tokens_seen": 169260112, + "step": 78430 + }, + { + "epoch": 12.7952691680261, + "grad_norm": 0.006165041588246822, + "learning_rate": 0.00034589773533182924, + "loss": 0.0321, + "num_input_tokens_seen": 169270608, + "step": 78435 + }, + { + "epoch": 12.796084828711257, + "grad_norm": 0.06385862827301025, + "learning_rate": 0.00034583002209624594, + "loss": 0.0099, + "num_input_tokens_seen": 169281456, + "step": 78440 + }, + { + "epoch": 12.79690048939641, + "grad_norm": 0.07925893366336823, + "learning_rate": 0.0003457623119851129, + "loss": 0.0056, + "num_input_tokens_seen": 169291888, + "step": 78445 + }, + { + "epoch": 12.797716150081566, + "grad_norm": 0.025786830112338066, + "learning_rate": 0.00034569460499980233, + "loss": 0.0199, + "num_input_tokens_seen": 169302512, + "step": 78450 + }, + { + "epoch": 12.798531810766722, + "grad_norm": 0.003699333406984806, + "learning_rate": 0.00034562690114168626, + "loss": 0.018, + "num_input_tokens_seen": 169314128, + "step": 78455 + }, + { + "epoch": 12.799347471451876, + "grad_norm": 0.026506368070840836, + "learning_rate": 0.000345559200412137, + "loss": 0.117, + "num_input_tokens_seen": 169324848, + "step": 78460 + }, + { + "epoch": 12.800163132137031, + "grad_norm": 0.06420192122459412, + "learning_rate": 0.00034549150281252633, + "loss": 0.019, + "num_input_tokens_seen": 169334416, + "step": 78465 + }, + { + "epoch": 12.800978792822185, + "grad_norm": 0.0075374385342001915, + "learning_rate": 0.00034542380834422633, + "loss": 0.1454, + "num_input_tokens_seen": 169345456, + "step": 78470 + }, + { + "epoch": 12.801794453507341, + "grad_norm": 0.16572853922843933, + "learning_rate": 0.00034535611700860913, + "loss": 0.0926, + "num_input_tokens_seen": 169355920, + "step": 78475 + }, + { + "epoch": 12.802610114192497, + "grad_norm": 0.7074472904205322, + "learning_rate": 0.00034528842880704626, + "loss": 0.0819, + "num_input_tokens_seen": 169365968, + "step": 78480 + }, + { + "epoch": 12.80342577487765, + "grad_norm": 0.008469204418361187, + "learning_rate": 0.0003452207437409097, + "loss": 0.002, + "num_input_tokens_seen": 169377776, + "step": 78485 + }, + { + "epoch": 12.804241435562806, + "grad_norm": 0.03265415504574776, + "learning_rate": 0.00034515306181157106, + "loss": 0.0047, + "num_input_tokens_seen": 169389456, + "step": 78490 + }, + { + "epoch": 12.80505709624796, + "grad_norm": 0.0499798022210598, + "learning_rate": 0.00034508538302040225, + "loss": 0.006, + "num_input_tokens_seen": 169400336, + "step": 78495 + }, + { + "epoch": 12.805872756933116, + "grad_norm": 0.06109917536377907, + "learning_rate": 0.00034501770736877443, + "loss": 0.1229, + "num_input_tokens_seen": 169411856, + "step": 78500 + }, + { + "epoch": 12.80668841761827, + "grad_norm": 0.0008933853241614997, + "learning_rate": 0.0003449500348580596, + "loss": 0.0101, + "num_input_tokens_seen": 169423056, + "step": 78505 + }, + { + "epoch": 12.807504078303426, + "grad_norm": 0.0005050352774560452, + "learning_rate": 0.0003448823654896288, + "loss": 0.0043, + "num_input_tokens_seen": 169434320, + "step": 78510 + }, + { + "epoch": 12.808319738988581, + "grad_norm": 0.03926096856594086, + "learning_rate": 0.00034481469926485385, + "loss": 0.0543, + "num_input_tokens_seen": 169446384, + "step": 78515 + }, + { + "epoch": 12.809135399673735, + "grad_norm": 0.014172976836562157, + "learning_rate": 0.00034474703618510565, + "loss": 0.1635, + "num_input_tokens_seen": 169457264, + "step": 78520 + }, + { + "epoch": 12.809951060358891, + "grad_norm": 0.09962073713541031, + "learning_rate": 0.00034467937625175596, + "loss": 0.0367, + "num_input_tokens_seen": 169467920, + "step": 78525 + }, + { + "epoch": 12.810766721044045, + "grad_norm": 0.013170093297958374, + "learning_rate": 0.00034461171946617553, + "loss": 0.0231, + "num_input_tokens_seen": 169478736, + "step": 78530 + }, + { + "epoch": 12.8115823817292, + "grad_norm": 0.01600045897066593, + "learning_rate": 0.0003445440658297357, + "loss": 0.0155, + "num_input_tokens_seen": 169489200, + "step": 78535 + }, + { + "epoch": 12.812398042414356, + "grad_norm": 0.018437277525663376, + "learning_rate": 0.0003444764153438079, + "loss": 0.0332, + "num_input_tokens_seen": 169499824, + "step": 78540 + }, + { + "epoch": 12.81321370309951, + "grad_norm": 0.015076526440680027, + "learning_rate": 0.0003444087680097625, + "loss": 0.0057, + "num_input_tokens_seen": 169511280, + "step": 78545 + }, + { + "epoch": 12.814029363784666, + "grad_norm": 0.05792335793375969, + "learning_rate": 0.00034434112382897107, + "loss": 0.0098, + "num_input_tokens_seen": 169523216, + "step": 78550 + }, + { + "epoch": 12.81484502446982, + "grad_norm": 0.03426433354616165, + "learning_rate": 0.000344273482802804, + "loss": 0.0783, + "num_input_tokens_seen": 169534192, + "step": 78555 + }, + { + "epoch": 12.815660685154976, + "grad_norm": 0.036770910024642944, + "learning_rate": 0.00034420584493263264, + "loss": 0.0104, + "num_input_tokens_seen": 169544976, + "step": 78560 + }, + { + "epoch": 12.81647634584013, + "grad_norm": 0.006304776296019554, + "learning_rate": 0.0003441382102198272, + "loss": 0.0016, + "num_input_tokens_seen": 169555568, + "step": 78565 + }, + { + "epoch": 12.817292006525285, + "grad_norm": 0.06469281017780304, + "learning_rate": 0.0003440705786657588, + "loss": 0.0063, + "num_input_tokens_seen": 169565872, + "step": 78570 + }, + { + "epoch": 12.818107667210441, + "grad_norm": 0.019119439646601677, + "learning_rate": 0.00034400295027179776, + "loss": 0.065, + "num_input_tokens_seen": 169576944, + "step": 78575 + }, + { + "epoch": 12.818923327895595, + "grad_norm": 0.0005228684167377651, + "learning_rate": 0.00034393532503931514, + "loss": 0.059, + "num_input_tokens_seen": 169587536, + "step": 78580 + }, + { + "epoch": 12.81973898858075, + "grad_norm": 0.3224520683288574, + "learning_rate": 0.0003438677029696808, + "loss": 0.0242, + "num_input_tokens_seen": 169597648, + "step": 78585 + }, + { + "epoch": 12.820554649265905, + "grad_norm": 0.052939195185899734, + "learning_rate": 0.0003438000840642657, + "loss": 0.006, + "num_input_tokens_seen": 169607920, + "step": 78590 + }, + { + "epoch": 12.82137030995106, + "grad_norm": 0.010336203500628471, + "learning_rate": 0.00034373246832444007, + "loss": 0.0122, + "num_input_tokens_seen": 169618672, + "step": 78595 + }, + { + "epoch": 12.822185970636216, + "grad_norm": 0.005961438175290823, + "learning_rate": 0.00034366485575157413, + "loss": 0.0167, + "num_input_tokens_seen": 169628816, + "step": 78600 + }, + { + "epoch": 12.82300163132137, + "grad_norm": 0.16833347082138062, + "learning_rate": 0.00034359724634703827, + "loss": 0.0181, + "num_input_tokens_seen": 169639952, + "step": 78605 + }, + { + "epoch": 12.823817292006526, + "grad_norm": 0.024784987792372704, + "learning_rate": 0.0003435296401122027, + "loss": 0.0119, + "num_input_tokens_seen": 169650992, + "step": 78610 + }, + { + "epoch": 12.82463295269168, + "grad_norm": 0.007033790927380323, + "learning_rate": 0.0003434620370484372, + "loss": 0.0025, + "num_input_tokens_seen": 169662320, + "step": 78615 + }, + { + "epoch": 12.825448613376835, + "grad_norm": 0.0031684411223977804, + "learning_rate": 0.0003433944371571124, + "loss": 0.0045, + "num_input_tokens_seen": 169672112, + "step": 78620 + }, + { + "epoch": 12.826264274061991, + "grad_norm": 0.006342599634081125, + "learning_rate": 0.00034332684043959777, + "loss": 0.0259, + "num_input_tokens_seen": 169682384, + "step": 78625 + }, + { + "epoch": 12.827079934747145, + "grad_norm": 0.006706783082336187, + "learning_rate": 0.00034325924689726376, + "loss": 0.0017, + "num_input_tokens_seen": 169693232, + "step": 78630 + }, + { + "epoch": 12.8278955954323, + "grad_norm": 0.004058022052049637, + "learning_rate": 0.00034319165653147964, + "loss": 0.02, + "num_input_tokens_seen": 169704432, + "step": 78635 + }, + { + "epoch": 12.828711256117455, + "grad_norm": 0.0036801008973270655, + "learning_rate": 0.00034312406934361553, + "loss": 0.0442, + "num_input_tokens_seen": 169713840, + "step": 78640 + }, + { + "epoch": 12.82952691680261, + "grad_norm": 0.021819138899445534, + "learning_rate": 0.0003430564853350414, + "loss": 0.0211, + "num_input_tokens_seen": 169725136, + "step": 78645 + }, + { + "epoch": 12.830342577487766, + "grad_norm": 0.00030076521215960383, + "learning_rate": 0.0003429889045071265, + "loss": 0.0966, + "num_input_tokens_seen": 169737008, + "step": 78650 + }, + { + "epoch": 12.83115823817292, + "grad_norm": 0.007980968803167343, + "learning_rate": 0.0003429213268612408, + "loss": 0.074, + "num_input_tokens_seen": 169747600, + "step": 78655 + }, + { + "epoch": 12.831973898858076, + "grad_norm": 0.044991377741098404, + "learning_rate": 0.0003428537523987535, + "loss": 0.1221, + "num_input_tokens_seen": 169757616, + "step": 78660 + }, + { + "epoch": 12.83278955954323, + "grad_norm": 0.09849607199430466, + "learning_rate": 0.0003427861811210345, + "loss": 0.0206, + "num_input_tokens_seen": 169768784, + "step": 78665 + }, + { + "epoch": 12.833605220228385, + "grad_norm": 0.002698304830119014, + "learning_rate": 0.0003427186130294527, + "loss": 0.0093, + "num_input_tokens_seen": 169780208, + "step": 78670 + }, + { + "epoch": 12.83442088091354, + "grad_norm": 0.011984925717115402, + "learning_rate": 0.00034265104812537805, + "loss": 0.0068, + "num_input_tokens_seen": 169792048, + "step": 78675 + }, + { + "epoch": 12.835236541598695, + "grad_norm": 0.7704172730445862, + "learning_rate": 0.0003425834864101792, + "loss": 0.0636, + "num_input_tokens_seen": 169802832, + "step": 78680 + }, + { + "epoch": 12.83605220228385, + "grad_norm": 0.018321076408028603, + "learning_rate": 0.000342515927885226, + "loss": 0.0027, + "num_input_tokens_seen": 169814032, + "step": 78685 + }, + { + "epoch": 12.836867862969005, + "grad_norm": 0.0016966605326160789, + "learning_rate": 0.000342448372551887, + "loss": 0.1135, + "num_input_tokens_seen": 169824016, + "step": 78690 + }, + { + "epoch": 12.83768352365416, + "grad_norm": 0.004727560095489025, + "learning_rate": 0.0003423808204115318, + "loss": 0.0281, + "num_input_tokens_seen": 169834704, + "step": 78695 + }, + { + "epoch": 12.838499184339314, + "grad_norm": 0.010498922318220139, + "learning_rate": 0.00034231327146552916, + "loss": 0.1628, + "num_input_tokens_seen": 169845040, + "step": 78700 + }, + { + "epoch": 12.83931484502447, + "grad_norm": 0.010280202142894268, + "learning_rate": 0.00034224572571524823, + "loss": 0.0099, + "num_input_tokens_seen": 169856944, + "step": 78705 + }, + { + "epoch": 12.840130505709626, + "grad_norm": 0.04722465202212334, + "learning_rate": 0.00034217818316205757, + "loss": 0.0344, + "num_input_tokens_seen": 169868048, + "step": 78710 + }, + { + "epoch": 12.84094616639478, + "grad_norm": 0.47131356596946716, + "learning_rate": 0.0003421106438073265, + "loss": 0.0991, + "num_input_tokens_seen": 169877776, + "step": 78715 + }, + { + "epoch": 12.841761827079935, + "grad_norm": 0.007632437627762556, + "learning_rate": 0.0003420431076524233, + "loss": 0.1523, + "num_input_tokens_seen": 169888592, + "step": 78720 + }, + { + "epoch": 12.84257748776509, + "grad_norm": 0.0010299590649083257, + "learning_rate": 0.0003419755746987171, + "loss": 0.0089, + "num_input_tokens_seen": 169899824, + "step": 78725 + }, + { + "epoch": 12.843393148450245, + "grad_norm": 0.09987709671258926, + "learning_rate": 0.0003419080449475761, + "loss": 0.0072, + "num_input_tokens_seen": 169910064, + "step": 78730 + }, + { + "epoch": 12.844208809135399, + "grad_norm": 0.2972666323184967, + "learning_rate": 0.0003418405184003693, + "loss": 0.0353, + "num_input_tokens_seen": 169920112, + "step": 78735 + }, + { + "epoch": 12.845024469820554, + "grad_norm": 0.004713247064501047, + "learning_rate": 0.000341772995058465, + "loss": 0.0079, + "num_input_tokens_seen": 169931088, + "step": 78740 + }, + { + "epoch": 12.84584013050571, + "grad_norm": 0.01959659904241562, + "learning_rate": 0.0003417054749232316, + "loss": 0.0098, + "num_input_tokens_seen": 169942160, + "step": 78745 + }, + { + "epoch": 12.846655791190864, + "grad_norm": 0.164876788854599, + "learning_rate": 0.0003416379579960377, + "loss": 0.0183, + "num_input_tokens_seen": 169951216, + "step": 78750 + }, + { + "epoch": 12.84747145187602, + "grad_norm": 0.006408587098121643, + "learning_rate": 0.00034157044427825137, + "loss": 0.0034, + "num_input_tokens_seen": 169962288, + "step": 78755 + }, + { + "epoch": 12.848287112561174, + "grad_norm": 0.0018021262949332595, + "learning_rate": 0.000341502933771241, + "loss": 0.0032, + "num_input_tokens_seen": 169973648, + "step": 78760 + }, + { + "epoch": 12.84910277324633, + "grad_norm": 0.8015586137771606, + "learning_rate": 0.00034143542647637474, + "loss": 0.1763, + "num_input_tokens_seen": 169984688, + "step": 78765 + }, + { + "epoch": 12.849918433931485, + "grad_norm": 0.0012745046988129616, + "learning_rate": 0.00034136792239502074, + "loss": 0.1462, + "num_input_tokens_seen": 169996240, + "step": 78770 + }, + { + "epoch": 12.850734094616639, + "grad_norm": 0.06848564743995667, + "learning_rate": 0.000341300421528547, + "loss": 0.0415, + "num_input_tokens_seen": 170006768, + "step": 78775 + }, + { + "epoch": 12.851549755301795, + "grad_norm": 0.0323820635676384, + "learning_rate": 0.0003412329238783216, + "loss": 0.0292, + "num_input_tokens_seen": 170017200, + "step": 78780 + }, + { + "epoch": 12.852365415986949, + "grad_norm": 0.0033069495111703873, + "learning_rate": 0.00034116542944571227, + "loss": 0.0036, + "num_input_tokens_seen": 170029168, + "step": 78785 + }, + { + "epoch": 12.853181076672104, + "grad_norm": 0.2374097853899002, + "learning_rate": 0.00034109793823208724, + "loss": 0.0378, + "num_input_tokens_seen": 170040528, + "step": 78790 + }, + { + "epoch": 12.85399673735726, + "grad_norm": 0.001168357557617128, + "learning_rate": 0.0003410304502388139, + "loss": 0.0547, + "num_input_tokens_seen": 170051568, + "step": 78795 + }, + { + "epoch": 12.854812398042414, + "grad_norm": 0.0037985709495842457, + "learning_rate": 0.0003409629654672602, + "loss": 0.0069, + "num_input_tokens_seen": 170062864, + "step": 78800 + }, + { + "epoch": 12.85562805872757, + "grad_norm": 0.013187333010137081, + "learning_rate": 0.0003408954839187938, + "loss": 0.0178, + "num_input_tokens_seen": 170073840, + "step": 78805 + }, + { + "epoch": 12.856443719412724, + "grad_norm": 0.04127464070916176, + "learning_rate": 0.0003408280055947823, + "loss": 0.0226, + "num_input_tokens_seen": 170085456, + "step": 78810 + }, + { + "epoch": 12.85725938009788, + "grad_norm": 0.0017363085644319654, + "learning_rate": 0.00034076053049659295, + "loss": 0.0788, + "num_input_tokens_seen": 170095920, + "step": 78815 + }, + { + "epoch": 12.858075040783035, + "grad_norm": 0.17673943936824799, + "learning_rate": 0.00034069305862559373, + "loss": 0.0081, + "num_input_tokens_seen": 170105968, + "step": 78820 + }, + { + "epoch": 12.858890701468189, + "grad_norm": 0.04637615755200386, + "learning_rate": 0.00034062558998315163, + "loss": 0.0298, + "num_input_tokens_seen": 170116976, + "step": 78825 + }, + { + "epoch": 12.859706362153345, + "grad_norm": 0.0843796357512474, + "learning_rate": 0.0003405581245706342, + "loss": 0.0208, + "num_input_tokens_seen": 170127504, + "step": 78830 + }, + { + "epoch": 12.860522022838499, + "grad_norm": 0.012086664326488972, + "learning_rate": 0.0003404906623894085, + "loss": 0.024, + "num_input_tokens_seen": 170137904, + "step": 78835 + }, + { + "epoch": 12.861337683523654, + "grad_norm": 0.0035488971043378115, + "learning_rate": 0.0003404232034408421, + "loss": 0.0014, + "num_input_tokens_seen": 170149840, + "step": 78840 + }, + { + "epoch": 12.86215334420881, + "grad_norm": 0.0027029954362660646, + "learning_rate": 0.00034035574772630175, + "loss": 0.0083, + "num_input_tokens_seen": 170161072, + "step": 78845 + }, + { + "epoch": 12.862969004893964, + "grad_norm": 0.4472430646419525, + "learning_rate": 0.00034028829524715464, + "loss": 0.0448, + "num_input_tokens_seen": 170171024, + "step": 78850 + }, + { + "epoch": 12.86378466557912, + "grad_norm": 0.4353938698768616, + "learning_rate": 0.000340220846004768, + "loss": 0.0822, + "num_input_tokens_seen": 170181712, + "step": 78855 + }, + { + "epoch": 12.864600326264274, + "grad_norm": 0.012199142016470432, + "learning_rate": 0.00034015340000050846, + "loss": 0.0032, + "num_input_tokens_seen": 170193040, + "step": 78860 + }, + { + "epoch": 12.86541598694943, + "grad_norm": 0.0036806385032832623, + "learning_rate": 0.00034008595723574326, + "loss": 0.01, + "num_input_tokens_seen": 170205328, + "step": 78865 + }, + { + "epoch": 12.866231647634583, + "grad_norm": 0.021960150450468063, + "learning_rate": 0.00034001851771183877, + "loss": 0.0063, + "num_input_tokens_seen": 170215888, + "step": 78870 + }, + { + "epoch": 12.867047308319739, + "grad_norm": 0.002223666524514556, + "learning_rate": 0.00033995108143016216, + "loss": 0.0032, + "num_input_tokens_seen": 170227376, + "step": 78875 + }, + { + "epoch": 12.867862969004895, + "grad_norm": 0.06978488713502884, + "learning_rate": 0.0003398836483920798, + "loss": 0.0087, + "num_input_tokens_seen": 170238416, + "step": 78880 + }, + { + "epoch": 12.868678629690049, + "grad_norm": 0.017015250399708748, + "learning_rate": 0.0003398162185989586, + "loss": 0.0302, + "num_input_tokens_seen": 170249552, + "step": 78885 + }, + { + "epoch": 12.869494290375204, + "grad_norm": 0.08934586495161057, + "learning_rate": 0.0003397487920521647, + "loss": 0.0076, + "num_input_tokens_seen": 170260240, + "step": 78890 + }, + { + "epoch": 12.870309951060358, + "grad_norm": 0.02959534339606762, + "learning_rate": 0.00033968136875306496, + "loss": 0.0073, + "num_input_tokens_seen": 170269808, + "step": 78895 + }, + { + "epoch": 12.871125611745514, + "grad_norm": 0.20734862983226776, + "learning_rate": 0.0003396139487030256, + "loss": 0.117, + "num_input_tokens_seen": 170281040, + "step": 78900 + }, + { + "epoch": 12.87194127243067, + "grad_norm": 0.025473013520240784, + "learning_rate": 0.00033954653190341306, + "loss": 0.0068, + "num_input_tokens_seen": 170290320, + "step": 78905 + }, + { + "epoch": 12.872756933115824, + "grad_norm": 0.12250814586877823, + "learning_rate": 0.0003394791183555936, + "loss": 0.0114, + "num_input_tokens_seen": 170300944, + "step": 78910 + }, + { + "epoch": 12.87357259380098, + "grad_norm": 0.0057367864064872265, + "learning_rate": 0.0003394117080609335, + "loss": 0.0232, + "num_input_tokens_seen": 170311600, + "step": 78915 + }, + { + "epoch": 12.874388254486133, + "grad_norm": 0.006193656008690596, + "learning_rate": 0.0003393443010207988, + "loss": 0.0022, + "num_input_tokens_seen": 170322000, + "step": 78920 + }, + { + "epoch": 12.875203915171289, + "grad_norm": 0.008306687697768211, + "learning_rate": 0.0003392768972365556, + "loss": 0.0181, + "num_input_tokens_seen": 170332240, + "step": 78925 + }, + { + "epoch": 12.876019575856443, + "grad_norm": 0.2128003090620041, + "learning_rate": 0.00033920949670956994, + "loss": 0.0157, + "num_input_tokens_seen": 170343120, + "step": 78930 + }, + { + "epoch": 12.876835236541599, + "grad_norm": 0.02738361433148384, + "learning_rate": 0.000339142099441208, + "loss": 0.0817, + "num_input_tokens_seen": 170352816, + "step": 78935 + }, + { + "epoch": 12.877650897226754, + "grad_norm": 0.03327373042702675, + "learning_rate": 0.0003390747054328353, + "loss": 0.0286, + "num_input_tokens_seen": 170362448, + "step": 78940 + }, + { + "epoch": 12.878466557911908, + "grad_norm": 0.13079527020454407, + "learning_rate": 0.00033900731468581804, + "loss": 0.2258, + "num_input_tokens_seen": 170373040, + "step": 78945 + }, + { + "epoch": 12.879282218597064, + "grad_norm": 0.1498461812734604, + "learning_rate": 0.0003389399272015215, + "loss": 0.0092, + "num_input_tokens_seen": 170383984, + "step": 78950 + }, + { + "epoch": 12.880097879282218, + "grad_norm": 0.006227763369679451, + "learning_rate": 0.0003388725429813117, + "loss": 0.0097, + "num_input_tokens_seen": 170394608, + "step": 78955 + }, + { + "epoch": 12.880913539967374, + "grad_norm": 0.0021961843594908714, + "learning_rate": 0.0003388051620265544, + "loss": 0.0156, + "num_input_tokens_seen": 170406480, + "step": 78960 + }, + { + "epoch": 12.88172920065253, + "grad_norm": 0.10260043293237686, + "learning_rate": 0.0003387377843386148, + "loss": 0.0658, + "num_input_tokens_seen": 170417936, + "step": 78965 + }, + { + "epoch": 12.882544861337683, + "grad_norm": 0.03383588418364525, + "learning_rate": 0.00033867040991885885, + "loss": 0.0436, + "num_input_tokens_seen": 170428848, + "step": 78970 + }, + { + "epoch": 12.883360522022839, + "grad_norm": 0.5421835780143738, + "learning_rate": 0.0003386030387686514, + "loss": 0.2169, + "num_input_tokens_seen": 170439440, + "step": 78975 + }, + { + "epoch": 12.884176182707993, + "grad_norm": 0.0048496234230697155, + "learning_rate": 0.0003385356708893584, + "loss": 0.053, + "num_input_tokens_seen": 170450544, + "step": 78980 + }, + { + "epoch": 12.884991843393149, + "grad_norm": 0.022471651434898376, + "learning_rate": 0.0003384683062823446, + "loss": 0.0323, + "num_input_tokens_seen": 170462384, + "step": 78985 + }, + { + "epoch": 12.885807504078304, + "grad_norm": 0.00471143051981926, + "learning_rate": 0.00033840094494897566, + "loss": 0.0095, + "num_input_tokens_seen": 170473520, + "step": 78990 + }, + { + "epoch": 12.886623164763458, + "grad_norm": 0.002814502688124776, + "learning_rate": 0.0003383335868906164, + "loss": 0.0046, + "num_input_tokens_seen": 170484080, + "step": 78995 + }, + { + "epoch": 12.887438825448614, + "grad_norm": 0.06917837262153625, + "learning_rate": 0.0003382662321086324, + "loss": 0.0088, + "num_input_tokens_seen": 170495568, + "step": 79000 + }, + { + "epoch": 12.888254486133768, + "grad_norm": 0.01607641577720642, + "learning_rate": 0.0003381988806043881, + "loss": 0.0047, + "num_input_tokens_seen": 170505968, + "step": 79005 + }, + { + "epoch": 12.889070146818923, + "grad_norm": 0.013988323509693146, + "learning_rate": 0.0003381315323792489, + "loss": 0.0024, + "num_input_tokens_seen": 170515536, + "step": 79010 + }, + { + "epoch": 12.88988580750408, + "grad_norm": 0.010753112845122814, + "learning_rate": 0.00033806418743457937, + "loss": 0.0051, + "num_input_tokens_seen": 170526160, + "step": 79015 + }, + { + "epoch": 12.890701468189233, + "grad_norm": 0.009210849180817604, + "learning_rate": 0.0003379968457717447, + "loss": 0.0652, + "num_input_tokens_seen": 170537776, + "step": 79020 + }, + { + "epoch": 12.891517128874389, + "grad_norm": 0.07077978551387787, + "learning_rate": 0.00033792950739210934, + "loss": 0.012, + "num_input_tokens_seen": 170549104, + "step": 79025 + }, + { + "epoch": 12.892332789559543, + "grad_norm": 0.015330792404711246, + "learning_rate": 0.0003378621722970382, + "loss": 0.0209, + "num_input_tokens_seen": 170559504, + "step": 79030 + }, + { + "epoch": 12.893148450244698, + "grad_norm": 0.03747064992785454, + "learning_rate": 0.00033779484048789574, + "loss": 0.0195, + "num_input_tokens_seen": 170571312, + "step": 79035 + }, + { + "epoch": 12.893964110929852, + "grad_norm": 0.004267733544111252, + "learning_rate": 0.0003377275119660467, + "loss": 0.0228, + "num_input_tokens_seen": 170582896, + "step": 79040 + }, + { + "epoch": 12.894779771615008, + "grad_norm": 0.05585867539048195, + "learning_rate": 0.00033766018673285535, + "loss": 0.0688, + "num_input_tokens_seen": 170592336, + "step": 79045 + }, + { + "epoch": 12.895595432300164, + "grad_norm": 0.0016656540101394057, + "learning_rate": 0.0003375928647896863, + "loss": 0.0087, + "num_input_tokens_seen": 170604336, + "step": 79050 + }, + { + "epoch": 12.896411092985318, + "grad_norm": 0.5410192012786865, + "learning_rate": 0.000337525546137904, + "loss": 0.2073, + "num_input_tokens_seen": 170615504, + "step": 79055 + }, + { + "epoch": 12.897226753670473, + "grad_norm": 0.043145012110471725, + "learning_rate": 0.0003374582307788725, + "loss": 0.028, + "num_input_tokens_seen": 170625264, + "step": 79060 + }, + { + "epoch": 12.898042414355627, + "grad_norm": 0.03822370991110802, + "learning_rate": 0.0003373909187139562, + "loss": 0.0242, + "num_input_tokens_seen": 170636464, + "step": 79065 + }, + { + "epoch": 12.898858075040783, + "grad_norm": 0.3872824013233185, + "learning_rate": 0.0003373236099445191, + "loss": 0.0267, + "num_input_tokens_seen": 170647632, + "step": 79070 + }, + { + "epoch": 12.899673735725939, + "grad_norm": 0.0950385183095932, + "learning_rate": 0.00033725630447192556, + "loss": 0.1081, + "num_input_tokens_seen": 170658192, + "step": 79075 + }, + { + "epoch": 12.900489396411093, + "grad_norm": 0.04566628858447075, + "learning_rate": 0.0003371890022975394, + "loss": 0.0769, + "num_input_tokens_seen": 170668272, + "step": 79080 + }, + { + "epoch": 12.901305057096248, + "grad_norm": 0.02564418874680996, + "learning_rate": 0.0003371217034227247, + "loss": 0.0024, + "num_input_tokens_seen": 170677232, + "step": 79085 + }, + { + "epoch": 12.902120717781402, + "grad_norm": 0.11556129157543182, + "learning_rate": 0.0003370544078488453, + "loss": 0.042, + "num_input_tokens_seen": 170688688, + "step": 79090 + }, + { + "epoch": 12.902936378466558, + "grad_norm": 0.005944822449237108, + "learning_rate": 0.000336987115577265, + "loss": 0.0261, + "num_input_tokens_seen": 170699536, + "step": 79095 + }, + { + "epoch": 12.903752039151712, + "grad_norm": 0.017453515902161598, + "learning_rate": 0.0003369198266093475, + "loss": 0.1224, + "num_input_tokens_seen": 170709872, + "step": 79100 + }, + { + "epoch": 12.904567699836868, + "grad_norm": 0.029280414804816246, + "learning_rate": 0.00033685254094645685, + "loss": 0.0454, + "num_input_tokens_seen": 170720432, + "step": 79105 + }, + { + "epoch": 12.905383360522023, + "grad_norm": 0.004507084842771292, + "learning_rate": 0.0003367852585899562, + "loss": 0.0014, + "num_input_tokens_seen": 170730448, + "step": 79110 + }, + { + "epoch": 12.906199021207177, + "grad_norm": 0.008810460567474365, + "learning_rate": 0.00033671797954120953, + "loss": 0.2128, + "num_input_tokens_seen": 170740176, + "step": 79115 + }, + { + "epoch": 12.907014681892333, + "grad_norm": 0.5490175485610962, + "learning_rate": 0.0003366507038015799, + "loss": 0.0353, + "num_input_tokens_seen": 170750832, + "step": 79120 + }, + { + "epoch": 12.907830342577487, + "grad_norm": 0.002729184692725539, + "learning_rate": 0.0003365834313724312, + "loss": 0.0058, + "num_input_tokens_seen": 170761680, + "step": 79125 + }, + { + "epoch": 12.908646003262643, + "grad_norm": 0.026525184512138367, + "learning_rate": 0.00033651616225512636, + "loss": 0.0101, + "num_input_tokens_seen": 170771536, + "step": 79130 + }, + { + "epoch": 12.909461663947798, + "grad_norm": 0.00492516765370965, + "learning_rate": 0.0003364488964510292, + "loss": 0.0042, + "num_input_tokens_seen": 170783216, + "step": 79135 + }, + { + "epoch": 12.910277324632952, + "grad_norm": 0.039238233119249344, + "learning_rate": 0.00033638163396150234, + "loss": 0.049, + "num_input_tokens_seen": 170793296, + "step": 79140 + }, + { + "epoch": 12.911092985318108, + "grad_norm": 0.0004934008466079831, + "learning_rate": 0.0003363143747879094, + "loss": 0.003, + "num_input_tokens_seen": 170803760, + "step": 79145 + }, + { + "epoch": 12.911908646003262, + "grad_norm": 0.4092390537261963, + "learning_rate": 0.00033624711893161317, + "loss": 0.0807, + "num_input_tokens_seen": 170813968, + "step": 79150 + }, + { + "epoch": 12.912724306688418, + "grad_norm": 0.042792607098817825, + "learning_rate": 0.000336179866393977, + "loss": 0.0072, + "num_input_tokens_seen": 170824080, + "step": 79155 + }, + { + "epoch": 12.913539967373573, + "grad_norm": 0.002937833545729518, + "learning_rate": 0.0003361126171763634, + "loss": 0.002, + "num_input_tokens_seen": 170835600, + "step": 79160 + }, + { + "epoch": 12.914355628058727, + "grad_norm": 0.001167229376733303, + "learning_rate": 0.0003360453712801358, + "loss": 0.0045, + "num_input_tokens_seen": 170847088, + "step": 79165 + }, + { + "epoch": 12.915171288743883, + "grad_norm": 0.009961942210793495, + "learning_rate": 0.00033597812870665657, + "loss": 0.003, + "num_input_tokens_seen": 170857040, + "step": 79170 + }, + { + "epoch": 12.915986949429037, + "grad_norm": 0.0009721548412926495, + "learning_rate": 0.00033591088945728856, + "loss": 0.0294, + "num_input_tokens_seen": 170868432, + "step": 79175 + }, + { + "epoch": 12.916802610114193, + "grad_norm": 0.11340523511171341, + "learning_rate": 0.0003358436535333947, + "loss": 0.0066, + "num_input_tokens_seen": 170880176, + "step": 79180 + }, + { + "epoch": 12.917618270799348, + "grad_norm": 0.016127068549394608, + "learning_rate": 0.0003357764209363373, + "loss": 0.0181, + "num_input_tokens_seen": 170890576, + "step": 79185 + }, + { + "epoch": 12.918433931484502, + "grad_norm": 0.031770359724760056, + "learning_rate": 0.00033570919166747926, + "loss": 0.0917, + "num_input_tokens_seen": 170902384, + "step": 79190 + }, + { + "epoch": 12.919249592169658, + "grad_norm": 0.5424258708953857, + "learning_rate": 0.0003356419657281827, + "loss": 0.0662, + "num_input_tokens_seen": 170913936, + "step": 79195 + }, + { + "epoch": 12.920065252854812, + "grad_norm": 0.026227612048387527, + "learning_rate": 0.0003355747431198104, + "loss": 0.0071, + "num_input_tokens_seen": 170924816, + "step": 79200 + }, + { + "epoch": 12.920880913539968, + "grad_norm": 0.024833541363477707, + "learning_rate": 0.0003355075238437243, + "loss": 0.0222, + "num_input_tokens_seen": 170936080, + "step": 79205 + }, + { + "epoch": 12.921696574225122, + "grad_norm": 0.016835888847708702, + "learning_rate": 0.0003354403079012871, + "loss": 0.0303, + "num_input_tokens_seen": 170947952, + "step": 79210 + }, + { + "epoch": 12.922512234910277, + "grad_norm": 0.060269519686698914, + "learning_rate": 0.0003353730952938606, + "loss": 0.0066, + "num_input_tokens_seen": 170959088, + "step": 79215 + }, + { + "epoch": 12.923327895595433, + "grad_norm": 0.0039131129160523415, + "learning_rate": 0.0003353058860228073, + "loss": 0.0029, + "num_input_tokens_seen": 170969136, + "step": 79220 + }, + { + "epoch": 12.924143556280587, + "grad_norm": 0.0018969993107020855, + "learning_rate": 0.0003352386800894891, + "loss": 0.0029, + "num_input_tokens_seen": 170979088, + "step": 79225 + }, + { + "epoch": 12.924959216965743, + "grad_norm": 0.012910080142319202, + "learning_rate": 0.0003351714774952681, + "loss": 0.0092, + "num_input_tokens_seen": 170989040, + "step": 79230 + }, + { + "epoch": 12.925774877650896, + "grad_norm": 0.04816114902496338, + "learning_rate": 0.00033510427824150625, + "loss": 0.0048, + "num_input_tokens_seen": 171000560, + "step": 79235 + }, + { + "epoch": 12.926590538336052, + "grad_norm": 0.0021728838328272104, + "learning_rate": 0.0003350370823295653, + "loss": 0.0108, + "num_input_tokens_seen": 171010064, + "step": 79240 + }, + { + "epoch": 12.927406199021208, + "grad_norm": 0.015196848660707474, + "learning_rate": 0.0003349698897608071, + "loss": 0.0239, + "num_input_tokens_seen": 171021328, + "step": 79245 + }, + { + "epoch": 12.928221859706362, + "grad_norm": 0.00736373383551836, + "learning_rate": 0.00033490270053659367, + "loss": 0.005, + "num_input_tokens_seen": 171032848, + "step": 79250 + }, + { + "epoch": 12.929037520391518, + "grad_norm": 0.03985320404171944, + "learning_rate": 0.0003348355146582862, + "loss": 0.0054, + "num_input_tokens_seen": 171043664, + "step": 79255 + }, + { + "epoch": 12.929853181076671, + "grad_norm": 0.04350687563419342, + "learning_rate": 0.00033476833212724676, + "loss": 0.0342, + "num_input_tokens_seen": 171055632, + "step": 79260 + }, + { + "epoch": 12.930668841761827, + "grad_norm": 0.011178359389305115, + "learning_rate": 0.0003347011529448365, + "loss": 0.0039, + "num_input_tokens_seen": 171066576, + "step": 79265 + }, + { + "epoch": 12.931484502446983, + "grad_norm": 0.0008386451518163085, + "learning_rate": 0.00033463397711241727, + "loss": 0.0058, + "num_input_tokens_seen": 171077424, + "step": 79270 + }, + { + "epoch": 12.932300163132137, + "grad_norm": 0.05824890732765198, + "learning_rate": 0.00033456680463135006, + "loss": 0.0051, + "num_input_tokens_seen": 171087600, + "step": 79275 + }, + { + "epoch": 12.933115823817293, + "grad_norm": 0.007049847394227982, + "learning_rate": 0.00033449963550299646, + "loss": 0.0242, + "num_input_tokens_seen": 171097968, + "step": 79280 + }, + { + "epoch": 12.933931484502446, + "grad_norm": 0.005853482987731695, + "learning_rate": 0.00033443246972871785, + "loss": 0.01, + "num_input_tokens_seen": 171108976, + "step": 79285 + }, + { + "epoch": 12.934747145187602, + "grad_norm": 0.16150861978530884, + "learning_rate": 0.000334365307309875, + "loss": 0.0221, + "num_input_tokens_seen": 171118576, + "step": 79290 + }, + { + "epoch": 12.935562805872756, + "grad_norm": 0.06067981943488121, + "learning_rate": 0.00033429814824782967, + "loss": 0.0041, + "num_input_tokens_seen": 171127760, + "step": 79295 + }, + { + "epoch": 12.936378466557912, + "grad_norm": 0.001441295724362135, + "learning_rate": 0.0003342309925439423, + "loss": 0.0034, + "num_input_tokens_seen": 171139472, + "step": 79300 + }, + { + "epoch": 12.937194127243067, + "grad_norm": 0.10349856317043304, + "learning_rate": 0.0003341638401995744, + "loss": 0.0155, + "num_input_tokens_seen": 171149072, + "step": 79305 + }, + { + "epoch": 12.938009787928221, + "grad_norm": 0.0035847376566380262, + "learning_rate": 0.0003340966912160864, + "loss": 0.0515, + "num_input_tokens_seen": 171160176, + "step": 79310 + }, + { + "epoch": 12.938825448613377, + "grad_norm": 0.002332607749849558, + "learning_rate": 0.00033402954559483966, + "loss": 0.0429, + "num_input_tokens_seen": 171170544, + "step": 79315 + }, + { + "epoch": 12.939641109298531, + "grad_norm": 0.0014024268602952361, + "learning_rate": 0.0003339624033371945, + "loss": 0.0092, + "num_input_tokens_seen": 171180112, + "step": 79320 + }, + { + "epoch": 12.940456769983687, + "grad_norm": 0.11708517372608185, + "learning_rate": 0.00033389526444451215, + "loss": 0.0195, + "num_input_tokens_seen": 171192272, + "step": 79325 + }, + { + "epoch": 12.941272430668842, + "grad_norm": 0.00030041002901270986, + "learning_rate": 0.00033382812891815267, + "loss": 0.031, + "num_input_tokens_seen": 171202960, + "step": 79330 + }, + { + "epoch": 12.942088091353996, + "grad_norm": 0.0007848786190152168, + "learning_rate": 0.00033376099675947726, + "loss": 0.0231, + "num_input_tokens_seen": 171213008, + "step": 79335 + }, + { + "epoch": 12.942903752039152, + "grad_norm": 0.055934689939022064, + "learning_rate": 0.0003336938679698459, + "loss": 0.0743, + "num_input_tokens_seen": 171224272, + "step": 79340 + }, + { + "epoch": 12.943719412724306, + "grad_norm": 0.008715910837054253, + "learning_rate": 0.0003336267425506194, + "loss": 0.002, + "num_input_tokens_seen": 171235440, + "step": 79345 + }, + { + "epoch": 12.944535073409462, + "grad_norm": 0.00874448660761118, + "learning_rate": 0.0003335596205031579, + "loss": 0.0052, + "num_input_tokens_seen": 171246928, + "step": 79350 + }, + { + "epoch": 12.945350734094617, + "grad_norm": 0.07247530668973923, + "learning_rate": 0.00033349250182882205, + "loss": 0.1289, + "num_input_tokens_seen": 171259024, + "step": 79355 + }, + { + "epoch": 12.946166394779771, + "grad_norm": 0.0032356390729546547, + "learning_rate": 0.0003334253865289717, + "loss": 0.0044, + "num_input_tokens_seen": 171269808, + "step": 79360 + }, + { + "epoch": 12.946982055464927, + "grad_norm": 0.003655375214293599, + "learning_rate": 0.00033335827460496725, + "loss": 0.0868, + "num_input_tokens_seen": 171280208, + "step": 79365 + }, + { + "epoch": 12.947797716150081, + "grad_norm": 0.16468943655490875, + "learning_rate": 0.0003332911660581688, + "loss": 0.0215, + "num_input_tokens_seen": 171291312, + "step": 79370 + }, + { + "epoch": 12.948613376835237, + "grad_norm": 0.0031945316586643457, + "learning_rate": 0.0003332240608899363, + "loss": 0.0092, + "num_input_tokens_seen": 171303088, + "step": 79375 + }, + { + "epoch": 12.949429037520392, + "grad_norm": 0.020599307492375374, + "learning_rate": 0.0003331569591016298, + "loss": 0.0454, + "num_input_tokens_seen": 171313840, + "step": 79380 + }, + { + "epoch": 12.950244698205546, + "grad_norm": 0.10623030364513397, + "learning_rate": 0.0003330898606946091, + "loss": 0.0064, + "num_input_tokens_seen": 171324048, + "step": 79385 + }, + { + "epoch": 12.951060358890702, + "grad_norm": 0.0016928468830883503, + "learning_rate": 0.0003330227656702342, + "loss": 0.0158, + "num_input_tokens_seen": 171334000, + "step": 79390 + }, + { + "epoch": 12.951876019575856, + "grad_norm": 0.02653159573674202, + "learning_rate": 0.00033295567402986476, + "loss": 0.0055, + "num_input_tokens_seen": 171344208, + "step": 79395 + }, + { + "epoch": 12.952691680261012, + "grad_norm": 0.19686631858348846, + "learning_rate": 0.0003328885857748605, + "loss": 0.0551, + "num_input_tokens_seen": 171355888, + "step": 79400 + }, + { + "epoch": 12.953507340946166, + "grad_norm": 0.021331926807761192, + "learning_rate": 0.00033282150090658115, + "loss": 0.0106, + "num_input_tokens_seen": 171365392, + "step": 79405 + }, + { + "epoch": 12.954323001631321, + "grad_norm": 0.026619870215654373, + "learning_rate": 0.0003327544194263861, + "loss": 0.0177, + "num_input_tokens_seen": 171375696, + "step": 79410 + }, + { + "epoch": 12.955138662316477, + "grad_norm": 0.30048179626464844, + "learning_rate": 0.0003326873413356347, + "loss": 0.011, + "num_input_tokens_seen": 171386096, + "step": 79415 + }, + { + "epoch": 12.955954323001631, + "grad_norm": 0.5854965448379517, + "learning_rate": 0.0003326202666356869, + "loss": 0.0964, + "num_input_tokens_seen": 171395664, + "step": 79420 + }, + { + "epoch": 12.956769983686787, + "grad_norm": 0.01849188096821308, + "learning_rate": 0.0003325531953279015, + "loss": 0.0025, + "num_input_tokens_seen": 171406928, + "step": 79425 + }, + { + "epoch": 12.95758564437194, + "grad_norm": 0.013005008921027184, + "learning_rate": 0.0003324861274136382, + "loss": 0.01, + "num_input_tokens_seen": 171417584, + "step": 79430 + }, + { + "epoch": 12.958401305057096, + "grad_norm": 0.004893851932138205, + "learning_rate": 0.0003324190628942558, + "loss": 0.0023, + "num_input_tokens_seen": 171428912, + "step": 79435 + }, + { + "epoch": 12.959216965742252, + "grad_norm": 0.008670249953866005, + "learning_rate": 0.000332352001771114, + "loss": 0.0098, + "num_input_tokens_seen": 171440112, + "step": 79440 + }, + { + "epoch": 12.960032626427406, + "grad_norm": 0.2036728858947754, + "learning_rate": 0.0003322849440455713, + "loss": 0.0094, + "num_input_tokens_seen": 171450576, + "step": 79445 + }, + { + "epoch": 12.960848287112562, + "grad_norm": 0.07918284833431244, + "learning_rate": 0.0003322178897189871, + "loss": 0.0099, + "num_input_tokens_seen": 171460496, + "step": 79450 + }, + { + "epoch": 12.961663947797716, + "grad_norm": 0.006521111354231834, + "learning_rate": 0.00033215083879272015, + "loss": 0.0405, + "num_input_tokens_seen": 171472016, + "step": 79455 + }, + { + "epoch": 12.962479608482871, + "grad_norm": 0.011404085904359818, + "learning_rate": 0.00033208379126812947, + "loss": 0.1501, + "num_input_tokens_seen": 171483056, + "step": 79460 + }, + { + "epoch": 12.963295269168025, + "grad_norm": 0.005795550998300314, + "learning_rate": 0.0003320167471465736, + "loss": 0.0059, + "num_input_tokens_seen": 171493488, + "step": 79465 + }, + { + "epoch": 12.964110929853181, + "grad_norm": 0.0027488071937114, + "learning_rate": 0.0003319497064294117, + "loss": 0.0032, + "num_input_tokens_seen": 171503696, + "step": 79470 + }, + { + "epoch": 12.964926590538337, + "grad_norm": 0.014351900666952133, + "learning_rate": 0.0003318826691180019, + "loss": 0.0017, + "num_input_tokens_seen": 171513936, + "step": 79475 + }, + { + "epoch": 12.96574225122349, + "grad_norm": 0.01836412213742733, + "learning_rate": 0.00033181563521370337, + "loss": 0.0295, + "num_input_tokens_seen": 171524496, + "step": 79480 + }, + { + "epoch": 12.966557911908646, + "grad_norm": 0.07562732696533203, + "learning_rate": 0.0003317486047178742, + "loss": 0.014, + "num_input_tokens_seen": 171533808, + "step": 79485 + }, + { + "epoch": 12.9673735725938, + "grad_norm": 0.0036102477461099625, + "learning_rate": 0.00033168157763187285, + "loss": 0.0099, + "num_input_tokens_seen": 171545296, + "step": 79490 + }, + { + "epoch": 12.968189233278956, + "grad_norm": 0.021140243858098984, + "learning_rate": 0.0003316145539570581, + "loss": 0.0041, + "num_input_tokens_seen": 171556464, + "step": 79495 + }, + { + "epoch": 12.969004893964112, + "grad_norm": 0.0016450731782242656, + "learning_rate": 0.00033154753369478787, + "loss": 0.0059, + "num_input_tokens_seen": 171567696, + "step": 79500 + }, + { + "epoch": 12.969820554649266, + "grad_norm": 0.0024694097228348255, + "learning_rate": 0.00033148051684642074, + "loss": 0.0028, + "num_input_tokens_seen": 171579248, + "step": 79505 + }, + { + "epoch": 12.970636215334421, + "grad_norm": 0.024948934093117714, + "learning_rate": 0.00033141350341331447, + "loss": 0.0137, + "num_input_tokens_seen": 171591216, + "step": 79510 + }, + { + "epoch": 12.971451876019575, + "grad_norm": 0.010334763675928116, + "learning_rate": 0.00033134649339682773, + "loss": 0.0116, + "num_input_tokens_seen": 171602160, + "step": 79515 + }, + { + "epoch": 12.97226753670473, + "grad_norm": 0.40165987610816956, + "learning_rate": 0.000331279486798318, + "loss": 0.0935, + "num_input_tokens_seen": 171612368, + "step": 79520 + }, + { + "epoch": 12.973083197389887, + "grad_norm": 0.11801686882972717, + "learning_rate": 0.0003312124836191437, + "loss": 0.0066, + "num_input_tokens_seen": 171621872, + "step": 79525 + }, + { + "epoch": 12.97389885807504, + "grad_norm": 0.07985808700323105, + "learning_rate": 0.00033114548386066234, + "loss": 0.0108, + "num_input_tokens_seen": 171631888, + "step": 79530 + }, + { + "epoch": 12.974714518760196, + "grad_norm": 0.0028225078713148832, + "learning_rate": 0.00033107848752423203, + "loss": 0.0074, + "num_input_tokens_seen": 171643504, + "step": 79535 + }, + { + "epoch": 12.97553017944535, + "grad_norm": 0.19482412934303284, + "learning_rate": 0.0003310114946112105, + "loss": 0.0046, + "num_input_tokens_seen": 171654416, + "step": 79540 + }, + { + "epoch": 12.976345840130506, + "grad_norm": 0.027942080050706863, + "learning_rate": 0.00033094450512295535, + "loss": 0.0018, + "num_input_tokens_seen": 171664720, + "step": 79545 + }, + { + "epoch": 12.977161500815662, + "grad_norm": 0.01671537011861801, + "learning_rate": 0.00033087751906082436, + "loss": 0.0117, + "num_input_tokens_seen": 171676240, + "step": 79550 + }, + { + "epoch": 12.977977161500815, + "grad_norm": 0.005374469328671694, + "learning_rate": 0.000330810536426175, + "loss": 0.0035, + "num_input_tokens_seen": 171687376, + "step": 79555 + }, + { + "epoch": 12.978792822185971, + "grad_norm": 0.008844718337059021, + "learning_rate": 0.0003307435572203645, + "loss": 0.0042, + "num_input_tokens_seen": 171697840, + "step": 79560 + }, + { + "epoch": 12.979608482871125, + "grad_norm": 0.0364043153822422, + "learning_rate": 0.00033067658144475087, + "loss": 0.0068, + "num_input_tokens_seen": 171708656, + "step": 79565 + }, + { + "epoch": 12.98042414355628, + "grad_norm": 0.024044346064329147, + "learning_rate": 0.0003306096091006909, + "loss": 0.0199, + "num_input_tokens_seen": 171718704, + "step": 79570 + }, + { + "epoch": 12.981239804241435, + "grad_norm": 0.0008627332281321287, + "learning_rate": 0.0003305426401895423, + "loss": 0.0233, + "num_input_tokens_seen": 171729872, + "step": 79575 + }, + { + "epoch": 12.98205546492659, + "grad_norm": 0.0013759565772488713, + "learning_rate": 0.0003304756747126618, + "loss": 0.0125, + "num_input_tokens_seen": 171741104, + "step": 79580 + }, + { + "epoch": 12.982871125611746, + "grad_norm": 0.06416638195514679, + "learning_rate": 0.00033040871267140705, + "loss": 0.0057, + "num_input_tokens_seen": 171753040, + "step": 79585 + }, + { + "epoch": 12.9836867862969, + "grad_norm": 0.020168280228972435, + "learning_rate": 0.00033034175406713464, + "loss": 0.0027, + "num_input_tokens_seen": 171763984, + "step": 79590 + }, + { + "epoch": 12.984502446982056, + "grad_norm": 0.0020024243276566267, + "learning_rate": 0.0003302747989012019, + "loss": 0.0028, + "num_input_tokens_seen": 171775344, + "step": 79595 + }, + { + "epoch": 12.98531810766721, + "grad_norm": 0.005748272407799959, + "learning_rate": 0.00033020784717496576, + "loss": 0.0159, + "num_input_tokens_seen": 171785104, + "step": 79600 + }, + { + "epoch": 12.986133768352365, + "grad_norm": 0.000428094994276762, + "learning_rate": 0.0003301408988897829, + "loss": 0.0136, + "num_input_tokens_seen": 171796112, + "step": 79605 + }, + { + "epoch": 12.986949429037521, + "grad_norm": 0.000557329913135618, + "learning_rate": 0.00033007395404701035, + "loss": 0.0228, + "num_input_tokens_seen": 171806864, + "step": 79610 + }, + { + "epoch": 12.987765089722675, + "grad_norm": 0.027136636897921562, + "learning_rate": 0.0003300070126480045, + "loss": 0.0176, + "num_input_tokens_seen": 171817712, + "step": 79615 + }, + { + "epoch": 12.98858075040783, + "grad_norm": 0.005132078658789396, + "learning_rate": 0.00032994007469412234, + "loss": 0.0024, + "num_input_tokens_seen": 171829264, + "step": 79620 + }, + { + "epoch": 12.989396411092985, + "grad_norm": 0.0016907569952309132, + "learning_rate": 0.0003298731401867202, + "loss": 0.0039, + "num_input_tokens_seen": 171838864, + "step": 79625 + }, + { + "epoch": 12.99021207177814, + "grad_norm": 0.04445377364754677, + "learning_rate": 0.0003298062091271548, + "loss": 0.157, + "num_input_tokens_seen": 171849456, + "step": 79630 + }, + { + "epoch": 12.991027732463294, + "grad_norm": 0.013869545422494411, + "learning_rate": 0.00032973928151678233, + "loss": 0.0041, + "num_input_tokens_seen": 171860976, + "step": 79635 + }, + { + "epoch": 12.99184339314845, + "grad_norm": 0.0028126141987740993, + "learning_rate": 0.00032967235735695955, + "loss": 0.0016, + "num_input_tokens_seen": 171872496, + "step": 79640 + }, + { + "epoch": 12.992659053833606, + "grad_norm": 0.002891221083700657, + "learning_rate": 0.00032960543664904224, + "loss": 0.0052, + "num_input_tokens_seen": 171883856, + "step": 79645 + }, + { + "epoch": 12.99347471451876, + "grad_norm": 0.002420090837404132, + "learning_rate": 0.0003295385193943872, + "loss": 0.0746, + "num_input_tokens_seen": 171895504, + "step": 79650 + }, + { + "epoch": 12.994290375203915, + "grad_norm": 0.001703395857475698, + "learning_rate": 0.00032947160559435, + "loss": 0.0163, + "num_input_tokens_seen": 171906096, + "step": 79655 + }, + { + "epoch": 12.99510603588907, + "grad_norm": 0.004901599138975143, + "learning_rate": 0.00032940469525028735, + "loss": 0.0136, + "num_input_tokens_seen": 171917072, + "step": 79660 + }, + { + "epoch": 12.995921696574225, + "grad_norm": 0.004203270189464092, + "learning_rate": 0.0003293377883635547, + "loss": 0.0013, + "num_input_tokens_seen": 171929008, + "step": 79665 + }, + { + "epoch": 12.99673735725938, + "grad_norm": 0.003102770773693919, + "learning_rate": 0.0003292708849355085, + "loss": 0.0067, + "num_input_tokens_seen": 171939568, + "step": 79670 + }, + { + "epoch": 12.997553017944535, + "grad_norm": 0.0010859209578484297, + "learning_rate": 0.0003292039849675042, + "loss": 0.0014, + "num_input_tokens_seen": 171948592, + "step": 79675 + }, + { + "epoch": 12.99836867862969, + "grad_norm": 0.012432601302862167, + "learning_rate": 0.0003291370884608979, + "loss": 0.0024, + "num_input_tokens_seen": 171959824, + "step": 79680 + }, + { + "epoch": 12.999184339314844, + "grad_norm": 0.0025029180105775595, + "learning_rate": 0.00032907019541704533, + "loss": 0.0041, + "num_input_tokens_seen": 171970224, + "step": 79685 + }, + { + "epoch": 13.0, + "grad_norm": 0.12261208146810532, + "learning_rate": 0.00032900330583730196, + "loss": 0.0084, + "num_input_tokens_seen": 171979232, + "step": 79690 + }, + { + "epoch": 13.0, + "eval_loss": 0.17562581598758698, + "eval_runtime": 104.5423, + "eval_samples_per_second": 26.066, + "eval_steps_per_second": 6.524, + "num_input_tokens_seen": 171979232, + "step": 79690 + }, + { + "epoch": 13.000815660685156, + "grad_norm": 0.27406132221221924, + "learning_rate": 0.0003289364197230236, + "loss": 0.0118, + "num_input_tokens_seen": 171989664, + "step": 79695 + }, + { + "epoch": 13.00163132137031, + "grad_norm": 0.0006193106528371572, + "learning_rate": 0.0003288695370755657, + "loss": 0.011, + "num_input_tokens_seen": 172000960, + "step": 79700 + }, + { + "epoch": 13.002446982055465, + "grad_norm": 0.0958174467086792, + "learning_rate": 0.0003288026578962836, + "loss": 0.014, + "num_input_tokens_seen": 172010528, + "step": 79705 + }, + { + "epoch": 13.00326264274062, + "grad_norm": 0.003229190595448017, + "learning_rate": 0.0003287357821865329, + "loss": 0.0047, + "num_input_tokens_seen": 172020288, + "step": 79710 + }, + { + "epoch": 13.004078303425775, + "grad_norm": 0.0010088925482705235, + "learning_rate": 0.0003286689099476689, + "loss": 0.0016, + "num_input_tokens_seen": 172032032, + "step": 79715 + }, + { + "epoch": 13.00489396411093, + "grad_norm": 0.01225369144231081, + "learning_rate": 0.00032860204118104674, + "loss": 0.0201, + "num_input_tokens_seen": 172042720, + "step": 79720 + }, + { + "epoch": 13.005709624796085, + "grad_norm": 0.004873646888881922, + "learning_rate": 0.00032853517588802173, + "loss": 0.0046, + "num_input_tokens_seen": 172054112, + "step": 79725 + }, + { + "epoch": 13.00652528548124, + "grad_norm": 0.0016529529821127653, + "learning_rate": 0.0003284683140699487, + "loss": 0.0408, + "num_input_tokens_seen": 172064640, + "step": 79730 + }, + { + "epoch": 13.007340946166394, + "grad_norm": 0.0016954434104263783, + "learning_rate": 0.00032840145572818314, + "loss": 0.0044, + "num_input_tokens_seen": 172076224, + "step": 79735 + }, + { + "epoch": 13.00815660685155, + "grad_norm": 0.1640511006116867, + "learning_rate": 0.0003283346008640795, + "loss": 0.014, + "num_input_tokens_seen": 172087872, + "step": 79740 + }, + { + "epoch": 13.008972267536704, + "grad_norm": 0.0030964950565248728, + "learning_rate": 0.0003282677494789933, + "loss": 0.0076, + "num_input_tokens_seen": 172098976, + "step": 79745 + }, + { + "epoch": 13.00978792822186, + "grad_norm": 0.07687301188707352, + "learning_rate": 0.0003282009015742787, + "loss": 0.0106, + "num_input_tokens_seen": 172109696, + "step": 79750 + }, + { + "epoch": 13.010603588907015, + "grad_norm": 0.006554214749485254, + "learning_rate": 0.00032813405715129097, + "loss": 0.0184, + "num_input_tokens_seen": 172120896, + "step": 79755 + }, + { + "epoch": 13.01141924959217, + "grad_norm": 0.004611602984368801, + "learning_rate": 0.00032806721621138444, + "loss": 0.0063, + "num_input_tokens_seen": 172132192, + "step": 79760 + }, + { + "epoch": 13.012234910277325, + "grad_norm": 0.011387547478079796, + "learning_rate": 0.00032800037875591406, + "loss": 0.0041, + "num_input_tokens_seen": 172143424, + "step": 79765 + }, + { + "epoch": 13.013050570962479, + "grad_norm": 0.0012390944175422192, + "learning_rate": 0.000327933544786234, + "loss": 0.0026, + "num_input_tokens_seen": 172154304, + "step": 79770 + }, + { + "epoch": 13.013866231647635, + "grad_norm": 0.0004866339440923184, + "learning_rate": 0.00032786671430369915, + "loss": 0.0021, + "num_input_tokens_seen": 172164896, + "step": 79775 + }, + { + "epoch": 13.01468189233279, + "grad_norm": 0.0022169637959450483, + "learning_rate": 0.0003277998873096635, + "loss": 0.0021, + "num_input_tokens_seen": 172175200, + "step": 79780 + }, + { + "epoch": 13.015497553017944, + "grad_norm": 0.0011557636316865683, + "learning_rate": 0.00032773306380548176, + "loss": 0.0033, + "num_input_tokens_seen": 172185856, + "step": 79785 + }, + { + "epoch": 13.0163132137031, + "grad_norm": 0.011849002912640572, + "learning_rate": 0.0003276662437925079, + "loss": 0.0768, + "num_input_tokens_seen": 172196672, + "step": 79790 + }, + { + "epoch": 13.017128874388254, + "grad_norm": 0.002333552110940218, + "learning_rate": 0.0003275994272720963, + "loss": 0.0844, + "num_input_tokens_seen": 172207712, + "step": 79795 + }, + { + "epoch": 13.01794453507341, + "grad_norm": 0.004460033494979143, + "learning_rate": 0.0003275326142456009, + "loss": 0.0009, + "num_input_tokens_seen": 172218656, + "step": 79800 + }, + { + "epoch": 13.018760195758565, + "grad_norm": 0.011768416501581669, + "learning_rate": 0.00032746580471437606, + "loss": 0.0025, + "num_input_tokens_seen": 172230176, + "step": 79805 + }, + { + "epoch": 13.01957585644372, + "grad_norm": 0.017235420644283295, + "learning_rate": 0.0003273989986797753, + "loss": 0.0814, + "num_input_tokens_seen": 172240928, + "step": 79810 + }, + { + "epoch": 13.020391517128875, + "grad_norm": 0.014872279018163681, + "learning_rate": 0.00032733219614315283, + "loss": 0.0116, + "num_input_tokens_seen": 172251904, + "step": 79815 + }, + { + "epoch": 13.021207177814029, + "grad_norm": 0.2456275224685669, + "learning_rate": 0.00032726539710586266, + "loss": 0.0132, + "num_input_tokens_seen": 172262688, + "step": 79820 + }, + { + "epoch": 13.022022838499185, + "grad_norm": 0.110738605260849, + "learning_rate": 0.0003271986015692582, + "loss": 0.0187, + "num_input_tokens_seen": 172273376, + "step": 79825 + }, + { + "epoch": 13.022838499184338, + "grad_norm": 0.22187121212482452, + "learning_rate": 0.0003271318095346934, + "loss": 0.0121, + "num_input_tokens_seen": 172284416, + "step": 79830 + }, + { + "epoch": 13.023654159869494, + "grad_norm": 0.006867742165923119, + "learning_rate": 0.00032706502100352165, + "loss": 0.1337, + "num_input_tokens_seen": 172296032, + "step": 79835 + }, + { + "epoch": 13.02446982055465, + "grad_norm": 0.027570147067308426, + "learning_rate": 0.00032699823597709675, + "loss": 0.0078, + "num_input_tokens_seen": 172306720, + "step": 79840 + }, + { + "epoch": 13.025285481239804, + "grad_norm": 0.0022381616290658712, + "learning_rate": 0.00032693145445677194, + "loss": 0.0028, + "num_input_tokens_seen": 172317280, + "step": 79845 + }, + { + "epoch": 13.02610114192496, + "grad_norm": 0.004417035728693008, + "learning_rate": 0.00032686467644390085, + "loss": 0.017, + "num_input_tokens_seen": 172328064, + "step": 79850 + }, + { + "epoch": 13.026916802610113, + "grad_norm": 0.04091395437717438, + "learning_rate": 0.00032679790193983666, + "loss": 0.0573, + "num_input_tokens_seen": 172339040, + "step": 79855 + }, + { + "epoch": 13.02773246329527, + "grad_norm": 0.41993793845176697, + "learning_rate": 0.0003267311309459328, + "loss": 0.0111, + "num_input_tokens_seen": 172349760, + "step": 79860 + }, + { + "epoch": 13.028548123980425, + "grad_norm": 0.38725799322128296, + "learning_rate": 0.00032666436346354236, + "loss": 0.009, + "num_input_tokens_seen": 172361632, + "step": 79865 + }, + { + "epoch": 13.029363784665579, + "grad_norm": 0.0164804570376873, + "learning_rate": 0.0003265975994940185, + "loss": 0.0121, + "num_input_tokens_seen": 172372192, + "step": 79870 + }, + { + "epoch": 13.030179445350734, + "grad_norm": 0.0008421913371421397, + "learning_rate": 0.00032653083903871406, + "loss": 0.0039, + "num_input_tokens_seen": 172383552, + "step": 79875 + }, + { + "epoch": 13.030995106035888, + "grad_norm": 0.08911339193582535, + "learning_rate": 0.0003264640820989825, + "loss": 0.0106, + "num_input_tokens_seen": 172394496, + "step": 79880 + }, + { + "epoch": 13.031810766721044, + "grad_norm": 0.03633780777454376, + "learning_rate": 0.0003263973286761762, + "loss": 0.0033, + "num_input_tokens_seen": 172404096, + "step": 79885 + }, + { + "epoch": 13.0326264274062, + "grad_norm": 0.008666853420436382, + "learning_rate": 0.0003263305787716486, + "loss": 0.008, + "num_input_tokens_seen": 172414880, + "step": 79890 + }, + { + "epoch": 13.033442088091354, + "grad_norm": 0.0026526337023824453, + "learning_rate": 0.00032626383238675184, + "loss": 0.0242, + "num_input_tokens_seen": 172424928, + "step": 79895 + }, + { + "epoch": 13.03425774877651, + "grad_norm": 0.01728689670562744, + "learning_rate": 0.0003261970895228391, + "loss": 0.0053, + "num_input_tokens_seen": 172436416, + "step": 79900 + }, + { + "epoch": 13.035073409461663, + "grad_norm": 0.006354300305247307, + "learning_rate": 0.00032613035018126267, + "loss": 0.0108, + "num_input_tokens_seen": 172447232, + "step": 79905 + }, + { + "epoch": 13.035889070146819, + "grad_norm": 0.0015383753925561905, + "learning_rate": 0.0003260636143633755, + "loss": 0.0151, + "num_input_tokens_seen": 172457152, + "step": 79910 + }, + { + "epoch": 13.036704730831975, + "grad_norm": 0.0012971217511221766, + "learning_rate": 0.0003259968820705296, + "loss": 0.0025, + "num_input_tokens_seen": 172469088, + "step": 79915 + }, + { + "epoch": 13.037520391517129, + "grad_norm": 0.0053301346488296986, + "learning_rate": 0.0003259301533040776, + "loss": 0.0025, + "num_input_tokens_seen": 172479264, + "step": 79920 + }, + { + "epoch": 13.038336052202284, + "grad_norm": 0.00027425261214375496, + "learning_rate": 0.00032586342806537207, + "loss": 0.0014, + "num_input_tokens_seen": 172490016, + "step": 79925 + }, + { + "epoch": 13.039151712887438, + "grad_norm": 0.019373448565602303, + "learning_rate": 0.0003257967063557649, + "loss": 0.0689, + "num_input_tokens_seen": 172500192, + "step": 79930 + }, + { + "epoch": 13.039967373572594, + "grad_norm": 0.006020937580615282, + "learning_rate": 0.0003257299881766087, + "loss": 0.0038, + "num_input_tokens_seen": 172509760, + "step": 79935 + }, + { + "epoch": 13.040783034257748, + "grad_norm": 0.02803732082247734, + "learning_rate": 0.0003256632735292551, + "loss": 0.0024, + "num_input_tokens_seen": 172520960, + "step": 79940 + }, + { + "epoch": 13.041598694942904, + "grad_norm": 0.010991957038640976, + "learning_rate": 0.00032559656241505663, + "loss": 0.0242, + "num_input_tokens_seen": 172532320, + "step": 79945 + }, + { + "epoch": 13.04241435562806, + "grad_norm": 0.011794701218605042, + "learning_rate": 0.0003255298548353649, + "loss": 0.0043, + "num_input_tokens_seen": 172542400, + "step": 79950 + }, + { + "epoch": 13.043230016313213, + "grad_norm": 0.2485843300819397, + "learning_rate": 0.0003254631507915322, + "loss": 0.0181, + "num_input_tokens_seen": 172553504, + "step": 79955 + }, + { + "epoch": 13.044045676998369, + "grad_norm": 0.030596431344747543, + "learning_rate": 0.00032539645028490993, + "loss": 0.0041, + "num_input_tokens_seen": 172564224, + "step": 79960 + }, + { + "epoch": 13.044861337683523, + "grad_norm": 0.0008792920270934701, + "learning_rate": 0.0003253297533168503, + "loss": 0.0016, + "num_input_tokens_seen": 172572128, + "step": 79965 + }, + { + "epoch": 13.045676998368679, + "grad_norm": 0.019316574558615685, + "learning_rate": 0.0003252630598887046, + "loss": 0.0196, + "num_input_tokens_seen": 172582432, + "step": 79970 + }, + { + "epoch": 13.046492659053834, + "grad_norm": 0.0070152911357581615, + "learning_rate": 0.00032519637000182495, + "loss": 0.0022, + "num_input_tokens_seen": 172593984, + "step": 79975 + }, + { + "epoch": 13.047308319738988, + "grad_norm": 0.03219536691904068, + "learning_rate": 0.0003251296836575623, + "loss": 0.0024, + "num_input_tokens_seen": 172604512, + "step": 79980 + }, + { + "epoch": 13.048123980424144, + "grad_norm": 1.1781108379364014, + "learning_rate": 0.00032506300085726874, + "loss": 0.0464, + "num_input_tokens_seen": 172613984, + "step": 79985 + }, + { + "epoch": 13.048939641109298, + "grad_norm": 0.0028719513211399317, + "learning_rate": 0.0003249963216022951, + "loss": 0.0022, + "num_input_tokens_seen": 172626272, + "step": 79990 + }, + { + "epoch": 13.049755301794454, + "grad_norm": 0.02185647189617157, + "learning_rate": 0.0003249296458939932, + "loss": 0.0055, + "num_input_tokens_seen": 172637856, + "step": 79995 + }, + { + "epoch": 13.05057096247961, + "grad_norm": 0.0007532222662121058, + "learning_rate": 0.0003248629737337141, + "loss": 0.0097, + "num_input_tokens_seen": 172648032, + "step": 80000 + }, + { + "epoch": 13.051386623164763, + "grad_norm": 0.008735321462154388, + "learning_rate": 0.000324796305122809, + "loss": 0.0034, + "num_input_tokens_seen": 172659232, + "step": 80005 + }, + { + "epoch": 13.052202283849919, + "grad_norm": 0.012395388446748257, + "learning_rate": 0.000324729640062629, + "loss": 0.0081, + "num_input_tokens_seen": 172670336, + "step": 80010 + }, + { + "epoch": 13.053017944535073, + "grad_norm": 0.0032699857838451862, + "learning_rate": 0.0003246629785545252, + "loss": 0.0036, + "num_input_tokens_seen": 172680480, + "step": 80015 + }, + { + "epoch": 13.053833605220229, + "grad_norm": 0.0012350112665444613, + "learning_rate": 0.0003245963205998485, + "loss": 0.0019, + "num_input_tokens_seen": 172691264, + "step": 80020 + }, + { + "epoch": 13.054649265905383, + "grad_norm": 0.1344241201877594, + "learning_rate": 0.00032452966619994997, + "loss": 0.0135, + "num_input_tokens_seen": 172702016, + "step": 80025 + }, + { + "epoch": 13.055464926590538, + "grad_norm": 0.007685338146984577, + "learning_rate": 0.00032446301535618034, + "loss": 0.0013, + "num_input_tokens_seen": 172713792, + "step": 80030 + }, + { + "epoch": 13.056280587275694, + "grad_norm": 0.0017212851671501994, + "learning_rate": 0.0003243963680698904, + "loss": 0.0048, + "num_input_tokens_seen": 172724960, + "step": 80035 + }, + { + "epoch": 13.057096247960848, + "grad_norm": 0.003269192762672901, + "learning_rate": 0.0003243297243424308, + "loss": 0.0055, + "num_input_tokens_seen": 172735264, + "step": 80040 + }, + { + "epoch": 13.057911908646004, + "grad_norm": 0.1750909984111786, + "learning_rate": 0.0003242630841751522, + "loss": 0.0048, + "num_input_tokens_seen": 172747072, + "step": 80045 + }, + { + "epoch": 13.058727569331158, + "grad_norm": 0.003354448825120926, + "learning_rate": 0.00032419644756940527, + "loss": 0.0176, + "num_input_tokens_seen": 172757376, + "step": 80050 + }, + { + "epoch": 13.059543230016313, + "grad_norm": 0.0428856760263443, + "learning_rate": 0.0003241298145265401, + "loss": 0.0501, + "num_input_tokens_seen": 172768000, + "step": 80055 + }, + { + "epoch": 13.060358890701469, + "grad_norm": 0.05645843967795372, + "learning_rate": 0.00032406318504790753, + "loss": 0.0055, + "num_input_tokens_seen": 172778752, + "step": 80060 + }, + { + "epoch": 13.061174551386623, + "grad_norm": 0.0013886360684409738, + "learning_rate": 0.0003239965591348576, + "loss": 0.0008, + "num_input_tokens_seen": 172789152, + "step": 80065 + }, + { + "epoch": 13.061990212071779, + "grad_norm": 0.0007097867201082408, + "learning_rate": 0.00032392993678874085, + "loss": 0.0011, + "num_input_tokens_seen": 172801312, + "step": 80070 + }, + { + "epoch": 13.062805872756933, + "grad_norm": 0.5078059434890747, + "learning_rate": 0.0003238633180109071, + "loss": 0.0445, + "num_input_tokens_seen": 172812352, + "step": 80075 + }, + { + "epoch": 13.063621533442088, + "grad_norm": 0.010557512752711773, + "learning_rate": 0.00032379670280270677, + "loss": 0.0037, + "num_input_tokens_seen": 172823264, + "step": 80080 + }, + { + "epoch": 13.064437194127244, + "grad_norm": 0.2610393464565277, + "learning_rate": 0.0003237300911654897, + "loss": 0.0178, + "num_input_tokens_seen": 172834464, + "step": 80085 + }, + { + "epoch": 13.065252854812398, + "grad_norm": 0.011020687408745289, + "learning_rate": 0.0003236634831006061, + "loss": 0.0116, + "num_input_tokens_seen": 172846016, + "step": 80090 + }, + { + "epoch": 13.066068515497554, + "grad_norm": 0.00851772166788578, + "learning_rate": 0.0003235968786094055, + "loss": 0.021, + "num_input_tokens_seen": 172856544, + "step": 80095 + }, + { + "epoch": 13.066884176182707, + "grad_norm": 0.01308043859899044, + "learning_rate": 0.0003235302776932382, + "loss": 0.0019, + "num_input_tokens_seen": 172868256, + "step": 80100 + }, + { + "epoch": 13.067699836867863, + "grad_norm": 0.02164997160434723, + "learning_rate": 0.00032346368035345344, + "loss": 0.0347, + "num_input_tokens_seen": 172878912, + "step": 80105 + }, + { + "epoch": 13.068515497553017, + "grad_norm": 0.038283541798591614, + "learning_rate": 0.0003233970865914013, + "loss": 0.0042, + "num_input_tokens_seen": 172890336, + "step": 80110 + }, + { + "epoch": 13.069331158238173, + "grad_norm": 0.0025401043239980936, + "learning_rate": 0.0003233304964084311, + "loss": 0.0492, + "num_input_tokens_seen": 172901312, + "step": 80115 + }, + { + "epoch": 13.070146818923329, + "grad_norm": 0.022682419046759605, + "learning_rate": 0.0003232639098058927, + "loss": 0.002, + "num_input_tokens_seen": 172911264, + "step": 80120 + }, + { + "epoch": 13.070962479608482, + "grad_norm": 0.1966417282819748, + "learning_rate": 0.00032319732678513514, + "loss": 0.0083, + "num_input_tokens_seen": 172921280, + "step": 80125 + }, + { + "epoch": 13.071778140293638, + "grad_norm": 0.00030016902019269764, + "learning_rate": 0.00032313074734750813, + "loss": 0.002, + "num_input_tokens_seen": 172930432, + "step": 80130 + }, + { + "epoch": 13.072593800978792, + "grad_norm": 0.0010847266530618072, + "learning_rate": 0.000323064171494361, + "loss": 0.0022, + "num_input_tokens_seen": 172941728, + "step": 80135 + }, + { + "epoch": 13.073409461663948, + "grad_norm": 0.005629080347716808, + "learning_rate": 0.00032299759922704277, + "loss": 0.0069, + "num_input_tokens_seen": 172952640, + "step": 80140 + }, + { + "epoch": 13.074225122349104, + "grad_norm": 0.003084514057263732, + "learning_rate": 0.0003229310305469029, + "loss": 0.0042, + "num_input_tokens_seen": 172964608, + "step": 80145 + }, + { + "epoch": 13.075040783034257, + "grad_norm": 0.3345184028148651, + "learning_rate": 0.00032286446545529016, + "loss": 0.1137, + "num_input_tokens_seen": 172975904, + "step": 80150 + }, + { + "epoch": 13.075856443719413, + "grad_norm": 0.0941111221909523, + "learning_rate": 0.0003227979039535538, + "loss": 0.0976, + "num_input_tokens_seen": 172985952, + "step": 80155 + }, + { + "epoch": 13.076672104404567, + "grad_norm": 0.020343562588095665, + "learning_rate": 0.0003227313460430427, + "loss": 0.0019, + "num_input_tokens_seen": 172996672, + "step": 80160 + }, + { + "epoch": 13.077487765089723, + "grad_norm": 0.0014234196860343218, + "learning_rate": 0.0003226647917251058, + "loss": 0.0011, + "num_input_tokens_seen": 173007392, + "step": 80165 + }, + { + "epoch": 13.078303425774878, + "grad_norm": 0.007868372835218906, + "learning_rate": 0.0003225982410010918, + "loss": 0.0023, + "num_input_tokens_seen": 173018816, + "step": 80170 + }, + { + "epoch": 13.079119086460032, + "grad_norm": 0.006144057959318161, + "learning_rate": 0.00032253169387234953, + "loss": 0.0027, + "num_input_tokens_seen": 173029952, + "step": 80175 + }, + { + "epoch": 13.079934747145188, + "grad_norm": 0.006870198994874954, + "learning_rate": 0.0003224651503402276, + "loss": 0.0027, + "num_input_tokens_seen": 173040416, + "step": 80180 + }, + { + "epoch": 13.080750407830342, + "grad_norm": 0.0009405722375959158, + "learning_rate": 0.00032239861040607464, + "loss": 0.1236, + "num_input_tokens_seen": 173051392, + "step": 80185 + }, + { + "epoch": 13.081566068515498, + "grad_norm": 0.03857423737645149, + "learning_rate": 0.0003223320740712391, + "loss": 0.0278, + "num_input_tokens_seen": 173061952, + "step": 80190 + }, + { + "epoch": 13.082381729200652, + "grad_norm": 0.003705504583194852, + "learning_rate": 0.0003222655413370696, + "loss": 0.0049, + "num_input_tokens_seen": 173072928, + "step": 80195 + }, + { + "epoch": 13.083197389885807, + "grad_norm": 0.006608698051422834, + "learning_rate": 0.00032219901220491417, + "loss": 0.0298, + "num_input_tokens_seen": 173082848, + "step": 80200 + }, + { + "epoch": 13.084013050570963, + "grad_norm": 0.008448357693850994, + "learning_rate": 0.0003221324866761215, + "loss": 0.005, + "num_input_tokens_seen": 173092032, + "step": 80205 + }, + { + "epoch": 13.084828711256117, + "grad_norm": 0.013571836054325104, + "learning_rate": 0.0003220659647520395, + "loss": 0.0021, + "num_input_tokens_seen": 173102464, + "step": 80210 + }, + { + "epoch": 13.085644371941273, + "grad_norm": 0.01860027387738228, + "learning_rate": 0.00032199944643401655, + "loss": 0.0031, + "num_input_tokens_seen": 173113056, + "step": 80215 + }, + { + "epoch": 13.086460032626427, + "grad_norm": 0.007309226784855127, + "learning_rate": 0.00032193293172340056, + "loss": 0.0085, + "num_input_tokens_seen": 173123552, + "step": 80220 + }, + { + "epoch": 13.087275693311582, + "grad_norm": 0.016038550063967705, + "learning_rate": 0.0003218664206215397, + "loss": 0.0012, + "num_input_tokens_seen": 173133120, + "step": 80225 + }, + { + "epoch": 13.088091353996738, + "grad_norm": 0.0016742933075875044, + "learning_rate": 0.00032179991312978164, + "loss": 0.0191, + "num_input_tokens_seen": 173144064, + "step": 80230 + }, + { + "epoch": 13.088907014681892, + "grad_norm": 0.0003190000425092876, + "learning_rate": 0.00032173340924947436, + "loss": 0.0022, + "num_input_tokens_seen": 173154560, + "step": 80235 + }, + { + "epoch": 13.089722675367048, + "grad_norm": 0.2128380686044693, + "learning_rate": 0.00032166690898196594, + "loss": 0.0262, + "num_input_tokens_seen": 173164192, + "step": 80240 + }, + { + "epoch": 13.090538336052202, + "grad_norm": 0.4230542778968811, + "learning_rate": 0.0003216004123286036, + "loss": 0.1119, + "num_input_tokens_seen": 173173984, + "step": 80245 + }, + { + "epoch": 13.091353996737357, + "grad_norm": 0.002216814551502466, + "learning_rate": 0.0003215339192907355, + "loss": 0.0088, + "num_input_tokens_seen": 173184320, + "step": 80250 + }, + { + "epoch": 13.092169657422513, + "grad_norm": 0.030452510342001915, + "learning_rate": 0.00032146742986970865, + "loss": 0.0189, + "num_input_tokens_seen": 173195168, + "step": 80255 + }, + { + "epoch": 13.092985318107667, + "grad_norm": 0.006471080705523491, + "learning_rate": 0.000321400944066871, + "loss": 0.0071, + "num_input_tokens_seen": 173206112, + "step": 80260 + }, + { + "epoch": 13.093800978792823, + "grad_norm": 0.0053426786325871944, + "learning_rate": 0.00032133446188356964, + "loss": 0.0017, + "num_input_tokens_seen": 173216992, + "step": 80265 + }, + { + "epoch": 13.094616639477977, + "grad_norm": 0.05573317036032677, + "learning_rate": 0.00032126798332115223, + "loss": 0.0031, + "num_input_tokens_seen": 173229536, + "step": 80270 + }, + { + "epoch": 13.095432300163132, + "grad_norm": 0.2104514241218567, + "learning_rate": 0.00032120150838096576, + "loss": 0.0206, + "num_input_tokens_seen": 173240640, + "step": 80275 + }, + { + "epoch": 13.096247960848286, + "grad_norm": 0.12559527158737183, + "learning_rate": 0.00032113503706435767, + "loss": 0.016, + "num_input_tokens_seen": 173251968, + "step": 80280 + }, + { + "epoch": 13.097063621533442, + "grad_norm": 0.04001250118017197, + "learning_rate": 0.00032106856937267475, + "loss": 0.0096, + "num_input_tokens_seen": 173264224, + "step": 80285 + }, + { + "epoch": 13.097879282218598, + "grad_norm": 0.009677977301180363, + "learning_rate": 0.00032100210530726446, + "loss": 0.0093, + "num_input_tokens_seen": 173275360, + "step": 80290 + }, + { + "epoch": 13.098694942903752, + "grad_norm": 0.004945915192365646, + "learning_rate": 0.00032093564486947347, + "loss": 0.0049, + "num_input_tokens_seen": 173286048, + "step": 80295 + }, + { + "epoch": 13.099510603588907, + "grad_norm": 0.05955100804567337, + "learning_rate": 0.0003208691880606488, + "loss": 0.051, + "num_input_tokens_seen": 173298208, + "step": 80300 + }, + { + "epoch": 13.100326264274061, + "grad_norm": 0.2694977819919586, + "learning_rate": 0.0003208027348821373, + "loss": 0.0939, + "num_input_tokens_seen": 173308672, + "step": 80305 + }, + { + "epoch": 13.101141924959217, + "grad_norm": 0.002073630690574646, + "learning_rate": 0.00032073628533528574, + "loss": 0.0159, + "num_input_tokens_seen": 173319808, + "step": 80310 + }, + { + "epoch": 13.101957585644373, + "grad_norm": 0.0028807390481233597, + "learning_rate": 0.0003206698394214407, + "loss": 0.0046, + "num_input_tokens_seen": 173330976, + "step": 80315 + }, + { + "epoch": 13.102773246329527, + "grad_norm": 0.2262181043624878, + "learning_rate": 0.00032060339714194897, + "loss": 0.0132, + "num_input_tokens_seen": 173341984, + "step": 80320 + }, + { + "epoch": 13.103588907014682, + "grad_norm": 0.00042471580673009157, + "learning_rate": 0.0003205369584981568, + "loss": 0.0181, + "num_input_tokens_seen": 173351808, + "step": 80325 + }, + { + "epoch": 13.104404567699836, + "grad_norm": 0.011768683791160583, + "learning_rate": 0.000320470523491411, + "loss": 0.0012, + "num_input_tokens_seen": 173362624, + "step": 80330 + }, + { + "epoch": 13.105220228384992, + "grad_norm": 0.16928160190582275, + "learning_rate": 0.00032040409212305765, + "loss": 0.006, + "num_input_tokens_seen": 173374144, + "step": 80335 + }, + { + "epoch": 13.106035889070148, + "grad_norm": 0.006083453539758921, + "learning_rate": 0.0003203376643944433, + "loss": 0.004, + "num_input_tokens_seen": 173385344, + "step": 80340 + }, + { + "epoch": 13.106851549755302, + "grad_norm": 0.00047520510270260274, + "learning_rate": 0.0003202712403069141, + "loss": 0.0328, + "num_input_tokens_seen": 173396480, + "step": 80345 + }, + { + "epoch": 13.107667210440457, + "grad_norm": 0.015516245737671852, + "learning_rate": 0.00032020481986181606, + "loss": 0.0016, + "num_input_tokens_seen": 173406944, + "step": 80350 + }, + { + "epoch": 13.108482871125611, + "grad_norm": 0.003973082173615694, + "learning_rate": 0.0003201384030604957, + "loss": 0.0132, + "num_input_tokens_seen": 173417728, + "step": 80355 + }, + { + "epoch": 13.109298531810767, + "grad_norm": 0.026500245556235313, + "learning_rate": 0.0003200719899042985, + "loss": 0.0024, + "num_input_tokens_seen": 173428128, + "step": 80360 + }, + { + "epoch": 13.11011419249592, + "grad_norm": 0.002021523891016841, + "learning_rate": 0.00032000558039457094, + "loss": 0.0016, + "num_input_tokens_seen": 173439104, + "step": 80365 + }, + { + "epoch": 13.110929853181077, + "grad_norm": 0.21054063737392426, + "learning_rate": 0.0003199391745326585, + "loss": 0.0104, + "num_input_tokens_seen": 173451104, + "step": 80370 + }, + { + "epoch": 13.111745513866232, + "grad_norm": 0.004043983295559883, + "learning_rate": 0.0003198727723199072, + "loss": 0.0082, + "num_input_tokens_seen": 173462176, + "step": 80375 + }, + { + "epoch": 13.112561174551386, + "grad_norm": 0.19594284892082214, + "learning_rate": 0.0003198063737576625, + "loss": 0.0108, + "num_input_tokens_seen": 173471904, + "step": 80380 + }, + { + "epoch": 13.113376835236542, + "grad_norm": 0.050214435905218124, + "learning_rate": 0.0003197399788472705, + "loss": 0.0029, + "num_input_tokens_seen": 173483232, + "step": 80385 + }, + { + "epoch": 13.114192495921696, + "grad_norm": 0.0026838763151317835, + "learning_rate": 0.0003196735875900762, + "loss": 0.0116, + "num_input_tokens_seen": 173494304, + "step": 80390 + }, + { + "epoch": 13.115008156606851, + "grad_norm": 0.022384969517588615, + "learning_rate": 0.00031960719998742567, + "loss": 0.0039, + "num_input_tokens_seen": 173504320, + "step": 80395 + }, + { + "epoch": 13.115823817292007, + "grad_norm": 0.05211201682686806, + "learning_rate": 0.0003195408160406638, + "loss": 0.0063, + "num_input_tokens_seen": 173515392, + "step": 80400 + }, + { + "epoch": 13.116639477977161, + "grad_norm": 0.0005337659968063235, + "learning_rate": 0.00031947443575113655, + "loss": 0.0006, + "num_input_tokens_seen": 173525440, + "step": 80405 + }, + { + "epoch": 13.117455138662317, + "grad_norm": 0.0039051144849509, + "learning_rate": 0.00031940805912018854, + "loss": 0.002, + "num_input_tokens_seen": 173536000, + "step": 80410 + }, + { + "epoch": 13.11827079934747, + "grad_norm": 0.011386572383344173, + "learning_rate": 0.0003193416861491656, + "loss": 0.0047, + "num_input_tokens_seen": 173546848, + "step": 80415 + }, + { + "epoch": 13.119086460032626, + "grad_norm": 0.006250128149986267, + "learning_rate": 0.00031927531683941234, + "loss": 0.0519, + "num_input_tokens_seen": 173557824, + "step": 80420 + }, + { + "epoch": 13.119902120717782, + "grad_norm": 0.0010326653718948364, + "learning_rate": 0.0003192089511922742, + "loss": 0.0057, + "num_input_tokens_seen": 173568704, + "step": 80425 + }, + { + "epoch": 13.120717781402936, + "grad_norm": 0.08747044205665588, + "learning_rate": 0.0003191425892090959, + "loss": 0.1007, + "num_input_tokens_seen": 173580896, + "step": 80430 + }, + { + "epoch": 13.121533442088092, + "grad_norm": 0.0023840495850890875, + "learning_rate": 0.0003190762308912226, + "loss": 0.0012, + "num_input_tokens_seen": 173591616, + "step": 80435 + }, + { + "epoch": 13.122349102773246, + "grad_norm": 0.004006191156804562, + "learning_rate": 0.0003190098762399989, + "loss": 0.0042, + "num_input_tokens_seen": 173601600, + "step": 80440 + }, + { + "epoch": 13.123164763458401, + "grad_norm": 0.04537961632013321, + "learning_rate": 0.0003189435252567697, + "loss": 0.0074, + "num_input_tokens_seen": 173611488, + "step": 80445 + }, + { + "epoch": 13.123980424143557, + "grad_norm": 0.02513824962079525, + "learning_rate": 0.00031887717794287963, + "loss": 0.0124, + "num_input_tokens_seen": 173621376, + "step": 80450 + }, + { + "epoch": 13.124796084828711, + "grad_norm": 0.0011922186240553856, + "learning_rate": 0.0003188108342996732, + "loss": 0.0032, + "num_input_tokens_seen": 173631616, + "step": 80455 + }, + { + "epoch": 13.125611745513867, + "grad_norm": 0.05637362599372864, + "learning_rate": 0.0003187444943284953, + "loss": 0.0343, + "num_input_tokens_seen": 173643008, + "step": 80460 + }, + { + "epoch": 13.12642740619902, + "grad_norm": 0.002567294053733349, + "learning_rate": 0.00031867815803068996, + "loss": 0.0008, + "num_input_tokens_seen": 173654336, + "step": 80465 + }, + { + "epoch": 13.127243066884176, + "grad_norm": 0.0010322218295186758, + "learning_rate": 0.0003186118254076018, + "loss": 0.0009, + "num_input_tokens_seen": 173664832, + "step": 80470 + }, + { + "epoch": 13.12805872756933, + "grad_norm": 1.9691383838653564, + "learning_rate": 0.00031854549646057517, + "loss": 0.0188, + "num_input_tokens_seen": 173676672, + "step": 80475 + }, + { + "epoch": 13.128874388254486, + "grad_norm": 0.028286365792155266, + "learning_rate": 0.00031847917119095425, + "loss": 0.009, + "num_input_tokens_seen": 173687584, + "step": 80480 + }, + { + "epoch": 13.129690048939642, + "grad_norm": 0.03752854838967323, + "learning_rate": 0.0003184128496000832, + "loss": 0.003, + "num_input_tokens_seen": 173697856, + "step": 80485 + }, + { + "epoch": 13.130505709624796, + "grad_norm": 0.09181585162878036, + "learning_rate": 0.00031834653168930614, + "loss": 0.0887, + "num_input_tokens_seen": 173709312, + "step": 80490 + }, + { + "epoch": 13.131321370309951, + "grad_norm": 0.5715094804763794, + "learning_rate": 0.0003182802174599669, + "loss": 0.1862, + "num_input_tokens_seen": 173719392, + "step": 80495 + }, + { + "epoch": 13.132137030995105, + "grad_norm": 0.11002907156944275, + "learning_rate": 0.00031821390691340985, + "loss": 0.0058, + "num_input_tokens_seen": 173730432, + "step": 80500 + }, + { + "epoch": 13.132952691680261, + "grad_norm": 0.0014578074915334582, + "learning_rate": 0.0003181476000509783, + "loss": 0.0167, + "num_input_tokens_seen": 173740224, + "step": 80505 + }, + { + "epoch": 13.133768352365417, + "grad_norm": 0.00027162733022123575, + "learning_rate": 0.00031808129687401664, + "loss": 0.0014, + "num_input_tokens_seen": 173751680, + "step": 80510 + }, + { + "epoch": 13.13458401305057, + "grad_norm": 0.00035390013363212347, + "learning_rate": 0.00031801499738386797, + "loss": 0.0005, + "num_input_tokens_seen": 173763840, + "step": 80515 + }, + { + "epoch": 13.135399673735726, + "grad_norm": 0.16227097809314728, + "learning_rate": 0.0003179487015818765, + "loss": 0.0069, + "num_input_tokens_seen": 173774720, + "step": 80520 + }, + { + "epoch": 13.13621533442088, + "grad_norm": 0.003046720987185836, + "learning_rate": 0.00031788240946938534, + "loss": 0.0076, + "num_input_tokens_seen": 173787040, + "step": 80525 + }, + { + "epoch": 13.137030995106036, + "grad_norm": 0.005933790002018213, + "learning_rate": 0.00031781612104773836, + "loss": 0.0857, + "num_input_tokens_seen": 173798112, + "step": 80530 + }, + { + "epoch": 13.137846655791192, + "grad_norm": 0.003989804070442915, + "learning_rate": 0.00031774983631827866, + "loss": 0.0349, + "num_input_tokens_seen": 173808960, + "step": 80535 + }, + { + "epoch": 13.138662316476346, + "grad_norm": 0.04630148410797119, + "learning_rate": 0.00031768355528234986, + "loss": 0.0991, + "num_input_tokens_seen": 173820320, + "step": 80540 + }, + { + "epoch": 13.139477977161501, + "grad_norm": 0.0012686635600402951, + "learning_rate": 0.0003176172779412949, + "loss": 0.0011, + "num_input_tokens_seen": 173831648, + "step": 80545 + }, + { + "epoch": 13.140293637846655, + "grad_norm": 0.07240075618028641, + "learning_rate": 0.00031755100429645746, + "loss": 0.0056, + "num_input_tokens_seen": 173842624, + "step": 80550 + }, + { + "epoch": 13.141109298531811, + "grad_norm": 0.0008878922672010958, + "learning_rate": 0.00031748473434918014, + "loss": 0.0025, + "num_input_tokens_seen": 173852608, + "step": 80555 + }, + { + "epoch": 13.141924959216965, + "grad_norm": 0.017875486984848976, + "learning_rate": 0.0003174184681008061, + "loss": 0.0027, + "num_input_tokens_seen": 173863680, + "step": 80560 + }, + { + "epoch": 13.14274061990212, + "grad_norm": 0.13581252098083496, + "learning_rate": 0.00031735220555267874, + "loss": 0.0078, + "num_input_tokens_seen": 173875328, + "step": 80565 + }, + { + "epoch": 13.143556280587276, + "grad_norm": 0.03006221540272236, + "learning_rate": 0.0003172859467061404, + "loss": 0.0025, + "num_input_tokens_seen": 173887200, + "step": 80570 + }, + { + "epoch": 13.14437194127243, + "grad_norm": 0.00019458668248262256, + "learning_rate": 0.0003172196915625344, + "loss": 0.0026, + "num_input_tokens_seen": 173898080, + "step": 80575 + }, + { + "epoch": 13.145187601957586, + "grad_norm": 0.004450054839253426, + "learning_rate": 0.0003171534401232029, + "loss": 0.0513, + "num_input_tokens_seen": 173908800, + "step": 80580 + }, + { + "epoch": 13.14600326264274, + "grad_norm": 0.05440502241253853, + "learning_rate": 0.0003170871923894892, + "loss": 0.0076, + "num_input_tokens_seen": 173920416, + "step": 80585 + }, + { + "epoch": 13.146818923327896, + "grad_norm": 0.03472015634179115, + "learning_rate": 0.0003170209483627353, + "loss": 0.1548, + "num_input_tokens_seen": 173930592, + "step": 80590 + }, + { + "epoch": 13.147634584013051, + "grad_norm": 0.3058006763458252, + "learning_rate": 0.00031695470804428427, + "loss": 0.0259, + "num_input_tokens_seen": 173941344, + "step": 80595 + }, + { + "epoch": 13.148450244698205, + "grad_norm": 0.008313850499689579, + "learning_rate": 0.0003168884714354781, + "loss": 0.0252, + "num_input_tokens_seen": 173951936, + "step": 80600 + }, + { + "epoch": 13.149265905383361, + "grad_norm": 0.004506675526499748, + "learning_rate": 0.0003168222385376596, + "loss": 0.1441, + "num_input_tokens_seen": 173962080, + "step": 80605 + }, + { + "epoch": 13.150081566068515, + "grad_norm": 0.0005903402343392372, + "learning_rate": 0.0003167560093521705, + "loss": 0.0023, + "num_input_tokens_seen": 173972544, + "step": 80610 + }, + { + "epoch": 13.15089722675367, + "grad_norm": 0.16615605354309082, + "learning_rate": 0.00031668978388035347, + "loss": 0.0295, + "num_input_tokens_seen": 173982560, + "step": 80615 + }, + { + "epoch": 13.151712887438826, + "grad_norm": 0.0465259812772274, + "learning_rate": 0.0003166235621235505, + "loss": 0.0083, + "num_input_tokens_seen": 173992864, + "step": 80620 + }, + { + "epoch": 13.15252854812398, + "grad_norm": 0.0029099388048052788, + "learning_rate": 0.00031655734408310367, + "loss": 0.0026, + "num_input_tokens_seen": 174002784, + "step": 80625 + }, + { + "epoch": 13.153344208809136, + "grad_norm": 0.04609158635139465, + "learning_rate": 0.000316491129760355, + "loss": 0.0083, + "num_input_tokens_seen": 174012896, + "step": 80630 + }, + { + "epoch": 13.15415986949429, + "grad_norm": 0.00158396502956748, + "learning_rate": 0.0003164249191566464, + "loss": 0.0065, + "num_input_tokens_seen": 174024448, + "step": 80635 + }, + { + "epoch": 13.154975530179446, + "grad_norm": 0.00041240922291763127, + "learning_rate": 0.00031635871227331957, + "loss": 0.0205, + "num_input_tokens_seen": 174035424, + "step": 80640 + }, + { + "epoch": 13.1557911908646, + "grad_norm": 0.0235210619866848, + "learning_rate": 0.00031629250911171657, + "loss": 0.0041, + "num_input_tokens_seen": 174046016, + "step": 80645 + }, + { + "epoch": 13.156606851549755, + "grad_norm": 0.028905780985951424, + "learning_rate": 0.0003162263096731788, + "loss": 0.0028, + "num_input_tokens_seen": 174055936, + "step": 80650 + }, + { + "epoch": 13.15742251223491, + "grad_norm": 0.01393867190927267, + "learning_rate": 0.0003161601139590482, + "loss": 0.0041, + "num_input_tokens_seen": 174065696, + "step": 80655 + }, + { + "epoch": 13.158238172920065, + "grad_norm": 0.023142701014876366, + "learning_rate": 0.0003160939219706658, + "loss": 0.0283, + "num_input_tokens_seen": 174075904, + "step": 80660 + }, + { + "epoch": 13.15905383360522, + "grad_norm": 0.031767264008522034, + "learning_rate": 0.00031602773370937345, + "loss": 0.034, + "num_input_tokens_seen": 174086528, + "step": 80665 + }, + { + "epoch": 13.159869494290374, + "grad_norm": 0.0672643855214119, + "learning_rate": 0.00031596154917651266, + "loss": 0.1343, + "num_input_tokens_seen": 174097696, + "step": 80670 + }, + { + "epoch": 13.16068515497553, + "grad_norm": 0.0005012695328332484, + "learning_rate": 0.0003158953683734244, + "loss": 0.0077, + "num_input_tokens_seen": 174108192, + "step": 80675 + }, + { + "epoch": 13.161500815660686, + "grad_norm": 0.030545882880687714, + "learning_rate": 0.00031582919130145016, + "loss": 0.0108, + "num_input_tokens_seen": 174120224, + "step": 80680 + }, + { + "epoch": 13.16231647634584, + "grad_norm": 0.2664947807788849, + "learning_rate": 0.0003157630179619308, + "loss": 0.0134, + "num_input_tokens_seen": 174130048, + "step": 80685 + }, + { + "epoch": 13.163132137030995, + "grad_norm": 0.024023720994591713, + "learning_rate": 0.00031569684835620784, + "loss": 0.0047, + "num_input_tokens_seen": 174140992, + "step": 80690 + }, + { + "epoch": 13.16394779771615, + "grad_norm": 0.0062830145470798016, + "learning_rate": 0.00031563068248562185, + "loss": 0.1168, + "num_input_tokens_seen": 174151264, + "step": 80695 + }, + { + "epoch": 13.164763458401305, + "grad_norm": 0.011702772229909897, + "learning_rate": 0.00031556452035151416, + "loss": 0.0165, + "num_input_tokens_seen": 174163136, + "step": 80700 + }, + { + "epoch": 13.16557911908646, + "grad_norm": 0.0019268464529886842, + "learning_rate": 0.00031549836195522517, + "loss": 0.003, + "num_input_tokens_seen": 174174080, + "step": 80705 + }, + { + "epoch": 13.166394779771615, + "grad_norm": 0.0038041851948946714, + "learning_rate": 0.00031543220729809626, + "loss": 0.002, + "num_input_tokens_seen": 174185536, + "step": 80710 + }, + { + "epoch": 13.16721044045677, + "grad_norm": 0.16390781104564667, + "learning_rate": 0.00031536605638146756, + "loss": 0.0079, + "num_input_tokens_seen": 174196064, + "step": 80715 + }, + { + "epoch": 13.168026101141924, + "grad_norm": 0.06964511424303055, + "learning_rate": 0.0003152999092066801, + "loss": 0.1121, + "num_input_tokens_seen": 174206560, + "step": 80720 + }, + { + "epoch": 13.16884176182708, + "grad_norm": 0.016062738373875618, + "learning_rate": 0.0003152337657750741, + "loss": 0.0045, + "num_input_tokens_seen": 174219136, + "step": 80725 + }, + { + "epoch": 13.169657422512234, + "grad_norm": 0.14013929665088654, + "learning_rate": 0.00031516762608799047, + "loss": 0.0172, + "num_input_tokens_seen": 174229952, + "step": 80730 + }, + { + "epoch": 13.17047308319739, + "grad_norm": 0.3120743930339813, + "learning_rate": 0.0003151014901467691, + "loss": 0.0367, + "num_input_tokens_seen": 174239168, + "step": 80735 + }, + { + "epoch": 13.171288743882545, + "grad_norm": 0.000586897658649832, + "learning_rate": 0.00031503535795275096, + "loss": 0.0026, + "num_input_tokens_seen": 174248864, + "step": 80740 + }, + { + "epoch": 13.1721044045677, + "grad_norm": 0.006280173547565937, + "learning_rate": 0.00031496922950727556, + "loss": 0.0344, + "num_input_tokens_seen": 174259872, + "step": 80745 + }, + { + "epoch": 13.172920065252855, + "grad_norm": 0.00427344860509038, + "learning_rate": 0.00031490310481168375, + "loss": 0.0051, + "num_input_tokens_seen": 174271136, + "step": 80750 + }, + { + "epoch": 13.173735725938009, + "grad_norm": 0.0013054907321929932, + "learning_rate": 0.0003148369838673151, + "loss": 0.003, + "num_input_tokens_seen": 174282624, + "step": 80755 + }, + { + "epoch": 13.174551386623165, + "grad_norm": 0.00782183650881052, + "learning_rate": 0.00031477086667551003, + "loss": 0.1282, + "num_input_tokens_seen": 174294368, + "step": 80760 + }, + { + "epoch": 13.17536704730832, + "grad_norm": 0.007074335124343634, + "learning_rate": 0.00031470475323760826, + "loss": 0.0199, + "num_input_tokens_seen": 174305536, + "step": 80765 + }, + { + "epoch": 13.176182707993474, + "grad_norm": 0.002022119704633951, + "learning_rate": 0.0003146386435549496, + "loss": 0.0138, + "num_input_tokens_seen": 174316192, + "step": 80770 + }, + { + "epoch": 13.17699836867863, + "grad_norm": 0.02543884702026844, + "learning_rate": 0.0003145725376288742, + "loss": 0.0237, + "num_input_tokens_seen": 174326336, + "step": 80775 + }, + { + "epoch": 13.177814029363784, + "grad_norm": 0.06555605679750443, + "learning_rate": 0.00031450643546072145, + "loss": 0.0128, + "num_input_tokens_seen": 174336960, + "step": 80780 + }, + { + "epoch": 13.17862969004894, + "grad_norm": 0.006017104722559452, + "learning_rate": 0.0003144403370518311, + "loss": 0.0074, + "num_input_tokens_seen": 174347264, + "step": 80785 + }, + { + "epoch": 13.179445350734095, + "grad_norm": 0.14606615900993347, + "learning_rate": 0.00031437424240354274, + "loss": 0.1346, + "num_input_tokens_seen": 174357984, + "step": 80790 + }, + { + "epoch": 13.18026101141925, + "grad_norm": 0.026094675064086914, + "learning_rate": 0.00031430815151719583, + "loss": 0.002, + "num_input_tokens_seen": 174367520, + "step": 80795 + }, + { + "epoch": 13.181076672104405, + "grad_norm": 0.00727175222709775, + "learning_rate": 0.00031424206439412984, + "loss": 0.0277, + "num_input_tokens_seen": 174380000, + "step": 80800 + }, + { + "epoch": 13.181892332789559, + "grad_norm": 0.0068876431323587894, + "learning_rate": 0.00031417598103568404, + "loss": 0.0012, + "num_input_tokens_seen": 174389440, + "step": 80805 + }, + { + "epoch": 13.182707993474715, + "grad_norm": 0.5171151161193848, + "learning_rate": 0.00031410990144319756, + "loss": 0.0227, + "num_input_tokens_seen": 174400224, + "step": 80810 + }, + { + "epoch": 13.18352365415987, + "grad_norm": 0.010307754389941692, + "learning_rate": 0.00031404382561801006, + "loss": 0.1847, + "num_input_tokens_seen": 174411904, + "step": 80815 + }, + { + "epoch": 13.184339314845024, + "grad_norm": 0.02934756688773632, + "learning_rate": 0.00031397775356146004, + "loss": 0.0073, + "num_input_tokens_seen": 174421984, + "step": 80820 + }, + { + "epoch": 13.18515497553018, + "grad_norm": 0.003819479141384363, + "learning_rate": 0.000313911685274887, + "loss": 0.1208, + "num_input_tokens_seen": 174433472, + "step": 80825 + }, + { + "epoch": 13.185970636215334, + "grad_norm": 0.008918145671486855, + "learning_rate": 0.0003138456207596296, + "loss": 0.0035, + "num_input_tokens_seen": 174444544, + "step": 80830 + }, + { + "epoch": 13.18678629690049, + "grad_norm": 0.005234909243881702, + "learning_rate": 0.0003137795600170271, + "loss": 0.0045, + "num_input_tokens_seen": 174455584, + "step": 80835 + }, + { + "epoch": 13.187601957585644, + "grad_norm": 0.09592694044113159, + "learning_rate": 0.0003137135030484177, + "loss": 0.0103, + "num_input_tokens_seen": 174465856, + "step": 80840 + }, + { + "epoch": 13.1884176182708, + "grad_norm": 0.01953078806400299, + "learning_rate": 0.00031364744985514084, + "loss": 0.0027, + "num_input_tokens_seen": 174476448, + "step": 80845 + }, + { + "epoch": 13.189233278955955, + "grad_norm": 0.01690012589097023, + "learning_rate": 0.00031358140043853455, + "loss": 0.0041, + "num_input_tokens_seen": 174487648, + "step": 80850 + }, + { + "epoch": 13.190048939641109, + "grad_norm": 0.0010701438877731562, + "learning_rate": 0.00031351535479993785, + "loss": 0.0029, + "num_input_tokens_seen": 174497952, + "step": 80855 + }, + { + "epoch": 13.190864600326265, + "grad_norm": 0.0015289948787540197, + "learning_rate": 0.0003134493129406889, + "loss": 0.002, + "num_input_tokens_seen": 174508320, + "step": 80860 + }, + { + "epoch": 13.191680261011419, + "grad_norm": 0.06929512321949005, + "learning_rate": 0.00031338327486212647, + "loss": 0.0167, + "num_input_tokens_seen": 174519264, + "step": 80865 + }, + { + "epoch": 13.192495921696574, + "grad_norm": 0.032603226602077484, + "learning_rate": 0.00031331724056558847, + "loss": 0.0022, + "num_input_tokens_seen": 174531168, + "step": 80870 + }, + { + "epoch": 13.19331158238173, + "grad_norm": 0.006660095881670713, + "learning_rate": 0.0003132512100524134, + "loss": 0.0031, + "num_input_tokens_seen": 174542240, + "step": 80875 + }, + { + "epoch": 13.194127243066884, + "grad_norm": 0.007369876839220524, + "learning_rate": 0.00031318518332393975, + "loss": 0.0036, + "num_input_tokens_seen": 174552800, + "step": 80880 + }, + { + "epoch": 13.19494290375204, + "grad_norm": 0.24104191362857819, + "learning_rate": 0.0003131191603815051, + "loss": 0.1012, + "num_input_tokens_seen": 174564064, + "step": 80885 + }, + { + "epoch": 13.195758564437194, + "grad_norm": 0.0014521119883283973, + "learning_rate": 0.000313053141226448, + "loss": 0.0036, + "num_input_tokens_seen": 174575680, + "step": 80890 + }, + { + "epoch": 13.19657422512235, + "grad_norm": 0.019155146554112434, + "learning_rate": 0.0003129871258601059, + "loss": 0.0043, + "num_input_tokens_seen": 174586432, + "step": 80895 + }, + { + "epoch": 13.197389885807505, + "grad_norm": 0.00543161341920495, + "learning_rate": 0.0003129211142838171, + "loss": 0.0455, + "num_input_tokens_seen": 174596896, + "step": 80900 + }, + { + "epoch": 13.198205546492659, + "grad_norm": 0.0008313761791214347, + "learning_rate": 0.0003128551064989191, + "loss": 0.0108, + "num_input_tokens_seen": 174608768, + "step": 80905 + }, + { + "epoch": 13.199021207177815, + "grad_norm": 0.030279390513896942, + "learning_rate": 0.00031278910250674994, + "loss": 0.0049, + "num_input_tokens_seen": 174619616, + "step": 80910 + }, + { + "epoch": 13.199836867862969, + "grad_norm": 0.002460107207298279, + "learning_rate": 0.00031272310230864695, + "loss": 0.0024, + "num_input_tokens_seen": 174630784, + "step": 80915 + }, + { + "epoch": 13.200652528548124, + "grad_norm": 0.0018134871497750282, + "learning_rate": 0.0003126571059059481, + "loss": 0.0059, + "num_input_tokens_seen": 174641664, + "step": 80920 + }, + { + "epoch": 13.201468189233278, + "grad_norm": 0.0011770866112783551, + "learning_rate": 0.00031259111329999035, + "loss": 0.0007, + "num_input_tokens_seen": 174652736, + "step": 80925 + }, + { + "epoch": 13.202283849918434, + "grad_norm": 0.7516927123069763, + "learning_rate": 0.00031252512449211163, + "loss": 0.0842, + "num_input_tokens_seen": 174662752, + "step": 80930 + }, + { + "epoch": 13.20309951060359, + "grad_norm": 0.004124483093619347, + "learning_rate": 0.0003124591394836491, + "loss": 0.0051, + "num_input_tokens_seen": 174673184, + "step": 80935 + }, + { + "epoch": 13.203915171288743, + "grad_norm": 0.007518226280808449, + "learning_rate": 0.00031239315827593994, + "loss": 0.0028, + "num_input_tokens_seen": 174684032, + "step": 80940 + }, + { + "epoch": 13.2047308319739, + "grad_norm": 0.012893558479845524, + "learning_rate": 0.0003123271808703215, + "loss": 0.0032, + "num_input_tokens_seen": 174695072, + "step": 80945 + }, + { + "epoch": 13.205546492659053, + "grad_norm": 0.0012581703485921025, + "learning_rate": 0.0003122612072681308, + "loss": 0.013, + "num_input_tokens_seen": 174706272, + "step": 80950 + }, + { + "epoch": 13.206362153344209, + "grad_norm": 0.0031796579714864492, + "learning_rate": 0.00031219523747070475, + "loss": 0.0044, + "num_input_tokens_seen": 174717120, + "step": 80955 + }, + { + "epoch": 13.207177814029365, + "grad_norm": 0.0015025814063847065, + "learning_rate": 0.00031212927147938066, + "loss": 0.0037, + "num_input_tokens_seen": 174727520, + "step": 80960 + }, + { + "epoch": 13.207993474714518, + "grad_norm": 0.0012730866437777877, + "learning_rate": 0.0003120633092954951, + "loss": 0.0669, + "num_input_tokens_seen": 174737152, + "step": 80965 + }, + { + "epoch": 13.208809135399674, + "grad_norm": 0.004645083099603653, + "learning_rate": 0.0003119973509203851, + "loss": 0.0097, + "num_input_tokens_seen": 174748192, + "step": 80970 + }, + { + "epoch": 13.209624796084828, + "grad_norm": 0.008076664991676807, + "learning_rate": 0.00031193139635538714, + "loss": 0.0058, + "num_input_tokens_seen": 174759136, + "step": 80975 + }, + { + "epoch": 13.210440456769984, + "grad_norm": 0.595390796661377, + "learning_rate": 0.00031186544560183796, + "loss": 0.1112, + "num_input_tokens_seen": 174770400, + "step": 80980 + }, + { + "epoch": 13.21125611745514, + "grad_norm": 0.0069711292162537575, + "learning_rate": 0.00031179949866107443, + "loss": 0.0163, + "num_input_tokens_seen": 174781568, + "step": 80985 + }, + { + "epoch": 13.212071778140293, + "grad_norm": 0.005004086997359991, + "learning_rate": 0.0003117335555344326, + "loss": 0.0092, + "num_input_tokens_seen": 174792608, + "step": 80990 + }, + { + "epoch": 13.21288743882545, + "grad_norm": 0.004500801209360361, + "learning_rate": 0.00031166761622324936, + "loss": 0.0068, + "num_input_tokens_seen": 174802080, + "step": 80995 + }, + { + "epoch": 13.213703099510603, + "grad_norm": 0.0016947019612416625, + "learning_rate": 0.00031160168072886054, + "loss": 0.0087, + "num_input_tokens_seen": 174813248, + "step": 81000 + }, + { + "epoch": 13.214518760195759, + "grad_norm": 0.13866205513477325, + "learning_rate": 0.00031153574905260287, + "loss": 0.0116, + "num_input_tokens_seen": 174823904, + "step": 81005 + }, + { + "epoch": 13.215334420880913, + "grad_norm": 0.0029896912164986134, + "learning_rate": 0.000311469821195812, + "loss": 0.0233, + "num_input_tokens_seen": 174835072, + "step": 81010 + }, + { + "epoch": 13.216150081566068, + "grad_norm": 0.06322607398033142, + "learning_rate": 0.00031140389715982476, + "loss": 0.0045, + "num_input_tokens_seen": 174847264, + "step": 81015 + }, + { + "epoch": 13.216965742251224, + "grad_norm": 0.011894318275153637, + "learning_rate": 0.00031133797694597655, + "loss": 0.0082, + "num_input_tokens_seen": 174858432, + "step": 81020 + }, + { + "epoch": 13.217781402936378, + "grad_norm": 0.004805736243724823, + "learning_rate": 0.0003112720605556037, + "loss": 0.0044, + "num_input_tokens_seen": 174868000, + "step": 81025 + }, + { + "epoch": 13.218597063621534, + "grad_norm": 0.20973259210586548, + "learning_rate": 0.00031120614799004184, + "loss": 0.0068, + "num_input_tokens_seen": 174877568, + "step": 81030 + }, + { + "epoch": 13.219412724306688, + "grad_norm": 0.04392321780323982, + "learning_rate": 0.0003111402392506271, + "loss": 0.0037, + "num_input_tokens_seen": 174887616, + "step": 81035 + }, + { + "epoch": 13.220228384991843, + "grad_norm": 0.4631519317626953, + "learning_rate": 0.0003110743343386947, + "loss": 0.0851, + "num_input_tokens_seen": 174897568, + "step": 81040 + }, + { + "epoch": 13.221044045676999, + "grad_norm": 0.008608998730778694, + "learning_rate": 0.0003110084332555808, + "loss": 0.0116, + "num_input_tokens_seen": 174908224, + "step": 81045 + }, + { + "epoch": 13.221859706362153, + "grad_norm": 0.06304867565631866, + "learning_rate": 0.00031094253600262063, + "loss": 0.0217, + "num_input_tokens_seen": 174919200, + "step": 81050 + }, + { + "epoch": 13.222675367047309, + "grad_norm": 0.010637091472744942, + "learning_rate": 0.00031087664258115, + "loss": 0.0019, + "num_input_tokens_seen": 174931104, + "step": 81055 + }, + { + "epoch": 13.223491027732463, + "grad_norm": 0.0019851981196552515, + "learning_rate": 0.0003108107529925038, + "loss": 0.013, + "num_input_tokens_seen": 174942464, + "step": 81060 + }, + { + "epoch": 13.224306688417618, + "grad_norm": 0.02493548020720482, + "learning_rate": 0.0003107448672380181, + "loss": 0.0286, + "num_input_tokens_seen": 174953856, + "step": 81065 + }, + { + "epoch": 13.225122349102774, + "grad_norm": 0.1517847329378128, + "learning_rate": 0.0003106789853190274, + "loss": 0.0154, + "num_input_tokens_seen": 174963840, + "step": 81070 + }, + { + "epoch": 13.225938009787928, + "grad_norm": 0.0247593242675066, + "learning_rate": 0.0003106131072368674, + "loss": 0.0049, + "num_input_tokens_seen": 174975424, + "step": 81075 + }, + { + "epoch": 13.226753670473084, + "grad_norm": 0.0007629492320120335, + "learning_rate": 0.00031054723299287303, + "loss": 0.0017, + "num_input_tokens_seen": 174985984, + "step": 81080 + }, + { + "epoch": 13.227569331158238, + "grad_norm": 0.0026555024087429047, + "learning_rate": 0.00031048136258837923, + "loss": 0.0016, + "num_input_tokens_seen": 174996768, + "step": 81085 + }, + { + "epoch": 13.228384991843393, + "grad_norm": 0.008873987011611462, + "learning_rate": 0.0003104154960247211, + "loss": 0.0202, + "num_input_tokens_seen": 175007360, + "step": 81090 + }, + { + "epoch": 13.229200652528547, + "grad_norm": 0.006883780937641859, + "learning_rate": 0.0003103496333032334, + "loss": 0.0206, + "num_input_tokens_seen": 175017312, + "step": 81095 + }, + { + "epoch": 13.230016313213703, + "grad_norm": 0.12087459862232208, + "learning_rate": 0.00031028377442525104, + "loss": 0.0078, + "num_input_tokens_seen": 175028512, + "step": 81100 + }, + { + "epoch": 13.230831973898859, + "grad_norm": 0.020305240526795387, + "learning_rate": 0.0003102179193921086, + "loss": 0.0065, + "num_input_tokens_seen": 175038656, + "step": 81105 + }, + { + "epoch": 13.231647634584013, + "grad_norm": 0.03874611854553223, + "learning_rate": 0.00031015206820514087, + "loss": 0.0077, + "num_input_tokens_seen": 175049504, + "step": 81110 + }, + { + "epoch": 13.232463295269168, + "grad_norm": 2.0964395999908447, + "learning_rate": 0.0003100862208656823, + "loss": 0.1223, + "num_input_tokens_seen": 175060512, + "step": 81115 + }, + { + "epoch": 13.233278955954322, + "grad_norm": 0.09183394908905029, + "learning_rate": 0.0003100203773750674, + "loss": 0.0037, + "num_input_tokens_seen": 175070304, + "step": 81120 + }, + { + "epoch": 13.234094616639478, + "grad_norm": 0.037957221269607544, + "learning_rate": 0.00030995453773463035, + "loss": 0.0048, + "num_input_tokens_seen": 175082368, + "step": 81125 + }, + { + "epoch": 13.234910277324634, + "grad_norm": 0.06841553747653961, + "learning_rate": 0.00030988870194570596, + "loss": 0.006, + "num_input_tokens_seen": 175092896, + "step": 81130 + }, + { + "epoch": 13.235725938009788, + "grad_norm": 0.5958292484283447, + "learning_rate": 0.00030982287000962805, + "loss": 0.1201, + "num_input_tokens_seen": 175103808, + "step": 81135 + }, + { + "epoch": 13.236541598694943, + "grad_norm": 0.005137511063367128, + "learning_rate": 0.000309757041927731, + "loss": 0.0271, + "num_input_tokens_seen": 175115136, + "step": 81140 + }, + { + "epoch": 13.237357259380097, + "grad_norm": 0.2733260989189148, + "learning_rate": 0.00030969121770134877, + "loss": 0.0131, + "num_input_tokens_seen": 175125568, + "step": 81145 + }, + { + "epoch": 13.238172920065253, + "grad_norm": 0.0012301302049309015, + "learning_rate": 0.0003096253973318156, + "loss": 0.0026, + "num_input_tokens_seen": 175136352, + "step": 81150 + }, + { + "epoch": 13.238988580750409, + "grad_norm": 0.0009821879211813211, + "learning_rate": 0.000309559580820465, + "loss": 0.0017, + "num_input_tokens_seen": 175146912, + "step": 81155 + }, + { + "epoch": 13.239804241435563, + "grad_norm": 0.0006925233756192029, + "learning_rate": 0.0003094937681686314, + "loss": 0.001, + "num_input_tokens_seen": 175158208, + "step": 81160 + }, + { + "epoch": 13.240619902120718, + "grad_norm": 0.0035125180147588253, + "learning_rate": 0.00030942795937764794, + "loss": 0.0025, + "num_input_tokens_seen": 175169120, + "step": 81165 + }, + { + "epoch": 13.241435562805872, + "grad_norm": 0.004515068605542183, + "learning_rate": 0.00030936215444884893, + "loss": 0.0125, + "num_input_tokens_seen": 175178400, + "step": 81170 + }, + { + "epoch": 13.242251223491028, + "grad_norm": 0.005201220046728849, + "learning_rate": 0.00030929635338356745, + "loss": 0.0818, + "num_input_tokens_seen": 175188576, + "step": 81175 + }, + { + "epoch": 13.243066884176184, + "grad_norm": 0.007266409229487181, + "learning_rate": 0.0003092305561831375, + "loss": 0.0015, + "num_input_tokens_seen": 175199872, + "step": 81180 + }, + { + "epoch": 13.243882544861338, + "grad_norm": 0.00982783455401659, + "learning_rate": 0.0003091647628488922, + "loss": 0.0018, + "num_input_tokens_seen": 175210560, + "step": 81185 + }, + { + "epoch": 13.244698205546493, + "grad_norm": 0.14936675131320953, + "learning_rate": 0.0003090989733821652, + "loss": 0.0076, + "num_input_tokens_seen": 175221440, + "step": 81190 + }, + { + "epoch": 13.245513866231647, + "grad_norm": 0.2236238569021225, + "learning_rate": 0.0003090331877842895, + "loss": 0.0419, + "num_input_tokens_seen": 175232800, + "step": 81195 + }, + { + "epoch": 13.246329526916803, + "grad_norm": 0.005720221437513828, + "learning_rate": 0.00030896740605659845, + "loss": 0.0012, + "num_input_tokens_seen": 175243296, + "step": 81200 + }, + { + "epoch": 13.247145187601957, + "grad_norm": 0.5966299176216125, + "learning_rate": 0.00030890162820042553, + "loss": 0.0868, + "num_input_tokens_seen": 175255328, + "step": 81205 + }, + { + "epoch": 13.247960848287113, + "grad_norm": 0.006146843545138836, + "learning_rate": 0.00030883585421710334, + "loss": 0.003, + "num_input_tokens_seen": 175267616, + "step": 81210 + }, + { + "epoch": 13.248776508972268, + "grad_norm": 0.0009429160272702575, + "learning_rate": 0.00030877008410796526, + "loss": 0.0018, + "num_input_tokens_seen": 175278048, + "step": 81215 + }, + { + "epoch": 13.249592169657422, + "grad_norm": 0.010916613973677158, + "learning_rate": 0.00030870431787434385, + "loss": 0.0047, + "num_input_tokens_seen": 175289088, + "step": 81220 + }, + { + "epoch": 13.250407830342578, + "grad_norm": 0.0007581052486784756, + "learning_rate": 0.00030863855551757223, + "loss": 0.003, + "num_input_tokens_seen": 175299328, + "step": 81225 + }, + { + "epoch": 13.251223491027732, + "grad_norm": 0.0005442486144602299, + "learning_rate": 0.0003085727970389829, + "loss": 0.032, + "num_input_tokens_seen": 175310080, + "step": 81230 + }, + { + "epoch": 13.252039151712887, + "grad_norm": 0.014770587906241417, + "learning_rate": 0.0003085070424399089, + "loss": 0.0155, + "num_input_tokens_seen": 175319360, + "step": 81235 + }, + { + "epoch": 13.252854812398043, + "grad_norm": 0.0068336124531924725, + "learning_rate": 0.00030844129172168236, + "loss": 0.0034, + "num_input_tokens_seen": 175331136, + "step": 81240 + }, + { + "epoch": 13.253670473083197, + "grad_norm": 0.006275962106883526, + "learning_rate": 0.0003083755448856361, + "loss": 0.0037, + "num_input_tokens_seen": 175341632, + "step": 81245 + }, + { + "epoch": 13.254486133768353, + "grad_norm": 0.6013478636741638, + "learning_rate": 0.00030830980193310265, + "loss": 0.047, + "num_input_tokens_seen": 175352992, + "step": 81250 + }, + { + "epoch": 13.255301794453507, + "grad_norm": 0.030473748221993446, + "learning_rate": 0.00030824406286541415, + "loss": 0.0714, + "num_input_tokens_seen": 175364160, + "step": 81255 + }, + { + "epoch": 13.256117455138662, + "grad_norm": 0.00882295984774828, + "learning_rate": 0.00030817832768390306, + "loss": 0.0138, + "num_input_tokens_seen": 175374912, + "step": 81260 + }, + { + "epoch": 13.256933115823816, + "grad_norm": 0.9932186007499695, + "learning_rate": 0.0003081125963899014, + "loss": 0.0217, + "num_input_tokens_seen": 175385440, + "step": 81265 + }, + { + "epoch": 13.257748776508972, + "grad_norm": 0.23298044502735138, + "learning_rate": 0.0003080468689847414, + "loss": 0.0378, + "num_input_tokens_seen": 175396416, + "step": 81270 + }, + { + "epoch": 13.258564437194128, + "grad_norm": 0.0037160255014896393, + "learning_rate": 0.00030798114546975525, + "loss": 0.0036, + "num_input_tokens_seen": 175406912, + "step": 81275 + }, + { + "epoch": 13.259380097879282, + "grad_norm": 0.03932074084877968, + "learning_rate": 0.00030791542584627455, + "loss": 0.0101, + "num_input_tokens_seen": 175417600, + "step": 81280 + }, + { + "epoch": 13.260195758564437, + "grad_norm": 0.05499856546521187, + "learning_rate": 0.0003078497101156317, + "loss": 0.004, + "num_input_tokens_seen": 175427488, + "step": 81285 + }, + { + "epoch": 13.261011419249591, + "grad_norm": 0.0008840316440910101, + "learning_rate": 0.00030778399827915796, + "loss": 0.0009, + "num_input_tokens_seen": 175438496, + "step": 81290 + }, + { + "epoch": 13.261827079934747, + "grad_norm": 0.03506457060575485, + "learning_rate": 0.0003077182903381856, + "loss": 0.0175, + "num_input_tokens_seen": 175449760, + "step": 81295 + }, + { + "epoch": 13.262642740619903, + "grad_norm": 0.017485596239566803, + "learning_rate": 0.0003076525862940458, + "loss": 0.0042, + "num_input_tokens_seen": 175460160, + "step": 81300 + }, + { + "epoch": 13.263458401305057, + "grad_norm": 0.008316759020090103, + "learning_rate": 0.00030758688614807033, + "loss": 0.0023, + "num_input_tokens_seen": 175469376, + "step": 81305 + }, + { + "epoch": 13.264274061990212, + "grad_norm": 0.02052072249352932, + "learning_rate": 0.0003075211899015909, + "loss": 0.0163, + "num_input_tokens_seen": 175480416, + "step": 81310 + }, + { + "epoch": 13.265089722675366, + "grad_norm": 0.0013523380039259791, + "learning_rate": 0.0003074554975559386, + "loss": 0.0098, + "num_input_tokens_seen": 175490624, + "step": 81315 + }, + { + "epoch": 13.265905383360522, + "grad_norm": 0.004528264980763197, + "learning_rate": 0.000307389809112445, + "loss": 0.0076, + "num_input_tokens_seen": 175501568, + "step": 81320 + }, + { + "epoch": 13.266721044045678, + "grad_norm": 0.028241673484444618, + "learning_rate": 0.0003073241245724411, + "loss": 0.0037, + "num_input_tokens_seen": 175512928, + "step": 81325 + }, + { + "epoch": 13.267536704730832, + "grad_norm": 0.0007365340716205537, + "learning_rate": 0.00030725844393725846, + "loss": 0.0093, + "num_input_tokens_seen": 175523424, + "step": 81330 + }, + { + "epoch": 13.268352365415987, + "grad_norm": 0.001413402846083045, + "learning_rate": 0.00030719276720822774, + "loss": 0.0056, + "num_input_tokens_seen": 175534272, + "step": 81335 + }, + { + "epoch": 13.269168026101141, + "grad_norm": 0.003839910961687565, + "learning_rate": 0.0003071270943866804, + "loss": 0.0063, + "num_input_tokens_seen": 175545760, + "step": 81340 + }, + { + "epoch": 13.269983686786297, + "grad_norm": 0.15810967981815338, + "learning_rate": 0.000307061425473947, + "loss": 0.0098, + "num_input_tokens_seen": 175556416, + "step": 81345 + }, + { + "epoch": 13.270799347471453, + "grad_norm": 0.6461425423622131, + "learning_rate": 0.00030699576047135875, + "loss": 0.0828, + "num_input_tokens_seen": 175566272, + "step": 81350 + }, + { + "epoch": 13.271615008156607, + "grad_norm": 0.0015590997645631433, + "learning_rate": 0.0003069300993802461, + "loss": 0.0033, + "num_input_tokens_seen": 175577440, + "step": 81355 + }, + { + "epoch": 13.272430668841762, + "grad_norm": 0.005757700186222792, + "learning_rate": 0.00030686444220194, + "loss": 0.0238, + "num_input_tokens_seen": 175589184, + "step": 81360 + }, + { + "epoch": 13.273246329526916, + "grad_norm": 0.0030070720240473747, + "learning_rate": 0.00030679878893777085, + "loss": 0.0091, + "num_input_tokens_seen": 175599712, + "step": 81365 + }, + { + "epoch": 13.274061990212072, + "grad_norm": 0.34976568818092346, + "learning_rate": 0.0003067331395890696, + "loss": 0.0419, + "num_input_tokens_seen": 175610400, + "step": 81370 + }, + { + "epoch": 13.274877650897226, + "grad_norm": 0.15500667691230774, + "learning_rate": 0.0003066674941571661, + "loss": 0.0072, + "num_input_tokens_seen": 175621504, + "step": 81375 + }, + { + "epoch": 13.275693311582382, + "grad_norm": 0.036007918417453766, + "learning_rate": 0.0003066018526433914, + "loss": 0.0029, + "num_input_tokens_seen": 175632096, + "step": 81380 + }, + { + "epoch": 13.276508972267537, + "grad_norm": 0.0018731800373643637, + "learning_rate": 0.00030653621504907533, + "loss": 0.0823, + "num_input_tokens_seen": 175643232, + "step": 81385 + }, + { + "epoch": 13.277324632952691, + "grad_norm": 0.0026089863386005163, + "learning_rate": 0.0003064705813755483, + "loss": 0.0726, + "num_input_tokens_seen": 175654016, + "step": 81390 + }, + { + "epoch": 13.278140293637847, + "grad_norm": 0.00018611103587318212, + "learning_rate": 0.0003064049516241405, + "loss": 0.0844, + "num_input_tokens_seen": 175665440, + "step": 81395 + }, + { + "epoch": 13.278955954323001, + "grad_norm": 0.0011685257777571678, + "learning_rate": 0.00030633932579618195, + "loss": 0.0212, + "num_input_tokens_seen": 175676288, + "step": 81400 + }, + { + "epoch": 13.279771615008157, + "grad_norm": 0.0007880550692789257, + "learning_rate": 0.00030627370389300256, + "loss": 0.0042, + "num_input_tokens_seen": 175686592, + "step": 81405 + }, + { + "epoch": 13.280587275693312, + "grad_norm": 0.002316003432497382, + "learning_rate": 0.0003062080859159323, + "loss": 0.0026, + "num_input_tokens_seen": 175698112, + "step": 81410 + }, + { + "epoch": 13.281402936378466, + "grad_norm": 0.9575821161270142, + "learning_rate": 0.0003061424718663011, + "loss": 0.1066, + "num_input_tokens_seen": 175709728, + "step": 81415 + }, + { + "epoch": 13.282218597063622, + "grad_norm": 0.013802909292280674, + "learning_rate": 0.00030607686174543864, + "loss": 0.0206, + "num_input_tokens_seen": 175719968, + "step": 81420 + }, + { + "epoch": 13.283034257748776, + "grad_norm": 0.16165804862976074, + "learning_rate": 0.00030601125555467456, + "loss": 0.0075, + "num_input_tokens_seen": 175731072, + "step": 81425 + }, + { + "epoch": 13.283849918433932, + "grad_norm": 0.12735773622989655, + "learning_rate": 0.0003059456532953385, + "loss": 0.0162, + "num_input_tokens_seen": 175740768, + "step": 81430 + }, + { + "epoch": 13.284665579119087, + "grad_norm": 0.005981746595352888, + "learning_rate": 0.00030588005496876, + "loss": 0.0022, + "num_input_tokens_seen": 175751168, + "step": 81435 + }, + { + "epoch": 13.285481239804241, + "grad_norm": 0.3235642910003662, + "learning_rate": 0.00030581446057626827, + "loss": 0.0109, + "num_input_tokens_seen": 175762496, + "step": 81440 + }, + { + "epoch": 13.286296900489397, + "grad_norm": 0.016444915905594826, + "learning_rate": 0.00030574887011919306, + "loss": 0.0034, + "num_input_tokens_seen": 175773312, + "step": 81445 + }, + { + "epoch": 13.28711256117455, + "grad_norm": 0.005516483448445797, + "learning_rate": 0.0003056832835988632, + "loss": 0.0088, + "num_input_tokens_seen": 175784960, + "step": 81450 + }, + { + "epoch": 13.287928221859707, + "grad_norm": 0.01270576473325491, + "learning_rate": 0.00030561770101660837, + "loss": 0.0126, + "num_input_tokens_seen": 175796192, + "step": 81455 + }, + { + "epoch": 13.28874388254486, + "grad_norm": 0.08100481331348419, + "learning_rate": 0.0003055521223737572, + "loss": 0.0133, + "num_input_tokens_seen": 175807680, + "step": 81460 + }, + { + "epoch": 13.289559543230016, + "grad_norm": 0.2288615107536316, + "learning_rate": 0.0003054865476716391, + "loss": 0.1161, + "num_input_tokens_seen": 175816288, + "step": 81465 + }, + { + "epoch": 13.290375203915172, + "grad_norm": 0.01433555968105793, + "learning_rate": 0.0003054209769115827, + "loss": 0.1251, + "num_input_tokens_seen": 175826784, + "step": 81470 + }, + { + "epoch": 13.291190864600326, + "grad_norm": 0.016465958207845688, + "learning_rate": 0.0003053554100949173, + "loss": 0.0035, + "num_input_tokens_seen": 175836384, + "step": 81475 + }, + { + "epoch": 13.292006525285482, + "grad_norm": 0.0012167328968644142, + "learning_rate": 0.0003052898472229711, + "loss": 0.056, + "num_input_tokens_seen": 175848096, + "step": 81480 + }, + { + "epoch": 13.292822185970635, + "grad_norm": 0.020880959928035736, + "learning_rate": 0.0003052242882970735, + "loss": 0.0062, + "num_input_tokens_seen": 175858784, + "step": 81485 + }, + { + "epoch": 13.293637846655791, + "grad_norm": 0.0016183634288609028, + "learning_rate": 0.0003051587333185525, + "loss": 0.0102, + "num_input_tokens_seen": 175869792, + "step": 81490 + }, + { + "epoch": 13.294453507340947, + "grad_norm": 0.005147203803062439, + "learning_rate": 0.00030509318228873715, + "loss": 0.1188, + "num_input_tokens_seen": 175880544, + "step": 81495 + }, + { + "epoch": 13.2952691680261, + "grad_norm": 0.0014298699097707868, + "learning_rate": 0.00030502763520895556, + "loss": 0.0047, + "num_input_tokens_seen": 175890816, + "step": 81500 + }, + { + "epoch": 13.296084828711257, + "grad_norm": 0.03950461745262146, + "learning_rate": 0.00030496209208053643, + "loss": 0.0035, + "num_input_tokens_seen": 175901024, + "step": 81505 + }, + { + "epoch": 13.29690048939641, + "grad_norm": 0.043528053909540176, + "learning_rate": 0.0003048965529048078, + "loss": 0.0364, + "num_input_tokens_seen": 175910752, + "step": 81510 + }, + { + "epoch": 13.297716150081566, + "grad_norm": 0.0003105907526332885, + "learning_rate": 0.00030483101768309797, + "loss": 0.0015, + "num_input_tokens_seen": 175921952, + "step": 81515 + }, + { + "epoch": 13.298531810766722, + "grad_norm": 0.004460294730961323, + "learning_rate": 0.00030476548641673537, + "loss": 0.0068, + "num_input_tokens_seen": 175932896, + "step": 81520 + }, + { + "epoch": 13.299347471451876, + "grad_norm": 0.006911998614668846, + "learning_rate": 0.0003046999591070476, + "loss": 0.0014, + "num_input_tokens_seen": 175942432, + "step": 81525 + }, + { + "epoch": 13.300163132137031, + "grad_norm": 0.004108709283173084, + "learning_rate": 0.0003046344357553632, + "loss": 0.0012, + "num_input_tokens_seen": 175953120, + "step": 81530 + }, + { + "epoch": 13.300978792822185, + "grad_norm": 0.08252222836017609, + "learning_rate": 0.0003045689163630095, + "loss": 0.0106, + "num_input_tokens_seen": 175965056, + "step": 81535 + }, + { + "epoch": 13.301794453507341, + "grad_norm": 0.016474295407533646, + "learning_rate": 0.000304503400931315, + "loss": 0.0784, + "num_input_tokens_seen": 175976864, + "step": 81540 + }, + { + "epoch": 13.302610114192497, + "grad_norm": 0.009339535608887672, + "learning_rate": 0.00030443788946160676, + "loss": 0.0339, + "num_input_tokens_seen": 175986688, + "step": 81545 + }, + { + "epoch": 13.30342577487765, + "grad_norm": 0.06328590214252472, + "learning_rate": 0.000304372381955213, + "loss": 0.0043, + "num_input_tokens_seen": 175996704, + "step": 81550 + }, + { + "epoch": 13.304241435562806, + "grad_norm": 0.09201286733150482, + "learning_rate": 0.00030430687841346096, + "loss": 0.0116, + "num_input_tokens_seen": 176006912, + "step": 81555 + }, + { + "epoch": 13.30505709624796, + "grad_norm": 0.006636832375079393, + "learning_rate": 0.00030424137883767826, + "loss": 0.0027, + "num_input_tokens_seen": 176018272, + "step": 81560 + }, + { + "epoch": 13.305872756933116, + "grad_norm": 0.0013254358200356364, + "learning_rate": 0.00030417588322919243, + "loss": 0.0212, + "num_input_tokens_seen": 176028192, + "step": 81565 + }, + { + "epoch": 13.30668841761827, + "grad_norm": 0.014928774908185005, + "learning_rate": 0.00030411039158933075, + "loss": 0.0029, + "num_input_tokens_seen": 176039392, + "step": 81570 + }, + { + "epoch": 13.307504078303426, + "grad_norm": 0.008163879625499249, + "learning_rate": 0.0003040449039194205, + "loss": 0.0071, + "num_input_tokens_seen": 176049696, + "step": 81575 + }, + { + "epoch": 13.308319738988581, + "grad_norm": 0.008085324428975582, + "learning_rate": 0.00030397942022078884, + "loss": 0.0087, + "num_input_tokens_seen": 176060576, + "step": 81580 + }, + { + "epoch": 13.309135399673735, + "grad_norm": 0.0035446041729301214, + "learning_rate": 0.00030391394049476275, + "loss": 0.002, + "num_input_tokens_seen": 176070944, + "step": 81585 + }, + { + "epoch": 13.309951060358891, + "grad_norm": 0.01982569508254528, + "learning_rate": 0.00030384846474266965, + "loss": 0.0073, + "num_input_tokens_seen": 176082240, + "step": 81590 + }, + { + "epoch": 13.310766721044045, + "grad_norm": 0.007497382815927267, + "learning_rate": 0.0003037829929658361, + "loss": 0.0098, + "num_input_tokens_seen": 176093888, + "step": 81595 + }, + { + "epoch": 13.3115823817292, + "grad_norm": 0.003676018211990595, + "learning_rate": 0.0003037175251655892, + "loss": 0.0931, + "num_input_tokens_seen": 176104160, + "step": 81600 + }, + { + "epoch": 13.312398042414356, + "grad_norm": 0.015133029781281948, + "learning_rate": 0.0003036520613432555, + "loss": 0.0019, + "num_input_tokens_seen": 176114816, + "step": 81605 + }, + { + "epoch": 13.31321370309951, + "grad_norm": 0.07282832264900208, + "learning_rate": 0.0003035866015001621, + "loss": 0.0038, + "num_input_tokens_seen": 176125216, + "step": 81610 + }, + { + "epoch": 13.314029363784666, + "grad_norm": 0.02972259745001793, + "learning_rate": 0.00030352114563763515, + "loss": 0.0049, + "num_input_tokens_seen": 176136320, + "step": 81615 + }, + { + "epoch": 13.31484502446982, + "grad_norm": 0.5493693351745605, + "learning_rate": 0.00030345569375700145, + "loss": 0.1333, + "num_input_tokens_seen": 176147040, + "step": 81620 + }, + { + "epoch": 13.315660685154976, + "grad_norm": 0.3229658901691437, + "learning_rate": 0.0003033902458595877, + "loss": 0.0104, + "num_input_tokens_seen": 176158848, + "step": 81625 + }, + { + "epoch": 13.31647634584013, + "grad_norm": 0.006957527715712786, + "learning_rate": 0.00030332480194671975, + "loss": 0.0489, + "num_input_tokens_seen": 176169504, + "step": 81630 + }, + { + "epoch": 13.317292006525285, + "grad_norm": 0.010391506366431713, + "learning_rate": 0.0003032593620197245, + "loss": 0.0026, + "num_input_tokens_seen": 176180480, + "step": 81635 + }, + { + "epoch": 13.318107667210441, + "grad_norm": 0.1524864137172699, + "learning_rate": 0.0003031939260799276, + "loss": 0.0043, + "num_input_tokens_seen": 176192288, + "step": 81640 + }, + { + "epoch": 13.318923327895595, + "grad_norm": 0.006265328265726566, + "learning_rate": 0.00030312849412865564, + "loss": 0.0017, + "num_input_tokens_seen": 176203072, + "step": 81645 + }, + { + "epoch": 13.31973898858075, + "grad_norm": 0.00305930245667696, + "learning_rate": 0.00030306306616723424, + "loss": 0.0089, + "num_input_tokens_seen": 176213568, + "step": 81650 + }, + { + "epoch": 13.320554649265905, + "grad_norm": 0.005070611368864775, + "learning_rate": 0.00030299764219698987, + "loss": 0.0043, + "num_input_tokens_seen": 176224032, + "step": 81655 + }, + { + "epoch": 13.32137030995106, + "grad_norm": 0.02538062259554863, + "learning_rate": 0.00030293222221924805, + "loss": 0.1277, + "num_input_tokens_seen": 176234368, + "step": 81660 + }, + { + "epoch": 13.322185970636216, + "grad_norm": 0.059279683977365494, + "learning_rate": 0.0003028668062353349, + "loss": 0.022, + "num_input_tokens_seen": 176244992, + "step": 81665 + }, + { + "epoch": 13.32300163132137, + "grad_norm": 0.0008893334306776524, + "learning_rate": 0.0003028013942465758, + "loss": 0.0216, + "num_input_tokens_seen": 176254752, + "step": 81670 + }, + { + "epoch": 13.323817292006526, + "grad_norm": 0.012717152945697308, + "learning_rate": 0.00030273598625429687, + "loss": 0.0011, + "num_input_tokens_seen": 176265696, + "step": 81675 + }, + { + "epoch": 13.32463295269168, + "grad_norm": 0.026965685188770294, + "learning_rate": 0.00030267058225982315, + "loss": 0.0026, + "num_input_tokens_seen": 176276064, + "step": 81680 + }, + { + "epoch": 13.325448613376835, + "grad_norm": 0.014691062271595001, + "learning_rate": 0.00030260518226448064, + "loss": 0.0073, + "num_input_tokens_seen": 176286272, + "step": 81685 + }, + { + "epoch": 13.326264274061991, + "grad_norm": 0.05481749773025513, + "learning_rate": 0.00030253978626959435, + "loss": 0.056, + "num_input_tokens_seen": 176297344, + "step": 81690 + }, + { + "epoch": 13.327079934747145, + "grad_norm": 0.002554496983066201, + "learning_rate": 0.00030247439427649, + "loss": 0.0049, + "num_input_tokens_seen": 176308896, + "step": 81695 + }, + { + "epoch": 13.3278955954323, + "grad_norm": 0.0037879778537899256, + "learning_rate": 0.0003024090062864924, + "loss": 0.0032, + "num_input_tokens_seen": 176319328, + "step": 81700 + }, + { + "epoch": 13.328711256117455, + "grad_norm": 0.00032335746800526977, + "learning_rate": 0.00030234362230092705, + "loss": 0.0059, + "num_input_tokens_seen": 176330304, + "step": 81705 + }, + { + "epoch": 13.32952691680261, + "grad_norm": 0.003908625338226557, + "learning_rate": 0.0003022782423211189, + "loss": 0.0048, + "num_input_tokens_seen": 176340448, + "step": 81710 + }, + { + "epoch": 13.330342577487766, + "grad_norm": 0.012974311597645283, + "learning_rate": 0.0003022128663483931, + "loss": 0.0242, + "num_input_tokens_seen": 176351296, + "step": 81715 + }, + { + "epoch": 13.33115823817292, + "grad_norm": 0.1014171615242958, + "learning_rate": 0.0003021474943840743, + "loss": 0.0158, + "num_input_tokens_seen": 176362304, + "step": 81720 + }, + { + "epoch": 13.331973898858076, + "grad_norm": 0.031628891825675964, + "learning_rate": 0.00030208212642948755, + "loss": 0.0103, + "num_input_tokens_seen": 176371168, + "step": 81725 + }, + { + "epoch": 13.33278955954323, + "grad_norm": 0.002329864539206028, + "learning_rate": 0.0003020167624859577, + "loss": 0.1031, + "num_input_tokens_seen": 176381984, + "step": 81730 + }, + { + "epoch": 13.333605220228385, + "grad_norm": 0.034606002271175385, + "learning_rate": 0.00030195140255480927, + "loss": 0.0179, + "num_input_tokens_seen": 176393120, + "step": 81735 + }, + { + "epoch": 13.33442088091354, + "grad_norm": 0.11859557777643204, + "learning_rate": 0.0003018860466373669, + "loss": 0.0118, + "num_input_tokens_seen": 176404128, + "step": 81740 + }, + { + "epoch": 13.335236541598695, + "grad_norm": 0.2893815338611603, + "learning_rate": 0.0003018206947349551, + "loss": 0.044, + "num_input_tokens_seen": 176414016, + "step": 81745 + }, + { + "epoch": 13.33605220228385, + "grad_norm": 0.03494579344987869, + "learning_rate": 0.00030175534684889836, + "loss": 0.0102, + "num_input_tokens_seen": 176425376, + "step": 81750 + }, + { + "epoch": 13.336867862969005, + "grad_norm": 0.0027860805857926607, + "learning_rate": 0.00030169000298052096, + "loss": 0.0032, + "num_input_tokens_seen": 176439104, + "step": 81755 + }, + { + "epoch": 13.33768352365416, + "grad_norm": 0.0014131362549960613, + "learning_rate": 0.00030162466313114734, + "loss": 0.0038, + "num_input_tokens_seen": 176448800, + "step": 81760 + }, + { + "epoch": 13.338499184339314, + "grad_norm": 0.003500349121168256, + "learning_rate": 0.00030155932730210145, + "loss": 0.0021, + "num_input_tokens_seen": 176459584, + "step": 81765 + }, + { + "epoch": 13.33931484502447, + "grad_norm": 0.03195258602499962, + "learning_rate": 0.00030149399549470767, + "loss": 0.003, + "num_input_tokens_seen": 176469728, + "step": 81770 + }, + { + "epoch": 13.340130505709626, + "grad_norm": 0.07189842313528061, + "learning_rate": 0.00030142866771028974, + "loss": 0.0292, + "num_input_tokens_seen": 176480320, + "step": 81775 + }, + { + "epoch": 13.34094616639478, + "grad_norm": 0.006430008448660374, + "learning_rate": 0.00030136334395017197, + "loss": 0.0019, + "num_input_tokens_seen": 176491392, + "step": 81780 + }, + { + "epoch": 13.341761827079935, + "grad_norm": 0.006648715119808912, + "learning_rate": 0.0003012980242156778, + "loss": 0.0055, + "num_input_tokens_seen": 176501856, + "step": 81785 + }, + { + "epoch": 13.34257748776509, + "grad_norm": 0.0026397390756756067, + "learning_rate": 0.00030123270850813147, + "loss": 0.0272, + "num_input_tokens_seen": 176511616, + "step": 81790 + }, + { + "epoch": 13.343393148450245, + "grad_norm": 0.006876611616462469, + "learning_rate": 0.0003011673968288562, + "loss": 0.0071, + "num_input_tokens_seen": 176522688, + "step": 81795 + }, + { + "epoch": 13.3442088091354, + "grad_norm": 0.040792930871248245, + "learning_rate": 0.00030110208917917607, + "loss": 0.0028, + "num_input_tokens_seen": 176534496, + "step": 81800 + }, + { + "epoch": 13.345024469820554, + "grad_norm": 0.3827681243419647, + "learning_rate": 0.00030103678556041427, + "loss": 0.0408, + "num_input_tokens_seen": 176545600, + "step": 81805 + }, + { + "epoch": 13.34584013050571, + "grad_norm": 0.010907374322414398, + "learning_rate": 0.00030097148597389456, + "loss": 0.0248, + "num_input_tokens_seen": 176555904, + "step": 81810 + }, + { + "epoch": 13.346655791190864, + "grad_norm": 0.008228674530982971, + "learning_rate": 0.00030090619042094, + "loss": 0.0018, + "num_input_tokens_seen": 176567392, + "step": 81815 + }, + { + "epoch": 13.34747145187602, + "grad_norm": 0.021312372758984566, + "learning_rate": 0.0003008408989028743, + "loss": 0.0024, + "num_input_tokens_seen": 176577536, + "step": 81820 + }, + { + "epoch": 13.348287112561174, + "grad_norm": 0.00636378163471818, + "learning_rate": 0.00030077561142102024, + "loss": 0.1076, + "num_input_tokens_seen": 176586624, + "step": 81825 + }, + { + "epoch": 13.34910277324633, + "grad_norm": 0.1759309619665146, + "learning_rate": 0.0003007103279767013, + "loss": 0.0125, + "num_input_tokens_seen": 176597216, + "step": 81830 + }, + { + "epoch": 13.349918433931485, + "grad_norm": 0.7431128621101379, + "learning_rate": 0.0003006450485712402, + "loss": 0.0671, + "num_input_tokens_seen": 176608416, + "step": 81835 + }, + { + "epoch": 13.350734094616639, + "grad_norm": 0.042946916073560715, + "learning_rate": 0.00030057977320596007, + "loss": 0.0086, + "num_input_tokens_seen": 176619104, + "step": 81840 + }, + { + "epoch": 13.351549755301795, + "grad_norm": 0.0018359622918069363, + "learning_rate": 0.00030051450188218397, + "loss": 0.0014, + "num_input_tokens_seen": 176629952, + "step": 81845 + }, + { + "epoch": 13.352365415986949, + "grad_norm": 0.013308551162481308, + "learning_rate": 0.0003004492346012345, + "loss": 0.0417, + "num_input_tokens_seen": 176639744, + "step": 81850 + }, + { + "epoch": 13.353181076672104, + "grad_norm": 0.00039765582187101245, + "learning_rate": 0.0003003839713644345, + "loss": 0.0038, + "num_input_tokens_seen": 176651392, + "step": 81855 + }, + { + "epoch": 13.35399673735726, + "grad_norm": 0.02820494771003723, + "learning_rate": 0.0003003187121731064, + "loss": 0.0111, + "num_input_tokens_seen": 176662208, + "step": 81860 + }, + { + "epoch": 13.354812398042414, + "grad_norm": 0.0023787389509379864, + "learning_rate": 0.0003002534570285731, + "loss": 0.0062, + "num_input_tokens_seen": 176675232, + "step": 81865 + }, + { + "epoch": 13.35562805872757, + "grad_norm": 0.007850063033401966, + "learning_rate": 0.00030018820593215675, + "loss": 0.0107, + "num_input_tokens_seen": 176686304, + "step": 81870 + }, + { + "epoch": 13.356443719412724, + "grad_norm": 0.012238041497766972, + "learning_rate": 0.0003001229588851799, + "loss": 0.0026, + "num_input_tokens_seen": 176696224, + "step": 81875 + }, + { + "epoch": 13.35725938009788, + "grad_norm": 0.04785846546292305, + "learning_rate": 0.0003000577158889649, + "loss": 0.0028, + "num_input_tokens_seen": 176707264, + "step": 81880 + }, + { + "epoch": 13.358075040783035, + "grad_norm": 0.05561709776520729, + "learning_rate": 0.00029999247694483395, + "loss": 0.0313, + "num_input_tokens_seen": 176719264, + "step": 81885 + }, + { + "epoch": 13.358890701468189, + "grad_norm": 0.00877333339303732, + "learning_rate": 0.00029992724205410914, + "loss": 0.0018, + "num_input_tokens_seen": 176730944, + "step": 81890 + }, + { + "epoch": 13.359706362153345, + "grad_norm": 0.019921107217669487, + "learning_rate": 0.0002998620112181126, + "loss": 0.002, + "num_input_tokens_seen": 176741568, + "step": 81895 + }, + { + "epoch": 13.360522022838499, + "grad_norm": 0.00033841366530396044, + "learning_rate": 0.0002997967844381662, + "loss": 0.0014, + "num_input_tokens_seen": 176751552, + "step": 81900 + }, + { + "epoch": 13.361337683523654, + "grad_norm": 0.005924389231950045, + "learning_rate": 0.00029973156171559214, + "loss": 0.019, + "num_input_tokens_seen": 176761152, + "step": 81905 + }, + { + "epoch": 13.362153344208808, + "grad_norm": 0.47133365273475647, + "learning_rate": 0.0002996663430517118, + "loss": 0.1029, + "num_input_tokens_seen": 176771648, + "step": 81910 + }, + { + "epoch": 13.362969004893964, + "grad_norm": 0.02057645097374916, + "learning_rate": 0.0002996011284478474, + "loss": 0.0026, + "num_input_tokens_seen": 176781664, + "step": 81915 + }, + { + "epoch": 13.36378466557912, + "grad_norm": 0.032185424119234085, + "learning_rate": 0.00029953591790532014, + "loss": 0.0072, + "num_input_tokens_seen": 176792512, + "step": 81920 + }, + { + "epoch": 13.364600326264274, + "grad_norm": 0.019510075449943542, + "learning_rate": 0.000299470711425452, + "loss": 0.0049, + "num_input_tokens_seen": 176803488, + "step": 81925 + }, + { + "epoch": 13.36541598694943, + "grad_norm": 0.0006918744766153395, + "learning_rate": 0.0002994055090095641, + "loss": 0.0101, + "num_input_tokens_seen": 176813632, + "step": 81930 + }, + { + "epoch": 13.366231647634583, + "grad_norm": 0.0007388674421235919, + "learning_rate": 0.00029934031065897824, + "loss": 0.0197, + "num_input_tokens_seen": 176825472, + "step": 81935 + }, + { + "epoch": 13.367047308319739, + "grad_norm": 0.011729322373867035, + "learning_rate": 0.00029927511637501536, + "loss": 0.0073, + "num_input_tokens_seen": 176835776, + "step": 81940 + }, + { + "epoch": 13.367862969004895, + "grad_norm": 0.015458999201655388, + "learning_rate": 0.0002992099261589968, + "loss": 0.1287, + "num_input_tokens_seen": 176847744, + "step": 81945 + }, + { + "epoch": 13.368678629690049, + "grad_norm": 0.16085614264011383, + "learning_rate": 0.00029914474001224413, + "loss": 0.0177, + "num_input_tokens_seen": 176858368, + "step": 81950 + }, + { + "epoch": 13.369494290375204, + "grad_norm": 0.0015722792595624924, + "learning_rate": 0.0002990795579360778, + "loss": 0.003, + "num_input_tokens_seen": 176868448, + "step": 81955 + }, + { + "epoch": 13.370309951060358, + "grad_norm": 0.005608657840639353, + "learning_rate": 0.00029901437993181936, + "loss": 0.0015, + "num_input_tokens_seen": 176879008, + "step": 81960 + }, + { + "epoch": 13.371125611745514, + "grad_norm": 0.028552208095788956, + "learning_rate": 0.0002989492060007893, + "loss": 0.0043, + "num_input_tokens_seen": 176889056, + "step": 81965 + }, + { + "epoch": 13.37194127243067, + "grad_norm": 0.015366587787866592, + "learning_rate": 0.0002988840361443088, + "loss": 0.0235, + "num_input_tokens_seen": 176899008, + "step": 81970 + }, + { + "epoch": 13.372756933115824, + "grad_norm": 0.007630058564245701, + "learning_rate": 0.0002988188703636983, + "loss": 0.0064, + "num_input_tokens_seen": 176909856, + "step": 81975 + }, + { + "epoch": 13.37357259380098, + "grad_norm": 0.0013361324090510607, + "learning_rate": 0.0002987537086602787, + "loss": 0.01, + "num_input_tokens_seen": 176919648, + "step": 81980 + }, + { + "epoch": 13.374388254486133, + "grad_norm": 0.0017252914840355515, + "learning_rate": 0.0002986885510353703, + "loss": 0.0339, + "num_input_tokens_seen": 176929536, + "step": 81985 + }, + { + "epoch": 13.375203915171289, + "grad_norm": 0.000846170587465167, + "learning_rate": 0.00029862339749029413, + "loss": 0.003, + "num_input_tokens_seen": 176941152, + "step": 81990 + }, + { + "epoch": 13.376019575856443, + "grad_norm": 0.00298714661039412, + "learning_rate": 0.0002985582480263699, + "loss": 0.0674, + "num_input_tokens_seen": 176951328, + "step": 81995 + }, + { + "epoch": 13.376835236541599, + "grad_norm": 0.011061177588999271, + "learning_rate": 0.00029849310264491865, + "loss": 0.0517, + "num_input_tokens_seen": 176962496, + "step": 82000 + }, + { + "epoch": 13.377650897226754, + "grad_norm": 0.00428803451359272, + "learning_rate": 0.00029842796134726, + "loss": 0.0033, + "num_input_tokens_seen": 176972736, + "step": 82005 + }, + { + "epoch": 13.378466557911908, + "grad_norm": 0.0022315382957458496, + "learning_rate": 0.0002983628241347147, + "loss": 0.003, + "num_input_tokens_seen": 176983296, + "step": 82010 + }, + { + "epoch": 13.379282218597064, + "grad_norm": 0.0032017696648836136, + "learning_rate": 0.0002982976910086024, + "loss": 0.0101, + "num_input_tokens_seen": 176993824, + "step": 82015 + }, + { + "epoch": 13.380097879282218, + "grad_norm": 0.001729697803966701, + "learning_rate": 0.0002982325619702433, + "loss": 0.088, + "num_input_tokens_seen": 177003488, + "step": 82020 + }, + { + "epoch": 13.380913539967374, + "grad_norm": 0.021472996100783348, + "learning_rate": 0.0002981674370209573, + "loss": 0.0231, + "num_input_tokens_seen": 177014304, + "step": 82025 + }, + { + "epoch": 13.38172920065253, + "grad_norm": 0.1987244188785553, + "learning_rate": 0.00029810231616206426, + "loss": 0.0117, + "num_input_tokens_seen": 177024352, + "step": 82030 + }, + { + "epoch": 13.382544861337683, + "grad_norm": 0.01773775927722454, + "learning_rate": 0.00029803719939488387, + "loss": 0.0026, + "num_input_tokens_seen": 177034944, + "step": 82035 + }, + { + "epoch": 13.383360522022839, + "grad_norm": 0.09902945905923843, + "learning_rate": 0.0002979720867207358, + "loss": 0.0055, + "num_input_tokens_seen": 177046816, + "step": 82040 + }, + { + "epoch": 13.384176182707993, + "grad_norm": 0.00544345797970891, + "learning_rate": 0.0002979069781409397, + "loss": 0.007, + "num_input_tokens_seen": 177056384, + "step": 82045 + }, + { + "epoch": 13.384991843393149, + "grad_norm": 0.004392101429402828, + "learning_rate": 0.00029784187365681516, + "loss": 0.0027, + "num_input_tokens_seen": 177067520, + "step": 82050 + }, + { + "epoch": 13.385807504078304, + "grad_norm": 0.054380178451538086, + "learning_rate": 0.00029777677326968144, + "loss": 0.0052, + "num_input_tokens_seen": 177077984, + "step": 82055 + }, + { + "epoch": 13.386623164763458, + "grad_norm": 1.74069082736969, + "learning_rate": 0.0002977116769808579, + "loss": 0.0292, + "num_input_tokens_seen": 177088384, + "step": 82060 + }, + { + "epoch": 13.387438825448614, + "grad_norm": 0.016861742362380028, + "learning_rate": 0.000297646584791664, + "loss": 0.0054, + "num_input_tokens_seen": 177100160, + "step": 82065 + }, + { + "epoch": 13.388254486133768, + "grad_norm": 0.07194247841835022, + "learning_rate": 0.0002975814967034185, + "loss": 0.0471, + "num_input_tokens_seen": 177110880, + "step": 82070 + }, + { + "epoch": 13.389070146818923, + "grad_norm": 0.00023437743948306888, + "learning_rate": 0.000297516412717441, + "loss": 0.0039, + "num_input_tokens_seen": 177120832, + "step": 82075 + }, + { + "epoch": 13.38988580750408, + "grad_norm": 0.0444839708507061, + "learning_rate": 0.0002974513328350501, + "loss": 0.0208, + "num_input_tokens_seen": 177132192, + "step": 82080 + }, + { + "epoch": 13.390701468189233, + "grad_norm": 0.4914010167121887, + "learning_rate": 0.00029738625705756514, + "loss": 0.0112, + "num_input_tokens_seen": 177143008, + "step": 82085 + }, + { + "epoch": 13.391517128874389, + "grad_norm": 0.013099887408316135, + "learning_rate": 0.0002973211853863044, + "loss": 0.0016, + "num_input_tokens_seen": 177152384, + "step": 82090 + }, + { + "epoch": 13.392332789559543, + "grad_norm": 0.4502934515476227, + "learning_rate": 0.0002972561178225872, + "loss": 0.0226, + "num_input_tokens_seen": 177164608, + "step": 82095 + }, + { + "epoch": 13.393148450244698, + "grad_norm": 0.0002769233542494476, + "learning_rate": 0.00029719105436773187, + "loss": 0.0577, + "num_input_tokens_seen": 177176320, + "step": 82100 + }, + { + "epoch": 13.393964110929852, + "grad_norm": 0.0038608969189226627, + "learning_rate": 0.00029712599502305714, + "loss": 0.0016, + "num_input_tokens_seen": 177186656, + "step": 82105 + }, + { + "epoch": 13.394779771615008, + "grad_norm": 0.016602711752057076, + "learning_rate": 0.0002970609397898814, + "loss": 0.018, + "num_input_tokens_seen": 177197536, + "step": 82110 + }, + { + "epoch": 13.395595432300164, + "grad_norm": 0.002177624264732003, + "learning_rate": 0.0002969958886695233, + "loss": 0.0061, + "num_input_tokens_seen": 177207712, + "step": 82115 + }, + { + "epoch": 13.396411092985318, + "grad_norm": 0.0012109042145311832, + "learning_rate": 0.00029693084166330084, + "loss": 0.0012, + "num_input_tokens_seen": 177218208, + "step": 82120 + }, + { + "epoch": 13.397226753670473, + "grad_norm": 0.014404046349227428, + "learning_rate": 0.00029686579877253276, + "loss": 0.0696, + "num_input_tokens_seen": 177230240, + "step": 82125 + }, + { + "epoch": 13.398042414355627, + "grad_norm": 0.004391298163682222, + "learning_rate": 0.0002968007599985367, + "loss": 0.005, + "num_input_tokens_seen": 177241088, + "step": 82130 + }, + { + "epoch": 13.398858075040783, + "grad_norm": 0.001393658109009266, + "learning_rate": 0.0002967357253426313, + "loss": 0.0138, + "num_input_tokens_seen": 177252256, + "step": 82135 + }, + { + "epoch": 13.399673735725939, + "grad_norm": 0.0024417508393526077, + "learning_rate": 0.000296670694806134, + "loss": 0.0125, + "num_input_tokens_seen": 177262784, + "step": 82140 + }, + { + "epoch": 13.400489396411093, + "grad_norm": 0.007032493129372597, + "learning_rate": 0.00029660566839036315, + "loss": 0.0592, + "num_input_tokens_seen": 177272704, + "step": 82145 + }, + { + "epoch": 13.401305057096248, + "grad_norm": 0.006930625066161156, + "learning_rate": 0.0002965406460966364, + "loss": 0.1296, + "num_input_tokens_seen": 177283680, + "step": 82150 + }, + { + "epoch": 13.402120717781402, + "grad_norm": 0.07191547751426697, + "learning_rate": 0.00029647562792627145, + "loss": 0.0067, + "num_input_tokens_seen": 177293696, + "step": 82155 + }, + { + "epoch": 13.402936378466558, + "grad_norm": 1.0311527252197266, + "learning_rate": 0.0002964106138805864, + "loss": 0.0717, + "num_input_tokens_seen": 177303904, + "step": 82160 + }, + { + "epoch": 13.403752039151712, + "grad_norm": 0.013045537285506725, + "learning_rate": 0.00029634560396089827, + "loss": 0.0026, + "num_input_tokens_seen": 177314880, + "step": 82165 + }, + { + "epoch": 13.404567699836868, + "grad_norm": 0.006351914722472429, + "learning_rate": 0.00029628059816852497, + "loss": 0.0105, + "num_input_tokens_seen": 177325440, + "step": 82170 + }, + { + "epoch": 13.405383360522023, + "grad_norm": 0.00444161519408226, + "learning_rate": 0.0002962155965047837, + "loss": 0.0035, + "num_input_tokens_seen": 177336896, + "step": 82175 + }, + { + "epoch": 13.406199021207177, + "grad_norm": 0.031723037362098694, + "learning_rate": 0.00029615059897099196, + "loss": 0.0102, + "num_input_tokens_seen": 177348288, + "step": 82180 + }, + { + "epoch": 13.407014681892333, + "grad_norm": 0.0010078820632770658, + "learning_rate": 0.0002960856055684668, + "loss": 0.0007, + "num_input_tokens_seen": 177359104, + "step": 82185 + }, + { + "epoch": 13.407830342577487, + "grad_norm": 0.0013822754845023155, + "learning_rate": 0.0002960206162985256, + "loss": 0.0014, + "num_input_tokens_seen": 177369952, + "step": 82190 + }, + { + "epoch": 13.408646003262643, + "grad_norm": 0.0010158444056287408, + "learning_rate": 0.0002959556311624855, + "loss": 0.0017, + "num_input_tokens_seen": 177380864, + "step": 82195 + }, + { + "epoch": 13.409461663947798, + "grad_norm": 0.0012606596574187279, + "learning_rate": 0.0002958906501616632, + "loss": 0.0018, + "num_input_tokens_seen": 177391136, + "step": 82200 + }, + { + "epoch": 13.410277324632952, + "grad_norm": 0.048560187220573425, + "learning_rate": 0.0002958256732973759, + "loss": 0.0035, + "num_input_tokens_seen": 177401408, + "step": 82205 + }, + { + "epoch": 13.411092985318108, + "grad_norm": 0.01944819837808609, + "learning_rate": 0.00029576070057094034, + "loss": 0.0023, + "num_input_tokens_seen": 177412096, + "step": 82210 + }, + { + "epoch": 13.411908646003262, + "grad_norm": 0.0052896663546562195, + "learning_rate": 0.00029569573198367317, + "loss": 0.0015, + "num_input_tokens_seen": 177422976, + "step": 82215 + }, + { + "epoch": 13.412724306688418, + "grad_norm": 0.0008618422434665263, + "learning_rate": 0.00029563076753689137, + "loss": 0.0022, + "num_input_tokens_seen": 177434912, + "step": 82220 + }, + { + "epoch": 13.413539967373573, + "grad_norm": 0.003337537171319127, + "learning_rate": 0.00029556580723191116, + "loss": 0.0171, + "num_input_tokens_seen": 177446080, + "step": 82225 + }, + { + "epoch": 13.414355628058727, + "grad_norm": 0.0009359652176499367, + "learning_rate": 0.00029550085107004937, + "loss": 0.0041, + "num_input_tokens_seen": 177457152, + "step": 82230 + }, + { + "epoch": 13.415171288743883, + "grad_norm": 0.0007921846117824316, + "learning_rate": 0.0002954358990526221, + "loss": 0.027, + "num_input_tokens_seen": 177467264, + "step": 82235 + }, + { + "epoch": 13.415986949429037, + "grad_norm": 0.000646235654130578, + "learning_rate": 0.000295370951180946, + "loss": 0.0051, + "num_input_tokens_seen": 177478176, + "step": 82240 + }, + { + "epoch": 13.416802610114193, + "grad_norm": 0.008427003398537636, + "learning_rate": 0.00029530600745633693, + "loss": 0.0236, + "num_input_tokens_seen": 177489024, + "step": 82245 + }, + { + "epoch": 13.417618270799348, + "grad_norm": 0.003929188009351492, + "learning_rate": 0.0002952410678801116, + "loss": 0.0035, + "num_input_tokens_seen": 177499744, + "step": 82250 + }, + { + "epoch": 13.418433931484502, + "grad_norm": 0.0034211473539471626, + "learning_rate": 0.0002951761324535855, + "loss": 0.0005, + "num_input_tokens_seen": 177510400, + "step": 82255 + }, + { + "epoch": 13.419249592169658, + "grad_norm": 0.0011122202267870307, + "learning_rate": 0.00029511120117807493, + "loss": 0.0019, + "num_input_tokens_seen": 177520672, + "step": 82260 + }, + { + "epoch": 13.420065252854812, + "grad_norm": 0.8003097772598267, + "learning_rate": 0.00029504627405489605, + "loss": 0.0423, + "num_input_tokens_seen": 177531328, + "step": 82265 + }, + { + "epoch": 13.420880913539968, + "grad_norm": 0.002137327566742897, + "learning_rate": 0.0002949813510853641, + "loss": 0.0005, + "num_input_tokens_seen": 177542912, + "step": 82270 + }, + { + "epoch": 13.421696574225122, + "grad_norm": 0.8522122502326965, + "learning_rate": 0.00029491643227079543, + "loss": 0.0664, + "num_input_tokens_seen": 177553376, + "step": 82275 + }, + { + "epoch": 13.422512234910277, + "grad_norm": 0.0015534675912931561, + "learning_rate": 0.00029485151761250527, + "loss": 0.0075, + "num_input_tokens_seen": 177563072, + "step": 82280 + }, + { + "epoch": 13.423327895595433, + "grad_norm": 0.003585314378142357, + "learning_rate": 0.0002947866071118095, + "loss": 0.0574, + "num_input_tokens_seen": 177574848, + "step": 82285 + }, + { + "epoch": 13.424143556280587, + "grad_norm": 0.002225641394034028, + "learning_rate": 0.00029472170077002324, + "loss": 0.0227, + "num_input_tokens_seen": 177585632, + "step": 82290 + }, + { + "epoch": 13.424959216965743, + "grad_norm": 0.9014904499053955, + "learning_rate": 0.0002946567985884624, + "loss": 0.1719, + "num_input_tokens_seen": 177596256, + "step": 82295 + }, + { + "epoch": 13.425774877650896, + "grad_norm": 0.16861099004745483, + "learning_rate": 0.0002945919005684418, + "loss": 0.0504, + "num_input_tokens_seen": 177608352, + "step": 82300 + }, + { + "epoch": 13.426590538336052, + "grad_norm": 0.11513535678386688, + "learning_rate": 0.0002945270067112771, + "loss": 0.0036, + "num_input_tokens_seen": 177619232, + "step": 82305 + }, + { + "epoch": 13.427406199021208, + "grad_norm": 0.0003372204373590648, + "learning_rate": 0.0002944621170182831, + "loss": 0.1202, + "num_input_tokens_seen": 177629376, + "step": 82310 + }, + { + "epoch": 13.428221859706362, + "grad_norm": 0.0010767485946416855, + "learning_rate": 0.00029439723149077523, + "loss": 0.0039, + "num_input_tokens_seen": 177639296, + "step": 82315 + }, + { + "epoch": 13.429037520391518, + "grad_norm": 0.0398411862552166, + "learning_rate": 0.0002943323501300681, + "loss": 0.0047, + "num_input_tokens_seen": 177651488, + "step": 82320 + }, + { + "epoch": 13.429853181076671, + "grad_norm": 0.011246882379055023, + "learning_rate": 0.00029426747293747685, + "loss": 0.0029, + "num_input_tokens_seen": 177663104, + "step": 82325 + }, + { + "epoch": 13.430668841761827, + "grad_norm": 0.054844196885824203, + "learning_rate": 0.00029420259991431633, + "loss": 0.0075, + "num_input_tokens_seen": 177673728, + "step": 82330 + }, + { + "epoch": 13.431484502446983, + "grad_norm": 0.0002601213345769793, + "learning_rate": 0.0002941377310619011, + "loss": 0.0343, + "num_input_tokens_seen": 177683968, + "step": 82335 + }, + { + "epoch": 13.432300163132137, + "grad_norm": 0.02829880267381668, + "learning_rate": 0.00029407286638154597, + "loss": 0.0446, + "num_input_tokens_seen": 177695744, + "step": 82340 + }, + { + "epoch": 13.433115823817293, + "grad_norm": 0.008588245138525963, + "learning_rate": 0.00029400800587456544, + "loss": 0.0042, + "num_input_tokens_seen": 177707168, + "step": 82345 + }, + { + "epoch": 13.433931484502446, + "grad_norm": 0.010826773010194302, + "learning_rate": 0.00029394314954227387, + "loss": 0.0016, + "num_input_tokens_seen": 177718016, + "step": 82350 + }, + { + "epoch": 13.434747145187602, + "grad_norm": 0.21506014466285706, + "learning_rate": 0.000293878297385986, + "loss": 0.0781, + "num_input_tokens_seen": 177726752, + "step": 82355 + }, + { + "epoch": 13.435562805872756, + "grad_norm": 0.0007093166350387037, + "learning_rate": 0.0002938134494070157, + "loss": 0.0047, + "num_input_tokens_seen": 177736704, + "step": 82360 + }, + { + "epoch": 13.436378466557912, + "grad_norm": 0.5733169913291931, + "learning_rate": 0.00029374860560667747, + "loss": 0.0437, + "num_input_tokens_seen": 177748096, + "step": 82365 + }, + { + "epoch": 13.437194127243067, + "grad_norm": 0.006755854468792677, + "learning_rate": 0.00029368376598628545, + "loss": 0.001, + "num_input_tokens_seen": 177758944, + "step": 82370 + }, + { + "epoch": 13.438009787928221, + "grad_norm": 0.0049548218958079815, + "learning_rate": 0.00029361893054715365, + "loss": 0.0104, + "num_input_tokens_seen": 177767904, + "step": 82375 + }, + { + "epoch": 13.438825448613377, + "grad_norm": 0.010104116052389145, + "learning_rate": 0.000293554099290596, + "loss": 0.0033, + "num_input_tokens_seen": 177779200, + "step": 82380 + }, + { + "epoch": 13.439641109298531, + "grad_norm": 0.3901646137237549, + "learning_rate": 0.0002934892722179264, + "loss": 0.0117, + "num_input_tokens_seen": 177790016, + "step": 82385 + }, + { + "epoch": 13.440456769983687, + "grad_norm": 0.004583387635648251, + "learning_rate": 0.0002934244493304588, + "loss": 0.0069, + "num_input_tokens_seen": 177801088, + "step": 82390 + }, + { + "epoch": 13.441272430668842, + "grad_norm": 1.045728087425232, + "learning_rate": 0.0002933596306295066, + "loss": 0.0254, + "num_input_tokens_seen": 177810592, + "step": 82395 + }, + { + "epoch": 13.442088091353996, + "grad_norm": 0.06254040449857712, + "learning_rate": 0.0002932948161163839, + "loss": 0.0086, + "num_input_tokens_seen": 177822080, + "step": 82400 + }, + { + "epoch": 13.442903752039152, + "grad_norm": 0.2375650554895401, + "learning_rate": 0.0002932300057924037, + "loss": 0.0096, + "num_input_tokens_seen": 177832000, + "step": 82405 + }, + { + "epoch": 13.443719412724306, + "grad_norm": 0.02477083168923855, + "learning_rate": 0.0002931651996588799, + "loss": 0.0261, + "num_input_tokens_seen": 177842048, + "step": 82410 + }, + { + "epoch": 13.444535073409462, + "grad_norm": 0.19058597087860107, + "learning_rate": 0.0002931003977171256, + "loss": 0.0085, + "num_input_tokens_seen": 177852704, + "step": 82415 + }, + { + "epoch": 13.445350734094617, + "grad_norm": 0.288620263338089, + "learning_rate": 0.00029303559996845434, + "loss": 0.0243, + "num_input_tokens_seen": 177863456, + "step": 82420 + }, + { + "epoch": 13.446166394779771, + "grad_norm": 0.012926272116601467, + "learning_rate": 0.00029297080641417907, + "loss": 0.0089, + "num_input_tokens_seen": 177874624, + "step": 82425 + }, + { + "epoch": 13.446982055464927, + "grad_norm": 0.001998857595026493, + "learning_rate": 0.0002929060170556132, + "loss": 0.0036, + "num_input_tokens_seen": 177885024, + "step": 82430 + }, + { + "epoch": 13.447797716150081, + "grad_norm": 0.0015537252184003592, + "learning_rate": 0.00029284123189406944, + "loss": 0.0196, + "num_input_tokens_seen": 177896160, + "step": 82435 + }, + { + "epoch": 13.448613376835237, + "grad_norm": 0.06699017435312271, + "learning_rate": 0.00029277645093086114, + "loss": 0.0037, + "num_input_tokens_seen": 177907712, + "step": 82440 + }, + { + "epoch": 13.449429037520392, + "grad_norm": 0.005250576417893171, + "learning_rate": 0.00029271167416730073, + "loss": 0.0047, + "num_input_tokens_seen": 177918912, + "step": 82445 + }, + { + "epoch": 13.450244698205546, + "grad_norm": 0.04469725862145424, + "learning_rate": 0.0002926469016047013, + "loss": 0.005, + "num_input_tokens_seen": 177931200, + "step": 82450 + }, + { + "epoch": 13.451060358890702, + "grad_norm": 0.1076364517211914, + "learning_rate": 0.00029258213324437533, + "loss": 0.0084, + "num_input_tokens_seen": 177942848, + "step": 82455 + }, + { + "epoch": 13.451876019575856, + "grad_norm": 0.0040900856256484985, + "learning_rate": 0.00029251736908763584, + "loss": 0.0097, + "num_input_tokens_seen": 177954048, + "step": 82460 + }, + { + "epoch": 13.452691680261012, + "grad_norm": 0.0005476898513734341, + "learning_rate": 0.00029245260913579477, + "loss": 0.0121, + "num_input_tokens_seen": 177965056, + "step": 82465 + }, + { + "epoch": 13.453507340946166, + "grad_norm": 0.0011316915042698383, + "learning_rate": 0.00029238785339016487, + "loss": 0.0183, + "num_input_tokens_seen": 177974976, + "step": 82470 + }, + { + "epoch": 13.454323001631321, + "grad_norm": 0.08037909865379333, + "learning_rate": 0.0002923231018520588, + "loss": 0.0052, + "num_input_tokens_seen": 177985088, + "step": 82475 + }, + { + "epoch": 13.455138662316477, + "grad_norm": 0.000427785562351346, + "learning_rate": 0.0002922583545227882, + "loss": 0.0782, + "num_input_tokens_seen": 177995872, + "step": 82480 + }, + { + "epoch": 13.455954323001631, + "grad_norm": 0.007915656082332134, + "learning_rate": 0.00029219361140366587, + "loss": 0.0049, + "num_input_tokens_seen": 178006432, + "step": 82485 + }, + { + "epoch": 13.456769983686787, + "grad_norm": 0.38055408000946045, + "learning_rate": 0.0002921288724960034, + "loss": 0.1761, + "num_input_tokens_seen": 178015904, + "step": 82490 + }, + { + "epoch": 13.45758564437194, + "grad_norm": 0.01986478827893734, + "learning_rate": 0.00029206413780111305, + "loss": 0.003, + "num_input_tokens_seen": 178027328, + "step": 82495 + }, + { + "epoch": 13.458401305057096, + "grad_norm": 0.25656750798225403, + "learning_rate": 0.00029199940732030686, + "loss": 0.0088, + "num_input_tokens_seen": 178037824, + "step": 82500 + }, + { + "epoch": 13.459216965742252, + "grad_norm": 0.08144430071115494, + "learning_rate": 0.0002919346810548965, + "loss": 0.0075, + "num_input_tokens_seen": 178047936, + "step": 82505 + }, + { + "epoch": 13.460032626427406, + "grad_norm": 0.004406204912811518, + "learning_rate": 0.00029186995900619373, + "loss": 0.0006, + "num_input_tokens_seen": 178059040, + "step": 82510 + }, + { + "epoch": 13.460848287112562, + "grad_norm": 0.009436530992388725, + "learning_rate": 0.00029180524117551035, + "loss": 0.0032, + "num_input_tokens_seen": 178069696, + "step": 82515 + }, + { + "epoch": 13.461663947797716, + "grad_norm": 0.002101840451359749, + "learning_rate": 0.0002917405275641578, + "loss": 0.0079, + "num_input_tokens_seen": 178080160, + "step": 82520 + }, + { + "epoch": 13.462479608482871, + "grad_norm": 0.02555399015545845, + "learning_rate": 0.00029167581817344775, + "loss": 0.0027, + "num_input_tokens_seen": 178090624, + "step": 82525 + }, + { + "epoch": 13.463295269168025, + "grad_norm": 0.007719612680375576, + "learning_rate": 0.00029161111300469143, + "loss": 0.0128, + "num_input_tokens_seen": 178102784, + "step": 82530 + }, + { + "epoch": 13.464110929853181, + "grad_norm": 0.0930895209312439, + "learning_rate": 0.0002915464120592003, + "loss": 0.003, + "num_input_tokens_seen": 178114144, + "step": 82535 + }, + { + "epoch": 13.464926590538337, + "grad_norm": 0.007071339525282383, + "learning_rate": 0.0002914817153382856, + "loss": 0.0088, + "num_input_tokens_seen": 178125984, + "step": 82540 + }, + { + "epoch": 13.46574225122349, + "grad_norm": 0.000521997397299856, + "learning_rate": 0.00029141702284325846, + "loss": 0.1024, + "num_input_tokens_seen": 178137600, + "step": 82545 + }, + { + "epoch": 13.466557911908646, + "grad_norm": 0.053067464381456375, + "learning_rate": 0.0002913523345754299, + "loss": 0.0054, + "num_input_tokens_seen": 178149088, + "step": 82550 + }, + { + "epoch": 13.4673735725938, + "grad_norm": 0.0006074383272789419, + "learning_rate": 0.0002912876505361111, + "loss": 0.0063, + "num_input_tokens_seen": 178159968, + "step": 82555 + }, + { + "epoch": 13.468189233278956, + "grad_norm": 0.00392296863719821, + "learning_rate": 0.00029122297072661264, + "loss": 0.0153, + "num_input_tokens_seen": 178171008, + "step": 82560 + }, + { + "epoch": 13.469004893964112, + "grad_norm": 0.06978226453065872, + "learning_rate": 0.00029115829514824565, + "loss": 0.0424, + "num_input_tokens_seen": 178182112, + "step": 82565 + }, + { + "epoch": 13.469820554649266, + "grad_norm": 0.10540809482336044, + "learning_rate": 0.00029109362380232075, + "loss": 0.0444, + "num_input_tokens_seen": 178193888, + "step": 82570 + }, + { + "epoch": 13.470636215334421, + "grad_norm": 0.000631477392744273, + "learning_rate": 0.0002910289566901485, + "loss": 0.1374, + "num_input_tokens_seen": 178204416, + "step": 82575 + }, + { + "epoch": 13.471451876019575, + "grad_norm": 0.004823222756385803, + "learning_rate": 0.0002909642938130394, + "loss": 0.0018, + "num_input_tokens_seen": 178215552, + "step": 82580 + }, + { + "epoch": 13.47226753670473, + "grad_norm": 0.0005522229475900531, + "learning_rate": 0.0002908996351723043, + "loss": 0.002, + "num_input_tokens_seen": 178227072, + "step": 82585 + }, + { + "epoch": 13.473083197389887, + "grad_norm": 0.009000611491501331, + "learning_rate": 0.0002908349807692533, + "loss": 0.0017, + "num_input_tokens_seen": 178237760, + "step": 82590 + }, + { + "epoch": 13.47389885807504, + "grad_norm": 0.00043475566781125963, + "learning_rate": 0.00029077033060519674, + "loss": 0.0316, + "num_input_tokens_seen": 178248832, + "step": 82595 + }, + { + "epoch": 13.474714518760196, + "grad_norm": 0.023772073909640312, + "learning_rate": 0.0002907056846814449, + "loss": 0.0088, + "num_input_tokens_seen": 178260416, + "step": 82600 + }, + { + "epoch": 13.47553017944535, + "grad_norm": 0.01588546484708786, + "learning_rate": 0.00029064104299930785, + "loss": 0.0108, + "num_input_tokens_seen": 178271520, + "step": 82605 + }, + { + "epoch": 13.476345840130506, + "grad_norm": 0.07896775007247925, + "learning_rate": 0.00029057640556009567, + "loss": 0.0034, + "num_input_tokens_seen": 178282240, + "step": 82610 + }, + { + "epoch": 13.477161500815662, + "grad_norm": 0.456591933965683, + "learning_rate": 0.0002905117723651183, + "loss": 0.1472, + "num_input_tokens_seen": 178292064, + "step": 82615 + }, + { + "epoch": 13.477977161500815, + "grad_norm": 0.010230864398181438, + "learning_rate": 0.0002904471434156856, + "loss": 0.0099, + "num_input_tokens_seen": 178303168, + "step": 82620 + }, + { + "epoch": 13.478792822185971, + "grad_norm": 0.4852035939693451, + "learning_rate": 0.0002903825187131074, + "loss": 0.0445, + "num_input_tokens_seen": 178313248, + "step": 82625 + }, + { + "epoch": 13.479608482871125, + "grad_norm": 0.001849945867434144, + "learning_rate": 0.00029031789825869334, + "loss": 0.0125, + "num_input_tokens_seen": 178323168, + "step": 82630 + }, + { + "epoch": 13.48042414355628, + "grad_norm": 0.002211230108514428, + "learning_rate": 0.0002902532820537531, + "loss": 0.0085, + "num_input_tokens_seen": 178334432, + "step": 82635 + }, + { + "epoch": 13.481239804241435, + "grad_norm": 0.00035359105095267296, + "learning_rate": 0.00029018867009959623, + "loss": 0.0039, + "num_input_tokens_seen": 178344800, + "step": 82640 + }, + { + "epoch": 13.48205546492659, + "grad_norm": 0.015339674428105354, + "learning_rate": 0.0002901240623975321, + "loss": 0.0021, + "num_input_tokens_seen": 178356320, + "step": 82645 + }, + { + "epoch": 13.482871125611746, + "grad_norm": 0.002982785925269127, + "learning_rate": 0.00029005945894887, + "loss": 0.0042, + "num_input_tokens_seen": 178366912, + "step": 82650 + }, + { + "epoch": 13.4836867862969, + "grad_norm": 0.008437557145953178, + "learning_rate": 0.0002899948597549194, + "loss": 0.0061, + "num_input_tokens_seen": 178376128, + "step": 82655 + }, + { + "epoch": 13.484502446982056, + "grad_norm": 0.0007398732122965157, + "learning_rate": 0.00028993026481698934, + "loss": 0.0011, + "num_input_tokens_seen": 178387296, + "step": 82660 + }, + { + "epoch": 13.48531810766721, + "grad_norm": 0.0009393549989908934, + "learning_rate": 0.00028986567413638895, + "loss": 0.0131, + "num_input_tokens_seen": 178398592, + "step": 82665 + }, + { + "epoch": 13.486133768352365, + "grad_norm": 0.057162459939718246, + "learning_rate": 0.00028980108771442726, + "loss": 0.0128, + "num_input_tokens_seen": 178408320, + "step": 82670 + }, + { + "epoch": 13.486949429037521, + "grad_norm": 0.04537806659936905, + "learning_rate": 0.00028973650555241316, + "loss": 0.0038, + "num_input_tokens_seen": 178419392, + "step": 82675 + }, + { + "epoch": 13.487765089722675, + "grad_norm": 0.002640562830492854, + "learning_rate": 0.0002896719276516555, + "loss": 0.0085, + "num_input_tokens_seen": 178430016, + "step": 82680 + }, + { + "epoch": 13.48858075040783, + "grad_norm": 0.022445326671004295, + "learning_rate": 0.0002896073540134631, + "loss": 0.0049, + "num_input_tokens_seen": 178439936, + "step": 82685 + }, + { + "epoch": 13.489396411092985, + "grad_norm": 0.5306188464164734, + "learning_rate": 0.00028954278463914435, + "loss": 0.0257, + "num_input_tokens_seen": 178451552, + "step": 82690 + }, + { + "epoch": 13.49021207177814, + "grad_norm": 0.021217362955212593, + "learning_rate": 0.00028947821953000845, + "loss": 0.0019, + "num_input_tokens_seen": 178462272, + "step": 82695 + }, + { + "epoch": 13.491027732463296, + "grad_norm": 0.00016088411211967468, + "learning_rate": 0.00028941365868736315, + "loss": 0.0155, + "num_input_tokens_seen": 178473888, + "step": 82700 + }, + { + "epoch": 13.49184339314845, + "grad_norm": 0.005768782924860716, + "learning_rate": 0.00028934910211251755, + "loss": 0.0019, + "num_input_tokens_seen": 178483616, + "step": 82705 + }, + { + "epoch": 13.492659053833606, + "grad_norm": 0.01044163666665554, + "learning_rate": 0.0002892845498067792, + "loss": 0.003, + "num_input_tokens_seen": 178494048, + "step": 82710 + }, + { + "epoch": 13.49347471451876, + "grad_norm": 0.03359925001859665, + "learning_rate": 0.0002892200017714572, + "loss": 0.0389, + "num_input_tokens_seen": 178505536, + "step": 82715 + }, + { + "epoch": 13.494290375203915, + "grad_norm": 0.003682720707729459, + "learning_rate": 0.00028915545800785883, + "loss": 0.0015, + "num_input_tokens_seen": 178514688, + "step": 82720 + }, + { + "epoch": 13.49510603588907, + "grad_norm": 0.027915172278881073, + "learning_rate": 0.0002890909185172928, + "loss": 0.0017, + "num_input_tokens_seen": 178525376, + "step": 82725 + }, + { + "epoch": 13.495921696574225, + "grad_norm": 0.001378602348268032, + "learning_rate": 0.00028902638330106684, + "loss": 0.0006, + "num_input_tokens_seen": 178536576, + "step": 82730 + }, + { + "epoch": 13.49673735725938, + "grad_norm": 0.002186297671869397, + "learning_rate": 0.0002889618523604889, + "loss": 0.0007, + "num_input_tokens_seen": 178546720, + "step": 82735 + }, + { + "epoch": 13.497553017944535, + "grad_norm": 0.6430211067199707, + "learning_rate": 0.0002888973256968667, + "loss": 0.0253, + "num_input_tokens_seen": 178557856, + "step": 82740 + }, + { + "epoch": 13.49836867862969, + "grad_norm": 0.001060758251696825, + "learning_rate": 0.000288832803311508, + "loss": 0.0025, + "num_input_tokens_seen": 178568096, + "step": 82745 + }, + { + "epoch": 13.499184339314844, + "grad_norm": 0.0016204232815653086, + "learning_rate": 0.00028876828520572043, + "loss": 0.0088, + "num_input_tokens_seen": 178579456, + "step": 82750 + }, + { + "epoch": 13.5, + "grad_norm": 0.000256688566878438, + "learning_rate": 0.0002887037713808116, + "loss": 0.0089, + "num_input_tokens_seen": 178589920, + "step": 82755 + }, + { + "epoch": 13.500815660685156, + "grad_norm": 0.0015288153663277626, + "learning_rate": 0.0002886392618380888, + "loss": 0.0014, + "num_input_tokens_seen": 178601024, + "step": 82760 + }, + { + "epoch": 13.50163132137031, + "grad_norm": 0.000967644271440804, + "learning_rate": 0.00028857475657885956, + "loss": 0.0197, + "num_input_tokens_seen": 178611712, + "step": 82765 + }, + { + "epoch": 13.502446982055465, + "grad_norm": 0.004592652898281813, + "learning_rate": 0.00028851025560443103, + "loss": 0.0957, + "num_input_tokens_seen": 178622304, + "step": 82770 + }, + { + "epoch": 13.50326264274062, + "grad_norm": 0.004227324388921261, + "learning_rate": 0.0002884457589161105, + "loss": 0.0026, + "num_input_tokens_seen": 178632896, + "step": 82775 + }, + { + "epoch": 13.504078303425775, + "grad_norm": 0.009296797215938568, + "learning_rate": 0.000288381266515205, + "loss": 0.0011, + "num_input_tokens_seen": 178643328, + "step": 82780 + }, + { + "epoch": 13.50489396411093, + "grad_norm": 0.01785522885620594, + "learning_rate": 0.0002883167784030216, + "loss": 0.077, + "num_input_tokens_seen": 178653376, + "step": 82785 + }, + { + "epoch": 13.505709624796085, + "grad_norm": 0.0028068474493920803, + "learning_rate": 0.00028825229458086726, + "loss": 0.0011, + "num_input_tokens_seen": 178663264, + "step": 82790 + }, + { + "epoch": 13.50652528548124, + "grad_norm": 0.07111160457134247, + "learning_rate": 0.0002881878150500486, + "loss": 0.0037, + "num_input_tokens_seen": 178674272, + "step": 82795 + }, + { + "epoch": 13.507340946166394, + "grad_norm": 0.0026521640829741955, + "learning_rate": 0.00028812333981187297, + "loss": 0.0022, + "num_input_tokens_seen": 178685632, + "step": 82800 + }, + { + "epoch": 13.50815660685155, + "grad_norm": 0.05534886196255684, + "learning_rate": 0.00028805886886764623, + "loss": 0.0035, + "num_input_tokens_seen": 178695328, + "step": 82805 + }, + { + "epoch": 13.508972267536706, + "grad_norm": 0.00622946210205555, + "learning_rate": 0.00028799440221867576, + "loss": 0.0082, + "num_input_tokens_seen": 178705952, + "step": 82810 + }, + { + "epoch": 13.50978792822186, + "grad_norm": 0.004296909552067518, + "learning_rate": 0.00028792993986626725, + "loss": 0.0019, + "num_input_tokens_seen": 178716416, + "step": 82815 + }, + { + "epoch": 13.510603588907015, + "grad_norm": 0.0003072297549806535, + "learning_rate": 0.000287865481811728, + "loss": 0.0075, + "num_input_tokens_seen": 178727936, + "step": 82820 + }, + { + "epoch": 13.51141924959217, + "grad_norm": 0.01372566819190979, + "learning_rate": 0.00028780102805636346, + "loss": 0.0766, + "num_input_tokens_seen": 178737760, + "step": 82825 + }, + { + "epoch": 13.512234910277325, + "grad_norm": 0.02899722009897232, + "learning_rate": 0.0002877365786014806, + "loss": 0.0158, + "num_input_tokens_seen": 178748320, + "step": 82830 + }, + { + "epoch": 13.513050570962479, + "grad_norm": 0.049965109676122665, + "learning_rate": 0.00028767213344838493, + "loss": 0.0051, + "num_input_tokens_seen": 178758656, + "step": 82835 + }, + { + "epoch": 13.513866231647635, + "grad_norm": 0.19075040519237518, + "learning_rate": 0.00028760769259838327, + "loss": 0.0103, + "num_input_tokens_seen": 178770560, + "step": 82840 + }, + { + "epoch": 13.51468189233279, + "grad_norm": 0.0010406267829239368, + "learning_rate": 0.00028754325605278067, + "loss": 0.0013, + "num_input_tokens_seen": 178781536, + "step": 82845 + }, + { + "epoch": 13.515497553017944, + "grad_norm": 0.01500383485108614, + "learning_rate": 0.00028747882381288393, + "loss": 0.0073, + "num_input_tokens_seen": 178793024, + "step": 82850 + }, + { + "epoch": 13.5163132137031, + "grad_norm": 0.0002418215008219704, + "learning_rate": 0.00028741439587999805, + "loss": 0.0108, + "num_input_tokens_seen": 178804736, + "step": 82855 + }, + { + "epoch": 13.517128874388254, + "grad_norm": 0.10607481747865677, + "learning_rate": 0.00028734997225542954, + "loss": 0.0042, + "num_input_tokens_seen": 178816672, + "step": 82860 + }, + { + "epoch": 13.51794453507341, + "grad_norm": 0.5297728180885315, + "learning_rate": 0.0002872855529404832, + "loss": 0.2309, + "num_input_tokens_seen": 178827072, + "step": 82865 + }, + { + "epoch": 13.518760195758565, + "grad_norm": 0.07991251349449158, + "learning_rate": 0.0002872211379364651, + "loss": 0.0077, + "num_input_tokens_seen": 178837792, + "step": 82870 + }, + { + "epoch": 13.51957585644372, + "grad_norm": 0.00047760450979694724, + "learning_rate": 0.00028715672724468065, + "loss": 0.0018, + "num_input_tokens_seen": 178846624, + "step": 82875 + }, + { + "epoch": 13.520391517128875, + "grad_norm": 0.0012604972580447793, + "learning_rate": 0.0002870923208664351, + "loss": 0.0007, + "num_input_tokens_seen": 178857088, + "step": 82880 + }, + { + "epoch": 13.521207177814029, + "grad_norm": 0.0018718052888289094, + "learning_rate": 0.0002870279188030338, + "loss": 0.0099, + "num_input_tokens_seen": 178867744, + "step": 82885 + }, + { + "epoch": 13.522022838499185, + "grad_norm": 0.012806332670152187, + "learning_rate": 0.00028696352105578185, + "loss": 0.1085, + "num_input_tokens_seen": 178877664, + "step": 82890 + }, + { + "epoch": 13.522838499184338, + "grad_norm": 0.006727631203830242, + "learning_rate": 0.0002868991276259844, + "loss": 0.0185, + "num_input_tokens_seen": 178887680, + "step": 82895 + }, + { + "epoch": 13.523654159869494, + "grad_norm": 0.023982727900147438, + "learning_rate": 0.0002868347385149465, + "loss": 0.0613, + "num_input_tokens_seen": 178898688, + "step": 82900 + }, + { + "epoch": 13.52446982055465, + "grad_norm": 0.0002510881458874792, + "learning_rate": 0.000286770353723973, + "loss": 0.0021, + "num_input_tokens_seen": 178908864, + "step": 82905 + }, + { + "epoch": 13.525285481239804, + "grad_norm": 0.009417670778930187, + "learning_rate": 0.00028670597325436886, + "loss": 0.0183, + "num_input_tokens_seen": 178919200, + "step": 82910 + }, + { + "epoch": 13.52610114192496, + "grad_norm": 0.02701220102608204, + "learning_rate": 0.0002866415971074387, + "loss": 0.0066, + "num_input_tokens_seen": 178929440, + "step": 82915 + }, + { + "epoch": 13.526916802610113, + "grad_norm": 0.012573004700243473, + "learning_rate": 0.000286577225284487, + "loss": 0.0009, + "num_input_tokens_seen": 178940064, + "step": 82920 + }, + { + "epoch": 13.52773246329527, + "grad_norm": 0.8829275369644165, + "learning_rate": 0.00028651285778681906, + "loss": 0.0837, + "num_input_tokens_seen": 178950912, + "step": 82925 + }, + { + "epoch": 13.528548123980425, + "grad_norm": 0.001444246037863195, + "learning_rate": 0.00028644849461573847, + "loss": 0.0482, + "num_input_tokens_seen": 178962752, + "step": 82930 + }, + { + "epoch": 13.529363784665579, + "grad_norm": 0.0023165536113083363, + "learning_rate": 0.0002863841357725504, + "loss": 0.0021, + "num_input_tokens_seen": 178974048, + "step": 82935 + }, + { + "epoch": 13.530179445350734, + "grad_norm": 0.032886430621147156, + "learning_rate": 0.00028631978125855844, + "loss": 0.0079, + "num_input_tokens_seen": 178983872, + "step": 82940 + }, + { + "epoch": 13.530995106035888, + "grad_norm": 0.6897380948066711, + "learning_rate": 0.0002862554310750676, + "loss": 0.0307, + "num_input_tokens_seen": 178994848, + "step": 82945 + }, + { + "epoch": 13.531810766721044, + "grad_norm": 0.09496913105249405, + "learning_rate": 0.0002861910852233812, + "loss": 0.0072, + "num_input_tokens_seen": 179006528, + "step": 82950 + }, + { + "epoch": 13.5326264274062, + "grad_norm": 0.04693048819899559, + "learning_rate": 0.00028612674370480406, + "loss": 0.0104, + "num_input_tokens_seen": 179016416, + "step": 82955 + }, + { + "epoch": 13.533442088091354, + "grad_norm": 0.01107507012784481, + "learning_rate": 0.0002860624065206394, + "loss": 0.025, + "num_input_tokens_seen": 179027232, + "step": 82960 + }, + { + "epoch": 13.53425774877651, + "grad_norm": 0.0016508148983120918, + "learning_rate": 0.0002859980736721918, + "loss": 0.0081, + "num_input_tokens_seen": 179039008, + "step": 82965 + }, + { + "epoch": 13.535073409461663, + "grad_norm": 0.01973150484263897, + "learning_rate": 0.0002859337451607644, + "loss": 0.0706, + "num_input_tokens_seen": 179050656, + "step": 82970 + }, + { + "epoch": 13.535889070146819, + "grad_norm": 0.39768746495246887, + "learning_rate": 0.0002858694209876616, + "loss": 0.0433, + "num_input_tokens_seen": 179060416, + "step": 82975 + }, + { + "epoch": 13.536704730831975, + "grad_norm": 0.01599719189107418, + "learning_rate": 0.00028580510115418624, + "loss": 0.0149, + "num_input_tokens_seen": 179072320, + "step": 82980 + }, + { + "epoch": 13.537520391517129, + "grad_norm": 0.0010883890790864825, + "learning_rate": 0.0002857407856616426, + "loss": 0.0054, + "num_input_tokens_seen": 179082144, + "step": 82985 + }, + { + "epoch": 13.538336052202284, + "grad_norm": 0.07143707573413849, + "learning_rate": 0.0002856764745113334, + "loss": 0.0938, + "num_input_tokens_seen": 179092896, + "step": 82990 + }, + { + "epoch": 13.539151712887438, + "grad_norm": 0.04638929292559624, + "learning_rate": 0.00028561216770456267, + "loss": 0.0083, + "num_input_tokens_seen": 179104672, + "step": 82995 + }, + { + "epoch": 13.539967373572594, + "grad_norm": 0.016827169805765152, + "learning_rate": 0.000285547865242633, + "loss": 0.0037, + "num_input_tokens_seen": 179114208, + "step": 83000 + }, + { + "epoch": 13.540783034257748, + "grad_norm": 0.0003715837956406176, + "learning_rate": 0.000285483567126848, + "loss": 0.0035, + "num_input_tokens_seen": 179124768, + "step": 83005 + }, + { + "epoch": 13.541598694942904, + "grad_norm": 0.010867067612707615, + "learning_rate": 0.0002854192733585107, + "loss": 0.0099, + "num_input_tokens_seen": 179135104, + "step": 83010 + }, + { + "epoch": 13.54241435562806, + "grad_norm": 0.021783141419291496, + "learning_rate": 0.000285354983938924, + "loss": 0.0098, + "num_input_tokens_seen": 179146432, + "step": 83015 + }, + { + "epoch": 13.543230016313213, + "grad_norm": 0.0013048859545961022, + "learning_rate": 0.0002852906988693909, + "loss": 0.0869, + "num_input_tokens_seen": 179156704, + "step": 83020 + }, + { + "epoch": 13.544045676998369, + "grad_norm": 0.0006242787349037826, + "learning_rate": 0.0002852264181512142, + "loss": 0.0127, + "num_input_tokens_seen": 179167040, + "step": 83025 + }, + { + "epoch": 13.544861337683523, + "grad_norm": 0.03905348852276802, + "learning_rate": 0.00028516214178569656, + "loss": 0.0449, + "num_input_tokens_seen": 179178912, + "step": 83030 + }, + { + "epoch": 13.545676998368679, + "grad_norm": 0.0015662526711821556, + "learning_rate": 0.0002850978697741406, + "loss": 0.0017, + "num_input_tokens_seen": 179189824, + "step": 83035 + }, + { + "epoch": 13.546492659053834, + "grad_norm": 0.12642863392829895, + "learning_rate": 0.000285033602117849, + "loss": 0.0116, + "num_input_tokens_seen": 179199424, + "step": 83040 + }, + { + "epoch": 13.547308319738988, + "grad_norm": 0.01596909947693348, + "learning_rate": 0.0002849693388181241, + "loss": 0.0056, + "num_input_tokens_seen": 179210560, + "step": 83045 + }, + { + "epoch": 13.548123980424144, + "grad_norm": 0.0011860288213938475, + "learning_rate": 0.00028490507987626837, + "loss": 0.0051, + "num_input_tokens_seen": 179220384, + "step": 83050 + }, + { + "epoch": 13.548939641109298, + "grad_norm": 0.05148168280720711, + "learning_rate": 0.00028484082529358403, + "loss": 0.0035, + "num_input_tokens_seen": 179232416, + "step": 83055 + }, + { + "epoch": 13.549755301794454, + "grad_norm": 0.02825983054935932, + "learning_rate": 0.0002847765750713733, + "loss": 0.0582, + "num_input_tokens_seen": 179242944, + "step": 83060 + }, + { + "epoch": 13.550570962479608, + "grad_norm": 0.132168248295784, + "learning_rate": 0.0002847123292109382, + "loss": 0.0068, + "num_input_tokens_seen": 179254464, + "step": 83065 + }, + { + "epoch": 13.551386623164763, + "grad_norm": 0.0007254588417708874, + "learning_rate": 0.0002846480877135812, + "loss": 0.0025, + "num_input_tokens_seen": 179265728, + "step": 83070 + }, + { + "epoch": 13.552202283849919, + "grad_norm": 0.0046125296503305435, + "learning_rate": 0.00028458385058060355, + "loss": 0.0027, + "num_input_tokens_seen": 179276160, + "step": 83075 + }, + { + "epoch": 13.553017944535073, + "grad_norm": 0.03357213735580444, + "learning_rate": 0.0002845196178133078, + "loss": 0.0042, + "num_input_tokens_seen": 179287264, + "step": 83080 + }, + { + "epoch": 13.553833605220229, + "grad_norm": 0.010055046528577805, + "learning_rate": 0.00028445538941299493, + "loss": 0.0132, + "num_input_tokens_seen": 179297696, + "step": 83085 + }, + { + "epoch": 13.554649265905383, + "grad_norm": 0.0066935596987605095, + "learning_rate": 0.00028439116538096743, + "loss": 0.0128, + "num_input_tokens_seen": 179307008, + "step": 83090 + }, + { + "epoch": 13.555464926590538, + "grad_norm": 0.0013847979716956615, + "learning_rate": 0.0002843269457185261, + "loss": 0.0108, + "num_input_tokens_seen": 179318816, + "step": 83095 + }, + { + "epoch": 13.556280587275694, + "grad_norm": 0.01807354763150215, + "learning_rate": 0.00028426273042697327, + "loss": 0.0021, + "num_input_tokens_seen": 179330208, + "step": 83100 + }, + { + "epoch": 13.557096247960848, + "grad_norm": 0.0034839536529034376, + "learning_rate": 0.0002841985195076094, + "loss": 0.0015, + "num_input_tokens_seen": 179340448, + "step": 83105 + }, + { + "epoch": 13.557911908646004, + "grad_norm": 0.004956569988280535, + "learning_rate": 0.0002841343129617365, + "loss": 0.0213, + "num_input_tokens_seen": 179350272, + "step": 83110 + }, + { + "epoch": 13.558727569331158, + "grad_norm": 0.015398676507174969, + "learning_rate": 0.0002840701107906557, + "loss": 0.0076, + "num_input_tokens_seen": 179360992, + "step": 83115 + }, + { + "epoch": 13.559543230016313, + "grad_norm": 0.07174376398324966, + "learning_rate": 0.00028400591299566793, + "loss": 0.0053, + "num_input_tokens_seen": 179372000, + "step": 83120 + }, + { + "epoch": 13.560358890701469, + "grad_norm": 0.4310682713985443, + "learning_rate": 0.00028394171957807433, + "loss": 0.0372, + "num_input_tokens_seen": 179384032, + "step": 83125 + }, + { + "epoch": 13.561174551386623, + "grad_norm": 0.00428356509655714, + "learning_rate": 0.000283877530539176, + "loss": 0.0034, + "num_input_tokens_seen": 179395072, + "step": 83130 + }, + { + "epoch": 13.561990212071779, + "grad_norm": 0.08834954351186752, + "learning_rate": 0.00028381334588027353, + "loss": 0.009, + "num_input_tokens_seen": 179407392, + "step": 83135 + }, + { + "epoch": 13.562805872756933, + "grad_norm": 0.9195407629013062, + "learning_rate": 0.00028374916560266794, + "loss": 0.0468, + "num_input_tokens_seen": 179418528, + "step": 83140 + }, + { + "epoch": 13.563621533442088, + "grad_norm": 0.3434191048145294, + "learning_rate": 0.0002836849897076598, + "loss": 0.06, + "num_input_tokens_seen": 179429376, + "step": 83145 + }, + { + "epoch": 13.564437194127244, + "grad_norm": 0.32397356629371643, + "learning_rate": 0.00028362081819654984, + "loss": 0.018, + "num_input_tokens_seen": 179439936, + "step": 83150 + }, + { + "epoch": 13.565252854812398, + "grad_norm": 0.0004319230210967362, + "learning_rate": 0.00028355665107063845, + "loss": 0.002, + "num_input_tokens_seen": 179450336, + "step": 83155 + }, + { + "epoch": 13.566068515497554, + "grad_norm": 0.0012091129319742322, + "learning_rate": 0.00028349248833122603, + "loss": 0.0025, + "num_input_tokens_seen": 179460544, + "step": 83160 + }, + { + "epoch": 13.566884176182707, + "grad_norm": 0.01751078851521015, + "learning_rate": 0.0002834283299796131, + "loss": 0.0073, + "num_input_tokens_seen": 179470688, + "step": 83165 + }, + { + "epoch": 13.567699836867863, + "grad_norm": 0.10292528569698334, + "learning_rate": 0.00028336417601709975, + "loss": 0.007, + "num_input_tokens_seen": 179481920, + "step": 83170 + }, + { + "epoch": 13.568515497553017, + "grad_norm": 0.0009308247244916856, + "learning_rate": 0.0002833000264449862, + "loss": 0.0198, + "num_input_tokens_seen": 179492288, + "step": 83175 + }, + { + "epoch": 13.569331158238173, + "grad_norm": 0.0012838526163250208, + "learning_rate": 0.00028323588126457255, + "loss": 0.0469, + "num_input_tokens_seen": 179501568, + "step": 83180 + }, + { + "epoch": 13.570146818923329, + "grad_norm": 0.010635548271238804, + "learning_rate": 0.00028317174047715873, + "loss": 0.0157, + "num_input_tokens_seen": 179513184, + "step": 83185 + }, + { + "epoch": 13.570962479608482, + "grad_norm": 0.0030753749888390303, + "learning_rate": 0.0002831076040840446, + "loss": 0.0009, + "num_input_tokens_seen": 179523712, + "step": 83190 + }, + { + "epoch": 13.571778140293638, + "grad_norm": 0.0016281683929264545, + "learning_rate": 0.0002830434720865301, + "loss": 0.003, + "num_input_tokens_seen": 179535264, + "step": 83195 + }, + { + "epoch": 13.572593800978792, + "grad_norm": 0.00044024462113156915, + "learning_rate": 0.0002829793444859148, + "loss": 0.0025, + "num_input_tokens_seen": 179547424, + "step": 83200 + }, + { + "epoch": 13.573409461663948, + "grad_norm": 0.01339323166757822, + "learning_rate": 0.0002829152212834984, + "loss": 0.0025, + "num_input_tokens_seen": 179558112, + "step": 83205 + }, + { + "epoch": 13.574225122349104, + "grad_norm": 0.0019668112508952618, + "learning_rate": 0.0002828511024805803, + "loss": 0.0243, + "num_input_tokens_seen": 179569056, + "step": 83210 + }, + { + "epoch": 13.575040783034257, + "grad_norm": 0.005892464891076088, + "learning_rate": 0.0002827869880784605, + "loss": 0.0054, + "num_input_tokens_seen": 179579840, + "step": 83215 + }, + { + "epoch": 13.575856443719413, + "grad_norm": 0.0015010841889306903, + "learning_rate": 0.00028272287807843744, + "loss": 0.0407, + "num_input_tokens_seen": 179590624, + "step": 83220 + }, + { + "epoch": 13.576672104404567, + "grad_norm": 0.11558175086975098, + "learning_rate": 0.00028265877248181113, + "loss": 0.0077, + "num_input_tokens_seen": 179601504, + "step": 83225 + }, + { + "epoch": 13.577487765089723, + "grad_norm": 0.01303600799292326, + "learning_rate": 0.0002825946712898806, + "loss": 0.0014, + "num_input_tokens_seen": 179611264, + "step": 83230 + }, + { + "epoch": 13.578303425774878, + "grad_norm": 0.04391882196068764, + "learning_rate": 0.0002825305745039447, + "loss": 0.0053, + "num_input_tokens_seen": 179621280, + "step": 83235 + }, + { + "epoch": 13.579119086460032, + "grad_norm": 0.0006774240755476058, + "learning_rate": 0.00028246648212530267, + "loss": 0.0059, + "num_input_tokens_seen": 179632256, + "step": 83240 + }, + { + "epoch": 13.579934747145188, + "grad_norm": 0.0231030210852623, + "learning_rate": 0.00028240239415525337, + "loss": 0.0111, + "num_input_tokens_seen": 179642464, + "step": 83245 + }, + { + "epoch": 13.580750407830342, + "grad_norm": 0.023019757121801376, + "learning_rate": 0.0002823383105950955, + "loss": 0.0015, + "num_input_tokens_seen": 179654976, + "step": 83250 + }, + { + "epoch": 13.581566068515498, + "grad_norm": 4.050690174102783, + "learning_rate": 0.00028227423144612794, + "loss": 0.1103, + "num_input_tokens_seen": 179664608, + "step": 83255 + }, + { + "epoch": 13.582381729200652, + "grad_norm": 0.0031971558928489685, + "learning_rate": 0.00028221015670964935, + "loss": 0.005, + "num_input_tokens_seen": 179674784, + "step": 83260 + }, + { + "epoch": 13.583197389885807, + "grad_norm": 0.00632003229111433, + "learning_rate": 0.0002821460863869582, + "loss": 0.0009, + "num_input_tokens_seen": 179684608, + "step": 83265 + }, + { + "epoch": 13.584013050570963, + "grad_norm": 0.004915524739772081, + "learning_rate": 0.0002820820204793529, + "loss": 0.0131, + "num_input_tokens_seen": 179694368, + "step": 83270 + }, + { + "epoch": 13.584828711256117, + "grad_norm": 0.016065968200564384, + "learning_rate": 0.0002820179589881319, + "loss": 0.0133, + "num_input_tokens_seen": 179703872, + "step": 83275 + }, + { + "epoch": 13.585644371941273, + "grad_norm": 0.010546640492975712, + "learning_rate": 0.00028195390191459356, + "loss": 0.0027, + "num_input_tokens_seen": 179715872, + "step": 83280 + }, + { + "epoch": 13.586460032626427, + "grad_norm": 0.0024917356204241514, + "learning_rate": 0.000281889849260036, + "loss": 0.0005, + "num_input_tokens_seen": 179726976, + "step": 83285 + }, + { + "epoch": 13.587275693311582, + "grad_norm": 0.0076094819232821465, + "learning_rate": 0.00028182580102575726, + "loss": 0.0429, + "num_input_tokens_seen": 179738016, + "step": 83290 + }, + { + "epoch": 13.588091353996738, + "grad_norm": 0.0005907198647037148, + "learning_rate": 0.00028176175721305555, + "loss": 0.0512, + "num_input_tokens_seen": 179747520, + "step": 83295 + }, + { + "epoch": 13.588907014681892, + "grad_norm": 0.029423119500279427, + "learning_rate": 0.0002816977178232286, + "loss": 0.0039, + "num_input_tokens_seen": 179757696, + "step": 83300 + }, + { + "epoch": 13.589722675367048, + "grad_norm": 0.016097040846943855, + "learning_rate": 0.0002816336828575744, + "loss": 0.0028, + "num_input_tokens_seen": 179768160, + "step": 83305 + }, + { + "epoch": 13.590538336052202, + "grad_norm": 0.0025018611922860146, + "learning_rate": 0.0002815696523173906, + "loss": 0.0004, + "num_input_tokens_seen": 179780032, + "step": 83310 + }, + { + "epoch": 13.591353996737357, + "grad_norm": 0.10312528163194656, + "learning_rate": 0.0002815056262039749, + "loss": 0.0166, + "num_input_tokens_seen": 179790400, + "step": 83315 + }, + { + "epoch": 13.592169657422513, + "grad_norm": 0.009740419685840607, + "learning_rate": 0.0002814416045186249, + "loss": 0.0034, + "num_input_tokens_seen": 179800960, + "step": 83320 + }, + { + "epoch": 13.592985318107667, + "grad_norm": 0.02174578420817852, + "learning_rate": 0.00028137758726263796, + "loss": 0.0097, + "num_input_tokens_seen": 179811232, + "step": 83325 + }, + { + "epoch": 13.593800978792823, + "grad_norm": 0.011192362755537033, + "learning_rate": 0.0002813135744373114, + "loss": 0.0018, + "num_input_tokens_seen": 179822464, + "step": 83330 + }, + { + "epoch": 13.594616639477977, + "grad_norm": 0.0004424086073413491, + "learning_rate": 0.000281249566043943, + "loss": 0.0337, + "num_input_tokens_seen": 179833472, + "step": 83335 + }, + { + "epoch": 13.595432300163132, + "grad_norm": 0.30939725041389465, + "learning_rate": 0.0002811855620838294, + "loss": 0.0103, + "num_input_tokens_seen": 179843840, + "step": 83340 + }, + { + "epoch": 13.596247960848288, + "grad_norm": 0.00808817707002163, + "learning_rate": 0.00028112156255826826, + "loss": 0.0273, + "num_input_tokens_seen": 179855584, + "step": 83345 + }, + { + "epoch": 13.597063621533442, + "grad_norm": 0.047431930899620056, + "learning_rate": 0.000281057567468556, + "loss": 0.0095, + "num_input_tokens_seen": 179866080, + "step": 83350 + }, + { + "epoch": 13.597879282218598, + "grad_norm": 0.017760787159204483, + "learning_rate": 0.00028099357681599004, + "loss": 0.0094, + "num_input_tokens_seen": 179877472, + "step": 83355 + }, + { + "epoch": 13.598694942903752, + "grad_norm": 0.015147211961448193, + "learning_rate": 0.0002809295906018671, + "loss": 0.0008, + "num_input_tokens_seen": 179887456, + "step": 83360 + }, + { + "epoch": 13.599510603588907, + "grad_norm": 0.0052910493686795235, + "learning_rate": 0.00028086560882748386, + "loss": 0.0127, + "num_input_tokens_seen": 179899936, + "step": 83365 + }, + { + "epoch": 13.600326264274061, + "grad_norm": 0.0015278973150998354, + "learning_rate": 0.00028080163149413705, + "loss": 0.0012, + "num_input_tokens_seen": 179911040, + "step": 83370 + }, + { + "epoch": 13.601141924959217, + "grad_norm": 0.8101255297660828, + "learning_rate": 0.0002807376586031233, + "loss": 0.0988, + "num_input_tokens_seen": 179922176, + "step": 83375 + }, + { + "epoch": 13.601957585644373, + "grad_norm": 0.0007795592537149787, + "learning_rate": 0.0002806736901557391, + "loss": 0.0105, + "num_input_tokens_seen": 179933440, + "step": 83380 + }, + { + "epoch": 13.602773246329527, + "grad_norm": 0.0014754291623830795, + "learning_rate": 0.00028060972615328065, + "loss": 0.1713, + "num_input_tokens_seen": 179944128, + "step": 83385 + }, + { + "epoch": 13.603588907014682, + "grad_norm": 0.010329307056963444, + "learning_rate": 0.00028054576659704457, + "loss": 0.0045, + "num_input_tokens_seen": 179955520, + "step": 83390 + }, + { + "epoch": 13.604404567699836, + "grad_norm": 0.05079028755426407, + "learning_rate": 0.00028048181148832685, + "loss": 0.0069, + "num_input_tokens_seen": 179967520, + "step": 83395 + }, + { + "epoch": 13.605220228384992, + "grad_norm": 0.0024400947149842978, + "learning_rate": 0.00028041786082842366, + "loss": 0.0314, + "num_input_tokens_seen": 179979040, + "step": 83400 + }, + { + "epoch": 13.606035889070148, + "grad_norm": 0.6259928941726685, + "learning_rate": 0.0002803539146186311, + "loss": 0.1455, + "num_input_tokens_seen": 179990624, + "step": 83405 + }, + { + "epoch": 13.606851549755302, + "grad_norm": 0.1769079566001892, + "learning_rate": 0.0002802899728602452, + "loss": 0.01, + "num_input_tokens_seen": 180002976, + "step": 83410 + }, + { + "epoch": 13.607667210440457, + "grad_norm": 0.01969536393880844, + "learning_rate": 0.00028022603555456164, + "loss": 0.0092, + "num_input_tokens_seen": 180014880, + "step": 83415 + }, + { + "epoch": 13.608482871125611, + "grad_norm": 0.0008768024272285402, + "learning_rate": 0.00028016210270287635, + "loss": 0.0007, + "num_input_tokens_seen": 180025568, + "step": 83420 + }, + { + "epoch": 13.609298531810767, + "grad_norm": 0.028878426179289818, + "learning_rate": 0.00028009817430648483, + "loss": 0.0068, + "num_input_tokens_seen": 180035616, + "step": 83425 + }, + { + "epoch": 13.61011419249592, + "grad_norm": 0.005707046948373318, + "learning_rate": 0.00028003425036668287, + "loss": 0.0035, + "num_input_tokens_seen": 180047136, + "step": 83430 + }, + { + "epoch": 13.610929853181077, + "grad_norm": 0.03632787615060806, + "learning_rate": 0.00027997033088476554, + "loss": 0.079, + "num_input_tokens_seen": 180057952, + "step": 83435 + }, + { + "epoch": 13.611745513866232, + "grad_norm": 0.0032976618967950344, + "learning_rate": 0.000279906415862029, + "loss": 0.0161, + "num_input_tokens_seen": 180067680, + "step": 83440 + }, + { + "epoch": 13.612561174551386, + "grad_norm": 0.002792885759845376, + "learning_rate": 0.00027984250529976783, + "loss": 0.0033, + "num_input_tokens_seen": 180079104, + "step": 83445 + }, + { + "epoch": 13.613376835236542, + "grad_norm": 0.002465700265020132, + "learning_rate": 0.000279778599199278, + "loss": 0.0267, + "num_input_tokens_seen": 180090400, + "step": 83450 + }, + { + "epoch": 13.614192495921696, + "grad_norm": 0.020679811015725136, + "learning_rate": 0.0002797146975618538, + "loss": 0.0425, + "num_input_tokens_seen": 180101504, + "step": 83455 + }, + { + "epoch": 13.615008156606851, + "grad_norm": 0.17893950641155243, + "learning_rate": 0.0002796508003887911, + "loss": 0.0079, + "num_input_tokens_seen": 180111488, + "step": 83460 + }, + { + "epoch": 13.615823817292007, + "grad_norm": 0.034039419144392014, + "learning_rate": 0.00027958690768138406, + "loss": 0.0106, + "num_input_tokens_seen": 180122880, + "step": 83465 + }, + { + "epoch": 13.616639477977161, + "grad_norm": 0.0013681879499927163, + "learning_rate": 0.0002795230194409283, + "loss": 0.011, + "num_input_tokens_seen": 180133504, + "step": 83470 + }, + { + "epoch": 13.617455138662317, + "grad_norm": 1.260190486907959, + "learning_rate": 0.00027945913566871793, + "loss": 0.0334, + "num_input_tokens_seen": 180145120, + "step": 83475 + }, + { + "epoch": 13.61827079934747, + "grad_norm": 0.040678899735212326, + "learning_rate": 0.0002793952563660483, + "loss": 0.0096, + "num_input_tokens_seen": 180155904, + "step": 83480 + }, + { + "epoch": 13.619086460032626, + "grad_norm": 0.000834224687423557, + "learning_rate": 0.0002793313815342133, + "loss": 0.0023, + "num_input_tokens_seen": 180166112, + "step": 83485 + }, + { + "epoch": 13.619902120717782, + "grad_norm": 0.07367656379938126, + "learning_rate": 0.0002792675111745081, + "loss": 0.174, + "num_input_tokens_seen": 180177824, + "step": 83490 + }, + { + "epoch": 13.620717781402936, + "grad_norm": 0.0015302329557016492, + "learning_rate": 0.0002792036452882265, + "loss": 0.0029, + "num_input_tokens_seen": 180188800, + "step": 83495 + }, + { + "epoch": 13.621533442088092, + "grad_norm": 0.014777740463614464, + "learning_rate": 0.00027913978387666326, + "loss": 0.0026, + "num_input_tokens_seen": 180198944, + "step": 83500 + }, + { + "epoch": 13.622349102773246, + "grad_norm": 0.002392134629189968, + "learning_rate": 0.0002790759269411125, + "loss": 0.0189, + "num_input_tokens_seen": 180209440, + "step": 83505 + }, + { + "epoch": 13.623164763458401, + "grad_norm": 0.002504667965695262, + "learning_rate": 0.00027901207448286836, + "loss": 0.0058, + "num_input_tokens_seen": 180220288, + "step": 83510 + }, + { + "epoch": 13.623980424143557, + "grad_norm": 0.3111249506473541, + "learning_rate": 0.0002789482265032249, + "loss": 0.0198, + "num_input_tokens_seen": 180231552, + "step": 83515 + }, + { + "epoch": 13.624796084828711, + "grad_norm": 0.002734098583459854, + "learning_rate": 0.00027888438300347607, + "loss": 0.0013, + "num_input_tokens_seen": 180244384, + "step": 83520 + }, + { + "epoch": 13.625611745513867, + "grad_norm": 0.030768273398280144, + "learning_rate": 0.00027882054398491564, + "loss": 0.0021, + "num_input_tokens_seen": 180255936, + "step": 83525 + }, + { + "epoch": 13.62642740619902, + "grad_norm": 0.001905204844661057, + "learning_rate": 0.0002787567094488375, + "loss": 0.0934, + "num_input_tokens_seen": 180266368, + "step": 83530 + }, + { + "epoch": 13.627243066884176, + "grad_norm": 0.0013926239917054772, + "learning_rate": 0.00027869287939653534, + "loss": 0.0138, + "num_input_tokens_seen": 180277376, + "step": 83535 + }, + { + "epoch": 13.62805872756933, + "grad_norm": 0.27862751483917236, + "learning_rate": 0.0002786290538293027, + "loss": 0.0276, + "num_input_tokens_seen": 180287232, + "step": 83540 + }, + { + "epoch": 13.628874388254486, + "grad_norm": 0.01329673919826746, + "learning_rate": 0.00027856523274843314, + "loss": 0.0111, + "num_input_tokens_seen": 180297888, + "step": 83545 + }, + { + "epoch": 13.629690048939642, + "grad_norm": 0.3035178780555725, + "learning_rate": 0.00027850141615521983, + "loss": 0.0226, + "num_input_tokens_seen": 180309152, + "step": 83550 + }, + { + "epoch": 13.630505709624796, + "grad_norm": 0.01735401339828968, + "learning_rate": 0.0002784376040509567, + "loss": 0.0729, + "num_input_tokens_seen": 180319552, + "step": 83555 + }, + { + "epoch": 13.631321370309951, + "grad_norm": 0.03845395892858505, + "learning_rate": 0.00027837379643693615, + "loss": 0.0033, + "num_input_tokens_seen": 180328832, + "step": 83560 + }, + { + "epoch": 13.632137030995105, + "grad_norm": 0.048414334654808044, + "learning_rate": 0.0002783099933144523, + "loss": 0.0058, + "num_input_tokens_seen": 180338752, + "step": 83565 + }, + { + "epoch": 13.632952691680261, + "grad_norm": 0.0130057567730546, + "learning_rate": 0.00027824619468479715, + "loss": 0.0029, + "num_input_tokens_seen": 180349312, + "step": 83570 + }, + { + "epoch": 13.633768352365417, + "grad_norm": 0.0015569858951494098, + "learning_rate": 0.00027818240054926463, + "loss": 0.0278, + "num_input_tokens_seen": 180360544, + "step": 83575 + }, + { + "epoch": 13.63458401305057, + "grad_norm": 0.003595008049160242, + "learning_rate": 0.0002781186109091467, + "loss": 0.0028, + "num_input_tokens_seen": 180371136, + "step": 83580 + }, + { + "epoch": 13.635399673735726, + "grad_norm": 0.013447575271129608, + "learning_rate": 0.0002780548257657371, + "loss": 0.0057, + "num_input_tokens_seen": 180382400, + "step": 83585 + }, + { + "epoch": 13.63621533442088, + "grad_norm": 0.7062772512435913, + "learning_rate": 0.00027799104512032756, + "loss": 0.0158, + "num_input_tokens_seen": 180393280, + "step": 83590 + }, + { + "epoch": 13.637030995106036, + "grad_norm": 0.0011510619660839438, + "learning_rate": 0.0002779272689742115, + "loss": 0.0012, + "num_input_tokens_seen": 180403840, + "step": 83595 + }, + { + "epoch": 13.63784665579119, + "grad_norm": 0.006413816474378109, + "learning_rate": 0.0002778634973286807, + "loss": 0.0088, + "num_input_tokens_seen": 180415200, + "step": 83600 + }, + { + "epoch": 13.638662316476346, + "grad_norm": 0.25204241275787354, + "learning_rate": 0.00027779973018502834, + "loss": 0.015, + "num_input_tokens_seen": 180426688, + "step": 83605 + }, + { + "epoch": 13.639477977161501, + "grad_norm": 0.516826331615448, + "learning_rate": 0.0002777359675445459, + "loss": 0.1447, + "num_input_tokens_seen": 180436480, + "step": 83610 + }, + { + "epoch": 13.640293637846655, + "grad_norm": 0.03235753998160362, + "learning_rate": 0.00027767220940852646, + "loss": 0.003, + "num_input_tokens_seen": 180446976, + "step": 83615 + }, + { + "epoch": 13.641109298531811, + "grad_norm": 0.00042045165901072323, + "learning_rate": 0.0002776084557782613, + "loss": 0.0224, + "num_input_tokens_seen": 180458656, + "step": 83620 + }, + { + "epoch": 13.641924959216965, + "grad_norm": 0.0031009865924715996, + "learning_rate": 0.00027754470665504336, + "loss": 0.0045, + "num_input_tokens_seen": 180468448, + "step": 83625 + }, + { + "epoch": 13.64274061990212, + "grad_norm": 0.0009567321976646781, + "learning_rate": 0.0002774809620401637, + "loss": 0.0143, + "num_input_tokens_seen": 180478784, + "step": 83630 + }, + { + "epoch": 13.643556280587276, + "grad_norm": 0.11920958757400513, + "learning_rate": 0.000277417221934915, + "loss": 0.0058, + "num_input_tokens_seen": 180490496, + "step": 83635 + }, + { + "epoch": 13.64437194127243, + "grad_norm": 0.002913987496867776, + "learning_rate": 0.00027735348634058834, + "loss": 0.0041, + "num_input_tokens_seen": 180500416, + "step": 83640 + }, + { + "epoch": 13.645187601957586, + "grad_norm": 0.21653008460998535, + "learning_rate": 0.0002772897552584759, + "loss": 0.067, + "num_input_tokens_seen": 180510720, + "step": 83645 + }, + { + "epoch": 13.64600326264274, + "grad_norm": 0.48877620697021484, + "learning_rate": 0.000277226028689869, + "loss": 0.0146, + "num_input_tokens_seen": 180522080, + "step": 83650 + }, + { + "epoch": 13.646818923327896, + "grad_norm": 0.016910046339035034, + "learning_rate": 0.00027716230663605933, + "loss": 0.0699, + "num_input_tokens_seen": 180532096, + "step": 83655 + }, + { + "epoch": 13.647634584013051, + "grad_norm": 0.06970806419849396, + "learning_rate": 0.00027709858909833823, + "loss": 0.0082, + "num_input_tokens_seen": 180543264, + "step": 83660 + }, + { + "epoch": 13.648450244698205, + "grad_norm": 0.0776260644197464, + "learning_rate": 0.000277034876077997, + "loss": 0.0026, + "num_input_tokens_seen": 180553408, + "step": 83665 + }, + { + "epoch": 13.649265905383361, + "grad_norm": 0.03109067678451538, + "learning_rate": 0.00027697116757632677, + "loss": 0.0013, + "num_input_tokens_seen": 180563520, + "step": 83670 + }, + { + "epoch": 13.650081566068515, + "grad_norm": 0.0013858119491487741, + "learning_rate": 0.0002769074635946188, + "loss": 0.0014, + "num_input_tokens_seen": 180574528, + "step": 83675 + }, + { + "epoch": 13.65089722675367, + "grad_norm": 0.0012028936762362719, + "learning_rate": 0.0002768437641341641, + "loss": 0.0016, + "num_input_tokens_seen": 180585536, + "step": 83680 + }, + { + "epoch": 13.651712887438826, + "grad_norm": 0.0024049426428973675, + "learning_rate": 0.00027678006919625367, + "loss": 0.0489, + "num_input_tokens_seen": 180596480, + "step": 83685 + }, + { + "epoch": 13.65252854812398, + "grad_norm": 0.014546342194080353, + "learning_rate": 0.00027671637878217824, + "loss": 0.0063, + "num_input_tokens_seen": 180607936, + "step": 83690 + }, + { + "epoch": 13.653344208809136, + "grad_norm": 0.002264510141685605, + "learning_rate": 0.0002766526928932285, + "loss": 0.0035, + "num_input_tokens_seen": 180618720, + "step": 83695 + }, + { + "epoch": 13.65415986949429, + "grad_norm": 0.06656309962272644, + "learning_rate": 0.0002765890115306956, + "loss": 0.0048, + "num_input_tokens_seen": 180630144, + "step": 83700 + }, + { + "epoch": 13.654975530179446, + "grad_norm": 0.01725471019744873, + "learning_rate": 0.0002765253346958695, + "loss": 0.0383, + "num_input_tokens_seen": 180641888, + "step": 83705 + }, + { + "epoch": 13.655791190864601, + "grad_norm": 0.00910930521786213, + "learning_rate": 0.00027646166239004134, + "loss": 0.0144, + "num_input_tokens_seen": 180653344, + "step": 83710 + }, + { + "epoch": 13.656606851549755, + "grad_norm": 0.0024980742018669844, + "learning_rate": 0.0002763979946145008, + "loss": 0.0019, + "num_input_tokens_seen": 180664480, + "step": 83715 + }, + { + "epoch": 13.65742251223491, + "grad_norm": 0.0005633598775602877, + "learning_rate": 0.00027633433137053885, + "loss": 0.0022, + "num_input_tokens_seen": 180674944, + "step": 83720 + }, + { + "epoch": 13.658238172920065, + "grad_norm": 0.0008412788738496602, + "learning_rate": 0.00027627067265944514, + "loss": 0.0026, + "num_input_tokens_seen": 180686688, + "step": 83725 + }, + { + "epoch": 13.65905383360522, + "grad_norm": 0.019468821585178375, + "learning_rate": 0.0002762070184825104, + "loss": 0.0045, + "num_input_tokens_seen": 180696960, + "step": 83730 + }, + { + "epoch": 13.659869494290374, + "grad_norm": 0.005795237608253956, + "learning_rate": 0.00027614336884102393, + "loss": 0.0012, + "num_input_tokens_seen": 180706784, + "step": 83735 + }, + { + "epoch": 13.66068515497553, + "grad_norm": 0.0278521329164505, + "learning_rate": 0.0002760797237362765, + "loss": 0.0053, + "num_input_tokens_seen": 180717184, + "step": 83740 + }, + { + "epoch": 13.661500815660686, + "grad_norm": 0.0005541770369745791, + "learning_rate": 0.00027601608316955715, + "loss": 0.0077, + "num_input_tokens_seen": 180727360, + "step": 83745 + }, + { + "epoch": 13.66231647634584, + "grad_norm": 0.770077645778656, + "learning_rate": 0.0002759524471421562, + "loss": 0.0883, + "num_input_tokens_seen": 180739040, + "step": 83750 + }, + { + "epoch": 13.663132137030995, + "grad_norm": 0.006951437331736088, + "learning_rate": 0.00027588881565536303, + "loss": 0.0266, + "num_input_tokens_seen": 180749024, + "step": 83755 + }, + { + "epoch": 13.66394779771615, + "grad_norm": 0.004421426448971033, + "learning_rate": 0.00027582518871046744, + "loss": 0.2015, + "num_input_tokens_seen": 180760224, + "step": 83760 + }, + { + "epoch": 13.664763458401305, + "grad_norm": 0.007538532838225365, + "learning_rate": 0.00027576156630875875, + "loss": 0.0028, + "num_input_tokens_seen": 180770560, + "step": 83765 + }, + { + "epoch": 13.66557911908646, + "grad_norm": 0.003952084109187126, + "learning_rate": 0.0002756979484515264, + "loss": 0.0021, + "num_input_tokens_seen": 180781984, + "step": 83770 + }, + { + "epoch": 13.666394779771615, + "grad_norm": 0.0004522344097495079, + "learning_rate": 0.00027563433514005966, + "loss": 0.0018, + "num_input_tokens_seen": 180792992, + "step": 83775 + }, + { + "epoch": 13.66721044045677, + "grad_norm": 0.0006882618763484061, + "learning_rate": 0.0002755707263756477, + "loss": 0.0022, + "num_input_tokens_seen": 180802976, + "step": 83780 + }, + { + "epoch": 13.668026101141924, + "grad_norm": 0.026845784857869148, + "learning_rate": 0.0002755071221595798, + "loss": 0.0025, + "num_input_tokens_seen": 180813664, + "step": 83785 + }, + { + "epoch": 13.66884176182708, + "grad_norm": 0.000679068558383733, + "learning_rate": 0.0002754435224931447, + "loss": 0.0041, + "num_input_tokens_seen": 180823968, + "step": 83790 + }, + { + "epoch": 13.669657422512234, + "grad_norm": 0.0009350479231216013, + "learning_rate": 0.00027537992737763163, + "loss": 0.0115, + "num_input_tokens_seen": 180834528, + "step": 83795 + }, + { + "epoch": 13.67047308319739, + "grad_norm": 0.0014833023305982351, + "learning_rate": 0.00027531633681432925, + "loss": 0.0016, + "num_input_tokens_seen": 180846176, + "step": 83800 + }, + { + "epoch": 13.671288743882545, + "grad_norm": 0.00406976044178009, + "learning_rate": 0.0002752527508045263, + "loss": 0.0073, + "num_input_tokens_seen": 180855744, + "step": 83805 + }, + { + "epoch": 13.6721044045677, + "grad_norm": 0.017132241278886795, + "learning_rate": 0.0002751891693495115, + "loss": 0.0043, + "num_input_tokens_seen": 180867168, + "step": 83810 + }, + { + "epoch": 13.672920065252855, + "grad_norm": 0.020292937755584717, + "learning_rate": 0.00027512559245057333, + "loss": 0.0056, + "num_input_tokens_seen": 180878048, + "step": 83815 + }, + { + "epoch": 13.673735725938009, + "grad_norm": 0.4274173378944397, + "learning_rate": 0.00027506202010900037, + "loss": 0.0133, + "num_input_tokens_seen": 180889440, + "step": 83820 + }, + { + "epoch": 13.674551386623165, + "grad_norm": 0.0004901851061731577, + "learning_rate": 0.00027499845232608087, + "loss": 0.0032, + "num_input_tokens_seen": 180899584, + "step": 83825 + }, + { + "epoch": 13.67536704730832, + "grad_norm": 0.01101384125649929, + "learning_rate": 0.00027493488910310316, + "loss": 0.0043, + "num_input_tokens_seen": 180910624, + "step": 83830 + }, + { + "epoch": 13.676182707993474, + "grad_norm": 0.08665505796670914, + "learning_rate": 0.0002748713304413555, + "loss": 0.0034, + "num_input_tokens_seen": 180920928, + "step": 83835 + }, + { + "epoch": 13.67699836867863, + "grad_norm": 0.036097027361392975, + "learning_rate": 0.0002748077763421257, + "loss": 0.0057, + "num_input_tokens_seen": 180930048, + "step": 83840 + }, + { + "epoch": 13.677814029363784, + "grad_norm": 0.004113186616450548, + "learning_rate": 0.0002747442268067024, + "loss": 0.003, + "num_input_tokens_seen": 180942368, + "step": 83845 + }, + { + "epoch": 13.67862969004894, + "grad_norm": 0.0017188480123877525, + "learning_rate": 0.00027468068183637265, + "loss": 0.0139, + "num_input_tokens_seen": 180954400, + "step": 83850 + }, + { + "epoch": 13.679445350734095, + "grad_norm": 0.010012054815888405, + "learning_rate": 0.0002746171414324249, + "loss": 0.0054, + "num_input_tokens_seen": 180966400, + "step": 83855 + }, + { + "epoch": 13.68026101141925, + "grad_norm": 0.0034215140622109175, + "learning_rate": 0.00027455360559614677, + "loss": 0.0009, + "num_input_tokens_seen": 180976480, + "step": 83860 + }, + { + "epoch": 13.681076672104405, + "grad_norm": 0.006984212435781956, + "learning_rate": 0.00027449007432882576, + "loss": 0.1199, + "num_input_tokens_seen": 180987808, + "step": 83865 + }, + { + "epoch": 13.681892332789559, + "grad_norm": 0.0009208376286551356, + "learning_rate": 0.00027442654763174955, + "loss": 0.0012, + "num_input_tokens_seen": 180997088, + "step": 83870 + }, + { + "epoch": 13.682707993474715, + "grad_norm": 0.15933479368686676, + "learning_rate": 0.00027436302550620545, + "loss": 0.0192, + "num_input_tokens_seen": 181008352, + "step": 83875 + }, + { + "epoch": 13.68352365415987, + "grad_norm": 0.0363893136382103, + "learning_rate": 0.0002742995079534809, + "loss": 0.0022, + "num_input_tokens_seen": 181020256, + "step": 83880 + }, + { + "epoch": 13.684339314845024, + "grad_norm": 0.20296135544776917, + "learning_rate": 0.0002742359949748632, + "loss": 0.0119, + "num_input_tokens_seen": 181031392, + "step": 83885 + }, + { + "epoch": 13.68515497553018, + "grad_norm": 0.0014104736037552357, + "learning_rate": 0.0002741724865716394, + "loss": 0.11, + "num_input_tokens_seen": 181042304, + "step": 83890 + }, + { + "epoch": 13.685970636215334, + "grad_norm": 0.2732338309288025, + "learning_rate": 0.0002741089827450966, + "loss": 0.0141, + "num_input_tokens_seen": 181052960, + "step": 83895 + }, + { + "epoch": 13.68678629690049, + "grad_norm": 0.000993990572169423, + "learning_rate": 0.0002740454834965219, + "loss": 0.0017, + "num_input_tokens_seen": 181064096, + "step": 83900 + }, + { + "epoch": 13.687601957585644, + "grad_norm": 0.001881644711829722, + "learning_rate": 0.0002739819888272021, + "loss": 0.0153, + "num_input_tokens_seen": 181074112, + "step": 83905 + }, + { + "epoch": 13.6884176182708, + "grad_norm": 0.0005248187808319926, + "learning_rate": 0.000273918498738424, + "loss": 0.0043, + "num_input_tokens_seen": 181083520, + "step": 83910 + }, + { + "epoch": 13.689233278955955, + "grad_norm": 0.16830438375473022, + "learning_rate": 0.00027385501323147433, + "loss": 0.0092, + "num_input_tokens_seen": 181093984, + "step": 83915 + }, + { + "epoch": 13.690048939641109, + "grad_norm": 0.04078533127903938, + "learning_rate": 0.00027379153230763976, + "loss": 0.0915, + "num_input_tokens_seen": 181104096, + "step": 83920 + }, + { + "epoch": 13.690864600326265, + "grad_norm": 0.005261866841465235, + "learning_rate": 0.00027372805596820673, + "loss": 0.1228, + "num_input_tokens_seen": 181114816, + "step": 83925 + }, + { + "epoch": 13.691680261011419, + "grad_norm": 0.006824697833508253, + "learning_rate": 0.0002736645842144616, + "loss": 0.0076, + "num_input_tokens_seen": 181125664, + "step": 83930 + }, + { + "epoch": 13.692495921696574, + "grad_norm": 0.0026527964510023594, + "learning_rate": 0.00027360111704769093, + "loss": 0.0011, + "num_input_tokens_seen": 181135136, + "step": 83935 + }, + { + "epoch": 13.69331158238173, + "grad_norm": 0.04314657673239708, + "learning_rate": 0.00027353765446918075, + "loss": 0.0067, + "num_input_tokens_seen": 181147808, + "step": 83940 + }, + { + "epoch": 13.694127243066884, + "grad_norm": 0.25260767340660095, + "learning_rate": 0.0002734741964802173, + "loss": 0.0096, + "num_input_tokens_seen": 181159584, + "step": 83945 + }, + { + "epoch": 13.69494290375204, + "grad_norm": 0.00410098722204566, + "learning_rate": 0.00027341074308208667, + "loss": 0.0008, + "num_input_tokens_seen": 181170496, + "step": 83950 + }, + { + "epoch": 13.695758564437194, + "grad_norm": 0.0006729807355441153, + "learning_rate": 0.00027334729427607476, + "loss": 0.1375, + "num_input_tokens_seen": 181181120, + "step": 83955 + }, + { + "epoch": 13.69657422512235, + "grad_norm": 0.0018042154842987657, + "learning_rate": 0.00027328385006346746, + "loss": 0.1309, + "num_input_tokens_seen": 181192352, + "step": 83960 + }, + { + "epoch": 13.697389885807503, + "grad_norm": 0.019949940964579582, + "learning_rate": 0.00027322041044555045, + "loss": 0.0014, + "num_input_tokens_seen": 181202112, + "step": 83965 + }, + { + "epoch": 13.698205546492659, + "grad_norm": 0.02108342945575714, + "learning_rate": 0.00027315697542360944, + "loss": 0.0185, + "num_input_tokens_seen": 181213760, + "step": 83970 + }, + { + "epoch": 13.699021207177815, + "grad_norm": 0.02910441905260086, + "learning_rate": 0.00027309354499893045, + "loss": 0.0157, + "num_input_tokens_seen": 181223104, + "step": 83975 + }, + { + "epoch": 13.699836867862969, + "grad_norm": 0.036773670464754105, + "learning_rate": 0.00027303011917279826, + "loss": 0.0083, + "num_input_tokens_seen": 181232512, + "step": 83980 + }, + { + "epoch": 13.700652528548124, + "grad_norm": 0.0448458194732666, + "learning_rate": 0.00027296669794649875, + "loss": 0.0068, + "num_input_tokens_seen": 181243968, + "step": 83985 + }, + { + "epoch": 13.701468189233278, + "grad_norm": 0.0030712252482771873, + "learning_rate": 0.0002729032813213172, + "loss": 0.0035, + "num_input_tokens_seen": 181253856, + "step": 83990 + }, + { + "epoch": 13.702283849918434, + "grad_norm": 0.006518937647342682, + "learning_rate": 0.00027283986929853873, + "loss": 0.0057, + "num_input_tokens_seen": 181264736, + "step": 83995 + }, + { + "epoch": 13.70309951060359, + "grad_norm": 0.0027828472666442394, + "learning_rate": 0.0002727764618794485, + "loss": 0.0028, + "num_input_tokens_seen": 181274976, + "step": 84000 + }, + { + "epoch": 13.703915171288743, + "grad_norm": 0.3594715893268585, + "learning_rate": 0.00027271305906533146, + "loss": 0.0213, + "num_input_tokens_seen": 181285344, + "step": 84005 + }, + { + "epoch": 13.7047308319739, + "grad_norm": 1.1852052211761475, + "learning_rate": 0.00027264966085747267, + "loss": 0.0842, + "num_input_tokens_seen": 181297024, + "step": 84010 + }, + { + "epoch": 13.705546492659053, + "grad_norm": 0.016683485358953476, + "learning_rate": 0.00027258626725715684, + "loss": 0.0963, + "num_input_tokens_seen": 181308032, + "step": 84015 + }, + { + "epoch": 13.706362153344209, + "grad_norm": 0.028316011652350426, + "learning_rate": 0.0002725228782656689, + "loss": 0.028, + "num_input_tokens_seen": 181319232, + "step": 84020 + }, + { + "epoch": 13.707177814029365, + "grad_norm": 0.01709996536374092, + "learning_rate": 0.00027245949388429334, + "loss": 0.0057, + "num_input_tokens_seen": 181329472, + "step": 84025 + }, + { + "epoch": 13.707993474714518, + "grad_norm": 0.002598293125629425, + "learning_rate": 0.0002723961141143148, + "loss": 0.0104, + "num_input_tokens_seen": 181340448, + "step": 84030 + }, + { + "epoch": 13.708809135399674, + "grad_norm": 0.002267255214974284, + "learning_rate": 0.0002723327389570177, + "loss": 0.0157, + "num_input_tokens_seen": 181352064, + "step": 84035 + }, + { + "epoch": 13.709624796084828, + "grad_norm": 0.5322057008743286, + "learning_rate": 0.00027226936841368655, + "loss": 0.0813, + "num_input_tokens_seen": 181363360, + "step": 84040 + }, + { + "epoch": 13.710440456769984, + "grad_norm": 0.007359120063483715, + "learning_rate": 0.00027220600248560557, + "loss": 0.0052, + "num_input_tokens_seen": 181374432, + "step": 84045 + }, + { + "epoch": 13.71125611745514, + "grad_norm": 0.08291971683502197, + "learning_rate": 0.00027214264117405884, + "loss": 0.0396, + "num_input_tokens_seen": 181385472, + "step": 84050 + }, + { + "epoch": 13.712071778140293, + "grad_norm": 0.08208156377077103, + "learning_rate": 0.0002720792844803306, + "loss": 0.0272, + "num_input_tokens_seen": 181396896, + "step": 84055 + }, + { + "epoch": 13.71288743882545, + "grad_norm": 0.0005403195391409099, + "learning_rate": 0.00027201593240570475, + "loss": 0.004, + "num_input_tokens_seen": 181406912, + "step": 84060 + }, + { + "epoch": 13.713703099510603, + "grad_norm": 0.00033968701609410346, + "learning_rate": 0.00027195258495146525, + "loss": 0.0016, + "num_input_tokens_seen": 181417216, + "step": 84065 + }, + { + "epoch": 13.714518760195759, + "grad_norm": 0.013393732719123363, + "learning_rate": 0.00027188924211889593, + "loss": 0.0025, + "num_input_tokens_seen": 181428384, + "step": 84070 + }, + { + "epoch": 13.715334420880914, + "grad_norm": 0.3212685286998749, + "learning_rate": 0.0002718259039092803, + "loss": 0.0168, + "num_input_tokens_seen": 181439168, + "step": 84075 + }, + { + "epoch": 13.716150081566068, + "grad_norm": 0.011765568517148495, + "learning_rate": 0.0002717625703239026, + "loss": 0.0555, + "num_input_tokens_seen": 181449536, + "step": 84080 + }, + { + "epoch": 13.716965742251224, + "grad_norm": 0.007251157891005278, + "learning_rate": 0.00027169924136404553, + "loss": 0.0986, + "num_input_tokens_seen": 181459648, + "step": 84085 + }, + { + "epoch": 13.717781402936378, + "grad_norm": 0.003333869157359004, + "learning_rate": 0.00027163591703099335, + "loss": 0.0147, + "num_input_tokens_seen": 181469568, + "step": 84090 + }, + { + "epoch": 13.718597063621534, + "grad_norm": 0.0059239561669528484, + "learning_rate": 0.0002715725973260286, + "loss": 0.0053, + "num_input_tokens_seen": 181480512, + "step": 84095 + }, + { + "epoch": 13.719412724306688, + "grad_norm": 0.0016899576876312494, + "learning_rate": 0.00027150928225043545, + "loss": 0.0075, + "num_input_tokens_seen": 181491072, + "step": 84100 + }, + { + "epoch": 13.720228384991843, + "grad_norm": 0.003943632822483778, + "learning_rate": 0.00027144597180549603, + "loss": 0.0021, + "num_input_tokens_seen": 181503392, + "step": 84105 + }, + { + "epoch": 13.721044045676999, + "grad_norm": 0.008378212340176105, + "learning_rate": 0.0002713826659924944, + "loss": 0.0146, + "num_input_tokens_seen": 181513888, + "step": 84110 + }, + { + "epoch": 13.721859706362153, + "grad_norm": 0.004583362489938736, + "learning_rate": 0.00027131936481271265, + "loss": 0.0036, + "num_input_tokens_seen": 181524416, + "step": 84115 + }, + { + "epoch": 13.722675367047309, + "grad_norm": 0.0014328404795378447, + "learning_rate": 0.00027125606826743445, + "loss": 0.0017, + "num_input_tokens_seen": 181535584, + "step": 84120 + }, + { + "epoch": 13.723491027732463, + "grad_norm": 0.0007077969494275749, + "learning_rate": 0.0002711927763579418, + "loss": 0.0696, + "num_input_tokens_seen": 181546080, + "step": 84125 + }, + { + "epoch": 13.724306688417618, + "grad_norm": 0.0018838716205209494, + "learning_rate": 0.00027112948908551807, + "loss": 0.0043, + "num_input_tokens_seen": 181557824, + "step": 84130 + }, + { + "epoch": 13.725122349102774, + "grad_norm": 0.03190400451421738, + "learning_rate": 0.00027106620645144555, + "loss": 0.0427, + "num_input_tokens_seen": 181567360, + "step": 84135 + }, + { + "epoch": 13.725938009787928, + "grad_norm": 0.014288217760622501, + "learning_rate": 0.00027100292845700676, + "loss": 0.0016, + "num_input_tokens_seen": 181578528, + "step": 84140 + }, + { + "epoch": 13.726753670473084, + "grad_norm": 0.003423146903514862, + "learning_rate": 0.0002709396551034842, + "loss": 0.001, + "num_input_tokens_seen": 181589568, + "step": 84145 + }, + { + "epoch": 13.727569331158238, + "grad_norm": 0.002322500105947256, + "learning_rate": 0.00027087638639215994, + "loss": 0.0106, + "num_input_tokens_seen": 181601248, + "step": 84150 + }, + { + "epoch": 13.728384991843393, + "grad_norm": 0.8441523313522339, + "learning_rate": 0.00027081312232431654, + "loss": 0.0299, + "num_input_tokens_seen": 181611680, + "step": 84155 + }, + { + "epoch": 13.729200652528547, + "grad_norm": 0.0011434407206252217, + "learning_rate": 0.00027074986290123596, + "loss": 0.0069, + "num_input_tokens_seen": 181622560, + "step": 84160 + }, + { + "epoch": 13.730016313213703, + "grad_norm": 0.12115272134542465, + "learning_rate": 0.0002706866081242001, + "loss": 0.0911, + "num_input_tokens_seen": 181633408, + "step": 84165 + }, + { + "epoch": 13.730831973898859, + "grad_norm": 0.02553938329219818, + "learning_rate": 0.0002706233579944911, + "loss": 0.0237, + "num_input_tokens_seen": 181644320, + "step": 84170 + }, + { + "epoch": 13.731647634584013, + "grad_norm": 0.028921717777848244, + "learning_rate": 0.00027056011251339073, + "loss": 0.0157, + "num_input_tokens_seen": 181654688, + "step": 84175 + }, + { + "epoch": 13.732463295269168, + "grad_norm": 0.008873935788869858, + "learning_rate": 0.0002704968716821806, + "loss": 0.0165, + "num_input_tokens_seen": 181663936, + "step": 84180 + }, + { + "epoch": 13.733278955954322, + "grad_norm": 0.002034940291196108, + "learning_rate": 0.00027043363550214287, + "loss": 0.0034, + "num_input_tokens_seen": 181673664, + "step": 84185 + }, + { + "epoch": 13.734094616639478, + "grad_norm": 1.478367567062378, + "learning_rate": 0.00027037040397455837, + "loss": 0.0387, + "num_input_tokens_seen": 181684320, + "step": 84190 + }, + { + "epoch": 13.734910277324634, + "grad_norm": 0.17809708416461945, + "learning_rate": 0.0002703071771007093, + "loss": 0.0092, + "num_input_tokens_seen": 181695616, + "step": 84195 + }, + { + "epoch": 13.735725938009788, + "grad_norm": 0.7050788402557373, + "learning_rate": 0.0002702439548818763, + "loss": 0.1096, + "num_input_tokens_seen": 181704512, + "step": 84200 + }, + { + "epoch": 13.736541598694943, + "grad_norm": 0.0029484990518540144, + "learning_rate": 0.0002701807373193414, + "loss": 0.0129, + "num_input_tokens_seen": 181715552, + "step": 84205 + }, + { + "epoch": 13.737357259380097, + "grad_norm": 0.042970869690179825, + "learning_rate": 0.000270117524414385, + "loss": 0.0099, + "num_input_tokens_seen": 181726240, + "step": 84210 + }, + { + "epoch": 13.738172920065253, + "grad_norm": 0.011517154052853584, + "learning_rate": 0.000270054316168289, + "loss": 0.0091, + "num_input_tokens_seen": 181737280, + "step": 84215 + }, + { + "epoch": 13.738988580750409, + "grad_norm": 0.3823186457157135, + "learning_rate": 0.0002699911125823336, + "loss": 0.1071, + "num_input_tokens_seen": 181747968, + "step": 84220 + }, + { + "epoch": 13.739804241435563, + "grad_norm": 0.0030795957427471876, + "learning_rate": 0.0002699279136578005, + "loss": 0.0051, + "num_input_tokens_seen": 181759616, + "step": 84225 + }, + { + "epoch": 13.740619902120718, + "grad_norm": 0.18962961435317993, + "learning_rate": 0.0002698647193959697, + "loss": 0.0689, + "num_input_tokens_seen": 181771168, + "step": 84230 + }, + { + "epoch": 13.741435562805872, + "grad_norm": 1.2635283470153809, + "learning_rate": 0.00026980152979812265, + "loss": 0.0273, + "num_input_tokens_seen": 181781568, + "step": 84235 + }, + { + "epoch": 13.742251223491028, + "grad_norm": 0.00022593745961785316, + "learning_rate": 0.0002697383448655393, + "loss": 0.0024, + "num_input_tokens_seen": 181791424, + "step": 84240 + }, + { + "epoch": 13.743066884176184, + "grad_norm": 0.0008522254647687078, + "learning_rate": 0.00026967516459950084, + "loss": 0.0157, + "num_input_tokens_seen": 181801376, + "step": 84245 + }, + { + "epoch": 13.743882544861338, + "grad_norm": 0.0019153539324179292, + "learning_rate": 0.000269611989001287, + "loss": 0.0117, + "num_input_tokens_seen": 181812736, + "step": 84250 + }, + { + "epoch": 13.744698205546493, + "grad_norm": 0.35034364461898804, + "learning_rate": 0.0002695488180721789, + "loss": 0.0188, + "num_input_tokens_seen": 181822784, + "step": 84255 + }, + { + "epoch": 13.745513866231647, + "grad_norm": 0.0017476509092375636, + "learning_rate": 0.0002694856518134559, + "loss": 0.0065, + "num_input_tokens_seen": 181833504, + "step": 84260 + }, + { + "epoch": 13.746329526916803, + "grad_norm": 0.007435102481395006, + "learning_rate": 0.000269422490226399, + "loss": 0.0343, + "num_input_tokens_seen": 181844768, + "step": 84265 + }, + { + "epoch": 13.747145187601957, + "grad_norm": 0.0012154619907960296, + "learning_rate": 0.00026935933331228743, + "loss": 0.0026, + "num_input_tokens_seen": 181855456, + "step": 84270 + }, + { + "epoch": 13.747960848287113, + "grad_norm": 0.00038822778151370585, + "learning_rate": 0.00026929618107240173, + "loss": 0.0056, + "num_input_tokens_seen": 181867648, + "step": 84275 + }, + { + "epoch": 13.748776508972268, + "grad_norm": 0.12923280894756317, + "learning_rate": 0.0002692330335080216, + "loss": 0.0136, + "num_input_tokens_seen": 181877472, + "step": 84280 + }, + { + "epoch": 13.749592169657422, + "grad_norm": 0.0010687632020562887, + "learning_rate": 0.00026916989062042684, + "loss": 0.0052, + "num_input_tokens_seen": 181888224, + "step": 84285 + }, + { + "epoch": 13.750407830342578, + "grad_norm": 0.058179449290037155, + "learning_rate": 0.0002691067524108971, + "loss": 0.0103, + "num_input_tokens_seen": 181899616, + "step": 84290 + }, + { + "epoch": 13.751223491027732, + "grad_norm": 0.0022791014052927494, + "learning_rate": 0.00026904361888071193, + "loss": 0.0214, + "num_input_tokens_seen": 181910624, + "step": 84295 + }, + { + "epoch": 13.752039151712887, + "grad_norm": 0.0633174255490303, + "learning_rate": 0.0002689804900311508, + "loss": 0.2135, + "num_input_tokens_seen": 181920416, + "step": 84300 + }, + { + "epoch": 13.752854812398043, + "grad_norm": 0.0012328175362199545, + "learning_rate": 0.000268917365863493, + "loss": 0.0783, + "num_input_tokens_seen": 181931488, + "step": 84305 + }, + { + "epoch": 13.753670473083197, + "grad_norm": 0.03584255278110504, + "learning_rate": 0.000268854246379018, + "loss": 0.0071, + "num_input_tokens_seen": 181942272, + "step": 84310 + }, + { + "epoch": 13.754486133768353, + "grad_norm": 0.0016830944223329425, + "learning_rate": 0.00026879113157900496, + "loss": 0.0032, + "num_input_tokens_seen": 181952448, + "step": 84315 + }, + { + "epoch": 13.755301794453507, + "grad_norm": 0.0003825765452347696, + "learning_rate": 0.00026872802146473296, + "loss": 0.0844, + "num_input_tokens_seen": 181962976, + "step": 84320 + }, + { + "epoch": 13.756117455138662, + "grad_norm": 0.26183411478996277, + "learning_rate": 0.0002686649160374808, + "loss": 0.0147, + "num_input_tokens_seen": 181972672, + "step": 84325 + }, + { + "epoch": 13.756933115823816, + "grad_norm": 0.011627059429883957, + "learning_rate": 0.0002686018152985279, + "loss": 0.0044, + "num_input_tokens_seen": 181983616, + "step": 84330 + }, + { + "epoch": 13.757748776508972, + "grad_norm": 0.01215373259037733, + "learning_rate": 0.0002685387192491524, + "loss": 0.0022, + "num_input_tokens_seen": 181995296, + "step": 84335 + }, + { + "epoch": 13.758564437194128, + "grad_norm": 0.04153754934668541, + "learning_rate": 0.0002684756278906338, + "loss": 0.1325, + "num_input_tokens_seen": 182006016, + "step": 84340 + }, + { + "epoch": 13.759380097879282, + "grad_norm": 0.03424030542373657, + "learning_rate": 0.0002684125412242499, + "loss": 0.0113, + "num_input_tokens_seen": 182016800, + "step": 84345 + }, + { + "epoch": 13.760195758564437, + "grad_norm": 0.019248194992542267, + "learning_rate": 0.00026834945925128005, + "loss": 0.0174, + "num_input_tokens_seen": 182027392, + "step": 84350 + }, + { + "epoch": 13.761011419249591, + "grad_norm": 0.06448715180158615, + "learning_rate": 0.00026828638197300185, + "loss": 0.042, + "num_input_tokens_seen": 182039232, + "step": 84355 + }, + { + "epoch": 13.761827079934747, + "grad_norm": 0.0089756790548563, + "learning_rate": 0.0002682233093906945, + "loss": 0.017, + "num_input_tokens_seen": 182050848, + "step": 84360 + }, + { + "epoch": 13.762642740619903, + "grad_norm": 0.022131484001874924, + "learning_rate": 0.00026816024150563546, + "loss": 0.0032, + "num_input_tokens_seen": 182061088, + "step": 84365 + }, + { + "epoch": 13.763458401305057, + "grad_norm": 0.0016134614124894142, + "learning_rate": 0.00026809717831910353, + "loss": 0.0041, + "num_input_tokens_seen": 182072064, + "step": 84370 + }, + { + "epoch": 13.764274061990212, + "grad_norm": 0.0008344739908352494, + "learning_rate": 0.0002680341198323761, + "loss": 0.0022, + "num_input_tokens_seen": 182082272, + "step": 84375 + }, + { + "epoch": 13.765089722675366, + "grad_norm": 0.03828657791018486, + "learning_rate": 0.0002679710660467319, + "loss": 0.0056, + "num_input_tokens_seen": 182094176, + "step": 84380 + }, + { + "epoch": 13.765905383360522, + "grad_norm": 0.003431290853768587, + "learning_rate": 0.00026790801696344814, + "loss": 0.006, + "num_input_tokens_seen": 182105600, + "step": 84385 + }, + { + "epoch": 13.766721044045678, + "grad_norm": 0.009849620051681995, + "learning_rate": 0.00026784497258380293, + "loss": 0.0049, + "num_input_tokens_seen": 182117120, + "step": 84390 + }, + { + "epoch": 13.767536704730832, + "grad_norm": 0.0035629882477223873, + "learning_rate": 0.0002677819329090738, + "loss": 0.0234, + "num_input_tokens_seen": 182128032, + "step": 84395 + }, + { + "epoch": 13.768352365415987, + "grad_norm": 0.005654531996697187, + "learning_rate": 0.00026771889794053845, + "loss": 0.0284, + "num_input_tokens_seen": 182139104, + "step": 84400 + }, + { + "epoch": 13.769168026101141, + "grad_norm": 0.014167606830596924, + "learning_rate": 0.00026765586767947433, + "loss": 0.0096, + "num_input_tokens_seen": 182149024, + "step": 84405 + }, + { + "epoch": 13.769983686786297, + "grad_norm": 0.007144684437662363, + "learning_rate": 0.00026759284212715873, + "loss": 0.0348, + "num_input_tokens_seen": 182160352, + "step": 84410 + }, + { + "epoch": 13.770799347471453, + "grad_norm": 0.007787704933434725, + "learning_rate": 0.000267529821284869, + "loss": 0.0031, + "num_input_tokens_seen": 182172000, + "step": 84415 + }, + { + "epoch": 13.771615008156607, + "grad_norm": 0.03676965460181236, + "learning_rate": 0.0002674668051538824, + "loss": 0.0054, + "num_input_tokens_seen": 182182624, + "step": 84420 + }, + { + "epoch": 13.772430668841762, + "grad_norm": 0.34223800897598267, + "learning_rate": 0.0002674037937354761, + "loss": 0.0132, + "num_input_tokens_seen": 182193152, + "step": 84425 + }, + { + "epoch": 13.773246329526916, + "grad_norm": 0.04355357587337494, + "learning_rate": 0.00026734078703092684, + "loss": 0.003, + "num_input_tokens_seen": 182203456, + "step": 84430 + }, + { + "epoch": 13.774061990212072, + "grad_norm": 0.0028624169062823057, + "learning_rate": 0.0002672777850415117, + "loss": 0.0054, + "num_input_tokens_seen": 182213728, + "step": 84435 + }, + { + "epoch": 13.774877650897226, + "grad_norm": 0.008388065733015537, + "learning_rate": 0.0002672147877685075, + "loss": 0.0042, + "num_input_tokens_seen": 182224832, + "step": 84440 + }, + { + "epoch": 13.775693311582382, + "grad_norm": 0.6083003282546997, + "learning_rate": 0.00026715179521319095, + "loss": 0.0348, + "num_input_tokens_seen": 182235168, + "step": 84445 + }, + { + "epoch": 13.776508972267537, + "grad_norm": 0.023990215733647346, + "learning_rate": 0.00026708880737683863, + "loss": 0.001, + "num_input_tokens_seen": 182246336, + "step": 84450 + }, + { + "epoch": 13.777324632952691, + "grad_norm": 0.01223207451403141, + "learning_rate": 0.00026702582426072705, + "loss": 0.0023, + "num_input_tokens_seen": 182257920, + "step": 84455 + }, + { + "epoch": 13.778140293637847, + "grad_norm": 0.0019430754473432899, + "learning_rate": 0.0002669628458661326, + "loss": 0.002, + "num_input_tokens_seen": 182267840, + "step": 84460 + }, + { + "epoch": 13.778955954323001, + "grad_norm": 0.2161453813314438, + "learning_rate": 0.000266899872194332, + "loss": 0.0227, + "num_input_tokens_seen": 182279328, + "step": 84465 + }, + { + "epoch": 13.779771615008157, + "grad_norm": 0.09039250761270523, + "learning_rate": 0.0002668369032466009, + "loss": 0.009, + "num_input_tokens_seen": 182291168, + "step": 84470 + }, + { + "epoch": 13.780587275693312, + "grad_norm": 0.0006417245022021234, + "learning_rate": 0.0002667739390242161, + "loss": 0.0028, + "num_input_tokens_seen": 182302144, + "step": 84475 + }, + { + "epoch": 13.781402936378466, + "grad_norm": 0.004802557174116373, + "learning_rate": 0.00026671097952845284, + "loss": 0.0322, + "num_input_tokens_seen": 182313248, + "step": 84480 + }, + { + "epoch": 13.782218597063622, + "grad_norm": 0.2346547693014145, + "learning_rate": 0.00026664802476058803, + "loss": 0.0154, + "num_input_tokens_seen": 182323968, + "step": 84485 + }, + { + "epoch": 13.783034257748776, + "grad_norm": 0.003233562922105193, + "learning_rate": 0.00026658507472189654, + "loss": 0.0072, + "num_input_tokens_seen": 182334848, + "step": 84490 + }, + { + "epoch": 13.783849918433932, + "grad_norm": 0.00045367670827545226, + "learning_rate": 0.0002665221294136548, + "loss": 0.0085, + "num_input_tokens_seen": 182346176, + "step": 84495 + }, + { + "epoch": 13.784665579119086, + "grad_norm": 0.0028642553370445967, + "learning_rate": 0.0002664591888371384, + "loss": 0.0052, + "num_input_tokens_seen": 182356832, + "step": 84500 + }, + { + "epoch": 13.785481239804241, + "grad_norm": 0.0077073657885193825, + "learning_rate": 0.00026639625299362276, + "loss": 0.0362, + "num_input_tokens_seen": 182367200, + "step": 84505 + }, + { + "epoch": 13.786296900489397, + "grad_norm": 0.0006326684961095452, + "learning_rate": 0.00026633332188438335, + "loss": 0.005, + "num_input_tokens_seen": 182377952, + "step": 84510 + }, + { + "epoch": 13.78711256117455, + "grad_norm": 1.0694580078125, + "learning_rate": 0.00026627039551069563, + "loss": 0.0384, + "num_input_tokens_seen": 182388352, + "step": 84515 + }, + { + "epoch": 13.787928221859707, + "grad_norm": 0.008516918867826462, + "learning_rate": 0.00026620747387383494, + "loss": 0.0414, + "num_input_tokens_seen": 182399424, + "step": 84520 + }, + { + "epoch": 13.78874388254486, + "grad_norm": 0.0007812812691554427, + "learning_rate": 0.0002661445569750762, + "loss": 0.0078, + "num_input_tokens_seen": 182411232, + "step": 84525 + }, + { + "epoch": 13.789559543230016, + "grad_norm": 0.003533656010404229, + "learning_rate": 0.00026608164481569486, + "loss": 0.0071, + "num_input_tokens_seen": 182422272, + "step": 84530 + }, + { + "epoch": 13.790375203915172, + "grad_norm": 0.09073392301797867, + "learning_rate": 0.0002660187373969656, + "loss": 0.0103, + "num_input_tokens_seen": 182433856, + "step": 84535 + }, + { + "epoch": 13.791190864600326, + "grad_norm": 0.006834371481090784, + "learning_rate": 0.00026595583472016355, + "loss": 0.0017, + "num_input_tokens_seen": 182444256, + "step": 84540 + }, + { + "epoch": 13.792006525285482, + "grad_norm": 0.011992399580776691, + "learning_rate": 0.00026589293678656336, + "loss": 0.0167, + "num_input_tokens_seen": 182455136, + "step": 84545 + }, + { + "epoch": 13.792822185970635, + "grad_norm": 0.0020787755493074656, + "learning_rate": 0.0002658300435974398, + "loss": 0.0025, + "num_input_tokens_seen": 182465344, + "step": 84550 + }, + { + "epoch": 13.793637846655791, + "grad_norm": 0.010554094798862934, + "learning_rate": 0.00026576715515406747, + "loss": 0.0024, + "num_input_tokens_seen": 182475968, + "step": 84555 + }, + { + "epoch": 13.794453507340947, + "grad_norm": 0.03928709402680397, + "learning_rate": 0.0002657042714577209, + "loss": 0.0079, + "num_input_tokens_seen": 182485440, + "step": 84560 + }, + { + "epoch": 13.7952691680261, + "grad_norm": 0.00439923582598567, + "learning_rate": 0.0002656413925096745, + "loss": 0.0029, + "num_input_tokens_seen": 182496000, + "step": 84565 + }, + { + "epoch": 13.796084828711257, + "grad_norm": 0.007453288417309523, + "learning_rate": 0.00026557851831120254, + "loss": 0.0056, + "num_input_tokens_seen": 182506336, + "step": 84570 + }, + { + "epoch": 13.79690048939641, + "grad_norm": 0.004348553717136383, + "learning_rate": 0.00026551564886357937, + "loss": 0.0026, + "num_input_tokens_seen": 182517568, + "step": 84575 + }, + { + "epoch": 13.797716150081566, + "grad_norm": 0.02317430078983307, + "learning_rate": 0.00026545278416807895, + "loss": 0.003, + "num_input_tokens_seen": 182528416, + "step": 84580 + }, + { + "epoch": 13.798531810766722, + "grad_norm": 0.33637887239456177, + "learning_rate": 0.00026538992422597547, + "loss": 0.0121, + "num_input_tokens_seen": 182538432, + "step": 84585 + }, + { + "epoch": 13.799347471451876, + "grad_norm": 0.0010666627204045653, + "learning_rate": 0.0002653270690385428, + "loss": 0.002, + "num_input_tokens_seen": 182549632, + "step": 84590 + }, + { + "epoch": 13.800163132137031, + "grad_norm": 0.0002323422668268904, + "learning_rate": 0.00026526421860705474, + "loss": 0.0077, + "num_input_tokens_seen": 182560768, + "step": 84595 + }, + { + "epoch": 13.800978792822185, + "grad_norm": 0.0007312102825380862, + "learning_rate": 0.0002652013729327849, + "loss": 0.0004, + "num_input_tokens_seen": 182572704, + "step": 84600 + }, + { + "epoch": 13.801794453507341, + "grad_norm": 0.5456418991088867, + "learning_rate": 0.00026513853201700727, + "loss": 0.1275, + "num_input_tokens_seen": 182583232, + "step": 84605 + }, + { + "epoch": 13.802610114192497, + "grad_norm": 0.01817525178194046, + "learning_rate": 0.00026507569586099527, + "loss": 0.0058, + "num_input_tokens_seen": 182594144, + "step": 84610 + }, + { + "epoch": 13.80342577487765, + "grad_norm": 0.001074093161150813, + "learning_rate": 0.0002650128644660223, + "loss": 0.0095, + "num_input_tokens_seen": 182604672, + "step": 84615 + }, + { + "epoch": 13.804241435562806, + "grad_norm": 0.01348531898111105, + "learning_rate": 0.0002649500378333617, + "loss": 0.0071, + "num_input_tokens_seen": 182615360, + "step": 84620 + }, + { + "epoch": 13.80505709624796, + "grad_norm": 0.002502233488485217, + "learning_rate": 0.0002648872159642868, + "loss": 0.0092, + "num_input_tokens_seen": 182626016, + "step": 84625 + }, + { + "epoch": 13.805872756933116, + "grad_norm": 0.016285371035337448, + "learning_rate": 0.00026482439886007077, + "loss": 0.0552, + "num_input_tokens_seen": 182636928, + "step": 84630 + }, + { + "epoch": 13.80668841761827, + "grad_norm": 0.02201697789132595, + "learning_rate": 0.00026476158652198655, + "loss": 0.0091, + "num_input_tokens_seen": 182647392, + "step": 84635 + }, + { + "epoch": 13.807504078303426, + "grad_norm": 0.001087754499167204, + "learning_rate": 0.00026469877895130727, + "loss": 0.0219, + "num_input_tokens_seen": 182658272, + "step": 84640 + }, + { + "epoch": 13.808319738988581, + "grad_norm": 0.02722734399139881, + "learning_rate": 0.00026463597614930575, + "loss": 0.0024, + "num_input_tokens_seen": 182668736, + "step": 84645 + }, + { + "epoch": 13.809135399673735, + "grad_norm": 0.01022981945425272, + "learning_rate": 0.00026457317811725466, + "loss": 0.006, + "num_input_tokens_seen": 182679552, + "step": 84650 + }, + { + "epoch": 13.809951060358891, + "grad_norm": 0.002001257846131921, + "learning_rate": 0.00026451038485642687, + "loss": 0.061, + "num_input_tokens_seen": 182689568, + "step": 84655 + }, + { + "epoch": 13.810766721044045, + "grad_norm": 0.013086462393403053, + "learning_rate": 0.0002644475963680948, + "loss": 0.0115, + "num_input_tokens_seen": 182699680, + "step": 84660 + }, + { + "epoch": 13.8115823817292, + "grad_norm": 0.00039004423888400197, + "learning_rate": 0.0002643848126535311, + "loss": 0.0014, + "num_input_tokens_seen": 182710848, + "step": 84665 + }, + { + "epoch": 13.812398042414356, + "grad_norm": 0.05467027425765991, + "learning_rate": 0.000264322033714008, + "loss": 0.0051, + "num_input_tokens_seen": 182721792, + "step": 84670 + }, + { + "epoch": 13.81321370309951, + "grad_norm": 1.3984943628311157, + "learning_rate": 0.0002642592595507979, + "loss": 0.0464, + "num_input_tokens_seen": 182732384, + "step": 84675 + }, + { + "epoch": 13.814029363784666, + "grad_norm": 0.12190397083759308, + "learning_rate": 0.0002641964901651729, + "loss": 0.0151, + "num_input_tokens_seen": 182743744, + "step": 84680 + }, + { + "epoch": 13.81484502446982, + "grad_norm": 0.9040001034736633, + "learning_rate": 0.0002641337255584052, + "loss": 0.0747, + "num_input_tokens_seen": 182754816, + "step": 84685 + }, + { + "epoch": 13.815660685154976, + "grad_norm": 0.0021477816626429558, + "learning_rate": 0.0002640709657317668, + "loss": 0.0043, + "num_input_tokens_seen": 182766752, + "step": 84690 + }, + { + "epoch": 13.81647634584013, + "grad_norm": 0.001280378084629774, + "learning_rate": 0.0002640082106865295, + "loss": 0.1018, + "num_input_tokens_seen": 182776832, + "step": 84695 + }, + { + "epoch": 13.817292006525285, + "grad_norm": 0.013318683952093124, + "learning_rate": 0.00026394546042396525, + "loss": 0.0036, + "num_input_tokens_seen": 182787360, + "step": 84700 + }, + { + "epoch": 13.818107667210441, + "grad_norm": 0.0015263502718880773, + "learning_rate": 0.0002638827149453457, + "loss": 0.0034, + "num_input_tokens_seen": 182797248, + "step": 84705 + }, + { + "epoch": 13.818923327895595, + "grad_norm": 0.7975746989250183, + "learning_rate": 0.0002638199742519425, + "loss": 0.0332, + "num_input_tokens_seen": 182808416, + "step": 84710 + }, + { + "epoch": 13.81973898858075, + "grad_norm": 0.004797052592039108, + "learning_rate": 0.00026375723834502686, + "loss": 0.0023, + "num_input_tokens_seen": 182818912, + "step": 84715 + }, + { + "epoch": 13.820554649265905, + "grad_norm": 0.0019824958872050047, + "learning_rate": 0.0002636945072258709, + "loss": 0.0017, + "num_input_tokens_seen": 182830368, + "step": 84720 + }, + { + "epoch": 13.82137030995106, + "grad_norm": 0.02076047845184803, + "learning_rate": 0.00026363178089574516, + "loss": 0.0069, + "num_input_tokens_seen": 182841536, + "step": 84725 + }, + { + "epoch": 13.822185970636216, + "grad_norm": 0.04782721772789955, + "learning_rate": 0.0002635690593559216, + "loss": 0.0019, + "num_input_tokens_seen": 182852288, + "step": 84730 + }, + { + "epoch": 13.82300163132137, + "grad_norm": 0.014023072086274624, + "learning_rate": 0.0002635063426076706, + "loss": 0.0169, + "num_input_tokens_seen": 182862880, + "step": 84735 + }, + { + "epoch": 13.823817292006526, + "grad_norm": 0.0045128436759114265, + "learning_rate": 0.000263443630652264, + "loss": 0.1188, + "num_input_tokens_seen": 182873792, + "step": 84740 + }, + { + "epoch": 13.82463295269168, + "grad_norm": 0.011381004005670547, + "learning_rate": 0.00026338092349097186, + "loss": 0.0024, + "num_input_tokens_seen": 182884000, + "step": 84745 + }, + { + "epoch": 13.825448613376835, + "grad_norm": 0.022549383342266083, + "learning_rate": 0.00026331822112506576, + "loss": 0.0028, + "num_input_tokens_seen": 182894848, + "step": 84750 + }, + { + "epoch": 13.826264274061991, + "grad_norm": 0.00047267769696190953, + "learning_rate": 0.0002632555235558161, + "loss": 0.0045, + "num_input_tokens_seen": 182903840, + "step": 84755 + }, + { + "epoch": 13.827079934747145, + "grad_norm": 0.0073170228861272335, + "learning_rate": 0.00026319283078449365, + "loss": 0.0022, + "num_input_tokens_seen": 182913024, + "step": 84760 + }, + { + "epoch": 13.8278955954323, + "grad_norm": 0.03414444625377655, + "learning_rate": 0.0002631301428123688, + "loss": 0.0052, + "num_input_tokens_seen": 182924032, + "step": 84765 + }, + { + "epoch": 13.828711256117455, + "grad_norm": 0.04355109855532646, + "learning_rate": 0.00026306745964071223, + "loss": 0.003, + "num_input_tokens_seen": 182934976, + "step": 84770 + }, + { + "epoch": 13.82952691680261, + "grad_norm": 0.0061978367157280445, + "learning_rate": 0.00026300478127079405, + "loss": 0.003, + "num_input_tokens_seen": 182945376, + "step": 84775 + }, + { + "epoch": 13.830342577487766, + "grad_norm": 0.007111057173460722, + "learning_rate": 0.0002629421077038846, + "loss": 0.0054, + "num_input_tokens_seen": 182957088, + "step": 84780 + }, + { + "epoch": 13.83115823817292, + "grad_norm": 0.07892990857362747, + "learning_rate": 0.00026287943894125415, + "loss": 0.0254, + "num_input_tokens_seen": 182968192, + "step": 84785 + }, + { + "epoch": 13.831973898858076, + "grad_norm": 0.008173160254955292, + "learning_rate": 0.0002628167749841727, + "loss": 0.0037, + "num_input_tokens_seen": 182978848, + "step": 84790 + }, + { + "epoch": 13.83278955954323, + "grad_norm": 0.6762431263923645, + "learning_rate": 0.0002627541158339101, + "loss": 0.0365, + "num_input_tokens_seen": 182989536, + "step": 84795 + }, + { + "epoch": 13.833605220228385, + "grad_norm": 0.0004910976276732981, + "learning_rate": 0.0002626914614917364, + "loss": 0.0021, + "num_input_tokens_seen": 182999168, + "step": 84800 + }, + { + "epoch": 13.83442088091354, + "grad_norm": 0.00555798364803195, + "learning_rate": 0.0002626288119589212, + "loss": 0.0015, + "num_input_tokens_seen": 183009344, + "step": 84805 + }, + { + "epoch": 13.835236541598695, + "grad_norm": 0.31210461258888245, + "learning_rate": 0.0002625661672367343, + "loss": 0.026, + "num_input_tokens_seen": 183018880, + "step": 84810 + }, + { + "epoch": 13.83605220228385, + "grad_norm": 0.0018057851120829582, + "learning_rate": 0.00026250352732644524, + "loss": 0.0317, + "num_input_tokens_seen": 183029952, + "step": 84815 + }, + { + "epoch": 13.836867862969005, + "grad_norm": 0.002154577523469925, + "learning_rate": 0.0002624408922293232, + "loss": 0.0018, + "num_input_tokens_seen": 183039840, + "step": 84820 + }, + { + "epoch": 13.83768352365416, + "grad_norm": 0.002014023019000888, + "learning_rate": 0.0002623782619466383, + "loss": 0.001, + "num_input_tokens_seen": 183049568, + "step": 84825 + }, + { + "epoch": 13.838499184339314, + "grad_norm": 0.004079278092831373, + "learning_rate": 0.00026231563647965896, + "loss": 0.0024, + "num_input_tokens_seen": 183061344, + "step": 84830 + }, + { + "epoch": 13.83931484502447, + "grad_norm": 0.003482044441625476, + "learning_rate": 0.00026225301582965524, + "loss": 0.0076, + "num_input_tokens_seen": 183071360, + "step": 84835 + }, + { + "epoch": 13.840130505709626, + "grad_norm": 0.01718735508620739, + "learning_rate": 0.0002621903999978953, + "loss": 0.004, + "num_input_tokens_seen": 183082560, + "step": 84840 + }, + { + "epoch": 13.84094616639478, + "grad_norm": 0.003035926725715399, + "learning_rate": 0.0002621277889856489, + "loss": 0.0067, + "num_input_tokens_seen": 183092352, + "step": 84845 + }, + { + "epoch": 13.841761827079935, + "grad_norm": 0.0008175540715456009, + "learning_rate": 0.0002620651827941843, + "loss": 0.0261, + "num_input_tokens_seen": 183102912, + "step": 84850 + }, + { + "epoch": 13.84257748776509, + "grad_norm": 0.01003414299339056, + "learning_rate": 0.00026200258142477107, + "loss": 0.0018, + "num_input_tokens_seen": 183113472, + "step": 84855 + }, + { + "epoch": 13.843393148450245, + "grad_norm": 0.0009055508999153972, + "learning_rate": 0.00026193998487867697, + "loss": 0.027, + "num_input_tokens_seen": 183124768, + "step": 84860 + }, + { + "epoch": 13.844208809135399, + "grad_norm": 0.005171182099729776, + "learning_rate": 0.0002618773931571715, + "loss": 0.009, + "num_input_tokens_seen": 183135872, + "step": 84865 + }, + { + "epoch": 13.845024469820554, + "grad_norm": 0.03736359626054764, + "learning_rate": 0.00026181480626152236, + "loss": 0.0032, + "num_input_tokens_seen": 183146976, + "step": 84870 + }, + { + "epoch": 13.84584013050571, + "grad_norm": 0.001328750280663371, + "learning_rate": 0.0002617522241929987, + "loss": 0.0007, + "num_input_tokens_seen": 183156672, + "step": 84875 + }, + { + "epoch": 13.846655791190864, + "grad_norm": 0.006559726782143116, + "learning_rate": 0.0002616896469528681, + "loss": 0.0058, + "num_input_tokens_seen": 183166688, + "step": 84880 + }, + { + "epoch": 13.84747145187602, + "grad_norm": 0.009690310806035995, + "learning_rate": 0.00026162707454239944, + "loss": 0.0579, + "num_input_tokens_seen": 183177152, + "step": 84885 + }, + { + "epoch": 13.848287112561174, + "grad_norm": 0.022296492010354996, + "learning_rate": 0.00026156450696286014, + "loss": 0.1086, + "num_input_tokens_seen": 183188512, + "step": 84890 + }, + { + "epoch": 13.84910277324633, + "grad_norm": 0.0015420381678268313, + "learning_rate": 0.0002615019442155189, + "loss": 0.0012, + "num_input_tokens_seen": 183199072, + "step": 84895 + }, + { + "epoch": 13.849918433931485, + "grad_norm": 0.0058791195042431355, + "learning_rate": 0.00026143938630164316, + "loss": 0.0234, + "num_input_tokens_seen": 183209664, + "step": 84900 + }, + { + "epoch": 13.850734094616639, + "grad_norm": 0.12125188857316971, + "learning_rate": 0.00026137683322250094, + "loss": 0.0952, + "num_input_tokens_seen": 183221504, + "step": 84905 + }, + { + "epoch": 13.851549755301795, + "grad_norm": 0.0008184023317880929, + "learning_rate": 0.00026131428497935995, + "loss": 0.0013, + "num_input_tokens_seen": 183231296, + "step": 84910 + }, + { + "epoch": 13.852365415986949, + "grad_norm": 0.1205214336514473, + "learning_rate": 0.0002612517415734877, + "loss": 0.0069, + "num_input_tokens_seen": 183241728, + "step": 84915 + }, + { + "epoch": 13.853181076672104, + "grad_norm": 0.0005909082829020917, + "learning_rate": 0.00026118920300615187, + "loss": 0.0552, + "num_input_tokens_seen": 183253696, + "step": 84920 + }, + { + "epoch": 13.85399673735726, + "grad_norm": 0.00741454865783453, + "learning_rate": 0.0002611266692786197, + "loss": 0.002, + "num_input_tokens_seen": 183263712, + "step": 84925 + }, + { + "epoch": 13.854812398042414, + "grad_norm": 0.011291262693703175, + "learning_rate": 0.00026106414039215865, + "loss": 0.0011, + "num_input_tokens_seen": 183275008, + "step": 84930 + }, + { + "epoch": 13.85562805872757, + "grad_norm": 0.3060096800327301, + "learning_rate": 0.00026100161634803594, + "loss": 0.0106, + "num_input_tokens_seen": 183285952, + "step": 84935 + }, + { + "epoch": 13.856443719412724, + "grad_norm": 0.025619059801101685, + "learning_rate": 0.0002609390971475186, + "loss": 0.0018, + "num_input_tokens_seen": 183297024, + "step": 84940 + }, + { + "epoch": 13.85725938009788, + "grad_norm": 0.026714660227298737, + "learning_rate": 0.00026087658279187357, + "loss": 0.0037, + "num_input_tokens_seen": 183307936, + "step": 84945 + }, + { + "epoch": 13.858075040783035, + "grad_norm": 0.6308284997940063, + "learning_rate": 0.0002608140732823684, + "loss": 0.0982, + "num_input_tokens_seen": 183318496, + "step": 84950 + }, + { + "epoch": 13.858890701468189, + "grad_norm": 0.0027444588486105204, + "learning_rate": 0.00026075156862026896, + "loss": 0.0029, + "num_input_tokens_seen": 183328608, + "step": 84955 + }, + { + "epoch": 13.859706362153345, + "grad_norm": 0.0014102521818131208, + "learning_rate": 0.00026068906880684297, + "loss": 0.0036, + "num_input_tokens_seen": 183340064, + "step": 84960 + }, + { + "epoch": 13.860522022838499, + "grad_norm": 0.0026959239039570093, + "learning_rate": 0.0002606265738433561, + "loss": 0.0016, + "num_input_tokens_seen": 183350688, + "step": 84965 + }, + { + "epoch": 13.861337683523654, + "grad_norm": 0.002494816668331623, + "learning_rate": 0.0002605640837310758, + "loss": 0.0021, + "num_input_tokens_seen": 183360992, + "step": 84970 + }, + { + "epoch": 13.86215334420881, + "grad_norm": 0.0043399217538535595, + "learning_rate": 0.0002605015984712678, + "loss": 0.0072, + "num_input_tokens_seen": 183371456, + "step": 84975 + }, + { + "epoch": 13.862969004893964, + "grad_norm": 0.0006144453654997051, + "learning_rate": 0.000260439118065199, + "loss": 0.0005, + "num_input_tokens_seen": 183382240, + "step": 84980 + }, + { + "epoch": 13.86378466557912, + "grad_norm": 0.0427059605717659, + "learning_rate": 0.000260376642514135, + "loss": 0.0038, + "num_input_tokens_seen": 183393152, + "step": 84985 + }, + { + "epoch": 13.864600326264274, + "grad_norm": 0.014045458287000656, + "learning_rate": 0.00026031417181934276, + "loss": 0.0155, + "num_input_tokens_seen": 183403200, + "step": 84990 + }, + { + "epoch": 13.86541598694943, + "grad_norm": 0.0003714857448358089, + "learning_rate": 0.0002602517059820875, + "loss": 0.0006, + "num_input_tokens_seen": 183414272, + "step": 84995 + }, + { + "epoch": 13.866231647634583, + "grad_norm": 0.062068138271570206, + "learning_rate": 0.0002601892450036359, + "loss": 0.0405, + "num_input_tokens_seen": 183425728, + "step": 85000 + }, + { + "epoch": 13.867047308319739, + "grad_norm": 0.001882745767943561, + "learning_rate": 0.0002601267888852531, + "loss": 0.0059, + "num_input_tokens_seen": 183437344, + "step": 85005 + }, + { + "epoch": 13.867862969004895, + "grad_norm": 0.044060058891773224, + "learning_rate": 0.0002600643376282056, + "loss": 0.1865, + "num_input_tokens_seen": 183447680, + "step": 85010 + }, + { + "epoch": 13.868678629690049, + "grad_norm": 0.027935104444622993, + "learning_rate": 0.0002600018912337584, + "loss": 0.0029, + "num_input_tokens_seen": 183459616, + "step": 85015 + }, + { + "epoch": 13.869494290375204, + "grad_norm": 0.028235308825969696, + "learning_rate": 0.00025993944970317763, + "loss": 0.0067, + "num_input_tokens_seen": 183470560, + "step": 85020 + }, + { + "epoch": 13.870309951060358, + "grad_norm": 0.487893670797348, + "learning_rate": 0.00025987701303772806, + "loss": 0.027, + "num_input_tokens_seen": 183481664, + "step": 85025 + }, + { + "epoch": 13.871125611745514, + "grad_norm": 0.0007524109096266329, + "learning_rate": 0.00025981458123867566, + "loss": 0.0034, + "num_input_tokens_seen": 183493472, + "step": 85030 + }, + { + "epoch": 13.87194127243067, + "grad_norm": 0.00984126329421997, + "learning_rate": 0.0002597521543072854, + "loss": 0.0194, + "num_input_tokens_seen": 183504352, + "step": 85035 + }, + { + "epoch": 13.872756933115824, + "grad_norm": 0.3624925911426544, + "learning_rate": 0.00025968973224482257, + "loss": 0.114, + "num_input_tokens_seen": 183514592, + "step": 85040 + }, + { + "epoch": 13.87357259380098, + "grad_norm": 0.13307076692581177, + "learning_rate": 0.00025962731505255215, + "loss": 0.0121, + "num_input_tokens_seen": 183526112, + "step": 85045 + }, + { + "epoch": 13.874388254486133, + "grad_norm": 0.004628616385161877, + "learning_rate": 0.0002595649027317392, + "loss": 0.0057, + "num_input_tokens_seen": 183535776, + "step": 85050 + }, + { + "epoch": 13.875203915171289, + "grad_norm": 0.26116886734962463, + "learning_rate": 0.0002595024952836484, + "loss": 0.0111, + "num_input_tokens_seen": 183547552, + "step": 85055 + }, + { + "epoch": 13.876019575856443, + "grad_norm": 0.0040539647452533245, + "learning_rate": 0.00025944009270954463, + "loss": 0.0129, + "num_input_tokens_seen": 183557280, + "step": 85060 + }, + { + "epoch": 13.876835236541599, + "grad_norm": 0.5856723785400391, + "learning_rate": 0.00025937769501069264, + "loss": 0.1657, + "num_input_tokens_seen": 183567360, + "step": 85065 + }, + { + "epoch": 13.877650897226754, + "grad_norm": 0.004472846630960703, + "learning_rate": 0.00025931530218835684, + "loss": 0.0032, + "num_input_tokens_seen": 183578432, + "step": 85070 + }, + { + "epoch": 13.878466557911908, + "grad_norm": 0.02256857231259346, + "learning_rate": 0.00025925291424380183, + "loss": 0.0489, + "num_input_tokens_seen": 183589536, + "step": 85075 + }, + { + "epoch": 13.879282218597064, + "grad_norm": 0.05394762009382248, + "learning_rate": 0.00025919053117829185, + "loss": 0.0073, + "num_input_tokens_seen": 183599904, + "step": 85080 + }, + { + "epoch": 13.880097879282218, + "grad_norm": 0.008492419496178627, + "learning_rate": 0.0002591281529930913, + "loss": 0.0028, + "num_input_tokens_seen": 183610240, + "step": 85085 + }, + { + "epoch": 13.880913539967374, + "grad_norm": 0.0021886902395635843, + "learning_rate": 0.0002590657796894641, + "loss": 0.0026, + "num_input_tokens_seen": 183622048, + "step": 85090 + }, + { + "epoch": 13.88172920065253, + "grad_norm": 0.002689148299396038, + "learning_rate": 0.0002590034112686749, + "loss": 0.0021, + "num_input_tokens_seen": 183632960, + "step": 85095 + }, + { + "epoch": 13.882544861337683, + "grad_norm": 0.009632963687181473, + "learning_rate": 0.0002589410477319869, + "loss": 0.0065, + "num_input_tokens_seen": 183645408, + "step": 85100 + }, + { + "epoch": 13.883360522022839, + "grad_norm": 0.004884787369519472, + "learning_rate": 0.0002588786890806647, + "loss": 0.0038, + "num_input_tokens_seen": 183655200, + "step": 85105 + }, + { + "epoch": 13.884176182707993, + "grad_norm": 0.0013453784631565213, + "learning_rate": 0.0002588163353159715, + "loss": 0.014, + "num_input_tokens_seen": 183665568, + "step": 85110 + }, + { + "epoch": 13.884991843393149, + "grad_norm": 0.01336477417498827, + "learning_rate": 0.00025875398643917147, + "loss": 0.0292, + "num_input_tokens_seen": 183677088, + "step": 85115 + }, + { + "epoch": 13.885807504078304, + "grad_norm": 0.0028887561056762934, + "learning_rate": 0.00025869164245152765, + "loss": 0.0051, + "num_input_tokens_seen": 183687296, + "step": 85120 + }, + { + "epoch": 13.886623164763458, + "grad_norm": 0.03622622415423393, + "learning_rate": 0.00025862930335430426, + "loss": 0.007, + "num_input_tokens_seen": 183697280, + "step": 85125 + }, + { + "epoch": 13.887438825448614, + "grad_norm": 0.01827280782163143, + "learning_rate": 0.0002585669691487637, + "loss": 0.003, + "num_input_tokens_seen": 183706112, + "step": 85130 + }, + { + "epoch": 13.888254486133768, + "grad_norm": 0.10345486551523209, + "learning_rate": 0.00025850463983617005, + "loss": 0.0054, + "num_input_tokens_seen": 183716416, + "step": 85135 + }, + { + "epoch": 13.889070146818923, + "grad_norm": 0.03769503906369209, + "learning_rate": 0.0002584423154177863, + "loss": 0.0078, + "num_input_tokens_seen": 183727008, + "step": 85140 + }, + { + "epoch": 13.88988580750408, + "grad_norm": 0.025255361571907997, + "learning_rate": 0.0002583799958948754, + "loss": 0.0126, + "num_input_tokens_seen": 183738720, + "step": 85145 + }, + { + "epoch": 13.890701468189233, + "grad_norm": 0.01294715516269207, + "learning_rate": 0.00025831768126870035, + "loss": 0.0125, + "num_input_tokens_seen": 183748832, + "step": 85150 + }, + { + "epoch": 13.891517128874389, + "grad_norm": 0.0016138808568939567, + "learning_rate": 0.00025825537154052414, + "loss": 0.0033, + "num_input_tokens_seen": 183760448, + "step": 85155 + }, + { + "epoch": 13.892332789559543, + "grad_norm": 0.7326217889785767, + "learning_rate": 0.00025819306671160953, + "loss": 0.0491, + "num_input_tokens_seen": 183771104, + "step": 85160 + }, + { + "epoch": 13.893148450244698, + "grad_norm": 0.006035518832504749, + "learning_rate": 0.00025813076678321914, + "loss": 0.0078, + "num_input_tokens_seen": 183781408, + "step": 85165 + }, + { + "epoch": 13.893964110929852, + "grad_norm": 0.5348854064941406, + "learning_rate": 0.0002580684717566156, + "loss": 0.0605, + "num_input_tokens_seen": 183793024, + "step": 85170 + }, + { + "epoch": 13.894779771615008, + "grad_norm": 0.4127856194972992, + "learning_rate": 0.0002580061816330614, + "loss": 0.0136, + "num_input_tokens_seen": 183804192, + "step": 85175 + }, + { + "epoch": 13.895595432300164, + "grad_norm": 0.03207729384303093, + "learning_rate": 0.00025794389641381894, + "loss": 0.0015, + "num_input_tokens_seen": 183815904, + "step": 85180 + }, + { + "epoch": 13.896411092985318, + "grad_norm": 0.07139331102371216, + "learning_rate": 0.0002578816161001505, + "loss": 0.0079, + "num_input_tokens_seen": 183825376, + "step": 85185 + }, + { + "epoch": 13.897226753670473, + "grad_norm": 0.006600076798349619, + "learning_rate": 0.0002578193406933182, + "loss": 0.0046, + "num_input_tokens_seen": 183835872, + "step": 85190 + }, + { + "epoch": 13.898042414355627, + "grad_norm": 0.0004420229233801365, + "learning_rate": 0.00025775707019458415, + "loss": 0.1729, + "num_input_tokens_seen": 183847584, + "step": 85195 + }, + { + "epoch": 13.898858075040783, + "grad_norm": 0.0020276163704693317, + "learning_rate": 0.0002576948046052105, + "loss": 0.0011, + "num_input_tokens_seen": 183859072, + "step": 85200 + }, + { + "epoch": 13.899673735725939, + "grad_norm": 0.002414155285805464, + "learning_rate": 0.000257632543926459, + "loss": 0.0071, + "num_input_tokens_seen": 183870912, + "step": 85205 + }, + { + "epoch": 13.900489396411093, + "grad_norm": 0.03148249164223671, + "learning_rate": 0.0002575702881595914, + "loss": 0.0022, + "num_input_tokens_seen": 183882432, + "step": 85210 + }, + { + "epoch": 13.901305057096248, + "grad_norm": 0.0010660403640940785, + "learning_rate": 0.0002575080373058695, + "loss": 0.0901, + "num_input_tokens_seen": 183892768, + "step": 85215 + }, + { + "epoch": 13.902120717781402, + "grad_norm": 0.031429436057806015, + "learning_rate": 0.0002574457913665548, + "loss": 0.0032, + "num_input_tokens_seen": 183904416, + "step": 85220 + }, + { + "epoch": 13.902936378466558, + "grad_norm": 0.008137458935379982, + "learning_rate": 0.00025738355034290886, + "loss": 0.0072, + "num_input_tokens_seen": 183915872, + "step": 85225 + }, + { + "epoch": 13.903752039151712, + "grad_norm": 0.012067089788615704, + "learning_rate": 0.00025732131423619303, + "loss": 0.0053, + "num_input_tokens_seen": 183926464, + "step": 85230 + }, + { + "epoch": 13.904567699836868, + "grad_norm": 0.0007666905876249075, + "learning_rate": 0.0002572590830476685, + "loss": 0.0017, + "num_input_tokens_seen": 183937504, + "step": 85235 + }, + { + "epoch": 13.905383360522023, + "grad_norm": 0.0024214971344918013, + "learning_rate": 0.0002571968567785967, + "loss": 0.0254, + "num_input_tokens_seen": 183947680, + "step": 85240 + }, + { + "epoch": 13.906199021207177, + "grad_norm": 0.007955431006848812, + "learning_rate": 0.0002571346354302387, + "loss": 0.0466, + "num_input_tokens_seen": 183956736, + "step": 85245 + }, + { + "epoch": 13.907014681892333, + "grad_norm": 0.018161863088607788, + "learning_rate": 0.0002570724190038554, + "loss": 0.0205, + "num_input_tokens_seen": 183967264, + "step": 85250 + }, + { + "epoch": 13.907830342577487, + "grad_norm": 0.5447240471839905, + "learning_rate": 0.00025701020750070765, + "loss": 0.0316, + "num_input_tokens_seen": 183976416, + "step": 85255 + }, + { + "epoch": 13.908646003262643, + "grad_norm": 0.0014064450515434146, + "learning_rate": 0.0002569480009220563, + "loss": 0.0564, + "num_input_tokens_seen": 183988224, + "step": 85260 + }, + { + "epoch": 13.909461663947798, + "grad_norm": 0.00121423474047333, + "learning_rate": 0.00025688579926916213, + "loss": 0.0325, + "num_input_tokens_seen": 183999872, + "step": 85265 + }, + { + "epoch": 13.910277324632952, + "grad_norm": 0.1072845384478569, + "learning_rate": 0.0002568236025432855, + "loss": 0.017, + "num_input_tokens_seen": 184011040, + "step": 85270 + }, + { + "epoch": 13.911092985318108, + "grad_norm": 0.005678311921656132, + "learning_rate": 0.00025676141074568713, + "loss": 0.0572, + "num_input_tokens_seen": 184022016, + "step": 85275 + }, + { + "epoch": 13.911908646003262, + "grad_norm": 0.0186642874032259, + "learning_rate": 0.00025669922387762747, + "loss": 0.0029, + "num_input_tokens_seen": 184031584, + "step": 85280 + }, + { + "epoch": 13.912724306688418, + "grad_norm": 0.056574515998363495, + "learning_rate": 0.00025663704194036653, + "loss": 0.0088, + "num_input_tokens_seen": 184042112, + "step": 85285 + }, + { + "epoch": 13.913539967373573, + "grad_norm": 0.37403061985969543, + "learning_rate": 0.0002565748649351647, + "loss": 0.0352, + "num_input_tokens_seen": 184052864, + "step": 85290 + }, + { + "epoch": 13.914355628058727, + "grad_norm": 0.515491783618927, + "learning_rate": 0.0002565126928632821, + "loss": 0.0261, + "num_input_tokens_seen": 184063232, + "step": 85295 + }, + { + "epoch": 13.915171288743883, + "grad_norm": 0.0018687272677198052, + "learning_rate": 0.00025645052572597856, + "loss": 0.0079, + "num_input_tokens_seen": 184075296, + "step": 85300 + }, + { + "epoch": 13.915986949429037, + "grad_norm": 0.0006722072721458972, + "learning_rate": 0.0002563883635245141, + "loss": 0.003, + "num_input_tokens_seen": 184085920, + "step": 85305 + }, + { + "epoch": 13.916802610114193, + "grad_norm": 0.007489359471946955, + "learning_rate": 0.0002563262062601486, + "loss": 0.0121, + "num_input_tokens_seen": 184097856, + "step": 85310 + }, + { + "epoch": 13.917618270799348, + "grad_norm": 0.0945957824587822, + "learning_rate": 0.0002562640539341415, + "loss": 0.0034, + "num_input_tokens_seen": 184109472, + "step": 85315 + }, + { + "epoch": 13.918433931484502, + "grad_norm": 0.066319040954113, + "learning_rate": 0.0002562019065477527, + "loss": 0.0124, + "num_input_tokens_seen": 184119392, + "step": 85320 + }, + { + "epoch": 13.919249592169658, + "grad_norm": 0.009680901654064655, + "learning_rate": 0.00025613976410224145, + "loss": 0.0041, + "num_input_tokens_seen": 184129664, + "step": 85325 + }, + { + "epoch": 13.920065252854812, + "grad_norm": 0.007618313189595938, + "learning_rate": 0.00025607762659886726, + "loss": 0.0182, + "num_input_tokens_seen": 184140288, + "step": 85330 + }, + { + "epoch": 13.920880913539968, + "grad_norm": 0.16353178024291992, + "learning_rate": 0.00025601549403888934, + "loss": 0.0054, + "num_input_tokens_seen": 184150720, + "step": 85335 + }, + { + "epoch": 13.921696574225122, + "grad_norm": 0.006778034847229719, + "learning_rate": 0.00025595336642356706, + "loss": 0.0041, + "num_input_tokens_seen": 184161120, + "step": 85340 + }, + { + "epoch": 13.922512234910277, + "grad_norm": 0.0050546289421617985, + "learning_rate": 0.0002558912437541594, + "loss": 0.084, + "num_input_tokens_seen": 184170976, + "step": 85345 + }, + { + "epoch": 13.923327895595433, + "grad_norm": 0.026396675035357475, + "learning_rate": 0.0002558291260319253, + "loss": 0.0071, + "num_input_tokens_seen": 184181888, + "step": 85350 + }, + { + "epoch": 13.924143556280587, + "grad_norm": 0.0012689620489254594, + "learning_rate": 0.0002557670132581235, + "loss": 0.0043, + "num_input_tokens_seen": 184192544, + "step": 85355 + }, + { + "epoch": 13.924959216965743, + "grad_norm": 0.6173958778381348, + "learning_rate": 0.00025570490543401345, + "loss": 0.1048, + "num_input_tokens_seen": 184203648, + "step": 85360 + }, + { + "epoch": 13.925774877650896, + "grad_norm": 0.17514394223690033, + "learning_rate": 0.00025564280256085305, + "loss": 0.0103, + "num_input_tokens_seen": 184214848, + "step": 85365 + }, + { + "epoch": 13.926590538336052, + "grad_norm": 0.3380450904369354, + "learning_rate": 0.0002555807046399016, + "loss": 0.0155, + "num_input_tokens_seen": 184226880, + "step": 85370 + }, + { + "epoch": 13.927406199021208, + "grad_norm": 0.3188222348690033, + "learning_rate": 0.00025551861167241675, + "loss": 0.0203, + "num_input_tokens_seen": 184238112, + "step": 85375 + }, + { + "epoch": 13.928221859706362, + "grad_norm": 0.22720636427402496, + "learning_rate": 0.00025545652365965767, + "loss": 0.0113, + "num_input_tokens_seen": 184248160, + "step": 85380 + }, + { + "epoch": 13.929037520391518, + "grad_norm": 0.01165835652500391, + "learning_rate": 0.00025539444060288235, + "loss": 0.0245, + "num_input_tokens_seen": 184258848, + "step": 85385 + }, + { + "epoch": 13.929853181076671, + "grad_norm": 0.026811476796865463, + "learning_rate": 0.000255332362503349, + "loss": 0.0025, + "num_input_tokens_seen": 184269952, + "step": 85390 + }, + { + "epoch": 13.930668841761827, + "grad_norm": 0.8026761412620544, + "learning_rate": 0.00025527028936231567, + "loss": 0.0691, + "num_input_tokens_seen": 184279936, + "step": 85395 + }, + { + "epoch": 13.931484502446983, + "grad_norm": 0.007217478007078171, + "learning_rate": 0.0002552082211810405, + "loss": 0.0044, + "num_input_tokens_seen": 184290464, + "step": 85400 + }, + { + "epoch": 13.932300163132137, + "grad_norm": 0.10747834295034409, + "learning_rate": 0.0002551461579607811, + "loss": 0.1154, + "num_input_tokens_seen": 184300672, + "step": 85405 + }, + { + "epoch": 13.933115823817293, + "grad_norm": 0.003899279050529003, + "learning_rate": 0.00025508409970279554, + "loss": 0.0021, + "num_input_tokens_seen": 184311584, + "step": 85410 + }, + { + "epoch": 13.933931484502446, + "grad_norm": 0.06039474159479141, + "learning_rate": 0.00025502204640834135, + "loss": 0.055, + "num_input_tokens_seen": 184322432, + "step": 85415 + }, + { + "epoch": 13.934747145187602, + "grad_norm": 0.19456903636455536, + "learning_rate": 0.0002549599980786762, + "loss": 0.0252, + "num_input_tokens_seen": 184334464, + "step": 85420 + }, + { + "epoch": 13.935562805872756, + "grad_norm": 0.001114878337830305, + "learning_rate": 0.0002548979547150576, + "loss": 0.002, + "num_input_tokens_seen": 184344608, + "step": 85425 + }, + { + "epoch": 13.936378466557912, + "grad_norm": 0.08489802479743958, + "learning_rate": 0.0002548359163187428, + "loss": 0.0118, + "num_input_tokens_seen": 184354496, + "step": 85430 + }, + { + "epoch": 13.937194127243067, + "grad_norm": 0.00281240651383996, + "learning_rate": 0.0002547738828909891, + "loss": 0.1526, + "num_input_tokens_seen": 184366016, + "step": 85435 + }, + { + "epoch": 13.938009787928221, + "grad_norm": 0.004008208867162466, + "learning_rate": 0.0002547118544330539, + "loss": 0.0857, + "num_input_tokens_seen": 184375200, + "step": 85440 + }, + { + "epoch": 13.938825448613377, + "grad_norm": 0.002246226416900754, + "learning_rate": 0.0002546498309461941, + "loss": 0.0027, + "num_input_tokens_seen": 184385440, + "step": 85445 + }, + { + "epoch": 13.939641109298531, + "grad_norm": 0.001095998683013022, + "learning_rate": 0.00025458781243166667, + "loss": 0.0217, + "num_input_tokens_seen": 184396704, + "step": 85450 + }, + { + "epoch": 13.940456769983687, + "grad_norm": 0.026345092803239822, + "learning_rate": 0.0002545257988907286, + "loss": 0.0135, + "num_input_tokens_seen": 184408160, + "step": 85455 + }, + { + "epoch": 13.941272430668842, + "grad_norm": 0.04753544181585312, + "learning_rate": 0.0002544637903246364, + "loss": 0.0028, + "num_input_tokens_seen": 184419008, + "step": 85460 + }, + { + "epoch": 13.942088091353996, + "grad_norm": 0.04144052043557167, + "learning_rate": 0.0002544017867346474, + "loss": 0.0063, + "num_input_tokens_seen": 184430016, + "step": 85465 + }, + { + "epoch": 13.942903752039152, + "grad_norm": 0.0638512596487999, + "learning_rate": 0.0002543397881220173, + "loss": 0.1626, + "num_input_tokens_seen": 184440224, + "step": 85470 + }, + { + "epoch": 13.943719412724306, + "grad_norm": 0.008341721259057522, + "learning_rate": 0.00025427779448800345, + "loss": 0.0025, + "num_input_tokens_seen": 184451104, + "step": 85475 + }, + { + "epoch": 13.944535073409462, + "grad_norm": 0.010782654397189617, + "learning_rate": 0.0002542158058338615, + "loss": 0.0053, + "num_input_tokens_seen": 184461888, + "step": 85480 + }, + { + "epoch": 13.945350734094617, + "grad_norm": 0.0063970754854381084, + "learning_rate": 0.00025415382216084837, + "loss": 0.0023, + "num_input_tokens_seen": 184472768, + "step": 85485 + }, + { + "epoch": 13.946166394779771, + "grad_norm": 0.3306712508201599, + "learning_rate": 0.0002540918434702195, + "loss": 0.027, + "num_input_tokens_seen": 184482592, + "step": 85490 + }, + { + "epoch": 13.946982055464927, + "grad_norm": 0.0018524457700550556, + "learning_rate": 0.0002540298697632318, + "loss": 0.0013, + "num_input_tokens_seen": 184494112, + "step": 85495 + }, + { + "epoch": 13.947797716150081, + "grad_norm": 1.1179749965667725, + "learning_rate": 0.0002539679010411404, + "loss": 0.0531, + "num_input_tokens_seen": 184505120, + "step": 85500 + }, + { + "epoch": 13.948613376835237, + "grad_norm": 0.009250002913177013, + "learning_rate": 0.00025390593730520206, + "loss": 0.0542, + "num_input_tokens_seen": 184516800, + "step": 85505 + }, + { + "epoch": 13.949429037520392, + "grad_norm": 0.6076849102973938, + "learning_rate": 0.00025384397855667164, + "loss": 0.3081, + "num_input_tokens_seen": 184528288, + "step": 85510 + }, + { + "epoch": 13.950244698205546, + "grad_norm": 0.007433583028614521, + "learning_rate": 0.0002537820247968057, + "loss": 0.0082, + "num_input_tokens_seen": 184539552, + "step": 85515 + }, + { + "epoch": 13.951060358890702, + "grad_norm": 0.02670317143201828, + "learning_rate": 0.00025372007602685894, + "loss": 0.0172, + "num_input_tokens_seen": 184549984, + "step": 85520 + }, + { + "epoch": 13.951876019575856, + "grad_norm": 0.22358015179634094, + "learning_rate": 0.00025365813224808746, + "loss": 0.0099, + "num_input_tokens_seen": 184560608, + "step": 85525 + }, + { + "epoch": 13.952691680261012, + "grad_norm": 0.0019979842472821474, + "learning_rate": 0.00025359619346174644, + "loss": 0.0088, + "num_input_tokens_seen": 184572352, + "step": 85530 + }, + { + "epoch": 13.953507340946166, + "grad_norm": 0.04846477508544922, + "learning_rate": 0.0002535342596690912, + "loss": 0.0036, + "num_input_tokens_seen": 184582976, + "step": 85535 + }, + { + "epoch": 13.954323001631321, + "grad_norm": 0.0083781648427248, + "learning_rate": 0.0002534723308713768, + "loss": 0.0109, + "num_input_tokens_seen": 184592640, + "step": 85540 + }, + { + "epoch": 13.955138662316477, + "grad_norm": 0.6576086282730103, + "learning_rate": 0.0002534104070698584, + "loss": 0.0361, + "num_input_tokens_seen": 184603936, + "step": 85545 + }, + { + "epoch": 13.955954323001631, + "grad_norm": 0.0004717994015663862, + "learning_rate": 0.00025334848826579095, + "loss": 0.0015, + "num_input_tokens_seen": 184615520, + "step": 85550 + }, + { + "epoch": 13.956769983686787, + "grad_norm": 0.0049641127698123455, + "learning_rate": 0.0002532865744604292, + "loss": 0.0162, + "num_input_tokens_seen": 184625920, + "step": 85555 + }, + { + "epoch": 13.95758564437194, + "grad_norm": 0.18654833734035492, + "learning_rate": 0.000253224665655028, + "loss": 0.0104, + "num_input_tokens_seen": 184637440, + "step": 85560 + }, + { + "epoch": 13.958401305057096, + "grad_norm": 0.0043573202565312386, + "learning_rate": 0.0002531627618508421, + "loss": 0.0274, + "num_input_tokens_seen": 184646816, + "step": 85565 + }, + { + "epoch": 13.959216965742252, + "grad_norm": 0.0024928231723606586, + "learning_rate": 0.00025310086304912584, + "loss": 0.0053, + "num_input_tokens_seen": 184658016, + "step": 85570 + }, + { + "epoch": 13.960032626427406, + "grad_norm": 0.015538577921688557, + "learning_rate": 0.0002530389692511337, + "loss": 0.0628, + "num_input_tokens_seen": 184670496, + "step": 85575 + }, + { + "epoch": 13.960848287112562, + "grad_norm": 0.4682210087776184, + "learning_rate": 0.0002529770804581205, + "loss": 0.0163, + "num_input_tokens_seen": 184680384, + "step": 85580 + }, + { + "epoch": 13.961663947797716, + "grad_norm": 0.0011227945797145367, + "learning_rate": 0.0002529151966713398, + "loss": 0.0142, + "num_input_tokens_seen": 184692448, + "step": 85585 + }, + { + "epoch": 13.962479608482871, + "grad_norm": 0.0059292796067893505, + "learning_rate": 0.00025285331789204633, + "loss": 0.0017, + "num_input_tokens_seen": 184702176, + "step": 85590 + }, + { + "epoch": 13.963295269168025, + "grad_norm": 0.2624618411064148, + "learning_rate": 0.0002527914441214937, + "loss": 0.0153, + "num_input_tokens_seen": 184712960, + "step": 85595 + }, + { + "epoch": 13.964110929853181, + "grad_norm": 0.011470849625766277, + "learning_rate": 0.00025272957536093634, + "loss": 0.0028, + "num_input_tokens_seen": 184723680, + "step": 85600 + }, + { + "epoch": 13.964926590538337, + "grad_norm": 0.0031528035178780556, + "learning_rate": 0.00025266771161162736, + "loss": 0.0047, + "num_input_tokens_seen": 184733504, + "step": 85605 + }, + { + "epoch": 13.96574225122349, + "grad_norm": 0.002349321963265538, + "learning_rate": 0.00025260585287482153, + "loss": 0.0033, + "num_input_tokens_seen": 184745024, + "step": 85610 + }, + { + "epoch": 13.966557911908646, + "grad_norm": 0.0037877317517995834, + "learning_rate": 0.0002525439991517714, + "loss": 0.0016, + "num_input_tokens_seen": 184756064, + "step": 85615 + }, + { + "epoch": 13.9673735725938, + "grad_norm": 0.0002581815351732075, + "learning_rate": 0.0002524821504437316, + "loss": 0.0009, + "num_input_tokens_seen": 184765312, + "step": 85620 + }, + { + "epoch": 13.968189233278956, + "grad_norm": 0.003899811767041683, + "learning_rate": 0.0002524203067519545, + "loss": 0.0093, + "num_input_tokens_seen": 184776864, + "step": 85625 + }, + { + "epoch": 13.969004893964112, + "grad_norm": 0.002065925393253565, + "learning_rate": 0.00025235846807769433, + "loss": 0.0695, + "num_input_tokens_seen": 184787360, + "step": 85630 + }, + { + "epoch": 13.969820554649266, + "grad_norm": 0.002203212818130851, + "learning_rate": 0.0002522966344222036, + "loss": 0.0222, + "num_input_tokens_seen": 184797056, + "step": 85635 + }, + { + "epoch": 13.970636215334421, + "grad_norm": 0.07065599411725998, + "learning_rate": 0.00025223480578673627, + "loss": 0.0052, + "num_input_tokens_seen": 184808832, + "step": 85640 + }, + { + "epoch": 13.971451876019575, + "grad_norm": 0.029614314436912537, + "learning_rate": 0.00025217298217254446, + "loss": 0.0065, + "num_input_tokens_seen": 184819936, + "step": 85645 + }, + { + "epoch": 13.97226753670473, + "grad_norm": 0.009960136376321316, + "learning_rate": 0.0002521111635808819, + "loss": 0.0073, + "num_input_tokens_seen": 184831008, + "step": 85650 + }, + { + "epoch": 13.973083197389887, + "grad_norm": 0.0026855659671127796, + "learning_rate": 0.0002520493500130008, + "loss": 0.0613, + "num_input_tokens_seen": 184842464, + "step": 85655 + }, + { + "epoch": 13.97389885807504, + "grad_norm": 0.00046713888878002763, + "learning_rate": 0.0002519875414701545, + "loss": 0.0024, + "num_input_tokens_seen": 184854464, + "step": 85660 + }, + { + "epoch": 13.974714518760196, + "grad_norm": 0.003356852103024721, + "learning_rate": 0.0002519257379535949, + "loss": 0.0069, + "num_input_tokens_seen": 184864576, + "step": 85665 + }, + { + "epoch": 13.97553017944535, + "grad_norm": 0.054164040833711624, + "learning_rate": 0.00025186393946457516, + "loss": 0.004, + "num_input_tokens_seen": 184875840, + "step": 85670 + }, + { + "epoch": 13.976345840130506, + "grad_norm": 0.022435037419199944, + "learning_rate": 0.0002518021460043474, + "loss": 0.0169, + "num_input_tokens_seen": 184887488, + "step": 85675 + }, + { + "epoch": 13.977161500815662, + "grad_norm": 0.0002861501125153154, + "learning_rate": 0.0002517403575741641, + "loss": 0.0055, + "num_input_tokens_seen": 184898912, + "step": 85680 + }, + { + "epoch": 13.977977161500815, + "grad_norm": 0.1401904821395874, + "learning_rate": 0.0002516785741752773, + "loss": 0.0061, + "num_input_tokens_seen": 184910112, + "step": 85685 + }, + { + "epoch": 13.978792822185971, + "grad_norm": 0.02651825360953808, + "learning_rate": 0.0002516167958089393, + "loss": 0.0383, + "num_input_tokens_seen": 184920160, + "step": 85690 + }, + { + "epoch": 13.979608482871125, + "grad_norm": 0.018840614706277847, + "learning_rate": 0.00025155502247640196, + "loss": 0.0056, + "num_input_tokens_seen": 184930240, + "step": 85695 + }, + { + "epoch": 13.98042414355628, + "grad_norm": 0.05636534467339516, + "learning_rate": 0.0002514932541789173, + "loss": 0.0141, + "num_input_tokens_seen": 184942016, + "step": 85700 + }, + { + "epoch": 13.981239804241435, + "grad_norm": 0.1341962069272995, + "learning_rate": 0.0002514314909177371, + "loss": 0.007, + "num_input_tokens_seen": 184952544, + "step": 85705 + }, + { + "epoch": 13.98205546492659, + "grad_norm": 0.5935261845588684, + "learning_rate": 0.00025136973269411305, + "loss": 0.0756, + "num_input_tokens_seen": 184962624, + "step": 85710 + }, + { + "epoch": 13.982871125611746, + "grad_norm": 0.028065573424100876, + "learning_rate": 0.0002513079795092968, + "loss": 0.0032, + "num_input_tokens_seen": 184973472, + "step": 85715 + }, + { + "epoch": 13.9836867862969, + "grad_norm": 0.02949494868516922, + "learning_rate": 0.0002512462313645396, + "loss": 0.004, + "num_input_tokens_seen": 184986400, + "step": 85720 + }, + { + "epoch": 13.984502446982056, + "grad_norm": 0.0012840087292715907, + "learning_rate": 0.0002511844882610935, + "loss": 0.002, + "num_input_tokens_seen": 184997568, + "step": 85725 + }, + { + "epoch": 13.98531810766721, + "grad_norm": 0.010335260070860386, + "learning_rate": 0.00025112275020020903, + "loss": 0.0115, + "num_input_tokens_seen": 185007744, + "step": 85730 + }, + { + "epoch": 13.986133768352365, + "grad_norm": 0.17363691329956055, + "learning_rate": 0.0002510610171831381, + "loss": 0.0112, + "num_input_tokens_seen": 185018976, + "step": 85735 + }, + { + "epoch": 13.986949429037521, + "grad_norm": 0.014595108106732368, + "learning_rate": 0.00025099928921113113, + "loss": 0.0081, + "num_input_tokens_seen": 185029696, + "step": 85740 + }, + { + "epoch": 13.987765089722675, + "grad_norm": 0.021304717287421227, + "learning_rate": 0.0002509375662854397, + "loss": 0.0722, + "num_input_tokens_seen": 185040064, + "step": 85745 + }, + { + "epoch": 13.98858075040783, + "grad_norm": 0.09347657859325409, + "learning_rate": 0.0002508758484073142, + "loss": 0.0107, + "num_input_tokens_seen": 185050784, + "step": 85750 + }, + { + "epoch": 13.989396411092985, + "grad_norm": 0.02578926458954811, + "learning_rate": 0.00025081413557800604, + "loss": 0.0068, + "num_input_tokens_seen": 185061984, + "step": 85755 + }, + { + "epoch": 13.99021207177814, + "grad_norm": 0.22338710725307465, + "learning_rate": 0.0002507524277987651, + "loss": 0.0253, + "num_input_tokens_seen": 185072544, + "step": 85760 + }, + { + "epoch": 13.991027732463294, + "grad_norm": 0.4476240575313568, + "learning_rate": 0.0002506907250708428, + "loss": 0.0187, + "num_input_tokens_seen": 185082976, + "step": 85765 + }, + { + "epoch": 13.99184339314845, + "grad_norm": 0.003939487971365452, + "learning_rate": 0.0002506290273954888, + "loss": 0.0016, + "num_input_tokens_seen": 185094176, + "step": 85770 + }, + { + "epoch": 13.992659053833606, + "grad_norm": 0.003384500043466687, + "learning_rate": 0.00025056733477395415, + "loss": 0.1068, + "num_input_tokens_seen": 185104480, + "step": 85775 + }, + { + "epoch": 13.99347471451876, + "grad_norm": 0.02045244164764881, + "learning_rate": 0.0002505056472074889, + "loss": 0.0027, + "num_input_tokens_seen": 185115712, + "step": 85780 + }, + { + "epoch": 13.994290375203915, + "grad_norm": 0.11302701383829117, + "learning_rate": 0.0002504439646973432, + "loss": 0.0045, + "num_input_tokens_seen": 185128256, + "step": 85785 + }, + { + "epoch": 13.99510603588907, + "grad_norm": 0.0015306036220863461, + "learning_rate": 0.00025038228724476715, + "loss": 0.0011, + "num_input_tokens_seen": 185138944, + "step": 85790 + }, + { + "epoch": 13.995921696574225, + "grad_norm": 0.00044725899351760745, + "learning_rate": 0.00025032061485101066, + "loss": 0.0028, + "num_input_tokens_seen": 185148640, + "step": 85795 + }, + { + "epoch": 13.99673735725938, + "grad_norm": 0.0006207763799466193, + "learning_rate": 0.0002502589475173237, + "loss": 0.0172, + "num_input_tokens_seen": 185158752, + "step": 85800 + }, + { + "epoch": 13.997553017944535, + "grad_norm": 0.11419045925140381, + "learning_rate": 0.000250197285244956, + "loss": 0.0046, + "num_input_tokens_seen": 185169312, + "step": 85805 + }, + { + "epoch": 13.99836867862969, + "grad_norm": 0.003668437013402581, + "learning_rate": 0.0002501356280351572, + "loss": 0.0038, + "num_input_tokens_seen": 185179040, + "step": 85810 + }, + { + "epoch": 13.999184339314844, + "grad_norm": 0.004584531299769878, + "learning_rate": 0.00025007397588917683, + "loss": 0.0078, + "num_input_tokens_seen": 185190016, + "step": 85815 + }, + { + "epoch": 14.0, + "grad_norm": 0.00041427635005675256, + "learning_rate": 0.0002500123288082644, + "loss": 0.0019, + "num_input_tokens_seen": 185199728, + "step": 85820 + }, + { + "epoch": 14.0, + "eval_loss": 0.19457833468914032, + "eval_runtime": 104.2987, + "eval_samples_per_second": 26.127, + "eval_steps_per_second": 6.539, + "num_input_tokens_seen": 185199728, + "step": 85820 + }, + { + "epoch": 14.000815660685156, + "grad_norm": 0.011671774089336395, + "learning_rate": 0.00024995068679366933, + "loss": 0.0037, + "num_input_tokens_seen": 185211504, + "step": 85825 + }, + { + "epoch": 14.00163132137031, + "grad_norm": 0.0215965174138546, + "learning_rate": 0.00024988904984664075, + "loss": 0.0135, + "num_input_tokens_seen": 185222320, + "step": 85830 + }, + { + "epoch": 14.002446982055465, + "grad_norm": 0.014844218268990517, + "learning_rate": 0.00024982741796842787, + "loss": 0.0019, + "num_input_tokens_seen": 185231856, + "step": 85835 + }, + { + "epoch": 14.00326264274062, + "grad_norm": 0.0018300822703167796, + "learning_rate": 0.00024976579116027975, + "loss": 0.0103, + "num_input_tokens_seen": 185244176, + "step": 85840 + }, + { + "epoch": 14.004078303425775, + "grad_norm": 0.0012848539045080543, + "learning_rate": 0.00024970416942344533, + "loss": 0.0368, + "num_input_tokens_seen": 185255088, + "step": 85845 + }, + { + "epoch": 14.00489396411093, + "grad_norm": 0.008847832679748535, + "learning_rate": 0.00024964255275917335, + "loss": 0.0267, + "num_input_tokens_seen": 185266064, + "step": 85850 + }, + { + "epoch": 14.005709624796085, + "grad_norm": 0.010221637785434723, + "learning_rate": 0.00024958094116871274, + "loss": 0.0145, + "num_input_tokens_seen": 185277456, + "step": 85855 + }, + { + "epoch": 14.00652528548124, + "grad_norm": 0.0034851732198148966, + "learning_rate": 0.000249519334653312, + "loss": 0.0025, + "num_input_tokens_seen": 185286864, + "step": 85860 + }, + { + "epoch": 14.007340946166394, + "grad_norm": 0.05377811938524246, + "learning_rate": 0.0002494577332142195, + "loss": 0.0053, + "num_input_tokens_seen": 185297904, + "step": 85865 + }, + { + "epoch": 14.00815660685155, + "grad_norm": 0.026492495089769363, + "learning_rate": 0.0002493961368526843, + "loss": 0.0036, + "num_input_tokens_seen": 185307504, + "step": 85870 + }, + { + "epoch": 14.008972267536704, + "grad_norm": 0.02235489711165428, + "learning_rate": 0.0002493345455699538, + "loss": 0.0046, + "num_input_tokens_seen": 185317872, + "step": 85875 + }, + { + "epoch": 14.00978792822186, + "grad_norm": 0.016357546672225, + "learning_rate": 0.000249272959367277, + "loss": 0.002, + "num_input_tokens_seen": 185328464, + "step": 85880 + }, + { + "epoch": 14.010603588907015, + "grad_norm": 0.01056151557713747, + "learning_rate": 0.0002492113782459017, + "loss": 0.0162, + "num_input_tokens_seen": 185338032, + "step": 85885 + }, + { + "epoch": 14.01141924959217, + "grad_norm": 0.0174605380743742, + "learning_rate": 0.00024914980220707605, + "loss": 0.0957, + "num_input_tokens_seen": 185348912, + "step": 85890 + }, + { + "epoch": 14.012234910277325, + "grad_norm": 0.0007461474160663784, + "learning_rate": 0.00024908823125204785, + "loss": 0.0096, + "num_input_tokens_seen": 185360048, + "step": 85895 + }, + { + "epoch": 14.013050570962479, + "grad_norm": 0.19543449580669403, + "learning_rate": 0.00024902666538206494, + "loss": 0.0105, + "num_input_tokens_seen": 185370256, + "step": 85900 + }, + { + "epoch": 14.013866231647635, + "grad_norm": 0.05822484940290451, + "learning_rate": 0.000248965104598375, + "loss": 0.0135, + "num_input_tokens_seen": 185380144, + "step": 85905 + }, + { + "epoch": 14.01468189233279, + "grad_norm": 0.006621610373258591, + "learning_rate": 0.0002489035489022257, + "loss": 0.0014, + "num_input_tokens_seen": 185389872, + "step": 85910 + }, + { + "epoch": 14.015497553017944, + "grad_norm": 0.006501856725662947, + "learning_rate": 0.0002488419982948646, + "loss": 0.003, + "num_input_tokens_seen": 185400848, + "step": 85915 + }, + { + "epoch": 14.0163132137031, + "grad_norm": 0.004732016008347273, + "learning_rate": 0.0002487804527775389, + "loss": 0.0128, + "num_input_tokens_seen": 185412592, + "step": 85920 + }, + { + "epoch": 14.017128874388254, + "grad_norm": 0.0004750126681756228, + "learning_rate": 0.0002487189123514961, + "loss": 0.0009, + "num_input_tokens_seen": 185422832, + "step": 85925 + }, + { + "epoch": 14.01794453507341, + "grad_norm": 0.001525798230431974, + "learning_rate": 0.0002486573770179833, + "loss": 0.0083, + "num_input_tokens_seen": 185433744, + "step": 85930 + }, + { + "epoch": 14.018760195758565, + "grad_norm": 0.0014686352806165814, + "learning_rate": 0.00024859584677824757, + "loss": 0.0216, + "num_input_tokens_seen": 185444816, + "step": 85935 + }, + { + "epoch": 14.01957585644372, + "grad_norm": 0.11327487975358963, + "learning_rate": 0.00024853432163353596, + "loss": 0.044, + "num_input_tokens_seen": 185455408, + "step": 85940 + }, + { + "epoch": 14.020391517128875, + "grad_norm": 0.0032419790513813496, + "learning_rate": 0.00024847280158509535, + "loss": 0.0041, + "num_input_tokens_seen": 185466672, + "step": 85945 + }, + { + "epoch": 14.021207177814029, + "grad_norm": 0.017735807225108147, + "learning_rate": 0.00024841128663417243, + "loss": 0.004, + "num_input_tokens_seen": 185476912, + "step": 85950 + }, + { + "epoch": 14.022022838499185, + "grad_norm": 0.04278920590877533, + "learning_rate": 0.000248349776782014, + "loss": 0.0032, + "num_input_tokens_seen": 185488112, + "step": 85955 + }, + { + "epoch": 14.022838499184338, + "grad_norm": 0.0002991057699546218, + "learning_rate": 0.0002482882720298666, + "loss": 0.0031, + "num_input_tokens_seen": 185498384, + "step": 85960 + }, + { + "epoch": 14.023654159869494, + "grad_norm": 0.0011479079257696867, + "learning_rate": 0.0002482267723789767, + "loss": 0.0201, + "num_input_tokens_seen": 185509456, + "step": 85965 + }, + { + "epoch": 14.02446982055465, + "grad_norm": 0.012501798570156097, + "learning_rate": 0.0002481652778305906, + "loss": 0.0013, + "num_input_tokens_seen": 185521104, + "step": 85970 + }, + { + "epoch": 14.025285481239804, + "grad_norm": 0.07686710357666016, + "learning_rate": 0.00024810378838595467, + "loss": 0.0027, + "num_input_tokens_seen": 185530608, + "step": 85975 + }, + { + "epoch": 14.02610114192496, + "grad_norm": 0.0008515570661984384, + "learning_rate": 0.00024804230404631495, + "loss": 0.0134, + "num_input_tokens_seen": 185541328, + "step": 85980 + }, + { + "epoch": 14.026916802610113, + "grad_norm": 0.06440049409866333, + "learning_rate": 0.0002479808248129174, + "loss": 0.0053, + "num_input_tokens_seen": 185551568, + "step": 85985 + }, + { + "epoch": 14.02773246329527, + "grad_norm": 0.000751751009374857, + "learning_rate": 0.00024791935068700855, + "loss": 0.0011, + "num_input_tokens_seen": 185562736, + "step": 85990 + }, + { + "epoch": 14.028548123980425, + "grad_norm": 0.001128409756347537, + "learning_rate": 0.0002478578816698335, + "loss": 0.0096, + "num_input_tokens_seen": 185573008, + "step": 85995 + }, + { + "epoch": 14.029363784665579, + "grad_norm": 0.5704442858695984, + "learning_rate": 0.00024779641776263866, + "loss": 0.1181, + "num_input_tokens_seen": 185583088, + "step": 86000 + }, + { + "epoch": 14.030179445350734, + "grad_norm": 0.13311763107776642, + "learning_rate": 0.00024773495896666904, + "loss": 0.0041, + "num_input_tokens_seen": 185593680, + "step": 86005 + }, + { + "epoch": 14.030995106035888, + "grad_norm": 0.01829630695283413, + "learning_rate": 0.0002476735052831706, + "loss": 0.0156, + "num_input_tokens_seen": 185603952, + "step": 86010 + }, + { + "epoch": 14.031810766721044, + "grad_norm": 0.0008091620984487236, + "learning_rate": 0.0002476120567133888, + "loss": 0.0023, + "num_input_tokens_seen": 185613968, + "step": 86015 + }, + { + "epoch": 14.0326264274062, + "grad_norm": 0.28923311829566956, + "learning_rate": 0.0002475506132585687, + "loss": 0.0172, + "num_input_tokens_seen": 185624112, + "step": 86020 + }, + { + "epoch": 14.033442088091354, + "grad_norm": 0.005707558244466782, + "learning_rate": 0.0002474891749199558, + "loss": 0.0023, + "num_input_tokens_seen": 185633200, + "step": 86025 + }, + { + "epoch": 14.03425774877651, + "grad_norm": 0.00138555106241256, + "learning_rate": 0.000247427741698795, + "loss": 0.0026, + "num_input_tokens_seen": 185643664, + "step": 86030 + }, + { + "epoch": 14.035073409461663, + "grad_norm": 0.02076912485063076, + "learning_rate": 0.00024736631359633147, + "loss": 0.005, + "num_input_tokens_seen": 185654000, + "step": 86035 + }, + { + "epoch": 14.035889070146819, + "grad_norm": 0.005614807363599539, + "learning_rate": 0.00024730489061381013, + "loss": 0.0108, + "num_input_tokens_seen": 185664336, + "step": 86040 + }, + { + "epoch": 14.036704730831975, + "grad_norm": 0.0017058339435607195, + "learning_rate": 0.00024724347275247564, + "loss": 0.0043, + "num_input_tokens_seen": 185675088, + "step": 86045 + }, + { + "epoch": 14.037520391517129, + "grad_norm": 0.03262771666049957, + "learning_rate": 0.0002471820600135729, + "loss": 0.0203, + "num_input_tokens_seen": 185685936, + "step": 86050 + }, + { + "epoch": 14.038336052202284, + "grad_norm": 0.00868172850459814, + "learning_rate": 0.0002471206523983465, + "loss": 0.0078, + "num_input_tokens_seen": 185697552, + "step": 86055 + }, + { + "epoch": 14.039151712887438, + "grad_norm": 0.0012375538935884833, + "learning_rate": 0.00024705924990804076, + "loss": 0.0021, + "num_input_tokens_seen": 185708656, + "step": 86060 + }, + { + "epoch": 14.039967373572594, + "grad_norm": 0.0029186115134507418, + "learning_rate": 0.0002469978525439002, + "loss": 0.0022, + "num_input_tokens_seen": 185720080, + "step": 86065 + }, + { + "epoch": 14.040783034257748, + "grad_norm": 0.013675632886588573, + "learning_rate": 0.00024693646030716923, + "loss": 0.0014, + "num_input_tokens_seen": 185731120, + "step": 86070 + }, + { + "epoch": 14.041598694942904, + "grad_norm": 0.006938478443771601, + "learning_rate": 0.0002468750731990918, + "loss": 0.0046, + "num_input_tokens_seen": 185741936, + "step": 86075 + }, + { + "epoch": 14.04241435562806, + "grad_norm": 0.035929594188928604, + "learning_rate": 0.0002468136912209122, + "loss": 0.0419, + "num_input_tokens_seen": 185752592, + "step": 86080 + }, + { + "epoch": 14.043230016313213, + "grad_norm": 0.0026008724234998226, + "learning_rate": 0.0002467523143738743, + "loss": 0.0012, + "num_input_tokens_seen": 185763024, + "step": 86085 + }, + { + "epoch": 14.044045676998369, + "grad_norm": 0.0017996116075664759, + "learning_rate": 0.00024669094265922204, + "loss": 0.0083, + "num_input_tokens_seen": 185774032, + "step": 86090 + }, + { + "epoch": 14.044861337683523, + "grad_norm": 0.038363099098205566, + "learning_rate": 0.00024662957607819914, + "loss": 0.005, + "num_input_tokens_seen": 185785520, + "step": 86095 + }, + { + "epoch": 14.045676998368679, + "grad_norm": 0.013351533561944962, + "learning_rate": 0.00024656821463204913, + "loss": 0.0041, + "num_input_tokens_seen": 185795696, + "step": 86100 + }, + { + "epoch": 14.046492659053834, + "grad_norm": 0.38755908608436584, + "learning_rate": 0.0002465068583220161, + "loss": 0.0142, + "num_input_tokens_seen": 185806096, + "step": 86105 + }, + { + "epoch": 14.047308319738988, + "grad_norm": 0.005372211337089539, + "learning_rate": 0.0002464455071493429, + "loss": 0.0007, + "num_input_tokens_seen": 185816688, + "step": 86110 + }, + { + "epoch": 14.048123980424144, + "grad_norm": 0.029976407065987587, + "learning_rate": 0.00024638416111527346, + "loss": 0.0053, + "num_input_tokens_seen": 185827408, + "step": 86115 + }, + { + "epoch": 14.048939641109298, + "grad_norm": 0.001205800217576325, + "learning_rate": 0.0002463228202210503, + "loss": 0.0026, + "num_input_tokens_seen": 185837712, + "step": 86120 + }, + { + "epoch": 14.049755301794454, + "grad_norm": 0.0009493742836639285, + "learning_rate": 0.00024626148446791745, + "loss": 0.0029, + "num_input_tokens_seen": 185848336, + "step": 86125 + }, + { + "epoch": 14.05057096247961, + "grad_norm": 0.0019768273923546076, + "learning_rate": 0.00024620015385711706, + "loss": 0.0039, + "num_input_tokens_seen": 185858096, + "step": 86130 + }, + { + "epoch": 14.051386623164763, + "grad_norm": 0.053393010050058365, + "learning_rate": 0.000246138828389893, + "loss": 0.0077, + "num_input_tokens_seen": 185868528, + "step": 86135 + }, + { + "epoch": 14.052202283849919, + "grad_norm": 0.16236300766468048, + "learning_rate": 0.0002460775080674872, + "loss": 0.0066, + "num_input_tokens_seen": 185879952, + "step": 86140 + }, + { + "epoch": 14.053017944535073, + "grad_norm": 0.00963764451444149, + "learning_rate": 0.0002460161928911432, + "loss": 0.0043, + "num_input_tokens_seen": 185891920, + "step": 86145 + }, + { + "epoch": 14.053833605220229, + "grad_norm": 0.18794310092926025, + "learning_rate": 0.0002459548828621028, + "loss": 0.0153, + "num_input_tokens_seen": 185901936, + "step": 86150 + }, + { + "epoch": 14.054649265905383, + "grad_norm": 0.0008016020292416215, + "learning_rate": 0.00024589357798160925, + "loss": 0.0034, + "num_input_tokens_seen": 185913104, + "step": 86155 + }, + { + "epoch": 14.055464926590538, + "grad_norm": 0.00022456534497905523, + "learning_rate": 0.0002458322782509047, + "loss": 0.0004, + "num_input_tokens_seen": 185924496, + "step": 86160 + }, + { + "epoch": 14.056280587275694, + "grad_norm": 0.0023755901493132114, + "learning_rate": 0.00024577098367123146, + "loss": 0.0025, + "num_input_tokens_seen": 185936048, + "step": 86165 + }, + { + "epoch": 14.057096247960848, + "grad_norm": 0.022337786853313446, + "learning_rate": 0.00024570969424383174, + "loss": 0.0021, + "num_input_tokens_seen": 185947248, + "step": 86170 + }, + { + "epoch": 14.057911908646004, + "grad_norm": 0.004280845634639263, + "learning_rate": 0.00024564840996994764, + "loss": 0.0007, + "num_input_tokens_seen": 185958160, + "step": 86175 + }, + { + "epoch": 14.058727569331158, + "grad_norm": 0.001397744519636035, + "learning_rate": 0.0002455871308508212, + "loss": 0.0011, + "num_input_tokens_seen": 185968816, + "step": 86180 + }, + { + "epoch": 14.059543230016313, + "grad_norm": 0.0075432900339365005, + "learning_rate": 0.0002455258568876943, + "loss": 0.0012, + "num_input_tokens_seen": 185980144, + "step": 86185 + }, + { + "epoch": 14.060358890701469, + "grad_norm": 0.08409277349710464, + "learning_rate": 0.0002454645880818087, + "loss": 0.0101, + "num_input_tokens_seen": 185991696, + "step": 86190 + }, + { + "epoch": 14.061174551386623, + "grad_norm": 0.013024120591580868, + "learning_rate": 0.00024540332443440615, + "loss": 0.167, + "num_input_tokens_seen": 186001520, + "step": 86195 + }, + { + "epoch": 14.061990212071779, + "grad_norm": 0.0003296454669907689, + "learning_rate": 0.0002453420659467282, + "loss": 0.0101, + "num_input_tokens_seen": 186011728, + "step": 86200 + }, + { + "epoch": 14.062805872756933, + "grad_norm": 0.28865981101989746, + "learning_rate": 0.00024528081262001615, + "loss": 0.009, + "num_input_tokens_seen": 186022832, + "step": 86205 + }, + { + "epoch": 14.063621533442088, + "grad_norm": 0.01018576417118311, + "learning_rate": 0.000245219564455512, + "loss": 0.0014, + "num_input_tokens_seen": 186033584, + "step": 86210 + }, + { + "epoch": 14.064437194127244, + "grad_norm": 0.0077399867586791515, + "learning_rate": 0.00024515832145445614, + "loss": 0.0492, + "num_input_tokens_seen": 186044976, + "step": 86215 + }, + { + "epoch": 14.065252854812398, + "grad_norm": 0.021094804629683495, + "learning_rate": 0.0002450970836180906, + "loss": 0.0031, + "num_input_tokens_seen": 186056688, + "step": 86220 + }, + { + "epoch": 14.066068515497554, + "grad_norm": 0.012308260425925255, + "learning_rate": 0.0002450358509476556, + "loss": 0.0011, + "num_input_tokens_seen": 186068208, + "step": 86225 + }, + { + "epoch": 14.066884176182707, + "grad_norm": 0.002983053447678685, + "learning_rate": 0.00024497462344439297, + "loss": 0.0026, + "num_input_tokens_seen": 186080784, + "step": 86230 + }, + { + "epoch": 14.067699836867863, + "grad_norm": 0.001272146706469357, + "learning_rate": 0.0002449134011095427, + "loss": 0.0005, + "num_input_tokens_seen": 186092816, + "step": 86235 + }, + { + "epoch": 14.068515497553017, + "grad_norm": 0.00044163476559333503, + "learning_rate": 0.0002448521839443464, + "loss": 0.0063, + "num_input_tokens_seen": 186103856, + "step": 86240 + }, + { + "epoch": 14.069331158238173, + "grad_norm": 0.028374919667840004, + "learning_rate": 0.00024479097195004377, + "loss": 0.0018, + "num_input_tokens_seen": 186115664, + "step": 86245 + }, + { + "epoch": 14.070146818923329, + "grad_norm": 0.01356195006519556, + "learning_rate": 0.0002447297651278763, + "loss": 0.0026, + "num_input_tokens_seen": 186125488, + "step": 86250 + }, + { + "epoch": 14.070962479608482, + "grad_norm": 0.003203232306987047, + "learning_rate": 0.0002446685634790836, + "loss": 0.0032, + "num_input_tokens_seen": 186137424, + "step": 86255 + }, + { + "epoch": 14.071778140293638, + "grad_norm": 0.030309241265058517, + "learning_rate": 0.00024460736700490676, + "loss": 0.0148, + "num_input_tokens_seen": 186147920, + "step": 86260 + }, + { + "epoch": 14.072593800978792, + "grad_norm": 0.002427339553833008, + "learning_rate": 0.00024454617570658524, + "loss": 0.0017, + "num_input_tokens_seen": 186158192, + "step": 86265 + }, + { + "epoch": 14.073409461663948, + "grad_norm": 0.003980029374361038, + "learning_rate": 0.00024448498958535984, + "loss": 0.0019, + "num_input_tokens_seen": 186168912, + "step": 86270 + }, + { + "epoch": 14.074225122349104, + "grad_norm": 0.00046570925042033195, + "learning_rate": 0.00024442380864247, + "loss": 0.0629, + "num_input_tokens_seen": 186179024, + "step": 86275 + }, + { + "epoch": 14.075040783034257, + "grad_norm": 0.02144770324230194, + "learning_rate": 0.00024436263287915623, + "loss": 0.0281, + "num_input_tokens_seen": 186189136, + "step": 86280 + }, + { + "epoch": 14.075856443719413, + "grad_norm": 0.015274712815880775, + "learning_rate": 0.00024430146229665754, + "loss": 0.0018, + "num_input_tokens_seen": 186200112, + "step": 86285 + }, + { + "epoch": 14.076672104404567, + "grad_norm": 0.014072231948375702, + "learning_rate": 0.0002442402968962146, + "loss": 0.0083, + "num_input_tokens_seen": 186209264, + "step": 86290 + }, + { + "epoch": 14.077487765089723, + "grad_norm": 0.006096137687563896, + "learning_rate": 0.00024417913667906604, + "loss": 0.0028, + "num_input_tokens_seen": 186220624, + "step": 86295 + }, + { + "epoch": 14.078303425774878, + "grad_norm": 0.11654862761497498, + "learning_rate": 0.00024411798164645205, + "loss": 0.0545, + "num_input_tokens_seen": 186231536, + "step": 86300 + }, + { + "epoch": 14.079119086460032, + "grad_norm": 0.03843030333518982, + "learning_rate": 0.00024405683179961176, + "loss": 0.0026, + "num_input_tokens_seen": 186242192, + "step": 86305 + }, + { + "epoch": 14.079934747145188, + "grad_norm": 0.0007979121874086559, + "learning_rate": 0.00024399568713978444, + "loss": 0.0017, + "num_input_tokens_seen": 186253904, + "step": 86310 + }, + { + "epoch": 14.080750407830342, + "grad_norm": 0.02977071702480316, + "learning_rate": 0.00024393454766820927, + "loss": 0.0044, + "num_input_tokens_seen": 186266000, + "step": 86315 + }, + { + "epoch": 14.081566068515498, + "grad_norm": 0.000604961474891752, + "learning_rate": 0.00024387341338612535, + "loss": 0.0041, + "num_input_tokens_seen": 186275760, + "step": 86320 + }, + { + "epoch": 14.082381729200652, + "grad_norm": 0.0005818359786644578, + "learning_rate": 0.00024381228429477166, + "loss": 0.0006, + "num_input_tokens_seen": 186285936, + "step": 86325 + }, + { + "epoch": 14.083197389885807, + "grad_norm": 0.015048845671117306, + "learning_rate": 0.00024375116039538697, + "loss": 0.0012, + "num_input_tokens_seen": 186296144, + "step": 86330 + }, + { + "epoch": 14.084013050570963, + "grad_norm": 0.4808575510978699, + "learning_rate": 0.0002436900416892101, + "loss": 0.0312, + "num_input_tokens_seen": 186307792, + "step": 86335 + }, + { + "epoch": 14.084828711256117, + "grad_norm": 0.0050271605141460896, + "learning_rate": 0.00024362892817747972, + "loss": 0.0006, + "num_input_tokens_seen": 186319760, + "step": 86340 + }, + { + "epoch": 14.085644371941273, + "grad_norm": 0.027988841757178307, + "learning_rate": 0.00024356781986143434, + "loss": 0.0063, + "num_input_tokens_seen": 186331248, + "step": 86345 + }, + { + "epoch": 14.086460032626427, + "grad_norm": 0.0016712294891476631, + "learning_rate": 0.00024350671674231217, + "loss": 0.001, + "num_input_tokens_seen": 186341392, + "step": 86350 + }, + { + "epoch": 14.087275693311582, + "grad_norm": 0.0075509450398385525, + "learning_rate": 0.0002434456188213522, + "loss": 0.001, + "num_input_tokens_seen": 186352528, + "step": 86355 + }, + { + "epoch": 14.088091353996738, + "grad_norm": 0.0021157930605113506, + "learning_rate": 0.00024338452609979177, + "loss": 0.002, + "num_input_tokens_seen": 186364816, + "step": 86360 + }, + { + "epoch": 14.088907014681892, + "grad_norm": 0.27282044291496277, + "learning_rate": 0.0002433234385788699, + "loss": 0.0292, + "num_input_tokens_seen": 186376304, + "step": 86365 + }, + { + "epoch": 14.089722675367048, + "grad_norm": 0.0005172526580281556, + "learning_rate": 0.00024326235625982378, + "loss": 0.0014, + "num_input_tokens_seen": 186387152, + "step": 86370 + }, + { + "epoch": 14.090538336052202, + "grad_norm": 0.0065985131077468395, + "learning_rate": 0.00024320127914389213, + "loss": 0.0039, + "num_input_tokens_seen": 186398096, + "step": 86375 + }, + { + "epoch": 14.091353996737357, + "grad_norm": 0.536579430103302, + "learning_rate": 0.00024314020723231183, + "loss": 0.0208, + "num_input_tokens_seen": 186408720, + "step": 86380 + }, + { + "epoch": 14.092169657422513, + "grad_norm": 0.0008578845299780369, + "learning_rate": 0.00024307914052632159, + "loss": 0.0051, + "num_input_tokens_seen": 186420528, + "step": 86385 + }, + { + "epoch": 14.092985318107667, + "grad_norm": 0.0017695410642772913, + "learning_rate": 0.000243018079027158, + "loss": 0.0027, + "num_input_tokens_seen": 186430512, + "step": 86390 + }, + { + "epoch": 14.093800978792823, + "grad_norm": 0.5528125762939453, + "learning_rate": 0.0002429570227360595, + "loss": 0.2923, + "num_input_tokens_seen": 186442224, + "step": 86395 + }, + { + "epoch": 14.094616639477977, + "grad_norm": 0.012076152488589287, + "learning_rate": 0.00024289597165426264, + "loss": 0.0012, + "num_input_tokens_seen": 186451888, + "step": 86400 + }, + { + "epoch": 14.095432300163132, + "grad_norm": 0.03702862188220024, + "learning_rate": 0.00024283492578300542, + "loss": 0.0103, + "num_input_tokens_seen": 186462576, + "step": 86405 + }, + { + "epoch": 14.096247960848286, + "grad_norm": 0.006007469724863768, + "learning_rate": 0.00024277388512352428, + "loss": 0.1221, + "num_input_tokens_seen": 186472144, + "step": 86410 + }, + { + "epoch": 14.097063621533442, + "grad_norm": 0.001316079287789762, + "learning_rate": 0.00024271284967705687, + "loss": 0.0123, + "num_input_tokens_seen": 186483696, + "step": 86415 + }, + { + "epoch": 14.097879282218598, + "grad_norm": 0.0004989941953681409, + "learning_rate": 0.00024265181944483995, + "loss": 0.0027, + "num_input_tokens_seen": 186495120, + "step": 86420 + }, + { + "epoch": 14.098694942903752, + "grad_norm": 0.001054865773767233, + "learning_rate": 0.0002425907944281104, + "loss": 0.003, + "num_input_tokens_seen": 186505520, + "step": 86425 + }, + { + "epoch": 14.099510603588907, + "grad_norm": 0.02121868170797825, + "learning_rate": 0.00024252977462810494, + "loss": 0.0419, + "num_input_tokens_seen": 186516944, + "step": 86430 + }, + { + "epoch": 14.100326264274061, + "grad_norm": 0.011116947047412395, + "learning_rate": 0.0002424687600460602, + "loss": 0.1101, + "num_input_tokens_seen": 186526224, + "step": 86435 + }, + { + "epoch": 14.101141924959217, + "grad_norm": 0.0024929612409323454, + "learning_rate": 0.00024240775068321273, + "loss": 0.003, + "num_input_tokens_seen": 186535792, + "step": 86440 + }, + { + "epoch": 14.101957585644373, + "grad_norm": 0.009313315153121948, + "learning_rate": 0.00024234674654079901, + "loss": 0.0035, + "num_input_tokens_seen": 186546096, + "step": 86445 + }, + { + "epoch": 14.102773246329527, + "grad_norm": 0.0010906142415478826, + "learning_rate": 0.00024228574762005534, + "loss": 0.0019, + "num_input_tokens_seen": 186557616, + "step": 86450 + }, + { + "epoch": 14.103588907014682, + "grad_norm": 0.0051316674798727036, + "learning_rate": 0.00024222475392221787, + "loss": 0.0217, + "num_input_tokens_seen": 186568368, + "step": 86455 + }, + { + "epoch": 14.104404567699836, + "grad_norm": 0.011505071073770523, + "learning_rate": 0.0002421637654485228, + "loss": 0.0312, + "num_input_tokens_seen": 186578640, + "step": 86460 + }, + { + "epoch": 14.105220228384992, + "grad_norm": 0.008313409052789211, + "learning_rate": 0.00024210278220020614, + "loss": 0.0106, + "num_input_tokens_seen": 186588752, + "step": 86465 + }, + { + "epoch": 14.106035889070148, + "grad_norm": 0.001642403076402843, + "learning_rate": 0.00024204180417850373, + "loss": 0.0024, + "num_input_tokens_seen": 186599504, + "step": 86470 + }, + { + "epoch": 14.106851549755302, + "grad_norm": 0.007977745495736599, + "learning_rate": 0.00024198083138465143, + "loss": 0.0083, + "num_input_tokens_seen": 186610096, + "step": 86475 + }, + { + "epoch": 14.107667210440457, + "grad_norm": 0.022300442680716515, + "learning_rate": 0.0002419198638198849, + "loss": 0.0027, + "num_input_tokens_seen": 186620464, + "step": 86480 + }, + { + "epoch": 14.108482871125611, + "grad_norm": 0.0025423523038625717, + "learning_rate": 0.0002418589014854397, + "loss": 0.0073, + "num_input_tokens_seen": 186632208, + "step": 86485 + }, + { + "epoch": 14.109298531810767, + "grad_norm": 0.04908493533730507, + "learning_rate": 0.00024179794438255133, + "loss": 0.0072, + "num_input_tokens_seen": 186643344, + "step": 86490 + }, + { + "epoch": 14.11011419249592, + "grad_norm": 0.026948802173137665, + "learning_rate": 0.000241736992512455, + "loss": 0.003, + "num_input_tokens_seen": 186655248, + "step": 86495 + }, + { + "epoch": 14.110929853181077, + "grad_norm": 0.0006145177758298814, + "learning_rate": 0.00024167604587638653, + "loss": 0.0022, + "num_input_tokens_seen": 186665776, + "step": 86500 + }, + { + "epoch": 14.111745513866232, + "grad_norm": 0.0027730839792639017, + "learning_rate": 0.00024161510447558032, + "loss": 0.0092, + "num_input_tokens_seen": 186675024, + "step": 86505 + }, + { + "epoch": 14.112561174551386, + "grad_norm": 0.002394838957116008, + "learning_rate": 0.0002415541683112722, + "loss": 0.0336, + "num_input_tokens_seen": 186685648, + "step": 86510 + }, + { + "epoch": 14.113376835236542, + "grad_norm": 0.12717679142951965, + "learning_rate": 0.0002414932373846963, + "loss": 0.0077, + "num_input_tokens_seen": 186697424, + "step": 86515 + }, + { + "epoch": 14.114192495921696, + "grad_norm": 0.006228148005902767, + "learning_rate": 0.00024143231169708806, + "loss": 0.0046, + "num_input_tokens_seen": 186707760, + "step": 86520 + }, + { + "epoch": 14.115008156606851, + "grad_norm": 0.00448314705863595, + "learning_rate": 0.0002413713912496821, + "loss": 0.003, + "num_input_tokens_seen": 186717392, + "step": 86525 + }, + { + "epoch": 14.115823817292007, + "grad_norm": 0.013983353041112423, + "learning_rate": 0.00024131047604371292, + "loss": 0.0061, + "num_input_tokens_seen": 186728496, + "step": 86530 + }, + { + "epoch": 14.116639477977161, + "grad_norm": 0.0534401535987854, + "learning_rate": 0.0002412495660804152, + "loss": 0.1007, + "num_input_tokens_seen": 186740592, + "step": 86535 + }, + { + "epoch": 14.117455138662317, + "grad_norm": 0.09861243516206741, + "learning_rate": 0.0002411886613610232, + "loss": 0.0851, + "num_input_tokens_seen": 186751152, + "step": 86540 + }, + { + "epoch": 14.11827079934747, + "grad_norm": 0.012586988508701324, + "learning_rate": 0.00024112776188677133, + "loss": 0.0062, + "num_input_tokens_seen": 186762064, + "step": 86545 + }, + { + "epoch": 14.119086460032626, + "grad_norm": 0.0031633928883820772, + "learning_rate": 0.0002410668676588938, + "loss": 0.0042, + "num_input_tokens_seen": 186773808, + "step": 86550 + }, + { + "epoch": 14.119902120717782, + "grad_norm": 0.03359987214207649, + "learning_rate": 0.0002410059786786246, + "loss": 0.0026, + "num_input_tokens_seen": 186783792, + "step": 86555 + }, + { + "epoch": 14.120717781402936, + "grad_norm": 0.0006193072767928243, + "learning_rate": 0.00024094509494719784, + "loss": 0.0541, + "num_input_tokens_seen": 186793072, + "step": 86560 + }, + { + "epoch": 14.121533442088092, + "grad_norm": 0.2499951422214508, + "learning_rate": 0.0002408842164658474, + "loss": 0.0064, + "num_input_tokens_seen": 186803632, + "step": 86565 + }, + { + "epoch": 14.122349102773246, + "grad_norm": 0.14925557374954224, + "learning_rate": 0.00024082334323580695, + "loss": 0.0107, + "num_input_tokens_seen": 186815536, + "step": 86570 + }, + { + "epoch": 14.123164763458401, + "grad_norm": 0.02999800071120262, + "learning_rate": 0.0002407624752583103, + "loss": 0.0031, + "num_input_tokens_seen": 186826384, + "step": 86575 + }, + { + "epoch": 14.123980424143557, + "grad_norm": 0.02169770374894142, + "learning_rate": 0.00024070161253459093, + "loss": 0.0056, + "num_input_tokens_seen": 186837616, + "step": 86580 + }, + { + "epoch": 14.124796084828711, + "grad_norm": 0.00973606389015913, + "learning_rate": 0.00024064075506588235, + "loss": 0.0052, + "num_input_tokens_seen": 186847632, + "step": 86585 + }, + { + "epoch": 14.125611745513867, + "grad_norm": 0.018632404506206512, + "learning_rate": 0.00024057990285341786, + "loss": 0.0053, + "num_input_tokens_seen": 186858384, + "step": 86590 + }, + { + "epoch": 14.12642740619902, + "grad_norm": 0.017117558047175407, + "learning_rate": 0.00024051905589843076, + "loss": 0.0069, + "num_input_tokens_seen": 186868624, + "step": 86595 + }, + { + "epoch": 14.127243066884176, + "grad_norm": 0.006402240134775639, + "learning_rate": 0.00024045821420215412, + "loss": 0.0619, + "num_input_tokens_seen": 186879056, + "step": 86600 + }, + { + "epoch": 14.12805872756933, + "grad_norm": 0.00499645434319973, + "learning_rate": 0.0002403973777658211, + "loss": 0.0023, + "num_input_tokens_seen": 186889904, + "step": 86605 + }, + { + "epoch": 14.128874388254486, + "grad_norm": 0.00235865474678576, + "learning_rate": 0.0002403365465906645, + "loss": 0.0041, + "num_input_tokens_seen": 186899696, + "step": 86610 + }, + { + "epoch": 14.129690048939642, + "grad_norm": 0.037770580500364304, + "learning_rate": 0.0002402757206779172, + "loss": 0.0043, + "num_input_tokens_seen": 186910384, + "step": 86615 + }, + { + "epoch": 14.130505709624796, + "grad_norm": 0.005501674488186836, + "learning_rate": 0.00024021490002881186, + "loss": 0.0075, + "num_input_tokens_seen": 186921392, + "step": 86620 + }, + { + "epoch": 14.131321370309951, + "grad_norm": 0.004268792923539877, + "learning_rate": 0.000240154084644581, + "loss": 0.0027, + "num_input_tokens_seen": 186932880, + "step": 86625 + }, + { + "epoch": 14.132137030995105, + "grad_norm": 0.10287728160619736, + "learning_rate": 0.0002400932745264574, + "loss": 0.0095, + "num_input_tokens_seen": 186942064, + "step": 86630 + }, + { + "epoch": 14.132952691680261, + "grad_norm": 0.04187287762761116, + "learning_rate": 0.00024003246967567332, + "loss": 0.0033, + "num_input_tokens_seen": 186951408, + "step": 86635 + }, + { + "epoch": 14.133768352365417, + "grad_norm": 0.0038913721218705177, + "learning_rate": 0.00023997167009346104, + "loss": 0.0011, + "num_input_tokens_seen": 186963760, + "step": 86640 + }, + { + "epoch": 14.13458401305057, + "grad_norm": 0.006497113034129143, + "learning_rate": 0.00023991087578105274, + "loss": 0.0017, + "num_input_tokens_seen": 186974544, + "step": 86645 + }, + { + "epoch": 14.135399673735726, + "grad_norm": 0.00020977971144020557, + "learning_rate": 0.00023985008673968052, + "loss": 0.0011, + "num_input_tokens_seen": 186985328, + "step": 86650 + }, + { + "epoch": 14.13621533442088, + "grad_norm": 0.013788911513984203, + "learning_rate": 0.00023978930297057627, + "loss": 0.0084, + "num_input_tokens_seen": 186995504, + "step": 86655 + }, + { + "epoch": 14.137030995106036, + "grad_norm": 0.052885398268699646, + "learning_rate": 0.0002397285244749719, + "loss": 0.0028, + "num_input_tokens_seen": 187006544, + "step": 86660 + }, + { + "epoch": 14.137846655791192, + "grad_norm": 0.00046698813093826175, + "learning_rate": 0.00023966775125409918, + "loss": 0.0949, + "num_input_tokens_seen": 187017328, + "step": 86665 + }, + { + "epoch": 14.138662316476346, + "grad_norm": 0.006365417968481779, + "learning_rate": 0.00023960698330918972, + "loss": 0.0008, + "num_input_tokens_seen": 187028464, + "step": 86670 + }, + { + "epoch": 14.139477977161501, + "grad_norm": 0.0025994847528636456, + "learning_rate": 0.00023954622064147507, + "loss": 0.0057, + "num_input_tokens_seen": 187038736, + "step": 86675 + }, + { + "epoch": 14.140293637846655, + "grad_norm": 0.0004194117500446737, + "learning_rate": 0.00023948546325218667, + "loss": 0.0105, + "num_input_tokens_seen": 187050000, + "step": 86680 + }, + { + "epoch": 14.141109298531811, + "grad_norm": 0.004431330598890781, + "learning_rate": 0.00023942471114255588, + "loss": 0.0063, + "num_input_tokens_seen": 187061040, + "step": 86685 + }, + { + "epoch": 14.141924959216965, + "grad_norm": 0.0016755033284425735, + "learning_rate": 0.00023936396431381386, + "loss": 0.0012, + "num_input_tokens_seen": 187071728, + "step": 86690 + }, + { + "epoch": 14.14274061990212, + "grad_norm": 0.0024547064676880836, + "learning_rate": 0.00023930322276719175, + "loss": 0.0008, + "num_input_tokens_seen": 187083440, + "step": 86695 + }, + { + "epoch": 14.143556280587276, + "grad_norm": 0.010015696287155151, + "learning_rate": 0.0002392424865039205, + "loss": 0.0054, + "num_input_tokens_seen": 187093872, + "step": 86700 + }, + { + "epoch": 14.14437194127243, + "grad_norm": 0.004910641815513372, + "learning_rate": 0.0002391817555252311, + "loss": 0.0042, + "num_input_tokens_seen": 187104208, + "step": 86705 + }, + { + "epoch": 14.145187601957586, + "grad_norm": 0.0027360564563423395, + "learning_rate": 0.0002391210298323543, + "loss": 0.0187, + "num_input_tokens_seen": 187114960, + "step": 86710 + }, + { + "epoch": 14.14600326264274, + "grad_norm": 0.011363659985363483, + "learning_rate": 0.00023906030942652073, + "loss": 0.0028, + "num_input_tokens_seen": 187125200, + "step": 86715 + }, + { + "epoch": 14.146818923327896, + "grad_norm": 0.006349723320454359, + "learning_rate": 0.00023899959430896106, + "loss": 0.0027, + "num_input_tokens_seen": 187135248, + "step": 86720 + }, + { + "epoch": 14.147634584013051, + "grad_norm": 0.011810396797955036, + "learning_rate": 0.00023893888448090573, + "loss": 0.0043, + "num_input_tokens_seen": 187145904, + "step": 86725 + }, + { + "epoch": 14.148450244698205, + "grad_norm": 0.005554541479796171, + "learning_rate": 0.00023887817994358484, + "loss": 0.003, + "num_input_tokens_seen": 187155504, + "step": 86730 + }, + { + "epoch": 14.149265905383361, + "grad_norm": 0.001082675182260573, + "learning_rate": 0.0002388174806982293, + "loss": 0.0083, + "num_input_tokens_seen": 187167408, + "step": 86735 + }, + { + "epoch": 14.150081566068515, + "grad_norm": 0.0032948791049420834, + "learning_rate": 0.00023875678674606848, + "loss": 0.0021, + "num_input_tokens_seen": 187178064, + "step": 86740 + }, + { + "epoch": 14.15089722675367, + "grad_norm": 0.013944054022431374, + "learning_rate": 0.00023869609808833316, + "loss": 0.0046, + "num_input_tokens_seen": 187188976, + "step": 86745 + }, + { + "epoch": 14.151712887438826, + "grad_norm": 0.009103807620704174, + "learning_rate": 0.0002386354147262525, + "loss": 0.0023, + "num_input_tokens_seen": 187199312, + "step": 86750 + }, + { + "epoch": 14.15252854812398, + "grad_norm": 0.0019065203377977014, + "learning_rate": 0.0002385747366610571, + "loss": 0.0012, + "num_input_tokens_seen": 187208816, + "step": 86755 + }, + { + "epoch": 14.153344208809136, + "grad_norm": 0.0811125785112381, + "learning_rate": 0.00023851406389397594, + "loss": 0.0118, + "num_input_tokens_seen": 187218672, + "step": 86760 + }, + { + "epoch": 14.15415986949429, + "grad_norm": 0.01378593873232603, + "learning_rate": 0.00023845339642623937, + "loss": 0.0067, + "num_input_tokens_seen": 187230096, + "step": 86765 + }, + { + "epoch": 14.154975530179446, + "grad_norm": 0.7698084712028503, + "learning_rate": 0.00023839273425907615, + "loss": 0.0429, + "num_input_tokens_seen": 187241264, + "step": 86770 + }, + { + "epoch": 14.1557911908646, + "grad_norm": 0.0028667557053267956, + "learning_rate": 0.0002383320773937162, + "loss": 0.013, + "num_input_tokens_seen": 187250192, + "step": 86775 + }, + { + "epoch": 14.156606851549755, + "grad_norm": 0.011749546974897385, + "learning_rate": 0.00023827142583138873, + "loss": 0.0048, + "num_input_tokens_seen": 187260720, + "step": 86780 + }, + { + "epoch": 14.15742251223491, + "grad_norm": 0.019493889063596725, + "learning_rate": 0.00023821077957332276, + "loss": 0.0125, + "num_input_tokens_seen": 187270096, + "step": 86785 + }, + { + "epoch": 14.158238172920065, + "grad_norm": 0.0005906001897528768, + "learning_rate": 0.00023815013862074746, + "loss": 0.0129, + "num_input_tokens_seen": 187280368, + "step": 86790 + }, + { + "epoch": 14.15905383360522, + "grad_norm": 0.0011287572560831904, + "learning_rate": 0.0002380895029748918, + "loss": 0.0025, + "num_input_tokens_seen": 187291056, + "step": 86795 + }, + { + "epoch": 14.159869494290374, + "grad_norm": 0.01778515614569187, + "learning_rate": 0.00023802887263698464, + "loss": 0.0162, + "num_input_tokens_seen": 187303184, + "step": 86800 + }, + { + "epoch": 14.16068515497553, + "grad_norm": 0.020611291751265526, + "learning_rate": 0.00023796824760825464, + "loss": 0.0008, + "num_input_tokens_seen": 187314896, + "step": 86805 + }, + { + "epoch": 14.161500815660686, + "grad_norm": 0.015079019591212273, + "learning_rate": 0.0002379076278899306, + "loss": 0.0019, + "num_input_tokens_seen": 187325712, + "step": 86810 + }, + { + "epoch": 14.16231647634584, + "grad_norm": 0.0024150789249688387, + "learning_rate": 0.0002378470134832409, + "loss": 0.0017, + "num_input_tokens_seen": 187337136, + "step": 86815 + }, + { + "epoch": 14.163132137030995, + "grad_norm": 0.11301400512456894, + "learning_rate": 0.00023778640438941408, + "loss": 0.002, + "num_input_tokens_seen": 187349008, + "step": 86820 + }, + { + "epoch": 14.16394779771615, + "grad_norm": 0.012705371715128422, + "learning_rate": 0.00023772580060967834, + "loss": 0.0019, + "num_input_tokens_seen": 187359376, + "step": 86825 + }, + { + "epoch": 14.164763458401305, + "grad_norm": 0.027769785374403, + "learning_rate": 0.00023766520214526206, + "loss": 0.0187, + "num_input_tokens_seen": 187371632, + "step": 86830 + }, + { + "epoch": 14.16557911908646, + "grad_norm": 0.0008469356107525527, + "learning_rate": 0.00023760460899739322, + "loss": 0.0006, + "num_input_tokens_seen": 187381520, + "step": 86835 + }, + { + "epoch": 14.166394779771615, + "grad_norm": 0.6364102363586426, + "learning_rate": 0.00023754402116729983, + "loss": 0.0197, + "num_input_tokens_seen": 187392336, + "step": 86840 + }, + { + "epoch": 14.16721044045677, + "grad_norm": 0.005157504230737686, + "learning_rate": 0.00023748343865620964, + "loss": 0.0015, + "num_input_tokens_seen": 187403504, + "step": 86845 + }, + { + "epoch": 14.168026101141924, + "grad_norm": 0.015075829811394215, + "learning_rate": 0.00023742286146535098, + "loss": 0.0022, + "num_input_tokens_seen": 187413776, + "step": 86850 + }, + { + "epoch": 14.16884176182708, + "grad_norm": 0.00047975595225580037, + "learning_rate": 0.00023736228959595073, + "loss": 0.0007, + "num_input_tokens_seen": 187425104, + "step": 86855 + }, + { + "epoch": 14.169657422512234, + "grad_norm": 0.004768040031194687, + "learning_rate": 0.00023730172304923725, + "loss": 0.003, + "num_input_tokens_seen": 187435952, + "step": 86860 + }, + { + "epoch": 14.17047308319739, + "grad_norm": 0.0004957161727361381, + "learning_rate": 0.00023724116182643725, + "loss": 0.0209, + "num_input_tokens_seen": 187446736, + "step": 86865 + }, + { + "epoch": 14.171288743882545, + "grad_norm": 0.010681820102036, + "learning_rate": 0.00023718060592877878, + "loss": 0.0014, + "num_input_tokens_seen": 187458128, + "step": 86870 + }, + { + "epoch": 14.1721044045677, + "grad_norm": 0.025586340576410294, + "learning_rate": 0.00023712005535748838, + "loss": 0.0027, + "num_input_tokens_seen": 187469104, + "step": 86875 + }, + { + "epoch": 14.172920065252855, + "grad_norm": 0.01762760616838932, + "learning_rate": 0.0002370595101137939, + "loss": 0.0014, + "num_input_tokens_seen": 187480528, + "step": 86880 + }, + { + "epoch": 14.173735725938009, + "grad_norm": 0.0017354979645460844, + "learning_rate": 0.00023699897019892165, + "loss": 0.0087, + "num_input_tokens_seen": 187491920, + "step": 86885 + }, + { + "epoch": 14.174551386623165, + "grad_norm": 0.0018656639149412513, + "learning_rate": 0.00023693843561409928, + "loss": 0.0034, + "num_input_tokens_seen": 187502512, + "step": 86890 + }, + { + "epoch": 14.17536704730832, + "grad_norm": 0.0012683272361755371, + "learning_rate": 0.0002368779063605529, + "loss": 0.0399, + "num_input_tokens_seen": 187514480, + "step": 86895 + }, + { + "epoch": 14.176182707993474, + "grad_norm": 0.00021438190015032887, + "learning_rate": 0.00023681738243950984, + "loss": 0.0014, + "num_input_tokens_seen": 187524752, + "step": 86900 + }, + { + "epoch": 14.17699836867863, + "grad_norm": 0.07331182807683945, + "learning_rate": 0.00023675686385219607, + "loss": 0.0051, + "num_input_tokens_seen": 187533904, + "step": 86905 + }, + { + "epoch": 14.177814029363784, + "grad_norm": 0.17696590721607208, + "learning_rate": 0.0002366963505998388, + "loss": 0.0067, + "num_input_tokens_seen": 187545264, + "step": 86910 + }, + { + "epoch": 14.17862969004894, + "grad_norm": 0.0036053138319402933, + "learning_rate": 0.00023663584268366356, + "loss": 0.0013, + "num_input_tokens_seen": 187556944, + "step": 86915 + }, + { + "epoch": 14.179445350734095, + "grad_norm": 0.001828488428145647, + "learning_rate": 0.00023657534010489733, + "loss": 0.0019, + "num_input_tokens_seen": 187567728, + "step": 86920 + }, + { + "epoch": 14.18026101141925, + "grad_norm": 0.0008185507031157613, + "learning_rate": 0.000236514842864766, + "loss": 0.0019, + "num_input_tokens_seen": 187577712, + "step": 86925 + }, + { + "epoch": 14.181076672104405, + "grad_norm": 0.011759464628994465, + "learning_rate": 0.00023645435096449557, + "loss": 0.0692, + "num_input_tokens_seen": 187588048, + "step": 86930 + }, + { + "epoch": 14.181892332789559, + "grad_norm": 0.005426196381449699, + "learning_rate": 0.00023639386440531208, + "loss": 0.0135, + "num_input_tokens_seen": 187598832, + "step": 86935 + }, + { + "epoch": 14.182707993474715, + "grad_norm": 0.004411112517118454, + "learning_rate": 0.00023633338318844137, + "loss": 0.009, + "num_input_tokens_seen": 187610608, + "step": 86940 + }, + { + "epoch": 14.18352365415987, + "grad_norm": 0.0003201637009624392, + "learning_rate": 0.00023627290731510908, + "loss": 0.0004, + "num_input_tokens_seen": 187623024, + "step": 86945 + }, + { + "epoch": 14.184339314845024, + "grad_norm": 0.0009169112308882177, + "learning_rate": 0.00023621243678654099, + "loss": 0.2764, + "num_input_tokens_seen": 187634800, + "step": 86950 + }, + { + "epoch": 14.18515497553018, + "grad_norm": 0.021399592980742455, + "learning_rate": 0.0002361519716039624, + "loss": 0.0116, + "num_input_tokens_seen": 187646480, + "step": 86955 + }, + { + "epoch": 14.185970636215334, + "grad_norm": 0.00042506548925302923, + "learning_rate": 0.00023609151176859884, + "loss": 0.009, + "num_input_tokens_seen": 187655408, + "step": 86960 + }, + { + "epoch": 14.18678629690049, + "grad_norm": 1.1015949249267578, + "learning_rate": 0.00023603105728167562, + "loss": 0.0738, + "num_input_tokens_seen": 187665840, + "step": 86965 + }, + { + "epoch": 14.187601957585644, + "grad_norm": 0.0004070644499734044, + "learning_rate": 0.00023597060814441767, + "loss": 0.0086, + "num_input_tokens_seen": 187676144, + "step": 86970 + }, + { + "epoch": 14.1884176182708, + "grad_norm": 0.015440022572875023, + "learning_rate": 0.00023591016435805067, + "loss": 0.0054, + "num_input_tokens_seen": 187686896, + "step": 86975 + }, + { + "epoch": 14.189233278955955, + "grad_norm": 5.4696125984191895, + "learning_rate": 0.00023584972592379888, + "loss": 0.0839, + "num_input_tokens_seen": 187698160, + "step": 86980 + }, + { + "epoch": 14.190048939641109, + "grad_norm": 0.0031104008667171, + "learning_rate": 0.0002357892928428878, + "loss": 0.0036, + "num_input_tokens_seen": 187708272, + "step": 86985 + }, + { + "epoch": 14.190864600326265, + "grad_norm": 0.004613074939697981, + "learning_rate": 0.00023572886511654157, + "loss": 0.0018, + "num_input_tokens_seen": 187720848, + "step": 86990 + }, + { + "epoch": 14.191680261011419, + "grad_norm": 0.00598951056599617, + "learning_rate": 0.00023566844274598548, + "loss": 0.0074, + "num_input_tokens_seen": 187730384, + "step": 86995 + }, + { + "epoch": 14.192495921696574, + "grad_norm": 0.5423932075500488, + "learning_rate": 0.00023560802573244333, + "loss": 0.0145, + "num_input_tokens_seen": 187742256, + "step": 87000 + }, + { + "epoch": 14.19331158238173, + "grad_norm": 0.0018402452114969492, + "learning_rate": 0.00023554761407714036, + "loss": 0.001, + "num_input_tokens_seen": 187752336, + "step": 87005 + }, + { + "epoch": 14.194127243066884, + "grad_norm": 0.001260284916497767, + "learning_rate": 0.00023548720778130005, + "loss": 0.007, + "num_input_tokens_seen": 187762192, + "step": 87010 + }, + { + "epoch": 14.19494290375204, + "grad_norm": 0.006623595021665096, + "learning_rate": 0.0002354268068461475, + "loss": 0.0118, + "num_input_tokens_seen": 187773040, + "step": 87015 + }, + { + "epoch": 14.195758564437194, + "grad_norm": 0.00867091678082943, + "learning_rate": 0.00023536641127290588, + "loss": 0.0054, + "num_input_tokens_seen": 187783376, + "step": 87020 + }, + { + "epoch": 14.19657422512235, + "grad_norm": 0.0003904358600266278, + "learning_rate": 0.00023530602106280004, + "loss": 0.0008, + "num_input_tokens_seen": 187794768, + "step": 87025 + }, + { + "epoch": 14.197389885807505, + "grad_norm": 0.09367648512125015, + "learning_rate": 0.00023524563621705308, + "loss": 0.0045, + "num_input_tokens_seen": 187805680, + "step": 87030 + }, + { + "epoch": 14.198205546492659, + "grad_norm": 0.406418114900589, + "learning_rate": 0.00023518525673688957, + "loss": 0.0145, + "num_input_tokens_seen": 187816784, + "step": 87035 + }, + { + "epoch": 14.199021207177815, + "grad_norm": 0.038363806903362274, + "learning_rate": 0.0002351248826235324, + "loss": 0.0056, + "num_input_tokens_seen": 187827888, + "step": 87040 + }, + { + "epoch": 14.199836867862969, + "grad_norm": 0.009366153739392757, + "learning_rate": 0.00023506451387820588, + "loss": 0.1512, + "num_input_tokens_seen": 187838736, + "step": 87045 + }, + { + "epoch": 14.200652528548124, + "grad_norm": 0.0006336293299682438, + "learning_rate": 0.0002350041505021327, + "loss": 0.0018, + "num_input_tokens_seen": 187848560, + "step": 87050 + }, + { + "epoch": 14.201468189233278, + "grad_norm": 0.00547432666644454, + "learning_rate": 0.00023494379249653675, + "loss": 0.0016, + "num_input_tokens_seen": 187859472, + "step": 87055 + }, + { + "epoch": 14.202283849918434, + "grad_norm": 0.00029613234801217914, + "learning_rate": 0.0002348834398626411, + "loss": 0.0137, + "num_input_tokens_seen": 187869840, + "step": 87060 + }, + { + "epoch": 14.20309951060359, + "grad_norm": 0.004337287042289972, + "learning_rate": 0.0002348230926016689, + "loss": 0.002, + "num_input_tokens_seen": 187880368, + "step": 87065 + }, + { + "epoch": 14.203915171288743, + "grad_norm": 0.13108645379543304, + "learning_rate": 0.00023476275071484309, + "loss": 0.0094, + "num_input_tokens_seen": 187892368, + "step": 87070 + }, + { + "epoch": 14.2047308319739, + "grad_norm": 0.000945467094425112, + "learning_rate": 0.0002347024142033866, + "loss": 0.0267, + "num_input_tokens_seen": 187903344, + "step": 87075 + }, + { + "epoch": 14.205546492659053, + "grad_norm": 0.0011831428855657578, + "learning_rate": 0.0002346420830685223, + "loss": 0.0153, + "num_input_tokens_seen": 187914672, + "step": 87080 + }, + { + "epoch": 14.206362153344209, + "grad_norm": 0.008464631624519825, + "learning_rate": 0.0002345817573114728, + "loss": 0.0017, + "num_input_tokens_seen": 187925424, + "step": 87085 + }, + { + "epoch": 14.207177814029365, + "grad_norm": 0.0034800039138644934, + "learning_rate": 0.00023452143693346067, + "loss": 0.0009, + "num_input_tokens_seen": 187936400, + "step": 87090 + }, + { + "epoch": 14.207993474714518, + "grad_norm": 0.025463445112109184, + "learning_rate": 0.0002344611219357084, + "loss": 0.0086, + "num_input_tokens_seen": 187945904, + "step": 87095 + }, + { + "epoch": 14.208809135399674, + "grad_norm": 0.0027954517863690853, + "learning_rate": 0.0002344008123194384, + "loss": 0.041, + "num_input_tokens_seen": 187956880, + "step": 87100 + }, + { + "epoch": 14.209624796084828, + "grad_norm": 0.013976474292576313, + "learning_rate": 0.0002343405080858728, + "loss": 0.0017, + "num_input_tokens_seen": 187967792, + "step": 87105 + }, + { + "epoch": 14.210440456769984, + "grad_norm": 0.004476571921259165, + "learning_rate": 0.00023428020923623382, + "loss": 0.0015, + "num_input_tokens_seen": 187979248, + "step": 87110 + }, + { + "epoch": 14.21125611745514, + "grad_norm": 0.0012334961211308837, + "learning_rate": 0.0002342199157717434, + "loss": 0.0008, + "num_input_tokens_seen": 187990480, + "step": 87115 + }, + { + "epoch": 14.212071778140293, + "grad_norm": 0.17892934381961823, + "learning_rate": 0.00023415962769362386, + "loss": 0.0154, + "num_input_tokens_seen": 188000752, + "step": 87120 + }, + { + "epoch": 14.21288743882545, + "grad_norm": 0.0006404994637705386, + "learning_rate": 0.00023409934500309633, + "loss": 0.0011, + "num_input_tokens_seen": 188010992, + "step": 87125 + }, + { + "epoch": 14.213703099510603, + "grad_norm": 0.011221496388316154, + "learning_rate": 0.00023403906770138328, + "loss": 0.0192, + "num_input_tokens_seen": 188022448, + "step": 87130 + }, + { + "epoch": 14.214518760195759, + "grad_norm": 0.005451492499560118, + "learning_rate": 0.00023397879578970554, + "loss": 0.0013, + "num_input_tokens_seen": 188031728, + "step": 87135 + }, + { + "epoch": 14.215334420880913, + "grad_norm": 0.0014786915853619576, + "learning_rate": 0.00023391852926928536, + "loss": 0.0009, + "num_input_tokens_seen": 188041808, + "step": 87140 + }, + { + "epoch": 14.216150081566068, + "grad_norm": 0.002362848725169897, + "learning_rate": 0.0002338582681413433, + "loss": 0.0017, + "num_input_tokens_seen": 188052496, + "step": 87145 + }, + { + "epoch": 14.216965742251224, + "grad_norm": 0.005015270784497261, + "learning_rate": 0.0002337980124071015, + "loss": 0.0038, + "num_input_tokens_seen": 188064752, + "step": 87150 + }, + { + "epoch": 14.217781402936378, + "grad_norm": 0.012547300197184086, + "learning_rate": 0.0002337377620677803, + "loss": 0.0021, + "num_input_tokens_seen": 188075952, + "step": 87155 + }, + { + "epoch": 14.218597063621534, + "grad_norm": 0.04638660326600075, + "learning_rate": 0.00023367751712460134, + "loss": 0.0014, + "num_input_tokens_seen": 188087024, + "step": 87160 + }, + { + "epoch": 14.219412724306688, + "grad_norm": 0.0001594497443875298, + "learning_rate": 0.00023361727757878527, + "loss": 0.0009, + "num_input_tokens_seen": 188095568, + "step": 87165 + }, + { + "epoch": 14.220228384991843, + "grad_norm": 0.0010166977299377322, + "learning_rate": 0.00023355704343155305, + "loss": 0.0069, + "num_input_tokens_seen": 188106864, + "step": 87170 + }, + { + "epoch": 14.221044045676999, + "grad_norm": 0.00445820577442646, + "learning_rate": 0.00023349681468412537, + "loss": 0.0052, + "num_input_tokens_seen": 188117712, + "step": 87175 + }, + { + "epoch": 14.221859706362153, + "grad_norm": 0.9753584861755371, + "learning_rate": 0.00023343659133772277, + "loss": 0.058, + "num_input_tokens_seen": 188129136, + "step": 87180 + }, + { + "epoch": 14.222675367047309, + "grad_norm": 0.04452339559793472, + "learning_rate": 0.0002333763733935659, + "loss": 0.0024, + "num_input_tokens_seen": 188139696, + "step": 87185 + }, + { + "epoch": 14.223491027732463, + "grad_norm": 0.0005485396832227707, + "learning_rate": 0.00023331616085287492, + "loss": 0.0007, + "num_input_tokens_seen": 188150768, + "step": 87190 + }, + { + "epoch": 14.224306688417618, + "grad_norm": 0.0514199398458004, + "learning_rate": 0.00023325595371687037, + "loss": 0.0019, + "num_input_tokens_seen": 188162768, + "step": 87195 + }, + { + "epoch": 14.225122349102774, + "grad_norm": 0.007329524494707584, + "learning_rate": 0.00023319575198677223, + "loss": 0.0018, + "num_input_tokens_seen": 188173520, + "step": 87200 + }, + { + "epoch": 14.225938009787928, + "grad_norm": 0.0008944624569267035, + "learning_rate": 0.00023313555566380068, + "loss": 0.0006, + "num_input_tokens_seen": 188184976, + "step": 87205 + }, + { + "epoch": 14.226753670473084, + "grad_norm": 0.010980749502778053, + "learning_rate": 0.00023307536474917567, + "loss": 0.0088, + "num_input_tokens_seen": 188196208, + "step": 87210 + }, + { + "epoch": 14.227569331158238, + "grad_norm": 0.054462965577840805, + "learning_rate": 0.00023301517924411696, + "loss": 0.0146, + "num_input_tokens_seen": 188206992, + "step": 87215 + }, + { + "epoch": 14.228384991843393, + "grad_norm": 0.002519140485674143, + "learning_rate": 0.00023295499914984436, + "loss": 0.0032, + "num_input_tokens_seen": 188219024, + "step": 87220 + }, + { + "epoch": 14.229200652528547, + "grad_norm": 0.014224231243133545, + "learning_rate": 0.00023289482446757747, + "loss": 0.0021, + "num_input_tokens_seen": 188230288, + "step": 87225 + }, + { + "epoch": 14.230016313213703, + "grad_norm": 0.0005701878108084202, + "learning_rate": 0.0002328346551985358, + "loss": 0.0017, + "num_input_tokens_seen": 188241008, + "step": 87230 + }, + { + "epoch": 14.230831973898859, + "grad_norm": 0.002525158692151308, + "learning_rate": 0.00023277449134393875, + "loss": 0.0012, + "num_input_tokens_seen": 188252304, + "step": 87235 + }, + { + "epoch": 14.231647634584013, + "grad_norm": 0.030390407890081406, + "learning_rate": 0.00023271433290500567, + "loss": 0.0026, + "num_input_tokens_seen": 188263696, + "step": 87240 + }, + { + "epoch": 14.232463295269168, + "grad_norm": 0.0016091355355456471, + "learning_rate": 0.00023265417988295567, + "loss": 0.008, + "num_input_tokens_seen": 188273200, + "step": 87245 + }, + { + "epoch": 14.233278955954322, + "grad_norm": 0.05450786277651787, + "learning_rate": 0.0002325940322790079, + "loss": 0.002, + "num_input_tokens_seen": 188284144, + "step": 87250 + }, + { + "epoch": 14.234094616639478, + "grad_norm": 0.007341490592807531, + "learning_rate": 0.0002325338900943813, + "loss": 0.0093, + "num_input_tokens_seen": 188294480, + "step": 87255 + }, + { + "epoch": 14.234910277324634, + "grad_norm": 0.0007293278467841446, + "learning_rate": 0.00023247375333029452, + "loss": 0.0013, + "num_input_tokens_seen": 188305008, + "step": 87260 + }, + { + "epoch": 14.235725938009788, + "grad_norm": 0.03601422533392906, + "learning_rate": 0.00023241362198796666, + "loss": 0.0141, + "num_input_tokens_seen": 188315824, + "step": 87265 + }, + { + "epoch": 14.236541598694943, + "grad_norm": 0.03650297224521637, + "learning_rate": 0.00023235349606861628, + "loss": 0.0042, + "num_input_tokens_seen": 188327920, + "step": 87270 + }, + { + "epoch": 14.237357259380097, + "grad_norm": 0.06695828586816788, + "learning_rate": 0.00023229337557346174, + "loss": 0.0041, + "num_input_tokens_seen": 188339760, + "step": 87275 + }, + { + "epoch": 14.238172920065253, + "grad_norm": 0.00384063390083611, + "learning_rate": 0.00023223326050372163, + "loss": 0.0013, + "num_input_tokens_seen": 188351952, + "step": 87280 + }, + { + "epoch": 14.238988580750409, + "grad_norm": 0.40686920285224915, + "learning_rate": 0.0002321731508606142, + "loss": 0.0909, + "num_input_tokens_seen": 188363024, + "step": 87285 + }, + { + "epoch": 14.239804241435563, + "grad_norm": 0.00033827233710326254, + "learning_rate": 0.0002321130466453576, + "loss": 0.0034, + "num_input_tokens_seen": 188374832, + "step": 87290 + }, + { + "epoch": 14.240619902120718, + "grad_norm": 0.027995051816105843, + "learning_rate": 0.0002320529478591699, + "loss": 0.057, + "num_input_tokens_seen": 188386224, + "step": 87295 + }, + { + "epoch": 14.241435562805872, + "grad_norm": 0.0494736023247242, + "learning_rate": 0.00023199285450326918, + "loss": 0.3318, + "num_input_tokens_seen": 188397456, + "step": 87300 + }, + { + "epoch": 14.242251223491028, + "grad_norm": 0.03154423087835312, + "learning_rate": 0.00023193276657887326, + "loss": 0.0011, + "num_input_tokens_seen": 188407408, + "step": 87305 + }, + { + "epoch": 14.243066884176184, + "grad_norm": 0.0025930420961230993, + "learning_rate": 0.00023187268408719986, + "loss": 0.0011, + "num_input_tokens_seen": 188417680, + "step": 87310 + }, + { + "epoch": 14.243882544861338, + "grad_norm": 0.0006991227273829281, + "learning_rate": 0.00023181260702946673, + "loss": 0.0191, + "num_input_tokens_seen": 188427504, + "step": 87315 + }, + { + "epoch": 14.244698205546493, + "grad_norm": 0.0020324690267443657, + "learning_rate": 0.00023175253540689124, + "loss": 0.0018, + "num_input_tokens_seen": 188438352, + "step": 87320 + }, + { + "epoch": 14.245513866231647, + "grad_norm": 0.0022498297039419413, + "learning_rate": 0.00023169246922069098, + "loss": 0.0014, + "num_input_tokens_seen": 188448208, + "step": 87325 + }, + { + "epoch": 14.246329526916803, + "grad_norm": 0.036680918186903, + "learning_rate": 0.00023163240847208318, + "loss": 0.0255, + "num_input_tokens_seen": 188459504, + "step": 87330 + }, + { + "epoch": 14.247145187601957, + "grad_norm": 0.002598593942821026, + "learning_rate": 0.0002315723531622851, + "loss": 0.0021, + "num_input_tokens_seen": 188469744, + "step": 87335 + }, + { + "epoch": 14.247960848287113, + "grad_norm": 0.023958008736371994, + "learning_rate": 0.00023151230329251376, + "loss": 0.0026, + "num_input_tokens_seen": 188480752, + "step": 87340 + }, + { + "epoch": 14.248776508972268, + "grad_norm": 0.5930587649345398, + "learning_rate": 0.00023145225886398617, + "loss": 0.1216, + "num_input_tokens_seen": 188491248, + "step": 87345 + }, + { + "epoch": 14.249592169657422, + "grad_norm": 0.012278588488698006, + "learning_rate": 0.0002313922198779193, + "loss": 0.0065, + "num_input_tokens_seen": 188502448, + "step": 87350 + }, + { + "epoch": 14.250407830342578, + "grad_norm": 0.0011242394102737308, + "learning_rate": 0.00023133218633552982, + "loss": 0.0019, + "num_input_tokens_seen": 188515024, + "step": 87355 + }, + { + "epoch": 14.251223491027732, + "grad_norm": 0.0008853948675096035, + "learning_rate": 0.00023127215823803444, + "loss": 0.0016, + "num_input_tokens_seen": 188525168, + "step": 87360 + }, + { + "epoch": 14.252039151712887, + "grad_norm": 0.00048213364789262414, + "learning_rate": 0.00023121213558664966, + "loss": 0.0026, + "num_input_tokens_seen": 188536592, + "step": 87365 + }, + { + "epoch": 14.252854812398043, + "grad_norm": 0.05557115375995636, + "learning_rate": 0.00023115211838259175, + "loss": 0.009, + "num_input_tokens_seen": 188547440, + "step": 87370 + }, + { + "epoch": 14.253670473083197, + "grad_norm": 0.004576754290610552, + "learning_rate": 0.00023109210662707757, + "loss": 0.0077, + "num_input_tokens_seen": 188557424, + "step": 87375 + }, + { + "epoch": 14.254486133768353, + "grad_norm": 0.002228276338428259, + "learning_rate": 0.00023103210032132267, + "loss": 0.0004, + "num_input_tokens_seen": 188567408, + "step": 87380 + }, + { + "epoch": 14.255301794453507, + "grad_norm": 0.07073167711496353, + "learning_rate": 0.0002309720994665438, + "loss": 0.0191, + "num_input_tokens_seen": 188578288, + "step": 87385 + }, + { + "epoch": 14.256117455138662, + "grad_norm": 0.0004489337152335793, + "learning_rate": 0.00023091210406395624, + "loss": 0.002, + "num_input_tokens_seen": 188588464, + "step": 87390 + }, + { + "epoch": 14.256933115823816, + "grad_norm": 0.0038700576405972242, + "learning_rate": 0.00023085211411477663, + "loss": 0.0794, + "num_input_tokens_seen": 188598736, + "step": 87395 + }, + { + "epoch": 14.257748776508972, + "grad_norm": 0.0013947532279416919, + "learning_rate": 0.00023079212962022, + "loss": 0.0037, + "num_input_tokens_seen": 188609040, + "step": 87400 + }, + { + "epoch": 14.258564437194128, + "grad_norm": 0.002498795511201024, + "learning_rate": 0.00023073215058150255, + "loss": 0.0182, + "num_input_tokens_seen": 188620848, + "step": 87405 + }, + { + "epoch": 14.259380097879282, + "grad_norm": 0.0031350580975413322, + "learning_rate": 0.00023067217699983966, + "loss": 0.0005, + "num_input_tokens_seen": 188632112, + "step": 87410 + }, + { + "epoch": 14.260195758564437, + "grad_norm": 0.004708592779934406, + "learning_rate": 0.00023061220887644679, + "loss": 0.0008, + "num_input_tokens_seen": 188641008, + "step": 87415 + }, + { + "epoch": 14.261011419249591, + "grad_norm": 0.006404914427548647, + "learning_rate": 0.00023055224621253923, + "loss": 0.0007, + "num_input_tokens_seen": 188651792, + "step": 87420 + }, + { + "epoch": 14.261827079934747, + "grad_norm": 0.03134569153189659, + "learning_rate": 0.00023049228900933223, + "loss": 0.0016, + "num_input_tokens_seen": 188662768, + "step": 87425 + }, + { + "epoch": 14.262642740619903, + "grad_norm": 0.0026478846557438374, + "learning_rate": 0.00023043233726804087, + "loss": 0.0045, + "num_input_tokens_seen": 188673456, + "step": 87430 + }, + { + "epoch": 14.263458401305057, + "grad_norm": 0.0019294237717986107, + "learning_rate": 0.00023037239098988016, + "loss": 0.0012, + "num_input_tokens_seen": 188684560, + "step": 87435 + }, + { + "epoch": 14.264274061990212, + "grad_norm": 0.004820317029953003, + "learning_rate": 0.00023031245017606506, + "loss": 0.0007, + "num_input_tokens_seen": 188694768, + "step": 87440 + }, + { + "epoch": 14.265089722675366, + "grad_norm": 0.0020945873111486435, + "learning_rate": 0.00023025251482781023, + "loss": 0.0017, + "num_input_tokens_seen": 188703856, + "step": 87445 + }, + { + "epoch": 14.265905383360522, + "grad_norm": 0.0581456683576107, + "learning_rate": 0.00023019258494633038, + "loss": 0.0033, + "num_input_tokens_seen": 188714800, + "step": 87450 + }, + { + "epoch": 14.266721044045678, + "grad_norm": 0.002909975592046976, + "learning_rate": 0.0002301326605328401, + "loss": 0.003, + "num_input_tokens_seen": 188724848, + "step": 87455 + }, + { + "epoch": 14.267536704730832, + "grad_norm": 0.007044430822134018, + "learning_rate": 0.00023007274158855378, + "loss": 0.0128, + "num_input_tokens_seen": 188735344, + "step": 87460 + }, + { + "epoch": 14.268352365415987, + "grad_norm": 0.0011626335326582193, + "learning_rate": 0.00023001282811468577, + "loss": 0.0009, + "num_input_tokens_seen": 188748208, + "step": 87465 + }, + { + "epoch": 14.269168026101141, + "grad_norm": 0.00764869712293148, + "learning_rate": 0.00022995292011245033, + "loss": 0.0808, + "num_input_tokens_seen": 188758576, + "step": 87470 + }, + { + "epoch": 14.269983686786297, + "grad_norm": 0.33236753940582275, + "learning_rate": 0.00022989301758306153, + "loss": 0.0166, + "num_input_tokens_seen": 188769616, + "step": 87475 + }, + { + "epoch": 14.270799347471453, + "grad_norm": 0.010402482934296131, + "learning_rate": 0.00022983312052773336, + "loss": 0.0841, + "num_input_tokens_seen": 188781008, + "step": 87480 + }, + { + "epoch": 14.271615008156607, + "grad_norm": 0.1266389936208725, + "learning_rate": 0.0002297732289476796, + "loss": 0.0033, + "num_input_tokens_seen": 188792368, + "step": 87485 + }, + { + "epoch": 14.272430668841762, + "grad_norm": 0.016937313601374626, + "learning_rate": 0.0002297133428441145, + "loss": 0.0224, + "num_input_tokens_seen": 188804016, + "step": 87490 + }, + { + "epoch": 14.273246329526916, + "grad_norm": 0.005387944169342518, + "learning_rate": 0.000229653462218251, + "loss": 0.0006, + "num_input_tokens_seen": 188815280, + "step": 87495 + }, + { + "epoch": 14.274061990212072, + "grad_norm": 0.0016155753983184695, + "learning_rate": 0.00022959358707130346, + "loss": 0.0086, + "num_input_tokens_seen": 188825904, + "step": 87500 + }, + { + "epoch": 14.274877650897226, + "grad_norm": 0.008116827346384525, + "learning_rate": 0.00022953371740448453, + "loss": 0.1467, + "num_input_tokens_seen": 188837200, + "step": 87505 + }, + { + "epoch": 14.275693311582382, + "grad_norm": 0.0014055295614525676, + "learning_rate": 0.00022947385321900825, + "loss": 0.0144, + "num_input_tokens_seen": 188846832, + "step": 87510 + }, + { + "epoch": 14.276508972267537, + "grad_norm": 0.016630789265036583, + "learning_rate": 0.00022941399451608725, + "loss": 0.0246, + "num_input_tokens_seen": 188857936, + "step": 87515 + }, + { + "epoch": 14.277324632952691, + "grad_norm": 0.23848006129264832, + "learning_rate": 0.00022935414129693523, + "loss": 0.0136, + "num_input_tokens_seen": 188869296, + "step": 87520 + }, + { + "epoch": 14.278140293637847, + "grad_norm": 0.015504665672779083, + "learning_rate": 0.0002292942935627645, + "loss": 0.0188, + "num_input_tokens_seen": 188880784, + "step": 87525 + }, + { + "epoch": 14.278955954323001, + "grad_norm": 0.00411229720339179, + "learning_rate": 0.00022923445131478866, + "loss": 0.0412, + "num_input_tokens_seen": 188891984, + "step": 87530 + }, + { + "epoch": 14.279771615008157, + "grad_norm": 0.03392595797777176, + "learning_rate": 0.00022917461455421984, + "loss": 0.003, + "num_input_tokens_seen": 188904208, + "step": 87535 + }, + { + "epoch": 14.280587275693312, + "grad_norm": 0.021041087806224823, + "learning_rate": 0.00022911478328227136, + "loss": 0.0054, + "num_input_tokens_seen": 188915792, + "step": 87540 + }, + { + "epoch": 14.281402936378466, + "grad_norm": 0.003208666341379285, + "learning_rate": 0.00022905495750015508, + "loss": 0.0065, + "num_input_tokens_seen": 188926320, + "step": 87545 + }, + { + "epoch": 14.282218597063622, + "grad_norm": 0.22430674731731415, + "learning_rate": 0.000228995137209084, + "loss": 0.0072, + "num_input_tokens_seen": 188938576, + "step": 87550 + }, + { + "epoch": 14.283034257748776, + "grad_norm": 0.3514708876609802, + "learning_rate": 0.00022893532241027026, + "loss": 0.0499, + "num_input_tokens_seen": 188949744, + "step": 87555 + }, + { + "epoch": 14.283849918433932, + "grad_norm": 0.01994827575981617, + "learning_rate": 0.00022887551310492605, + "loss": 0.0247, + "num_input_tokens_seen": 188959440, + "step": 87560 + }, + { + "epoch": 14.284665579119087, + "grad_norm": 0.16741904616355896, + "learning_rate": 0.00022881570929426354, + "loss": 0.0115, + "num_input_tokens_seen": 188971440, + "step": 87565 + }, + { + "epoch": 14.285481239804241, + "grad_norm": 0.031145406886935234, + "learning_rate": 0.00022875591097949472, + "loss": 0.0142, + "num_input_tokens_seen": 188982960, + "step": 87570 + }, + { + "epoch": 14.286296900489397, + "grad_norm": 0.029239870607852936, + "learning_rate": 0.00022869611816183144, + "loss": 0.0016, + "num_input_tokens_seen": 188993072, + "step": 87575 + }, + { + "epoch": 14.28711256117455, + "grad_norm": 0.018053926527500153, + "learning_rate": 0.00022863633084248549, + "loss": 0.0152, + "num_input_tokens_seen": 189003920, + "step": 87580 + }, + { + "epoch": 14.287928221859707, + "grad_norm": 0.020224913954734802, + "learning_rate": 0.00022857654902266856, + "loss": 0.0107, + "num_input_tokens_seen": 189014640, + "step": 87585 + }, + { + "epoch": 14.28874388254486, + "grad_norm": 0.21574674546718597, + "learning_rate": 0.00022851677270359217, + "loss": 0.0145, + "num_input_tokens_seen": 189024240, + "step": 87590 + }, + { + "epoch": 14.289559543230016, + "grad_norm": 0.01696234941482544, + "learning_rate": 0.0002284570018864678, + "loss": 0.0021, + "num_input_tokens_seen": 189035216, + "step": 87595 + }, + { + "epoch": 14.290375203915172, + "grad_norm": 0.02322370372712612, + "learning_rate": 0.0002283972365725066, + "loss": 0.0109, + "num_input_tokens_seen": 189045968, + "step": 87600 + }, + { + "epoch": 14.291190864600326, + "grad_norm": 0.33442223072052, + "learning_rate": 0.00022833747676292027, + "loss": 0.0132, + "num_input_tokens_seen": 189056720, + "step": 87605 + }, + { + "epoch": 14.292006525285482, + "grad_norm": 0.011306601576507092, + "learning_rate": 0.00022827772245891925, + "loss": 0.0049, + "num_input_tokens_seen": 189066672, + "step": 87610 + }, + { + "epoch": 14.292822185970635, + "grad_norm": 0.11083826422691345, + "learning_rate": 0.00022821797366171531, + "loss": 0.1783, + "num_input_tokens_seen": 189078096, + "step": 87615 + }, + { + "epoch": 14.293637846655791, + "grad_norm": 0.040654830634593964, + "learning_rate": 0.00022815823037251849, + "loss": 0.0296, + "num_input_tokens_seen": 189088880, + "step": 87620 + }, + { + "epoch": 14.294453507340947, + "grad_norm": 0.18488168716430664, + "learning_rate": 0.00022809849259254034, + "loss": 0.0152, + "num_input_tokens_seen": 189099152, + "step": 87625 + }, + { + "epoch": 14.2952691680261, + "grad_norm": 0.010465022176504135, + "learning_rate": 0.00022803876032299086, + "loss": 0.0145, + "num_input_tokens_seen": 189110864, + "step": 87630 + }, + { + "epoch": 14.296084828711257, + "grad_norm": 0.000913115160074085, + "learning_rate": 0.00022797903356508125, + "loss": 0.0083, + "num_input_tokens_seen": 189122736, + "step": 87635 + }, + { + "epoch": 14.29690048939641, + "grad_norm": 0.887035608291626, + "learning_rate": 0.00022791931232002123, + "loss": 0.011, + "num_input_tokens_seen": 189133648, + "step": 87640 + }, + { + "epoch": 14.297716150081566, + "grad_norm": 0.13027244806289673, + "learning_rate": 0.00022785959658902188, + "loss": 0.0113, + "num_input_tokens_seen": 189144080, + "step": 87645 + }, + { + "epoch": 14.298531810766722, + "grad_norm": 0.005771315656602383, + "learning_rate": 0.00022779988637329263, + "loss": 0.0461, + "num_input_tokens_seen": 189155056, + "step": 87650 + }, + { + "epoch": 14.299347471451876, + "grad_norm": 0.0012138456804677844, + "learning_rate": 0.00022774018167404442, + "loss": 0.0009, + "num_input_tokens_seen": 189166288, + "step": 87655 + }, + { + "epoch": 14.300163132137031, + "grad_norm": 0.0035051237791776657, + "learning_rate": 0.00022768048249248646, + "loss": 0.001, + "num_input_tokens_seen": 189177040, + "step": 87660 + }, + { + "epoch": 14.300978792822185, + "grad_norm": 0.04563437029719353, + "learning_rate": 0.00022762078882982928, + "loss": 0.0029, + "num_input_tokens_seen": 189188080, + "step": 87665 + }, + { + "epoch": 14.301794453507341, + "grad_norm": 0.05453868582844734, + "learning_rate": 0.00022756110068728204, + "loss": 0.003, + "num_input_tokens_seen": 189199600, + "step": 87670 + }, + { + "epoch": 14.302610114192497, + "grad_norm": 0.031622983515262604, + "learning_rate": 0.00022750141806605507, + "loss": 0.0034, + "num_input_tokens_seen": 189208912, + "step": 87675 + }, + { + "epoch": 14.30342577487765, + "grad_norm": 0.020571986213326454, + "learning_rate": 0.00022744174096735715, + "loss": 0.0028, + "num_input_tokens_seen": 189219888, + "step": 87680 + }, + { + "epoch": 14.304241435562806, + "grad_norm": 0.3790666460990906, + "learning_rate": 0.00022738206939239852, + "loss": 0.0251, + "num_input_tokens_seen": 189229776, + "step": 87685 + }, + { + "epoch": 14.30505709624796, + "grad_norm": 1.2150676250457764, + "learning_rate": 0.0002273224033423877, + "loss": 0.058, + "num_input_tokens_seen": 189241040, + "step": 87690 + }, + { + "epoch": 14.305872756933116, + "grad_norm": 0.03313300386071205, + "learning_rate": 0.0002272627428185345, + "loss": 0.0025, + "num_input_tokens_seen": 189251504, + "step": 87695 + }, + { + "epoch": 14.30668841761827, + "grad_norm": 0.0801723524928093, + "learning_rate": 0.0002272030878220478, + "loss": 0.0048, + "num_input_tokens_seen": 189261808, + "step": 87700 + }, + { + "epoch": 14.307504078303426, + "grad_norm": 0.45486554503440857, + "learning_rate": 0.0002271434383541366, + "loss": 0.0228, + "num_input_tokens_seen": 189273264, + "step": 87705 + }, + { + "epoch": 14.308319738988581, + "grad_norm": 0.21809223294258118, + "learning_rate": 0.00022708379441600975, + "loss": 0.0164, + "num_input_tokens_seen": 189285360, + "step": 87710 + }, + { + "epoch": 14.309135399673735, + "grad_norm": 0.006637732032686472, + "learning_rate": 0.000227024156008876, + "loss": 0.0015, + "num_input_tokens_seen": 189297040, + "step": 87715 + }, + { + "epoch": 14.309951060358891, + "grad_norm": 0.0003943484916817397, + "learning_rate": 0.00022696452313394406, + "loss": 0.0006, + "num_input_tokens_seen": 189307664, + "step": 87720 + }, + { + "epoch": 14.310766721044045, + "grad_norm": 0.13997536897659302, + "learning_rate": 0.0002269048957924224, + "loss": 0.0061, + "num_input_tokens_seen": 189317296, + "step": 87725 + }, + { + "epoch": 14.3115823817292, + "grad_norm": 0.018071215599775314, + "learning_rate": 0.0002268452739855195, + "loss": 0.0342, + "num_input_tokens_seen": 189327408, + "step": 87730 + }, + { + "epoch": 14.312398042414356, + "grad_norm": 0.02813112922012806, + "learning_rate": 0.00022678565771444364, + "loss": 0.0045, + "num_input_tokens_seen": 189336656, + "step": 87735 + }, + { + "epoch": 14.31321370309951, + "grad_norm": 0.00038922394742257893, + "learning_rate": 0.00022672604698040306, + "loss": 0.0009, + "num_input_tokens_seen": 189346992, + "step": 87740 + }, + { + "epoch": 14.314029363784666, + "grad_norm": 0.0385550819337368, + "learning_rate": 0.00022666644178460555, + "loss": 0.0023, + "num_input_tokens_seen": 189356240, + "step": 87745 + }, + { + "epoch": 14.31484502446982, + "grad_norm": 0.02891366556286812, + "learning_rate": 0.00022660684212825978, + "loss": 0.0029, + "num_input_tokens_seen": 189366192, + "step": 87750 + }, + { + "epoch": 14.315660685154976, + "grad_norm": 0.04787098616361618, + "learning_rate": 0.00022654724801257276, + "loss": 0.0151, + "num_input_tokens_seen": 189376496, + "step": 87755 + }, + { + "epoch": 14.31647634584013, + "grad_norm": 0.0005688476958312094, + "learning_rate": 0.00022648765943875305, + "loss": 0.0011, + "num_input_tokens_seen": 189388016, + "step": 87760 + }, + { + "epoch": 14.317292006525285, + "grad_norm": 0.0004448130202945322, + "learning_rate": 0.00022642807640800756, + "loss": 0.0011, + "num_input_tokens_seen": 189399472, + "step": 87765 + }, + { + "epoch": 14.318107667210441, + "grad_norm": 0.0269921962171793, + "learning_rate": 0.0002263684989215445, + "loss": 0.0012, + "num_input_tokens_seen": 189410640, + "step": 87770 + }, + { + "epoch": 14.318923327895595, + "grad_norm": 0.003570235101506114, + "learning_rate": 0.00022630892698057055, + "loss": 0.0051, + "num_input_tokens_seen": 189422320, + "step": 87775 + }, + { + "epoch": 14.31973898858075, + "grad_norm": 0.0008455334464088082, + "learning_rate": 0.00022624936058629374, + "loss": 0.0015, + "num_input_tokens_seen": 189432336, + "step": 87780 + }, + { + "epoch": 14.320554649265905, + "grad_norm": 0.0001777293800842017, + "learning_rate": 0.00022618979973992054, + "loss": 0.0013, + "num_input_tokens_seen": 189443792, + "step": 87785 + }, + { + "epoch": 14.32137030995106, + "grad_norm": 0.00039466944872401655, + "learning_rate": 0.00022613024444265883, + "loss": 0.0065, + "num_input_tokens_seen": 189453904, + "step": 87790 + }, + { + "epoch": 14.322185970636216, + "grad_norm": 0.000194728490896523, + "learning_rate": 0.00022607069469571473, + "loss": 0.0012, + "num_input_tokens_seen": 189462992, + "step": 87795 + }, + { + "epoch": 14.32300163132137, + "grad_norm": 0.0022014814894646406, + "learning_rate": 0.00022601115050029574, + "loss": 0.0019, + "num_input_tokens_seen": 189473584, + "step": 87800 + }, + { + "epoch": 14.323817292006526, + "grad_norm": 0.002210188889876008, + "learning_rate": 0.0002259516118576083, + "loss": 0.0159, + "num_input_tokens_seen": 189484464, + "step": 87805 + }, + { + "epoch": 14.32463295269168, + "grad_norm": 0.0046628862619400024, + "learning_rate": 0.00022589207876885914, + "loss": 0.001, + "num_input_tokens_seen": 189495280, + "step": 87810 + }, + { + "epoch": 14.325448613376835, + "grad_norm": 0.0005628266371786594, + "learning_rate": 0.00022583255123525476, + "loss": 0.0151, + "num_input_tokens_seen": 189506992, + "step": 87815 + }, + { + "epoch": 14.326264274061991, + "grad_norm": 0.04859665408730507, + "learning_rate": 0.00022577302925800153, + "loss": 0.0031, + "num_input_tokens_seen": 189517808, + "step": 87820 + }, + { + "epoch": 14.327079934747145, + "grad_norm": 0.0015134724089875817, + "learning_rate": 0.0002257135128383057, + "loss": 0.0016, + "num_input_tokens_seen": 189528688, + "step": 87825 + }, + { + "epoch": 14.3278955954323, + "grad_norm": 0.0010153896873816848, + "learning_rate": 0.00022565400197737352, + "loss": 0.0005, + "num_input_tokens_seen": 189539600, + "step": 87830 + }, + { + "epoch": 14.328711256117455, + "grad_norm": 0.05434511974453926, + "learning_rate": 0.000225594496676411, + "loss": 0.0027, + "num_input_tokens_seen": 189551152, + "step": 87835 + }, + { + "epoch": 14.32952691680261, + "grad_norm": 0.028447221964597702, + "learning_rate": 0.0002255349969366241, + "loss": 0.0134, + "num_input_tokens_seen": 189562672, + "step": 87840 + }, + { + "epoch": 14.330342577487766, + "grad_norm": 0.003119502682238817, + "learning_rate": 0.0002254755027592187, + "loss": 0.001, + "num_input_tokens_seen": 189574320, + "step": 87845 + }, + { + "epoch": 14.33115823817292, + "grad_norm": 0.008021277375519276, + "learning_rate": 0.00022541601414540052, + "loss": 0.0018, + "num_input_tokens_seen": 189586352, + "step": 87850 + }, + { + "epoch": 14.331973898858076, + "grad_norm": 0.001324329525232315, + "learning_rate": 0.00022535653109637512, + "loss": 0.0404, + "num_input_tokens_seen": 189598704, + "step": 87855 + }, + { + "epoch": 14.33278955954323, + "grad_norm": 0.0038747156504541636, + "learning_rate": 0.000225297053613348, + "loss": 0.0072, + "num_input_tokens_seen": 189608784, + "step": 87860 + }, + { + "epoch": 14.333605220228385, + "grad_norm": 0.34431999921798706, + "learning_rate": 0.0002252375816975246, + "loss": 0.0061, + "num_input_tokens_seen": 189620240, + "step": 87865 + }, + { + "epoch": 14.33442088091354, + "grad_norm": 0.0014390117721632123, + "learning_rate": 0.0002251781153501102, + "loss": 0.0034, + "num_input_tokens_seen": 189630640, + "step": 87870 + }, + { + "epoch": 14.335236541598695, + "grad_norm": 0.0045255436562001705, + "learning_rate": 0.0002251186545723099, + "loss": 0.0072, + "num_input_tokens_seen": 189640880, + "step": 87875 + }, + { + "epoch": 14.33605220228385, + "grad_norm": 0.0007916373433545232, + "learning_rate": 0.00022505919936532877, + "loss": 0.0036, + "num_input_tokens_seen": 189651632, + "step": 87880 + }, + { + "epoch": 14.336867862969005, + "grad_norm": 0.0036209370009601116, + "learning_rate": 0.00022499974973037173, + "loss": 0.0164, + "num_input_tokens_seen": 189661392, + "step": 87885 + }, + { + "epoch": 14.33768352365416, + "grad_norm": 0.11181031167507172, + "learning_rate": 0.0002249403056686435, + "loss": 0.0043, + "num_input_tokens_seen": 189672688, + "step": 87890 + }, + { + "epoch": 14.338499184339314, + "grad_norm": 0.15918225049972534, + "learning_rate": 0.0002248808671813492, + "loss": 0.0084, + "num_input_tokens_seen": 189684080, + "step": 87895 + }, + { + "epoch": 14.33931484502447, + "grad_norm": 0.046551670879125595, + "learning_rate": 0.00022482143426969282, + "loss": 0.0021, + "num_input_tokens_seen": 189695184, + "step": 87900 + }, + { + "epoch": 14.340130505709626, + "grad_norm": 0.0007648098981007934, + "learning_rate": 0.00022476200693487936, + "loss": 0.0013, + "num_input_tokens_seen": 189706640, + "step": 87905 + }, + { + "epoch": 14.34094616639478, + "grad_norm": 0.008273759856820107, + "learning_rate": 0.000224702585178113, + "loss": 0.027, + "num_input_tokens_seen": 189718512, + "step": 87910 + }, + { + "epoch": 14.341761827079935, + "grad_norm": 0.0007036213064566255, + "learning_rate": 0.00022464316900059795, + "loss": 0.0011, + "num_input_tokens_seen": 189728944, + "step": 87915 + }, + { + "epoch": 14.34257748776509, + "grad_norm": 0.09043516218662262, + "learning_rate": 0.0002245837584035384, + "loss": 0.0058, + "num_input_tokens_seen": 189738960, + "step": 87920 + }, + { + "epoch": 14.343393148450245, + "grad_norm": 0.009033621288836002, + "learning_rate": 0.00022452435338813842, + "loss": 0.0044, + "num_input_tokens_seen": 189750224, + "step": 87925 + }, + { + "epoch": 14.3442088091354, + "grad_norm": 0.0004997099167667329, + "learning_rate": 0.00022446495395560186, + "loss": 0.0026, + "num_input_tokens_seen": 189760592, + "step": 87930 + }, + { + "epoch": 14.345024469820554, + "grad_norm": 0.0014209254877641797, + "learning_rate": 0.00022440556010713253, + "loss": 0.0007, + "num_input_tokens_seen": 189771248, + "step": 87935 + }, + { + "epoch": 14.34584013050571, + "grad_norm": 0.0017719449242576957, + "learning_rate": 0.00022434617184393418, + "loss": 0.0008, + "num_input_tokens_seen": 189781456, + "step": 87940 + }, + { + "epoch": 14.346655791190864, + "grad_norm": 0.004572118632495403, + "learning_rate": 0.00022428678916721029, + "loss": 0.01, + "num_input_tokens_seen": 189792048, + "step": 87945 + }, + { + "epoch": 14.34747145187602, + "grad_norm": 0.027955148369073868, + "learning_rate": 0.00022422741207816444, + "loss": 0.0013, + "num_input_tokens_seen": 189802832, + "step": 87950 + }, + { + "epoch": 14.348287112561174, + "grad_norm": 0.02973468415439129, + "learning_rate": 0.00022416804057799988, + "loss": 0.007, + "num_input_tokens_seen": 189814416, + "step": 87955 + }, + { + "epoch": 14.34910277324633, + "grad_norm": 0.0536721870303154, + "learning_rate": 0.00022410867466791996, + "loss": 0.0085, + "num_input_tokens_seen": 189825456, + "step": 87960 + }, + { + "epoch": 14.349918433931485, + "grad_norm": 1.2108180522918701, + "learning_rate": 0.00022404931434912768, + "loss": 0.0274, + "num_input_tokens_seen": 189836944, + "step": 87965 + }, + { + "epoch": 14.350734094616639, + "grad_norm": 0.0014311232371255755, + "learning_rate": 0.00022398995962282615, + "loss": 0.0003, + "num_input_tokens_seen": 189848272, + "step": 87970 + }, + { + "epoch": 14.351549755301795, + "grad_norm": 0.0021250087302178144, + "learning_rate": 0.00022393061049021823, + "loss": 0.003, + "num_input_tokens_seen": 189857776, + "step": 87975 + }, + { + "epoch": 14.352365415986949, + "grad_norm": 0.000431596243288368, + "learning_rate": 0.0002238712669525067, + "loss": 0.0038, + "num_input_tokens_seen": 189866960, + "step": 87980 + }, + { + "epoch": 14.353181076672104, + "grad_norm": 0.03172757476568222, + "learning_rate": 0.0002238119290108942, + "loss": 0.006, + "num_input_tokens_seen": 189876720, + "step": 87985 + }, + { + "epoch": 14.35399673735726, + "grad_norm": 0.10000339150428772, + "learning_rate": 0.00022375259666658338, + "loss": 0.0025, + "num_input_tokens_seen": 189886096, + "step": 87990 + }, + { + "epoch": 14.354812398042414, + "grad_norm": 0.00021124315389897674, + "learning_rate": 0.0002236932699207766, + "loss": 0.0083, + "num_input_tokens_seen": 189898064, + "step": 87995 + }, + { + "epoch": 14.35562805872757, + "grad_norm": 0.020446719601750374, + "learning_rate": 0.00022363394877467625, + "loss": 0.0045, + "num_input_tokens_seen": 189908912, + "step": 88000 + }, + { + "epoch": 14.356443719412724, + "grad_norm": 0.0051913694478571415, + "learning_rate": 0.0002235746332294845, + "loss": 0.0018, + "num_input_tokens_seen": 189919216, + "step": 88005 + }, + { + "epoch": 14.35725938009788, + "grad_norm": 0.23507502675056458, + "learning_rate": 0.00022351532328640335, + "loss": 0.0124, + "num_input_tokens_seen": 189929712, + "step": 88010 + }, + { + "epoch": 14.358075040783035, + "grad_norm": 0.0041600679978728294, + "learning_rate": 0.0002234560189466352, + "loss": 0.0022, + "num_input_tokens_seen": 189939280, + "step": 88015 + }, + { + "epoch": 14.358890701468189, + "grad_norm": 0.004800493363291025, + "learning_rate": 0.00022339672021138136, + "loss": 0.012, + "num_input_tokens_seen": 189949840, + "step": 88020 + }, + { + "epoch": 14.359706362153345, + "grad_norm": 0.011248825117945671, + "learning_rate": 0.00022333742708184417, + "loss": 0.002, + "num_input_tokens_seen": 189960912, + "step": 88025 + }, + { + "epoch": 14.360522022838499, + "grad_norm": 0.0004183803393971175, + "learning_rate": 0.0002232781395592247, + "loss": 0.0025, + "num_input_tokens_seen": 189970736, + "step": 88030 + }, + { + "epoch": 14.361337683523654, + "grad_norm": 0.00016537630290258676, + "learning_rate": 0.00022321885764472495, + "loss": 0.0059, + "num_input_tokens_seen": 189981968, + "step": 88035 + }, + { + "epoch": 14.362153344208808, + "grad_norm": 0.0569070465862751, + "learning_rate": 0.00022315958133954612, + "loss": 0.0182, + "num_input_tokens_seen": 189992464, + "step": 88040 + }, + { + "epoch": 14.362969004893964, + "grad_norm": 0.0003405603929422796, + "learning_rate": 0.00022310031064488962, + "loss": 0.0292, + "num_input_tokens_seen": 190002896, + "step": 88045 + }, + { + "epoch": 14.36378466557912, + "grad_norm": 0.8475379943847656, + "learning_rate": 0.0002230410455619566, + "loss": 0.031, + "num_input_tokens_seen": 190014128, + "step": 88050 + }, + { + "epoch": 14.364600326264274, + "grad_norm": 0.00019385386258363724, + "learning_rate": 0.00022298178609194807, + "loss": 0.0087, + "num_input_tokens_seen": 190024464, + "step": 88055 + }, + { + "epoch": 14.36541598694943, + "grad_norm": 0.012066522613167763, + "learning_rate": 0.00022292253223606513, + "loss": 0.0018, + "num_input_tokens_seen": 190034320, + "step": 88060 + }, + { + "epoch": 14.366231647634583, + "grad_norm": 0.43287569284439087, + "learning_rate": 0.0002228632839955086, + "loss": 0.0136, + "num_input_tokens_seen": 190044464, + "step": 88065 + }, + { + "epoch": 14.367047308319739, + "grad_norm": 0.03576499968767166, + "learning_rate": 0.00022280404137147914, + "loss": 0.0034, + "num_input_tokens_seen": 190056016, + "step": 88070 + }, + { + "epoch": 14.367862969004895, + "grad_norm": 0.00535482307896018, + "learning_rate": 0.00022274480436517742, + "loss": 0.0007, + "num_input_tokens_seen": 190066928, + "step": 88075 + }, + { + "epoch": 14.368678629690049, + "grad_norm": 0.0020151836797595024, + "learning_rate": 0.00022268557297780396, + "loss": 0.0035, + "num_input_tokens_seen": 190078320, + "step": 88080 + }, + { + "epoch": 14.369494290375204, + "grad_norm": 0.23606491088867188, + "learning_rate": 0.00022262634721055918, + "loss": 0.0066, + "num_input_tokens_seen": 190089136, + "step": 88085 + }, + { + "epoch": 14.370309951060358, + "grad_norm": 0.006032920442521572, + "learning_rate": 0.00022256712706464338, + "loss": 0.001, + "num_input_tokens_seen": 190100400, + "step": 88090 + }, + { + "epoch": 14.371125611745514, + "grad_norm": 0.0010120195802301168, + "learning_rate": 0.0002225079125412567, + "loss": 0.1414, + "num_input_tokens_seen": 190111088, + "step": 88095 + }, + { + "epoch": 14.37194127243067, + "grad_norm": 0.019229831174016, + "learning_rate": 0.00022244870364159912, + "loss": 0.0013, + "num_input_tokens_seen": 190121008, + "step": 88100 + }, + { + "epoch": 14.372756933115824, + "grad_norm": 0.0128501420840621, + "learning_rate": 0.00022238950036687071, + "loss": 0.0019, + "num_input_tokens_seen": 190130896, + "step": 88105 + }, + { + "epoch": 14.37357259380098, + "grad_norm": 0.6578302383422852, + "learning_rate": 0.00022233030271827126, + "loss": 0.0149, + "num_input_tokens_seen": 190140624, + "step": 88110 + }, + { + "epoch": 14.374388254486133, + "grad_norm": 0.017622362822294235, + "learning_rate": 0.0002222711106970003, + "loss": 0.0057, + "num_input_tokens_seen": 190150320, + "step": 88115 + }, + { + "epoch": 14.375203915171289, + "grad_norm": 0.2559345066547394, + "learning_rate": 0.0002222119243042579, + "loss": 0.0101, + "num_input_tokens_seen": 190162640, + "step": 88120 + }, + { + "epoch": 14.376019575856443, + "grad_norm": 0.000881955202203244, + "learning_rate": 0.00022215274354124294, + "loss": 0.0005, + "num_input_tokens_seen": 190173744, + "step": 88125 + }, + { + "epoch": 14.376835236541599, + "grad_norm": 0.014644431881606579, + "learning_rate": 0.00022209356840915552, + "loss": 0.0023, + "num_input_tokens_seen": 190183792, + "step": 88130 + }, + { + "epoch": 14.377650897226754, + "grad_norm": 0.002176702953875065, + "learning_rate": 0.00022203439890919403, + "loss": 0.0004, + "num_input_tokens_seen": 190194352, + "step": 88135 + }, + { + "epoch": 14.378466557911908, + "grad_norm": 0.44496509432792664, + "learning_rate": 0.00022197523504255846, + "loss": 0.021, + "num_input_tokens_seen": 190205104, + "step": 88140 + }, + { + "epoch": 14.379282218597064, + "grad_norm": 0.002445099875330925, + "learning_rate": 0.00022191607681044712, + "loss": 0.0014, + "num_input_tokens_seen": 190216464, + "step": 88145 + }, + { + "epoch": 14.380097879282218, + "grad_norm": 0.007500083185732365, + "learning_rate": 0.00022185692421405962, + "loss": 0.0008, + "num_input_tokens_seen": 190227248, + "step": 88150 + }, + { + "epoch": 14.380913539967374, + "grad_norm": 0.014515615068376064, + "learning_rate": 0.000221797777254594, + "loss": 0.0048, + "num_input_tokens_seen": 190238032, + "step": 88155 + }, + { + "epoch": 14.38172920065253, + "grad_norm": 0.0010566047858446836, + "learning_rate": 0.00022173863593324971, + "loss": 0.0798, + "num_input_tokens_seen": 190248272, + "step": 88160 + }, + { + "epoch": 14.382544861337683, + "grad_norm": 0.0034641206730157137, + "learning_rate": 0.00022167950025122463, + "loss": 0.0007, + "num_input_tokens_seen": 190259664, + "step": 88165 + }, + { + "epoch": 14.383360522022839, + "grad_norm": 0.0006269419682212174, + "learning_rate": 0.00022162037020971793, + "loss": 0.0004, + "num_input_tokens_seen": 190271088, + "step": 88170 + }, + { + "epoch": 14.384176182707993, + "grad_norm": 0.19482706487178802, + "learning_rate": 0.00022156124580992716, + "loss": 0.0046, + "num_input_tokens_seen": 190281200, + "step": 88175 + }, + { + "epoch": 14.384991843393149, + "grad_norm": 0.00034790855715982616, + "learning_rate": 0.00022150212705305118, + "loss": 0.072, + "num_input_tokens_seen": 190291472, + "step": 88180 + }, + { + "epoch": 14.385807504078304, + "grad_norm": 0.0018287214916199446, + "learning_rate": 0.00022144301394028793, + "loss": 0.0009, + "num_input_tokens_seen": 190302448, + "step": 88185 + }, + { + "epoch": 14.386623164763458, + "grad_norm": 0.23547394573688507, + "learning_rate": 0.0002213839064728353, + "loss": 0.0135, + "num_input_tokens_seen": 190314288, + "step": 88190 + }, + { + "epoch": 14.387438825448614, + "grad_norm": 0.00860915333032608, + "learning_rate": 0.0002213248046518913, + "loss": 0.0017, + "num_input_tokens_seen": 190325296, + "step": 88195 + }, + { + "epoch": 14.388254486133768, + "grad_norm": 0.027270235121250153, + "learning_rate": 0.00022126570847865368, + "loss": 0.0045, + "num_input_tokens_seen": 190333776, + "step": 88200 + }, + { + "epoch": 14.389070146818923, + "grad_norm": 0.0007071119034662843, + "learning_rate": 0.00022120661795432, + "loss": 0.0126, + "num_input_tokens_seen": 190344944, + "step": 88205 + }, + { + "epoch": 14.38988580750408, + "grad_norm": 0.018456434831023216, + "learning_rate": 0.00022114753308008795, + "loss": 0.0013, + "num_input_tokens_seen": 190355792, + "step": 88210 + }, + { + "epoch": 14.390701468189233, + "grad_norm": 0.0016674178186804056, + "learning_rate": 0.00022108845385715488, + "loss": 0.0094, + "num_input_tokens_seen": 190366640, + "step": 88215 + }, + { + "epoch": 14.391517128874389, + "grad_norm": 0.00029255589470267296, + "learning_rate": 0.00022102938028671816, + "loss": 0.0003, + "num_input_tokens_seen": 190377584, + "step": 88220 + }, + { + "epoch": 14.392332789559543, + "grad_norm": 0.010379987768828869, + "learning_rate": 0.00022097031236997488, + "loss": 0.0021, + "num_input_tokens_seen": 190388368, + "step": 88225 + }, + { + "epoch": 14.393148450244698, + "grad_norm": 0.05668234825134277, + "learning_rate": 0.00022091125010812202, + "loss": 0.0057, + "num_input_tokens_seen": 190398832, + "step": 88230 + }, + { + "epoch": 14.393964110929852, + "grad_norm": 0.0009420950664207339, + "learning_rate": 0.00022085219350235707, + "loss": 0.0057, + "num_input_tokens_seen": 190408720, + "step": 88235 + }, + { + "epoch": 14.394779771615008, + "grad_norm": 0.010695450939238071, + "learning_rate": 0.00022079314255387623, + "loss": 0.0008, + "num_input_tokens_seen": 190418832, + "step": 88240 + }, + { + "epoch": 14.395595432300164, + "grad_norm": 0.00033744267420843244, + "learning_rate": 0.00022073409726387688, + "loss": 0.0533, + "num_input_tokens_seen": 190429776, + "step": 88245 + }, + { + "epoch": 14.396411092985318, + "grad_norm": 0.12773603200912476, + "learning_rate": 0.000220675057633555, + "loss": 0.0038, + "num_input_tokens_seen": 190439824, + "step": 88250 + }, + { + "epoch": 14.397226753670473, + "grad_norm": 0.015510771423578262, + "learning_rate": 0.00022061602366410776, + "loss": 0.0056, + "num_input_tokens_seen": 190448880, + "step": 88255 + }, + { + "epoch": 14.398042414355627, + "grad_norm": 0.02884693071246147, + "learning_rate": 0.0002205569953567309, + "loss": 0.055, + "num_input_tokens_seen": 190460976, + "step": 88260 + }, + { + "epoch": 14.398858075040783, + "grad_norm": 0.0016890558181330562, + "learning_rate": 0.00022049797271262133, + "loss": 0.0336, + "num_input_tokens_seen": 190472752, + "step": 88265 + }, + { + "epoch": 14.399673735725939, + "grad_norm": 0.12455187737941742, + "learning_rate": 0.00022043895573297463, + "loss": 0.003, + "num_input_tokens_seen": 190482480, + "step": 88270 + }, + { + "epoch": 14.400489396411093, + "grad_norm": 0.0008485591388307512, + "learning_rate": 0.0002203799444189874, + "loss": 0.0034, + "num_input_tokens_seen": 190493680, + "step": 88275 + }, + { + "epoch": 14.401305057096248, + "grad_norm": 0.0030698119662702084, + "learning_rate": 0.00022032093877185504, + "loss": 0.0177, + "num_input_tokens_seen": 190502768, + "step": 88280 + }, + { + "epoch": 14.402120717781402, + "grad_norm": 0.008513492532074451, + "learning_rate": 0.000220261938792774, + "loss": 0.0028, + "num_input_tokens_seen": 190514000, + "step": 88285 + }, + { + "epoch": 14.402936378466558, + "grad_norm": 0.0014314469881355762, + "learning_rate": 0.00022020294448293925, + "loss": 0.0093, + "num_input_tokens_seen": 190524688, + "step": 88290 + }, + { + "epoch": 14.403752039151712, + "grad_norm": 0.04574088007211685, + "learning_rate": 0.00022014395584354717, + "loss": 0.0016, + "num_input_tokens_seen": 190534960, + "step": 88295 + }, + { + "epoch": 14.404567699836868, + "grad_norm": 0.010423467494547367, + "learning_rate": 0.0002200849728757925, + "loss": 0.0022, + "num_input_tokens_seen": 190546352, + "step": 88300 + }, + { + "epoch": 14.405383360522023, + "grad_norm": 0.03723536804318428, + "learning_rate": 0.00022002599558087126, + "loss": 0.0015, + "num_input_tokens_seen": 190557456, + "step": 88305 + }, + { + "epoch": 14.406199021207177, + "grad_norm": 0.022482335567474365, + "learning_rate": 0.00021996702395997807, + "loss": 0.0043, + "num_input_tokens_seen": 190567952, + "step": 88310 + }, + { + "epoch": 14.407014681892333, + "grad_norm": 0.004353964701294899, + "learning_rate": 0.00021990805801430874, + "loss": 0.0552, + "num_input_tokens_seen": 190577776, + "step": 88315 + }, + { + "epoch": 14.407830342577487, + "grad_norm": 0.0006343530840240419, + "learning_rate": 0.00021984909774505756, + "loss": 0.0322, + "num_input_tokens_seen": 190586800, + "step": 88320 + }, + { + "epoch": 14.408646003262643, + "grad_norm": 0.03434363380074501, + "learning_rate": 0.00021979014315342, + "loss": 0.0038, + "num_input_tokens_seen": 190597840, + "step": 88325 + }, + { + "epoch": 14.409461663947798, + "grad_norm": 0.0008730432018637657, + "learning_rate": 0.00021973119424059068, + "loss": 0.0042, + "num_input_tokens_seen": 190607696, + "step": 88330 + }, + { + "epoch": 14.410277324632952, + "grad_norm": 0.41108080744743347, + "learning_rate": 0.00021967225100776424, + "loss": 0.0117, + "num_input_tokens_seen": 190618608, + "step": 88335 + }, + { + "epoch": 14.411092985318108, + "grad_norm": 0.0016283750301226974, + "learning_rate": 0.00021961331345613522, + "loss": 0.004, + "num_input_tokens_seen": 190628048, + "step": 88340 + }, + { + "epoch": 14.411908646003262, + "grad_norm": 0.44536012411117554, + "learning_rate": 0.00021955438158689818, + "loss": 0.0801, + "num_input_tokens_seen": 190640400, + "step": 88345 + }, + { + "epoch": 14.412724306688418, + "grad_norm": 0.017954912036657333, + "learning_rate": 0.00021949545540124734, + "loss": 0.0026, + "num_input_tokens_seen": 190649648, + "step": 88350 + }, + { + "epoch": 14.413539967373573, + "grad_norm": 0.01651296764612198, + "learning_rate": 0.0002194365349003769, + "loss": 0.0015, + "num_input_tokens_seen": 190660816, + "step": 88355 + }, + { + "epoch": 14.414355628058727, + "grad_norm": 0.043049801141023636, + "learning_rate": 0.00021937762008548102, + "loss": 0.0035, + "num_input_tokens_seen": 190669648, + "step": 88360 + }, + { + "epoch": 14.415171288743883, + "grad_norm": 0.003901546122506261, + "learning_rate": 0.00021931871095775364, + "loss": 0.0816, + "num_input_tokens_seen": 190680112, + "step": 88365 + }, + { + "epoch": 14.415986949429037, + "grad_norm": 0.00514312693849206, + "learning_rate": 0.0002192598075183887, + "loss": 0.0023, + "num_input_tokens_seen": 190692112, + "step": 88370 + }, + { + "epoch": 14.416802610114193, + "grad_norm": 0.07757526636123657, + "learning_rate": 0.00021920090976857971, + "loss": 0.0059, + "num_input_tokens_seen": 190702160, + "step": 88375 + }, + { + "epoch": 14.417618270799348, + "grad_norm": 0.0037375683896243572, + "learning_rate": 0.00021914201770952086, + "loss": 0.0253, + "num_input_tokens_seen": 190711216, + "step": 88380 + }, + { + "epoch": 14.418433931484502, + "grad_norm": 0.006305972579866648, + "learning_rate": 0.00021908313134240493, + "loss": 0.0025, + "num_input_tokens_seen": 190722544, + "step": 88385 + }, + { + "epoch": 14.419249592169658, + "grad_norm": 0.026911692693829536, + "learning_rate": 0.00021902425066842608, + "loss": 0.0019, + "num_input_tokens_seen": 190732496, + "step": 88390 + }, + { + "epoch": 14.420065252854812, + "grad_norm": 0.02325587347149849, + "learning_rate": 0.00021896537568877688, + "loss": 0.002, + "num_input_tokens_seen": 190743280, + "step": 88395 + }, + { + "epoch": 14.420880913539968, + "grad_norm": 0.06777871400117874, + "learning_rate": 0.00021890650640465125, + "loss": 0.0047, + "num_input_tokens_seen": 190753520, + "step": 88400 + }, + { + "epoch": 14.421696574225122, + "grad_norm": 0.00013905800005886704, + "learning_rate": 0.00021884764281724145, + "loss": 0.051, + "num_input_tokens_seen": 190764080, + "step": 88405 + }, + { + "epoch": 14.422512234910277, + "grad_norm": 0.9559711813926697, + "learning_rate": 0.00021878878492774125, + "loss": 0.0814, + "num_input_tokens_seen": 190775120, + "step": 88410 + }, + { + "epoch": 14.423327895595433, + "grad_norm": 0.0019706906750798225, + "learning_rate": 0.00021872993273734266, + "loss": 0.0071, + "num_input_tokens_seen": 190785488, + "step": 88415 + }, + { + "epoch": 14.424143556280587, + "grad_norm": 0.034496963024139404, + "learning_rate": 0.0002186710862472392, + "loss": 0.0043, + "num_input_tokens_seen": 190796400, + "step": 88420 + }, + { + "epoch": 14.424959216965743, + "grad_norm": 2.296041488647461, + "learning_rate": 0.00021861224545862264, + "loss": 0.1092, + "num_input_tokens_seen": 190807280, + "step": 88425 + }, + { + "epoch": 14.425774877650896, + "grad_norm": 0.0017883836990222335, + "learning_rate": 0.0002185534103726863, + "loss": 0.0018, + "num_input_tokens_seen": 190817296, + "step": 88430 + }, + { + "epoch": 14.426590538336052, + "grad_norm": 0.0004735244147013873, + "learning_rate": 0.00021849458099062175, + "loss": 0.0017, + "num_input_tokens_seen": 190826864, + "step": 88435 + }, + { + "epoch": 14.427406199021208, + "grad_norm": 0.013706550002098083, + "learning_rate": 0.00021843575731362187, + "loss": 0.0059, + "num_input_tokens_seen": 190837264, + "step": 88440 + }, + { + "epoch": 14.428221859706362, + "grad_norm": 0.0024802517145872116, + "learning_rate": 0.0002183769393428785, + "loss": 0.0009, + "num_input_tokens_seen": 190847920, + "step": 88445 + }, + { + "epoch": 14.429037520391518, + "grad_norm": 0.053927190601825714, + "learning_rate": 0.00021831812707958376, + "loss": 0.0021, + "num_input_tokens_seen": 190858544, + "step": 88450 + }, + { + "epoch": 14.429853181076671, + "grad_norm": 0.0005102112190797925, + "learning_rate": 0.00021825932052492946, + "loss": 0.0173, + "num_input_tokens_seen": 190869168, + "step": 88455 + }, + { + "epoch": 14.430668841761827, + "grad_norm": 0.00041594402864575386, + "learning_rate": 0.0002182005196801075, + "loss": 0.0009, + "num_input_tokens_seen": 190880976, + "step": 88460 + }, + { + "epoch": 14.431484502446983, + "grad_norm": 0.005394402425736189, + "learning_rate": 0.0002181417245463095, + "loss": 0.0037, + "num_input_tokens_seen": 190892240, + "step": 88465 + }, + { + "epoch": 14.432300163132137, + "grad_norm": 0.0014049963792786002, + "learning_rate": 0.00021808293512472698, + "loss": 0.0004, + "num_input_tokens_seen": 190904656, + "step": 88470 + }, + { + "epoch": 14.433115823817293, + "grad_norm": 0.20719604194164276, + "learning_rate": 0.0002180241514165514, + "loss": 0.0082, + "num_input_tokens_seen": 190915344, + "step": 88475 + }, + { + "epoch": 14.433931484502446, + "grad_norm": 0.0023356846068054438, + "learning_rate": 0.00021796537342297413, + "loss": 0.006, + "num_input_tokens_seen": 190925936, + "step": 88480 + }, + { + "epoch": 14.434747145187602, + "grad_norm": 0.001363952993415296, + "learning_rate": 0.00021790660114518633, + "loss": 0.0041, + "num_input_tokens_seen": 190936432, + "step": 88485 + }, + { + "epoch": 14.435562805872756, + "grad_norm": 0.02121797576546669, + "learning_rate": 0.0002178478345843792, + "loss": 0.0014, + "num_input_tokens_seen": 190946960, + "step": 88490 + }, + { + "epoch": 14.436378466557912, + "grad_norm": 0.0010812993859872222, + "learning_rate": 0.00021778907374174356, + "loss": 0.0102, + "num_input_tokens_seen": 190956656, + "step": 88495 + }, + { + "epoch": 14.437194127243067, + "grad_norm": 0.0005089049227535725, + "learning_rate": 0.00021773031861847036, + "loss": 0.0542, + "num_input_tokens_seen": 190968592, + "step": 88500 + }, + { + "epoch": 14.438009787928221, + "grad_norm": 0.00118454999756068, + "learning_rate": 0.0002176715692157503, + "loss": 0.1015, + "num_input_tokens_seen": 190979856, + "step": 88505 + }, + { + "epoch": 14.438825448613377, + "grad_norm": 0.06683529913425446, + "learning_rate": 0.00021761282553477412, + "loss": 0.0071, + "num_input_tokens_seen": 190990064, + "step": 88510 + }, + { + "epoch": 14.439641109298531, + "grad_norm": 0.08214728534221649, + "learning_rate": 0.00021755408757673228, + "loss": 0.0142, + "num_input_tokens_seen": 191000432, + "step": 88515 + }, + { + "epoch": 14.440456769983687, + "grad_norm": 0.00022285875456873327, + "learning_rate": 0.00021749535534281488, + "loss": 0.0028, + "num_input_tokens_seen": 191011920, + "step": 88520 + }, + { + "epoch": 14.441272430668842, + "grad_norm": 0.003935231827199459, + "learning_rate": 0.00021743662883421294, + "loss": 0.0006, + "num_input_tokens_seen": 191022096, + "step": 88525 + }, + { + "epoch": 14.442088091353996, + "grad_norm": 0.12180408835411072, + "learning_rate": 0.00021737790805211578, + "loss": 0.0043, + "num_input_tokens_seen": 191032784, + "step": 88530 + }, + { + "epoch": 14.442903752039152, + "grad_norm": 0.24376216530799866, + "learning_rate": 0.00021731919299771424, + "loss": 0.0048, + "num_input_tokens_seen": 191042640, + "step": 88535 + }, + { + "epoch": 14.443719412724306, + "grad_norm": 0.04353031888604164, + "learning_rate": 0.00021726048367219747, + "loss": 0.0107, + "num_input_tokens_seen": 191054512, + "step": 88540 + }, + { + "epoch": 14.444535073409462, + "grad_norm": 0.0031608459539711475, + "learning_rate": 0.00021720178007675583, + "loss": 0.0046, + "num_input_tokens_seen": 191064624, + "step": 88545 + }, + { + "epoch": 14.445350734094617, + "grad_norm": 0.0022934952285140753, + "learning_rate": 0.00021714308221257889, + "loss": 0.0011, + "num_input_tokens_seen": 191076720, + "step": 88550 + }, + { + "epoch": 14.446166394779771, + "grad_norm": 0.0005051264888606966, + "learning_rate": 0.00021708439008085624, + "loss": 0.0028, + "num_input_tokens_seen": 191086960, + "step": 88555 + }, + { + "epoch": 14.446982055464927, + "grad_norm": 0.10919643193483353, + "learning_rate": 0.0002170257036827773, + "loss": 0.0419, + "num_input_tokens_seen": 191098384, + "step": 88560 + }, + { + "epoch": 14.447797716150081, + "grad_norm": 0.0001894766464829445, + "learning_rate": 0.00021696702301953147, + "loss": 0.0013, + "num_input_tokens_seen": 191109776, + "step": 88565 + }, + { + "epoch": 14.448613376835237, + "grad_norm": 0.001767312758602202, + "learning_rate": 0.00021690834809230797, + "loss": 0.0037, + "num_input_tokens_seen": 191119888, + "step": 88570 + }, + { + "epoch": 14.449429037520392, + "grad_norm": 0.23064282536506653, + "learning_rate": 0.00021684967890229595, + "loss": 0.0059, + "num_input_tokens_seen": 191130032, + "step": 88575 + }, + { + "epoch": 14.450244698205546, + "grad_norm": 0.0037373783998191357, + "learning_rate": 0.00021679101545068436, + "loss": 0.0017, + "num_input_tokens_seen": 191140176, + "step": 88580 + }, + { + "epoch": 14.451060358890702, + "grad_norm": 0.0006014195969328284, + "learning_rate": 0.00021673235773866212, + "loss": 0.0008, + "num_input_tokens_seen": 191151024, + "step": 88585 + }, + { + "epoch": 14.451876019575856, + "grad_norm": 0.0006806448800489306, + "learning_rate": 0.00021667370576741802, + "loss": 0.0049, + "num_input_tokens_seen": 191162416, + "step": 88590 + }, + { + "epoch": 14.452691680261012, + "grad_norm": 0.005862915422767401, + "learning_rate": 0.00021661505953814064, + "loss": 0.0041, + "num_input_tokens_seen": 191173648, + "step": 88595 + }, + { + "epoch": 14.453507340946166, + "grad_norm": 0.004005435388535261, + "learning_rate": 0.0002165564190520186, + "loss": 0.0052, + "num_input_tokens_seen": 191184560, + "step": 88600 + }, + { + "epoch": 14.454323001631321, + "grad_norm": 0.09444338828325272, + "learning_rate": 0.00021649778431024035, + "loss": 0.0056, + "num_input_tokens_seen": 191195696, + "step": 88605 + }, + { + "epoch": 14.455138662316477, + "grad_norm": 0.003349320963025093, + "learning_rate": 0.0002164391553139941, + "loss": 0.0631, + "num_input_tokens_seen": 191207536, + "step": 88610 + }, + { + "epoch": 14.455954323001631, + "grad_norm": 0.0014255112037062645, + "learning_rate": 0.00021638053206446813, + "loss": 0.0026, + "num_input_tokens_seen": 191218992, + "step": 88615 + }, + { + "epoch": 14.456769983686787, + "grad_norm": 0.000259996741078794, + "learning_rate": 0.00021632191456285045, + "loss": 0.0005, + "num_input_tokens_seen": 191230448, + "step": 88620 + }, + { + "epoch": 14.45758564437194, + "grad_norm": 0.18017897009849548, + "learning_rate": 0.00021626330281032902, + "loss": 0.0073, + "num_input_tokens_seen": 191241040, + "step": 88625 + }, + { + "epoch": 14.458401305057096, + "grad_norm": 0.004403542261570692, + "learning_rate": 0.00021620469680809173, + "loss": 0.0003, + "num_input_tokens_seen": 191251792, + "step": 88630 + }, + { + "epoch": 14.459216965742252, + "grad_norm": 0.004544136114418507, + "learning_rate": 0.0002161460965573263, + "loss": 0.005, + "num_input_tokens_seen": 191262480, + "step": 88635 + }, + { + "epoch": 14.460032626427406, + "grad_norm": 0.0009777155937626958, + "learning_rate": 0.0002160875020592203, + "loss": 0.0004, + "num_input_tokens_seen": 191274320, + "step": 88640 + }, + { + "epoch": 14.460848287112562, + "grad_norm": 0.0008451201720163226, + "learning_rate": 0.00021602891331496123, + "loss": 0.0009, + "num_input_tokens_seen": 191285616, + "step": 88645 + }, + { + "epoch": 14.461663947797716, + "grad_norm": 0.008412153460085392, + "learning_rate": 0.0002159703303257363, + "loss": 0.0009, + "num_input_tokens_seen": 191296208, + "step": 88650 + }, + { + "epoch": 14.462479608482871, + "grad_norm": 0.0013024318031966686, + "learning_rate": 0.00021591175309273314, + "loss": 0.0063, + "num_input_tokens_seen": 191306288, + "step": 88655 + }, + { + "epoch": 14.463295269168025, + "grad_norm": 0.010823320597410202, + "learning_rate": 0.00021585318161713868, + "loss": 0.0006, + "num_input_tokens_seen": 191316560, + "step": 88660 + }, + { + "epoch": 14.464110929853181, + "grad_norm": 0.005986808333545923, + "learning_rate": 0.00021579461590013994, + "loss": 0.0021, + "num_input_tokens_seen": 191326096, + "step": 88665 + }, + { + "epoch": 14.464926590538337, + "grad_norm": 0.001371503691188991, + "learning_rate": 0.0002157360559429239, + "loss": 0.0008, + "num_input_tokens_seen": 191335824, + "step": 88670 + }, + { + "epoch": 14.46574225122349, + "grad_norm": 0.0011750278063118458, + "learning_rate": 0.00021567750174667722, + "loss": 0.0009, + "num_input_tokens_seen": 191346192, + "step": 88675 + }, + { + "epoch": 14.466557911908646, + "grad_norm": 0.011948419734835625, + "learning_rate": 0.00021561895331258674, + "loss": 0.0019, + "num_input_tokens_seen": 191355248, + "step": 88680 + }, + { + "epoch": 14.4673735725938, + "grad_norm": 0.14903397858142853, + "learning_rate": 0.0002155604106418389, + "loss": 0.0044, + "num_input_tokens_seen": 191365872, + "step": 88685 + }, + { + "epoch": 14.468189233278956, + "grad_norm": 0.0004821582406293601, + "learning_rate": 0.00021550187373562015, + "loss": 0.0022, + "num_input_tokens_seen": 191377296, + "step": 88690 + }, + { + "epoch": 14.469004893964112, + "grad_norm": 0.01549572590738535, + "learning_rate": 0.00021544334259511688, + "loss": 0.0387, + "num_input_tokens_seen": 191388432, + "step": 88695 + }, + { + "epoch": 14.469820554649266, + "grad_norm": 0.004329700022935867, + "learning_rate": 0.0002153848172215152, + "loss": 0.0009, + "num_input_tokens_seen": 191397552, + "step": 88700 + }, + { + "epoch": 14.470636215334421, + "grad_norm": 0.049543846398591995, + "learning_rate": 0.00021532629761600132, + "loss": 0.0034, + "num_input_tokens_seen": 191408688, + "step": 88705 + }, + { + "epoch": 14.471451876019575, + "grad_norm": 0.8820896744728088, + "learning_rate": 0.00021526778377976114, + "loss": 0.0914, + "num_input_tokens_seen": 191420560, + "step": 88710 + }, + { + "epoch": 14.47226753670473, + "grad_norm": 0.008689154870808125, + "learning_rate": 0.00021520927571398052, + "loss": 0.0012, + "num_input_tokens_seen": 191431120, + "step": 88715 + }, + { + "epoch": 14.473083197389887, + "grad_norm": 0.00041984725976362824, + "learning_rate": 0.00021515077341984523, + "loss": 0.0953, + "num_input_tokens_seen": 191441008, + "step": 88720 + }, + { + "epoch": 14.47389885807504, + "grad_norm": 0.0017140257405117154, + "learning_rate": 0.00021509227689854083, + "loss": 0.0006, + "num_input_tokens_seen": 191451312, + "step": 88725 + }, + { + "epoch": 14.474714518760196, + "grad_norm": 0.020875675603747368, + "learning_rate": 0.0002150337861512529, + "loss": 0.0065, + "num_input_tokens_seen": 191461232, + "step": 88730 + }, + { + "epoch": 14.47553017944535, + "grad_norm": 0.40436792373657227, + "learning_rate": 0.0002149753011791668, + "loss": 0.0177, + "num_input_tokens_seen": 191472176, + "step": 88735 + }, + { + "epoch": 14.476345840130506, + "grad_norm": 0.008699612691998482, + "learning_rate": 0.00021491682198346778, + "loss": 0.0009, + "num_input_tokens_seen": 191482832, + "step": 88740 + }, + { + "epoch": 14.477161500815662, + "grad_norm": 0.004119328688830137, + "learning_rate": 0.00021485834856534104, + "loss": 0.0062, + "num_input_tokens_seen": 191494256, + "step": 88745 + }, + { + "epoch": 14.477977161500815, + "grad_norm": 0.044878967106342316, + "learning_rate": 0.00021479988092597157, + "loss": 0.0021, + "num_input_tokens_seen": 191505840, + "step": 88750 + }, + { + "epoch": 14.478792822185971, + "grad_norm": 0.04304588958621025, + "learning_rate": 0.00021474141906654414, + "loss": 0.0023, + "num_input_tokens_seen": 191517296, + "step": 88755 + }, + { + "epoch": 14.479608482871125, + "grad_norm": 0.002568679628893733, + "learning_rate": 0.00021468296298824413, + "loss": 0.0008, + "num_input_tokens_seen": 191526608, + "step": 88760 + }, + { + "epoch": 14.48042414355628, + "grad_norm": 0.24028834700584412, + "learning_rate": 0.00021462451269225547, + "loss": 0.0251, + "num_input_tokens_seen": 191537552, + "step": 88765 + }, + { + "epoch": 14.481239804241435, + "grad_norm": 0.6260242462158203, + "learning_rate": 0.00021456606817976337, + "loss": 0.0116, + "num_input_tokens_seen": 191548016, + "step": 88770 + }, + { + "epoch": 14.48205546492659, + "grad_norm": 0.001856622751802206, + "learning_rate": 0.00021450762945195167, + "loss": 0.1583, + "num_input_tokens_seen": 191558864, + "step": 88775 + }, + { + "epoch": 14.482871125611746, + "grad_norm": 0.0011177852284163237, + "learning_rate": 0.00021444919651000544, + "loss": 0.0253, + "num_input_tokens_seen": 191569584, + "step": 88780 + }, + { + "epoch": 14.4836867862969, + "grad_norm": 0.0035513208713382483, + "learning_rate": 0.0002143907693551081, + "loss": 0.0014, + "num_input_tokens_seen": 191579888, + "step": 88785 + }, + { + "epoch": 14.484502446982056, + "grad_norm": 0.0011009352747350931, + "learning_rate": 0.00021433234798844448, + "loss": 0.0009, + "num_input_tokens_seen": 191589776, + "step": 88790 + }, + { + "epoch": 14.48531810766721, + "grad_norm": 0.2989081144332886, + "learning_rate": 0.00021427393241119785, + "loss": 0.0166, + "num_input_tokens_seen": 191600496, + "step": 88795 + }, + { + "epoch": 14.486133768352365, + "grad_norm": 0.005109469406306744, + "learning_rate": 0.00021421552262455268, + "loss": 0.0041, + "num_input_tokens_seen": 191610960, + "step": 88800 + }, + { + "epoch": 14.486949429037521, + "grad_norm": 0.02277921698987484, + "learning_rate": 0.00021415711862969244, + "loss": 0.0009, + "num_input_tokens_seen": 191621296, + "step": 88805 + }, + { + "epoch": 14.487765089722675, + "grad_norm": 0.0015186513774096966, + "learning_rate": 0.00021409872042780083, + "loss": 0.0036, + "num_input_tokens_seen": 191632240, + "step": 88810 + }, + { + "epoch": 14.48858075040783, + "grad_norm": 0.0002403327962383628, + "learning_rate": 0.00021404032802006134, + "loss": 0.0381, + "num_input_tokens_seen": 191643696, + "step": 88815 + }, + { + "epoch": 14.489396411092985, + "grad_norm": 0.003840399207547307, + "learning_rate": 0.00021398194140765736, + "loss": 0.0037, + "num_input_tokens_seen": 191655632, + "step": 88820 + }, + { + "epoch": 14.49021207177814, + "grad_norm": 0.0017868555150926113, + "learning_rate": 0.0002139235605917722, + "loss": 0.0068, + "num_input_tokens_seen": 191666576, + "step": 88825 + }, + { + "epoch": 14.491027732463296, + "grad_norm": 0.007139697205275297, + "learning_rate": 0.00021386518557358898, + "loss": 0.0046, + "num_input_tokens_seen": 191676240, + "step": 88830 + }, + { + "epoch": 14.49184339314845, + "grad_norm": 0.010415250435471535, + "learning_rate": 0.00021380681635429079, + "loss": 0.0019, + "num_input_tokens_seen": 191687088, + "step": 88835 + }, + { + "epoch": 14.492659053833606, + "grad_norm": 0.0006793870707042515, + "learning_rate": 0.00021374845293506046, + "loss": 0.001, + "num_input_tokens_seen": 191697424, + "step": 88840 + }, + { + "epoch": 14.49347471451876, + "grad_norm": 0.0063953036442399025, + "learning_rate": 0.00021369009531708094, + "loss": 0.0031, + "num_input_tokens_seen": 191708624, + "step": 88845 + }, + { + "epoch": 14.494290375203915, + "grad_norm": 0.00016032144776545465, + "learning_rate": 0.0002136317435015348, + "loss": 0.0014, + "num_input_tokens_seen": 191718992, + "step": 88850 + }, + { + "epoch": 14.49510603588907, + "grad_norm": 0.0008625874179415405, + "learning_rate": 0.0002135733974896047, + "loss": 0.0006, + "num_input_tokens_seen": 191730128, + "step": 88855 + }, + { + "epoch": 14.495921696574225, + "grad_norm": 0.000988926156423986, + "learning_rate": 0.00021351505728247282, + "loss": 0.0029, + "num_input_tokens_seen": 191741488, + "step": 88860 + }, + { + "epoch": 14.49673735725938, + "grad_norm": 0.0023778195027261972, + "learning_rate": 0.00021345672288132218, + "loss": 0.0023, + "num_input_tokens_seen": 191753456, + "step": 88865 + }, + { + "epoch": 14.497553017944535, + "grad_norm": 0.002424864796921611, + "learning_rate": 0.00021339839428733415, + "loss": 0.0184, + "num_input_tokens_seen": 191764592, + "step": 88870 + }, + { + "epoch": 14.49836867862969, + "grad_norm": 0.34112095832824707, + "learning_rate": 0.0002133400715016916, + "loss": 0.1519, + "num_input_tokens_seen": 191774224, + "step": 88875 + }, + { + "epoch": 14.499184339314844, + "grad_norm": 0.9522610902786255, + "learning_rate": 0.0002132817545255758, + "loss": 0.1166, + "num_input_tokens_seen": 191784912, + "step": 88880 + }, + { + "epoch": 14.5, + "grad_norm": 0.4158490002155304, + "learning_rate": 0.0002132234433601693, + "loss": 0.0194, + "num_input_tokens_seen": 191795568, + "step": 88885 + }, + { + "epoch": 14.500815660685156, + "grad_norm": 0.0062528932467103004, + "learning_rate": 0.00021316513800665322, + "loss": 0.0111, + "num_input_tokens_seen": 191806768, + "step": 88890 + }, + { + "epoch": 14.50163132137031, + "grad_norm": 0.0027010778430849314, + "learning_rate": 0.0002131068384662098, + "loss": 0.0091, + "num_input_tokens_seen": 191818352, + "step": 88895 + }, + { + "epoch": 14.502446982055465, + "grad_norm": 0.00341239757835865, + "learning_rate": 0.00021304854474001993, + "loss": 0.0007, + "num_input_tokens_seen": 191828048, + "step": 88900 + }, + { + "epoch": 14.50326264274062, + "grad_norm": 0.037552908062934875, + "learning_rate": 0.00021299025682926565, + "loss": 0.0026, + "num_input_tokens_seen": 191838512, + "step": 88905 + }, + { + "epoch": 14.504078303425775, + "grad_norm": 0.00317358854226768, + "learning_rate": 0.0002129319747351276, + "loss": 0.0104, + "num_input_tokens_seen": 191847856, + "step": 88910 + }, + { + "epoch": 14.50489396411093, + "grad_norm": 0.00020476865756791085, + "learning_rate": 0.00021287369845878756, + "loss": 0.0008, + "num_input_tokens_seen": 191858960, + "step": 88915 + }, + { + "epoch": 14.505709624796085, + "grad_norm": 0.0012875087559223175, + "learning_rate": 0.00021281542800142595, + "loss": 0.0033, + "num_input_tokens_seen": 191870064, + "step": 88920 + }, + { + "epoch": 14.50652528548124, + "grad_norm": 0.11623962968587875, + "learning_rate": 0.00021275716336422435, + "loss": 0.0056, + "num_input_tokens_seen": 191882160, + "step": 88925 + }, + { + "epoch": 14.507340946166394, + "grad_norm": 0.002890993608161807, + "learning_rate": 0.00021269890454836288, + "loss": 0.0213, + "num_input_tokens_seen": 191893072, + "step": 88930 + }, + { + "epoch": 14.50815660685155, + "grad_norm": 0.006453386973589659, + "learning_rate": 0.00021264065155502293, + "loss": 0.0053, + "num_input_tokens_seen": 191905072, + "step": 88935 + }, + { + "epoch": 14.508972267536706, + "grad_norm": 0.016498737037181854, + "learning_rate": 0.00021258240438538434, + "loss": 0.0026, + "num_input_tokens_seen": 191916336, + "step": 88940 + }, + { + "epoch": 14.50978792822186, + "grad_norm": 0.0016078692860901356, + "learning_rate": 0.0002125241630406281, + "loss": 0.0309, + "num_input_tokens_seen": 191927376, + "step": 88945 + }, + { + "epoch": 14.510603588907015, + "grad_norm": 0.1636592447757721, + "learning_rate": 0.00021246592752193445, + "loss": 0.008, + "num_input_tokens_seen": 191938832, + "step": 88950 + }, + { + "epoch": 14.51141924959217, + "grad_norm": 0.7704768776893616, + "learning_rate": 0.00021240769783048352, + "loss": 0.0485, + "num_input_tokens_seen": 191950896, + "step": 88955 + }, + { + "epoch": 14.512234910277325, + "grad_norm": 0.00040649878792464733, + "learning_rate": 0.00021234947396745542, + "loss": 0.0024, + "num_input_tokens_seen": 191961680, + "step": 88960 + }, + { + "epoch": 14.513050570962479, + "grad_norm": 0.011789959855377674, + "learning_rate": 0.00021229125593403016, + "loss": 0.0038, + "num_input_tokens_seen": 191972464, + "step": 88965 + }, + { + "epoch": 14.513866231647635, + "grad_norm": 0.005680643953382969, + "learning_rate": 0.00021223304373138753, + "loss": 0.0052, + "num_input_tokens_seen": 191983376, + "step": 88970 + }, + { + "epoch": 14.51468189233279, + "grad_norm": 0.004592582117766142, + "learning_rate": 0.00021217483736070736, + "loss": 0.0256, + "num_input_tokens_seen": 191993936, + "step": 88975 + }, + { + "epoch": 14.515497553017944, + "grad_norm": 0.017762063071131706, + "learning_rate": 0.00021211663682316922, + "loss": 0.0107, + "num_input_tokens_seen": 192005584, + "step": 88980 + }, + { + "epoch": 14.5163132137031, + "grad_norm": 0.00018946125055663288, + "learning_rate": 0.00021205844211995268, + "loss": 0.0037, + "num_input_tokens_seen": 192016624, + "step": 88985 + }, + { + "epoch": 14.517128874388254, + "grad_norm": 0.01492861844599247, + "learning_rate": 0.0002120002532522371, + "loss": 0.0044, + "num_input_tokens_seen": 192027184, + "step": 88990 + }, + { + "epoch": 14.51794453507341, + "grad_norm": 0.0027063197921961546, + "learning_rate": 0.00021194207022120153, + "loss": 0.0004, + "num_input_tokens_seen": 192037328, + "step": 88995 + }, + { + "epoch": 14.518760195758565, + "grad_norm": 0.4809955060482025, + "learning_rate": 0.0002118838930280257, + "loss": 0.0442, + "num_input_tokens_seen": 192048272, + "step": 89000 + }, + { + "epoch": 14.51957585644372, + "grad_norm": 0.007320154923945665, + "learning_rate": 0.00021182572167388792, + "loss": 0.0047, + "num_input_tokens_seen": 192058448, + "step": 89005 + }, + { + "epoch": 14.520391517128875, + "grad_norm": 0.00044022753718309104, + "learning_rate": 0.00021176755615996785, + "loss": 0.0028, + "num_input_tokens_seen": 192069232, + "step": 89010 + }, + { + "epoch": 14.521207177814029, + "grad_norm": 0.012059107422828674, + "learning_rate": 0.00021170939648744346, + "loss": 0.0015, + "num_input_tokens_seen": 192080240, + "step": 89015 + }, + { + "epoch": 14.522022838499185, + "grad_norm": 0.0005549166235141456, + "learning_rate": 0.00021165124265749431, + "loss": 0.0041, + "num_input_tokens_seen": 192091184, + "step": 89020 + }, + { + "epoch": 14.522838499184338, + "grad_norm": 0.010669681243598461, + "learning_rate": 0.00021159309467129816, + "loss": 0.0013, + "num_input_tokens_seen": 192101872, + "step": 89025 + }, + { + "epoch": 14.523654159869494, + "grad_norm": 0.005094995256513357, + "learning_rate": 0.0002115349525300342, + "loss": 0.0095, + "num_input_tokens_seen": 192113232, + "step": 89030 + }, + { + "epoch": 14.52446982055465, + "grad_norm": 0.0003210730792488903, + "learning_rate": 0.00021147681623487997, + "loss": 0.0007, + "num_input_tokens_seen": 192122800, + "step": 89035 + }, + { + "epoch": 14.525285481239804, + "grad_norm": 0.0010976665653288364, + "learning_rate": 0.0002114186857870144, + "loss": 0.0477, + "num_input_tokens_seen": 192132944, + "step": 89040 + }, + { + "epoch": 14.52610114192496, + "grad_norm": 0.01738951914012432, + "learning_rate": 0.00021136056118761494, + "loss": 0.0063, + "num_input_tokens_seen": 192144112, + "step": 89045 + }, + { + "epoch": 14.526916802610113, + "grad_norm": 0.00266857142560184, + "learning_rate": 0.00021130244243786024, + "loss": 0.0798, + "num_input_tokens_seen": 192154960, + "step": 89050 + }, + { + "epoch": 14.52773246329527, + "grad_norm": 0.04859494790434837, + "learning_rate": 0.00021124432953892742, + "loss": 0.1622, + "num_input_tokens_seen": 192165712, + "step": 89055 + }, + { + "epoch": 14.528548123980425, + "grad_norm": 0.00014240448945201933, + "learning_rate": 0.00021118622249199494, + "loss": 0.0031, + "num_input_tokens_seen": 192176944, + "step": 89060 + }, + { + "epoch": 14.529363784665579, + "grad_norm": 0.14338980615139008, + "learning_rate": 0.00021112812129823967, + "loss": 0.005, + "num_input_tokens_seen": 192188016, + "step": 89065 + }, + { + "epoch": 14.530179445350734, + "grad_norm": 0.002399766817688942, + "learning_rate": 0.00021107002595883978, + "loss": 0.0025, + "num_input_tokens_seen": 192199760, + "step": 89070 + }, + { + "epoch": 14.530995106035888, + "grad_norm": 2.23323917388916, + "learning_rate": 0.00021101193647497208, + "loss": 0.0766, + "num_input_tokens_seen": 192211312, + "step": 89075 + }, + { + "epoch": 14.531810766721044, + "grad_norm": 0.0017566434107720852, + "learning_rate": 0.00021095385284781426, + "loss": 0.0631, + "num_input_tokens_seen": 192221104, + "step": 89080 + }, + { + "epoch": 14.5326264274062, + "grad_norm": 0.0018510406371206045, + "learning_rate": 0.00021089577507854324, + "loss": 0.004, + "num_input_tokens_seen": 192230768, + "step": 89085 + }, + { + "epoch": 14.533442088091354, + "grad_norm": 0.08839358389377594, + "learning_rate": 0.00021083770316833618, + "loss": 0.0029, + "num_input_tokens_seen": 192241808, + "step": 89090 + }, + { + "epoch": 14.53425774877651, + "grad_norm": 0.00046705937711521983, + "learning_rate": 0.00021077963711836983, + "loss": 0.0026, + "num_input_tokens_seen": 192252464, + "step": 89095 + }, + { + "epoch": 14.535073409461663, + "grad_norm": 0.012293090112507343, + "learning_rate": 0.00021072157692982103, + "loss": 0.0014, + "num_input_tokens_seen": 192262544, + "step": 89100 + }, + { + "epoch": 14.535889070146819, + "grad_norm": 0.8182880878448486, + "learning_rate": 0.00021066352260386644, + "loss": 0.0883, + "num_input_tokens_seen": 192273840, + "step": 89105 + }, + { + "epoch": 14.536704730831975, + "grad_norm": 0.0243599284440279, + "learning_rate": 0.0002106054741416827, + "loss": 0.003, + "num_input_tokens_seen": 192284880, + "step": 89110 + }, + { + "epoch": 14.537520391517129, + "grad_norm": 0.012285619042813778, + "learning_rate": 0.00021054743154444607, + "loss": 0.0438, + "num_input_tokens_seen": 192295920, + "step": 89115 + }, + { + "epoch": 14.538336052202284, + "grad_norm": 0.010910829529166222, + "learning_rate": 0.00021048939481333297, + "loss": 0.0898, + "num_input_tokens_seen": 192307216, + "step": 89120 + }, + { + "epoch": 14.539151712887438, + "grad_norm": 0.0017362289363518357, + "learning_rate": 0.00021043136394951955, + "loss": 0.0007, + "num_input_tokens_seen": 192318448, + "step": 89125 + }, + { + "epoch": 14.539967373572594, + "grad_norm": 0.0007824370986782014, + "learning_rate": 0.00021037333895418186, + "loss": 0.0306, + "num_input_tokens_seen": 192329424, + "step": 89130 + }, + { + "epoch": 14.540783034257748, + "grad_norm": 0.024142490699887276, + "learning_rate": 0.0002103153198284959, + "loss": 0.0022, + "num_input_tokens_seen": 192341456, + "step": 89135 + }, + { + "epoch": 14.541598694942904, + "grad_norm": 0.0368606336414814, + "learning_rate": 0.0002102573065736373, + "loss": 0.0035, + "num_input_tokens_seen": 192352592, + "step": 89140 + }, + { + "epoch": 14.54241435562806, + "grad_norm": 0.0678473487496376, + "learning_rate": 0.00021019929919078228, + "loss": 0.009, + "num_input_tokens_seen": 192363312, + "step": 89145 + }, + { + "epoch": 14.543230016313213, + "grad_norm": 0.0006564795039594173, + "learning_rate": 0.00021014129768110574, + "loss": 0.0744, + "num_input_tokens_seen": 192374288, + "step": 89150 + }, + { + "epoch": 14.544045676998369, + "grad_norm": 0.3923884928226471, + "learning_rate": 0.0002100833020457839, + "loss": 0.0157, + "num_input_tokens_seen": 192384976, + "step": 89155 + }, + { + "epoch": 14.544861337683523, + "grad_norm": 0.0016631023027002811, + "learning_rate": 0.00021002531228599136, + "loss": 0.0623, + "num_input_tokens_seen": 192395152, + "step": 89160 + }, + { + "epoch": 14.545676998368679, + "grad_norm": 0.022583885118365288, + "learning_rate": 0.00020996732840290405, + "loss": 0.0102, + "num_input_tokens_seen": 192406864, + "step": 89165 + }, + { + "epoch": 14.546492659053834, + "grad_norm": 0.0037688070442527533, + "learning_rate": 0.0002099093503976965, + "loss": 0.0016, + "num_input_tokens_seen": 192418192, + "step": 89170 + }, + { + "epoch": 14.547308319738988, + "grad_norm": 0.007164164911955595, + "learning_rate": 0.0002098513782715442, + "loss": 0.0037, + "num_input_tokens_seen": 192429232, + "step": 89175 + }, + { + "epoch": 14.548123980424144, + "grad_norm": 0.03448100760579109, + "learning_rate": 0.00020979341202562152, + "loss": 0.0079, + "num_input_tokens_seen": 192439632, + "step": 89180 + }, + { + "epoch": 14.548939641109298, + "grad_norm": 0.0371573381125927, + "learning_rate": 0.00020973545166110368, + "loss": 0.0041, + "num_input_tokens_seen": 192451024, + "step": 89185 + }, + { + "epoch": 14.549755301794454, + "grad_norm": 0.17699220776557922, + "learning_rate": 0.00020967749717916513, + "loss": 0.0141, + "num_input_tokens_seen": 192462096, + "step": 89190 + }, + { + "epoch": 14.550570962479608, + "grad_norm": 0.004565078299492598, + "learning_rate": 0.00020961954858098037, + "loss": 0.0088, + "num_input_tokens_seen": 192473040, + "step": 89195 + }, + { + "epoch": 14.551386623164763, + "grad_norm": 0.0022960626520216465, + "learning_rate": 0.0002095616058677239, + "loss": 0.0065, + "num_input_tokens_seen": 192483568, + "step": 89200 + }, + { + "epoch": 14.552202283849919, + "grad_norm": 0.0008615105762146413, + "learning_rate": 0.00020950366904056984, + "loss": 0.0046, + "num_input_tokens_seen": 192493680, + "step": 89205 + }, + { + "epoch": 14.553017944535073, + "grad_norm": 0.005762571003288031, + "learning_rate": 0.00020944573810069252, + "loss": 0.0049, + "num_input_tokens_seen": 192504592, + "step": 89210 + }, + { + "epoch": 14.553833605220229, + "grad_norm": 0.013701875694096088, + "learning_rate": 0.00020938781304926586, + "loss": 0.0042, + "num_input_tokens_seen": 192514960, + "step": 89215 + }, + { + "epoch": 14.554649265905383, + "grad_norm": 0.000595643010456115, + "learning_rate": 0.00020932989388746387, + "loss": 0.013, + "num_input_tokens_seen": 192525808, + "step": 89220 + }, + { + "epoch": 14.555464926590538, + "grad_norm": 0.0008307473035529256, + "learning_rate": 0.0002092719806164603, + "loss": 0.045, + "num_input_tokens_seen": 192536976, + "step": 89225 + }, + { + "epoch": 14.556280587275694, + "grad_norm": 0.0002317542675882578, + "learning_rate": 0.00020921407323742892, + "loss": 0.009, + "num_input_tokens_seen": 192548816, + "step": 89230 + }, + { + "epoch": 14.557096247960848, + "grad_norm": 0.14110343158245087, + "learning_rate": 0.00020915617175154316, + "loss": 0.0783, + "num_input_tokens_seen": 192559888, + "step": 89235 + }, + { + "epoch": 14.557911908646004, + "grad_norm": 0.0049491929821670055, + "learning_rate": 0.00020909827615997657, + "loss": 0.004, + "num_input_tokens_seen": 192571248, + "step": 89240 + }, + { + "epoch": 14.558727569331158, + "grad_norm": 0.0028622259851545095, + "learning_rate": 0.00020904038646390246, + "loss": 0.0024, + "num_input_tokens_seen": 192581840, + "step": 89245 + }, + { + "epoch": 14.559543230016313, + "grad_norm": 0.0019221862312406301, + "learning_rate": 0.00020898250266449399, + "loss": 0.0199, + "num_input_tokens_seen": 192591952, + "step": 89250 + }, + { + "epoch": 14.560358890701469, + "grad_norm": 0.014529379084706306, + "learning_rate": 0.0002089246247629243, + "loss": 0.0044, + "num_input_tokens_seen": 192602800, + "step": 89255 + }, + { + "epoch": 14.561174551386623, + "grad_norm": 0.10432250797748566, + "learning_rate": 0.00020886675276036637, + "loss": 0.0022, + "num_input_tokens_seen": 192613360, + "step": 89260 + }, + { + "epoch": 14.561990212071779, + "grad_norm": 0.002487804973497987, + "learning_rate": 0.00020880888665799304, + "loss": 0.0015, + "num_input_tokens_seen": 192623952, + "step": 89265 + }, + { + "epoch": 14.562805872756933, + "grad_norm": 0.5053357481956482, + "learning_rate": 0.00020875102645697696, + "loss": 0.018, + "num_input_tokens_seen": 192635344, + "step": 89270 + }, + { + "epoch": 14.563621533442088, + "grad_norm": 0.04799091815948486, + "learning_rate": 0.0002086931721584908, + "loss": 0.0048, + "num_input_tokens_seen": 192646736, + "step": 89275 + }, + { + "epoch": 14.564437194127244, + "grad_norm": 0.015837689861655235, + "learning_rate": 0.00020863532376370715, + "loss": 0.0008, + "num_input_tokens_seen": 192657200, + "step": 89280 + }, + { + "epoch": 14.565252854812398, + "grad_norm": 0.011453093029558659, + "learning_rate": 0.000208577481273798, + "loss": 0.0066, + "num_input_tokens_seen": 192667984, + "step": 89285 + }, + { + "epoch": 14.566068515497554, + "grad_norm": 0.0002871434553526342, + "learning_rate": 0.00020851964468993612, + "loss": 0.1229, + "num_input_tokens_seen": 192680336, + "step": 89290 + }, + { + "epoch": 14.566884176182707, + "grad_norm": 0.003082169685512781, + "learning_rate": 0.00020846181401329338, + "loss": 0.0019, + "num_input_tokens_seen": 192691728, + "step": 89295 + }, + { + "epoch": 14.567699836867863, + "grad_norm": 0.00770993297919631, + "learning_rate": 0.00020840398924504188, + "loss": 0.0017, + "num_input_tokens_seen": 192703408, + "step": 89300 + }, + { + "epoch": 14.568515497553017, + "grad_norm": 0.0006581484922207892, + "learning_rate": 0.0002083461703863534, + "loss": 0.0009, + "num_input_tokens_seen": 192713264, + "step": 89305 + }, + { + "epoch": 14.569331158238173, + "grad_norm": 0.0182223878800869, + "learning_rate": 0.0002082883574383998, + "loss": 0.0085, + "num_input_tokens_seen": 192723568, + "step": 89310 + }, + { + "epoch": 14.570146818923329, + "grad_norm": 0.003162509063258767, + "learning_rate": 0.00020823055040235266, + "loss": 0.0308, + "num_input_tokens_seen": 192732464, + "step": 89315 + }, + { + "epoch": 14.570962479608482, + "grad_norm": 0.09682769328355789, + "learning_rate": 0.0002081727492793836, + "loss": 0.009, + "num_input_tokens_seen": 192744112, + "step": 89320 + }, + { + "epoch": 14.571778140293638, + "grad_norm": 0.025619342923164368, + "learning_rate": 0.00020811495407066394, + "loss": 0.0014, + "num_input_tokens_seen": 192755600, + "step": 89325 + }, + { + "epoch": 14.572593800978792, + "grad_norm": 0.2514514923095703, + "learning_rate": 0.00020805716477736508, + "loss": 0.0061, + "num_input_tokens_seen": 192766640, + "step": 89330 + }, + { + "epoch": 14.573409461663948, + "grad_norm": 0.004714816343039274, + "learning_rate": 0.00020799938140065804, + "loss": 0.0014, + "num_input_tokens_seen": 192777968, + "step": 89335 + }, + { + "epoch": 14.574225122349104, + "grad_norm": 0.047039661556482315, + "learning_rate": 0.00020794160394171403, + "loss": 0.0021, + "num_input_tokens_seen": 192788784, + "step": 89340 + }, + { + "epoch": 14.575040783034257, + "grad_norm": 0.001344418851658702, + "learning_rate": 0.00020788383240170395, + "loss": 0.0021, + "num_input_tokens_seen": 192800880, + "step": 89345 + }, + { + "epoch": 14.575856443719413, + "grad_norm": 0.07869241386651993, + "learning_rate": 0.0002078260667817985, + "loss": 0.0024, + "num_input_tokens_seen": 192811632, + "step": 89350 + }, + { + "epoch": 14.576672104404567, + "grad_norm": 0.004596829880028963, + "learning_rate": 0.0002077683070831685, + "loss": 0.0031, + "num_input_tokens_seen": 192822928, + "step": 89355 + }, + { + "epoch": 14.577487765089723, + "grad_norm": 0.002012643963098526, + "learning_rate": 0.00020771055330698446, + "loss": 0.002, + "num_input_tokens_seen": 192833680, + "step": 89360 + }, + { + "epoch": 14.578303425774878, + "grad_norm": 0.01818975806236267, + "learning_rate": 0.0002076528054544169, + "loss": 0.0051, + "num_input_tokens_seen": 192844016, + "step": 89365 + }, + { + "epoch": 14.579119086460032, + "grad_norm": 0.000734273693524301, + "learning_rate": 0.00020759506352663605, + "loss": 0.0145, + "num_input_tokens_seen": 192854864, + "step": 89370 + }, + { + "epoch": 14.579934747145188, + "grad_norm": 0.00042447407031431794, + "learning_rate": 0.0002075373275248122, + "loss": 0.0011, + "num_input_tokens_seen": 192865808, + "step": 89375 + }, + { + "epoch": 14.580750407830342, + "grad_norm": 0.0055851018987596035, + "learning_rate": 0.00020747959745011542, + "loss": 0.0019, + "num_input_tokens_seen": 192876880, + "step": 89380 + }, + { + "epoch": 14.581566068515498, + "grad_norm": 0.0013472469290718436, + "learning_rate": 0.0002074218733037157, + "loss": 0.0352, + "num_input_tokens_seen": 192887728, + "step": 89385 + }, + { + "epoch": 14.582381729200652, + "grad_norm": 0.6183289289474487, + "learning_rate": 0.00020736415508678285, + "loss": 0.0461, + "num_input_tokens_seen": 192900400, + "step": 89390 + }, + { + "epoch": 14.583197389885807, + "grad_norm": 1.1001635789871216, + "learning_rate": 0.0002073064428004865, + "loss": 0.1301, + "num_input_tokens_seen": 192912336, + "step": 89395 + }, + { + "epoch": 14.584013050570963, + "grad_norm": 0.00695022102445364, + "learning_rate": 0.00020724873644599668, + "loss": 0.0006, + "num_input_tokens_seen": 192922960, + "step": 89400 + }, + { + "epoch": 14.584828711256117, + "grad_norm": 0.0020951468031853437, + "learning_rate": 0.0002071910360244823, + "loss": 0.0032, + "num_input_tokens_seen": 192933872, + "step": 89405 + }, + { + "epoch": 14.585644371941273, + "grad_norm": 0.021033864468336105, + "learning_rate": 0.0002071333415371134, + "loss": 0.0055, + "num_input_tokens_seen": 192945232, + "step": 89410 + }, + { + "epoch": 14.586460032626427, + "grad_norm": 0.014605917036533356, + "learning_rate": 0.00020707565298505842, + "loss": 0.0109, + "num_input_tokens_seen": 192954992, + "step": 89415 + }, + { + "epoch": 14.587275693311582, + "grad_norm": 0.0018626763485372066, + "learning_rate": 0.00020701797036948739, + "loss": 0.0014, + "num_input_tokens_seen": 192965072, + "step": 89420 + }, + { + "epoch": 14.588091353996738, + "grad_norm": 0.01722113788127899, + "learning_rate": 0.00020696029369156844, + "loss": 0.0025, + "num_input_tokens_seen": 192974576, + "step": 89425 + }, + { + "epoch": 14.588907014681892, + "grad_norm": 0.17909961938858032, + "learning_rate": 0.0002069026229524711, + "loss": 0.0088, + "num_input_tokens_seen": 192985424, + "step": 89430 + }, + { + "epoch": 14.589722675367048, + "grad_norm": 0.006187803111970425, + "learning_rate": 0.00020684495815336392, + "loss": 0.0008, + "num_input_tokens_seen": 192996624, + "step": 89435 + }, + { + "epoch": 14.590538336052202, + "grad_norm": 0.017531028017401695, + "learning_rate": 0.00020678729929541552, + "loss": 0.0021, + "num_input_tokens_seen": 193005648, + "step": 89440 + }, + { + "epoch": 14.591353996737357, + "grad_norm": 0.007145280484110117, + "learning_rate": 0.00020672964637979453, + "loss": 0.0023, + "num_input_tokens_seen": 193017392, + "step": 89445 + }, + { + "epoch": 14.592169657422513, + "grad_norm": 1.1156888008117676, + "learning_rate": 0.00020667199940766924, + "loss": 0.0221, + "num_input_tokens_seen": 193029040, + "step": 89450 + }, + { + "epoch": 14.592985318107667, + "grad_norm": 0.04135286808013916, + "learning_rate": 0.00020661435838020798, + "loss": 0.0019, + "num_input_tokens_seen": 193040144, + "step": 89455 + }, + { + "epoch": 14.593800978792823, + "grad_norm": 0.0022409185767173767, + "learning_rate": 0.000206556723298579, + "loss": 0.0018, + "num_input_tokens_seen": 193050704, + "step": 89460 + }, + { + "epoch": 14.594616639477977, + "grad_norm": 0.00035441608633846045, + "learning_rate": 0.00020649909416395025, + "loss": 0.0012, + "num_input_tokens_seen": 193061648, + "step": 89465 + }, + { + "epoch": 14.595432300163132, + "grad_norm": 0.002821630332618952, + "learning_rate": 0.00020644147097748967, + "loss": 0.0016, + "num_input_tokens_seen": 193071312, + "step": 89470 + }, + { + "epoch": 14.596247960848288, + "grad_norm": 0.0003396196698304266, + "learning_rate": 0.0002063838537403651, + "loss": 0.0063, + "num_input_tokens_seen": 193081648, + "step": 89475 + }, + { + "epoch": 14.597063621533442, + "grad_norm": 0.002032766817137599, + "learning_rate": 0.00020632624245374426, + "loss": 0.0062, + "num_input_tokens_seen": 193092144, + "step": 89480 + }, + { + "epoch": 14.597879282218598, + "grad_norm": 0.037333693355321884, + "learning_rate": 0.0002062686371187946, + "loss": 0.0206, + "num_input_tokens_seen": 193103344, + "step": 89485 + }, + { + "epoch": 14.598694942903752, + "grad_norm": 0.003661263966932893, + "learning_rate": 0.00020621103773668366, + "loss": 0.0044, + "num_input_tokens_seen": 193113712, + "step": 89490 + }, + { + "epoch": 14.599510603588907, + "grad_norm": 0.019343892112374306, + "learning_rate": 0.00020615344430857874, + "loss": 0.005, + "num_input_tokens_seen": 193124752, + "step": 89495 + }, + { + "epoch": 14.600326264274061, + "grad_norm": 0.002923226449638605, + "learning_rate": 0.00020609585683564687, + "loss": 0.0745, + "num_input_tokens_seen": 193135888, + "step": 89500 + }, + { + "epoch": 14.601141924959217, + "grad_norm": 0.05610254779458046, + "learning_rate": 0.00020603827531905566, + "loss": 0.004, + "num_input_tokens_seen": 193146960, + "step": 89505 + }, + { + "epoch": 14.601957585644373, + "grad_norm": 0.00014962207933422178, + "learning_rate": 0.00020598069975997135, + "loss": 0.0004, + "num_input_tokens_seen": 193156304, + "step": 89510 + }, + { + "epoch": 14.602773246329527, + "grad_norm": 0.15361471474170685, + "learning_rate": 0.0002059231301595615, + "loss": 0.0193, + "num_input_tokens_seen": 193165808, + "step": 89515 + }, + { + "epoch": 14.603588907014682, + "grad_norm": 0.007565390318632126, + "learning_rate": 0.00020586556651899213, + "loss": 0.0158, + "num_input_tokens_seen": 193175632, + "step": 89520 + }, + { + "epoch": 14.604404567699836, + "grad_norm": 0.000803881324827671, + "learning_rate": 0.00020580800883943058, + "loss": 0.0025, + "num_input_tokens_seen": 193187792, + "step": 89525 + }, + { + "epoch": 14.605220228384992, + "grad_norm": 0.00019740602874662727, + "learning_rate": 0.00020575045712204254, + "loss": 0.0025, + "num_input_tokens_seen": 193198192, + "step": 89530 + }, + { + "epoch": 14.606035889070148, + "grad_norm": 0.00045178603613749146, + "learning_rate": 0.00020569291136799512, + "loss": 0.0041, + "num_input_tokens_seen": 193208368, + "step": 89535 + }, + { + "epoch": 14.606851549755302, + "grad_norm": 0.0034555280581116676, + "learning_rate": 0.00020563537157845392, + "loss": 0.0271, + "num_input_tokens_seen": 193219440, + "step": 89540 + }, + { + "epoch": 14.607667210440457, + "grad_norm": 0.0011977417161688209, + "learning_rate": 0.0002055778377545856, + "loss": 0.0006, + "num_input_tokens_seen": 193230512, + "step": 89545 + }, + { + "epoch": 14.608482871125611, + "grad_norm": 0.00887078233063221, + "learning_rate": 0.0002055203098975556, + "loss": 0.0005, + "num_input_tokens_seen": 193240784, + "step": 89550 + }, + { + "epoch": 14.609298531810767, + "grad_norm": 0.029846560209989548, + "learning_rate": 0.00020546278800853048, + "loss": 0.0036, + "num_input_tokens_seen": 193250160, + "step": 89555 + }, + { + "epoch": 14.61011419249592, + "grad_norm": 0.008185265585780144, + "learning_rate": 0.00020540527208867522, + "loss": 0.0014, + "num_input_tokens_seen": 193260080, + "step": 89560 + }, + { + "epoch": 14.610929853181077, + "grad_norm": 0.0793921947479248, + "learning_rate": 0.00020534776213915619, + "loss": 0.0033, + "num_input_tokens_seen": 193269936, + "step": 89565 + }, + { + "epoch": 14.611745513866232, + "grad_norm": 0.0003837935801129788, + "learning_rate": 0.00020529025816113817, + "loss": 0.0023, + "num_input_tokens_seen": 193280272, + "step": 89570 + }, + { + "epoch": 14.612561174551386, + "grad_norm": 0.006542261689901352, + "learning_rate": 0.00020523276015578713, + "loss": 0.0069, + "num_input_tokens_seen": 193291984, + "step": 89575 + }, + { + "epoch": 14.613376835236542, + "grad_norm": 0.0010552759049460292, + "learning_rate": 0.0002051752681242682, + "loss": 0.0156, + "num_input_tokens_seen": 193303760, + "step": 89580 + }, + { + "epoch": 14.614192495921696, + "grad_norm": 0.005941393319517374, + "learning_rate": 0.0002051177820677464, + "loss": 0.0025, + "num_input_tokens_seen": 193314224, + "step": 89585 + }, + { + "epoch": 14.615008156606851, + "grad_norm": 0.032502103596925735, + "learning_rate": 0.00020506030198738683, + "loss": 0.0016, + "num_input_tokens_seen": 193324848, + "step": 89590 + }, + { + "epoch": 14.615823817292007, + "grad_norm": 0.8257111310958862, + "learning_rate": 0.00020500282788435441, + "loss": 0.0202, + "num_input_tokens_seen": 193335920, + "step": 89595 + }, + { + "epoch": 14.616639477977161, + "grad_norm": 1.0040146112442017, + "learning_rate": 0.00020494535975981398, + "loss": 0.025, + "num_input_tokens_seen": 193346960, + "step": 89600 + }, + { + "epoch": 14.617455138662317, + "grad_norm": 0.00017141558055300266, + "learning_rate": 0.0002048878976149301, + "loss": 0.1074, + "num_input_tokens_seen": 193359568, + "step": 89605 + }, + { + "epoch": 14.61827079934747, + "grad_norm": 0.1479143351316452, + "learning_rate": 0.00020483044145086732, + "loss": 0.0083, + "num_input_tokens_seen": 193369840, + "step": 89610 + }, + { + "epoch": 14.619086460032626, + "grad_norm": 0.0011429755249992013, + "learning_rate": 0.00020477299126879013, + "loss": 0.0002, + "num_input_tokens_seen": 193379376, + "step": 89615 + }, + { + "epoch": 14.619902120717782, + "grad_norm": 0.03615289181470871, + "learning_rate": 0.00020471554706986273, + "loss": 0.0259, + "num_input_tokens_seen": 193389584, + "step": 89620 + }, + { + "epoch": 14.620717781402936, + "grad_norm": 0.00016827063518576324, + "learning_rate": 0.00020465810885524928, + "loss": 0.0007, + "num_input_tokens_seen": 193400464, + "step": 89625 + }, + { + "epoch": 14.621533442088092, + "grad_norm": 0.00030358630465343595, + "learning_rate": 0.0002046006766261142, + "loss": 0.0018, + "num_input_tokens_seen": 193411792, + "step": 89630 + }, + { + "epoch": 14.622349102773246, + "grad_norm": 0.349635511636734, + "learning_rate": 0.00020454325038362083, + "loss": 0.0118, + "num_input_tokens_seen": 193422576, + "step": 89635 + }, + { + "epoch": 14.623164763458401, + "grad_norm": 0.001868964172899723, + "learning_rate": 0.00020448583012893363, + "loss": 0.0256, + "num_input_tokens_seen": 193433872, + "step": 89640 + }, + { + "epoch": 14.623980424143557, + "grad_norm": 0.7112645506858826, + "learning_rate": 0.00020442841586321565, + "loss": 0.061, + "num_input_tokens_seen": 193444272, + "step": 89645 + }, + { + "epoch": 14.624796084828711, + "grad_norm": 0.0025172680616378784, + "learning_rate": 0.0002043710075876311, + "loss": 0.0021, + "num_input_tokens_seen": 193456080, + "step": 89650 + }, + { + "epoch": 14.625611745513867, + "grad_norm": 0.004848515149205923, + "learning_rate": 0.00020431360530334282, + "loss": 0.0018, + "num_input_tokens_seen": 193467888, + "step": 89655 + }, + { + "epoch": 14.62642740619902, + "grad_norm": 0.07796057313680649, + "learning_rate": 0.0002042562090115147, + "loss": 0.0663, + "num_input_tokens_seen": 193478000, + "step": 89660 + }, + { + "epoch": 14.627243066884176, + "grad_norm": 0.005459724459797144, + "learning_rate": 0.0002041988187133094, + "loss": 0.041, + "num_input_tokens_seen": 193487984, + "step": 89665 + }, + { + "epoch": 14.62805872756933, + "grad_norm": 0.027315054088830948, + "learning_rate": 0.00020414143440989062, + "loss": 0.0026, + "num_input_tokens_seen": 193498256, + "step": 89670 + }, + { + "epoch": 14.628874388254486, + "grad_norm": 0.004423979669809341, + "learning_rate": 0.00020408405610242063, + "loss": 0.0039, + "num_input_tokens_seen": 193508432, + "step": 89675 + }, + { + "epoch": 14.629690048939642, + "grad_norm": 0.02575138583779335, + "learning_rate": 0.000204026683792063, + "loss": 0.0033, + "num_input_tokens_seen": 193518064, + "step": 89680 + }, + { + "epoch": 14.630505709624796, + "grad_norm": 0.00018337595975026488, + "learning_rate": 0.00020396931747997978, + "loss": 0.0064, + "num_input_tokens_seen": 193529136, + "step": 89685 + }, + { + "epoch": 14.631321370309951, + "grad_norm": 0.0008599350112490356, + "learning_rate": 0.0002039119571673342, + "loss": 0.0007, + "num_input_tokens_seen": 193538640, + "step": 89690 + }, + { + "epoch": 14.632137030995105, + "grad_norm": 0.028386695310473442, + "learning_rate": 0.00020385460285528807, + "loss": 0.0056, + "num_input_tokens_seen": 193549488, + "step": 89695 + }, + { + "epoch": 14.632952691680261, + "grad_norm": 0.0008344990783371031, + "learning_rate": 0.0002037972545450044, + "loss": 0.0008, + "num_input_tokens_seen": 193559024, + "step": 89700 + }, + { + "epoch": 14.633768352365417, + "grad_norm": 0.0040653846226632595, + "learning_rate": 0.0002037399122376449, + "loss": 0.1477, + "num_input_tokens_seen": 193570352, + "step": 89705 + }, + { + "epoch": 14.63458401305057, + "grad_norm": 0.012042321264743805, + "learning_rate": 0.0002036825759343721, + "loss": 0.0094, + "num_input_tokens_seen": 193582096, + "step": 89710 + }, + { + "epoch": 14.635399673735726, + "grad_norm": 0.0028248876333236694, + "learning_rate": 0.0002036252456363476, + "loss": 0.0113, + "num_input_tokens_seen": 193592112, + "step": 89715 + }, + { + "epoch": 14.63621533442088, + "grad_norm": 0.005146393086761236, + "learning_rate": 0.00020356792134473356, + "loss": 0.0071, + "num_input_tokens_seen": 193603632, + "step": 89720 + }, + { + "epoch": 14.637030995106036, + "grad_norm": 0.0017959423130378127, + "learning_rate": 0.0002035106030606917, + "loss": 0.0334, + "num_input_tokens_seen": 193614736, + "step": 89725 + }, + { + "epoch": 14.63784665579119, + "grad_norm": 0.007601138669997454, + "learning_rate": 0.00020345329078538354, + "loss": 0.0014, + "num_input_tokens_seen": 193624656, + "step": 89730 + }, + { + "epoch": 14.638662316476346, + "grad_norm": 0.07916417717933655, + "learning_rate": 0.00020339598451997066, + "loss": 0.0032, + "num_input_tokens_seen": 193634960, + "step": 89735 + }, + { + "epoch": 14.639477977161501, + "grad_norm": 0.003668928286060691, + "learning_rate": 0.00020333868426561448, + "loss": 0.0225, + "num_input_tokens_seen": 193646576, + "step": 89740 + }, + { + "epoch": 14.640293637846655, + "grad_norm": 0.04410633444786072, + "learning_rate": 0.00020328139002347612, + "loss": 0.0163, + "num_input_tokens_seen": 193657136, + "step": 89745 + }, + { + "epoch": 14.641109298531811, + "grad_norm": 0.010206999257206917, + "learning_rate": 0.00020322410179471684, + "loss": 0.0008, + "num_input_tokens_seen": 193668944, + "step": 89750 + }, + { + "epoch": 14.641924959216965, + "grad_norm": 0.008274204097688198, + "learning_rate": 0.00020316681958049758, + "loss": 0.006, + "num_input_tokens_seen": 193679952, + "step": 89755 + }, + { + "epoch": 14.64274061990212, + "grad_norm": 0.010657594539225101, + "learning_rate": 0.00020310954338197934, + "loss": 0.0082, + "num_input_tokens_seen": 193691856, + "step": 89760 + }, + { + "epoch": 14.643556280587276, + "grad_norm": 0.08692426234483719, + "learning_rate": 0.00020305227320032283, + "loss": 0.004, + "num_input_tokens_seen": 193701968, + "step": 89765 + }, + { + "epoch": 14.64437194127243, + "grad_norm": 0.016580168157815933, + "learning_rate": 0.00020299500903668856, + "loss": 0.0028, + "num_input_tokens_seen": 193712528, + "step": 89770 + }, + { + "epoch": 14.645187601957586, + "grad_norm": 0.020174870267510414, + "learning_rate": 0.00020293775089223748, + "loss": 0.0016, + "num_input_tokens_seen": 193723664, + "step": 89775 + }, + { + "epoch": 14.64600326264274, + "grad_norm": 0.00850595161318779, + "learning_rate": 0.00020288049876812943, + "loss": 0.0007, + "num_input_tokens_seen": 193734576, + "step": 89780 + }, + { + "epoch": 14.646818923327896, + "grad_norm": 0.08473452925682068, + "learning_rate": 0.00020282325266552536, + "loss": 0.0053, + "num_input_tokens_seen": 193746192, + "step": 89785 + }, + { + "epoch": 14.647634584013051, + "grad_norm": 0.0015395291848108172, + "learning_rate": 0.0002027660125855847, + "loss": 0.0163, + "num_input_tokens_seen": 193755120, + "step": 89790 + }, + { + "epoch": 14.648450244698205, + "grad_norm": 0.0005728512187488377, + "learning_rate": 0.00020270877852946817, + "loss": 0.0006, + "num_input_tokens_seen": 193764880, + "step": 89795 + }, + { + "epoch": 14.649265905383361, + "grad_norm": 0.025792736560106277, + "learning_rate": 0.0002026515504983351, + "loss": 0.0074, + "num_input_tokens_seen": 193776304, + "step": 89800 + }, + { + "epoch": 14.650081566068515, + "grad_norm": 0.013596449978649616, + "learning_rate": 0.00020259432849334592, + "loss": 0.0153, + "num_input_tokens_seen": 193786544, + "step": 89805 + }, + { + "epoch": 14.65089722675367, + "grad_norm": 0.006325924303382635, + "learning_rate": 0.00020253711251565953, + "loss": 0.0156, + "num_input_tokens_seen": 193798000, + "step": 89810 + }, + { + "epoch": 14.651712887438826, + "grad_norm": 0.00043759215623140335, + "learning_rate": 0.00020247990256643634, + "loss": 0.0013, + "num_input_tokens_seen": 193810032, + "step": 89815 + }, + { + "epoch": 14.65252854812398, + "grad_norm": 0.013071759603917599, + "learning_rate": 0.000202422698646835, + "loss": 0.0014, + "num_input_tokens_seen": 193820240, + "step": 89820 + }, + { + "epoch": 14.653344208809136, + "grad_norm": 0.0008109352202154696, + "learning_rate": 0.00020236550075801535, + "loss": 0.0033, + "num_input_tokens_seen": 193830480, + "step": 89825 + }, + { + "epoch": 14.65415986949429, + "grad_norm": 0.14773964881896973, + "learning_rate": 0.0002023083089011364, + "loss": 0.0155, + "num_input_tokens_seen": 193841168, + "step": 89830 + }, + { + "epoch": 14.654975530179446, + "grad_norm": 0.00043861044105142355, + "learning_rate": 0.00020225112307735717, + "loss": 0.0455, + "num_input_tokens_seen": 193852752, + "step": 89835 + }, + { + "epoch": 14.655791190864601, + "grad_norm": 0.0030247292015701532, + "learning_rate": 0.00020219394328783668, + "loss": 0.003, + "num_input_tokens_seen": 193863824, + "step": 89840 + }, + { + "epoch": 14.656606851549755, + "grad_norm": 0.03485793247818947, + "learning_rate": 0.00020213676953373372, + "loss": 0.0476, + "num_input_tokens_seen": 193874640, + "step": 89845 + }, + { + "epoch": 14.65742251223491, + "grad_norm": 0.7707039713859558, + "learning_rate": 0.00020207960181620706, + "loss": 0.0186, + "num_input_tokens_seen": 193885520, + "step": 89850 + }, + { + "epoch": 14.658238172920065, + "grad_norm": 0.0025808352511376143, + "learning_rate": 0.00020202244013641513, + "loss": 0.0024, + "num_input_tokens_seen": 193897424, + "step": 89855 + }, + { + "epoch": 14.65905383360522, + "grad_norm": 0.0052098119631409645, + "learning_rate": 0.0002019652844955165, + "loss": 0.0067, + "num_input_tokens_seen": 193909264, + "step": 89860 + }, + { + "epoch": 14.659869494290374, + "grad_norm": 0.00428504403680563, + "learning_rate": 0.00020190813489466943, + "loss": 0.0012, + "num_input_tokens_seen": 193918480, + "step": 89865 + }, + { + "epoch": 14.66068515497553, + "grad_norm": 0.0008547519100829959, + "learning_rate": 0.00020185099133503216, + "loss": 0.0021, + "num_input_tokens_seen": 193929520, + "step": 89870 + }, + { + "epoch": 14.661500815660686, + "grad_norm": 0.0018663021037355065, + "learning_rate": 0.00020179385381776283, + "loss": 0.0006, + "num_input_tokens_seen": 193940880, + "step": 89875 + }, + { + "epoch": 14.66231647634584, + "grad_norm": 0.04657228663563728, + "learning_rate": 0.00020173672234401928, + "loss": 0.0068, + "num_input_tokens_seen": 193952560, + "step": 89880 + }, + { + "epoch": 14.663132137030995, + "grad_norm": 0.003102297428995371, + "learning_rate": 0.00020167959691495946, + "loss": 0.0025, + "num_input_tokens_seen": 193964208, + "step": 89885 + }, + { + "epoch": 14.66394779771615, + "grad_norm": 0.01273407507687807, + "learning_rate": 0.00020162247753174105, + "loss": 0.001, + "num_input_tokens_seen": 193974160, + "step": 89890 + }, + { + "epoch": 14.664763458401305, + "grad_norm": 0.0012462548911571503, + "learning_rate": 0.00020156536419552168, + "loss": 0.0004, + "num_input_tokens_seen": 193985520, + "step": 89895 + }, + { + "epoch": 14.66557911908646, + "grad_norm": 0.007393706124275923, + "learning_rate": 0.00020150825690745883, + "loss": 0.0635, + "num_input_tokens_seen": 193997008, + "step": 89900 + }, + { + "epoch": 14.666394779771615, + "grad_norm": 0.06467791646718979, + "learning_rate": 0.00020145115566870975, + "loss": 0.0034, + "num_input_tokens_seen": 194007472, + "step": 89905 + }, + { + "epoch": 14.66721044045677, + "grad_norm": 0.06493475288152695, + "learning_rate": 0.00020139406048043173, + "loss": 0.0185, + "num_input_tokens_seen": 194019888, + "step": 89910 + }, + { + "epoch": 14.668026101141924, + "grad_norm": 0.0012431129580363631, + "learning_rate": 0.00020133697134378176, + "loss": 0.0038, + "num_input_tokens_seen": 194029968, + "step": 89915 + }, + { + "epoch": 14.66884176182708, + "grad_norm": 0.17993931472301483, + "learning_rate": 0.0002012798882599173, + "loss": 0.0046, + "num_input_tokens_seen": 194040432, + "step": 89920 + }, + { + "epoch": 14.669657422512234, + "grad_norm": 0.006886349990963936, + "learning_rate": 0.00020122281122999443, + "loss": 0.0015, + "num_input_tokens_seen": 194050256, + "step": 89925 + }, + { + "epoch": 14.67047308319739, + "grad_norm": 0.012551396153867245, + "learning_rate": 0.00020116574025517053, + "loss": 0.0013, + "num_input_tokens_seen": 194061104, + "step": 89930 + }, + { + "epoch": 14.671288743882545, + "grad_norm": 0.003549652174115181, + "learning_rate": 0.00020110867533660204, + "loss": 0.0141, + "num_input_tokens_seen": 194071184, + "step": 89935 + }, + { + "epoch": 14.6721044045677, + "grad_norm": 0.012341232970356941, + "learning_rate": 0.00020105161647544534, + "loss": 0.0025, + "num_input_tokens_seen": 194082160, + "step": 89940 + }, + { + "epoch": 14.672920065252855, + "grad_norm": 0.003138220403343439, + "learning_rate": 0.00020099456367285695, + "loss": 0.0041, + "num_input_tokens_seen": 194094096, + "step": 89945 + }, + { + "epoch": 14.673735725938009, + "grad_norm": 0.008232166059315205, + "learning_rate": 0.00020093751692999302, + "loss": 0.0007, + "num_input_tokens_seen": 194105424, + "step": 89950 + }, + { + "epoch": 14.674551386623165, + "grad_norm": 0.021067682653665543, + "learning_rate": 0.00020088047624800966, + "loss": 0.0071, + "num_input_tokens_seen": 194115536, + "step": 89955 + }, + { + "epoch": 14.67536704730832, + "grad_norm": 0.0012995904544368386, + "learning_rate": 0.00020082344162806293, + "loss": 0.0147, + "num_input_tokens_seen": 194127856, + "step": 89960 + }, + { + "epoch": 14.676182707993474, + "grad_norm": 0.19958825409412384, + "learning_rate": 0.00020076641307130872, + "loss": 0.0153, + "num_input_tokens_seen": 194138480, + "step": 89965 + }, + { + "epoch": 14.67699836867863, + "grad_norm": 0.0074332114309072495, + "learning_rate": 0.00020070939057890275, + "loss": 0.0805, + "num_input_tokens_seen": 194148848, + "step": 89970 + }, + { + "epoch": 14.677814029363784, + "grad_norm": 0.0018908986821770668, + "learning_rate": 0.00020065237415200062, + "loss": 0.0058, + "num_input_tokens_seen": 194161424, + "step": 89975 + }, + { + "epoch": 14.67862969004894, + "grad_norm": 0.04879371076822281, + "learning_rate": 0.00020059536379175792, + "loss": 0.0026, + "num_input_tokens_seen": 194173872, + "step": 89980 + }, + { + "epoch": 14.679445350734095, + "grad_norm": 0.007000759243965149, + "learning_rate": 0.0002005383594993299, + "loss": 0.0011, + "num_input_tokens_seen": 194184560, + "step": 89985 + }, + { + "epoch": 14.68026101141925, + "grad_norm": 0.001079513574950397, + "learning_rate": 0.00020048136127587203, + "loss": 0.0076, + "num_input_tokens_seen": 194194800, + "step": 89990 + }, + { + "epoch": 14.681076672104405, + "grad_norm": 0.0011407058918848634, + "learning_rate": 0.0002004243691225393, + "loss": 0.0006, + "num_input_tokens_seen": 194205072, + "step": 89995 + }, + { + "epoch": 14.681892332789559, + "grad_norm": 0.00045537023106589913, + "learning_rate": 0.00020036738304048674, + "loss": 0.0006, + "num_input_tokens_seen": 194215216, + "step": 90000 + }, + { + "epoch": 14.682707993474715, + "grad_norm": 0.006771817337721586, + "learning_rate": 0.00020031040303086932, + "loss": 0.0043, + "num_input_tokens_seen": 194225584, + "step": 90005 + }, + { + "epoch": 14.68352365415987, + "grad_norm": 0.0009942397009581327, + "learning_rate": 0.00020025342909484173, + "loss": 0.0067, + "num_input_tokens_seen": 194236528, + "step": 90010 + }, + { + "epoch": 14.684339314845024, + "grad_norm": 0.07654161751270294, + "learning_rate": 0.00020019646123355868, + "loss": 0.0028, + "num_input_tokens_seen": 194247344, + "step": 90015 + }, + { + "epoch": 14.68515497553018, + "grad_norm": 0.0008052191114984453, + "learning_rate": 0.00020013949944817466, + "loss": 0.002, + "num_input_tokens_seen": 194257776, + "step": 90020 + }, + { + "epoch": 14.685970636215334, + "grad_norm": 0.0019854079000651836, + "learning_rate": 0.00020008254373984408, + "loss": 0.0038, + "num_input_tokens_seen": 194269264, + "step": 90025 + }, + { + "epoch": 14.68678629690049, + "grad_norm": 0.0026570523623377085, + "learning_rate": 0.00020002559410972121, + "loss": 0.0186, + "num_input_tokens_seen": 194279600, + "step": 90030 + }, + { + "epoch": 14.687601957585644, + "grad_norm": 0.08695809543132782, + "learning_rate": 0.00019996865055896008, + "loss": 0.0069, + "num_input_tokens_seen": 194289808, + "step": 90035 + }, + { + "epoch": 14.6884176182708, + "grad_norm": 0.22850275039672852, + "learning_rate": 0.0001999117130887152, + "loss": 0.0044, + "num_input_tokens_seen": 194301520, + "step": 90040 + }, + { + "epoch": 14.689233278955955, + "grad_norm": 0.0030070829670876265, + "learning_rate": 0.00019985478170013977, + "loss": 0.0021, + "num_input_tokens_seen": 194311280, + "step": 90045 + }, + { + "epoch": 14.690048939641109, + "grad_norm": 0.006501371040940285, + "learning_rate": 0.00019979785639438836, + "loss": 0.0006, + "num_input_tokens_seen": 194322608, + "step": 90050 + }, + { + "epoch": 14.690864600326265, + "grad_norm": 0.0006940042367205024, + "learning_rate": 0.00019974093717261383, + "loss": 0.0146, + "num_input_tokens_seen": 194332592, + "step": 90055 + }, + { + "epoch": 14.691680261011419, + "grad_norm": 0.0024099715519696474, + "learning_rate": 0.0001996840240359703, + "loss": 0.007, + "num_input_tokens_seen": 194343472, + "step": 90060 + }, + { + "epoch": 14.692495921696574, + "grad_norm": 0.00037129331030882895, + "learning_rate": 0.00019962711698561097, + "loss": 0.0508, + "num_input_tokens_seen": 194354032, + "step": 90065 + }, + { + "epoch": 14.69331158238173, + "grad_norm": 0.002983122831210494, + "learning_rate": 0.0001995702160226892, + "loss": 0.0006, + "num_input_tokens_seen": 194363952, + "step": 90070 + }, + { + "epoch": 14.694127243066884, + "grad_norm": 0.00017071844195015728, + "learning_rate": 0.00019951332114835808, + "loss": 0.0014, + "num_input_tokens_seen": 194374928, + "step": 90075 + }, + { + "epoch": 14.69494290375204, + "grad_norm": 0.004869018215686083, + "learning_rate": 0.00019945643236377074, + "loss": 0.0114, + "num_input_tokens_seen": 194385584, + "step": 90080 + }, + { + "epoch": 14.695758564437194, + "grad_norm": 0.6279474496841431, + "learning_rate": 0.00019939954967008005, + "loss": 0.0554, + "num_input_tokens_seen": 194395600, + "step": 90085 + }, + { + "epoch": 14.69657422512235, + "grad_norm": 0.018211599439382553, + "learning_rate": 0.00019934267306843885, + "loss": 0.0017, + "num_input_tokens_seen": 194407952, + "step": 90090 + }, + { + "epoch": 14.697389885807503, + "grad_norm": 0.05170668289065361, + "learning_rate": 0.0001992858025599998, + "loss": 0.0013, + "num_input_tokens_seen": 194419024, + "step": 90095 + }, + { + "epoch": 14.698205546492659, + "grad_norm": 0.00021628155081998557, + "learning_rate": 0.00019922893814591541, + "loss": 0.0003, + "num_input_tokens_seen": 194431152, + "step": 90100 + }, + { + "epoch": 14.699021207177815, + "grad_norm": 0.031851474195718765, + "learning_rate": 0.00019917207982733814, + "loss": 0.0043, + "num_input_tokens_seen": 194441200, + "step": 90105 + }, + { + "epoch": 14.699836867862969, + "grad_norm": 0.00028537330217659473, + "learning_rate": 0.00019911522760542028, + "loss": 0.0199, + "num_input_tokens_seen": 194452528, + "step": 90110 + }, + { + "epoch": 14.700652528548124, + "grad_norm": 0.05469457432627678, + "learning_rate": 0.0001990583814813141, + "loss": 0.0037, + "num_input_tokens_seen": 194463760, + "step": 90115 + }, + { + "epoch": 14.701468189233278, + "grad_norm": 0.018047045916318893, + "learning_rate": 0.00019900154145617157, + "loss": 0.0226, + "num_input_tokens_seen": 194473840, + "step": 90120 + }, + { + "epoch": 14.702283849918434, + "grad_norm": 0.0007513340096920729, + "learning_rate": 0.00019894470753114456, + "loss": 0.0008, + "num_input_tokens_seen": 194485168, + "step": 90125 + }, + { + "epoch": 14.70309951060359, + "grad_norm": 0.0080259433016181, + "learning_rate": 0.00019888787970738508, + "loss": 0.0106, + "num_input_tokens_seen": 194495984, + "step": 90130 + }, + { + "epoch": 14.703915171288743, + "grad_norm": 0.0002602690365165472, + "learning_rate": 0.00019883105798604468, + "loss": 0.0043, + "num_input_tokens_seen": 194506704, + "step": 90135 + }, + { + "epoch": 14.7047308319739, + "grad_norm": 0.015412014909088612, + "learning_rate": 0.00019877424236827473, + "loss": 0.0012, + "num_input_tokens_seen": 194517584, + "step": 90140 + }, + { + "epoch": 14.705546492659053, + "grad_norm": 0.0011823793174698949, + "learning_rate": 0.00019871743285522725, + "loss": 0.0081, + "num_input_tokens_seen": 194529392, + "step": 90145 + }, + { + "epoch": 14.706362153344209, + "grad_norm": 0.0040732515044510365, + "learning_rate": 0.0001986606294480529, + "loss": 0.0021, + "num_input_tokens_seen": 194540240, + "step": 90150 + }, + { + "epoch": 14.707177814029365, + "grad_norm": 0.00585609395056963, + "learning_rate": 0.00019860383214790345, + "loss": 0.0007, + "num_input_tokens_seen": 194551664, + "step": 90155 + }, + { + "epoch": 14.707993474714518, + "grad_norm": 0.04264798015356064, + "learning_rate": 0.0001985470409559294, + "loss": 0.0114, + "num_input_tokens_seen": 194562896, + "step": 90160 + }, + { + "epoch": 14.708809135399674, + "grad_norm": 0.004952174611389637, + "learning_rate": 0.00019849025587328228, + "loss": 0.0004, + "num_input_tokens_seen": 194573200, + "step": 90165 + }, + { + "epoch": 14.709624796084828, + "grad_norm": 0.0032812152057886124, + "learning_rate": 0.00019843347690111235, + "loss": 0.0008, + "num_input_tokens_seen": 194583760, + "step": 90170 + }, + { + "epoch": 14.710440456769984, + "grad_norm": 0.0012905189068987966, + "learning_rate": 0.00019837670404057085, + "loss": 0.0007, + "num_input_tokens_seen": 194594704, + "step": 90175 + }, + { + "epoch": 14.71125611745514, + "grad_norm": 0.001098940847441554, + "learning_rate": 0.00019831993729280774, + "loss": 0.0969, + "num_input_tokens_seen": 194605488, + "step": 90180 + }, + { + "epoch": 14.712071778140293, + "grad_norm": 0.007208923809230328, + "learning_rate": 0.0001982631766589742, + "loss": 0.1868, + "num_input_tokens_seen": 194616624, + "step": 90185 + }, + { + "epoch": 14.71288743882545, + "grad_norm": 0.0007471378776244819, + "learning_rate": 0.00019820642214021979, + "loss": 0.0266, + "num_input_tokens_seen": 194626896, + "step": 90190 + }, + { + "epoch": 14.713703099510603, + "grad_norm": 0.0022779193241149187, + "learning_rate": 0.00019814967373769544, + "loss": 0.0223, + "num_input_tokens_seen": 194638576, + "step": 90195 + }, + { + "epoch": 14.714518760195759, + "grad_norm": 0.00017099222168326378, + "learning_rate": 0.00019809293145255048, + "loss": 0.0057, + "num_input_tokens_seen": 194649424, + "step": 90200 + }, + { + "epoch": 14.715334420880914, + "grad_norm": 0.00048232366680167615, + "learning_rate": 0.00019803619528593547, + "loss": 0.0017, + "num_input_tokens_seen": 194660432, + "step": 90205 + }, + { + "epoch": 14.716150081566068, + "grad_norm": 0.0007757663843221962, + "learning_rate": 0.00019797946523900006, + "loss": 0.0003, + "num_input_tokens_seen": 194670768, + "step": 90210 + }, + { + "epoch": 14.716965742251224, + "grad_norm": 0.007320431061089039, + "learning_rate": 0.0001979227413128939, + "loss": 0.0024, + "num_input_tokens_seen": 194681456, + "step": 90215 + }, + { + "epoch": 14.717781402936378, + "grad_norm": 0.0037358838599175215, + "learning_rate": 0.0001978660235087666, + "loss": 0.0044, + "num_input_tokens_seen": 194693296, + "step": 90220 + }, + { + "epoch": 14.718597063621534, + "grad_norm": 1.1678922176361084, + "learning_rate": 0.00019780931182776762, + "loss": 0.1413, + "num_input_tokens_seen": 194703952, + "step": 90225 + }, + { + "epoch": 14.719412724306688, + "grad_norm": 0.0008596403640694916, + "learning_rate": 0.0001977526062710463, + "loss": 0.0008, + "num_input_tokens_seen": 194715728, + "step": 90230 + }, + { + "epoch": 14.720228384991843, + "grad_norm": 0.012537977658212185, + "learning_rate": 0.0001976959068397518, + "loss": 0.001, + "num_input_tokens_seen": 194726640, + "step": 90235 + }, + { + "epoch": 14.721044045676999, + "grad_norm": 0.0022540949285030365, + "learning_rate": 0.00019763921353503335, + "loss": 0.0055, + "num_input_tokens_seen": 194737680, + "step": 90240 + }, + { + "epoch": 14.721859706362153, + "grad_norm": 0.022405214607715607, + "learning_rate": 0.0001975825263580397, + "loss": 0.0026, + "num_input_tokens_seen": 194746768, + "step": 90245 + }, + { + "epoch": 14.722675367047309, + "grad_norm": 0.0005993006634525955, + "learning_rate": 0.00019752584530991984, + "loss": 0.0132, + "num_input_tokens_seen": 194756880, + "step": 90250 + }, + { + "epoch": 14.723491027732463, + "grad_norm": 0.0005108333425596356, + "learning_rate": 0.00019746917039182226, + "loss": 0.0101, + "num_input_tokens_seen": 194767440, + "step": 90255 + }, + { + "epoch": 14.724306688417618, + "grad_norm": 0.0009564626961946487, + "learning_rate": 0.0001974125016048961, + "loss": 0.0458, + "num_input_tokens_seen": 194778352, + "step": 90260 + }, + { + "epoch": 14.725122349102774, + "grad_norm": 0.0035904073156416416, + "learning_rate": 0.0001973558389502891, + "loss": 0.0651, + "num_input_tokens_seen": 194789104, + "step": 90265 + }, + { + "epoch": 14.725938009787928, + "grad_norm": 0.010129266418516636, + "learning_rate": 0.0001972991824291503, + "loss": 0.0201, + "num_input_tokens_seen": 194799760, + "step": 90270 + }, + { + "epoch": 14.726753670473084, + "grad_norm": 0.01517915166914463, + "learning_rate": 0.00019724253204262717, + "loss": 0.001, + "num_input_tokens_seen": 194810416, + "step": 90275 + }, + { + "epoch": 14.727569331158238, + "grad_norm": 0.13117088377475739, + "learning_rate": 0.00019718588779186864, + "loss": 0.0051, + "num_input_tokens_seen": 194820880, + "step": 90280 + }, + { + "epoch": 14.728384991843393, + "grad_norm": 0.08906326442956924, + "learning_rate": 0.00019712924967802182, + "loss": 0.0018, + "num_input_tokens_seen": 194831344, + "step": 90285 + }, + { + "epoch": 14.729200652528547, + "grad_norm": 0.0027404262218624353, + "learning_rate": 0.00019707261770223532, + "loss": 0.0013, + "num_input_tokens_seen": 194842352, + "step": 90290 + }, + { + "epoch": 14.730016313213703, + "grad_norm": 0.005311061628162861, + "learning_rate": 0.00019701599186565621, + "loss": 0.022, + "num_input_tokens_seen": 194853968, + "step": 90295 + }, + { + "epoch": 14.730831973898859, + "grad_norm": 0.009094677865505219, + "learning_rate": 0.00019695937216943272, + "loss": 0.0006, + "num_input_tokens_seen": 194864912, + "step": 90300 + }, + { + "epoch": 14.731647634584013, + "grad_norm": 0.020676128566265106, + "learning_rate": 0.00019690275861471168, + "loss": 0.0075, + "num_input_tokens_seen": 194875824, + "step": 90305 + }, + { + "epoch": 14.732463295269168, + "grad_norm": 0.0003037787973880768, + "learning_rate": 0.00019684615120264104, + "loss": 0.0221, + "num_input_tokens_seen": 194885936, + "step": 90310 + }, + { + "epoch": 14.733278955954322, + "grad_norm": 0.0017587352776899934, + "learning_rate": 0.00019678954993436736, + "loss": 0.0058, + "num_input_tokens_seen": 194895824, + "step": 90315 + }, + { + "epoch": 14.734094616639478, + "grad_norm": 0.018381793051958084, + "learning_rate": 0.00019673295481103847, + "loss": 0.0332, + "num_input_tokens_seen": 194907184, + "step": 90320 + }, + { + "epoch": 14.734910277324634, + "grad_norm": 0.008810536004602909, + "learning_rate": 0.00019667636583380066, + "loss": 0.0033, + "num_input_tokens_seen": 194916272, + "step": 90325 + }, + { + "epoch": 14.735725938009788, + "grad_norm": 0.08104515075683594, + "learning_rate": 0.0001966197830038014, + "loss": 0.0377, + "num_input_tokens_seen": 194927536, + "step": 90330 + }, + { + "epoch": 14.736541598694943, + "grad_norm": 0.00022084874217398465, + "learning_rate": 0.00019656320632218676, + "loss": 0.004, + "num_input_tokens_seen": 194937456, + "step": 90335 + }, + { + "epoch": 14.737357259380097, + "grad_norm": 0.06676784157752991, + "learning_rate": 0.00019650663579010401, + "loss": 0.0045, + "num_input_tokens_seen": 194948368, + "step": 90340 + }, + { + "epoch": 14.738172920065253, + "grad_norm": 0.001672516344115138, + "learning_rate": 0.00019645007140869897, + "loss": 0.0067, + "num_input_tokens_seen": 194958768, + "step": 90345 + }, + { + "epoch": 14.738988580750409, + "grad_norm": 0.006418735720217228, + "learning_rate": 0.00019639351317911853, + "loss": 0.0009, + "num_input_tokens_seen": 194969808, + "step": 90350 + }, + { + "epoch": 14.739804241435563, + "grad_norm": 0.02256779372692108, + "learning_rate": 0.00019633696110250864, + "loss": 0.0502, + "num_input_tokens_seen": 194981424, + "step": 90355 + }, + { + "epoch": 14.740619902120718, + "grad_norm": 0.03260838985443115, + "learning_rate": 0.0001962804151800155, + "loss": 0.0017, + "num_input_tokens_seen": 194991856, + "step": 90360 + }, + { + "epoch": 14.741435562805872, + "grad_norm": 0.010096519254148006, + "learning_rate": 0.00019622387541278497, + "loss": 0.0029, + "num_input_tokens_seen": 195003408, + "step": 90365 + }, + { + "epoch": 14.742251223491028, + "grad_norm": 0.0014346149982884526, + "learning_rate": 0.000196167341801963, + "loss": 0.0024, + "num_input_tokens_seen": 195014416, + "step": 90370 + }, + { + "epoch": 14.743066884176184, + "grad_norm": 0.00046522117918357253, + "learning_rate": 0.00019611081434869532, + "loss": 0.0014, + "num_input_tokens_seen": 195024528, + "step": 90375 + }, + { + "epoch": 14.743882544861338, + "grad_norm": 0.03918186575174332, + "learning_rate": 0.00019605429305412746, + "loss": 0.0136, + "num_input_tokens_seen": 195035248, + "step": 90380 + }, + { + "epoch": 14.744698205546493, + "grad_norm": 0.0002477726084180176, + "learning_rate": 0.00019599777791940497, + "loss": 0.0105, + "num_input_tokens_seen": 195046640, + "step": 90385 + }, + { + "epoch": 14.745513866231647, + "grad_norm": 0.03757929056882858, + "learning_rate": 0.00019594126894567315, + "loss": 0.0018, + "num_input_tokens_seen": 195058128, + "step": 90390 + }, + { + "epoch": 14.746329526916803, + "grad_norm": 0.015669500455260277, + "learning_rate": 0.00019588476613407725, + "loss": 0.0048, + "num_input_tokens_seen": 195070640, + "step": 90395 + }, + { + "epoch": 14.747145187601957, + "grad_norm": 0.003472062759101391, + "learning_rate": 0.00019582826948576215, + "loss": 0.0039, + "num_input_tokens_seen": 195080560, + "step": 90400 + }, + { + "epoch": 14.747960848287113, + "grad_norm": 0.00359158543869853, + "learning_rate": 0.00019577177900187342, + "loss": 0.0044, + "num_input_tokens_seen": 195092464, + "step": 90405 + }, + { + "epoch": 14.748776508972268, + "grad_norm": 1.2494508028030396, + "learning_rate": 0.0001957152946835552, + "loss": 0.0403, + "num_input_tokens_seen": 195103664, + "step": 90410 + }, + { + "epoch": 14.749592169657422, + "grad_norm": 0.0008314524311572313, + "learning_rate": 0.00019565881653195284, + "loss": 0.007, + "num_input_tokens_seen": 195114352, + "step": 90415 + }, + { + "epoch": 14.750407830342578, + "grad_norm": 0.6893738508224487, + "learning_rate": 0.00019560234454821034, + "loss": 0.0456, + "num_input_tokens_seen": 195125488, + "step": 90420 + }, + { + "epoch": 14.751223491027732, + "grad_norm": 0.0008506067679263651, + "learning_rate": 0.0001955458787334728, + "loss": 0.007, + "num_input_tokens_seen": 195135312, + "step": 90425 + }, + { + "epoch": 14.752039151712887, + "grad_norm": 0.0002450900210533291, + "learning_rate": 0.00019548941908888396, + "loss": 0.0003, + "num_input_tokens_seen": 195145744, + "step": 90430 + }, + { + "epoch": 14.752854812398043, + "grad_norm": 0.0020428961142897606, + "learning_rate": 0.00019543296561558865, + "loss": 0.0056, + "num_input_tokens_seen": 195156368, + "step": 90435 + }, + { + "epoch": 14.753670473083197, + "grad_norm": 0.0031792190857231617, + "learning_rate": 0.0001953765183147303, + "loss": 0.0913, + "num_input_tokens_seen": 195166960, + "step": 90440 + }, + { + "epoch": 14.754486133768353, + "grad_norm": 0.11402551084756851, + "learning_rate": 0.00019532007718745366, + "loss": 0.0081, + "num_input_tokens_seen": 195178928, + "step": 90445 + }, + { + "epoch": 14.755301794453507, + "grad_norm": 0.003515039337798953, + "learning_rate": 0.00019526364223490172, + "loss": 0.0169, + "num_input_tokens_seen": 195189776, + "step": 90450 + }, + { + "epoch": 14.756117455138662, + "grad_norm": 0.0005434277700260282, + "learning_rate": 0.00019520721345821907, + "loss": 0.0101, + "num_input_tokens_seen": 195200560, + "step": 90455 + }, + { + "epoch": 14.756933115823816, + "grad_norm": 0.0014098555548116565, + "learning_rate": 0.00019515079085854854, + "loss": 0.0726, + "num_input_tokens_seen": 195211632, + "step": 90460 + }, + { + "epoch": 14.757748776508972, + "grad_norm": 0.005730149336159229, + "learning_rate": 0.00019509437443703415, + "loss": 0.0029, + "num_input_tokens_seen": 195222224, + "step": 90465 + }, + { + "epoch": 14.758564437194128, + "grad_norm": 0.0021606602240353823, + "learning_rate": 0.00019503796419481908, + "loss": 0.0015, + "num_input_tokens_seen": 195234064, + "step": 90470 + }, + { + "epoch": 14.759380097879282, + "grad_norm": 0.09056062251329422, + "learning_rate": 0.00019498156013304647, + "loss": 0.0323, + "num_input_tokens_seen": 195243248, + "step": 90475 + }, + { + "epoch": 14.760195758564437, + "grad_norm": 0.0077574304305016994, + "learning_rate": 0.0001949251622528595, + "loss": 0.0027, + "num_input_tokens_seen": 195253328, + "step": 90480 + }, + { + "epoch": 14.761011419249591, + "grad_norm": 0.0041617825627326965, + "learning_rate": 0.0001948687705554012, + "loss": 0.0005, + "num_input_tokens_seen": 195263408, + "step": 90485 + }, + { + "epoch": 14.761827079934747, + "grad_norm": 0.00167485058773309, + "learning_rate": 0.00019481238504181431, + "loss": 0.001, + "num_input_tokens_seen": 195275056, + "step": 90490 + }, + { + "epoch": 14.762642740619903, + "grad_norm": 0.012457344681024551, + "learning_rate": 0.0001947560057132416, + "loss": 0.0019, + "num_input_tokens_seen": 195285584, + "step": 90495 + }, + { + "epoch": 14.763458401305057, + "grad_norm": 0.002297884551808238, + "learning_rate": 0.00019469963257082564, + "loss": 0.0198, + "num_input_tokens_seen": 195296656, + "step": 90500 + }, + { + "epoch": 14.764274061990212, + "grad_norm": 0.20740437507629395, + "learning_rate": 0.00019464326561570894, + "loss": 0.017, + "num_input_tokens_seen": 195307216, + "step": 90505 + }, + { + "epoch": 14.765089722675366, + "grad_norm": 0.015635017305612564, + "learning_rate": 0.0001945869048490338, + "loss": 0.0013, + "num_input_tokens_seen": 195317776, + "step": 90510 + }, + { + "epoch": 14.765905383360522, + "grad_norm": 0.011051137931644917, + "learning_rate": 0.00019453055027194256, + "loss": 0.0019, + "num_input_tokens_seen": 195328688, + "step": 90515 + }, + { + "epoch": 14.766721044045678, + "grad_norm": 0.0018045243341475725, + "learning_rate": 0.00019447420188557714, + "loss": 0.0017, + "num_input_tokens_seen": 195339952, + "step": 90520 + }, + { + "epoch": 14.767536704730832, + "grad_norm": 0.0007379131275229156, + "learning_rate": 0.00019441785969107967, + "loss": 0.0007, + "num_input_tokens_seen": 195350960, + "step": 90525 + }, + { + "epoch": 14.768352365415987, + "grad_norm": 0.001470520393922925, + "learning_rate": 0.00019436152368959193, + "loss": 0.0016, + "num_input_tokens_seen": 195361776, + "step": 90530 + }, + { + "epoch": 14.769168026101141, + "grad_norm": 0.025681860744953156, + "learning_rate": 0.0001943051938822556, + "loss": 0.0076, + "num_input_tokens_seen": 195373456, + "step": 90535 + }, + { + "epoch": 14.769983686786297, + "grad_norm": 0.0010437992168590426, + "learning_rate": 0.00019424887027021237, + "loss": 0.0013, + "num_input_tokens_seen": 195384816, + "step": 90540 + }, + { + "epoch": 14.770799347471453, + "grad_norm": 0.576299786567688, + "learning_rate": 0.00019419255285460347, + "loss": 0.0173, + "num_input_tokens_seen": 195395632, + "step": 90545 + }, + { + "epoch": 14.771615008156607, + "grad_norm": 0.0012524003395810723, + "learning_rate": 0.00019413624163657072, + "loss": 0.0185, + "num_input_tokens_seen": 195407248, + "step": 90550 + }, + { + "epoch": 14.772430668841762, + "grad_norm": 1.6163502931594849, + "learning_rate": 0.00019407993661725475, + "loss": 0.0397, + "num_input_tokens_seen": 195418896, + "step": 90555 + }, + { + "epoch": 14.773246329526916, + "grad_norm": 0.0010413825511932373, + "learning_rate": 0.0001940236377977973, + "loss": 0.0129, + "num_input_tokens_seen": 195429328, + "step": 90560 + }, + { + "epoch": 14.774061990212072, + "grad_norm": 0.04386414214968681, + "learning_rate": 0.00019396734517933867, + "loss": 0.0033, + "num_input_tokens_seen": 195440368, + "step": 90565 + }, + { + "epoch": 14.774877650897226, + "grad_norm": 0.0013385694473981857, + "learning_rate": 0.00019391105876302012, + "loss": 0.0017, + "num_input_tokens_seen": 195451216, + "step": 90570 + }, + { + "epoch": 14.775693311582382, + "grad_norm": 0.0002507951285224408, + "learning_rate": 0.00019385477854998235, + "loss": 0.0004, + "num_input_tokens_seen": 195462640, + "step": 90575 + }, + { + "epoch": 14.776508972267537, + "grad_norm": 0.0012413602089509368, + "learning_rate": 0.00019379850454136582, + "loss": 0.0016, + "num_input_tokens_seen": 195473456, + "step": 90580 + }, + { + "epoch": 14.777324632952691, + "grad_norm": 0.5016093850135803, + "learning_rate": 0.00019374223673831103, + "loss": 0.0282, + "num_input_tokens_seen": 195484112, + "step": 90585 + }, + { + "epoch": 14.778140293637847, + "grad_norm": 0.5469359755516052, + "learning_rate": 0.00019368597514195834, + "loss": 0.0128, + "num_input_tokens_seen": 195493776, + "step": 90590 + }, + { + "epoch": 14.778955954323001, + "grad_norm": 0.02712627500295639, + "learning_rate": 0.00019362971975344796, + "loss": 0.002, + "num_input_tokens_seen": 195504240, + "step": 90595 + }, + { + "epoch": 14.779771615008157, + "grad_norm": 0.014816144481301308, + "learning_rate": 0.00019357347057391994, + "loss": 0.0402, + "num_input_tokens_seen": 195514416, + "step": 90600 + }, + { + "epoch": 14.780587275693312, + "grad_norm": 0.0009532608673907816, + "learning_rate": 0.0001935172276045143, + "loss": 0.001, + "num_input_tokens_seen": 195524400, + "step": 90605 + }, + { + "epoch": 14.781402936378466, + "grad_norm": 0.6168203353881836, + "learning_rate": 0.0001934609908463708, + "loss": 0.1475, + "num_input_tokens_seen": 195535280, + "step": 90610 + }, + { + "epoch": 14.782218597063622, + "grad_norm": 0.0014157983241602778, + "learning_rate": 0.00019340476030062925, + "loss": 0.0055, + "num_input_tokens_seen": 195546832, + "step": 90615 + }, + { + "epoch": 14.783034257748776, + "grad_norm": 1.1413875818252563, + "learning_rate": 0.00019334853596842915, + "loss": 0.2846, + "num_input_tokens_seen": 195556688, + "step": 90620 + }, + { + "epoch": 14.783849918433932, + "grad_norm": 0.0032671098597347736, + "learning_rate": 0.00019329231785090994, + "loss": 0.0129, + "num_input_tokens_seen": 195566000, + "step": 90625 + }, + { + "epoch": 14.784665579119086, + "grad_norm": 0.004854480270296335, + "learning_rate": 0.0001932361059492111, + "loss": 0.0122, + "num_input_tokens_seen": 195576208, + "step": 90630 + }, + { + "epoch": 14.785481239804241, + "grad_norm": 0.0014693269040435553, + "learning_rate": 0.00019317990026447164, + "loss": 0.0011, + "num_input_tokens_seen": 195587344, + "step": 90635 + }, + { + "epoch": 14.786296900489397, + "grad_norm": 0.0011037163203582168, + "learning_rate": 0.00019312370079783075, + "loss": 0.0014, + "num_input_tokens_seen": 195597904, + "step": 90640 + }, + { + "epoch": 14.78711256117455, + "grad_norm": 0.13065768778324127, + "learning_rate": 0.0001930675075504274, + "loss": 0.0036, + "num_input_tokens_seen": 195609168, + "step": 90645 + }, + { + "epoch": 14.787928221859707, + "grad_norm": 0.005767814815044403, + "learning_rate": 0.00019301132052340031, + "loss": 0.0019, + "num_input_tokens_seen": 195619088, + "step": 90650 + }, + { + "epoch": 14.78874388254486, + "grad_norm": 0.09696496278047562, + "learning_rate": 0.0001929551397178883, + "loss": 0.0018, + "num_input_tokens_seen": 195630512, + "step": 90655 + }, + { + "epoch": 14.789559543230016, + "grad_norm": 0.0003517286095302552, + "learning_rate": 0.00019289896513502991, + "loss": 0.0025, + "num_input_tokens_seen": 195642512, + "step": 90660 + }, + { + "epoch": 14.790375203915172, + "grad_norm": 0.010269487276673317, + "learning_rate": 0.00019284279677596355, + "loss": 0.0014, + "num_input_tokens_seen": 195653936, + "step": 90665 + }, + { + "epoch": 14.791190864600326, + "grad_norm": 0.0010697557590901852, + "learning_rate": 0.0001927866346418276, + "loss": 0.0016, + "num_input_tokens_seen": 195664784, + "step": 90670 + }, + { + "epoch": 14.792006525285482, + "grad_norm": 0.0018980283057317138, + "learning_rate": 0.00019273047873376005, + "loss": 0.0007, + "num_input_tokens_seen": 195675888, + "step": 90675 + }, + { + "epoch": 14.792822185970635, + "grad_norm": 0.13122312724590302, + "learning_rate": 0.00019267432905289945, + "loss": 0.0925, + "num_input_tokens_seen": 195686768, + "step": 90680 + }, + { + "epoch": 14.793637846655791, + "grad_norm": 0.017379729077219963, + "learning_rate": 0.00019261818560038313, + "loss": 0.0018, + "num_input_tokens_seen": 195698544, + "step": 90685 + }, + { + "epoch": 14.794453507340947, + "grad_norm": 0.09783849120140076, + "learning_rate": 0.00019256204837734937, + "loss": 0.0033, + "num_input_tokens_seen": 195707408, + "step": 90690 + }, + { + "epoch": 14.7952691680261, + "grad_norm": 0.0017929052701219916, + "learning_rate": 0.00019250591738493572, + "loss": 0.0008, + "num_input_tokens_seen": 195718352, + "step": 90695 + }, + { + "epoch": 14.796084828711257, + "grad_norm": 0.011613667942583561, + "learning_rate": 0.00019244979262427974, + "loss": 0.0779, + "num_input_tokens_seen": 195729680, + "step": 90700 + }, + { + "epoch": 14.79690048939641, + "grad_norm": 0.023189745843410492, + "learning_rate": 0.00019239367409651893, + "loss": 0.0137, + "num_input_tokens_seen": 195739344, + "step": 90705 + }, + { + "epoch": 14.797716150081566, + "grad_norm": 0.038402754813432693, + "learning_rate": 0.00019233756180279043, + "loss": 0.0024, + "num_input_tokens_seen": 195749744, + "step": 90710 + }, + { + "epoch": 14.798531810766722, + "grad_norm": 0.0011591213988140225, + "learning_rate": 0.00019228145574423162, + "loss": 0.0028, + "num_input_tokens_seen": 195760720, + "step": 90715 + }, + { + "epoch": 14.799347471451876, + "grad_norm": 0.00891993846744299, + "learning_rate": 0.00019222535592197944, + "loss": 0.0049, + "num_input_tokens_seen": 195772176, + "step": 90720 + }, + { + "epoch": 14.800163132137031, + "grad_norm": 0.16907335817813873, + "learning_rate": 0.00019216926233717085, + "loss": 0.0082, + "num_input_tokens_seen": 195783568, + "step": 90725 + }, + { + "epoch": 14.800978792822185, + "grad_norm": 0.0001954362087417394, + "learning_rate": 0.0001921131749909427, + "loss": 0.0013, + "num_input_tokens_seen": 195794896, + "step": 90730 + }, + { + "epoch": 14.801794453507341, + "grad_norm": 0.004630944225937128, + "learning_rate": 0.00019205709388443165, + "loss": 0.0016, + "num_input_tokens_seen": 195804112, + "step": 90735 + }, + { + "epoch": 14.802610114192497, + "grad_norm": 0.0023959320969879627, + "learning_rate": 0.00019200101901877426, + "loss": 0.0005, + "num_input_tokens_seen": 195815056, + "step": 90740 + }, + { + "epoch": 14.80342577487765, + "grad_norm": 0.018310220912098885, + "learning_rate": 0.0001919449503951069, + "loss": 0.0024, + "num_input_tokens_seen": 195826032, + "step": 90745 + }, + { + "epoch": 14.804241435562806, + "grad_norm": 0.0009907520143315196, + "learning_rate": 0.00019188888801456594, + "loss": 0.0132, + "num_input_tokens_seen": 195836528, + "step": 90750 + }, + { + "epoch": 14.80505709624796, + "grad_norm": 0.006872013211250305, + "learning_rate": 0.0001918328318782875, + "loss": 0.0019, + "num_input_tokens_seen": 195845808, + "step": 90755 + }, + { + "epoch": 14.805872756933116, + "grad_norm": 0.006974945776164532, + "learning_rate": 0.00019177678198740766, + "loss": 0.0106, + "num_input_tokens_seen": 195856976, + "step": 90760 + }, + { + "epoch": 14.80668841761827, + "grad_norm": 0.00471109664067626, + "learning_rate": 0.00019172073834306235, + "loss": 0.0052, + "num_input_tokens_seen": 195866704, + "step": 90765 + }, + { + "epoch": 14.807504078303426, + "grad_norm": 0.5156394839286804, + "learning_rate": 0.00019166470094638739, + "loss": 0.0223, + "num_input_tokens_seen": 195878544, + "step": 90770 + }, + { + "epoch": 14.808319738988581, + "grad_norm": 0.005725428462028503, + "learning_rate": 0.00019160866979851842, + "loss": 0.0034, + "num_input_tokens_seen": 195890576, + "step": 90775 + }, + { + "epoch": 14.809135399673735, + "grad_norm": 0.009102820418775082, + "learning_rate": 0.00019155264490059077, + "loss": 0.0019, + "num_input_tokens_seen": 195901008, + "step": 90780 + }, + { + "epoch": 14.809951060358891, + "grad_norm": 0.006600756663829088, + "learning_rate": 0.00019149662625374042, + "loss": 0.0022, + "num_input_tokens_seen": 195912304, + "step": 90785 + }, + { + "epoch": 14.810766721044045, + "grad_norm": 0.0037188567221164703, + "learning_rate": 0.00019144061385910195, + "loss": 0.0045, + "num_input_tokens_seen": 195923568, + "step": 90790 + }, + { + "epoch": 14.8115823817292, + "grad_norm": 0.04923973232507706, + "learning_rate": 0.00019138460771781125, + "loss": 0.0024, + "num_input_tokens_seen": 195933776, + "step": 90795 + }, + { + "epoch": 14.812398042414356, + "grad_norm": 0.016255293041467667, + "learning_rate": 0.0001913286078310026, + "loss": 0.0046, + "num_input_tokens_seen": 195943632, + "step": 90800 + }, + { + "epoch": 14.81321370309951, + "grad_norm": 0.0015734318876639009, + "learning_rate": 0.00019127261419981168, + "loss": 0.0038, + "num_input_tokens_seen": 195954672, + "step": 90805 + }, + { + "epoch": 14.814029363784666, + "grad_norm": 0.08767221122980118, + "learning_rate": 0.0001912166268253725, + "loss": 0.0311, + "num_input_tokens_seen": 195965136, + "step": 90810 + }, + { + "epoch": 14.81484502446982, + "grad_norm": 0.00013895744632463902, + "learning_rate": 0.0001911606457088204, + "loss": 0.0033, + "num_input_tokens_seen": 195975152, + "step": 90815 + }, + { + "epoch": 14.815660685154976, + "grad_norm": 0.001294991816394031, + "learning_rate": 0.00019110467085128936, + "loss": 0.0045, + "num_input_tokens_seen": 195986160, + "step": 90820 + }, + { + "epoch": 14.81647634584013, + "grad_norm": 0.0001948714052559808, + "learning_rate": 0.00019104870225391412, + "loss": 0.0015, + "num_input_tokens_seen": 195996592, + "step": 90825 + }, + { + "epoch": 14.817292006525285, + "grad_norm": 0.026278002187609673, + "learning_rate": 0.0001909927399178289, + "loss": 0.0024, + "num_input_tokens_seen": 196007280, + "step": 90830 + }, + { + "epoch": 14.818107667210441, + "grad_norm": 0.0004448066756594926, + "learning_rate": 0.0001909367838441678, + "loss": 0.0114, + "num_input_tokens_seen": 196018704, + "step": 90835 + }, + { + "epoch": 14.818923327895595, + "grad_norm": 0.006399508565664291, + "learning_rate": 0.00019088083403406486, + "loss": 0.0039, + "num_input_tokens_seen": 196029456, + "step": 90840 + }, + { + "epoch": 14.81973898858075, + "grad_norm": 0.006001987494528294, + "learning_rate": 0.00019082489048865393, + "loss": 0.0027, + "num_input_tokens_seen": 196039440, + "step": 90845 + }, + { + "epoch": 14.820554649265905, + "grad_norm": 0.0006337161175906658, + "learning_rate": 0.00019076895320906885, + "loss": 0.1716, + "num_input_tokens_seen": 196049840, + "step": 90850 + }, + { + "epoch": 14.82137030995106, + "grad_norm": 0.00043385321623645723, + "learning_rate": 0.0001907130221964432, + "loss": 0.0003, + "num_input_tokens_seen": 196060272, + "step": 90855 + }, + { + "epoch": 14.822185970636216, + "grad_norm": 0.0028473250567913055, + "learning_rate": 0.0001906570974519105, + "loss": 0.0031, + "num_input_tokens_seen": 196072656, + "step": 90860 + }, + { + "epoch": 14.82300163132137, + "grad_norm": 0.08672349900007248, + "learning_rate": 0.00019060117897660417, + "loss": 0.0467, + "num_input_tokens_seen": 196081904, + "step": 90865 + }, + { + "epoch": 14.823817292006526, + "grad_norm": 0.9235002994537354, + "learning_rate": 0.00019054526677165744, + "loss": 0.0927, + "num_input_tokens_seen": 196092432, + "step": 90870 + }, + { + "epoch": 14.82463295269168, + "grad_norm": 0.0005866262945346534, + "learning_rate": 0.00019048936083820346, + "loss": 0.0006, + "num_input_tokens_seen": 196103920, + "step": 90875 + }, + { + "epoch": 14.825448613376835, + "grad_norm": 0.012682581320405006, + "learning_rate": 0.00019043346117737526, + "loss": 0.0023, + "num_input_tokens_seen": 196114896, + "step": 90880 + }, + { + "epoch": 14.826264274061991, + "grad_norm": 0.0001768836664268747, + "learning_rate": 0.00019037756779030545, + "loss": 0.0646, + "num_input_tokens_seen": 196125744, + "step": 90885 + }, + { + "epoch": 14.827079934747145, + "grad_norm": 0.0008818417554721236, + "learning_rate": 0.00019032168067812738, + "loss": 0.0023, + "num_input_tokens_seen": 196133968, + "step": 90890 + }, + { + "epoch": 14.8278955954323, + "grad_norm": 0.0005644970224238932, + "learning_rate": 0.00019026579984197296, + "loss": 0.0119, + "num_input_tokens_seen": 196145040, + "step": 90895 + }, + { + "epoch": 14.828711256117455, + "grad_norm": 0.0011725968215614557, + "learning_rate": 0.00019020992528297537, + "loss": 0.0006, + "num_input_tokens_seen": 196156112, + "step": 90900 + }, + { + "epoch": 14.82952691680261, + "grad_norm": 0.014635713770985603, + "learning_rate": 0.0001901540570022663, + "loss": 0.0017, + "num_input_tokens_seen": 196168048, + "step": 90905 + }, + { + "epoch": 14.830342577487766, + "grad_norm": 0.0017959743272513151, + "learning_rate": 0.0001900981950009787, + "loss": 0.0274, + "num_input_tokens_seen": 196179568, + "step": 90910 + }, + { + "epoch": 14.83115823817292, + "grad_norm": 0.08322062343358994, + "learning_rate": 0.00019004233928024395, + "loss": 0.0043, + "num_input_tokens_seen": 196188848, + "step": 90915 + }, + { + "epoch": 14.831973898858076, + "grad_norm": 0.004897326696664095, + "learning_rate": 0.0001899864898411947, + "loss": 0.0009, + "num_input_tokens_seen": 196198096, + "step": 90920 + }, + { + "epoch": 14.83278955954323, + "grad_norm": 0.0034920626785606146, + "learning_rate": 0.00018993064668496225, + "loss": 0.0158, + "num_input_tokens_seen": 196208848, + "step": 90925 + }, + { + "epoch": 14.833605220228385, + "grad_norm": 0.03683251142501831, + "learning_rate": 0.00018987480981267892, + "loss": 0.0025, + "num_input_tokens_seen": 196218960, + "step": 90930 + }, + { + "epoch": 14.83442088091354, + "grad_norm": 0.001790532493032515, + "learning_rate": 0.00018981897922547565, + "loss": 0.0837, + "num_input_tokens_seen": 196230288, + "step": 90935 + }, + { + "epoch": 14.835236541598695, + "grad_norm": 0.0013410423416644335, + "learning_rate": 0.00018976315492448453, + "loss": 0.0014, + "num_input_tokens_seen": 196239888, + "step": 90940 + }, + { + "epoch": 14.83605220228385, + "grad_norm": 0.10095477104187012, + "learning_rate": 0.00018970733691083637, + "loss": 0.0048, + "num_input_tokens_seen": 196251024, + "step": 90945 + }, + { + "epoch": 14.836867862969005, + "grad_norm": 0.040366191416978836, + "learning_rate": 0.000189651525185663, + "loss": 0.0314, + "num_input_tokens_seen": 196260528, + "step": 90950 + }, + { + "epoch": 14.83768352365416, + "grad_norm": 0.01338283158838749, + "learning_rate": 0.00018959571975009481, + "loss": 0.0029, + "num_input_tokens_seen": 196270800, + "step": 90955 + }, + { + "epoch": 14.838499184339314, + "grad_norm": 1.0016621351242065, + "learning_rate": 0.00018953992060526348, + "loss": 0.1004, + "num_input_tokens_seen": 196280816, + "step": 90960 + }, + { + "epoch": 14.83931484502447, + "grad_norm": 0.025823697447776794, + "learning_rate": 0.00018948412775229918, + "loss": 0.0165, + "num_input_tokens_seen": 196291248, + "step": 90965 + }, + { + "epoch": 14.840130505709626, + "grad_norm": 0.31389421224594116, + "learning_rate": 0.0001894283411923331, + "loss": 0.0048, + "num_input_tokens_seen": 196302160, + "step": 90970 + }, + { + "epoch": 14.84094616639478, + "grad_norm": 0.001859633019194007, + "learning_rate": 0.0001893725609264957, + "loss": 0.0007, + "num_input_tokens_seen": 196312304, + "step": 90975 + }, + { + "epoch": 14.841761827079935, + "grad_norm": 0.019886551424860954, + "learning_rate": 0.00018931678695591742, + "loss": 0.1416, + "num_input_tokens_seen": 196324048, + "step": 90980 + }, + { + "epoch": 14.84257748776509, + "grad_norm": 0.0004720330471172929, + "learning_rate": 0.00018926101928172856, + "loss": 0.005, + "num_input_tokens_seen": 196334384, + "step": 90985 + }, + { + "epoch": 14.843393148450245, + "grad_norm": 0.01991906762123108, + "learning_rate": 0.00018920525790505933, + "loss": 0.0196, + "num_input_tokens_seen": 196345808, + "step": 90990 + }, + { + "epoch": 14.844208809135399, + "grad_norm": 0.0073630912229418755, + "learning_rate": 0.00018914950282703985, + "loss": 0.0029, + "num_input_tokens_seen": 196356464, + "step": 90995 + }, + { + "epoch": 14.845024469820554, + "grad_norm": 0.0237130019813776, + "learning_rate": 0.00018909375404879998, + "loss": 0.0017, + "num_input_tokens_seen": 196368112, + "step": 91000 + }, + { + "epoch": 14.84584013050571, + "grad_norm": 0.003329494036734104, + "learning_rate": 0.00018903801157146965, + "loss": 0.0021, + "num_input_tokens_seen": 196378352, + "step": 91005 + }, + { + "epoch": 14.846655791190864, + "grad_norm": 0.01158680859953165, + "learning_rate": 0.00018898227539617852, + "loss": 0.1261, + "num_input_tokens_seen": 196389040, + "step": 91010 + }, + { + "epoch": 14.84747145187602, + "grad_norm": 0.001665710937231779, + "learning_rate": 0.0001889265455240561, + "loss": 0.1319, + "num_input_tokens_seen": 196400144, + "step": 91015 + }, + { + "epoch": 14.848287112561174, + "grad_norm": 0.41353392601013184, + "learning_rate": 0.00018887082195623167, + "loss": 0.0604, + "num_input_tokens_seen": 196412176, + "step": 91020 + }, + { + "epoch": 14.84910277324633, + "grad_norm": 0.06578317284584045, + "learning_rate": 0.00018881510469383506, + "loss": 0.0035, + "num_input_tokens_seen": 196423984, + "step": 91025 + }, + { + "epoch": 14.849918433931485, + "grad_norm": 0.012289333157241344, + "learning_rate": 0.00018875939373799483, + "loss": 0.0043, + "num_input_tokens_seen": 196433968, + "step": 91030 + }, + { + "epoch": 14.850734094616639, + "grad_norm": 0.490304172039032, + "learning_rate": 0.00018870368908984063, + "loss": 0.0608, + "num_input_tokens_seen": 196446096, + "step": 91035 + }, + { + "epoch": 14.851549755301795, + "grad_norm": 0.0007964250980876386, + "learning_rate": 0.00018864799075050078, + "loss": 0.0038, + "num_input_tokens_seen": 196456560, + "step": 91040 + }, + { + "epoch": 14.852365415986949, + "grad_norm": 0.0005342008080333471, + "learning_rate": 0.00018859229872110467, + "loss": 0.0772, + "num_input_tokens_seen": 196466544, + "step": 91045 + }, + { + "epoch": 14.853181076672104, + "grad_norm": 0.0030721756629645824, + "learning_rate": 0.00018853661300278034, + "loss": 0.0016, + "num_input_tokens_seen": 196477360, + "step": 91050 + }, + { + "epoch": 14.85399673735726, + "grad_norm": 0.009175121784210205, + "learning_rate": 0.00018848093359665703, + "loss": 0.0018, + "num_input_tokens_seen": 196488944, + "step": 91055 + }, + { + "epoch": 14.854812398042414, + "grad_norm": 0.009594283998012543, + "learning_rate": 0.0001884252605038624, + "loss": 0.0017, + "num_input_tokens_seen": 196499056, + "step": 91060 + }, + { + "epoch": 14.85562805872757, + "grad_norm": 0.001974226674064994, + "learning_rate": 0.00018836959372552553, + "loss": 0.0005, + "num_input_tokens_seen": 196509680, + "step": 91065 + }, + { + "epoch": 14.856443719412724, + "grad_norm": 0.0009759245440363884, + "learning_rate": 0.0001883139332627738, + "loss": 0.0134, + "num_input_tokens_seen": 196519856, + "step": 91070 + }, + { + "epoch": 14.85725938009788, + "grad_norm": 0.003209600457921624, + "learning_rate": 0.00018825827911673592, + "loss": 0.0006, + "num_input_tokens_seen": 196531088, + "step": 91075 + }, + { + "epoch": 14.858075040783035, + "grad_norm": 0.007221479434520006, + "learning_rate": 0.0001882026312885392, + "loss": 0.0034, + "num_input_tokens_seen": 196542384, + "step": 91080 + }, + { + "epoch": 14.858890701468189, + "grad_norm": 0.009790973737835884, + "learning_rate": 0.00018814698977931204, + "loss": 0.0013, + "num_input_tokens_seen": 196553648, + "step": 91085 + }, + { + "epoch": 14.859706362153345, + "grad_norm": 0.0009493590332567692, + "learning_rate": 0.0001880913545901814, + "loss": 0.0088, + "num_input_tokens_seen": 196564624, + "step": 91090 + }, + { + "epoch": 14.860522022838499, + "grad_norm": 0.10604451596736908, + "learning_rate": 0.00018803572572227546, + "loss": 0.0043, + "num_input_tokens_seen": 196575984, + "step": 91095 + }, + { + "epoch": 14.861337683523654, + "grad_norm": 0.046892113983631134, + "learning_rate": 0.000187980103176721, + "loss": 0.1041, + "num_input_tokens_seen": 196586352, + "step": 91100 + }, + { + "epoch": 14.86215334420881, + "grad_norm": 0.2630126476287842, + "learning_rate": 0.0001879244869546457, + "loss": 0.0101, + "num_input_tokens_seen": 196598224, + "step": 91105 + }, + { + "epoch": 14.862969004893964, + "grad_norm": 0.19180892407894135, + "learning_rate": 0.00018786887705717658, + "loss": 0.0053, + "num_input_tokens_seen": 196609840, + "step": 91110 + }, + { + "epoch": 14.86378466557912, + "grad_norm": 0.0405438207089901, + "learning_rate": 0.00018781327348544065, + "loss": 0.0023, + "num_input_tokens_seen": 196620176, + "step": 91115 + }, + { + "epoch": 14.864600326264274, + "grad_norm": 0.0008621526649221778, + "learning_rate": 0.00018775767624056472, + "loss": 0.0009, + "num_input_tokens_seen": 196631952, + "step": 91120 + }, + { + "epoch": 14.86541598694943, + "grad_norm": 0.032930050045251846, + "learning_rate": 0.0001877020853236756, + "loss": 0.0395, + "num_input_tokens_seen": 196642704, + "step": 91125 + }, + { + "epoch": 14.866231647634583, + "grad_norm": 0.0007372420514002442, + "learning_rate": 0.00018764650073589995, + "loss": 0.0031, + "num_input_tokens_seen": 196654544, + "step": 91130 + }, + { + "epoch": 14.867047308319739, + "grad_norm": 0.005048900842666626, + "learning_rate": 0.0001875909224783642, + "loss": 0.0012, + "num_input_tokens_seen": 196664656, + "step": 91135 + }, + { + "epoch": 14.867862969004895, + "grad_norm": 0.0053457641042768955, + "learning_rate": 0.00018753535055219468, + "loss": 0.003, + "num_input_tokens_seen": 196676400, + "step": 91140 + }, + { + "epoch": 14.868678629690049, + "grad_norm": 0.0018338457448408008, + "learning_rate": 0.0001874797849585177, + "loss": 0.0016, + "num_input_tokens_seen": 196687600, + "step": 91145 + }, + { + "epoch": 14.869494290375204, + "grad_norm": 0.010572639293968678, + "learning_rate": 0.00018742422569845935, + "loss": 0.0181, + "num_input_tokens_seen": 196697424, + "step": 91150 + }, + { + "epoch": 14.870309951060358, + "grad_norm": 0.0007677992107346654, + "learning_rate": 0.00018736867277314556, + "loss": 0.0011, + "num_input_tokens_seen": 196708496, + "step": 91155 + }, + { + "epoch": 14.871125611745514, + "grad_norm": 0.007629129569977522, + "learning_rate": 0.00018731312618370228, + "loss": 0.0008, + "num_input_tokens_seen": 196717872, + "step": 91160 + }, + { + "epoch": 14.87194127243067, + "grad_norm": 0.05745696276426315, + "learning_rate": 0.0001872575859312549, + "loss": 0.0026, + "num_input_tokens_seen": 196728944, + "step": 91165 + }, + { + "epoch": 14.872756933115824, + "grad_norm": 0.10047470033168793, + "learning_rate": 0.00018720205201692975, + "loss": 0.0065, + "num_input_tokens_seen": 196739152, + "step": 91170 + }, + { + "epoch": 14.87357259380098, + "grad_norm": 0.12461056560277939, + "learning_rate": 0.00018714652444185137, + "loss": 0.0037, + "num_input_tokens_seen": 196749712, + "step": 91175 + }, + { + "epoch": 14.874388254486133, + "grad_norm": 0.003091733669862151, + "learning_rate": 0.00018709100320714594, + "loss": 0.0643, + "num_input_tokens_seen": 196760560, + "step": 91180 + }, + { + "epoch": 14.875203915171289, + "grad_norm": 0.0011258155573159456, + "learning_rate": 0.00018703548831393795, + "loss": 0.03, + "num_input_tokens_seen": 196770448, + "step": 91185 + }, + { + "epoch": 14.876019575856443, + "grad_norm": 0.03516976907849312, + "learning_rate": 0.00018697997976335317, + "loss": 0.0262, + "num_input_tokens_seen": 196781968, + "step": 91190 + }, + { + "epoch": 14.876835236541599, + "grad_norm": 0.001609648228622973, + "learning_rate": 0.0001869244775565158, + "loss": 0.0191, + "num_input_tokens_seen": 196792848, + "step": 91195 + }, + { + "epoch": 14.877650897226754, + "grad_norm": 0.05825599655508995, + "learning_rate": 0.00018686898169455147, + "loss": 0.004, + "num_input_tokens_seen": 196804464, + "step": 91200 + }, + { + "epoch": 14.878466557911908, + "grad_norm": 0.0006430086796171963, + "learning_rate": 0.00018681349217858408, + "loss": 0.0032, + "num_input_tokens_seen": 196814512, + "step": 91205 + }, + { + "epoch": 14.879282218597064, + "grad_norm": 0.005379104055464268, + "learning_rate": 0.00018675800900973876, + "loss": 0.0024, + "num_input_tokens_seen": 196823728, + "step": 91210 + }, + { + "epoch": 14.880097879282218, + "grad_norm": 0.027618926018476486, + "learning_rate": 0.00018670253218913975, + "loss": 0.002, + "num_input_tokens_seen": 196833584, + "step": 91215 + }, + { + "epoch": 14.880913539967374, + "grad_norm": 0.028396662324666977, + "learning_rate": 0.00018664706171791134, + "loss": 0.008, + "num_input_tokens_seen": 196844944, + "step": 91220 + }, + { + "epoch": 14.88172920065253, + "grad_norm": 0.003062706207856536, + "learning_rate": 0.0001865915975971778, + "loss": 0.0612, + "num_input_tokens_seen": 196856464, + "step": 91225 + }, + { + "epoch": 14.882544861337683, + "grad_norm": 0.0221099853515625, + "learning_rate": 0.00018653613982806311, + "loss": 0.004, + "num_input_tokens_seen": 196866800, + "step": 91230 + }, + { + "epoch": 14.883360522022839, + "grad_norm": 0.04425922781229019, + "learning_rate": 0.0001864806884116912, + "loss": 0.0024, + "num_input_tokens_seen": 196878224, + "step": 91235 + }, + { + "epoch": 14.884176182707993, + "grad_norm": 0.014148225076496601, + "learning_rate": 0.00018642524334918582, + "loss": 0.0035, + "num_input_tokens_seen": 196889136, + "step": 91240 + }, + { + "epoch": 14.884991843393149, + "grad_norm": 0.020783808082342148, + "learning_rate": 0.00018636980464167076, + "loss": 0.0296, + "num_input_tokens_seen": 196900112, + "step": 91245 + }, + { + "epoch": 14.885807504078304, + "grad_norm": 0.39799290895462036, + "learning_rate": 0.00018631437229026942, + "loss": 0.0156, + "num_input_tokens_seen": 196911856, + "step": 91250 + }, + { + "epoch": 14.886623164763458, + "grad_norm": 0.001985697541385889, + "learning_rate": 0.0001862589462961053, + "loss": 0.0006, + "num_input_tokens_seen": 196922896, + "step": 91255 + }, + { + "epoch": 14.887438825448614, + "grad_norm": 0.0019351370865479112, + "learning_rate": 0.0001862035266603016, + "loss": 0.0037, + "num_input_tokens_seen": 196934032, + "step": 91260 + }, + { + "epoch": 14.888254486133768, + "grad_norm": 0.567835807800293, + "learning_rate": 0.00018614811338398153, + "loss": 0.1026, + "num_input_tokens_seen": 196944688, + "step": 91265 + }, + { + "epoch": 14.889070146818923, + "grad_norm": 0.016492810100317, + "learning_rate": 0.0001860927064682681, + "loss": 0.0195, + "num_input_tokens_seen": 196954416, + "step": 91270 + }, + { + "epoch": 14.88988580750408, + "grad_norm": 0.015770548954606056, + "learning_rate": 0.0001860373059142842, + "loss": 0.0042, + "num_input_tokens_seen": 196966224, + "step": 91275 + }, + { + "epoch": 14.890701468189233, + "grad_norm": 0.0033044000156223774, + "learning_rate": 0.00018598191172315253, + "loss": 0.0063, + "num_input_tokens_seen": 196976176, + "step": 91280 + }, + { + "epoch": 14.891517128874389, + "grad_norm": 0.0721779316663742, + "learning_rate": 0.00018592652389599583, + "loss": 0.0038, + "num_input_tokens_seen": 196987216, + "step": 91285 + }, + { + "epoch": 14.892332789559543, + "grad_norm": 0.019662391394376755, + "learning_rate": 0.00018587114243393655, + "loss": 0.0424, + "num_input_tokens_seen": 196997552, + "step": 91290 + }, + { + "epoch": 14.893148450244698, + "grad_norm": 0.024452630430459976, + "learning_rate": 0.00018581576733809707, + "loss": 0.0047, + "num_input_tokens_seen": 197007984, + "step": 91295 + }, + { + "epoch": 14.893964110929852, + "grad_norm": 0.015099613927304745, + "learning_rate": 0.00018576039860959966, + "loss": 0.0014, + "num_input_tokens_seen": 197018960, + "step": 91300 + }, + { + "epoch": 14.894779771615008, + "grad_norm": 0.1823662668466568, + "learning_rate": 0.00018570503624956635, + "loss": 0.0066, + "num_input_tokens_seen": 197030544, + "step": 91305 + }, + { + "epoch": 14.895595432300164, + "grad_norm": 0.03604745864868164, + "learning_rate": 0.00018564968025911905, + "loss": 0.0021, + "num_input_tokens_seen": 197040240, + "step": 91310 + }, + { + "epoch": 14.896411092985318, + "grad_norm": 0.028495369479060173, + "learning_rate": 0.00018559433063937997, + "loss": 0.0017, + "num_input_tokens_seen": 197051152, + "step": 91315 + }, + { + "epoch": 14.897226753670473, + "grad_norm": 0.1599021703004837, + "learning_rate": 0.00018553898739147057, + "loss": 0.0073, + "num_input_tokens_seen": 197062384, + "step": 91320 + }, + { + "epoch": 14.898042414355627, + "grad_norm": 0.003061867319047451, + "learning_rate": 0.00018548365051651255, + "loss": 0.0008, + "num_input_tokens_seen": 197073424, + "step": 91325 + }, + { + "epoch": 14.898858075040783, + "grad_norm": 0.005883309058845043, + "learning_rate": 0.00018542832001562732, + "loss": 0.0016, + "num_input_tokens_seen": 197085136, + "step": 91330 + }, + { + "epoch": 14.899673735725939, + "grad_norm": 0.03733917325735092, + "learning_rate": 0.00018537299588993627, + "loss": 0.1176, + "num_input_tokens_seen": 197096336, + "step": 91335 + }, + { + "epoch": 14.900489396411093, + "grad_norm": 0.0010095015168190002, + "learning_rate": 0.0001853176781405606, + "loss": 0.1294, + "num_input_tokens_seen": 197107568, + "step": 91340 + }, + { + "epoch": 14.901305057096248, + "grad_norm": 0.0015444637974724174, + "learning_rate": 0.00018526236676862134, + "loss": 0.0055, + "num_input_tokens_seen": 197118672, + "step": 91345 + }, + { + "epoch": 14.902120717781402, + "grad_norm": 0.004932647105306387, + "learning_rate": 0.00018520706177523955, + "loss": 0.0045, + "num_input_tokens_seen": 197129168, + "step": 91350 + }, + { + "epoch": 14.902936378466558, + "grad_norm": 0.013107793405652046, + "learning_rate": 0.000185151763161536, + "loss": 0.0027, + "num_input_tokens_seen": 197138992, + "step": 91355 + }, + { + "epoch": 14.903752039151712, + "grad_norm": 0.08017133176326752, + "learning_rate": 0.0001850964709286313, + "loss": 0.0033, + "num_input_tokens_seen": 197149904, + "step": 91360 + }, + { + "epoch": 14.904567699836868, + "grad_norm": 0.007489494979381561, + "learning_rate": 0.00018504118507764618, + "loss": 0.0025, + "num_input_tokens_seen": 197160144, + "step": 91365 + }, + { + "epoch": 14.905383360522023, + "grad_norm": 0.743222177028656, + "learning_rate": 0.00018498590560970098, + "loss": 0.0257, + "num_input_tokens_seen": 197171280, + "step": 91370 + }, + { + "epoch": 14.906199021207177, + "grad_norm": 0.005178035236895084, + "learning_rate": 0.00018493063252591596, + "loss": 0.002, + "num_input_tokens_seen": 197182256, + "step": 91375 + }, + { + "epoch": 14.907014681892333, + "grad_norm": 0.20283222198486328, + "learning_rate": 0.00018487536582741142, + "loss": 0.0144, + "num_input_tokens_seen": 197192784, + "step": 91380 + }, + { + "epoch": 14.907830342577487, + "grad_norm": 0.0067565771751105785, + "learning_rate": 0.00018482010551530736, + "loss": 0.0008, + "num_input_tokens_seen": 197201488, + "step": 91385 + }, + { + "epoch": 14.908646003262643, + "grad_norm": 0.015818143263459206, + "learning_rate": 0.00018476485159072371, + "loss": 0.0032, + "num_input_tokens_seen": 197211984, + "step": 91390 + }, + { + "epoch": 14.909461663947798, + "grad_norm": 0.0944415032863617, + "learning_rate": 0.0001847096040547802, + "loss": 0.0046, + "num_input_tokens_seen": 197223536, + "step": 91395 + }, + { + "epoch": 14.910277324632952, + "grad_norm": 0.0002998944546561688, + "learning_rate": 0.00018465436290859662, + "loss": 0.0018, + "num_input_tokens_seen": 197234768, + "step": 91400 + }, + { + "epoch": 14.911092985318108, + "grad_norm": 1.0219206809997559, + "learning_rate": 0.00018459912815329234, + "loss": 0.0603, + "num_input_tokens_seen": 197246256, + "step": 91405 + }, + { + "epoch": 14.911908646003262, + "grad_norm": 0.0933808833360672, + "learning_rate": 0.00018454389978998686, + "loss": 0.0021, + "num_input_tokens_seen": 197257648, + "step": 91410 + }, + { + "epoch": 14.912724306688418, + "grad_norm": 0.000812934769783169, + "learning_rate": 0.00018448867781979943, + "loss": 0.0431, + "num_input_tokens_seen": 197267888, + "step": 91415 + }, + { + "epoch": 14.913539967373573, + "grad_norm": 0.0050510624423623085, + "learning_rate": 0.00018443346224384906, + "loss": 0.0026, + "num_input_tokens_seen": 197278352, + "step": 91420 + }, + { + "epoch": 14.914355628058727, + "grad_norm": 0.004607289098203182, + "learning_rate": 0.00018437825306325524, + "loss": 0.0111, + "num_input_tokens_seen": 197288528, + "step": 91425 + }, + { + "epoch": 14.915171288743883, + "grad_norm": 0.0017318588215857744, + "learning_rate": 0.00018432305027913615, + "loss": 0.0569, + "num_input_tokens_seen": 197300080, + "step": 91430 + }, + { + "epoch": 14.915986949429037, + "grad_norm": 0.00020641331502702087, + "learning_rate": 0.00018426785389261124, + "loss": 0.0336, + "num_input_tokens_seen": 197311120, + "step": 91435 + }, + { + "epoch": 14.916802610114193, + "grad_norm": 0.048458538949489594, + "learning_rate": 0.00018421266390479846, + "loss": 0.002, + "num_input_tokens_seen": 197321520, + "step": 91440 + }, + { + "epoch": 14.917618270799348, + "grad_norm": 0.0066768513061106205, + "learning_rate": 0.00018415748031681706, + "loss": 0.0027, + "num_input_tokens_seen": 197332528, + "step": 91445 + }, + { + "epoch": 14.918433931484502, + "grad_norm": 0.09895544499158859, + "learning_rate": 0.0001841023031297846, + "loss": 0.0134, + "num_input_tokens_seen": 197343248, + "step": 91450 + }, + { + "epoch": 14.919249592169658, + "grad_norm": 0.15971337258815765, + "learning_rate": 0.0001840471323448199, + "loss": 0.0706, + "num_input_tokens_seen": 197354832, + "step": 91455 + }, + { + "epoch": 14.920065252854812, + "grad_norm": 0.14043289422988892, + "learning_rate": 0.00018399196796304085, + "loss": 0.0066, + "num_input_tokens_seen": 197365200, + "step": 91460 + }, + { + "epoch": 14.920880913539968, + "grad_norm": 0.0024445117451250553, + "learning_rate": 0.0001839368099855655, + "loss": 0.0005, + "num_input_tokens_seen": 197375440, + "step": 91465 + }, + { + "epoch": 14.921696574225122, + "grad_norm": 0.01664692535996437, + "learning_rate": 0.00018388165841351162, + "loss": 0.0064, + "num_input_tokens_seen": 197386064, + "step": 91470 + }, + { + "epoch": 14.922512234910277, + "grad_norm": 0.08200176060199738, + "learning_rate": 0.000183826513247997, + "loss": 0.0217, + "num_input_tokens_seen": 197397296, + "step": 91475 + }, + { + "epoch": 14.923327895595433, + "grad_norm": 0.0005583091988228261, + "learning_rate": 0.0001837713744901391, + "loss": 0.03, + "num_input_tokens_seen": 197408144, + "step": 91480 + }, + { + "epoch": 14.924143556280587, + "grad_norm": 0.00534237502142787, + "learning_rate": 0.00018371624214105553, + "loss": 0.0131, + "num_input_tokens_seen": 197419216, + "step": 91485 + }, + { + "epoch": 14.924959216965743, + "grad_norm": 0.0016707798931747675, + "learning_rate": 0.00018366111620186348, + "loss": 0.0028, + "num_input_tokens_seen": 197429392, + "step": 91490 + }, + { + "epoch": 14.925774877650896, + "grad_norm": 0.005360128823667765, + "learning_rate": 0.0001836059966736803, + "loss": 0.0075, + "num_input_tokens_seen": 197440048, + "step": 91495 + }, + { + "epoch": 14.926590538336052, + "grad_norm": 0.0009816560195758939, + "learning_rate": 0.0001835508835576229, + "loss": 0.0087, + "num_input_tokens_seen": 197451664, + "step": 91500 + }, + { + "epoch": 14.927406199021208, + "grad_norm": 0.0019654834177345037, + "learning_rate": 0.00018349577685480834, + "loss": 0.0074, + "num_input_tokens_seen": 197463856, + "step": 91505 + }, + { + "epoch": 14.928221859706362, + "grad_norm": 0.0009648834820836782, + "learning_rate": 0.0001834406765663534, + "loss": 0.0023, + "num_input_tokens_seen": 197474480, + "step": 91510 + }, + { + "epoch": 14.929037520391518, + "grad_norm": 0.09128442406654358, + "learning_rate": 0.00018338558269337464, + "loss": 0.0038, + "num_input_tokens_seen": 197486320, + "step": 91515 + }, + { + "epoch": 14.929853181076671, + "grad_norm": 0.009674196131527424, + "learning_rate": 0.00018333049523698876, + "loss": 0.0033, + "num_input_tokens_seen": 197496880, + "step": 91520 + }, + { + "epoch": 14.930668841761827, + "grad_norm": 0.0011627740459516644, + "learning_rate": 0.00018327541419831196, + "loss": 0.0036, + "num_input_tokens_seen": 197506384, + "step": 91525 + }, + { + "epoch": 14.931484502446983, + "grad_norm": 0.013783317990601063, + "learning_rate": 0.00018322033957846097, + "loss": 0.0064, + "num_input_tokens_seen": 197516400, + "step": 91530 + }, + { + "epoch": 14.932300163132137, + "grad_norm": 0.010391286574304104, + "learning_rate": 0.00018316527137855138, + "loss": 0.0009, + "num_input_tokens_seen": 197527632, + "step": 91535 + }, + { + "epoch": 14.933115823817293, + "grad_norm": 0.038576073944568634, + "learning_rate": 0.00018311020959969982, + "loss": 0.0038, + "num_input_tokens_seen": 197538736, + "step": 91540 + }, + { + "epoch": 14.933931484502446, + "grad_norm": 0.001383214257657528, + "learning_rate": 0.0001830551542430215, + "loss": 0.003, + "num_input_tokens_seen": 197550448, + "step": 91545 + }, + { + "epoch": 14.934747145187602, + "grad_norm": 0.046550724655389786, + "learning_rate": 0.0001830001053096329, + "loss": 0.004, + "num_input_tokens_seen": 197561712, + "step": 91550 + }, + { + "epoch": 14.935562805872756, + "grad_norm": 0.015697911381721497, + "learning_rate": 0.000182945062800649, + "loss": 0.0035, + "num_input_tokens_seen": 197572176, + "step": 91555 + }, + { + "epoch": 14.936378466557912, + "grad_norm": 0.0006332009215839207, + "learning_rate": 0.0001828900267171859, + "loss": 0.1072, + "num_input_tokens_seen": 197583152, + "step": 91560 + }, + { + "epoch": 14.937194127243067, + "grad_norm": 0.14607708156108856, + "learning_rate": 0.0001828349970603584, + "loss": 0.0046, + "num_input_tokens_seen": 197594256, + "step": 91565 + }, + { + "epoch": 14.938009787928221, + "grad_norm": 0.013719945214688778, + "learning_rate": 0.00018277997383128237, + "loss": 0.0012, + "num_input_tokens_seen": 197604848, + "step": 91570 + }, + { + "epoch": 14.938825448613377, + "grad_norm": 0.09238064289093018, + "learning_rate": 0.00018272495703107222, + "loss": 0.0042, + "num_input_tokens_seen": 197615056, + "step": 91575 + }, + { + "epoch": 14.939641109298531, + "grad_norm": 0.5734052062034607, + "learning_rate": 0.00018266994666084368, + "loss": 0.0748, + "num_input_tokens_seen": 197626448, + "step": 91580 + }, + { + "epoch": 14.940456769983687, + "grad_norm": 0.04518286511301994, + "learning_rate": 0.0001826149427217109, + "loss": 0.0022, + "num_input_tokens_seen": 197637296, + "step": 91585 + }, + { + "epoch": 14.941272430668842, + "grad_norm": 0.0003016606206074357, + "learning_rate": 0.00018255994521478925, + "loss": 0.0022, + "num_input_tokens_seen": 197648368, + "step": 91590 + }, + { + "epoch": 14.942088091353996, + "grad_norm": 0.0005334067973308265, + "learning_rate": 0.00018250495414119273, + "loss": 0.0008, + "num_input_tokens_seen": 197658448, + "step": 91595 + }, + { + "epoch": 14.942903752039152, + "grad_norm": 0.003098636632785201, + "learning_rate": 0.0001824499695020362, + "loss": 0.0016, + "num_input_tokens_seen": 197669456, + "step": 91600 + }, + { + "epoch": 14.943719412724306, + "grad_norm": 0.0009736916981637478, + "learning_rate": 0.0001823949912984339, + "loss": 0.0009, + "num_input_tokens_seen": 197680656, + "step": 91605 + }, + { + "epoch": 14.944535073409462, + "grad_norm": 0.0011866401182487607, + "learning_rate": 0.00018234001953149997, + "loss": 0.0093, + "num_input_tokens_seen": 197691664, + "step": 91610 + }, + { + "epoch": 14.945350734094617, + "grad_norm": 0.00039111453224904835, + "learning_rate": 0.00018228505420234858, + "loss": 0.0047, + "num_input_tokens_seen": 197702928, + "step": 91615 + }, + { + "epoch": 14.946166394779771, + "grad_norm": 0.0008995155221782625, + "learning_rate": 0.00018223009531209355, + "loss": 0.0021, + "num_input_tokens_seen": 197713840, + "step": 91620 + }, + { + "epoch": 14.946982055464927, + "grad_norm": 0.0005915462388657033, + "learning_rate": 0.00018217514286184884, + "loss": 0.0155, + "num_input_tokens_seen": 197724656, + "step": 91625 + }, + { + "epoch": 14.947797716150081, + "grad_norm": 0.09159015864133835, + "learning_rate": 0.00018212019685272802, + "loss": 0.0054, + "num_input_tokens_seen": 197735344, + "step": 91630 + }, + { + "epoch": 14.948613376835237, + "grad_norm": 0.07714743167161942, + "learning_rate": 0.00018206525728584462, + "loss": 0.0046, + "num_input_tokens_seen": 197747344, + "step": 91635 + }, + { + "epoch": 14.949429037520392, + "grad_norm": 0.0011049091117456555, + "learning_rate": 0.00018201032416231217, + "loss": 0.0041, + "num_input_tokens_seen": 197758832, + "step": 91640 + }, + { + "epoch": 14.950244698205546, + "grad_norm": 0.003085568780079484, + "learning_rate": 0.00018195539748324386, + "loss": 0.0038, + "num_input_tokens_seen": 197769968, + "step": 91645 + }, + { + "epoch": 14.951060358890702, + "grad_norm": 0.019435815513134003, + "learning_rate": 0.00018190047724975271, + "loss": 0.0014, + "num_input_tokens_seen": 197781200, + "step": 91650 + }, + { + "epoch": 14.951876019575856, + "grad_norm": 0.00018131126125808805, + "learning_rate": 0.00018184556346295233, + "loss": 0.0005, + "num_input_tokens_seen": 197791952, + "step": 91655 + }, + { + "epoch": 14.952691680261012, + "grad_norm": 0.0013339652214199305, + "learning_rate": 0.00018179065612395484, + "loss": 0.0023, + "num_input_tokens_seen": 197802320, + "step": 91660 + }, + { + "epoch": 14.953507340946166, + "grad_norm": 0.003577487776055932, + "learning_rate": 0.0001817357552338737, + "loss": 0.0017, + "num_input_tokens_seen": 197812752, + "step": 91665 + }, + { + "epoch": 14.954323001631321, + "grad_norm": 0.0070591275580227375, + "learning_rate": 0.0001816808607938209, + "loss": 0.0075, + "num_input_tokens_seen": 197824304, + "step": 91670 + }, + { + "epoch": 14.955138662316477, + "grad_norm": 0.024146627634763718, + "learning_rate": 0.00018162597280490966, + "loss": 0.0389, + "num_input_tokens_seen": 197834896, + "step": 91675 + }, + { + "epoch": 14.955954323001631, + "grad_norm": 0.006658916361629963, + "learning_rate": 0.00018157109126825156, + "loss": 0.0037, + "num_input_tokens_seen": 197845744, + "step": 91680 + }, + { + "epoch": 14.956769983686787, + "grad_norm": 0.030804263427853584, + "learning_rate": 0.0001815162161849596, + "loss": 0.0088, + "num_input_tokens_seen": 197855760, + "step": 91685 + }, + { + "epoch": 14.95758564437194, + "grad_norm": 0.0015893372474238276, + "learning_rate": 0.00018146134755614524, + "loss": 0.0022, + "num_input_tokens_seen": 197867024, + "step": 91690 + }, + { + "epoch": 14.958401305057096, + "grad_norm": 0.0003086631477344781, + "learning_rate": 0.0001814064853829211, + "loss": 0.0011, + "num_input_tokens_seen": 197878416, + "step": 91695 + }, + { + "epoch": 14.959216965742252, + "grad_norm": 0.0018296872731298208, + "learning_rate": 0.00018135162966639835, + "loss": 0.002, + "num_input_tokens_seen": 197888208, + "step": 91700 + }, + { + "epoch": 14.960032626427406, + "grad_norm": 0.055911801755428314, + "learning_rate": 0.00018129678040768938, + "loss": 0.0027, + "num_input_tokens_seen": 197899408, + "step": 91705 + }, + { + "epoch": 14.960848287112562, + "grad_norm": 0.01202060841023922, + "learning_rate": 0.00018124193760790514, + "loss": 0.0017, + "num_input_tokens_seen": 197910672, + "step": 91710 + }, + { + "epoch": 14.961663947797716, + "grad_norm": 0.03473207354545593, + "learning_rate": 0.00018118710126815773, + "loss": 0.0125, + "num_input_tokens_seen": 197921488, + "step": 91715 + }, + { + "epoch": 14.962479608482871, + "grad_norm": 0.0391261987388134, + "learning_rate": 0.00018113227138955785, + "loss": 0.0022, + "num_input_tokens_seen": 197932624, + "step": 91720 + }, + { + "epoch": 14.963295269168025, + "grad_norm": 0.011360060423612595, + "learning_rate": 0.00018107744797321728, + "loss": 0.0092, + "num_input_tokens_seen": 197942032, + "step": 91725 + }, + { + "epoch": 14.964110929853181, + "grad_norm": 0.02873978018760681, + "learning_rate": 0.00018102263102024653, + "loss": 0.0035, + "num_input_tokens_seen": 197952848, + "step": 91730 + }, + { + "epoch": 14.964926590538337, + "grad_norm": 0.35008570551872253, + "learning_rate": 0.00018096782053175715, + "loss": 0.0089, + "num_input_tokens_seen": 197962736, + "step": 91735 + }, + { + "epoch": 14.96574225122349, + "grad_norm": 0.0003296004724688828, + "learning_rate": 0.00018091301650885922, + "loss": 0.0545, + "num_input_tokens_seen": 197973840, + "step": 91740 + }, + { + "epoch": 14.966557911908646, + "grad_norm": 0.0005591457011178136, + "learning_rate": 0.00018085821895266402, + "loss": 0.0005, + "num_input_tokens_seen": 197985040, + "step": 91745 + }, + { + "epoch": 14.9673735725938, + "grad_norm": 0.0009361800621263683, + "learning_rate": 0.00018080342786428184, + "loss": 0.0071, + "num_input_tokens_seen": 197995760, + "step": 91750 + }, + { + "epoch": 14.968189233278956, + "grad_norm": 0.002150339772924781, + "learning_rate": 0.00018074864324482315, + "loss": 0.0007, + "num_input_tokens_seen": 198006992, + "step": 91755 + }, + { + "epoch": 14.969004893964112, + "grad_norm": 0.014244066551327705, + "learning_rate": 0.0001806938650953982, + "loss": 0.0274, + "num_input_tokens_seen": 198016336, + "step": 91760 + }, + { + "epoch": 14.969820554649266, + "grad_norm": 0.03488391265273094, + "learning_rate": 0.00018063909341711716, + "loss": 0.0025, + "num_input_tokens_seen": 198027920, + "step": 91765 + }, + { + "epoch": 14.970636215334421, + "grad_norm": 0.022693637758493423, + "learning_rate": 0.00018058432821109, + "loss": 0.0017, + "num_input_tokens_seen": 198039024, + "step": 91770 + }, + { + "epoch": 14.971451876019575, + "grad_norm": 0.1710520088672638, + "learning_rate": 0.00018052956947842665, + "loss": 0.0037, + "num_input_tokens_seen": 198049296, + "step": 91775 + }, + { + "epoch": 14.97226753670473, + "grad_norm": 0.06752630323171616, + "learning_rate": 0.0001804748172202368, + "loss": 0.0927, + "num_input_tokens_seen": 198060912, + "step": 91780 + }, + { + "epoch": 14.973083197389887, + "grad_norm": 0.00266630994156003, + "learning_rate": 0.00018042007143763018, + "loss": 0.0014, + "num_input_tokens_seen": 198072240, + "step": 91785 + }, + { + "epoch": 14.97389885807504, + "grad_norm": 0.006713633891195059, + "learning_rate": 0.00018036533213171618, + "loss": 0.0022, + "num_input_tokens_seen": 198084112, + "step": 91790 + }, + { + "epoch": 14.974714518760196, + "grad_norm": 0.019593453034758568, + "learning_rate": 0.0001803105993036041, + "loss": 0.0024, + "num_input_tokens_seen": 198094096, + "step": 91795 + }, + { + "epoch": 14.97553017944535, + "grad_norm": 0.45167598128318787, + "learning_rate": 0.0001802558729544036, + "loss": 0.0683, + "num_input_tokens_seen": 198104144, + "step": 91800 + }, + { + "epoch": 14.976345840130506, + "grad_norm": 0.00018482006271369755, + "learning_rate": 0.0001802011530852231, + "loss": 0.0076, + "num_input_tokens_seen": 198114256, + "step": 91805 + }, + { + "epoch": 14.977161500815662, + "grad_norm": 0.0024316012859344482, + "learning_rate": 0.00018014643969717231, + "loss": 0.0122, + "num_input_tokens_seen": 198124944, + "step": 91810 + }, + { + "epoch": 14.977977161500815, + "grad_norm": 0.08692025393247604, + "learning_rate": 0.0001800917327913593, + "loss": 0.025, + "num_input_tokens_seen": 198135472, + "step": 91815 + }, + { + "epoch": 14.978792822185971, + "grad_norm": 0.00587397301569581, + "learning_rate": 0.0001800370323688935, + "loss": 0.0058, + "num_input_tokens_seen": 198145744, + "step": 91820 + }, + { + "epoch": 14.979608482871125, + "grad_norm": 0.14312763512134552, + "learning_rate": 0.00017998233843088284, + "loss": 0.0053, + "num_input_tokens_seen": 198156304, + "step": 91825 + }, + { + "epoch": 14.98042414355628, + "grad_norm": 0.002417525742202997, + "learning_rate": 0.00017992765097843639, + "loss": 0.0011, + "num_input_tokens_seen": 198166736, + "step": 91830 + }, + { + "epoch": 14.981239804241435, + "grad_norm": 0.0005045532598160207, + "learning_rate": 0.00017987297001266172, + "loss": 0.0005, + "num_input_tokens_seen": 198178000, + "step": 91835 + }, + { + "epoch": 14.98205546492659, + "grad_norm": 0.00035960765671916306, + "learning_rate": 0.00017981829553466783, + "loss": 0.0043, + "num_input_tokens_seen": 198188720, + "step": 91840 + }, + { + "epoch": 14.982871125611746, + "grad_norm": 0.029584866017103195, + "learning_rate": 0.00017976362754556203, + "loss": 0.002, + "num_input_tokens_seen": 198199184, + "step": 91845 + }, + { + "epoch": 14.9836867862969, + "grad_norm": 0.02764306217432022, + "learning_rate": 0.0001797089660464527, + "loss": 0.0133, + "num_input_tokens_seen": 198211184, + "step": 91850 + }, + { + "epoch": 14.984502446982056, + "grad_norm": 0.032056599855422974, + "learning_rate": 0.00017965431103844753, + "loss": 0.0034, + "num_input_tokens_seen": 198223312, + "step": 91855 + }, + { + "epoch": 14.98531810766721, + "grad_norm": 0.020697027444839478, + "learning_rate": 0.00017959966252265407, + "loss": 0.0066, + "num_input_tokens_seen": 198233968, + "step": 91860 + }, + { + "epoch": 14.986133768352365, + "grad_norm": 0.0012695527402684093, + "learning_rate": 0.00017954502050018, + "loss": 0.1101, + "num_input_tokens_seen": 198244240, + "step": 91865 + }, + { + "epoch": 14.986949429037521, + "grad_norm": 0.3044207692146301, + "learning_rate": 0.00017949038497213255, + "loss": 0.009, + "num_input_tokens_seen": 198255280, + "step": 91870 + }, + { + "epoch": 14.987765089722675, + "grad_norm": 0.05426936224102974, + "learning_rate": 0.0001794357559396191, + "loss": 0.0053, + "num_input_tokens_seen": 198267120, + "step": 91875 + }, + { + "epoch": 14.98858075040783, + "grad_norm": 0.1922590136528015, + "learning_rate": 0.00017938113340374662, + "loss": 0.007, + "num_input_tokens_seen": 198278608, + "step": 91880 + }, + { + "epoch": 14.989396411092985, + "grad_norm": 0.001317578018642962, + "learning_rate": 0.00017932651736562226, + "loss": 0.0062, + "num_input_tokens_seen": 198288976, + "step": 91885 + }, + { + "epoch": 14.99021207177814, + "grad_norm": 0.01220970880240202, + "learning_rate": 0.00017927190782635283, + "loss": 0.0012, + "num_input_tokens_seen": 198298224, + "step": 91890 + }, + { + "epoch": 14.991027732463294, + "grad_norm": 0.041691407561302185, + "learning_rate": 0.00017921730478704506, + "loss": 0.006, + "num_input_tokens_seen": 198308208, + "step": 91895 + }, + { + "epoch": 14.99184339314845, + "grad_norm": 0.03838913515210152, + "learning_rate": 0.0001791627082488056, + "loss": 0.0039, + "num_input_tokens_seen": 198320016, + "step": 91900 + }, + { + "epoch": 14.992659053833606, + "grad_norm": 0.00014104865840636194, + "learning_rate": 0.00017910811821274082, + "loss": 0.0011, + "num_input_tokens_seen": 198330448, + "step": 91905 + }, + { + "epoch": 14.99347471451876, + "grad_norm": 0.0016173210460692644, + "learning_rate": 0.0001790535346799571, + "loss": 0.0951, + "num_input_tokens_seen": 198342384, + "step": 91910 + }, + { + "epoch": 14.994290375203915, + "grad_norm": 0.004497263114899397, + "learning_rate": 0.00017899895765156065, + "loss": 0.0251, + "num_input_tokens_seen": 198353328, + "step": 91915 + }, + { + "epoch": 14.99510603588907, + "grad_norm": 0.10628994554281235, + "learning_rate": 0.00017894438712865753, + "loss": 0.0032, + "num_input_tokens_seen": 198363888, + "step": 91920 + }, + { + "epoch": 14.995921696574225, + "grad_norm": 0.005305298138409853, + "learning_rate": 0.00017888982311235375, + "loss": 0.0689, + "num_input_tokens_seen": 198374672, + "step": 91925 + }, + { + "epoch": 14.99673735725938, + "grad_norm": 0.041575659066438675, + "learning_rate": 0.00017883526560375502, + "loss": 0.0019, + "num_input_tokens_seen": 198385808, + "step": 91930 + }, + { + "epoch": 14.997553017944535, + "grad_norm": 0.0014565808232873678, + "learning_rate": 0.00017878071460396706, + "loss": 0.0118, + "num_input_tokens_seen": 198394960, + "step": 91935 + }, + { + "epoch": 14.99836867862969, + "grad_norm": 0.010725701227784157, + "learning_rate": 0.0001787261701140952, + "loss": 0.0012, + "num_input_tokens_seen": 198406256, + "step": 91940 + }, + { + "epoch": 14.999184339314844, + "grad_norm": 0.00010247622412862256, + "learning_rate": 0.00017867163213524545, + "loss": 0.0627, + "num_input_tokens_seen": 198417840, + "step": 91945 + }, + { + "epoch": 15.0, + "grad_norm": 0.007938225753605366, + "learning_rate": 0.00017861710066852237, + "loss": 0.0014, + "num_input_tokens_seen": 198426688, + "step": 91950 + }, + { + "epoch": 15.0, + "eval_loss": 0.23222695291042328, + "eval_runtime": 104.2686, + "eval_samples_per_second": 26.134, + "eval_steps_per_second": 6.541, + "num_input_tokens_seen": 198426688, + "step": 91950 + }, + { + "epoch": 15.000815660685156, + "grad_norm": 0.004275687504559755, + "learning_rate": 0.00017856257571503164, + "loss": 0.0016, + "num_input_tokens_seen": 198438688, + "step": 91955 + }, + { + "epoch": 15.00163132137031, + "grad_norm": 0.0018468056805431843, + "learning_rate": 0.00017850805727587804, + "loss": 0.0011, + "num_input_tokens_seen": 198448480, + "step": 91960 + }, + { + "epoch": 15.002446982055465, + "grad_norm": 0.0016898267203941941, + "learning_rate": 0.00017845354535216658, + "loss": 0.0081, + "num_input_tokens_seen": 198460320, + "step": 91965 + }, + { + "epoch": 15.00326264274062, + "grad_norm": 0.0008262842311523855, + "learning_rate": 0.00017839903994500185, + "loss": 0.0046, + "num_input_tokens_seen": 198471712, + "step": 91970 + }, + { + "epoch": 15.004078303425775, + "grad_norm": 0.03921177610754967, + "learning_rate": 0.0001783445410554886, + "loss": 0.0012, + "num_input_tokens_seen": 198482432, + "step": 91975 + }, + { + "epoch": 15.00489396411093, + "grad_norm": 0.265741229057312, + "learning_rate": 0.00017829004868473124, + "loss": 0.0107, + "num_input_tokens_seen": 198493664, + "step": 91980 + }, + { + "epoch": 15.005709624796085, + "grad_norm": 0.00706891156733036, + "learning_rate": 0.00017823556283383418, + "loss": 0.0011, + "num_input_tokens_seen": 198505536, + "step": 91985 + }, + { + "epoch": 15.00652528548124, + "grad_norm": 0.000369491201126948, + "learning_rate": 0.0001781810835039016, + "loss": 0.0062, + "num_input_tokens_seen": 198516416, + "step": 91990 + }, + { + "epoch": 15.007340946166394, + "grad_norm": 0.0004151603498030454, + "learning_rate": 0.0001781266106960377, + "loss": 0.0028, + "num_input_tokens_seen": 198528224, + "step": 91995 + }, + { + "epoch": 15.00815660685155, + "grad_norm": 0.006623450201004744, + "learning_rate": 0.00017807214441134628, + "loss": 0.0012, + "num_input_tokens_seen": 198540096, + "step": 92000 + }, + { + "epoch": 15.008972267536704, + "grad_norm": 0.01897459290921688, + "learning_rate": 0.00017801768465093126, + "loss": 0.0238, + "num_input_tokens_seen": 198550496, + "step": 92005 + }, + { + "epoch": 15.00978792822186, + "grad_norm": 0.0029296332504600286, + "learning_rate": 0.00017796323141589638, + "loss": 0.0016, + "num_input_tokens_seen": 198560512, + "step": 92010 + }, + { + "epoch": 15.010603588907015, + "grad_norm": 0.021039435639977455, + "learning_rate": 0.00017790878470734506, + "loss": 0.022, + "num_input_tokens_seen": 198570880, + "step": 92015 + }, + { + "epoch": 15.01141924959217, + "grad_norm": 0.014424490742385387, + "learning_rate": 0.0001778543445263809, + "loss": 0.0028, + "num_input_tokens_seen": 198581024, + "step": 92020 + }, + { + "epoch": 15.012234910277325, + "grad_norm": 0.003395842155441642, + "learning_rate": 0.00017779991087410707, + "loss": 0.0006, + "num_input_tokens_seen": 198592416, + "step": 92025 + }, + { + "epoch": 15.013050570962479, + "grad_norm": 0.15244269371032715, + "learning_rate": 0.0001777454837516268, + "loss": 0.0061, + "num_input_tokens_seen": 198601888, + "step": 92030 + }, + { + "epoch": 15.013866231647635, + "grad_norm": 0.000390715547837317, + "learning_rate": 0.00017769106316004314, + "loss": 0.0036, + "num_input_tokens_seen": 198612064, + "step": 92035 + }, + { + "epoch": 15.01468189233279, + "grad_norm": 0.0011942335404455662, + "learning_rate": 0.0001776366491004589, + "loss": 0.1355, + "num_input_tokens_seen": 198623072, + "step": 92040 + }, + { + "epoch": 15.015497553017944, + "grad_norm": 0.00480772415176034, + "learning_rate": 0.00017758224157397696, + "loss": 0.0021, + "num_input_tokens_seen": 198635616, + "step": 92045 + }, + { + "epoch": 15.0163132137031, + "grad_norm": 0.07810454070568085, + "learning_rate": 0.00017752784058169992, + "loss": 0.0042, + "num_input_tokens_seen": 198645952, + "step": 92050 + }, + { + "epoch": 15.017128874388254, + "grad_norm": 0.005636147223412991, + "learning_rate": 0.00017747344612473022, + "loss": 0.0007, + "num_input_tokens_seen": 198655616, + "step": 92055 + }, + { + "epoch": 15.01794453507341, + "grad_norm": 0.008288292214274406, + "learning_rate": 0.00017741905820417014, + "loss": 0.0085, + "num_input_tokens_seen": 198665472, + "step": 92060 + }, + { + "epoch": 15.018760195758565, + "grad_norm": 0.002053429139778018, + "learning_rate": 0.00017736467682112245, + "loss": 0.0141, + "num_input_tokens_seen": 198677504, + "step": 92065 + }, + { + "epoch": 15.01957585644372, + "grad_norm": 0.0028106230311095715, + "learning_rate": 0.00017731030197668847, + "loss": 0.0031, + "num_input_tokens_seen": 198688288, + "step": 92070 + }, + { + "epoch": 15.020391517128875, + "grad_norm": 0.039580877870321274, + "learning_rate": 0.00017725593367197095, + "loss": 0.005, + "num_input_tokens_seen": 198697792, + "step": 92075 + }, + { + "epoch": 15.021207177814029, + "grad_norm": 0.008259903639554977, + "learning_rate": 0.00017720157190807107, + "loss": 0.0011, + "num_input_tokens_seen": 198708864, + "step": 92080 + }, + { + "epoch": 15.022022838499185, + "grad_norm": 0.00089216697961092, + "learning_rate": 0.00017714721668609095, + "loss": 0.0063, + "num_input_tokens_seen": 198720448, + "step": 92085 + }, + { + "epoch": 15.022838499184338, + "grad_norm": 0.009794710204005241, + "learning_rate": 0.00017709286800713202, + "loss": 0.0015, + "num_input_tokens_seen": 198732256, + "step": 92090 + }, + { + "epoch": 15.023654159869494, + "grad_norm": 0.0011435254709795117, + "learning_rate": 0.00017703852587229584, + "loss": 0.002, + "num_input_tokens_seen": 198743392, + "step": 92095 + }, + { + "epoch": 15.02446982055465, + "grad_norm": 0.0003642051888164133, + "learning_rate": 0.00017698419028268358, + "loss": 0.102, + "num_input_tokens_seen": 198753376, + "step": 92100 + }, + { + "epoch": 15.025285481239804, + "grad_norm": 0.001653459039516747, + "learning_rate": 0.00017692986123939652, + "loss": 0.0838, + "num_input_tokens_seen": 198763040, + "step": 92105 + }, + { + "epoch": 15.02610114192496, + "grad_norm": 0.002610685070976615, + "learning_rate": 0.00017687553874353563, + "loss": 0.0016, + "num_input_tokens_seen": 198774080, + "step": 92110 + }, + { + "epoch": 15.026916802610113, + "grad_norm": 0.00038397617754526436, + "learning_rate": 0.0001768212227962019, + "loss": 0.0034, + "num_input_tokens_seen": 198785216, + "step": 92115 + }, + { + "epoch": 15.02773246329527, + "grad_norm": 0.018100714311003685, + "learning_rate": 0.00017676691339849605, + "loss": 0.0007, + "num_input_tokens_seen": 198797440, + "step": 92120 + }, + { + "epoch": 15.028548123980425, + "grad_norm": 0.0033893033396452665, + "learning_rate": 0.00017671261055151872, + "loss": 0.0004, + "num_input_tokens_seen": 198808608, + "step": 92125 + }, + { + "epoch": 15.029363784665579, + "grad_norm": 0.002433412941172719, + "learning_rate": 0.00017665831425637052, + "loss": 0.0009, + "num_input_tokens_seen": 198821056, + "step": 92130 + }, + { + "epoch": 15.030179445350734, + "grad_norm": 0.04352866858243942, + "learning_rate": 0.0001766040245141517, + "loss": 0.0031, + "num_input_tokens_seen": 198831232, + "step": 92135 + }, + { + "epoch": 15.030995106035888, + "grad_norm": 0.00241314759477973, + "learning_rate": 0.00017654974132596263, + "loss": 0.0033, + "num_input_tokens_seen": 198842016, + "step": 92140 + }, + { + "epoch": 15.031810766721044, + "grad_norm": 0.012855492532253265, + "learning_rate": 0.00017649546469290333, + "loss": 0.0716, + "num_input_tokens_seen": 198853824, + "step": 92145 + }, + { + "epoch": 15.0326264274062, + "grad_norm": 0.001995042897760868, + "learning_rate": 0.00017644119461607388, + "loss": 0.003, + "num_input_tokens_seen": 198864160, + "step": 92150 + }, + { + "epoch": 15.033442088091354, + "grad_norm": 0.014295226894319057, + "learning_rate": 0.0001763869310965741, + "loss": 0.0014, + "num_input_tokens_seen": 198873888, + "step": 92155 + }, + { + "epoch": 15.03425774877651, + "grad_norm": 0.00182590342592448, + "learning_rate": 0.00017633267413550362, + "loss": 0.0049, + "num_input_tokens_seen": 198883712, + "step": 92160 + }, + { + "epoch": 15.035073409461663, + "grad_norm": 0.0012046555057168007, + "learning_rate": 0.00017627842373396202, + "loss": 0.0111, + "num_input_tokens_seen": 198893024, + "step": 92165 + }, + { + "epoch": 15.035889070146819, + "grad_norm": 0.02611442282795906, + "learning_rate": 0.00017622417989304913, + "loss": 0.0064, + "num_input_tokens_seen": 198903264, + "step": 92170 + }, + { + "epoch": 15.036704730831975, + "grad_norm": 0.06382507085800171, + "learning_rate": 0.0001761699426138636, + "loss": 0.0096, + "num_input_tokens_seen": 198914464, + "step": 92175 + }, + { + "epoch": 15.037520391517129, + "grad_norm": 0.02687195874750614, + "learning_rate": 0.00017611571189750537, + "loss": 0.0011, + "num_input_tokens_seen": 198924800, + "step": 92180 + }, + { + "epoch": 15.038336052202284, + "grad_norm": 0.05115017294883728, + "learning_rate": 0.00017606148774507274, + "loss": 0.0016, + "num_input_tokens_seen": 198934464, + "step": 92185 + }, + { + "epoch": 15.039151712887438, + "grad_norm": 0.005650022067129612, + "learning_rate": 0.0001760072701576654, + "loss": 0.0088, + "num_input_tokens_seen": 198944896, + "step": 92190 + }, + { + "epoch": 15.039967373572594, + "grad_norm": 0.006938491482287645, + "learning_rate": 0.00017595305913638138, + "loss": 0.0011, + "num_input_tokens_seen": 198956064, + "step": 92195 + }, + { + "epoch": 15.040783034257748, + "grad_norm": 0.009232531301677227, + "learning_rate": 0.00017589885468232002, + "loss": 0.0008, + "num_input_tokens_seen": 198966720, + "step": 92200 + }, + { + "epoch": 15.041598694942904, + "grad_norm": 0.010990158654749393, + "learning_rate": 0.00017584465679657918, + "loss": 0.001, + "num_input_tokens_seen": 198977248, + "step": 92205 + }, + { + "epoch": 15.04241435562806, + "grad_norm": 0.1925150752067566, + "learning_rate": 0.00017579046548025796, + "loss": 0.0061, + "num_input_tokens_seen": 198988224, + "step": 92210 + }, + { + "epoch": 15.043230016313213, + "grad_norm": 0.009634612128138542, + "learning_rate": 0.00017573628073445393, + "loss": 0.0012, + "num_input_tokens_seen": 198998272, + "step": 92215 + }, + { + "epoch": 15.044045676998369, + "grad_norm": 0.012327825650572777, + "learning_rate": 0.00017568210256026578, + "loss": 0.0022, + "num_input_tokens_seen": 199009504, + "step": 92220 + }, + { + "epoch": 15.044861337683523, + "grad_norm": 0.002283198293298483, + "learning_rate": 0.000175627930958791, + "loss": 0.0027, + "num_input_tokens_seen": 199021792, + "step": 92225 + }, + { + "epoch": 15.045676998368679, + "grad_norm": 0.004079566337168217, + "learning_rate": 0.0001755737659311278, + "loss": 0.0022, + "num_input_tokens_seen": 199032096, + "step": 92230 + }, + { + "epoch": 15.046492659053834, + "grad_norm": 0.005857102572917938, + "learning_rate": 0.00017551960747837382, + "loss": 0.0306, + "num_input_tokens_seen": 199043392, + "step": 92235 + }, + { + "epoch": 15.047308319738988, + "grad_norm": 0.6055668592453003, + "learning_rate": 0.00017546545560162663, + "loss": 0.0205, + "num_input_tokens_seen": 199054016, + "step": 92240 + }, + { + "epoch": 15.048123980424144, + "grad_norm": 0.0006233384483493865, + "learning_rate": 0.00017541131030198364, + "loss": 0.0019, + "num_input_tokens_seen": 199065760, + "step": 92245 + }, + { + "epoch": 15.048939641109298, + "grad_norm": 0.04825665429234505, + "learning_rate": 0.00017535717158054226, + "loss": 0.002, + "num_input_tokens_seen": 199075808, + "step": 92250 + }, + { + "epoch": 15.049755301794454, + "grad_norm": 0.0003467535425443202, + "learning_rate": 0.00017530303943839965, + "loss": 0.0018, + "num_input_tokens_seen": 199085632, + "step": 92255 + }, + { + "epoch": 15.05057096247961, + "grad_norm": 0.00025809413637034595, + "learning_rate": 0.00017524891387665282, + "loss": 0.0003, + "num_input_tokens_seen": 199096160, + "step": 92260 + }, + { + "epoch": 15.051386623164763, + "grad_norm": 0.0020927947480231524, + "learning_rate": 0.00017519479489639877, + "loss": 0.002, + "num_input_tokens_seen": 199107360, + "step": 92265 + }, + { + "epoch": 15.052202283849919, + "grad_norm": 0.11325211822986603, + "learning_rate": 0.0001751406824987342, + "loss": 0.0044, + "num_input_tokens_seen": 199118720, + "step": 92270 + }, + { + "epoch": 15.053017944535073, + "grad_norm": 0.002931073773652315, + "learning_rate": 0.00017508657668475585, + "loss": 0.0016, + "num_input_tokens_seen": 199129568, + "step": 92275 + }, + { + "epoch": 15.053833605220229, + "grad_norm": 0.02373582124710083, + "learning_rate": 0.00017503247745556, + "loss": 0.001, + "num_input_tokens_seen": 199141216, + "step": 92280 + }, + { + "epoch": 15.054649265905383, + "grad_norm": 0.000552816956769675, + "learning_rate": 0.0001749783848122436, + "loss": 0.0047, + "num_input_tokens_seen": 199151040, + "step": 92285 + }, + { + "epoch": 15.055464926590538, + "grad_norm": 0.001482184394262731, + "learning_rate": 0.0001749242987559022, + "loss": 0.0006, + "num_input_tokens_seen": 199161632, + "step": 92290 + }, + { + "epoch": 15.056280587275694, + "grad_norm": 0.007321056444197893, + "learning_rate": 0.00017487021928763263, + "loss": 0.0176, + "num_input_tokens_seen": 199171616, + "step": 92295 + }, + { + "epoch": 15.057096247960848, + "grad_norm": 0.014589930884540081, + "learning_rate": 0.0001748161464085302, + "loss": 0.0022, + "num_input_tokens_seen": 199182720, + "step": 92300 + }, + { + "epoch": 15.057911908646004, + "grad_norm": 0.02128283865749836, + "learning_rate": 0.00017476208011969142, + "loss": 0.0069, + "num_input_tokens_seen": 199193728, + "step": 92305 + }, + { + "epoch": 15.058727569331158, + "grad_norm": 0.0017726394580677152, + "learning_rate": 0.0001747080204222113, + "loss": 0.0028, + "num_input_tokens_seen": 199205120, + "step": 92310 + }, + { + "epoch": 15.059543230016313, + "grad_norm": 0.0015253267483785748, + "learning_rate": 0.00017465396731718619, + "loss": 0.0032, + "num_input_tokens_seen": 199214528, + "step": 92315 + }, + { + "epoch": 15.060358890701469, + "grad_norm": 0.0005414013867266476, + "learning_rate": 0.0001745999208057108, + "loss": 0.0012, + "num_input_tokens_seen": 199224864, + "step": 92320 + }, + { + "epoch": 15.061174551386623, + "grad_norm": 0.0008364262757822871, + "learning_rate": 0.00017454588088888117, + "loss": 0.0014, + "num_input_tokens_seen": 199235296, + "step": 92325 + }, + { + "epoch": 15.061990212071779, + "grad_norm": 0.000591350719332695, + "learning_rate": 0.00017449184756779178, + "loss": 0.0015, + "num_input_tokens_seen": 199246144, + "step": 92330 + }, + { + "epoch": 15.062805872756933, + "grad_norm": 0.0006110016838647425, + "learning_rate": 0.00017443782084353837, + "loss": 0.0006, + "num_input_tokens_seen": 199256544, + "step": 92335 + }, + { + "epoch": 15.063621533442088, + "grad_norm": 0.007905459962785244, + "learning_rate": 0.0001743838007172152, + "loss": 0.0043, + "num_input_tokens_seen": 199266240, + "step": 92340 + }, + { + "epoch": 15.064437194127244, + "grad_norm": 0.006417552940547466, + "learning_rate": 0.00017432978718991772, + "loss": 0.0004, + "num_input_tokens_seen": 199276800, + "step": 92345 + }, + { + "epoch": 15.065252854812398, + "grad_norm": 0.008733022958040237, + "learning_rate": 0.00017427578026273988, + "loss": 0.0283, + "num_input_tokens_seen": 199287232, + "step": 92350 + }, + { + "epoch": 15.066068515497554, + "grad_norm": 0.0038966131396591663, + "learning_rate": 0.00017422177993677696, + "loss": 0.0028, + "num_input_tokens_seen": 199296160, + "step": 92355 + }, + { + "epoch": 15.066884176182707, + "grad_norm": 0.0006478236173279583, + "learning_rate": 0.00017416778621312257, + "loss": 0.0007, + "num_input_tokens_seen": 199306176, + "step": 92360 + }, + { + "epoch": 15.067699836867863, + "grad_norm": 0.12022221833467484, + "learning_rate": 0.00017411379909287167, + "loss": 0.0024, + "num_input_tokens_seen": 199315648, + "step": 92365 + }, + { + "epoch": 15.068515497553017, + "grad_norm": 0.020134378224611282, + "learning_rate": 0.00017405981857711772, + "loss": 0.002, + "num_input_tokens_seen": 199325632, + "step": 92370 + }, + { + "epoch": 15.069331158238173, + "grad_norm": 0.00044690974755212665, + "learning_rate": 0.0001740058446669552, + "loss": 0.0015, + "num_input_tokens_seen": 199336384, + "step": 92375 + }, + { + "epoch": 15.070146818923329, + "grad_norm": 0.23547402024269104, + "learning_rate": 0.00017395187736347778, + "loss": 0.0083, + "num_input_tokens_seen": 199348864, + "step": 92380 + }, + { + "epoch": 15.070962479608482, + "grad_norm": 0.12250398844480515, + "learning_rate": 0.0001738979166677792, + "loss": 0.0041, + "num_input_tokens_seen": 199358880, + "step": 92385 + }, + { + "epoch": 15.071778140293638, + "grad_norm": 0.0034677661024034023, + "learning_rate": 0.00017384396258095304, + "loss": 0.0011, + "num_input_tokens_seen": 199369984, + "step": 92390 + }, + { + "epoch": 15.072593800978792, + "grad_norm": 0.003349359380081296, + "learning_rate": 0.0001737900151040927, + "loss": 0.0341, + "num_input_tokens_seen": 199380224, + "step": 92395 + }, + { + "epoch": 15.073409461663948, + "grad_norm": 0.0012394021032378078, + "learning_rate": 0.00017373607423829159, + "loss": 0.0006, + "num_input_tokens_seen": 199390464, + "step": 92400 + }, + { + "epoch": 15.074225122349104, + "grad_norm": 0.0053171562030911446, + "learning_rate": 0.00017368213998464278, + "loss": 0.0061, + "num_input_tokens_seen": 199401408, + "step": 92405 + }, + { + "epoch": 15.075040783034257, + "grad_norm": 0.003807183587923646, + "learning_rate": 0.00017362821234423936, + "loss": 0.003, + "num_input_tokens_seen": 199411872, + "step": 92410 + }, + { + "epoch": 15.075856443719413, + "grad_norm": 0.11759316176176071, + "learning_rate": 0.00017357429131817432, + "loss": 0.0036, + "num_input_tokens_seen": 199422976, + "step": 92415 + }, + { + "epoch": 15.076672104404567, + "grad_norm": 0.23413163423538208, + "learning_rate": 0.0001735203769075403, + "loss": 0.0075, + "num_input_tokens_seen": 199434752, + "step": 92420 + }, + { + "epoch": 15.077487765089723, + "grad_norm": 0.0007617458468303084, + "learning_rate": 0.00017346646911342985, + "loss": 0.0009, + "num_input_tokens_seen": 199446272, + "step": 92425 + }, + { + "epoch": 15.078303425774878, + "grad_norm": 0.06205003708600998, + "learning_rate": 0.000173412567936936, + "loss": 0.0025, + "num_input_tokens_seen": 199457376, + "step": 92430 + }, + { + "epoch": 15.079119086460032, + "grad_norm": 0.4398300051689148, + "learning_rate": 0.0001733586733791504, + "loss": 0.1439, + "num_input_tokens_seen": 199469120, + "step": 92435 + }, + { + "epoch": 15.079934747145188, + "grad_norm": 0.021775685250759125, + "learning_rate": 0.000173304785441166, + "loss": 0.0011, + "num_input_tokens_seen": 199481024, + "step": 92440 + }, + { + "epoch": 15.080750407830342, + "grad_norm": 0.001186116598546505, + "learning_rate": 0.00017325090412407423, + "loss": 0.0014, + "num_input_tokens_seen": 199492512, + "step": 92445 + }, + { + "epoch": 15.081566068515498, + "grad_norm": 0.8487502336502075, + "learning_rate": 0.00017319702942896777, + "loss": 0.0083, + "num_input_tokens_seen": 199502944, + "step": 92450 + }, + { + "epoch": 15.082381729200652, + "grad_norm": 0.0009768850868567824, + "learning_rate": 0.00017314316135693775, + "loss": 0.0194, + "num_input_tokens_seen": 199512544, + "step": 92455 + }, + { + "epoch": 15.083197389885807, + "grad_norm": 0.0062217977829277515, + "learning_rate": 0.00017308929990907652, + "loss": 0.0008, + "num_input_tokens_seen": 199523232, + "step": 92460 + }, + { + "epoch": 15.084013050570963, + "grad_norm": 0.007305622100830078, + "learning_rate": 0.000173035445086475, + "loss": 0.0014, + "num_input_tokens_seen": 199533984, + "step": 92465 + }, + { + "epoch": 15.084828711256117, + "grad_norm": 0.008182352408766747, + "learning_rate": 0.0001729815968902253, + "loss": 0.0028, + "num_input_tokens_seen": 199544512, + "step": 92470 + }, + { + "epoch": 15.085644371941273, + "grad_norm": 0.3714638650417328, + "learning_rate": 0.0001729277553214181, + "loss": 0.0072, + "num_input_tokens_seen": 199556192, + "step": 92475 + }, + { + "epoch": 15.086460032626427, + "grad_norm": 0.006954391021281481, + "learning_rate": 0.00017287392038114514, + "loss": 0.0031, + "num_input_tokens_seen": 199567584, + "step": 92480 + }, + { + "epoch": 15.087275693311582, + "grad_norm": 0.19819119572639465, + "learning_rate": 0.00017282009207049686, + "loss": 0.0044, + "num_input_tokens_seen": 199579168, + "step": 92485 + }, + { + "epoch": 15.088091353996738, + "grad_norm": 0.08956929296255112, + "learning_rate": 0.00017276627039056463, + "loss": 0.0023, + "num_input_tokens_seen": 199589248, + "step": 92490 + }, + { + "epoch": 15.088907014681892, + "grad_norm": 0.013940807431936264, + "learning_rate": 0.00017271245534243912, + "loss": 0.0008, + "num_input_tokens_seen": 199601024, + "step": 92495 + }, + { + "epoch": 15.089722675367048, + "grad_norm": 0.043516576290130615, + "learning_rate": 0.00017265864692721084, + "loss": 0.0017, + "num_input_tokens_seen": 199611168, + "step": 92500 + }, + { + "epoch": 15.090538336052202, + "grad_norm": 0.013501893728971481, + "learning_rate": 0.00017260484514597035, + "loss": 0.015, + "num_input_tokens_seen": 199621664, + "step": 92505 + }, + { + "epoch": 15.091353996737357, + "grad_norm": 0.08746878057718277, + "learning_rate": 0.00017255104999980799, + "loss": 0.0029, + "num_input_tokens_seen": 199632640, + "step": 92510 + }, + { + "epoch": 15.092169657422513, + "grad_norm": 0.005462236702442169, + "learning_rate": 0.00017249726148981399, + "loss": 0.0006, + "num_input_tokens_seen": 199644800, + "step": 92515 + }, + { + "epoch": 15.092985318107667, + "grad_norm": 0.0014251795364543796, + "learning_rate": 0.00017244347961707852, + "loss": 0.0015, + "num_input_tokens_seen": 199655264, + "step": 92520 + }, + { + "epoch": 15.093800978792823, + "grad_norm": 0.0002626210334710777, + "learning_rate": 0.00017238970438269142, + "loss": 0.0009, + "num_input_tokens_seen": 199667136, + "step": 92525 + }, + { + "epoch": 15.094616639477977, + "grad_norm": 0.0004512217128649354, + "learning_rate": 0.00017233593578774254, + "loss": 0.0013, + "num_input_tokens_seen": 199678048, + "step": 92530 + }, + { + "epoch": 15.095432300163132, + "grad_norm": 0.004276671912521124, + "learning_rate": 0.00017228217383332163, + "loss": 0.0005, + "num_input_tokens_seen": 199688608, + "step": 92535 + }, + { + "epoch": 15.096247960848286, + "grad_norm": 0.05057811364531517, + "learning_rate": 0.00017222841852051817, + "loss": 0.0029, + "num_input_tokens_seen": 199699008, + "step": 92540 + }, + { + "epoch": 15.097063621533442, + "grad_norm": 0.00020996005332563072, + "learning_rate": 0.0001721746698504217, + "loss": 0.0007, + "num_input_tokens_seen": 199710144, + "step": 92545 + }, + { + "epoch": 15.097879282218598, + "grad_norm": 0.004850344266742468, + "learning_rate": 0.0001721209278241213, + "loss": 0.0008, + "num_input_tokens_seen": 199721024, + "step": 92550 + }, + { + "epoch": 15.098694942903752, + "grad_norm": 0.009677665308117867, + "learning_rate": 0.00017206719244270636, + "loss": 0.0037, + "num_input_tokens_seen": 199731936, + "step": 92555 + }, + { + "epoch": 15.099510603588907, + "grad_norm": 0.03853844851255417, + "learning_rate": 0.00017201346370726572, + "loss": 0.0378, + "num_input_tokens_seen": 199744640, + "step": 92560 + }, + { + "epoch": 15.100326264274061, + "grad_norm": 9.415140084456652e-05, + "learning_rate": 0.00017195974161888833, + "loss": 0.0019, + "num_input_tokens_seen": 199756064, + "step": 92565 + }, + { + "epoch": 15.101141924959217, + "grad_norm": 0.0010337188141420484, + "learning_rate": 0.00017190602617866274, + "loss": 0.0008, + "num_input_tokens_seen": 199766432, + "step": 92570 + }, + { + "epoch": 15.101957585644373, + "grad_norm": 0.00078952731564641, + "learning_rate": 0.0001718523173876781, + "loss": 0.0008, + "num_input_tokens_seen": 199777472, + "step": 92575 + }, + { + "epoch": 15.102773246329527, + "grad_norm": 0.0008189613581635058, + "learning_rate": 0.00017179861524702216, + "loss": 0.0069, + "num_input_tokens_seen": 199789184, + "step": 92580 + }, + { + "epoch": 15.103588907014682, + "grad_norm": 0.0004901462234556675, + "learning_rate": 0.000171744919757784, + "loss": 0.0011, + "num_input_tokens_seen": 199799776, + "step": 92585 + }, + { + "epoch": 15.104404567699836, + "grad_norm": 0.0145787438377738, + "learning_rate": 0.00017169123092105115, + "loss": 0.0007, + "num_input_tokens_seen": 199811424, + "step": 92590 + }, + { + "epoch": 15.105220228384992, + "grad_norm": 0.029071159660816193, + "learning_rate": 0.0001716375487379121, + "loss": 0.0032, + "num_input_tokens_seen": 199822784, + "step": 92595 + }, + { + "epoch": 15.106035889070148, + "grad_norm": 0.06977978348731995, + "learning_rate": 0.00017158387320945472, + "loss": 0.0053, + "num_input_tokens_seen": 199833728, + "step": 92600 + }, + { + "epoch": 15.106851549755302, + "grad_norm": 0.0008084296132437885, + "learning_rate": 0.0001715302043367668, + "loss": 0.006, + "num_input_tokens_seen": 199845344, + "step": 92605 + }, + { + "epoch": 15.107667210440457, + "grad_norm": 0.01468834187835455, + "learning_rate": 0.00017147654212093595, + "loss": 0.0037, + "num_input_tokens_seen": 199856960, + "step": 92610 + }, + { + "epoch": 15.108482871125611, + "grad_norm": 0.020093608647584915, + "learning_rate": 0.00017142288656304977, + "loss": 0.0033, + "num_input_tokens_seen": 199867840, + "step": 92615 + }, + { + "epoch": 15.109298531810767, + "grad_norm": 0.003451162250712514, + "learning_rate": 0.0001713692376641956, + "loss": 0.0007, + "num_input_tokens_seen": 199878912, + "step": 92620 + }, + { + "epoch": 15.11011419249592, + "grad_norm": 0.002677035517990589, + "learning_rate": 0.0001713155954254607, + "loss": 0.0007, + "num_input_tokens_seen": 199890080, + "step": 92625 + }, + { + "epoch": 15.110929853181077, + "grad_norm": 0.000554313650354743, + "learning_rate": 0.00017126195984793225, + "loss": 0.0035, + "num_input_tokens_seen": 199900800, + "step": 92630 + }, + { + "epoch": 15.111745513866232, + "grad_norm": 0.02522178553044796, + "learning_rate": 0.0001712083309326972, + "loss": 0.0133, + "num_input_tokens_seen": 199912608, + "step": 92635 + }, + { + "epoch": 15.112561174551386, + "grad_norm": 0.00012928828073199838, + "learning_rate": 0.0001711547086808425, + "loss": 0.0004, + "num_input_tokens_seen": 199923488, + "step": 92640 + }, + { + "epoch": 15.113376835236542, + "grad_norm": 0.009769278578460217, + "learning_rate": 0.00017110109309345468, + "loss": 0.0007, + "num_input_tokens_seen": 199934592, + "step": 92645 + }, + { + "epoch": 15.114192495921696, + "grad_norm": 0.005562474951148033, + "learning_rate": 0.00017104748417162054, + "loss": 0.0032, + "num_input_tokens_seen": 199945312, + "step": 92650 + }, + { + "epoch": 15.115008156606851, + "grad_norm": 0.0125007014721632, + "learning_rate": 0.0001709938819164264, + "loss": 0.0008, + "num_input_tokens_seen": 199956352, + "step": 92655 + }, + { + "epoch": 15.115823817292007, + "grad_norm": 0.0009301841491833329, + "learning_rate": 0.00017094028632895863, + "loss": 0.0022, + "num_input_tokens_seen": 199967168, + "step": 92660 + }, + { + "epoch": 15.116639477977161, + "grad_norm": 0.09393001347780228, + "learning_rate": 0.0001708866974103034, + "loss": 0.167, + "num_input_tokens_seen": 199977440, + "step": 92665 + }, + { + "epoch": 15.117455138662317, + "grad_norm": 0.05761215463280678, + "learning_rate": 0.0001708331151615467, + "loss": 0.0028, + "num_input_tokens_seen": 199988512, + "step": 92670 + }, + { + "epoch": 15.11827079934747, + "grad_norm": 0.0007358312141150236, + "learning_rate": 0.00017077953958377458, + "loss": 0.0013, + "num_input_tokens_seen": 199999296, + "step": 92675 + }, + { + "epoch": 15.119086460032626, + "grad_norm": 0.002172029810026288, + "learning_rate": 0.0001707259706780727, + "loss": 0.0021, + "num_input_tokens_seen": 200009856, + "step": 92680 + }, + { + "epoch": 15.119902120717782, + "grad_norm": 0.00172037398442626, + "learning_rate": 0.00017067240844552672, + "loss": 0.0013, + "num_input_tokens_seen": 200021184, + "step": 92685 + }, + { + "epoch": 15.120717781402936, + "grad_norm": 0.0005334850866347551, + "learning_rate": 0.00017061885288722218, + "loss": 0.001, + "num_input_tokens_seen": 200031616, + "step": 92690 + }, + { + "epoch": 15.121533442088092, + "grad_norm": 0.002080358797684312, + "learning_rate": 0.00017056530400424446, + "loss": 0.0009, + "num_input_tokens_seen": 200040928, + "step": 92695 + }, + { + "epoch": 15.122349102773246, + "grad_norm": 0.00020994346414227039, + "learning_rate": 0.00017051176179767858, + "loss": 0.0013, + "num_input_tokens_seen": 200051776, + "step": 92700 + }, + { + "epoch": 15.123164763458401, + "grad_norm": 0.0013592796167358756, + "learning_rate": 0.00017045822626861017, + "loss": 0.0016, + "num_input_tokens_seen": 200063232, + "step": 92705 + }, + { + "epoch": 15.123980424143557, + "grad_norm": 0.00758744589984417, + "learning_rate": 0.00017040469741812353, + "loss": 0.0089, + "num_input_tokens_seen": 200072160, + "step": 92710 + }, + { + "epoch": 15.124796084828711, + "grad_norm": 0.010285993106663227, + "learning_rate": 0.00017035117524730398, + "loss": 0.0032, + "num_input_tokens_seen": 200081888, + "step": 92715 + }, + { + "epoch": 15.125611745513867, + "grad_norm": 0.00020340579794719815, + "learning_rate": 0.00017029765975723604, + "loss": 0.0004, + "num_input_tokens_seen": 200092864, + "step": 92720 + }, + { + "epoch": 15.12642740619902, + "grad_norm": 0.0029770575929433107, + "learning_rate": 0.0001702441509490043, + "loss": 0.0005, + "num_input_tokens_seen": 200102976, + "step": 92725 + }, + { + "epoch": 15.127243066884176, + "grad_norm": 0.0034841480664908886, + "learning_rate": 0.00017019064882369317, + "loss": 0.0009, + "num_input_tokens_seen": 200112064, + "step": 92730 + }, + { + "epoch": 15.12805872756933, + "grad_norm": 0.06382524222135544, + "learning_rate": 0.00017013715338238695, + "loss": 0.0023, + "num_input_tokens_seen": 200122656, + "step": 92735 + }, + { + "epoch": 15.128874388254486, + "grad_norm": 0.07054462283849716, + "learning_rate": 0.00017008366462616976, + "loss": 0.0627, + "num_input_tokens_seen": 200133856, + "step": 92740 + }, + { + "epoch": 15.129690048939642, + "grad_norm": 0.0065085492096841335, + "learning_rate": 0.00017003018255612562, + "loss": 0.0058, + "num_input_tokens_seen": 200144608, + "step": 92745 + }, + { + "epoch": 15.130505709624796, + "grad_norm": 0.00557303661480546, + "learning_rate": 0.00016997670717333846, + "loss": 0.0012, + "num_input_tokens_seen": 200156256, + "step": 92750 + }, + { + "epoch": 15.131321370309951, + "grad_norm": 0.03323657438158989, + "learning_rate": 0.00016992323847889195, + "loss": 0.003, + "num_input_tokens_seen": 200167008, + "step": 92755 + }, + { + "epoch": 15.132137030995105, + "grad_norm": 0.0016741449944674969, + "learning_rate": 0.00016986977647386975, + "loss": 0.0041, + "num_input_tokens_seen": 200177248, + "step": 92760 + }, + { + "epoch": 15.132952691680261, + "grad_norm": 0.03913315013051033, + "learning_rate": 0.00016981632115935536, + "loss": 0.0034, + "num_input_tokens_seen": 200186624, + "step": 92765 + }, + { + "epoch": 15.133768352365417, + "grad_norm": 0.0008307689568027854, + "learning_rate": 0.00016976287253643208, + "loss": 0.0022, + "num_input_tokens_seen": 200197152, + "step": 92770 + }, + { + "epoch": 15.13458401305057, + "grad_norm": 0.0053260778076946735, + "learning_rate": 0.0001697094306061831, + "loss": 0.0005, + "num_input_tokens_seen": 200206976, + "step": 92775 + }, + { + "epoch": 15.135399673735726, + "grad_norm": 0.00021304273104760796, + "learning_rate": 0.00016965599536969156, + "loss": 0.0003, + "num_input_tokens_seen": 200216320, + "step": 92780 + }, + { + "epoch": 15.13621533442088, + "grad_norm": 0.0013596056960523129, + "learning_rate": 0.00016960256682804032, + "loss": 0.0012, + "num_input_tokens_seen": 200227200, + "step": 92785 + }, + { + "epoch": 15.137030995106036, + "grad_norm": 0.003737252438440919, + "learning_rate": 0.00016954914498231217, + "loss": 0.0395, + "num_input_tokens_seen": 200237664, + "step": 92790 + }, + { + "epoch": 15.137846655791192, + "grad_norm": 0.40574318170547485, + "learning_rate": 0.00016949572983358986, + "loss": 0.0261, + "num_input_tokens_seen": 200249568, + "step": 92795 + }, + { + "epoch": 15.138662316476346, + "grad_norm": 0.002538427710533142, + "learning_rate": 0.0001694423213829558, + "loss": 0.0248, + "num_input_tokens_seen": 200260448, + "step": 92800 + }, + { + "epoch": 15.139477977161501, + "grad_norm": 0.04654950276017189, + "learning_rate": 0.00016938891963149232, + "loss": 0.0099, + "num_input_tokens_seen": 200271008, + "step": 92805 + }, + { + "epoch": 15.140293637846655, + "grad_norm": 0.013962913304567337, + "learning_rate": 0.00016933552458028213, + "loss": 0.0007, + "num_input_tokens_seen": 200281152, + "step": 92810 + }, + { + "epoch": 15.141109298531811, + "grad_norm": 0.0062345778569579124, + "learning_rate": 0.0001692821362304066, + "loss": 0.0019, + "num_input_tokens_seen": 200293152, + "step": 92815 + }, + { + "epoch": 15.141924959216965, + "grad_norm": 0.03972542658448219, + "learning_rate": 0.00016922875458294856, + "loss": 0.0029, + "num_input_tokens_seen": 200303552, + "step": 92820 + }, + { + "epoch": 15.14274061990212, + "grad_norm": 0.00015585182700306177, + "learning_rate": 0.00016917537963898903, + "loss": 0.0032, + "num_input_tokens_seen": 200315136, + "step": 92825 + }, + { + "epoch": 15.143556280587276, + "grad_norm": 0.007367011625319719, + "learning_rate": 0.0001691220113996105, + "loss": 0.0133, + "num_input_tokens_seen": 200326080, + "step": 92830 + }, + { + "epoch": 15.14437194127243, + "grad_norm": 0.0002375073090661317, + "learning_rate": 0.00016906864986589377, + "loss": 0.0026, + "num_input_tokens_seen": 200336992, + "step": 92835 + }, + { + "epoch": 15.145187601957586, + "grad_norm": 0.014790832065045834, + "learning_rate": 0.00016901529503892098, + "loss": 0.0037, + "num_input_tokens_seen": 200348608, + "step": 92840 + }, + { + "epoch": 15.14600326264274, + "grad_norm": 0.0006704757688567042, + "learning_rate": 0.00016896194691977284, + "loss": 0.0005, + "num_input_tokens_seen": 200359040, + "step": 92845 + }, + { + "epoch": 15.146818923327896, + "grad_norm": 0.0010436594020575285, + "learning_rate": 0.00016890860550953092, + "loss": 0.0185, + "num_input_tokens_seen": 200369568, + "step": 92850 + }, + { + "epoch": 15.147634584013051, + "grad_norm": 0.00023583733127452433, + "learning_rate": 0.00016885527080927616, + "loss": 0.0009, + "num_input_tokens_seen": 200380864, + "step": 92855 + }, + { + "epoch": 15.148450244698205, + "grad_norm": 0.0010770170483738184, + "learning_rate": 0.00016880194282008941, + "loss": 0.0024, + "num_input_tokens_seen": 200392736, + "step": 92860 + }, + { + "epoch": 15.149265905383361, + "grad_norm": 0.004812437109649181, + "learning_rate": 0.0001687486215430515, + "loss": 0.0004, + "num_input_tokens_seen": 200403872, + "step": 92865 + }, + { + "epoch": 15.150081566068515, + "grad_norm": 0.00012874834646936506, + "learning_rate": 0.0001686953069792429, + "loss": 0.0007, + "num_input_tokens_seen": 200413248, + "step": 92870 + }, + { + "epoch": 15.15089722675367, + "grad_norm": 0.04941106587648392, + "learning_rate": 0.00016864199912974427, + "loss": 0.0017, + "num_input_tokens_seen": 200424896, + "step": 92875 + }, + { + "epoch": 15.151712887438826, + "grad_norm": 0.00731720682233572, + "learning_rate": 0.00016858869799563585, + "loss": 0.0096, + "num_input_tokens_seen": 200435872, + "step": 92880 + }, + { + "epoch": 15.15252854812398, + "grad_norm": 0.003006185172125697, + "learning_rate": 0.0001685354035779979, + "loss": 0.0005, + "num_input_tokens_seen": 200447648, + "step": 92885 + }, + { + "epoch": 15.153344208809136, + "grad_norm": 0.00032268970971927047, + "learning_rate": 0.00016848211587791045, + "loss": 0.0007, + "num_input_tokens_seen": 200457536, + "step": 92890 + }, + { + "epoch": 15.15415986949429, + "grad_norm": 0.010344445705413818, + "learning_rate": 0.00016842883489645355, + "loss": 0.0009, + "num_input_tokens_seen": 200467392, + "step": 92895 + }, + { + "epoch": 15.154975530179446, + "grad_norm": 0.000387981825042516, + "learning_rate": 0.00016837556063470688, + "loss": 0.0005, + "num_input_tokens_seen": 200479264, + "step": 92900 + }, + { + "epoch": 15.1557911908646, + "grad_norm": 0.09170874953269958, + "learning_rate": 0.0001683222930937502, + "loss": 0.0062, + "num_input_tokens_seen": 200490624, + "step": 92905 + }, + { + "epoch": 15.156606851549755, + "grad_norm": 0.0004364280612207949, + "learning_rate": 0.00016826903227466284, + "loss": 0.1285, + "num_input_tokens_seen": 200500736, + "step": 92910 + }, + { + "epoch": 15.15742251223491, + "grad_norm": 0.000525986310094595, + "learning_rate": 0.00016821577817852473, + "loss": 0.0037, + "num_input_tokens_seen": 200511968, + "step": 92915 + }, + { + "epoch": 15.158238172920065, + "grad_norm": 0.0005778949125669897, + "learning_rate": 0.00016816253080641441, + "loss": 0.0491, + "num_input_tokens_seen": 200522240, + "step": 92920 + }, + { + "epoch": 15.15905383360522, + "grad_norm": 0.0016380720771849155, + "learning_rate": 0.00016810929015941174, + "loss": 0.0038, + "num_input_tokens_seen": 200533632, + "step": 92925 + }, + { + "epoch": 15.159869494290374, + "grad_norm": 0.0009482292225584388, + "learning_rate": 0.00016805605623859492, + "loss": 0.0044, + "num_input_tokens_seen": 200545024, + "step": 92930 + }, + { + "epoch": 15.16068515497553, + "grad_norm": 0.0010794708505272865, + "learning_rate": 0.0001680028290450436, + "loss": 0.0026, + "num_input_tokens_seen": 200557152, + "step": 92935 + }, + { + "epoch": 15.161500815660686, + "grad_norm": 0.03286047279834747, + "learning_rate": 0.00016794960857983583, + "loss": 0.0014, + "num_input_tokens_seen": 200566944, + "step": 92940 + }, + { + "epoch": 15.16231647634584, + "grad_norm": 0.00029404115048237145, + "learning_rate": 0.00016789639484405077, + "loss": 0.0022, + "num_input_tokens_seen": 200577184, + "step": 92945 + }, + { + "epoch": 15.163132137030995, + "grad_norm": 0.0005150421638973057, + "learning_rate": 0.00016784318783876623, + "loss": 0.0033, + "num_input_tokens_seen": 200588160, + "step": 92950 + }, + { + "epoch": 15.16394779771615, + "grad_norm": 0.0004306863120291382, + "learning_rate": 0.0001677899875650612, + "loss": 0.0047, + "num_input_tokens_seen": 200598272, + "step": 92955 + }, + { + "epoch": 15.164763458401305, + "grad_norm": 0.0010557628702372313, + "learning_rate": 0.00016773679402401321, + "loss": 0.0094, + "num_input_tokens_seen": 200609088, + "step": 92960 + }, + { + "epoch": 15.16557911908646, + "grad_norm": 0.00013803858018945903, + "learning_rate": 0.0001676836072167009, + "loss": 0.0037, + "num_input_tokens_seen": 200619200, + "step": 92965 + }, + { + "epoch": 15.166394779771615, + "grad_norm": 0.012267783284187317, + "learning_rate": 0.0001676304271442015, + "loss": 0.0133, + "num_input_tokens_seen": 200630048, + "step": 92970 + }, + { + "epoch": 15.16721044045677, + "grad_norm": 0.01013946533203125, + "learning_rate": 0.00016757725380759354, + "loss": 0.0023, + "num_input_tokens_seen": 200641696, + "step": 92975 + }, + { + "epoch": 15.168026101141924, + "grad_norm": 0.004624026827514172, + "learning_rate": 0.00016752408720795386, + "loss": 0.0022, + "num_input_tokens_seen": 200652928, + "step": 92980 + }, + { + "epoch": 15.16884176182708, + "grad_norm": 0.004595502745360136, + "learning_rate": 0.00016747092734636067, + "loss": 0.0755, + "num_input_tokens_seen": 200664608, + "step": 92985 + }, + { + "epoch": 15.169657422512234, + "grad_norm": 0.08409766107797623, + "learning_rate": 0.0001674177742238906, + "loss": 0.0074, + "num_input_tokens_seen": 200675008, + "step": 92990 + }, + { + "epoch": 15.17047308319739, + "grad_norm": 0.004185281693935394, + "learning_rate": 0.0001673646278416215, + "loss": 0.0003, + "num_input_tokens_seen": 200685728, + "step": 92995 + }, + { + "epoch": 15.171288743882545, + "grad_norm": 0.09289959073066711, + "learning_rate": 0.00016731148820063013, + "loss": 0.0058, + "num_input_tokens_seen": 200696608, + "step": 93000 + }, + { + "epoch": 15.1721044045677, + "grad_norm": 0.0028860215097665787, + "learning_rate": 0.00016725835530199352, + "loss": 0.0004, + "num_input_tokens_seen": 200706848, + "step": 93005 + }, + { + "epoch": 15.172920065252855, + "grad_norm": 0.002232042606920004, + "learning_rate": 0.00016720522914678843, + "loss": 0.0017, + "num_input_tokens_seen": 200719232, + "step": 93010 + }, + { + "epoch": 15.173735725938009, + "grad_norm": 0.007030048407614231, + "learning_rate": 0.00016715210973609158, + "loss": 0.0004, + "num_input_tokens_seen": 200729920, + "step": 93015 + }, + { + "epoch": 15.174551386623165, + "grad_norm": 0.0005376629414968193, + "learning_rate": 0.00016709899707097948, + "loss": 0.0002, + "num_input_tokens_seen": 200740640, + "step": 93020 + }, + { + "epoch": 15.17536704730832, + "grad_norm": 0.0007648960454389453, + "learning_rate": 0.0001670458911525285, + "loss": 0.0108, + "num_input_tokens_seen": 200751136, + "step": 93025 + }, + { + "epoch": 15.176182707993474, + "grad_norm": 0.00277718435972929, + "learning_rate": 0.00016699279198181493, + "loss": 0.0042, + "num_input_tokens_seen": 200762208, + "step": 93030 + }, + { + "epoch": 15.17699836867863, + "grad_norm": 0.0069837747141718864, + "learning_rate": 0.00016693969955991483, + "loss": 0.0107, + "num_input_tokens_seen": 200773824, + "step": 93035 + }, + { + "epoch": 15.177814029363784, + "grad_norm": 0.0027692194562405348, + "learning_rate": 0.00016688661388790434, + "loss": 0.0007, + "num_input_tokens_seen": 200784736, + "step": 93040 + }, + { + "epoch": 15.17862969004894, + "grad_norm": 0.05169246345758438, + "learning_rate": 0.00016683353496685895, + "loss": 0.008, + "num_input_tokens_seen": 200794784, + "step": 93045 + }, + { + "epoch": 15.179445350734095, + "grad_norm": 0.017291145399212837, + "learning_rate": 0.00016678046279785497, + "loss": 0.0078, + "num_input_tokens_seen": 200805440, + "step": 93050 + }, + { + "epoch": 15.18026101141925, + "grad_norm": 0.027523567900061607, + "learning_rate": 0.00016672739738196734, + "loss": 0.0011, + "num_input_tokens_seen": 200817024, + "step": 93055 + }, + { + "epoch": 15.181076672104405, + "grad_norm": 0.1617918759584427, + "learning_rate": 0.0001666743387202721, + "loss": 0.0069, + "num_input_tokens_seen": 200827648, + "step": 93060 + }, + { + "epoch": 15.181892332789559, + "grad_norm": 0.0018692822195589542, + "learning_rate": 0.00016662128681384388, + "loss": 0.0024, + "num_input_tokens_seen": 200838496, + "step": 93065 + }, + { + "epoch": 15.182707993474715, + "grad_norm": 0.0005498597165569663, + "learning_rate": 0.00016656824166375855, + "loss": 0.0033, + "num_input_tokens_seen": 200849408, + "step": 93070 + }, + { + "epoch": 15.18352365415987, + "grad_norm": 0.001064909272827208, + "learning_rate": 0.0001665152032710905, + "loss": 0.0028, + "num_input_tokens_seen": 200861152, + "step": 93075 + }, + { + "epoch": 15.184339314845024, + "grad_norm": 0.01103004440665245, + "learning_rate": 0.0001664621716369152, + "loss": 0.0035, + "num_input_tokens_seen": 200871872, + "step": 93080 + }, + { + "epoch": 15.18515497553018, + "grad_norm": 0.00013367646897677332, + "learning_rate": 0.00016640914676230677, + "loss": 0.0013, + "num_input_tokens_seen": 200881856, + "step": 93085 + }, + { + "epoch": 15.185970636215334, + "grad_norm": 0.040610380470752716, + "learning_rate": 0.00016635612864834048, + "loss": 0.0009, + "num_input_tokens_seen": 200892448, + "step": 93090 + }, + { + "epoch": 15.18678629690049, + "grad_norm": 0.03554153069853783, + "learning_rate": 0.00016630311729609026, + "loss": 0.001, + "num_input_tokens_seen": 200903904, + "step": 93095 + }, + { + "epoch": 15.187601957585644, + "grad_norm": 0.03929486125707626, + "learning_rate": 0.00016625011270663098, + "loss": 0.0055, + "num_input_tokens_seen": 200913248, + "step": 93100 + }, + { + "epoch": 15.1884176182708, + "grad_norm": 0.028232203796505928, + "learning_rate": 0.00016619711488103622, + "loss": 0.0032, + "num_input_tokens_seen": 200924832, + "step": 93105 + }, + { + "epoch": 15.189233278955955, + "grad_norm": 0.0004619164683390409, + "learning_rate": 0.0001661441238203807, + "loss": 0.068, + "num_input_tokens_seen": 200935584, + "step": 93110 + }, + { + "epoch": 15.190048939641109, + "grad_norm": 0.003007930237799883, + "learning_rate": 0.00016609113952573774, + "loss": 0.0021, + "num_input_tokens_seen": 200945312, + "step": 93115 + }, + { + "epoch": 15.190864600326265, + "grad_norm": 0.0004413631104398519, + "learning_rate": 0.0001660381619981817, + "loss": 0.0066, + "num_input_tokens_seen": 200957216, + "step": 93120 + }, + { + "epoch": 15.191680261011419, + "grad_norm": 0.009486415423452854, + "learning_rate": 0.0001659851912387857, + "loss": 0.0011, + "num_input_tokens_seen": 200968288, + "step": 93125 + }, + { + "epoch": 15.192495921696574, + "grad_norm": 0.0014938893727958202, + "learning_rate": 0.00016593222724862366, + "loss": 0.0008, + "num_input_tokens_seen": 200979680, + "step": 93130 + }, + { + "epoch": 15.19331158238173, + "grad_norm": 0.3065096437931061, + "learning_rate": 0.0001658792700287689, + "loss": 0.0057, + "num_input_tokens_seen": 200989536, + "step": 93135 + }, + { + "epoch": 15.194127243066884, + "grad_norm": 0.000833876256365329, + "learning_rate": 0.00016582631958029454, + "loss": 0.0016, + "num_input_tokens_seen": 201000256, + "step": 93140 + }, + { + "epoch": 15.19494290375204, + "grad_norm": 0.003296643728390336, + "learning_rate": 0.00016577337590427372, + "loss": 0.0103, + "num_input_tokens_seen": 201011360, + "step": 93145 + }, + { + "epoch": 15.195758564437194, + "grad_norm": 0.0006008744239807129, + "learning_rate": 0.00016572043900177946, + "loss": 0.0017, + "num_input_tokens_seen": 201021600, + "step": 93150 + }, + { + "epoch": 15.19657422512235, + "grad_norm": 0.17754621803760529, + "learning_rate": 0.0001656675088738846, + "loss": 0.0027, + "num_input_tokens_seen": 201032320, + "step": 93155 + }, + { + "epoch": 15.197389885807505, + "grad_norm": 0.08714406192302704, + "learning_rate": 0.00016561458552166174, + "loss": 0.0022, + "num_input_tokens_seen": 201043456, + "step": 93160 + }, + { + "epoch": 15.198205546492659, + "grad_norm": 0.002343852072954178, + "learning_rate": 0.00016556166894618352, + "loss": 0.001, + "num_input_tokens_seen": 201054432, + "step": 93165 + }, + { + "epoch": 15.199021207177815, + "grad_norm": 0.00033321298542432487, + "learning_rate": 0.00016550875914852237, + "loss": 0.0006, + "num_input_tokens_seen": 201066112, + "step": 93170 + }, + { + "epoch": 15.199836867862969, + "grad_norm": 0.20812541246414185, + "learning_rate": 0.00016545585612975051, + "loss": 0.0068, + "num_input_tokens_seen": 201076608, + "step": 93175 + }, + { + "epoch": 15.200652528548124, + "grad_norm": 0.001858398667536676, + "learning_rate": 0.00016540295989094018, + "loss": 0.0004, + "num_input_tokens_seen": 201087488, + "step": 93180 + }, + { + "epoch": 15.201468189233278, + "grad_norm": 0.030841004103422165, + "learning_rate": 0.0001653500704331633, + "loss": 0.0025, + "num_input_tokens_seen": 201098208, + "step": 93185 + }, + { + "epoch": 15.202283849918434, + "grad_norm": 0.0010774884140118957, + "learning_rate": 0.0001652971877574916, + "loss": 0.0015, + "num_input_tokens_seen": 201109504, + "step": 93190 + }, + { + "epoch": 15.20309951060359, + "grad_norm": 0.00043485729838721454, + "learning_rate": 0.00016524431186499733, + "loss": 0.0012, + "num_input_tokens_seen": 201118720, + "step": 93195 + }, + { + "epoch": 15.203915171288743, + "grad_norm": 0.002032163320109248, + "learning_rate": 0.0001651914427567514, + "loss": 0.0007, + "num_input_tokens_seen": 201129024, + "step": 93200 + }, + { + "epoch": 15.2047308319739, + "grad_norm": 0.003500643651932478, + "learning_rate": 0.000165138580433826, + "loss": 0.1232, + "num_input_tokens_seen": 201140608, + "step": 93205 + }, + { + "epoch": 15.205546492659053, + "grad_norm": 0.0007357989088632166, + "learning_rate": 0.00016508572489729172, + "loss": 0.0011, + "num_input_tokens_seen": 201150944, + "step": 93210 + }, + { + "epoch": 15.206362153344209, + "grad_norm": 0.000951760564930737, + "learning_rate": 0.00016503287614822042, + "loss": 0.0002, + "num_input_tokens_seen": 201162400, + "step": 93215 + }, + { + "epoch": 15.207177814029365, + "grad_norm": 0.05167536064982414, + "learning_rate": 0.00016498003418768248, + "loss": 0.001, + "num_input_tokens_seen": 201173088, + "step": 93220 + }, + { + "epoch": 15.207993474714518, + "grad_norm": 0.000307158799842, + "learning_rate": 0.00016492719901674947, + "loss": 0.0008, + "num_input_tokens_seen": 201183808, + "step": 93225 + }, + { + "epoch": 15.208809135399674, + "grad_norm": 0.002535782754421234, + "learning_rate": 0.00016487437063649152, + "loss": 0.0018, + "num_input_tokens_seen": 201194400, + "step": 93230 + }, + { + "epoch": 15.209624796084828, + "grad_norm": 0.0018659787019714713, + "learning_rate": 0.00016482154904797974, + "loss": 0.006, + "num_input_tokens_seen": 201204640, + "step": 93235 + }, + { + "epoch": 15.210440456769984, + "grad_norm": 0.00476057967171073, + "learning_rate": 0.0001647687342522845, + "loss": 0.007, + "num_input_tokens_seen": 201216000, + "step": 93240 + }, + { + "epoch": 15.21125611745514, + "grad_norm": 0.015132073312997818, + "learning_rate": 0.00016471592625047615, + "loss": 0.0009, + "num_input_tokens_seen": 201227360, + "step": 93245 + }, + { + "epoch": 15.212071778140293, + "grad_norm": 0.0007798481383360922, + "learning_rate": 0.00016466312504362485, + "loss": 0.0017, + "num_input_tokens_seen": 201238624, + "step": 93250 + }, + { + "epoch": 15.21288743882545, + "grad_norm": 0.02861042320728302, + "learning_rate": 0.00016461033063280074, + "loss": 0.0015, + "num_input_tokens_seen": 201249632, + "step": 93255 + }, + { + "epoch": 15.213703099510603, + "grad_norm": 0.070248082280159, + "learning_rate": 0.00016455754301907376, + "loss": 0.003, + "num_input_tokens_seen": 201259872, + "step": 93260 + }, + { + "epoch": 15.214518760195759, + "grad_norm": 0.006880658678710461, + "learning_rate": 0.00016450476220351368, + "loss": 0.029, + "num_input_tokens_seen": 201270272, + "step": 93265 + }, + { + "epoch": 15.215334420880913, + "grad_norm": 0.01666405238211155, + "learning_rate": 0.00016445198818719025, + "loss": 0.004, + "num_input_tokens_seen": 201280960, + "step": 93270 + }, + { + "epoch": 15.216150081566068, + "grad_norm": 0.2639254331588745, + "learning_rate": 0.00016439922097117294, + "loss": 0.0059, + "num_input_tokens_seen": 201291488, + "step": 93275 + }, + { + "epoch": 15.216965742251224, + "grad_norm": 0.004900115076452494, + "learning_rate": 0.00016434646055653112, + "loss": 0.0011, + "num_input_tokens_seen": 201302304, + "step": 93280 + }, + { + "epoch": 15.217781402936378, + "grad_norm": 0.001026731450110674, + "learning_rate": 0.0001642937069443341, + "loss": 0.0015, + "num_input_tokens_seen": 201313760, + "step": 93285 + }, + { + "epoch": 15.218597063621534, + "grad_norm": 0.03798031061887741, + "learning_rate": 0.00016424096013565098, + "loss": 0.0017, + "num_input_tokens_seen": 201324384, + "step": 93290 + }, + { + "epoch": 15.219412724306688, + "grad_norm": 0.20006714761257172, + "learning_rate": 0.00016418822013155077, + "loss": 0.0055, + "num_input_tokens_seen": 201336000, + "step": 93295 + }, + { + "epoch": 15.220228384991843, + "grad_norm": 0.007183899637311697, + "learning_rate": 0.00016413548693310225, + "loss": 0.1662, + "num_input_tokens_seen": 201346752, + "step": 93300 + }, + { + "epoch": 15.221044045676999, + "grad_norm": 0.0017325283261016011, + "learning_rate": 0.00016408276054137417, + "loss": 0.0014, + "num_input_tokens_seen": 201356352, + "step": 93305 + }, + { + "epoch": 15.221859706362153, + "grad_norm": 0.004359804559499025, + "learning_rate": 0.00016403004095743513, + "loss": 0.0015, + "num_input_tokens_seen": 201367840, + "step": 93310 + }, + { + "epoch": 15.222675367047309, + "grad_norm": 0.015633394941687584, + "learning_rate": 0.00016397732818235344, + "loss": 0.0033, + "num_input_tokens_seen": 201378400, + "step": 93315 + }, + { + "epoch": 15.223491027732463, + "grad_norm": 0.005570504814386368, + "learning_rate": 0.0001639246222171975, + "loss": 0.0047, + "num_input_tokens_seen": 201388672, + "step": 93320 + }, + { + "epoch": 15.224306688417618, + "grad_norm": 0.3404058516025543, + "learning_rate": 0.0001638719230630355, + "loss": 0.125, + "num_input_tokens_seen": 201399648, + "step": 93325 + }, + { + "epoch": 15.225122349102774, + "grad_norm": 0.0017075024079531431, + "learning_rate": 0.0001638192307209353, + "loss": 0.0025, + "num_input_tokens_seen": 201410208, + "step": 93330 + }, + { + "epoch": 15.225938009787928, + "grad_norm": 0.05883980169892311, + "learning_rate": 0.00016376654519196477, + "loss": 0.0034, + "num_input_tokens_seen": 201420512, + "step": 93335 + }, + { + "epoch": 15.226753670473084, + "grad_norm": 0.01556406356394291, + "learning_rate": 0.00016371386647719182, + "loss": 0.0007, + "num_input_tokens_seen": 201430944, + "step": 93340 + }, + { + "epoch": 15.227569331158238, + "grad_norm": 0.02667573280632496, + "learning_rate": 0.00016366119457768407, + "loss": 0.0012, + "num_input_tokens_seen": 201440544, + "step": 93345 + }, + { + "epoch": 15.228384991843393, + "grad_norm": 0.005003976169973612, + "learning_rate": 0.00016360852949450882, + "loss": 0.0005, + "num_input_tokens_seen": 201451232, + "step": 93350 + }, + { + "epoch": 15.229200652528547, + "grad_norm": 0.006092370022088289, + "learning_rate": 0.00016355587122873349, + "loss": 0.0013, + "num_input_tokens_seen": 201461984, + "step": 93355 + }, + { + "epoch": 15.230016313213703, + "grad_norm": 0.02079516462981701, + "learning_rate": 0.00016350321978142525, + "loss": 0.0076, + "num_input_tokens_seen": 201473504, + "step": 93360 + }, + { + "epoch": 15.230831973898859, + "grad_norm": 0.00022696665837429464, + "learning_rate": 0.00016345057515365115, + "loss": 0.0002, + "num_input_tokens_seen": 201484480, + "step": 93365 + }, + { + "epoch": 15.231647634584013, + "grad_norm": 0.00019448397506494075, + "learning_rate": 0.00016339793734647807, + "loss": 0.0015, + "num_input_tokens_seen": 201495104, + "step": 93370 + }, + { + "epoch": 15.232463295269168, + "grad_norm": 0.008671200834214687, + "learning_rate": 0.00016334530636097277, + "loss": 0.0038, + "num_input_tokens_seen": 201506784, + "step": 93375 + }, + { + "epoch": 15.233278955954322, + "grad_norm": 0.007006536703556776, + "learning_rate": 0.00016329268219820192, + "loss": 0.0021, + "num_input_tokens_seen": 201517952, + "step": 93380 + }, + { + "epoch": 15.234094616639478, + "grad_norm": 0.007296527270227671, + "learning_rate": 0.00016324006485923204, + "loss": 0.0021, + "num_input_tokens_seen": 201528800, + "step": 93385 + }, + { + "epoch": 15.234910277324634, + "grad_norm": 0.0007603623671457171, + "learning_rate": 0.00016318745434512944, + "loss": 0.0004, + "num_input_tokens_seen": 201539840, + "step": 93390 + }, + { + "epoch": 15.235725938009788, + "grad_norm": 0.015286428853869438, + "learning_rate": 0.00016313485065696037, + "loss": 0.0046, + "num_input_tokens_seen": 201550816, + "step": 93395 + }, + { + "epoch": 15.236541598694943, + "grad_norm": 0.0007182045956142247, + "learning_rate": 0.00016308225379579088, + "loss": 0.0024, + "num_input_tokens_seen": 201560288, + "step": 93400 + }, + { + "epoch": 15.237357259380097, + "grad_norm": 0.03309705853462219, + "learning_rate": 0.0001630296637626869, + "loss": 0.0058, + "num_input_tokens_seen": 201571264, + "step": 93405 + }, + { + "epoch": 15.238172920065253, + "grad_norm": 0.8415988087654114, + "learning_rate": 0.0001629770805587143, + "loss": 0.0119, + "num_input_tokens_seen": 201582368, + "step": 93410 + }, + { + "epoch": 15.238988580750409, + "grad_norm": 0.001985365292057395, + "learning_rate": 0.0001629245041849387, + "loss": 0.0011, + "num_input_tokens_seen": 201595392, + "step": 93415 + }, + { + "epoch": 15.239804241435563, + "grad_norm": 0.001465521869249642, + "learning_rate": 0.0001628719346424256, + "loss": 0.0004, + "num_input_tokens_seen": 201605568, + "step": 93420 + }, + { + "epoch": 15.240619902120718, + "grad_norm": 0.0037078065797686577, + "learning_rate": 0.00016281937193224051, + "loss": 0.0009, + "num_input_tokens_seen": 201616000, + "step": 93425 + }, + { + "epoch": 15.241435562805872, + "grad_norm": 0.04821900650858879, + "learning_rate": 0.0001627668160554485, + "loss": 0.0005, + "num_input_tokens_seen": 201626880, + "step": 93430 + }, + { + "epoch": 15.242251223491028, + "grad_norm": 0.0006225058459676802, + "learning_rate": 0.00016271426701311483, + "loss": 0.0003, + "num_input_tokens_seen": 201637888, + "step": 93435 + }, + { + "epoch": 15.243066884176184, + "grad_norm": 0.0009561703773215413, + "learning_rate": 0.00016266172480630436, + "loss": 0.001, + "num_input_tokens_seen": 201648960, + "step": 93440 + }, + { + "epoch": 15.243882544861338, + "grad_norm": 0.011829622089862823, + "learning_rate": 0.0001626091894360819, + "loss": 0.0081, + "num_input_tokens_seen": 201660544, + "step": 93445 + }, + { + "epoch": 15.244698205546493, + "grad_norm": 0.011700176633894444, + "learning_rate": 0.00016255666090351245, + "loss": 0.0006, + "num_input_tokens_seen": 201672416, + "step": 93450 + }, + { + "epoch": 15.245513866231647, + "grad_norm": 0.0050372895784676075, + "learning_rate": 0.00016250413920966013, + "loss": 0.0004, + "num_input_tokens_seen": 201683040, + "step": 93455 + }, + { + "epoch": 15.246329526916803, + "grad_norm": 0.004345127381384373, + "learning_rate": 0.0001624516243555898, + "loss": 0.0016, + "num_input_tokens_seen": 201695104, + "step": 93460 + }, + { + "epoch": 15.247145187601957, + "grad_norm": 0.0676548182964325, + "learning_rate": 0.00016239911634236527, + "loss": 0.0016, + "num_input_tokens_seen": 201704928, + "step": 93465 + }, + { + "epoch": 15.247960848287113, + "grad_norm": 0.06143611669540405, + "learning_rate": 0.00016234661517105115, + "loss": 0.0007, + "num_input_tokens_seen": 201716000, + "step": 93470 + }, + { + "epoch": 15.248776508972268, + "grad_norm": 0.003116594161838293, + "learning_rate": 0.00016229412084271095, + "loss": 0.0005, + "num_input_tokens_seen": 201726016, + "step": 93475 + }, + { + "epoch": 15.249592169657422, + "grad_norm": 0.001520035439170897, + "learning_rate": 0.00016224163335840897, + "loss": 0.0152, + "num_input_tokens_seen": 201736032, + "step": 93480 + }, + { + "epoch": 15.250407830342578, + "grad_norm": 0.02137434296309948, + "learning_rate": 0.00016218915271920875, + "loss": 0.0033, + "num_input_tokens_seen": 201746592, + "step": 93485 + }, + { + "epoch": 15.251223491027732, + "grad_norm": 0.00017357330943923444, + "learning_rate": 0.00016213667892617394, + "loss": 0.0031, + "num_input_tokens_seen": 201756768, + "step": 93490 + }, + { + "epoch": 15.252039151712887, + "grad_norm": 0.00151347229257226, + "learning_rate": 0.00016208421198036789, + "loss": 0.0004, + "num_input_tokens_seen": 201768672, + "step": 93495 + }, + { + "epoch": 15.252854812398043, + "grad_norm": 0.04724891483783722, + "learning_rate": 0.00016203175188285397, + "loss": 0.001, + "num_input_tokens_seen": 201779200, + "step": 93500 + }, + { + "epoch": 15.253670473083197, + "grad_norm": 0.010680549778044224, + "learning_rate": 0.00016197929863469534, + "loss": 0.0007, + "num_input_tokens_seen": 201790016, + "step": 93505 + }, + { + "epoch": 15.254486133768353, + "grad_norm": 0.00023476929345633835, + "learning_rate": 0.0001619268522369551, + "loss": 0.0004, + "num_input_tokens_seen": 201800736, + "step": 93510 + }, + { + "epoch": 15.255301794453507, + "grad_norm": 0.0031387237831950188, + "learning_rate": 0.00016187441269069596, + "loss": 0.0086, + "num_input_tokens_seen": 201811808, + "step": 93515 + }, + { + "epoch": 15.256117455138662, + "grad_norm": 0.032479893416166306, + "learning_rate": 0.00016182197999698084, + "loss": 0.0014, + "num_input_tokens_seen": 201822624, + "step": 93520 + }, + { + "epoch": 15.256933115823816, + "grad_norm": 0.00027064234018325806, + "learning_rate": 0.00016176955415687233, + "loss": 0.0009, + "num_input_tokens_seen": 201833696, + "step": 93525 + }, + { + "epoch": 15.257748776508972, + "grad_norm": 0.0028211779426783323, + "learning_rate": 0.00016171713517143288, + "loss": 0.0007, + "num_input_tokens_seen": 201844640, + "step": 93530 + }, + { + "epoch": 15.258564437194128, + "grad_norm": 0.0010961872758343816, + "learning_rate": 0.0001616647230417248, + "loss": 0.0008, + "num_input_tokens_seen": 201855520, + "step": 93535 + }, + { + "epoch": 15.259380097879282, + "grad_norm": 0.0017527317395433784, + "learning_rate": 0.0001616123177688103, + "loss": 0.0004, + "num_input_tokens_seen": 201864352, + "step": 93540 + }, + { + "epoch": 15.260195758564437, + "grad_norm": 0.001248628250323236, + "learning_rate": 0.00016155991935375147, + "loss": 0.0004, + "num_input_tokens_seen": 201874752, + "step": 93545 + }, + { + "epoch": 15.261011419249591, + "grad_norm": 0.003141733119264245, + "learning_rate": 0.00016150752779761008, + "loss": 0.0024, + "num_input_tokens_seen": 201885664, + "step": 93550 + }, + { + "epoch": 15.261827079934747, + "grad_norm": 0.00034613831667229533, + "learning_rate": 0.00016145514310144838, + "loss": 0.0489, + "num_input_tokens_seen": 201898112, + "step": 93555 + }, + { + "epoch": 15.262642740619903, + "grad_norm": 0.005879873409867287, + "learning_rate": 0.0001614027652663273, + "loss": 0.0012, + "num_input_tokens_seen": 201909312, + "step": 93560 + }, + { + "epoch": 15.263458401305057, + "grad_norm": 0.007127746473997831, + "learning_rate": 0.00016135039429330912, + "loss": 0.0111, + "num_input_tokens_seen": 201921216, + "step": 93565 + }, + { + "epoch": 15.264274061990212, + "grad_norm": 0.0010337174171581864, + "learning_rate": 0.0001612980301834544, + "loss": 0.0031, + "num_input_tokens_seen": 201931104, + "step": 93570 + }, + { + "epoch": 15.265089722675366, + "grad_norm": 0.0005627694190479815, + "learning_rate": 0.00016124567293782517, + "loss": 0.0005, + "num_input_tokens_seen": 201940928, + "step": 93575 + }, + { + "epoch": 15.265905383360522, + "grad_norm": 0.7606173157691956, + "learning_rate": 0.00016119332255748177, + "loss": 0.0158, + "num_input_tokens_seen": 201952352, + "step": 93580 + }, + { + "epoch": 15.266721044045678, + "grad_norm": 0.00025212913169525564, + "learning_rate": 0.0001611409790434858, + "loss": 0.0032, + "num_input_tokens_seen": 201963552, + "step": 93585 + }, + { + "epoch": 15.267536704730832, + "grad_norm": 0.0038731670938432217, + "learning_rate": 0.00016108864239689746, + "loss": 0.0004, + "num_input_tokens_seen": 201974240, + "step": 93590 + }, + { + "epoch": 15.268352365415987, + "grad_norm": 0.01029552798718214, + "learning_rate": 0.00016103631261877799, + "loss": 0.0036, + "num_input_tokens_seen": 201983328, + "step": 93595 + }, + { + "epoch": 15.269168026101141, + "grad_norm": 0.008990789763629436, + "learning_rate": 0.0001609839897101874, + "loss": 0.0234, + "num_input_tokens_seen": 201993824, + "step": 93600 + }, + { + "epoch": 15.269983686786297, + "grad_norm": 0.04266885295510292, + "learning_rate": 0.00016093167367218665, + "loss": 0.0015, + "num_input_tokens_seen": 202004384, + "step": 93605 + }, + { + "epoch": 15.270799347471453, + "grad_norm": 0.00022902233467902988, + "learning_rate": 0.0001608793645058353, + "loss": 0.0016, + "num_input_tokens_seen": 202014592, + "step": 93610 + }, + { + "epoch": 15.271615008156607, + "grad_norm": 0.0003972389386035502, + "learning_rate": 0.0001608270622121942, + "loss": 0.0008, + "num_input_tokens_seen": 202026336, + "step": 93615 + }, + { + "epoch": 15.272430668841762, + "grad_norm": 0.0016705964226275682, + "learning_rate": 0.00016077476679232262, + "loss": 0.0034, + "num_input_tokens_seen": 202037760, + "step": 93620 + }, + { + "epoch": 15.273246329526916, + "grad_norm": 0.01417429931461811, + "learning_rate": 0.00016072247824728086, + "loss": 0.0357, + "num_input_tokens_seen": 202048576, + "step": 93625 + }, + { + "epoch": 15.274061990212072, + "grad_norm": 0.0030304461251944304, + "learning_rate": 0.00016067019657812852, + "loss": 0.0003, + "num_input_tokens_seen": 202059104, + "step": 93630 + }, + { + "epoch": 15.274877650897226, + "grad_norm": 0.29061275720596313, + "learning_rate": 0.0001606179217859251, + "loss": 0.0063, + "num_input_tokens_seen": 202070688, + "step": 93635 + }, + { + "epoch": 15.275693311582382, + "grad_norm": 0.028309106826782227, + "learning_rate": 0.00016056565387173005, + "loss": 0.0013, + "num_input_tokens_seen": 202080352, + "step": 93640 + }, + { + "epoch": 15.276508972267537, + "grad_norm": 0.002880884800106287, + "learning_rate": 0.0001605133928366026, + "loss": 0.0043, + "num_input_tokens_seen": 202089440, + "step": 93645 + }, + { + "epoch": 15.277324632952691, + "grad_norm": 0.004692520946264267, + "learning_rate": 0.00016046113868160194, + "loss": 0.002, + "num_input_tokens_seen": 202099840, + "step": 93650 + }, + { + "epoch": 15.278140293637847, + "grad_norm": 0.0003875213151331991, + "learning_rate": 0.00016040889140778703, + "loss": 0.0003, + "num_input_tokens_seen": 202111040, + "step": 93655 + }, + { + "epoch": 15.278955954323001, + "grad_norm": 0.0016542058438062668, + "learning_rate": 0.00016035665101621672, + "loss": 0.0011, + "num_input_tokens_seen": 202121760, + "step": 93660 + }, + { + "epoch": 15.279771615008157, + "grad_norm": 0.002035087440162897, + "learning_rate": 0.00016030441750794976, + "loss": 0.0013, + "num_input_tokens_seen": 202132352, + "step": 93665 + }, + { + "epoch": 15.280587275693312, + "grad_norm": 0.10745411366224289, + "learning_rate": 0.00016025219088404468, + "loss": 0.0042, + "num_input_tokens_seen": 202143840, + "step": 93670 + }, + { + "epoch": 15.281402936378466, + "grad_norm": 0.0002862142282538116, + "learning_rate": 0.00016019997114555983, + "loss": 0.0015, + "num_input_tokens_seen": 202154720, + "step": 93675 + }, + { + "epoch": 15.282218597063622, + "grad_norm": 0.0013032876886427402, + "learning_rate": 0.000160147758293554, + "loss": 0.0004, + "num_input_tokens_seen": 202165536, + "step": 93680 + }, + { + "epoch": 15.283034257748776, + "grad_norm": 0.00022320564312394708, + "learning_rate": 0.00016009555232908456, + "loss": 0.0011, + "num_input_tokens_seen": 202176128, + "step": 93685 + }, + { + "epoch": 15.283849918433932, + "grad_norm": 0.009374073706567287, + "learning_rate": 0.00016004335325321033, + "loss": 0.0005, + "num_input_tokens_seen": 202187072, + "step": 93690 + }, + { + "epoch": 15.284665579119087, + "grad_norm": 0.003811866044998169, + "learning_rate": 0.00015999116106698848, + "loss": 0.0013, + "num_input_tokens_seen": 202198144, + "step": 93695 + }, + { + "epoch": 15.285481239804241, + "grad_norm": 1.7324323654174805, + "learning_rate": 0.0001599389757714774, + "loss": 0.0428, + "num_input_tokens_seen": 202208576, + "step": 93700 + }, + { + "epoch": 15.286296900489397, + "grad_norm": 0.0034322733990848064, + "learning_rate": 0.0001598867973677341, + "loss": 0.0192, + "num_input_tokens_seen": 202219232, + "step": 93705 + }, + { + "epoch": 15.28711256117455, + "grad_norm": 0.045030951499938965, + "learning_rate": 0.00015983462585681657, + "loss": 0.0043, + "num_input_tokens_seen": 202229024, + "step": 93710 + }, + { + "epoch": 15.287928221859707, + "grad_norm": 0.00919533334672451, + "learning_rate": 0.00015978246123978158, + "loss": 0.0007, + "num_input_tokens_seen": 202239040, + "step": 93715 + }, + { + "epoch": 15.28874388254486, + "grad_norm": 0.035612527281045914, + "learning_rate": 0.0001597303035176869, + "loss": 0.0006, + "num_input_tokens_seen": 202250400, + "step": 93720 + }, + { + "epoch": 15.289559543230016, + "grad_norm": 0.0002653069677762687, + "learning_rate": 0.00015967815269158904, + "loss": 0.0003, + "num_input_tokens_seen": 202260960, + "step": 93725 + }, + { + "epoch": 15.290375203915172, + "grad_norm": 0.0003630772407632321, + "learning_rate": 0.0001596260087625454, + "loss": 0.0005, + "num_input_tokens_seen": 202272448, + "step": 93730 + }, + { + "epoch": 15.291190864600326, + "grad_norm": 0.013220174238085747, + "learning_rate": 0.0001595738717316122, + "loss": 0.0009, + "num_input_tokens_seen": 202282336, + "step": 93735 + }, + { + "epoch": 15.292006525285482, + "grad_norm": 0.12450747191905975, + "learning_rate": 0.00015952174159984667, + "loss": 0.0024, + "num_input_tokens_seen": 202293888, + "step": 93740 + }, + { + "epoch": 15.292822185970635, + "grad_norm": 0.00042232818668708205, + "learning_rate": 0.0001594696183683046, + "loss": 0.0016, + "num_input_tokens_seen": 202303040, + "step": 93745 + }, + { + "epoch": 15.293637846655791, + "grad_norm": 0.006470768246799707, + "learning_rate": 0.00015941750203804305, + "loss": 0.0013, + "num_input_tokens_seen": 202313536, + "step": 93750 + }, + { + "epoch": 15.294453507340947, + "grad_norm": 0.026989320293068886, + "learning_rate": 0.0001593653926101176, + "loss": 0.0023, + "num_input_tokens_seen": 202324480, + "step": 93755 + }, + { + "epoch": 15.2952691680261, + "grad_norm": 0.07732109725475311, + "learning_rate": 0.00015931329008558477, + "loss": 0.002, + "num_input_tokens_seen": 202334176, + "step": 93760 + }, + { + "epoch": 15.296084828711257, + "grad_norm": 0.5032738447189331, + "learning_rate": 0.00015926119446550024, + "loss": 0.021, + "num_input_tokens_seen": 202344480, + "step": 93765 + }, + { + "epoch": 15.29690048939641, + "grad_norm": 0.00029917771462351084, + "learning_rate": 0.0001592091057509199, + "loss": 0.0022, + "num_input_tokens_seen": 202355712, + "step": 93770 + }, + { + "epoch": 15.297716150081566, + "grad_norm": 0.00032637015101499856, + "learning_rate": 0.00015915702394289933, + "loss": 0.0122, + "num_input_tokens_seen": 202366880, + "step": 93775 + }, + { + "epoch": 15.298531810766722, + "grad_norm": 0.05940993130207062, + "learning_rate": 0.00015910494904249411, + "loss": 0.0022, + "num_input_tokens_seen": 202377984, + "step": 93780 + }, + { + "epoch": 15.299347471451876, + "grad_norm": 0.012801729142665863, + "learning_rate": 0.0001590528810507595, + "loss": 0.0048, + "num_input_tokens_seen": 202387328, + "step": 93785 + }, + { + "epoch": 15.300163132137031, + "grad_norm": 0.018743107095360756, + "learning_rate": 0.00015900081996875082, + "loss": 0.0433, + "num_input_tokens_seen": 202398304, + "step": 93790 + }, + { + "epoch": 15.300978792822185, + "grad_norm": 0.06855087727308273, + "learning_rate": 0.0001589487657975231, + "loss": 0.0026, + "num_input_tokens_seen": 202408000, + "step": 93795 + }, + { + "epoch": 15.301794453507341, + "grad_norm": 0.17360533773899078, + "learning_rate": 0.00015889671853813126, + "loss": 0.0537, + "num_input_tokens_seen": 202419424, + "step": 93800 + }, + { + "epoch": 15.302610114192497, + "grad_norm": 0.0019473297288641334, + "learning_rate": 0.0001588446781916302, + "loss": 0.003, + "num_input_tokens_seen": 202430912, + "step": 93805 + }, + { + "epoch": 15.30342577487765, + "grad_norm": 0.029717862606048584, + "learning_rate": 0.00015879264475907447, + "loss": 0.0019, + "num_input_tokens_seen": 202442368, + "step": 93810 + }, + { + "epoch": 15.304241435562806, + "grad_norm": 0.0039007309824228287, + "learning_rate": 0.00015874061824151865, + "loss": 0.0015, + "num_input_tokens_seen": 202453376, + "step": 93815 + }, + { + "epoch": 15.30505709624796, + "grad_norm": 0.0038800856564193964, + "learning_rate": 0.00015868859864001693, + "loss": 0.0011, + "num_input_tokens_seen": 202462816, + "step": 93820 + }, + { + "epoch": 15.305872756933116, + "grad_norm": 0.007253418676555157, + "learning_rate": 0.00015863658595562414, + "loss": 0.0092, + "num_input_tokens_seen": 202473856, + "step": 93825 + }, + { + "epoch": 15.30668841761827, + "grad_norm": 0.031155981123447418, + "learning_rate": 0.00015858458018939365, + "loss": 0.0209, + "num_input_tokens_seen": 202486304, + "step": 93830 + }, + { + "epoch": 15.307504078303426, + "grad_norm": 0.05359480530023575, + "learning_rate": 0.00015853258134238007, + "loss": 0.0142, + "num_input_tokens_seen": 202496800, + "step": 93835 + }, + { + "epoch": 15.308319738988581, + "grad_norm": 0.00028071398264728487, + "learning_rate": 0.0001584805894156366, + "loss": 0.0005, + "num_input_tokens_seen": 202507648, + "step": 93840 + }, + { + "epoch": 15.309135399673735, + "grad_norm": 0.0002165523765143007, + "learning_rate": 0.0001584286044102175, + "loss": 0.0036, + "num_input_tokens_seen": 202518592, + "step": 93845 + }, + { + "epoch": 15.309951060358891, + "grad_norm": 0.07421204447746277, + "learning_rate": 0.00015837662632717575, + "loss": 0.0025, + "num_input_tokens_seen": 202528928, + "step": 93850 + }, + { + "epoch": 15.310766721044045, + "grad_norm": 0.0023259257432073355, + "learning_rate": 0.00015832465516756538, + "loss": 0.0014, + "num_input_tokens_seen": 202538720, + "step": 93855 + }, + { + "epoch": 15.3115823817292, + "grad_norm": 0.054206643253564835, + "learning_rate": 0.00015827269093243902, + "loss": 0.0021, + "num_input_tokens_seen": 202549696, + "step": 93860 + }, + { + "epoch": 15.312398042414356, + "grad_norm": 0.0013781872112303972, + "learning_rate": 0.0001582207336228504, + "loss": 0.0035, + "num_input_tokens_seen": 202559520, + "step": 93865 + }, + { + "epoch": 15.31321370309951, + "grad_norm": 0.0036758456844836473, + "learning_rate": 0.00015816878323985184, + "loss": 0.0045, + "num_input_tokens_seen": 202568832, + "step": 93870 + }, + { + "epoch": 15.314029363784666, + "grad_norm": 0.005146983079612255, + "learning_rate": 0.0001581168397844967, + "loss": 0.0013, + "num_input_tokens_seen": 202578912, + "step": 93875 + }, + { + "epoch": 15.31484502446982, + "grad_norm": 0.0047431825660169125, + "learning_rate": 0.0001580649032578375, + "loss": 0.001, + "num_input_tokens_seen": 202589632, + "step": 93880 + }, + { + "epoch": 15.315660685154976, + "grad_norm": 0.016939133405685425, + "learning_rate": 0.00015801297366092689, + "loss": 0.0043, + "num_input_tokens_seen": 202600256, + "step": 93885 + }, + { + "epoch": 15.31647634584013, + "grad_norm": 0.0036439832765609026, + "learning_rate": 0.00015796105099481712, + "loss": 0.0006, + "num_input_tokens_seen": 202610976, + "step": 93890 + }, + { + "epoch": 15.317292006525285, + "grad_norm": 0.022857867181301117, + "learning_rate": 0.00015790913526056061, + "loss": 0.0106, + "num_input_tokens_seen": 202621600, + "step": 93895 + }, + { + "epoch": 15.318107667210441, + "grad_norm": 0.0002249942917842418, + "learning_rate": 0.00015785722645920942, + "loss": 0.0095, + "num_input_tokens_seen": 202633120, + "step": 93900 + }, + { + "epoch": 15.318923327895595, + "grad_norm": 0.04333154857158661, + "learning_rate": 0.00015780532459181557, + "loss": 0.002, + "num_input_tokens_seen": 202643904, + "step": 93905 + }, + { + "epoch": 15.31973898858075, + "grad_norm": 0.0002128455089405179, + "learning_rate": 0.00015775342965943095, + "loss": 0.0509, + "num_input_tokens_seen": 202654944, + "step": 93910 + }, + { + "epoch": 15.320554649265905, + "grad_norm": 0.026830563321709633, + "learning_rate": 0.00015770154166310724, + "loss": 0.0011, + "num_input_tokens_seen": 202665120, + "step": 93915 + }, + { + "epoch": 15.32137030995106, + "grad_norm": 0.08911074697971344, + "learning_rate": 0.00015764966060389602, + "loss": 0.0024, + "num_input_tokens_seen": 202676640, + "step": 93920 + }, + { + "epoch": 15.322185970636216, + "grad_norm": 0.000359884841600433, + "learning_rate": 0.00015759778648284873, + "loss": 0.0006, + "num_input_tokens_seen": 202687744, + "step": 93925 + }, + { + "epoch": 15.32300163132137, + "grad_norm": 0.0005611114320345223, + "learning_rate": 0.00015754591930101664, + "loss": 0.0036, + "num_input_tokens_seen": 202698208, + "step": 93930 + }, + { + "epoch": 15.323817292006526, + "grad_norm": 0.0035494170151650906, + "learning_rate": 0.00015749405905945095, + "loss": 0.0072, + "num_input_tokens_seen": 202708576, + "step": 93935 + }, + { + "epoch": 15.32463295269168, + "grad_norm": 0.6403558254241943, + "learning_rate": 0.00015744220575920266, + "loss": 0.0078, + "num_input_tokens_seen": 202718816, + "step": 93940 + }, + { + "epoch": 15.325448613376835, + "grad_norm": 0.0022993856109678745, + "learning_rate": 0.00015739035940132262, + "loss": 0.0074, + "num_input_tokens_seen": 202728608, + "step": 93945 + }, + { + "epoch": 15.326264274061991, + "grad_norm": 0.0014836600748822093, + "learning_rate": 0.0001573385199868616, + "loss": 0.0018, + "num_input_tokens_seen": 202738912, + "step": 93950 + }, + { + "epoch": 15.327079934747145, + "grad_norm": 0.0010945778340101242, + "learning_rate": 0.00015728668751687015, + "loss": 0.0039, + "num_input_tokens_seen": 202748736, + "step": 93955 + }, + { + "epoch": 15.3278955954323, + "grad_norm": 0.0007776871789246798, + "learning_rate": 0.00015723486199239878, + "loss": 0.1654, + "num_input_tokens_seen": 202760416, + "step": 93960 + }, + { + "epoch": 15.328711256117455, + "grad_norm": 0.0006008428754284978, + "learning_rate": 0.00015718304341449759, + "loss": 0.0026, + "num_input_tokens_seen": 202771456, + "step": 93965 + }, + { + "epoch": 15.32952691680261, + "grad_norm": 0.1772693544626236, + "learning_rate": 0.00015713123178421717, + "loss": 0.0135, + "num_input_tokens_seen": 202782880, + "step": 93970 + }, + { + "epoch": 15.330342577487766, + "grad_norm": 0.003044270211830735, + "learning_rate": 0.00015707942710260704, + "loss": 0.0008, + "num_input_tokens_seen": 202793824, + "step": 93975 + }, + { + "epoch": 15.33115823817292, + "grad_norm": 0.04174460470676422, + "learning_rate": 0.00015702762937071747, + "loss": 0.0009, + "num_input_tokens_seen": 202803808, + "step": 93980 + }, + { + "epoch": 15.331973898858076, + "grad_norm": 0.009711637161672115, + "learning_rate": 0.00015697583858959813, + "loss": 0.0015, + "num_input_tokens_seen": 202813664, + "step": 93985 + }, + { + "epoch": 15.33278955954323, + "grad_norm": 0.72083979845047, + "learning_rate": 0.00015692405476029853, + "loss": 0.0209, + "num_input_tokens_seen": 202824576, + "step": 93990 + }, + { + "epoch": 15.333605220228385, + "grad_norm": 0.0011434814659878612, + "learning_rate": 0.00015687227788386822, + "loss": 0.0009, + "num_input_tokens_seen": 202835104, + "step": 93995 + }, + { + "epoch": 15.33442088091354, + "grad_norm": 0.0002050265611615032, + "learning_rate": 0.00015682050796135644, + "loss": 0.0009, + "num_input_tokens_seen": 202844704, + "step": 94000 + }, + { + "epoch": 15.335236541598695, + "grad_norm": 0.0006181810749694705, + "learning_rate": 0.0001567687449938125, + "loss": 0.0083, + "num_input_tokens_seen": 202855584, + "step": 94005 + }, + { + "epoch": 15.33605220228385, + "grad_norm": 0.0010810698149725795, + "learning_rate": 0.0001567169889822853, + "loss": 0.004, + "num_input_tokens_seen": 202865248, + "step": 94010 + }, + { + "epoch": 15.336867862969005, + "grad_norm": 0.6307497024536133, + "learning_rate": 0.00015666523992782384, + "loss": 0.1248, + "num_input_tokens_seen": 202875936, + "step": 94015 + }, + { + "epoch": 15.33768352365416, + "grad_norm": 0.14658840000629425, + "learning_rate": 0.00015661349783147678, + "loss": 0.0039, + "num_input_tokens_seen": 202887456, + "step": 94020 + }, + { + "epoch": 15.338499184339314, + "grad_norm": 0.0003474289842415601, + "learning_rate": 0.00015656176269429283, + "loss": 0.0004, + "num_input_tokens_seen": 202898240, + "step": 94025 + }, + { + "epoch": 15.33931484502447, + "grad_norm": 0.14753980934619904, + "learning_rate": 0.00015651003451732048, + "loss": 0.0027, + "num_input_tokens_seen": 202909824, + "step": 94030 + }, + { + "epoch": 15.340130505709626, + "grad_norm": 0.07215742021799088, + "learning_rate": 0.00015645831330160804, + "loss": 0.0012, + "num_input_tokens_seen": 202920032, + "step": 94035 + }, + { + "epoch": 15.34094616639478, + "grad_norm": 0.0002563298330642283, + "learning_rate": 0.00015640659904820364, + "loss": 0.0008, + "num_input_tokens_seen": 202931456, + "step": 94040 + }, + { + "epoch": 15.341761827079935, + "grad_norm": 0.006094956770539284, + "learning_rate": 0.00015635489175815537, + "loss": 0.0006, + "num_input_tokens_seen": 202940640, + "step": 94045 + }, + { + "epoch": 15.34257748776509, + "grad_norm": 0.04791932553052902, + "learning_rate": 0.0001563031914325112, + "loss": 0.0047, + "num_input_tokens_seen": 202951360, + "step": 94050 + }, + { + "epoch": 15.343393148450245, + "grad_norm": 0.2458181530237198, + "learning_rate": 0.00015625149807231892, + "loss": 0.0094, + "num_input_tokens_seen": 202962400, + "step": 94055 + }, + { + "epoch": 15.3442088091354, + "grad_norm": 0.02346714213490486, + "learning_rate": 0.00015619981167862602, + "loss": 0.0025, + "num_input_tokens_seen": 202973568, + "step": 94060 + }, + { + "epoch": 15.345024469820554, + "grad_norm": 0.017123041674494743, + "learning_rate": 0.00015614813225248015, + "loss": 0.0434, + "num_input_tokens_seen": 202984896, + "step": 94065 + }, + { + "epoch": 15.34584013050571, + "grad_norm": 0.01915080100297928, + "learning_rate": 0.00015609645979492855, + "loss": 0.0013, + "num_input_tokens_seen": 202995072, + "step": 94070 + }, + { + "epoch": 15.346655791190864, + "grad_norm": 0.01844211481511593, + "learning_rate": 0.00015604479430701845, + "loss": 0.0012, + "num_input_tokens_seen": 203006528, + "step": 94075 + }, + { + "epoch": 15.34747145187602, + "grad_norm": 0.02673475630581379, + "learning_rate": 0.00015599313578979696, + "loss": 0.0019, + "num_input_tokens_seen": 203016320, + "step": 94080 + }, + { + "epoch": 15.348287112561174, + "grad_norm": 0.24581356346607208, + "learning_rate": 0.00015594148424431076, + "loss": 0.0139, + "num_input_tokens_seen": 203026272, + "step": 94085 + }, + { + "epoch": 15.34910277324633, + "grad_norm": 0.0010242423741146922, + "learning_rate": 0.00015588983967160724, + "loss": 0.0005, + "num_input_tokens_seen": 203036736, + "step": 94090 + }, + { + "epoch": 15.349918433931485, + "grad_norm": 0.0005519519909285009, + "learning_rate": 0.0001558382020727323, + "loss": 0.0033, + "num_input_tokens_seen": 203046048, + "step": 94095 + }, + { + "epoch": 15.350734094616639, + "grad_norm": 0.08401108533143997, + "learning_rate": 0.00015578657144873316, + "loss": 0.0215, + "num_input_tokens_seen": 203053952, + "step": 94100 + }, + { + "epoch": 15.351549755301795, + "grad_norm": 0.8340625762939453, + "learning_rate": 0.00015573494780065543, + "loss": 0.1, + "num_input_tokens_seen": 203064768, + "step": 94105 + }, + { + "epoch": 15.352365415986949, + "grad_norm": 0.039435237646102905, + "learning_rate": 0.00015568333112954592, + "loss": 0.0048, + "num_input_tokens_seen": 203075712, + "step": 94110 + }, + { + "epoch": 15.353181076672104, + "grad_norm": 0.00025253635249100626, + "learning_rate": 0.00015563172143645044, + "loss": 0.0008, + "num_input_tokens_seen": 203087168, + "step": 94115 + }, + { + "epoch": 15.35399673735726, + "grad_norm": 0.0023859934881329536, + "learning_rate": 0.00015558011872241506, + "loss": 0.0025, + "num_input_tokens_seen": 203096384, + "step": 94120 + }, + { + "epoch": 15.354812398042414, + "grad_norm": 0.0007968974532559514, + "learning_rate": 0.00015552852298848546, + "loss": 0.0047, + "num_input_tokens_seen": 203107328, + "step": 94125 + }, + { + "epoch": 15.35562805872757, + "grad_norm": 0.003953108098357916, + "learning_rate": 0.00015547693423570736, + "loss": 0.0009, + "num_input_tokens_seen": 203117792, + "step": 94130 + }, + { + "epoch": 15.356443719412724, + "grad_norm": 0.0017384809907525778, + "learning_rate": 0.00015542535246512623, + "loss": 0.0106, + "num_input_tokens_seen": 203128288, + "step": 94135 + }, + { + "epoch": 15.35725938009788, + "grad_norm": 0.014929509721696377, + "learning_rate": 0.00015537377767778742, + "loss": 0.0201, + "num_input_tokens_seen": 203138560, + "step": 94140 + }, + { + "epoch": 15.358075040783035, + "grad_norm": 0.0015721704112365842, + "learning_rate": 0.00015532220987473627, + "loss": 0.0008, + "num_input_tokens_seen": 203150208, + "step": 94145 + }, + { + "epoch": 15.358890701468189, + "grad_norm": 0.0008454607450403273, + "learning_rate": 0.00015527064905701776, + "loss": 0.0014, + "num_input_tokens_seen": 203161152, + "step": 94150 + }, + { + "epoch": 15.359706362153345, + "grad_norm": 0.8442980647087097, + "learning_rate": 0.00015521909522567685, + "loss": 0.1078, + "num_input_tokens_seen": 203171264, + "step": 94155 + }, + { + "epoch": 15.360522022838499, + "grad_norm": 0.0005026679718866944, + "learning_rate": 0.0001551675483817584, + "loss": 0.0025, + "num_input_tokens_seen": 203183360, + "step": 94160 + }, + { + "epoch": 15.361337683523654, + "grad_norm": 0.5801403522491455, + "learning_rate": 0.00015511600852630698, + "loss": 0.0165, + "num_input_tokens_seen": 203195168, + "step": 94165 + }, + { + "epoch": 15.362153344208808, + "grad_norm": 0.004484944045543671, + "learning_rate": 0.0001550644756603672, + "loss": 0.0012, + "num_input_tokens_seen": 203206240, + "step": 94170 + }, + { + "epoch": 15.362969004893964, + "grad_norm": 0.03226489946246147, + "learning_rate": 0.00015501294978498344, + "loss": 0.0024, + "num_input_tokens_seen": 203217088, + "step": 94175 + }, + { + "epoch": 15.36378466557912, + "grad_norm": 0.1310596615076065, + "learning_rate": 0.0001549614309011998, + "loss": 0.0034, + "num_input_tokens_seen": 203227648, + "step": 94180 + }, + { + "epoch": 15.364600326264274, + "grad_norm": 0.003728811861947179, + "learning_rate": 0.00015490991901006052, + "loss": 0.001, + "num_input_tokens_seen": 203238240, + "step": 94185 + }, + { + "epoch": 15.36541598694943, + "grad_norm": 0.0003246025589760393, + "learning_rate": 0.00015485841411260937, + "loss": 0.0021, + "num_input_tokens_seen": 203249312, + "step": 94190 + }, + { + "epoch": 15.366231647634583, + "grad_norm": 0.028267741203308105, + "learning_rate": 0.00015480691620989062, + "loss": 0.0017, + "num_input_tokens_seen": 203260320, + "step": 94195 + }, + { + "epoch": 15.367047308319739, + "grad_norm": 0.007097567431628704, + "learning_rate": 0.00015475542530294728, + "loss": 0.0007, + "num_input_tokens_seen": 203270528, + "step": 94200 + }, + { + "epoch": 15.367862969004895, + "grad_norm": 0.23857952654361725, + "learning_rate": 0.00015470394139282357, + "loss": 0.0065, + "num_input_tokens_seen": 203282272, + "step": 94205 + }, + { + "epoch": 15.368678629690049, + "grad_norm": 0.0013912966242060065, + "learning_rate": 0.0001546524644805622, + "loss": 0.0009, + "num_input_tokens_seen": 203292672, + "step": 94210 + }, + { + "epoch": 15.369494290375204, + "grad_norm": 0.013908866792917252, + "learning_rate": 0.00015460099456720706, + "loss": 0.0439, + "num_input_tokens_seen": 203303520, + "step": 94215 + }, + { + "epoch": 15.370309951060358, + "grad_norm": 0.2309672236442566, + "learning_rate": 0.0001545495316538006, + "loss": 0.0181, + "num_input_tokens_seen": 203315232, + "step": 94220 + }, + { + "epoch": 15.371125611745514, + "grad_norm": 0.0006903464673087001, + "learning_rate": 0.0001544980757413864, + "loss": 0.0007, + "num_input_tokens_seen": 203326464, + "step": 94225 + }, + { + "epoch": 15.37194127243067, + "grad_norm": 0.0017863567918539047, + "learning_rate": 0.00015444662683100676, + "loss": 0.0007, + "num_input_tokens_seen": 203338688, + "step": 94230 + }, + { + "epoch": 15.372756933115824, + "grad_norm": 0.0010759421857073903, + "learning_rate": 0.00015439518492370486, + "loss": 0.0007, + "num_input_tokens_seen": 203350272, + "step": 94235 + }, + { + "epoch": 15.37357259380098, + "grad_norm": 0.0036603561602532864, + "learning_rate": 0.00015434375002052264, + "loss": 0.0005, + "num_input_tokens_seen": 203361504, + "step": 94240 + }, + { + "epoch": 15.374388254486133, + "grad_norm": 0.08018328249454498, + "learning_rate": 0.00015429232212250317, + "loss": 0.0019, + "num_input_tokens_seen": 203371424, + "step": 94245 + }, + { + "epoch": 15.375203915171289, + "grad_norm": 0.0005420346860773861, + "learning_rate": 0.00015424090123068802, + "loss": 0.0013, + "num_input_tokens_seen": 203382368, + "step": 94250 + }, + { + "epoch": 15.376019575856443, + "grad_norm": 0.007118335459381342, + "learning_rate": 0.00015418948734611976, + "loss": 0.0015, + "num_input_tokens_seen": 203392544, + "step": 94255 + }, + { + "epoch": 15.376835236541599, + "grad_norm": 0.004161753226071596, + "learning_rate": 0.0001541380804698403, + "loss": 0.0003, + "num_input_tokens_seen": 203403072, + "step": 94260 + }, + { + "epoch": 15.377650897226754, + "grad_norm": 0.012935330159962177, + "learning_rate": 0.00015408668060289132, + "loss": 0.0006, + "num_input_tokens_seen": 203412640, + "step": 94265 + }, + { + "epoch": 15.378466557911908, + "grad_norm": 0.0010322239249944687, + "learning_rate": 0.00015403528774631463, + "loss": 0.0005, + "num_input_tokens_seen": 203422784, + "step": 94270 + }, + { + "epoch": 15.379282218597064, + "grad_norm": 0.04167158156633377, + "learning_rate": 0.00015398390190115175, + "loss": 0.002, + "num_input_tokens_seen": 203434880, + "step": 94275 + }, + { + "epoch": 15.380097879282218, + "grad_norm": 0.0022990144789218903, + "learning_rate": 0.00015393252306844402, + "loss": 0.0025, + "num_input_tokens_seen": 203446144, + "step": 94280 + }, + { + "epoch": 15.380913539967374, + "grad_norm": 0.0003192865988239646, + "learning_rate": 0.00015388115124923267, + "loss": 0.0006, + "num_input_tokens_seen": 203455712, + "step": 94285 + }, + { + "epoch": 15.38172920065253, + "grad_norm": 0.25969845056533813, + "learning_rate": 0.00015382978644455896, + "loss": 0.017, + "num_input_tokens_seen": 203466112, + "step": 94290 + }, + { + "epoch": 15.382544861337683, + "grad_norm": 0.002667571883648634, + "learning_rate": 0.00015377842865546372, + "loss": 0.001, + "num_input_tokens_seen": 203477408, + "step": 94295 + }, + { + "epoch": 15.383360522022839, + "grad_norm": 0.0008327610557898879, + "learning_rate": 0.0001537270778829879, + "loss": 0.001, + "num_input_tokens_seen": 203487264, + "step": 94300 + }, + { + "epoch": 15.384176182707993, + "grad_norm": 0.001264519989490509, + "learning_rate": 0.00015367573412817186, + "loss": 0.0004, + "num_input_tokens_seen": 203498112, + "step": 94305 + }, + { + "epoch": 15.384991843393149, + "grad_norm": 0.00017284897330682725, + "learning_rate": 0.0001536243973920568, + "loss": 0.0002, + "num_input_tokens_seen": 203509184, + "step": 94310 + }, + { + "epoch": 15.385807504078304, + "grad_norm": 0.00023437404888682067, + "learning_rate": 0.00015357306767568242, + "loss": 0.0012, + "num_input_tokens_seen": 203519616, + "step": 94315 + }, + { + "epoch": 15.386623164763458, + "grad_norm": 0.00020692349062301219, + "learning_rate": 0.00015352174498008963, + "loss": 0.0004, + "num_input_tokens_seen": 203530272, + "step": 94320 + }, + { + "epoch": 15.387438825448614, + "grad_norm": 0.007811365183442831, + "learning_rate": 0.00015347042930631788, + "loss": 0.0006, + "num_input_tokens_seen": 203540992, + "step": 94325 + }, + { + "epoch": 15.388254486133768, + "grad_norm": 0.0009075101697817445, + "learning_rate": 0.0001534191206554078, + "loss": 0.0028, + "num_input_tokens_seen": 203549824, + "step": 94330 + }, + { + "epoch": 15.389070146818923, + "grad_norm": 0.007936549372971058, + "learning_rate": 0.00015336781902839858, + "loss": 0.0158, + "num_input_tokens_seen": 203561024, + "step": 94335 + }, + { + "epoch": 15.38988580750408, + "grad_norm": 0.0017092369962483644, + "learning_rate": 0.00015331652442633053, + "loss": 0.0008, + "num_input_tokens_seen": 203571712, + "step": 94340 + }, + { + "epoch": 15.390701468189233, + "grad_norm": 0.0014804316451773047, + "learning_rate": 0.00015326523685024263, + "loss": 0.001, + "num_input_tokens_seen": 203582432, + "step": 94345 + }, + { + "epoch": 15.391517128874389, + "grad_norm": 3.3359551429748535, + "learning_rate": 0.0001532139563011749, + "loss": 0.05, + "num_input_tokens_seen": 203593632, + "step": 94350 + }, + { + "epoch": 15.392332789559543, + "grad_norm": 0.0009180537308566272, + "learning_rate": 0.00015316268278016594, + "loss": 0.0016, + "num_input_tokens_seen": 203605216, + "step": 94355 + }, + { + "epoch": 15.393148450244698, + "grad_norm": 0.00029028579592704773, + "learning_rate": 0.00015311141628825554, + "loss": 0.0017, + "num_input_tokens_seen": 203616192, + "step": 94360 + }, + { + "epoch": 15.393964110929852, + "grad_norm": 0.023014061152935028, + "learning_rate": 0.000153060156826482, + "loss": 0.0016, + "num_input_tokens_seen": 203627072, + "step": 94365 + }, + { + "epoch": 15.394779771615008, + "grad_norm": 0.09935973584651947, + "learning_rate": 0.0001530089043958849, + "loss": 0.0075, + "num_input_tokens_seen": 203638400, + "step": 94370 + }, + { + "epoch": 15.395595432300164, + "grad_norm": 0.00018579568131826818, + "learning_rate": 0.00015295765899750214, + "loss": 0.0019, + "num_input_tokens_seen": 203649088, + "step": 94375 + }, + { + "epoch": 15.396411092985318, + "grad_norm": 0.0017579933628439903, + "learning_rate": 0.00015290642063237302, + "loss": 0.0033, + "num_input_tokens_seen": 203659744, + "step": 94380 + }, + { + "epoch": 15.397226753670473, + "grad_norm": 0.002796456217765808, + "learning_rate": 0.0001528551893015353, + "loss": 0.0002, + "num_input_tokens_seen": 203670816, + "step": 94385 + }, + { + "epoch": 15.398042414355627, + "grad_norm": 0.0005302283097989857, + "learning_rate": 0.00015280396500602783, + "loss": 0.0013, + "num_input_tokens_seen": 203681984, + "step": 94390 + }, + { + "epoch": 15.398858075040783, + "grad_norm": 0.0008628990035504103, + "learning_rate": 0.00015275274774688817, + "loss": 0.0011, + "num_input_tokens_seen": 203692096, + "step": 94395 + }, + { + "epoch": 15.399673735725939, + "grad_norm": 0.006129425950348377, + "learning_rate": 0.00015270153752515474, + "loss": 0.0026, + "num_input_tokens_seen": 203704064, + "step": 94400 + }, + { + "epoch": 15.400489396411093, + "grad_norm": 0.015153203159570694, + "learning_rate": 0.00015265033434186525, + "loss": 0.0256, + "num_input_tokens_seen": 203715104, + "step": 94405 + }, + { + "epoch": 15.401305057096248, + "grad_norm": 0.2985181510448456, + "learning_rate": 0.00015259913819805736, + "loss": 0.0942, + "num_input_tokens_seen": 203726496, + "step": 94410 + }, + { + "epoch": 15.402120717781402, + "grad_norm": 0.0015327197033911943, + "learning_rate": 0.0001525479490947687, + "loss": 0.0015, + "num_input_tokens_seen": 203737504, + "step": 94415 + }, + { + "epoch": 15.402936378466558, + "grad_norm": 0.0662129670381546, + "learning_rate": 0.00015249676703303654, + "loss": 0.0074, + "num_input_tokens_seen": 203747648, + "step": 94420 + }, + { + "epoch": 15.403752039151712, + "grad_norm": 0.0024822934065014124, + "learning_rate": 0.0001524455920138983, + "loss": 0.0013, + "num_input_tokens_seen": 203759936, + "step": 94425 + }, + { + "epoch": 15.404567699836868, + "grad_norm": 0.00033221766352653503, + "learning_rate": 0.00015239442403839105, + "loss": 0.0006, + "num_input_tokens_seen": 203771104, + "step": 94430 + }, + { + "epoch": 15.405383360522023, + "grad_norm": 0.00018794478091876954, + "learning_rate": 0.0001523432631075517, + "loss": 0.0004, + "num_input_tokens_seen": 203782240, + "step": 94435 + }, + { + "epoch": 15.406199021207177, + "grad_norm": 0.0031244803685694933, + "learning_rate": 0.00015229210922241721, + "loss": 0.0197, + "num_input_tokens_seen": 203793120, + "step": 94440 + }, + { + "epoch": 15.407014681892333, + "grad_norm": 0.0006279939552769065, + "learning_rate": 0.0001522409623840242, + "loss": 0.0094, + "num_input_tokens_seen": 203803904, + "step": 94445 + }, + { + "epoch": 15.407830342577487, + "grad_norm": 0.44786572456359863, + "learning_rate": 0.00015218982259340908, + "loss": 0.015, + "num_input_tokens_seen": 203815840, + "step": 94450 + }, + { + "epoch": 15.408646003262643, + "grad_norm": 0.0018651728751137853, + "learning_rate": 0.0001521386898516088, + "loss": 0.0003, + "num_input_tokens_seen": 203826688, + "step": 94455 + }, + { + "epoch": 15.409461663947798, + "grad_norm": 0.010764861479401588, + "learning_rate": 0.0001520875641596589, + "loss": 0.0004, + "num_input_tokens_seen": 203836736, + "step": 94460 + }, + { + "epoch": 15.410277324632952, + "grad_norm": 0.031098827719688416, + "learning_rate": 0.0001520364455185962, + "loss": 0.002, + "num_input_tokens_seen": 203847360, + "step": 94465 + }, + { + "epoch": 15.411092985318108, + "grad_norm": 0.019350305199623108, + "learning_rate": 0.00015198533392945602, + "loss": 0.0041, + "num_input_tokens_seen": 203856960, + "step": 94470 + }, + { + "epoch": 15.411908646003262, + "grad_norm": 0.0005298391915857792, + "learning_rate": 0.00015193422939327488, + "loss": 0.0013, + "num_input_tokens_seen": 203868640, + "step": 94475 + }, + { + "epoch": 15.412724306688418, + "grad_norm": 0.015063675120472908, + "learning_rate": 0.00015188313191108783, + "loss": 0.0359, + "num_input_tokens_seen": 203879872, + "step": 94480 + }, + { + "epoch": 15.413539967373573, + "grad_norm": 0.0005940967239439487, + "learning_rate": 0.00015183204148393103, + "loss": 0.0785, + "num_input_tokens_seen": 203891200, + "step": 94485 + }, + { + "epoch": 15.414355628058727, + "grad_norm": 0.007825308479368687, + "learning_rate": 0.00015178095811283927, + "loss": 0.0007, + "num_input_tokens_seen": 203903264, + "step": 94490 + }, + { + "epoch": 15.415171288743883, + "grad_norm": 0.008388256654143333, + "learning_rate": 0.00015172988179884846, + "loss": 0.0011, + "num_input_tokens_seen": 203914624, + "step": 94495 + }, + { + "epoch": 15.415986949429037, + "grad_norm": 0.014324476942420006, + "learning_rate": 0.0001516788125429931, + "loss": 0.0028, + "num_input_tokens_seen": 203924640, + "step": 94500 + }, + { + "epoch": 15.416802610114193, + "grad_norm": 0.004510627128183842, + "learning_rate": 0.0001516277503463086, + "loss": 0.005, + "num_input_tokens_seen": 203935168, + "step": 94505 + }, + { + "epoch": 15.417618270799348, + "grad_norm": 0.05507749319076538, + "learning_rate": 0.00015157669520982975, + "loss": 0.0043, + "num_input_tokens_seen": 203945600, + "step": 94510 + }, + { + "epoch": 15.418433931484502, + "grad_norm": 0.003136106999590993, + "learning_rate": 0.0001515256471345911, + "loss": 0.0004, + "num_input_tokens_seen": 203957088, + "step": 94515 + }, + { + "epoch": 15.419249592169658, + "grad_norm": 0.02082819491624832, + "learning_rate": 0.00015147460612162733, + "loss": 0.1247, + "num_input_tokens_seen": 203967616, + "step": 94520 + }, + { + "epoch": 15.420065252854812, + "grad_norm": 0.00018456988618709147, + "learning_rate": 0.00015142357217197278, + "loss": 0.0008, + "num_input_tokens_seen": 203979744, + "step": 94525 + }, + { + "epoch": 15.420880913539968, + "grad_norm": 0.00015732820611447096, + "learning_rate": 0.00015137254528666178, + "loss": 0.0006, + "num_input_tokens_seen": 203990560, + "step": 94530 + }, + { + "epoch": 15.421696574225122, + "grad_norm": 0.456883043050766, + "learning_rate": 0.0001513215254667284, + "loss": 0.0117, + "num_input_tokens_seen": 204001472, + "step": 94535 + }, + { + "epoch": 15.422512234910277, + "grad_norm": 0.25487276911735535, + "learning_rate": 0.00015127051271320664, + "loss": 0.0049, + "num_input_tokens_seen": 204011968, + "step": 94540 + }, + { + "epoch": 15.423327895595433, + "grad_norm": 0.0010142537066712976, + "learning_rate": 0.00015121950702713029, + "loss": 0.0005, + "num_input_tokens_seen": 204021472, + "step": 94545 + }, + { + "epoch": 15.424143556280587, + "grad_norm": 0.006787245161831379, + "learning_rate": 0.00015116850840953311, + "loss": 0.0062, + "num_input_tokens_seen": 204032928, + "step": 94550 + }, + { + "epoch": 15.424959216965743, + "grad_norm": 0.0011104794684797525, + "learning_rate": 0.00015111751686144864, + "loss": 0.001, + "num_input_tokens_seen": 204043456, + "step": 94555 + }, + { + "epoch": 15.425774877650896, + "grad_norm": 0.000692363188136369, + "learning_rate": 0.00015106653238391028, + "loss": 0.0003, + "num_input_tokens_seen": 204053984, + "step": 94560 + }, + { + "epoch": 15.426590538336052, + "grad_norm": 0.009335220791399479, + "learning_rate": 0.00015101555497795127, + "loss": 0.0022, + "num_input_tokens_seen": 204064032, + "step": 94565 + }, + { + "epoch": 15.427406199021208, + "grad_norm": 0.011910196393728256, + "learning_rate": 0.00015096458464460482, + "loss": 0.0355, + "num_input_tokens_seen": 204073632, + "step": 94570 + }, + { + "epoch": 15.428221859706362, + "grad_norm": 0.044483378529548645, + "learning_rate": 0.0001509136213849038, + "loss": 0.002, + "num_input_tokens_seen": 204085696, + "step": 94575 + }, + { + "epoch": 15.429037520391518, + "grad_norm": 0.06943567842245102, + "learning_rate": 0.00015086266519988108, + "loss": 0.0018, + "num_input_tokens_seen": 204096992, + "step": 94580 + }, + { + "epoch": 15.429853181076671, + "grad_norm": 0.0023641546722501516, + "learning_rate": 0.00015081171609056937, + "loss": 0.0009, + "num_input_tokens_seen": 204107008, + "step": 94585 + }, + { + "epoch": 15.430668841761827, + "grad_norm": 0.07614665478467941, + "learning_rate": 0.00015076077405800126, + "loss": 0.0031, + "num_input_tokens_seen": 204118208, + "step": 94590 + }, + { + "epoch": 15.431484502446983, + "grad_norm": 0.03599320724606514, + "learning_rate": 0.0001507098391032089, + "loss": 0.0007, + "num_input_tokens_seen": 204128448, + "step": 94595 + }, + { + "epoch": 15.432300163132137, + "grad_norm": 0.004957592114806175, + "learning_rate": 0.00015065891122722507, + "loss": 0.0006, + "num_input_tokens_seen": 204138752, + "step": 94600 + }, + { + "epoch": 15.433115823817293, + "grad_norm": 0.000371763453586027, + "learning_rate": 0.00015060799043108126, + "loss": 0.0016, + "num_input_tokens_seen": 204149632, + "step": 94605 + }, + { + "epoch": 15.433931484502446, + "grad_norm": 0.0005075891967862844, + "learning_rate": 0.00015055707671581008, + "loss": 0.0008, + "num_input_tokens_seen": 204160576, + "step": 94610 + }, + { + "epoch": 15.434747145187602, + "grad_norm": 0.003097269684076309, + "learning_rate": 0.00015050617008244272, + "loss": 0.0004, + "num_input_tokens_seen": 204171744, + "step": 94615 + }, + { + "epoch": 15.435562805872756, + "grad_norm": 0.011300190351903439, + "learning_rate": 0.00015045527053201137, + "loss": 0.0007, + "num_input_tokens_seen": 204182432, + "step": 94620 + }, + { + "epoch": 15.436378466557912, + "grad_norm": 0.010463264770805836, + "learning_rate": 0.00015040437806554735, + "loss": 0.0011, + "num_input_tokens_seen": 204193920, + "step": 94625 + }, + { + "epoch": 15.437194127243067, + "grad_norm": 0.0010106448316946626, + "learning_rate": 0.00015035349268408216, + "loss": 0.0255, + "num_input_tokens_seen": 204203456, + "step": 94630 + }, + { + "epoch": 15.438009787928221, + "grad_norm": 0.011574169620871544, + "learning_rate": 0.00015030261438864694, + "loss": 0.001, + "num_input_tokens_seen": 204213632, + "step": 94635 + }, + { + "epoch": 15.438825448613377, + "grad_norm": 8.737659663893282e-05, + "learning_rate": 0.0001502517431802729, + "loss": 0.0008, + "num_input_tokens_seen": 204221184, + "step": 94640 + }, + { + "epoch": 15.439641109298531, + "grad_norm": 0.0005524197476916015, + "learning_rate": 0.00015020087905999097, + "loss": 0.0097, + "num_input_tokens_seen": 204232640, + "step": 94645 + }, + { + "epoch": 15.440456769983687, + "grad_norm": 0.0008837959612719715, + "learning_rate": 0.00015015002202883193, + "loss": 0.0121, + "num_input_tokens_seen": 204243072, + "step": 94650 + }, + { + "epoch": 15.441272430668842, + "grad_norm": 0.0008121423306874931, + "learning_rate": 0.00015009917208782657, + "loss": 0.003, + "num_input_tokens_seen": 204255104, + "step": 94655 + }, + { + "epoch": 15.442088091353996, + "grad_norm": 0.00011177684064023197, + "learning_rate": 0.00015004832923800533, + "loss": 0.0013, + "num_input_tokens_seen": 204266368, + "step": 94660 + }, + { + "epoch": 15.442903752039152, + "grad_norm": 0.0006088958471082151, + "learning_rate": 0.00014999749348039866, + "loss": 0.0015, + "num_input_tokens_seen": 204278208, + "step": 94665 + }, + { + "epoch": 15.443719412724306, + "grad_norm": 0.008690602146089077, + "learning_rate": 0.0001499466648160368, + "loss": 0.0006, + "num_input_tokens_seen": 204287424, + "step": 94670 + }, + { + "epoch": 15.444535073409462, + "grad_norm": 0.0003137676976621151, + "learning_rate": 0.00014989584324594986, + "loss": 0.0006, + "num_input_tokens_seen": 204298368, + "step": 94675 + }, + { + "epoch": 15.445350734094617, + "grad_norm": 0.01819598861038685, + "learning_rate": 0.00014984502877116773, + "loss": 0.0072, + "num_input_tokens_seen": 204309088, + "step": 94680 + }, + { + "epoch": 15.446166394779771, + "grad_norm": 0.00020848566782660782, + "learning_rate": 0.00014979422139272037, + "loss": 0.0013, + "num_input_tokens_seen": 204318624, + "step": 94685 + }, + { + "epoch": 15.446982055464927, + "grad_norm": 0.0059334831312298775, + "learning_rate": 0.00014974342111163735, + "loss": 0.0006, + "num_input_tokens_seen": 204328064, + "step": 94690 + }, + { + "epoch": 15.447797716150081, + "grad_norm": 0.019166678190231323, + "learning_rate": 0.00014969262792894822, + "loss": 0.0008, + "num_input_tokens_seen": 204339264, + "step": 94695 + }, + { + "epoch": 15.448613376835237, + "grad_norm": 0.006126645021140575, + "learning_rate": 0.0001496418418456824, + "loss": 0.0051, + "num_input_tokens_seen": 204349280, + "step": 94700 + }, + { + "epoch": 15.449429037520392, + "grad_norm": 0.13542130589485168, + "learning_rate": 0.0001495910628628691, + "loss": 0.0072, + "num_input_tokens_seen": 204359232, + "step": 94705 + }, + { + "epoch": 15.450244698205546, + "grad_norm": 0.005986794363707304, + "learning_rate": 0.00014954029098153748, + "loss": 0.0036, + "num_input_tokens_seen": 204370432, + "step": 94710 + }, + { + "epoch": 15.451060358890702, + "grad_norm": 0.008435364812612534, + "learning_rate": 0.00014948952620271643, + "loss": 0.0006, + "num_input_tokens_seen": 204380160, + "step": 94715 + }, + { + "epoch": 15.451876019575856, + "grad_norm": 0.000725329970009625, + "learning_rate": 0.00014943876852743475, + "loss": 0.0162, + "num_input_tokens_seen": 204391168, + "step": 94720 + }, + { + "epoch": 15.452691680261012, + "grad_norm": 0.00016504598897881806, + "learning_rate": 0.00014938801795672102, + "loss": 0.0006, + "num_input_tokens_seen": 204401184, + "step": 94725 + }, + { + "epoch": 15.453507340946166, + "grad_norm": 0.019201790913939476, + "learning_rate": 0.00014933727449160423, + "loss": 0.0005, + "num_input_tokens_seen": 204410656, + "step": 94730 + }, + { + "epoch": 15.454323001631321, + "grad_norm": 0.15058407187461853, + "learning_rate": 0.00014928653813311204, + "loss": 0.0039, + "num_input_tokens_seen": 204421248, + "step": 94735 + }, + { + "epoch": 15.455138662316477, + "grad_norm": 0.00024242799554485828, + "learning_rate": 0.00014923580888227329, + "loss": 0.0004, + "num_input_tokens_seen": 204430976, + "step": 94740 + }, + { + "epoch": 15.455954323001631, + "grad_norm": 0.006361409090459347, + "learning_rate": 0.00014918508674011582, + "loss": 0.0025, + "num_input_tokens_seen": 204440000, + "step": 94745 + }, + { + "epoch": 15.456769983686787, + "grad_norm": 0.0016374423867091537, + "learning_rate": 0.0001491343717076676, + "loss": 0.0004, + "num_input_tokens_seen": 204450016, + "step": 94750 + }, + { + "epoch": 15.45758564437194, + "grad_norm": 0.03306256979703903, + "learning_rate": 0.00014908366378595645, + "loss": 0.002, + "num_input_tokens_seen": 204460288, + "step": 94755 + }, + { + "epoch": 15.458401305057096, + "grad_norm": 0.0036266965325921774, + "learning_rate": 0.00014903296297601, + "loss": 0.0021, + "num_input_tokens_seen": 204471072, + "step": 94760 + }, + { + "epoch": 15.459216965742252, + "grad_norm": 0.001709536649286747, + "learning_rate": 0.00014898226927885584, + "loss": 0.001, + "num_input_tokens_seen": 204482496, + "step": 94765 + }, + { + "epoch": 15.460032626427406, + "grad_norm": 0.02601746656000614, + "learning_rate": 0.00014893158269552127, + "loss": 0.011, + "num_input_tokens_seen": 204493248, + "step": 94770 + }, + { + "epoch": 15.460848287112562, + "grad_norm": 0.00025870028184726834, + "learning_rate": 0.00014888090322703353, + "loss": 0.0837, + "num_input_tokens_seen": 204503904, + "step": 94775 + }, + { + "epoch": 15.461663947797716, + "grad_norm": 0.00029159363475628197, + "learning_rate": 0.00014883023087441965, + "loss": 0.0036, + "num_input_tokens_seen": 204515456, + "step": 94780 + }, + { + "epoch": 15.462479608482871, + "grad_norm": 0.0029122340492904186, + "learning_rate": 0.0001487795656387067, + "loss": 0.0005, + "num_input_tokens_seen": 204525440, + "step": 94785 + }, + { + "epoch": 15.463295269168025, + "grad_norm": 0.0034530076663941145, + "learning_rate": 0.00014872890752092144, + "loss": 0.0303, + "num_input_tokens_seen": 204536448, + "step": 94790 + }, + { + "epoch": 15.464110929853181, + "grad_norm": 0.0003178312035743147, + "learning_rate": 0.00014867825652209045, + "loss": 0.0436, + "num_input_tokens_seen": 204548064, + "step": 94795 + }, + { + "epoch": 15.464926590538337, + "grad_norm": 0.01973351277410984, + "learning_rate": 0.00014862761264324025, + "loss": 0.0031, + "num_input_tokens_seen": 204557760, + "step": 94800 + }, + { + "epoch": 15.46574225122349, + "grad_norm": 0.0007105742115527391, + "learning_rate": 0.00014857697588539727, + "loss": 0.0189, + "num_input_tokens_seen": 204567552, + "step": 94805 + }, + { + "epoch": 15.466557911908646, + "grad_norm": 0.010178121738135815, + "learning_rate": 0.00014852634624958766, + "loss": 0.0005, + "num_input_tokens_seen": 204578944, + "step": 94810 + }, + { + "epoch": 15.4673735725938, + "grad_norm": 0.0017588479677215219, + "learning_rate": 0.00014847572373683749, + "loss": 0.062, + "num_input_tokens_seen": 204589888, + "step": 94815 + }, + { + "epoch": 15.468189233278956, + "grad_norm": 0.006118913181126118, + "learning_rate": 0.00014842510834817274, + "loss": 0.0005, + "num_input_tokens_seen": 204599872, + "step": 94820 + }, + { + "epoch": 15.469004893964112, + "grad_norm": 0.01382782869040966, + "learning_rate": 0.00014837450008461922, + "loss": 0.0011, + "num_input_tokens_seen": 204611744, + "step": 94825 + }, + { + "epoch": 15.469820554649266, + "grad_norm": 0.007575156632810831, + "learning_rate": 0.00014832389894720233, + "loss": 0.0018, + "num_input_tokens_seen": 204622400, + "step": 94830 + }, + { + "epoch": 15.470636215334421, + "grad_norm": 0.00021573905542027205, + "learning_rate": 0.00014827330493694807, + "loss": 0.0006, + "num_input_tokens_seen": 204633664, + "step": 94835 + }, + { + "epoch": 15.471451876019575, + "grad_norm": 0.005726292263716459, + "learning_rate": 0.0001482227180548812, + "loss": 0.0018, + "num_input_tokens_seen": 204644544, + "step": 94840 + }, + { + "epoch": 15.47226753670473, + "grad_norm": 0.001501880120486021, + "learning_rate": 0.00014817213830202748, + "loss": 0.0023, + "num_input_tokens_seen": 204655808, + "step": 94845 + }, + { + "epoch": 15.473083197389887, + "grad_norm": 0.00036702307988889515, + "learning_rate": 0.00014812156567941143, + "loss": 0.0015, + "num_input_tokens_seen": 204666240, + "step": 94850 + }, + { + "epoch": 15.47389885807504, + "grad_norm": 0.0010513016022741795, + "learning_rate": 0.00014807100018805853, + "loss": 0.0021, + "num_input_tokens_seen": 204677440, + "step": 94855 + }, + { + "epoch": 15.474714518760196, + "grad_norm": 0.0019536451436579227, + "learning_rate": 0.00014802044182899294, + "loss": 0.0005, + "num_input_tokens_seen": 204689024, + "step": 94860 + }, + { + "epoch": 15.47553017944535, + "grad_norm": 0.5853731632232666, + "learning_rate": 0.00014796989060323997, + "loss": 0.0093, + "num_input_tokens_seen": 204698528, + "step": 94865 + }, + { + "epoch": 15.476345840130506, + "grad_norm": 0.001399086439050734, + "learning_rate": 0.00014791934651182338, + "loss": 0.0309, + "num_input_tokens_seen": 204710112, + "step": 94870 + }, + { + "epoch": 15.477161500815662, + "grad_norm": 0.0026542171835899353, + "learning_rate": 0.0001478688095557682, + "loss": 0.001, + "num_input_tokens_seen": 204720416, + "step": 94875 + }, + { + "epoch": 15.477977161500815, + "grad_norm": 0.0022510848939418793, + "learning_rate": 0.00014781827973609803, + "loss": 0.0009, + "num_input_tokens_seen": 204731424, + "step": 94880 + }, + { + "epoch": 15.478792822185971, + "grad_norm": 0.0008555944659747183, + "learning_rate": 0.00014776775705383733, + "loss": 0.0112, + "num_input_tokens_seen": 204744096, + "step": 94885 + }, + { + "epoch": 15.479608482871125, + "grad_norm": 0.0006049593794159591, + "learning_rate": 0.00014771724151000986, + "loss": 0.0004, + "num_input_tokens_seen": 204755200, + "step": 94890 + }, + { + "epoch": 15.48042414355628, + "grad_norm": 0.0004984940751455724, + "learning_rate": 0.00014766673310563945, + "loss": 0.0052, + "num_input_tokens_seen": 204766944, + "step": 94895 + }, + { + "epoch": 15.481239804241435, + "grad_norm": 1.3900939226150513, + "learning_rate": 0.0001476162318417496, + "loss": 0.0311, + "num_input_tokens_seen": 204777408, + "step": 94900 + }, + { + "epoch": 15.48205546492659, + "grad_norm": 0.0009828276233747602, + "learning_rate": 0.00014756573771936382, + "loss": 0.0022, + "num_input_tokens_seen": 204789024, + "step": 94905 + }, + { + "epoch": 15.482871125611746, + "grad_norm": 0.00020471001334954053, + "learning_rate": 0.0001475152507395055, + "loss": 0.0012, + "num_input_tokens_seen": 204800736, + "step": 94910 + }, + { + "epoch": 15.4836867862969, + "grad_norm": 0.0009769471362233162, + "learning_rate": 0.00014746477090319781, + "loss": 0.0002, + "num_input_tokens_seen": 204811648, + "step": 94915 + }, + { + "epoch": 15.484502446982056, + "grad_norm": 0.0007823727210052311, + "learning_rate": 0.00014741429821146375, + "loss": 0.0028, + "num_input_tokens_seen": 204821664, + "step": 94920 + }, + { + "epoch": 15.48531810766721, + "grad_norm": 0.006422135978937149, + "learning_rate": 0.00014736383266532622, + "loss": 0.0039, + "num_input_tokens_seen": 204832928, + "step": 94925 + }, + { + "epoch": 15.486133768352365, + "grad_norm": 0.04214267060160637, + "learning_rate": 0.00014731337426580792, + "loss": 0.0064, + "num_input_tokens_seen": 204843488, + "step": 94930 + }, + { + "epoch": 15.486949429037521, + "grad_norm": 0.3059985339641571, + "learning_rate": 0.0001472629230139314, + "loss": 0.0038, + "num_input_tokens_seen": 204854496, + "step": 94935 + }, + { + "epoch": 15.487765089722675, + "grad_norm": 0.0021008693147450686, + "learning_rate": 0.00014721247891071954, + "loss": 0.0237, + "num_input_tokens_seen": 204865536, + "step": 94940 + }, + { + "epoch": 15.48858075040783, + "grad_norm": 0.0011852094903588295, + "learning_rate": 0.00014716204195719396, + "loss": 0.0568, + "num_input_tokens_seen": 204875648, + "step": 94945 + }, + { + "epoch": 15.489396411092985, + "grad_norm": 0.04308982565999031, + "learning_rate": 0.00014711161215437757, + "loss": 0.0009, + "num_input_tokens_seen": 204885888, + "step": 94950 + }, + { + "epoch": 15.49021207177814, + "grad_norm": 0.005106368567794561, + "learning_rate": 0.00014706118950329173, + "loss": 0.001, + "num_input_tokens_seen": 204897440, + "step": 94955 + }, + { + "epoch": 15.491027732463296, + "grad_norm": 0.00030864315340295434, + "learning_rate": 0.00014701077400495894, + "loss": 0.0032, + "num_input_tokens_seen": 204908480, + "step": 94960 + }, + { + "epoch": 15.49184339314845, + "grad_norm": 0.006175698712468147, + "learning_rate": 0.00014696036566040028, + "loss": 0.0344, + "num_input_tokens_seen": 204920736, + "step": 94965 + }, + { + "epoch": 15.492659053833606, + "grad_norm": 0.011650477536022663, + "learning_rate": 0.00014690996447063798, + "loss": 0.0004, + "num_input_tokens_seen": 204931136, + "step": 94970 + }, + { + "epoch": 15.49347471451876, + "grad_norm": 0.0012376405065879226, + "learning_rate": 0.00014685957043669283, + "loss": 0.0076, + "num_input_tokens_seen": 204940352, + "step": 94975 + }, + { + "epoch": 15.494290375203915, + "grad_norm": 0.17368605732917786, + "learning_rate": 0.00014680918355958683, + "loss": 0.0046, + "num_input_tokens_seen": 204950720, + "step": 94980 + }, + { + "epoch": 15.49510603588907, + "grad_norm": 0.261605829000473, + "learning_rate": 0.00014675880384034046, + "loss": 0.0145, + "num_input_tokens_seen": 204960768, + "step": 94985 + }, + { + "epoch": 15.495921696574225, + "grad_norm": 0.0018758515361696482, + "learning_rate": 0.00014670843127997542, + "loss": 0.001, + "num_input_tokens_seen": 204971968, + "step": 94990 + }, + { + "epoch": 15.49673735725938, + "grad_norm": 0.006634041666984558, + "learning_rate": 0.0001466580658795118, + "loss": 0.0015, + "num_input_tokens_seen": 204982688, + "step": 94995 + }, + { + "epoch": 15.497553017944535, + "grad_norm": 0.0037987958639860153, + "learning_rate": 0.00014660770763997105, + "loss": 0.0004, + "num_input_tokens_seen": 204993088, + "step": 95000 + }, + { + "epoch": 15.49836867862969, + "grad_norm": 0.017223723232746124, + "learning_rate": 0.00014655735656237312, + "loss": 0.0009, + "num_input_tokens_seen": 205003872, + "step": 95005 + }, + { + "epoch": 15.499184339314844, + "grad_norm": 0.00012988693197257817, + "learning_rate": 0.00014650701264773907, + "loss": 0.0066, + "num_input_tokens_seen": 205014144, + "step": 95010 + }, + { + "epoch": 15.5, + "grad_norm": 0.0007451057899743319, + "learning_rate": 0.0001464566758970885, + "loss": 0.0005, + "num_input_tokens_seen": 205025440, + "step": 95015 + }, + { + "epoch": 15.500815660685156, + "grad_norm": 0.0003250829176977277, + "learning_rate": 0.00014640634631144206, + "loss": 0.0152, + "num_input_tokens_seen": 205036480, + "step": 95020 + }, + { + "epoch": 15.50163132137031, + "grad_norm": 0.001480034552514553, + "learning_rate": 0.00014635602389181956, + "loss": 0.0018, + "num_input_tokens_seen": 205047488, + "step": 95025 + }, + { + "epoch": 15.502446982055465, + "grad_norm": 0.0002693575224839151, + "learning_rate": 0.00014630570863924088, + "loss": 0.0063, + "num_input_tokens_seen": 205057536, + "step": 95030 + }, + { + "epoch": 15.50326264274062, + "grad_norm": 0.044891107827425, + "learning_rate": 0.0001462554005547257, + "loss": 0.0038, + "num_input_tokens_seen": 205068448, + "step": 95035 + }, + { + "epoch": 15.504078303425775, + "grad_norm": 0.0015609815018251538, + "learning_rate": 0.00014620509963929362, + "loss": 0.0007, + "num_input_tokens_seen": 205079648, + "step": 95040 + }, + { + "epoch": 15.50489396411093, + "grad_norm": 0.06334149092435837, + "learning_rate": 0.00014615480589396396, + "loss": 0.005, + "num_input_tokens_seen": 205089792, + "step": 95045 + }, + { + "epoch": 15.505709624796085, + "grad_norm": 0.0003389062185306102, + "learning_rate": 0.0001461045193197561, + "loss": 0.0004, + "num_input_tokens_seen": 205100096, + "step": 95050 + }, + { + "epoch": 15.50652528548124, + "grad_norm": 0.012134311720728874, + "learning_rate": 0.00014605423991768908, + "loss": 0.0008, + "num_input_tokens_seen": 205110400, + "step": 95055 + }, + { + "epoch": 15.507340946166394, + "grad_norm": 0.0012584009673446417, + "learning_rate": 0.00014600396768878188, + "loss": 0.0022, + "num_input_tokens_seen": 205121344, + "step": 95060 + }, + { + "epoch": 15.50815660685155, + "grad_norm": 0.001435612328350544, + "learning_rate": 0.0001459537026340534, + "loss": 0.0002, + "num_input_tokens_seen": 205132928, + "step": 95065 + }, + { + "epoch": 15.508972267536706, + "grad_norm": 0.09464535117149353, + "learning_rate": 0.0001459034447545222, + "loss": 0.0067, + "num_input_tokens_seen": 205142976, + "step": 95070 + }, + { + "epoch": 15.50978792822186, + "grad_norm": 0.0006718486547470093, + "learning_rate": 0.00014585319405120695, + "loss": 0.0006, + "num_input_tokens_seen": 205153152, + "step": 95075 + }, + { + "epoch": 15.510603588907015, + "grad_norm": 0.0002579309220891446, + "learning_rate": 0.0001458029505251258, + "loss": 0.0514, + "num_input_tokens_seen": 205163200, + "step": 95080 + }, + { + "epoch": 15.51141924959217, + "grad_norm": 0.00032117412774823606, + "learning_rate": 0.0001457527141772975, + "loss": 0.001, + "num_input_tokens_seen": 205173888, + "step": 95085 + }, + { + "epoch": 15.512234910277325, + "grad_norm": 0.048194728791713715, + "learning_rate": 0.00014570248500873963, + "loss": 0.0009, + "num_input_tokens_seen": 205184320, + "step": 95090 + }, + { + "epoch": 15.513050570962479, + "grad_norm": 0.003065476194024086, + "learning_rate": 0.00014565226302047058, + "loss": 0.0005, + "num_input_tokens_seen": 205195104, + "step": 95095 + }, + { + "epoch": 15.513866231647635, + "grad_norm": 0.00019102955411653966, + "learning_rate": 0.00014560204821350764, + "loss": 0.0022, + "num_input_tokens_seen": 205205824, + "step": 95100 + }, + { + "epoch": 15.51468189233279, + "grad_norm": 0.00033780813100747764, + "learning_rate": 0.00014555184058886905, + "loss": 0.0005, + "num_input_tokens_seen": 205217184, + "step": 95105 + }, + { + "epoch": 15.515497553017944, + "grad_norm": 0.0371658056974411, + "learning_rate": 0.00014550164014757183, + "loss": 0.0012, + "num_input_tokens_seen": 205228256, + "step": 95110 + }, + { + "epoch": 15.5163132137031, + "grad_norm": 0.007323721889406443, + "learning_rate": 0.00014545144689063382, + "loss": 0.0005, + "num_input_tokens_seen": 205239648, + "step": 95115 + }, + { + "epoch": 15.517128874388254, + "grad_norm": 0.0010035258019343019, + "learning_rate": 0.0001454012608190718, + "loss": 0.0019, + "num_input_tokens_seen": 205250784, + "step": 95120 + }, + { + "epoch": 15.51794453507341, + "grad_norm": 0.00017560164269525558, + "learning_rate": 0.0001453510819339033, + "loss": 0.0008, + "num_input_tokens_seen": 205260704, + "step": 95125 + }, + { + "epoch": 15.518760195758565, + "grad_norm": 0.0009511786047369242, + "learning_rate": 0.0001453009102361447, + "loss": 0.0004, + "num_input_tokens_seen": 205270112, + "step": 95130 + }, + { + "epoch": 15.51957585644372, + "grad_norm": 0.0003608663100749254, + "learning_rate": 0.0001452507457268135, + "loss": 0.1331, + "num_input_tokens_seen": 205280928, + "step": 95135 + }, + { + "epoch": 15.520391517128875, + "grad_norm": 0.2536231279373169, + "learning_rate": 0.00014520058840692562, + "loss": 0.0132, + "num_input_tokens_seen": 205291488, + "step": 95140 + }, + { + "epoch": 15.521207177814029, + "grad_norm": 0.0001388175442116335, + "learning_rate": 0.00014515043827749812, + "loss": 0.0004, + "num_input_tokens_seen": 205301024, + "step": 95145 + }, + { + "epoch": 15.522022838499185, + "grad_norm": 0.031359411776065826, + "learning_rate": 0.0001451002953395471, + "loss": 0.0014, + "num_input_tokens_seen": 205313632, + "step": 95150 + }, + { + "epoch": 15.522838499184338, + "grad_norm": 0.0005672869738191366, + "learning_rate": 0.00014505015959408884, + "loss": 0.0003, + "num_input_tokens_seen": 205324832, + "step": 95155 + }, + { + "epoch": 15.523654159869494, + "grad_norm": 0.002573468955233693, + "learning_rate": 0.00014500003104213932, + "loss": 0.1393, + "num_input_tokens_seen": 205336160, + "step": 95160 + }, + { + "epoch": 15.52446982055465, + "grad_norm": 0.024644378572702408, + "learning_rate": 0.0001449499096847146, + "loss": 0.009, + "num_input_tokens_seen": 205347648, + "step": 95165 + }, + { + "epoch": 15.525285481239804, + "grad_norm": 0.006406279746443033, + "learning_rate": 0.00014489979552283035, + "loss": 0.0032, + "num_input_tokens_seen": 205359392, + "step": 95170 + }, + { + "epoch": 15.52610114192496, + "grad_norm": 0.0010946821421384811, + "learning_rate": 0.0001448496885575022, + "loss": 0.0002, + "num_input_tokens_seen": 205368800, + "step": 95175 + }, + { + "epoch": 15.526916802610113, + "grad_norm": 0.003973292652517557, + "learning_rate": 0.00014479958878974564, + "loss": 0.002, + "num_input_tokens_seen": 205379744, + "step": 95180 + }, + { + "epoch": 15.52773246329527, + "grad_norm": 0.035430658608675, + "learning_rate": 0.00014474949622057603, + "loss": 0.0012, + "num_input_tokens_seen": 205389344, + "step": 95185 + }, + { + "epoch": 15.528548123980425, + "grad_norm": 0.0007754186517558992, + "learning_rate": 0.00014469941085100857, + "loss": 0.0002, + "num_input_tokens_seen": 205399264, + "step": 95190 + }, + { + "epoch": 15.529363784665579, + "grad_norm": 0.00035918806679546833, + "learning_rate": 0.00014464933268205826, + "loss": 0.0002, + "num_input_tokens_seen": 205410496, + "step": 95195 + }, + { + "epoch": 15.530179445350734, + "grad_norm": 0.0004321248852647841, + "learning_rate": 0.00014459926171474002, + "loss": 0.0031, + "num_input_tokens_seen": 205420320, + "step": 95200 + }, + { + "epoch": 15.530995106035888, + "grad_norm": 0.004495594184845686, + "learning_rate": 0.0001445491979500686, + "loss": 0.0016, + "num_input_tokens_seen": 205430944, + "step": 95205 + }, + { + "epoch": 15.531810766721044, + "grad_norm": 0.0003027510247193277, + "learning_rate": 0.0001444991413890586, + "loss": 0.001, + "num_input_tokens_seen": 205441344, + "step": 95210 + }, + { + "epoch": 15.5326264274062, + "grad_norm": 0.043840739876031876, + "learning_rate": 0.00014444909203272438, + "loss": 0.0281, + "num_input_tokens_seen": 205452544, + "step": 95215 + }, + { + "epoch": 15.533442088091354, + "grad_norm": 0.003174506826326251, + "learning_rate": 0.0001443990498820806, + "loss": 0.0137, + "num_input_tokens_seen": 205463648, + "step": 95220 + }, + { + "epoch": 15.53425774877651, + "grad_norm": 0.007116552442312241, + "learning_rate": 0.0001443490149381409, + "loss": 0.0076, + "num_input_tokens_seen": 205475296, + "step": 95225 + }, + { + "epoch": 15.535073409461663, + "grad_norm": 0.0003955236461479217, + "learning_rate": 0.0001442989872019199, + "loss": 0.0004, + "num_input_tokens_seen": 205485888, + "step": 95230 + }, + { + "epoch": 15.535889070146819, + "grad_norm": 0.005346110090613365, + "learning_rate": 0.00014424896667443083, + "loss": 0.009, + "num_input_tokens_seen": 205496288, + "step": 95235 + }, + { + "epoch": 15.536704730831975, + "grad_norm": 0.001505639753304422, + "learning_rate": 0.00014419895335668809, + "loss": 0.0012, + "num_input_tokens_seen": 205507104, + "step": 95240 + }, + { + "epoch": 15.537520391517129, + "grad_norm": 0.21402500569820404, + "learning_rate": 0.00014414894724970462, + "loss": 0.0084, + "num_input_tokens_seen": 205518208, + "step": 95245 + }, + { + "epoch": 15.538336052202284, + "grad_norm": 0.0023576044477522373, + "learning_rate": 0.00014409894835449444, + "loss": 0.0083, + "num_input_tokens_seen": 205526688, + "step": 95250 + }, + { + "epoch": 15.539151712887438, + "grad_norm": 0.0031420669984072447, + "learning_rate": 0.00014404895667207028, + "loss": 0.0004, + "num_input_tokens_seen": 205536928, + "step": 95255 + }, + { + "epoch": 15.539967373572594, + "grad_norm": 0.18050356209278107, + "learning_rate": 0.00014399897220344576, + "loss": 0.0244, + "num_input_tokens_seen": 205547744, + "step": 95260 + }, + { + "epoch": 15.540783034257748, + "grad_norm": 0.001217308803461492, + "learning_rate": 0.00014394899494963364, + "loss": 0.0507, + "num_input_tokens_seen": 205559296, + "step": 95265 + }, + { + "epoch": 15.541598694942904, + "grad_norm": 0.00012343519483692944, + "learning_rate": 0.00014389902491164681, + "loss": 0.0004, + "num_input_tokens_seen": 205570112, + "step": 95270 + }, + { + "epoch": 15.54241435562806, + "grad_norm": 0.0001986398419830948, + "learning_rate": 0.00014384906209049804, + "loss": 0.001, + "num_input_tokens_seen": 205579424, + "step": 95275 + }, + { + "epoch": 15.543230016313213, + "grad_norm": 0.36598604917526245, + "learning_rate": 0.0001437991064871998, + "loss": 0.0078, + "num_input_tokens_seen": 205591520, + "step": 95280 + }, + { + "epoch": 15.544045676998369, + "grad_norm": 0.000213752529816702, + "learning_rate": 0.0001437491581027645, + "loss": 0.0003, + "num_input_tokens_seen": 205603328, + "step": 95285 + }, + { + "epoch": 15.544861337683523, + "grad_norm": 0.0418669693171978, + "learning_rate": 0.00014369921693820447, + "loss": 0.0663, + "num_input_tokens_seen": 205614848, + "step": 95290 + }, + { + "epoch": 15.545676998368679, + "grad_norm": 0.009288634173572063, + "learning_rate": 0.00014364928299453184, + "loss": 0.0014, + "num_input_tokens_seen": 205626432, + "step": 95295 + }, + { + "epoch": 15.546492659053834, + "grad_norm": 0.012813269160687923, + "learning_rate": 0.00014359935627275856, + "loss": 0.0006, + "num_input_tokens_seen": 205637536, + "step": 95300 + }, + { + "epoch": 15.547308319738988, + "grad_norm": 0.016869664192199707, + "learning_rate": 0.00014354943677389643, + "loss": 0.0028, + "num_input_tokens_seen": 205647648, + "step": 95305 + }, + { + "epoch": 15.548123980424144, + "grad_norm": 0.07072090357542038, + "learning_rate": 0.00014349952449895715, + "loss": 0.0023, + "num_input_tokens_seen": 205658784, + "step": 95310 + }, + { + "epoch": 15.548939641109298, + "grad_norm": 0.00021751580061390996, + "learning_rate": 0.00014344961944895223, + "loss": 0.0029, + "num_input_tokens_seen": 205669216, + "step": 95315 + }, + { + "epoch": 15.549755301794454, + "grad_norm": 0.004849136341363192, + "learning_rate": 0.00014339972162489317, + "loss": 0.0035, + "num_input_tokens_seen": 205680288, + "step": 95320 + }, + { + "epoch": 15.550570962479608, + "grad_norm": 0.0010033507132902741, + "learning_rate": 0.0001433498310277911, + "loss": 0.0043, + "num_input_tokens_seen": 205691296, + "step": 95325 + }, + { + "epoch": 15.551386623164763, + "grad_norm": 0.003541347337886691, + "learning_rate": 0.0001432999476586571, + "loss": 0.0016, + "num_input_tokens_seen": 205703040, + "step": 95330 + }, + { + "epoch": 15.552202283849919, + "grad_norm": 0.0937349870800972, + "learning_rate": 0.00014325007151850218, + "loss": 0.0047, + "num_input_tokens_seen": 205713664, + "step": 95335 + }, + { + "epoch": 15.553017944535073, + "grad_norm": 0.00021879703854210675, + "learning_rate": 0.00014320020260833716, + "loss": 0.001, + "num_input_tokens_seen": 205723840, + "step": 95340 + }, + { + "epoch": 15.553833605220229, + "grad_norm": 0.014658988453447819, + "learning_rate": 0.00014315034092917268, + "loss": 0.0029, + "num_input_tokens_seen": 205734272, + "step": 95345 + }, + { + "epoch": 15.554649265905383, + "grad_norm": 3.794832229614258, + "learning_rate": 0.00014310048648201917, + "loss": 0.072, + "num_input_tokens_seen": 205743872, + "step": 95350 + }, + { + "epoch": 15.555464926590538, + "grad_norm": 0.003252411261200905, + "learning_rate": 0.0001430506392678871, + "loss": 0.0863, + "num_input_tokens_seen": 205754912, + "step": 95355 + }, + { + "epoch": 15.556280587275694, + "grad_norm": 0.0003053327091038227, + "learning_rate": 0.00014300079928778646, + "loss": 0.0022, + "num_input_tokens_seen": 205765440, + "step": 95360 + }, + { + "epoch": 15.557096247960848, + "grad_norm": 0.0005750078707933426, + "learning_rate": 0.00014295096654272772, + "loss": 0.0004, + "num_input_tokens_seen": 205776544, + "step": 95365 + }, + { + "epoch": 15.557911908646004, + "grad_norm": 0.00024151428078766912, + "learning_rate": 0.00014290114103372058, + "loss": 0.0005, + "num_input_tokens_seen": 205787584, + "step": 95370 + }, + { + "epoch": 15.558727569331158, + "grad_norm": 0.0008908939198590815, + "learning_rate": 0.00014285132276177482, + "loss": 0.0007, + "num_input_tokens_seen": 205797504, + "step": 95375 + }, + { + "epoch": 15.559543230016313, + "grad_norm": 0.004722579848021269, + "learning_rate": 0.00014280151172790006, + "loss": 0.001, + "num_input_tokens_seen": 205808384, + "step": 95380 + }, + { + "epoch": 15.560358890701469, + "grad_norm": 0.00023542277631349862, + "learning_rate": 0.00014275170793310582, + "loss": 0.0005, + "num_input_tokens_seen": 205819168, + "step": 95385 + }, + { + "epoch": 15.561174551386623, + "grad_norm": 0.8846027851104736, + "learning_rate": 0.00014270191137840145, + "loss": 0.029, + "num_input_tokens_seen": 205829792, + "step": 95390 + }, + { + "epoch": 15.561990212071779, + "grad_norm": 0.014058368280529976, + "learning_rate": 0.00014265212206479604, + "loss": 0.003, + "num_input_tokens_seen": 205839488, + "step": 95395 + }, + { + "epoch": 15.562805872756933, + "grad_norm": 0.006271450314670801, + "learning_rate": 0.00014260233999329873, + "loss": 0.0006, + "num_input_tokens_seen": 205850400, + "step": 95400 + }, + { + "epoch": 15.563621533442088, + "grad_norm": 0.01994747295975685, + "learning_rate": 0.00014255256516491845, + "loss": 0.0499, + "num_input_tokens_seen": 205861216, + "step": 95405 + }, + { + "epoch": 15.564437194127244, + "grad_norm": 0.03231491893529892, + "learning_rate": 0.00014250279758066387, + "loss": 0.0034, + "num_input_tokens_seen": 205872128, + "step": 95410 + }, + { + "epoch": 15.565252854812398, + "grad_norm": 0.005498243495821953, + "learning_rate": 0.00014245303724154358, + "loss": 0.0028, + "num_input_tokens_seen": 205883648, + "step": 95415 + }, + { + "epoch": 15.566068515497554, + "grad_norm": 0.00520562008023262, + "learning_rate": 0.00014240328414856607, + "loss": 0.0011, + "num_input_tokens_seen": 205895616, + "step": 95420 + }, + { + "epoch": 15.566884176182707, + "grad_norm": 0.007900963537395, + "learning_rate": 0.00014235353830273966, + "loss": 0.008, + "num_input_tokens_seen": 205906624, + "step": 95425 + }, + { + "epoch": 15.567699836867863, + "grad_norm": 0.0019300401909276843, + "learning_rate": 0.00014230379970507252, + "loss": 0.0008, + "num_input_tokens_seen": 205916320, + "step": 95430 + }, + { + "epoch": 15.568515497553017, + "grad_norm": 0.004398978315293789, + "learning_rate": 0.00014225406835657262, + "loss": 0.148, + "num_input_tokens_seen": 205928064, + "step": 95435 + }, + { + "epoch": 15.569331158238173, + "grad_norm": 0.0015494292601943016, + "learning_rate": 0.00014220434425824785, + "loss": 0.0022, + "num_input_tokens_seen": 205938496, + "step": 95440 + }, + { + "epoch": 15.570146818923329, + "grad_norm": 0.028006885200738907, + "learning_rate": 0.00014215462741110597, + "loss": 0.0013, + "num_input_tokens_seen": 205949440, + "step": 95445 + }, + { + "epoch": 15.570962479608482, + "grad_norm": 0.0013601257232949138, + "learning_rate": 0.00014210491781615453, + "loss": 0.0007, + "num_input_tokens_seen": 205961792, + "step": 95450 + }, + { + "epoch": 15.571778140293638, + "grad_norm": 0.0014163810992613435, + "learning_rate": 0.00014205521547440092, + "loss": 0.0004, + "num_input_tokens_seen": 205972960, + "step": 95455 + }, + { + "epoch": 15.572593800978792, + "grad_norm": 0.032782722264528275, + "learning_rate": 0.00014200552038685249, + "loss": 0.0026, + "num_input_tokens_seen": 205984192, + "step": 95460 + }, + { + "epoch": 15.573409461663948, + "grad_norm": 0.0009078769362531602, + "learning_rate": 0.00014195583255451633, + "loss": 0.0007, + "num_input_tokens_seen": 205995424, + "step": 95465 + }, + { + "epoch": 15.574225122349104, + "grad_norm": 0.6793496012687683, + "learning_rate": 0.00014190615197839929, + "loss": 0.0251, + "num_input_tokens_seen": 206007552, + "step": 95470 + }, + { + "epoch": 15.575040783034257, + "grad_norm": 0.02442045696079731, + "learning_rate": 0.00014185647865950861, + "loss": 0.0012, + "num_input_tokens_seen": 206018048, + "step": 95475 + }, + { + "epoch": 15.575856443719413, + "grad_norm": 0.036046091467142105, + "learning_rate": 0.00014180681259885048, + "loss": 0.003, + "num_input_tokens_seen": 206028992, + "step": 95480 + }, + { + "epoch": 15.576672104404567, + "grad_norm": 0.24406981468200684, + "learning_rate": 0.000141757153797432, + "loss": 0.005, + "num_input_tokens_seen": 206039584, + "step": 95485 + }, + { + "epoch": 15.577487765089723, + "grad_norm": 0.0004443081561475992, + "learning_rate": 0.00014170750225625888, + "loss": 0.0032, + "num_input_tokens_seen": 206050624, + "step": 95490 + }, + { + "epoch": 15.578303425774878, + "grad_norm": 0.0006547347875311971, + "learning_rate": 0.00014165785797633812, + "loss": 0.0004, + "num_input_tokens_seen": 206062624, + "step": 95495 + }, + { + "epoch": 15.579119086460032, + "grad_norm": 0.05275457352399826, + "learning_rate": 0.00014160822095867515, + "loss": 0.0012, + "num_input_tokens_seen": 206073728, + "step": 95500 + }, + { + "epoch": 15.579934747145188, + "grad_norm": 0.00043325810111127794, + "learning_rate": 0.00014155859120427633, + "loss": 0.0013, + "num_input_tokens_seen": 206084256, + "step": 95505 + }, + { + "epoch": 15.580750407830342, + "grad_norm": 0.00030842420528642833, + "learning_rate": 0.00014150896871414743, + "loss": 0.0011, + "num_input_tokens_seen": 206095040, + "step": 95510 + }, + { + "epoch": 15.581566068515498, + "grad_norm": 0.0117607731372118, + "learning_rate": 0.00014145935348929407, + "loss": 0.0033, + "num_input_tokens_seen": 206105216, + "step": 95515 + }, + { + "epoch": 15.582381729200652, + "grad_norm": 0.004052773118019104, + "learning_rate": 0.0001414097455307217, + "loss": 0.0004, + "num_input_tokens_seen": 206116768, + "step": 95520 + }, + { + "epoch": 15.583197389885807, + "grad_norm": 0.0007155505008995533, + "learning_rate": 0.00014136014483943576, + "loss": 0.0004, + "num_input_tokens_seen": 206128608, + "step": 95525 + }, + { + "epoch": 15.584013050570963, + "grad_norm": 0.004507394041866064, + "learning_rate": 0.0001413105514164415, + "loss": 0.029, + "num_input_tokens_seen": 206139744, + "step": 95530 + }, + { + "epoch": 15.584828711256117, + "grad_norm": 0.0011732223210856318, + "learning_rate": 0.0001412609652627439, + "loss": 0.0022, + "num_input_tokens_seen": 206151168, + "step": 95535 + }, + { + "epoch": 15.585644371941273, + "grad_norm": 0.00026782669010572135, + "learning_rate": 0.00014121138637934795, + "loss": 0.049, + "num_input_tokens_seen": 206162400, + "step": 95540 + }, + { + "epoch": 15.586460032626427, + "grad_norm": 0.010355668142437935, + "learning_rate": 0.00014116181476725838, + "loss": 0.0006, + "num_input_tokens_seen": 206173984, + "step": 95545 + }, + { + "epoch": 15.587275693311582, + "grad_norm": 0.0952363982796669, + "learning_rate": 0.00014111225042747987, + "loss": 0.0037, + "num_input_tokens_seen": 206183488, + "step": 95550 + }, + { + "epoch": 15.588091353996738, + "grad_norm": 0.00588382501155138, + "learning_rate": 0.00014106269336101692, + "loss": 0.0005, + "num_input_tokens_seen": 206193120, + "step": 95555 + }, + { + "epoch": 15.588907014681892, + "grad_norm": 0.00022775396064389497, + "learning_rate": 0.0001410131435688738, + "loss": 0.0009, + "num_input_tokens_seen": 206204576, + "step": 95560 + }, + { + "epoch": 15.589722675367048, + "grad_norm": 0.0004134063492529094, + "learning_rate": 0.00014096360105205475, + "loss": 0.0028, + "num_input_tokens_seen": 206214752, + "step": 95565 + }, + { + "epoch": 15.590538336052202, + "grad_norm": 0.46721798181533813, + "learning_rate": 0.00014091406581156373, + "loss": 0.0176, + "num_input_tokens_seen": 206226528, + "step": 95570 + }, + { + "epoch": 15.591353996737357, + "grad_norm": 0.04983755573630333, + "learning_rate": 0.00014086453784840463, + "loss": 0.1596, + "num_input_tokens_seen": 206238912, + "step": 95575 + }, + { + "epoch": 15.592169657422513, + "grad_norm": 0.18890704214572906, + "learning_rate": 0.00014081501716358154, + "loss": 0.0046, + "num_input_tokens_seen": 206250624, + "step": 95580 + }, + { + "epoch": 15.592985318107667, + "grad_norm": 0.0005867051659151912, + "learning_rate": 0.0001407655037580975, + "loss": 0.0145, + "num_input_tokens_seen": 206262240, + "step": 95585 + }, + { + "epoch": 15.593800978792823, + "grad_norm": 0.10632522404193878, + "learning_rate": 0.0001407159976329565, + "loss": 0.0058, + "num_input_tokens_seen": 206271456, + "step": 95590 + }, + { + "epoch": 15.594616639477977, + "grad_norm": 0.0010079618077725172, + "learning_rate": 0.00014066649878916133, + "loss": 0.0012, + "num_input_tokens_seen": 206282272, + "step": 95595 + }, + { + "epoch": 15.595432300163132, + "grad_norm": 0.0011086452286690474, + "learning_rate": 0.00014061700722771569, + "loss": 0.0057, + "num_input_tokens_seen": 206293152, + "step": 95600 + }, + { + "epoch": 15.596247960848288, + "grad_norm": 0.0028215094935148954, + "learning_rate": 0.000140567522949622, + "loss": 0.0003, + "num_input_tokens_seen": 206303072, + "step": 95605 + }, + { + "epoch": 15.597063621533442, + "grad_norm": 0.00015021061699371785, + "learning_rate": 0.00014051804595588375, + "loss": 0.0035, + "num_input_tokens_seen": 206314304, + "step": 95610 + }, + { + "epoch": 15.597879282218598, + "grad_norm": 0.8551382422447205, + "learning_rate": 0.00014046857624750304, + "loss": 0.046, + "num_input_tokens_seen": 206324672, + "step": 95615 + }, + { + "epoch": 15.598694942903752, + "grad_norm": 0.002977900905534625, + "learning_rate": 0.00014041911382548305, + "loss": 0.0029, + "num_input_tokens_seen": 206335360, + "step": 95620 + }, + { + "epoch": 15.599510603588907, + "grad_norm": 0.016733024269342422, + "learning_rate": 0.00014036965869082551, + "loss": 0.001, + "num_input_tokens_seen": 206347776, + "step": 95625 + }, + { + "epoch": 15.600326264274061, + "grad_norm": 0.3299171030521393, + "learning_rate": 0.00014032021084453344, + "loss": 0.011, + "num_input_tokens_seen": 206358880, + "step": 95630 + }, + { + "epoch": 15.601141924959217, + "grad_norm": 0.022868327796459198, + "learning_rate": 0.0001402707702876082, + "loss": 0.0045, + "num_input_tokens_seen": 206369920, + "step": 95635 + }, + { + "epoch": 15.601957585644373, + "grad_norm": 0.15655595064163208, + "learning_rate": 0.0001402213370210525, + "loss": 0.0042, + "num_input_tokens_seen": 206379776, + "step": 95640 + }, + { + "epoch": 15.602773246329527, + "grad_norm": 0.04554076865315437, + "learning_rate": 0.00014017191104586751, + "loss": 0.0051, + "num_input_tokens_seen": 206390496, + "step": 95645 + }, + { + "epoch": 15.603588907014682, + "grad_norm": 0.03889409080147743, + "learning_rate": 0.00014012249236305542, + "loss": 0.0023, + "num_input_tokens_seen": 206403104, + "step": 95650 + }, + { + "epoch": 15.604404567699836, + "grad_norm": 0.013033692725002766, + "learning_rate": 0.00014007308097361749, + "loss": 0.0066, + "num_input_tokens_seen": 206414368, + "step": 95655 + }, + { + "epoch": 15.605220228384992, + "grad_norm": 0.00955155398696661, + "learning_rate": 0.00014002367687855516, + "loss": 0.0014, + "num_input_tokens_seen": 206425248, + "step": 95660 + }, + { + "epoch": 15.606035889070148, + "grad_norm": 0.00036103956517763436, + "learning_rate": 0.00013997428007886975, + "loss": 0.0005, + "num_input_tokens_seen": 206435008, + "step": 95665 + }, + { + "epoch": 15.606851549755302, + "grad_norm": 0.0019217170774936676, + "learning_rate": 0.00013992489057556223, + "loss": 0.0028, + "num_input_tokens_seen": 206446048, + "step": 95670 + }, + { + "epoch": 15.607667210440457, + "grad_norm": 0.059070341289043427, + "learning_rate": 0.00013987550836963358, + "loss": 0.0039, + "num_input_tokens_seen": 206458144, + "step": 95675 + }, + { + "epoch": 15.608482871125611, + "grad_norm": 0.0001320036535616964, + "learning_rate": 0.0001398261334620846, + "loss": 0.0163, + "num_input_tokens_seen": 206468320, + "step": 95680 + }, + { + "epoch": 15.609298531810767, + "grad_norm": 0.0010508454870432615, + "learning_rate": 0.00013977676585391597, + "loss": 0.0002, + "num_input_tokens_seen": 206478720, + "step": 95685 + }, + { + "epoch": 15.61011419249592, + "grad_norm": 0.04514780268073082, + "learning_rate": 0.00013972740554612817, + "loss": 0.0016, + "num_input_tokens_seen": 206489248, + "step": 95690 + }, + { + "epoch": 15.610929853181077, + "grad_norm": 0.0002508106699679047, + "learning_rate": 0.0001396780525397215, + "loss": 0.0017, + "num_input_tokens_seen": 206499040, + "step": 95695 + }, + { + "epoch": 15.611745513866232, + "grad_norm": 0.0015410548076033592, + "learning_rate": 0.00013962870683569605, + "loss": 0.0012, + "num_input_tokens_seen": 206510560, + "step": 95700 + }, + { + "epoch": 15.612561174551386, + "grad_norm": 0.0034480555914342403, + "learning_rate": 0.00013957936843505238, + "loss": 0.0007, + "num_input_tokens_seen": 206521408, + "step": 95705 + }, + { + "epoch": 15.613376835236542, + "grad_norm": 0.0005256517324596643, + "learning_rate": 0.00013953003733878965, + "loss": 0.0011, + "num_input_tokens_seen": 206532000, + "step": 95710 + }, + { + "epoch": 15.614192495921696, + "grad_norm": 0.055461425334215164, + "learning_rate": 0.0001394807135479083, + "loss": 0.1323, + "num_input_tokens_seen": 206543744, + "step": 95715 + }, + { + "epoch": 15.615008156606851, + "grad_norm": 0.00013834108540322632, + "learning_rate": 0.0001394313970634074, + "loss": 0.0018, + "num_input_tokens_seen": 206554144, + "step": 95720 + }, + { + "epoch": 15.615823817292007, + "grad_norm": 0.006016318220645189, + "learning_rate": 0.0001393820878862869, + "loss": 0.0075, + "num_input_tokens_seen": 206563936, + "step": 95725 + }, + { + "epoch": 15.616639477977161, + "grad_norm": 0.0006775193032808602, + "learning_rate": 0.00013933278601754563, + "loss": 0.0049, + "num_input_tokens_seen": 206575840, + "step": 95730 + }, + { + "epoch": 15.617455138662317, + "grad_norm": 0.002568283351138234, + "learning_rate": 0.00013928349145818326, + "loss": 0.1792, + "num_input_tokens_seen": 206586720, + "step": 95735 + }, + { + "epoch": 15.61827079934747, + "grad_norm": 0.012719418853521347, + "learning_rate": 0.00013923420420919823, + "loss": 0.0074, + "num_input_tokens_seen": 206595456, + "step": 95740 + }, + { + "epoch": 15.619086460032626, + "grad_norm": 0.00029855602770112455, + "learning_rate": 0.00013918492427159002, + "loss": 0.0005, + "num_input_tokens_seen": 206606016, + "step": 95745 + }, + { + "epoch": 15.619902120717782, + "grad_norm": 0.021075595170259476, + "learning_rate": 0.00013913565164635672, + "loss": 0.001, + "num_input_tokens_seen": 206617184, + "step": 95750 + }, + { + "epoch": 15.620717781402936, + "grad_norm": 0.011556020006537437, + "learning_rate": 0.00013908638633449756, + "loss": 0.003, + "num_input_tokens_seen": 206629440, + "step": 95755 + }, + { + "epoch": 15.621533442088092, + "grad_norm": 0.0009756295476108789, + "learning_rate": 0.00013903712833701032, + "loss": 0.0032, + "num_input_tokens_seen": 206640608, + "step": 95760 + }, + { + "epoch": 15.622349102773246, + "grad_norm": 0.0008706441731192172, + "learning_rate": 0.0001389878776548939, + "loss": 0.0006, + "num_input_tokens_seen": 206651424, + "step": 95765 + }, + { + "epoch": 15.623164763458401, + "grad_norm": 0.0059125120751559734, + "learning_rate": 0.00013893863428914583, + "loss": 0.0046, + "num_input_tokens_seen": 206662752, + "step": 95770 + }, + { + "epoch": 15.623980424143557, + "grad_norm": 0.0282041747123003, + "learning_rate": 0.00013888939824076464, + "loss": 0.0026, + "num_input_tokens_seen": 206673056, + "step": 95775 + }, + { + "epoch": 15.624796084828711, + "grad_norm": 0.0011267057852819562, + "learning_rate": 0.00013884016951074758, + "loss": 0.0005, + "num_input_tokens_seen": 206684032, + "step": 95780 + }, + { + "epoch": 15.625611745513867, + "grad_norm": 0.006685088854283094, + "learning_rate": 0.00013879094810009284, + "loss": 0.0019, + "num_input_tokens_seen": 206695456, + "step": 95785 + }, + { + "epoch": 15.62642740619902, + "grad_norm": 0.002139841206371784, + "learning_rate": 0.00013874173400979772, + "loss": 0.0035, + "num_input_tokens_seen": 206705760, + "step": 95790 + }, + { + "epoch": 15.627243066884176, + "grad_norm": 0.0016729332273826003, + "learning_rate": 0.00013869252724085974, + "loss": 0.0009, + "num_input_tokens_seen": 206716928, + "step": 95795 + }, + { + "epoch": 15.62805872756933, + "grad_norm": 0.25884413719177246, + "learning_rate": 0.00013864332779427597, + "loss": 0.0063, + "num_input_tokens_seen": 206727424, + "step": 95800 + }, + { + "epoch": 15.628874388254486, + "grad_norm": 0.007465675938874483, + "learning_rate": 0.00013859413567104357, + "loss": 0.0014, + "num_input_tokens_seen": 206739104, + "step": 95805 + }, + { + "epoch": 15.629690048939642, + "grad_norm": 0.0005771016003564, + "learning_rate": 0.00013854495087215951, + "loss": 0.0013, + "num_input_tokens_seen": 206750112, + "step": 95810 + }, + { + "epoch": 15.630505709624796, + "grad_norm": 0.0012112671975046396, + "learning_rate": 0.00013849577339862057, + "loss": 0.0106, + "num_input_tokens_seen": 206761600, + "step": 95815 + }, + { + "epoch": 15.631321370309951, + "grad_norm": 0.0003434692043811083, + "learning_rate": 0.00013844660325142334, + "loss": 0.0045, + "num_input_tokens_seen": 206772704, + "step": 95820 + }, + { + "epoch": 15.632137030995105, + "grad_norm": 0.002186562167480588, + "learning_rate": 0.00013839744043156438, + "loss": 0.0898, + "num_input_tokens_seen": 206783360, + "step": 95825 + }, + { + "epoch": 15.632952691680261, + "grad_norm": 0.0006233472959138453, + "learning_rate": 0.00013834828494004004, + "loss": 0.0036, + "num_input_tokens_seen": 206794720, + "step": 95830 + }, + { + "epoch": 15.633768352365417, + "grad_norm": 0.07907088845968246, + "learning_rate": 0.0001382991367778465, + "loss": 0.0036, + "num_input_tokens_seen": 206804576, + "step": 95835 + }, + { + "epoch": 15.63458401305057, + "grad_norm": 0.011425397358834743, + "learning_rate": 0.00013824999594597975, + "loss": 0.003, + "num_input_tokens_seen": 206815680, + "step": 95840 + }, + { + "epoch": 15.635399673735726, + "grad_norm": 0.0159642081707716, + "learning_rate": 0.00013820086244543562, + "loss": 0.0251, + "num_input_tokens_seen": 206826688, + "step": 95845 + }, + { + "epoch": 15.63621533442088, + "grad_norm": 0.11594539135694504, + "learning_rate": 0.00013815173627721027, + "loss": 0.0262, + "num_input_tokens_seen": 206837952, + "step": 95850 + }, + { + "epoch": 15.637030995106036, + "grad_norm": 0.0035517322830855846, + "learning_rate": 0.00013810261744229873, + "loss": 0.001, + "num_input_tokens_seen": 206849184, + "step": 95855 + }, + { + "epoch": 15.63784665579119, + "grad_norm": 0.0011521008564159274, + "learning_rate": 0.00013805350594169708, + "loss": 0.0099, + "num_input_tokens_seen": 206860160, + "step": 95860 + }, + { + "epoch": 15.638662316476346, + "grad_norm": 0.008991390466690063, + "learning_rate": 0.0001380044017764, + "loss": 0.0045, + "num_input_tokens_seen": 206871232, + "step": 95865 + }, + { + "epoch": 15.639477977161501, + "grad_norm": 0.000429492793045938, + "learning_rate": 0.0001379553049474032, + "loss": 0.0012, + "num_input_tokens_seen": 206880928, + "step": 95870 + }, + { + "epoch": 15.640293637846655, + "grad_norm": 0.08046331256628036, + "learning_rate": 0.00013790621545570114, + "loss": 0.0038, + "num_input_tokens_seen": 206891392, + "step": 95875 + }, + { + "epoch": 15.641109298531811, + "grad_norm": 0.0012031865771859884, + "learning_rate": 0.00013785713330228928, + "loss": 0.0137, + "num_input_tokens_seen": 206901856, + "step": 95880 + }, + { + "epoch": 15.641924959216965, + "grad_norm": 0.0037560241762548685, + "learning_rate": 0.00013780805848816175, + "loss": 0.0021, + "num_input_tokens_seen": 206913440, + "step": 95885 + }, + { + "epoch": 15.64274061990212, + "grad_norm": 0.00032818460022099316, + "learning_rate": 0.0001377589910143135, + "loss": 0.0026, + "num_input_tokens_seen": 206924608, + "step": 95890 + }, + { + "epoch": 15.643556280587276, + "grad_norm": 0.0014637821586802602, + "learning_rate": 0.00013770993088173884, + "loss": 0.0028, + "num_input_tokens_seen": 206934880, + "step": 95895 + }, + { + "epoch": 15.64437194127243, + "grad_norm": 0.007215833757072687, + "learning_rate": 0.000137660878091432, + "loss": 0.0037, + "num_input_tokens_seen": 206946240, + "step": 95900 + }, + { + "epoch": 15.645187601957586, + "grad_norm": 0.001746357069350779, + "learning_rate": 0.0001376118326443872, + "loss": 0.0007, + "num_input_tokens_seen": 206956416, + "step": 95905 + }, + { + "epoch": 15.64600326264274, + "grad_norm": 0.003975310362875462, + "learning_rate": 0.00013756279454159827, + "loss": 0.003, + "num_input_tokens_seen": 206967008, + "step": 95910 + }, + { + "epoch": 15.646818923327896, + "grad_norm": 0.020580396056175232, + "learning_rate": 0.0001375137637840591, + "loss": 0.0009, + "num_input_tokens_seen": 206979456, + "step": 95915 + }, + { + "epoch": 15.647634584013051, + "grad_norm": 0.008439670316874981, + "learning_rate": 0.00013746474037276335, + "loss": 0.0014, + "num_input_tokens_seen": 206989856, + "step": 95920 + }, + { + "epoch": 15.648450244698205, + "grad_norm": 0.0010003503412008286, + "learning_rate": 0.0001374157243087046, + "loss": 0.0023, + "num_input_tokens_seen": 207000224, + "step": 95925 + }, + { + "epoch": 15.649265905383361, + "grad_norm": 0.0017953935312107205, + "learning_rate": 0.00013736671559287612, + "loss": 0.0058, + "num_input_tokens_seen": 207011264, + "step": 95930 + }, + { + "epoch": 15.650081566068515, + "grad_norm": 0.04441596195101738, + "learning_rate": 0.0001373177142262712, + "loss": 0.0093, + "num_input_tokens_seen": 207021344, + "step": 95935 + }, + { + "epoch": 15.65089722675367, + "grad_norm": 0.0027095479890704155, + "learning_rate": 0.0001372687202098829, + "loss": 0.0041, + "num_input_tokens_seen": 207032320, + "step": 95940 + }, + { + "epoch": 15.651712887438826, + "grad_norm": 0.0044021145440638065, + "learning_rate": 0.00013721973354470412, + "loss": 0.0036, + "num_input_tokens_seen": 207042848, + "step": 95945 + }, + { + "epoch": 15.65252854812398, + "grad_norm": 0.00052974175196141, + "learning_rate": 0.00013717075423172765, + "loss": 0.003, + "num_input_tokens_seen": 207053344, + "step": 95950 + }, + { + "epoch": 15.653344208809136, + "grad_norm": 0.012341137044131756, + "learning_rate": 0.00013712178227194617, + "loss": 0.0008, + "num_input_tokens_seen": 207063584, + "step": 95955 + }, + { + "epoch": 15.65415986949429, + "grad_norm": 0.0007492932491004467, + "learning_rate": 0.00013707281766635204, + "loss": 0.0027, + "num_input_tokens_seen": 207075008, + "step": 95960 + }, + { + "epoch": 15.654975530179446, + "grad_norm": 0.0034503850620239973, + "learning_rate": 0.00013702386041593772, + "loss": 0.0016, + "num_input_tokens_seen": 207086560, + "step": 95965 + }, + { + "epoch": 15.655791190864601, + "grad_norm": 0.00048815066111274064, + "learning_rate": 0.00013697491052169536, + "loss": 0.002, + "num_input_tokens_seen": 207097024, + "step": 95970 + }, + { + "epoch": 15.656606851549755, + "grad_norm": 0.015873754397034645, + "learning_rate": 0.00013692596798461692, + "loss": 0.0047, + "num_input_tokens_seen": 207107904, + "step": 95975 + }, + { + "epoch": 15.65742251223491, + "grad_norm": 0.0003603002114687115, + "learning_rate": 0.00013687703280569437, + "loss": 0.0019, + "num_input_tokens_seen": 207119328, + "step": 95980 + }, + { + "epoch": 15.658238172920065, + "grad_norm": 0.001374375307932496, + "learning_rate": 0.0001368281049859194, + "loss": 0.0134, + "num_input_tokens_seen": 207129504, + "step": 95985 + }, + { + "epoch": 15.65905383360522, + "grad_norm": 0.5641927123069763, + "learning_rate": 0.0001367791845262834, + "loss": 0.0134, + "num_input_tokens_seen": 207139616, + "step": 95990 + }, + { + "epoch": 15.659869494290374, + "grad_norm": 0.005141918081790209, + "learning_rate": 0.0001367302714277784, + "loss": 0.0006, + "num_input_tokens_seen": 207149600, + "step": 95995 + }, + { + "epoch": 15.66068515497553, + "grad_norm": 0.002357889199629426, + "learning_rate": 0.00013668136569139488, + "loss": 0.0012, + "num_input_tokens_seen": 207160192, + "step": 96000 + }, + { + "epoch": 15.661500815660686, + "grad_norm": 0.0009057175484485924, + "learning_rate": 0.00013663246731812463, + "loss": 0.0004, + "num_input_tokens_seen": 207170912, + "step": 96005 + }, + { + "epoch": 15.66231647634584, + "grad_norm": 0.00031564992968924344, + "learning_rate": 0.00013658357630895834, + "loss": 0.0185, + "num_input_tokens_seen": 207180896, + "step": 96010 + }, + { + "epoch": 15.663132137030995, + "grad_norm": 0.13129884004592896, + "learning_rate": 0.00013653469266488688, + "loss": 0.0164, + "num_input_tokens_seen": 207192672, + "step": 96015 + }, + { + "epoch": 15.66394779771615, + "grad_norm": 0.003899345640093088, + "learning_rate": 0.000136485816386901, + "loss": 0.0124, + "num_input_tokens_seen": 207202976, + "step": 96020 + }, + { + "epoch": 15.664763458401305, + "grad_norm": 0.004868788179010153, + "learning_rate": 0.00013643694747599123, + "loss": 0.0004, + "num_input_tokens_seen": 207213344, + "step": 96025 + }, + { + "epoch": 15.66557911908646, + "grad_norm": 9.724534902488813e-05, + "learning_rate": 0.0001363880859331479, + "loss": 0.0012, + "num_input_tokens_seen": 207224224, + "step": 96030 + }, + { + "epoch": 15.666394779771615, + "grad_norm": 0.019382517784833908, + "learning_rate": 0.00013633923175936124, + "loss": 0.0006, + "num_input_tokens_seen": 207233984, + "step": 96035 + }, + { + "epoch": 15.66721044045677, + "grad_norm": 0.005930746905505657, + "learning_rate": 0.00013629038495562145, + "loss": 0.0111, + "num_input_tokens_seen": 207245216, + "step": 96040 + }, + { + "epoch": 15.668026101141924, + "grad_norm": 0.0014380632201209664, + "learning_rate": 0.00013624154552291834, + "loss": 0.0107, + "num_input_tokens_seen": 207256000, + "step": 96045 + }, + { + "epoch": 15.66884176182708, + "grad_norm": 0.0002755540772341192, + "learning_rate": 0.00013619271346224183, + "loss": 0.0003, + "num_input_tokens_seen": 207265792, + "step": 96050 + }, + { + "epoch": 15.669657422512234, + "grad_norm": 0.024928852915763855, + "learning_rate": 0.0001361438887745815, + "loss": 0.0012, + "num_input_tokens_seen": 207277760, + "step": 96055 + }, + { + "epoch": 15.67047308319739, + "grad_norm": 0.0004568420408759266, + "learning_rate": 0.0001360950714609268, + "loss": 0.0018, + "num_input_tokens_seen": 207288608, + "step": 96060 + }, + { + "epoch": 15.671288743882545, + "grad_norm": 0.013046540319919586, + "learning_rate": 0.00013604626152226719, + "loss": 0.0061, + "num_input_tokens_seen": 207298656, + "step": 96065 + }, + { + "epoch": 15.6721044045677, + "grad_norm": 0.007091703359037638, + "learning_rate": 0.00013599745895959175, + "loss": 0.059, + "num_input_tokens_seen": 207310016, + "step": 96070 + }, + { + "epoch": 15.672920065252855, + "grad_norm": 0.0008900300599634647, + "learning_rate": 0.00013594866377388958, + "loss": 0.0005, + "num_input_tokens_seen": 207321312, + "step": 96075 + }, + { + "epoch": 15.673735725938009, + "grad_norm": 0.016287876293063164, + "learning_rate": 0.0001358998759661496, + "loss": 0.0035, + "num_input_tokens_seen": 207331424, + "step": 96080 + }, + { + "epoch": 15.674551386623165, + "grad_norm": 0.0004431696725077927, + "learning_rate": 0.00013585109553736053, + "loss": 0.0023, + "num_input_tokens_seen": 207341216, + "step": 96085 + }, + { + "epoch": 15.67536704730832, + "grad_norm": 0.011551488190889359, + "learning_rate": 0.00013580232248851094, + "loss": 0.0004, + "num_input_tokens_seen": 207352192, + "step": 96090 + }, + { + "epoch": 15.676182707993474, + "grad_norm": 0.0019405171042308211, + "learning_rate": 0.00013575355682058932, + "loss": 0.004, + "num_input_tokens_seen": 207363328, + "step": 96095 + }, + { + "epoch": 15.67699836867863, + "grad_norm": 0.022428715601563454, + "learning_rate": 0.0001357047985345839, + "loss": 0.0013, + "num_input_tokens_seen": 207372960, + "step": 96100 + }, + { + "epoch": 15.677814029363784, + "grad_norm": 0.002772005507722497, + "learning_rate": 0.00013565604763148294, + "loss": 0.0005, + "num_input_tokens_seen": 207384192, + "step": 96105 + }, + { + "epoch": 15.67862969004894, + "grad_norm": 0.0009897717973217368, + "learning_rate": 0.00013560730411227417, + "loss": 0.001, + "num_input_tokens_seen": 207395200, + "step": 96110 + }, + { + "epoch": 15.679445350734095, + "grad_norm": 0.0015866904286667705, + "learning_rate": 0.000135558567977946, + "loss": 0.0002, + "num_input_tokens_seen": 207406368, + "step": 96115 + }, + { + "epoch": 15.68026101141925, + "grad_norm": 0.05932014435529709, + "learning_rate": 0.00013550983922948546, + "loss": 0.001, + "num_input_tokens_seen": 207415040, + "step": 96120 + }, + { + "epoch": 15.681076672104405, + "grad_norm": 0.0006106356740929186, + "learning_rate": 0.00013546111786788073, + "loss": 0.0009, + "num_input_tokens_seen": 207425216, + "step": 96125 + }, + { + "epoch": 15.681892332789559, + "grad_norm": 0.001554515096358955, + "learning_rate": 0.00013541240389411857, + "loss": 0.0002, + "num_input_tokens_seen": 207436960, + "step": 96130 + }, + { + "epoch": 15.682707993474715, + "grad_norm": 0.0002937126555480063, + "learning_rate": 0.00013536369730918668, + "loss": 0.001, + "num_input_tokens_seen": 207447936, + "step": 96135 + }, + { + "epoch": 15.68352365415987, + "grad_norm": 0.00015770659956615418, + "learning_rate": 0.00013531499811407212, + "loss": 0.0247, + "num_input_tokens_seen": 207459712, + "step": 96140 + }, + { + "epoch": 15.684339314845024, + "grad_norm": 0.03287169337272644, + "learning_rate": 0.00013526630630976172, + "loss": 0.0142, + "num_input_tokens_seen": 207471552, + "step": 96145 + }, + { + "epoch": 15.68515497553018, + "grad_norm": 0.0006228497950360179, + "learning_rate": 0.00013521762189724228, + "loss": 0.0648, + "num_input_tokens_seen": 207483008, + "step": 96150 + }, + { + "epoch": 15.685970636215334, + "grad_norm": 0.0007559206569567323, + "learning_rate": 0.00013516894487750053, + "loss": 0.0009, + "num_input_tokens_seen": 207494016, + "step": 96155 + }, + { + "epoch": 15.68678629690049, + "grad_norm": 0.004105266649276018, + "learning_rate": 0.00013512027525152293, + "loss": 0.0009, + "num_input_tokens_seen": 207503776, + "step": 96160 + }, + { + "epoch": 15.687601957585644, + "grad_norm": 0.03920479491353035, + "learning_rate": 0.00013507161302029586, + "loss": 0.0015, + "num_input_tokens_seen": 207513568, + "step": 96165 + }, + { + "epoch": 15.6884176182708, + "grad_norm": 0.006282513029873371, + "learning_rate": 0.00013502295818480548, + "loss": 0.0004, + "num_input_tokens_seen": 207525792, + "step": 96170 + }, + { + "epoch": 15.689233278955955, + "grad_norm": 0.2320699542760849, + "learning_rate": 0.00013497431074603784, + "loss": 0.0034, + "num_input_tokens_seen": 207536320, + "step": 96175 + }, + { + "epoch": 15.690048939641109, + "grad_norm": 0.006741566117852926, + "learning_rate": 0.00013492567070497885, + "loss": 0.0008, + "num_input_tokens_seen": 207547296, + "step": 96180 + }, + { + "epoch": 15.690864600326265, + "grad_norm": 0.05982111394405365, + "learning_rate": 0.0001348770380626143, + "loss": 0.0021, + "num_input_tokens_seen": 207556736, + "step": 96185 + }, + { + "epoch": 15.691680261011419, + "grad_norm": 0.00040361491846852005, + "learning_rate": 0.00013482841281992975, + "loss": 0.0008, + "num_input_tokens_seen": 207568768, + "step": 96190 + }, + { + "epoch": 15.692495921696574, + "grad_norm": 0.0008627942879684269, + "learning_rate": 0.00013477979497791064, + "loss": 0.0006, + "num_input_tokens_seen": 207579712, + "step": 96195 + }, + { + "epoch": 15.69331158238173, + "grad_norm": 0.00816900935024023, + "learning_rate": 0.00013473118453754236, + "loss": 0.0006, + "num_input_tokens_seen": 207590432, + "step": 96200 + }, + { + "epoch": 15.694127243066884, + "grad_norm": 0.01830359175801277, + "learning_rate": 0.00013468258149981, + "loss": 0.0005, + "num_input_tokens_seen": 207599904, + "step": 96205 + }, + { + "epoch": 15.69494290375204, + "grad_norm": 0.0024977840948849916, + "learning_rate": 0.00013463398586569854, + "loss": 0.0034, + "num_input_tokens_seen": 207610816, + "step": 96210 + }, + { + "epoch": 15.695758564437194, + "grad_norm": 0.0425003245472908, + "learning_rate": 0.00013458539763619272, + "loss": 0.0084, + "num_input_tokens_seen": 207621632, + "step": 96215 + }, + { + "epoch": 15.69657422512235, + "grad_norm": 0.0005863439291715622, + "learning_rate": 0.00013453681681227763, + "loss": 0.0016, + "num_input_tokens_seen": 207632224, + "step": 96220 + }, + { + "epoch": 15.697389885807503, + "grad_norm": 0.010063264518976212, + "learning_rate": 0.0001344882433949373, + "loss": 0.0094, + "num_input_tokens_seen": 207643328, + "step": 96225 + }, + { + "epoch": 15.698205546492659, + "grad_norm": 0.008641311898827553, + "learning_rate": 0.00013443967738515673, + "loss": 0.0023, + "num_input_tokens_seen": 207654176, + "step": 96230 + }, + { + "epoch": 15.699021207177815, + "grad_norm": 0.0005315160378813744, + "learning_rate": 0.00013439111878391953, + "loss": 0.0196, + "num_input_tokens_seen": 207664704, + "step": 96235 + }, + { + "epoch": 15.699836867862969, + "grad_norm": 0.0019505913369357586, + "learning_rate": 0.00013434256759221037, + "loss": 0.0003, + "num_input_tokens_seen": 207675200, + "step": 96240 + }, + { + "epoch": 15.700652528548124, + "grad_norm": 0.0002812518796417862, + "learning_rate": 0.00013429402381101268, + "loss": 0.059, + "num_input_tokens_seen": 207686432, + "step": 96245 + }, + { + "epoch": 15.701468189233278, + "grad_norm": 0.0018634665757417679, + "learning_rate": 0.00013424548744131088, + "loss": 0.0006, + "num_input_tokens_seen": 207696320, + "step": 96250 + }, + { + "epoch": 15.702283849918434, + "grad_norm": 0.002810975071042776, + "learning_rate": 0.00013419695848408792, + "loss": 0.0015, + "num_input_tokens_seen": 207707840, + "step": 96255 + }, + { + "epoch": 15.70309951060359, + "grad_norm": 0.0023474199697375298, + "learning_rate": 0.00013414843694032792, + "loss": 0.0003, + "num_input_tokens_seen": 207717920, + "step": 96260 + }, + { + "epoch": 15.703915171288743, + "grad_norm": 0.007720315363258123, + "learning_rate": 0.00013409992281101368, + "loss": 0.0261, + "num_input_tokens_seen": 207728800, + "step": 96265 + }, + { + "epoch": 15.7047308319739, + "grad_norm": 0.0040229447185993195, + "learning_rate": 0.000134051416097129, + "loss": 0.0615, + "num_input_tokens_seen": 207739424, + "step": 96270 + }, + { + "epoch": 15.705546492659053, + "grad_norm": 0.001647871802560985, + "learning_rate": 0.00013400291679965633, + "loss": 0.0005, + "num_input_tokens_seen": 207749568, + "step": 96275 + }, + { + "epoch": 15.706362153344209, + "grad_norm": 0.0003507202782202512, + "learning_rate": 0.000133954424919579, + "loss": 0.0005, + "num_input_tokens_seen": 207759776, + "step": 96280 + }, + { + "epoch": 15.707177814029365, + "grad_norm": 0.006732017267495394, + "learning_rate": 0.00013390594045787957, + "loss": 0.0016, + "num_input_tokens_seen": 207771488, + "step": 96285 + }, + { + "epoch": 15.707993474714518, + "grad_norm": 0.0001562029356136918, + "learning_rate": 0.00013385746341554067, + "loss": 0.0013, + "num_input_tokens_seen": 207782112, + "step": 96290 + }, + { + "epoch": 15.708809135399674, + "grad_norm": 0.012718412093818188, + "learning_rate": 0.0001338089937935448, + "loss": 0.1142, + "num_input_tokens_seen": 207792640, + "step": 96295 + }, + { + "epoch": 15.709624796084828, + "grad_norm": 0.00024611371918581426, + "learning_rate": 0.0001337605315928742, + "loss": 0.0017, + "num_input_tokens_seen": 207802720, + "step": 96300 + }, + { + "epoch": 15.710440456769984, + "grad_norm": 0.15190432965755463, + "learning_rate": 0.00013371207681451102, + "loss": 0.004, + "num_input_tokens_seen": 207813696, + "step": 96305 + }, + { + "epoch": 15.71125611745514, + "grad_norm": 0.0019494740990921855, + "learning_rate": 0.00013366362945943733, + "loss": 0.0003, + "num_input_tokens_seen": 207823840, + "step": 96310 + }, + { + "epoch": 15.712071778140293, + "grad_norm": 0.016217073425650597, + "learning_rate": 0.00013361518952863488, + "loss": 0.0022, + "num_input_tokens_seen": 207835712, + "step": 96315 + }, + { + "epoch": 15.71288743882545, + "grad_norm": 0.0004646365705411881, + "learning_rate": 0.00013356675702308541, + "loss": 0.0016, + "num_input_tokens_seen": 207847264, + "step": 96320 + }, + { + "epoch": 15.713703099510603, + "grad_norm": 0.00861755758523941, + "learning_rate": 0.00013351833194377044, + "loss": 0.013, + "num_input_tokens_seen": 207857984, + "step": 96325 + }, + { + "epoch": 15.714518760195759, + "grad_norm": 0.010996782220900059, + "learning_rate": 0.00013346991429167128, + "loss": 0.012, + "num_input_tokens_seen": 207869568, + "step": 96330 + }, + { + "epoch": 15.715334420880914, + "grad_norm": 0.02468242682516575, + "learning_rate": 0.00013342150406776953, + "loss": 0.0026, + "num_input_tokens_seen": 207880576, + "step": 96335 + }, + { + "epoch": 15.716150081566068, + "grad_norm": 0.01951685920357704, + "learning_rate": 0.00013337310127304575, + "loss": 0.0231, + "num_input_tokens_seen": 207891808, + "step": 96340 + }, + { + "epoch": 15.716965742251224, + "grad_norm": 0.04995479807257652, + "learning_rate": 0.0001333247059084815, + "loss": 0.0017, + "num_input_tokens_seen": 207903456, + "step": 96345 + }, + { + "epoch": 15.717781402936378, + "grad_norm": 0.0040632132440805435, + "learning_rate": 0.00013327631797505697, + "loss": 0.0007, + "num_input_tokens_seen": 207913344, + "step": 96350 + }, + { + "epoch": 15.718597063621534, + "grad_norm": 0.0007864636718295515, + "learning_rate": 0.00013322793747375333, + "loss": 0.003, + "num_input_tokens_seen": 207923456, + "step": 96355 + }, + { + "epoch": 15.719412724306688, + "grad_norm": 0.017793137580156326, + "learning_rate": 0.00013317956440555051, + "loss": 0.006, + "num_input_tokens_seen": 207935328, + "step": 96360 + }, + { + "epoch": 15.720228384991843, + "grad_norm": 0.0005952867213636637, + "learning_rate": 0.00013313119877142947, + "loss": 0.0769, + "num_input_tokens_seen": 207945440, + "step": 96365 + }, + { + "epoch": 15.721044045676999, + "grad_norm": 0.01865777187049389, + "learning_rate": 0.00013308284057236984, + "loss": 0.0019, + "num_input_tokens_seen": 207957024, + "step": 96370 + }, + { + "epoch": 15.721859706362153, + "grad_norm": 0.01129191555082798, + "learning_rate": 0.00013303448980935218, + "loss": 0.0006, + "num_input_tokens_seen": 207967488, + "step": 96375 + }, + { + "epoch": 15.722675367047309, + "grad_norm": 0.004014736041426659, + "learning_rate": 0.00013298614648335583, + "loss": 0.0011, + "num_input_tokens_seen": 207978400, + "step": 96380 + }, + { + "epoch": 15.723491027732463, + "grad_norm": 0.012978630140423775, + "learning_rate": 0.0001329378105953611, + "loss": 0.0005, + "num_input_tokens_seen": 207987744, + "step": 96385 + }, + { + "epoch": 15.724306688417618, + "grad_norm": 0.0011605016188696027, + "learning_rate": 0.00013288948214634698, + "loss": 0.0042, + "num_input_tokens_seen": 207998464, + "step": 96390 + }, + { + "epoch": 15.725122349102774, + "grad_norm": 0.0007320415461435914, + "learning_rate": 0.00013284116113729356, + "loss": 0.0025, + "num_input_tokens_seen": 208009440, + "step": 96395 + }, + { + "epoch": 15.725938009787928, + "grad_norm": 0.03152868524193764, + "learning_rate": 0.00013279284756917943, + "loss": 0.0027, + "num_input_tokens_seen": 208020896, + "step": 96400 + }, + { + "epoch": 15.726753670473084, + "grad_norm": 0.038363125175237656, + "learning_rate": 0.00013274454144298438, + "loss": 0.0026, + "num_input_tokens_seen": 208032160, + "step": 96405 + }, + { + "epoch": 15.727569331158238, + "grad_norm": 0.03716590628027916, + "learning_rate": 0.00013269624275968683, + "loss": 0.124, + "num_input_tokens_seen": 208042656, + "step": 96410 + }, + { + "epoch": 15.728384991843393, + "grad_norm": 0.0029637557454407215, + "learning_rate": 0.00013264795152026615, + "loss": 0.0018, + "num_input_tokens_seen": 208052640, + "step": 96415 + }, + { + "epoch": 15.729200652528547, + "grad_norm": 0.014528334140777588, + "learning_rate": 0.00013259966772570048, + "loss": 0.0018, + "num_input_tokens_seen": 208064096, + "step": 96420 + }, + { + "epoch": 15.730016313213703, + "grad_norm": 0.0008620037697255611, + "learning_rate": 0.00013255139137696874, + "loss": 0.0005, + "num_input_tokens_seen": 208075040, + "step": 96425 + }, + { + "epoch": 15.730831973898859, + "grad_norm": 0.001006401958875358, + "learning_rate": 0.0001325031224750492, + "loss": 0.0029, + "num_input_tokens_seen": 208086656, + "step": 96430 + }, + { + "epoch": 15.731647634584013, + "grad_norm": 0.0009363061399199069, + "learning_rate": 0.0001324548610209201, + "loss": 0.0044, + "num_input_tokens_seen": 208097536, + "step": 96435 + }, + { + "epoch": 15.732463295269168, + "grad_norm": 0.0005047292215749621, + "learning_rate": 0.00013240660701555951, + "loss": 0.0004, + "num_input_tokens_seen": 208109568, + "step": 96440 + }, + { + "epoch": 15.733278955954322, + "grad_norm": 0.07031556963920593, + "learning_rate": 0.00013235836045994532, + "loss": 0.0053, + "num_input_tokens_seen": 208119808, + "step": 96445 + }, + { + "epoch": 15.734094616639478, + "grad_norm": 0.05867965891957283, + "learning_rate": 0.00013231012135505538, + "loss": 0.004, + "num_input_tokens_seen": 208129632, + "step": 96450 + }, + { + "epoch": 15.734910277324634, + "grad_norm": 0.009790257550776005, + "learning_rate": 0.00013226188970186725, + "loss": 0.0018, + "num_input_tokens_seen": 208139968, + "step": 96455 + }, + { + "epoch": 15.735725938009788, + "grad_norm": 0.014972585253417492, + "learning_rate": 0.0001322136655013585, + "loss": 0.0025, + "num_input_tokens_seen": 208151232, + "step": 96460 + }, + { + "epoch": 15.736541598694943, + "grad_norm": 0.0009651403524912894, + "learning_rate": 0.00013216544875450633, + "loss": 0.0011, + "num_input_tokens_seen": 208163200, + "step": 96465 + }, + { + "epoch": 15.737357259380097, + "grad_norm": 0.06276267021894455, + "learning_rate": 0.00013211723946228798, + "loss": 0.003, + "num_input_tokens_seen": 208175136, + "step": 96470 + }, + { + "epoch": 15.738172920065253, + "grad_norm": 0.06601838022470474, + "learning_rate": 0.00013206903762568028, + "loss": 0.0015, + "num_input_tokens_seen": 208184960, + "step": 96475 + }, + { + "epoch": 15.738988580750409, + "grad_norm": 0.07869979739189148, + "learning_rate": 0.00013202084324566066, + "loss": 0.0289, + "num_input_tokens_seen": 208195840, + "step": 96480 + }, + { + "epoch": 15.739804241435563, + "grad_norm": 0.008459399454295635, + "learning_rate": 0.0001319726563232051, + "loss": 0.0008, + "num_input_tokens_seen": 208205888, + "step": 96485 + }, + { + "epoch": 15.740619902120718, + "grad_norm": 0.0009534373530186713, + "learning_rate": 0.00013192447685929088, + "loss": 0.004, + "num_input_tokens_seen": 208215744, + "step": 96490 + }, + { + "epoch": 15.741435562805872, + "grad_norm": 0.06231563910841942, + "learning_rate": 0.00013187630485489378, + "loss": 0.0021, + "num_input_tokens_seen": 208226784, + "step": 96495 + }, + { + "epoch": 15.742251223491028, + "grad_norm": 0.002932474948465824, + "learning_rate": 0.0001318281403109906, + "loss": 0.0017, + "num_input_tokens_seen": 208238304, + "step": 96500 + }, + { + "epoch": 15.743066884176184, + "grad_norm": 0.02712051197886467, + "learning_rate": 0.00013177998322855695, + "loss": 0.0008, + "num_input_tokens_seen": 208249408, + "step": 96505 + }, + { + "epoch": 15.743882544861338, + "grad_norm": 0.00321220513433218, + "learning_rate": 0.00013173183360856938, + "loss": 0.0071, + "num_input_tokens_seen": 208260736, + "step": 96510 + }, + { + "epoch": 15.744698205546493, + "grad_norm": 0.3121810257434845, + "learning_rate": 0.00013168369145200303, + "loss": 0.0083, + "num_input_tokens_seen": 208270336, + "step": 96515 + }, + { + "epoch": 15.745513866231647, + "grad_norm": 0.000285373127553612, + "learning_rate": 0.0001316355567598343, + "loss": 0.0011, + "num_input_tokens_seen": 208281440, + "step": 96520 + }, + { + "epoch": 15.746329526916803, + "grad_norm": 0.007942762225866318, + "learning_rate": 0.00013158742953303792, + "loss": 0.0101, + "num_input_tokens_seen": 208292128, + "step": 96525 + }, + { + "epoch": 15.747145187601957, + "grad_norm": 0.008446838706731796, + "learning_rate": 0.00013153930977258987, + "loss": 0.0021, + "num_input_tokens_seen": 208303296, + "step": 96530 + }, + { + "epoch": 15.747960848287113, + "grad_norm": 0.002110583707690239, + "learning_rate": 0.0001314911974794651, + "loss": 0.0007, + "num_input_tokens_seen": 208314496, + "step": 96535 + }, + { + "epoch": 15.748776508972268, + "grad_norm": 0.006463640835136175, + "learning_rate": 0.00013144309265463873, + "loss": 0.0334, + "num_input_tokens_seen": 208326464, + "step": 96540 + }, + { + "epoch": 15.749592169657422, + "grad_norm": 0.002008747076615691, + "learning_rate": 0.00013139499529908562, + "loss": 0.0006, + "num_input_tokens_seen": 208337056, + "step": 96545 + }, + { + "epoch": 15.750407830342578, + "grad_norm": 0.011420495808124542, + "learning_rate": 0.00013134690541378053, + "loss": 0.0037, + "num_input_tokens_seen": 208348192, + "step": 96550 + }, + { + "epoch": 15.751223491027732, + "grad_norm": 0.013537749648094177, + "learning_rate": 0.00013129882299969803, + "loss": 0.0011, + "num_input_tokens_seen": 208359424, + "step": 96555 + }, + { + "epoch": 15.752039151712887, + "grad_norm": 1.0261584520339966, + "learning_rate": 0.00013125074805781268, + "loss": 0.0177, + "num_input_tokens_seen": 208369472, + "step": 96560 + }, + { + "epoch": 15.752854812398043, + "grad_norm": 0.0066556683741509914, + "learning_rate": 0.0001312026805890987, + "loss": 0.0065, + "num_input_tokens_seen": 208379552, + "step": 96565 + }, + { + "epoch": 15.753670473083197, + "grad_norm": 0.0006301743560470641, + "learning_rate": 0.00013115462059453022, + "loss": 0.0002, + "num_input_tokens_seen": 208390816, + "step": 96570 + }, + { + "epoch": 15.754486133768353, + "grad_norm": 0.004520168527960777, + "learning_rate": 0.00013110656807508125, + "loss": 0.0019, + "num_input_tokens_seen": 208400704, + "step": 96575 + }, + { + "epoch": 15.755301794453507, + "grad_norm": 0.13825470209121704, + "learning_rate": 0.0001310585230317257, + "loss": 0.0043, + "num_input_tokens_seen": 208410976, + "step": 96580 + }, + { + "epoch": 15.756117455138662, + "grad_norm": 0.006605338770896196, + "learning_rate": 0.0001310104854654372, + "loss": 0.0007, + "num_input_tokens_seen": 208421824, + "step": 96585 + }, + { + "epoch": 15.756933115823816, + "grad_norm": 0.0004779761075042188, + "learning_rate": 0.0001309624553771893, + "loss": 0.0045, + "num_input_tokens_seen": 208432480, + "step": 96590 + }, + { + "epoch": 15.757748776508972, + "grad_norm": 0.0028838200960308313, + "learning_rate": 0.00013091443276795544, + "loss": 0.0027, + "num_input_tokens_seen": 208443360, + "step": 96595 + }, + { + "epoch": 15.758564437194128, + "grad_norm": 0.0002449949679430574, + "learning_rate": 0.00013086641763870876, + "loss": 0.0002, + "num_input_tokens_seen": 208453056, + "step": 96600 + }, + { + "epoch": 15.759380097879282, + "grad_norm": 0.00039131249650381505, + "learning_rate": 0.00013081840999042244, + "loss": 0.0403, + "num_input_tokens_seen": 208462560, + "step": 96605 + }, + { + "epoch": 15.760195758564437, + "grad_norm": 0.7664036154747009, + "learning_rate": 0.0001307704098240694, + "loss": 0.0155, + "num_input_tokens_seen": 208474304, + "step": 96610 + }, + { + "epoch": 15.761011419249591, + "grad_norm": 0.00031289318576455116, + "learning_rate": 0.0001307224171406224, + "loss": 0.0023, + "num_input_tokens_seen": 208484896, + "step": 96615 + }, + { + "epoch": 15.761827079934747, + "grad_norm": 0.06382239609956741, + "learning_rate": 0.0001306744319410539, + "loss": 0.0023, + "num_input_tokens_seen": 208496000, + "step": 96620 + }, + { + "epoch": 15.762642740619903, + "grad_norm": 0.015141925774514675, + "learning_rate": 0.00013062645422633683, + "loss": 0.0011, + "num_input_tokens_seen": 208506688, + "step": 96625 + }, + { + "epoch": 15.763458401305057, + "grad_norm": 0.001566877355799079, + "learning_rate": 0.000130578483997443, + "loss": 0.0011, + "num_input_tokens_seen": 208517056, + "step": 96630 + }, + { + "epoch": 15.764274061990212, + "grad_norm": 0.044394269585609436, + "learning_rate": 0.00013053052125534497, + "loss": 0.0014, + "num_input_tokens_seen": 208526688, + "step": 96635 + }, + { + "epoch": 15.765089722675366, + "grad_norm": 0.011752372607588768, + "learning_rate": 0.00013048256600101465, + "loss": 0.0053, + "num_input_tokens_seen": 208537792, + "step": 96640 + }, + { + "epoch": 15.765905383360522, + "grad_norm": 0.0001267856714548543, + "learning_rate": 0.00013043461823542387, + "loss": 0.0111, + "num_input_tokens_seen": 208547616, + "step": 96645 + }, + { + "epoch": 15.766721044045678, + "grad_norm": 0.007084175944328308, + "learning_rate": 0.0001303866779595444, + "loss": 0.0063, + "num_input_tokens_seen": 208558880, + "step": 96650 + }, + { + "epoch": 15.767536704730832, + "grad_norm": 0.040538981556892395, + "learning_rate": 0.0001303387451743478, + "loss": 0.004, + "num_input_tokens_seen": 208570464, + "step": 96655 + }, + { + "epoch": 15.768352365415987, + "grad_norm": 0.023656552657485008, + "learning_rate": 0.00013029081988080545, + "loss": 0.0016, + "num_input_tokens_seen": 208581920, + "step": 96660 + }, + { + "epoch": 15.769168026101141, + "grad_norm": 0.0023686618078500032, + "learning_rate": 0.00013024290207988866, + "loss": 0.0061, + "num_input_tokens_seen": 208593504, + "step": 96665 + }, + { + "epoch": 15.769983686786297, + "grad_norm": 0.0009272679453715682, + "learning_rate": 0.00013019499177256848, + "loss": 0.0002, + "num_input_tokens_seen": 208604384, + "step": 96670 + }, + { + "epoch": 15.770799347471453, + "grad_norm": 0.005481208674609661, + "learning_rate": 0.00013014708895981597, + "loss": 0.0007, + "num_input_tokens_seen": 208614272, + "step": 96675 + }, + { + "epoch": 15.771615008156607, + "grad_norm": 0.043851178139448166, + "learning_rate": 0.00013009919364260193, + "loss": 0.0017, + "num_input_tokens_seen": 208624704, + "step": 96680 + }, + { + "epoch": 15.772430668841762, + "grad_norm": 0.000344037136528641, + "learning_rate": 0.0001300513058218969, + "loss": 0.0107, + "num_input_tokens_seen": 208634752, + "step": 96685 + }, + { + "epoch": 15.773246329526916, + "grad_norm": 0.007659910246729851, + "learning_rate": 0.0001300034254986715, + "loss": 0.0031, + "num_input_tokens_seen": 208645696, + "step": 96690 + }, + { + "epoch": 15.774061990212072, + "grad_norm": 0.0049573928117752075, + "learning_rate": 0.00012995555267389608, + "loss": 0.0004, + "num_input_tokens_seen": 208656224, + "step": 96695 + }, + { + "epoch": 15.774877650897226, + "grad_norm": 0.023076273500919342, + "learning_rate": 0.0001299076873485408, + "loss": 0.0011, + "num_input_tokens_seen": 208665984, + "step": 96700 + }, + { + "epoch": 15.775693311582382, + "grad_norm": 0.004285704810172319, + "learning_rate": 0.00012985982952357577, + "loss": 0.0166, + "num_input_tokens_seen": 208676768, + "step": 96705 + }, + { + "epoch": 15.776508972267537, + "grad_norm": 0.0004140162782277912, + "learning_rate": 0.00012981197919997078, + "loss": 0.001, + "num_input_tokens_seen": 208686496, + "step": 96710 + }, + { + "epoch": 15.777324632952691, + "grad_norm": 0.0001956783962668851, + "learning_rate": 0.00012976413637869573, + "loss": 0.0012, + "num_input_tokens_seen": 208698656, + "step": 96715 + }, + { + "epoch": 15.778140293637847, + "grad_norm": 0.00034214468905702233, + "learning_rate": 0.00012971630106072007, + "loss": 0.002, + "num_input_tokens_seen": 208708064, + "step": 96720 + }, + { + "epoch": 15.778955954323001, + "grad_norm": 0.004548894241452217, + "learning_rate": 0.00012966847324701337, + "loss": 0.0307, + "num_input_tokens_seen": 208717888, + "step": 96725 + }, + { + "epoch": 15.779771615008157, + "grad_norm": 0.01016912329941988, + "learning_rate": 0.0001296206529385448, + "loss": 0.0014, + "num_input_tokens_seen": 208728128, + "step": 96730 + }, + { + "epoch": 15.780587275693312, + "grad_norm": 0.0018142885528504848, + "learning_rate": 0.00012957284013628357, + "loss": 0.0003, + "num_input_tokens_seen": 208738592, + "step": 96735 + }, + { + "epoch": 15.781402936378466, + "grad_norm": 0.7855406403541565, + "learning_rate": 0.00012952503484119866, + "loss": 0.1586, + "num_input_tokens_seen": 208748608, + "step": 96740 + }, + { + "epoch": 15.782218597063622, + "grad_norm": 0.006581936962902546, + "learning_rate": 0.0001294772370542589, + "loss": 0.0008, + "num_input_tokens_seen": 208758784, + "step": 96745 + }, + { + "epoch": 15.783034257748776, + "grad_norm": 0.3146686553955078, + "learning_rate": 0.00012942944677643282, + "loss": 0.0074, + "num_input_tokens_seen": 208768832, + "step": 96750 + }, + { + "epoch": 15.783849918433932, + "grad_norm": 0.004072767682373524, + "learning_rate": 0.0001293816640086894, + "loss": 0.0004, + "num_input_tokens_seen": 208779456, + "step": 96755 + }, + { + "epoch": 15.784665579119086, + "grad_norm": 0.003051190171390772, + "learning_rate": 0.00012933388875199643, + "loss": 0.0124, + "num_input_tokens_seen": 208790784, + "step": 96760 + }, + { + "epoch": 15.785481239804241, + "grad_norm": 0.0052550435066223145, + "learning_rate": 0.00012928612100732257, + "loss": 0.0035, + "num_input_tokens_seen": 208801536, + "step": 96765 + }, + { + "epoch": 15.786296900489397, + "grad_norm": 0.0001221710117533803, + "learning_rate": 0.00012923836077563576, + "loss": 0.0008, + "num_input_tokens_seen": 208811360, + "step": 96770 + }, + { + "epoch": 15.78711256117455, + "grad_norm": 0.002182788448408246, + "learning_rate": 0.0001291906080579039, + "loss": 0.0013, + "num_input_tokens_seen": 208823424, + "step": 96775 + }, + { + "epoch": 15.787928221859707, + "grad_norm": 0.06307592988014221, + "learning_rate": 0.0001291428628550948, + "loss": 0.004, + "num_input_tokens_seen": 208834368, + "step": 96780 + }, + { + "epoch": 15.78874388254486, + "grad_norm": 0.053514085710048676, + "learning_rate": 0.000129095125168176, + "loss": 0.0065, + "num_input_tokens_seen": 208845472, + "step": 96785 + }, + { + "epoch": 15.789559543230016, + "grad_norm": 0.00022401301248464733, + "learning_rate": 0.00012904739499811508, + "loss": 0.0031, + "num_input_tokens_seen": 208854688, + "step": 96790 + }, + { + "epoch": 15.790375203915172, + "grad_norm": 0.0005690042162314057, + "learning_rate": 0.00012899967234587922, + "loss": 0.0052, + "num_input_tokens_seen": 208864544, + "step": 96795 + }, + { + "epoch": 15.791190864600326, + "grad_norm": 0.0016603044932708144, + "learning_rate": 0.00012895195721243568, + "loss": 0.0598, + "num_input_tokens_seen": 208875744, + "step": 96800 + }, + { + "epoch": 15.792006525285482, + "grad_norm": 0.0004007740644738078, + "learning_rate": 0.00012890424959875147, + "loss": 0.0003, + "num_input_tokens_seen": 208886848, + "step": 96805 + }, + { + "epoch": 15.792822185970635, + "grad_norm": 0.00042904200381599367, + "learning_rate": 0.0001288565495057934, + "loss": 0.0005, + "num_input_tokens_seen": 208896672, + "step": 96810 + }, + { + "epoch": 15.793637846655791, + "grad_norm": 0.0010241552954539657, + "learning_rate": 0.00012880885693452814, + "loss": 0.0008, + "num_input_tokens_seen": 208906528, + "step": 96815 + }, + { + "epoch": 15.794453507340947, + "grad_norm": 0.00029661323060281575, + "learning_rate": 0.0001287611718859223, + "loss": 0.0116, + "num_input_tokens_seen": 208917120, + "step": 96820 + }, + { + "epoch": 15.7952691680261, + "grad_norm": 0.001706681214272976, + "learning_rate": 0.00012871349436094226, + "loss": 0.0078, + "num_input_tokens_seen": 208927808, + "step": 96825 + }, + { + "epoch": 15.796084828711257, + "grad_norm": 0.00013316575495991856, + "learning_rate": 0.0001286658243605543, + "loss": 0.0039, + "num_input_tokens_seen": 208937952, + "step": 96830 + }, + { + "epoch": 15.79690048939641, + "grad_norm": 0.007063909433782101, + "learning_rate": 0.00012861816188572444, + "loss": 0.0016, + "num_input_tokens_seen": 208949344, + "step": 96835 + }, + { + "epoch": 15.797716150081566, + "grad_norm": 0.004367734771221876, + "learning_rate": 0.00012857050693741866, + "loss": 0.0003, + "num_input_tokens_seen": 208960064, + "step": 96840 + }, + { + "epoch": 15.798531810766722, + "grad_norm": 0.0010831477120518684, + "learning_rate": 0.00012852285951660275, + "loss": 0.0002, + "num_input_tokens_seen": 208971008, + "step": 96845 + }, + { + "epoch": 15.799347471451876, + "grad_norm": 0.0004640770494006574, + "learning_rate": 0.00012847521962424237, + "loss": 0.0005, + "num_input_tokens_seen": 208981696, + "step": 96850 + }, + { + "epoch": 15.800163132137031, + "grad_norm": 0.0002996611874550581, + "learning_rate": 0.00012842758726130281, + "loss": 0.0002, + "num_input_tokens_seen": 208993088, + "step": 96855 + }, + { + "epoch": 15.800978792822185, + "grad_norm": 0.0004612805205397308, + "learning_rate": 0.0001283799624287499, + "loss": 0.0006, + "num_input_tokens_seen": 209003712, + "step": 96860 + }, + { + "epoch": 15.801794453507341, + "grad_norm": 0.025308573618531227, + "learning_rate": 0.00012833234512754817, + "loss": 0.0005, + "num_input_tokens_seen": 209014240, + "step": 96865 + }, + { + "epoch": 15.802610114192497, + "grad_norm": 0.010746176354587078, + "learning_rate": 0.0001282847353586632, + "loss": 0.0015, + "num_input_tokens_seen": 209024480, + "step": 96870 + }, + { + "epoch": 15.80342577487765, + "grad_norm": 0.0002884072018787265, + "learning_rate": 0.0001282371331230594, + "loss": 0.0054, + "num_input_tokens_seen": 209036352, + "step": 96875 + }, + { + "epoch": 15.804241435562806, + "grad_norm": 0.7370665669441223, + "learning_rate": 0.00012818953842170193, + "loss": 0.0568, + "num_input_tokens_seen": 209046048, + "step": 96880 + }, + { + "epoch": 15.80505709624796, + "grad_norm": 0.036297909915447235, + "learning_rate": 0.0001281419512555549, + "loss": 0.0205, + "num_input_tokens_seen": 209057120, + "step": 96885 + }, + { + "epoch": 15.805872756933116, + "grad_norm": 0.00021247354743536562, + "learning_rate": 0.00012809437162558324, + "loss": 0.0003, + "num_input_tokens_seen": 209068256, + "step": 96890 + }, + { + "epoch": 15.80668841761827, + "grad_norm": 0.0002953654620796442, + "learning_rate": 0.00012804679953275068, + "loss": 0.0089, + "num_input_tokens_seen": 209080192, + "step": 96895 + }, + { + "epoch": 15.807504078303426, + "grad_norm": 0.005417425185441971, + "learning_rate": 0.00012799923497802185, + "loss": 0.0042, + "num_input_tokens_seen": 209091936, + "step": 96900 + }, + { + "epoch": 15.808319738988581, + "grad_norm": 0.3397206962108612, + "learning_rate": 0.00012795167796236012, + "loss": 0.0165, + "num_input_tokens_seen": 209104160, + "step": 96905 + }, + { + "epoch": 15.809135399673735, + "grad_norm": 0.45854413509368896, + "learning_rate": 0.00012790412848672977, + "loss": 0.0142, + "num_input_tokens_seen": 209115136, + "step": 96910 + }, + { + "epoch": 15.809951060358891, + "grad_norm": 0.043145909905433655, + "learning_rate": 0.0001278565865520943, + "loss": 0.001, + "num_input_tokens_seen": 209126464, + "step": 96915 + }, + { + "epoch": 15.810766721044045, + "grad_norm": 0.0021676351316273212, + "learning_rate": 0.00012780905215941724, + "loss": 0.0032, + "num_input_tokens_seen": 209135808, + "step": 96920 + }, + { + "epoch": 15.8115823817292, + "grad_norm": 0.0001090740697691217, + "learning_rate": 0.00012776152530966184, + "loss": 0.001, + "num_input_tokens_seen": 209144480, + "step": 96925 + }, + { + "epoch": 15.812398042414356, + "grad_norm": 0.01401578076183796, + "learning_rate": 0.0001277140060037914, + "loss": 0.0007, + "num_input_tokens_seen": 209155008, + "step": 96930 + }, + { + "epoch": 15.81321370309951, + "grad_norm": 0.01780686154961586, + "learning_rate": 0.00012766649424276888, + "loss": 0.001, + "num_input_tokens_seen": 209167136, + "step": 96935 + }, + { + "epoch": 15.814029363784666, + "grad_norm": 0.0027334888000041246, + "learning_rate": 0.00012761899002755716, + "loss": 0.0045, + "num_input_tokens_seen": 209179232, + "step": 96940 + }, + { + "epoch": 15.81484502446982, + "grad_norm": 0.2731187641620636, + "learning_rate": 0.00012757149335911906, + "loss": 0.0152, + "num_input_tokens_seen": 209191264, + "step": 96945 + }, + { + "epoch": 15.815660685154976, + "grad_norm": 0.0018226419342681766, + "learning_rate": 0.00012752400423841708, + "loss": 0.0069, + "num_input_tokens_seen": 209202176, + "step": 96950 + }, + { + "epoch": 15.81647634584013, + "grad_norm": 0.015894455835223198, + "learning_rate": 0.0001274765226664137, + "loss": 0.0006, + "num_input_tokens_seen": 209214208, + "step": 96955 + }, + { + "epoch": 15.817292006525285, + "grad_norm": 0.00038694182876497507, + "learning_rate": 0.00012742904864407095, + "loss": 0.0013, + "num_input_tokens_seen": 209225664, + "step": 96960 + }, + { + "epoch": 15.818107667210441, + "grad_norm": 0.6956227421760559, + "learning_rate": 0.0001273815821723515, + "loss": 0.1088, + "num_input_tokens_seen": 209236608, + "step": 96965 + }, + { + "epoch": 15.818923327895595, + "grad_norm": 0.005713373888283968, + "learning_rate": 0.00012733412325221673, + "loss": 0.005, + "num_input_tokens_seen": 209247328, + "step": 96970 + }, + { + "epoch": 15.81973898858075, + "grad_norm": 0.0004866596427746117, + "learning_rate": 0.00012728667188462893, + "loss": 0.0014, + "num_input_tokens_seen": 209258304, + "step": 96975 + }, + { + "epoch": 15.820554649265905, + "grad_norm": 0.01013571210205555, + "learning_rate": 0.00012723922807054934, + "loss": 0.0011, + "num_input_tokens_seen": 209269056, + "step": 96980 + }, + { + "epoch": 15.82137030995106, + "grad_norm": 0.002947608707472682, + "learning_rate": 0.00012719179181093992, + "loss": 0.0043, + "num_input_tokens_seen": 209280160, + "step": 96985 + }, + { + "epoch": 15.822185970636216, + "grad_norm": 0.0006195795722305775, + "learning_rate": 0.00012714436310676147, + "loss": 0.0007, + "num_input_tokens_seen": 209291392, + "step": 96990 + }, + { + "epoch": 15.82300163132137, + "grad_norm": 0.0012006558245047927, + "learning_rate": 0.00012709694195897587, + "loss": 0.0014, + "num_input_tokens_seen": 209302112, + "step": 96995 + }, + { + "epoch": 15.823817292006526, + "grad_norm": 0.00011235560668865219, + "learning_rate": 0.00012704952836854345, + "loss": 0.0048, + "num_input_tokens_seen": 209312576, + "step": 97000 + }, + { + "epoch": 15.82463295269168, + "grad_norm": 0.0459253303706646, + "learning_rate": 0.00012700212233642577, + "loss": 0.0067, + "num_input_tokens_seen": 209323936, + "step": 97005 + }, + { + "epoch": 15.825448613376835, + "grad_norm": 0.002572731114923954, + "learning_rate": 0.00012695472386358293, + "loss": 0.0004, + "num_input_tokens_seen": 209335616, + "step": 97010 + }, + { + "epoch": 15.826264274061991, + "grad_norm": 0.015001283958554268, + "learning_rate": 0.00012690733295097617, + "loss": 0.0054, + "num_input_tokens_seen": 209345856, + "step": 97015 + }, + { + "epoch": 15.827079934747145, + "grad_norm": 0.00026251739473082125, + "learning_rate": 0.00012685994959956532, + "loss": 0.0049, + "num_input_tokens_seen": 209356096, + "step": 97020 + }, + { + "epoch": 15.8278955954323, + "grad_norm": 0.00016155031335074455, + "learning_rate": 0.00012681257381031124, + "loss": 0.0002, + "num_input_tokens_seen": 209367104, + "step": 97025 + }, + { + "epoch": 15.828711256117455, + "grad_norm": 0.25576311349868774, + "learning_rate": 0.00012676520558417347, + "loss": 0.0081, + "num_input_tokens_seen": 209377472, + "step": 97030 + }, + { + "epoch": 15.82952691680261, + "grad_norm": 0.08373570442199707, + "learning_rate": 0.00012671784492211262, + "loss": 0.0603, + "num_input_tokens_seen": 209388448, + "step": 97035 + }, + { + "epoch": 15.830342577487766, + "grad_norm": 2.080212354660034, + "learning_rate": 0.00012667049182508788, + "loss": 0.0534, + "num_input_tokens_seen": 209399136, + "step": 97040 + }, + { + "epoch": 15.83115823817292, + "grad_norm": 0.010465903207659721, + "learning_rate": 0.00012662314629405936, + "loss": 0.0008, + "num_input_tokens_seen": 209409952, + "step": 97045 + }, + { + "epoch": 15.831973898858076, + "grad_norm": 0.0019401127938181162, + "learning_rate": 0.00012657580832998644, + "loss": 0.0013, + "num_input_tokens_seen": 209421632, + "step": 97050 + }, + { + "epoch": 15.83278955954323, + "grad_norm": 0.3823281228542328, + "learning_rate": 0.0001265284779338285, + "loss": 0.0089, + "num_input_tokens_seen": 209432000, + "step": 97055 + }, + { + "epoch": 15.833605220228385, + "grad_norm": 0.0005895656067878008, + "learning_rate": 0.00012648115510654473, + "loss": 0.0006, + "num_input_tokens_seen": 209443232, + "step": 97060 + }, + { + "epoch": 15.83442088091354, + "grad_norm": 0.01264207623898983, + "learning_rate": 0.00012643383984909423, + "loss": 0.0525, + "num_input_tokens_seen": 209452192, + "step": 97065 + }, + { + "epoch": 15.835236541598695, + "grad_norm": 0.0025257982779294252, + "learning_rate": 0.0001263865321624358, + "loss": 0.0007, + "num_input_tokens_seen": 209463488, + "step": 97070 + }, + { + "epoch": 15.83605220228385, + "grad_norm": 0.004192831926047802, + "learning_rate": 0.0001263392320475283, + "loss": 0.1147, + "num_input_tokens_seen": 209474464, + "step": 97075 + }, + { + "epoch": 15.836867862969005, + "grad_norm": 0.011728022247552872, + "learning_rate": 0.0001262919395053303, + "loss": 0.001, + "num_input_tokens_seen": 209484896, + "step": 97080 + }, + { + "epoch": 15.83768352365416, + "grad_norm": 0.3850136399269104, + "learning_rate": 0.0001262446545368002, + "loss": 0.0084, + "num_input_tokens_seen": 209497280, + "step": 97085 + }, + { + "epoch": 15.838499184339314, + "grad_norm": 0.010293726809322834, + "learning_rate": 0.0001261973771428963, + "loss": 0.0011, + "num_input_tokens_seen": 209508736, + "step": 97090 + }, + { + "epoch": 15.83931484502447, + "grad_norm": 0.002615356119349599, + "learning_rate": 0.00012615010732457677, + "loss": 0.003, + "num_input_tokens_seen": 209519808, + "step": 97095 + }, + { + "epoch": 15.840130505709626, + "grad_norm": 0.16768094897270203, + "learning_rate": 0.00012610284508279956, + "loss": 0.0093, + "num_input_tokens_seen": 209530592, + "step": 97100 + }, + { + "epoch": 15.84094616639478, + "grad_norm": 0.008525360375642776, + "learning_rate": 0.00012605559041852245, + "loss": 0.0049, + "num_input_tokens_seen": 209542592, + "step": 97105 + }, + { + "epoch": 15.841761827079935, + "grad_norm": 0.004228360019624233, + "learning_rate": 0.0001260083433327034, + "loss": 0.0014, + "num_input_tokens_seen": 209552416, + "step": 97110 + }, + { + "epoch": 15.84257748776509, + "grad_norm": 0.0003915185807272792, + "learning_rate": 0.00012596110382629943, + "loss": 0.0013, + "num_input_tokens_seen": 209562784, + "step": 97115 + }, + { + "epoch": 15.843393148450245, + "grad_norm": 0.0004954145406372845, + "learning_rate": 0.0001259138719002685, + "loss": 0.0003, + "num_input_tokens_seen": 209572896, + "step": 97120 + }, + { + "epoch": 15.844208809135399, + "grad_norm": 0.012152092531323433, + "learning_rate": 0.0001258666475555672, + "loss": 0.0004, + "num_input_tokens_seen": 209583616, + "step": 97125 + }, + { + "epoch": 15.845024469820554, + "grad_norm": 0.002862105844542384, + "learning_rate": 0.00012581943079315323, + "loss": 0.0012, + "num_input_tokens_seen": 209593824, + "step": 97130 + }, + { + "epoch": 15.84584013050571, + "grad_norm": 0.06051749736070633, + "learning_rate": 0.00012577222161398288, + "loss": 0.0016, + "num_input_tokens_seen": 209605120, + "step": 97135 + }, + { + "epoch": 15.846655791190864, + "grad_norm": 0.0022249037865549326, + "learning_rate": 0.00012572502001901347, + "loss": 0.0006, + "num_input_tokens_seen": 209617376, + "step": 97140 + }, + { + "epoch": 15.84747145187602, + "grad_norm": 0.0010086512193083763, + "learning_rate": 0.00012567782600920107, + "loss": 0.0005, + "num_input_tokens_seen": 209627328, + "step": 97145 + }, + { + "epoch": 15.848287112561174, + "grad_norm": 0.0028049838729202747, + "learning_rate": 0.0001256306395855027, + "loss": 0.0033, + "num_input_tokens_seen": 209638016, + "step": 97150 + }, + { + "epoch": 15.84910277324633, + "grad_norm": 0.00023311603581532836, + "learning_rate": 0.000125583460748874, + "loss": 0.001, + "num_input_tokens_seen": 209649536, + "step": 97155 + }, + { + "epoch": 15.849918433931485, + "grad_norm": 0.014933982864022255, + "learning_rate": 0.00012553628950027175, + "loss": 0.0006, + "num_input_tokens_seen": 209660576, + "step": 97160 + }, + { + "epoch": 15.850734094616639, + "grad_norm": 0.007416434586048126, + "learning_rate": 0.00012548912584065135, + "loss": 0.0003, + "num_input_tokens_seen": 209671808, + "step": 97165 + }, + { + "epoch": 15.851549755301795, + "grad_norm": 0.0012165815569460392, + "learning_rate": 0.00012544196977096905, + "loss": 0.0006, + "num_input_tokens_seen": 209682080, + "step": 97170 + }, + { + "epoch": 15.852365415986949, + "grad_norm": 0.0004173468623775989, + "learning_rate": 0.00012539482129218045, + "loss": 0.0036, + "num_input_tokens_seen": 209692768, + "step": 97175 + }, + { + "epoch": 15.853181076672104, + "grad_norm": 0.015813061967492104, + "learning_rate": 0.00012534768040524098, + "loss": 0.0007, + "num_input_tokens_seen": 209703232, + "step": 97180 + }, + { + "epoch": 15.85399673735726, + "grad_norm": 0.017142541706562042, + "learning_rate": 0.000125300547111106, + "loss": 0.0064, + "num_input_tokens_seen": 209713984, + "step": 97185 + }, + { + "epoch": 15.854812398042414, + "grad_norm": 0.0004944655811414123, + "learning_rate": 0.00012525342141073083, + "loss": 0.0007, + "num_input_tokens_seen": 209723264, + "step": 97190 + }, + { + "epoch": 15.85562805872757, + "grad_norm": 0.07704387605190277, + "learning_rate": 0.00012520630330507042, + "loss": 0.0063, + "num_input_tokens_seen": 209733504, + "step": 97195 + }, + { + "epoch": 15.856443719412724, + "grad_norm": 0.04261891171336174, + "learning_rate": 0.0001251591927950798, + "loss": 0.0093, + "num_input_tokens_seen": 209744352, + "step": 97200 + }, + { + "epoch": 15.85725938009788, + "grad_norm": 0.15729986131191254, + "learning_rate": 0.00012511208988171362, + "loss": 0.0078, + "num_input_tokens_seen": 209755776, + "step": 97205 + }, + { + "epoch": 15.858075040783035, + "grad_norm": 0.10229165107011795, + "learning_rate": 0.0001250649945659265, + "loss": 0.2051, + "num_input_tokens_seen": 209766592, + "step": 97210 + }, + { + "epoch": 15.858890701468189, + "grad_norm": 0.5327933430671692, + "learning_rate": 0.00012501790684867292, + "loss": 0.0067, + "num_input_tokens_seen": 209777760, + "step": 97215 + }, + { + "epoch": 15.859706362153345, + "grad_norm": 0.0006436871481128037, + "learning_rate": 0.0001249708267309072, + "loss": 0.0004, + "num_input_tokens_seen": 209788736, + "step": 97220 + }, + { + "epoch": 15.860522022838499, + "grad_norm": 0.00656086066737771, + "learning_rate": 0.00012492375421358336, + "loss": 0.003, + "num_input_tokens_seen": 209800480, + "step": 97225 + }, + { + "epoch": 15.861337683523654, + "grad_norm": 0.002523196628317237, + "learning_rate": 0.00012487668929765555, + "loss": 0.0039, + "num_input_tokens_seen": 209811872, + "step": 97230 + }, + { + "epoch": 15.86215334420881, + "grad_norm": 0.0002168751962017268, + "learning_rate": 0.00012482963198407742, + "loss": 0.0004, + "num_input_tokens_seen": 209822848, + "step": 97235 + }, + { + "epoch": 15.862969004893964, + "grad_norm": 0.0060413251630961895, + "learning_rate": 0.00012478258227380262, + "loss": 0.001, + "num_input_tokens_seen": 209835232, + "step": 97240 + }, + { + "epoch": 15.86378466557912, + "grad_norm": 0.06650844216346741, + "learning_rate": 0.0001247355401677851, + "loss": 0.0021, + "num_input_tokens_seen": 209845504, + "step": 97245 + }, + { + "epoch": 15.864600326264274, + "grad_norm": 0.0003474570985417813, + "learning_rate": 0.00012468850566697758, + "loss": 0.0013, + "num_input_tokens_seen": 209855936, + "step": 97250 + }, + { + "epoch": 15.86541598694943, + "grad_norm": 0.0006261382368393242, + "learning_rate": 0.00012464147877233394, + "loss": 0.0009, + "num_input_tokens_seen": 209866208, + "step": 97255 + }, + { + "epoch": 15.866231647634583, + "grad_norm": 0.0013245240552350879, + "learning_rate": 0.00012459445948480663, + "loss": 0.0009, + "num_input_tokens_seen": 209877440, + "step": 97260 + }, + { + "epoch": 15.867047308319739, + "grad_norm": 0.00190239236690104, + "learning_rate": 0.0001245474478053491, + "loss": 0.0057, + "num_input_tokens_seen": 209888480, + "step": 97265 + }, + { + "epoch": 15.867862969004895, + "grad_norm": 0.00018054971587844193, + "learning_rate": 0.00012450044373491355, + "loss": 0.015, + "num_input_tokens_seen": 209898144, + "step": 97270 + }, + { + "epoch": 15.868678629690049, + "grad_norm": 0.0003398199041839689, + "learning_rate": 0.00012445344727445303, + "loss": 0.0003, + "num_input_tokens_seen": 209908832, + "step": 97275 + }, + { + "epoch": 15.869494290375204, + "grad_norm": 0.010553238913416862, + "learning_rate": 0.00012440645842491977, + "loss": 0.0019, + "num_input_tokens_seen": 209919712, + "step": 97280 + }, + { + "epoch": 15.870309951060358, + "grad_norm": 0.0004297386039979756, + "learning_rate": 0.0001243594771872661, + "loss": 0.0021, + "num_input_tokens_seen": 209929792, + "step": 97285 + }, + { + "epoch": 15.871125611745514, + "grad_norm": 0.01043581310659647, + "learning_rate": 0.00012431250356244422, + "loss": 0.0006, + "num_input_tokens_seen": 209941088, + "step": 97290 + }, + { + "epoch": 15.87194127243067, + "grad_norm": 0.019641088321805, + "learning_rate": 0.000124265537551406, + "loss": 0.0012, + "num_input_tokens_seen": 209952288, + "step": 97295 + }, + { + "epoch": 15.872756933115824, + "grad_norm": 0.03592695668339729, + "learning_rate": 0.00012421857915510332, + "loss": 0.0013, + "num_input_tokens_seen": 209963584, + "step": 97300 + }, + { + "epoch": 15.87357259380098, + "grad_norm": 0.18492397665977478, + "learning_rate": 0.00012417162837448787, + "loss": 0.0126, + "num_input_tokens_seen": 209975200, + "step": 97305 + }, + { + "epoch": 15.874388254486133, + "grad_norm": 0.013108990155160427, + "learning_rate": 0.0001241246852105111, + "loss": 0.0483, + "num_input_tokens_seen": 209984672, + "step": 97310 + }, + { + "epoch": 15.875203915171289, + "grad_norm": 0.018556365743279457, + "learning_rate": 0.00012407774966412445, + "loss": 0.0011, + "num_input_tokens_seen": 209996064, + "step": 97315 + }, + { + "epoch": 15.876019575856443, + "grad_norm": 0.0049664536491036415, + "learning_rate": 0.0001240308217362791, + "loss": 0.1144, + "num_input_tokens_seen": 210006720, + "step": 97320 + }, + { + "epoch": 15.876835236541599, + "grad_norm": 0.009857325814664364, + "learning_rate": 0.0001239839014279261, + "loss": 0.0004, + "num_input_tokens_seen": 210018112, + "step": 97325 + }, + { + "epoch": 15.877650897226754, + "grad_norm": 0.16584864258766174, + "learning_rate": 0.0001239369887400163, + "loss": 0.0091, + "num_input_tokens_seen": 210029728, + "step": 97330 + }, + { + "epoch": 15.878466557911908, + "grad_norm": 0.02562606893479824, + "learning_rate": 0.0001238900836735005, + "loss": 0.0011, + "num_input_tokens_seen": 210040832, + "step": 97335 + }, + { + "epoch": 15.879282218597064, + "grad_norm": 0.002094362396746874, + "learning_rate": 0.00012384318622932932, + "loss": 0.0051, + "num_input_tokens_seen": 210050848, + "step": 97340 + }, + { + "epoch": 15.880097879282218, + "grad_norm": 0.0012467966880649328, + "learning_rate": 0.00012379629640845314, + "loss": 0.0006, + "num_input_tokens_seen": 210062080, + "step": 97345 + }, + { + "epoch": 15.880913539967374, + "grad_norm": 0.00012821292330045253, + "learning_rate": 0.0001237494142118223, + "loss": 0.1267, + "num_input_tokens_seen": 210072608, + "step": 97350 + }, + { + "epoch": 15.88172920065253, + "grad_norm": 0.0007084388053044677, + "learning_rate": 0.00012370253964038685, + "loss": 0.0034, + "num_input_tokens_seen": 210082592, + "step": 97355 + }, + { + "epoch": 15.882544861337683, + "grad_norm": 0.0005149324424564838, + "learning_rate": 0.0001236556726950968, + "loss": 0.0005, + "num_input_tokens_seen": 210093568, + "step": 97360 + }, + { + "epoch": 15.883360522022839, + "grad_norm": 0.029566071927547455, + "learning_rate": 0.000123608813376902, + "loss": 0.001, + "num_input_tokens_seen": 210105760, + "step": 97365 + }, + { + "epoch": 15.884176182707993, + "grad_norm": 0.0006552185514010489, + "learning_rate": 0.00012356196168675205, + "loss": 0.0249, + "num_input_tokens_seen": 210117696, + "step": 97370 + }, + { + "epoch": 15.884991843393149, + "grad_norm": 0.006318680476397276, + "learning_rate": 0.00012351511762559653, + "loss": 0.0041, + "num_input_tokens_seen": 210129280, + "step": 97375 + }, + { + "epoch": 15.885807504078304, + "grad_norm": 0.0033526376355439425, + "learning_rate": 0.0001234682811943847, + "loss": 0.0016, + "num_input_tokens_seen": 210140576, + "step": 97380 + }, + { + "epoch": 15.886623164763458, + "grad_norm": 0.004197854548692703, + "learning_rate": 0.00012342145239406573, + "loss": 0.0014, + "num_input_tokens_seen": 210149856, + "step": 97385 + }, + { + "epoch": 15.887438825448614, + "grad_norm": 0.00017658667638897896, + "learning_rate": 0.00012337463122558885, + "loss": 0.0035, + "num_input_tokens_seen": 210158880, + "step": 97390 + }, + { + "epoch": 15.888254486133768, + "grad_norm": 0.006077459082007408, + "learning_rate": 0.00012332781768990286, + "loss": 0.0006, + "num_input_tokens_seen": 210169152, + "step": 97395 + }, + { + "epoch": 15.889070146818923, + "grad_norm": 0.0005703512579202652, + "learning_rate": 0.00012328101178795648, + "loss": 0.0011, + "num_input_tokens_seen": 210180480, + "step": 97400 + }, + { + "epoch": 15.88988580750408, + "grad_norm": 0.00045598679571412504, + "learning_rate": 0.0001232342135206983, + "loss": 0.0269, + "num_input_tokens_seen": 210191712, + "step": 97405 + }, + { + "epoch": 15.890701468189233, + "grad_norm": 0.2269027680158615, + "learning_rate": 0.0001231874228890768, + "loss": 0.0137, + "num_input_tokens_seen": 210203680, + "step": 97410 + }, + { + "epoch": 15.891517128874389, + "grad_norm": 1.050447940826416, + "learning_rate": 0.00012314063989404012, + "loss": 0.0151, + "num_input_tokens_seen": 210214080, + "step": 97415 + }, + { + "epoch": 15.892332789559543, + "grad_norm": 0.07134215533733368, + "learning_rate": 0.00012309386453653647, + "loss": 0.0031, + "num_input_tokens_seen": 210225440, + "step": 97420 + }, + { + "epoch": 15.893148450244698, + "grad_norm": 0.0022262390702962875, + "learning_rate": 0.00012304709681751385, + "loss": 0.0025, + "num_input_tokens_seen": 210236256, + "step": 97425 + }, + { + "epoch": 15.893964110929852, + "grad_norm": 0.027474749833345413, + "learning_rate": 0.00012300033673792, + "loss": 0.0292, + "num_input_tokens_seen": 210247104, + "step": 97430 + }, + { + "epoch": 15.894779771615008, + "grad_norm": 0.0001100892768590711, + "learning_rate": 0.00012295358429870252, + "loss": 0.001, + "num_input_tokens_seen": 210258016, + "step": 97435 + }, + { + "epoch": 15.895595432300164, + "grad_norm": 0.0002217169530922547, + "learning_rate": 0.000122906839500809, + "loss": 0.0115, + "num_input_tokens_seen": 210267680, + "step": 97440 + }, + { + "epoch": 15.896411092985318, + "grad_norm": 0.0012069664662703872, + "learning_rate": 0.0001228601023451868, + "loss": 0.0021, + "num_input_tokens_seen": 210278240, + "step": 97445 + }, + { + "epoch": 15.897226753670473, + "grad_norm": 0.02185111679136753, + "learning_rate": 0.00012281337283278298, + "loss": 0.0104, + "num_input_tokens_seen": 210289152, + "step": 97450 + }, + { + "epoch": 15.898042414355627, + "grad_norm": 0.0018825940787792206, + "learning_rate": 0.0001227666509645447, + "loss": 0.003, + "num_input_tokens_seen": 210300320, + "step": 97455 + }, + { + "epoch": 15.898858075040783, + "grad_norm": 0.4913811683654785, + "learning_rate": 0.00012271993674141878, + "loss": 0.0197, + "num_input_tokens_seen": 210311968, + "step": 97460 + }, + { + "epoch": 15.899673735725939, + "grad_norm": 0.0013186828000470996, + "learning_rate": 0.000122673230164352, + "loss": 0.0047, + "num_input_tokens_seen": 210320960, + "step": 97465 + }, + { + "epoch": 15.900489396411093, + "grad_norm": 0.004887293092906475, + "learning_rate": 0.00012262653123429085, + "loss": 0.0055, + "num_input_tokens_seen": 210329696, + "step": 97470 + }, + { + "epoch": 15.901305057096248, + "grad_norm": 0.0005603775498457253, + "learning_rate": 0.0001225798399521818, + "loss": 0.0009, + "num_input_tokens_seen": 210338720, + "step": 97475 + }, + { + "epoch": 15.902120717781402, + "grad_norm": 0.0018143600318580866, + "learning_rate": 0.00012253315631897106, + "loss": 0.1491, + "num_input_tokens_seen": 210349024, + "step": 97480 + }, + { + "epoch": 15.902936378466558, + "grad_norm": 2.248326063156128, + "learning_rate": 0.00012248648033560473, + "loss": 0.0324, + "num_input_tokens_seen": 210361664, + "step": 97485 + }, + { + "epoch": 15.903752039151712, + "grad_norm": 0.008081893436610699, + "learning_rate": 0.00012243981200302885, + "loss": 0.0033, + "num_input_tokens_seen": 210373184, + "step": 97490 + }, + { + "epoch": 15.904567699836868, + "grad_norm": 0.00040009801159612834, + "learning_rate": 0.00012239315132218898, + "loss": 0.0431, + "num_input_tokens_seen": 210383872, + "step": 97495 + }, + { + "epoch": 15.905383360522023, + "grad_norm": 0.00019279650587122887, + "learning_rate": 0.00012234649829403116, + "loss": 0.0015, + "num_input_tokens_seen": 210394592, + "step": 97500 + }, + { + "epoch": 15.906199021207177, + "grad_norm": 0.5904957056045532, + "learning_rate": 0.0001222998529195004, + "loss": 0.0703, + "num_input_tokens_seen": 210405664, + "step": 97505 + }, + { + "epoch": 15.907014681892333, + "grad_norm": 0.002771394094452262, + "learning_rate": 0.00012225321519954258, + "loss": 0.0017, + "num_input_tokens_seen": 210417632, + "step": 97510 + }, + { + "epoch": 15.907830342577487, + "grad_norm": 0.00031532300636172295, + "learning_rate": 0.00012220658513510224, + "loss": 0.0002, + "num_input_tokens_seen": 210426080, + "step": 97515 + }, + { + "epoch": 15.908646003262643, + "grad_norm": 0.00074313412187621, + "learning_rate": 0.00012215996272712498, + "loss": 0.0003, + "num_input_tokens_seen": 210435232, + "step": 97520 + }, + { + "epoch": 15.909461663947798, + "grad_norm": 0.046291664242744446, + "learning_rate": 0.00012211334797655515, + "loss": 0.0014, + "num_input_tokens_seen": 210446464, + "step": 97525 + }, + { + "epoch": 15.910277324632952, + "grad_norm": 0.0018588141538202763, + "learning_rate": 0.00012206674088433784, + "loss": 0.0009, + "num_input_tokens_seen": 210456928, + "step": 97530 + }, + { + "epoch": 15.911092985318108, + "grad_norm": 0.0001601563417352736, + "learning_rate": 0.00012202014145141749, + "loss": 0.0018, + "num_input_tokens_seen": 210467584, + "step": 97535 + }, + { + "epoch": 15.911908646003262, + "grad_norm": 0.10731473565101624, + "learning_rate": 0.00012197354967873847, + "loss": 0.0034, + "num_input_tokens_seen": 210477984, + "step": 97540 + }, + { + "epoch": 15.912724306688418, + "grad_norm": 0.07836928963661194, + "learning_rate": 0.00012192696556724497, + "loss": 0.0025, + "num_input_tokens_seen": 210489472, + "step": 97545 + }, + { + "epoch": 15.913539967373573, + "grad_norm": 0.0019076764583587646, + "learning_rate": 0.00012188038911788119, + "loss": 0.0269, + "num_input_tokens_seen": 210500672, + "step": 97550 + }, + { + "epoch": 15.914355628058727, + "grad_norm": 0.0007778470171615481, + "learning_rate": 0.00012183382033159101, + "loss": 0.056, + "num_input_tokens_seen": 210511968, + "step": 97555 + }, + { + "epoch": 15.915171288743883, + "grad_norm": 0.0007830605027265847, + "learning_rate": 0.00012178725920931816, + "loss": 0.0055, + "num_input_tokens_seen": 210522816, + "step": 97560 + }, + { + "epoch": 15.915986949429037, + "grad_norm": 0.002245944458991289, + "learning_rate": 0.0001217407057520063, + "loss": 0.0055, + "num_input_tokens_seen": 210533536, + "step": 97565 + }, + { + "epoch": 15.916802610114193, + "grad_norm": 0.00016165840497706085, + "learning_rate": 0.0001216941599605989, + "loss": 0.0031, + "num_input_tokens_seen": 210544544, + "step": 97570 + }, + { + "epoch": 15.917618270799348, + "grad_norm": 0.005016384646296501, + "learning_rate": 0.00012164762183603928, + "loss": 0.0012, + "num_input_tokens_seen": 210555328, + "step": 97575 + }, + { + "epoch": 15.918433931484502, + "grad_norm": 0.002542313886806369, + "learning_rate": 0.00012160109137927061, + "loss": 0.0018, + "num_input_tokens_seen": 210567072, + "step": 97580 + }, + { + "epoch": 15.919249592169658, + "grad_norm": 0.0002365852997172624, + "learning_rate": 0.00012155456859123582, + "loss": 0.0011, + "num_input_tokens_seen": 210576384, + "step": 97585 + }, + { + "epoch": 15.920065252854812, + "grad_norm": 0.024547608569264412, + "learning_rate": 0.00012150805347287774, + "loss": 0.0022, + "num_input_tokens_seen": 210587712, + "step": 97590 + }, + { + "epoch": 15.920880913539968, + "grad_norm": 0.0008528117323294282, + "learning_rate": 0.00012146154602513915, + "loss": 0.0015, + "num_input_tokens_seen": 210598944, + "step": 97595 + }, + { + "epoch": 15.921696574225122, + "grad_norm": 0.10037417709827423, + "learning_rate": 0.00012141504624896244, + "loss": 0.0043, + "num_input_tokens_seen": 210610400, + "step": 97600 + }, + { + "epoch": 15.922512234910277, + "grad_norm": 0.1565827578306198, + "learning_rate": 0.0001213685541452903, + "loss": 0.0654, + "num_input_tokens_seen": 210621856, + "step": 97605 + }, + { + "epoch": 15.923327895595433, + "grad_norm": 0.00020225619664415717, + "learning_rate": 0.00012132206971506449, + "loss": 0.0177, + "num_input_tokens_seen": 210632384, + "step": 97610 + }, + { + "epoch": 15.924143556280587, + "grad_norm": 0.009078694507479668, + "learning_rate": 0.00012127559295922764, + "loss": 0.0062, + "num_input_tokens_seen": 210642496, + "step": 97615 + }, + { + "epoch": 15.924959216965743, + "grad_norm": 0.0013511452125385404, + "learning_rate": 0.00012122912387872098, + "loss": 0.0005, + "num_input_tokens_seen": 210653376, + "step": 97620 + }, + { + "epoch": 15.925774877650896, + "grad_norm": 0.0004468217375688255, + "learning_rate": 0.000121182662474487, + "loss": 0.0355, + "num_input_tokens_seen": 210662720, + "step": 97625 + }, + { + "epoch": 15.926590538336052, + "grad_norm": 0.000479153823107481, + "learning_rate": 0.00012113620874746656, + "loss": 0.0185, + "num_input_tokens_seen": 210673952, + "step": 97630 + }, + { + "epoch": 15.927406199021208, + "grad_norm": 0.002082392107695341, + "learning_rate": 0.00012108976269860183, + "loss": 0.0015, + "num_input_tokens_seen": 210684928, + "step": 97635 + }, + { + "epoch": 15.928221859706362, + "grad_norm": 0.0008237373549491167, + "learning_rate": 0.00012104332432883342, + "loss": 0.0043, + "num_input_tokens_seen": 210695424, + "step": 97640 + }, + { + "epoch": 15.929037520391518, + "grad_norm": 0.05865500122308731, + "learning_rate": 0.0001209968936391031, + "loss": 0.0195, + "num_input_tokens_seen": 210705888, + "step": 97645 + }, + { + "epoch": 15.929853181076671, + "grad_norm": 0.0016774643445387483, + "learning_rate": 0.00012095047063035119, + "loss": 0.0075, + "num_input_tokens_seen": 210716512, + "step": 97650 + }, + { + "epoch": 15.930668841761827, + "grad_norm": 0.0006233852473087609, + "learning_rate": 0.00012090405530351916, + "loss": 0.0028, + "num_input_tokens_seen": 210726304, + "step": 97655 + }, + { + "epoch": 15.931484502446983, + "grad_norm": 0.002237908309325576, + "learning_rate": 0.0001208576476595471, + "loss": 0.0022, + "num_input_tokens_seen": 210737632, + "step": 97660 + }, + { + "epoch": 15.932300163132137, + "grad_norm": 0.0011585287284106016, + "learning_rate": 0.00012081124769937607, + "loss": 0.0078, + "num_input_tokens_seen": 210747168, + "step": 97665 + }, + { + "epoch": 15.933115823817293, + "grad_norm": 0.0050437310710549355, + "learning_rate": 0.00012076485542394583, + "loss": 0.0008, + "num_input_tokens_seen": 210758240, + "step": 97670 + }, + { + "epoch": 15.933931484502446, + "grad_norm": 0.005319483578205109, + "learning_rate": 0.00012071847083419708, + "loss": 0.0073, + "num_input_tokens_seen": 210768640, + "step": 97675 + }, + { + "epoch": 15.934747145187602, + "grad_norm": 0.0087177325040102, + "learning_rate": 0.00012067209393106959, + "loss": 0.0033, + "num_input_tokens_seen": 210778880, + "step": 97680 + }, + { + "epoch": 15.935562805872756, + "grad_norm": 0.2900407612323761, + "learning_rate": 0.00012062572471550337, + "loss": 0.0104, + "num_input_tokens_seen": 210788832, + "step": 97685 + }, + { + "epoch": 15.936378466557912, + "grad_norm": 0.0037559461779892445, + "learning_rate": 0.00012057936318843816, + "loss": 0.0023, + "num_input_tokens_seen": 210800800, + "step": 97690 + }, + { + "epoch": 15.937194127243067, + "grad_norm": 0.0004507193807512522, + "learning_rate": 0.00012053300935081341, + "loss": 0.0147, + "num_input_tokens_seen": 210811712, + "step": 97695 + }, + { + "epoch": 15.938009787928221, + "grad_norm": 0.0022728045005351305, + "learning_rate": 0.00012048666320356865, + "loss": 0.0068, + "num_input_tokens_seen": 210823008, + "step": 97700 + }, + { + "epoch": 15.938825448613377, + "grad_norm": 0.0010347341885790229, + "learning_rate": 0.0001204403247476431, + "loss": 0.0002, + "num_input_tokens_seen": 210834624, + "step": 97705 + }, + { + "epoch": 15.939641109298531, + "grad_norm": 0.0011995843378826976, + "learning_rate": 0.00012039399398397588, + "loss": 0.0016, + "num_input_tokens_seen": 210845536, + "step": 97710 + }, + { + "epoch": 15.940456769983687, + "grad_norm": 0.002373690251260996, + "learning_rate": 0.00012034767091350591, + "loss": 0.0019, + "num_input_tokens_seen": 210855136, + "step": 97715 + }, + { + "epoch": 15.941272430668842, + "grad_norm": 0.20812460780143738, + "learning_rate": 0.00012030135553717204, + "loss": 0.0134, + "num_input_tokens_seen": 210867104, + "step": 97720 + }, + { + "epoch": 15.942088091353996, + "grad_norm": 0.0003311827022116631, + "learning_rate": 0.00012025504785591273, + "loss": 0.001, + "num_input_tokens_seen": 210876640, + "step": 97725 + }, + { + "epoch": 15.942903752039152, + "grad_norm": 0.0035386739764362574, + "learning_rate": 0.00012020874787066688, + "loss": 0.0008, + "num_input_tokens_seen": 210886304, + "step": 97730 + }, + { + "epoch": 15.943719412724306, + "grad_norm": 0.02021712064743042, + "learning_rate": 0.00012016245558237232, + "loss": 0.0017, + "num_input_tokens_seen": 210897408, + "step": 97735 + }, + { + "epoch": 15.944535073409462, + "grad_norm": 0.0018198989564552903, + "learning_rate": 0.0001201161709919677, + "loss": 0.0018, + "num_input_tokens_seen": 210908256, + "step": 97740 + }, + { + "epoch": 15.945350734094617, + "grad_norm": 0.000589891686104238, + "learning_rate": 0.00012006989410039055, + "loss": 0.0039, + "num_input_tokens_seen": 210919360, + "step": 97745 + }, + { + "epoch": 15.946166394779771, + "grad_norm": 0.0003555079165380448, + "learning_rate": 0.00012002362490857921, + "loss": 0.0099, + "num_input_tokens_seen": 210928864, + "step": 97750 + }, + { + "epoch": 15.946982055464927, + "grad_norm": 0.0757540687918663, + "learning_rate": 0.00011997736341747085, + "loss": 0.0018, + "num_input_tokens_seen": 210940160, + "step": 97755 + }, + { + "epoch": 15.947797716150081, + "grad_norm": 0.026278622448444366, + "learning_rate": 0.00011993110962800363, + "loss": 0.0056, + "num_input_tokens_seen": 210952192, + "step": 97760 + }, + { + "epoch": 15.948613376835237, + "grad_norm": 0.0009226752445101738, + "learning_rate": 0.00011988486354111433, + "loss": 0.014, + "num_input_tokens_seen": 210962752, + "step": 97765 + }, + { + "epoch": 15.949429037520392, + "grad_norm": 0.024819817394018173, + "learning_rate": 0.0001198386251577408, + "loss": 0.0156, + "num_input_tokens_seen": 210973408, + "step": 97770 + }, + { + "epoch": 15.950244698205546, + "grad_norm": 0.0018380646361038089, + "learning_rate": 0.00011979239447881945, + "loss": 0.0676, + "num_input_tokens_seen": 210984608, + "step": 97775 + }, + { + "epoch": 15.951060358890702, + "grad_norm": 0.5240771770477295, + "learning_rate": 0.00011974617150528788, + "loss": 0.0516, + "num_input_tokens_seen": 210994656, + "step": 97780 + }, + { + "epoch": 15.951876019575856, + "grad_norm": 0.22899013757705688, + "learning_rate": 0.00011969995623808221, + "loss": 0.0664, + "num_input_tokens_seen": 211006016, + "step": 97785 + }, + { + "epoch": 15.952691680261012, + "grad_norm": 0.0012308888835832477, + "learning_rate": 0.00011965374867813972, + "loss": 0.0016, + "num_input_tokens_seen": 211017472, + "step": 97790 + }, + { + "epoch": 15.953507340946166, + "grad_norm": 0.00044122006511315703, + "learning_rate": 0.00011960754882639619, + "loss": 0.0008, + "num_input_tokens_seen": 211026784, + "step": 97795 + }, + { + "epoch": 15.954323001631321, + "grad_norm": 0.06386946141719818, + "learning_rate": 0.00011956135668378853, + "loss": 0.0017, + "num_input_tokens_seen": 211037504, + "step": 97800 + }, + { + "epoch": 15.955138662316477, + "grad_norm": 0.00036929009365849197, + "learning_rate": 0.00011951517225125231, + "loss": 0.006, + "num_input_tokens_seen": 211048544, + "step": 97805 + }, + { + "epoch": 15.955954323001631, + "grad_norm": 0.04631635919213295, + "learning_rate": 0.00011946899552972395, + "loss": 0.0099, + "num_input_tokens_seen": 211060352, + "step": 97810 + }, + { + "epoch": 15.956769983686787, + "grad_norm": 0.0009441166184842587, + "learning_rate": 0.00011942282652013914, + "loss": 0.0024, + "num_input_tokens_seen": 211071584, + "step": 97815 + }, + { + "epoch": 15.95758564437194, + "grad_norm": 0.0007715206011198461, + "learning_rate": 0.00011937666522343354, + "loss": 0.0005, + "num_input_tokens_seen": 211082208, + "step": 97820 + }, + { + "epoch": 15.958401305057096, + "grad_norm": 0.00039809319423511624, + "learning_rate": 0.0001193305116405427, + "loss": 0.0032, + "num_input_tokens_seen": 211092384, + "step": 97825 + }, + { + "epoch": 15.959216965742252, + "grad_norm": 0.006323513574898243, + "learning_rate": 0.00011928436577240193, + "loss": 0.0008, + "num_input_tokens_seen": 211104000, + "step": 97830 + }, + { + "epoch": 15.960032626427406, + "grad_norm": 0.00017024045519065112, + "learning_rate": 0.00011923822761994646, + "loss": 0.0177, + "num_input_tokens_seen": 211114304, + "step": 97835 + }, + { + "epoch": 15.960848287112562, + "grad_norm": 0.0025748296175152063, + "learning_rate": 0.00011919209718411134, + "loss": 0.0456, + "num_input_tokens_seen": 211125664, + "step": 97840 + }, + { + "epoch": 15.961663947797716, + "grad_norm": 0.00022198451915755868, + "learning_rate": 0.00011914597446583147, + "loss": 0.0251, + "num_input_tokens_seen": 211135584, + "step": 97845 + }, + { + "epoch": 15.962479608482871, + "grad_norm": 0.006263205781579018, + "learning_rate": 0.00011909985946604157, + "loss": 0.0008, + "num_input_tokens_seen": 211146592, + "step": 97850 + }, + { + "epoch": 15.963295269168025, + "grad_norm": 0.0022416478022933006, + "learning_rate": 0.00011905375218567621, + "loss": 0.0041, + "num_input_tokens_seen": 211156576, + "step": 97855 + }, + { + "epoch": 15.964110929853181, + "grad_norm": 0.011908363550901413, + "learning_rate": 0.00011900765262566988, + "loss": 0.0012, + "num_input_tokens_seen": 211167712, + "step": 97860 + }, + { + "epoch": 15.964926590538337, + "grad_norm": 0.11023328453302383, + "learning_rate": 0.00011896156078695675, + "loss": 0.0039, + "num_input_tokens_seen": 211178464, + "step": 97865 + }, + { + "epoch": 15.96574225122349, + "grad_norm": 0.0007150315213948488, + "learning_rate": 0.00011891547667047082, + "loss": 0.007, + "num_input_tokens_seen": 211189568, + "step": 97870 + }, + { + "epoch": 15.966557911908646, + "grad_norm": 0.48678261041641235, + "learning_rate": 0.00011886940027714649, + "loss": 0.0213, + "num_input_tokens_seen": 211199360, + "step": 97875 + }, + { + "epoch": 15.9673735725938, + "grad_norm": 0.0008552306680940092, + "learning_rate": 0.00011882333160791697, + "loss": 0.0016, + "num_input_tokens_seen": 211209760, + "step": 97880 + }, + { + "epoch": 15.968189233278956, + "grad_norm": 0.009128443896770477, + "learning_rate": 0.00011877727066371646, + "loss": 0.0104, + "num_input_tokens_seen": 211219904, + "step": 97885 + }, + { + "epoch": 15.969004893964112, + "grad_norm": 0.007203037850558758, + "learning_rate": 0.00011873121744547794, + "loss": 0.0013, + "num_input_tokens_seen": 211230400, + "step": 97890 + }, + { + "epoch": 15.969820554649266, + "grad_norm": 0.18894238770008087, + "learning_rate": 0.00011868517195413525, + "loss": 0.0961, + "num_input_tokens_seen": 211241440, + "step": 97895 + }, + { + "epoch": 15.970636215334421, + "grad_norm": 0.0025125781539827585, + "learning_rate": 0.00011863913419062095, + "loss": 0.0834, + "num_input_tokens_seen": 211253600, + "step": 97900 + }, + { + "epoch": 15.971451876019575, + "grad_norm": 0.00016240475815720856, + "learning_rate": 0.00011859310415586871, + "loss": 0.001, + "num_input_tokens_seen": 211265280, + "step": 97905 + }, + { + "epoch": 15.97226753670473, + "grad_norm": 0.009829294867813587, + "learning_rate": 0.00011854708185081076, + "loss": 0.0052, + "num_input_tokens_seen": 211275520, + "step": 97910 + }, + { + "epoch": 15.973083197389887, + "grad_norm": 0.0059935227036476135, + "learning_rate": 0.00011850106727638026, + "loss": 0.0047, + "num_input_tokens_seen": 211286496, + "step": 97915 + }, + { + "epoch": 15.97389885807504, + "grad_norm": 0.00037100224290043116, + "learning_rate": 0.00011845506043350956, + "loss": 0.001, + "num_input_tokens_seen": 211295808, + "step": 97920 + }, + { + "epoch": 15.974714518760196, + "grad_norm": 0.0004924999084323645, + "learning_rate": 0.00011840906132313117, + "loss": 0.0048, + "num_input_tokens_seen": 211307168, + "step": 97925 + }, + { + "epoch": 15.97553017944535, + "grad_norm": 0.002984242280945182, + "learning_rate": 0.00011836306994617718, + "loss": 0.0004, + "num_input_tokens_seen": 211317120, + "step": 97930 + }, + { + "epoch": 15.976345840130506, + "grad_norm": 0.003030703403055668, + "learning_rate": 0.00011831708630357968, + "loss": 0.0024, + "num_input_tokens_seen": 211327328, + "step": 97935 + }, + { + "epoch": 15.977161500815662, + "grad_norm": 0.03570723533630371, + "learning_rate": 0.0001182711103962707, + "loss": 0.0024, + "num_input_tokens_seen": 211336608, + "step": 97940 + }, + { + "epoch": 15.977977161500815, + "grad_norm": 0.010937263257801533, + "learning_rate": 0.00011822514222518188, + "loss": 0.0024, + "num_input_tokens_seen": 211348448, + "step": 97945 + }, + { + "epoch": 15.978792822185971, + "grad_norm": 0.0008291919366456568, + "learning_rate": 0.00011817918179124487, + "loss": 0.0013, + "num_input_tokens_seen": 211359104, + "step": 97950 + }, + { + "epoch": 15.979608482871125, + "grad_norm": 0.7104939222335815, + "learning_rate": 0.00011813322909539115, + "loss": 0.0214, + "num_input_tokens_seen": 211369760, + "step": 97955 + }, + { + "epoch": 15.98042414355628, + "grad_norm": 0.018779747188091278, + "learning_rate": 0.0001180872841385519, + "loss": 0.0095, + "num_input_tokens_seen": 211379584, + "step": 97960 + }, + { + "epoch": 15.981239804241435, + "grad_norm": 0.010368380695581436, + "learning_rate": 0.00011804134692165841, + "loss": 0.0013, + "num_input_tokens_seen": 211390400, + "step": 97965 + }, + { + "epoch": 15.98205546492659, + "grad_norm": 0.04207855835556984, + "learning_rate": 0.00011799541744564151, + "loss": 0.0059, + "num_input_tokens_seen": 211400000, + "step": 97970 + }, + { + "epoch": 15.982871125611746, + "grad_norm": 0.0024131108075380325, + "learning_rate": 0.00011794949571143215, + "loss": 0.001, + "num_input_tokens_seen": 211412288, + "step": 97975 + }, + { + "epoch": 15.9836867862969, + "grad_norm": 0.028889138251543045, + "learning_rate": 0.00011790358171996086, + "loss": 0.0022, + "num_input_tokens_seen": 211423744, + "step": 97980 + }, + { + "epoch": 15.984502446982056, + "grad_norm": 0.00279762945137918, + "learning_rate": 0.00011785767547215825, + "loss": 0.0006, + "num_input_tokens_seen": 211435168, + "step": 97985 + }, + { + "epoch": 15.98531810766721, + "grad_norm": 0.008545267395675182, + "learning_rate": 0.00011781177696895462, + "loss": 0.0007, + "num_input_tokens_seen": 211446016, + "step": 97990 + }, + { + "epoch": 15.986133768352365, + "grad_norm": 0.009959133341908455, + "learning_rate": 0.00011776588621128015, + "loss": 0.0007, + "num_input_tokens_seen": 211456512, + "step": 97995 + }, + { + "epoch": 15.986949429037521, + "grad_norm": 0.0009503285400569439, + "learning_rate": 0.00011772000320006493, + "loss": 0.0035, + "num_input_tokens_seen": 211467808, + "step": 98000 + }, + { + "epoch": 15.987765089722675, + "grad_norm": 0.002719377400353551, + "learning_rate": 0.00011767412793623878, + "loss": 0.0007, + "num_input_tokens_seen": 211478624, + "step": 98005 + }, + { + "epoch": 15.98858075040783, + "grad_norm": 0.002309368457645178, + "learning_rate": 0.00011762826042073144, + "loss": 0.0091, + "num_input_tokens_seen": 211488960, + "step": 98010 + }, + { + "epoch": 15.989396411092985, + "grad_norm": 0.0015820630360394716, + "learning_rate": 0.00011758240065447234, + "loss": 0.0011, + "num_input_tokens_seen": 211499360, + "step": 98015 + }, + { + "epoch": 15.99021207177814, + "grad_norm": 0.060039982199668884, + "learning_rate": 0.00011753654863839114, + "loss": 0.0048, + "num_input_tokens_seen": 211510368, + "step": 98020 + }, + { + "epoch": 15.991027732463294, + "grad_norm": 0.44228434562683105, + "learning_rate": 0.00011749070437341702, + "loss": 0.0372, + "num_input_tokens_seen": 211522208, + "step": 98025 + }, + { + "epoch": 15.99184339314845, + "grad_norm": 0.10461252182722092, + "learning_rate": 0.00011744486786047898, + "loss": 0.0071, + "num_input_tokens_seen": 211532800, + "step": 98030 + }, + { + "epoch": 15.992659053833606, + "grad_norm": 0.0020288776140660048, + "learning_rate": 0.00011739903910050603, + "loss": 0.0017, + "num_input_tokens_seen": 211544000, + "step": 98035 + }, + { + "epoch": 15.99347471451876, + "grad_norm": 0.05430680140852928, + "learning_rate": 0.00011735321809442689, + "loss": 0.0016, + "num_input_tokens_seen": 211555008, + "step": 98040 + }, + { + "epoch": 15.994290375203915, + "grad_norm": 0.016352098435163498, + "learning_rate": 0.00011730740484317021, + "loss": 0.0023, + "num_input_tokens_seen": 211565152, + "step": 98045 + }, + { + "epoch": 15.99510603588907, + "grad_norm": 0.0008780654170550406, + "learning_rate": 0.00011726159934766445, + "loss": 0.0011, + "num_input_tokens_seen": 211576064, + "step": 98050 + }, + { + "epoch": 15.995921696574225, + "grad_norm": 0.00010944233508780599, + "learning_rate": 0.00011721580160883794, + "loss": 0.0016, + "num_input_tokens_seen": 211588000, + "step": 98055 + }, + { + "epoch": 15.99673735725938, + "grad_norm": 0.01354175340384245, + "learning_rate": 0.00011717001162761881, + "loss": 0.0024, + "num_input_tokens_seen": 211599008, + "step": 98060 + }, + { + "epoch": 15.997553017944535, + "grad_norm": 0.0018440140411257744, + "learning_rate": 0.000117124229404935, + "loss": 0.0014, + "num_input_tokens_seen": 211609120, + "step": 98065 + }, + { + "epoch": 15.99836867862969, + "grad_norm": 0.45760202407836914, + "learning_rate": 0.00011707845494171443, + "loss": 0.0273, + "num_input_tokens_seen": 211620608, + "step": 98070 + }, + { + "epoch": 15.999184339314844, + "grad_norm": 0.08765044063329697, + "learning_rate": 0.00011703268823888475, + "loss": 0.0147, + "num_input_tokens_seen": 211631296, + "step": 98075 + }, + { + "epoch": 16.0, + "grad_norm": 0.00012897088890895247, + "learning_rate": 0.00011698692929737348, + "loss": 0.0046, + "num_input_tokens_seen": 211640976, + "step": 98080 + }, + { + "epoch": 16.0, + "eval_loss": 0.2587065100669861, + "eval_runtime": 104.7118, + "eval_samples_per_second": 26.024, + "eval_steps_per_second": 6.513, + "num_input_tokens_seen": 211640976, + "step": 98080 + }, + { + "epoch": 16.000815660685156, + "grad_norm": 0.006220451556146145, + "learning_rate": 0.00011694117811810795, + "loss": 0.0018, + "num_input_tokens_seen": 211651504, + "step": 98085 + }, + { + "epoch": 16.00163132137031, + "grad_norm": 0.005368681158870459, + "learning_rate": 0.00011689543470201536, + "loss": 0.0018, + "num_input_tokens_seen": 211661616, + "step": 98090 + }, + { + "epoch": 16.002446982055464, + "grad_norm": 0.003987261094152927, + "learning_rate": 0.00011684969905002286, + "loss": 0.0207, + "num_input_tokens_seen": 211671632, + "step": 98095 + }, + { + "epoch": 16.00326264274062, + "grad_norm": 0.0005349265993572772, + "learning_rate": 0.00011680397116305719, + "loss": 0.0021, + "num_input_tokens_seen": 211682800, + "step": 98100 + }, + { + "epoch": 16.004078303425775, + "grad_norm": 0.0018598485039547086, + "learning_rate": 0.00011675825104204523, + "loss": 0.0009, + "num_input_tokens_seen": 211693296, + "step": 98105 + }, + { + "epoch": 16.00489396411093, + "grad_norm": 0.007518252823501825, + "learning_rate": 0.00011671253868791343, + "loss": 0.0783, + "num_input_tokens_seen": 211705680, + "step": 98110 + }, + { + "epoch": 16.005709624796086, + "grad_norm": 0.00013195314386393875, + "learning_rate": 0.00011666683410158829, + "loss": 0.0011, + "num_input_tokens_seen": 211717104, + "step": 98115 + }, + { + "epoch": 16.00652528548124, + "grad_norm": 0.012572134844958782, + "learning_rate": 0.0001166211372839961, + "loss": 0.0082, + "num_input_tokens_seen": 211728432, + "step": 98120 + }, + { + "epoch": 16.007340946166394, + "grad_norm": 0.052189771085977554, + "learning_rate": 0.00011657544823606286, + "loss": 0.0025, + "num_input_tokens_seen": 211740208, + "step": 98125 + }, + { + "epoch": 16.00815660685155, + "grad_norm": 0.00021805772848892957, + "learning_rate": 0.00011652976695871459, + "loss": 0.0013, + "num_input_tokens_seen": 211751312, + "step": 98130 + }, + { + "epoch": 16.008972267536706, + "grad_norm": 0.0002591028169263154, + "learning_rate": 0.00011648409345287691, + "loss": 0.0062, + "num_input_tokens_seen": 211761392, + "step": 98135 + }, + { + "epoch": 16.00978792822186, + "grad_norm": 0.002081693848595023, + "learning_rate": 0.00011643842771947588, + "loss": 0.1155, + "num_input_tokens_seen": 211772624, + "step": 98140 + }, + { + "epoch": 16.010603588907014, + "grad_norm": 0.03914385661482811, + "learning_rate": 0.00011639276975943641, + "loss": 0.0037, + "num_input_tokens_seen": 211783536, + "step": 98145 + }, + { + "epoch": 16.01141924959217, + "grad_norm": 0.01927109621465206, + "learning_rate": 0.00011634711957368438, + "loss": 0.0043, + "num_input_tokens_seen": 211793424, + "step": 98150 + }, + { + "epoch": 16.012234910277325, + "grad_norm": 0.03387075290083885, + "learning_rate": 0.00011630147716314443, + "loss": 0.0109, + "num_input_tokens_seen": 211804208, + "step": 98155 + }, + { + "epoch": 16.01305057096248, + "grad_norm": 0.044014133512973785, + "learning_rate": 0.00011625584252874189, + "loss": 0.0026, + "num_input_tokens_seen": 211814992, + "step": 98160 + }, + { + "epoch": 16.013866231647636, + "grad_norm": 0.014238683506846428, + "learning_rate": 0.00011621021567140156, + "loss": 0.0014, + "num_input_tokens_seen": 211825488, + "step": 98165 + }, + { + "epoch": 16.01468189233279, + "grad_norm": 0.005838725715875626, + "learning_rate": 0.00011616459659204803, + "loss": 0.0008, + "num_input_tokens_seen": 211837264, + "step": 98170 + }, + { + "epoch": 16.015497553017944, + "grad_norm": 0.014835142530500889, + "learning_rate": 0.00011611898529160591, + "loss": 0.0004, + "num_input_tokens_seen": 211848080, + "step": 98175 + }, + { + "epoch": 16.0163132137031, + "grad_norm": 0.01212687324732542, + "learning_rate": 0.00011607338177099952, + "loss": 0.0062, + "num_input_tokens_seen": 211858992, + "step": 98180 + }, + { + "epoch": 16.017128874388256, + "grad_norm": 0.049160074442625046, + "learning_rate": 0.00011602778603115311, + "loss": 0.0015, + "num_input_tokens_seen": 211869552, + "step": 98185 + }, + { + "epoch": 16.017944535073408, + "grad_norm": 0.0026603706646710634, + "learning_rate": 0.00011598219807299076, + "loss": 0.0055, + "num_input_tokens_seen": 211880048, + "step": 98190 + }, + { + "epoch": 16.018760195758563, + "grad_norm": 0.00038265736657194793, + "learning_rate": 0.00011593661789743626, + "loss": 0.0027, + "num_input_tokens_seen": 211891696, + "step": 98195 + }, + { + "epoch": 16.01957585644372, + "grad_norm": 0.016253113746643066, + "learning_rate": 0.00011589104550541346, + "loss": 0.0044, + "num_input_tokens_seen": 211901392, + "step": 98200 + }, + { + "epoch": 16.020391517128875, + "grad_norm": 0.01661992445588112, + "learning_rate": 0.00011584548089784585, + "loss": 0.001, + "num_input_tokens_seen": 211912240, + "step": 98205 + }, + { + "epoch": 16.02120717781403, + "grad_norm": 0.0011667683720588684, + "learning_rate": 0.00011579992407565698, + "loss": 0.0026, + "num_input_tokens_seen": 211923888, + "step": 98210 + }, + { + "epoch": 16.022022838499183, + "grad_norm": 0.00017881960957311094, + "learning_rate": 0.00011575437503976998, + "loss": 0.0003, + "num_input_tokens_seen": 211935056, + "step": 98215 + }, + { + "epoch": 16.02283849918434, + "grad_norm": 0.0016999911749735475, + "learning_rate": 0.00011570883379110803, + "loss": 0.0026, + "num_input_tokens_seen": 211946512, + "step": 98220 + }, + { + "epoch": 16.023654159869494, + "grad_norm": 0.12831835448741913, + "learning_rate": 0.00011566330033059407, + "loss": 0.0042, + "num_input_tokens_seen": 211958640, + "step": 98225 + }, + { + "epoch": 16.02446982055465, + "grad_norm": 0.0009443134185858071, + "learning_rate": 0.00011561777465915091, + "loss": 0.0067, + "num_input_tokens_seen": 211967632, + "step": 98230 + }, + { + "epoch": 16.025285481239806, + "grad_norm": 0.021930614486336708, + "learning_rate": 0.00011557225677770116, + "loss": 0.0011, + "num_input_tokens_seen": 211978928, + "step": 98235 + }, + { + "epoch": 16.026101141924958, + "grad_norm": 0.0017307219095528126, + "learning_rate": 0.00011552674668716723, + "loss": 0.1496, + "num_input_tokens_seen": 211989232, + "step": 98240 + }, + { + "epoch": 16.026916802610113, + "grad_norm": 0.0006386128370650113, + "learning_rate": 0.00011548124438847174, + "loss": 0.0002, + "num_input_tokens_seen": 211998576, + "step": 98245 + }, + { + "epoch": 16.02773246329527, + "grad_norm": 0.00032772053964436054, + "learning_rate": 0.0001154357498825363, + "loss": 0.0018, + "num_input_tokens_seen": 212009552, + "step": 98250 + }, + { + "epoch": 16.028548123980425, + "grad_norm": 0.002344803186133504, + "learning_rate": 0.00011539026317028361, + "loss": 0.0031, + "num_input_tokens_seen": 212019472, + "step": 98255 + }, + { + "epoch": 16.02936378466558, + "grad_norm": 0.02594367414712906, + "learning_rate": 0.00011534478425263484, + "loss": 0.0023, + "num_input_tokens_seen": 212030000, + "step": 98260 + }, + { + "epoch": 16.030179445350733, + "grad_norm": 0.014831529930233955, + "learning_rate": 0.00011529931313051222, + "loss": 0.0004, + "num_input_tokens_seen": 212041072, + "step": 98265 + }, + { + "epoch": 16.03099510603589, + "grad_norm": 0.11580443382263184, + "learning_rate": 0.00011525384980483683, + "loss": 0.0046, + "num_input_tokens_seen": 212052144, + "step": 98270 + }, + { + "epoch": 16.031810766721044, + "grad_norm": 0.0015331159811466932, + "learning_rate": 0.00011520839427653052, + "loss": 0.0009, + "num_input_tokens_seen": 212063664, + "step": 98275 + }, + { + "epoch": 16.0326264274062, + "grad_norm": 0.0940280333161354, + "learning_rate": 0.00011516294654651393, + "loss": 0.0333, + "num_input_tokens_seen": 212073968, + "step": 98280 + }, + { + "epoch": 16.033442088091356, + "grad_norm": 0.03932836279273033, + "learning_rate": 0.00011511750661570875, + "loss": 0.0012, + "num_input_tokens_seen": 212083376, + "step": 98285 + }, + { + "epoch": 16.034257748776508, + "grad_norm": 0.46406617760658264, + "learning_rate": 0.00011507207448503526, + "loss": 0.0101, + "num_input_tokens_seen": 212094000, + "step": 98290 + }, + { + "epoch": 16.035073409461663, + "grad_norm": 0.00917030032724142, + "learning_rate": 0.00011502665015541481, + "loss": 0.0075, + "num_input_tokens_seen": 212104720, + "step": 98295 + }, + { + "epoch": 16.03588907014682, + "grad_norm": 0.014482107013463974, + "learning_rate": 0.0001149812336277673, + "loss": 0.0262, + "num_input_tokens_seen": 212115504, + "step": 98300 + }, + { + "epoch": 16.036704730831975, + "grad_norm": 0.05290558934211731, + "learning_rate": 0.00011493582490301374, + "loss": 0.003, + "num_input_tokens_seen": 212125776, + "step": 98305 + }, + { + "epoch": 16.03752039151713, + "grad_norm": 0.006001445464789867, + "learning_rate": 0.00011489042398207416, + "loss": 0.0015, + "num_input_tokens_seen": 212136752, + "step": 98310 + }, + { + "epoch": 16.038336052202283, + "grad_norm": 0.0007613174966536462, + "learning_rate": 0.00011484503086586867, + "loss": 0.0018, + "num_input_tokens_seen": 212146480, + "step": 98315 + }, + { + "epoch": 16.03915171288744, + "grad_norm": 0.01588711142539978, + "learning_rate": 0.00011479964555531725, + "loss": 0.0026, + "num_input_tokens_seen": 212157232, + "step": 98320 + }, + { + "epoch": 16.039967373572594, + "grad_norm": 0.010808209888637066, + "learning_rate": 0.00011475426805133965, + "loss": 0.0015, + "num_input_tokens_seen": 212168272, + "step": 98325 + }, + { + "epoch": 16.04078303425775, + "grad_norm": 0.0832233652472496, + "learning_rate": 0.00011470889835485554, + "loss": 0.0081, + "num_input_tokens_seen": 212178128, + "step": 98330 + }, + { + "epoch": 16.041598694942905, + "grad_norm": 0.012053709477186203, + "learning_rate": 0.0001146635364667844, + "loss": 0.0031, + "num_input_tokens_seen": 212189264, + "step": 98335 + }, + { + "epoch": 16.042414355628058, + "grad_norm": 0.00721632270142436, + "learning_rate": 0.0001146181823880455, + "loss": 0.0005, + "num_input_tokens_seen": 212199632, + "step": 98340 + }, + { + "epoch": 16.043230016313213, + "grad_norm": 0.0009931152453646064, + "learning_rate": 0.00011457283611955804, + "loss": 0.0035, + "num_input_tokens_seen": 212209232, + "step": 98345 + }, + { + "epoch": 16.04404567699837, + "grad_norm": 0.03015216253697872, + "learning_rate": 0.00011452749766224102, + "loss": 0.0041, + "num_input_tokens_seen": 212219472, + "step": 98350 + }, + { + "epoch": 16.044861337683525, + "grad_norm": 0.0003405428724363446, + "learning_rate": 0.00011448216701701309, + "loss": 0.0003, + "num_input_tokens_seen": 212230096, + "step": 98355 + }, + { + "epoch": 16.045676998368677, + "grad_norm": 0.001694978796876967, + "learning_rate": 0.00011443684418479344, + "loss": 0.0021, + "num_input_tokens_seen": 212240752, + "step": 98360 + }, + { + "epoch": 16.046492659053833, + "grad_norm": 0.00020498165395110846, + "learning_rate": 0.00011439152916649992, + "loss": 0.0027, + "num_input_tokens_seen": 212251952, + "step": 98365 + }, + { + "epoch": 16.04730831973899, + "grad_norm": 0.0015410864725708961, + "learning_rate": 0.00011434622196305156, + "loss": 0.0024, + "num_input_tokens_seen": 212263248, + "step": 98370 + }, + { + "epoch": 16.048123980424144, + "grad_norm": 0.27943989634513855, + "learning_rate": 0.00011430092257536596, + "loss": 0.0068, + "num_input_tokens_seen": 212272208, + "step": 98375 + }, + { + "epoch": 16.0489396411093, + "grad_norm": 0.0018194891745224595, + "learning_rate": 0.00011425563100436175, + "loss": 0.0034, + "num_input_tokens_seen": 212283984, + "step": 98380 + }, + { + "epoch": 16.049755301794452, + "grad_norm": 0.021141646429896355, + "learning_rate": 0.00011421034725095625, + "loss": 0.0046, + "num_input_tokens_seen": 212295504, + "step": 98385 + }, + { + "epoch": 16.050570962479608, + "grad_norm": 0.00892658531665802, + "learning_rate": 0.00011416507131606773, + "loss": 0.0007, + "num_input_tokens_seen": 212307280, + "step": 98390 + }, + { + "epoch": 16.051386623164763, + "grad_norm": 0.0005959854461252689, + "learning_rate": 0.00011411980320061322, + "loss": 0.0007, + "num_input_tokens_seen": 212317552, + "step": 98395 + }, + { + "epoch": 16.05220228384992, + "grad_norm": 0.0010226027807220817, + "learning_rate": 0.00011407454290551073, + "loss": 0.0011, + "num_input_tokens_seen": 212328016, + "step": 98400 + }, + { + "epoch": 16.053017944535075, + "grad_norm": 0.04430653154850006, + "learning_rate": 0.00011402929043167692, + "loss": 0.0013, + "num_input_tokens_seen": 212338416, + "step": 98405 + }, + { + "epoch": 16.053833605220227, + "grad_norm": 0.009483248926699162, + "learning_rate": 0.00011398404578002946, + "loss": 0.0011, + "num_input_tokens_seen": 212349168, + "step": 98410 + }, + { + "epoch": 16.054649265905383, + "grad_norm": 0.043958380818367004, + "learning_rate": 0.00011393880895148473, + "loss": 0.0264, + "num_input_tokens_seen": 212359728, + "step": 98415 + }, + { + "epoch": 16.05546492659054, + "grad_norm": 0.019906088709831238, + "learning_rate": 0.00011389357994696003, + "loss": 0.002, + "num_input_tokens_seen": 212369712, + "step": 98420 + }, + { + "epoch": 16.056280587275694, + "grad_norm": 0.0008864690898917615, + "learning_rate": 0.00011384835876737154, + "loss": 0.0044, + "num_input_tokens_seen": 212381264, + "step": 98425 + }, + { + "epoch": 16.05709624796085, + "grad_norm": 0.030165446922183037, + "learning_rate": 0.00011380314541363612, + "loss": 0.0009, + "num_input_tokens_seen": 212391280, + "step": 98430 + }, + { + "epoch": 16.057911908646002, + "grad_norm": 0.0005936272209510207, + "learning_rate": 0.00011375793988666966, + "loss": 0.001, + "num_input_tokens_seen": 212402704, + "step": 98435 + }, + { + "epoch": 16.058727569331158, + "grad_norm": 0.00025777670089155436, + "learning_rate": 0.0001137127421873888, + "loss": 0.0011, + "num_input_tokens_seen": 212414640, + "step": 98440 + }, + { + "epoch": 16.059543230016313, + "grad_norm": 0.01043024379760027, + "learning_rate": 0.000113667552316709, + "loss": 0.0019, + "num_input_tokens_seen": 212425488, + "step": 98445 + }, + { + "epoch": 16.06035889070147, + "grad_norm": 0.0027726483531296253, + "learning_rate": 0.00011362237027554645, + "loss": 0.0005, + "num_input_tokens_seen": 212436080, + "step": 98450 + }, + { + "epoch": 16.061174551386625, + "grad_norm": 0.0036185970529913902, + "learning_rate": 0.00011357719606481675, + "loss": 0.0015, + "num_input_tokens_seen": 212447664, + "step": 98455 + }, + { + "epoch": 16.061990212071777, + "grad_norm": 0.001528470660559833, + "learning_rate": 0.00011353202968543535, + "loss": 0.114, + "num_input_tokens_seen": 212457840, + "step": 98460 + }, + { + "epoch": 16.062805872756933, + "grad_norm": 0.002881030086427927, + "learning_rate": 0.00011348687113831768, + "loss": 0.0003, + "num_input_tokens_seen": 212466640, + "step": 98465 + }, + { + "epoch": 16.063621533442088, + "grad_norm": 0.000826209899969399, + "learning_rate": 0.00011344172042437889, + "loss": 0.0089, + "num_input_tokens_seen": 212476016, + "step": 98470 + }, + { + "epoch": 16.064437194127244, + "grad_norm": 0.001241606310941279, + "learning_rate": 0.00011339657754453398, + "loss": 0.0003, + "num_input_tokens_seen": 212486832, + "step": 98475 + }, + { + "epoch": 16.0652528548124, + "grad_norm": 0.0435258112847805, + "learning_rate": 0.00011335144249969793, + "loss": 0.0037, + "num_input_tokens_seen": 212497968, + "step": 98480 + }, + { + "epoch": 16.06606851549755, + "grad_norm": 0.054129548370838165, + "learning_rate": 0.00011330631529078533, + "loss": 0.0025, + "num_input_tokens_seen": 212508592, + "step": 98485 + }, + { + "epoch": 16.066884176182707, + "grad_norm": 0.00026920370873995125, + "learning_rate": 0.00011326119591871087, + "loss": 0.0005, + "num_input_tokens_seen": 212519152, + "step": 98490 + }, + { + "epoch": 16.067699836867863, + "grad_norm": 0.00048789719585329294, + "learning_rate": 0.00011321608438438885, + "loss": 0.0013, + "num_input_tokens_seen": 212529296, + "step": 98495 + }, + { + "epoch": 16.06851549755302, + "grad_norm": 0.3721056580543518, + "learning_rate": 0.00011317098068873339, + "loss": 0.0152, + "num_input_tokens_seen": 212540144, + "step": 98500 + }, + { + "epoch": 16.069331158238175, + "grad_norm": 0.07075155526399612, + "learning_rate": 0.000113125884832659, + "loss": 0.0095, + "num_input_tokens_seen": 212550896, + "step": 98505 + }, + { + "epoch": 16.070146818923327, + "grad_norm": 0.002033502096310258, + "learning_rate": 0.00011308079681707911, + "loss": 0.0056, + "num_input_tokens_seen": 212561168, + "step": 98510 + }, + { + "epoch": 16.070962479608482, + "grad_norm": 0.005683585070073605, + "learning_rate": 0.00011303571664290801, + "loss": 0.0049, + "num_input_tokens_seen": 212572592, + "step": 98515 + }, + { + "epoch": 16.071778140293638, + "grad_norm": 0.0008472249610349536, + "learning_rate": 0.0001129906443110587, + "loss": 0.0002, + "num_input_tokens_seen": 212583408, + "step": 98520 + }, + { + "epoch": 16.072593800978794, + "grad_norm": 0.013350097462534904, + "learning_rate": 0.0001129455798224452, + "loss": 0.0021, + "num_input_tokens_seen": 212593968, + "step": 98525 + }, + { + "epoch": 16.07340946166395, + "grad_norm": 0.008998990058898926, + "learning_rate": 0.00011290052317798027, + "loss": 0.0005, + "num_input_tokens_seen": 212603312, + "step": 98530 + }, + { + "epoch": 16.0742251223491, + "grad_norm": 0.0010321533773094416, + "learning_rate": 0.00011285547437857763, + "loss": 0.0008, + "num_input_tokens_seen": 212614672, + "step": 98535 + }, + { + "epoch": 16.075040783034257, + "grad_norm": 0.003844014136120677, + "learning_rate": 0.00011281043342514957, + "loss": 0.0005, + "num_input_tokens_seen": 212625456, + "step": 98540 + }, + { + "epoch": 16.075856443719413, + "grad_norm": 0.0006590305711142719, + "learning_rate": 0.0001127654003186096, + "loss": 0.0003, + "num_input_tokens_seen": 212634928, + "step": 98545 + }, + { + "epoch": 16.07667210440457, + "grad_norm": 0.0015691033331677318, + "learning_rate": 0.00011272037505986976, + "loss": 0.0003, + "num_input_tokens_seen": 212644048, + "step": 98550 + }, + { + "epoch": 16.07748776508972, + "grad_norm": 0.0001923950476339087, + "learning_rate": 0.00011267535764984293, + "loss": 0.0069, + "num_input_tokens_seen": 212655696, + "step": 98555 + }, + { + "epoch": 16.078303425774877, + "grad_norm": 0.000417013798141852, + "learning_rate": 0.00011263034808944134, + "loss": 0.0018, + "num_input_tokens_seen": 212666544, + "step": 98560 + }, + { + "epoch": 16.079119086460032, + "grad_norm": 0.0002726554521359503, + "learning_rate": 0.00011258534637957718, + "loss": 0.0054, + "num_input_tokens_seen": 212677712, + "step": 98565 + }, + { + "epoch": 16.079934747145188, + "grad_norm": 0.00024837235105223954, + "learning_rate": 0.0001125403525211624, + "loss": 0.0009, + "num_input_tokens_seen": 212687664, + "step": 98570 + }, + { + "epoch": 16.080750407830344, + "grad_norm": 0.009511114098131657, + "learning_rate": 0.00011249536651510894, + "loss": 0.0012, + "num_input_tokens_seen": 212697776, + "step": 98575 + }, + { + "epoch": 16.081566068515496, + "grad_norm": 0.11279970407485962, + "learning_rate": 0.00011245038836232846, + "loss": 0.0034, + "num_input_tokens_seen": 212709840, + "step": 98580 + }, + { + "epoch": 16.08238172920065, + "grad_norm": 0.017823074012994766, + "learning_rate": 0.0001124054180637325, + "loss": 0.0013, + "num_input_tokens_seen": 212721424, + "step": 98585 + }, + { + "epoch": 16.083197389885807, + "grad_norm": 0.002635807264596224, + "learning_rate": 0.00011236045562023245, + "loss": 0.0016, + "num_input_tokens_seen": 212733168, + "step": 98590 + }, + { + "epoch": 16.084013050570963, + "grad_norm": 0.0018998509040102363, + "learning_rate": 0.00011231550103273952, + "loss": 0.0009, + "num_input_tokens_seen": 212744016, + "step": 98595 + }, + { + "epoch": 16.08482871125612, + "grad_norm": 0.0010926040122285485, + "learning_rate": 0.00011227055430216476, + "loss": 0.0007, + "num_input_tokens_seen": 212754128, + "step": 98600 + }, + { + "epoch": 16.08564437194127, + "grad_norm": 0.056472305208444595, + "learning_rate": 0.00011222561542941906, + "loss": 0.0043, + "num_input_tokens_seen": 212765040, + "step": 98605 + }, + { + "epoch": 16.086460032626427, + "grad_norm": 0.0004983888939023018, + "learning_rate": 0.00011218068441541323, + "loss": 0.0008, + "num_input_tokens_seen": 212774416, + "step": 98610 + }, + { + "epoch": 16.087275693311582, + "grad_norm": 0.012620815075933933, + "learning_rate": 0.0001121357612610578, + "loss": 0.0013, + "num_input_tokens_seen": 212784496, + "step": 98615 + }, + { + "epoch": 16.088091353996738, + "grad_norm": 0.011876381002366543, + "learning_rate": 0.0001120908459672632, + "loss": 0.0018, + "num_input_tokens_seen": 212795728, + "step": 98620 + }, + { + "epoch": 16.088907014681894, + "grad_norm": 0.0003187301626894623, + "learning_rate": 0.00011204593853493978, + "loss": 0.0029, + "num_input_tokens_seen": 212806128, + "step": 98625 + }, + { + "epoch": 16.089722675367046, + "grad_norm": 0.0028939340263605118, + "learning_rate": 0.00011200103896499748, + "loss": 0.0024, + "num_input_tokens_seen": 212816976, + "step": 98630 + }, + { + "epoch": 16.0905383360522, + "grad_norm": 0.0009569992544129491, + "learning_rate": 0.00011195614725834636, + "loss": 0.0027, + "num_input_tokens_seen": 212827984, + "step": 98635 + }, + { + "epoch": 16.091353996737357, + "grad_norm": 0.20930473506450653, + "learning_rate": 0.0001119112634158962, + "loss": 0.004, + "num_input_tokens_seen": 212838192, + "step": 98640 + }, + { + "epoch": 16.092169657422513, + "grad_norm": 0.0012635784223675728, + "learning_rate": 0.00011186638743855643, + "loss": 0.0052, + "num_input_tokens_seen": 212847920, + "step": 98645 + }, + { + "epoch": 16.09298531810767, + "grad_norm": 0.0011961633572354913, + "learning_rate": 0.00011182151932723706, + "loss": 0.0005, + "num_input_tokens_seen": 212858480, + "step": 98650 + }, + { + "epoch": 16.09380097879282, + "grad_norm": 0.00017778830078896135, + "learning_rate": 0.00011177665908284667, + "loss": 0.0007, + "num_input_tokens_seen": 212869360, + "step": 98655 + }, + { + "epoch": 16.094616639477977, + "grad_norm": 0.0006655273027718067, + "learning_rate": 0.00011173180670629496, + "loss": 0.0004, + "num_input_tokens_seen": 212880816, + "step": 98660 + }, + { + "epoch": 16.095432300163132, + "grad_norm": 0.00010495977039681748, + "learning_rate": 0.00011168696219849078, + "loss": 0.0004, + "num_input_tokens_seen": 212891152, + "step": 98665 + }, + { + "epoch": 16.096247960848288, + "grad_norm": 0.010190960019826889, + "learning_rate": 0.00011164212556034287, + "loss": 0.0019, + "num_input_tokens_seen": 212900560, + "step": 98670 + }, + { + "epoch": 16.097063621533444, + "grad_norm": 0.0015943309990689158, + "learning_rate": 0.00011159729679275999, + "loss": 0.0032, + "num_input_tokens_seen": 212911472, + "step": 98675 + }, + { + "epoch": 16.097879282218596, + "grad_norm": 0.008765150792896748, + "learning_rate": 0.00011155247589665057, + "loss": 0.0016, + "num_input_tokens_seen": 212922416, + "step": 98680 + }, + { + "epoch": 16.09869494290375, + "grad_norm": 0.00040399146382696927, + "learning_rate": 0.00011150766287292302, + "loss": 0.0012, + "num_input_tokens_seen": 212934160, + "step": 98685 + }, + { + "epoch": 16.099510603588907, + "grad_norm": 0.34434008598327637, + "learning_rate": 0.00011146285772248555, + "loss": 0.0172, + "num_input_tokens_seen": 212945936, + "step": 98690 + }, + { + "epoch": 16.100326264274063, + "grad_norm": 0.006885967217385769, + "learning_rate": 0.00011141806044624614, + "loss": 0.0008, + "num_input_tokens_seen": 212956848, + "step": 98695 + }, + { + "epoch": 16.10114192495922, + "grad_norm": 0.2160376012325287, + "learning_rate": 0.00011137327104511268, + "loss": 0.006, + "num_input_tokens_seen": 212967760, + "step": 98700 + }, + { + "epoch": 16.10195758564437, + "grad_norm": 0.00225471262820065, + "learning_rate": 0.00011132848951999286, + "loss": 0.0149, + "num_input_tokens_seen": 212978224, + "step": 98705 + }, + { + "epoch": 16.102773246329527, + "grad_norm": 0.006119105499237776, + "learning_rate": 0.00011128371587179431, + "loss": 0.1288, + "num_input_tokens_seen": 212988720, + "step": 98710 + }, + { + "epoch": 16.103588907014682, + "grad_norm": 0.14389701187610626, + "learning_rate": 0.00011123895010142437, + "loss": 0.0027, + "num_input_tokens_seen": 212999952, + "step": 98715 + }, + { + "epoch": 16.104404567699838, + "grad_norm": 0.07571909576654434, + "learning_rate": 0.00011119419220979033, + "loss": 0.0076, + "num_input_tokens_seen": 213011088, + "step": 98720 + }, + { + "epoch": 16.10522022838499, + "grad_norm": 0.003942546900361776, + "learning_rate": 0.00011114944219779916, + "loss": 0.0013, + "num_input_tokens_seen": 213022480, + "step": 98725 + }, + { + "epoch": 16.106035889070146, + "grad_norm": 0.00508617889136076, + "learning_rate": 0.00011110470006635781, + "loss": 0.0037, + "num_input_tokens_seen": 213033040, + "step": 98730 + }, + { + "epoch": 16.1068515497553, + "grad_norm": 0.0001612607011338696, + "learning_rate": 0.00011105996581637312, + "loss": 0.0012, + "num_input_tokens_seen": 213044336, + "step": 98735 + }, + { + "epoch": 16.107667210440457, + "grad_norm": 0.004412360023707151, + "learning_rate": 0.00011101523944875163, + "loss": 0.0135, + "num_input_tokens_seen": 213053456, + "step": 98740 + }, + { + "epoch": 16.108482871125613, + "grad_norm": 0.012297747656702995, + "learning_rate": 0.00011097052096439974, + "loss": 0.001, + "num_input_tokens_seen": 213064752, + "step": 98745 + }, + { + "epoch": 16.109298531810765, + "grad_norm": 0.003085362259298563, + "learning_rate": 0.00011092581036422378, + "loss": 0.0004, + "num_input_tokens_seen": 213076080, + "step": 98750 + }, + { + "epoch": 16.11011419249592, + "grad_norm": 0.0005873819463886321, + "learning_rate": 0.00011088110764912984, + "loss": 0.0004, + "num_input_tokens_seen": 213085264, + "step": 98755 + }, + { + "epoch": 16.110929853181077, + "grad_norm": 0.1401209980249405, + "learning_rate": 0.00011083641282002387, + "loss": 0.0072, + "num_input_tokens_seen": 213095760, + "step": 98760 + }, + { + "epoch": 16.111745513866232, + "grad_norm": 0.00036691187415272, + "learning_rate": 0.00011079172587781172, + "loss": 0.0011, + "num_input_tokens_seen": 213106384, + "step": 98765 + }, + { + "epoch": 16.112561174551388, + "grad_norm": 0.00035986569127999246, + "learning_rate": 0.00011074704682339897, + "loss": 0.0042, + "num_input_tokens_seen": 213117936, + "step": 98770 + }, + { + "epoch": 16.11337683523654, + "grad_norm": 0.0001302465097978711, + "learning_rate": 0.00011070237565769097, + "loss": 0.0006, + "num_input_tokens_seen": 213128240, + "step": 98775 + }, + { + "epoch": 16.114192495921696, + "grad_norm": 0.00041698251152411103, + "learning_rate": 0.0001106577123815935, + "loss": 0.0006, + "num_input_tokens_seen": 213140240, + "step": 98780 + }, + { + "epoch": 16.11500815660685, + "grad_norm": 0.013479344546794891, + "learning_rate": 0.0001106130569960111, + "loss": 0.0005, + "num_input_tokens_seen": 213150288, + "step": 98785 + }, + { + "epoch": 16.115823817292007, + "grad_norm": 0.003966978285461664, + "learning_rate": 0.00011056840950184921, + "loss": 0.0047, + "num_input_tokens_seen": 213161456, + "step": 98790 + }, + { + "epoch": 16.116639477977163, + "grad_norm": 0.00035824996302835643, + "learning_rate": 0.00011052376990001256, + "loss": 0.0021, + "num_input_tokens_seen": 213173040, + "step": 98795 + }, + { + "epoch": 16.117455138662315, + "grad_norm": 0.00012466330372262746, + "learning_rate": 0.00011047913819140576, + "loss": 0.0002, + "num_input_tokens_seen": 213184656, + "step": 98800 + }, + { + "epoch": 16.11827079934747, + "grad_norm": 0.2039758861064911, + "learning_rate": 0.00011043451437693342, + "loss": 0.0066, + "num_input_tokens_seen": 213194224, + "step": 98805 + }, + { + "epoch": 16.119086460032626, + "grad_norm": 0.0009383212309330702, + "learning_rate": 0.00011038989845749981, + "loss": 0.0007, + "num_input_tokens_seen": 213205392, + "step": 98810 + }, + { + "epoch": 16.119902120717782, + "grad_norm": 0.0019524554954841733, + "learning_rate": 0.00011034529043400915, + "loss": 0.0014, + "num_input_tokens_seen": 213216496, + "step": 98815 + }, + { + "epoch": 16.120717781402938, + "grad_norm": 0.0005550095229409635, + "learning_rate": 0.00011030069030736551, + "loss": 0.0004, + "num_input_tokens_seen": 213227920, + "step": 98820 + }, + { + "epoch": 16.12153344208809, + "grad_norm": 0.0006389313493855298, + "learning_rate": 0.0001102560980784727, + "loss": 0.0034, + "num_input_tokens_seen": 213237584, + "step": 98825 + }, + { + "epoch": 16.122349102773246, + "grad_norm": 0.015232326462864876, + "learning_rate": 0.00011021151374823457, + "loss": 0.0014, + "num_input_tokens_seen": 213248336, + "step": 98830 + }, + { + "epoch": 16.1231647634584, + "grad_norm": 0.002397713251411915, + "learning_rate": 0.00011016693731755456, + "loss": 0.0023, + "num_input_tokens_seen": 213258736, + "step": 98835 + }, + { + "epoch": 16.123980424143557, + "grad_norm": 0.00016230314213316888, + "learning_rate": 0.00011012236878733606, + "loss": 0.0005, + "num_input_tokens_seen": 213270768, + "step": 98840 + }, + { + "epoch": 16.124796084828713, + "grad_norm": 0.0004333737015258521, + "learning_rate": 0.00011007780815848239, + "loss": 0.0033, + "num_input_tokens_seen": 213282576, + "step": 98845 + }, + { + "epoch": 16.125611745513865, + "grad_norm": 0.002429444342851639, + "learning_rate": 0.00011003325543189663, + "loss": 0.0006, + "num_input_tokens_seen": 213291568, + "step": 98850 + }, + { + "epoch": 16.12642740619902, + "grad_norm": 0.01888953521847725, + "learning_rate": 0.0001099887106084816, + "loss": 0.0037, + "num_input_tokens_seen": 213303056, + "step": 98855 + }, + { + "epoch": 16.127243066884176, + "grad_norm": 0.002490180777385831, + "learning_rate": 0.00010994417368914011, + "loss": 0.0008, + "num_input_tokens_seen": 213314160, + "step": 98860 + }, + { + "epoch": 16.128058727569332, + "grad_norm": 0.0017309001414105296, + "learning_rate": 0.00010989964467477481, + "loss": 0.0002, + "num_input_tokens_seen": 213325232, + "step": 98865 + }, + { + "epoch": 16.128874388254488, + "grad_norm": 0.41680672764778137, + "learning_rate": 0.00010985512356628807, + "loss": 0.1545, + "num_input_tokens_seen": 213336976, + "step": 98870 + }, + { + "epoch": 16.12969004893964, + "grad_norm": 0.0009257174097001553, + "learning_rate": 0.00010981061036458218, + "loss": 0.001, + "num_input_tokens_seen": 213346672, + "step": 98875 + }, + { + "epoch": 16.130505709624796, + "grad_norm": 0.11573344469070435, + "learning_rate": 0.00010976610507055906, + "loss": 0.0051, + "num_input_tokens_seen": 213358128, + "step": 98880 + }, + { + "epoch": 16.13132137030995, + "grad_norm": 0.0752863883972168, + "learning_rate": 0.00010972160768512123, + "loss": 0.0029, + "num_input_tokens_seen": 213368112, + "step": 98885 + }, + { + "epoch": 16.132137030995107, + "grad_norm": 0.00011307926615700126, + "learning_rate": 0.00010967711820916982, + "loss": 0.0001, + "num_input_tokens_seen": 213378608, + "step": 98890 + }, + { + "epoch": 16.13295269168026, + "grad_norm": 0.0001575253118062392, + "learning_rate": 0.00010963263664360706, + "loss": 0.0042, + "num_input_tokens_seen": 213387856, + "step": 98895 + }, + { + "epoch": 16.133768352365415, + "grad_norm": 0.0005037879454903305, + "learning_rate": 0.00010958816298933383, + "loss": 0.0005, + "num_input_tokens_seen": 213398832, + "step": 98900 + }, + { + "epoch": 16.13458401305057, + "grad_norm": 0.06146703287959099, + "learning_rate": 0.00010954369724725205, + "loss": 0.0035, + "num_input_tokens_seen": 213409488, + "step": 98905 + }, + { + "epoch": 16.135399673735726, + "grad_norm": 0.000868406961672008, + "learning_rate": 0.00010949923941826229, + "loss": 0.0001, + "num_input_tokens_seen": 213419984, + "step": 98910 + }, + { + "epoch": 16.136215334420882, + "grad_norm": 0.009556429460644722, + "learning_rate": 0.0001094547895032661, + "loss": 0.0016, + "num_input_tokens_seen": 213430960, + "step": 98915 + }, + { + "epoch": 16.137030995106034, + "grad_norm": 0.0015359712997451425, + "learning_rate": 0.00010941034750316375, + "loss": 0.0045, + "num_input_tokens_seen": 213440656, + "step": 98920 + }, + { + "epoch": 16.13784665579119, + "grad_norm": 0.00029027959681116045, + "learning_rate": 0.00010936591341885648, + "loss": 0.0037, + "num_input_tokens_seen": 213451376, + "step": 98925 + }, + { + "epoch": 16.138662316476346, + "grad_norm": 0.0005547608598135412, + "learning_rate": 0.0001093214872512443, + "loss": 0.0003, + "num_input_tokens_seen": 213462864, + "step": 98930 + }, + { + "epoch": 16.1394779771615, + "grad_norm": 0.00040020651067607105, + "learning_rate": 0.00010927706900122791, + "loss": 0.0023, + "num_input_tokens_seen": 213474576, + "step": 98935 + }, + { + "epoch": 16.140293637846657, + "grad_norm": 0.00023827768745832145, + "learning_rate": 0.00010923265866970739, + "loss": 0.0026, + "num_input_tokens_seen": 213485488, + "step": 98940 + }, + { + "epoch": 16.14110929853181, + "grad_norm": 0.03194522485136986, + "learning_rate": 0.00010918825625758273, + "loss": 0.002, + "num_input_tokens_seen": 213495984, + "step": 98945 + }, + { + "epoch": 16.141924959216965, + "grad_norm": 0.00012043313472531736, + "learning_rate": 0.00010914386176575386, + "loss": 0.0006, + "num_input_tokens_seen": 213507600, + "step": 98950 + }, + { + "epoch": 16.14274061990212, + "grad_norm": 0.025165919214487076, + "learning_rate": 0.00010909947519512048, + "loss": 0.0008, + "num_input_tokens_seen": 213518064, + "step": 98955 + }, + { + "epoch": 16.143556280587276, + "grad_norm": 0.017567554488778114, + "learning_rate": 0.00010905509654658208, + "loss": 0.0017, + "num_input_tokens_seen": 213528656, + "step": 98960 + }, + { + "epoch": 16.144371941272432, + "grad_norm": 0.0005006112041883171, + "learning_rate": 0.00010901072582103816, + "loss": 0.0011, + "num_input_tokens_seen": 213537648, + "step": 98965 + }, + { + "epoch": 16.145187601957584, + "grad_norm": 0.0002035616635112092, + "learning_rate": 0.00010896636301938784, + "loss": 0.0003, + "num_input_tokens_seen": 213549488, + "step": 98970 + }, + { + "epoch": 16.14600326264274, + "grad_norm": 0.00031394895631819963, + "learning_rate": 0.00010892200814253023, + "loss": 0.0145, + "num_input_tokens_seen": 213558704, + "step": 98975 + }, + { + "epoch": 16.146818923327896, + "grad_norm": 0.047532159835100174, + "learning_rate": 0.00010887766119136427, + "loss": 0.0095, + "num_input_tokens_seen": 213569584, + "step": 98980 + }, + { + "epoch": 16.14763458401305, + "grad_norm": 0.05014142021536827, + "learning_rate": 0.00010883332216678853, + "loss": 0.0017, + "num_input_tokens_seen": 213580048, + "step": 98985 + }, + { + "epoch": 16.148450244698207, + "grad_norm": 0.0005862409598194063, + "learning_rate": 0.00010878899106970203, + "loss": 0.001, + "num_input_tokens_seen": 213590768, + "step": 98990 + }, + { + "epoch": 16.14926590538336, + "grad_norm": 1.1277607679367065, + "learning_rate": 0.00010874466790100268, + "loss": 0.1234, + "num_input_tokens_seen": 213599824, + "step": 98995 + }, + { + "epoch": 16.150081566068515, + "grad_norm": 0.008365337736904621, + "learning_rate": 0.00010870035266158918, + "loss": 0.0021, + "num_input_tokens_seen": 213610352, + "step": 99000 + }, + { + "epoch": 16.15089722675367, + "grad_norm": 0.0030512565281242132, + "learning_rate": 0.00010865604535235918, + "loss": 0.0115, + "num_input_tokens_seen": 213621744, + "step": 99005 + }, + { + "epoch": 16.151712887438826, + "grad_norm": 1.011581540107727, + "learning_rate": 0.0001086117459742112, + "loss": 0.1723, + "num_input_tokens_seen": 213632272, + "step": 99010 + }, + { + "epoch": 16.152528548123982, + "grad_norm": 0.0024505695328116417, + "learning_rate": 0.00010856745452804234, + "loss": 0.0193, + "num_input_tokens_seen": 213641808, + "step": 99015 + }, + { + "epoch": 16.153344208809134, + "grad_norm": 0.00047407130477949977, + "learning_rate": 0.0001085231710147509, + "loss": 0.0002, + "num_input_tokens_seen": 213651760, + "step": 99020 + }, + { + "epoch": 16.15415986949429, + "grad_norm": 0.02473132684826851, + "learning_rate": 0.00010847889543523376, + "loss": 0.0013, + "num_input_tokens_seen": 213662608, + "step": 99025 + }, + { + "epoch": 16.154975530179446, + "grad_norm": 8.542987052351236e-05, + "learning_rate": 0.00010843462779038876, + "loss": 0.003, + "num_input_tokens_seen": 213673232, + "step": 99030 + }, + { + "epoch": 16.1557911908646, + "grad_norm": 0.019800664857029915, + "learning_rate": 0.00010839036808111246, + "loss": 0.0008, + "num_input_tokens_seen": 213683792, + "step": 99035 + }, + { + "epoch": 16.156606851549757, + "grad_norm": 0.005705035757273436, + "learning_rate": 0.00010834611630830244, + "loss": 0.0012, + "num_input_tokens_seen": 213694224, + "step": 99040 + }, + { + "epoch": 16.15742251223491, + "grad_norm": 0.05355852469801903, + "learning_rate": 0.00010830187247285489, + "loss": 0.0028, + "num_input_tokens_seen": 213704592, + "step": 99045 + }, + { + "epoch": 16.158238172920065, + "grad_norm": 0.005607281811535358, + "learning_rate": 0.00010825763657566717, + "loss": 0.0006, + "num_input_tokens_seen": 213716144, + "step": 99050 + }, + { + "epoch": 16.15905383360522, + "grad_norm": 0.0002114044182235375, + "learning_rate": 0.00010821340861763506, + "loss": 0.0004, + "num_input_tokens_seen": 213726640, + "step": 99055 + }, + { + "epoch": 16.159869494290376, + "grad_norm": 0.00018516888667363673, + "learning_rate": 0.00010816918859965552, + "loss": 0.0044, + "num_input_tokens_seen": 213738320, + "step": 99060 + }, + { + "epoch": 16.160685154975532, + "grad_norm": 0.00672272639349103, + "learning_rate": 0.00010812497652262421, + "loss": 0.0007, + "num_input_tokens_seen": 213750320, + "step": 99065 + }, + { + "epoch": 16.161500815660684, + "grad_norm": 0.00029566168086603284, + "learning_rate": 0.00010808077238743763, + "loss": 0.0066, + "num_input_tokens_seen": 213760368, + "step": 99070 + }, + { + "epoch": 16.16231647634584, + "grad_norm": 0.0043870508670806885, + "learning_rate": 0.00010803657619499107, + "loss": 0.0003, + "num_input_tokens_seen": 213771536, + "step": 99075 + }, + { + "epoch": 16.163132137030995, + "grad_norm": 0.002170632826164365, + "learning_rate": 0.00010799238794618077, + "loss": 0.0021, + "num_input_tokens_seen": 213782288, + "step": 99080 + }, + { + "epoch": 16.16394779771615, + "grad_norm": 0.0010761994635686278, + "learning_rate": 0.00010794820764190194, + "loss": 0.0002, + "num_input_tokens_seen": 213792208, + "step": 99085 + }, + { + "epoch": 16.164763458401303, + "grad_norm": 0.08890499919652939, + "learning_rate": 0.00010790403528305004, + "loss": 0.0042, + "num_input_tokens_seen": 213804144, + "step": 99090 + }, + { + "epoch": 16.16557911908646, + "grad_norm": 0.010051117278635502, + "learning_rate": 0.0001078598708705203, + "loss": 0.0011, + "num_input_tokens_seen": 213814032, + "step": 99095 + }, + { + "epoch": 16.166394779771615, + "grad_norm": 0.039757758378982544, + "learning_rate": 0.00010781571440520777, + "loss": 0.0016, + "num_input_tokens_seen": 213823824, + "step": 99100 + }, + { + "epoch": 16.16721044045677, + "grad_norm": 0.005802873056381941, + "learning_rate": 0.00010777156588800724, + "loss": 0.0398, + "num_input_tokens_seen": 213834224, + "step": 99105 + }, + { + "epoch": 16.168026101141926, + "grad_norm": 0.028820903971791267, + "learning_rate": 0.00010772742531981356, + "loss": 0.0025, + "num_input_tokens_seen": 213844240, + "step": 99110 + }, + { + "epoch": 16.16884176182708, + "grad_norm": 0.00042527145706117153, + "learning_rate": 0.00010768329270152122, + "loss": 0.0023, + "num_input_tokens_seen": 213854896, + "step": 99115 + }, + { + "epoch": 16.169657422512234, + "grad_norm": 0.0006946519133634865, + "learning_rate": 0.00010763916803402463, + "loss": 0.0066, + "num_input_tokens_seen": 213866640, + "step": 99120 + }, + { + "epoch": 16.17047308319739, + "grad_norm": 0.004098538774996996, + "learning_rate": 0.00010759505131821806, + "loss": 0.1016, + "num_input_tokens_seen": 213878192, + "step": 99125 + }, + { + "epoch": 16.171288743882545, + "grad_norm": 0.3300659656524658, + "learning_rate": 0.00010755094255499542, + "loss": 0.0065, + "num_input_tokens_seen": 213888752, + "step": 99130 + }, + { + "epoch": 16.1721044045677, + "grad_norm": 0.006350142415612936, + "learning_rate": 0.00010750684174525111, + "loss": 0.0015, + "num_input_tokens_seen": 213899600, + "step": 99135 + }, + { + "epoch": 16.172920065252853, + "grad_norm": 0.006888189818710089, + "learning_rate": 0.00010746274888987822, + "loss": 0.0006, + "num_input_tokens_seen": 213911152, + "step": 99140 + }, + { + "epoch": 16.17373572593801, + "grad_norm": 0.00570251327008009, + "learning_rate": 0.00010741866398977101, + "loss": 0.0022, + "num_input_tokens_seen": 213922960, + "step": 99145 + }, + { + "epoch": 16.174551386623165, + "grad_norm": 0.0009478118736296892, + "learning_rate": 0.00010737458704582232, + "loss": 0.0007, + "num_input_tokens_seen": 213933136, + "step": 99150 + }, + { + "epoch": 16.17536704730832, + "grad_norm": 0.002653405535966158, + "learning_rate": 0.00010733051805892602, + "loss": 0.0016, + "num_input_tokens_seen": 213943472, + "step": 99155 + }, + { + "epoch": 16.176182707993476, + "grad_norm": 0.0014007817953824997, + "learning_rate": 0.00010728645702997458, + "loss": 0.0015, + "num_input_tokens_seen": 213953584, + "step": 99160 + }, + { + "epoch": 16.17699836867863, + "grad_norm": 0.0004063938104081899, + "learning_rate": 0.00010724240395986156, + "loss": 0.0004, + "num_input_tokens_seen": 213964176, + "step": 99165 + }, + { + "epoch": 16.177814029363784, + "grad_norm": 0.0001167362934211269, + "learning_rate": 0.00010719835884947921, + "loss": 0.0004, + "num_input_tokens_seen": 213974320, + "step": 99170 + }, + { + "epoch": 16.17862969004894, + "grad_norm": 0.0033445453736931086, + "learning_rate": 0.00010715432169972067, + "loss": 0.0186, + "num_input_tokens_seen": 213984976, + "step": 99175 + }, + { + "epoch": 16.179445350734095, + "grad_norm": 0.20797719061374664, + "learning_rate": 0.00010711029251147791, + "loss": 0.0064, + "num_input_tokens_seen": 213996624, + "step": 99180 + }, + { + "epoch": 16.18026101141925, + "grad_norm": 0.001275201328098774, + "learning_rate": 0.00010706627128564378, + "loss": 0.0066, + "num_input_tokens_seen": 214005808, + "step": 99185 + }, + { + "epoch": 16.181076672104403, + "grad_norm": 0.0005514289368875325, + "learning_rate": 0.00010702225802310983, + "loss": 0.0007, + "num_input_tokens_seen": 214016720, + "step": 99190 + }, + { + "epoch": 16.18189233278956, + "grad_norm": 0.002038146834820509, + "learning_rate": 0.00010697825272476847, + "loss": 0.0011, + "num_input_tokens_seen": 214028272, + "step": 99195 + }, + { + "epoch": 16.182707993474715, + "grad_norm": 0.0009862730512395501, + "learning_rate": 0.00010693425539151141, + "loss": 0.0008, + "num_input_tokens_seen": 214038896, + "step": 99200 + }, + { + "epoch": 16.18352365415987, + "grad_norm": 0.0002357991470489651, + "learning_rate": 0.00010689026602423036, + "loss": 0.0029, + "num_input_tokens_seen": 214050224, + "step": 99205 + }, + { + "epoch": 16.184339314845026, + "grad_norm": 0.0005186043563298881, + "learning_rate": 0.00010684628462381673, + "loss": 0.0008, + "num_input_tokens_seen": 214059408, + "step": 99210 + }, + { + "epoch": 16.18515497553018, + "grad_norm": 0.0064187184907495975, + "learning_rate": 0.00010680231119116185, + "loss": 0.001, + "num_input_tokens_seen": 214071216, + "step": 99215 + }, + { + "epoch": 16.185970636215334, + "grad_norm": 0.0010591634782031178, + "learning_rate": 0.00010675834572715698, + "loss": 0.0021, + "num_input_tokens_seen": 214080784, + "step": 99220 + }, + { + "epoch": 16.18678629690049, + "grad_norm": 0.0001180768376798369, + "learning_rate": 0.00010671438823269314, + "loss": 0.0081, + "num_input_tokens_seen": 214091984, + "step": 99225 + }, + { + "epoch": 16.187601957585645, + "grad_norm": 0.00013387241051532328, + "learning_rate": 0.00010667043870866105, + "loss": 0.0071, + "num_input_tokens_seen": 214103632, + "step": 99230 + }, + { + "epoch": 16.1884176182708, + "grad_norm": 0.0014703941997140646, + "learning_rate": 0.00010662649715595157, + "loss": 0.0003, + "num_input_tokens_seen": 214111920, + "step": 99235 + }, + { + "epoch": 16.189233278955953, + "grad_norm": 0.0001524462568340823, + "learning_rate": 0.00010658256357545509, + "loss": 0.0003, + "num_input_tokens_seen": 214120048, + "step": 99240 + }, + { + "epoch": 16.19004893964111, + "grad_norm": 0.00023062207037582994, + "learning_rate": 0.00010653863796806213, + "loss": 0.001, + "num_input_tokens_seen": 214129296, + "step": 99245 + }, + { + "epoch": 16.190864600326265, + "grad_norm": 0.0017548762261867523, + "learning_rate": 0.00010649472033466273, + "loss": 0.0018, + "num_input_tokens_seen": 214138832, + "step": 99250 + }, + { + "epoch": 16.19168026101142, + "grad_norm": 0.007979316636919975, + "learning_rate": 0.00010645081067614703, + "loss": 0.0005, + "num_input_tokens_seen": 214149840, + "step": 99255 + }, + { + "epoch": 16.192495921696572, + "grad_norm": 0.003006615210324526, + "learning_rate": 0.00010640690899340494, + "loss": 0.0012, + "num_input_tokens_seen": 214161360, + "step": 99260 + }, + { + "epoch": 16.193311582381728, + "grad_norm": 0.0015094154514372349, + "learning_rate": 0.00010636301528732612, + "loss": 0.0009, + "num_input_tokens_seen": 214172496, + "step": 99265 + }, + { + "epoch": 16.194127243066884, + "grad_norm": 0.0016187592409551144, + "learning_rate": 0.00010631912955880018, + "loss": 0.0015, + "num_input_tokens_seen": 214183504, + "step": 99270 + }, + { + "epoch": 16.19494290375204, + "grad_norm": 0.010476581752300262, + "learning_rate": 0.00010627525180871633, + "loss": 0.0102, + "num_input_tokens_seen": 214194864, + "step": 99275 + }, + { + "epoch": 16.195758564437195, + "grad_norm": 0.0011915371287614107, + "learning_rate": 0.00010623138203796429, + "loss": 0.0061, + "num_input_tokens_seen": 214206512, + "step": 99280 + }, + { + "epoch": 16.196574225122347, + "grad_norm": 0.010787293314933777, + "learning_rate": 0.00010618752024743255, + "loss": 0.0004, + "num_input_tokens_seen": 214218960, + "step": 99285 + }, + { + "epoch": 16.197389885807503, + "grad_norm": 0.07102012634277344, + "learning_rate": 0.00010614366643801055, + "loss": 0.0013, + "num_input_tokens_seen": 214229520, + "step": 99290 + }, + { + "epoch": 16.19820554649266, + "grad_norm": 0.0053579616360366344, + "learning_rate": 0.00010609982061058654, + "loss": 0.0011, + "num_input_tokens_seen": 214240112, + "step": 99295 + }, + { + "epoch": 16.199021207177815, + "grad_norm": 0.00011368890409357846, + "learning_rate": 0.0001060559827660495, + "loss": 0.0004, + "num_input_tokens_seen": 214251248, + "step": 99300 + }, + { + "epoch": 16.19983686786297, + "grad_norm": 0.001174992066808045, + "learning_rate": 0.0001060121529052877, + "loss": 0.0003, + "num_input_tokens_seen": 214262032, + "step": 99305 + }, + { + "epoch": 16.200652528548122, + "grad_norm": 0.019266627728939056, + "learning_rate": 0.0001059683310291894, + "loss": 0.0023, + "num_input_tokens_seen": 214273104, + "step": 99310 + }, + { + "epoch": 16.201468189233278, + "grad_norm": 0.004518282134085894, + "learning_rate": 0.00010592451713864282, + "loss": 0.0017, + "num_input_tokens_seen": 214284720, + "step": 99315 + }, + { + "epoch": 16.202283849918434, + "grad_norm": 0.056013450026512146, + "learning_rate": 0.00010588071123453574, + "loss": 0.0014, + "num_input_tokens_seen": 214296336, + "step": 99320 + }, + { + "epoch": 16.20309951060359, + "grad_norm": 0.0002610927331261337, + "learning_rate": 0.00010583691331775608, + "loss": 0.0006, + "num_input_tokens_seen": 214307888, + "step": 99325 + }, + { + "epoch": 16.203915171288745, + "grad_norm": 0.0006846070173196495, + "learning_rate": 0.0001057931233891914, + "loss": 0.0048, + "num_input_tokens_seen": 214318352, + "step": 99330 + }, + { + "epoch": 16.204730831973897, + "grad_norm": 0.04821879789233208, + "learning_rate": 0.00010574934144972908, + "loss": 0.0024, + "num_input_tokens_seen": 214329232, + "step": 99335 + }, + { + "epoch": 16.205546492659053, + "grad_norm": 0.0006477761780843139, + "learning_rate": 0.00010570556750025656, + "loss": 0.0002, + "num_input_tokens_seen": 214338416, + "step": 99340 + }, + { + "epoch": 16.20636215334421, + "grad_norm": 0.008179224096238613, + "learning_rate": 0.00010566180154166094, + "loss": 0.0022, + "num_input_tokens_seen": 214348976, + "step": 99345 + }, + { + "epoch": 16.207177814029365, + "grad_norm": 0.0006066395435482264, + "learning_rate": 0.00010561804357482912, + "loss": 0.0009, + "num_input_tokens_seen": 214359344, + "step": 99350 + }, + { + "epoch": 16.20799347471452, + "grad_norm": 0.008939487859606743, + "learning_rate": 0.00010557429360064796, + "loss": 0.0145, + "num_input_tokens_seen": 214370672, + "step": 99355 + }, + { + "epoch": 16.208809135399672, + "grad_norm": 0.0009951103711500764, + "learning_rate": 0.00010553055162000414, + "loss": 0.0003, + "num_input_tokens_seen": 214382128, + "step": 99360 + }, + { + "epoch": 16.209624796084828, + "grad_norm": 0.005282310303300619, + "learning_rate": 0.0001054868176337841, + "loss": 0.0016, + "num_input_tokens_seen": 214393008, + "step": 99365 + }, + { + "epoch": 16.210440456769984, + "grad_norm": 0.0003233069146517664, + "learning_rate": 0.00010544309164287418, + "loss": 0.0111, + "num_input_tokens_seen": 214404112, + "step": 99370 + }, + { + "epoch": 16.21125611745514, + "grad_norm": 0.012422445230185986, + "learning_rate": 0.00010539937364816049, + "loss": 0.0007, + "num_input_tokens_seen": 214416432, + "step": 99375 + }, + { + "epoch": 16.212071778140295, + "grad_norm": 0.15065011382102966, + "learning_rate": 0.00010535566365052913, + "loss": 0.0069, + "num_input_tokens_seen": 214428240, + "step": 99380 + }, + { + "epoch": 16.212887438825447, + "grad_norm": 0.0004737574199680239, + "learning_rate": 0.00010531196165086587, + "loss": 0.0014, + "num_input_tokens_seen": 214439696, + "step": 99385 + }, + { + "epoch": 16.213703099510603, + "grad_norm": 0.0004930857103317976, + "learning_rate": 0.00010526826765005642, + "loss": 0.0008, + "num_input_tokens_seen": 214449936, + "step": 99390 + }, + { + "epoch": 16.21451876019576, + "grad_norm": 0.0022723281290382147, + "learning_rate": 0.00010522458164898624, + "loss": 0.005, + "num_input_tokens_seen": 214461712, + "step": 99395 + }, + { + "epoch": 16.215334420880914, + "grad_norm": 0.0005975187523290515, + "learning_rate": 0.00010518090364854077, + "loss": 0.0002, + "num_input_tokens_seen": 214473584, + "step": 99400 + }, + { + "epoch": 16.21615008156607, + "grad_norm": 0.0004166236030869186, + "learning_rate": 0.00010513723364960497, + "loss": 0.0016, + "num_input_tokens_seen": 214484528, + "step": 99405 + }, + { + "epoch": 16.216965742251222, + "grad_norm": 0.002230869373306632, + "learning_rate": 0.00010509357165306422, + "loss": 0.0006, + "num_input_tokens_seen": 214495536, + "step": 99410 + }, + { + "epoch": 16.217781402936378, + "grad_norm": 0.09517454355955124, + "learning_rate": 0.00010504991765980321, + "loss": 0.0019, + "num_input_tokens_seen": 214506128, + "step": 99415 + }, + { + "epoch": 16.218597063621534, + "grad_norm": 0.00034491284168325365, + "learning_rate": 0.00010500627167070665, + "loss": 0.0024, + "num_input_tokens_seen": 214516528, + "step": 99420 + }, + { + "epoch": 16.21941272430669, + "grad_norm": 0.006337933242321014, + "learning_rate": 0.00010496263368665904, + "loss": 0.0007, + "num_input_tokens_seen": 214526896, + "step": 99425 + }, + { + "epoch": 16.22022838499184, + "grad_norm": 0.020765701308846474, + "learning_rate": 0.00010491900370854484, + "loss": 0.0632, + "num_input_tokens_seen": 214538448, + "step": 99430 + }, + { + "epoch": 16.221044045676997, + "grad_norm": 0.6227688789367676, + "learning_rate": 0.0001048753817372482, + "loss": 0.0101, + "num_input_tokens_seen": 214548112, + "step": 99435 + }, + { + "epoch": 16.221859706362153, + "grad_norm": 0.008205956779420376, + "learning_rate": 0.00010483176777365322, + "loss": 0.0009, + "num_input_tokens_seen": 214558832, + "step": 99440 + }, + { + "epoch": 16.22267536704731, + "grad_norm": 0.0004693370428867638, + "learning_rate": 0.00010478816181864376, + "loss": 0.001, + "num_input_tokens_seen": 214570064, + "step": 99445 + }, + { + "epoch": 16.223491027732464, + "grad_norm": 0.0004355916171334684, + "learning_rate": 0.0001047445638731036, + "loss": 0.0087, + "num_input_tokens_seen": 214580944, + "step": 99450 + }, + { + "epoch": 16.224306688417617, + "grad_norm": 0.0028469276148825884, + "learning_rate": 0.00010470097393791622, + "loss": 0.0002, + "num_input_tokens_seen": 214591696, + "step": 99455 + }, + { + "epoch": 16.225122349102772, + "grad_norm": 0.0012782367412000895, + "learning_rate": 0.00010465739201396512, + "loss": 0.0009, + "num_input_tokens_seen": 214602384, + "step": 99460 + }, + { + "epoch": 16.225938009787928, + "grad_norm": 0.0006481695454567671, + "learning_rate": 0.00010461381810213344, + "loss": 0.001, + "num_input_tokens_seen": 214613712, + "step": 99465 + }, + { + "epoch": 16.226753670473084, + "grad_norm": 0.0036935091484338045, + "learning_rate": 0.00010457025220330435, + "loss": 0.0005, + "num_input_tokens_seen": 214624752, + "step": 99470 + }, + { + "epoch": 16.22756933115824, + "grad_norm": 0.0008179727592505515, + "learning_rate": 0.00010452669431836076, + "loss": 0.1542, + "num_input_tokens_seen": 214634608, + "step": 99475 + }, + { + "epoch": 16.22838499184339, + "grad_norm": 0.0025631259195506573, + "learning_rate": 0.00010448314444818541, + "loss": 0.0111, + "num_input_tokens_seen": 214645616, + "step": 99480 + }, + { + "epoch": 16.229200652528547, + "grad_norm": 0.005211603827774525, + "learning_rate": 0.00010443960259366081, + "loss": 0.0005, + "num_input_tokens_seen": 214656944, + "step": 99485 + }, + { + "epoch": 16.230016313213703, + "grad_norm": 0.04098931699991226, + "learning_rate": 0.00010439606875566954, + "loss": 0.0024, + "num_input_tokens_seen": 214666800, + "step": 99490 + }, + { + "epoch": 16.23083197389886, + "grad_norm": 0.00017822047811932862, + "learning_rate": 0.00010435254293509378, + "loss": 0.0003, + "num_input_tokens_seen": 214677872, + "step": 99495 + }, + { + "epoch": 16.231647634584014, + "grad_norm": 0.0006994415889494121, + "learning_rate": 0.00010430902513281565, + "loss": 0.0029, + "num_input_tokens_seen": 214689232, + "step": 99500 + }, + { + "epoch": 16.232463295269167, + "grad_norm": 0.0005696951993741095, + "learning_rate": 0.00010426551534971706, + "loss": 0.0021, + "num_input_tokens_seen": 214700176, + "step": 99505 + }, + { + "epoch": 16.233278955954322, + "grad_norm": 0.00014364569506142288, + "learning_rate": 0.00010422201358667987, + "loss": 0.0014, + "num_input_tokens_seen": 214711920, + "step": 99510 + }, + { + "epoch": 16.234094616639478, + "grad_norm": 0.003214114811271429, + "learning_rate": 0.00010417851984458565, + "loss": 0.0004, + "num_input_tokens_seen": 214722544, + "step": 99515 + }, + { + "epoch": 16.234910277324634, + "grad_norm": 0.002923009218648076, + "learning_rate": 0.00010413503412431568, + "loss": 0.0005, + "num_input_tokens_seen": 214733488, + "step": 99520 + }, + { + "epoch": 16.23572593800979, + "grad_norm": 0.013028787449002266, + "learning_rate": 0.00010409155642675178, + "loss": 0.0012, + "num_input_tokens_seen": 214744336, + "step": 99525 + }, + { + "epoch": 16.23654159869494, + "grad_norm": 0.00047758783330209553, + "learning_rate": 0.00010404808675277444, + "loss": 0.011, + "num_input_tokens_seen": 214755664, + "step": 99530 + }, + { + "epoch": 16.237357259380097, + "grad_norm": 0.0005631741951219738, + "learning_rate": 0.00010400462510326513, + "loss": 0.0268, + "num_input_tokens_seen": 214765072, + "step": 99535 + }, + { + "epoch": 16.238172920065253, + "grad_norm": 0.0019707242026925087, + "learning_rate": 0.00010396117147910422, + "loss": 0.0018, + "num_input_tokens_seen": 214776688, + "step": 99540 + }, + { + "epoch": 16.23898858075041, + "grad_norm": 0.0003834923845715821, + "learning_rate": 0.00010391772588117288, + "loss": 0.0744, + "num_input_tokens_seen": 214787760, + "step": 99545 + }, + { + "epoch": 16.239804241435564, + "grad_norm": 0.008628292009234428, + "learning_rate": 0.000103874288310351, + "loss": 0.0015, + "num_input_tokens_seen": 214798192, + "step": 99550 + }, + { + "epoch": 16.240619902120716, + "grad_norm": 0.0013731379294767976, + "learning_rate": 0.0001038308587675193, + "loss": 0.0061, + "num_input_tokens_seen": 214808656, + "step": 99555 + }, + { + "epoch": 16.241435562805872, + "grad_norm": 0.10127880424261093, + "learning_rate": 0.00010378743725355788, + "loss": 0.0028, + "num_input_tokens_seen": 214817552, + "step": 99560 + }, + { + "epoch": 16.242251223491028, + "grad_norm": 0.032486919313669205, + "learning_rate": 0.00010374402376934661, + "loss": 0.0014, + "num_input_tokens_seen": 214829264, + "step": 99565 + }, + { + "epoch": 16.243066884176184, + "grad_norm": 0.004712745547294617, + "learning_rate": 0.00010370061831576544, + "loss": 0.0005, + "num_input_tokens_seen": 214840368, + "step": 99570 + }, + { + "epoch": 16.24388254486134, + "grad_norm": 0.0009125425131060183, + "learning_rate": 0.00010365722089369395, + "loss": 0.0006, + "num_input_tokens_seen": 214851696, + "step": 99575 + }, + { + "epoch": 16.24469820554649, + "grad_norm": 0.004358387086540461, + "learning_rate": 0.00010361383150401165, + "loss": 0.0012, + "num_input_tokens_seen": 214862896, + "step": 99580 + }, + { + "epoch": 16.245513866231647, + "grad_norm": 0.12575645744800568, + "learning_rate": 0.00010357045014759797, + "loss": 0.0171, + "num_input_tokens_seen": 214872944, + "step": 99585 + }, + { + "epoch": 16.246329526916803, + "grad_norm": 0.8606206774711609, + "learning_rate": 0.00010352707682533197, + "loss": 0.029, + "num_input_tokens_seen": 214884624, + "step": 99590 + }, + { + "epoch": 16.24714518760196, + "grad_norm": 0.0018189235124737024, + "learning_rate": 0.00010348371153809277, + "loss": 0.0007, + "num_input_tokens_seen": 214893840, + "step": 99595 + }, + { + "epoch": 16.247960848287114, + "grad_norm": 0.01287928968667984, + "learning_rate": 0.00010344035428675914, + "loss": 0.0006, + "num_input_tokens_seen": 214904688, + "step": 99600 + }, + { + "epoch": 16.248776508972266, + "grad_norm": 0.00549550075083971, + "learning_rate": 0.00010339700507220978, + "loss": 0.0148, + "num_input_tokens_seen": 214916144, + "step": 99605 + }, + { + "epoch": 16.249592169657422, + "grad_norm": 0.0003868980857077986, + "learning_rate": 0.0001033536638953233, + "loss": 0.0019, + "num_input_tokens_seen": 214927184, + "step": 99610 + }, + { + "epoch": 16.250407830342578, + "grad_norm": 0.012497547082602978, + "learning_rate": 0.00010331033075697793, + "loss": 0.0016, + "num_input_tokens_seen": 214937744, + "step": 99615 + }, + { + "epoch": 16.251223491027734, + "grad_norm": 0.004107022657990456, + "learning_rate": 0.00010326700565805197, + "loss": 0.0005, + "num_input_tokens_seen": 214949008, + "step": 99620 + }, + { + "epoch": 16.252039151712886, + "grad_norm": 0.0002096012613037601, + "learning_rate": 0.00010322368859942333, + "loss": 0.0008, + "num_input_tokens_seen": 214960592, + "step": 99625 + }, + { + "epoch": 16.25285481239804, + "grad_norm": 0.017861105501651764, + "learning_rate": 0.00010318037958197024, + "loss": 0.0009, + "num_input_tokens_seen": 214970672, + "step": 99630 + }, + { + "epoch": 16.253670473083197, + "grad_norm": 0.028369246050715446, + "learning_rate": 0.0001031370786065699, + "loss": 0.0022, + "num_input_tokens_seen": 214981168, + "step": 99635 + }, + { + "epoch": 16.254486133768353, + "grad_norm": 0.0032194359228014946, + "learning_rate": 0.00010309378567410039, + "loss": 0.0007, + "num_input_tokens_seen": 214992304, + "step": 99640 + }, + { + "epoch": 16.25530179445351, + "grad_norm": 0.00024047547776717693, + "learning_rate": 0.00010305050078543848, + "loss": 0.0007, + "num_input_tokens_seen": 215003440, + "step": 99645 + }, + { + "epoch": 16.25611745513866, + "grad_norm": 0.04599770903587341, + "learning_rate": 0.00010300722394146212, + "loss": 0.0023, + "num_input_tokens_seen": 215014960, + "step": 99650 + }, + { + "epoch": 16.256933115823816, + "grad_norm": 0.0021965079940855503, + "learning_rate": 0.00010296395514304763, + "loss": 0.0018, + "num_input_tokens_seen": 215025808, + "step": 99655 + }, + { + "epoch": 16.257748776508972, + "grad_norm": 0.009998462162911892, + "learning_rate": 0.00010292069439107254, + "loss": 0.001, + "num_input_tokens_seen": 215036752, + "step": 99660 + }, + { + "epoch": 16.258564437194128, + "grad_norm": 0.10374369472265244, + "learning_rate": 0.00010287744168641311, + "loss": 0.0018, + "num_input_tokens_seen": 215047376, + "step": 99665 + }, + { + "epoch": 16.259380097879284, + "grad_norm": 0.008433827199041843, + "learning_rate": 0.00010283419702994634, + "loss": 0.0004, + "num_input_tokens_seen": 215058512, + "step": 99670 + }, + { + "epoch": 16.260195758564436, + "grad_norm": 0.00021761927928309888, + "learning_rate": 0.0001027909604225481, + "loss": 0.0011, + "num_input_tokens_seen": 215069264, + "step": 99675 + }, + { + "epoch": 16.26101141924959, + "grad_norm": 0.03719954565167427, + "learning_rate": 0.00010274773186509528, + "loss": 0.0023, + "num_input_tokens_seen": 215080624, + "step": 99680 + }, + { + "epoch": 16.261827079934747, + "grad_norm": 0.00039624725468456745, + "learning_rate": 0.00010270451135846332, + "loss": 0.0004, + "num_input_tokens_seen": 215089744, + "step": 99685 + }, + { + "epoch": 16.262642740619903, + "grad_norm": 0.0037325734738260508, + "learning_rate": 0.00010266129890352872, + "loss": 0.0003, + "num_input_tokens_seen": 215100400, + "step": 99690 + }, + { + "epoch": 16.26345840130506, + "grad_norm": 0.0002504123840481043, + "learning_rate": 0.00010261809450116666, + "loss": 0.0016, + "num_input_tokens_seen": 215110416, + "step": 99695 + }, + { + "epoch": 16.26427406199021, + "grad_norm": 0.01634128950536251, + "learning_rate": 0.00010257489815225318, + "loss": 0.0005, + "num_input_tokens_seen": 215120912, + "step": 99700 + }, + { + "epoch": 16.265089722675366, + "grad_norm": 0.02927369624376297, + "learning_rate": 0.00010253170985766357, + "loss": 0.0009, + "num_input_tokens_seen": 215132912, + "step": 99705 + }, + { + "epoch": 16.265905383360522, + "grad_norm": 0.002359341364353895, + "learning_rate": 0.00010248852961827309, + "loss": 0.0006, + "num_input_tokens_seen": 215144240, + "step": 99710 + }, + { + "epoch": 16.266721044045678, + "grad_norm": 0.029927285388112068, + "learning_rate": 0.00010244535743495681, + "loss": 0.0007, + "num_input_tokens_seen": 215155568, + "step": 99715 + }, + { + "epoch": 16.267536704730833, + "grad_norm": 0.0003132217680104077, + "learning_rate": 0.00010240219330858969, + "loss": 0.0004, + "num_input_tokens_seen": 215166320, + "step": 99720 + }, + { + "epoch": 16.268352365415986, + "grad_norm": 0.00040158003685064614, + "learning_rate": 0.00010235903724004652, + "loss": 0.0002, + "num_input_tokens_seen": 215178448, + "step": 99725 + }, + { + "epoch": 16.26916802610114, + "grad_norm": 0.0008211858221329749, + "learning_rate": 0.00010231588923020196, + "loss": 0.0002, + "num_input_tokens_seen": 215190480, + "step": 99730 + }, + { + "epoch": 16.269983686786297, + "grad_norm": 0.0005600028671324253, + "learning_rate": 0.00010227274927993035, + "loss": 0.0005, + "num_input_tokens_seen": 215200304, + "step": 99735 + }, + { + "epoch": 16.270799347471453, + "grad_norm": 0.005810518283396959, + "learning_rate": 0.000102229617390106, + "loss": 0.0004, + "num_input_tokens_seen": 215211152, + "step": 99740 + }, + { + "epoch": 16.27161500815661, + "grad_norm": 0.17758481204509735, + "learning_rate": 0.00010218649356160314, + "loss": 0.005, + "num_input_tokens_seen": 215222960, + "step": 99745 + }, + { + "epoch": 16.27243066884176, + "grad_norm": 0.014876814559102058, + "learning_rate": 0.00010214337779529548, + "loss": 0.0028, + "num_input_tokens_seen": 215234032, + "step": 99750 + }, + { + "epoch": 16.273246329526916, + "grad_norm": 0.004885215777903795, + "learning_rate": 0.00010210027009205719, + "loss": 0.0022, + "num_input_tokens_seen": 215243728, + "step": 99755 + }, + { + "epoch": 16.274061990212072, + "grad_norm": 0.00022133869060780853, + "learning_rate": 0.00010205717045276153, + "loss": 0.0006, + "num_input_tokens_seen": 215254704, + "step": 99760 + }, + { + "epoch": 16.274877650897228, + "grad_norm": 0.002738728653639555, + "learning_rate": 0.00010201407887828234, + "loss": 0.0043, + "num_input_tokens_seen": 215264464, + "step": 99765 + }, + { + "epoch": 16.275693311582383, + "grad_norm": 0.0007970415754243731, + "learning_rate": 0.0001019709953694925, + "loss": 0.0026, + "num_input_tokens_seen": 215275792, + "step": 99770 + }, + { + "epoch": 16.276508972267536, + "grad_norm": 0.0003458178834989667, + "learning_rate": 0.00010192791992726558, + "loss": 0.0082, + "num_input_tokens_seen": 215285872, + "step": 99775 + }, + { + "epoch": 16.27732463295269, + "grad_norm": 0.0007084120297804475, + "learning_rate": 0.00010188485255247415, + "loss": 0.0056, + "num_input_tokens_seen": 215296656, + "step": 99780 + }, + { + "epoch": 16.278140293637847, + "grad_norm": 0.00032040546648204327, + "learning_rate": 0.00010184179324599147, + "loss": 0.002, + "num_input_tokens_seen": 215307344, + "step": 99785 + }, + { + "epoch": 16.278955954323003, + "grad_norm": 0.0046033356338739395, + "learning_rate": 0.00010179874200868966, + "loss": 0.0006, + "num_input_tokens_seen": 215317648, + "step": 99790 + }, + { + "epoch": 16.27977161500816, + "grad_norm": 0.0008522849529981613, + "learning_rate": 0.00010175569884144182, + "loss": 0.0008, + "num_input_tokens_seen": 215329552, + "step": 99795 + }, + { + "epoch": 16.28058727569331, + "grad_norm": 0.00050800119061023, + "learning_rate": 0.00010171266374511962, + "loss": 0.0042, + "num_input_tokens_seen": 215338608, + "step": 99800 + }, + { + "epoch": 16.281402936378466, + "grad_norm": 0.002512179547920823, + "learning_rate": 0.00010166963672059588, + "loss": 0.0003, + "num_input_tokens_seen": 215349104, + "step": 99805 + }, + { + "epoch": 16.282218597063622, + "grad_norm": 0.0039420598186552525, + "learning_rate": 0.00010162661776874193, + "loss": 0.0013, + "num_input_tokens_seen": 215360528, + "step": 99810 + }, + { + "epoch": 16.283034257748778, + "grad_norm": 0.0005421530222520232, + "learning_rate": 0.00010158360689043028, + "loss": 0.0004, + "num_input_tokens_seen": 215370768, + "step": 99815 + }, + { + "epoch": 16.28384991843393, + "grad_norm": 0.0020029775332659483, + "learning_rate": 0.00010154060408653198, + "loss": 0.0008, + "num_input_tokens_seen": 215381872, + "step": 99820 + }, + { + "epoch": 16.284665579119086, + "grad_norm": 0.01990986056625843, + "learning_rate": 0.00010149760935791907, + "loss": 0.0008, + "num_input_tokens_seen": 215392208, + "step": 99825 + }, + { + "epoch": 16.28548123980424, + "grad_norm": 0.01912722736597061, + "learning_rate": 0.00010145462270546241, + "loss": 0.0006, + "num_input_tokens_seen": 215403920, + "step": 99830 + }, + { + "epoch": 16.286296900489397, + "grad_norm": 0.0002840108354575932, + "learning_rate": 0.00010141164413003351, + "loss": 0.0002, + "num_input_tokens_seen": 215414992, + "step": 99835 + }, + { + "epoch": 16.287112561174553, + "grad_norm": 0.0006499798037111759, + "learning_rate": 0.00010136867363250329, + "loss": 0.0016, + "num_input_tokens_seen": 215426160, + "step": 99840 + }, + { + "epoch": 16.287928221859705, + "grad_norm": 0.07542385905981064, + "learning_rate": 0.00010132571121374257, + "loss": 0.0018, + "num_input_tokens_seen": 215437232, + "step": 99845 + }, + { + "epoch": 16.28874388254486, + "grad_norm": 0.03882214426994324, + "learning_rate": 0.00010128275687462212, + "loss": 0.0034, + "num_input_tokens_seen": 215451792, + "step": 99850 + }, + { + "epoch": 16.289559543230016, + "grad_norm": 0.023033874109387398, + "learning_rate": 0.0001012398106160124, + "loss": 0.028, + "num_input_tokens_seen": 215463504, + "step": 99855 + }, + { + "epoch": 16.290375203915172, + "grad_norm": 0.10273370146751404, + "learning_rate": 0.00010119687243878379, + "loss": 0.0132, + "num_input_tokens_seen": 215474896, + "step": 99860 + }, + { + "epoch": 16.291190864600328, + "grad_norm": 0.007935848087072372, + "learning_rate": 0.00010115394234380642, + "loss": 0.0034, + "num_input_tokens_seen": 215486512, + "step": 99865 + }, + { + "epoch": 16.29200652528548, + "grad_norm": 0.024905255064368248, + "learning_rate": 0.00010111102033195041, + "loss": 0.0009, + "num_input_tokens_seen": 215497360, + "step": 99870 + }, + { + "epoch": 16.292822185970635, + "grad_norm": 0.009111877530813217, + "learning_rate": 0.00010106810640408564, + "loss": 0.001, + "num_input_tokens_seen": 215507280, + "step": 99875 + }, + { + "epoch": 16.29363784665579, + "grad_norm": 0.00901962723582983, + "learning_rate": 0.00010102520056108172, + "loss": 0.0156, + "num_input_tokens_seen": 215517200, + "step": 99880 + }, + { + "epoch": 16.294453507340947, + "grad_norm": 0.011788202449679375, + "learning_rate": 0.00010098230280380826, + "loss": 0.0042, + "num_input_tokens_seen": 215527760, + "step": 99885 + }, + { + "epoch": 16.295269168026103, + "grad_norm": 0.0046682520769536495, + "learning_rate": 0.00010093941313313465, + "loss": 0.0004, + "num_input_tokens_seen": 215538640, + "step": 99890 + }, + { + "epoch": 16.296084828711255, + "grad_norm": 0.00810224749147892, + "learning_rate": 0.00010089653154992994, + "loss": 0.0048, + "num_input_tokens_seen": 215549840, + "step": 99895 + }, + { + "epoch": 16.29690048939641, + "grad_norm": 0.00046275145723484457, + "learning_rate": 0.00010085365805506358, + "loss": 0.0018, + "num_input_tokens_seen": 215560752, + "step": 99900 + }, + { + "epoch": 16.297716150081566, + "grad_norm": 0.000980571610853076, + "learning_rate": 0.00010081079264940391, + "loss": 0.0008, + "num_input_tokens_seen": 215571952, + "step": 99905 + }, + { + "epoch": 16.298531810766722, + "grad_norm": 0.0016196774085983634, + "learning_rate": 0.00010076793533382022, + "loss": 0.0006, + "num_input_tokens_seen": 215582096, + "step": 99910 + }, + { + "epoch": 16.299347471451878, + "grad_norm": 7.714620113372803, + "learning_rate": 0.00010072508610918046, + "loss": 0.0773, + "num_input_tokens_seen": 215591120, + "step": 99915 + }, + { + "epoch": 16.30016313213703, + "grad_norm": 0.004446367733180523, + "learning_rate": 0.00010068224497635369, + "loss": 0.0034, + "num_input_tokens_seen": 215601520, + "step": 99920 + }, + { + "epoch": 16.300978792822185, + "grad_norm": 0.0002518888213671744, + "learning_rate": 0.00010063941193620751, + "loss": 0.0004, + "num_input_tokens_seen": 215611664, + "step": 99925 + }, + { + "epoch": 16.30179445350734, + "grad_norm": 0.0023177401162683964, + "learning_rate": 0.0001005965869896105, + "loss": 0.0009, + "num_input_tokens_seen": 215621936, + "step": 99930 + }, + { + "epoch": 16.302610114192497, + "grad_norm": 0.002090372843667865, + "learning_rate": 0.00010055377013743012, + "loss": 0.0019, + "num_input_tokens_seen": 215630928, + "step": 99935 + }, + { + "epoch": 16.303425774877653, + "grad_norm": 0.8441887497901917, + "learning_rate": 0.0001005109613805344, + "loss": 0.0602, + "num_input_tokens_seen": 215643024, + "step": 99940 + }, + { + "epoch": 16.304241435562805, + "grad_norm": 0.002426605671644211, + "learning_rate": 0.00010046816071979087, + "loss": 0.0006, + "num_input_tokens_seen": 215654096, + "step": 99945 + }, + { + "epoch": 16.30505709624796, + "grad_norm": 0.06940564513206482, + "learning_rate": 0.0001004253681560669, + "loss": 0.0029, + "num_input_tokens_seen": 215666352, + "step": 99950 + }, + { + "epoch": 16.305872756933116, + "grad_norm": 0.006272803992033005, + "learning_rate": 0.00010038258369022974, + "loss": 0.0013, + "num_input_tokens_seen": 215678288, + "step": 99955 + }, + { + "epoch": 16.306688417618272, + "grad_norm": 0.0015648877015337348, + "learning_rate": 0.00010033980732314646, + "loss": 0.0033, + "num_input_tokens_seen": 215689040, + "step": 99960 + }, + { + "epoch": 16.307504078303428, + "grad_norm": 0.00014687392103951424, + "learning_rate": 0.00010029703905568399, + "loss": 0.0006, + "num_input_tokens_seen": 215699728, + "step": 99965 + }, + { + "epoch": 16.30831973898858, + "grad_norm": 0.0011339586926624179, + "learning_rate": 0.00010025427888870909, + "loss": 0.0002, + "num_input_tokens_seen": 215710992, + "step": 99970 + }, + { + "epoch": 16.309135399673735, + "grad_norm": 0.0006208168342709541, + "learning_rate": 0.00010021152682308837, + "loss": 0.001, + "num_input_tokens_seen": 215721840, + "step": 99975 + }, + { + "epoch": 16.30995106035889, + "grad_norm": 0.9454302191734314, + "learning_rate": 0.00010016878285968816, + "loss": 0.0236, + "num_input_tokens_seen": 215731600, + "step": 99980 + }, + { + "epoch": 16.310766721044047, + "grad_norm": 0.03533612936735153, + "learning_rate": 0.00010012604699937483, + "loss": 0.0014, + "num_input_tokens_seen": 215741264, + "step": 99985 + }, + { + "epoch": 16.3115823817292, + "grad_norm": 0.0012835239758715034, + "learning_rate": 0.00010008331924301445, + "loss": 0.0004, + "num_input_tokens_seen": 215752624, + "step": 99990 + }, + { + "epoch": 16.312398042414355, + "grad_norm": 0.09615210443735123, + "learning_rate": 0.00010004059959147293, + "loss": 0.0345, + "num_input_tokens_seen": 215761680, + "step": 99995 + }, + { + "epoch": 16.31321370309951, + "grad_norm": 0.004813529551029205, + "learning_rate": 9.999788804561605e-05, + "loss": 0.0035, + "num_input_tokens_seen": 215771408, + "step": 100000 + }, + { + "epoch": 16.314029363784666, + "grad_norm": 0.0002029415190918371, + "learning_rate": 9.995518460630937e-05, + "loss": 0.0003, + "num_input_tokens_seen": 215781840, + "step": 100005 + }, + { + "epoch": 16.31484502446982, + "grad_norm": 0.006620301865041256, + "learning_rate": 9.991248927441837e-05, + "loss": 0.0003, + "num_input_tokens_seen": 215792048, + "step": 100010 + }, + { + "epoch": 16.315660685154974, + "grad_norm": 0.006255440413951874, + "learning_rate": 9.986980205080837e-05, + "loss": 0.0007, + "num_input_tokens_seen": 215804304, + "step": 100015 + }, + { + "epoch": 16.31647634584013, + "grad_norm": 0.01033405214548111, + "learning_rate": 9.982712293634438e-05, + "loss": 0.0024, + "num_input_tokens_seen": 215815632, + "step": 100020 + }, + { + "epoch": 16.317292006525285, + "grad_norm": 0.00022013649868313223, + "learning_rate": 9.97844519318914e-05, + "loss": 0.0004, + "num_input_tokens_seen": 215825296, + "step": 100025 + }, + { + "epoch": 16.31810766721044, + "grad_norm": 0.0024929235223680735, + "learning_rate": 9.974178903831427e-05, + "loss": 0.0017, + "num_input_tokens_seen": 215836272, + "step": 100030 + }, + { + "epoch": 16.318923327895597, + "grad_norm": 0.0019258386455476284, + "learning_rate": 9.969913425647747e-05, + "loss": 0.0006, + "num_input_tokens_seen": 215845904, + "step": 100035 + }, + { + "epoch": 16.31973898858075, + "grad_norm": 0.0014760097255930305, + "learning_rate": 9.965648758724544e-05, + "loss": 0.0036, + "num_input_tokens_seen": 215856688, + "step": 100040 + }, + { + "epoch": 16.320554649265905, + "grad_norm": 0.003909919410943985, + "learning_rate": 9.961384903148269e-05, + "loss": 0.0003, + "num_input_tokens_seen": 215867120, + "step": 100045 + }, + { + "epoch": 16.32137030995106, + "grad_norm": 0.00010758540156530216, + "learning_rate": 9.957121859005324e-05, + "loss": 0.0002, + "num_input_tokens_seen": 215878032, + "step": 100050 + }, + { + "epoch": 16.322185970636216, + "grad_norm": 0.0006419508717954159, + "learning_rate": 9.952859626382099e-05, + "loss": 0.0023, + "num_input_tokens_seen": 215889296, + "step": 100055 + }, + { + "epoch": 16.32300163132137, + "grad_norm": 0.00019168495782651007, + "learning_rate": 9.948598205364979e-05, + "loss": 0.0658, + "num_input_tokens_seen": 215900144, + "step": 100060 + }, + { + "epoch": 16.323817292006524, + "grad_norm": 0.008406216278672218, + "learning_rate": 9.944337596040326e-05, + "loss": 0.0046, + "num_input_tokens_seen": 215912592, + "step": 100065 + }, + { + "epoch": 16.32463295269168, + "grad_norm": 0.00014777647447772324, + "learning_rate": 9.940077798494485e-05, + "loss": 0.0032, + "num_input_tokens_seen": 215923472, + "step": 100070 + }, + { + "epoch": 16.325448613376835, + "grad_norm": 0.2405572533607483, + "learning_rate": 9.935818812813784e-05, + "loss": 0.0073, + "num_input_tokens_seen": 215934544, + "step": 100075 + }, + { + "epoch": 16.32626427406199, + "grad_norm": 0.000317376950988546, + "learning_rate": 9.931560639084541e-05, + "loss": 0.0003, + "num_input_tokens_seen": 215945936, + "step": 100080 + }, + { + "epoch": 16.327079934747147, + "grad_norm": 0.00017242407193407416, + "learning_rate": 9.927303277393051e-05, + "loss": 0.0041, + "num_input_tokens_seen": 215956016, + "step": 100085 + }, + { + "epoch": 16.3278955954323, + "grad_norm": 0.00044570298632606864, + "learning_rate": 9.923046727825602e-05, + "loss": 0.0025, + "num_input_tokens_seen": 215966192, + "step": 100090 + }, + { + "epoch": 16.328711256117455, + "grad_norm": 0.0005495261284522712, + "learning_rate": 9.918790990468446e-05, + "loss": 0.0021, + "num_input_tokens_seen": 215978096, + "step": 100095 + }, + { + "epoch": 16.32952691680261, + "grad_norm": 8.291070116683841e-05, + "learning_rate": 9.914536065407842e-05, + "loss": 0.0083, + "num_input_tokens_seen": 215989392, + "step": 100100 + }, + { + "epoch": 16.330342577487766, + "grad_norm": 0.00036083502345718443, + "learning_rate": 9.910281952730011e-05, + "loss": 0.0007, + "num_input_tokens_seen": 215999664, + "step": 100105 + }, + { + "epoch": 16.33115823817292, + "grad_norm": 0.0008234057459048927, + "learning_rate": 9.906028652521176e-05, + "loss": 0.0246, + "num_input_tokens_seen": 216010736, + "step": 100110 + }, + { + "epoch": 16.331973898858074, + "grad_norm": 0.008624700829386711, + "learning_rate": 9.901776164867538e-05, + "loss": 0.0067, + "num_input_tokens_seen": 216021872, + "step": 100115 + }, + { + "epoch": 16.33278955954323, + "grad_norm": 0.00028523922082968056, + "learning_rate": 9.89752448985527e-05, + "loss": 0.0006, + "num_input_tokens_seen": 216033584, + "step": 100120 + }, + { + "epoch": 16.333605220228385, + "grad_norm": 0.00072526338044554, + "learning_rate": 9.893273627570542e-05, + "loss": 0.002, + "num_input_tokens_seen": 216044752, + "step": 100125 + }, + { + "epoch": 16.33442088091354, + "grad_norm": 0.0009103193297050893, + "learning_rate": 9.889023578099504e-05, + "loss": 0.0023, + "num_input_tokens_seen": 216055952, + "step": 100130 + }, + { + "epoch": 16.335236541598697, + "grad_norm": 0.00038741817115806043, + "learning_rate": 9.884774341528285e-05, + "loss": 0.0007, + "num_input_tokens_seen": 216066480, + "step": 100135 + }, + { + "epoch": 16.33605220228385, + "grad_norm": 0.001201550243422389, + "learning_rate": 9.880525917943006e-05, + "loss": 0.0006, + "num_input_tokens_seen": 216077904, + "step": 100140 + }, + { + "epoch": 16.336867862969005, + "grad_norm": 0.00021966852364130318, + "learning_rate": 9.876278307429764e-05, + "loss": 0.002, + "num_input_tokens_seen": 216089136, + "step": 100145 + }, + { + "epoch": 16.33768352365416, + "grad_norm": 0.008983033709228039, + "learning_rate": 9.872031510074625e-05, + "loss": 0.0004, + "num_input_tokens_seen": 216101328, + "step": 100150 + }, + { + "epoch": 16.338499184339316, + "grad_norm": 0.0004490799328777939, + "learning_rate": 9.867785525963707e-05, + "loss": 0.0014, + "num_input_tokens_seen": 216111792, + "step": 100155 + }, + { + "epoch": 16.339314845024468, + "grad_norm": 0.000757934816647321, + "learning_rate": 9.863540355182998e-05, + "loss": 0.0006, + "num_input_tokens_seen": 216121552, + "step": 100160 + }, + { + "epoch": 16.340130505709624, + "grad_norm": 0.0003240050282329321, + "learning_rate": 9.859295997818585e-05, + "loss": 0.0001, + "num_input_tokens_seen": 216131856, + "step": 100165 + }, + { + "epoch": 16.34094616639478, + "grad_norm": 0.17560900747776031, + "learning_rate": 9.855052453956437e-05, + "loss": 0.0022, + "num_input_tokens_seen": 216141488, + "step": 100170 + }, + { + "epoch": 16.341761827079935, + "grad_norm": 9.212361328536645e-05, + "learning_rate": 9.850809723682603e-05, + "loss": 0.0002, + "num_input_tokens_seen": 216153552, + "step": 100175 + }, + { + "epoch": 16.34257748776509, + "grad_norm": 0.00033529725624248385, + "learning_rate": 9.846567807083018e-05, + "loss": 0.0007, + "num_input_tokens_seen": 216164848, + "step": 100180 + }, + { + "epoch": 16.343393148450243, + "grad_norm": 0.0034189538564532995, + "learning_rate": 9.842326704243682e-05, + "loss": 0.0018, + "num_input_tokens_seen": 216176400, + "step": 100185 + }, + { + "epoch": 16.3442088091354, + "grad_norm": 0.00526026776060462, + "learning_rate": 9.838086415250547e-05, + "loss": 0.0004, + "num_input_tokens_seen": 216186896, + "step": 100190 + }, + { + "epoch": 16.345024469820554, + "grad_norm": 0.000464249518699944, + "learning_rate": 9.833846940189533e-05, + "loss": 0.0003, + "num_input_tokens_seen": 216198160, + "step": 100195 + }, + { + "epoch": 16.34584013050571, + "grad_norm": 0.028154579922556877, + "learning_rate": 9.829608279146568e-05, + "loss": 0.0009, + "num_input_tokens_seen": 216208784, + "step": 100200 + }, + { + "epoch": 16.346655791190866, + "grad_norm": 0.0036850611213594675, + "learning_rate": 9.825370432207554e-05, + "loss": 0.0039, + "num_input_tokens_seen": 216220592, + "step": 100205 + }, + { + "epoch": 16.347471451876018, + "grad_norm": 0.00134883017744869, + "learning_rate": 9.821133399458371e-05, + "loss": 0.0013, + "num_input_tokens_seen": 216231696, + "step": 100210 + }, + { + "epoch": 16.348287112561174, + "grad_norm": 0.011086208745837212, + "learning_rate": 9.81689718098489e-05, + "loss": 0.0077, + "num_input_tokens_seen": 216242928, + "step": 100215 + }, + { + "epoch": 16.34910277324633, + "grad_norm": 0.01893300563097, + "learning_rate": 9.81266177687296e-05, + "loss": 0.0005, + "num_input_tokens_seen": 216253360, + "step": 100220 + }, + { + "epoch": 16.349918433931485, + "grad_norm": 0.0012044749455526471, + "learning_rate": 9.808427187208424e-05, + "loss": 0.003, + "num_input_tokens_seen": 216263536, + "step": 100225 + }, + { + "epoch": 16.35073409461664, + "grad_norm": 0.0020704453345388174, + "learning_rate": 9.8041934120771e-05, + "loss": 0.0002, + "num_input_tokens_seen": 216274832, + "step": 100230 + }, + { + "epoch": 16.351549755301793, + "grad_norm": 0.06813068687915802, + "learning_rate": 9.799960451564787e-05, + "loss": 0.0045, + "num_input_tokens_seen": 216285936, + "step": 100235 + }, + { + "epoch": 16.35236541598695, + "grad_norm": 0.03785141184926033, + "learning_rate": 9.795728305757267e-05, + "loss": 0.0011, + "num_input_tokens_seen": 216296336, + "step": 100240 + }, + { + "epoch": 16.353181076672104, + "grad_norm": 0.00036368367727845907, + "learning_rate": 9.791496974740321e-05, + "loss": 0.0016, + "num_input_tokens_seen": 216307472, + "step": 100245 + }, + { + "epoch": 16.35399673735726, + "grad_norm": 0.0002789338177535683, + "learning_rate": 9.787266458599697e-05, + "loss": 0.0004, + "num_input_tokens_seen": 216317456, + "step": 100250 + }, + { + "epoch": 16.354812398042416, + "grad_norm": 0.000222333925194107, + "learning_rate": 9.783036757421132e-05, + "loss": 0.0004, + "num_input_tokens_seen": 216326992, + "step": 100255 + }, + { + "epoch": 16.355628058727568, + "grad_norm": 0.0035536775831133127, + "learning_rate": 9.778807871290346e-05, + "loss": 0.0496, + "num_input_tokens_seen": 216338064, + "step": 100260 + }, + { + "epoch": 16.356443719412724, + "grad_norm": 0.00011459176312200725, + "learning_rate": 9.774579800293026e-05, + "loss": 0.001, + "num_input_tokens_seen": 216349232, + "step": 100265 + }, + { + "epoch": 16.35725938009788, + "grad_norm": 0.028273755684494972, + "learning_rate": 9.770352544514904e-05, + "loss": 0.0012, + "num_input_tokens_seen": 216360400, + "step": 100270 + }, + { + "epoch": 16.358075040783035, + "grad_norm": 0.012659593485295773, + "learning_rate": 9.766126104041601e-05, + "loss": 0.0026, + "num_input_tokens_seen": 216372144, + "step": 100275 + }, + { + "epoch": 16.35889070146819, + "grad_norm": 0.0014826093101873994, + "learning_rate": 9.761900478958813e-05, + "loss": 0.0012, + "num_input_tokens_seen": 216382480, + "step": 100280 + }, + { + "epoch": 16.359706362153343, + "grad_norm": 0.0021580469328910112, + "learning_rate": 9.757675669352133e-05, + "loss": 0.002, + "num_input_tokens_seen": 216392176, + "step": 100285 + }, + { + "epoch": 16.3605220228385, + "grad_norm": 0.004756412468850613, + "learning_rate": 9.753451675307234e-05, + "loss": 0.0006, + "num_input_tokens_seen": 216403184, + "step": 100290 + }, + { + "epoch": 16.361337683523654, + "grad_norm": 0.0006489691440947354, + "learning_rate": 9.749228496909668e-05, + "loss": 0.0764, + "num_input_tokens_seen": 216414288, + "step": 100295 + }, + { + "epoch": 16.36215334420881, + "grad_norm": 0.0027996920980513096, + "learning_rate": 9.745006134245072e-05, + "loss": 0.0006, + "num_input_tokens_seen": 216425520, + "step": 100300 + }, + { + "epoch": 16.362969004893966, + "grad_norm": 0.0014016131171956658, + "learning_rate": 9.740784587398965e-05, + "loss": 0.0004, + "num_input_tokens_seen": 216436272, + "step": 100305 + }, + { + "epoch": 16.363784665579118, + "grad_norm": 0.0005110031343065202, + "learning_rate": 9.736563856456959e-05, + "loss": 0.0002, + "num_input_tokens_seen": 216447632, + "step": 100310 + }, + { + "epoch": 16.364600326264274, + "grad_norm": 0.007204854860901833, + "learning_rate": 9.73234394150454e-05, + "loss": 0.0008, + "num_input_tokens_seen": 216458320, + "step": 100315 + }, + { + "epoch": 16.36541598694943, + "grad_norm": 0.00022284280566964298, + "learning_rate": 9.728124842627278e-05, + "loss": 0.0002, + "num_input_tokens_seen": 216468720, + "step": 100320 + }, + { + "epoch": 16.366231647634585, + "grad_norm": 0.0002744551748037338, + "learning_rate": 9.723906559910634e-05, + "loss": 0.0009, + "num_input_tokens_seen": 216478864, + "step": 100325 + }, + { + "epoch": 16.36704730831974, + "grad_norm": 0.0016357296844944358, + "learning_rate": 9.719689093440126e-05, + "loss": 0.0041, + "num_input_tokens_seen": 216490192, + "step": 100330 + }, + { + "epoch": 16.367862969004893, + "grad_norm": 0.07749425619840622, + "learning_rate": 9.715472443301215e-05, + "loss": 0.0029, + "num_input_tokens_seen": 216500368, + "step": 100335 + }, + { + "epoch": 16.36867862969005, + "grad_norm": 0.04089093208312988, + "learning_rate": 9.711256609579367e-05, + "loss": 0.0041, + "num_input_tokens_seen": 216511152, + "step": 100340 + }, + { + "epoch": 16.369494290375204, + "grad_norm": 0.00022247733431868255, + "learning_rate": 9.707041592360005e-05, + "loss": 0.0013, + "num_input_tokens_seen": 216521520, + "step": 100345 + }, + { + "epoch": 16.37030995106036, + "grad_norm": 0.004842165857553482, + "learning_rate": 9.702827391728564e-05, + "loss": 0.001, + "num_input_tokens_seen": 216532464, + "step": 100350 + }, + { + "epoch": 16.371125611745512, + "grad_norm": 7.804749475326389e-05, + "learning_rate": 9.69861400777045e-05, + "loss": 0.0021, + "num_input_tokens_seen": 216544208, + "step": 100355 + }, + { + "epoch": 16.371941272430668, + "grad_norm": 0.000270723772700876, + "learning_rate": 9.694401440571043e-05, + "loss": 0.0002, + "num_input_tokens_seen": 216554992, + "step": 100360 + }, + { + "epoch": 16.372756933115824, + "grad_norm": 0.00040581001667305827, + "learning_rate": 9.690189690215728e-05, + "loss": 0.0007, + "num_input_tokens_seen": 216566096, + "step": 100365 + }, + { + "epoch": 16.37357259380098, + "grad_norm": 0.00015577208250761032, + "learning_rate": 9.685978756789854e-05, + "loss": 0.0009, + "num_input_tokens_seen": 216578096, + "step": 100370 + }, + { + "epoch": 16.374388254486135, + "grad_norm": 0.0538487546145916, + "learning_rate": 9.681768640378757e-05, + "loss": 0.0016, + "num_input_tokens_seen": 216589040, + "step": 100375 + }, + { + "epoch": 16.375203915171287, + "grad_norm": 0.0019335116958245635, + "learning_rate": 9.677559341067759e-05, + "loss": 0.0004, + "num_input_tokens_seen": 216600656, + "step": 100380 + }, + { + "epoch": 16.376019575856443, + "grad_norm": 0.005911931395530701, + "learning_rate": 9.673350858942198e-05, + "loss": 0.0015, + "num_input_tokens_seen": 216610384, + "step": 100385 + }, + { + "epoch": 16.3768352365416, + "grad_norm": 0.10654398798942566, + "learning_rate": 9.669143194087315e-05, + "loss": 0.0471, + "num_input_tokens_seen": 216620720, + "step": 100390 + }, + { + "epoch": 16.377650897226754, + "grad_norm": 0.006041374523192644, + "learning_rate": 9.664936346588432e-05, + "loss": 0.0005, + "num_input_tokens_seen": 216632112, + "step": 100395 + }, + { + "epoch": 16.37846655791191, + "grad_norm": 0.01834656111896038, + "learning_rate": 9.660730316530757e-05, + "loss": 0.0027, + "num_input_tokens_seen": 216642864, + "step": 100400 + }, + { + "epoch": 16.379282218597062, + "grad_norm": 0.00553221395239234, + "learning_rate": 9.65652510399958e-05, + "loss": 0.0003, + "num_input_tokens_seen": 216654320, + "step": 100405 + }, + { + "epoch": 16.380097879282218, + "grad_norm": 0.03777151182293892, + "learning_rate": 9.652320709080082e-05, + "loss": 0.0017, + "num_input_tokens_seen": 216663728, + "step": 100410 + }, + { + "epoch": 16.380913539967374, + "grad_norm": 0.01004553958773613, + "learning_rate": 9.648117131857509e-05, + "loss": 0.0008, + "num_input_tokens_seen": 216675280, + "step": 100415 + }, + { + "epoch": 16.38172920065253, + "grad_norm": 0.0028462393674999475, + "learning_rate": 9.643914372417011e-05, + "loss": 0.0027, + "num_input_tokens_seen": 216686352, + "step": 100420 + }, + { + "epoch": 16.382544861337685, + "grad_norm": 8.919215179048479e-05, + "learning_rate": 9.639712430843806e-05, + "loss": 0.0008, + "num_input_tokens_seen": 216697712, + "step": 100425 + }, + { + "epoch": 16.383360522022837, + "grad_norm": 0.0055139148607850075, + "learning_rate": 9.635511307223005e-05, + "loss": 0.0006, + "num_input_tokens_seen": 216708784, + "step": 100430 + }, + { + "epoch": 16.384176182707993, + "grad_norm": 0.0002949858026113361, + "learning_rate": 9.631311001639798e-05, + "loss": 0.0018, + "num_input_tokens_seen": 216718864, + "step": 100435 + }, + { + "epoch": 16.38499184339315, + "grad_norm": 0.0052596149034798145, + "learning_rate": 9.62711151417926e-05, + "loss": 0.0036, + "num_input_tokens_seen": 216729040, + "step": 100440 + }, + { + "epoch": 16.385807504078304, + "grad_norm": 0.000227761673158966, + "learning_rate": 9.622912844926551e-05, + "loss": 0.0004, + "num_input_tokens_seen": 216738512, + "step": 100445 + }, + { + "epoch": 16.38662316476346, + "grad_norm": 0.0005523049039766192, + "learning_rate": 9.618714993966704e-05, + "loss": 0.0003, + "num_input_tokens_seen": 216749744, + "step": 100450 + }, + { + "epoch": 16.387438825448612, + "grad_norm": 0.0036612716503441334, + "learning_rate": 9.614517961384856e-05, + "loss": 0.0016, + "num_input_tokens_seen": 216761200, + "step": 100455 + }, + { + "epoch": 16.388254486133768, + "grad_norm": 0.0035140595864504576, + "learning_rate": 9.610321747266005e-05, + "loss": 0.0006, + "num_input_tokens_seen": 216772016, + "step": 100460 + }, + { + "epoch": 16.389070146818923, + "grad_norm": 0.00015596683078911155, + "learning_rate": 9.60612635169525e-05, + "loss": 0.0012, + "num_input_tokens_seen": 216782928, + "step": 100465 + }, + { + "epoch": 16.38988580750408, + "grad_norm": 0.0025905671063810587, + "learning_rate": 9.601931774757561e-05, + "loss": 0.0021, + "num_input_tokens_seen": 216794896, + "step": 100470 + }, + { + "epoch": 16.390701468189235, + "grad_norm": 0.001714176032692194, + "learning_rate": 9.597738016537988e-05, + "loss": 0.0114, + "num_input_tokens_seen": 216804592, + "step": 100475 + }, + { + "epoch": 16.391517128874387, + "grad_norm": 0.00020868261344730854, + "learning_rate": 9.593545077121507e-05, + "loss": 0.0071, + "num_input_tokens_seen": 216816176, + "step": 100480 + }, + { + "epoch": 16.392332789559543, + "grad_norm": 0.04608377441763878, + "learning_rate": 9.589352956593095e-05, + "loss": 0.0182, + "num_input_tokens_seen": 216827152, + "step": 100485 + }, + { + "epoch": 16.3931484502447, + "grad_norm": 0.0033866753801703453, + "learning_rate": 9.585161655037705e-05, + "loss": 0.0004, + "num_input_tokens_seen": 216836624, + "step": 100490 + }, + { + "epoch": 16.393964110929854, + "grad_norm": 0.29261985421180725, + "learning_rate": 9.580971172540287e-05, + "loss": 0.015, + "num_input_tokens_seen": 216846928, + "step": 100495 + }, + { + "epoch": 16.39477977161501, + "grad_norm": 0.008401153609156609, + "learning_rate": 9.576781509185766e-05, + "loss": 0.0004, + "num_input_tokens_seen": 216856944, + "step": 100500 + }, + { + "epoch": 16.395595432300162, + "grad_norm": 0.0003847590123768896, + "learning_rate": 9.572592665059043e-05, + "loss": 0.0023, + "num_input_tokens_seen": 216868528, + "step": 100505 + }, + { + "epoch": 16.396411092985318, + "grad_norm": 0.0006730294553562999, + "learning_rate": 9.568404640245022e-05, + "loss": 0.0017, + "num_input_tokens_seen": 216878896, + "step": 100510 + }, + { + "epoch": 16.397226753670473, + "grad_norm": 0.00018012619693763554, + "learning_rate": 9.564217434828565e-05, + "loss": 0.0002, + "num_input_tokens_seen": 216890448, + "step": 100515 + }, + { + "epoch": 16.39804241435563, + "grad_norm": 0.01507472526282072, + "learning_rate": 9.56003104889454e-05, + "loss": 0.0981, + "num_input_tokens_seen": 216900432, + "step": 100520 + }, + { + "epoch": 16.39885807504078, + "grad_norm": 0.003971244674175978, + "learning_rate": 9.55584548252778e-05, + "loss": 0.0246, + "num_input_tokens_seen": 216910704, + "step": 100525 + }, + { + "epoch": 16.399673735725937, + "grad_norm": 0.0002131751534761861, + "learning_rate": 9.55166073581314e-05, + "loss": 0.0001, + "num_input_tokens_seen": 216921904, + "step": 100530 + }, + { + "epoch": 16.400489396411093, + "grad_norm": 0.001574057969264686, + "learning_rate": 9.547476808835381e-05, + "loss": 0.0006, + "num_input_tokens_seen": 216932176, + "step": 100535 + }, + { + "epoch": 16.40130505709625, + "grad_norm": 0.011223546229302883, + "learning_rate": 9.54329370167935e-05, + "loss": 0.0019, + "num_input_tokens_seen": 216942992, + "step": 100540 + }, + { + "epoch": 16.402120717781404, + "grad_norm": 0.0016546800034120679, + "learning_rate": 9.539111414429769e-05, + "loss": 0.0024, + "num_input_tokens_seen": 216951856, + "step": 100545 + }, + { + "epoch": 16.402936378466556, + "grad_norm": 0.045456912368535995, + "learning_rate": 9.53492994717145e-05, + "loss": 0.0014, + "num_input_tokens_seen": 216963248, + "step": 100550 + }, + { + "epoch": 16.403752039151712, + "grad_norm": 0.00028564370586536825, + "learning_rate": 9.530749299989078e-05, + "loss": 0.0009, + "num_input_tokens_seen": 216973360, + "step": 100555 + }, + { + "epoch": 16.404567699836868, + "grad_norm": 0.0008504764409735799, + "learning_rate": 9.526569472967444e-05, + "loss": 0.0007, + "num_input_tokens_seen": 216984880, + "step": 100560 + }, + { + "epoch": 16.405383360522023, + "grad_norm": 0.00012973764387425035, + "learning_rate": 9.522390466191194e-05, + "loss": 0.0014, + "num_input_tokens_seen": 216996304, + "step": 100565 + }, + { + "epoch": 16.40619902120718, + "grad_norm": 0.0910826325416565, + "learning_rate": 9.518212279745075e-05, + "loss": 0.0028, + "num_input_tokens_seen": 217007888, + "step": 100570 + }, + { + "epoch": 16.40701468189233, + "grad_norm": 0.0003883955650962889, + "learning_rate": 9.514034913713714e-05, + "loss": 0.001, + "num_input_tokens_seen": 217018576, + "step": 100575 + }, + { + "epoch": 16.407830342577487, + "grad_norm": 0.0005625525373034179, + "learning_rate": 9.509858368181812e-05, + "loss": 0.0028, + "num_input_tokens_seen": 217030128, + "step": 100580 + }, + { + "epoch": 16.408646003262643, + "grad_norm": 0.00012947266804985702, + "learning_rate": 9.505682643233993e-05, + "loss": 0.0004, + "num_input_tokens_seen": 217041072, + "step": 100585 + }, + { + "epoch": 16.4094616639478, + "grad_norm": 0.02128785289824009, + "learning_rate": 9.501507738954884e-05, + "loss": 0.0023, + "num_input_tokens_seen": 217052912, + "step": 100590 + }, + { + "epoch": 16.410277324632954, + "grad_norm": 0.0003045329067390412, + "learning_rate": 9.497333655429097e-05, + "loss": 0.0002, + "num_input_tokens_seen": 217064912, + "step": 100595 + }, + { + "epoch": 16.411092985318106, + "grad_norm": 0.00021320332598406821, + "learning_rate": 9.493160392741229e-05, + "loss": 0.0004, + "num_input_tokens_seen": 217075536, + "step": 100600 + }, + { + "epoch": 16.411908646003262, + "grad_norm": 0.005234429147094488, + "learning_rate": 9.488987950975847e-05, + "loss": 0.0004, + "num_input_tokens_seen": 217086960, + "step": 100605 + }, + { + "epoch": 16.412724306688418, + "grad_norm": 0.001382901449687779, + "learning_rate": 9.484816330217522e-05, + "loss": 0.0001, + "num_input_tokens_seen": 217097680, + "step": 100610 + }, + { + "epoch": 16.413539967373573, + "grad_norm": 0.01874091662466526, + "learning_rate": 9.480645530550785e-05, + "loss": 0.0009, + "num_input_tokens_seen": 217108976, + "step": 100615 + }, + { + "epoch": 16.41435562805873, + "grad_norm": 0.001830661203712225, + "learning_rate": 9.47647555206017e-05, + "loss": 0.0026, + "num_input_tokens_seen": 217119216, + "step": 100620 + }, + { + "epoch": 16.41517128874388, + "grad_norm": 0.0009005884639918804, + "learning_rate": 9.472306394830188e-05, + "loss": 0.0003, + "num_input_tokens_seen": 217129552, + "step": 100625 + }, + { + "epoch": 16.415986949429037, + "grad_norm": 0.00043929278035648167, + "learning_rate": 9.46813805894533e-05, + "loss": 0.0022, + "num_input_tokens_seen": 217140144, + "step": 100630 + }, + { + "epoch": 16.416802610114193, + "grad_norm": 0.00016332296945620328, + "learning_rate": 9.46397054449007e-05, + "loss": 0.0008, + "num_input_tokens_seen": 217150576, + "step": 100635 + }, + { + "epoch": 16.41761827079935, + "grad_norm": 0.00011286345397820696, + "learning_rate": 9.459803851548876e-05, + "loss": 0.0003, + "num_input_tokens_seen": 217161328, + "step": 100640 + }, + { + "epoch": 16.418433931484504, + "grad_norm": 0.007786328438669443, + "learning_rate": 9.455637980206177e-05, + "loss": 0.0004, + "num_input_tokens_seen": 217171952, + "step": 100645 + }, + { + "epoch": 16.419249592169656, + "grad_norm": 0.0066762808710336685, + "learning_rate": 9.451472930546417e-05, + "loss": 0.0008, + "num_input_tokens_seen": 217183216, + "step": 100650 + }, + { + "epoch": 16.420065252854812, + "grad_norm": 0.00017575548554304987, + "learning_rate": 9.447308702653995e-05, + "loss": 0.0006, + "num_input_tokens_seen": 217193808, + "step": 100655 + }, + { + "epoch": 16.420880913539968, + "grad_norm": 0.004209660924971104, + "learning_rate": 9.443145296613303e-05, + "loss": 0.0006, + "num_input_tokens_seen": 217203760, + "step": 100660 + }, + { + "epoch": 16.421696574225123, + "grad_norm": 0.0009201216162182391, + "learning_rate": 9.438982712508726e-05, + "loss": 0.0016, + "num_input_tokens_seen": 217215120, + "step": 100665 + }, + { + "epoch": 16.42251223491028, + "grad_norm": 0.0003526066429913044, + "learning_rate": 9.434820950424605e-05, + "loss": 0.0008, + "num_input_tokens_seen": 217225776, + "step": 100670 + }, + { + "epoch": 16.42332789559543, + "grad_norm": 0.002267870120704174, + "learning_rate": 9.430660010445325e-05, + "loss": 0.0029, + "num_input_tokens_seen": 217236848, + "step": 100675 + }, + { + "epoch": 16.424143556280587, + "grad_norm": 0.00021975350682623684, + "learning_rate": 9.426499892655155e-05, + "loss": 0.0138, + "num_input_tokens_seen": 217247408, + "step": 100680 + }, + { + "epoch": 16.424959216965743, + "grad_norm": 0.00042188382940366864, + "learning_rate": 9.422340597138457e-05, + "loss": 0.0012, + "num_input_tokens_seen": 217258320, + "step": 100685 + }, + { + "epoch": 16.4257748776509, + "grad_norm": 0.002426701132208109, + "learning_rate": 9.418182123979496e-05, + "loss": 0.0015, + "num_input_tokens_seen": 217270288, + "step": 100690 + }, + { + "epoch": 16.42659053833605, + "grad_norm": 0.0001718581625027582, + "learning_rate": 9.414024473262561e-05, + "loss": 0.0003, + "num_input_tokens_seen": 217281008, + "step": 100695 + }, + { + "epoch": 16.427406199021206, + "grad_norm": 0.0008882993715815246, + "learning_rate": 9.409867645071901e-05, + "loss": 0.0039, + "num_input_tokens_seen": 217293168, + "step": 100700 + }, + { + "epoch": 16.428221859706362, + "grad_norm": 0.0027643893845379353, + "learning_rate": 9.405711639491771e-05, + "loss": 0.003, + "num_input_tokens_seen": 217303056, + "step": 100705 + }, + { + "epoch": 16.429037520391518, + "grad_norm": 0.0007874544826336205, + "learning_rate": 9.401556456606392e-05, + "loss": 0.0164, + "num_input_tokens_seen": 217313136, + "step": 100710 + }, + { + "epoch": 16.429853181076673, + "grad_norm": 0.013255887664854527, + "learning_rate": 9.397402096499973e-05, + "loss": 0.0009, + "num_input_tokens_seen": 217324176, + "step": 100715 + }, + { + "epoch": 16.430668841761825, + "grad_norm": 0.028142236173152924, + "learning_rate": 9.393248559256706e-05, + "loss": 0.0009, + "num_input_tokens_seen": 217333744, + "step": 100720 + }, + { + "epoch": 16.43148450244698, + "grad_norm": 0.043977174907922745, + "learning_rate": 9.389095844960771e-05, + "loss": 0.0291, + "num_input_tokens_seen": 217344592, + "step": 100725 + }, + { + "epoch": 16.432300163132137, + "grad_norm": 0.0073575228452682495, + "learning_rate": 9.384943953696329e-05, + "loss": 0.002, + "num_input_tokens_seen": 217355536, + "step": 100730 + }, + { + "epoch": 16.433115823817293, + "grad_norm": 0.0004401314363349229, + "learning_rate": 9.380792885547523e-05, + "loss": 0.0005, + "num_input_tokens_seen": 217366416, + "step": 100735 + }, + { + "epoch": 16.43393148450245, + "grad_norm": 0.0012882990995422006, + "learning_rate": 9.376642640598476e-05, + "loss": 0.0014, + "num_input_tokens_seen": 217376400, + "step": 100740 + }, + { + "epoch": 16.4347471451876, + "grad_norm": 0.00209665484726429, + "learning_rate": 9.372493218933303e-05, + "loss": 0.0009, + "num_input_tokens_seen": 217386960, + "step": 100745 + }, + { + "epoch": 16.435562805872756, + "grad_norm": 0.006942251697182655, + "learning_rate": 9.368344620636094e-05, + "loss": 0.0003, + "num_input_tokens_seen": 217397872, + "step": 100750 + }, + { + "epoch": 16.436378466557912, + "grad_norm": 0.0003105809155385941, + "learning_rate": 9.364196845790924e-05, + "loss": 0.0028, + "num_input_tokens_seen": 217409264, + "step": 100755 + }, + { + "epoch": 16.437194127243067, + "grad_norm": 0.012339676730334759, + "learning_rate": 9.360049894481854e-05, + "loss": 0.0011, + "num_input_tokens_seen": 217420560, + "step": 100760 + }, + { + "epoch": 16.438009787928223, + "grad_norm": 0.009776982478797436, + "learning_rate": 9.355903766792929e-05, + "loss": 0.0026, + "num_input_tokens_seen": 217432208, + "step": 100765 + }, + { + "epoch": 16.438825448613375, + "grad_norm": 0.0004512048908509314, + "learning_rate": 9.351758462808174e-05, + "loss": 0.0004, + "num_input_tokens_seen": 217442800, + "step": 100770 + }, + { + "epoch": 16.43964110929853, + "grad_norm": 0.0016869446262717247, + "learning_rate": 9.347613982611603e-05, + "loss": 0.0051, + "num_input_tokens_seen": 217453008, + "step": 100775 + }, + { + "epoch": 16.440456769983687, + "grad_norm": 0.0008906829170882702, + "learning_rate": 9.343470326287206e-05, + "loss": 0.0034, + "num_input_tokens_seen": 217463440, + "step": 100780 + }, + { + "epoch": 16.441272430668842, + "grad_norm": 0.034834109246730804, + "learning_rate": 9.339327493918958e-05, + "loss": 0.0012, + "num_input_tokens_seen": 217473616, + "step": 100785 + }, + { + "epoch": 16.442088091353998, + "grad_norm": 0.00021612562704831362, + "learning_rate": 9.335185485590807e-05, + "loss": 0.0034, + "num_input_tokens_seen": 217484752, + "step": 100790 + }, + { + "epoch": 16.44290375203915, + "grad_norm": 0.00039852780173532665, + "learning_rate": 9.331044301386732e-05, + "loss": 0.0035, + "num_input_tokens_seen": 217495824, + "step": 100795 + }, + { + "epoch": 16.443719412724306, + "grad_norm": 0.002798402449116111, + "learning_rate": 9.326903941390613e-05, + "loss": 0.0012, + "num_input_tokens_seen": 217506256, + "step": 100800 + }, + { + "epoch": 16.44453507340946, + "grad_norm": 0.0001771931565599516, + "learning_rate": 9.322764405686412e-05, + "loss": 0.0001, + "num_input_tokens_seen": 217516144, + "step": 100805 + }, + { + "epoch": 16.445350734094617, + "grad_norm": 0.00020765457884408534, + "learning_rate": 9.318625694357962e-05, + "loss": 0.0003, + "num_input_tokens_seen": 217527120, + "step": 100810 + }, + { + "epoch": 16.446166394779773, + "grad_norm": 0.0034010144881904125, + "learning_rate": 9.314487807489186e-05, + "loss": 0.0003, + "num_input_tokens_seen": 217538192, + "step": 100815 + }, + { + "epoch": 16.446982055464925, + "grad_norm": 0.00025063756038434803, + "learning_rate": 9.310350745163931e-05, + "loss": 0.0008, + "num_input_tokens_seen": 217548400, + "step": 100820 + }, + { + "epoch": 16.44779771615008, + "grad_norm": 0.001447243383154273, + "learning_rate": 9.306214507466032e-05, + "loss": 0.0003, + "num_input_tokens_seen": 217559216, + "step": 100825 + }, + { + "epoch": 16.448613376835237, + "grad_norm": 0.030008086934685707, + "learning_rate": 9.302079094479321e-05, + "loss": 0.0006, + "num_input_tokens_seen": 217570768, + "step": 100830 + }, + { + "epoch": 16.449429037520392, + "grad_norm": 0.0006183035438880324, + "learning_rate": 9.297944506287609e-05, + "loss": 0.0196, + "num_input_tokens_seen": 217581200, + "step": 100835 + }, + { + "epoch": 16.450244698205548, + "grad_norm": 0.012072227895259857, + "learning_rate": 9.293810742974679e-05, + "loss": 0.0005, + "num_input_tokens_seen": 217591728, + "step": 100840 + }, + { + "epoch": 16.4510603588907, + "grad_norm": 0.5550569295883179, + "learning_rate": 9.28967780462432e-05, + "loss": 0.0221, + "num_input_tokens_seen": 217602704, + "step": 100845 + }, + { + "epoch": 16.451876019575856, + "grad_norm": 0.0002853346522897482, + "learning_rate": 9.28554569132028e-05, + "loss": 0.0008, + "num_input_tokens_seen": 217614352, + "step": 100850 + }, + { + "epoch": 16.45269168026101, + "grad_norm": 0.0027459103148430586, + "learning_rate": 9.28141440314631e-05, + "loss": 0.0003, + "num_input_tokens_seen": 217625520, + "step": 100855 + }, + { + "epoch": 16.453507340946167, + "grad_norm": 0.000823643000330776, + "learning_rate": 9.277283940186132e-05, + "loss": 0.0001, + "num_input_tokens_seen": 217635600, + "step": 100860 + }, + { + "epoch": 16.454323001631323, + "grad_norm": 0.012498010881245136, + "learning_rate": 9.273154302523456e-05, + "loss": 0.0003, + "num_input_tokens_seen": 217646928, + "step": 100865 + }, + { + "epoch": 16.455138662316475, + "grad_norm": 0.0019866458605974913, + "learning_rate": 9.269025490241972e-05, + "loss": 0.0038, + "num_input_tokens_seen": 217657680, + "step": 100870 + }, + { + "epoch": 16.45595432300163, + "grad_norm": 0.0003570933477021754, + "learning_rate": 9.264897503425357e-05, + "loss": 0.0017, + "num_input_tokens_seen": 217669360, + "step": 100875 + }, + { + "epoch": 16.456769983686787, + "grad_norm": 0.0003862560843117535, + "learning_rate": 9.260770342157272e-05, + "loss": 0.0005, + "num_input_tokens_seen": 217680112, + "step": 100880 + }, + { + "epoch": 16.457585644371942, + "grad_norm": 0.005069401580840349, + "learning_rate": 9.256644006521358e-05, + "loss": 0.0065, + "num_input_tokens_seen": 217689968, + "step": 100885 + }, + { + "epoch": 16.458401305057095, + "grad_norm": 0.020551789551973343, + "learning_rate": 9.252518496601237e-05, + "loss": 0.0042, + "num_input_tokens_seen": 217700208, + "step": 100890 + }, + { + "epoch": 16.45921696574225, + "grad_norm": 0.000351933907950297, + "learning_rate": 9.248393812480522e-05, + "loss": 0.0006, + "num_input_tokens_seen": 217710192, + "step": 100895 + }, + { + "epoch": 16.460032626427406, + "grad_norm": 0.0015886453911662102, + "learning_rate": 9.244269954242806e-05, + "loss": 0.0042, + "num_input_tokens_seen": 217720592, + "step": 100900 + }, + { + "epoch": 16.46084828711256, + "grad_norm": 0.17945073544979095, + "learning_rate": 9.240146921971642e-05, + "loss": 0.0043, + "num_input_tokens_seen": 217730864, + "step": 100905 + }, + { + "epoch": 16.461663947797717, + "grad_norm": 0.005168005358427763, + "learning_rate": 9.23602471575064e-05, + "loss": 0.0034, + "num_input_tokens_seen": 217743184, + "step": 100910 + }, + { + "epoch": 16.46247960848287, + "grad_norm": 0.0037465947680175304, + "learning_rate": 9.231903335663283e-05, + "loss": 0.0004, + "num_input_tokens_seen": 217754160, + "step": 100915 + }, + { + "epoch": 16.463295269168025, + "grad_norm": 0.0002872601035051048, + "learning_rate": 9.227782781793148e-05, + "loss": 0.0481, + "num_input_tokens_seen": 217763920, + "step": 100920 + }, + { + "epoch": 16.46411092985318, + "grad_norm": 0.0008457910153083503, + "learning_rate": 9.223663054223692e-05, + "loss": 0.0028, + "num_input_tokens_seen": 217773968, + "step": 100925 + }, + { + "epoch": 16.464926590538337, + "grad_norm": 0.5735427737236023, + "learning_rate": 9.219544153038462e-05, + "loss": 0.0919, + "num_input_tokens_seen": 217785072, + "step": 100930 + }, + { + "epoch": 16.465742251223492, + "grad_norm": 0.00021804044081363827, + "learning_rate": 9.21542607832087e-05, + "loss": 0.0003, + "num_input_tokens_seen": 217795472, + "step": 100935 + }, + { + "epoch": 16.466557911908644, + "grad_norm": 0.00012355491344351321, + "learning_rate": 9.211308830154441e-05, + "loss": 0.0008, + "num_input_tokens_seen": 217806960, + "step": 100940 + }, + { + "epoch": 16.4673735725938, + "grad_norm": 0.00017301134357694536, + "learning_rate": 9.20719240862255e-05, + "loss": 0.0008, + "num_input_tokens_seen": 217816944, + "step": 100945 + }, + { + "epoch": 16.468189233278956, + "grad_norm": 0.04972704127430916, + "learning_rate": 9.203076813808687e-05, + "loss": 0.0015, + "num_input_tokens_seen": 217828720, + "step": 100950 + }, + { + "epoch": 16.46900489396411, + "grad_norm": 0.000585968024097383, + "learning_rate": 9.198962045796195e-05, + "loss": 0.0007, + "num_input_tokens_seen": 217838128, + "step": 100955 + }, + { + "epoch": 16.469820554649267, + "grad_norm": 0.00011857187928399071, + "learning_rate": 9.194848104668513e-05, + "loss": 0.0018, + "num_input_tokens_seen": 217849776, + "step": 100960 + }, + { + "epoch": 16.47063621533442, + "grad_norm": 0.048869289457798004, + "learning_rate": 9.190734990508998e-05, + "loss": 0.0256, + "num_input_tokens_seen": 217860720, + "step": 100965 + }, + { + "epoch": 16.471451876019575, + "grad_norm": 0.007496944162994623, + "learning_rate": 9.18662270340101e-05, + "loss": 0.0008, + "num_input_tokens_seen": 217871536, + "step": 100970 + }, + { + "epoch": 16.47226753670473, + "grad_norm": 0.009519577026367188, + "learning_rate": 9.182511243427888e-05, + "loss": 0.0004, + "num_input_tokens_seen": 217883344, + "step": 100975 + }, + { + "epoch": 16.473083197389887, + "grad_norm": 0.004723408259451389, + "learning_rate": 9.178400610672954e-05, + "loss": 0.0017, + "num_input_tokens_seen": 217894832, + "step": 100980 + }, + { + "epoch": 16.473898858075042, + "grad_norm": 0.0042499671690166, + "learning_rate": 9.174290805219521e-05, + "loss": 0.0021, + "num_input_tokens_seen": 217904592, + "step": 100985 + }, + { + "epoch": 16.474714518760194, + "grad_norm": 0.0022579652722924948, + "learning_rate": 9.170181827150875e-05, + "loss": 0.0065, + "num_input_tokens_seen": 217915376, + "step": 100990 + }, + { + "epoch": 16.47553017944535, + "grad_norm": 1.05342435836792, + "learning_rate": 9.166073676550291e-05, + "loss": 0.0649, + "num_input_tokens_seen": 217927760, + "step": 100995 + }, + { + "epoch": 16.476345840130506, + "grad_norm": 0.0001327318896073848, + "learning_rate": 9.161966353501023e-05, + "loss": 0.0029, + "num_input_tokens_seen": 217939120, + "step": 101000 + }, + { + "epoch": 16.47716150081566, + "grad_norm": 0.008629623800516129, + "learning_rate": 9.157859858086315e-05, + "loss": 0.1008, + "num_input_tokens_seen": 217949712, + "step": 101005 + }, + { + "epoch": 16.477977161500817, + "grad_norm": 0.001194642623886466, + "learning_rate": 9.153754190389379e-05, + "loss": 0.0017, + "num_input_tokens_seen": 217959408, + "step": 101010 + }, + { + "epoch": 16.47879282218597, + "grad_norm": 0.0064383684657514095, + "learning_rate": 9.149649350493456e-05, + "loss": 0.0031, + "num_input_tokens_seen": 217971312, + "step": 101015 + }, + { + "epoch": 16.479608482871125, + "grad_norm": 0.005116917658597231, + "learning_rate": 9.145545338481682e-05, + "loss": 0.0006, + "num_input_tokens_seen": 217982800, + "step": 101020 + }, + { + "epoch": 16.48042414355628, + "grad_norm": 0.0068290880881249905, + "learning_rate": 9.141442154437286e-05, + "loss": 0.0007, + "num_input_tokens_seen": 217994704, + "step": 101025 + }, + { + "epoch": 16.481239804241437, + "grad_norm": 0.002408101689070463, + "learning_rate": 9.137339798443372e-05, + "loss": 0.0006, + "num_input_tokens_seen": 218006032, + "step": 101030 + }, + { + "epoch": 16.482055464926592, + "grad_norm": 0.0001908936828840524, + "learning_rate": 9.133238270583133e-05, + "loss": 0.0005, + "num_input_tokens_seen": 218016880, + "step": 101035 + }, + { + "epoch": 16.482871125611744, + "grad_norm": 0.0030339027289301157, + "learning_rate": 9.129137570939632e-05, + "loss": 0.0004, + "num_input_tokens_seen": 218025776, + "step": 101040 + }, + { + "epoch": 16.4836867862969, + "grad_norm": 0.00019987621635664254, + "learning_rate": 9.125037699596039e-05, + "loss": 0.0013, + "num_input_tokens_seen": 218038224, + "step": 101045 + }, + { + "epoch": 16.484502446982056, + "grad_norm": 0.00011005098349414766, + "learning_rate": 9.12093865663538e-05, + "loss": 0.0054, + "num_input_tokens_seen": 218048752, + "step": 101050 + }, + { + "epoch": 16.48531810766721, + "grad_norm": 0.14312519133090973, + "learning_rate": 9.11684044214079e-05, + "loss": 0.0088, + "num_input_tokens_seen": 218060272, + "step": 101055 + }, + { + "epoch": 16.486133768352367, + "grad_norm": 0.018566841259598732, + "learning_rate": 9.112743056195261e-05, + "loss": 0.0006, + "num_input_tokens_seen": 218071984, + "step": 101060 + }, + { + "epoch": 16.48694942903752, + "grad_norm": 0.01109654363244772, + "learning_rate": 9.10864649888189e-05, + "loss": 0.0004, + "num_input_tokens_seen": 218083152, + "step": 101065 + }, + { + "epoch": 16.487765089722675, + "grad_norm": 0.0120266517624259, + "learning_rate": 9.104550770283648e-05, + "loss": 0.0009, + "num_input_tokens_seen": 218094640, + "step": 101070 + }, + { + "epoch": 16.48858075040783, + "grad_norm": 0.0005432644975371659, + "learning_rate": 9.100455870483587e-05, + "loss": 0.0015, + "num_input_tokens_seen": 218104752, + "step": 101075 + }, + { + "epoch": 16.489396411092986, + "grad_norm": 0.004114139825105667, + "learning_rate": 9.096361799564651e-05, + "loss": 0.0029, + "num_input_tokens_seen": 218115120, + "step": 101080 + }, + { + "epoch": 16.49021207177814, + "grad_norm": 0.0035147261805832386, + "learning_rate": 9.092268557609856e-05, + "loss": 0.0012, + "num_input_tokens_seen": 218126448, + "step": 101085 + }, + { + "epoch": 16.491027732463294, + "grad_norm": 0.043826889246702194, + "learning_rate": 9.088176144702104e-05, + "loss": 0.0014, + "num_input_tokens_seen": 218136912, + "step": 101090 + }, + { + "epoch": 16.49184339314845, + "grad_norm": 0.0034974897280335426, + "learning_rate": 9.084084560924394e-05, + "loss": 0.0003, + "num_input_tokens_seen": 218147312, + "step": 101095 + }, + { + "epoch": 16.492659053833606, + "grad_norm": 0.0042905122973024845, + "learning_rate": 9.079993806359587e-05, + "loss": 0.0019, + "num_input_tokens_seen": 218158512, + "step": 101100 + }, + { + "epoch": 16.49347471451876, + "grad_norm": 0.0007126958225853741, + "learning_rate": 9.075903881090636e-05, + "loss": 0.0006, + "num_input_tokens_seen": 218169488, + "step": 101105 + }, + { + "epoch": 16.494290375203914, + "grad_norm": 0.0020732611883431673, + "learning_rate": 9.071814785200399e-05, + "loss": 0.0015, + "num_input_tokens_seen": 218180016, + "step": 101110 + }, + { + "epoch": 16.49510603588907, + "grad_norm": 0.00010162102989852428, + "learning_rate": 9.067726518771762e-05, + "loss": 0.0008, + "num_input_tokens_seen": 218191728, + "step": 101115 + }, + { + "epoch": 16.495921696574225, + "grad_norm": 0.09964632242918015, + "learning_rate": 9.063639081887576e-05, + "loss": 0.0024, + "num_input_tokens_seen": 218203152, + "step": 101120 + }, + { + "epoch": 16.49673735725938, + "grad_norm": 0.002495150314643979, + "learning_rate": 9.059552474630672e-05, + "loss": 0.0039, + "num_input_tokens_seen": 218214320, + "step": 101125 + }, + { + "epoch": 16.497553017944536, + "grad_norm": 0.0021598113235086203, + "learning_rate": 9.055466697083875e-05, + "loss": 0.0008, + "num_input_tokens_seen": 218225776, + "step": 101130 + }, + { + "epoch": 16.49836867862969, + "grad_norm": 0.1636086255311966, + "learning_rate": 9.051381749329984e-05, + "loss": 0.0032, + "num_input_tokens_seen": 218236944, + "step": 101135 + }, + { + "epoch": 16.499184339314844, + "grad_norm": 0.000721384072676301, + "learning_rate": 9.04729763145179e-05, + "loss": 0.0008, + "num_input_tokens_seen": 218248496, + "step": 101140 + }, + { + "epoch": 16.5, + "grad_norm": 0.0008961012936197221, + "learning_rate": 9.043214343532063e-05, + "loss": 0.0002, + "num_input_tokens_seen": 218259248, + "step": 101145 + }, + { + "epoch": 16.500815660685156, + "grad_norm": 0.0006173294386826456, + "learning_rate": 9.039131885653556e-05, + "loss": 0.0073, + "num_input_tokens_seen": 218269392, + "step": 101150 + }, + { + "epoch": 16.50163132137031, + "grad_norm": 0.2789905369281769, + "learning_rate": 9.035050257898991e-05, + "loss": 0.0077, + "num_input_tokens_seen": 218279504, + "step": 101155 + }, + { + "epoch": 16.502446982055464, + "grad_norm": 0.022524939849972725, + "learning_rate": 9.030969460351124e-05, + "loss": 0.0011, + "num_input_tokens_seen": 218290000, + "step": 101160 + }, + { + "epoch": 16.50326264274062, + "grad_norm": 0.029716866090893745, + "learning_rate": 9.026889493092605e-05, + "loss": 0.004, + "num_input_tokens_seen": 218299888, + "step": 101165 + }, + { + "epoch": 16.504078303425775, + "grad_norm": 0.0002559605345595628, + "learning_rate": 9.022810356206179e-05, + "loss": 0.0001, + "num_input_tokens_seen": 218310320, + "step": 101170 + }, + { + "epoch": 16.50489396411093, + "grad_norm": 0.0006618006154894829, + "learning_rate": 9.018732049774459e-05, + "loss": 0.0042, + "num_input_tokens_seen": 218321744, + "step": 101175 + }, + { + "epoch": 16.505709624796086, + "grad_norm": 0.0009725497220642865, + "learning_rate": 9.014654573880143e-05, + "loss": 0.004, + "num_input_tokens_seen": 218334384, + "step": 101180 + }, + { + "epoch": 16.50652528548124, + "grad_norm": 0.004790200386196375, + "learning_rate": 9.010577928605823e-05, + "loss": 0.001, + "num_input_tokens_seen": 218345776, + "step": 101185 + }, + { + "epoch": 16.507340946166394, + "grad_norm": 0.00029979494865983725, + "learning_rate": 9.00650211403417e-05, + "loss": 0.0012, + "num_input_tokens_seen": 218357200, + "step": 101190 + }, + { + "epoch": 16.50815660685155, + "grad_norm": 0.0001249322376679629, + "learning_rate": 9.002427130247726e-05, + "loss": 0.0044, + "num_input_tokens_seen": 218368208, + "step": 101195 + }, + { + "epoch": 16.508972267536706, + "grad_norm": 0.026081014424562454, + "learning_rate": 8.998352977329127e-05, + "loss": 0.0018, + "num_input_tokens_seen": 218378992, + "step": 101200 + }, + { + "epoch": 16.50978792822186, + "grad_norm": 0.0003004127065651119, + "learning_rate": 8.994279655360899e-05, + "loss": 0.0005, + "num_input_tokens_seen": 218389808, + "step": 101205 + }, + { + "epoch": 16.510603588907014, + "grad_norm": 0.0010646298760548234, + "learning_rate": 8.99020716442564e-05, + "loss": 0.0037, + "num_input_tokens_seen": 218399696, + "step": 101210 + }, + { + "epoch": 16.51141924959217, + "grad_norm": 0.04701593145728111, + "learning_rate": 8.986135504605831e-05, + "loss": 0.0008, + "num_input_tokens_seen": 218411984, + "step": 101215 + }, + { + "epoch": 16.512234910277325, + "grad_norm": 0.0044936249032616615, + "learning_rate": 8.982064675984025e-05, + "loss": 0.0013, + "num_input_tokens_seen": 218421552, + "step": 101220 + }, + { + "epoch": 16.51305057096248, + "grad_norm": 0.0033444890286773443, + "learning_rate": 8.977994678642714e-05, + "loss": 0.0105, + "num_input_tokens_seen": 218433264, + "step": 101225 + }, + { + "epoch": 16.513866231647633, + "grad_norm": 0.0002350674767512828, + "learning_rate": 8.973925512664383e-05, + "loss": 0.0008, + "num_input_tokens_seen": 218443472, + "step": 101230 + }, + { + "epoch": 16.51468189233279, + "grad_norm": 0.0001113976541091688, + "learning_rate": 8.969857178131497e-05, + "loss": 0.0015, + "num_input_tokens_seen": 218454000, + "step": 101235 + }, + { + "epoch": 16.515497553017944, + "grad_norm": 0.5564191341400146, + "learning_rate": 8.965789675126501e-05, + "loss": 0.0082, + "num_input_tokens_seen": 218464144, + "step": 101240 + }, + { + "epoch": 16.5163132137031, + "grad_norm": 0.00956058781594038, + "learning_rate": 8.961723003731837e-05, + "loss": 0.0009, + "num_input_tokens_seen": 218474864, + "step": 101245 + }, + { + "epoch": 16.517128874388256, + "grad_norm": 0.0009902393212541938, + "learning_rate": 8.95765716402992e-05, + "loss": 0.0066, + "num_input_tokens_seen": 218484912, + "step": 101250 + }, + { + "epoch": 16.517944535073408, + "grad_norm": 0.00037753465585410595, + "learning_rate": 8.953592156103141e-05, + "loss": 0.0003, + "num_input_tokens_seen": 218496432, + "step": 101255 + }, + { + "epoch": 16.518760195758563, + "grad_norm": 0.0005449475720524788, + "learning_rate": 8.949527980033889e-05, + "loss": 0.0028, + "num_input_tokens_seen": 218507408, + "step": 101260 + }, + { + "epoch": 16.51957585644372, + "grad_norm": 0.021315928548574448, + "learning_rate": 8.945464635904532e-05, + "loss": 0.0009, + "num_input_tokens_seen": 218517712, + "step": 101265 + }, + { + "epoch": 16.520391517128875, + "grad_norm": 0.00045802482054568827, + "learning_rate": 8.94140212379741e-05, + "loss": 0.003, + "num_input_tokens_seen": 218528880, + "step": 101270 + }, + { + "epoch": 16.52120717781403, + "grad_norm": 0.24623122811317444, + "learning_rate": 8.937340443794867e-05, + "loss": 0.0093, + "num_input_tokens_seen": 218540368, + "step": 101275 + }, + { + "epoch": 16.522022838499183, + "grad_norm": 0.00017623046005610377, + "learning_rate": 8.933279595979205e-05, + "loss": 0.0274, + "num_input_tokens_seen": 218550448, + "step": 101280 + }, + { + "epoch": 16.52283849918434, + "grad_norm": 0.0004754549008794129, + "learning_rate": 8.929219580432735e-05, + "loss": 0.0004, + "num_input_tokens_seen": 218561200, + "step": 101285 + }, + { + "epoch": 16.523654159869494, + "grad_norm": 0.036871589720249176, + "learning_rate": 8.925160397237725e-05, + "loss": 0.0012, + "num_input_tokens_seen": 218572656, + "step": 101290 + }, + { + "epoch": 16.52446982055465, + "grad_norm": 0.0009399743285030127, + "learning_rate": 8.921102046476454e-05, + "loss": 0.0014, + "num_input_tokens_seen": 218583408, + "step": 101295 + }, + { + "epoch": 16.525285481239806, + "grad_norm": 0.00896536186337471, + "learning_rate": 8.917044528231145e-05, + "loss": 0.0011, + "num_input_tokens_seen": 218593168, + "step": 101300 + }, + { + "epoch": 16.526101141924958, + "grad_norm": 0.0011907280422747135, + "learning_rate": 8.912987842584075e-05, + "loss": 0.0002, + "num_input_tokens_seen": 218604976, + "step": 101305 + }, + { + "epoch": 16.526916802610113, + "grad_norm": 0.0007228627800941467, + "learning_rate": 8.908931989617403e-05, + "loss": 0.0018, + "num_input_tokens_seen": 218615856, + "step": 101310 + }, + { + "epoch": 16.52773246329527, + "grad_norm": 0.045270953327417374, + "learning_rate": 8.904876969413372e-05, + "loss": 0.0023, + "num_input_tokens_seen": 218626352, + "step": 101315 + }, + { + "epoch": 16.528548123980425, + "grad_norm": 0.000399926386307925, + "learning_rate": 8.900822782054124e-05, + "loss": 0.0011, + "num_input_tokens_seen": 218637456, + "step": 101320 + }, + { + "epoch": 16.52936378466558, + "grad_norm": 0.00344996084459126, + "learning_rate": 8.896769427621848e-05, + "loss": 0.0005, + "num_input_tokens_seen": 218648080, + "step": 101325 + }, + { + "epoch": 16.530179445350733, + "grad_norm": 0.0019137338967993855, + "learning_rate": 8.892716906198683e-05, + "loss": 0.0006, + "num_input_tokens_seen": 218658544, + "step": 101330 + }, + { + "epoch": 16.53099510603589, + "grad_norm": 0.00039312304579652846, + "learning_rate": 8.88866521786676e-05, + "loss": 0.0008, + "num_input_tokens_seen": 218668656, + "step": 101335 + }, + { + "epoch": 16.531810766721044, + "grad_norm": 0.0006052463431842625, + "learning_rate": 8.884614362708188e-05, + "loss": 0.0002, + "num_input_tokens_seen": 218679408, + "step": 101340 + }, + { + "epoch": 16.5326264274062, + "grad_norm": 0.002214816864579916, + "learning_rate": 8.88056434080507e-05, + "loss": 0.0009, + "num_input_tokens_seen": 218689232, + "step": 101345 + }, + { + "epoch": 16.533442088091356, + "grad_norm": 0.001470891642384231, + "learning_rate": 8.876515152239472e-05, + "loss": 0.0005, + "num_input_tokens_seen": 218700880, + "step": 101350 + }, + { + "epoch": 16.534257748776508, + "grad_norm": 0.0007592425099574029, + "learning_rate": 8.872466797093464e-05, + "loss": 0.001, + "num_input_tokens_seen": 218712400, + "step": 101355 + }, + { + "epoch": 16.535073409461663, + "grad_norm": 0.0011189829092472792, + "learning_rate": 8.868419275449096e-05, + "loss": 0.0005, + "num_input_tokens_seen": 218723344, + "step": 101360 + }, + { + "epoch": 16.53588907014682, + "grad_norm": 0.0010157719952985644, + "learning_rate": 8.864372587388387e-05, + "loss": 0.0006, + "num_input_tokens_seen": 218735344, + "step": 101365 + }, + { + "epoch": 16.536704730831975, + "grad_norm": 0.013827089220285416, + "learning_rate": 8.860326732993352e-05, + "loss": 0.0009, + "num_input_tokens_seen": 218746032, + "step": 101370 + }, + { + "epoch": 16.53752039151713, + "grad_norm": 0.000368335226085037, + "learning_rate": 8.856281712345988e-05, + "loss": 0.0002, + "num_input_tokens_seen": 218757328, + "step": 101375 + }, + { + "epoch": 16.538336052202283, + "grad_norm": 0.0175037682056427, + "learning_rate": 8.852237525528262e-05, + "loss": 0.0006, + "num_input_tokens_seen": 218768688, + "step": 101380 + }, + { + "epoch": 16.53915171288744, + "grad_norm": 0.00023096069344319403, + "learning_rate": 8.848194172622148e-05, + "loss": 0.0091, + "num_input_tokens_seen": 218779792, + "step": 101385 + }, + { + "epoch": 16.539967373572594, + "grad_norm": 0.001963430317118764, + "learning_rate": 8.844151653709581e-05, + "loss": 0.0011, + "num_input_tokens_seen": 218790096, + "step": 101390 + }, + { + "epoch": 16.54078303425775, + "grad_norm": 0.0010405801003798842, + "learning_rate": 8.840109968872495e-05, + "loss": 0.0007, + "num_input_tokens_seen": 218801680, + "step": 101395 + }, + { + "epoch": 16.541598694942905, + "grad_norm": 0.002091748407110572, + "learning_rate": 8.836069118192791e-05, + "loss": 0.0002, + "num_input_tokens_seen": 218812976, + "step": 101400 + }, + { + "epoch": 16.542414355628058, + "grad_norm": 0.0023322361521422863, + "learning_rate": 8.83202910175237e-05, + "loss": 0.0003, + "num_input_tokens_seen": 218823952, + "step": 101405 + }, + { + "epoch": 16.543230016313213, + "grad_norm": 0.0007525283726863563, + "learning_rate": 8.827989919633106e-05, + "loss": 0.0003, + "num_input_tokens_seen": 218835408, + "step": 101410 + }, + { + "epoch": 16.54404567699837, + "grad_norm": 0.0015514292754232883, + "learning_rate": 8.82395157191685e-05, + "loss": 0.0003, + "num_input_tokens_seen": 218847344, + "step": 101415 + }, + { + "epoch": 16.544861337683525, + "grad_norm": 0.019484123215079308, + "learning_rate": 8.819914058685458e-05, + "loss": 0.0017, + "num_input_tokens_seen": 218857840, + "step": 101420 + }, + { + "epoch": 16.545676998368677, + "grad_norm": 0.004084591753780842, + "learning_rate": 8.815877380020743e-05, + "loss": 0.0008, + "num_input_tokens_seen": 218868336, + "step": 101425 + }, + { + "epoch": 16.546492659053833, + "grad_norm": 0.0014592388179153204, + "learning_rate": 8.811841536004505e-05, + "loss": 0.0009, + "num_input_tokens_seen": 218876976, + "step": 101430 + }, + { + "epoch": 16.54730831973899, + "grad_norm": 0.11949754506349564, + "learning_rate": 8.807806526718565e-05, + "loss": 0.0024, + "num_input_tokens_seen": 218887888, + "step": 101435 + }, + { + "epoch": 16.548123980424144, + "grad_norm": 0.00040723325219005346, + "learning_rate": 8.803772352244683e-05, + "loss": 0.0006, + "num_input_tokens_seen": 218898896, + "step": 101440 + }, + { + "epoch": 16.5489396411093, + "grad_norm": 0.12976230680942535, + "learning_rate": 8.799739012664615e-05, + "loss": 0.0266, + "num_input_tokens_seen": 218910480, + "step": 101445 + }, + { + "epoch": 16.549755301794452, + "grad_norm": 0.0010594066698104143, + "learning_rate": 8.795706508060102e-05, + "loss": 0.0005, + "num_input_tokens_seen": 218921392, + "step": 101450 + }, + { + "epoch": 16.550570962479608, + "grad_norm": 0.0018355901120230556, + "learning_rate": 8.791674838512864e-05, + "loss": 0.0004, + "num_input_tokens_seen": 218932368, + "step": 101455 + }, + { + "epoch": 16.551386623164763, + "grad_norm": 0.001228198641911149, + "learning_rate": 8.787644004104617e-05, + "loss": 0.0003, + "num_input_tokens_seen": 218942384, + "step": 101460 + }, + { + "epoch": 16.55220228384992, + "grad_norm": 0.0007404611678794026, + "learning_rate": 8.78361400491704e-05, + "loss": 0.098, + "num_input_tokens_seen": 218952752, + "step": 101465 + }, + { + "epoch": 16.553017944535075, + "grad_norm": 0.01958891749382019, + "learning_rate": 8.779584841031818e-05, + "loss": 0.0108, + "num_input_tokens_seen": 218963696, + "step": 101470 + }, + { + "epoch": 16.553833605220227, + "grad_norm": 0.0004320998559705913, + "learning_rate": 8.775556512530597e-05, + "loss": 0.0001, + "num_input_tokens_seen": 218974480, + "step": 101475 + }, + { + "epoch": 16.554649265905383, + "grad_norm": 0.0002444481651764363, + "learning_rate": 8.771529019495022e-05, + "loss": 0.0007, + "num_input_tokens_seen": 218984624, + "step": 101480 + }, + { + "epoch": 16.55546492659054, + "grad_norm": 0.003655742621049285, + "learning_rate": 8.767502362006713e-05, + "loss": 0.0005, + "num_input_tokens_seen": 218993424, + "step": 101485 + }, + { + "epoch": 16.556280587275694, + "grad_norm": 0.1492701768875122, + "learning_rate": 8.763476540147275e-05, + "loss": 0.0102, + "num_input_tokens_seen": 219004368, + "step": 101490 + }, + { + "epoch": 16.55709624796085, + "grad_norm": 0.0018508571665734053, + "learning_rate": 8.759451553998299e-05, + "loss": 0.0006, + "num_input_tokens_seen": 219015280, + "step": 101495 + }, + { + "epoch": 16.557911908646002, + "grad_norm": 0.006536591798067093, + "learning_rate": 8.755427403641352e-05, + "loss": 0.0037, + "num_input_tokens_seen": 219024752, + "step": 101500 + }, + { + "epoch": 16.558727569331158, + "grad_norm": 0.010955830104649067, + "learning_rate": 8.751404089157993e-05, + "loss": 0.0043, + "num_input_tokens_seen": 219034544, + "step": 101505 + }, + { + "epoch": 16.559543230016313, + "grad_norm": 0.00028226300491951406, + "learning_rate": 8.747381610629762e-05, + "loss": 0.0024, + "num_input_tokens_seen": 219045840, + "step": 101510 + }, + { + "epoch": 16.56035889070147, + "grad_norm": 0.00013772597594652325, + "learning_rate": 8.74335996813817e-05, + "loss": 0.0004, + "num_input_tokens_seen": 219055760, + "step": 101515 + }, + { + "epoch": 16.561174551386625, + "grad_norm": 0.018299255520105362, + "learning_rate": 8.739339161764725e-05, + "loss": 0.0029, + "num_input_tokens_seen": 219065840, + "step": 101520 + }, + { + "epoch": 16.561990212071777, + "grad_norm": 0.03594053164124489, + "learning_rate": 8.735319191590918e-05, + "loss": 0.0007, + "num_input_tokens_seen": 219076176, + "step": 101525 + }, + { + "epoch": 16.562805872756933, + "grad_norm": 0.012118668295443058, + "learning_rate": 8.731300057698216e-05, + "loss": 0.0024, + "num_input_tokens_seen": 219086800, + "step": 101530 + }, + { + "epoch": 16.563621533442088, + "grad_norm": 0.00019012393022421747, + "learning_rate": 8.727281760168055e-05, + "loss": 0.0002, + "num_input_tokens_seen": 219096912, + "step": 101535 + }, + { + "epoch": 16.564437194127244, + "grad_norm": 0.00031298547401092947, + "learning_rate": 8.723264299081912e-05, + "loss": 0.0016, + "num_input_tokens_seen": 219109040, + "step": 101540 + }, + { + "epoch": 16.5652528548124, + "grad_norm": 0.001115536317229271, + "learning_rate": 8.719247674521157e-05, + "loss": 0.0022, + "num_input_tokens_seen": 219120432, + "step": 101545 + }, + { + "epoch": 16.56606851549755, + "grad_norm": 0.0077836341224610806, + "learning_rate": 8.715231886567248e-05, + "loss": 0.0003, + "num_input_tokens_seen": 219130928, + "step": 101550 + }, + { + "epoch": 16.566884176182707, + "grad_norm": 0.0001263033482246101, + "learning_rate": 8.711216935301508e-05, + "loss": 0.0008, + "num_input_tokens_seen": 219139600, + "step": 101555 + }, + { + "epoch": 16.567699836867863, + "grad_norm": 0.007635746616870165, + "learning_rate": 8.70720282080536e-05, + "loss": 0.0054, + "num_input_tokens_seen": 219151280, + "step": 101560 + }, + { + "epoch": 16.56851549755302, + "grad_norm": 0.006693224888294935, + "learning_rate": 8.703189543160106e-05, + "loss": 0.0004, + "num_input_tokens_seen": 219162096, + "step": 101565 + }, + { + "epoch": 16.569331158238175, + "grad_norm": 0.0003036673879250884, + "learning_rate": 8.699177102447126e-05, + "loss": 0.0008, + "num_input_tokens_seen": 219172432, + "step": 101570 + }, + { + "epoch": 16.570146818923327, + "grad_norm": 0.02488783188164234, + "learning_rate": 8.695165498747698e-05, + "loss": 0.0026, + "num_input_tokens_seen": 219184400, + "step": 101575 + }, + { + "epoch": 16.570962479608482, + "grad_norm": 0.0005649144877679646, + "learning_rate": 8.691154732143147e-05, + "loss": 0.0005, + "num_input_tokens_seen": 219195408, + "step": 101580 + }, + { + "epoch": 16.571778140293638, + "grad_norm": 0.0007117405184544623, + "learning_rate": 8.687144802714753e-05, + "loss": 0.0009, + "num_input_tokens_seen": 219205712, + "step": 101585 + }, + { + "epoch": 16.572593800978794, + "grad_norm": 0.02072257362306118, + "learning_rate": 8.683135710543777e-05, + "loss": 0.0011, + "num_input_tokens_seen": 219216688, + "step": 101590 + }, + { + "epoch": 16.57340946166395, + "grad_norm": 0.0027385344728827477, + "learning_rate": 8.679127455711466e-05, + "loss": 0.0006, + "num_input_tokens_seen": 219227408, + "step": 101595 + }, + { + "epoch": 16.5742251223491, + "grad_norm": 0.00025816989364102483, + "learning_rate": 8.675120038299062e-05, + "loss": 0.0003, + "num_input_tokens_seen": 219237808, + "step": 101600 + }, + { + "epoch": 16.575040783034257, + "grad_norm": 0.00012907535710837692, + "learning_rate": 8.671113458387775e-05, + "loss": 0.0008, + "num_input_tokens_seen": 219249584, + "step": 101605 + }, + { + "epoch": 16.575856443719413, + "grad_norm": 0.00023899046937003732, + "learning_rate": 8.667107716058798e-05, + "loss": 0.0004, + "num_input_tokens_seen": 219261520, + "step": 101610 + }, + { + "epoch": 16.57667210440457, + "grad_norm": 0.05795467644929886, + "learning_rate": 8.66310281139332e-05, + "loss": 0.0014, + "num_input_tokens_seen": 219272656, + "step": 101615 + }, + { + "epoch": 16.57748776508972, + "grad_norm": 0.011220943182706833, + "learning_rate": 8.659098744472505e-05, + "loss": 0.0947, + "num_input_tokens_seen": 219284016, + "step": 101620 + }, + { + "epoch": 16.578303425774877, + "grad_norm": 0.007325512357056141, + "learning_rate": 8.655095515377498e-05, + "loss": 0.0005, + "num_input_tokens_seen": 219294480, + "step": 101625 + }, + { + "epoch": 16.579119086460032, + "grad_norm": 0.02884707599878311, + "learning_rate": 8.65109312418943e-05, + "loss": 0.0017, + "num_input_tokens_seen": 219305136, + "step": 101630 + }, + { + "epoch": 16.579934747145188, + "grad_norm": 0.005319823510944843, + "learning_rate": 8.647091570989413e-05, + "loss": 0.0004, + "num_input_tokens_seen": 219316464, + "step": 101635 + }, + { + "epoch": 16.580750407830344, + "grad_norm": 0.011124509386718273, + "learning_rate": 8.643090855858549e-05, + "loss": 0.0017, + "num_input_tokens_seen": 219326672, + "step": 101640 + }, + { + "epoch": 16.581566068515496, + "grad_norm": 0.013211015611886978, + "learning_rate": 8.639090978877912e-05, + "loss": 0.0122, + "num_input_tokens_seen": 219336368, + "step": 101645 + }, + { + "epoch": 16.58238172920065, + "grad_norm": 0.010375562123954296, + "learning_rate": 8.635091940128548e-05, + "loss": 0.0003, + "num_input_tokens_seen": 219347600, + "step": 101650 + }, + { + "epoch": 16.583197389885807, + "grad_norm": 0.0018069169018417597, + "learning_rate": 8.631093739691553e-05, + "loss": 0.0163, + "num_input_tokens_seen": 219357488, + "step": 101655 + }, + { + "epoch": 16.584013050570963, + "grad_norm": 0.0005089796613901854, + "learning_rate": 8.627096377647898e-05, + "loss": 0.0043, + "num_input_tokens_seen": 219368368, + "step": 101660 + }, + { + "epoch": 16.58482871125612, + "grad_norm": 0.0012300984235480428, + "learning_rate": 8.623099854078643e-05, + "loss": 0.0007, + "num_input_tokens_seen": 219379856, + "step": 101665 + }, + { + "epoch": 16.58564437194127, + "grad_norm": 0.0005591753870248795, + "learning_rate": 8.619104169064734e-05, + "loss": 0.001, + "num_input_tokens_seen": 219389904, + "step": 101670 + }, + { + "epoch": 16.586460032626427, + "grad_norm": 0.0033935534302145243, + "learning_rate": 8.615109322687203e-05, + "loss": 0.0675, + "num_input_tokens_seen": 219400656, + "step": 101675 + }, + { + "epoch": 16.587275693311582, + "grad_norm": 0.0003701037203427404, + "learning_rate": 8.611115315026951e-05, + "loss": 0.0361, + "num_input_tokens_seen": 219410736, + "step": 101680 + }, + { + "epoch": 16.588091353996738, + "grad_norm": 0.000397946365410462, + "learning_rate": 8.607122146164986e-05, + "loss": 0.0119, + "num_input_tokens_seen": 219423632, + "step": 101685 + }, + { + "epoch": 16.588907014681894, + "grad_norm": 0.052193064242601395, + "learning_rate": 8.60312981618217e-05, + "loss": 0.0037, + "num_input_tokens_seen": 219434800, + "step": 101690 + }, + { + "epoch": 16.589722675367046, + "grad_norm": 0.035141173750162125, + "learning_rate": 8.599138325159472e-05, + "loss": 0.0007, + "num_input_tokens_seen": 219445456, + "step": 101695 + }, + { + "epoch": 16.5905383360522, + "grad_norm": 0.00012049770157318562, + "learning_rate": 8.595147673177728e-05, + "loss": 0.0006, + "num_input_tokens_seen": 219456400, + "step": 101700 + }, + { + "epoch": 16.591353996737357, + "grad_norm": 0.0003388400946278125, + "learning_rate": 8.591157860317871e-05, + "loss": 0.0003, + "num_input_tokens_seen": 219467472, + "step": 101705 + }, + { + "epoch": 16.592169657422513, + "grad_norm": 0.013764660805463791, + "learning_rate": 8.587168886660707e-05, + "loss": 0.0005, + "num_input_tokens_seen": 219478160, + "step": 101710 + }, + { + "epoch": 16.59298531810767, + "grad_norm": 0.01611482910811901, + "learning_rate": 8.583180752287123e-05, + "loss": 0.0056, + "num_input_tokens_seen": 219487664, + "step": 101715 + }, + { + "epoch": 16.59380097879282, + "grad_norm": 0.00016631743346806616, + "learning_rate": 8.579193457277895e-05, + "loss": 0.0011, + "num_input_tokens_seen": 219498192, + "step": 101720 + }, + { + "epoch": 16.594616639477977, + "grad_norm": 0.0024386378936469555, + "learning_rate": 8.575207001713875e-05, + "loss": 0.0406, + "num_input_tokens_seen": 219508400, + "step": 101725 + }, + { + "epoch": 16.595432300163132, + "grad_norm": 0.00016337491979356855, + "learning_rate": 8.571221385675832e-05, + "loss": 0.0017, + "num_input_tokens_seen": 219518896, + "step": 101730 + }, + { + "epoch": 16.596247960848288, + "grad_norm": 0.007651092018932104, + "learning_rate": 8.567236609244544e-05, + "loss": 0.0022, + "num_input_tokens_seen": 219530128, + "step": 101735 + }, + { + "epoch": 16.597063621533444, + "grad_norm": 0.0038077482022345066, + "learning_rate": 8.563252672500771e-05, + "loss": 0.0022, + "num_input_tokens_seen": 219541168, + "step": 101740 + }, + { + "epoch": 16.597879282218596, + "grad_norm": 0.00014837698836345226, + "learning_rate": 8.559269575525247e-05, + "loss": 0.0013, + "num_input_tokens_seen": 219552656, + "step": 101745 + }, + { + "epoch": 16.59869494290375, + "grad_norm": 0.0003932917315978557, + "learning_rate": 8.555287318398697e-05, + "loss": 0.001, + "num_input_tokens_seen": 219563248, + "step": 101750 + }, + { + "epoch": 16.599510603588907, + "grad_norm": 0.0018113452242687345, + "learning_rate": 8.551305901201822e-05, + "loss": 0.0006, + "num_input_tokens_seen": 219573840, + "step": 101755 + }, + { + "epoch": 16.600326264274063, + "grad_norm": 0.00034973089350387454, + "learning_rate": 8.54732532401532e-05, + "loss": 0.0004, + "num_input_tokens_seen": 219583888, + "step": 101760 + }, + { + "epoch": 16.601141924959215, + "grad_norm": 0.00021492174710147083, + "learning_rate": 8.543345586919854e-05, + "loss": 0.0019, + "num_input_tokens_seen": 219594672, + "step": 101765 + }, + { + "epoch": 16.60195758564437, + "grad_norm": 0.1355443149805069, + "learning_rate": 8.53936668999608e-05, + "loss": 0.0101, + "num_input_tokens_seen": 219605520, + "step": 101770 + }, + { + "epoch": 16.602773246329527, + "grad_norm": 0.0004032077267765999, + "learning_rate": 8.535388633324625e-05, + "loss": 0.001, + "num_input_tokens_seen": 219614832, + "step": 101775 + }, + { + "epoch": 16.603588907014682, + "grad_norm": 0.0013673050561919808, + "learning_rate": 8.531411416986152e-05, + "loss": 0.0002, + "num_input_tokens_seen": 219625040, + "step": 101780 + }, + { + "epoch": 16.604404567699838, + "grad_norm": 0.00020742157357744873, + "learning_rate": 8.5274350410612e-05, + "loss": 0.0006, + "num_input_tokens_seen": 219635312, + "step": 101785 + }, + { + "epoch": 16.605220228384994, + "grad_norm": 0.008629449643194675, + "learning_rate": 8.523459505630415e-05, + "loss": 0.0428, + "num_input_tokens_seen": 219646128, + "step": 101790 + }, + { + "epoch": 16.606035889070146, + "grad_norm": 0.0014691862743347883, + "learning_rate": 8.51948481077432e-05, + "loss": 0.0248, + "num_input_tokens_seen": 219657136, + "step": 101795 + }, + { + "epoch": 16.6068515497553, + "grad_norm": 0.0007061989163048565, + "learning_rate": 8.515510956573507e-05, + "loss": 0.0001, + "num_input_tokens_seen": 219667440, + "step": 101800 + }, + { + "epoch": 16.607667210440457, + "grad_norm": 0.0002825288975145668, + "learning_rate": 8.511537943108466e-05, + "loss": 0.0009, + "num_input_tokens_seen": 219677968, + "step": 101805 + }, + { + "epoch": 16.608482871125613, + "grad_norm": 0.6444823741912842, + "learning_rate": 8.507565770459769e-05, + "loss": 0.0255, + "num_input_tokens_seen": 219689424, + "step": 101810 + }, + { + "epoch": 16.609298531810765, + "grad_norm": 0.0001055434113368392, + "learning_rate": 8.503594438707856e-05, + "loss": 0.0021, + "num_input_tokens_seen": 219699888, + "step": 101815 + }, + { + "epoch": 16.61011419249592, + "grad_norm": 0.0029770859982818365, + "learning_rate": 8.499623947933276e-05, + "loss": 0.0026, + "num_input_tokens_seen": 219711376, + "step": 101820 + }, + { + "epoch": 16.610929853181077, + "grad_norm": 0.004444227088242769, + "learning_rate": 8.495654298216438e-05, + "loss": 0.0015, + "num_input_tokens_seen": 219721360, + "step": 101825 + }, + { + "epoch": 16.611745513866232, + "grad_norm": 0.002673287643119693, + "learning_rate": 8.49168548963784e-05, + "loss": 0.0009, + "num_input_tokens_seen": 219732752, + "step": 101830 + }, + { + "epoch": 16.612561174551388, + "grad_norm": 0.034069135785102844, + "learning_rate": 8.487717522277872e-05, + "loss": 0.0032, + "num_input_tokens_seen": 219743824, + "step": 101835 + }, + { + "epoch": 16.61337683523654, + "grad_norm": 1.0724304914474487, + "learning_rate": 8.483750396216988e-05, + "loss": 0.032, + "num_input_tokens_seen": 219755184, + "step": 101840 + }, + { + "epoch": 16.614192495921696, + "grad_norm": 0.00010677549289539456, + "learning_rate": 8.479784111535549e-05, + "loss": 0.0022, + "num_input_tokens_seen": 219765968, + "step": 101845 + }, + { + "epoch": 16.61500815660685, + "grad_norm": 0.04592803493142128, + "learning_rate": 8.475818668313984e-05, + "loss": 0.0018, + "num_input_tokens_seen": 219776016, + "step": 101850 + }, + { + "epoch": 16.615823817292007, + "grad_norm": 0.00033590669045224786, + "learning_rate": 8.471854066632607e-05, + "loss": 0.0034, + "num_input_tokens_seen": 219786448, + "step": 101855 + }, + { + "epoch": 16.616639477977163, + "grad_norm": 0.08825443685054779, + "learning_rate": 8.467890306571795e-05, + "loss": 0.0165, + "num_input_tokens_seen": 219797104, + "step": 101860 + }, + { + "epoch": 16.617455138662315, + "grad_norm": 0.0026127572637051344, + "learning_rate": 8.463927388211878e-05, + "loss": 0.0068, + "num_input_tokens_seen": 219807056, + "step": 101865 + }, + { + "epoch": 16.61827079934747, + "grad_norm": 0.0018160874024033546, + "learning_rate": 8.459965311633161e-05, + "loss": 0.001, + "num_input_tokens_seen": 219817232, + "step": 101870 + }, + { + "epoch": 16.619086460032626, + "grad_norm": 0.033748239278793335, + "learning_rate": 8.456004076915952e-05, + "loss": 0.002, + "num_input_tokens_seen": 219828240, + "step": 101875 + }, + { + "epoch": 16.619902120717782, + "grad_norm": 0.0031462085898965597, + "learning_rate": 8.452043684140514e-05, + "loss": 0.0006, + "num_input_tokens_seen": 219838480, + "step": 101880 + }, + { + "epoch": 16.620717781402938, + "grad_norm": 0.00029203720623627305, + "learning_rate": 8.448084133387124e-05, + "loss": 0.0009, + "num_input_tokens_seen": 219849808, + "step": 101885 + }, + { + "epoch": 16.62153344208809, + "grad_norm": 0.002520489040762186, + "learning_rate": 8.444125424736016e-05, + "loss": 0.0016, + "num_input_tokens_seen": 219861232, + "step": 101890 + }, + { + "epoch": 16.622349102773246, + "grad_norm": 0.010342425666749477, + "learning_rate": 8.440167558267431e-05, + "loss": 0.0051, + "num_input_tokens_seen": 219869904, + "step": 101895 + }, + { + "epoch": 16.6231647634584, + "grad_norm": 0.00011348602129146457, + "learning_rate": 8.436210534061567e-05, + "loss": 0.0004, + "num_input_tokens_seen": 219881680, + "step": 101900 + }, + { + "epoch": 16.623980424143557, + "grad_norm": 0.0003730079042725265, + "learning_rate": 8.432254352198626e-05, + "loss": 0.0007, + "num_input_tokens_seen": 219892400, + "step": 101905 + }, + { + "epoch": 16.624796084828713, + "grad_norm": 0.0003928591904696077, + "learning_rate": 8.428299012758778e-05, + "loss": 0.0006, + "num_input_tokens_seen": 219903344, + "step": 101910 + }, + { + "epoch": 16.625611745513865, + "grad_norm": 0.0016685236478224397, + "learning_rate": 8.424344515822197e-05, + "loss": 0.0005, + "num_input_tokens_seen": 219914416, + "step": 101915 + }, + { + "epoch": 16.62642740619902, + "grad_norm": 0.0004602029512170702, + "learning_rate": 8.420390861468996e-05, + "loss": 0.0008, + "num_input_tokens_seen": 219924688, + "step": 101920 + }, + { + "epoch": 16.627243066884176, + "grad_norm": 0.006051466800272465, + "learning_rate": 8.416438049779351e-05, + "loss": 0.1417, + "num_input_tokens_seen": 219935152, + "step": 101925 + }, + { + "epoch": 16.628058727569332, + "grad_norm": 0.009506523609161377, + "learning_rate": 8.412486080833315e-05, + "loss": 0.0015, + "num_input_tokens_seen": 219945648, + "step": 101930 + }, + { + "epoch": 16.628874388254488, + "grad_norm": 0.0006694772746413946, + "learning_rate": 8.408534954711034e-05, + "loss": 0.0006, + "num_input_tokens_seen": 219955952, + "step": 101935 + }, + { + "epoch": 16.62969004893964, + "grad_norm": 0.00028314359951764345, + "learning_rate": 8.404584671492526e-05, + "loss": 0.0008, + "num_input_tokens_seen": 219967024, + "step": 101940 + }, + { + "epoch": 16.630505709624796, + "grad_norm": 0.013631414622068405, + "learning_rate": 8.400635231257902e-05, + "loss": 0.0027, + "num_input_tokens_seen": 219976304, + "step": 101945 + }, + { + "epoch": 16.63132137030995, + "grad_norm": 0.013736164197325706, + "learning_rate": 8.396686634087159e-05, + "loss": 0.0016, + "num_input_tokens_seen": 219988336, + "step": 101950 + }, + { + "epoch": 16.632137030995107, + "grad_norm": 0.0012892892118543386, + "learning_rate": 8.392738880060358e-05, + "loss": 0.0004, + "num_input_tokens_seen": 219997808, + "step": 101955 + }, + { + "epoch": 16.63295269168026, + "grad_norm": 0.0006707051070407033, + "learning_rate": 8.388791969257458e-05, + "loss": 0.0012, + "num_input_tokens_seen": 220008816, + "step": 101960 + }, + { + "epoch": 16.633768352365415, + "grad_norm": 0.002703250152990222, + "learning_rate": 8.384845901758498e-05, + "loss": 0.002, + "num_input_tokens_seen": 220020048, + "step": 101965 + }, + { + "epoch": 16.63458401305057, + "grad_norm": 0.01806464046239853, + "learning_rate": 8.380900677643421e-05, + "loss": 0.0015, + "num_input_tokens_seen": 220031152, + "step": 101970 + }, + { + "epoch": 16.635399673735726, + "grad_norm": 0.0014469883171841502, + "learning_rate": 8.376956296992195e-05, + "loss": 0.005, + "num_input_tokens_seen": 220041744, + "step": 101975 + }, + { + "epoch": 16.636215334420882, + "grad_norm": 0.00690801814198494, + "learning_rate": 8.373012759884746e-05, + "loss": 0.0007, + "num_input_tokens_seen": 220050896, + "step": 101980 + }, + { + "epoch": 16.637030995106034, + "grad_norm": 0.00046486471546813846, + "learning_rate": 8.369070066401003e-05, + "loss": 0.0004, + "num_input_tokens_seen": 220061040, + "step": 101985 + }, + { + "epoch": 16.63784665579119, + "grad_norm": 0.0005920170806348324, + "learning_rate": 8.365128216620871e-05, + "loss": 0.0002, + "num_input_tokens_seen": 220072336, + "step": 101990 + }, + { + "epoch": 16.638662316476346, + "grad_norm": 0.00013604594278149307, + "learning_rate": 8.361187210624232e-05, + "loss": 0.0016, + "num_input_tokens_seen": 220084016, + "step": 101995 + }, + { + "epoch": 16.6394779771615, + "grad_norm": 0.003790405346080661, + "learning_rate": 8.357247048490957e-05, + "loss": 0.1224, + "num_input_tokens_seen": 220095376, + "step": 102000 + }, + { + "epoch": 16.640293637846657, + "grad_norm": 0.0020531124901026487, + "learning_rate": 8.353307730300897e-05, + "loss": 0.0004, + "num_input_tokens_seen": 220105616, + "step": 102005 + }, + { + "epoch": 16.64110929853181, + "grad_norm": 0.0030671374406665564, + "learning_rate": 8.349369256133888e-05, + "loss": 0.0007, + "num_input_tokens_seen": 220117072, + "step": 102010 + }, + { + "epoch": 16.641924959216965, + "grad_norm": 0.2832264006137848, + "learning_rate": 8.345431626069744e-05, + "loss": 0.0074, + "num_input_tokens_seen": 220127248, + "step": 102015 + }, + { + "epoch": 16.64274061990212, + "grad_norm": 0.055892497301101685, + "learning_rate": 8.34149484018828e-05, + "loss": 0.0043, + "num_input_tokens_seen": 220138224, + "step": 102020 + }, + { + "epoch": 16.643556280587276, + "grad_norm": 0.013957837596535683, + "learning_rate": 8.337558898569264e-05, + "loss": 0.0007, + "num_input_tokens_seen": 220150512, + "step": 102025 + }, + { + "epoch": 16.644371941272432, + "grad_norm": 0.0008855264168232679, + "learning_rate": 8.333623801292472e-05, + "loss": 0.0022, + "num_input_tokens_seen": 220161872, + "step": 102030 + }, + { + "epoch": 16.645187601957584, + "grad_norm": 0.04549108445644379, + "learning_rate": 8.329689548437652e-05, + "loss": 0.0011, + "num_input_tokens_seen": 220171856, + "step": 102035 + }, + { + "epoch": 16.64600326264274, + "grad_norm": 0.0018148330273106694, + "learning_rate": 8.325756140084533e-05, + "loss": 0.001, + "num_input_tokens_seen": 220181712, + "step": 102040 + }, + { + "epoch": 16.646818923327896, + "grad_norm": 0.0018879264825955033, + "learning_rate": 8.321823576312837e-05, + "loss": 0.0003, + "num_input_tokens_seen": 220192336, + "step": 102045 + }, + { + "epoch": 16.64763458401305, + "grad_norm": 0.0021527979988604784, + "learning_rate": 8.317891857202253e-05, + "loss": 0.0012, + "num_input_tokens_seen": 220202768, + "step": 102050 + }, + { + "epoch": 16.648450244698207, + "grad_norm": 0.06411556154489517, + "learning_rate": 8.313960982832475e-05, + "loss": 0.0012, + "num_input_tokens_seen": 220214128, + "step": 102055 + }, + { + "epoch": 16.64926590538336, + "grad_norm": 0.0009894641116261482, + "learning_rate": 8.310030953283154e-05, + "loss": 0.0002, + "num_input_tokens_seen": 220224016, + "step": 102060 + }, + { + "epoch": 16.650081566068515, + "grad_norm": 0.002078613033518195, + "learning_rate": 8.30610176863394e-05, + "loss": 0.001, + "num_input_tokens_seen": 220234384, + "step": 102065 + }, + { + "epoch": 16.65089722675367, + "grad_norm": 0.000977556686848402, + "learning_rate": 8.302173428964472e-05, + "loss": 0.0009, + "num_input_tokens_seen": 220245680, + "step": 102070 + }, + { + "epoch": 16.651712887438826, + "grad_norm": 0.11465684324502945, + "learning_rate": 8.298245934354353e-05, + "loss": 0.0037, + "num_input_tokens_seen": 220256976, + "step": 102075 + }, + { + "epoch": 16.652528548123982, + "grad_norm": 0.0007347901700995862, + "learning_rate": 8.29431928488319e-05, + "loss": 0.0026, + "num_input_tokens_seen": 220268400, + "step": 102080 + }, + { + "epoch": 16.653344208809134, + "grad_norm": 0.0015887911431491375, + "learning_rate": 8.290393480630549e-05, + "loss": 0.0013, + "num_input_tokens_seen": 220278640, + "step": 102085 + }, + { + "epoch": 16.65415986949429, + "grad_norm": 0.17763040959835052, + "learning_rate": 8.286468521676e-05, + "loss": 0.0041, + "num_input_tokens_seen": 220287312, + "step": 102090 + }, + { + "epoch": 16.654975530179446, + "grad_norm": 0.6964755654335022, + "learning_rate": 8.282544408099079e-05, + "loss": 0.0312, + "num_input_tokens_seen": 220297360, + "step": 102095 + }, + { + "epoch": 16.6557911908646, + "grad_norm": 0.00019680529658216983, + "learning_rate": 8.278621139979325e-05, + "loss": 0.0018, + "num_input_tokens_seen": 220308464, + "step": 102100 + }, + { + "epoch": 16.656606851549757, + "grad_norm": 0.05441994592547417, + "learning_rate": 8.274698717396234e-05, + "loss": 0.0009, + "num_input_tokens_seen": 220318960, + "step": 102105 + }, + { + "epoch": 16.65742251223491, + "grad_norm": 0.0004269436758477241, + "learning_rate": 8.270777140429308e-05, + "loss": 0.0023, + "num_input_tokens_seen": 220330544, + "step": 102110 + }, + { + "epoch": 16.658238172920065, + "grad_norm": 0.004170997999608517, + "learning_rate": 8.266856409158025e-05, + "loss": 0.0008, + "num_input_tokens_seen": 220342576, + "step": 102115 + }, + { + "epoch": 16.65905383360522, + "grad_norm": 0.00033073779195547104, + "learning_rate": 8.262936523661835e-05, + "loss": 0.0099, + "num_input_tokens_seen": 220354000, + "step": 102120 + }, + { + "epoch": 16.659869494290376, + "grad_norm": 0.0007536531775258482, + "learning_rate": 8.259017484020181e-05, + "loss": 0.0007, + "num_input_tokens_seen": 220365136, + "step": 102125 + }, + { + "epoch": 16.660685154975532, + "grad_norm": 0.0014028383884578943, + "learning_rate": 8.255099290312495e-05, + "loss": 0.001, + "num_input_tokens_seen": 220376208, + "step": 102130 + }, + { + "epoch": 16.661500815660684, + "grad_norm": 0.8239142298698425, + "learning_rate": 8.251181942618174e-05, + "loss": 0.0469, + "num_input_tokens_seen": 220386640, + "step": 102135 + }, + { + "epoch": 16.66231647634584, + "grad_norm": 0.019336091354489326, + "learning_rate": 8.247265441016621e-05, + "loss": 0.0023, + "num_input_tokens_seen": 220397136, + "step": 102140 + }, + { + "epoch": 16.663132137030995, + "grad_norm": 0.007177197840064764, + "learning_rate": 8.243349785587195e-05, + "loss": 0.002, + "num_input_tokens_seen": 220408848, + "step": 102145 + }, + { + "epoch": 16.66394779771615, + "grad_norm": 0.030749486759305, + "learning_rate": 8.23943497640926e-05, + "loss": 0.0061, + "num_input_tokens_seen": 220418512, + "step": 102150 + }, + { + "epoch": 16.664763458401303, + "grad_norm": 0.05676698684692383, + "learning_rate": 8.235521013562148e-05, + "loss": 0.0018, + "num_input_tokens_seen": 220429328, + "step": 102155 + }, + { + "epoch": 16.66557911908646, + "grad_norm": 0.0005361379007808864, + "learning_rate": 8.231607897125188e-05, + "loss": 0.0009, + "num_input_tokens_seen": 220439312, + "step": 102160 + }, + { + "epoch": 16.666394779771615, + "grad_norm": 0.0014454021584242582, + "learning_rate": 8.227695627177678e-05, + "loss": 0.0002, + "num_input_tokens_seen": 220450448, + "step": 102165 + }, + { + "epoch": 16.66721044045677, + "grad_norm": 0.000402638252126053, + "learning_rate": 8.223784203798912e-05, + "loss": 0.0003, + "num_input_tokens_seen": 220461552, + "step": 102170 + }, + { + "epoch": 16.668026101141926, + "grad_norm": 0.07250863313674927, + "learning_rate": 8.219873627068141e-05, + "loss": 0.0022, + "num_input_tokens_seen": 220472624, + "step": 102175 + }, + { + "epoch": 16.66884176182708, + "grad_norm": 0.0022812052629888058, + "learning_rate": 8.21596389706466e-05, + "loss": 0.0152, + "num_input_tokens_seen": 220482672, + "step": 102180 + }, + { + "epoch": 16.669657422512234, + "grad_norm": 0.006014272570610046, + "learning_rate": 8.212055013867654e-05, + "loss": 0.0008, + "num_input_tokens_seen": 220493488, + "step": 102185 + }, + { + "epoch": 16.67047308319739, + "grad_norm": 0.0030713954474776983, + "learning_rate": 8.208146977556386e-05, + "loss": 0.023, + "num_input_tokens_seen": 220504976, + "step": 102190 + }, + { + "epoch": 16.671288743882545, + "grad_norm": 0.0018777098739519715, + "learning_rate": 8.204239788210011e-05, + "loss": 0.0013, + "num_input_tokens_seen": 220516240, + "step": 102195 + }, + { + "epoch": 16.6721044045677, + "grad_norm": 0.05832066759467125, + "learning_rate": 8.200333445907766e-05, + "loss": 0.0041, + "num_input_tokens_seen": 220527408, + "step": 102200 + }, + { + "epoch": 16.672920065252853, + "grad_norm": 0.0013136352645233274, + "learning_rate": 8.196427950728763e-05, + "loss": 0.0006, + "num_input_tokens_seen": 220537520, + "step": 102205 + }, + { + "epoch": 16.67373572593801, + "grad_norm": 0.000254054059041664, + "learning_rate": 8.192523302752192e-05, + "loss": 0.0001, + "num_input_tokens_seen": 220548304, + "step": 102210 + }, + { + "epoch": 16.674551386623165, + "grad_norm": 0.0003200510691385716, + "learning_rate": 8.188619502057176e-05, + "loss": 0.003, + "num_input_tokens_seen": 220559728, + "step": 102215 + }, + { + "epoch": 16.67536704730832, + "grad_norm": 0.04036097228527069, + "learning_rate": 8.184716548722825e-05, + "loss": 0.0012, + "num_input_tokens_seen": 220571152, + "step": 102220 + }, + { + "epoch": 16.676182707993476, + "grad_norm": 0.001222164137288928, + "learning_rate": 8.180814442828238e-05, + "loss": 0.0019, + "num_input_tokens_seen": 220582544, + "step": 102225 + }, + { + "epoch": 16.67699836867863, + "grad_norm": 0.016134627163410187, + "learning_rate": 8.1769131844525e-05, + "loss": 0.0018, + "num_input_tokens_seen": 220593616, + "step": 102230 + }, + { + "epoch": 16.677814029363784, + "grad_norm": 0.008967578411102295, + "learning_rate": 8.173012773674671e-05, + "loss": 0.0067, + "num_input_tokens_seen": 220604880, + "step": 102235 + }, + { + "epoch": 16.67862969004894, + "grad_norm": 0.0002311637654202059, + "learning_rate": 8.169113210573803e-05, + "loss": 0.0014, + "num_input_tokens_seen": 220615824, + "step": 102240 + }, + { + "epoch": 16.679445350734095, + "grad_norm": 0.000969396554864943, + "learning_rate": 8.165214495228918e-05, + "loss": 0.0007, + "num_input_tokens_seen": 220626160, + "step": 102245 + }, + { + "epoch": 16.68026101141925, + "grad_norm": 0.003749568248167634, + "learning_rate": 8.161316627719035e-05, + "loss": 0.0008, + "num_input_tokens_seen": 220637552, + "step": 102250 + }, + { + "epoch": 16.681076672104403, + "grad_norm": 0.00010079441562993452, + "learning_rate": 8.157419608123145e-05, + "loss": 0.0008, + "num_input_tokens_seen": 220649232, + "step": 102255 + }, + { + "epoch": 16.68189233278956, + "grad_norm": 0.0006482730386778712, + "learning_rate": 8.153523436520226e-05, + "loss": 0.0287, + "num_input_tokens_seen": 220659664, + "step": 102260 + }, + { + "epoch": 16.682707993474715, + "grad_norm": 0.0007627441664226353, + "learning_rate": 8.149628112989243e-05, + "loss": 0.0049, + "num_input_tokens_seen": 220670192, + "step": 102265 + }, + { + "epoch": 16.68352365415987, + "grad_norm": 0.09153059870004654, + "learning_rate": 8.145733637609137e-05, + "loss": 0.0145, + "num_input_tokens_seen": 220680112, + "step": 102270 + }, + { + "epoch": 16.684339314845026, + "grad_norm": 0.007386372424662113, + "learning_rate": 8.141840010458835e-05, + "loss": 0.0049, + "num_input_tokens_seen": 220690192, + "step": 102275 + }, + { + "epoch": 16.68515497553018, + "grad_norm": 0.003157742088660598, + "learning_rate": 8.137947231617237e-05, + "loss": 0.0015, + "num_input_tokens_seen": 220701392, + "step": 102280 + }, + { + "epoch": 16.685970636215334, + "grad_norm": 0.01893356256186962, + "learning_rate": 8.134055301163263e-05, + "loss": 0.0015, + "num_input_tokens_seen": 220712336, + "step": 102285 + }, + { + "epoch": 16.68678629690049, + "grad_norm": 0.008784581907093525, + "learning_rate": 8.130164219175745e-05, + "loss": 0.0006, + "num_input_tokens_seen": 220722576, + "step": 102290 + }, + { + "epoch": 16.687601957585645, + "grad_norm": 0.1381399780511856, + "learning_rate": 8.126273985733595e-05, + "loss": 0.0032, + "num_input_tokens_seen": 220733264, + "step": 102295 + }, + { + "epoch": 16.6884176182708, + "grad_norm": 0.06648890674114227, + "learning_rate": 8.122384600915594e-05, + "loss": 0.0032, + "num_input_tokens_seen": 220744848, + "step": 102300 + }, + { + "epoch": 16.689233278955953, + "grad_norm": 0.0006427154294215143, + "learning_rate": 8.118496064800618e-05, + "loss": 0.0002, + "num_input_tokens_seen": 220755728, + "step": 102305 + }, + { + "epoch": 16.69004893964111, + "grad_norm": 0.00027974555268883705, + "learning_rate": 8.11460837746743e-05, + "loss": 0.002, + "num_input_tokens_seen": 220765840, + "step": 102310 + }, + { + "epoch": 16.690864600326265, + "grad_norm": 0.1433115005493164, + "learning_rate": 8.110721538994859e-05, + "loss": 0.0366, + "num_input_tokens_seen": 220776816, + "step": 102315 + }, + { + "epoch": 16.69168026101142, + "grad_norm": 0.001408384065143764, + "learning_rate": 8.106835549461633e-05, + "loss": 0.0005, + "num_input_tokens_seen": 220787888, + "step": 102320 + }, + { + "epoch": 16.692495921696576, + "grad_norm": 0.008002794347703457, + "learning_rate": 8.102950408946552e-05, + "loss": 0.001, + "num_input_tokens_seen": 220799248, + "step": 102325 + }, + { + "epoch": 16.693311582381728, + "grad_norm": 0.012343596667051315, + "learning_rate": 8.099066117528308e-05, + "loss": 0.0006, + "num_input_tokens_seen": 220810384, + "step": 102330 + }, + { + "epoch": 16.694127243066884, + "grad_norm": 0.0003763117711059749, + "learning_rate": 8.095182675285673e-05, + "loss": 0.0008, + "num_input_tokens_seen": 220821296, + "step": 102335 + }, + { + "epoch": 16.69494290375204, + "grad_norm": 0.008047780022025108, + "learning_rate": 8.091300082297293e-05, + "loss": 0.0104, + "num_input_tokens_seen": 220831248, + "step": 102340 + }, + { + "epoch": 16.695758564437195, + "grad_norm": 0.000963518163189292, + "learning_rate": 8.087418338641906e-05, + "loss": 0.0004, + "num_input_tokens_seen": 220842544, + "step": 102345 + }, + { + "epoch": 16.696574225122347, + "grad_norm": 0.0010410456452518702, + "learning_rate": 8.083537444398131e-05, + "loss": 0.0007, + "num_input_tokens_seen": 220853840, + "step": 102350 + }, + { + "epoch": 16.697389885807503, + "grad_norm": 0.00013864839274901897, + "learning_rate": 8.079657399644664e-05, + "loss": 0.0003, + "num_input_tokens_seen": 220864944, + "step": 102355 + }, + { + "epoch": 16.69820554649266, + "grad_norm": 0.00015487702330574393, + "learning_rate": 8.07577820446011e-05, + "loss": 0.0011, + "num_input_tokens_seen": 220876688, + "step": 102360 + }, + { + "epoch": 16.699021207177815, + "grad_norm": 0.00029875501058995724, + "learning_rate": 8.071899858923098e-05, + "loss": 0.0013, + "num_input_tokens_seen": 220888016, + "step": 102365 + }, + { + "epoch": 16.69983686786297, + "grad_norm": 0.000797287211753428, + "learning_rate": 8.068022363112227e-05, + "loss": 0.0048, + "num_input_tokens_seen": 220899440, + "step": 102370 + }, + { + "epoch": 16.700652528548122, + "grad_norm": 0.05315621569752693, + "learning_rate": 8.064145717106075e-05, + "loss": 0.0031, + "num_input_tokens_seen": 220910320, + "step": 102375 + }, + { + "epoch": 16.701468189233278, + "grad_norm": 0.0010179190430790186, + "learning_rate": 8.06026992098321e-05, + "loss": 0.0002, + "num_input_tokens_seen": 220921200, + "step": 102380 + }, + { + "epoch": 16.702283849918434, + "grad_norm": 0.0012392103672027588, + "learning_rate": 8.056394974822185e-05, + "loss": 0.005, + "num_input_tokens_seen": 220932048, + "step": 102385 + }, + { + "epoch": 16.70309951060359, + "grad_norm": 0.004453158006072044, + "learning_rate": 8.052520878701519e-05, + "loss": 0.0003, + "num_input_tokens_seen": 220942000, + "step": 102390 + }, + { + "epoch": 16.703915171288745, + "grad_norm": 0.0006805131561122835, + "learning_rate": 8.04864763269973e-05, + "loss": 0.0006, + "num_input_tokens_seen": 220952336, + "step": 102395 + }, + { + "epoch": 16.704730831973897, + "grad_norm": 0.00012793141650035977, + "learning_rate": 8.044775236895319e-05, + "loss": 0.0014, + "num_input_tokens_seen": 220964400, + "step": 102400 + }, + { + "epoch": 16.705546492659053, + "grad_norm": 0.04461607709527016, + "learning_rate": 8.040903691366753e-05, + "loss": 0.0541, + "num_input_tokens_seen": 220974672, + "step": 102405 + }, + { + "epoch": 16.70636215334421, + "grad_norm": 0.0006162897334434092, + "learning_rate": 8.037032996192522e-05, + "loss": 0.0002, + "num_input_tokens_seen": 220985904, + "step": 102410 + }, + { + "epoch": 16.707177814029365, + "grad_norm": 0.02729339525103569, + "learning_rate": 8.033163151451028e-05, + "loss": 0.001, + "num_input_tokens_seen": 220997040, + "step": 102415 + }, + { + "epoch": 16.70799347471452, + "grad_norm": 0.0004722194862551987, + "learning_rate": 8.029294157220746e-05, + "loss": 0.0004, + "num_input_tokens_seen": 221007312, + "step": 102420 + }, + { + "epoch": 16.708809135399672, + "grad_norm": 0.022223806008696556, + "learning_rate": 8.025426013580033e-05, + "loss": 0.0006, + "num_input_tokens_seen": 221018800, + "step": 102425 + }, + { + "epoch": 16.709624796084828, + "grad_norm": 0.01881605200469494, + "learning_rate": 8.021558720607342e-05, + "loss": 0.0025, + "num_input_tokens_seen": 221028656, + "step": 102430 + }, + { + "epoch": 16.710440456769984, + "grad_norm": 0.034202367067337036, + "learning_rate": 8.01769227838099e-05, + "loss": 0.0007, + "num_input_tokens_seen": 221040208, + "step": 102435 + }, + { + "epoch": 16.71125611745514, + "grad_norm": 0.0002508640754967928, + "learning_rate": 8.013826686979381e-05, + "loss": 0.0003, + "num_input_tokens_seen": 221051888, + "step": 102440 + }, + { + "epoch": 16.712071778140295, + "grad_norm": 0.004685740917921066, + "learning_rate": 8.00996194648082e-05, + "loss": 0.0054, + "num_input_tokens_seen": 221063088, + "step": 102445 + }, + { + "epoch": 16.712887438825447, + "grad_norm": 0.022753393277525902, + "learning_rate": 8.006098056963668e-05, + "loss": 0.0009, + "num_input_tokens_seen": 221074448, + "step": 102450 + }, + { + "epoch": 16.713703099510603, + "grad_norm": 0.023375259712338448, + "learning_rate": 8.002235018506194e-05, + "loss": 0.0008, + "num_input_tokens_seen": 221085648, + "step": 102455 + }, + { + "epoch": 16.71451876019576, + "grad_norm": 0.01081467792391777, + "learning_rate": 7.998372831186723e-05, + "loss": 0.0357, + "num_input_tokens_seen": 221095248, + "step": 102460 + }, + { + "epoch": 16.715334420880914, + "grad_norm": 0.0035164321307092905, + "learning_rate": 7.99451149508349e-05, + "loss": 0.0014, + "num_input_tokens_seen": 221104336, + "step": 102465 + }, + { + "epoch": 16.71615008156607, + "grad_norm": 0.7312716245651245, + "learning_rate": 7.990651010274791e-05, + "loss": 0.0179, + "num_input_tokens_seen": 221115600, + "step": 102470 + }, + { + "epoch": 16.716965742251222, + "grad_norm": 0.027586141601204872, + "learning_rate": 7.98679137683882e-05, + "loss": 0.0052, + "num_input_tokens_seen": 221126608, + "step": 102475 + }, + { + "epoch": 16.717781402936378, + "grad_norm": 0.015010256320238113, + "learning_rate": 7.982932594853837e-05, + "loss": 0.0019, + "num_input_tokens_seen": 221137872, + "step": 102480 + }, + { + "epoch": 16.718597063621534, + "grad_norm": 0.07624837756156921, + "learning_rate": 7.979074664398012e-05, + "loss": 0.0017, + "num_input_tokens_seen": 221148272, + "step": 102485 + }, + { + "epoch": 16.71941272430669, + "grad_norm": 0.00014859596558380872, + "learning_rate": 7.975217585549566e-05, + "loss": 0.0003, + "num_input_tokens_seen": 221160048, + "step": 102490 + }, + { + "epoch": 16.72022838499184, + "grad_norm": 0.00788468960672617, + "learning_rate": 7.97136135838662e-05, + "loss": 0.0767, + "num_input_tokens_seen": 221169552, + "step": 102495 + }, + { + "epoch": 16.721044045676997, + "grad_norm": 0.00024140271125361323, + "learning_rate": 7.967505982987372e-05, + "loss": 0.0003, + "num_input_tokens_seen": 221180336, + "step": 102500 + }, + { + "epoch": 16.721859706362153, + "grad_norm": 0.02427835762500763, + "learning_rate": 7.963651459429932e-05, + "loss": 0.001, + "num_input_tokens_seen": 221189328, + "step": 102505 + }, + { + "epoch": 16.72267536704731, + "grad_norm": 0.0006680793594568968, + "learning_rate": 7.959797787792428e-05, + "loss": 0.0053, + "num_input_tokens_seen": 221201136, + "step": 102510 + }, + { + "epoch": 16.723491027732464, + "grad_norm": 0.00039737491169944406, + "learning_rate": 7.955944968152951e-05, + "loss": 0.0006, + "num_input_tokens_seen": 221211344, + "step": 102515 + }, + { + "epoch": 16.724306688417617, + "grad_norm": 0.00015685615653637797, + "learning_rate": 7.952093000589583e-05, + "loss": 0.1566, + "num_input_tokens_seen": 221222672, + "step": 102520 + }, + { + "epoch": 16.725122349102772, + "grad_norm": 0.0028556312900036573, + "learning_rate": 7.948241885180396e-05, + "loss": 0.0007, + "num_input_tokens_seen": 221233040, + "step": 102525 + }, + { + "epoch": 16.725938009787928, + "grad_norm": 0.555871307849884, + "learning_rate": 7.944391622003427e-05, + "loss": 0.1291, + "num_input_tokens_seen": 221242704, + "step": 102530 + }, + { + "epoch": 16.726753670473084, + "grad_norm": 0.002889336785301566, + "learning_rate": 7.94054221113672e-05, + "loss": 0.0006, + "num_input_tokens_seen": 221253232, + "step": 102535 + }, + { + "epoch": 16.72756933115824, + "grad_norm": 0.00034748384496197104, + "learning_rate": 7.936693652658278e-05, + "loss": 0.001, + "num_input_tokens_seen": 221263248, + "step": 102540 + }, + { + "epoch": 16.72838499184339, + "grad_norm": 0.001293393550440669, + "learning_rate": 7.9328459466461e-05, + "loss": 0.0015, + "num_input_tokens_seen": 221272656, + "step": 102545 + }, + { + "epoch": 16.729200652528547, + "grad_norm": 0.00038441919605247676, + "learning_rate": 7.928999093178157e-05, + "loss": 0.0004, + "num_input_tokens_seen": 221284400, + "step": 102550 + }, + { + "epoch": 16.730016313213703, + "grad_norm": 0.0003252491296734661, + "learning_rate": 7.925153092332438e-05, + "loss": 0.0003, + "num_input_tokens_seen": 221295984, + "step": 102555 + }, + { + "epoch": 16.73083197389886, + "grad_norm": 0.0001955062907654792, + "learning_rate": 7.921307944186845e-05, + "loss": 0.013, + "num_input_tokens_seen": 221306544, + "step": 102560 + }, + { + "epoch": 16.731647634584014, + "grad_norm": 0.00043699497473426163, + "learning_rate": 7.91746364881935e-05, + "loss": 0.0004, + "num_input_tokens_seen": 221318544, + "step": 102565 + }, + { + "epoch": 16.732463295269167, + "grad_norm": 0.002367377281188965, + "learning_rate": 7.913620206307814e-05, + "loss": 0.0039, + "num_input_tokens_seen": 221328944, + "step": 102570 + }, + { + "epoch": 16.733278955954322, + "grad_norm": 0.029978040605783463, + "learning_rate": 7.909777616730185e-05, + "loss": 0.0019, + "num_input_tokens_seen": 221340624, + "step": 102575 + }, + { + "epoch": 16.734094616639478, + "grad_norm": 0.01640857756137848, + "learning_rate": 7.905935880164278e-05, + "loss": 0.0008, + "num_input_tokens_seen": 221349744, + "step": 102580 + }, + { + "epoch": 16.734910277324634, + "grad_norm": 0.003674093633890152, + "learning_rate": 7.902094996688009e-05, + "loss": 0.0019, + "num_input_tokens_seen": 221360336, + "step": 102585 + }, + { + "epoch": 16.73572593800979, + "grad_norm": 0.277338445186615, + "learning_rate": 7.89825496637916e-05, + "loss": 0.0066, + "num_input_tokens_seen": 221370448, + "step": 102590 + }, + { + "epoch": 16.73654159869494, + "grad_norm": 0.005879329051822424, + "learning_rate": 7.894415789315612e-05, + "loss": 0.0007, + "num_input_tokens_seen": 221381296, + "step": 102595 + }, + { + "epoch": 16.737357259380097, + "grad_norm": 0.09819705039262772, + "learning_rate": 7.890577465575121e-05, + "loss": 0.0054, + "num_input_tokens_seen": 221392784, + "step": 102600 + }, + { + "epoch": 16.738172920065253, + "grad_norm": 0.0011889958987012506, + "learning_rate": 7.886739995235504e-05, + "loss": 0.0006, + "num_input_tokens_seen": 221404080, + "step": 102605 + }, + { + "epoch": 16.73898858075041, + "grad_norm": 0.4108349084854126, + "learning_rate": 7.882903378374528e-05, + "loss": 0.0053, + "num_input_tokens_seen": 221414448, + "step": 102610 + }, + { + "epoch": 16.739804241435564, + "grad_norm": 0.0425170436501503, + "learning_rate": 7.879067615069946e-05, + "loss": 0.0035, + "num_input_tokens_seen": 221425008, + "step": 102615 + }, + { + "epoch": 16.740619902120716, + "grad_norm": 0.00040296383667737246, + "learning_rate": 7.875232705399488e-05, + "loss": 0.0004, + "num_input_tokens_seen": 221435184, + "step": 102620 + }, + { + "epoch": 16.741435562805872, + "grad_norm": 9.315348142990842e-05, + "learning_rate": 7.871398649440886e-05, + "loss": 0.0003, + "num_input_tokens_seen": 221445328, + "step": 102625 + }, + { + "epoch": 16.742251223491028, + "grad_norm": 0.005336970090866089, + "learning_rate": 7.867565447271829e-05, + "loss": 0.0009, + "num_input_tokens_seen": 221457904, + "step": 102630 + }, + { + "epoch": 16.743066884176184, + "grad_norm": 0.00012458793935365975, + "learning_rate": 7.863733098970006e-05, + "loss": 0.0013, + "num_input_tokens_seen": 221468176, + "step": 102635 + }, + { + "epoch": 16.74388254486134, + "grad_norm": 0.15727068483829498, + "learning_rate": 7.85990160461309e-05, + "loss": 0.0407, + "num_input_tokens_seen": 221479120, + "step": 102640 + }, + { + "epoch": 16.74469820554649, + "grad_norm": 0.0022491668350994587, + "learning_rate": 7.856070964278722e-05, + "loss": 0.0005, + "num_input_tokens_seen": 221490288, + "step": 102645 + }, + { + "epoch": 16.745513866231647, + "grad_norm": 0.0904349759221077, + "learning_rate": 7.852241178044539e-05, + "loss": 0.0016, + "num_input_tokens_seen": 221502384, + "step": 102650 + }, + { + "epoch": 16.746329526916803, + "grad_norm": 0.00012989563401788473, + "learning_rate": 7.848412245988157e-05, + "loss": 0.0008, + "num_input_tokens_seen": 221513488, + "step": 102655 + }, + { + "epoch": 16.74714518760196, + "grad_norm": 0.00031062858761288226, + "learning_rate": 7.84458416818718e-05, + "loss": 0.0005, + "num_input_tokens_seen": 221522832, + "step": 102660 + }, + { + "epoch": 16.747960848287114, + "grad_norm": 0.0020518628880381584, + "learning_rate": 7.840756944719174e-05, + "loss": 0.04, + "num_input_tokens_seen": 221534992, + "step": 102665 + }, + { + "epoch": 16.748776508972266, + "grad_norm": 0.01770104467868805, + "learning_rate": 7.836930575661716e-05, + "loss": 0.0007, + "num_input_tokens_seen": 221546032, + "step": 102670 + }, + { + "epoch": 16.749592169657422, + "grad_norm": 0.15464644134044647, + "learning_rate": 7.83310506109235e-05, + "loss": 0.0036, + "num_input_tokens_seen": 221557296, + "step": 102675 + }, + { + "epoch": 16.750407830342578, + "grad_norm": 0.0003354168438818306, + "learning_rate": 7.829280401088601e-05, + "loss": 0.0098, + "num_input_tokens_seen": 221567504, + "step": 102680 + }, + { + "epoch": 16.751223491027734, + "grad_norm": 0.0010419103782624006, + "learning_rate": 7.82545659572798e-05, + "loss": 0.0017, + "num_input_tokens_seen": 221579184, + "step": 102685 + }, + { + "epoch": 16.752039151712886, + "grad_norm": 0.01649213209748268, + "learning_rate": 7.821633645087984e-05, + "loss": 0.0005, + "num_input_tokens_seen": 221590128, + "step": 102690 + }, + { + "epoch": 16.75285481239804, + "grad_norm": 0.00028294921503402293, + "learning_rate": 7.817811549246079e-05, + "loss": 0.0009, + "num_input_tokens_seen": 221600752, + "step": 102695 + }, + { + "epoch": 16.753670473083197, + "grad_norm": 0.0014531684573739767, + "learning_rate": 7.813990308279755e-05, + "loss": 0.0018, + "num_input_tokens_seen": 221611984, + "step": 102700 + }, + { + "epoch": 16.754486133768353, + "grad_norm": 0.00019200649694539607, + "learning_rate": 7.810169922266413e-05, + "loss": 0.0004, + "num_input_tokens_seen": 221623056, + "step": 102705 + }, + { + "epoch": 16.75530179445351, + "grad_norm": 0.1770632117986679, + "learning_rate": 7.806350391283507e-05, + "loss": 0.0239, + "num_input_tokens_seen": 221634128, + "step": 102710 + }, + { + "epoch": 16.75611745513866, + "grad_norm": 0.0002401823876425624, + "learning_rate": 7.80253171540844e-05, + "loss": 0.0009, + "num_input_tokens_seen": 221644528, + "step": 102715 + }, + { + "epoch": 16.756933115823816, + "grad_norm": 0.001496228389441967, + "learning_rate": 7.798713894718602e-05, + "loss": 0.0005, + "num_input_tokens_seen": 221654480, + "step": 102720 + }, + { + "epoch": 16.757748776508972, + "grad_norm": 0.00041144603164866567, + "learning_rate": 7.794896929291361e-05, + "loss": 0.0066, + "num_input_tokens_seen": 221664336, + "step": 102725 + }, + { + "epoch": 16.758564437194128, + "grad_norm": 0.0008128510671667755, + "learning_rate": 7.791080819204072e-05, + "loss": 0.0003, + "num_input_tokens_seen": 221674480, + "step": 102730 + }, + { + "epoch": 16.759380097879284, + "grad_norm": 0.0005182931781746447, + "learning_rate": 7.78726556453408e-05, + "loss": 0.0035, + "num_input_tokens_seen": 221686224, + "step": 102735 + }, + { + "epoch": 16.760195758564436, + "grad_norm": 0.00027991971001029015, + "learning_rate": 7.783451165358696e-05, + "loss": 0.0002, + "num_input_tokens_seen": 221697616, + "step": 102740 + }, + { + "epoch": 16.76101141924959, + "grad_norm": 0.2265561819076538, + "learning_rate": 7.779637621755236e-05, + "loss": 0.0085, + "num_input_tokens_seen": 221707536, + "step": 102745 + }, + { + "epoch": 16.761827079934747, + "grad_norm": 0.011772893369197845, + "learning_rate": 7.775824933800979e-05, + "loss": 0.0069, + "num_input_tokens_seen": 221718800, + "step": 102750 + }, + { + "epoch": 16.762642740619903, + "grad_norm": 0.00034055992728099227, + "learning_rate": 7.772013101573195e-05, + "loss": 0.0008, + "num_input_tokens_seen": 221728816, + "step": 102755 + }, + { + "epoch": 16.76345840130506, + "grad_norm": 0.000549080315977335, + "learning_rate": 7.768202125149132e-05, + "loss": 0.0123, + "num_input_tokens_seen": 221739120, + "step": 102760 + }, + { + "epoch": 16.76427406199021, + "grad_norm": 0.00012444957974366844, + "learning_rate": 7.76439200460603e-05, + "loss": 0.0014, + "num_input_tokens_seen": 221749328, + "step": 102765 + }, + { + "epoch": 16.765089722675366, + "grad_norm": 0.010822059586644173, + "learning_rate": 7.7605827400211e-05, + "loss": 0.0028, + "num_input_tokens_seen": 221760496, + "step": 102770 + }, + { + "epoch": 16.765905383360522, + "grad_norm": 0.21235309541225433, + "learning_rate": 7.75677433147155e-05, + "loss": 0.0076, + "num_input_tokens_seen": 221772592, + "step": 102775 + }, + { + "epoch": 16.766721044045678, + "grad_norm": 0.01310757827013731, + "learning_rate": 7.752966779034553e-05, + "loss": 0.0005, + "num_input_tokens_seen": 221782256, + "step": 102780 + }, + { + "epoch": 16.767536704730833, + "grad_norm": 0.00018895250104833394, + "learning_rate": 7.749160082787283e-05, + "loss": 0.0001, + "num_input_tokens_seen": 221793104, + "step": 102785 + }, + { + "epoch": 16.768352365415986, + "grad_norm": 0.02117677964270115, + "learning_rate": 7.745354242806884e-05, + "loss": 0.0006, + "num_input_tokens_seen": 221804400, + "step": 102790 + }, + { + "epoch": 16.76916802610114, + "grad_norm": 0.0480305440723896, + "learning_rate": 7.741549259170483e-05, + "loss": 0.0021, + "num_input_tokens_seen": 221814416, + "step": 102795 + }, + { + "epoch": 16.769983686786297, + "grad_norm": 0.0031217029318213463, + "learning_rate": 7.737745131955192e-05, + "loss": 0.0003, + "num_input_tokens_seen": 221824528, + "step": 102800 + }, + { + "epoch": 16.770799347471453, + "grad_norm": 0.0017456536879763007, + "learning_rate": 7.733941861238114e-05, + "loss": 0.001, + "num_input_tokens_seen": 221836336, + "step": 102805 + }, + { + "epoch": 16.77161500815661, + "grad_norm": 0.0003405441530048847, + "learning_rate": 7.730139447096319e-05, + "loss": 0.0007, + "num_input_tokens_seen": 221847632, + "step": 102810 + }, + { + "epoch": 16.77243066884176, + "grad_norm": 0.0019163109827786684, + "learning_rate": 7.726337889606861e-05, + "loss": 0.0007, + "num_input_tokens_seen": 221859248, + "step": 102815 + }, + { + "epoch": 16.773246329526916, + "grad_norm": 0.00017169537022709846, + "learning_rate": 7.722537188846817e-05, + "loss": 0.0002, + "num_input_tokens_seen": 221870768, + "step": 102820 + }, + { + "epoch": 16.774061990212072, + "grad_norm": 0.009273098781704903, + "learning_rate": 7.718737344893167e-05, + "loss": 0.004, + "num_input_tokens_seen": 221882064, + "step": 102825 + }, + { + "epoch": 16.774877650897228, + "grad_norm": 0.009518054313957691, + "learning_rate": 7.714938357822965e-05, + "loss": 0.0006, + "num_input_tokens_seen": 221893072, + "step": 102830 + }, + { + "epoch": 16.775693311582383, + "grad_norm": 0.006086020264774561, + "learning_rate": 7.711140227713154e-05, + "loss": 0.002, + "num_input_tokens_seen": 221903984, + "step": 102835 + }, + { + "epoch": 16.776508972267536, + "grad_norm": 0.006843290291726589, + "learning_rate": 7.70734295464075e-05, + "loss": 0.0003, + "num_input_tokens_seen": 221913936, + "step": 102840 + }, + { + "epoch": 16.77732463295269, + "grad_norm": 0.0003682865353766829, + "learning_rate": 7.703546538682688e-05, + "loss": 0.0005, + "num_input_tokens_seen": 221923280, + "step": 102845 + }, + { + "epoch": 16.778140293637847, + "grad_norm": 0.011014909483492374, + "learning_rate": 7.699750979915915e-05, + "loss": 0.0024, + "num_input_tokens_seen": 221932944, + "step": 102850 + }, + { + "epoch": 16.778955954323003, + "grad_norm": 8.551344944862649e-05, + "learning_rate": 7.695956278417349e-05, + "loss": 0.0014, + "num_input_tokens_seen": 221944560, + "step": 102855 + }, + { + "epoch": 16.77977161500816, + "grad_norm": 0.00014704835484735668, + "learning_rate": 7.692162434263894e-05, + "loss": 0.0006, + "num_input_tokens_seen": 221955984, + "step": 102860 + }, + { + "epoch": 16.78058727569331, + "grad_norm": 0.03947598859667778, + "learning_rate": 7.688369447532444e-05, + "loss": 0.0014, + "num_input_tokens_seen": 221967248, + "step": 102865 + }, + { + "epoch": 16.781402936378466, + "grad_norm": 0.03489304706454277, + "learning_rate": 7.684577318299857e-05, + "loss": 0.0026, + "num_input_tokens_seen": 221978000, + "step": 102870 + }, + { + "epoch": 16.782218597063622, + "grad_norm": 0.0006088899099268019, + "learning_rate": 7.680786046642996e-05, + "loss": 0.0013, + "num_input_tokens_seen": 221987920, + "step": 102875 + }, + { + "epoch": 16.783034257748778, + "grad_norm": 0.024368641898036003, + "learning_rate": 7.676995632638689e-05, + "loss": 0.0035, + "num_input_tokens_seen": 221998576, + "step": 102880 + }, + { + "epoch": 16.78384991843393, + "grad_norm": 0.00010480960190761834, + "learning_rate": 7.67320607636376e-05, + "loss": 0.0009, + "num_input_tokens_seen": 222009168, + "step": 102885 + }, + { + "epoch": 16.784665579119086, + "grad_norm": 0.0017283828929066658, + "learning_rate": 7.669417377894999e-05, + "loss": 0.0006, + "num_input_tokens_seen": 222020048, + "step": 102890 + }, + { + "epoch": 16.78548123980424, + "grad_norm": 0.010564571246504784, + "learning_rate": 7.665629537309199e-05, + "loss": 0.0047, + "num_input_tokens_seen": 222032368, + "step": 102895 + }, + { + "epoch": 16.786296900489397, + "grad_norm": 0.2294703871011734, + "learning_rate": 7.661842554683124e-05, + "loss": 0.008, + "num_input_tokens_seen": 222043472, + "step": 102900 + }, + { + "epoch": 16.787112561174553, + "grad_norm": 0.03179721534252167, + "learning_rate": 7.658056430093512e-05, + "loss": 0.0075, + "num_input_tokens_seen": 222054320, + "step": 102905 + }, + { + "epoch": 16.787928221859705, + "grad_norm": 0.05790800228714943, + "learning_rate": 7.654271163617105e-05, + "loss": 0.0021, + "num_input_tokens_seen": 222064496, + "step": 102910 + }, + { + "epoch": 16.78874388254486, + "grad_norm": 0.009331142529845238, + "learning_rate": 7.650486755330616e-05, + "loss": 0.0006, + "num_input_tokens_seen": 222075152, + "step": 102915 + }, + { + "epoch": 16.789559543230016, + "grad_norm": 0.0030185102950781584, + "learning_rate": 7.646703205310718e-05, + "loss": 0.0004, + "num_input_tokens_seen": 222086416, + "step": 102920 + }, + { + "epoch": 16.790375203915172, + "grad_norm": 0.0008598689455538988, + "learning_rate": 7.642920513634138e-05, + "loss": 0.0011, + "num_input_tokens_seen": 222097808, + "step": 102925 + }, + { + "epoch": 16.791190864600328, + "grad_norm": 0.013979151844978333, + "learning_rate": 7.639138680377478e-05, + "loss": 0.0006, + "num_input_tokens_seen": 222108144, + "step": 102930 + }, + { + "epoch": 16.79200652528548, + "grad_norm": 0.014011146500706673, + "learning_rate": 7.63535770561744e-05, + "loss": 0.0012, + "num_input_tokens_seen": 222119184, + "step": 102935 + }, + { + "epoch": 16.792822185970635, + "grad_norm": 0.0002483447315171361, + "learning_rate": 7.631577589430593e-05, + "loss": 0.0175, + "num_input_tokens_seen": 222130384, + "step": 102940 + }, + { + "epoch": 16.79363784665579, + "grad_norm": 0.4877093732357025, + "learning_rate": 7.627798331893604e-05, + "loss": 0.0301, + "num_input_tokens_seen": 222140720, + "step": 102945 + }, + { + "epoch": 16.794453507340947, + "grad_norm": 0.0060813468880951405, + "learning_rate": 7.62401993308301e-05, + "loss": 0.001, + "num_input_tokens_seen": 222152400, + "step": 102950 + }, + { + "epoch": 16.795269168026103, + "grad_norm": 0.00010451649723108858, + "learning_rate": 7.620242393075432e-05, + "loss": 0.0152, + "num_input_tokens_seen": 222164144, + "step": 102955 + }, + { + "epoch": 16.796084828711255, + "grad_norm": 0.11082665622234344, + "learning_rate": 7.61646571194738e-05, + "loss": 0.0028, + "num_input_tokens_seen": 222175088, + "step": 102960 + }, + { + "epoch": 16.79690048939641, + "grad_norm": 0.00024645525263622403, + "learning_rate": 7.612689889775443e-05, + "loss": 0.0007, + "num_input_tokens_seen": 222186256, + "step": 102965 + }, + { + "epoch": 16.797716150081566, + "grad_norm": 0.00013762478192802519, + "learning_rate": 7.60891492663609e-05, + "loss": 0.0003, + "num_input_tokens_seen": 222197744, + "step": 102970 + }, + { + "epoch": 16.798531810766722, + "grad_norm": 0.00016829151718411595, + "learning_rate": 7.605140822605883e-05, + "loss": 0.0007, + "num_input_tokens_seen": 222209840, + "step": 102975 + }, + { + "epoch": 16.799347471451878, + "grad_norm": 0.0014180229045450687, + "learning_rate": 7.601367577761248e-05, + "loss": 0.0008, + "num_input_tokens_seen": 222220592, + "step": 102980 + }, + { + "epoch": 16.80016313213703, + "grad_norm": 0.0034764339216053486, + "learning_rate": 7.597595192178702e-05, + "loss": 0.0008, + "num_input_tokens_seen": 222232048, + "step": 102985 + }, + { + "epoch": 16.800978792822185, + "grad_norm": 0.0035230170469731092, + "learning_rate": 7.59382366593468e-05, + "loss": 0.0004, + "num_input_tokens_seen": 222243120, + "step": 102990 + }, + { + "epoch": 16.80179445350734, + "grad_norm": 0.0007919471827335656, + "learning_rate": 7.590052999105618e-05, + "loss": 0.0006, + "num_input_tokens_seen": 222254288, + "step": 102995 + }, + { + "epoch": 16.802610114192497, + "grad_norm": 0.00017622063751332462, + "learning_rate": 7.586283191767929e-05, + "loss": 0.0003, + "num_input_tokens_seen": 222265712, + "step": 103000 + }, + { + "epoch": 16.803425774877653, + "grad_norm": 0.01508649718016386, + "learning_rate": 7.582514243998023e-05, + "loss": 0.0918, + "num_input_tokens_seen": 222277424, + "step": 103005 + }, + { + "epoch": 16.804241435562805, + "grad_norm": 0.0072076586075127125, + "learning_rate": 7.578746155872268e-05, + "loss": 0.0007, + "num_input_tokens_seen": 222286864, + "step": 103010 + }, + { + "epoch": 16.80505709624796, + "grad_norm": 0.0006490631494671106, + "learning_rate": 7.574978927467046e-05, + "loss": 0.0002, + "num_input_tokens_seen": 222297584, + "step": 103015 + }, + { + "epoch": 16.805872756933116, + "grad_norm": 0.00044445376261137426, + "learning_rate": 7.571212558858692e-05, + "loss": 0.0001, + "num_input_tokens_seen": 222308400, + "step": 103020 + }, + { + "epoch": 16.806688417618272, + "grad_norm": 0.01742391288280487, + "learning_rate": 7.567447050123538e-05, + "loss": 0.1135, + "num_input_tokens_seen": 222319184, + "step": 103025 + }, + { + "epoch": 16.807504078303424, + "grad_norm": 0.00017139030387625098, + "learning_rate": 7.563682401337901e-05, + "loss": 0.0016, + "num_input_tokens_seen": 222329360, + "step": 103030 + }, + { + "epoch": 16.80831973898858, + "grad_norm": 0.19950053095817566, + "learning_rate": 7.559918612578065e-05, + "loss": 0.0059, + "num_input_tokens_seen": 222338832, + "step": 103035 + }, + { + "epoch": 16.809135399673735, + "grad_norm": 0.010523505508899689, + "learning_rate": 7.55615568392034e-05, + "loss": 0.0005, + "num_input_tokens_seen": 222348944, + "step": 103040 + }, + { + "epoch": 16.80995106035889, + "grad_norm": 0.0014753983123227954, + "learning_rate": 7.552393615440939e-05, + "loss": 0.0004, + "num_input_tokens_seen": 222360592, + "step": 103045 + }, + { + "epoch": 16.810766721044047, + "grad_norm": 0.010529310442507267, + "learning_rate": 7.548632407216155e-05, + "loss": 0.0005, + "num_input_tokens_seen": 222370832, + "step": 103050 + }, + { + "epoch": 16.8115823817292, + "grad_norm": 0.00023861821682658046, + "learning_rate": 7.544872059322161e-05, + "loss": 0.0003, + "num_input_tokens_seen": 222381552, + "step": 103055 + }, + { + "epoch": 16.812398042414355, + "grad_norm": 0.0016882512718439102, + "learning_rate": 7.541112571835218e-05, + "loss": 0.0002, + "num_input_tokens_seen": 222393296, + "step": 103060 + }, + { + "epoch": 16.81321370309951, + "grad_norm": 0.018565330654382706, + "learning_rate": 7.537353944831471e-05, + "loss": 0.0006, + "num_input_tokens_seen": 222404144, + "step": 103065 + }, + { + "epoch": 16.814029363784666, + "grad_norm": 0.004279870539903641, + "learning_rate": 7.533596178387136e-05, + "loss": 0.0015, + "num_input_tokens_seen": 222414704, + "step": 103070 + }, + { + "epoch": 16.81484502446982, + "grad_norm": 0.0002945191808976233, + "learning_rate": 7.529839272578326e-05, + "loss": 0.001, + "num_input_tokens_seen": 222422992, + "step": 103075 + }, + { + "epoch": 16.815660685154974, + "grad_norm": 0.022279933094978333, + "learning_rate": 7.526083227481223e-05, + "loss": 0.0027, + "num_input_tokens_seen": 222433552, + "step": 103080 + }, + { + "epoch": 16.81647634584013, + "grad_norm": 0.0125409085303545, + "learning_rate": 7.522328043171899e-05, + "loss": 0.0003, + "num_input_tokens_seen": 222443984, + "step": 103085 + }, + { + "epoch": 16.817292006525285, + "grad_norm": 0.03136913850903511, + "learning_rate": 7.518573719726507e-05, + "loss": 0.0044, + "num_input_tokens_seen": 222453968, + "step": 103090 + }, + { + "epoch": 16.81810766721044, + "grad_norm": 0.019675221294164658, + "learning_rate": 7.514820257221088e-05, + "loss": 0.0007, + "num_input_tokens_seen": 222465424, + "step": 103095 + }, + { + "epoch": 16.818923327895597, + "grad_norm": 0.02000470459461212, + "learning_rate": 7.511067655731757e-05, + "loss": 0.0011, + "num_input_tokens_seen": 222476112, + "step": 103100 + }, + { + "epoch": 16.81973898858075, + "grad_norm": 0.0007504420937038958, + "learning_rate": 7.507315915334517e-05, + "loss": 0.0185, + "num_input_tokens_seen": 222485648, + "step": 103105 + }, + { + "epoch": 16.820554649265905, + "grad_norm": 0.01389289740473032, + "learning_rate": 7.503565036105447e-05, + "loss": 0.001, + "num_input_tokens_seen": 222496816, + "step": 103110 + }, + { + "epoch": 16.82137030995106, + "grad_norm": 0.0005339086637832224, + "learning_rate": 7.49981501812052e-05, + "loss": 0.0016, + "num_input_tokens_seen": 222508176, + "step": 103115 + }, + { + "epoch": 16.822185970636216, + "grad_norm": 0.0015223042573779821, + "learning_rate": 7.496065861455786e-05, + "loss": 0.0028, + "num_input_tokens_seen": 222517616, + "step": 103120 + }, + { + "epoch": 16.82300163132137, + "grad_norm": 0.0004307095077820122, + "learning_rate": 7.492317566187167e-05, + "loss": 0.023, + "num_input_tokens_seen": 222528784, + "step": 103125 + }, + { + "epoch": 16.823817292006524, + "grad_norm": 0.004094603005796671, + "learning_rate": 7.48857013239067e-05, + "loss": 0.0006, + "num_input_tokens_seen": 222539920, + "step": 103130 + }, + { + "epoch": 16.82463295269168, + "grad_norm": 0.0010909035336226225, + "learning_rate": 7.484823560142235e-05, + "loss": 0.0013, + "num_input_tokens_seen": 222551088, + "step": 103135 + }, + { + "epoch": 16.825448613376835, + "grad_norm": 0.0008549643098376691, + "learning_rate": 7.481077849517776e-05, + "loss": 0.0017, + "num_input_tokens_seen": 222562640, + "step": 103140 + }, + { + "epoch": 16.82626427406199, + "grad_norm": 0.003835807554423809, + "learning_rate": 7.477333000593218e-05, + "loss": 0.0342, + "num_input_tokens_seen": 222572880, + "step": 103145 + }, + { + "epoch": 16.827079934747147, + "grad_norm": 0.00484345480799675, + "learning_rate": 7.473589013444449e-05, + "loss": 0.0006, + "num_input_tokens_seen": 222584208, + "step": 103150 + }, + { + "epoch": 16.8278955954323, + "grad_norm": 0.004479347262531519, + "learning_rate": 7.469845888147348e-05, + "loss": 0.0003, + "num_input_tokens_seen": 222593840, + "step": 103155 + }, + { + "epoch": 16.828711256117455, + "grad_norm": 0.2004682421684265, + "learning_rate": 7.466103624777776e-05, + "loss": 0.0046, + "num_input_tokens_seen": 222605808, + "step": 103160 + }, + { + "epoch": 16.82952691680261, + "grad_norm": 0.00037271747714839876, + "learning_rate": 7.462362223411568e-05, + "loss": 0.0002, + "num_input_tokens_seen": 222617200, + "step": 103165 + }, + { + "epoch": 16.830342577487766, + "grad_norm": 0.0797378271818161, + "learning_rate": 7.458621684124556e-05, + "loss": 0.0124, + "num_input_tokens_seen": 222626448, + "step": 103170 + }, + { + "epoch": 16.83115823817292, + "grad_norm": 0.03227737545967102, + "learning_rate": 7.454882006992541e-05, + "loss": 0.0049, + "num_input_tokens_seen": 222638032, + "step": 103175 + }, + { + "epoch": 16.831973898858074, + "grad_norm": 0.011967415921390057, + "learning_rate": 7.451143192091304e-05, + "loss": 0.0261, + "num_input_tokens_seen": 222650224, + "step": 103180 + }, + { + "epoch": 16.83278955954323, + "grad_norm": 0.00020868501451332122, + "learning_rate": 7.447405239496646e-05, + "loss": 0.0003, + "num_input_tokens_seen": 222660880, + "step": 103185 + }, + { + "epoch": 16.833605220228385, + "grad_norm": 0.00012900945148430765, + "learning_rate": 7.443668149284289e-05, + "loss": 0.0011, + "num_input_tokens_seen": 222673072, + "step": 103190 + }, + { + "epoch": 16.83442088091354, + "grad_norm": 0.006847749464213848, + "learning_rate": 7.439931921529996e-05, + "loss": 0.0027, + "num_input_tokens_seen": 222684304, + "step": 103195 + }, + { + "epoch": 16.835236541598697, + "grad_norm": 0.0048612672835588455, + "learning_rate": 7.436196556309454e-05, + "loss": 0.021, + "num_input_tokens_seen": 222695216, + "step": 103200 + }, + { + "epoch": 16.83605220228385, + "grad_norm": 0.0342845655977726, + "learning_rate": 7.432462053698413e-05, + "loss": 0.0141, + "num_input_tokens_seen": 222705776, + "step": 103205 + }, + { + "epoch": 16.836867862969005, + "grad_norm": 0.01872059889137745, + "learning_rate": 7.428728413772502e-05, + "loss": 0.0012, + "num_input_tokens_seen": 222716720, + "step": 103210 + }, + { + "epoch": 16.83768352365416, + "grad_norm": 0.022978441789746284, + "learning_rate": 7.42499563660744e-05, + "loss": 0.0071, + "num_input_tokens_seen": 222726896, + "step": 103215 + }, + { + "epoch": 16.838499184339316, + "grad_norm": 0.013515046797692776, + "learning_rate": 7.421263722278826e-05, + "loss": 0.0017, + "num_input_tokens_seen": 222736816, + "step": 103220 + }, + { + "epoch": 16.839314845024468, + "grad_norm": 0.019652139395475388, + "learning_rate": 7.417532670862343e-05, + "loss": 0.0028, + "num_input_tokens_seen": 222746448, + "step": 103225 + }, + { + "epoch": 16.840130505709624, + "grad_norm": 0.0009630456916056573, + "learning_rate": 7.413802482433557e-05, + "loss": 0.0807, + "num_input_tokens_seen": 222756816, + "step": 103230 + }, + { + "epoch": 16.84094616639478, + "grad_norm": 0.024600954726338387, + "learning_rate": 7.41007315706811e-05, + "loss": 0.0006, + "num_input_tokens_seen": 222768400, + "step": 103235 + }, + { + "epoch": 16.841761827079935, + "grad_norm": 0.005003143567591906, + "learning_rate": 7.406344694841538e-05, + "loss": 0.0005, + "num_input_tokens_seen": 222778736, + "step": 103240 + }, + { + "epoch": 16.84257748776509, + "grad_norm": 0.0010466438252478838, + "learning_rate": 7.402617095829434e-05, + "loss": 0.0022, + "num_input_tokens_seen": 222788528, + "step": 103245 + }, + { + "epoch": 16.843393148450243, + "grad_norm": 0.7211628556251526, + "learning_rate": 7.398890360107336e-05, + "loss": 0.0211, + "num_input_tokens_seen": 222799216, + "step": 103250 + }, + { + "epoch": 16.8442088091354, + "grad_norm": 0.0004959175712428987, + "learning_rate": 7.395164487750766e-05, + "loss": 0.0064, + "num_input_tokens_seen": 222810032, + "step": 103255 + }, + { + "epoch": 16.845024469820554, + "grad_norm": 0.0010244646109640598, + "learning_rate": 7.391439478835233e-05, + "loss": 0.0019, + "num_input_tokens_seen": 222819664, + "step": 103260 + }, + { + "epoch": 16.84584013050571, + "grad_norm": 0.00035454294993542135, + "learning_rate": 7.387715333436235e-05, + "loss": 0.0004, + "num_input_tokens_seen": 222832016, + "step": 103265 + }, + { + "epoch": 16.846655791190866, + "grad_norm": 0.0006541311158798635, + "learning_rate": 7.383992051629246e-05, + "loss": 0.0008, + "num_input_tokens_seen": 222841936, + "step": 103270 + }, + { + "epoch": 16.847471451876018, + "grad_norm": 0.025863567367196083, + "learning_rate": 7.380269633489717e-05, + "loss": 0.0008, + "num_input_tokens_seen": 222851248, + "step": 103275 + }, + { + "epoch": 16.848287112561174, + "grad_norm": 0.003879915690049529, + "learning_rate": 7.376548079093087e-05, + "loss": 0.0004, + "num_input_tokens_seen": 222861584, + "step": 103280 + }, + { + "epoch": 16.84910277324633, + "grad_norm": 0.0017584466841071844, + "learning_rate": 7.372827388514792e-05, + "loss": 0.0042, + "num_input_tokens_seen": 222872720, + "step": 103285 + }, + { + "epoch": 16.849918433931485, + "grad_norm": 0.00012636043538805097, + "learning_rate": 7.369107561830218e-05, + "loss": 0.0053, + "num_input_tokens_seen": 222883952, + "step": 103290 + }, + { + "epoch": 16.85073409461664, + "grad_norm": 0.004172631539404392, + "learning_rate": 7.365388599114764e-05, + "loss": 0.0028, + "num_input_tokens_seen": 222893392, + "step": 103295 + }, + { + "epoch": 16.851549755301793, + "grad_norm": 0.0009082440519705415, + "learning_rate": 7.361670500443796e-05, + "loss": 0.0005, + "num_input_tokens_seen": 222903856, + "step": 103300 + }, + { + "epoch": 16.85236541598695, + "grad_norm": 0.0008943970315158367, + "learning_rate": 7.357953265892665e-05, + "loss": 0.0002, + "num_input_tokens_seen": 222914288, + "step": 103305 + }, + { + "epoch": 16.853181076672104, + "grad_norm": 0.0002977423428092152, + "learning_rate": 7.354236895536704e-05, + "loss": 0.03, + "num_input_tokens_seen": 222925264, + "step": 103310 + }, + { + "epoch": 16.85399673735726, + "grad_norm": 0.0017433040775358677, + "learning_rate": 7.350521389451231e-05, + "loss": 0.0003, + "num_input_tokens_seen": 222935696, + "step": 103315 + }, + { + "epoch": 16.854812398042416, + "grad_norm": 0.014422601088881493, + "learning_rate": 7.346806747711554e-05, + "loss": 0.0023, + "num_input_tokens_seen": 222946864, + "step": 103320 + }, + { + "epoch": 16.855628058727568, + "grad_norm": 0.00016869604587554932, + "learning_rate": 7.343092970392929e-05, + "loss": 0.0003, + "num_input_tokens_seen": 222957840, + "step": 103325 + }, + { + "epoch": 16.856443719412724, + "grad_norm": 0.0013907680986449122, + "learning_rate": 7.339380057570666e-05, + "loss": 0.0014, + "num_input_tokens_seen": 222967632, + "step": 103330 + }, + { + "epoch": 16.85725938009788, + "grad_norm": 0.0005310530541464686, + "learning_rate": 7.335668009319962e-05, + "loss": 0.0006, + "num_input_tokens_seen": 222978416, + "step": 103335 + }, + { + "epoch": 16.858075040783035, + "grad_norm": 0.001258755219168961, + "learning_rate": 7.331956825716091e-05, + "loss": 0.0005, + "num_input_tokens_seen": 222989616, + "step": 103340 + }, + { + "epoch": 16.85889070146819, + "grad_norm": 9.26154971239157e-05, + "learning_rate": 7.328246506834224e-05, + "loss": 0.0008, + "num_input_tokens_seen": 223000048, + "step": 103345 + }, + { + "epoch": 16.859706362153343, + "grad_norm": 0.00016912099090404809, + "learning_rate": 7.32453705274958e-05, + "loss": 0.0098, + "num_input_tokens_seen": 223009872, + "step": 103350 + }, + { + "epoch": 16.8605220228385, + "grad_norm": 0.005508676171302795, + "learning_rate": 7.320828463537333e-05, + "loss": 0.0005, + "num_input_tokens_seen": 223019600, + "step": 103355 + }, + { + "epoch": 16.861337683523654, + "grad_norm": 0.03821824491024017, + "learning_rate": 7.317120739272643e-05, + "loss": 0.0016, + "num_input_tokens_seen": 223030128, + "step": 103360 + }, + { + "epoch": 16.86215334420881, + "grad_norm": 0.00012856218381784856, + "learning_rate": 7.313413880030645e-05, + "loss": 0.0001, + "num_input_tokens_seen": 223040528, + "step": 103365 + }, + { + "epoch": 16.862969004893966, + "grad_norm": 0.0003440727887209505, + "learning_rate": 7.309707885886462e-05, + "loss": 0.0014, + "num_input_tokens_seen": 223051664, + "step": 103370 + }, + { + "epoch": 16.863784665579118, + "grad_norm": 0.016444187611341476, + "learning_rate": 7.306002756915214e-05, + "loss": 0.0036, + "num_input_tokens_seen": 223062896, + "step": 103375 + }, + { + "epoch": 16.864600326264274, + "grad_norm": 0.00038786110235378146, + "learning_rate": 7.302298493191972e-05, + "loss": 0.0017, + "num_input_tokens_seen": 223073616, + "step": 103380 + }, + { + "epoch": 16.86541598694943, + "grad_norm": 0.0009466250194236636, + "learning_rate": 7.298595094791826e-05, + "loss": 0.0019, + "num_input_tokens_seen": 223085648, + "step": 103385 + }, + { + "epoch": 16.866231647634585, + "grad_norm": 0.002305264351889491, + "learning_rate": 7.294892561789817e-05, + "loss": 0.0012, + "num_input_tokens_seen": 223095152, + "step": 103390 + }, + { + "epoch": 16.86704730831974, + "grad_norm": 0.0002634226402733475, + "learning_rate": 7.291190894260985e-05, + "loss": 0.0003, + "num_input_tokens_seen": 223106544, + "step": 103395 + }, + { + "epoch": 16.867862969004893, + "grad_norm": 0.0008474920759908855, + "learning_rate": 7.287490092280346e-05, + "loss": 0.0002, + "num_input_tokens_seen": 223116912, + "step": 103400 + }, + { + "epoch": 16.86867862969005, + "grad_norm": 0.0023164900485426188, + "learning_rate": 7.28379015592291e-05, + "loss": 0.0005, + "num_input_tokens_seen": 223128752, + "step": 103405 + }, + { + "epoch": 16.869494290375204, + "grad_norm": 0.002073081210255623, + "learning_rate": 7.280091085263657e-05, + "loss": 0.0003, + "num_input_tokens_seen": 223139152, + "step": 103410 + }, + { + "epoch": 16.87030995106036, + "grad_norm": 0.002842012792825699, + "learning_rate": 7.276392880377548e-05, + "loss": 0.0007, + "num_input_tokens_seen": 223150000, + "step": 103415 + }, + { + "epoch": 16.871125611745512, + "grad_norm": 0.019670799374580383, + "learning_rate": 7.27269554133954e-05, + "loss": 0.0016, + "num_input_tokens_seen": 223161424, + "step": 103420 + }, + { + "epoch": 16.871941272430668, + "grad_norm": 0.001242808997631073, + "learning_rate": 7.268999068224557e-05, + "loss": 0.002, + "num_input_tokens_seen": 223173136, + "step": 103425 + }, + { + "epoch": 16.872756933115824, + "grad_norm": 0.0010782487224787474, + "learning_rate": 7.265303461107519e-05, + "loss": 0.0004, + "num_input_tokens_seen": 223183536, + "step": 103430 + }, + { + "epoch": 16.87357259380098, + "grad_norm": 0.00033626245567575097, + "learning_rate": 7.261608720063317e-05, + "loss": 0.0005, + "num_input_tokens_seen": 223195280, + "step": 103435 + }, + { + "epoch": 16.874388254486135, + "grad_norm": 0.0005982128204777837, + "learning_rate": 7.25791484516683e-05, + "loss": 0.0008, + "num_input_tokens_seen": 223204592, + "step": 103440 + }, + { + "epoch": 16.875203915171287, + "grad_norm": 0.002980044111609459, + "learning_rate": 7.254221836492925e-05, + "loss": 0.0017, + "num_input_tokens_seen": 223214928, + "step": 103445 + }, + { + "epoch": 16.876019575856443, + "grad_norm": 0.0007717745611444116, + "learning_rate": 7.250529694116436e-05, + "loss": 0.0008, + "num_input_tokens_seen": 223225840, + "step": 103450 + }, + { + "epoch": 16.8768352365416, + "grad_norm": 0.027375217527151108, + "learning_rate": 7.246838418112189e-05, + "loss": 0.0008, + "num_input_tokens_seen": 223236240, + "step": 103455 + }, + { + "epoch": 16.877650897226754, + "grad_norm": 0.0023580349516123533, + "learning_rate": 7.243148008555017e-05, + "loss": 0.0016, + "num_input_tokens_seen": 223247408, + "step": 103460 + }, + { + "epoch": 16.87846655791191, + "grad_norm": 0.008660969324409962, + "learning_rate": 7.239458465519672e-05, + "loss": 0.0014, + "num_input_tokens_seen": 223257968, + "step": 103465 + }, + { + "epoch": 16.879282218597062, + "grad_norm": 0.1821218729019165, + "learning_rate": 7.235769789080954e-05, + "loss": 0.0089, + "num_input_tokens_seen": 223267376, + "step": 103470 + }, + { + "epoch": 16.880097879282218, + "grad_norm": 0.0006848404300399125, + "learning_rate": 7.232081979313615e-05, + "loss": 0.0737, + "num_input_tokens_seen": 223277680, + "step": 103475 + }, + { + "epoch": 16.880913539967374, + "grad_norm": 0.056093502789735794, + "learning_rate": 7.228395036292384e-05, + "loss": 0.0013, + "num_input_tokens_seen": 223289040, + "step": 103480 + }, + { + "epoch": 16.88172920065253, + "grad_norm": 0.0009353241766802967, + "learning_rate": 7.224708960091992e-05, + "loss": 0.0021, + "num_input_tokens_seen": 223300848, + "step": 103485 + }, + { + "epoch": 16.882544861337685, + "grad_norm": 0.001607456710189581, + "learning_rate": 7.221023750787136e-05, + "loss": 0.0019, + "num_input_tokens_seen": 223311696, + "step": 103490 + }, + { + "epoch": 16.883360522022837, + "grad_norm": 0.0019028940005227923, + "learning_rate": 7.217339408452505e-05, + "loss": 0.0005, + "num_input_tokens_seen": 223322640, + "step": 103495 + }, + { + "epoch": 16.884176182707993, + "grad_norm": 0.0003582093049772084, + "learning_rate": 7.21365593316276e-05, + "loss": 0.0011, + "num_input_tokens_seen": 223332656, + "step": 103500 + }, + { + "epoch": 16.88499184339315, + "grad_norm": 0.00021734688198193908, + "learning_rate": 7.209973324992558e-05, + "loss": 0.0041, + "num_input_tokens_seen": 223344016, + "step": 103505 + }, + { + "epoch": 16.885807504078304, + "grad_norm": 0.13429728150367737, + "learning_rate": 7.206291584016533e-05, + "loss": 0.0034, + "num_input_tokens_seen": 223354608, + "step": 103510 + }, + { + "epoch": 16.88662316476346, + "grad_norm": 0.028307495638728142, + "learning_rate": 7.202610710309293e-05, + "loss": 0.0017, + "num_input_tokens_seen": 223366160, + "step": 103515 + }, + { + "epoch": 16.887438825448612, + "grad_norm": 0.00011590484791668132, + "learning_rate": 7.198930703945439e-05, + "loss": 0.0003, + "num_input_tokens_seen": 223377648, + "step": 103520 + }, + { + "epoch": 16.888254486133768, + "grad_norm": 0.00015653629088774323, + "learning_rate": 7.19525156499955e-05, + "loss": 0.0005, + "num_input_tokens_seen": 223389424, + "step": 103525 + }, + { + "epoch": 16.889070146818923, + "grad_norm": 0.1353372037410736, + "learning_rate": 7.191573293546195e-05, + "loss": 0.0063, + "num_input_tokens_seen": 223400912, + "step": 103530 + }, + { + "epoch": 16.88988580750408, + "grad_norm": 0.07635101675987244, + "learning_rate": 7.187895889659906e-05, + "loss": 0.0027, + "num_input_tokens_seen": 223411728, + "step": 103535 + }, + { + "epoch": 16.890701468189235, + "grad_norm": 0.00027066280017606914, + "learning_rate": 7.184219353415228e-05, + "loss": 0.0016, + "num_input_tokens_seen": 223422672, + "step": 103540 + }, + { + "epoch": 16.891517128874387, + "grad_norm": 0.0002334895107196644, + "learning_rate": 7.180543684886654e-05, + "loss": 0.0014, + "num_input_tokens_seen": 223434544, + "step": 103545 + }, + { + "epoch": 16.892332789559543, + "grad_norm": 0.0005206402856856585, + "learning_rate": 7.176868884148679e-05, + "loss": 0.0012, + "num_input_tokens_seen": 223445424, + "step": 103550 + }, + { + "epoch": 16.8931484502447, + "grad_norm": 0.03776264935731888, + "learning_rate": 7.173194951275786e-05, + "loss": 0.0032, + "num_input_tokens_seen": 223455536, + "step": 103555 + }, + { + "epoch": 16.893964110929854, + "grad_norm": 0.02247370220720768, + "learning_rate": 7.169521886342417e-05, + "loss": 0.0017, + "num_input_tokens_seen": 223465296, + "step": 103560 + }, + { + "epoch": 16.894779771615006, + "grad_norm": 0.09824686497449875, + "learning_rate": 7.165849689423043e-05, + "loss": 0.0019, + "num_input_tokens_seen": 223477008, + "step": 103565 + }, + { + "epoch": 16.895595432300162, + "grad_norm": 0.0005710441037081182, + "learning_rate": 7.162178360592037e-05, + "loss": 0.0003, + "num_input_tokens_seen": 223488272, + "step": 103570 + }, + { + "epoch": 16.896411092985318, + "grad_norm": 0.1265021562576294, + "learning_rate": 7.15850789992386e-05, + "loss": 0.003, + "num_input_tokens_seen": 223498640, + "step": 103575 + }, + { + "epoch": 16.897226753670473, + "grad_norm": 0.004714385140687227, + "learning_rate": 7.154838307492839e-05, + "loss": 0.0053, + "num_input_tokens_seen": 223510384, + "step": 103580 + }, + { + "epoch": 16.89804241435563, + "grad_norm": 0.0001558868825668469, + "learning_rate": 7.151169583373402e-05, + "loss": 0.0011, + "num_input_tokens_seen": 223520784, + "step": 103585 + }, + { + "epoch": 16.898858075040785, + "grad_norm": 0.00026034636539407074, + "learning_rate": 7.147501727639844e-05, + "loss": 0.0023, + "num_input_tokens_seen": 223530800, + "step": 103590 + }, + { + "epoch": 16.899673735725937, + "grad_norm": 0.0004300115106161684, + "learning_rate": 7.14383474036655e-05, + "loss": 0.0005, + "num_input_tokens_seen": 223541904, + "step": 103595 + }, + { + "epoch": 16.900489396411093, + "grad_norm": 0.16262738406658173, + "learning_rate": 7.140168621627786e-05, + "loss": 0.0031, + "num_input_tokens_seen": 223552688, + "step": 103600 + }, + { + "epoch": 16.90130505709625, + "grad_norm": 0.01837944984436035, + "learning_rate": 7.136503371497888e-05, + "loss": 0.0016, + "num_input_tokens_seen": 223563216, + "step": 103605 + }, + { + "epoch": 16.902120717781404, + "grad_norm": 0.00023849746503401548, + "learning_rate": 7.132838990051132e-05, + "loss": 0.0012, + "num_input_tokens_seen": 223574192, + "step": 103610 + }, + { + "epoch": 16.902936378466556, + "grad_norm": 0.0010010668775066733, + "learning_rate": 7.129175477361766e-05, + "loss": 0.0013, + "num_input_tokens_seen": 223583248, + "step": 103615 + }, + { + "epoch": 16.903752039151712, + "grad_norm": 0.0004666231106966734, + "learning_rate": 7.125512833504049e-05, + "loss": 0.0001, + "num_input_tokens_seen": 223593136, + "step": 103620 + }, + { + "epoch": 16.904567699836868, + "grad_norm": 0.03831160441040993, + "learning_rate": 7.121851058552209e-05, + "loss": 0.0006, + "num_input_tokens_seen": 223604240, + "step": 103625 + }, + { + "epoch": 16.905383360522023, + "grad_norm": 1.1400467157363892, + "learning_rate": 7.118190152580444e-05, + "loss": 0.0443, + "num_input_tokens_seen": 223616016, + "step": 103630 + }, + { + "epoch": 16.90619902120718, + "grad_norm": 0.009199860505759716, + "learning_rate": 7.114530115662959e-05, + "loss": 0.0006, + "num_input_tokens_seen": 223627984, + "step": 103635 + }, + { + "epoch": 16.90701468189233, + "grad_norm": 0.00014801163342781365, + "learning_rate": 7.110870947873926e-05, + "loss": 0.0012, + "num_input_tokens_seen": 223638544, + "step": 103640 + }, + { + "epoch": 16.907830342577487, + "grad_norm": 0.0002178381837438792, + "learning_rate": 7.107212649287497e-05, + "loss": 0.0026, + "num_input_tokens_seen": 223649168, + "step": 103645 + }, + { + "epoch": 16.908646003262643, + "grad_norm": 0.00028648535953834653, + "learning_rate": 7.103555219977825e-05, + "loss": 0.0007, + "num_input_tokens_seen": 223660176, + "step": 103650 + }, + { + "epoch": 16.9094616639478, + "grad_norm": 0.0001595184876350686, + "learning_rate": 7.099898660019016e-05, + "loss": 0.0073, + "num_input_tokens_seen": 223670672, + "step": 103655 + }, + { + "epoch": 16.910277324632954, + "grad_norm": 0.009725823067128658, + "learning_rate": 7.096242969485189e-05, + "loss": 0.0108, + "num_input_tokens_seen": 223681264, + "step": 103660 + }, + { + "epoch": 16.911092985318106, + "grad_norm": 0.0030517042614519596, + "learning_rate": 7.092588148450413e-05, + "loss": 0.0001, + "num_input_tokens_seen": 223691248, + "step": 103665 + }, + { + "epoch": 16.911908646003262, + "grad_norm": 0.19508150219917297, + "learning_rate": 7.088934196988795e-05, + "loss": 0.0035, + "num_input_tokens_seen": 223702576, + "step": 103670 + }, + { + "epoch": 16.912724306688418, + "grad_norm": 0.003291555680334568, + "learning_rate": 7.085281115174335e-05, + "loss": 0.0002, + "num_input_tokens_seen": 223714512, + "step": 103675 + }, + { + "epoch": 16.913539967373573, + "grad_norm": 0.00030334253096953034, + "learning_rate": 7.081628903081116e-05, + "loss": 0.0008, + "num_input_tokens_seen": 223725040, + "step": 103680 + }, + { + "epoch": 16.91435562805873, + "grad_norm": 0.012916112318634987, + "learning_rate": 7.077977560783117e-05, + "loss": 0.0004, + "num_input_tokens_seen": 223736560, + "step": 103685 + }, + { + "epoch": 16.91517128874388, + "grad_norm": 0.00027748465072363615, + "learning_rate": 7.074327088354371e-05, + "loss": 0.0007, + "num_input_tokens_seen": 223748176, + "step": 103690 + }, + { + "epoch": 16.915986949429037, + "grad_norm": 0.0005521825514733791, + "learning_rate": 7.070677485868821e-05, + "loss": 0.0003, + "num_input_tokens_seen": 223759088, + "step": 103695 + }, + { + "epoch": 16.916802610114193, + "grad_norm": 0.09723838418722153, + "learning_rate": 7.067028753400473e-05, + "loss": 0.0028, + "num_input_tokens_seen": 223770288, + "step": 103700 + }, + { + "epoch": 16.91761827079935, + "grad_norm": 1.0925171375274658, + "learning_rate": 7.06338089102323e-05, + "loss": 0.0371, + "num_input_tokens_seen": 223780976, + "step": 103705 + }, + { + "epoch": 16.918433931484504, + "grad_norm": 0.002950022928416729, + "learning_rate": 7.05973389881106e-05, + "loss": 0.0003, + "num_input_tokens_seen": 223792272, + "step": 103710 + }, + { + "epoch": 16.919249592169656, + "grad_norm": 0.00037494589923880994, + "learning_rate": 7.056087776837838e-05, + "loss": 0.0003, + "num_input_tokens_seen": 223804080, + "step": 103715 + }, + { + "epoch": 16.920065252854812, + "grad_norm": 0.00020330646657384932, + "learning_rate": 7.052442525177499e-05, + "loss": 0.0003, + "num_input_tokens_seen": 223815696, + "step": 103720 + }, + { + "epoch": 16.920880913539968, + "grad_norm": 0.00018692537560127676, + "learning_rate": 7.048798143903873e-05, + "loss": 0.0019, + "num_input_tokens_seen": 223826608, + "step": 103725 + }, + { + "epoch": 16.921696574225123, + "grad_norm": 0.0022232630290091038, + "learning_rate": 7.045154633090861e-05, + "loss": 0.0014, + "num_input_tokens_seen": 223838032, + "step": 103730 + }, + { + "epoch": 16.92251223491028, + "grad_norm": 0.0012517538852989674, + "learning_rate": 7.041511992812255e-05, + "loss": 0.0003, + "num_input_tokens_seen": 223848144, + "step": 103735 + }, + { + "epoch": 16.92332789559543, + "grad_norm": 0.11352401226758957, + "learning_rate": 7.037870223141935e-05, + "loss": 0.0018, + "num_input_tokens_seen": 223857360, + "step": 103740 + }, + { + "epoch": 16.924143556280587, + "grad_norm": 0.16355040669441223, + "learning_rate": 7.034229324153652e-05, + "loss": 0.0039, + "num_input_tokens_seen": 223868336, + "step": 103745 + }, + { + "epoch": 16.924959216965743, + "grad_norm": 0.02217322587966919, + "learning_rate": 7.030589295921224e-05, + "loss": 0.0026, + "num_input_tokens_seen": 223879504, + "step": 103750 + }, + { + "epoch": 16.9257748776509, + "grad_norm": 0.008536133915185928, + "learning_rate": 7.026950138518423e-05, + "loss": 0.0012, + "num_input_tokens_seen": 223890128, + "step": 103755 + }, + { + "epoch": 16.92659053833605, + "grad_norm": 0.002739880932494998, + "learning_rate": 7.023311852018988e-05, + "loss": 0.0005, + "num_input_tokens_seen": 223902352, + "step": 103760 + }, + { + "epoch": 16.927406199021206, + "grad_norm": 0.0017929644091054797, + "learning_rate": 7.019674436496653e-05, + "loss": 0.0897, + "num_input_tokens_seen": 223912464, + "step": 103765 + }, + { + "epoch": 16.928221859706362, + "grad_norm": 0.0001150484531535767, + "learning_rate": 7.01603789202515e-05, + "loss": 0.0019, + "num_input_tokens_seen": 223924688, + "step": 103770 + }, + { + "epoch": 16.929037520391518, + "grad_norm": 1.1168887615203857, + "learning_rate": 7.01240221867816e-05, + "loss": 0.0248, + "num_input_tokens_seen": 223936720, + "step": 103775 + }, + { + "epoch": 16.929853181076673, + "grad_norm": 0.00023720126773696393, + "learning_rate": 7.008767416529376e-05, + "loss": 0.0134, + "num_input_tokens_seen": 223947344, + "step": 103780 + }, + { + "epoch": 16.930668841761825, + "grad_norm": 0.000110608983959537, + "learning_rate": 7.00513348565246e-05, + "loss": 0.001, + "num_input_tokens_seen": 223958832, + "step": 103785 + }, + { + "epoch": 16.93148450244698, + "grad_norm": 0.04574248194694519, + "learning_rate": 7.001500426121055e-05, + "loss": 0.0094, + "num_input_tokens_seen": 223967248, + "step": 103790 + }, + { + "epoch": 16.932300163132137, + "grad_norm": 0.03339003771543503, + "learning_rate": 6.997868238008793e-05, + "loss": 0.0009, + "num_input_tokens_seen": 223979184, + "step": 103795 + }, + { + "epoch": 16.933115823817293, + "grad_norm": 0.041160427033901215, + "learning_rate": 6.994236921389268e-05, + "loss": 0.001, + "num_input_tokens_seen": 223989104, + "step": 103800 + }, + { + "epoch": 16.93393148450245, + "grad_norm": 0.0002784416719805449, + "learning_rate": 6.990606476336114e-05, + "loss": 0.0001, + "num_input_tokens_seen": 224000336, + "step": 103805 + }, + { + "epoch": 16.9347471451876, + "grad_norm": 0.0007323070312850177, + "learning_rate": 6.98697690292286e-05, + "loss": 0.0208, + "num_input_tokens_seen": 224011600, + "step": 103810 + }, + { + "epoch": 16.935562805872756, + "grad_norm": 0.00030831916956231, + "learning_rate": 6.983348201223105e-05, + "loss": 0.0002, + "num_input_tokens_seen": 224022320, + "step": 103815 + }, + { + "epoch": 16.936378466557912, + "grad_norm": 0.0003351138439029455, + "learning_rate": 6.97972037131035e-05, + "loss": 0.0002, + "num_input_tokens_seen": 224033264, + "step": 103820 + }, + { + "epoch": 16.937194127243067, + "grad_norm": 0.0009383990545757115, + "learning_rate": 6.976093413258156e-05, + "loss": 0.0004, + "num_input_tokens_seen": 224043408, + "step": 103825 + }, + { + "epoch": 16.938009787928223, + "grad_norm": 0.0009019788703881204, + "learning_rate": 6.972467327139987e-05, + "loss": 0.0002, + "num_input_tokens_seen": 224054288, + "step": 103830 + }, + { + "epoch": 16.938825448613375, + "grad_norm": 0.5222061276435852, + "learning_rate": 6.968842113029372e-05, + "loss": 0.0517, + "num_input_tokens_seen": 224064528, + "step": 103835 + }, + { + "epoch": 16.93964110929853, + "grad_norm": 0.001082535949535668, + "learning_rate": 6.965217770999738e-05, + "loss": 0.0014, + "num_input_tokens_seen": 224075248, + "step": 103840 + }, + { + "epoch": 16.940456769983687, + "grad_norm": 0.0001152159966295585, + "learning_rate": 6.961594301124585e-05, + "loss": 0.0001, + "num_input_tokens_seen": 224085616, + "step": 103845 + }, + { + "epoch": 16.941272430668842, + "grad_norm": 0.0013030244736000896, + "learning_rate": 6.957971703477301e-05, + "loss": 0.0002, + "num_input_tokens_seen": 224095536, + "step": 103850 + }, + { + "epoch": 16.942088091353998, + "grad_norm": 0.02141076698899269, + "learning_rate": 6.954349978131342e-05, + "loss": 0.0013, + "num_input_tokens_seen": 224105264, + "step": 103855 + }, + { + "epoch": 16.94290375203915, + "grad_norm": 0.001945957075804472, + "learning_rate": 6.950729125160066e-05, + "loss": 0.0094, + "num_input_tokens_seen": 224116720, + "step": 103860 + }, + { + "epoch": 16.943719412724306, + "grad_norm": 0.27618762850761414, + "learning_rate": 6.947109144636898e-05, + "loss": 0.0066, + "num_input_tokens_seen": 224127120, + "step": 103865 + }, + { + "epoch": 16.94453507340946, + "grad_norm": 0.0052630482241511345, + "learning_rate": 6.943490036635158e-05, + "loss": 0.0004, + "num_input_tokens_seen": 224138864, + "step": 103870 + }, + { + "epoch": 16.945350734094617, + "grad_norm": 0.010405884124338627, + "learning_rate": 6.939871801228236e-05, + "loss": 0.0005, + "num_input_tokens_seen": 224150864, + "step": 103875 + }, + { + "epoch": 16.946166394779773, + "grad_norm": 0.00034579061320982873, + "learning_rate": 6.936254438489414e-05, + "loss": 0.0064, + "num_input_tokens_seen": 224162416, + "step": 103880 + }, + { + "epoch": 16.946982055464925, + "grad_norm": 0.014014569111168385, + "learning_rate": 6.932637948492038e-05, + "loss": 0.0016, + "num_input_tokens_seen": 224173616, + "step": 103885 + }, + { + "epoch": 16.94779771615008, + "grad_norm": 0.0006147479871287942, + "learning_rate": 6.929022331309392e-05, + "loss": 0.0015, + "num_input_tokens_seen": 224184944, + "step": 103890 + }, + { + "epoch": 16.948613376835237, + "grad_norm": 0.000267721334239468, + "learning_rate": 6.925407587014743e-05, + "loss": 0.0052, + "num_input_tokens_seen": 224195184, + "step": 103895 + }, + { + "epoch": 16.949429037520392, + "grad_norm": 0.0011904146522283554, + "learning_rate": 6.921793715681358e-05, + "loss": 0.0002, + "num_input_tokens_seen": 224204592, + "step": 103900 + }, + { + "epoch": 16.950244698205548, + "grad_norm": 0.0005559956189244986, + "learning_rate": 6.918180717382466e-05, + "loss": 0.0008, + "num_input_tokens_seen": 224216112, + "step": 103905 + }, + { + "epoch": 16.9510603588907, + "grad_norm": 0.00016044436779338866, + "learning_rate": 6.914568592191301e-05, + "loss": 0.0008, + "num_input_tokens_seen": 224228080, + "step": 103910 + }, + { + "epoch": 16.951876019575856, + "grad_norm": 0.011081630364060402, + "learning_rate": 6.910957340181056e-05, + "loss": 0.0018, + "num_input_tokens_seen": 224239216, + "step": 103915 + }, + { + "epoch": 16.95269168026101, + "grad_norm": 0.018934747204184532, + "learning_rate": 6.907346961424926e-05, + "loss": 0.0008, + "num_input_tokens_seen": 224250960, + "step": 103920 + }, + { + "epoch": 16.953507340946167, + "grad_norm": 0.0015153349377214909, + "learning_rate": 6.903737455996073e-05, + "loss": 0.0101, + "num_input_tokens_seen": 224261584, + "step": 103925 + }, + { + "epoch": 16.954323001631323, + "grad_norm": 0.000330324110109359, + "learning_rate": 6.900128823967655e-05, + "loss": 0.0015, + "num_input_tokens_seen": 224271760, + "step": 103930 + }, + { + "epoch": 16.955138662316475, + "grad_norm": 8.078888640739024e-05, + "learning_rate": 6.896521065412803e-05, + "loss": 0.0006, + "num_input_tokens_seen": 224282160, + "step": 103935 + }, + { + "epoch": 16.95595432300163, + "grad_norm": 0.0020767743699252605, + "learning_rate": 6.89291418040463e-05, + "loss": 0.0014, + "num_input_tokens_seen": 224292464, + "step": 103940 + }, + { + "epoch": 16.956769983686787, + "grad_norm": 0.004754452500492334, + "learning_rate": 6.889308169016229e-05, + "loss": 0.0014, + "num_input_tokens_seen": 224304656, + "step": 103945 + }, + { + "epoch": 16.957585644371942, + "grad_norm": 0.0001012511711451225, + "learning_rate": 6.885703031320706e-05, + "loss": 0.0011, + "num_input_tokens_seen": 224316016, + "step": 103950 + }, + { + "epoch": 16.958401305057095, + "grad_norm": 0.00039377258508466184, + "learning_rate": 6.882098767391087e-05, + "loss": 0.0011, + "num_input_tokens_seen": 224326736, + "step": 103955 + }, + { + "epoch": 16.95921696574225, + "grad_norm": 0.00030580288148485124, + "learning_rate": 6.878495377300453e-05, + "loss": 0.0006, + "num_input_tokens_seen": 224337904, + "step": 103960 + }, + { + "epoch": 16.960032626427406, + "grad_norm": 0.000311100302496925, + "learning_rate": 6.874892861121795e-05, + "loss": 0.0011, + "num_input_tokens_seen": 224348464, + "step": 103965 + }, + { + "epoch": 16.96084828711256, + "grad_norm": 0.006358220707625151, + "learning_rate": 6.871291218928166e-05, + "loss": 0.0009, + "num_input_tokens_seen": 224359472, + "step": 103970 + }, + { + "epoch": 16.961663947797717, + "grad_norm": 0.07430107891559601, + "learning_rate": 6.867690450792508e-05, + "loss": 0.002, + "num_input_tokens_seen": 224371120, + "step": 103975 + }, + { + "epoch": 16.96247960848287, + "grad_norm": 0.001159152016043663, + "learning_rate": 6.864090556787838e-05, + "loss": 0.0027, + "num_input_tokens_seen": 224379472, + "step": 103980 + }, + { + "epoch": 16.963295269168025, + "grad_norm": 0.0028858864679932594, + "learning_rate": 6.860491536987079e-05, + "loss": 0.0003, + "num_input_tokens_seen": 224390832, + "step": 103985 + }, + { + "epoch": 16.96411092985318, + "grad_norm": 0.00033770734444260597, + "learning_rate": 6.856893391463192e-05, + "loss": 0.0004, + "num_input_tokens_seen": 224402576, + "step": 103990 + }, + { + "epoch": 16.964926590538337, + "grad_norm": 0.00015203746443148702, + "learning_rate": 6.853296120289094e-05, + "loss": 0.0002, + "num_input_tokens_seen": 224413264, + "step": 103995 + }, + { + "epoch": 16.965742251223492, + "grad_norm": 0.0015931521775200963, + "learning_rate": 6.849699723537684e-05, + "loss": 0.0008, + "num_input_tokens_seen": 224422608, + "step": 104000 + }, + { + "epoch": 16.966557911908644, + "grad_norm": 0.0001250641216756776, + "learning_rate": 6.84610420128185e-05, + "loss": 0.001, + "num_input_tokens_seen": 224434384, + "step": 104005 + }, + { + "epoch": 16.9673735725938, + "grad_norm": 0.0014689202653244138, + "learning_rate": 6.842509553594462e-05, + "loss": 0.0052, + "num_input_tokens_seen": 224444368, + "step": 104010 + }, + { + "epoch": 16.968189233278956, + "grad_norm": 0.004452031571418047, + "learning_rate": 6.83891578054836e-05, + "loss": 0.0013, + "num_input_tokens_seen": 224455600, + "step": 104015 + }, + { + "epoch": 16.96900489396411, + "grad_norm": 0.00591098889708519, + "learning_rate": 6.835322882216388e-05, + "loss": 0.0035, + "num_input_tokens_seen": 224465936, + "step": 104020 + }, + { + "epoch": 16.969820554649267, + "grad_norm": 0.0008718215394765139, + "learning_rate": 6.831730858671353e-05, + "loss": 0.0002, + "num_input_tokens_seen": 224477392, + "step": 104025 + }, + { + "epoch": 16.97063621533442, + "grad_norm": 0.003005698788911104, + "learning_rate": 6.828139709986058e-05, + "loss": 0.0004, + "num_input_tokens_seen": 224488880, + "step": 104030 + }, + { + "epoch": 16.971451876019575, + "grad_norm": 0.0017840074142441154, + "learning_rate": 6.824549436233279e-05, + "loss": 0.0001, + "num_input_tokens_seen": 224499952, + "step": 104035 + }, + { + "epoch": 16.97226753670473, + "grad_norm": 0.00011672579421428964, + "learning_rate": 6.820960037485779e-05, + "loss": 0.0008, + "num_input_tokens_seen": 224510000, + "step": 104040 + }, + { + "epoch": 16.973083197389887, + "grad_norm": 0.0019611844327300787, + "learning_rate": 6.8173715138163e-05, + "loss": 0.0005, + "num_input_tokens_seen": 224520592, + "step": 104045 + }, + { + "epoch": 16.973898858075042, + "grad_norm": 0.0015148085076361895, + "learning_rate": 6.813783865297563e-05, + "loss": 0.0146, + "num_input_tokens_seen": 224530096, + "step": 104050 + }, + { + "epoch": 16.974714518760194, + "grad_norm": 0.002171371364966035, + "learning_rate": 6.810197092002285e-05, + "loss": 0.0024, + "num_input_tokens_seen": 224540848, + "step": 104055 + }, + { + "epoch": 16.97553017944535, + "grad_norm": 0.00023539249377790838, + "learning_rate": 6.806611194003154e-05, + "loss": 0.0002, + "num_input_tokens_seen": 224551280, + "step": 104060 + }, + { + "epoch": 16.976345840130506, + "grad_norm": 0.06872601807117462, + "learning_rate": 6.803026171372845e-05, + "loss": 0.0012, + "num_input_tokens_seen": 224561584, + "step": 104065 + }, + { + "epoch": 16.97716150081566, + "grad_norm": 0.037328965961933136, + "learning_rate": 6.799442024184005e-05, + "loss": 0.0009, + "num_input_tokens_seen": 224571952, + "step": 104070 + }, + { + "epoch": 16.977977161500817, + "grad_norm": 0.0001360759633826092, + "learning_rate": 6.795858752509276e-05, + "loss": 0.0019, + "num_input_tokens_seen": 224584304, + "step": 104075 + }, + { + "epoch": 16.97879282218597, + "grad_norm": 8.876788342604414e-05, + "learning_rate": 6.792276356421278e-05, + "loss": 0.1537, + "num_input_tokens_seen": 224594768, + "step": 104080 + }, + { + "epoch": 16.979608482871125, + "grad_norm": 0.005096145905554295, + "learning_rate": 6.788694835992615e-05, + "loss": 0.0004, + "num_input_tokens_seen": 224605456, + "step": 104085 + }, + { + "epoch": 16.98042414355628, + "grad_norm": 0.0011529172770678997, + "learning_rate": 6.785114191295854e-05, + "loss": 0.0001, + "num_input_tokens_seen": 224617424, + "step": 104090 + }, + { + "epoch": 16.981239804241437, + "grad_norm": 0.011850215494632721, + "learning_rate": 6.78153442240359e-05, + "loss": 0.0014, + "num_input_tokens_seen": 224628688, + "step": 104095 + }, + { + "epoch": 16.982055464926592, + "grad_norm": 0.0003019870782736689, + "learning_rate": 6.777955529388358e-05, + "loss": 0.0027, + "num_input_tokens_seen": 224638864, + "step": 104100 + }, + { + "epoch": 16.982871125611744, + "grad_norm": 1.606163501739502, + "learning_rate": 6.774377512322688e-05, + "loss": 0.0978, + "num_input_tokens_seen": 224650672, + "step": 104105 + }, + { + "epoch": 16.9836867862969, + "grad_norm": 0.001395959174260497, + "learning_rate": 6.77080037127909e-05, + "loss": 0.0002, + "num_input_tokens_seen": 224660368, + "step": 104110 + }, + { + "epoch": 16.984502446982056, + "grad_norm": 0.0006764968857169151, + "learning_rate": 6.767224106330067e-05, + "loss": 0.0091, + "num_input_tokens_seen": 224670992, + "step": 104115 + }, + { + "epoch": 16.98531810766721, + "grad_norm": 0.0005900398246012628, + "learning_rate": 6.763648717548088e-05, + "loss": 0.0603, + "num_input_tokens_seen": 224681520, + "step": 104120 + }, + { + "epoch": 16.986133768352367, + "grad_norm": 0.00024271916481666267, + "learning_rate": 6.760074205005617e-05, + "loss": 0.0004, + "num_input_tokens_seen": 224693040, + "step": 104125 + }, + { + "epoch": 16.98694942903752, + "grad_norm": 0.00457680132240057, + "learning_rate": 6.756500568775098e-05, + "loss": 0.0026, + "num_input_tokens_seen": 224704464, + "step": 104130 + }, + { + "epoch": 16.987765089722675, + "grad_norm": 0.004052008036524057, + "learning_rate": 6.752927808928955e-05, + "loss": 0.003, + "num_input_tokens_seen": 224715056, + "step": 104135 + }, + { + "epoch": 16.98858075040783, + "grad_norm": 0.012083387933671474, + "learning_rate": 6.749355925539591e-05, + "loss": 0.003, + "num_input_tokens_seen": 224725392, + "step": 104140 + }, + { + "epoch": 16.989396411092986, + "grad_norm": 0.014885442331433296, + "learning_rate": 6.745784918679399e-05, + "loss": 0.0004, + "num_input_tokens_seen": 224735856, + "step": 104145 + }, + { + "epoch": 16.99021207177814, + "grad_norm": 0.0610637366771698, + "learning_rate": 6.742214788420742e-05, + "loss": 0.0041, + "num_input_tokens_seen": 224746768, + "step": 104150 + }, + { + "epoch": 16.991027732463294, + "grad_norm": 0.015486551448702812, + "learning_rate": 6.73864553483598e-05, + "loss": 0.0006, + "num_input_tokens_seen": 224755056, + "step": 104155 + }, + { + "epoch": 16.99184339314845, + "grad_norm": 0.03883671015501022, + "learning_rate": 6.735077157997448e-05, + "loss": 0.0017, + "num_input_tokens_seen": 224766128, + "step": 104160 + }, + { + "epoch": 16.992659053833606, + "grad_norm": 0.0019841697067022324, + "learning_rate": 6.731509657977464e-05, + "loss": 0.0002, + "num_input_tokens_seen": 224777680, + "step": 104165 + }, + { + "epoch": 16.99347471451876, + "grad_norm": 0.00148242327850312, + "learning_rate": 6.727943034848327e-05, + "loss": 0.0005, + "num_input_tokens_seen": 224788496, + "step": 104170 + }, + { + "epoch": 16.994290375203914, + "grad_norm": 0.0005242001498118043, + "learning_rate": 6.72437728868232e-05, + "loss": 0.0002, + "num_input_tokens_seen": 224799216, + "step": 104175 + }, + { + "epoch": 16.99510603588907, + "grad_norm": 0.00011859647202072665, + "learning_rate": 6.720812419551703e-05, + "loss": 0.0036, + "num_input_tokens_seen": 224808112, + "step": 104180 + }, + { + "epoch": 16.995921696574225, + "grad_norm": 0.0043924045749008656, + "learning_rate": 6.717248427528727e-05, + "loss": 0.0002, + "num_input_tokens_seen": 224818512, + "step": 104185 + }, + { + "epoch": 16.99673735725938, + "grad_norm": 0.005811613984405994, + "learning_rate": 6.713685312685619e-05, + "loss": 0.1078, + "num_input_tokens_seen": 224828208, + "step": 104190 + }, + { + "epoch": 16.997553017944536, + "grad_norm": 0.007365775294601917, + "learning_rate": 6.710123075094593e-05, + "loss": 0.0003, + "num_input_tokens_seen": 224839376, + "step": 104195 + }, + { + "epoch": 16.99836867862969, + "grad_norm": 0.008850215002894402, + "learning_rate": 6.70656171482783e-05, + "loss": 0.0006, + "num_input_tokens_seen": 224850000, + "step": 104200 + }, + { + "epoch": 16.999184339314844, + "grad_norm": 0.0024554177653044462, + "learning_rate": 6.703001231957535e-05, + "loss": 0.0011, + "num_input_tokens_seen": 224861936, + "step": 104205 + }, + { + "epoch": 17.0, + "grad_norm": 0.07448272407054901, + "learning_rate": 6.699441626555824e-05, + "loss": 0.0021, + "num_input_tokens_seen": 224870720, + "step": 104210 + }, + { + "epoch": 17.0, + "eval_loss": 0.2945714294910431, + "eval_runtime": 104.3588, + "eval_samples_per_second": 26.112, + "eval_steps_per_second": 6.535, + "num_input_tokens_seen": 224870720, + "step": 104210 + }, + { + "epoch": 17.000815660685156, + "grad_norm": 0.00047500821528956294, + "learning_rate": 6.695882898694883e-05, + "loss": 0.0036, + "num_input_tokens_seen": 224881536, + "step": 104215 + }, + { + "epoch": 17.00163132137031, + "grad_norm": 0.15100841224193573, + "learning_rate": 6.692325048446784e-05, + "loss": 0.0019, + "num_input_tokens_seen": 224892352, + "step": 104220 + }, + { + "epoch": 17.002446982055464, + "grad_norm": 0.025119192898273468, + "learning_rate": 6.688768075883683e-05, + "loss": 0.0013, + "num_input_tokens_seen": 224904064, + "step": 104225 + }, + { + "epoch": 17.00326264274062, + "grad_norm": 0.0007926810649223626, + "learning_rate": 6.685211981077616e-05, + "loss": 0.0015, + "num_input_tokens_seen": 224913856, + "step": 104230 + }, + { + "epoch": 17.004078303425775, + "grad_norm": 0.007619829848408699, + "learning_rate": 6.68165676410069e-05, + "loss": 0.0011, + "num_input_tokens_seen": 224925408, + "step": 104235 + }, + { + "epoch": 17.00489396411093, + "grad_norm": 0.00031914206920191646, + "learning_rate": 6.678102425024946e-05, + "loss": 0.1354, + "num_input_tokens_seen": 224935104, + "step": 104240 + }, + { + "epoch": 17.005709624796086, + "grad_norm": 0.00039243127685040236, + "learning_rate": 6.674548963922412e-05, + "loss": 0.0006, + "num_input_tokens_seen": 224945088, + "step": 104245 + }, + { + "epoch": 17.00652528548124, + "grad_norm": 0.0012928576907142997, + "learning_rate": 6.670996380865101e-05, + "loss": 0.0017, + "num_input_tokens_seen": 224956448, + "step": 104250 + }, + { + "epoch": 17.007340946166394, + "grad_norm": 9.332468471257016e-05, + "learning_rate": 6.667444675925022e-05, + "loss": 0.0009, + "num_input_tokens_seen": 224967264, + "step": 104255 + }, + { + "epoch": 17.00815660685155, + "grad_norm": 0.002784198150038719, + "learning_rate": 6.663893849174147e-05, + "loss": 0.0019, + "num_input_tokens_seen": 224978080, + "step": 104260 + }, + { + "epoch": 17.008972267536706, + "grad_norm": 0.007008255459368229, + "learning_rate": 6.660343900684434e-05, + "loss": 0.0007, + "num_input_tokens_seen": 224989632, + "step": 104265 + }, + { + "epoch": 17.00978792822186, + "grad_norm": 0.007006792817264795, + "learning_rate": 6.656794830527835e-05, + "loss": 0.0005, + "num_input_tokens_seen": 224999936, + "step": 104270 + }, + { + "epoch": 17.010603588907014, + "grad_norm": 0.00026880166842602193, + "learning_rate": 6.653246638776273e-05, + "loss": 0.0022, + "num_input_tokens_seen": 225010688, + "step": 104275 + }, + { + "epoch": 17.01141924959217, + "grad_norm": 0.010388139635324478, + "learning_rate": 6.649699325501657e-05, + "loss": 0.0027, + "num_input_tokens_seen": 225022496, + "step": 104280 + }, + { + "epoch": 17.012234910277325, + "grad_norm": 0.005392506718635559, + "learning_rate": 6.64615289077588e-05, + "loss": 0.0026, + "num_input_tokens_seen": 225033152, + "step": 104285 + }, + { + "epoch": 17.01305057096248, + "grad_norm": 0.00034575830795802176, + "learning_rate": 6.642607334670808e-05, + "loss": 0.0023, + "num_input_tokens_seen": 225043936, + "step": 104290 + }, + { + "epoch": 17.013866231647636, + "grad_norm": 0.0063526565209031105, + "learning_rate": 6.639062657258305e-05, + "loss": 0.0039, + "num_input_tokens_seen": 225055136, + "step": 104295 + }, + { + "epoch": 17.01468189233279, + "grad_norm": 0.00015449080092366785, + "learning_rate": 6.635518858610207e-05, + "loss": 0.0005, + "num_input_tokens_seen": 225064480, + "step": 104300 + }, + { + "epoch": 17.015497553017944, + "grad_norm": 0.0015748885925859213, + "learning_rate": 6.631975938798312e-05, + "loss": 0.0015, + "num_input_tokens_seen": 225075904, + "step": 104305 + }, + { + "epoch": 17.0163132137031, + "grad_norm": 0.0015460324939340353, + "learning_rate": 6.62843389789447e-05, + "loss": 0.0011, + "num_input_tokens_seen": 225086784, + "step": 104310 + }, + { + "epoch": 17.017128874388256, + "grad_norm": 0.0006274620536714792, + "learning_rate": 6.624892735970412e-05, + "loss": 0.0003, + "num_input_tokens_seen": 225096256, + "step": 104315 + }, + { + "epoch": 17.017944535073408, + "grad_norm": 0.013345682993531227, + "learning_rate": 6.621352453097951e-05, + "loss": 0.0003, + "num_input_tokens_seen": 225105600, + "step": 104320 + }, + { + "epoch": 17.018760195758563, + "grad_norm": 0.00988664012402296, + "learning_rate": 6.617813049348787e-05, + "loss": 0.0005, + "num_input_tokens_seen": 225117216, + "step": 104325 + }, + { + "epoch": 17.01957585644372, + "grad_norm": 0.0052864281460642815, + "learning_rate": 6.6142745247947e-05, + "loss": 0.0021, + "num_input_tokens_seen": 225128896, + "step": 104330 + }, + { + "epoch": 17.020391517128875, + "grad_norm": 0.0010683403816074133, + "learning_rate": 6.610736879507356e-05, + "loss": 0.0048, + "num_input_tokens_seen": 225140032, + "step": 104335 + }, + { + "epoch": 17.02120717781403, + "grad_norm": 0.0010587344877421856, + "learning_rate": 6.607200113558493e-05, + "loss": 0.0032, + "num_input_tokens_seen": 225151040, + "step": 104340 + }, + { + "epoch": 17.022022838499183, + "grad_norm": 0.03601657226681709, + "learning_rate": 6.603664227019745e-05, + "loss": 0.0014, + "num_input_tokens_seen": 225162016, + "step": 104345 + }, + { + "epoch": 17.02283849918434, + "grad_norm": 0.0059602633118629456, + "learning_rate": 6.600129219962819e-05, + "loss": 0.0872, + "num_input_tokens_seen": 225173504, + "step": 104350 + }, + { + "epoch": 17.023654159869494, + "grad_norm": 0.000414110254496336, + "learning_rate": 6.596595092459307e-05, + "loss": 0.0106, + "num_input_tokens_seen": 225183488, + "step": 104355 + }, + { + "epoch": 17.02446982055465, + "grad_norm": 0.0004901188658550382, + "learning_rate": 6.593061844580878e-05, + "loss": 0.0002, + "num_input_tokens_seen": 225193856, + "step": 104360 + }, + { + "epoch": 17.025285481239806, + "grad_norm": 0.006629147566854954, + "learning_rate": 6.589529476399097e-05, + "loss": 0.0013, + "num_input_tokens_seen": 225205312, + "step": 104365 + }, + { + "epoch": 17.026101141924958, + "grad_norm": 0.05398989096283913, + "learning_rate": 6.585997987985592e-05, + "loss": 0.0014, + "num_input_tokens_seen": 225215776, + "step": 104370 + }, + { + "epoch": 17.026916802610113, + "grad_norm": 0.00050363625632599, + "learning_rate": 6.582467379411889e-05, + "loss": 0.0049, + "num_input_tokens_seen": 225226368, + "step": 104375 + }, + { + "epoch": 17.02773246329527, + "grad_norm": 0.5783743858337402, + "learning_rate": 6.578937650749573e-05, + "loss": 0.0198, + "num_input_tokens_seen": 225237920, + "step": 104380 + }, + { + "epoch": 17.028548123980425, + "grad_norm": 0.0004515399632509798, + "learning_rate": 6.575408802070171e-05, + "loss": 0.0041, + "num_input_tokens_seen": 225248224, + "step": 104385 + }, + { + "epoch": 17.02936378466558, + "grad_norm": 0.0003436952247284353, + "learning_rate": 6.571880833445198e-05, + "loss": 0.0004, + "num_input_tokens_seen": 225258176, + "step": 104390 + }, + { + "epoch": 17.030179445350733, + "grad_norm": 0.00026193412486463785, + "learning_rate": 6.568353744946154e-05, + "loss": 0.0003, + "num_input_tokens_seen": 225269344, + "step": 104395 + }, + { + "epoch": 17.03099510603589, + "grad_norm": 0.02186772972345352, + "learning_rate": 6.564827536644519e-05, + "loss": 0.0008, + "num_input_tokens_seen": 225278784, + "step": 104400 + }, + { + "epoch": 17.031810766721044, + "grad_norm": 0.0012280478840693831, + "learning_rate": 6.561302208611752e-05, + "loss": 0.0011, + "num_input_tokens_seen": 225289856, + "step": 104405 + }, + { + "epoch": 17.0326264274062, + "grad_norm": 0.00398009130731225, + "learning_rate": 6.557777760919303e-05, + "loss": 0.001, + "num_input_tokens_seen": 225299392, + "step": 104410 + }, + { + "epoch": 17.033442088091356, + "grad_norm": 0.04642460495233536, + "learning_rate": 6.554254193638598e-05, + "loss": 0.0009, + "num_input_tokens_seen": 225308096, + "step": 104415 + }, + { + "epoch": 17.034257748776508, + "grad_norm": 0.044692400842905045, + "learning_rate": 6.550731506841046e-05, + "loss": 0.0021, + "num_input_tokens_seen": 225319168, + "step": 104420 + }, + { + "epoch": 17.035073409461663, + "grad_norm": 0.027239244431257248, + "learning_rate": 6.54720970059804e-05, + "loss": 0.0007, + "num_input_tokens_seen": 225330496, + "step": 104425 + }, + { + "epoch": 17.03588907014682, + "grad_norm": 0.0010891613783314824, + "learning_rate": 6.543688774980944e-05, + "loss": 0.0002, + "num_input_tokens_seen": 225340576, + "step": 104430 + }, + { + "epoch": 17.036704730831975, + "grad_norm": 0.0009796923259273171, + "learning_rate": 6.540168730061141e-05, + "loss": 0.0009, + "num_input_tokens_seen": 225351840, + "step": 104435 + }, + { + "epoch": 17.03752039151713, + "grad_norm": 0.00043389530037529767, + "learning_rate": 6.53664956590993e-05, + "loss": 0.0003, + "num_input_tokens_seen": 225362336, + "step": 104440 + }, + { + "epoch": 17.038336052202283, + "grad_norm": 0.0072618150152266026, + "learning_rate": 6.533131282598676e-05, + "loss": 0.0006, + "num_input_tokens_seen": 225373792, + "step": 104445 + }, + { + "epoch": 17.03915171288744, + "grad_norm": 0.00018621365597937256, + "learning_rate": 6.529613880198638e-05, + "loss": 0.0012, + "num_input_tokens_seen": 225385728, + "step": 104450 + }, + { + "epoch": 17.039967373572594, + "grad_norm": 0.0005378325004130602, + "learning_rate": 6.526097358781141e-05, + "loss": 0.0004, + "num_input_tokens_seen": 225395840, + "step": 104455 + }, + { + "epoch": 17.04078303425775, + "grad_norm": 0.028787760064005852, + "learning_rate": 6.522581718417409e-05, + "loss": 0.0039, + "num_input_tokens_seen": 225407712, + "step": 104460 + }, + { + "epoch": 17.041598694942905, + "grad_norm": 0.00042777816997841, + "learning_rate": 6.519066959178738e-05, + "loss": 0.0003, + "num_input_tokens_seen": 225419072, + "step": 104465 + }, + { + "epoch": 17.042414355628058, + "grad_norm": 0.0012969492236152291, + "learning_rate": 6.515553081136311e-05, + "loss": 0.0017, + "num_input_tokens_seen": 225429248, + "step": 104470 + }, + { + "epoch": 17.043230016313213, + "grad_norm": 0.035520948469638824, + "learning_rate": 6.512040084361388e-05, + "loss": 0.0495, + "num_input_tokens_seen": 225440288, + "step": 104475 + }, + { + "epoch": 17.04404567699837, + "grad_norm": 0.048041932284832, + "learning_rate": 6.508527968925115e-05, + "loss": 0.0022, + "num_input_tokens_seen": 225450592, + "step": 104480 + }, + { + "epoch": 17.044861337683525, + "grad_norm": 0.0013652007328346372, + "learning_rate": 6.505016734898722e-05, + "loss": 0.0003, + "num_input_tokens_seen": 225462208, + "step": 104485 + }, + { + "epoch": 17.045676998368677, + "grad_norm": 0.0021526433993130922, + "learning_rate": 6.501506382353317e-05, + "loss": 0.0034, + "num_input_tokens_seen": 225472832, + "step": 104490 + }, + { + "epoch": 17.046492659053833, + "grad_norm": 0.0020718311425298452, + "learning_rate": 6.497996911360093e-05, + "loss": 0.0003, + "num_input_tokens_seen": 225483136, + "step": 104495 + }, + { + "epoch": 17.04730831973899, + "grad_norm": 0.0022629182785749435, + "learning_rate": 6.494488321990122e-05, + "loss": 0.0004, + "num_input_tokens_seen": 225493792, + "step": 104500 + }, + { + "epoch": 17.048123980424144, + "grad_norm": 0.0040725115686655045, + "learning_rate": 6.490980614314556e-05, + "loss": 0.0007, + "num_input_tokens_seen": 225504288, + "step": 104505 + }, + { + "epoch": 17.0489396411093, + "grad_norm": 0.0017287189839407802, + "learning_rate": 6.487473788404446e-05, + "loss": 0.0006, + "num_input_tokens_seen": 225515776, + "step": 104510 + }, + { + "epoch": 17.049755301794452, + "grad_norm": 0.0010222619166597724, + "learning_rate": 6.483967844330901e-05, + "loss": 0.0002, + "num_input_tokens_seen": 225526880, + "step": 104515 + }, + { + "epoch": 17.050570962479608, + "grad_norm": 0.004299209453165531, + "learning_rate": 6.480462782164925e-05, + "loss": 0.0006, + "num_input_tokens_seen": 225537472, + "step": 104520 + }, + { + "epoch": 17.051386623164763, + "grad_norm": 0.048632021993398666, + "learning_rate": 6.476958601977595e-05, + "loss": 0.0013, + "num_input_tokens_seen": 225548640, + "step": 104525 + }, + { + "epoch": 17.05220228384992, + "grad_norm": 0.06687615811824799, + "learning_rate": 6.473455303839909e-05, + "loss": 0.0008, + "num_input_tokens_seen": 225560896, + "step": 104530 + }, + { + "epoch": 17.053017944535075, + "grad_norm": 0.0003009017091244459, + "learning_rate": 6.469952887822866e-05, + "loss": 0.0002, + "num_input_tokens_seen": 225571648, + "step": 104535 + }, + { + "epoch": 17.053833605220227, + "grad_norm": 0.0136693324893713, + "learning_rate": 6.466451353997455e-05, + "loss": 0.0044, + "num_input_tokens_seen": 225580896, + "step": 104540 + }, + { + "epoch": 17.054649265905383, + "grad_norm": 9.514921111986041e-05, + "learning_rate": 6.462950702434633e-05, + "loss": 0.0015, + "num_input_tokens_seen": 225593600, + "step": 104545 + }, + { + "epoch": 17.05546492659054, + "grad_norm": 0.007636062800884247, + "learning_rate": 6.459450933205346e-05, + "loss": 0.0289, + "num_input_tokens_seen": 225605344, + "step": 104550 + }, + { + "epoch": 17.056280587275694, + "grad_norm": 0.002975823823362589, + "learning_rate": 6.455952046380514e-05, + "loss": 0.0008, + "num_input_tokens_seen": 225615808, + "step": 104555 + }, + { + "epoch": 17.05709624796085, + "grad_norm": 0.0013264185981824994, + "learning_rate": 6.452454042031059e-05, + "loss": 0.0246, + "num_input_tokens_seen": 225626496, + "step": 104560 + }, + { + "epoch": 17.057911908646002, + "grad_norm": 0.00126725307200104, + "learning_rate": 6.448956920227867e-05, + "loss": 0.001, + "num_input_tokens_seen": 225636704, + "step": 104565 + }, + { + "epoch": 17.058727569331158, + "grad_norm": 0.002241946989670396, + "learning_rate": 6.445460681041815e-05, + "loss": 0.0004, + "num_input_tokens_seen": 225647584, + "step": 104570 + }, + { + "epoch": 17.059543230016313, + "grad_norm": 0.02262943796813488, + "learning_rate": 6.441965324543737e-05, + "loss": 0.0011, + "num_input_tokens_seen": 225658592, + "step": 104575 + }, + { + "epoch": 17.06035889070147, + "grad_norm": 0.0007740870933048427, + "learning_rate": 6.438470850804512e-05, + "loss": 0.0015, + "num_input_tokens_seen": 225670336, + "step": 104580 + }, + { + "epoch": 17.061174551386625, + "grad_norm": 0.0006729420856572688, + "learning_rate": 6.43497725989492e-05, + "loss": 0.0006, + "num_input_tokens_seen": 225679872, + "step": 104585 + }, + { + "epoch": 17.061990212071777, + "grad_norm": 0.022588232532143593, + "learning_rate": 6.431484551885797e-05, + "loss": 0.0009, + "num_input_tokens_seen": 225692288, + "step": 104590 + }, + { + "epoch": 17.062805872756933, + "grad_norm": 0.0006558909080922604, + "learning_rate": 6.427992726847892e-05, + "loss": 0.0017, + "num_input_tokens_seen": 225704032, + "step": 104595 + }, + { + "epoch": 17.063621533442088, + "grad_norm": 0.0032833211589604616, + "learning_rate": 6.424501784852004e-05, + "loss": 0.0176, + "num_input_tokens_seen": 225713536, + "step": 104600 + }, + { + "epoch": 17.064437194127244, + "grad_norm": 0.0004656825040001422, + "learning_rate": 6.421011725968856e-05, + "loss": 0.0022, + "num_input_tokens_seen": 225723648, + "step": 104605 + }, + { + "epoch": 17.0652528548124, + "grad_norm": 0.0021402572747319937, + "learning_rate": 6.4175225502692e-05, + "loss": 0.0004, + "num_input_tokens_seen": 225734048, + "step": 104610 + }, + { + "epoch": 17.06606851549755, + "grad_norm": 6.967558874748647e-05, + "learning_rate": 6.414034257823725e-05, + "loss": 0.0006, + "num_input_tokens_seen": 225744960, + "step": 104615 + }, + { + "epoch": 17.066884176182707, + "grad_norm": 0.007131517399102449, + "learning_rate": 6.410546848703153e-05, + "loss": 0.0022, + "num_input_tokens_seen": 225756000, + "step": 104620 + }, + { + "epoch": 17.067699836867863, + "grad_norm": 0.004013559315353632, + "learning_rate": 6.407060322978131e-05, + "loss": 0.0806, + "num_input_tokens_seen": 225765984, + "step": 104625 + }, + { + "epoch": 17.06851549755302, + "grad_norm": 0.02078430727124214, + "learning_rate": 6.403574680719343e-05, + "loss": 0.0012, + "num_input_tokens_seen": 225776928, + "step": 104630 + }, + { + "epoch": 17.069331158238175, + "grad_norm": 0.01177056971937418, + "learning_rate": 6.400089921997415e-05, + "loss": 0.0012, + "num_input_tokens_seen": 225788768, + "step": 104635 + }, + { + "epoch": 17.070146818923327, + "grad_norm": 0.0008083314751274884, + "learning_rate": 6.39660604688298e-05, + "loss": 0.0007, + "num_input_tokens_seen": 225800672, + "step": 104640 + }, + { + "epoch": 17.070962479608482, + "grad_norm": 0.00040724751306697726, + "learning_rate": 6.393123055446637e-05, + "loss": 0.0007, + "num_input_tokens_seen": 225810048, + "step": 104645 + }, + { + "epoch": 17.071778140293638, + "grad_norm": 0.17747963964939117, + "learning_rate": 6.389640947758973e-05, + "loss": 0.0043, + "num_input_tokens_seen": 225821792, + "step": 104650 + }, + { + "epoch": 17.072593800978794, + "grad_norm": 0.019881470128893852, + "learning_rate": 6.38615972389056e-05, + "loss": 0.0021, + "num_input_tokens_seen": 225832000, + "step": 104655 + }, + { + "epoch": 17.07340946166395, + "grad_norm": 0.00024794723140075803, + "learning_rate": 6.382679383911949e-05, + "loss": 0.0005, + "num_input_tokens_seen": 225842752, + "step": 104660 + }, + { + "epoch": 17.0742251223491, + "grad_norm": 0.011731022968888283, + "learning_rate": 6.37919992789367e-05, + "loss": 0.0017, + "num_input_tokens_seen": 225854176, + "step": 104665 + }, + { + "epoch": 17.075040783034257, + "grad_norm": 0.00016393935948144644, + "learning_rate": 6.375721355906245e-05, + "loss": 0.001, + "num_input_tokens_seen": 225865120, + "step": 104670 + }, + { + "epoch": 17.075856443719413, + "grad_norm": 0.000469821912702173, + "learning_rate": 6.372243668020167e-05, + "loss": 0.0011, + "num_input_tokens_seen": 225877440, + "step": 104675 + }, + { + "epoch": 17.07667210440457, + "grad_norm": 0.001573938294313848, + "learning_rate": 6.368766864305914e-05, + "loss": 0.0003, + "num_input_tokens_seen": 225888512, + "step": 104680 + }, + { + "epoch": 17.07748776508972, + "grad_norm": 0.0016154613113030791, + "learning_rate": 6.365290944833952e-05, + "loss": 0.0003, + "num_input_tokens_seen": 225898048, + "step": 104685 + }, + { + "epoch": 17.078303425774877, + "grad_norm": 0.007328188046813011, + "learning_rate": 6.361815909674722e-05, + "loss": 0.01, + "num_input_tokens_seen": 225909056, + "step": 104690 + }, + { + "epoch": 17.079119086460032, + "grad_norm": 0.006951836869120598, + "learning_rate": 6.358341758898656e-05, + "loss": 0.0008, + "num_input_tokens_seen": 225920512, + "step": 104695 + }, + { + "epoch": 17.079934747145188, + "grad_norm": 0.0005717684980481863, + "learning_rate": 6.354868492576154e-05, + "loss": 0.0033, + "num_input_tokens_seen": 225930976, + "step": 104700 + }, + { + "epoch": 17.080750407830344, + "grad_norm": 0.0010395251447334886, + "learning_rate": 6.351396110777613e-05, + "loss": 0.0004, + "num_input_tokens_seen": 225941952, + "step": 104705 + }, + { + "epoch": 17.081566068515496, + "grad_norm": 0.005431427154690027, + "learning_rate": 6.347924613573402e-05, + "loss": 0.0007, + "num_input_tokens_seen": 225953984, + "step": 104710 + }, + { + "epoch": 17.08238172920065, + "grad_norm": 0.007929583080112934, + "learning_rate": 6.344454001033873e-05, + "loss": 0.0025, + "num_input_tokens_seen": 225963840, + "step": 104715 + }, + { + "epoch": 17.083197389885807, + "grad_norm": 0.003908567596226931, + "learning_rate": 6.340984273229355e-05, + "loss": 0.0004, + "num_input_tokens_seen": 225973120, + "step": 104720 + }, + { + "epoch": 17.084013050570963, + "grad_norm": 0.0016220184043049812, + "learning_rate": 6.337515430230196e-05, + "loss": 0.0007, + "num_input_tokens_seen": 225983168, + "step": 104725 + }, + { + "epoch": 17.08482871125612, + "grad_norm": 0.006639786530286074, + "learning_rate": 6.334047472106657e-05, + "loss": 0.0016, + "num_input_tokens_seen": 225994240, + "step": 104730 + }, + { + "epoch": 17.08564437194127, + "grad_norm": 0.004870147909969091, + "learning_rate": 6.330580398929047e-05, + "loss": 0.0008, + "num_input_tokens_seen": 226005344, + "step": 104735 + }, + { + "epoch": 17.086460032626427, + "grad_norm": 0.00044473871821537614, + "learning_rate": 6.327114210767632e-05, + "loss": 0.0003, + "num_input_tokens_seen": 226016416, + "step": 104740 + }, + { + "epoch": 17.087275693311582, + "grad_norm": 0.01985304057598114, + "learning_rate": 6.323648907692642e-05, + "loss": 0.0009, + "num_input_tokens_seen": 226026496, + "step": 104745 + }, + { + "epoch": 17.088091353996738, + "grad_norm": 0.0007581087411381304, + "learning_rate": 6.320184489774317e-05, + "loss": 0.0007, + "num_input_tokens_seen": 226036480, + "step": 104750 + }, + { + "epoch": 17.088907014681894, + "grad_norm": 0.0009678181377239525, + "learning_rate": 6.316720957082867e-05, + "loss": 0.0043, + "num_input_tokens_seen": 226048608, + "step": 104755 + }, + { + "epoch": 17.089722675367046, + "grad_norm": 0.0005660268943756819, + "learning_rate": 6.31325830968848e-05, + "loss": 0.0002, + "num_input_tokens_seen": 226059808, + "step": 104760 + }, + { + "epoch": 17.0905383360522, + "grad_norm": 0.00015661843644920737, + "learning_rate": 6.30979654766134e-05, + "loss": 0.0015, + "num_input_tokens_seen": 226071168, + "step": 104765 + }, + { + "epoch": 17.091353996737357, + "grad_norm": 0.00044742238242179155, + "learning_rate": 6.306335671071589e-05, + "loss": 0.0002, + "num_input_tokens_seen": 226081472, + "step": 104770 + }, + { + "epoch": 17.092169657422513, + "grad_norm": 0.01640053652226925, + "learning_rate": 6.302875679989384e-05, + "loss": 0.0008, + "num_input_tokens_seen": 226092768, + "step": 104775 + }, + { + "epoch": 17.09298531810767, + "grad_norm": 0.005679141264408827, + "learning_rate": 6.299416574484828e-05, + "loss": 0.0004, + "num_input_tokens_seen": 226103840, + "step": 104780 + }, + { + "epoch": 17.09380097879282, + "grad_norm": 0.002406627405434847, + "learning_rate": 6.29595835462804e-05, + "loss": 0.0002, + "num_input_tokens_seen": 226115168, + "step": 104785 + }, + { + "epoch": 17.094616639477977, + "grad_norm": 0.003517636563628912, + "learning_rate": 6.2925010204891e-05, + "loss": 0.0028, + "num_input_tokens_seen": 226127328, + "step": 104790 + }, + { + "epoch": 17.095432300163132, + "grad_norm": 0.0017153732478618622, + "learning_rate": 6.289044572138069e-05, + "loss": 0.0002, + "num_input_tokens_seen": 226137696, + "step": 104795 + }, + { + "epoch": 17.096247960848288, + "grad_norm": 0.00040373069350607693, + "learning_rate": 6.285589009644999e-05, + "loss": 0.002, + "num_input_tokens_seen": 226147552, + "step": 104800 + }, + { + "epoch": 17.097063621533444, + "grad_norm": 0.0003696755738928914, + "learning_rate": 6.282134333079926e-05, + "loss": 0.0009, + "num_input_tokens_seen": 226158304, + "step": 104805 + }, + { + "epoch": 17.097879282218596, + "grad_norm": 0.001891187159344554, + "learning_rate": 6.278680542512866e-05, + "loss": 0.0003, + "num_input_tokens_seen": 226168896, + "step": 104810 + }, + { + "epoch": 17.09869494290375, + "grad_norm": 0.00039233764982782304, + "learning_rate": 6.275227638013803e-05, + "loss": 0.0003, + "num_input_tokens_seen": 226178720, + "step": 104815 + }, + { + "epoch": 17.099510603588907, + "grad_norm": 0.00038862694054841995, + "learning_rate": 6.271775619652719e-05, + "loss": 0.0008, + "num_input_tokens_seen": 226189888, + "step": 104820 + }, + { + "epoch": 17.100326264274063, + "grad_norm": 0.00021970065427012742, + "learning_rate": 6.268324487499583e-05, + "loss": 0.0058, + "num_input_tokens_seen": 226201280, + "step": 104825 + }, + { + "epoch": 17.10114192495922, + "grad_norm": 0.0001851923152571544, + "learning_rate": 6.264874241624324e-05, + "loss": 0.0002, + "num_input_tokens_seen": 226211488, + "step": 104830 + }, + { + "epoch": 17.10195758564437, + "grad_norm": 0.00214143143966794, + "learning_rate": 6.261424882096866e-05, + "loss": 0.004, + "num_input_tokens_seen": 226222752, + "step": 104835 + }, + { + "epoch": 17.102773246329527, + "grad_norm": 0.02651359885931015, + "learning_rate": 6.257976408987115e-05, + "loss": 0.001, + "num_input_tokens_seen": 226231872, + "step": 104840 + }, + { + "epoch": 17.103588907014682, + "grad_norm": 0.0016420877072960138, + "learning_rate": 6.254528822364985e-05, + "loss": 0.002, + "num_input_tokens_seen": 226242272, + "step": 104845 + }, + { + "epoch": 17.104404567699838, + "grad_norm": 0.0012485329061746597, + "learning_rate": 6.2510821223003e-05, + "loss": 0.001, + "num_input_tokens_seen": 226253952, + "step": 104850 + }, + { + "epoch": 17.10522022838499, + "grad_norm": 0.056864190846681595, + "learning_rate": 6.247636308862953e-05, + "loss": 0.0046, + "num_input_tokens_seen": 226264640, + "step": 104855 + }, + { + "epoch": 17.106035889070146, + "grad_norm": 0.0009735542698763311, + "learning_rate": 6.244191382122744e-05, + "loss": 0.0003, + "num_input_tokens_seen": 226274688, + "step": 104860 + }, + { + "epoch": 17.1068515497553, + "grad_norm": 0.0008831945597194135, + "learning_rate": 6.240747342149511e-05, + "loss": 0.0003, + "num_input_tokens_seen": 226286432, + "step": 104865 + }, + { + "epoch": 17.107667210440457, + "grad_norm": 0.006669166032224894, + "learning_rate": 6.237304189013049e-05, + "loss": 0.0018, + "num_input_tokens_seen": 226295456, + "step": 104870 + }, + { + "epoch": 17.108482871125613, + "grad_norm": 0.09573370218276978, + "learning_rate": 6.233861922783135e-05, + "loss": 0.0025, + "num_input_tokens_seen": 226304064, + "step": 104875 + }, + { + "epoch": 17.109298531810765, + "grad_norm": 0.0002602602180559188, + "learning_rate": 6.230420543529525e-05, + "loss": 0.0021, + "num_input_tokens_seen": 226314944, + "step": 104880 + }, + { + "epoch": 17.11011419249592, + "grad_norm": 0.00013803262845613062, + "learning_rate": 6.226980051321973e-05, + "loss": 0.0002, + "num_input_tokens_seen": 226325856, + "step": 104885 + }, + { + "epoch": 17.110929853181077, + "grad_norm": 0.0002574975951574743, + "learning_rate": 6.223540446230202e-05, + "loss": 0.0104, + "num_input_tokens_seen": 226337248, + "step": 104890 + }, + { + "epoch": 17.111745513866232, + "grad_norm": 0.0029914183542132378, + "learning_rate": 6.220101728323913e-05, + "loss": 0.0008, + "num_input_tokens_seen": 226346400, + "step": 104895 + }, + { + "epoch": 17.112561174551388, + "grad_norm": 0.0338774099946022, + "learning_rate": 6.216663897672803e-05, + "loss": 0.0065, + "num_input_tokens_seen": 226357088, + "step": 104900 + }, + { + "epoch": 17.11337683523654, + "grad_norm": 0.020565807819366455, + "learning_rate": 6.213226954346546e-05, + "loss": 0.0027, + "num_input_tokens_seen": 226367488, + "step": 104905 + }, + { + "epoch": 17.114192495921696, + "grad_norm": 0.0035928194411098957, + "learning_rate": 6.209790898414785e-05, + "loss": 0.0006, + "num_input_tokens_seen": 226378080, + "step": 104910 + }, + { + "epoch": 17.11500815660685, + "grad_norm": 0.001142794149927795, + "learning_rate": 6.206355729947171e-05, + "loss": 0.0009, + "num_input_tokens_seen": 226388736, + "step": 104915 + }, + { + "epoch": 17.115823817292007, + "grad_norm": 0.0008078089449554682, + "learning_rate": 6.20292144901331e-05, + "loss": 0.0001, + "num_input_tokens_seen": 226400640, + "step": 104920 + }, + { + "epoch": 17.116639477977163, + "grad_norm": 0.00016623934789095074, + "learning_rate": 6.199488055682806e-05, + "loss": 0.003, + "num_input_tokens_seen": 226411040, + "step": 104925 + }, + { + "epoch": 17.117455138662315, + "grad_norm": 0.005466518457978964, + "learning_rate": 6.196055550025243e-05, + "loss": 0.0004, + "num_input_tokens_seen": 226421408, + "step": 104930 + }, + { + "epoch": 17.11827079934747, + "grad_norm": 0.002257846063002944, + "learning_rate": 6.192623932110187e-05, + "loss": 0.0003, + "num_input_tokens_seen": 226432736, + "step": 104935 + }, + { + "epoch": 17.119086460032626, + "grad_norm": 0.0001249636261491105, + "learning_rate": 6.189193202007176e-05, + "loss": 0.0009, + "num_input_tokens_seen": 226443648, + "step": 104940 + }, + { + "epoch": 17.119902120717782, + "grad_norm": 0.09358309209346771, + "learning_rate": 6.185763359785729e-05, + "loss": 0.0018, + "num_input_tokens_seen": 226454880, + "step": 104945 + }, + { + "epoch": 17.120717781402938, + "grad_norm": 0.0021896101534366608, + "learning_rate": 6.182334405515399e-05, + "loss": 0.0003, + "num_input_tokens_seen": 226465920, + "step": 104950 + }, + { + "epoch": 17.12153344208809, + "grad_norm": 0.0003640578652266413, + "learning_rate": 6.178906339265622e-05, + "loss": 0.0007, + "num_input_tokens_seen": 226476832, + "step": 104955 + }, + { + "epoch": 17.122349102773246, + "grad_norm": 0.004394837189465761, + "learning_rate": 6.175479161105923e-05, + "loss": 0.0024, + "num_input_tokens_seen": 226486464, + "step": 104960 + }, + { + "epoch": 17.1231647634584, + "grad_norm": 0.01414460875093937, + "learning_rate": 6.17205287110571e-05, + "loss": 0.0013, + "num_input_tokens_seen": 226497024, + "step": 104965 + }, + { + "epoch": 17.123980424143557, + "grad_norm": 0.0002519322151783854, + "learning_rate": 6.16862746933447e-05, + "loss": 0.0053, + "num_input_tokens_seen": 226508320, + "step": 104970 + }, + { + "epoch": 17.124796084828713, + "grad_norm": 0.04256436601281166, + "learning_rate": 6.165202955861577e-05, + "loss": 0.0009, + "num_input_tokens_seen": 226518208, + "step": 104975 + }, + { + "epoch": 17.125611745513865, + "grad_norm": 0.0014837670605629683, + "learning_rate": 6.161779330756473e-05, + "loss": 0.0032, + "num_input_tokens_seen": 226529472, + "step": 104980 + }, + { + "epoch": 17.12642740619902, + "grad_norm": 0.0003776443481910974, + "learning_rate": 6.158356594088504e-05, + "loss": 0.0009, + "num_input_tokens_seen": 226539648, + "step": 104985 + }, + { + "epoch": 17.127243066884176, + "grad_norm": 0.0003498637233860791, + "learning_rate": 6.154934745927076e-05, + "loss": 0.0001, + "num_input_tokens_seen": 226549760, + "step": 104990 + }, + { + "epoch": 17.128058727569332, + "grad_norm": 0.0014423379907384515, + "learning_rate": 6.151513786341495e-05, + "loss": 0.0003, + "num_input_tokens_seen": 226559264, + "step": 104995 + }, + { + "epoch": 17.128874388254488, + "grad_norm": 0.0006168496911413968, + "learning_rate": 6.148093715401138e-05, + "loss": 0.0764, + "num_input_tokens_seen": 226568992, + "step": 105000 + }, + { + "epoch": 17.12969004893964, + "grad_norm": 0.00012300346861593425, + "learning_rate": 6.144674533175265e-05, + "loss": 0.0027, + "num_input_tokens_seen": 226580064, + "step": 105005 + }, + { + "epoch": 17.130505709624796, + "grad_norm": 0.047301918268203735, + "learning_rate": 6.141256239733212e-05, + "loss": 0.001, + "num_input_tokens_seen": 226591904, + "step": 105010 + }, + { + "epoch": 17.13132137030995, + "grad_norm": 0.010372013784945011, + "learning_rate": 6.137838835144239e-05, + "loss": 0.0014, + "num_input_tokens_seen": 226601600, + "step": 105015 + }, + { + "epoch": 17.132137030995107, + "grad_norm": 0.00014746721717529, + "learning_rate": 6.1344223194776e-05, + "loss": 0.0014, + "num_input_tokens_seen": 226612448, + "step": 105020 + }, + { + "epoch": 17.13295269168026, + "grad_norm": 0.0018850078340619802, + "learning_rate": 6.13100669280255e-05, + "loss": 0.0013, + "num_input_tokens_seen": 226623072, + "step": 105025 + }, + { + "epoch": 17.133768352365415, + "grad_norm": 0.004195782355964184, + "learning_rate": 6.127591955188295e-05, + "loss": 0.0006, + "num_input_tokens_seen": 226633856, + "step": 105030 + }, + { + "epoch": 17.13458401305057, + "grad_norm": 0.00170124729629606, + "learning_rate": 6.124178106704042e-05, + "loss": 0.0019, + "num_input_tokens_seen": 226643552, + "step": 105035 + }, + { + "epoch": 17.135399673735726, + "grad_norm": 0.0033010239712893963, + "learning_rate": 6.120765147418989e-05, + "loss": 0.0065, + "num_input_tokens_seen": 226654112, + "step": 105040 + }, + { + "epoch": 17.136215334420882, + "grad_norm": 0.0002717708994168788, + "learning_rate": 6.117353077402288e-05, + "loss": 0.0018, + "num_input_tokens_seen": 226664320, + "step": 105045 + }, + { + "epoch": 17.137030995106034, + "grad_norm": 0.0002597243874333799, + "learning_rate": 6.113941896723097e-05, + "loss": 0.0003, + "num_input_tokens_seen": 226674848, + "step": 105050 + }, + { + "epoch": 17.13784665579119, + "grad_norm": 0.0017238686559721828, + "learning_rate": 6.110531605450548e-05, + "loss": 0.0002, + "num_input_tokens_seen": 226685056, + "step": 105055 + }, + { + "epoch": 17.138662316476346, + "grad_norm": 0.02072848007082939, + "learning_rate": 6.107122203653742e-05, + "loss": 0.0007, + "num_input_tokens_seen": 226696192, + "step": 105060 + }, + { + "epoch": 17.1394779771615, + "grad_norm": 0.0015117658767849207, + "learning_rate": 6.103713691401813e-05, + "loss": 0.0003, + "num_input_tokens_seen": 226707040, + "step": 105065 + }, + { + "epoch": 17.140293637846657, + "grad_norm": 0.0015849336050450802, + "learning_rate": 6.1003060687637836e-05, + "loss": 0.0003, + "num_input_tokens_seen": 226717824, + "step": 105070 + }, + { + "epoch": 17.14110929853181, + "grad_norm": 0.0051768929697573185, + "learning_rate": 6.09689933580877e-05, + "loss": 0.0006, + "num_input_tokens_seen": 226728224, + "step": 105075 + }, + { + "epoch": 17.141924959216965, + "grad_norm": 0.014404483139514923, + "learning_rate": 6.0934934926057616e-05, + "loss": 0.0007, + "num_input_tokens_seen": 226738304, + "step": 105080 + }, + { + "epoch": 17.14274061990212, + "grad_norm": 0.00040637131314724684, + "learning_rate": 6.0900885392238316e-05, + "loss": 0.0181, + "num_input_tokens_seen": 226747296, + "step": 105085 + }, + { + "epoch": 17.143556280587276, + "grad_norm": 0.010648543015122414, + "learning_rate": 6.086684475731935e-05, + "loss": 0.0003, + "num_input_tokens_seen": 226758944, + "step": 105090 + }, + { + "epoch": 17.144371941272432, + "grad_norm": 0.042648375034332275, + "learning_rate": 6.083281302199112e-05, + "loss": 0.0008, + "num_input_tokens_seen": 226769472, + "step": 105095 + }, + { + "epoch": 17.145187601957584, + "grad_norm": 0.002539709908887744, + "learning_rate": 6.0798790186942784e-05, + "loss": 0.0037, + "num_input_tokens_seen": 226779488, + "step": 105100 + }, + { + "epoch": 17.14600326264274, + "grad_norm": 0.046020641922950745, + "learning_rate": 6.0764776252864365e-05, + "loss": 0.001, + "num_input_tokens_seen": 226790272, + "step": 105105 + }, + { + "epoch": 17.146818923327896, + "grad_norm": 0.013179299421608448, + "learning_rate": 6.073077122044479e-05, + "loss": 0.0023, + "num_input_tokens_seen": 226800960, + "step": 105110 + }, + { + "epoch": 17.14763458401305, + "grad_norm": 0.00016499825869686902, + "learning_rate": 6.069677509037358e-05, + "loss": 0.0003, + "num_input_tokens_seen": 226812064, + "step": 105115 + }, + { + "epoch": 17.148450244698207, + "grad_norm": 0.001977306790649891, + "learning_rate": 6.066278786333928e-05, + "loss": 0.0003, + "num_input_tokens_seen": 226822080, + "step": 105120 + }, + { + "epoch": 17.14926590538336, + "grad_norm": 0.03290601819753647, + "learning_rate": 6.062880954003114e-05, + "loss": 0.0012, + "num_input_tokens_seen": 226832736, + "step": 105125 + }, + { + "epoch": 17.150081566068515, + "grad_norm": 0.0011648988584056497, + "learning_rate": 6.059484012113736e-05, + "loss": 0.0008, + "num_input_tokens_seen": 226844160, + "step": 105130 + }, + { + "epoch": 17.15089722675367, + "grad_norm": 0.0002047787857009098, + "learning_rate": 6.0560879607346795e-05, + "loss": 0.0005, + "num_input_tokens_seen": 226853440, + "step": 105135 + }, + { + "epoch": 17.151712887438826, + "grad_norm": 0.002967682434245944, + "learning_rate": 6.0526927999347224e-05, + "loss": 0.0002, + "num_input_tokens_seen": 226864416, + "step": 105140 + }, + { + "epoch": 17.152528548123982, + "grad_norm": 0.022811653092503548, + "learning_rate": 6.049298529782721e-05, + "loss": 0.0009, + "num_input_tokens_seen": 226875520, + "step": 105145 + }, + { + "epoch": 17.153344208809134, + "grad_norm": 0.004361279308795929, + "learning_rate": 6.045905150347419e-05, + "loss": 0.0067, + "num_input_tokens_seen": 226886944, + "step": 105150 + }, + { + "epoch": 17.15415986949429, + "grad_norm": 0.0009651994332671165, + "learning_rate": 6.0425126616976186e-05, + "loss": 0.0034, + "num_input_tokens_seen": 226897088, + "step": 105155 + }, + { + "epoch": 17.154975530179446, + "grad_norm": 0.005272808950394392, + "learning_rate": 6.039121063902064e-05, + "loss": 0.0031, + "num_input_tokens_seen": 226908288, + "step": 105160 + }, + { + "epoch": 17.1557911908646, + "grad_norm": 0.0051519968546926975, + "learning_rate": 6.03573035702949e-05, + "loss": 0.0011, + "num_input_tokens_seen": 226918752, + "step": 105165 + }, + { + "epoch": 17.156606851549757, + "grad_norm": 0.0006230318103916943, + "learning_rate": 6.032340541148612e-05, + "loss": 0.0005, + "num_input_tokens_seen": 226929952, + "step": 105170 + }, + { + "epoch": 17.15742251223491, + "grad_norm": 0.00154141488019377, + "learning_rate": 6.0289516163281264e-05, + "loss": 0.0003, + "num_input_tokens_seen": 226941216, + "step": 105175 + }, + { + "epoch": 17.158238172920065, + "grad_norm": 0.00024147120711859316, + "learning_rate": 6.025563582636723e-05, + "loss": 0.0004, + "num_input_tokens_seen": 226952032, + "step": 105180 + }, + { + "epoch": 17.15905383360522, + "grad_norm": 0.04309641942381859, + "learning_rate": 6.0221764401430565e-05, + "loss": 0.0011, + "num_input_tokens_seen": 226963808, + "step": 105185 + }, + { + "epoch": 17.159869494290376, + "grad_norm": 0.014479360543191433, + "learning_rate": 6.0187901889157735e-05, + "loss": 0.0003, + "num_input_tokens_seen": 226974208, + "step": 105190 + }, + { + "epoch": 17.160685154975532, + "grad_norm": 0.001091653946787119, + "learning_rate": 6.015404829023502e-05, + "loss": 0.0003, + "num_input_tokens_seen": 226985888, + "step": 105195 + }, + { + "epoch": 17.161500815660684, + "grad_norm": 0.010097513906657696, + "learning_rate": 6.012020360534853e-05, + "loss": 0.0012, + "num_input_tokens_seen": 226998048, + "step": 105200 + }, + { + "epoch": 17.16231647634584, + "grad_norm": 0.0021096838172525167, + "learning_rate": 6.008636783518401e-05, + "loss": 0.0018, + "num_input_tokens_seen": 227008672, + "step": 105205 + }, + { + "epoch": 17.163132137030995, + "grad_norm": 0.006962651386857033, + "learning_rate": 6.005254098042751e-05, + "loss": 0.0005, + "num_input_tokens_seen": 227020864, + "step": 105210 + }, + { + "epoch": 17.16394779771615, + "grad_norm": 0.0011977879330515862, + "learning_rate": 6.00187230417642e-05, + "loss": 0.0004, + "num_input_tokens_seen": 227032256, + "step": 105215 + }, + { + "epoch": 17.164763458401303, + "grad_norm": 0.3444151282310486, + "learning_rate": 5.998491401987982e-05, + "loss": 0.0099, + "num_input_tokens_seen": 227042048, + "step": 105220 + }, + { + "epoch": 17.16557911908646, + "grad_norm": 0.00021099462173879147, + "learning_rate": 5.9951113915459154e-05, + "loss": 0.0003, + "num_input_tokens_seen": 227052448, + "step": 105225 + }, + { + "epoch": 17.166394779771615, + "grad_norm": 0.0016064007068052888, + "learning_rate": 5.9917322729187594e-05, + "loss": 0.0003, + "num_input_tokens_seen": 227062624, + "step": 105230 + }, + { + "epoch": 17.16721044045677, + "grad_norm": 0.0027550440281629562, + "learning_rate": 5.9883540461749596e-05, + "loss": 0.003, + "num_input_tokens_seen": 227074016, + "step": 105235 + }, + { + "epoch": 17.168026101141926, + "grad_norm": 0.0011356769828125834, + "learning_rate": 5.984976711383017e-05, + "loss": 0.0003, + "num_input_tokens_seen": 227085120, + "step": 105240 + }, + { + "epoch": 17.16884176182708, + "grad_norm": 0.04207877442240715, + "learning_rate": 5.981600268611337e-05, + "loss": 0.001, + "num_input_tokens_seen": 227095296, + "step": 105245 + }, + { + "epoch": 17.169657422512234, + "grad_norm": 0.02632550336420536, + "learning_rate": 5.9782247179283875e-05, + "loss": 0.0015, + "num_input_tokens_seen": 227105792, + "step": 105250 + }, + { + "epoch": 17.17047308319739, + "grad_norm": 0.0010110967559739947, + "learning_rate": 5.9748500594025425e-05, + "loss": 0.0002, + "num_input_tokens_seen": 227117600, + "step": 105255 + }, + { + "epoch": 17.171288743882545, + "grad_norm": 0.0012392437784001231, + "learning_rate": 5.971476293102229e-05, + "loss": 0.001, + "num_input_tokens_seen": 227129376, + "step": 105260 + }, + { + "epoch": 17.1721044045677, + "grad_norm": 0.0025958844926208258, + "learning_rate": 5.9681034190957886e-05, + "loss": 0.0066, + "num_input_tokens_seen": 227139360, + "step": 105265 + }, + { + "epoch": 17.172920065252853, + "grad_norm": 0.0016605426790192723, + "learning_rate": 5.964731437451593e-05, + "loss": 0.0002, + "num_input_tokens_seen": 227150080, + "step": 105270 + }, + { + "epoch": 17.17373572593801, + "grad_norm": 0.00020158324332442135, + "learning_rate": 5.961360348237982e-05, + "loss": 0.0004, + "num_input_tokens_seen": 227159808, + "step": 105275 + }, + { + "epoch": 17.174551386623165, + "grad_norm": 0.0002608635986689478, + "learning_rate": 5.9579901515232684e-05, + "loss": 0.0014, + "num_input_tokens_seen": 227170336, + "step": 105280 + }, + { + "epoch": 17.17536704730832, + "grad_norm": 0.0030505647882819176, + "learning_rate": 5.954620847375758e-05, + "loss": 0.0102, + "num_input_tokens_seen": 227180928, + "step": 105285 + }, + { + "epoch": 17.176182707993476, + "grad_norm": 0.03991697356104851, + "learning_rate": 5.9512524358637296e-05, + "loss": 0.0011, + "num_input_tokens_seen": 227192128, + "step": 105290 + }, + { + "epoch": 17.17699836867863, + "grad_norm": 0.000792121805716306, + "learning_rate": 5.9478849170554513e-05, + "loss": 0.0036, + "num_input_tokens_seen": 227203552, + "step": 105295 + }, + { + "epoch": 17.177814029363784, + "grad_norm": 0.001528636901639402, + "learning_rate": 5.944518291019168e-05, + "loss": 0.0004, + "num_input_tokens_seen": 227213344, + "step": 105300 + }, + { + "epoch": 17.17862969004894, + "grad_norm": 0.01104555744677782, + "learning_rate": 5.9411525578231094e-05, + "loss": 0.002, + "num_input_tokens_seen": 227224128, + "step": 105305 + }, + { + "epoch": 17.179445350734095, + "grad_norm": 0.0010944633977487683, + "learning_rate": 5.9377877175354865e-05, + "loss": 0.0004, + "num_input_tokens_seen": 227233856, + "step": 105310 + }, + { + "epoch": 17.18026101141925, + "grad_norm": 0.0027180698234587908, + "learning_rate": 5.934423770224495e-05, + "loss": 0.0009, + "num_input_tokens_seen": 227243616, + "step": 105315 + }, + { + "epoch": 17.181076672104403, + "grad_norm": 0.026389265432953835, + "learning_rate": 5.931060715958309e-05, + "loss": 0.001, + "num_input_tokens_seen": 227255840, + "step": 105320 + }, + { + "epoch": 17.18189233278956, + "grad_norm": 0.0010636255610734224, + "learning_rate": 5.9276985548050775e-05, + "loss": 0.0006, + "num_input_tokens_seen": 227266688, + "step": 105325 + }, + { + "epoch": 17.182707993474715, + "grad_norm": 0.00023665381013415754, + "learning_rate": 5.924337286832948e-05, + "loss": 0.008, + "num_input_tokens_seen": 227278016, + "step": 105330 + }, + { + "epoch": 17.18352365415987, + "grad_norm": 0.0002918170066550374, + "learning_rate": 5.9209769121100374e-05, + "loss": 0.0009, + "num_input_tokens_seen": 227287776, + "step": 105335 + }, + { + "epoch": 17.184339314845026, + "grad_norm": 0.00031399005092680454, + "learning_rate": 5.917617430704447e-05, + "loss": 0.0004, + "num_input_tokens_seen": 227298208, + "step": 105340 + }, + { + "epoch": 17.18515497553018, + "grad_norm": 0.05315567925572395, + "learning_rate": 5.9142588426842615e-05, + "loss": 0.0014, + "num_input_tokens_seen": 227308896, + "step": 105345 + }, + { + "epoch": 17.185970636215334, + "grad_norm": 0.004027301911264658, + "learning_rate": 5.9109011481175364e-05, + "loss": 0.0005, + "num_input_tokens_seen": 227318976, + "step": 105350 + }, + { + "epoch": 17.18678629690049, + "grad_norm": 0.003152494551613927, + "learning_rate": 5.907544347072352e-05, + "loss": 0.0008, + "num_input_tokens_seen": 227330624, + "step": 105355 + }, + { + "epoch": 17.187601957585645, + "grad_norm": 0.0024295493494719267, + "learning_rate": 5.904188439616692e-05, + "loss": 0.0002, + "num_input_tokens_seen": 227341600, + "step": 105360 + }, + { + "epoch": 17.1884176182708, + "grad_norm": 0.00019575019541662186, + "learning_rate": 5.9008334258186195e-05, + "loss": 0.0004, + "num_input_tokens_seen": 227351296, + "step": 105365 + }, + { + "epoch": 17.189233278955953, + "grad_norm": 0.004555529449135065, + "learning_rate": 5.897479305746079e-05, + "loss": 0.0004, + "num_input_tokens_seen": 227362208, + "step": 105370 + }, + { + "epoch": 17.19004893964111, + "grad_norm": 0.047292839735746384, + "learning_rate": 5.894126079467077e-05, + "loss": 0.0275, + "num_input_tokens_seen": 227372800, + "step": 105375 + }, + { + "epoch": 17.190864600326265, + "grad_norm": 0.36586907505989075, + "learning_rate": 5.890773747049566e-05, + "loss": 0.011, + "num_input_tokens_seen": 227382944, + "step": 105380 + }, + { + "epoch": 17.19168026101142, + "grad_norm": 0.10674824565649033, + "learning_rate": 5.88742230856148e-05, + "loss": 0.0044, + "num_input_tokens_seen": 227392768, + "step": 105385 + }, + { + "epoch": 17.192495921696572, + "grad_norm": 0.00016699254047125578, + "learning_rate": 5.884071764070736e-05, + "loss": 0.0017, + "num_input_tokens_seen": 227403200, + "step": 105390 + }, + { + "epoch": 17.193311582381728, + "grad_norm": 0.012367426417768002, + "learning_rate": 5.880722113645248e-05, + "loss": 0.0016, + "num_input_tokens_seen": 227415072, + "step": 105395 + }, + { + "epoch": 17.194127243066884, + "grad_norm": 0.0002561608562245965, + "learning_rate": 5.877373357352894e-05, + "loss": 0.0005, + "num_input_tokens_seen": 227425280, + "step": 105400 + }, + { + "epoch": 17.19494290375204, + "grad_norm": 0.0003560085315257311, + "learning_rate": 5.874025495261548e-05, + "loss": 0.0006, + "num_input_tokens_seen": 227436320, + "step": 105405 + }, + { + "epoch": 17.195758564437195, + "grad_norm": 0.0010635483777150512, + "learning_rate": 5.870678527439049e-05, + "loss": 0.0003, + "num_input_tokens_seen": 227447008, + "step": 105410 + }, + { + "epoch": 17.196574225122347, + "grad_norm": 0.0006939658196642995, + "learning_rate": 5.867332453953228e-05, + "loss": 0.0024, + "num_input_tokens_seen": 227457664, + "step": 105415 + }, + { + "epoch": 17.197389885807503, + "grad_norm": 0.0016882333438843489, + "learning_rate": 5.863987274871907e-05, + "loss": 0.0015, + "num_input_tokens_seen": 227469216, + "step": 105420 + }, + { + "epoch": 17.19820554649266, + "grad_norm": 0.0002990370849147439, + "learning_rate": 5.860642990262871e-05, + "loss": 0.0007, + "num_input_tokens_seen": 227482208, + "step": 105425 + }, + { + "epoch": 17.199021207177815, + "grad_norm": 0.010654507204890251, + "learning_rate": 5.857299600193899e-05, + "loss": 0.0004, + "num_input_tokens_seen": 227493056, + "step": 105430 + }, + { + "epoch": 17.19983686786297, + "grad_norm": 0.07037343084812164, + "learning_rate": 5.853957104732749e-05, + "loss": 0.0051, + "num_input_tokens_seen": 227504480, + "step": 105435 + }, + { + "epoch": 17.200652528548122, + "grad_norm": 0.0012316829524934292, + "learning_rate": 5.850615503947166e-05, + "loss": 0.0005, + "num_input_tokens_seen": 227513024, + "step": 105440 + }, + { + "epoch": 17.201468189233278, + "grad_norm": 0.027927590534090996, + "learning_rate": 5.8472747979048665e-05, + "loss": 0.0007, + "num_input_tokens_seen": 227524960, + "step": 105445 + }, + { + "epoch": 17.202283849918434, + "grad_norm": 0.008259845897555351, + "learning_rate": 5.843934986673549e-05, + "loss": 0.0005, + "num_input_tokens_seen": 227536416, + "step": 105450 + }, + { + "epoch": 17.20309951060359, + "grad_norm": 0.0029634875245392323, + "learning_rate": 5.840596070320914e-05, + "loss": 0.0007, + "num_input_tokens_seen": 227547136, + "step": 105455 + }, + { + "epoch": 17.203915171288745, + "grad_norm": 0.007753181271255016, + "learning_rate": 5.837258048914612e-05, + "loss": 0.0043, + "num_input_tokens_seen": 227558400, + "step": 105460 + }, + { + "epoch": 17.204730831973897, + "grad_norm": 0.0015975162386894226, + "learning_rate": 5.833920922522301e-05, + "loss": 0.0002, + "num_input_tokens_seen": 227568960, + "step": 105465 + }, + { + "epoch": 17.205546492659053, + "grad_norm": 0.00013117071648593992, + "learning_rate": 5.830584691211615e-05, + "loss": 0.0002, + "num_input_tokens_seen": 227579264, + "step": 105470 + }, + { + "epoch": 17.20636215334421, + "grad_norm": 0.44636672735214233, + "learning_rate": 5.827249355050163e-05, + "loss": 0.1446, + "num_input_tokens_seen": 227590656, + "step": 105475 + }, + { + "epoch": 17.207177814029365, + "grad_norm": 0.00048099533887580037, + "learning_rate": 5.823914914105527e-05, + "loss": 0.0067, + "num_input_tokens_seen": 227600896, + "step": 105480 + }, + { + "epoch": 17.20799347471452, + "grad_norm": 0.006845272146165371, + "learning_rate": 5.820581368445316e-05, + "loss": 0.0023, + "num_input_tokens_seen": 227611840, + "step": 105485 + }, + { + "epoch": 17.208809135399672, + "grad_norm": 0.004301134496927261, + "learning_rate": 5.817248718137053e-05, + "loss": 0.0004, + "num_input_tokens_seen": 227621632, + "step": 105490 + }, + { + "epoch": 17.209624796084828, + "grad_norm": 0.0016513109439983964, + "learning_rate": 5.8139169632483e-05, + "loss": 0.0093, + "num_input_tokens_seen": 227632320, + "step": 105495 + }, + { + "epoch": 17.210440456769984, + "grad_norm": 0.0020857781637459993, + "learning_rate": 5.810586103846577e-05, + "loss": 0.0002, + "num_input_tokens_seen": 227642784, + "step": 105500 + }, + { + "epoch": 17.21125611745514, + "grad_norm": 0.008492519147694111, + "learning_rate": 5.807256139999384e-05, + "loss": 0.005, + "num_input_tokens_seen": 227653024, + "step": 105505 + }, + { + "epoch": 17.212071778140295, + "grad_norm": 0.015060825273394585, + "learning_rate": 5.8039270717742065e-05, + "loss": 0.0006, + "num_input_tokens_seen": 227663232, + "step": 105510 + }, + { + "epoch": 17.212887438825447, + "grad_norm": 0.023475801572203636, + "learning_rate": 5.8005988992385184e-05, + "loss": 0.0007, + "num_input_tokens_seen": 227674944, + "step": 105515 + }, + { + "epoch": 17.213703099510603, + "grad_norm": 0.024295667186379433, + "learning_rate": 5.79727162245976e-05, + "loss": 0.0013, + "num_input_tokens_seen": 227685568, + "step": 105520 + }, + { + "epoch": 17.21451876019576, + "grad_norm": 0.0002447074220981449, + "learning_rate": 5.7939452415053664e-05, + "loss": 0.002, + "num_input_tokens_seen": 227696032, + "step": 105525 + }, + { + "epoch": 17.215334420880914, + "grad_norm": 0.0003919857263099402, + "learning_rate": 5.7906197564427557e-05, + "loss": 0.0014, + "num_input_tokens_seen": 227706336, + "step": 105530 + }, + { + "epoch": 17.21615008156607, + "grad_norm": 0.0210751723498106, + "learning_rate": 5.7872951673393184e-05, + "loss": 0.001, + "num_input_tokens_seen": 227717632, + "step": 105535 + }, + { + "epoch": 17.216965742251222, + "grad_norm": 0.005465270951390266, + "learning_rate": 5.7839714742624284e-05, + "loss": 0.0025, + "num_input_tokens_seen": 227728544, + "step": 105540 + }, + { + "epoch": 17.217781402936378, + "grad_norm": 0.00745729124173522, + "learning_rate": 5.780648677279454e-05, + "loss": 0.0003, + "num_input_tokens_seen": 227740192, + "step": 105545 + }, + { + "epoch": 17.218597063621534, + "grad_norm": 0.0003603911027312279, + "learning_rate": 5.777326776457725e-05, + "loss": 0.001, + "num_input_tokens_seen": 227750816, + "step": 105550 + }, + { + "epoch": 17.21941272430669, + "grad_norm": 0.0011287000961601734, + "learning_rate": 5.774005771864571e-05, + "loss": 0.0022, + "num_input_tokens_seen": 227762784, + "step": 105555 + }, + { + "epoch": 17.22022838499184, + "grad_norm": 0.007719321176409721, + "learning_rate": 5.7706856635672986e-05, + "loss": 0.0012, + "num_input_tokens_seen": 227772672, + "step": 105560 + }, + { + "epoch": 17.221044045676997, + "grad_norm": 0.00017907106666825712, + "learning_rate": 5.767366451633188e-05, + "loss": 0.0002, + "num_input_tokens_seen": 227782304, + "step": 105565 + }, + { + "epoch": 17.221859706362153, + "grad_norm": 0.21372942626476288, + "learning_rate": 5.764048136129507e-05, + "loss": 0.0047, + "num_input_tokens_seen": 227793088, + "step": 105570 + }, + { + "epoch": 17.22267536704731, + "grad_norm": 0.0008800557116046548, + "learning_rate": 5.760730717123508e-05, + "loss": 0.0002, + "num_input_tokens_seen": 227804704, + "step": 105575 + }, + { + "epoch": 17.223491027732464, + "grad_norm": 0.03665255010128021, + "learning_rate": 5.757414194682426e-05, + "loss": 0.0005, + "num_input_tokens_seen": 227815296, + "step": 105580 + }, + { + "epoch": 17.224306688417617, + "grad_norm": 0.013609733432531357, + "learning_rate": 5.754098568873456e-05, + "loss": 0.0025, + "num_input_tokens_seen": 227826656, + "step": 105585 + }, + { + "epoch": 17.225122349102772, + "grad_norm": 0.009662250988185406, + "learning_rate": 5.7507838397638346e-05, + "loss": 0.0004, + "num_input_tokens_seen": 227836832, + "step": 105590 + }, + { + "epoch": 17.225938009787928, + "grad_norm": 0.00258969166316092, + "learning_rate": 5.7474700074206856e-05, + "loss": 0.0002, + "num_input_tokens_seen": 227848640, + "step": 105595 + }, + { + "epoch": 17.226753670473084, + "grad_norm": 0.0017230057856068015, + "learning_rate": 5.7441570719112216e-05, + "loss": 0.0003, + "num_input_tokens_seen": 227858624, + "step": 105600 + }, + { + "epoch": 17.22756933115824, + "grad_norm": 0.0019686620216816664, + "learning_rate": 5.740845033302533e-05, + "loss": 0.0003, + "num_input_tokens_seen": 227869792, + "step": 105605 + }, + { + "epoch": 17.22838499184339, + "grad_norm": 0.0114396121352911, + "learning_rate": 5.737533891661789e-05, + "loss": 0.0085, + "num_input_tokens_seen": 227879712, + "step": 105610 + }, + { + "epoch": 17.229200652528547, + "grad_norm": 0.0009749207529239357, + "learning_rate": 5.734223647056053e-05, + "loss": 0.0026, + "num_input_tokens_seen": 227891744, + "step": 105615 + }, + { + "epoch": 17.230016313213703, + "grad_norm": 0.00017402649973519146, + "learning_rate": 5.7309142995524475e-05, + "loss": 0.0035, + "num_input_tokens_seen": 227901664, + "step": 105620 + }, + { + "epoch": 17.23083197389886, + "grad_norm": 0.00041545473504811525, + "learning_rate": 5.7276058492179984e-05, + "loss": 0.0002, + "num_input_tokens_seen": 227912160, + "step": 105625 + }, + { + "epoch": 17.231647634584014, + "grad_norm": 0.010690425522625446, + "learning_rate": 5.724298296119796e-05, + "loss": 0.001, + "num_input_tokens_seen": 227920928, + "step": 105630 + }, + { + "epoch": 17.232463295269167, + "grad_norm": 0.00011045865539927036, + "learning_rate": 5.7209916403248574e-05, + "loss": 0.0011, + "num_input_tokens_seen": 227930880, + "step": 105635 + }, + { + "epoch": 17.233278955954322, + "grad_norm": 0.006294461898505688, + "learning_rate": 5.717685881900192e-05, + "loss": 0.0004, + "num_input_tokens_seen": 227941408, + "step": 105640 + }, + { + "epoch": 17.234094616639478, + "grad_norm": 0.00013909833796788007, + "learning_rate": 5.714381020912801e-05, + "loss": 0.0019, + "num_input_tokens_seen": 227952032, + "step": 105645 + }, + { + "epoch": 17.234910277324634, + "grad_norm": 0.0008638381259515882, + "learning_rate": 5.711077057429659e-05, + "loss": 0.0002, + "num_input_tokens_seen": 227962912, + "step": 105650 + }, + { + "epoch": 17.23572593800979, + "grad_norm": 0.019511796534061432, + "learning_rate": 5.7077739915177226e-05, + "loss": 0.0007, + "num_input_tokens_seen": 227973952, + "step": 105655 + }, + { + "epoch": 17.23654159869494, + "grad_norm": 0.00047131560859270394, + "learning_rate": 5.704471823243934e-05, + "loss": 0.0003, + "num_input_tokens_seen": 227984320, + "step": 105660 + }, + { + "epoch": 17.237357259380097, + "grad_norm": 0.004795871675014496, + "learning_rate": 5.701170552675217e-05, + "loss": 0.0007, + "num_input_tokens_seen": 227996128, + "step": 105665 + }, + { + "epoch": 17.238172920065253, + "grad_norm": 0.015304290689527988, + "learning_rate": 5.6978701798784785e-05, + "loss": 0.0006, + "num_input_tokens_seen": 228006976, + "step": 105670 + }, + { + "epoch": 17.23898858075041, + "grad_norm": 0.013266884721815586, + "learning_rate": 5.6945707049205985e-05, + "loss": 0.0253, + "num_input_tokens_seen": 228019040, + "step": 105675 + }, + { + "epoch": 17.239804241435564, + "grad_norm": 0.0014411715092137456, + "learning_rate": 5.691272127868452e-05, + "loss": 0.0001, + "num_input_tokens_seen": 228028832, + "step": 105680 + }, + { + "epoch": 17.240619902120716, + "grad_norm": 0.002396760042756796, + "learning_rate": 5.6879744487888854e-05, + "loss": 0.0002, + "num_input_tokens_seen": 228040288, + "step": 105685 + }, + { + "epoch": 17.241435562805872, + "grad_norm": 0.0037071153055876493, + "learning_rate": 5.684677667748717e-05, + "loss": 0.0018, + "num_input_tokens_seen": 228050752, + "step": 105690 + }, + { + "epoch": 17.242251223491028, + "grad_norm": 0.003118707099929452, + "learning_rate": 5.681381784814799e-05, + "loss": 0.0013, + "num_input_tokens_seen": 228062144, + "step": 105695 + }, + { + "epoch": 17.243066884176184, + "grad_norm": 0.0002485084696672857, + "learning_rate": 5.678086800053878e-05, + "loss": 0.0676, + "num_input_tokens_seen": 228073152, + "step": 105700 + }, + { + "epoch": 17.24388254486134, + "grad_norm": 0.025430649518966675, + "learning_rate": 5.674792713532772e-05, + "loss": 0.0636, + "num_input_tokens_seen": 228084544, + "step": 105705 + }, + { + "epoch": 17.24469820554649, + "grad_norm": 0.0011166390031576157, + "learning_rate": 5.671499525318208e-05, + "loss": 0.0002, + "num_input_tokens_seen": 228094656, + "step": 105710 + }, + { + "epoch": 17.245513866231647, + "grad_norm": 0.035816699266433716, + "learning_rate": 5.668207235476957e-05, + "loss": 0.0028, + "num_input_tokens_seen": 228104544, + "step": 105715 + }, + { + "epoch": 17.246329526916803, + "grad_norm": 0.005151256453245878, + "learning_rate": 5.664915844075702e-05, + "loss": 0.0008, + "num_input_tokens_seen": 228115072, + "step": 105720 + }, + { + "epoch": 17.24714518760196, + "grad_norm": 0.0011781752109527588, + "learning_rate": 5.6616253511811934e-05, + "loss": 0.0003, + "num_input_tokens_seen": 228124736, + "step": 105725 + }, + { + "epoch": 17.247960848287114, + "grad_norm": 0.00013962779485154897, + "learning_rate": 5.6583357568600776e-05, + "loss": 0.0004, + "num_input_tokens_seen": 228135008, + "step": 105730 + }, + { + "epoch": 17.248776508972266, + "grad_norm": 0.00789322704076767, + "learning_rate": 5.6550470611790584e-05, + "loss": 0.0016, + "num_input_tokens_seen": 228146208, + "step": 105735 + }, + { + "epoch": 17.249592169657422, + "grad_norm": 0.17940707504749298, + "learning_rate": 5.6517592642047424e-05, + "loss": 0.0035, + "num_input_tokens_seen": 228156192, + "step": 105740 + }, + { + "epoch": 17.250407830342578, + "grad_norm": 0.00036730911233462393, + "learning_rate": 5.648472366003804e-05, + "loss": 0.0049, + "num_input_tokens_seen": 228166464, + "step": 105745 + }, + { + "epoch": 17.251223491027734, + "grad_norm": 0.013420809991657734, + "learning_rate": 5.6451863666428236e-05, + "loss": 0.005, + "num_input_tokens_seen": 228178112, + "step": 105750 + }, + { + "epoch": 17.252039151712886, + "grad_norm": 0.0009929410880431533, + "learning_rate": 5.6419012661884206e-05, + "loss": 0.0027, + "num_input_tokens_seen": 228189792, + "step": 105755 + }, + { + "epoch": 17.25285481239804, + "grad_norm": 0.024342723190784454, + "learning_rate": 5.6386170647071464e-05, + "loss": 0.0013, + "num_input_tokens_seen": 228200928, + "step": 105760 + }, + { + "epoch": 17.253670473083197, + "grad_norm": 0.004579450003802776, + "learning_rate": 5.6353337622655935e-05, + "loss": 0.0007, + "num_input_tokens_seen": 228212480, + "step": 105765 + }, + { + "epoch": 17.254486133768353, + "grad_norm": 0.020500490441918373, + "learning_rate": 5.632051358930263e-05, + "loss": 0.0021, + "num_input_tokens_seen": 228223616, + "step": 105770 + }, + { + "epoch": 17.25530179445351, + "grad_norm": 0.03280545398592949, + "learning_rate": 5.628769854767707e-05, + "loss": 0.0017, + "num_input_tokens_seen": 228234720, + "step": 105775 + }, + { + "epoch": 17.25611745513866, + "grad_norm": 0.00033209254615940154, + "learning_rate": 5.6254892498444175e-05, + "loss": 0.0017, + "num_input_tokens_seen": 228244832, + "step": 105780 + }, + { + "epoch": 17.256933115823816, + "grad_norm": 0.0006637353799305856, + "learning_rate": 5.6222095442268805e-05, + "loss": 0.0006, + "num_input_tokens_seen": 228255264, + "step": 105785 + }, + { + "epoch": 17.257748776508972, + "grad_norm": 0.02377443201839924, + "learning_rate": 5.6189307379815645e-05, + "loss": 0.0008, + "num_input_tokens_seen": 228266944, + "step": 105790 + }, + { + "epoch": 17.258564437194128, + "grad_norm": 0.0002918375248555094, + "learning_rate": 5.615652831174917e-05, + "loss": 0.0008, + "num_input_tokens_seen": 228278400, + "step": 105795 + }, + { + "epoch": 17.259380097879284, + "grad_norm": 0.6315314173698425, + "learning_rate": 5.612375823873373e-05, + "loss": 0.0163, + "num_input_tokens_seen": 228289280, + "step": 105800 + }, + { + "epoch": 17.260195758564436, + "grad_norm": 0.10928847640752792, + "learning_rate": 5.60909971614334e-05, + "loss": 0.0039, + "num_input_tokens_seen": 228300672, + "step": 105805 + }, + { + "epoch": 17.26101141924959, + "grad_norm": 0.002070619259029627, + "learning_rate": 5.605824508051216e-05, + "loss": 0.0013, + "num_input_tokens_seen": 228311360, + "step": 105810 + }, + { + "epoch": 17.261827079934747, + "grad_norm": 0.0024908948689699173, + "learning_rate": 5.602550199663381e-05, + "loss": 0.0001, + "num_input_tokens_seen": 228322048, + "step": 105815 + }, + { + "epoch": 17.262642740619903, + "grad_norm": 0.003324811113998294, + "learning_rate": 5.599276791046182e-05, + "loss": 0.0003, + "num_input_tokens_seen": 228332128, + "step": 105820 + }, + { + "epoch": 17.26345840130506, + "grad_norm": 0.000472935673315078, + "learning_rate": 5.5960042822659596e-05, + "loss": 0.0006, + "num_input_tokens_seen": 228342880, + "step": 105825 + }, + { + "epoch": 17.26427406199021, + "grad_norm": 0.448454886674881, + "learning_rate": 5.592732673389056e-05, + "loss": 0.01, + "num_input_tokens_seen": 228353536, + "step": 105830 + }, + { + "epoch": 17.265089722675366, + "grad_norm": 0.01826861873269081, + "learning_rate": 5.5894619644817455e-05, + "loss": 0.005, + "num_input_tokens_seen": 228365312, + "step": 105835 + }, + { + "epoch": 17.265905383360522, + "grad_norm": 0.00028996451874263585, + "learning_rate": 5.586192155610342e-05, + "loss": 0.0003, + "num_input_tokens_seen": 228376064, + "step": 105840 + }, + { + "epoch": 17.266721044045678, + "grad_norm": 0.006420481484383345, + "learning_rate": 5.582923246841082e-05, + "loss": 0.001, + "num_input_tokens_seen": 228387424, + "step": 105845 + }, + { + "epoch": 17.267536704730833, + "grad_norm": 0.009194244630634785, + "learning_rate": 5.5796552382402446e-05, + "loss": 0.0025, + "num_input_tokens_seen": 228396864, + "step": 105850 + }, + { + "epoch": 17.268352365415986, + "grad_norm": 0.0013687668833881617, + "learning_rate": 5.576388129874027e-05, + "loss": 0.0004, + "num_input_tokens_seen": 228406912, + "step": 105855 + }, + { + "epoch": 17.26916802610114, + "grad_norm": 0.0013235857477411628, + "learning_rate": 5.5731219218086824e-05, + "loss": 0.0008, + "num_input_tokens_seen": 228417760, + "step": 105860 + }, + { + "epoch": 17.269983686786297, + "grad_norm": 0.00015787457232363522, + "learning_rate": 5.569856614110358e-05, + "loss": 0.0016, + "num_input_tokens_seen": 228427424, + "step": 105865 + }, + { + "epoch": 17.270799347471453, + "grad_norm": 0.003471981268376112, + "learning_rate": 5.566592206845272e-05, + "loss": 0.0029, + "num_input_tokens_seen": 228437248, + "step": 105870 + }, + { + "epoch": 17.27161500815661, + "grad_norm": 0.0009811193449422717, + "learning_rate": 5.563328700079545e-05, + "loss": 0.0033, + "num_input_tokens_seen": 228447360, + "step": 105875 + }, + { + "epoch": 17.27243066884176, + "grad_norm": 0.000607303692959249, + "learning_rate": 5.560066093879351e-05, + "loss": 0.0005, + "num_input_tokens_seen": 228457984, + "step": 105880 + }, + { + "epoch": 17.273246329526916, + "grad_norm": 0.0005140582215972245, + "learning_rate": 5.556804388310777e-05, + "loss": 0.0005, + "num_input_tokens_seen": 228468000, + "step": 105885 + }, + { + "epoch": 17.274061990212072, + "grad_norm": 0.0003740263928193599, + "learning_rate": 5.5535435834399626e-05, + "loss": 0.0128, + "num_input_tokens_seen": 228479200, + "step": 105890 + }, + { + "epoch": 17.274877650897228, + "grad_norm": 0.034084126353263855, + "learning_rate": 5.550283679332951e-05, + "loss": 0.0013, + "num_input_tokens_seen": 228490112, + "step": 105895 + }, + { + "epoch": 17.275693311582383, + "grad_norm": 0.013113155961036682, + "learning_rate": 5.5470246760558455e-05, + "loss": 0.0004, + "num_input_tokens_seen": 228500096, + "step": 105900 + }, + { + "epoch": 17.276508972267536, + "grad_norm": 0.1485210508108139, + "learning_rate": 5.543766573674663e-05, + "loss": 0.0036, + "num_input_tokens_seen": 228511264, + "step": 105905 + }, + { + "epoch": 17.27732463295269, + "grad_norm": 0.005449605640023947, + "learning_rate": 5.5405093722554534e-05, + "loss": 0.0003, + "num_input_tokens_seen": 228523168, + "step": 105910 + }, + { + "epoch": 17.278140293637847, + "grad_norm": 0.00012212347064632922, + "learning_rate": 5.5372530718642235e-05, + "loss": 0.0011, + "num_input_tokens_seen": 228534304, + "step": 105915 + }, + { + "epoch": 17.278955954323003, + "grad_norm": 0.0010362443281337619, + "learning_rate": 5.533997672566965e-05, + "loss": 0.0005, + "num_input_tokens_seen": 228542944, + "step": 105920 + }, + { + "epoch": 17.27977161500816, + "grad_norm": 0.0012470668880268931, + "learning_rate": 5.5307431744296534e-05, + "loss": 0.0012, + "num_input_tokens_seen": 228553120, + "step": 105925 + }, + { + "epoch": 17.28058727569331, + "grad_norm": 0.021871287375688553, + "learning_rate": 5.5274895775182464e-05, + "loss": 0.0003, + "num_input_tokens_seen": 228564352, + "step": 105930 + }, + { + "epoch": 17.281402936378466, + "grad_norm": 0.007044681813567877, + "learning_rate": 5.524236881898681e-05, + "loss": 0.0005, + "num_input_tokens_seen": 228574848, + "step": 105935 + }, + { + "epoch": 17.282218597063622, + "grad_norm": 0.0025700305122882128, + "learning_rate": 5.5209850876368705e-05, + "loss": 0.0005, + "num_input_tokens_seen": 228586560, + "step": 105940 + }, + { + "epoch": 17.283034257748778, + "grad_norm": 0.004051051568239927, + "learning_rate": 5.517734194798729e-05, + "loss": 0.0004, + "num_input_tokens_seen": 228596928, + "step": 105945 + }, + { + "epoch": 17.28384991843393, + "grad_norm": 0.00021742185344919562, + "learning_rate": 5.514484203450132e-05, + "loss": 0.0002, + "num_input_tokens_seen": 228608576, + "step": 105950 + }, + { + "epoch": 17.284665579119086, + "grad_norm": 0.00047381609329022467, + "learning_rate": 5.511235113656943e-05, + "loss": 0.0003, + "num_input_tokens_seen": 228619616, + "step": 105955 + }, + { + "epoch": 17.28548123980424, + "grad_norm": 0.002084142994135618, + "learning_rate": 5.50798692548502e-05, + "loss": 0.0003, + "num_input_tokens_seen": 228629664, + "step": 105960 + }, + { + "epoch": 17.286296900489397, + "grad_norm": 0.009078197181224823, + "learning_rate": 5.504739639000178e-05, + "loss": 0.0019, + "num_input_tokens_seen": 228640384, + "step": 105965 + }, + { + "epoch": 17.287112561174553, + "grad_norm": 0.000528795353602618, + "learning_rate": 5.501493254268225e-05, + "loss": 0.0002, + "num_input_tokens_seen": 228651424, + "step": 105970 + }, + { + "epoch": 17.287928221859705, + "grad_norm": 0.00025582790840417147, + "learning_rate": 5.4982477713549806e-05, + "loss": 0.0032, + "num_input_tokens_seen": 228662720, + "step": 105975 + }, + { + "epoch": 17.28874388254486, + "grad_norm": 0.0011853290488943458, + "learning_rate": 5.495003190326181e-05, + "loss": 0.006, + "num_input_tokens_seen": 228673920, + "step": 105980 + }, + { + "epoch": 17.289559543230016, + "grad_norm": 0.011951207183301449, + "learning_rate": 5.491759511247618e-05, + "loss": 0.0003, + "num_input_tokens_seen": 228684128, + "step": 105985 + }, + { + "epoch": 17.290375203915172, + "grad_norm": 0.006131981033831835, + "learning_rate": 5.488516734184995e-05, + "loss": 0.0208, + "num_input_tokens_seen": 228695296, + "step": 105990 + }, + { + "epoch": 17.291190864600328, + "grad_norm": 0.0015121581964194775, + "learning_rate": 5.485274859204065e-05, + "loss": 0.0004, + "num_input_tokens_seen": 228706496, + "step": 105995 + }, + { + "epoch": 17.29200652528548, + "grad_norm": 0.0008288269746117294, + "learning_rate": 5.482033886370491e-05, + "loss": 0.0011, + "num_input_tokens_seen": 228716896, + "step": 106000 + }, + { + "epoch": 17.292822185970635, + "grad_norm": 0.001796778873540461, + "learning_rate": 5.478793815749994e-05, + "loss": 0.0004, + "num_input_tokens_seen": 228727520, + "step": 106005 + }, + { + "epoch": 17.29363784665579, + "grad_norm": 0.0002680800389498472, + "learning_rate": 5.4755546474082044e-05, + "loss": 0.002, + "num_input_tokens_seen": 228738816, + "step": 106010 + }, + { + "epoch": 17.294453507340947, + "grad_norm": 0.00013674308138433844, + "learning_rate": 5.472316381410786e-05, + "loss": 0.0044, + "num_input_tokens_seen": 228749344, + "step": 106015 + }, + { + "epoch": 17.295269168026103, + "grad_norm": 0.0016212406335398555, + "learning_rate": 5.46907901782337e-05, + "loss": 0.0002, + "num_input_tokens_seen": 228761152, + "step": 106020 + }, + { + "epoch": 17.296084828711255, + "grad_norm": 0.0003002994053531438, + "learning_rate": 5.4658425567115535e-05, + "loss": 0.0056, + "num_input_tokens_seen": 228772352, + "step": 106025 + }, + { + "epoch": 17.29690048939641, + "grad_norm": 0.0004346913192421198, + "learning_rate": 5.4626069981409395e-05, + "loss": 0.0013, + "num_input_tokens_seen": 228783328, + "step": 106030 + }, + { + "epoch": 17.297716150081566, + "grad_norm": 0.000790232908912003, + "learning_rate": 5.459372342177088e-05, + "loss": 0.0002, + "num_input_tokens_seen": 228795392, + "step": 106035 + }, + { + "epoch": 17.298531810766722, + "grad_norm": 0.0025121932849287987, + "learning_rate": 5.456138588885562e-05, + "loss": 0.0088, + "num_input_tokens_seen": 228805888, + "step": 106040 + }, + { + "epoch": 17.299347471451878, + "grad_norm": 0.00296417367644608, + "learning_rate": 5.452905738331898e-05, + "loss": 0.0006, + "num_input_tokens_seen": 228817824, + "step": 106045 + }, + { + "epoch": 17.30016313213703, + "grad_norm": 0.0004501324438024312, + "learning_rate": 5.449673790581611e-05, + "loss": 0.0012, + "num_input_tokens_seen": 228829856, + "step": 106050 + }, + { + "epoch": 17.300978792822185, + "grad_norm": 0.01644732616841793, + "learning_rate": 5.446442745700198e-05, + "loss": 0.0016, + "num_input_tokens_seen": 228840864, + "step": 106055 + }, + { + "epoch": 17.30179445350734, + "grad_norm": 0.003633821615949273, + "learning_rate": 5.443212603753145e-05, + "loss": 0.0028, + "num_input_tokens_seen": 228851904, + "step": 106060 + }, + { + "epoch": 17.302610114192497, + "grad_norm": 0.0003840426215901971, + "learning_rate": 5.439983364805912e-05, + "loss": 0.0011, + "num_input_tokens_seen": 228863360, + "step": 106065 + }, + { + "epoch": 17.303425774877653, + "grad_norm": 0.0026016277261078358, + "learning_rate": 5.436755028923945e-05, + "loss": 0.0018, + "num_input_tokens_seen": 228875008, + "step": 106070 + }, + { + "epoch": 17.304241435562805, + "grad_norm": 0.003150344593450427, + "learning_rate": 5.433527596172666e-05, + "loss": 0.0003, + "num_input_tokens_seen": 228886272, + "step": 106075 + }, + { + "epoch": 17.30505709624796, + "grad_norm": 0.017075197771191597, + "learning_rate": 5.430301066617493e-05, + "loss": 0.0148, + "num_input_tokens_seen": 228897952, + "step": 106080 + }, + { + "epoch": 17.305872756933116, + "grad_norm": 0.00038182083517313004, + "learning_rate": 5.4270754403238034e-05, + "loss": 0.0004, + "num_input_tokens_seen": 228909792, + "step": 106085 + }, + { + "epoch": 17.306688417618272, + "grad_norm": 0.00461201136931777, + "learning_rate": 5.4238507173569816e-05, + "loss": 0.001, + "num_input_tokens_seen": 228921056, + "step": 106090 + }, + { + "epoch": 17.307504078303428, + "grad_norm": 0.016461290419101715, + "learning_rate": 5.420626897782366e-05, + "loss": 0.0009, + "num_input_tokens_seen": 228931232, + "step": 106095 + }, + { + "epoch": 17.30831973898858, + "grad_norm": 0.006799416150897741, + "learning_rate": 5.417403981665309e-05, + "loss": 0.0003, + "num_input_tokens_seen": 228940512, + "step": 106100 + }, + { + "epoch": 17.309135399673735, + "grad_norm": 0.00012546725338324904, + "learning_rate": 5.414181969071108e-05, + "loss": 0.0012, + "num_input_tokens_seen": 228951840, + "step": 106105 + }, + { + "epoch": 17.30995106035889, + "grad_norm": 0.008052029646933079, + "learning_rate": 5.410960860065073e-05, + "loss": 0.0014, + "num_input_tokens_seen": 228962400, + "step": 106110 + }, + { + "epoch": 17.310766721044047, + "grad_norm": 0.0025201477110385895, + "learning_rate": 5.407740654712473e-05, + "loss": 0.0042, + "num_input_tokens_seen": 228972832, + "step": 106115 + }, + { + "epoch": 17.3115823817292, + "grad_norm": 0.003215489676222205, + "learning_rate": 5.4045213530785896e-05, + "loss": 0.0016, + "num_input_tokens_seen": 228982752, + "step": 106120 + }, + { + "epoch": 17.312398042414355, + "grad_norm": 0.002369002439081669, + "learning_rate": 5.401302955228654e-05, + "loss": 0.0004, + "num_input_tokens_seen": 228992064, + "step": 106125 + }, + { + "epoch": 17.31321370309951, + "grad_norm": 0.0002680527395568788, + "learning_rate": 5.398085461227886e-05, + "loss": 0.0014, + "num_input_tokens_seen": 229002752, + "step": 106130 + }, + { + "epoch": 17.314029363784666, + "grad_norm": 0.00018555365386418998, + "learning_rate": 5.394868871141506e-05, + "loss": 0.003, + "num_input_tokens_seen": 229014272, + "step": 106135 + }, + { + "epoch": 17.31484502446982, + "grad_norm": 0.0006892183446325362, + "learning_rate": 5.3916531850346895e-05, + "loss": 0.0022, + "num_input_tokens_seen": 229023936, + "step": 106140 + }, + { + "epoch": 17.315660685154974, + "grad_norm": 0.00020649514044634998, + "learning_rate": 5.388438402972612e-05, + "loss": 0.0003, + "num_input_tokens_seen": 229034272, + "step": 106145 + }, + { + "epoch": 17.31647634584013, + "grad_norm": 0.00021816276421304792, + "learning_rate": 5.385224525020421e-05, + "loss": 0.0013, + "num_input_tokens_seen": 229045696, + "step": 106150 + }, + { + "epoch": 17.317292006525285, + "grad_norm": 0.03758164122700691, + "learning_rate": 5.382011551243254e-05, + "loss": 0.0004, + "num_input_tokens_seen": 229055424, + "step": 106155 + }, + { + "epoch": 17.31810766721044, + "grad_norm": 0.0017299660248681903, + "learning_rate": 5.3787994817062256e-05, + "loss": 0.0007, + "num_input_tokens_seen": 229066528, + "step": 106160 + }, + { + "epoch": 17.318923327895597, + "grad_norm": 0.001880289171822369, + "learning_rate": 5.3755883164744335e-05, + "loss": 0.0006, + "num_input_tokens_seen": 229075264, + "step": 106165 + }, + { + "epoch": 17.31973898858075, + "grad_norm": 0.00023340160259976983, + "learning_rate": 5.372378055612953e-05, + "loss": 0.0003, + "num_input_tokens_seen": 229085600, + "step": 106170 + }, + { + "epoch": 17.320554649265905, + "grad_norm": 0.01466730423271656, + "learning_rate": 5.369168699186844e-05, + "loss": 0.002, + "num_input_tokens_seen": 229094912, + "step": 106175 + }, + { + "epoch": 17.32137030995106, + "grad_norm": 0.00207155873067677, + "learning_rate": 5.365960247261148e-05, + "loss": 0.0019, + "num_input_tokens_seen": 229106496, + "step": 106180 + }, + { + "epoch": 17.322185970636216, + "grad_norm": 0.016799405217170715, + "learning_rate": 5.3627526999008966e-05, + "loss": 0.0007, + "num_input_tokens_seen": 229116960, + "step": 106185 + }, + { + "epoch": 17.32300163132137, + "grad_norm": 0.007079609204083681, + "learning_rate": 5.359546057171083e-05, + "loss": 0.0006, + "num_input_tokens_seen": 229127808, + "step": 106190 + }, + { + "epoch": 17.323817292006524, + "grad_norm": 0.012293044477701187, + "learning_rate": 5.356340319136699e-05, + "loss": 0.0003, + "num_input_tokens_seen": 229138752, + "step": 106195 + }, + { + "epoch": 17.32463295269168, + "grad_norm": 0.0008502997225150466, + "learning_rate": 5.353135485862715e-05, + "loss": 0.0003, + "num_input_tokens_seen": 229150560, + "step": 106200 + }, + { + "epoch": 17.325448613376835, + "grad_norm": 0.004772664979100227, + "learning_rate": 5.3499315574140784e-05, + "loss": 0.001, + "num_input_tokens_seen": 229162464, + "step": 106205 + }, + { + "epoch": 17.32626427406199, + "grad_norm": 0.004363675136119127, + "learning_rate": 5.3467285338557213e-05, + "loss": 0.0006, + "num_input_tokens_seen": 229172384, + "step": 106210 + }, + { + "epoch": 17.327079934747147, + "grad_norm": 0.00044891564175486565, + "learning_rate": 5.343526415252553e-05, + "loss": 0.0003, + "num_input_tokens_seen": 229182656, + "step": 106215 + }, + { + "epoch": 17.3278955954323, + "grad_norm": 0.0005116835818625987, + "learning_rate": 5.340325201669477e-05, + "loss": 0.0006, + "num_input_tokens_seen": 229193952, + "step": 106220 + }, + { + "epoch": 17.328711256117455, + "grad_norm": 0.00029387185350060463, + "learning_rate": 5.337124893171358e-05, + "loss": 0.001, + "num_input_tokens_seen": 229205120, + "step": 106225 + }, + { + "epoch": 17.32952691680261, + "grad_norm": 0.0018498359713703394, + "learning_rate": 5.333925489823077e-05, + "loss": 0.0004, + "num_input_tokens_seen": 229216608, + "step": 106230 + }, + { + "epoch": 17.330342577487766, + "grad_norm": 0.0008364535751752555, + "learning_rate": 5.330726991689439e-05, + "loss": 0.0004, + "num_input_tokens_seen": 229226592, + "step": 106235 + }, + { + "epoch": 17.33115823817292, + "grad_norm": 0.017117716372013092, + "learning_rate": 5.327529398835307e-05, + "loss": 0.0005, + "num_input_tokens_seen": 229238240, + "step": 106240 + }, + { + "epoch": 17.331973898858074, + "grad_norm": 0.0012407746398821473, + "learning_rate": 5.324332711325447e-05, + "loss": 0.0005, + "num_input_tokens_seen": 229250240, + "step": 106245 + }, + { + "epoch": 17.33278955954323, + "grad_norm": 0.2877473831176758, + "learning_rate": 5.3211369292246735e-05, + "loss": 0.0111, + "num_input_tokens_seen": 229262784, + "step": 106250 + }, + { + "epoch": 17.333605220228385, + "grad_norm": 0.013145347125828266, + "learning_rate": 5.317942052597724e-05, + "loss": 0.0006, + "num_input_tokens_seen": 229273696, + "step": 106255 + }, + { + "epoch": 17.33442088091354, + "grad_norm": 0.00016302223957609385, + "learning_rate": 5.3147480815093684e-05, + "loss": 0.0027, + "num_input_tokens_seen": 229284256, + "step": 106260 + }, + { + "epoch": 17.335236541598697, + "grad_norm": 0.0003913108666893095, + "learning_rate": 5.311555016024328e-05, + "loss": 0.0011, + "num_input_tokens_seen": 229295744, + "step": 106265 + }, + { + "epoch": 17.33605220228385, + "grad_norm": 0.00483996607363224, + "learning_rate": 5.308362856207322e-05, + "loss": 0.0004, + "num_input_tokens_seen": 229307328, + "step": 106270 + }, + { + "epoch": 17.336867862969005, + "grad_norm": 0.0005214695120230317, + "learning_rate": 5.3051716021230375e-05, + "loss": 0.0003, + "num_input_tokens_seen": 229317472, + "step": 106275 + }, + { + "epoch": 17.33768352365416, + "grad_norm": 0.08287959545850754, + "learning_rate": 5.3019812538361466e-05, + "loss": 0.0024, + "num_input_tokens_seen": 229327360, + "step": 106280 + }, + { + "epoch": 17.338499184339316, + "grad_norm": 0.012906786054372787, + "learning_rate": 5.298791811411313e-05, + "loss": 0.0006, + "num_input_tokens_seen": 229337888, + "step": 106285 + }, + { + "epoch": 17.339314845024468, + "grad_norm": 0.019009793177247047, + "learning_rate": 5.295603274913169e-05, + "loss": 0.0009, + "num_input_tokens_seen": 229349440, + "step": 106290 + }, + { + "epoch": 17.340130505709624, + "grad_norm": 0.0007152992184273899, + "learning_rate": 5.292415644406334e-05, + "loss": 0.0002, + "num_input_tokens_seen": 229360032, + "step": 106295 + }, + { + "epoch": 17.34094616639478, + "grad_norm": 0.0007699343841522932, + "learning_rate": 5.289228919955413e-05, + "loss": 0.0077, + "num_input_tokens_seen": 229370848, + "step": 106300 + }, + { + "epoch": 17.341761827079935, + "grad_norm": 0.017016250640153885, + "learning_rate": 5.286043101624988e-05, + "loss": 0.0006, + "num_input_tokens_seen": 229382464, + "step": 106305 + }, + { + "epoch": 17.34257748776509, + "grad_norm": 0.0005845070700161159, + "learning_rate": 5.2828581894796226e-05, + "loss": 0.0072, + "num_input_tokens_seen": 229393280, + "step": 106310 + }, + { + "epoch": 17.343393148450243, + "grad_norm": 0.0001427880924893543, + "learning_rate": 5.2796741835838656e-05, + "loss": 0.0001, + "num_input_tokens_seen": 229404608, + "step": 106315 + }, + { + "epoch": 17.3442088091354, + "grad_norm": 0.0002691572590265423, + "learning_rate": 5.276491084002238e-05, + "loss": 0.0011, + "num_input_tokens_seen": 229415456, + "step": 106320 + }, + { + "epoch": 17.345024469820554, + "grad_norm": 0.00023667467758059502, + "learning_rate": 5.273308890799261e-05, + "loss": 0.0002, + "num_input_tokens_seen": 229426464, + "step": 106325 + }, + { + "epoch": 17.34584013050571, + "grad_norm": 0.004824917763471603, + "learning_rate": 5.270127604039404e-05, + "loss": 0.0002, + "num_input_tokens_seen": 229438016, + "step": 106330 + }, + { + "epoch": 17.346655791190866, + "grad_norm": 0.0016199341043829918, + "learning_rate": 5.266947223787177e-05, + "loss": 0.0004, + "num_input_tokens_seen": 229448448, + "step": 106335 + }, + { + "epoch": 17.347471451876018, + "grad_norm": 0.00015872031508479267, + "learning_rate": 5.263767750106996e-05, + "loss": 0.0011, + "num_input_tokens_seen": 229457984, + "step": 106340 + }, + { + "epoch": 17.348287112561174, + "grad_norm": 0.0008224875200539827, + "learning_rate": 5.2605891830633304e-05, + "loss": 0.0008, + "num_input_tokens_seen": 229468512, + "step": 106345 + }, + { + "epoch": 17.34910277324633, + "grad_norm": 0.013289394788444042, + "learning_rate": 5.257411522720562e-05, + "loss": 0.0023, + "num_input_tokens_seen": 229479392, + "step": 106350 + }, + { + "epoch": 17.349918433931485, + "grad_norm": 0.0003221442748326808, + "learning_rate": 5.2542347691431235e-05, + "loss": 0.0008, + "num_input_tokens_seen": 229488640, + "step": 106355 + }, + { + "epoch": 17.35073409461664, + "grad_norm": 0.013211195357143879, + "learning_rate": 5.251058922395368e-05, + "loss": 0.0004, + "num_input_tokens_seen": 229498464, + "step": 106360 + }, + { + "epoch": 17.351549755301793, + "grad_norm": 0.29433706402778625, + "learning_rate": 5.24788398254169e-05, + "loss": 0.0103, + "num_input_tokens_seen": 229508704, + "step": 106365 + }, + { + "epoch": 17.35236541598695, + "grad_norm": 0.0184700358659029, + "learning_rate": 5.2447099496463925e-05, + "loss": 0.0009, + "num_input_tokens_seen": 229519200, + "step": 106370 + }, + { + "epoch": 17.353181076672104, + "grad_norm": 0.001096223364584148, + "learning_rate": 5.241536823773846e-05, + "loss": 0.0008, + "num_input_tokens_seen": 229530336, + "step": 106375 + }, + { + "epoch": 17.35399673735726, + "grad_norm": 0.0007579786470159888, + "learning_rate": 5.238364604988316e-05, + "loss": 0.0074, + "num_input_tokens_seen": 229540768, + "step": 106380 + }, + { + "epoch": 17.354812398042416, + "grad_norm": 0.016769718378782272, + "learning_rate": 5.235193293354129e-05, + "loss": 0.0004, + "num_input_tokens_seen": 229550976, + "step": 106385 + }, + { + "epoch": 17.355628058727568, + "grad_norm": 0.04806697368621826, + "learning_rate": 5.2320228889355224e-05, + "loss": 0.0011, + "num_input_tokens_seen": 229561920, + "step": 106390 + }, + { + "epoch": 17.356443719412724, + "grad_norm": 0.006219265982508659, + "learning_rate": 5.228853391796784e-05, + "loss": 0.0008, + "num_input_tokens_seen": 229572000, + "step": 106395 + }, + { + "epoch": 17.35725938009788, + "grad_norm": 0.010985017754137516, + "learning_rate": 5.225684802002106e-05, + "loss": 0.0006, + "num_input_tokens_seen": 229582816, + "step": 106400 + }, + { + "epoch": 17.358075040783035, + "grad_norm": 0.00018111498502548784, + "learning_rate": 5.222517119615733e-05, + "loss": 0.0003, + "num_input_tokens_seen": 229594176, + "step": 106405 + }, + { + "epoch": 17.35889070146819, + "grad_norm": 0.002602667547762394, + "learning_rate": 5.2193503447018564e-05, + "loss": 0.001, + "num_input_tokens_seen": 229605312, + "step": 106410 + }, + { + "epoch": 17.359706362153343, + "grad_norm": 0.00032914700568653643, + "learning_rate": 5.216184477324659e-05, + "loss": 0.0032, + "num_input_tokens_seen": 229616192, + "step": 106415 + }, + { + "epoch": 17.3605220228385, + "grad_norm": 0.3959972560405731, + "learning_rate": 5.2130195175482896e-05, + "loss": 0.0103, + "num_input_tokens_seen": 229627456, + "step": 106420 + }, + { + "epoch": 17.361337683523654, + "grad_norm": 0.0006621154025197029, + "learning_rate": 5.209855465436897e-05, + "loss": 0.0001, + "num_input_tokens_seen": 229638688, + "step": 106425 + }, + { + "epoch": 17.36215334420881, + "grad_norm": 0.0010241816053166986, + "learning_rate": 5.2066923210546015e-05, + "loss": 0.0012, + "num_input_tokens_seen": 229649056, + "step": 106430 + }, + { + "epoch": 17.362969004893966, + "grad_norm": 0.009524654597043991, + "learning_rate": 5.203530084465513e-05, + "loss": 0.0005, + "num_input_tokens_seen": 229661024, + "step": 106435 + }, + { + "epoch": 17.363784665579118, + "grad_norm": 0.2556225657463074, + "learning_rate": 5.20036875573372e-05, + "loss": 0.0053, + "num_input_tokens_seen": 229670304, + "step": 106440 + }, + { + "epoch": 17.364600326264274, + "grad_norm": 0.00032961269607767463, + "learning_rate": 5.197208334923281e-05, + "loss": 0.0002, + "num_input_tokens_seen": 229682368, + "step": 106445 + }, + { + "epoch": 17.36541598694943, + "grad_norm": 0.00016183225670829415, + "learning_rate": 5.1940488220982516e-05, + "loss": 0.0002, + "num_input_tokens_seen": 229692768, + "step": 106450 + }, + { + "epoch": 17.366231647634585, + "grad_norm": 0.0002187024219892919, + "learning_rate": 5.1908902173226524e-05, + "loss": 0.0003, + "num_input_tokens_seen": 229704320, + "step": 106455 + }, + { + "epoch": 17.36704730831974, + "grad_norm": 0.0034628326538950205, + "learning_rate": 5.1877325206605316e-05, + "loss": 0.0536, + "num_input_tokens_seen": 229715360, + "step": 106460 + }, + { + "epoch": 17.367862969004893, + "grad_norm": 0.01701584830880165, + "learning_rate": 5.1845757321758394e-05, + "loss": 0.0007, + "num_input_tokens_seen": 229727744, + "step": 106465 + }, + { + "epoch": 17.36867862969005, + "grad_norm": 0.0002982603618875146, + "learning_rate": 5.181419851932589e-05, + "loss": 0.0003, + "num_input_tokens_seen": 229738528, + "step": 106470 + }, + { + "epoch": 17.369494290375204, + "grad_norm": 0.0002628464426379651, + "learning_rate": 5.178264879994704e-05, + "loss": 0.0001, + "num_input_tokens_seen": 229749408, + "step": 106475 + }, + { + "epoch": 17.37030995106036, + "grad_norm": 0.053000468760728836, + "learning_rate": 5.17511081642616e-05, + "loss": 0.0013, + "num_input_tokens_seen": 229759072, + "step": 106480 + }, + { + "epoch": 17.371125611745512, + "grad_norm": 0.0012487275525927544, + "learning_rate": 5.171957661290838e-05, + "loss": 0.0003, + "num_input_tokens_seen": 229769888, + "step": 106485 + }, + { + "epoch": 17.371941272430668, + "grad_norm": 0.00022184985573403537, + "learning_rate": 5.1688054146526886e-05, + "loss": 0.0004, + "num_input_tokens_seen": 229781920, + "step": 106490 + }, + { + "epoch": 17.372756933115824, + "grad_norm": 0.0002451998880133033, + "learning_rate": 5.165654076575543e-05, + "loss": 0.0005, + "num_input_tokens_seen": 229793248, + "step": 106495 + }, + { + "epoch": 17.37357259380098, + "grad_norm": 0.0002106036408804357, + "learning_rate": 5.162503647123318e-05, + "loss": 0.0003, + "num_input_tokens_seen": 229803328, + "step": 106500 + }, + { + "epoch": 17.374388254486135, + "grad_norm": 0.0001926125551108271, + "learning_rate": 5.159354126359816e-05, + "loss": 0.0007, + "num_input_tokens_seen": 229814304, + "step": 106505 + }, + { + "epoch": 17.375203915171287, + "grad_norm": 0.00021141576871741563, + "learning_rate": 5.156205514348905e-05, + "loss": 0.0003, + "num_input_tokens_seen": 229824928, + "step": 106510 + }, + { + "epoch": 17.376019575856443, + "grad_norm": 0.00019295100355520844, + "learning_rate": 5.1530578111543605e-05, + "loss": 0.0015, + "num_input_tokens_seen": 229834976, + "step": 106515 + }, + { + "epoch": 17.3768352365416, + "grad_norm": 0.00033570147934369743, + "learning_rate": 5.149911016840009e-05, + "loss": 0.0014, + "num_input_tokens_seen": 229845216, + "step": 106520 + }, + { + "epoch": 17.377650897226754, + "grad_norm": 0.01954066939651966, + "learning_rate": 5.146765131469594e-05, + "loss": 0.0091, + "num_input_tokens_seen": 229856896, + "step": 106525 + }, + { + "epoch": 17.37846655791191, + "grad_norm": 0.00038411637069657445, + "learning_rate": 5.1436201551068987e-05, + "loss": 0.0007, + "num_input_tokens_seen": 229867168, + "step": 106530 + }, + { + "epoch": 17.379282218597062, + "grad_norm": 0.0018491477239876986, + "learning_rate": 5.140476087815621e-05, + "loss": 0.0005, + "num_input_tokens_seen": 229878176, + "step": 106535 + }, + { + "epoch": 17.380097879282218, + "grad_norm": 0.0001909626880660653, + "learning_rate": 5.137332929659522e-05, + "loss": 0.0002, + "num_input_tokens_seen": 229888384, + "step": 106540 + }, + { + "epoch": 17.380913539967374, + "grad_norm": 0.00026629571220837533, + "learning_rate": 5.134190680702278e-05, + "loss": 0.001, + "num_input_tokens_seen": 229899200, + "step": 106545 + }, + { + "epoch": 17.38172920065253, + "grad_norm": 0.0004384434432722628, + "learning_rate": 5.1310493410075765e-05, + "loss": 0.0002, + "num_input_tokens_seen": 229909088, + "step": 106550 + }, + { + "epoch": 17.382544861337685, + "grad_norm": 0.00019313773373141885, + "learning_rate": 5.127908910639084e-05, + "loss": 0.001, + "num_input_tokens_seen": 229919840, + "step": 106555 + }, + { + "epoch": 17.383360522022837, + "grad_norm": 0.016378013417124748, + "learning_rate": 5.1247693896604386e-05, + "loss": 0.0011, + "num_input_tokens_seen": 229930272, + "step": 106560 + }, + { + "epoch": 17.384176182707993, + "grad_norm": 0.00027041565044783056, + "learning_rate": 5.1216307781352724e-05, + "loss": 0.0009, + "num_input_tokens_seen": 229940864, + "step": 106565 + }, + { + "epoch": 17.38499184339315, + "grad_norm": 0.0029941725078970194, + "learning_rate": 5.11849307612719e-05, + "loss": 0.0002, + "num_input_tokens_seen": 229952640, + "step": 106570 + }, + { + "epoch": 17.385807504078304, + "grad_norm": 0.00020495610078796744, + "learning_rate": 5.115356283699779e-05, + "loss": 0.0021, + "num_input_tokens_seen": 229963424, + "step": 106575 + }, + { + "epoch": 17.38662316476346, + "grad_norm": 0.0008458722149953246, + "learning_rate": 5.112220400916617e-05, + "loss": 0.0003, + "num_input_tokens_seen": 229974912, + "step": 106580 + }, + { + "epoch": 17.387438825448612, + "grad_norm": 0.0001893808221211657, + "learning_rate": 5.109085427841248e-05, + "loss": 0.0003, + "num_input_tokens_seen": 229985568, + "step": 106585 + }, + { + "epoch": 17.388254486133768, + "grad_norm": 0.00023286663054022938, + "learning_rate": 5.1059513645372146e-05, + "loss": 0.0004, + "num_input_tokens_seen": 229997024, + "step": 106590 + }, + { + "epoch": 17.389070146818923, + "grad_norm": 0.00015590531984344125, + "learning_rate": 5.1028182110680275e-05, + "loss": 0.0002, + "num_input_tokens_seen": 230007936, + "step": 106595 + }, + { + "epoch": 17.38988580750408, + "grad_norm": 0.0010861444752663374, + "learning_rate": 5.0996859674971805e-05, + "loss": 0.001, + "num_input_tokens_seen": 230018528, + "step": 106600 + }, + { + "epoch": 17.390701468189235, + "grad_norm": 0.0005482531851157546, + "learning_rate": 5.096554633888173e-05, + "loss": 0.0003, + "num_input_tokens_seen": 230028224, + "step": 106605 + }, + { + "epoch": 17.391517128874387, + "grad_norm": 0.0014682140899822116, + "learning_rate": 5.093424210304426e-05, + "loss": 0.0013, + "num_input_tokens_seen": 230039072, + "step": 106610 + }, + { + "epoch": 17.392332789559543, + "grad_norm": 0.0002388711873209104, + "learning_rate": 5.090294696809428e-05, + "loss": 0.0024, + "num_input_tokens_seen": 230050976, + "step": 106615 + }, + { + "epoch": 17.3931484502447, + "grad_norm": 7.840488251531497e-05, + "learning_rate": 5.087166093466566e-05, + "loss": 0.0001, + "num_input_tokens_seen": 230061472, + "step": 106620 + }, + { + "epoch": 17.393964110929854, + "grad_norm": 0.3183748126029968, + "learning_rate": 5.0840384003392745e-05, + "loss": 0.0115, + "num_input_tokens_seen": 230072032, + "step": 106625 + }, + { + "epoch": 17.39477977161501, + "grad_norm": 0.00010872969141928479, + "learning_rate": 5.080911617490902e-05, + "loss": 0.0008, + "num_input_tokens_seen": 230082688, + "step": 106630 + }, + { + "epoch": 17.395595432300162, + "grad_norm": 0.00040338674443773925, + "learning_rate": 5.0777857449848644e-05, + "loss": 0.0008, + "num_input_tokens_seen": 230094272, + "step": 106635 + }, + { + "epoch": 17.396411092985318, + "grad_norm": 0.002177192596718669, + "learning_rate": 5.074660782884461e-05, + "loss": 0.0012, + "num_input_tokens_seen": 230104928, + "step": 106640 + }, + { + "epoch": 17.397226753670473, + "grad_norm": 9.600628254702315e-05, + "learning_rate": 5.071536731253074e-05, + "loss": 0.0035, + "num_input_tokens_seen": 230115520, + "step": 106645 + }, + { + "epoch": 17.39804241435563, + "grad_norm": 0.0339292548596859, + "learning_rate": 5.0684135901539694e-05, + "loss": 0.0083, + "num_input_tokens_seen": 230126336, + "step": 106650 + }, + { + "epoch": 17.39885807504078, + "grad_norm": 0.0021789383608847857, + "learning_rate": 5.0652913596504704e-05, + "loss": 0.0002, + "num_input_tokens_seen": 230137120, + "step": 106655 + }, + { + "epoch": 17.399673735725937, + "grad_norm": 0.00033596681896597147, + "learning_rate": 5.062170039805847e-05, + "loss": 0.0005, + "num_input_tokens_seen": 230148480, + "step": 106660 + }, + { + "epoch": 17.400489396411093, + "grad_norm": 0.0008120349957607687, + "learning_rate": 5.05904963068336e-05, + "loss": 0.0005, + "num_input_tokens_seen": 230159744, + "step": 106665 + }, + { + "epoch": 17.40130505709625, + "grad_norm": 0.002797073684632778, + "learning_rate": 5.055930132346237e-05, + "loss": 0.0006, + "num_input_tokens_seen": 230170432, + "step": 106670 + }, + { + "epoch": 17.402120717781404, + "grad_norm": 0.03989632800221443, + "learning_rate": 5.0528115448577105e-05, + "loss": 0.0017, + "num_input_tokens_seen": 230182112, + "step": 106675 + }, + { + "epoch": 17.402936378466556, + "grad_norm": 0.01214857678860426, + "learning_rate": 5.0496938682809744e-05, + "loss": 0.12, + "num_input_tokens_seen": 230193344, + "step": 106680 + }, + { + "epoch": 17.403752039151712, + "grad_norm": 0.0006154502043500543, + "learning_rate": 5.0465771026792175e-05, + "loss": 0.0005, + "num_input_tokens_seen": 230204480, + "step": 106685 + }, + { + "epoch": 17.404567699836868, + "grad_norm": 0.0004532141610980034, + "learning_rate": 5.043461248115605e-05, + "loss": 0.0266, + "num_input_tokens_seen": 230215584, + "step": 106690 + }, + { + "epoch": 17.405383360522023, + "grad_norm": 0.002548135118559003, + "learning_rate": 5.040346304653276e-05, + "loss": 0.0004, + "num_input_tokens_seen": 230226816, + "step": 106695 + }, + { + "epoch": 17.40619902120718, + "grad_norm": 0.0013706674799323082, + "learning_rate": 5.037232272355369e-05, + "loss": 0.0002, + "num_input_tokens_seen": 230239104, + "step": 106700 + }, + { + "epoch": 17.40701468189233, + "grad_norm": 0.0031612992752343416, + "learning_rate": 5.034119151284988e-05, + "loss": 0.0049, + "num_input_tokens_seen": 230249248, + "step": 106705 + }, + { + "epoch": 17.407830342577487, + "grad_norm": 0.0032532510813325644, + "learning_rate": 5.031006941505228e-05, + "loss": 0.0008, + "num_input_tokens_seen": 230259840, + "step": 106710 + }, + { + "epoch": 17.408646003262643, + "grad_norm": 0.003893185406923294, + "learning_rate": 5.0278956430791555e-05, + "loss": 0.0948, + "num_input_tokens_seen": 230270400, + "step": 106715 + }, + { + "epoch": 17.4094616639478, + "grad_norm": 0.0027550188824534416, + "learning_rate": 5.0247852560698304e-05, + "loss": 0.0016, + "num_input_tokens_seen": 230281760, + "step": 106720 + }, + { + "epoch": 17.410277324632954, + "grad_norm": 0.00015496286505367607, + "learning_rate": 5.0216757805402856e-05, + "loss": 0.001, + "num_input_tokens_seen": 230292288, + "step": 106725 + }, + { + "epoch": 17.411092985318106, + "grad_norm": 0.001585847930982709, + "learning_rate": 5.018567216553543e-05, + "loss": 0.003, + "num_input_tokens_seen": 230303712, + "step": 106730 + }, + { + "epoch": 17.411908646003262, + "grad_norm": 0.00018380499386694282, + "learning_rate": 5.015459564172597e-05, + "loss": 0.0005, + "num_input_tokens_seen": 230314528, + "step": 106735 + }, + { + "epoch": 17.412724306688418, + "grad_norm": 0.00019137914932798594, + "learning_rate": 5.0123528234604307e-05, + "loss": 0.0013, + "num_input_tokens_seen": 230325248, + "step": 106740 + }, + { + "epoch": 17.413539967373573, + "grad_norm": 0.003777424106374383, + "learning_rate": 5.009246994479999e-05, + "loss": 0.0009, + "num_input_tokens_seen": 230335104, + "step": 106745 + }, + { + "epoch": 17.41435562805873, + "grad_norm": 0.01063018012791872, + "learning_rate": 5.006142077294268e-05, + "loss": 0.0286, + "num_input_tokens_seen": 230346496, + "step": 106750 + }, + { + "epoch": 17.41517128874388, + "grad_norm": 0.005791367031633854, + "learning_rate": 5.003038071966126e-05, + "loss": 0.0038, + "num_input_tokens_seen": 230357536, + "step": 106755 + }, + { + "epoch": 17.415986949429037, + "grad_norm": 0.00046310084871947765, + "learning_rate": 4.999934978558513e-05, + "loss": 0.0003, + "num_input_tokens_seen": 230368000, + "step": 106760 + }, + { + "epoch": 17.416802610114193, + "grad_norm": 0.00045191802200861275, + "learning_rate": 4.996832797134299e-05, + "loss": 0.0023, + "num_input_tokens_seen": 230379520, + "step": 106765 + }, + { + "epoch": 17.41761827079935, + "grad_norm": 0.0022176422644406557, + "learning_rate": 4.9937315277563625e-05, + "loss": 0.0101, + "num_input_tokens_seen": 230390304, + "step": 106770 + }, + { + "epoch": 17.418433931484504, + "grad_norm": 0.0001474494783906266, + "learning_rate": 4.990631170487553e-05, + "loss": 0.0002, + "num_input_tokens_seen": 230400608, + "step": 106775 + }, + { + "epoch": 17.419249592169656, + "grad_norm": 0.00011739812907762825, + "learning_rate": 4.987531725390698e-05, + "loss": 0.0005, + "num_input_tokens_seen": 230411456, + "step": 106780 + }, + { + "epoch": 17.420065252854812, + "grad_norm": 0.0033404682762920856, + "learning_rate": 4.9844331925286145e-05, + "loss": 0.0005, + "num_input_tokens_seen": 230422816, + "step": 106785 + }, + { + "epoch": 17.420880913539968, + "grad_norm": 0.0011010445887222886, + "learning_rate": 4.981335571964102e-05, + "loss": 0.0002, + "num_input_tokens_seen": 230433856, + "step": 106790 + }, + { + "epoch": 17.421696574225123, + "grad_norm": 0.10387986153364182, + "learning_rate": 4.978238863759932e-05, + "loss": 0.0019, + "num_input_tokens_seen": 230444704, + "step": 106795 + }, + { + "epoch": 17.42251223491028, + "grad_norm": 0.0006742404657416046, + "learning_rate": 4.975143067978866e-05, + "loss": 0.0007, + "num_input_tokens_seen": 230455232, + "step": 106800 + }, + { + "epoch": 17.42332789559543, + "grad_norm": 0.02422933094203472, + "learning_rate": 4.9720481846836416e-05, + "loss": 0.0676, + "num_input_tokens_seen": 230465920, + "step": 106805 + }, + { + "epoch": 17.424143556280587, + "grad_norm": 0.00021187974198255688, + "learning_rate": 4.968954213936988e-05, + "loss": 0.0001, + "num_input_tokens_seen": 230474944, + "step": 106810 + }, + { + "epoch": 17.424959216965743, + "grad_norm": 0.0013578995130956173, + "learning_rate": 4.9658611558015984e-05, + "loss": 0.0026, + "num_input_tokens_seen": 230486912, + "step": 106815 + }, + { + "epoch": 17.4257748776509, + "grad_norm": 0.0002529274206608534, + "learning_rate": 4.962769010340163e-05, + "loss": 0.0004, + "num_input_tokens_seen": 230498048, + "step": 106820 + }, + { + "epoch": 17.42659053833605, + "grad_norm": 0.0045602452009916306, + "learning_rate": 4.959677777615351e-05, + "loss": 0.0002, + "num_input_tokens_seen": 230509120, + "step": 106825 + }, + { + "epoch": 17.427406199021206, + "grad_norm": 0.0008446667343378067, + "learning_rate": 4.956587457689804e-05, + "loss": 0.003, + "num_input_tokens_seen": 230520224, + "step": 106830 + }, + { + "epoch": 17.428221859706362, + "grad_norm": 0.0007012642454355955, + "learning_rate": 4.953498050626154e-05, + "loss": 0.0023, + "num_input_tokens_seen": 230530144, + "step": 106835 + }, + { + "epoch": 17.429037520391518, + "grad_norm": 0.0001134545891545713, + "learning_rate": 4.9504095564870124e-05, + "loss": 0.0002, + "num_input_tokens_seen": 230540288, + "step": 106840 + }, + { + "epoch": 17.429853181076673, + "grad_norm": 0.00016923660587053746, + "learning_rate": 4.947321975334967e-05, + "loss": 0.0003, + "num_input_tokens_seen": 230552096, + "step": 106845 + }, + { + "epoch": 17.430668841761825, + "grad_norm": 0.014024686999619007, + "learning_rate": 4.944235307232597e-05, + "loss": 0.0006, + "num_input_tokens_seen": 230562976, + "step": 106850 + }, + { + "epoch": 17.43148450244698, + "grad_norm": 0.018372340127825737, + "learning_rate": 4.941149552242458e-05, + "loss": 0.0012, + "num_input_tokens_seen": 230574688, + "step": 106855 + }, + { + "epoch": 17.432300163132137, + "grad_norm": 0.002177318325266242, + "learning_rate": 4.9380647104270814e-05, + "loss": 0.0007, + "num_input_tokens_seen": 230586528, + "step": 106860 + }, + { + "epoch": 17.433115823817293, + "grad_norm": 0.0006069048540666699, + "learning_rate": 4.93498078184898e-05, + "loss": 0.0003, + "num_input_tokens_seen": 230598016, + "step": 106865 + }, + { + "epoch": 17.43393148450245, + "grad_norm": 0.010042526759207249, + "learning_rate": 4.9318977665706866e-05, + "loss": 0.0045, + "num_input_tokens_seen": 230609312, + "step": 106870 + }, + { + "epoch": 17.4347471451876, + "grad_norm": 0.0119392741471529, + "learning_rate": 4.928815664654635e-05, + "loss": 0.0983, + "num_input_tokens_seen": 230620608, + "step": 106875 + }, + { + "epoch": 17.435562805872756, + "grad_norm": 0.023288115859031677, + "learning_rate": 4.9257344761633236e-05, + "loss": 0.0007, + "num_input_tokens_seen": 230630016, + "step": 106880 + }, + { + "epoch": 17.436378466557912, + "grad_norm": 0.2284889668226242, + "learning_rate": 4.9226542011591716e-05, + "loss": 0.0022, + "num_input_tokens_seen": 230639392, + "step": 106885 + }, + { + "epoch": 17.437194127243067, + "grad_norm": 0.0031613903120160103, + "learning_rate": 4.919574839704627e-05, + "loss": 0.0247, + "num_input_tokens_seen": 230650688, + "step": 106890 + }, + { + "epoch": 17.438009787928223, + "grad_norm": 5.948877151240595e-05, + "learning_rate": 4.916496391862085e-05, + "loss": 0.0034, + "num_input_tokens_seen": 230661760, + "step": 106895 + }, + { + "epoch": 17.438825448613375, + "grad_norm": 0.2740603983402252, + "learning_rate": 4.913418857693936e-05, + "loss": 0.0046, + "num_input_tokens_seen": 230673056, + "step": 106900 + }, + { + "epoch": 17.43964110929853, + "grad_norm": 8.32523001008667e-05, + "learning_rate": 4.9103422372625496e-05, + "loss": 0.0002, + "num_input_tokens_seen": 230684736, + "step": 106905 + }, + { + "epoch": 17.440456769983687, + "grad_norm": 0.000422953482484445, + "learning_rate": 4.907266530630278e-05, + "loss": 0.0002, + "num_input_tokens_seen": 230695232, + "step": 106910 + }, + { + "epoch": 17.441272430668842, + "grad_norm": 0.0021661666687577963, + "learning_rate": 4.904191737859454e-05, + "loss": 0.0015, + "num_input_tokens_seen": 230705824, + "step": 106915 + }, + { + "epoch": 17.442088091353998, + "grad_norm": 0.03287786245346069, + "learning_rate": 4.901117859012394e-05, + "loss": 0.016, + "num_input_tokens_seen": 230716864, + "step": 106920 + }, + { + "epoch": 17.44290375203915, + "grad_norm": 0.6260375380516052, + "learning_rate": 4.898044894151393e-05, + "loss": 0.0321, + "num_input_tokens_seen": 230728000, + "step": 106925 + }, + { + "epoch": 17.443719412724306, + "grad_norm": 0.09280558675527573, + "learning_rate": 4.894972843338724e-05, + "loss": 0.0047, + "num_input_tokens_seen": 230738560, + "step": 106930 + }, + { + "epoch": 17.44453507340946, + "grad_norm": 0.0009539235034026206, + "learning_rate": 4.891901706636653e-05, + "loss": 0.0013, + "num_input_tokens_seen": 230749408, + "step": 106935 + }, + { + "epoch": 17.445350734094617, + "grad_norm": 0.0019774583633989096, + "learning_rate": 4.88883148410742e-05, + "loss": 0.0019, + "num_input_tokens_seen": 230757856, + "step": 106940 + }, + { + "epoch": 17.446166394779773, + "grad_norm": 0.08651755005121231, + "learning_rate": 4.885762175813241e-05, + "loss": 0.0024, + "num_input_tokens_seen": 230769344, + "step": 106945 + }, + { + "epoch": 17.446982055464925, + "grad_norm": 0.029628556221723557, + "learning_rate": 4.882693781816327e-05, + "loss": 0.0021, + "num_input_tokens_seen": 230779520, + "step": 106950 + }, + { + "epoch": 17.44779771615008, + "grad_norm": 0.0013729347847402096, + "learning_rate": 4.8796263021788524e-05, + "loss": 0.0002, + "num_input_tokens_seen": 230789472, + "step": 106955 + }, + { + "epoch": 17.448613376835237, + "grad_norm": 0.514598548412323, + "learning_rate": 4.876559736962999e-05, + "loss": 0.017, + "num_input_tokens_seen": 230800704, + "step": 106960 + }, + { + "epoch": 17.449429037520392, + "grad_norm": 0.006085938308387995, + "learning_rate": 4.8734940862309006e-05, + "loss": 0.0021, + "num_input_tokens_seen": 230810624, + "step": 106965 + }, + { + "epoch": 17.450244698205548, + "grad_norm": 0.06879095733165741, + "learning_rate": 4.8704293500446806e-05, + "loss": 0.0021, + "num_input_tokens_seen": 230821248, + "step": 106970 + }, + { + "epoch": 17.4510603588907, + "grad_norm": 0.004780937451869249, + "learning_rate": 4.867365528466477e-05, + "loss": 0.0002, + "num_input_tokens_seen": 230832288, + "step": 106975 + }, + { + "epoch": 17.451876019575856, + "grad_norm": 0.013082013465464115, + "learning_rate": 4.864302621558353e-05, + "loss": 0.075, + "num_input_tokens_seen": 230842816, + "step": 106980 + }, + { + "epoch": 17.45269168026101, + "grad_norm": 0.00034014901029877365, + "learning_rate": 4.861240629382413e-05, + "loss": 0.0004, + "num_input_tokens_seen": 230851936, + "step": 106985 + }, + { + "epoch": 17.453507340946167, + "grad_norm": 0.0027973665855824947, + "learning_rate": 4.858179552000674e-05, + "loss": 0.0004, + "num_input_tokens_seen": 230862432, + "step": 106990 + }, + { + "epoch": 17.454323001631323, + "grad_norm": 0.0006131393602117896, + "learning_rate": 4.85511938947521e-05, + "loss": 0.0098, + "num_input_tokens_seen": 230873536, + "step": 106995 + }, + { + "epoch": 17.455138662316475, + "grad_norm": 0.0004974842886440456, + "learning_rate": 4.8520601418680085e-05, + "loss": 0.0002, + "num_input_tokens_seen": 230883904, + "step": 107000 + }, + { + "epoch": 17.45595432300163, + "grad_norm": 0.023120643571019173, + "learning_rate": 4.849001809241099e-05, + "loss": 0.0007, + "num_input_tokens_seen": 230893312, + "step": 107005 + }, + { + "epoch": 17.456769983686787, + "grad_norm": 0.0011372148292139173, + "learning_rate": 4.845944391656426e-05, + "loss": 0.0068, + "num_input_tokens_seen": 230904192, + "step": 107010 + }, + { + "epoch": 17.457585644371942, + "grad_norm": 0.009961425326764584, + "learning_rate": 4.84288788917599e-05, + "loss": 0.001, + "num_input_tokens_seen": 230914272, + "step": 107015 + }, + { + "epoch": 17.458401305057095, + "grad_norm": 0.0006256845081225038, + "learning_rate": 4.839832301861696e-05, + "loss": 0.0019, + "num_input_tokens_seen": 230925088, + "step": 107020 + }, + { + "epoch": 17.45921696574225, + "grad_norm": 0.00014246850332710892, + "learning_rate": 4.836777629775513e-05, + "loss": 0.0009, + "num_input_tokens_seen": 230935808, + "step": 107025 + }, + { + "epoch": 17.460032626427406, + "grad_norm": 0.06999503821134567, + "learning_rate": 4.833723872979306e-05, + "loss": 0.0017, + "num_input_tokens_seen": 230946368, + "step": 107030 + }, + { + "epoch": 17.46084828711256, + "grad_norm": 0.00010566286073299125, + "learning_rate": 4.830671031534989e-05, + "loss": 0.0002, + "num_input_tokens_seen": 230957024, + "step": 107035 + }, + { + "epoch": 17.461663947797717, + "grad_norm": 0.0004541479574982077, + "learning_rate": 4.827619105504427e-05, + "loss": 0.0002, + "num_input_tokens_seen": 230967360, + "step": 107040 + }, + { + "epoch": 17.46247960848287, + "grad_norm": 0.00042015130748040974, + "learning_rate": 4.8245680949494664e-05, + "loss": 0.0014, + "num_input_tokens_seen": 230977024, + "step": 107045 + }, + { + "epoch": 17.463295269168025, + "grad_norm": 0.00017645805201027542, + "learning_rate": 4.821517999931946e-05, + "loss": 0.0003, + "num_input_tokens_seen": 230988928, + "step": 107050 + }, + { + "epoch": 17.46411092985318, + "grad_norm": 0.004381467122584581, + "learning_rate": 4.8184688205136716e-05, + "loss": 0.0003, + "num_input_tokens_seen": 230999648, + "step": 107055 + }, + { + "epoch": 17.464926590538337, + "grad_norm": 0.008292367681860924, + "learning_rate": 4.8154205567564503e-05, + "loss": 0.0009, + "num_input_tokens_seen": 231010976, + "step": 107060 + }, + { + "epoch": 17.465742251223492, + "grad_norm": 0.0002453875495120883, + "learning_rate": 4.812373208722048e-05, + "loss": 0.0001, + "num_input_tokens_seen": 231021312, + "step": 107065 + }, + { + "epoch": 17.466557911908644, + "grad_norm": 0.003620786825194955, + "learning_rate": 4.809326776472228e-05, + "loss": 0.0002, + "num_input_tokens_seen": 231033376, + "step": 107070 + }, + { + "epoch": 17.4673735725938, + "grad_norm": 0.00039699708577245474, + "learning_rate": 4.806281260068729e-05, + "loss": 0.0103, + "num_input_tokens_seen": 231043936, + "step": 107075 + }, + { + "epoch": 17.468189233278956, + "grad_norm": 0.0021901039872318506, + "learning_rate": 4.803236659573274e-05, + "loss": 0.0002, + "num_input_tokens_seen": 231053984, + "step": 107080 + }, + { + "epoch": 17.46900489396411, + "grad_norm": 0.00045752059668302536, + "learning_rate": 4.800192975047551e-05, + "loss": 0.0007, + "num_input_tokens_seen": 231064640, + "step": 107085 + }, + { + "epoch": 17.469820554649267, + "grad_norm": 0.00013071543071419, + "learning_rate": 4.79715020655328e-05, + "loss": 0.0003, + "num_input_tokens_seen": 231074528, + "step": 107090 + }, + { + "epoch": 17.47063621533442, + "grad_norm": 0.05839240550994873, + "learning_rate": 4.794108354152082e-05, + "loss": 0.0016, + "num_input_tokens_seen": 231086208, + "step": 107095 + }, + { + "epoch": 17.471451876019575, + "grad_norm": 0.00019631041504908353, + "learning_rate": 4.791067417905648e-05, + "loss": 0.0004, + "num_input_tokens_seen": 231097024, + "step": 107100 + }, + { + "epoch": 17.47226753670473, + "grad_norm": 0.0013051963178440928, + "learning_rate": 4.7880273978755606e-05, + "loss": 0.0001, + "num_input_tokens_seen": 231107136, + "step": 107105 + }, + { + "epoch": 17.473083197389887, + "grad_norm": 0.8512068390846252, + "learning_rate": 4.784988294123477e-05, + "loss": 0.0311, + "num_input_tokens_seen": 231118112, + "step": 107110 + }, + { + "epoch": 17.473898858075042, + "grad_norm": 0.1298152655363083, + "learning_rate": 4.781950106710942e-05, + "loss": 0.0018, + "num_input_tokens_seen": 231128768, + "step": 107115 + }, + { + "epoch": 17.474714518760194, + "grad_norm": 0.0013202829286456108, + "learning_rate": 4.7789128356995727e-05, + "loss": 0.001, + "num_input_tokens_seen": 231139456, + "step": 107120 + }, + { + "epoch": 17.47553017944535, + "grad_norm": 0.0006224493263289332, + "learning_rate": 4.775876481150887e-05, + "loss": 0.0003, + "num_input_tokens_seen": 231151712, + "step": 107125 + }, + { + "epoch": 17.476345840130506, + "grad_norm": 0.00013778747234027833, + "learning_rate": 4.772841043126447e-05, + "loss": 0.0003, + "num_input_tokens_seen": 231162272, + "step": 107130 + }, + { + "epoch": 17.47716150081566, + "grad_norm": 0.00026349161635152996, + "learning_rate": 4.769806521687742e-05, + "loss": 0.0006, + "num_input_tokens_seen": 231171616, + "step": 107135 + }, + { + "epoch": 17.477977161500817, + "grad_norm": 0.0453668013215065, + "learning_rate": 4.766772916896306e-05, + "loss": 0.0011, + "num_input_tokens_seen": 231182688, + "step": 107140 + }, + { + "epoch": 17.47879282218597, + "grad_norm": 0.00011575184907997027, + "learning_rate": 4.763740228813579e-05, + "loss": 0.0002, + "num_input_tokens_seen": 231194400, + "step": 107145 + }, + { + "epoch": 17.479608482871125, + "grad_norm": 0.00038516559288837016, + "learning_rate": 4.760708457501062e-05, + "loss": 0.0003, + "num_input_tokens_seen": 231205696, + "step": 107150 + }, + { + "epoch": 17.48042414355628, + "grad_norm": 0.07061001658439636, + "learning_rate": 4.7576776030201606e-05, + "loss": 0.0017, + "num_input_tokens_seen": 231215552, + "step": 107155 + }, + { + "epoch": 17.481239804241437, + "grad_norm": 0.0015508484793826938, + "learning_rate": 4.754647665432338e-05, + "loss": 0.003, + "num_input_tokens_seen": 231224768, + "step": 107160 + }, + { + "epoch": 17.482055464926592, + "grad_norm": 0.0017022773390635848, + "learning_rate": 4.751618644798955e-05, + "loss": 0.0011, + "num_input_tokens_seen": 231234688, + "step": 107165 + }, + { + "epoch": 17.482871125611744, + "grad_norm": 0.00650228513404727, + "learning_rate": 4.7485905411814414e-05, + "loss": 0.003, + "num_input_tokens_seen": 231245888, + "step": 107170 + }, + { + "epoch": 17.4836867862969, + "grad_norm": 0.002747894497588277, + "learning_rate": 4.745563354641125e-05, + "loss": 0.001, + "num_input_tokens_seen": 231256224, + "step": 107175 + }, + { + "epoch": 17.484502446982056, + "grad_norm": 0.000500457885209471, + "learning_rate": 4.74253708523939e-05, + "loss": 0.0042, + "num_input_tokens_seen": 231267040, + "step": 107180 + }, + { + "epoch": 17.48531810766721, + "grad_norm": 0.00358394393697381, + "learning_rate": 4.7395117330375494e-05, + "loss": 0.0011, + "num_input_tokens_seen": 231276992, + "step": 107185 + }, + { + "epoch": 17.486133768352367, + "grad_norm": 0.0006886483752168715, + "learning_rate": 4.7364872980969254e-05, + "loss": 0.0001, + "num_input_tokens_seen": 231287904, + "step": 107190 + }, + { + "epoch": 17.48694942903752, + "grad_norm": 0.029810473322868347, + "learning_rate": 4.733463780478808e-05, + "loss": 0.0007, + "num_input_tokens_seen": 231298304, + "step": 107195 + }, + { + "epoch": 17.487765089722675, + "grad_norm": 0.00021939484577160329, + "learning_rate": 4.7304411802444656e-05, + "loss": 0.0007, + "num_input_tokens_seen": 231309600, + "step": 107200 + }, + { + "epoch": 17.48858075040783, + "grad_norm": 0.0010209924075752497, + "learning_rate": 4.7274194974551656e-05, + "loss": 0.0038, + "num_input_tokens_seen": 231320288, + "step": 107205 + }, + { + "epoch": 17.489396411092986, + "grad_norm": 0.00041308620711788535, + "learning_rate": 4.724398732172142e-05, + "loss": 0.0141, + "num_input_tokens_seen": 231331168, + "step": 107210 + }, + { + "epoch": 17.49021207177814, + "grad_norm": 0.000247310905251652, + "learning_rate": 4.721378884456612e-05, + "loss": 0.0018, + "num_input_tokens_seen": 231341760, + "step": 107215 + }, + { + "epoch": 17.491027732463294, + "grad_norm": 0.00029042139067314565, + "learning_rate": 4.718359954369783e-05, + "loss": 0.001, + "num_input_tokens_seen": 231352640, + "step": 107220 + }, + { + "epoch": 17.49184339314845, + "grad_norm": 0.009616367518901825, + "learning_rate": 4.7153419419728285e-05, + "loss": 0.0047, + "num_input_tokens_seen": 231362560, + "step": 107225 + }, + { + "epoch": 17.492659053833606, + "grad_norm": 0.12656067311763763, + "learning_rate": 4.7123248473269096e-05, + "loss": 0.0015, + "num_input_tokens_seen": 231373408, + "step": 107230 + }, + { + "epoch": 17.49347471451876, + "grad_norm": 0.002197021385654807, + "learning_rate": 4.7093086704931955e-05, + "loss": 0.001, + "num_input_tokens_seen": 231384416, + "step": 107235 + }, + { + "epoch": 17.494290375203914, + "grad_norm": 1.1977633237838745, + "learning_rate": 4.7062934115327804e-05, + "loss": 0.0335, + "num_input_tokens_seen": 231394688, + "step": 107240 + }, + { + "epoch": 17.49510603588907, + "grad_norm": 0.000950126675888896, + "learning_rate": 4.7032790705068105e-05, + "loss": 0.0005, + "num_input_tokens_seen": 231405184, + "step": 107245 + }, + { + "epoch": 17.495921696574225, + "grad_norm": 0.00019221189722884446, + "learning_rate": 4.700265647476332e-05, + "loss": 0.0002, + "num_input_tokens_seen": 231415968, + "step": 107250 + }, + { + "epoch": 17.49673735725938, + "grad_norm": 0.002731620566919446, + "learning_rate": 4.69725314250245e-05, + "loss": 0.002, + "num_input_tokens_seen": 231425856, + "step": 107255 + }, + { + "epoch": 17.497553017944536, + "grad_norm": 0.00039533551898784935, + "learning_rate": 4.6942415556461894e-05, + "loss": 0.0006, + "num_input_tokens_seen": 231436416, + "step": 107260 + }, + { + "epoch": 17.49836867862969, + "grad_norm": 0.00029871094739064574, + "learning_rate": 4.691230886968617e-05, + "loss": 0.0003, + "num_input_tokens_seen": 231448224, + "step": 107265 + }, + { + "epoch": 17.499184339314844, + "grad_norm": 0.00025034582358784974, + "learning_rate": 4.688221136530712e-05, + "loss": 0.0022, + "num_input_tokens_seen": 231459776, + "step": 107270 + }, + { + "epoch": 17.5, + "grad_norm": 0.04252872243523598, + "learning_rate": 4.6852123043935044e-05, + "loss": 0.0017, + "num_input_tokens_seen": 231469600, + "step": 107275 + }, + { + "epoch": 17.500815660685156, + "grad_norm": 0.004845942836254835, + "learning_rate": 4.682204390617939e-05, + "loss": 0.0021, + "num_input_tokens_seen": 231480832, + "step": 107280 + }, + { + "epoch": 17.50163132137031, + "grad_norm": 0.0006755726644769311, + "learning_rate": 4.6791973952650056e-05, + "loss": 0.0007, + "num_input_tokens_seen": 231490880, + "step": 107285 + }, + { + "epoch": 17.502446982055464, + "grad_norm": 0.00027618242893368006, + "learning_rate": 4.6761913183956175e-05, + "loss": 0.0001, + "num_input_tokens_seen": 231501472, + "step": 107290 + }, + { + "epoch": 17.50326264274062, + "grad_norm": 0.0004023680812679231, + "learning_rate": 4.673186160070714e-05, + "loss": 0.0007, + "num_input_tokens_seen": 231512512, + "step": 107295 + }, + { + "epoch": 17.504078303425775, + "grad_norm": 0.00011338602780597284, + "learning_rate": 4.6701819203511964e-05, + "loss": 0.0092, + "num_input_tokens_seen": 231524224, + "step": 107300 + }, + { + "epoch": 17.50489396411093, + "grad_norm": 0.00013245809532236308, + "learning_rate": 4.667178599297944e-05, + "loss": 0.0006, + "num_input_tokens_seen": 231535200, + "step": 107305 + }, + { + "epoch": 17.505709624796086, + "grad_norm": 0.022497806698083878, + "learning_rate": 4.664176196971831e-05, + "loss": 0.0012, + "num_input_tokens_seen": 231545856, + "step": 107310 + }, + { + "epoch": 17.50652528548124, + "grad_norm": 0.030618511140346527, + "learning_rate": 4.661174713433697e-05, + "loss": 0.0011, + "num_input_tokens_seen": 231556864, + "step": 107315 + }, + { + "epoch": 17.507340946166394, + "grad_norm": 9.352411871077493e-05, + "learning_rate": 4.6581741487443765e-05, + "loss": 0.0001, + "num_input_tokens_seen": 231567776, + "step": 107320 + }, + { + "epoch": 17.50815660685155, + "grad_norm": 0.00038306074566207826, + "learning_rate": 4.655174502964676e-05, + "loss": 0.0006, + "num_input_tokens_seen": 231579040, + "step": 107325 + }, + { + "epoch": 17.508972267536706, + "grad_norm": 0.06389070302248001, + "learning_rate": 4.6521757761553873e-05, + "loss": 0.0018, + "num_input_tokens_seen": 231588864, + "step": 107330 + }, + { + "epoch": 17.50978792822186, + "grad_norm": 0.0009548309608362615, + "learning_rate": 4.6491779683772825e-05, + "loss": 0.0004, + "num_input_tokens_seen": 231599712, + "step": 107335 + }, + { + "epoch": 17.510603588907014, + "grad_norm": 0.004496999550610781, + "learning_rate": 4.64618107969112e-05, + "loss": 0.0003, + "num_input_tokens_seen": 231611072, + "step": 107340 + }, + { + "epoch": 17.51141924959217, + "grad_norm": 0.0007356259156949818, + "learning_rate": 4.643185110157633e-05, + "loss": 0.0019, + "num_input_tokens_seen": 231621952, + "step": 107345 + }, + { + "epoch": 17.512234910277325, + "grad_norm": 0.0001852070854511112, + "learning_rate": 4.640190059837535e-05, + "loss": 0.0007, + "num_input_tokens_seen": 231633344, + "step": 107350 + }, + { + "epoch": 17.51305057096248, + "grad_norm": 0.00037350706406868994, + "learning_rate": 4.637195928791532e-05, + "loss": 0.0006, + "num_input_tokens_seen": 231644768, + "step": 107355 + }, + { + "epoch": 17.513866231647633, + "grad_norm": 0.0017707423539832234, + "learning_rate": 4.634202717080305e-05, + "loss": 0.0018, + "num_input_tokens_seen": 231656960, + "step": 107360 + }, + { + "epoch": 17.51468189233279, + "grad_norm": 0.00100712850689888, + "learning_rate": 4.6312104247645035e-05, + "loss": 0.0003, + "num_input_tokens_seen": 231667712, + "step": 107365 + }, + { + "epoch": 17.515497553017944, + "grad_norm": 0.005286789033561945, + "learning_rate": 4.6282190519047805e-05, + "loss": 0.0008, + "num_input_tokens_seen": 231678752, + "step": 107370 + }, + { + "epoch": 17.5163132137031, + "grad_norm": 0.02047114260494709, + "learning_rate": 4.625228598561748e-05, + "loss": 0.0015, + "num_input_tokens_seen": 231688928, + "step": 107375 + }, + { + "epoch": 17.517128874388256, + "grad_norm": 0.0009013927774503827, + "learning_rate": 4.6222390647960356e-05, + "loss": 0.0003, + "num_input_tokens_seen": 231698720, + "step": 107380 + }, + { + "epoch": 17.517944535073408, + "grad_norm": 0.0006200579227879643, + "learning_rate": 4.619250450668194e-05, + "loss": 0.0011, + "num_input_tokens_seen": 231709568, + "step": 107385 + }, + { + "epoch": 17.518760195758563, + "grad_norm": 0.0003490679955575615, + "learning_rate": 4.616262756238837e-05, + "loss": 0.0007, + "num_input_tokens_seen": 231718080, + "step": 107390 + }, + { + "epoch": 17.51957585644372, + "grad_norm": 0.006202478893101215, + "learning_rate": 4.613275981568465e-05, + "loss": 0.0042, + "num_input_tokens_seen": 231728896, + "step": 107395 + }, + { + "epoch": 17.520391517128875, + "grad_norm": 6.818358815507963e-05, + "learning_rate": 4.610290126717642e-05, + "loss": 0.0033, + "num_input_tokens_seen": 231740704, + "step": 107400 + }, + { + "epoch": 17.52120717781403, + "grad_norm": 0.0002767993719317019, + "learning_rate": 4.607305191746874e-05, + "loss": 0.0005, + "num_input_tokens_seen": 231751968, + "step": 107405 + }, + { + "epoch": 17.522022838499183, + "grad_norm": 0.0002150321815861389, + "learning_rate": 4.604321176716647e-05, + "loss": 0.0006, + "num_input_tokens_seen": 231762592, + "step": 107410 + }, + { + "epoch": 17.52283849918434, + "grad_norm": 0.026049789041280746, + "learning_rate": 4.6013380816874394e-05, + "loss": 0.0029, + "num_input_tokens_seen": 231773344, + "step": 107415 + }, + { + "epoch": 17.523654159869494, + "grad_norm": 0.04671616479754448, + "learning_rate": 4.598355906719709e-05, + "loss": 0.0009, + "num_input_tokens_seen": 231783232, + "step": 107420 + }, + { + "epoch": 17.52446982055465, + "grad_norm": 0.026642097160220146, + "learning_rate": 4.595374651873896e-05, + "loss": 0.0005, + "num_input_tokens_seen": 231794336, + "step": 107425 + }, + { + "epoch": 17.525285481239806, + "grad_norm": 0.02379734255373478, + "learning_rate": 4.592394317210413e-05, + "loss": 0.0018, + "num_input_tokens_seen": 231805376, + "step": 107430 + }, + { + "epoch": 17.526101141924958, + "grad_norm": 0.00013111306179780513, + "learning_rate": 4.589414902789662e-05, + "loss": 0.0005, + "num_input_tokens_seen": 231815648, + "step": 107435 + }, + { + "epoch": 17.526916802610113, + "grad_norm": 0.0001725140755297616, + "learning_rate": 4.586436408672023e-05, + "loss": 0.0035, + "num_input_tokens_seen": 231827136, + "step": 107440 + }, + { + "epoch": 17.52773246329527, + "grad_norm": 0.0001053861778927967, + "learning_rate": 4.583458834917864e-05, + "loss": 0.0001, + "num_input_tokens_seen": 231838048, + "step": 107445 + }, + { + "epoch": 17.528548123980425, + "grad_norm": 0.004166079219430685, + "learning_rate": 4.580482181587531e-05, + "loss": 0.0011, + "num_input_tokens_seen": 231847808, + "step": 107450 + }, + { + "epoch": 17.52936378466558, + "grad_norm": 0.011203252710402012, + "learning_rate": 4.5775064487413424e-05, + "loss": 0.0022, + "num_input_tokens_seen": 231859136, + "step": 107455 + }, + { + "epoch": 17.530179445350733, + "grad_norm": 0.008083295077085495, + "learning_rate": 4.574531636439605e-05, + "loss": 0.0003, + "num_input_tokens_seen": 231869568, + "step": 107460 + }, + { + "epoch": 17.53099510603589, + "grad_norm": 0.003132057609036565, + "learning_rate": 4.57155774474261e-05, + "loss": 0.0004, + "num_input_tokens_seen": 231880000, + "step": 107465 + }, + { + "epoch": 17.531810766721044, + "grad_norm": 0.005387485958635807, + "learning_rate": 4.568584773710632e-05, + "loss": 0.0013, + "num_input_tokens_seen": 231890304, + "step": 107470 + }, + { + "epoch": 17.5326264274062, + "grad_norm": 0.1772170215845108, + "learning_rate": 4.565612723403911e-05, + "loss": 0.0024, + "num_input_tokens_seen": 231900576, + "step": 107475 + }, + { + "epoch": 17.533442088091356, + "grad_norm": 0.00015290868759620935, + "learning_rate": 4.562641593882694e-05, + "loss": 0.0006, + "num_input_tokens_seen": 231912192, + "step": 107480 + }, + { + "epoch": 17.534257748776508, + "grad_norm": 0.00014589809870813042, + "learning_rate": 4.5596713852071816e-05, + "loss": 0.0001, + "num_input_tokens_seen": 231923008, + "step": 107485 + }, + { + "epoch": 17.535073409461663, + "grad_norm": 0.05643502622842789, + "learning_rate": 4.556702097437576e-05, + "loss": 0.0024, + "num_input_tokens_seen": 231933632, + "step": 107490 + }, + { + "epoch": 17.53588907014682, + "grad_norm": 0.01618698611855507, + "learning_rate": 4.5537337306340466e-05, + "loss": 0.0005, + "num_input_tokens_seen": 231942592, + "step": 107495 + }, + { + "epoch": 17.536704730831975, + "grad_norm": 0.00454922579228878, + "learning_rate": 4.550766284856761e-05, + "loss": 0.0007, + "num_input_tokens_seen": 231953504, + "step": 107500 + }, + { + "epoch": 17.53752039151713, + "grad_norm": 0.004269898869097233, + "learning_rate": 4.5477997601658384e-05, + "loss": 0.0011, + "num_input_tokens_seen": 231964992, + "step": 107505 + }, + { + "epoch": 17.538336052202283, + "grad_norm": 0.000503774790558964, + "learning_rate": 4.5448341566214354e-05, + "loss": 0.0003, + "num_input_tokens_seen": 231975008, + "step": 107510 + }, + { + "epoch": 17.53915171288744, + "grad_norm": 0.00024780057719908655, + "learning_rate": 4.541869474283616e-05, + "loss": 0.0019, + "num_input_tokens_seen": 231987168, + "step": 107515 + }, + { + "epoch": 17.539967373572594, + "grad_norm": 0.00039215278229676187, + "learning_rate": 4.538905713212488e-05, + "loss": 0.0008, + "num_input_tokens_seen": 231998912, + "step": 107520 + }, + { + "epoch": 17.54078303425775, + "grad_norm": 9.749254968483001e-05, + "learning_rate": 4.535942873468102e-05, + "loss": 0.0003, + "num_input_tokens_seen": 232010176, + "step": 107525 + }, + { + "epoch": 17.541598694942905, + "grad_norm": 0.0003566923551261425, + "learning_rate": 4.532980955110516e-05, + "loss": 0.0004, + "num_input_tokens_seen": 232021280, + "step": 107530 + }, + { + "epoch": 17.542414355628058, + "grad_norm": 0.0031282338313758373, + "learning_rate": 4.530019958199744e-05, + "loss": 0.0008, + "num_input_tokens_seen": 232031392, + "step": 107535 + }, + { + "epoch": 17.543230016313213, + "grad_norm": 0.0003685698320623487, + "learning_rate": 4.527059882795803e-05, + "loss": 0.0001, + "num_input_tokens_seen": 232041600, + "step": 107540 + }, + { + "epoch": 17.54404567699837, + "grad_norm": 0.0005581863806582987, + "learning_rate": 4.52410072895868e-05, + "loss": 0.0014, + "num_input_tokens_seen": 232051488, + "step": 107545 + }, + { + "epoch": 17.544861337683525, + "grad_norm": 0.0018373315688222647, + "learning_rate": 4.521142496748348e-05, + "loss": 0.0014, + "num_input_tokens_seen": 232062336, + "step": 107550 + }, + { + "epoch": 17.545676998368677, + "grad_norm": 0.0007517214398831129, + "learning_rate": 4.5181851862247544e-05, + "loss": 0.0002, + "num_input_tokens_seen": 232074272, + "step": 107555 + }, + { + "epoch": 17.546492659053833, + "grad_norm": 0.00013128573482390493, + "learning_rate": 4.51522879744784e-05, + "loss": 0.0001, + "num_input_tokens_seen": 232085952, + "step": 107560 + }, + { + "epoch": 17.54730831973899, + "grad_norm": 0.002161432057619095, + "learning_rate": 4.5122733304775124e-05, + "loss": 0.0002, + "num_input_tokens_seen": 232097056, + "step": 107565 + }, + { + "epoch": 17.548123980424144, + "grad_norm": 0.000629114278126508, + "learning_rate": 4.509318785373667e-05, + "loss": 0.0008, + "num_input_tokens_seen": 232108864, + "step": 107570 + }, + { + "epoch": 17.5489396411093, + "grad_norm": 0.0015067337080836296, + "learning_rate": 4.506365162196191e-05, + "loss": 0.0003, + "num_input_tokens_seen": 232120192, + "step": 107575 + }, + { + "epoch": 17.549755301794452, + "grad_norm": 0.0011252169497311115, + "learning_rate": 4.503412461004935e-05, + "loss": 0.0009, + "num_input_tokens_seen": 232130944, + "step": 107580 + }, + { + "epoch": 17.550570962479608, + "grad_norm": 0.0021618830505758524, + "learning_rate": 4.500460681859742e-05, + "loss": 0.0122, + "num_input_tokens_seen": 232140192, + "step": 107585 + }, + { + "epoch": 17.551386623164763, + "grad_norm": 0.00010752450907602906, + "learning_rate": 4.4975098248204394e-05, + "loss": 0.0008, + "num_input_tokens_seen": 232151136, + "step": 107590 + }, + { + "epoch": 17.55220228384992, + "grad_norm": 0.00012228570994921029, + "learning_rate": 4.494559889946814e-05, + "loss": 0.0003, + "num_input_tokens_seen": 232162560, + "step": 107595 + }, + { + "epoch": 17.553017944535075, + "grad_norm": 0.9727087020874023, + "learning_rate": 4.4916108772986686e-05, + "loss": 0.0146, + "num_input_tokens_seen": 232174304, + "step": 107600 + }, + { + "epoch": 17.553833605220227, + "grad_norm": 0.00298587279394269, + "learning_rate": 4.48866278693576e-05, + "loss": 0.0002, + "num_input_tokens_seen": 232184576, + "step": 107605 + }, + { + "epoch": 17.554649265905383, + "grad_norm": 0.007504434324800968, + "learning_rate": 4.485715618917818e-05, + "loss": 0.0003, + "num_input_tokens_seen": 232195392, + "step": 107610 + }, + { + "epoch": 17.55546492659054, + "grad_norm": 0.00019498044275678694, + "learning_rate": 4.482769373304613e-05, + "loss": 0.0002, + "num_input_tokens_seen": 232206112, + "step": 107615 + }, + { + "epoch": 17.556280587275694, + "grad_norm": 0.002148254541680217, + "learning_rate": 4.4798240501558115e-05, + "loss": 0.0004, + "num_input_tokens_seen": 232217344, + "step": 107620 + }, + { + "epoch": 17.55709624796085, + "grad_norm": 9.691908053355291e-05, + "learning_rate": 4.4768796495311406e-05, + "loss": 0.0001, + "num_input_tokens_seen": 232228192, + "step": 107625 + }, + { + "epoch": 17.557911908646002, + "grad_norm": 0.0016043169889599085, + "learning_rate": 4.473936171490228e-05, + "loss": 0.0008, + "num_input_tokens_seen": 232238752, + "step": 107630 + }, + { + "epoch": 17.558727569331158, + "grad_norm": 0.012164101004600525, + "learning_rate": 4.470993616092778e-05, + "loss": 0.0004, + "num_input_tokens_seen": 232249728, + "step": 107635 + }, + { + "epoch": 17.559543230016313, + "grad_norm": 0.0072899796068668365, + "learning_rate": 4.46805198339838e-05, + "loss": 0.0005, + "num_input_tokens_seen": 232260352, + "step": 107640 + }, + { + "epoch": 17.56035889070147, + "grad_norm": 0.0001126296556321904, + "learning_rate": 4.4651112734666874e-05, + "loss": 0.0007, + "num_input_tokens_seen": 232270176, + "step": 107645 + }, + { + "epoch": 17.561174551386625, + "grad_norm": 0.06507442891597748, + "learning_rate": 4.462171486357264e-05, + "loss": 0.0022, + "num_input_tokens_seen": 232281312, + "step": 107650 + }, + { + "epoch": 17.561990212071777, + "grad_norm": 0.0002563142916187644, + "learning_rate": 4.459232622129722e-05, + "loss": 0.0007, + "num_input_tokens_seen": 232292704, + "step": 107655 + }, + { + "epoch": 17.562805872756933, + "grad_norm": 0.001829111366532743, + "learning_rate": 4.4562946808435864e-05, + "loss": 0.0006, + "num_input_tokens_seen": 232304064, + "step": 107660 + }, + { + "epoch": 17.563621533442088, + "grad_norm": 0.00022922967036720365, + "learning_rate": 4.453357662558422e-05, + "loss": 0.0014, + "num_input_tokens_seen": 232315200, + "step": 107665 + }, + { + "epoch": 17.564437194127244, + "grad_norm": 0.006343925837427378, + "learning_rate": 4.450421567333746e-05, + "loss": 0.0005, + "num_input_tokens_seen": 232326464, + "step": 107670 + }, + { + "epoch": 17.5652528548124, + "grad_norm": 0.012010899372398853, + "learning_rate": 4.447486395229061e-05, + "loss": 0.015, + "num_input_tokens_seen": 232337696, + "step": 107675 + }, + { + "epoch": 17.56606851549755, + "grad_norm": 0.00045470925397239625, + "learning_rate": 4.4445521463038486e-05, + "loss": 0.0002, + "num_input_tokens_seen": 232349632, + "step": 107680 + }, + { + "epoch": 17.566884176182707, + "grad_norm": 0.0004941055085510015, + "learning_rate": 4.441618820617582e-05, + "loss": 0.0001, + "num_input_tokens_seen": 232359680, + "step": 107685 + }, + { + "epoch": 17.567699836867863, + "grad_norm": 0.0008943114662542939, + "learning_rate": 4.438686418229698e-05, + "loss": 0.0023, + "num_input_tokens_seen": 232371424, + "step": 107690 + }, + { + "epoch": 17.56851549755302, + "grad_norm": 0.0068136705085635185, + "learning_rate": 4.4357549391996376e-05, + "loss": 0.0089, + "num_input_tokens_seen": 232382112, + "step": 107695 + }, + { + "epoch": 17.569331158238175, + "grad_norm": 0.00011578125122468919, + "learning_rate": 4.432824383586809e-05, + "loss": 0.0001, + "num_input_tokens_seen": 232392832, + "step": 107700 + }, + { + "epoch": 17.570146818923327, + "grad_norm": 0.0002473185013514012, + "learning_rate": 4.429894751450597e-05, + "loss": 0.0009, + "num_input_tokens_seen": 232403552, + "step": 107705 + }, + { + "epoch": 17.570962479608482, + "grad_norm": 0.00023744201462250203, + "learning_rate": 4.4269660428503774e-05, + "loss": 0.004, + "num_input_tokens_seen": 232414112, + "step": 107710 + }, + { + "epoch": 17.571778140293638, + "grad_norm": 0.0004821094626095146, + "learning_rate": 4.4240382578454915e-05, + "loss": 0.0002, + "num_input_tokens_seen": 232425536, + "step": 107715 + }, + { + "epoch": 17.572593800978794, + "grad_norm": 0.0004740917938761413, + "learning_rate": 4.4211113964953144e-05, + "loss": 0.0002, + "num_input_tokens_seen": 232437376, + "step": 107720 + }, + { + "epoch": 17.57340946166395, + "grad_norm": 0.0003923209151253104, + "learning_rate": 4.4181854588591085e-05, + "loss": 0.0006, + "num_input_tokens_seen": 232447552, + "step": 107725 + }, + { + "epoch": 17.5742251223491, + "grad_norm": 0.00022461362823378295, + "learning_rate": 4.415260444996222e-05, + "loss": 0.026, + "num_input_tokens_seen": 232457952, + "step": 107730 + }, + { + "epoch": 17.575040783034257, + "grad_norm": 0.002107961568981409, + "learning_rate": 4.4123363549658955e-05, + "loss": 0.0766, + "num_input_tokens_seen": 232469120, + "step": 107735 + }, + { + "epoch": 17.575856443719413, + "grad_norm": 0.0001552294852444902, + "learning_rate": 4.409413188827416e-05, + "loss": 0.0006, + "num_input_tokens_seen": 232480928, + "step": 107740 + }, + { + "epoch": 17.57667210440457, + "grad_norm": 0.00021098223805893213, + "learning_rate": 4.4064909466400014e-05, + "loss": 0.0003, + "num_input_tokens_seen": 232491936, + "step": 107745 + }, + { + "epoch": 17.57748776508972, + "grad_norm": 0.0024288848508149385, + "learning_rate": 4.4035696284629e-05, + "loss": 0.0003, + "num_input_tokens_seen": 232502656, + "step": 107750 + }, + { + "epoch": 17.578303425774877, + "grad_norm": 0.013313405215740204, + "learning_rate": 4.4006492343552915e-05, + "loss": 0.0024, + "num_input_tokens_seen": 232511488, + "step": 107755 + }, + { + "epoch": 17.579119086460032, + "grad_norm": 0.002712896326556802, + "learning_rate": 4.39772976437639e-05, + "loss": 0.0009, + "num_input_tokens_seen": 232522048, + "step": 107760 + }, + { + "epoch": 17.579934747145188, + "grad_norm": 0.0011226610513404012, + "learning_rate": 4.394811218585326e-05, + "loss": 0.0001, + "num_input_tokens_seen": 232533216, + "step": 107765 + }, + { + "epoch": 17.580750407830344, + "grad_norm": 0.006600612308830023, + "learning_rate": 4.3918935970412796e-05, + "loss": 0.0006, + "num_input_tokens_seen": 232543808, + "step": 107770 + }, + { + "epoch": 17.581566068515496, + "grad_norm": 0.0007957402849569917, + "learning_rate": 4.38897689980336e-05, + "loss": 0.1196, + "num_input_tokens_seen": 232555328, + "step": 107775 + }, + { + "epoch": 17.58238172920065, + "grad_norm": 0.0027287255506962538, + "learning_rate": 4.386061126930696e-05, + "loss": 0.0003, + "num_input_tokens_seen": 232566464, + "step": 107780 + }, + { + "epoch": 17.583197389885807, + "grad_norm": 0.00041984106064774096, + "learning_rate": 4.3831462784823525e-05, + "loss": 0.0008, + "num_input_tokens_seen": 232577024, + "step": 107785 + }, + { + "epoch": 17.584013050570963, + "grad_norm": 0.0013384693302214146, + "learning_rate": 4.380232354517433e-05, + "loss": 0.0017, + "num_input_tokens_seen": 232588384, + "step": 107790 + }, + { + "epoch": 17.58482871125612, + "grad_norm": 0.0025955268647521734, + "learning_rate": 4.3773193550949664e-05, + "loss": 0.0042, + "num_input_tokens_seen": 232599584, + "step": 107795 + }, + { + "epoch": 17.58564437194127, + "grad_norm": 0.0003401061403565109, + "learning_rate": 4.374407280274007e-05, + "loss": 0.0009, + "num_input_tokens_seen": 232610560, + "step": 107800 + }, + { + "epoch": 17.586460032626427, + "grad_norm": 0.00015748431906104088, + "learning_rate": 4.371496130113561e-05, + "loss": 0.0005, + "num_input_tokens_seen": 232621440, + "step": 107805 + }, + { + "epoch": 17.587275693311582, + "grad_norm": 0.00021306000417098403, + "learning_rate": 4.3685859046726284e-05, + "loss": 0.0001, + "num_input_tokens_seen": 232631456, + "step": 107810 + }, + { + "epoch": 17.588091353996738, + "grad_norm": 0.0006012442754581571, + "learning_rate": 4.3656766040101933e-05, + "loss": 0.0024, + "num_input_tokens_seen": 232642240, + "step": 107815 + }, + { + "epoch": 17.588907014681894, + "grad_norm": 0.00041340969619341195, + "learning_rate": 4.362768228185216e-05, + "loss": 0.0041, + "num_input_tokens_seen": 232652928, + "step": 107820 + }, + { + "epoch": 17.589722675367046, + "grad_norm": 0.00014005298726260662, + "learning_rate": 4.35986077725663e-05, + "loss": 0.0001, + "num_input_tokens_seen": 232663584, + "step": 107825 + }, + { + "epoch": 17.5905383360522, + "grad_norm": 0.0013785763876512647, + "learning_rate": 4.3569542512833684e-05, + "loss": 0.0007, + "num_input_tokens_seen": 232674432, + "step": 107830 + }, + { + "epoch": 17.591353996737357, + "grad_norm": 0.00011182734306203201, + "learning_rate": 4.354048650324327e-05, + "loss": 0.0039, + "num_input_tokens_seen": 232684192, + "step": 107835 + }, + { + "epoch": 17.592169657422513, + "grad_norm": 0.0030250470153987408, + "learning_rate": 4.3511439744383984e-05, + "loss": 0.0003, + "num_input_tokens_seen": 232694688, + "step": 107840 + }, + { + "epoch": 17.59298531810767, + "grad_norm": 0.00022759442799724638, + "learning_rate": 4.348240223684447e-05, + "loss": 0.0016, + "num_input_tokens_seen": 232706400, + "step": 107845 + }, + { + "epoch": 17.59380097879282, + "grad_norm": 0.001337374676950276, + "learning_rate": 4.3453373981213184e-05, + "loss": 0.0006, + "num_input_tokens_seen": 232717344, + "step": 107850 + }, + { + "epoch": 17.594616639477977, + "grad_norm": 0.004847290460020304, + "learning_rate": 4.342435497807845e-05, + "loss": 0.0003, + "num_input_tokens_seen": 232727616, + "step": 107855 + }, + { + "epoch": 17.595432300163132, + "grad_norm": 0.029198680073022842, + "learning_rate": 4.3395345228028294e-05, + "loss": 0.0008, + "num_input_tokens_seen": 232738592, + "step": 107860 + }, + { + "epoch": 17.596247960848288, + "grad_norm": 0.0016016301233321428, + "learning_rate": 4.336634473165091e-05, + "loss": 0.0003, + "num_input_tokens_seen": 232748736, + "step": 107865 + }, + { + "epoch": 17.597063621533444, + "grad_norm": 0.0001360478054266423, + "learning_rate": 4.3337353489533606e-05, + "loss": 0.0002, + "num_input_tokens_seen": 232759040, + "step": 107870 + }, + { + "epoch": 17.597879282218596, + "grad_norm": 0.00022030959371477365, + "learning_rate": 4.3308371502264355e-05, + "loss": 0.0005, + "num_input_tokens_seen": 232770080, + "step": 107875 + }, + { + "epoch": 17.59869494290375, + "grad_norm": 0.006948619615286589, + "learning_rate": 4.327939877043013e-05, + "loss": 0.0095, + "num_input_tokens_seen": 232781888, + "step": 107880 + }, + { + "epoch": 17.599510603588907, + "grad_norm": 0.00011340141645632684, + "learning_rate": 4.3250435294618473e-05, + "loss": 0.0002, + "num_input_tokens_seen": 232793792, + "step": 107885 + }, + { + "epoch": 17.600326264274063, + "grad_norm": 0.00015941663878038526, + "learning_rate": 4.322148107541596e-05, + "loss": 0.0282, + "num_input_tokens_seen": 232804608, + "step": 107890 + }, + { + "epoch": 17.601141924959215, + "grad_norm": 0.008725306950509548, + "learning_rate": 4.3192536113409785e-05, + "loss": 0.0004, + "num_input_tokens_seen": 232815680, + "step": 107895 + }, + { + "epoch": 17.60195758564437, + "grad_norm": 0.009106230922043324, + "learning_rate": 4.316360040918621e-05, + "loss": 0.0006, + "num_input_tokens_seen": 232825504, + "step": 107900 + }, + { + "epoch": 17.602773246329527, + "grad_norm": 0.0616520494222641, + "learning_rate": 4.3134673963331985e-05, + "loss": 0.0013, + "num_input_tokens_seen": 232834592, + "step": 107905 + }, + { + "epoch": 17.603588907014682, + "grad_norm": 0.11202183365821838, + "learning_rate": 4.310575677643297e-05, + "loss": 0.0026, + "num_input_tokens_seen": 232846080, + "step": 107910 + }, + { + "epoch": 17.604404567699838, + "grad_norm": 0.002079889876767993, + "learning_rate": 4.307684884907559e-05, + "loss": 0.0002, + "num_input_tokens_seen": 232855424, + "step": 107915 + }, + { + "epoch": 17.605220228384994, + "grad_norm": 0.00037449836963787675, + "learning_rate": 4.304795018184537e-05, + "loss": 0.0003, + "num_input_tokens_seen": 232865760, + "step": 107920 + }, + { + "epoch": 17.606035889070146, + "grad_norm": 0.0005226247594691813, + "learning_rate": 4.3019060775328186e-05, + "loss": 0.0007, + "num_input_tokens_seen": 232876256, + "step": 107925 + }, + { + "epoch": 17.6068515497553, + "grad_norm": 0.00025452132103964686, + "learning_rate": 4.2990180630109455e-05, + "loss": 0.0002, + "num_input_tokens_seen": 232888416, + "step": 107930 + }, + { + "epoch": 17.607667210440457, + "grad_norm": 0.02603933773934841, + "learning_rate": 4.296130974677448e-05, + "loss": 0.0011, + "num_input_tokens_seen": 232899840, + "step": 107935 + }, + { + "epoch": 17.608482871125613, + "grad_norm": 0.017154935747385025, + "learning_rate": 4.293244812590835e-05, + "loss": 0.0057, + "num_input_tokens_seen": 232912320, + "step": 107940 + }, + { + "epoch": 17.609298531810765, + "grad_norm": 0.0014675318961963058, + "learning_rate": 4.2903595768095995e-05, + "loss": 0.0015, + "num_input_tokens_seen": 232922336, + "step": 107945 + }, + { + "epoch": 17.61011419249592, + "grad_norm": 0.0024152847472578287, + "learning_rate": 4.28747526739221e-05, + "loss": 0.0002, + "num_input_tokens_seen": 232932704, + "step": 107950 + }, + { + "epoch": 17.610929853181077, + "grad_norm": 0.0009341238765046, + "learning_rate": 4.284591884397132e-05, + "loss": 0.0084, + "num_input_tokens_seen": 232943136, + "step": 107955 + }, + { + "epoch": 17.611745513866232, + "grad_norm": 0.18886710703372955, + "learning_rate": 4.281709427882791e-05, + "loss": 0.0021, + "num_input_tokens_seen": 232954304, + "step": 107960 + }, + { + "epoch": 17.612561174551388, + "grad_norm": 0.0012467281194403768, + "learning_rate": 4.2788278979076003e-05, + "loss": 0.0011, + "num_input_tokens_seen": 232965760, + "step": 107965 + }, + { + "epoch": 17.61337683523654, + "grad_norm": 0.0010694157099351287, + "learning_rate": 4.275947294529969e-05, + "loss": 0.0053, + "num_input_tokens_seen": 232977472, + "step": 107970 + }, + { + "epoch": 17.614192495921696, + "grad_norm": 0.0014752644347026944, + "learning_rate": 4.2730676178082736e-05, + "loss": 0.0008, + "num_input_tokens_seen": 232987328, + "step": 107975 + }, + { + "epoch": 17.61500815660685, + "grad_norm": 0.010396101512014866, + "learning_rate": 4.2701888678008674e-05, + "loss": 0.0062, + "num_input_tokens_seen": 232997152, + "step": 107980 + }, + { + "epoch": 17.615823817292007, + "grad_norm": 0.002614020137116313, + "learning_rate": 4.267311044566097e-05, + "loss": 0.0055, + "num_input_tokens_seen": 233008608, + "step": 107985 + }, + { + "epoch": 17.616639477977163, + "grad_norm": 0.0037702666595578194, + "learning_rate": 4.2644341481622825e-05, + "loss": 0.0024, + "num_input_tokens_seen": 233019744, + "step": 107990 + }, + { + "epoch": 17.617455138662315, + "grad_norm": 0.0014823731034994125, + "learning_rate": 4.2615581786477234e-05, + "loss": 0.0002, + "num_input_tokens_seen": 233030272, + "step": 107995 + }, + { + "epoch": 17.61827079934747, + "grad_norm": 0.006815560162067413, + "learning_rate": 4.2586831360807265e-05, + "loss": 0.0014, + "num_input_tokens_seen": 233040384, + "step": 108000 + }, + { + "epoch": 17.619086460032626, + "grad_norm": 0.0009163341601379216, + "learning_rate": 4.25580902051953e-05, + "loss": 0.0003, + "num_input_tokens_seen": 233052096, + "step": 108005 + }, + { + "epoch": 17.619902120717782, + "grad_norm": 0.004782544448971748, + "learning_rate": 4.252935832022409e-05, + "loss": 0.0004, + "num_input_tokens_seen": 233063200, + "step": 108010 + }, + { + "epoch": 17.620717781402938, + "grad_norm": 0.18926797807216644, + "learning_rate": 4.250063570647561e-05, + "loss": 0.0036, + "num_input_tokens_seen": 233074528, + "step": 108015 + }, + { + "epoch": 17.62153344208809, + "grad_norm": 0.0066077494993805885, + "learning_rate": 4.247192236453229e-05, + "loss": 0.0004, + "num_input_tokens_seen": 233084608, + "step": 108020 + }, + { + "epoch": 17.622349102773246, + "grad_norm": 0.013650842942297459, + "learning_rate": 4.244321829497566e-05, + "loss": 0.0678, + "num_input_tokens_seen": 233095808, + "step": 108025 + }, + { + "epoch": 17.6231647634584, + "grad_norm": 0.0005442793481051922, + "learning_rate": 4.2414523498387926e-05, + "loss": 0.0005, + "num_input_tokens_seen": 233105984, + "step": 108030 + }, + { + "epoch": 17.623980424143557, + "grad_norm": 0.0002677010197658092, + "learning_rate": 4.2385837975350115e-05, + "loss": 0.0018, + "num_input_tokens_seen": 233116992, + "step": 108035 + }, + { + "epoch": 17.624796084828713, + "grad_norm": 0.0024015968665480614, + "learning_rate": 4.235716172644394e-05, + "loss": 0.0002, + "num_input_tokens_seen": 233128256, + "step": 108040 + }, + { + "epoch": 17.625611745513865, + "grad_norm": 0.09645512700080872, + "learning_rate": 4.232849475225048e-05, + "loss": 0.0031, + "num_input_tokens_seen": 233139840, + "step": 108045 + }, + { + "epoch": 17.62642740619902, + "grad_norm": 0.0002454793138895184, + "learning_rate": 4.2299837053350606e-05, + "loss": 0.0065, + "num_input_tokens_seen": 233151296, + "step": 108050 + }, + { + "epoch": 17.627243066884176, + "grad_norm": 0.0007525387918576598, + "learning_rate": 4.2271188630325195e-05, + "loss": 0.1287, + "num_input_tokens_seen": 233162720, + "step": 108055 + }, + { + "epoch": 17.628058727569332, + "grad_norm": 0.033866602927446365, + "learning_rate": 4.2242549483754836e-05, + "loss": 0.0016, + "num_input_tokens_seen": 233173216, + "step": 108060 + }, + { + "epoch": 17.628874388254488, + "grad_norm": 0.0008869385346770287, + "learning_rate": 4.221391961421989e-05, + "loss": 0.0004, + "num_input_tokens_seen": 233183008, + "step": 108065 + }, + { + "epoch": 17.62969004893964, + "grad_norm": 0.0014903295086696744, + "learning_rate": 4.218529902230062e-05, + "loss": 0.0003, + "num_input_tokens_seen": 233193472, + "step": 108070 + }, + { + "epoch": 17.630505709624796, + "grad_norm": 0.010620646178722382, + "learning_rate": 4.2156687708577e-05, + "loss": 0.0012, + "num_input_tokens_seen": 233202656, + "step": 108075 + }, + { + "epoch": 17.63132137030995, + "grad_norm": 0.00045275670709088445, + "learning_rate": 4.212808567362897e-05, + "loss": 0.0001, + "num_input_tokens_seen": 233213152, + "step": 108080 + }, + { + "epoch": 17.632137030995107, + "grad_norm": 0.006362323183566332, + "learning_rate": 4.209949291803611e-05, + "loss": 0.0015, + "num_input_tokens_seen": 233224608, + "step": 108085 + }, + { + "epoch": 17.63295269168026, + "grad_norm": 7.067230035318062e-05, + "learning_rate": 4.207090944237796e-05, + "loss": 0.0001, + "num_input_tokens_seen": 233236384, + "step": 108090 + }, + { + "epoch": 17.633768352365415, + "grad_norm": 0.0015735257184132934, + "learning_rate": 4.204233524723372e-05, + "loss": 0.0013, + "num_input_tokens_seen": 233246624, + "step": 108095 + }, + { + "epoch": 17.63458401305057, + "grad_norm": 0.016705073416233063, + "learning_rate": 4.201377033318249e-05, + "loss": 0.0005, + "num_input_tokens_seen": 233258240, + "step": 108100 + }, + { + "epoch": 17.635399673735726, + "grad_norm": 0.00030592328403145075, + "learning_rate": 4.198521470080324e-05, + "loss": 0.0001, + "num_input_tokens_seen": 233268992, + "step": 108105 + }, + { + "epoch": 17.636215334420882, + "grad_norm": 0.0003127037489321083, + "learning_rate": 4.195666835067463e-05, + "loss": 0.0008, + "num_input_tokens_seen": 233279072, + "step": 108110 + }, + { + "epoch": 17.637030995106034, + "grad_norm": 0.0013172245817258954, + "learning_rate": 4.1928131283375246e-05, + "loss": 0.0005, + "num_input_tokens_seen": 233290752, + "step": 108115 + }, + { + "epoch": 17.63784665579119, + "grad_norm": 0.008333837613463402, + "learning_rate": 4.189960349948335e-05, + "loss": 0.0002, + "num_input_tokens_seen": 233301216, + "step": 108120 + }, + { + "epoch": 17.638662316476346, + "grad_norm": 0.009946209378540516, + "learning_rate": 4.1871084999577146e-05, + "loss": 0.0032, + "num_input_tokens_seen": 233312096, + "step": 108125 + }, + { + "epoch": 17.6394779771615, + "grad_norm": 0.0004881436179857701, + "learning_rate": 4.184257578423456e-05, + "loss": 0.0021, + "num_input_tokens_seen": 233323520, + "step": 108130 + }, + { + "epoch": 17.640293637846657, + "grad_norm": 0.0004600707325153053, + "learning_rate": 4.1814075854033405e-05, + "loss": 0.0002, + "num_input_tokens_seen": 233334240, + "step": 108135 + }, + { + "epoch": 17.64110929853181, + "grad_norm": 0.0008929728646762669, + "learning_rate": 4.178558520955117e-05, + "loss": 0.0009, + "num_input_tokens_seen": 233345024, + "step": 108140 + }, + { + "epoch": 17.641924959216965, + "grad_norm": 0.00017338867473881692, + "learning_rate": 4.175710385136539e-05, + "loss": 0.0002, + "num_input_tokens_seen": 233356640, + "step": 108145 + }, + { + "epoch": 17.64274061990212, + "grad_norm": 0.002758380025625229, + "learning_rate": 4.172863178005326e-05, + "loss": 0.0003, + "num_input_tokens_seen": 233367360, + "step": 108150 + }, + { + "epoch": 17.643556280587276, + "grad_norm": 0.0004112807509955019, + "learning_rate": 4.1700168996191726e-05, + "loss": 0.0178, + "num_input_tokens_seen": 233377856, + "step": 108155 + }, + { + "epoch": 17.644371941272432, + "grad_norm": 0.00020189663337077945, + "learning_rate": 4.16717155003577e-05, + "loss": 0.0001, + "num_input_tokens_seen": 233388256, + "step": 108160 + }, + { + "epoch": 17.645187601957584, + "grad_norm": 0.07259958237409592, + "learning_rate": 4.164327129312778e-05, + "loss": 0.0032, + "num_input_tokens_seen": 233399264, + "step": 108165 + }, + { + "epoch": 17.64600326264274, + "grad_norm": 0.07394101470708847, + "learning_rate": 4.161483637507846e-05, + "loss": 0.0044, + "num_input_tokens_seen": 233410816, + "step": 108170 + }, + { + "epoch": 17.646818923327896, + "grad_norm": 0.01238948293030262, + "learning_rate": 4.1586410746785927e-05, + "loss": 0.0005, + "num_input_tokens_seen": 233420352, + "step": 108175 + }, + { + "epoch": 17.64763458401305, + "grad_norm": 0.0001495694014010951, + "learning_rate": 4.155799440882635e-05, + "loss": 0.0008, + "num_input_tokens_seen": 233431584, + "step": 108180 + }, + { + "epoch": 17.648450244698207, + "grad_norm": 0.00932259764522314, + "learning_rate": 4.152958736177559e-05, + "loss": 0.0012, + "num_input_tokens_seen": 233443616, + "step": 108185 + }, + { + "epoch": 17.64926590538336, + "grad_norm": 0.05312154069542885, + "learning_rate": 4.1501189606209356e-05, + "loss": 0.0018, + "num_input_tokens_seen": 233455264, + "step": 108190 + }, + { + "epoch": 17.650081566068515, + "grad_norm": 0.00014607422053813934, + "learning_rate": 4.147280114270319e-05, + "loss": 0.0006, + "num_input_tokens_seen": 233465344, + "step": 108195 + }, + { + "epoch": 17.65089722675367, + "grad_norm": 0.00015902251470834017, + "learning_rate": 4.1444421971832346e-05, + "loss": 0.0002, + "num_input_tokens_seen": 233476768, + "step": 108200 + }, + { + "epoch": 17.651712887438826, + "grad_norm": 0.00014436140190809965, + "learning_rate": 4.1416052094171985e-05, + "loss": 0.0008, + "num_input_tokens_seen": 233486496, + "step": 108205 + }, + { + "epoch": 17.652528548123982, + "grad_norm": 8.24671151349321e-05, + "learning_rate": 4.1387691510297146e-05, + "loss": 0.0015, + "num_input_tokens_seen": 233497248, + "step": 108210 + }, + { + "epoch": 17.653344208809134, + "grad_norm": 0.000402944308007136, + "learning_rate": 4.1359340220782524e-05, + "loss": 0.0011, + "num_input_tokens_seen": 233507424, + "step": 108215 + }, + { + "epoch": 17.65415986949429, + "grad_norm": 0.0073605673387646675, + "learning_rate": 4.133099822620268e-05, + "loss": 0.001, + "num_input_tokens_seen": 233518752, + "step": 108220 + }, + { + "epoch": 17.654975530179446, + "grad_norm": 0.005550917703658342, + "learning_rate": 4.130266552713202e-05, + "loss": 0.0003, + "num_input_tokens_seen": 233527904, + "step": 108225 + }, + { + "epoch": 17.6557911908646, + "grad_norm": 0.008589318953454494, + "learning_rate": 4.1274342124144713e-05, + "loss": 0.0012, + "num_input_tokens_seen": 233538336, + "step": 108230 + }, + { + "epoch": 17.656606851549757, + "grad_norm": 0.3332434594631195, + "learning_rate": 4.124602801781485e-05, + "loss": 0.0112, + "num_input_tokens_seen": 233549344, + "step": 108235 + }, + { + "epoch": 17.65742251223491, + "grad_norm": 0.15488383173942566, + "learning_rate": 4.1217723208716196e-05, + "loss": 0.0019, + "num_input_tokens_seen": 233560320, + "step": 108240 + }, + { + "epoch": 17.658238172920065, + "grad_norm": 0.0019430075772106647, + "learning_rate": 4.118942769742234e-05, + "loss": 0.0119, + "num_input_tokens_seen": 233572224, + "step": 108245 + }, + { + "epoch": 17.65905383360522, + "grad_norm": 0.01220698095858097, + "learning_rate": 4.116114148450673e-05, + "loss": 0.0013, + "num_input_tokens_seen": 233580480, + "step": 108250 + }, + { + "epoch": 17.659869494290376, + "grad_norm": 1.0205851793289185, + "learning_rate": 4.113286457054283e-05, + "loss": 0.0512, + "num_input_tokens_seen": 233592128, + "step": 108255 + }, + { + "epoch": 17.660685154975532, + "grad_norm": 0.0017681606113910675, + "learning_rate": 4.1104596956103356e-05, + "loss": 0.0003, + "num_input_tokens_seen": 233602496, + "step": 108260 + }, + { + "epoch": 17.661500815660684, + "grad_norm": 0.014548354782164097, + "learning_rate": 4.107633864176158e-05, + "loss": 0.0002, + "num_input_tokens_seen": 233612384, + "step": 108265 + }, + { + "epoch": 17.66231647634584, + "grad_norm": 0.00037902678013779223, + "learning_rate": 4.104808962808976e-05, + "loss": 0.0002, + "num_input_tokens_seen": 233623616, + "step": 108270 + }, + { + "epoch": 17.663132137030995, + "grad_norm": 0.009568002074956894, + "learning_rate": 4.101984991566082e-05, + "loss": 0.0016, + "num_input_tokens_seen": 233633568, + "step": 108275 + }, + { + "epoch": 17.66394779771615, + "grad_norm": 0.00016255929949693382, + "learning_rate": 4.0991619505046764e-05, + "loss": 0.0021, + "num_input_tokens_seen": 233645152, + "step": 108280 + }, + { + "epoch": 17.664763458401303, + "grad_norm": 0.00015098000585567206, + "learning_rate": 4.096339839681984e-05, + "loss": 0.001, + "num_input_tokens_seen": 233656224, + "step": 108285 + }, + { + "epoch": 17.66557911908646, + "grad_norm": 0.00950339064002037, + "learning_rate": 4.0935186591552044e-05, + "loss": 0.0003, + "num_input_tokens_seen": 233665152, + "step": 108290 + }, + { + "epoch": 17.666394779771615, + "grad_norm": 0.06841231882572174, + "learning_rate": 4.0906984089815026e-05, + "loss": 0.0021, + "num_input_tokens_seen": 233676096, + "step": 108295 + }, + { + "epoch": 17.66721044045677, + "grad_norm": 0.03269435837864876, + "learning_rate": 4.087879089218033e-05, + "loss": 0.0008, + "num_input_tokens_seen": 233686944, + "step": 108300 + }, + { + "epoch": 17.668026101141926, + "grad_norm": 0.001592471613548696, + "learning_rate": 4.085060699921944e-05, + "loss": 0.0014, + "num_input_tokens_seen": 233697312, + "step": 108305 + }, + { + "epoch": 17.66884176182708, + "grad_norm": 0.001219558878801763, + "learning_rate": 4.0822432411503464e-05, + "loss": 0.0007, + "num_input_tokens_seen": 233708224, + "step": 108310 + }, + { + "epoch": 17.669657422512234, + "grad_norm": 0.0005606732447631657, + "learning_rate": 4.079426712960338e-05, + "loss": 0.0006, + "num_input_tokens_seen": 233719456, + "step": 108315 + }, + { + "epoch": 17.67047308319739, + "grad_norm": 0.00917492900043726, + "learning_rate": 4.076611115409001e-05, + "loss": 0.0004, + "num_input_tokens_seen": 233730080, + "step": 108320 + }, + { + "epoch": 17.671288743882545, + "grad_norm": 0.0009105582721531391, + "learning_rate": 4.073796448553402e-05, + "loss": 0.0001, + "num_input_tokens_seen": 233740224, + "step": 108325 + }, + { + "epoch": 17.6721044045677, + "grad_norm": 0.00020460327505134046, + "learning_rate": 4.070982712450571e-05, + "loss": 0.0007, + "num_input_tokens_seen": 233750784, + "step": 108330 + }, + { + "epoch": 17.672920065252853, + "grad_norm": 0.00019601680105552077, + "learning_rate": 4.068169907157548e-05, + "loss": 0.0017, + "num_input_tokens_seen": 233762336, + "step": 108335 + }, + { + "epoch": 17.67373572593801, + "grad_norm": 0.0015990632819011807, + "learning_rate": 4.065358032731331e-05, + "loss": 0.0011, + "num_input_tokens_seen": 233772768, + "step": 108340 + }, + { + "epoch": 17.674551386623165, + "grad_norm": 0.29436194896698, + "learning_rate": 4.062547089228902e-05, + "loss": 0.0071, + "num_input_tokens_seen": 233783296, + "step": 108345 + }, + { + "epoch": 17.67536704730832, + "grad_norm": 0.0036432554479688406, + "learning_rate": 4.0597370767072315e-05, + "loss": 0.0009, + "num_input_tokens_seen": 233794368, + "step": 108350 + }, + { + "epoch": 17.676182707993476, + "grad_norm": 0.00014322370407171547, + "learning_rate": 4.056927995223264e-05, + "loss": 0.0009, + "num_input_tokens_seen": 233804960, + "step": 108355 + }, + { + "epoch": 17.67699836867863, + "grad_norm": 0.005534231662750244, + "learning_rate": 4.054119844833948e-05, + "loss": 0.0007, + "num_input_tokens_seen": 233816640, + "step": 108360 + }, + { + "epoch": 17.677814029363784, + "grad_norm": 0.0008401111699640751, + "learning_rate": 4.0513126255961594e-05, + "loss": 0.0005, + "num_input_tokens_seen": 233828256, + "step": 108365 + }, + { + "epoch": 17.67862969004894, + "grad_norm": 0.0002920089173130691, + "learning_rate": 4.0485063375668316e-05, + "loss": 0.0003, + "num_input_tokens_seen": 233838688, + "step": 108370 + }, + { + "epoch": 17.679445350734095, + "grad_norm": 0.0013781589223071933, + "learning_rate": 4.045700980802802e-05, + "loss": 0.0011, + "num_input_tokens_seen": 233849472, + "step": 108375 + }, + { + "epoch": 17.68026101141925, + "grad_norm": 0.0002219887392129749, + "learning_rate": 4.042896555360953e-05, + "loss": 0.0038, + "num_input_tokens_seen": 233860032, + "step": 108380 + }, + { + "epoch": 17.681076672104403, + "grad_norm": 0.00032488370197825134, + "learning_rate": 4.040093061298089e-05, + "loss": 0.0001, + "num_input_tokens_seen": 233870656, + "step": 108385 + }, + { + "epoch": 17.68189233278956, + "grad_norm": 0.04000347852706909, + "learning_rate": 4.037290498671059e-05, + "loss": 0.0024, + "num_input_tokens_seen": 233882848, + "step": 108390 + }, + { + "epoch": 17.682707993474715, + "grad_norm": 0.009271733462810516, + "learning_rate": 4.0344888675366285e-05, + "loss": 0.0004, + "num_input_tokens_seen": 233893696, + "step": 108395 + }, + { + "epoch": 17.68352365415987, + "grad_norm": 0.0014449028531089425, + "learning_rate": 4.031688167951614e-05, + "loss": 0.0016, + "num_input_tokens_seen": 233904992, + "step": 108400 + }, + { + "epoch": 17.684339314845026, + "grad_norm": 0.00025338304112665355, + "learning_rate": 4.02888839997273e-05, + "loss": 0.0009, + "num_input_tokens_seen": 233915296, + "step": 108405 + }, + { + "epoch": 17.68515497553018, + "grad_norm": 0.007041054777801037, + "learning_rate": 4.0260895636567654e-05, + "loss": 0.0002, + "num_input_tokens_seen": 233925344, + "step": 108410 + }, + { + "epoch": 17.685970636215334, + "grad_norm": 0.011270069517195225, + "learning_rate": 4.0232916590603964e-05, + "loss": 0.0016, + "num_input_tokens_seen": 233937440, + "step": 108415 + }, + { + "epoch": 17.68678629690049, + "grad_norm": 0.06087367609143257, + "learning_rate": 4.020494686240361e-05, + "loss": 0.0012, + "num_input_tokens_seen": 233947264, + "step": 108420 + }, + { + "epoch": 17.687601957585645, + "grad_norm": 6.953623960725963e-05, + "learning_rate": 4.017698645253321e-05, + "loss": 0.0003, + "num_input_tokens_seen": 233958432, + "step": 108425 + }, + { + "epoch": 17.6884176182708, + "grad_norm": 0.00015272473683580756, + "learning_rate": 4.0149035361559504e-05, + "loss": 0.0009, + "num_input_tokens_seen": 233969728, + "step": 108430 + }, + { + "epoch": 17.689233278955953, + "grad_norm": 0.00020958666573278606, + "learning_rate": 4.0121093590049004e-05, + "loss": 0.0003, + "num_input_tokens_seen": 233980832, + "step": 108435 + }, + { + "epoch": 17.69004893964111, + "grad_norm": 0.00012452987721189857, + "learning_rate": 4.009316113856798e-05, + "loss": 0.0002, + "num_input_tokens_seen": 233990144, + "step": 108440 + }, + { + "epoch": 17.690864600326265, + "grad_norm": 0.003428572788834572, + "learning_rate": 4.0065238007682414e-05, + "loss": 0.0005, + "num_input_tokens_seen": 234000544, + "step": 108445 + }, + { + "epoch": 17.69168026101142, + "grad_norm": 0.0008958429098129272, + "learning_rate": 4.0037324197958304e-05, + "loss": 0.0008, + "num_input_tokens_seen": 234011488, + "step": 108450 + }, + { + "epoch": 17.692495921696576, + "grad_norm": 0.0011487689334899187, + "learning_rate": 4.00094197099613e-05, + "loss": 0.0016, + "num_input_tokens_seen": 234022976, + "step": 108455 + }, + { + "epoch": 17.693311582381728, + "grad_norm": 0.01628243364393711, + "learning_rate": 3.9981524544256964e-05, + "loss": 0.001, + "num_input_tokens_seen": 234034528, + "step": 108460 + }, + { + "epoch": 17.694127243066884, + "grad_norm": 0.00017118206596933305, + "learning_rate": 3.995363870141061e-05, + "loss": 0.0001, + "num_input_tokens_seen": 234045984, + "step": 108465 + }, + { + "epoch": 17.69494290375204, + "grad_norm": 0.010191963985562325, + "learning_rate": 3.9925762181987345e-05, + "loss": 0.0015, + "num_input_tokens_seen": 234056064, + "step": 108470 + }, + { + "epoch": 17.695758564437195, + "grad_norm": 0.00038367457455024123, + "learning_rate": 3.9897894986552216e-05, + "loss": 0.0003, + "num_input_tokens_seen": 234066848, + "step": 108475 + }, + { + "epoch": 17.696574225122347, + "grad_norm": 0.011004485189914703, + "learning_rate": 3.987003711566978e-05, + "loss": 0.0178, + "num_input_tokens_seen": 234078176, + "step": 108480 + }, + { + "epoch": 17.697389885807503, + "grad_norm": 0.0054056174121797085, + "learning_rate": 3.984218856990496e-05, + "loss": 0.0015, + "num_input_tokens_seen": 234089408, + "step": 108485 + }, + { + "epoch": 17.69820554649266, + "grad_norm": 0.010032473132014275, + "learning_rate": 3.981434934982176e-05, + "loss": 0.0002, + "num_input_tokens_seen": 234099840, + "step": 108490 + }, + { + "epoch": 17.699021207177815, + "grad_norm": 0.0347306989133358, + "learning_rate": 3.978651945598472e-05, + "loss": 0.0007, + "num_input_tokens_seen": 234110208, + "step": 108495 + }, + { + "epoch": 17.69983686786297, + "grad_norm": 0.021666377782821655, + "learning_rate": 3.975869888895756e-05, + "loss": 0.0064, + "num_input_tokens_seen": 234119552, + "step": 108500 + }, + { + "epoch": 17.700652528548122, + "grad_norm": 0.021957948803901672, + "learning_rate": 3.973088764930433e-05, + "loss": 0.0007, + "num_input_tokens_seen": 234130720, + "step": 108505 + }, + { + "epoch": 17.701468189233278, + "grad_norm": 0.0002314764424227178, + "learning_rate": 3.9703085737588405e-05, + "loss": 0.0003, + "num_input_tokens_seen": 234141280, + "step": 108510 + }, + { + "epoch": 17.702283849918434, + "grad_norm": 0.0006506431382149458, + "learning_rate": 3.967529315437357e-05, + "loss": 0.0044, + "num_input_tokens_seen": 234153280, + "step": 108515 + }, + { + "epoch": 17.70309951060359, + "grad_norm": 0.0002907815796788782, + "learning_rate": 3.96475099002227e-05, + "loss": 0.0006, + "num_input_tokens_seen": 234163488, + "step": 108520 + }, + { + "epoch": 17.703915171288745, + "grad_norm": 0.008935083635151386, + "learning_rate": 3.9619735975699236e-05, + "loss": 0.0004, + "num_input_tokens_seen": 234175360, + "step": 108525 + }, + { + "epoch": 17.704730831973897, + "grad_norm": 0.0002285427472088486, + "learning_rate": 3.9591971381365665e-05, + "loss": 0.0053, + "num_input_tokens_seen": 234185920, + "step": 108530 + }, + { + "epoch": 17.705546492659053, + "grad_norm": 0.004376788157969713, + "learning_rate": 3.956421611778499e-05, + "loss": 0.0008, + "num_input_tokens_seen": 234197376, + "step": 108535 + }, + { + "epoch": 17.70636215334421, + "grad_norm": 0.002194557571783662, + "learning_rate": 3.953647018551948e-05, + "loss": 0.0002, + "num_input_tokens_seen": 234207488, + "step": 108540 + }, + { + "epoch": 17.707177814029365, + "grad_norm": 0.027959568426012993, + "learning_rate": 3.950873358513168e-05, + "loss": 0.0011, + "num_input_tokens_seen": 234219680, + "step": 108545 + }, + { + "epoch": 17.70799347471452, + "grad_norm": 0.0012162302155047655, + "learning_rate": 3.948100631718338e-05, + "loss": 0.0016, + "num_input_tokens_seen": 234230048, + "step": 108550 + }, + { + "epoch": 17.708809135399672, + "grad_norm": 0.00028241900145076215, + "learning_rate": 3.945328838223688e-05, + "loss": 0.0003, + "num_input_tokens_seen": 234240992, + "step": 108555 + }, + { + "epoch": 17.709624796084828, + "grad_norm": 0.014012595638632774, + "learning_rate": 3.942557978085354e-05, + "loss": 0.0775, + "num_input_tokens_seen": 234251936, + "step": 108560 + }, + { + "epoch": 17.710440456769984, + "grad_norm": 0.0034606587141752243, + "learning_rate": 3.939788051359522e-05, + "loss": 0.0051, + "num_input_tokens_seen": 234261664, + "step": 108565 + }, + { + "epoch": 17.71125611745514, + "grad_norm": 0.0002501814451534301, + "learning_rate": 3.93701905810232e-05, + "loss": 0.0003, + "num_input_tokens_seen": 234271808, + "step": 108570 + }, + { + "epoch": 17.712071778140295, + "grad_norm": 0.0033483714796602726, + "learning_rate": 3.934250998369859e-05, + "loss": 0.0003, + "num_input_tokens_seen": 234281984, + "step": 108575 + }, + { + "epoch": 17.712887438825447, + "grad_norm": 0.00021294719772413373, + "learning_rate": 3.931483872218239e-05, + "loss": 0.0003, + "num_input_tokens_seen": 234293568, + "step": 108580 + }, + { + "epoch": 17.713703099510603, + "grad_norm": 0.00040649285074323416, + "learning_rate": 3.928717679703542e-05, + "loss": 0.0005, + "num_input_tokens_seen": 234304480, + "step": 108585 + }, + { + "epoch": 17.71451876019576, + "grad_norm": 0.001774911186657846, + "learning_rate": 3.925952420881823e-05, + "loss": 0.0004, + "num_input_tokens_seen": 234314656, + "step": 108590 + }, + { + "epoch": 17.715334420880914, + "grad_norm": 0.029066352173686028, + "learning_rate": 3.9231880958091325e-05, + "loss": 0.0005, + "num_input_tokens_seen": 234325760, + "step": 108595 + }, + { + "epoch": 17.71615008156607, + "grad_norm": 0.26989907026290894, + "learning_rate": 3.920424704541481e-05, + "loss": 0.0096, + "num_input_tokens_seen": 234336064, + "step": 108600 + }, + { + "epoch": 17.716965742251222, + "grad_norm": 0.00025042338529601693, + "learning_rate": 3.9176622471348845e-05, + "loss": 0.0002, + "num_input_tokens_seen": 234348000, + "step": 108605 + }, + { + "epoch": 17.717781402936378, + "grad_norm": 0.00035195687087252736, + "learning_rate": 3.9149007236453204e-05, + "loss": 0.0004, + "num_input_tokens_seen": 234359232, + "step": 108610 + }, + { + "epoch": 17.718597063621534, + "grad_norm": 0.0011137579567730427, + "learning_rate": 3.912140134128761e-05, + "loss": 0.0005, + "num_input_tokens_seen": 234369056, + "step": 108615 + }, + { + "epoch": 17.71941272430669, + "grad_norm": 0.06830008327960968, + "learning_rate": 3.909380478641139e-05, + "loss": 0.0031, + "num_input_tokens_seen": 234379904, + "step": 108620 + }, + { + "epoch": 17.72022838499184, + "grad_norm": 0.000205979187740013, + "learning_rate": 3.906621757238393e-05, + "loss": 0.0006, + "num_input_tokens_seen": 234390272, + "step": 108625 + }, + { + "epoch": 17.721044045676997, + "grad_norm": 0.0004948555142618716, + "learning_rate": 3.90386396997644e-05, + "loss": 0.0004, + "num_input_tokens_seen": 234401792, + "step": 108630 + }, + { + "epoch": 17.721859706362153, + "grad_norm": 0.00019920221529901028, + "learning_rate": 3.901107116911145e-05, + "loss": 0.0048, + "num_input_tokens_seen": 234412736, + "step": 108635 + }, + { + "epoch": 17.72267536704731, + "grad_norm": 0.00018796759832184762, + "learning_rate": 3.8983511980984154e-05, + "loss": 0.001, + "num_input_tokens_seen": 234423840, + "step": 108640 + }, + { + "epoch": 17.723491027732464, + "grad_norm": 0.006423200946301222, + "learning_rate": 3.895596213594066e-05, + "loss": 0.0015, + "num_input_tokens_seen": 234434848, + "step": 108645 + }, + { + "epoch": 17.724306688417617, + "grad_norm": 0.03359965980052948, + "learning_rate": 3.892842163453964e-05, + "loss": 0.0444, + "num_input_tokens_seen": 234444672, + "step": 108650 + }, + { + "epoch": 17.725122349102772, + "grad_norm": 0.0003748275339603424, + "learning_rate": 3.8900890477338856e-05, + "loss": 0.0006, + "num_input_tokens_seen": 234455968, + "step": 108655 + }, + { + "epoch": 17.725938009787928, + "grad_norm": 0.0005921075353398919, + "learning_rate": 3.887336866489666e-05, + "loss": 0.0294, + "num_input_tokens_seen": 234467392, + "step": 108660 + }, + { + "epoch": 17.726753670473084, + "grad_norm": 0.0017802788643166423, + "learning_rate": 3.884585619777048e-05, + "loss": 0.0121, + "num_input_tokens_seen": 234478272, + "step": 108665 + }, + { + "epoch": 17.72756933115824, + "grad_norm": 0.16301719844341278, + "learning_rate": 3.881835307651816e-05, + "loss": 0.0055, + "num_input_tokens_seen": 234488928, + "step": 108670 + }, + { + "epoch": 17.72838499184339, + "grad_norm": 0.029957765713334084, + "learning_rate": 3.879085930169685e-05, + "loss": 0.0006, + "num_input_tokens_seen": 234500128, + "step": 108675 + }, + { + "epoch": 17.729200652528547, + "grad_norm": 0.02622179314494133, + "learning_rate": 3.8763374873863886e-05, + "loss": 0.0026, + "num_input_tokens_seen": 234509568, + "step": 108680 + }, + { + "epoch": 17.730016313213703, + "grad_norm": 0.24510152637958527, + "learning_rate": 3.873589979357633e-05, + "loss": 0.0069, + "num_input_tokens_seen": 234520480, + "step": 108685 + }, + { + "epoch": 17.73083197389886, + "grad_norm": 0.0002289361582370475, + "learning_rate": 3.870843406139085e-05, + "loss": 0.0013, + "num_input_tokens_seen": 234531520, + "step": 108690 + }, + { + "epoch": 17.731647634584014, + "grad_norm": 0.14010895788669586, + "learning_rate": 3.868097767786416e-05, + "loss": 0.0017, + "num_input_tokens_seen": 234543296, + "step": 108695 + }, + { + "epoch": 17.732463295269167, + "grad_norm": 0.00021706173720303923, + "learning_rate": 3.86535306435527e-05, + "loss": 0.0003, + "num_input_tokens_seen": 234555232, + "step": 108700 + }, + { + "epoch": 17.733278955954322, + "grad_norm": 0.4328491687774658, + "learning_rate": 3.8626092959012706e-05, + "loss": 0.0122, + "num_input_tokens_seen": 234566528, + "step": 108705 + }, + { + "epoch": 17.734094616639478, + "grad_norm": 0.0002987213956657797, + "learning_rate": 3.8598664624800215e-05, + "loss": 0.0013, + "num_input_tokens_seen": 234578208, + "step": 108710 + }, + { + "epoch": 17.734910277324634, + "grad_norm": 0.000590853625908494, + "learning_rate": 3.857124564147113e-05, + "loss": 0.0002, + "num_input_tokens_seen": 234589120, + "step": 108715 + }, + { + "epoch": 17.73572593800979, + "grad_norm": 0.0025401469320058823, + "learning_rate": 3.8543836009581115e-05, + "loss": 0.0001, + "num_input_tokens_seen": 234599392, + "step": 108720 + }, + { + "epoch": 17.73654159869494, + "grad_norm": 0.00018461898434907198, + "learning_rate": 3.851643572968566e-05, + "loss": 0.0019, + "num_input_tokens_seen": 234611360, + "step": 108725 + }, + { + "epoch": 17.737357259380097, + "grad_norm": 0.0029929070733487606, + "learning_rate": 3.848904480234006e-05, + "loss": 0.0014, + "num_input_tokens_seen": 234622528, + "step": 108730 + }, + { + "epoch": 17.738172920065253, + "grad_norm": 0.003368227044120431, + "learning_rate": 3.846166322809941e-05, + "loss": 0.0765, + "num_input_tokens_seen": 234634784, + "step": 108735 + }, + { + "epoch": 17.73898858075041, + "grad_norm": 0.0004544317489489913, + "learning_rate": 3.8434291007518665e-05, + "loss": 0.0007, + "num_input_tokens_seen": 234645568, + "step": 108740 + }, + { + "epoch": 17.739804241435564, + "grad_norm": 0.0015195683808997273, + "learning_rate": 3.8406928141152596e-05, + "loss": 0.0015, + "num_input_tokens_seen": 234656800, + "step": 108745 + }, + { + "epoch": 17.740619902120716, + "grad_norm": 0.0062785097397863865, + "learning_rate": 3.8379574629555656e-05, + "loss": 0.0003, + "num_input_tokens_seen": 234667840, + "step": 108750 + }, + { + "epoch": 17.741435562805872, + "grad_norm": 0.0058403462171554565, + "learning_rate": 3.835223047328229e-05, + "loss": 0.0003, + "num_input_tokens_seen": 234679616, + "step": 108755 + }, + { + "epoch": 17.742251223491028, + "grad_norm": 0.003449544310569763, + "learning_rate": 3.8324895672886554e-05, + "loss": 0.0003, + "num_input_tokens_seen": 234690496, + "step": 108760 + }, + { + "epoch": 17.743066884176184, + "grad_norm": 0.00012372207129374146, + "learning_rate": 3.829757022892255e-05, + "loss": 0.0002, + "num_input_tokens_seen": 234701600, + "step": 108765 + }, + { + "epoch": 17.74388254486134, + "grad_norm": 0.00031904526986181736, + "learning_rate": 3.827025414194385e-05, + "loss": 0.0013, + "num_input_tokens_seen": 234711648, + "step": 108770 + }, + { + "epoch": 17.74469820554649, + "grad_norm": 0.00018416567763779312, + "learning_rate": 3.824294741250439e-05, + "loss": 0.0005, + "num_input_tokens_seen": 234723040, + "step": 108775 + }, + { + "epoch": 17.745513866231647, + "grad_norm": 0.00019524396338965744, + "learning_rate": 3.821565004115723e-05, + "loss": 0.0012, + "num_input_tokens_seen": 234733216, + "step": 108780 + }, + { + "epoch": 17.746329526916803, + "grad_norm": 0.005155547522008419, + "learning_rate": 3.8188362028455826e-05, + "loss": 0.0004, + "num_input_tokens_seen": 234742976, + "step": 108785 + }, + { + "epoch": 17.74714518760196, + "grad_norm": 0.006279274821281433, + "learning_rate": 3.8161083374953056e-05, + "loss": 0.0003, + "num_input_tokens_seen": 234754112, + "step": 108790 + }, + { + "epoch": 17.747960848287114, + "grad_norm": 0.0029015159234404564, + "learning_rate": 3.8133814081201866e-05, + "loss": 0.001, + "num_input_tokens_seen": 234764960, + "step": 108795 + }, + { + "epoch": 17.748776508972266, + "grad_norm": 0.0019166948040947318, + "learning_rate": 3.810655414775482e-05, + "loss": 0.0016, + "num_input_tokens_seen": 234775936, + "step": 108800 + }, + { + "epoch": 17.749592169657422, + "grad_norm": 0.0014838333008810878, + "learning_rate": 3.807930357516448e-05, + "loss": 0.0007, + "num_input_tokens_seen": 234787392, + "step": 108805 + }, + { + "epoch": 17.750407830342578, + "grad_norm": 0.0014232834801077843, + "learning_rate": 3.8052062363982957e-05, + "loss": 0.0001, + "num_input_tokens_seen": 234799200, + "step": 108810 + }, + { + "epoch": 17.751223491027734, + "grad_norm": 0.0007205366273410618, + "learning_rate": 3.8024830514762465e-05, + "loss": 0.0006, + "num_input_tokens_seen": 234811200, + "step": 108815 + }, + { + "epoch": 17.752039151712886, + "grad_norm": 0.0006578321335837245, + "learning_rate": 3.79976080280548e-05, + "loss": 0.0003, + "num_input_tokens_seen": 234822848, + "step": 108820 + }, + { + "epoch": 17.75285481239804, + "grad_norm": 0.0024306431878358126, + "learning_rate": 3.7970394904411733e-05, + "loss": 0.0004, + "num_input_tokens_seen": 234833984, + "step": 108825 + }, + { + "epoch": 17.753670473083197, + "grad_norm": 0.0006951441173441708, + "learning_rate": 3.7943191144384716e-05, + "loss": 0.0001, + "num_input_tokens_seen": 234844448, + "step": 108830 + }, + { + "epoch": 17.754486133768353, + "grad_norm": 0.0059017338789999485, + "learning_rate": 3.7915996748525086e-05, + "loss": 0.0003, + "num_input_tokens_seen": 234855712, + "step": 108835 + }, + { + "epoch": 17.75530179445351, + "grad_norm": 0.0008645120542496443, + "learning_rate": 3.788881171738401e-05, + "loss": 0.0001, + "num_input_tokens_seen": 234867648, + "step": 108840 + }, + { + "epoch": 17.75611745513866, + "grad_norm": 7.370325329247862e-05, + "learning_rate": 3.7861636051512385e-05, + "loss": 0.0026, + "num_input_tokens_seen": 234879456, + "step": 108845 + }, + { + "epoch": 17.756933115823816, + "grad_norm": 0.0016245251754298806, + "learning_rate": 3.783446975146099e-05, + "loss": 0.0005, + "num_input_tokens_seen": 234890304, + "step": 108850 + }, + { + "epoch": 17.757748776508972, + "grad_norm": 0.001289760461077094, + "learning_rate": 3.7807312817780325e-05, + "loss": 0.0009, + "num_input_tokens_seen": 234900480, + "step": 108855 + }, + { + "epoch": 17.758564437194128, + "grad_norm": 0.00034233214682899415, + "learning_rate": 3.7780165251020794e-05, + "loss": 0.0002, + "num_input_tokens_seen": 234910400, + "step": 108860 + }, + { + "epoch": 17.759380097879284, + "grad_norm": 0.09299731999635696, + "learning_rate": 3.7753027051732615e-05, + "loss": 0.0019, + "num_input_tokens_seen": 234920000, + "step": 108865 + }, + { + "epoch": 17.760195758564436, + "grad_norm": 0.0001516353222541511, + "learning_rate": 3.772589822046568e-05, + "loss": 0.0004, + "num_input_tokens_seen": 234930432, + "step": 108870 + }, + { + "epoch": 17.76101141924959, + "grad_norm": 0.07532747834920883, + "learning_rate": 3.7698778757769944e-05, + "loss": 0.0014, + "num_input_tokens_seen": 234941024, + "step": 108875 + }, + { + "epoch": 17.761827079934747, + "grad_norm": 0.0005381538067013025, + "learning_rate": 3.767166866419486e-05, + "loss": 0.0029, + "num_input_tokens_seen": 234951552, + "step": 108880 + }, + { + "epoch": 17.762642740619903, + "grad_norm": 0.0006777732050977647, + "learning_rate": 3.764456794028992e-05, + "loss": 0.0002, + "num_input_tokens_seen": 234961856, + "step": 108885 + }, + { + "epoch": 17.76345840130506, + "grad_norm": 0.00014735996956005692, + "learning_rate": 3.7617476586604304e-05, + "loss": 0.0018, + "num_input_tokens_seen": 234972576, + "step": 108890 + }, + { + "epoch": 17.76427406199021, + "grad_norm": 0.0002033437485806644, + "learning_rate": 3.759039460368724e-05, + "loss": 0.0004, + "num_input_tokens_seen": 234984672, + "step": 108895 + }, + { + "epoch": 17.765089722675366, + "grad_norm": 0.00012549122038763016, + "learning_rate": 3.756332199208728e-05, + "loss": 0.0013, + "num_input_tokens_seen": 234995584, + "step": 108900 + }, + { + "epoch": 17.765905383360522, + "grad_norm": 0.0063478341326117516, + "learning_rate": 3.753625875235345e-05, + "loss": 0.0003, + "num_input_tokens_seen": 235005120, + "step": 108905 + }, + { + "epoch": 17.766721044045678, + "grad_norm": 0.0003417044354137033, + "learning_rate": 3.750920488503379e-05, + "loss": 0.0008, + "num_input_tokens_seen": 235017792, + "step": 108910 + }, + { + "epoch": 17.767536704730833, + "grad_norm": 0.007389608770608902, + "learning_rate": 3.7482160390676866e-05, + "loss": 0.0432, + "num_input_tokens_seen": 235028704, + "step": 108915 + }, + { + "epoch": 17.768352365415986, + "grad_norm": 0.007980763912200928, + "learning_rate": 3.745512526983075e-05, + "loss": 0.0019, + "num_input_tokens_seen": 235039008, + "step": 108920 + }, + { + "epoch": 17.76916802610114, + "grad_norm": 0.0023068757727742195, + "learning_rate": 3.7428099523043325e-05, + "loss": 0.0005, + "num_input_tokens_seen": 235049344, + "step": 108925 + }, + { + "epoch": 17.769983686786297, + "grad_norm": 0.0007429802208207548, + "learning_rate": 3.7401083150862216e-05, + "loss": 0.0017, + "num_input_tokens_seen": 235060320, + "step": 108930 + }, + { + "epoch": 17.770799347471453, + "grad_norm": 0.0001011027125059627, + "learning_rate": 3.7374076153835033e-05, + "loss": 0.0004, + "num_input_tokens_seen": 235071904, + "step": 108935 + }, + { + "epoch": 17.77161500815661, + "grad_norm": 0.06495852768421173, + "learning_rate": 3.734707853250907e-05, + "loss": 0.0009, + "num_input_tokens_seen": 235081952, + "step": 108940 + }, + { + "epoch": 17.77243066884176, + "grad_norm": 0.00016571665764786303, + "learning_rate": 3.73200902874315e-05, + "loss": 0.0002, + "num_input_tokens_seen": 235092832, + "step": 108945 + }, + { + "epoch": 17.773246329526916, + "grad_norm": 0.0037552814465016127, + "learning_rate": 3.729311141914926e-05, + "loss": 0.0035, + "num_input_tokens_seen": 235104768, + "step": 108950 + }, + { + "epoch": 17.774061990212072, + "grad_norm": 0.00029592993087135255, + "learning_rate": 3.72661419282091e-05, + "loss": 0.0002, + "num_input_tokens_seen": 235116032, + "step": 108955 + }, + { + "epoch": 17.774877650897228, + "grad_norm": 0.04774320870637894, + "learning_rate": 3.723918181515756e-05, + "loss": 0.0012, + "num_input_tokens_seen": 235127456, + "step": 108960 + }, + { + "epoch": 17.775693311582383, + "grad_norm": 0.0007996910135261714, + "learning_rate": 3.721223108054106e-05, + "loss": 0.0009, + "num_input_tokens_seen": 235137824, + "step": 108965 + }, + { + "epoch": 17.776508972267536, + "grad_norm": 0.0007741995505057275, + "learning_rate": 3.7185289724905814e-05, + "loss": 0.0011, + "num_input_tokens_seen": 235148928, + "step": 108970 + }, + { + "epoch": 17.77732463295269, + "grad_norm": 0.0001396966981701553, + "learning_rate": 3.7158357748797775e-05, + "loss": 0.001, + "num_input_tokens_seen": 235159552, + "step": 108975 + }, + { + "epoch": 17.778140293637847, + "grad_norm": 0.00517880916595459, + "learning_rate": 3.7131435152762735e-05, + "loss": 0.0002, + "num_input_tokens_seen": 235170880, + "step": 108980 + }, + { + "epoch": 17.778955954323003, + "grad_norm": 0.0026017839554697275, + "learning_rate": 3.710452193734643e-05, + "loss": 0.0034, + "num_input_tokens_seen": 235181568, + "step": 108985 + }, + { + "epoch": 17.77977161500816, + "grad_norm": 0.001313367742113769, + "learning_rate": 3.707761810309418e-05, + "loss": 0.0008, + "num_input_tokens_seen": 235192704, + "step": 108990 + }, + { + "epoch": 17.78058727569331, + "grad_norm": 0.0020635046530514956, + "learning_rate": 3.705072365055112e-05, + "loss": 0.0002, + "num_input_tokens_seen": 235202528, + "step": 108995 + }, + { + "epoch": 17.781402936378466, + "grad_norm": 0.019754497334361076, + "learning_rate": 3.7023838580262706e-05, + "loss": 0.002, + "num_input_tokens_seen": 235211328, + "step": 109000 + }, + { + "epoch": 17.782218597063622, + "grad_norm": 0.0002752688014879823, + "learning_rate": 3.699696289277327e-05, + "loss": 0.0001, + "num_input_tokens_seen": 235220928, + "step": 109005 + }, + { + "epoch": 17.783034257748778, + "grad_norm": 0.00021329843730200082, + "learning_rate": 3.697009658862793e-05, + "loss": 0.0002, + "num_input_tokens_seen": 235231808, + "step": 109010 + }, + { + "epoch": 17.78384991843393, + "grad_norm": 0.06523372232913971, + "learning_rate": 3.694323966837088e-05, + "loss": 0.1181, + "num_input_tokens_seen": 235242240, + "step": 109015 + }, + { + "epoch": 17.784665579119086, + "grad_norm": 0.00025183262187056243, + "learning_rate": 3.6916392132546605e-05, + "loss": 0.0037, + "num_input_tokens_seen": 235253216, + "step": 109020 + }, + { + "epoch": 17.78548123980424, + "grad_norm": 0.009605488739907742, + "learning_rate": 3.6889553981698966e-05, + "loss": 0.0005, + "num_input_tokens_seen": 235264352, + "step": 109025 + }, + { + "epoch": 17.786296900489397, + "grad_norm": 0.0003754932258743793, + "learning_rate": 3.6862725216372185e-05, + "loss": 0.0003, + "num_input_tokens_seen": 235274144, + "step": 109030 + }, + { + "epoch": 17.787112561174553, + "grad_norm": 0.010473106987774372, + "learning_rate": 3.683590583710961e-05, + "loss": 0.0003, + "num_input_tokens_seen": 235285856, + "step": 109035 + }, + { + "epoch": 17.787928221859705, + "grad_norm": 0.0005504979053512216, + "learning_rate": 3.6809095844455134e-05, + "loss": 0.0041, + "num_input_tokens_seen": 235296928, + "step": 109040 + }, + { + "epoch": 17.78874388254486, + "grad_norm": 0.002847556257620454, + "learning_rate": 3.678229523895177e-05, + "loss": 0.0012, + "num_input_tokens_seen": 235307328, + "step": 109045 + }, + { + "epoch": 17.789559543230016, + "grad_norm": 0.009820504114031792, + "learning_rate": 3.675550402114303e-05, + "loss": 0.0003, + "num_input_tokens_seen": 235317792, + "step": 109050 + }, + { + "epoch": 17.790375203915172, + "grad_norm": 0.0012126201763749123, + "learning_rate": 3.6728722191571476e-05, + "loss": 0.0009, + "num_input_tokens_seen": 235328736, + "step": 109055 + }, + { + "epoch": 17.791190864600328, + "grad_norm": 0.0015601757913827896, + "learning_rate": 3.670194975078017e-05, + "loss": 0.0005, + "num_input_tokens_seen": 235339520, + "step": 109060 + }, + { + "epoch": 17.79200652528548, + "grad_norm": 0.0011537439422681928, + "learning_rate": 3.667518669931158e-05, + "loss": 0.0002, + "num_input_tokens_seen": 235350912, + "step": 109065 + }, + { + "epoch": 17.792822185970635, + "grad_norm": 0.0054458510130643845, + "learning_rate": 3.6648433037708094e-05, + "loss": 0.0071, + "num_input_tokens_seen": 235361632, + "step": 109070 + }, + { + "epoch": 17.79363784665579, + "grad_norm": 0.03249862790107727, + "learning_rate": 3.66216887665119e-05, + "loss": 0.0012, + "num_input_tokens_seen": 235373504, + "step": 109075 + }, + { + "epoch": 17.794453507340947, + "grad_norm": 0.007340879645198584, + "learning_rate": 3.659495388626505e-05, + "loss": 0.0042, + "num_input_tokens_seen": 235384736, + "step": 109080 + }, + { + "epoch": 17.795269168026103, + "grad_norm": 0.00026531258481554687, + "learning_rate": 3.6568228397509286e-05, + "loss": 0.0007, + "num_input_tokens_seen": 235396544, + "step": 109085 + }, + { + "epoch": 17.796084828711255, + "grad_norm": 0.00012415989476721734, + "learning_rate": 3.654151230078628e-05, + "loss": 0.0002, + "num_input_tokens_seen": 235405920, + "step": 109090 + }, + { + "epoch": 17.79690048939641, + "grad_norm": 0.000654736184515059, + "learning_rate": 3.6514805596637504e-05, + "loss": 0.0065, + "num_input_tokens_seen": 235416032, + "step": 109095 + }, + { + "epoch": 17.797716150081566, + "grad_norm": 0.0004680246929638088, + "learning_rate": 3.648810828560417e-05, + "loss": 0.0008, + "num_input_tokens_seen": 235427680, + "step": 109100 + }, + { + "epoch": 17.798531810766722, + "grad_norm": 0.0009875830728560686, + "learning_rate": 3.6461420368227304e-05, + "loss": 0.0012, + "num_input_tokens_seen": 235438176, + "step": 109105 + }, + { + "epoch": 17.799347471451878, + "grad_norm": 0.009584791958332062, + "learning_rate": 3.643474184504775e-05, + "loss": 0.0131, + "num_input_tokens_seen": 235449728, + "step": 109110 + }, + { + "epoch": 17.80016313213703, + "grad_norm": 0.00047371580149047077, + "learning_rate": 3.6408072716606344e-05, + "loss": 0.0001, + "num_input_tokens_seen": 235459424, + "step": 109115 + }, + { + "epoch": 17.800978792822185, + "grad_norm": 0.16172336041927338, + "learning_rate": 3.6381412983443277e-05, + "loss": 0.0029, + "num_input_tokens_seen": 235469536, + "step": 109120 + }, + { + "epoch": 17.80179445350734, + "grad_norm": 0.00036688242107629776, + "learning_rate": 3.635476264609922e-05, + "loss": 0.0001, + "num_input_tokens_seen": 235480288, + "step": 109125 + }, + { + "epoch": 17.802610114192497, + "grad_norm": 0.0008985521853901446, + "learning_rate": 3.6328121705113905e-05, + "loss": 0.0002, + "num_input_tokens_seen": 235489984, + "step": 109130 + }, + { + "epoch": 17.803425774877653, + "grad_norm": 0.0030469854827970266, + "learning_rate": 3.6301490161027574e-05, + "loss": 0.0002, + "num_input_tokens_seen": 235501376, + "step": 109135 + }, + { + "epoch": 17.804241435562805, + "grad_norm": 0.0004418538592290133, + "learning_rate": 3.6274868014379624e-05, + "loss": 0.0013, + "num_input_tokens_seen": 235512416, + "step": 109140 + }, + { + "epoch": 17.80505709624796, + "grad_norm": 0.00020172262156847864, + "learning_rate": 3.6248255265709906e-05, + "loss": 0.0001, + "num_input_tokens_seen": 235523712, + "step": 109145 + }, + { + "epoch": 17.805872756933116, + "grad_norm": 0.0001276957045774907, + "learning_rate": 3.6221651915557484e-05, + "loss": 0.0004, + "num_input_tokens_seen": 235535616, + "step": 109150 + }, + { + "epoch": 17.806688417618272, + "grad_norm": 0.0015978385927155614, + "learning_rate": 3.6195057964461764e-05, + "loss": 0.0005, + "num_input_tokens_seen": 235546976, + "step": 109155 + }, + { + "epoch": 17.807504078303424, + "grad_norm": 0.0072805024683475494, + "learning_rate": 3.616847341296137e-05, + "loss": 0.0947, + "num_input_tokens_seen": 235557792, + "step": 109160 + }, + { + "epoch": 17.80831973898858, + "grad_norm": 0.004473361186683178, + "learning_rate": 3.6141898261595475e-05, + "loss": 0.0004, + "num_input_tokens_seen": 235568960, + "step": 109165 + }, + { + "epoch": 17.809135399673735, + "grad_norm": 0.00046136623132042587, + "learning_rate": 3.611533251090232e-05, + "loss": 0.0003, + "num_input_tokens_seen": 235578624, + "step": 109170 + }, + { + "epoch": 17.80995106035889, + "grad_norm": 0.0003029107174370438, + "learning_rate": 3.608877616142053e-05, + "loss": 0.1253, + "num_input_tokens_seen": 235588064, + "step": 109175 + }, + { + "epoch": 17.810766721044047, + "grad_norm": 0.04958463832736015, + "learning_rate": 3.606222921368807e-05, + "loss": 0.0031, + "num_input_tokens_seen": 235599808, + "step": 109180 + }, + { + "epoch": 17.8115823817292, + "grad_norm": 0.03860737383365631, + "learning_rate": 3.603569166824327e-05, + "loss": 0.0037, + "num_input_tokens_seen": 235612160, + "step": 109185 + }, + { + "epoch": 17.812398042414355, + "grad_norm": 0.00010412315896246582, + "learning_rate": 3.600916352562356e-05, + "loss": 0.0007, + "num_input_tokens_seen": 235621664, + "step": 109190 + }, + { + "epoch": 17.81321370309951, + "grad_norm": 0.00028946265229023993, + "learning_rate": 3.598264478636698e-05, + "loss": 0.0003, + "num_input_tokens_seen": 235632768, + "step": 109195 + }, + { + "epoch": 17.814029363784666, + "grad_norm": 0.0009232889278791845, + "learning_rate": 3.595613545101056e-05, + "loss": 0.0035, + "num_input_tokens_seen": 235643808, + "step": 109200 + }, + { + "epoch": 17.81484502446982, + "grad_norm": 0.0008702556369826198, + "learning_rate": 3.592963552009182e-05, + "loss": 0.0007, + "num_input_tokens_seen": 235655808, + "step": 109205 + }, + { + "epoch": 17.815660685154974, + "grad_norm": 0.005472228862345219, + "learning_rate": 3.590314499414771e-05, + "loss": 0.0002, + "num_input_tokens_seen": 235666720, + "step": 109210 + }, + { + "epoch": 17.81647634584013, + "grad_norm": 0.015534707345068455, + "learning_rate": 3.587666387371513e-05, + "loss": 0.038, + "num_input_tokens_seen": 235676960, + "step": 109215 + }, + { + "epoch": 17.817292006525285, + "grad_norm": 0.0008995892130769789, + "learning_rate": 3.585019215933072e-05, + "loss": 0.0286, + "num_input_tokens_seen": 235687136, + "step": 109220 + }, + { + "epoch": 17.81810766721044, + "grad_norm": 0.0024676453322172165, + "learning_rate": 3.5823729851530983e-05, + "loss": 0.0027, + "num_input_tokens_seen": 235698272, + "step": 109225 + }, + { + "epoch": 17.818923327895597, + "grad_norm": 9.706372657092288e-05, + "learning_rate": 3.5797276950852276e-05, + "loss": 0.0002, + "num_input_tokens_seen": 235709056, + "step": 109230 + }, + { + "epoch": 17.81973898858075, + "grad_norm": 0.017627853900194168, + "learning_rate": 3.5770833457830554e-05, + "loss": 0.0011, + "num_input_tokens_seen": 235719424, + "step": 109235 + }, + { + "epoch": 17.820554649265905, + "grad_norm": 0.0002536752144806087, + "learning_rate": 3.5744399373001834e-05, + "loss": 0.0005, + "num_input_tokens_seen": 235730560, + "step": 109240 + }, + { + "epoch": 17.82137030995106, + "grad_norm": 0.012731221504509449, + "learning_rate": 3.57179746969018e-05, + "loss": 0.0011, + "num_input_tokens_seen": 235741184, + "step": 109245 + }, + { + "epoch": 17.822185970636216, + "grad_norm": 8.880451787263155e-05, + "learning_rate": 3.569155943006602e-05, + "loss": 0.0006, + "num_input_tokens_seen": 235751840, + "step": 109250 + }, + { + "epoch": 17.82300163132137, + "grad_norm": 0.008327648974955082, + "learning_rate": 3.566515357302974e-05, + "loss": 0.0003, + "num_input_tokens_seen": 235761952, + "step": 109255 + }, + { + "epoch": 17.823817292006524, + "grad_norm": 0.00041553424671292305, + "learning_rate": 3.56387571263283e-05, + "loss": 0.0005, + "num_input_tokens_seen": 235771072, + "step": 109260 + }, + { + "epoch": 17.82463295269168, + "grad_norm": 0.013816016726195812, + "learning_rate": 3.561237009049639e-05, + "loss": 0.0004, + "num_input_tokens_seen": 235782240, + "step": 109265 + }, + { + "epoch": 17.825448613376835, + "grad_norm": 0.002118813106790185, + "learning_rate": 3.558599246606903e-05, + "loss": 0.0021, + "num_input_tokens_seen": 235791968, + "step": 109270 + }, + { + "epoch": 17.82626427406199, + "grad_norm": 0.007530451752245426, + "learning_rate": 3.555962425358056e-05, + "loss": 0.0002, + "num_input_tokens_seen": 235802880, + "step": 109275 + }, + { + "epoch": 17.827079934747147, + "grad_norm": 9.668348502600566e-05, + "learning_rate": 3.5533265453565664e-05, + "loss": 0.0146, + "num_input_tokens_seen": 235813024, + "step": 109280 + }, + { + "epoch": 17.8278955954323, + "grad_norm": 0.0386476069688797, + "learning_rate": 3.55069160665582e-05, + "loss": 0.0298, + "num_input_tokens_seen": 235825088, + "step": 109285 + }, + { + "epoch": 17.828711256117455, + "grad_norm": 0.013386093080043793, + "learning_rate": 3.5480576093092466e-05, + "loss": 0.0006, + "num_input_tokens_seen": 235835808, + "step": 109290 + }, + { + "epoch": 17.82952691680261, + "grad_norm": 0.0004151395696680993, + "learning_rate": 3.545424553370202e-05, + "loss": 0.0017, + "num_input_tokens_seen": 235847008, + "step": 109295 + }, + { + "epoch": 17.830342577487766, + "grad_norm": 0.00012074953701812774, + "learning_rate": 3.5427924388920727e-05, + "loss": 0.0004, + "num_input_tokens_seen": 235858336, + "step": 109300 + }, + { + "epoch": 17.83115823817292, + "grad_norm": 0.10005111247301102, + "learning_rate": 3.540161265928177e-05, + "loss": 0.0012, + "num_input_tokens_seen": 235868992, + "step": 109305 + }, + { + "epoch": 17.831973898858074, + "grad_norm": 0.00021929304057266563, + "learning_rate": 3.537531034531855e-05, + "loss": 0.0002, + "num_input_tokens_seen": 235880064, + "step": 109310 + }, + { + "epoch": 17.83278955954323, + "grad_norm": 0.015258029103279114, + "learning_rate": 3.5349017447564135e-05, + "loss": 0.0009, + "num_input_tokens_seen": 235891392, + "step": 109315 + }, + { + "epoch": 17.833605220228385, + "grad_norm": 0.0015490633668377995, + "learning_rate": 3.532273396655128e-05, + "loss": 0.0005, + "num_input_tokens_seen": 235903200, + "step": 109320 + }, + { + "epoch": 17.83442088091354, + "grad_norm": 0.0010220548138022423, + "learning_rate": 3.5296459902812775e-05, + "loss": 0.0011, + "num_input_tokens_seen": 235913472, + "step": 109325 + }, + { + "epoch": 17.835236541598697, + "grad_norm": 0.0015712177846580744, + "learning_rate": 3.527019525688097e-05, + "loss": 0.0062, + "num_input_tokens_seen": 235923744, + "step": 109330 + }, + { + "epoch": 17.83605220228385, + "grad_norm": 0.015524071641266346, + "learning_rate": 3.524394002928821e-05, + "loss": 0.0004, + "num_input_tokens_seen": 235934944, + "step": 109335 + }, + { + "epoch": 17.836867862969005, + "grad_norm": 0.0005214174743741751, + "learning_rate": 3.5217694220566644e-05, + "loss": 0.0002, + "num_input_tokens_seen": 235946272, + "step": 109340 + }, + { + "epoch": 17.83768352365416, + "grad_norm": 0.005835136864334345, + "learning_rate": 3.5191457831248054e-05, + "loss": 0.0023, + "num_input_tokens_seen": 235958176, + "step": 109345 + }, + { + "epoch": 17.838499184339316, + "grad_norm": 0.008611517958343029, + "learning_rate": 3.516523086186429e-05, + "loss": 0.0003, + "num_input_tokens_seen": 235967776, + "step": 109350 + }, + { + "epoch": 17.839314845024468, + "grad_norm": 0.0016350997611880302, + "learning_rate": 3.513901331294678e-05, + "loss": 0.0003, + "num_input_tokens_seen": 235978176, + "step": 109355 + }, + { + "epoch": 17.840130505709624, + "grad_norm": 0.005218234844505787, + "learning_rate": 3.5112805185026853e-05, + "loss": 0.0007, + "num_input_tokens_seen": 235989120, + "step": 109360 + }, + { + "epoch": 17.84094616639478, + "grad_norm": 0.00012885425530839711, + "learning_rate": 3.5086606478635706e-05, + "loss": 0.0005, + "num_input_tokens_seen": 236000384, + "step": 109365 + }, + { + "epoch": 17.841761827079935, + "grad_norm": 0.0017135993111878633, + "learning_rate": 3.506041719430425e-05, + "loss": 0.0009, + "num_input_tokens_seen": 236011200, + "step": 109370 + }, + { + "epoch": 17.84257748776509, + "grad_norm": 0.0007697655819356441, + "learning_rate": 3.503423733256328e-05, + "loss": 0.0035, + "num_input_tokens_seen": 236022912, + "step": 109375 + }, + { + "epoch": 17.843393148450243, + "grad_norm": 0.0006897774874232709, + "learning_rate": 3.500806689394337e-05, + "loss": 0.0001, + "num_input_tokens_seen": 236033152, + "step": 109380 + }, + { + "epoch": 17.8442088091354, + "grad_norm": 0.00022810795053374022, + "learning_rate": 3.4981905878974815e-05, + "loss": 0.0001, + "num_input_tokens_seen": 236043936, + "step": 109385 + }, + { + "epoch": 17.845024469820554, + "grad_norm": 0.0008527372847311199, + "learning_rate": 3.495575428818787e-05, + "loss": 0.0007, + "num_input_tokens_seen": 236054720, + "step": 109390 + }, + { + "epoch": 17.84584013050571, + "grad_norm": 0.0006138753960840404, + "learning_rate": 3.492961212211249e-05, + "loss": 0.002, + "num_input_tokens_seen": 236066688, + "step": 109395 + }, + { + "epoch": 17.846655791190866, + "grad_norm": 0.00042438702075742185, + "learning_rate": 3.490347938127847e-05, + "loss": 0.0028, + "num_input_tokens_seen": 236077056, + "step": 109400 + }, + { + "epoch": 17.847471451876018, + "grad_norm": 0.0013728683115914464, + "learning_rate": 3.4877356066215614e-05, + "loss": 0.0004, + "num_input_tokens_seen": 236088864, + "step": 109405 + }, + { + "epoch": 17.848287112561174, + "grad_norm": 0.0018336598295718431, + "learning_rate": 3.4851242177453e-05, + "loss": 0.0013, + "num_input_tokens_seen": 236100032, + "step": 109410 + }, + { + "epoch": 17.84910277324633, + "grad_norm": 0.0015003210864961147, + "learning_rate": 3.482513771552021e-05, + "loss": 0.0013, + "num_input_tokens_seen": 236110944, + "step": 109415 + }, + { + "epoch": 17.849918433931485, + "grad_norm": 0.0004090873699169606, + "learning_rate": 3.4799042680945966e-05, + "loss": 0.001, + "num_input_tokens_seen": 236119904, + "step": 109420 + }, + { + "epoch": 17.85073409461664, + "grad_norm": 0.004009857773780823, + "learning_rate": 3.477295707425937e-05, + "loss": 0.0004, + "num_input_tokens_seen": 236130944, + "step": 109425 + }, + { + "epoch": 17.851549755301793, + "grad_norm": 0.004864791873842478, + "learning_rate": 3.474688089598893e-05, + "loss": 0.0008, + "num_input_tokens_seen": 236141600, + "step": 109430 + }, + { + "epoch": 17.85236541598695, + "grad_norm": 0.004608397372066975, + "learning_rate": 3.4720814146663226e-05, + "loss": 0.0025, + "num_input_tokens_seen": 236152960, + "step": 109435 + }, + { + "epoch": 17.853181076672104, + "grad_norm": 0.11731166392564774, + "learning_rate": 3.469475682681045e-05, + "loss": 0.0065, + "num_input_tokens_seen": 236163648, + "step": 109440 + }, + { + "epoch": 17.85399673735726, + "grad_norm": 0.005209504161030054, + "learning_rate": 3.466870893695867e-05, + "loss": 0.0005, + "num_input_tokens_seen": 236175200, + "step": 109445 + }, + { + "epoch": 17.854812398042416, + "grad_norm": 0.000542353605851531, + "learning_rate": 3.4642670477635866e-05, + "loss": 0.0002, + "num_input_tokens_seen": 236184640, + "step": 109450 + }, + { + "epoch": 17.855628058727568, + "grad_norm": 0.00028636265778914094, + "learning_rate": 3.4616641449369656e-05, + "loss": 0.0093, + "num_input_tokens_seen": 236196160, + "step": 109455 + }, + { + "epoch": 17.856443719412724, + "grad_norm": 0.0009390448685735464, + "learning_rate": 3.459062185268763e-05, + "loss": 0.0002, + "num_input_tokens_seen": 236208128, + "step": 109460 + }, + { + "epoch": 17.85725938009788, + "grad_norm": 0.014394709840416908, + "learning_rate": 3.456461168811703e-05, + "loss": 0.001, + "num_input_tokens_seen": 236218496, + "step": 109465 + }, + { + "epoch": 17.858075040783035, + "grad_norm": 0.023063551634550095, + "learning_rate": 3.4538610956185044e-05, + "loss": 0.0007, + "num_input_tokens_seen": 236229280, + "step": 109470 + }, + { + "epoch": 17.85889070146819, + "grad_norm": 0.03617573529481888, + "learning_rate": 3.451261965741859e-05, + "loss": 0.0006, + "num_input_tokens_seen": 236240384, + "step": 109475 + }, + { + "epoch": 17.859706362153343, + "grad_norm": 0.00037709600292146206, + "learning_rate": 3.44866377923444e-05, + "loss": 0.0022, + "num_input_tokens_seen": 236251744, + "step": 109480 + }, + { + "epoch": 17.8605220228385, + "grad_norm": 0.0001585066202096641, + "learning_rate": 3.446066536148901e-05, + "loss": 0.0001, + "num_input_tokens_seen": 236264096, + "step": 109485 + }, + { + "epoch": 17.861337683523654, + "grad_norm": 0.007819241844117641, + "learning_rate": 3.4434702365378825e-05, + "loss": 0.0002, + "num_input_tokens_seen": 236275072, + "step": 109490 + }, + { + "epoch": 17.86215334420881, + "grad_norm": 0.0017550470074638724, + "learning_rate": 3.4408748804540034e-05, + "loss": 0.0004, + "num_input_tokens_seen": 236284640, + "step": 109495 + }, + { + "epoch": 17.862969004893966, + "grad_norm": 0.0011767300311475992, + "learning_rate": 3.4382804679498616e-05, + "loss": 0.0008, + "num_input_tokens_seen": 236295712, + "step": 109500 + }, + { + "epoch": 17.863784665579118, + "grad_norm": 0.0025206918362528086, + "learning_rate": 3.4356869990780305e-05, + "loss": 0.0015, + "num_input_tokens_seen": 236306656, + "step": 109505 + }, + { + "epoch": 17.864600326264274, + "grad_norm": 0.05342914164066315, + "learning_rate": 3.4330944738910744e-05, + "loss": 0.0013, + "num_input_tokens_seen": 236318432, + "step": 109510 + }, + { + "epoch": 17.86541598694943, + "grad_norm": 0.00012434620293788612, + "learning_rate": 3.430502892441528e-05, + "loss": 0.0005, + "num_input_tokens_seen": 236328896, + "step": 109515 + }, + { + "epoch": 17.866231647634585, + "grad_norm": 0.0002981308498419821, + "learning_rate": 3.427912254781923e-05, + "loss": 0.0038, + "num_input_tokens_seen": 236340512, + "step": 109520 + }, + { + "epoch": 17.86704730831974, + "grad_norm": 0.000512448837980628, + "learning_rate": 3.425322560964761e-05, + "loss": 0.0003, + "num_input_tokens_seen": 236351328, + "step": 109525 + }, + { + "epoch": 17.867862969004893, + "grad_norm": 0.0004139299562666565, + "learning_rate": 3.422733811042506e-05, + "loss": 0.0008, + "num_input_tokens_seen": 236361280, + "step": 109530 + }, + { + "epoch": 17.86867862969005, + "grad_norm": 0.04813412204384804, + "learning_rate": 3.420146005067659e-05, + "loss": 0.0031, + "num_input_tokens_seen": 236371584, + "step": 109535 + }, + { + "epoch": 17.869494290375204, + "grad_norm": 0.00045760214561596513, + "learning_rate": 3.4175591430926244e-05, + "loss": 0.0021, + "num_input_tokens_seen": 236383072, + "step": 109540 + }, + { + "epoch": 17.87030995106036, + "grad_norm": 0.00015284138498827815, + "learning_rate": 3.414973225169854e-05, + "loss": 0.0005, + "num_input_tokens_seen": 236394688, + "step": 109545 + }, + { + "epoch": 17.871125611745512, + "grad_norm": 0.00026452724705450237, + "learning_rate": 3.412388251351756e-05, + "loss": 0.0004, + "num_input_tokens_seen": 236404992, + "step": 109550 + }, + { + "epoch": 17.871941272430668, + "grad_norm": 0.0001895769964903593, + "learning_rate": 3.4098042216907045e-05, + "loss": 0.0003, + "num_input_tokens_seen": 236415584, + "step": 109555 + }, + { + "epoch": 17.872756933115824, + "grad_norm": 0.002294727601110935, + "learning_rate": 3.4072211362390746e-05, + "loss": 0.005, + "num_input_tokens_seen": 236426144, + "step": 109560 + }, + { + "epoch": 17.87357259380098, + "grad_norm": 0.001398144057020545, + "learning_rate": 3.40463899504922e-05, + "loss": 0.0002, + "num_input_tokens_seen": 236437856, + "step": 109565 + }, + { + "epoch": 17.874388254486135, + "grad_norm": 0.00013819339801557362, + "learning_rate": 3.402057798173463e-05, + "loss": 0.0002, + "num_input_tokens_seen": 236450112, + "step": 109570 + }, + { + "epoch": 17.875203915171287, + "grad_norm": 9.536023571854457e-05, + "learning_rate": 3.39947754566412e-05, + "loss": 0.0004, + "num_input_tokens_seen": 236461056, + "step": 109575 + }, + { + "epoch": 17.876019575856443, + "grad_norm": 0.00015681496006436646, + "learning_rate": 3.3968982375734813e-05, + "loss": 0.0002, + "num_input_tokens_seen": 236471104, + "step": 109580 + }, + { + "epoch": 17.8768352365416, + "grad_norm": 0.0014975497033447027, + "learning_rate": 3.394319873953816e-05, + "loss": 0.0011, + "num_input_tokens_seen": 236482816, + "step": 109585 + }, + { + "epoch": 17.877650897226754, + "grad_norm": 9.779324318515137e-05, + "learning_rate": 3.391742454857388e-05, + "loss": 0.0016, + "num_input_tokens_seen": 236494336, + "step": 109590 + }, + { + "epoch": 17.87846655791191, + "grad_norm": 0.00044408676330931485, + "learning_rate": 3.3891659803364225e-05, + "loss": 0.0003, + "num_input_tokens_seen": 236506208, + "step": 109595 + }, + { + "epoch": 17.879282218597062, + "grad_norm": 0.000934326380956918, + "learning_rate": 3.386590450443139e-05, + "loss": 0.0002, + "num_input_tokens_seen": 236517248, + "step": 109600 + }, + { + "epoch": 17.880097879282218, + "grad_norm": 0.001685318537056446, + "learning_rate": 3.3840158652297335e-05, + "loss": 0.0005, + "num_input_tokens_seen": 236527264, + "step": 109605 + }, + { + "epoch": 17.880913539967374, + "grad_norm": 0.0011990563943982124, + "learning_rate": 3.381442224748382e-05, + "loss": 0.0209, + "num_input_tokens_seen": 236537184, + "step": 109610 + }, + { + "epoch": 17.88172920065253, + "grad_norm": 0.00048670225078240037, + "learning_rate": 3.378869529051243e-05, + "loss": 0.0029, + "num_input_tokens_seen": 236547616, + "step": 109615 + }, + { + "epoch": 17.882544861337685, + "grad_norm": 0.0008226591162383556, + "learning_rate": 3.376297778190457e-05, + "loss": 0.0005, + "num_input_tokens_seen": 236559552, + "step": 109620 + }, + { + "epoch": 17.883360522022837, + "grad_norm": 0.03791895508766174, + "learning_rate": 3.373726972218144e-05, + "loss": 0.0011, + "num_input_tokens_seen": 236569280, + "step": 109625 + }, + { + "epoch": 17.884176182707993, + "grad_norm": 0.0005806846893392503, + "learning_rate": 3.3711571111864014e-05, + "loss": 0.0009, + "num_input_tokens_seen": 236579680, + "step": 109630 + }, + { + "epoch": 17.88499184339315, + "grad_norm": 0.00023863333626650274, + "learning_rate": 3.3685881951473096e-05, + "loss": 0.0022, + "num_input_tokens_seen": 236590592, + "step": 109635 + }, + { + "epoch": 17.885807504078304, + "grad_norm": 0.0036455406807363033, + "learning_rate": 3.366020224152949e-05, + "loss": 0.0002, + "num_input_tokens_seen": 236601728, + "step": 109640 + }, + { + "epoch": 17.88662316476346, + "grad_norm": 0.000620608392637223, + "learning_rate": 3.363453198255328e-05, + "loss": 0.0042, + "num_input_tokens_seen": 236612192, + "step": 109645 + }, + { + "epoch": 17.887438825448612, + "grad_norm": 0.0223153717815876, + "learning_rate": 3.360887117506506e-05, + "loss": 0.0004, + "num_input_tokens_seen": 236623552, + "step": 109650 + }, + { + "epoch": 17.888254486133768, + "grad_norm": 0.0005907841259613633, + "learning_rate": 3.358321981958462e-05, + "loss": 0.0008, + "num_input_tokens_seen": 236634432, + "step": 109655 + }, + { + "epoch": 17.889070146818923, + "grad_norm": 0.0022148978896439075, + "learning_rate": 3.3557577916632055e-05, + "loss": 0.0005, + "num_input_tokens_seen": 236645216, + "step": 109660 + }, + { + "epoch": 17.88988580750408, + "grad_norm": 0.04052725434303284, + "learning_rate": 3.353194546672672e-05, + "loss": 0.0026, + "num_input_tokens_seen": 236654848, + "step": 109665 + }, + { + "epoch": 17.890701468189235, + "grad_norm": 0.028868235647678375, + "learning_rate": 3.3506322470388426e-05, + "loss": 0.001, + "num_input_tokens_seen": 236666048, + "step": 109670 + }, + { + "epoch": 17.891517128874387, + "grad_norm": 0.002948655281215906, + "learning_rate": 3.3480708928136204e-05, + "loss": 0.0019, + "num_input_tokens_seen": 236676672, + "step": 109675 + }, + { + "epoch": 17.892332789559543, + "grad_norm": 0.02042117528617382, + "learning_rate": 3.34551048404893e-05, + "loss": 0.0018, + "num_input_tokens_seen": 236687520, + "step": 109680 + }, + { + "epoch": 17.8931484502447, + "grad_norm": 0.005958755500614643, + "learning_rate": 3.342951020796647e-05, + "loss": 0.0037, + "num_input_tokens_seen": 236698240, + "step": 109685 + }, + { + "epoch": 17.893964110929854, + "grad_norm": 0.00022422813344746828, + "learning_rate": 3.3403925031086525e-05, + "loss": 0.0027, + "num_input_tokens_seen": 236708928, + "step": 109690 + }, + { + "epoch": 17.894779771615006, + "grad_norm": 0.0002073352225124836, + "learning_rate": 3.337834931036798e-05, + "loss": 0.0484, + "num_input_tokens_seen": 236718272, + "step": 109695 + }, + { + "epoch": 17.895595432300162, + "grad_norm": 0.0001897782931337133, + "learning_rate": 3.335278304632916e-05, + "loss": 0.0001, + "num_input_tokens_seen": 236728800, + "step": 109700 + }, + { + "epoch": 17.896411092985318, + "grad_norm": 0.0004179094103164971, + "learning_rate": 3.332722623948814e-05, + "loss": 0.0035, + "num_input_tokens_seen": 236741152, + "step": 109705 + }, + { + "epoch": 17.897226753670473, + "grad_norm": 0.00017372150614392012, + "learning_rate": 3.330167889036295e-05, + "loss": 0.0106, + "num_input_tokens_seen": 236752960, + "step": 109710 + }, + { + "epoch": 17.89804241435563, + "grad_norm": 0.0052957418374717236, + "learning_rate": 3.327614099947124e-05, + "loss": 0.0017, + "num_input_tokens_seen": 236764000, + "step": 109715 + }, + { + "epoch": 17.898858075040785, + "grad_norm": 0.0020662148017436266, + "learning_rate": 3.325061256733058e-05, + "loss": 0.1816, + "num_input_tokens_seen": 236776192, + "step": 109720 + }, + { + "epoch": 17.899673735725937, + "grad_norm": 0.0006791003397665918, + "learning_rate": 3.3225093594458465e-05, + "loss": 0.0018, + "num_input_tokens_seen": 236786336, + "step": 109725 + }, + { + "epoch": 17.900489396411093, + "grad_norm": 0.14716164767742157, + "learning_rate": 3.319958408137192e-05, + "loss": 0.0047, + "num_input_tokens_seen": 236797312, + "step": 109730 + }, + { + "epoch": 17.90130505709625, + "grad_norm": 0.016907401382923126, + "learning_rate": 3.317408402858796e-05, + "loss": 0.0023, + "num_input_tokens_seen": 236808160, + "step": 109735 + }, + { + "epoch": 17.902120717781404, + "grad_norm": 0.05061078444123268, + "learning_rate": 3.314859343662335e-05, + "loss": 0.0016, + "num_input_tokens_seen": 236818304, + "step": 109740 + }, + { + "epoch": 17.902936378466556, + "grad_norm": 0.0032555044163018465, + "learning_rate": 3.312311230599491e-05, + "loss": 0.0003, + "num_input_tokens_seen": 236828352, + "step": 109745 + }, + { + "epoch": 17.903752039151712, + "grad_norm": 0.00038270451477728784, + "learning_rate": 3.3097640637218654e-05, + "loss": 0.0005, + "num_input_tokens_seen": 236839872, + "step": 109750 + }, + { + "epoch": 17.904567699836868, + "grad_norm": 0.0010188523447141051, + "learning_rate": 3.307217843081123e-05, + "loss": 0.0002, + "num_input_tokens_seen": 236851520, + "step": 109755 + }, + { + "epoch": 17.905383360522023, + "grad_norm": 0.009678504429757595, + "learning_rate": 3.3046725687288285e-05, + "loss": 0.0005, + "num_input_tokens_seen": 236861600, + "step": 109760 + }, + { + "epoch": 17.90619902120718, + "grad_norm": 0.007704134564846754, + "learning_rate": 3.302128240716595e-05, + "loss": 0.0003, + "num_input_tokens_seen": 236872320, + "step": 109765 + }, + { + "epoch": 17.90701468189233, + "grad_norm": 0.08227112889289856, + "learning_rate": 3.299584859095961e-05, + "loss": 0.0034, + "num_input_tokens_seen": 236883040, + "step": 109770 + }, + { + "epoch": 17.907830342577487, + "grad_norm": 0.013328625820577145, + "learning_rate": 3.297042423918495e-05, + "loss": 0.0006, + "num_input_tokens_seen": 236893824, + "step": 109775 + }, + { + "epoch": 17.908646003262643, + "grad_norm": 0.000333370640873909, + "learning_rate": 3.2945009352357e-05, + "loss": 0.0009, + "num_input_tokens_seen": 236903424, + "step": 109780 + }, + { + "epoch": 17.9094616639478, + "grad_norm": 9.934213449014351e-05, + "learning_rate": 3.291960393099108e-05, + "loss": 0.0013, + "num_input_tokens_seen": 236914048, + "step": 109785 + }, + { + "epoch": 17.910277324632954, + "grad_norm": 1.0264517068862915, + "learning_rate": 3.289420797560172e-05, + "loss": 0.0488, + "num_input_tokens_seen": 236924512, + "step": 109790 + }, + { + "epoch": 17.911092985318106, + "grad_norm": 0.00013575241609942168, + "learning_rate": 3.2868821486704003e-05, + "loss": 0.0008, + "num_input_tokens_seen": 236936224, + "step": 109795 + }, + { + "epoch": 17.911908646003262, + "grad_norm": 0.0005185059271752834, + "learning_rate": 3.284344446481208e-05, + "loss": 0.0003, + "num_input_tokens_seen": 236947616, + "step": 109800 + }, + { + "epoch": 17.912724306688418, + "grad_norm": 0.0001716252154437825, + "learning_rate": 3.2818076910440476e-05, + "loss": 0.0002, + "num_input_tokens_seen": 236958848, + "step": 109805 + }, + { + "epoch": 17.913539967373573, + "grad_norm": 0.0016533852322027087, + "learning_rate": 3.279271882410312e-05, + "loss": 0.0003, + "num_input_tokens_seen": 236969280, + "step": 109810 + }, + { + "epoch": 17.91435562805873, + "grad_norm": 0.0201679989695549, + "learning_rate": 3.27673702063141e-05, + "loss": 0.0028, + "num_input_tokens_seen": 236979968, + "step": 109815 + }, + { + "epoch": 17.91517128874388, + "grad_norm": 0.0004525840631686151, + "learning_rate": 3.274203105758694e-05, + "loss": 0.0034, + "num_input_tokens_seen": 236991648, + "step": 109820 + }, + { + "epoch": 17.915986949429037, + "grad_norm": 0.00011505446309456602, + "learning_rate": 3.2716701378435355e-05, + "loss": 0.0013, + "num_input_tokens_seen": 237001824, + "step": 109825 + }, + { + "epoch": 17.916802610114193, + "grad_norm": 0.04304012283682823, + "learning_rate": 3.269138116937259e-05, + "loss": 0.0023, + "num_input_tokens_seen": 237013216, + "step": 109830 + }, + { + "epoch": 17.91761827079935, + "grad_norm": 0.1449601799249649, + "learning_rate": 3.2666070430911796e-05, + "loss": 0.0021, + "num_input_tokens_seen": 237023104, + "step": 109835 + }, + { + "epoch": 17.918433931484504, + "grad_norm": 0.013744697906076908, + "learning_rate": 3.264076916356601e-05, + "loss": 0.0003, + "num_input_tokens_seen": 237034208, + "step": 109840 + }, + { + "epoch": 17.919249592169656, + "grad_norm": 0.01814051903784275, + "learning_rate": 3.2615477367847866e-05, + "loss": 0.0004, + "num_input_tokens_seen": 237044224, + "step": 109845 + }, + { + "epoch": 17.920065252854812, + "grad_norm": 0.00024261377984657884, + "learning_rate": 3.2590195044269965e-05, + "loss": 0.0003, + "num_input_tokens_seen": 237054272, + "step": 109850 + }, + { + "epoch": 17.920880913539968, + "grad_norm": 0.00010054680024040863, + "learning_rate": 3.256492219334478e-05, + "loss": 0.0006, + "num_input_tokens_seen": 237064992, + "step": 109855 + }, + { + "epoch": 17.921696574225123, + "grad_norm": 0.00031958584440872073, + "learning_rate": 3.2539658815584404e-05, + "loss": 0.0006, + "num_input_tokens_seen": 237076864, + "step": 109860 + }, + { + "epoch": 17.92251223491028, + "grad_norm": 0.0003665457188617438, + "learning_rate": 3.2514404911500814e-05, + "loss": 0.0108, + "num_input_tokens_seen": 237087360, + "step": 109865 + }, + { + "epoch": 17.92332789559543, + "grad_norm": 0.001172934309579432, + "learning_rate": 3.248916048160588e-05, + "loss": 0.0003, + "num_input_tokens_seen": 237098016, + "step": 109870 + }, + { + "epoch": 17.924143556280587, + "grad_norm": 0.052066121250391006, + "learning_rate": 3.246392552641125e-05, + "loss": 0.0065, + "num_input_tokens_seen": 237107776, + "step": 109875 + }, + { + "epoch": 17.924959216965743, + "grad_norm": 0.0002733437577262521, + "learning_rate": 3.2438700046428185e-05, + "loss": 0.0006, + "num_input_tokens_seen": 237116736, + "step": 109880 + }, + { + "epoch": 17.9257748776509, + "grad_norm": 0.15915557742118835, + "learning_rate": 3.2413484042167984e-05, + "loss": 0.0047, + "num_input_tokens_seen": 237127296, + "step": 109885 + }, + { + "epoch": 17.92659053833605, + "grad_norm": 0.00013232874334789813, + "learning_rate": 3.2388277514141864e-05, + "loss": 0.0004, + "num_input_tokens_seen": 237137472, + "step": 109890 + }, + { + "epoch": 17.927406199021206, + "grad_norm": 0.00862794741988182, + "learning_rate": 3.236308046286035e-05, + "loss": 0.0004, + "num_input_tokens_seen": 237148288, + "step": 109895 + }, + { + "epoch": 17.928221859706362, + "grad_norm": 0.0074185533449053764, + "learning_rate": 3.2337892888834375e-05, + "loss": 0.0005, + "num_input_tokens_seen": 237158272, + "step": 109900 + }, + { + "epoch": 17.929037520391518, + "grad_norm": 0.013781985267996788, + "learning_rate": 3.231271479257414e-05, + "loss": 0.0007, + "num_input_tokens_seen": 237168192, + "step": 109905 + }, + { + "epoch": 17.929853181076673, + "grad_norm": 0.0004937905468977988, + "learning_rate": 3.228754617459023e-05, + "loss": 0.0006, + "num_input_tokens_seen": 237178944, + "step": 109910 + }, + { + "epoch": 17.930668841761825, + "grad_norm": 0.0010518053313717246, + "learning_rate": 3.2262387035392305e-05, + "loss": 0.0009, + "num_input_tokens_seen": 237190432, + "step": 109915 + }, + { + "epoch": 17.93148450244698, + "grad_norm": 0.0017680478049442172, + "learning_rate": 3.2237237375490666e-05, + "loss": 0.0038, + "num_input_tokens_seen": 237202144, + "step": 109920 + }, + { + "epoch": 17.932300163132137, + "grad_norm": 0.0899263322353363, + "learning_rate": 3.221209719539469e-05, + "loss": 0.0021, + "num_input_tokens_seen": 237213120, + "step": 109925 + }, + { + "epoch": 17.933115823817293, + "grad_norm": 0.03345358744263649, + "learning_rate": 3.218696649561409e-05, + "loss": 0.0021, + "num_input_tokens_seen": 237223872, + "step": 109930 + }, + { + "epoch": 17.93393148450245, + "grad_norm": 0.0014258904848247766, + "learning_rate": 3.2161845276658e-05, + "loss": 0.002, + "num_input_tokens_seen": 237235360, + "step": 109935 + }, + { + "epoch": 17.9347471451876, + "grad_norm": 0.0002832882455550134, + "learning_rate": 3.213673353903568e-05, + "loss": 0.0004, + "num_input_tokens_seen": 237246688, + "step": 109940 + }, + { + "epoch": 17.935562805872756, + "grad_norm": 0.0025299324188381433, + "learning_rate": 3.211163128325589e-05, + "loss": 0.0019, + "num_input_tokens_seen": 237255936, + "step": 109945 + }, + { + "epoch": 17.936378466557912, + "grad_norm": 0.07608918845653534, + "learning_rate": 3.208653850982746e-05, + "loss": 0.0021, + "num_input_tokens_seen": 237265920, + "step": 109950 + }, + { + "epoch": 17.937194127243067, + "grad_norm": 0.07930117845535278, + "learning_rate": 3.206145521925896e-05, + "loss": 0.0016, + "num_input_tokens_seen": 237276096, + "step": 109955 + }, + { + "epoch": 17.938009787928223, + "grad_norm": 0.0012122656917199492, + "learning_rate": 3.2036381412058725e-05, + "loss": 0.0003, + "num_input_tokens_seen": 237285824, + "step": 109960 + }, + { + "epoch": 17.938825448613375, + "grad_norm": 0.00015188926772680134, + "learning_rate": 3.2011317088734836e-05, + "loss": 0.0003, + "num_input_tokens_seen": 237298176, + "step": 109965 + }, + { + "epoch": 17.93964110929853, + "grad_norm": 0.0011666889768093824, + "learning_rate": 3.1986262249795286e-05, + "loss": 0.0002, + "num_input_tokens_seen": 237308224, + "step": 109970 + }, + { + "epoch": 17.940456769983687, + "grad_norm": 0.0001669765915721655, + "learning_rate": 3.196121689574782e-05, + "loss": 0.0002, + "num_input_tokens_seen": 237320160, + "step": 109975 + }, + { + "epoch": 17.941272430668842, + "grad_norm": 0.02599497139453888, + "learning_rate": 3.193618102710011e-05, + "loss": 0.0017, + "num_input_tokens_seen": 237330816, + "step": 109980 + }, + { + "epoch": 17.942088091353998, + "grad_norm": 0.0661001205444336, + "learning_rate": 3.191115464435945e-05, + "loss": 0.0014, + "num_input_tokens_seen": 237343328, + "step": 109985 + }, + { + "epoch": 17.94290375203915, + "grad_norm": 0.00020838069031015038, + "learning_rate": 3.188613774803306e-05, + "loss": 0.0003, + "num_input_tokens_seen": 237355392, + "step": 109990 + }, + { + "epoch": 17.943719412724306, + "grad_norm": 0.04982484504580498, + "learning_rate": 3.186113033862792e-05, + "loss": 0.0018, + "num_input_tokens_seen": 237366528, + "step": 109995 + }, + { + "epoch": 17.94453507340946, + "grad_norm": 0.01650574989616871, + "learning_rate": 3.1836132416650844e-05, + "loss": 0.0012, + "num_input_tokens_seen": 237377184, + "step": 110000 + }, + { + "epoch": 17.945350734094617, + "grad_norm": 0.0003568715474102646, + "learning_rate": 3.1811143982608426e-05, + "loss": 0.0003, + "num_input_tokens_seen": 237387424, + "step": 110005 + }, + { + "epoch": 17.946166394779773, + "grad_norm": 0.002765743061900139, + "learning_rate": 3.1786165037007156e-05, + "loss": 0.0002, + "num_input_tokens_seen": 237397856, + "step": 110010 + }, + { + "epoch": 17.946982055464925, + "grad_norm": 0.0012080416781827807, + "learning_rate": 3.176119558035323e-05, + "loss": 0.0026, + "num_input_tokens_seen": 237409184, + "step": 110015 + }, + { + "epoch": 17.94779771615008, + "grad_norm": 0.00021396845113486052, + "learning_rate": 3.173623561315259e-05, + "loss": 0.0005, + "num_input_tokens_seen": 237419488, + "step": 110020 + }, + { + "epoch": 17.948613376835237, + "grad_norm": 0.0001211381604662165, + "learning_rate": 3.171128513591132e-05, + "loss": 0.0002, + "num_input_tokens_seen": 237429536, + "step": 110025 + }, + { + "epoch": 17.949429037520392, + "grad_norm": 0.00026926648570224643, + "learning_rate": 3.1686344149134735e-05, + "loss": 0.001, + "num_input_tokens_seen": 237440032, + "step": 110030 + }, + { + "epoch": 17.950244698205548, + "grad_norm": 0.03701049089431763, + "learning_rate": 3.1661412653328724e-05, + "loss": 0.001, + "num_input_tokens_seen": 237452256, + "step": 110035 + }, + { + "epoch": 17.9510603588907, + "grad_norm": 0.001862829434685409, + "learning_rate": 3.1636490648998095e-05, + "loss": 0.0005, + "num_input_tokens_seen": 237462816, + "step": 110040 + }, + { + "epoch": 17.951876019575856, + "grad_norm": 0.00027980009326711297, + "learning_rate": 3.1611578136648336e-05, + "loss": 0.0001, + "num_input_tokens_seen": 237474304, + "step": 110045 + }, + { + "epoch": 17.95269168026101, + "grad_norm": 9.462568414164707e-05, + "learning_rate": 3.158667511678393e-05, + "loss": 0.0019, + "num_input_tokens_seen": 237483840, + "step": 110050 + }, + { + "epoch": 17.953507340946167, + "grad_norm": 0.0009649645071476698, + "learning_rate": 3.156178158990991e-05, + "loss": 0.0005, + "num_input_tokens_seen": 237495296, + "step": 110055 + }, + { + "epoch": 17.954323001631323, + "grad_norm": 0.00014840575749985874, + "learning_rate": 3.153689755653061e-05, + "loss": 0.0029, + "num_input_tokens_seen": 237506592, + "step": 110060 + }, + { + "epoch": 17.955138662316475, + "grad_norm": 0.00020257123105693609, + "learning_rate": 3.151202301715034e-05, + "loss": 0.0058, + "num_input_tokens_seen": 237518272, + "step": 110065 + }, + { + "epoch": 17.95595432300163, + "grad_norm": 0.00034909433452412486, + "learning_rate": 3.148715797227331e-05, + "loss": 0.0009, + "num_input_tokens_seen": 237529632, + "step": 110070 + }, + { + "epoch": 17.956769983686787, + "grad_norm": 0.00012179746408946812, + "learning_rate": 3.1462302422403334e-05, + "loss": 0.0003, + "num_input_tokens_seen": 237540416, + "step": 110075 + }, + { + "epoch": 17.957585644371942, + "grad_norm": 0.0011263869237154722, + "learning_rate": 3.143745636804418e-05, + "loss": 0.0063, + "num_input_tokens_seen": 237551680, + "step": 110080 + }, + { + "epoch": 17.958401305057095, + "grad_norm": 0.00034597155172377825, + "learning_rate": 3.14126198096994e-05, + "loss": 0.0014, + "num_input_tokens_seen": 237562048, + "step": 110085 + }, + { + "epoch": 17.95921696574225, + "grad_norm": 0.00484886858612299, + "learning_rate": 3.138779274787235e-05, + "loss": 0.0124, + "num_input_tokens_seen": 237571904, + "step": 110090 + }, + { + "epoch": 17.960032626427406, + "grad_norm": 0.621157705783844, + "learning_rate": 3.136297518306614e-05, + "loss": 0.0138, + "num_input_tokens_seen": 237581952, + "step": 110095 + }, + { + "epoch": 17.96084828711256, + "grad_norm": 0.014954537153244019, + "learning_rate": 3.133816711578369e-05, + "loss": 0.0006, + "num_input_tokens_seen": 237593696, + "step": 110100 + }, + { + "epoch": 17.961663947797717, + "grad_norm": 0.0019056128803640604, + "learning_rate": 3.131336854652789e-05, + "loss": 0.0003, + "num_input_tokens_seen": 237605504, + "step": 110105 + }, + { + "epoch": 17.96247960848287, + "grad_norm": 0.0022746589966118336, + "learning_rate": 3.1288579475801215e-05, + "loss": 0.0002, + "num_input_tokens_seen": 237616544, + "step": 110110 + }, + { + "epoch": 17.963295269168025, + "grad_norm": 0.0021992353722453117, + "learning_rate": 3.12637999041061e-05, + "loss": 0.0011, + "num_input_tokens_seen": 237627328, + "step": 110115 + }, + { + "epoch": 17.96411092985318, + "grad_norm": 0.015646236017346382, + "learning_rate": 3.123902983194471e-05, + "loss": 0.0017, + "num_input_tokens_seen": 237637696, + "step": 110120 + }, + { + "epoch": 17.964926590538337, + "grad_norm": 0.012814161367714405, + "learning_rate": 3.1214269259819014e-05, + "loss": 0.0008, + "num_input_tokens_seen": 237648736, + "step": 110125 + }, + { + "epoch": 17.965742251223492, + "grad_norm": 0.0010603488190099597, + "learning_rate": 3.11895181882309e-05, + "loss": 0.0152, + "num_input_tokens_seen": 237659776, + "step": 110130 + }, + { + "epoch": 17.966557911908644, + "grad_norm": 0.00015223717491608113, + "learning_rate": 3.116477661768191e-05, + "loss": 0.0023, + "num_input_tokens_seen": 237671328, + "step": 110135 + }, + { + "epoch": 17.9673735725938, + "grad_norm": 0.03964661434292793, + "learning_rate": 3.1140044548673476e-05, + "loss": 0.0018, + "num_input_tokens_seen": 237681568, + "step": 110140 + }, + { + "epoch": 17.968189233278956, + "grad_norm": 0.0032020951621234417, + "learning_rate": 3.11153219817068e-05, + "loss": 0.0051, + "num_input_tokens_seen": 237691360, + "step": 110145 + }, + { + "epoch": 17.96900489396411, + "grad_norm": 0.43427884578704834, + "learning_rate": 3.109060891728299e-05, + "loss": 0.0121, + "num_input_tokens_seen": 237701952, + "step": 110150 + }, + { + "epoch": 17.969820554649267, + "grad_norm": 0.0062524909153580666, + "learning_rate": 3.1065905355902865e-05, + "loss": 0.0007, + "num_input_tokens_seen": 237712480, + "step": 110155 + }, + { + "epoch": 17.97063621533442, + "grad_norm": 0.00010246759484289214, + "learning_rate": 3.104121129806697e-05, + "loss": 0.0001, + "num_input_tokens_seen": 237723680, + "step": 110160 + }, + { + "epoch": 17.971451876019575, + "grad_norm": 0.005706596188247204, + "learning_rate": 3.101652674427585e-05, + "loss": 0.0003, + "num_input_tokens_seen": 237732832, + "step": 110165 + }, + { + "epoch": 17.97226753670473, + "grad_norm": 0.32325279712677, + "learning_rate": 3.0991851695029825e-05, + "loss": 0.0066, + "num_input_tokens_seen": 237744000, + "step": 110170 + }, + { + "epoch": 17.973083197389887, + "grad_norm": 0.003499767044559121, + "learning_rate": 3.0967186150828886e-05, + "loss": 0.0004, + "num_input_tokens_seen": 237754784, + "step": 110175 + }, + { + "epoch": 17.973898858075042, + "grad_norm": 0.000473067193524912, + "learning_rate": 3.0942530112172905e-05, + "loss": 0.0004, + "num_input_tokens_seen": 237764896, + "step": 110180 + }, + { + "epoch": 17.974714518760194, + "grad_norm": 0.0008625888149254024, + "learning_rate": 3.0917883579561604e-05, + "loss": 0.0011, + "num_input_tokens_seen": 237774560, + "step": 110185 + }, + { + "epoch": 17.97553017944535, + "grad_norm": 0.017908316105604172, + "learning_rate": 3.0893246553494516e-05, + "loss": 0.0005, + "num_input_tokens_seen": 237784256, + "step": 110190 + }, + { + "epoch": 17.976345840130506, + "grad_norm": 0.0024404495488852262, + "learning_rate": 3.08686190344708e-05, + "loss": 0.0006, + "num_input_tokens_seen": 237796096, + "step": 110195 + }, + { + "epoch": 17.97716150081566, + "grad_norm": 0.0003625154495239258, + "learning_rate": 3.084400102298973e-05, + "loss": 0.0003, + "num_input_tokens_seen": 237807776, + "step": 110200 + }, + { + "epoch": 17.977977161500817, + "grad_norm": 0.0005655138520523906, + "learning_rate": 3.0819392519550125e-05, + "loss": 0.0004, + "num_input_tokens_seen": 237817568, + "step": 110205 + }, + { + "epoch": 17.97879282218597, + "grad_norm": 0.002760243369266391, + "learning_rate": 3.079479352465076e-05, + "loss": 0.0006, + "num_input_tokens_seen": 237827744, + "step": 110210 + }, + { + "epoch": 17.979608482871125, + "grad_norm": 0.0010122376261278987, + "learning_rate": 3.077020403879005e-05, + "loss": 0.0003, + "num_input_tokens_seen": 237839136, + "step": 110215 + }, + { + "epoch": 17.98042414355628, + "grad_norm": 0.012110142037272453, + "learning_rate": 3.07456240624665e-05, + "loss": 0.0003, + "num_input_tokens_seen": 237850048, + "step": 110220 + }, + { + "epoch": 17.981239804241437, + "grad_norm": 0.005608526524156332, + "learning_rate": 3.072105359617811e-05, + "loss": 0.0026, + "num_input_tokens_seen": 237860640, + "step": 110225 + }, + { + "epoch": 17.982055464926592, + "grad_norm": 0.00066922209225595, + "learning_rate": 3.0696492640422954e-05, + "loss": 0.0004, + "num_input_tokens_seen": 237872096, + "step": 110230 + }, + { + "epoch": 17.982871125611744, + "grad_norm": 0.0011449019657447934, + "learning_rate": 3.067194119569866e-05, + "loss": 0.0004, + "num_input_tokens_seen": 237883456, + "step": 110235 + }, + { + "epoch": 17.9836867862969, + "grad_norm": 0.00391927408054471, + "learning_rate": 3.064739926250293e-05, + "loss": 0.0012, + "num_input_tokens_seen": 237893600, + "step": 110240 + }, + { + "epoch": 17.984502446982056, + "grad_norm": 0.0008255235152319074, + "learning_rate": 3.062286684133303e-05, + "loss": 0.0003, + "num_input_tokens_seen": 237903936, + "step": 110245 + }, + { + "epoch": 17.98531810766721, + "grad_norm": 0.00012633785081561655, + "learning_rate": 3.059834393268618e-05, + "loss": 0.0005, + "num_input_tokens_seen": 237914304, + "step": 110250 + }, + { + "epoch": 17.986133768352367, + "grad_norm": 0.0189680103212595, + "learning_rate": 3.057383053705937e-05, + "loss": 0.001, + "num_input_tokens_seen": 237923712, + "step": 110255 + }, + { + "epoch": 17.98694942903752, + "grad_norm": 0.0003443536115810275, + "learning_rate": 3.054932665494936e-05, + "loss": 0.0058, + "num_input_tokens_seen": 237933856, + "step": 110260 + }, + { + "epoch": 17.987765089722675, + "grad_norm": 0.00010128107533091679, + "learning_rate": 3.052483228685282e-05, + "loss": 0.0011, + "num_input_tokens_seen": 237944160, + "step": 110265 + }, + { + "epoch": 17.98858075040783, + "grad_norm": 0.00018471080693416297, + "learning_rate": 3.050034743326613e-05, + "loss": 0.0002, + "num_input_tokens_seen": 237955936, + "step": 110270 + }, + { + "epoch": 17.989396411092986, + "grad_norm": 0.1534983217716217, + "learning_rate": 3.0475872094685443e-05, + "loss": 0.0191, + "num_input_tokens_seen": 237967008, + "step": 110275 + }, + { + "epoch": 17.99021207177814, + "grad_norm": 0.17751476168632507, + "learning_rate": 3.0451406271606974e-05, + "loss": 0.0025, + "num_input_tokens_seen": 237975840, + "step": 110280 + }, + { + "epoch": 17.991027732463294, + "grad_norm": 0.0005809574504382908, + "learning_rate": 3.0426949964526272e-05, + "loss": 0.0012, + "num_input_tokens_seen": 237986784, + "step": 110285 + }, + { + "epoch": 17.99184339314845, + "grad_norm": 0.0001680754212429747, + "learning_rate": 3.0402503173939277e-05, + "loss": 0.0004, + "num_input_tokens_seen": 237998528, + "step": 110290 + }, + { + "epoch": 17.992659053833606, + "grad_norm": 0.013349932618439198, + "learning_rate": 3.0378065900341146e-05, + "loss": 0.0212, + "num_input_tokens_seen": 238008576, + "step": 110295 + }, + { + "epoch": 17.99347471451876, + "grad_norm": 0.007926372811198235, + "learning_rate": 3.035363814422737e-05, + "loss": 0.0017, + "num_input_tokens_seen": 238019200, + "step": 110300 + }, + { + "epoch": 17.994290375203914, + "grad_norm": 0.00016683909052517265, + "learning_rate": 3.0329219906092776e-05, + "loss": 0.0005, + "num_input_tokens_seen": 238030336, + "step": 110305 + }, + { + "epoch": 17.99510603588907, + "grad_norm": 0.001373286941088736, + "learning_rate": 3.030481118643247e-05, + "loss": 0.0047, + "num_input_tokens_seen": 238040000, + "step": 110310 + }, + { + "epoch": 17.995921696574225, + "grad_norm": 0.00034008824150078, + "learning_rate": 3.0280411985740995e-05, + "loss": 0.0003, + "num_input_tokens_seen": 238050880, + "step": 110315 + }, + { + "epoch": 17.99673735725938, + "grad_norm": 8.24348462629132e-05, + "learning_rate": 3.0256022304512854e-05, + "loss": 0.0002, + "num_input_tokens_seen": 238060768, + "step": 110320 + }, + { + "epoch": 17.997553017944536, + "grad_norm": 0.00018118004663847387, + "learning_rate": 3.023164214324231e-05, + "loss": 0.0007, + "num_input_tokens_seen": 238071296, + "step": 110325 + }, + { + "epoch": 17.99836867862969, + "grad_norm": 0.0001870966370915994, + "learning_rate": 3.0207271502423527e-05, + "loss": 0.003, + "num_input_tokens_seen": 238082464, + "step": 110330 + }, + { + "epoch": 17.999184339314844, + "grad_norm": 0.003663206472992897, + "learning_rate": 3.018291038255033e-05, + "loss": 0.0006, + "num_input_tokens_seen": 238092288, + "step": 110335 + }, + { + "epoch": 18.0, + "grad_norm": 0.000310079543851316, + "learning_rate": 3.0158558784116442e-05, + "loss": 0.0003, + "num_input_tokens_seen": 238102672, + "step": 110340 + }, + { + "epoch": 18.0, + "eval_loss": 0.3278021812438965, + "eval_runtime": 104.5697, + "eval_samples_per_second": 26.059, + "eval_steps_per_second": 6.522, + "num_input_tokens_seen": 238102672, + "step": 110340 + }, + { + "epoch": 18.000815660685156, + "grad_norm": 0.0006559292669408023, + "learning_rate": 3.0134216707615404e-05, + "loss": 0.006, + "num_input_tokens_seen": 238112176, + "step": 110345 + }, + { + "epoch": 18.00163132137031, + "grad_norm": 0.005173215642571449, + "learning_rate": 3.0109884153540545e-05, + "loss": 0.0014, + "num_input_tokens_seen": 238123920, + "step": 110350 + }, + { + "epoch": 18.002446982055464, + "grad_norm": 0.00027394629432819784, + "learning_rate": 3.0085561122384974e-05, + "loss": 0.0007, + "num_input_tokens_seen": 238134992, + "step": 110355 + }, + { + "epoch": 18.00326264274062, + "grad_norm": 0.00021991279209032655, + "learning_rate": 3.0061247614641684e-05, + "loss": 0.0008, + "num_input_tokens_seen": 238145968, + "step": 110360 + }, + { + "epoch": 18.004078303425775, + "grad_norm": 0.009549359790980816, + "learning_rate": 3.0036943630803282e-05, + "loss": 0.0006, + "num_input_tokens_seen": 238156848, + "step": 110365 + }, + { + "epoch": 18.00489396411093, + "grad_norm": 0.00031416211277246475, + "learning_rate": 3.0012649171362482e-05, + "loss": 0.0004, + "num_input_tokens_seen": 238168272, + "step": 110370 + }, + { + "epoch": 18.005709624796086, + "grad_norm": 0.0018150412943214178, + "learning_rate": 2.998836423681156e-05, + "loss": 0.0041, + "num_input_tokens_seen": 238178736, + "step": 110375 + }, + { + "epoch": 18.00652528548124, + "grad_norm": 0.00015611901471856982, + "learning_rate": 2.9964088827642564e-05, + "loss": 0.0003, + "num_input_tokens_seen": 238190256, + "step": 110380 + }, + { + "epoch": 18.007340946166394, + "grad_norm": 0.00040266558062285185, + "learning_rate": 2.993982294434777e-05, + "loss": 0.0003, + "num_input_tokens_seen": 238200432, + "step": 110385 + }, + { + "epoch": 18.00815660685155, + "grad_norm": 0.00329483300447464, + "learning_rate": 2.991556658741862e-05, + "loss": 0.0014, + "num_input_tokens_seen": 238210576, + "step": 110390 + }, + { + "epoch": 18.008972267536706, + "grad_norm": 0.009533674456179142, + "learning_rate": 2.9891319757347047e-05, + "loss": 0.0012, + "num_input_tokens_seen": 238221200, + "step": 110395 + }, + { + "epoch": 18.00978792822186, + "grad_norm": 0.0928882509469986, + "learning_rate": 2.986708245462405e-05, + "loss": 0.0015, + "num_input_tokens_seen": 238232208, + "step": 110400 + }, + { + "epoch": 18.010603588907014, + "grad_norm": 0.03144010901451111, + "learning_rate": 2.984285467974124e-05, + "loss": 0.0195, + "num_input_tokens_seen": 238243632, + "step": 110405 + }, + { + "epoch": 18.01141924959217, + "grad_norm": 0.1581980139017105, + "learning_rate": 2.981863643318922e-05, + "loss": 0.0045, + "num_input_tokens_seen": 238254192, + "step": 110410 + }, + { + "epoch": 18.012234910277325, + "grad_norm": 0.03431587293744087, + "learning_rate": 2.979442771545915e-05, + "loss": 0.0007, + "num_input_tokens_seen": 238265392, + "step": 110415 + }, + { + "epoch": 18.01305057096248, + "grad_norm": 0.0005015961360186338, + "learning_rate": 2.9770228527041364e-05, + "loss": 0.0015, + "num_input_tokens_seen": 238275376, + "step": 110420 + }, + { + "epoch": 18.013866231647636, + "grad_norm": 0.02533356472849846, + "learning_rate": 2.9746038868426584e-05, + "loss": 0.0006, + "num_input_tokens_seen": 238286640, + "step": 110425 + }, + { + "epoch": 18.01468189233279, + "grad_norm": 0.015627441927790642, + "learning_rate": 2.9721858740104747e-05, + "loss": 0.0116, + "num_input_tokens_seen": 238297360, + "step": 110430 + }, + { + "epoch": 18.015497553017944, + "grad_norm": 8.755864837439731e-05, + "learning_rate": 2.9697688142566127e-05, + "loss": 0.0125, + "num_input_tokens_seen": 238308240, + "step": 110435 + }, + { + "epoch": 18.0163132137031, + "grad_norm": 0.03598572686314583, + "learning_rate": 2.967352707630039e-05, + "loss": 0.0008, + "num_input_tokens_seen": 238318128, + "step": 110440 + }, + { + "epoch": 18.017128874388256, + "grad_norm": 0.00012057910498697311, + "learning_rate": 2.9649375541797418e-05, + "loss": 0.0006, + "num_input_tokens_seen": 238330512, + "step": 110445 + }, + { + "epoch": 18.017944535073408, + "grad_norm": 0.0013280296698212624, + "learning_rate": 2.9625233539546326e-05, + "loss": 0.0002, + "num_input_tokens_seen": 238339792, + "step": 110450 + }, + { + "epoch": 18.018760195758563, + "grad_norm": 0.06542807817459106, + "learning_rate": 2.960110107003672e-05, + "loss": 0.0024, + "num_input_tokens_seen": 238349904, + "step": 110455 + }, + { + "epoch": 18.01957585644372, + "grad_norm": 0.00022691742924507707, + "learning_rate": 2.9576978133757536e-05, + "loss": 0.0002, + "num_input_tokens_seen": 238359600, + "step": 110460 + }, + { + "epoch": 18.020391517128875, + "grad_norm": 0.011758371256291866, + "learning_rate": 2.955286473119767e-05, + "loss": 0.0045, + "num_input_tokens_seen": 238370768, + "step": 110465 + }, + { + "epoch": 18.02120717781403, + "grad_norm": 0.00015552518016193062, + "learning_rate": 2.9528760862845783e-05, + "loss": 0.001, + "num_input_tokens_seen": 238380208, + "step": 110470 + }, + { + "epoch": 18.022022838499183, + "grad_norm": 0.00984133966267109, + "learning_rate": 2.9504666529190426e-05, + "loss": 0.0015, + "num_input_tokens_seen": 238391280, + "step": 110475 + }, + { + "epoch": 18.02283849918434, + "grad_norm": 0.0018640848575159907, + "learning_rate": 2.9480581730719825e-05, + "loss": 0.0003, + "num_input_tokens_seen": 238402192, + "step": 110480 + }, + { + "epoch": 18.023654159869494, + "grad_norm": 0.0007044014637358487, + "learning_rate": 2.945650646792214e-05, + "loss": 0.0001, + "num_input_tokens_seen": 238413200, + "step": 110485 + }, + { + "epoch": 18.02446982055465, + "grad_norm": 0.004239337518811226, + "learning_rate": 2.9432440741285314e-05, + "loss": 0.0002, + "num_input_tokens_seen": 238424816, + "step": 110490 + }, + { + "epoch": 18.025285481239806, + "grad_norm": 0.03135032206773758, + "learning_rate": 2.940838455129696e-05, + "loss": 0.0007, + "num_input_tokens_seen": 238433808, + "step": 110495 + }, + { + "epoch": 18.026101141924958, + "grad_norm": 0.0012819814728572965, + "learning_rate": 2.9384337898444747e-05, + "loss": 0.0006, + "num_input_tokens_seen": 238445392, + "step": 110500 + }, + { + "epoch": 18.026916802610113, + "grad_norm": 0.00012603666982613504, + "learning_rate": 2.9360300783215832e-05, + "loss": 0.0005, + "num_input_tokens_seen": 238456688, + "step": 110505 + }, + { + "epoch": 18.02773246329527, + "grad_norm": 0.0008271850529126823, + "learning_rate": 2.9336273206097663e-05, + "loss": 0.0008, + "num_input_tokens_seen": 238468272, + "step": 110510 + }, + { + "epoch": 18.028548123980425, + "grad_norm": 0.0038844754453748465, + "learning_rate": 2.931225516757685e-05, + "loss": 0.0009, + "num_input_tokens_seen": 238479280, + "step": 110515 + }, + { + "epoch": 18.02936378466558, + "grad_norm": 0.03933820128440857, + "learning_rate": 2.9288246668140396e-05, + "loss": 0.0024, + "num_input_tokens_seen": 238490640, + "step": 110520 + }, + { + "epoch": 18.030179445350733, + "grad_norm": 0.027884211391210556, + "learning_rate": 2.9264247708274628e-05, + "loss": 0.0011, + "num_input_tokens_seen": 238501680, + "step": 110525 + }, + { + "epoch": 18.03099510603589, + "grad_norm": 0.0002979639102704823, + "learning_rate": 2.9240258288466215e-05, + "loss": 0.0091, + "num_input_tokens_seen": 238513264, + "step": 110530 + }, + { + "epoch": 18.031810766721044, + "grad_norm": 0.00021208927500993013, + "learning_rate": 2.921627840920099e-05, + "loss": 0.0001, + "num_input_tokens_seen": 238523312, + "step": 110535 + }, + { + "epoch": 18.0326264274062, + "grad_norm": 0.07630857080221176, + "learning_rate": 2.919230807096529e-05, + "loss": 0.0015, + "num_input_tokens_seen": 238534992, + "step": 110540 + }, + { + "epoch": 18.033442088091356, + "grad_norm": 0.002384261228144169, + "learning_rate": 2.916834727424461e-05, + "loss": 0.0003, + "num_input_tokens_seen": 238546256, + "step": 110545 + }, + { + "epoch": 18.034257748776508, + "grad_norm": 0.0061640688218176365, + "learning_rate": 2.9144396019524788e-05, + "loss": 0.002, + "num_input_tokens_seen": 238556656, + "step": 110550 + }, + { + "epoch": 18.035073409461663, + "grad_norm": 0.01841193623840809, + "learning_rate": 2.9120454307290933e-05, + "loss": 0.0004, + "num_input_tokens_seen": 238567216, + "step": 110555 + }, + { + "epoch": 18.03588907014682, + "grad_norm": 0.015881473198533058, + "learning_rate": 2.90965221380286e-05, + "loss": 0.0015, + "num_input_tokens_seen": 238576816, + "step": 110560 + }, + { + "epoch": 18.036704730831975, + "grad_norm": 9.019353456096724e-05, + "learning_rate": 2.9072599512222464e-05, + "loss": 0.0021, + "num_input_tokens_seen": 238587440, + "step": 110565 + }, + { + "epoch": 18.03752039151713, + "grad_norm": 0.0011284251231700182, + "learning_rate": 2.9048686430357685e-05, + "loss": 0.0004, + "num_input_tokens_seen": 238598480, + "step": 110570 + }, + { + "epoch": 18.038336052202283, + "grad_norm": 0.10209257900714874, + "learning_rate": 2.9024782892918543e-05, + "loss": 0.0027, + "num_input_tokens_seen": 238609488, + "step": 110575 + }, + { + "epoch": 18.03915171288744, + "grad_norm": 0.0004868714022450149, + "learning_rate": 2.9000888900389764e-05, + "loss": 0.0002, + "num_input_tokens_seen": 238620496, + "step": 110580 + }, + { + "epoch": 18.039967373572594, + "grad_norm": 0.0004860328044742346, + "learning_rate": 2.8977004453255406e-05, + "loss": 0.0013, + "num_input_tokens_seen": 238630608, + "step": 110585 + }, + { + "epoch": 18.04078303425775, + "grad_norm": 0.012272709049284458, + "learning_rate": 2.8953129551999634e-05, + "loss": 0.0003, + "num_input_tokens_seen": 238640784, + "step": 110590 + }, + { + "epoch": 18.041598694942905, + "grad_norm": 0.00010388634109403938, + "learning_rate": 2.892926419710623e-05, + "loss": 0.0001, + "num_input_tokens_seen": 238651440, + "step": 110595 + }, + { + "epoch": 18.042414355628058, + "grad_norm": 0.003930697217583656, + "learning_rate": 2.8905408389058917e-05, + "loss": 0.0023, + "num_input_tokens_seen": 238662864, + "step": 110600 + }, + { + "epoch": 18.043230016313213, + "grad_norm": 0.005672941450029612, + "learning_rate": 2.8881562128341088e-05, + "loss": 0.0009, + "num_input_tokens_seen": 238672240, + "step": 110605 + }, + { + "epoch": 18.04404567699837, + "grad_norm": 0.0036455909721553326, + "learning_rate": 2.885772541543613e-05, + "loss": 0.0007, + "num_input_tokens_seen": 238682608, + "step": 110610 + }, + { + "epoch": 18.044861337683525, + "grad_norm": 0.022672155871987343, + "learning_rate": 2.8833898250826994e-05, + "loss": 0.0008, + "num_input_tokens_seen": 238693072, + "step": 110615 + }, + { + "epoch": 18.045676998368677, + "grad_norm": 0.00017792798462323844, + "learning_rate": 2.881008063499663e-05, + "loss": 0.0014, + "num_input_tokens_seen": 238704240, + "step": 110620 + }, + { + "epoch": 18.046492659053833, + "grad_norm": 0.0013916065217927098, + "learning_rate": 2.878627256842775e-05, + "loss": 0.0006, + "num_input_tokens_seen": 238716112, + "step": 110625 + }, + { + "epoch": 18.04730831973899, + "grad_norm": 0.010079164057970047, + "learning_rate": 2.8762474051602816e-05, + "loss": 0.0004, + "num_input_tokens_seen": 238726096, + "step": 110630 + }, + { + "epoch": 18.048123980424144, + "grad_norm": 0.011101754382252693, + "learning_rate": 2.8738685085004156e-05, + "loss": 0.0006, + "num_input_tokens_seen": 238737744, + "step": 110635 + }, + { + "epoch": 18.0489396411093, + "grad_norm": 0.0006997852469794452, + "learning_rate": 2.871490566911389e-05, + "loss": 0.0045, + "num_input_tokens_seen": 238748560, + "step": 110640 + }, + { + "epoch": 18.049755301794452, + "grad_norm": 0.016348257660865784, + "learning_rate": 2.8691135804413905e-05, + "loss": 0.0012, + "num_input_tokens_seen": 238759760, + "step": 110645 + }, + { + "epoch": 18.050570962479608, + "grad_norm": 0.0015223800437524915, + "learning_rate": 2.8667375491385928e-05, + "loss": 0.0003, + "num_input_tokens_seen": 238770928, + "step": 110650 + }, + { + "epoch": 18.051386623164763, + "grad_norm": 0.00031539518386125565, + "learning_rate": 2.864362473051163e-05, + "loss": 0.0003, + "num_input_tokens_seen": 238781392, + "step": 110655 + }, + { + "epoch": 18.05220228384992, + "grad_norm": 0.0007792735705152154, + "learning_rate": 2.8619883522272072e-05, + "loss": 0.0014, + "num_input_tokens_seen": 238790448, + "step": 110660 + }, + { + "epoch": 18.053017944535075, + "grad_norm": 0.0035716292914003134, + "learning_rate": 2.85961518671487e-05, + "loss": 0.0011, + "num_input_tokens_seen": 238800624, + "step": 110665 + }, + { + "epoch": 18.053833605220227, + "grad_norm": 0.010899029672145844, + "learning_rate": 2.8572429765622243e-05, + "loss": 0.0003, + "num_input_tokens_seen": 238811760, + "step": 110670 + }, + { + "epoch": 18.054649265905383, + "grad_norm": 0.004851315636187792, + "learning_rate": 2.8548717218173647e-05, + "loss": 0.0005, + "num_input_tokens_seen": 238822448, + "step": 110675 + }, + { + "epoch": 18.05546492659054, + "grad_norm": 0.01262225303798914, + "learning_rate": 2.8525014225283195e-05, + "loss": 0.0005, + "num_input_tokens_seen": 238833392, + "step": 110680 + }, + { + "epoch": 18.056280587275694, + "grad_norm": 0.003260652767494321, + "learning_rate": 2.8501320787431673e-05, + "loss": 0.001, + "num_input_tokens_seen": 238843568, + "step": 110685 + }, + { + "epoch": 18.05709624796085, + "grad_norm": 0.000378935772459954, + "learning_rate": 2.8477636905098802e-05, + "loss": 0.0002, + "num_input_tokens_seen": 238854928, + "step": 110690 + }, + { + "epoch": 18.057911908646002, + "grad_norm": 0.04385722428560257, + "learning_rate": 2.845396257876487e-05, + "loss": 0.0008, + "num_input_tokens_seen": 238865136, + "step": 110695 + }, + { + "epoch": 18.058727569331158, + "grad_norm": 0.002747379010543227, + "learning_rate": 2.84302978089096e-05, + "loss": 0.0005, + "num_input_tokens_seen": 238875824, + "step": 110700 + }, + { + "epoch": 18.059543230016313, + "grad_norm": 0.002311385702341795, + "learning_rate": 2.840664259601261e-05, + "loss": 0.0002, + "num_input_tokens_seen": 238885904, + "step": 110705 + }, + { + "epoch": 18.06035889070147, + "grad_norm": 0.003220928367227316, + "learning_rate": 2.838299694055324e-05, + "loss": 0.0005, + "num_input_tokens_seen": 238896208, + "step": 110710 + }, + { + "epoch": 18.061174551386625, + "grad_norm": 0.00012777902884408832, + "learning_rate": 2.835936084301072e-05, + "loss": 0.0005, + "num_input_tokens_seen": 238906608, + "step": 110715 + }, + { + "epoch": 18.061990212071777, + "grad_norm": 8.947848255047575e-05, + "learning_rate": 2.8335734303864047e-05, + "loss": 0.0002, + "num_input_tokens_seen": 238917840, + "step": 110720 + }, + { + "epoch": 18.062805872756933, + "grad_norm": 0.00027031838544644415, + "learning_rate": 2.8312117323592125e-05, + "loss": 0.0006, + "num_input_tokens_seen": 238927376, + "step": 110725 + }, + { + "epoch": 18.063621533442088, + "grad_norm": 0.04812120273709297, + "learning_rate": 2.8288509902673454e-05, + "loss": 0.0014, + "num_input_tokens_seen": 238937680, + "step": 110730 + }, + { + "epoch": 18.064437194127244, + "grad_norm": 0.0036094679962843657, + "learning_rate": 2.8264912041586598e-05, + "loss": 0.0027, + "num_input_tokens_seen": 238949168, + "step": 110735 + }, + { + "epoch": 18.0652528548124, + "grad_norm": 0.004373582545667887, + "learning_rate": 2.8241323740809676e-05, + "loss": 0.001, + "num_input_tokens_seen": 238959728, + "step": 110740 + }, + { + "epoch": 18.06606851549755, + "grad_norm": 0.001661698566749692, + "learning_rate": 2.821774500082086e-05, + "loss": 0.0002, + "num_input_tokens_seen": 238969904, + "step": 110745 + }, + { + "epoch": 18.066884176182707, + "grad_norm": 0.0006155354785732925, + "learning_rate": 2.819417582209788e-05, + "loss": 0.0001, + "num_input_tokens_seen": 238982704, + "step": 110750 + }, + { + "epoch": 18.067699836867863, + "grad_norm": 0.003346961224451661, + "learning_rate": 2.8170616205118516e-05, + "loss": 0.0003, + "num_input_tokens_seen": 238992048, + "step": 110755 + }, + { + "epoch": 18.06851549755302, + "grad_norm": 0.10005409270524979, + "learning_rate": 2.8147066150360167e-05, + "loss": 0.0018, + "num_input_tokens_seen": 239003696, + "step": 110760 + }, + { + "epoch": 18.069331158238175, + "grad_norm": 0.0009065805934369564, + "learning_rate": 2.8123525658300066e-05, + "loss": 0.0008, + "num_input_tokens_seen": 239014448, + "step": 110765 + }, + { + "epoch": 18.070146818923327, + "grad_norm": 0.006155765615403652, + "learning_rate": 2.8099994729415377e-05, + "loss": 0.0005, + "num_input_tokens_seen": 239025232, + "step": 110770 + }, + { + "epoch": 18.070962479608482, + "grad_norm": 0.00010765331535367295, + "learning_rate": 2.8076473364182897e-05, + "loss": 0.0042, + "num_input_tokens_seen": 239035600, + "step": 110775 + }, + { + "epoch": 18.071778140293638, + "grad_norm": 0.0002873715420719236, + "learning_rate": 2.8052961563079403e-05, + "loss": 0.0004, + "num_input_tokens_seen": 239047376, + "step": 110780 + }, + { + "epoch": 18.072593800978794, + "grad_norm": 0.0013168805744498968, + "learning_rate": 2.8029459326581353e-05, + "loss": 0.0013, + "num_input_tokens_seen": 239057168, + "step": 110785 + }, + { + "epoch": 18.07340946166395, + "grad_norm": 0.0005378556088544428, + "learning_rate": 2.8005966655165026e-05, + "loss": 0.0001, + "num_input_tokens_seen": 239068496, + "step": 110790 + }, + { + "epoch": 18.0742251223491, + "grad_norm": 0.0007988149882294238, + "learning_rate": 2.7982483549306435e-05, + "loss": 0.0005, + "num_input_tokens_seen": 239079952, + "step": 110795 + }, + { + "epoch": 18.075040783034257, + "grad_norm": 0.028562987223267555, + "learning_rate": 2.795901000948181e-05, + "loss": 0.0008, + "num_input_tokens_seen": 239090960, + "step": 110800 + }, + { + "epoch": 18.075856443719413, + "grad_norm": 0.047057125717401505, + "learning_rate": 2.7935546036166548e-05, + "loss": 0.0016, + "num_input_tokens_seen": 239100336, + "step": 110805 + }, + { + "epoch": 18.07667210440457, + "grad_norm": 0.0016235620714724064, + "learning_rate": 2.7912091629836324e-05, + "loss": 0.0003, + "num_input_tokens_seen": 239111216, + "step": 110810 + }, + { + "epoch": 18.07748776508972, + "grad_norm": 0.0006812784122303128, + "learning_rate": 2.7888646790966476e-05, + "loss": 0.0001, + "num_input_tokens_seen": 239122576, + "step": 110815 + }, + { + "epoch": 18.078303425774877, + "grad_norm": 0.08591090887784958, + "learning_rate": 2.786521152003213e-05, + "loss": 0.0032, + "num_input_tokens_seen": 239133712, + "step": 110820 + }, + { + "epoch": 18.079119086460032, + "grad_norm": 0.0038968082517385483, + "learning_rate": 2.784178581750818e-05, + "loss": 0.0004, + "num_input_tokens_seen": 239144016, + "step": 110825 + }, + { + "epoch": 18.079934747145188, + "grad_norm": 0.000136450253194198, + "learning_rate": 2.781836968386947e-05, + "loss": 0.0002, + "num_input_tokens_seen": 239154352, + "step": 110830 + }, + { + "epoch": 18.080750407830344, + "grad_norm": 0.002850303193554282, + "learning_rate": 2.7794963119590454e-05, + "loss": 0.0008, + "num_input_tokens_seen": 239165616, + "step": 110835 + }, + { + "epoch": 18.081566068515496, + "grad_norm": 0.0028036253061145544, + "learning_rate": 2.7771566125145588e-05, + "loss": 0.0005, + "num_input_tokens_seen": 239176048, + "step": 110840 + }, + { + "epoch": 18.08238172920065, + "grad_norm": 0.0006606015958823264, + "learning_rate": 2.774817870100893e-05, + "loss": 0.0002, + "num_input_tokens_seen": 239186320, + "step": 110845 + }, + { + "epoch": 18.083197389885807, + "grad_norm": 0.014084829948842525, + "learning_rate": 2.7724800847654608e-05, + "loss": 0.0007, + "num_input_tokens_seen": 239196528, + "step": 110850 + }, + { + "epoch": 18.084013050570963, + "grad_norm": 0.00018625934899318963, + "learning_rate": 2.7701432565556296e-05, + "loss": 0.0004, + "num_input_tokens_seen": 239207760, + "step": 110855 + }, + { + "epoch": 18.08482871125612, + "grad_norm": 0.004975112155079842, + "learning_rate": 2.767807385518756e-05, + "loss": 0.0004, + "num_input_tokens_seen": 239218992, + "step": 110860 + }, + { + "epoch": 18.08564437194127, + "grad_norm": 0.000280034844763577, + "learning_rate": 2.765472471702185e-05, + "loss": 0.0009, + "num_input_tokens_seen": 239229808, + "step": 110865 + }, + { + "epoch": 18.086460032626427, + "grad_norm": 0.00017918055527843535, + "learning_rate": 2.7631385151532405e-05, + "loss": 0.0001, + "num_input_tokens_seen": 239240528, + "step": 110870 + }, + { + "epoch": 18.087275693311582, + "grad_norm": 0.012052093632519245, + "learning_rate": 2.7608055159192125e-05, + "loss": 0.0018, + "num_input_tokens_seen": 239251216, + "step": 110875 + }, + { + "epoch": 18.088091353996738, + "grad_norm": 0.0036937850527465343, + "learning_rate": 2.7584734740473905e-05, + "loss": 0.0005, + "num_input_tokens_seen": 239261488, + "step": 110880 + }, + { + "epoch": 18.088907014681894, + "grad_norm": 0.004613324999809265, + "learning_rate": 2.756142389585037e-05, + "loss": 0.0059, + "num_input_tokens_seen": 239272496, + "step": 110885 + }, + { + "epoch": 18.089722675367046, + "grad_norm": 0.00020710681565105915, + "learning_rate": 2.753812262579386e-05, + "loss": 0.0001, + "num_input_tokens_seen": 239283824, + "step": 110890 + }, + { + "epoch": 18.0905383360522, + "grad_norm": 0.0017628376372158527, + "learning_rate": 2.7514830930776667e-05, + "loss": 0.0002, + "num_input_tokens_seen": 239293936, + "step": 110895 + }, + { + "epoch": 18.091353996737357, + "grad_norm": 0.00018775918579194695, + "learning_rate": 2.749154881127086e-05, + "loss": 0.0011, + "num_input_tokens_seen": 239305712, + "step": 110900 + }, + { + "epoch": 18.092169657422513, + "grad_norm": 0.0003207278496120125, + "learning_rate": 2.7468276267748172e-05, + "loss": 0.0009, + "num_input_tokens_seen": 239315696, + "step": 110905 + }, + { + "epoch": 18.09298531810767, + "grad_norm": 0.00201684539206326, + "learning_rate": 2.7445013300680333e-05, + "loss": 0.0004, + "num_input_tokens_seen": 239327568, + "step": 110910 + }, + { + "epoch": 18.09380097879282, + "grad_norm": 0.020108338445425034, + "learning_rate": 2.7421759910538745e-05, + "loss": 0.0007, + "num_input_tokens_seen": 239339312, + "step": 110915 + }, + { + "epoch": 18.094616639477977, + "grad_norm": 0.0005559224518947303, + "learning_rate": 2.739851609779481e-05, + "loss": 0.0003, + "num_input_tokens_seen": 239349936, + "step": 110920 + }, + { + "epoch": 18.095432300163132, + "grad_norm": 0.00020967914315406233, + "learning_rate": 2.737528186291932e-05, + "loss": 0.0009, + "num_input_tokens_seen": 239359568, + "step": 110925 + }, + { + "epoch": 18.096247960848288, + "grad_norm": 0.003921035211533308, + "learning_rate": 2.735205720638351e-05, + "loss": 0.0007, + "num_input_tokens_seen": 239369712, + "step": 110930 + }, + { + "epoch": 18.097063621533444, + "grad_norm": 0.008295831270515919, + "learning_rate": 2.732884212865766e-05, + "loss": 0.0109, + "num_input_tokens_seen": 239380880, + "step": 110935 + }, + { + "epoch": 18.097879282218596, + "grad_norm": 0.0066918982192873955, + "learning_rate": 2.730563663021257e-05, + "loss": 0.0018, + "num_input_tokens_seen": 239392464, + "step": 110940 + }, + { + "epoch": 18.09869494290375, + "grad_norm": 0.000509702309500426, + "learning_rate": 2.7282440711518363e-05, + "loss": 0.0002, + "num_input_tokens_seen": 239402224, + "step": 110945 + }, + { + "epoch": 18.099510603588907, + "grad_norm": 0.0007984688272699714, + "learning_rate": 2.725925437304522e-05, + "loss": 0.0007, + "num_input_tokens_seen": 239413072, + "step": 110950 + }, + { + "epoch": 18.100326264274063, + "grad_norm": 0.0020236854907125235, + "learning_rate": 2.7236077615262976e-05, + "loss": 0.0002, + "num_input_tokens_seen": 239423472, + "step": 110955 + }, + { + "epoch": 18.10114192495922, + "grad_norm": 0.028827782720327377, + "learning_rate": 2.721291043864138e-05, + "loss": 0.0006, + "num_input_tokens_seen": 239434064, + "step": 110960 + }, + { + "epoch": 18.10195758564437, + "grad_norm": 0.09916806221008301, + "learning_rate": 2.7189752843649885e-05, + "loss": 0.0013, + "num_input_tokens_seen": 239444560, + "step": 110965 + }, + { + "epoch": 18.102773246329527, + "grad_norm": 0.13658368587493896, + "learning_rate": 2.716660483075789e-05, + "loss": 0.0035, + "num_input_tokens_seen": 239456560, + "step": 110970 + }, + { + "epoch": 18.103588907014682, + "grad_norm": 0.00030747923301532865, + "learning_rate": 2.714346640043447e-05, + "loss": 0.0021, + "num_input_tokens_seen": 239468464, + "step": 110975 + }, + { + "epoch": 18.104404567699838, + "grad_norm": 0.00017850229050964117, + "learning_rate": 2.7120337553148578e-05, + "loss": 0.0009, + "num_input_tokens_seen": 239478448, + "step": 110980 + }, + { + "epoch": 18.10522022838499, + "grad_norm": 0.00015514488040935248, + "learning_rate": 2.7097218289368896e-05, + "loss": 0.0004, + "num_input_tokens_seen": 239489968, + "step": 110985 + }, + { + "epoch": 18.106035889070146, + "grad_norm": 0.001853234483860433, + "learning_rate": 2.7074108609564053e-05, + "loss": 0.0015, + "num_input_tokens_seen": 239500368, + "step": 110990 + }, + { + "epoch": 18.1068515497553, + "grad_norm": 0.02025492675602436, + "learning_rate": 2.7051008514202336e-05, + "loss": 0.0016, + "num_input_tokens_seen": 239510928, + "step": 110995 + }, + { + "epoch": 18.107667210440457, + "grad_norm": 0.0004975805059075356, + "learning_rate": 2.7027918003751873e-05, + "loss": 0.0021, + "num_input_tokens_seen": 239522064, + "step": 111000 + }, + { + "epoch": 18.108482871125613, + "grad_norm": 7.811758405296132e-05, + "learning_rate": 2.7004837078680678e-05, + "loss": 0.0001, + "num_input_tokens_seen": 239532912, + "step": 111005 + }, + { + "epoch": 18.109298531810765, + "grad_norm": 0.11428035795688629, + "learning_rate": 2.698176573945654e-05, + "loss": 0.0025, + "num_input_tokens_seen": 239544112, + "step": 111010 + }, + { + "epoch": 18.11011419249592, + "grad_norm": 7.730815559625626e-05, + "learning_rate": 2.695870398654693e-05, + "loss": 0.0004, + "num_input_tokens_seen": 239554320, + "step": 111015 + }, + { + "epoch": 18.110929853181077, + "grad_norm": 0.0014162738807499409, + "learning_rate": 2.693565182041924e-05, + "loss": 0.001, + "num_input_tokens_seen": 239564784, + "step": 111020 + }, + { + "epoch": 18.111745513866232, + "grad_norm": 0.0005722580244764686, + "learning_rate": 2.6912609241540818e-05, + "loss": 0.0011, + "num_input_tokens_seen": 239575952, + "step": 111025 + }, + { + "epoch": 18.112561174551388, + "grad_norm": 0.0018707423005253077, + "learning_rate": 2.688957625037841e-05, + "loss": 0.0002, + "num_input_tokens_seen": 239586032, + "step": 111030 + }, + { + "epoch": 18.11337683523654, + "grad_norm": 0.0067637525498867035, + "learning_rate": 2.6866552847399028e-05, + "loss": 0.0009, + "num_input_tokens_seen": 239597008, + "step": 111035 + }, + { + "epoch": 18.114192495921696, + "grad_norm": 0.0008572746883146465, + "learning_rate": 2.684353903306902e-05, + "loss": 0.0002, + "num_input_tokens_seen": 239607344, + "step": 111040 + }, + { + "epoch": 18.11500815660685, + "grad_norm": 0.00022151188750285655, + "learning_rate": 2.6820534807855124e-05, + "loss": 0.0003, + "num_input_tokens_seen": 239617552, + "step": 111045 + }, + { + "epoch": 18.115823817292007, + "grad_norm": 0.008883276954293251, + "learning_rate": 2.679754017222319e-05, + "loss": 0.0003, + "num_input_tokens_seen": 239628656, + "step": 111050 + }, + { + "epoch": 18.116639477977163, + "grad_norm": 0.0006711349706165493, + "learning_rate": 2.677455512663951e-05, + "loss": 0.0002, + "num_input_tokens_seen": 239639120, + "step": 111055 + }, + { + "epoch": 18.117455138662315, + "grad_norm": 0.009341733530163765, + "learning_rate": 2.6751579671569715e-05, + "loss": 0.0006, + "num_input_tokens_seen": 239648464, + "step": 111060 + }, + { + "epoch": 18.11827079934747, + "grad_norm": 0.00010760693112388253, + "learning_rate": 2.6728613807479594e-05, + "loss": 0.0002, + "num_input_tokens_seen": 239660080, + "step": 111065 + }, + { + "epoch": 18.119086460032626, + "grad_norm": 0.0004025481757707894, + "learning_rate": 2.6705657534834394e-05, + "loss": 0.0004, + "num_input_tokens_seen": 239671280, + "step": 111070 + }, + { + "epoch": 18.119902120717782, + "grad_norm": 0.0002477603848092258, + "learning_rate": 2.6682710854099623e-05, + "loss": 0.0002, + "num_input_tokens_seen": 239681328, + "step": 111075 + }, + { + "epoch": 18.120717781402938, + "grad_norm": 0.04462388530373573, + "learning_rate": 2.6659773765740025e-05, + "loss": 0.0018, + "num_input_tokens_seen": 239691472, + "step": 111080 + }, + { + "epoch": 18.12153344208809, + "grad_norm": 0.004458005074411631, + "learning_rate": 2.6636846270220615e-05, + "loss": 0.0003, + "num_input_tokens_seen": 239701392, + "step": 111085 + }, + { + "epoch": 18.122349102773246, + "grad_norm": 0.006411997135728598, + "learning_rate": 2.661392836800608e-05, + "loss": 0.0003, + "num_input_tokens_seen": 239712016, + "step": 111090 + }, + { + "epoch": 18.1231647634584, + "grad_norm": 0.000510230369400233, + "learning_rate": 2.6591020059560766e-05, + "loss": 0.001, + "num_input_tokens_seen": 239722512, + "step": 111095 + }, + { + "epoch": 18.123980424143557, + "grad_norm": 0.00016511759895365685, + "learning_rate": 2.656812134534897e-05, + "loss": 0.0001, + "num_input_tokens_seen": 239734352, + "step": 111100 + }, + { + "epoch": 18.124796084828713, + "grad_norm": 0.009457617066800594, + "learning_rate": 2.6545232225834825e-05, + "loss": 0.0005, + "num_input_tokens_seen": 239745904, + "step": 111105 + }, + { + "epoch": 18.125611745513865, + "grad_norm": 0.00010033736180048436, + "learning_rate": 2.6522352701482178e-05, + "loss": 0.0004, + "num_input_tokens_seen": 239756400, + "step": 111110 + }, + { + "epoch": 18.12642740619902, + "grad_norm": 0.0004305252805352211, + "learning_rate": 2.6499482772754714e-05, + "loss": 0.0003, + "num_input_tokens_seen": 239767440, + "step": 111115 + }, + { + "epoch": 18.127243066884176, + "grad_norm": 0.0002640663296915591, + "learning_rate": 2.6476622440115894e-05, + "loss": 0.0002, + "num_input_tokens_seen": 239777936, + "step": 111120 + }, + { + "epoch": 18.128058727569332, + "grad_norm": 0.008301780559122562, + "learning_rate": 2.6453771704029017e-05, + "loss": 0.0007, + "num_input_tokens_seen": 239788368, + "step": 111125 + }, + { + "epoch": 18.128874388254488, + "grad_norm": 0.015624604187905788, + "learning_rate": 2.6430930564957213e-05, + "loss": 0.0006, + "num_input_tokens_seen": 239800016, + "step": 111130 + }, + { + "epoch": 18.12969004893964, + "grad_norm": 0.0074993399903178215, + "learning_rate": 2.6408099023363275e-05, + "loss": 0.0004, + "num_input_tokens_seen": 239810288, + "step": 111135 + }, + { + "epoch": 18.130505709624796, + "grad_norm": 0.0008496912778355181, + "learning_rate": 2.6385277079710113e-05, + "loss": 0.0003, + "num_input_tokens_seen": 239820400, + "step": 111140 + }, + { + "epoch": 18.13132137030995, + "grad_norm": 0.00036895289667882025, + "learning_rate": 2.6362464734460024e-05, + "loss": 0.0003, + "num_input_tokens_seen": 239830960, + "step": 111145 + }, + { + "epoch": 18.132137030995107, + "grad_norm": 0.006931063253432512, + "learning_rate": 2.633966198807558e-05, + "loss": 0.0002, + "num_input_tokens_seen": 239840976, + "step": 111150 + }, + { + "epoch": 18.13295269168026, + "grad_norm": 0.0042406837455928326, + "learning_rate": 2.631686884101864e-05, + "loss": 0.003, + "num_input_tokens_seen": 239853136, + "step": 111155 + }, + { + "epoch": 18.133768352365415, + "grad_norm": 0.00016545032849535346, + "learning_rate": 2.6294085293751435e-05, + "loss": 0.008, + "num_input_tokens_seen": 239864688, + "step": 111160 + }, + { + "epoch": 18.13458401305057, + "grad_norm": 0.035217154771089554, + "learning_rate": 2.6271311346735326e-05, + "loss": 0.0007, + "num_input_tokens_seen": 239875312, + "step": 111165 + }, + { + "epoch": 18.135399673735726, + "grad_norm": 0.0007654440123587847, + "learning_rate": 2.624854700043222e-05, + "loss": 0.0001, + "num_input_tokens_seen": 239885904, + "step": 111170 + }, + { + "epoch": 18.136215334420882, + "grad_norm": 0.22622781991958618, + "learning_rate": 2.6225792255303195e-05, + "loss": 0.0108, + "num_input_tokens_seen": 239895856, + "step": 111175 + }, + { + "epoch": 18.137030995106034, + "grad_norm": 0.00032420834759250283, + "learning_rate": 2.6203047111809597e-05, + "loss": 0.0032, + "num_input_tokens_seen": 239906992, + "step": 111180 + }, + { + "epoch": 18.13784665579119, + "grad_norm": 0.215866819024086, + "learning_rate": 2.6180311570412174e-05, + "loss": 0.0074, + "num_input_tokens_seen": 239916720, + "step": 111185 + }, + { + "epoch": 18.138662316476346, + "grad_norm": 0.021184224635362625, + "learning_rate": 2.6157585631572e-05, + "loss": 0.0008, + "num_input_tokens_seen": 239928144, + "step": 111190 + }, + { + "epoch": 18.1394779771615, + "grad_norm": 0.02616288885474205, + "learning_rate": 2.613486929574932e-05, + "loss": 0.0009, + "num_input_tokens_seen": 239939152, + "step": 111195 + }, + { + "epoch": 18.140293637846657, + "grad_norm": 0.0003412305668462068, + "learning_rate": 2.611216256340476e-05, + "loss": 0.0003, + "num_input_tokens_seen": 239950128, + "step": 111200 + }, + { + "epoch": 18.14110929853181, + "grad_norm": 0.00027304835384711623, + "learning_rate": 2.6089465434998296e-05, + "loss": 0.0004, + "num_input_tokens_seen": 239961040, + "step": 111205 + }, + { + "epoch": 18.141924959216965, + "grad_norm": 0.0011335198068991303, + "learning_rate": 2.6066777910990104e-05, + "loss": 0.0004, + "num_input_tokens_seen": 239971184, + "step": 111210 + }, + { + "epoch": 18.14274061990212, + "grad_norm": 0.007375058252364397, + "learning_rate": 2.6044099991839766e-05, + "loss": 0.0008, + "num_input_tokens_seen": 239982096, + "step": 111215 + }, + { + "epoch": 18.143556280587276, + "grad_norm": 0.00014955218648537993, + "learning_rate": 2.602143167800719e-05, + "loss": 0.0004, + "num_input_tokens_seen": 239991728, + "step": 111220 + }, + { + "epoch": 18.144371941272432, + "grad_norm": 0.09610569477081299, + "learning_rate": 2.59987729699514e-05, + "loss": 0.0033, + "num_input_tokens_seen": 240002192, + "step": 111225 + }, + { + "epoch": 18.145187601957584, + "grad_norm": 0.0016175741329789162, + "learning_rate": 2.5976123868131864e-05, + "loss": 0.0002, + "num_input_tokens_seen": 240014320, + "step": 111230 + }, + { + "epoch": 18.14600326264274, + "grad_norm": 0.001302407355979085, + "learning_rate": 2.5953484373007487e-05, + "loss": 0.0014, + "num_input_tokens_seen": 240024688, + "step": 111235 + }, + { + "epoch": 18.146818923327896, + "grad_norm": 0.00047696384717710316, + "learning_rate": 2.5930854485037124e-05, + "loss": 0.002, + "num_input_tokens_seen": 240034288, + "step": 111240 + }, + { + "epoch": 18.14763458401305, + "grad_norm": 0.0010280075948685408, + "learning_rate": 2.590823420467947e-05, + "loss": 0.0028, + "num_input_tokens_seen": 240045616, + "step": 111245 + }, + { + "epoch": 18.148450244698207, + "grad_norm": 0.00012670339492615312, + "learning_rate": 2.5885623532392823e-05, + "loss": 0.0002, + "num_input_tokens_seen": 240056496, + "step": 111250 + }, + { + "epoch": 18.14926590538336, + "grad_norm": 0.0006197491311468184, + "learning_rate": 2.586302246863548e-05, + "loss": 0.0005, + "num_input_tokens_seen": 240068080, + "step": 111255 + }, + { + "epoch": 18.150081566068515, + "grad_norm": 0.0014059824170544744, + "learning_rate": 2.584043101386546e-05, + "loss": 0.0006, + "num_input_tokens_seen": 240079184, + "step": 111260 + }, + { + "epoch": 18.15089722675367, + "grad_norm": 0.2954062819480896, + "learning_rate": 2.5817849168540576e-05, + "loss": 0.0089, + "num_input_tokens_seen": 240089264, + "step": 111265 + }, + { + "epoch": 18.151712887438826, + "grad_norm": 0.00026600301498547196, + "learning_rate": 2.5795276933118618e-05, + "loss": 0.0099, + "num_input_tokens_seen": 240101424, + "step": 111270 + }, + { + "epoch": 18.152528548123982, + "grad_norm": 0.0025818159338086843, + "learning_rate": 2.5772714308056887e-05, + "loss": 0.002, + "num_input_tokens_seen": 240112304, + "step": 111275 + }, + { + "epoch": 18.153344208809134, + "grad_norm": 0.00012467047781683505, + "learning_rate": 2.5750161293812635e-05, + "loss": 0.0002, + "num_input_tokens_seen": 240120944, + "step": 111280 + }, + { + "epoch": 18.15415986949429, + "grad_norm": 0.00017481452960055321, + "learning_rate": 2.572761789084316e-05, + "loss": 0.0016, + "num_input_tokens_seen": 240131472, + "step": 111285 + }, + { + "epoch": 18.154975530179446, + "grad_norm": 0.00013756347470916808, + "learning_rate": 2.570508409960498e-05, + "loss": 0.0004, + "num_input_tokens_seen": 240142640, + "step": 111290 + }, + { + "epoch": 18.1557911908646, + "grad_norm": 0.12732544541358948, + "learning_rate": 2.5682559920555127e-05, + "loss": 0.0024, + "num_input_tokens_seen": 240153040, + "step": 111295 + }, + { + "epoch": 18.156606851549757, + "grad_norm": 0.00013433521962724626, + "learning_rate": 2.5660045354149786e-05, + "loss": 0.0012, + "num_input_tokens_seen": 240163728, + "step": 111300 + }, + { + "epoch": 18.15742251223491, + "grad_norm": 0.0002620774321258068, + "learning_rate": 2.5637540400845483e-05, + "loss": 0.0008, + "num_input_tokens_seen": 240174448, + "step": 111305 + }, + { + "epoch": 18.158238172920065, + "grad_norm": 0.0005444762646220624, + "learning_rate": 2.561504506109802e-05, + "loss": 0.0004, + "num_input_tokens_seen": 240185616, + "step": 111310 + }, + { + "epoch": 18.15905383360522, + "grad_norm": 0.03833063691854477, + "learning_rate": 2.5592559335363696e-05, + "loss": 0.0115, + "num_input_tokens_seen": 240195888, + "step": 111315 + }, + { + "epoch": 18.159869494290376, + "grad_norm": 0.01958480477333069, + "learning_rate": 2.5570083224097763e-05, + "loss": 0.0045, + "num_input_tokens_seen": 240205360, + "step": 111320 + }, + { + "epoch": 18.160685154975532, + "grad_norm": 0.0032453276216983795, + "learning_rate": 2.554761672775613e-05, + "loss": 0.0003, + "num_input_tokens_seen": 240216080, + "step": 111325 + }, + { + "epoch": 18.161500815660684, + "grad_norm": 0.0008954997756518424, + "learning_rate": 2.5525159846793822e-05, + "loss": 0.0001, + "num_input_tokens_seen": 240226672, + "step": 111330 + }, + { + "epoch": 18.16231647634584, + "grad_norm": 0.00018683764210436493, + "learning_rate": 2.550271258166609e-05, + "loss": 0.0002, + "num_input_tokens_seen": 240237776, + "step": 111335 + }, + { + "epoch": 18.163132137030995, + "grad_norm": 0.00020356355526018888, + "learning_rate": 2.548027493282784e-05, + "loss": 0.0037, + "num_input_tokens_seen": 240249040, + "step": 111340 + }, + { + "epoch": 18.16394779771615, + "grad_norm": 0.00039322132943198085, + "learning_rate": 2.5457846900733774e-05, + "loss": 0.0003, + "num_input_tokens_seen": 240260464, + "step": 111345 + }, + { + "epoch": 18.164763458401303, + "grad_norm": 8.257712033810094e-05, + "learning_rate": 2.5435428485838465e-05, + "loss": 0.0005, + "num_input_tokens_seen": 240270544, + "step": 111350 + }, + { + "epoch": 18.16557911908646, + "grad_norm": 0.0004246453463565558, + "learning_rate": 2.5413019688596218e-05, + "loss": 0.0006, + "num_input_tokens_seen": 240281840, + "step": 111355 + }, + { + "epoch": 18.166394779771615, + "grad_norm": 0.0019960044883191586, + "learning_rate": 2.539062050946117e-05, + "loss": 0.001, + "num_input_tokens_seen": 240292752, + "step": 111360 + }, + { + "epoch": 18.16721044045677, + "grad_norm": 0.0001875641755759716, + "learning_rate": 2.5368230948887295e-05, + "loss": 0.0011, + "num_input_tokens_seen": 240303632, + "step": 111365 + }, + { + "epoch": 18.168026101141926, + "grad_norm": 0.0003185457899235189, + "learning_rate": 2.5345851007328336e-05, + "loss": 0.0056, + "num_input_tokens_seen": 240314160, + "step": 111370 + }, + { + "epoch": 18.16884176182708, + "grad_norm": 0.0007590515888296068, + "learning_rate": 2.532348068523782e-05, + "loss": 0.007, + "num_input_tokens_seen": 240324720, + "step": 111375 + }, + { + "epoch": 18.169657422512234, + "grad_norm": 0.00021520676091313362, + "learning_rate": 2.5301119983069165e-05, + "loss": 0.0004, + "num_input_tokens_seen": 240335632, + "step": 111380 + }, + { + "epoch": 18.17047308319739, + "grad_norm": 0.00937348511070013, + "learning_rate": 2.5278768901275506e-05, + "loss": 0.0006, + "num_input_tokens_seen": 240346000, + "step": 111385 + }, + { + "epoch": 18.171288743882545, + "grad_norm": 0.0010706925531849265, + "learning_rate": 2.5256427440309815e-05, + "loss": 0.0002, + "num_input_tokens_seen": 240356592, + "step": 111390 + }, + { + "epoch": 18.1721044045677, + "grad_norm": 0.0029469416476786137, + "learning_rate": 2.5234095600624896e-05, + "loss": 0.0003, + "num_input_tokens_seen": 240366832, + "step": 111395 + }, + { + "epoch": 18.172920065252853, + "grad_norm": 0.0007542863604612648, + "learning_rate": 2.5211773382673274e-05, + "loss": 0.0008, + "num_input_tokens_seen": 240377328, + "step": 111400 + }, + { + "epoch": 18.17373572593801, + "grad_norm": 0.0007846896769478917, + "learning_rate": 2.5189460786907425e-05, + "loss": 0.0002, + "num_input_tokens_seen": 240388144, + "step": 111405 + }, + { + "epoch": 18.174551386623165, + "grad_norm": 0.018730157986283302, + "learning_rate": 2.5167157813779485e-05, + "loss": 0.0008, + "num_input_tokens_seen": 240399184, + "step": 111410 + }, + { + "epoch": 18.17536704730832, + "grad_norm": 0.0010466370731592178, + "learning_rate": 2.5144864463741423e-05, + "loss": 0.0002, + "num_input_tokens_seen": 240409584, + "step": 111415 + }, + { + "epoch": 18.176182707993476, + "grad_norm": 0.00024146890791598707, + "learning_rate": 2.5122580737245105e-05, + "loss": 0.0078, + "num_input_tokens_seen": 240419600, + "step": 111420 + }, + { + "epoch": 18.17699836867863, + "grad_norm": 0.0001388684759149328, + "learning_rate": 2.5100306634742053e-05, + "loss": 0.0483, + "num_input_tokens_seen": 240430448, + "step": 111425 + }, + { + "epoch": 18.177814029363784, + "grad_norm": 0.00012916902778670192, + "learning_rate": 2.5078042156683854e-05, + "loss": 0.0001, + "num_input_tokens_seen": 240440784, + "step": 111430 + }, + { + "epoch": 18.17862969004894, + "grad_norm": 0.08085226267576218, + "learning_rate": 2.5055787303521483e-05, + "loss": 0.0176, + "num_input_tokens_seen": 240450800, + "step": 111435 + }, + { + "epoch": 18.179445350734095, + "grad_norm": 0.0008967618341557682, + "learning_rate": 2.5033542075706184e-05, + "loss": 0.0008, + "num_input_tokens_seen": 240461936, + "step": 111440 + }, + { + "epoch": 18.18026101141925, + "grad_norm": 0.002806832082569599, + "learning_rate": 2.5011306473688656e-05, + "loss": 0.0002, + "num_input_tokens_seen": 240472112, + "step": 111445 + }, + { + "epoch": 18.181076672104403, + "grad_norm": 0.015159782022237778, + "learning_rate": 2.4989080497919593e-05, + "loss": 0.0007, + "num_input_tokens_seen": 240480560, + "step": 111450 + }, + { + "epoch": 18.18189233278956, + "grad_norm": 0.032956313341856, + "learning_rate": 2.496686414884941e-05, + "loss": 0.0026, + "num_input_tokens_seen": 240491376, + "step": 111455 + }, + { + "epoch": 18.182707993474715, + "grad_norm": 0.028668271377682686, + "learning_rate": 2.4944657426928306e-05, + "loss": 0.0006, + "num_input_tokens_seen": 240502384, + "step": 111460 + }, + { + "epoch": 18.18352365415987, + "grad_norm": 0.007430367171764374, + "learning_rate": 2.492246033260642e-05, + "loss": 0.0014, + "num_input_tokens_seen": 240511888, + "step": 111465 + }, + { + "epoch": 18.184339314845026, + "grad_norm": 0.0005121852736920118, + "learning_rate": 2.490027286633356e-05, + "loss": 0.0004, + "num_input_tokens_seen": 240523088, + "step": 111470 + }, + { + "epoch": 18.18515497553018, + "grad_norm": 0.01615333929657936, + "learning_rate": 2.487809502855931e-05, + "loss": 0.0022, + "num_input_tokens_seen": 240534192, + "step": 111475 + }, + { + "epoch": 18.185970636215334, + "grad_norm": 0.0005389000871218741, + "learning_rate": 2.4855926819733253e-05, + "loss": 0.0003, + "num_input_tokens_seen": 240545072, + "step": 111480 + }, + { + "epoch": 18.18678629690049, + "grad_norm": 0.015493880957365036, + "learning_rate": 2.4833768240304587e-05, + "loss": 0.0005, + "num_input_tokens_seen": 240556112, + "step": 111485 + }, + { + "epoch": 18.187601957585645, + "grad_norm": 0.0922309160232544, + "learning_rate": 2.48116192907224e-05, + "loss": 0.0033, + "num_input_tokens_seen": 240566896, + "step": 111490 + }, + { + "epoch": 18.1884176182708, + "grad_norm": 0.0005712299607694149, + "learning_rate": 2.4789479971435602e-05, + "loss": 0.0016, + "num_input_tokens_seen": 240578096, + "step": 111495 + }, + { + "epoch": 18.189233278955953, + "grad_norm": 0.00026665496989153326, + "learning_rate": 2.4767350282892788e-05, + "loss": 0.0001, + "num_input_tokens_seen": 240589712, + "step": 111500 + }, + { + "epoch": 18.19004893964111, + "grad_norm": 0.00017703999765217304, + "learning_rate": 2.4745230225542536e-05, + "loss": 0.0017, + "num_input_tokens_seen": 240599600, + "step": 111505 + }, + { + "epoch": 18.190864600326265, + "grad_norm": 9.171931742457673e-05, + "learning_rate": 2.472311979983305e-05, + "loss": 0.0001, + "num_input_tokens_seen": 240610768, + "step": 111510 + }, + { + "epoch": 18.19168026101142, + "grad_norm": 0.0030498059932142496, + "learning_rate": 2.470101900621252e-05, + "loss": 0.001, + "num_input_tokens_seen": 240621392, + "step": 111515 + }, + { + "epoch": 18.192495921696572, + "grad_norm": 0.02175014652311802, + "learning_rate": 2.4678927845128762e-05, + "loss": 0.0012, + "num_input_tokens_seen": 240633264, + "step": 111520 + }, + { + "epoch": 18.193311582381728, + "grad_norm": 0.0009540608734823763, + "learning_rate": 2.4656846317029524e-05, + "loss": 0.0002, + "num_input_tokens_seen": 240643664, + "step": 111525 + }, + { + "epoch": 18.194127243066884, + "grad_norm": 0.00011374294990673661, + "learning_rate": 2.463477442236234e-05, + "loss": 0.0001, + "num_input_tokens_seen": 240654192, + "step": 111530 + }, + { + "epoch": 18.19494290375204, + "grad_norm": 0.00024359769304282963, + "learning_rate": 2.4612712161574457e-05, + "loss": 0.0012, + "num_input_tokens_seen": 240665008, + "step": 111535 + }, + { + "epoch": 18.195758564437195, + "grad_norm": 0.00033563151373527944, + "learning_rate": 2.459065953511308e-05, + "loss": 0.0003, + "num_input_tokens_seen": 240676336, + "step": 111540 + }, + { + "epoch": 18.196574225122347, + "grad_norm": 0.0007685106247663498, + "learning_rate": 2.456861654342507e-05, + "loss": 0.0008, + "num_input_tokens_seen": 240687248, + "step": 111545 + }, + { + "epoch": 18.197389885807503, + "grad_norm": 6.707174179609865e-05, + "learning_rate": 2.454658318695713e-05, + "loss": 0.0006, + "num_input_tokens_seen": 240699568, + "step": 111550 + }, + { + "epoch": 18.19820554649266, + "grad_norm": 0.000572481716517359, + "learning_rate": 2.4524559466155838e-05, + "loss": 0.0023, + "num_input_tokens_seen": 240710640, + "step": 111555 + }, + { + "epoch": 18.199021207177815, + "grad_norm": 0.0190622229129076, + "learning_rate": 2.450254538146762e-05, + "loss": 0.0011, + "num_input_tokens_seen": 240721456, + "step": 111560 + }, + { + "epoch": 18.19983686786297, + "grad_norm": 0.0011266616638749838, + "learning_rate": 2.44805409333384e-05, + "loss": 0.0003, + "num_input_tokens_seen": 240732336, + "step": 111565 + }, + { + "epoch": 18.200652528548122, + "grad_norm": 0.02177298627793789, + "learning_rate": 2.445854612221432e-05, + "loss": 0.0015, + "num_input_tokens_seen": 240742736, + "step": 111570 + }, + { + "epoch": 18.201468189233278, + "grad_norm": 0.013846100308001041, + "learning_rate": 2.443656094854113e-05, + "loss": 0.0007, + "num_input_tokens_seen": 240753200, + "step": 111575 + }, + { + "epoch": 18.202283849918434, + "grad_norm": 0.0007878526230342686, + "learning_rate": 2.4414585412764255e-05, + "loss": 0.0007, + "num_input_tokens_seen": 240765392, + "step": 111580 + }, + { + "epoch": 18.20309951060359, + "grad_norm": 0.0033822732511907816, + "learning_rate": 2.4392619515329173e-05, + "loss": 0.0004, + "num_input_tokens_seen": 240777456, + "step": 111585 + }, + { + "epoch": 18.203915171288745, + "grad_norm": 0.0005126554751768708, + "learning_rate": 2.437066325668097e-05, + "loss": 0.0017, + "num_input_tokens_seen": 240788240, + "step": 111590 + }, + { + "epoch": 18.204730831973897, + "grad_norm": 0.8306630253791809, + "learning_rate": 2.434871663726468e-05, + "loss": 0.0264, + "num_input_tokens_seen": 240799376, + "step": 111595 + }, + { + "epoch": 18.205546492659053, + "grad_norm": 0.04039228335022926, + "learning_rate": 2.4326779657525055e-05, + "loss": 0.0681, + "num_input_tokens_seen": 240810800, + "step": 111600 + }, + { + "epoch": 18.20636215334421, + "grad_norm": 0.011307901702821255, + "learning_rate": 2.430485231790669e-05, + "loss": 0.0003, + "num_input_tokens_seen": 240822064, + "step": 111605 + }, + { + "epoch": 18.207177814029365, + "grad_norm": 0.0001125154085457325, + "learning_rate": 2.428293461885389e-05, + "loss": 0.0004, + "num_input_tokens_seen": 240832272, + "step": 111610 + }, + { + "epoch": 18.20799347471452, + "grad_norm": 0.0004823763156309724, + "learning_rate": 2.426102656081097e-05, + "loss": 0.0003, + "num_input_tokens_seen": 240842320, + "step": 111615 + }, + { + "epoch": 18.208809135399672, + "grad_norm": 0.0032255221158266068, + "learning_rate": 2.4239128144221857e-05, + "loss": 0.0007, + "num_input_tokens_seen": 240852240, + "step": 111620 + }, + { + "epoch": 18.209624796084828, + "grad_norm": 0.013869868591427803, + "learning_rate": 2.4217239369530354e-05, + "loss": 0.0004, + "num_input_tokens_seen": 240862544, + "step": 111625 + }, + { + "epoch": 18.210440456769984, + "grad_norm": 0.00011027485015802085, + "learning_rate": 2.4195360237180053e-05, + "loss": 0.0003, + "num_input_tokens_seen": 240872112, + "step": 111630 + }, + { + "epoch": 18.21125611745514, + "grad_norm": 0.020707515999674797, + "learning_rate": 2.417349074761438e-05, + "loss": 0.0012, + "num_input_tokens_seen": 240882352, + "step": 111635 + }, + { + "epoch": 18.212071778140295, + "grad_norm": 0.0003316039510536939, + "learning_rate": 2.4151630901276534e-05, + "loss": 0.0014, + "num_input_tokens_seen": 240893264, + "step": 111640 + }, + { + "epoch": 18.212887438825447, + "grad_norm": 0.00014401703083422035, + "learning_rate": 2.4129780698609606e-05, + "loss": 0.0002, + "num_input_tokens_seen": 240903952, + "step": 111645 + }, + { + "epoch": 18.213703099510603, + "grad_norm": 0.024156300351023674, + "learning_rate": 2.4107940140056294e-05, + "loss": 0.0007, + "num_input_tokens_seen": 240914768, + "step": 111650 + }, + { + "epoch": 18.21451876019576, + "grad_norm": 0.00023887238057795912, + "learning_rate": 2.4086109226059305e-05, + "loss": 0.0007, + "num_input_tokens_seen": 240926064, + "step": 111655 + }, + { + "epoch": 18.215334420880914, + "grad_norm": 0.005682197865098715, + "learning_rate": 2.4064287957061003e-05, + "loss": 0.0008, + "num_input_tokens_seen": 240936720, + "step": 111660 + }, + { + "epoch": 18.21615008156607, + "grad_norm": 0.00033835420617833734, + "learning_rate": 2.404247633350376e-05, + "loss": 0.0003, + "num_input_tokens_seen": 240947728, + "step": 111665 + }, + { + "epoch": 18.216965742251222, + "grad_norm": 0.0001965287228813395, + "learning_rate": 2.402067435582944e-05, + "loss": 0.0001, + "num_input_tokens_seen": 240958704, + "step": 111670 + }, + { + "epoch": 18.217781402936378, + "grad_norm": 0.0001592914923094213, + "learning_rate": 2.3998882024480085e-05, + "loss": 0.0002, + "num_input_tokens_seen": 240970832, + "step": 111675 + }, + { + "epoch": 18.218597063621534, + "grad_norm": 0.004611158277839422, + "learning_rate": 2.3977099339897112e-05, + "loss": 0.0008, + "num_input_tokens_seen": 240982448, + "step": 111680 + }, + { + "epoch": 18.21941272430669, + "grad_norm": 0.007273135241121054, + "learning_rate": 2.395532630252223e-05, + "loss": 0.0007, + "num_input_tokens_seen": 240993328, + "step": 111685 + }, + { + "epoch": 18.22022838499184, + "grad_norm": 0.10769865661859512, + "learning_rate": 2.393356291279647e-05, + "loss": 0.0113, + "num_input_tokens_seen": 241003856, + "step": 111690 + }, + { + "epoch": 18.221044045676997, + "grad_norm": 0.0004946508561260998, + "learning_rate": 2.391180917116109e-05, + "loss": 0.0001, + "num_input_tokens_seen": 241013616, + "step": 111695 + }, + { + "epoch": 18.221859706362153, + "grad_norm": 0.00012052438978571445, + "learning_rate": 2.389006507805669e-05, + "loss": 0.0001, + "num_input_tokens_seen": 241023312, + "step": 111700 + }, + { + "epoch": 18.22267536704731, + "grad_norm": 0.031420283019542694, + "learning_rate": 2.3868330633924295e-05, + "loss": 0.0065, + "num_input_tokens_seen": 241033776, + "step": 111705 + }, + { + "epoch": 18.223491027732464, + "grad_norm": 0.0005678536836057901, + "learning_rate": 2.3846605839204062e-05, + "loss": 0.0007, + "num_input_tokens_seen": 241044528, + "step": 111710 + }, + { + "epoch": 18.224306688417617, + "grad_norm": 0.0001636188681004569, + "learning_rate": 2.3824890694336467e-05, + "loss": 0.0002, + "num_input_tokens_seen": 241056144, + "step": 111715 + }, + { + "epoch": 18.225122349102772, + "grad_norm": 0.001963597722351551, + "learning_rate": 2.380318519976149e-05, + "loss": 0.0004, + "num_input_tokens_seen": 241066960, + "step": 111720 + }, + { + "epoch": 18.225938009787928, + "grad_norm": 0.0001720907457638532, + "learning_rate": 2.3781489355919117e-05, + "loss": 0.0007, + "num_input_tokens_seen": 241077744, + "step": 111725 + }, + { + "epoch": 18.226753670473084, + "grad_norm": 0.00010691180796129629, + "learning_rate": 2.375980316324894e-05, + "loss": 0.0003, + "num_input_tokens_seen": 241089328, + "step": 111730 + }, + { + "epoch": 18.22756933115824, + "grad_norm": 0.00128295982722193, + "learning_rate": 2.373812662219055e-05, + "loss": 0.0004, + "num_input_tokens_seen": 241100784, + "step": 111735 + }, + { + "epoch": 18.22838499184339, + "grad_norm": 0.011291184462606907, + "learning_rate": 2.3716459733183205e-05, + "loss": 0.0003, + "num_input_tokens_seen": 241111504, + "step": 111740 + }, + { + "epoch": 18.229200652528547, + "grad_norm": 0.0002385491825407371, + "learning_rate": 2.3694802496665945e-05, + "loss": 0.0001, + "num_input_tokens_seen": 241122864, + "step": 111745 + }, + { + "epoch": 18.230016313213703, + "grad_norm": 0.0025898965541273355, + "learning_rate": 2.367315491307781e-05, + "loss": 0.0003, + "num_input_tokens_seen": 241134128, + "step": 111750 + }, + { + "epoch": 18.23083197389886, + "grad_norm": 0.053422100841999054, + "learning_rate": 2.3651516982857448e-05, + "loss": 0.0018, + "num_input_tokens_seen": 241144912, + "step": 111755 + }, + { + "epoch": 18.231647634584014, + "grad_norm": 0.00037376375985331833, + "learning_rate": 2.362988870644339e-05, + "loss": 0.0002, + "num_input_tokens_seen": 241155696, + "step": 111760 + }, + { + "epoch": 18.232463295269167, + "grad_norm": 0.018608706071972847, + "learning_rate": 2.3608270084273853e-05, + "loss": 0.004, + "num_input_tokens_seen": 241166896, + "step": 111765 + }, + { + "epoch": 18.233278955954322, + "grad_norm": 0.0025842750910669565, + "learning_rate": 2.3586661116787255e-05, + "loss": 0.0024, + "num_input_tokens_seen": 241176880, + "step": 111770 + }, + { + "epoch": 18.234094616639478, + "grad_norm": 0.002329292008653283, + "learning_rate": 2.3565061804421195e-05, + "loss": 0.0039, + "num_input_tokens_seen": 241187568, + "step": 111775 + }, + { + "epoch": 18.234910277324634, + "grad_norm": 0.00013235141523182392, + "learning_rate": 2.3543472147613654e-05, + "loss": 0.0008, + "num_input_tokens_seen": 241198640, + "step": 111780 + }, + { + "epoch": 18.23572593800979, + "grad_norm": 0.01847231760621071, + "learning_rate": 2.3521892146801947e-05, + "loss": 0.0033, + "num_input_tokens_seen": 241209840, + "step": 111785 + }, + { + "epoch": 18.23654159869494, + "grad_norm": 0.004472263157367706, + "learning_rate": 2.350032180242373e-05, + "loss": 0.0004, + "num_input_tokens_seen": 241220720, + "step": 111790 + }, + { + "epoch": 18.237357259380097, + "grad_norm": 0.004683862905949354, + "learning_rate": 2.3478761114915814e-05, + "loss": 0.0125, + "num_input_tokens_seen": 241229744, + "step": 111795 + }, + { + "epoch": 18.238172920065253, + "grad_norm": 0.00271758995950222, + "learning_rate": 2.3457210084715462e-05, + "loss": 0.0004, + "num_input_tokens_seen": 241240976, + "step": 111800 + }, + { + "epoch": 18.23898858075041, + "grad_norm": 0.0001395432627759874, + "learning_rate": 2.3435668712259105e-05, + "loss": 0.0006, + "num_input_tokens_seen": 241250736, + "step": 111805 + }, + { + "epoch": 18.239804241435564, + "grad_norm": 0.004566899035125971, + "learning_rate": 2.341413699798367e-05, + "loss": 0.0034, + "num_input_tokens_seen": 241262160, + "step": 111810 + }, + { + "epoch": 18.240619902120716, + "grad_norm": 0.0009133021812886, + "learning_rate": 2.3392614942325196e-05, + "loss": 0.0004, + "num_input_tokens_seen": 241272112, + "step": 111815 + }, + { + "epoch": 18.241435562805872, + "grad_norm": 0.00015958191943354905, + "learning_rate": 2.3371102545720112e-05, + "loss": 0.0011, + "num_input_tokens_seen": 241283696, + "step": 111820 + }, + { + "epoch": 18.242251223491028, + "grad_norm": 0.0027808973100036383, + "learning_rate": 2.3349599808604182e-05, + "loss": 0.0007, + "num_input_tokens_seen": 241294032, + "step": 111825 + }, + { + "epoch": 18.243066884176184, + "grad_norm": 0.00016685556329321116, + "learning_rate": 2.332810673141339e-05, + "loss": 0.0012, + "num_input_tokens_seen": 241305296, + "step": 111830 + }, + { + "epoch": 18.24388254486134, + "grad_norm": 0.09901204705238342, + "learning_rate": 2.3306623314583108e-05, + "loss": 0.0017, + "num_input_tokens_seen": 241316176, + "step": 111835 + }, + { + "epoch": 18.24469820554649, + "grad_norm": 0.0011112043866887689, + "learning_rate": 2.3285149558548934e-05, + "loss": 0.0009, + "num_input_tokens_seen": 241326192, + "step": 111840 + }, + { + "epoch": 18.245513866231647, + "grad_norm": 0.0018237336771562696, + "learning_rate": 2.3263685463745854e-05, + "loss": 0.0001, + "num_input_tokens_seen": 241336016, + "step": 111845 + }, + { + "epoch": 18.246329526916803, + "grad_norm": 0.00126490811817348, + "learning_rate": 2.324223103060913e-05, + "loss": 0.0184, + "num_input_tokens_seen": 241345328, + "step": 111850 + }, + { + "epoch": 18.24714518760196, + "grad_norm": 0.005309413652867079, + "learning_rate": 2.322078625957319e-05, + "loss": 0.0007, + "num_input_tokens_seen": 241356304, + "step": 111855 + }, + { + "epoch": 18.247960848287114, + "grad_norm": 0.00023570338089484721, + "learning_rate": 2.319935115107302e-05, + "loss": 0.1068, + "num_input_tokens_seen": 241366704, + "step": 111860 + }, + { + "epoch": 18.248776508972266, + "grad_norm": 0.007995817810297012, + "learning_rate": 2.317792570554278e-05, + "loss": 0.0034, + "num_input_tokens_seen": 241377584, + "step": 111865 + }, + { + "epoch": 18.249592169657422, + "grad_norm": 0.004873568192124367, + "learning_rate": 2.3156509923416778e-05, + "loss": 0.0003, + "num_input_tokens_seen": 241389072, + "step": 111870 + }, + { + "epoch": 18.250407830342578, + "grad_norm": 0.007514289114624262, + "learning_rate": 2.3135103805129065e-05, + "loss": 0.001, + "num_input_tokens_seen": 241401488, + "step": 111875 + }, + { + "epoch": 18.251223491027734, + "grad_norm": 0.0007669845945201814, + "learning_rate": 2.31137073511134e-05, + "loss": 0.0006, + "num_input_tokens_seen": 241411088, + "step": 111880 + }, + { + "epoch": 18.252039151712886, + "grad_norm": 0.0006619488704018295, + "learning_rate": 2.3092320561803436e-05, + "loss": 0.0002, + "num_input_tokens_seen": 241422736, + "step": 111885 + }, + { + "epoch": 18.25285481239804, + "grad_norm": 0.02382073551416397, + "learning_rate": 2.3070943437632553e-05, + "loss": 0.001, + "num_input_tokens_seen": 241433360, + "step": 111890 + }, + { + "epoch": 18.253670473083197, + "grad_norm": 0.0049252319149672985, + "learning_rate": 2.3049575979034066e-05, + "loss": 0.0003, + "num_input_tokens_seen": 241444016, + "step": 111895 + }, + { + "epoch": 18.254486133768353, + "grad_norm": 0.0001665644667809829, + "learning_rate": 2.3028218186440964e-05, + "loss": 0.0008, + "num_input_tokens_seen": 241453232, + "step": 111900 + }, + { + "epoch": 18.25530179445351, + "grad_norm": 0.00012745970161631703, + "learning_rate": 2.3006870060286123e-05, + "loss": 0.0011, + "num_input_tokens_seen": 241464592, + "step": 111905 + }, + { + "epoch": 18.25611745513866, + "grad_norm": 0.0003851005167234689, + "learning_rate": 2.2985531601002084e-05, + "loss": 0.0028, + "num_input_tokens_seen": 241473744, + "step": 111910 + }, + { + "epoch": 18.256933115823816, + "grad_norm": 0.0012868442572653294, + "learning_rate": 2.2964202809021563e-05, + "loss": 0.0003, + "num_input_tokens_seen": 241484720, + "step": 111915 + }, + { + "epoch": 18.257748776508972, + "grad_norm": 0.020699461922049522, + "learning_rate": 2.2942883684776428e-05, + "loss": 0.0007, + "num_input_tokens_seen": 241495248, + "step": 111920 + }, + { + "epoch": 18.258564437194128, + "grad_norm": 0.0012420794228091836, + "learning_rate": 2.2921574228699116e-05, + "loss": 0.0001, + "num_input_tokens_seen": 241505104, + "step": 111925 + }, + { + "epoch": 18.259380097879284, + "grad_norm": 0.020401576533913612, + "learning_rate": 2.290027444122117e-05, + "loss": 0.0008, + "num_input_tokens_seen": 241515664, + "step": 111930 + }, + { + "epoch": 18.260195758564436, + "grad_norm": 0.001875039772130549, + "learning_rate": 2.2878984322774578e-05, + "loss": 0.0005, + "num_input_tokens_seen": 241526672, + "step": 111935 + }, + { + "epoch": 18.26101141924959, + "grad_norm": 0.008413786068558693, + "learning_rate": 2.2857703873790435e-05, + "loss": 0.0484, + "num_input_tokens_seen": 241538032, + "step": 111940 + }, + { + "epoch": 18.261827079934747, + "grad_norm": 0.003389539662748575, + "learning_rate": 2.2836433094700405e-05, + "loss": 0.0005, + "num_input_tokens_seen": 241549264, + "step": 111945 + }, + { + "epoch": 18.262642740619903, + "grad_norm": 0.00013239214604254812, + "learning_rate": 2.2815171985935246e-05, + "loss": 0.0006, + "num_input_tokens_seen": 241559504, + "step": 111950 + }, + { + "epoch": 18.26345840130506, + "grad_norm": 0.017234724014997482, + "learning_rate": 2.279392054792612e-05, + "loss": 0.0048, + "num_input_tokens_seen": 241570896, + "step": 111955 + }, + { + "epoch": 18.26427406199021, + "grad_norm": 0.04076504707336426, + "learning_rate": 2.277267878110345e-05, + "loss": 0.0006, + "num_input_tokens_seen": 241581776, + "step": 111960 + }, + { + "epoch": 18.265089722675366, + "grad_norm": 0.0005027551087550819, + "learning_rate": 2.275144668589796e-05, + "loss": 0.0017, + "num_input_tokens_seen": 241592240, + "step": 111965 + }, + { + "epoch": 18.265905383360522, + "grad_norm": 0.011663506738841534, + "learning_rate": 2.2730224262739687e-05, + "loss": 0.0007, + "num_input_tokens_seen": 241602096, + "step": 111970 + }, + { + "epoch": 18.266721044045678, + "grad_norm": 0.05251770094037056, + "learning_rate": 2.270901151205895e-05, + "loss": 0.001, + "num_input_tokens_seen": 241612016, + "step": 111975 + }, + { + "epoch": 18.267536704730833, + "grad_norm": 0.003341264557093382, + "learning_rate": 2.2687808434285585e-05, + "loss": 0.0013, + "num_input_tokens_seen": 241623280, + "step": 111980 + }, + { + "epoch": 18.268352365415986, + "grad_norm": 9.221860818797722e-05, + "learning_rate": 2.266661502984929e-05, + "loss": 0.0004, + "num_input_tokens_seen": 241633776, + "step": 111985 + }, + { + "epoch": 18.26916802610114, + "grad_norm": 0.033762503415346146, + "learning_rate": 2.264543129917962e-05, + "loss": 0.0005, + "num_input_tokens_seen": 241644176, + "step": 111990 + }, + { + "epoch": 18.269983686786297, + "grad_norm": 0.013073122128844261, + "learning_rate": 2.2624257242705838e-05, + "loss": 0.0004, + "num_input_tokens_seen": 241655792, + "step": 111995 + }, + { + "epoch": 18.270799347471453, + "grad_norm": 0.0031560033094137907, + "learning_rate": 2.2603092860857045e-05, + "loss": 0.0003, + "num_input_tokens_seen": 241667568, + "step": 112000 + }, + { + "epoch": 18.27161500815661, + "grad_norm": 9.263480023946613e-05, + "learning_rate": 2.258193815406223e-05, + "loss": 0.0002, + "num_input_tokens_seen": 241677584, + "step": 112005 + }, + { + "epoch": 18.27243066884176, + "grad_norm": 0.00013640511315315962, + "learning_rate": 2.2560793122750056e-05, + "loss": 0.0004, + "num_input_tokens_seen": 241688208, + "step": 112010 + }, + { + "epoch": 18.273246329526916, + "grad_norm": 0.0005320285563357174, + "learning_rate": 2.253965776734912e-05, + "loss": 0.0005, + "num_input_tokens_seen": 241697936, + "step": 112015 + }, + { + "epoch": 18.274061990212072, + "grad_norm": 0.00047322444152086973, + "learning_rate": 2.251853208828769e-05, + "loss": 0.0002, + "num_input_tokens_seen": 241708624, + "step": 112020 + }, + { + "epoch": 18.274877650897228, + "grad_norm": 0.020302293822169304, + "learning_rate": 2.2497416085993983e-05, + "loss": 0.0004, + "num_input_tokens_seen": 241718672, + "step": 112025 + }, + { + "epoch": 18.275693311582383, + "grad_norm": 0.001144544337876141, + "learning_rate": 2.247630976089582e-05, + "loss": 0.0003, + "num_input_tokens_seen": 241728944, + "step": 112030 + }, + { + "epoch": 18.276508972267536, + "grad_norm": 0.0009663297678343952, + "learning_rate": 2.245521311342108e-05, + "loss": 0.0003, + "num_input_tokens_seen": 241740592, + "step": 112035 + }, + { + "epoch": 18.27732463295269, + "grad_norm": 0.05448532477021217, + "learning_rate": 2.2434126143997258e-05, + "loss": 0.0009, + "num_input_tokens_seen": 241751568, + "step": 112040 + }, + { + "epoch": 18.278140293637847, + "grad_norm": 0.0002787858829833567, + "learning_rate": 2.241304885305162e-05, + "loss": 0.0004, + "num_input_tokens_seen": 241763024, + "step": 112045 + }, + { + "epoch": 18.278955954323003, + "grad_norm": 0.004174410365521908, + "learning_rate": 2.2391981241011495e-05, + "loss": 0.0011, + "num_input_tokens_seen": 241774128, + "step": 112050 + }, + { + "epoch": 18.27977161500816, + "grad_norm": 0.009915916249155998, + "learning_rate": 2.2370923308303702e-05, + "loss": 0.0033, + "num_input_tokens_seen": 241784304, + "step": 112055 + }, + { + "epoch": 18.28058727569331, + "grad_norm": 0.0026495119091123343, + "learning_rate": 2.234987505535513e-05, + "loss": 0.0006, + "num_input_tokens_seen": 241794608, + "step": 112060 + }, + { + "epoch": 18.281402936378466, + "grad_norm": 0.142722487449646, + "learning_rate": 2.2328836482592208e-05, + "loss": 0.0013, + "num_input_tokens_seen": 241804784, + "step": 112065 + }, + { + "epoch": 18.282218597063622, + "grad_norm": 0.0017449831357225776, + "learning_rate": 2.2307807590441486e-05, + "loss": 0.0014, + "num_input_tokens_seen": 241815824, + "step": 112070 + }, + { + "epoch": 18.283034257748778, + "grad_norm": 0.003926493227481842, + "learning_rate": 2.2286788379328905e-05, + "loss": 0.0006, + "num_input_tokens_seen": 241827120, + "step": 112075 + }, + { + "epoch": 18.28384991843393, + "grad_norm": 0.001307156402617693, + "learning_rate": 2.2265778849680673e-05, + "loss": 0.0006, + "num_input_tokens_seen": 241838448, + "step": 112080 + }, + { + "epoch": 18.284665579119086, + "grad_norm": 0.00039811068563722074, + "learning_rate": 2.2244779001922457e-05, + "loss": 0.0007, + "num_input_tokens_seen": 241849808, + "step": 112085 + }, + { + "epoch": 18.28548123980424, + "grad_norm": 0.00015886998153291643, + "learning_rate": 2.222378883647985e-05, + "loss": 0.0002, + "num_input_tokens_seen": 241859792, + "step": 112090 + }, + { + "epoch": 18.286296900489397, + "grad_norm": 0.0006618754705414176, + "learning_rate": 2.2202808353778302e-05, + "loss": 0.0033, + "num_input_tokens_seen": 241870736, + "step": 112095 + }, + { + "epoch": 18.287112561174553, + "grad_norm": 0.0007651000050827861, + "learning_rate": 2.2181837554242968e-05, + "loss": 0.0018, + "num_input_tokens_seen": 241880816, + "step": 112100 + }, + { + "epoch": 18.287928221859705, + "grad_norm": 0.0021941522136330605, + "learning_rate": 2.216087643829884e-05, + "loss": 0.0002, + "num_input_tokens_seen": 241890960, + "step": 112105 + }, + { + "epoch": 18.28874388254486, + "grad_norm": 0.04327227547764778, + "learning_rate": 2.213992500637074e-05, + "loss": 0.0008, + "num_input_tokens_seen": 241902288, + "step": 112110 + }, + { + "epoch": 18.289559543230016, + "grad_norm": 0.0029766445513814688, + "learning_rate": 2.211898325888323e-05, + "loss": 0.0006, + "num_input_tokens_seen": 241912368, + "step": 112115 + }, + { + "epoch": 18.290375203915172, + "grad_norm": 0.0008847187855280936, + "learning_rate": 2.2098051196260794e-05, + "loss": 0.0002, + "num_input_tokens_seen": 241922096, + "step": 112120 + }, + { + "epoch": 18.291190864600328, + "grad_norm": 0.00020161761494819075, + "learning_rate": 2.207712881892765e-05, + "loss": 0.0003, + "num_input_tokens_seen": 241932944, + "step": 112125 + }, + { + "epoch": 18.29200652528548, + "grad_norm": 6.845914322184399e-05, + "learning_rate": 2.205621612730774e-05, + "loss": 0.0005, + "num_input_tokens_seen": 241943760, + "step": 112130 + }, + { + "epoch": 18.292822185970635, + "grad_norm": 0.37388888001441956, + "learning_rate": 2.2035313121824884e-05, + "loss": 0.003, + "num_input_tokens_seen": 241956080, + "step": 112135 + }, + { + "epoch": 18.29363784665579, + "grad_norm": 0.036025431007146835, + "learning_rate": 2.2014419802902808e-05, + "loss": 0.0009, + "num_input_tokens_seen": 241966064, + "step": 112140 + }, + { + "epoch": 18.294453507340947, + "grad_norm": 0.05812336876988411, + "learning_rate": 2.1993536170964832e-05, + "loss": 0.0019, + "num_input_tokens_seen": 241976240, + "step": 112145 + }, + { + "epoch": 18.295269168026103, + "grad_norm": 0.007383849937468767, + "learning_rate": 2.1972662226434292e-05, + "loss": 0.0003, + "num_input_tokens_seen": 241985840, + "step": 112150 + }, + { + "epoch": 18.296084828711255, + "grad_norm": 0.019497990608215332, + "learning_rate": 2.1951797969734178e-05, + "loss": 0.0335, + "num_input_tokens_seen": 241997616, + "step": 112155 + }, + { + "epoch": 18.29690048939641, + "grad_norm": 0.0003177412727382034, + "learning_rate": 2.193094340128726e-05, + "loss": 0.0001, + "num_input_tokens_seen": 242007504, + "step": 112160 + }, + { + "epoch": 18.297716150081566, + "grad_norm": 0.03221796452999115, + "learning_rate": 2.191009852151632e-05, + "loss": 0.0006, + "num_input_tokens_seen": 242018096, + "step": 112165 + }, + { + "epoch": 18.298531810766722, + "grad_norm": 0.026434438303112984, + "learning_rate": 2.188926333084368e-05, + "loss": 0.0018, + "num_input_tokens_seen": 242029264, + "step": 112170 + }, + { + "epoch": 18.299347471451878, + "grad_norm": 0.0003537992888595909, + "learning_rate": 2.186843782969167e-05, + "loss": 0.0002, + "num_input_tokens_seen": 242039408, + "step": 112175 + }, + { + "epoch": 18.30016313213703, + "grad_norm": 0.0718768835067749, + "learning_rate": 2.1847622018482283e-05, + "loss": 0.0014, + "num_input_tokens_seen": 242049904, + "step": 112180 + }, + { + "epoch": 18.300978792822185, + "grad_norm": 0.05533561483025551, + "learning_rate": 2.182681589763741e-05, + "loss": 0.001, + "num_input_tokens_seen": 242060848, + "step": 112185 + }, + { + "epoch": 18.30179445350734, + "grad_norm": 0.0006087366491556168, + "learning_rate": 2.1806019467578765e-05, + "loss": 0.0029, + "num_input_tokens_seen": 242071216, + "step": 112190 + }, + { + "epoch": 18.302610114192497, + "grad_norm": 0.0007219164399430156, + "learning_rate": 2.1785232728727734e-05, + "loss": 0.0006, + "num_input_tokens_seen": 242083408, + "step": 112195 + }, + { + "epoch": 18.303425774877653, + "grad_norm": 0.0025687534362077713, + "learning_rate": 2.1764455681505645e-05, + "loss": 0.0007, + "num_input_tokens_seen": 242093680, + "step": 112200 + }, + { + "epoch": 18.304241435562805, + "grad_norm": 0.0004158923402428627, + "learning_rate": 2.1743688326333555e-05, + "loss": 0.002, + "num_input_tokens_seen": 242104752, + "step": 112205 + }, + { + "epoch": 18.30505709624796, + "grad_norm": 0.00020183775632176548, + "learning_rate": 2.1722930663632344e-05, + "loss": 0.0001, + "num_input_tokens_seen": 242114064, + "step": 112210 + }, + { + "epoch": 18.305872756933116, + "grad_norm": 0.006387508474290371, + "learning_rate": 2.1702182693822625e-05, + "loss": 0.0005, + "num_input_tokens_seen": 242125840, + "step": 112215 + }, + { + "epoch": 18.306688417618272, + "grad_norm": 0.0001686208270257339, + "learning_rate": 2.1681444417325004e-05, + "loss": 0.0002, + "num_input_tokens_seen": 242137680, + "step": 112220 + }, + { + "epoch": 18.307504078303428, + "grad_norm": 0.0012397938407957554, + "learning_rate": 2.166071583455964e-05, + "loss": 0.0002, + "num_input_tokens_seen": 242148496, + "step": 112225 + }, + { + "epoch": 18.30831973898858, + "grad_norm": 0.0014254723209887743, + "learning_rate": 2.1639996945946706e-05, + "loss": 0.0036, + "num_input_tokens_seen": 242158992, + "step": 112230 + }, + { + "epoch": 18.309135399673735, + "grad_norm": 0.12355409562587738, + "learning_rate": 2.1619287751906135e-05, + "loss": 0.0016, + "num_input_tokens_seen": 242168720, + "step": 112235 + }, + { + "epoch": 18.30995106035889, + "grad_norm": 0.00024456268874928355, + "learning_rate": 2.1598588252857486e-05, + "loss": 0.0008, + "num_input_tokens_seen": 242181168, + "step": 112240 + }, + { + "epoch": 18.310766721044047, + "grad_norm": 0.00032808969262987375, + "learning_rate": 2.157789844922037e-05, + "loss": 0.0001, + "num_input_tokens_seen": 242192656, + "step": 112245 + }, + { + "epoch": 18.3115823817292, + "grad_norm": 0.00030544534092769027, + "learning_rate": 2.1557218341414055e-05, + "loss": 0.001, + "num_input_tokens_seen": 242203088, + "step": 112250 + }, + { + "epoch": 18.312398042414355, + "grad_norm": 0.0003525118518155068, + "learning_rate": 2.1536547929857707e-05, + "loss": 0.0024, + "num_input_tokens_seen": 242213968, + "step": 112255 + }, + { + "epoch": 18.31321370309951, + "grad_norm": 0.0013165242271497846, + "learning_rate": 2.1515887214970165e-05, + "loss": 0.0012, + "num_input_tokens_seen": 242223952, + "step": 112260 + }, + { + "epoch": 18.314029363784666, + "grad_norm": 0.03705214336514473, + "learning_rate": 2.1495236197170143e-05, + "loss": 0.0007, + "num_input_tokens_seen": 242234800, + "step": 112265 + }, + { + "epoch": 18.31484502446982, + "grad_norm": 0.000305120280245319, + "learning_rate": 2.1474594876876198e-05, + "loss": 0.0001, + "num_input_tokens_seen": 242246064, + "step": 112270 + }, + { + "epoch": 18.315660685154974, + "grad_norm": 0.0022705693263560534, + "learning_rate": 2.1453963254506604e-05, + "loss": 0.003, + "num_input_tokens_seen": 242257232, + "step": 112275 + }, + { + "epoch": 18.31647634584013, + "grad_norm": 0.00741948839277029, + "learning_rate": 2.1433341330479583e-05, + "loss": 0.0005, + "num_input_tokens_seen": 242268304, + "step": 112280 + }, + { + "epoch": 18.317292006525285, + "grad_norm": 0.005444641690701246, + "learning_rate": 2.141272910521297e-05, + "loss": 0.0011, + "num_input_tokens_seen": 242279408, + "step": 112285 + }, + { + "epoch": 18.31810766721044, + "grad_norm": 0.001036886009387672, + "learning_rate": 2.1392126579124536e-05, + "loss": 0.0007, + "num_input_tokens_seen": 242289520, + "step": 112290 + }, + { + "epoch": 18.318923327895597, + "grad_norm": 0.0009344645077362657, + "learning_rate": 2.1371533752631844e-05, + "loss": 0.0006, + "num_input_tokens_seen": 242300048, + "step": 112295 + }, + { + "epoch": 18.31973898858075, + "grad_norm": 0.0002540464047342539, + "learning_rate": 2.135095062615211e-05, + "loss": 0.0007, + "num_input_tokens_seen": 242311152, + "step": 112300 + }, + { + "epoch": 18.320554649265905, + "grad_norm": 0.001426513772457838, + "learning_rate": 2.1330377200102723e-05, + "loss": 0.0002, + "num_input_tokens_seen": 242321808, + "step": 112305 + }, + { + "epoch": 18.32137030995106, + "grad_norm": 0.00020313957065809518, + "learning_rate": 2.130981347490035e-05, + "loss": 0.0032, + "num_input_tokens_seen": 242333840, + "step": 112310 + }, + { + "epoch": 18.322185970636216, + "grad_norm": 0.000228429475100711, + "learning_rate": 2.1289259450961995e-05, + "loss": 0.0124, + "num_input_tokens_seen": 242345680, + "step": 112315 + }, + { + "epoch": 18.32300163132137, + "grad_norm": 0.020780308172106743, + "learning_rate": 2.1268715128703932e-05, + "loss": 0.0004, + "num_input_tokens_seen": 242356624, + "step": 112320 + }, + { + "epoch": 18.323817292006524, + "grad_norm": 0.017772536724805832, + "learning_rate": 2.124818050854277e-05, + "loss": 0.0004, + "num_input_tokens_seen": 242368144, + "step": 112325 + }, + { + "epoch": 18.32463295269168, + "grad_norm": 0.006072433199733496, + "learning_rate": 2.122765559089451e-05, + "loss": 0.1445, + "num_input_tokens_seen": 242379056, + "step": 112330 + }, + { + "epoch": 18.325448613376835, + "grad_norm": 0.02361849509179592, + "learning_rate": 2.1207140376175214e-05, + "loss": 0.0066, + "num_input_tokens_seen": 242390096, + "step": 112335 + }, + { + "epoch": 18.32626427406199, + "grad_norm": 0.00016288089682348073, + "learning_rate": 2.1186634864800603e-05, + "loss": 0.0007, + "num_input_tokens_seen": 242400464, + "step": 112340 + }, + { + "epoch": 18.327079934747147, + "grad_norm": 0.003500057850033045, + "learning_rate": 2.116613905718623e-05, + "loss": 0.0003, + "num_input_tokens_seen": 242410192, + "step": 112345 + }, + { + "epoch": 18.3278955954323, + "grad_norm": 0.06170036271214485, + "learning_rate": 2.114565295374754e-05, + "loss": 0.0035, + "num_input_tokens_seen": 242419088, + "step": 112350 + }, + { + "epoch": 18.328711256117455, + "grad_norm": 0.0003199071215931326, + "learning_rate": 2.112517655489965e-05, + "loss": 0.0001, + "num_input_tokens_seen": 242429808, + "step": 112355 + }, + { + "epoch": 18.32952691680261, + "grad_norm": 0.022327788174152374, + "learning_rate": 2.110470986105756e-05, + "loss": 0.0011, + "num_input_tokens_seen": 242441232, + "step": 112360 + }, + { + "epoch": 18.330342577487766, + "grad_norm": 0.010179494507610798, + "learning_rate": 2.1084252872636046e-05, + "loss": 0.0007, + "num_input_tokens_seen": 242452144, + "step": 112365 + }, + { + "epoch": 18.33115823817292, + "grad_norm": 0.00040118867764249444, + "learning_rate": 2.1063805590049667e-05, + "loss": 0.0019, + "num_input_tokens_seen": 242463472, + "step": 112370 + }, + { + "epoch": 18.331973898858074, + "grad_norm": 0.002735297428444028, + "learning_rate": 2.1043368013712872e-05, + "loss": 0.0002, + "num_input_tokens_seen": 242474640, + "step": 112375 + }, + { + "epoch": 18.33278955954323, + "grad_norm": 0.0024701044894754887, + "learning_rate": 2.102294014403977e-05, + "loss": 0.0003, + "num_input_tokens_seen": 242486576, + "step": 112380 + }, + { + "epoch": 18.333605220228385, + "grad_norm": 0.0010051358258351684, + "learning_rate": 2.1002521981444477e-05, + "loss": 0.0002, + "num_input_tokens_seen": 242498992, + "step": 112385 + }, + { + "epoch": 18.33442088091354, + "grad_norm": 0.0018813575152307749, + "learning_rate": 2.0982113526340662e-05, + "loss": 0.0003, + "num_input_tokens_seen": 242510992, + "step": 112390 + }, + { + "epoch": 18.335236541598697, + "grad_norm": 0.0007130947196856141, + "learning_rate": 2.0961714779142048e-05, + "loss": 0.0023, + "num_input_tokens_seen": 242521904, + "step": 112395 + }, + { + "epoch": 18.33605220228385, + "grad_norm": 0.004652603529393673, + "learning_rate": 2.0941325740261975e-05, + "loss": 0.0004, + "num_input_tokens_seen": 242532208, + "step": 112400 + }, + { + "epoch": 18.336867862969005, + "grad_norm": 0.0018975748680531979, + "learning_rate": 2.0920946410113604e-05, + "loss": 0.0012, + "num_input_tokens_seen": 242542192, + "step": 112405 + }, + { + "epoch": 18.33768352365416, + "grad_norm": 0.005432966630905867, + "learning_rate": 2.0900576789110116e-05, + "loss": 0.0004, + "num_input_tokens_seen": 242553712, + "step": 112410 + }, + { + "epoch": 18.338499184339316, + "grad_norm": 0.0006149554974399507, + "learning_rate": 2.0880216877664116e-05, + "loss": 0.0001, + "num_input_tokens_seen": 242564240, + "step": 112415 + }, + { + "epoch": 18.339314845024468, + "grad_norm": 0.004233523737639189, + "learning_rate": 2.0859866676188445e-05, + "loss": 0.0005, + "num_input_tokens_seen": 242576304, + "step": 112420 + }, + { + "epoch": 18.340130505709624, + "grad_norm": 0.00025609773001633584, + "learning_rate": 2.083952618509527e-05, + "loss": 0.0006, + "num_input_tokens_seen": 242587376, + "step": 112425 + }, + { + "epoch": 18.34094616639478, + "grad_norm": 0.0008060354157350957, + "learning_rate": 2.0819195404797098e-05, + "loss": 0.0001, + "num_input_tokens_seen": 242596528, + "step": 112430 + }, + { + "epoch": 18.341761827079935, + "grad_norm": 0.013853384181857109, + "learning_rate": 2.0798874335705707e-05, + "loss": 0.0013, + "num_input_tokens_seen": 242606704, + "step": 112435 + }, + { + "epoch": 18.34257748776509, + "grad_norm": 0.05319775268435478, + "learning_rate": 2.077856297823316e-05, + "loss": 0.0013, + "num_input_tokens_seen": 242616528, + "step": 112440 + }, + { + "epoch": 18.343393148450243, + "grad_norm": 0.0005019660457037389, + "learning_rate": 2.0758261332790796e-05, + "loss": 0.0009, + "num_input_tokens_seen": 242628176, + "step": 112445 + }, + { + "epoch": 18.3442088091354, + "grad_norm": 0.02034541592001915, + "learning_rate": 2.0737969399790392e-05, + "loss": 0.0006, + "num_input_tokens_seen": 242638576, + "step": 112450 + }, + { + "epoch": 18.345024469820554, + "grad_norm": 0.00020141320419497788, + "learning_rate": 2.0717687179642896e-05, + "loss": 0.001, + "num_input_tokens_seen": 242650256, + "step": 112455 + }, + { + "epoch": 18.34584013050571, + "grad_norm": 0.00015041821461636573, + "learning_rate": 2.0697414672759596e-05, + "loss": 0.0007, + "num_input_tokens_seen": 242662384, + "step": 112460 + }, + { + "epoch": 18.346655791190866, + "grad_norm": 0.00010680585546651855, + "learning_rate": 2.0677151879551103e-05, + "loss": 0.0008, + "num_input_tokens_seen": 242671952, + "step": 112465 + }, + { + "epoch": 18.347471451876018, + "grad_norm": 0.013491165824234486, + "learning_rate": 2.0656898800428313e-05, + "loss": 0.0013, + "num_input_tokens_seen": 242683184, + "step": 112470 + }, + { + "epoch": 18.348287112561174, + "grad_norm": 0.01424252800643444, + "learning_rate": 2.0636655435801455e-05, + "loss": 0.0013, + "num_input_tokens_seen": 242693616, + "step": 112475 + }, + { + "epoch": 18.34910277324633, + "grad_norm": 0.0018390337936580181, + "learning_rate": 2.061642178608092e-05, + "loss": 0.0014, + "num_input_tokens_seen": 242705360, + "step": 112480 + }, + { + "epoch": 18.349918433931485, + "grad_norm": 0.0003951316175516695, + "learning_rate": 2.0596197851676768e-05, + "loss": 0.0012, + "num_input_tokens_seen": 242716528, + "step": 112485 + }, + { + "epoch": 18.35073409461664, + "grad_norm": 0.00025501978234387934, + "learning_rate": 2.057598363299884e-05, + "loss": 0.0035, + "num_input_tokens_seen": 242727888, + "step": 112490 + }, + { + "epoch": 18.351549755301793, + "grad_norm": 0.000439140887465328, + "learning_rate": 2.055577913045675e-05, + "loss": 0.0009, + "num_input_tokens_seen": 242738832, + "step": 112495 + }, + { + "epoch": 18.35236541598695, + "grad_norm": 0.0002477615780662745, + "learning_rate": 2.0535584344460066e-05, + "loss": 0.0002, + "num_input_tokens_seen": 242749776, + "step": 112500 + }, + { + "epoch": 18.353181076672104, + "grad_norm": 0.042442336678504944, + "learning_rate": 2.0515399275417958e-05, + "loss": 0.034, + "num_input_tokens_seen": 242759536, + "step": 112505 + }, + { + "epoch": 18.35399673735726, + "grad_norm": 0.00012584026262629777, + "learning_rate": 2.0495223923739593e-05, + "loss": 0.0002, + "num_input_tokens_seen": 242770192, + "step": 112510 + }, + { + "epoch": 18.354812398042416, + "grad_norm": 0.00017816043691709638, + "learning_rate": 2.0475058289833815e-05, + "loss": 0.0002, + "num_input_tokens_seen": 242781936, + "step": 112515 + }, + { + "epoch": 18.355628058727568, + "grad_norm": 0.0008547751349397004, + "learning_rate": 2.045490237410924e-05, + "loss": 0.0001, + "num_input_tokens_seen": 242794352, + "step": 112520 + }, + { + "epoch": 18.356443719412724, + "grad_norm": 0.004768925253301859, + "learning_rate": 2.043475617697449e-05, + "loss": 0.0006, + "num_input_tokens_seen": 242805488, + "step": 112525 + }, + { + "epoch": 18.35725938009788, + "grad_norm": 0.0006921677268110216, + "learning_rate": 2.0414619698837677e-05, + "loss": 0.0001, + "num_input_tokens_seen": 242815280, + "step": 112530 + }, + { + "epoch": 18.358075040783035, + "grad_norm": 0.004409309942275286, + "learning_rate": 2.0394492940107144e-05, + "loss": 0.0005, + "num_input_tokens_seen": 242826064, + "step": 112535 + }, + { + "epoch": 18.35889070146819, + "grad_norm": 0.032236918807029724, + "learning_rate": 2.0374375901190456e-05, + "loss": 0.0005, + "num_input_tokens_seen": 242836592, + "step": 112540 + }, + { + "epoch": 18.359706362153343, + "grad_norm": 0.003828071290627122, + "learning_rate": 2.0354268582495673e-05, + "loss": 0.0001, + "num_input_tokens_seen": 242846736, + "step": 112545 + }, + { + "epoch": 18.3605220228385, + "grad_norm": 0.33862701058387756, + "learning_rate": 2.0334170984429966e-05, + "loss": 0.0077, + "num_input_tokens_seen": 242855952, + "step": 112550 + }, + { + "epoch": 18.361337683523654, + "grad_norm": 0.0021652725990861654, + "learning_rate": 2.0314083107400904e-05, + "loss": 0.0002, + "num_input_tokens_seen": 242867824, + "step": 112555 + }, + { + "epoch": 18.36215334420881, + "grad_norm": 0.11023194342851639, + "learning_rate": 2.0294004951815324e-05, + "loss": 0.0027, + "num_input_tokens_seen": 242879920, + "step": 112560 + }, + { + "epoch": 18.362969004893966, + "grad_norm": 0.03780139982700348, + "learning_rate": 2.027393651808046e-05, + "loss": 0.0006, + "num_input_tokens_seen": 242890064, + "step": 112565 + }, + { + "epoch": 18.363784665579118, + "grad_norm": 0.06482169777154922, + "learning_rate": 2.0253877806602648e-05, + "loss": 0.0021, + "num_input_tokens_seen": 242899472, + "step": 112570 + }, + { + "epoch": 18.364600326264274, + "grad_norm": 0.007225289940834045, + "learning_rate": 2.0233828817788792e-05, + "loss": 0.0024, + "num_input_tokens_seen": 242911184, + "step": 112575 + }, + { + "epoch": 18.36541598694943, + "grad_norm": 0.0003011747030541301, + "learning_rate": 2.0213789552044893e-05, + "loss": 0.0003, + "num_input_tokens_seen": 242921008, + "step": 112580 + }, + { + "epoch": 18.366231647634585, + "grad_norm": 0.00018467400514055043, + "learning_rate": 2.0193760009777295e-05, + "loss": 0.0021, + "num_input_tokens_seen": 242930800, + "step": 112585 + }, + { + "epoch": 18.36704730831974, + "grad_norm": 0.0009470807272009552, + "learning_rate": 2.0173740191391732e-05, + "loss": 0.0006, + "num_input_tokens_seen": 242942704, + "step": 112590 + }, + { + "epoch": 18.367862969004893, + "grad_norm": 0.000264754198724404, + "learning_rate": 2.0153730097294153e-05, + "loss": 0.0001, + "num_input_tokens_seen": 242955504, + "step": 112595 + }, + { + "epoch": 18.36867862969005, + "grad_norm": 0.00023852323647588491, + "learning_rate": 2.0133729727889794e-05, + "loss": 0.0027, + "num_input_tokens_seen": 242966256, + "step": 112600 + }, + { + "epoch": 18.369494290375204, + "grad_norm": 0.014845756813883781, + "learning_rate": 2.0113739083584327e-05, + "loss": 0.0005, + "num_input_tokens_seen": 242977776, + "step": 112605 + }, + { + "epoch": 18.37030995106036, + "grad_norm": 0.00024399602261837572, + "learning_rate": 2.0093758164782595e-05, + "loss": 0.0002, + "num_input_tokens_seen": 242987536, + "step": 112610 + }, + { + "epoch": 18.371125611745512, + "grad_norm": 0.0001967293646885082, + "learning_rate": 2.0073786971889662e-05, + "loss": 0.0001, + "num_input_tokens_seen": 242998416, + "step": 112615 + }, + { + "epoch": 18.371941272430668, + "grad_norm": 0.00012221001088619232, + "learning_rate": 2.0053825505310318e-05, + "loss": 0.0002, + "num_input_tokens_seen": 243009232, + "step": 112620 + }, + { + "epoch": 18.372756933115824, + "grad_norm": 0.0009234918979927897, + "learning_rate": 2.0033873765449018e-05, + "loss": 0.0011, + "num_input_tokens_seen": 243021392, + "step": 112625 + }, + { + "epoch": 18.37357259380098, + "grad_norm": 0.0004698358825407922, + "learning_rate": 2.0013931752710214e-05, + "loss": 0.0003, + "num_input_tokens_seen": 243031632, + "step": 112630 + }, + { + "epoch": 18.374388254486135, + "grad_norm": 0.14206872880458832, + "learning_rate": 1.9993999467497913e-05, + "loss": 0.0027, + "num_input_tokens_seen": 243043184, + "step": 112635 + }, + { + "epoch": 18.375203915171287, + "grad_norm": 0.000202857336262241, + "learning_rate": 1.9974076910216188e-05, + "loss": 0.001, + "num_input_tokens_seen": 243054416, + "step": 112640 + }, + { + "epoch": 18.376019575856443, + "grad_norm": 0.0011092222994193435, + "learning_rate": 1.995416408126871e-05, + "loss": 0.0019, + "num_input_tokens_seen": 243065264, + "step": 112645 + }, + { + "epoch": 18.3768352365416, + "grad_norm": 0.0012072621611878276, + "learning_rate": 1.9934260981059103e-05, + "loss": 0.0003, + "num_input_tokens_seen": 243075632, + "step": 112650 + }, + { + "epoch": 18.377650897226754, + "grad_norm": 0.0012436105171218514, + "learning_rate": 1.9914367609990713e-05, + "loss": 0.0002, + "num_input_tokens_seen": 243086832, + "step": 112655 + }, + { + "epoch": 18.37846655791191, + "grad_norm": 0.0002832302125170827, + "learning_rate": 1.9894483968466715e-05, + "loss": 0.0003, + "num_input_tokens_seen": 243097584, + "step": 112660 + }, + { + "epoch": 18.379282218597062, + "grad_norm": 0.000622832216322422, + "learning_rate": 1.9874610056890007e-05, + "loss": 0.0002, + "num_input_tokens_seen": 243108816, + "step": 112665 + }, + { + "epoch": 18.380097879282218, + "grad_norm": 0.21556414663791656, + "learning_rate": 1.9854745875663438e-05, + "loss": 0.0062, + "num_input_tokens_seen": 243119056, + "step": 112670 + }, + { + "epoch": 18.380913539967374, + "grad_norm": 0.00017058798403013498, + "learning_rate": 1.983489142518946e-05, + "loss": 0.001, + "num_input_tokens_seen": 243129840, + "step": 112675 + }, + { + "epoch": 18.38172920065253, + "grad_norm": 0.0009094258421100676, + "learning_rate": 1.9815046705870697e-05, + "loss": 0.0037, + "num_input_tokens_seen": 243140336, + "step": 112680 + }, + { + "epoch": 18.382544861337685, + "grad_norm": 0.0006058274884708226, + "learning_rate": 1.979521171810905e-05, + "loss": 0.0021, + "num_input_tokens_seen": 243151344, + "step": 112685 + }, + { + "epoch": 18.383360522022837, + "grad_norm": 0.02099059708416462, + "learning_rate": 1.9775386462306756e-05, + "loss": 0.0013, + "num_input_tokens_seen": 243162832, + "step": 112690 + }, + { + "epoch": 18.384176182707993, + "grad_norm": 0.00020450733427423984, + "learning_rate": 1.9755570938865263e-05, + "loss": 0.0002, + "num_input_tokens_seen": 243173072, + "step": 112695 + }, + { + "epoch": 18.38499184339315, + "grad_norm": 0.00023555538791697472, + "learning_rate": 1.9735765148186536e-05, + "loss": 0.033, + "num_input_tokens_seen": 243184144, + "step": 112700 + }, + { + "epoch": 18.385807504078304, + "grad_norm": 0.0008784665260463953, + "learning_rate": 1.9715969090671693e-05, + "loss": 0.0003, + "num_input_tokens_seen": 243196496, + "step": 112705 + }, + { + "epoch": 18.38662316476346, + "grad_norm": 0.0002827657444868237, + "learning_rate": 1.969618276672208e-05, + "loss": 0.006, + "num_input_tokens_seen": 243206416, + "step": 112710 + }, + { + "epoch": 18.387438825448612, + "grad_norm": 0.009730017744004726, + "learning_rate": 1.9676406176738547e-05, + "loss": 0.0019, + "num_input_tokens_seen": 243218128, + "step": 112715 + }, + { + "epoch": 18.388254486133768, + "grad_norm": 0.0013416331494227052, + "learning_rate": 1.965663932112205e-05, + "loss": 0.0003, + "num_input_tokens_seen": 243230480, + "step": 112720 + }, + { + "epoch": 18.389070146818923, + "grad_norm": 0.0031567977275699377, + "learning_rate": 1.96368822002731e-05, + "loss": 0.0128, + "num_input_tokens_seen": 243242128, + "step": 112725 + }, + { + "epoch": 18.38988580750408, + "grad_norm": 0.0005152710364200175, + "learning_rate": 1.9617134814592096e-05, + "loss": 0.0018, + "num_input_tokens_seen": 243253008, + "step": 112730 + }, + { + "epoch": 18.390701468189235, + "grad_norm": 0.0005147183546796441, + "learning_rate": 1.9597397164479282e-05, + "loss": 0.0003, + "num_input_tokens_seen": 243263664, + "step": 112735 + }, + { + "epoch": 18.391517128874387, + "grad_norm": 0.00013038274482823908, + "learning_rate": 1.957766925033466e-05, + "loss": 0.0013, + "num_input_tokens_seen": 243274416, + "step": 112740 + }, + { + "epoch": 18.392332789559543, + "grad_norm": 0.008495137095451355, + "learning_rate": 1.9557951072557978e-05, + "loss": 0.0002, + "num_input_tokens_seen": 243283824, + "step": 112745 + }, + { + "epoch": 18.3931484502447, + "grad_norm": 0.0007109582656994462, + "learning_rate": 1.9538242631548965e-05, + "loss": 0.0002, + "num_input_tokens_seen": 243294384, + "step": 112750 + }, + { + "epoch": 18.393964110929854, + "grad_norm": 0.0012787666637450457, + "learning_rate": 1.9518543927706968e-05, + "loss": 0.0004, + "num_input_tokens_seen": 243305392, + "step": 112755 + }, + { + "epoch": 18.39477977161501, + "grad_norm": 0.0005416476051323116, + "learning_rate": 1.949885496143117e-05, + "loss": 0.0013, + "num_input_tokens_seen": 243316176, + "step": 112760 + }, + { + "epoch": 18.395595432300162, + "grad_norm": 0.0016060203779488802, + "learning_rate": 1.947917573312069e-05, + "loss": 0.0024, + "num_input_tokens_seen": 243326800, + "step": 112765 + }, + { + "epoch": 18.396411092985318, + "grad_norm": 0.00013448089885059744, + "learning_rate": 1.945950624317422e-05, + "loss": 0.0015, + "num_input_tokens_seen": 243338128, + "step": 112770 + }, + { + "epoch": 18.397226753670473, + "grad_norm": 0.00010961738735204563, + "learning_rate": 1.943984649199054e-05, + "loss": 0.0005, + "num_input_tokens_seen": 243349936, + "step": 112775 + }, + { + "epoch": 18.39804241435563, + "grad_norm": 0.0025447194930166006, + "learning_rate": 1.9420196479967957e-05, + "loss": 0.0012, + "num_input_tokens_seen": 243360528, + "step": 112780 + }, + { + "epoch": 18.39885807504078, + "grad_norm": 0.7841143012046814, + "learning_rate": 1.9400556207504805e-05, + "loss": 0.0826, + "num_input_tokens_seen": 243370704, + "step": 112785 + }, + { + "epoch": 18.399673735725937, + "grad_norm": 0.0002695608127396554, + "learning_rate": 1.9380925674998995e-05, + "loss": 0.0001, + "num_input_tokens_seen": 243382416, + "step": 112790 + }, + { + "epoch": 18.400489396411093, + "grad_norm": 0.0003508669906295836, + "learning_rate": 1.9361304882848487e-05, + "loss": 0.0002, + "num_input_tokens_seen": 243394096, + "step": 112795 + }, + { + "epoch": 18.40130505709625, + "grad_norm": 0.000279960164334625, + "learning_rate": 1.9341693831450847e-05, + "loss": 0.0021, + "num_input_tokens_seen": 243405744, + "step": 112800 + }, + { + "epoch": 18.402120717781404, + "grad_norm": 0.018858248367905617, + "learning_rate": 1.9322092521203537e-05, + "loss": 0.0056, + "num_input_tokens_seen": 243415312, + "step": 112805 + }, + { + "epoch": 18.402936378466556, + "grad_norm": 0.01391497440636158, + "learning_rate": 1.93025009525038e-05, + "loss": 0.0012, + "num_input_tokens_seen": 243426320, + "step": 112810 + }, + { + "epoch": 18.403752039151712, + "grad_norm": 0.0009840668644756079, + "learning_rate": 1.92829191257487e-05, + "loss": 0.0005, + "num_input_tokens_seen": 243437200, + "step": 112815 + }, + { + "epoch": 18.404567699836868, + "grad_norm": 0.011342951096594334, + "learning_rate": 1.9263347041335033e-05, + "loss": 0.0004, + "num_input_tokens_seen": 243447568, + "step": 112820 + }, + { + "epoch": 18.405383360522023, + "grad_norm": 0.00019291370699647814, + "learning_rate": 1.9243784699659538e-05, + "loss": 0.0862, + "num_input_tokens_seen": 243459088, + "step": 112825 + }, + { + "epoch": 18.40619902120718, + "grad_norm": 0.00047606180305592716, + "learning_rate": 1.9224232101118623e-05, + "loss": 0.0003, + "num_input_tokens_seen": 243470352, + "step": 112830 + }, + { + "epoch": 18.40701468189233, + "grad_norm": 0.02866898663341999, + "learning_rate": 1.9204689246108576e-05, + "loss": 0.0005, + "num_input_tokens_seen": 243480240, + "step": 112835 + }, + { + "epoch": 18.407830342577487, + "grad_norm": 0.00013726988981943578, + "learning_rate": 1.9185156135025417e-05, + "loss": 0.0005, + "num_input_tokens_seen": 243491760, + "step": 112840 + }, + { + "epoch": 18.408646003262643, + "grad_norm": 0.00030970387160778046, + "learning_rate": 1.9165632768264994e-05, + "loss": 0.0002, + "num_input_tokens_seen": 243502512, + "step": 112845 + }, + { + "epoch": 18.4094616639478, + "grad_norm": 0.00011494851787574589, + "learning_rate": 1.9146119146223052e-05, + "loss": 0.0067, + "num_input_tokens_seen": 243512496, + "step": 112850 + }, + { + "epoch": 18.410277324632954, + "grad_norm": 0.0036086428444832563, + "learning_rate": 1.9126615269294988e-05, + "loss": 0.0003, + "num_input_tokens_seen": 243523664, + "step": 112855 + }, + { + "epoch": 18.411092985318106, + "grad_norm": 0.0012122170301154256, + "learning_rate": 1.9107121137876106e-05, + "loss": 0.0017, + "num_input_tokens_seen": 243534800, + "step": 112860 + }, + { + "epoch": 18.411908646003262, + "grad_norm": 0.007827001623809338, + "learning_rate": 1.908763675236147e-05, + "loss": 0.0004, + "num_input_tokens_seen": 243546448, + "step": 112865 + }, + { + "epoch": 18.412724306688418, + "grad_norm": 0.00012466864427551627, + "learning_rate": 1.906816211314599e-05, + "loss": 0.0033, + "num_input_tokens_seen": 243556560, + "step": 112870 + }, + { + "epoch": 18.413539967373573, + "grad_norm": 0.001562965800985694, + "learning_rate": 1.9048697220624244e-05, + "loss": 0.012, + "num_input_tokens_seen": 243566352, + "step": 112875 + }, + { + "epoch": 18.41435562805873, + "grad_norm": 0.0007549161673523486, + "learning_rate": 1.9029242075190856e-05, + "loss": 0.0006, + "num_input_tokens_seen": 243578032, + "step": 112880 + }, + { + "epoch": 18.41517128874388, + "grad_norm": 0.035474929958581924, + "learning_rate": 1.9009796677239953e-05, + "loss": 0.0009, + "num_input_tokens_seen": 243588656, + "step": 112885 + }, + { + "epoch": 18.415986949429037, + "grad_norm": 0.00039997900603339076, + "learning_rate": 1.8990361027165726e-05, + "loss": 0.0002, + "num_input_tokens_seen": 243599824, + "step": 112890 + }, + { + "epoch": 18.416802610114193, + "grad_norm": 0.00024053956440184265, + "learning_rate": 1.8970935125362076e-05, + "loss": 0.0017, + "num_input_tokens_seen": 243610928, + "step": 112895 + }, + { + "epoch": 18.41761827079935, + "grad_norm": 0.14684133231639862, + "learning_rate": 1.8951518972222637e-05, + "loss": 0.0031, + "num_input_tokens_seen": 243621936, + "step": 112900 + }, + { + "epoch": 18.418433931484504, + "grad_norm": 0.0022532010916620493, + "learning_rate": 1.893211256814087e-05, + "loss": 0.0004, + "num_input_tokens_seen": 243632432, + "step": 112905 + }, + { + "epoch": 18.419249592169656, + "grad_norm": 9.569845860823989e-05, + "learning_rate": 1.891271591351018e-05, + "loss": 0.0009, + "num_input_tokens_seen": 243641264, + "step": 112910 + }, + { + "epoch": 18.420065252854812, + "grad_norm": 0.0001510514848632738, + "learning_rate": 1.8893329008723593e-05, + "loss": 0.0002, + "num_input_tokens_seen": 243652144, + "step": 112915 + }, + { + "epoch": 18.420880913539968, + "grad_norm": 0.03407705947756767, + "learning_rate": 1.8873951854173955e-05, + "loss": 0.0008, + "num_input_tokens_seen": 243663152, + "step": 112920 + }, + { + "epoch": 18.421696574225123, + "grad_norm": 0.005168826784938574, + "learning_rate": 1.885458445025412e-05, + "loss": 0.0005, + "num_input_tokens_seen": 243673552, + "step": 112925 + }, + { + "epoch": 18.42251223491028, + "grad_norm": 0.004437894094735384, + "learning_rate": 1.883522679735644e-05, + "loss": 0.001, + "num_input_tokens_seen": 243683632, + "step": 112930 + }, + { + "epoch": 18.42332789559543, + "grad_norm": 0.14317293465137482, + "learning_rate": 1.8815878895873328e-05, + "loss": 0.0024, + "num_input_tokens_seen": 243693264, + "step": 112935 + }, + { + "epoch": 18.424143556280587, + "grad_norm": 0.0002511715574655682, + "learning_rate": 1.87965407461968e-05, + "loss": 0.0001, + "num_input_tokens_seen": 243702832, + "step": 112940 + }, + { + "epoch": 18.424959216965743, + "grad_norm": 0.006827319040894508, + "learning_rate": 1.877721234871893e-05, + "loss": 0.0001, + "num_input_tokens_seen": 243713904, + "step": 112945 + }, + { + "epoch": 18.4257748776509, + "grad_norm": 0.0001629033504286781, + "learning_rate": 1.8757893703831243e-05, + "loss": 0.0018, + "num_input_tokens_seen": 243724592, + "step": 112950 + }, + { + "epoch": 18.42659053833605, + "grad_norm": 0.004327400587499142, + "learning_rate": 1.8738584811925417e-05, + "loss": 0.0013, + "num_input_tokens_seen": 243734448, + "step": 112955 + }, + { + "epoch": 18.427406199021206, + "grad_norm": 0.0018603994976729155, + "learning_rate": 1.8719285673392594e-05, + "loss": 0.0002, + "num_input_tokens_seen": 243746192, + "step": 112960 + }, + { + "epoch": 18.428221859706362, + "grad_norm": 0.002887835493311286, + "learning_rate": 1.869999628862401e-05, + "loss": 0.0005, + "num_input_tokens_seen": 243756944, + "step": 112965 + }, + { + "epoch": 18.429037520391518, + "grad_norm": 0.0008257298613898456, + "learning_rate": 1.8680716658010633e-05, + "loss": 0.0012, + "num_input_tokens_seen": 243768560, + "step": 112970 + }, + { + "epoch": 18.429853181076673, + "grad_norm": 0.019958918914198875, + "learning_rate": 1.8661446781943093e-05, + "loss": 0.0006, + "num_input_tokens_seen": 243779440, + "step": 112975 + }, + { + "epoch": 18.430668841761825, + "grad_norm": 0.0011849829461425543, + "learning_rate": 1.8642186660811965e-05, + "loss": 0.0001, + "num_input_tokens_seen": 243789488, + "step": 112980 + }, + { + "epoch": 18.43148450244698, + "grad_norm": 0.00022633271873928607, + "learning_rate": 1.862293629500761e-05, + "loss": 0.0001, + "num_input_tokens_seen": 243799536, + "step": 112985 + }, + { + "epoch": 18.432300163132137, + "grad_norm": 0.00015755971253383905, + "learning_rate": 1.8603695684920042e-05, + "loss": 0.0002, + "num_input_tokens_seen": 243809840, + "step": 112990 + }, + { + "epoch": 18.433115823817293, + "grad_norm": 0.0010795500129461288, + "learning_rate": 1.858446483093934e-05, + "loss": 0.0014, + "num_input_tokens_seen": 243820688, + "step": 112995 + }, + { + "epoch": 18.43393148450245, + "grad_norm": 0.018984688445925713, + "learning_rate": 1.856524373345514e-05, + "loss": 0.0104, + "num_input_tokens_seen": 243831696, + "step": 113000 + }, + { + "epoch": 18.4347471451876, + "grad_norm": 0.0003212641749996692, + "learning_rate": 1.8546032392857014e-05, + "loss": 0.0037, + "num_input_tokens_seen": 243842384, + "step": 113005 + }, + { + "epoch": 18.435562805872756, + "grad_norm": 9.408987534698099e-05, + "learning_rate": 1.8526830809534377e-05, + "loss": 0.0007, + "num_input_tokens_seen": 243853680, + "step": 113010 + }, + { + "epoch": 18.436378466557912, + "grad_norm": 0.0021875854581594467, + "learning_rate": 1.8507638983876252e-05, + "loss": 0.001, + "num_input_tokens_seen": 243864368, + "step": 113015 + }, + { + "epoch": 18.437194127243067, + "grad_norm": 8.035081555135548e-05, + "learning_rate": 1.84884569162716e-05, + "loss": 0.0001, + "num_input_tokens_seen": 243875920, + "step": 113020 + }, + { + "epoch": 18.438009787928223, + "grad_norm": 0.005544235464185476, + "learning_rate": 1.8469284607109282e-05, + "loss": 0.0031, + "num_input_tokens_seen": 243887568, + "step": 113025 + }, + { + "epoch": 18.438825448613375, + "grad_norm": 0.02612287923693657, + "learning_rate": 1.8450122056777762e-05, + "loss": 0.0539, + "num_input_tokens_seen": 243897712, + "step": 113030 + }, + { + "epoch": 18.43964110929853, + "grad_norm": 0.0011155609972774982, + "learning_rate": 1.8430969265665398e-05, + "loss": 0.0013, + "num_input_tokens_seen": 243907920, + "step": 113035 + }, + { + "epoch": 18.440456769983687, + "grad_norm": 0.00500500900670886, + "learning_rate": 1.8411826234160324e-05, + "loss": 0.0018, + "num_input_tokens_seen": 243917136, + "step": 113040 + }, + { + "epoch": 18.441272430668842, + "grad_norm": 0.00012462127779144794, + "learning_rate": 1.8392692962650504e-05, + "loss": 0.0007, + "num_input_tokens_seen": 243927312, + "step": 113045 + }, + { + "epoch": 18.442088091353998, + "grad_norm": 0.00014893901243340224, + "learning_rate": 1.8373569451523853e-05, + "loss": 0.0031, + "num_input_tokens_seen": 243937776, + "step": 113050 + }, + { + "epoch": 18.44290375203915, + "grad_norm": 0.0007951904553920031, + "learning_rate": 1.8354455701167672e-05, + "loss": 0.0003, + "num_input_tokens_seen": 243949424, + "step": 113055 + }, + { + "epoch": 18.443719412724306, + "grad_norm": 0.0006478818249888718, + "learning_rate": 1.833535171196954e-05, + "loss": 0.0004, + "num_input_tokens_seen": 243958960, + "step": 113060 + }, + { + "epoch": 18.44453507340946, + "grad_norm": 0.0286207627505064, + "learning_rate": 1.831625748431648e-05, + "loss": 0.0011, + "num_input_tokens_seen": 243970416, + "step": 113065 + }, + { + "epoch": 18.445350734094617, + "grad_norm": 0.000699805561453104, + "learning_rate": 1.829717301859557e-05, + "loss": 0.0003, + "num_input_tokens_seen": 243981072, + "step": 113070 + }, + { + "epoch": 18.446166394779773, + "grad_norm": 0.0014704841887578368, + "learning_rate": 1.8278098315193504e-05, + "loss": 0.0002, + "num_input_tokens_seen": 243992112, + "step": 113075 + }, + { + "epoch": 18.446982055464925, + "grad_norm": 0.022263644263148308, + "learning_rate": 1.8259033374496915e-05, + "loss": 0.0008, + "num_input_tokens_seen": 244002672, + "step": 113080 + }, + { + "epoch": 18.44779771615008, + "grad_norm": 0.00048678187886253, + "learning_rate": 1.8239978196892105e-05, + "loss": 0.0004, + "num_input_tokens_seen": 244013456, + "step": 113085 + }, + { + "epoch": 18.448613376835237, + "grad_norm": 0.00013627595035359263, + "learning_rate": 1.8220932782765377e-05, + "loss": 0.0007, + "num_input_tokens_seen": 244024080, + "step": 113090 + }, + { + "epoch": 18.449429037520392, + "grad_norm": 0.0010126022389158607, + "learning_rate": 1.8201897132502476e-05, + "loss": 0.0023, + "num_input_tokens_seen": 244035568, + "step": 113095 + }, + { + "epoch": 18.450244698205548, + "grad_norm": 0.009037659503519535, + "learning_rate": 1.8182871246489487e-05, + "loss": 0.0006, + "num_input_tokens_seen": 244045872, + "step": 113100 + }, + { + "epoch": 18.4510603588907, + "grad_norm": 0.00017513181956019253, + "learning_rate": 1.8163855125111707e-05, + "loss": 0.0016, + "num_input_tokens_seen": 244056368, + "step": 113105 + }, + { + "epoch": 18.451876019575856, + "grad_norm": 0.0005213628755882382, + "learning_rate": 1.8144848768754717e-05, + "loss": 0.0002, + "num_input_tokens_seen": 244066544, + "step": 113110 + }, + { + "epoch": 18.45269168026101, + "grad_norm": 0.00048614697880111635, + "learning_rate": 1.8125852177803658e-05, + "loss": 0.0007, + "num_input_tokens_seen": 244078416, + "step": 113115 + }, + { + "epoch": 18.453507340946167, + "grad_norm": 0.0024417529348284006, + "learning_rate": 1.8106865352643498e-05, + "loss": 0.0003, + "num_input_tokens_seen": 244089008, + "step": 113120 + }, + { + "epoch": 18.454323001631323, + "grad_norm": 0.07494306564331055, + "learning_rate": 1.808788829365904e-05, + "loss": 0.0018, + "num_input_tokens_seen": 244099536, + "step": 113125 + }, + { + "epoch": 18.455138662316475, + "grad_norm": 0.00021067020134069026, + "learning_rate": 1.8068921001234862e-05, + "loss": 0.0005, + "num_input_tokens_seen": 244110992, + "step": 113130 + }, + { + "epoch": 18.45595432300163, + "grad_norm": 0.0003184058587066829, + "learning_rate": 1.804996347575538e-05, + "loss": 0.0003, + "num_input_tokens_seen": 244120784, + "step": 113135 + }, + { + "epoch": 18.456769983686787, + "grad_norm": 0.0010034851729869843, + "learning_rate": 1.8031015717604793e-05, + "loss": 0.0017, + "num_input_tokens_seen": 244131120, + "step": 113140 + }, + { + "epoch": 18.457585644371942, + "grad_norm": 0.0011672114487737417, + "learning_rate": 1.8012077727167065e-05, + "loss": 0.0008, + "num_input_tokens_seen": 244140848, + "step": 113145 + }, + { + "epoch": 18.458401305057095, + "grad_norm": 0.0002785520628094673, + "learning_rate": 1.7993149504826056e-05, + "loss": 0.0046, + "num_input_tokens_seen": 244151856, + "step": 113150 + }, + { + "epoch": 18.45921696574225, + "grad_norm": 0.02631617896258831, + "learning_rate": 1.7974231050965352e-05, + "loss": 0.0006, + "num_input_tokens_seen": 244162064, + "step": 113155 + }, + { + "epoch": 18.460032626427406, + "grad_norm": 0.33877938985824585, + "learning_rate": 1.7955322365968253e-05, + "loss": 0.0066, + "num_input_tokens_seen": 244173104, + "step": 113160 + }, + { + "epoch": 18.46084828711256, + "grad_norm": 0.0002899516839534044, + "learning_rate": 1.793642345021823e-05, + "loss": 0.0003, + "num_input_tokens_seen": 244184944, + "step": 113165 + }, + { + "epoch": 18.461663947797717, + "grad_norm": 9.419047273695469e-05, + "learning_rate": 1.7917534304097983e-05, + "loss": 0.0015, + "num_input_tokens_seen": 244195696, + "step": 113170 + }, + { + "epoch": 18.46247960848287, + "grad_norm": 0.0002840980014298111, + "learning_rate": 1.7898654927990587e-05, + "loss": 0.0002, + "num_input_tokens_seen": 244205008, + "step": 113175 + }, + { + "epoch": 18.463295269168025, + "grad_norm": 0.0034703039564192295, + "learning_rate": 1.7879785322278408e-05, + "loss": 0.0006, + "num_input_tokens_seen": 244215728, + "step": 113180 + }, + { + "epoch": 18.46411092985318, + "grad_norm": 0.00012313942715991288, + "learning_rate": 1.786092548734408e-05, + "loss": 0.0001, + "num_input_tokens_seen": 244226096, + "step": 113185 + }, + { + "epoch": 18.464926590538337, + "grad_norm": 0.0047932881861925125, + "learning_rate": 1.7842075423569692e-05, + "loss": 0.0005, + "num_input_tokens_seen": 244237552, + "step": 113190 + }, + { + "epoch": 18.465742251223492, + "grad_norm": 0.0009236320038326085, + "learning_rate": 1.782323513133738e-05, + "loss": 0.0002, + "num_input_tokens_seen": 244247696, + "step": 113195 + }, + { + "epoch": 18.466557911908644, + "grad_norm": 0.00014505947183351964, + "learning_rate": 1.7804404611028778e-05, + "loss": 0.0011, + "num_input_tokens_seen": 244259312, + "step": 113200 + }, + { + "epoch": 18.4673735725938, + "grad_norm": 0.00025368211208842695, + "learning_rate": 1.7785583863025757e-05, + "loss": 0.0352, + "num_input_tokens_seen": 244271280, + "step": 113205 + }, + { + "epoch": 18.468189233278956, + "grad_norm": 0.0003196932084392756, + "learning_rate": 1.776677288770945e-05, + "loss": 0.0007, + "num_input_tokens_seen": 244281648, + "step": 113210 + }, + { + "epoch": 18.46900489396411, + "grad_norm": 0.0003526093205437064, + "learning_rate": 1.7747971685461383e-05, + "loss": 0.0003, + "num_input_tokens_seen": 244292848, + "step": 113215 + }, + { + "epoch": 18.469820554649267, + "grad_norm": 0.00029602015274576843, + "learning_rate": 1.772918025666237e-05, + "loss": 0.0011, + "num_input_tokens_seen": 244304048, + "step": 113220 + }, + { + "epoch": 18.47063621533442, + "grad_norm": 0.0007744957692921162, + "learning_rate": 1.7710398601693432e-05, + "loss": 0.0007, + "num_input_tokens_seen": 244314192, + "step": 113225 + }, + { + "epoch": 18.471451876019575, + "grad_norm": 0.0001596067741047591, + "learning_rate": 1.769162672093494e-05, + "loss": 0.0012, + "num_input_tokens_seen": 244325584, + "step": 113230 + }, + { + "epoch": 18.47226753670473, + "grad_norm": 0.00021798996021971107, + "learning_rate": 1.7672864614767636e-05, + "loss": 0.0002, + "num_input_tokens_seen": 244335632, + "step": 113235 + }, + { + "epoch": 18.473083197389887, + "grad_norm": 0.015995528548955917, + "learning_rate": 1.7654112283571446e-05, + "loss": 0.0003, + "num_input_tokens_seen": 244346768, + "step": 113240 + }, + { + "epoch": 18.473898858075042, + "grad_norm": 0.000784515228588134, + "learning_rate": 1.7635369727726726e-05, + "loss": 0.0002, + "num_input_tokens_seen": 244357904, + "step": 113245 + }, + { + "epoch": 18.474714518760194, + "grad_norm": 0.0015430337516590953, + "learning_rate": 1.7616636947613063e-05, + "loss": 0.0002, + "num_input_tokens_seen": 244368944, + "step": 113250 + }, + { + "epoch": 18.47553017944535, + "grad_norm": 0.04915463551878929, + "learning_rate": 1.759791394361021e-05, + "loss": 0.0038, + "num_input_tokens_seen": 244380080, + "step": 113255 + }, + { + "epoch": 18.476345840130506, + "grad_norm": 0.0017209550132974982, + "learning_rate": 1.757920071609764e-05, + "loss": 0.0002, + "num_input_tokens_seen": 244390288, + "step": 113260 + }, + { + "epoch": 18.47716150081566, + "grad_norm": 0.00023064545530360192, + "learning_rate": 1.75604972654545e-05, + "loss": 0.0004, + "num_input_tokens_seen": 244400848, + "step": 113265 + }, + { + "epoch": 18.477977161500817, + "grad_norm": 0.00010949008719762787, + "learning_rate": 1.754180359205998e-05, + "loss": 0.0002, + "num_input_tokens_seen": 244411984, + "step": 113270 + }, + { + "epoch": 18.47879282218597, + "grad_norm": 0.055630162358284, + "learning_rate": 1.752311969629278e-05, + "loss": 0.0007, + "num_input_tokens_seen": 244423664, + "step": 113275 + }, + { + "epoch": 18.479608482871125, + "grad_norm": 0.001257435418665409, + "learning_rate": 1.7504445578531703e-05, + "loss": 0.0003, + "num_input_tokens_seen": 244434576, + "step": 113280 + }, + { + "epoch": 18.48042414355628, + "grad_norm": 0.0027210931293666363, + "learning_rate": 1.7485781239155063e-05, + "loss": 0.0013, + "num_input_tokens_seen": 244444848, + "step": 113285 + }, + { + "epoch": 18.481239804241437, + "grad_norm": 0.00043554743751883507, + "learning_rate": 1.7467126678541223e-05, + "loss": 0.0021, + "num_input_tokens_seen": 244455984, + "step": 113290 + }, + { + "epoch": 18.482055464926592, + "grad_norm": 0.011018267832696438, + "learning_rate": 1.7448481897068158e-05, + "loss": 0.0029, + "num_input_tokens_seen": 244467024, + "step": 113295 + }, + { + "epoch": 18.482871125611744, + "grad_norm": 0.3835802376270294, + "learning_rate": 1.742984689511379e-05, + "loss": 0.012, + "num_input_tokens_seen": 244477968, + "step": 113300 + }, + { + "epoch": 18.4836867862969, + "grad_norm": 0.00013254994701128453, + "learning_rate": 1.7411221673055644e-05, + "loss": 0.0001, + "num_input_tokens_seen": 244489232, + "step": 113305 + }, + { + "epoch": 18.484502446982056, + "grad_norm": 0.00016580455121584237, + "learning_rate": 1.739260623127148e-05, + "loss": 0.0002, + "num_input_tokens_seen": 244500240, + "step": 113310 + }, + { + "epoch": 18.48531810766721, + "grad_norm": 0.003212176961824298, + "learning_rate": 1.737400057013827e-05, + "loss": 0.0004, + "num_input_tokens_seen": 244511664, + "step": 113315 + }, + { + "epoch": 18.486133768352367, + "grad_norm": 8.161358709912747e-05, + "learning_rate": 1.735540469003327e-05, + "loss": 0.0009, + "num_input_tokens_seen": 244522928, + "step": 113320 + }, + { + "epoch": 18.48694942903752, + "grad_norm": 0.07459031790494919, + "learning_rate": 1.733681859133318e-05, + "loss": 0.0016, + "num_input_tokens_seen": 244533296, + "step": 113325 + }, + { + "epoch": 18.487765089722675, + "grad_norm": 0.0006917008431628346, + "learning_rate": 1.7318242274414864e-05, + "loss": 0.0003, + "num_input_tokens_seen": 244545136, + "step": 113330 + }, + { + "epoch": 18.48858075040783, + "grad_norm": 0.0046510337851941586, + "learning_rate": 1.7299675739654575e-05, + "loss": 0.0006, + "num_input_tokens_seen": 244555984, + "step": 113335 + }, + { + "epoch": 18.489396411092986, + "grad_norm": 0.0029942996334284544, + "learning_rate": 1.7281118987428847e-05, + "loss": 0.0002, + "num_input_tokens_seen": 244567760, + "step": 113340 + }, + { + "epoch": 18.49021207177814, + "grad_norm": 0.00010808218939928338, + "learning_rate": 1.7262572018113488e-05, + "loss": 0.0002, + "num_input_tokens_seen": 244577328, + "step": 113345 + }, + { + "epoch": 18.491027732463294, + "grad_norm": 0.010638603940606117, + "learning_rate": 1.7244034832084587e-05, + "loss": 0.001, + "num_input_tokens_seen": 244589104, + "step": 113350 + }, + { + "epoch": 18.49184339314845, + "grad_norm": 0.00026497142971493304, + "learning_rate": 1.722550742971768e-05, + "loss": 0.0005, + "num_input_tokens_seen": 244600880, + "step": 113355 + }, + { + "epoch": 18.492659053833606, + "grad_norm": 0.010536917485296726, + "learning_rate": 1.720698981138835e-05, + "loss": 0.0036, + "num_input_tokens_seen": 244611344, + "step": 113360 + }, + { + "epoch": 18.49347471451876, + "grad_norm": 0.0001788009685697034, + "learning_rate": 1.7188481977471804e-05, + "loss": 0.0002, + "num_input_tokens_seen": 244622416, + "step": 113365 + }, + { + "epoch": 18.494290375203914, + "grad_norm": 0.02241145819425583, + "learning_rate": 1.716998392834318e-05, + "loss": 0.0008, + "num_input_tokens_seen": 244632112, + "step": 113370 + }, + { + "epoch": 18.49510603588907, + "grad_norm": 0.06023215502500534, + "learning_rate": 1.715149566437735e-05, + "loss": 0.0012, + "num_input_tokens_seen": 244643088, + "step": 113375 + }, + { + "epoch": 18.495921696574225, + "grad_norm": 0.00018361856928095222, + "learning_rate": 1.7133017185949007e-05, + "loss": 0.0001, + "num_input_tokens_seen": 244654800, + "step": 113380 + }, + { + "epoch": 18.49673735725938, + "grad_norm": 0.004694739356637001, + "learning_rate": 1.711454849343258e-05, + "loss": 0.0002, + "num_input_tokens_seen": 244666032, + "step": 113385 + }, + { + "epoch": 18.497553017944536, + "grad_norm": 0.0002533750084694475, + "learning_rate": 1.709608958720249e-05, + "loss": 0.0427, + "num_input_tokens_seen": 244676016, + "step": 113390 + }, + { + "epoch": 18.49836867862969, + "grad_norm": 0.00014440105587709695, + "learning_rate": 1.7077640467632714e-05, + "loss": 0.0049, + "num_input_tokens_seen": 244686896, + "step": 113395 + }, + { + "epoch": 18.499184339314844, + "grad_norm": 0.00025351435760967433, + "learning_rate": 1.705920113509718e-05, + "loss": 0.0011, + "num_input_tokens_seen": 244696784, + "step": 113400 + }, + { + "epoch": 18.5, + "grad_norm": 0.00017982722783926874, + "learning_rate": 1.7040771589969583e-05, + "loss": 0.0001, + "num_input_tokens_seen": 244707024, + "step": 113405 + }, + { + "epoch": 18.500815660685156, + "grad_norm": 9.10629823920317e-05, + "learning_rate": 1.7022351832623407e-05, + "loss": 0.0278, + "num_input_tokens_seen": 244717808, + "step": 113410 + }, + { + "epoch": 18.50163132137031, + "grad_norm": 0.035746604204177856, + "learning_rate": 1.7003941863432014e-05, + "loss": 0.0006, + "num_input_tokens_seen": 244729008, + "step": 113415 + }, + { + "epoch": 18.502446982055464, + "grad_norm": 0.00011923842248506844, + "learning_rate": 1.6985541682768445e-05, + "loss": 0.0001, + "num_input_tokens_seen": 244739792, + "step": 113420 + }, + { + "epoch": 18.50326264274062, + "grad_norm": 0.00014563207514584064, + "learning_rate": 1.696715129100562e-05, + "loss": 0.0003, + "num_input_tokens_seen": 244750224, + "step": 113425 + }, + { + "epoch": 18.504078303425775, + "grad_norm": 0.0005306898383423686, + "learning_rate": 1.6948770688516248e-05, + "loss": 0.0002, + "num_input_tokens_seen": 244760592, + "step": 113430 + }, + { + "epoch": 18.50489396411093, + "grad_norm": 0.00025607491261325777, + "learning_rate": 1.6930399875672853e-05, + "loss": 0.0003, + "num_input_tokens_seen": 244771120, + "step": 113435 + }, + { + "epoch": 18.505709624796086, + "grad_norm": 0.05940388888120651, + "learning_rate": 1.69120388528477e-05, + "loss": 0.0018, + "num_input_tokens_seen": 244782160, + "step": 113440 + }, + { + "epoch": 18.50652528548124, + "grad_norm": 0.002851438010111451, + "learning_rate": 1.6893687620412933e-05, + "loss": 0.0005, + "num_input_tokens_seen": 244792688, + "step": 113445 + }, + { + "epoch": 18.507340946166394, + "grad_norm": 0.0002184223267249763, + "learning_rate": 1.687534617874037e-05, + "loss": 0.0002, + "num_input_tokens_seen": 244802800, + "step": 113450 + }, + { + "epoch": 18.50815660685155, + "grad_norm": 0.004037537612020969, + "learning_rate": 1.685701452820193e-05, + "loss": 0.0003, + "num_input_tokens_seen": 244813552, + "step": 113455 + }, + { + "epoch": 18.508972267536706, + "grad_norm": 0.0005608230130746961, + "learning_rate": 1.6838692669168876e-05, + "loss": 0.0007, + "num_input_tokens_seen": 244824752, + "step": 113460 + }, + { + "epoch": 18.50978792822186, + "grad_norm": 0.043882813304662704, + "learning_rate": 1.682038060201274e-05, + "loss": 0.0009, + "num_input_tokens_seen": 244834832, + "step": 113465 + }, + { + "epoch": 18.510603588907014, + "grad_norm": 0.0009162705391645432, + "learning_rate": 1.680207832710451e-05, + "loss": 0.0002, + "num_input_tokens_seen": 244844624, + "step": 113470 + }, + { + "epoch": 18.51141924959217, + "grad_norm": 0.0020605996251106262, + "learning_rate": 1.6783785844815157e-05, + "loss": 0.0002, + "num_input_tokens_seen": 244855280, + "step": 113475 + }, + { + "epoch": 18.512234910277325, + "grad_norm": 0.00019161589443683624, + "learning_rate": 1.6765503155515394e-05, + "loss": 0.0042, + "num_input_tokens_seen": 244864976, + "step": 113480 + }, + { + "epoch": 18.51305057096248, + "grad_norm": 0.0014300381299108267, + "learning_rate": 1.6747230259575696e-05, + "loss": 0.0003, + "num_input_tokens_seen": 244875952, + "step": 113485 + }, + { + "epoch": 18.513866231647633, + "grad_norm": 0.001193741918541491, + "learning_rate": 1.6728967157366492e-05, + "loss": 0.0032, + "num_input_tokens_seen": 244886448, + "step": 113490 + }, + { + "epoch": 18.51468189233279, + "grad_norm": 0.007712116464972496, + "learning_rate": 1.671071384925782e-05, + "loss": 0.0003, + "num_input_tokens_seen": 244897232, + "step": 113495 + }, + { + "epoch": 18.515497553017944, + "grad_norm": 0.00014363822992891073, + "learning_rate": 1.66924703356196e-05, + "loss": 0.0003, + "num_input_tokens_seen": 244908656, + "step": 113500 + }, + { + "epoch": 18.5163132137031, + "grad_norm": 0.004824102856218815, + "learning_rate": 1.6674236616821602e-05, + "loss": 0.0006, + "num_input_tokens_seen": 244919632, + "step": 113505 + }, + { + "epoch": 18.517128874388256, + "grad_norm": 0.01151628140360117, + "learning_rate": 1.6656012693233357e-05, + "loss": 0.0066, + "num_input_tokens_seen": 244929616, + "step": 113510 + }, + { + "epoch": 18.517944535073408, + "grad_norm": 0.002041020430624485, + "learning_rate": 1.6637798565224127e-05, + "loss": 0.0003, + "num_input_tokens_seen": 244940848, + "step": 113515 + }, + { + "epoch": 18.518760195758563, + "grad_norm": 0.011595416814088821, + "learning_rate": 1.6619594233163172e-05, + "loss": 0.0032, + "num_input_tokens_seen": 244950896, + "step": 113520 + }, + { + "epoch": 18.51957585644372, + "grad_norm": 0.0001725497713778168, + "learning_rate": 1.6601399697419306e-05, + "loss": 0.0002, + "num_input_tokens_seen": 244961296, + "step": 113525 + }, + { + "epoch": 18.520391517128875, + "grad_norm": 0.0004373275733087212, + "learning_rate": 1.658321495836135e-05, + "loss": 0.0001, + "num_input_tokens_seen": 244972720, + "step": 113530 + }, + { + "epoch": 18.52120717781403, + "grad_norm": 0.0001945996336871758, + "learning_rate": 1.6565040016357725e-05, + "loss": 0.0009, + "num_input_tokens_seen": 244983184, + "step": 113535 + }, + { + "epoch": 18.522022838499183, + "grad_norm": 0.00021469392231665552, + "learning_rate": 1.654687487177692e-05, + "loss": 0.0006, + "num_input_tokens_seen": 244994512, + "step": 113540 + }, + { + "epoch": 18.52283849918434, + "grad_norm": 0.00017178760026581585, + "learning_rate": 1.6528719524986967e-05, + "loss": 0.0003, + "num_input_tokens_seen": 245005616, + "step": 113545 + }, + { + "epoch": 18.523654159869494, + "grad_norm": 0.015634089708328247, + "learning_rate": 1.6510573976355858e-05, + "loss": 0.0009, + "num_input_tokens_seen": 245015568, + "step": 113550 + }, + { + "epoch": 18.52446982055465, + "grad_norm": 0.0003746241272892803, + "learning_rate": 1.6492438226251295e-05, + "loss": 0.0009, + "num_input_tokens_seen": 245026576, + "step": 113555 + }, + { + "epoch": 18.525285481239806, + "grad_norm": 0.00048120360588654876, + "learning_rate": 1.647431227504087e-05, + "loss": 0.0008, + "num_input_tokens_seen": 245037840, + "step": 113560 + }, + { + "epoch": 18.526101141924958, + "grad_norm": 0.00044903435627929866, + "learning_rate": 1.645619612309185e-05, + "loss": 0.0001, + "num_input_tokens_seen": 245047312, + "step": 113565 + }, + { + "epoch": 18.526916802610113, + "grad_norm": 0.035293225198984146, + "learning_rate": 1.6438089770771435e-05, + "loss": 0.0012, + "num_input_tokens_seen": 245058896, + "step": 113570 + }, + { + "epoch": 18.52773246329527, + "grad_norm": 0.0005223414627835155, + "learning_rate": 1.6419993218446673e-05, + "loss": 0.0003, + "num_input_tokens_seen": 245069232, + "step": 113575 + }, + { + "epoch": 18.528548123980425, + "grad_norm": 0.0001966454874491319, + "learning_rate": 1.640190646648404e-05, + "loss": 0.0002, + "num_input_tokens_seen": 245080368, + "step": 113580 + }, + { + "epoch": 18.52936378466558, + "grad_norm": 0.004070186987519264, + "learning_rate": 1.638382951525047e-05, + "loss": 0.0003, + "num_input_tokens_seen": 245091664, + "step": 113585 + }, + { + "epoch": 18.530179445350733, + "grad_norm": 0.0005542365834116936, + "learning_rate": 1.6365762365111947e-05, + "loss": 0.0001, + "num_input_tokens_seen": 245102736, + "step": 113590 + }, + { + "epoch": 18.53099510603589, + "grad_norm": 0.0043274471536278725, + "learning_rate": 1.6347705016434844e-05, + "loss": 0.0003, + "num_input_tokens_seen": 245113840, + "step": 113595 + }, + { + "epoch": 18.531810766721044, + "grad_norm": 0.00015732672181911767, + "learning_rate": 1.6329657469585037e-05, + "loss": 0.0006, + "num_input_tokens_seen": 245125872, + "step": 113600 + }, + { + "epoch": 18.5326264274062, + "grad_norm": 0.00012730853632092476, + "learning_rate": 1.6311619724928283e-05, + "loss": 0.0005, + "num_input_tokens_seen": 245137328, + "step": 113605 + }, + { + "epoch": 18.533442088091356, + "grad_norm": 0.030906662344932556, + "learning_rate": 1.6293591782830186e-05, + "loss": 0.0007, + "num_input_tokens_seen": 245147728, + "step": 113610 + }, + { + "epoch": 18.534257748776508, + "grad_norm": 0.0009210177813656628, + "learning_rate": 1.6275573643656115e-05, + "loss": 0.0009, + "num_input_tokens_seen": 245158096, + "step": 113615 + }, + { + "epoch": 18.535073409461663, + "grad_norm": 0.0001677673717495054, + "learning_rate": 1.6257565307771115e-05, + "loss": 0.0002, + "num_input_tokens_seen": 245167504, + "step": 113620 + }, + { + "epoch": 18.53588907014682, + "grad_norm": 0.0008662198670208454, + "learning_rate": 1.6239566775540283e-05, + "loss": 0.0002, + "num_input_tokens_seen": 245176624, + "step": 113625 + }, + { + "epoch": 18.536704730831975, + "grad_norm": 0.0006733891204930842, + "learning_rate": 1.6221578047328322e-05, + "loss": 0.0002, + "num_input_tokens_seen": 245187280, + "step": 113630 + }, + { + "epoch": 18.53752039151713, + "grad_norm": 0.034582946449518204, + "learning_rate": 1.6203599123499778e-05, + "loss": 0.0006, + "num_input_tokens_seen": 245198544, + "step": 113635 + }, + { + "epoch": 18.538336052202283, + "grad_norm": 0.00028598078642971814, + "learning_rate": 1.6185630004419027e-05, + "loss": 0.0007, + "num_input_tokens_seen": 245208720, + "step": 113640 + }, + { + "epoch": 18.53915171288744, + "grad_norm": 0.0009323310223408043, + "learning_rate": 1.6167670690450276e-05, + "loss": 0.0008, + "num_input_tokens_seen": 245219792, + "step": 113645 + }, + { + "epoch": 18.539967373572594, + "grad_norm": 0.0009227304253727198, + "learning_rate": 1.6149721181957456e-05, + "loss": 0.0008, + "num_input_tokens_seen": 245231024, + "step": 113650 + }, + { + "epoch": 18.54078303425775, + "grad_norm": 0.006220404524356127, + "learning_rate": 1.6131781479304332e-05, + "loss": 0.0006, + "num_input_tokens_seen": 245240432, + "step": 113655 + }, + { + "epoch": 18.541598694942905, + "grad_norm": 0.00016405832138843834, + "learning_rate": 1.61138515828545e-05, + "loss": 0.001, + "num_input_tokens_seen": 245251632, + "step": 113660 + }, + { + "epoch": 18.542414355628058, + "grad_norm": 0.0005501600680872798, + "learning_rate": 1.6095931492971282e-05, + "loss": 0.001, + "num_input_tokens_seen": 245262192, + "step": 113665 + }, + { + "epoch": 18.543230016313213, + "grad_norm": 0.08265355229377747, + "learning_rate": 1.6078021210017945e-05, + "loss": 0.0041, + "num_input_tokens_seen": 245273680, + "step": 113670 + }, + { + "epoch": 18.54404567699837, + "grad_norm": 0.0008021110552363098, + "learning_rate": 1.6060120734357366e-05, + "loss": 0.0002, + "num_input_tokens_seen": 245284112, + "step": 113675 + }, + { + "epoch": 18.544861337683525, + "grad_norm": 0.0026776292361319065, + "learning_rate": 1.604223006635236e-05, + "loss": 0.0018, + "num_input_tokens_seen": 245295760, + "step": 113680 + }, + { + "epoch": 18.545676998368677, + "grad_norm": 0.00019359066209290177, + "learning_rate": 1.6024349206365475e-05, + "loss": 0.0004, + "num_input_tokens_seen": 245305552, + "step": 113685 + }, + { + "epoch": 18.546492659053833, + "grad_norm": 0.0009847967885434628, + "learning_rate": 1.6006478154759197e-05, + "loss": 0.0001, + "num_input_tokens_seen": 245316272, + "step": 113690 + }, + { + "epoch": 18.54730831973899, + "grad_norm": 0.004172735847532749, + "learning_rate": 1.598861691189557e-05, + "loss": 0.0508, + "num_input_tokens_seen": 245327088, + "step": 113695 + }, + { + "epoch": 18.548123980424144, + "grad_norm": 0.021711774170398712, + "learning_rate": 1.5970765478136696e-05, + "loss": 0.0015, + "num_input_tokens_seen": 245337968, + "step": 113700 + }, + { + "epoch": 18.5489396411093, + "grad_norm": 0.00020194351964164525, + "learning_rate": 1.5952923853844224e-05, + "loss": 0.002, + "num_input_tokens_seen": 245349616, + "step": 113705 + }, + { + "epoch": 18.549755301794452, + "grad_norm": 0.008815481327474117, + "learning_rate": 1.5935092039379874e-05, + "loss": 0.0003, + "num_input_tokens_seen": 245360560, + "step": 113710 + }, + { + "epoch": 18.550570962479608, + "grad_norm": 0.034336645156145096, + "learning_rate": 1.5917270035104903e-05, + "loss": 0.0008, + "num_input_tokens_seen": 245371376, + "step": 113715 + }, + { + "epoch": 18.551386623164763, + "grad_norm": 0.0014819727512076497, + "learning_rate": 1.5899457841380637e-05, + "loss": 0.0004, + "num_input_tokens_seen": 245381552, + "step": 113720 + }, + { + "epoch": 18.55220228384992, + "grad_norm": 0.001234989264048636, + "learning_rate": 1.5881655458567847e-05, + "loss": 0.0001, + "num_input_tokens_seen": 245392336, + "step": 113725 + }, + { + "epoch": 18.553017944535075, + "grad_norm": 0.008184137754142284, + "learning_rate": 1.5863862887027626e-05, + "loss": 0.0022, + "num_input_tokens_seen": 245401456, + "step": 113730 + }, + { + "epoch": 18.553833605220227, + "grad_norm": 0.0014360505156219006, + "learning_rate": 1.5846080127120244e-05, + "loss": 0.0003, + "num_input_tokens_seen": 245412368, + "step": 113735 + }, + { + "epoch": 18.554649265905383, + "grad_norm": 0.00032791803823783994, + "learning_rate": 1.58283071792063e-05, + "loss": 0.0007, + "num_input_tokens_seen": 245422448, + "step": 113740 + }, + { + "epoch": 18.55546492659054, + "grad_norm": 0.0005348768318071961, + "learning_rate": 1.581054404364596e-05, + "loss": 0.0007, + "num_input_tokens_seen": 245433296, + "step": 113745 + }, + { + "epoch": 18.556280587275694, + "grad_norm": 0.00014964849106036127, + "learning_rate": 1.5792790720799144e-05, + "loss": 0.0006, + "num_input_tokens_seen": 245443792, + "step": 113750 + }, + { + "epoch": 18.55709624796085, + "grad_norm": 0.0008529218030162156, + "learning_rate": 1.5775047211025685e-05, + "loss": 0.0053, + "num_input_tokens_seen": 245455024, + "step": 113755 + }, + { + "epoch": 18.557911908646002, + "grad_norm": 0.11114954203367233, + "learning_rate": 1.575731351468518e-05, + "loss": 0.0028, + "num_input_tokens_seen": 245464944, + "step": 113760 + }, + { + "epoch": 18.558727569331158, + "grad_norm": 0.00020189626957289875, + "learning_rate": 1.5739589632137006e-05, + "loss": 0.0055, + "num_input_tokens_seen": 245475856, + "step": 113765 + }, + { + "epoch": 18.559543230016313, + "grad_norm": 0.00014133706281427294, + "learning_rate": 1.572187556374044e-05, + "loss": 0.0004, + "num_input_tokens_seen": 245487184, + "step": 113770 + }, + { + "epoch": 18.56035889070147, + "grad_norm": 0.004797428846359253, + "learning_rate": 1.5704171309854354e-05, + "loss": 0.0012, + "num_input_tokens_seen": 245498736, + "step": 113775 + }, + { + "epoch": 18.561174551386625, + "grad_norm": 0.000234750346862711, + "learning_rate": 1.568647687083763e-05, + "loss": 0.0004, + "num_input_tokens_seen": 245509552, + "step": 113780 + }, + { + "epoch": 18.561990212071777, + "grad_norm": 0.013239082880318165, + "learning_rate": 1.5668792247048868e-05, + "loss": 0.0063, + "num_input_tokens_seen": 245520112, + "step": 113785 + }, + { + "epoch": 18.562805872756933, + "grad_norm": 0.00013345596380531788, + "learning_rate": 1.565111743884634e-05, + "loss": 0.0006, + "num_input_tokens_seen": 245530608, + "step": 113790 + }, + { + "epoch": 18.563621533442088, + "grad_norm": 0.00033124216133728623, + "learning_rate": 1.5633452446588537e-05, + "loss": 0.0005, + "num_input_tokens_seen": 245541104, + "step": 113795 + }, + { + "epoch": 18.564437194127244, + "grad_norm": 0.013894570991396904, + "learning_rate": 1.5615797270633114e-05, + "loss": 0.0005, + "num_input_tokens_seen": 245550064, + "step": 113800 + }, + { + "epoch": 18.5652528548124, + "grad_norm": 0.000383746373699978, + "learning_rate": 1.5598151911338176e-05, + "loss": 0.0003, + "num_input_tokens_seen": 245560848, + "step": 113805 + }, + { + "epoch": 18.56606851549755, + "grad_norm": 0.006368584930896759, + "learning_rate": 1.5580516369061103e-05, + "loss": 0.0004, + "num_input_tokens_seen": 245572112, + "step": 113810 + }, + { + "epoch": 18.566884176182707, + "grad_norm": 0.011551005765795708, + "learning_rate": 1.55628906441595e-05, + "loss": 0.0004, + "num_input_tokens_seen": 245582800, + "step": 113815 + }, + { + "epoch": 18.567699836867863, + "grad_norm": 0.0038184334989637136, + "learning_rate": 1.5545274736990354e-05, + "loss": 0.0002, + "num_input_tokens_seen": 245593488, + "step": 113820 + }, + { + "epoch": 18.56851549755302, + "grad_norm": 0.0004881782515440136, + "learning_rate": 1.5527668647910886e-05, + "loss": 0.0039, + "num_input_tokens_seen": 245605296, + "step": 113825 + }, + { + "epoch": 18.569331158238175, + "grad_norm": 0.00230991980060935, + "learning_rate": 1.5510072377277696e-05, + "loss": 0.0008, + "num_input_tokens_seen": 245616464, + "step": 113830 + }, + { + "epoch": 18.570146818923327, + "grad_norm": 0.0003153955331072211, + "learning_rate": 1.5492485925447663e-05, + "loss": 0.0004, + "num_input_tokens_seen": 245627408, + "step": 113835 + }, + { + "epoch": 18.570962479608482, + "grad_norm": 0.00011702616029651836, + "learning_rate": 1.5474909292776895e-05, + "loss": 0.002, + "num_input_tokens_seen": 245637136, + "step": 113840 + }, + { + "epoch": 18.571778140293638, + "grad_norm": 9.748440061230212e-05, + "learning_rate": 1.5457342479621883e-05, + "loss": 0.0011, + "num_input_tokens_seen": 245646736, + "step": 113845 + }, + { + "epoch": 18.572593800978794, + "grad_norm": 0.0002707436797209084, + "learning_rate": 1.5439785486338396e-05, + "loss": 0.0013, + "num_input_tokens_seen": 245657936, + "step": 113850 + }, + { + "epoch": 18.57340946166395, + "grad_norm": 0.00020789432164747268, + "learning_rate": 1.5422238313282434e-05, + "loss": 0.0001, + "num_input_tokens_seen": 245668944, + "step": 113855 + }, + { + "epoch": 18.5742251223491, + "grad_norm": 0.00013995815243106335, + "learning_rate": 1.540470096080948e-05, + "loss": 0.0003, + "num_input_tokens_seen": 245679856, + "step": 113860 + }, + { + "epoch": 18.575040783034257, + "grad_norm": 0.001201069331727922, + "learning_rate": 1.538717342927509e-05, + "loss": 0.0001, + "num_input_tokens_seen": 245689360, + "step": 113865 + }, + { + "epoch": 18.575856443719413, + "grad_norm": 0.0015918446006253362, + "learning_rate": 1.536965571903437e-05, + "loss": 0.0003, + "num_input_tokens_seen": 245699568, + "step": 113870 + }, + { + "epoch": 18.57667210440457, + "grad_norm": 0.018776508048176765, + "learning_rate": 1.535214783044242e-05, + "loss": 0.0028, + "num_input_tokens_seen": 245710288, + "step": 113875 + }, + { + "epoch": 18.57748776508972, + "grad_norm": 0.00014075938088353723, + "learning_rate": 1.5334649763853903e-05, + "loss": 0.0008, + "num_input_tokens_seen": 245721968, + "step": 113880 + }, + { + "epoch": 18.578303425774877, + "grad_norm": 0.00010330978693673387, + "learning_rate": 1.5317161519623647e-05, + "loss": 0.0002, + "num_input_tokens_seen": 245733360, + "step": 113885 + }, + { + "epoch": 18.579119086460032, + "grad_norm": 0.00012656515173148364, + "learning_rate": 1.529968309810592e-05, + "loss": 0.0013, + "num_input_tokens_seen": 245743632, + "step": 113890 + }, + { + "epoch": 18.579934747145188, + "grad_norm": 0.00012493549729697406, + "learning_rate": 1.5282214499655055e-05, + "loss": 0.0007, + "num_input_tokens_seen": 245755568, + "step": 113895 + }, + { + "epoch": 18.580750407830344, + "grad_norm": 0.00072783965151757, + "learning_rate": 1.526475572462499e-05, + "loss": 0.0003, + "num_input_tokens_seen": 245766448, + "step": 113900 + }, + { + "epoch": 18.581566068515496, + "grad_norm": 0.0032054707407951355, + "learning_rate": 1.5247306773369552e-05, + "loss": 0.0002, + "num_input_tokens_seen": 245777712, + "step": 113905 + }, + { + "epoch": 18.58238172920065, + "grad_norm": 0.00864079687744379, + "learning_rate": 1.5229867646242457e-05, + "loss": 0.0005, + "num_input_tokens_seen": 245788016, + "step": 113910 + }, + { + "epoch": 18.583197389885807, + "grad_norm": 0.00032738802838139236, + "learning_rate": 1.5212438343597036e-05, + "loss": 0.0004, + "num_input_tokens_seen": 245799536, + "step": 113915 + }, + { + "epoch": 18.584013050570963, + "grad_norm": 0.0007115579210221767, + "learning_rate": 1.5195018865786559e-05, + "loss": 0.0002, + "num_input_tokens_seen": 245809968, + "step": 113920 + }, + { + "epoch": 18.58482871125612, + "grad_norm": 0.005877731367945671, + "learning_rate": 1.5177609213164023e-05, + "loss": 0.0009, + "num_input_tokens_seen": 245820880, + "step": 113925 + }, + { + "epoch": 18.58564437194127, + "grad_norm": 0.021433550864458084, + "learning_rate": 1.5160209386082314e-05, + "loss": 0.0028, + "num_input_tokens_seen": 245831312, + "step": 113930 + }, + { + "epoch": 18.586460032626427, + "grad_norm": 0.000252114434260875, + "learning_rate": 1.5142819384893925e-05, + "loss": 0.0002, + "num_input_tokens_seen": 245842608, + "step": 113935 + }, + { + "epoch": 18.587275693311582, + "grad_norm": 0.001148088718764484, + "learning_rate": 1.512543920995152e-05, + "loss": 0.0004, + "num_input_tokens_seen": 245852432, + "step": 113940 + }, + { + "epoch": 18.588091353996738, + "grad_norm": 9.25151543924585e-05, + "learning_rate": 1.5108068861607094e-05, + "loss": 0.0001, + "num_input_tokens_seen": 245862896, + "step": 113945 + }, + { + "epoch": 18.588907014681894, + "grad_norm": 0.014639918692409992, + "learning_rate": 1.5090708340212867e-05, + "loss": 0.0006, + "num_input_tokens_seen": 245873744, + "step": 113950 + }, + { + "epoch": 18.589722675367046, + "grad_norm": 0.20178943872451782, + "learning_rate": 1.5073357646120501e-05, + "loss": 0.0027, + "num_input_tokens_seen": 245883440, + "step": 113955 + }, + { + "epoch": 18.5905383360522, + "grad_norm": 1.3993545770645142, + "learning_rate": 1.5056016779681825e-05, + "loss": 0.0591, + "num_input_tokens_seen": 245894480, + "step": 113960 + }, + { + "epoch": 18.591353996737357, + "grad_norm": 0.008963163942098618, + "learning_rate": 1.5038685741248059e-05, + "loss": 0.0015, + "num_input_tokens_seen": 245904368, + "step": 113965 + }, + { + "epoch": 18.592169657422513, + "grad_norm": 0.0014928284799680114, + "learning_rate": 1.502136453117059e-05, + "loss": 0.0022, + "num_input_tokens_seen": 245914864, + "step": 113970 + }, + { + "epoch": 18.59298531810767, + "grad_norm": 0.12074530869722366, + "learning_rate": 1.5004053149800356e-05, + "loss": 0.0011, + "num_input_tokens_seen": 245926512, + "step": 113975 + }, + { + "epoch": 18.59380097879282, + "grad_norm": 0.010296367108821869, + "learning_rate": 1.4986751597488357e-05, + "loss": 0.0026, + "num_input_tokens_seen": 245937744, + "step": 113980 + }, + { + "epoch": 18.594616639477977, + "grad_norm": 0.00025401939637959003, + "learning_rate": 1.4969459874585034e-05, + "loss": 0.0003, + "num_input_tokens_seen": 245948080, + "step": 113985 + }, + { + "epoch": 18.595432300163132, + "grad_norm": 0.01832355558872223, + "learning_rate": 1.495217798144094e-05, + "loss": 0.0005, + "num_input_tokens_seen": 245957296, + "step": 113990 + }, + { + "epoch": 18.596247960848288, + "grad_norm": 0.004861706402152777, + "learning_rate": 1.4934905918406239e-05, + "loss": 0.0005, + "num_input_tokens_seen": 245967088, + "step": 113995 + }, + { + "epoch": 18.597063621533444, + "grad_norm": 0.00014990688941907138, + "learning_rate": 1.491764368583104e-05, + "loss": 0.0006, + "num_input_tokens_seen": 245977776, + "step": 114000 + }, + { + "epoch": 18.597879282218596, + "grad_norm": 0.002280246466398239, + "learning_rate": 1.4900391284065229e-05, + "loss": 0.0014, + "num_input_tokens_seen": 245987600, + "step": 114005 + }, + { + "epoch": 18.59869494290375, + "grad_norm": 0.0009315757197327912, + "learning_rate": 1.4883148713458306e-05, + "loss": 0.0331, + "num_input_tokens_seen": 245997680, + "step": 114010 + }, + { + "epoch": 18.599510603588907, + "grad_norm": 0.07105767726898193, + "learning_rate": 1.4865915974359823e-05, + "loss": 0.001, + "num_input_tokens_seen": 246007664, + "step": 114015 + }, + { + "epoch": 18.600326264274063, + "grad_norm": 0.007895289920270443, + "learning_rate": 1.4848693067119e-05, + "loss": 0.0012, + "num_input_tokens_seen": 246018576, + "step": 114020 + }, + { + "epoch": 18.601141924959215, + "grad_norm": 0.0044876085594296455, + "learning_rate": 1.483147999208484e-05, + "loss": 0.001, + "num_input_tokens_seen": 246029712, + "step": 114025 + }, + { + "epoch": 18.60195758564437, + "grad_norm": 0.00015581764455419034, + "learning_rate": 1.4814276749606226e-05, + "loss": 0.0026, + "num_input_tokens_seen": 246040336, + "step": 114030 + }, + { + "epoch": 18.602773246329527, + "grad_norm": 0.0004913487937301397, + "learning_rate": 1.4797083340031769e-05, + "loss": 0.0004, + "num_input_tokens_seen": 246050896, + "step": 114035 + }, + { + "epoch": 18.603588907014682, + "grad_norm": 0.005815307144075632, + "learning_rate": 1.477989976370997e-05, + "loss": 0.0002, + "num_input_tokens_seen": 246061264, + "step": 114040 + }, + { + "epoch": 18.604404567699838, + "grad_norm": 0.0006005639443174005, + "learning_rate": 1.4762726020989047e-05, + "loss": 0.0002, + "num_input_tokens_seen": 246072048, + "step": 114045 + }, + { + "epoch": 18.605220228384994, + "grad_norm": 0.0003308606974314898, + "learning_rate": 1.4745562112217059e-05, + "loss": 0.0003, + "num_input_tokens_seen": 246082640, + "step": 114050 + }, + { + "epoch": 18.606035889070146, + "grad_norm": 0.001669471152126789, + "learning_rate": 1.4728408037741836e-05, + "loss": 0.0025, + "num_input_tokens_seen": 246093680, + "step": 114055 + }, + { + "epoch": 18.6068515497553, + "grad_norm": 0.020279869437217712, + "learning_rate": 1.4711263797911045e-05, + "loss": 0.0013, + "num_input_tokens_seen": 246105104, + "step": 114060 + }, + { + "epoch": 18.607667210440457, + "grad_norm": 0.00283541320823133, + "learning_rate": 1.469412939307213e-05, + "loss": 0.0003, + "num_input_tokens_seen": 246115856, + "step": 114065 + }, + { + "epoch": 18.608482871125613, + "grad_norm": 0.9388675093650818, + "learning_rate": 1.4677004823572316e-05, + "loss": 0.0653, + "num_input_tokens_seen": 246125552, + "step": 114070 + }, + { + "epoch": 18.609298531810765, + "grad_norm": 0.00022171816090121865, + "learning_rate": 1.4659890089758654e-05, + "loss": 0.0008, + "num_input_tokens_seen": 246135664, + "step": 114075 + }, + { + "epoch": 18.61011419249592, + "grad_norm": 0.00010004785872297361, + "learning_rate": 1.4642785191978036e-05, + "loss": 0.0004, + "num_input_tokens_seen": 246146064, + "step": 114080 + }, + { + "epoch": 18.610929853181077, + "grad_norm": 0.0031590969301760197, + "learning_rate": 1.462569013057713e-05, + "loss": 0.0006, + "num_input_tokens_seen": 246156688, + "step": 114085 + }, + { + "epoch": 18.611745513866232, + "grad_norm": 0.044977400451898575, + "learning_rate": 1.4608604905902268e-05, + "loss": 0.0009, + "num_input_tokens_seen": 246167792, + "step": 114090 + }, + { + "epoch": 18.612561174551388, + "grad_norm": 0.0025062975473701954, + "learning_rate": 1.4591529518299896e-05, + "loss": 0.0006, + "num_input_tokens_seen": 246178224, + "step": 114095 + }, + { + "epoch": 18.61337683523654, + "grad_norm": 0.07357843965291977, + "learning_rate": 1.4574463968115903e-05, + "loss": 0.0014, + "num_input_tokens_seen": 246189776, + "step": 114100 + }, + { + "epoch": 18.614192495921696, + "grad_norm": 0.00012679799692705274, + "learning_rate": 1.4557408255696181e-05, + "loss": 0.0005, + "num_input_tokens_seen": 246200688, + "step": 114105 + }, + { + "epoch": 18.61500815660685, + "grad_norm": 0.05889088287949562, + "learning_rate": 1.4540362381386452e-05, + "loss": 0.0015, + "num_input_tokens_seen": 246211536, + "step": 114110 + }, + { + "epoch": 18.615823817292007, + "grad_norm": 0.00018091562378685921, + "learning_rate": 1.4523326345532163e-05, + "loss": 0.0002, + "num_input_tokens_seen": 246221424, + "step": 114115 + }, + { + "epoch": 18.616639477977163, + "grad_norm": 0.002784978598356247, + "learning_rate": 1.450630014847848e-05, + "loss": 0.0014, + "num_input_tokens_seen": 246233232, + "step": 114120 + }, + { + "epoch": 18.617455138662315, + "grad_norm": 0.0001938427158165723, + "learning_rate": 1.4489283790570518e-05, + "loss": 0.0008, + "num_input_tokens_seen": 246244784, + "step": 114125 + }, + { + "epoch": 18.61827079934747, + "grad_norm": 6.282919639488682e-05, + "learning_rate": 1.4472277272153167e-05, + "loss": 0.001, + "num_input_tokens_seen": 246256432, + "step": 114130 + }, + { + "epoch": 18.619086460032626, + "grad_norm": 0.003416488179937005, + "learning_rate": 1.445528059357104e-05, + "loss": 0.0002, + "num_input_tokens_seen": 246267568, + "step": 114135 + }, + { + "epoch": 18.619902120717782, + "grad_norm": 0.001794111798517406, + "learning_rate": 1.4438293755168585e-05, + "loss": 0.0001, + "num_input_tokens_seen": 246276656, + "step": 114140 + }, + { + "epoch": 18.620717781402938, + "grad_norm": 0.00014321469643618912, + "learning_rate": 1.4421316757290082e-05, + "loss": 0.0002, + "num_input_tokens_seen": 246287760, + "step": 114145 + }, + { + "epoch": 18.62153344208809, + "grad_norm": 0.002007300266996026, + "learning_rate": 1.4404349600279642e-05, + "loss": 0.0002, + "num_input_tokens_seen": 246299504, + "step": 114150 + }, + { + "epoch": 18.622349102773246, + "grad_norm": 0.00889118853956461, + "learning_rate": 1.4387392284481049e-05, + "loss": 0.0006, + "num_input_tokens_seen": 246310512, + "step": 114155 + }, + { + "epoch": 18.6231647634584, + "grad_norm": 0.03204870969057083, + "learning_rate": 1.437044481023797e-05, + "loss": 0.08, + "num_input_tokens_seen": 246321744, + "step": 114160 + }, + { + "epoch": 18.623980424143557, + "grad_norm": 0.0008324621594510972, + "learning_rate": 1.4353507177893964e-05, + "loss": 0.0023, + "num_input_tokens_seen": 246332400, + "step": 114165 + }, + { + "epoch": 18.624796084828713, + "grad_norm": 0.0006312076584435999, + "learning_rate": 1.4336579387792148e-05, + "loss": 0.0001, + "num_input_tokens_seen": 246343376, + "step": 114170 + }, + { + "epoch": 18.625611745513865, + "grad_norm": 0.0221247635781765, + "learning_rate": 1.4319661440275689e-05, + "loss": 0.0008, + "num_input_tokens_seen": 246354256, + "step": 114175 + }, + { + "epoch": 18.62642740619902, + "grad_norm": 0.00036081450525671244, + "learning_rate": 1.4302753335687423e-05, + "loss": 0.0003, + "num_input_tokens_seen": 246364848, + "step": 114180 + }, + { + "epoch": 18.627243066884176, + "grad_norm": 0.00029853556770831347, + "learning_rate": 1.4285855074370025e-05, + "loss": 0.0004, + "num_input_tokens_seen": 246377072, + "step": 114185 + }, + { + "epoch": 18.628058727569332, + "grad_norm": 0.0001232485519722104, + "learning_rate": 1.4268966656665938e-05, + "loss": 0.0002, + "num_input_tokens_seen": 246388656, + "step": 114190 + }, + { + "epoch": 18.628874388254488, + "grad_norm": 0.014406480826437473, + "learning_rate": 1.4252088082917391e-05, + "loss": 0.0005, + "num_input_tokens_seen": 246400208, + "step": 114195 + }, + { + "epoch": 18.62969004893964, + "grad_norm": 0.02046489156782627, + "learning_rate": 1.4235219353466555e-05, + "loss": 0.0004, + "num_input_tokens_seen": 246410672, + "step": 114200 + }, + { + "epoch": 18.630505709624796, + "grad_norm": 0.0015816426603123546, + "learning_rate": 1.4218360468655212e-05, + "loss": 0.0176, + "num_input_tokens_seen": 246421136, + "step": 114205 + }, + { + "epoch": 18.63132137030995, + "grad_norm": 0.016574203968048096, + "learning_rate": 1.4201511428824976e-05, + "loss": 0.0005, + "num_input_tokens_seen": 246430480, + "step": 114210 + }, + { + "epoch": 18.632137030995107, + "grad_norm": 0.0032248462084680796, + "learning_rate": 1.4184672234317463e-05, + "loss": 0.0008, + "num_input_tokens_seen": 246441552, + "step": 114215 + }, + { + "epoch": 18.63295269168026, + "grad_norm": 0.009117607958614826, + "learning_rate": 1.4167842885473903e-05, + "loss": 0.0035, + "num_input_tokens_seen": 246453040, + "step": 114220 + }, + { + "epoch": 18.633768352365415, + "grad_norm": 0.0006510632811114192, + "learning_rate": 1.4151023382635298e-05, + "loss": 0.0479, + "num_input_tokens_seen": 246464304, + "step": 114225 + }, + { + "epoch": 18.63458401305057, + "grad_norm": 0.00025970517890527844, + "learning_rate": 1.4134213726142541e-05, + "loss": 0.0046, + "num_input_tokens_seen": 246477072, + "step": 114230 + }, + { + "epoch": 18.635399673735726, + "grad_norm": 0.00031824264442548156, + "learning_rate": 1.4117413916336307e-05, + "loss": 0.0003, + "num_input_tokens_seen": 246487984, + "step": 114235 + }, + { + "epoch": 18.636215334420882, + "grad_norm": 0.0001128049407270737, + "learning_rate": 1.4100623953557045e-05, + "loss": 0.0013, + "num_input_tokens_seen": 246498864, + "step": 114240 + }, + { + "epoch": 18.637030995106034, + "grad_norm": 0.005889858119189739, + "learning_rate": 1.4083843838145095e-05, + "loss": 0.0003, + "num_input_tokens_seen": 246509424, + "step": 114245 + }, + { + "epoch": 18.63784665579119, + "grad_norm": 0.00014588913472834975, + "learning_rate": 1.4067073570440458e-05, + "loss": 0.0003, + "num_input_tokens_seen": 246520944, + "step": 114250 + }, + { + "epoch": 18.638662316476346, + "grad_norm": 0.00015838281251490116, + "learning_rate": 1.4050313150782978e-05, + "loss": 0.0001, + "num_input_tokens_seen": 246531632, + "step": 114255 + }, + { + "epoch": 18.6394779771615, + "grad_norm": 0.003635368775576353, + "learning_rate": 1.4033562579512438e-05, + "loss": 0.0003, + "num_input_tokens_seen": 246542576, + "step": 114260 + }, + { + "epoch": 18.640293637846657, + "grad_norm": 0.0023953872732818127, + "learning_rate": 1.4016821856968232e-05, + "loss": 0.0003, + "num_input_tokens_seen": 246552912, + "step": 114265 + }, + { + "epoch": 18.64110929853181, + "grad_norm": 0.00042845893767662346, + "learning_rate": 1.4000090983489588e-05, + "loss": 0.0003, + "num_input_tokens_seen": 246562960, + "step": 114270 + }, + { + "epoch": 18.641924959216965, + "grad_norm": 0.004912909585982561, + "learning_rate": 1.3983369959415682e-05, + "loss": 0.0004, + "num_input_tokens_seen": 246574224, + "step": 114275 + }, + { + "epoch": 18.64274061990212, + "grad_norm": 0.00020509555179160088, + "learning_rate": 1.3966658785085352e-05, + "loss": 0.0003, + "num_input_tokens_seen": 246585616, + "step": 114280 + }, + { + "epoch": 18.643556280587276, + "grad_norm": 0.0003504542401060462, + "learning_rate": 1.394995746083727e-05, + "loss": 0.0003, + "num_input_tokens_seen": 246595472, + "step": 114285 + }, + { + "epoch": 18.644371941272432, + "grad_norm": 0.02167712152004242, + "learning_rate": 1.3933265987009836e-05, + "loss": 0.0007, + "num_input_tokens_seen": 246606032, + "step": 114290 + }, + { + "epoch": 18.645187601957584, + "grad_norm": 0.000883794273249805, + "learning_rate": 1.3916584363941442e-05, + "loss": 0.0017, + "num_input_tokens_seen": 246617072, + "step": 114295 + }, + { + "epoch": 18.64600326264274, + "grad_norm": 0.0007453064899891615, + "learning_rate": 1.3899912591970099e-05, + "loss": 0.0009, + "num_input_tokens_seen": 246627728, + "step": 114300 + }, + { + "epoch": 18.646818923327896, + "grad_norm": 0.0004139711381867528, + "learning_rate": 1.3883250671433645e-05, + "loss": 0.0008, + "num_input_tokens_seen": 246637168, + "step": 114305 + }, + { + "epoch": 18.64763458401305, + "grad_norm": 0.00027853078790940344, + "learning_rate": 1.3866598602669866e-05, + "loss": 0.0011, + "num_input_tokens_seen": 246647312, + "step": 114310 + }, + { + "epoch": 18.648450244698207, + "grad_norm": 0.0038243611343204975, + "learning_rate": 1.3849956386016049e-05, + "loss": 0.0018, + "num_input_tokens_seen": 246658224, + "step": 114315 + }, + { + "epoch": 18.64926590538336, + "grad_norm": 0.001313136308453977, + "learning_rate": 1.3833324021809756e-05, + "loss": 0.0002, + "num_input_tokens_seen": 246670320, + "step": 114320 + }, + { + "epoch": 18.650081566068515, + "grad_norm": 0.027203811332583427, + "learning_rate": 1.3816701510387775e-05, + "loss": 0.0011, + "num_input_tokens_seen": 246680624, + "step": 114325 + }, + { + "epoch": 18.65089722675367, + "grad_norm": 0.00023463308752980083, + "learning_rate": 1.3800088852087166e-05, + "loss": 0.0003, + "num_input_tokens_seen": 246691952, + "step": 114330 + }, + { + "epoch": 18.651712887438826, + "grad_norm": 0.0008195968111976981, + "learning_rate": 1.3783486047244497e-05, + "loss": 0.0007, + "num_input_tokens_seen": 246704112, + "step": 114335 + }, + { + "epoch": 18.652528548123982, + "grad_norm": 0.005534283816814423, + "learning_rate": 1.3766893096196386e-05, + "loss": 0.0003, + "num_input_tokens_seen": 246714960, + "step": 114340 + }, + { + "epoch": 18.653344208809134, + "grad_norm": 0.022545916959643364, + "learning_rate": 1.3750309999278899e-05, + "loss": 0.0004, + "num_input_tokens_seen": 246725712, + "step": 114345 + }, + { + "epoch": 18.65415986949429, + "grad_norm": 0.003004055470228195, + "learning_rate": 1.373373675682832e-05, + "loss": 0.0017, + "num_input_tokens_seen": 246736528, + "step": 114350 + }, + { + "epoch": 18.654975530179446, + "grad_norm": 0.010261151939630508, + "learning_rate": 1.371717336918038e-05, + "loss": 0.0006, + "num_input_tokens_seen": 246745968, + "step": 114355 + }, + { + "epoch": 18.6557911908646, + "grad_norm": 0.00013710869825445116, + "learning_rate": 1.3700619836670813e-05, + "loss": 0.0004, + "num_input_tokens_seen": 246757424, + "step": 114360 + }, + { + "epoch": 18.656606851549757, + "grad_norm": 0.0002121803699992597, + "learning_rate": 1.3684076159635129e-05, + "loss": 0.0018, + "num_input_tokens_seen": 246767792, + "step": 114365 + }, + { + "epoch": 18.65742251223491, + "grad_norm": 0.004757937975227833, + "learning_rate": 1.3667542338408611e-05, + "loss": 0.0004, + "num_input_tokens_seen": 246779088, + "step": 114370 + }, + { + "epoch": 18.658238172920065, + "grad_norm": 0.015649249777197838, + "learning_rate": 1.3651018373326219e-05, + "loss": 0.0005, + "num_input_tokens_seen": 246789136, + "step": 114375 + }, + { + "epoch": 18.65905383360522, + "grad_norm": 0.0003124087234027684, + "learning_rate": 1.3634504264723013e-05, + "loss": 0.0479, + "num_input_tokens_seen": 246800816, + "step": 114380 + }, + { + "epoch": 18.659869494290376, + "grad_norm": 0.00013174302875995636, + "learning_rate": 1.3618000012933506e-05, + "loss": 0.0006, + "num_input_tokens_seen": 246812016, + "step": 114385 + }, + { + "epoch": 18.660685154975532, + "grad_norm": 0.0006356770754791796, + "learning_rate": 1.3601505618292264e-05, + "loss": 0.0618, + "num_input_tokens_seen": 246822608, + "step": 114390 + }, + { + "epoch": 18.661500815660684, + "grad_norm": 0.02123635821044445, + "learning_rate": 1.3585021081133575e-05, + "loss": 0.0004, + "num_input_tokens_seen": 246833872, + "step": 114395 + }, + { + "epoch": 18.66231647634584, + "grad_norm": 0.00020763315842486918, + "learning_rate": 1.3568546401791449e-05, + "loss": 0.0024, + "num_input_tokens_seen": 246845808, + "step": 114400 + }, + { + "epoch": 18.663132137030995, + "grad_norm": 0.00029170908965170383, + "learning_rate": 1.355208158059984e-05, + "loss": 0.0002, + "num_input_tokens_seen": 246855088, + "step": 114405 + }, + { + "epoch": 18.66394779771615, + "grad_norm": 0.0012112419353798032, + "learning_rate": 1.3535626617892426e-05, + "loss": 0.0005, + "num_input_tokens_seen": 246866896, + "step": 114410 + }, + { + "epoch": 18.664763458401303, + "grad_norm": 0.0015520005254074931, + "learning_rate": 1.3519181514002665e-05, + "loss": 0.0002, + "num_input_tokens_seen": 246878512, + "step": 114415 + }, + { + "epoch": 18.66557911908646, + "grad_norm": 0.027273552492260933, + "learning_rate": 1.3502746269263788e-05, + "loss": 0.0008, + "num_input_tokens_seen": 246889104, + "step": 114420 + }, + { + "epoch": 18.666394779771615, + "grad_norm": 0.30397555232048035, + "learning_rate": 1.3486320884008918e-05, + "loss": 0.0063, + "num_input_tokens_seen": 246900208, + "step": 114425 + }, + { + "epoch": 18.66721044045677, + "grad_norm": 0.00016032405255828053, + "learning_rate": 1.3469905358570956e-05, + "loss": 0.0019, + "num_input_tokens_seen": 246910288, + "step": 114430 + }, + { + "epoch": 18.668026101141926, + "grad_norm": 0.004200716968625784, + "learning_rate": 1.3453499693282633e-05, + "loss": 0.0005, + "num_input_tokens_seen": 246920336, + "step": 114435 + }, + { + "epoch": 18.66884176182708, + "grad_norm": 0.00010748956265160814, + "learning_rate": 1.3437103888476244e-05, + "loss": 0.0002, + "num_input_tokens_seen": 246931344, + "step": 114440 + }, + { + "epoch": 18.669657422512234, + "grad_norm": 0.017067406326532364, + "learning_rate": 1.342071794448435e-05, + "loss": 0.0004, + "num_input_tokens_seen": 246942736, + "step": 114445 + }, + { + "epoch": 18.67047308319739, + "grad_norm": 0.0005001574172638357, + "learning_rate": 1.340434186163869e-05, + "loss": 0.0002, + "num_input_tokens_seen": 246952944, + "step": 114450 + }, + { + "epoch": 18.671288743882545, + "grad_norm": 0.0015025590546429157, + "learning_rate": 1.33879756402715e-05, + "loss": 0.0004, + "num_input_tokens_seen": 246964464, + "step": 114455 + }, + { + "epoch": 18.6721044045677, + "grad_norm": 0.007270271889865398, + "learning_rate": 1.3371619280714175e-05, + "loss": 0.0004, + "num_input_tokens_seen": 246975280, + "step": 114460 + }, + { + "epoch": 18.672920065252853, + "grad_norm": 0.00030608524684794247, + "learning_rate": 1.3355272783298455e-05, + "loss": 0.0065, + "num_input_tokens_seen": 246987056, + "step": 114465 + }, + { + "epoch": 18.67373572593801, + "grad_norm": 0.0003080165188293904, + "learning_rate": 1.3338936148355351e-05, + "loss": 0.0002, + "num_input_tokens_seen": 246998032, + "step": 114470 + }, + { + "epoch": 18.674551386623165, + "grad_norm": 0.028972506523132324, + "learning_rate": 1.3322609376216155e-05, + "loss": 0.0013, + "num_input_tokens_seen": 247009904, + "step": 114475 + }, + { + "epoch": 18.67536704730832, + "grad_norm": 0.0001130843666032888, + "learning_rate": 1.33062924672116e-05, + "loss": 0.0007, + "num_input_tokens_seen": 247019472, + "step": 114480 + }, + { + "epoch": 18.676182707993476, + "grad_norm": 0.0014309418620541692, + "learning_rate": 1.3289985421672534e-05, + "loss": 0.0003, + "num_input_tokens_seen": 247030192, + "step": 114485 + }, + { + "epoch": 18.67699836867863, + "grad_norm": 0.0005481012631207705, + "learning_rate": 1.3273688239929248e-05, + "loss": 0.0001, + "num_input_tokens_seen": 247040656, + "step": 114490 + }, + { + "epoch": 18.677814029363784, + "grad_norm": 0.004359756596386433, + "learning_rate": 1.3257400922312258e-05, + "loss": 0.0046, + "num_input_tokens_seen": 247052336, + "step": 114495 + }, + { + "epoch": 18.67862969004894, + "grad_norm": 0.00014204307808540761, + "learning_rate": 1.3241123469151406e-05, + "loss": 0.0002, + "num_input_tokens_seen": 247062576, + "step": 114500 + }, + { + "epoch": 18.679445350734095, + "grad_norm": 0.009868775494396687, + "learning_rate": 1.322485588077671e-05, + "loss": 0.0009, + "num_input_tokens_seen": 247072976, + "step": 114505 + }, + { + "epoch": 18.68026101141925, + "grad_norm": 0.0021861260756850243, + "learning_rate": 1.3208598157517849e-05, + "loss": 0.0002, + "num_input_tokens_seen": 247083696, + "step": 114510 + }, + { + "epoch": 18.681076672104403, + "grad_norm": 0.0001000143529381603, + "learning_rate": 1.3192350299704225e-05, + "loss": 0.0072, + "num_input_tokens_seen": 247093808, + "step": 114515 + }, + { + "epoch": 18.68189233278956, + "grad_norm": 0.0002090293710352853, + "learning_rate": 1.3176112307665245e-05, + "loss": 0.0002, + "num_input_tokens_seen": 247103888, + "step": 114520 + }, + { + "epoch": 18.682707993474715, + "grad_norm": 0.0008344078669324517, + "learning_rate": 1.315988418172992e-05, + "loss": 0.0021, + "num_input_tokens_seen": 247115664, + "step": 114525 + }, + { + "epoch": 18.68352365415987, + "grad_norm": 0.0003252045135013759, + "learning_rate": 1.3143665922227155e-05, + "loss": 0.0002, + "num_input_tokens_seen": 247126192, + "step": 114530 + }, + { + "epoch": 18.684339314845026, + "grad_norm": 0.00018119774176739156, + "learning_rate": 1.3127457529485576e-05, + "loss": 0.0004, + "num_input_tokens_seen": 247136880, + "step": 114535 + }, + { + "epoch": 18.68515497553018, + "grad_norm": 9.592909191269428e-05, + "learning_rate": 1.3111259003833753e-05, + "loss": 0.0008, + "num_input_tokens_seen": 247149424, + "step": 114540 + }, + { + "epoch": 18.685970636215334, + "grad_norm": 0.00012704270193353295, + "learning_rate": 1.3095070345599924e-05, + "loss": 0.0006, + "num_input_tokens_seen": 247159600, + "step": 114545 + }, + { + "epoch": 18.68678629690049, + "grad_norm": 0.019536789506673813, + "learning_rate": 1.3078891555112161e-05, + "loss": 0.0023, + "num_input_tokens_seen": 247170992, + "step": 114550 + }, + { + "epoch": 18.687601957585645, + "grad_norm": 0.0032452100422233343, + "learning_rate": 1.306272263269831e-05, + "loss": 0.0013, + "num_input_tokens_seen": 247181808, + "step": 114555 + }, + { + "epoch": 18.6884176182708, + "grad_norm": 0.03559986501932144, + "learning_rate": 1.3046563578686222e-05, + "loss": 0.0026, + "num_input_tokens_seen": 247193232, + "step": 114560 + }, + { + "epoch": 18.689233278955953, + "grad_norm": 0.0530209019780159, + "learning_rate": 1.303041439340319e-05, + "loss": 0.0007, + "num_input_tokens_seen": 247203568, + "step": 114565 + }, + { + "epoch": 18.69004893964111, + "grad_norm": 0.00014043497503735125, + "learning_rate": 1.3014275077176618e-05, + "loss": 0.0028, + "num_input_tokens_seen": 247215216, + "step": 114570 + }, + { + "epoch": 18.690864600326265, + "grad_norm": 0.0001132222605519928, + "learning_rate": 1.2998145630333469e-05, + "loss": 0.0008, + "num_input_tokens_seen": 247225072, + "step": 114575 + }, + { + "epoch": 18.69168026101142, + "grad_norm": 0.023613497614860535, + "learning_rate": 1.2982026053200813e-05, + "loss": 0.0018, + "num_input_tokens_seen": 247234864, + "step": 114580 + }, + { + "epoch": 18.692495921696576, + "grad_norm": 0.002976249670609832, + "learning_rate": 1.2965916346105166e-05, + "loss": 0.0001, + "num_input_tokens_seen": 247246192, + "step": 114585 + }, + { + "epoch": 18.693311582381728, + "grad_norm": 0.010633801110088825, + "learning_rate": 1.2949816509373102e-05, + "loss": 0.0026, + "num_input_tokens_seen": 247257488, + "step": 114590 + }, + { + "epoch": 18.694127243066884, + "grad_norm": 0.003944421652704477, + "learning_rate": 1.2933726543330804e-05, + "loss": 0.1379, + "num_input_tokens_seen": 247268240, + "step": 114595 + }, + { + "epoch": 18.69494290375204, + "grad_norm": 0.00021685218962375075, + "learning_rate": 1.2917646448304509e-05, + "loss": 0.0016, + "num_input_tokens_seen": 247280336, + "step": 114600 + }, + { + "epoch": 18.695758564437195, + "grad_norm": 0.0017565287416800857, + "learning_rate": 1.2901576224619959e-05, + "loss": 0.0003, + "num_input_tokens_seen": 247290928, + "step": 114605 + }, + { + "epoch": 18.696574225122347, + "grad_norm": 0.0035854957532137632, + "learning_rate": 1.2885515872602949e-05, + "loss": 0.0002, + "num_input_tokens_seen": 247301904, + "step": 114610 + }, + { + "epoch": 18.697389885807503, + "grad_norm": 0.0004449295811355114, + "learning_rate": 1.2869465392578828e-05, + "loss": 0.0006, + "num_input_tokens_seen": 247312976, + "step": 114615 + }, + { + "epoch": 18.69820554649266, + "grad_norm": 0.10363830626010895, + "learning_rate": 1.2853424784873059e-05, + "loss": 0.004, + "num_input_tokens_seen": 247323632, + "step": 114620 + }, + { + "epoch": 18.699021207177815, + "grad_norm": 0.0008755187736824155, + "learning_rate": 1.2837394049810547e-05, + "loss": 0.0003, + "num_input_tokens_seen": 247334704, + "step": 114625 + }, + { + "epoch": 18.69983686786297, + "grad_norm": 0.07996739447116852, + "learning_rate": 1.2821373187716311e-05, + "loss": 0.0017, + "num_input_tokens_seen": 247345200, + "step": 114630 + }, + { + "epoch": 18.700652528548122, + "grad_norm": 0.002408280037343502, + "learning_rate": 1.2805362198914872e-05, + "loss": 0.0013, + "num_input_tokens_seen": 247355056, + "step": 114635 + }, + { + "epoch": 18.701468189233278, + "grad_norm": 0.00031723087886348367, + "learning_rate": 1.2789361083730911e-05, + "loss": 0.0004, + "num_input_tokens_seen": 247366480, + "step": 114640 + }, + { + "epoch": 18.702283849918434, + "grad_norm": 0.0001380786852678284, + "learning_rate": 1.2773369842488614e-05, + "loss": 0.0005, + "num_input_tokens_seen": 247376880, + "step": 114645 + }, + { + "epoch": 18.70309951060359, + "grad_norm": 0.002176763257011771, + "learning_rate": 1.2757388475512055e-05, + "loss": 0.0017, + "num_input_tokens_seen": 247388112, + "step": 114650 + }, + { + "epoch": 18.703915171288745, + "grad_norm": 0.03151402249932289, + "learning_rate": 1.2741416983125143e-05, + "loss": 0.0006, + "num_input_tokens_seen": 247399728, + "step": 114655 + }, + { + "epoch": 18.704730831973897, + "grad_norm": 0.0006864515016786754, + "learning_rate": 1.2725455365651507e-05, + "loss": 0.0027, + "num_input_tokens_seen": 247411760, + "step": 114660 + }, + { + "epoch": 18.705546492659053, + "grad_norm": 0.0004300368600524962, + "learning_rate": 1.270950362341472e-05, + "loss": 0.0012, + "num_input_tokens_seen": 247424176, + "step": 114665 + }, + { + "epoch": 18.70636215334421, + "grad_norm": 0.00943394098430872, + "learning_rate": 1.269356175673797e-05, + "loss": 0.0005, + "num_input_tokens_seen": 247434704, + "step": 114670 + }, + { + "epoch": 18.707177814029365, + "grad_norm": 0.00015163994976319373, + "learning_rate": 1.2677629765944387e-05, + "loss": 0.0006, + "num_input_tokens_seen": 247447216, + "step": 114675 + }, + { + "epoch": 18.70799347471452, + "grad_norm": 0.033136021345853806, + "learning_rate": 1.266170765135688e-05, + "loss": 0.0016, + "num_input_tokens_seen": 247458192, + "step": 114680 + }, + { + "epoch": 18.708809135399672, + "grad_norm": 0.011512527242302895, + "learning_rate": 1.2645795413298078e-05, + "loss": 0.001, + "num_input_tokens_seen": 247469040, + "step": 114685 + }, + { + "epoch": 18.709624796084828, + "grad_norm": 0.05417841300368309, + "learning_rate": 1.2629893052090502e-05, + "loss": 0.0008, + "num_input_tokens_seen": 247481104, + "step": 114690 + }, + { + "epoch": 18.710440456769984, + "grad_norm": 0.0003318540984764695, + "learning_rate": 1.2614000568056395e-05, + "loss": 0.0002, + "num_input_tokens_seen": 247491600, + "step": 114695 + }, + { + "epoch": 18.71125611745514, + "grad_norm": 0.0007666782476007938, + "learning_rate": 1.259811796151783e-05, + "loss": 0.0009, + "num_input_tokens_seen": 247503568, + "step": 114700 + }, + { + "epoch": 18.712071778140295, + "grad_norm": 0.00015107820217963308, + "learning_rate": 1.258224523279683e-05, + "loss": 0.0001, + "num_input_tokens_seen": 247514800, + "step": 114705 + }, + { + "epoch": 18.712887438825447, + "grad_norm": 0.007576937787234783, + "learning_rate": 1.2566382382214859e-05, + "loss": 0.0002, + "num_input_tokens_seen": 247525424, + "step": 114710 + }, + { + "epoch": 18.713703099510603, + "grad_norm": 0.017279163002967834, + "learning_rate": 1.2550529410093548e-05, + "loss": 0.001, + "num_input_tokens_seen": 247537232, + "step": 114715 + }, + { + "epoch": 18.71451876019576, + "grad_norm": 0.0032858445774763823, + "learning_rate": 1.2534686316754085e-05, + "loss": 0.0002, + "num_input_tokens_seen": 247548368, + "step": 114720 + }, + { + "epoch": 18.715334420880914, + "grad_norm": 0.002145764883607626, + "learning_rate": 1.2518853102517657e-05, + "loss": 0.0003, + "num_input_tokens_seen": 247558416, + "step": 114725 + }, + { + "epoch": 18.71615008156607, + "grad_norm": 0.007384400349110365, + "learning_rate": 1.250302976770501e-05, + "loss": 0.0015, + "num_input_tokens_seen": 247568304, + "step": 114730 + }, + { + "epoch": 18.716965742251222, + "grad_norm": 0.007147055119276047, + "learning_rate": 1.248721631263705e-05, + "loss": 0.0005, + "num_input_tokens_seen": 247579888, + "step": 114735 + }, + { + "epoch": 18.717781402936378, + "grad_norm": 0.007985968142747879, + "learning_rate": 1.2471412737633914e-05, + "loss": 0.0007, + "num_input_tokens_seen": 247592336, + "step": 114740 + }, + { + "epoch": 18.718597063621534, + "grad_norm": 0.0001928783458424732, + "learning_rate": 1.2455619043016175e-05, + "loss": 0.0011, + "num_input_tokens_seen": 247604016, + "step": 114745 + }, + { + "epoch": 18.71941272430669, + "grad_norm": 0.16357722878456116, + "learning_rate": 1.2439835229103803e-05, + "loss": 0.0021, + "num_input_tokens_seen": 247613872, + "step": 114750 + }, + { + "epoch": 18.72022838499184, + "grad_norm": 0.0014038957888260484, + "learning_rate": 1.242406129621665e-05, + "loss": 0.0003, + "num_input_tokens_seen": 247624784, + "step": 114755 + }, + { + "epoch": 18.721044045676997, + "grad_norm": 0.026683393865823746, + "learning_rate": 1.240829724467446e-05, + "loss": 0.0012, + "num_input_tokens_seen": 247636048, + "step": 114760 + }, + { + "epoch": 18.721859706362153, + "grad_norm": 0.00015764446288812906, + "learning_rate": 1.2392543074796702e-05, + "loss": 0.0056, + "num_input_tokens_seen": 247647248, + "step": 114765 + }, + { + "epoch": 18.72267536704731, + "grad_norm": 0.0007419177563861012, + "learning_rate": 1.2376798786902621e-05, + "loss": 0.0002, + "num_input_tokens_seen": 247658320, + "step": 114770 + }, + { + "epoch": 18.723491027732464, + "grad_norm": 0.0014033725019544363, + "learning_rate": 1.2361064381311293e-05, + "loss": 0.0001, + "num_input_tokens_seen": 247668176, + "step": 114775 + }, + { + "epoch": 18.724306688417617, + "grad_norm": 0.0004770663217641413, + "learning_rate": 1.2345339858341576e-05, + "loss": 0.0004, + "num_input_tokens_seen": 247678384, + "step": 114780 + }, + { + "epoch": 18.725122349102772, + "grad_norm": 0.0004608016461133957, + "learning_rate": 1.2329625218312213e-05, + "loss": 0.0072, + "num_input_tokens_seen": 247690288, + "step": 114785 + }, + { + "epoch": 18.725938009787928, + "grad_norm": 0.00033010277547873557, + "learning_rate": 1.2313920461541672e-05, + "loss": 0.0001, + "num_input_tokens_seen": 247700368, + "step": 114790 + }, + { + "epoch": 18.726753670473084, + "grad_norm": 0.0015485156327486038, + "learning_rate": 1.22982255883482e-05, + "loss": 0.0001, + "num_input_tokens_seen": 247711824, + "step": 114795 + }, + { + "epoch": 18.72756933115824, + "grad_norm": 0.0023706944193691015, + "learning_rate": 1.2282540599049873e-05, + "loss": 0.0004, + "num_input_tokens_seen": 247722672, + "step": 114800 + }, + { + "epoch": 18.72838499184339, + "grad_norm": 0.011705518700182438, + "learning_rate": 1.2266865493964551e-05, + "loss": 0.0496, + "num_input_tokens_seen": 247734192, + "step": 114805 + }, + { + "epoch": 18.729200652528547, + "grad_norm": 0.00022848584922030568, + "learning_rate": 1.2251200273409923e-05, + "loss": 0.0003, + "num_input_tokens_seen": 247744368, + "step": 114810 + }, + { + "epoch": 18.730016313213703, + "grad_norm": 0.01750882901251316, + "learning_rate": 1.2235544937703513e-05, + "loss": 0.0013, + "num_input_tokens_seen": 247754064, + "step": 114815 + }, + { + "epoch": 18.73083197389886, + "grad_norm": 0.015057169832289219, + "learning_rate": 1.2219899487162567e-05, + "loss": 0.0007, + "num_input_tokens_seen": 247764720, + "step": 114820 + }, + { + "epoch": 18.731647634584014, + "grad_norm": 0.0019125000108033419, + "learning_rate": 1.2204263922104108e-05, + "loss": 0.0029, + "num_input_tokens_seen": 247774576, + "step": 114825 + }, + { + "epoch": 18.732463295269167, + "grad_norm": 0.00011809881834778935, + "learning_rate": 1.2188638242845108e-05, + "loss": 0.0008, + "num_input_tokens_seen": 247785360, + "step": 114830 + }, + { + "epoch": 18.733278955954322, + "grad_norm": 0.04285113885998726, + "learning_rate": 1.2173022449702142e-05, + "loss": 0.0017, + "num_input_tokens_seen": 247796784, + "step": 114835 + }, + { + "epoch": 18.734094616639478, + "grad_norm": 0.00012305729615036398, + "learning_rate": 1.215741654299174e-05, + "loss": 0.0002, + "num_input_tokens_seen": 247807024, + "step": 114840 + }, + { + "epoch": 18.734910277324634, + "grad_norm": 0.002029049675911665, + "learning_rate": 1.214182052303009e-05, + "loss": 0.0058, + "num_input_tokens_seen": 247817712, + "step": 114845 + }, + { + "epoch": 18.73572593800979, + "grad_norm": 0.0019563096575438976, + "learning_rate": 1.2126234390133439e-05, + "loss": 0.0003, + "num_input_tokens_seen": 247828720, + "step": 114850 + }, + { + "epoch": 18.73654159869494, + "grad_norm": 0.00483985198661685, + "learning_rate": 1.2110658144617538e-05, + "loss": 0.016, + "num_input_tokens_seen": 247839504, + "step": 114855 + }, + { + "epoch": 18.737357259380097, + "grad_norm": 0.02257121354341507, + "learning_rate": 1.2095091786798074e-05, + "loss": 0.0007, + "num_input_tokens_seen": 247850320, + "step": 114860 + }, + { + "epoch": 18.738172920065253, + "grad_norm": 0.00011557836114661768, + "learning_rate": 1.207953531699052e-05, + "loss": 0.0004, + "num_input_tokens_seen": 247860912, + "step": 114865 + }, + { + "epoch": 18.73898858075041, + "grad_norm": 0.003206391353160143, + "learning_rate": 1.206398873551018e-05, + "loss": 0.0004, + "num_input_tokens_seen": 247872112, + "step": 114870 + }, + { + "epoch": 18.739804241435564, + "grad_norm": 0.02100180648267269, + "learning_rate": 1.2048452042672075e-05, + "loss": 0.0004, + "num_input_tokens_seen": 247883056, + "step": 114875 + }, + { + "epoch": 18.740619902120716, + "grad_norm": 0.0007993963663466275, + "learning_rate": 1.2032925238791071e-05, + "loss": 0.0003, + "num_input_tokens_seen": 247893776, + "step": 114880 + }, + { + "epoch": 18.741435562805872, + "grad_norm": 0.010476021096110344, + "learning_rate": 1.2017408324181911e-05, + "loss": 0.0005, + "num_input_tokens_seen": 247904176, + "step": 114885 + }, + { + "epoch": 18.742251223491028, + "grad_norm": 0.015390019863843918, + "learning_rate": 1.2001901299159013e-05, + "loss": 0.0004, + "num_input_tokens_seen": 247913904, + "step": 114890 + }, + { + "epoch": 18.743066884176184, + "grad_norm": 0.0010978586506098509, + "learning_rate": 1.1986404164036679e-05, + "loss": 0.002, + "num_input_tokens_seen": 247924624, + "step": 114895 + }, + { + "epoch": 18.74388254486134, + "grad_norm": 0.00046196600305847824, + "learning_rate": 1.1970916919128937e-05, + "loss": 0.0007, + "num_input_tokens_seen": 247935568, + "step": 114900 + }, + { + "epoch": 18.74469820554649, + "grad_norm": 0.0025275114458054304, + "learning_rate": 1.1955439564749649e-05, + "loss": 0.0009, + "num_input_tokens_seen": 247947056, + "step": 114905 + }, + { + "epoch": 18.745513866231647, + "grad_norm": 0.0009774677455425262, + "learning_rate": 1.1939972101212503e-05, + "loss": 0.0008, + "num_input_tokens_seen": 247958448, + "step": 114910 + }, + { + "epoch": 18.746329526916803, + "grad_norm": 0.0004529117722995579, + "learning_rate": 1.1924514528831032e-05, + "loss": 0.0002, + "num_input_tokens_seen": 247968816, + "step": 114915 + }, + { + "epoch": 18.74714518760196, + "grad_norm": 0.0030062783043831587, + "learning_rate": 1.190906684791837e-05, + "loss": 0.0522, + "num_input_tokens_seen": 247978768, + "step": 114920 + }, + { + "epoch": 18.747960848287114, + "grad_norm": 0.0011736209271475673, + "learning_rate": 1.1893629058787714e-05, + "loss": 0.0002, + "num_input_tokens_seen": 247990448, + "step": 114925 + }, + { + "epoch": 18.748776508972266, + "grad_norm": 0.13988733291625977, + "learning_rate": 1.187820116175181e-05, + "loss": 0.0039, + "num_input_tokens_seen": 248002448, + "step": 114930 + }, + { + "epoch": 18.749592169657422, + "grad_norm": 0.005715536884963512, + "learning_rate": 1.1862783157123413e-05, + "loss": 0.0003, + "num_input_tokens_seen": 248010192, + "step": 114935 + }, + { + "epoch": 18.750407830342578, + "grad_norm": 0.016355089843273163, + "learning_rate": 1.1847375045214992e-05, + "loss": 0.0007, + "num_input_tokens_seen": 248019600, + "step": 114940 + }, + { + "epoch": 18.751223491027734, + "grad_norm": 0.00010598308290354908, + "learning_rate": 1.1831976826338742e-05, + "loss": 0.0006, + "num_input_tokens_seen": 248030800, + "step": 114945 + }, + { + "epoch": 18.752039151712886, + "grad_norm": 0.00014388456474989653, + "learning_rate": 1.1816588500806802e-05, + "loss": 0.0005, + "num_input_tokens_seen": 248040464, + "step": 114950 + }, + { + "epoch": 18.75285481239804, + "grad_norm": 0.0010162570979446173, + "learning_rate": 1.1801210068930923e-05, + "loss": 0.0007, + "num_input_tokens_seen": 248049872, + "step": 114955 + }, + { + "epoch": 18.753670473083197, + "grad_norm": 0.00019280660490039736, + "learning_rate": 1.1785841531022968e-05, + "loss": 0.0018, + "num_input_tokens_seen": 248060912, + "step": 114960 + }, + { + "epoch": 18.754486133768353, + "grad_norm": 0.022366119548678398, + "learning_rate": 1.177048288739413e-05, + "loss": 0.0006, + "num_input_tokens_seen": 248070384, + "step": 114965 + }, + { + "epoch": 18.75530179445351, + "grad_norm": 0.00016436123405583203, + "learning_rate": 1.1755134138355995e-05, + "loss": 0.0002, + "num_input_tokens_seen": 248081680, + "step": 114970 + }, + { + "epoch": 18.75611745513866, + "grad_norm": 0.0002143423043889925, + "learning_rate": 1.1739795284219256e-05, + "loss": 0.0002, + "num_input_tokens_seen": 248092592, + "step": 114975 + }, + { + "epoch": 18.756933115823816, + "grad_norm": 0.0006554779247380793, + "learning_rate": 1.172446632529517e-05, + "loss": 0.0011, + "num_input_tokens_seen": 248103728, + "step": 114980 + }, + { + "epoch": 18.757748776508972, + "grad_norm": 0.00021207697864156216, + "learning_rate": 1.1709147261894037e-05, + "loss": 0.0001, + "num_input_tokens_seen": 248115280, + "step": 114985 + }, + { + "epoch": 18.758564437194128, + "grad_norm": 0.0009598220349289477, + "learning_rate": 1.1693838094326502e-05, + "loss": 0.001, + "num_input_tokens_seen": 248126960, + "step": 114990 + }, + { + "epoch": 18.759380097879284, + "grad_norm": 0.00028253268101252615, + "learning_rate": 1.1678538822902817e-05, + "loss": 0.0001, + "num_input_tokens_seen": 248138416, + "step": 114995 + }, + { + "epoch": 18.760195758564436, + "grad_norm": 0.0008653725381009281, + "learning_rate": 1.1663249447933067e-05, + "loss": 0.0021, + "num_input_tokens_seen": 248149328, + "step": 115000 + }, + { + "epoch": 18.76101141924959, + "grad_norm": 0.0009756143554113805, + "learning_rate": 1.1647969969727e-05, + "loss": 0.0002, + "num_input_tokens_seen": 248159920, + "step": 115005 + }, + { + "epoch": 18.761827079934747, + "grad_norm": 0.00043027338688261807, + "learning_rate": 1.1632700388594375e-05, + "loss": 0.0004, + "num_input_tokens_seen": 248172176, + "step": 115010 + }, + { + "epoch": 18.762642740619903, + "grad_norm": 0.0007359184673987329, + "learning_rate": 1.1617440704844661e-05, + "loss": 0.0005, + "num_input_tokens_seen": 248183216, + "step": 115015 + }, + { + "epoch": 18.76345840130506, + "grad_norm": 0.0001405288785463199, + "learning_rate": 1.1602190918787004e-05, + "loss": 0.0015, + "num_input_tokens_seen": 248194064, + "step": 115020 + }, + { + "epoch": 18.76427406199021, + "grad_norm": 0.00012149715621490031, + "learning_rate": 1.1586951030730542e-05, + "loss": 0.0047, + "num_input_tokens_seen": 248205072, + "step": 115025 + }, + { + "epoch": 18.765089722675366, + "grad_norm": 0.000864771893247962, + "learning_rate": 1.1571721040984084e-05, + "loss": 0.0021, + "num_input_tokens_seen": 248217264, + "step": 115030 + }, + { + "epoch": 18.765905383360522, + "grad_norm": 9.966091602109373e-05, + "learning_rate": 1.1556500949856386e-05, + "loss": 0.0004, + "num_input_tokens_seen": 248228528, + "step": 115035 + }, + { + "epoch": 18.766721044045678, + "grad_norm": 0.05634091794490814, + "learning_rate": 1.1541290757655754e-05, + "loss": 0.0028, + "num_input_tokens_seen": 248240240, + "step": 115040 + }, + { + "epoch": 18.767536704730833, + "grad_norm": 8.024124690564349e-05, + "learning_rate": 1.1526090464690553e-05, + "loss": 0.0006, + "num_input_tokens_seen": 248250448, + "step": 115045 + }, + { + "epoch": 18.768352365415986, + "grad_norm": 0.005807225126773119, + "learning_rate": 1.1510900071268815e-05, + "loss": 0.0008, + "num_input_tokens_seen": 248261040, + "step": 115050 + }, + { + "epoch": 18.76916802610114, + "grad_norm": 0.005371989216655493, + "learning_rate": 1.149571957769835e-05, + "loss": 0.0005, + "num_input_tokens_seen": 248270576, + "step": 115055 + }, + { + "epoch": 18.769983686786297, + "grad_norm": 0.00045483189751394093, + "learning_rate": 1.1480548984286853e-05, + "loss": 0.0002, + "num_input_tokens_seen": 248281008, + "step": 115060 + }, + { + "epoch": 18.770799347471453, + "grad_norm": 0.0006490772357210517, + "learning_rate": 1.1465388291341804e-05, + "loss": 0.006, + "num_input_tokens_seen": 248291280, + "step": 115065 + }, + { + "epoch": 18.77161500815661, + "grad_norm": 0.0039050534833222628, + "learning_rate": 1.145023749917029e-05, + "loss": 0.0004, + "num_input_tokens_seen": 248301968, + "step": 115070 + }, + { + "epoch": 18.77243066884176, + "grad_norm": 0.011832404881715775, + "learning_rate": 1.143509660807962e-05, + "loss": 0.0005, + "num_input_tokens_seen": 248312720, + "step": 115075 + }, + { + "epoch": 18.773246329526916, + "grad_norm": 0.005738573148846626, + "learning_rate": 1.1419965618376383e-05, + "loss": 0.0016, + "num_input_tokens_seen": 248322992, + "step": 115080 + }, + { + "epoch": 18.774061990212072, + "grad_norm": 0.00042150355875492096, + "learning_rate": 1.1404844530367498e-05, + "loss": 0.0044, + "num_input_tokens_seen": 248334224, + "step": 115085 + }, + { + "epoch": 18.774877650897228, + "grad_norm": 0.0014195777475833893, + "learning_rate": 1.138973334435911e-05, + "loss": 0.0003, + "num_input_tokens_seen": 248345104, + "step": 115090 + }, + { + "epoch": 18.775693311582383, + "grad_norm": 0.004056460689753294, + "learning_rate": 1.1374632060657753e-05, + "loss": 0.0004, + "num_input_tokens_seen": 248356048, + "step": 115095 + }, + { + "epoch": 18.776508972267536, + "grad_norm": 0.006195846479386091, + "learning_rate": 1.1359540679569236e-05, + "loss": 0.0005, + "num_input_tokens_seen": 248365712, + "step": 115100 + }, + { + "epoch": 18.77732463295269, + "grad_norm": 0.014896090142428875, + "learning_rate": 1.1344459201399592e-05, + "loss": 0.0011, + "num_input_tokens_seen": 248377136, + "step": 115105 + }, + { + "epoch": 18.778140293637847, + "grad_norm": 0.006751941051334143, + "learning_rate": 1.1329387626454358e-05, + "loss": 0.0011, + "num_input_tokens_seen": 248386896, + "step": 115110 + }, + { + "epoch": 18.778955954323003, + "grad_norm": 0.002600416075438261, + "learning_rate": 1.1314325955039007e-05, + "loss": 0.0003, + "num_input_tokens_seen": 248398640, + "step": 115115 + }, + { + "epoch": 18.77977161500816, + "grad_norm": 0.0001328575162915513, + "learning_rate": 1.1299274187458741e-05, + "loss": 0.0006, + "num_input_tokens_seen": 248409744, + "step": 115120 + }, + { + "epoch": 18.78058727569331, + "grad_norm": 0.00016118235362228006, + "learning_rate": 1.1284232324018761e-05, + "loss": 0.0019, + "num_input_tokens_seen": 248420688, + "step": 115125 + }, + { + "epoch": 18.781402936378466, + "grad_norm": 0.001956700813025236, + "learning_rate": 1.1269200365023657e-05, + "loss": 0.0016, + "num_input_tokens_seen": 248430960, + "step": 115130 + }, + { + "epoch": 18.782218597063622, + "grad_norm": 0.010709714144468307, + "learning_rate": 1.125417831077824e-05, + "loss": 0.0035, + "num_input_tokens_seen": 248441936, + "step": 115135 + }, + { + "epoch": 18.783034257748778, + "grad_norm": 0.001223007682710886, + "learning_rate": 1.1239166161586933e-05, + "loss": 0.0012, + "num_input_tokens_seen": 248452080, + "step": 115140 + }, + { + "epoch": 18.78384991843393, + "grad_norm": 0.0001316919515375048, + "learning_rate": 1.1224163917753993e-05, + "loss": 0.0001, + "num_input_tokens_seen": 248461328, + "step": 115145 + }, + { + "epoch": 18.784665579119086, + "grad_norm": 0.014832563698291779, + "learning_rate": 1.1209171579583399e-05, + "loss": 0.0009, + "num_input_tokens_seen": 248472912, + "step": 115150 + }, + { + "epoch": 18.78548123980424, + "grad_norm": 0.000773149193264544, + "learning_rate": 1.1194189147379018e-05, + "loss": 0.0003, + "num_input_tokens_seen": 248485424, + "step": 115155 + }, + { + "epoch": 18.786296900489397, + "grad_norm": 0.00024225556990131736, + "learning_rate": 1.1179216621444499e-05, + "loss": 0.0007, + "num_input_tokens_seen": 248496688, + "step": 115160 + }, + { + "epoch": 18.787112561174553, + "grad_norm": 0.15521162748336792, + "learning_rate": 1.1164254002083262e-05, + "loss": 0.0031, + "num_input_tokens_seen": 248508208, + "step": 115165 + }, + { + "epoch": 18.787928221859705, + "grad_norm": 0.00017205023323185742, + "learning_rate": 1.1149301289598569e-05, + "loss": 0.0003, + "num_input_tokens_seen": 248518672, + "step": 115170 + }, + { + "epoch": 18.78874388254486, + "grad_norm": 0.00143424142152071, + "learning_rate": 1.1134358484293395e-05, + "loss": 0.0002, + "num_input_tokens_seen": 248529680, + "step": 115175 + }, + { + "epoch": 18.789559543230016, + "grad_norm": 0.08642268925905228, + "learning_rate": 1.1119425586470667e-05, + "loss": 0.0019, + "num_input_tokens_seen": 248540720, + "step": 115180 + }, + { + "epoch": 18.790375203915172, + "grad_norm": 0.14971041679382324, + "learning_rate": 1.1104502596432863e-05, + "loss": 0.0016, + "num_input_tokens_seen": 248551568, + "step": 115185 + }, + { + "epoch": 18.791190864600328, + "grad_norm": 0.0008267206139862537, + "learning_rate": 1.1089589514482635e-05, + "loss": 0.0005, + "num_input_tokens_seen": 248563248, + "step": 115190 + }, + { + "epoch": 18.79200652528548, + "grad_norm": 0.027315234765410423, + "learning_rate": 1.1074686340922068e-05, + "loss": 0.0021, + "num_input_tokens_seen": 248574736, + "step": 115195 + }, + { + "epoch": 18.792822185970635, + "grad_norm": 0.0018830314511433244, + "learning_rate": 1.105979307605326e-05, + "loss": 0.0008, + "num_input_tokens_seen": 248585168, + "step": 115200 + }, + { + "epoch": 18.79363784665579, + "grad_norm": 0.07751666754484177, + "learning_rate": 1.104490972017791e-05, + "loss": 0.0011, + "num_input_tokens_seen": 248595472, + "step": 115205 + }, + { + "epoch": 18.794453507340947, + "grad_norm": 0.0641179159283638, + "learning_rate": 1.1030036273597888e-05, + "loss": 0.0012, + "num_input_tokens_seen": 248607024, + "step": 115210 + }, + { + "epoch": 18.795269168026103, + "grad_norm": 0.01836971752345562, + "learning_rate": 1.1015172736614343e-05, + "loss": 0.002, + "num_input_tokens_seen": 248618160, + "step": 115215 + }, + { + "epoch": 18.796084828711255, + "grad_norm": 0.00017557290266267955, + "learning_rate": 1.1000319109528755e-05, + "loss": 0.0008, + "num_input_tokens_seen": 248629424, + "step": 115220 + }, + { + "epoch": 18.79690048939641, + "grad_norm": 0.0030864253640174866, + "learning_rate": 1.0985475392641941e-05, + "loss": 0.0002, + "num_input_tokens_seen": 248640208, + "step": 115225 + }, + { + "epoch": 18.797716150081566, + "grad_norm": 0.00014531533815898, + "learning_rate": 1.0970641586254937e-05, + "loss": 0.0005, + "num_input_tokens_seen": 248652272, + "step": 115230 + }, + { + "epoch": 18.798531810766722, + "grad_norm": 0.0011381496442481875, + "learning_rate": 1.0955817690668169e-05, + "loss": 0.0017, + "num_input_tokens_seen": 248663888, + "step": 115235 + }, + { + "epoch": 18.799347471451878, + "grad_norm": 0.004792896565049887, + "learning_rate": 1.094100370618223e-05, + "loss": 0.0042, + "num_input_tokens_seen": 248673712, + "step": 115240 + }, + { + "epoch": 18.80016313213703, + "grad_norm": 0.23594005405902863, + "learning_rate": 1.0926199633097156e-05, + "loss": 0.0052, + "num_input_tokens_seen": 248686416, + "step": 115245 + }, + { + "epoch": 18.800978792822185, + "grad_norm": 0.03721102699637413, + "learning_rate": 1.091140547171321e-05, + "loss": 0.0017, + "num_input_tokens_seen": 248697872, + "step": 115250 + }, + { + "epoch": 18.80179445350734, + "grad_norm": 0.0001786552311386913, + "learning_rate": 1.0896621222329983e-05, + "loss": 0.0006, + "num_input_tokens_seen": 248709360, + "step": 115255 + }, + { + "epoch": 18.802610114192497, + "grad_norm": 0.0001221275597345084, + "learning_rate": 1.0881846885247293e-05, + "loss": 0.0092, + "num_input_tokens_seen": 248721168, + "step": 115260 + }, + { + "epoch": 18.803425774877653, + "grad_norm": 0.00020840838260482997, + "learning_rate": 1.0867082460764343e-05, + "loss": 0.0002, + "num_input_tokens_seen": 248731920, + "step": 115265 + }, + { + "epoch": 18.804241435562805, + "grad_norm": 0.004760478623211384, + "learning_rate": 1.0852327949180618e-05, + "loss": 0.1287, + "num_input_tokens_seen": 248742992, + "step": 115270 + }, + { + "epoch": 18.80505709624796, + "grad_norm": 0.0001585778663866222, + "learning_rate": 1.0837583350794878e-05, + "loss": 0.0001, + "num_input_tokens_seen": 248751984, + "step": 115275 + }, + { + "epoch": 18.805872756933116, + "grad_norm": 0.008134212344884872, + "learning_rate": 1.0822848665906104e-05, + "loss": 0.0005, + "num_input_tokens_seen": 248762128, + "step": 115280 + }, + { + "epoch": 18.806688417618272, + "grad_norm": 0.0005845374544151127, + "learning_rate": 1.0808123894812838e-05, + "loss": 0.0002, + "num_input_tokens_seen": 248772112, + "step": 115285 + }, + { + "epoch": 18.807504078303424, + "grad_norm": 0.00023589735792484134, + "learning_rate": 1.0793409037813562e-05, + "loss": 0.0002, + "num_input_tokens_seen": 248783856, + "step": 115290 + }, + { + "epoch": 18.80831973898858, + "grad_norm": 0.012870309874415398, + "learning_rate": 1.0778704095206427e-05, + "loss": 0.0005, + "num_input_tokens_seen": 248794800, + "step": 115295 + }, + { + "epoch": 18.809135399673735, + "grad_norm": 0.00019547424744814634, + "learning_rate": 1.0764009067289526e-05, + "loss": 0.0004, + "num_input_tokens_seen": 248805168, + "step": 115300 + }, + { + "epoch": 18.80995106035889, + "grad_norm": 0.00034028731170110404, + "learning_rate": 1.0749323954360568e-05, + "loss": 0.0002, + "num_input_tokens_seen": 248815696, + "step": 115305 + }, + { + "epoch": 18.810766721044047, + "grad_norm": 0.0003825095191132277, + "learning_rate": 1.0734648756717258e-05, + "loss": 0.0017, + "num_input_tokens_seen": 248828048, + "step": 115310 + }, + { + "epoch": 18.8115823817292, + "grad_norm": 0.00013366201892495155, + "learning_rate": 1.0719983474656914e-05, + "loss": 0.0004, + "num_input_tokens_seen": 248838512, + "step": 115315 + }, + { + "epoch": 18.812398042414355, + "grad_norm": 9.2554502771236e-05, + "learning_rate": 1.0705328108476852e-05, + "loss": 0.0003, + "num_input_tokens_seen": 248849712, + "step": 115320 + }, + { + "epoch": 18.81321370309951, + "grad_norm": 0.0001617198286112398, + "learning_rate": 1.0690682658474004e-05, + "loss": 0.0001, + "num_input_tokens_seen": 248861872, + "step": 115325 + }, + { + "epoch": 18.814029363784666, + "grad_norm": 0.09099197387695312, + "learning_rate": 1.0676047124945187e-05, + "loss": 0.0013, + "num_input_tokens_seen": 248873488, + "step": 115330 + }, + { + "epoch": 18.81484502446982, + "grad_norm": 0.00011856336641358212, + "learning_rate": 1.0661421508187109e-05, + "loss": 0.0005, + "num_input_tokens_seen": 248885104, + "step": 115335 + }, + { + "epoch": 18.815660685154974, + "grad_norm": 0.0001734161050990224, + "learning_rate": 1.0646805808495974e-05, + "loss": 0.0004, + "num_input_tokens_seen": 248896528, + "step": 115340 + }, + { + "epoch": 18.81647634584013, + "grad_norm": 0.0008717067539691925, + "learning_rate": 1.0632200026168215e-05, + "loss": 0.0008, + "num_input_tokens_seen": 248907600, + "step": 115345 + }, + { + "epoch": 18.817292006525285, + "grad_norm": 0.0002089901827275753, + "learning_rate": 1.061760416149965e-05, + "loss": 0.0002, + "num_input_tokens_seen": 248918320, + "step": 115350 + }, + { + "epoch": 18.81810766721044, + "grad_norm": 0.001059941714629531, + "learning_rate": 1.0603018214786264e-05, + "loss": 0.0001, + "num_input_tokens_seen": 248928560, + "step": 115355 + }, + { + "epoch": 18.818923327895597, + "grad_norm": 0.0002324677334399894, + "learning_rate": 1.0588442186323433e-05, + "loss": 0.0001, + "num_input_tokens_seen": 248937936, + "step": 115360 + }, + { + "epoch": 18.81973898858075, + "grad_norm": 0.0359821692109108, + "learning_rate": 1.0573876076406807e-05, + "loss": 0.0015, + "num_input_tokens_seen": 248948848, + "step": 115365 + }, + { + "epoch": 18.820554649265905, + "grad_norm": 0.00010055158782051876, + "learning_rate": 1.055931988533132e-05, + "loss": 0.0016, + "num_input_tokens_seen": 248958224, + "step": 115370 + }, + { + "epoch": 18.82137030995106, + "grad_norm": 0.00010091799049405381, + "learning_rate": 1.0544773613392289e-05, + "loss": 0.0023, + "num_input_tokens_seen": 248970704, + "step": 115375 + }, + { + "epoch": 18.822185970636216, + "grad_norm": 0.006807617377489805, + "learning_rate": 1.0530237260884146e-05, + "loss": 0.0002, + "num_input_tokens_seen": 248980560, + "step": 115380 + }, + { + "epoch": 18.82300163132137, + "grad_norm": 0.000923826708458364, + "learning_rate": 1.051571082810182e-05, + "loss": 0.002, + "num_input_tokens_seen": 248991056, + "step": 115385 + }, + { + "epoch": 18.823817292006524, + "grad_norm": 0.012724040076136589, + "learning_rate": 1.0501194315339523e-05, + "loss": 0.0007, + "num_input_tokens_seen": 249002288, + "step": 115390 + }, + { + "epoch": 18.82463295269168, + "grad_norm": 0.0008356375037692487, + "learning_rate": 1.048668772289152e-05, + "loss": 0.0015, + "num_input_tokens_seen": 249012848, + "step": 115395 + }, + { + "epoch": 18.825448613376835, + "grad_norm": 0.0012188871623948216, + "learning_rate": 1.0472191051051738e-05, + "loss": 0.0003, + "num_input_tokens_seen": 249023408, + "step": 115400 + }, + { + "epoch": 18.82626427406199, + "grad_norm": 0.00021507685596589, + "learning_rate": 1.0457704300114057e-05, + "loss": 0.0011, + "num_input_tokens_seen": 249034288, + "step": 115405 + }, + { + "epoch": 18.827079934747147, + "grad_norm": 0.09337083995342255, + "learning_rate": 1.0443227470372018e-05, + "loss": 0.0015, + "num_input_tokens_seen": 249046064, + "step": 115410 + }, + { + "epoch": 18.8278955954323, + "grad_norm": 0.018217457458376884, + "learning_rate": 1.0428760562119e-05, + "loss": 0.0037, + "num_input_tokens_seen": 249056816, + "step": 115415 + }, + { + "epoch": 18.828711256117455, + "grad_norm": 0.005493684206157923, + "learning_rate": 1.041430357564821e-05, + "loss": 0.0006, + "num_input_tokens_seen": 249067248, + "step": 115420 + }, + { + "epoch": 18.82952691680261, + "grad_norm": 0.0001601478725206107, + "learning_rate": 1.0399856511252692e-05, + "loss": 0.0004, + "num_input_tokens_seen": 249078256, + "step": 115425 + }, + { + "epoch": 18.830342577487766, + "grad_norm": 0.0003187454422004521, + "learning_rate": 1.0385419369225157e-05, + "loss": 0.0002, + "num_input_tokens_seen": 249089360, + "step": 115430 + }, + { + "epoch": 18.83115823817292, + "grad_norm": 0.0017229376826435328, + "learning_rate": 1.0370992149858205e-05, + "loss": 0.0006, + "num_input_tokens_seen": 249100944, + "step": 115435 + }, + { + "epoch": 18.831973898858074, + "grad_norm": 0.000387068692361936, + "learning_rate": 1.0356574853444211e-05, + "loss": 0.0315, + "num_input_tokens_seen": 249111664, + "step": 115440 + }, + { + "epoch": 18.83278955954323, + "grad_norm": 0.004921728745102882, + "learning_rate": 1.0342167480275444e-05, + "loss": 0.0011, + "num_input_tokens_seen": 249120304, + "step": 115445 + }, + { + "epoch": 18.833605220228385, + "grad_norm": 0.01804129034280777, + "learning_rate": 1.032777003064378e-05, + "loss": 0.0017, + "num_input_tokens_seen": 249131088, + "step": 115450 + }, + { + "epoch": 18.83442088091354, + "grad_norm": 0.002478140639141202, + "learning_rate": 1.0313382504841096e-05, + "loss": 0.0003, + "num_input_tokens_seen": 249141040, + "step": 115455 + }, + { + "epoch": 18.835236541598697, + "grad_norm": 0.005434748250991106, + "learning_rate": 1.0299004903158882e-05, + "loss": 0.0074, + "num_input_tokens_seen": 249149552, + "step": 115460 + }, + { + "epoch": 18.83605220228385, + "grad_norm": 0.0017370822606608272, + "learning_rate": 1.0284637225888626e-05, + "loss": 0.0006, + "num_input_tokens_seen": 249159536, + "step": 115465 + }, + { + "epoch": 18.836867862969005, + "grad_norm": 0.0025667534209787846, + "learning_rate": 1.0270279473321375e-05, + "loss": 0.0002, + "num_input_tokens_seen": 249169904, + "step": 115470 + }, + { + "epoch": 18.83768352365416, + "grad_norm": 0.0266465712338686, + "learning_rate": 1.0255931645748174e-05, + "loss": 0.0009, + "num_input_tokens_seen": 249180240, + "step": 115475 + }, + { + "epoch": 18.838499184339316, + "grad_norm": 0.017406191676855087, + "learning_rate": 1.0241593743459898e-05, + "loss": 0.0007, + "num_input_tokens_seen": 249190448, + "step": 115480 + }, + { + "epoch": 18.839314845024468, + "grad_norm": 0.00014975924568716437, + "learning_rate": 1.0227265766746874e-05, + "loss": 0.0002, + "num_input_tokens_seen": 249202640, + "step": 115485 + }, + { + "epoch": 18.840130505709624, + "grad_norm": 0.00044702773448079824, + "learning_rate": 1.0212947715899757e-05, + "loss": 0.0001, + "num_input_tokens_seen": 249214224, + "step": 115490 + }, + { + "epoch": 18.84094616639478, + "grad_norm": 0.004530956968665123, + "learning_rate": 1.0198639591208535e-05, + "loss": 0.0004, + "num_input_tokens_seen": 249225360, + "step": 115495 + }, + { + "epoch": 18.841761827079935, + "grad_norm": 0.040663208812475204, + "learning_rate": 1.0184341392963259e-05, + "loss": 0.0018, + "num_input_tokens_seen": 249236624, + "step": 115500 + }, + { + "epoch": 18.84257748776509, + "grad_norm": 0.0007601089891977608, + "learning_rate": 1.0170053121453694e-05, + "loss": 0.0002, + "num_input_tokens_seen": 249247408, + "step": 115505 + }, + { + "epoch": 18.843393148450243, + "grad_norm": 0.012715485878288746, + "learning_rate": 1.0155774776969385e-05, + "loss": 0.0013, + "num_input_tokens_seen": 249258000, + "step": 115510 + }, + { + "epoch": 18.8442088091354, + "grad_norm": 0.00012260829680599272, + "learning_rate": 1.0141506359799712e-05, + "loss": 0.0017, + "num_input_tokens_seen": 249268848, + "step": 115515 + }, + { + "epoch": 18.845024469820554, + "grad_norm": 0.000624390144366771, + "learning_rate": 1.0127247870233836e-05, + "loss": 0.0004, + "num_input_tokens_seen": 249279312, + "step": 115520 + }, + { + "epoch": 18.84584013050571, + "grad_norm": 0.002787157194688916, + "learning_rate": 1.011299930856069e-05, + "loss": 0.0004, + "num_input_tokens_seen": 249289392, + "step": 115525 + }, + { + "epoch": 18.846655791190866, + "grad_norm": 0.0014392283046618104, + "learning_rate": 1.0098760675069151e-05, + "loss": 0.0002, + "num_input_tokens_seen": 249300976, + "step": 115530 + }, + { + "epoch": 18.847471451876018, + "grad_norm": 0.0001576627546455711, + "learning_rate": 1.0084531970047662e-05, + "loss": 0.0023, + "num_input_tokens_seen": 249312528, + "step": 115535 + }, + { + "epoch": 18.848287112561174, + "grad_norm": 8.1260921433568e-05, + "learning_rate": 1.0070313193784653e-05, + "loss": 0.0175, + "num_input_tokens_seen": 249322544, + "step": 115540 + }, + { + "epoch": 18.84910277324633, + "grad_norm": 0.0006838338449597359, + "learning_rate": 1.0056104346568285e-05, + "loss": 0.0081, + "num_input_tokens_seen": 249333136, + "step": 115545 + }, + { + "epoch": 18.849918433931485, + "grad_norm": 0.0013454832369461656, + "learning_rate": 1.0041905428686493e-05, + "loss": 0.0002, + "num_input_tokens_seen": 249343888, + "step": 115550 + }, + { + "epoch": 18.85073409461664, + "grad_norm": 0.0001398637832608074, + "learning_rate": 1.0027716440427049e-05, + "loss": 0.001, + "num_input_tokens_seen": 249354448, + "step": 115555 + }, + { + "epoch": 18.851549755301793, + "grad_norm": 0.00023302740009967238, + "learning_rate": 1.0013537382077443e-05, + "loss": 0.0082, + "num_input_tokens_seen": 249365904, + "step": 115560 + }, + { + "epoch": 18.85236541598695, + "grad_norm": 0.015334675088524818, + "learning_rate": 9.999368253925167e-06, + "loss": 0.0006, + "num_input_tokens_seen": 249375920, + "step": 115565 + }, + { + "epoch": 18.853181076672104, + "grad_norm": 0.07721404731273651, + "learning_rate": 9.985209056257272e-06, + "loss": 0.0011, + "num_input_tokens_seen": 249386480, + "step": 115570 + }, + { + "epoch": 18.85399673735726, + "grad_norm": 0.00011039109813282266, + "learning_rate": 9.971059789360749e-06, + "loss": 0.0006, + "num_input_tokens_seen": 249397232, + "step": 115575 + }, + { + "epoch": 18.854812398042416, + "grad_norm": 0.002622603438794613, + "learning_rate": 9.956920453522366e-06, + "loss": 0.01, + "num_input_tokens_seen": 249407088, + "step": 115580 + }, + { + "epoch": 18.855628058727568, + "grad_norm": 0.0042481510899960995, + "learning_rate": 9.942791049028621e-06, + "loss": 0.0003, + "num_input_tokens_seen": 249417936, + "step": 115585 + }, + { + "epoch": 18.856443719412724, + "grad_norm": 0.000555854057893157, + "learning_rate": 9.928671576165893e-06, + "loss": 0.0001, + "num_input_tokens_seen": 249428272, + "step": 115590 + }, + { + "epoch": 18.85725938009788, + "grad_norm": 0.0002620182349346578, + "learning_rate": 9.914562035220287e-06, + "loss": 0.0005, + "num_input_tokens_seen": 249439056, + "step": 115595 + }, + { + "epoch": 18.858075040783035, + "grad_norm": 0.0007934111636132002, + "learning_rate": 9.900462426477908e-06, + "loss": 0.0001, + "num_input_tokens_seen": 249449808, + "step": 115600 + }, + { + "epoch": 18.85889070146819, + "grad_norm": 0.0006646020920015872, + "learning_rate": 9.886372750224304e-06, + "loss": 0.0002, + "num_input_tokens_seen": 249460496, + "step": 115605 + }, + { + "epoch": 18.859706362153343, + "grad_norm": 0.00039193034172058105, + "learning_rate": 9.872293006745192e-06, + "loss": 0.0004, + "num_input_tokens_seen": 249470704, + "step": 115610 + }, + { + "epoch": 18.8605220228385, + "grad_norm": 0.002484399126842618, + "learning_rate": 9.858223196325789e-06, + "loss": 0.0007, + "num_input_tokens_seen": 249480400, + "step": 115615 + }, + { + "epoch": 18.861337683523654, + "grad_norm": 0.00014312159328255802, + "learning_rate": 9.844163319251253e-06, + "loss": 0.0025, + "num_input_tokens_seen": 249491024, + "step": 115620 + }, + { + "epoch": 18.86215334420881, + "grad_norm": 0.00018568239465821534, + "learning_rate": 9.830113375806582e-06, + "loss": 0.0007, + "num_input_tokens_seen": 249502000, + "step": 115625 + }, + { + "epoch": 18.862969004893966, + "grad_norm": 0.0013387922663241625, + "learning_rate": 9.816073366276545e-06, + "loss": 0.0002, + "num_input_tokens_seen": 249512624, + "step": 115630 + }, + { + "epoch": 18.863784665579118, + "grad_norm": 0.0005999141721986234, + "learning_rate": 9.802043290945529e-06, + "loss": 0.0003, + "num_input_tokens_seen": 249523056, + "step": 115635 + }, + { + "epoch": 18.864600326264274, + "grad_norm": 0.0003056122222915292, + "learning_rate": 9.788023150098024e-06, + "loss": 0.0003, + "num_input_tokens_seen": 249534032, + "step": 115640 + }, + { + "epoch": 18.86541598694943, + "grad_norm": 0.19921019673347473, + "learning_rate": 9.774012944018085e-06, + "loss": 0.0563, + "num_input_tokens_seen": 249543760, + "step": 115645 + }, + { + "epoch": 18.866231647634585, + "grad_norm": 0.0003052055835723877, + "learning_rate": 9.760012672989704e-06, + "loss": 0.0004, + "num_input_tokens_seen": 249555504, + "step": 115650 + }, + { + "epoch": 18.86704730831974, + "grad_norm": 0.00015224488743115216, + "learning_rate": 9.746022337296546e-06, + "loss": 0.001, + "num_input_tokens_seen": 249566960, + "step": 115655 + }, + { + "epoch": 18.867862969004893, + "grad_norm": 0.00014219099830370396, + "learning_rate": 9.732041937222157e-06, + "loss": 0.0009, + "num_input_tokens_seen": 249577712, + "step": 115660 + }, + { + "epoch": 18.86867862969005, + "grad_norm": 0.0138579485937953, + "learning_rate": 9.718071473049927e-06, + "loss": 0.0009, + "num_input_tokens_seen": 249588208, + "step": 115665 + }, + { + "epoch": 18.869494290375204, + "grad_norm": 0.0002877220686059445, + "learning_rate": 9.70411094506296e-06, + "loss": 0.0002, + "num_input_tokens_seen": 249599440, + "step": 115670 + }, + { + "epoch": 18.87030995106036, + "grad_norm": 0.0031672543846070766, + "learning_rate": 9.690160353544142e-06, + "loss": 0.0004, + "num_input_tokens_seen": 249609360, + "step": 115675 + }, + { + "epoch": 18.871125611745512, + "grad_norm": 0.03700711578130722, + "learning_rate": 9.67621969877619e-06, + "loss": 0.0012, + "num_input_tokens_seen": 249619312, + "step": 115680 + }, + { + "epoch": 18.871941272430668, + "grad_norm": 0.00040889246156439185, + "learning_rate": 9.66228898104171e-06, + "loss": 0.0002, + "num_input_tokens_seen": 249630800, + "step": 115685 + }, + { + "epoch": 18.872756933115824, + "grad_norm": 0.004928232170641422, + "learning_rate": 9.64836820062298e-06, + "loss": 0.0002, + "num_input_tokens_seen": 249642160, + "step": 115690 + }, + { + "epoch": 18.87357259380098, + "grad_norm": 0.00010906018724199384, + "learning_rate": 9.634457357802107e-06, + "loss": 0.0002, + "num_input_tokens_seen": 249652432, + "step": 115695 + }, + { + "epoch": 18.874388254486135, + "grad_norm": 0.00016577192582190037, + "learning_rate": 9.62055645286103e-06, + "loss": 0.0001, + "num_input_tokens_seen": 249663312, + "step": 115700 + }, + { + "epoch": 18.875203915171287, + "grad_norm": 0.047372423112392426, + "learning_rate": 9.606665486081522e-06, + "loss": 0.0013, + "num_input_tokens_seen": 249673456, + "step": 115705 + }, + { + "epoch": 18.876019575856443, + "grad_norm": 0.0010182997211813927, + "learning_rate": 9.592784457744918e-06, + "loss": 0.0013, + "num_input_tokens_seen": 249683920, + "step": 115710 + }, + { + "epoch": 18.8768352365416, + "grad_norm": 0.0002519431582186371, + "learning_rate": 9.578913368132824e-06, + "loss": 0.0002, + "num_input_tokens_seen": 249694160, + "step": 115715 + }, + { + "epoch": 18.877650897226754, + "grad_norm": 0.007803650572896004, + "learning_rate": 9.565052217526072e-06, + "loss": 0.0003, + "num_input_tokens_seen": 249705776, + "step": 115720 + }, + { + "epoch": 18.87846655791191, + "grad_norm": 0.00029125160654075444, + "learning_rate": 9.551201006205767e-06, + "loss": 0.0015, + "num_input_tokens_seen": 249715376, + "step": 115725 + }, + { + "epoch": 18.879282218597062, + "grad_norm": 0.04248567298054695, + "learning_rate": 9.537359734452466e-06, + "loss": 0.0011, + "num_input_tokens_seen": 249726064, + "step": 115730 + }, + { + "epoch": 18.880097879282218, + "grad_norm": 0.03596873953938484, + "learning_rate": 9.523528402546888e-06, + "loss": 0.004, + "num_input_tokens_seen": 249736496, + "step": 115735 + }, + { + "epoch": 18.880913539967374, + "grad_norm": 0.0009461218724027276, + "learning_rate": 9.509707010769086e-06, + "loss": 0.0002, + "num_input_tokens_seen": 249745264, + "step": 115740 + }, + { + "epoch": 18.88172920065253, + "grad_norm": 0.00011305028601782396, + "learning_rate": 9.495895559399449e-06, + "loss": 0.0009, + "num_input_tokens_seen": 249755088, + "step": 115745 + }, + { + "epoch": 18.882544861337685, + "grad_norm": 0.007860338315367699, + "learning_rate": 9.482094048717637e-06, + "loss": 0.0007, + "num_input_tokens_seen": 249765744, + "step": 115750 + }, + { + "epoch": 18.883360522022837, + "grad_norm": 0.15482278168201447, + "learning_rate": 9.468302479003487e-06, + "loss": 0.0089, + "num_input_tokens_seen": 249776688, + "step": 115755 + }, + { + "epoch": 18.884176182707993, + "grad_norm": 0.00021274105529300869, + "learning_rate": 9.45452085053644e-06, + "loss": 0.0002, + "num_input_tokens_seen": 249787696, + "step": 115760 + }, + { + "epoch": 18.88499184339315, + "grad_norm": 0.00402427464723587, + "learning_rate": 9.44074916359583e-06, + "loss": 0.0012, + "num_input_tokens_seen": 249799760, + "step": 115765 + }, + { + "epoch": 18.885807504078304, + "grad_norm": 0.0005044505232945085, + "learning_rate": 9.42698741846082e-06, + "loss": 0.0005, + "num_input_tokens_seen": 249810768, + "step": 115770 + }, + { + "epoch": 18.88662316476346, + "grad_norm": 0.0007903426303528249, + "learning_rate": 9.413235615410188e-06, + "loss": 0.0005, + "num_input_tokens_seen": 249819504, + "step": 115775 + }, + { + "epoch": 18.887438825448612, + "grad_norm": 0.016701823100447655, + "learning_rate": 9.39949375472271e-06, + "loss": 0.0013, + "num_input_tokens_seen": 249830960, + "step": 115780 + }, + { + "epoch": 18.888254486133768, + "grad_norm": 0.0005252612172625959, + "learning_rate": 9.385761836676832e-06, + "loss": 0.0177, + "num_input_tokens_seen": 249841232, + "step": 115785 + }, + { + "epoch": 18.889070146818923, + "grad_norm": 0.0013875165022909641, + "learning_rate": 9.37203986155094e-06, + "loss": 0.0003, + "num_input_tokens_seen": 249851600, + "step": 115790 + }, + { + "epoch": 18.88988580750408, + "grad_norm": 0.013317313976585865, + "learning_rate": 9.358327829623038e-06, + "loss": 0.0006, + "num_input_tokens_seen": 249862032, + "step": 115795 + }, + { + "epoch": 18.890701468189235, + "grad_norm": 0.0010609684977680445, + "learning_rate": 9.344625741171009e-06, + "loss": 0.0005, + "num_input_tokens_seen": 249871888, + "step": 115800 + }, + { + "epoch": 18.891517128874387, + "grad_norm": 0.0005948944599367678, + "learning_rate": 9.330933596472635e-06, + "loss": 0.0006, + "num_input_tokens_seen": 249883024, + "step": 115805 + }, + { + "epoch": 18.892332789559543, + "grad_norm": 0.005537915043532848, + "learning_rate": 9.317251395805304e-06, + "loss": 0.0005, + "num_input_tokens_seen": 249894512, + "step": 115810 + }, + { + "epoch": 18.8931484502447, + "grad_norm": 0.00031513129943050444, + "learning_rate": 9.303579139446349e-06, + "loss": 0.0049, + "num_input_tokens_seen": 249905296, + "step": 115815 + }, + { + "epoch": 18.893964110929854, + "grad_norm": 0.0003033119719475508, + "learning_rate": 9.28991682767294e-06, + "loss": 0.0006, + "num_input_tokens_seen": 249916240, + "step": 115820 + }, + { + "epoch": 18.894779771615006, + "grad_norm": 0.0007239855476655066, + "learning_rate": 9.27626446076174e-06, + "loss": 0.0002, + "num_input_tokens_seen": 249926384, + "step": 115825 + }, + { + "epoch": 18.895595432300162, + "grad_norm": 0.0005246683722361922, + "learning_rate": 9.2626220389897e-06, + "loss": 0.0006, + "num_input_tokens_seen": 249936240, + "step": 115830 + }, + { + "epoch": 18.896411092985318, + "grad_norm": 0.00043501914478838444, + "learning_rate": 9.248989562633037e-06, + "loss": 0.0008, + "num_input_tokens_seen": 249946896, + "step": 115835 + }, + { + "epoch": 18.897226753670473, + "grad_norm": 0.00028696752269752324, + "learning_rate": 9.235367031968312e-06, + "loss": 0.0004, + "num_input_tokens_seen": 249957552, + "step": 115840 + }, + { + "epoch": 18.89804241435563, + "grad_norm": 0.0015993199776858091, + "learning_rate": 9.221754447271302e-06, + "loss": 0.0002, + "num_input_tokens_seen": 249967664, + "step": 115845 + }, + { + "epoch": 18.898858075040785, + "grad_norm": 0.08960124850273132, + "learning_rate": 9.208151808818177e-06, + "loss": 0.0017, + "num_input_tokens_seen": 249977520, + "step": 115850 + }, + { + "epoch": 18.899673735725937, + "grad_norm": 0.037692099809646606, + "learning_rate": 9.194559116884327e-06, + "loss": 0.0079, + "num_input_tokens_seen": 249989136, + "step": 115855 + }, + { + "epoch": 18.900489396411093, + "grad_norm": 0.03317434713244438, + "learning_rate": 9.18097637174553e-06, + "loss": 0.0006, + "num_input_tokens_seen": 249999824, + "step": 115860 + }, + { + "epoch": 18.90130505709625, + "grad_norm": 0.03367041051387787, + "learning_rate": 9.167403573676736e-06, + "loss": 0.0007, + "num_input_tokens_seen": 250012016, + "step": 115865 + }, + { + "epoch": 18.902120717781404, + "grad_norm": 0.0017553603975102305, + "learning_rate": 9.153840722953278e-06, + "loss": 0.0002, + "num_input_tokens_seen": 250022544, + "step": 115870 + }, + { + "epoch": 18.902936378466556, + "grad_norm": 0.0019005544018000364, + "learning_rate": 9.14028781984988e-06, + "loss": 0.0001, + "num_input_tokens_seen": 250032784, + "step": 115875 + }, + { + "epoch": 18.903752039151712, + "grad_norm": 0.018906716257333755, + "learning_rate": 9.126744864641267e-06, + "loss": 0.0004, + "num_input_tokens_seen": 250043728, + "step": 115880 + }, + { + "epoch": 18.904567699836868, + "grad_norm": 0.22294510900974274, + "learning_rate": 9.113211857601833e-06, + "loss": 0.0076, + "num_input_tokens_seen": 250053488, + "step": 115885 + }, + { + "epoch": 18.905383360522023, + "grad_norm": 0.002565832110121846, + "learning_rate": 9.099688799005967e-06, + "loss": 0.0002, + "num_input_tokens_seen": 250064464, + "step": 115890 + }, + { + "epoch": 18.90619902120718, + "grad_norm": 0.0022950791753828526, + "learning_rate": 9.086175689127618e-06, + "loss": 0.1642, + "num_input_tokens_seen": 250075184, + "step": 115895 + }, + { + "epoch": 18.90701468189233, + "grad_norm": 0.005248554516583681, + "learning_rate": 9.072672528240733e-06, + "loss": 0.0002, + "num_input_tokens_seen": 250085264, + "step": 115900 + }, + { + "epoch": 18.907830342577487, + "grad_norm": 0.0042555625550448895, + "learning_rate": 9.059179316618871e-06, + "loss": 0.0002, + "num_input_tokens_seen": 250096752, + "step": 115905 + }, + { + "epoch": 18.908646003262643, + "grad_norm": 0.006807858124375343, + "learning_rate": 9.045696054535535e-06, + "loss": 0.0003, + "num_input_tokens_seen": 250108560, + "step": 115910 + }, + { + "epoch": 18.9094616639478, + "grad_norm": 0.025907788425683975, + "learning_rate": 9.032222742264008e-06, + "loss": 0.0061, + "num_input_tokens_seen": 250119024, + "step": 115915 + }, + { + "epoch": 18.910277324632954, + "grad_norm": 0.0004443409852683544, + "learning_rate": 9.018759380077346e-06, + "loss": 0.0004, + "num_input_tokens_seen": 250129648, + "step": 115920 + }, + { + "epoch": 18.911092985318106, + "grad_norm": 0.000489893602207303, + "learning_rate": 9.005305968248334e-06, + "loss": 0.0009, + "num_input_tokens_seen": 250140304, + "step": 115925 + }, + { + "epoch": 18.911908646003262, + "grad_norm": 0.0006855711108073592, + "learning_rate": 8.991862507049698e-06, + "loss": 0.0007, + "num_input_tokens_seen": 250152528, + "step": 115930 + }, + { + "epoch": 18.912724306688418, + "grad_norm": 0.21178455650806427, + "learning_rate": 8.978428996753885e-06, + "loss": 0.0052, + "num_input_tokens_seen": 250163440, + "step": 115935 + }, + { + "epoch": 18.913539967373573, + "grad_norm": 0.004870542325079441, + "learning_rate": 8.965005437633067e-06, + "loss": 0.0057, + "num_input_tokens_seen": 250174288, + "step": 115940 + }, + { + "epoch": 18.91435562805873, + "grad_norm": 0.0001280376745853573, + "learning_rate": 8.95159182995936e-06, + "loss": 0.0006, + "num_input_tokens_seen": 250185168, + "step": 115945 + }, + { + "epoch": 18.91517128874388, + "grad_norm": 0.00025770801585167646, + "learning_rate": 8.938188174004602e-06, + "loss": 0.0007, + "num_input_tokens_seen": 250196080, + "step": 115950 + }, + { + "epoch": 18.915986949429037, + "grad_norm": 0.00038275305996648967, + "learning_rate": 8.924794470040354e-06, + "loss": 0.0002, + "num_input_tokens_seen": 250207088, + "step": 115955 + }, + { + "epoch": 18.916802610114193, + "grad_norm": 0.001069587655365467, + "learning_rate": 8.91141071833812e-06, + "loss": 0.0002, + "num_input_tokens_seen": 250218000, + "step": 115960 + }, + { + "epoch": 18.91761827079935, + "grad_norm": 0.0006560476613231003, + "learning_rate": 8.89803691916924e-06, + "loss": 0.0002, + "num_input_tokens_seen": 250229200, + "step": 115965 + }, + { + "epoch": 18.918433931484504, + "grad_norm": 0.028136856853961945, + "learning_rate": 8.88467307280455e-06, + "loss": 0.0011, + "num_input_tokens_seen": 250240240, + "step": 115970 + }, + { + "epoch": 18.919249592169656, + "grad_norm": 0.0001870090636657551, + "learning_rate": 8.871319179515058e-06, + "loss": 0.0003, + "num_input_tokens_seen": 250250800, + "step": 115975 + }, + { + "epoch": 18.920065252854812, + "grad_norm": 0.00036353571340441704, + "learning_rate": 8.857975239571215e-06, + "loss": 0.001, + "num_input_tokens_seen": 250260336, + "step": 115980 + }, + { + "epoch": 18.920880913539968, + "grad_norm": 0.0002050613984465599, + "learning_rate": 8.84464125324369e-06, + "loss": 0.0004, + "num_input_tokens_seen": 250270800, + "step": 115985 + }, + { + "epoch": 18.921696574225123, + "grad_norm": 0.007538105361163616, + "learning_rate": 8.831317220802493e-06, + "loss": 0.001, + "num_input_tokens_seen": 250280784, + "step": 115990 + }, + { + "epoch": 18.92251223491028, + "grad_norm": 0.009140829555690289, + "learning_rate": 8.818003142517794e-06, + "loss": 0.0009, + "num_input_tokens_seen": 250292560, + "step": 115995 + }, + { + "epoch": 18.92332789559543, + "grad_norm": 0.0019140589283779263, + "learning_rate": 8.804699018659324e-06, + "loss": 0.0001, + "num_input_tokens_seen": 250303632, + "step": 116000 + }, + { + "epoch": 18.924143556280587, + "grad_norm": 0.0004126394633203745, + "learning_rate": 8.79140484949681e-06, + "loss": 0.0001, + "num_input_tokens_seen": 250313360, + "step": 116005 + }, + { + "epoch": 18.924959216965743, + "grad_norm": 0.09174416214227676, + "learning_rate": 8.778120635299537e-06, + "loss": 0.002, + "num_input_tokens_seen": 250323824, + "step": 116010 + }, + { + "epoch": 18.9257748776509, + "grad_norm": 0.004726526327431202, + "learning_rate": 8.7648463763369e-06, + "loss": 0.0009, + "num_input_tokens_seen": 250334320, + "step": 116015 + }, + { + "epoch": 18.92659053833605, + "grad_norm": 0.001307390397414565, + "learning_rate": 8.751582072877739e-06, + "loss": 0.0247, + "num_input_tokens_seen": 250345456, + "step": 116020 + }, + { + "epoch": 18.927406199021206, + "grad_norm": 0.0002464427088852972, + "learning_rate": 8.738327725191064e-06, + "loss": 0.0013, + "num_input_tokens_seen": 250356688, + "step": 116025 + }, + { + "epoch": 18.928221859706362, + "grad_norm": 0.00010794488480314612, + "learning_rate": 8.725083333545326e-06, + "loss": 0.0002, + "num_input_tokens_seen": 250366480, + "step": 116030 + }, + { + "epoch": 18.929037520391518, + "grad_norm": 0.012857906520366669, + "learning_rate": 8.711848898208974e-06, + "loss": 0.0766, + "num_input_tokens_seen": 250377840, + "step": 116035 + }, + { + "epoch": 18.929853181076673, + "grad_norm": 0.0007313964888453484, + "learning_rate": 8.698624419450296e-06, + "loss": 0.0008, + "num_input_tokens_seen": 250388528, + "step": 116040 + }, + { + "epoch": 18.930668841761825, + "grad_norm": 0.000257728184806183, + "learning_rate": 8.685409897537244e-06, + "loss": 0.0003, + "num_input_tokens_seen": 250400656, + "step": 116045 + }, + { + "epoch": 18.93148450244698, + "grad_norm": 0.00035792725975625217, + "learning_rate": 8.672205332737603e-06, + "loss": 0.0002, + "num_input_tokens_seen": 250411856, + "step": 116050 + }, + { + "epoch": 18.932300163132137, + "grad_norm": 0.02234448678791523, + "learning_rate": 8.65901072531905e-06, + "loss": 0.0016, + "num_input_tokens_seen": 250423632, + "step": 116055 + }, + { + "epoch": 18.933115823817293, + "grad_norm": 0.06252387166023254, + "learning_rate": 8.64582607554898e-06, + "loss": 0.0016, + "num_input_tokens_seen": 250435760, + "step": 116060 + }, + { + "epoch": 18.93393148450245, + "grad_norm": 0.04080217331647873, + "learning_rate": 8.632651383694513e-06, + "loss": 0.0005, + "num_input_tokens_seen": 250447504, + "step": 116065 + }, + { + "epoch": 18.9347471451876, + "grad_norm": 0.0019238536478951573, + "learning_rate": 8.619486650022768e-06, + "loss": 0.0002, + "num_input_tokens_seen": 250458448, + "step": 116070 + }, + { + "epoch": 18.935562805872756, + "grad_norm": 7.234750228235498e-05, + "learning_rate": 8.606331874800421e-06, + "loss": 0.0006, + "num_input_tokens_seen": 250470096, + "step": 116075 + }, + { + "epoch": 18.936378466557912, + "grad_norm": 0.0030563047621399164, + "learning_rate": 8.593187058294205e-06, + "loss": 0.0002, + "num_input_tokens_seen": 250479312, + "step": 116080 + }, + { + "epoch": 18.937194127243067, + "grad_norm": 0.0001508641871623695, + "learning_rate": 8.580052200770405e-06, + "loss": 0.001, + "num_input_tokens_seen": 250491536, + "step": 116085 + }, + { + "epoch": 18.938009787928223, + "grad_norm": 0.0017143875593319535, + "learning_rate": 8.566927302495254e-06, + "loss": 0.0015, + "num_input_tokens_seen": 250502128, + "step": 116090 + }, + { + "epoch": 18.938825448613375, + "grad_norm": 0.00010162648686673492, + "learning_rate": 8.553812363734759e-06, + "loss": 0.0017, + "num_input_tokens_seen": 250514128, + "step": 116095 + }, + { + "epoch": 18.93964110929853, + "grad_norm": 0.0008070730837062001, + "learning_rate": 8.54070738475471e-06, + "loss": 0.0009, + "num_input_tokens_seen": 250524784, + "step": 116100 + }, + { + "epoch": 18.940456769983687, + "grad_norm": 0.0002918957034125924, + "learning_rate": 8.527612365820613e-06, + "loss": 0.001, + "num_input_tokens_seen": 250535536, + "step": 116105 + }, + { + "epoch": 18.941272430668842, + "grad_norm": 0.00021723966347053647, + "learning_rate": 8.514527307198038e-06, + "loss": 0.0009, + "num_input_tokens_seen": 250547184, + "step": 116110 + }, + { + "epoch": 18.942088091353998, + "grad_norm": 0.0003141718334518373, + "learning_rate": 8.501452209151995e-06, + "loss": 0.0005, + "num_input_tokens_seen": 250558608, + "step": 116115 + }, + { + "epoch": 18.94290375203915, + "grad_norm": 0.00011193722457392141, + "learning_rate": 8.488387071947601e-06, + "loss": 0.0003, + "num_input_tokens_seen": 250569616, + "step": 116120 + }, + { + "epoch": 18.943719412724306, + "grad_norm": 0.0015807384625077248, + "learning_rate": 8.47533189584948e-06, + "loss": 0.0004, + "num_input_tokens_seen": 250580432, + "step": 116125 + }, + { + "epoch": 18.94453507340946, + "grad_norm": 0.0014430314768105745, + "learning_rate": 8.46228668112231e-06, + "loss": 0.001, + "num_input_tokens_seen": 250590896, + "step": 116130 + }, + { + "epoch": 18.945350734094617, + "grad_norm": 0.00016158228390850127, + "learning_rate": 8.449251428030492e-06, + "loss": 0.0009, + "num_input_tokens_seen": 250602384, + "step": 116135 + }, + { + "epoch": 18.946166394779773, + "grad_norm": 0.008578302338719368, + "learning_rate": 8.436226136838198e-06, + "loss": 0.1124, + "num_input_tokens_seen": 250613680, + "step": 116140 + }, + { + "epoch": 18.946982055464925, + "grad_norm": 0.0009416278917342424, + "learning_rate": 8.423210807809333e-06, + "loss": 0.0089, + "num_input_tokens_seen": 250624240, + "step": 116145 + }, + { + "epoch": 18.94779771615008, + "grad_norm": 0.0006393971852958202, + "learning_rate": 8.410205441207741e-06, + "loss": 0.0023, + "num_input_tokens_seen": 250634672, + "step": 116150 + }, + { + "epoch": 18.948613376835237, + "grad_norm": 0.0002604046603664756, + "learning_rate": 8.397210037296931e-06, + "loss": 0.0007, + "num_input_tokens_seen": 250645552, + "step": 116155 + }, + { + "epoch": 18.949429037520392, + "grad_norm": 0.00019371202506590635, + "learning_rate": 8.384224596340306e-06, + "loss": 0.0002, + "num_input_tokens_seen": 250657232, + "step": 116160 + }, + { + "epoch": 18.950244698205548, + "grad_norm": 0.017916183918714523, + "learning_rate": 8.371249118601043e-06, + "loss": 0.0003, + "num_input_tokens_seen": 250667120, + "step": 116165 + }, + { + "epoch": 18.9510603588907, + "grad_norm": 0.016919193789362907, + "learning_rate": 8.358283604342098e-06, + "loss": 0.0003, + "num_input_tokens_seen": 250677040, + "step": 116170 + }, + { + "epoch": 18.951876019575856, + "grad_norm": 0.14813938736915588, + "learning_rate": 8.345328053826207e-06, + "loss": 0.0019, + "num_input_tokens_seen": 250688016, + "step": 116175 + }, + { + "epoch": 18.95269168026101, + "grad_norm": 0.00016505412349943072, + "learning_rate": 8.33238246731599e-06, + "loss": 0.0003, + "num_input_tokens_seen": 250698960, + "step": 116180 + }, + { + "epoch": 18.953507340946167, + "grad_norm": 0.001108679105527699, + "learning_rate": 8.319446845073741e-06, + "loss": 0.0001, + "num_input_tokens_seen": 250709744, + "step": 116185 + }, + { + "epoch": 18.954323001631323, + "grad_norm": 0.002254722872748971, + "learning_rate": 8.306521187361638e-06, + "loss": 0.0005, + "num_input_tokens_seen": 250721008, + "step": 116190 + }, + { + "epoch": 18.955138662316475, + "grad_norm": 0.0006669023423455656, + "learning_rate": 8.293605494441636e-06, + "loss": 0.0005, + "num_input_tokens_seen": 250731568, + "step": 116195 + }, + { + "epoch": 18.95595432300163, + "grad_norm": 0.000516266212798655, + "learning_rate": 8.280699766575528e-06, + "loss": 0.001, + "num_input_tokens_seen": 250742288, + "step": 116200 + }, + { + "epoch": 18.956769983686787, + "grad_norm": 0.0007828929228708148, + "learning_rate": 8.26780400402477e-06, + "loss": 0.0004, + "num_input_tokens_seen": 250754128, + "step": 116205 + }, + { + "epoch": 18.957585644371942, + "grad_norm": 0.0026458760257810354, + "learning_rate": 8.254918207050821e-06, + "loss": 0.0003, + "num_input_tokens_seen": 250766416, + "step": 116210 + }, + { + "epoch": 18.958401305057095, + "grad_norm": 0.007931684143841267, + "learning_rate": 8.242042375914748e-06, + "loss": 0.0247, + "num_input_tokens_seen": 250776496, + "step": 116215 + }, + { + "epoch": 18.95921696574225, + "grad_norm": 0.0011175910476595163, + "learning_rate": 8.229176510877512e-06, + "loss": 0.0019, + "num_input_tokens_seen": 250786736, + "step": 116220 + }, + { + "epoch": 18.960032626427406, + "grad_norm": 9.593347931513563e-05, + "learning_rate": 8.216320612199901e-06, + "loss": 0.0003, + "num_input_tokens_seen": 250796816, + "step": 116225 + }, + { + "epoch": 18.96084828711256, + "grad_norm": 0.0001431167038390413, + "learning_rate": 8.203474680142431e-06, + "loss": 0.0002, + "num_input_tokens_seen": 250806704, + "step": 116230 + }, + { + "epoch": 18.961663947797717, + "grad_norm": 0.012681758031249046, + "learning_rate": 8.190638714965393e-06, + "loss": 0.0019, + "num_input_tokens_seen": 250817552, + "step": 116235 + }, + { + "epoch": 18.96247960848287, + "grad_norm": 0.00017878312792163342, + "learning_rate": 8.177812716928967e-06, + "loss": 0.0003, + "num_input_tokens_seen": 250828496, + "step": 116240 + }, + { + "epoch": 18.963295269168025, + "grad_norm": 0.0031497713644057512, + "learning_rate": 8.164996686293114e-06, + "loss": 0.0015, + "num_input_tokens_seen": 250838352, + "step": 116245 + }, + { + "epoch": 18.96411092985318, + "grad_norm": 0.0001439456973457709, + "learning_rate": 8.152190623317569e-06, + "loss": 0.0005, + "num_input_tokens_seen": 250850032, + "step": 116250 + }, + { + "epoch": 18.964926590538337, + "grad_norm": 0.00030255160527303815, + "learning_rate": 8.13939452826179e-06, + "loss": 0.0022, + "num_input_tokens_seen": 250860976, + "step": 116255 + }, + { + "epoch": 18.965742251223492, + "grad_norm": 0.0005760218482464552, + "learning_rate": 8.126608401385183e-06, + "loss": 0.0012, + "num_input_tokens_seen": 250871664, + "step": 116260 + }, + { + "epoch": 18.966557911908644, + "grad_norm": 0.0001028142505674623, + "learning_rate": 8.113832242946818e-06, + "loss": 0.0002, + "num_input_tokens_seen": 250882256, + "step": 116265 + }, + { + "epoch": 18.9673735725938, + "grad_norm": 0.0006137289456091821, + "learning_rate": 8.101066053205653e-06, + "loss": 0.0003, + "num_input_tokens_seen": 250892656, + "step": 116270 + }, + { + "epoch": 18.968189233278956, + "grad_norm": 0.03855578973889351, + "learning_rate": 8.08830983242037e-06, + "loss": 0.0009, + "num_input_tokens_seen": 250903600, + "step": 116275 + }, + { + "epoch": 18.96900489396411, + "grad_norm": 0.01619657874107361, + "learning_rate": 8.0755635808496e-06, + "loss": 0.003, + "num_input_tokens_seen": 250914480, + "step": 116280 + }, + { + "epoch": 18.969820554649267, + "grad_norm": 7.846741937100887e-05, + "learning_rate": 8.062827298751518e-06, + "loss": 0.0005, + "num_input_tokens_seen": 250925264, + "step": 116285 + }, + { + "epoch": 18.97063621533442, + "grad_norm": 0.0006032337551005185, + "learning_rate": 8.050100986384312e-06, + "loss": 0.0017, + "num_input_tokens_seen": 250937456, + "step": 116290 + }, + { + "epoch": 18.971451876019575, + "grad_norm": 0.0014376030303537846, + "learning_rate": 8.037384644005941e-06, + "loss": 0.0001, + "num_input_tokens_seen": 250948016, + "step": 116295 + }, + { + "epoch": 18.97226753670473, + "grad_norm": 0.0007324400939978659, + "learning_rate": 8.024678271874031e-06, + "loss": 0.0005, + "num_input_tokens_seen": 250959536, + "step": 116300 + }, + { + "epoch": 18.973083197389887, + "grad_norm": 0.0009427487966604531, + "learning_rate": 8.011981870246099e-06, + "loss": 0.0008, + "num_input_tokens_seen": 250969488, + "step": 116305 + }, + { + "epoch": 18.973898858075042, + "grad_norm": 0.00024886216851882637, + "learning_rate": 7.99929543937955e-06, + "loss": 0.0002, + "num_input_tokens_seen": 250980176, + "step": 116310 + }, + { + "epoch": 18.974714518760194, + "grad_norm": 0.0004808894591405988, + "learning_rate": 7.9866189795314e-06, + "loss": 0.0006, + "num_input_tokens_seen": 250990352, + "step": 116315 + }, + { + "epoch": 18.97553017944535, + "grad_norm": 0.00212913122959435, + "learning_rate": 7.973952490958559e-06, + "loss": 0.0011, + "num_input_tokens_seen": 251001424, + "step": 116320 + }, + { + "epoch": 18.976345840130506, + "grad_norm": 0.00044627260649576783, + "learning_rate": 7.961295973917759e-06, + "loss": 0.0003, + "num_input_tokens_seen": 251011472, + "step": 116325 + }, + { + "epoch": 18.97716150081566, + "grad_norm": 0.012283191084861755, + "learning_rate": 7.948649428665522e-06, + "loss": 0.001, + "num_input_tokens_seen": 251022288, + "step": 116330 + }, + { + "epoch": 18.977977161500817, + "grad_norm": 0.0003199919010512531, + "learning_rate": 7.936012855458085e-06, + "loss": 0.0005, + "num_input_tokens_seen": 251033072, + "step": 116335 + }, + { + "epoch": 18.97879282218597, + "grad_norm": 0.00031763766310177743, + "learning_rate": 7.923386254551523e-06, + "loss": 0.0518, + "num_input_tokens_seen": 251042992, + "step": 116340 + }, + { + "epoch": 18.979608482871125, + "grad_norm": 0.010115230455994606, + "learning_rate": 7.910769626201908e-06, + "loss": 0.0016, + "num_input_tokens_seen": 251054448, + "step": 116345 + }, + { + "epoch": 18.98042414355628, + "grad_norm": 0.0006558912573382258, + "learning_rate": 7.898162970664702e-06, + "loss": 0.0003, + "num_input_tokens_seen": 251066832, + "step": 116350 + }, + { + "epoch": 18.981239804241437, + "grad_norm": 0.00017292254779022187, + "learning_rate": 7.88556628819559e-06, + "loss": 0.0002, + "num_input_tokens_seen": 251078448, + "step": 116355 + }, + { + "epoch": 18.982055464926592, + "grad_norm": 0.0011193545069545507, + "learning_rate": 7.872979579049644e-06, + "loss": 0.0023, + "num_input_tokens_seen": 251089392, + "step": 116360 + }, + { + "epoch": 18.982871125611744, + "grad_norm": 0.00015031429938971996, + "learning_rate": 7.860402843482218e-06, + "loss": 0.0001, + "num_input_tokens_seen": 251098416, + "step": 116365 + }, + { + "epoch": 18.9836867862969, + "grad_norm": 0.00010591888712951913, + "learning_rate": 7.847836081747939e-06, + "loss": 0.0002, + "num_input_tokens_seen": 251108688, + "step": 116370 + }, + { + "epoch": 18.984502446982056, + "grad_norm": 0.03437451645731926, + "learning_rate": 7.83527929410166e-06, + "loss": 0.0034, + "num_input_tokens_seen": 251120432, + "step": 116375 + }, + { + "epoch": 18.98531810766721, + "grad_norm": 0.0016349928919225931, + "learning_rate": 7.822732480797734e-06, + "loss": 0.0004, + "num_input_tokens_seen": 251130096, + "step": 116380 + }, + { + "epoch": 18.986133768352367, + "grad_norm": 0.009741709567606449, + "learning_rate": 7.810195642090568e-06, + "loss": 0.0012, + "num_input_tokens_seen": 251139984, + "step": 116385 + }, + { + "epoch": 18.98694942903752, + "grad_norm": 0.0003107144439127296, + "learning_rate": 7.797668778234179e-06, + "loss": 0.0046, + "num_input_tokens_seen": 251150352, + "step": 116390 + }, + { + "epoch": 18.987765089722675, + "grad_norm": 0.07953647524118423, + "learning_rate": 7.785151889482422e-06, + "loss": 0.0024, + "num_input_tokens_seen": 251160912, + "step": 116395 + }, + { + "epoch": 18.98858075040783, + "grad_norm": 0.000604417989961803, + "learning_rate": 7.772644976088982e-06, + "loss": 0.0001, + "num_input_tokens_seen": 251172400, + "step": 116400 + }, + { + "epoch": 18.989396411092986, + "grad_norm": 0.002135306363925338, + "learning_rate": 7.760148038307324e-06, + "loss": 0.0002, + "num_input_tokens_seen": 251182288, + "step": 116405 + }, + { + "epoch": 18.99021207177814, + "grad_norm": 0.0010087854461744428, + "learning_rate": 7.747661076390688e-06, + "loss": 0.0003, + "num_input_tokens_seen": 251192336, + "step": 116410 + }, + { + "epoch": 18.991027732463294, + "grad_norm": 0.00012645094830077142, + "learning_rate": 7.735184090592206e-06, + "loss": 0.0004, + "num_input_tokens_seen": 251202480, + "step": 116415 + }, + { + "epoch": 18.99184339314845, + "grad_norm": 7.001680933171883e-05, + "learning_rate": 7.722717081164677e-06, + "loss": 0.0002, + "num_input_tokens_seen": 251213328, + "step": 116420 + }, + { + "epoch": 18.992659053833606, + "grad_norm": 0.0023621264845132828, + "learning_rate": 7.710260048360784e-06, + "loss": 0.0009, + "num_input_tokens_seen": 251224240, + "step": 116425 + }, + { + "epoch": 18.99347471451876, + "grad_norm": 0.0001349316880805418, + "learning_rate": 7.697812992432996e-06, + "loss": 0.0002, + "num_input_tokens_seen": 251236112, + "step": 116430 + }, + { + "epoch": 18.994290375203914, + "grad_norm": 0.0005849722074344754, + "learning_rate": 7.685375913633607e-06, + "loss": 0.0002, + "num_input_tokens_seen": 251246096, + "step": 116435 + }, + { + "epoch": 18.99510603588907, + "grad_norm": 0.00024042680161073804, + "learning_rate": 7.67294881221453e-06, + "loss": 0.0001, + "num_input_tokens_seen": 251257680, + "step": 116440 + }, + { + "epoch": 18.995921696574225, + "grad_norm": 0.006447424180805683, + "learning_rate": 7.660531688427729e-06, + "loss": 0.0008, + "num_input_tokens_seen": 251267696, + "step": 116445 + }, + { + "epoch": 18.99673735725938, + "grad_norm": 0.0001192994459415786, + "learning_rate": 7.648124542524892e-06, + "loss": 0.0001, + "num_input_tokens_seen": 251278512, + "step": 116450 + }, + { + "epoch": 18.997553017944536, + "grad_norm": 0.00023178647097665817, + "learning_rate": 7.635727374757318e-06, + "loss": 0.0002, + "num_input_tokens_seen": 251289520, + "step": 116455 + }, + { + "epoch": 18.99836867862969, + "grad_norm": 0.00013555011537391692, + "learning_rate": 7.623340185376415e-06, + "loss": 0.0006, + "num_input_tokens_seen": 251300528, + "step": 116460 + }, + { + "epoch": 18.999184339314844, + "grad_norm": 0.0009752605692483485, + "learning_rate": 7.6109629746330955e-06, + "loss": 0.0002, + "num_input_tokens_seen": 251310992, + "step": 116465 + }, + { + "epoch": 19.0, + "grad_norm": 0.002769812010228634, + "learning_rate": 7.5985957427782695e-06, + "loss": 0.0007, + "num_input_tokens_seen": 251320768, + "step": 116470 + }, + { + "epoch": 19.0, + "eval_loss": 0.3454802930355072, + "eval_runtime": 104.4786, + "eval_samples_per_second": 26.082, + "eval_steps_per_second": 6.528, + "num_input_tokens_seen": 251320768, + "step": 116470 + }, + { + "epoch": 19.000815660685156, + "grad_norm": 0.00028177094645798206, + "learning_rate": 7.5862384900625135e-06, + "loss": 0.0003, + "num_input_tokens_seen": 251331200, + "step": 116475 + }, + { + "epoch": 19.00163132137031, + "grad_norm": 0.0005034432397224009, + "learning_rate": 7.573891216736406e-06, + "loss": 0.0001, + "num_input_tokens_seen": 251342368, + "step": 116480 + }, + { + "epoch": 19.002446982055464, + "grad_norm": 0.028141312301158905, + "learning_rate": 7.561553923049969e-06, + "loss": 0.0006, + "num_input_tokens_seen": 251351840, + "step": 116485 + }, + { + "epoch": 19.00326264274062, + "grad_norm": 0.00022920592164155096, + "learning_rate": 7.549226609253446e-06, + "loss": 0.0003, + "num_input_tokens_seen": 251363296, + "step": 116490 + }, + { + "epoch": 19.004078303425775, + "grad_norm": 0.00011687728692777455, + "learning_rate": 7.536909275596471e-06, + "loss": 0.0003, + "num_input_tokens_seen": 251373536, + "step": 116495 + }, + { + "epoch": 19.00489396411093, + "grad_norm": 0.002491643186658621, + "learning_rate": 7.524601922328844e-06, + "loss": 0.0078, + "num_input_tokens_seen": 251383776, + "step": 116500 + }, + { + "epoch": 19.005709624796086, + "grad_norm": 0.0011212375247851014, + "learning_rate": 7.512304549699811e-06, + "loss": 0.002, + "num_input_tokens_seen": 251394208, + "step": 116505 + }, + { + "epoch": 19.00652528548124, + "grad_norm": 0.0001056869950843975, + "learning_rate": 7.500017157958838e-06, + "loss": 0.001, + "num_input_tokens_seen": 251405504, + "step": 116510 + }, + { + "epoch": 19.007340946166394, + "grad_norm": 0.0001518863718956709, + "learning_rate": 7.487739747354672e-06, + "loss": 0.0001, + "num_input_tokens_seen": 251415648, + "step": 116515 + }, + { + "epoch": 19.00815660685155, + "grad_norm": 0.00010570868471404538, + "learning_rate": 7.475472318136334e-06, + "loss": 0.0004, + "num_input_tokens_seen": 251426528, + "step": 116520 + }, + { + "epoch": 19.008972267536706, + "grad_norm": 0.0007762995082885027, + "learning_rate": 7.4632148705522374e-06, + "loss": 0.0002, + "num_input_tokens_seen": 251437632, + "step": 116525 + }, + { + "epoch": 19.00978792822186, + "grad_norm": 0.00488898204639554, + "learning_rate": 7.450967404851017e-06, + "loss": 0.0004, + "num_input_tokens_seen": 251448576, + "step": 116530 + }, + { + "epoch": 19.010603588907014, + "grad_norm": 0.0298672616481781, + "learning_rate": 7.438729921280752e-06, + "loss": 0.001, + "num_input_tokens_seen": 251458656, + "step": 116535 + }, + { + "epoch": 19.01141924959217, + "grad_norm": 0.001917483750730753, + "learning_rate": 7.42650242008952e-06, + "loss": 0.0002, + "num_input_tokens_seen": 251469216, + "step": 116540 + }, + { + "epoch": 19.012234910277325, + "grad_norm": 0.0006010333308950067, + "learning_rate": 7.41428490152507e-06, + "loss": 0.0002, + "num_input_tokens_seen": 251479456, + "step": 116545 + }, + { + "epoch": 19.01305057096248, + "grad_norm": 9.717236389406025e-05, + "learning_rate": 7.402077365835036e-06, + "loss": 0.0032, + "num_input_tokens_seen": 251490112, + "step": 116550 + }, + { + "epoch": 19.013866231647636, + "grad_norm": 0.01498456485569477, + "learning_rate": 7.389879813266831e-06, + "loss": 0.0005, + "num_input_tokens_seen": 251500960, + "step": 116555 + }, + { + "epoch": 19.01468189233279, + "grad_norm": 0.01681513711810112, + "learning_rate": 7.377692244067591e-06, + "loss": 0.0006, + "num_input_tokens_seen": 251510752, + "step": 116560 + }, + { + "epoch": 19.015497553017944, + "grad_norm": 0.0649740993976593, + "learning_rate": 7.36551465848434e-06, + "loss": 0.0016, + "num_input_tokens_seen": 251521216, + "step": 116565 + }, + { + "epoch": 19.0163132137031, + "grad_norm": 0.001003578188829124, + "learning_rate": 7.353347056763937e-06, + "loss": 0.001, + "num_input_tokens_seen": 251531200, + "step": 116570 + }, + { + "epoch": 19.017128874388256, + "grad_norm": 0.001010693609714508, + "learning_rate": 7.341189439152907e-06, + "loss": 0.0002, + "num_input_tokens_seen": 251542528, + "step": 116575 + }, + { + "epoch": 19.017944535073408, + "grad_norm": 0.00067968072835356, + "learning_rate": 7.329041805897551e-06, + "loss": 0.0004, + "num_input_tokens_seen": 251553408, + "step": 116580 + }, + { + "epoch": 19.018760195758563, + "grad_norm": 0.0070663755759596825, + "learning_rate": 7.316904157244342e-06, + "loss": 0.0006, + "num_input_tokens_seen": 251565568, + "step": 116585 + }, + { + "epoch": 19.01957585644372, + "grad_norm": 0.019253183156251907, + "learning_rate": 7.304776493438914e-06, + "loss": 0.0013, + "num_input_tokens_seen": 251576128, + "step": 116590 + }, + { + "epoch": 19.020391517128875, + "grad_norm": 0.017462970688939095, + "learning_rate": 7.2926588147273484e-06, + "loss": 0.0007, + "num_input_tokens_seen": 251586592, + "step": 116595 + }, + { + "epoch": 19.02120717781403, + "grad_norm": 0.012116583064198494, + "learning_rate": 7.280551121355005e-06, + "loss": 0.0003, + "num_input_tokens_seen": 251598048, + "step": 116600 + }, + { + "epoch": 19.022022838499183, + "grad_norm": 0.0011931859189644456, + "learning_rate": 7.268453413567467e-06, + "loss": 0.001, + "num_input_tokens_seen": 251609472, + "step": 116605 + }, + { + "epoch": 19.02283849918434, + "grad_norm": 0.009402960538864136, + "learning_rate": 7.256365691609645e-06, + "loss": 0.0018, + "num_input_tokens_seen": 251620320, + "step": 116610 + }, + { + "epoch": 19.023654159869494, + "grad_norm": 0.010799596086144447, + "learning_rate": 7.244287955726791e-06, + "loss": 0.0031, + "num_input_tokens_seen": 251631648, + "step": 116615 + }, + { + "epoch": 19.02446982055465, + "grad_norm": 0.014306819997727871, + "learning_rate": 7.232220206163431e-06, + "loss": 0.0008, + "num_input_tokens_seen": 251642624, + "step": 116620 + }, + { + "epoch": 19.025285481239806, + "grad_norm": 0.0006117624579928815, + "learning_rate": 7.220162443164369e-06, + "loss": 0.0002, + "num_input_tokens_seen": 251653920, + "step": 116625 + }, + { + "epoch": 19.026101141924958, + "grad_norm": 0.011523784138262272, + "learning_rate": 7.2081146669737416e-06, + "loss": 0.0003, + "num_input_tokens_seen": 251664800, + "step": 116630 + }, + { + "epoch": 19.026916802610113, + "grad_norm": 0.0003540304605849087, + "learning_rate": 7.196076877835911e-06, + "loss": 0.0001, + "num_input_tokens_seen": 251676000, + "step": 116635 + }, + { + "epoch": 19.02773246329527, + "grad_norm": 0.10838975012302399, + "learning_rate": 7.1840490759946805e-06, + "loss": 0.0017, + "num_input_tokens_seen": 251686432, + "step": 116640 + }, + { + "epoch": 19.028548123980425, + "grad_norm": 0.0010924191446974874, + "learning_rate": 7.172031261693967e-06, + "loss": 0.0002, + "num_input_tokens_seen": 251698048, + "step": 116645 + }, + { + "epoch": 19.02936378466558, + "grad_norm": 0.001310434308834374, + "learning_rate": 7.160023435177132e-06, + "loss": 0.0001, + "num_input_tokens_seen": 251707776, + "step": 116650 + }, + { + "epoch": 19.030179445350733, + "grad_norm": 0.01372646912932396, + "learning_rate": 7.148025596687702e-06, + "loss": 0.005, + "num_input_tokens_seen": 251719328, + "step": 116655 + }, + { + "epoch": 19.03099510603589, + "grad_norm": 0.005580388940870762, + "learning_rate": 7.136037746468704e-06, + "loss": 0.0002, + "num_input_tokens_seen": 251731136, + "step": 116660 + }, + { + "epoch": 19.031810766721044, + "grad_norm": 0.14864104986190796, + "learning_rate": 7.124059884763168e-06, + "loss": 0.0045, + "num_input_tokens_seen": 251742528, + "step": 116665 + }, + { + "epoch": 19.0326264274062, + "grad_norm": 0.0004833217244595289, + "learning_rate": 7.112092011813842e-06, + "loss": 0.0009, + "num_input_tokens_seen": 251751392, + "step": 116670 + }, + { + "epoch": 19.033442088091356, + "grad_norm": 0.0017427537823095918, + "learning_rate": 7.1001341278632e-06, + "loss": 0.0002, + "num_input_tokens_seen": 251762112, + "step": 116675 + }, + { + "epoch": 19.034257748776508, + "grad_norm": 0.00019203309784643352, + "learning_rate": 7.08818623315366e-06, + "loss": 0.0006, + "num_input_tokens_seen": 251772576, + "step": 116680 + }, + { + "epoch": 19.035073409461663, + "grad_norm": 0.0002256287116324529, + "learning_rate": 7.076248327927359e-06, + "loss": 0.0004, + "num_input_tokens_seen": 251784448, + "step": 116685 + }, + { + "epoch": 19.03588907014682, + "grad_norm": 0.0001782215986168012, + "learning_rate": 7.064320412426162e-06, + "loss": 0.0005, + "num_input_tokens_seen": 251794368, + "step": 116690 + }, + { + "epoch": 19.036704730831975, + "grad_norm": 0.01535695232450962, + "learning_rate": 7.052402486891818e-06, + "loss": 0.0009, + "num_input_tokens_seen": 251805824, + "step": 116695 + }, + { + "epoch": 19.03752039151713, + "grad_norm": 0.0003223973617423326, + "learning_rate": 7.040494551565912e-06, + "loss": 0.0014, + "num_input_tokens_seen": 251815424, + "step": 116700 + }, + { + "epoch": 19.038336052202283, + "grad_norm": 0.01861644722521305, + "learning_rate": 7.028596606689808e-06, + "loss": 0.0015, + "num_input_tokens_seen": 251826144, + "step": 116705 + }, + { + "epoch": 19.03915171288744, + "grad_norm": 0.282966673374176, + "learning_rate": 7.016708652504477e-06, + "loss": 0.0058, + "num_input_tokens_seen": 251837792, + "step": 116710 + }, + { + "epoch": 19.039967373572594, + "grad_norm": 0.002146463142707944, + "learning_rate": 7.004830689251007e-06, + "loss": 0.0021, + "num_input_tokens_seen": 251848640, + "step": 116715 + }, + { + "epoch": 19.04078303425775, + "grad_norm": 0.00024671226856298745, + "learning_rate": 6.992962717170038e-06, + "loss": 0.0018, + "num_input_tokens_seen": 251860960, + "step": 116720 + }, + { + "epoch": 19.041598694942905, + "grad_norm": 0.006902644410729408, + "learning_rate": 6.981104736502042e-06, + "loss": 0.0009, + "num_input_tokens_seen": 251871872, + "step": 116725 + }, + { + "epoch": 19.042414355628058, + "grad_norm": 0.0032473220489919186, + "learning_rate": 6.969256747487496e-06, + "loss": 0.0003, + "num_input_tokens_seen": 251883072, + "step": 116730 + }, + { + "epoch": 19.043230016313213, + "grad_norm": 0.029473217204213142, + "learning_rate": 6.957418750366318e-06, + "loss": 0.0007, + "num_input_tokens_seen": 251892864, + "step": 116735 + }, + { + "epoch": 19.04404567699837, + "grad_norm": 0.001046478166244924, + "learning_rate": 6.945590745378594e-06, + "loss": 0.001, + "num_input_tokens_seen": 251903424, + "step": 116740 + }, + { + "epoch": 19.044861337683525, + "grad_norm": 0.014905816875398159, + "learning_rate": 6.9337727327639096e-06, + "loss": 0.0011, + "num_input_tokens_seen": 251914048, + "step": 116745 + }, + { + "epoch": 19.045676998368677, + "grad_norm": 0.001980874687433243, + "learning_rate": 6.921964712761853e-06, + "loss": 0.0005, + "num_input_tokens_seen": 251924448, + "step": 116750 + }, + { + "epoch": 19.046492659053833, + "grad_norm": 0.0007835765718482435, + "learning_rate": 6.910166685611674e-06, + "loss": 0.0017, + "num_input_tokens_seen": 251935296, + "step": 116755 + }, + { + "epoch": 19.04730831973899, + "grad_norm": 0.04969790577888489, + "learning_rate": 6.898378651552517e-06, + "loss": 0.0044, + "num_input_tokens_seen": 251946752, + "step": 116760 + }, + { + "epoch": 19.048123980424144, + "grad_norm": 0.0008774483576416969, + "learning_rate": 6.88660061082319e-06, + "loss": 0.0002, + "num_input_tokens_seen": 251958208, + "step": 116765 + }, + { + "epoch": 19.0489396411093, + "grad_norm": 0.0001345430064247921, + "learning_rate": 6.874832563662559e-06, + "loss": 0.0002, + "num_input_tokens_seen": 251968800, + "step": 116770 + }, + { + "epoch": 19.049755301794452, + "grad_norm": 0.0007818325539119542, + "learning_rate": 6.863074510308931e-06, + "loss": 0.0008, + "num_input_tokens_seen": 251980032, + "step": 116775 + }, + { + "epoch": 19.050570962479608, + "grad_norm": 0.0001431761629646644, + "learning_rate": 6.851326451000783e-06, + "loss": 0.0456, + "num_input_tokens_seen": 251991296, + "step": 116780 + }, + { + "epoch": 19.051386623164763, + "grad_norm": 0.0017999854171648622, + "learning_rate": 6.839588385976036e-06, + "loss": 0.0008, + "num_input_tokens_seen": 252002784, + "step": 116785 + }, + { + "epoch": 19.05220228384992, + "grad_norm": 0.00015301971870940179, + "learning_rate": 6.827860315472667e-06, + "loss": 0.0013, + "num_input_tokens_seen": 252014112, + "step": 116790 + }, + { + "epoch": 19.053017944535075, + "grad_norm": 0.00034009269438683987, + "learning_rate": 6.816142239728373e-06, + "loss": 0.0003, + "num_input_tokens_seen": 252025568, + "step": 116795 + }, + { + "epoch": 19.053833605220227, + "grad_norm": 0.0017962187994271517, + "learning_rate": 6.804434158980577e-06, + "loss": 0.0003, + "num_input_tokens_seen": 252037408, + "step": 116800 + }, + { + "epoch": 19.054649265905383, + "grad_norm": 0.0013915542513132095, + "learning_rate": 6.792736073466587e-06, + "loss": 0.0008, + "num_input_tokens_seen": 252048448, + "step": 116805 + }, + { + "epoch": 19.05546492659054, + "grad_norm": 0.005512189120054245, + "learning_rate": 6.781047983423439e-06, + "loss": 0.0002, + "num_input_tokens_seen": 252058560, + "step": 116810 + }, + { + "epoch": 19.056280587275694, + "grad_norm": 0.0060982778668403625, + "learning_rate": 6.769369889088106e-06, + "loss": 0.0002, + "num_input_tokens_seen": 252069216, + "step": 116815 + }, + { + "epoch": 19.05709624796085, + "grad_norm": 0.003994780592620373, + "learning_rate": 6.75770179069718e-06, + "loss": 0.0009, + "num_input_tokens_seen": 252077920, + "step": 116820 + }, + { + "epoch": 19.057911908646002, + "grad_norm": 0.0012616260210052133, + "learning_rate": 6.746043688487136e-06, + "loss": 0.0002, + "num_input_tokens_seen": 252088320, + "step": 116825 + }, + { + "epoch": 19.058727569331158, + "grad_norm": 0.00020941105321981013, + "learning_rate": 6.734395582694286e-06, + "loss": 0.0005, + "num_input_tokens_seen": 252098368, + "step": 116830 + }, + { + "epoch": 19.059543230016313, + "grad_norm": 0.030388424172997475, + "learning_rate": 6.722757473554608e-06, + "loss": 0.0004, + "num_input_tokens_seen": 252109440, + "step": 116835 + }, + { + "epoch": 19.06035889070147, + "grad_norm": 0.0014290224062278867, + "learning_rate": 6.71112936130408e-06, + "loss": 0.0002, + "num_input_tokens_seen": 252118816, + "step": 116840 + }, + { + "epoch": 19.061174551386625, + "grad_norm": 0.00035428086994215846, + "learning_rate": 6.6995112461782355e-06, + "loss": 0.0014, + "num_input_tokens_seen": 252130560, + "step": 116845 + }, + { + "epoch": 19.061990212071777, + "grad_norm": 0.003363620722666383, + "learning_rate": 6.6879031284126646e-06, + "loss": 0.0028, + "num_input_tokens_seen": 252140992, + "step": 116850 + }, + { + "epoch": 19.062805872756933, + "grad_norm": 0.01926584355533123, + "learning_rate": 6.676305008242512e-06, + "loss": 0.0021, + "num_input_tokens_seen": 252151264, + "step": 116855 + }, + { + "epoch": 19.063621533442088, + "grad_norm": 0.01173666212707758, + "learning_rate": 6.664716885902811e-06, + "loss": 0.0004, + "num_input_tokens_seen": 252161216, + "step": 116860 + }, + { + "epoch": 19.064437194127244, + "grad_norm": 0.00019005569629371166, + "learning_rate": 6.653138761628541e-06, + "loss": 0.0003, + "num_input_tokens_seen": 252172192, + "step": 116865 + }, + { + "epoch": 19.0652528548124, + "grad_norm": 0.010669474489986897, + "learning_rate": 6.641570635654182e-06, + "loss": 0.0013, + "num_input_tokens_seen": 252182400, + "step": 116870 + }, + { + "epoch": 19.06606851549755, + "grad_norm": 0.027841776609420776, + "learning_rate": 6.630012508214322e-06, + "loss": 0.0014, + "num_input_tokens_seen": 252192352, + "step": 116875 + }, + { + "epoch": 19.066884176182707, + "grad_norm": 0.003972830716520548, + "learning_rate": 6.618464379543166e-06, + "loss": 0.0004, + "num_input_tokens_seen": 252202080, + "step": 116880 + }, + { + "epoch": 19.067699836867863, + "grad_norm": 0.01569187268614769, + "learning_rate": 6.6069262498746895e-06, + "loss": 0.0008, + "num_input_tokens_seen": 252213312, + "step": 116885 + }, + { + "epoch": 19.06851549755302, + "grad_norm": 0.00013618692173622549, + "learning_rate": 6.595398119442764e-06, + "loss": 0.0002, + "num_input_tokens_seen": 252224512, + "step": 116890 + }, + { + "epoch": 19.069331158238175, + "grad_norm": 0.0201239213347435, + "learning_rate": 6.583879988481034e-06, + "loss": 0.0007, + "num_input_tokens_seen": 252236384, + "step": 116895 + }, + { + "epoch": 19.070146818923327, + "grad_norm": 0.0005762167857028544, + "learning_rate": 6.572371857222925e-06, + "loss": 0.0002, + "num_input_tokens_seen": 252247168, + "step": 116900 + }, + { + "epoch": 19.070962479608482, + "grad_norm": 0.0012171623529866338, + "learning_rate": 6.560873725901695e-06, + "loss": 0.0003, + "num_input_tokens_seen": 252257152, + "step": 116905 + }, + { + "epoch": 19.071778140293638, + "grad_norm": 0.00040827374323271215, + "learning_rate": 6.5493855947502674e-06, + "loss": 0.0003, + "num_input_tokens_seen": 252266624, + "step": 116910 + }, + { + "epoch": 19.072593800978794, + "grad_norm": 0.12049645185470581, + "learning_rate": 6.537907464001569e-06, + "loss": 0.0015, + "num_input_tokens_seen": 252278464, + "step": 116915 + }, + { + "epoch": 19.07340946166395, + "grad_norm": 0.10069382935762405, + "learning_rate": 6.5264393338881345e-06, + "loss": 0.0038, + "num_input_tokens_seen": 252291104, + "step": 116920 + }, + { + "epoch": 19.0742251223491, + "grad_norm": 0.021143339574337006, + "learning_rate": 6.514981204642445e-06, + "loss": 0.0008, + "num_input_tokens_seen": 252302368, + "step": 116925 + }, + { + "epoch": 19.075040783034257, + "grad_norm": 0.0004274807288311422, + "learning_rate": 6.503533076496704e-06, + "loss": 0.0001, + "num_input_tokens_seen": 252313504, + "step": 116930 + }, + { + "epoch": 19.075856443719413, + "grad_norm": 0.0014191556256264448, + "learning_rate": 6.492094949682892e-06, + "loss": 0.0013, + "num_input_tokens_seen": 252324896, + "step": 116935 + }, + { + "epoch": 19.07667210440457, + "grad_norm": 0.007025528699159622, + "learning_rate": 6.480666824432879e-06, + "loss": 0.0006, + "num_input_tokens_seen": 252334976, + "step": 116940 + }, + { + "epoch": 19.07748776508972, + "grad_norm": 0.0020549034234136343, + "learning_rate": 6.469248700978148e-06, + "loss": 0.1388, + "num_input_tokens_seen": 252344704, + "step": 116945 + }, + { + "epoch": 19.078303425774877, + "grad_norm": 0.014825491234660149, + "learning_rate": 6.457840579550234e-06, + "loss": 0.0008, + "num_input_tokens_seen": 252355072, + "step": 116950 + }, + { + "epoch": 19.079119086460032, + "grad_norm": 0.12954717874526978, + "learning_rate": 6.4464424603802865e-06, + "loss": 0.0024, + "num_input_tokens_seen": 252366432, + "step": 116955 + }, + { + "epoch": 19.079934747145188, + "grad_norm": 0.0002758090849965811, + "learning_rate": 6.435054343699287e-06, + "loss": 0.0009, + "num_input_tokens_seen": 252377792, + "step": 116960 + }, + { + "epoch": 19.080750407830344, + "grad_norm": 0.06096493452787399, + "learning_rate": 6.423676229738051e-06, + "loss": 0.0009, + "num_input_tokens_seen": 252388864, + "step": 116965 + }, + { + "epoch": 19.081566068515496, + "grad_norm": 0.0001701121509540826, + "learning_rate": 6.412308118727117e-06, + "loss": 0.0005, + "num_input_tokens_seen": 252399296, + "step": 116970 + }, + { + "epoch": 19.08238172920065, + "grad_norm": 0.00011954054934903979, + "learning_rate": 6.400950010896966e-06, + "loss": 0.0098, + "num_input_tokens_seen": 252409856, + "step": 116975 + }, + { + "epoch": 19.083197389885807, + "grad_norm": 0.00028709403704851866, + "learning_rate": 6.389601906477693e-06, + "loss": 0.0004, + "num_input_tokens_seen": 252421568, + "step": 116980 + }, + { + "epoch": 19.084013050570963, + "grad_norm": 0.06499593704938889, + "learning_rate": 6.378263805699391e-06, + "loss": 0.0012, + "num_input_tokens_seen": 252433568, + "step": 116985 + }, + { + "epoch": 19.08482871125612, + "grad_norm": 0.0003727501316461712, + "learning_rate": 6.36693570879171e-06, + "loss": 0.0001, + "num_input_tokens_seen": 252442528, + "step": 116990 + }, + { + "epoch": 19.08564437194127, + "grad_norm": 0.028141794726252556, + "learning_rate": 6.355617615984355e-06, + "loss": 0.0008, + "num_input_tokens_seen": 252452448, + "step": 116995 + }, + { + "epoch": 19.086460032626427, + "grad_norm": 0.0008397915516979992, + "learning_rate": 6.344309527506587e-06, + "loss": 0.0005, + "num_input_tokens_seen": 252463296, + "step": 117000 + }, + { + "epoch": 19.087275693311582, + "grad_norm": 0.00014411697338800877, + "learning_rate": 6.333011443587722e-06, + "loss": 0.0013, + "num_input_tokens_seen": 252471840, + "step": 117005 + }, + { + "epoch": 19.088091353996738, + "grad_norm": 0.0006208732957020402, + "learning_rate": 6.3217233644565216e-06, + "loss": 0.0006, + "num_input_tokens_seen": 252482816, + "step": 117010 + }, + { + "epoch": 19.088907014681894, + "grad_norm": 0.0006604628288187087, + "learning_rate": 6.3104452903419704e-06, + "loss": 0.0033, + "num_input_tokens_seen": 252493696, + "step": 117015 + }, + { + "epoch": 19.089722675367046, + "grad_norm": 0.0009543391643092036, + "learning_rate": 6.299177221472496e-06, + "loss": 0.0005, + "num_input_tokens_seen": 252504768, + "step": 117020 + }, + { + "epoch": 19.0905383360522, + "grad_norm": 0.008625343441963196, + "learning_rate": 6.287919158076472e-06, + "loss": 0.0012, + "num_input_tokens_seen": 252515712, + "step": 117025 + }, + { + "epoch": 19.091353996737357, + "grad_norm": 0.00034453420084901154, + "learning_rate": 6.2766711003821035e-06, + "loss": 0.0005, + "num_input_tokens_seen": 252526880, + "step": 117030 + }, + { + "epoch": 19.092169657422513, + "grad_norm": 8.671904652146623e-05, + "learning_rate": 6.265433048617375e-06, + "loss": 0.0018, + "num_input_tokens_seen": 252537728, + "step": 117035 + }, + { + "epoch": 19.09298531810767, + "grad_norm": 0.03390451520681381, + "learning_rate": 6.254205003009938e-06, + "loss": 0.0015, + "num_input_tokens_seen": 252549568, + "step": 117040 + }, + { + "epoch": 19.09380097879282, + "grad_norm": 0.00019637358491308987, + "learning_rate": 6.242986963787445e-06, + "loss": 0.001, + "num_input_tokens_seen": 252560864, + "step": 117045 + }, + { + "epoch": 19.094616639477977, + "grad_norm": 0.2152816355228424, + "learning_rate": 6.231778931177157e-06, + "loss": 0.0042, + "num_input_tokens_seen": 252570816, + "step": 117050 + }, + { + "epoch": 19.095432300163132, + "grad_norm": 0.0036635827273130417, + "learning_rate": 6.220580905406226e-06, + "loss": 0.0025, + "num_input_tokens_seen": 252581280, + "step": 117055 + }, + { + "epoch": 19.096247960848288, + "grad_norm": 0.5060181021690369, + "learning_rate": 6.209392886701692e-06, + "loss": 0.0136, + "num_input_tokens_seen": 252592064, + "step": 117060 + }, + { + "epoch": 19.097063621533444, + "grad_norm": 0.011936215683817863, + "learning_rate": 6.198214875290209e-06, + "loss": 0.0003, + "num_input_tokens_seen": 252602400, + "step": 117065 + }, + { + "epoch": 19.097879282218596, + "grad_norm": 0.0006445012404583395, + "learning_rate": 6.187046871398316e-06, + "loss": 0.0001, + "num_input_tokens_seen": 252613312, + "step": 117070 + }, + { + "epoch": 19.09869494290375, + "grad_norm": 0.00013280721032060683, + "learning_rate": 6.175888875252389e-06, + "loss": 0.0001, + "num_input_tokens_seen": 252623872, + "step": 117075 + }, + { + "epoch": 19.099510603588907, + "grad_norm": 0.001934311119839549, + "learning_rate": 6.1647408870785236e-06, + "loss": 0.0026, + "num_input_tokens_seen": 252635168, + "step": 117080 + }, + { + "epoch": 19.100326264274063, + "grad_norm": 0.0030935672111809254, + "learning_rate": 6.1536029071025955e-06, + "loss": 0.0986, + "num_input_tokens_seen": 252646080, + "step": 117085 + }, + { + "epoch": 19.10114192495922, + "grad_norm": 0.0007188143208622932, + "learning_rate": 6.142474935550535e-06, + "loss": 0.0002, + "num_input_tokens_seen": 252656512, + "step": 117090 + }, + { + "epoch": 19.10195758564437, + "grad_norm": 0.0006241886876523495, + "learning_rate": 6.131356972647606e-06, + "loss": 0.0003, + "num_input_tokens_seen": 252667776, + "step": 117095 + }, + { + "epoch": 19.102773246329527, + "grad_norm": 0.0007863701903261244, + "learning_rate": 6.120249018619295e-06, + "loss": 0.0013, + "num_input_tokens_seen": 252678464, + "step": 117100 + }, + { + "epoch": 19.103588907014682, + "grad_norm": 0.0007161149405874312, + "learning_rate": 6.109151073690644e-06, + "loss": 0.0002, + "num_input_tokens_seen": 252689440, + "step": 117105 + }, + { + "epoch": 19.104404567699838, + "grad_norm": 0.04049726575613022, + "learning_rate": 6.0980631380866405e-06, + "loss": 0.0011, + "num_input_tokens_seen": 252700960, + "step": 117110 + }, + { + "epoch": 19.10522022838499, + "grad_norm": 0.024886978790163994, + "learning_rate": 6.086985212031881e-06, + "loss": 0.0007, + "num_input_tokens_seen": 252712000, + "step": 117115 + }, + { + "epoch": 19.106035889070146, + "grad_norm": 0.0012817103415727615, + "learning_rate": 6.075917295750965e-06, + "loss": 0.0017, + "num_input_tokens_seen": 252722880, + "step": 117120 + }, + { + "epoch": 19.1068515497553, + "grad_norm": 0.004278149455785751, + "learning_rate": 6.064859389468158e-06, + "loss": 0.0003, + "num_input_tokens_seen": 252733248, + "step": 117125 + }, + { + "epoch": 19.107667210440457, + "grad_norm": 0.0004911816795356572, + "learning_rate": 6.053811493407613e-06, + "loss": 0.0002, + "num_input_tokens_seen": 252743872, + "step": 117130 + }, + { + "epoch": 19.108482871125613, + "grad_norm": 0.022411055862903595, + "learning_rate": 6.04277360779315e-06, + "loss": 0.0007, + "num_input_tokens_seen": 252754336, + "step": 117135 + }, + { + "epoch": 19.109298531810765, + "grad_norm": 0.0007711627404205501, + "learning_rate": 6.031745732848593e-06, + "loss": 0.0021, + "num_input_tokens_seen": 252765120, + "step": 117140 + }, + { + "epoch": 19.11011419249592, + "grad_norm": 0.00018716967315413058, + "learning_rate": 6.02072786879726e-06, + "loss": 0.0004, + "num_input_tokens_seen": 252776832, + "step": 117145 + }, + { + "epoch": 19.110929853181077, + "grad_norm": 0.18330000340938568, + "learning_rate": 6.009720015862585e-06, + "loss": 0.0039, + "num_input_tokens_seen": 252787040, + "step": 117150 + }, + { + "epoch": 19.111745513866232, + "grad_norm": 0.00031747479806654155, + "learning_rate": 5.9987221742675566e-06, + "loss": 0.0002, + "num_input_tokens_seen": 252797472, + "step": 117155 + }, + { + "epoch": 19.112561174551388, + "grad_norm": 0.004758198745548725, + "learning_rate": 5.987734344235107e-06, + "loss": 0.0012, + "num_input_tokens_seen": 252808768, + "step": 117160 + }, + { + "epoch": 19.11337683523654, + "grad_norm": 0.0006726587889716029, + "learning_rate": 5.976756525987948e-06, + "loss": 0.0002, + "num_input_tokens_seen": 252819584, + "step": 117165 + }, + { + "epoch": 19.114192495921696, + "grad_norm": 0.024424806237220764, + "learning_rate": 5.965788719748566e-06, + "loss": 0.0007, + "num_input_tokens_seen": 252830528, + "step": 117170 + }, + { + "epoch": 19.11500815660685, + "grad_norm": 0.0022546490654349327, + "learning_rate": 5.954830925739174e-06, + "loss": 0.0002, + "num_input_tokens_seen": 252839936, + "step": 117175 + }, + { + "epoch": 19.115823817292007, + "grad_norm": 0.00031928785028867424, + "learning_rate": 5.943883144181872e-06, + "loss": 0.0009, + "num_input_tokens_seen": 252850848, + "step": 117180 + }, + { + "epoch": 19.116639477977163, + "grad_norm": 0.00033015222288668156, + "learning_rate": 5.932945375298537e-06, + "loss": 0.0008, + "num_input_tokens_seen": 252861408, + "step": 117185 + }, + { + "epoch": 19.117455138662315, + "grad_norm": 0.0023020415101200342, + "learning_rate": 5.922017619310826e-06, + "loss": 0.0002, + "num_input_tokens_seen": 252871456, + "step": 117190 + }, + { + "epoch": 19.11827079934747, + "grad_norm": 0.004900916945189238, + "learning_rate": 5.911099876440173e-06, + "loss": 0.0002, + "num_input_tokens_seen": 252883200, + "step": 117195 + }, + { + "epoch": 19.119086460032626, + "grad_norm": 0.029135456308722496, + "learning_rate": 5.900192146907957e-06, + "loss": 0.0005, + "num_input_tokens_seen": 252892896, + "step": 117200 + }, + { + "epoch": 19.119902120717782, + "grad_norm": 0.0034956608433276415, + "learning_rate": 5.889294430935111e-06, + "loss": 0.0031, + "num_input_tokens_seen": 252903648, + "step": 117205 + }, + { + "epoch": 19.120717781402938, + "grad_norm": 0.006260004825890064, + "learning_rate": 5.8784067287424584e-06, + "loss": 0.0009, + "num_input_tokens_seen": 252914880, + "step": 117210 + }, + { + "epoch": 19.12153344208809, + "grad_norm": 0.0007126359851099551, + "learning_rate": 5.8675290405508785e-06, + "loss": 0.0002, + "num_input_tokens_seen": 252925056, + "step": 117215 + }, + { + "epoch": 19.122349102773246, + "grad_norm": 0.000646061496809125, + "learning_rate": 5.856661366580584e-06, + "loss": 0.0012, + "num_input_tokens_seen": 252934624, + "step": 117220 + }, + { + "epoch": 19.1231647634584, + "grad_norm": 0.0010053542209789157, + "learning_rate": 5.845803707051955e-06, + "loss": 0.0006, + "num_input_tokens_seen": 252944800, + "step": 117225 + }, + { + "epoch": 19.123980424143557, + "grad_norm": 0.0002882974804379046, + "learning_rate": 5.834956062184926e-06, + "loss": 0.0003, + "num_input_tokens_seen": 252955200, + "step": 117230 + }, + { + "epoch": 19.124796084828713, + "grad_norm": 0.002109148073941469, + "learning_rate": 5.824118432199488e-06, + "loss": 0.001, + "num_input_tokens_seen": 252966752, + "step": 117235 + }, + { + "epoch": 19.125611745513865, + "grad_norm": 0.004663508385419846, + "learning_rate": 5.813290817315131e-06, + "loss": 0.0002, + "num_input_tokens_seen": 252977472, + "step": 117240 + }, + { + "epoch": 19.12642740619902, + "grad_norm": 0.0029989690519869328, + "learning_rate": 5.8024732177514585e-06, + "loss": 0.0034, + "num_input_tokens_seen": 252988192, + "step": 117245 + }, + { + "epoch": 19.127243066884176, + "grad_norm": 0.0024849416222423315, + "learning_rate": 5.791665633727461e-06, + "loss": 0.0002, + "num_input_tokens_seen": 252999488, + "step": 117250 + }, + { + "epoch": 19.128058727569332, + "grad_norm": 0.00015557196456938982, + "learning_rate": 5.780868065462408e-06, + "loss": 0.0003, + "num_input_tokens_seen": 253011104, + "step": 117255 + }, + { + "epoch": 19.128874388254488, + "grad_norm": 0.020317405462265015, + "learning_rate": 5.770080513174958e-06, + "loss": 0.0003, + "num_input_tokens_seen": 253021888, + "step": 117260 + }, + { + "epoch": 19.12969004893964, + "grad_norm": 0.00010429122630739585, + "learning_rate": 5.75930297708388e-06, + "loss": 0.0004, + "num_input_tokens_seen": 253032352, + "step": 117265 + }, + { + "epoch": 19.130505709624796, + "grad_norm": 0.001174905919469893, + "learning_rate": 5.748535457407444e-06, + "loss": 0.0005, + "num_input_tokens_seen": 253043744, + "step": 117270 + }, + { + "epoch": 19.13132137030995, + "grad_norm": 0.0013565943809226155, + "learning_rate": 5.737777954364032e-06, + "loss": 0.0007, + "num_input_tokens_seen": 253053856, + "step": 117275 + }, + { + "epoch": 19.132137030995107, + "grad_norm": 0.0004821173206437379, + "learning_rate": 5.727030468171468e-06, + "loss": 0.0002, + "num_input_tokens_seen": 253064288, + "step": 117280 + }, + { + "epoch": 19.13295269168026, + "grad_norm": 0.0002024052373599261, + "learning_rate": 5.71629299904769e-06, + "loss": 0.0002, + "num_input_tokens_seen": 253075104, + "step": 117285 + }, + { + "epoch": 19.133768352365415, + "grad_norm": 0.0001703380112303421, + "learning_rate": 5.705565547210301e-06, + "loss": 0.0002, + "num_input_tokens_seen": 253085728, + "step": 117290 + }, + { + "epoch": 19.13458401305057, + "grad_norm": 0.022170057520270348, + "learning_rate": 5.694848112876683e-06, + "loss": 0.0005, + "num_input_tokens_seen": 253094816, + "step": 117295 + }, + { + "epoch": 19.135399673735726, + "grad_norm": 0.004822219256311655, + "learning_rate": 5.684140696263995e-06, + "loss": 0.0121, + "num_input_tokens_seen": 253105600, + "step": 117300 + }, + { + "epoch": 19.136215334420882, + "grad_norm": 0.0016027381643652916, + "learning_rate": 5.673443297589287e-06, + "loss": 0.0007, + "num_input_tokens_seen": 253116352, + "step": 117305 + }, + { + "epoch": 19.137030995106034, + "grad_norm": 0.0003533402632456273, + "learning_rate": 5.662755917069384e-06, + "loss": 0.0006, + "num_input_tokens_seen": 253128448, + "step": 117310 + }, + { + "epoch": 19.13784665579119, + "grad_norm": 0.0010634259087964892, + "learning_rate": 5.652078554920836e-06, + "loss": 0.0024, + "num_input_tokens_seen": 253138208, + "step": 117315 + }, + { + "epoch": 19.138662316476346, + "grad_norm": 0.6747114062309265, + "learning_rate": 5.6414112113600254e-06, + "loss": 0.0212, + "num_input_tokens_seen": 253148928, + "step": 117320 + }, + { + "epoch": 19.1394779771615, + "grad_norm": 0.0003557452582754195, + "learning_rate": 5.630753886603168e-06, + "loss": 0.0001, + "num_input_tokens_seen": 253160480, + "step": 117325 + }, + { + "epoch": 19.140293637846657, + "grad_norm": 0.013489640317857265, + "learning_rate": 5.6201065808662025e-06, + "loss": 0.0003, + "num_input_tokens_seen": 253171456, + "step": 117330 + }, + { + "epoch": 19.14110929853181, + "grad_norm": 0.0015448734629899263, + "learning_rate": 5.609469294364955e-06, + "loss": 0.0003, + "num_input_tokens_seen": 253181824, + "step": 117335 + }, + { + "epoch": 19.141924959216965, + "grad_norm": 0.00013942382065579295, + "learning_rate": 5.598842027315032e-06, + "loss": 0.0012, + "num_input_tokens_seen": 253192512, + "step": 117340 + }, + { + "epoch": 19.14274061990212, + "grad_norm": 0.00011500628897920251, + "learning_rate": 5.588224779931761e-06, + "loss": 0.0002, + "num_input_tokens_seen": 253202816, + "step": 117345 + }, + { + "epoch": 19.143556280587276, + "grad_norm": 0.0006052239914424717, + "learning_rate": 5.577617552430303e-06, + "loss": 0.0006, + "num_input_tokens_seen": 253214560, + "step": 117350 + }, + { + "epoch": 19.144371941272432, + "grad_norm": 0.001092093763872981, + "learning_rate": 5.567020345025597e-06, + "loss": 0.0002, + "num_input_tokens_seen": 253224704, + "step": 117355 + }, + { + "epoch": 19.145187601957584, + "grad_norm": 0.0011473286431282759, + "learning_rate": 5.556433157932528e-06, + "loss": 0.0009, + "num_input_tokens_seen": 253234176, + "step": 117360 + }, + { + "epoch": 19.14600326264274, + "grad_norm": 0.0012462937738746405, + "learning_rate": 5.5458559913655335e-06, + "loss": 0.0003, + "num_input_tokens_seen": 253244544, + "step": 117365 + }, + { + "epoch": 19.146818923327896, + "grad_norm": 0.007192329503595829, + "learning_rate": 5.5352888455390546e-06, + "loss": 0.0003, + "num_input_tokens_seen": 253254944, + "step": 117370 + }, + { + "epoch": 19.14763458401305, + "grad_norm": 0.005542058497667313, + "learning_rate": 5.524731720667197e-06, + "loss": 0.0003, + "num_input_tokens_seen": 253266080, + "step": 117375 + }, + { + "epoch": 19.148450244698207, + "grad_norm": 0.003108179895207286, + "learning_rate": 5.514184616964013e-06, + "loss": 0.0012, + "num_input_tokens_seen": 253276736, + "step": 117380 + }, + { + "epoch": 19.14926590538336, + "grad_norm": 0.00014723450294695795, + "learning_rate": 5.503647534643108e-06, + "loss": 0.0002, + "num_input_tokens_seen": 253287072, + "step": 117385 + }, + { + "epoch": 19.150081566068515, + "grad_norm": 0.009642918594181538, + "learning_rate": 5.493120473918145e-06, + "loss": 0.0023, + "num_input_tokens_seen": 253297280, + "step": 117390 + }, + { + "epoch": 19.15089722675367, + "grad_norm": 0.003752480261027813, + "learning_rate": 5.4826034350023426e-06, + "loss": 0.0006, + "num_input_tokens_seen": 253308032, + "step": 117395 + }, + { + "epoch": 19.151712887438826, + "grad_norm": 0.00021067139459773898, + "learning_rate": 5.472096418108974e-06, + "loss": 0.0002, + "num_input_tokens_seen": 253318080, + "step": 117400 + }, + { + "epoch": 19.152528548123982, + "grad_norm": 9.630475688027218e-05, + "learning_rate": 5.461599423450924e-06, + "loss": 0.0004, + "num_input_tokens_seen": 253328032, + "step": 117405 + }, + { + "epoch": 19.153344208809134, + "grad_norm": 0.002185633173212409, + "learning_rate": 5.451112451240914e-06, + "loss": 0.0031, + "num_input_tokens_seen": 253339392, + "step": 117410 + }, + { + "epoch": 19.15415986949429, + "grad_norm": 0.00022383141913451254, + "learning_rate": 5.440635501691493e-06, + "loss": 0.0004, + "num_input_tokens_seen": 253349248, + "step": 117415 + }, + { + "epoch": 19.154975530179446, + "grad_norm": 0.00011819570499937981, + "learning_rate": 5.4301685750149935e-06, + "loss": 0.0001, + "num_input_tokens_seen": 253359872, + "step": 117420 + }, + { + "epoch": 19.1557911908646, + "grad_norm": 0.0002307681570528075, + "learning_rate": 5.419711671423577e-06, + "loss": 0.0004, + "num_input_tokens_seen": 253370336, + "step": 117425 + }, + { + "epoch": 19.156606851549757, + "grad_norm": 0.005955645814538002, + "learning_rate": 5.409264791129076e-06, + "loss": 0.0006, + "num_input_tokens_seen": 253381600, + "step": 117430 + }, + { + "epoch": 19.15742251223491, + "grad_norm": 0.0014145580353215337, + "learning_rate": 5.398827934343264e-06, + "loss": 0.0004, + "num_input_tokens_seen": 253393184, + "step": 117435 + }, + { + "epoch": 19.158238172920065, + "grad_norm": 0.0040762717835605145, + "learning_rate": 5.38840110127764e-06, + "loss": 0.0002, + "num_input_tokens_seen": 253404736, + "step": 117440 + }, + { + "epoch": 19.15905383360522, + "grad_norm": 0.0004782885662280023, + "learning_rate": 5.377984292143534e-06, + "loss": 0.0146, + "num_input_tokens_seen": 253415008, + "step": 117445 + }, + { + "epoch": 19.159869494290376, + "grad_norm": 0.0001632419298402965, + "learning_rate": 5.367577507152055e-06, + "loss": 0.0001, + "num_input_tokens_seen": 253426176, + "step": 117450 + }, + { + "epoch": 19.160685154975532, + "grad_norm": 0.005206952337175608, + "learning_rate": 5.35718074651409e-06, + "loss": 0.0005, + "num_input_tokens_seen": 253435680, + "step": 117455 + }, + { + "epoch": 19.161500815660684, + "grad_norm": 0.000222133137867786, + "learning_rate": 5.346794010440359e-06, + "loss": 0.0003, + "num_input_tokens_seen": 253447200, + "step": 117460 + }, + { + "epoch": 19.16231647634584, + "grad_norm": 0.00024197626044042408, + "learning_rate": 5.336417299141361e-06, + "loss": 0.0004, + "num_input_tokens_seen": 253458496, + "step": 117465 + }, + { + "epoch": 19.163132137030995, + "grad_norm": 0.0036572362296283245, + "learning_rate": 5.326050612827426e-06, + "loss": 0.0002, + "num_input_tokens_seen": 253467712, + "step": 117470 + }, + { + "epoch": 19.16394779771615, + "grad_norm": 0.00017844563990365714, + "learning_rate": 5.315693951708555e-06, + "loss": 0.0006, + "num_input_tokens_seen": 253478848, + "step": 117475 + }, + { + "epoch": 19.164763458401303, + "grad_norm": 0.00023036589846014977, + "learning_rate": 5.305347315994747e-06, + "loss": 0.0004, + "num_input_tokens_seen": 253487648, + "step": 117480 + }, + { + "epoch": 19.16557911908646, + "grad_norm": 0.0003476330603007227, + "learning_rate": 5.295010705895609e-06, + "loss": 0.0021, + "num_input_tokens_seen": 253498496, + "step": 117485 + }, + { + "epoch": 19.166394779771615, + "grad_norm": 0.009443684481084347, + "learning_rate": 5.284684121620697e-06, + "loss": 0.0003, + "num_input_tokens_seen": 253510112, + "step": 117490 + }, + { + "epoch": 19.16721044045677, + "grad_norm": 9.934185800375417e-05, + "learning_rate": 5.2743675633792345e-06, + "loss": 0.0003, + "num_input_tokens_seen": 253520448, + "step": 117495 + }, + { + "epoch": 19.168026101141926, + "grad_norm": 0.013910762034356594, + "learning_rate": 5.264061031380274e-06, + "loss": 0.0712, + "num_input_tokens_seen": 253529920, + "step": 117500 + }, + { + "epoch": 19.16884176182708, + "grad_norm": 0.0010928745614364743, + "learning_rate": 5.253764525832761e-06, + "loss": 0.0022, + "num_input_tokens_seen": 253541216, + "step": 117505 + }, + { + "epoch": 19.169657422512234, + "grad_norm": 0.00023541273549199104, + "learning_rate": 5.243478046945305e-06, + "loss": 0.0001, + "num_input_tokens_seen": 253552256, + "step": 117510 + }, + { + "epoch": 19.17047308319739, + "grad_norm": 0.0054008448496460915, + "learning_rate": 5.233201594926462e-06, + "loss": 0.0004, + "num_input_tokens_seen": 253562688, + "step": 117515 + }, + { + "epoch": 19.171288743882545, + "grad_norm": 0.00017960129480343312, + "learning_rate": 5.222935169984455e-06, + "loss": 0.0006, + "num_input_tokens_seen": 253573632, + "step": 117520 + }, + { + "epoch": 19.1721044045677, + "grad_norm": 0.00045109112397767603, + "learning_rate": 5.212678772327284e-06, + "loss": 0.0021, + "num_input_tokens_seen": 253585120, + "step": 117525 + }, + { + "epoch": 19.172920065252853, + "grad_norm": 0.0007623362471349537, + "learning_rate": 5.202432402162893e-06, + "loss": 0.0012, + "num_input_tokens_seen": 253596736, + "step": 117530 + }, + { + "epoch": 19.17373572593801, + "grad_norm": 0.04738568887114525, + "learning_rate": 5.192196059698895e-06, + "loss": 0.0012, + "num_input_tokens_seen": 253607616, + "step": 117535 + }, + { + "epoch": 19.174551386623165, + "grad_norm": 0.0009224917157553136, + "learning_rate": 5.18196974514279e-06, + "loss": 0.0007, + "num_input_tokens_seen": 253617280, + "step": 117540 + }, + { + "epoch": 19.17536704730832, + "grad_norm": 0.0001266537728952244, + "learning_rate": 5.1717534587017445e-06, + "loss": 0.0004, + "num_input_tokens_seen": 253628096, + "step": 117545 + }, + { + "epoch": 19.176182707993476, + "grad_norm": 0.0003706437419168651, + "learning_rate": 5.161547200582872e-06, + "loss": 0.0035, + "num_input_tokens_seen": 253639968, + "step": 117550 + }, + { + "epoch": 19.17699836867863, + "grad_norm": 0.009790387935936451, + "learning_rate": 5.151350970993007e-06, + "loss": 0.0004, + "num_input_tokens_seen": 253651584, + "step": 117555 + }, + { + "epoch": 19.177814029363784, + "grad_norm": 0.00018436310347169638, + "learning_rate": 5.141164770138707e-06, + "loss": 0.0003, + "num_input_tokens_seen": 253662240, + "step": 117560 + }, + { + "epoch": 19.17862969004894, + "grad_norm": 0.016101539134979248, + "learning_rate": 5.130988598226527e-06, + "loss": 0.0015, + "num_input_tokens_seen": 253672800, + "step": 117565 + }, + { + "epoch": 19.179445350734095, + "grad_norm": 0.5355294346809387, + "learning_rate": 5.120822455462637e-06, + "loss": 0.0172, + "num_input_tokens_seen": 253682400, + "step": 117570 + }, + { + "epoch": 19.18026101141925, + "grad_norm": 0.0006680356455035508, + "learning_rate": 5.110666342053094e-06, + "loss": 0.0001, + "num_input_tokens_seen": 253692992, + "step": 117575 + }, + { + "epoch": 19.181076672104403, + "grad_norm": 0.008890760131180286, + "learning_rate": 5.100520258203734e-06, + "loss": 0.0005, + "num_input_tokens_seen": 253703520, + "step": 117580 + }, + { + "epoch": 19.18189233278956, + "grad_norm": 0.00253989826887846, + "learning_rate": 5.090384204120113e-06, + "loss": 0.0002, + "num_input_tokens_seen": 253713376, + "step": 117585 + }, + { + "epoch": 19.182707993474715, + "grad_norm": 1.1718759536743164, + "learning_rate": 5.08025818000768e-06, + "loss": 0.0133, + "num_input_tokens_seen": 253723680, + "step": 117590 + }, + { + "epoch": 19.18352365415987, + "grad_norm": 0.0058137886226177216, + "learning_rate": 5.0701421860717135e-06, + "loss": 0.0065, + "num_input_tokens_seen": 253735264, + "step": 117595 + }, + { + "epoch": 19.184339314845026, + "grad_norm": 0.002442616270855069, + "learning_rate": 5.060036222517161e-06, + "loss": 0.0009, + "num_input_tokens_seen": 253746592, + "step": 117600 + }, + { + "epoch": 19.18515497553018, + "grad_norm": 0.0004011603305116296, + "learning_rate": 5.049940289548804e-06, + "loss": 0.0001, + "num_input_tokens_seen": 253757728, + "step": 117605 + }, + { + "epoch": 19.185970636215334, + "grad_norm": 0.0003494208212941885, + "learning_rate": 5.039854387371368e-06, + "loss": 0.0001, + "num_input_tokens_seen": 253769152, + "step": 117610 + }, + { + "epoch": 19.18678629690049, + "grad_norm": 0.00024524933542124927, + "learning_rate": 5.0297785161891315e-06, + "loss": 0.0008, + "num_input_tokens_seen": 253780832, + "step": 117615 + }, + { + "epoch": 19.187601957585645, + "grad_norm": 0.049177031964063644, + "learning_rate": 5.019712676206323e-06, + "loss": 0.0013, + "num_input_tokens_seen": 253790400, + "step": 117620 + }, + { + "epoch": 19.1884176182708, + "grad_norm": 0.0012544021010398865, + "learning_rate": 5.009656867627055e-06, + "loss": 0.0002, + "num_input_tokens_seen": 253802272, + "step": 117625 + }, + { + "epoch": 19.189233278955953, + "grad_norm": 0.00016192629118449986, + "learning_rate": 4.999611090654943e-06, + "loss": 0.0006, + "num_input_tokens_seen": 253812960, + "step": 117630 + }, + { + "epoch": 19.19004893964111, + "grad_norm": 0.031114500015974045, + "learning_rate": 4.989575345493713e-06, + "loss": 0.0005, + "num_input_tokens_seen": 253824320, + "step": 117635 + }, + { + "epoch": 19.190864600326265, + "grad_norm": 0.00046009646030142903, + "learning_rate": 4.979549632346702e-06, + "loss": 0.0001, + "num_input_tokens_seen": 253833472, + "step": 117640 + }, + { + "epoch": 19.19168026101142, + "grad_norm": 0.00235239346511662, + "learning_rate": 4.969533951417082e-06, + "loss": 0.0004, + "num_input_tokens_seen": 253843648, + "step": 117645 + }, + { + "epoch": 19.192495921696572, + "grad_norm": 0.00022276783420238644, + "learning_rate": 4.959528302907857e-06, + "loss": 0.0016, + "num_input_tokens_seen": 253853568, + "step": 117650 + }, + { + "epoch": 19.193311582381728, + "grad_norm": 0.0026003194507211447, + "learning_rate": 4.949532687021751e-06, + "loss": 0.0002, + "num_input_tokens_seen": 253864416, + "step": 117655 + }, + { + "epoch": 19.194127243066884, + "grad_norm": 0.08289016783237457, + "learning_rate": 4.939547103961439e-06, + "loss": 0.0027, + "num_input_tokens_seen": 253875200, + "step": 117660 + }, + { + "epoch": 19.19494290375204, + "grad_norm": 0.0014308587415143847, + "learning_rate": 4.929571553929202e-06, + "loss": 0.0001, + "num_input_tokens_seen": 253886560, + "step": 117665 + }, + { + "epoch": 19.195758564437195, + "grad_norm": 0.0005965350428596139, + "learning_rate": 4.919606037127267e-06, + "loss": 0.0002, + "num_input_tokens_seen": 253898752, + "step": 117670 + }, + { + "epoch": 19.196574225122347, + "grad_norm": 0.04841861501336098, + "learning_rate": 4.909650553757583e-06, + "loss": 0.0007, + "num_input_tokens_seen": 253910496, + "step": 117675 + }, + { + "epoch": 19.197389885807503, + "grad_norm": 0.000141278505907394, + "learning_rate": 4.8997051040218235e-06, + "loss": 0.0026, + "num_input_tokens_seen": 253921056, + "step": 117680 + }, + { + "epoch": 19.19820554649266, + "grad_norm": 0.01310582272708416, + "learning_rate": 4.889769688121715e-06, + "loss": 0.0027, + "num_input_tokens_seen": 253932928, + "step": 117685 + }, + { + "epoch": 19.199021207177815, + "grad_norm": 0.32635921239852905, + "learning_rate": 4.87984430625843e-06, + "loss": 0.0062, + "num_input_tokens_seen": 253943776, + "step": 117690 + }, + { + "epoch": 19.19983686786297, + "grad_norm": 0.0029534141067415476, + "learning_rate": 4.869928958633252e-06, + "loss": 0.0013, + "num_input_tokens_seen": 253955936, + "step": 117695 + }, + { + "epoch": 19.200652528548122, + "grad_norm": 0.006285388488322496, + "learning_rate": 4.860023645447076e-06, + "loss": 0.0016, + "num_input_tokens_seen": 253967520, + "step": 117700 + }, + { + "epoch": 19.201468189233278, + "grad_norm": 0.0002549230703152716, + "learning_rate": 4.85012836690063e-06, + "loss": 0.0004, + "num_input_tokens_seen": 253977824, + "step": 117705 + }, + { + "epoch": 19.202283849918434, + "grad_norm": 0.0007658255635760725, + "learning_rate": 4.840243123194477e-06, + "loss": 0.0006, + "num_input_tokens_seen": 253988960, + "step": 117710 + }, + { + "epoch": 19.20309951060359, + "grad_norm": 0.002077503129839897, + "learning_rate": 4.83036791452901e-06, + "loss": 0.0004, + "num_input_tokens_seen": 253998784, + "step": 117715 + }, + { + "epoch": 19.203915171288745, + "grad_norm": 0.006746372673660517, + "learning_rate": 4.820502741104238e-06, + "loss": 0.0006, + "num_input_tokens_seen": 254010304, + "step": 117720 + }, + { + "epoch": 19.204730831973897, + "grad_norm": 0.00014421303058043122, + "learning_rate": 4.810647603120166e-06, + "loss": 0.0004, + "num_input_tokens_seen": 254019616, + "step": 117725 + }, + { + "epoch": 19.205546492659053, + "grad_norm": 0.0005585712497122586, + "learning_rate": 4.800802500776524e-06, + "loss": 0.0001, + "num_input_tokens_seen": 254030112, + "step": 117730 + }, + { + "epoch": 19.20636215334421, + "grad_norm": 0.0006013751844875515, + "learning_rate": 4.790967434272819e-06, + "loss": 0.0026, + "num_input_tokens_seen": 254039968, + "step": 117735 + }, + { + "epoch": 19.207177814029365, + "grad_norm": 0.0009617886389605701, + "learning_rate": 4.781142403808392e-06, + "loss": 0.0011, + "num_input_tokens_seen": 254050240, + "step": 117740 + }, + { + "epoch": 19.20799347471452, + "grad_norm": 0.06798414885997772, + "learning_rate": 4.771327409582305e-06, + "loss": 0.0017, + "num_input_tokens_seen": 254060128, + "step": 117745 + }, + { + "epoch": 19.208809135399672, + "grad_norm": 0.01036740094423294, + "learning_rate": 4.761522451793565e-06, + "loss": 0.0014, + "num_input_tokens_seen": 254070976, + "step": 117750 + }, + { + "epoch": 19.209624796084828, + "grad_norm": 0.00017157958063762635, + "learning_rate": 4.751727530640793e-06, + "loss": 0.0002, + "num_input_tokens_seen": 254081312, + "step": 117755 + }, + { + "epoch": 19.210440456769984, + "grad_norm": 0.008614970371127129, + "learning_rate": 4.74194264632255e-06, + "loss": 0.0004, + "num_input_tokens_seen": 254092704, + "step": 117760 + }, + { + "epoch": 19.21125611745514, + "grad_norm": 0.003977464511990547, + "learning_rate": 4.732167799037068e-06, + "loss": 0.0002, + "num_input_tokens_seen": 254103264, + "step": 117765 + }, + { + "epoch": 19.212071778140295, + "grad_norm": 0.00023598514962941408, + "learning_rate": 4.722402988982577e-06, + "loss": 0.0034, + "num_input_tokens_seen": 254113600, + "step": 117770 + }, + { + "epoch": 19.212887438825447, + "grad_norm": 0.00014818696945440024, + "learning_rate": 4.7126482163568075e-06, + "loss": 0.0003, + "num_input_tokens_seen": 254124704, + "step": 117775 + }, + { + "epoch": 19.213703099510603, + "grad_norm": 0.00021955471311230212, + "learning_rate": 4.702903481357601e-06, + "loss": 0.0002, + "num_input_tokens_seen": 254134784, + "step": 117780 + }, + { + "epoch": 19.21451876019576, + "grad_norm": 0.0006277711945585907, + "learning_rate": 4.693168784182356e-06, + "loss": 0.0002, + "num_input_tokens_seen": 254146048, + "step": 117785 + }, + { + "epoch": 19.215334420880914, + "grad_norm": 0.00029214436654001474, + "learning_rate": 4.6834441250284135e-06, + "loss": 0.0014, + "num_input_tokens_seen": 254157600, + "step": 117790 + }, + { + "epoch": 19.21615008156607, + "grad_norm": 0.0009113376145251095, + "learning_rate": 4.673729504092783e-06, + "loss": 0.0002, + "num_input_tokens_seen": 254168128, + "step": 117795 + }, + { + "epoch": 19.216965742251222, + "grad_norm": 0.0001504243555245921, + "learning_rate": 4.664024921572419e-06, + "loss": 0.0001, + "num_input_tokens_seen": 254179008, + "step": 117800 + }, + { + "epoch": 19.217781402936378, + "grad_norm": 0.0001879182964330539, + "learning_rate": 4.654330377663996e-06, + "loss": 0.0011, + "num_input_tokens_seen": 254190784, + "step": 117805 + }, + { + "epoch": 19.218597063621534, + "grad_norm": 0.0005090906633995473, + "learning_rate": 4.644645872563913e-06, + "loss": 0.0003, + "num_input_tokens_seen": 254201024, + "step": 117810 + }, + { + "epoch": 19.21941272430669, + "grad_norm": 0.004909905139356852, + "learning_rate": 4.634971406468514e-06, + "loss": 0.0004, + "num_input_tokens_seen": 254212000, + "step": 117815 + }, + { + "epoch": 19.22022838499184, + "grad_norm": 0.0019054664298892021, + "learning_rate": 4.625306979573807e-06, + "loss": 0.0013, + "num_input_tokens_seen": 254223296, + "step": 117820 + }, + { + "epoch": 19.221044045676997, + "grad_norm": 0.00022406052448786795, + "learning_rate": 4.615652592075747e-06, + "loss": 0.0001, + "num_input_tokens_seen": 254234912, + "step": 117825 + }, + { + "epoch": 19.221859706362153, + "grad_norm": 0.0003814722876995802, + "learning_rate": 4.606008244169846e-06, + "loss": 0.0009, + "num_input_tokens_seen": 254246080, + "step": 117830 + }, + { + "epoch": 19.22267536704731, + "grad_norm": 0.0009109728853218257, + "learning_rate": 4.596373936051667e-06, + "loss": 0.0017, + "num_input_tokens_seen": 254257056, + "step": 117835 + }, + { + "epoch": 19.223491027732464, + "grad_norm": 0.00017613654199521989, + "learning_rate": 4.586749667916446e-06, + "loss": 0.0004, + "num_input_tokens_seen": 254267968, + "step": 117840 + }, + { + "epoch": 19.224306688417617, + "grad_norm": 8.620345033705235e-05, + "learning_rate": 4.57713543995919e-06, + "loss": 0.0002, + "num_input_tokens_seen": 254278144, + "step": 117845 + }, + { + "epoch": 19.225122349102772, + "grad_norm": 0.015763506293296814, + "learning_rate": 4.567531252374801e-06, + "loss": 0.0013, + "num_input_tokens_seen": 254288992, + "step": 117850 + }, + { + "epoch": 19.225938009787928, + "grad_norm": 0.00017938186647370458, + "learning_rate": 4.557937105357901e-06, + "loss": 0.0006, + "num_input_tokens_seen": 254299872, + "step": 117855 + }, + { + "epoch": 19.226753670473084, + "grad_norm": 0.00010060738713946193, + "learning_rate": 4.54835299910289e-06, + "loss": 0.0378, + "num_input_tokens_seen": 254311328, + "step": 117860 + }, + { + "epoch": 19.22756933115824, + "grad_norm": 0.00010393088450655341, + "learning_rate": 4.5387789338040555e-06, + "loss": 0.0004, + "num_input_tokens_seen": 254322624, + "step": 117865 + }, + { + "epoch": 19.22838499184339, + "grad_norm": 0.010525264777243137, + "learning_rate": 4.529214909655355e-06, + "loss": 0.0024, + "num_input_tokens_seen": 254333472, + "step": 117870 + }, + { + "epoch": 19.229200652528547, + "grad_norm": 0.00014067695883568376, + "learning_rate": 4.519660926850744e-06, + "loss": 0.0001, + "num_input_tokens_seen": 254344000, + "step": 117875 + }, + { + "epoch": 19.230016313213703, + "grad_norm": 0.0005757188773714006, + "learning_rate": 4.510116985583679e-06, + "loss": 0.0005, + "num_input_tokens_seen": 254355296, + "step": 117880 + }, + { + "epoch": 19.23083197389886, + "grad_norm": 0.0013182057300582528, + "learning_rate": 4.500583086047782e-06, + "loss": 0.0007, + "num_input_tokens_seen": 254365952, + "step": 117885 + }, + { + "epoch": 19.231647634584014, + "grad_norm": 0.06332371383905411, + "learning_rate": 4.491059228436012e-06, + "loss": 0.0021, + "num_input_tokens_seen": 254376704, + "step": 117890 + }, + { + "epoch": 19.232463295269167, + "grad_norm": 0.0005930504994466901, + "learning_rate": 4.481545412941657e-06, + "loss": 0.0005, + "num_input_tokens_seen": 254386528, + "step": 117895 + }, + { + "epoch": 19.233278955954322, + "grad_norm": 0.08251247555017471, + "learning_rate": 4.472041639757285e-06, + "loss": 0.0032, + "num_input_tokens_seen": 254397280, + "step": 117900 + }, + { + "epoch": 19.234094616639478, + "grad_norm": 0.0002140836586477235, + "learning_rate": 4.462547909075687e-06, + "loss": 0.0009, + "num_input_tokens_seen": 254408544, + "step": 117905 + }, + { + "epoch": 19.234910277324634, + "grad_norm": 0.006163290236145258, + "learning_rate": 4.453064221089154e-06, + "loss": 0.0018, + "num_input_tokens_seen": 254420288, + "step": 117910 + }, + { + "epoch": 19.23572593800979, + "grad_norm": 0.0006913598044775426, + "learning_rate": 4.44359057598992e-06, + "loss": 0.0008, + "num_input_tokens_seen": 254429984, + "step": 117915 + }, + { + "epoch": 19.23654159869494, + "grad_norm": 0.00019974197493866086, + "learning_rate": 4.434126973969998e-06, + "loss": 0.0001, + "num_input_tokens_seen": 254440128, + "step": 117920 + }, + { + "epoch": 19.237357259380097, + "grad_norm": 8.292843995150179e-05, + "learning_rate": 4.424673415221181e-06, + "loss": 0.0011, + "num_input_tokens_seen": 254450976, + "step": 117925 + }, + { + "epoch": 19.238172920065253, + "grad_norm": 0.00022149246069602668, + "learning_rate": 4.41522989993498e-06, + "loss": 0.0001, + "num_input_tokens_seen": 254461952, + "step": 117930 + }, + { + "epoch": 19.23898858075041, + "grad_norm": 0.0067072375677526, + "learning_rate": 4.405796428302855e-06, + "loss": 0.0102, + "num_input_tokens_seen": 254471808, + "step": 117935 + }, + { + "epoch": 19.239804241435564, + "grad_norm": 0.0003836864489130676, + "learning_rate": 4.396373000515986e-06, + "loss": 0.0004, + "num_input_tokens_seen": 254482624, + "step": 117940 + }, + { + "epoch": 19.240619902120716, + "grad_norm": 0.0007252515060827136, + "learning_rate": 4.3869596167653296e-06, + "loss": 0.0003, + "num_input_tokens_seen": 254492768, + "step": 117945 + }, + { + "epoch": 19.241435562805872, + "grad_norm": 0.0020254042465239763, + "learning_rate": 4.377556277241679e-06, + "loss": 0.0002, + "num_input_tokens_seen": 254504000, + "step": 117950 + }, + { + "epoch": 19.242251223491028, + "grad_norm": 0.0010949743445962667, + "learning_rate": 4.368162982135604e-06, + "loss": 0.0009, + "num_input_tokens_seen": 254515840, + "step": 117955 + }, + { + "epoch": 19.243066884176184, + "grad_norm": 0.0006008145282976329, + "learning_rate": 4.3587797316373965e-06, + "loss": 0.0009, + "num_input_tokens_seen": 254526432, + "step": 117960 + }, + { + "epoch": 19.24388254486134, + "grad_norm": 0.0011378350900486112, + "learning_rate": 4.34940652593735e-06, + "loss": 0.0004, + "num_input_tokens_seen": 254537952, + "step": 117965 + }, + { + "epoch": 19.24469820554649, + "grad_norm": 0.0006761500262655318, + "learning_rate": 4.34004336522531e-06, + "loss": 0.0021, + "num_input_tokens_seen": 254549024, + "step": 117970 + }, + { + "epoch": 19.245513866231647, + "grad_norm": 0.000511779508087784, + "learning_rate": 4.330690249691127e-06, + "loss": 0.0038, + "num_input_tokens_seen": 254559712, + "step": 117975 + }, + { + "epoch": 19.246329526916803, + "grad_norm": 0.00151029322296381, + "learning_rate": 4.321347179524316e-06, + "loss": 0.0003, + "num_input_tokens_seen": 254571840, + "step": 117980 + }, + { + "epoch": 19.24714518760196, + "grad_norm": 0.004007595591247082, + "learning_rate": 4.312014154914113e-06, + "loss": 0.0005, + "num_input_tokens_seen": 254581536, + "step": 117985 + }, + { + "epoch": 19.247960848287114, + "grad_norm": 0.0010497610783204436, + "learning_rate": 4.302691176049922e-06, + "loss": 0.0003, + "num_input_tokens_seen": 254592416, + "step": 117990 + }, + { + "epoch": 19.248776508972266, + "grad_norm": 0.005528536159545183, + "learning_rate": 4.293378243120371e-06, + "loss": 0.0007, + "num_input_tokens_seen": 254603392, + "step": 117995 + }, + { + "epoch": 19.249592169657422, + "grad_norm": 0.00016743221203796566, + "learning_rate": 4.284075356314476e-06, + "loss": 0.0001, + "num_input_tokens_seen": 254613536, + "step": 118000 + }, + { + "epoch": 19.250407830342578, + "grad_norm": 0.0008669524686411023, + "learning_rate": 4.2747825158205855e-06, + "loss": 0.0003, + "num_input_tokens_seen": 254624096, + "step": 118005 + }, + { + "epoch": 19.251223491027734, + "grad_norm": 0.0001844579674070701, + "learning_rate": 4.265499721827159e-06, + "loss": 0.0001, + "num_input_tokens_seen": 254634240, + "step": 118010 + }, + { + "epoch": 19.252039151712886, + "grad_norm": 0.16749875247478485, + "learning_rate": 4.256226974522215e-06, + "loss": 0.0021, + "num_input_tokens_seen": 254644576, + "step": 118015 + }, + { + "epoch": 19.25285481239804, + "grad_norm": 0.00045024033170193434, + "learning_rate": 4.246964274093767e-06, + "loss": 0.0003, + "num_input_tokens_seen": 254656000, + "step": 118020 + }, + { + "epoch": 19.253670473083197, + "grad_norm": 0.003741712775081396, + "learning_rate": 4.237711620729501e-06, + "loss": 0.0009, + "num_input_tokens_seen": 254666976, + "step": 118025 + }, + { + "epoch": 19.254486133768353, + "grad_norm": 0.005742361769080162, + "learning_rate": 4.228469014616931e-06, + "loss": 0.0012, + "num_input_tokens_seen": 254676608, + "step": 118030 + }, + { + "epoch": 19.25530179445351, + "grad_norm": 0.0006892826058901846, + "learning_rate": 4.219236455943298e-06, + "loss": 0.0016, + "num_input_tokens_seen": 254688000, + "step": 118035 + }, + { + "epoch": 19.25611745513866, + "grad_norm": 0.00036326583358459175, + "learning_rate": 4.210013944895841e-06, + "loss": 0.0003, + "num_input_tokens_seen": 254698944, + "step": 118040 + }, + { + "epoch": 19.256933115823816, + "grad_norm": 0.01618233695626259, + "learning_rate": 4.2008014816613534e-06, + "loss": 0.0012, + "num_input_tokens_seen": 254709216, + "step": 118045 + }, + { + "epoch": 19.257748776508972, + "grad_norm": 0.019004283472895622, + "learning_rate": 4.191599066426632e-06, + "loss": 0.0011, + "num_input_tokens_seen": 254720832, + "step": 118050 + }, + { + "epoch": 19.258564437194128, + "grad_norm": 0.00018877835827879608, + "learning_rate": 4.182406699378138e-06, + "loss": 0.0002, + "num_input_tokens_seen": 254732224, + "step": 118055 + }, + { + "epoch": 19.259380097879284, + "grad_norm": 9.273934119846672e-05, + "learning_rate": 4.173224380702112e-06, + "loss": 0.0003, + "num_input_tokens_seen": 254742144, + "step": 118060 + }, + { + "epoch": 19.260195758564436, + "grad_norm": 0.00028407564968802035, + "learning_rate": 4.164052110584737e-06, + "loss": 0.0005, + "num_input_tokens_seen": 254753120, + "step": 118065 + }, + { + "epoch": 19.26101141924959, + "grad_norm": 0.015913421288132668, + "learning_rate": 4.154889889211866e-06, + "loss": 0.0004, + "num_input_tokens_seen": 254763136, + "step": 118070 + }, + { + "epoch": 19.261827079934747, + "grad_norm": 0.0001920036447700113, + "learning_rate": 4.145737716769182e-06, + "loss": 0.0013, + "num_input_tokens_seen": 254774016, + "step": 118075 + }, + { + "epoch": 19.262642740619903, + "grad_norm": 0.03168654441833496, + "learning_rate": 4.136595593442149e-06, + "loss": 0.0015, + "num_input_tokens_seen": 254785312, + "step": 118080 + }, + { + "epoch": 19.26345840130506, + "grad_norm": 0.0004416762385517359, + "learning_rate": 4.1274635194160086e-06, + "loss": 0.0003, + "num_input_tokens_seen": 254796608, + "step": 118085 + }, + { + "epoch": 19.26427406199021, + "grad_norm": 0.007172382436692715, + "learning_rate": 4.118341494875944e-06, + "loss": 0.0003, + "num_input_tokens_seen": 254806240, + "step": 118090 + }, + { + "epoch": 19.265089722675366, + "grad_norm": 0.00012511806562542915, + "learning_rate": 4.1092295200066966e-06, + "loss": 0.0002, + "num_input_tokens_seen": 254817440, + "step": 118095 + }, + { + "epoch": 19.265905383360522, + "grad_norm": 0.003151315962895751, + "learning_rate": 4.100127594993064e-06, + "loss": 0.0003, + "num_input_tokens_seen": 254828672, + "step": 118100 + }, + { + "epoch": 19.266721044045678, + "grad_norm": 0.0013245183508843184, + "learning_rate": 4.091035720019398e-06, + "loss": 0.001, + "num_input_tokens_seen": 254838880, + "step": 118105 + }, + { + "epoch": 19.267536704730833, + "grad_norm": 0.00023752260312903672, + "learning_rate": 4.081953895269996e-06, + "loss": 0.0052, + "num_input_tokens_seen": 254849856, + "step": 118110 + }, + { + "epoch": 19.268352365415986, + "grad_norm": 0.002729987958446145, + "learning_rate": 4.072882120928933e-06, + "loss": 0.1375, + "num_input_tokens_seen": 254861472, + "step": 118115 + }, + { + "epoch": 19.26916802610114, + "grad_norm": 0.011673888191580772, + "learning_rate": 4.063820397180007e-06, + "loss": 0.0019, + "num_input_tokens_seen": 254872128, + "step": 118120 + }, + { + "epoch": 19.269983686786297, + "grad_norm": 0.0004465667007025331, + "learning_rate": 4.054768724206958e-06, + "loss": 0.0124, + "num_input_tokens_seen": 254883776, + "step": 118125 + }, + { + "epoch": 19.270799347471453, + "grad_norm": 0.0002701320918276906, + "learning_rate": 4.045727102193087e-06, + "loss": 0.0006, + "num_input_tokens_seen": 254895424, + "step": 118130 + }, + { + "epoch": 19.27161500815661, + "grad_norm": 0.013942504301667213, + "learning_rate": 4.036695531321799e-06, + "loss": 0.0003, + "num_input_tokens_seen": 254906080, + "step": 118135 + }, + { + "epoch": 19.27243066884176, + "grad_norm": 0.0009549465612508357, + "learning_rate": 4.027674011776006e-06, + "loss": 0.0009, + "num_input_tokens_seen": 254917312, + "step": 118140 + }, + { + "epoch": 19.273246329526916, + "grad_norm": 0.0019771873485296965, + "learning_rate": 4.018662543738616e-06, + "loss": 0.0004, + "num_input_tokens_seen": 254927392, + "step": 118145 + }, + { + "epoch": 19.274061990212072, + "grad_norm": 0.023635823279619217, + "learning_rate": 4.009661127392206e-06, + "loss": 0.0011, + "num_input_tokens_seen": 254937664, + "step": 118150 + }, + { + "epoch": 19.274877650897228, + "grad_norm": 0.00023211816733237356, + "learning_rate": 4.00066976291924e-06, + "loss": 0.0006, + "num_input_tokens_seen": 254948032, + "step": 118155 + }, + { + "epoch": 19.275693311582383, + "grad_norm": 0.3432193398475647, + "learning_rate": 3.9916884505019065e-06, + "loss": 0.0089, + "num_input_tokens_seen": 254958208, + "step": 118160 + }, + { + "epoch": 19.276508972267536, + "grad_norm": 0.046641167253255844, + "learning_rate": 3.982717190322227e-06, + "loss": 0.0007, + "num_input_tokens_seen": 254968544, + "step": 118165 + }, + { + "epoch": 19.27732463295269, + "grad_norm": 0.0848323181271553, + "learning_rate": 3.973755982562055e-06, + "loss": 0.002, + "num_input_tokens_seen": 254979488, + "step": 118170 + }, + { + "epoch": 19.278140293637847, + "grad_norm": 0.0001621060073375702, + "learning_rate": 3.964804827402913e-06, + "loss": 0.0004, + "num_input_tokens_seen": 254989440, + "step": 118175 + }, + { + "epoch": 19.278955954323003, + "grad_norm": 0.00018049211939796805, + "learning_rate": 3.955863725026321e-06, + "loss": 0.0191, + "num_input_tokens_seen": 255002144, + "step": 118180 + }, + { + "epoch": 19.27977161500816, + "grad_norm": 0.0001427222741767764, + "learning_rate": 3.946932675613413e-06, + "loss": 0.0005, + "num_input_tokens_seen": 255014048, + "step": 118185 + }, + { + "epoch": 19.28058727569331, + "grad_norm": 0.0013997963396832347, + "learning_rate": 3.93801167934521e-06, + "loss": 0.0002, + "num_input_tokens_seen": 255025376, + "step": 118190 + }, + { + "epoch": 19.281402936378466, + "grad_norm": 0.0013937221374362707, + "learning_rate": 3.929100736402513e-06, + "loss": 0.0001, + "num_input_tokens_seen": 255034912, + "step": 118195 + }, + { + "epoch": 19.282218597063622, + "grad_norm": 0.04219157248735428, + "learning_rate": 3.920199846965844e-06, + "loss": 0.0012, + "num_input_tokens_seen": 255046848, + "step": 118200 + }, + { + "epoch": 19.283034257748778, + "grad_norm": 0.0005554206436499953, + "learning_rate": 3.911309011215725e-06, + "loss": 0.0007, + "num_input_tokens_seen": 255057312, + "step": 118205 + }, + { + "epoch": 19.28384991843393, + "grad_norm": 0.00041106014396063983, + "learning_rate": 3.902428229332233e-06, + "loss": 0.0006, + "num_input_tokens_seen": 255068064, + "step": 118210 + }, + { + "epoch": 19.284665579119086, + "grad_norm": 0.0003953674458898604, + "learning_rate": 3.8935575014953374e-06, + "loss": 0.0001, + "num_input_tokens_seen": 255079168, + "step": 118215 + }, + { + "epoch": 19.28548123980424, + "grad_norm": 0.0021449574269354343, + "learning_rate": 3.884696827884893e-06, + "loss": 0.0017, + "num_input_tokens_seen": 255089440, + "step": 118220 + }, + { + "epoch": 19.286296900489397, + "grad_norm": 0.0003595837624743581, + "learning_rate": 3.8758462086804225e-06, + "loss": 0.0007, + "num_input_tokens_seen": 255101184, + "step": 118225 + }, + { + "epoch": 19.287112561174553, + "grad_norm": 0.00026148033794015646, + "learning_rate": 3.867005644061283e-06, + "loss": 0.0002, + "num_input_tokens_seen": 255113280, + "step": 118230 + }, + { + "epoch": 19.287928221859705, + "grad_norm": 0.0003752747143153101, + "learning_rate": 3.8581751342067205e-06, + "loss": 0.0008, + "num_input_tokens_seen": 255124992, + "step": 118235 + }, + { + "epoch": 19.28874388254486, + "grad_norm": 0.0005535103264264762, + "learning_rate": 3.849354679295591e-06, + "loss": 0.0001, + "num_input_tokens_seen": 255136032, + "step": 118240 + }, + { + "epoch": 19.289559543230016, + "grad_norm": 8.100105333141983e-05, + "learning_rate": 3.840544279506753e-06, + "loss": 0.0123, + "num_input_tokens_seen": 255146848, + "step": 118245 + }, + { + "epoch": 19.290375203915172, + "grad_norm": 0.02684246562421322, + "learning_rate": 3.831743935018672e-06, + "loss": 0.0005, + "num_input_tokens_seen": 255156992, + "step": 118250 + }, + { + "epoch": 19.291190864600328, + "grad_norm": 0.0036999117583036423, + "learning_rate": 3.822953646009708e-06, + "loss": 0.0006, + "num_input_tokens_seen": 255168608, + "step": 118255 + }, + { + "epoch": 19.29200652528548, + "grad_norm": 0.0003492805699352175, + "learning_rate": 3.8141734126580505e-06, + "loss": 0.0008, + "num_input_tokens_seen": 255179712, + "step": 118260 + }, + { + "epoch": 19.292822185970635, + "grad_norm": 0.0054467059671878815, + "learning_rate": 3.805403235141669e-06, + "loss": 0.0005, + "num_input_tokens_seen": 255190560, + "step": 118265 + }, + { + "epoch": 19.29363784665579, + "grad_norm": 0.050895992666482925, + "learning_rate": 3.7966431136381985e-06, + "loss": 0.0029, + "num_input_tokens_seen": 255201504, + "step": 118270 + }, + { + "epoch": 19.294453507340947, + "grad_norm": 0.15420714020729065, + "learning_rate": 3.7878930483252195e-06, + "loss": 0.0048, + "num_input_tokens_seen": 255212672, + "step": 118275 + }, + { + "epoch": 19.295269168026103, + "grad_norm": 0.06786397844552994, + "learning_rate": 3.7791530393801456e-06, + "loss": 0.0015, + "num_input_tokens_seen": 255224256, + "step": 118280 + }, + { + "epoch": 19.296084828711255, + "grad_norm": 0.03941697999835014, + "learning_rate": 3.7704230869800015e-06, + "loss": 0.001, + "num_input_tokens_seen": 255235680, + "step": 118285 + }, + { + "epoch": 19.29690048939641, + "grad_norm": 0.0003048558428417891, + "learning_rate": 3.7617031913017573e-06, + "loss": 0.0026, + "num_input_tokens_seen": 255246720, + "step": 118290 + }, + { + "epoch": 19.297716150081566, + "grad_norm": 0.0014914603671059012, + "learning_rate": 3.752993352522105e-06, + "loss": 0.0092, + "num_input_tokens_seen": 255257792, + "step": 118295 + }, + { + "epoch": 19.298531810766722, + "grad_norm": 0.06184706091880798, + "learning_rate": 3.7442935708176253e-06, + "loss": 0.0013, + "num_input_tokens_seen": 255268768, + "step": 118300 + }, + { + "epoch": 19.299347471451878, + "grad_norm": 0.0034872207324951887, + "learning_rate": 3.7356038463645105e-06, + "loss": 0.0009, + "num_input_tokens_seen": 255280224, + "step": 118305 + }, + { + "epoch": 19.30016313213703, + "grad_norm": 0.0006304801208898425, + "learning_rate": 3.7269241793390084e-06, + "loss": 0.003, + "num_input_tokens_seen": 255291008, + "step": 118310 + }, + { + "epoch": 19.300978792822185, + "grad_norm": 0.0010227963794022799, + "learning_rate": 3.7182545699169236e-06, + "loss": 0.0021, + "num_input_tokens_seen": 255301952, + "step": 118315 + }, + { + "epoch": 19.30179445350734, + "grad_norm": 0.021946966648101807, + "learning_rate": 3.7095950182739478e-06, + "loss": 0.0005, + "num_input_tokens_seen": 255312096, + "step": 118320 + }, + { + "epoch": 19.302610114192497, + "grad_norm": 0.00018372037447988987, + "learning_rate": 3.700945524585664e-06, + "loss": 0.0002, + "num_input_tokens_seen": 255323424, + "step": 118325 + }, + { + "epoch": 19.303425774877653, + "grad_norm": 0.0055175586603581905, + "learning_rate": 3.6923060890273195e-06, + "loss": 0.0022, + "num_input_tokens_seen": 255333280, + "step": 118330 + }, + { + "epoch": 19.304241435562805, + "grad_norm": 0.0008713052957318723, + "learning_rate": 3.683676711773998e-06, + "loss": 0.0004, + "num_input_tokens_seen": 255344544, + "step": 118335 + }, + { + "epoch": 19.30505709624796, + "grad_norm": 0.45501455664634705, + "learning_rate": 3.6750573930005583e-06, + "loss": 0.0215, + "num_input_tokens_seen": 255355424, + "step": 118340 + }, + { + "epoch": 19.305872756933116, + "grad_norm": 0.002180634066462517, + "learning_rate": 3.66644813288175e-06, + "loss": 0.0011, + "num_input_tokens_seen": 255365632, + "step": 118345 + }, + { + "epoch": 19.306688417618272, + "grad_norm": 0.00021976424613967538, + "learning_rate": 3.6578489315919893e-06, + "loss": 0.0008, + "num_input_tokens_seen": 255376672, + "step": 118350 + }, + { + "epoch": 19.307504078303428, + "grad_norm": 0.00014493390335701406, + "learning_rate": 3.6492597893056367e-06, + "loss": 0.0002, + "num_input_tokens_seen": 255387584, + "step": 118355 + }, + { + "epoch": 19.30831973898858, + "grad_norm": 0.00016816183051560074, + "learning_rate": 3.6406807061966085e-06, + "loss": 0.0003, + "num_input_tokens_seen": 255397600, + "step": 118360 + }, + { + "epoch": 19.309135399673735, + "grad_norm": 0.0027694024611264467, + "learning_rate": 3.6321116824388767e-06, + "loss": 0.0004, + "num_input_tokens_seen": 255407680, + "step": 118365 + }, + { + "epoch": 19.30995106035889, + "grad_norm": 0.0007239347905851901, + "learning_rate": 3.6235527182061912e-06, + "loss": 0.0008, + "num_input_tokens_seen": 255418944, + "step": 118370 + }, + { + "epoch": 19.310766721044047, + "grad_norm": 0.0014389572897925973, + "learning_rate": 3.615003813671802e-06, + "loss": 0.0003, + "num_input_tokens_seen": 255429824, + "step": 118375 + }, + { + "epoch": 19.3115823817292, + "grad_norm": 0.0009211852448061109, + "learning_rate": 3.6064649690091268e-06, + "loss": 0.0737, + "num_input_tokens_seen": 255441632, + "step": 118380 + }, + { + "epoch": 19.312398042414355, + "grad_norm": 0.0001435963058611378, + "learning_rate": 3.5979361843910817e-06, + "loss": 0.0005, + "num_input_tokens_seen": 255451520, + "step": 118385 + }, + { + "epoch": 19.31321370309951, + "grad_norm": 0.03387012705206871, + "learning_rate": 3.589417459990696e-06, + "loss": 0.001, + "num_input_tokens_seen": 255461472, + "step": 118390 + }, + { + "epoch": 19.314029363784666, + "grad_norm": 0.00017496715008746833, + "learning_rate": 3.580908795980442e-06, + "loss": 0.0007, + "num_input_tokens_seen": 255472704, + "step": 118395 + }, + { + "epoch": 19.31484502446982, + "grad_norm": 0.021043628454208374, + "learning_rate": 3.572410192532849e-06, + "loss": 0.0147, + "num_input_tokens_seen": 255484416, + "step": 118400 + }, + { + "epoch": 19.315660685154974, + "grad_norm": 0.00014954346988815814, + "learning_rate": 3.563921649820112e-06, + "loss": 0.0004, + "num_input_tokens_seen": 255495616, + "step": 118405 + }, + { + "epoch": 19.31647634584013, + "grad_norm": 0.006252608727663755, + "learning_rate": 3.555443168014261e-06, + "loss": 0.0002, + "num_input_tokens_seen": 255506304, + "step": 118410 + }, + { + "epoch": 19.317292006525285, + "grad_norm": 0.05369849503040314, + "learning_rate": 3.5469747472871574e-06, + "loss": 0.0025, + "num_input_tokens_seen": 255518240, + "step": 118415 + }, + { + "epoch": 19.31810766721044, + "grad_norm": 0.000458421534858644, + "learning_rate": 3.5385163878103864e-06, + "loss": 0.0003, + "num_input_tokens_seen": 255529760, + "step": 118420 + }, + { + "epoch": 19.318923327895597, + "grad_norm": 0.00019737222464755177, + "learning_rate": 3.5300680897554226e-06, + "loss": 0.0003, + "num_input_tokens_seen": 255540576, + "step": 118425 + }, + { + "epoch": 19.31973898858075, + "grad_norm": 0.0004869088879786432, + "learning_rate": 3.5216298532934068e-06, + "loss": 0.0002, + "num_input_tokens_seen": 255550976, + "step": 118430 + }, + { + "epoch": 19.320554649265905, + "grad_norm": 0.00030218841857276857, + "learning_rate": 3.5132016785954235e-06, + "loss": 0.0006, + "num_input_tokens_seen": 255561280, + "step": 118435 + }, + { + "epoch": 19.32137030995106, + "grad_norm": 0.0030433088541030884, + "learning_rate": 3.504783565832226e-06, + "loss": 0.0002, + "num_input_tokens_seen": 255571296, + "step": 118440 + }, + { + "epoch": 19.322185970636216, + "grad_norm": 0.0002221856266260147, + "learning_rate": 3.496375515174455e-06, + "loss": 0.0006, + "num_input_tokens_seen": 255582816, + "step": 118445 + }, + { + "epoch": 19.32300163132137, + "grad_norm": 0.01359740924090147, + "learning_rate": 3.4879775267925297e-06, + "loss": 0.0005, + "num_input_tokens_seen": 255593760, + "step": 118450 + }, + { + "epoch": 19.323817292006524, + "grad_norm": 0.0005367157864384353, + "learning_rate": 3.4795896008565363e-06, + "loss": 0.0005, + "num_input_tokens_seen": 255604256, + "step": 118455 + }, + { + "epoch": 19.32463295269168, + "grad_norm": 0.0004055321915075183, + "learning_rate": 3.4712117375365615e-06, + "loss": 0.0003, + "num_input_tokens_seen": 255614976, + "step": 118460 + }, + { + "epoch": 19.325448613376835, + "grad_norm": 0.007184091955423355, + "learning_rate": 3.4628439370024133e-06, + "loss": 0.0006, + "num_input_tokens_seen": 255626048, + "step": 118465 + }, + { + "epoch": 19.32626427406199, + "grad_norm": 0.00022854907729197294, + "learning_rate": 3.454486199423568e-06, + "loss": 0.0021, + "num_input_tokens_seen": 255637568, + "step": 118470 + }, + { + "epoch": 19.327079934747147, + "grad_norm": 0.010007885284721851, + "learning_rate": 3.4461385249695e-06, + "loss": 0.0012, + "num_input_tokens_seen": 255648384, + "step": 118475 + }, + { + "epoch": 19.3278955954323, + "grad_norm": 0.003142556408420205, + "learning_rate": 3.4378009138093524e-06, + "loss": 0.0001, + "num_input_tokens_seen": 255658240, + "step": 118480 + }, + { + "epoch": 19.328711256117455, + "grad_norm": 0.0001928832207340747, + "learning_rate": 3.429473366112157e-06, + "loss": 0.0002, + "num_input_tokens_seen": 255669536, + "step": 118485 + }, + { + "epoch": 19.32952691680261, + "grad_norm": 0.0012695303885266185, + "learning_rate": 3.421155882046556e-06, + "loss": 0.0003, + "num_input_tokens_seen": 255680096, + "step": 118490 + }, + { + "epoch": 19.330342577487766, + "grad_norm": 0.2068803459405899, + "learning_rate": 3.4128484617812482e-06, + "loss": 0.0038, + "num_input_tokens_seen": 255691008, + "step": 118495 + }, + { + "epoch": 19.33115823817292, + "grad_norm": 0.01590617187321186, + "learning_rate": 3.404551105484488e-06, + "loss": 0.002, + "num_input_tokens_seen": 255702976, + "step": 118500 + }, + { + "epoch": 19.331973898858074, + "grad_norm": 0.0003944123163819313, + "learning_rate": 3.3962638133245296e-06, + "loss": 0.0003, + "num_input_tokens_seen": 255714400, + "step": 118505 + }, + { + "epoch": 19.33278955954323, + "grad_norm": 0.00040506833465769887, + "learning_rate": 3.3879865854691825e-06, + "loss": 0.0042, + "num_input_tokens_seen": 255724736, + "step": 118510 + }, + { + "epoch": 19.333605220228385, + "grad_norm": 0.04011167958378792, + "learning_rate": 3.3797194220863694e-06, + "loss": 0.0008, + "num_input_tokens_seen": 255735328, + "step": 118515 + }, + { + "epoch": 19.33442088091354, + "grad_norm": 0.004475913010537624, + "learning_rate": 3.371462323343455e-06, + "loss": 0.0003, + "num_input_tokens_seen": 255747072, + "step": 118520 + }, + { + "epoch": 19.335236541598697, + "grad_norm": 0.0011126851895824075, + "learning_rate": 3.3632152894079727e-06, + "loss": 0.0011, + "num_input_tokens_seen": 255758304, + "step": 118525 + }, + { + "epoch": 19.33605220228385, + "grad_norm": 0.003824569983407855, + "learning_rate": 3.3549783204469e-06, + "loss": 0.001, + "num_input_tokens_seen": 255769056, + "step": 118530 + }, + { + "epoch": 19.336867862969005, + "grad_norm": 0.0036015701480209827, + "learning_rate": 3.3467514166272696e-06, + "loss": 0.0003, + "num_input_tokens_seen": 255780832, + "step": 118535 + }, + { + "epoch": 19.33768352365416, + "grad_norm": 0.002772236242890358, + "learning_rate": 3.338534578115726e-06, + "loss": 0.0002, + "num_input_tokens_seen": 255790720, + "step": 118540 + }, + { + "epoch": 19.338499184339316, + "grad_norm": 0.00015267882554326206, + "learning_rate": 3.3303278050789143e-06, + "loss": 0.0001, + "num_input_tokens_seen": 255801792, + "step": 118545 + }, + { + "epoch": 19.339314845024468, + "grad_norm": 0.1238447055220604, + "learning_rate": 3.3221310976829787e-06, + "loss": 0.0108, + "num_input_tokens_seen": 255813184, + "step": 118550 + }, + { + "epoch": 19.340130505709624, + "grad_norm": 0.0018206024542450905, + "learning_rate": 3.313944456094231e-06, + "loss": 0.0004, + "num_input_tokens_seen": 255824000, + "step": 118555 + }, + { + "epoch": 19.34094616639478, + "grad_norm": 0.006197202485054731, + "learning_rate": 3.3057678804784276e-06, + "loss": 0.0008, + "num_input_tokens_seen": 255834400, + "step": 118560 + }, + { + "epoch": 19.341761827079935, + "grad_norm": 0.015896322205662727, + "learning_rate": 3.29760137100138e-06, + "loss": 0.0032, + "num_input_tokens_seen": 255845984, + "step": 118565 + }, + { + "epoch": 19.34257748776509, + "grad_norm": 0.004965710919350386, + "learning_rate": 3.289444927828511e-06, + "loss": 0.0002, + "num_input_tokens_seen": 255855968, + "step": 118570 + }, + { + "epoch": 19.343393148450243, + "grad_norm": 0.00013055918680038303, + "learning_rate": 3.281298551125189e-06, + "loss": 0.0007, + "num_input_tokens_seen": 255866848, + "step": 118575 + }, + { + "epoch": 19.3442088091354, + "grad_norm": 0.00013076665345579386, + "learning_rate": 3.2731622410565043e-06, + "loss": 0.0007, + "num_input_tokens_seen": 255877984, + "step": 118580 + }, + { + "epoch": 19.345024469820554, + "grad_norm": 0.0016432912088930607, + "learning_rate": 3.265035997787269e-06, + "loss": 0.0005, + "num_input_tokens_seen": 255889408, + "step": 118585 + }, + { + "epoch": 19.34584013050571, + "grad_norm": 0.008219994604587555, + "learning_rate": 3.256919821482296e-06, + "loss": 0.0048, + "num_input_tokens_seen": 255900640, + "step": 118590 + }, + { + "epoch": 19.346655791190866, + "grad_norm": 0.03133723512291908, + "learning_rate": 3.2488137123059537e-06, + "loss": 0.0008, + "num_input_tokens_seen": 255910880, + "step": 118595 + }, + { + "epoch": 19.347471451876018, + "grad_norm": 0.00015701410302426666, + "learning_rate": 3.2407176704226102e-06, + "loss": 0.0002, + "num_input_tokens_seen": 255921728, + "step": 118600 + }, + { + "epoch": 19.348287112561174, + "grad_norm": 0.005754875484853983, + "learning_rate": 3.2326316959962463e-06, + "loss": 0.0003, + "num_input_tokens_seen": 255932736, + "step": 118605 + }, + { + "epoch": 19.34910277324633, + "grad_norm": 0.013132015243172646, + "learning_rate": 3.224555789190897e-06, + "loss": 0.0004, + "num_input_tokens_seen": 255943968, + "step": 118610 + }, + { + "epoch": 19.349918433931485, + "grad_norm": 0.00019180115486960858, + "learning_rate": 3.216489950170043e-06, + "loss": 0.0001, + "num_input_tokens_seen": 255954720, + "step": 118615 + }, + { + "epoch": 19.35073409461664, + "grad_norm": 0.04192522168159485, + "learning_rate": 3.208434179097275e-06, + "loss": 0.0018, + "num_input_tokens_seen": 255966656, + "step": 118620 + }, + { + "epoch": 19.351549755301793, + "grad_norm": 0.00014962237037252635, + "learning_rate": 3.200388476135796e-06, + "loss": 0.0009, + "num_input_tokens_seen": 255977984, + "step": 118625 + }, + { + "epoch": 19.35236541598695, + "grad_norm": 0.0007152362377382815, + "learning_rate": 3.1923528414487535e-06, + "loss": 0.0001, + "num_input_tokens_seen": 255988640, + "step": 118630 + }, + { + "epoch": 19.353181076672104, + "grad_norm": 0.00014208834909368306, + "learning_rate": 3.184327275198795e-06, + "loss": 0.0013, + "num_input_tokens_seen": 256000576, + "step": 118635 + }, + { + "epoch": 19.35399673735726, + "grad_norm": 0.00015851900388952345, + "learning_rate": 3.1763117775487903e-06, + "loss": 0.0002, + "num_input_tokens_seen": 256011104, + "step": 118640 + }, + { + "epoch": 19.354812398042416, + "grad_norm": 0.0032300513703376055, + "learning_rate": 3.168306348661054e-06, + "loss": 0.0002, + "num_input_tokens_seen": 256023200, + "step": 118645 + }, + { + "epoch": 19.355628058727568, + "grad_norm": 0.00023195683024823666, + "learning_rate": 3.160310988697901e-06, + "loss": 0.0015, + "num_input_tokens_seen": 256034080, + "step": 118650 + }, + { + "epoch": 19.356443719412724, + "grad_norm": 0.00010403442865936086, + "learning_rate": 3.152325697821312e-06, + "loss": 0.0012, + "num_input_tokens_seen": 256043168, + "step": 118655 + }, + { + "epoch": 19.35725938009788, + "grad_norm": 0.002620348474010825, + "learning_rate": 3.1443504761931585e-06, + "loss": 0.0002, + "num_input_tokens_seen": 256054432, + "step": 118660 + }, + { + "epoch": 19.358075040783035, + "grad_norm": 0.0015812115743756294, + "learning_rate": 3.1363853239750327e-06, + "loss": 0.0012, + "num_input_tokens_seen": 256065472, + "step": 118665 + }, + { + "epoch": 19.35889070146819, + "grad_norm": 0.00018971448298543692, + "learning_rate": 3.1284302413283615e-06, + "loss": 0.0008, + "num_input_tokens_seen": 256075840, + "step": 118670 + }, + { + "epoch": 19.359706362153343, + "grad_norm": 0.024445684626698494, + "learning_rate": 3.1204852284143493e-06, + "loss": 0.0006, + "num_input_tokens_seen": 256085632, + "step": 118675 + }, + { + "epoch": 19.3605220228385, + "grad_norm": 0.0005833973991684616, + "learning_rate": 3.1125502853941444e-06, + "loss": 0.0017, + "num_input_tokens_seen": 256097120, + "step": 118680 + }, + { + "epoch": 19.361337683523654, + "grad_norm": 0.00027141981991007924, + "learning_rate": 3.1046254124283413e-06, + "loss": 0.0001, + "num_input_tokens_seen": 256108256, + "step": 118685 + }, + { + "epoch": 19.36215334420881, + "grad_norm": 0.004508001729846001, + "learning_rate": 3.0967106096777e-06, + "loss": 0.0005, + "num_input_tokens_seen": 256119648, + "step": 118690 + }, + { + "epoch": 19.362969004893966, + "grad_norm": 0.0003442360321059823, + "learning_rate": 3.088805877302592e-06, + "loss": 0.0006, + "num_input_tokens_seen": 256130080, + "step": 118695 + }, + { + "epoch": 19.363784665579118, + "grad_norm": 0.0011631031520664692, + "learning_rate": 3.0809112154632226e-06, + "loss": 0.0057, + "num_input_tokens_seen": 256139680, + "step": 118700 + }, + { + "epoch": 19.364600326264274, + "grad_norm": 0.008932788856327534, + "learning_rate": 3.073026624319575e-06, + "loss": 0.0007, + "num_input_tokens_seen": 256149280, + "step": 118705 + }, + { + "epoch": 19.36541598694943, + "grad_norm": 0.0002427997678751126, + "learning_rate": 3.06515210403141e-06, + "loss": 0.0002, + "num_input_tokens_seen": 256160032, + "step": 118710 + }, + { + "epoch": 19.366231647634585, + "grad_norm": 0.023460719734430313, + "learning_rate": 3.0572876547583785e-06, + "loss": 0.0007, + "num_input_tokens_seen": 256171520, + "step": 118715 + }, + { + "epoch": 19.36704730831974, + "grad_norm": 9.563328785588965e-05, + "learning_rate": 3.0494332766597967e-06, + "loss": 0.0016, + "num_input_tokens_seen": 256182208, + "step": 118720 + }, + { + "epoch": 19.367862969004893, + "grad_norm": 0.07000649720430374, + "learning_rate": 3.0415889698949262e-06, + "loss": 0.0023, + "num_input_tokens_seen": 256194080, + "step": 118725 + }, + { + "epoch": 19.36867862969005, + "grad_norm": 0.0010812929831445217, + "learning_rate": 3.0337547346226404e-06, + "loss": 0.0001, + "num_input_tokens_seen": 256204640, + "step": 118730 + }, + { + "epoch": 19.369494290375204, + "grad_norm": 0.0015740001108497381, + "learning_rate": 3.025930571001756e-06, + "loss": 0.0002, + "num_input_tokens_seen": 256215872, + "step": 118735 + }, + { + "epoch": 19.37030995106036, + "grad_norm": 0.004447279963642359, + "learning_rate": 3.018116479190869e-06, + "loss": 0.0004, + "num_input_tokens_seen": 256226240, + "step": 118740 + }, + { + "epoch": 19.371125611745512, + "grad_norm": 0.00020363663497846574, + "learning_rate": 3.0103124593483522e-06, + "loss": 0.0004, + "num_input_tokens_seen": 256237728, + "step": 118745 + }, + { + "epoch": 19.371941272430668, + "grad_norm": 0.005963383708149195, + "learning_rate": 3.002518511632246e-06, + "loss": 0.0004, + "num_input_tokens_seen": 256246720, + "step": 118750 + }, + { + "epoch": 19.372756933115824, + "grad_norm": 0.041212525218725204, + "learning_rate": 2.9947346362006466e-06, + "loss": 0.0005, + "num_input_tokens_seen": 256257984, + "step": 118755 + }, + { + "epoch": 19.37357259380098, + "grad_norm": 0.0002067715540761128, + "learning_rate": 2.986960833211205e-06, + "loss": 0.0003, + "num_input_tokens_seen": 256268000, + "step": 118760 + }, + { + "epoch": 19.374388254486135, + "grad_norm": 0.00032472560997121036, + "learning_rate": 2.9791971028215737e-06, + "loss": 0.0003, + "num_input_tokens_seen": 256278368, + "step": 118765 + }, + { + "epoch": 19.375203915171287, + "grad_norm": 0.00011396812624298036, + "learning_rate": 2.9714434451889595e-06, + "loss": 0.0012, + "num_input_tokens_seen": 256288800, + "step": 118770 + }, + { + "epoch": 19.376019575856443, + "grad_norm": 0.00012621170026250184, + "learning_rate": 2.9636998604706255e-06, + "loss": 0.0022, + "num_input_tokens_seen": 256299744, + "step": 118775 + }, + { + "epoch": 19.3768352365416, + "grad_norm": 0.00012961390893906355, + "learning_rate": 2.955966348823391e-06, + "loss": 0.0003, + "num_input_tokens_seen": 256310720, + "step": 118780 + }, + { + "epoch": 19.377650897226754, + "grad_norm": 0.0012271021259948611, + "learning_rate": 2.948242910404131e-06, + "loss": 0.001, + "num_input_tokens_seen": 256321504, + "step": 118785 + }, + { + "epoch": 19.37846655791191, + "grad_norm": 0.00026982862618751824, + "learning_rate": 2.9405295453692195e-06, + "loss": 0.0002, + "num_input_tokens_seen": 256331904, + "step": 118790 + }, + { + "epoch": 19.379282218597062, + "grad_norm": 0.03251657262444496, + "learning_rate": 2.9328262538750316e-06, + "loss": 0.0016, + "num_input_tokens_seen": 256342848, + "step": 118795 + }, + { + "epoch": 19.380097879282218, + "grad_norm": 0.02493751421570778, + "learning_rate": 2.9251330360777205e-06, + "loss": 0.0008, + "num_input_tokens_seen": 256353856, + "step": 118800 + }, + { + "epoch": 19.380913539967374, + "grad_norm": 0.0011174265528097749, + "learning_rate": 2.9174498921331616e-06, + "loss": 0.0003, + "num_input_tokens_seen": 256364416, + "step": 118805 + }, + { + "epoch": 19.38172920065253, + "grad_norm": 0.009552324190735817, + "learning_rate": 2.909776822197063e-06, + "loss": 0.0003, + "num_input_tokens_seen": 256373472, + "step": 118810 + }, + { + "epoch": 19.382544861337685, + "grad_norm": 0.00013382371980696917, + "learning_rate": 2.902113826424968e-06, + "loss": 0.0001, + "num_input_tokens_seen": 256385216, + "step": 118815 + }, + { + "epoch": 19.383360522022837, + "grad_norm": 0.002633386291563511, + "learning_rate": 2.8944609049721406e-06, + "loss": 0.0013, + "num_input_tokens_seen": 256395648, + "step": 118820 + }, + { + "epoch": 19.384176182707993, + "grad_norm": 0.04632335528731346, + "learning_rate": 2.8868180579936787e-06, + "loss": 0.0007, + "num_input_tokens_seen": 256407360, + "step": 118825 + }, + { + "epoch": 19.38499184339315, + "grad_norm": 0.00012273927859496325, + "learning_rate": 2.8791852856445143e-06, + "loss": 0.0002, + "num_input_tokens_seen": 256417984, + "step": 118830 + }, + { + "epoch": 19.385807504078304, + "grad_norm": 0.006481477525085211, + "learning_rate": 2.8715625880792463e-06, + "loss": 0.0002, + "num_input_tokens_seen": 256428608, + "step": 118835 + }, + { + "epoch": 19.38662316476346, + "grad_norm": 0.002791687846183777, + "learning_rate": 2.8639499654524724e-06, + "loss": 0.0002, + "num_input_tokens_seen": 256440192, + "step": 118840 + }, + { + "epoch": 19.387438825448612, + "grad_norm": 0.006817100569605827, + "learning_rate": 2.856347417918348e-06, + "loss": 0.0007, + "num_input_tokens_seen": 256452224, + "step": 118845 + }, + { + "epoch": 19.388254486133768, + "grad_norm": 0.0228599701076746, + "learning_rate": 2.8487549456310824e-06, + "loss": 0.0025, + "num_input_tokens_seen": 256462944, + "step": 118850 + }, + { + "epoch": 19.389070146818923, + "grad_norm": 0.0003468099457677454, + "learning_rate": 2.841172548744442e-06, + "loss": 0.0008, + "num_input_tokens_seen": 256473440, + "step": 118855 + }, + { + "epoch": 19.38988580750408, + "grad_norm": 0.00020141841378062963, + "learning_rate": 2.8336002274121365e-06, + "loss": 0.0138, + "num_input_tokens_seen": 256483104, + "step": 118860 + }, + { + "epoch": 19.390701468189235, + "grad_norm": 0.00035959508386440575, + "learning_rate": 2.8260379817875993e-06, + "loss": 0.0028, + "num_input_tokens_seen": 256494016, + "step": 118865 + }, + { + "epoch": 19.391517128874387, + "grad_norm": 0.009039847180247307, + "learning_rate": 2.818485812024152e-06, + "loss": 0.0003, + "num_input_tokens_seen": 256503968, + "step": 118870 + }, + { + "epoch": 19.392332789559543, + "grad_norm": 0.005495529621839523, + "learning_rate": 2.810943718274783e-06, + "loss": 0.0009, + "num_input_tokens_seen": 256515136, + "step": 118875 + }, + { + "epoch": 19.3931484502447, + "grad_norm": 0.00021118400036357343, + "learning_rate": 2.8034117006924264e-06, + "loss": 0.0023, + "num_input_tokens_seen": 256525984, + "step": 118880 + }, + { + "epoch": 19.393964110929854, + "grad_norm": 0.04508800804615021, + "learning_rate": 2.795889759429626e-06, + "loss": 0.0237, + "num_input_tokens_seen": 256537248, + "step": 118885 + }, + { + "epoch": 19.39477977161501, + "grad_norm": 0.009599662385880947, + "learning_rate": 2.788377894638816e-06, + "loss": 0.0004, + "num_input_tokens_seen": 256547552, + "step": 118890 + }, + { + "epoch": 19.395595432300162, + "grad_norm": 0.00011785190145019442, + "learning_rate": 2.7808761064723186e-06, + "loss": 0.0012, + "num_input_tokens_seen": 256557728, + "step": 118895 + }, + { + "epoch": 19.396411092985318, + "grad_norm": 0.0001640346454223618, + "learning_rate": 2.773384395082179e-06, + "loss": 0.0002, + "num_input_tokens_seen": 256568800, + "step": 118900 + }, + { + "epoch": 19.397226753670473, + "grad_norm": 0.0014635203406214714, + "learning_rate": 2.765902760620165e-06, + "loss": 0.0003, + "num_input_tokens_seen": 256577888, + "step": 118905 + }, + { + "epoch": 19.39804241435563, + "grad_norm": 0.10303962230682373, + "learning_rate": 2.758431203237877e-06, + "loss": 0.002, + "num_input_tokens_seen": 256588512, + "step": 118910 + }, + { + "epoch": 19.39885807504078, + "grad_norm": 0.000996480812318623, + "learning_rate": 2.7509697230868048e-06, + "loss": 0.0009, + "num_input_tokens_seen": 256598400, + "step": 118915 + }, + { + "epoch": 19.399673735725937, + "grad_norm": 0.00019520179193932563, + "learning_rate": 2.7435183203181613e-06, + "loss": 0.0013, + "num_input_tokens_seen": 256608928, + "step": 118920 + }, + { + "epoch": 19.400489396411093, + "grad_norm": 0.0034996974281966686, + "learning_rate": 2.7360769950828814e-06, + "loss": 0.0002, + "num_input_tokens_seen": 256618048, + "step": 118925 + }, + { + "epoch": 19.40130505709625, + "grad_norm": 0.0002294249425176531, + "learning_rate": 2.728645747531844e-06, + "loss": 0.0003, + "num_input_tokens_seen": 256630432, + "step": 118930 + }, + { + "epoch": 19.402120717781404, + "grad_norm": 0.04066091403365135, + "learning_rate": 2.721224577815651e-06, + "loss": 0.002, + "num_input_tokens_seen": 256639680, + "step": 118935 + }, + { + "epoch": 19.402936378466556, + "grad_norm": 0.001438430743291974, + "learning_rate": 2.713813486084682e-06, + "loss": 0.0292, + "num_input_tokens_seen": 256650688, + "step": 118940 + }, + { + "epoch": 19.403752039151712, + "grad_norm": 0.0002195403940277174, + "learning_rate": 2.7064124724891505e-06, + "loss": 0.0016, + "num_input_tokens_seen": 256660256, + "step": 118945 + }, + { + "epoch": 19.404567699836868, + "grad_norm": 0.0001556773786433041, + "learning_rate": 2.6990215371789916e-06, + "loss": 0.0001, + "num_input_tokens_seen": 256671328, + "step": 118950 + }, + { + "epoch": 19.405383360522023, + "grad_norm": 9.870591020444408e-05, + "learning_rate": 2.691640680304086e-06, + "loss": 0.0002, + "num_input_tokens_seen": 256682816, + "step": 118955 + }, + { + "epoch": 19.40619902120718, + "grad_norm": 0.00023748722742311656, + "learning_rate": 2.684269902013925e-06, + "loss": 0.0002, + "num_input_tokens_seen": 256694080, + "step": 118960 + }, + { + "epoch": 19.40701468189233, + "grad_norm": 0.0005493731005117297, + "learning_rate": 2.676909202457889e-06, + "loss": 0.0001, + "num_input_tokens_seen": 256704864, + "step": 118965 + }, + { + "epoch": 19.407830342577487, + "grad_norm": 0.013341154903173447, + "learning_rate": 2.6695585817852476e-06, + "loss": 0.0006, + "num_input_tokens_seen": 256715136, + "step": 118970 + }, + { + "epoch": 19.408646003262643, + "grad_norm": 0.003276692470535636, + "learning_rate": 2.6622180401448815e-06, + "loss": 0.0001, + "num_input_tokens_seen": 256725120, + "step": 118975 + }, + { + "epoch": 19.4094616639478, + "grad_norm": 0.0026900172233581543, + "learning_rate": 2.6548875776856163e-06, + "loss": 0.0005, + "num_input_tokens_seen": 256734720, + "step": 118980 + }, + { + "epoch": 19.410277324632954, + "grad_norm": 0.00019983973470516503, + "learning_rate": 2.6475671945559442e-06, + "loss": 0.0008, + "num_input_tokens_seen": 256745344, + "step": 118985 + }, + { + "epoch": 19.411092985318106, + "grad_norm": 0.00012815906666219234, + "learning_rate": 2.6402568909042467e-06, + "loss": 0.0028, + "num_input_tokens_seen": 256757248, + "step": 118990 + }, + { + "epoch": 19.411908646003262, + "grad_norm": 0.11474797129631042, + "learning_rate": 2.6329566668787384e-06, + "loss": 0.0017, + "num_input_tokens_seen": 256768000, + "step": 118995 + }, + { + "epoch": 19.412724306688418, + "grad_norm": 0.017117729410529137, + "learning_rate": 2.625666522627301e-06, + "loss": 0.0006, + "num_input_tokens_seen": 256778176, + "step": 119000 + }, + { + "epoch": 19.413539967373573, + "grad_norm": 0.0026034030597656965, + "learning_rate": 2.6183864582976503e-06, + "loss": 0.0002, + "num_input_tokens_seen": 256789376, + "step": 119005 + }, + { + "epoch": 19.41435562805873, + "grad_norm": 0.018916333094239235, + "learning_rate": 2.611116474037445e-06, + "loss": 0.0002, + "num_input_tokens_seen": 256799136, + "step": 119010 + }, + { + "epoch": 19.41517128874388, + "grad_norm": 0.012309384532272816, + "learning_rate": 2.603856569993901e-06, + "loss": 0.0005, + "num_input_tokens_seen": 256810592, + "step": 119015 + }, + { + "epoch": 19.415986949429037, + "grad_norm": 0.00020488417067099363, + "learning_rate": 2.596606746314234e-06, + "loss": 0.0007, + "num_input_tokens_seen": 256821184, + "step": 119020 + }, + { + "epoch": 19.416802610114193, + "grad_norm": 0.0026599301490932703, + "learning_rate": 2.589367003145271e-06, + "loss": 0.0008, + "num_input_tokens_seen": 256832160, + "step": 119025 + }, + { + "epoch": 19.41761827079935, + "grad_norm": 0.0008563905139453709, + "learning_rate": 2.5821373406338387e-06, + "loss": 0.0003, + "num_input_tokens_seen": 256841408, + "step": 119030 + }, + { + "epoch": 19.418433931484504, + "grad_norm": 0.14199940860271454, + "learning_rate": 2.574917758926376e-06, + "loss": 0.0026, + "num_input_tokens_seen": 256853024, + "step": 119035 + }, + { + "epoch": 19.419249592169656, + "grad_norm": 0.006780951749533415, + "learning_rate": 2.5677082581692657e-06, + "loss": 0.0011, + "num_input_tokens_seen": 256865120, + "step": 119040 + }, + { + "epoch": 19.420065252854812, + "grad_norm": 0.00016649358440190554, + "learning_rate": 2.5605088385085573e-06, + "loss": 0.0003, + "num_input_tokens_seen": 256876768, + "step": 119045 + }, + { + "epoch": 19.420880913539968, + "grad_norm": 0.0003078467561863363, + "learning_rate": 2.553319500090245e-06, + "loss": 0.0007, + "num_input_tokens_seen": 256888544, + "step": 119050 + }, + { + "epoch": 19.421696574225123, + "grad_norm": 0.0009791761403903365, + "learning_rate": 2.5461402430599357e-06, + "loss": 0.0007, + "num_input_tokens_seen": 256899168, + "step": 119055 + }, + { + "epoch": 19.42251223491028, + "grad_norm": 0.0001941788214026019, + "learning_rate": 2.5389710675631227e-06, + "loss": 0.0002, + "num_input_tokens_seen": 256908384, + "step": 119060 + }, + { + "epoch": 19.42332789559543, + "grad_norm": 0.0016292885411530733, + "learning_rate": 2.5318119737451905e-06, + "loss": 0.0004, + "num_input_tokens_seen": 256920672, + "step": 119065 + }, + { + "epoch": 19.424143556280587, + "grad_norm": 0.0005550052737817168, + "learning_rate": 2.524662961751134e-06, + "loss": 0.0009, + "num_input_tokens_seen": 256932000, + "step": 119070 + }, + { + "epoch": 19.424959216965743, + "grad_norm": 0.00013194416533224285, + "learning_rate": 2.517524031725893e-06, + "loss": 0.0008, + "num_input_tokens_seen": 256943488, + "step": 119075 + }, + { + "epoch": 19.4257748776509, + "grad_norm": 0.0003847342450171709, + "learning_rate": 2.5103951838141292e-06, + "loss": 0.0003, + "num_input_tokens_seen": 256955904, + "step": 119080 + }, + { + "epoch": 19.42659053833605, + "grad_norm": 0.0011214209953323007, + "learning_rate": 2.503276418160283e-06, + "loss": 0.0011, + "num_input_tokens_seen": 256967968, + "step": 119085 + }, + { + "epoch": 19.427406199021206, + "grad_norm": 0.00016893558495212346, + "learning_rate": 2.496167734908683e-06, + "loss": 0.0012, + "num_input_tokens_seen": 256978368, + "step": 119090 + }, + { + "epoch": 19.428221859706362, + "grad_norm": 0.01589166186749935, + "learning_rate": 2.489069134203381e-06, + "loss": 0.0009, + "num_input_tokens_seen": 256989856, + "step": 119095 + }, + { + "epoch": 19.429037520391518, + "grad_norm": 0.003723285859450698, + "learning_rate": 2.481980616188262e-06, + "loss": 0.0004, + "num_input_tokens_seen": 257000768, + "step": 119100 + }, + { + "epoch": 19.429853181076673, + "grad_norm": 0.031040076166391373, + "learning_rate": 2.474902181006877e-06, + "loss": 0.0014, + "num_input_tokens_seen": 257011840, + "step": 119105 + }, + { + "epoch": 19.430668841761825, + "grad_norm": 0.01211348082870245, + "learning_rate": 2.467833828802779e-06, + "loss": 0.0004, + "num_input_tokens_seen": 257022464, + "step": 119110 + }, + { + "epoch": 19.43148450244698, + "grad_norm": 0.0008157999836839736, + "learning_rate": 2.4607755597192417e-06, + "loss": 0.001, + "num_input_tokens_seen": 257032960, + "step": 119115 + }, + { + "epoch": 19.432300163132137, + "grad_norm": 0.002909276634454727, + "learning_rate": 2.453727373899206e-06, + "loss": 0.0004, + "num_input_tokens_seen": 257043168, + "step": 119120 + }, + { + "epoch": 19.433115823817293, + "grad_norm": 0.0036067490000277758, + "learning_rate": 2.4466892714856137e-06, + "loss": 0.0003, + "num_input_tokens_seen": 257053920, + "step": 119125 + }, + { + "epoch": 19.43393148450245, + "grad_norm": 0.01366361789405346, + "learning_rate": 2.439661252621017e-06, + "loss": 0.0011, + "num_input_tokens_seen": 257066592, + "step": 119130 + }, + { + "epoch": 19.4347471451876, + "grad_norm": 0.09551262110471725, + "learning_rate": 2.4326433174479133e-06, + "loss": 0.0014, + "num_input_tokens_seen": 257077024, + "step": 119135 + }, + { + "epoch": 19.435562805872756, + "grad_norm": 0.004896916914731264, + "learning_rate": 2.4256354661084666e-06, + "loss": 0.0003, + "num_input_tokens_seen": 257087456, + "step": 119140 + }, + { + "epoch": 19.436378466557912, + "grad_norm": 0.00012975318531971425, + "learning_rate": 2.4186376987447857e-06, + "loss": 0.0002, + "num_input_tokens_seen": 257097216, + "step": 119145 + }, + { + "epoch": 19.437194127243067, + "grad_norm": 0.0017046157736331224, + "learning_rate": 2.41165001549859e-06, + "loss": 0.0482, + "num_input_tokens_seen": 257108192, + "step": 119150 + }, + { + "epoch": 19.438009787928223, + "grad_norm": 0.0016280838754028082, + "learning_rate": 2.4046724165115998e-06, + "loss": 0.0002, + "num_input_tokens_seen": 257118560, + "step": 119155 + }, + { + "epoch": 19.438825448613375, + "grad_norm": 0.0013878579484298825, + "learning_rate": 2.3977049019250907e-06, + "loss": 0.0001, + "num_input_tokens_seen": 257129216, + "step": 119160 + }, + { + "epoch": 19.43964110929853, + "grad_norm": 0.00014716072473675013, + "learning_rate": 2.3907474718803944e-06, + "loss": 0.0002, + "num_input_tokens_seen": 257137760, + "step": 119165 + }, + { + "epoch": 19.440456769983687, + "grad_norm": 0.000378182390704751, + "learning_rate": 2.383800126518454e-06, + "loss": 0.0005, + "num_input_tokens_seen": 257147808, + "step": 119170 + }, + { + "epoch": 19.441272430668842, + "grad_norm": 0.007061046548187733, + "learning_rate": 2.3768628659801005e-06, + "loss": 0.0002, + "num_input_tokens_seen": 257159136, + "step": 119175 + }, + { + "epoch": 19.442088091353998, + "grad_norm": 0.009216031059622765, + "learning_rate": 2.3699356904058334e-06, + "loss": 0.0003, + "num_input_tokens_seen": 257169824, + "step": 119180 + }, + { + "epoch": 19.44290375203915, + "grad_norm": 0.0019694368820637465, + "learning_rate": 2.363018599936151e-06, + "loss": 0.0001, + "num_input_tokens_seen": 257181504, + "step": 119185 + }, + { + "epoch": 19.443719412724306, + "grad_norm": 0.0002259332250105217, + "learning_rate": 2.3561115947111635e-06, + "loss": 0.0001, + "num_input_tokens_seen": 257191488, + "step": 119190 + }, + { + "epoch": 19.44453507340946, + "grad_norm": 0.030159763991832733, + "learning_rate": 2.349214674870925e-06, + "loss": 0.001, + "num_input_tokens_seen": 257203392, + "step": 119195 + }, + { + "epoch": 19.445350734094617, + "grad_norm": 0.0015522867906838655, + "learning_rate": 2.3423278405551583e-06, + "loss": 0.0002, + "num_input_tokens_seen": 257214656, + "step": 119200 + }, + { + "epoch": 19.446166394779773, + "grad_norm": 0.0019559916108846664, + "learning_rate": 2.335451091903418e-06, + "loss": 0.001, + "num_input_tokens_seen": 257224576, + "step": 119205 + }, + { + "epoch": 19.446982055464925, + "grad_norm": 0.0008649926166981459, + "learning_rate": 2.3285844290550916e-06, + "loss": 0.0003, + "num_input_tokens_seen": 257236288, + "step": 119210 + }, + { + "epoch": 19.44779771615008, + "grad_norm": 0.00012155095464549959, + "learning_rate": 2.321727852149402e-06, + "loss": 0.0086, + "num_input_tokens_seen": 257245952, + "step": 119215 + }, + { + "epoch": 19.448613376835237, + "grad_norm": 0.0008496473892591894, + "learning_rate": 2.314881361325183e-06, + "loss": 0.0007, + "num_input_tokens_seen": 257257280, + "step": 119220 + }, + { + "epoch": 19.449429037520392, + "grad_norm": 0.00012694452016148716, + "learning_rate": 2.308044956721267e-06, + "loss": 0.0002, + "num_input_tokens_seen": 257268736, + "step": 119225 + }, + { + "epoch": 19.450244698205548, + "grad_norm": 0.00022975579486228526, + "learning_rate": 2.30121863847621e-06, + "loss": 0.001, + "num_input_tokens_seen": 257279872, + "step": 119230 + }, + { + "epoch": 19.4510603588907, + "grad_norm": 0.0004101041704416275, + "learning_rate": 2.294402406728291e-06, + "loss": 0.0001, + "num_input_tokens_seen": 257290112, + "step": 119235 + }, + { + "epoch": 19.451876019575856, + "grad_norm": 0.0014843905810266733, + "learning_rate": 2.2875962616157318e-06, + "loss": 0.0002, + "num_input_tokens_seen": 257300608, + "step": 119240 + }, + { + "epoch": 19.45269168026101, + "grad_norm": 0.0003166191454511136, + "learning_rate": 2.2808002032763676e-06, + "loss": 0.0002, + "num_input_tokens_seen": 257312320, + "step": 119245 + }, + { + "epoch": 19.453507340946167, + "grad_norm": 0.00019361302838660777, + "learning_rate": 2.2740142318480873e-06, + "loss": 0.0034, + "num_input_tokens_seen": 257322784, + "step": 119250 + }, + { + "epoch": 19.454323001631323, + "grad_norm": 0.0011503675486892462, + "learning_rate": 2.267238347468226e-06, + "loss": 0.0004, + "num_input_tokens_seen": 257333824, + "step": 119255 + }, + { + "epoch": 19.455138662316475, + "grad_norm": 0.03772636130452156, + "learning_rate": 2.2604725502742286e-06, + "loss": 0.0008, + "num_input_tokens_seen": 257342976, + "step": 119260 + }, + { + "epoch": 19.45595432300163, + "grad_norm": 0.01808868534862995, + "learning_rate": 2.2537168404032082e-06, + "loss": 0.0004, + "num_input_tokens_seen": 257355072, + "step": 119265 + }, + { + "epoch": 19.456769983686787, + "grad_norm": 0.00011219976295251399, + "learning_rate": 2.2469712179920555e-06, + "loss": 0.0001, + "num_input_tokens_seen": 257364832, + "step": 119270 + }, + { + "epoch": 19.457585644371942, + "grad_norm": 0.00725690508261323, + "learning_rate": 2.2402356831774383e-06, + "loss": 0.0022, + "num_input_tokens_seen": 257374976, + "step": 119275 + }, + { + "epoch": 19.458401305057095, + "grad_norm": 9.38585726544261e-05, + "learning_rate": 2.2335102360959148e-06, + "loss": 0.0011, + "num_input_tokens_seen": 257386848, + "step": 119280 + }, + { + "epoch": 19.45921696574225, + "grad_norm": 0.035774584859609604, + "learning_rate": 2.226794876883764e-06, + "loss": 0.0006, + "num_input_tokens_seen": 257397920, + "step": 119285 + }, + { + "epoch": 19.460032626427406, + "grad_norm": 0.0002034591743722558, + "learning_rate": 2.2200896056771004e-06, + "loss": 0.0001, + "num_input_tokens_seen": 257410048, + "step": 119290 + }, + { + "epoch": 19.46084828711256, + "grad_norm": 0.0026589222252368927, + "learning_rate": 2.2133944226117587e-06, + "loss": 0.0006, + "num_input_tokens_seen": 257421280, + "step": 119295 + }, + { + "epoch": 19.461663947797717, + "grad_norm": 0.037036072462797165, + "learning_rate": 2.2067093278235194e-06, + "loss": 0.0016, + "num_input_tokens_seen": 257433024, + "step": 119300 + }, + { + "epoch": 19.46247960848287, + "grad_norm": 0.5362663269042969, + "learning_rate": 2.2000343214477746e-06, + "loss": 0.0129, + "num_input_tokens_seen": 257444672, + "step": 119305 + }, + { + "epoch": 19.463295269168025, + "grad_norm": 0.00010561345698079094, + "learning_rate": 2.1933694036198605e-06, + "loss": 0.0008, + "num_input_tokens_seen": 257455744, + "step": 119310 + }, + { + "epoch": 19.46411092985318, + "grad_norm": 0.00022067761165089905, + "learning_rate": 2.1867145744747796e-06, + "loss": 0.0007, + "num_input_tokens_seen": 257465920, + "step": 119315 + }, + { + "epoch": 19.464926590538337, + "grad_norm": 0.0002512575883883983, + "learning_rate": 2.1800698341475355e-06, + "loss": 0.0009, + "num_input_tokens_seen": 257475392, + "step": 119320 + }, + { + "epoch": 19.465742251223492, + "grad_norm": 0.00011261735926382244, + "learning_rate": 2.173435182772632e-06, + "loss": 0.0001, + "num_input_tokens_seen": 257487808, + "step": 119325 + }, + { + "epoch": 19.466557911908644, + "grad_norm": 0.0008177530253306031, + "learning_rate": 2.166810620484627e-06, + "loss": 0.0008, + "num_input_tokens_seen": 257498208, + "step": 119330 + }, + { + "epoch": 19.4673735725938, + "grad_norm": 0.0007842247141525149, + "learning_rate": 2.160196147417748e-06, + "loss": 0.0004, + "num_input_tokens_seen": 257509376, + "step": 119335 + }, + { + "epoch": 19.468189233278956, + "grad_norm": 0.0008269516984000802, + "learning_rate": 2.153591763706053e-06, + "loss": 0.0012, + "num_input_tokens_seen": 257520032, + "step": 119340 + }, + { + "epoch": 19.46900489396411, + "grad_norm": 0.0001589106541359797, + "learning_rate": 2.1469974694833805e-06, + "loss": 0.0002, + "num_input_tokens_seen": 257531072, + "step": 119345 + }, + { + "epoch": 19.469820554649267, + "grad_norm": 0.09193243831396103, + "learning_rate": 2.140413264883401e-06, + "loss": 0.0013, + "num_input_tokens_seen": 257541024, + "step": 119350 + }, + { + "epoch": 19.47063621533442, + "grad_norm": 0.1113378182053566, + "learning_rate": 2.1338391500394516e-06, + "loss": 0.004, + "num_input_tokens_seen": 257551264, + "step": 119355 + }, + { + "epoch": 19.471451876019575, + "grad_norm": 0.0013603615807369351, + "learning_rate": 2.1272751250849263e-06, + "loss": 0.0001, + "num_input_tokens_seen": 257561248, + "step": 119360 + }, + { + "epoch": 19.47226753670473, + "grad_norm": 0.00011289698886685073, + "learning_rate": 2.120721190152719e-06, + "loss": 0.0001, + "num_input_tokens_seen": 257572640, + "step": 119365 + }, + { + "epoch": 19.473083197389887, + "grad_norm": 0.00130069674924016, + "learning_rate": 2.114177345375723e-06, + "loss": 0.0002, + "num_input_tokens_seen": 257583328, + "step": 119370 + }, + { + "epoch": 19.473898858075042, + "grad_norm": 0.0395248606801033, + "learning_rate": 2.1076435908864986e-06, + "loss": 0.0008, + "num_input_tokens_seen": 257594016, + "step": 119375 + }, + { + "epoch": 19.474714518760194, + "grad_norm": 0.0001660191483097151, + "learning_rate": 2.1011199268175517e-06, + "loss": 0.0003, + "num_input_tokens_seen": 257602464, + "step": 119380 + }, + { + "epoch": 19.47553017944535, + "grad_norm": 0.02291187085211277, + "learning_rate": 2.0946063533009986e-06, + "loss": 0.0004, + "num_input_tokens_seen": 257612640, + "step": 119385 + }, + { + "epoch": 19.476345840130506, + "grad_norm": 0.004429606255143881, + "learning_rate": 2.0881028704688997e-06, + "loss": 0.0005, + "num_input_tokens_seen": 257623456, + "step": 119390 + }, + { + "epoch": 19.47716150081566, + "grad_norm": 0.00016154415789060295, + "learning_rate": 2.0816094784530394e-06, + "loss": 0.0013, + "num_input_tokens_seen": 257633664, + "step": 119395 + }, + { + "epoch": 19.477977161500817, + "grad_norm": 0.03298862650990486, + "learning_rate": 2.075126177385034e-06, + "loss": 0.0089, + "num_input_tokens_seen": 257644832, + "step": 119400 + }, + { + "epoch": 19.47879282218597, + "grad_norm": 0.002902628155425191, + "learning_rate": 2.0686529673962784e-06, + "loss": 0.0004, + "num_input_tokens_seen": 257655008, + "step": 119405 + }, + { + "epoch": 19.479608482871125, + "grad_norm": 0.0017371232388541102, + "learning_rate": 2.06218984861789e-06, + "loss": 0.0003, + "num_input_tokens_seen": 257665984, + "step": 119410 + }, + { + "epoch": 19.48042414355628, + "grad_norm": 0.001359336543828249, + "learning_rate": 2.0557368211809314e-06, + "loss": 0.0001, + "num_input_tokens_seen": 257676384, + "step": 119415 + }, + { + "epoch": 19.481239804241437, + "grad_norm": 0.0029154985677450895, + "learning_rate": 2.0492938852161304e-06, + "loss": 0.0151, + "num_input_tokens_seen": 257686528, + "step": 119420 + }, + { + "epoch": 19.482055464926592, + "grad_norm": 0.02250189520418644, + "learning_rate": 2.042861040854105e-06, + "loss": 0.0016, + "num_input_tokens_seen": 257697696, + "step": 119425 + }, + { + "epoch": 19.482871125611744, + "grad_norm": 0.008624950423836708, + "learning_rate": 2.0364382882251952e-06, + "loss": 0.001, + "num_input_tokens_seen": 257708960, + "step": 119430 + }, + { + "epoch": 19.4836867862969, + "grad_norm": 0.01807945780456066, + "learning_rate": 2.030025627459575e-06, + "loss": 0.0016, + "num_input_tokens_seen": 257718976, + "step": 119435 + }, + { + "epoch": 19.484502446982056, + "grad_norm": 0.00031700325780548155, + "learning_rate": 2.023623058687196e-06, + "loss": 0.0002, + "num_input_tokens_seen": 257730432, + "step": 119440 + }, + { + "epoch": 19.48531810766721, + "grad_norm": 0.0010034743463620543, + "learning_rate": 2.0172305820378434e-06, + "loss": 0.0024, + "num_input_tokens_seen": 257740960, + "step": 119445 + }, + { + "epoch": 19.486133768352367, + "grad_norm": 0.00044112239265814424, + "learning_rate": 2.010848197641024e-06, + "loss": 0.0003, + "num_input_tokens_seen": 257752000, + "step": 119450 + }, + { + "epoch": 19.48694942903752, + "grad_norm": 0.0012064583133906126, + "learning_rate": 2.0044759056261354e-06, + "loss": 0.0001, + "num_input_tokens_seen": 257763456, + "step": 119455 + }, + { + "epoch": 19.487765089722675, + "grad_norm": 0.018535533919930458, + "learning_rate": 1.9981137061222954e-06, + "loss": 0.0005, + "num_input_tokens_seen": 257774112, + "step": 119460 + }, + { + "epoch": 19.48858075040783, + "grad_norm": 0.0003633775340858847, + "learning_rate": 1.9917615992584017e-06, + "loss": 0.0002, + "num_input_tokens_seen": 257784672, + "step": 119465 + }, + { + "epoch": 19.489396411092986, + "grad_norm": 0.00574346212670207, + "learning_rate": 1.985419585163295e-06, + "loss": 0.0006, + "num_input_tokens_seen": 257794720, + "step": 119470 + }, + { + "epoch": 19.49021207177814, + "grad_norm": 0.0021029370836913586, + "learning_rate": 1.9790876639653733e-06, + "loss": 0.0024, + "num_input_tokens_seen": 257806400, + "step": 119475 + }, + { + "epoch": 19.491027732463294, + "grad_norm": 0.05862275883555412, + "learning_rate": 1.972765835793089e-06, + "loss": 0.0005, + "num_input_tokens_seen": 257816256, + "step": 119480 + }, + { + "epoch": 19.49184339314845, + "grad_norm": 0.0004049956623930484, + "learning_rate": 1.9664541007744508e-06, + "loss": 0.0011, + "num_input_tokens_seen": 257828000, + "step": 119485 + }, + { + "epoch": 19.492659053833606, + "grad_norm": 0.0001392452686559409, + "learning_rate": 1.960152459037412e-06, + "loss": 0.0004, + "num_input_tokens_seen": 257837312, + "step": 119490 + }, + { + "epoch": 19.49347471451876, + "grad_norm": 0.00011865168926306069, + "learning_rate": 1.953860910709704e-06, + "loss": 0.0004, + "num_input_tokens_seen": 257847552, + "step": 119495 + }, + { + "epoch": 19.494290375203914, + "grad_norm": 0.002724696882069111, + "learning_rate": 1.9475794559188354e-06, + "loss": 0.0009, + "num_input_tokens_seen": 257858336, + "step": 119500 + }, + { + "epoch": 19.49510603588907, + "grad_norm": 0.013737517409026623, + "learning_rate": 1.9413080947920934e-06, + "loss": 0.0003, + "num_input_tokens_seen": 257868864, + "step": 119505 + }, + { + "epoch": 19.495921696574225, + "grad_norm": 0.03135950118303299, + "learning_rate": 1.9350468274565434e-06, + "loss": 0.0007, + "num_input_tokens_seen": 257878816, + "step": 119510 + }, + { + "epoch": 19.49673735725938, + "grad_norm": 0.0003455507685430348, + "learning_rate": 1.9287956540391395e-06, + "loss": 0.0008, + "num_input_tokens_seen": 257889056, + "step": 119515 + }, + { + "epoch": 19.497553017944536, + "grad_norm": 0.00019873857672791928, + "learning_rate": 1.9225545746665575e-06, + "loss": 0.0004, + "num_input_tokens_seen": 257899360, + "step": 119520 + }, + { + "epoch": 19.49836867862969, + "grad_norm": 0.03223171457648277, + "learning_rate": 1.9163235894651965e-06, + "loss": 0.0011, + "num_input_tokens_seen": 257911264, + "step": 119525 + }, + { + "epoch": 19.499184339314844, + "grad_norm": 0.004259428940713406, + "learning_rate": 1.9101026985614558e-06, + "loss": 0.0023, + "num_input_tokens_seen": 257922432, + "step": 119530 + }, + { + "epoch": 19.5, + "grad_norm": 0.0008316703024320304, + "learning_rate": 1.903891902081345e-06, + "loss": 0.0002, + "num_input_tokens_seen": 257932448, + "step": 119535 + }, + { + "epoch": 19.500815660685156, + "grad_norm": 0.0002092146605718881, + "learning_rate": 1.8976912001507084e-06, + "loss": 0.0007, + "num_input_tokens_seen": 257943776, + "step": 119540 + }, + { + "epoch": 19.50163132137031, + "grad_norm": 0.004607275128364563, + "learning_rate": 1.8915005928953344e-06, + "loss": 0.0004, + "num_input_tokens_seen": 257954880, + "step": 119545 + }, + { + "epoch": 19.502446982055464, + "grad_norm": 0.006937575060874224, + "learning_rate": 1.8853200804405113e-06, + "loss": 0.0102, + "num_input_tokens_seen": 257966208, + "step": 119550 + }, + { + "epoch": 19.50326264274062, + "grad_norm": 0.006508595775812864, + "learning_rate": 1.879149662911639e-06, + "loss": 0.0008, + "num_input_tokens_seen": 257975872, + "step": 119555 + }, + { + "epoch": 19.504078303425775, + "grad_norm": 0.0003008491767104715, + "learning_rate": 1.8729893404336728e-06, + "loss": 0.0058, + "num_input_tokens_seen": 257986272, + "step": 119560 + }, + { + "epoch": 19.50489396411093, + "grad_norm": 0.00021317979553714395, + "learning_rate": 1.8668391131315133e-06, + "loss": 0.0007, + "num_input_tokens_seen": 257996032, + "step": 119565 + }, + { + "epoch": 19.505709624796086, + "grad_norm": 0.0016288916813209653, + "learning_rate": 1.8606989811297824e-06, + "loss": 0.0001, + "num_input_tokens_seen": 258006976, + "step": 119570 + }, + { + "epoch": 19.50652528548124, + "grad_norm": 0.0006956443539820611, + "learning_rate": 1.8545689445528813e-06, + "loss": 0.0006, + "num_input_tokens_seen": 258017152, + "step": 119575 + }, + { + "epoch": 19.507340946166394, + "grad_norm": 0.0003117949818260968, + "learning_rate": 1.8484490035251544e-06, + "loss": 0.0016, + "num_input_tokens_seen": 258027232, + "step": 119580 + }, + { + "epoch": 19.50815660685155, + "grad_norm": 0.0008829758153297007, + "learning_rate": 1.842339158170503e-06, + "loss": 0.0003, + "num_input_tokens_seen": 258038560, + "step": 119585 + }, + { + "epoch": 19.508972267536706, + "grad_norm": 0.00022951370920054615, + "learning_rate": 1.8362394086128276e-06, + "loss": 0.0005, + "num_input_tokens_seen": 258049984, + "step": 119590 + }, + { + "epoch": 19.50978792822186, + "grad_norm": 0.014124314300715923, + "learning_rate": 1.8301497549757518e-06, + "loss": 0.001, + "num_input_tokens_seen": 258059680, + "step": 119595 + }, + { + "epoch": 19.510603588907014, + "grad_norm": 0.0030965576879680157, + "learning_rate": 1.8240701973826213e-06, + "loss": 0.0002, + "num_input_tokens_seen": 258071136, + "step": 119600 + }, + { + "epoch": 19.51141924959217, + "grad_norm": 0.0002585167239885777, + "learning_rate": 1.8180007359567263e-06, + "loss": 0.0004, + "num_input_tokens_seen": 258081312, + "step": 119605 + }, + { + "epoch": 19.512234910277325, + "grad_norm": 0.01583932153880596, + "learning_rate": 1.8119413708210243e-06, + "loss": 0.0006, + "num_input_tokens_seen": 258092384, + "step": 119610 + }, + { + "epoch": 19.51305057096248, + "grad_norm": 0.00013076342293061316, + "learning_rate": 1.8058921020983055e-06, + "loss": 0.0005, + "num_input_tokens_seen": 258103328, + "step": 119615 + }, + { + "epoch": 19.513866231647633, + "grad_norm": 0.0007007982349023223, + "learning_rate": 1.7998529299111944e-06, + "loss": 0.0002, + "num_input_tokens_seen": 258114464, + "step": 119620 + }, + { + "epoch": 19.51468189233279, + "grad_norm": 0.00018750074377749115, + "learning_rate": 1.7938238543820928e-06, + "loss": 0.0007, + "num_input_tokens_seen": 258124992, + "step": 119625 + }, + { + "epoch": 19.515497553017944, + "grad_norm": 9.54270944930613e-05, + "learning_rate": 1.7878048756331256e-06, + "loss": 0.0003, + "num_input_tokens_seen": 258135904, + "step": 119630 + }, + { + "epoch": 19.5163132137031, + "grad_norm": 0.0012814825167879462, + "learning_rate": 1.7817959937863615e-06, + "loss": 0.0038, + "num_input_tokens_seen": 258146848, + "step": 119635 + }, + { + "epoch": 19.517128874388256, + "grad_norm": 0.0013141214149072766, + "learning_rate": 1.7757972089635367e-06, + "loss": 0.0006, + "num_input_tokens_seen": 258157920, + "step": 119640 + }, + { + "epoch": 19.517944535073408, + "grad_norm": 0.0002455166250001639, + "learning_rate": 1.7698085212862203e-06, + "loss": 0.0036, + "num_input_tokens_seen": 258169280, + "step": 119645 + }, + { + "epoch": 19.518760195758563, + "grad_norm": 0.0035972970072180033, + "learning_rate": 1.76382993087576e-06, + "loss": 0.0003, + "num_input_tokens_seen": 258179488, + "step": 119650 + }, + { + "epoch": 19.51957585644372, + "grad_norm": 0.013142626732587814, + "learning_rate": 1.7578614378533365e-06, + "loss": 0.0011, + "num_input_tokens_seen": 258190272, + "step": 119655 + }, + { + "epoch": 19.520391517128875, + "grad_norm": 0.0001205108710564673, + "learning_rate": 1.751903042339964e-06, + "loss": 0.0002, + "num_input_tokens_seen": 258201280, + "step": 119660 + }, + { + "epoch": 19.52120717781403, + "grad_norm": 0.021634070202708244, + "learning_rate": 1.745954744456324e-06, + "loss": 0.0014, + "num_input_tokens_seen": 258211424, + "step": 119665 + }, + { + "epoch": 19.522022838499183, + "grad_norm": 0.010791029781103134, + "learning_rate": 1.7400165443229865e-06, + "loss": 0.0004, + "num_input_tokens_seen": 258222688, + "step": 119670 + }, + { + "epoch": 19.52283849918434, + "grad_norm": 0.0010952608427032828, + "learning_rate": 1.7340884420603e-06, + "loss": 0.0009, + "num_input_tokens_seen": 258232256, + "step": 119675 + }, + { + "epoch": 19.523654159869494, + "grad_norm": 0.0006848338525742292, + "learning_rate": 1.7281704377884454e-06, + "loss": 0.0002, + "num_input_tokens_seen": 258243936, + "step": 119680 + }, + { + "epoch": 19.52446982055465, + "grad_norm": 0.00024134786508511752, + "learning_rate": 1.7222625316272723e-06, + "loss": 0.0031, + "num_input_tokens_seen": 258252480, + "step": 119685 + }, + { + "epoch": 19.525285481239806, + "grad_norm": 0.0032356330193579197, + "learning_rate": 1.7163647236965728e-06, + "loss": 0.0011, + "num_input_tokens_seen": 258262976, + "step": 119690 + }, + { + "epoch": 19.526101141924958, + "grad_norm": 9.544688509777188e-05, + "learning_rate": 1.7104770141158631e-06, + "loss": 0.0001, + "num_input_tokens_seen": 258273312, + "step": 119695 + }, + { + "epoch": 19.526916802610113, + "grad_norm": 0.018587835133075714, + "learning_rate": 1.704599403004492e-06, + "loss": 0.0008, + "num_input_tokens_seen": 258284384, + "step": 119700 + }, + { + "epoch": 19.52773246329527, + "grad_norm": 0.04586613178253174, + "learning_rate": 1.6987318904814753e-06, + "loss": 0.0006, + "num_input_tokens_seen": 258295936, + "step": 119705 + }, + { + "epoch": 19.528548123980425, + "grad_norm": 0.00969522912055254, + "learning_rate": 1.6928744766658844e-06, + "loss": 0.0022, + "num_input_tokens_seen": 258307392, + "step": 119710 + }, + { + "epoch": 19.52936378466558, + "grad_norm": 0.06709425896406174, + "learning_rate": 1.687027161676291e-06, + "loss": 0.0023, + "num_input_tokens_seen": 258319936, + "step": 119715 + }, + { + "epoch": 19.530179445350733, + "grad_norm": 0.00017794195446185768, + "learning_rate": 1.6811899456312119e-06, + "loss": 0.0001, + "num_input_tokens_seen": 258331168, + "step": 119720 + }, + { + "epoch": 19.53099510603589, + "grad_norm": 0.0001635753724258393, + "learning_rate": 1.6753628286490518e-06, + "loss": 0.0005, + "num_input_tokens_seen": 258341472, + "step": 119725 + }, + { + "epoch": 19.531810766721044, + "grad_norm": 0.19339795410633087, + "learning_rate": 1.6695458108477724e-06, + "loss": 0.0045, + "num_input_tokens_seen": 258353504, + "step": 119730 + }, + { + "epoch": 19.5326264274062, + "grad_norm": 0.09403607249259949, + "learning_rate": 1.66373889234539e-06, + "loss": 0.0016, + "num_input_tokens_seen": 258364992, + "step": 119735 + }, + { + "epoch": 19.533442088091356, + "grad_norm": 9.41340476856567e-05, + "learning_rate": 1.6579420732594774e-06, + "loss": 0.0003, + "num_input_tokens_seen": 258375520, + "step": 119740 + }, + { + "epoch": 19.534257748776508, + "grad_norm": 0.012245571240782738, + "learning_rate": 1.6521553537075518e-06, + "loss": 0.0046, + "num_input_tokens_seen": 258384992, + "step": 119745 + }, + { + "epoch": 19.535073409461663, + "grad_norm": 0.00110302260145545, + "learning_rate": 1.646378733806908e-06, + "loss": 0.0007, + "num_input_tokens_seen": 258395648, + "step": 119750 + }, + { + "epoch": 19.53588907014682, + "grad_norm": 0.07286670804023743, + "learning_rate": 1.6406122136746193e-06, + "loss": 0.0006, + "num_input_tokens_seen": 258405248, + "step": 119755 + }, + { + "epoch": 19.536704730831975, + "grad_norm": 0.0011534657096490264, + "learning_rate": 1.634855793427481e-06, + "loss": 0.0003, + "num_input_tokens_seen": 258416192, + "step": 119760 + }, + { + "epoch": 19.53752039151713, + "grad_norm": 0.008723132312297821, + "learning_rate": 1.6291094731822886e-06, + "loss": 0.0035, + "num_input_tokens_seen": 258426272, + "step": 119765 + }, + { + "epoch": 19.538336052202283, + "grad_norm": 0.0004291546647436917, + "learning_rate": 1.6233732530553935e-06, + "loss": 0.0008, + "num_input_tokens_seen": 258437152, + "step": 119770 + }, + { + "epoch": 19.53915171288744, + "grad_norm": 0.0018804225837811828, + "learning_rate": 1.6176471331630915e-06, + "loss": 0.0011, + "num_input_tokens_seen": 258447776, + "step": 119775 + }, + { + "epoch": 19.539967373572594, + "grad_norm": 0.00047921977238729596, + "learning_rate": 1.6119311136213455e-06, + "loss": 0.0013, + "num_input_tokens_seen": 258458400, + "step": 119780 + }, + { + "epoch": 19.54078303425775, + "grad_norm": 0.05375475063920021, + "learning_rate": 1.6062251945461737e-06, + "loss": 0.0023, + "num_input_tokens_seen": 258467776, + "step": 119785 + }, + { + "epoch": 19.541598694942905, + "grad_norm": 8.574515959480777e-05, + "learning_rate": 1.6005293760530393e-06, + "loss": 0.0001, + "num_input_tokens_seen": 258477312, + "step": 119790 + }, + { + "epoch": 19.542414355628058, + "grad_norm": 0.00012147615052526817, + "learning_rate": 1.594843658257461e-06, + "loss": 0.0001, + "num_input_tokens_seen": 258487392, + "step": 119795 + }, + { + "epoch": 19.543230016313213, + "grad_norm": 0.000594177923630923, + "learning_rate": 1.5891680412746246e-06, + "loss": 0.0004, + "num_input_tokens_seen": 258497568, + "step": 119800 + }, + { + "epoch": 19.54404567699837, + "grad_norm": 9.804667206481099e-05, + "learning_rate": 1.5835025252196044e-06, + "loss": 0.0004, + "num_input_tokens_seen": 258507808, + "step": 119805 + }, + { + "epoch": 19.544861337683525, + "grad_norm": 0.040015898644924164, + "learning_rate": 1.5778471102071423e-06, + "loss": 0.0014, + "num_input_tokens_seen": 258519104, + "step": 119810 + }, + { + "epoch": 19.545676998368677, + "grad_norm": 0.001228774432092905, + "learning_rate": 1.572201796351924e-06, + "loss": 0.0022, + "num_input_tokens_seen": 258529824, + "step": 119815 + }, + { + "epoch": 19.546492659053833, + "grad_norm": 0.00015720717783551663, + "learning_rate": 1.5665665837683584e-06, + "loss": 0.0008, + "num_input_tokens_seen": 258541184, + "step": 119820 + }, + { + "epoch": 19.54730831973899, + "grad_norm": 0.003557945368811488, + "learning_rate": 1.5609414725706317e-06, + "loss": 0.0001, + "num_input_tokens_seen": 258552672, + "step": 119825 + }, + { + "epoch": 19.548123980424144, + "grad_norm": 0.0003948390076402575, + "learning_rate": 1.5553264628727082e-06, + "loss": 0.001, + "num_input_tokens_seen": 258564032, + "step": 119830 + }, + { + "epoch": 19.5489396411093, + "grad_norm": 0.07191413640975952, + "learning_rate": 1.5497215547884414e-06, + "loss": 0.0021, + "num_input_tokens_seen": 258575296, + "step": 119835 + }, + { + "epoch": 19.549755301794452, + "grad_norm": 0.003514338983222842, + "learning_rate": 1.544126748431407e-06, + "loss": 0.0006, + "num_input_tokens_seen": 258585568, + "step": 119840 + }, + { + "epoch": 19.550570962479608, + "grad_norm": 0.015387758612632751, + "learning_rate": 1.538542043914959e-06, + "loss": 0.0024, + "num_input_tokens_seen": 258596224, + "step": 119845 + }, + { + "epoch": 19.551386623164763, + "grad_norm": 0.0009225309477187693, + "learning_rate": 1.5329674413522843e-06, + "loss": 0.001, + "num_input_tokens_seen": 258606304, + "step": 119850 + }, + { + "epoch": 19.55220228384992, + "grad_norm": 0.11416856199502945, + "learning_rate": 1.527402940856404e-06, + "loss": 0.0055, + "num_input_tokens_seen": 258616960, + "step": 119855 + }, + { + "epoch": 19.553017944535075, + "grad_norm": 0.00018328806618228555, + "learning_rate": 1.5218485425400607e-06, + "loss": 0.0003, + "num_input_tokens_seen": 258628672, + "step": 119860 + }, + { + "epoch": 19.553833605220227, + "grad_norm": 0.0009122491464950144, + "learning_rate": 1.516304246515776e-06, + "loss": 0.0012, + "num_input_tokens_seen": 258640192, + "step": 119865 + }, + { + "epoch": 19.554649265905383, + "grad_norm": 0.00024961569579318166, + "learning_rate": 1.5107700528960156e-06, + "loss": 0.0002, + "num_input_tokens_seen": 258652288, + "step": 119870 + }, + { + "epoch": 19.55546492659054, + "grad_norm": 0.00016333845269400626, + "learning_rate": 1.505245961792856e-06, + "loss": 0.0001, + "num_input_tokens_seen": 258662880, + "step": 119875 + }, + { + "epoch": 19.556280587275694, + "grad_norm": 0.00013592213508673012, + "learning_rate": 1.4997319733182636e-06, + "loss": 0.0002, + "num_input_tokens_seen": 258674112, + "step": 119880 + }, + { + "epoch": 19.55709624796085, + "grad_norm": 0.02177118882536888, + "learning_rate": 1.494228087583982e-06, + "loss": 0.0043, + "num_input_tokens_seen": 258685152, + "step": 119885 + }, + { + "epoch": 19.557911908646002, + "grad_norm": 0.02415476180613041, + "learning_rate": 1.4887343047016444e-06, + "loss": 0.0009, + "num_input_tokens_seen": 258696288, + "step": 119890 + }, + { + "epoch": 19.558727569331158, + "grad_norm": 0.0001532110763946548, + "learning_rate": 1.4832506247824396e-06, + "loss": 0.0016, + "num_input_tokens_seen": 258707040, + "step": 119895 + }, + { + "epoch": 19.559543230016313, + "grad_norm": 0.002942811232060194, + "learning_rate": 1.4777770479376118e-06, + "loss": 0.0003, + "num_input_tokens_seen": 258717984, + "step": 119900 + }, + { + "epoch": 19.56035889070147, + "grad_norm": 0.004368784371763468, + "learning_rate": 1.472313574278017e-06, + "loss": 0.0004, + "num_input_tokens_seen": 258728960, + "step": 119905 + }, + { + "epoch": 19.561174551386625, + "grad_norm": 0.0002912446216214448, + "learning_rate": 1.4668602039144551e-06, + "loss": 0.0034, + "num_input_tokens_seen": 258739648, + "step": 119910 + }, + { + "epoch": 19.561990212071777, + "grad_norm": 0.00017460700473748147, + "learning_rate": 1.4614169369573382e-06, + "loss": 0.0002, + "num_input_tokens_seen": 258750560, + "step": 119915 + }, + { + "epoch": 19.562805872756933, + "grad_norm": 0.0007287058397196233, + "learning_rate": 1.4559837735171333e-06, + "loss": 0.0056, + "num_input_tokens_seen": 258760608, + "step": 119920 + }, + { + "epoch": 19.563621533442088, + "grad_norm": 0.001503968145698309, + "learning_rate": 1.450560713703808e-06, + "loss": 0.0004, + "num_input_tokens_seen": 258772736, + "step": 119925 + }, + { + "epoch": 19.564437194127244, + "grad_norm": 0.000822266039904207, + "learning_rate": 1.4451477576273298e-06, + "loss": 0.0001, + "num_input_tokens_seen": 258783008, + "step": 119930 + }, + { + "epoch": 19.5652528548124, + "grad_norm": 0.0001348108344245702, + "learning_rate": 1.4397449053973888e-06, + "loss": 0.0007, + "num_input_tokens_seen": 258793728, + "step": 119935 + }, + { + "epoch": 19.56606851549755, + "grad_norm": 7.797917351126671e-05, + "learning_rate": 1.4343521571235086e-06, + "loss": 0.0003, + "num_input_tokens_seen": 258804416, + "step": 119940 + }, + { + "epoch": 19.566884176182707, + "grad_norm": 0.00042448207386769354, + "learning_rate": 1.4289695129149349e-06, + "loss": 0.0005, + "num_input_tokens_seen": 258816320, + "step": 119945 + }, + { + "epoch": 19.567699836867863, + "grad_norm": 0.0010192388435825706, + "learning_rate": 1.423596972880803e-06, + "loss": 0.0002, + "num_input_tokens_seen": 258827616, + "step": 119950 + }, + { + "epoch": 19.56851549755302, + "grad_norm": 0.1093149334192276, + "learning_rate": 1.4182345371299699e-06, + "loss": 0.0014, + "num_input_tokens_seen": 258839456, + "step": 119955 + }, + { + "epoch": 19.569331158238175, + "grad_norm": 0.012513099238276482, + "learning_rate": 1.412882205771071e-06, + "loss": 0.0005, + "num_input_tokens_seen": 258850240, + "step": 119960 + }, + { + "epoch": 19.570146818923327, + "grad_norm": 0.036890849471092224, + "learning_rate": 1.4075399789126308e-06, + "loss": 0.0007, + "num_input_tokens_seen": 258861440, + "step": 119965 + }, + { + "epoch": 19.570962479608482, + "grad_norm": 0.00011577965778997168, + "learning_rate": 1.4022078566629515e-06, + "loss": 0.0001, + "num_input_tokens_seen": 258871520, + "step": 119970 + }, + { + "epoch": 19.571778140293638, + "grad_norm": 0.001227507134899497, + "learning_rate": 1.396885839130002e-06, + "loss": 0.0005, + "num_input_tokens_seen": 258882368, + "step": 119975 + }, + { + "epoch": 19.572593800978794, + "grad_norm": 0.0009670022409409285, + "learning_rate": 1.3915739264216964e-06, + "loss": 0.0017, + "num_input_tokens_seen": 258893152, + "step": 119980 + }, + { + "epoch": 19.57340946166395, + "grad_norm": 0.00010742289305198938, + "learning_rate": 1.3862721186456706e-06, + "loss": 0.0007, + "num_input_tokens_seen": 258903648, + "step": 119985 + }, + { + "epoch": 19.5742251223491, + "grad_norm": 0.0006557407905347645, + "learning_rate": 1.3809804159093386e-06, + "loss": 0.0003, + "num_input_tokens_seen": 258914432, + "step": 119990 + }, + { + "epoch": 19.575040783034257, + "grad_norm": 0.022727755829691887, + "learning_rate": 1.3756988183200037e-06, + "loss": 0.0029, + "num_input_tokens_seen": 258924736, + "step": 119995 + }, + { + "epoch": 19.575856443719413, + "grad_norm": 0.00023981585400179029, + "learning_rate": 1.3704273259847467e-06, + "loss": 0.0003, + "num_input_tokens_seen": 258935776, + "step": 120000 + }, + { + "epoch": 19.57667210440457, + "grad_norm": 0.0039139301516115665, + "learning_rate": 1.36516593901026e-06, + "loss": 0.0009, + "num_input_tokens_seen": 258948480, + "step": 120005 + }, + { + "epoch": 19.57748776508972, + "grad_norm": 0.011489437893033028, + "learning_rate": 1.3599146575032363e-06, + "loss": 0.0003, + "num_input_tokens_seen": 258959712, + "step": 120010 + }, + { + "epoch": 19.578303425774877, + "grad_norm": 0.0003109066456090659, + "learning_rate": 1.3546734815702012e-06, + "loss": 0.0068, + "num_input_tokens_seen": 258970432, + "step": 120015 + }, + { + "epoch": 19.579119086460032, + "grad_norm": 0.0014314486179500818, + "learning_rate": 1.349442411317181e-06, + "loss": 0.0006, + "num_input_tokens_seen": 258981792, + "step": 120020 + }, + { + "epoch": 19.579934747145188, + "grad_norm": 0.029344748705625534, + "learning_rate": 1.3442214468503688e-06, + "loss": 0.0013, + "num_input_tokens_seen": 258992192, + "step": 120025 + }, + { + "epoch": 19.580750407830344, + "grad_norm": 0.0007600208627991378, + "learning_rate": 1.3390105882754577e-06, + "loss": 0.0017, + "num_input_tokens_seen": 259003168, + "step": 120030 + }, + { + "epoch": 19.581566068515496, + "grad_norm": 0.0002573929086793214, + "learning_rate": 1.333809835698141e-06, + "loss": 0.0004, + "num_input_tokens_seen": 259013856, + "step": 120035 + }, + { + "epoch": 19.58238172920065, + "grad_norm": 0.00020731140102725476, + "learning_rate": 1.3286191892237231e-06, + "loss": 0.0015, + "num_input_tokens_seen": 259024960, + "step": 120040 + }, + { + "epoch": 19.583197389885807, + "grad_norm": 0.011729683727025986, + "learning_rate": 1.323438648957509e-06, + "loss": 0.0003, + "num_input_tokens_seen": 259036096, + "step": 120045 + }, + { + "epoch": 19.584013050570963, + "grad_norm": 0.0003391050559002906, + "learning_rate": 1.318268215004359e-06, + "loss": 0.0352, + "num_input_tokens_seen": 259046432, + "step": 120050 + }, + { + "epoch": 19.58482871125612, + "grad_norm": 0.000766593380831182, + "learning_rate": 1.3131078874691337e-06, + "loss": 0.0009, + "num_input_tokens_seen": 259056640, + "step": 120055 + }, + { + "epoch": 19.58564437194127, + "grad_norm": 0.0025415606796741486, + "learning_rate": 1.3079576664564163e-06, + "loss": 0.0005, + "num_input_tokens_seen": 259068800, + "step": 120060 + }, + { + "epoch": 19.586460032626427, + "grad_norm": 0.0012644734233617783, + "learning_rate": 1.302817552070623e-06, + "loss": 0.0003, + "num_input_tokens_seen": 259080672, + "step": 120065 + }, + { + "epoch": 19.587275693311582, + "grad_norm": 0.002381799276918173, + "learning_rate": 1.297687544415782e-06, + "loss": 0.0015, + "num_input_tokens_seen": 259091968, + "step": 120070 + }, + { + "epoch": 19.588091353996738, + "grad_norm": 0.007190469652414322, + "learning_rate": 1.292567643596032e-06, + "loss": 0.0005, + "num_input_tokens_seen": 259103328, + "step": 120075 + }, + { + "epoch": 19.588907014681894, + "grad_norm": 0.0001733337267069146, + "learning_rate": 1.2874578497150125e-06, + "loss": 0.0008, + "num_input_tokens_seen": 259114528, + "step": 120080 + }, + { + "epoch": 19.589722675367046, + "grad_norm": 0.0020400425419211388, + "learning_rate": 1.282358162876307e-06, + "loss": 0.0003, + "num_input_tokens_seen": 259125664, + "step": 120085 + }, + { + "epoch": 19.5905383360522, + "grad_norm": 0.0001142189503298141, + "learning_rate": 1.277268583183333e-06, + "loss": 0.0002, + "num_input_tokens_seen": 259136768, + "step": 120090 + }, + { + "epoch": 19.591353996737357, + "grad_norm": 0.2502383589744568, + "learning_rate": 1.2721891107391192e-06, + "loss": 0.0031, + "num_input_tokens_seen": 259148064, + "step": 120095 + }, + { + "epoch": 19.592169657422513, + "grad_norm": 0.00032056582858785987, + "learning_rate": 1.2671197456467497e-06, + "loss": 0.0004, + "num_input_tokens_seen": 259158144, + "step": 120100 + }, + { + "epoch": 19.59298531810767, + "grad_norm": 0.021195361390709877, + "learning_rate": 1.2620604880088093e-06, + "loss": 0.0076, + "num_input_tokens_seen": 259169568, + "step": 120105 + }, + { + "epoch": 19.59380097879282, + "grad_norm": 0.014732168056070805, + "learning_rate": 1.2570113379279936e-06, + "loss": 0.0007, + "num_input_tokens_seen": 259178400, + "step": 120110 + }, + { + "epoch": 19.594616639477977, + "grad_norm": 0.00016474338190164417, + "learning_rate": 1.2519722955064982e-06, + "loss": 0.0002, + "num_input_tokens_seen": 259189696, + "step": 120115 + }, + { + "epoch": 19.595432300163132, + "grad_norm": 0.0025344607420265675, + "learning_rate": 1.2469433608464642e-06, + "loss": 0.0028, + "num_input_tokens_seen": 259201024, + "step": 120120 + }, + { + "epoch": 19.596247960848288, + "grad_norm": 0.00037778160185553133, + "learning_rate": 1.2419245340498652e-06, + "loss": 0.0001, + "num_input_tokens_seen": 259210944, + "step": 120125 + }, + { + "epoch": 19.597063621533444, + "grad_norm": 0.01112985797226429, + "learning_rate": 1.236915815218398e-06, + "loss": 0.0003, + "num_input_tokens_seen": 259221888, + "step": 120130 + }, + { + "epoch": 19.597879282218596, + "grad_norm": 0.02651910111308098, + "learning_rate": 1.2319172044535365e-06, + "loss": 0.0036, + "num_input_tokens_seen": 259232992, + "step": 120135 + }, + { + "epoch": 19.59869494290375, + "grad_norm": 0.049628980457782745, + "learning_rate": 1.2269287018565888e-06, + "loss": 0.0007, + "num_input_tokens_seen": 259242912, + "step": 120140 + }, + { + "epoch": 19.599510603588907, + "grad_norm": 0.0018081717425957322, + "learning_rate": 1.2219503075286963e-06, + "loss": 0.0005, + "num_input_tokens_seen": 259253568, + "step": 120145 + }, + { + "epoch": 19.600326264274063, + "grad_norm": 0.001607294543646276, + "learning_rate": 1.2169820215707228e-06, + "loss": 0.0001, + "num_input_tokens_seen": 259264640, + "step": 120150 + }, + { + "epoch": 19.601141924959215, + "grad_norm": 0.03617676720023155, + "learning_rate": 1.2120238440833653e-06, + "loss": 0.0005, + "num_input_tokens_seen": 259276448, + "step": 120155 + }, + { + "epoch": 19.60195758564437, + "grad_norm": 0.06851811707019806, + "learning_rate": 1.207075775167099e-06, + "loss": 0.0013, + "num_input_tokens_seen": 259288064, + "step": 120160 + }, + { + "epoch": 19.602773246329527, + "grad_norm": 0.0009358471143059433, + "learning_rate": 1.2021378149221773e-06, + "loss": 0.0007, + "num_input_tokens_seen": 259298880, + "step": 120165 + }, + { + "epoch": 19.603588907014682, + "grad_norm": 0.00011552513751666993, + "learning_rate": 1.1972099634487422e-06, + "loss": 0.0006, + "num_input_tokens_seen": 259309888, + "step": 120170 + }, + { + "epoch": 19.604404567699838, + "grad_norm": 0.00029978991369716823, + "learning_rate": 1.1922922208466026e-06, + "loss": 0.0001, + "num_input_tokens_seen": 259320288, + "step": 120175 + }, + { + "epoch": 19.605220228384994, + "grad_norm": 0.013405055738985538, + "learning_rate": 1.1873845872154565e-06, + "loss": 0.0006, + "num_input_tokens_seen": 259329888, + "step": 120180 + }, + { + "epoch": 19.606035889070146, + "grad_norm": 0.02005852572619915, + "learning_rate": 1.1824870626547247e-06, + "loss": 0.0011, + "num_input_tokens_seen": 259342176, + "step": 120185 + }, + { + "epoch": 19.6068515497553, + "grad_norm": 0.002242861781269312, + "learning_rate": 1.1775996472637163e-06, + "loss": 0.0005, + "num_input_tokens_seen": 259352768, + "step": 120190 + }, + { + "epoch": 19.607667210440457, + "grad_norm": 0.00048186391359195113, + "learning_rate": 1.1727223411414078e-06, + "loss": 0.0016, + "num_input_tokens_seen": 259363552, + "step": 120195 + }, + { + "epoch": 19.608482871125613, + "grad_norm": 0.006878603715449572, + "learning_rate": 1.1678551443867203e-06, + "loss": 0.0003, + "num_input_tokens_seen": 259374688, + "step": 120200 + }, + { + "epoch": 19.609298531810765, + "grad_norm": 0.0018231449648737907, + "learning_rate": 1.1629980570982967e-06, + "loss": 0.0001, + "num_input_tokens_seen": 259385856, + "step": 120205 + }, + { + "epoch": 19.61011419249592, + "grad_norm": 0.006002233363687992, + "learning_rate": 1.1581510793745032e-06, + "loss": 0.0027, + "num_input_tokens_seen": 259396640, + "step": 120210 + }, + { + "epoch": 19.610929853181077, + "grad_norm": 0.07118792831897736, + "learning_rate": 1.153314211313594e-06, + "loss": 0.0022, + "num_input_tokens_seen": 259407904, + "step": 120215 + }, + { + "epoch": 19.611745513866232, + "grad_norm": 0.00013495850726030767, + "learning_rate": 1.1484874530136025e-06, + "loss": 0.0013, + "num_input_tokens_seen": 259419968, + "step": 120220 + }, + { + "epoch": 19.612561174551388, + "grad_norm": 0.12293318659067154, + "learning_rate": 1.1436708045723388e-06, + "loss": 0.0229, + "num_input_tokens_seen": 259430272, + "step": 120225 + }, + { + "epoch": 19.61337683523654, + "grad_norm": 0.00012957165017724037, + "learning_rate": 1.1388642660875025e-06, + "loss": 0.0001, + "num_input_tokens_seen": 259441824, + "step": 120230 + }, + { + "epoch": 19.614192495921696, + "grad_norm": 0.00018594680295791477, + "learning_rate": 1.1340678376563495e-06, + "loss": 0.0003, + "num_input_tokens_seen": 259450848, + "step": 120235 + }, + { + "epoch": 19.61500815660685, + "grad_norm": 0.006359973456710577, + "learning_rate": 1.1292815193761907e-06, + "loss": 0.0041, + "num_input_tokens_seen": 259462912, + "step": 120240 + }, + { + "epoch": 19.615823817292007, + "grad_norm": 0.001045146374963224, + "learning_rate": 1.1245053113440596e-06, + "loss": 0.001, + "num_input_tokens_seen": 259473536, + "step": 120245 + }, + { + "epoch": 19.616639477977163, + "grad_norm": 0.0006417073891498148, + "learning_rate": 1.1197392136566565e-06, + "loss": 0.0011, + "num_input_tokens_seen": 259483744, + "step": 120250 + }, + { + "epoch": 19.617455138662315, + "grad_norm": 0.00016656941443216056, + "learning_rate": 1.114983226410571e-06, + "loss": 0.0004, + "num_input_tokens_seen": 259494752, + "step": 120255 + }, + { + "epoch": 19.61827079934747, + "grad_norm": 0.00039735998143441975, + "learning_rate": 1.110237349702281e-06, + "loss": 0.0001, + "num_input_tokens_seen": 259506240, + "step": 120260 + }, + { + "epoch": 19.619086460032626, + "grad_norm": 0.00011876939242938533, + "learning_rate": 1.1055015836279326e-06, + "loss": 0.0006, + "num_input_tokens_seen": 259517536, + "step": 120265 + }, + { + "epoch": 19.619902120717782, + "grad_norm": 0.006379480939358473, + "learning_rate": 1.1007759282834484e-06, + "loss": 0.0009, + "num_input_tokens_seen": 259529024, + "step": 120270 + }, + { + "epoch": 19.620717781402938, + "grad_norm": 0.004534132778644562, + "learning_rate": 1.096060383764641e-06, + "loss": 0.0004, + "num_input_tokens_seen": 259539936, + "step": 120275 + }, + { + "epoch": 19.62153344208809, + "grad_norm": 0.0015257432824000716, + "learning_rate": 1.0913549501671004e-06, + "loss": 0.0002, + "num_input_tokens_seen": 259551456, + "step": 120280 + }, + { + "epoch": 19.622349102773246, + "grad_norm": 0.0018679461209103465, + "learning_rate": 1.0866596275861395e-06, + "loss": 0.0003, + "num_input_tokens_seen": 259563328, + "step": 120285 + }, + { + "epoch": 19.6231647634584, + "grad_norm": 0.003943427465856075, + "learning_rate": 1.0819744161169597e-06, + "loss": 0.0003, + "num_input_tokens_seen": 259573216, + "step": 120290 + }, + { + "epoch": 19.623980424143557, + "grad_norm": 0.04213641583919525, + "learning_rate": 1.0772993158544297e-06, + "loss": 0.0012, + "num_input_tokens_seen": 259583776, + "step": 120295 + }, + { + "epoch": 19.624796084828713, + "grad_norm": 0.0002173772663809359, + "learning_rate": 1.072634326893418e-06, + "loss": 0.0145, + "num_input_tokens_seen": 259594560, + "step": 120300 + }, + { + "epoch": 19.625611745513865, + "grad_norm": 0.0037545945961028337, + "learning_rate": 1.0679794493284045e-06, + "loss": 0.0003, + "num_input_tokens_seen": 259604512, + "step": 120305 + }, + { + "epoch": 19.62642740619902, + "grad_norm": 0.025840003043413162, + "learning_rate": 1.0633346832537026e-06, + "loss": 0.0005, + "num_input_tokens_seen": 259614848, + "step": 120310 + }, + { + "epoch": 19.627243066884176, + "grad_norm": 0.00027903332374989986, + "learning_rate": 1.0587000287634596e-06, + "loss": 0.0026, + "num_input_tokens_seen": 259624800, + "step": 120315 + }, + { + "epoch": 19.628058727569332, + "grad_norm": 0.0034105891827493906, + "learning_rate": 1.0540754859516554e-06, + "loss": 0.0004, + "num_input_tokens_seen": 259635200, + "step": 120320 + }, + { + "epoch": 19.628874388254488, + "grad_norm": 0.0017770391423255205, + "learning_rate": 1.0494610549119377e-06, + "loss": 0.0004, + "num_input_tokens_seen": 259646880, + "step": 120325 + }, + { + "epoch": 19.62969004893964, + "grad_norm": 0.0008589364588260651, + "learning_rate": 1.0448567357378424e-06, + "loss": 0.0001, + "num_input_tokens_seen": 259658048, + "step": 120330 + }, + { + "epoch": 19.630505709624796, + "grad_norm": 0.0001961359812412411, + "learning_rate": 1.0402625285227396e-06, + "loss": 0.0002, + "num_input_tokens_seen": 259668576, + "step": 120335 + }, + { + "epoch": 19.63132137030995, + "grad_norm": 0.005150578450411558, + "learning_rate": 1.0356784333596658e-06, + "loss": 0.0021, + "num_input_tokens_seen": 259679392, + "step": 120340 + }, + { + "epoch": 19.632137030995107, + "grad_norm": 0.0018754310440272093, + "learning_rate": 1.0311044503415468e-06, + "loss": 0.0002, + "num_input_tokens_seen": 259689440, + "step": 120345 + }, + { + "epoch": 19.63295269168026, + "grad_norm": 0.0012420470593497157, + "learning_rate": 1.026540579561086e-06, + "loss": 0.0001, + "num_input_tokens_seen": 259698784, + "step": 120350 + }, + { + "epoch": 19.633768352365415, + "grad_norm": 0.00038859809865243733, + "learning_rate": 1.0219868211108208e-06, + "loss": 0.1471, + "num_input_tokens_seen": 259710304, + "step": 120355 + }, + { + "epoch": 19.63458401305057, + "grad_norm": 0.00021763973927590996, + "learning_rate": 1.0174431750828993e-06, + "loss": 0.0004, + "num_input_tokens_seen": 259720992, + "step": 120360 + }, + { + "epoch": 19.635399673735726, + "grad_norm": 0.0004820450267288834, + "learning_rate": 1.0129096415695816e-06, + "loss": 0.0004, + "num_input_tokens_seen": 259731712, + "step": 120365 + }, + { + "epoch": 19.636215334420882, + "grad_norm": 0.0008670609095133841, + "learning_rate": 1.008386220662627e-06, + "loss": 0.0005, + "num_input_tokens_seen": 259741600, + "step": 120370 + }, + { + "epoch": 19.637030995106034, + "grad_norm": 0.00019783554307650775, + "learning_rate": 1.0038729124537405e-06, + "loss": 0.0002, + "num_input_tokens_seen": 259752608, + "step": 120375 + }, + { + "epoch": 19.63784665579119, + "grad_norm": 0.0013491013087332249, + "learning_rate": 9.993697170343485e-07, + "loss": 0.011, + "num_input_tokens_seen": 259762176, + "step": 120380 + }, + { + "epoch": 19.638662316476346, + "grad_norm": 0.023857267573475838, + "learning_rate": 9.948766344958227e-07, + "loss": 0.0005, + "num_input_tokens_seen": 259772928, + "step": 120385 + }, + { + "epoch": 19.6394779771615, + "grad_norm": 0.00020221401064191014, + "learning_rate": 9.9039366492909e-07, + "loss": 0.0002, + "num_input_tokens_seen": 259783808, + "step": 120390 + }, + { + "epoch": 19.640293637846657, + "grad_norm": 0.00046457070857286453, + "learning_rate": 9.859208084251337e-07, + "loss": 0.0028, + "num_input_tokens_seen": 259793952, + "step": 120395 + }, + { + "epoch": 19.64110929853181, + "grad_norm": 0.002529977820813656, + "learning_rate": 9.81458065074492e-07, + "loss": 0.0017, + "num_input_tokens_seen": 259804672, + "step": 120400 + }, + { + "epoch": 19.641924959216965, + "grad_norm": 0.07785668969154358, + "learning_rate": 9.770054349677037e-07, + "loss": 0.0027, + "num_input_tokens_seen": 259815456, + "step": 120405 + }, + { + "epoch": 19.64274061990212, + "grad_norm": 0.0005841149832122028, + "learning_rate": 9.725629181949192e-07, + "loss": 0.0001, + "num_input_tokens_seen": 259825216, + "step": 120410 + }, + { + "epoch": 19.643556280587276, + "grad_norm": 0.004167287144809961, + "learning_rate": 9.681305148462328e-07, + "loss": 0.0004, + "num_input_tokens_seen": 259836576, + "step": 120415 + }, + { + "epoch": 19.644371941272432, + "grad_norm": 0.000581933360081166, + "learning_rate": 9.63708225011406e-07, + "loss": 0.0013, + "num_input_tokens_seen": 259846592, + "step": 120420 + }, + { + "epoch": 19.645187601957584, + "grad_norm": 0.00800230074673891, + "learning_rate": 9.59296048780145e-07, + "loss": 0.0003, + "num_input_tokens_seen": 259857728, + "step": 120425 + }, + { + "epoch": 19.64600326264274, + "grad_norm": 0.018968792632222176, + "learning_rate": 9.54893986241767e-07, + "loss": 0.0006, + "num_input_tokens_seen": 259867712, + "step": 120430 + }, + { + "epoch": 19.646818923327896, + "grad_norm": 0.00036240601912140846, + "learning_rate": 9.505020374855899e-07, + "loss": 0.0052, + "num_input_tokens_seen": 259879872, + "step": 120435 + }, + { + "epoch": 19.64763458401305, + "grad_norm": 0.000716322276275605, + "learning_rate": 9.461202026005978e-07, + "loss": 0.0026, + "num_input_tokens_seen": 259891072, + "step": 120440 + }, + { + "epoch": 19.648450244698207, + "grad_norm": 0.0001823025377234444, + "learning_rate": 9.417484816755528e-07, + "loss": 0.0007, + "num_input_tokens_seen": 259901440, + "step": 120445 + }, + { + "epoch": 19.64926590538336, + "grad_norm": 0.00032433151500299573, + "learning_rate": 9.37386874799051e-07, + "loss": 0.0012, + "num_input_tokens_seen": 259911968, + "step": 120450 + }, + { + "epoch": 19.650081566068515, + "grad_norm": 0.0001419754116795957, + "learning_rate": 9.330353820595217e-07, + "loss": 0.0022, + "num_input_tokens_seen": 259922496, + "step": 120455 + }, + { + "epoch": 19.65089722675367, + "grad_norm": 0.00014117614773567766, + "learning_rate": 9.286940035451718e-07, + "loss": 0.0004, + "num_input_tokens_seen": 259934272, + "step": 120460 + }, + { + "epoch": 19.651712887438826, + "grad_norm": 0.020046623423695564, + "learning_rate": 9.243627393439313e-07, + "loss": 0.0268, + "num_input_tokens_seen": 259944608, + "step": 120465 + }, + { + "epoch": 19.652528548123982, + "grad_norm": 0.0720062255859375, + "learning_rate": 9.200415895436187e-07, + "loss": 0.0007, + "num_input_tokens_seen": 259954368, + "step": 120470 + }, + { + "epoch": 19.653344208809134, + "grad_norm": 0.0010253662476316094, + "learning_rate": 9.157305542317751e-07, + "loss": 0.0019, + "num_input_tokens_seen": 259965568, + "step": 120475 + }, + { + "epoch": 19.65415986949429, + "grad_norm": 0.001307834405452013, + "learning_rate": 9.11429633495775e-07, + "loss": 0.0007, + "num_input_tokens_seen": 259975392, + "step": 120480 + }, + { + "epoch": 19.654975530179446, + "grad_norm": 0.00032342091435566545, + "learning_rate": 9.071388274228264e-07, + "loss": 0.0023, + "num_input_tokens_seen": 259986144, + "step": 120485 + }, + { + "epoch": 19.6557911908646, + "grad_norm": 0.00013705079618375748, + "learning_rate": 9.028581360998045e-07, + "loss": 0.0016, + "num_input_tokens_seen": 259996640, + "step": 120490 + }, + { + "epoch": 19.656606851549757, + "grad_norm": 0.00015725781850051135, + "learning_rate": 8.985875596135285e-07, + "loss": 0.0002, + "num_input_tokens_seen": 260007904, + "step": 120495 + }, + { + "epoch": 19.65742251223491, + "grad_norm": 0.0020698842126876116, + "learning_rate": 8.943270980505957e-07, + "loss": 0.0001, + "num_input_tokens_seen": 260017952, + "step": 120500 + }, + { + "epoch": 19.658238172920065, + "grad_norm": 0.00031230729655362666, + "learning_rate": 8.900767514972152e-07, + "loss": 0.0001, + "num_input_tokens_seen": 260028544, + "step": 120505 + }, + { + "epoch": 19.65905383360522, + "grad_norm": 0.00013245023728813976, + "learning_rate": 8.858365200395957e-07, + "loss": 0.0003, + "num_input_tokens_seen": 260037760, + "step": 120510 + }, + { + "epoch": 19.659869494290376, + "grad_norm": 0.0002526281459722668, + "learning_rate": 8.816064037636684e-07, + "loss": 0.0005, + "num_input_tokens_seen": 260047968, + "step": 120515 + }, + { + "epoch": 19.660685154975532, + "grad_norm": 0.00011055797222070396, + "learning_rate": 8.773864027551981e-07, + "loss": 0.0011, + "num_input_tokens_seen": 260059264, + "step": 120520 + }, + { + "epoch": 19.661500815660684, + "grad_norm": 0.0005761940265074372, + "learning_rate": 8.73176517099672e-07, + "loss": 0.001, + "num_input_tokens_seen": 260070176, + "step": 120525 + }, + { + "epoch": 19.66231647634584, + "grad_norm": 0.0002128307824023068, + "learning_rate": 8.689767468824105e-07, + "loss": 0.0058, + "num_input_tokens_seen": 260081120, + "step": 120530 + }, + { + "epoch": 19.663132137030995, + "grad_norm": 0.055983614176511765, + "learning_rate": 8.647870921885126e-07, + "loss": 0.0017, + "num_input_tokens_seen": 260092576, + "step": 120535 + }, + { + "epoch": 19.66394779771615, + "grad_norm": 0.0010357380378991365, + "learning_rate": 8.606075531029101e-07, + "loss": 0.0001, + "num_input_tokens_seen": 260103392, + "step": 120540 + }, + { + "epoch": 19.664763458401303, + "grad_norm": 0.00012491563393268734, + "learning_rate": 8.564381297102575e-07, + "loss": 0.0015, + "num_input_tokens_seen": 260114048, + "step": 120545 + }, + { + "epoch": 19.66557911908646, + "grad_norm": 0.00020337002933956683, + "learning_rate": 8.522788220951538e-07, + "loss": 0.0013, + "num_input_tokens_seen": 260125152, + "step": 120550 + }, + { + "epoch": 19.666394779771615, + "grad_norm": 0.006340001709759235, + "learning_rate": 8.481296303418096e-07, + "loss": 0.0007, + "num_input_tokens_seen": 260136192, + "step": 120555 + }, + { + "epoch": 19.66721044045677, + "grad_norm": 0.0016682266723364592, + "learning_rate": 8.439905545343796e-07, + "loss": 0.0002, + "num_input_tokens_seen": 260147424, + "step": 120560 + }, + { + "epoch": 19.668026101141926, + "grad_norm": 0.00046493191621266305, + "learning_rate": 8.398615947566302e-07, + "loss": 0.0004, + "num_input_tokens_seen": 260156448, + "step": 120565 + }, + { + "epoch": 19.66884176182708, + "grad_norm": 0.00019686552695930004, + "learning_rate": 8.357427510923832e-07, + "loss": 0.0024, + "num_input_tokens_seen": 260165408, + "step": 120570 + }, + { + "epoch": 19.669657422512234, + "grad_norm": 0.052009306848049164, + "learning_rate": 8.316340236249609e-07, + "loss": 0.0007, + "num_input_tokens_seen": 260176224, + "step": 120575 + }, + { + "epoch": 19.67047308319739, + "grad_norm": 0.0052222260273993015, + "learning_rate": 8.275354124377965e-07, + "loss": 0.0009, + "num_input_tokens_seen": 260187008, + "step": 120580 + }, + { + "epoch": 19.671288743882545, + "grad_norm": 0.00020554542425088584, + "learning_rate": 8.234469176138238e-07, + "loss": 0.0002, + "num_input_tokens_seen": 260197440, + "step": 120585 + }, + { + "epoch": 19.6721044045677, + "grad_norm": 0.00016549527936149389, + "learning_rate": 8.193685392359762e-07, + "loss": 0.0003, + "num_input_tokens_seen": 260208160, + "step": 120590 + }, + { + "epoch": 19.672920065252853, + "grad_norm": 0.008693763986229897, + "learning_rate": 8.153002773868546e-07, + "loss": 0.0005, + "num_input_tokens_seen": 260219072, + "step": 120595 + }, + { + "epoch": 19.67373572593801, + "grad_norm": 0.00015896520926617086, + "learning_rate": 8.112421321489483e-07, + "loss": 0.1232, + "num_input_tokens_seen": 260228320, + "step": 120600 + }, + { + "epoch": 19.674551386623165, + "grad_norm": 0.00011750786507036537, + "learning_rate": 8.07194103604525e-07, + "loss": 0.0004, + "num_input_tokens_seen": 260240576, + "step": 120605 + }, + { + "epoch": 19.67536704730832, + "grad_norm": 0.013163416646420956, + "learning_rate": 8.03156191835519e-07, + "loss": 0.0012, + "num_input_tokens_seen": 260250144, + "step": 120610 + }, + { + "epoch": 19.676182707993476, + "grad_norm": 0.00028841852326877415, + "learning_rate": 7.99128396923865e-07, + "loss": 0.0003, + "num_input_tokens_seen": 260261248, + "step": 120615 + }, + { + "epoch": 19.67699836867863, + "grad_norm": 0.0005565843312069774, + "learning_rate": 7.951107189511641e-07, + "loss": 0.0033, + "num_input_tokens_seen": 260271232, + "step": 120620 + }, + { + "epoch": 19.677814029363784, + "grad_norm": 0.007324339356273413, + "learning_rate": 7.91103157998796e-07, + "loss": 0.0002, + "num_input_tokens_seen": 260282656, + "step": 120625 + }, + { + "epoch": 19.67862969004894, + "grad_norm": 0.031712856143713, + "learning_rate": 7.871057141480287e-07, + "loss": 0.0007, + "num_input_tokens_seen": 260293408, + "step": 120630 + }, + { + "epoch": 19.679445350734095, + "grad_norm": 0.00021701065998058766, + "learning_rate": 7.831183874798531e-07, + "loss": 0.0009, + "num_input_tokens_seen": 260305856, + "step": 120635 + }, + { + "epoch": 19.68026101141925, + "grad_norm": 0.0019486701348796487, + "learning_rate": 7.791411780750935e-07, + "loss": 0.0001, + "num_input_tokens_seen": 260315648, + "step": 120640 + }, + { + "epoch": 19.681076672104403, + "grad_norm": 0.08165998756885529, + "learning_rate": 7.751740860143519e-07, + "loss": 0.0018, + "num_input_tokens_seen": 260325408, + "step": 120645 + }, + { + "epoch": 19.68189233278956, + "grad_norm": 0.00022842299949843436, + "learning_rate": 7.712171113780086e-07, + "loss": 0.0002, + "num_input_tokens_seen": 260335360, + "step": 120650 + }, + { + "epoch": 19.682707993474715, + "grad_norm": 0.0058161853812634945, + "learning_rate": 7.672702542462773e-07, + "loss": 0.0005, + "num_input_tokens_seen": 260345888, + "step": 120655 + }, + { + "epoch": 19.68352365415987, + "grad_norm": 0.0004813872801605612, + "learning_rate": 7.633335146991493e-07, + "loss": 0.0009, + "num_input_tokens_seen": 260356000, + "step": 120660 + }, + { + "epoch": 19.684339314845026, + "grad_norm": 0.0003719656087923795, + "learning_rate": 7.594068928163944e-07, + "loss": 0.0004, + "num_input_tokens_seen": 260366624, + "step": 120665 + }, + { + "epoch": 19.68515497553018, + "grad_norm": 0.10317173600196838, + "learning_rate": 7.554903886775599e-07, + "loss": 0.0021, + "num_input_tokens_seen": 260377280, + "step": 120670 + }, + { + "epoch": 19.685970636215334, + "grad_norm": 0.006149946711957455, + "learning_rate": 7.515840023620824e-07, + "loss": 0.0004, + "num_input_tokens_seen": 260387520, + "step": 120675 + }, + { + "epoch": 19.68678629690049, + "grad_norm": 0.0005045794532634318, + "learning_rate": 7.476877339490651e-07, + "loss": 0.0002, + "num_input_tokens_seen": 260399264, + "step": 120680 + }, + { + "epoch": 19.687601957585645, + "grad_norm": 0.005421181209385395, + "learning_rate": 7.438015835175005e-07, + "loss": 0.0026, + "num_input_tokens_seen": 260410144, + "step": 120685 + }, + { + "epoch": 19.6884176182708, + "grad_norm": 0.002164709148928523, + "learning_rate": 7.399255511461589e-07, + "loss": 0.0016, + "num_input_tokens_seen": 260421728, + "step": 120690 + }, + { + "epoch": 19.689233278955953, + "grad_norm": 0.0005270749679766595, + "learning_rate": 7.360596369135886e-07, + "loss": 0.0007, + "num_input_tokens_seen": 260433056, + "step": 120695 + }, + { + "epoch": 19.69004893964111, + "grad_norm": 0.00015023611194919795, + "learning_rate": 7.322038408981157e-07, + "loss": 0.0021, + "num_input_tokens_seen": 260443328, + "step": 120700 + }, + { + "epoch": 19.690864600326265, + "grad_norm": 0.0002106478641508147, + "learning_rate": 7.283581631779002e-07, + "loss": 0.0209, + "num_input_tokens_seen": 260454816, + "step": 120705 + }, + { + "epoch": 19.69168026101142, + "grad_norm": 0.005718587897717953, + "learning_rate": 7.245226038308794e-07, + "loss": 0.0065, + "num_input_tokens_seen": 260466208, + "step": 120710 + }, + { + "epoch": 19.692495921696576, + "grad_norm": 0.00015601520135533065, + "learning_rate": 7.206971629348246e-07, + "loss": 0.0005, + "num_input_tokens_seen": 260477920, + "step": 120715 + }, + { + "epoch": 19.693311582381728, + "grad_norm": 0.011997099965810776, + "learning_rate": 7.16881840567174e-07, + "loss": 0.0007, + "num_input_tokens_seen": 260490592, + "step": 120720 + }, + { + "epoch": 19.694127243066884, + "grad_norm": 0.0009030011715367436, + "learning_rate": 7.130766368053099e-07, + "loss": 0.0002, + "num_input_tokens_seen": 260501312, + "step": 120725 + }, + { + "epoch": 19.69494290375204, + "grad_norm": 0.021783960983157158, + "learning_rate": 7.092815517263373e-07, + "loss": 0.0004, + "num_input_tokens_seen": 260511328, + "step": 120730 + }, + { + "epoch": 19.695758564437195, + "grad_norm": 0.014522673562169075, + "learning_rate": 7.054965854071948e-07, + "loss": 0.0011, + "num_input_tokens_seen": 260521728, + "step": 120735 + }, + { + "epoch": 19.696574225122347, + "grad_norm": 0.00040766363963484764, + "learning_rate": 7.017217379245433e-07, + "loss": 0.0005, + "num_input_tokens_seen": 260531136, + "step": 120740 + }, + { + "epoch": 19.697389885807503, + "grad_norm": 0.0012313512852415442, + "learning_rate": 6.979570093548771e-07, + "loss": 0.0022, + "num_input_tokens_seen": 260541184, + "step": 120745 + }, + { + "epoch": 19.69820554649266, + "grad_norm": 0.016703639179468155, + "learning_rate": 6.942023997745794e-07, + "loss": 0.0003, + "num_input_tokens_seen": 260551552, + "step": 120750 + }, + { + "epoch": 19.699021207177815, + "grad_norm": 0.00021083030151203275, + "learning_rate": 6.904579092596452e-07, + "loss": 0.0004, + "num_input_tokens_seen": 260562112, + "step": 120755 + }, + { + "epoch": 19.69983686786297, + "grad_norm": 0.00029680339503102005, + "learning_rate": 6.867235378860137e-07, + "loss": 0.007, + "num_input_tokens_seen": 260571776, + "step": 120760 + }, + { + "epoch": 19.700652528548122, + "grad_norm": 0.0009986067889258265, + "learning_rate": 6.829992857293465e-07, + "loss": 0.0005, + "num_input_tokens_seen": 260584032, + "step": 120765 + }, + { + "epoch": 19.701468189233278, + "grad_norm": 0.3044307827949524, + "learning_rate": 6.792851528651389e-07, + "loss": 0.0272, + "num_input_tokens_seen": 260594688, + "step": 120770 + }, + { + "epoch": 19.702283849918434, + "grad_norm": 0.00016105464601423591, + "learning_rate": 6.755811393686084e-07, + "loss": 0.0001, + "num_input_tokens_seen": 260605248, + "step": 120775 + }, + { + "epoch": 19.70309951060359, + "grad_norm": 8.647357753943652e-05, + "learning_rate": 6.718872453149172e-07, + "loss": 0.0003, + "num_input_tokens_seen": 260617376, + "step": 120780 + }, + { + "epoch": 19.703915171288745, + "grad_norm": 0.023390719667077065, + "learning_rate": 6.682034707788386e-07, + "loss": 0.0012, + "num_input_tokens_seen": 260628384, + "step": 120785 + }, + { + "epoch": 19.704730831973897, + "grad_norm": 0.00010313260281691328, + "learning_rate": 6.645298158350909e-07, + "loss": 0.0007, + "num_input_tokens_seen": 260640320, + "step": 120790 + }, + { + "epoch": 19.705546492659053, + "grad_norm": 0.00010727273911470547, + "learning_rate": 6.608662805580589e-07, + "loss": 0.0002, + "num_input_tokens_seen": 260651488, + "step": 120795 + }, + { + "epoch": 19.70636215334421, + "grad_norm": 0.00028386470512486994, + "learning_rate": 6.572128650220721e-07, + "loss": 0.0227, + "num_input_tokens_seen": 260661184, + "step": 120800 + }, + { + "epoch": 19.707177814029365, + "grad_norm": 0.00018891184299718589, + "learning_rate": 6.535695693011268e-07, + "loss": 0.0005, + "num_input_tokens_seen": 260673056, + "step": 120805 + }, + { + "epoch": 19.70799347471452, + "grad_norm": 0.008058300241827965, + "learning_rate": 6.499363934690528e-07, + "loss": 0.0005, + "num_input_tokens_seen": 260683712, + "step": 120810 + }, + { + "epoch": 19.708809135399672, + "grad_norm": 0.00796644389629364, + "learning_rate": 6.463133375994579e-07, + "loss": 0.0004, + "num_input_tokens_seen": 260695168, + "step": 120815 + }, + { + "epoch": 19.709624796084828, + "grad_norm": 0.022055169567465782, + "learning_rate": 6.427004017658389e-07, + "loss": 0.0005, + "num_input_tokens_seen": 260705888, + "step": 120820 + }, + { + "epoch": 19.710440456769984, + "grad_norm": 0.0003091795661021024, + "learning_rate": 6.390975860413594e-07, + "loss": 0.0003, + "num_input_tokens_seen": 260717376, + "step": 120825 + }, + { + "epoch": 19.71125611745514, + "grad_norm": 0.013718018308281898, + "learning_rate": 6.355048904990724e-07, + "loss": 0.0009, + "num_input_tokens_seen": 260728896, + "step": 120830 + }, + { + "epoch": 19.712071778140295, + "grad_norm": 0.0005609778454527259, + "learning_rate": 6.319223152117526e-07, + "loss": 0.0002, + "num_input_tokens_seen": 260740224, + "step": 120835 + }, + { + "epoch": 19.712887438825447, + "grad_norm": 0.003551957430317998, + "learning_rate": 6.283498602520088e-07, + "loss": 0.0004, + "num_input_tokens_seen": 260751168, + "step": 120840 + }, + { + "epoch": 19.713703099510603, + "grad_norm": 0.00037248714943416417, + "learning_rate": 6.247875256922275e-07, + "loss": 0.0002, + "num_input_tokens_seen": 260762528, + "step": 120845 + }, + { + "epoch": 19.71451876019576, + "grad_norm": 0.00025810726219788194, + "learning_rate": 6.212353116046843e-07, + "loss": 0.0004, + "num_input_tokens_seen": 260772832, + "step": 120850 + }, + { + "epoch": 19.715334420880914, + "grad_norm": 0.00036101555451750755, + "learning_rate": 6.17693218061266e-07, + "loss": 0.0009, + "num_input_tokens_seen": 260782720, + "step": 120855 + }, + { + "epoch": 19.71615008156607, + "grad_norm": 0.0002058661193586886, + "learning_rate": 6.141612451338596e-07, + "loss": 0.0004, + "num_input_tokens_seen": 260792448, + "step": 120860 + }, + { + "epoch": 19.716965742251222, + "grad_norm": 0.0011083518620580435, + "learning_rate": 6.106393928939635e-07, + "loss": 0.0052, + "num_input_tokens_seen": 260802688, + "step": 120865 + }, + { + "epoch": 19.717781402936378, + "grad_norm": 0.000600383267737925, + "learning_rate": 6.07127661412965e-07, + "loss": 0.0003, + "num_input_tokens_seen": 260813568, + "step": 120870 + }, + { + "epoch": 19.718597063621534, + "grad_norm": 0.002915219869464636, + "learning_rate": 6.036260507620849e-07, + "loss": 0.0002, + "num_input_tokens_seen": 260825152, + "step": 120875 + }, + { + "epoch": 19.71941272430669, + "grad_norm": 9.103534102905542e-05, + "learning_rate": 6.001345610122111e-07, + "loss": 0.0013, + "num_input_tokens_seen": 260836288, + "step": 120880 + }, + { + "epoch": 19.72022838499184, + "grad_norm": 0.0028546657413244247, + "learning_rate": 5.966531922341756e-07, + "loss": 0.0005, + "num_input_tokens_seen": 260846400, + "step": 120885 + }, + { + "epoch": 19.721044045676997, + "grad_norm": 0.0015072508249431849, + "learning_rate": 5.931819444984777e-07, + "loss": 0.0004, + "num_input_tokens_seen": 260856224, + "step": 120890 + }, + { + "epoch": 19.721859706362153, + "grad_norm": 0.0024740747176110744, + "learning_rate": 5.897208178755054e-07, + "loss": 0.0004, + "num_input_tokens_seen": 260867104, + "step": 120895 + }, + { + "epoch": 19.72267536704731, + "grad_norm": 0.015272167511284351, + "learning_rate": 5.862698124353694e-07, + "loss": 0.001, + "num_input_tokens_seen": 260875072, + "step": 120900 + }, + { + "epoch": 19.723491027732464, + "grad_norm": 0.0002585486799944192, + "learning_rate": 5.828289282480692e-07, + "loss": 0.0003, + "num_input_tokens_seen": 260885728, + "step": 120905 + }, + { + "epoch": 19.724306688417617, + "grad_norm": 0.0002255940344184637, + "learning_rate": 5.793981653832714e-07, + "loss": 0.0003, + "num_input_tokens_seen": 260897536, + "step": 120910 + }, + { + "epoch": 19.725122349102772, + "grad_norm": 0.00035128468880429864, + "learning_rate": 5.759775239105314e-07, + "loss": 0.0007, + "num_input_tokens_seen": 260908832, + "step": 120915 + }, + { + "epoch": 19.725938009787928, + "grad_norm": 0.03820820897817612, + "learning_rate": 5.72567003899127e-07, + "loss": 0.0011, + "num_input_tokens_seen": 260919328, + "step": 120920 + }, + { + "epoch": 19.726753670473084, + "grad_norm": 0.29728615283966064, + "learning_rate": 5.691666054182809e-07, + "loss": 0.0073, + "num_input_tokens_seen": 260930016, + "step": 120925 + }, + { + "epoch": 19.72756933115824, + "grad_norm": 0.010047786869108677, + "learning_rate": 5.657763285368267e-07, + "loss": 0.0481, + "num_input_tokens_seen": 260941632, + "step": 120930 + }, + { + "epoch": 19.72838499184339, + "grad_norm": 0.0003806646855082363, + "learning_rate": 5.623961733234873e-07, + "loss": 0.0012, + "num_input_tokens_seen": 260951680, + "step": 120935 + }, + { + "epoch": 19.729200652528547, + "grad_norm": 0.00014518050011247396, + "learning_rate": 5.590261398467633e-07, + "loss": 0.0001, + "num_input_tokens_seen": 260962432, + "step": 120940 + }, + { + "epoch": 19.730016313213703, + "grad_norm": 0.01622369885444641, + "learning_rate": 5.556662281749891e-07, + "loss": 0.0013, + "num_input_tokens_seen": 260974016, + "step": 120945 + }, + { + "epoch": 19.73083197389886, + "grad_norm": 0.00016020231123548, + "learning_rate": 5.523164383762213e-07, + "loss": 0.0011, + "num_input_tokens_seen": 260985536, + "step": 120950 + }, + { + "epoch": 19.731647634584014, + "grad_norm": 0.0002876273065339774, + "learning_rate": 5.489767705183501e-07, + "loss": 0.0002, + "num_input_tokens_seen": 260995872, + "step": 120955 + }, + { + "epoch": 19.732463295269167, + "grad_norm": 0.0001329690421698615, + "learning_rate": 5.456472246690436e-07, + "loss": 0.0062, + "num_input_tokens_seen": 261004960, + "step": 120960 + }, + { + "epoch": 19.733278955954322, + "grad_norm": 0.00016414815036114305, + "learning_rate": 5.423278008958032e-07, + "loss": 0.0014, + "num_input_tokens_seen": 261015616, + "step": 120965 + }, + { + "epoch": 19.734094616639478, + "grad_norm": 0.002059618942439556, + "learning_rate": 5.390184992659641e-07, + "loss": 0.0008, + "num_input_tokens_seen": 261025728, + "step": 120970 + }, + { + "epoch": 19.734910277324634, + "grad_norm": 0.008537248708307743, + "learning_rate": 5.357193198464727e-07, + "loss": 0.0011, + "num_input_tokens_seen": 261037120, + "step": 120975 + }, + { + "epoch": 19.73572593800979, + "grad_norm": 0.0003149285330437124, + "learning_rate": 5.324302627042199e-07, + "loss": 0.0011, + "num_input_tokens_seen": 261048608, + "step": 120980 + }, + { + "epoch": 19.73654159869494, + "grad_norm": 0.044517580419778824, + "learning_rate": 5.291513279059301e-07, + "loss": 0.0005, + "num_input_tokens_seen": 261060000, + "step": 120985 + }, + { + "epoch": 19.737357259380097, + "grad_norm": 0.0006717867800034583, + "learning_rate": 5.258825155179948e-07, + "loss": 0.0007, + "num_input_tokens_seen": 261069568, + "step": 120990 + }, + { + "epoch": 19.738172920065253, + "grad_norm": 0.0008974156808108091, + "learning_rate": 5.226238256066384e-07, + "loss": 0.001, + "num_input_tokens_seen": 261080992, + "step": 120995 + }, + { + "epoch": 19.73898858075041, + "grad_norm": 0.00016981227963697165, + "learning_rate": 5.193752582379752e-07, + "loss": 0.0006, + "num_input_tokens_seen": 261092928, + "step": 121000 + }, + { + "epoch": 19.739804241435564, + "grad_norm": 0.00028405783814378083, + "learning_rate": 5.16136813477841e-07, + "loss": 0.0003, + "num_input_tokens_seen": 261104064, + "step": 121005 + }, + { + "epoch": 19.740619902120716, + "grad_norm": 0.002693180227652192, + "learning_rate": 5.129084913917948e-07, + "loss": 0.0007, + "num_input_tokens_seen": 261115072, + "step": 121010 + }, + { + "epoch": 19.741435562805872, + "grad_norm": 0.0017996877431869507, + "learning_rate": 5.096902920453395e-07, + "loss": 0.0024, + "num_input_tokens_seen": 261125664, + "step": 121015 + }, + { + "epoch": 19.742251223491028, + "grad_norm": 0.004277325700968504, + "learning_rate": 5.064822155036453e-07, + "loss": 0.0019, + "num_input_tokens_seen": 261136288, + "step": 121020 + }, + { + "epoch": 19.743066884176184, + "grad_norm": 0.0005290390690788627, + "learning_rate": 5.032842618317157e-07, + "loss": 0.0002, + "num_input_tokens_seen": 261146432, + "step": 121025 + }, + { + "epoch": 19.74388254486134, + "grad_norm": 0.0005273078568279743, + "learning_rate": 5.000964310943878e-07, + "loss": 0.0001, + "num_input_tokens_seen": 261156576, + "step": 121030 + }, + { + "epoch": 19.74469820554649, + "grad_norm": 0.0001518469216534868, + "learning_rate": 4.969187233562767e-07, + "loss": 0.0001, + "num_input_tokens_seen": 261167936, + "step": 121035 + }, + { + "epoch": 19.745513866231647, + "grad_norm": 0.0004309504001867026, + "learning_rate": 4.937511386817751e-07, + "loss": 0.0002, + "num_input_tokens_seen": 261179744, + "step": 121040 + }, + { + "epoch": 19.746329526916803, + "grad_norm": 0.003770847339183092, + "learning_rate": 4.905936771351094e-07, + "loss": 0.0004, + "num_input_tokens_seen": 261191616, + "step": 121045 + }, + { + "epoch": 19.74714518760196, + "grad_norm": 0.0033215880393981934, + "learning_rate": 4.874463387801731e-07, + "loss": 0.0003, + "num_input_tokens_seen": 261203072, + "step": 121050 + }, + { + "epoch": 19.747960848287114, + "grad_norm": 0.0019436877919360995, + "learning_rate": 4.843091236808594e-07, + "loss": 0.0004, + "num_input_tokens_seen": 261213984, + "step": 121055 + }, + { + "epoch": 19.748776508972266, + "grad_norm": 0.0007883536163717508, + "learning_rate": 4.811820319006732e-07, + "loss": 0.0253, + "num_input_tokens_seen": 261224672, + "step": 121060 + }, + { + "epoch": 19.749592169657422, + "grad_norm": 0.0004968271823599935, + "learning_rate": 4.780650635030081e-07, + "loss": 0.0029, + "num_input_tokens_seen": 261235232, + "step": 121065 + }, + { + "epoch": 19.750407830342578, + "grad_norm": 0.0006624003872275352, + "learning_rate": 4.7495821855109145e-07, + "loss": 0.0001, + "num_input_tokens_seen": 261245152, + "step": 121070 + }, + { + "epoch": 19.751223491027734, + "grad_norm": 0.0004453869187273085, + "learning_rate": 4.718614971078172e-07, + "loss": 0.0104, + "num_input_tokens_seen": 261254592, + "step": 121075 + }, + { + "epoch": 19.752039151712886, + "grad_norm": 0.0002238437591586262, + "learning_rate": 4.6877489923596863e-07, + "loss": 0.0001, + "num_input_tokens_seen": 261264448, + "step": 121080 + }, + { + "epoch": 19.75285481239804, + "grad_norm": 0.002639062236994505, + "learning_rate": 4.6569842499805113e-07, + "loss": 0.0005, + "num_input_tokens_seen": 261276256, + "step": 121085 + }, + { + "epoch": 19.753670473083197, + "grad_norm": 0.00016537810734007508, + "learning_rate": 4.626320744565149e-07, + "loss": 0.0023, + "num_input_tokens_seen": 261287360, + "step": 121090 + }, + { + "epoch": 19.754486133768353, + "grad_norm": 0.5715612769126892, + "learning_rate": 4.5957584767342133e-07, + "loss": 0.0577, + "num_input_tokens_seen": 261298976, + "step": 121095 + }, + { + "epoch": 19.75530179445351, + "grad_norm": 0.0057955156080424786, + "learning_rate": 4.5652974471077637e-07, + "loss": 0.0004, + "num_input_tokens_seen": 261310816, + "step": 121100 + }, + { + "epoch": 19.75611745513866, + "grad_norm": 0.00018301222007721663, + "learning_rate": 4.534937656301974e-07, + "loss": 0.0003, + "num_input_tokens_seen": 261321536, + "step": 121105 + }, + { + "epoch": 19.756933115823816, + "grad_norm": 0.0001135728889494203, + "learning_rate": 4.5046791049335733e-07, + "loss": 0.0007, + "num_input_tokens_seen": 261331456, + "step": 121110 + }, + { + "epoch": 19.757748776508972, + "grad_norm": 0.03851823881268501, + "learning_rate": 4.47452179361485e-07, + "loss": 0.0007, + "num_input_tokens_seen": 261343392, + "step": 121115 + }, + { + "epoch": 19.758564437194128, + "grad_norm": 0.0006471836240962148, + "learning_rate": 4.444465722956981e-07, + "loss": 0.001, + "num_input_tokens_seen": 261352608, + "step": 121120 + }, + { + "epoch": 19.759380097879284, + "grad_norm": 0.13367073237895966, + "learning_rate": 4.414510893569479e-07, + "loss": 0.0037, + "num_input_tokens_seen": 261362688, + "step": 121125 + }, + { + "epoch": 19.760195758564436, + "grad_norm": 0.00031477041193284094, + "learning_rate": 4.384657306059636e-07, + "loss": 0.0007, + "num_input_tokens_seen": 261374432, + "step": 121130 + }, + { + "epoch": 19.76101141924959, + "grad_norm": 0.02459036000072956, + "learning_rate": 4.354904961031414e-07, + "loss": 0.0003, + "num_input_tokens_seen": 261385984, + "step": 121135 + }, + { + "epoch": 19.761827079934747, + "grad_norm": 0.001429171534255147, + "learning_rate": 4.3252538590893285e-07, + "loss": 0.0008, + "num_input_tokens_seen": 261396000, + "step": 121140 + }, + { + "epoch": 19.762642740619903, + "grad_norm": 0.06391991674900055, + "learning_rate": 4.2957040008323456e-07, + "loss": 0.0011, + "num_input_tokens_seen": 261406400, + "step": 121145 + }, + { + "epoch": 19.76345840130506, + "grad_norm": 0.0001450068666599691, + "learning_rate": 4.266255386861095e-07, + "loss": 0.0003, + "num_input_tokens_seen": 261417344, + "step": 121150 + }, + { + "epoch": 19.76427406199021, + "grad_norm": 0.00017913653573486954, + "learning_rate": 4.2369080177717676e-07, + "loss": 0.001, + "num_input_tokens_seen": 261426752, + "step": 121155 + }, + { + "epoch": 19.765089722675366, + "grad_norm": 0.004933996591717005, + "learning_rate": 4.2076618941588875e-07, + "loss": 0.0003, + "num_input_tokens_seen": 261438336, + "step": 121160 + }, + { + "epoch": 19.765905383360522, + "grad_norm": 0.022717682644724846, + "learning_rate": 4.178517016615313e-07, + "loss": 0.0008, + "num_input_tokens_seen": 261448928, + "step": 121165 + }, + { + "epoch": 19.766721044045678, + "grad_norm": 0.13345511257648468, + "learning_rate": 4.1494733857322385e-07, + "loss": 0.005, + "num_input_tokens_seen": 261460256, + "step": 121170 + }, + { + "epoch": 19.767536704730833, + "grad_norm": 0.026908867061138153, + "learning_rate": 4.120531002096972e-07, + "loss": 0.0004, + "num_input_tokens_seen": 261472000, + "step": 121175 + }, + { + "epoch": 19.768352365415986, + "grad_norm": 0.00014431579620577395, + "learning_rate": 4.091689866297377e-07, + "loss": 0.0004, + "num_input_tokens_seen": 261483104, + "step": 121180 + }, + { + "epoch": 19.76916802610114, + "grad_norm": 0.00017585551540832967, + "learning_rate": 4.0629499789174293e-07, + "loss": 0.0268, + "num_input_tokens_seen": 261493856, + "step": 121185 + }, + { + "epoch": 19.769983686786297, + "grad_norm": 0.0036410815082490444, + "learning_rate": 4.034311340539443e-07, + "loss": 0.0002, + "num_input_tokens_seen": 261506272, + "step": 121190 + }, + { + "epoch": 19.770799347471453, + "grad_norm": 0.00018914179236162454, + "learning_rate": 4.005773951744063e-07, + "loss": 0.0001, + "num_input_tokens_seen": 261517088, + "step": 121195 + }, + { + "epoch": 19.77161500815661, + "grad_norm": 0.04991000518202782, + "learning_rate": 3.977337813109716e-07, + "loss": 0.0014, + "num_input_tokens_seen": 261528640, + "step": 121200 + }, + { + "epoch": 19.77243066884176, + "grad_norm": 0.0025885808281600475, + "learning_rate": 3.949002925212053e-07, + "loss": 0.0002, + "num_input_tokens_seen": 261539904, + "step": 121205 + }, + { + "epoch": 19.773246329526916, + "grad_norm": 0.018848761916160583, + "learning_rate": 3.920769288626169e-07, + "loss": 0.0018, + "num_input_tokens_seen": 261550496, + "step": 121210 + }, + { + "epoch": 19.774061990212072, + "grad_norm": 0.00012422636791598052, + "learning_rate": 3.8926369039238295e-07, + "loss": 0.0008, + "num_input_tokens_seen": 261560032, + "step": 121215 + }, + { + "epoch": 19.774877650897228, + "grad_norm": 0.06297443062067032, + "learning_rate": 3.864605771675134e-07, + "loss": 0.0014, + "num_input_tokens_seen": 261571360, + "step": 121220 + }, + { + "epoch": 19.775693311582383, + "grad_norm": 0.0006991114933043718, + "learning_rate": 3.8366758924479605e-07, + "loss": 0.0351, + "num_input_tokens_seen": 261580896, + "step": 121225 + }, + { + "epoch": 19.776508972267536, + "grad_norm": 0.05538404732942581, + "learning_rate": 3.808847266809079e-07, + "loss": 0.0011, + "num_input_tokens_seen": 261591648, + "step": 121230 + }, + { + "epoch": 19.77732463295269, + "grad_norm": 0.0006372855859808624, + "learning_rate": 3.781119895321927e-07, + "loss": 0.0002, + "num_input_tokens_seen": 261601408, + "step": 121235 + }, + { + "epoch": 19.778140293637847, + "grad_norm": 0.01783588156104088, + "learning_rate": 3.753493778548278e-07, + "loss": 0.0004, + "num_input_tokens_seen": 261611648, + "step": 121240 + }, + { + "epoch": 19.778955954323003, + "grad_norm": 0.0005751413991674781, + "learning_rate": 3.725968917048794e-07, + "loss": 0.0006, + "num_input_tokens_seen": 261623552, + "step": 121245 + }, + { + "epoch": 19.77977161500816, + "grad_norm": 0.000310877658193931, + "learning_rate": 3.6985453113802525e-07, + "loss": 0.0002, + "num_input_tokens_seen": 261634720, + "step": 121250 + }, + { + "epoch": 19.78058727569331, + "grad_norm": 0.004458635114133358, + "learning_rate": 3.6712229620988744e-07, + "loss": 0.0002, + "num_input_tokens_seen": 261646080, + "step": 121255 + }, + { + "epoch": 19.781402936378466, + "grad_norm": 0.0001367017102893442, + "learning_rate": 3.644001869758662e-07, + "loss": 0.0002, + "num_input_tokens_seen": 261656640, + "step": 121260 + }, + { + "epoch": 19.782218597063622, + "grad_norm": 0.00035173006472177804, + "learning_rate": 3.616882034911395e-07, + "loss": 0.021, + "num_input_tokens_seen": 261668352, + "step": 121265 + }, + { + "epoch": 19.783034257748778, + "grad_norm": 0.00048390828305855393, + "learning_rate": 3.58986345810608e-07, + "loss": 0.0003, + "num_input_tokens_seen": 261678624, + "step": 121270 + }, + { + "epoch": 19.78384991843393, + "grad_norm": 0.000842386158183217, + "learning_rate": 3.56294613989061e-07, + "loss": 0.0002, + "num_input_tokens_seen": 261690240, + "step": 121275 + }, + { + "epoch": 19.784665579119086, + "grad_norm": 0.0010565625270828605, + "learning_rate": 3.5361300808106625e-07, + "loss": 0.0006, + "num_input_tokens_seen": 261701536, + "step": 121280 + }, + { + "epoch": 19.78548123980424, + "grad_norm": 0.00011837128840852529, + "learning_rate": 3.509415281409134e-07, + "loss": 0.0022, + "num_input_tokens_seen": 261712512, + "step": 121285 + }, + { + "epoch": 19.786296900489397, + "grad_norm": 0.0010221133707091212, + "learning_rate": 3.4828017422278146e-07, + "loss": 0.0004, + "num_input_tokens_seen": 261723680, + "step": 121290 + }, + { + "epoch": 19.787112561174553, + "grad_norm": 0.003999342210590839, + "learning_rate": 3.4562894638062727e-07, + "loss": 0.0006, + "num_input_tokens_seen": 261734368, + "step": 121295 + }, + { + "epoch": 19.787928221859705, + "grad_norm": 0.0007516882033087313, + "learning_rate": 3.4298784466818553e-07, + "loss": 0.0009, + "num_input_tokens_seen": 261744512, + "step": 121300 + }, + { + "epoch": 19.78874388254486, + "grad_norm": 0.00012191032874397933, + "learning_rate": 3.403568691389136e-07, + "loss": 0.0019, + "num_input_tokens_seen": 261756416, + "step": 121305 + }, + { + "epoch": 19.789559543230016, + "grad_norm": 0.005896417889744043, + "learning_rate": 3.3773601984615766e-07, + "loss": 0.0007, + "num_input_tokens_seen": 261766656, + "step": 121310 + }, + { + "epoch": 19.790375203915172, + "grad_norm": 0.00038232895894907415, + "learning_rate": 3.3512529684309736e-07, + "loss": 0.0004, + "num_input_tokens_seen": 261777792, + "step": 121315 + }, + { + "epoch": 19.791190864600328, + "grad_norm": 0.05466506630182266, + "learning_rate": 3.325247001825793e-07, + "loss": 0.0012, + "num_input_tokens_seen": 261789344, + "step": 121320 + }, + { + "epoch": 19.79200652528548, + "grad_norm": 0.0034589816350489855, + "learning_rate": 3.299342299172836e-07, + "loss": 0.0001, + "num_input_tokens_seen": 261800544, + "step": 121325 + }, + { + "epoch": 19.792822185970635, + "grad_norm": 0.0009512797114439309, + "learning_rate": 3.2735388609977936e-07, + "loss": 0.0002, + "num_input_tokens_seen": 261811616, + "step": 121330 + }, + { + "epoch": 19.79363784665579, + "grad_norm": 0.0015861641149967909, + "learning_rate": 3.24783668782358e-07, + "loss": 0.0008, + "num_input_tokens_seen": 261822592, + "step": 121335 + }, + { + "epoch": 19.794453507340947, + "grad_norm": 0.0031898810993880033, + "learning_rate": 3.222235780170335e-07, + "loss": 0.0016, + "num_input_tokens_seen": 261834400, + "step": 121340 + }, + { + "epoch": 19.795269168026103, + "grad_norm": 0.00012407901522237808, + "learning_rate": 3.196736138557088e-07, + "loss": 0.0002, + "num_input_tokens_seen": 261844640, + "step": 121345 + }, + { + "epoch": 19.796084828711255, + "grad_norm": 0.08343454450368881, + "learning_rate": 3.171337763501203e-07, + "loss": 0.0016, + "num_input_tokens_seen": 261854432, + "step": 121350 + }, + { + "epoch": 19.79690048939641, + "grad_norm": 0.002018151106312871, + "learning_rate": 3.146040655517268e-07, + "loss": 0.0003, + "num_input_tokens_seen": 261862784, + "step": 121355 + }, + { + "epoch": 19.797716150081566, + "grad_norm": 0.0002815184707287699, + "learning_rate": 3.1208448151176516e-07, + "loss": 0.0002, + "num_input_tokens_seen": 261874336, + "step": 121360 + }, + { + "epoch": 19.798531810766722, + "grad_norm": 0.0010095942998304963, + "learning_rate": 3.0957502428130557e-07, + "loss": 0.0007, + "num_input_tokens_seen": 261884416, + "step": 121365 + }, + { + "epoch": 19.799347471451878, + "grad_norm": 0.00036899797851219773, + "learning_rate": 3.070756939111963e-07, + "loss": 0.0002, + "num_input_tokens_seen": 261895360, + "step": 121370 + }, + { + "epoch": 19.80016313213703, + "grad_norm": 0.03369932249188423, + "learning_rate": 3.0458649045211895e-07, + "loss": 0.0056, + "num_input_tokens_seen": 261905472, + "step": 121375 + }, + { + "epoch": 19.800978792822185, + "grad_norm": 0.0014723138883709908, + "learning_rate": 3.021074139545332e-07, + "loss": 0.0001, + "num_input_tokens_seen": 261915680, + "step": 121380 + }, + { + "epoch": 19.80179445350734, + "grad_norm": 0.0050400434993207455, + "learning_rate": 2.996384644686212e-07, + "loss": 0.0023, + "num_input_tokens_seen": 261925440, + "step": 121385 + }, + { + "epoch": 19.802610114192497, + "grad_norm": 0.00027292364393360913, + "learning_rate": 2.971796420444539e-07, + "loss": 0.0042, + "num_input_tokens_seen": 261936000, + "step": 121390 + }, + { + "epoch": 19.803425774877653, + "grad_norm": 0.00039037541137076914, + "learning_rate": 2.947309467318804e-07, + "loss": 0.0017, + "num_input_tokens_seen": 261948128, + "step": 121395 + }, + { + "epoch": 19.804241435562805, + "grad_norm": 0.0007301748846657574, + "learning_rate": 2.922923785804721e-07, + "loss": 0.0011, + "num_input_tokens_seen": 261957568, + "step": 121400 + }, + { + "epoch": 19.80505709624796, + "grad_norm": 0.007306003477424383, + "learning_rate": 2.898639376396894e-07, + "loss": 0.0022, + "num_input_tokens_seen": 261969120, + "step": 121405 + }, + { + "epoch": 19.805872756933116, + "grad_norm": 0.008622409775853157, + "learning_rate": 2.8744562395877083e-07, + "loss": 0.0008, + "num_input_tokens_seen": 261979136, + "step": 121410 + }, + { + "epoch": 19.806688417618272, + "grad_norm": 0.003100133268162608, + "learning_rate": 2.850374375866216e-07, + "loss": 0.0002, + "num_input_tokens_seen": 261989824, + "step": 121415 + }, + { + "epoch": 19.807504078303424, + "grad_norm": 0.0007444845396094024, + "learning_rate": 2.826393785722026e-07, + "loss": 0.0506, + "num_input_tokens_seen": 262000544, + "step": 121420 + }, + { + "epoch": 19.80831973898858, + "grad_norm": 0.0009362988057546318, + "learning_rate": 2.80251446963975e-07, + "loss": 0.0022, + "num_input_tokens_seen": 262010912, + "step": 121425 + }, + { + "epoch": 19.809135399673735, + "grad_norm": 0.010639713145792484, + "learning_rate": 2.778736428104556e-07, + "loss": 0.0011, + "num_input_tokens_seen": 262022080, + "step": 121430 + }, + { + "epoch": 19.80995106035889, + "grad_norm": 0.0008151759975589812, + "learning_rate": 2.75505966159717e-07, + "loss": 0.0012, + "num_input_tokens_seen": 262033216, + "step": 121435 + }, + { + "epoch": 19.810766721044047, + "grad_norm": 0.0001737466373015195, + "learning_rate": 2.73148417059832e-07, + "loss": 0.0003, + "num_input_tokens_seen": 262044576, + "step": 121440 + }, + { + "epoch": 19.8115823817292, + "grad_norm": 0.00013938546180725098, + "learning_rate": 2.708009955584845e-07, + "loss": 0.001, + "num_input_tokens_seen": 262054752, + "step": 121445 + }, + { + "epoch": 19.812398042414355, + "grad_norm": 0.007595475297421217, + "learning_rate": 2.684637017033587e-07, + "loss": 0.0005, + "num_input_tokens_seen": 262066464, + "step": 121450 + }, + { + "epoch": 19.81321370309951, + "grad_norm": 0.00035946944262832403, + "learning_rate": 2.6613653554175e-07, + "loss": 0.0005, + "num_input_tokens_seen": 262077856, + "step": 121455 + }, + { + "epoch": 19.814029363784666, + "grad_norm": 0.1427484154701233, + "learning_rate": 2.6381949712089846e-07, + "loss": 0.0041, + "num_input_tokens_seen": 262087840, + "step": 121460 + }, + { + "epoch": 19.81484502446982, + "grad_norm": 0.0016747943591326475, + "learning_rate": 2.6151258648765553e-07, + "loss": 0.0003, + "num_input_tokens_seen": 262099968, + "step": 121465 + }, + { + "epoch": 19.815660685154974, + "grad_norm": 0.0001741334708640352, + "learning_rate": 2.59215803688817e-07, + "loss": 0.0001, + "num_input_tokens_seen": 262111040, + "step": 121470 + }, + { + "epoch": 19.81647634584013, + "grad_norm": 0.0002936297096312046, + "learning_rate": 2.5692914877090135e-07, + "loss": 0.0008, + "num_input_tokens_seen": 262122080, + "step": 121475 + }, + { + "epoch": 19.817292006525285, + "grad_norm": 0.00021707426640205085, + "learning_rate": 2.546526217803713e-07, + "loss": 0.0007, + "num_input_tokens_seen": 262132640, + "step": 121480 + }, + { + "epoch": 19.81810766721044, + "grad_norm": 0.0010890491539612412, + "learning_rate": 2.5238622276319014e-07, + "loss": 0.0002, + "num_input_tokens_seen": 262143840, + "step": 121485 + }, + { + "epoch": 19.818923327895597, + "grad_norm": 0.0001341850875178352, + "learning_rate": 2.501299517654321e-07, + "loss": 0.0005, + "num_input_tokens_seen": 262155040, + "step": 121490 + }, + { + "epoch": 19.81973898858075, + "grad_norm": 0.00023502767726313323, + "learning_rate": 2.4788380883278285e-07, + "loss": 0.0005, + "num_input_tokens_seen": 262166816, + "step": 121495 + }, + { + "epoch": 19.820554649265905, + "grad_norm": 0.00014115948579274118, + "learning_rate": 2.4564779401070604e-07, + "loss": 0.0042, + "num_input_tokens_seen": 262177312, + "step": 121500 + }, + { + "epoch": 19.82137030995106, + "grad_norm": 0.00010768734500743449, + "learning_rate": 2.434219073445543e-07, + "loss": 0.0001, + "num_input_tokens_seen": 262188096, + "step": 121505 + }, + { + "epoch": 19.822185970636216, + "grad_norm": 0.011562298983335495, + "learning_rate": 2.412061488795136e-07, + "loss": 0.0023, + "num_input_tokens_seen": 262198304, + "step": 121510 + }, + { + "epoch": 19.82300163132137, + "grad_norm": 0.0017658811993896961, + "learning_rate": 2.390005186603261e-07, + "loss": 0.0002, + "num_input_tokens_seen": 262209664, + "step": 121515 + }, + { + "epoch": 19.823817292006524, + "grad_norm": 0.0002010182070080191, + "learning_rate": 2.3680501673184474e-07, + "loss": 0.0032, + "num_input_tokens_seen": 262220672, + "step": 121520 + }, + { + "epoch": 19.82463295269168, + "grad_norm": 0.013422760181128979, + "learning_rate": 2.346196431384784e-07, + "loss": 0.0042, + "num_input_tokens_seen": 262231776, + "step": 121525 + }, + { + "epoch": 19.825448613376835, + "grad_norm": 0.03260614722967148, + "learning_rate": 2.324443979245805e-07, + "loss": 0.0012, + "num_input_tokens_seen": 262242880, + "step": 121530 + }, + { + "epoch": 19.82626427406199, + "grad_norm": 0.00022336999245453626, + "learning_rate": 2.302792811341714e-07, + "loss": 0.0002, + "num_input_tokens_seen": 262254304, + "step": 121535 + }, + { + "epoch": 19.827079934747147, + "grad_norm": 0.0002389697910984978, + "learning_rate": 2.2812429281116043e-07, + "loss": 0.0009, + "num_input_tokens_seen": 262264640, + "step": 121540 + }, + { + "epoch": 19.8278955954323, + "grad_norm": 0.0014500232646241784, + "learning_rate": 2.2597943299923484e-07, + "loss": 0.001, + "num_input_tokens_seen": 262276256, + "step": 121545 + }, + { + "epoch": 19.828711256117455, + "grad_norm": 0.00019344192696735263, + "learning_rate": 2.2384470174180438e-07, + "loss": 0.0019, + "num_input_tokens_seen": 262287008, + "step": 121550 + }, + { + "epoch": 19.82952691680261, + "grad_norm": 0.01335355918854475, + "learning_rate": 2.2172009908216772e-07, + "loss": 0.0013, + "num_input_tokens_seen": 262297920, + "step": 121555 + }, + { + "epoch": 19.830342577487766, + "grad_norm": 0.0006252493476495147, + "learning_rate": 2.1960562506340153e-07, + "loss": 0.0002, + "num_input_tokens_seen": 262307200, + "step": 121560 + }, + { + "epoch": 19.83115823817292, + "grad_norm": 0.001038793008774519, + "learning_rate": 2.1750127972836042e-07, + "loss": 0.0002, + "num_input_tokens_seen": 262317024, + "step": 121565 + }, + { + "epoch": 19.831973898858074, + "grad_norm": 0.000309836701489985, + "learning_rate": 2.1540706311967695e-07, + "loss": 0.0001, + "num_input_tokens_seen": 262327456, + "step": 121570 + }, + { + "epoch": 19.83278955954323, + "grad_norm": 0.07128357887268066, + "learning_rate": 2.1332297527976164e-07, + "loss": 0.0015, + "num_input_tokens_seen": 262338976, + "step": 121575 + }, + { + "epoch": 19.833605220228385, + "grad_norm": 0.005687952972948551, + "learning_rate": 2.1124901625091397e-07, + "loss": 0.001, + "num_input_tokens_seen": 262349408, + "step": 121580 + }, + { + "epoch": 19.83442088091354, + "grad_norm": 0.00019407353829592466, + "learning_rate": 2.091851860751004e-07, + "loss": 0.0003, + "num_input_tokens_seen": 262359904, + "step": 121585 + }, + { + "epoch": 19.835236541598697, + "grad_norm": 0.17579586803913116, + "learning_rate": 2.071314847941763e-07, + "loss": 0.0044, + "num_input_tokens_seen": 262371200, + "step": 121590 + }, + { + "epoch": 19.83605220228385, + "grad_norm": 0.00020799037883989513, + "learning_rate": 2.050879124498306e-07, + "loss": 0.0002, + "num_input_tokens_seen": 262382624, + "step": 121595 + }, + { + "epoch": 19.836867862969005, + "grad_norm": 0.0026390249840915203, + "learning_rate": 2.0305446908336355e-07, + "loss": 0.0002, + "num_input_tokens_seen": 262393152, + "step": 121600 + }, + { + "epoch": 19.83768352365416, + "grad_norm": 0.013744687661528587, + "learning_rate": 2.0103115473601996e-07, + "loss": 0.0005, + "num_input_tokens_seen": 262404192, + "step": 121605 + }, + { + "epoch": 19.838499184339316, + "grad_norm": 0.012884072959423065, + "learning_rate": 1.9901796944882254e-07, + "loss": 0.0017, + "num_input_tokens_seen": 262414528, + "step": 121610 + }, + { + "epoch": 19.839314845024468, + "grad_norm": 8.874024933902547e-05, + "learning_rate": 1.9701491326257203e-07, + "loss": 0.0002, + "num_input_tokens_seen": 262426144, + "step": 121615 + }, + { + "epoch": 19.840130505709624, + "grad_norm": 0.0003579501644708216, + "learning_rate": 1.9502198621790257e-07, + "loss": 0.0012, + "num_input_tokens_seen": 262437696, + "step": 121620 + }, + { + "epoch": 19.84094616639478, + "grad_norm": 0.005468745715916157, + "learning_rate": 1.9303918835511526e-07, + "loss": 0.0003, + "num_input_tokens_seen": 262447872, + "step": 121625 + }, + { + "epoch": 19.841761827079935, + "grad_norm": 0.0002583398309070617, + "learning_rate": 1.9106651971445564e-07, + "loss": 0.0001, + "num_input_tokens_seen": 262458496, + "step": 121630 + }, + { + "epoch": 19.84257748776509, + "grad_norm": 0.010793328285217285, + "learning_rate": 1.8910398033589182e-07, + "loss": 0.0005, + "num_input_tokens_seen": 262470464, + "step": 121635 + }, + { + "epoch": 19.843393148450243, + "grad_norm": 0.04736420139670372, + "learning_rate": 1.8715157025916972e-07, + "loss": 0.0013, + "num_input_tokens_seen": 262480000, + "step": 121640 + }, + { + "epoch": 19.8442088091354, + "grad_norm": 0.00011987720790784806, + "learning_rate": 1.8520928952386885e-07, + "loss": 0.0004, + "num_input_tokens_seen": 262491424, + "step": 121645 + }, + { + "epoch": 19.845024469820554, + "grad_norm": 9.274165495298803e-05, + "learning_rate": 1.8327713816940207e-07, + "loss": 0.0001, + "num_input_tokens_seen": 262501568, + "step": 121650 + }, + { + "epoch": 19.84584013050571, + "grad_norm": 0.0015363277634605765, + "learning_rate": 1.8135511623484925e-07, + "loss": 0.0002, + "num_input_tokens_seen": 262511616, + "step": 121655 + }, + { + "epoch": 19.846655791190866, + "grad_norm": 0.008731590583920479, + "learning_rate": 1.7944322375923472e-07, + "loss": 0.0005, + "num_input_tokens_seen": 262522880, + "step": 121660 + }, + { + "epoch": 19.847471451876018, + "grad_norm": 0.006054314784705639, + "learning_rate": 1.7754146078124976e-07, + "loss": 0.0002, + "num_input_tokens_seen": 262535008, + "step": 121665 + }, + { + "epoch": 19.848287112561174, + "grad_norm": 0.04257241263985634, + "learning_rate": 1.7564982733947465e-07, + "loss": 0.001, + "num_input_tokens_seen": 262546144, + "step": 121670 + }, + { + "epoch": 19.84910277324633, + "grad_norm": 0.036077942699193954, + "learning_rate": 1.7376832347221206e-07, + "loss": 0.0006, + "num_input_tokens_seen": 262556736, + "step": 121675 + }, + { + "epoch": 19.849918433931485, + "grad_norm": 0.10760064423084259, + "learning_rate": 1.7189694921759813e-07, + "loss": 0.002, + "num_input_tokens_seen": 262566560, + "step": 121680 + }, + { + "epoch": 19.85073409461664, + "grad_norm": 0.0001221897837240249, + "learning_rate": 1.700357046136025e-07, + "loss": 0.0003, + "num_input_tokens_seen": 262577440, + "step": 121685 + }, + { + "epoch": 19.851549755301793, + "grad_norm": 0.03172318637371063, + "learning_rate": 1.6818458969786177e-07, + "loss": 0.0037, + "num_input_tokens_seen": 262588224, + "step": 121690 + }, + { + "epoch": 19.85236541598695, + "grad_norm": 0.000911700539290905, + "learning_rate": 1.6634360450795694e-07, + "loss": 0.001, + "num_input_tokens_seen": 262598240, + "step": 121695 + }, + { + "epoch": 19.853181076672104, + "grad_norm": 0.0008234859560616314, + "learning_rate": 1.6451274908124703e-07, + "loss": 0.0015, + "num_input_tokens_seen": 262608288, + "step": 121700 + }, + { + "epoch": 19.85399673735726, + "grad_norm": 0.0010570456506684422, + "learning_rate": 1.6269202345470247e-07, + "loss": 0.0014, + "num_input_tokens_seen": 262619616, + "step": 121705 + }, + { + "epoch": 19.854812398042416, + "grad_norm": 0.000743250478990376, + "learning_rate": 1.6088142766529367e-07, + "loss": 0.0002, + "num_input_tokens_seen": 262630560, + "step": 121710 + }, + { + "epoch": 19.855628058727568, + "grad_norm": 0.000722036580555141, + "learning_rate": 1.5908096174976904e-07, + "loss": 0.0002, + "num_input_tokens_seen": 262642240, + "step": 121715 + }, + { + "epoch": 19.856443719412724, + "grad_norm": 0.03593895211815834, + "learning_rate": 1.5729062574448838e-07, + "loss": 0.0014, + "num_input_tokens_seen": 262653952, + "step": 121720 + }, + { + "epoch": 19.85725938009788, + "grad_norm": 0.27253684401512146, + "learning_rate": 1.55510419685867e-07, + "loss": 0.0047, + "num_input_tokens_seen": 262664704, + "step": 121725 + }, + { + "epoch": 19.858075040783035, + "grad_norm": 0.014550449326634407, + "learning_rate": 1.5374034360993162e-07, + "loss": 0.0004, + "num_input_tokens_seen": 262675936, + "step": 121730 + }, + { + "epoch": 19.85889070146819, + "grad_norm": 0.00022710264602210373, + "learning_rate": 1.5198039755248693e-07, + "loss": 0.0005, + "num_input_tokens_seen": 262686624, + "step": 121735 + }, + { + "epoch": 19.859706362153343, + "grad_norm": 7.258884579641744e-05, + "learning_rate": 1.5023058154928216e-07, + "loss": 0.0012, + "num_input_tokens_seen": 262696608, + "step": 121740 + }, + { + "epoch": 19.8605220228385, + "grad_norm": 0.0001718393323244527, + "learning_rate": 1.4849089563578888e-07, + "loss": 0.0009, + "num_input_tokens_seen": 262707520, + "step": 121745 + }, + { + "epoch": 19.861337683523654, + "grad_norm": 0.00023617647821083665, + "learning_rate": 1.467613398472567e-07, + "loss": 0.0011, + "num_input_tokens_seen": 262719008, + "step": 121750 + }, + { + "epoch": 19.86215334420881, + "grad_norm": 0.12043055891990662, + "learning_rate": 1.4504191421865765e-07, + "loss": 0.0017, + "num_input_tokens_seen": 262728800, + "step": 121755 + }, + { + "epoch": 19.862969004893966, + "grad_norm": 9.241277439286932e-05, + "learning_rate": 1.433326187849082e-07, + "loss": 0.016, + "num_input_tokens_seen": 262739200, + "step": 121760 + }, + { + "epoch": 19.863784665579118, + "grad_norm": 9.414489613845944e-05, + "learning_rate": 1.416334535806474e-07, + "loss": 0.0003, + "num_input_tokens_seen": 262750144, + "step": 121765 + }, + { + "epoch": 19.864600326264274, + "grad_norm": 0.0017074478091672063, + "learning_rate": 1.3994441864029206e-07, + "loss": 0.0002, + "num_input_tokens_seen": 262760832, + "step": 121770 + }, + { + "epoch": 19.86541598694943, + "grad_norm": 0.01668325997889042, + "learning_rate": 1.3826551399809263e-07, + "loss": 0.0003, + "num_input_tokens_seen": 262771104, + "step": 121775 + }, + { + "epoch": 19.866231647634585, + "grad_norm": 0.00016367671196348965, + "learning_rate": 1.3659673968802188e-07, + "loss": 0.0007, + "num_input_tokens_seen": 262782976, + "step": 121780 + }, + { + "epoch": 19.86704730831974, + "grad_norm": 0.011783347465097904, + "learning_rate": 1.3493809574399717e-07, + "loss": 0.0067, + "num_input_tokens_seen": 262794112, + "step": 121785 + }, + { + "epoch": 19.867862969004893, + "grad_norm": 0.0007143389084376395, + "learning_rate": 1.3328958219954724e-07, + "loss": 0.0485, + "num_input_tokens_seen": 262805728, + "step": 121790 + }, + { + "epoch": 19.86867862969005, + "grad_norm": 0.006544121075421572, + "learning_rate": 1.3165119908808976e-07, + "loss": 0.0017, + "num_input_tokens_seen": 262815168, + "step": 121795 + }, + { + "epoch": 19.869494290375204, + "grad_norm": 0.0030304354149848223, + "learning_rate": 1.3002294644287593e-07, + "loss": 0.0002, + "num_input_tokens_seen": 262826816, + "step": 121800 + }, + { + "epoch": 19.87030995106036, + "grad_norm": 0.045270949602127075, + "learning_rate": 1.284048242968794e-07, + "loss": 0.0008, + "num_input_tokens_seen": 262837440, + "step": 121805 + }, + { + "epoch": 19.871125611745512, + "grad_norm": 0.00015700666699558496, + "learning_rate": 1.267968326829072e-07, + "loss": 0.0001, + "num_input_tokens_seen": 262847904, + "step": 121810 + }, + { + "epoch": 19.871941272430668, + "grad_norm": 0.0127540472894907, + "learning_rate": 1.2519897163348894e-07, + "loss": 0.0005, + "num_input_tokens_seen": 262858368, + "step": 121815 + }, + { + "epoch": 19.872756933115824, + "grad_norm": 0.0017892577452585101, + "learning_rate": 1.2361124118109856e-07, + "loss": 0.0004, + "num_input_tokens_seen": 262869408, + "step": 121820 + }, + { + "epoch": 19.87357259380098, + "grad_norm": 0.0001326821802649647, + "learning_rate": 1.220336413578216e-07, + "loss": 0.0002, + "num_input_tokens_seen": 262881024, + "step": 121825 + }, + { + "epoch": 19.874388254486135, + "grad_norm": 0.00015726377023383975, + "learning_rate": 1.204661721956879e-07, + "loss": 0.0002, + "num_input_tokens_seen": 262891264, + "step": 121830 + }, + { + "epoch": 19.875203915171287, + "grad_norm": 0.01101471297442913, + "learning_rate": 1.1890883372644989e-07, + "loss": 0.0002, + "num_input_tokens_seen": 262901888, + "step": 121835 + }, + { + "epoch": 19.876019575856443, + "grad_norm": 0.0004454998124856502, + "learning_rate": 1.1736162598163791e-07, + "loss": 0.0002, + "num_input_tokens_seen": 262914336, + "step": 121840 + }, + { + "epoch": 19.8768352365416, + "grad_norm": 0.00022058164176996797, + "learning_rate": 1.1582454899267126e-07, + "loss": 0.0002, + "num_input_tokens_seen": 262924352, + "step": 121845 + }, + { + "epoch": 19.877650897226754, + "grad_norm": 0.010358653031289577, + "learning_rate": 1.1429760279069168e-07, + "loss": 0.0002, + "num_input_tokens_seen": 262935264, + "step": 121850 + }, + { + "epoch": 19.87846655791191, + "grad_norm": 0.0038260549772530794, + "learning_rate": 1.1278078740656339e-07, + "loss": 0.0025, + "num_input_tokens_seen": 262945920, + "step": 121855 + }, + { + "epoch": 19.879282218597062, + "grad_norm": 0.0023127051535993814, + "learning_rate": 1.1127410287115059e-07, + "loss": 0.0012, + "num_input_tokens_seen": 262957376, + "step": 121860 + }, + { + "epoch": 19.880097879282218, + "grad_norm": 8.856541535351425e-05, + "learning_rate": 1.0977754921487337e-07, + "loss": 0.0012, + "num_input_tokens_seen": 262967168, + "step": 121865 + }, + { + "epoch": 19.880913539967374, + "grad_norm": 0.019418878480792046, + "learning_rate": 1.0829112646809635e-07, + "loss": 0.0007, + "num_input_tokens_seen": 262976832, + "step": 121870 + }, + { + "epoch": 19.88172920065253, + "grad_norm": 0.008534329943358898, + "learning_rate": 1.068148346610176e-07, + "loss": 0.0004, + "num_input_tokens_seen": 262988512, + "step": 121875 + }, + { + "epoch": 19.882544861337685, + "grad_norm": 0.0019725686870515347, + "learning_rate": 1.0534867382344659e-07, + "loss": 0.0002, + "num_input_tokens_seen": 262999840, + "step": 121880 + }, + { + "epoch": 19.883360522022837, + "grad_norm": 0.011746141128242016, + "learning_rate": 1.0389264398519283e-07, + "loss": 0.0003, + "num_input_tokens_seen": 263010912, + "step": 121885 + }, + { + "epoch": 19.884176182707993, + "grad_norm": 0.00014871385064907372, + "learning_rate": 1.024467451756772e-07, + "loss": 0.0006, + "num_input_tokens_seen": 263020992, + "step": 121890 + }, + { + "epoch": 19.88499184339315, + "grad_norm": 0.014584255404770374, + "learning_rate": 1.0101097742426513e-07, + "loss": 0.0012, + "num_input_tokens_seen": 263031456, + "step": 121895 + }, + { + "epoch": 19.885807504078304, + "grad_norm": 0.0004221704148221761, + "learning_rate": 9.958534075998893e-08, + "loss": 0.0025, + "num_input_tokens_seen": 263043136, + "step": 121900 + }, + { + "epoch": 19.88662316476346, + "grad_norm": 0.0002824121620506048, + "learning_rate": 9.816983521182543e-08, + "loss": 0.0047, + "num_input_tokens_seen": 263053728, + "step": 121905 + }, + { + "epoch": 19.887438825448612, + "grad_norm": 0.001333700492978096, + "learning_rate": 9.676446080841839e-08, + "loss": 0.0001, + "num_input_tokens_seen": 263064832, + "step": 121910 + }, + { + "epoch": 19.888254486133768, + "grad_norm": 0.0019030733965337276, + "learning_rate": 9.536921757824502e-08, + "loss": 0.0017, + "num_input_tokens_seen": 263075360, + "step": 121915 + }, + { + "epoch": 19.889070146818923, + "grad_norm": 0.0017227169591933489, + "learning_rate": 9.39841055495605e-08, + "loss": 0.0006, + "num_input_tokens_seen": 263086880, + "step": 121920 + }, + { + "epoch": 19.88988580750408, + "grad_norm": 0.07061638683080673, + "learning_rate": 9.260912475050898e-08, + "loss": 0.0013, + "num_input_tokens_seen": 263097696, + "step": 121925 + }, + { + "epoch": 19.890701468189235, + "grad_norm": 0.22646136581897736, + "learning_rate": 9.124427520890155e-08, + "loss": 0.0033, + "num_input_tokens_seen": 263108384, + "step": 121930 + }, + { + "epoch": 19.891517128874387, + "grad_norm": 0.34716567397117615, + "learning_rate": 8.988955695238277e-08, + "loss": 0.01, + "num_input_tokens_seen": 263117984, + "step": 121935 + }, + { + "epoch": 19.892332789559543, + "grad_norm": 0.00038270559161901474, + "learning_rate": 8.854497000843065e-08, + "loss": 0.0005, + "num_input_tokens_seen": 263128544, + "step": 121940 + }, + { + "epoch": 19.8931484502447, + "grad_norm": 0.0007194194477051497, + "learning_rate": 8.721051440435668e-08, + "loss": 0.0003, + "num_input_tokens_seen": 263139136, + "step": 121945 + }, + { + "epoch": 19.893964110929854, + "grad_norm": 0.0011695417342707515, + "learning_rate": 8.588619016708377e-08, + "loss": 0.0004, + "num_input_tokens_seen": 263149536, + "step": 121950 + }, + { + "epoch": 19.894779771615006, + "grad_norm": 0.007348670624196529, + "learning_rate": 8.457199732353482e-08, + "loss": 0.0067, + "num_input_tokens_seen": 263159808, + "step": 121955 + }, + { + "epoch": 19.895595432300162, + "grad_norm": 0.0017977103125303984, + "learning_rate": 8.32679359003552e-08, + "loss": 0.0003, + "num_input_tokens_seen": 263170752, + "step": 121960 + }, + { + "epoch": 19.896411092985318, + "grad_norm": 0.004908894654363394, + "learning_rate": 8.197400592391268e-08, + "loss": 0.0878, + "num_input_tokens_seen": 263180736, + "step": 121965 + }, + { + "epoch": 19.897226753670473, + "grad_norm": 0.02416328527033329, + "learning_rate": 8.069020742040855e-08, + "loss": 0.0015, + "num_input_tokens_seen": 263190464, + "step": 121970 + }, + { + "epoch": 19.89804241435563, + "grad_norm": 0.014457735233008862, + "learning_rate": 7.941654041598856e-08, + "loss": 0.0008, + "num_input_tokens_seen": 263201344, + "step": 121975 + }, + { + "epoch": 19.898858075040785, + "grad_norm": 0.00019608857110142708, + "learning_rate": 7.815300493635436e-08, + "loss": 0.0004, + "num_input_tokens_seen": 263212672, + "step": 121980 + }, + { + "epoch": 19.899673735725937, + "grad_norm": 0.00011699625611072406, + "learning_rate": 7.68996010071521e-08, + "loss": 0.0168, + "num_input_tokens_seen": 263223136, + "step": 121985 + }, + { + "epoch": 19.900489396411093, + "grad_norm": 0.0005303024081513286, + "learning_rate": 7.565632865375039e-08, + "loss": 0.0018, + "num_input_tokens_seen": 263233632, + "step": 121990 + }, + { + "epoch": 19.90130505709625, + "grad_norm": 0.012945755384862423, + "learning_rate": 7.442318790140679e-08, + "loss": 0.0004, + "num_input_tokens_seen": 263243136, + "step": 121995 + }, + { + "epoch": 19.902120717781404, + "grad_norm": 0.01447217632085085, + "learning_rate": 7.32001787750458e-08, + "loss": 0.0027, + "num_input_tokens_seen": 263255008, + "step": 122000 + }, + { + "epoch": 19.902936378466556, + "grad_norm": 0.00020997915999032557, + "learning_rate": 7.198730129948094e-08, + "loss": 0.0007, + "num_input_tokens_seen": 263266656, + "step": 122005 + }, + { + "epoch": 19.903752039151712, + "grad_norm": 0.00017425768601242453, + "learning_rate": 7.078455549935914e-08, + "loss": 0.0015, + "num_input_tokens_seen": 263277120, + "step": 122010 + }, + { + "epoch": 19.904567699836868, + "grad_norm": 0.004223216790705919, + "learning_rate": 6.959194139893876e-08, + "loss": 0.0006, + "num_input_tokens_seen": 263288640, + "step": 122015 + }, + { + "epoch": 19.905383360522023, + "grad_norm": 0.0001269157655769959, + "learning_rate": 6.840945902242268e-08, + "loss": 0.0007, + "num_input_tokens_seen": 263300256, + "step": 122020 + }, + { + "epoch": 19.90619902120718, + "grad_norm": 0.0027934664394706488, + "learning_rate": 6.723710839384723e-08, + "loss": 0.0002, + "num_input_tokens_seen": 263311552, + "step": 122025 + }, + { + "epoch": 19.90701468189233, + "grad_norm": 0.01773557811975479, + "learning_rate": 6.607488953691565e-08, + "loss": 0.0008, + "num_input_tokens_seen": 263322112, + "step": 122030 + }, + { + "epoch": 19.907830342577487, + "grad_norm": 0.0010644650319591165, + "learning_rate": 6.492280247516469e-08, + "loss": 0.0005, + "num_input_tokens_seen": 263331680, + "step": 122035 + }, + { + "epoch": 19.908646003262643, + "grad_norm": 0.013242494314908981, + "learning_rate": 6.378084723196453e-08, + "loss": 0.0004, + "num_input_tokens_seen": 263342784, + "step": 122040 + }, + { + "epoch": 19.9094616639478, + "grad_norm": 0.0009295962518081069, + "learning_rate": 6.264902383051885e-08, + "loss": 0.0004, + "num_input_tokens_seen": 263354336, + "step": 122045 + }, + { + "epoch": 19.910277324632954, + "grad_norm": 0.0027307053096592426, + "learning_rate": 6.152733229364272e-08, + "loss": 0.0002, + "num_input_tokens_seen": 263365408, + "step": 122050 + }, + { + "epoch": 19.911092985318106, + "grad_norm": 0.00023858583881519735, + "learning_rate": 6.041577264415122e-08, + "loss": 0.0004, + "num_input_tokens_seen": 263376096, + "step": 122055 + }, + { + "epoch": 19.911908646003262, + "grad_norm": 0.02509663626551628, + "learning_rate": 5.9314344904581876e-08, + "loss": 0.0007, + "num_input_tokens_seen": 263386944, + "step": 122060 + }, + { + "epoch": 19.912724306688418, + "grad_norm": 0.0012945306953042746, + "learning_rate": 5.822304909719467e-08, + "loss": 0.0003, + "num_input_tokens_seen": 263396768, + "step": 122065 + }, + { + "epoch": 19.913539967373573, + "grad_norm": 0.001174515695311129, + "learning_rate": 5.714188524413855e-08, + "loss": 0.0002, + "num_input_tokens_seen": 263407104, + "step": 122070 + }, + { + "epoch": 19.91435562805873, + "grad_norm": 0.0025397315621376038, + "learning_rate": 5.6070853367284903e-08, + "loss": 0.0001, + "num_input_tokens_seen": 263417184, + "step": 122075 + }, + { + "epoch": 19.91517128874388, + "grad_norm": 0.0005087873432785273, + "learning_rate": 5.500995348844962e-08, + "loss": 0.0005, + "num_input_tokens_seen": 263428608, + "step": 122080 + }, + { + "epoch": 19.915986949429037, + "grad_norm": 0.004631673917174339, + "learning_rate": 5.395918562900448e-08, + "loss": 0.0006, + "num_input_tokens_seen": 263439360, + "step": 122085 + }, + { + "epoch": 19.916802610114193, + "grad_norm": 0.00020141237473580986, + "learning_rate": 5.2918549810376806e-08, + "loss": 0.0003, + "num_input_tokens_seen": 263449664, + "step": 122090 + }, + { + "epoch": 19.91761827079935, + "grad_norm": 0.0003067314100917429, + "learning_rate": 5.188804605349429e-08, + "loss": 0.0004, + "num_input_tokens_seen": 263460512, + "step": 122095 + }, + { + "epoch": 19.918433931484504, + "grad_norm": 0.0008396131452172995, + "learning_rate": 5.086767437939566e-08, + "loss": 0.0002, + "num_input_tokens_seen": 263470976, + "step": 122100 + }, + { + "epoch": 19.919249592169656, + "grad_norm": 0.00029287804500199854, + "learning_rate": 4.985743480867555e-08, + "loss": 0.0014, + "num_input_tokens_seen": 263481600, + "step": 122105 + }, + { + "epoch": 19.920065252854812, + "grad_norm": 0.030488567426800728, + "learning_rate": 4.885732736181758e-08, + "loss": 0.001, + "num_input_tokens_seen": 263492896, + "step": 122110 + }, + { + "epoch": 19.920880913539968, + "grad_norm": 0.00018521711172070354, + "learning_rate": 4.7867352059138835e-08, + "loss": 0.0002, + "num_input_tokens_seen": 263504480, + "step": 122115 + }, + { + "epoch": 19.921696574225123, + "grad_norm": 0.011767351999878883, + "learning_rate": 4.688750892062332e-08, + "loss": 0.0009, + "num_input_tokens_seen": 263515872, + "step": 122120 + }, + { + "epoch": 19.92251223491028, + "grad_norm": 0.0005152577068656683, + "learning_rate": 4.5917797966144037e-08, + "loss": 0.0006, + "num_input_tokens_seen": 263526656, + "step": 122125 + }, + { + "epoch": 19.92332789559543, + "grad_norm": 0.0009250710136257112, + "learning_rate": 4.495821921540744e-08, + "loss": 0.0009, + "num_input_tokens_seen": 263537920, + "step": 122130 + }, + { + "epoch": 19.924143556280587, + "grad_norm": 0.05991659313440323, + "learning_rate": 4.400877268784242e-08, + "loss": 0.0015, + "num_input_tokens_seen": 263549984, + "step": 122135 + }, + { + "epoch": 19.924959216965743, + "grad_norm": 0.0002137508854502812, + "learning_rate": 4.306945840265586e-08, + "loss": 0.0007, + "num_input_tokens_seen": 263559392, + "step": 122140 + }, + { + "epoch": 19.9257748776509, + "grad_norm": 0.00016504968516528606, + "learning_rate": 4.2140276378943576e-08, + "loss": 0.0005, + "num_input_tokens_seen": 263569920, + "step": 122145 + }, + { + "epoch": 19.92659053833605, + "grad_norm": 0.04650947079062462, + "learning_rate": 4.1221226635468345e-08, + "loss": 0.0006, + "num_input_tokens_seen": 263580352, + "step": 122150 + }, + { + "epoch": 19.927406199021206, + "grad_norm": 0.004327744711190462, + "learning_rate": 4.031230919088191e-08, + "loss": 0.0534, + "num_input_tokens_seen": 263591104, + "step": 122155 + }, + { + "epoch": 19.928221859706362, + "grad_norm": 0.0025043096393346786, + "learning_rate": 3.941352406361398e-08, + "loss": 0.0003, + "num_input_tokens_seen": 263602336, + "step": 122160 + }, + { + "epoch": 19.929037520391518, + "grad_norm": 0.021079905331134796, + "learning_rate": 3.852487127187221e-08, + "loss": 0.0007, + "num_input_tokens_seen": 263613472, + "step": 122165 + }, + { + "epoch": 19.929853181076673, + "grad_norm": 0.0001473976590204984, + "learning_rate": 3.7646350833697715e-08, + "loss": 0.0003, + "num_input_tokens_seen": 263623104, + "step": 122170 + }, + { + "epoch": 19.930668841761825, + "grad_norm": 0.018984239548444748, + "learning_rate": 3.677796276685408e-08, + "loss": 0.0031, + "num_input_tokens_seen": 263634592, + "step": 122175 + }, + { + "epoch": 19.93148450244698, + "grad_norm": 0.005450270604342222, + "learning_rate": 3.591970708893832e-08, + "loss": 0.0033, + "num_input_tokens_seen": 263643808, + "step": 122180 + }, + { + "epoch": 19.932300163132137, + "grad_norm": 0.006243106909096241, + "learning_rate": 3.507158381738096e-08, + "loss": 0.0014, + "num_input_tokens_seen": 263654336, + "step": 122185 + }, + { + "epoch": 19.933115823817293, + "grad_norm": 0.0002482594863977283, + "learning_rate": 3.4233592969334926e-08, + "loss": 0.0007, + "num_input_tokens_seen": 263665440, + "step": 122190 + }, + { + "epoch": 19.93393148450245, + "grad_norm": 0.0002609306829981506, + "learning_rate": 3.340573456184215e-08, + "loss": 0.0005, + "num_input_tokens_seen": 263676224, + "step": 122195 + }, + { + "epoch": 19.9347471451876, + "grad_norm": 0.00012904845061711967, + "learning_rate": 3.258800861155598e-08, + "loss": 0.0008, + "num_input_tokens_seen": 263687040, + "step": 122200 + }, + { + "epoch": 19.935562805872756, + "grad_norm": 0.0018513076938688755, + "learning_rate": 3.178041513518526e-08, + "loss": 0.0005, + "num_input_tokens_seen": 263697504, + "step": 122205 + }, + { + "epoch": 19.936378466557912, + "grad_norm": 0.00012276865891180933, + "learning_rate": 3.098295414899477e-08, + "loss": 0.0009, + "num_input_tokens_seen": 263708672, + "step": 122210 + }, + { + "epoch": 19.937194127243067, + "grad_norm": 0.0001461346255382523, + "learning_rate": 3.019562566924927e-08, + "loss": 0.0014, + "num_input_tokens_seen": 263718688, + "step": 122215 + }, + { + "epoch": 19.938009787928223, + "grad_norm": 0.0035820852499455214, + "learning_rate": 2.9418429711769445e-08, + "loss": 0.0005, + "num_input_tokens_seen": 263730464, + "step": 122220 + }, + { + "epoch": 19.938825448613375, + "grad_norm": 0.00939035601913929, + "learning_rate": 2.865136629243148e-08, + "loss": 0.0005, + "num_input_tokens_seen": 263741696, + "step": 122225 + }, + { + "epoch": 19.93964110929853, + "grad_norm": 0.030633147805929184, + "learning_rate": 2.7894435426722988e-08, + "loss": 0.0008, + "num_input_tokens_seen": 263752448, + "step": 122230 + }, + { + "epoch": 19.940456769983687, + "grad_norm": 0.0037531449925154448, + "learning_rate": 2.7147637130020553e-08, + "loss": 0.0018, + "num_input_tokens_seen": 263763232, + "step": 122235 + }, + { + "epoch": 19.941272430668842, + "grad_norm": 0.001052591484040022, + "learning_rate": 2.6410971417423214e-08, + "loss": 0.0004, + "num_input_tokens_seen": 263774624, + "step": 122240 + }, + { + "epoch": 19.942088091353998, + "grad_norm": 0.000489742262288928, + "learning_rate": 2.5684438303807955e-08, + "loss": 0.0006, + "num_input_tokens_seen": 263784992, + "step": 122245 + }, + { + "epoch": 19.94290375203915, + "grad_norm": 0.002013757824897766, + "learning_rate": 2.496803780405177e-08, + "loss": 0.0005, + "num_input_tokens_seen": 263796576, + "step": 122250 + }, + { + "epoch": 19.943719412724306, + "grad_norm": 0.013849758543074131, + "learning_rate": 2.426176993253204e-08, + "loss": 0.0014, + "num_input_tokens_seen": 263806752, + "step": 122255 + }, + { + "epoch": 19.94453507340946, + "grad_norm": 0.027245236560702324, + "learning_rate": 2.356563470357065e-08, + "loss": 0.0013, + "num_input_tokens_seen": 263817344, + "step": 122260 + }, + { + "epoch": 19.945350734094617, + "grad_norm": 0.006294277496635914, + "learning_rate": 2.287963213137845e-08, + "loss": 0.0002, + "num_input_tokens_seen": 263828416, + "step": 122265 + }, + { + "epoch": 19.946166394779773, + "grad_norm": 0.00018989348609466106, + "learning_rate": 2.2203762229777713e-08, + "loss": 0.0021, + "num_input_tokens_seen": 263838976, + "step": 122270 + }, + { + "epoch": 19.946982055464925, + "grad_norm": 0.015877440571784973, + "learning_rate": 2.15380250124797e-08, + "loss": 0.0006, + "num_input_tokens_seen": 263848800, + "step": 122275 + }, + { + "epoch": 19.94779771615008, + "grad_norm": 0.0006651472649537027, + "learning_rate": 2.0882420493029132e-08, + "loss": 0.0004, + "num_input_tokens_seen": 263859520, + "step": 122280 + }, + { + "epoch": 19.948613376835237, + "grad_norm": 0.008316482417285442, + "learning_rate": 2.0236948684582147e-08, + "loss": 0.0004, + "num_input_tokens_seen": 263869792, + "step": 122285 + }, + { + "epoch": 19.949429037520392, + "grad_norm": 0.0008784636738710105, + "learning_rate": 1.96016096003504e-08, + "loss": 0.0001, + "num_input_tokens_seen": 263880160, + "step": 122290 + }, + { + "epoch": 19.950244698205548, + "grad_norm": 0.00036165866185911, + "learning_rate": 1.8976403253156972e-08, + "loss": 0.0001, + "num_input_tokens_seen": 263889824, + "step": 122295 + }, + { + "epoch": 19.9510603588907, + "grad_norm": 0.00034009746741503477, + "learning_rate": 1.836132965571391e-08, + "loss": 0.0003, + "num_input_tokens_seen": 263901472, + "step": 122300 + }, + { + "epoch": 19.951876019575856, + "grad_norm": 0.00017482943076174706, + "learning_rate": 1.7756388820400205e-08, + "loss": 0.0018, + "num_input_tokens_seen": 263911808, + "step": 122305 + }, + { + "epoch": 19.95269168026101, + "grad_norm": 0.0007163725094869733, + "learning_rate": 1.716158075953933e-08, + "loss": 0.0009, + "num_input_tokens_seen": 263922784, + "step": 122310 + }, + { + "epoch": 19.953507340946167, + "grad_norm": 0.0038587991148233414, + "learning_rate": 1.6576905485177206e-08, + "loss": 0.0004, + "num_input_tokens_seen": 263932768, + "step": 122315 + }, + { + "epoch": 19.954323001631323, + "grad_norm": 0.00663127051666379, + "learning_rate": 1.6002363009137712e-08, + "loss": 0.0119, + "num_input_tokens_seen": 263944320, + "step": 122320 + }, + { + "epoch": 19.955138662316475, + "grad_norm": 0.023233676329255104, + "learning_rate": 1.5437953343078182e-08, + "loss": 0.0026, + "num_input_tokens_seen": 263955456, + "step": 122325 + }, + { + "epoch": 19.95595432300163, + "grad_norm": 0.0009905535262078047, + "learning_rate": 1.488367649848943e-08, + "loss": 0.0004, + "num_input_tokens_seen": 263965664, + "step": 122330 + }, + { + "epoch": 19.956769983686787, + "grad_norm": 9.307449363404885e-05, + "learning_rate": 1.4339532486529195e-08, + "loss": 0.0003, + "num_input_tokens_seen": 263976960, + "step": 122335 + }, + { + "epoch": 19.957585644371942, + "grad_norm": 0.002083148341625929, + "learning_rate": 1.3805521318244196e-08, + "loss": 0.0009, + "num_input_tokens_seen": 263987776, + "step": 122340 + }, + { + "epoch": 19.958401305057095, + "grad_norm": 0.004904035944491625, + "learning_rate": 1.3281643004514621e-08, + "loss": 0.0013, + "num_input_tokens_seen": 263997536, + "step": 122345 + }, + { + "epoch": 19.95921696574225, + "grad_norm": 0.00023577299725729972, + "learning_rate": 1.2767897555887587e-08, + "loss": 0.0076, + "num_input_tokens_seen": 264008864, + "step": 122350 + }, + { + "epoch": 19.960032626427406, + "grad_norm": 0.0030490641947835684, + "learning_rate": 1.2264284982743679e-08, + "loss": 0.0007, + "num_input_tokens_seen": 264018784, + "step": 122355 + }, + { + "epoch": 19.96084828711256, + "grad_norm": 0.003503830172121525, + "learning_rate": 1.1770805295407972e-08, + "loss": 0.0002, + "num_input_tokens_seen": 264029312, + "step": 122360 + }, + { + "epoch": 19.961663947797717, + "grad_norm": 0.0024471955839544535, + "learning_rate": 1.1287458503816961e-08, + "loss": 0.0031, + "num_input_tokens_seen": 264040672, + "step": 122365 + }, + { + "epoch": 19.96247960848287, + "grad_norm": 0.00034319362021051347, + "learning_rate": 1.0814244617740609e-08, + "loss": 0.0014, + "num_input_tokens_seen": 264050432, + "step": 122370 + }, + { + "epoch": 19.963295269168025, + "grad_norm": 0.046251315623521805, + "learning_rate": 1.0351163646782346e-08, + "loss": 0.0008, + "num_input_tokens_seen": 264061280, + "step": 122375 + }, + { + "epoch": 19.96411092985318, + "grad_norm": 0.0005360693321563303, + "learning_rate": 9.898215600379068e-09, + "loss": 0.0005, + "num_input_tokens_seen": 264072000, + "step": 122380 + }, + { + "epoch": 19.964926590538337, + "grad_norm": 0.0002083016006508842, + "learning_rate": 9.455400487634602e-09, + "loss": 0.0068, + "num_input_tokens_seen": 264083296, + "step": 122385 + }, + { + "epoch": 19.965742251223492, + "grad_norm": 0.0033022670540958643, + "learning_rate": 9.022718317597267e-09, + "loss": 0.0096, + "num_input_tokens_seen": 264093568, + "step": 122390 + }, + { + "epoch": 19.966557911908644, + "grad_norm": 0.00023030802549328655, + "learning_rate": 8.600169098982313e-09, + "loss": 0.0002, + "num_input_tokens_seen": 264105248, + "step": 122395 + }, + { + "epoch": 19.9673735725938, + "grad_norm": 0.014019564725458622, + "learning_rate": 8.187752840338458e-09, + "loss": 0.0003, + "num_input_tokens_seen": 264115232, + "step": 122400 + }, + { + "epoch": 19.968189233278956, + "grad_norm": 0.003953394014388323, + "learning_rate": 7.785469550103397e-09, + "loss": 0.0004, + "num_input_tokens_seen": 264126912, + "step": 122405 + }, + { + "epoch": 19.96900489396411, + "grad_norm": 0.008718741126358509, + "learning_rate": 7.393319236326246e-09, + "loss": 0.0004, + "num_input_tokens_seen": 264138176, + "step": 122410 + }, + { + "epoch": 19.969820554649267, + "grad_norm": 0.013759390451014042, + "learning_rate": 7.011301907056122e-09, + "loss": 0.0012, + "num_input_tokens_seen": 264149888, + "step": 122415 + }, + { + "epoch": 19.97063621533442, + "grad_norm": 0.0017301725456491113, + "learning_rate": 6.639417570009076e-09, + "loss": 0.0004, + "num_input_tokens_seen": 264160064, + "step": 122420 + }, + { + "epoch": 19.971451876019575, + "grad_norm": 0.0003452931996434927, + "learning_rate": 6.2776662326236025e-09, + "loss": 0.0004, + "num_input_tokens_seen": 264170304, + "step": 122425 + }, + { + "epoch": 19.97226753670473, + "grad_norm": 0.0003665284893941134, + "learning_rate": 5.926047902393705e-09, + "loss": 0.0006, + "num_input_tokens_seen": 264181344, + "step": 122430 + }, + { + "epoch": 19.973083197389887, + "grad_norm": 0.0010030491976067424, + "learning_rate": 5.584562586313791e-09, + "loss": 0.0019, + "num_input_tokens_seen": 264192736, + "step": 122435 + }, + { + "epoch": 19.973898858075042, + "grad_norm": 0.0021375154610723257, + "learning_rate": 5.253210291322752e-09, + "loss": 0.0012, + "num_input_tokens_seen": 264204096, + "step": 122440 + }, + { + "epoch": 19.974714518760194, + "grad_norm": 0.005665700417011976, + "learning_rate": 4.93199102419295e-09, + "loss": 0.0012, + "num_input_tokens_seen": 264215520, + "step": 122445 + }, + { + "epoch": 19.97553017944535, + "grad_norm": 0.0030792446341365576, + "learning_rate": 4.620904791419189e-09, + "loss": 0.0004, + "num_input_tokens_seen": 264226464, + "step": 122450 + }, + { + "epoch": 19.976345840130506, + "grad_norm": 0.001539702876470983, + "learning_rate": 4.31995159927423e-09, + "loss": 0.0016, + "num_input_tokens_seen": 264237760, + "step": 122455 + }, + { + "epoch": 19.97716150081566, + "grad_norm": 0.0001498729398008436, + "learning_rate": 4.029131453864299e-09, + "loss": 0.0004, + "num_input_tokens_seen": 264248096, + "step": 122460 + }, + { + "epoch": 19.977977161500817, + "grad_norm": 0.00016214455536101013, + "learning_rate": 3.748444361129088e-09, + "loss": 0.0001, + "num_input_tokens_seen": 264258752, + "step": 122465 + }, + { + "epoch": 19.97879282218597, + "grad_norm": 0.001954850973561406, + "learning_rate": 3.477890326675226e-09, + "loss": 0.0019, + "num_input_tokens_seen": 264268864, + "step": 122470 + }, + { + "epoch": 19.979608482871125, + "grad_norm": 0.0005985397729091346, + "learning_rate": 3.217469356053826e-09, + "loss": 0.001, + "num_input_tokens_seen": 264280096, + "step": 122475 + }, + { + "epoch": 19.98042414355628, + "grad_norm": 0.0006360471015796065, + "learning_rate": 2.9671814545384477e-09, + "loss": 0.0019, + "num_input_tokens_seen": 264291424, + "step": 122480 + }, + { + "epoch": 19.981239804241437, + "grad_norm": 0.06450608372688293, + "learning_rate": 2.7270266271806065e-09, + "loss": 0.0011, + "num_input_tokens_seen": 264302528, + "step": 122485 + }, + { + "epoch": 19.982055464926592, + "grad_norm": 0.00010054052108898759, + "learning_rate": 2.4970048788652833e-09, + "loss": 0.0009, + "num_input_tokens_seen": 264312960, + "step": 122490 + }, + { + "epoch": 19.982871125611744, + "grad_norm": 0.0019660114776343107, + "learning_rate": 2.2771162141999036e-09, + "loss": 0.0011, + "num_input_tokens_seen": 264322368, + "step": 122495 + }, + { + "epoch": 19.9836867862969, + "grad_norm": 8.202803292078897e-05, + "learning_rate": 2.0673606376808707e-09, + "loss": 0.0002, + "num_input_tokens_seen": 264332800, + "step": 122500 + }, + { + "epoch": 19.984502446982056, + "grad_norm": 0.0013509375276044011, + "learning_rate": 1.8677381535825435e-09, + "loss": 0.0003, + "num_input_tokens_seen": 264342176, + "step": 122505 + }, + { + "epoch": 19.98531810766721, + "grad_norm": 0.018857009708881378, + "learning_rate": 1.6782487659572354e-09, + "loss": 0.0015, + "num_input_tokens_seen": 264352576, + "step": 122510 + }, + { + "epoch": 19.986133768352367, + "grad_norm": 0.0012630521086975932, + "learning_rate": 1.4988924785797053e-09, + "loss": 0.0001, + "num_input_tokens_seen": 264364192, + "step": 122515 + }, + { + "epoch": 19.98694942903752, + "grad_norm": 0.16563434898853302, + "learning_rate": 1.329669295113689e-09, + "loss": 0.003, + "num_input_tokens_seen": 264375104, + "step": 122520 + }, + { + "epoch": 19.987765089722675, + "grad_norm": 0.0006059006555005908, + "learning_rate": 1.1705792190008778e-09, + "loss": 0.0004, + "num_input_tokens_seen": 264385600, + "step": 122525 + }, + { + "epoch": 19.98858075040783, + "grad_norm": 0.054114602506160736, + "learning_rate": 1.0216222534609189e-09, + "loss": 0.0012, + "num_input_tokens_seen": 264396864, + "step": 122530 + }, + { + "epoch": 19.989396411092986, + "grad_norm": 0.00035415164893493056, + "learning_rate": 8.827984014914137e-10, + "loss": 0.0003, + "num_input_tokens_seen": 264408416, + "step": 122535 + }, + { + "epoch": 19.99021207177814, + "grad_norm": 0.00042005794239230454, + "learning_rate": 7.541076659234314e-10, + "loss": 0.0008, + "num_input_tokens_seen": 264418656, + "step": 122540 + }, + { + "epoch": 19.991027732463294, + "grad_norm": 0.00015410421474371105, + "learning_rate": 6.355500494215072e-10, + "loss": 0.0007, + "num_input_tokens_seen": 264430432, + "step": 122545 + }, + { + "epoch": 19.99184339314845, + "grad_norm": 0.0011554670054465532, + "learning_rate": 5.271255543171094e-10, + "loss": 0.001, + "num_input_tokens_seen": 264441248, + "step": 122550 + }, + { + "epoch": 19.992659053833606, + "grad_norm": 0.0007945868419483304, + "learning_rate": 4.2883418277517293e-10, + "loss": 0.0003, + "num_input_tokens_seen": 264451520, + "step": 122555 + }, + { + "epoch": 19.99347471451876, + "grad_norm": 0.017619745805859566, + "learning_rate": 3.4067593690512154e-10, + "loss": 0.0005, + "num_input_tokens_seen": 264461696, + "step": 122560 + }, + { + "epoch": 19.994290375203914, + "grad_norm": 0.007235695607960224, + "learning_rate": 2.6265081837228976e-10, + "loss": 0.0005, + "num_input_tokens_seen": 264474336, + "step": 122565 + }, + { + "epoch": 19.99510603588907, + "grad_norm": 0.017511583864688873, + "learning_rate": 1.9475882884201212e-10, + "loss": 0.0004, + "num_input_tokens_seen": 264486016, + "step": 122570 + }, + { + "epoch": 19.995921696574225, + "grad_norm": 0.0021859160624444485, + "learning_rate": 1.3699996964655626e-10, + "loss": 0.001, + "num_input_tokens_seen": 264496320, + "step": 122575 + }, + { + "epoch": 19.99673735725938, + "grad_norm": 0.015178943984210491, + "learning_rate": 8.937424195165634e-11, + "loss": 0.0004, + "num_input_tokens_seen": 264505856, + "step": 122580 + }, + { + "epoch": 19.997553017944536, + "grad_norm": 0.00012104311463190243, + "learning_rate": 5.188164675651308e-11, + "loss": 0.0005, + "num_input_tokens_seen": 264515744, + "step": 122585 + }, + { + "epoch": 19.99836867862969, + "grad_norm": 0.0004609785682987422, + "learning_rate": 2.4522184838282614e-11, + "loss": 0.0003, + "num_input_tokens_seen": 264526496, + "step": 122590 + }, + { + "epoch": 19.999184339314844, + "grad_norm": 0.015589098446071148, + "learning_rate": 7.295856696565295e-12, + "loss": 0.0003, + "num_input_tokens_seen": 264537760, + "step": 122595 + }, + { + "epoch": 20.0, + "grad_norm": 0.0014483574777841568, + "learning_rate": 2.0266266442803271e-13, + "loss": 0.0004, + "num_input_tokens_seen": 264547520, + "step": 122600 + }, + { + "epoch": 20.0, + "eval_loss": 0.3435961902141571, + "eval_runtime": 104.2687, + "eval_samples_per_second": 26.134, + "eval_steps_per_second": 6.541, + "num_input_tokens_seen": 264547520, + "step": 122600 + }, + { + "epoch": 20.0, + "num_input_tokens_seen": 264547520, + "step": 122600, + "total_flos": 1.191245525858648e+19, + "train_loss": 0.06821744520573833, + "train_runtime": 45761.4588, + "train_samples_per_second": 10.716, + "train_steps_per_second": 2.679 + } + ], + "logging_steps": 5, + "max_steps": 122600, + "num_input_tokens_seen": 264547520, + "num_train_epochs": 20, + "save_steps": 6130, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.191245525858648e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}